add pages
app.py CHANGED
@@ -35,6 +35,9 @@ css = """
 .cell-menu-button {
     z-index: -1;
 }
+.centered {
+    text-align: center;
+}
 """
 
 def to_json_df(con: Connection, tbl: Table) -> pd.DataFrame:
@@ -70,7 +73,7 @@ with gr.Blocks(css=css) as demo:
     loading_codes_json = gr.JSON([], visible=False)
     with gr.Row():
         with gr.Column():
-            gr.Markdown("#
+            gr.Markdown("# 🤗 (WIP) Hugging Face Dataset Spreadsheets 📝\n\nEdit any dataset on Hugging Face (full list <a href='https://huggingface.co/datasets' target='_blank'>here</a>)", elem_classes="centered")
             with gr.Group():
                 with gr.Tab("Select Dataset"):
                     with gr.Row():
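The `.centered` rule added to the CSS above is what the new `elem_classes="centered"` arguments hook into. A minimal sketch of that wiring in Gradio (the title text here is a placeholder):

```python
import gradio as gr

css = """
.centered {
    text-align: center;
}
"""

with gr.Blocks(css=css) as demo:
    # The class listed in `elem_classes` must match a selector in the custom CSS.
    gr.Markdown("# Title", elem_classes="centered")

if __name__ == "__main__":
    demo.launch()
```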
@@ -82,6 +85,11 @@ with gr.Blocks(css=css) as demo:
                 with gr.Tab("Use Locally"):
                     use_locally_markdown = gr.Markdown()
     dataframe = gr.DataFrame(to_json_df(memory_con, empty_tbl), interactive=True, wrap=True)
+    with gr.Row():
+        prev_button = gr.Button("< Previous", min_width=140, interactive=False)
+        with gr.Column(scale=9, min_width=0):
+            page_html = gr.HTML("Page 1", elem_classes="centered")
+        next_button = gr.Button("Next >", min_width=140)
 
     def show_subset_dropdown(dataset: str):
         if dataset and "/" not in dataset.strip().strip("/"):
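The block above adds the pager UI: a Previous button that starts disabled, a centered "Page 1" label, and a Next button. A rough, self-contained sketch of the same prev/next pattern, driven by an in-memory DataFrame instead of the DuckDB-backed session (the dummy data and helper names are made up for illustration):

```python
import gradio as gr
import pandas as pd

PAGE_SIZE = 5
data = pd.DataFrame({"n": range(23)})  # stand-in for the dataset

def get_page(page: int) -> pd.DataFrame:
    # Slice one page out of the dummy data; the last page may come back short.
    start = (page - 1) * PAGE_SIZE
    return data.iloc[start:start + PAGE_SIZE]

with gr.Blocks() as demo:
    dataframe = gr.DataFrame(get_page(1), interactive=True, wrap=True)
    with gr.Row():
        prev_button = gr.Button("< Previous", min_width=140, interactive=False)
        with gr.Column(scale=9, min_width=0):
            page_html = gr.HTML("Page 1")
        next_button = gr.Button("Next >", min_width=140)

    def turn_page(page_str: str, delta: int):
        # The "Page N" label doubles as the page state, as in the diff above.
        page = max(1, int(page_str.split(" ")[-1]) + delta)
        df = get_page(page)
        return {
            dataframe: df,
            prev_button: gr.Button(interactive=page > 1),
            next_button: gr.Button(interactive=len(df) >= PAGE_SIZE),
            page_html: f"Page {page}",
        }

    next_button.click(lambda page_str: turn_page(page_str, +1), inputs=page_html, outputs=[dataframe, prev_button, next_button, page_html])
    prev_button.click(lambda page_str: turn_page(page_str, -1), inputs=page_html, outputs=[dataframe, prev_button, next_button, page_html])

if __name__ == "__main__":
    demo.launch()
```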
@@ -97,7 +105,7 @@ with gr.Blocks(css=css) as demo:
         split = (splits or [""])[0]
         return dict(choices=splits, value=split, visible=len(splits) > 1, key=hash(str(loading_codes) + subset))
 
-    def show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict], session: str):
+    def show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict], session: str, page: int):
         pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
         if session and dataset and subset and split and pattern:
             duckdb_file = session + ".duckdb"
@@ -105,13 +113,13 @@ with gr.Blocks(css=css) as demo:
             setup_edits(con, dataset, pattern)
             # Uncomment to have one edit for testing
             # con.sql("INSERT OR REPLACE INTO edits SELECT 2 AS rowid, * FROM dataset LIMIT 1")
-            tbl = con.sql(f"SELECT * FROM edited_dataset LIMIT {PAGE_SIZE}")
+            tbl = con.sql(f"SELECT * FROM edited_dataset LIMIT {PAGE_SIZE} OFFSET {(page - 1) * PAGE_SIZE}")
             return dict(value=to_json_df(con, tbl))
         else:
             return dict(value=to_json_df(memory_con, empty_tbl))
 
 
-    @demo.load(inputs=session_state, outputs=[dataset_dropdown, loading_codes_json, subset_dropdown, split_dropdown, dataframe, session_state, share_link_textbox, use_locally_markdown])
+    @demo.load(inputs=session_state, outputs=[dataset_dropdown, loading_codes_json, subset_dropdown, split_dropdown, dataframe, session_state, share_link_textbox, use_locally_markdown, prev_button, next_button, page_html])
     def _fetch_datasets(session: str | None, request: gr.Request):
         datasets = list(HfApi().list_datasets(limit=NUM_TRENDING_DATASETS, sort="trendingScore", direction=-1, filter=["format:parquet"]))
         session = request.query_params.get(SESSIONS_DIR) or session
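Paging itself is plain `LIMIT`/`OFFSET`: page `n` reads rows `(n - 1) * PAGE_SIZE` through `n * PAGE_SIZE - 1` of `edited_dataset`, and a short last page is what lets the handlers decide when to disable the Next button. A small standalone sketch against a throwaway DuckDB table (table contents and sizes are invented):

```python
import duckdb

PAGE_SIZE = 100
con = duckdb.connect()
# 250 dummy rows so the third page comes back short.
con.sql("CREATE TABLE edited_dataset AS SELECT range AS rowid FROM range(250)")

def page_df(con: duckdb.DuckDBPyConnection, page: int):
    return con.sql(
        f"SELECT * FROM edited_dataset LIMIT {PAGE_SIZE} OFFSET {(page - 1) * PAGE_SIZE}"
    ).df()

print(len(page_df(con, 1)))  # 100 rows: a full page
print(len(page_df(con, 3)))  # 50 rows: the short final page
```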
@@ -128,7 +136,8 @@ with gr.Blocks(css=css) as demo:
         splits = show_split_dropdown(subsets["value"], loading_codes)
         splits["value"] = split if session else splits["value"]
         session = session if isinstance(session, str) else f"{dataset.replace('/', '--')}--{subsets['value']}--{splits['value']}--{uuid4()}"
-        input_dataframe = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes, session)
+        page = 1
+        input_dataframe = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes, session, page)
         return {
             dataset_dropdown: gr.Dropdown(choices=[dataset.id for dataset in datasets], value=dataset),
             loading_codes_json: loading_codes,
@@ -140,45 +149,80 @@ with gr.Blocks(css=css) as demo:
             use_locally_markdown: (
                 f"""In DuckDB:\n\n```sql\nATTACH '{HOST_URL}/gradio_api/file={SESSIONS_DIR}/{session}.duckdb' AS db;\nUSE db;\nSELECT * FROM edited_dataset LIMIT 5;\n```\n\n"""
                 f"""In Python:\n\n```python\nimport duckdb\n\nduckdb.sql("ATTACH '{HOST_URL}/gradio_api/file={SESSIONS_DIR}/{session}.duckdb' AS db")\nduckdb.sql("USE db")\ndf = duckdb.sql("SELECT * FROM edited_dataset LIMIT 5").df()\n```"""
-            )
+            ),
+            prev_button: gr.Button(interactive=False),
+            next_button: gr.Button(elem_classes="", interactive=True) if len(input_dataframe["value"]) >= PAGE_SIZE else gr.Button(interactive=False),
+            page_html: f"Page {page}",
         }
 
-    @dataset_dropdown.select(inputs=[session_state, dataset_dropdown], outputs=[session_state, loading_codes_json, subset_dropdown, split_dropdown, dataframe])
+    @dataset_dropdown.select(inputs=[session_state, dataset_dropdown], outputs=[session_state, loading_codes_json, subset_dropdown, split_dropdown, dataframe, prev_button, next_button, page_html])
     def _show_subset_dropdown(session: str | None, dataset: str):
         subsets, loading_codes = show_subset_dropdown(dataset)
         splits = show_split_dropdown(subsets["value"], loading_codes)
         session = f"{dataset.replace('/', '--')}--{subsets['value']}--{splits['value']}--{uuid4()}"
-        input_dataframe = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes, session)
+        page = 1
+        input_dataframe = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes, session, page)
         return {
             loading_codes_json: loading_codes,
             subset_dropdown: gr.Dropdown(**subsets),
             split_dropdown: gr.Dropdown(**splits),
             session_state: session,
             dataframe: gr.DataFrame(**input_dataframe),
+            prev_button: gr.Button(interactive=False),
+            next_button: gr.Button(elem_classes="", interactive=True) if len(input_dataframe["value"]) >= PAGE_SIZE else gr.Button(interactive=False),
+            page_html: f"Page {page}",
         }
 
-    @subset_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, loading_codes_json], outputs=[session_state, split_dropdown, dataframe])
+    @subset_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, loading_codes_json], outputs=[session_state, split_dropdown, dataframe, prev_button, next_button, page_html])
     def _show_split_dropdown(dataset: str, subset: str, loading_codes: list[dict]):
         splits = show_split_dropdown(subset, loading_codes)
         session = f"{dataset.replace('/', '--')}--{subset}--{splits['value']}--{uuid4()}"
-        input_dataframe = show_input_dataframe(dataset, subset, splits["value"], loading_codes, session)
+        page = 1
+        input_dataframe = show_input_dataframe(dataset, subset, splits["value"], loading_codes, session, page)
         return {
             split_dropdown: gr.Dropdown(**splits),
             session_state: session,
             dataframe: gr.DataFrame(**input_dataframe),
+            prev_button: gr.Button(interactive=False),
+            next_button: gr.Button(elem_classes="", interactive=True) if len(input_dataframe["value"]) >= PAGE_SIZE else gr.Button(interactive=False),
+            page_html: f"Page {page}",
         }
 
-    @split_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json], outputs=[session_state, dataframe])
+    @split_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json], outputs=[session_state, dataframe, prev_button, next_button, page_html])
     def _show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]) -> pd.DataFrame:
         session = f"{dataset.replace('/', '--')}--{subset}--{split}--{uuid4()}"
-        input_dataframe = show_input_dataframe(dataset, subset, split, loading_codes, session)
+        page = 1
+        input_dataframe = show_input_dataframe(dataset, subset, split, loading_codes, session, page)
         return {
             session_state: session,
             dataframe: gr.DataFrame(**input_dataframe),
+            prev_button: gr.Button(interactive=False),
+            next_button: gr.Button(elem_classes="", interactive=True) if len(input_dataframe["value"]) >= PAGE_SIZE else gr.Button(interactive=False),
+            page_html: f"Page {page}",
+        }
+
+    @next_button.click(inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json, session_state, page_html], outputs=[dataframe, prev_button, next_button, page_html])
+    def _show_next_page(dataset: str, subset: str, split: str, loading_codes: list[dict], session: str, page_str: str) -> pd.DataFrame:
+        page = int(page_str.split(" ")[-1]) + 1
+        input_dataframe = show_input_dataframe(dataset, subset, split, loading_codes, session, page)
+        return {
+            dataframe: gr.DataFrame(**input_dataframe),
+            prev_button: gr.Button(elem_classes="", interactive=True),
+            page_html: f"Page {page}",
+        }
+
+    @prev_button.click(inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json, session_state, page_html], outputs=[dataframe, prev_button, next_button, page_html])
+    def _show_prev_page(dataset: str, subset: str, split: str, loading_codes: list[dict], session: str, page_str: str) -> pd.DataFrame:
+        page = int(page_str.split(" ")[-1]) - 1
+        input_dataframe = show_input_dataframe(dataset, subset, split, loading_codes, session, page)
+        return {
+            dataframe: gr.DataFrame(**input_dataframe),
+            prev_button: gr.Button(interactive=False) if page == 1 else gr.Button(elem_classes="", interactive=True),
+            page_html: f"Page {page}",
         }
 
-    @dataframe.input(inputs=[dataframe, session_state, dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json])
-    def _dataframe_input(df: pd.DataFrame, session: str | None, dataset: str, subset: str, split: str, loading_codes: list[dict]):
+    @dataframe.input(inputs=[dataframe, session_state, dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json, page_html])
+    def _dataframe_input(df: pd.DataFrame, session: str | None, dataset: str, subset: str, split: str, loading_codes: list[dict], page_str: str):
         pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
         if session and dataset and subset and split and pattern:
             duckdb_file = session + ".duckdb"
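The new click handlers keep no separate `gr.State` for the page: they parse it back out of the `"Page {page}"` label they render, via `int(page_str.split(" ")[-1])`. A tiny sketch of that round trip (assuming the label always has exactly that form); a `gr.State` would be the obvious alternative if the label format ever changes:

```python
def parse_page(page_str: str) -> int:
    # "Page 12" -> 12; relies on the label being rendered as f"Page {page}".
    return int(page_str.split(" ")[-1])

assert parse_page("Page 1") == 1
assert parse_page("Page 12") == 12
```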
@@ -188,11 +232,12 @@ with gr.Blocks(css=css) as demo:
             columns = empty_dataset_tbl.columns
             dtypes = empty_dataset_tbl.dtypes
             tbl = from_json_df(con, df, columns=columns, dtypes=dtypes)
+            page = int(page_str.split(" ")[-1])
             # TODO add edits for page > 1
             # Note: Here we don't use INSERT OR REPLACE because of Not implemented Error: List Update is not supported.
-            con.sql(f"DELETE FROM edits WHERE rowid IN range({
+            con.sql(f"DELETE FROM edits WHERE rowid IN range({(page - 1) * PAGE_SIZE}, {page * PAGE_SIZE})")
             try:
-                con.sql(f"INSERT INTO edits SELECT * FROM (SELECT unnest(range({
+                con.sql(f"INSERT INTO edits SELECT * FROM (SELECT unnest(range({(page - 1) * PAGE_SIZE}, {page * PAGE_SIZE})) AS rowid) POSITIONAL JOIN tbl")
             except duckdb.ConversionException as e:
                 raise gr.Error(str(e).split('\n')[0], title="duckdb.ConversionException")
             print(f"Saved {dataset} edits")
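Saving a page of edits first clears that page's row-id window with `DELETE ... WHERE rowid IN range(lo, hi)`, then rebuilds it: `unnest(range(lo, hi))` generates the absolute row ids of the current page and `POSITIONAL JOIN` pairs them with the edited rows purely by position (first id with first row, and so on). A standalone sketch with made-up data; in the app, `tbl` is a DuckDB relation, here a pandas DataFrame, both picked up by name via DuckDB's replacement scan:

```python
import duckdb
import pandas as pd

PAGE_SIZE, page = 3, 2
lo, hi = (page - 1) * PAGE_SIZE, page * PAGE_SIZE  # row ids 3..5

con = duckdb.connect()
edited_page = pd.DataFrame({"text": ["d2", "e2", "f2"]})  # page 2 after editing

# Pair absolute row ids with the edited rows by position.
rel = con.sql(
    f"SELECT * FROM (SELECT unnest(range({lo}, {hi})) AS rowid) "
    "POSITIONAL JOIN edited_page"
)
print(rel.df())  # rowid 3, 4, 5 next to d2, e2, f2
```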