alberto
import Image
6152d82
import requests
import pandas as pd
from io import BytesIO
from PIL import Image
class QuestionLoader:
    """Build keyword arguments for a question, downloading its attachment.

    Each question is a mapping with a ``"question"`` text and an optional
    ``"file_name"``. When a file name is present, the file is fetched from
    the GAIA 2023 validation folder on the Hugging Face Hub (using the bearer
    token given at construction) and converted according to its extension.
    """

    # Supported attachment extensions, compared case-insensitively.
    _AUDIO_EXTENSIONS = frozenset({"mp3"})
    _IMAGE_EXTENSIONS = frozenset({"png", "jpeg", "jpg"})
    _TABLE_EXTENSIONS = frozenset({"xlsx"})
    _CODE_EXTENSIONS = frozenset({"py"})
    _SUPPORTED_EXTENSIONS = (
        _AUDIO_EXTENSIONS | _IMAGE_EXTENSIONS | _TABLE_EXTENSIONS | _CODE_EXTENSIONS
    )

    def __init__(self, token: str, timeout: float = 30.0):
        """Store the credentials and base URL used for attachment downloads.

        Args:
            token: Bearer token sent in the ``Authorization`` header.
            timeout: Seconds ``requests`` waits for the server before giving
                up, so a stalled connection cannot block forever.
        """
        self._headers = {"Authorization": f"Bearer {token}"}
        self._timeout = timeout
        self.url = "https://huggingface.co/datasets/gaia-benchmark/GAIA/resolve/main/2023/validation/"

    def preprocess_question(self, question: dict) -> dict:
        """Convert one question record into keyword arguments.

        Args:
            question: Mapping read with the keys ``"question"`` and
                ``"file_name"``; other keys are ignored.

        Returns:
            A dict that always contains ``"task"`` (the question text) and,
            when an attachment exists, one of:

            * ``"images"``: a one-element list with the opened PIL image;
            * ``"additional_args"``: a dict holding either
              ``"attached_audio_file_path"`` (path of the saved mp3),
              ``"attached_table"`` (DataFrame read from the xlsx) or
              ``"attached_python_code"`` (raw bytes of the .py file).

        Raises:
            ValueError: If the attachment extension is not supported. This is
                checked before any network access.
            requests.HTTPError: If the download returns an error status.
        """
        text = question.get("question")
        file_id = question.get("file_name")
        question_kwargs = {"task": text}
        if not file_id:
            return question_kwargs

        # Validate first so unsupported attachments never trigger a download.
        extension = file_id.rsplit(".", 1)[-1].lower()
        if extension not in self._SUPPORTED_EXTENSIONS:
            raise ValueError(f"Unrecognized file extension {extension}")

        response = requests.get(
            self.url + file_id,
            headers=self._headers,
            timeout=self._timeout,
        )
        # Fail loudly on 401/404/etc. instead of parsing an error page as data.
        response.raise_for_status()
        payload = response.content

        if extension in self._AUDIO_EXTENSIONS:
            # The audio is handed over by path; the file is overwritten on
            # every call, so only the latest attachment is kept on disk.
            with open("tmp.mp3", "wb") as f:
                f.write(payload)
            question_kwargs["additional_args"] = {"attached_audio_file_path": "tmp.mp3"}
        elif extension in self._IMAGE_EXTENSIONS:
            question_kwargs["images"] = [Image.open(BytesIO(payload))]
        elif extension in self._TABLE_EXTENSIONS:
            table = pd.read_excel(BytesIO(payload))
            question_kwargs["additional_args"] = {"attached_table": table}
        else:
            # NOTE(review): the source is passed as raw bytes, not str;
            # confirm whether consumers would prefer decoded text.
            question_kwargs["additional_args"] = {"attached_python_code": payload}
        return question_kwargs