from io import BytesIO

import pandas as pd
import requests


class QuestionLoader:
    """Convert a GAIA question record into a dict of keyword arguments.

    Attachments referenced by a question's ``file_name`` are downloaded from
    the GAIA 2023 validation folder on the Hugging Face Hub, using the bearer
    token supplied at construction time.
    """

    # Supported attachment types, grouped by how the payload is handed over.
    _AUDIO_EXTENSIONS = frozenset({"mp3"})
    _IMAGE_EXTENSIONS = frozenset({"png", "jpeg", "jpg"})
    _TABLE_EXTENSIONS = frozenset({"xlsx"})
    _CODE_EXTENSIONS = frozenset({"py"})
    _SUPPORTED_EXTENSIONS = (
        _AUDIO_EXTENSIONS | _IMAGE_EXTENSIONS | _TABLE_EXTENSIONS | _CODE_EXTENSIONS
    )

    def __init__(self, token: str):
        """Store the authorization header and the base download URL.

        Args:
            token: Bearer token sent with every attachment download.
        """
        self._headers = {"Authorization": f"Bearer {token}"}
        self.url = "https://huggingface.co/datasets/gaia-benchmark/GAIA/resolve/main/2023/validation/"

    def preprocess_question(self, question: dict) -> dict:
        """Build the keyword-argument dict for one question.

        Args:
            question: Mapping read with ``.get``; the keys used are
                ``"question"`` (task text) and ``"file_name"`` (optional
                attachment name, relative to ``self.url``).

        Returns:
            A dict that always contains ``"task"``. Depending on the
            attachment type it also contains ``"images"`` (a one-element
            list) or ``"additional_args"`` (a dict with one of
            ``"attached_audio_file_path"``, ``"attached_table"`` or
            ``"attached_python_code"``).

        Raises:
            ValueError: If the attachment's extension is not supported.
            requests.HTTPError: If the download returns an error status.
        """
        question_kwargs = {"task": question.get("question")}

        file_id = question.get("file_name")
        if not file_id:
            return question_kwargs

        # Validate the extension before downloading so an unsupported file
        # fails fast without a network round trip. Matching is
        # case-insensitive so "PNG" and "png" are treated alike.
        extension = file_id.rsplit(".", 1)[-1].lower()
        if extension not in self._SUPPORTED_EXTENSIONS:
            raise ValueError(f"Unrecognized file extension {extension}")

        response = requests.get(self.url + file_id, headers=self._headers)
        # Without this check an HTTP error page (e.g. 401/404) would be
        # treated as the attachment's content.
        response.raise_for_status()
        payload = response.content

        if extension in self._AUDIO_EXTENSIONS:
            # NOTE: the audio is always written to the same file in the
            # current working directory, so each call overwrites the last.
            with open("tmp.mp3", "wb") as f:
                f.write(payload)
            question_kwargs["additional_args"] = {"attached_audio_file_path": "tmp.mp3"}
        elif extension in self._IMAGE_EXTENSIONS:
            # NOTE(review): `Image` is not imported in the visible file;
            # presumably `from PIL import Image` is intended -- confirm and
            # add the import, otherwise this branch raises NameError.
            question_kwargs["images"] = [Image.open(BytesIO(payload))]
        elif extension in self._TABLE_EXTENSIONS:
            table = pd.read_excel(BytesIO(payload))
            question_kwargs["additional_args"] = {"attached_table": table}
        else:
            # Only the code extensions remain. The source is passed as raw
            # bytes, not decoded text.
            question_kwargs["additional_args"] = {"attached_python_code": payload}

        return question_kwargs