| import requests | |
| import pandas as pd | |
| from io import BytesIO | |
| from PIL import Image | |
class QuestionLoader:
    """Convert GAIA question records into keyword-argument dictionaries.

    Attachments referenced by a record's ``file_name`` are downloaded from
    the 2023 validation split of the ``gaia-benchmark/GAIA`` dataset on the
    Hugging Face Hub, using a bearer token for authorization.
    """

    # Supported attachment extensions (compared in lower case). They are
    # checked before any network traffic so unsupported files fail fast.
    _AUDIO_EXTENSIONS = frozenset({"mp3"})
    _IMAGE_EXTENSIONS = frozenset({"png", "jpeg", "jpg"})
    _TABLE_EXTENSIONS = frozenset({"xlsx"})
    _CODE_EXTENSIONS = frozenset({"py"})
    _SUPPORTED_EXTENSIONS = (
        _AUDIO_EXTENSIONS | _IMAGE_EXTENSIONS | _TABLE_EXTENSIONS | _CODE_EXTENSIONS
    )

    def __init__(self, token: str, timeout: float = 30.0):
        """Store the authorization header and the dataset base URL.

        Args:
            token: Access token sent as ``Authorization: Bearer <token>``.
            timeout: Seconds to wait for the server when downloading an
                attachment; passed to ``requests.get``.
        """
        self._headers = {"Authorization": f"Bearer {token}"}
        self._timeout = timeout
        self.url = "https://huggingface.co/datasets/gaia-benchmark/GAIA/resolve/main/2023/validation/"

    def preprocess_question(self, question: dict) -> dict:
        """Build the keyword arguments for one question record.

        Args:
            question: Record read with the keys ``"question"`` and,
                optionally, ``"file_name"``.

        Returns:
            A dict that always contains ``"task"`` (the question text).
            Depending on the attachment type it also contains either
            ``"images"`` (a one-element list holding a PIL image) or
            ``"additional_args"`` with one of ``"attached_audio_file_path"``,
            ``"attached_table"`` or ``"attached_python_code"``.

        Raises:
            ValueError: If the attachment has an unsupported extension.
                Raised before anything is downloaded.
            requests.HTTPError: If the download returns an error status.
        """
        question_kwargs = {"task": question.get("question")}

        file_id = question.get("file_name")
        if not file_id:
            return question_kwargs

        extension = file_id.rsplit(".", 1)[-1]
        normalized = extension.lower()
        if normalized not in self._SUPPORTED_EXTENSIONS:
            raise ValueError(f"Unrecognized file extenstion {extension}")

        response = requests.get(
            self.url + file_id, headers=self._headers, timeout=self._timeout
        )
        # Without this check an error page (e.g. 401/404) would be treated as
        # the attachment and saved or parsed as if it were valid data.
        response.raise_for_status()
        payload = response.content

        if normalized in self._AUDIO_EXTENSIONS:
            # The audio is handed over by path; the file is written to the
            # current working directory and overwritten on every call.
            with open("tmp.mp3", "wb") as f:
                f.write(payload)
            question_kwargs["additional_args"] = {"attached_audio_file_path": "tmp.mp3"}
        elif normalized in self._IMAGE_EXTENSIONS:
            question_kwargs["images"] = [Image.open(BytesIO(payload))]
        elif normalized in self._TABLE_EXTENSIONS:
            question_kwargs["additional_args"] = {"attached_table": pd.read_excel(BytesIO(payload))}
        else:
            # Only the code extensions remain after the validation above.
            # NOTE(review): the source is passed as raw bytes, not decoded
            # text -- confirm that is what the consumer expects.
            question_kwargs["additional_args"] = {"attached_python_code": payload}
        return question_kwargs