File size: 1,336 Bytes
7993ea5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import requests
import pandas as pd
from io import BytesIO

class QuestionLoader:
    """Build keyword-argument dicts from GAIA validation question records.

    Each record may reference an attachment by file name. The attachment is
    downloaded from the GAIA dataset repository on the Hugging Face Hub and
    converted into a form that depends on its extension (audio path, image,
    table, or raw python source).
    """

    # Seconds to wait on the dataset host; without a timeout ``requests``
    # can block forever on a stalled connection.
    _REQUEST_TIMEOUT = 60

    # Audio attachments are written to this fixed path in the current working
    # directory, so every audio question overwrites the previous file.
    _AUDIO_PATH = "tmp.mp3"

    _AUDIO_EXTENSIONS = frozenset({"mp3"})
    _IMAGE_EXTENSIONS = frozenset({"png", "jpeg", "jpg"})
    _TABLE_EXTENSIONS = frozenset({"xlsx"})
    _CODE_EXTENSIONS = frozenset({"py"})
    _SUPPORTED_EXTENSIONS = (
        _AUDIO_EXTENSIONS | _IMAGE_EXTENSIONS | _TABLE_EXTENSIONS | _CODE_EXTENSIONS
    )

    def __init__(self, token: str) -> None:
        """Store the bearer token header and the base URL for attachments.

        Args:
            token: Access token sent as ``Authorization: Bearer <token>``
                with every attachment download.
        """
        self._headers = {"Authorization": f"Bearer {token}"}
        self.url = "https://huggingface.co/datasets/gaia-benchmark/GAIA/resolve/main/2023/validation/"

    def preprocess_question(self, question: dict) -> dict:
        """Convert one question record into a keyword-argument dict.

        Args:
            question: Mapping read with the keys ``"question"`` (the task
                text) and ``"file_name"`` (attachment name; may be missing
                or empty when there is no attachment).

        Returns:
            A dict that always contains ``"task"``. Depending on the
            attachment type it also contains either ``"images"`` (a
            one-element list holding the opened image) or
            ``"additional_args"`` with one of ``"attached_audio_file_path"``,
            ``"attached_table"`` (a DataFrame) or ``"attached_python_code"``
            (the raw downloaded bytes).

        Raises:
            ValueError: If the attachment extension is not supported.
            requests.HTTPError: If the download returns an error status.
        """
        question_kwargs = {"task": question.get("question")}

        file_id = question.get("file_name")
        if not file_id:
            return question_kwargs

        extension = file_id.split(".")[-1]
        kind = extension.lower()  # accept "PNG", "Mp3", ... as well
        # Reject unsupported attachments before spending a network round-trip.
        if kind not in self._SUPPORTED_EXTENSIONS:
            raise ValueError(f"Unrecognized file extenstion {extension}")

        response = requests.get(
            self.url + file_id,
            headers=self._headers,
            timeout=self._REQUEST_TIMEOUT,
        )
        # Fail loudly on 401/404/... instead of treating an error page as the
        # attachment payload.
        response.raise_for_status()
        payload = response.content

        if kind in self._AUDIO_EXTENSIONS:
            with open(self._AUDIO_PATH, "wb") as audio_file:
                audio_file.write(payload)
            question_kwargs["additional_args"] = {
                "attached_audio_file_path": self._AUDIO_PATH
            }
        elif kind in self._IMAGE_EXTENSIONS:
            # Imported here because the module never imports ``Image`` at the
            # top level; Pillow is only required when an image is attached.
            from PIL import Image

            question_kwargs["images"] = [Image.open(BytesIO(payload))]
        elif kind in self._TABLE_EXTENSIONS:
            question_kwargs["additional_args"] = {
                "attached_table": pd.read_excel(BytesIO(payload))
            }
        else:
            # Only the python-source extension remains after validation.
            question_kwargs["additional_args"] = {"attached_python_code": payload}
        return question_kwargs