Spaces:

groupeonepoint
/

LongDocumentQuestioner

Sleeping

App Files Files Community

NicolasGaudemet committed on May 27, 2023

Commit

3fd3fa4

1 Parent(s): dc9ded5

Update document_questioner_app.py

Browse files

Files changed (1) hide show

document_questioner_app.py +73 -31

document_questioner_app.py CHANGED Viewed

@@ -6,11 +6,12 @@ from langchain.document_loaders import PyPDFLoader
 from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.vectorstores import Chroma
 from langchain.indexes import VectorstoreIndexCreator
-from langchain.chains import RetrievalQAWithSourcesChain
 from langchain.prompts import PromptTemplate
 from langchain.chat_models import ChatOpenAI
-def question_document(Document, Question):
     # loads a PDF document
     if not Document:
@@ -20,48 +21,89 @@ def question_document(Document, Question):
     loader = PyPDFLoader(Document.name)
     docs = loader.load()
     # Create embeddings
     embeddings = OpenAIEmbeddings(openai_api_key = os.environ['OpenaiKey'])
     # Write in DB
-    docsearch = Chroma.from_documents(docs, embeddings, ids=["page" + str(d.metadata["page"]) for d in docs])
     # Define LLM
-    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.2, openai_api_key = os.environ['OpenaiKey'])
     # Customize map_reduce prompts
-    question_template = """{context}
-    Precise the number starting the above text in your answer. It corresponds to its page number in the document it is from. Label this number as "page".
-    Also make sure to answer in the same langage than the following question.
-    QUESTION : {question}
-    ANSWER :
-    """
-    combine_template = """{summaries}
-    Note that the above text is based on transient extracts from one source document.
-    So make sure to not mention different documents or extracts or passages or portions or texts. There is only one, entire document.
-    Also make sure to answer in the same langage than the following question.
-    QUESTION : {question}.
-    ANSWER :
-    """
-    question_prompt = PromptTemplate(template = question_template, input_variables=['context', 'question'])
-    combine_prompt = PromptTemplate(template = combine_template, input_variables=['summaries', 'question'])
     # Define chain
-    chain_type_kwargs = { "combine_prompt" : combine_prompt, "question_prompt" : question_prompt} #, "return_intermediate_steps" : True}
-    qa = RetrievalQAWithSourcesChain.from_chain_type(llm = llm, chain_type = "map_reduce", chain_type_kwargs = chain_type_kwargs, retriever=docsearch.as_retriever(), return_source_documents = True)
-    answer = qa({"question" : Question}, return_only_outputs = True)
-    return answer["answer"]
-iface = gr.Interface(
-    fn = question_document,
-    inputs= ["file","text"],
-    outputs = gr.Textbox(label="Réponse"),
-    title="Interrogateur de PDF",
-    description="par Nicolas \nPermet d'interroger un document PDF",
-    allow_flagging = "never")
-iface.launch()

 from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.vectorstores import Chroma
 from langchain.indexes import VectorstoreIndexCreator
+from langchain.chains import ConversationalRetrievalChain
 from langchain.prompts import PromptTemplate
 from langchain.chat_models import ChatOpenAI
+from langchain.llms import OpenAI
+def load_document(Document):
+    """Embed an uploaded PDF into a Chroma store and reset the conversation.
+
+    Sets the module-level globals ``k`` (number of documents returned by the
+    loader), ``docsearch`` (the Chroma vector store) and ``chat_history``
+    (reset to an empty list); ``question_document`` reads all three.
+
+    The branch taken when no document is supplied is not visible in this
+    diff excerpt.
+
+    Args:
+        Document: uploaded file object; its ``name`` attribute is handed to
+            ``PyPDFLoader``.
+
+    Returns:
+        A short status message for the "Encodage" text box.
+    """
     # loads a PDF document
     if not Document:
     loader = PyPDFLoader(Document.name)
     docs = loader.load()
+    global k
+    k = len(docs)
     # Create embeddings
     embeddings = OpenAIEmbeddings(openai_api_key = os.environ['OpenaiKey'])
     # Write in DB
+    global docsearch
+    # NOTE(review): confirm that from_documents actually honours the extra k=1 argument.
+    docsearch = Chroma.from_documents(docs, embeddings, ids=["page" + str(d.metadata["page"]) for d in docs], k=1)
+    # A newly encoded document starts a fresh conversation.
+    global chat_history
+    chat_history = []
+    return "Encodage créé"
+def get_chat_history(inputs) -> str:
+    """Render (question, answer) pairs as a newline-separated transcript.
+
+    Each pair yields a "Question : ..." line followed by a "Réponse : ..."
+    line; an empty input yields an empty string.
+    """
+    return "\n".join(
+        f"Question : {human}\nRéponse : {ai}" for human, ai in inputs
+    )
+def question_document(Question):
+    """Answer a question about the encoded PDF and return the transcript.
+
+    Reads the module-level globals ``docsearch``, ``k`` and ``chat_history``
+    that ``load_document`` sets, and appends the new exchange to
+    ``chat_history``.
+
+    Args:
+        Question: the user's question text.
+
+    Returns:
+        A message asking the user to encode a PDF when ``docsearch`` is not
+        defined yet; otherwise the whole conversation so far, formatted by
+        ``get_chat_history``.
+    """
+    if "docsearch" not in globals():
+        return "Merci d'encoder un document PDF"
     # Define LLM
+    turbo = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, openai_api_key = os.environ['OpenaiKey'])
     # Customize map_reduce prompts
+    #question_template = """{context}
+    #Precise the number starting the above text in your answer. It corresponds to its page number in the document it is from. Label this number as "page".
+    #Also make sure to answer in the same langage than the following question.
+    #QUESTION : {question}
+    #ANSWER :
+    #"""
+    #combine_template = """{summaries}
+    #Note that the above text is based on transient extracts from one source document.
+    #So make sure to not mention different documents or extracts or passages or portions or texts. There is only one, entire document.
+    #Also make sure to answer in the same langage than the following question.
+    #QUESTION : {question}.
+    #ANSWER :
+    #"""
+    #question_prompt = PromptTemplate(template = question_template, input_variables=['context', 'question'])
+    #combine_prompt = PromptTemplate(template = combine_template, input_variables=['summaries', 'question'])
     # Define chain
+    #chain_type_kwargs = { "combine_prompt" : combine_prompt, "question_prompt" : question_prompt} #, "return_intermediate_steps" : True}
+    #qa = RetrievalQAWithSourcesChain.from_chain_type(llm = llm, chain_type = "map_reduce", chain_type_kwargs = chain_type_kwargs, retriever=docsearch.as_retriever(), return_source_documents = True)
+    vectordbkwargs = {"search_distance": 10}
+    # k is len(docs) from load_document, so the retriever may return every loaded chunk.
+    search_kwargs={"k" : k}
+    qa = ConversationalRetrievalChain.from_llm(llm = turbo, chain_type = "map_reduce",retriever=docsearch.as_retriever(search_kwargs = search_kwargs), get_chat_history = get_chat_history, return_source_documents = True)
+    answer = qa({"question" : Question,"chat_history":chat_history, "vectordbkwargs": vectordbkwargs}, return_only_outputs = True)
+    chat_history.append((Question, answer["answer"]))
+    #answer = qa({"question" : Question}, )
+    print(answer)
+    # get_chat_history already returns a single string, so no extra join is needed.
+    return get_chat_history(chat_history)
+# Two-column UI: encode a PDF on the left, ask questions about it on the right.
+with gr.Blocks() as demo:
+    gr.Markdown(
+    """
+    # Interrogateur de PDF
+    par Nicolas et Alex
+    """)
+    with gr.Row():
+        # Left column: upload a PDF and build its embeddings.
+        with gr.Column():
+            # Top-level components replace the deprecated gr.inputs / gr.outputs
+            # namespaces, matching the gr.Textbox already used for the answer.
+            input_file = gr.File(label="Charger un document")
+            greet_btnee = gr.Button("Encoder le document")
+            output_words = gr.Textbox(label="Encodage")
+            greet_btnee.click(fn=load_document, inputs=input_file, outputs=output_words)
+        # Right column: ask a question and show the conversation so far.
+        with gr.Column():
+            text = gr.Textbox(label="Question")
+            greet_btn = gr.Button("Poser une question")
+            answer = gr.Textbox(label="Réponse", lines=8)
+            greet_btn.click(fn=question_document, inputs=text, outputs=answer)
+demo.launch()