Spaces:

sunbal7
/

PDFQueryApplication

Sleeping

App Files Files Community

sunbal7 committed on Jun 19

Commit

3acced2

verified ·

1 Parent(s): 99e6cea

Update app.py

Browse files

Files changed (1) hide show

app.py +100 -36

app.py CHANGED Viewed

@@ -4,11 +4,11 @@ import fitz  # PyMuPDF
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
-from langchain_community.llms import HuggingFaceHub
 from langchain.chains import RetrievalQA
-import tempfile
 import os
-import base64
 # Page configuration
 st.set_page_config(
@@ -72,29 +72,52 @@ st.markdown("""
         from { opacity: 0; }
         to { opacity: 1; }
     }
 </style>
 """, unsafe_allow_html=True)
 # Initialize session state
 if 'pdf_processed' not in st.session_state:
     st.session_state.pdf_processed = False
-if 'qa_chain' not in st.session_state:
-    st.session_state.qa_chain = None
 if 'pages' not in st.session_state:
     st.session_state.pages = []
-# Load models with caching
 @st.cache_resource
 def load_embedding_model():
     return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-@st.cache_resource
-def load_qa_model():
-    return HuggingFaceHub(
-        repo_id="google/flan-t5-xxl",
-        model_kwargs={"temperature": 0.5, "max_length": 512},
-        huggingfacehub_api_token=os.getenv("HF_API_KEY")
-    )
 def process_pdf(pdf_file):
     """Extract text from PDF and create vector store"""
@@ -103,8 +126,9 @@ def process_pdf(pdf_file):
         text = ""
         st.session_state.pages = []
         for page in doc:
-            text += page.get_text()
-            st.session_state.pages.append(page.get_text())
     with st.spinner("🔍 Processing text..."):
         text_splitter = RecursiveCharacterTextSplitter(
@@ -115,19 +139,44 @@ def process_pdf(pdf_file):
         chunks = text_splitter.split_text(text)
         embeddings = load_embedding_model()
-        vector_store = FAISS.from_texts(chunks, embeddings)
-        qa_model = load_qa_model()
-        st.session_state.qa_chain = RetrievalQA.from_chain_type(
-            llm=qa_model,
-            chain_type="stuff",
-            retriever=vector_store.as_retriever(search_kwargs={"k": 3}),
-            return_source_documents=True
-        )
     st.session_state.pdf_processed = True
     st.success("✅ PDF processed successfully!")
 def generate_qa_for_chapter(start_page, end_page):
     """Generate Q&A for specific chapter pages"""
     if start_page < 1 or end_page > len(st.session_state.pages) or start_page > end_page:
@@ -144,17 +193,19 @@ def generate_qa_for_chapter(start_page, end_page):
     chunks = text_splitter.split_text(chapter_text)
     qa_pairs = []
-    qa_model = load_qa_model()
     with st.spinner(f"🧠 Generating Q&A for pages {start_page}-{end_page}..."):
         for i, chunk in enumerate(chunks):
             if i % 2 == 0:  # Generate question
-                prompt = f"Generate a study question based on: {chunk[:500]}"
-                question = qa_model(prompt)[:120] + "?"
             else:  # Generate answer
-                prompt = f"Answer the question: {qa_pairs[-1][0]} using context: {chunk[:500]}"
-                answer = qa_model(prompt)
-                qa_pairs[-1] = (qa_pairs[-1][0], answer)
     return qa_pairs
@@ -175,8 +226,8 @@ if pdf_file:
         # Navigation tabs
         selected_tab = option_menu(
             None,
-            ["Ask Questions", "Generate Chapter Q&A"],
-            icons=["chat", "book"],
             menu_icon="cast",
             default_index=0,
             orientation="horizontal",
@@ -194,11 +245,11 @@ if pdf_file:
             if user_question:
                 with st.spinner("🤔 Thinking..."):
-                    result = st.session_state.qa_chain({"query": user_question})
-                    st.markdown(f"<div class='card'><b>Answer:</b> {result['result']}</div>", unsafe_allow_html=True)
                     with st.expander("🔍 See source passages"):
-                        for i, doc in enumerate(result["source_documents"]):
                             st.markdown(f"**Passage {i+1}:** {doc.page_content[:500]}...")
         # Chapter Q&A Generation Tab
@@ -224,11 +275,24 @@ if pdf_file:
                         """, unsafe_allow_html=True)
                 else:
                     st.warning("No Q&A pairs generated. Try a different page range.")
 # Footer
 st.markdown("---")
 st.markdown("""
 <div style="text-align: center; padding: 20px;">
-    Built with ❤️ for students | PDF Study Assistant v1.0
 </div>
 """, unsafe_allow_html=True)

 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
 from langchain.chains import RetrievalQA
+from langchain_community.llms import HuggingFaceEndpoint
+import requests
 import os
+import json
 # Page configuration
 st.set_page_config(
         from { opacity: 0; }
         to { opacity: 1; }
     }
+    .spinner {
+        display: flex;
+        justify-content: center;
+        align-items: center;
+        height: 100px;
+    }
 </style>
 """, unsafe_allow_html=True)
 # Initialize session state
 if 'pdf_processed' not in st.session_state:
     st.session_state.pdf_processed = False
+if 'vector_store' not in st.session_state:
+    st.session_state.vector_store = None
 if 'pages' not in st.session_state:
     st.session_state.pages = []
+if 'history' not in st.session_state:
+    st.session_state.history = []
+# Load embedding model with caching
 @st.cache_resource
 def load_embedding_model():
     """Create the HuggingFace sentence-embedding model used for indexing.

     Decorated with ``st.cache_resource`` so the model object is reused
     instead of being rebuilt on each call.
     """
     model_name = "sentence-transformers/all-MiniLM-L6-v2"
     return HuggingFaceEmbeddings(model_name=model_name)
+def query_hf_inference_api(prompt, model="google/flan-t5-xxl", max_tokens=200):
+    """Query Hugging Face Inference API directly"""
+    API_URL = f"https://api-inference.huggingface.co/models/{model}"
+    headers = {"Authorization": f"Bearer {os.getenv('HF_API_KEY')}"}
+    payload = {
+        "inputs": prompt,
+        "parameters": {
+            "max_new_tokens": max_tokens,
+            "temperature": 0.5,
+            "do_sample": False
+        }
+    }
+    try:
+        response = requests.post(API_URL, headers=headers, json=payload)
+        response.raise_for_status()
+        result = response.json()
+        return result[0]['generated_text'] if result else ""
+    except Exception as e:
+        st.error(f"Error querying model: {str(e)}")
+        return ""
 def process_pdf(pdf_file):
     """Extract text from PDF and create vector store"""
         text = ""
         st.session_state.pages = []
         for page in doc:
+            page_text = page.get_text()
+            text += page_text
+            st.session_state.pages.append(page_text)
     with st.spinner("🔍 Processing text..."):
         text_splitter = RecursiveCharacterTextSplitter(
         chunks = text_splitter.split_text(text)
         embeddings = load_embedding_model()
+        st.session_state.vector_store = FAISS.from_texts(chunks, embeddings)
     st.session_state.pdf_processed = True
     st.success("✅ PDF processed successfully!")
+def ask_question(question):
+    """Answer a question using the vector store and Hugging Face API"""
+    if not st.session_state.vector_store:
+        return "PDF not processed yet", []
+    # Find relevant passages
+    docs = st.session_state.vector_store.similarity_search(question, k=3)
+    context = "\n\n".join([doc.page_content for doc in docs])
+    # Format prompt for the model
+    prompt = f"""
+    Based on the following context, answer the question.
+    If the answer isn't in the context, say "I don't know".
+    Context:
+    {context}
+    Question: {question}
+    Answer:
+    """
+    # Query the model
+    answer = query_hf_inference_api(prompt)
+    # Add to history
+    st.session_state.history.append({
+        "question": question,
+        "answer": answer,
+        "sources": [doc.page_content for doc in docs]
+    })
+    return answer, docs
 def generate_qa_for_chapter(start_page, end_page):
     """Generate Q&A for specific chapter pages"""
     if start_page < 1 or end_page > len(st.session_state.pages) or start_page > end_page:
     chunks = text_splitter.split_text(chapter_text)
     qa_pairs = []
     with st.spinner(f"🧠 Generating Q&A for pages {start_page}-{end_page}..."):
         for i, chunk in enumerate(chunks):
             if i % 2 == 0:  # Generate question
+                prompt = f"Based on this text, generate one study question: {chunk[:500]}"
+                question = query_hf_inference_api(prompt, max_tokens=100)
+                if question and not question.endswith("?"):
+                    question += "?"
             else:  # Generate answer
+                if qa_pairs:  # Ensure we have a question to answer
+                    prompt = f"Answer this question: {qa_pairs[-1][0]} using this context: {chunk[:500]}"
+                    answer = query_hf_inference_api(prompt, max_tokens=200)
+                    qa_pairs[-1] = (qa_pairs[-1][0], answer)
     return qa_pairs
         # Navigation tabs
         selected_tab = option_menu(
             None,
+            ["Ask Questions", "Generate Chapter Q&A", "History"],
+            icons=["chat", "book", "clock-history"],
             menu_icon="cast",
             default_index=0,
             orientation="horizontal",
             if user_question:
                 with st.spinner("🤔 Thinking..."):
+                    answer, docs = ask_question(user_question)
+                    st.markdown(f"<div class='card'><b>Answer:</b> {answer}</div>", unsafe_allow_html=True)
                     with st.expander("🔍 See source passages"):
+                        for i, doc in enumerate(docs):
                             st.markdown(f"**Passage {i+1}:** {doc.page_content[:500]}...")
         # Chapter Q&A Generation Tab
                         """, unsafe_allow_html=True)
                 else:
                     st.warning("No Q&A pairs generated. Try a different page range.")
+        # History Tab
+        elif selected_tab == "History":
+            st.markdown("### ⏳ Question History")
+            if not st.session_state.history:
+                st.info("No questions asked yet.")
+            else:
+                for i, item in enumerate(reversed(st.session_state.history)):
+                    with st.expander(f"Q{i+1}: {item['question']}"):
+                        st.markdown(f"**Answer:** {item['answer']}")
+                        st.markdown("**Source Passages:**")
+                        for j, source in enumerate(item['sources']):
+                            st.markdown(f"{j+1}. {source[:500]}...")
 # Footer
 st.markdown("---")
 st.markdown("""
 <div style="text-align: center; padding: 20px;">
+    Built with ❤️ for students | PDF Study Assistant v2.0
 </div>
 """, unsafe_allow_html=True)