Spaces:
Sleeping
Sleeping
| from dotenv import load_dotenv | |
| import streamlit as st | |
| import pickle | |
| from PyPDF2 import PdfReader | |
| from transformers import pipeline | |
| from sentence_transformers import SentenceTransformer | |
| import os | |
| import numpy as np | |
| # Load environment variables from .env file | |
| load_dotenv() | |
| # Define a function to manually chunk text | |
def chunk_text(text, chunk_size=1000, chunk_overlap=200):
    """Split *text* into fixed-size, overlapping character windows.

    Consecutive chunks start ``chunk_size - chunk_overlap`` characters apart,
    so each chunk repeats the last ``chunk_overlap`` characters of the
    previous one. The final chunk may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        chunk_overlap: Number of characters shared by neighbouring chunks;
            must be smaller than ``chunk_size``.

    Returns:
        A list of substrings of ``text`` (empty when ``text`` is empty).

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``chunk_overlap`` is
            not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if chunk_overlap >= chunk_size:
        # A stride of zero or less would never advance through the text and
        # the original loop would append chunks forever.
        raise ValueError("chunk_overlap must be smaller than chunk_size")

    step = chunk_size - chunk_overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]
| # Function to generate embeddings using sentence-transformers | |
def generate_embeddings(text_chunks, model_name='all-MiniLM-L6-v2'):
    """Encode text chunks with a sentence-transformers model.

    Args:
        text_chunks: The strings to embed; passed straight to
            ``SentenceTransformer.encode``.
        model_name: Model identifier handed to ``SentenceTransformer``.

    Returns:
        The result of ``encode(..., convert_to_tensor=False)`` for the
        given chunks.
    """
    # A new model instance is constructed on every call.
    encoder = SentenceTransformer(model_name)
    return encoder.encode(text_chunks, convert_to_tensor=False)
| # Function to find the most relevant chunk based on the cosine similarity | |
def find_best_chunk(query_embedding, text_embeddings):
    """Return the index and score of the embedding most similar to the query.

    Similarity is the cosine of the angle between ``query_embedding`` and
    each row of ``text_embeddings``.

    Args:
        query_embedding: 1-D vector for the query.
        text_embeddings: 2-D collection with one embedding vector per row.

    Returns:
        A ``(best_index, best_similarity)`` tuple, where ``best_index`` is the
        row of ``text_embeddings`` with the highest cosine similarity.

    Raises:
        ValueError: If ``text_embeddings`` is empty or not two-dimensional.
    """
    chunk_matrix = np.asarray(text_embeddings)
    query_vector = np.asarray(query_embedding)
    if chunk_matrix.ndim != 2 or chunk_matrix.shape[0] == 0:
        raise ValueError("text_embeddings must be a non-empty 2-D array of vectors")

    dot_products = np.dot(chunk_matrix, query_vector)
    norm_products = np.linalg.norm(chunk_matrix, axis=1) * np.linalg.norm(query_vector)

    # A zero-length vector has no direction: dividing by its norm yields NaN,
    # and np.argmax would then report the NaN position as the "best" match.
    # Treat such pairs as having similarity 0 instead.
    has_direction = norm_products > 0
    safe_norms = np.where(has_direction, norm_products, 1.0)
    cosine_similarities = np.where(has_direction, dot_products / safe_norms, 0.0)

    best_index = np.argmax(cosine_similarities)
    return best_index, cosine_similarities[best_index]
| # Main Streamlit app function | |
def main():
    """Render the Streamlit PDF question-answering page.

    Flow: upload a PDF, extract its text, split it into overlapping chunks,
    embed the chunks (cached on disk as ``<pdf name>.pkl``), then answer a
    typed question by running an extractive QA model over the chunk whose
    embedding is most similar to the question.
    """
    st.header("LLM-powered PDF Chatbot π¬")

    # Upload a PDF file
    pdf = st.file_uploader("Upload your PDF", type='pdf')
    if pdf is None:
        return

    pdf_reader = PdfReader(pdf)
    # Guard with `or ""` in case a page yields no text (e.g. None for an
    # image-only page), and join once instead of repeated concatenation.
    text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

    # Split text into chunks
    chunks = chunk_text(text)

    # Derive the cache name from the upload's base name without assuming a
    # four-character extension or trusting any directory components.
    store_name = os.path.splitext(os.path.basename(pdf.name))[0]
    st.write(f'{store_name}')

    if not chunks:
        # Nothing to embed or search; continuing would fail on the first query.
        st.warning("No extractable text was found in this PDF.")
        return

    # Generate embeddings for the chunks (or reuse the cached ones)
    cache_path = f"{store_name}.pkl"
    text_embeddings = None
    if os.path.exists(cache_path):
        # NOTE(security): pickle.load can execute arbitrary code; this is only
        # safe while the .pkl files in the working directory are trusted.
        with open(cache_path, "rb") as f:
            cached_embeddings = pickle.load(f)
        # The cache is keyed by file name only, so a different PDF with the
        # same name would otherwise be searched with mismatched embeddings.
        if len(cached_embeddings) == len(chunks):
            text_embeddings = cached_embeddings
            st.write('Embeddings Loaded from the Disk')

    if text_embeddings is None:
        text_embeddings = generate_embeddings(chunks)
        with open(cache_path, "wb") as f:
            pickle.dump(text_embeddings, f)

    # Accept user questions/query
    query = st.text_input("Ask questions about your PDF file:")
    if query:
        # Generate embeddings for the query
        query_embedding = generate_embeddings([query])[0]

        # Find the best chunk for the query
        best_index, similarity = find_best_chunk(query_embedding, text_embeddings)
        best_chunk = chunks[best_index]

        # Use Hugging Face pipeline for question answering
        qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
        result = qa_pipeline(question=query, context=best_chunk)
        st.write(result['answer'])
# set_bg_from_url(url, opacity=1)
# Emits two pieces of raw HTML through st.markdown(..., unsafe_allow_html=True):
#   1. a footer containing a Bootstrap stylesheet link, a "Made by" credit and
#      LinkedIn / GitHub icon links;
#   2. a <style> block whose `body` rule sets `url` as the background image and
#      `opacity` as the opacity value.
# `url` and `opacity` are interpolated into the CSS without escaping.
# NOTE(review): the name suggests only the background is set, but the footer is
# rendered by this function as well.
| def set_bg_from_url(url, opacity=1): | |
| footer = """ | |
| <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.2.0/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-gH2yIJqKdNHPEq0n4Mqa/HGKIhSkIHeL5AyhkYV8i59U5AR6csBvApHHNl/vI1Bx" crossorigin="anonymous"> | |
| <footer> | |
| <div style='visibility: visible;margin-top:7rem;justify-content:center;display:flex;'> | |
| <p style="font-size:1.1rem;"> | |
| Made by Asmae El-ghezzaz | |
| |
| <a href="https://www.linkedin.com/in/asmae-el-ghezzaz/"> | |
| <svg xmlns="http://www.w3.org/2000/svg" width="23" height="23" fill="white" class="bi bi-linkedin" viewBox="0 0 16 16"> | |
| <path d="M0 1.146C0 .513.526 0 1.175 0h13.65C15.474 0 16 .513 16 1.146v13.708c0 .633-.526 1.146-1.175 1.146H1.175C.526 16 0 15.487 0 14.854V1.146zm4.943 12.248V6.169H2.542v7.225h2.401zm-1.2-8.212c.837 0 1.358-.554 1.358-1.248-.015-.709-.52-1.248-1.342-1.248-.822 0-1.359.54-1.359 1.248 0 .694.521 1.248 1.327 1.248h.016zm4.908 8.212V9.359c0-.216.016-.432.08-.586.173-.431.568-.878 1.232-.878.869 0 1.216.662 1.216 1.634v3.865h2.401V9.25c0-2.22-1.184-3.252-2.764-3.252-1.274 0-1.845.7-2.165 1.193v.025h-.016a5.54 5.54 0 0 1 .016-.025V6.169h-2.4c.03.678 0 7.225 0 7.225h2.4z"/> | |
| </svg> | |
| </a> | |
| |
| <a href="https://github.com/aelghezzaz"> | |
| <svg xmlns="http://www.w3.org/2000/svg" width="23" height="23" fill="white" class="bi bi-github" viewBox="0 0 16 16"> | |
| <path d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.012 8.012 0 0 0 16 8c0-4.42-3.58-8-8-8z"/> | |
| </svg> | |
| </a> | |
| </p> | |
| </div> | |
| </footer> | |
| """ | |
# Render the footer markup as raw HTML.
| st.markdown(footer, unsafe_allow_html=True) | |
| # Set background image using HTML and CSS | |
# NOTE(review): `opacity` is declared on the `body` rule itself, not on a
# background-only layer; presumably this fades the page content too - confirm
# that is the intended effect.
| st.markdown( | |
| f""" | |
| <style> | |
| body {{ | |
| background: url('{url}') no-repeat center center fixed; | |
| background-size: cover; | |
| opacity: {opacity}; | |
| }} | |
| </style> | |
| """, | |
| unsafe_allow_html=True | |
| ) | |
# Apply the footer and page background at script load, before the app body
# is rendered by main().
set_bg_from_url(
    "https://www.1access.com/wp-content/uploads/2019/10/GettyImages-1180389186.jpg",
    opacity=0.5,
)


# Run the app only when the file is executed as a script.
if __name__ == '__main__':
    main()