| """Generation and evaluation of evals.""" | |
| from random import randint | |
| from typing import ClassVar | |
| import numpy as np | |
| import pandas as pd | |
| from pydantic import BaseModel, Field, field_validator | |
| from sqlmodel import Session, func, select | |
| from tqdm.auto import tqdm, trange | |
| from raglite._config import RAGLiteConfig | |
| from raglite._database import Chunk, Document, Eval, create_database_engine | |
| from raglite._extract import extract_with_llm | |
| from raglite._rag import rag | |
| from raglite._search import hybrid_search, retrieve_segments, vector_search | |
| from raglite._typing import SearchMethod | |


def insert_evals(  # noqa: C901
    *, num_evals: int = 100, max_contexts_per_eval: int = 20, config: RAGLiteConfig | None = None
) -> None:
    """Generate and insert evals into the database."""

    class QuestionResponse(BaseModel):
        """A specific question about the content of a set of document contexts."""

        question: str = Field(
            ...,
            description="A specific question about the content of a set of document contexts.",
            min_length=1,
        )
        system_prompt: ClassVar[str] = """
You are given a set of contexts extracted from a document.
You are a subject matter expert on the document's topic.
Your task is to generate a question to quiz other subject matter experts on the information in the provided context.
The question MUST satisfy ALL of the following criteria:
- The question SHOULD integrate as much of the provided context as possible.
- The question MUST NOT be a general or open question, but MUST instead be as specific to the provided context as possible.
- The question MUST be completely answerable using ONLY the information in the provided context, without depending on any background information.
- The question MUST be entirely self-contained and able to be understood in full WITHOUT access to the provided context.
- The question MUST NOT reference the existence of the context, directly or indirectly.
- The question MUST treat the context as if its contents are entirely part of your working memory.
            """.strip()

        @field_validator("question")
        @classmethod
        def validate_question(cls, value: str) -> str:
            """Validate the question."""
            question = value.strip().lower()
            if "context" in question or "document" in question or "question" in question:
                # Reject questions that reference the context, the document, or the question itself.
                raise ValueError
            if not question.endswith("?"):
                # Reject questions that don't end with a question mark.
                raise ValueError
            return value
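
    # Note: `extract_with_llm` appears to use the response model's `system_prompt` ClassVar as
    # the LLM's system prompt, and a generated question that fails `validate_question` is
    # expected to surface as a ValueError, which the loop below catches to skip that eval.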
    config = config or RAGLiteConfig()
    engine = create_database_engine(config)
    with Session(engine) as session:
        for _ in trange(num_evals, desc="Generating evals", unit="eval", dynamic_ncols=True):
            # Sample a random document from the database.
            seed_document = session.exec(select(Document).order_by(func.random()).limit(1)).first()
            if seed_document is None:
                error_message = "First run `insert_document()` before generating evals."
                raise ValueError(error_message)
            # Sample a random chunk from that document.
            seed_chunk = session.exec(
                select(Chunk)
                .where(Chunk.document_id == seed_document.id)
                .order_by(func.random())
                .limit(1)
            ).first()
            if seed_chunk is None:
                continue
            # Expand the seed chunk into a set of related chunks.
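            # (The seed chunk's embedding may be a matrix of multiple vectors; averaging it
            # along axis 0 yields a single query vector to search with.)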
            related_chunk_ids, _ = vector_search(
                np.mean(seed_chunk.embedding_matrix, axis=0, keepdims=True),
                num_results=randint(2, max_contexts_per_eval // 2),  # noqa: S311
                config=config,
            )
            related_chunks = retrieve_segments(related_chunk_ids, config=config)
            # Extract a question from the seed chunk's related chunks.
            try:
                question_response = extract_with_llm(
                    QuestionResponse, related_chunks, config=config
                )
            except ValueError:
                continue
            else:
                question = question_response.question
            # Search for candidate chunks to answer the generated question.
            candidate_chunk_ids, _ = hybrid_search(
                question, num_results=max_contexts_per_eval, config=config
            )
            candidate_chunks = [session.get(Chunk, chunk_id) for chunk_id in candidate_chunk_ids]

            # Determine which candidate chunks are relevant to answer the generated question.
            class ContextEvalResponse(BaseModel):
                """Indicate whether the provided context can be used to answer a given question."""

                hit: bool = Field(
                    ...,
                    description="True if the provided context contains (a part of) the answer to the given question, false otherwise.",
                )
                system_prompt: ClassVar[str] = f"""
You are given a context extracted from a document.
You are a subject matter expert on the document's topic.
Your task is to answer whether the provided context contains (a part of) the answer to this question: "{question}"
An example of a context that does NOT contain (a part of) the answer is a table of contents.
                """.strip()

            relevant_chunks = []
            for candidate_chunk in tqdm(
                candidate_chunks, desc="Evaluating chunks", unit="chunk", dynamic_ncols=True
            ):
                try:
                    context_eval_response = extract_with_llm(
                        ContextEvalResponse, str(candidate_chunk), config=config
                    )
                except ValueError:  # noqa: PERF203
                    pass
                else:
                    if context_eval_response.hit:
                        relevant_chunks.append(candidate_chunk)
            if not relevant_chunks:
                continue

            # Answer the question using the relevant chunks.
            class AnswerResponse(BaseModel):
                """Answer a question using the provided context."""

                answer: str = Field(
                    ...,
                    description="A complete answer to the given question using the provided context.",
                    min_length=1,
                )
                system_prompt: ClassVar[str] = f"""
You are given a set of contexts extracted from a document.
You are a subject matter expert on the document's topic.
Your task is to generate a complete answer to the following question using the provided context: "{question}"
The answer MUST satisfy ALL of the following criteria:
- The answer MUST integrate as much of the provided context as possible.
- The answer MUST be entirely self-contained and able to be understood in full WITHOUT access to the provided context.
- The answer MUST NOT reference the existence of the context, directly or indirectly.
- The answer MUST treat the context as if its contents are entirely part of your working memory.
                """.strip()

            try:
                answer_response = extract_with_llm(
                    AnswerResponse,
                    [str(relevant_chunk) for relevant_chunk in relevant_chunks],
                    config=config,
                )
            except ValueError:
                continue
            else:
                answer = answer_response.answer
            # Store the eval in the database.
            eval_ = Eval.from_chunks(
                question=question,
                contexts=relevant_chunks,
                ground_truth=answer,
            )
            session.add(eval_)
            session.commit()
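

# Example (sketch): generate 10 evals with the default config. Assumes documents were
# already added to the database with `insert_document()`.
#
#   insert_evals(num_evals=10, max_contexts_per_eval=20, config=RAGLiteConfig())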


def answer_evals(
    num_evals: int = 100,
    search: SearchMethod = hybrid_search,
    *,
    config: RAGLiteConfig | None = None,
) -> pd.DataFrame:
    """Read evals from the database and answer them with RAG."""
    # Read evals from the database.
    config = config or RAGLiteConfig()
    engine = create_database_engine(config)
    with Session(engine) as session:
        evals = session.exec(select(Eval).limit(num_evals)).all()
    # Answer evals with RAG.
    answers: list[str] = []
    contexts: list[list[str]] = []
    for eval_ in tqdm(evals, desc="Answering evals", unit="eval", dynamic_ncols=True):
        response = rag(eval_.question, search=search, config=config)
        answer = "".join(response)
        answers.append(answer)
        chunk_ids, _ = search(eval_.question, config=config)
        contexts.append(retrieve_segments(chunk_ids, config=config))
    # Collect the answered evals.
    answered_evals: dict[str, list[str] | list[list[str]]] = {
        "question": [eval_.question for eval_ in evals],
        "answer": answers,
        "contexts": contexts,
        "ground_truth": [eval_.ground_truth for eval_ in evals],
        "ground_truth_contexts": [eval_.contexts for eval_ in evals],
    }
    answered_evals_df = pd.DataFrame.from_dict(answered_evals)
    return answered_evals_df
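

# Example (sketch): answer 50 evals with the default hybrid search and inspect the result.
#
#   df = answer_evals(num_evals=50, config=RAGLiteConfig())
#   print(df[["question", "answer", "ground_truth"]].head())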


def evaluate(
    answered_evals: pd.DataFrame | int = 100,
    config: RAGLiteConfig | None = None,
) -> pd.DataFrame:
    """Evaluate the performance of a set of answered evals with Ragas."""
    try:
        from datasets import Dataset
        from langchain_community.chat_models import ChatLiteLLM
        from langchain_community.embeddings import LlamaCppEmbeddings
        from langchain_community.llms import LlamaCpp
        from ragas import RunConfig
        from ragas import evaluate as ragas_evaluate

        from raglite._litellm import LlamaCppPythonLLM
    except ImportError as import_error:
        error_message = "To use the `evaluate` function, please install the `ragas` extra."
        raise ImportError(error_message) from import_error
    # Create a set of answered evals if not provided.
    config = config or RAGLiteConfig()
    answered_evals_df = (
        answered_evals
        if isinstance(answered_evals, pd.DataFrame)
        else answer_evals(num_evals=answered_evals, config=config)
    )
    # Load the LLM.
    if config.llm.startswith("llama-cpp-python"):
        llm = LlamaCppPythonLLM().llm(model=config.llm)
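        # Wrap the local llama-cpp-python model in LangChain's LlamaCpp interface so Ragas
        # can drive it; n_gpu_layers=-1 offloads all layers to the GPU when one is available.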
        lc_llm = LlamaCpp(
            model_path=llm.model_path,
            n_batch=llm.n_batch,
            n_ctx=llm.n_ctx(),
            n_gpu_layers=-1,
            verbose=llm.verbose,
        )
    else:
        lc_llm = ChatLiteLLM(model=config.llm)  # type: ignore[call-arg]
    # Load the embedder.
    if not config.embedder.startswith("llama-cpp-python"):
        error_message = "Currently, only `llama-cpp-python` embedders are supported."
        raise NotImplementedError(error_message)
    embedder = LlamaCppPythonLLM().llm(model=config.embedder, embedding=True)
    lc_embedder = LlamaCppEmbeddings(  # type: ignore[call-arg]
        model_path=embedder.model_path,
        n_batch=embedder.n_batch,
        n_ctx=embedder.n_ctx(),
        n_gpu_layers=-1,
        verbose=embedder.verbose,
    )
    # Evaluate the answered evals with Ragas.
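    # Note: RunConfig(max_workers=1) runs the Ragas metrics sequentially, presumably to avoid
    # concurrent calls into the single local llama.cpp model.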
    evaluation_df = ragas_evaluate(
        dataset=Dataset.from_pandas(answered_evals_df),
        llm=lc_llm,
        embeddings=lc_embedder,
        run_config=RunConfig(max_workers=1),
    ).to_pandas()
    return evaluation_df
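

if __name__ == "__main__":
    # Minimal end-to-end sketch of the eval workflow in this module. It assumes that the
    # default RAGLiteConfig points at a database that was already populated with
    # `insert_document()`; adjust the config to your own setup before running.
    sketch_config = RAGLiteConfig()
    insert_evals(num_evals=10, config=sketch_config)
    answered_df = answer_evals(num_evals=10, config=sketch_config)
    print(evaluate(answered_df, config=sketch_config))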