| """Retrieval-augmented generation.""" | |
| from collections.abc import AsyncIterator, Iterator | |
| from litellm import acompletion, completion, get_model_info # type: ignore[attr-defined] | |
| from raglite._config import RAGLiteConfig | |
| from raglite._database import Chunk | |
| from raglite._litellm import LlamaCppPythonLLM | |
| from raglite._search import hybrid_search, rerank_chunks, retrieve_segments | |
| from raglite._typing import SearchMethod | |

RAG_SYSTEM_PROMPT = """
You are a friendly and knowledgeable assistant that provides complete and insightful answers.
Answer the user's question using only the context below.
When responding, you MUST NOT reference the existence of the context, directly or indirectly.
Instead, you MUST treat the context as if its contents are entirely part of your working memory.
""".strip()


def _max_contexts(
    prompt: str,
    *,
    max_contexts: int = 5,
    context_neighbors: tuple[int, ...] | None = (-1, 1),
    messages: list[dict[str, str]] | None = None,
    config: RAGLiteConfig | None = None,
) -> int:
    """Determine the maximum number of contexts for RAG."""
    # If the user has configured a llama-cpp-python model, we ensure that LiteLLM's model info is
    # up to date by loading that LLM.
    config = config or RAGLiteConfig()
    if config.llm.startswith("llama-cpp-python"):
        _ = LlamaCppPythonLLM.llm(config.llm)
    # Get the model's maximum context size.
    llm_provider = "llama-cpp-python" if config.llm.startswith("llama-cpp") else None
    model_info = get_model_info(config.llm, custom_llm_provider=llm_provider)
    max_tokens = model_info.get("max_tokens") or 2048
    # Reduce the maximum number of contexts to take into account the LLM's context size.
    max_context_tokens = (
        max_tokens
        - sum(len(message["content"]) // 3 for message in messages or [])  # Previous messages.
        - len(RAG_SYSTEM_PROMPT) // 3  # System prompt.
        - len(prompt) // 3  # User prompt.
    )
    max_tokens_per_context = config.chunk_max_size // 3
    max_tokens_per_context *= 1 + len(context_neighbors or [])
    max_contexts = min(max_contexts, max_context_tokens // max_tokens_per_context)
    if max_contexts <= 0:
        error_message = "Not enough context tokens available for RAG."
        raise ValueError(error_message)
    return max_contexts
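
# Worked example of the token budget above (illustrative numbers, not defaults from the source):
# with max_tokens=8192, no prior messages, chunk_max_size=1440, and context_neighbors=(-1, 1),
# each context plus its two neighbors is budgeted at (1440 // 3) * 3 = 1440 tokens. The prompt
# estimates (length // 3, i.e. roughly 3 characters per token) leave about 8000 tokens, so
# min(5, 8000 // 1440) = 5 contexts are selected.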


def _contexts(  # noqa: PLR0913
    prompt: str,
    *,
    max_contexts: int = 5,
    context_neighbors: tuple[int, ...] | None = (-1, 1),
    search: SearchMethod | list[str] | list[Chunk] = hybrid_search,
    messages: list[dict[str, str]] | None = None,
    config: RAGLiteConfig | None = None,
) -> list[str]:
    """Retrieve contexts for RAG."""
    # Determine the maximum number of contexts.
    max_contexts = _max_contexts(
        prompt,
        max_contexts=max_contexts,
        context_neighbors=context_neighbors,
        messages=messages,
        config=config,
    )
    # Retrieve the top chunks.
    config = config or RAGLiteConfig()
    chunks: list[str] | list[Chunk]
    if callable(search):
        # If the user has configured a reranker, we retrieve extra contexts to rerank.
        extra_contexts = 3 * max_contexts if config.reranker else 0
        # Retrieve relevant contexts.
        chunk_ids, _ = search(prompt, num_results=max_contexts + extra_contexts, config=config)
        # Rerank the relevant contexts.
        chunks = rerank_chunks(query=prompt, chunk_ids=chunk_ids, config=config)
    else:
        # The user has passed a list of chunk_ids or chunks directly.
        chunks = search
    # Extend the top contexts with their neighbors and group chunks into contiguous segments.
    segments = retrieve_segments(chunks[:max_contexts], neighbors=context_neighbors, config=config)
    return segments
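
# Illustrative sketch (hypothetical ids and config, not part of the module): `search` accepts
# either a SearchMethod callable such as `hybrid_search`, or a precomputed list of chunk ids or
# `Chunk` objects, e.g.:
#     segments = _contexts("What is RAGLite?", search=["id_1", "id_2"], config=my_config)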


def rag(  # noqa: PLR0913
    prompt: str,
    *,
    max_contexts: int = 5,
    context_neighbors: tuple[int, ...] | None = (-1, 1),
    search: SearchMethod | list[str] | list[Chunk] = hybrid_search,
    messages: list[dict[str, str]] | None = None,
    system_prompt: str = RAG_SYSTEM_PROMPT,
    config: RAGLiteConfig | None = None,
) -> Iterator[str]:
    """Retrieval-augmented generation."""
    # Get the contexts for RAG as contiguous segments of chunks.
    config = config or RAGLiteConfig()
    segments = _contexts(
        prompt,
        max_contexts=max_contexts,
        context_neighbors=context_neighbors,
        search=search,
        config=config,
    )
    system_prompt = f"{system_prompt}\n\n" + "\n\n".join(
        f'<context index="{i}">\n{segment.strip()}\n</context>'
        for i, segment in enumerate(segments)
    )
    # Stream the LLM response.
    stream = completion(
        model=config.llm,
        messages=[
            *(messages or []),
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        stream=True,
    )
    for output in stream:
        token: str = output["choices"][0]["delta"].get("content") or ""
        yield token
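

# Example usage, as a sketch rather than part of the module API: the question and LLM name below
# are hypothetical placeholders, and it is assumed that `RAGLiteConfig` accepts an `llm` keyword
# and points at a populated database.
def _example_rag_usage() -> None:  # pragma: no cover
    """Stream a RAG response to stdout."""
    config = RAGLiteConfig(llm="gpt-4o-mini")
    for token in rag("What does RAGLite do?", config=config):
        print(token, end="", flush=True)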


async def async_rag(  # noqa: PLR0913
    prompt: str,
    *,
    max_contexts: int = 5,
    context_neighbors: tuple[int, ...] | None = (-1, 1),
    search: SearchMethod | list[str] | list[Chunk] = hybrid_search,
    messages: list[dict[str, str]] | None = None,
    system_prompt: str = RAG_SYSTEM_PROMPT,
    config: RAGLiteConfig | None = None,
) -> AsyncIterator[str]:
    """Retrieval-augmented generation."""
    # Get the contexts for RAG as contiguous segments of chunks.
    config = config or RAGLiteConfig()
    segments = _contexts(
        prompt,
        max_contexts=max_contexts,
        context_neighbors=context_neighbors,
        search=search,
        config=config,
    )
    system_prompt = f"{system_prompt}\n\n" + "\n\n".join(
        f'<context index="{i}">\n{segment.strip()}\n</context>'
        for i, segment in enumerate(segments)
    )
    # Stream the LLM response.
    async_stream = await acompletion(
        model=config.llm,
        messages=[
            *(messages or []),
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        stream=True,
    )
    async for output in async_stream:
        token: str = output["choices"][0]["delta"].get("content") or ""
        yield token
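

# Async counterpart of the sketch above; the question is a hypothetical placeholder and the
# default `RAGLiteConfig()` is assumed to point at a configured database and LLM. To run it:
# `asyncio.run(_example_async_rag_usage())`.
async def _example_async_rag_usage() -> None:  # pragma: no cover
    """Stream an async RAG response to stdout."""
    config = RAGLiteConfig()
    async for token in async_rag("What does RAGLite do?", config=config):
        print(token, end="", flush=True)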