Chat_with_Meta_llama3_8b

Sleeping

App Files Files Community

Chat_with_Meta_llama3_8b / app.py

leoskyscape

Update app.py

b5dcf86 verified about 1 month ago

raw

history blame

2.28 kB

	import gradio as gr
	from huggingface_hub import hf_hub_download
	from llama_cpp import Llama

	# -----------------------------------------------------------
	# Automatically download GGUF model from Hugging Face Hub
	# -----------------------------------------------------------

	# Set the model repo + filename (change filename if needed)
	MODEL_REPO = "mlabonne/Meta-Llama-3.1-8B-Instruct-abliterated-GGUF"
	MODEL_FILE = "Meta-Llama-3.1-8B-Instruct-abliterated.Q4_K_M.gguf"

	# Fetch the file named above from the Hub; the returned value is kept in
	# model_path and handed to llama.cpp when the model is loaded.
	print("Downloading model from HF Hub...")
	model_path = hf_hub_download(filename=MODEL_FILE, repo_id=MODEL_REPO)
	print(f"Model downloaded to: {model_path}")

	# -----------------------------------------------------------
	# Load model with llama.cpp
	# -----------------------------------------------------------

	# Single shared model instance; chat_stream reads it as a module global.
	llm = Llama(
	    chat_format="llama-3",
	    n_ctx=4096,
	    n_gpu_layers=-1,  # GPU acceleration if available
	    model_path=model_path,
	)

	# -----------------------------------------------------------
	# Streaming chat function
	# -----------------------------------------------------------

	def chat_stream(message, history):
	    """Stream the assistant's reply to ``message`` as a growing string.

	    Builds a role/content message list (fixed system prompt, prior turns,
	    then the new user message), sends it to the module-level ``llm`` with
	    ``stream=True`` and yields the accumulated reply after every chunk that
	    carries content.

	    Args:
	        message: The latest user message.
	        history: Prior turns, either as ``(user_msg, bot_msg)`` pairs or as
	            ``{"role": ..., "content": ...}`` dicts (the shape Gradio uses
	            for "messages"-style history). ``None`` or an empty sequence
	            means there are no prior turns.

	    Yields:
	        str: The reply text accumulated so far.
	    """
	    messages = [{"role": "system", "content": "You are a helpful assistant."}]

	    # Replay the earlier conversation.
	    for turn in history or []:
	        if isinstance(turn, dict):
	            # Unpacking a dict as a pair would iterate its *keys*, silently
	            # injecting "role"/"content" as message text, so dict turns are
	            # read by key instead. Only non-empty text from the two
	            # conversational roles is forwarded.
	            role = turn.get("role")
	            content = turn.get("content")
	            if role in ("user", "assistant") and isinstance(content, str) and content:
	                messages.append({"role": role, "content": content})
	            continue

	        user_msg, bot_msg = turn
	        messages.append({"role": "user", "content": user_msg})
	        if bot_msg:
	            messages.append({"role": "assistant", "content": bot_msg})

	    # The new user message always goes last.
	    messages.append({"role": "user", "content": message})

	    partial = ""

	    # Stream tokens from llama.cpp
	    for chunk in llm.create_chat_completion(messages=messages, stream=True):
	        piece = chunk["choices"][0]["delta"].get("content")
	        # Some chunks carry no "content" key, and a key that is present may
	        # hold None; neither contributes text, and ``partial += None`` would
	        # raise TypeError.
	        if piece is not None:
	            partial += piece
	            yield partial

	# -----------------------------------------------------------
	# Launch Gradio Chat Interface
	# -----------------------------------------------------------

	# Build the chat UI around chat_stream first, then start serving it.
	demo = gr.ChatInterface(
	    fn=chat_stream,
	    chatbot=gr.Chatbot(height=600),
	    textbox=gr.Textbox(container=True, placeholder="Ask me anything..."),
	    title="Llama.cpp Chat (Streaming, Auto-Download Model)",
	    examples=["Hello!", "Write a poem.", "Explain how gravity works."],
	    # Passing None for both leaves the retry and undo buttons unset.
	    retry_btn=None,
	    undo_btn=None,
	)
	demo.launch()