import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
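# Dependencies (assumed install): pip install gradio huggingface_hub llama-cpp-python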
# -----------------------------------------------------------
# Automatically download GGUF model from Hugging Face Hub
# -----------------------------------------------------------
# Set the model repo + filename (change filename if needed)
MODEL_REPO = "mlabonne/Meta-Llama-3.1-8B-Instruct-abliterated-GGUF"
MODEL_FILE = "Meta-Llama-3.1-8B-Instruct-abliterated.Q4_K_M.gguf"
print("Downloading model from HF Hub...")
model_path = hf_hub_download(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE
)
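# hf_hub_download caches the file locally (under ~/.cache/huggingface/hub by
# default, or wherever HF_HOME points), so repeated runs reuse the same download.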
print(f"Model downloaded to: {model_path}")
# -----------------------------------------------------------
# Load model with llama.cpp
# -----------------------------------------------------------
llm = Llama(
    model_path=model_path,
    n_gpu_layers=-1,  # GPU acceleration if available
    n_ctx=4096,
    chat_format="llama-3",
)
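# Note: n_gpu_layers=-1 offloads all layers only if llama-cpp-python was built
# with GPU support (CUDA/Metal); on a CPU-only build it simply runs on the CPU.
# chat_format="llama-3" applies the Llama 3 chat template when building the prompt.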
# -----------------------------------------------------------
# Streaming chat function
# -----------------------------------------------------------
def chat_stream(message, history):
    messages = [{"role": "system", "content": "You are a helpful assistant."}]
    # Add previous conversation
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if bot_msg:
            messages.append({"role": "assistant", "content": bot_msg})
    # Add latest user message
    messages.append({"role": "user", "content": message})
partial = ""
# Stream tokens from llama.cpp
for chunk in llm.create_chat_completion(messages=messages, stream=True):
delta = chunk["choices"][0]["delta"]
if "content" in delta:
partial += delta["content"]
yield partial
# -----------------------------------------------------------
# Launch Gradio Chat Interface
# -----------------------------------------------------------
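# Note (assumption): tuple-style history plus the retry_btn/undo_btn kwargs
# target Gradio 4.x; Gradio 5 removed these arguments and deprecates tuple-style
# history in favor of message dicts, so pin gradio<5 or adapt accordingly.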
gr.ChatInterface(
    fn=chat_stream,
    title="Llama.cpp Chat (Streaming, Auto-Download Model)",
    chatbot=gr.Chatbot(height=600),
    textbox=gr.Textbox(placeholder="Ask me anything...", container=True),
    examples=["Hello!", "Write a poem.", "Explain how gravity works."],
    retry_btn=None,
    undo_btn=None,
).launch()