import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# -----------------------------------------------------------
# Automatically download GGUF model from Hugging Face Hub
# -----------------------------------------------------------

# Set the model repo + filename (change filename if needed)
MODEL_REPO = "mlabonne/Meta-Llama-3.1-8B-Instruct-abliterated-GGUF"
MODEL_FILE = "Meta-Llama-3.1-8B-Instruct-abliterated.Q4_K_M.gguf"

print("Downloading model from HF Hub...")
model_path = hf_hub_download(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE
)
print(f"Model downloaded to: {model_path}")

# -----------------------------------------------------------
# Load model with llama.cpp
# -----------------------------------------------------------

llm = Llama(
    model_path=model_path,
    n_gpu_layers=-1,       # GPU acceleration if available
    n_ctx=4096,
    chat_format="llama-3",
)
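
# Note: n_gpu_layers=-1 offloads every layer to the GPU, which only applies if
# llama-cpp-python was installed with GPU (CUDA/Metal) support; otherwise the
# model runs on the CPU. n_ctx=4096 sets the context window (prompt plus
# generated tokens), and chat_format="llama-3" applies the Llama 3 chat
# template when building the prompt from the message list.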

# -----------------------------------------------------------
# Streaming chat function
# -----------------------------------------------------------

def chat_stream(message, history):
    """Stream the assistant reply for `message`; `history` is Gradio's
    list of (user, assistant) tuples from the ongoing conversation."""

    messages = [{"role": "system", "content": "You are a helpful assistant."}]

    # Add previous conversation
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if bot_msg:
            messages.append({"role": "assistant", "content": bot_msg})

    # Add latest user message
    messages.append({"role": "user", "content": message})

    partial = ""

    # Stream tokens from llama.cpp
    for chunk in llm.create_chat_completion(messages=messages, stream=True):
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            partial += delta["content"]
            yield partial
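
# Illustrative only (not wired into the app): the generator can be exercised
# directly, e.g.
#
#   for partial_text in chat_stream("Say hello in one sentence.", []):
#       print(partial_text, end="\r", flush=True)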

# -----------------------------------------------------------
# Launch Gradio Chat Interface
# -----------------------------------------------------------

gr.ChatInterface(
    fn=chat_stream,
    title="Llama.cpp Chat (Streaming, Auto-Download Model)",
    chatbot=gr.Chatbot(height=600),
    textbox=gr.Textbox(placeholder="Ask me anything...", container=True),
    examples=["Hello!", "Write a poem.", "Explain how gravity works."],
    retry_btn=None,   # hide the retry button (Gradio 4.x option; removed in Gradio 5)
    undo_btn=None,    # hide the undo button (Gradio 4.x option; removed in Gradio 5)
).launch()
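
# When run outside a managed host such as a Hugging Face Space, .launch() also
# accepts arguments like server_name="0.0.0.0", server_port, and share=True to
# control how the interface is exposed.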