import gradio as gr
import requests

LLAMA_API = "http://localhost:7860/completion"  # llama.cpp server URL


def chat_with_model(message, history):
    # Build the prompt from the prior (user, assistant) turns
    full_prompt = "\n".join(f"User: {u}\nAssistant: {a}" for u, a in history)
    full_prompt += f"\nUser: {message}\nAssistant:"

    # Send the prompt to the llama.cpp server's /completion endpoint
    resp = requests.post(
        LLAMA_API,
        json={
            "prompt": full_prompt,
            "n_predict": 256,
            "temperature": 0.7,
        },
        timeout=120,  # local generation can be slow; don't hang forever
    )
    resp.raise_for_status()
    return resp.json().get("content", "")


chat = gr.ChatInterface(
    fn=chat_with_model,
    title="GPT-OSS 20B Chat",
    description="Chat with the local llama.cpp model running inside Docker",
)

if __name__ == "__main__":
    # Port 7861 avoids colliding with the llama.cpp server on 7860
    chat.launch(server_name="0.0.0.0", server_port=7861)
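# A minimal sketch of starting the backing llama.cpp server in Docker so that
# it is reachable at LLAMA_API above. The image tag, model path, and GGUF file
# name are assumptions -- substitute whatever matches your setup:
#
#   docker run --rm -p 7860:7860 -v /path/to/models:/models \
#       ghcr.io/ggml-org/llama.cpp:server \
#       -m /models/gpt-oss-20b.gguf --host 0.0.0.0 --port 7860
#
# Quick sanity check of the /completion endpoint before launching the UI:
#
#   curl -s http://localhost:7860/completion \
#       -d '{"prompt": "User: hi\nAssistant:", "n_predict": 32}'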