import gradio as gr
import requests

LLAMA_API = "http://localhost:7860/completion"  # llama.cpp server URL


def chat_with_model(message, history):
    # Build the prompt from the prior (user, assistant) turns
    full_prompt = "\n".join(f"User: {u}\nAssistant: {a}" for u, a in history)
    full_prompt += f"\nUser: {message}\nAssistant:"

    # Send the prompt to the llama.cpp server's /completion endpoint
    resp = requests.post(
        LLAMA_API,
        json={
            "prompt": full_prompt,
            "n_predict": 256,
            "temperature": 0.7,
        },
        timeout=120,  # local generation can be slow; don't hang forever
    )
    resp.raise_for_status()
    return resp.json().get("content", "")


chat = gr.ChatInterface(
    fn=chat_with_model,
    title="GPT-OSS 20B Chat",
    description="Chat with the local llama.cpp model running inside Docker",
)

if __name__ == "__main__":
    # Port 7861 avoids colliding with the llama.cpp server on 7860
    chat.launch(server_name="0.0.0.0", server_port=7861)
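# A minimal sketch of starting the backing llama.cpp server in Docker so that
# it is reachable at LLAMA_API above. The image tag, model path, and GGUF file
# name are assumptions -- substitute whatever matches your setup:
#
#   docker run --rm -p 7860:7860 -v /path/to/models:/models \
#       ghcr.io/ggml-org/llama.cpp:server \
#       -m /models/gpt-oss-20b.gguf --host 0.0.0.0 --port 7860
#
# Quick sanity check of the /completion endpoint before launching the UI:
#
#   curl -s http://localhost:7860/completion \
#       -d '{"prompt": "User: hi\nAssistant:", "n_predict": 32}'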