import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import spaces
import os

# Model configuration
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"  # Small, efficient open-source model
MAX_NEW_TOKENS = 512
TEMPERATURE = 0.7
TOP_P = 0.95

# Initialize model and tokenizer
print(f"Loading model: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
    trust_remote_code=True
)

# Create text generation pipeline around the already-loaded model
# (dtype and device placement are inherited from the model above, so they
# are not passed again here)
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)
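# The text-generation pipeline returns a list of dicts shaped like
# [{"generated_text": "..."}]; with return_full_text=False (used below),
# generated_text holds only the newly generated continuation, not the prompt.
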
# Request GPU for 60 seconds per call (ZeroGPU on Hugging Face Spaces);
# this is what the `spaces` import above is for.
@spaces.GPU(duration=60)
def generate_response(message, history, system_message, max_tokens, temperature, top_p):
| """Generate a response using the LLM.""" | |
| # Build conversation context | |
| messages = [] | |
| # Add system message if provided | |
| if system_message: | |
| messages.append({"role": "system", "content": system_message}) | |
| # Add conversation history | |
| for user_msg, assistant_msg in history: | |
| messages.append({"role": "user", "content": user_msg}) | |
| messages.append({"role": "assistant", "content": assistant_msg}) | |
| # Add current message | |
| messages.append({"role": "user", "content": message}) | |
| # Apply chat template | |
| text = tokenizer.apply_chat_template( | |
| messages, | |
| tokenize=False, | |
| add_generation_prompt=True | |
| ) | |
    # Generate response
    try:
        outputs = pipe(
            text,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            return_full_text=False
        )
        response = outputs[0]["generated_text"]
        return response
    except Exception as e:
        return f"Error generating response: {str(e)}"
# Create Gradio interface
with gr.Blocks(title="Open Source LLM Chat", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # Open Source LLM Chat

        This app uses **{model}** - an open-source language model.

        ### Features:
        - Interactive chat interface
        - Adjustable generation parameters
        - Custom system messages
        - Deployed on Hugging Face Spaces
        """.format(model=MODEL_NAME)
    )

    with gr.Row():
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(
                label="Chat History",
                height=500,
                bubble_full_width=False
            )
            msg = gr.Textbox(
                label="Your Message",
                placeholder="Type your message here and press Enter...",
                lines=2
            )
            with gr.Row():
                submit = gr.Button("Send", variant="primary")
                clear = gr.Button("Clear Chat")

        with gr.Column(scale=1):
            gr.Markdown("### Settings")
            system_message = gr.Textbox(
                label="System Message (Optional)",
                placeholder="You are a helpful AI assistant...",
                lines=3
            )
            max_tokens = gr.Slider(
                minimum=50,
                maximum=2048,
                value=MAX_NEW_TOKENS,
                step=50,
                label="Max Tokens"
            )
            temperature = gr.Slider(
                minimum=0.1,
                maximum=2.0,
                value=TEMPERATURE,
                step=0.1,
                label="Temperature (Creativity)"
            )
            top_p = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=TOP_P,
                step=0.05,
                label="Top-p (Nucleus Sampling)"
            )
            gr.Markdown(
                """
                ### Parameter Guide:
                - **Max Tokens**: Maximum length of response
                - **Temperature**: Higher = more creative, Lower = more focused
                - **Top-p**: Controls diversity of word choices
                """
            )

    gr.Markdown(
        """
        ---
        ### Tips:
        - Start with a clear, specific question
        - Adjust temperature for creative vs. factual responses
        - Use system messages to set the AI's behavior
        - Clear chat if responses become inconsistent
        """
    )

    # Handle message submission
    def user_submit(message, history):
        return "", history + [[message, None]]

    def bot_respond(history, system_message, max_tokens, temperature, top_p):
        if len(history) == 0 or history[-1][1] is not None:
            return history
        message = history[-1][0]
        bot_message = generate_response(
            message,
            history[:-1],  # Don't include the current message in history
            system_message,
            max_tokens,
            temperature,
            top_p
        )
        history[-1][1] = bot_message
        return history

    # Wire up the interface
    msg.submit(
        user_submit,
        [msg, chatbot],
        [msg, chatbot],
        queue=False
    ).then(
        bot_respond,
        [chatbot, system_message, max_tokens, temperature, top_p],
        chatbot
    )

    submit.click(
        user_submit,
        [msg, chatbot],
        [msg, chatbot],
        queue=False
    ).then(
        bot_respond,
        [chatbot, system_message, max_tokens, temperature, top_p],
        chatbot
    )

    clear.click(lambda: None, None, chatbot, queue=False)

# Launch the app
if __name__ == "__main__":
    demo.launch(
        share=False,
        show_error=True
    )
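
# Deployment note: a Space running this app also needs a requirements.txt.
# A minimal sketch (package names assumed; pin versions to match your Space):
#
#   gradio
#   torch
#   transformers
#   accelerate   # needed for device_map="auto"
#   spaces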