import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


class VibeThinkerChat:
    """Thin wrapper around the VibeThinker-1.5B checkpoint for single-turn generation."""

    def __init__(self, model_path="WeiboAI/VibeThinker-1.5B"):
        print("Loading model and tokenizer...")
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
            device_map="auto",
        )
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            trust_remote_code=True,
        )
        print("Model loaded successfully!")

    def generate_response(self, prompt, temperature=0.6, max_tokens=40960, top_p=0.95):
        # Wrap the raw prompt in the model's chat template.
        messages = [{"role": "user", "content": prompt}]
        text = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device)

        generation_config = dict(
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            # top_k=1 (as in the original snippet) would force greedy decoding and
            # silently ignore temperature/top_p; top_k=0 disables top-k filtering
            # so the sampling sliders in the UI actually take effect.
            top_k=0,
        )

        generated_ids = self.model.generate(
            model_inputs.input_ids,
            attention_mask=model_inputs.attention_mask,
            **generation_config,
        )
        # Strip the prompt tokens so only the newly generated completion is decoded.
        generated_ids = [
            output_ids[len(input_ids):]
            for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]
        response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
        return response


# Initialize the model once at startup so every request reuses the same weights.
chat_model = VibeThinkerChat()


def chat_interface(message, history, temperature, max_tokens):
    # `history` is accepted for the callback signature but unused: each turn is
    # answered independently, without prior conversation context.
    try:
        return chat_model.generate_response(
            message,
            temperature=temperature,
            max_tokens=max_tokens,
        )
    except Exception as e:
        return f"Error: {e}"


# Create Gradio interface
with gr.Blocks(title="VibeThinker-1.5B Chat") as demo:
    gr.Markdown("# 🧠 VibeThinker-1.5B Chat Interface")
    gr.Markdown("A 1.5B-parameter reasoning model optimized for math and coding problems.")

    with gr.Row():
        with gr.Column(scale=3):
            # Tuple-style history ([user, bot] pairs), as used by the Gradio 4.x Chatbot.
            chatbot = gr.Chatbot(height=500)
            msg = gr.Textbox(
                label="Your Message",
                placeholder="Ask a math or coding question...",
                lines=3,
            )
            with gr.Row():
                submit = gr.Button("Submit", variant="primary")
                clear = gr.Button("Clear")

        with gr.Column(scale=1):
            temperature = gr.Slider(
                minimum=0.1,
                maximum=2.0,
                value=0.6,
                step=0.1,
                label="Temperature",
                info="Recommended: 0.6 or 1.0",
            )
            max_tokens = gr.Slider(
                minimum=512,
                maximum=40960,
                value=4096,
                step=512,
                label="Max Tokens",
                info="Maximum response length",
            )

    def user_message(user_msg, history):
        # Clear the textbox and append the user turn with a placeholder bot reply.
        return "", history + [[user_msg, None]]

    def bot_response(history, temp, max_tok):
        # Fill in the bot half of the most recent [user, bot] pair.
        user_msg = history[-1][0]
        history[-1][1] = chat_interface(user_msg, history, temp, max_tok)
        return history

    msg.submit(user_message, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot_response, [chatbot, temperature, max_tokens], chatbot
    )
    submit.click(user_message, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot_response, [chatbot, temperature, max_tokens], chatbot
    )
    clear.click(lambda: None, None, chatbot, queue=False)


if __name__ == "__main__":
    demo.queue()
    demo.launch()
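
# Quick smoke test without the UI -- a minimal sketch, assuming this file is
# saved as app.py (the filename is an assumption) and the checkpoint fits on
# your hardware. Importing the module triggers the full model load, since
# `chat_model` is constructed at module level:
#
#   from app import chat_model
#   print(chat_model.generate_response("What is 17 * 23?", max_tokens=1024))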