akhaliq (HF Staff) committed
Commit a5660ec · verified · 1 Parent(s): 0763b5e

Update app.py

Files changed (1)
  1. app.py +38 -138
app.py CHANGED
@@ -1,141 +1,41 @@
- I see the issues! The error is happening because the custom streamer isn't handling the input correctly, and we're not properly setting the attention mask. Let me fix the streaming implementation:
-
- === models.py ===
- import spaces
- import torch
- import numpy as np
- from typing import Generator
- from transformers import AutoModelForCausalLM, AutoTokenizer
- from config import MODEL_NAME, MAX_NEW_TOKENS, TEMPERATURE, DO_SAMPLE
-
- # Global variables to store the model and tokenizer
- tokenizer = None
- model = None
-
- def initialize_model():
-     """Initializes and loads the model and tokenizer once onto the GPU."""
-     global tokenizer, model
-     if model is None:
-         try:
-             print(f"Loading model {MODEL_NAME}...")
-
-             # Use bfloat16 for efficiency on modern GPUs
-             dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
-
-             tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-             model = AutoModelForCausalLM.from_pretrained(
-                 MODEL_NAME,
-                 torch_dtype=dtype,
-                 device_map="auto"
-             )
-             model.eval()
-
-             # Set padding token if not defined
-             if tokenizer.pad_token_id is None:
-                 tokenizer.pad_token_id = tokenizer.eos_token_id
-
-             print("Model loaded successfully.")
-         except Exception as e:
-             print(f"Failed to load model: {e}")
-             raise
-     return tokenizer, model
-
- # Call initialization
- try:
-     initialize_model()
- except Exception as e:
-     print(f"Warning: Global model initialization failed: {e}")
-
- @spaces.GPU(duration=120)
- def stream_generate_response(prompt: str, history: list) -> Generator[str, None, None]:
-     """
-     Generates a response from the KAT model with proper streaming.
-     """
-     global tokenizer, model
-
-     # Fallback initialization
-     if model is None or tokenizer is None:
-         initialize_model()
-
-     # Convert Gradio history format to the model's chat template format
-     messages = []
-     for human, bot in history:
-         if human:
-             messages.append({"role": "user", "content": human})
-         if bot:
-             messages.append({"role": "assistant", "content": bot})
-
-     # Add the current prompt
-     messages.append({"role": "user", "content": prompt})
-
-     # Apply chat template
-     text = tokenizer.apply_chat_template(
-         messages,
-         tokenize=False,
-         add_generation_prompt=True,
    )
-
-     # Tokenize with attention mask
-     inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
-     input_ids = inputs.input_ids.to(model.device)
-     attention_mask = inputs.attention_mask.to(model.device)
-
-     # Generate with streaming using yield-based approach
-     accumulated_text = ""
-
-     # Generate tokens incrementally
-     for _ in range(MAX_NEW_TOKENS):
-         with torch.no_grad():
-             outputs = model(
-                 input_ids=input_ids,
-                 attention_mask=attention_mask,
-                 return_dict=True
-             )
-
-         # Get next token probabilities
-         next_token_logits = outputs.logits[:, -1, :]
-
-         # Apply temperature
-         if TEMPERATURE > 0:
-             next_token_logits = next_token_logits / TEMPERATURE
-
-         # Apply softmax and sample
-         probs = torch.softmax(next_token_logits, dim=-1)
-         if DO_SAMPLE:
-             next_token = torch.multinomial(probs, num_samples=1)
-         else:
-             next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
-
-         # Check for EOS token
-         if next_token.item() == tokenizer.eos_token_id:
-             break
-
-         # Decode the new token
-         new_token_text = tokenizer.decode(next_token[0], skip_special_tokens=True)
-
-         # Update accumulated text
-         accumulated_text += new_token_text
-
-         # Yield the current accumulated text
-         yield accumulated_text
-
-         # Prepare for next iteration
-         input_ids = torch.cat([input_ids, next_token], dim=-1)
-         attention_mask = torch.cat([attention_mask, torch.ones_like(next_token)], dim=-1)
-
-         # Stop if we've reached max tokens
-         if input_ids.shape[-1] >= input_ids.shape[-1] + MAX_NEW_TOKENS:
-             break

-     # Final yield to ensure complete text
-     yield accumulated_text.strip()
-
- The key changes I made:
-
- 1. **Fixed attention mask**: Now properly sets `attention_mask` when tokenizing to avoid the warning
- 2. **Simplified streaming**: Using a manual token-by-token generation loop instead of the complex custom streamer
- 3. **Proper tensor handling**: Correctly handles token tensors and decoding
- 4. **EOS handling**: Properly stops generation when end-of-sequence token is encountered
- 5. **Memory efficiency**: Uses `torch.no_grad()` for inference to save memory

- This implementation should now properly stream tokens one by one and yield the accumulated text to the Gradio interface for real-time display.

+ import gradio as gr
+ from models import stream_generate_response
+
+ # Header Link
+ ANYCODER_LINK = "<a href='https://huggingface.co/spaces/akhaliq/anycoder' target='_blank'>Built with anycoder</a>"
+
+ with gr.Blocks(title="KAT-Dev Chat", theme=gr.themes.Soft()) as demo:
+     gr.HTML(
+         f"""
+         <div style="text-align: center; max-width: 800px; margin: 0 auto;">
+             <h1>💬 KAT-Dev LLM Chat</h1>
+             <p>Powered by Kwaipilot/KAT-Dev, a large language model. This application uses Hugging Face ZeroGPU for highly efficient inference.</p>
+             {ANYCODER_LINK}
+         </div>
+         """
    )

+     # ChatInterface handles the full conversational UI, streaming, and history management
+     chat_interface = gr.ChatInterface(
+         fn=stream_generate_response,
+         title="", # Title moved to HTML block
+         chatbot=gr.Chatbot(
+             height=500,
+             show_copy_button=True,
+             layout="bubble"
+         ),
+         textbox=gr.Textbox(
+             placeholder="Ask the KAT model anything...",
+             container=False,
+             scale=7
+         ),
+         # Disable the default submit button text since we have an icon
+         submit_btn=True,
+         stop_btn=True,
+
+         # Concurrency limit handled by @spaces.GPU
+         concurrency_limit=10,
+     )

+ demo.queue()
+ demo.launch()
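
Note that this commit only replaces app.py; the stream_generate_response generator that the new file imports lives in models.py, which is not part of this diff. For orientation, below is a minimal sketch of what such a generator could look like, assuming the same Kwaipilot/KAT-Dev checkpoint and the (user, assistant) tuple history the removed code expected, but using transformers' TextIteratorStreamer in place of the removed manual token-by-token loop. The hardcoded model name, the decoding parameters, and the absence of a config module are illustrative assumptions; the Space's actual models.py may differ.

# models.py (sketch; names and hyperparameters are assumptions)
import threading

import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

MODEL_NAME = "Kwaipilot/KAT-Dev"  # assumed; the real Space may read this from config.py

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
)

@spaces.GPU(duration=120)
def stream_generate_response(prompt: str, history: list):
    # Rebuild the conversation from Gradio's (user, assistant) history pairs
    messages = []
    for user_msg, bot_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if bot_msg:
            messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": prompt})

    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    # TextIteratorStreamer emits decoded text while generate() runs in a background thread
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(**inputs, streamer=streamer, max_new_tokens=1024, do_sample=True, temperature=0.7)
    threading.Thread(target=model.generate, kwargs=generation_kwargs).start()

    # gr.ChatInterface displays each yielded value, so yield the running text for live updates
    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial

Because gr.ChatInterface re-renders the response with every yielded value, yielding the accumulated text rather than individual chunks gives the incremental display described in the removed explanation.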