Spaces:

sitammeur
/

LFM2-Math-WebGPU

Running

App Files Files Community

sitammeur commited on 13 days ago

Commit

1a8ee5e

verified ·

1 Parent(s): f000f26

Update src/worker.js

Browse files

Files changed (1) hide show

src/worker.js +173 -173

src/worker.js CHANGED Viewed

@@ -1,173 +1,173 @@
-import {
-  AutoTokenizer,
-  AutoModelForCausalLM,
-  TextStreamer,
-  InterruptableStoppingCriteria,
-} from "@huggingface/transformers";
-/**
- * This class uses the Singleton pattern to enable lazy-loading of the pipeline
- */
-class TextGenerationPipeline {
-  static model_id = "onnx-community/LFM2-350M-Math-ONNX";
-  static async getInstance(progress_callback = null) {
-    this.tokenizer ??= AutoTokenizer.from_pretrained(this.model_id, {
-      progress_callback,
-    });
-    this.model ??= AutoModelForCausalLM.from_pretrained(this.model_id, {
-      dtype: "fp16",
-      device: "webgpu",
-      progress_callback,
-    });
-    return Promise.all([this.tokenizer, this.model]);
-  }
-}
-const stopping_criteria = new InterruptableStoppingCriteria();
-let past_key_values_cache = null;
-/**
- * Generate text based on the input messages
- */
-async function generate(messages) {
-  // Retrieve the text-generation pipeline.
-  const [tokenizer, model] = await TextGenerationPipeline.getInstance();
-  const inputs = tokenizer.apply_chat_template(messages, {
-    add_generation_prompt: true,
-    return_dict: true,
-  });
-  let startTime;
-  let numTokens = 0;
-  let tps;
-  const token_callback_function = () => {
-    startTime ??= performance.now();
-    if (numTokens++ > 0) {
-      tps = (numTokens / (performance.now() - startTime)) * 1000;
-    }
-  };
-  const callback_function = (output) => {
-    self.postMessage({
-      status: "update",
-      output,
-      tps,
-      numTokens,
-    });
-  };
-  const streamer = new TextStreamer(tokenizer, {
-    skip_prompt: true,
-    skip_special_tokens: true,
-    callback_function,
-    token_callback_function,
-  });
-  // Tell the main thread we are starting
-  self.postMessage({ status: "start" });
-  const { past_key_values, sequences } = await model.generate({
-    ...inputs,
-    // TODO: Add when model is fixed
-    past_key_values: past_key_values_cache,
-    // Sampling
-    do_sample: false,
-    temperature: 0.6,
-    top_p: 0.95,
-    repetition_penalty: 1.05,
-    max_new_tokens: 512,
-    streamer,
-    stopping_criteria,
-    return_dict_in_generate: true,
-  });
-  past_key_values_cache = past_key_values;
-  const decoded = tokenizer.batch_decode(sequences, {
-    skip_special_tokens: true,
-  });
-  // Send the output back to the main thread
-  self.postMessage({
-    status: "complete",
-    output: decoded,
-  });
-}
-/**
- * Helper function to perform feature detection for WebGPU
- */
-async function check() {
-  try {
-    const adapter = await navigator.gpu.requestAdapter();
-    if (!adapter) {
-      throw new Error("WebGPU is not supported (no adapter found)");
-    }
-  } catch (e) {
-    self.postMessage({
-      status: "error",
-      data: e.toString(),
-    });
-  }
-}
-/**
- * Helper function to load the model and tokenizer
- */
-async function load() {
-  self.postMessage({
-    status: "loading",
-    data: "Loading model...",
-  });
-  // Load the pipeline and save it for future use.
-  const [tokenizer, model] = await TextGenerationPipeline.getInstance((x) => {
-    // We also add a progress callback to the pipeline so that we can
-    // track model loading.
-    self.postMessage(x);
-  });
-  self.postMessage({
-    status: "loading",
-    data: "Compiling shaders and warming up the model...",
-  });
-  // Run model with dummy input to compile shaders
-  const inputs = tokenizer("a");
-  await model.generate({ ...inputs, max_new_tokens: 1 });
-  self.postMessage({ status: "ready" });
-}
-// Listen for messages from the main thread
-self.addEventListener("message", async (e) => {
-  const { type, data } = e.data;
-  switch (type) {
-    case "check":
-      check();
-      break;
-    case "load":
-      load();
-      break;
-    case "generate":
-      stopping_criteria.reset();
-      generate(data);
-      break;
-    case "interrupt":
-      stopping_criteria.interrupt();
-      break;
-    case "reset":
-      past_key_values_cache = null;
-      stopping_criteria.reset();
-      break;
-  }
-});

+import {
+  AutoTokenizer,
+  AutoModelForCausalLM,
+  TextStreamer,
+  InterruptableStoppingCriteria,
+} from "@huggingface/transformers";
+/**
+ * This class uses the Singleton pattern to enable lazy-loading of the pipeline
+ */
+class TextGenerationPipeline {
+  static model_id = "onnx-community/LFM2-350M-Math-ONNX";
+  static async getInstance(progress_callback = null) {
+    this.tokenizer ??= AutoTokenizer.from_pretrained(this.model_id, {
+      progress_callback,
+    });
+    this.model ??= AutoModelForCausalLM.from_pretrained(this.model_id, {
+      dtype: "q4f16",
+      device: "webgpu",
+      progress_callback,
+    });
+    return Promise.all([this.tokenizer, this.model]);
+  }
+}
+const stopping_criteria = new InterruptableStoppingCriteria();
+let past_key_values_cache = null;
+/**
+ * Generate text based on the input messages
+ */
+async function generate(messages) {
+  // Retrieve the text-generation pipeline.
+  const [tokenizer, model] = await TextGenerationPipeline.getInstance();
+  const inputs = tokenizer.apply_chat_template(messages, {
+    add_generation_prompt: true,
+    return_dict: true,
+  });
+  let startTime;
+  let numTokens = 0;
+  let tps;
+  const token_callback_function = () => {
+    startTime ??= performance.now();
+    if (numTokens++ > 0) {
+      tps = (numTokens / (performance.now() - startTime)) * 1000;
+    }
+  };
+  const callback_function = (output) => {
+    self.postMessage({
+      status: "update",
+      output,
+      tps,
+      numTokens,
+    });
+  };
+  const streamer = new TextStreamer(tokenizer, {
+    skip_prompt: true,
+    skip_special_tokens: true,
+    callback_function,
+    token_callback_function,
+  });
+  // Tell the main thread we are starting
+  self.postMessage({ status: "start" });
+  const { past_key_values, sequences } = await model.generate({
+    ...inputs,
+    // TODO: Add when model is fixed
+    past_key_values: past_key_values_cache,
+    // Sampling
+    do_sample: false,
+    temperature: 0.6,
+    top_p: 0.95,
+    repetition_penalty: 1.05,
+    max_new_tokens: 512,
+    streamer,
+    stopping_criteria,
+    return_dict_in_generate: true,
+  });
+  past_key_values_cache = past_key_values;
+  const decoded = tokenizer.batch_decode(sequences, {
+    skip_special_tokens: true,
+  });
+  // Send the output back to the main thread
+  self.postMessage({
+    status: "complete",
+    output: decoded,
+  });
+}
+/**
+ * Helper function to perform feature detection for WebGPU
+ */
+async function check() {
+  try {
+    const adapter = await navigator.gpu.requestAdapter();
+    if (!adapter) {
+      throw new Error("WebGPU is not supported (no adapter found)");
+    }
+  } catch (e) {
+    self.postMessage({
+      status: "error",
+      data: e.toString(),
+    });
+  }
+}
+/**
+ * Helper function to load the model and tokenizer
+ */
+async function load() {
+  self.postMessage({
+    status: "loading",
+    data: "Loading model...",
+  });
+  // Load the pipeline and save it for future use.
+  const [tokenizer, model] = await TextGenerationPipeline.getInstance((x) => {
+    // We also add a progress callback to the pipeline so that we can
+    // track model loading.
+    self.postMessage(x);
+  });
+  self.postMessage({
+    status: "loading",
+    data: "Compiling shaders and warming up the model...",
+  });
+  // Run model with dummy input to compile shaders
+  const inputs = tokenizer("a");
+  await model.generate({ ...inputs, max_new_tokens: 1 });
+  self.postMessage({ status: "ready" });
+}
+// Listen for messages from the main thread
+self.addEventListener("message", async (e) => {
+  const { type, data } = e.data;
+  switch (type) {
+    case "check":
+      check();
+      break;
+    case "load":
+      load();
+      break;
+    case "generate":
+      stopping_criteria.reset();
+      generate(data);
+      break;
+    case "interrupt":
+      stopping_criteria.interrupt();
+      break;
+    case "reset":
+      past_key_values_cache = null;
+      stopping_criteria.reset();
+      break;
+  }
+});