Upload folder using huggingface_hub
- chat_template.jinja +5 -2
- config.json +3 -1
- configuration_moondream3.py +1 -1
- image_processing_moondream3.py +7 -1
- modeling_moondream3.py +320 -101
- processing_moondream3.py +31 -31
chat_template.jinja
CHANGED
@@ -57,15 +57,18 @@
 {{ raise_exception("caption length must be one of: short, normal, long.") }}
 {%- endif -%}
 <|md_reserved_0|>describe<|md_reserved_1|>{{ length }}<|md_reserved_2|>
+{%- elif lower.startswith('reason:') -%}
+{% set q = text[7:] | trim -%}
+<|md_reserved_0|>query<|md_reserved_1|>{{ q }}<|md_reserved_2|><|md_reserved_3|>
 {%- elif lower.startswith('query:') -%}
 {% set q = text[6:] | trim -%}
 <|md_reserved_0|>query<|md_reserved_1|>{{ q }}<|md_reserved_2|>
 {%- elif lower.startswith('detect:') -%}
 {% set q = text[7:] | trim -%}
-<|md_reserved_0|>det<|md_reserved_1|>{{ q }}<|md_reserved_2|>
+<|md_reserved_0|>det<|md_reserved_1|> {{ q }}<|md_reserved_2|>
 {%- elif lower.startswith('point:') -%}
 {% set q = text[6:] | trim -%}
-<|md_reserved_0|>point<|md_reserved_1|>{{ q }}<|md_reserved_2|>
+<|md_reserved_0|>point<|md_reserved_1|> {{ q }}<|md_reserved_2|>
 {%- else -%}
 {% set q = text -%}
 <|md_reserved_0|>query<|md_reserved_1|>{{ q }}<|md_reserved_2|>
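A quick illustration of what the updated template emits for each prompt prefix; the strings come straight from the hunk above, the dict itself is illustrative and not part of the repo:

# Illustrative mapping from user text to the rendered prompt, per the template above.
rendered = {
    "reason: why is the light red?":
        "<|md_reserved_0|>query<|md_reserved_1|>why is the light red?<|md_reserved_2|><|md_reserved_3|>",
    "query: what color is the car?":
        "<|md_reserved_0|>query<|md_reserved_1|>what color is the car?<|md_reserved_2|>",
    "detect: person":
        "<|md_reserved_0|>det<|md_reserved_1|> person<|md_reserved_2|>",    # note the new leading space
    "point: red car":
        "<|md_reserved_0|>point<|md_reserved_1|> red car<|md_reserved_2|>",
}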
config.json
CHANGED
@@ -37,7 +37,9 @@
     "output_router_logits": false,
     "prefix_attn": 730,
     "bos_token_id": 0,
-    "
+    "eos_token_id": 0,
+    "coord_token_id": 5,
+    "rms_norm_eps": 1e-05,
     "rope_parameters": {
       "rope_theta": 1500000.0,
       "rope_type": "default"
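A small sketch of how the new keys surface on the loaded config. The checkpoint path is a placeholder; the key names and values are the ones added above, and coord_token_id is the id the modeling code watches for when it switches into coordinate decoding:

from transformers import AutoConfig

config = AutoConfig.from_pretrained("path/to/moondream3-checkpoint", trust_remote_code=True)  # placeholder path
print(config.text_config.eos_token_id)    # 0
print(config.text_config.coord_token_id)  # 5
print(config.text_config.rms_norm_eps)    # 1e-05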
configuration_moondream3.py
CHANGED
@@ -54,7 +54,7 @@ class Moondream3TextConfig(PretrainedConfig):
             The non-linear activation function.
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer.
-        rms_norm_eps (`float`, *optional*, defaults to 1e-
+        rms_norm_eps (`float`, *optional*, defaults to 1e-5):
             The epsilon used by the rms normalization layers.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions.
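For reference, the epsilon documented above enters RMS normalization as the stabilizer under the square root. A minimal standalone sketch of that formula (not the model's own norm class):

import torch

def rms_norm(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
    # x / sqrt(mean(x^2) + eps), followed by a learned per-channel scale
    variance = x.pow(2).mean(-1, keepdim=True)
    return x * torch.rsqrt(variance + eps) * weight

x = torch.randn(1, 4, 8)
print(rms_norm(x, torch.ones(8)).shape)  # torch.Size([1, 4, 8])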
image_processing_moondream3.py
CHANGED
@@ -204,7 +204,13 @@ def prepare_crops(image, max_crops=12, overlap_margin=4):
     )
     all_crops = overlap_crops["crops"]
     all_crops = np.transpose(all_crops, (0, 3, 1, 2))
-    all_crops =
+    all_crops = (
+        torch.from_numpy(all_crops)
+        .to(device="cpu", dtype=torch.bfloat16)
+        .div_(255.0)
+        .sub_(0.5)
+        .div_(0.5)
+    )
     return all_crops.tolist(), overlap_crops["tiling"]
 
 class Moondream3ImageProcessor(BaseImageProcessor):
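The new tensor pipeline is the usual mean-0.5/std-0.5 scaling of uint8 pixels written as in-place ops. A standalone check of the arithmetic (the toy array shape is arbitrary, not the processor's real crop size):

import numpy as np
import torch

crops = np.array([[[[0, 128, 255]]]], dtype=np.uint8)   # toy uint8 pixel values
x = torch.from_numpy(crops).to(dtype=torch.bfloat16)
x = x.div_(255.0).sub_(0.5).div_(0.5)                   # (v/255 - 0.5) / 0.5
print(x)                                                # 0 -> -1.0, 128 -> ~0.004, 255 -> 1.0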
modeling_moondream3.py
CHANGED
@@ -26,6 +26,7 @@ from PIL import Image
 from transformers.activations import ACT2FN
 from transformers.cache_utils import Cache, DynamicCache
 from transformers.masking_utils import create_causal_mask
+from dataclasses import dataclass
 from transformers.modeling_outputs import (
     BaseModelOutputWithPast,
     CausalLMOutputWithPast,
@@ -34,6 +35,7 @@ from transformers.processing_utils import Unpack
 from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from transformers.generation import GenerationMixin
+from transformers.generation.utils import GenerateDecoderOnlyOutput
 from transformers.utils import logging, TransformersKwargs
 from .configuration_moondream3 import Moondream3Config, Moondream3TextConfig, Moondream3VisionConfig, Moondream3RegionConfig
 
@@ -41,48 +43,49 @@ logger = logging.get_logger(__name__)
 
 _CONFIG_FOR_DOC = "Moondream3Config"
 
-def rotate_half(x):
-    """Rotates half the hidden dims of the input."""
-    x1 = x[..., : x.shape[-1] // 2]
-    x2 = x[..., x.shape[-1] // 2 :]
-    return torch.cat((-x2, x1), dim=-1)
-
-def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
-    """Applies Rotary Position Embedding to the query and key tensors.
-
-    Args:
-        q
-        k
-        cos
-        sin
-        unsqueeze_dim (`int`, *optional*, defaults to 1):
-            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
-            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
-            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
-            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
-            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
-            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
-    Returns:
-    """
-    rot_dim = cos.shape[-1]
-    sin = sin.unsqueeze(unsqueeze_dim)
+import torch
+
+DEBUG=True
+
+def apply_rotary_pos_emb(
+    q: torch.Tensor,  # [B, H, L, D]
+    k: torch.Tensor,  # [B, H, L, D]
+    cos: torch.Tensor,  # [B, L, rot_dim]
+    sin: torch.Tensor,  # [B, L, rot_dim]
+    rot_dim: int = 32,
+):
+    """
+    Apply rotary position embeddings to query and key tensors.
+
+    Args:
+        q: Query tensor [batch, num_heads, seq_len, head_dim]
+        k: Key tensor [batch, num_heads, seq_len, head_dim]
+        cos: Cosine frequencies [batch, seq_len, rot_dim]
+        sin: Sine frequencies [batch, seq_len, rot_dim]
+        rot_dim: Number of dimensions to apply rotation to (default: 32)
+
+    Returns:
+        Tuple of (rotated_q, rotated_k)
+    """
+
+    def apply_rope(x):
+        dtype = x.dtype
+        x = x.to(torch.float64)
+        x_rot, x_pass = x[..., :rot_dim], x[..., rot_dim:]
+
+        d_q = x_rot.shape[-1] // 2
+        xq_r, xq_i = x_rot[..., :d_q], x_rot[..., d_q:]
+
+        xq_out_r = xq_r * cos - xq_i * sin
+        xq_out_i = xq_r * sin + xq_i * cos
+
+        xq_out = torch.stack((xq_out_r, xq_out_i), dim=-1).flatten(-2)
+
+        return torch.cat([xq_out, x_pass], dim=-1)
+    return apply_rope(q), apply_rope(k)
 
 class Moondream3RotaryEmbedding(nn.Module):
-    inv_freq: torch.Tensor
+    inv_freq: torch.Tensor
 
     def __init__(self, config: Moondream3Config, device=None):
         super().__init__()
@@ -108,43 +111,34 @@ class Moondream3RotaryEmbedding(nn.Module):
     ) -> tuple["torch.Tensor", float]:
         """
         Computes the inverse frequencies according to the original RoPE implementation
-        Args:
-            config ([`~transformers.PreTrainedConfig`]):
-                The model configuration.
-            device (`torch.device`):
-                The device to use for initialization of the inverse frequencies.
-            seq_len (`int`, *optional*):
-                The current sequence length. Unused for this type of RoPE.
-        Returns:
-            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
-            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
         """
-        base = config.rope_parameters["rope_theta"]
+        base = config.rope_parameters["rope_theta"]  # Should be 1500000.0 to match original
         dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
-        attention_factor = 1.0
+        dim //= 2
+
+        attention_factor = 1.0
 
-        # Compute the inverse frequencies
+        # Compute the inverse frequencies - matches your original formula
         inv_freq = 1.0 / (
-            base ** (torch.arange(0, dim, 2, dtype=torch.
+            base ** (torch.arange(0, dim, 2, dtype=torch.float64)[: (dim // 2)] / dim)
         )
+        if device is not None:
+            inv_freq = inv_freq.to(device=device)
         return inv_freq, attention_factor
 
     @torch.no_grad()
-    @dynamic_rope_update
+    @dynamic_rope_update
     def forward(self, x, position_ids):
-        freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
-        emb = torch.cat((freqs, freqs), dim=-1)
-        cos = emb.cos() * self.attention_scaling
-        sin = emb.sin() * self.attention_scaling
-        return
+        # inv_freq shape: [dim//2]
+        # position_ids shape: [batch_size, seq_len]
+
+        inv_freq_expanded = self.inv_freq[None, :, None].to(torch.float64).expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].to(torch.float64)
+
+        freqs = (inv_freq_expanded.to(torch.float64) @ position_ids_expanded.to(torch.float64)).transpose(1, 2)
+        cfreqs = torch.exp(1j * freqs).unsqueeze(1).expand(-1, self.config.num_attention_heads, -1, -1)
+
+        return cfreqs.real, cfreqs.imag
 
 
 class Moondream3Attention(nn.Module):
@@ -202,6 +196,8 @@ class Moondream3Attention(nn.Module):
         **kwargs,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]:
         input_shape = hidden_states.shape[:-1]
+        if isinstance(self.config, Moondream3TextConfig) and DEBUG:
+            torch.save(hidden_states, f"dbg/hf_l{self.layer_idx}_c{cache_position[-1].item()}_input_states")
         bsz, q_len, _ = hidden_states.size()
 
         # Get qkv combined for tau (before splitting)
@@ -223,7 +219,9 @@ class Moondream3Attention(nn.Module):
         query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
         key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
         value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        if isinstance(self.config, Moondream3TextConfig) and DEBUG:
+            torch.save(value_states, f"dbg/hf_l{self.layer_idx}_c{cache_position[-1].item()}_pre_tau_value")
 
         if self.use_tau:
             query_states = query_states * tau_q
@@ -234,20 +232,38 @@ class Moondream3Attention(nn.Module):
             tau_v_repeated = tau_v
             value_states = value_states * tau_v_repeated
 
+        if isinstance(self.config, Moondream3TextConfig) and DEBUG:
+            torch.save(value_states, f"dbg/hf_l{self.layer_idx}_c{cache_position[-1].item()}_post_tau_value")
+            torch.save(key_states, f"dbg/hf_l{self.layer_idx}_c{cache_position[-1].item()}_pre_rope_key")
+
         cos, sin = None, None
         if position_embeddings is not None:
             cos, sin = position_embeddings
+            if isinstance(self.config, Moondream3TextConfig) and DEBUG:
+                torch.save(cos, f"dbg/hf_l{self.layer_idx}_c{cache_position[-1].item()}_cos")
+                torch.save(sin, f"dbg/hf_l{self.layer_idx}_c{cache_position[-1].item()}_sin")
             query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
 
-        if
+        if isinstance(self.config, Moondream3TextConfig) and DEBUG:
+            torch.save(key_states, f"dbg/hf_l{self.layer_idx}_c{cache_position[-1].item()}_post_rope_key")
+        query_states, key_states = query_states.to(value_states.dtype), key_states.to(value_states.dtype)
+
+        if past_key_values is not None:
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
             key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
 
+        if isinstance(self.config, Moondream3TextConfig) and DEBUG:
+            torch.save(key_states, f"dbg/hf_l{self.layer_idx}_c{cache_position[-1].item()}_post_cache_key")
+            torch.save(attention_mask, f"dbg/hf_l{self.layer_idx}_c{cache_position[-1].item()}_attn_mask")
+
+        query_states = query_states.contiguous()
+        key_states = key_states.contiguous()
+        value_states = value_states.contiguous()
+
         attn_output, attn_weights = ALL_ATTENTION_FUNCTIONS["sdpa"](
             self,
             query_states,
-            key_states,
+            key_states,
             value_states,
             attention_mask,
             dropout=0.0 if not self.training else self.attention_dropout,
@@ -256,6 +272,9 @@ class Moondream3Attention(nn.Module):
         attn_output = attn_output.reshape(*input_shape, -1).contiguous()
         attn_output = self.o_proj(attn_output)
 
+        if isinstance(self.config, Moondream3TextConfig) and DEBUG:
+            torch.save(attn_output, f"dbg/hf_l{self.layer_idx}_c{cache_position[-1].item()}_attn_out")
+
         return attn_output, attn_weights
 
 class Moondream3MLP(nn.Module):
@@ -266,7 +285,6 @@ class Moondream3MLP(nn.Module):
         self.out_size = self.hidden_size if out_size is None else out_size
         self.hidden_act = hidden_act
         self.gated = gated
-        # Ungated MLP: up_proj and down_proj following HF conventions
         self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=bias)
         self.down_proj = nn.Linear(self.intermediate_size, self.out_size, bias=bias)
         self.gate_proj = None
@@ -276,17 +294,20 @@ class Moondream3MLP(nn.Module):
 
     def forward(self, x) -> torch.Tensor:
         if self.gated:
+            # separate up and gate causes precision issues
+            combined_weight = torch.cat([self.up_proj.weight, self.gate_proj.weight], dim=0)
+            h_full = F.linear(x, combined_weight)
+            h, g = h_full.chunk(2, dim=-1)
+            x = self.act_fn(h) * (g + 1)
        else:
            x = self.act_fn(self.up_proj(x))
        return self.down_proj(x)
 
 
 class Moondream3SparseMoeBlock(nn.Module):
-    def __init__(self, config: Moondream3TextConfig):
+    def __init__(self, config: Moondream3TextConfig, layer_idx = None):
         super().__init__()
+        self.layer_idx = layer_idx
         self.hidden_size = config.hidden_size
         self.moe_intermediate_size = config.moe_intermediate_size
         self.num_experts = config.num_experts
@@ -295,32 +316,29 @@ class Moondream3SparseMoeBlock(nn.Module):
         self.gate = nn.Linear(self.hidden_size, self.num_experts, bias=True)
         self.experts = nn.ModuleList([Moondream3MLP(hidden_size=self.hidden_size, intermediate_size=self.moe_intermediate_size, hidden_act="gelu", gated=True, bias=False) for _ in range(self.num_experts)])
 
-    def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    def forward(self, hidden_states: torch.Tensor, cache_position=None) -> Tuple[torch.Tensor, torch.Tensor]:
         batch_size, sequence_length, hidden_dim = hidden_states.shape
         hidden_states = hidden_states.view(-1, hidden_dim)
         router_logits: torch.Tensor = self.gate(hidden_states)
-        routing_weights = F.softmax(
-        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
-        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
+        routing_weights, selected_experts = torch.topk(router_logits, self.top_k, dim=-1)
+        routing_weights = F.softmax(routing_weights, dim=-1, dtype=torch.float32)
         routing_weights = routing_weights.to(hidden_states.dtype)
 
         final_hidden_states = torch.zeros(
             (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
         )
 
-        expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
-
         for expert_idx in range(self.num_experts):
             expert_layer = self.experts[expert_idx]
+            top_x, idx = (selected_experts == expert_idx).nonzero(as_tuple=True)
 
             if top_x.shape[0] == 0:
                 continue
 
             current_state = hidden_states[None, top_x].reshape(-1, hidden_dim)
+            # torch.save(current_state, f"dbg/hf_l{self.layer_idx}_c{cache_position[-1].item()}_e{expert_idx}")
             current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]
+            # torch.save(current_hidden_states, f"dbg/hf_l{self.layer_idx}_c{cache_position[-1].item()}_e{expert_idx}")
             final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
 
         final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
@@ -330,6 +348,7 @@ class Moondream3SparseMoeBlock(nn.Module):
 class Moondream3DecoderLayer(nn.Module):
     def __init__(self, config: Moondream3TextConfig, layer_idx: int):
         super().__init__()
+        self.layer_idx = layer_idx
         self.hidden_size = config.hidden_size
         self.intermediate_size = config.intermediate_size
         self.self_attn = Moondream3Attention(config, layer_idx, use_tau=True)
@@ -337,7 +356,7 @@ class Moondream3DecoderLayer(nn.Module):
 
         self.is_moe_layer = layer_idx >= config.moe_start_layer
         if self.is_moe_layer:
-            self.mlp = Moondream3SparseMoeBlock(config)
+            self.mlp = Moondream3SparseMoeBlock(config, layer_idx=layer_idx)
         else:
             self.mlp = Moondream3MLP(self.hidden_size, self.intermediate_size)
 
@@ -358,6 +377,8 @@ class Moondream3DecoderLayer(nn.Module):
 
         # Apply layer norm like original
         l_in = self.input_layernorm(hidden_states)
+        if DEBUG:
+            torch.save(l_in, f"dbg/hf_l{self.layer_idx}_c{cache_position[-1].item()}_ln_out")
 
         # Attention
         hidden_states_attn, self_attn_weights = self.self_attn(
@@ -374,10 +395,12 @@ class Moondream3DecoderLayer(nn.Module):
 
         # MLP
         if self.is_moe_layer:
-            hidden_states_mlp, router_logits = self.mlp(l_in)
+            hidden_states_mlp, router_logits = self.mlp(l_in, cache_position=cache_position)
         else:
             hidden_states_mlp = self.mlp(l_in)
             router_logits = None
+        if DEBUG:
+            torch.save(hidden_states_mlp, f"dbg/hf_l{self.layer_idx}_c{cache_position[-1].item()}_mlp_out")
 
         # Add both attention and MLP to residual like original
         hidden_states = residual + hidden_states_attn + hidden_states_mlp
@@ -538,8 +561,6 @@ class Moondream3TextModel(Moondream3PreTrainedModel):
             if output_router_logits and layer_outputs[-1] is not None:
                 all_router_logits += (layer_outputs[-1],)
 
-        hidden_states = self.norm(hidden_states)
-
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
 
@@ -794,8 +815,8 @@ class Moondream3RegionEncoder(nn.Module):
         self.register_buffer("size_freq", size_freq.T)
 
     def fourier_features(self, x: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
-        x_proj = torch.
-        return torch.cat([
+        x_proj = 2 * torch.pi * x @ w
+        return torch.cat([x_proj.cos(), x_proj.sin()], dim=-1)
 
     def encode_coordinate(self, coord: torch.Tensor) -> torch.Tensor:
         fourier_features = self.fourier_features(coord, self.coord_freq)
@@ -815,7 +836,7 @@ class Moondream3RegionDecoder(nn.Module):
         return self.coord_decoder(hidden_state)
 
     def decode_size(self, hidden_state: torch.Tensor) -> torch.Tensor:
-        return self.size_decoder(hidden_state)
+        return self.size_decoder(hidden_state).view(hidden_state.shape[0], 2, -1)
 
 class Moondream3Model(Moondream3PreTrainedModel):
     def __init__(self, config: Moondream3Config):
@@ -885,38 +906,59 @@ class Moondream3Model(Moondream3PreTrainedModel):
         if cache_position is None:
             past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
             cache_position: torch.Tensor = torch.arange(
-                past_seen_tokens, past_seen_tokens
+                past_seen_tokens, past_seen_tokens, device=inputs_embeds.device
             )
 
         if position_ids is None:
             position_ids = cache_position.unsqueeze(0)
 
-        causal_mask = create_causal_mask(
-            config=self.config,
-            input_embeds=inputs_embeds,
-            attention_mask=attention_mask,
-            cache_position=cache_position,
-            past_key_values=past_key_values,
-            position_ids=position_ids,
-        )
-
-        if pixel_values is not None and input_ids.shape[-1] > 1:
-            # Vision embeds
-            pixel_values = pixel_values.to(dtype=self.vision_model.embeddings.projection.weight.dtype)
-            image_embeds = self.vision_model(pixel_values, tiling=tiling)["last_hidden_state"] # [B,P,D]
-            prefix = inputs_embeds[:, :1, :] # keep the first token
-            suffix = inputs_embeds[:, 1 + image_embeds.shape[1] :, :] # keep the rest after the image span
-            inputs_embeds = torch.cat([prefix, image_embeds, suffix], dim=1)
-
-            # N/A when doing BSZ 1 since create_causal_mask returns None in the case since theres no padding tokens
-            if causal_mask is not None:
-                img_len = image_embeds.shape[1]
-                causal_mask[:, :, :1 + img_len, :1 + img_len] = True
-                causal_mask[:, :, :1 + img_len, 1 + img_len:] = False
-
+        def image_mask_function(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int):
+            # set all up to `self.config.vision_config.prefix_len` to true
+            return kv_idx <= q_idx
+
+        if pixel_values is not None:
+            # Vision embeds
+            pixel_values = pixel_values.to(dtype=self.vision_model.embeddings.projection.weight.dtype)
+            image_embeds = self.vision_model(pixel_values, tiling=tiling)["last_hidden_state"] # [B,P,D]
+            prefix = self.text_model.embed_tokens(
+                torch.full((input_ids.shape[0], 1), self.config.text_config.bos_token_id, dtype=input_ids.dtype, device=input_ids.device)
+            )
+            embeds = torch.cat([prefix, image_embeds], dim=1)
+            cache_pos = torch.arange(embeds.shape[-2], device=embeds.device)
+            pos = cache_pos.unsqueeze(0).expand(embeds.shape[0], -1)
+            attn_mask = torch.full(
+                (embeds.shape[0], 1, embeds.shape[-2], pos.shape[-1]),
+                True,
+                dtype=torch.bool,
+                device=embeds.device,
+            )
+
+            outputs = self.text_model(
+                input_ids=None,
+                attention_mask=attn_mask,
+                position_ids=pos,
+                past_key_values=past_key_values,
+                inputs_embeds=embeds,
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=True,
+                cache_position=cache_pos,
+            )
+
+        attn_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=torch.cat([torch.ones(attention_mask.shape[0], cache_position[-1] + 1 - attention_mask.shape[-1], device=attention_mask.device, dtype=attention_mask.dtype), attention_mask], dim=-1),
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+            and_mask_function=image_mask_function
+        )
+
         outputs = self.text_model(
             input_ids=None,
-            attention_mask=causal_mask,
+            attention_mask=attn_mask,
             position_ids=position_ids,
             past_key_values=past_key_values,
             inputs_embeds=inputs_embeds,
@@ -942,11 +984,17 @@ class Moondream3Model(Moondream3PreTrainedModel):
             attentions=getattr(outputs, "attentions", None),
         )
 
+@dataclass
+class Moondream3GenerateOutput(GenerateDecoderOnlyOutput):
+    objects: Optional[list[dict[str, float]]] = None
+
+
 class Moondream3ForConditionalGeneration(Moondream3PreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]
 
     def __init__(self, config: Moondream3Config):
         super().__init__(config)
+        self.objects = None
         self.model = Moondream3Model(config)
         self.vocab_size = config.text_config.vocab_size
         self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=True)
@@ -970,6 +1018,15 @@ class Moondream3ForConditionalGeneration(Moondream3PreTrainedModel, GenerationMixin):
     def get_decoder(self):
         return self.model.text_model
 
+    def _prepare_generated_length(
+        self,
+        generation_config,
+        **kwargs,
+    ):
+        generation_config = super()._prepare_generated_length(generation_config, **kwargs)
+        generation_config.max_length += 730
+        return generation_config
+
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -988,6 +1045,9 @@ class Moondream3ForConditionalGeneration(Moondream3PreTrainedModel, GenerationMixin):
         logits_to_keep: int = 0,
         **kwargs: Unpack[TransformersKwargs],
     ) -> Union[Tuple, CausalLMOutputWithPast]:
+        if pixel_values is not None and inputs_embeds is None:
+            position_ids += self.config.vision_config.prefix_len
+            cache_position += self.config.vision_config.prefix_len
         # Get hidden states from the base model (it already builds the multimodal prefix)
         model_outputs = self.model(
             input_ids=input_ids,
@@ -1005,7 +1065,6 @@ class Moondream3ForConditionalGeneration(Moondream3PreTrainedModel, GenerationMixin):
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
         )
-
        hidden_states = model_outputs.last_hidden_state # [B, T, D]
 
        # Compute logits; only keep the tail if requested
@@ -1016,8 +1075,127 @@ class Moondream3ForConditionalGeneration(Moondream3PreTrainedModel, GenerationMixin):
         else:
             hs = hidden_states
 
+        hs = self.model.text_model.norm(hs)
         logits = self.lm_head(hs) # [B, T', V]
 
+        pred = torch.argmax(logits, dim=-1)
+
+        pos_ids = position_ids[:, -1:] + 1
+        cache_pos = cache_position[-1:] + 1
+        mask = torch.ones(
+            hidden_states.shape[0], 1, device=self.device, dtype=torch.long
+        )
+        while torch.any(pred == 5):
+            batch_mask = (pred[:, -1] == 5)
+            hidden_states = hidden_states[:, -1:, :]
+            x_logits = self.model.region_decoder.decode_coordinate(hidden_states)
+            x_center = torch.argmax(x_logits, dim=-1) / x_logits.size(-1)
+            next_embeds = self.model.region_encoder.encode_coordinate(x_center.to(x_logits.dtype)).unsqueeze(1)
+            model_outputs = self.model(
+                input_ids=None,
+                pixel_values=None,
+                tiling=None,
+                attention_mask=mask,
+                position_ids=pos_ids,
+                past_key_values=past_key_values,
+                inputs_embeds=next_embeds,
+                labels=None,
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=True,
+                cache_position=cache_pos,
+                logits_to_keep=logits_to_keep,
+            )
+            hidden_states = model_outputs.last_hidden_state # [B, T, D]
+            y_logits = self.model.region_decoder.decode_coordinate(hidden_states)
+            y_center = torch.argmax(y_logits, dim=-1) / y_logits.size(-1)
+            next_embeds = self.model.region_encoder.encode_coordinate(y_center.to(y_logits.dtype)).unsqueeze(1)
+            coords = torch.cat([x_center, y_center], dim=1)
+            coords = coords * (batch_mask).unsqueeze(1)
+            pos_ids += 1
+            cache_pos = cache_pos + 1
+            bbox = None
+            if input_ids[0, 1] == 7235:
+                model_outputs = self.model(
+                    input_ids=None,
+                    pixel_values=None,
+                    tiling=None,
+                    attention_mask=mask,
+                    position_ids=pos_ids,
+                    past_key_values=past_key_values,
+                    inputs_embeds=next_embeds,
+                    labels=None,
+                    use_cache=use_cache,
+                    output_attentions=output_attentions,
+                    output_hidden_states=output_hidden_states,
+                    return_dict=True,
+                    cache_position=cache_pos,
+                    logits_to_keep=logits_to_keep,
+                )
+                hidden_states = model_outputs.last_hidden_state # [B, T, D]
+                size_logits = self.model.region_decoder.decode_size(hidden_states)
+                bins = torch.argmax(size_logits, dim=-1)
+                w_bin = bins[:, 0]
+                h_bin = bins[:, 1]
+
+                w = torch.pow(2.0, (w_bin.float() / 1023.0) * 10.0 - 10.0)
+                h = torch.pow(2.0, (h_bin.float() / 1023.0) * 10.0 - 10.0)
+
+                next_embeds = (
+                    self.model.region_encoder.encode_size(
+                        torch.stack([w, h], dim=-1).to(size_logits.dtype)
+                    )
+                ).unsqueeze(1)
+                bbox = [
+                    x_center.item() - w.item() / 2,
+                    y_center.item() - h.item() / 2,
+                    x_center.item() + w.item() / 2,
+                    y_center.item() + h.item() / 2,
+                ]
+                bbox = bbox * (batch_mask).unsqueeze(1)
+                pos_ids += 1
+                cache_pos = cache_pos + 1
+
+            new = coords.unsqueeze(1) if bbox is None else bbox.unsqueeze(1)
+            if self.objects is None:
+                self.objects = new
+            else:
+                self.objects = torch.cat([self.objects, new], dim=1)
+            model_outputs = self.model(
+                input_ids=None,
+                pixel_values=None,
+                tiling=None,
+                attention_mask=mask,
+                position_ids=pos_ids,
+                past_key_values=past_key_values,
+                inputs_embeds=next_embeds,
+                labels=None,
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=True,
+                cache_position=cache_pos,
+                logits_to_keep=logits_to_keep,
+            )
+            pos_ids += 1
+            cache_pos = cache_pos + 1
+            hidden_states = model_outputs.last_hidden_state # [B, T, D]
+
+            indices = torch.tensor(
+                [self.config.text_config.coord_token_id, self.config.text_config.eos_token_id],
+                device=self.device,
+            )
+
+            hidden_states = self.model.text_model.norm(hidden_states)
+            logits = hidden_states @ self.lm_head.weight[indices].T + self.lm_head.bias[indices]
+
+            logits_full = torch.full((logits.shape[0], logits.shape[1], self.config.text_config.vocab_size), float('-inf'), device=logits.device, dtype=logits.dtype)
+            logits_full[:, :, torch.tensor([5, 0])] = logits
+            logits = logits_full
+            pred[batch_mask] = torch.argmax(logits, dim=-1)[batch_mask]
+
+
         loss = None
         if labels is not None:
             # Shift if your training uses standard LM convention; here we assume labels aligned with hs
@@ -1031,6 +1209,47 @@ class Moondream3ForConditionalGeneration(Moondream3PreTrainedModel, GenerationMixin):
             attentions=getattr(model_outputs, "attentions", None),
         )
 
+    def generate(self, **kwargs) -> Union[Moondream3GenerateOutput, torch.LongTensor]:
+        outputs = super().generate(**kwargs)
+        if len(self.objects) > 0:
+            if isinstance(outputs, torch.Tensor):
+                outputs = self.objects
+                self.objects = []
+            else:
+                outputs = Moondream3GenerateOutput(
+                    **outputs,
+                    objects=self.objects
+                )
+                self.objects = []
+        return outputs
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        **model_kwargs
+    ):
+        model_inputs = super().prepare_inputs_for_generation(input_ids, **model_kwargs)
+        model_inputs["position_ids"] += model_inputs["cache_position"].unsqueeze(0) - model_inputs["position_ids"]
+        return model_inputs
+
+    def _update_model_kwargs_for_generation(
+        self,
+        outputs,
+        model_kwargs,
+        is_encoder_decoder,
+        num_new_tokens: int = 1,
+    ):
+        model_kwargs = super()._update_model_kwargs_for_generation(
+            outputs,
+            model_kwargs,
+            is_encoder_decoder=is_encoder_decoder,
+            num_new_tokens=num_new_tokens,
+        )
+        model_kwargs["pixel_values"] = None
+        model_kwargs["tiling"] = None
+        return model_kwargs
+
+
     @staticmethod
     def _reorder_cache(past_key_values, beam_idx):
         reordered_past = ()
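One piece of the diff above that benefits from worked numbers: the size head decodes width and height from 1024 log2-spaced bins via w = 2^((bin/1023)*10 - 10), so bin 0 is about 1/1024 of the normalized image side and bin 1023 is the full side. A standalone check:

import torch

bins = torch.tensor([0.0, 511.0, 1023.0])
sizes = torch.pow(2.0, (bins / 1023.0) * 10.0 - 10.0)
print(sizes)  # tensor([0.0010, 0.0311, 1.0000]) -- fractions of the normalized image side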
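Also from the text-model changes above: the sparse MoE block now takes top-k over the raw router logits and softmaxes only those k values, instead of softmaxing over all experts and renormalizing the top-k slice. A toy sketch of the new order of operations (expert count and k are made up):

import torch
import torch.nn.functional as F

router_logits = torch.tensor([[2.0, 0.5, 1.5, -1.0]])  # 1 token, 4 experts (toy values)
top_k = 2

routing_weights, selected_experts = torch.topk(router_logits, top_k, dim=-1)
routing_weights = F.softmax(routing_weights, dim=-1, dtype=torch.float32)

print(selected_experts)  # tensor([[0, 2]])
print(routing_weights)   # softmax over the two kept logits only, ~[[0.62, 0.38]]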
processing_moondream3.py
CHANGED
@@ -211,40 +211,40 @@ class Moondream3Processor(ProcessorMixin):
 
         return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
         text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"], return_tensors=None)
-        if "input_ids" in text_inputs:
-            # prepend 1 bos_token_id and 729 image_token_id to the text_inputs
-            for i in range(len(text_inputs["input_ids"])):
-                prepended_tokens = [self.tokenizer.bos_token_id] + [self.image_token_id] * 729
-                text_inputs["input_ids"][i] = prepended_tokens + text_inputs["input_ids"][i]
-        if "attention_mask" in text_inputs:
-            # attend to the 730 prepended tokens
-            for i in range(len(text_inputs["attention_mask"])):
-                prepended_mask = [1] * 730
-                text_inputs["attention_mask"][i] = prepended_mask + text_inputs["attention_mask"][i]
+        # if "input_ids" in text_inputs:
+        #     # prepend 1 bos_token_id and 729 image_token_id to the text_inputs
+        #     for i in range(len(text_inputs["input_ids"])):
+        #         prepended_tokens = [self.tokenizer.bos_token_id] + [self.image_token_id] * 729
+        #         text_inputs["input_ids"][i] = prepended_tokens + text_inputs["input_ids"][i]
+        # if "attention_mask" in text_inputs:
+        #     # attend to the 730 prepended tokens
+        #     for i in range(len(text_inputs["attention_mask"])):
+        #         prepended_mask = [1] * 730
+        #         text_inputs["attention_mask"][i] = prepended_mask + text_inputs["attention_mask"][i]
 
         return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
 
-    def apply_chat_template(
-        self,
-        conversation: Union[list[dict[str, str]], list[list[dict[str, str]]]],
-        chat_template: Optional[str] = None,
-        **kwargs,
-    ) -> str:
-        # Call the original behavior first
-        out = super().apply_chat_template(
-            conversation=conversation,
-            chat_template=chat_template,
-            **kwargs,
-        )
-
-        # Only post-process when:
-        # - user requested assistant mask
-        # - output is a dict (tokenized + return_dict=True path)
-        if isinstance(out, BatchFeature) and kwargs.get("return_assistant_tokens_mask", False):
-            if "assistant_masks" in out and out["assistant_masks"] is not None:
-                out["assistant_masks"] = _rotate_right_array(out["assistant_masks"], 730)
-
-        return out
+    # def apply_chat_template(
+    #     self,
+    #     conversation: Union[list[dict[str, str]], list[list[dict[str, str]]]],
+    #     chat_template: Optional[str] = None,
+    #     **kwargs,
+    # ) -> str:
+    #     # Call the original behavior first
+    #     out = super().apply_chat_template(
+    #         conversation=conversation,
+    #         chat_template=chat_template,
+    #         **kwargs,
+    #     )
+
+    #     # Only post-process when:
+    #     # - user requested assistant mask
+    #     # - output is a dict (tokenized + return_dict=True path)
+    #     if isinstance(out, BatchFeature) and kwargs.get("return_assistant_tokens_mask", False):
+    #         if "assistant_masks" in out and out["assistant_masks"] is not None:
+    #             out["assistant_masks"] = _rotate_right_array(out["assistant_masks"], 730)
+
+    #     return out
 
     @property
     def model_input_names(self):
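Putting the pieces together, a hedged usage sketch: the processor no longer prepends the 730-token image prefix itself (that block is commented out above); the model builds the vision prefix internally, and for detect/point prompts generate() hands back the accumulated coordinates on the objects field. The repo id and auto classes below are assumptions, not something this diff pins down:

import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

repo = "path/to/moondream3-checkpoint"  # placeholder
processor = AutoProcessor.from_pretrained(repo, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo, trust_remote_code=True, torch_dtype=torch.bfloat16)

image = Image.open("example.jpg")
inputs = processor(images=image, text="detect: person", return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=32, return_dict_in_generate=True)

print(out.sequences.shape)            # generated token ids
print(getattr(out, "objects", None))  # (x, y) centers or [x0, y0, x1, y1] boxes from the coordinate loop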
|