yizheapple committed on
Commit 10d76e2 · verified · 1 Parent(s): acccd4b
README.md CHANGED
@@ -1,3 +1,56 @@
- ---
- license: apple-amlr
- ---
+ ---
+ license: unknown
+ base_model:
+ - mistralai/Mistral-7B-Instruct-v0.2
+ tags:
+ - rag
+ - compression
+ - retrieval
+ - end-to-end
+ - generation
+ ---
+
+ # CLaRa-7B-E2E (Compression-16 & 128)
+
+ **CLaRa-7B-E2E** is our fully end-to-end unified RAG model, jointly optimizing retrieval and generation under 16× and 128× document compression.
+
+ **Training recipe:** End-to-end fine-tuning with differentiable top-k retrieval and a unified language-modeling objective.
+ **Benchmarks:** Strong retrieval-augmented QA performance under aggressive compression.
+
+ ---
+
+ ## More details and usage examples
+
+ Paper: [CLaRa: Bridging Retrieval and Generation with Continuous Latent Reasoning](https://arxiv.org/abs/2511.18659)
+ GitHub: https://github.com/apple/ml-clara
+
+ ---
+
+ ## Example Usage (End-to-End Inference)
+
+ ```python
+ from transformers import AutoModel
+
+ # Path to a local checkout of the compression-16 checkpoint; adjust to your setup.
+ unirag = AutoModel.from_pretrained(
+     "/mnt/ceph_rbd/model/CLaRa-7B-E2E/compression-16",
+     trust_remote_code=True
+ ).to("cuda")
+
+ # Example documents and question: one pool of 20 candidate documents for the question.
+ documents = [[
+     "Weldenia is a monotypic genus of flowering plant in the family Commelinaceae...",
+ ] * 20]
+
+ questions = [
+     "Which genus of plant grows originally in Mexico and Guatemala, Phylica or Weldenia?"
+ ]
+
+ # End-to-end usage (retrieval + generation).
+ # The effective top-k is controlled by `generation_top_k` in config.json.
+ answers, topk_idx = unirag.generate_from_questions(
+     questions=questions,
+     documents=documents,
+     max_new_tokens=64
+ )
+
+ print("Generated answer:", answers)
+ ```
compression-128/adapters.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cf779cf29ab86f4a0592370e6b66664b7448c53c59b86de811c4c2849867b230
+ size 252096669
compression-128/chat_template.jinja ADDED
@@ -0,0 +1,24 @@
+ {%- if messages[0]['role'] == 'system' %}
+     {%- set system_message = messages[0]['content'] %}
+     {%- set loop_messages = messages[1:] %}
+ {%- else %}
+     {%- set loop_messages = messages %}
+ {%- endif %}
+
+ {{- bos_token }}
+ {%- for message in loop_messages %}
+     {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
+         {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}
+     {%- endif %}
+     {%- if message['role'] == 'user' %}
+         {%- if loop.first and system_message is defined %}
+             {{- ' [INST] ' + system_message + '\n\n' + message['content'] + ' [/INST]' }}
+         {%- else %}
+             {{- ' [INST] ' + message['content'] + ' [/INST]' }}
+         {%- endif %}
+     {%- elif message['role'] == 'assistant' %}
+         {{- ' ' + message['content'] + eos_token}}
+     {%- else %}
+         {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}
+     {%- endif %}
+ {%- endfor %}
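For reference, a minimal sketch of rendering this chat template through the tokenizer (assuming a tokenizer that carries the template above, e.g. `mistralai/Mistral-7B-Instruct-v0.2`):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is Weldenia?"},
]
# The system message is folded into the first [INST] block, per the template above.
print(tok.apply_chat_template(messages, tokenize=False))
```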
compression-128/config.json ADDED
@@ -0,0 +1,40 @@
+ {
+   "ae_mode": "token",
+   "attn_implementation": null,
+   "auto_map": {
+     "AutoConfig": "modeling_clara.CLaRaConfig",
+     "AutoModel": "modeling_clara.CLaRa"
+   },
+   "compr_base_model_name": "/mnt/ceph_rbd/model/Mistral-7B-Instruct-v0.2",
+   "compr_every_n_layer": null,
+   "compr_linear_type": "concat",
+   "compr_mlp_hidden_dim": 8096,
+   "compr_model_name": null,
+   "compr_n_layers": 5,
+   "compr_rate": 128,
+   "compr_rms_norm": false,
+   "compr_use_mlp": false,
+   "decoder_model_name": "/mnt/conductor_data/data/hf_models/Mistral-7B-Instruct-v0.2",
+   "device_map": null,
+   "different_mem_tokens": true,
+   "doc_max_length": 256,
+   "generation_top_k": 5,
+   "kbtc_training": false,
+   "load_adapters": true,
+   "load_pretrained_checkpoint": false,
+   "lora": true,
+   "lora_compressor": false,
+   "lora_r": 16,
+   "lora_r_compressor": 16,
+   "max_new_tokens": 128,
+   "model_type": "CLaRa",
+   "optimize_mem_tokens": true,
+   "pad_token_id": 2,
+   "pure_inference": false,
+   "quantization": "no",
+   "sep": true,
+   "stage2_retrieval_top_n": 1,
+   "training_form": "both_separately",
+   "training_stage": "stage2",
+   "transformers_version": "4.53.3"
+ }
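Fields such as `generation_top_k` and `max_new_tokens` can be overridden at load time: `CLaRa.from_pretrained` copies any matching keyword arguments onto the config before building the model (see `modeling_clara.py`). A minimal sketch, assuming a hypothetical local checkout of this folder:

```python
from transformers import AutoModel

unirag = AutoModel.from_pretrained(
    "/path/to/CLaRa-7B-E2E/compression-128",  # hypothetical path
    generation_top_k=3,  # overrides the value in config.json
    trust_remote_code=True,
)
```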
compression-128/decoder_first_last_layers.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3029ac143f3cc3a23462daebe83b2eddc4bba5117530a7e93ea28fb59ac44e06
+ size 524372021
compression-128/modeling_clara.py ADDED
@@ -0,0 +1,1712 @@
1
+ #
2
+ # For licensing see accompanying LICENSE file.
3
+ # Copyright (C) 2025 Apple Inc. All Rights Reserved.
4
+ #
5
+
6
+ import warnings
7
+ import os
8
+ import torch
9
+ import gc
10
+ import time
11
+ import json
12
+ import copy
13
+ import random
14
+ import requests
15
+ import re
16
+
17
+ from torch import nn
18
+ from torch.nn import functional as F
19
+ from torch.nn.functional import gelu
20
+ from jinja2.exceptions import TemplateError
21
+ from peft import LoraConfig
22
+ from transformers import (
23
+ AutoModelForCausalLM,
24
+ AutoTokenizer,
25
+ BitsAndBytesConfig,
26
+ PreTrainedModel,
27
+ PretrainedConfig,
28
+ StoppingCriteria,
29
+ StoppingCriteriaList
30
+ )
31
+ from huggingface_hub import hf_hub_download
32
+ from typing import List, Dict, Any, Optional, Tuple
33
+
34
+ # Environment setup
35
+ torch.set_printoptions(threshold=float("inf"))
36
+ os.environ["NCCL_TIMEOUT"] = "5400"
37
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
38
+
39
+ # Constants
40
+ IGNORE_INDEX = -100
41
+ PARAPHRASE_INSTRUCTIONS = [
42
+ 'Background: {docs} means the same as',
43
+ "Background: {docs} Can you put the above sentences in your own terms?",
44
+ "Background: {docs} Please provide a reinterpretation of the preceding background text.",
45
+ "These two expressions are equivalent in essence:\n(1) {docs}\n(2)",
46
+ "Background: {docs} is a paraphrase of what?",
47
+ "Background: {docs} Could you give me a different version of the background sentences above?",
48
+ "In other words, background: {docs} is just another way of saying:",
49
+ "You're getting across the same point whether you say background: {docs} or",
50
+ "Background: {docs} After unpacking the ideas in the background information above, we got:",
51
+ "Background: {docs} Please offer a restatement of the background sentences I've just read.",
52
+ "Background: {docs}, which also means:",
53
+ "Strip away the mystery, and you'll find background: {docs} is simply another rendition of:",
54
+ "The essence of background: {docs} is captured again in the following statement:",
55
+ ]
56
+
57
+
58
+ class StopOnCriteria(StoppingCriteria):
59
+ """Custom stopping criteria for generation."""
60
+
61
+ def __init__(self, tokenizer, stop_strings: List[str] = None, stop_token_ids: List[int] = None):
62
+ self.tokenizer = tokenizer
63
+ self.stop_strings = stop_strings or []
64
+ self.stop_token_ids = stop_token_ids or []
65
+ self.reason = None
66
+
67
+ def __call__(self, input_ids, scores, **kwargs):
68
+ # Check if last token is in stop_token_ids
69
+ last_token = input_ids[0, -1].item()
70
+ if last_token in self.stop_token_ids:
71
+ self.reason = f"stop_token_{last_token}"
72
+ return True
73
+
74
+ # Check if any stop_strings appear in generated text
75
+ text = self.tokenizer.decode(input_ids[0], skip_special_tokens=False)
76
+ for stop_str in self.stop_strings:
77
+ if stop_str in text:
78
+ self.reason = f"stop_string_{stop_str}"
79
+ return True
80
+
81
+ return False
82
+
83
+
84
+ class LlamaRMSNorm(nn.Module):
85
+ """Llama-style RMS normalization layer."""
86
+
87
+ def __init__(self, hidden_size: int, eps: float = 1e-6):
88
+ super().__init__()
89
+ self.weight = nn.Parameter(torch.ones(hidden_size))
90
+ self.variance_epsilon = eps
91
+
92
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
93
+ input_dtype = hidden_states.dtype
94
+ hidden_states = hidden_states.to(torch.float32)
95
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
96
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
97
+ return self.weight * hidden_states.to(input_dtype)
98
+
99
+
100
+ class Converter(nn.Module):
101
+ """Converter module for dimension transformation."""
102
+
103
+ def __init__(self, input_dim: int, output_dim: int):
104
+ super().__init__()
105
+ self.input_dim = input_dim
106
+ self.output_dim = output_dim
107
+
108
+ self.rms_norm = LlamaRMSNorm(input_dim)
109
+ self.dense_in = nn.Linear(input_dim, output_dim)
110
+ self.dense_out = nn.Linear(output_dim, output_dim)
111
+
112
+ self._print_trainable_parameters()
113
+
114
+ def _print_trainable_parameters(self):
115
+ """Print parameter statistics."""
116
+ trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
117
+ total_params = sum(p.numel() for p in self.parameters())
118
+ print(f"Converter trainable parameters: {trainable_params}, Total parameters: {total_params}")
119
+
120
+ def forward(self, embeddings: torch.Tensor) -> torch.Tensor:
121
+ embeddings = self.rms_norm(embeddings)
122
+ x = self.dense_in(embeddings)
123
+ x = self.dense_out(gelu(x))
124
+ return x.to(torch.float32)
125
+
126
+
127
+ class CLaRaConfig(PretrainedConfig):
128
+ """Configuration class for CLaRa model."""
129
+
130
+ model_type = "CLaRa"
131
+
132
+ def __init__(self,
133
+ decoder_model_name: str = "meta-llama/Llama-2-7b-chat-hf",
134
+ doc_max_length: int = 128,
135
+ quantization: str = 'no',
136
+ sep: bool = False,
137
+ compr_model_name: str = "google-bert/bert-base-uncased",
138
+ compr_rate: int = 64,
139
+ compr_n_layers: int = None,
140
+ compr_every_n_layer: int = None,
141
+ compr_base_model_name: str = '/mnt/ceph_rbd/model/Mistral-7B-Instruct-v0.2',
142
+ compr_rms_norm: bool = False,
143
+ compr_mlp_hidden_dim: int = 8096,
144
+ compr_use_mlp: bool = True,
145
+ compr_linear_type: str = "concat",
146
+ lora: bool = False,
147
+ lora_compressor: bool = False,
148
+ training_form: str = "both",
149
+ training_stage: str = "stage1",
150
+ generation_top_k: int = 1,
151
+ lora_r: int = 16,
152
+ lora_r_compressor: int = None,
153
+ load_adapters: bool = True,
154
+ kbtc_training: bool = False,
155
+ optimize_mem_tokens: bool = False,
156
+ different_mem_tokens: bool = False,
157
+ attn_implementation: str = None,
158
+ _attn_implementation_autoset: bool = True,
159
+ ae_mode: str = "token",
160
+ max_new_tokens: int = 128,
161
+ stage2_retrieval_top_n: int = 1,
162
+ load_pretrained_checkpoint: bool = False,
163
+ device_map=None,
164
+ auto_map: dict = {
165
+ "AutoConfig": "modeling_clara.CLaRaConfig",
166
+ "AutoModel": "modeling_clara.CLaRa"
167
+ },
168
+ **kwargs):
169
+ super().__init__(**kwargs)
170
+
171
+ self.decoder_model_name = decoder_model_name
172
+ self.doc_max_length = doc_max_length
173
+ self.quantization = quantization
174
+ self.sep = sep
175
+
176
+ self.compr_model_name = compr_model_name
177
+ self.compr_rate = compr_rate
178
+ self.compr_use_mlp = compr_use_mlp
179
+ self.compr_mlp_hidden_dim = compr_mlp_hidden_dim
180
+ self.compr_n_layers = compr_n_layers
181
+ self.compr_every_n_layer = compr_every_n_layer
182
+ self.compr_base_model_name = compr_base_model_name
183
+ self.compr_rms_norm = compr_rms_norm
184
+ self.compr_linear_type = compr_linear_type
185
+
186
+ self.lora = lora
187
+ self.lora_compressor = lora_compressor
188
+ self.training_form = training_form
189
+ self.lora_r = lora_r
190
+ self.lora_r_compressor = lora_r_compressor or lora_r
191
+ self.load_adapters = load_adapters
192
+ self.optimize_mem_tokens = optimize_mem_tokens
193
+ self.different_mem_tokens = different_mem_tokens
194
+ self.kbtc_training = kbtc_training
195
+ self.training_stage = training_stage
196
+ self.device_map = device_map
197
+ self.attn_implementation = attn_implementation
198
+ self._attn_implementation_autoset = _attn_implementation_autoset
199
+ self.ae_mode = ae_mode
200
+ self.max_new_tokens = max_new_tokens
201
+ self.auto_map = auto_map
202
+ self.load_pretrained_checkpoint = load_pretrained_checkpoint
203
+
204
+ self.generation_top_k = generation_top_k
205
+ self.stage2_retrieval_top_n = stage2_retrieval_top_n
206
+
207
+ if training_form == 'compressor':
208
+ assert compr_model_name is not None and not self.lora
209
+
210
+
211
+ # Utility functions
212
+ def remote_generate(docs: List[str], questions: List[str], api_url: str) -> List[str]:
213
+ """Generate responses using remote API."""
214
+ response = requests.post(
215
+ f"{api_url}/generate",
216
+ json={"docs": docs, "questions": questions}
217
+ )
218
+ return response.json()["texts"]
219
+
220
+
221
+ def add_memory_tokens_to_inputs(input_ids: torch.Tensor,
222
+ attention_mask: torch.Tensor,
223
+ n_mem_tokens: int,
224
+ tokenizer) -> Tuple[torch.Tensor, torch.Tensor]:
225
+ """Add memory tokens to input sequences."""
226
+ assert len(tokenizer.mem_tokens) == n_mem_tokens
227
+
228
+ mem_tokens = torch.stack([tokenizer.mem_token_ids_pt] * input_ids.size(0), 0)
229
+ assert len(mem_tokens) == input_ids.size(0)
230
+ assert len(mem_tokens[0]) == n_mem_tokens
231
+
232
+ input_ids = torch.cat([input_ids, mem_tokens], dim=1)
233
+ attention_mask = torch.cat([attention_mask, torch.ones(input_ids.size(0), n_mem_tokens)], dim=1)
234
+
235
+ return input_ids, attention_mask
236
+
237
+
238
+ def build_pos_mask(pos_index: List[List[int]], N: int, device: torch.device) -> torch.Tensor:
239
+ """Build positive mask for retrieval training."""
240
+ if isinstance(pos_index, (list, tuple)):
241
+ B = len(pos_index)
242
+ mask = torch.zeros(B, N, dtype=torch.bool, device=device)
243
+ for b, idxs in enumerate(pos_index):
244
+ if len(idxs) > 0:
245
+ mask[b, torch.as_tensor(idxs, device=device, dtype=torch.long)] = True
246
+ return mask
247
+ else: # tensor [B, M]
248
+ B, M = pos_index.shape
249
+ mask = torch.zeros(B, N, dtype=torch.bool, device=device)
250
+ for m in range(M):
251
+ col = pos_index[:, m]
252
+ v = col >= 0
253
+ if v.any():
254
+ mask[v, col[v]] = True
255
+ return mask
256
+
257
+
258
+ def differentiable_topk_top_1(logits: torch.Tensor, k: int, temperature: float = 1.0) -> Tuple[torch.Tensor, torch.Tensor]:
259
+ """Implements differentiable top-1 selection using Gumbel-Softmax."""
260
+ y = logits / temperature
261
+ y_soft = F.softmax(y, dim=-1).float()
262
+
263
+ # Hard one-hot version
264
+ index = y_soft.argmax(dim=-1, keepdim=True)
265
+ y_hard = torch.zeros_like(y_soft).scatter_(-1, index, 1.0)
266
+
267
+ # Straight-through estimator
268
+ z = y_hard + y_soft - y_soft.detach()
269
+ z = z.unsqueeze(1).to(logits.dtype)
270
+
271
+ return z, index
272
+
273
+
274
+ def differentiable_topk(logits: torch.Tensor, k: int, temperature: float = 1.0) -> Tuple[torch.Tensor, torch.Tensor]:
275
+ """Differentiable top-k selection."""
276
+ B, N = logits.shape
277
+ perturbed = logits / max(temperature, 1e-6)
278
+
279
+ # Hard top-k indices
280
+ topk_vals, topk_idx = perturbed.topk(k, dim=-1)
281
+ K_hard = torch.zeros(B, k, N, device=logits.device, dtype=logits.dtype)
282
+ K_hard.scatter_(2, topk_idx.unsqueeze(-1), 1.0)
283
+
284
+ # Soft distributions for each slot
285
+ K_soft = torch.zeros_like(K_hard)
286
+ taken = torch.zeros(B, N, device=logits.device, dtype=logits.dtype)
287
+
288
+ for j in range(k):
289
+ mask = (1.0 - taken.detach())
290
+ masked = perturbed + (mask + 1e-8).log()
291
+ pj = F.softmax(masked, dim=-1).float()
292
+ K_soft[:, j, :] = pj
293
+ taken = torch.clamp(taken + K_hard[:, j, :], max=1.0)
294
+
295
+ # Straight-through estimator
296
+ W = K_hard + (K_soft - K_soft.detach())
297
+ return W, topk_idx
298
+
299
+
300
+ class CLaRa(PreTrainedModel):
301
+ """CLaRa: Unified Retrieval-Augmented Generation Model."""
302
+
303
+ config_class = CLaRaConfig
304
+
305
+ def __init__(self, cfg: CLaRaConfig):
306
+ super().__init__(cfg)
307
+ self.decoder_model_name = cfg.decoder_model_name
308
+ self.decoder = self._create_decoder(cfg)
309
+ self.doc_max_length = cfg.doc_max_length
310
+
311
+ print(f'Base decoder parameters: {self.decoder.num_parameters()}')
312
+
313
+ # Model configuration
314
+ self.compr_model_name = cfg.compr_model_name
315
+ self.training_form = cfg.training_form
316
+ self.lora = cfg.lora
317
+ self.adapter_keys = []
318
+ self.compr = None
319
+
320
+ # Initialize LoRA adapters if needed
321
+ if cfg.lora and not getattr(cfg, 'pure_inference', False):
322
+ self._setup_lora_adapters(cfg)
323
+
324
+ print(f'Model adapter keys: {self.adapter_keys}')
325
+
326
+ # Initialize tokenizer and resize embeddings
327
+ self.decoder_tokenizer = self._create_decoder_tokenizer(cfg)
328
+ self.decoder.resize_token_embeddings(len(self.decoder_tokenizer))
329
+ self._configure_generation_config()
330
+
331
+ # Model parameters
332
+ self.generation_top_k = cfg.generation_top_k
333
+ self.training_stage = cfg.training_stage
334
+ self.stage2_retrieval_top_n = cfg.stage2_retrieval_top_n
335
+ self.sep = cfg.sep
336
+ self.compr_rate = cfg.compr_rate
337
+ self.local_rank = os.getenv('LOCAL_RANK', '0')
338
+
339
+ self.n_mem_tokens = self.doc_max_length // self.compr_rate
340
+ self.hidden_size = self.decoder.config.hidden_size
341
+
342
+ # Setup adapters and memory token optimization
343
+ if self.lora:
344
+ self._setup_adapter_training()
345
+ else:
346
+ print(f'Total trainable parameters: {self.num_parameters(only_trainable=True)}')
347
+
348
+ self._prepare_mem_tokens_optimization()
349
+
350
+ # Retrieval configuration
351
+ self.url_retrieval = "http://127.0.0.1:5004/queries"
352
+
353
+ def _create_decoder(self, cfg: CLaRaConfig) -> AutoModelForCausalLM:
354
+ """Create and configure the decoder model."""
355
+ if not torch.cuda.is_available():
356
+ return AutoModelForCausalLM.from_pretrained(
357
+ cfg.decoder_model_name,
358
+ torch_dtype=torch.bfloat16,
359
+ resume_download=True,
360
+ trust_remote_code=True,
361
+ device_map=cfg.device_map
362
+ )
363
+
364
+ if cfg.quantization == "no":
365
+ return AutoModelForCausalLM.from_pretrained(
366
+ cfg.decoder_model_name,
367
+ torch_dtype=torch.bfloat16,
368
+ attn_implementation=cfg.attn_implementation,
369
+ device_map=cfg.device_map
370
+ )
371
+ elif cfg.quantization == "int4":
372
+ quant_config = BitsAndBytesConfig(
373
+ load_in_4bit=True,
374
+ bnb_4bit_quant_type='nf4',
375
+ bnb_4bit_compute_dtype='bfloat16',
376
+ )
377
+ return AutoModelForCausalLM.from_pretrained(
378
+ cfg.decoder_model_name,
379
+ quantization_config=quant_config,
380
+ attn_implementation=cfg.attn_implementation,
381
+ torch_dtype=torch.bfloat16,
382
+ resume_download=True,
383
+ trust_remote_code=True,
384
+ device_map=cfg.device_map
385
+ )
386
+ elif cfg.quantization == "int8":
387
+ quant_config = BitsAndBytesConfig(
388
+ load_in_8bit=True,
389
+ llm_int8_enable_fp32_cpu_offload=True,
390
+ bnb_4bit_compute_dtype='bfloat16',
391
+ )
392
+ return AutoModelForCausalLM.from_pretrained(
393
+ cfg.decoder_model_name,
394
+ quantization_config=quant_config,
395
+ attn_implementation=cfg.attn_implementation,
396
+ torch_dtype=torch.bfloat16,
397
+ resume_download=True,
398
+ trust_remote_code=True,
399
+ device_map=cfg.device_map
400
+ )
401
+ else:
402
+ raise NotImplementedError(f"Quantization {cfg.quantization} not supported")
403
+
404
+ def _setup_lora_adapters(self, cfg: CLaRaConfig):
405
+ """Setup LoRA adapters based on training stage."""
406
+ peft_config = self._get_peft_config(lora_r=cfg.lora_r)
407
+
408
+ if cfg.training_stage == "stage1" and cfg.load_adapters:
409
+ print('Loading encoder and decoder adapter for stage1')
410
+ self.decoder.add_adapter(peft_config, 'decoder_adapter')
411
+ self.adapter_keys.append('decoder_adapter')
412
+ self.decoder.add_adapter(peft_config, 'encoder_adapter')
413
+ self.adapter_keys.append('encoder_adapter')
414
+ elif cfg.training_stage == "stage2" and cfg.load_adapters:
415
+ if 'decoder_adapter' not in self.adapter_keys:
416
+ self.decoder.add_adapter(peft_config, 'decoder_adapter')
417
+ self.adapter_keys.append('decoder_adapter')
418
+ if 'query_reasoner_adapter' not in self.adapter_keys:
419
+ self.decoder.add_adapter(peft_config, 'query_reasoner_adapter')
420
+ self.adapter_keys.append('query_reasoner_adapter')
421
+ elif cfg.training_stage == 'stage1_2':
422
+ if not cfg.load_adapters:
423
+ print('Loading decoder adapter for stage1_2')
424
+ self.decoder.add_adapter(peft_config, 'decoder_adapter')
425
+ self.adapter_keys.append('decoder_adapter')
426
+ elif cfg.load_adapters:
427
+ print('Loading encoder and decoder adapter for stage1_2')
428
+ self.decoder.add_adapter(peft_config, 'encoder_adapter')
429
+ self.adapter_keys.append('encoder_adapter')
430
+ self.decoder.add_adapter(peft_config, 'decoder_adapter')
431
+ self.adapter_keys.append('decoder_adapter')
432
+ elif cfg.training_stage == 'stage2_reasoning':
433
+ if not cfg.load_adapters:
434
+ print('Loading decoder adapter for stage2_reasoning')
435
+ self.decoder.add_adapter(peft_config, 'decoder_adapter')
436
+ self.adapter_keys.append('decoder_adapter')
437
+
438
+ def _setup_adapter_training(self):
439
+ """Setup adapters for training."""
440
+ for adapter_key in self.adapter_keys:
441
+ self.decoder.set_adapter(adapter_key)
442
+ print(f'Adapter {adapter_key} trainable parameters: {self.num_parameters(only_trainable=True)}')
443
+ self._set_all_adapters()
444
+
445
+ def _configure_generation_config(self):
446
+ """Configure generation parameters."""
447
+ self.decoder.generation_config.top_p = None
448
+ self.decoder.generation_config.temperature = None
449
+ self.decoder.generation_config.pad_token_id = self.decoder_tokenizer.pad_token_id
450
+
451
+ @staticmethod
452
+ def _create_decoder_tokenizer(cfg: CLaRaConfig) -> AutoTokenizer:
453
+ """Create and configure the decoder tokenizer."""
454
+ tokenizer = AutoTokenizer.from_pretrained(
455
+ cfg.decoder_model_name,
456
+ use_fast=True,
457
+ padding_side='left'
458
+ )
459
+
460
+ # Define special tokens
461
+ n_mem_tokens = cfg.doc_max_length // cfg.compr_rate
462
+ existing_special_tokens = tokenizer.special_tokens_map.get("additional_special_tokens", [])
463
+
464
+ if cfg.different_mem_tokens:
465
+ mem_tokens = [f'<MEM{i}>' for i in range(n_mem_tokens)]
466
+ tokenizer.add_special_tokens({
467
+ 'additional_special_tokens': existing_special_tokens + mem_tokens + ['<AE>', '<ENC>', '<SEP>']
468
+ })
469
+ tokenizer.mem_tokens = mem_tokens
470
+ else:
471
+ tokenizer.add_special_tokens({
472
+ 'additional_special_tokens': existing_special_tokens + ['<MEM>', '<AE>', '<ENC>', '<SEP>']
473
+ })
474
+ tokenizer.mem_tokens = ['<MEM>'] * n_mem_tokens
475
+
476
+ tokenizer.mem_token_ids = [tokenizer.convert_tokens_to_ids(token) for token in tokenizer.mem_tokens]
477
+ tokenizer.mem_token_ids_pt = torch.LongTensor(tokenizer.mem_token_ids)
478
+
479
+ # Additional special tokens
480
+ tokenizer.ae_token = '<AE>'
481
+ tokenizer.ae_token_id = tokenizer.convert_tokens_to_ids('<AE>')
482
+ tokenizer.enc_token = '<ENC>'
483
+ tokenizer.sep_token = '<SEP>'
484
+ tokenizer.sep_token_id = tokenizer.convert_tokens_to_ids('<SEP>')
485
+
486
+ # Handle model-specific tokens
487
+ if tokenizer.bos_token is None and 'qwen' in cfg.decoder_model_name.lower():
488
+ tokenizer.bos_token = tokenizer.special_tokens_map['additional_special_tokens'][0]
489
+ tokenizer.bos_token_id = tokenizer.convert_tokens_to_ids(tokenizer.bos_token)
490
+
491
+ if tokenizer.eos_token is None and "qwen" in cfg.decoder_model_name.lower():
492
+ tokenizer.eos_token = tokenizer.special_tokens_map['additional_special_tokens'][1]
493
+ tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids(tokenizer.eos_token)
494
+
495
+ # KBTC training tokens
496
+ if cfg.kbtc_training:
497
+ tokenizer.add_special_tokens({'additional_special_tokens': ['<KBTC>']})
498
+ tokenizer.kbtc_token = '<KBTC>'
499
+ tokenizer.kbtc_token_id = tokenizer.convert_tokens_to_ids('<KBTC>')
500
+
501
+ # Set pad token
502
+ if tokenizer.pad_token_id is None:
503
+ tokenizer.pad_token_id = tokenizer.bos_token_id
504
+
505
+ print(f'Memory token count: {n_mem_tokens}')
506
+ return tokenizer
507
+
508
+ def _get_peft_config(self, lora_r: int) -> LoraConfig:
509
+ """Build the PEFT configuration."""
510
+ return LoraConfig(
511
+ task_type="CAUSAL_LM",
512
+ r=lora_r,
513
+ lora_alpha=2*lora_r,
514
+ target_modules='all-linear',
515
+ lora_dropout=0.1
516
+ )
517
+
518
+ def _prepare_mem_tokens_optimization(self):
519
+ """Setup memory token optimization if enabled."""
520
+ if self.config.optimize_mem_tokens and self.compr is None:
521
+ # Enable gradients for input embeddings
522
+ self.decoder.get_input_embeddings().weight.requires_grad = True
523
+
524
+ # Apply hook to zero gradients except for memory tokens
525
+ def hook(grad):
526
+ mask = torch.zeros_like(grad)
527
+ mask[self.decoder_tokenizer.mem_token_ids] = 1.0
528
+ return grad * mask
529
+
530
+ self.decoder.get_input_embeddings().weight.register_hook(hook)
531
+
532
+ def _set_all_adapters(self):
533
+ """Activate all adapters for training."""
534
+ if len(self.adapter_keys) > 0:
535
+ self.decoder.set_adapter(self.adapter_keys)
536
+
537
+ # Core compression and generation methods
538
+ def compress(self, enc_input_ids: torch.Tensor, enc_attention_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
539
+ """Compress input documents."""
540
+ if self.compr:
541
+ return self.compr(enc_input_ids, enc_attention_mask)
542
+ else:
543
+ return self._compr_decoder(enc_input_ids, enc_attention_mask)
544
+
545
+ def _compr_decoder(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
546
+ """Use decoder as compressor."""
547
+ assert input_ids.size() == attention_mask.size()
548
+
549
+ if 'encoder_adapter' in self.adapter_keys:
550
+ self.decoder.set_adapter('encoder_adapter')
551
+ else:
552
+ raise ValueError(f"encoder_adapter not in adapter_keys: {self.adapter_keys}")
553
+
554
+ # Get embeddings from decoder
555
+ emb = self.decoder(
556
+ input_ids=input_ids,
557
+ attention_mask=attention_mask,
558
+ output_hidden_states=True
559
+ ).hidden_states[-1]
560
+
561
+ # Create mask for memory tokens
562
+ mask = torch.isin(
563
+ input_ids,
564
+ self.decoder_tokenizer.mem_token_ids_pt.to(input_ids.device)
565
+ )
566
+
567
+ # Calculate MSE loss between memory and non-memory regions
568
+ attn = attention_mask.bool()
569
+ mem_mask = mask & attn
570
+ non_mem_mask = (~mask) & attn
571
+
572
+ mem_len = mem_mask.sum(dim=1)
573
+ non_mem_len = non_mem_mask.sum(dim=1)
574
+
575
+ if (mem_len == 0).any():
576
+ raise ValueError("Some samples have no memory tokens")
577
+ if (non_mem_len == 0).any():
578
+ raise ValueError("Some samples have no non-memory tokens")
579
+
580
+ mem_sum = (emb * mem_mask.unsqueeze(-1)).sum(dim=1)
581
+ non_mem_sum = (emb * non_mem_mask.unsqueeze(-1)).sum(dim=1)
582
+
583
+ mem_mean = mem_sum / mem_len.unsqueeze(-1)
584
+ non_mem_mean = non_mem_sum / non_mem_len.unsqueeze(-1)
585
+
586
+ mse_loss = F.mse_loss(non_mem_mean, mem_mean, reduction='mean')
587
+
588
+ return emb[mask].reshape(emb.size(0), -1, emb.size(-1)), mse_loss
589
+
590
+ def _compr_query_reasoner_stage2(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
591
+ """Query reasoning compression for stage 2."""
592
+ assert input_ids.size() == attention_mask.size()
593
+
594
+ if 'query_reasoner_adapter' in self.adapter_keys:
595
+ self.decoder.set_adapter('query_reasoner_adapter')
596
+ else:
597
+ raise ValueError(f"query_reasoner_adapter not in adapter_keys: {self.adapter_keys}")
598
+
599
+ emb = self.decoder(
600
+ input_ids=input_ids,
601
+ attention_mask=attention_mask,
602
+ output_hidden_states=True
603
+ ).hidden_states[-1]
604
+
605
+ mask = torch.isin(
606
+ input_ids,
607
+ self.decoder_tokenizer.mem_token_ids_pt.to(input_ids.device)
608
+ )
609
+
610
+ return emb[mask].reshape(emb.size(0), -1)
611
+
612
+ # Generation methods
613
+ def generate_from_questions(self,
614
+ questions: List[str],
615
+ max_new_tokens: int = 128,
616
+ temperature: float = 0.5,
617
+ documents: List[List[str]] = None,
618
+ stage2_mips: bool = False,
619
+ stage2_retrieval_top_n: int = None,
620
+ time_count: bool = False) -> Tuple[List[str], torch.Tensor]:
621
+ """Generate answers from questions using query reasoning."""
622
+ if "query_reasoner_adapter" not in self.adapter_keys:
623
+ raise ValueError("Query reasoner adapter not found")
624
+
625
+ self.eval()
626
+
627
+ with torch.no_grad():
628
+ # Encode questions
629
+ self.decoder.set_adapter('query_reasoner_adapter')
630
+ flat_questions = [q for q in questions]
631
+
632
+ if time_count:
633
+ start_time = time.time()
634
+
635
+ q_tok = self._prepare_encoder_inputs(flat_questions, max_length=self.doc_max_length)
636
+ query_reps = self._compr_query_reasoner_stage2(
637
+ q_tok["input_ids"].to(self.decoder.device),
638
+ q_tok["attention_mask"].to(self.decoder.device)
639
+ )
640
+
641
+ # Document retrieval and selection
642
+ if stage2_mips:
643
+ retrieved_doc_embeddings = self._retrieve_embeddings(
644
+ query_reps, stage2_retrieval_top_n=stage2_retrieval_top_n
645
+ )
646
+ scores = torch.bmm(
647
+ query_reps.unsqueeze(1),
648
+ retrieved_doc_embeddings.transpose(1, 2)
649
+ ).squeeze(1)
650
+ z, topk_idx = differentiable_topk(scores, self.generation_top_k, temperature=0.5)
651
+ selected_doc_embeddings = torch.einsum('bkn,bnd->bkd', z, retrieved_doc_embeddings)
652
+ selected_doc_embeddings = selected_doc_embeddings.view(
653
+ selected_doc_embeddings.size(0) * selected_doc_embeddings.size(1),
654
+ -1, self.hidden_size
655
+ )
656
+ else:
657
+ # Use provided documents
658
+ flat_documents = sum(documents, [])
659
+
660
+ if time_count:
661
+ start_time1 = time.time()
662
+
663
+ input_encoder = self._prepare_encoder_inputs(flat_documents, max_length=self.doc_max_length)
664
+ device = self.decoder.device
665
+ enc_input_ids = input_encoder['input_ids'].to(device)
666
+ enc_attention_mask = input_encoder['attention_mask'].to(device)
667
+ retrieved_doc_embeddings, _ = self.compress(enc_input_ids, enc_attention_mask)
668
+
669
+ if time_count:
670
+ start_time2 = time.time()
671
+ compress_time = start_time2 - start_time1
672
+
673
+ B = len(questions)
674
+ stage2_retrieval_top_n = retrieved_doc_embeddings.shape[0] // B
675
+ retrieved_doc_embeddings = retrieved_doc_embeddings.reshape(B, stage2_retrieval_top_n, -1)
676
+ query_reps = query_reps.to(retrieved_doc_embeddings.dtype)
677
+
678
+ if time_count:
679
+ start_time3 = time.time()
680
+
681
+ scores = torch.bmm(
682
+ F.normalize(query_reps, dim=-1, p=2).unsqueeze(1).float(),
683
+ F.normalize(retrieved_doc_embeddings, dim=-1, p=2).float().transpose(1, 2)
684
+ ).squeeze(1)
685
+
686
+ z, topk_idx = differentiable_topk(scores, self.generation_top_k, temperature=0.02)
687
+ selected_doc_embeddings = torch.einsum('bkn,bnd->bkd', z.to(retrieved_doc_embeddings.dtype), retrieved_doc_embeddings)
688
+ selected_doc_embeddings = selected_doc_embeddings.view(
689
+ selected_doc_embeddings.size(0) * selected_doc_embeddings.size(1),
690
+ -1, self.hidden_size
691
+ )
692
+
693
+ if time_count:
694
+ start_time4 = time.time()
695
+ query_time = start_time4 - start_time3 + start_time1 - start_time
696
+
697
+ # Generate instructions and decode
698
+ if time_count:
699
+ start_time5 = time.time()
700
+
701
+ instructions = [
702
+ self._blend_prompt_and_selected_memory_tokens(query=q)[1]
703
+ for q in questions
704
+ ]
705
+
706
+ decoder_inputs = self.decoder_tokenizer(
707
+ instructions,
708
+ return_tensors='pt',
709
+ padding="longest",
710
+ add_special_tokens=False,
711
+ truncation=True,
712
+ max_length=1024,
713
+ )
714
+
715
+ dec_input_ids = decoder_inputs['input_ids'].to(self.decoder.device)
716
+ dec_attention_mask = decoder_inputs['attention_mask'].to(self.decoder.device)
717
+
718
+ # Replace memory token embeddings
719
+ inputs_embeds = self._replace_emb_stage2(selected_doc_embeddings, dec_input_ids)
720
+
721
+ # Switch to decoder adapter for generation
722
+ if 'decoder_adapter' in self.adapter_keys:
723
+ self.decoder.set_adapter('decoder_adapter')
724
+
725
+ # Generate answers
726
+ output_ids = self.decoder.generate(
727
+ inputs_embeds=inputs_embeds,
728
+ attention_mask=dec_attention_mask,
729
+ do_sample=False,
730
+ top_p=None,
731
+ temperature=None,
732
+ max_new_tokens=max_new_tokens,
733
+ pad_token_id=self.decoder_tokenizer.pad_token_id
734
+ )
735
+
736
+ if time_count:
737
+ start_time6 = time.time()
738
+ generate_time = start_time6 - start_time5
739
+
740
+ # Decode generated tokens
741
+ decoded = self.decoder_tokenizer.batch_decode(output_ids, skip_special_tokens=True)
742
+
743
+ if time_count:
744
+ return decoded, topk_idx, compress_time, query_time, generate_time, compress_time + query_time + generate_time
745
+ else:
746
+ return decoded, topk_idx
747
+ def generate_from_paraphrase(self, questions: list[str], documents: list[list[str]], max_new_tokens: int = 128) -> list[str]:
748
+ """
749
+ Generates answers from documents (via compression then decoding)
750
+ questions: list of string
751
+ documents: list of list of strings (they should all be of equal length: the nb of doc for each question)
752
+ """
753
+ self.generation_top_k = len(documents[0])
754
+ assert len(documents) == len(questions)
755
+ assert all([len(context) == len(documents[0]) for context in documents])
756
+ flat_documents = sum(documents, [])
757
+
758
+ model_input = {}
759
+
760
+ # Creating encoder inputs:
761
+ input_encoder = self._prepare_encoder_inputs(flat_documents, max_length=self.doc_max_length)
762
+ device = self.decoder.device
763
+ model_input['enc_input_ids'], model_input['enc_attention_mask'] = input_encoder['input_ids'].to(device), input_encoder['attention_mask'].to(device)
764
+
765
+ # Creating decoder inputs
766
+ instr = [self._blend_prompt_and_memory_tokens(query="", stage = "stage1", paraphrase_loss = True) for q in questions]
767
+ inp_dec = self.decoder_tokenizer(instr, return_tensors='pt', padding="longest", add_special_tokens=False, truncation=True, max_length=1024)
768
+ model_input['dec_input_ids'], model_input['dec_attention_mask'] = inp_dec['input_ids'].to(device), inp_dec['attention_mask'].to(device)
769
+
770
+ # Generation
771
+ return self._generate(model_input, max_new_tokens=max_new_tokens)
772
+
773
+
774
+ def generate_from_text(self,
775
+ questions: List[str],
776
+ documents: List[List[str]],
777
+ max_new_tokens: int = 128) -> List[str]:
778
+ """Generate answers from documents via compression then decoding."""
779
+ self.generation_top_k = len(documents[0])
780
+ assert len(documents) == len(questions)
781
+ assert all(len(context) == len(documents[0]) for context in documents)
782
+
783
+ flat_documents = sum(documents, [])
784
+
785
+ # Create encoder inputs
786
+ input_encoder = self._prepare_encoder_inputs(flat_documents, max_length=self.doc_max_length)
787
+ device = self.decoder.device
788
+ enc_input_ids = input_encoder['input_ids'].to(device)
789
+ enc_attention_mask = input_encoder['attention_mask'].to(device)
790
+
791
+ # Create decoder inputs
792
+ instructions = [self._blend_prompt_and_memory_tokens(query=q, stage="stage1_2") for q in questions]
793
+ inp_dec = self.decoder_tokenizer(
794
+ instructions,
795
+ return_tensors='pt',
796
+ padding="longest",
797
+ add_special_tokens=False,
798
+ truncation=True,
799
+ max_length=1024
800
+ )
801
+ dec_input_ids = inp_dec['input_ids'].to(device)
802
+ dec_attention_mask = inp_dec['attention_mask'].to(device)
803
+
804
+ # Generate
805
+ return self._generate({
806
+ 'enc_input_ids': enc_input_ids,
807
+ 'enc_attention_mask': enc_attention_mask,
808
+ 'dec_input_ids': dec_input_ids,
809
+ 'dec_attention_mask': dec_attention_mask
810
+ }, max_new_tokens=max_new_tokens)
811
+
812
+ def generate_from_compressed_documents_and_questions(self,
813
+ questions: List[str],
814
+ compressed_documents: torch.Tensor,
815
+ max_new_tokens: int = 128) -> List[str]:
816
+ """Generate answers from compressed documents."""
817
+ self.generation_top_k = compressed_documents.size(0) // len(questions)
818
+ assert compressed_documents.size(0) % self.generation_top_k == 0
819
+
820
+ # Create decoder inputs
821
+ instructions = [self._blend_prompt_and_memory_tokens(query=q, stage="stage1_2") for q in questions]
822
+ inp_dec = self.decoder_tokenizer(
823
+ instructions,
824
+ return_tensors='pt',
825
+ padding="longest",
826
+ add_special_tokens=False,
827
+ truncation=True,
828
+ max_length=1024
829
+ )
830
+ device = self.decoder.device
831
+ dec_input_ids = inp_dec['input_ids'].to(device)
832
+ dec_attention_mask = inp_dec['attention_mask'].to(device)
833
+
834
+ # Create input decoder embeddings from prompt + compressed documents
835
+ inputs_embeds = self._replace_emb(compressed_documents, dec_input_ids)
836
+
837
+ # Activate decoder generator
838
+ if 'decoder_adapter' in self.adapter_keys:
839
+ self.decoder.set_adapter('decoder_adapter')
840
+
841
+ output_ids = self.decoder.generate(
842
+ inputs_embeds=inputs_embeds,
843
+ attention_mask=dec_attention_mask,
844
+ max_new_tokens=max_new_tokens
845
+ )
846
+
847
+ return self.decoder_tokenizer.batch_decode(output_ids, skip_special_tokens=True)
848
+
849
+ def compress_documents(self, documents: List[str]) -> torch.Tensor:
850
+ """Compress a list of documents."""
851
+ input_encoder = self._prepare_encoder_inputs(documents, max_length=self.doc_max_length)
852
+ enc_input_ids = input_encoder['input_ids'].to(self.decoder.device)
853
+ attention_mask = input_encoder['attention_mask'].to(self.decoder.device)
854
+ return self.compress(enc_input_ids=enc_input_ids, enc_attention_mask=attention_mask)
855
+
856
+ # Helper methods
857
+ def _prepare_encoder_inputs(self, texts: List[str], max_length: int, q_texts: List[str] = None) -> Dict[str, torch.Tensor]:
858
+ """Create inputs for the encoder."""
859
+ if q_texts is not None:
860
+ assert len(texts) == len(q_texts)
861
+
862
+ if self.compr is None:
863
+ return self._prepare_encoder_inputs_to_decoder(texts, max_length, q_texts)
864
+ else:
865
+ return self.compr.prepare_inputs(texts, max_length, q_texts)
866
+
867
+ def _prepare_encoder_inputs_to_decoder(self, texts: List[str], max_length: int, q_texts: List[str] = None) -> Dict[str, torch.Tensor]:
868
+ """Prepare encoder inputs when using decoder as compressor."""
869
+ if q_texts is not None:
870
+ texts_to_encode = [
871
+ self.decoder_tokenizer.enc_token +
872
+ self.decoder_tokenizer.bos_token +
873
+ '\nQuery:\n' + query +
874
+ 'Document:\n' + text +
875
+ self.decoder_tokenizer.eos_token
876
+ for text, query in zip(texts, q_texts)
877
+ ]
878
+ inp_enc = self.decoder_tokenizer(
879
+ texts_to_encode,
880
+ return_tensors='pt',
881
+ padding='max_length',
882
+ max_length=max_length + 8,
883
+ truncation=True,
884
+ add_special_tokens=False
885
+ )
886
+ else:
887
+ inp_enc = [
888
+ self.decoder_tokenizer.enc_token +
889
+ self.decoder_tokenizer.bos_token +
890
+ text +
891
+ self.decoder_tokenizer.eos_token
892
+ for text in texts
893
+ ]
894
+ inp_enc = self.decoder_tokenizer(
895
+ inp_enc,
896
+ return_tensors='pt',
897
+ padding="max_length",
898
+ max_length=max_length + 3,
899
+ truncation=True,
900
+ add_special_tokens=False
901
+ )
902
+
903
+ num_mem_tokens = self.doc_max_length // self.compr_rate
904
+ assert num_mem_tokens == len(self.decoder_tokenizer.mem_tokens)
905
+
906
+ inp_enc['input_ids'], inp_enc['attention_mask'] = add_memory_tokens_to_inputs(
907
+ inp_enc['input_ids'],
908
+ inp_enc['attention_mask'],
909
+ num_mem_tokens,
910
+ tokenizer=self.decoder_tokenizer
911
+ )
912
+
913
+ return inp_enc
914
+
915
+ def _replace_emb(self, compressed_embs: torch.Tensor, dec_input_ids: torch.Tensor) -> torch.Tensor:
916
+ """Replace memory tokens in decoder input with compressed embeddings."""
917
+ indices = range(0, compressed_embs.size(0) + 1, self.generation_top_k)
918
+ return self._replace_embeddings(compressed_embs, dec_input_ids, indices)
919
+
920
+ def _replace_emb_stage2(self, compressed_embs: torch.Tensor, dec_input_ids: torch.Tensor) -> torch.Tensor:
921
+ """Replace memory tokens for stage 2."""
922
+ indices = range(0, compressed_embs.size(0) + 1, self.generation_top_k)
923
+ return self._replace_embeddings(compressed_embs, dec_input_ids, indices)
924
+
925
+ def _replace_embeddings(self, compressed_embs: torch.Tensor, dec_input_ids: torch.Tensor, indices: range) -> torch.Tensor:
926
+ """Replace memory tokens with compressed embeddings."""
927
+ inputs_embeds = self.decoder.get_input_embeddings()(dec_input_ids)
928
+ num_embs = compressed_embs.size(1)
929
+ slot_len = num_embs + (1 if self.sep else 0)
930
+
931
+ # Get first memory token indices
932
+ first_mem_token_indices = torch.argmax(
933
+ (dec_input_ids == self.decoder_tokenizer.mem_token_ids[0]).int(), dim=1
934
+ )
935
+ batch_size = inputs_embeds.size(0)
936
+
937
+ # Replace with compressed embeddings
938
+ for i in range(batch_size):
939
+ for j in range(indices[i], indices[i + 1]):
940
+ start_idx = first_mem_token_indices[i].item() + (j - indices[i]) * slot_len
941
+ assert inputs_embeds[i, start_idx:start_idx + num_embs, :].size() == compressed_embs[j].size()
942
+ inputs_embeds[i, start_idx:start_idx + num_embs, :] = compressed_embs[j]
943
+
944
+ return inputs_embeds
945
+
946
+ def _retrieve_embeddings(self, questions: torch.Tensor, stage2_retrieval_top_n: int = 1) -> torch.Tensor:
947
+ """Retrieve embeddings of documents."""
948
+ response = requests.post(
949
+ self.url_retrieval,
950
+ json={
951
+ "queries": questions.detach().cpu().float().numpy().tolist(),
952
+ 'k': self.generation_top_k
953
+ }
954
+ )
955
+
956
+ if response.status_code != 200:
957
+ raise Exception(f"Error: {response.status_code} - {response.text}")
958
+
959
+ results = response.json()
960
+ retrieval_embeddings = results['retrieved_embeddings']
961
+ retrieval_embeddings = torch.tensor(
962
+ retrieval_embeddings,
963
+ dtype=torch.bfloat16,
964
+ device=questions.device
965
+ )
966
+
967
+ if len(retrieval_embeddings.shape) == 4:
968
+ retrieval_embeddings = retrieval_embeddings.reshape(
969
+ retrieval_embeddings.shape[0] * retrieval_embeddings.shape[1],
970
+ retrieval_embeddings.shape[2], -1
971
+ )
972
+
973
+ return retrieval_embeddings
974
+
975
+ def _blend_prompt_and_memory_tokens(self, query: str, answer: str = None, qa_loss: bool = False,
976
+ paraphrase_loss: bool = False, stage: str = "stage1") -> Tuple[int, str]:
977
+ """Blend prompt with memory tokens for different training stages."""
978
+ mem_tokens_str = ''.join(self.decoder_tokenizer.mem_tokens) + self.decoder_tokenizer.sep_token
979
+ docs = mem_tokens_str * self.generation_top_k
980
+
981
+ if stage == "stage1":
982
+ if qa_loss:
983
+ return self._blend_qa_prompt(docs, query, answer)
984
+ elif paraphrase_loss:
985
+ return self._blend_paraphrase_prompt(docs, answer)
986
+ elif stage == "stage1_2":
987
+ return self._blend_standard_prompt(docs, query, answer)
988
+
989
+ raise ValueError(f"Unknown stage: {stage}")
990
+
991
+ def _blend_qa_prompt(self, docs: str, query: List[str], answer: List[str]) -> Tuple[int, str]:
992
+ """Create QA prompt for stage 1."""
993
+ prompt_system = 'You are a helpful assistant. Given a document, your task is to generate some single questions to cover all key information of the document and answer them sequentially.'
994
+ prompt_user = f"Background:\n{docs}"
995
+
996
+ sys_prompt = [{"role": "system", "content": prompt_system}]
997
+ user_prompt = [{"role": "user", "content": prompt_user.replace(':\ ', ': ')}]
998
+
999
+ qa_lines = [f"Question: {q}\nAnswer: {a}" for q, a in zip(query, answer)]
1000
+ query_answer = "\n".join(qa_lines)
1001
+ assistant_prompt = [{"role": "assistant", "content": query_answer}]
1002
+
1003
+ try:
1004
+ prompt = self.decoder_tokenizer.apply_chat_template(
1005
+ sys_prompt + user_prompt,
1006
+ tokenize=False,
1007
+ add_generation_prompt=True,
1008
+ enable_thinking=False
1009
+ )
1010
+ response = self.decoder_tokenizer.apply_chat_template(
1011
+ sys_prompt + user_prompt + assistant_prompt,
1012
+ tokenize=False,
1013
+ add_generation_prompt=False,
1014
+ enable_thinking=False
1015
+ )
1016
+ prompt_len = len(self.decoder_tokenizer.encode(prompt, add_special_tokens=False))
1017
+ except TemplateError as e:
1018
+ if "System role not supported" in str(e):
1019
+ messages = [{"role": "user", "content": sys_prompt[0]['content'] + '\n' + user_prompt[0]['content']}]
1020
+ prompt = self.decoder_tokenizer.apply_chat_template(
1021
+ messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
1022
+ )
1023
+ prompt_len = len(self.decoder_tokenizer.encode(prompt, add_special_tokens=False))
1024
+ # Handle response for unsupported system role
1025
+ messages_with_answer = messages + assistant_prompt
1026
+ response = self.decoder_tokenizer.apply_chat_template(
1027
+ messages_with_answer, tokenize=False, add_generation_prompt=False, enable_thinking=False
1028
+ )
1029
+ else:
1030
+ raise e
1031
+
1032
+ return prompt_len, response
1033
+
1034
+ def _blend_paraphrase_prompt(self, docs: str, answer: str) -> Tuple[int, str]:
1035
+ """Create paraphrase prompt for stage 1."""
1036
+ prompt_system = 'You are a helpful assistant. Your task is follow the instructions to paraphrase the background information.'
1037
+ prompt_user = random.choice(PARAPHRASE_INSTRUCTIONS).format(docs=docs)
1038
+
1039
+ sys_prompt = [{"role": "system", "content": prompt_system}]
1040
+ user_prompt = [{"role": "user", "content": prompt_user.replace(':\ ', ': ')}]
1041
+
1042
+ try:
1043
+ prompt = self.decoder_tokenizer.apply_chat_template(
1044
+ sys_prompt + user_prompt,
1045
+ tokenize=False,
1046
+ add_generation_prompt=True,
1047
+ enable_thinking=False
1048
+ )
1049
+ if answer is None:
1050
+ return prompt
1051
+
1052
+ assistant_prompt = [{"role": "assistant", "content": answer}]
1053
+ response = self.decoder_tokenizer.apply_chat_template(
1054
+ sys_prompt + user_prompt + assistant_prompt,
1055
+ tokenize=False,
1056
+ add_generation_prompt=False,
1057
+ enable_thinking=False
1058
+ )
1059
+ prompt_len = len(self.decoder_tokenizer.encode(prompt, add_special_tokens=False))
1060
+ except TemplateError as e:
1061
+ if "System role not supported" in str(e):
1062
+ combined_content = prompt_system + '\n' + prompt_user.replace(':\ ', ': ')
1063
+ messages = [{"role": "user", "content": combined_content}]
1064
+ prompt = self.decoder_tokenizer.apply_chat_template(
1065
+ messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
1066
+ )
1067
+ if answer is None:
1068
+ return prompt
1069
+ prompt_len = len(self.decoder_tokenizer.encode(prompt, add_special_tokens=False))
1070
+ messages_with_answer = messages + [{"role": "assistant", "content": answer}]
1071
+ response = self.decoder_tokenizer.apply_chat_template(
1072
+ messages_with_answer, tokenize=False, add_generation_prompt=False, enable_thinking=False
1073
+ )
1074
+ else:
1075
+ raise e
1076
+
1077
+ return prompt_len, response
1078
+
1079
+ def _blend_standard_prompt(self, docs: str, query: str, answer: str) -> Tuple[int, str]:
1080
+ """Create standard prompt for stage 1_2."""
1081
+ prompt_system = 'You are a helpful assistant. Your task is to extract relevant information from provided documents and to answer to questions as briefly as possible.'
1082
+ prompt_user = f"Background:\n{docs}\n\nQuestion:{query}"
1083
+
1084
+ sys_prompt = [{"role": "system", "content": prompt_system}]
1085
+ user_prompt = [{"role": "user", "content": prompt_user.replace(':\ ', ': ')}]
1086
+
1087
+ try:
1088
+ prompt = self.decoder_tokenizer.apply_chat_template(
1089
+ sys_prompt + user_prompt,
1090
+ tokenize=False,
1091
+ add_generation_prompt=True,
1092
+ enable_thinking=False
1093
+ )
1094
+ if answer is None:
1095
+ return prompt
1096
+
1097
+ assistant_prompt = [{"role": "assistant", "content": answer}]
1098
+ response = self.decoder_tokenizer.apply_chat_template(
1099
+ sys_prompt + user_prompt + assistant_prompt,
1100
+ tokenize=False,
1101
+ add_generation_prompt=False,
1102
+ enable_thinking=False
1103
+ )
1104
+ prompt_len = len(self.decoder_tokenizer.encode(prompt, add_special_tokens=False))
1105
+ except TemplateError as e:
1106
+ if "System role not supported" in str(e):
1107
+ combined_content = prompt_system + '\n' + prompt_user.replace(':\ ', ': ')
1108
+ messages = [{"role": "user", "content": combined_content}]
1109
+ prompt = self.decoder_tokenizer.apply_chat_template(
1110
+ messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
1111
+ )
1112
+ if answer is None:
1113
+ return prompt
1114
+ prompt_len = len(self.decoder_tokenizer.encode(prompt, add_special_tokens=False))
1115
+ messages_with_answer = messages + [{"role": "assistant", "content": answer}]
1116
+ response = self.decoder_tokenizer.apply_chat_template(
1117
+ messages_with_answer, tokenize=False, add_generation_prompt=False, enable_thinking=False
1118
+ )
1119
+ else:
1120
+ raise e
1121
+
1122
+ return prompt_len, response
1123
+
1124
+ def _blend_prompt_and_selected_memory_tokens(self, query: str, answer: str = None) -> Tuple[int, str]:
1125
+ """Create prompt for stage 2 with selected memory tokens."""
1126
+ mem_tokens_str = ''.join(self.decoder_tokenizer.mem_tokens) + self.decoder_tokenizer.sep_token
1127
+ docs = mem_tokens_str * self.generation_top_k
1128
+
1129
+ prompt_system = 'You are a helpful assistant. Your task is to extract relevant information from provided documents and to answer to questions as briefly as possible.'
1130
+ prompt_user = f"Background:\n{docs}\n\nQuestion:{query}"
1131
+
1132
+ sys_prompt = [{"role": "system", "content": prompt_system}]
1133
+ user_prompt = [{"role": "user", "content": prompt_user.replace(':\ ', ': ')}]
1134
+
1135
+ try:
1136
+ prompt = self.decoder_tokenizer.apply_chat_template(
1137
+ sys_prompt + user_prompt,
1138
+ tokenize=False,
1139
+ add_generation_prompt=True,
1140
+ enable_thinking=False
1141
+ )
1142
+ prompt_len = len(self.decoder_tokenizer.encode(prompt, add_special_tokens=False))
1143
+
1144
+ if answer is not None:
1145
+ assistant_prompt = [{"role": "assistant", "content": answer}]
1146
+ response = self.decoder_tokenizer.apply_chat_template(
1147
+ sys_prompt + user_prompt + assistant_prompt,
1148
+ tokenize=False,
1149
+ add_generation_prompt=False,
1150
+ enable_thinking=False
1151
+ )
1152
+ else:
1153
+ response = prompt
1154
+
1155
+ except TemplateError as e:
1156
+ if "System role not supported" in str(e):
1157
+ combined_content = prompt_system + '\n' + prompt_user.replace(':\ ', ': ')
1158
+ messages = [{"role": "user", "content": combined_content}]
1159
+
1160
+ prompt = self.decoder_tokenizer.apply_chat_template(
1161
+ messages,
1162
+ tokenize=False,
1163
+ add_generation_prompt=True,
1164
+ enable_thinking=False
1165
+ )
1166
+ prompt_len = len(self.decoder_tokenizer.encode(prompt, add_special_tokens=False))
1167
+
1168
+ if answer is not None:
1169
+ messages_with_answer = messages + [{"role": "assistant", "content": answer}]
1170
+ response = self.decoder_tokenizer.apply_chat_template(
1171
+ messages_with_answer,
1172
+ tokenize=False,
1173
+ add_generation_prompt=False,
1174
+ enable_thinking=False
1175
+ )
1176
+ else:
1177
+ response = prompt
1178
+ else:
1179
+ raise e
1180
+
1181
+ return prompt_len, response
1182
+
1183
+ # Model saving and loading methods
1184
+ def save_pretrained(self, save_directory: str, **kwargs):
1185
+ """Save only the LoRA adapters and their configurations."""
1186
+ if self.lora:
1187
+ if not os.path.exists(save_directory):
1188
+ os.makedirs(save_directory)
1189
+
1190
+ # Save LoRA adapter weights
1191
+ torch.save(
1192
+ self._get_all_adapters_state_dict(),
1193
+ os.path.join(save_directory, "adapters.pth")
1194
+ )
1195
+
1196
+ # Save first and last layers of decoder
1197
+ torch.save(
1198
+ self._get_decoder_first_and_last_layer_state_dict(),
1199
+ os.path.join(save_directory, "decoder_first_last_layers.pth")
1200
+ )
1201
+
1202
+ # Save configuration
1203
+ self.config.save_pretrained(save_directory)
1204
+ else:
1205
+ super().save_pretrained(save_directory, **kwargs)
1206
+
1207
+ def _get_all_adapters_state_dict(self) -> Dict[str, Dict[str, torch.Tensor]]:
1208
+ """Return the state dicts of all adapters."""
1209
+ return {
1210
+ key: {k: v.cpu() for k, v in self.decoder.get_adapter_state_dict(key).items()}
1211
+ for key in self.adapter_keys
1212
+ }
1213
+
1214
+ def _get_decoder_first_and_last_layer_state_dict(self) -> Dict[str, torch.Tensor]:
1215
+ """Get first and last layers that change when adding tokens."""
1216
+ out = {}
1217
+ for k, v in self.decoder.named_parameters():
1218
+ if 'lm_head.weight' in k or 'embed_tokens.weight' in k:
1219
+ out[k] = v.cpu()
1220
+ return out
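# A small sketch of what a LoRA checkpoint written by save_pretrained() above should
# contain; the file names match the compression-16 / compression-128 folders in this repo,
# and "test_ckpt" is just an illustrative path.
import os
ckpt_dir = "test_ckpt"
expected = ["adapters.pth", "decoder_first_last_layers.pth", "config.json"]
if os.path.isdir(ckpt_dir):
    missing = [name for name in expected if not os.path.exists(os.path.join(ckpt_dir, name))]
    print("missing checkpoint files:", missing or "none")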
1221
+
1222
+ @classmethod
1223
+ def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs):
1224
+ """Load model from pretrained checkpoint."""
1225
+ # Load configuration
1226
+ config = CLaRaConfig.from_pretrained(pretrained_model_name_or_path)
1227
+
1228
+ # Update config with kwargs
1229
+ for key, value in kwargs.items():
1230
+ if hasattr(config, key):
1231
+ setattr(config, key, value)
1232
+
1233
+ map_location = torch.device("cpu") if not torch.cuda.is_available() else None
1234
+
1235
+ if config.lora:
1236
+ # Delay adapter construction
1237
+ config.load_adapters = False
1238
+ if 'device_map' in kwargs:
1239
+ config.device_map = kwargs['device_map']
1240
+
1241
+ # Initialize model
1242
+ print(f"Initializing model from trained checkpoint: {config}")
1243
+ model = cls(config)
1244
+
1245
+ # Load first and last layers
1246
+ try:
1247
+ first_and_last_layers_path = hf_hub_download(
1248
+ repo_id=pretrained_model_name_or_path,
1249
+ filename="decoder_first_last_layers.pth"
1250
+ )
1251
+ except Exception:
1252
+ first_and_last_layers_path = os.path.join(
1253
+ pretrained_model_name_or_path, "decoder_first_last_layers.pth"
1254
+ )
1255
+
1256
+ if os.path.exists(first_and_last_layers_path):
1257
+ first_and_last_decoder_state_dict = torch.load(
1258
+ first_and_last_layers_path, map_location=map_location, weights_only=True
1259
+ )
1260
+ for key in first_and_last_decoder_state_dict:
1261
+ assert key in model.decoder.state_dict()
1262
+ model.decoder.load_state_dict(first_and_last_decoder_state_dict, strict=False)
1263
+ else:
1264
+ print(f'First and last layers not found: {first_and_last_layers_path}')
1265
+
1266
+ peft_config = model._get_peft_config(lora_r=config.lora_r)
1267
+
1268
+ # Load LoRA adapters
1269
+ try:
1270
+ adapters_path = hf_hub_download(
1271
+ repo_id=pretrained_model_name_or_path,
1272
+ filename="adapters.pth"
1273
+ )
1274
+ except Exception:
1275
+ adapters_path = os.path.join(pretrained_model_name_or_path, "adapters.pth")
1276
+
1277
+ if os.path.exists(adapters_path):
1278
+ adapters_state_dict = torch.load(adapters_path, map_location=map_location, weights_only=True)
1279
+ model._load_adapters_from_state_dict(adapters_state_dict, peft_config, config)
1280
+ else:
1281
+ warnings.warn(f'Adapters not found at {adapters_path}')
1282
+
1283
+ model._set_all_adapters()
1284
+ config.load_adapters = True
1285
+ return model
1286
+ else:
1287
+ return super().from_pretrained(pretrained_model_name_or_path, **kwargs)
1288
+ def _load_adapters_from_state_dict(self, adapters_state_dict: Dict, peft_config: LoraConfig, config: CLaRaConfig):
1289
+ """Load adapters from state dict based on training stage."""
1290
+ if not getattr(config, 'pure_inference', False):
1291
+ for key, val in adapters_state_dict.items():
1292
+ # Skip certain adapters based on training stage
1293
+ if config.training_stage == 'stage1' and key == 'query_reasoner_adapter':
1294
+ continue
1295
+ elif config.training_stage == 'stage1_2' and key in ['query_reasoner_adapter', 'decoder_adapter']:
1296
+ continue
1297
+ elif config.training_stage == 'stage2_reasoning' and key == 'decoder_adapter':
1298
+ continue
1299
+
1300
+ self._load_adapter_from_state_dict(
1301
+ peft_config=peft_config,
1302
+ adapter_name=key,
1303
+ adapter_state_dict=val
1304
+ )
1305
+ else:
1306
+ # Load all adapters for pure inference
1307
+ for key, val in adapters_state_dict.items():
1308
+ self._load_adapter_from_state_dict(
1309
+ peft_config=peft_config,
1310
+ adapter_name=key,
1311
+ adapter_state_dict=val
1312
+ )
1313
+
1314
+ # Handle special cases for stage 2 training
1315
+ if config.training_stage == 'stage2' and 'query_reasoner_adapter' not in adapters_state_dict:
1316
+ self._handle_query_reasoner_adapter_loading(adapters_state_dict, peft_config)
1317
+
1318
+ def _load_adapter_from_state_dict(self, peft_config: LoraConfig, adapter_name: str, adapter_state_dict: Dict):
1319
+ """Create adapter from state dict."""
1320
+ print(f'Loading checkpoint adapter: {adapter_name}')
1321
+ self.decoder.load_adapter(
1322
+ peft_config=peft_config,
1323
+ adapter_name=adapter_name,
1324
+ adapter_state_dict=adapter_state_dict
1325
+ )
1326
+ self.adapter_keys.append(adapter_name)
1327
+
1328
+ def _handle_query_reasoner_adapter_loading(self, adapters_state_dict: Dict, peft_config: LoraConfig):
1329
+ """Handle special loading logic for query reasoner adapter."""
1330
+ if 'encoder_adapter' in adapters_state_dict and 'query_reasoner_adapter' not in adapters_state_dict:
1331
+ # Rename encoder adapter to query reasoner adapter
1332
+ renamed = {}
1333
+ for k, v in adapters_state_dict['encoder_adapter'].items():
1334
+ new_k = k.replace('encoder_adapter', 'query_reasoner_adapter')
1335
+ renamed[new_k] = v.detach().clone()
1336
+
1337
+ self._load_adapter_from_state_dict(
1338
+ peft_config=peft_config,
1339
+ adapter_name='query_reasoner_adapter',
1340
+ adapter_state_dict=renamed
1341
+ )
1342
+ print('Loaded query_reasoner_adapter from stage 1 compressor checkpoint')
1343
+ else:
1344
+ # Create new adapter randomly
1345
+ self.decoder.add_adapter(peft_config, 'query_reasoner_adapter')
1346
+ self.adapter_keys.append('query_reasoner_adapter')
1347
+ print('Loaded query_reasoner_adapter randomly for stage 2 training')
1348
+
1349
+ # Forward pass methods
1350
+ def forward(self,
1351
+ batch: Dict = None,
1352
+ questions: List[str] = None,
1353
+ documents: List[List[str]] = None,
1354
+ answers: List[str] = None,
1355
+ original_answer_gen_api: str = None,
1356
+ stage2_mips: bool = False,
1357
+ stage2_retrieval_top_n: int = None) -> Tuple[torch.Tensor, Dict]:
1358
+ """
1359
+ Forward pass with support for both batch and legacy interfaces.
1360
+
1361
+ Args:
1362
+ batch: Preprocessed batch dict (new interface)
1363
+ questions: List of questions (legacy interface)
1364
+ documents: List of document lists (legacy interface)
1365
+ answers: List of answers (legacy interface)
1366
+ original_answer_gen_api: API URL for generation (legacy interface)
1367
+ stage2_mips: Whether to use MIPS for stage2
1368
+ stage2_retrieval_top_n: Top-n for stage2 retrieval
1369
+
1370
+ Returns:
1371
+ Tuple of (loss, additional_outputs_dict)
1372
+ """
1373
+ if batch is not None:
1374
+ return self._forward_batch(batch, stage2_mips, stage2_retrieval_top_n)
1375
+ else:
1376
+ return self._forward_legacy(questions, documents, answers, original_answer_gen_api)
1377
+
1378
+ def _forward_batch(self, batch: Dict, stage2_mips: bool, stage2_retrieval_top_n: int) -> Tuple[torch.Tensor, Dict]:
1379
+ """Handle batch-based forward pass."""
1380
+ stage = batch.get("stage", None)
1381
+
1382
+ if stage in ["stage1", "stage1_2"]:
1383
+ return self._forward_stage1_batch(batch)
1384
+ elif stage == "stage2":
1385
+ return self._forward_stage2_batch(batch, stage2_mips, stage2_retrieval_top_n)
1386
+ elif stage == "stage2_pretrain_retrieval":
1387
+ return self._forward_stage2_pretrain_batch(batch, stage2_mips, stage2_retrieval_top_n)
1388
+ elif stage == "stage2_reasoning":
1389
+ return self._forward_stage2_reasoning_batch(batch)
1390
+ else:
1391
+ raise ValueError(f"Unknown stage: {stage}")
1392
+
1393
+ def _forward_stage1_batch(self, batch: Dict) -> Tuple[torch.Tensor, Dict]:
1394
+ """Forward pass for stage 1 training."""
1395
+ # Move tensors to device
1396
+ enc_input_ids = batch["enc_input_ids"].to(self.decoder.device)
1397
+ enc_attention_mask = batch["enc_attention_mask"].to(self.decoder.device)
1398
+ dec_input_ids = batch["dec_input_ids"].to(self.decoder.device)
1399
+ dec_attention_mask = batch["dec_attention_mask"].to(self.decoder.device)
1400
+ labels = batch["labels"].to(self.decoder.device)
1401
+
1402
+ out = self._forward_stage_1(
1403
+ enc_input_ids=enc_input_ids,
1404
+ enc_attention_mask=enc_attention_mask,
1405
+ dec_input_ids=dec_input_ids,
1406
+ dec_attention_mask=dec_attention_mask,
1407
+ labels=labels,
1408
+ )
1409
+ return out["loss"], {"logits": out["logits"], "mse_loss": out["mse_loss"]}
1410
+
1411
+ def _forward_stage2_batch(self, batch: Dict, stage2_mips: bool, stage2_retrieval_top_n: int) -> Tuple[torch.Tensor, Dict]:
1412
+ """Forward pass for stage 2 training."""
1413
+ self.decoder.set_adapter('query_reasoner_adapter')
1414
+
1415
+ B = batch["labels"].shape[0]
1416
+ query_reps = self._compr_query_reasoner_stage2(
1417
+ batch["query_input_ids"].to(self.decoder.device),
1418
+ batch["query_attention_mask"].to(self.decoder.device)
1419
+ )
1420
+
1421
+ enc_input_ids = batch["enc_input_ids"].to(self.decoder.device)
1422
+ enc_attention_mask = batch["enc_attention_mask"].to(self.decoder.device)
1423
+ dec_input_ids = batch["dec_input_ids"].to(self.decoder.device)
1424
+ dec_attention_mask = batch["dec_attention_mask"].to(self.decoder.device)
1425
+ labels = batch["labels"].to(self.decoder.device)
1426
+
1427
+ # Document retrieval and selection
1428
+ if stage2_mips:
1429
+ retrieved_doc_embeddings = self._retrieve_embeddings(
1430
+ query_reps, stage2_retrieval_top_n=stage2_retrieval_top_n
1431
+ )
1432
+ scores = torch.bmm(
1433
+ query_reps.unsqueeze(1),
1434
+ retrieved_doc_embeddings.transpose(1, 2)
1435
+ ).squeeze(1)
1436
+ z, topk_idx = differentiable_topk(scores, self.generation_top_k, temperature=1)
1437
+ selected = torch.einsum('bkn,bnd->bkd', z, retrieved_doc_embeddings)
1438
+ selected = selected.view(selected.size(0) * selected.size(1), -1, self.hidden_size)
+ mse_loss = torch.tensor(0.0, device=selected.device)  # no compression MSE is computed on the MIPS path
1439
+ else:
1440
+ with torch.no_grad():
1441
+ retrieved_doc_embeddings, mse_loss = self.compress(enc_input_ids, enc_attention_mask)
1442
+
1443
+ stage2_retrieval_top_n = retrieved_doc_embeddings.shape[0] // B
1444
+ retrieved_doc_embeddings = retrieved_doc_embeddings.reshape(B, stage2_retrieval_top_n, -1)
1445
+ query_reps = query_reps.to(retrieved_doc_embeddings.dtype)
1446
+
1447
+ scores = torch.bmm(
1448
+ F.normalize(query_reps, dim=-1, p=2).unsqueeze(1).float(),
1449
+ F.normalize(retrieved_doc_embeddings, dim=-1, p=2).float().transpose(1, 2)
1450
+ ).squeeze(1)
1451
+
1452
+ z, topk_idx = differentiable_topk(scores, self.generation_top_k, temperature=0.02)
1453
+ selected = torch.einsum('bkn,bnd->bkd', z.to(retrieved_doc_embeddings.dtype), retrieved_doc_embeddings)
1454
+ selected = selected.view(selected.size(0) * selected.size(1), -1, self.hidden_size)
1455
+
1456
+ inputs_embeds = self._replace_emb_stage2(selected, dec_input_ids)
1457
+
1458
+ if 'decoder_adapter' in self.adapter_keys:
1459
+ self.decoder.set_adapter('decoder_adapter')
1460
+
1461
+ dec_out = self.decoder(
1462
+ inputs_embeds=inputs_embeds,
1463
+ attention_mask=dec_attention_mask,
1464
+ labels=labels,
1465
+ )
1466
+
1467
+ self.decoder.set_adapter(['decoder_adapter', 'query_reasoner_adapter'])
1468
+ return dec_out.loss, {"logits": dec_out.logits, "topk_idx": topk_idx, "mse_loss": mse_loss}
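# A toy illustration of the scoring used above, with random embeddings: queries and
# compressed documents are L2-normalised, so the batched matmul yields cosine
# similarities; dividing by the temperature (0.02) sharpens them before top-k selection.
import torch
import torch.nn.functional as F
B, N, D = 2, 6, 8                                  # batch, candidate docs per query, dim
query_reps = torch.randn(B, D)
doc_embs = torch.randn(B, N, D)
scores = torch.bmm(
    F.normalize(query_reps, dim=-1, p=2).unsqueeze(1),
    F.normalize(doc_embs, dim=-1, p=2).transpose(1, 2),
).squeeze(1)                                       # [B, N], values in [-1, 1]
topk_idx = (scores / 0.02).topk(k=3, dim=-1).indices
print(scores.shape, topk_idx.shape)                # torch.Size([2, 6]) torch.Size([2, 3])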
1469
+
1470
+ def _forward_stage2_pretrain_batch(self, batch: Dict, stage2_mips: bool, stage2_retrieval_top_n: int) -> Tuple[torch.Tensor, Dict]:
1471
+ """Forward pass for stage 2 pretraining with retrieval."""
1472
+ self.decoder.set_adapter('query_reasoner_adapter')
1473
+
1474
+ B = batch["labels"].shape[0]
1475
+ N = batch["enc_input_ids"].shape[0] // B
1476
+ device = self.decoder.device
1477
+
1478
+ query_reps = self._compr_query_reasoner_stage2(
1479
+ batch["query_input_ids"].to(device),
1480
+ batch["query_attention_mask"].to(device)
1481
+ )
1482
+
1483
+ enc_input_ids = batch["enc_input_ids"].to(device)
1484
+ enc_attention_mask = batch["enc_attention_mask"].to(device)
1485
+
1486
+ with torch.no_grad():
1487
+ retrieved_doc_embeddings, mse_loss = self.compress(enc_input_ids, enc_attention_mask)
1488
+
1489
+ stage2_retrieval_top_n = retrieved_doc_embeddings.shape[0] // B
1490
+ retrieved_doc_embeddings = retrieved_doc_embeddings.reshape(B, stage2_retrieval_top_n, -1)
1491
+ query_reps = query_reps.to(retrieved_doc_embeddings.dtype)
1492
+
1493
+ scores = torch.bmm(
1494
+ F.normalize(query_reps, dim=-1, p=2).unsqueeze(1).float(),
1495
+ F.normalize(retrieved_doc_embeddings, dim=-1, p=2).float().transpose(1, 2)
1496
+ ).squeeze(1)
1497
+
1498
+ pos_index = batch["pos_index"]
1499
+ pos_mask = build_pos_mask(pos_index, N, device)
1500
+ tau = 0.02
1501
+ logits = scores / tau
1502
+
1503
+ pos_logits = logits.masked_fill(~pos_mask, float('-inf'))
1504
+ num = torch.logsumexp(pos_logits, dim=-1)
1505
+ den = torch.logsumexp(logits, dim=-1)
1506
+ loss_vec = -(num - den)
1507
+ valid = pos_mask.any(dim=-1)
1508
+ loss = loss_vec[valid].mean()
1509
+
1510
+ topk = self.generation_top_k
1511
+ topk_idx = logits.topk(k=min(topk, N), dim=-1).indices
1512
+
1513
+ return loss, {"logits": [[]], "topk_idx": topk_idx, "mse_loss": mse_loss}
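# A toy version of the retrieval-pretraining loss above: log-sum-exp over the positive
# candidates minus log-sum-exp over all candidates (a multi-positive InfoNCE).
# Scores and the positive mask are made up for illustration.
import torch
tau = 0.02
logits = torch.tensor([[0.9, 0.1, -0.4, 0.8]]) / tau
pos_mask = torch.tensor([[True, False, False, True]])
num = torch.logsumexp(logits.masked_fill(~pos_mask, float('-inf')), dim=-1)
den = torch.logsumexp(logits, dim=-1)
loss = -(num - den)
print(loss)   # close to zero when the positives carry almost all of the probability mass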
1514
+
1515
+ def _forward_stage2_reasoning_batch(self, batch: Dict) -> Tuple[torch.Tensor, Dict]:
1516
+ """Forward pass for stage 2 reasoning training."""
1517
+ B = batch["labels"].shape[0]
1518
+ enc_input_ids = batch["enc_input_ids"].to(self.decoder.device)
1519
+ enc_attention_mask = batch["enc_attention_mask"].to(self.decoder.device)
1520
+ dec_input_ids = batch["dec_input_ids"].to(self.decoder.device)
1521
+ dec_attention_mask = batch["dec_attention_mask"].to(self.decoder.device)
1522
+ labels = batch["labels"].to(self.decoder.device)
1523
+
1524
+ if sum(batch["docs_num"]) != 0:
1525
+ with torch.no_grad():
1526
+ selected, mse_loss = self.compress(enc_input_ids, enc_attention_mask)
1527
+ indices = batch["docs_num"]
1528
+ inputs_embeds = self._replace_reasoning_embeddings(selected, dec_input_ids, indices)
1529
+ else:
1530
+ inputs_embeds = self.decoder.get_input_embeddings()(dec_input_ids)
1531
+ mse_loss = 0
1532
+
1533
+ if 'decoder_adapter' in self.adapter_keys:
1534
+ self.decoder.set_adapter('decoder_adapter')
1535
+
1536
+ dec_out = self.decoder(
1537
+ inputs_embeds=inputs_embeds,
1538
+ attention_mask=dec_attention_mask,
1539
+ labels=labels,
1540
+ )
1541
+
1542
+ self.decoder.set_adapter(['decoder_adapter'])
1543
+ return dec_out.loss, {"logits": dec_out.logits, "mse_loss": mse_loss}
1544
+
1545
+ def _forward_stage_1(self,
1546
+ enc_input_ids: torch.LongTensor = None,
1547
+ enc_attention_mask: torch.LongTensor = None,
1548
+ dec_input_ids: torch.LongTensor = None,
1549
+ dec_attention_mask: torch.LongTensor = None,
1550
+ labels: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
1551
+ """Stage 1 forward pass for document compression and QA."""
1552
+ assert enc_input_ids.size() == enc_attention_mask.size()
1553
+
1554
+ # Flatten 3D inputs to 2D if needed
1555
+ if len(enc_input_ids.size()) == 3:
1556
+ batch_size, top_k, seq_length = enc_input_ids.size()
1557
+ enc_input_ids = enc_input_ids.view(batch_size * top_k, seq_length)
1558
+ enc_attention_mask = enc_attention_mask.view(batch_size * top_k, seq_length)
1559
+
1560
+ assert enc_input_ids.size(0) == dec_input_ids.size(0) * self.generation_top_k
1561
+
1562
+ # Compress documents
1563
+ compressed_embs, mse_loss = self.compress(enc_input_ids, enc_attention_mask)
1564
+
1565
+ # Replace memory tokens with compressed embeddings
1566
+ inputs_embeds = self._replace_emb(compressed_embs, dec_input_ids)
1567
+
1568
+ # Detach if compressor-only training
1569
+ if (self.training_form == "compressor") and (self.compr is None):
1570
+ inputs_embeds = inputs_embeds.detach()
1571
+
1572
+ # Set decoder adapter
1573
+ if 'decoder_adapter' in self.adapter_keys:
1574
+ self.decoder.set_adapter('decoder_adapter')
1575
+
1576
+ # Forward through decoder
1577
+ decoder_outputs = self.decoder(
1578
+ inputs_embeds=inputs_embeds,
1579
+ attention_mask=dec_attention_mask,
1580
+ labels=labels
1581
+ )
1582
+
1583
+ # Reactivate all adapters
1584
+ self.decoder.set_adapter(['decoder_adapter', 'encoder_adapter'])
1585
+
1586
+ return {
1587
+ "loss": decoder_outputs.loss,
1588
+ "logits": decoder_outputs.logits,
1589
+ "mse_loss": mse_loss
1590
+ }
1591
+
1592
+ def _replace_reasoning_embeddings(self,
1593
+ compressed_embs: torch.Tensor,
1594
+ dec_input_ids: torch.LongTensor,
1595
+ docs_per_example: List[int]) -> torch.Tensor:
1596
+ """Replace memory slots with compressed embeddings for reasoning."""
1597
+ device = dec_input_ids.device
1598
+ inputs_embeds = self.decoder.get_input_embeddings()(dec_input_ids)
1599
+
1600
+ num_embs = compressed_embs.size(1)
1601
+ slot_len = num_embs + (1 if getattr(self, "sep", False) else 0)
1602
+
1603
+ if not isinstance(docs_per_example, torch.Tensor):
1604
+ docs_per_example = torch.tensor(docs_per_example, device=device, dtype=torch.long)
1605
+ else:
1606
+ docs_per_example = docs_per_example.to(device=device, dtype=torch.long)
1607
+
1608
+ offsets = torch.zeros(docs_per_example.size(0) + 1, device=device, dtype=torch.long)
1609
+ offsets[1:] = torch.cumsum(docs_per_example, dim=0)
1610
+ total_docs = int(offsets[-1].item())
1611
+ assert total_docs == compressed_embs.size(0)
1612
+
1613
+ mem_id = self.decoder_tokenizer.mem_token_ids[0]
1614
+ B, L, H = inputs_embeds.size()
1615
+
1616
+ for i in range(B):
1617
+ # Find first memory token position
1618
+ mem_pos = (dec_input_ids[i] == mem_id).nonzero(as_tuple=True)[0]
1619
+ if mem_pos.numel() == 0:
1620
+ continue
1621
+ first_mem_idx = int(mem_pos[0].item())
1622
+
1623
+ n_docs_i = int(docs_per_example[i].item())
1624
+ base = int(offsets[i].item())
1625
+
1626
+ needed_len = first_mem_idx + n_docs_i * slot_len
1627
+ assert needed_len <= L
1628
+
1629
+ for local_j in range(n_docs_i):
1630
+ global_j = base + local_j
1631
+ start_idx = first_mem_idx + local_j * slot_len
1632
+ target_slice = inputs_embeds[i, start_idx:start_idx + num_embs, :]
1633
+ src = compressed_embs[global_j]
1634
+ assert target_slice.size() == src.size()
1635
+ inputs_embeds[i, start_idx:start_idx + num_embs, :] = src
1636
+
1637
+ return inputs_embeds
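# Slot-layout note for the replacement above, with illustrative numbers: with 16
# compressed embeddings per document and sep=True, each document occupies a slot of
# 17 consecutive positions (16 memory tokens + 1 separator), so a prompt carrying
# 3 documents must reserve first_mem_idx + 3 * 17 positions for memory slots.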
1638
+
1639
+ def _generate(self, model_input: Dict[str, torch.Tensor], max_new_tokens: int = 128,
1640
+ return_doc_embeddings: bool = False) -> List[str]:
1641
+ """Generate text from model inputs."""
1642
+ enc_input_ids = model_input['enc_input_ids']
1643
+ enc_attention_mask = model_input['enc_attention_mask']
1644
+ dec_input_ids = model_input['dec_input_ids']
1645
+ dec_attention_mask = model_input['dec_attention_mask']
1646
+
1647
+ assert enc_input_ids.size() == enc_attention_mask.size()
1648
+
1649
+ if len(enc_input_ids.size()) == 3:
1650
+ batch_size, top_k, seq_length = enc_input_ids.size()
1651
+ enc_input_ids = enc_input_ids.view(batch_size * top_k, seq_length)
1652
+ enc_attention_mask = enc_attention_mask.view(batch_size * top_k, seq_length)
1653
+
1654
+ assert enc_input_ids.size(0) == dec_input_ids.size(0) * self.generation_top_k
1655
+
1656
+ compressed_embs, _ = self.compress(enc_input_ids.to('cuda'), enc_attention_mask.to('cuda'))
1657
+ inputs_embeds = self._replace_emb(compressed_embs, dec_input_ids.to('cuda'))
1658
+
1659
+ if 'decoder_adapter' in self.adapter_keys:
1660
+ self.decoder.set_adapter('decoder_adapter')
1661
+
1662
+ output_ids = self.decoder.generate(
1663
+ inputs_embeds=inputs_embeds.to("cuda"),
1664
+ attention_mask=dec_attention_mask.to("cuda"),
1665
+ do_sample=False,
1666
+ top_p=None,
1667
+ max_new_tokens=max_new_tokens
1668
+ )
1669
+
1670
+ decoded = self.decoder_tokenizer.batch_decode(output_ids, skip_special_tokens=True)
1671
+
1672
+ if return_doc_embeddings:
1673
+ assert 'batch_size' in locals() and 'top_k' in locals()
1674
+ compressed_embs = compressed_embs.view(batch_size, top_k, compressed_embs.size(1), compressed_embs.size(2))
1675
+ return decoded, compressed_embs
1676
+ else:
1677
+ return decoded
1678
+
1679
+
1680
+ # Example usage and testing
1681
+ if __name__ == '__main__':
1682
+ # Example configuration
1683
+ cfg = CLaRaConfig(
1684
+ decoder_model_name='/mnt/ceph_rbd/model/Mistral-7B-Instruct-v0.2',
1685
+ compr_model_name="mistral_trimmed",
1686
+ compr_rate=64,
1687
+ compr_n_layers=5,
1688
+ compr_mlp_hidden_dim=8096,
1689
+ compr_use_mlp=False,
1690
+ lora=True,
1691
+ lora_compressor=True,
1692
+ training_form="both",
1693
+ load_adapters=True,
1694
+ kbtc_training=False,
1695
+ optimize_mem_tokens=True,
1696
+ different_mem_tokens=True,
1697
+ attn_implementation='flash_attention_2'
1698
+ )
1699
+
1700
+ # Initialize model
1701
+ clara = CLaRa(cfg)
1702
+
1703
+ # Save and reload test
1704
+ clara.save_pretrained('test_ckpt')
1705
+
1706
+ del clara
1707
+ torch.cuda.empty_cache()
1708
+ gc.collect()
1709
+
1710
+ # Reload model
1711
+ clara = CLaRa.from_pretrained('test_ckpt')
1712
+ print("Model successfully loaded!")
compression-128/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
compression-128/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
compression-128/tokenizer.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
+ size 493443
compression-128/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "additional_special_tokens": [],
32
+ "bos_token": "<s>",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "extra_special_tokens": {},
36
+ "legacy": false,
37
+ "model_max_length": 1000000000000000019884624838656,
38
+ "pad_token": "</s>",
39
+ "sp_model_kwargs": {},
40
+ "spaces_between_special_tokens": false,
41
+ "tokenizer_class": "LlamaTokenizer",
42
+ "unk_token": "<unk>",
43
+ "use_default_system_prompt": false
44
+ }
compression-16/adapters.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f328d35c099fe367727256b3c66ee18c5c1e68b5eb9b8b515e1cfa8ee8023d48
3
+ size 252096669
compression-16/chat_template.jinja ADDED
@@ -0,0 +1,24 @@
1
+ {%- if messages[0]['role'] == 'system' %}
2
+ {%- set system_message = messages[0]['content'] %}
3
+ {%- set loop_messages = messages[1:] %}
4
+ {%- else %}
5
+ {%- set loop_messages = messages %}
6
+ {%- endif %}
7
+
8
+ {{- bos_token }}
9
+ {%- for message in loop_messages %}
10
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
11
+ {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}
12
+ {%- endif %}
13
+ {%- if message['role'] == 'user' %}
14
+ {%- if loop.first and system_message is defined %}
15
+ {{- ' [INST] ' + system_message + '\n\n' + message['content'] + ' [/INST]' }}
16
+ {%- else %}
17
+ {{- ' [INST] ' + message['content'] + ' [/INST]' }}
18
+ {%- endif %}
19
+ {%- elif message['role'] == 'assistant' %}
20
+ {{- ' ' + message['content'] + eos_token}}
21
+ {%- else %}
22
+ {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}
23
+ {%- endif %}
24
+ {%- endfor %}
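The Jinja template above is the standard Mistral-instruct layout: an optional system message is folded into the first user turn, and every user/assistant exchange is wrapped in `[INST] ... [/INST]` markers. A minimal sketch of rendering it with `transformers` (the base-model tokenizer is an assumption; any compatible tokenizer works):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
template = open("compression-16/chat_template.jinja").read()

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Summarise the background documents in one sentence."},
]
text = tok.apply_chat_template(
    messages, chat_template=template, tokenize=False, add_generation_prompt=True
)
print(text)  # "<s> [INST] You are a helpful assistant.\n\nSummarise ... [/INST]"
```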
compression-16/config.json ADDED
@@ -0,0 +1,40 @@
1
+ {
2
+ "ae_mode": "token",
3
+ "attn_implementation": null,
4
+ "auto_map": {
5
+ "AutoConfig": "modeling_clara.CLaRaConfig",
6
+ "AutoModel": "modeling_clara.CLaRa"
7
+ },
8
+ "compr_base_model_name": "/mnt/ceph_rbd/model/Mistral-7B-Instruct-v0.2",
9
+ "compr_every_n_layer": null,
10
+ "compr_linear_type": "concat",
11
+ "compr_mlp_hidden_dim": 8096,
12
+ "compr_model_name": null,
13
+ "compr_n_layers": 5,
14
+ "compr_rate": 16,
15
+ "compr_rms_norm": false,
16
+ "compr_use_mlp": false,
17
+ "decoder_model_name": "/mnt/ceph_rbd/model/Mistral-7B-Instruct-v0.2",
18
+ "device_map": null,
19
+ "different_mem_tokens": true,
20
+ "doc_max_length": 256,
21
+ "generation_top_k": 5,
22
+ "kbtc_training": false,
23
+ "load_adapters": true,
24
+ "load_pretrained_checkpoint": false,
25
+ "lora": true,
26
+ "lora_compressor": false,
27
+ "lora_r": 16,
28
+ "lora_r_compressor": 16,
29
+ "max_new_tokens": 128,
30
+ "model_type": "CLaRa",
31
+ "optimize_mem_tokens": true,
32
+ "pad_token_id": 2,
33
+ "pure_inference": false,
34
+ "quantization": "no",
35
+ "sep": true,
36
+ "stage2_retrieval_top_n": 1,
37
+ "training_form": "both_separately",
38
+ "training_stage": "stage2",
39
+ "transformers_version": "4.53.3"
40
+ }
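A few of the fields above interact: `doc_max_length` and `compr_rate` fix how many memory-token embeddings represent each document, and `generation_top_k` sets how many compressed documents reach the decoder. A quick sanity check of that arithmetic, with values copied from this config:

```python
doc_max_length = 256
compr_rate = 16
generation_top_k = 5

n_mem_tokens = doc_max_length // compr_rate            # 16 embeddings per compressed document
prompt_memory_slots = n_mem_tokens * generation_top_k  # 80 compressed embeddings in the prompt
print(n_mem_tokens, prompt_memory_slots)                # 16 80
```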
compression-16/decoder_first_last_layers.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5618c932bf93fb9e092d1a6eac240497d129c7aa7d4b87fb0b25065bdfa6c62f
3
+ size 524601397
compression-16/modeling_clara.py ADDED
@@ -0,0 +1,1712 @@
1
+ #
2
+ # For licensing see accompanying LICENSE file.
3
+ # Copyright (C) 2025 Apple Inc. All Rights Reserved.
4
+ #
5
+
6
+ import warnings
7
+ import os
8
+ import torch
9
+ import gc
10
+ import time
11
+ import json
12
+ import copy
13
+ import random
14
+ import requests
15
+ import re
16
+
17
+ from torch import nn
18
+ from torch.nn import functional as F
19
+ from torch.nn.functional import gelu
20
+ from jinja2.exceptions import TemplateError
21
+ from peft import LoraConfig
22
+ from transformers import (
23
+ AutoModelForCausalLM,
24
+ AutoTokenizer,
25
+ BitsAndBytesConfig,
26
+ PreTrainedModel,
27
+ PretrainedConfig,
28
+ StoppingCriteria,
29
+ StoppingCriteriaList
30
+ )
31
+ from huggingface_hub import hf_hub_download
32
+ from typing import List, Dict, Any, Optional, Tuple
33
+
34
+ # Environment setup
35
+ torch.set_printoptions(threshold=float("inf"))
36
+ os.environ["NCCL_TIMEOUT"] = "5400"
37
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
38
+
39
+ # Constants
40
+ IGNORE_INDEX = -100
41
+ PARAPHRASE_INSTRUCTIONS = [
42
+ 'Background: {docs} means the same as',
43
+ "Background: {docs} Can you put the above sentences in your own terms?",
44
+ "Background: {docs} Please provide a reinterpretation of the preceding background text.",
45
+ "These two expressions are equivalent in essence:\n(1) {docs}\n(2)",
46
+ "Background: {docs} is a paraphrase of what?",
47
+ "Background: {docs} Could you give me a different version of the background sentences above?",
48
+ "In other words, background: {docs} is just another way of saying:",
49
+ "You're getting across the same point whether you say background: {docs} or",
50
+ "Background: {docs} After unpacking the ideas in the background information above, we got:",
51
+ "Background: {docs} Please offer a restatement of the background sentences I've just read.",
52
+ "Background: {docs}, which also means:",
53
+ "Strip away the mystery, and you'll find background: {docs} is simply another rendition of:",
54
+ "The essence of background: {docs} is captured again in the following statement:",
55
+ ]
56
+
57
+
58
+ class StopOnCriteria(StoppingCriteria):
59
+ """Custom stopping criteria for generation."""
60
+
61
+ def __init__(self, tokenizer, stop_strings: List[str] = None, stop_token_ids: List[int] = None):
62
+ self.tokenizer = tokenizer
63
+ self.stop_strings = stop_strings or []
64
+ self.stop_token_ids = stop_token_ids or []
65
+ self.reason = None
66
+
67
+ def __call__(self, input_ids, scores, **kwargs):
68
+ # Check if last token is in stop_token_ids
69
+ last_token = input_ids[0, -1].item()
70
+ if last_token in self.stop_token_ids:
71
+ self.reason = f"stop_token_{last_token}"
72
+ return True
73
+
74
+ # Check if any stop_strings appear in generated text
75
+ text = self.tokenizer.decode(input_ids[0], skip_special_tokens=False)
76
+ for stop_str in self.stop_strings:
77
+ if stop_str in text:
78
+ self.reason = f"stop_string_{stop_str}"
79
+ return True
80
+
81
+ return False
82
+
83
+
84
+ class LlamaRMSNorm(nn.Module):
85
+ """Llama-style RMS normalization layer."""
86
+
87
+ def __init__(self, hidden_size: int, eps: float = 1e-6):
88
+ super().__init__()
89
+ self.weight = nn.Parameter(torch.ones(hidden_size))
90
+ self.variance_epsilon = eps
91
+
92
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
93
+ input_dtype = hidden_states.dtype
94
+ hidden_states = hidden_states.to(torch.float32)
95
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
96
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
97
+ return self.weight * hidden_states.to(input_dtype)
98
+
99
+
100
+ class Converter(nn.Module):
101
+ """Converter module for dimension transformation."""
102
+
103
+ def __init__(self, input_dim: int, output_dim: int):
104
+ super().__init__()
105
+ self.input_dim = input_dim
106
+ self.output_dim = output_dim
107
+
108
+ self.rms_norm = LlamaRMSNorm(input_dim)
109
+ self.dense_in = nn.Linear(input_dim, output_dim)
110
+ self.dense_out = nn.Linear(output_dim, output_dim)
111
+
112
+ self._print_trainable_parameters()
113
+
114
+ def _print_trainable_parameters(self):
115
+ """Print parameter statistics."""
116
+ trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
117
+ total_params = sum(p.numel() for p in self.parameters())
118
+ print(f"Converter trainable parameters: {trainable_params}, Total parameters: {total_params}")
119
+
120
+ def forward(self, embeddings: torch.Tensor) -> torch.Tensor:
121
+ embeddings = self.rms_norm(embeddings)
122
+ x = self.dense_in(embeddings)
123
+ x = self.dense_out(gelu(x))
124
+ return x.to(torch.float32)
125
+
126
+
127
+ class CLaRaConfig(PretrainedConfig):
128
+ """Configuration class for CLaRa model."""
129
+
130
+ model_type = "CLaRa"
131
+
132
+ def __init__(self,
133
+ decoder_model_name: str = "meta-llama/Llama-2-7b-chat-hf",
134
+ doc_max_length: int = 128,
135
+ quantization: str = 'no',
136
+ sep: bool = False,
137
+ compr_model_name: str = "google-bert/bert-base-uncased",
138
+ compr_rate: int = 64,
139
+ compr_n_layers: int = None,
140
+ compr_every_n_layer: int = None,
141
+ compr_base_model_name: str = '/mnt/ceph_rbd/model/Mistral-7B-Instruct-v0.2',
142
+ compr_rms_norm: bool = False,
143
+ compr_mlp_hidden_dim: int = 8096,
144
+ compr_use_mlp: bool = True,
145
+ compr_linear_type: str = "concat",
146
+ lora: bool = False,
147
+ lora_compressor: bool = False,
148
+ training_form: str = "both",
149
+ training_stage: str = "stage1",
150
+ generation_top_k: int = 1,
151
+ lora_r: int = 16,
152
+ lora_r_compressor: int = None,
153
+ load_adapters: bool = True,
154
+ kbtc_training: bool = False,
155
+ optimize_mem_tokens: bool = False,
156
+ different_mem_tokens: bool = False,
157
+ attn_implementation: str = None,
158
+ _attn_implementation_autoset: bool = True,
159
+ ae_mode: str = "token",
160
+ max_new_tokens: int = 128,
161
+ stage2_retrieval_top_n: int = 1,
162
+ load_pretrained_checkpoint: bool = False,
163
+ device_map=None,
164
+ auto_map: dict = {
165
+ "AutoConfig": "modeling_clara.CLaRaConfig",
166
+ "AutoModel": "modeling_clara.CLaRa"
167
+ },
168
+ **kwargs):
169
+ super().__init__(**kwargs)
170
+
171
+ self.decoder_model_name = decoder_model_name
172
+ self.doc_max_length = doc_max_length
173
+ self.quantization = quantization
174
+ self.sep = sep
175
+
176
+ self.compr_model_name = compr_model_name
177
+ self.compr_rate = compr_rate
178
+ self.compr_use_mlp = compr_use_mlp
179
+ self.compr_mlp_hidden_dim = compr_mlp_hidden_dim
180
+ self.compr_n_layers = compr_n_layers
181
+ self.compr_every_n_layer = compr_every_n_layer
182
+ self.compr_base_model_name = compr_base_model_name
183
+ self.compr_rms_norm = compr_rms_norm
184
+ self.compr_linear_type = compr_linear_type
185
+
186
+ self.lora = lora
187
+ self.lora_compressor = lora_compressor
188
+ self.training_form = training_form
189
+ self.lora_r = lora_r
190
+ self.lora_r_compressor = lora_r_compressor or lora_r
191
+ self.load_adapters = load_adapters
192
+ self.optimize_mem_tokens = optimize_mem_tokens
193
+ self.different_mem_tokens = different_mem_tokens
194
+ self.kbtc_training = kbtc_training
195
+ self.training_stage = training_stage
196
+ self.device_map = device_map
197
+ self.attn_implementation = attn_implementation
198
+ self._attn_implementation_autoset = _attn_implementation_autoset
199
+ self.ae_mode = ae_mode
200
+ self.max_new_tokens = max_new_tokens
201
+ self.auto_map = auto_map
202
+ self.load_pretrained_checkpoint = load_pretrained_checkpoint
203
+
204
+ self.generation_top_k = generation_top_k
205
+ self.stage2_retrieval_top_n = stage2_retrieval_top_n
206
+
207
+ if training_form == 'compressor':
208
+ assert compr_model_name is not None and not self.lora
209
+
210
+
211
+ # Utility functions
212
+ def remote_generate(docs: List[str], questions: List[str], api_url: str) -> List[str]:
213
+ """Generate responses using remote API."""
214
+ response = requests.post(
215
+ f"{api_url}/generate",
216
+ json={"docs": docs, "questions": questions}
217
+ )
218
+ return response.json()["texts"]
219
+
220
+
221
+ def add_memory_tokens_to_inputs(input_ids: torch.Tensor,
222
+ attention_mask: torch.Tensor,
223
+ n_mem_tokens: int,
224
+ tokenizer) -> Tuple[torch.Tensor, torch.Tensor]:
225
+ """Add memory tokens to input sequences."""
226
+ assert len(tokenizer.mem_tokens) == n_mem_tokens
227
+
228
+ mem_tokens = torch.stack([tokenizer.mem_token_ids_pt] * input_ids.size(0), 0)
229
+ assert len(mem_tokens) == input_ids.size(0)
230
+ assert len(mem_tokens[0]) == n_mem_tokens
231
+
232
+ input_ids = torch.cat([input_ids, mem_tokens], dim=1)
233
+ attention_mask = torch.cat([attention_mask, torch.ones(input_ids.size(0), n_mem_tokens)], dim=1)
234
+
235
+ return input_ids, attention_mask
236
+
237
+
238
+ def build_pos_mask(pos_index: List[List[int]], N: int, device: torch.device) -> torch.Tensor:
239
+ """Build positive mask for retrieval training."""
240
+ if isinstance(pos_index, (list, tuple)):
241
+ B = len(pos_index)
242
+ mask = torch.zeros(B, N, dtype=torch.bool, device=device)
243
+ for b, idxs in enumerate(pos_index):
244
+ if len(idxs) > 0:
245
+ mask[b, torch.as_tensor(idxs, device=device, dtype=torch.long)] = True
246
+ return mask
247
+ else: # tensor [B, M]
248
+ B, M = pos_index.shape
249
+ mask = torch.zeros(B, N, dtype=torch.bool, device=device)
250
+ for m in range(M):
251
+ col = pos_index[:, m]
252
+ v = col >= 0
253
+ if v.any():
254
+ mask[v, col[v]] = True
255
+ return mask
256
+
257
+
258
+ def differentiable_topk_top_1(logits: torch.Tensor, k: int, temperature: float = 1.0) -> Tuple[torch.Tensor, torch.Tensor]:
259
+ """Implements differentiable top-1 selection using Gumbel-Softmax."""
260
+ y = logits / temperature
261
+ y_soft = F.softmax(y, dim=-1).float()
262
+
263
+ # Hard one-hot version
264
+ index = y_soft.argmax(dim=-1, keepdim=True)
265
+ y_hard = torch.zeros_like(y_soft).scatter_(-1, index, 1.0)
266
+
267
+ # Straight-through estimator
268
+ z = y_hard + y_soft - y_soft.detach()
269
+ z = z.unsqueeze(1).to(logits.dtype)
270
+
271
+ return z, index
272
+
273
+
274
+ def differentiable_topk(logits: torch.Tensor, k: int, temperature: float = 1.0) -> Tuple[torch.Tensor, torch.Tensor]:
275
+ """Differentiable top-k selection."""
276
+ B, N = logits.shape
277
+ perturbed = logits / max(temperature, 1e-6)
278
+
279
+ # Hard top-k indices
280
+ topk_vals, topk_idx = perturbed.topk(k, dim=-1)
281
+ K_hard = torch.zeros(B, k, N, device=logits.device, dtype=logits.dtype)
282
+ K_hard.scatter_(2, topk_idx.unsqueeze(-1), 1.0)
283
+
284
+ # Soft distributions for each slot
285
+ K_soft = torch.zeros_like(K_hard)
286
+ taken = torch.zeros(B, N, device=logits.device, dtype=logits.dtype)
287
+
288
+ for j in range(k):
289
+ mask = (1.0 - taken.detach())
290
+ masked = perturbed + (mask + 1e-8).log()
291
+ pj = F.softmax(masked, dim=-1).float()
292
+ K_soft[:, j, :] = pj
293
+ taken = torch.clamp(taken + K_hard[:, j, :], max=1.0)
294
+
295
+ # Straight-through estimator
296
+ W = K_hard + (K_soft - K_soft.detach())
297
+ return W, topk_idx
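# A quick check of the straight-through behaviour above, assuming this module is imported:
# in the forward pass W equals the hard one-hot selection, while gradients flow back to
# the logits through the per-slot softmax distributions.
import torch
logits = torch.tensor([[0.1, 2.0, -0.5, 1.2]], requires_grad=True)
W, idx = differentiable_topk(logits, k=2, temperature=1.0)
print(W.shape, idx)                                    # torch.Size([1, 2, 4]); indices of the two largest logits
loss = (W * torch.arange(4.0).view(1, 1, 4)).sum()     # any downstream use of W
loss.backward()                                        # gradients reach `logits` via the soft terms
print(logits.grad)                                     # non-zero: the hard selection stays differentiable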
298
+
299
+
300
+ class CLaRa(PreTrainedModel):
301
+ """CLaRa: Unified Retrieval-Augmented Generation Model."""
302
+
303
+ config_class = CLaRaConfig
304
+
305
+ def __init__(self, cfg: CLaRaConfig):
306
+ super().__init__(cfg)
307
+ self.decoder_model_name = cfg.decoder_model_name
308
+ self.decoder = self._create_decoder(cfg)
309
+ self.doc_max_length = cfg.doc_max_length
310
+
311
+ print(f'Base decoder parameters: {self.decoder.num_parameters()}')
312
+
313
+ # Model configuration
314
+ self.compr_model_name = cfg.compr_model_name
315
+ self.training_form = cfg.training_form
316
+ self.lora = cfg.lora
317
+ self.adapter_keys = []
318
+ self.compr = None
319
+
320
+ # Initialize LoRA adapters if needed
321
+ if cfg.lora and not getattr(cfg, 'pure_inference', False):
322
+ self._setup_lora_adapters(cfg)
323
+
324
+ print(f'Model adapter keys: {self.adapter_keys}')
325
+
326
+ # Initialize tokenizer and resize embeddings
327
+ self.decoder_tokenizer = self._create_decoder_tokenizer(cfg)
328
+ self.decoder.resize_token_embeddings(len(self.decoder_tokenizer))
329
+ self._configure_generation_config()
330
+
331
+ # Model parameters
332
+ self.generation_top_k = cfg.generation_top_k
333
+ self.training_stage = cfg.training_stage
334
+ self.stage2_retrieval_top_n = cfg.stage2_retrieval_top_n
335
+ self.sep = cfg.sep
336
+ self.compr_rate = cfg.compr_rate
337
+ self.local_rank = os.getenv('LOCAL_RANK', '0')
338
+
339
+ self.n_mem_tokens = self.doc_max_length // self.compr_rate
340
+ self.hidden_size = self.decoder.config.hidden_size
341
+
342
+ # Setup adapters and memory token optimization
343
+ if self.lora:
344
+ self._setup_adapter_training()
345
+ else:
346
+ print(f'Total trainable parameters: {self.num_parameters(only_trainable=True)}')
347
+
348
+ self._prepare_mem_tokens_optimization()
349
+
350
+ # Retrieval configuration
351
+ self.url_retrieval = "http://127.0.0.1:5004/queries"
352
+
353
+ def _create_decoder(self, cfg: CLaRaConfig) -> AutoModelForCausalLM:
354
+ """Create and configure the decoder model."""
355
+ if not torch.cuda.is_available():
356
+ return AutoModelForCausalLM.from_pretrained(
357
+ cfg.decoder_model_name,
358
+ torch_dtype=torch.bfloat16,
359
+ resume_download=True,
360
+ trust_remote_code=True,
361
+ device_map=cfg.device_map
362
+ )
363
+
364
+ if cfg.quantization == "no":
365
+ return AutoModelForCausalLM.from_pretrained(
366
+ cfg.decoder_model_name,
367
+ torch_dtype=torch.bfloat16,
368
+ attn_implementation=cfg.attn_implementation,
369
+ device_map=cfg.device_map
370
+ )
371
+ elif cfg.quantization == "int4":
372
+ quant_config = BitsAndBytesConfig(
373
+ load_in_4bit=True,
374
+ bnb_4bit_quant_type='nf4',
375
+ bnb_4bit_compute_dtype='bfloat16',
376
+ )
377
+ return AutoModelForCausalLM.from_pretrained(
378
+ cfg.decoder_model_name,
379
+ quantization_config=quant_config,
380
+ attn_implementation=cfg.attn_implementation,
381
+ torch_dtype=torch.bfloat16,
382
+ resume_download=True,
383
+ trust_remote_code=True,
384
+ device_map=cfg.device_map
385
+ )
386
+ elif cfg.quantization == "int8":
387
+ quant_config = BitsAndBytesConfig(
388
+ load_in_8bit=True,
389
+ llm_int8_enable_fp32_cpu_offload=True,
390
+ bnb_4bit_compute_dtype='bfloat16',
391
+ )
392
+ return AutoModelForCausalLM.from_pretrained(
393
+ cfg.decoder_model_name,
394
+ quantization_config=quant_config,
395
+ attn_implementation=cfg.attn_implementation,
396
+ torch_dtype=torch.bfloat16,
397
+ resume_download=True,
398
+ trust_remote_code=True,
399
+ device_map=cfg.device_map
400
+ )
401
+ else:
402
+ raise NotImplementedError(f"Quantization {cfg.quantization} not supported")
403
+
404
+ def _setup_lora_adapters(self, cfg: CLaRaConfig):
405
+ """Setup LoRA adapters based on training stage."""
406
+ peft_config = self._get_peft_config(lora_r=cfg.lora_r)
407
+
408
+ if cfg.training_stage == "stage1" and cfg.load_adapters:
409
+ print('Loading encoder and decoder adapter for stage1')
410
+ self.decoder.add_adapter(peft_config, 'decoder_adapter')
411
+ self.adapter_keys.append('decoder_adapter')
412
+ self.decoder.add_adapter(peft_config, 'encoder_adapter')
413
+ self.adapter_keys.append('encoder_adapter')
414
+ elif cfg.training_stage == "stage2" and cfg.load_adapters:
415
+ if 'decoder_adapter' not in self.adapter_keys:
416
+ self.decoder.add_adapter(peft_config, 'decoder_adapter')
417
+ self.adapter_keys.append('decoder_adapter')
418
+ if 'query_reasoner_adapter' not in self.adapter_keys:
419
+ self.decoder.add_adapter(peft_config, 'query_reasoner_adapter')
420
+ self.adapter_keys.append('query_reasoner_adapter')
421
+ elif cfg.training_stage == 'stage1_2':
422
+ if not cfg.load_adapters:
423
+ print('Loading decoder adapter for stage1_2')
424
+ self.decoder.add_adapter(peft_config, 'decoder_adapter')
425
+ self.adapter_keys.append('decoder_adapter')
426
+ elif cfg.load_adapters:
427
+ print('Loading encoder and decoder adapter for stage1_2')
428
+ self.decoder.add_adapter(peft_config, 'encoder_adapter')
429
+ self.adapter_keys.append('encoder_adapter')
430
+ self.decoder.add_adapter(peft_config, 'decoder_adapter')
431
+ self.adapter_keys.append('decoder_adapter')
432
+ elif cfg.training_stage == 'stage2_reasoning':
433
+ if not cfg.load_adapters:
434
+ print('Loading decoder adapter for stage2_reasoning')
435
+ self.decoder.add_adapter(peft_config, 'decoder_adapter')
436
+ self.adapter_keys.append('decoder_adapter')
437
+
438
+ def _setup_adapter_training(self):
439
+ """Setup adapters for training."""
440
+ for adapter_key in self.adapter_keys:
441
+ self.decoder.set_adapter(adapter_key)
442
+ print(f'Adapter {adapter_key} trainable parameters: {self.num_parameters(only_trainable=True)}')
443
+ self._set_all_adapters()
444
+
445
+ def _configure_generation_config(self):
446
+ """Configure generation parameters."""
447
+ self.decoder.generation_config.top_p = None
448
+ self.decoder.generation_config.temperature = None
449
+ self.decoder.generation_config.pad_token_id = self.decoder_tokenizer.pad_token_id
450
+
451
+ @staticmethod
452
+ def _create_decoder_tokenizer(cfg: CLaRaConfig) -> AutoTokenizer:
453
+ """Create and configure the decoder tokenizer."""
454
+ tokenizer = AutoTokenizer.from_pretrained(
455
+ cfg.decoder_model_name,
456
+ use_fast=True,
457
+ padding_side='left'
458
+ )
459
+
460
+ # Define special tokens
461
+ n_mem_tokens = cfg.doc_max_length // cfg.compr_rate
462
+ existing_special_tokens = tokenizer.special_tokens_map.get("additional_special_tokens", [])
463
+
464
+ if cfg.different_mem_tokens:
465
+ mem_tokens = [f'<MEM{i}>' for i in range(n_mem_tokens)]
466
+ tokenizer.add_special_tokens({
467
+ 'additional_special_tokens': existing_special_tokens + mem_tokens + ['<AE>', '<ENC>', '<SEP>']
468
+ })
469
+ tokenizer.mem_tokens = mem_tokens
470
+ else:
471
+ tokenizer.add_special_tokens({
472
+ 'additional_special_tokens': existing_special_tokens + ['<MEM>', '<AE>', '<ENC>', '<SEP>']
473
+ })
474
+ tokenizer.mem_tokens = ['<MEM>'] * n_mem_tokens
475
+
476
+ tokenizer.mem_token_ids = [tokenizer.convert_tokens_to_ids(token) for token in tokenizer.mem_tokens]
477
+ tokenizer.mem_token_ids_pt = torch.LongTensor(tokenizer.mem_token_ids)
478
+
479
+ # Additional special tokens
480
+ tokenizer.ae_token = '<AE>'
481
+ tokenizer.ae_token_id = tokenizer.convert_tokens_to_ids('<AE>')
482
+ tokenizer.enc_token = '<ENC>'
483
+ tokenizer.sep_token = '<SEP>'
484
+ tokenizer.sep_token_id = tokenizer.convert_tokens_to_ids('<SEP>')
485
+
486
+ # Handle model-specific tokens
487
+ if tokenizer.bos_token is None and 'qwen' in cfg.decoder_model_name.lower():
488
+ tokenizer.bos_token = tokenizer.special_tokens_map['additional_special_tokens'][0]
489
+ tokenizer.bos_token_id = tokenizer.convert_tokens_to_ids(tokenizer.bos_token)
490
+
491
+ if tokenizer.eos_token is None and "qwen" in cfg.decoder_model_name.lower():
492
+ tokenizer.eos_token = tokenizer.special_tokens_map['additional_special_tokens'][1]
493
+ tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids(tokenizer.eos_token)
494
+
495
+ # KBTC training tokens
496
+ if cfg.kbtc_training:
497
+ tokenizer.add_special_tokens({'additional_special_tokens': ['<KBTC>']})
498
+ tokenizer.kbtc_token = '<KBTC>'
499
+ tokenizer.kbtc_token_id = tokenizer.convert_tokens_to_ids('<KBTC>')
500
+
501
+ # Set pad token
502
+ if tokenizer.pad_token_id is None:
503
+ tokenizer.pad_token_id = tokenizer.bos_token_id
504
+
505
+ print(f'Memory token count: {n_mem_tokens}')
506
+ return tokenizer
507
+
508
+ def _get_peft_config(self, lora_r: int) -> LoraConfig:
509
+ """Build the PEFT configuration."""
510
+ return LoraConfig(
511
+ task_type="CAUSAL_LM",
512
+ r=lora_r,
513
+ lora_alpha=2*lora_r,
514
+ target_modules='all-linear',
515
+ lora_dropout=0.1
516
+ )
517
+
518
+ def _prepare_mem_tokens_optimization(self):
519
+ """Setup memory token optimization if enabled."""
520
+ if self.config.optimize_mem_tokens and self.compr is None:
521
+ # Enable gradients for input embeddings
522
+ self.decoder.get_input_embeddings().weight.requires_grad = True
523
+
524
+ # Apply hook to zero gradients except for memory tokens
525
+ def hook(grad):
526
+ mask = torch.zeros_like(grad)
527
+ mask[self.decoder_tokenizer.mem_token_ids] = 1.0
528
+ return grad * mask
529
+
530
+ self.decoder.get_input_embeddings().weight.register_hook(hook)
531
+
532
+ def _set_all_adapters(self):
533
+ """Activate all adapters for training."""
534
+ if len(self.adapter_keys) > 0:
535
+ self.decoder.set_adapter(self.adapter_keys)
536
+
537
+ # Core compression and generation methods
538
+ def compress(self, enc_input_ids: torch.Tensor, enc_attention_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
539
+ """Compress input documents."""
540
+ if self.compr:
541
+ return self.compr(enc_input_ids, enc_attention_mask)
542
+ else:
543
+ return self._compr_decoder(enc_input_ids, enc_attention_mask)
544
+
545
+ def _compr_decoder(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
546
+ """Use decoder as compressor."""
547
+ assert input_ids.size() == attention_mask.size()
548
+
549
+ if 'encoder_adapter' in self.adapter_keys:
550
+ self.decoder.set_adapter('encoder_adapter')
551
+ else:
552
+ raise ValueError(f"encoder_adapter not in adapter_keys: {self.adapter_keys}")
553
+
554
+ # Get embeddings from decoder
555
+ emb = self.decoder(
556
+ input_ids=input_ids,
557
+ attention_mask=attention_mask,
558
+ output_hidden_states=True
559
+ ).hidden_states[-1]
560
+
561
+ # Create mask for memory tokens
562
+ mask = torch.isin(
563
+ input_ids,
564
+ self.decoder_tokenizer.mem_token_ids_pt.to(input_ids.device)
565
+ )
566
+
567
+ # Calculate MSE loss between memory and non-memory regions
568
+ attn = attention_mask.bool()
569
+ mem_mask = mask & attn
570
+ non_mem_mask = (~mask) & attn
571
+
572
+ mem_len = mem_mask.sum(dim=1)
573
+ non_mem_len = non_mem_mask.sum(dim=1)
574
+
575
+ if (mem_len == 0).any():
576
+ raise ValueError("Some samples have no memory tokens")
577
+ if (non_mem_len == 0).any():
578
+ raise ValueError("Some samples have no non-memory tokens")
579
+
580
+ mem_sum = (emb * mem_mask.unsqueeze(-1)).sum(dim=1)
581
+ non_mem_sum = (emb * non_mem_mask.unsqueeze(-1)).sum(dim=1)
582
+
583
+ mem_mean = mem_sum / mem_len.unsqueeze(-1)
584
+ non_mem_mean = non_mem_sum / non_mem_len.unsqueeze(-1)
585
+
586
+ mse_loss = F.mse_loss(non_mem_mean, mem_mean, reduction='mean')
587
+
588
+ return emb[mask].reshape(emb.size(0), -1, emb.size(-1)), mse_loss
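# Shape note for the compressor above, with illustrative numbers: with doc_max_length 256
# and compr_rate 16, each document sequence carries 16 memory tokens, so for B questions
# with top_k documents each the returned embeddings have shape [B * top_k, 16, hidden_size],
# and mse_loss is a scalar aligning memory-token and ordinary-token mean hidden states.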
589
+
590
+ def _compr_query_reasoner_stage2(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
591
+ """Query reasoning compression for stage 2."""
592
+ assert input_ids.size() == attention_mask.size()
593
+
594
+ if 'query_reasoner_adapter' in self.adapter_keys:
595
+ self.decoder.set_adapter('query_reasoner_adapter')
596
+ else:
597
+ raise ValueError(f"query_reasoner_adapter not in adapter_keys: {self.adapter_keys}")
598
+
599
+ emb = self.decoder(
600
+ input_ids=input_ids,
601
+ attention_mask=attention_mask,
602
+ output_hidden_states=True
603
+ ).hidden_states[-1]
604
+
605
+ mask = torch.isin(
606
+ input_ids,
607
+ self.decoder_tokenizer.mem_token_ids_pt.to(input_ids.device)
608
+ )
609
+
610
+ return emb[mask].reshape(emb.size(0), -1)
611
+
612
+ # Generation methods
613
+ def generate_from_questions(self,
614
+ questions: List[str],
615
+ max_new_tokens: int = 128,
616
+ temperature: float = 0.5,
617
+ documents: List[List[str]] = None,
618
+ stage2_mips: bool = False,
619
+ stage2_retrieval_top_n: int = None,
620
+ time_count: bool = False) -> Tuple[List[str], torch.Tensor]:
621
+ """Generate answers from questions using query reasoning."""
622
+ if "query_reasoner_adapter" not in self.adapter_keys:
623
+ raise ValueError("Query reasoner adapter not found")
624
+
625
+ self.eval()
626
+
627
+ with torch.no_grad():
628
+ # Encode questions
629
+ self.decoder.set_adapter('query_reasoner_adapter')
630
+ flat_questions = [q for q in questions]
631
+
632
+ if time_count:
633
+ start_time = time.time()
634
+
635
+ q_tok = self._prepare_encoder_inputs(flat_questions, max_length=self.doc_max_length)
636
+ query_reps = self._compr_query_reasoner_stage2(
637
+ q_tok["input_ids"].to(self.decoder.device),
638
+ q_tok["attention_mask"].to(self.decoder.device)
639
+ )
640
+
641
+ # Document retrieval and selection
642
+ if stage2_mips:
643
+ retrieved_doc_embeddings = self._retrieve_embeddings(
644
+ query_reps, stage2_retrieval_top_n=stage2_retrieval_top_n
645
+ )
646
+ scores = torch.bmm(
647
+ query_reps.unsqueeze(1),
648
+ retrieved_doc_embeddings.transpose(1, 2)
649
+ ).squeeze(1)
650
+ z, topk_idx = differentiable_topk(scores, self.generation_top_k, temperature=0.5)
651
+ selected_doc_embeddings = torch.einsum('bkn,bnd->bkd', z, retrieved_doc_embeddings)
652
+ selected_doc_embeddings = selected_doc_embeddings.view(
653
+ selected_doc_embeddings.size(0) * selected_doc_embeddings.size(1),
654
+ -1, self.hidden_size
655
+ )
656
+ else:
657
+ # Use provided documents
658
+ flat_documents = sum(documents, [])
659
+
660
+ if time_count:
661
+ start_time1 = time.time()
662
+
663
+ input_encoder = self._prepare_encoder_inputs(flat_documents, max_length=self.doc_max_length)
664
+ device = self.decoder.device
665
+ enc_input_ids = input_encoder['input_ids'].to(device)
666
+ enc_attention_mask = input_encoder['attention_mask'].to(device)
667
+ retrieved_doc_embeddings, _ = self.compress(enc_input_ids, enc_attention_mask)
668
+
669
+ if time_count:
670
+ start_time2 = time.time()
671
+ compress_time = start_time2 - start_time1
672
+
673
+ B = len(questions)
674
+ stage2_retrieval_top_n = retrieved_doc_embeddings.shape[0] // B
675
+ retrieved_doc_embeddings = retrieved_doc_embeddings.reshape(B, stage2_retrieval_top_n, -1)
676
+ query_reps = query_reps.to(retrieved_doc_embeddings.dtype)
677
+
678
+ if time_count:
679
+ start_time3 = time.time()
680
+
681
+ scores = torch.bmm(
682
+ F.normalize(query_reps, dim=-1, p=2).unsqueeze(1).float(),
683
+ F.normalize(retrieved_doc_embeddings, dim=-1, p=2).float().transpose(1, 2)
684
+ ).squeeze(1)
685
+
686
+ z, topk_idx = differentiable_topk(scores, self.generation_top_k, temperature=0.02)
687
+ selected_doc_embeddings = torch.einsum('bkn,bnd->bkd', z.to(retrieved_doc_embeddings.dtype), retrieved_doc_embeddings)
688
+ selected_doc_embeddings = selected_doc_embeddings.view(
689
+ selected_doc_embeddings.size(0) * selected_doc_embeddings.size(1),
690
+ -1, self.hidden_size
691
+ )
692
+
693
+ if time_count:
694
+ start_time4 = time.time()
695
+ query_time = start_time4 - start_time3 + start_time1 - start_time
696
+
697
+ # Generate instructions and decode
698
+ if time_count:
699
+ start_time5 = time.time()
700
+
701
+ instructions = [
702
+ self._blend_prompt_and_selected_memory_tokens(query=q)[1]
703
+ for q in questions
704
+ ]
705
+
706
+ decoder_inputs = self.decoder_tokenizer(
707
+ instructions,
708
+ return_tensors='pt',
709
+ padding="longest",
710
+ add_special_tokens=False,
711
+ truncation=True,
712
+ max_length=1024,
713
+ )
714
+
715
+ dec_input_ids = decoder_inputs['input_ids'].to(self.decoder.device)
716
+ dec_attention_mask = decoder_inputs['attention_mask'].to(self.decoder.device)
717
+
718
+ # Replace memory token embeddings
719
+ inputs_embeds = self._replace_emb_stage2(selected_doc_embeddings, dec_input_ids)
720
+
721
+ # Switch to decoder adapter for generation
722
+ if 'decoder_adapter' in self.adapter_keys:
723
+ self.decoder.set_adapter('decoder_adapter')
724
+
725
+ # Generate answers
726
+ output_ids = self.decoder.generate(
727
+ inputs_embeds=inputs_embeds,
728
+ attention_mask=dec_attention_mask,
729
+ do_sample=False,
730
+ top_p=None,
731
+ temperature=None,
732
+ max_new_tokens=max_new_tokens,
733
+ pad_token_id=self.decoder_tokenizer.pad_token_id
734
+ )
735
+
736
+ if time_count:
737
+ start_time6 = time.time()
738
+ generate_time = start_time6 - start_time5
739
+
740
+ # Decode generated tokens
741
+ decoded = self.decoder_tokenizer.batch_decode(output_ids, skip_special_tokens=True)
742
+
743
+ if time_count:
744
+ return decoded, topk_idx, compress_time, query_time, generate_time, compress_time + query_time + generate_time
745
+ else:
746
+ return decoded, topk_idx
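The soft document selection above goes through `differentiable_topk(scores, k, temperature)`, which is defined elsewhere in this repository. As a rough illustration of the idea (a common iterative-softmax relaxation, assumed here rather than copied from the repo), the helper below returns a soft selection matrix `z` of shape `[B, k, N]` plus hard indices, matching how `z` is consumed by the `einsum` calls above:

```python
import torch
import torch.nn.functional as F

def differentiable_topk_sketch(scores: torch.Tensor, k: int, temperature: float = 1.0):
    """Hypothetical relaxation: k softmax rounds, masking out previous picks.

    scores: [B, N] similarity scores.
    Returns z: [B, k, N] soft one-hot rows, topk_idx: [B, k] hard indices.
    """
    masked = scores.clone()
    rows, idxs = [], []
    for _ in range(k):
        probs = F.softmax(masked / temperature, dim=-1)              # soft pick over remaining docs
        idx = probs.argmax(dim=-1)                                   # hard pick for bookkeeping
        rows.append(probs)
        idxs.append(idx)
        masked = masked.scatter(1, idx.unsqueeze(1), float("-inf"))  # exclude the picked doc
    return torch.stack(rows, dim=1), torch.stack(idxs, dim=1)

# Toy check: z mixes document embeddings exactly as in the model code above.
scores = torch.randn(2, 5)                        # B=2 queries, N=5 candidate docs
docs = torch.randn(2, 5, 8)                       # per-doc embeddings of size 8
z, topk_idx = differentiable_topk_sketch(scores, k=2, temperature=0.02)
selected = torch.einsum("bkn,bnd->bkd", z, docs)  # [2, 2, 8]
print(selected.shape, topk_idx)
```

At low temperature the soft rows approach one-hot vectors, so the weighted mixture degenerates to picking the top-k compressed documents while keeping the selection differentiable.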
747
+ def generate_from_paraphrase(self, questions: list[str], documents: list[list[str]], max_new_tokens: int = 128) -> list[str]:
748
+ """
749
+ Generate answers from documents via compression then decoding.
750
+ questions: list of strings
751
+ documents: list of lists of strings; all inner lists must be the same length (the number of documents per question)
752
+ """
753
+ self.generation_top_k = len(documents[0])
754
+ assert len(documents) == len(questions)
755
+ assert all([len(context) == len(documents[0]) for context in documents])
756
+ flat_documents = sum(documents, [])
757
+
758
+ model_input = {}
759
+
760
+ # Creating encoder inputs:
761
+ input_encoder = self._prepare_encoder_inputs(flat_documents, max_length=self.doc_max_length)
762
+ device = self.decoder.device
763
+ model_input['enc_input_ids'], model_input['enc_attention_mask'] = input_encoder['input_ids'].to(device), input_encoder['attention_mask'].to(device)
764
+
765
+ # Creating decoder inputs
766
+ instr = [self._blend_prompt_and_memory_tokens(query="", stage="stage1", paraphrase_loss=True) for q in questions]
767
+ inp_dec = self.decoder_tokenizer(instr, return_tensors='pt', padding="longest", add_special_tokens=False, truncation=True, max_length=1024)
768
+ model_input['dec_input_ids'], model_input['dec_attention_mask'] = inp_dec['input_ids'].to(device), inp_dec['attention_mask'].to(device)
769
+
770
+ # Generation
771
+ return self._generate(model_input, max_new_tokens=max_new_tokens)
772
+
773
+
774
+ def generate_from_text(self,
775
+ questions: List[str],
776
+ documents: List[List[str]],
777
+ max_new_tokens: int = 128) -> List[str]:
778
+ """Generate answers from documents via compression then decoding."""
779
+ self.generation_top_k = len(documents[0])
780
+ assert len(documents) == len(questions)
781
+ assert all(len(context) == len(documents[0]) for context in documents)
782
+
783
+ flat_documents = sum(documents, [])
784
+
785
+ # Create encoder inputs
786
+ input_encoder = self._prepare_encoder_inputs(flat_documents, max_length=self.doc_max_length)
787
+ device = self.decoder.device
788
+ enc_input_ids = input_encoder['input_ids'].to(device)
789
+ enc_attention_mask = input_encoder['attention_mask'].to(device)
790
+
791
+ # Create decoder inputs
792
+ instructions = [self._blend_prompt_and_memory_tokens(query=q, stage="stage1_2") for q in questions]
793
+ inp_dec = self.decoder_tokenizer(
794
+ instructions,
795
+ return_tensors='pt',
796
+ padding="longest",
797
+ add_special_tokens=False,
798
+ truncation=True,
799
+ max_length=1024
800
+ )
801
+ dec_input_ids = inp_dec['input_ids'].to(device)
802
+ dec_attention_mask = inp_dec['attention_mask'].to(device)
803
+
804
+ # Generate
805
+ return self._generate({
806
+ 'enc_input_ids': enc_input_ids,
807
+ 'enc_attention_mask': enc_attention_mask,
808
+ 'dec_input_ids': dec_input_ids,
809
+ 'dec_attention_mask': dec_attention_mask
810
+ }, max_new_tokens=max_new_tokens)
811
+
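For completeness, a minimal sketch of calling `generate_from_text` directly, with the caller supplying the candidate documents (no retrieval step). The checkpoint path is a placeholder; loading follows the standard `trust_remote_code` pattern:

```python
from transformers import AutoModel

# Placeholder path: point this at a CLaRa-7B-E2E checkpoint directory.
model = AutoModel.from_pretrained(
    "path/to/CLaRa-7B-E2E/compression-16", trust_remote_code=True
).to("cuda")

questions = ["Who wrote 'The Old Man and the Sea'?"]
documents = [[
    "Ernest Hemingway wrote 'The Old Man and the Sea' in 1951.",
    "The novella won the Pulitzer Prize for Fiction in 1953.",
]]  # inner lists must all have the same length; it sets generation_top_k

answers = model.generate_from_text(questions=questions, documents=documents, max_new_tokens=64)
print(answers)
```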
812
+ def generate_from_compressed_documents_and_questions(self,
813
+ questions: List[str],
814
+ compressed_documents: torch.Tensor,
815
+ max_new_tokens: int = 128) -> List[str]:
816
+ """Generate answers from compressed documents."""
817
+ self.generation_top_k = compressed_documents.size(0) // len(questions)
818
+ assert compressed_documents.size(0) % self.generation_top_k == 0
819
+
820
+ # Create decoder inputs
821
+ instructions = [self._blend_prompt_and_memory_tokens(query=q, stage="stage1_2") for q in questions]
822
+ inp_dec = self.decoder_tokenizer(
823
+ instructions,
824
+ return_tensors='pt',
825
+ padding="longest",
826
+ add_special_tokens=False,
827
+ truncation=True,
828
+ max_length=1024
829
+ )
830
+ device = self.decoder.device
831
+ dec_input_ids = inp_dec['input_ids'].to(device)
832
+ dec_attention_mask = inp_dec['attention_mask'].to(device)
833
+
834
+ # Create input decoder embeddings from prompt + compressed documents
835
+ inputs_embeds = self._replace_emb(compressed_documents, dec_input_ids)
836
+
837
+ # Activate decoder generator
838
+ if 'decoder_adapter' in self.adapter_keys:
839
+ self.decoder.set_adapter('decoder_adapter')
840
+
841
+ output_ids = self.decoder.generate(
842
+ inputs_embeds=inputs_embeds,
843
+ attention_mask=dec_attention_mask,
844
+ max_new_tokens=max_new_tokens
845
+ )
846
+
847
+ return self.decoder_tokenizer.batch_decode(output_ids, skip_special_tokens=True)
848
+
849
+ def compress_documents(self, documents: List[str]) -> torch.Tensor:
850
+ """Compress a list of documents."""
851
+ input_encoder = self._prepare_encoder_inputs(documents, max_length=self.doc_max_length)
852
+ enc_input_ids = input_encoder['input_ids'].to(self.decoder.device)
853
+ attention_mask = input_encoder['attention_mask'].to(self.decoder.device)
854
+ return self.compress(enc_input_ids=enc_input_ids, enc_attention_mask=attention_mask)
855
+
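`compress_documents` pairs naturally with `generate_from_compressed_documents_and_questions`: a corpus can be compressed once and the resulting embeddings reused across questions. Since `compress_documents` forwards the `(embeddings, mse_loss)` pair returned by `compress`, the embeddings are its first element. A short sketch, reusing a loaded `model` as above:

```python
docs = [
    "First background passage ...",
    "Second background passage ...",
]
compressed, _ = model.compress_documents(docs)  # compress() returns (embeddings, mse_loss)

answers = model.generate_from_compressed_documents_and_questions(
    questions=["A question about the passages?"],
    compressed_documents=compressed,
    max_new_tokens=64,
)
print(answers)
```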
856
+ # Helper methods
857
+ def _prepare_encoder_inputs(self, texts: List[str], max_length: int, q_texts: List[str] = None) -> Dict[str, torch.Tensor]:
858
+ """Create inputs for the encoder."""
859
+ if q_texts is not None:
860
+ assert len(texts) == len(q_texts)
861
+
862
+ if self.compr is None:
863
+ return self._prepare_encoder_inputs_to_decoder(texts, max_length, q_texts)
864
+ else:
865
+ return self.compr.prepare_inputs(texts, max_length, q_texts)
866
+
867
+ def _prepare_encoder_inputs_to_decoder(self, texts: List[str], max_length: int, q_texts: List[str] = None) -> Dict[str, torch.Tensor]:
868
+ """Prepare encoder inputs when using decoder as compressor."""
869
+ if q_texts is not None:
870
+ texts_to_encode = [
871
+ self.decoder_tokenizer.enc_token +
872
+ self.decoder_tokenizer.bos_token +
873
+ '\nQuery:\n' + query +
874
+ 'Document:\n' + text +
875
+ self.decoder_tokenizer.eos_token
876
+ for text, query in zip(texts, q_texts)
877
+ ]
878
+ inp_enc = self.decoder_tokenizer(
879
+ texts_to_encode,
880
+ return_tensors='pt',
881
+ padding='max_length',
882
+ max_length=max_length + 8,
883
+ truncation=True,
884
+ add_special_tokens=False
885
+ )
886
+ else:
887
+ inp_enc = [
888
+ self.decoder_tokenizer.enc_token +
889
+ self.decoder_tokenizer.bos_token +
890
+ text +
891
+ self.decoder_tokenizer.eos_token
892
+ for text in texts
893
+ ]
894
+ inp_enc = self.decoder_tokenizer(
895
+ inp_enc,
896
+ return_tensors='pt',
897
+ padding="max_length",
898
+ max_length=max_length + 3,
899
+ truncation=True,
900
+ add_special_tokens=False
901
+ )
902
+
903
+ num_mem_tokens = self.doc_max_length // self.compr_rate
904
+ assert num_mem_tokens == len(self.decoder_tokenizer.mem_tokens)
905
+
906
+ inp_enc['input_ids'], inp_enc['attention_mask'] = add_memory_tokens_to_inputs(
907
+ inp_enc['input_ids'],
908
+ inp_enc['attention_mask'],
909
+ num_mem_tokens,
910
+ tokenizer=self.decoder_tokenizer
911
+ )
912
+
913
+ return inp_enc
914
+
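The number of memory tokens appended per document is fixed by the compression rate: `num_mem_tokens = doc_max_length // compr_rate`, and the assertion above requires the tokenizer to carry exactly that many `mem_tokens`. A quick illustration with a hypothetical `doc_max_length` of 128:

```python
doc_max_length = 128  # illustrative value; the real one comes from the model config
for compr_rate in (16, 128):
    print(compr_rate, doc_max_length // compr_rate)  # 16x -> 8 memory tokens, 128x -> 1
```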
915
+ def _replace_emb(self, compressed_embs: torch.Tensor, dec_input_ids: torch.Tensor) -> torch.Tensor:
916
+ """Replace memory tokens in decoder input with compressed embeddings."""
917
+ indices = range(0, compressed_embs.size(0) + 1, self.generation_top_k)
918
+ return self._replace_embeddings(compressed_embs, dec_input_ids, indices)
919
+
920
+ def _replace_emb_stage2(self, compressed_embs: torch.Tensor, dec_input_ids: torch.Tensor) -> torch.Tensor:
921
+ """Replace memory tokens for stage 2."""
922
+ indices = range(0, compressed_embs.size(0) + 1, self.generation_top_k)
923
+ return self._replace_embeddings(compressed_embs, dec_input_ids, indices)
924
+
925
+ def _replace_embeddings(self, compressed_embs: torch.Tensor, dec_input_ids: torch.Tensor, indices: range) -> torch.Tensor:
926
+ """Replace memory tokens with compressed embeddings."""
927
+ inputs_embeds = self.decoder.get_input_embeddings()(dec_input_ids)
928
+ num_embs = compressed_embs.size(1)
929
+ slot_len = num_embs + (1 if self.sep else 0)
930
+
931
+ # Get first memory token indices
932
+ first_mem_token_indices = torch.argmax(
933
+ (dec_input_ids == self.decoder_tokenizer.mem_token_ids[0]).int(), dim=1
934
+ )
935
+ batch_size = inputs_embeds.size(0)
936
+
937
+ # Replace with compressed embeddings
938
+ for i in range(batch_size):
939
+ for j in range(indices[i], indices[i + 1]):
940
+ start_idx = first_mem_token_indices[i].item() + (j - indices[i]) * slot_len
941
+ assert inputs_embeds[i, start_idx:start_idx + num_embs, :].size() == compressed_embs[j].size()
942
+ inputs_embeds[i, start_idx:start_idx + num_embs, :] = compressed_embs[j]
943
+
944
+ return inputs_embeds
945
+
946
+ def _retrieve_embeddings(self, questions: torch.Tensor, stage2_retrieval_top_n: int = 1) -> torch.Tensor:
947
+ """Retrieve embeddings of documents."""
948
+ response = requests.post(
949
+ self.url_retrieval,
950
+ json={
951
+ "queries": questions.detach().cpu().float().numpy().tolist(),
952
+ 'k': self.generation_top_k
953
+ }
954
+ )
955
+
956
+ if response.status_code != 200:
957
+ raise Exception(f"Error: {response.status_code} - {response.text}")
958
+
959
+ results = response.json()
960
+ retrieval_embeddings = results['retrieved_embeddings']
961
+ retrieval_embeddings = torch.tensor(
962
+ retrieval_embeddings,
963
+ dtype=torch.bfloat16,
964
+ device=questions.device
965
+ )
966
+
967
+ if len(retrieval_embeddings.shape) == 4:
968
+ retrieval_embeddings = retrieval_embeddings.reshape(
969
+ retrieval_embeddings.shape[0] * retrieval_embeddings.shape[1],
970
+ retrieval_embeddings.shape[2], -1
971
+ )
972
+
973
+ return retrieval_embeddings
974
+
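`_retrieve_embeddings` treats retrieval as an external HTTP service: it POSTs `{"queries": [...], "k": ...}` to `self.url_retrieval` and expects a JSON response containing `retrieved_embeddings`. The stub below is purely illustrative (the real index service is not part of this repo; the route, port, and embedding dimension are assumptions) but satisfies that contract with random embeddings:

```python
# Hypothetical stand-in for the retrieval endpoint expected by _retrieve_embeddings.
import numpy as np
from flask import Flask, request, jsonify

app = Flask(__name__)
EMB_DIM = 4096  # must match the flattened compressed-document dimension; illustrative here

@app.route("/retrieve", methods=["POST"])
def retrieve():
    body = request.get_json()
    queries = np.asarray(body["queries"], dtype=np.float32)  # [B, EMB_DIM]
    k = int(body["k"])
    # A real server would run MIPS over a compressed-document index here.
    embs = np.random.randn(queries.shape[0], k, EMB_DIM).astype(np.float32)
    return jsonify({"retrieved_embeddings": embs.tolist()})

if __name__ == "__main__":
    app.run(port=8000)
```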
975
+ def _blend_prompt_and_memory_tokens(self, query: str, answer: str = None, qa_loss: bool = False,
976
+ paraphrase_loss: bool = False, stage: str = "stage1") -> Tuple[int, str]:
977
+ """Blend prompt with memory tokens for different training stages."""
978
+ mem_tokens_str = ''.join(self.decoder_tokenizer.mem_tokens) + self.decoder_tokenizer.sep_token
979
+ docs = mem_tokens_str * self.generation_top_k
980
+
981
+ if stage == "stage1":
982
+ if qa_loss:
983
+ return self._blend_qa_prompt(docs, query, answer)
984
+ elif paraphrase_loss:
985
+ return self._blend_paraphrase_prompt(docs, answer)
986
+ elif stage == "stage1_2":
987
+ return self._blend_standard_prompt(docs, query, answer)
988
+
989
+ raise ValueError(f"Unknown stage: {stage}")
990
+
991
+ def _blend_qa_prompt(self, docs: str, query: List[str], answer: List[str]) -> Tuple[int, str]:
992
+ """Create QA prompt for stage 1."""
993
+ prompt_system = 'You are a helpful assistant. Given a document, your task is to generate some single questions to cover all key information of the document and answer them sequentially.'
994
+ prompt_user = f"Background:\n{docs}"
995
+
996
+ sys_prompt = [{"role": "system", "content": prompt_system}]
997
+ user_prompt = [{"role": "user", "content": prompt_user.replace(':\ ', ': ')}]
998
+
999
+ qa_lines = [f"Question: {q}\nAnswer: {a}" for q, a in zip(query, answer)]
1000
+ query_answer = "\n".join(qa_lines)
1001
+ assistant_prompt = [{"role": "assistant", "content": query_answer}]
1002
+
1003
+ try:
1004
+ prompt = self.decoder_tokenizer.apply_chat_template(
1005
+ sys_prompt + user_prompt,
1006
+ tokenize=False,
1007
+ add_generation_prompt=True,
1008
+ enable_thinking=False
1009
+ )
1010
+ response = self.decoder_tokenizer.apply_chat_template(
1011
+ sys_prompt + user_prompt + assistant_prompt,
1012
+ tokenize=False,
1013
+ add_generation_prompt=False,
1014
+ enable_thinking=False
1015
+ )
1016
+ prompt_len = len(self.decoder_tokenizer.encode(prompt, add_special_tokens=False))
1017
+ except TemplateError as e:
1018
+ if "System role not supported" in str(e):
1019
+ messages = [{"role": "user", "content": sys_prompt[0]['content'] + '\n' + user_prompt[0]['content']}]
1020
+ prompt = self.decoder_tokenizer.apply_chat_template(
1021
+ messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
1022
+ )
1023
+ prompt_len = len(self.decoder_tokenizer.encode(prompt, add_special_tokens=False))
1024
+ # Handle response for unsupported system role
1025
+ messages_with_answer = messages + assistant_prompt
1026
+ response = self.decoder_tokenizer.apply_chat_template(
1027
+ messages_with_answer, tokenize=False, add_generation_prompt=False, enable_thinking=False
1028
+ )
1029
+ else:
1030
+ raise e
1031
+
1032
+ return prompt_len, response
1033
+
1034
+ def _blend_paraphrase_prompt(self, docs: str, answer: str) -> Tuple[int, str]:
1035
+ """Create paraphrase prompt for stage 1."""
1036
+ prompt_system = 'You are a helpful assistant. Your task is follow the instructions to paraphrase the background information.'
1037
+ prompt_user = random.choice(PARAPHRASE_INSTRUCTIONS).format(docs=docs)
1038
+
1039
+ sys_prompt = [{"role": "system", "content": prompt_system}]
1040
+ user_prompt = [{"role": "user", "content": prompt_user.replace(':\ ', ': ')}]
1041
+
1042
+ try:
1043
+ prompt = self.decoder_tokenizer.apply_chat_template(
1044
+ sys_prompt + user_prompt,
1045
+ tokenize=False,
1046
+ add_generation_prompt=True,
1047
+ enable_thinking=False
1048
+ )
1049
+ if answer is None:
1050
+ return prompt
1051
+
1052
+ assistant_prompt = [{"role": "assistant", "content": answer}]
1053
+ response = self.decoder_tokenizer.apply_chat_template(
1054
+ sys_prompt + user_prompt + assistant_prompt,
1055
+ tokenize=False,
1056
+ add_generation_prompt=False,
1057
+ enable_thinking=False
1058
+ )
1059
+ prompt_len = len(self.decoder_tokenizer.encode(prompt, add_special_tokens=False))
1060
+ except TemplateError as e:
1061
+ if "System role not supported" in str(e):
1062
+ combined_content = prompt_system + '\n' + prompt_user.replace(':\ ', ': ')
1063
+ messages = [{"role": "user", "content": combined_content}]
1064
+ prompt = self.decoder_tokenizer.apply_chat_template(
1065
+ messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
1066
+ )
1067
+ if answer is None:
1068
+ return prompt
1069
+ prompt_len = len(self.decoder_tokenizer.encode(prompt, add_special_tokens=False))
1070
+ messages_with_answer = messages + [{"role": "assistant", "content": answer}]
1071
+ response = self.decoder_tokenizer.apply_chat_template(
1072
+ messages_with_answer, tokenize=False, add_generation_prompt=False, enable_thinking=False
1073
+ )
1074
+ else:
1075
+ raise e
1076
+
1077
+ return prompt_len, response
1078
+
1079
+ def _blend_standard_prompt(self, docs: str, query: str, answer: str) -> Tuple[int, str]:
1080
+ """Create standard prompt for stage 1_2."""
1081
+ prompt_system = 'You are a helpful assistant. Your task is to extract relevant information from provided documents and to answer to questions as briefly as possible.'
1082
+ prompt_user = f"Background:\n{docs}\n\nQuestion:{query}"
1083
+
1084
+ sys_prompt = [{"role": "system", "content": prompt_system}]
1085
+ user_prompt = [{"role": "user", "content": prompt_user.replace(':\ ', ': ')}]
1086
+
1087
+ try:
1088
+ prompt = self.decoder_tokenizer.apply_chat_template(
1089
+ sys_prompt + user_prompt,
1090
+ tokenize=False,
1091
+ add_generation_prompt=True,
1092
+ enable_thinking=False
1093
+ )
1094
+ if answer is None:
1095
+ return prompt
1096
+
1097
+ assistant_prompt = [{"role": "assistant", "content": answer}]
1098
+ response = self.decoder_tokenizer.apply_chat_template(
1099
+ sys_prompt + user_prompt + assistant_prompt,
1100
+ tokenize=False,
1101
+ add_generation_prompt=False,
1102
+ enable_thinking=False
1103
+ )
1104
+ prompt_len = len(self.decoder_tokenizer.encode(prompt, add_special_tokens=False))
1105
+ except TemplateError as e:
1106
+ if "System role not supported" in str(e):
1107
+ combined_content = prompt_system + '\n' + prompt_user.replace(':\ ', ': ')
1108
+ messages = [{"role": "user", "content": combined_content}]
1109
+ prompt = self.decoder_tokenizer.apply_chat_template(
1110
+ messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
1111
+ )
1112
+ if answer is None:
1113
+ return prompt
1114
+ prompt_len = len(self.decoder_tokenizer.encode(prompt, add_special_tokens=False))
1115
+ messages_with_answer = messages + [{"role": "assistant", "content": answer}]
1116
+ response = self.decoder_tokenizer.apply_chat_template(
1117
+ messages_with_answer, tokenize=False, add_generation_prompt=False, enable_thinking=False
1118
+ )
1119
+ else:
1120
+ raise e
1121
+
1122
+ return prompt_len, response
1123
+
1124
+ def _blend_prompt_and_selected_memory_tokens(self, query: str, answer: str = None) -> Tuple[int, str]:
1125
+ """Create prompt for stage 2 with selected memory tokens."""
1126
+ mem_tokens_str = ''.join(self.decoder_tokenizer.mem_tokens) + self.decoder_tokenizer.sep_token
1127
+ docs = mem_tokens_str * self.generation_top_k
1128
+
1129
+ prompt_system = 'You are a helpful assistant. Your task is to extract relevant information from provided documents and to answer to questions as briefly as possible.'
1130
+ prompt_user = f"Background:\n{docs}\n\nQuestion:{query}"
1131
+
1132
+ sys_prompt = [{"role": "system", "content": prompt_system}]
1133
+ user_prompt = [{"role": "user", "content": prompt_user.replace(':\ ', ': ')}]
1134
+
1135
+ try:
1136
+ prompt = self.decoder_tokenizer.apply_chat_template(
1137
+ sys_prompt + user_prompt,
1138
+ tokenize=False,
1139
+ add_generation_prompt=True,
1140
+ enable_thinking=False
1141
+ )
1142
+ prompt_len = len(self.decoder_tokenizer.encode(prompt, add_special_tokens=False))
1143
+
1144
+ if answer is not None:
1145
+ assistant_prompt = [{"role": "assistant", "content": answer}]
1146
+ response = self.decoder_tokenizer.apply_chat_template(
1147
+ sys_prompt + user_prompt + assistant_prompt,
1148
+ tokenize=False,
1149
+ add_generation_prompt=False,
1150
+ enable_thinking=False
1151
+ )
1152
+ else:
1153
+ response = prompt
1154
+
1155
+ except TemplateError as e:
1156
+ if "System role not supported" in str(e):
1157
+ combined_content = prompt_system + '\n' + prompt_user.replace(':\ ', ': ')
1158
+ messages = [{"role": "user", "content": combined_content}]
1159
+
1160
+ prompt = self.decoder_tokenizer.apply_chat_template(
1161
+ messages,
1162
+ tokenize=False,
1163
+ add_generation_prompt=True,
1164
+ enable_thinking=False
1165
+ )
1166
+ prompt_len = len(self.decoder_tokenizer.encode(prompt, add_special_tokens=False))
1167
+
1168
+ if answer is not None:
1169
+ messages_with_answer = messages + [{"role": "assistant", "content": answer}]
1170
+ response = self.decoder_tokenizer.apply_chat_template(
1171
+ messages_with_answer,
1172
+ tokenize=False,
1173
+ add_generation_prompt=False,
1174
+ enable_thinking=False
1175
+ )
1176
+ else:
1177
+ response = prompt
1178
+ else:
1179
+ raise e
1180
+
1181
+ return prompt_len, response
1182
+
1183
+ # Model saving and loading methods
1184
+ def save_pretrained(self, save_directory: str, **kwargs):
1185
+ """Save only the LoRA adapters and their configurations."""
1186
+ if self.lora:
1187
+ if not os.path.exists(save_directory):
1188
+ os.makedirs(save_directory)
1189
+
1190
+ # Save LoRA adapter weights
1191
+ torch.save(
1192
+ self._get_all_adapters_state_dict(),
1193
+ os.path.join(save_directory, "adapters.pth")
1194
+ )
1195
+
1196
+ # Save first and last layers of decoder
1197
+ torch.save(
1198
+ self._get_decoder_first_and_last_layer_state_dict(),
1199
+ os.path.join(save_directory, "decoder_first_last_layers.pth")
1200
+ )
1201
+
1202
+ # Save configuration
1203
+ self.config.save_pretrained(save_directory)
1204
+ else:
1205
+ super().save_pretrained(save_directory, **kwargs)
1206
+
1207
+ def _get_all_adapters_state_dict(self) -> Dict[str, Dict[str, torch.Tensor]]:
1208
+ """Return the state dicts of all adapters."""
1209
+ return {
1210
+ key: {k: v.cpu() for k, v in self.decoder.get_adapter_state_dict(key).items()}
1211
+ for key in self.adapter_keys
1212
+ }
1213
+
1214
+ def _get_decoder_first_and_last_layer_state_dict(self) -> Dict[str, torch.Tensor]:
1215
+ """Get first and last layers that change when adding tokens."""
1216
+ out = {}
1217
+ for k, v in self.decoder.named_parameters():
1218
+ if 'lm_head.weight' in k or 'embed_tokens.weight' in k:
1219
+ out[k] = v.cpu()
1220
+ return out
1221
+
1222
+ @classmethod
1223
+ def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs):
1224
+ """Load model from pretrained checkpoint."""
1225
+ # Load configuration
1226
+ config = CLaRaConfig.from_pretrained(pretrained_model_name_or_path)
1227
+
1228
+ # Update config with kwargs
1229
+ for key, value in kwargs.items():
1230
+ if hasattr(config, key):
1231
+ setattr(config, key, value)
1232
+
1233
+ map_location = torch.device("cpu") if not torch.cuda.is_available() else None
1234
+
1235
+ if config.lora:
1236
+ # Delay adapter construction
1237
+ config.load_adapters = False
1238
+ if 'device_map' in kwargs:
1239
+ config.device_map = kwargs['device_map']
1240
+
1241
+ # Initialize model
1242
+ print(f"Initializing model from trained checkpoint: {config}")
1243
+ model = cls(config)
1244
+
1245
+ # Load first and last layers
1246
+ try:
1247
+ first_and_last_layers_path = hf_hub_download(
1248
+ repo_id=pretrained_model_name_or_path,
1249
+ filename="decoder_first_last_layers.pth"
1250
+ )
1251
+ except Exception:
1252
+ first_and_last_layers_path = os.path.join(
1253
+ pretrained_model_name_or_path, "decoder_first_last_layers.pth"
1254
+ )
1255
+
1256
+ if os.path.exists(first_and_last_layers_path):
1257
+ first_and_last_decoder_state_dict = torch.load(
1258
+ first_and_last_layers_path, map_location=map_location, weights_only=True
1259
+ )
1260
+ for key in first_and_last_decoder_state_dict:
1261
+ assert key in model.decoder.state_dict()
1262
+ model.decoder.load_state_dict(first_and_last_decoder_state_dict, strict=False)
1263
+ else:
1264
+ print(f'First and last layers not found: {first_and_last_layers_path}')
1265
+
1266
+ peft_config = model._get_peft_config(lora_r=config.lora_r)
1267
+
1268
+ # Load LoRA adapters
1269
+ try:
1270
+ adapters_path = hf_hub_download(
1271
+ repo_id=pretrained_model_name_or_path,
1272
+ filename="adapters.pth"
1273
+ )
1274
+ except Exception:
1275
+ adapters_path = os.path.join(pretrained_model_name_or_path, "adapters.pth")
1276
+
1277
+ if os.path.exists(adapters_path):
1278
+ adapters_state_dict = torch.load(adapters_path, map_location=map_location, weights_only=True)
1279
+ model._load_adapters_from_state_dict(adapters_state_dict, peft_config, config)
1280
+ else:
1281
+ warnings.warn(f'Adapters not found at {adapters_path}')
1282
+
1283
+ model._set_all_adapters()
1284
+ config.load_adapters = True
1285
+ return model
1286
+ else:
1287
+ return super().from_pretrained(pretrained_model_name_or_path, **kwargs)
1288
+ def _load_adapters_from_state_dict(self, adapters_state_dict: Dict, peft_config: LoraConfig, config: CLaRaConfig):
1289
+ """Load adapters from state dict based on training stage."""
1290
+ if not getattr(config, 'pure_inference', False):
1291
+ for key, val in adapters_state_dict.items():
1292
+ # Skip certain adapters based on training stage
1293
+ if config.training_stage == 'stage1' and key == 'query_reasoner_adapter':
1294
+ continue
1295
+ elif config.training_stage == 'stage1_2' and key in ['query_reasoner_adapter', 'decoder_adapter']:
1296
+ continue
1297
+ elif config.training_stage == 'stage2_reasoning' and key == 'decoder_adapter':
1298
+ continue
1299
+
1300
+ self._load_adapter_from_state_dict(
1301
+ peft_config=peft_config,
1302
+ adapter_name=key,
1303
+ adapter_state_dict=val
1304
+ )
1305
+ else:
1306
+ # Load all adapters for pure inference
1307
+ for key, val in adapters_state_dict.items():
1308
+ self._load_adapter_from_state_dict(
1309
+ peft_config=peft_config,
1310
+ adapter_name=key,
1311
+ adapter_state_dict=val
1312
+ )
1313
+
1314
+ # Handle special cases for stage 2 training
1315
+ if config.training_stage == 'stage2' and 'query_reasoner_adapter' not in adapters_state_dict:
1316
+ self._handle_query_reasoner_adapter_loading(adapters_state_dict, peft_config)
1317
+
1318
+ def _load_adapter_from_state_dict(self, peft_config: LoraConfig, adapter_name: str, adapter_state_dict: Dict):
1319
+ """Create adapter from state dict."""
1320
+ print(f'Loading checkpoint adapter: {adapter_name}')
1321
+ self.decoder.load_adapter(
1322
+ peft_config=peft_config,
1323
+ adapter_name=adapter_name,
1324
+ adapter_state_dict=adapter_state_dict
1325
+ )
1326
+ self.adapter_keys.append(adapter_name)
1327
+
1328
+ def _handle_query_reasoner_adapter_loading(self, adapters_state_dict: Dict, peft_config: LoraConfig):
1329
+ """Handle special loading logic for query reasoner adapter."""
1330
+ if 'encoder_adapter' in adapters_state_dict and 'query_reasoner_adapter' not in adapters_state_dict:
1331
+ # Rename encoder adapter to query reasoner adapter
1332
+ renamed = {}
1333
+ for k, v in adapters_state_dict['encoder_adapter'].items():
1334
+ new_k = k.replace('encoder_adapter', 'query_reasoner_adapter')
1335
+ renamed[new_k] = v.detach().clone()
1336
+
1337
+ self._load_adapter_from_state_dict(
1338
+ peft_config=peft_config,
1339
+ adapter_name='query_reasoner_adapter',
1340
+ adapter_state_dict=renamed
1341
+ )
1342
+ print('Loaded query_reasoner_adapter from stage 1 compressor checkpoint')
1343
+ else:
1344
+ # Create new adapter randomly
1345
+ self.decoder.add_adapter(peft_config, 'query_reasoner_adapter')
1346
+ self.adapter_keys.append('query_reasoner_adapter')
1347
+ print('Initialized query_reasoner_adapter randomly for stage 2 training')
1348
+
1349
+ # Forward pass methods
1350
+ def forward(self,
1351
+ batch: Dict = None,
1352
+ questions: List[str] = None,
1353
+ documents: List[List[str]] = None,
1354
+ answers: List[str] = None,
1355
+ original_answer_gen_api: str = None,
1356
+ stage2_mips: bool = False,
1357
+ stage2_retrieval_top_n: int = None) -> Tuple[torch.Tensor, Dict]:
1358
+ """
1359
+ Forward pass with support for both batch and legacy interfaces.
1360
+
1361
+ Args:
1362
+ batch: Preprocessed batch dict (new interface)
1363
+ questions: List of questions (legacy interface)
1364
+ documents: List of document lists (legacy interface)
1365
+ answers: List of answers (legacy interface)
1366
+ original_answer_gen_api: API URL for generation (legacy interface)
1367
+ stage2_mips: Whether to use MIPS for stage2
1368
+ stage2_retrieval_top_n: Top-n for stage2 retrieval
1369
+
1370
+ Returns:
1371
+ Tuple of (loss, additional_outputs_dict)
1372
+ """
1373
+ if batch is not None:
1374
+ return self._forward_batch(batch, stage2_mips, stage2_retrieval_top_n)
1375
+ else:
1376
+ return self._forward_legacy(questions, documents, answers, original_answer_gen_api)
1377
+
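For the batch interface, the keys consumed in stage 1 (see `_forward_stage1_batch` below) are `stage`, `enc_input_ids`, `enc_attention_mask`, `dec_input_ids`, `dec_attention_mask`, and `labels`. The sketch below only shows the expected structure with placeholder shapes; real batches must come from the training collator so that memory tokens and lengths line up:

```python
import torch

B, top_k, enc_len, dec_len = 2, 5, 136, 256  # placeholder sizes

batch = {
    "stage": "stage1",
    "enc_input_ids": torch.zeros(B * top_k, enc_len, dtype=torch.long),
    "enc_attention_mask": torch.ones(B * top_k, enc_len, dtype=torch.long),
    "dec_input_ids": torch.zeros(B, dec_len, dtype=torch.long),
    "dec_attention_mask": torch.ones(B, dec_len, dtype=torch.long),
    "labels": torch.full((B, dec_len), -100, dtype=torch.long),  # -100 masks prompt tokens
}
# loss, extras = model(batch=batch)   # extras carries "logits" and "mse_loss"
```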
1378
+ def _forward_batch(self, batch: Dict, stage2_mips: bool, stage2_retrieval_top_n: int) -> Tuple[torch.Tensor, Dict]:
1379
+ """Handle batch-based forward pass."""
1380
+ stage = batch.get("stage", None)
1381
+
1382
+ if stage in ["stage1", "stage1_2"]:
1383
+ return self._forward_stage1_batch(batch)
1384
+ elif stage == "stage2":
1385
+ return self._forward_stage2_batch(batch, stage2_mips, stage2_retrieval_top_n)
1386
+ elif stage == "stage2_pretrain_retrieval":
1387
+ return self._forward_stage2_pretrain_batch(batch, stage2_mips, stage2_retrieval_top_n)
1388
+ elif stage == "stage2_reasoning":
1389
+ return self._forward_stage2_reasoning_batch(batch)
1390
+ else:
1391
+ raise ValueError(f"Unknown stage: {stage}")
1392
+
1393
+ def _forward_stage1_batch(self, batch: Dict) -> Tuple[torch.Tensor, Dict]:
1394
+ """Forward pass for stage 1 training."""
1395
+ # Move tensors to device
1396
+ enc_input_ids = batch["enc_input_ids"].to(self.decoder.device)
1397
+ enc_attention_mask = batch["enc_attention_mask"].to(self.decoder.device)
1398
+ dec_input_ids = batch["dec_input_ids"].to(self.decoder.device)
1399
+ dec_attention_mask = batch["dec_attention_mask"].to(self.decoder.device)
1400
+ labels = batch["labels"].to(self.decoder.device)
1401
+
1402
+ out = self._forward_stage_1(
1403
+ enc_input_ids=enc_input_ids,
1404
+ enc_attention_mask=enc_attention_mask,
1405
+ dec_input_ids=dec_input_ids,
1406
+ dec_attention_mask=dec_attention_mask,
1407
+ labels=labels,
1408
+ )
1409
+ return out["loss"], {"logits": out["logits"], "mse_loss": out["mse_loss"]}
1410
+
1411
+ def _forward_stage2_batch(self, batch: Dict, stage2_mips: bool, stage2_retrieval_top_n: int) -> Tuple[torch.Tensor, Dict]:
1412
+ """Forward pass for stage 2 training."""
1413
+ self.decoder.set_adapter('query_reasoner_adapter')
1414
+
1415
+ B = batch["labels"].shape[0]
1416
+ query_reps = self._compr_query_reasoner_stage2(
1417
+ batch["query_input_ids"].to(self.decoder.device),
1418
+ batch["query_attention_mask"].to(self.decoder.device)
1419
+ )
1420
+
1421
+ enc_input_ids = batch["enc_input_ids"].to(self.decoder.device)
1422
+ enc_attention_mask = batch["enc_attention_mask"].to(self.decoder.device)
1423
+ dec_input_ids = batch["dec_input_ids"].to(self.decoder.device)
1424
+ dec_attention_mask = batch["dec_attention_mask"].to(self.decoder.device)
1425
+ labels = batch["labels"].to(self.decoder.device)
1426
+
1427
+ # Document retrieval and selection
1428
+ if stage2_mips:
1429
+ retrieved_doc_embeddings = self._retrieve_embeddings(
1430
+ query_reps, stage2_retrieval_top_n=stage2_retrieval_top_n
1431
+ )
1432
+ scores = torch.bmm(
1433
+ query_reps.unsqueeze(1),
1434
+ retrieved_doc_embeddings.transpose(1, 2)
1435
+ ).squeeze(1)
1436
+ z, topk_idx = differentiable_topk(scores, self.generation_top_k, temperature=1)
1437
+ selected = torch.einsum('bkn,bnd->bkd', z, retrieved_doc_embeddings)
1438
+ selected = selected.view(selected.size(0) * selected.size(1), -1, self.hidden_size)
1439
+ else:
1440
+ with torch.no_grad():
1441
+ retrieved_doc_embeddings, mse_loss = self.compress(enc_input_ids, enc_attention_mask)
1442
+
1443
+ stage2_retrieval_top_n = retrieved_doc_embeddings.shape[0] // B
1444
+ retrieved_doc_embeddings = retrieved_doc_embeddings.reshape(B, stage2_retrieval_top_n, -1)
1445
+ query_reps = query_reps.to(retrieved_doc_embeddings.dtype)
1446
+
1447
+ scores = torch.bmm(
1448
+ F.normalize(query_reps, dim=-1, p=2).unsqueeze(1).float(),
1449
+ F.normalize(retrieved_doc_embeddings, dim=-1, p=2).float().transpose(1, 2)
1450
+ ).squeeze(1)
1451
+
1452
+ z, topk_idx = differentiable_topk(scores, self.generation_top_k, temperature=0.02)
1453
+ selected = torch.einsum('bkn,bnd->bkd', z.to(retrieved_doc_embeddings.dtype), retrieved_doc_embeddings)
1454
+ selected = selected.view(selected.size(0) * selected.size(1), -1, self.hidden_size)
1455
+
1456
+ inputs_embeds = self._replace_emb_stage2(selected, dec_input_ids)
1457
+
1458
+ if 'decoder_adapter' in self.adapter_keys:
1459
+ self.decoder.set_adapter('decoder_adapter')
1460
+
1461
+ dec_out = self.decoder(
1462
+ inputs_embeds=inputs_embeds,
1463
+ attention_mask=dec_attention_mask,
1464
+ labels=labels,
1465
+ )
1466
+
1467
+ self.decoder.set_adapter(['decoder_adapter', 'query_reasoner_adapter'])
1468
+ return dec_out.loss, {"logits": dec_out.logits, "topk_idx": topk_idx, "mse_loss": mse_loss}
1469
+
1470
+ def _forward_stage2_pretrain_batch(self, batch: Dict, stage2_mips: bool, stage2_retrieval_top_n: int) -> Tuple[torch.Tensor, Dict]:
1471
+ """Forward pass for stage 2 pretraining with retrieval."""
1472
+ self.decoder.set_adapter('query_reasoner_adapter')
1473
+
1474
+ B = batch["labels"].shape[0]
1475
+ N = batch["enc_input_ids"].shape[0] // B
1476
+ device = self.decoder.device
1477
+
1478
+ query_reps = self._compr_query_reasoner_stage2(
1479
+ batch["query_input_ids"].to(device),
1480
+ batch["query_attention_mask"].to(device)
1481
+ )
1482
+
1483
+ enc_input_ids = batch["enc_input_ids"].to(device)
1484
+ enc_attention_mask = batch["enc_attention_mask"].to(device)
1485
+
1486
+ with torch.no_grad():
1487
+ retrieved_doc_embeddings, mse_loss = self.compress(enc_input_ids, enc_attention_mask)
1488
+
1489
+ stage2_retrieval_top_n = retrieved_doc_embeddings.shape[0] // B
1490
+ retrieved_doc_embeddings = retrieved_doc_embeddings.reshape(B, stage2_retrieval_top_n, -1)
1491
+ query_reps = query_reps.to(retrieved_doc_embeddings.dtype)
1492
+
1493
+ scores = torch.bmm(
1494
+ F.normalize(query_reps, dim=-1, p=2).unsqueeze(1).float(),
1495
+ F.normalize(retrieved_doc_embeddings, dim=-1, p=2).float().transpose(1, 2)
1496
+ ).squeeze(1)
1497
+
1498
+ pos_index = batch["pos_index"]
1499
+ pos_mask = build_pos_mask(pos_index, N, device)
1500
+ tau = 0.02
1501
+ logits = scores / tau
1502
+
1503
+ pos_logits = logits.masked_fill(~pos_mask, float('-inf'))
1504
+ num = torch.logsumexp(pos_logits, dim=-1)
1505
+ den = torch.logsumexp(logits, dim=-1)
1506
+ loss_vec = -(num - den)
1507
+ valid = pos_mask.any(dim=-1)
1508
+ loss = loss_vec[valid].mean()
1509
+
1510
+ topk = self.generation_top_k
1511
+ topk_idx = logits.topk(k=min(topk, N), dim=-1).indices
1512
+
1513
+ return loss, {"logits": [[]], "topk_idx": topk_idx, "mse_loss": mse_loss}
1514
+
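The retrieval pretraining objective above is a multi-positive InfoNCE: with temperature τ, each query contributes loss_i = -log( Σ_{j∈pos(i)} exp(s_ij/τ) / Σ_j exp(s_ij/τ) ), averaged over queries that have at least one positive. A tiny reproduction on toy scores (values are illustrative only):

```python
import torch

scores = torch.tensor([[0.9, 0.1, 0.2], [0.3, 0.8, 0.7]])             # [B=2, N=3] cosine scores
pos_mask = torch.tensor([[True, False, False], [False, True, True]])  # positives per query
tau = 0.02

logits = scores / tau
num = torch.logsumexp(logits.masked_fill(~pos_mask, float("-inf")), dim=-1)
den = torch.logsumexp(logits, dim=-1)
loss = -(num - den)[pos_mask.any(dim=-1)].mean()
print(loss)  # small here, since the positives already score highest
```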
1515
+ def _forward_stage2_reasoning_batch(self, batch: Dict) -> Tuple[torch.Tensor, Dict]:
1516
+ """Forward pass for stage 2 reasoning training."""
1517
+ B = batch["labels"].shape[0]
1518
+ enc_input_ids = batch["enc_input_ids"].to(self.decoder.device)
1519
+ enc_attention_mask = batch["enc_attention_mask"].to(self.decoder.device)
1520
+ dec_input_ids = batch["dec_input_ids"].to(self.decoder.device)
1521
+ dec_attention_mask = batch["dec_attention_mask"].to(self.decoder.device)
1522
+ labels = batch["labels"].to(self.decoder.device)
1523
+
1524
+ if sum(batch["docs_num"]) != 0:
1525
+ with torch.no_grad():
1526
+ selected, mse_loss = self.compress(enc_input_ids, enc_attention_mask)
1527
+ indices = batch["docs_num"]
1528
+ inputs_embeds = self._replace_reasoning_embeddings(selected, dec_input_ids, indices)
1529
+ else:
1530
+ inputs_embeds = self.decoder.get_input_embeddings()(dec_input_ids)
1531
+ mse_loss = 0
1532
+
1533
+ if 'decoder_adapter' in self.adapter_keys:
1534
+ self.decoder.set_adapter('decoder_adapter')
1535
+
1536
+ dec_out = self.decoder(
1537
+ inputs_embeds=inputs_embeds,
1538
+ attention_mask=dec_attention_mask,
1539
+ labels=labels,
1540
+ )
1541
+
1542
+ self.decoder.set_adapter(['decoder_adapter'])
1543
+ return dec_out.loss, {"logits": dec_out.logits, "mse_loss": mse_loss}
1544
+
1545
+ def _forward_stage_1(self,
1546
+ enc_input_ids: torch.LongTensor = None,
1547
+ enc_attention_mask: torch.LongTensor = None,
1548
+ dec_input_ids: torch.LongTensor = None,
1549
+ dec_attention_mask: torch.LongTensor = None,
1550
+ labels: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
1551
+ """Stage 1 forward pass for document compression and QA."""
1552
+ assert enc_input_ids.size() == enc_attention_mask.size()
1553
+
1554
+ # Flatten 3D inputs to 2D if needed
1555
+ if len(enc_input_ids.size()) == 3:
1556
+ batch_size, top_k, seq_length = enc_input_ids.size()
1557
+ enc_input_ids = enc_input_ids.view(batch_size * top_k, seq_length)
1558
+ enc_attention_mask = enc_attention_mask.view(batch_size * top_k, seq_length)
1559
+
1560
+ assert enc_input_ids.size(0) == dec_input_ids.size(0) * self.generation_top_k
1561
+
1562
+ # Compress documents
1563
+ compressed_embs, mse_loss = self.compress(enc_input_ids, enc_attention_mask)
1564
+
1565
+ # Replace memory tokens with compressed embeddings
1566
+ inputs_embeds = self._replace_emb(compressed_embs, dec_input_ids)
1567
+
1568
+ # Detach if compressor-only training
1569
+ if (self.training_form == "compressor") and (self.compr is None):
1570
+ inputs_embeds = inputs_embeds.detach()
1571
+
1572
+ # Set decoder adapter
1573
+ if 'decoder_adapter' in self.adapter_keys:
1574
+ self.decoder.set_adapter('decoder_adapter')
1575
+
1576
+ # Forward through decoder
1577
+ decoder_outputs = self.decoder(
1578
+ inputs_embeds=inputs_embeds,
1579
+ attention_mask=dec_attention_mask,
1580
+ labels=labels
1581
+ )
1582
+
1583
+ # Reactivate all adapters
1584
+ self.decoder.set_adapter(['decoder_adapter', 'encoder_adapter'])
1585
+
1586
+ return {
1587
+ "loss": decoder_outputs.loss,
1588
+ "logits": decoder_outputs.logits,
1589
+ "mse_loss": mse_loss
1590
+ }
1591
+
1592
+ def _replace_reasoning_embeddings(self,
1593
+ compressed_embs: torch.Tensor,
1594
+ dec_input_ids: torch.LongTensor,
1595
+ docs_per_example: List[int]) -> torch.Tensor:
1596
+ """Replace memory slots with compressed embeddings for reasoning."""
1597
+ device = dec_input_ids.device
1598
+ inputs_embeds = self.decoder.get_input_embeddings()(dec_input_ids)
1599
+
1600
+ num_embs = compressed_embs.size(1)
1601
+ slot_len = num_embs + (1 if getattr(self, "sep", False) else 0)
1602
+
1603
+ if not isinstance(docs_per_example, torch.Tensor):
1604
+ docs_per_example = torch.tensor(docs_per_example, device=device, dtype=torch.long)
1605
+ else:
1606
+ docs_per_example = docs_per_example.to(device=device, dtype=torch.long)
1607
+
1608
+ offsets = torch.zeros(docs_per_example.size(0) + 1, device=device, dtype=torch.long)
1609
+ offsets[1:] = torch.cumsum(docs_per_example, dim=0)
1610
+ total_docs = int(offsets[-1].item())
1611
+ assert total_docs == compressed_embs.size(0)
1612
+
1613
+ mem_id = self.decoder_tokenizer.mem_token_ids[0]
1614
+ B, L, H = inputs_embeds.size()
1615
+
1616
+ for i in range(B):
1617
+ # Find first memory token position
1618
+ mem_pos = (dec_input_ids[i] == mem_id).nonzero(as_tuple=True)[0]
1619
+ if mem_pos.numel() == 0:
1620
+ continue
1621
+ first_mem_idx = int(mem_pos[0].item())
1622
+
1623
+ n_docs_i = int(docs_per_example[i].item())
1624
+ base = int(offsets[i].item())
1625
+
1626
+ needed_len = first_mem_idx + n_docs_i * slot_len
1627
+ assert needed_len <= L
1628
+
1629
+ for local_j in range(n_docs_i):
1630
+ global_j = base + local_j
1631
+ start_idx = first_mem_idx + local_j * slot_len
1632
+ target_slice = inputs_embeds[i, start_idx:start_idx + num_embs, :]
1633
+ src = compressed_embs[global_j]
1634
+ assert target_slice.size() == src.size()
1635
+ inputs_embeds[i, start_idx:start_idx + num_embs, :] = src
1636
+
1637
+ return inputs_embeds
1638
+
1639
+ def _generate(self, model_input: Dict[str, torch.Tensor], max_new_tokens: int = 128,
1640
+ return_doc_embeddings: bool = False) -> List[str]:
1641
+ """Generate text from model inputs."""
1642
+ enc_input_ids = model_input['enc_input_ids']
1643
+ enc_attention_mask = model_input['enc_attention_mask']
1644
+ dec_input_ids = model_input['dec_input_ids']
1645
+ dec_attention_mask = model_input['dec_attention_mask']
1646
+
1647
+ assert enc_input_ids.size() == enc_attention_mask.size()
1648
+
1649
+ if len(enc_input_ids.size()) == 3:
1650
+ batch_size, top_k, seq_length = enc_input_ids.size()
1651
+ enc_input_ids = enc_input_ids.view(batch_size * top_k, seq_length)
1652
+ enc_attention_mask = enc_attention_mask.view(batch_size * top_k, seq_length)
1653
+
1654
+ assert enc_input_ids.size(0) == dec_input_ids.size(0) * self.generation_top_k
1655
+
1656
+ compressed_embs, _ = self.compress(enc_input_ids.to('cuda'), enc_attention_mask.to('cuda'))
1657
+ inputs_embeds = self._replace_emb(compressed_embs, dec_input_ids.to('cuda'))
1658
+
1659
+ if 'decoder_adapter' in self.adapter_keys:
1660
+ self.decoder.set_adapter('decoder_adapter')
1661
+
1662
+ output_ids = self.decoder.generate(
1663
+ inputs_embeds=inputs_embeds.to("cuda"),
1664
+ attention_mask=dec_attention_mask.to("cuda"),
1665
+ do_sample=False,
1666
+ top_p=None,
1667
+ max_new_tokens=max_new_tokens
1668
+ )
1669
+
1670
+ decoded = self.decoder_tokenizer.batch_decode(output_ids, skip_special_tokens=True)
1671
+
1672
+ if return_doc_embeddings:
1673
+ assert 'batch_size' in locals() and 'top_k' in locals()
1674
+ compressed_embs = compressed_embs.view(batch_size, top_k, compressed_embs.size(1), compressed_embs.size(2))
1675
+ return decoded, compressed_embs
1676
+ else:
1677
+ return decoded
1678
+
1679
+
1680
+ # Example usage and testing
1681
+ if __name__ == '__main__':
1682
+ # Example configuration
1683
+ cfg = CLaRaConfig(
1684
+ decoder_model_name='/mnt/ceph_rbd/model/Mistral-7B-Instruct-v0.2',
1685
+ compr_model_name="mistral_trimmed",
1686
+ compr_rate=64,
1687
+ compr_n_layers=5,
1688
+ compr_mlp_hidden_dim=8096,
1689
+ compr_use_mlp=False,
1690
+ lora=True,
1691
+ lora_compressor=True,
1692
+ training_form="both",
1693
+ load_adapters=True,
1694
+ kbtc_training=False,
1695
+ optimize_mem_tokens=True,
1696
+ different_mem_tokens=True,
1697
+ attn_implementation='flash_attention_2'
1698
+ )
1699
+
1700
+ # Initialize model
1701
+ clara = CLaRa(cfg)
1702
+
1703
+ # Save and reload test
1704
+ clara.save_pretrained('test_ckpt')
1705
+
1706
+ del clara
1707
+ torch.cuda.empty_cache()
1708
+ gc.collect()
1709
+
1710
+ # Reload model
1711
+ clara = CLaRa.from_pretrained('test_ckpt')
1712
+ print("Model successfully loaded!")
compression-16/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
compression-16/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
compression-16/tokenizer.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
+ size 493443
compression-16/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "additional_special_tokens": [],
32
+ "bos_token": "<s>",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "extra_special_tokens": {},
36
+ "legacy": false,
37
+ "model_max_length": 1000000000000000019884624838656,
38
+ "pad_token": "</s>",
39
+ "sp_model_kwargs": {},
40
+ "spaces_between_special_tokens": false,
41
+ "tokenizer_class": "LlamaTokenizer",
42
+ "unk_token": "<unk>",
43
+ "use_default_system_prompt": false
44
+ }
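The tokenizer files added under each compression subfolder are a standard Mistral/Llama SentencePiece tokenizer with `pad_token` set to `</s>`; the memory, separator, and encoder tokens referenced by the modeling code appear to be added on top at load time. Loading the tokenizer on its own would look roughly like this (the repo id is a placeholder):

```python
from transformers import AutoTokenizer

# Placeholder repo id / local path; pick the subfolder matching the checkpoint variant.
tok = AutoTokenizer.from_pretrained("path/or/hub-id/CLaRa-7B-E2E", subfolder="compression-16")
print(tok.bos_token, tok.eos_token, tok.pad_token)  # "<s>", "</s>", "</s>" per tokenizer_config.json
```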