Where are the weights?
Some weights of Qwen3ForSequenceClassification were not initialized from the model checkpoint at zeroentropy/zerank-2 and are newly initialized: ['score.weight']
Same here!
>>> model = CrossEncoder("zeroentropy/zerank-2", trust_remote_code=True)
Loading weights: 1%| | 2/398 [00:00<00:00, 3876.44it/s, Materializing param=model.layers.0.input_layernorm.weight]
Loading weights: 100%|██████████████████████████████████| 398/398 [00:00<00:00, 5486.03it/s, Materializing param=model.norm.weight]
Qwen3ForSequenceClassification LOAD REPORT from: zeroentropy/zerank-2
Key          | Status  |
-------------+---------+-
score.weight | MISSING |
Notes:
- MISSING: those params were newly initialized because they were missing from the checkpoint. Consider training on your downstream task.
So the scoring head gets a random init on every load...
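You can confirm this without training anything. A minimal sketch (it loads the checkpoint twice, so it briefly holds two copies in memory): if score.weight actually lived in the checkpoint, both loads would be bit-identical.

import torch
from sentence_transformers import CrossEncoder

# Load the same checkpoint twice; a head stored in the checkpoint
# would be identical across loads.
m1 = CrossEncoder("zeroentropy/zerank-2", trust_remote_code=True)
m2 = CrossEncoder("zeroentropy/zerank-2", trust_remote_code=True)

# CrossEncoder keeps the underlying HF model in .model, and
# Qwen3ForSequenceClassification keeps its classification head in .score.
print(torch.equal(m1.model.score.weight, m2.model.score.weight))
# False -> score.weight is re-drawn at random on every load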
Had the same issue.
I pointed Codex (gpt-5.3-high) at it, and this is what it came up with:
I identified a real model-loading bug behind your logs (score.weight gets randomly initialized each cold start). I'm patching scripts/modal_reranker.py to load ZeroEntropy models through their native modeling_zeranker.py path instead of CrossEncoder sequence-classification mode, and I'll also make smoke output explicit about expected shutdown.
This is the code it created to run this model on Modal; you might need to adapt it for your usage:
import os
import types
from importlib import util as importlib_util
from typing import Any

import modal

APP_NAME = os.getenv("MODAL_RERANK_APP_NAME", "latent-papers-rerank")
DEFAULT_MODEL = os.getenv("MODAL_RERANK_DEFAULT_MODEL", "zeroentropy/zerank-2")
GPU_TYPE = os.getenv("MODAL_RERANK_GPU", "L4")
MODEL_CACHE_DIR = "/root/.cache/huggingface"

_default_batch_size = int(os.getenv("MODAL_RERANK_BATCH_SIZE", "16"))
if _default_batch_size <= 0:
    raise ValueError("MODAL_RERANK_BATCH_SIZE must be >= 1")

_default_max_length = int(os.getenv("MODAL_RERANK_MAX_LENGTH", "1024"))
if _default_max_length <= 0:
    raise ValueError("MODAL_RERANK_MAX_LENGTH must be >= 1")

HF_CACHE_VOLUME = modal.Volume.from_name("latent-papers-hf-cache", create_if_missing=True)

IMAGE = modal.Image.debian_slim(python_version="3.12").pip_install(
    "sentence-transformers>=5.2.2",
    "torch>=2.10.0",
    "accelerate>=1.12.0",
)

app = modal.App(APP_NAME)


@app.cls(
    image=IMAGE,
    gpu=GPU_TYPE,
    volumes={MODEL_CACHE_DIR: HF_CACHE_VOLUME},
    max_containers=8,
    scaledown_window=300,
    timeout=3600,
)
@modal.concurrent(max_inputs=8)
class ZerankReranker:
    def _is_zeroentropy_reranker(self, model_name: str) -> bool:
        normalized = (model_name or "").strip().lower()
        return normalized.startswith("zeroentropy/zerank-")

    def _load_zeroentropy_model(self, model_name: str) -> Any:
        from huggingface_hub import hf_hub_download

        # Import the checkpoint's own modeling_zeranker.py so scoring goes
        # through the model's native code path, not the seq-cls head.
        module_path = hf_hub_download(model_name, "modeling_zeranker.py")
        spec = importlib_util.spec_from_file_location("modeling_zeranker", module_path)
        if spec is None or spec.loader is None:
            raise RuntimeError(f"Failed to load {module_path} as a Python module.")
        module = importlib_util.module_from_spec(spec)
        spec.loader.exec_module(module)

        # The upstream helper hardcodes MODEL_PATH/global_device.
        module.MODEL_PATH = model_name
        model_device = module.torch.device("cuda")
        module.global_device = model_device

        tokenizer, model = module.load_model(model_device)
        model.eval()
        yes_token_id = tokenizer.encode("Yes", add_special_tokens=False)[0]
        state = types.SimpleNamespace(
            inner_tokenizer=tokenizer,
            inner_model=model,
            inner_yes_token_id=yes_token_id,
        )

        # Thin wrapper so callers can use the same .predict() interface
        # as a sentence-transformers CrossEncoder.
        class _Predictor:
            def predict(self, pairs: list[tuple[str, str]], **_: Any) -> list[float]:
                with module.torch.inference_mode():
                    return module.predict(state, query_documents=pairs)

        return _Predictor()

    def _load_model(self, model_name: str) -> None:
        self.model_name = model_name
        if self._is_zeroentropy_reranker(model_name):
            self.model = self._load_zeroentropy_model(model_name)
            print(f"Loaded ZeroEntropy reranker using native runtime: {model_name}")
            return

        from sentence_transformers import CrossEncoder

        self.model = CrossEncoder(
            model_name,
            device="cuda",
            trust_remote_code=True,
            max_length=_default_max_length,
        )
        print(f"Loaded CrossEncoder reranker: {model_name}")

    @modal.enter()
    def setup(self) -> None:
        self._load_model(DEFAULT_MODEL)

    @modal.method()
    def rerank(self, *, query: str, documents: list[str], model: str | None = None) -> dict[str, Any]:
        if not isinstance(query, str):
            raise TypeError("query must be a string")
        if not isinstance(documents, list):
            raise TypeError("documents must be a list of strings")

        selected_model = (model or "").strip() or DEFAULT_MODEL
        if selected_model != self.model_name:
            self._load_model(selected_model)

        if not documents:
            return {"scores": []}

        pairs = [(query, str(doc or "")) for doc in documents]
        scores = self.model.predict(
            pairs,
            batch_size=_default_batch_size,
            show_progress_bar=False,
        )
        return {"scores": [float(score) for score in scores]}


@app.local_entrypoint()
def smoke(
    query: str = "How can retrieval augmentation improve reasoning?",
    document: str = "This paper introduces a reranking method for query-document relevance.",
) -> None:
    model = ZerankReranker()
    result = model.rerank.remote(
        query=query,
        documents=[document],
        model=DEFAULT_MODEL,
    )
    print(result)
    print("Smoke test completed. The app is expected to shut down after the local entrypoint returns.")