Upload folder using huggingface_hub
- README.md +3 -6
- app.py +778 -0
- requirements.txt +4 -0

README.md CHANGED
@@ -1,10 +1,7 @@
 ---
-title:
-emoji:
-colorFrom: gray
-colorTo: blue
+title: VOXLECT
+emoji: 🚀
 sdk: static
-pinned: false
 ---
 
-
+# VOXLECT

app.py ADDED
@@ -0,0 +1,778 @@
# VOXLECT

# ============================================================================
# CELL 1: SETUP AND INSTALLATION (VERIFIED)
# ============================================================================
import os
import sys
import warnings
warnings.filterwarnings('ignore')

print("🚀 VoxLect Indic LID Whisper Large v3 - Final Setup")
print("=" * 60)

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Install packages
print("📦 Installing packages...")

# Clone VoxLect repository (correct syntax)
print("📥 Cloning VoxLect repository...")

# Python path
sys.path.insert(0, '/content/voxlect')
sys.path.insert(0, '/content/voxlect/src')

print("✅ Installation complete!")
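
# --- Hedged sketch: the actual install/clone commands are elided above. The
# package list and repository URL below are assumptions, not confirmed by this
# file; in Colab they would typically be shell magics (!pip ..., !git clone ...).
# import subprocess
# subprocess.check_call([sys.executable, "-m", "pip", "install", "-q",
#                        "transformers", "librosa", "scikit-learn", "xlsxwriter"])
# subprocess.check_call(["git", "clone", "<voxlect-repo-url>", "/content/voxlect"])  # hypothetical URL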

# ============================================================================
# CELL 2: MANDATORY MONKEY PATCH (ATTENTION COMPATIBILITY)
# ============================================================================
import transformers.models.whisper.modeling_whisper as whisper_modeling

print("🔧 Applying attention compatibility patch...")

_OriginalWhisperAttention = whisper_modeling.WhisperAttention

class PatchedWhisperAttention(_OriginalWhisperAttention):
    def _get_attn_impl(self):
        try:
            attn_impl = super()._get_attn_impl()
            if attn_impl is None:
                return "eager"
            return attn_impl
        except AttributeError:
            return "eager"

whisper_modeling.WhisperAttention = PatchedWhisperAttention

print("✅ Monkey patch applied.")
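
# Optional sanity check (an addition, not in the original commit): confirm the
# class swap is what Whisper's module now exposes.
assert whisper_modeling.WhisperAttention is PatchedWhisperAttention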

# ============================================================================
# CELL 3: MODEL LOADING & LABEL LIST
# ============================================================================
import torch
import torch.nn.functional as F
import librosa
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from IPython.display import display

# Import VoxLect after patch
from src.model.dialect.whisper_dialect import WhisperWrapper

# Folder code -> ground truth mapping
CUSTOM_FOLDER_MAPPING = {
    "as": "assamese", "bn": "bengali", "br": "bodo", "doi": "dogri",
    "en": "english", "gu": "gujarati", "hi": "hindi", "kn": "kannada",
    "ks": "kashmiri", "kok": "konkani", "mai": "maithili", "ml": "malayalam",
    "mni": "manipuri", "mr": "marathi", "ne": "nepali", "or": "odia",
    "pa": "punjabi", "sa": "sanskrit", "sat": "santali", "sd": "sindhi",
    "ta": "tamil", "te": "telugu", "ur": "urdu"
}

# IMPORTANT: label order used by the model (adjust if the model card lists a different order)
LABEL_LIST = [
    "assamese", "bengali", "bodo", "dogri", "english", "gujarati",
    "hindi", "kannada", "kashmiri", "konkani", "maithili", "malayalam",
    "manipuri", "marathi", "nepali", "odia", "punjabi", "sanskrit",
    "santali", "sindhi", "tamil", "telugu", "urdu"
]

# Update these paths
AUDIO_FOLDER = "/content/drive/MyDrive/Audio_files"  # <-- set your path
RESULTS_FOLDER = "/content/drive/MyDrive/voxlect_1_results"
os.makedirs(RESULTS_FOLDER, exist_ok=True)

# Device and model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MODEL_NAME = "tiantiaf/voxlect-indic-lid-whisper-large-v3"
print(f"🔧 Device: {device}")

print(f"📥 Loading model: {MODEL_NAME}")
model = WhisperWrapper.from_pretrained(MODEL_NAME).to(device)
model.eval()
print("✅ Model loaded successfully!")
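
# Hedged smoke test (an addition): push one second of silence through the model
# and confirm the logits width matches LABEL_LIST. Assumes the wrapper accepts a
# [1, samples] tensor and returns (logits, features) for return_feature=True,
# exactly as the prediction code below relies on.
with torch.no_grad():
    _probe_logits, _ = model(torch.zeros(1, 16000).to(device), return_feature=True)
print(f"Model output classes: {_probe_logits.shape[-1]}, labels: {len(LABEL_LIST)}")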

# ============================================================================
# CELL 4: AUDIO IO & PREDICTION (ROBUST)
# ============================================================================
def trim_silence(audio, threshold=0.01, win=2048, hop=512):
    rms = librosa.feature.rms(y=audio, frame_length=win, hop_length=hop)[0]
    mask = rms > threshold * (rms.max() if rms.size else 1.0)
    if not mask.any():
        return audio
    idx = np.where(mask)[0]
    start = max(int(idx[0] * hop), 0)
    end = min(int((idx[-1] + 1) * hop), len(audio))
    return audio[start:end]

def load_audio(file_path, target_sr=16000, max_duration=15.0):
    try:
        audio, sr = librosa.load(file_path, sr=target_sr, mono=True)

        # Optional: trim leading/trailing silence to improve discrimination
        audio = trim_silence(audio, threshold=0.01)

        # Duration control
        max_samples = int(max_duration * target_sr)
        if len(audio) > max_samples:
            audio = audio[:max_samples]
        min_samples = int(3.0 * target_sr)
        if len(audio) < min_samples:
            audio = np.pad(audio, (0, min_samples - len(audio)), 'constant')

        # Normalize peak to 1 for stability across files
        peak = np.abs(audio).max()
        if peak > 0:
            audio = audio / peak

        # Diagnostics (optional; comment out after verifying)
        # print(f"dbg: {os.path.basename(file_path)} len={len(audio)} mean={audio.mean():.4f} std={audio.std():.4f}")

        return torch.from_numpy(audio).float().unsqueeze(0)
    except Exception as e:
        print(f"  ❌ Error loading {os.path.basename(file_path)}: {e}")
        return None

def predict_language(audio_tensor, k=5):
    if audio_tensor is None:
        return {"predicted_language": "error", "confidence": 0.0, "top_5_predictions": [], "error_message": "Audio load failed"}
    try:
        audio_tensor = audio_tensor.to(device)
        with torch.no_grad():
            logits, _ = model(audio_tensor, return_feature=True)  # expect [1, C]
            if logits.dim() == 1:
                logits = logits.unsqueeze(0)
            if logits.size(0) != 1:
                logits = logits[:1, :]

            probs = F.softmax(logits, dim=1)[0]  # softmax over classes
            top_probs, top_idx = torch.topk(probs, k)

        top = []
        for rank, (p, ix) in enumerate(zip(top_probs.tolist(), top_idx.tolist()), start=1):
            idx = int(ix)
            lang = LABEL_LIST[idx] if 0 <= idx < len(LABEL_LIST) else f"unknown_{idx}"
            top.append({"rank": rank, "language": lang, "confidence": float(p)})

        return {"predicted_language": top[0]["language"], "confidence": top[0]["confidence"], "top_5_predictions": top}
    except Exception as e:
        return {"predicted_language": "error", "confidence": 0.0, "top_5_predictions": [], "error_message": str(e)}

def find_audio_files(base_path):
    if not os.path.exists(base_path):
        print(f"❌ Path not found: {base_path}")
        return []
    audio_files = []
    for root, _, files in os.walk(base_path):
        folder = os.path.basename(root).lower()
        gt = CUSTOM_FOLDER_MAPPING.get(folder, "unknown")
        for file in files:
            if file.lower().endswith(('.wav', '.mp3', '.m4a', '.flac', '.ogg')):
                audio_files.append({
                    "file_path": os.path.join(root, file),
                    "filename": file,
                    "ground_truth": gt
                })
    print(f"✅ Found {len(audio_files)} audio files.")
    if audio_files:
        print("📊 Ground Truth Distribution:")
        print(pd.Series([f['ground_truth'] for f in audio_files]).value_counts())
    return audio_files

print("✅ Audio & prediction functions ready.")
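
# Minimal single-file usage sketch (an addition; the path below is hypothetical):
_demo = os.path.join(AUDIO_FOLDER, "hi", "example.wav")
if os.path.exists(_demo):
    print(predict_language(load_audio(_demo)))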

# ============================================================================
# CELL 5: BATCH PROCESSING -> CSV
# ============================================================================
def run_batch_processing():
    files = find_audio_files(AUDIO_FOLDER)
    if not files:
        return pd.DataFrame()

    results = []
    total = len(files)
    print("\n🔄 Processing audio files...")
    for i, f in enumerate(files, 1):
        print(f"  [{i}/{total}] Processing {f['filename']}...", end="")
        audio_tensor = load_audio(f['file_path'])
        pred = predict_language(audio_tensor)
        if pred['predicted_language'] == 'error':
            print(f" -> Error: {pred.get('error_message', 'Unknown error')}")
        else:
            print(f" -> Predicted: {pred['predicted_language']}")
        results.append({**f, **pred})

    df = pd.DataFrame(results)
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    out_csv = f"{RESULTS_FOLDER}/voxlect_results_{ts}.csv"
    df.to_csv(out_csv, index=False)
    print(f"\n✅ Saved results to: {out_csv}")
    return df

results_df = run_batch_processing()

# ============================================================================
# CELL 6: DETAILED ACCURACY ANALYSIS
# ============================================================================
import ast

def run_detailed_analysis(df):
    print("\n" + "=" * 70)
    print("📊 DETAILED ACCURACY ANALYSIS")
    print("=" * 70)

    valid = df[(df['ground_truth'] != 'unknown') & (df['predicted_language'] != 'error')].copy()
    if valid.empty:
        print("❌ No valid results for analysis.")
        return

    y_true = valid['ground_truth'].values
    y_pred = valid['predicted_language'].values

    print(f"\n🎯 Overall Accuracy (Top-1): {accuracy_score(y_true, y_pred):.2%}")

    labels = sorted(set(list(y_true) + list(y_pred)))
    print("\n📋 Classification Report:")
    print(classification_report(y_true, y_pred, labels=labels, zero_division=0))

    print("\n📊 Confusion Matrix:")
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    cm_df = pd.DataFrame(cm, index=labels, columns=labels)
    display(cm_df)

    mis = valid[valid['ground_truth'] != valid['predicted_language']].copy()
    if not mis.empty:
        print("\n❌ Top 10 Most Common Misclassifications:")
        top_errs = (mis.groupby(['ground_truth', 'predicted_language'])
                    .size().sort_values(ascending=False).head(10))
        print(top_errs)

        if 'top_5_predictions' in mis.columns:
            def correct_rank(row):
                true_lang = row['ground_truth']
                preds = row['top_5_predictions']
                if isinstance(preds, str):
                    try:
                        # literal_eval instead of eval: same result for the
                        # list-of-dict strings a CSV round-trip produces,
                        # without executing arbitrary code
                        preds = ast.literal_eval(preds)
                    except Exception:
                        return None
                for p in preds:
                    if p.get('language') == true_lang:
                        return p.get('rank')
                return None

            mis['correct_rank_in_top5'] = mis.apply(correct_rank, axis=1)
            c_in_top5 = mis['correct_rank_in_top5'].notna().sum()
            print(f"\n🔍 Correct language in Top-5 for misclassified: {c_in_top5}/{len(mis)} ({c_in_top5/len(mis):.1%})")

            top1_correct = (valid['ground_truth'] == valid['predicted_language']).sum()
            top5_acc = (top1_correct + c_in_top5) / len(valid)
            print(f"🎯 Overall Accuracy (Top-5): {top5_acc:.2%}")

    print("\n🎉 Analysis complete!")

if 'results_df' in locals() and not results_df.empty:
    run_detailed_analysis(results_df)
else:
    print("Run Cell 5 first to generate results.")

# ============================================================================
# CELL: EXPORT DETAILED ANALYSIS TO EXCEL
# Requires: results_df (from Cell 5), LABEL_LIST, RESULTS_FOLDER
# ============================================================================
import ast
import json
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def export_detailed_excel(results_df, results_folder, label_order=None, filename_prefix="voxlect_analysis"):
    # Guard
    if results_df is None or results_df.empty:
        print("❌ No results to export. Run batch processing first.")
        return None

    # Prepare valid subset
    valid = results_df[
        (results_df['ground_truth'] != 'unknown') &
        (results_df['predicted_language'] != 'error') &
        results_df['predicted_language'].notna()
    ].copy()

    # Choose label order for confusion matrix/report
    if label_order is None:
        label_order = sorted(set(valid['ground_truth']) | set(valid['predicted_language']))

    # Overall Top-1 accuracy
    top1_acc = accuracy_score(valid['ground_truth'], valid['predicted_language'])

    # Top-5 accuracy (if top_5_predictions available)
    top5_acc = None
    if 'top_5_predictions' in valid.columns:
        def correct_in_top5(row):
            target = row['ground_truth']
            preds = row['top_5_predictions']
            if isinstance(preds, str):
                try:
                    preds = json.loads(preds)
                except Exception:
                    try:
                        # literal_eval handles the single-quoted repr strings a
                        # CSV round-trip produces, without eval's security risk
                        preds = ast.literal_eval(preds)
                    except Exception:
                        return False
            for p in preds:
                if p.get('language') == target:
                    return True
            return False

        valid['correct_in_top5'] = valid.apply(correct_in_top5, axis=1)
        top5_acc = (valid['correct_in_top5'].sum() / len(valid)) if len(valid) else None

    # Classification report (dict -> DataFrame)
    cls_report = classification_report(
        valid['ground_truth'],
        valid['predicted_language'],
        labels=label_order,
        zero_division=0,
        output_dict=True
    )
    report_df = pd.DataFrame(cls_report).T.reset_index().rename(columns={"index": "label"})

    # Confusion matrix
    cm = confusion_matrix(valid['ground_truth'], valid['predicted_language'], labels=label_order)
    cm_df = pd.DataFrame(cm, index=label_order, columns=label_order)
    cm_df.index.name = "True\\Pred"

    # Top misclassifications
    mis = valid[valid['ground_truth'] != valid['predicted_language']].copy()
    if not mis.empty:
        top_mis = (mis.groupby(['ground_truth', 'predicted_language'])
                   .size()
                   .reset_index(name='count')
                   .sort_values('count', ascending=False))
    else:
        top_mis = pd.DataFrame(columns=['ground_truth', 'predicted_language', 'count'])

    # Overview sheet
    overview_rows = [
        {"metric": "total_predictions", "value": int(len(results_df))},
        {"metric": "valid_predictions_for_eval", "value": int(len(valid))},
        {"metric": "top1_accuracy", "value": float(top1_acc) if len(valid) else None},
        {"metric": "top5_accuracy", "value": float(top5_acc) if top5_acc is not None else None},
    ]
    overview_df = pd.DataFrame(overview_rows)

    # Full predictions (convert top_5_predictions to JSON string for readability)
    full_preds = results_df.copy()
    if 'top_5_predictions' in full_preds.columns:
        full_preds['top_5_predictions'] = full_preds['top_5_predictions'].apply(
            lambda x: json.dumps(x, ensure_ascii=False) if isinstance(x, (list, dict)) else str(x)
        )

    # Write to Excel with multiple sheets
    ts = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
    out_xlsx = os.path.join(results_folder, f"{filename_prefix}_{ts}.xlsx")

    with pd.ExcelWriter(out_xlsx, engine="xlsxwriter") as writer:
        overview_df.to_excel(writer, sheet_name="overview", index=False)
        report_df.to_excel(writer, sheet_name="per_language_metrics", index=False)
        cm_df.to_excel(writer, sheet_name="confusion_matrix")
        top_mis.to_excel(writer, sheet_name="top_misclassifications", index=False)
        full_preds.to_excel(writer, sheet_name="full_predictions", index=False)

        # Optional: auto column width, computed from the in-memory frames (the
        # original re-read out_xlsx here, but the file does not exist on disk
        # until the writer closes, so that always failed silently)
        sheet_frames = {
            "overview": overview_df,
            "per_language_metrics": report_df,
            "confusion_matrix": cm_df.reset_index(),
            "top_misclassifications": top_mis,
            "full_predictions": full_preds,
        }
        for sheet, frame in sheet_frames.items():
            try:
                ws = writer.sheets[sheet]
                for i, col in enumerate(frame.columns):
                    width = max(12, min(60, int(frame[col].astype(str).map(len).max())))
                    ws.set_column(i, i, width)
            except Exception:
                pass

    print(f"✅ Excel report saved to: {out_xlsx}")
    return out_xlsx

# Run export
excel_path = export_detailed_excel(results_df, RESULTS_FOLDER, label_order=sorted(set(LABEL_LIST)))
print("Excel path:", excel_path)

# ============================================================================
# CELL: TOP-5 RANK POSITION ANALYSIS
# Requires: results_df (from batch), where 'top_5_predictions' is a list/dict or JSON string
# ============================================================================
import ast
import json
import pandas as pd
import numpy as np

def parse_top5(cell):
    """Return a list of dicts [{'rank':int,'language':str,'confidence':float}, ...] from cell."""
    if isinstance(cell, list):
        return cell
    if isinstance(cell, dict):
        return [cell]
    if isinstance(cell, str):
        # Try JSON first, then literal_eval as a safe fallback for repr strings
        try:
            v = json.loads(cell)
            return v if isinstance(v, list) else [v]
        except Exception:
            try:
                v = ast.literal_eval(cell)
                return v if isinstance(v, list) else [v]
            except Exception:
                return []
    return []

def compute_rank_in_top5(df):
    """Add 'correct_rank_in_top5' and a readable verdict column per row."""
    df = df.copy()

    def get_rank(row):
        gt = row.get('ground_truth', None)
        preds = parse_top5(row.get('top_5_predictions', []))
        if not gt or not preds:
            return None
        for p in preds:
            if isinstance(p, dict) and p.get('language') == gt:
                # Ensure rank is int-like and 1-based
                r = p.get('rank', None)
                try:
                    r = int(r)
                    if 1 <= r <= 5:
                        return r
                except Exception:
                    pass
        return None

    df['correct_rank_in_top5'] = df.apply(get_rank, axis=1)
    df['top5_verdict'] = df['correct_rank_in_top5'].apply(
        lambda r: f"Rank {int(r)}" if pd.notna(r) else "Not-in-Top-5"
    )
    return df

def per_language_rank_summary(df):
    """Build a per-language summary of rank distribution (1..5 and Not-in-Top-5)."""
    # Consider only rows with known ground truth and a prediction attempt
    subset = df[(df['ground_truth'].notna()) & (df['predicted_language'] != 'error')].copy()
    subset['rank_bin'] = subset['correct_rank_in_top5'].apply(lambda r: int(r) if pd.notna(r) else 0)  # 0 = Not-in-Top-5

    # Pivot counts per language vs rank_bin
    counts = (subset
              .groupby(['ground_truth', 'rank_bin'])
              .size()
              .reset_index(name='count'))

    # Make a wide table with columns for Rank1..Rank5 and Not-in-Top-5
    rank_cols = {0: "Not-in-Top-5", 1: "Rank 1", 2: "Rank 2", 3: "Rank 3", 4: "Rank 4", 5: "Rank 5"}
    summary = (counts
               .assign(rank_label=lambda x: x['rank_bin'].map(rank_cols))
               .pivot(index='ground_truth', columns='rank_label', values='count')
               .fillna(0)
               .astype(int)
               .reset_index()
               .rename(columns={'ground_truth': 'language'}))

    # Add totals and in-top-5 rate
    summary['Total'] = summary[[c for c in summary.columns if c.startswith('Rank ') or c == 'Not-in-Top-5']].sum(axis=1)
    in_top5_cols = [c for c in summary.columns if c.startswith('Rank ')]
    summary['In-Top-5'] = summary[in_top5_cols].sum(axis=1)
    summary['In-Top-5 Rate'] = (summary['In-Top-5'] / summary['Total']).replace([np.inf, np.nan], 0.0)

    # Order columns nicely
    ordered_cols = ['language', 'Total', 'In-Top-5', 'In-Top-5 Rate', 'Rank 1', 'Rank 2', 'Rank 3', 'Rank 4', 'Rank 5', 'Not-in-Top-5']
    for c in ordered_cols:
        if c not in summary.columns:
            summary[c] = 0 if c != 'In-Top-5 Rate' else 0.0
    summary = summary[ordered_cols]
    return summary.sort_values(by=['In-Top-5 Rate', 'Rank 1', 'Rank 2', 'Rank 3', 'Rank 4', 'Rank 5'], ascending=False)

# 1) Compute per-row rank
results_ranked = compute_rank_in_top5(results_df)

# 2) Show first few rows with the new columns
display(results_ranked[['filename', 'ground_truth', 'predicted_language', 'top5_verdict', 'correct_rank_in_top5']].head(20))

# 3) Build per-language summary
rank_summary = per_language_rank_summary(results_ranked)
display(rank_summary)

# 4) Optionally save both to Excel or CSV
from datetime import datetime
ts = datetime.now().strftime("%Y%m%d_%H%M%S")

rank_csv = os.path.join(RESULTS_FOLDER, f"top5_rank_per_file_{ts}.csv")
results_ranked.to_csv(rank_csv, index=False)
print("✅ Per-file Top-5 rank CSV:", rank_csv)

summary_csv = os.path.join(RESULTS_FOLDER, f"top5_rank_summary_{ts}.csv")
rank_summary.to_csv(summary_csv, index=False)
print("✅ Per-language Top-5 rank summary CSV:", summary_csv)

# ============================================================================
# CELL A: FEATURE EXTRACTION (DURATION, SNR, SILENCE RATIO)
# ============================================================================
import librosa
import numpy as np
import pandas as pd
import os

def compute_features(row, target_sr=16000):
    p = row['file_path']
    try:
        y, sr = librosa.load(p, sr=target_sr, mono=True)
        dur = len(y) / target_sr

        # Energy-based SNR proxy: ratio of voiced/active RMS to global RMS (not true SNR but indicative)
        rms = librosa.feature.rms(y=y, frame_length=2048, hop_length=512)[0]
        global_rms = np.sqrt(np.mean(y**2) + 1e-12)
        active_mask = rms > 0.1 * np.max(rms) if rms.size else np.array([False])
        active_rms = np.mean(rms[active_mask]) if active_mask.any() else 0.0
        snr_proxy = 20.0 * np.log10((active_rms + 1e-9) / (global_rms + 1e-9))

        # Silence ratio: frames below threshold
        thr = 0.02 * np.max(rms) if rms.size else 0.0
        silence_ratio = float((rms < thr).mean() if rms.size else 1.0)

        # Spectral centroid mean (proxy for brightness / channel)
        sc = librosa.feature.spectral_centroid(y=y, sr=target_sr)[0]
        sc_mean = float(np.mean(sc)) if sc.size else 0.0

        return pd.Series({
            'duration_s': dur,
            'snr_proxy_db': float(snr_proxy),
            'silence_ratio': silence_ratio,
            'spec_centroid_mean': sc_mean
        })
    except Exception:
        return pd.Series({
            'duration_s': np.nan,
            'snr_proxy_db': np.nan,
            'silence_ratio': np.nan,
            'spec_centroid_mean': np.nan
        })

features = results_df.apply(compute_features, axis=1)
results_feat = pd.concat([results_df, features], axis=1)
print("✅ Features added: ['duration_s', 'snr_proxy_db', 'silence_ratio', 'spec_centroid_mean']")
display(results_feat.head())

# ============================================================================
# CELL B: CALIBRATION & EXPECTED CALIBRATION ERROR (ECE)
# ============================================================================
import ast
import json
import numpy as np
import pandas as pd

def extract_top1_conf(row):
    preds = row.get('top_5_predictions', [])
    if isinstance(preds, str):
        try:
            preds = json.loads(preds)
        except Exception:
            try:
                # literal_eval is the safe fallback; a bare eval here could
                # raise and abort the whole apply()
                preds = ast.literal_eval(preds)
            except Exception:
                return np.nan
    if isinstance(preds, list) and preds:
        return float(preds[0].get('confidence', np.nan))
    return np.nan

def compute_ece(df, n_bins=15):
    df = df.copy()
    df['top1_conf'] = df.apply(extract_top1_conf, axis=1)
    df = df[(df['predicted_language'] != 'error') & df['top1_conf'].notna()]

    conf = df['top1_conf'].to_numpy()
    correct = (df['predicted_language'] == df['ground_truth']).to_numpy().astype(float)

    bins = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    bin_stats = []
    for i in range(n_bins):
        m, M = bins[i], bins[i+1]
        mask = (conf >= m) & (conf < M) if i < n_bins - 1 else (conf >= m) & (conf <= M)
        if mask.any():
            acc = correct[mask].mean()
            conf_mean = conf[mask].mean()
            wt = mask.mean()
            ece += wt * abs(acc - conf_mean)
            bin_stats.append({'bin_low': m, 'bin_high': M, 'bin_acc': acc, 'bin_conf': conf_mean, 'weight': wt})
        else:
            bin_stats.append({'bin_low': m, 'bin_high': M, 'bin_acc': np.nan, 'bin_conf': np.nan, 'weight': 0.0})

    return ece, pd.DataFrame(bin_stats)

ece, ece_bins = compute_ece(results_feat)
print(f"🎯 Expected Calibration Error (ECE): {ece:.4f}")
display(ece_bins)
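
# Toy sanity check for compute_ece (an addition; rows and values chosen by hand).
# With n_bins=2 both rows land in the [0.5, 1.0] bin: accuracy 0.5, mean
# confidence 0.75, weight 1.0, so ECE = |0.5 - 0.75| = 0.25.
_toy = pd.DataFrame([
    {"ground_truth": "hindi", "predicted_language": "hindi",
     "top_5_predictions": [{"rank": 1, "language": "hindi", "confidence": 1.0}]},
    {"ground_truth": "tamil", "predicted_language": "telugu",
     "top_5_predictions": [{"rank": 1, "language": "telugu", "confidence": 0.5}]},
])
_toy_ece, _ = compute_ece(_toy, n_bins=2)
print(f"toy ECE = {_toy_ece:.2f} (expected 0.25)")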

# ============================================================================
# CELL C: ROBUSTNESS SLICES (DURATION, SNR, SILENCE)
# ============================================================================
import numpy as np
import pandas as pd

def slice_acc(df, col, bins):
    df = df.copy()
    df = df[(df['predicted_language'] != 'error') & df[col].notna()]
    labels = [f"[{bins[i]:.2f},{bins[i+1]:.2f})" for i in range(len(bins)-1)]
    df['bin'] = pd.cut(df[col], bins=bins, labels=labels, include_lowest=True)
    grp = df.groupby('bin').apply(lambda x: (x['ground_truth'] == x['predicted_language']).mean())
    return grp.reset_index(name=f'accuracy_by_{col}')

dur_bins = [0, 2, 4, 6, 8, 12, np.inf]
snr_bins = [-40, -10, 0, 5, 10, 20, np.inf]
sil_bins = [0, 0.2, 0.4, 0.6, 0.8, 1.01]

acc_dur = slice_acc(results_feat, 'duration_s', dur_bins)
acc_snr = slice_acc(results_feat, 'snr_proxy_db', snr_bins)
acc_sil = slice_acc(results_feat, 'silence_ratio', sil_bins)

print("⏱️ Accuracy vs Duration:")
display(acc_dur)
print("🔊 Accuracy vs SNR proxy:")
display(acc_snr)
print("🤫 Accuracy vs Silence ratio:")
display(acc_sil)

# ============================================================================
# CELL D: CONFUSION ASYMMETRY TABLE
# ============================================================================
import pandas as pd

valid = results_df[(results_df['ground_truth'] != 'unknown') & (results_df['predicted_language'] != 'error')].copy()
pairs = valid[valid['ground_truth'] != valid['predicted_language']][['ground_truth', 'predicted_language']]
flow = (pairs.groupby(['ground_truth', 'predicted_language']).size()
        .reset_index(name='count')
        .sort_values('count', ascending=False))
print("🔀 Top asymmetric confusions:")
display(flow.head(30))
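
# Hedged addition: make the asymmetry explicit as net flow per language pair,
# i.e. count(A->B) minus count(B->A); the table above only ranks raw counts.
_cnt = flow.set_index(['ground_truth', 'predicted_language'])['count']
_seen, _net_rows = set(), []
for (a, b) in _cnt.index:
    x, y = sorted((a, b))
    if (x, y) in _seen:
        continue
    _seen.add((x, y))
    n_xy = int(_cnt.get((x, y), 0))
    n_yx = int(_cnt.get((y, x), 0))
    _net_rows.append({'lang_a': x, 'lang_b': y, 'a_to_b': n_xy, 'b_to_a': n_yx,
                      'net_a_to_b': n_xy - n_yx})
if _net_rows:
    display(pd.DataFrame(_net_rows).sort_values('net_a_to_b', key=abs, ascending=False).head(15))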

# ============================================================================
# CELL E: EMBEDDING CLUSTER QUALITY (SILHOUETTE)
# ============================================================================

import torch
import numpy as np
from sklearn.metrics import silhouette_score
import pandas as pd

def extract_embeddings(df):
    embs = []
    labs = []
    for _, row in df.iterrows():
        t = load_audio(row['file_path'])
        if t is None:
            continue
        with torch.no_grad():
            # Many wrappers return (logits, features); if not, skip
            try:
                logits, feat = model(t.to(device), return_feature=True)
                # Flatten feature vector (assume [1, D] or [1, T, D] -> take mean over time)
                if feat is None:
                    continue
                feat_np = feat.detach().cpu().numpy()
                if feat_np.ndim == 3:  # [B, T, D]
                    feat_np = feat_np.mean(axis=1)
                elif feat_np.ndim == 2:  # [B, D]
                    pass
                else:
                    continue
                embs.append(feat_np.squeeze(0))
                labs.append(row['ground_truth'])
            except Exception:
                continue
    if not embs:
        return None, None
    X = np.vstack(embs)
    y = np.array(labs)
    return X, y

# Note: 'valid' comes from Cell D above; up to 20 files per language are sampled
sample = valid.groupby('ground_truth').head(20).reset_index(drop=True) if len(valid) > 0 else pd.DataFrame()
X, y = extract_embeddings(sample) if not sample.empty else (None, None)
if X is not None and len(np.unique(y)) > 1 and len(y) >= 10:
    sil = silhouette_score(X, y, metric='euclidean')
    print(f"📈 Silhouette score (higher=better cluster separation): {sil:.3f}")
else:
    print("ℹ️ Not enough data or embeddings to compute silhouette score.")
|
| 716 |
+
# CELL F: HARD-EXAMPLE MINING
|
| 717 |
+
# ============================================================================
|
| 718 |
+
import pandas as pd
|
| 719 |
+
import json
|
| 720 |
+
|
| 721 |
+
def top5_gap(row):
|
| 722 |
+
preds = row.get('top_5_predictions', [])
|
| 723 |
+
if isinstance(preds, str):
|
| 724 |
+
try:
|
| 725 |
+
preds = json.loads(preds)
|
| 726 |
+
except Exception:
|
| 727 |
+
preds = eval(preds)
|
| 728 |
+
if not preds or len(preds) < 2:
|
| 729 |
+
return np.nan
|
| 730 |
+
return float(preds[0]['confidence'] - preds[1]['confidence'])
|
| 731 |
+
|
| 732 |
+
valid = results_feat[(results_feat['ground_truth'] != 'unknown') & (results_feat['predicted_language'] != 'error')].copy()
|
| 733 |
+
valid['top5_gap'] = valid.apply(top5_gap, axis=1)
|
| 734 |
+
|
| 735 |
+
# Hardest misclassifications: small margin, wrong prediction
|
| 736 |
+
hard_mis = valid[valid['ground_truth'] != valid['predicted_language']].copy()
|
| 737 |
+
hard_mis = hard_mis.sort_values(['top5_gap','snr_proxy_db','duration_s'], ascending=[True, True, True]).head(30)
|
| 738 |
+
print("π₯ Hardest misclassifications (low margin, low SNR/duration):")
|
| 739 |
+
display(hard_mis[['filename','ground_truth','predicted_language','top5_gap','snr_proxy_db','duration_s','silence_ratio']])
|
| 740 |
+
|
| 741 |
+
# Ambiguous-but-correct: small margin but correct prediction
|
| 742 |
+
ambig_correct = valid[valid['ground_truth'] == valid['predicted_language']].copy()
|
| 743 |
+
ambig_correct = ambig_correct.sort_values(['top5_gap','snr_proxy_db','duration_s'], ascending=[True, True, True]).head(30)
|
| 744 |
+
print("π Ambiguous but correct (low margin):")
|
| 745 |
+
display(ambig_correct[['filename','ground_truth','predicted_language','top5_gap','snr_proxy_db','duration_s','silence_ratio']])
|
| 746 |
+
|
| 747 |
+
|

# ============================================================================
# CELL G: SAVE EXTENDED ANALYSIS TO EXCEL
# ============================================================================
import sys, subprocess, os

def ensure_pkg(pkg):
    try:
        __import__(pkg)
    except Exception:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", pkg])

ensure_pkg("xlsxwriter")

ts = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
xlsx_path = os.path.join(RESULTS_FOLDER, f"voxlect_extended_analysis_{ts}.xlsx")

with pd.ExcelWriter(xlsx_path, engine="xlsxwriter") as w:
    results_feat.to_excel(w, sheet_name="results_with_features", index=False)
    if 'ece' in locals():
        pd.DataFrame([{'ECE': ece}]).to_excel(w, sheet_name="calibration_overview", index=False)
        ece_bins.to_excel(w, sheet_name="calibration_bins", index=False)
    acc_dur.to_excel(w, sheet_name="acc_vs_duration", index=False)
    acc_snr.to_excel(w, sheet_name="acc_vs_snr", index=False)
    acc_sil.to_excel(w, sheet_name="acc_vs_silence", index=False)
    flow.to_excel(w, sheet_name="confusion_asymmetry", index=False)
    if 'hard_mis' in locals():
        hard_mis.to_excel(w, sheet_name="hard_misclassifications", index=False)
    if 'ambig_correct' in locals():
        ambig_correct.to_excel(w, sheet_name="ambiguous_correct", index=False)

print("✅ Extended analysis Excel saved to:", xlsx_path)

requirements.txt ADDED
@@ -0,0 +1,4 @@
numpy
pandas
torch
transformers
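
# Note (a hedged addition, not part of the commit): app.py also imports librosa,
# scikit-learn, and xlsxwriter, which are missing from the four pins above and
# would likely need to be installed alongside them.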