kasimali committed on
Commit
5fae178
·
verified ·
1 Parent(s): 8e5c07e

Upload folder using huggingface_hub

Files changed (3)
  1. README.md +3 -6
  2. app.py +778 -0
  3. requirements.txt +4 -0
README.md CHANGED
@@ -1,10 +1,7 @@
  ---
- title: Voxlect
- emoji: 🐠
- colorFrom: gray
- colorTo: blue
+ title: VOXLECT
+ emoji: 🚀
  sdk: static
- pinned: false
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # VOXLECT
app.py ADDED
@@ -0,0 +1,778 @@
# VOXLECT

# ============================================================================
# CELL 1: SETUP AND INSTALLATION (VERIFIED)
# ============================================================================
import os
import sys
import warnings
warnings.filterwarnings('ignore')

print("🚀 VoxLect Indic LID Whisper Large v3 - Final Setup")
print("=" * 60)

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Install packages (list inferred from the imports used below; adjust as needed)
print("📦 Installing packages...")
!pip install -q transformers librosa pandas scikit-learn xlsxwriter

# Clone VoxLect repository (URL assumed; substitute the official repo if it differs)
print("📥 Cloning VoxLect repository...")
!git clone https://github.com/tiantiaf0627/voxlect.git /content/voxlect

# Make the cloned package importable
sys.path.insert(0, '/content/voxlect')
sys.path.insert(0, '/content/voxlect/src')

print("✅ Installation complete!")

# ============================================================================
# CELL 2: MANDATORY MONKEY PATCH (ATTENTION COMPATIBILITY)
# ============================================================================
import transformers.models.whisper.modeling_whisper as whisper_modeling

print("🔧 Applying attention compatibility patch...")

_OriginalWhisperAttention = whisper_modeling.WhisperAttention

# Some wrapper/transformers version combinations query the attention
# implementation at build time; fall back to eager attention whenever the
# installed transformers does not provide one.
class PatchedWhisperAttention(_OriginalWhisperAttention):
    def _get_attn_impl(self):
        try:
            attn_impl = super()._get_attn_impl()
            if attn_impl is None:
                return "eager"
            return attn_impl
        except AttributeError:
            return "eager"

whisper_modeling.WhisperAttention = PatchedWhisperAttention

print("✅ Monkey patch applied.")

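# Optional sanity check (a minimal sketch added for verification): confirm
# that transformers will resolve the patched class when it builds Whisper layers.
assert whisper_modeling.WhisperAttention is PatchedWhisperAttention
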
# ============================================================================
# CELL 3: MODEL LOADING & LABEL LIST
# ============================================================================
import torch
import torch.nn.functional as F
import librosa
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from IPython.display import display

# Import VoxLect after patch
from src.model.dialect.whisper_dialect import WhisperWrapper

# Folder code -> ground truth mapping
CUSTOM_FOLDER_MAPPING = {
    "as": "assamese", "bn": "bengali", "br": "bodo", "doi": "dogri",
    "en": "english", "gu": "gujarati", "hi": "hindi", "kn": "kannada",
    "ks": "kashmiri", "kok": "konkani", "mai": "maithili", "ml": "malayalam",
    "mni": "manipuri", "mr": "marathi", "ne": "nepali", "or": "odia",
    "pa": "punjabi", "sa": "sanskrit", "sat": "santali", "sd": "sindhi",
    "ta": "tamil", "te": "telugu", "ur": "urdu"
}

# IMPORTANT: label order used by the model (adjust if the model card lists a different order)
LABEL_LIST = [
    "assamese", "bengali", "bodo", "dogri", "english", "gujarati",
    "hindi", "kannada", "kashmiri", "konkani", "maithili", "malayalam",
    "manipuri", "marathi", "nepali", "odia", "punjabi", "sanskrit",
    "santali", "sindhi", "tamil", "telugu", "urdu"
]

# Update these paths
AUDIO_FOLDER = "/content/drive/MyDrive/Audio_files"  # <-- set your path
RESULTS_FOLDER = "/content/drive/MyDrive/voxlect_1_results"
os.makedirs(RESULTS_FOLDER, exist_ok=True)

# Device and model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MODEL_NAME = "tiantiaf/voxlect-indic-lid-whisper-large-v3"
print(f"🔧 Device: {device}")

print(f"🔄 Loading model: {MODEL_NAME}")
model = WhisperWrapper.from_pretrained(MODEL_NAME).to(device)
model.eval()
print("✅ Model loaded successfully!")

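# Optional cross-check (a sketch; assumes the wrapper may expose a Hugging Face
# config with id2label): the LABEL_LIST order above is an assumption, so warn
# if the checkpoint itself reports a different ordering.
_cfg = getattr(model, "config", None)
_id2label = getattr(_cfg, "id2label", None)
if isinstance(_id2label, dict) and len(_id2label) == len(LABEL_LIST):
    _cfg_labels = [str(_id2label.get(i, _id2label.get(str(i)))).lower() for i in range(len(_id2label))]
    if _cfg_labels != LABEL_LIST:
        print("⚠️ LABEL_LIST order differs from model config id2label:", _cfg_labels)
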
# ============================================================================
# CELL 4: AUDIO IO & PREDICTION (ROBUST)
# ============================================================================
def trim_silence(audio, threshold=0.01, win=2048, hop=512):
    rms = librosa.feature.rms(y=audio, frame_length=win, hop_length=hop)[0]
    mask = rms > threshold * (rms.max() if rms.size else 1.0)
    if not mask.any():
        return audio
    idx = np.where(mask)[0]
    start = max(int(idx[0] * hop), 0)
    end = min(int((idx[-1] + 1) * hop), len(audio))
    return audio[start:end]

def load_audio(file_path, target_sr=16000, max_duration=15.0):
    try:
        audio, sr = librosa.load(file_path, sr=target_sr, mono=True)

        # Optional: trim leading/trailing silence to improve discrimination
        audio = trim_silence(audio, threshold=0.01)

        # Duration control
        max_samples = int(max_duration * target_sr)
        if len(audio) > max_samples:
            audio = audio[:max_samples]
        min_samples = int(3.0 * target_sr)
        if len(audio) < min_samples:
            audio = np.pad(audio, (0, min_samples - len(audio)), 'constant')

        # Normalize peak to 1 for stability across files
        peak = np.abs(audio).max()
        if peak > 0:
            audio = audio / peak

        # Diagnostics (optional; comment out after verifying)
        # print(f"dbg: {os.path.basename(file_path)} len={len(audio)} mean={audio.mean():.4f} std={audio.std():.4f}")

        return torch.from_numpy(audio).float().unsqueeze(0)
    except Exception as e:
        print(f" ❌ Error loading {os.path.basename(file_path)}: {e}")
        return None

def predict_language(audio_tensor, k=5):
    if audio_tensor is None:
        return {"predicted_language": "error", "confidence": 0.0, "top_5_predictions": [], "error_message": "Audio load failed"}
    try:
        audio_tensor = audio_tensor.to(device)
        with torch.no_grad():
            logits, _ = model(audio_tensor, return_feature=True)  # expect [1, C]
            if logits.dim() == 1:
                logits = logits.unsqueeze(0)
            if logits.size(0) != 1:
                logits = logits[:1, :]

            probs = F.softmax(logits, dim=1)[0]  # softmax over classes
            top_probs, top_idx = torch.topk(probs, k)

        top = []
        for rank, (p, ix) in enumerate(zip(top_probs.tolist(), top_idx.tolist()), start=1):
            idx = int(ix)
            lang = LABEL_LIST[idx] if 0 <= idx < len(LABEL_LIST) else f"unknown_{idx}"
            top.append({"rank": rank, "language": lang, "confidence": float(p)})

        return {"predicted_language": top[0]["language"], "confidence": top[0]["confidence"], "top_5_predictions": top}
    except Exception as e:
        return {"predicted_language": "error", "confidence": 0.0, "top_5_predictions": [], "error_message": str(e)}

def find_audio_files(base_path):
    if not os.path.exists(base_path):
        print(f"❌ Path not found: {base_path}")
        return []
    audio_files = []
    for root, _, files in os.walk(base_path):
        folder = os.path.basename(root).lower()
        gt = CUSTOM_FOLDER_MAPPING.get(folder, "unknown")
        for file in files:
            if file.lower().endswith(('.wav', '.mp3', '.m4a', '.flac', '.ogg')):
                audio_files.append({
                    "file_path": os.path.join(root, file),
                    "filename": file,
                    "ground_truth": gt
                })
    print(f"✅ Found {len(audio_files)} audio files.")
    if audio_files:
        print("📊 Ground Truth Distribution:")
        print(pd.Series([f['ground_truth'] for f in audio_files]).value_counts())
    return audio_files

print("✅ Audio & prediction functions ready.")

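# Quick smoke test (a sketch; safe no-op when AUDIO_FOLDER is empty or the
# Drive is not mounted): run a single discovered file end-to-end before the batch.
_sample = find_audio_files(AUDIO_FOLDER)[:1]
if _sample:
    print("🔎 Smoke test:", predict_language(load_audio(_sample[0]['file_path'])))
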
# ============================================================================
# CELL 5: BATCH PROCESSING -> CSV
# ============================================================================
def run_batch_processing():
    files = find_audio_files(AUDIO_FOLDER)
    if not files:
        return pd.DataFrame()

    results = []
    total = len(files)
    print("\n🚀 Processing audio files...")
    for i, f in enumerate(files, 1):
        print(f" [{i}/{total}] Processing {f['filename']}...", end="")
        audio_tensor = load_audio(f['file_path'])
        pred = predict_language(audio_tensor)
        if pred['predicted_language'] == 'error':
            print(f" -> Error: {pred.get('error_message', 'Unknown error')}")
        else:
            print(f" -> Predicted: {pred['predicted_language']}")
        results.append({**f, **pred})

    df = pd.DataFrame(results)
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    out_csv = f"{RESULTS_FOLDER}/voxlect_results_{ts}.csv"
    df.to_csv(out_csv, index=False)
    print(f"\n✅ Saved results to: {out_csv}")
    return df

results_df = run_batch_processing()

# ============================================================================
# CELL 6: DETAILED ACCURACY ANALYSIS
# ============================================================================
def run_detailed_analysis(df):
    print("\n" + "=" * 70)
    print("📊 DETAILED ACCURACY ANALYSIS")
    print("=" * 70)

    valid = df[(df['ground_truth'] != 'unknown') & (df['predicted_language'] != 'error')].copy()
    if valid.empty:
        print("❌ No valid results for analysis.")
        return

    y_true = valid['ground_truth'].values
    y_pred = valid['predicted_language'].values

    print(f"\n🎯 Overall Accuracy (Top-1): {accuracy_score(y_true, y_pred):.2%}")

    labels = sorted(set(list(y_true) + list(y_pred)))
    print("\n📈 Classification Report:")
    print(classification_report(y_true, y_pred, labels=labels, zero_division=0))

    print("\n🔀 Confusion Matrix:")
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    cm_df = pd.DataFrame(cm, index=labels, columns=labels)
    display(cm_df)

    mis = valid[valid['ground_truth'] != valid['predicted_language']].copy()
    if not mis.empty:
        print("\n❌ Top 10 Most Common Misclassifications:")
        top_errs = (mis.groupby(['ground_truth', 'predicted_language'])
                    .size().sort_values(ascending=False).head(10))
        print(top_errs)

        if 'top_5_predictions' in mis.columns:
            def correct_rank(row):
                true_lang = row['ground_truth']
                preds = row['top_5_predictions']
                if isinstance(preds, str):
                    # CSV round-trips store the list as a repr string; eval
                    # restores it (ast.literal_eval is the safer equivalent)
                    try:
                        preds = eval(preds)
                    except Exception:
                        return None
                for p in preds:
                    if p.get('language') == true_lang:
                        return p.get('rank')
                return None

            mis['correct_rank_in_top5'] = mis.apply(correct_rank, axis=1)
            c_in_top5 = mis['correct_rank_in_top5'].notna().sum()
            print(f"\n🔍 Correct language in Top-5 for misclassified: {c_in_top5}/{len(mis)} ({c_in_top5/len(mis):.1%})")

            top1_correct = (valid['ground_truth'] == valid['predicted_language']).sum()
            top5_acc = (top1_correct + c_in_top5) / len(valid)
            print(f"🎯 Overall Accuracy (Top-5): {top5_acc:.2%}")

    print("\n🏁 Analysis complete!")

if 'results_df' in locals() and not results_df.empty:
    run_detailed_analysis(results_df)
else:
    print("Run Cell 5 first to generate results.")

# ============================================================================
# CELL: EXPORT DETAILED ANALYSIS TO EXCEL
# Requires: results_df (from Cell 5), LABEL_LIST, RESULTS_FOLDER
# ============================================================================
import json
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def export_detailed_excel(results_df, results_folder, label_order=None, filename_prefix="voxlect_analysis"):
    # Guard
    if results_df is None or results_df.empty:
        print("❌ No results to export. Run batch processing first.")
        return None

    # Prepare valid subset
    valid = results_df[
        (results_df['ground_truth'] != 'unknown') &
        (results_df['predicted_language'] != 'error') &
        results_df['predicted_language'].notna()
    ].copy()

    # Choose label order for confusion matrix/report
    if label_order is None:
        label_order = sorted(set(valid['ground_truth']) | set(valid['predicted_language']))

    # Overall Top-1 accuracy
    top1_acc = accuracy_score(valid['ground_truth'], valid['predicted_language'])

    # Top-5 accuracy (if top_5_predictions available)
    top5_acc = None
    if 'top_5_predictions' in valid.columns:
        def correct_in_top5(row):
            target = row['ground_truth']
            preds = row['top_5_predictions']
            if isinstance(preds, str):
                try:
                    preds = json.loads(preds)
                except Exception:
                    try:
                        preds = eval(preds)
                    except Exception:
                        return False
            for p in preds:
                if p.get('language') == target:
                    return True
            return False

        valid['correct_in_top5'] = valid.apply(correct_in_top5, axis=1)
        top5_acc = (valid['correct_in_top5'].sum() / len(valid)) if len(valid) else None

    # Classification report (dict -> DataFrame)
    cls_report = classification_report(
        valid['ground_truth'],
        valid['predicted_language'],
        labels=label_order,
        zero_division=0,
        output_dict=True
    )
    report_df = pd.DataFrame(cls_report).T.reset_index().rename(columns={"index": "label"})

    # Confusion matrix
    cm = confusion_matrix(valid['ground_truth'], valid['predicted_language'], labels=label_order)
    cm_df = pd.DataFrame(cm, index=label_order, columns=label_order)
    cm_df.index.name = "True\\Pred"

    # Top misclassifications
    mis = valid[valid['ground_truth'] != valid['predicted_language']].copy()
    if not mis.empty:
        top_mis = (mis.groupby(['ground_truth', 'predicted_language'])
                   .size()
                   .reset_index(name='count')
                   .sort_values('count', ascending=False))
    else:
        top_mis = pd.DataFrame(columns=['ground_truth', 'predicted_language', 'count'])

    # Overview sheet
    overview_rows = [
        {"metric": "total_predictions", "value": int(len(results_df))},
        {"metric": "valid_predictions_for_eval", "value": int(len(valid))},
        {"metric": "top1_accuracy", "value": float(top1_acc) if len(valid) else None},
        {"metric": "top5_accuracy", "value": float(top5_acc) if top5_acc is not None else None},
    ]
    overview_df = pd.DataFrame(overview_rows)

    # Full predictions (convert top_5_predictions to JSON string for readability)
    full_preds = results_df.copy()
    if 'top_5_predictions' in full_preds.columns:
        full_preds['top_5_predictions'] = full_preds['top_5_predictions'].apply(
            lambda x: json.dumps(x, ensure_ascii=False) if isinstance(x, (list, dict)) else str(x)
        )

    # Write to Excel with multiple sheets
    ts = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
    out_xlsx = os.path.join(results_folder, f"{filename_prefix}_{ts}.xlsx")

    with pd.ExcelWriter(out_xlsx, engine="xlsxwriter") as writer:
        overview_df.to_excel(writer, sheet_name="overview", index=False)
        report_df.to_excel(writer, sheet_name="per_language_metrics", index=False)
        cm_df.to_excel(writer, sheet_name="confusion_matrix")
        top_mis.to_excel(writer, sheet_name="top_misclassifications", index=False)
        full_preds.to_excel(writer, sheet_name="full_predictions", index=False)

        # Optional: auto width, sized from the in-memory frames (the workbook
        # cannot be read back with pd.read_excel until the writer closes)
        sheet_frames = {
            "overview": overview_df,
            "per_language_metrics": report_df,
            "confusion_matrix": cm_df.reset_index(),
            "top_misclassifications": top_mis,
            "full_predictions": full_preds,
        }
        for sheet, frame in sheet_frames.items():
            try:
                ws = writer.sheets[sheet]
                for i, col in enumerate(frame.columns):
                    width = max(12, min(60, int(frame[col].astype(str).map(len).max())))
                    ws.set_column(i, i, width)
            except Exception:
                pass

    print(f"✅ Excel report saved to: {out_xlsx}")
    return out_xlsx

# Run export
excel_path = export_detailed_excel(results_df, RESULTS_FOLDER, label_order=sorted(set(LABEL_LIST)))
print("Excel path:", excel_path)

# ============================================================================
# CELL: TOP-5 RANK POSITION ANALYSIS
# Requires: results_df (from batch), where 'top_5_predictions' is a list/dict or JSON string
# ============================================================================
import json
import pandas as pd
import numpy as np

def parse_top5(cell):
    """Return a list of dicts [{'rank':int,'language':str,'confidence':float}, ...] from cell."""
    if isinstance(cell, list):
        return cell
    if isinstance(cell, dict):
        return [cell]
    if isinstance(cell, str):
        # Try JSON first, then eval as fallback
        try:
            v = json.loads(cell)
            return v if isinstance(v, list) else [v]
        except Exception:
            try:
                v = eval(cell)
                return v if isinstance(v, list) else [v]
            except Exception:
                return []
    return []

def compute_rank_in_top5(df):
    """Add 'correct_rank_in_top5' and a readable verdict column per row."""
    df = df.copy()

    def get_rank(row):
        gt = row.get('ground_truth', None)
        preds = parse_top5(row.get('top_5_predictions', []))
        if not gt or not preds:
            return None
        for p in preds:
            if isinstance(p, dict) and p.get('language') == gt:
                # Ensure rank is int-like and 1-based
                r = p.get('rank', None)
                try:
                    r = int(r)
                    if 1 <= r <= 5:
                        return r
                except Exception:
                    pass
        return None

    df['correct_rank_in_top5'] = df.apply(get_rank, axis=1)
    df['top5_verdict'] = df['correct_rank_in_top5'].apply(
        lambda r: f"Rank {int(r)}" if pd.notna(r) else "Not-in-Top-5"
    )
    return df

def per_language_rank_summary(df):
    """Build a per-language summary of rank distribution (1..5 and Not-in-Top-5)."""
    # Consider only rows with known ground truth and a prediction attempt
    subset = df[(df['ground_truth'].notna()) & (df['predicted_language'] != 'error')].copy()
    subset['rank_bin'] = subset['correct_rank_in_top5'].apply(lambda r: int(r) if pd.notna(r) else 0)  # 0 = Not-in-Top-5

    # Pivot counts per language vs rank_bin
    counts = (subset
              .groupby(['ground_truth', 'rank_bin'])
              .size()
              .reset_index(name='count'))

    # Make a wide table with columns for Rank1..Rank5 and Not-in-Top-5
    rank_cols = {0: "Not-in-Top-5", 1: "Rank 1", 2: "Rank 2", 3: "Rank 3", 4: "Rank 4", 5: "Rank 5"}
    summary = (counts
               .assign(rank_label=lambda x: x['rank_bin'].map(rank_cols))
               .pivot(index='ground_truth', columns='rank_label', values='count')
               .fillna(0)
               .astype(int)
               .reset_index()
               .rename(columns={'ground_truth': 'language'}))

    # Add totals and in-top-5 rate
    summary['Total'] = summary[[c for c in summary.columns if c.startswith('Rank ') or c == 'Not-in-Top-5']].sum(axis=1)
    in_top5_cols = [c for c in summary.columns if c.startswith('Rank ')]
    summary['In-Top-5'] = summary[in_top5_cols].sum(axis=1)
    summary['In-Top-5 Rate'] = (summary['In-Top-5'] / summary['Total']).replace([np.inf, np.nan], 0.0)

    # Order columns nicely
    ordered_cols = ['language', 'Total', 'In-Top-5', 'In-Top-5 Rate', 'Rank 1', 'Rank 2', 'Rank 3', 'Rank 4', 'Rank 5', 'Not-in-Top-5']
    for c in ordered_cols:
        if c not in summary.columns:
            summary[c] = 0 if c != 'In-Top-5 Rate' else 0.0
    summary = summary[ordered_cols]
    return summary.sort_values(by=['In-Top-5 Rate', 'Rank 1', 'Rank 2', 'Rank 3', 'Rank 4', 'Rank 5'], ascending=False)

# 1) Compute per-row rank
results_ranked = compute_rank_in_top5(results_df)

# 2) Show first few rows with the new columns
display(results_ranked[['filename', 'ground_truth', 'predicted_language', 'top5_verdict', 'correct_rank_in_top5']].head(20))

# 3) Build per-language summary
rank_summary = per_language_rank_summary(results_ranked)
display(rank_summary)

# 4) Optionally save both to Excel or CSV
from datetime import datetime
ts = datetime.now().strftime("%Y%m%d_%H%M%S")

rank_csv = os.path.join(RESULTS_FOLDER, f"top5_rank_per_file_{ts}.csv")
results_ranked.to_csv(rank_csv, index=False)
print("✅ Per-file Top-5 rank CSV:", rank_csv)

summary_csv = os.path.join(RESULTS_FOLDER, f"top5_rank_summary_{ts}.csv")
rank_summary.to_csv(summary_csv, index=False)
print("✅ Per-language Top-5 rank summary CSV:", summary_csv)

# ============================================================================
# CELL A: FEATURE EXTRACTION (DURATION, SNR, SILENCE RATIO)
# ============================================================================
import librosa
import numpy as np
import pandas as pd
import os

def compute_features(row, target_sr=16000):
    p = row['file_path']
    try:
        y, sr = librosa.load(p, sr=target_sr, mono=True)
        dur = len(y) / target_sr

        # Energy-based SNR proxy: ratio of voiced/active RMS to global RMS (not true SNR but indicative)
        rms = librosa.feature.rms(y=y, frame_length=2048, hop_length=512)[0]
        global_rms = np.sqrt(np.mean(y**2) + 1e-12)
        active_mask = rms > 0.1 * np.max(rms) if rms.size else np.array([False])
        active_rms = np.mean(rms[active_mask]) if active_mask.any() else 0.0
        snr_proxy = 20.0 * np.log10((active_rms + 1e-9) / (global_rms + 1e-9))

        # Silence ratio: frames below threshold
        thr = 0.02 * np.max(rms) if rms.size else 0.0
        silence_ratio = float((rms < thr).mean() if rms.size else 1.0)

        # Spectral centroid mean (proxy for brightness / channel)
        sc = librosa.feature.spectral_centroid(y=y, sr=target_sr)[0]
        sc_mean = float(np.mean(sc)) if sc.size else 0.0

        return pd.Series({
            'duration_s': dur,
            'snr_proxy_db': float(snr_proxy),
            'silence_ratio': silence_ratio,
            'spec_centroid_mean': sc_mean
        })
    except Exception:
        return pd.Series({
            'duration_s': np.nan,
            'snr_proxy_db': np.nan,
            'silence_ratio': np.nan,
            'spec_centroid_mean': np.nan
        })

features = results_df.apply(compute_features, axis=1)
results_feat = pd.concat([results_df, features], axis=1)
print("✅ Features added: ['duration_s','snr_proxy_db','silence_ratio','spec_centroid_mean']")
display(results_feat.head())

# ============================================================================
# CELL B: CALIBRATION & EXPECTED CALIBRATION ERROR (ECE)
# ECE = sum_b (n_b / N) * |acc(b) - conf(b)|, summed over confidence bins b
# ============================================================================
import numpy as np
import pandas as pd

def extract_top1_conf(row):
    preds = row.get('top_5_predictions', [])
    if isinstance(preds, str):
        try:
            import json
            preds = json.loads(preds)
        except Exception:
            preds = eval(preds)
    if isinstance(preds, list) and preds:
        return float(preds[0].get('confidence', np.nan))
    return np.nan

def compute_ece(df, n_bins=15):
    df = df.copy()
    df['top1_conf'] = df.apply(extract_top1_conf, axis=1)
    df = df[(df['predicted_language'] != 'error') & df['top1_conf'].notna()]

    conf = df['top1_conf'].to_numpy()
    correct = (df['predicted_language'] == df['ground_truth']).to_numpy().astype(float)

    bins = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    bin_stats = []
    for i in range(n_bins):
        m, M = bins[i], bins[i+1]
        mask = (conf >= m) & (conf < M) if i < n_bins - 1 else (conf >= m) & (conf <= M)
        if mask.any():
            acc = correct[mask].mean()
            conf_mean = conf[mask].mean()
            wt = mask.mean()
            ece += wt * abs(acc - conf_mean)
            bin_stats.append({'bin_low': m, 'bin_high': M, 'bin_acc': acc, 'bin_conf': conf_mean, 'weight': wt})
        else:
            bin_stats.append({'bin_low': m, 'bin_high': M, 'bin_acc': np.nan, 'bin_conf': np.nan, 'weight': 0.0})

    return ece, pd.DataFrame(bin_stats)

ece, ece_bins = compute_ece(results_feat)
print(f"🎯 Expected Calibration Error (ECE): {ece:.4f}")
display(ece_bins)

# ============================================================================
# CELL C: ROBUSTNESS SLICES (DURATION, SNR, SILENCE)
# ============================================================================
import numpy as np
import pandas as pd

def slice_acc(df, col, bins):
    df = df.copy()
    df = df[(df['predicted_language'] != 'error') & df[col].notna()]
    labels = [f"[{bins[i]:.2f},{bins[i+1]:.2f})" for i in range(len(bins)-1)]
    df['bin'] = pd.cut(df[col], bins=bins, labels=labels, include_lowest=True)
    grp = df.groupby('bin').apply(lambda x: (x['ground_truth'] == x['predicted_language']).mean())
    return grp.reset_index(name=f'accuracy_by_{col}')

dur_bins = [0, 2, 4, 6, 8, 12, np.inf]
snr_bins = [-40, -10, 0, 5, 10, 20, np.inf]
sil_bins = [0, 0.2, 0.4, 0.6, 0.8, 1.01]

acc_dur = slice_acc(results_feat, 'duration_s', dur_bins)
acc_snr = slice_acc(results_feat, 'snr_proxy_db', snr_bins)
acc_sil = slice_acc(results_feat, 'silence_ratio', sil_bins)

print("⏱️ Accuracy vs Duration:")
display(acc_dur)
print("🔊 Accuracy vs SNR proxy:")
display(acc_snr)
print("🤫 Accuracy vs Silence ratio:")
display(acc_sil)

# ============================================================================
# CELL D: CONFUSION ASYMMETRY TABLE
# ============================================================================
import pandas as pd

valid = results_df[(results_df['ground_truth'] != 'unknown') & (results_df['predicted_language'] != 'error')].copy()
pairs = valid[valid['ground_truth'] != valid['predicted_language']][['ground_truth', 'predicted_language']]
flow = (pairs.groupby(['ground_truth', 'predicted_language']).size()
        .reset_index(name='count')
        .sort_values('count', ascending=False))
print("🔀 Top asymmetric confusions:")
display(flow.head(30))

# ============================================================================
# CELL E: EMBEDDING CLUSTER QUALITY (SILHOUETTE)
# ============================================================================
import torch
import numpy as np
from sklearn.metrics import silhouette_score
import pandas as pd

def extract_embeddings(df, batch_size=16):
    embs = []
    labs = []
    for _, row in df.iterrows():
        t = load_audio(row['file_path'])
        if t is None:
            continue
        with torch.no_grad():
            # Many wrappers return (logits, features); if not, skip
            try:
                logits, feat = model(t.to(device), return_feature=True)
                # Flatten feature vector (assume [1, D] or [1, T, D] -> take mean over time)
                if feat is None:
                    continue
                feat_np = feat.detach().cpu().numpy()
                if feat_np.ndim == 3:  # [B, T, D]
                    feat_np = feat_np.mean(axis=1)
                elif feat_np.ndim == 2:  # [B, D]
                    pass
                else:
                    continue
                embs.append(feat_np.squeeze(0))
                labs.append(row['ground_truth'])
            except Exception:
                continue
    if not embs:
        return None, None
    X = np.vstack(embs)
    y = np.array(labs)
    return X, y

sample = valid.groupby('ground_truth').head(20).reset_index(drop=True) if len(valid) > 0 else pd.DataFrame()
X, y = extract_embeddings(sample) if not sample.empty else (None, None)
if X is not None and len(np.unique(y)) > 1 and len(y) >= 10:
    sil = silhouette_score(X, y, metric='euclidean')
    print(f"📏 Silhouette score (higher=better cluster separation): {sil:.3f}")
else:
    print("ℹ️ Not enough data or embeddings to compute silhouette score.")

# ============================================================================
# CELL F: HARD-EXAMPLE MINING
# ============================================================================
import pandas as pd
import json

def top5_gap(row):
    preds = row.get('top_5_predictions', [])
    if isinstance(preds, str):
        try:
            preds = json.loads(preds)
        except Exception:
            preds = eval(preds)
    if not preds or len(preds) < 2:
        return np.nan
    return float(preds[0]['confidence'] - preds[1]['confidence'])

valid = results_feat[(results_feat['ground_truth'] != 'unknown') & (results_feat['predicted_language'] != 'error')].copy()
valid['top5_gap'] = valid.apply(top5_gap, axis=1)

# Hardest misclassifications: small margin, wrong prediction
hard_mis = valid[valid['ground_truth'] != valid['predicted_language']].copy()
hard_mis = hard_mis.sort_values(['top5_gap', 'snr_proxy_db', 'duration_s'], ascending=[True, True, True]).head(30)
print("🔥 Hardest misclassifications (low margin, low SNR/duration):")
display(hard_mis[['filename', 'ground_truth', 'predicted_language', 'top5_gap', 'snr_proxy_db', 'duration_s', 'silence_ratio']])

# Ambiguous-but-correct: small margin but correct prediction
ambig_correct = valid[valid['ground_truth'] == valid['predicted_language']].copy()
ambig_correct = ambig_correct.sort_values(['top5_gap', 'snr_proxy_db', 'duration_s'], ascending=[True, True, True]).head(30)
print("🌀 Ambiguous but correct (low margin):")
display(ambig_correct[['filename', 'ground_truth', 'predicted_language', 'top5_gap', 'snr_proxy_db', 'duration_s', 'silence_ratio']])

# ============================================================================
# CELL G: SAVE EXTENDED ANALYSIS TO EXCEL
# ============================================================================
import sys, subprocess, os

def ensure_pkg(pkg):
    try:
        __import__(pkg)
    except Exception:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", pkg])

ensure_pkg("xlsxwriter")

ts = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
xlsx_path = os.path.join(RESULTS_FOLDER, f"voxlect_extended_analysis_{ts}.xlsx")

with pd.ExcelWriter(xlsx_path, engine="xlsxwriter") as w:
    results_feat.to_excel(w, sheet_name="results_with_features", index=False)
    if 'ece' in locals():
        pd.DataFrame([{'ECE': ece}]).to_excel(w, sheet_name="calibration_overview", index=False)
        ece_bins.to_excel(w, sheet_name="calibration_bins", index=False)
    acc_dur.to_excel(w, sheet_name="acc_vs_duration", index=False)
    acc_snr.to_excel(w, sheet_name="acc_vs_snr", index=False)
    acc_sil.to_excel(w, sheet_name="acc_vs_silence", index=False)
    flow.to_excel(w, sheet_name="confusion_asymmetry", index=False)
    if 'hard_mis' in locals():
        hard_mis.to_excel(w, sheet_name="hard_misclassifications", index=False)
    if 'ambig_correct' in locals():
        ambig_correct.to_excel(w, sheet_name="ambiguous_correct", index=False)

print("✅ Extended analysis Excel saved to:", xlsx_path)
requirements.txt ADDED
@@ -0,0 +1,4 @@
numpy
pandas
torch
transformers