root
committed on
Commit
·
c95399f
1
Parent(s):
5d5eb0f
ss
Browse files
app.py
CHANGED
|
@@ -24,7 +24,12 @@ from utils import (
|
|
| 24 |
)
|
| 25 |
from emotionanalysis import MusicAnalyzer
|
| 26 |
import librosa
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
import tempfile
|
| 29 |
import os
|
| 30 |
import soundfile as sf
|
|
@@ -3196,8 +3201,13 @@ def detect_voice_activity(audio_file):
|
|
| 3196 |
print("To use voice activity detection:")
|
| 3197 |
print("1. Create an account at https://huggingface.co")
|
| 3198 |
print("2. Generate a token at https://huggingface.co/settings/tokens")
|
| 3199 |
-
print("3. Accept the terms for pyannote
|
| 3200 |
-
print("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3201 |
|
| 3202 |
# Create fallback segments based on audio duration
|
| 3203 |
# This creates segments approximately every 5 seconds
|
|
@@ -3224,8 +3234,36 @@ def detect_voice_activity(audio_file):
|
|
| 3224 |
print(f"Created {len(estimated_segments)} estimated voice segments (fallback mode)")
|
| 3225 |
return estimated_segments
|
| 3226 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3227 |
# Initialize the voice activity detection pipeline
|
| 3228 |
try:
|
|
|
|
| 3229 |
vad_pipeline = Pipeline.from_pretrained(
|
| 3230 |
"pyannote/voice-activity-detection",
|
| 3231 |
use_auth_token=hf_token
|
|
@@ -4339,18 +4377,27 @@ with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
|
|
| 4339 |
# Add voice detection info box
|
| 4340 |
with gr.Accordion("Voice Activity Detection", open=True):
|
| 4341 |
gr.Markdown("""
|
| 4342 |
-
|
| 4343 |
-
|
| 4344 |
-
This app uses pyannote/voice-activity-detection to identify vocal segments in music.
|
| 4345 |
-
|
| 4346 |
-
**Important:** This model requires Hugging Face authentication:
|
| 4347 |
-
|
| 4348 |
-
1. Create an account at [huggingface.co](https://huggingface.co)
|
| 4349 |
-
2. Generate a token at [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)
|
| 4350 |
-
3. Accept the terms at [huggingface.co/pyannote/segmentation](https://huggingface.co/pyannote/segmentation)
|
| 4351 |
-
4. Set the HF_TOKEN environment variable
|
| 4352 |
|
| 4353 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4354 |
""")
|
| 4355 |
|
| 4356 |
with gr.Column(scale=2):
|
|
|
|
| 24 |
)
|
| 25 |
from emotionanalysis import MusicAnalyzer
|
| 26 |
import librosa
|
| 27 |
+
try:
|
| 28 |
+
from pyannote.audio import Pipeline
|
| 29 |
+
PYANNOTE_AVAILABLE = True
|
| 30 |
+
except ImportError:
|
| 31 |
+
print("WARNING: pyannote.audio is not properly installed. Voice detection will use fallback mode.")
|
| 32 |
+
PYANNOTE_AVAILABLE = False
|
| 33 |
import tempfile
|
| 34 |
import os
|
| 35 |
import soundfile as sf
|
|
|
|
| 3201 |
print("To use voice activity detection:")
|
| 3202 |
print("1. Create an account at https://huggingface.co")
|
| 3203 |
print("2. Generate a token at https://huggingface.co/settings/tokens")
|
| 3204 |
+
print("3. Accept the terms for pyannote models at:")
|
| 3205 |
+
print(" - https://huggingface.co/pyannote/segmentation")
|
| 3206 |
+
print(" - https://huggingface.co/pyannote/voice-activity-detection")
|
| 3207 |
+
print("4. Set 'pyannote' environment variable with your token:")
|
| 3208 |
+
print(" - Linux/Mac: export pyannote=your_token_here")
|
| 3209 |
+
print(" - Windows: set pyannote=your_token_here")
|
| 3210 |
+
print(" - Hugging Face Spaces: Add a 'pyannote' Secret in Settings")
|
| 3211 |
|
| 3212 |
# Create fallback segments based on audio duration
|
| 3213 |
# This creates segments approximately every 5 seconds
|
|
|
|
| 3234 |
print(f"Created {len(estimated_segments)} estimated voice segments (fallback mode)")
|
| 3235 |
return estimated_segments
|
| 3236 |
|
| 3237 |
+
# Check if pyannote is available
|
| 3238 |
+
if not PYANNOTE_AVAILABLE:
|
| 3239 |
+
print("pyannote.audio is not available. Using fallback voice detection.")
|
| 3240 |
+
# Create fallback segments based on audio duration
|
| 3241 |
+
y, sr = load_audio(audio_file, SAMPLE_RATE)
|
| 3242 |
+
duration = extract_audio_duration(y, sr)
|
| 3243 |
+
|
| 3244 |
+
# Create segments of 4-5 seconds each, with small gaps between them
|
| 3245 |
+
estimated_segments = []
|
| 3246 |
+
segment_duration = 4.5
|
| 3247 |
+
gap_duration = 1.0
|
| 3248 |
+
|
| 3249 |
+
current_pos = 0.0
|
| 3250 |
+
while current_pos < duration:
|
| 3251 |
+
segment_end = min(current_pos + segment_duration, duration)
|
| 3252 |
+
estimated_segments.append({
|
| 3253 |
+
"start": current_pos,
|
| 3254 |
+
"end": segment_end,
|
| 3255 |
+
"duration": segment_end - current_pos
|
| 3256 |
+
})
|
| 3257 |
+
current_pos = segment_end + gap_duration
|
| 3258 |
+
if current_pos >= duration:
|
| 3259 |
+
break
|
| 3260 |
+
|
| 3261 |
+
print(f"Created {len(estimated_segments)} estimated voice segments (fallback mode)")
|
| 3262 |
+
return estimated_segments
|
| 3263 |
+
|
| 3264 |
# Initialize the voice activity detection pipeline
|
| 3265 |
try:
|
| 3266 |
+
print(f"Attempting to load pyannote/voice-activity-detection with auth token: {'[PROVIDED]' if hf_token else '[MISSING]'}")
|
| 3267 |
vad_pipeline = Pipeline.from_pretrained(
|
| 3268 |
"pyannote/voice-activity-detection",
|
| 3269 |
use_auth_token=hf_token
|
|
|
|
| 4377 |
# Add voice detection info box
|
| 4378 |
with gr.Accordion("Voice Activity Detection", open=True):
|
| 4379 |
gr.Markdown("""
|
| 4380 |
+
### Voice Detection Authentication Required
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4381 |
|
| 4382 |
+
This app uses pyannote/voice-activity-detection to identify vocal segments in music.
|
| 4383 |
+
|
| 4384 |
+
**Important:** This model requires Hugging Face authentication:
|
| 4385 |
+
|
| 4386 |
+
1. Create an account at [huggingface.co](https://huggingface.co)
|
| 4387 |
+
2. Generate a token at [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)
|
| 4388 |
+
3. Accept the terms at [huggingface.co/pyannote/segmentation](https://huggingface.co/pyannote/segmentation)
|
| 4389 |
+
4. Set the "pyannote" environment variable with your token:
|
| 4390 |
+
- In Linux/Mac: `export pyannote="your_token_here"`
|
| 4391 |
+
- In Windows: `set pyannote=your_token_here`
|
| 4392 |
+
- In Hugging Face Spaces: Add a "pyannote" Secret in the Settings tab
|
| 4393 |
+
|
| 4394 |
+
Without authentication, the app will use estimated segments based on audio duration.
|
| 4395 |
+
|
| 4396 |
+
**Technical Note:** If you're having trouble with authentication, make sure:
|
| 4397 |
+
1. The pyannote.audio package is properly installed
|
| 4398 |
+
2. You've accepted the model terms at [huggingface.co/pyannote/voice-activity-detection](https://huggingface.co/pyannote/voice-activity-detection)
|
| 4399 |
+
3. The provided token has READ access permission
|
| 4400 |
+
4. You've added hf.co to your allowed domains if using a scoped token
|
| 4401 |
""")
|
| 4402 |
|
| 4403 |
with gr.Column(scale=2):
|