root
committed on
Commit
·
5d5eb0f
1
Parent(s):
651b0cd
ss
Browse files
app.py
CHANGED
|
@@ -1656,9 +1656,36 @@ def verify_flexible_syllable_counts(lyrics, templates, second_level_templates=No
|
|
| 1656 |
print(f"DEBUG: templates is not a list, it's {type(templates)}")
|
| 1657 |
# If it's not a list, create a single-item list
|
| 1658 |
if templates is not None:
|
| 1659 |
-
templates
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1660 |
else:
|
| 1661 |
templates = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1662 |
|
| 1663 |
# Split lyrics into lines
|
| 1664 |
lines = [line.strip() for line in lyrics.split("\n") if line.strip()]
|
|
@@ -1689,18 +1716,32 @@ def verify_flexible_syllable_counts(lyrics, templates, second_level_templates=No
|
|
| 1689 |
|
| 1690 |
# Extract the template string from different possible formats
|
| 1691 |
template_str = None
|
| 1692 |
-
if isinstance(template, dict)
|
| 1693 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1694 |
elif isinstance(template, str):
|
| 1695 |
template_str = template
|
| 1696 |
else:
|
| 1697 |
-
|
| 1698 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1699 |
|
| 1700 |
if not isinstance(template_str, str):
|
| 1701 |
print(f"DEBUG: template_str is not a string, it's {type(template_str)}")
|
| 1702 |
continue
|
| 1703 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1704 |
# Handle multiple phrases in template - process ALL phrases, not just the first
|
| 1705 |
template_phrases = [template_str]
|
| 1706 |
if "|" in template_str:
|
|
@@ -3148,28 +3189,93 @@ def detect_voice_activity(audio_file):
|
|
| 3148 |
try:
|
| 3149 |
print("Detecting voice activity in audio...")
|
| 3150 |
# Get HF_TOKEN from environment or set your token here
|
| 3151 |
-
hf_token = os.environ.get("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3152 |
|
| 3153 |
# Initialize the voice activity detection pipeline
|
| 3154 |
-
|
| 3155 |
-
|
| 3156 |
-
|
| 3157 |
-
|
| 3158 |
-
|
| 3159 |
-
|
| 3160 |
-
|
| 3161 |
-
|
| 3162 |
-
|
| 3163 |
-
|
| 3164 |
-
|
| 3165 |
-
|
| 3166 |
-
|
| 3167 |
-
|
| 3168 |
-
|
| 3169 |
-
|
| 3170 |
-
|
| 3171 |
-
|
| 3172 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3173 |
|
| 3174 |
except Exception as e:
|
| 3175 |
print(f"Error detecting voice activity: {str(e)}")
|
|
@@ -3551,8 +3657,14 @@ def format_complete_beat_timeline(audio_file, lyrics=None):
|
|
| 3551 |
|
| 3552 |
# Helper function to convert numpy values to floats - FIXED
|
| 3553 |
def ensure_float(value):
|
| 3554 |
-
if isinstance(value, np.ndarray)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3555 |
return float(value)
|
|
|
|
|
|
|
| 3556 |
return value
|
| 3557 |
|
| 3558 |
# Format the timeline with enhanced scientific headers
|
|
@@ -4117,10 +4229,38 @@ def display_results(audio_file, lyrics_requirements=None):
|
|
| 4117 |
emotion_text = "No emotion analysis available."
|
| 4118 |
try:
|
| 4119 |
emotion_results = music_analyzer.analyze_music(audio_file)
|
| 4120 |
-
|
| 4121 |
-
|
| 4122 |
-
|
| 4123 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4124 |
|
| 4125 |
# Keep basic beat analysis without section information
|
| 4126 |
y, sr = load_audio(audio_file, SAMPLE_RATE)
|
|
@@ -4128,9 +4268,15 @@ def display_results(audio_file, lyrics_requirements=None):
|
|
| 4128 |
|
| 4129 |
# Add beat analysis info
|
| 4130 |
emotion_text += f"\n\nBeat Analysis:\n"
|
| 4131 |
-
|
| 4132 |
-
|
| 4133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4134 |
|
| 4135 |
# Add voice activity segments if available
|
| 4136 |
if voice_segments:
|
|
@@ -4189,6 +4335,23 @@ with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
|
|
| 4189 |
|
| 4190 |
For best results, use high-quality audio files (MP3, WAV, FLAC) with at least 10 seconds of music.
|
| 4191 |
""")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4192 |
|
| 4193 |
with gr.Column(scale=2):
|
| 4194 |
# Use tabs for better organization of outputs
|
|
@@ -4260,4 +4423,4 @@ with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
|
|
| 4260 |
""")
|
| 4261 |
|
| 4262 |
# Launch the app
|
| 4263 |
-
demo.launch()
|
|
|
|
| 1656 |
print(f"DEBUG: templates is not a list, it's {type(templates)}")
|
| 1657 |
# If it's not a list, create a single-item list
|
| 1658 |
if templates is not None:
|
| 1659 |
+
if isinstance(templates, str):
|
| 1660 |
+
# If it's a string, we need to parse it properly
|
| 1661 |
+
templates = [templates]
|
| 1662 |
+
elif isinstance(templates, dict):
|
| 1663 |
+
# If it's a dict, extract relevant information
|
| 1664 |
+
if "templates" in templates:
|
| 1665 |
+
templates = templates["templates"]
|
| 1666 |
+
if not isinstance(templates, list):
|
| 1667 |
+
templates = [templates]
|
| 1668 |
+
else:
|
| 1669 |
+
# Create a single element list with the dict
|
| 1670 |
+
templates = [templates]
|
| 1671 |
+
else:
|
| 1672 |
+
templates = [templates]
|
| 1673 |
else:
|
| 1674 |
templates = []
|
| 1675 |
+
|
| 1676 |
+
# Ensure all templates are strings or properly formatted dicts
|
| 1677 |
+
for i, template in enumerate(templates[:]):
|
| 1678 |
+
if isinstance(template, dict):
|
| 1679 |
+
if "syllable_template" not in template and "text" in template:
|
| 1680 |
+
# Try to use text field if syllable_template is missing
|
| 1681 |
+
template["syllable_template"] = template["text"]
|
| 1682 |
+
elif not isinstance(template, str):
|
| 1683 |
+
# Convert non-string, non-dict templates to strings if possible
|
| 1684 |
+
try:
|
| 1685 |
+
templates[i] = str(template)
|
| 1686 |
+
except:
|
| 1687 |
+
# Remove this template if it can't be converted
|
| 1688 |
+
templates.pop(i)
|
| 1689 |
|
| 1690 |
# Split lyrics into lines
|
| 1691 |
lines = [line.strip() for line in lyrics.split("\n") if line.strip()]
|
|
|
|
| 1716 |
|
| 1717 |
# Extract the template string from different possible formats
|
| 1718 |
template_str = None
|
| 1719 |
+
if isinstance(template, dict):
|
| 1720 |
+
# Try various keys that might contain template information
|
| 1721 |
+
for key in ["syllable_template", "template", "text", "pattern"]:
|
| 1722 |
+
if key in template and template[key] is not None:
|
| 1723 |
+
template_str = template[key]
|
| 1724 |
+
break
|
| 1725 |
elif isinstance(template, str):
|
| 1726 |
template_str = template
|
| 1727 |
else:
|
| 1728 |
+
# Try to convert to string
|
| 1729 |
+
try:
|
| 1730 |
+
template_str = str(template)
|
| 1731 |
+
print(f"DEBUG: Converted template {i+1} from {type(template)} to string")
|
| 1732 |
+
except:
|
| 1733 |
+
print(f"DEBUG: Skipping template {i+1}, not a string or dict with syllable_template")
|
| 1734 |
+
continue
|
| 1735 |
|
| 1736 |
if not isinstance(template_str, str):
|
| 1737 |
print(f"DEBUG: template_str is not a string, it's {type(template_str)}")
|
| 1738 |
continue
|
| 1739 |
|
| 1740 |
+
# Safety check for empty strings
|
| 1741 |
+
if not template_str.strip():
|
| 1742 |
+
print(f"DEBUG: Skipping empty template {i+1}")
|
| 1743 |
+
continue
|
| 1744 |
+
|
| 1745 |
# Handle multiple phrases in template - process ALL phrases, not just the first
|
| 1746 |
template_phrases = [template_str]
|
| 1747 |
if "|" in template_str:
|
|
|
|
| 3189 |
try:
|
| 3190 |
print("Detecting voice activity in audio...")
|
| 3191 |
# Get HF_TOKEN from environment or set your token here
|
| 3192 |
+
hf_token = os.environ.get("pyannote", None)
|
| 3193 |
+
|
| 3194 |
+
if not hf_token:
|
| 3195 |
+
print("Warning: No Hugging Face token provided. Voice activity detection requires authentication.")
|
| 3196 |
+
print("To use voice activity detection:")
|
| 3197 |
+
print("1. Create an account at https://huggingface.co")
|
| 3198 |
+
print("2. Generate a token at https://huggingface.co/settings/tokens")
|
| 3199 |
+
print("3. Accept the terms for pyannote/segmentation at https://huggingface.co/pyannote/segmentation")
|
| 3200 |
+
print("4. Set HF_TOKEN environment variable or provide it directly in the code")
|
| 3201 |
+
|
| 3202 |
+
# Create fallback segments based on audio duration
|
| 3203 |
+
# This creates segments approximately every 5 seconds
|
| 3204 |
+
y, sr = load_audio(audio_file, SAMPLE_RATE)
|
| 3205 |
+
duration = extract_audio_duration(y, sr)
|
| 3206 |
+
|
| 3207 |
+
# Create segments of 4-5 seconds each, with small gaps between them
|
| 3208 |
+
estimated_segments = []
|
| 3209 |
+
segment_duration = 4.5
|
| 3210 |
+
gap_duration = 1.0
|
| 3211 |
+
|
| 3212 |
+
current_pos = 0.0
|
| 3213 |
+
while current_pos < duration:
|
| 3214 |
+
segment_end = min(current_pos + segment_duration, duration)
|
| 3215 |
+
estimated_segments.append({
|
| 3216 |
+
"start": current_pos,
|
| 3217 |
+
"end": segment_end,
|
| 3218 |
+
"duration": segment_end - current_pos
|
| 3219 |
+
})
|
| 3220 |
+
current_pos = segment_end + gap_duration
|
| 3221 |
+
if current_pos >= duration:
|
| 3222 |
+
break
|
| 3223 |
+
|
| 3224 |
+
print(f"Created {len(estimated_segments)} estimated voice segments (fallback mode)")
|
| 3225 |
+
return estimated_segments
|
| 3226 |
|
| 3227 |
# Initialize the voice activity detection pipeline
|
| 3228 |
+
try:
|
| 3229 |
+
vad_pipeline = Pipeline.from_pretrained(
|
| 3230 |
+
"pyannote/voice-activity-detection",
|
| 3231 |
+
use_auth_token=hf_token
|
| 3232 |
+
)
|
| 3233 |
+
|
| 3234 |
+
# Process the audio file
|
| 3235 |
+
output = vad_pipeline(audio_file)
|
| 3236 |
+
|
| 3237 |
+
# Extract voice segments
|
| 3238 |
+
voice_segments = []
|
| 3239 |
+
for speech in output.get_timeline().support():
|
| 3240 |
+
voice_segments.append({
|
| 3241 |
+
"start": speech.start,
|
| 3242 |
+
"end": speech.end,
|
| 3243 |
+
"duration": speech.end - speech.start
|
| 3244 |
+
})
|
| 3245 |
+
|
| 3246 |
+
print(f"Detected {len(voice_segments)} voice segments")
|
| 3247 |
+
return voice_segments
|
| 3248 |
+
|
| 3249 |
+
except Exception as auth_error:
|
| 3250 |
+
print(f"Authentication error with pyannote models: {str(auth_error)}")
|
| 3251 |
+
print("Make sure you have:")
|
| 3252 |
+
print("1. Created a Hugging Face account")
|
| 3253 |
+
print("2. Generated a token at https://huggingface.co/settings/tokens")
|
| 3254 |
+
print("3. Accepted terms for pyannote/segmentation at https://huggingface.co/pyannote/segmentation")
|
| 3255 |
+
|
| 3256 |
+
# Create fallback segments as above
|
| 3257 |
+
y, sr = load_audio(audio_file, SAMPLE_RATE)
|
| 3258 |
+
duration = extract_audio_duration(y, sr)
|
| 3259 |
+
|
| 3260 |
+
# Create segments of 4-5 seconds each with small gaps
|
| 3261 |
+
estimated_segments = []
|
| 3262 |
+
segment_duration = 4.5
|
| 3263 |
+
gap_duration = 1.0
|
| 3264 |
+
|
| 3265 |
+
current_pos = 0.0
|
| 3266 |
+
while current_pos < duration:
|
| 3267 |
+
segment_end = min(current_pos + segment_duration, duration)
|
| 3268 |
+
estimated_segments.append({
|
| 3269 |
+
"start": current_pos,
|
| 3270 |
+
"end": segment_end,
|
| 3271 |
+
"duration": segment_end - current_pos
|
| 3272 |
+
})
|
| 3273 |
+
current_pos = segment_end + gap_duration
|
| 3274 |
+
if current_pos >= duration:
|
| 3275 |
+
break
|
| 3276 |
+
|
| 3277 |
+
print(f"Created {len(estimated_segments)} estimated voice segments (fallback mode)")
|
| 3278 |
+
return estimated_segments
|
| 3279 |
|
| 3280 |
except Exception as e:
|
| 3281 |
print(f"Error detecting voice activity: {str(e)}")
|
|
|
|
| 3657 |
|
| 3658 |
# Helper function to convert numpy values to floats - FIXED
|
| 3659 |
def ensure_float(value):
    """Coerce NumPy values (and ``None``) to a plain Python ``float``.

    Args:
        value: Any object. NumPy arrays, NumPy numeric scalars and ``None``
            are converted; every other value is passed through untouched.

    Returns:
        ``0.0`` for ``None`` or an empty array; the first element (in
        flattened order) of a non-empty array as a ``float``; ``float(value)``
        for a NumPy numeric scalar; otherwise ``value`` itself.
    """
    if value is None:
        return 0.0

    if isinstance(value, np.ndarray):
        if value.size == 0:
            return 0.0
        # ``flat`` reaches the first scalar whatever the dimensionality.
        # Indexing with ``value[0]`` returns a sub-array for 2-D+ input, and
        # ``float()`` of a multi-element array raises TypeError.
        return float(value.flat[0])

    if isinstance(value, np.number):
        return float(value)

    return value
|
| 3669 |
|
| 3670 |
# Format the timeline with enhanced scientific headers
|
|
|
|
| 4229 |
emotion_text = "No emotion analysis available."
|
| 4230 |
try:
|
| 4231 |
emotion_results = music_analyzer.analyze_music(audio_file)
|
| 4232 |
+
|
| 4233 |
+
# Safe formatting helper function to handle any value type
|
| 4234 |
+
def safe_format(value, format_spec=None):
    """Render a value as display text without ever raising.

    Args:
        value: The value to render. May be ``None``, a Python number, a NumPy
            scalar, a NumPy array, or any other object.
        format_spec: Optional format specification (e.g. ``'.1f'``). It is
            applied only when the value is numeric; other values fall back to
            ``str(value)``.

    Returns:
        ``"N/A"`` for ``None``, for an empty array, or when rendering fails;
        otherwise the formatted text. For an array, only its first element
        (in flattened order) is rendered.
    """
    if value is None:
        return "N/A"

    try:
        if isinstance(value, np.ndarray):
            if value.size == 0:
                return "N/A"
            # Represent the array by its first scalar; ``flat`` works for any
            # number of dimensions.
            value = value.flat[0]

        if format_spec:
            # NumPy scalars such as np.float32 / np.int64 are not instances of
            # Python int/float, so unwrap them before the numeric check;
            # otherwise the requested format would be silently ignored.
            numeric = value.item() if isinstance(value, np.generic) else value
            if isinstance(numeric, (int, float)):
                return format(numeric, format_spec)

        return str(value)
    except Exception:
        # Best-effort display helper: any rendering failure becomes "N/A",
        # while KeyboardInterrupt/SystemExit are allowed to propagate.
        return "N/A"
|
| 4252 |
+
|
| 4253 |
+
# Get summary values safely
|
| 4254 |
+
tempo = emotion_results.get('summary', {}).get('tempo', 0)
|
| 4255 |
+
key = emotion_results.get('summary', {}).get('key', 'Unknown')
|
| 4256 |
+
mode = emotion_results.get('summary', {}).get('mode', '')
|
| 4257 |
+
primary_emotion = emotion_results.get('summary', {}).get('primary_emotion', 'Unknown')
|
| 4258 |
+
primary_theme = emotion_results.get('summary', {}).get('primary_theme', 'Unknown')
|
| 4259 |
+
|
| 4260 |
+
emotion_text = (f"Tempo: {safe_format(tempo, '.1f')} BPM\n"
|
| 4261 |
+
f"Key: {key} {mode}\n"
|
| 4262 |
+
f"Primary Emotion: {primary_emotion}\n"
|
| 4263 |
+
f"Primary Theme: {primary_theme}")
|
| 4264 |
|
| 4265 |
# Keep basic beat analysis without section information
|
| 4266 |
y, sr = load_audio(audio_file, SAMPLE_RATE)
|
|
|
|
| 4268 |
|
| 4269 |
# Add beat analysis info
|
| 4270 |
emotion_text += f"\n\nBeat Analysis:\n"
|
| 4271 |
+
|
| 4272 |
+
# Get beat info values safely
|
| 4273 |
+
tempo = beats_info.get('tempo', 0)
|
| 4274 |
+
time_sig = beats_info.get('time_signature', 4)
|
| 4275 |
+
beat_count = beats_info.get('beat_count', 0)
|
| 4276 |
+
|
| 4277 |
+
emotion_text += f"- Tempo: {safe_format(tempo, '.1f')} BPM\n"
|
| 4278 |
+
emotion_text += f"- Time Signature: {time_sig}/4\n"
|
| 4279 |
+
emotion_text += f"- Total Beats: {beat_count}\n"
|
| 4280 |
|
| 4281 |
# Add voice activity segments if available
|
| 4282 |
if voice_segments:
|
|
|
|
| 4335 |
|
| 4336 |
For best results, use high-quality audio files (MP3, WAV, FLAC) with at least 10 seconds of music.
|
| 4337 |
""")
|
| 4338 |
+
|
| 4339 |
+
# Add voice detection info box
|
| 4340 |
+
with gr.Accordion("Voice Activity Detection", open=True):
|
| 4341 |
+
gr.Markdown("""
|
| 4342 |
+
### Voice Detection Authentication Required
|
| 4343 |
+
|
| 4344 |
+
This app uses pyannote/voice-activity-detection to identify vocal segments in music.
|
| 4345 |
+
|
| 4346 |
+
**Important:** This model requires Hugging Face authentication:
|
| 4347 |
+
|
| 4348 |
+
1. Create an account at [huggingface.co](https://huggingface.co)
|
| 4349 |
+
2. Generate a token at [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)
|
| 4350 |
+
3. Accept the terms at [huggingface.co/pyannote/segmentation](https://huggingface.co/pyannote/segmentation)
|
| 4351 |
+
4. Set the HF_TOKEN environment variable
|
| 4352 |
+
|
| 4353 |
+
Without authentication, the app will use estimated segments based on audio duration.
|
| 4354 |
+
""")
|
| 4355 |
|
| 4356 |
with gr.Column(scale=2):
|
| 4357 |
# Use tabs for better organization of outputs
|
|
|
|
| 4423 |
""")
|
| 4424 |
|
| 4425 |
# Launch the app
|
| 4426 |
+
demo.launch(share=True)
|