root committed on
Commit · 8515dc5 · 1 Parent(s): e3108aa

ss

Browse files:
- 5.16appbeforesyllables.py  +336 -0
- app.py  +357 -48
- beat_analysis.py  +392 -0
- emotionanalysis.py  +15 -24
- requirements.txt  +1 -0
5.16appbeforesyllables.py
ADDED
@@ -0,0 +1,336 @@
import os
import io
import gradio as gr
import torch
import numpy as np
import re
import pronouncing  # Add this to requirements.txt for syllable counting
import functools  # Add this for lru_cache functionality
from transformers import (
    AutoModelForAudioClassification,
    AutoFeatureExtractor,
    AutoTokenizer,
    pipeline,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)
from huggingface_hub import login
from utils import (
    load_audio,
    extract_audio_duration,
    extract_mfcc_features,
    format_genre_results,
    ensure_cuda_availability
)
from emotionanalysis import MusicAnalyzer
import librosa

# Login to Hugging Face Hub if token is provided
if "HF_TOKEN" in os.environ:
    login(token=os.environ["HF_TOKEN"])

# Constants
GENRE_MODEL_NAME = "dima806/music_genres_classification"
MUSIC_DETECTION_MODEL = "MIT/ast-finetuned-audioset-10-10-0.4593"
LLM_MODEL_NAME = "Qwen/Qwen3-32B"
SAMPLE_RATE = 22050  # Standard sample rate for audio processing

# Check CUDA availability (for informational purposes)
CUDA_AVAILABLE = ensure_cuda_availability()

# Load models at initialization time
print("Loading genre classification model...")
try:
    genre_feature_extractor = AutoFeatureExtractor.from_pretrained(GENRE_MODEL_NAME)
    genre_model = AutoModelForAudioClassification.from_pretrained(
        GENRE_MODEL_NAME,
        device_map="auto" if CUDA_AVAILABLE else None
    )
    # Create a convenience wrapper function with the same interface as before
    def get_genre_model():
        return genre_model, genre_feature_extractor
except Exception as e:
    print(f"Error loading genre model: {str(e)}")
    genre_model = None
    genre_feature_extractor = None

# Load LLM and tokenizer at initialization time
print("Loading Qwen LLM model with 4-bit quantization...")
try:
    # Configure 4-bit quantization for better performance
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True
    )

    llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
    llm_model = AutoModelForCausalLM.from_pretrained(
        LLM_MODEL_NAME,
        quantization_config=quantization_config,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.float16,
        use_cache=True
    )
except Exception as e:
    print(f"Error loading LLM model: {str(e)}")
    llm_tokenizer = None
    llm_model = None

# Create music analyzer instance
music_analyzer = MusicAnalyzer()

# Process uploaded audio file
def process_audio(audio_file):
    if audio_file is None:
        return "No audio file provided", None, None, None, None, None, None

    try:
        # Load and analyze audio
        y, sr = load_audio(audio_file, sr=SAMPLE_RATE)

        # Basic audio information
        duration = extract_audio_duration(y, sr)

        # Analyze music with MusicAnalyzer
        music_analysis = music_analyzer.analyze_music(audio_file)

        # Extract key information
        tempo = music_analysis["rhythm_analysis"]["tempo"]
        time_signature = music_analysis["rhythm_analysis"]["estimated_time_signature"]
        emotion = music_analysis["emotion_analysis"]["primary_emotion"]
        theme = music_analysis["theme_analysis"]["primary_theme"]

        # Use genre classification directly instead of pipeline
        if genre_model is not None and genre_feature_extractor is not None:
            # Resample audio to 16000 Hz for the genre model
            y_16k = librosa.resample(y, orig_sr=sr, target_sr=16000)

            # Extract features
            inputs = genre_feature_extractor(
                y_16k,
                sampling_rate=16000,
                return_tensors="pt"
            ).to(genre_model.device)

            # Classify genre
            with torch.no_grad():
                outputs = genre_model(**inputs)
                logits = outputs.logits
                probs = torch.nn.functional.softmax(logits, dim=-1)

            # Get top genres
            values, indices = torch.topk(probs[0], k=5)
            top_genres = [(genre_model.config.id2label[idx.item()], val.item()) for val, idx in zip(values, indices)]
        else:
            # Fallback if model loading failed
            top_genres = [("Unknown", 1.0)]

        # Format genre results for display
        genre_results_text = format_genre_results(top_genres)
        primary_genre = top_genres[0][0]

        # Generate lyrics using LLM
        lyrics = generate_lyrics(music_analysis, primary_genre, duration)

        # Prepare analysis summary
        analysis_summary = f"""
### Music Analysis Results

**Duration:** {duration:.2f} seconds
**Tempo:** {tempo:.1f} BPM
**Time Signature:** {time_signature}
**Key:** {music_analysis["tonal_analysis"]["key"]} {music_analysis["tonal_analysis"]["mode"]}
**Primary Emotion:** {emotion}
**Primary Theme:** {theme}
**Top Genre:** {primary_genre}

{genre_results_text}
"""

        return analysis_summary, lyrics, tempo, time_signature, emotion, theme, primary_genre

    except Exception as e:
        error_msg = f"Error processing audio: {str(e)}"
        print(error_msg)
        return error_msg, None, None, None, None, None, None

def generate_lyrics(music_analysis, genre, duration):
    try:
        # Extract meaningful information for context
        tempo = music_analysis["rhythm_analysis"]["tempo"]
        key = music_analysis["tonal_analysis"]["key"]
        mode = music_analysis["tonal_analysis"]["mode"]
        emotion = music_analysis["emotion_analysis"]["primary_emotion"]
        theme = music_analysis["theme_analysis"]["primary_theme"]

        # Verify LLM is loaded
        if llm_model is None or llm_tokenizer is None:
            return "Error: LLM model not properly loaded"

        # Construct prompt for the LLM with stronger instruction to avoid thinking
        prompt = f"""Write lyrics for a {genre} song with these specifications:
- Key: {key} {mode}
- Tempo: {tempo} BPM
- Emotion: {emotion}
- Theme: {theme}
- Duration: {duration:.1f} seconds
- Time signature: {music_analysis["rhythm_analysis"]["estimated_time_signature"]}

CRITICAL INSTRUCTIONS:
- The lyrics should be in English
- Write ONLY the raw lyrics with no structural labels
- DO NOT include any thinking, reasoning, or explanations
- DO NOT include <think> tags or thinking processes
- DO NOT include [verse], [chorus], [bridge], or any other section markers
- DO NOT number the verses or lines
- DO NOT use bullet points
- Format as simple line-by-line lyrics only
- Make sure the lyrics match the specified duration and tempo
- Keep lyrics concise enough to fit the duration when sung at the given tempo
"""

        # Generate lyrics using the LLM model directly
        messages = [
            {"role": "user", "content": prompt}
        ]

        # Apply chat template
        text = llm_tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        # Tokenize and move to model device
        model_inputs = llm_tokenizer([text], return_tensors="pt").to(llm_model.device)

        # Generate with optimized parameters
        generated_ids = llm_model.generate(
            **model_inputs,
            max_new_tokens=1024,
            do_sample=True,
            temperature=0.6,  # Lower temperature for more focused responses
            top_p=0.9,
            repetition_penalty=1.2,
            pad_token_id=llm_tokenizer.eos_token_id
        )

        # Decode the output
        output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
        lyrics = llm_tokenizer.decode(output_ids, skip_special_tokens=True).strip()

        # ENHANCED post-processing to remove ALL thinking elements

        # Remove thinking tags and their content
        lyrics = re.sub(r'<think>.*?</think>', '', lyrics, flags=re.DOTALL)
        lyrics = re.sub(r'\[thinking\].*?\[/thinking\]', '', lyrics, flags=re.DOTALL)

        # Remove any lines with section labels
        lyrics = re.sub(r'^\[.*?\].*$', '', lyrics, flags=re.MULTILINE)

        # Remove common prefixes and thinking text (expanded list)
        thinking_prefixes = [
            r'^(Here are|Here is|These are|This is|Let me|I will|I\'ll).*?:\s*',
            r'^Okay, let\'s.*$',
            r'^First, I need to.*$',
            r'^Let me brainstorm.*$',
            r'^I\'ll write.*$',
            r'^Let\'s create.*$',
            r'^For a.*song.*$',
            r'^Now I\'ll.*$',
            r'^Taking into account.*$',
            r'^Given the specifications.*$',
            r'^Based on the.*$',
            r'^Considering the.*$'
        ]

        for pattern in thinking_prefixes:
            lyrics = re.sub(pattern, '', lyrics, flags=re.MULTILINE|re.IGNORECASE)

        lyrics = re.sub(r'^Title:.*?$', '', lyrics, flags=re.MULTILINE).strip()

        # Remove all section markers in any format
        lyrics = re.sub(r'^\s*(Verse|Chorus|Bridge|Pre.?Chorus|Intro|Outro|Refrain|Hook|Breakdown)(\s*\d*|\s*[A-Z])?:?\s*$', '', lyrics, flags=re.MULTILINE|re.IGNORECASE)
        lyrics = re.sub(r'\[(Verse|Chorus|Bridge|Pre.?Chorus|Intro|Outro|Refrain|Hook|Breakdown)(\s*\d*|\s*[A-Z])?\]', '', lyrics, flags=re.IGNORECASE)

        # Remove lines with obvious thinking content
        lyrics = re.sub(r'^.*?(think|brainstorm|consider|syllable|count|rhyme|scheme|tempo|calculate|bpm).*$', '', lyrics, flags=re.MULTILINE|re.IGNORECASE)

        # Remove any empty lines at beginning, collapse multiple blank lines, and trim
        lyrics = re.sub(r'^\s*\n', '', lyrics)
        lyrics = re.sub(r'\n\s*\n\s*\n+', '\n\n', lyrics)
        lyrics = lyrics.strip()

        # One final check - if lyrics still starts with obvious thinking, try to find the actual lyrics
        if re.match(r'.*?(I need to|Let me|Okay|Hmm|I will|I\'ll|First|Let\'s|Now).*', lyrics[:100], re.IGNORECASE):
            # Look for a double line break which often separates thinking from lyrics
            parts = lyrics.split('\n\n')
            if len(parts) > 1:
                # Take everything after the first paragraph as the actual lyrics
                lyrics = '\n\n'.join(parts[1:])

        return lyrics

    except Exception as e:
        error_msg = f"Error generating lyrics: {str(e)}"
        print(error_msg)
        return error_msg

# Create Gradio interface
def create_interface():
    with gr.Blocks(title="Music Analysis & Lyrics Generator") as demo:
        gr.Markdown("# Music Analysis & Lyrics Generator")
        gr.Markdown("Upload a music file or record audio to analyze it and generate matching lyrics")

        with gr.Row():
            with gr.Column(scale=1):
                audio_input = gr.Audio(
                    label="Upload or Record Audio",
                    type="filepath",
                    sources=["upload", "microphone"]
                )
                analyze_btn = gr.Button("Analyze and Generate Lyrics", variant="primary")

            with gr.Column(scale=2):
                with gr.Tab("Analysis"):
                    analysis_output = gr.Textbox(label="Music Analysis Results", lines=10)

                    with gr.Row():
                        tempo_output = gr.Number(label="Tempo (BPM)")
                        time_sig_output = gr.Textbox(label="Time Signature")
                        emotion_output = gr.Textbox(label="Primary Emotion")
                        theme_output = gr.Textbox(label="Primary Theme")
                        genre_output = gr.Textbox(label="Primary Genre")

                with gr.Tab("Generated Lyrics"):
                    lyrics_output = gr.Textbox(label="Generated Lyrics", lines=20)

        # Set up event handlers
        analyze_btn.click(
            fn=process_audio,
            inputs=[audio_input],
            outputs=[analysis_output, lyrics_output, tempo_output, time_sig_output,
                     emotion_output, theme_output, genre_output]
        )

        gr.Markdown("""
## How it works
1. Upload or record a music file
2. The system analyzes tempo, beats, time signature and other musical features
3. It detects emotion, theme, and music genre
4. Using this information, it generates lyrics that match the style and length of your music
""")

    return demo

# Launch the app
demo = create_interface()

if __name__ == "__main__":
    demo.launch()
else:
    # For Hugging Face Spaces
    app = demo
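
For reference, a minimal standalone sketch of two of the cleanup steps in generate_lyrics above; the sample model output is invented for illustration:

import re

# Invented sample of raw LLM output mixing "thinking" with lyrics
raw = """<think>Let me brainstorm a chorus first.</think>
[Verse 1]
Moonlight spills across the floor
Shadows dance like waves on shore"""

# Same order of operations as generate_lyrics: strip think blocks first,
# then drop whole lines that carry section markers
cleaned = re.sub(r'<think>.*?</think>', '', raw, flags=re.DOTALL)
cleaned = re.sub(r'^\[.*?\].*$', '', cleaned, flags=re.MULTILINE)
cleaned = re.sub(r'\n\s*\n\s*\n+', '\n\n', cleaned).strip()

print(cleaned)
# Moonlight spills across the floor
# Shadows dance like waves on shore

Removing the <think> block before the line-oriented passes matters, since a think block can span several lines that the per-line regexes would only partially catch.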
app.py
CHANGED
@@ -4,8 +4,8 @@ import gradio as gr
 import torch
 import numpy as np
 import re
-import pronouncing
-import functools
+import pronouncing
+import functools
 from transformers import (
     AutoModelForAudioClassification,
     AutoFeatureExtractor,
@@ -22,8 +22,12 @@ from utils import (
     format_genre_results,
     ensure_cuda_availability
 )
 from emotionanalysis import MusicAnalyzer
 import librosa
+from beat_analysis import BeatAnalyzer  # Import the BeatAnalyzer class
+
+# Initialize beat analyzer
+beat_analyzer = BeatAnalyzer()
 
 # Login to Hugging Face Hub if token is provided
 if "HF_TOKEN" in os.environ:
@@ -85,7 +89,7 @@ music_analyzer = MusicAnalyzer()
 # Process uploaded audio file
 def process_audio(audio_file):
     if audio_file is None:
-        return "No audio file provided", None, None, None, None, None, None
+        return "No audio file provided", None, None, None, None, None, None, None
 
     try:
         # Load and analyze audio
@@ -97,9 +101,24 @@ def process_audio(audio_file):
         # Analyze music with MusicAnalyzer
         music_analysis = music_analyzer.analyze_music(audio_file)
 
+        # Extract time signature from MusicAnalyzer result
+        time_signature = music_analysis["rhythm_analysis"]["estimated_time_signature"]
+
+        # Ensure time signature is one of the supported ones (4/4, 3/4, 2/4, 6/8)
+        if time_signature not in ["4/4", "3/4", "2/4", "6/8"]:
+            time_signature = "4/4"  # Default to 4/4 if unsupported
+            music_analysis["rhythm_analysis"]["estimated_time_signature"] = time_signature
+
+        # Analyze beat patterns and create lyrics template using MusicAnalyzer's time signature
+        beat_analysis = beat_analyzer.analyze_beat_pattern(audio_file, time_signature=time_signature)
+        lyric_templates = beat_analyzer.create_lyric_template(beat_analysis)
+
+        # Store these in the music_analysis dict for use in lyrics generation
+        music_analysis["beat_analysis"] = beat_analysis
+        music_analysis["lyric_templates"] = lyric_templates
+
         # Extract key information
         tempo = music_analysis["rhythm_analysis"]["tempo"]
-        time_signature = music_analysis["rhythm_analysis"]["estimated_time_signature"]
        emotion = music_analysis["emotion_analysis"]["primary_emotion"]
         theme = music_analysis["theme_analysis"]["primary_theme"]
 
@@ -135,6 +154,9 @@ def process_audio(audio_file):
         # Generate lyrics using LLM
         lyrics = generate_lyrics(music_analysis, primary_genre, duration)
 
+        # Create beat/stress/syllable matching analysis
+        beat_match_analysis = analyze_lyrics_rhythm_match(lyrics, lyric_templates, primary_genre)
+
         # Prepare analysis summary
         analysis_summary = f"""
 ### Music Analysis Results
@@ -148,14 +170,26 @@ def process_audio(audio_file):
 **Top Genre:** {primary_genre}
 
 {genre_results_text}
 """
+
+        # Add beat analysis summary
+        if lyric_templates:
+            analysis_summary += f"""
+### Beat Analysis
+
+**Total Phrases:** {len(lyric_templates)}
+**Average Beats Per Phrase:** {np.mean([t['num_beats'] for t in lyric_templates]):.1f}
+**Beat Pattern Examples:**
+- Phrase 1: {lyric_templates[0]['stress_pattern'] if lyric_templates else 'N/A'}
+- Phrase 2: {lyric_templates[1]['stress_pattern'] if len(lyric_templates) > 1 else 'N/A'}
+"""
 
-        return analysis_summary, lyrics, tempo, time_signature, emotion, theme, primary_genre
+        return analysis_summary, lyrics, tempo, time_signature, emotion, theme, primary_genre, beat_match_analysis
 
     except Exception as e:
         error_msg = f"Error processing audio: {str(e)}"
         print(error_msg)
-        return error_msg, None, None, None, None, None, None
+        return error_msg, None, None, None, None, None, None, None
 
 def generate_lyrics(music_analysis, genre, duration):
     try:
@@ -166,33 +200,48 @@ def generate_lyrics(music_analysis, genre, duration):
         emotion = music_analysis["emotion_analysis"]["primary_emotion"]
         theme = music_analysis["theme_analysis"]["primary_theme"]
 
+        # Get beat analysis and templates
+        lyric_templates = music_analysis.get("lyric_templates", [])
+
         # Verify LLM is loaded
         if llm_model is None or llm_tokenizer is None:
             return "Error: LLM model not properly loaded"
-
-        # Construct prompt for the LLM
-        prompt = f"""Write lyrics for a {genre} song with these specifications:
-- Key: {key} {mode}
-- Tempo: {tempo} BPM
-- Emotion: {emotion}
-- Theme: {theme}
-- Duration: {duration:.1f} seconds
-- Time signature: {music_analysis["rhythm_analysis"]["estimated_time_signature"]}
 
-IMPORTANT INSTRUCTIONS:
-
-
-
-
-
-
-
-
-
+        # If no templates, fall back to original method
+        if not lyric_templates:
+            # Simplified prompt
+            prompt = f"""Write song lyrics for a {genre} song in {key} {mode} with tempo {tempo} BPM. The emotion is {emotion} and theme is {theme}.
+
+ONLY WRITE THE ACTUAL LYRICS. NO EXPLANATIONS OR META-TEXT.
+"""
+        else:
+            # Create phrase examples
+            num_phrases = len(lyric_templates)
+
+            # Create a more direct prompt with examples
+            prompt = f"""Write song lyrics for a {genre} song in {key} {mode} with tempo {tempo} BPM. The emotion is {emotion} and theme is {theme}.
+
+I need EXACTLY {num_phrases} lines of lyrics - one line for each musical phrase. Not one more, not one less.
+
+FORMAT:
+- Just write {num_phrases} plain text lines
+- Each line should be simple song lyrics (no annotations, no numbers, no labeling)
+- Don't include any explanations, thinking tags, or meta-commentary
+- Don't use any <think> or [thinking] tags
+- Don't include [Verse], [Chorus] or section markers
+- Don't include line numbers
+
+EXAMPLE OF WHAT I WANT (for a {num_phrases}-line song):
+Lost in the shadows of yesterday
+Dreams fade away like morning dew
+Time slips through fingers like desert sand
+Memories echo in empty rooms
+(... and so on for exactly {num_phrases} lines)
+
+JUST THE PLAIN LYRICS, EXACTLY {num_phrases} LINES.
 """
 
-        # Generate lyrics using the LLM model
-        # Format as chat message
+        # Generate lyrics using the LLM model
         messages = [
             {"role": "user", "content": prompt}
         ]
@@ -214,7 +263,7 @@ IMPORTANT INSTRUCTIONS:
             do_sample=True,
             temperature=0.7,
             top_p=0.9,
-            repetition_penalty=1.
+            repetition_penalty=1.2,
             pad_token_id=llm_tokenizer.eos_token_id
         )
 
@@ -222,33 +271,289 @@ IMPORTANT INSTRUCTIONS:
         output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
         lyrics = llm_tokenizer.decode(output_ids, skip_special_tokens=True).strip()
 
-        #
-        #
-        lyrics = re.sub(r'^\[.*?\].*$', '', lyrics, flags=re.MULTILINE)
-
-        #
-
-
-
-        # Remove
-        lyrics = re.sub(r'
-        lyrics = re.sub(r'\[
-
-        #
-
-
-        #
-
-
-
-
-
+        # ULTRA AGGRESSIVE CLEANING - COMPLETELY REVISED
+        # ------------------------------------------------
+
+        # 1. First, look for any standard dividers that might separate thinking from lyrics
+        divider_patterns = [
+            r'Here are the lyrics:',
+            r'Here is my song:',
+            r'The lyrics:',
+            r'My lyrics:',
+            r'Song lyrics:',
+            r'\*\*\*+',
+            r'===+',
+            r'---+',
+            r'```',
+            r'Lyrics:'
+        ]
+
+        for pattern in divider_patterns:
+            matches = re.finditer(pattern, lyrics, re.IGNORECASE)
+            for match in matches:
+                # Keep only content after the divider
+                lyrics = lyrics[match.end():].strip()
+
+        # 2. Remove thinking tags completely before splitting into lines
+        lyrics = re.sub(r'<think>.*?</think>', '', lyrics, flags=re.DOTALL)
+        lyrics = re.sub(r'\[thinking\].*?\[/thinking\]', '', lyrics, flags=re.DOTALL)
+        lyrics = re.sub(r'<think>', '', lyrics, flags=re.DOTALL)
+        lyrics = re.sub(r'</think>', '', lyrics, flags=re.DOTALL)
+        lyrics = re.sub(r'\[thinking\]', '', lyrics, flags=re.DOTALL)
+        lyrics = re.sub(r'\[/thinking\]', '', lyrics, flags=re.DOTALL)
+
+        # 3. Split text into lines for aggressive line-by-line filtering
+        lines = lyrics.strip().split('\n')
+        clean_lines = []
+
+        # 4. Define comprehensive patterns for non-lyrical content
+        non_lyric_patterns = [
+            # Meta-commentary
+            r'^(note|thinking|thoughts|let me|i will|i am going|i would|i can|i need to|i have to|i should|let\'s|here|now)',
+            r'^(first|second|third|next|finally|importantly|remember|so|ok|okay|as requested|as asked|considering)',
+            # Explanations
+            r'syllable[s]?|phrase|rhythm|beats?|tempo|bpm|instruction|follow|alignment|match|corresponding',
+            r'verses?|chorus|bridge|section|stanza|part|template|format|pattern|example',
+            r'requirements?|guidelines?|song structure|stressed|unstressed',
+            # Technical language
+            r'generated|output|result|provide|create|write|draft|version',
+            # Annotations and numbering
+            r'^line \d+|^\d+[\.\):]|^\[\w+\]|^[\*\-\+] ',
+            # Questions or analytical statements
+            r'\?$|analysis|evaluate|review|check|ensure',
+            # Instruction-like statements
+            r'make sure|please note|important|notice|pay attention'
+        ]
+
+        # 5. Identify which lines are likely actual lyrics vs non-lyrics
+        for line in lines:
+            line = line.strip()
+
+            # Skip empty lines or lines with just spaces/tabs
+            if not line or line.isspace():
+                continue
+
+            # Skip lines that match any non-lyric pattern
+            should_skip = False
+            for pattern in non_lyric_patterns:
+                if re.search(pattern, line.lower()):
+                    should_skip = True
+                    break
+
+            if should_skip:
+                continue
+
+            # Skip section headers
+            if (line.startswith('[') and ']' in line) or (line.startswith('(') and ')' in line and len(line) < 20):
+                continue
+
+            # Skip lines that look like annotations (not prose-like)
+            if ':' in line and not any(word in line.lower() for word in ['like', 'when', 'where', 'how', 'why', 'what']):
+                if len(line.split(':')[0]) < 15:  # Short prefixes followed by colon are likely annotations
+                    continue
+
+            # Skip very short lines that aren't likely to be lyrics (unless it's just a few words which could be valid)
+            if len(line) < 3:
+                continue
+
+            # Skip lines that are numbered or bulleted
+            if re.match(r'^\d+\.|\(#\d+\)|\d+\)', line):
+                continue
+
+            # Skip markdown-style emphasis or headers
+            if re.match(r'^#{1,6} |^\*\*|^__', line):
+                continue
+
+            # Skip lines with think tags
+            if '<think>' in line.lower() or '</think>' in line.lower() or '[thinking]' in line.lower() or '[/thinking]' in line.lower():
+                continue
+
+            # Add this line as it passed all filters
+            clean_lines.append(line)
+
+        # 6. Additional block-level filters for common patterns
+        # Check beginning of lyrics for common prefixes
+        if clean_lines and any(clean_lines[0].lower().startswith(prefix) for prefix in
+                               ['here are', 'these are', 'below are', 'following are']):
+            clean_lines = clean_lines[1:]  # Skip the first line
+
+        # 7. Process blocks of lines to detect explanation blocks
+        if len(clean_lines) > 3:
+            # Check for explanation blocks at the beginning
+            first_three = ' '.join(clean_lines[:3]).lower()
+            if any(term in first_three for term in ['i will', 'i have created', 'i\'ll provide', 'i\'ll write']):
+                # This looks like an explanation, skip the first few lines
+                start_idx = 0
+                for i, line in enumerate(clean_lines):
+                    if i >= 3 and not any(term in line.lower() for term in ['i will', 'created', 'write', 'provide']):
+                        start_idx = i
+                        break
+                clean_lines = clean_lines[start_idx:]
+
+            # Check for explanation blocks at the end
+            last_three = ' '.join(clean_lines[-3:]).lower()
+            if any(term in last_three for term in ['hope this', 'these lyrics', 'as you can see', 'this song', 'i have']):
+                # This looks like an explanation at the end, truncate
+                end_idx = len(clean_lines)
+                for i in range(len(clean_lines) - 1, max(0, len(clean_lines) - 4), -1):
+                    if i < len(clean_lines) and not any(term in clean_lines[i].lower() for term in
+                                                        ['hope', 'these lyrics', 'as you can see', 'this song']):
+                        end_idx = i + 1
+                        break
+                clean_lines = clean_lines[:end_idx]
+
+        # 8. Cleanup - Remove remaining annotations or thinking
+        for i in range(len(clean_lines)):
+            # Remove trailing thoughts/annotations
+            clean_lines[i] = re.sub(r'\s+//.*$', '', clean_lines[i])
+            clean_lines[i] = re.sub(r'\s+\(.*?\)$', '', clean_lines[i])
+
+            # Remove thinking tags completely
+            clean_lines[i] = re.sub(r'<think>.*?</think>', '', clean_lines[i], flags=re.DOTALL)
+            clean_lines[i] = re.sub(r'\[thinking\].*?\[/thinking\]', '', clean_lines[i], flags=re.DOTALL)
+            clean_lines[i] = re.sub(r'<think>', '', clean_lines[i])
+            clean_lines[i] = re.sub(r'</think>', '', clean_lines[i])
+            clean_lines[i] = re.sub(r'\[thinking\]', '', clean_lines[i])
+            clean_lines[i] = re.sub(r'\[/thinking\]', '', clean_lines[i])
+
+        # 9. Filter out any remaining empty lines after tag removal
+        clean_lines = [line for line in clean_lines if line.strip() and not line.isspace()]
+
+        # 10. If we have lyric templates, ensure we have the correct number of lines
+        if lyric_templates:
+            num_required = len(lyric_templates)
+
+            # If we have too many lines, keep just the best ones
+            if len(clean_lines) > num_required:
+                # Keep the first num_required lines
+                clean_lines = clean_lines[:num_required]
+
+            # If we don't have enough lines, generate placeholders
+            while len(clean_lines) < num_required:
+                placeholder = f"Echoes of {emotion} fill the {genre} night"
+                if len(clean_lines) > 0:
+                    # Try to make the placeholder somewhat related to previous lines
+                    last_words = [word for line in clean_lines[-1:] for word in line.split() if len(word) > 3]
+                    if last_words:
+                        import random
+                        word = random.choice(last_words)
+                        placeholder = f"{word.capitalize()} whispers through the {emotion} silence"
+
+                clean_lines.append(placeholder)
+
+        # Assemble final lyrics
+        final_lyrics = '\n'.join(clean_lines)
+
+        # 11. Final sanity check - if we have nothing or garbage, return an error
+        if not final_lyrics or len(final_lyrics) < 10:
+            return "The model generated only thinking content but no actual lyrics. Please try again."
+
+        return final_lyrics
 
     except Exception as e:
         error_msg = f"Error generating lyrics: {str(e)}"
         print(error_msg)
         return error_msg
 
+def analyze_lyrics_rhythm_match(lyrics, lyric_templates, genre="pop"):
+    """Analyze how well the generated lyrics match the beat patterns and syllable requirements"""
+    if not lyric_templates or not lyrics:
+        return "No beat templates or lyrics available for analysis."
+
+    # Split lyrics into lines
+    lines = lyrics.strip().split('\n')
+    lines = [line for line in lines if line.strip()]  # Remove empty lines
+
+    # Prepare analysis result
+    result = "### Beat & Syllable Match Analysis\n\n"
+    result += "| Line | Syllables | Target Range | Match | Stress Pattern |\n"
+    result += "| ---- | --------- | ------------ | ----- | -------------- |\n"
+
+    # Maximum number of lines to analyze (either all lines or all templates)
+    line_count = min(len(lines), len(lyric_templates))
+
+    # Track overall match statistics
+    total_matches = 0
+    total_range_matches = 0
+    total_stress_matches = 0
+    total_stress_percentage = 0
+    total_ideal_matches = 0
+
+    for i in range(line_count):
+        line = lines[i]
+        template = lyric_templates[i]
+
+        # Check match between line and template with genre awareness
+        check_result = beat_analyzer.check_syllable_stress_match(line, template, genre)
+
+        # Get match symbols
+        syllable_match = "✓" if check_result["matches_beat_count"] else ("✓*" if check_result["within_range"] else "✗")
+        stress_match = "✓" if check_result["stress_matches"] else f"{int(check_result['stress_match_percentage']*100)}%"
+
+        # Update stats
+        if check_result["matches_beat_count"]:
+            total_matches += 1
+        if check_result["within_range"]:
+            total_range_matches += 1
+        if check_result["stress_matches"]:
+            total_stress_matches += 1
+        total_stress_percentage += check_result["stress_match_percentage"]
+
+        # Track how close we are to ideal count for this genre
+        if abs(check_result["syllable_count"] - check_result["ideal_syllable_count"]) <= 1:
+            total_ideal_matches += 1
+
+        # Create visual representation of the stress pattern
+        stress_visual = ""
+        for char in template['stress_pattern']:
+            if char == "S":
+                stress_visual += "X"  # Strong
+            elif char == "M":
+                stress_visual += "x"  # Medium
+            else:
+                stress_visual += "."  # Weak
+
+        # Add line to results table
+        result += f"| {i+1} | {check_result['syllable_count']} | {check_result['min_expected']}-{check_result['max_expected']} | {syllable_match} | {stress_visual} |\n"
+
+    # Add summary statistics
+    if line_count > 0:
+        exact_match_rate = (total_matches / line_count) * 100
+        range_match_rate = (total_range_matches / line_count) * 100
+        ideal_match_rate = (total_ideal_matches / line_count) * 100
+        stress_match_rate = (total_stress_matches / line_count) * 100
+        avg_stress_percentage = (total_stress_percentage / line_count) * 100
+
+        result += f"\n**Summary:**\n"
+        result += f"- Exact syllable match rate: {exact_match_rate:.1f}%\n"
+        result += f"- Genre-appropriate syllable range match rate: {range_match_rate:.1f}%\n"
+        result += f"- Ideal genre syllable count match rate: {ideal_match_rate:.1f}%\n"
+        result += f"- Perfect stress pattern match rate: {stress_match_rate:.1f}%\n"
+        result += f"- Average stress pattern accuracy: {avg_stress_percentage:.1f}%\n"
+        result += f"- Overall rhythmic accuracy: {((range_match_rate + avg_stress_percentage) / 2):.1f}%\n"
+
+        # Add genre-specific notes
+        result += f"\n**Genre Notes ({genre}):**\n"
+
+        # Add appropriate genre notes based on genre
+        if genre.lower() == "pop":
+            result += "- Pop music typically allows 1-3 syllables per beat using melisma and syncopation\n"
+            result += "- Strong downbeats often align with stressed syllables of important words\n"
+        elif genre.lower() == "rock":
+            result += "- Rock music often uses 1-2 syllables per beat with some variation\n"
+            result += "- Emphasis on strong beats for impact and rhythmic drive\n"
+        elif genre.lower() in ["hiphop", "rap"]:
+            result += "- Hip-hop/rap often features 2-5 syllables per beat through rapid delivery\n"
+            result += "- Complex rhyme patterns and fast delivery create higher syllable density\n"
+        elif genre.lower() in ["folk", "country"]:
+            result += "- Folk/country music often stays closer to 1:1 syllable-to-beat ratio\n"
+            result += "- Narrative focus leads to clearer enunciation of syllables\n"
+        else:
+            result += "- This genre typically allows for flexible syllable-to-beat relationships\n"
+            result += "- Syllable count can vary based on vocal style and song section\n"
+
+    return result
+
 # Create Gradio interface
 def create_interface():
     with gr.Blocks(title="Music Analysis & Lyrics Generator") as demo:
@@ -277,13 +582,16 @@ def create_interface():
 
             with gr.Tab("Generated Lyrics"):
                 lyrics_output = gr.Textbox(label="Generated Lyrics", lines=20)
+
+            with gr.Tab("Beat Matching"):
+                beat_match_output = gr.Markdown(label="Beat & Syllable Matching Analysis")
 
         # Set up event handlers
        analyze_btn.click(
             fn=process_audio,
             inputs=[audio_input],
             outputs=[analysis_output, lyrics_output, tempo_output, time_sig_output,
-                     emotion_output, theme_output, genre_output]
+                     emotion_output, theme_output, genre_output, beat_match_output]
         )
 
         gr.Markdown("""
@@ -291,7 +599,8 @@ def create_interface():
 1. Upload or record a music file
 2. The system analyzes tempo, beats, time signature and other musical features
 3. It detects emotion, theme, and music genre
-4. Using this information, it generates lyrics that match the style and length of your music
+4. Using beat patterns and syllable stress analysis, it generates perfectly aligned lyrics
+5. Each line of the lyrics is matched to the beat pattern of the corresponding musical phrase
 """)
 
     return demo
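
A minimal sketch of how analyze_lyrics_rhythm_match (added above) renders its report; the check results here are stubbed dicts rather than output of BeatAnalyzer.check_syllable_stress_match, and the stress pattern is read from the stub instead of the template:

stubbed = [
    {"syllable_count": 8, "min_expected": 3, "max_expected": 12,
     "matches_beat_count": False, "within_range": True, "stress_pattern": "SWMW"},
    {"syllable_count": 4, "min_expected": 3, "max_expected": 12,
     "matches_beat_count": True, "within_range": True, "stress_pattern": "SWMW"},
]

rows = ["| Line | Syllables | Target Range | Match | Stress Pattern |",
        "| ---- | --------- | ------------ | ----- | -------------- |"]
for i, r in enumerate(stubbed):
    # Same symbol logic as the function above: exact match, in-range, or miss
    match = "✓" if r["matches_beat_count"] else ("✓*" if r["within_range"] else "✗")
    # S -> X (strong), M -> x (medium), anything else -> . (weak)
    visual = "".join({"S": "X", "M": "x"}.get(c, ".") for c in r["stress_pattern"])
    rows.append(f"| {i+1} | {r['syllable_count']} | {r['min_expected']}-{r['max_expected']} | {match} | {visual} |")

print("\n".join(rows))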
beat_analysis.py
ADDED
|
@@ -0,0 +1,392 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import librosa
|
| 2 |
+
import numpy as np
|
| 3 |
+
import pronouncing
|
| 4 |
+
import re
|
| 5 |
+
from functools import lru_cache
|
| 6 |
+
import string
|
| 7 |
+
from nltk.corpus import cmudict
|
| 8 |
+
import nltk
|
| 9 |
+
|
| 10 |
+
try:
|
| 11 |
+
nltk.data.find('corpora/cmudict')
|
| 12 |
+
except LookupError:
|
| 13 |
+
nltk.download('cmudict')
|
| 14 |
+
|
| 15 |
+
class BeatAnalyzer:
|
| 16 |
+
def __init__(self):
|
| 17 |
+
# Mapping for standard stress patterns by time signature
|
| 18 |
+
# Simplified to only include 4/4, 3/4, 2/4, and 6/8
|
| 19 |
+
self.stress_patterns = {
|
| 20 |
+
# Format: Strong (1.0), Medium (0.5), Weak (0.0)
|
| 21 |
+
"4/4": [1.0, 0.0, 0.5, 0.0], # Strong, weak, medium, weak
|
| 22 |
+
"3/4": [1.0, 0.0, 0.0], # Strong, weak, weak
|
| 23 |
+
"2/4": [1.0, 0.0], # Strong, weak
|
| 24 |
+
"6/8": [1.0, 0.0, 0.0, 0.5, 0.0, 0.0] # Strong, weak, weak, medium, weak, weak
|
| 25 |
+
}
|
| 26 |
+
|
| 27 |
+
self.cmudict = None
|
| 28 |
+
try:
|
| 29 |
+
self.cmudict = cmudict.dict()
|
| 30 |
+
except:
|
| 31 |
+
pass # Fall back to rule-based counting if cmudict is not available
|
| 32 |
+
|
| 33 |
+
# Genre-specific syllable-to-beat ratio guidelines
|
| 34 |
+
self.genre_syllable_ratios = {
|
| 35 |
+
# Genre: (min_ratio, typical_ratio, max_ratio)
|
| 36 |
+
'pop': (0.7, 1.5, 3.0), # Pop tends to have more syllables per beat
|
| 37 |
+
'rock': (0.7, 1.2, 2.5), # Rock can vary widely
|
| 38 |
+
'hiphop': (1.5, 3.0, 5.0), # Hip hop often has many syllables per beat
|
| 39 |
+
'rap': (2.0, 4.0, 7.0), # Rap often has very high syllable counts
|
| 40 |
+
'folk': (0.8, 1.0, 1.5), # Folk often has close to 1:1 ratio
|
| 41 |
+
'country': (0.7, 1.2, 2.0), # Country tends to be moderate
|
| 42 |
+
'jazz': (0.5, 1.0, 3.0), # Jazz can be very flexible
|
| 43 |
+
'reggae': (0.6, 1.0, 1.5), # Reggae often emphasizes specific beats
|
| 44 |
+
'soul': (0.7, 1.2, 2.0), # Soul music tends to be expressive
|
| 45 |
+
'r&b': (0.8, 1.5, 2.5), # R&B can have melisma
|
| 46 |
+
'electronic': (0.5, 1.0, 2.0), # Electronic music varies widely
|
| 47 |
+
'disco': (1.0, 1.5, 2.5), # Disco tends to have more syllables
|
| 48 |
+
'classical': (0.5, 1.0, 2.0), # Classical can vary by subgenre
|
| 49 |
+
'metal': (0.8, 1.5, 3.0), # Metal often has more syllables on strong beats
|
| 50 |
+
'blues': (0.5, 0.8, 1.5), # Blues often extends syllables
|
| 51 |
+
'default': (0.7, 1.5, 3.0) # Default for unknown genres
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
@lru_cache(maxsize=128)
|
| 55 |
+
def count_syllables(self, word):
|
| 56 |
+
"""Count syllables in a word using CMU dictionary if available, otherwise use rule-based method."""
|
| 57 |
+
word = word.lower().strip()
|
| 58 |
+
word = re.sub(r'[^a-z]', '', word) # Remove non-alphabetic characters
|
| 59 |
+
|
| 60 |
+
if not word:
|
| 61 |
+
return 0
|
| 62 |
+
|
| 63 |
+
# Try using CMUDict first if available
|
| 64 |
+
if self.cmudict and word in self.cmudict:
|
| 65 |
+
return max([len(list(y for y in x if y[-1].isdigit())) for x in self.cmudict[word]])
|
| 66 |
+
|
| 67 |
+
# Rule-based syllable counting as fallback
|
| 68 |
+
# Modified version from NLTK's implementation
|
| 69 |
+
vowels = "aeiouy"
|
| 70 |
+
double_vowels = ['aa', 'ae', 'ai', 'ao', 'au', 'ay', 'ea', 'ee', 'ei', 'eo', 'eu', 'ey', 'ia', 'ie', 'ii', 'io', 'iu', 'oa', 'oe', 'oi', 'oo', 'ou', 'oy', 'ua', 'ue', 'ui', 'uo', 'uy']
|
| 71 |
+
prev_was_vowel = False
|
| 72 |
+
count = 0
|
| 73 |
+
final_e = False
|
| 74 |
+
|
| 75 |
+
if word.endswith('e') and not word.endswith('le'):
|
| 76 |
+
final_e = True
|
| 77 |
+
|
| 78 |
+
for i, char in enumerate(word):
|
| 79 |
+
if char in vowels:
|
| 80 |
+
# Check if current char and previous char form a dipthong
|
| 81 |
+
if prev_was_vowel and i > 0 and (word[i-1:i+1] in double_vowels):
|
| 82 |
+
prev_was_vowel = True
|
| 83 |
+
continue
|
| 84 |
+
|
| 85 |
+
if not prev_was_vowel:
|
| 86 |
+
count += 1
|
| 87 |
+
prev_was_vowel = True
|
| 88 |
+
else:
|
| 89 |
+
prev_was_vowel = False
|
| 90 |
+
|
| 91 |
+
# Handle edge cases
|
| 92 |
+
if word.endswith('le') and len(word) > 2 and word[-3] not in vowels:
|
| 93 |
+
count += 1
|
| 94 |
+
elif final_e:
|
| 95 |
+
count = max(count-1, 1) # Remove last 'e', but ensure at least 1 syllable
|
| 96 |
+
elif word.endswith('y') and not prev_was_vowel:
|
| 97 |
+
count += 1
|
| 98 |
+
|
| 99 |
+
# Ensure at least one syllable
|
| 100 |
+
return max(count, 1)
|
| 101 |
+
|
| 102 |
+
def analyze_beat_pattern(self, audio_path, sr=22050, time_signature="4/4"):
|
| 103 |
+
"""Analyze beat patterns and stresses in music using the provided time signature."""
|
| 104 |
+
# Load audio
|
| 105 |
+
y, sr = librosa.load(audio_path, sr=sr)
|
| 106 |
+
|
| 107 |
+
# Get tempo and beat frames
|
| 108 |
+
tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
|
| 109 |
+
beat_times = librosa.frames_to_time(beat_frames, sr=sr)
|
| 110 |
+
|
| 111 |
+
# Get beat strengths using onset envelope
|
| 112 |
+
        # Sample the onset strength envelope at the detected beats
        onset_env = librosa.onset.onset_strength(y=y, sr=sr)
        beat_strengths = onset_env[beat_frames]

        # Normalize beat strengths to the [0, 1] range
        if len(beat_strengths) > 0 and np.max(beat_strengths) > np.min(beat_strengths):
            beat_strengths = (beat_strengths - np.min(beat_strengths)) / (np.max(beat_strengths) - np.min(beat_strengths))

        # Parse time signature
        if '/' in time_signature:
            num, denom = map(int, time_signature.split('/'))
        else:
            num, denom = 4, 4  # Default to 4/4

        # Group beats into bars (each bar is one phrase based on time signature)
        bars = []
        current_bar = []

        for i, (time, strength) in enumerate(zip(beat_times, beat_strengths)):
            # Determine metrical position and stress
            metrical_position = i % num

            # Define the stress pattern according to the time signature
            if time_signature == "4/4":
                if metrical_position == 0:    # First beat (strongest)
                    stress = "S"  # Strong
                elif metrical_position == 2:  # Third beat (medium)
                    stress = "M"  # Medium
                else:                         # Second and fourth beats (weak)
                    stress = "W"  # Weak
            elif time_signature == "3/4":
                if metrical_position == 0:    # First beat (strongest)
                    stress = "S"  # Strong
                else:                         # Other beats (weak)
                    stress = "W"  # Weak
            elif time_signature == "6/8":
                if metrical_position == 0:    # First beat (strongest)
                    stress = "S"  # Strong
                elif metrical_position == 3:  # Fourth beat (medium)
                    stress = "M"  # Medium
                else:                         # Other beats (weak)
                    stress = "W"  # Weak
            elif time_signature == "2/4":
                if metrical_position == 0:    # First beat (strongest)
                    stress = "S"  # Strong
                else:                         # Second beat (weak)
                    stress = "W"  # Weak
            else:
                # Default pattern for other time signatures
                if metrical_position == 0:
                    stress = "S"
                else:
                    stress = "W"

            # Add the beat to the current bar
            current_bar.append({
                'time': time,
                'strength': strength,
                'stress': stress,
                'metrical_position': metrical_position
            })

            # When we complete a bar, add it to the bars list
            if metrical_position == num - 1 or i == len(beat_times) - 1:
                if current_bar:
                    bars.append(current_bar)
                    current_bar = []

        # If there are any remaining beats, add them as a partial bar
        if current_bar:
            bars.append(current_bar)

        # Organize beats into phrases (one phrase = one bar)
        phrases = []

        for i, bar in enumerate(bars):
            phrase_beats = bar

            if not phrase_beats:
                continue

            # Calculate the phrase information; the end time extends one
            # inter-beat interval past the last beat (0.5 s for a single beat)
            phrase = {
                'id': i,
                'num_beats': len(phrase_beats),
                'beats': phrase_beats,
                'stress_pattern': ''.join(beat['stress'] for beat in phrase_beats),
                'start_time': phrase_beats[0]['time'],
                'end_time': phrase_beats[-1]['time'] + (phrase_beats[-1]['time'] - phrase_beats[-2]['time'] if len(phrase_beats) > 1 else 0.5),
            }

            phrases.append(phrase)

        return {
            'tempo': tempo,
            'time_signature': time_signature,
            'num_beats': len(beat_times),
            'beat_times': beat_times.tolist(),
            'beat_strengths': beat_strengths.tolist(),
            'phrases': phrases
        }
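For orientation, here is a minimal sketch of consuming the dictionary this block returns. The values are hypothetical, written out by hand rather than produced by running the analyzer (real `phrases` entries also carry their full `beats` lists):

# Hypothetical analysis result with the shape returned above
analysis = {
    'tempo': 96.0,
    'time_signature': '4/4',
    'num_beats': 8,
    'beat_times': [0.0, 0.62, 1.25, 1.88, 2.5, 3.12, 3.75, 4.38],
    'beat_strengths': [1.0, 0.2, 0.55, 0.18, 0.95, 0.22, 0.5, 0.2],
    'phrases': [
        {'id': 0, 'num_beats': 4, 'stress_pattern': 'SWMW',
         'start_time': 0.0, 'end_time': 2.5, 'beats': []},
        {'id': 1, 'num_beats': 4, 'stress_pattern': 'SWMW',
         'start_time': 2.5, 'end_time': 5.0, 'beats': []},
    ],
}

for phrase in analysis['phrases']:
    print(f"Phrase {phrase['id']}: {phrase['stress_pattern']} "
          f"({phrase['start_time']:.2f}s-{phrase['end_time']:.2f}s)")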
    def create_lyric_template(self, beat_analysis):
        """Create templates for lyrics based on beat phrases."""
        templates = []

        if not beat_analysis or 'phrases' not in beat_analysis:
            return templates

        phrases = beat_analysis['phrases']

        for i, phrase in enumerate(phrases):
            duration = phrase['end_time'] - phrase['start_time']

            template = {
                'id': phrase['id'],
                'start_time': phrase['start_time'],
                'end_time': phrase['end_time'],
                'duration': duration,
                'num_beats': phrase['num_beats'],
                'stress_pattern': phrase['stress_pattern'],
                'syllable_guide': self.generate_phrase_guide(phrase)
            }

            templates.append(template)

        return templates
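A quick usage sketch, assuming `analyzer` is an instance of this class and `analysis` is a result shaped like the hypothetical one above:

# Hypothetical usage: one lyric template per detected phrase
templates = analyzer.create_lyric_template(analysis)
for t in templates:
    print(t['id'], t['num_beats'], t['syllable_guide'])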
    def generate_phrase_guide(self, template, words_per_beat=0.5):
        """Generate a guide for each phrase to help the LLM."""
        num_beats = template['num_beats']
        stress_pattern = template['stress_pattern']

        # Create a visual representation of the stress pattern
        # S = Strong stress, M = Medium stress, W = Weak stress
        visual_pattern = ""
        for stress in stress_pattern:
            if stress == "S":
                visual_pattern += "STRONG "
            elif stress == "M":
                visual_pattern += "medium "
            else:
                visual_pattern += "weak "

        # Estimate the number of words from the beat count (a rough heuristic)
        est_words = max(1, int(num_beats * words_per_beat))

        # Estimate syllables as a flexible range: a typical song carries
        # roughly 1-3 syllables per beat
        min_syllables = num_beats
        max_syllables = num_beats * 3

        guide = f"~{est_words} words, ~{min_syllables}-{max_syllables} syllables | Pattern: {visual_pattern}"
        return guide
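Traced by hand, a 4-beat `SWMW` phrase with the default `words_per_beat=0.5` produces the following guide string (note the trailing space left by the pattern builder):

guide = analyzer.generate_phrase_guide({'num_beats': 4, 'stress_pattern': 'SWMW'})
# est_words = max(1, int(4 * 0.5)) = 2; syllable range = 4 to 12
# guide == "~2 words, ~4-12 syllables | Pattern: STRONG weak medium weak "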
    def check_syllable_stress_match(self, text, template, genre="pop"):
        """Check if lyrics match the syllable and stress pattern, with genre-specific flexibility."""
        # Split text into words and count syllables
        words = text.split()
        syllable_count = sum(self.count_syllables(word) for word in words)

        # Expected syllable count is based on the number of beats
        expected_count = template['num_beats']

        # Get syllable-to-beat ratios for this genre
        genre_lower = genre.lower()
        if genre_lower in self.genre_syllable_ratios:
            min_ratio, typical_ratio, max_ratio = self.genre_syllable_ratios[genre_lower]
        else:
            min_ratio, typical_ratio, max_ratio = self.genre_syllable_ratios['default']

        # Calculate flexible min and max syllable expectations based on genre
        min_expected = max(1, int(expected_count * min_ratio))
        max_expected = int(expected_count * max_ratio)

        # Check whether the syllable count falls within the genre-appropriate range
        within_range = min_expected <= syllable_count <= max_expected

        # Consider the typical ratio: how close are we to the ideal for this genre?
        ideal_count = int(expected_count * typical_ratio)
        closeness_to_ideal = 1.0 - min(abs(syllable_count - ideal_count) / (max_expected - min_expected + 1), 1.0)

        # Get a per-word syllable breakdown for stress analysis
        word_syllables = [self.count_syllables(word) for word in words]

        # Analyze the stress pattern match using a flexible approach:
        # map syllables onto beats, then score the alignment
        stress_pattern = template['stress_pattern']

        # Simple stress-matching algorithm (can be improved in future versions)
        syllable_to_beat_mapping = self._map_syllables_to_beats(word_syllables, stress_pattern)

        # Score based on how stressed syllables align with strong beats
        stress_match_percentage = self._calculate_stress_match(words, word_syllables, syllable_to_beat_mapping, stress_pattern)

        # Consider it a stress match if the percentage is high enough
        stress_matches = stress_match_percentage >= 0.7

        return {
            'syllable_count': syllable_count,
            'expected_count': expected_count,
            'min_expected': min_expected,
            'max_expected': max_expected,
            'within_range': within_range,
            'matches_beat_count': syllable_count == expected_count,  # Exact match (strict)
            'close_match': within_range,  # Flexible match (based on genre)
            'stress_matches': stress_matches,
            'stress_match_percentage': stress_match_percentage,
            'closeness_to_ideal': closeness_to_ideal,
            'word_syllables': word_syllables,
            'ideal_syllable_count': ideal_count
        }
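A usage sketch, assuming `genre_syllable_ratios` (defined in the constructor, outside this excerpt) maps each genre to a `(min, typical, max)` syllables-per-beat tuple, e.g. a hypothetical `'pop': (0.9, 1.5, 2.5)`:

result = analyzer.check_syllable_stress_match(
    "Dancing in the moonlight",            # 6 syllables
    {'num_beats': 4, 'stress_pattern': 'SWMW'},
    genre="pop",
)
# With the hypothetical pop ratios: min_expected = 3, max_expected = 10,
# so 6 syllables is a close_match even though it is not an exact beat match
print(result['syllable_count'], result['close_match'], result['stress_match_percentage'])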
    def _map_syllables_to_beats(self, word_syllables, stress_pattern):
        """Map syllables to beats in a flexible way."""
        total_syllables = sum(word_syllables)
        total_beats = len(stress_pattern)

        # Simple proportional mapping for now; this could be improved with
        # more sophisticated alignment algorithms
        if total_syllables <= total_beats:
            # Fewer syllables than beats: some beats carry no syllable (prolongation)
            mapping = []
            syllable_index = 0
            for beat_index in range(total_beats):
                if syllable_index < total_syllables:
                    mapping.append((syllable_index, beat_index))
                    syllable_index += 1
            return mapping
        else:
            # More syllables than beats: some beats carry multiple syllables (melisma/syncopation)
            mapping = []
            syllables_per_beat = total_syllables / total_beats
            for beat_index in range(total_beats):
                start_syllable = int(beat_index * syllables_per_beat)
                end_syllable = int((beat_index + 1) * syllables_per_beat)
                for syllable_index in range(start_syllable, end_syllable):
                    if syllable_index < total_syllables:
                        mapping.append((syllable_index, beat_index))
            return mapping
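Traced by hand through the proportional branch: six syllables over a four-beat `SWMW` pattern gives `syllables_per_beat = 1.5`, so beats 1 and 3 each receive two syllables:

mapping = analyzer._map_syllables_to_beats([2, 1, 1, 2], 'SWMW')  # 6 syllables, 4 beats
assert mapping == [(0, 0), (1, 1), (2, 1), (3, 2), (4, 3), (5, 3)]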
    def _calculate_stress_match(self, words, word_syllables, syllable_to_beat_mapping, stress_pattern):
        """Calculate how well syllable stresses match beat stresses."""
        # This is a simplified model: it assumes the first syllable of each
        # word is stressed. Real stress analysis would be more complex.

        # Build a flat list of syllable stresses (1 = stressed, 0 = unstressed)
        syllable_stresses = []
        for word, syllable_count in zip(words, word_syllables):
            for i in range(syllable_count):
                if i == 0:  # First syllable of the word
                    syllable_stresses.append(1)  # Stressed
                else:
                    syllable_stresses.append(0)  # Unstressed

        # Count matches between syllable stress and beat stress
        matches = 0
        total_mapped = 0

        for syllable_index, beat_index in syllable_to_beat_mapping:
            if syllable_index < len(syllable_stresses):
                syllable_stress = syllable_stresses[syllable_index]
                beat_stress = 1 if stress_pattern[beat_index] == 'S' else (0.5 if stress_pattern[beat_index] == 'M' else 0)

                # Count as a match when:
                # - a stressed syllable lands on a Strong beat
                # - an unstressed syllable lands on a Weak beat
                # - partial credit for a stressed syllable on a Medium beat
                if (syllable_stress == 1 and beat_stress > 0.5) or (syllable_stress == 0 and beat_stress < 0.5):
                    matches += 1
                elif syllable_stress == 1 and beat_stress == 0.5:  # Stressed syllable on a Medium beat
                    matches += 0.7

                total_mapped += 1

        if total_mapped == 0:
            return 0

        return matches / total_mapped
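Continuing the same hand-traced example for `words = ['dancing', 'in', 'the', 'moonlight']`: the first-syllable heuristic gives stresses `[1, 0, 1, 1, 1, 0]`, the `SWMW` beats score `[1, 0, 0.5, 0]`, and the mapping above pairs them for `1 + 1 + 0 + 0.7 + 0 + 1 = 3.7` match points over 6 mapped syllables:

score = analyzer._calculate_stress_match(
    ['dancing', 'in', 'the', 'moonlight'], [2, 1, 1, 2],
    [(0, 0), (1, 1), (2, 1), (3, 2), (4, 3), (5, 3)], 'SWMW')
# score = 3.7 / 6 ≈ 0.62, just below the 0.7 threshold used above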
emotionanalysis.py
CHANGED
@@ -36,15 +36,12 @@ class MusicAnalyzer:
         self.key_names = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
 
         # Common time signatures and their beat patterns with weights for prior probability
+        # Simplified to only include 4/4, 3/4, 2/4, and 6/8
         self.common_time_signatures = {
-            "4/4": {"beats_per_bar": 4, "beat_pattern": [1.0, 0.2, 0.5, 0.2], "weight": 0.
+            "4/4": {"beats_per_bar": 4, "beat_pattern": [1.0, 0.2, 0.5, 0.2], "weight": 0.45},
             "3/4": {"beats_per_bar": 3, "beat_pattern": [1.0, 0.2, 0.3], "weight": 0.25},
             "2/4": {"beats_per_bar": 2, "beat_pattern": [1.0, 0.3], "weight": 0.15},
-            "6/8": {"beats_per_bar": 6, "beat_pattern": [1.0, 0.2, 0.3, 0.8, 0.2, 0.3], "weight": 0.
-            "5/4": {"beats_per_bar": 5, "beat_pattern": [1.0, 0.2, 0.4, 0.7, 0.2], "weight": 0.10},
-            "7/8": {"beats_per_bar": 7, "beat_pattern": [1.0, 0.2, 0.3, 0.8, 0.2, 0.2, 0.3], "weight": 0.10},
-            "9/8": {"beats_per_bar": 9, "beat_pattern": [1.0, 0.2, 0.3, 0.8, 0.2, 0.3, 0.7, 0.2, 0.3], "weight": 0.10},
-            "12/8": {"beats_per_bar": 12, "beat_pattern": [1.0, 0.2, 0.3, 0.6, 0.2, 0.3, 0.8, 0.2, 0.3, 0.6, 0.2, 0.3], "weight": 0.15}
+            "6/8": {"beats_per_bar": 6, "beat_pattern": [1.0, 0.2, 0.3, 0.8, 0.2, 0.3], "weight": 0.15}
         }
 
         # Add common accent patterns for different time signatures
@@ -52,11 +49,7 @@ class MusicAnalyzer:
             "4/4": [[1, 0, 0, 0], [1, 0, 2, 0], [1, 0, 2, 0, 3, 0, 2, 0]],
             "3/4": [[1, 0, 0], [1, 0, 2]],
             "2/4": [[1, 0], [1, 2]],
-            "6/8": [[1, 0, 0, 2, 0, 0], [1, 0, 0, 2, 0, 3]],
-            "5/4": [[1, 0, 0, 2, 0], [1, 0, 2, 0, 0]],
-            "7/8": [[1, 0, 0, 2, 0, 0, 0], [1, 0, 0, 2, 0, 3, 0]],
-            "9/8": [[1, 0, 0, 2, 0, 0, 3, 0, 0]],
-            "12/8": [[1, 0, 0, 2, 0, 0, 3, 0, 0, 4, 0, 0]]
+            "6/8": [[1, 0, 0, 2, 0, 0], [1, 0, 0, 2, 0, 3]]
         }
 
         # Expected rhythm density (relative note density per beat) for different time signatures
@@ -64,9 +57,7 @@ class MusicAnalyzer:
             "4/4": [1.0, 0.7, 0.8, 0.6],
             "3/4": [1.0, 0.6, 0.7],
             "6/8": [1.0, 0.5, 0.4, 0.8, 0.5, 0.4],
-            "2/4": [1.0, 0.6],
-            "5/4": [1.0, 0.6, 0.8, 0.7, 0.6],
-            "7/8": [1.0, 0.5, 0.4, 0.8, 0.5, 0.4, 0.5]
+            "2/4": [1.0, 0.6]
         }
 
     def load_audio(self, file_path, sr=22050, duration=None):
@@ -320,7 +311,7 @@ class MusicAnalyzer:
 
         # Find peaks in spectrum
         peaks = signal.find_peaks(S_tempo, height=np.max(S_tempo)*0.1, distance=5)[0]
-
+
         if len(peaks) == 0:
             return {"time_signature": "4/4", "confidence": 0.4}
 
@@ -448,18 +439,18 @@ class MusicAnalyzer:
 
     def _estimate_from_tempo(self, tempo):
         """Use tempo to help estimate likely time signature"""
-        # Statistical tendencies: slower tempos often in compound meters (6/8
-        #
+        # Statistical tendencies: slower tempos often in compound meters (6/8)
+        # Fast tempos often favor simple meters (2/4)
 
         scores = {}
 
         if tempo < 70:
             # Slow tempos favor compound meters
             scores = {
-                "4/4": 0.
-                "3/4": 0.
-                "
-                "
+                "4/4": 0.5,
+                "3/4": 0.4,
+                "2/4": 0.3,
+                "6/8": 0.7
             }
         elif 70 <= tempo <= 120:
             # Medium tempos favor 4/4, 3/4
@@ -467,15 +458,15 @@ class MusicAnalyzer:
                 "4/4": 0.7,
                 "3/4": 0.6,
                 "2/4": 0.4,
-                "6/8": 0.
+                "6/8": 0.3
             }
         else:
             # Fast tempos favor simpler meters
             scores = {
                 "4/4": 0.6,
                 "2/4": 0.7,
-                "
-                "
+                "3/4": 0.4,
+                "6/8": 0.2
             }
 
         # Find best match
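To see how the simplified set behaves, here is a minimal standalone sketch of the tempo heuristic after this change (`estimate_from_tempo` below is a hypothetical free function for illustration; the real method also weighs the beat-pattern priors and accent templates above):

def estimate_from_tempo(tempo: float) -> str:
    """Pick the highest-scoring meter using the simplified score tables."""
    if tempo < 70:
        scores = {"4/4": 0.5, "3/4": 0.4, "2/4": 0.3, "6/8": 0.7}
    elif tempo <= 120:
        scores = {"4/4": 0.7, "3/4": 0.6, "2/4": 0.4, "6/8": 0.3}
    else:
        scores = {"4/4": 0.6, "2/4": 0.7, "3/4": 0.4, "6/8": 0.2}
    return max(scores, key=scores.get)

print(estimate_from_tempo(60))   # 6/8: slow tempos lean compound
print(estimate_from_tempo(100))  # 4/4
print(estimate_from_tempo(150))  # 2/4: fast tempos lean simple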
requirements.txt
CHANGED
@@ -13,3 +13,4 @@ scipy>=1.12.0
 soundfile>=0.12.1
 matplotlib>=3.7.0
 pronouncing>=0.2.0
+nltk>=3.8.1
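`nltk` joins `pronouncing` here, presumably for pronunciation or tokenization support in the syllable handling. As a hedged sketch (not the repo's actual `count_syllables` implementation), a counter in this spirit might look like:

import re
import pronouncing

def count_syllables(word: str) -> int:
    """Count syllables via the CMU dictionary, with a spelling fallback."""
    phones = pronouncing.phones_for_word(word.lower())
    if phones:
        # CMU phones carry one stress digit per vowel, i.e. one per syllable
        return pronouncing.syllable_count(phones[0])
    # Fallback for out-of-dictionary words: count vowel groups
    return max(1, len(re.findall(r'[aeiouy]+', word.lower())))

print(count_syllables("moonlight"))  # 2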