import streamlit as st
from transformers import pipeline, MarianTokenizer, AutoModelForSeq2SeqLM
import torch
import re
import io
import whisper
import tempfile
import os
import nltk
nltk.download('punkt', quiet=True)
from nltk.tokenize import sent_tokenize

# Additions for file processing
import fitz  # PyMuPDF for PDF
import docx
from bs4 import BeautifulSoup
import markdown2
import chardet
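# Assumed dependencies (install as needed):
#   pip install streamlit transformers torch openai-whisper nltk PyMuPDF python-docx beautifulsoup4 markdown2 chardet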
# --- Device selection ---
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# --- Load translation models ---
@st.cache_resource  # load the three checkpoints once and reuse them across reruns
def load_models():
    # Hugging Face repo IDs take the form "org/name" (no leading slash);
    # the Darija and Hausa paths were also swapped in the original.
    en_dar_model_path = "LocaleNLP/English_Darija"
    en_hau_model_path = "LocaleNLP/English_Hausa"
    en_wol_model_path = "LocaleNLP/eng_wolof"

    en_dar_model = AutoModelForSeq2SeqLM.from_pretrained(en_dar_model_path).to(device)
    en_dar_tokenizer = MarianTokenizer.from_pretrained(en_dar_model_path)

    en_hau_model = AutoModelForSeq2SeqLM.from_pretrained(en_hau_model_path).to(device)
    en_hau_tokenizer = MarianTokenizer.from_pretrained(en_hau_model_path)

    en_wol_model = AutoModelForSeq2SeqLM.from_pretrained(en_wol_model_path).to(device)
    en_wol_tokenizer = MarianTokenizer.from_pretrained(en_wol_model_path)

    en_dar_translator = pipeline("translation", model=en_dar_model, tokenizer=en_dar_tokenizer, device=0 if device.type == 'cuda' else -1)
    en_hau_translator = pipeline("translation", model=en_hau_model, tokenizer=en_hau_tokenizer, device=0 if device.type == 'cuda' else -1)
    en_wol_translator = pipeline("translation", model=en_wol_model, tokenizer=en_wol_tokenizer, device=0 if device.type == 'cuda' else -1)

    return en_dar_translator, en_hau_translator, en_wol_translator
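# Note: @st.cache_resource (assumed Streamlit >= 1.18) keeps the three pipelines
# in memory between reruns; without it, every translate() call would re-load all
# three checkpoints from disk.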
@st.cache_resource  # cache the Whisper model as well
def load_whisper_model():
    return whisper.load_model("base")

def transcribe_audio(audio_path):
    whisper_model = load_whisper_model()
    return whisper_model.transcribe(audio_path)["text"]
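# Whisper's transcribe() accepts a file path and returns a dict whose "text"
# field holds the full transcript. "base" trades accuracy for speed; larger
# checkpoints such as "small" or "medium" can be swapped in if latency allows.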
def translate(text, target_lang):
    # load_models() returns exactly three pipelines, in (dar, hau, wol) order;
    # the original unpacked four values and crashed.
    en_dar_translator, en_hau_translator, en_wol_translator = load_models()

    if target_lang == "Darija (Morocco)":
        translator = en_dar_translator
    elif target_lang == "Hausa (Nigeria)":
        translator = en_hau_translator
    elif target_lang == "Wolof (Senegal)":
        translator = en_wol_translator
    else:
        raise ValueError("Unsupported target language")

    # Marian-style target-language tag prepended to each source sentence
    lang_tag = {
        "Darija (Morocco)": ">>dar<<",
        "Hausa (Nigeria)": ">>hau<<",
        "Wolof (Senegal)": ">>wol<<"
    }[target_lang]

    paragraphs = text.split("\n")
    translated_output = []

    with torch.no_grad():
        for para in paragraphs:
            if not para.strip():
                translated_output.append("")
                continue
            # Use NLTK's sentence tokenizer (imported above but previously
            # unused) instead of a naive '. ' split
            sentences = [s.strip() for s in sent_tokenize(para) if s.strip()]
            formatted = [f"{lang_tag} {s}" for s in sentences]
            results = translator(formatted,
                                 max_length=512,  # MarianMT's positional limit; 5000 exceeded it
                                 num_beams=5,
                                 early_stopping=True,
                                 no_repeat_ngram_size=3,
                                 repetition_penalty=1.5,
                                 length_penalty=1.2)
            # Uppercase only the first character; str.capitalize() would
            # lowercase the rest of the sentence and mangle proper nouns.
            translated_sentences = [r['translation_text'][:1].upper() + r['translation_text'][1:] for r in results]
            # sent_tokenize keeps terminal punctuation, so join with a space
            translated_output.append(' '.join(translated_sentences))

    return "\n".join(translated_output)
# --- Extract text from file ---
def extract_text_from_file(uploaded_file):
    file_type = uploaded_file.name.split('.')[-1].lower()
    content = uploaded_file.read()

    if file_type == "pdf":
        with fitz.open(stream=content, filetype="pdf") as doc:
            return "\n".join([page.get_text() for page in doc])
    elif file_type == "docx":
        # uploaded_file has already been read, so wrap the bytes for python-docx
        doc = docx.Document(io.BytesIO(content))
        return "\n".join([para.text for para in doc.paragraphs])
    else:
        encoding = chardet.detect(content)['encoding'] or 'utf-8'
        content = content.decode(encoding, errors='ignore')
        if file_type in ("html", "htm"):
            soup = BeautifulSoup(content, "html.parser")
            return soup.get_text()
        elif file_type == "md":
            html = markdown2.markdown(content)
            soup = BeautifulSoup(html, "html.parser")
            return soup.get_text()
        elif file_type == "srt":
            # Strip cue numbers and "HH:MM:SS,mmm --> ..." timestamp lines
            return re.sub(r"\d+\n\d{2}:\d{2}:\d{2},\d{3} --> .*?\n", "", content)
        elif file_type in ("txt", "text"):
            return content
        else:
            raise ValueError("Unsupported file type")
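# Note: chardet may return None for very short or binary-looking files; the
# UTF-8 fallback above keeps the text branches from receiving raw bytes.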
# --- Main Streamlit App ---
def main():
    st.set_page_config(page_title="LocaleNLP Translator", layout="wide", initial_sidebar_state="expanded")

    with st.sidebar:
        st.image("localenpl5.jpeg", use_container_width=True)
        st.markdown("""
        <h3 style='text-align: left; color: #4B8BBE;'>Models Overview</h3>
        At <b>LocaleNLP</b>, we develop AI-powered NLP tools for low-resource languages across Africa and Asia.
        This application showcases translation using custom MarianMT models trained on bilingual datasets pairing English with Darija, Hausa, and Wolof.
        """, unsafe_allow_html=True)

    st.markdown("""<h4 style='text-align: center; color: #306998;'>Translate between English, Darija, Hausa and Wolof</h4>""", unsafe_allow_html=True)

    col1, col2 = st.columns(2)
    with col1:
        input_mode = st.selectbox("Select input mode:", ("Text", "Audio", "File"))
        st.markdown("""<hr>""", unsafe_allow_html=True)
    with col2:
        target_lang = st.selectbox("Select target language:", ("Darija (Morocco)", "Hausa (Nigeria)", "Wolof (Senegal)"))
        st.markdown("""<hr>""", unsafe_allow_html=True)

    col3, col4 = st.columns(2)
    with col3:
        input_text = ""
        if input_mode == "Text":
            input_text = st.text_area("✍️ Enter English text:", height=250)
        elif input_mode == "Audio":
            audio_file = st.file_uploader("Upload audio (.wav, .mp3, .m4a)", type=["wav", "mp3", "m4a"])
            if audio_file:
                # Derive the suffix from the file name; MIME subtypes such as
                # "audio/x-m4a" do not map cleanly onto file extensions
                suffix = os.path.splitext(audio_file.name)[1] or ".wav"
                with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                    tmp.write(audio_file.read())
                    tmp_path = tmp.name
                with st.spinner("Transcribing..."):
                    input_text = transcribe_audio(tmp_path)
                os.remove(tmp_path)
                st.text_area("Transcribed Text:", value=input_text, height=150)
        elif input_mode == "File":
            uploaded_file = st.file_uploader("Upload document (PDF, Word, HTML, Markdown, SRT)", type=["pdf", "docx", "html", "htm", "md", "srt", "txt"])
            if uploaded_file:
                try:
                    input_text = extract_text_from_file(uploaded_file)
                    st.text_area("Extracted Text:", value=input_text, height=200)
                except Exception as e:
                    st.error(f"Error extracting text: {str(e)}")

    with col4:
        if input_text:
            with st.spinner("Translating..."):
                translated_text = translate(input_text, target_lang)
            st.write(f"Output in {target_lang}")
            st.success(translated_text)

            # Optional download
            st.download_button(
                label="💾 Download Translation",
                data=translated_text,
                file_name=f"translated_{target_lang.replace(' ', '_').lower()}.txt",
                mime="text/plain"
            )
        else:
            st.info("Translation will appear here.")

    st.markdown("""<hr><div style='text-align: center; color: #4B8BBE; font-size: 0.9rem'>LocaleNLP © 2025 • Empowering communities through AI and language</div>""", unsafe_allow_html=True)

if __name__ == "__main__":
    main()
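# Run locally (assuming this file is saved as app.py):
#   streamlit run app.py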