Mgolo committed on
Commit
4b69e7c
Β·
verified Β·
1 Parent(s): d60bb78

Upload 4 files

Browse files
Files changed (4) hide show
  1. app (2).py +193 -0
  2. localenpl5 (1).jpeg +0 -0
  3. requirements (1).txt +12 -0
  4. requirements.txt +14 -0
app (2).py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from transformers import pipeline, MarianTokenizer, AutoModelForSeq2SeqLM
3
+ import torch
4
+ import unicodedata
5
+ import re
6
+ import whisper
7
+ import tempfile
8
+ import os
9
+
10
+ import nltk
11
+ nltk.download('punkt')
12
+ from nltk.tokenize import sent_tokenize
13
+
14
+ # Additions for file processing
15
+ import fitz # PyMuPDF for PDF
16
+ import docx
17
+ from bs4 import BeautifulSoup
18
+ import markdown2
19
+ import chardet
20
+
21
+
22
# --- Device selection ---
# Prefer the first CUDA GPU when one is available; otherwise run on CPU.
# This module-level handle is reused by load_models() below.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
24
+
25
# --- Load translation models ---
@st.cache_resource
def load_models():
    """Load and cache the three English->X MarianMT translation pipelines.

    Returns:
        tuple: (en_dar_translator, en_hau_translator, en_wol_translator) —
        Hugging Face `translation` pipelines for Darija, Hausa and Wolof,
        in that order. Callers must unpack exactly three values.
    """
    # Fix: the original code assigned the English_Hausa repo to the Darija
    # variable and English_Darija to the Hausa variable (copy-paste swap),
    # so each language selection silently used the wrong model.
    en_dar_model_path = "/LocaleNLP/English_Darija"
    en_wol_model_path = "/LocaleNLP/eng_wolof"
    en_hau_model_path = "/LocaleNLP/English_Hausa"
    # NOTE(review): the leading "/" makes these filesystem-style paths; a
    # Hugging Face hub ID would be e.g. "LocaleNLP/English_Darija" — confirm
    # against the deployment environment.

    en_dar_model = AutoModelForSeq2SeqLM.from_pretrained(en_dar_model_path).to(device)
    en_dar_tokenizer = MarianTokenizer.from_pretrained(en_dar_model_path)

    en_wol_model = AutoModelForSeq2SeqLM.from_pretrained(en_wol_model_path).to(device)
    en_wol_tokenizer = MarianTokenizer.from_pretrained(en_wol_model_path)

    en_hau_model = AutoModelForSeq2SeqLM.from_pretrained(en_hau_model_path).to(device)
    en_hau_tokenizer = MarianTokenizer.from_pretrained(en_hau_model_path)

    # pipeline() takes a device index: 0 = first CUDA device, -1 = CPU.
    en_dar_translator = pipeline("translation", model=en_dar_model, tokenizer=en_dar_tokenizer, device=0 if device.type == 'cuda' else -1)
    en_wol_translator = pipeline("translation", model=en_wol_model, tokenizer=en_wol_tokenizer, device=0 if device.type == 'cuda' else -1)
    en_hau_translator = pipeline("translation", model=en_hau_model, tokenizer=en_hau_tokenizer, device=0 if device.type == 'cuda' else -1)

    return en_dar_translator, en_hau_translator, en_wol_translator
47
+
48
@st.cache_resource
def load_whisper_model():
    """Load the Whisper "base" speech-recognition model (cached by Streamlit)."""
    model = whisper.load_model("base")
    return model
51
+
52
def transcribe_audio(audio_path):
    """Transcribe the audio file at *audio_path* with Whisper and return its text."""
    result = load_whisper_model().transcribe(audio_path)
    return result["text"]
55
+
56
def translate(text, target_lang):
    """Translate English *text* into *target_lang*, paragraph by paragraph.

    Args:
        text: English input; paragraphs are separated by newlines, which are
            preserved in the output.
        target_lang: one of "Darija (Morocco)", "Hausa (Nigeria)",
            "Wolof (Senegal)".

    Returns:
        str: the translated text with the original paragraph structure.

    Raises:
        ValueError: if *target_lang* is not one of the supported labels.
    """
    # Fix: the original unpacked FOUR names (a leftover en_bm_translator for a
    # removed Bambara model) from load_models(), which returns THREE pipelines
    # — a guaranteed ValueError, and every translator shifted one slot.
    # Unpack in the exact order load_models() returns them.
    en_dar_translator, en_hau_translator, en_wol_translator = load_models()

    if target_lang == "Darija (Morocco)":
        translator = en_dar_translator
    elif target_lang == "Hausa (Nigeria)":
        translator = en_hau_translator
    elif target_lang == "Wolof (Senegal)":
        translator = en_wol_translator
    else:
        raise ValueError("Unsupported target language")

    # Marian-style target-language prefix token prepended to every sentence.
    lang_tag = {
        "Darija (Morocco)": ">>dar<<",
        "Hausa (Nigeria)": ">>hau<<",
        "Wolof (Senegal)": ">>wol<<"
    }[target_lang]

    paragraphs = text.split("\n")
    translated_output = []

    # Inference only — no gradients needed.
    with torch.no_grad():
        for para in paragraphs:
            # Keep blank lines so the output mirrors the input layout.
            if not para.strip():
                translated_output.append("")
                continue
            # Naive sentence split on ". "; sentences are re-joined the same
            # way after translation.
            sentences = [s.strip() for s in para.split('. ') if s.strip()]
            formatted = [f"{lang_tag} {s}" for s in sentences]

            results = translator(formatted,
                                 max_length=5000,
                                 num_beams=5,
                                 early_stopping=True,
                                 no_repeat_ngram_size=3,
                                 repetition_penalty=1.5,
                                 length_penalty=1.2)
            translated_sentences = [r['translation_text'].capitalize() for r in results]
            translated_output.append('. '.join(translated_sentences))

    return "\n".join(translated_output)
97
+
98
# --- Extract text from file ---
def extract_text_from_file(uploaded_file):
    """Extract plain text from an uploaded document.

    Supported (by filename extension): pdf, docx, html/htm, md, srt, txt/text.

    Args:
        uploaded_file: a file-like object with a ``name`` attribute and a
            ``read()`` method (e.g. a Streamlit UploadedFile).

    Returns:
        str: the extracted text.

    Raises:
        ValueError: for unsupported extensions.
    """
    file_type = uploaded_file.name.split('.')[-1].lower()
    content = uploaded_file.read()  # bytes; this consumes the stream

    if file_type == "pdf":
        with fitz.open(stream=content, filetype="pdf") as doc:
            return "\n".join([page.get_text() for page in doc])
    elif file_type == "docx":
        # Fix: read() above already exhausted the stream, so passing
        # uploaded_file straight to docx.Document would parse an empty file.
        # Parse the bytes we captured instead.
        import io
        doc = docx.Document(io.BytesIO(content))
        return "\n".join([para.text for para in doc.paragraphs])
    else:
        encoding = chardet.detect(content)['encoding']
        # Fix: when chardet cannot detect an encoding, fall back to UTF-8 so
        # the branches below always receive str — previously `content` stayed
        # bytes, crashing the srt regex and returning bytes for txt.
        content = content.decode(encoding or "utf-8", errors='ignore')
        if file_type in ("html", "htm"):
            soup = BeautifulSoup(content, "html.parser")
            return soup.get_text()
        elif file_type == "md":
            html = markdown2.markdown(content)
            soup = BeautifulSoup(html, "html.parser")
            return soup.get_text()
        elif file_type == "srt":
            # Strip SubRip cue indices and "HH:MM:SS,mmm --> ..." timing lines.
            return re.sub(r"\d+\n\d{2}:\d{2}:\d{2},\d{3} --> .*?\n", "", content)
        elif file_type in ("txt", "text"):
            return content
        else:
            raise ValueError("Unsupported file type")
126
+
127
# --- Main Streamlit App ---
def main():
    """Render the LocaleNLP translator UI: input column (text/audio/file) on
    the left, translation output and download button on the right."""
    st.set_page_config(page_title="LocaleNLP Translator", layout="wide", initial_sidebar_state="expanded")

    # Sidebar: logo plus a short description of the models.
    with st.sidebar:
        st.image("localenpl5.jpeg", use_container_width=True)
        st.markdown("""
        <h3 style='text-align: left; color: #4B8BBE;'>🌐 Models Overview</h3>
        At <b>LocaleNLP</b>, we develop AI-powered NLP tools for low-resource languages across Africa and Asia.
        πŸ“Œ This application showcases translation using custom MarianMT models trained on bilingual datasets in English and Bambara, Darija, Hausa, and Wolof.
        """, unsafe_allow_html=True)

    st.markdown("""<h4 style='text-align: center; color: #306998;'>Translate between English, Darija, Hausa and Wolof</h4>""", unsafe_allow_html=True)

    # Top row: input-mode and target-language selectors side by side.
    col1, col2 = st.columns(2)
    with col1:
        input_mode = st.selectbox("Select input mode:", ("Text", "Audio", "File"))
        st.markdown("""<hr>""", unsafe_allow_html=True)
    with col2:
        target_lang = st.selectbox("Select target language:", ("Darija (Morocco)", "Hausa (Nigeria)", "Wolof (Senegal)"))
        st.markdown("""<hr>""", unsafe_allow_html=True)

    # Bottom row: col3 gathers English input_text, col4 shows the translation.
    col3, col4 = st.columns(2)
    with col3:
        input_text = ""
        if input_mode == "Text":
            input_text = st.text_area("✏️ Enter English text:", height=250)
        elif input_mode == "Audio":
            audio_file = st.file_uploader("πŸ”Š Upload audio (.wav, .mp3, .m4a)", type=["wav", "mp3", "m4a"])
            if audio_file:
                # Whisper needs a real path, so spill the upload to a temp
                # file (delete=False keeps it alive past the `with` block),
                # transcribe it, then remove it.
                with tempfile.NamedTemporaryFile(delete=False, suffix=f".{audio_file.type.split('/')[-1]}") as tmp:
                    tmp.write(audio_file.read())
                    tmp_path = tmp.name
                with st.spinner("Transcribing..."):
                    input_text = transcribe_audio(tmp_path)
                os.remove(tmp_path)
                st.text_area("πŸ“ Transcribed Text:", value=input_text, height=150)
        elif input_mode == "File":
            uploaded_file = st.file_uploader("πŸ“„ Upload document (PDF, Word, HTML, Markdown, SRT)", type=["pdf", "docx", "html", "htm", "md", "srt", "txt"])
            if uploaded_file:
                try:
                    input_text = extract_text_from_file(uploaded_file)
                    st.text_area("πŸ“ƒ Extracted Text:", value=input_text, height=200)
                except Exception as e:
                    st.error(f"Error extracting text: {str(e)}")

    with col4:
        # Translate whenever any non-empty input was produced above.
        if input_text:
            with st.spinner("Translating..."):
                translated_text = translate(input_text, target_lang)
                st.write("Output in "+ target_lang+ " language")
                st.success(translated_text)

                # Optional download
                st.download_button(
                    label="πŸ’Ύ Download Translation",
                    data=translated_text,
                    file_name=f"translated_{target_lang.replace(' ', '_').lower()}.txt",
                    mime="text/plain"
                )
        else:
            st.info("Translation will appear here.")

    st.markdown("""<hr><div style='text-align: center; color: #4B8BBE; font-size: 0.9rem'>LocaleNLP Β© 2025 β€’ Empowering communities through AI and language</div>""", unsafe_allow_html=True)

if __name__ == "__main__":
    main()
localenpl5 (1).jpeg ADDED
requirements (1).txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ altair
2
+ pandas
3
+ streamlit
4
+ streamlit-chat
5
+ transformers
6
+ torch
7
+ pillow
8
+ sentencepiece
9
+ datasets
10
+ keras
11
+ sacremoses
12
+ safetensors
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ altair
2
+ pandas
3
+ streamlit
4
+ transformers
5
+ torch
6
+ openai-whisper
7
+ nltk
8
+ PyMuPDF
9
+ python-docx
10
+ beautifulsoup4
11
+ markdown2
12
+ chardet
13
+ sentencepiece
14
+ sacremoses