Spaces:

vonewman
/

ner_app

Runtime error

App Files Files Community

vonewman commited on Oct 29, 2023

Commit

08eb663

1 Parent(s): 447a922

Upload app.py

Browse files

Files changed (1) hide show

app.py +116 -0

app.py ADDED Viewed

	@@ -0,0 +1,116 @@

+import streamlit as st
+import pandas as pd
+import re
+import json
+import transformers
+import torch
+from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer
+st.set_page_config(
+    page_title="Named Entity Recognition Wolof",
+    page_icon="📘"
+)
+def convert_df(df: pd.DataFrame):
+    return df.to_csv(index=False).encode('utf-8')
+def convert_json(df: pd.DataFrame):
+    result = df.to_json(orient="index")
+    parsed = json.loads(result)
+    json_string = json.dumps(parsed)
+    return json_string
+def load_model():
+    model = AutoModelForTokenClassification.from_pretrained("vonewman/wolof-finetuned-ner")
+    trainer = Trainer(model=model)
+    tokenizer = AutoTokenizer.from_pretrained("vonewman/wolof-finetuned-ner")
+    return trainer, model, tokenizer
+def align_word_ids(texts):
+    trainer, model, tokenizer = load_model()
+    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=218, truncation=True)
+    word_ids = tokenized_inputs.word_ids()
+    previous_word_idx = None
+    label_ids = []
+    for word_idx in word_ids:
+        if word_idx is None:
+            label_ids.append(-100)
+        elif word_idx != previous_word_idx:
+            try:
+                label_ids.append(1)
+            except:
+                label_ids.append(-100)
+        else:
+            try:
+                label_ids.append(1 if label_all_tokens else -100)
+            except:
+                label_ids.append(-100)
+        previous_word_idx = word_idx
+    return label_ids
+def predict_ner_labels(model, tokenizer, sentence):
+    use_cuda = torch.cuda.is_available()
+    device = torch.device("cuda" if use_cuda else "cpu")
+    if use_cuda:
+        model = model.cuda()
+    text = tokenizer(sentence, padding='max_length', max_length=218, truncation=True, return_tensors="pt")
+    mask = text['attention_mask'].to(device)
+    input_id = text['input_ids'].to(device)
+    label_ids = torch.Tensor(align_word_ids(sentence)).unsqueeze(0).to(device)
+    logits = model(input_id, mask, None)
+    logits_clean = logits[0][label_ids != -100]
+    predictions = logits_clean.argmax(dim=1).tolist()
+    prediction_label = [id2tag[i] for i in predictions]
+    return prediction_label
+id2tag = {0: 'O', 1: 'B-LOC', 2: 'B-PER', 3: 'I-PER', 4: 'B-ORG', 5: 'I-DATE', 6: 'B-DATE', 7: 'I-ORG', 8: 'I-LOC'}
+def tag_sentence(text):
+    trainer, model, tokenizer = load_model()
+    predictions = predict_ner_labels(model, tokenizer, text)
+    # Créez un DataFrame avec les colonnes "words" et "tags"
+    df = pd.DataFrame({'words': text.split(), 'tags': predictions})
+    return df
+st.title("📘 Named Entity Recognition Wolof")
+with st.form(key='my_form'):
+    x1 = st.text_input(label='Enter a sentence:', max_chars=250)
+    submit_button = st.form_submit_button(label='🏷️ Create tags')
+if submit_button:
+    if re.sub('\s+', '', x1) == '':
+        st.error('Please enter a non-empty sentence.')
+    elif re.match(r'\A\s*\w+\s*\Z', x1):
+        st.error("Please enter a sentence with at least one word")
+    else:
+        st.markdown("### Tagged Sentence")
+        st.header("")
+        results = tag_sentence(x1)
+        cs, c1, c2, c3, cLast = st.columns([0.75, 1.5, 1.5, 1.5, 0.75])
+        with c1:
+            csvbutton = st.download_button(label="📥 Download .csv", data=convert_df(results),
+                                           file_name="results.csv", mime='text/csv', key='csv')
+        with c2:
+            textbutton = st.download_button(label="📥 Download .txt", data=convert_df(results),
+                                            file_name="results.text", mime='text/plain', key='text')
+        with c3:
+            jsonbutton = st.download_button(label="📥 Download .json", data=convert_json(results),
+                                            file_name="results.json", mime='application/json', key='json')
+        st.header("")
+        c1, c2, c3 = st.columns([1, 3, 1])
+        with c2:
+            st.table(results[['words', 'tags']])
+st.header("")
+st.header("")
+st.header("")
+with st.expander("ℹ️ - About this app", expanded=True):
+    st.write(
+        """
+-   The **Named Entity Recognition Wolof** app is a tool that performs named entity recognition in Wolof.
+-   The available entities are: *corporation*, *location*, *person*, and *date*.
+-   The app uses the [XLMRoberta model](https://huggingface.co/xlm-roberta-base), fine-tuned on the [masakhaNER](https://huggingface.co/datasets/masakhane/masakhaner2) dataset.
+-   The model uses the **byte-level BPE tokenizer**. Each sentence is first tokenized.
+        """
+)