🚨 BanglaIPA 🚨

BanglaIPA: Towards Robust Text-to-IPA Transcription with Contextual Rewriting in Bengali

📝 Paper, 🖥️ Github

BanglaIPA - Bengali text to International Phonetic Alphabet (IPA) transcription system is trained on standard Bengali and six regional dialects of Bangladesh using the DUAL-IPA dataset from Bengali.AI.

Load the BanglaIPA System

Prerequisite

!pip install tensorflow

Log in to HuggingFace

from huggingface_hub import login
# Replace "TOKEN" with your personal Hugging Face access token
# (https://huggingface.co/settings/tokens).
login("TOKEN")

Load BanglaIPA model

## BanglaIPA
from huggingface_hub import snapshot_download
import os

# Download the whole model repository snapshot into the local HF cache
# and keep the resulting local path.
local_dir = snapshot_download(
    repo_id="Jakir057/BanglaIPA"
)
print(local_dir)

# The TF SavedModel is stored in the "BanglaIPA" subdirectory of the
# snapshot (loaded later via tf.saved_model.load).
MODEL_PATH = os.path.join(local_dir, "BanglaIPA")
print(f"Model path={MODEL_PATH}")

Transcription Generation

import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
import numpy as np
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

def get_vocab():
  """
  Return the sorted, de-duplicated character vocabulary.

  Combines Bengali characters, IPA characters, the special tokens
  ('', '[UNK]', '[start]', '[end]') and other characters seen in the
  training set.

  Returns:
    - list: Sorted list of unique vocabulary entries (str).
  """
  vb = ['', '[UNK]', '[start]', '[end]', 'া', 'র', '্', 'ে', 'ি', 'ন', 'ক', 'ব', 'স', 'ল', 'ত', 'ম', 'প', 'ু', 'দ', 'ট', 'য়', 'জ', '।', 'ো', 'গ', 'হ', 'য', 'শ', 'ী', 'ই', 'চ', 'ভ', 'আ', 'ও', 'ছ', 'ষ', 'ড', 'ফ', 'অ', 'ধ', 'খ', 'ড়', 'উ', 'ণ', 'এ', 'থ', 'ং', 'ঁ', 'ূ', 'ৃ', 'ঠ', 'ঘ', 'ঞ', 'ঙ', 'ৌ', '‘', 'ৎ', 'ঝ', 'ৈ', '়', 'ঢ', 'ঃ', 'ঈ', '\u200c', 'ৗ', 'a', 'ঐ', 'd', 'w', 'ঋ', 'i', 'e', 't', 's', 'n', 'm', 'b', '“', 'u', 'r', 'œ', 'o', '–', 'ঊ', 'ঢ়', 'Í', 'g', 'p', '\xad', 'h', 'c', 'l', 'ঔ', 'ƒ', '”', 'Ñ', '¡', 'y', 'j', 'f', '→', '—', 'ø', 'è', '¦', '¥', 'x', 'v', 'k']
  vipa = ['', '[UNK]', '[start]', '[end]', 'ɐ', 'ɾ', 'i', 'o', 'e', '̪', 't', 'n', 'k', 'ɔ', 'ʃ', 'b', 'd', 'l', 'u', 'p', 'm', 'ʰ', 'ɟ', '͡', '̯', 'g', 'ʱ', '।', 'c', 'ʲ', 'h', 's', 'ŋ', 'ɛ', 'ɽ', '̃', 'ʷ', '‘', '“', '–', '”', '—', 'w', 'j']
  # set() removes entries shared between the two lists (special tokens,
  # Latin letters, punctuation); sorted() gives a deterministic order.
  return sorted(set(vb + vipa))

def get_vectorization():
  """
  Build the source (Bengali) and target (IPA) TextVectorization layers.

  Both layers share the same character vocabulary; the target layer emits
  one extra time step to allow the teacher-forcing shift during decoding.

  Returns:
    - tuple: (bn_vectorization, ipa_vectorization)
  """
  vocabulary = get_vocab()
  seq_len = 64
  shared_kwargs = dict(
      max_tokens=len(vocabulary),
      output_mode="int",
      vocabulary=vocabulary,
  )
  bn_vectorization = TextVectorization(
      output_sequence_length=seq_len, **shared_kwargs
  )
  ipa_vectorization = TextVectorization(
      output_sequence_length=seq_len + 1, **shared_kwargs
  )
  return bn_vectorization, ipa_vectorization

def decode_sequence(input_sentence, bn_vectorization, ipa_vectorization, banglaipa_model):
    """
    Autoregressively generate the IPA token sequence for one subword.

    Args:
      - input_sentence (str): Synthetic sentence where adjacent characters
        are separated by a single space.
      - bn_vectorization: TextVectorization layer for the Bengali source side.
      - ipa_vectorization: TextVectorization layer for the IPA target side.
      - banglaipa_model: Transformer model taking [source_tokens, target_tokens].
    Returns:
      - str: Space-separated IPA characters/special tokens, starting with
        '[start]' and, when decoding stops early, ending with the stop token.
    """
    max_decoded_sentence_length = 64
    spa_vocab = ipa_vectorization.get_vocabulary()
    # Map token index -> token string for greedy lookup of the argmax.
    spa_index_lookup = dict(zip(range(len(spa_vocab)), spa_vocab))
    tokenized_input_sentence = bn_vectorization([input_sentence])
    decoded_sentence = '[start]'
    for i in range(max_decoded_sentence_length):
        # Drop the last position so the target tensor length matches the
        # model's teacher-forcing input (sequence_length, not length + 1).
        tokenized_target_sentence = ipa_vectorization([decoded_sentence])[:, :-1]
        predictions = banglaipa_model([tokenized_input_sentence, tokenized_target_sentence])
        # Greedy decoding: pick the highest-probability token at position i.
        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = spa_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token
        # NOTE(review): decoding stops on '[UNK]' rather than '[end]'.
        # get_subword2ipa strips a 5-character suffix, which matches either
        # token — confirm the trained model really signals end-of-sequence
        # with '[UNK]'; otherwise '[end]' outputs run to the length cap.
        if sampled_token == '[UNK]':
            break
    return decoded_sentence

def sentence_to_word(sentence):
  """
  Generate a word from a synthetic sentence by removing all spaces.

  Args:
    - sentence (str): Synthetic sentence (characters separated by spaces).
  Returns:
    - str: subword/word with every space character removed.
  """
  # str.replace does the removal in one C-level pass; the previous
  # character-by-character '+=' loop was quadratic in the worst case.
  return sentence.replace(" ", "")

def word_to_sentence(word):
  """
  Generate a synthetic sentence by placing a space after every character.

  Note: the result keeps a trailing space ("ab" -> "a b "), matching what
  the decoder's vectorization expects; an empty word yields "".

  Args:
    - word (str): subword/word segment.
  Returns:
    - str: Synthetic sentence.
  """
  # ''.join builds the string in one pass instead of quadratic '+='.
  return "".join(ch + " " for ch in word)

def get_subword2ipa(word, bn_vectorization, ipa_vectorization, banglaipa_model):
    """
    Transcribe a single word/subword to IPA.

    Spaces out the word's characters, runs the autoregressive decoder,
    collapses the spaced output back to a string, and strips the leading
    '[start]' marker plus the trailing 5-character stop token.
    """
    spaced = word_to_sentence(word)
    decoded = decode_sequence(spaced, bn_vectorization, ipa_vectorization, banglaipa_model)
    collapsed = sentence_to_word(decoded)
    # 7 == len('[start]'); 5 == len of the stop token appended by the decoder.
    return collapsed[7:-5]

if __name__ == "__main__":
    # MODEL_PATH is the SavedModel directory produced by snapshot_download above.
    path = MODEL_PATH
    banglaipa_model=tf.saved_model.load(path)
    print("BanglaIPA model loaded.")
    bn_vectorization, ipa_vectorization = get_vectorization()
    # Bengali demo sentence; transcribed word by word below.
    text = "একটি বাছাই করুন গণিত প্রথম গণিত দ্বিতীয় পত্র"
    ipa = ""
    words = text.split(" ")
    for word in words:
        trg = get_subword2ipa(word, bn_vectorization, ipa_vectorization, banglaipa_model)
        print(word, trg)
        # Accumulate per-word IPA, space-separated (result has a trailing space).
        ipa += (trg + " ")
    print(f"IPA={ipa}")

## python inference.py
# # Output:
# BanglaIPA model loaded.
# একটি ekti
# বাছাই bɐcʰɐ͡i̯
# করুন koɾun
# গণিত gonit̪o
# প্রথম pɾot̪ʰom
# গণিত gonit̪o
# দ্বিতীয় d̪it̪iʲo
# পত্র pɔt̪ɾo
# IPA=ekti bɐcʰɐ͡i̯ koɾun gonit̪o pɾot̪ʰom gonit̪o d̪it̪iʲo pɔt̪ɾo

Citation

@misc{hasan2026banglaiparobusttexttoipatranscription,
  title={BanglaIPA: Towards Robust Text-to-IPA Transcription with Contextual Rewriting in Bengali},
  author={Jakir Hasan and Shrestha Datta and Md Saiful Islam and Shubhashis Roy Dipta and Ameya Debnath},
  year={2026},
  eprint={2601.01778},
  archivePrefix={arXiv},
  primaryClass={cs.CL},
  url={https://arxiv.org/abs/2601.01778},
}
Downloads last month
-
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support

Papers for Jakir057/BanglaIPA