"""Custom tokenizer that maps medical codes to sub-token sequences and
precomputed embeddings, loadable through AutoTokenizer via ``auto_map``."""

import json
import os

from huggingface_hub import hf_hub_download
from transformers import PreTrainedTokenizer


class MedicalcodeTokenizer(PreTrainedTokenizer):

    def __init__(self, vocab_file, code2tokens_file, embedding_file, **kwargs):
        # vocab.json maps token strings to integer ids, code2tokens.json maps
        # each medical code to a list of sub-token ids, and
        # code2embeddings.json maps a code to a precomputed embedding vector.
        with open(vocab_file, "r") as f:
            self.vocab = json.load(f)
        with open(code2tokens_file, "r") as f:
            self.code2tok = json.load(f)
        with open(embedding_file, "r") as f:
            self.code2emb = json.load(f)

        # Build the reverse lookup before calling the parent constructor,
        # which relies on get_vocab() during initialization.
        self.ids_to_tokens = {v: k for k, v in self.vocab.items()}
        self.tokens_to_ids = self.vocab

        super().__init__(**kwargs)

    def _tokenize(self, text):
        # Known codes map directly to their stored sub-tokens; anything else
        # is handed to _infer_and_register (not shown in this file), which is
        # expected to derive sub-tokens for the new code.
        if text in self.code2tok:
            return [f"token_{t}" for t in self.code2tok[text]]
        return self._infer_and_register(text)

    def _convert_token_to_id(self, token):
        # Unknown tokens fall back to the unk token's id, or 0 if no unk
        # token is registered.
        return self.vocab.get(token, self.vocab.get(self.unk_token, 0))

    def _convert_id_to_token(self, idx):
        return self.ids_to_tokens.get(idx, self.unk_token)

    def embed(self, text):
        # Return the precomputed embedding for a known code; otherwise fall
        # back to the code's sub-tokens so the caller still gets a usable
        # representation.
        if text in self.code2emb:
            return self.code2emb[text]
        return self._tokenize(text)

    def get_vocab(self):
        return self.vocab

    # This tokenizer adds no special tokens, so the overrides below pass
    # sequences through unchanged (with segment ids 0 and 1 for pairs).
    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        return token_ids_0 if token_ids_1 is None else token_ids_0 + token_ids_1

    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
        length = len(token_ids_0) + (0 if token_ids_1 is None else len(token_ids_1))
        return [0] * length

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        if token_ids_1 is None:
            return [0] * len(token_ids_0)
        return [0] * len(token_ids_0) + [1] * len(token_ids_1)

    def save_pretrained(self, save_directory):
        os.makedirs(save_directory, exist_ok=True)
        with open(os.path.join(save_directory, "vocab.json"), "w") as f:
            json.dump(self.vocab, f, indent=2)
        with open(os.path.join(save_directory, "code2tokens.json"), "w") as f:
            json.dump(self.code2tok, f, indent=2)
        with open(os.path.join(save_directory, "code2embeddings.json"), "w") as f:
            json.dump(self.code2emb, f, indent=2)

        # The file keys must match __init__'s parameter names so the saved
        # tokenizer can be reconstructed. The auto_map entry lists the slow
        # and fast tokenizer classes (no fast variant here) so AutoTokenizer
        # can locate this class in tokenizer.py under trust_remote_code=True.
        tokenizer_config = {
            "tokenizer_class": "MedicalcodeTokenizer",
            "vocab_file": "vocab.json",
            "code2tokens_file": "code2tokens.json",
            "embedding_file": "code2embeddings.json",
            "auto_map": {
                "AutoTokenizer": ["tokenizer.MedicalcodeTokenizer", None],
            },
        }
        with open(os.path.join(save_directory, "tokenizer_config.json"), "w") as f:
            json.dump(tokenizer_config, f, indent=2)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        def resolve(filename):
            # Use local files when given a directory (e.g. one written by
            # save_pretrained); otherwise download from the Hugging Face Hub.
            if os.path.isdir(pretrained_model_name_or_path):
                return os.path.join(pretrained_model_name_or_path, filename)
            return hf_hub_download(pretrained_model_name_or_path, filename)

        return cls(
            vocab_file=resolve("vocab.json"),
            code2tokens_file=resolve("code2tokens.json"),
            embedding_file=resolve("code2embeddings.json"),
            **kwargs,
        )
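
# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module). The repo id
# "your-org/medical-code-tokenizer" and the code "E11.9" are hypothetical
# placeholders; the calls below only exercise the class defined above.
if __name__ == "__main__":
    tok = MedicalcodeTokenizer.from_pretrained("your-org/medical-code-tokenizer")
    print(tok._tokenize("E11.9"))  # sub-tokens for a known code
    print(tok.embed("E11.9"))      # stored embedding if present, else sub-tokens
    tok.save_pretrained("./medical_tokenizer")
    # Because tokenizer_config.json carries an auto_map entry, the saved
    # directory can also be reloaded with
    # AutoTokenizer.from_pretrained("./medical_tokenizer", trust_remote_code=True),
    # provided this file is shipped alongside as tokenizer.py.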