for entity in entities:
    text = re.sub(r"\b%s\b" % entity, IOB_prefixes(entity, word_tokenize), text)
# after substitution, entity tokens carry an "_en_" marker; splitting on it
# yields ["B"/"I", token] for entity tokens and [token] for everything else
return [(g[0], "O") if len(g) <= 1 else (g[1], g[0]) for g in
        [w.split("_en_") for w in word_tokenize.tokenize(text)]]
@staticmethod
def word_embeddings(processed_pos_tag_data, vocab, word_vectors):
After Change
word_tokenize = Tokenizer(tokenizer_selected=NLTK_TOKENIZER)
# mark longer entities first so a multi-word entity is tagged before any
# shorter entity that overlaps it
entities.sort(key=lambda s: len(word_tokenize.tokenize(s)), reverse=True)
tokenized_original_text = word_tokenize.tokenize(text)
for entity in entities:
    # note: entity is interpolated into the pattern verbatim; re.escape(entity)
    # would be safer if entities can contain regex metacharacters
    text = re.sub(r"\b%s\b" % entity, IOB_prefixes(entity, word_tokenize), text)
tokenized_text = word_tokenize.tokenize(text)
# IOB_prefixes marks entity tokens in place, so tokenized_text stays aligned
# index-for-index with tokenized_original_text
labels = ["B" if "B_en_" in tokenized_text[i]
          else "I" if "I_en_" in tokenized_text[i]
          else "O" for i in range(len(tokenized_original_text))]
return tokenized_original_text, labels
# return [(g[0], "O") if len(g) <= 1 else (g[1], g[0]) for g in
#         [w.split("_en_") for w in word_tokenize.tokenize(text)]]
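For reference, a minimal runnable sketch of the new labelling flow. It uses a plain whitespace split in place of the repo's Tokenizer wrapper, and a hypothetical iob_prefixes helper standing in for the real IOB_prefixes defined elsewhere in the codebase; unlike the original, the entity is regex-escaped here for safety.

import re

def tokenize(s):
    # whitespace split standing in for Tokenizer(tokenizer_selected=NLTK_TOKENIZER)
    return s.split()

def iob_prefixes(entity, tokenize_fn):
    # hypothetical stand-in: prefix the first entity token with "B_en_"
    # and every following token with "I_en_"
    tokens = tokenize_fn(entity)
    return " ".join(("B_en_" if i == 0 else "I_en_") + t
                    for i, t in enumerate(tokens))

text = "Barack Obama visited Paris"
entities = ["Barack Obama", "Paris"]

entities.sort(key=lambda s: len(tokenize(s)), reverse=True)
tokenized_original_text = tokenize(text)
for entity in entities:
    text = re.sub(r"\b%s\b" % re.escape(entity), iob_prefixes(entity, tokenize), text)
tokenized_text = tokenize(text)
labels = ["B" if "B_en_" in tokenized_text[i]
          else "I" if "I_en_" in tokenized_text[i]
          else "O" for i in range(len(tokenized_original_text))]

print(list(zip(tokenized_original_text, labels)))
# [('Barack', 'B'), ('Obama', 'I'), ('visited', 'O'), ('Paris', 'B')]

Because the prefixes are attached token-by-token, the marked text tokenizes to the same length as the original, which is what lets the label comprehension index tokenized_text by the positions of tokenized_original_text.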