2e801652a510dea463b43cb718b87b54bf4acab3,models/crf_v2/crf_preprocess_data.py,CrfPreprocessData,pre_process_text_,#Any#Any#,14

Before Change


        for entity in entities:
            text = re.sub(r"\b%s\b" % entity, IOB_prefixes(entity, word_tokenize), text)

        return [(g[0], "O") if len(g) <= 1 else (g[1], g[0]) for g in
                [w.split("_en_") for w in word_tokenize.tokenize(text)]]

    @staticmethod
    def word_embeddings(processed_pos_tag_data, vocab, word_vectors):

After Change



        word_tokenize = Tokenizer(tokenizer_selected=NLTK_TOKENIZER)
        entities.sort(key=lambda s: len(word_tokenize.tokenize(s)), reverse=True)
        tokenized_original_text = word_tokenize.tokenize(text)

        for entity in entities:
            text = re.sub(r"\b%s\b" % entity, IOB_prefixes(entity, word_tokenize), text)

        tokenized_text = word_tokenize.tokenize(text)

        labels = ["B" if "B_en_" in tokenized_text[i]
                  else "I" if "I_en_" in tokenized_text[i]
                  else "O" for i in range(len(tokenized_original_text))]

        return tokenized_original_text, labels
        # return [(g[0], "O") if len(g) <= 1 else (g[1], g[0]) for g in
Italian Trulli
In pattern: SUPERPATTERN

Frequency: 3

Non-data size: 6

Instances


Project Name: hellohaptik/chatbot_ner
Commit Name: 2e801652a510dea463b43cb718b87b54bf4acab3
Time: 2018-09-09
Author: pratik.jayarao@haptik.co
File Name: models/crf_v2/crf_preprocess_data.py
Class Name: CrfPreprocessData
Method Name: pre_process_text_


Project Name: snipsco/snips-nlu
Commit Name: be5b7adef5a75a8db9af53c641cf6e2233fb2832
Time: 2017-04-12
Author: clement.doumouro@snips.ai
File Name: snips_nlu/slot_filler/data_augmentation.py
Class Name:
Method Name: get_noise_iterator


Project Name: snipsco/snips-nlu
Commit Name: aae0ba842e293a63d6aaee45553712532054ef79
Time: 2017-04-12
Author: clement.doumouro@snips.ai
File Name: snips_nlu/slot_filler/data_augmentation.py
Class Name:
Method Name: get_noise_iterator