cea292769af4ac688649573a11b20f4d69024e3d,tests/unit_test/processor_units/test_processor_units.py,,test_bert_tokenizer_unit,#,145

Before Change


        "[UNK]", "[CLS]", "[SEP]", "want", "////want", "////ed", "wa", "un", "runn",
        "////ing", ","
    ]
    with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
        vocab_writer.write("".join(
            [x + "\n" for x in vocab_tokens]).encode("utf-8"))

    vocab_file = vocab_writer.name

    tokenizer_unit = units.BertTokenize(vocab_file, do_lower_case=True)
    os.unlink(vocab_file)

After Change




def test_bert_tokenizer_unit():
    vocab_tokens = [
        "[PAD]", "further", "////more", ",", "under", "the", "micro", "////scope", "neither",
        "entity", "contains", "glands", ".", "此", "外", "在", "显", "微", "镜", "下"
    ]
    raw_text = "furthermore, \r under the microscope \t neither entity  \n contains sebaceous glands. 此外, 在显微镜下"

    golden_tokens = ["further", "////more", ",", "under", "the", "micro", "////scope", "neither", "entity", "contains",
                     "[UNK]", "glands", ".", "此", "外", ",", "在", "显", "微", "镜", "下"]

In pattern: SUPERPATTERN

Frequency: 3

Non-data size: 3

Instances

Link

Project Name: NTMC-Community/MatchZoo

Commit Name: cea292769af4ac688649573a11b20f4d69024e3d

Time: 2019-05-15

Author: 469413628@qq.com

File Name: tests/unit_test/processor_units/test_processor_units.py

Class Name:

Method Name: test_bert_tokenizer_unit

Link

Project Name: tensorflow/minigo

Commit Name: c42da55607fa672a1f0acf3dbd9fb8e8fbdc57a6

Time: 2019-11-20

Author: tmadams@google.com

File Name: dual_net.py

Class Name:

Method Name: freeze_graph_tpu

Link

Project Name: ray-project/ray

Commit Name: 898e47242527e402f4d014fd084c31843d64d1b8

Time: 2020-06-27

Author: mehrdadn@users.noreply.github.com

File Name: python/ray/test_utils.py

Class Name:

Method Name: run_string_as_driver_nonblocking