def convert_lst_to_features(lst_str, seq_length, tokenizer):
  """Loads a data file into a list of `InputFeatures`s."""
  features = []
  examples = read_examples(lst_str)

  for (ex_index, example) in enumerate(examples):
    tokens_a = tokenizer.tokenize(example.text_a)

    tokens_b = None
    if example.text_b:
      tokens_b = tokenizer.tokenize(example.text_b)

    if tokens_b:
      # Modifies `tokens_a` and `tokens_b` in place so that the total
      # length is less than the specified length.
      # Account for [CLS], [SEP], [SEP] with "- 3"
      _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
    else:
      # Account for [CLS] and [SEP] with "- 2"
      if len(tokens_a) > seq_length - 2:
        tokens_a = tokens_a[0:(seq_length - 2)]
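    # For example, with seq_length=8 a single 10-wordpiece sentence is cut to
    # its first 6 wordpieces so that [CLS] and [SEP] still fit.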
    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0     0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
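
    # Build the joint wordpiece sequence and its segment (type) ids following
    # the layout described above.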
    tokens = []
    input_type_ids = []
    tokens.append("[CLS]")
    input_type_ids.append(0)
    for token in tokens_a:
      tokens.append(token)
      input_type_ids.append(0)
    tokens.append("[SEP]")
    input_type_ids.append(0)

    if tokens_b:
      for token in tokens_b:
        tokens.append(token)
        input_type_ids.append(1)
      tokens.append("[SEP]")
      input_type_ids.append(1)
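
    # Map the wordpiece strings to vocabulary ids.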
    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < seq_length:
      input_ids.append(0)
      input_mask.append(0)
      input_type_ids.append(0)

    assert len(input_ids) == seq_length
    assert len(input_mask) == seq_length
    assert len(input_type_ids) == seq_length
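
    # Log the first few examples so the constructed features can be inspected.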
    if ex_index < 5:
      tf.logging.info("*** Example ***")
      tf.logging.info("unique_id: %s" % (example.unique_id))
      tf.logging.info("tokens: %s" % " ".join(
          [tokenization.printable_text(x) for x in tokens]))
      tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
      tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
      tf.logging.info(
          "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
    features.append(
        InputFeatures(
            unique_id=example.unique_id,
            tokens=tokens,
            input_ids=input_ids,
            input_mask=input_mask,
            input_type_ids=input_type_ids))

  return features
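
# A minimal usage sketch, not part of the original module. It assumes a local
# vocab file path and that `read_examples` accepts a list of raw strings (as
# the `lst_str` argument name suggests); adjust both to your setup.
#
#   tokenizer = tokenization.FullTokenizer(
#       vocab_file="/path/to/vocab.txt", do_lower_case=True)
#   feats = convert_lst_to_features(
#       ["the dog is hairy ."], seq_length=16, tokenizer=tokenizer)
#   print(feats[0].tokens)      # e.g. ['[CLS]', 'the', 'dog', ...]
#   print(feats[0].input_mask)  # 1s for real tokens, then 0s for padding
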
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
  """Truncates a sequence pair in place to the maximum length."""