def convert_lst_to_features(lst_str, seq_length, tokenizer):
  """Loads a data file into a list of `InputFeatures`s."""
  features = []
  examples = read_examples(lst_str)

  for (ex_index, example) in enumerate(examples):
    tokens_a = tokenizer.tokenize(example.text_a)

    tokens_b = None
    if example.text_b:
      tokens_b = tokenizer.tokenize(example.text_b)

    if tokens_b:
      # Modifies `tokens_a` and `tokens_b` in place so that the total
      # length is less than the specified length.
      # Account for [CLS], [SEP], [SEP] with "- 3"
      _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
    else:
      # Account for [CLS] and [SEP] with "- 2"
      if len(tokens_a) > seq_length - 2:
        tokens_a = tokens_a[0:(seq_length - 2)]
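    # For example, with seq_length=8 a single 10-wordpiece sentence is cut to
    # its first 6 wordpieces so that [CLS] and [SEP] still fit.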
    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0     0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
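
    # Build the joint wordpiece sequence and its segment (type) ids following
    # the layout described above.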
    tokens = []
    input_type_ids = []
    tokens.append("[CLS]")
    input_type_ids.append(0)
    for token in tokens_a:
      tokens.append(token)
      input_type_ids.append(0)
    tokens.append("[SEP]")
    input_type_ids.append(0)

    if tokens_b:
      for token in tokens_b:
        tokens.append(token)
        input_type_ids.append(1)
      tokens.append("[SEP]")
      input_type_ids.append(1)
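
    # Map the wordpiece strings to vocabulary ids.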
    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < seq_length:
      input_ids.append(0)
      input_mask.append(0)
      input_type_ids.append(0)

    assert len(input_ids) == seq_length
    assert len(input_mask) == seq_length
    assert len(input_type_ids) == seq_length
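
    # Log the first few examples so the constructed features can be inspected.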
    if ex_index < 5:
      tf.logging.info("*** Example ***")
      tf.logging.info("unique_id: %s" % (example.unique_id))
      tf.logging.info("tokens: %s" % " ".join(
          [tokenization.printable_text(x) for x in tokens]))
      tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
      tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
      tf.logging.info(
          "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
    features.append(
        InputFeatures(
            unique_id=example.unique_id,
            tokens=tokens,
            input_ids=input_ids,
            input_mask=input_mask,
            input_type_ids=input_type_ids))

  return features
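
# A minimal usage sketch, not part of the original module. It assumes a local
# vocab file path and that `read_examples` accepts a list of raw strings (as
# the `lst_str` argument name suggests); adjust both to your setup.
#
#   tokenizer = tokenization.FullTokenizer(
#       vocab_file="/path/to/vocab.txt", do_lower_case=True)
#   feats = convert_lst_to_features(
#       ["the dog is hairy ."], seq_length=16, tokenizer=tokenizer)
#   print(feats[0].tokens)      # e.g. ['[CLS]', 'the', 'dog', ...]
#   print(feats[0].input_mask)  # 1s for real tokens, then 0s for padding
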
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
  """Truncates a sequence pair in place to the maximum length."""