351bcbc392bfa8869fc01e5a047273b53d24419e,finetune/utils.py,,finetune_to_indico_sequence,#Any#Any#Any#Any#Any#,312

Before Change


        n_tokens = len(tokens)

        doc_text = ""
        doc_annotations = []
        annotation_start = 0
        annotation_end = 0
        start_idx = 0
        end_idx = 0
        for sub_str, label in zip(doc_seq, label_seq):
            stripped_text = sub_str.strip()
            annotation_start = raw_text.find(stripped_text, annotation_end)
            annotation_end = annotation_start + len(stripped_text)

            if not subtoken_predictions:
                # round to nearest token
                while start_idx < n_tokens and annotation_start >= token_starts[start_idx]:
                    start_idx += 1
                annotation_start = token_starts[start_idx - 1]
                while end_idx < (n_tokens - 1) and annotation_end > token_ends[end_idx]:
                    end_idx += 1
                annotation_end = token_ends[end_idx]
            
            text = raw_text[annotation_start:annotation_end]
            if label != none_value:
                doc_annotations.append(
                    {
                        "start": annotation_start,
                        "end": annotation_end,
                        "label": label,
                        "text": text
                    }
                )
        annotations.append(doc_annotations)
    return raw_texts, annotations

After Change


        n_tokens = len(tokens)

        doc_text = ""
        doc_annotations = set([])
        annotation_start = 0
        annotation_end = 0
        start_idx = 0
        end_idx = 0
        for sub_str, label in zip(doc_seq, label_seq):
            stripped_text = sub_str.strip()
            annotation_start = raw_text.find(stripped_text, annotation_end)
            annotation_end = annotation_start + len(stripped_text)

            if not subtoken_predictions:
                # round to nearest token
                while start_idx < n_tokens and annotation_start >= token_starts[start_idx]:
                    start_idx += 1
                annotation_start = token_starts[start_idx - 1]
                while end_idx < (n_tokens - 1) and annotation_end > token_ends[end_idx]:
                    end_idx += 1
                annotation_end = token_ends[end_idx]
            
            text = raw_text[annotation_start:annotation_end]
            if label != none_value:
                doc_annotations.add(
                    (
                        ("start", annotation_start),
                        ("end", annotation_end),
                        ("label", label),
                        ("text",  text)
                    )
                )
        doc_annotations = sorted([dict(items) for items in doc_annotations], key=lambda x: x["start"])
        annotations.append(doc_annotations)
    return raw_texts, annotations
Italian Trulli
In pattern: SUPERPATTERN

Frequency: 3

Non-data size: 7

Instances


Project Name: IndicoDataSolutions/finetune
Commit Name: 351bcbc392bfa8869fc01e5a047273b53d24419e
Time: 2018-08-01
Author: madison@indico.io
File Name: finetune/utils.py
Class Name:
Method Name: finetune_to_indico_sequence


Project Name: allenai/allennlp
Commit Name: 40ec35876d38c4797ad3ee9bf911b019faa5a61d
Time: 2017-08-25
Author: mattg@allenai.org
File Name: allennlp/nn/initializers.py
Class Name: InitializerApplicator
Method Name: __call__


Project Name: facebookresearch/ParlAI
Commit Name: 2cbc5eb42ee7c742a5ea0eca3f2de945a513c00a
Time: 2019-07-31
Author: 5313281+dexterju@users.noreply.github.com
File Name: parlai/core/metrics.py
Class Name: Metrics
Method Name: __init__