351bcbc392bfa8869fc01e5a047273b53d24419e,finetune/utils.py,,finetune_to_indico_sequence,#Any#Any#Any#Any#Any#,312

Before Change


        n_tokens = len(tokens)

        doc_text = ""
        doc_annotations = []
        annotation_start = 0
        annotation_end = 0
        start_idx = 0
        end_idx = 0
        for sub_str, label in zip(doc_seq, label_seq):
            stripped_text = sub_str.strip()
            annotation_start = raw_text.find(stripped_text, annotation_end)
            annotation_end = annotation_start + len(stripped_text)

            if not subtoken_predictions:
                # round to nearest token
                while start_idx < n_tokens and annotation_start >= token_starts[start_idx]:
                    start_idx += 1
                annotation_start = token_starts[start_idx - 1]
                while end_idx < (n_tokens - 1) and annotation_end > token_ends[end_idx]:
                    end_idx += 1
                annotation_end = token_ends[end_idx]
            
            text = raw_text[annotation_start:annotation_end]
            if label != none_value:
                doc_annotations.append(
                    {
                        "start": annotation_start,
                        "end": annotation_end,
                        "label": label,
                        "text": text
                    }
                )
        annotations.append(doc_annotations)
    return raw_texts, annotations

After Change


        n_tokens = len(tokens)

        doc_text = ""
        doc_annotations = set([])
        annotation_start = 0
        annotation_end = 0
        start_idx = 0
        end_idx = 0
        for sub_str, label in zip(doc_seq, label_seq):
            stripped_text = sub_str.strip()
            annotation_start = raw_text.find(stripped_text, annotation_end)
            annotation_end = annotation_start + len(stripped_text)

            if not subtoken_predictions:
                # round to nearest token
                while start_idx < n_tokens and annotation_start >= token_starts[start_idx]:
                    start_idx += 1
                annotation_start = token_starts[start_idx - 1]
                while end_idx < (n_tokens - 1) and annotation_end > token_ends[end_idx]:
                    end_idx += 1
                annotation_end = token_ends[end_idx]
            
            text = raw_text[annotation_start:annotation_end]
            if label != none_value:
                doc_annotations.add(
                    (
                        ("start", annotation_start),
                        ("end", annotation_end),
                        ("label", label),
                        ("text",  text)
                    )
                )
        doc_annotations = sorted([dict(items) for items in doc_annotations], key=lambda x: x["start"])
        annotations.append(doc_annotations)
    return raw_texts, annotations

In pattern: SUPERPATTERN

Frequency: 3

Non-data size: 7

Instances

Link

Project Name: IndicoDataSolutions/finetune

Commit Name: 351bcbc392bfa8869fc01e5a047273b53d24419e

Time: 2018-08-01

Author: madison@indico.io

File Name: finetune/utils.py

Class Name:

Method Name: finetune_to_indico_sequence

Link

Project Name: allenai/allennlp

Commit Name: 40ec35876d38c4797ad3ee9bf911b019faa5a61d

Time: 2017-08-25

Author: mattg@allenai.org

File Name: allennlp/nn/initializers.py

Class Name: InitializerApplicator

Method Name: __call__

Link

Project Name: facebookresearch/ParlAI

Commit Name: 2cbc5eb42ee7c742a5ea0eca3f2de945a513c00a

Time: 2019-07-31

Author: 5313281+dexterju@users.noreply.github.com

File Name: parlai/core/metrics.py

Class Name: Metrics

Method Name: __init__