b8bfc14deadd0d6de44c7fb32f60c4b324e00b97,nala/structures/data.py,Dataset,stats,#Dataset#,239

Before Change



                if is_abs:
                    total_token_full += len(part.text.split(" "))
                    if doc_id not in full_unique_list:
                        full_doc_nr += 1
                        full_unique_list.append(doc_id)

                    if tokens:
                        total_token_abstract += tokens
                    else:
                        total_token_abstract += len(part.text.split(" "))

After Change


        full_doc_nr = 0

        // helper lists with unique pubmed ids that were already found
        abstract_unique_list = set([])
        full_unique_list = set([])

        // nl-docid set
        nl_doc_id_set = { "empty" }

        // is abstract var
        is_abstract = True

        // precompile abstract match
        regex_abstract_id = re.compile(r"^s[12][shp]")

        for pubmedid, partid, is_abs, ann in self.all_annotations_with_ids_and_is_abstract():
            // abstract?
            if not is_abs:
                is_abstract = False
            else:
                if regex_abstract_id.match(partid) or partid == "abstract":
                // NOTE added issue #80 for this
                    is_abstract = True
                else:
                    is_abstract = False


            if ann.class_id == MUT_CLASS_ID:
                // preprocessing
                token_nr = len(ann.text.split(" "))
                mentions_nr += 1
                mentions_token_nr += token_nr

                // TODO make parameterisable to just check for pure nl mentions
                if ann.subclass == 1 or ann.subclass == 2:
                    // total nr increase
                    nl_nr += 1
                    nl_token_nr += token_nr

                    // min doc attribute
                    if pubmedid not in nl_doc_id_set:
                        nl_doc_id_set.add(pubmedid)

                    // abstract nr of tokens increase
                    if is_abstract:
                        abstract_mentions_nr += 1
                        abstract_token_nr += token_nr
                        abstract_nl_mentions.append(ann.text)
                    else:
                        // full document nr of tokens increase
                        full_document_mentions_nr += 1
                        full_document_token_nr += token_nr
                        full_nl_mentions.append(ann.text)

                    // nl text mention add to []
                    nl_mentions.append(ann.text)

        // post-processing for abstract vs full document tokens
        for doc_id, doc in self.documents.items():
            for partid, part in doc.parts.items():
                if not part.is_abstract:
                    is_abs = False
                else:
                    is_abs = True
                    // if not regex_abstract_id.match(partid) and not "abstract" in partid:
                    // // if regex_abstract_id.match(partid) or partid == "abstract" or (len(partid) > 7 and partid[:8] == "abstract"):
                    //     is_abs = False
                    // else:
                    //     is_abs = True

                if len(part.sentences) > 0:
                        tokens = sum(1 for sublist in part.sentences for _ in sublist)
                        // print(tokens, len(part.text.split(" ")))
                else:
                    tokens = False

                if not is_abs:
                    full_unique_list.add(doc_id)

                    if tokens:
                        total_token_full += tokens
                    else:
                        total_token_full += len(part.text.split(" "))
                else:
                    abstract_unique_list.add(doc_id)

                    if tokens:
                        total_token_abstract += tokens
                    else:
                        total_token_abstract += len(part.text.split(" "))

        abstract_unique_list = abstract_unique_list.difference(full_unique_list)
        abstract_doc_nr = len(abstract_unique_list)
        full_doc_nr = len(full_unique_list)

        report_dict = {
            "nl_mention_nr": nl_nr,
[image: Italian Trulli]
In pattern: SUPERPATTERN

Frequency: 4

Non-data size: 4

Instances


Project Name: Rostlab/nalaf
Commit Name: b8bfc14deadd0d6de44c7fb32f60c4b324e00b97
Time: 2015-10-13
Author: carsten.uhlig@gmail.com
File Name: nala/structures/data.py
Class Name: Dataset
Method Name: stats


Project Name: pgmpy/pgmpy
Commit Name: daa7947850ea7df6763ec076548ae4b372ee3fb9
Time: 2015-03-25
Author: pratyaksh@me.com
File Name: pgmpy/inference/ExactInference.py
Class Name: VariableElimination
Method Name: induced_graph


Project Name: dask/distributed
Commit Name: 909a943b67b6b472a2d77afa13a8caa61f25f972
Time: 2019-07-25
Author: jcrist@users.noreply.github.com
File Name: distributed/security.py
Class Name: Security
Method Name: __init__


Project Name: HazyResearch/fonduer
Commit Name: b9dc15ecef0393a55a953acff3c1d0278b19d9f5
Time: 2018-09-01
Author: lwhsiao@stanford.edu
File Name: src/fonduer/utils/utils_udf.py
Class Name:
Method Name: add_keys