dfc58ff63a64e9a3b83afaeded706192505f08fa,robotreviewer/textprocessing/pdfreader.py,PdfReader,parse_xml,#PdfReader#Any#,117

Before Change


        # Single streaming pass over the TEI XML held in `xml_string`, collecting
        # abstract, title, body text and author name parts into `output.grobid`.
        output = MultiDict()
        # Text of every <head>/<p> element; each entry is followed by a "\n" entry.
        full_text_bits = []
        # One list of name parts per author, closed off whenever a <surname> ends.
        author_list = []
        # Name parts gathered so far for the author currently being read.
        author_bits = []
        # Tags of the elements that are currently open (ancestors plus the
        # element being processed), maintained via the start/end events below.
        path = []
        for event, elem in ET.iterparse(StringIO(xml_string.encode("utf-8")),events=("start", "end")):
            if event == "start":
                path.append(elem.tag)
            elif event == "end":
                # Tags are compared in their fully qualified TEI-namespace form.
                if elem.tag=="{http://www.tei-c.org/ns/1.0}abstract":
                    output.grobid["abstract"] = (self._extract_text(elem))
                # Only a <title> that has a <titleStmt> among its open ancestors
                # is taken as the document title.
                elif elem.tag=="{http://www.tei-c.org/ns/1.0}title" and "{http://www.tei-c.org/ns/1.0}titleStmt" in path:
                    output.grobid["title"] = self._extract_text(elem)
                # <head> and <p> are matched wherever they occur; `path` is not
                # consulted for them.
                elif elem.tag in ["{http://www.tei-c.org/ns/1.0}head", "{http://www.tei-c.org/ns/1.0}p"]:
                    full_text_bits.extend([self._extract_text(elem), "\n"])
                elif elem.tag=="{http://www.tei-c.org/ns/1.0}forename":
                    author_bits.append(self._extract_text(elem))
                # A <surname> closes the current author: its text is appended, the
                # collected parts are stored, and a fresh parts list is started.
                # Assumes forenames precede the surname in the XML -- TODO confirm.
                elif elem.tag=="{http://www.tei-c.org/ns/1.0}surname":
                    author_bits.append(self._extract_text(elem))
                    author_list.append(author_bits)
                    author_bits = []

                # The element is finished, so drop its tag from the open-element stack.
                path.pop()
        # The entries already include "\n" separators, so joining on "\n" leaves
        # several consecutive newlines between text pieces.
        output.grobid["text"] = "\n".join(full_text_bits)
        # NOTE(review): this stores `author_bits`, which is reset to [] after every
        # <surname>, rather than `author_list`, which holds the collected authors.
        # Looks unintended -- confirm before relying on the "authors" value.
        output.grobid["authors"] = author_bits

After Change


                elif elem.tag in ["{http://www.tei-c.org/ns/1.0}head", "{http://www.tei-c.org/ns/1.0}p"]:
                    full_text_bits.extend([self._extract_text(elem), "\n"])
                elif elem.tag=="{http://www.tei-c.org/ns/1.0}author":
                    author_list.append(re.sub("\s+"," ", self._extract_text(elem)))
                    
                path.pop()
Italian Trulli
In pattern: SUPERPATTERN

Frequency: 3

Non-data size: 7

Instances


Project Name: ijmarshall/robotreviewer
Commit Name: dfc58ff63a64e9a3b83afaeded706192505f08fa
Time: 2016-08-17
Author: byron.wallace@gmail.com
File Name: robotreviewer/textprocessing/pdfreader.py
Class Name: PdfReader
Method Name: parse_xml


Project Name: ijmarshall/robotreviewer
Commit Name: 6a7db8d13e7d24a209a0bb22a3ebde24057b0adb
Time: 2016-08-17
Author: byron.wallace@gmail.com
File Name: robotreviewer/textprocessing/pdfreader.py
Class Name: PdfReader
Method Name: parse_xml


Project Name: dmlc/gluon-nlp
Commit Name: 8880acf0899efee237251cbd01c7ff81fc535789
Time: 2018-06-22
Author: szhengac@users.noreply.github.com
File Name: scripts/nmt/bleu.py
Class Name:
Method Name: _split_compound_word