dfc58ff63a64e9a3b83afaeded706192505f08fa,robotreviewer/textprocessing/pdfreader.py,PdfReader,parse_xml,#PdfReader#Any#,117

Before Change


        output = MultiDict()
        full_text_bits = []
        author_list = []
        author_bits = []
        path = []
        for event, elem in ET.iterparse(StringIO(xml_string.encode("utf-8")),events=("start", "end")):
            if event == "start":
                path.append(elem.tag)
            elif event == "end":
                if elem.tag=="{http://www.tei-c.org/ns/1.0}abstract":
                    output.grobid["abstract"] = (self._extract_text(elem))
                elif elem.tag=="{http://www.tei-c.org/ns/1.0}title" and "{http://www.tei-c.org/ns/1.0}titleStmt" in path:
                    output.grobid["title"] = self._extract_text(elem)
                elif elem.tag in ["{http://www.tei-c.org/ns/1.0}head", "{http://www.tei-c.org/ns/1.0}p"]:
                    full_text_bits.extend([self._extract_text(elem), "\n"])
                elif elem.tag=="{http://www.tei-c.org/ns/1.0}forename":
                    author_bits.append(self._extract_text(elem))
                elif elem.tag=="{http://www.tei-c.org/ns/1.0}surname":
                    author_bits.append(self._extract_text(elem))
                    author_list.append(author_bits)
                    author_bits = []

After Change



        output.grobid["text"] = "\n".join(full_text_bits)
        output.grobid["authors"] = author_list
        log.info("author list: %s" % author_list)
        
        return output

    def _extract_text(self, elem):
[image; alt text: "Italian Trulli"]
In pattern: SUPERPATTERN

Frequency: 4

Non-data size: 6

Instances


Project Name: ijmarshall/robotreviewer
Commit Name: dfc58ff63a64e9a3b83afaeded706192505f08fa
Time: 2016-08-17
Author: byron.wallace@gmail.com
File Name: robotreviewer/textprocessing/pdfreader.py
Class Name: PdfReader
Method Name: parse_xml


Project Name: deepchem/deepchem
Commit Name: 3b3a06ad8402079c2d18718349d5f0f212ac7b81
Time: 2020-12-11
Author: nfrey213@gmail.com
File Name: deepchem/feat/base_classes.py
Class Name: ComplexFeaturizer
Method Name: featurize


Project Name: ijmarshall/robotreviewer
Commit Name: 6a7db8d13e7d24a209a0bb22a3ebde24057b0adb
Time: 2016-08-17
Author: byron.wallace@gmail.com
File Name: robotreviewer/textprocessing/pdfreader.py
Class Name: PdfReader
Method Name: parse_xml


Project Name: ilastik/ilastik
Commit Name: fe073644f6a8f37e9ce57df903bf12b560690fc3
Time: 2012-09-14
Author: christoph.straehle@iwr.uni-heidelberg.de
File Name: lazyflow/operators/obsolete/classifierOperators.py
Class Name: OpPredictRandomForest
Method Name: execute