dfc58ff63a64e9a3b83afaeded706192505f08fa,robotreviewer/textprocessing/pdfreader.py,PdfReader,parse_xml,#PdfReader#Any#,117
Before Change
output = MultiDict()
full_text_bits = []
author_list = []
author_bits = []
path = []
for event, elem in ET.iterparse(StringIO(xml_string.encode("utf-8")),events=("start", "end")):
if event == "start":
path.append(elem.tag)
elif event == "end":
if elem.tag=="{http://www.tei-c.org/ns/1.0}abstract":
output.grobid["abstract"] = (self._extract_text(elem))
elif elem.tag=="{http://www.tei-c.org/ns/1.0}title" and "{http://www.tei-c.org/ns/1.0}titleStmt" in path:
output.grobid["title"] = self._extract_text(elem)
elif elem.tag in ["{http://www.tei-c.org/ns/1.0}head", "{http://www.tei-c.org/ns/1.0}p"]:
full_text_bits.extend([self._extract_text(elem), "\n"])
elif elem.tag=="{http://www.tei-c.org/ns/1.0}forename":
author_bits.append(self._extract_text(elem))
elif elem.tag=="{http://www.tei-c.org/ns/1.0}surname":
author_bits.append(self._extract_text(elem))
author_list.append(author_bits)
author_bits = []
path.pop()
output.grobid["text"] = "\n".join(full_text_bits)
output.grobid["authors"] = author_bits
After Change
elif elem.tag in ["{http://www.tei-c.org/ns/1.0}head", "{http://www.tei-c.org/ns/1.0}p"]:
full_text_bits.extend([self._extract_text(elem), "\n"])
elif elem.tag=="{http://www.tei-c.org/ns/1.0}author":
author_list.append(re.sub("\s+"," ", self._extract_text(elem)))
path.pop()
In pattern: SUPERPATTERN
Frequency: 3
Non-data size: 7
Instances
Project Name: ijmarshall/robotreviewer
Commit Name: dfc58ff63a64e9a3b83afaeded706192505f08fa
Time: 2016-08-17
Author: byron.wallace@gmail.com
File Name: robotreviewer/textprocessing/pdfreader.py
Class Name: PdfReader
Method Name: parse_xml
Project Name: ijmarshall/robotreviewer
Commit Name: 6a7db8d13e7d24a209a0bb22a3ebde24057b0adb
Time: 2016-08-17
Author: byron.wallace@gmail.com
File Name: robotreviewer/textprocessing/pdfreader.py
Class Name: PdfReader
Method Name: parse_xml
Project Name: dmlc/gluon-nlp
Commit Name: 8880acf0899efee237251cbd01c7ff81fc535789
Time: 2018-06-22
Author: szhengac@users.noreply.github.com
File Name: scripts/nmt/bleu.py
Class Name:
Method Name: _split_compound_word