e14c08dd732e73cbe2b3e249cba2632663abdd27,tests/parser/test_parser.py,,test_parse_style,#Any#,429

Before Change


    session = Meta.init("postgres://localhost:5432/" + ATTRIBUTE).Session()

    // SpaCy on Mac has issues with parallel parsing
    if os.name == "posix":
        PARALLEL = 1
    else:
        PARALLEL = 2  // Travis only gives 2 cores

    max_docs = 1
    docs_path = "tests/data/html_extended/ext_diseases.html"
    pdf_path = "tests/data/pdf_extended/ext_diseases.pdf"

    // Preprocessor for the Docs
    preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)

    // Create a Parser and parse the md document
    parser = Parser(structural=True, lingual=True, visual=True, pdf_path=pdf_path)
    parser.apply(preprocessor, parallelism=PARALLEL)

    // Grab the document
    doc = session.query(Document).order_by(Document.name).all()[0]

After Change



    // Preprocessor for the Docs
    preprocessor = HTMLDocPreprocessor(docs_path)
    doc, text = next(preprocessor.parse_file(docs_path, "ext_diseases"))

    // Create a Parser and parse the diseases document
    parser_udf = get_parser_udf(
        structural=True, lingual=True, visual=True, pdf_path=pdf_path
    )
    for _ in parser_udf.apply((doc, text)):
        pass

    // Grab the sentences parsed by the Parser
    sentences = doc.sentences

    logger.warning("Doc: {}".format(doc))
    for i, sentence in enumerate(sentences):
Italian Trulli
In pattern: SUPERPATTERN

Frequency: 3

Non-data size: 13

Instances


Project Name: HazyResearch/fonduer
Commit Name: e14c08dd732e73cbe2b3e249cba2632663abdd27
Time: 2018-08-29
Author: hiromu.hota@hal.hitachi.com
File Name: tests/parser/test_parser.py
Class Name:
Method Name: test_parse_style


Project Name: HazyResearch/fonduer
Commit Name: e14c08dd732e73cbe2b3e249cba2632663abdd27
Time: 2018-08-29
Author: hiromu.hota@hal.hitachi.com
File Name: tests/parser/test_parser.py
Class Name:
Method Name: test_parse_style


Project Name: HazyResearch/fonduer
Commit Name: e14c08dd732e73cbe2b3e249cba2632663abdd27
Time: 2018-08-29
Author: hiromu.hota@hal.hitachi.com
File Name: tests/parser/test_parser.py
Class Name:
Method Name: test_simple_tokenizer


Project Name: HazyResearch/fonduer
Commit Name: e14c08dd732e73cbe2b3e249cba2632663abdd27
Time: 2018-08-29
Author: hiromu.hota@hal.hitachi.com
File Name: tests/parser/test_parser.py
Class Name:
Method Name: test_parse_document_diseases