3252f2117a4b693ca001613b13c28cc2d8cd9eb7,tests/candidates/test_candidates.py,,test_ngrams,#,424

Before Change



    docs_path = "tests/data/pure_html/lincoln_short.html"

    logger.info("Parsing...")
    doc_preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)
    corpus_parser = Parser(session, structural=True, lingual=True)
    corpus_parser.apply(doc_preprocessor, parallelism=PARALLEL)
    assert session.query(Document).count() == max_docs

After Change



def test_ngrams():
    Test ngram limits in mention extraction
    file_name = "lincoln_short"
    docs_path = f"tests/data/pure_html/{file_name}.html"
    doc = parse_doc(docs_path, file_name)

    // Mention Extraction
    Person = mention_subclass("Person")
    person_ngrams = MentionNgrams(n_max=3)
    person_matcher = PersonMatcher()

    mention_extractor_udf = MentionExtractorUDF(
        [Person], [person_ngrams], [person_matcher]
    )
    doc = mention_extractor_udf.apply(doc)

    assert len(doc.persons) == 118
    mentions = doc.persons
    assert len([x for x in mentions if x.context.get_num_words() == 1]) == 49
    assert len([x for x in mentions if x.context.get_num_words() > 3]) == 0

    // Test for unigram exclusion
    for mention in doc.persons[:]:
        doc.persons.remove(mention)
    assert len(doc.persons) == 0

    person_ngrams = MentionNgrams(n_min=2, n_max=3)
    mention_extractor_udf = MentionExtractorUDF(
Italian Trulli
In pattern: SUPERPATTERN

Frequency: 4

Non-data size: 5

Instances


Project Name: HazyResearch/fonduer
Commit Name: 3252f2117a4b693ca001613b13c28cc2d8cd9eb7
Time: 2020-02-14
Author: hiromu.hota@hal.hitachi.com
File Name: tests/candidates/test_candidates.py
Class Name:
Method Name: test_ngrams


Project Name: HazyResearch/fonduer
Commit Name: 3252f2117a4b693ca001613b13c28cc2d8cd9eb7
Time: 2020-02-14
Author: hiromu.hota@hal.hitachi.com
File Name: tests/candidates/test_candidates.py
Class Name:
Method Name: test_multimodal_cand


Project Name: HazyResearch/fonduer
Commit Name: 3252f2117a4b693ca001613b13c28cc2d8cd9eb7
Time: 2020-02-14
Author: hiromu.hota@hal.hitachi.com
File Name: tests/candidates/test_candidates.py
Class Name:
Method Name: test_mention_longest_match


Project Name: HazyResearch/fonduer
Commit Name: 3252f2117a4b693ca001613b13c28cc2d8cd9eb7
Time: 2020-02-14
Author: hiromu.hota@hal.hitachi.com
File Name: tests/candidates/test_candidates.py
Class Name:
Method Name: test_row_col_ngram_extraction