3252f2117a4b693ca001613b13c28cc2d8cd9eb7,tests/candidates/test_candidates.py,,test_ngrams,#,424
Before Change
docs_path = "tests/data/pure_html/lincoln_short.html"
logger.info("Parsing...")
doc_preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)
corpus_parser = Parser(session, structural=True, lingual=True)
corpus_parser.apply(doc_preprocessor, parallelism=PARALLEL)
assert session.query(Document).count() == max_docs
After Change
def test_ngrams():
Test ngram limits in mention extraction
file_name = "lincoln_short"
docs_path = f"tests/data/pure_html/{file_name}.html"
doc = parse_doc(docs_path, file_name)
// Mention Extraction
Person = mention_subclass("Person")
person_ngrams = MentionNgrams(n_max=3)
person_matcher = PersonMatcher()
mention_extractor_udf = MentionExtractorUDF(
[Person], [person_ngrams], [person_matcher]
)
doc = mention_extractor_udf.apply(doc)
assert len(doc.persons) == 118
mentions = doc.persons
assert len([x for x in mentions if x.context.get_num_words() == 1]) == 49
assert len([x for x in mentions if x.context.get_num_words() > 3]) == 0
// Test for unigram exclusion
for mention in doc.persons[:]:
doc.persons.remove(mention)
assert len(doc.persons) == 0
person_ngrams = MentionNgrams(n_min=2, n_max=3)
mention_extractor_udf = MentionExtractorUDF(
In pattern: SUPERPATTERN
Frequency: 4
Non-data size: 5
Instances
Project Name: HazyResearch/fonduer
Commit Name: 3252f2117a4b693ca001613b13c28cc2d8cd9eb7
Time: 2020-02-14
Author: hiromu.hota@hal.hitachi.com
File Name: tests/candidates/test_candidates.py
Class Name:
Method Name: test_ngrams
Project Name: HazyResearch/fonduer
Commit Name: 3252f2117a4b693ca001613b13c28cc2d8cd9eb7
Time: 2020-02-14
Author: hiromu.hota@hal.hitachi.com
File Name: tests/candidates/test_candidates.py
Class Name:
Method Name: test_multimodal_cand
Project Name: HazyResearch/fonduer
Commit Name: 3252f2117a4b693ca001613b13c28cc2d8cd9eb7
Time: 2020-02-14
Author: hiromu.hota@hal.hitachi.com
File Name: tests/candidates/test_candidates.py
Class Name:
Method Name: test_mention_longest_match
Project Name: HazyResearch/fonduer
Commit Name: 3252f2117a4b693ca001613b13c28cc2d8cd9eb7
Time: 2020-02-14
Author: hiromu.hota@hal.hitachi.com
File Name: tests/candidates/test_candidates.py
Class Name:
Method Name: test_row_col_ngram_extraction