3252f2117a4b693ca001613b13c28cc2d8cd9eb7,tests/candidates/test_candidates.py,,test_mention_longest_match,#,508
Before Change
def test_mention_longest_match():
    """Test longest match filtering in mention extraction.

    Parses a single HTML document, then extracts ``Place`` mentions twice
    with the same lambda matcher: once with ``longest_match_only=False``
    (the sub-span "Farm" is expected next to "Sinking Spring Farm") and once
    with ``longest_match_only=True`` (only the longest span is expected).
    """
    # SpaCy on mac has issue on parallel parsing
    PARALLEL = 1
    max_docs = 1

    session = Meta.init(CONN_STRING).Session()
    docs_path = "tests/data/pure_html/lincoln_short.html"

    # Parsing
    logger.info("Parsing...")
    doc_preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)
    corpus_parser = Parser(session, structural=True, lingual=True)
    corpus_parser.apply(doc_preprocessor, parallelism=PARALLEL)
    docs = session.query(Document).order_by(Document.name).all()

    # Mention Extraction
    name_ngrams = MentionNgramsPart(n_max=3)
    place_ngrams = MentionNgramsTemp(n_max=4)

    Name = mention_subclass("Name")
    Place = mention_subclass("Place")

    def is_birthplace_table_row(mention):
        """Accept mentions in a tabular sentence whose row ngrams include "birth_place"."""
        if not mention.sentence.is_tabular():
            return False
        return "birth_place" in get_row_ngrams(mention, lower=True)

    def extract_place_spans(longest_match_only):
        """Run one extraction pass and return the span text of every Place mention."""
        matcher = LambdaFunctionMatcher(
            func=is_birthplace_table_row, longest_match_only=longest_match_only
        )
        extractor = MentionExtractor(
            session,
            [Name, Place],
            [name_ngrams, place_ngrams],
            [PersonMatcher(), matcher],
        )
        # NOTE(review): the second pass presumably relies on apply() discarding
        # the mentions stored by the first pass -- confirm against MentionExtractor.
        extractor.apply(docs, parallelism=PARALLEL)
        return [place.context.get_span() for place in session.query(Place).all()]

    # Without longest-match filtering, shorter overlapping spans are kept.
    all_spans = extract_place_spans(longest_match_only=False)
    assert "Sinking Spring Farm" in all_spans
    assert "Farm" in all_spans
    assert len(all_spans) == 23

    # With longest-match filtering, only the longest span of each match remains.
    longest_spans = extract_place_spans(longest_match_only=True)
    assert "Sinking Spring Farm" in longest_spans
    assert "Farm" not in longest_spans
    assert len(longest_spans) == 4
After Change
def test_mention_longest_match():
    """Test longest match filtering in mention extraction.

    Parses one HTML document via ``parse_doc`` and applies
    ``MentionExtractorUDF`` to it twice with the same lambda matcher: first
    with ``longest_match_only=False`` (the sub-span "Farm" is expected next
    to "Sinking Spring Farm"), then with ``longest_match_only=True`` (only
    the longest span is expected). ``doc.places`` is emptied by hand between
    the two passes.
    """
    file_name = "lincoln_short"
    docs_path = f"tests/data/pure_html/{file_name}.html"
    doc = parse_doc(docs_path, file_name)

    # Mention Extraction
    name_ngrams = MentionNgramsPart(n_max=3)
    place_ngrams = MentionNgramsTemp(n_max=4)

    Name = mention_subclass("Name")
    Place = mention_subclass("Place")

    def is_birthplace_table_row(mention):
        """Accept mentions in a tabular sentence whose row ngrams include "birth_place"."""
        if not mention.sentence.is_tabular():
            return False
        return "birth_place" in get_row_ngrams(mention, lower=True)

    def extract_place_spans(document, longest_match_only):
        """Apply a fresh extractor UDF; return the resulting document and its Place spans."""
        matcher = LambdaFunctionMatcher(
            func=is_birthplace_table_row, longest_match_only=longest_match_only
        )
        extractor_udf = MentionExtractorUDF(
            [Name, Place],
            [name_ngrams, place_ngrams],
            [PersonMatcher(), matcher],
        )
        document = extractor_udf.apply(document)
        return document, [place.context.get_span() for place in document.places]

    # Without longest-match filtering, shorter overlapping spans are kept.
    doc, all_spans = extract_place_spans(doc, longest_match_only=False)
    assert "Sinking Spring Farm" in all_spans
    assert "Farm" in all_spans
    assert len(all_spans) == 23

    # Clear manually (iterate over a copy because the list is mutated in the loop)
    for stale_mention in list(doc.places):
        doc.places.remove(stale_mention)

    # With longest-match filtering, only the longest span of each match remains.
    doc, longest_spans = extract_place_spans(doc, longest_match_only=True)
    assert "Sinking Spring Farm" in longest_spans
    assert "Farm" not in longest_spans
    assert len(longest_spans) == 4
In pattern: SUPERPATTERN
Frequency: 3
Non-data size: 26
Instances
Project Name: HazyResearch/fonduer
Commit Name: 3252f2117a4b693ca001613b13c28cc2d8cd9eb7
Time: 2020-02-14
Author: hiromu.hota@hal.hitachi.com
File Name: tests/candidates/test_candidates.py
Class Name:
Method Name: test_mention_longest_match
Project Name: HazyResearch/fonduer
Commit Name: 3252f2117a4b693ca001613b13c28cc2d8cd9eb7
Time: 2020-02-14
Author: hiromu.hota@hal.hitachi.com
File Name: tests/candidates/test_candidates.py
Class Name:
Method Name: test_mention_longest_match
Project Name: HazyResearch/fonduer
Commit Name: 3252f2117a4b693ca001613b13c28cc2d8cd9eb7
Time: 2020-02-14
Author: hiromu.hota@hal.hitachi.com
File Name: tests/candidates/test_candidates.py
Class Name:
Method Name: test_ngrams
Project Name: HazyResearch/fonduer
Commit Name: a52ca17b208754909818f41b2f3340d77290b70c
Time: 2020-02-14
Author: hiromu.hota@hal.hitachi.com
File Name: tests/utils/test_visualizer.py
Class Name:
Method Name: test_visualizer