# End-to-end check: parse one HTML document, extract Person mentions,
# verify counts, then exercise mention deletion via the document relationship.
PARALLEL = 4  # number of worker processes for parsing / extraction
max_docs = 1
session = Meta.init(CONN_STRING).Session()
docs_path = "tests/data/pure_html/lincoln_short.html"
logger.info("Parsing...")
doc_preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)
corpus_parser = Parser(session, structural=True, lingual=True)
corpus_parser.apply(doc_preprocessor, parallelism=PARALLEL)
assert session.query(Document).count() == max_docs
assert session.query(Sentence).count() == 503
docs = session.query(Document).order_by(Document.name).all()
# Mention Extraction
Person = mention_subclass("Person")
person_ngrams = MentionNgrams(n_max=3)
person_matcher = PersonMatcher()
mention_extractor = MentionExtractor(
    session, [Person], [person_ngrams], [person_matcher]
)
mention_extractor.apply(docs, parallelism=PARALLEL)
assert session.query(Person).count() == 118
mentions = session.query(Person).all()
# Unigram mentions and the n_max=3 cap from MentionNgrams.
assert len([x for x in mentions if x.context.get_num_words() == 1]) == 49
assert len([x for x in mentions if x.context.get_num_words() > 3]) == 0
# Test that mentions can be removed one-by-one through the document's
# relationship collection. max_docs == 1, so docs[0] is the only document.
doc = docs[0]
# Iterate a copy (the [:] slice) so removal from the live list is safe.
for mention in doc.persons[:]:
    doc.persons.remove(mention)
assert len(doc.persons) == 0