# Build the connection string from ATTRIBUTE and obtain a session from Meta.
session = Meta.init("postgres://localhost:5432/" + ATTRIBUTE).Session()

# SpaCy on Mac has issues with parallel parsing, so use a single process there.
# NOTE(review): os.name is "posix" on Linux as well as macOS, so this branch
# also selects a single process on Linux CI -- confirm that is intended.
if os.name == "posix":
    PARALLEL = 1
else:
    PARALLEL = 2  # Travis only gives 2 cores

max_docs = 1
docs_path = "tests/data/html_extended/ext_diseases.html"
pdf_path = "tests/data/pdf_extended/ext_diseases.pdf"

# Preprocessor for the docs
preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)

# Create a Parser and parse the HTML document
parser = Parser(structural=True, lingual=True, visual=True, pdf_path=pdf_path)
parser.apply(preprocessor, parallelism=PARALLEL)

# Grab the first document when ordered by name
doc = session.query(Document).order_by(Document.name).all()[0]
# After Change
# Preprocessor for the docs
preprocessor = HTMLDocPreprocessor(docs_path)
# Take the first item produced by parse_file and unpack it as (doc, text).
doc, text = next(preprocessor.parse_file(docs_path, "ext_diseases"))

# Create a parser UDF and parse the diseases document
parser_udf = get_parser_udf(
    structural=True, lingual=True, visual=True, pdf_path=pdf_path
)
# Iterate apply() to exhaustion; the yielded values themselves are discarded.
# NOTE(review): presumably this is what populates doc.sentences -- confirm.
for _ in parser_udf.apply((doc, text)):
    pass

# Grab the sentences parsed by the Parser
sentences = doc.sentences
logger.warning("Doc: {}".format(doc))
for i, sentence in enumerate(sentences):