// helper sets of unique pubmed ids that were already found
abstract_unique_list = set([])
full_unique_list = set([])
// nl-docid set
nl_doc_id_set = { "empty" }
// is abstract var
is_abstract = True
// precompile abstract match
regex_abstract_id = re.compile(r"^s[12][shp]")
for pubmedid, partid, is_abs, ann in self.all_annotations_with_ids_and_is_abstract():
// abstract?
if not is_abs:
is_abstract = False
else:
if regex_abstract_id.match(partid) or partid == "abstract":
// NOTE: added issue #80 for this (issue number recovered from a mis-encoded character; verify)
is_abstract = True
else:
is_abstract = False
if ann.class_id == MUT_CLASS_ID:
// preprocessing
token_nr = len(ann.text.split(" "))
mentions_nr += 1
mentions_token_nr += token_nr
// TODO make parameterisable to just check for pure nl mentions
if ann.subclass == 1 or ann.subclass == 2:
// total nr increase
nl_nr += 1
nl_token_nr += token_nr
// min doc attribute
if pubmedid not in nl_doc_id_set:
nl_doc_id_set.add(pubmedid)
// abstract nr of tokens increase
if is_abstract:
abstract_mentions_nr += 1
abstract_token_nr += token_nr
abstract_nl_mentions.append(ann.text)
else:
// full document nr of tokens increase
full_document_mentions_nr += 1
full_document_token_nr += token_nr
full_nl_mentions.append(ann.text)
// append the nl mention text to the nl_mentions list
nl_mentions.append(ann.text)
// post-processing for abstract vs full document tokens
for doc_id, doc in self.documents.items():
for partid, part in doc.parts.items():
if not part.is_abstract:
is_abs = False
else:
is_abs = True
// if not regex_abstract_id.match(partid) and not "abstract" in partid:
// // if regex_abstract_id.match(partid) or partid == "abstract" or (len(partid) > 7 and partid[:8] == "abstract"):
// is_abs = False
// else:
// is_abs = True
if len(part.sentences) > 0:
tokens = sum(1 for sublist in part.sentences for _ in sublist)
// print(tokens, len(part.text.split(" ")))
else:
tokens = False
if not is_abs:
full_unique_list.add(doc_id)
if tokens:
total_token_full += tokens
else:
total_token_full += len(part.text.split(" "))
else:
abstract_unique_list.add(doc_id)
if tokens:
total_token_abstract += tokens
else:
total_token_abstract += len(part.text.split(" "))
abstract_unique_list = abstract_unique_list.difference(full_unique_list)
abstract_doc_nr = len(abstract_unique_list)
full_doc_nr = len(full_unique_list)
report_dict = {
"nl_mention_nr": nl_nr,