if sum(len(sents) for sents in new_sentences_by_line) < min_num_sentences:
counter_inc_fn("filtered-doc-toofewsentences")
return
yield (url, "\n".join(" ".join(sent) for sent in new_sentences_by_line))
def remove_duplicate_text(pages, sentence_window_size=3):
  """Utility to remove duplicate sentence windows across text documents.

  Yields:
    features: The page features with sentences removed.
  """
# NOTE(review): the `def` line for this generator was lost in the source
# mangling; the signature below is reconstructed from the body's free
# variables (el, counter_inc_fn, min_num_sentences, max_window_size) —
# confirm parameter names/order against the upstream pipeline.
def _remove_sentences_from_text(el, counter_inc_fn, min_num_sentences,
                                max_window_size):
  """Removes flagged duplicate sentence windows from a page's text.

  Args:
    el: a `(url, join_values)` pair. `join_values["features"]` must be a
      single-element list of page feature dicts containing a "text" field;
      `join_values["sentences"]` holds the lowercased sentence-window tuples
      to strip from the page.
    counter_inc_fn: callable `(counter_name, amount)` used to increment
      pipeline metric counters.
    min_num_sentences: minimum number of sentences a page must retain after
      removal; pages below this threshold are dropped entirely.
    max_window_size: number of consecutive sentences per dedup window.

  Yields:
    `(url, features)` with `features["text"]` rewritten to exclude the
    removed sentences. Yields nothing when the page is filtered out.
  """
  url, join_values = el
  features = join_values["features"]
  assert len(features) == 1, "Invalid page count (%d) for %s" % (len(features),
                                                                 url)
  features = features[0]
  text = features["text"]
  sentences_to_remove = set(join_values["sentences"])
  sentences_by_line = _get_sentences_by_line(text, lower=False)
  new_sentences_by_line = []
  for line_sentences in sentences_by_line:
    indices_to_remove = set()
    # Slide a window of up to max_window_size sentences across the line;
    # comparison is done on lowercased windows, matching how the removal
    # set was built.
    for i in range(
        len(line_sentences) - min(len(line_sentences), max_window_size) + 1):
      sentence_window = tuple(
          s.lower() for s in line_sentences[i:i + max_window_size])
      if sentence_window in sentences_to_remove:
        # Mark every sentence covered by the matched window for removal.
        indices_to_remove.update(range(i, i + len(sentence_window)))
    counter_inc_fn("filtered-sentence-duplicate", len(indices_to_remove))
    new_line_sentences = [
        s for i, s in enumerate(line_sentences) if i not in indices_to_remove]
    # Lines left with no sentences are dropped from the page entirely.
    if new_line_sentences:
      new_sentences_by_line.append(new_line_sentences)
  # Drop the whole page if too few sentences survive the removal.
  if sum(len(sents) for sents in new_sentences_by_line) < min_num_sentences:
    counter_inc_fn("filtered-doc-toofewsentences")
    return
  # Fix for the mangled original, which fused the assignment and the yield
  # onto one line (a syntax error): they are two separate statements.
  features["text"] = "\n".join(
      " ".join(sent) for sent in new_sentences_by_line)
  yield (url, features)
def remove_duplicate_text(pages, sentence_window_size=3):
Utility to remove duplicate sentence windows across text documents.