97645e74a08ad396d12870bd9fc613d135142527,tensorflow_datasets/text/c4_utils.py,,_remove_sentences_from_text,#Any#Any#Any#Any#,233

Before Change


  if sum(len(sents) for sents in new_sentences_by_line) < min_num_sentences:
    counter_inc_fn("filtered-doc-toofewsentences")
    return
  yield (url, "\n".join(" ".join(sent) for sent in new_sentences_by_line))


def remove_duplicate_text(pages, sentence_window_size=3):
  Utility to remove duplicate sentence windows across text documents.

After Change


    features: The page features with sentences removed.
  
  url, join_values = el
  features = join_values["features"]

  assert len(features) == 1, "Invalid page count (%d) for %s" % (len(features),
                                                                 url)
  features = features[0]
  text = features["text"]
  sentences_to_remove = set(join_values["sentences"])
  sentences_by_line = _get_sentences_by_line(text, lower=False)
  new_sentences_by_line = []
  for line_sentences in sentences_by_line:
    indices_to_remove = set()
    for i in range(
        len(line_sentences) - min(len(line_sentences), max_window_size) + 1):
      sentence_window = tuple(
          s.lower() for s in line_sentences[i:i+max_window_size])
      if sentence_window in sentences_to_remove:
        indices_to_remove.update(range(i, i+len(sentence_window)))
    counter_inc_fn("filtered-sentence-duplicate", len(indices_to_remove))
    new_line_sentences = [
        s for i, s in enumerate(line_sentences) if i not in indices_to_remove]
    if new_line_sentences:
      new_sentences_by_line.append(new_line_sentences)
  if sum(len(sents) for sents in new_sentences_by_line) < min_num_sentences:
    counter_inc_fn("filtered-doc-toofewsentences")
    return
  features["text"] = "\n".join(" ".join(sent) for sent in new_sentences_by_line)
  yield (url, features)


def remove_duplicate_text(pages, sentence_window_size=3):
  Utility to remove duplicate sentence windows across text documents.
[image: Italian Trulli]
In pattern: SUPERPATTERN

Frequency: 4

Non-data size: 7

Instances


Project Name: tensorflow/datasets
Commit Name: 97645e74a08ad396d12870bd9fc613d135142527
Time: 2019-10-30
Author: no-reply@google.com
File Name: tensorflow_datasets/text/c4_utils.py
Class Name:
Method Name: _remove_sentences_from_text


Project Name: PyMVPA/PyMVPA
Commit Name: 05a60523adcab106a4abe2abb82bd922a8030df8
Time: 2007-09-18
Author: michael.hanke@gmail.com
File Name: mvpa/algorithms.py
Class Name:
Method Name: SpheresInMask


Project Name: tensorflow/datasets
Commit Name: dc355077a51dc721fbb7fa01653d7650790f1739
Time: 2020-04-01
Author: jpuigcerver@google.com
File Name: tensorflow_datasets/image_classification/food101.py
Class Name: Food101
Method Name: _generate_examples


Project Name: tensorflow/datasets
Commit Name: dc355077a51dc721fbb7fa01653d7650790f1739
Time: 2020-04-01
Author: jpuigcerver@google.com
File Name: tensorflow_datasets/image_classification/cars196.py
Class Name: Cars196
Method Name: _generate_examples