7d3c51ba6059fc7ccadbd44c9c9961fb6700ef88,nltk/tokenize/treebank.py,TreebankWordTokenizer,TreebankWordTokenizer_1,#,49
Before Change
]
# List of contractions adapted from Robert MacIntyre's tokenizer.
# Compiled, case-insensitive patterns for multi-part contractions.  Each
# pattern captures the pieces of the contraction as separate groups.
# The apostrophes inside the patterns must be plain single quotes: a double
# quote there would terminate the raw string literal early.

# Two-group patterns, e.g. "cannot" -> ("can", "not").
CONTRACTIONS2 = [
    re.compile(r"(?i)\b(can)(not)\b"),
    re.compile(r"(?i)\b(d)('ye)\b"),
    re.compile(r"(?i)\b(gim)(me)\b"),
    re.compile(r"(?i)\b(gon)(na)\b"),
    re.compile(r"(?i)\b(got)(ta)\b"),
    re.compile(r"(?i)\b(lem)(me)\b"),
    re.compile(r"(?i)\b(mor)('n)\b"),
    # NOTE: unlike its siblings this pattern ends with a literal space rather
    # than \b, so "wanna" only matches when a space follows it.
    re.compile(r"(?i)\b(wan)(na) "),
]

# Two-group patterns that require a leading space, e.g. " 'tis" -> ("'t", "is").
CONTRACTIONS3 = [
    re.compile(r"(?i) ('t)(is)\b"),
    re.compile(r"(?i) ('t)(was)\b"),
]

# Three-group patterns, e.g. "whatcha" -> ("wha", "t", "cha").
CONTRACTIONS4 = [
    re.compile(r"(?i)\b(whad)(dd)(ya)\b"),
    re.compile(r"(?i)\b(wha)(t)(cha)\b"),
]
After Change
# List of contractions adapted from Robert MacIntyre's tokenizer.
# Build the compiled contraction tables from a MacIntyreContractions
# instance: every entry of its CONTRACTIONS2 / CONTRACTIONS3 sequences is
# passed through re.compile and the results are stored as lists.
_contractions = MacIntyreContractions()
CONTRACTIONS2 = list(
    re.compile(raw_pattern) for raw_pattern in _contractions.CONTRACTIONS2
)
CONTRACTIONS3 = list(
    re.compile(raw_pattern) for raw_pattern in _contractions.CONTRACTIONS3
)
def tokenize(self, text, return_str=False):
for regexp, substitution in self.STARTING_QUOTES:
text = regexp.sub(substitution, text)
In pattern: SUPERPATTERN
Frequency: 3
Non-data size: 4
Instances Project Name: nltk/nltk
Commit Name: 7d3c51ba6059fc7ccadbd44c9c9961fb6700ef88
Time: 2017-05-03
Author: alvations@gmail.com
File Name: nltk/tokenize/treebank.py
Class Name: TreebankWordTokenizer
Method Name: TreebankWordTokenizer_1
Project Name: matplotlib/matplotlib
Commit Name: 0b92b4f5530fee68432f13075a1ddc866748f9d1
Time: 2020-11-20
Author: anntzer.lee@gmail.com
File Name: examples/showcase/firefox.py
Class Name:
Method Name: svg_parse
Project Name: mozilla/bugbug
Commit Name: ef65ed0d5cba305de3f3761e929719a0515ce6f1
Time: 2019-01-23
Author: mcastelluccio@mozilla.com
File Name: bugbug/repository.py
Class Name:
Method Name: download_commits