##############
# Tally word and tag statistics over the selected corpus files.
#   wc         -- number of (word, tag) pairs seen
#   tag_counts -- how many times each tag occurred
#   taglen     -- length of the longest tag seen (starts at 7, only grows)
#   word_set   -- the distinct words seen
wc = 0
tag_counts = FreqDist()
taglen = 7
word_set = set()

# conll2000 and switchboard tags are simplified one at a time inside the loop
# below; for every other corpus the simplify_tags keyword is passed to
# tagged_words() instead.
if simplify_wsj_tag and args.simplify_tags and args.corpus not in ["conll2000", "switchboard"]:
    kwargs = {"simplify_tags": True}
else:
    kwargs = {}

for word, tag in tagged_corpus.tagged_words(fileids=args.fileids, **kwargs):
    # loading corpora/treebank/tagged with ChunkedCorpusReader produces None tags.
    # Coerce them to text before anything else touches the tag: len(None)
    # raises TypeError, so the width check below must only ever see a string.
    if not isinstance(tag, basestring):
        tag = str(tag)

    if len(tag) > taglen:
        taglen = len(tag)

    if args.corpus in ["conll2000", "switchboard"] and simplify_wsj_tag and args.simplify_tags:
        tag = simplify_wsj_tag(tag)

    wc += 1

    # Coerce again in case the simplified tag is not a string either, so that
    # tag_counts is always keyed by text.
    if not isinstance(tag, basestring):
        tag = str(tag)

    tag_counts.inc(tag)
    word_set.add(word)
############
## output ##
After Change
##############
# Single pass over the tagged corpus, accumulating:
#   wc         -- number of (word, tag) pairs that carried a tag
#   tag_counts -- occurrences of each tag
#   taglen     -- length of the longest tag seen (starts at 7, only grows)
#   word_set   -- the distinct words among the counted pairs
wc = 0
tag_counts = collections.defaultdict(int)
taglen = 7
word_set = set()

# Pick the keyword arguments for tagged_words():
#   * no simplify_wsj_tag available -> pass the requested tagset, if any
#   * simplification requested for a corpus other than conll2000/switchboard
#     -> pass simplify_tags=True
#   * otherwise -> no extra keywords (conll2000/switchboard tags are
#     simplified one by one inside the loop)
if not simplify_wsj_tag:
    kwargs = {"tagset": args.tagset} if args.tagset else {}
elif args.simplify_tags and args.corpus not in ["conll2000", "switchboard"]:
    kwargs = {"simplify_tags": True}
else:
    kwargs = {}

for word, tag in tagged_corpus.tagged_words(fileids=args.fileids, **kwargs):
    # Pairs with a missing/empty tag are ignored entirely: they are not
    # counted in wc and their word is not added to word_set.
    if tag:
        # Width is taken from the tag as read, before any simplification.
        taglen = max(taglen, len(tag))

        if args.corpus in ["conll2000", "switchboard"] and simplify_wsj_tag and args.simplify_tags:
            tag = simplify_wsj_tag(tag)

        wc += 1

        # loading corpora/treebank/tagged with ChunkedCorpusReader produces
        # None tags; those are skipped above, and any other non-string tag is
        # converted to text here so tag_counts is keyed by strings.
        if not isinstance(tag, basestring):
            tag = str(tag)

        tag_counts[tag] += 1
        word_set.add(word)
############
## output ##