////////////////////////////
# Scan the selected corpus files once, tallying the total word count, the set
# of distinct words, and how often each tag occurs.
wc = 0
tag_counts = FreqDist()
taglen = 7  # tag column width; grows to fit the longest tag seen
word_set = set()

# For conll2000 and switchboard the simplifier is applied per word inside the
# loop below instead of being requested through the reader's keyword argument.
if simplify_wsj_tag and args.simplify_tags and args.corpus not in ["conll2000", "switchboard"]:
    kwargs = {"simplify_tags": True}
else:
    kwargs = {}

for word, tag in tagged_corpus.tagged_words(fileids=args.fileids, **kwargs):
    # loading corpora/treebank/tagged with ChunkedCorpusReader produces None tags;
    # convert them to text first so that len() and the simplifier below never
    # receive a non-string value
    if not isinstance(tag, basestring):
        tag = str(tag)

    if len(tag) > taglen:
        taglen = len(tag)

    if args.corpus in ["conll2000", "switchboard"] and simplify_wsj_tag and args.simplify_tags:
        tag = simplify_wsj_tag(tag)

    wc += 1
    # FreqDist.inc() is gone from NLTK 3; item assignment is the supported way
    # to bump a sample's count
    tag_counts[tag] += 1
    word_set.add(word)
////////////////////////
//// output ////
After Change
////////////////////////////
# Single pass over the selected corpus files: collect the total word count,
# the distinct words, and a per-tag frequency table.
word_set = set()
tag_counts = collections.defaultdict(int)
wc = 0
taglen = 7  # tag column width; widened to the longest raw tag encountered

# Corpora whose tags are simplified word by word in the loop rather than by
# passing simplify_tags to the corpus reader.
manual_simplify_corpora = ["conll2000", "switchboard"]
simplifying = simplify_wsj_tag and args.simplify_tags

# Keyword arguments forwarded to tagged_words(): either ask the reader to
# simplify, or (when no simplifier is available) pass the requested tagset.
kwargs = {}
if simplifying and args.corpus not in manual_simplify_corpora:
    kwargs["simplify_tags"] = True
elif not simplify_wsj_tag and args.tagset:
    kwargs["tagset"] = args.tagset

simplify_each_tag = simplifying and args.corpus in manual_simplify_corpora

for word, tag in tagged_corpus.tagged_words(fileids=args.fileids, **kwargs):
    # words with an empty or missing tag are left out of every tally
    if tag:
        # width is measured on the tag as read, before any simplification
        taglen = max(taglen, len(tag))

        if simplify_each_tag:
            tag = simplify_wsj_tag(tag)

        wc += 1
        # loading corpora/treebank/tagged with ChunkedCorpusReader produces None tags
        if not isinstance(tag, basestring):
            tag = str(tag)
        tag_counts[tag] += 1
        word_set.add(word)
////////////////////////
//// output ////
////////////////////////
# Summary of the totals gathered while scanning the corpus.
print("%d total words\n%d unique words\n%d tags\n" % (wc, len(word_set), len(tag_counts)))

# Choose which field of each (tag, count) pair orders the table rows.
if args.sort == "tag":
    sort_key = lambda tc: tc[0]
elif args.sort == "count":
    sort_key = lambda tc: tc[1]
else:
    raise ValueError("%s is not a valid sort option" % args.sort)

sorted_tag_counts = sorted(tag_counts.items(), key=sort_key, reverse=args.reverse)

# Size the count column from the largest count rather than from the first row:
# the first row holds the largest count only for a descending sort by count,
# and there is no first row at all when nothing was tagged.
widest_count = len(str(max(tag_counts.values()))) if tag_counts else 0
countlen = max(widest_count + 2, 9)

# simple reSt table format
print(" ".join(["Tag".center(taglen), "Count".center(countlen)]))
print(" ".join(["=" * taglen, "=" * countlen]))