bc128d9596ed07d1c8d5d98f35b1f6905ad4d819,analyze_tagged_corpus.py,,,#,12

Before Change


##############

# Running totals for the summary: token count, per-tag frequencies, the width
# of the widest tag seen (never narrower than 7) and the set of distinct words.
wc = 0
tag_counts = FreqDist()
taglen = 7
word_set = set()

# conll2000 and switchboard are excluded from the keyword form here; their
# tags are simplified one at a time inside the loop below instead.
if simplify_wsj_tag and args.simplify_tags and args.corpus not in ["conll2000", "switchboard"]:
	kwargs = {"simplify_tags": True}
else:
	kwargs = {}

for word, tag in tagged_corpus.tagged_words(fileids=args.fileids, **kwargs):
	# loading corpora/treebank/tagged with ChunkedCorpusReader produces None tags
	# Normalise before anything else touches the tag: len() below would raise
	# TypeError on None.
	if not isinstance(tag, basestring): tag = str(tag)
	
	if len(tag) > taglen:
		taglen = len(tag)
	
	if args.corpus in ["conll2000", "switchboard"] and simplify_wsj_tag and args.simplify_tags:
		tag = simplify_wsj_tag(tag)
	
	wc += 1
	tag_counts.inc(tag)
	word_set.add(word)

############
## output ##

After Change


##############

# Running totals for the summary: token count, per-tag frequencies, the width
# of the widest tag seen (never narrower than 7) and the set of distinct words.
wc = 0
tag_counts = collections.defaultdict(int)
taglen = 7
word_set = set()

# Pick the keyword arguments for tagged_words():
#  - simplify_tags when simplify_wsj_tag is available and simplification was
#    requested (conll2000 and switchboard are simplified per tag in the loop);
#  - tagset when simplify_wsj_tag is unavailable and a tagset was requested;
#  - nothing otherwise.
if simplify_wsj_tag and args.simplify_tags and args.corpus not in ["conll2000", "switchboard"]:
	kwargs = {"simplify_tags": True}
elif not simplify_wsj_tag and args.tagset:
	kwargs = {"tagset": args.tagset}
else:
	kwargs = {}

for word, tag in tagged_corpus.tagged_words(fileids=args.fileids, **kwargs):
	# loading corpora/treebank/tagged with ChunkedCorpusReader produces None tags
	# Untagged (falsy) tokens are skipped entirely: not counted as words or tags.
	if not tag:
		continue
	
	if args.corpus in ["conll2000", "switchboard"] and simplify_wsj_tag and args.simplify_tags:
		tag = simplify_wsj_tag(tag)
	
	if not isinstance(tag, basestring): tag = str(tag)
	
	# Measure the tag in its final form (simplified, and a string): that is
	# the key stored in tag_counts, and len() is only safe once it is a string.
	if len(tag) > taglen:
		taglen = len(tag)
	
	wc += 1
	tag_counts[tag] += 1
	word_set.add(word)

############
## output ##
############

print("%d total words\n%d unique words\n%d tags\n" % (wc, len(word_set), len(tag_counts)))

# Each item being sorted is a (tag, count) pair.
if args.sort == "tag":
	sort_key = lambda tc: tc[0]
elif args.sort == "count":
	sort_key = lambda tc: tc[1]
else:
	raise ValueError("%s is not a valid sort option" % args.sort)

sorted_tag_counts = sorted(tag_counts.items(), key=sort_key, reverse=args.reverse)
# Size the count column from the largest count, not from whichever row sorts
# first: the first row holds the maximum only when sorting by count descending.
# An empty tag_counts falls back to 0 instead of indexing into an empty list.
widest_count = max(tag_counts.values()) if tag_counts else 0
countlen = max(len(str(widest_count)) + 2, 9)
# simple reSt table format
print("  ".join(["Tag".center(taglen), "Count".center(countlen)]))
print("  ".join(["="*taglen, "="*(countlen)]))
Italian Trulli
In pattern: SUPERPATTERN

Frequency: 3

Non-data size: 7

Instances


Project Name: japerk/nltk-trainer
Commit Name: bc128d9596ed07d1c8d5d98f35b1f6905ad4d819
Time: 2014-01-05
Author: japerk@gmail.com
File Name: analyze_tagged_corpus.py
Class Name:
Method Name:


Project Name: japerk/nltk-trainer
Commit Name: 46248d91fbec1af87b58502c7169d3d21ef47376
Time: 2014-04-21
Author: japerk@gmail.com
File Name: nltk_trainer/classification/scoring.py
Class Name:
Method Name: sum_category_word_scores


Project Name: japerk/nltk-trainer
Commit Name: 2ca3b0d5a88d414a87c343981b80ed1204b8dd8d
Time: 2014-01-05
Author: japerk@gmail.com
File Name: analyze_chunked_corpus.py
Class Name:
Method Name: