bc128d9596ed07d1c8d5d98f35b1f6905ad4d819,analyze_tagged_corpus.py,,,#,12

Before Change


////////////////////////////

# Single pass over the tagged corpus, accumulating:
#   wc         - number of (word, tag) pairs processed
#   tag_counts - occurrences of each tag
#   taglen     - length of the longest tag seen (starts at 7 and only grows;
#                presumably a minimum column width for later output - confirm)
#   word_set   - distinct words
wc = 0
tag_counts = FreqDist()
taglen = 7
word_set = set()

# When a simplifier is available and simplification was requested, pass
# simplify_tags=True to tagged_words(). conll2000 and switchboard are
# excluded here; their tags are simplified one at a time inside the loop.
if simplify_wsj_tag and args.simplify_tags and args.corpus not in ["conll2000", "switchboard"]:
	kwargs = {"simplify_tags": True}
else:
	kwargs = {}

for word, tag in tagged_corpus.tagged_words(fileids=args.fileids, **kwargs):
	# NOTE(review): len(tag) runs before the non-string guard further down,
	# so a None tag would raise TypeError here.
	# The length is measured on the tag as read, before any per-tag
	# simplification below.
	if len(tag) > taglen:
		taglen = len(tag)
	
	# Per-tag simplification for the two corpora excluded from the kwargs
	# path above.
	if args.corpus in ["conll2000", "switchboard"] and simplify_wsj_tag and args.simplify_tags:
		tag = simplify_wsj_tag(tag)
	
	wc += 1
	# NOTE(review): the next line starts with `//`, which is not Python
	# comment syntax; it looks like an extraction artifact for `#`.
	// loading corpora/treebank/tagged with ChunkedCorpusReader produces None tags
	# Coerce non-string tags to str so they can be counted. `basestring`
	# exists only in Python 2.
	if not isinstance(tag, basestring): tag = str(tag)
	tag_counts.inc(tag)
	word_set.add(word)

////////////////////////
//// output ////

After Change


////////////////////////////

# Single pass over the tagged corpus, accumulating:
#   wc         - number of (word, tag) pairs counted (falsy tags excluded)
#   tag_counts - occurrences of each tag; defaultdict(int) starts every
#                unseen tag at 0
#   taglen     - length of the longest tag seen (starts at 7 and only grows;
#                presumably a minimum column width for later output - confirm)
#   word_set   - distinct words that carried a non-falsy tag
wc = 0
tag_counts = collections.defaultdict(int)
taglen = 7
word_set = set()

# Choose the keyword arguments for tagged_words():
#   - simplifier available and simplification requested (except for
#     conll2000/switchboard, handled per tag in the loop): simplify_tags=True
#   - no simplifier available but a tagset was given: tagset=args.tagset
#   - otherwise: no extra arguments
if simplify_wsj_tag and args.simplify_tags and args.corpus not in ["conll2000", "switchboard"]:
	kwargs = {"simplify_tags": True}
elif not simplify_wsj_tag and args.tagset:
	kwargs = {"tagset": args.tagset}
else:
	kwargs = {}

for word, tag in tagged_corpus.tagged_words(fileids=args.fileids, **kwargs):
	# Skip falsy tags (None or empty) before anything else, so they never
	# reach len() and are left out of wc, tag_counts and word_set.
	if not tag:
		continue
	
	# The length is measured on the tag as read, before any per-tag
	# simplification below.
	if len(tag) > taglen:
		taglen = len(tag)
	
	# Per-tag simplification for the two corpora excluded from the kwargs
	# path above.
	if args.corpus in ["conll2000", "switchboard"] and simplify_wsj_tag and args.simplify_tags:
		tag = simplify_wsj_tag(tag)
	
	wc += 1
	# NOTE(review): the next line starts with `//`, which is not Python
	# comment syntax; it looks like an extraction artifact for `#`.
	// loading corpora/treebank/tagged with ChunkedCorpusReader produces None tags
	# Falsy tags were skipped above, so this guard only affects truthy
	# non-string tags. `basestring` exists only in Python 2.
	if not isinstance(tag, basestring): tag = str(tag)
	tag_counts[tag] += 1
	word_set.add(word)

////////////////////////
//// output ////
Italian Trulli
In pattern: SUPERPATTERN

Frequency: 3

Non-data size: 5

Instances


Project Name: japerk/nltk-trainer
Commit Name: bc128d9596ed07d1c8d5d98f35b1f6905ad4d819
Time: 2014-01-05
Author: japerk@gmail.com
File Name: analyze_tagged_corpus.py
Class Name:
Method Name:


Project Name: japerk/nltk-trainer
Commit Name: 46248d91fbec1af87b58502c7169d3d21ef47376
Time: 2014-04-21
Author: japerk@gmail.com
File Name: nltk_trainer/classification/scoring.py
Class Name:
Method Name: sum_category_word_scores


Project Name: japerk/nltk-trainer
Commit Name: 2ca3b0d5a88d414a87c343981b80ed1204b8dd8d
Time: 2014-01-05
Author: japerk@gmail.com
File Name: analyze_chunked_corpus.py
Class Name:
Method Name:


Project Name: japerk/nltk-trainer
Commit Name: bc128d9596ed07d1c8d5d98f35b1f6905ad4d819
Time: 2014-01-05
Author: japerk@gmail.com
File Name: analyze_tagged_corpus.py
Class Name:
Method Name: