# Scan every input file once, counting words, characters and each extended
# feature, while tracking the longest token and the longest sentence seen.
# Input is read as one token per line with whitespace-separated columns;
# a blank line marks the end of a sentence.
keys = self.extended_features.keys()
# One counter per extended feature name.
vocabs = {key: Counter() for key in keys}
maxw = 0  # longest raw token (column 0), in characters
maxs = 0  # longest sentence, in tokens
for file in files:
    # Callers may pass None for a missing split; skip it.
    if file is None:
        continue
    sl = 0  # tokens seen so far in the current sentence
    with codecs.open(file, encoding="utf-8", mode="r") as f:
        for line in f:
            line = line.strip()
            if line == "":
                # Blank line: the current sentence is complete.
                maxs = max(maxs, sl)
                sl = 0
                continue
            # Raw string so that \s reaches the regex engine as a whitespace
            # class instead of being an invalid string escape.
            states = re.split(r"\s", line)
            sl += 1
            w = states[0]
            # The word vocabulary is keyed on the cleaned-up token, but the
            # length and character counts use the token as written.
            vocab_word[self.cleanup_fn(w)] += 1
            maxw = max(maxw, len(w))
            for k in w:
                vocab_ch[k] += 1
            # extended_features maps a feature name to its column index.
            for key, index in self.extended_features.items():
                vocabs[key][states[index]] += 1
    # A sentence is otherwise only counted when a blank line follows it, so
    # fold in the trailing sentence of a file that ends without one.
    maxs = max(maxs, sl)
# A positive configured limit caps the observed maximum; a non-positive one
# means "use whatever was observed".
self.max_word_length = min(maxw, self.max_word_length) if self.max_word_length > 0 else maxw
self.max_sentence_length = min(maxs, self.max_sentence_length) if self.max_sentence_length > 0 else maxs
print("Max sentence length %d" % self.max_sentence_length)
print("Max word length %d" % self.max_word_length)