875a33806acea37f602d0ad20fb77cd42432bbb6,scripts/tokenize/tokenize_pad.py,,,#,6

Before Change



from lingpy import *

rcParams["debug"]=True

wl = Wordlist("pad_data_qlc.qlc")
wl.tokenize("pad_orthography_profile")

After Change


infile = codecs.open("pad_data_qlc.qlc", "r", "utf-8")
header = infile.readline() // skip pad header

t = Tokenizer("pad_orthography_profile")

print()
print("ID"+"\t"+"ORIGINAL"+"\t"+"RULES")
for line in infile:
    line = line.strip()
    tokens = line.split("\t")
    id = tokens[0]
    counterpart = tokens[2]
    grapheme_clusters =t.grapheme_clusters(counterpart)
    rules = t.rules(grapheme_clusters)
    print(id+"\t"+counterpart+"\t"+rules)



// this tokenize does not work because of the way the orthography rules are currently written, i.e. 
Italian Trulli
In pattern: SUPERPATTERN

Frequency: 3

Non-data size: 6

Instances


Project Name: lingpy/lingpy
Commit Name: 875a33806acea37f602d0ad20fb77cd42432bbb6
Time: 2013-11-08
Author: bambooforest@gmail.com
File Name: scripts/tokenize/tokenize_pad.py
Class Name:
Method Name:


Project Name: CyberZHG/keras-bert
Commit Name: 2df9a66ac40d1017792b1c93f34e47e214726d7d
Time: 2019-03-18
Author: CyberZHG@gmail.com
File Name: demo/load_model/load_and_pool.py
Class Name:
Method Name:


Project Name: CyberZHG/keras-bert
Commit Name: 2df9a66ac40d1017792b1c93f34e47e214726d7d
Time: 2019-03-18
Author: CyberZHG@gmail.com
File Name: demo/load_model/load_and_extract.py
Class Name:
Method Name: