build_data.untar(dpath, fname)
# According to the author, V2 holds the latest data.
dpext = os.path.join(dpath, "insuranceQA-master/V2")

# Read the vocab file.
vocab_path = os.path.join(dpext, "vocabulary")
d_vocab = read_vocab(vocab_path)

# Read the label2answer file; the reader is also given the vocab loaded above.
label2answer_path_gz = os.path.join(dpext, "InsuranceQA.label2answer.token.encoded.gz")
d_label_answer = read_label2answer(label2answer_path_gz, d_vocab)

# TODO: the pool size (number of label candidates) is fixed at 100 for now,
# but 500, 1000 and 1500 should also be available.
pool_size = 100
# One template for all three splits so the pool size is spelled out only once.
split_name_template = "InsuranceQA.question.anslabel.token.{}.pool.solr.{}.encoded.gz"
train_path_gz = os.path.join(dpext, split_name_template.format(pool_size, "train"))
valid_path_gz = os.path.join(dpext, split_name_template.format(pool_size, "valid"))
test_path_gz = os.path.join(dpext, split_name_template.format(pool_size, "test"))

# Convert the train split using the vocab and label->answer tables.
create_fb_format(dpath, "train", train_path_gz, d_vocab, d_label_answer)
After Change
# Fetch the archive and unpack it under dpath.
build_data.download(url, dpath, fname)
build_data.untar(dpath, fname)
# Hand the same data directory to both the V1 and the V2 parser.
ParseInsuranceQAV1.build(dpath)
ParseInsuranceQAV2.build(dpath)
# Mark the data as built.
build_data.mark_done(dpath, version_string=version)