from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
SRC = Field(tokenize = "spacy",
            tokenizer_language="de",
            init_token = "<sos>",
            eos_token = "<eos>",
            lower = True)

TRG = Field(tokenize = "spacy",
            tokenizer_language="en",
            init_token = "<sos>",
            eos_token = "<eos>",
            lower = True)

train_data, valid_data, test_data = Multi30k.splits(exts = (".de", ".en"),
                                                    fields = (SRC, TRG))
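# A quick sanity check on the loaded splits. With a plain ``fields`` tuple as
# above, the examples carry the default attribute names ``src`` and ``trg``;
# the exact tokens printed depend on the downloaded data.
print(f"Number of training examples: {len(train_data.examples)}")
print(vars(train_data.examples[0]))  # e.g. {'src': [...de tokens...], 'trg': [...en tokens...]}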

# Now that we've defined ``train_data``, we can see an extremely useful
# feature of ``torchtext``'s ``Field``: the ``build_vocab`` method
# now allows us to create the vocabulary associated with each language.
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)

# Once these lines of code have been run, ``SRC.vocab.stoi`` will be a
# dictionary mapping each token in the vocabulary to its index, and
# ``SRC.vocab.itos`` will be the inverse mapping from indices back to tokens.
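# A quick illustration of the two lookup tables (the exact indices depend on
# the training data, so the printed values are not guaranteed):
print(SRC.vocab.stoi["<sos>"])  # token -> index
print(SRC.vocab.itos[0])        # index -> token (index 0 is typically "<unk>")
print(len(SRC.vocab))           # total vocabulary size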
After Change
import io
from torchtext.utils import download_from_url, extract_archive
from torchtext.data.utils import get_tokenizer
url_base = "https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/"
train_urls = ("train.de.gz", "train.en.gz")
val_urls = ("val.de.gz", "val.en.gz")
test_urls = ("test_2016_flickr.de.gz", "test_2016_flickr.en.gz")
train_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in train_urls]
val_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in val_urls]
test_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in test_urls]
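# Note: the spaCy tokenizers below assume the corresponding language models
# are installed, e.g. via ``python -m spacy download de`` and
# ``python -m spacy download en``.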
de_tokenizer = get_tokenizer("spacy", language="de")
en_tokenizer = get_tokenizer("spacy", language="en")
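# A minimal sketch of how the downloaded files and tokenizers can be combined
# to rebuild the vocabularies without ``Field``. The ``build_vocab`` helper
# below is our own, not part of torchtext; it assumes the legacy
# ``torchtext.vocab.Vocab(counter, specials=...)`` constructor and that each
# raw file holds one sentence per line.
from collections import Counter
from torchtext.vocab import Vocab

def build_vocab(filepath, tokenizer):
    # Count token frequencies over the raw training file.
    counter = Counter()
    with io.open(filepath, encoding="utf8") as f:
        for line in f:
            counter.update(tokenizer(line.rstrip("\n")))
    # Reserve the same special tokens that ``Field`` added for us before.
    return Vocab(counter, specials=["<unk>", "<pad>", "<sos>", "<eos>"])

de_vocab = build_vocab(train_filepaths[0], de_tokenizer)
en_vocab = build_vocab(train_filepaths[1], en_tokenizer)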