all_workers = (self.world_size * num_workers_per_node)
offset = self.rank * num_workers_per_node + node_worker_id
self.vectorizer.mxlen = self.nctx
read_file_order = list(range(offset, len(files), all_workers))
// If we have multiple files per worker, possibly shuffle the file read order
if self.shuffle:
read_file_order = np.random.permutation(read_file_order)
for file_idx in read_file_order:
file = files[file_idx]
with open(file) as rf:
lines = rf.readlines()
if self.shuffle:
random.shuffle(lines)
for l in lines:
response = self.process_line(l)
yield response