Tests that the reader only returns half of the expected data consistently
with reader_factory(synthetic_dataset.url, cur_shard=0, shard_count=5) as reader:
with reader_factory(synthetic_dataset.url, cur_shard=0, shard_count=5) as reader_2:
results_1 = []
expected = []
for row in reader:
actual = dict(row._asdict())
results_1.append(actual)
expected.append(next(d for d in synthetic_dataset.data if d["id"] == actual["id"]))
assert results_1, "Shard 0 is expected to have at least one row"
results_2 = [dict(row._asdict()) for row in reader_2]
assert results_2, "Shard 0 is expected to have at least one row"
// Since order is non deterministic, we need to sort results by id
results_1.sort(key=lambda x: x["id"])
results_2.sort(key=lambda x: x["id"])
expected.sort(key=lambda x: x["id"])
np.testing.assert_equal(expected, results_1)
np.testing.assert_equal(results_1, results_2)
assert len(results_1) < len(synthetic_dataset.data)
// Test that separate partitions also have no overlap by checking ids
id_set = set([item["id"] for item in results_1])
for partition in range(1, 5):
with reader_factory(synthetic_dataset.url, cur_shard=partition,
shard_count=5) as reader_other:
After Change
Tests that the reader only returns half of the expected data consistently
with reader_factory(synthetic_dataset.url, cur_shard=0, shard_count=5) as reader:
with reader_factory(synthetic_dataset.url, cur_shard=0, shard_count=5) as reader_2:
results_1 = set(_readout_all_ids(reader))
results_2 = set(_readout_all_ids(reader_2))
assert results_1, "Non empty shard expected"