# Load the archive's tab-separated metadata table, keep only English-language
# rows, normalize the "Year" and "Author" fields, and store the cleaned rows
# (with lower-cased keys) on ``self.metadata``.

# Captures a run of non-digit characters (the author's name) that is followed
# by a date-like suffix: "YYYY-YYYY" (first year optionally followed by "?" or
# " or N", second optionally preceded by "ca. "), "b./d./f./l. [ca.] YYYY[?]",
# "-YYYY", "YYYY or N", or "NNth cent.", with an optional trailing period.
re_extract_authors = re.compile(
    r"([^\d]+)(?:\d{4}(?:\?| or \d{1,2})?-(?:ca\. )?\d{4}|[bdfl]\.(?: ca\.)? \d{4}\??|-\d{4}|\d{4} or \d{1,2}|\d{2}th cent\.)\.?"
)
# Strips leading commas/semicolons/spaces and trailing commas/periods
# (plus any whitespace after them) from an extracted author name.
re_clean_authors = re.compile(r"^[,; ]+|[,.]+\s*?$")

metadata = []
with ZipFile(self._filename, mode="r") as f:
    # the member is read fully into memory and decoded, so the csv reader
    # iterates over text rather than over the zip's byte stream
    subf = io.StringIO(f.read("ota-master/metadata.tsv").decode("utf-8"))
    for row in csv.DictReader(subf, delimiter="\t"):
        # only include English-language works (99.9% of all works)
        if not row["Language"].startswith("English"):
            continue
        # clean up years: keep just the matched text, or None if no match
        # NOTE(review): ``re_extract_year`` is not defined in this section;
        # it is expected to be compiled earlier in the enclosing scope
        year_match = re_extract_year.search(row["Year"])
        row["Year"] = year_match.group() if year_match else None
        # extract and clean up authors; if the pattern finds nothing, fall
        # back to the raw field as a single author
        authors = re_extract_authors.findall(row["Author"]) or [row["Author"]]
        row["Author"] = [re_clean_authors.sub("", author) for author in authors]
        # get rid of uniform "Language" and "License" fields
        del row["Language"]
        del row["License"]
        # store each record with lower-cased field names
        metadata.append({key.lower(): val for key, val in row.items()})
self.metadata = metadata