try:
event_name = event_["eventName"]
// only process these two event types
if event_name not in [OBJECT_DELETE, OBJECT_PUT]:
continue
bucket = unquote(event_["s3"]["bucket"]["name"])
// In the grand tradition of IE6, S3 events turn spaces into "+"
key = unquote_plus(event_["s3"]["object"]["key"])
version_id = event_["s3"]["object"].get("versionId")
version_id = unquote(version_id) if version_id else None
// OBJECT_DELETE does not include "eTag"
etag = unquote(event_["s3"]["object"].get("eTag", ""))
// Get two levels of extensions to handle files like .csv.gz
path = pathlib.PurePosixPath(key)
ext1 = path.suffix
ext2 = path.with_suffix("").suffix
ext = (ext2 + ext1).lower()
// Handle delete first and then continue so that
// head_object and get_object (below) don't fail
if event_name == OBJECT_DELETE:
batch_processor.append(
event_name,
bucket=bucket,
ext=ext,
etag=etag,
key=key,
last_modified=now_like_boto3(),
text="",
version_id=version_id
)
continue
try:
head = retry_s3(
"head",
bucket,
key,
s3_client=s3_client,
version_id=version_id,
etag=etag
)
except botocore.exceptions.ClientError as exception:
// "null" version sometimes results in 403s for buckets
// that have changed versioning, retry without it
if (exception.response.get("Error", {}).get("Code") == "403"
and version_id == "null"):
head = retry_s3(
"head",
bucket,
key,
s3_client=s3_client,
version_id=None,
etag=etag
)
else:
raise exception
size = head["ContentLength"]
last_modified = head["LastModified"]
meta = head["Metadata"]
try:
text = get_contents(
bucket,
key,
ext,
etag=etag,
version_id=version_id,
s3_client=s3_client,
size=size
)
// we still want an entry for this document in elastic so that, e.g.,
// the file counts from elastic are correct. re-raise below.
except Exception as exc: // pylint: disable=broad-except
text = ""
content_exception = exc
print("Content extraction failed", exc, bucket, key, etag, version_id)
// decode Quilt-specific metadata
if meta and "helium" in meta:
try:
decoded_helium = json.loads(meta["helium"])
meta["helium"] = decoded_helium or {}
except (KeyError, json.JSONDecodeError):
print("Unable to parse Quilt "helium" metadata", meta)
batch_processor.append(
event_name,
bucket=bucket,
key=key,
After Change
try:
event_name = event_["eventName"]
// Process all Create:* and Remove:* events
if not any(event_name.startswith(n) for n in EVENT_PREFIX.values()):
continue
bucket = unquote(event_["s3"]["bucket"]["name"])
// In the grand tradition of IE6, S3 events turn spaces into "+"
key = unquote_plus(event_["s3"]["object"]["key"])
version_id = event_["s3"]["object"].get("versionId")