# Make a copy of inverted_features and update the target transform to be
# identity or one hot depending on the schema.
# NOTE(review): assumes `schema` is a list of dicts with "name" and "type"
# keys (see the lookup below) — confirm against the caller.
inverted_features_target = copy.deepcopy(inverted_features)
for name, transform_set in six.iteritems(inverted_features_target):
    if transform_set == {constant.TARGET_TRANSFORM}:
        # Find the schema type of the target column; raises StopIteration if
        # the column is missing from the schema.
        target_schema = next(col["type"].lower()
                             for col in schema
                             if col["name"] == name)
        if target_schema in constant.NUMERIC_SCHEMA:
            # Numeric targets (regression) pass through unchanged.
            inverted_features_target[name] = {constant.IDENTITY_TRANSFORM}
        else:
            # Categorical targets (classification) are one-hot encoded.
            inverted_features_target[name] = {constant.ONE_HOT_TRANSFORM}
// initialize the resultsdef _init_numerical_results():
return {"min": float("inf"),
"max": float("-inf"),
"count": 0,
"sum": 0.0}
# Per-column numeric stats; a fresh min/max/count/sum dict is created on
# first access of each column name.
numerical_results = collections.defaultdict(_init_numerical_results)
# Nested counters, vocabs[column][token] -> int; presumably incremented per
# token occurrence by the loop below — confirm against the rest of the file.
vocabs = collections.defaultdict(lambda: collections.defaultdict(int))
num_examples = 0// for each file, update the numerical stats from that file, and update the set// of unique labels.for input_file in input_files:
with file_io.FileIO(input_file, "r") as f:
for line in csv.reader(f):
iflen(header) != len(line):
raise ValueError("Schema has %d columns but a csv line only has %d columns." %
(len(header), len(line)))
parsed_line = dict(zip(header, line))
num_examples += 1for col_name, transform_set in six.iteritems(inverted_features_target):
// All transforms in transform_set require the same analysis. So look// at the first transform.transform_name = next(iter(transform_set))if transform_name in constant.TEXT_TRANSFORMS:
split_strings = parsed_line[col_name].split(" ")
// If a label is in the row N times, increase it"s vocab count by 1.