# Make a copy of inverted_features and update the target transform to be
# identity or one hot depending on the schema.
inverted_features_target = copy.deepcopy(inverted_features)
for name, transform_set in six.iteritems(inverted_features_target):
  if transform_set == {constant.TARGET_TRANSFORM}:
    # Look up the schema type of the target column by name.
    # NOTE(review): next() raises StopIteration if the target name is missing
    # from the schema — presumably validated upstream; confirm against caller.
    target_schema = next(col["type"].lower()
                         for col in schema if col["name"] == name)
    if target_schema in constant.NUMERIC_SCHEMA:
      inverted_features_target[name] = {constant.IDENTITY_TRANSFORM}
    else:
      inverted_features_target[name] = {constant.ONE_HOT_TRANSFORM}
# Initialize the per-column result accumulators.
def _init_numerical_results():
  """Return a fresh stats accumulator for one numerical column."""
  return {"min": float("inf"),
          "max": float("-inf"),
          "count": 0,
          "sum": 0.0}

# Lazily creates one stats dict per numerical column on first access.
numerical_results = collections.defaultdict(_init_numerical_results)
# vocabs[column][token] -> occurrence count, defaulting to 0.
vocabs = collections.defaultdict(lambda: collections.defaultdict(int))
num_examples = 0
# For each file, update the numerical stats from that file, and update the set
# of unique labels.
# NOTE(review): this loop body continues beyond the end of this chunk; only the
# leading portion is documented here.
for input_file in input_files:
  with file_io.FileIO(input_file, "r") as f:
    for line in csv.reader(f):
      # Every csv row must have exactly one value per schema column.
      if len(header) != len(line):
        raise ValueError("Schema has %d columns but a csv line only has %d columns." %
                         (len(header), len(line)))
      parsed_line = dict(zip(header, line))
      num_examples += 1
      for col_name, transform_set in six.iteritems(inverted_features_target):
        # All transforms in transform_set require the same analysis. So look
        # at the first transform.
        transform_name = next(iter(transform_set))
        if transform_name in constant.TEXT_TRANSFORMS:
          # Text columns are tokenized on single spaces — assumes cells hold
          # space-delimited tokens; TODO confirm upstream tokenization.
          split_strings = parsed_line[col_name].split(" ")
          # If a label is in the row N times, increase its vocab count by 1.