print("And here is the number of missing (nan, None, etc.) values for this column:")
print(bad_rows.shape[0])
print("We will remove these values, and continue with training on the cleaned dataset")
X_df = X_df.dropna(subset=[self.output_column])
// Remove the output column from the dataset, and store it in the y variable
y = list(X_df.pop(self.output_column))
print("removed the output column")
// If this is a classifier, try to turn all the y values into proper ints
// Some classifiers play more nicely if you give them category labels as ints rather than strings, so we'll make our jobs easier here if we can.
if self.type_of_estimator == "classifier":
// The entire column must be turned into ints. If any value fails, don't convert anything in the column to ints
try:
y_ints = []
for val in y:
y_ints.append(int(val))
y = y_ints
except:
pass
else:
// If this is a regressor, turn all the values into floats if possible, and remove any row whose value cannot be turned into a float
indices_to_delete = []
y_floats = []
bad_vals = []
for idx, val in enumerate(y):
try:
float_val = utils.clean_val(val)
y_floats.append(float_val)
except ValueError as err:
indices_to_delete.append(idx)
bad_vals.append(val)
y = y_floats
// Even more verbose logging here since these values are not just missing, they're strings for a regression problem
if len(indices_to_delete) > 0:
print("The y values given included some bad values that the machine learning algorithms will not be able to train on.")
print("The rows at these indices have been deleted because their y value could not be turned into a float:")
print(indices_to_delete)
print("These were the bad values")
print(bad_vals)
// indices_to_delete = set(indices_to_delete)
X_df = X_df.drop(X_df.index(indices_to_delete))
// X_df = [row for idx, row in enumerate(X_df) if idx not in indices_to_delete]
return X_df, y