33851dea0f6c75ca1e685037393ea7160506b53a,auto_ml/predictor.py,Predictor,_prepare_for_training,#Predictor#Any#,207

Before Change


            for idx, val in enumerate(y):
                try:
                    float_val = float(val)
                    if pd.notnull(float_val):
                        y_floats.append(float_val)
                    else:
                        indices_to_delete.append(idx)
                        bad_vals.append(val)
                except:
                    indices_to_delete.append(idx)
                    bad_vals.append(val)

After Change


            print("And here is the number of missing (nan, None, etc.) values for this column:")
            print(bad_rows.shape[0])
            print("We will remove these values, and continue with training on the cleaned dataset")
        X_df = X_df.dropna(subset=[self.output_column])


        // Remove the output column from the dataset, and store it into the y variable
        y = list(X_df.pop(self.output_column))

        print("removed the output column")

        // If this is a classifier, try to turn all the y values into proper ints
        // Some classifiers play more nicely if you give them category labels as ints rather than strings, so we'll make our jobs easier here if we can.
        if self.type_of_estimator == "classifier":
            // The entire column must be turned into ints. If any value fails, don't convert anything in the column to ints
            try:
                y_ints = []
                for val in y:
                    y_ints.append(int(val))
                y = y_ints
            except:
                pass
        else:
            // If this is a regressor, turn all the values into floats if possible, and remove this row if they cannot be turned into floats
            indices_to_delete = []
            y_floats = []
            bad_vals = []
            for idx, val in enumerate(y):
                try:
                    float_val = utils.clean_val(val)
                    y_floats.append(float_val)
                except ValueError as err:
                    indices_to_delete.append(idx)
                    bad_vals.append(val)

            y = y_floats

            // Even more verbose logging here since these values are not just missing, they're strings for a regression problem
            if len(indices_to_delete) > 0:
                print("The y values given included some bad values that the machine learning algorithms will not be able to train on.")
                print("The rows at these indices have been deleted because their y value could not be turned into a float:")
                print(indices_to_delete)
                print("These were the bad values")
                print(bad_vals)
                // indices_to_delete = set(indices_to_delete)
                X_df = X_df.drop(X_df.index(indices_to_delete))
                // X_df = [row for idx, row in enumerate(X_df) if idx not in indices_to_delete]

        return X_df, y
Italian Trulli
In pattern: SUPERPATTERN

Frequency: 3

Non-data size: 4

Instances


Project Name: ClimbsRocks/auto_ml
Commit Name: 33851dea0f6c75ca1e685037393ea7160506b53a
Time: 2016-10-08
Author: climbsbytes@gmail.com
File Name: auto_ml/predictor.py
Class Name: Predictor
Method Name: _prepare_for_training


Project Name: catalyst-cooperative/pudl
Commit Name: d9187309769bd34e34294003cef5290e512c6fbc
Time: 2020-07-31
Author: zane.selvans@catalyst.coop
File Name: src/pudl/transform/eia861.py
Class Name:
Method Name: _harvest_associations


Project Name: bashtage/linearmodels
Commit Name: a0be73979eded2a2cea2fd526ea5db87b1abf3c5
Time: 2020-01-20
Author: kevin.k.sheppard@gmail.com
File Name: linearmodels/system/results.py
Class Name: SystemResults
Method Name: _out_of_sample