019f0c822cd8f2833d3b25855f2df99cd5a465a4,auto_ml/utils_model_training.py,FinalModelATC,fit,#FinalModelATC#Any#Any#,57
Before Change
if self.model_name in ["LGBMClassifier", "LGBMRegressor"]:
    X_fit = X.toarray()

try:
    # Keras models: delegate early stopping to Keras's own EarlyStopping callback on training loss
    if self.model_name[:12] == "DeepLearning":
        print("\nWe will stop training early if we have not seen an improvement in training accuracy in 25 epochs")
        from keras.callbacks import EarlyStopping
        early_stopping = EarlyStopping(monitor="loss", patience=25, verbose=1)
        self.model.fit(X_fit, y, callbacks=[early_stopping])

    # LightGBM: early stopping against a random 15% holdout split from the training data
    elif self.model_name[:4] == "LGBM":
        X_fit, X_test, y, y_test = train_test_split(X_fit, y, test_size=0.15)
        if self.type_of_estimator == "regressor":
            eval_metric = "rmse"
        elif self.type_of_estimator == "classifier":
            if len(set(y_test)) > 2:
                eval_metric = "multi_logloss"
            else:
                eval_metric = "binary_logloss"

        cat_feature_indices = self.get_categorical_feature_indices()
        self.model.fit(X_fit, y, eval_set=[(X_test, y_test)], early_stopping_rounds=50, eval_metric=eval_metric, eval_names=["random_holdout_set_from_training_data"], categorical_feature=cat_feature_indices)

    # CatBoost: needs a dense array and explicit categorical feature indices
    elif self.model_name[:8] == "CatBoost":
        X_fit = X_fit.toarray()
        if self.type_of_estimator == "classifier" and len(pd.Series(y).unique()) > 2:
            # TODO: we might have to modify the format of the y values, converting them all to ints, then back again (sklearn has a useful inverse_transform on some preprocessing classes)
            self.model.set_params(loss_function="MultiClass")
        cat_feature_indices = self.get_categorical_feature_indices()
        self.model.fit(X_fit, y, cat_features=cat_feature_indices)

    # scikit-learn GradientBoosting: hand-rolled early stopping using warm_start
    elif self.model_name[:16] == "GradientBoosting":
        if scipy.sparse.issparse(X_fit):
            X_fit = X_fit.todense()

        patience = 20
        best_val_loss = -10000000000
        num_worse_rounds = 0
        best_model = deepcopy(self.model)
        X_fit, X_test, y, y_test = train_test_split(X_fit, y, test_size=0.15)

        # Add a variable number of trees each time, depending how far into the process we are
        if os.environ.get("is_test_suite", False) == "True":
            num_iters = list(range(1, 50, 1)) + list(range(50, 100, 2)) + list(range(100, 250, 3))
        else:
            num_iters = list(range(1, 50, 1)) + list(range(50, 100, 2)) + list(range(100, 250, 3)) + list(range(250, 500, 5)) + list(range(500, 1000, 10)) + list(range(1000, 2000, 20)) + list(range(2000, 10000, 100))

        try:
            for num_iter in num_iters:
                warm_start = True
                if num_iter == 1:
                    warm_start = False

                self.model.set_params(n_estimators=num_iter, warm_start=warm_start)
                self.model.fit(X_fit, y)

                if self.training_prediction_intervals == True:
                    val_loss = self.model.score(X_test, y_test)
                else:
                    try:
                        val_loss = self._scorer.score(self, X_test, y_test)
                    except Exception as e:
                        val_loss = self.model.score(X_test, y_test)

                if val_loss - self.min_step_improvement > best_val_loss:
                    best_val_loss = val_loss
                    num_worse_rounds = 0
                    best_model = deepcopy(self.model)
                else:
                    num_worse_rounds += 1

                print("[" + str(num_iter) + "] random_holdout_set_from_training_data's score is: " + str(round(val_loss, 3)))
                if num_worse_rounds >= patience:
                    break
        except KeyboardInterrupt:
            print("Heard KeyboardInterrupt. Stopping training, and using the best checkpointed GradientBoosting model")
            pass

        self.model = best_model
        print("The number of estimators that were the best for this training dataset: " + str(self.model.get_params()["n_estimators"]))
        print("The best score on a random 15 percent holdout set of the training data: " + str(best_val_loss))

    else:
        self.model.fit(X_fit, y)

except TypeError as e:
    if scipy.sparse.issparse(X_fit):
        X_fit = X_fit.todense()
    self.model.fit(X_fit, y)

except KeyboardInterrupt as e:
    print("Stopping training at this point because we heard a KeyboardInterrupt")
    print("If the model is functional at this point, we will output the model in its latest form")
    print("Note that not all models can be interrupted and still used, and that this feature generally is an unofficial beta-release feature that is known to fail on occasion")
    pass

return self
def remove_categorical_values(self, features):
    clean_features = set([])
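For reference, the GradientBoosting branch above implements early stopping by hand: it grows the ensemble incrementally with warm_start=True, scores a held-out split after each growth step, and keeps a deepcopy of the best model seen so far. Below is a minimal, self-contained sketch of that technique on synthetic data; the fixed step size, the scorer (plain R^2 via model.score), and the omitted min_step_improvement tolerance are simplified stand-ins, not the commit's exact values.

import numpy as np
from copy import deepcopy
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

X = np.random.rand(500, 10)
y = X.sum(axis=1) + np.random.rand(500)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)

model = GradientBoostingRegressor(n_estimators=1, warm_start=False)
best_model, best_score, num_worse_rounds, patience = None, -np.inf, 0, 20

for n in range(1, 500, 5):
    # warm_start=True reuses the already-fitted trees and only adds the new ones
    model.set_params(n_estimators=n, warm_start=(n > 1))
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)  # R^2 on the holdout split
    if score > best_score:
        best_score, num_worse_rounds, best_model = score, 0, deepcopy(model)
    else:
        num_worse_rounds += 1
    if num_worse_rounds >= patience:
        break  # no improvement for `patience` rounds: stop and keep the checkpoint

model = best_model
print("best n_estimators:", model.get_params()["n_estimators"], "holdout R^2:", round(best_score, 3))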
After Change
cat_feature_indices = self.get_categorical_feature_indices()
if cat_feature_indices is None:
    self.model.fit(X_fit, y, eval_set=[(X_test, y_test)], early_stopping_rounds=50, eval_metric=eval_metric, eval_names=["random_holdout_set_from_training_data"])
else:
    self.model.fit(X_fit, y, eval_set=[(X_test, y_test)], early_stopping_rounds=50, eval_metric=eval_metric, eval_names=["random_holdout_set_from_training_data"], categorical_feature=cat_feature_indices)
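The change itself is a None guard: when get_categorical_feature_indices() returns None, the categorical_feature argument is omitted entirely so that LightGBM's scikit-learn wrapper falls back to its own default handling. A hypothetical, equivalent way to write the same guard without duplicating the fit call (not what the commit does, and reusing the names from the snippet above) is to build the keyword arguments conditionally:

fit_kwargs = dict(
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=50,
    eval_metric=eval_metric,
    eval_names=["random_holdout_set_from_training_data"],
)
if cat_feature_indices is not None:
    # only forward categorical_feature when we actually have indices
    fit_kwargs["categorical_feature"] = cat_feature_indices
self.model.fit(X_fit, y, **fit_kwargs)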
In pattern: SUPERPATTERN
Frequency: 3
Non-data size: 4
Instances
Project Name: ClimbsRocks/auto_ml
Commit Name: 019f0c822cd8f2833d3b25855f2df99cd5a465a4
Time: 2017-08-17
Author: ClimbsBytes@gmail.com
File Name: auto_ml/utils_model_training.py
Class Name: FinalModelATC
Method Name: fit
Project Name: scikit-learn/scikit-learn
Commit Name: 9b39c4c4d20eef7a2b0b8420945f09d3731e1b67
Time: 2020-02-16
Author: sci@feldbauer.org
File Name: sklearn/feature_extraction/tests/test_text.py
Class Name:
Method Name: test_countvectorizer_custom_vocabulary_gap_index
Project Name: scikit-learn/scikit-learn
Commit Name: 9b39c4c4d20eef7a2b0b8420945f09d3731e1b67
Time: 2020-02-16
Author: sci@feldbauer.org
File Name: sklearn/feature_extraction/tests/test_text.py
Class Name:
Method Name: test_countvectorizer_custom_vocabulary_repeated_indices