5aaa63d7ecf85d341fec9b758d17f2bf4cde6042,src/pudl/transform/ferc1.py,FERCPlantClassifier,predict,#FERCPlantClassifier#Any#Any#,1292

Before Change


            // Grab the index values of the rows in the masked dataframe which
            // are NOT all NaN -- these are the indices of the *other* records
            // which found the record x to be one of their best matches.
            w_m = w_m.dropna(how="all").index.values

            // Now look up the indices of the records which were found to be
            // best matches to the record x.
            b_m = self._best_of.loc[idx, self._years].astype(int)

After Change


                "You must train classifer before predicting data!")

        out_df = pd.DataFrame(
            data=[],
            index=pd.Index([], name="seed_id"),
            columns=self._years)
        tmp_best = (
            self._best_of.loc[:, ["record_id"] + list(self._years)]
            .append(pd.DataFrame(data=[""], index=[-1], columns=["record_id"]))
        )
        // For each record_id we've been given:
        for x in X:
            // Find the index associated with the record ID we are predicting
            // a grouping for:
            idx = tmp_best[tmp_best.record_id == x].index.values[0]

            // Mask the best_of dataframe, keeping only those entries where
            // the index of the chosen record_id appears -- this results in a
            // huge dataframe almost full of NaN values.
            w_m = (
                tmp_best[self._years][tmp_best[self._years] == idx]
                // Grab the index values of the rows in the masked dataframe which
                // are NOT all NaN -- these are the indices of the *other* records
                // which found the record x to be one of their best matches.
                .dropna(how="all").index.values
            )

            // Now look up the indices of the records which were found to be
            // best matches to the record x.
            b_m = tmp_best.loc[idx, self._years].astype(int)

            // Here we require that there is no conflict between the two sets
            // of indices -- that every time a record shows up in a grouping,
            // that grouping is either the same, or a subset of the other
            // groupings that it appears in. When no sufficiently good match
            // is found the "index" in the _best_of array is set to -1, so
            // requiring that the b_m value be >=0 screens out those no-match
            // cases. This is okay -- we're just trying to require that the
            // groupings be internally self-consistent, not that they are
            // completely identical. Being flexible on this dramatically
            // increases the number of records that get assigned a plant ID.
            if np.array_equiv(w_m, b_m[b_m >= 0].values):
                // This line is causing a warning. In cases where there are
                // some years in which no sufficiently good match exists, b_m
                // doesn't contain an index. Instead, it has a -1 sentinel
                // value, which isn't a label for which a record exists, which
                // upsets .loc. Need to find some way around this... but for
                // now it does what we want. We could use .iloc instead, but
                // then the -1 sentinel value maps to the last entry in the
                // dataframe, which also isn't what we want.  Blargh.
                new_grp = tmp_best.loc[b_m, "record_id"]

                // Stack the new list of record_ids on our output DataFrame:
                out_df = out_df.append(
                    pd.DataFrame(
                        data=new_grp.values.reshape(1, len(self._years)),
                        index=pd.Index(
                            [tmp_best.loc[idx, "record_id"]],
                            name="seed_id"),
                        columns=self._years))
        return out_df

    def score(self, X, y=None):  // noqa: N803
        Scores a collection of FERC plant categorizations.
Italian Trulli
In pattern: SUPERPATTERN

Frequency: 4

Non-data size: 4

Instances


Project Name: catalyst-cooperative/pudl
Commit Name: 5aaa63d7ecf85d341fec9b758d17f2bf4cde6042
Time: 2020-03-11
Author: zane.selvans@catalyst.coop
File Name: src/pudl/transform/ferc1.py
Class Name: FERCPlantClassifier
Method Name: predict


Project Name: bashtage/linearmodels
Commit Name: a0be73979eded2a2cea2fd526ea5db87b1abf3c5
Time: 2020-01-20
Author: kevin.k.sheppard@gmail.com
File Name: linearmodels/system/results.py
Class Name: SystemResults
Method Name: _out_of_sample


Project Name: ScottfreeLLC/AlphaPy
Commit Name: 45204c65d1ba431f92347ae269db596410af7cb3
Time: 2017-05-12
Author: Mark.R.Conway@gmail.com
File Name: alphapy/analysis.py
Class Name:
Method Name: run_analysis


Project Name: metagenome-atlas/atlas
Commit Name: 3ab0e54a3ce88c7018a34192461dd47e5867d357
Time: 2017-01-28
Author: joe.brown@pnnl.gov
File Name: atlas/tables.py
Class Name:
Method Name: col_split