5aaa63d7ecf85d341fec9b758d17f2bf4cde6042,src/pudl/transform/ferc1.py,FERCPlantClassifier,predict,#FERCPlantClassifier#Any#Any#,1292

Before Change


                // Save the seed record_id for use in indexing the output:
                out_idx = out_idx + [self._best_of.loc[idx, "record_id"]]

        out_df["seed_id"] = out_idx
        out_df = out_df.set_index("seed_id")
        out_df = out_df.fillna("")
        return out_df

After Change



        out_df = pd.DataFrame(
            data=[],
            index=pd.Index([], name="seed_id"),
            columns=self._years)
        tmp_best = (
            self._best_of.loc[:, ["record_id"] + list(self._years)]
            .append(pd.DataFrame(data=[""], index=[-1], columns=["record_id"]))
        )
        // For each record_id we've been given:
        for x in X:
            // Find the index associated with the record ID we are predicting
            // a grouping for:
            idx = tmp_best[tmp_best.record_id == x].index.values[0]

            // Mask the best_of dataframe, keeping only those entries where
            // the index of the chosen record_id appears -- this results in a
            // huge dataframe almost full of NaN values.
            w_m = (
                tmp_best[self._years][tmp_best[self._years] == idx]
                // Grab the index values of the rows in the masked dataframe which
                // are NOT all NaN -- these are the indices of the *other* records
                // which found the record x to be one of their best matches.
                .dropna(how="all").index.values
            )

            // Now look up the indices of the records which were found to be
            // best matches to the record x.
            b_m = tmp_best.loc[idx, self._years].astype(int)

            // Here we require that there is no conflict between the two sets
            // of indices -- that every time a record shows up in a grouping,
            // that grouping is either the same, or a subset of the other
            // groupings that it appears in. When no sufficiently good match
            // is found the "index" in the _best_of array is set to -1, so
            // requiring that the b_m value be >=0 screens out those no-match
            // cases. This is okay -- we're just trying to require that the
            // groupings be internally self-consistent, not that they are
            // completely identical. Being flexible on this dramatically
            // increases the number of records that get assigned a plant ID.
            if np.array_equiv(w_m, b_m[b_m >= 0].values):
                // This line is causing a warning. In some years no sufficiently
                // good match exists, so b_m doesn't contain an index for those
                // years. Instead, it has a -1 sentinel value, which isn't a
                // label for which a record exists, which upsets .loc. Need to
                // find some way around this... but for now it does what we
                // want. We could use .iloc instead, but then the -1 sentinel
                // value maps to the last entry in the dataframe, which also
                // isn't what we want.  Blargh.
                new_grp = tmp_best.loc[b_m, "record_id"]

                // Stack the new list of record_ids on our output DataFrame:
                out_df = out_df.append(
                    pd.DataFrame(
                        data=new_grp.values.reshape(1, len(self._years)),
                        index=pd.Index(
                            [tmp_best.loc[idx, "record_id"]],
                            name="seed_id"),
                        columns=self._years))
        return out_df

    def score(self, X, y=None):  // noqa: N803
        Scores a collection of FERC plant categorizations.
Italian Trulli
In pattern: SUPERPATTERN

Frequency: 4

Non-data size: 3

Instances


Project Name: catalyst-cooperative/pudl
Commit Name: 5aaa63d7ecf85d341fec9b758d17f2bf4cde6042
Time: 2020-03-11
Author: zane.selvans@catalyst.coop
File Name: src/pudl/transform/ferc1.py
Class Name: FERCPlantClassifier
Method Name: predict


Project Name: pandas-dev/pandas
Commit Name: 5a7514ccb18a3a506d453dd048c665c33835ee56
Time: 2020-12-11
Author: jbrockmendel@gmail.com
File Name: pandas/tests/indexes/categorical/test_indexing.py
Class Name: TestGetIndexer
Method Name: test_get_indexer_non_unique


Project Name: bokeh/bokeh
Commit Name: 3c8314eafc16c5c363c75a57320843036f459196
Time: 2018-07-31
Author: karelvandeplassche@gmail.com
File Name: bokeh/models/sources.py
Class Name: ColumnDataSource
Method Name: _data_from_df


Project Name: ixaxaar/pytorch-dnc
Commit Name: d7f916627aca77365beadc2c7c1af6504f445249
Time: 2017-11-30
Author: root@ixaxaar.in
File Name: dnc/sparse_memory.py
Class Name: SparseMemory
Method Name: rebuild_indexes