// Save the seed record_id for use in indexing the output:
out_idx = out_idx + [self._best_of.loc[idx, "record_id"]]
out_df["seed_id"] = out_idx
out_df = out_df.set_index("seed_id")
out_df = out_df.fillna("")
return out_df
After Change
out_df = pd.DataFrame(
data=[],
index=pd.Index([], name="seed_id"),
columns=self._years)
tmp_best = (
self._best_of.loc[:, ["record_id"] + list(self._years)]
.append(pd.DataFrame(data=[""], index=[-1], columns=["record_id"]))
)
// For each record_id we"ve been given:
for x in X:
// Find the index associated with the record ID we are predicting
// a grouping for:
idx = tmp_best[tmp_best.record_id == x].index.values[0]
// Mask the best_of dataframe, keeping only those entries where
// the index of the chosen record_id appears -- this results in a
// huge dataframe almost full of NaN values.
w_m = (
tmp_best[self._years][tmp_best[self._years] == idx]
// Grab the index values of the rows in the masked dataframe which
// are NOT all NaN -- these are the indices of the *other* records
// which found the record x to be one of their best matches.
.dropna(how="all").index.values
)
// Now look up the indices of the records which were found to be
// best matches to the record x.
b_m = tmp_best.loc[idx, self._years].astype(int)
// Here we require that there is no conflict between the two sets
// of indices -- that every time a record shows up in a grouping,
// that grouping is either the same, or a subset of the other
// groupings that it appears in. When no sufficiently good match
// is found the "index" in the _best_of array is set to -1, so
// requiring that the b_m value be >=0 screens out those no-match
// cases. This is okay -- we"re just trying to require that the
// groupings be internally self-consistent, not that they are
// completely identical. Being flexible on this dramatically
// increases the number of records that get assigned a plant ID.
if np.array_equiv(w_m, b_m[b_m >= 0].values):
// This line is causing a warning. In cases where there are
// some years no sufficiently good match exists, and so b_m
// doesn"t contain an index. Instead, it has a -1 sentinel
// value, which isn"t a label for which a record exists, which
// upsets .loc. Need to find some way around this... but for
// now it does what we want. We could use .iloc instead, but
// then the -1 sentinel value maps to the last entry in the
// dataframe, which also isn"t what we want. Blargh.
new_grp = tmp_best.loc[b_m, "record_id"]
// Stack the new list of record_ids on our output DataFrame:
out_df = out_df.append(
pd.DataFrame(
data=new_grp.values.reshape(1, len(self._years)),
index=pd.Index(
[tmp_best.loc[idx, "record_id"]],
name="seed_id"),
columns=self._years))
return out_df
def score(self, X, y=None): // noqa: N803
Scores a collection of FERC plant categorizations.