# Work on a copy so the raw "plant" frame in eia860_dfs is left untouched.
p_df = eia860_dfs["plant"].copy()

# Replace "." placeholders and empty / whitespace-only strings with real NA
# values. ``^\s*$`` matches the empty string as well as any run of
# whitespace, so cells holding several blanks are normalized too.
p_df = p_df.replace(
    to_replace=[r"^\.$", r"^\s*$"], value=np.nan, regex=True
)

# Cast values in zip_code to strings to avoid type errors.
# NOTE(review): astype(str) renders missing values as the literal "nan";
# confirm that this is acceptable downstream.
p_df["zip_code"] = p_df["zip_code"].astype(str)

# A subset of the columns use "X" where the other boolean columns use "N".
# Mapping "X" to "N" gives uniform values that convert cleanly to
# Boolean True and False pairs.
for column in (
    "ash_impoundment_lined",
    "natural_gas_storage",
    "liquefied_natural_gas_storage",
):
    p_df[column] = p_df[column].replace(to_replace="X", value="N")

boolean_columns_to_fix = [
    "ferc_cogen_status",
    "ferc_small_power_producer",
    "ferc_exempt_wholesale_generator",
    "ash_impoundment",
    "ash_impoundment_lined",
    "energy_storage",
    "natural_gas_storage",
    "liquefied_natural_gas_storage",
]

for column in boolean_columns_to_fix:
    # Missing values are treated as False. Fill with the boolean False, not
    # the string "False": the string is never converted by the Y/N
    # replacement below and would be truthy, leaving a mixed str/bool column.
    p_df[column] = p_df[column].fillna(False)
    p_df[column] = p_df[column].replace(
        to_replace=["Y", "N"], value=[True, False]
    )

# Ensure plant & operator IDs are integers.
# NOTE: astype(int) raises if any of these columns still contains NA.
p_df["plant_id_eia"] = p_df["plant_id_eia"].astype(int)
p_df["utility_id_eia"] = p_df["utility_id_eia"].astype(int)
p_df["primary_purpose_naics_id"] = (
    p_df["primary_purpose_naics_id"].astype(int)
)
After Change
# Populating the "plants_eia860" table
# Start from a copy so the raw "plant" frame in eia860_dfs is not mutated.
p_df = eia860_dfs["plant"].copy()

# Replace empty strings, whitespace, and "." fields with real NA values.
# (Delegated to the shared helper, which is defined outside this section.)
p_df = pudl.helpers.fix_eia_na(p_df)

# Cast values in zip_code to strings to avoid type errors.
# NOTE(review): astype(str) renders missing values as the literal "nan";
# confirm that this is acceptable downstream.
p_df["zip_code"] = p_df["zip_code"].astype(str)

# A subset of the columns have "X" values, where the other boolean columns
# have "N" values. Replacing "X" with "N" makes for uniform values that can
# be converted to Boolean True and False pairs.
x_coded_columns = [
    "ash_impoundment_lined",
    "natural_gas_storage",
    "liquefied_natural_gas_storage",
]
p_df[x_coded_columns] = p_df[x_coded_columns].replace(
    to_replace="X", value="N"
)

boolean_columns_to_fix = [
    "ferc_cogen_status",
    "ferc_small_power_producer",
    "ferc_exempt_wholesale_generator",
    "ash_impoundment",
    "ash_impoundment_lined",
    "energy_storage",
    "natural_gas_storage",
    "liquefied_natural_gas_storage",
]

for column in boolean_columns_to_fix:
    # Treat missing values as False. The fill value must be the boolean
    # False; the string "False" would survive the Y/N replacement unchanged
    # and evaluate as truthy, leaving a column of mixed str/bool values.
    p_df[column] = p_df[column].fillna(False)
    p_df[column] = p_df[column].replace(
        to_replace=["Y", "N"], value=[True, False]
    )

# Ensure plant & operator IDs are integers.
# NOTE: astype(int) raises if any of these columns still contains NA.
p_df["plant_id_eia"] = p_df["plant_id_eia"].astype(int)
p_df["utility_id_eia"] = p_df["utility_id_eia"].astype(int)
p_df["primary_purpose_naics_id"] = (
    p_df["primary_purpose_naics_id"].astype(int)
)