// We then get the data and define a target column that we will try to predict,
// as well as a dirty column that we will encode with the different methods.
// The rest of the columns will have a standard encoding.
# Look up the data directory, then call the employee-salaries fetcher.
# NOTE(review): presumably the fetch places the dataset under that directory,
# since the CSV is read from there just below — confirm.
data_path = fetching.get_data_dir()
fetching.fetch_employee_salaries()
# The CSV is expected at <data_path>/employee_salaries/rows.csv.
data_file = os.path.join(data_path, "employee_salaries", "rows.csv")
# Read the CSV and cast every column to str.
df = pd.read_csv(data_file).astype(str)
# Drop the first character of each salary string and parse the rest as a float.
# NOTE(review): presumably that first character is a currency symbol; a value
# without such a prefix would lose its leading digit or fail to parse — confirm.
df["Current Annual Salary"] = [float(s[1:]) for s
in df["Current Annual Salary"]]
df["Year First Hired"] = [int(s.split("/")[-1])
After Change
# Import the dataset fetcher from dirty_cat.
from dirty_cat.datasets import fetch_employee_salaries
# The returned object is indexed with "path" below to locate the CSV file.
description = fetch_employee_salaries()
# Read that CSV and cast every column to str.
df = pd.read_csv(description["path"]).astype(str)
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// and carry out some basic preprocessing: