# Make a stratified train/test split of the data (80% train / 20% test),
# stratifying on the "income_cat" column. random_state is fixed so the split
# is reproducible between runs.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    # split() yields positional row numbers, not index labels, so rows are
    # selected with .iloc. (.loc only gives the same rows when the frame
    # still has its default 0..n-1 index.)
    train_set = housing.iloc[train_index]
    test_set = housing.iloc[test_index]
# After change
# ------------------
# Load the three California reference tables. The paths are relative, so the
# CSV files are looked up in the current working directory. Judging by the
# file names these hold city coordinates, city populations and county
# populations respectively.
city_lat_long, city_pop_data, county_pop_data = map(
    pd.read_csv,
    (
        "cal_cities_lat_long.csv",
        "cal_populations_city.csv",
        "cal_populations_county.csv",
    ),
)
# Original version; it had to change because we only want to deal with cities
# for which we have both location and population data.
# Map every city name in city_lat_long to a (latitude, longitude) tuple of
# floats. If a name occurs more than once, the last row wins.
city_coords = {
    name: (float(latitude), float(longitude))
    for name, latitude, longitude in zip(
        city_lat_long["Name"],
        city_lat_long["Latitude"],
        city_lat_long["Longitude"],
    )
}
# How we discovered the need for the change
# Diagnostic: partition the cities we have coordinates for by whether they
# also appear in the "City" column of the population table.
# The population names are collected into a set once, so each membership test
# is a constant-time lookup instead of a scan of the whole column.
known_cities = set(city_pop_data["City"])
present = [city for city in city_coords if city in known_cities]
absent = [city for city in city_coords if city not in known_cities]
# Bare expressions, presumably kept for interactive (notebook/REPL)
# inspection; they have no effect when run as a script.
len(present)
len(absent)
absent
# Rebuild city_coords as {name: (latitude, longitude)}, keeping only cities
# that also appear in the "City" column of the population table.
# The population names are collected into a set once, so the per-row check is
# a constant-time lookup instead of a scan of the whole column.
cities_with_population = set(city_pop_data["City"])
city_coords = {
    # The filter below runs before this conversion, so coordinates of
    # skipped cities are never passed to float().
    name: (float(latitude), float(longitude))
    for name, latitude, longitude in zip(
        city_lat_long["Name"],
        city_lat_long["Latitude"],
        city_lat_long["Longitude"],
    )
    if name in cities_with_population
}