def knn_train_test(train, target, df):
    """Fit single-feature k-NN regressors and report the hold-out RMSE per k.

    The rows of ``df`` are shuffled with a fixed seed; the first half is used
    for fitting and the remaining rows for evaluation.

    Args:
        train: Label of the one column used as the predictor.
        target: Label of the column to predict.
        df: DataFrame holding both columns.

    Returns:
        dict mapping each k in (1, 3, 5, 7, 9) to the RMSE on the test rows.
    """
    # Fixed seed so repeated calls on the same frame yield the same split.
    # Note that this reseeds NumPy's global random state.
    np.random.seed(1)
    shuffled = df.reindex(np.random.permutation(df.index))

    split_at = len(shuffled) // 2
    fit_rows = shuffled.iloc[:split_at]
    eval_rows = shuffled.iloc[split_at:]

    # Double brackets below keep the predictor 2-D (a one-column DataFrame).
    features = [train]
    rmse_by_k = {}
    for n_neighbors in (1, 3, 5, 7, 9):
        model = KNeighborsRegressor(n_neighbors=n_neighbors)
        model.fit(fit_rows[features], fit_rows[target])
        predicted = model.predict(eval_rows[features])
        rmse_by_k[n_neighbors] = np.sqrt(
            mean_squared_error(eval_rows[target], predicted)
        )
    return rmse_by_k
# Score every column other than 'price' as a lone predictor of 'price'.
# Result shape: {column label: {k: rmse}}.
train_cols = norm_cars.columns.drop('price')
mult_rmses = {
    col: knn_train_test(col, 'price', norm_cars)
    for col in train_cols
}
mult_rmses
import matplotlib.pyplot as plt
%matplotlib inline
# Draw one RMSE-vs-k line per feature column.
for k_rmses in mult_rmses.values():
    # Sort by k before plotting. A dict does not promise ascending key order
    # (and on older interpreters not even insertion order), so feeding
    # keys()/values() straight to plt.plot can make the line jump backwards
    # along the x-axis and look like extra, overlapping lines.
    k_sorted = sorted(k_rmses)
    plt.plot(k_sorted, [k_rmses[k] for k in k_sorted])

# The axis labels are identical for every line, so set them once.
plt.xlabel('k value')
plt.ylabel('RMSE')
I expected this chart to match the solution code for this guided project, with lines for each variable showing the RMSE value that corresponds to each k value.
What actually happened:
My matplotlib plot doubles back on itself right in the middle of the chart, even though none of the dictionaries contain duplicate keys. As a result, halfway through the chart I have more lines than I need.
My code is almost completely the same as the solution notebook, so I’m having a hard time finding the source of this error. Thanks for any help!