Project Guided Predicting Car Prices

Screen Link: https://app.dataquest.io/m/155/guided-project%3A-predicting-car-prices/3/univariate-model
I rewrote the knn_train_test() function. I think my definition is quite a typical ML function format. The only thing I have concern is the shuffling of dataframe. The solution used : np.random.permutation(), while I used random.shuffle(df). I check the error, didn’t know if the error started from shuffling. By the way, I checked some online videos. There are so many ways to shuffle the dataframe. eg df.sample(), np.random.permutation(), and df.shuffle(), sklearn.utils.shuffle(df). I know some of them I have to reindex the df. Some don’t. Also regarding random state for shuffling, some you need to put np.random.seed(1) first or some you need to put inside the function as an argument. df.sample( ```
random_state=42). I know some of those functions have something to do with running speed. But I still don’t know the nuance of those shuffling methods. Will you causing problems when dealing with n-d arrays? Can someone help me? Thank you.

My Code:

from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
import random

def knn_train_test(train_col, target_col, df):

    np.random.seed(1)
    random.shuffle(df)
    
    X = np.array(df[[train_col]])
    y = np.array(df[target_col])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
    clf = KNeighborsRegressor()
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    MSE = mean_squared_error(y_test, predictions)
    RMSE = np.sqrt(MSE)
    return RMSE

rmse_results = {}
train_cols = numeric_cars.columns.drop('price')

# For each column (minus `price`), train a model, return RMSE value
# and add to the dictionary `rmse_results`.
for col in train_cols:
    rmse_val = knn_train_test(col, 'price', numeric_cars)
    rmse_results[col] = rmse_val

rmse_results_series = pd.Series(rmse_results)
rmse_results_series.sort_values()

error comes back

KeyErrorTraceback (most recent call last)
/dataquest/system/env/python3/lib/python3.4/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2524             try:
-> 2525                 return self._engine.get_loc(key)
   2526             except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 127

During handling of the above exception, another exception occurred:

KeyErrorTraceback (most recent call last)
<ipython-input-17-12b7de792792> in <module>()
     24 # and add to the dictionary `rmse_results`.
     25 for col in train_cols:
---> 26     rmse_val = knn_train_test(col, 'price', numeric_cars)
     27     rmse_results[col] = rmse_val
     28 

<ipython-input-17-12b7de792792> in knn_train_test(train_col, target_col, df)
      6 
      7     np.random.seed(1)
----> 8     random.shuffle(df)
      9 
     10     X = np.array(df[[train_col]])

/dataquest/system/env/python3/lib/python3.4/random.py in shuffle(self, x, random)
    270                 # pick an element in x[:i+1] with which to exchange x[i]
    271                 j = randbelow(i+1)
--> 272                 x[i], x[j] = x[j], x[i]
    273         else:
    274             _int = int

/dataquest/system/env/python3/lib/python3.4/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2137             return self._getitem_multilevel(key)
   2138         else:
-> 2139             return self._getitem_column(key)
   2140 
   2141     def _getitem_column(self, key):

/dataquest/system/env/python3/lib/python3.4/site-packages/pandas/core/frame.py in _getitem_column(self, key)
   2144         # get column
   2145         if self.columns.is_unique:
-> 2146             return self._get_item_cache(key)
   2147 
   2148         # duplicate columns & possible reduce dimensionality

/dataquest/system/env/python3/lib/python3.4/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
   1840         res = cache.get(item)
   1841         if res is None:
-> 1842             values = self._data.get(item)
   1843             res = self._box_item_values(item, values)
   1844             cache[item] = res

/dataquest/system/env/python3/lib/python3.4/site-packages/pandas/core/internals.py in get(self, item, fastpath)
   3841 
   3842             if not isna(item):
-> 3843                 loc = self.items.get_loc(item)
   3844             else:
   3845                 indexer = np.arange(len(self.items))[isna(self.items)]

/dataquest/system/env/python3/lib/python3.4/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2525                 return self._engine.get_loc(key)
   2526             except KeyError:
-> 2527                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2528 
   2529         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 127

dataquestion solution is:

from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

def knn_train_test(train_col, target_col, df):
    knn = KNeighborsRegressor()
    np.random.seed(1)
        
    # Randomize order of rows in data frame.
    shuffled_index = np.random.permutation(df.index)
    rand_df = df.reindex(shuffled_index)

    # Divide number of rows in half and round.
    last_train_row = int(len(rand_df) / 2)
    
    # Select the first half and set as training set.
    # Select the second half and set as test set.
    train_df = rand_df.iloc[0:last_train_row]
    test_df = rand_df.iloc[last_train_row:]
    
    # Fit a KNN model using default k value.
    knn.fit(train_df[[train_col]], train_df[target_col])
    
    # Make predictions using model.
    predicted_labels = knn.predict(test_df[[train_col]])

    # Calculate and return RMSE.
    mse = mean_squared_error(test_df[target_col], predicted_labels)
    rmse = np.sqrt(mse)
    return rmse

rmse_results = {}
train_cols = numeric_cars.columns.drop('price')

# For each column (minus `price`), train a model, return RMSE value
# and add to the dictionary `rmse_results`.
for col in train_cols:
    rmse_val = knn_train_test(col, 'price', numeric_cars)
    rmse_results[col] = rmse_val

# Create a Series object from the dictionary so 
# we can easily view the results, sort, etc
rmse_results_series = pd.Series(rmse_results)
rmse_results_series.sort_values()

random.shuffle will work for either python lists or numpy arrays, as far as I know. It won’t work for Pandas Dataframe.

You are most likely getting that error because of that. Best to work with the one currently mentioned in the course to keep it simple.

1 Like

Thank you doctor. Fix my bugs, I add this line of code to convert dataframe to a list of lists. Before I always assumed numpy array and dataframe are the same thing. Actually they are different.

np.random.seed(1)
data = df.astype(float).values.tolist()
random.shuffle(data)
1 Like