Link to my notebook as it stands
Basics (14).ipynb (97.3 KB)
Immediately relevant code
def select_fatures(DF):
# na_count = DF[numeric_cols].isna().sum()
# bad_cols = na_count[na_count >0]
# feature_list = numeric_cols.drop(target)
# target_list = ['SalePrice']
# feature_list = ['Gr Liv Area']
return feature_list
def train_and_test(DF, transform= False, feature_list = ['Gr Liv Area']):
train = DF[:1460]
test = DF[1460:]
target = 'SalePrice'
if transform == True:
transform(DF)
lr = LinearRegression()
lr.fit(train[feature_list],train[target])
train_prediction = lr.predict(train[feature_list])
test_prediction = lr.predict(test[feature_list])
train_rmse = math.sqrt( mse(train_prediction, train['SalePrice']))
test_rmse = math.sqrt( mse(test_prediction, test['SalePrice']))
print('train_rmse {}, \n test_rmse {}'.format(train_rmse,test_rmse))
numeric_cols = data.select_dtypes(['int64','float64']).columns
data['years_until_remod'] = (data['Year Remod/Add'] - data['Year Built'])
numeric_cols = data.select_dtypes(['int64','float64']).columns
num_data =data[numeric_cols].drop(columns = ['Order',
'PID',
'Year Built',
'Year Remod/Add',
'Lot Frontage',
'Garage Yr Blt',
'MS SubClass'])
num_data = num_data[num_data['years_until_remod']>= 0]
num_data = num_data.dropna(axis = 0)
cleaned_data = num_data
print(cleaned_data.shape, ' ', data.shape)
import seaborn as sns
sns.heatmap(cleaned_data.corr())
plt.show()
clean_corr = cleaned_data.corr()['SalePrice']
hi_corr = clean_corr[clean_corr> .4].sort_values(ascending=False)
print(hi_corr)
hi_corr_index = hi_corr.index
train_and_test(cleaned_data, feature_list = hi_corr_index)
relevant output:
train_rmse 1.1867434335950793e-11,
test_rmse 1.1984677298704358e-11
I do not believe my rmse should be that small. At this point in the project I should not be getting within fractions of a penny of accurately predicting prices.
Click here to view the jupyter notebook file in a new tab