# source page: https://app.dataquest.io/m/65/guided-project%3A-predicting-the-stock-market/7/next-steps
if __name__ == "__main__":
    import pandas as pd
    from datetime import datetime

    # Load historical S&P 500 daily data. Convert Date to a real datetime
    # (the original call discarded the result of pd.to_datetime, leaving
    # Date as strings) so sorting and comparisons are chronological.
    data = pd.read_csv('sphist.csv')
    data['Date'] = pd.to_datetime(data['Date'])
    data = data.sort_values(by=['Date'], ascending=True)

    # Rolling means over the previous 5 / 30 / 365 trading rows.
    # .shift() moves each window result forward one row so the feature for
    # day t is computed only from rows strictly before t (no leakage of
    # the same day's Close into its own prediction).
    for window in (5, 30, 365):
        data['{}_day'.format(window)] = data['Close'].rolling(window).mean().shift()

    # Drop rows whose rolling features are undefined. Do NOT fillna(0)
    # first: that silently injected bogus 0-valued 365_day features into
    # early-1951 rows (365 trading rows from the 1950 start reach past
    # 1951-01-03) and made the subsequent dropna a no-op.
    data = data[data['Date'] > datetime(1951, 1, 3)]
    data = data.dropna(axis=0)

    # Time-ordered split: train on everything before 2013, evaluate on 2013+.
    train = data[data['Date'] < datetime(2013, 1, 1)]
    test = data[data['Date'] >= datetime(2013, 1, 1)]

    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_absolute_error

    column_focus = ['5_day', '30_day', '365_day']
    lr = LinearRegression()
    lr.fit(train[column_focus], train['Close'])
    pred = lr.predict(test[column_focus])

    # Report error alongside the scale of the target so the MAE can be
    # judged relative to typical index levels.
    mae = mean_absolute_error(test['Close'], pred)
    mean_ = test['Close'].mean()
    max_ = test['Close'].max()
    min_ = test['Close'].min()
    print('mean absolute error: {}\n mean: {}\n max: {}\n min: {}'.format(mae, mean_, max_, min_))
# Results:
#   mean absolute error: 16.14225437004503
#   mean: 1874.890338389716
#   max: 2130.820068
#   min: 1457.150024
#
# This low error is not overfitting: the test set is strictly out-of-sample
# (2013 onward, never seen during fitting), and an overfit model would score
# *worse* there, not better. The small MAE comes from the strong day-to-day
# autocorrelation of the index — the previous 5-day rolling average of Close
# is already an excellent predictor of the next day's Close, so even a simple
# linear model tracks it closely.