 # Why is min-max scaling not used in the solution provided to the project in the Linear Regression course?

My Code:

def transform_features(data):
    """Clean the Ames housing data: drop sparse/leaky columns, impute numeric
    NaNs with the column mode, and engineer remodel-age features.

    Parameters: data — raw AmesHousing DataFrame (must contain 'PID', 'Order',
    'Year Built', 'Year Remod/Add', 'Yr Sold', 'Mo Sold', 'Sale Condition',
    'Sale Type' and rows indexed 850, 1702, 2180, 2181).
    Returns the cleaned DataFrame.
    """
    numerical_features = data.select_dtypes(['float', 'int', 'float64', 'int64']).columns
    numerical_features = numerical_features.drop(['PID', 'Order'])

    # Drop the numeric columns with more than 5 percent missing values.
    # BUG FIX: data.shape is a (rows, cols) tuple; the row count is data.shape[0],
    # so the 5% threshold is data.shape[0] / 20.
    null_columns = data[numerical_features].isnull().sum()
    columns_to_drop = null_columns[null_columns > data.shape[0] / 20]
    data = data.drop(columns_to_drop.index, axis=1)
    null_columns.drop(index=columns_to_drop.index, inplace=True)

    # Fill the remaining numeric NaNs with the most frequent value (the mode).
    # BUG FIX: Series.mode() returns a Series; fillna with a Series aligns on the
    # index and fills almost nothing — take the first mode value instead.
    selected_columns = null_columns[null_columns > 0].index
    for col in selected_columns:
        data[col] = data[col].fillna(data[col].mode()[0])

    # Remove every text column with at least one null value.
    text_cols = data.select_dtypes(['object']).columns
    text_cols_drop = text_cols[data[text_cols].isnull().sum() > 0]
    data.drop(text_cols_drop, inplace=True, axis=1)

    # Engineer features that better capture the age information in the data.
    data['years_until_remod'] = data['Year Remod/Add'] - data['Year Built']
    data['years_since_remod'] = data['Yr Sold'] - data['Year Remod/Add']

    # Drop the rows containing negative ages — these are data-entry errors.
    index_drop = [850, 1702, 2180, 2181]
    data.drop(index_drop, inplace=True, axis=0)

    # Drop identifier columns and columns that leak information about the final sale.
    data.drop(['PID', 'Order', 'Mo Sold', 'Sale Condition', 'Sale Type', 'Yr Sold',
               'Year Built', 'Year Remod/Add'], axis=1, inplace=True)
    return data

def select_features(transformed_data):
    """Select features for the model: keep numeric columns with |corr| >= 0.4
    against SalePrice, drop two collinear columns, encode nominal columns as
    dummies, and min-max scale everything except SalePrice.

    Returns the scaled DataFrame with the unscaled 'SalePrice' column reattached.
    """
    # Keep only numeric features whose correlation with SalePrice is >= 0.4.
    # numeric_only=True keeps this working on modern pandas, which otherwise
    # raises on object columns.
    corr_matrix = transformed_data.corr(numeric_only=True)
    sorted_corr = corr_matrix['SalePrice'].abs().sort_values()
    transformed_data.drop(sorted_corr[sorted_corr < 0.4].index, axis=1, inplace=True)

    # Remove 2 columns that are highly correlated with other feature columns.
    transformed_data.drop(['Garage Cars', '1st Flr SF'], axis=1, inplace=True)
    saleprice = transformed_data['SalePrice']
    transformed_data.drop(columns=['SalePrice'], inplace=True)

    # Candidate nominal (categorical) variables.
    nominal_variables = ['MS Zoning', 'Street', 'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config',
                         'Land Slope', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type', 'House Style', 'Roof Style',
                         'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Exter Qual', 'Exter Cond', 'Foundation',
                         'Heating', 'Heating QC', 'Central Air', 'Kitchen Qual', 'Functional', 'Paved Drive']

    # Remove columns with more than 7 unique categorical values.
    # BUG FIX: value_counts().shape is a tuple (compare the count, not the tuple),
    # and removing from the list while iterating it skips elements — iterate a copy.
    for col in list(nominal_variables):
        if transformed_data[col].nunique() > 7:
            transformed_data.drop(col, axis=1, inplace=True)
            nominal_variables.remove(col)

    # Remove columns whose single most common value covers more than 95% of rows.
    # BUG FIX: value_counts() is a Series; test its largest count, not the Series.
    for col in list(nominal_variables):
        if transformed_data[col].value_counts().iloc[0] > 0.95 * len(transformed_data):
            transformed_data.drop(col, axis=1, inplace=True)
            nominal_variables.remove(col)

    # Treat every remaining text column as categorical.
    nominal_variables = transformed_data.select_dtypes(include=['object']).columns
    for col in nominal_variables:
        transformed_data[col] = transformed_data[col].astype('category')

    # Convert nominal variables into dummy columns.
    # dtype=float so the dummies survive the arithmetic below (modern pandas
    # returns bool dummies, which cannot be subtracted).
    for col in nominal_variables:
        dummy_col = pd.get_dummies(transformed_data[col], dtype=float)
        transformed_data = pd.concat([transformed_data, dummy_col], axis=1)
        transformed_data.drop(col, axis=1, inplace=True)

    # Min-max scale all features to [0, 1].
    # BUG FIX: the denominator must be (max - min), not (min - max). The inverted
    # sign mapped every feature into [-1, 0] and blew up the reported RMSE.
    col_min = transformed_data.min()
    transformed_data = (transformed_data - col_min) / (transformed_data.max() - col_min)
    transformed_data['SalePrice'] = saleprice
    return transformed_data

def train_and_test(df, split_index=1460):
    """Fit a linear regression on the first `split_index` rows and return the RMSE
    on the remaining rows.

    Parameters:
        df — DataFrame containing numeric feature columns and a 'SalePrice' target.
        split_index — row index at which to split train/test (default 1460,
            matching the original hold-out split; parameterized for reuse).
    Returns the test RMSE as a float.

    NOTE(review): relies on LinearRegression and mean_squared_error being
    imported (presumably from scikit-learn) at the top of the file — confirm.
    """
    # Split into train and test partitions.
    train = df[:split_index]
    test = df[split_index:]

    # Select the numeric feature columns, excluding the target.
    columns_numerical = list(train.select_dtypes(['float', 'integer']).columns)
    columns_numerical.remove('SalePrice')
    target = 'SalePrice'

    # Fit the linear regression model and predict on the hold-out set.
    lr = LinearRegression()
    lr.fit(train[columns_numerical], train[target])
    test_prediction = lr.predict(test[columns_numerical])

    # RMSE = sqrt(mean squared error) on the hold-out set.
    mse = mean_squared_error(test[target], test_prediction)
    return np.sqrt(mse)

# Run the full pipeline: load -> clean -> select features -> train and score.
# NOTE(review): hard-coded absolute Windows path — consider a relative path or CLI argument.
data = pd.read_csv('C:/Users/Dhruv Pandya/Desktop/books and stuff/datasets/AmesHousing.txt', delimiter = '\t')
transformed_data = transform_features(data)
filtered_data = select_features(transformed_data)
rmse = train_and_test(filtered_data)
# BUG FIX: a bare `rmse` expression only displays in a notebook; print it so the
# script actually reports the result.
print(rmse)
``````

What I expected to happen:
I used min-max scaling in my version of the project, which otherwise followed the same procedure described in the project's missions. I added min-max scaling because I thought it would improve the estimation, since the dummy columns already have values ranging from 0 to 1. I expected my RMSE value to be comparable to the one obtained in the provided solution.

What actually happened:

The RMSE value calculated with min-max scaling came out extremely large. When I removed the min-max scaling from my solution, the RMSE value was similar to that of the provided solution.