Forum

Notifications
Clear all

[Solved] SUP-2 Regression exercise Errors in the console  

   RSS

0

Hi , this is regarding SUP-2 Regression exercise. I am using Spyder (python 3.7)

When I execute these codes, yet finished...I've got errors, how can i get rid of these?

 


# AI Singapore
# Regression 2 Exercise
# Exercise: Building a Regression job template

# 1. Import required libraries
import numpy as np
import pandas as pd
import datetime as d

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

import joblib

# Information on Data
# https://www.kaggle.com/c/home-data-for-ml-course/data

# Custom Classes and Functions
def display_df_info(df_name, my_df, v=False):
"""Convenience function to display information about a dataframe"""

print("Data: {}".format(df_name))
print("Shape (rows, cols) = {}".format(my_df.shape))
print("First few rows...")
print(my_df.head())

# Optional: Display other optional information with the (v)erbose flag
if v:
print("Dataframe Info:")
print(my_df.info())

class GetAge(BaseEstimator, TransformerMixin):
"""Custom Transformer: Calculate age (years only) relative to current year. Note that
the col values will be replaced but the original col name remains. When the transformer is
used in a pipeline, this is not an issue as the names are not used. However, if the data
from the pipeline is to be converted back to a DataFrame, then the col name change should
be done to reflect the correct data content."""

def fit(self, X, y=None):
return self

def transform(self,X):
current_year = int(d.datetime.now().year)

"""TASK: Replace the 'YearBuilt' column values with the calculated age (subtract the
current year from the original values).
"""
X.apply(lambda x: current_year - x)

return X

def main():

# DATA INPUT
############
file_path = "D:/Python/_regression/kaggle/test.csv"#TASK: Modify to path of file
input_data =pd.read_csv(file_path)# TASK: Read in the input csv file using pandas
display_df_info("Raw Input", input_data)

# Seperate out the outcome variable from the loaded dataframe
output_var_name = 'SalePrice'
output_var = input_data[output_var_name]
input_data.drop(output_var_name, axis=1, inplace=True)

# DATA ENGINEERING / MODEL DEFINITION
#####################################

# Subsetting the columns: define features to keep
feature_names = ['LotArea','YearBuilt','1stFlrSF','2ndFlrSF','FullBath','BedroomAbvGr','TotRmsAbvGrd','HouseStyle']# TASK: Define the names of the columns to keep
features = input_data[feature_names]
display_df_info('Features before Transform', features, v=True)
print(features.info())

# Create the pipeline ...
# 1. Pre-processing
# Define variables made up of lists. Each list is a set of columns that will go through the same data transformations.
numerical_features = ['LotArea','YearBuilt','1stFlrSF','2ndFlrSF','FullBath','BedroomAbvGr','TotRmsAbvGrd''LotArea','YearBuilt','1stFlrSF','2ndFlrSF','FullBath','BedroomAbvGr','TotRmsAbvGrd'] # TASK: Define numerical column names
categorical_features = ['HouseStyle'] # TASK: Define categorical column names

"""TASK:
Define the data processing steps (transformers) to be applied to the numerical features in the dataset.

At a minimum, use 2 transformers: GetAge() and one other. Combine them using make_pipeline() or Pipeline()
"""
preprocess = make_column_transformer(
(GetAge(), numerical_features),
(OneHotEncoder(), categorical_features)
)

# 2. Combine pre-processing with ML algorithm
model = make_pipeline(
preprocess,
# TASK : replace with ML algorithm from scikit
LinearRegression()
)

# TRAINING
##########
# Train/Test Split
"""TASK:
Split the data in test and train sets by completing the train_test_split function below. Define a random_state value so that
the experiment is repeatable.
"""
x_train, x_test, y_train, y_test = train_test_split(input_data, output_var, test_size= 0.3, random_state=42) # TASK: Complete the code

# Train the pipeline
model.fit(x_train, y_train)


 

Errors I am getting is here


runfile('D:/Python/_regression/kaggle/Ex_LinearRegression_start.py', wdir='D:/Python/_regression/kaggle')
Data: Raw Input
Shape (rows, cols) = (1459, 80)
First few rows...
Id MSSubClass MSZoning ... YrSold SaleType SaleCondition
0 1461 20 RH ... 2010 WD Normal
1 1462 20 RL ... 2010 WD Normal
2 1463 60 RL ... 2010 WD Normal
3 1464 60 RL ... 2010 WD Normal
4 1465 120 RL ... 2010 WD Normal

[5 rows x 80 columns]
Traceback (most recent call last):

File "D:\Python\lib\site-packages\pandas\core\indexes\base.py", line 2646, in get_loc
return self._engine.get_loc(key)

File "pandas\_libs\index.pyx", line 111, in pandas._libs.index.IndexEngine.get_loc

File "pandas\_libs\index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc

File "pandas\_libs\hashtable_class_helper.pxi", line 1618, in pandas._libs.hashtable.PyObjectHashTable.get_item

File "pandas\_libs\hashtable_class_helper.pxi", line 1626, in pandas._libs.hashtable.PyObjectHashTable.get_item

KeyError: 'SalePrice'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):

File "D:\Python\_regression\kaggle\Ex_LinearRegression_start.py", line 154, in
main()

File "D:\Python\_regression\kaggle\Ex_LinearRegression_start.py", line 68, in main
output_var = input_data[output_var_name]

File "D:\Python\lib\site-packages\pandas\core\frame.py", line 2800, in __getitem__
indexer = self.columns.get_loc(key)

File "D:\Python\lib\site-packages\pandas\core\indexes\base.py", line 2648, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))

File "pandas\_libs\index.pyx", line 111, in pandas._libs.index.IndexEngine.get_loc

File "pandas\_libs\index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc

File "pandas\_libs\hashtable_class_helper.pxi", line 1618, in pandas._libs.hashtable.PyObjectHashTable.get_item

File "pandas\_libs\hashtable_class_helper.pxi", line 1626, in pandas._libs.hashtable.PyObjectHashTable.get_item

KeyError: 'SalePrice'

 

1 Answer
1

Hi @zenchong!

It looks to me that you're using the test.csv from the dataset, which is why you have only 80 columns of data. You should be using train.csv, which has 81 columns, including the 'SalePrice' column that you're currently missing 😀  

Share:

Delete your account