Forum

Notifications
Clear all

SUP-2: Regression assistance needed  

   RSS

0

I have amended my code according to the instructions and i have reproduced it below, but i still get errors. Can someone please explain to me what the errors i get mean?

 

# Custom Classes and Functions
def display_df_info(df_name, my_df, v=False):
"""Convenience function to display information about a dataframe"""

print("Data: {}".format(df_name))
print("Shape (rows, cols) = {}".format(my_df.shape))
print("First few rows...")
print(my_df.head())

# Optional: Display other optional information with the (v)erbose flag
if v:
print("Dataframe Info:")
print(my_df.info())

class GetAge(BaseEstimator, TransformerMixin):
"""Custom Transformer: Calculate age (years only) relative to current year. Note that
the col values will be replaced but the original col name remains. When the transformer is
used in a pipeline, this is not an issue as the names are not used. However, if the data
from the pipeline is to be converted back to a DataFrame, then the col name change should
be done to reflect the correct data content."""

def fit(self, X, y=None):
return self

def transform(self, X):
current_year = int(d.datetime.now().year)
X['YearBuilt'] = X['YearBuilt'].apply(lambda i: (current_year - i))

"""TASK: Replace the 'YearBuilt' column values with the calculated age (subtract the
current year from the original values).
"""

return X

 

def main():

# DATA INPUT
############
file_path = "train.csv" #TASK: Modify to path of file
input_data = pd.read_csv(file_path, index_col = "Id")# TASK: Read in the input csv file using pandas
display_df_info("Raw Input", input_data)

# Seperate out the outcome variable from the loaded dataframe
output_var_name = 'SalePrice'
output_var = input_data[output_var_name]input_data.drop(output_var_name, axis=1, inplace=True)

# DATA ENGINEERING / MODEL DEFINITION
#####################################

# Subsetting the columns: define features to keep
feature_names = ["LotArea","YearBuilt","1stFlrSF","2ndFlrSF","FullBath","BedroomAbvGr","TotRmsAbvGrd","HouseStyle"]# TASK: Define the names of the columns to keep
features = input_data[feature_names]display_df_info('Features before Transform', features, v=True)

# Create the pipeline ...
# 1. Pre-processing
# Define variables made up of lists. Each list is a set of columns that will go through the same data transformations.
numerical_features = ["LotArea","YearBuilt","1stFlrSF","2ndFlrSF","FullBath","BedroomAbvGr","TotRmsAbvGrd"] # TASK: Define numerical column names
categorical_features = ["HouseStyle"] # TASK: Define categorical column names

"""TASK:
Define the data processing steps (transformers) to be applied to the numerical features in the dataset.

At a minimum, use 2 transformers: GetAge() and one other. Combine them using make_pipeline() or Pipeline()
"""
preprocess = make_column_transformer(
("""TASK: Define transformers""", numerical_features),
(OneHotEncoder(), categorical_features)
)

# 2. Combine pre-processing with ML algorithm
model = make_pipeline(
preprocess, LinearRegression()
# TASK : replace with ML algorithm from scikit
)

# TRAINING
##########
# Train/Test Split
"""TASK:
Split the data in test and train sets by completing the train_test_split function below. Define a random_state value so that
the experiment is repeatable.
"""
x_train, x_test, y_train, y_test = train_test_split(input_data, output_var, test_size = 0.3, random_state = 42) # TASK: Complete the code

# Train the pipeline
model.fit(x_train, y_train)

# Optional: Train with cross-validation and/or parameter grid search

# SCORING/EVALUATION
####################
# Fit the model on the test data
pred_test = model.predict(x_test)

# Display the results of the metrics
"""TASK:
Calculate the RMSE and Coeff of Determination between the actual and predicted sale prices.

Name your variables rmse and r2 respectively.
"""
rmse = np.sqrt(mean_squared_error(pred_test, y_test))
r2 = model.score(x_test, y_test)

print("Results on Test Data")
print("####################")
print("RMSE: {:.2f}".format(rmse))
print("R2 Score: {:.5f}".format(r2))

# Compare actual vs predicted values
"""TASK:
Create a new dataframe which combines the actual and predicted Sale Prices from the test dataset. You
may also add columns with other information such as difference, abs diff, %tage difference etc.

Name your variable compare
"""
compare = input_data
compare['SalePrice_Pred'] = pred_test

display_df_info('Actual vs Predicted Comparison', compare)

# Save the model
with open('my_model_lr.joblib', 'wb') as fo:
joblib.dump(model, fo)

if __name__ == '__main__':
main()

 

 

2 Answers
0

This is the error that i get:

 

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
 in 
     94 
     95 if __name__ == '__main__':
---> 96     main()

 in main()
     52 
     53     # Train the pipeline
---> 54     model.fit(x_train, y_train)
     55 
     56     # Optional: Train with cross-validation and/or parameter grid search

/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
    328         """
    329         fit_params_steps = self._check_fit_params(**fit_params)
--> 330         Xt = self._fit(X, y, **fit_params_steps)
    331         with _print_elapsed_time('Pipeline',
    332                                  self._log_message(len(self.steps) - 1)):

/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps)
    290                 cloned_transformer = clone(transformer)
    291             # Fit or load from cache the current transformer
--> 292             X, fitted_transformer = fit_transform_one_cached(
    293                 cloned_transformer, X, y, None,
    294                 message_clsname='Pipeline',

/opt/anaconda3/lib/python3.8/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
    350 
    351     def __call__(self, *args, **kwargs):
--> 352         return self.func(*args, **kwargs)
    353 
    354     def call_and_shelve(self, *args, **kwargs):

/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    738     with _print_elapsed_time(message_clsname, message):
    739         if hasattr(transformer, 'fit_transform'):
--> 740             res = transformer.fit_transform(X, y, **fit_params)
    741         else:
    742             res = transformer.fit(X, y, **fit_params).transform(X)

/opt/anaconda3/lib/python3.8/site-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
    525         # set n_features_in_ attribute
    526         self._check_n_features(X, reset=True)
--> 527         self._validate_transformers()
    528         self._validate_column_callables(X)
    529         self._validate_remainder(X)

/opt/anaconda3/lib/python3.8/site-packages/sklearn/compose/_column_transformer.py in _validate_transformers(self)
    285             if (not (hasattr(t, "fit") or hasattr(t, "fit_transform")) or not
    286                     hasattr(t, "transform")):
--> 287                 raise TypeError("All estimators should implement fit and "
    288                                 "transform, or can be 'drop' or 'passthrough' "
    289                                 "specifiers. '%s' (type %s) doesn't." %

TypeError: All estimators should implement fit and transform, or can be 'drop' or 'passthrough' specifiers. 'TASK: Define transformers' (type ) doesn't.
0

Hi, 

The error is because you have not used any transformer for the numerical features (see the preprocess variable in main). The task is to replace the string,

"""TASK: Define transformers"""
 
with at least 2 operations, one of which is GetAge(), to apply to the numerical features. Once you have identified these operations, you can combine them using make_pipeline() and replace this comment string.
 
Hope that helps.
 
Azmi
Share:

Delete your account