SUP-2: Regression assistance needed
I have amended my code according to the instructions and i have reproduced it below, but i still get errors. Can someone please explain to me what the errors i get mean?
# Custom Classes and Functions
def display_df_info(df_name, my_df, v=False):
"""Convenience function to display information about a dataframe"""
print("Data: {}".format(df_name))
print("Shape (rows, cols) = {}".format(my_df.shape))
print("First few rows...")
print(my_df.head())
# Optional: Display other optional information with the (v)erbose flag
if v:
print("Dataframe Info:")
print(my_df.info())
class GetAge(BaseEstimator, TransformerMixin):
"""Custom Transformer: Calculate age (years only) relative to current year. Note that
the col values will be replaced but the original col name remains. When the transformer is
used in a pipeline, this is not an issue as the names are not used. However, if the data
from the pipeline is to be converted back to a DataFrame, then the col name change should
be done to reflect the correct data content."""
def fit(self, X, y=None):
return self
def transform(self, X):
current_year = int(d.datetime.now().year)
X['YearBuilt'] = X['YearBuilt'].apply(lambda i: (current_year - i))
"""TASK: Replace the 'YearBuilt' column values with the calculated age (subtract the
current year from the original values).
"""
return X
def main():
# DATA INPUT
############
file_path = "train.csv" #TASK: Modify to path of file
input_data = pd.read_csv(file_path, index_col = "Id")# TASK: Read in the input csv file using pandas
display_df_info("Raw Input", input_data)
# Seperate out the outcome variable from the loaded dataframe
output_var_name = 'SalePrice'
output_var = input_data[output_var_name]input_data.drop(output_var_name, axis=1, inplace=True)
# DATA ENGINEERING / MODEL DEFINITION
#####################################
# Subsetting the columns: define features to keep
feature_names = ["LotArea","YearBuilt","1stFlrSF","2ndFlrSF","FullBath","BedroomAbvGr","TotRmsAbvGrd","HouseStyle"]# TASK: Define the names of the columns to keep
features = input_data[feature_names]display_df_info('Features before Transform', features, v=True)
# Create the pipeline ...
# 1. Pre-processing
# Define variables made up of lists. Each list is a set of columns that will go through the same data transformations.
numerical_features = ["LotArea","YearBuilt","1stFlrSF","2ndFlrSF","FullBath","BedroomAbvGr","TotRmsAbvGrd"] # TASK: Define numerical column names
categorical_features = ["HouseStyle"] # TASK: Define categorical column names
"""TASK:
Define the data processing steps (transformers) to be applied to the numerical features in the dataset.
At a minimum, use 2 transformers: GetAge() and one other. Combine them using make_pipeline() or Pipeline()
"""
preprocess = make_column_transformer(
("""TASK: Define transformers""", numerical_features),
(OneHotEncoder(), categorical_features)
)
# 2. Combine pre-processing with ML algorithm
model = make_pipeline(
preprocess, LinearRegression()
# TASK : replace with ML algorithm from scikit
)
# TRAINING
##########
# Train/Test Split
"""TASK:
Split the data in test and train sets by completing the train_test_split function below. Define a random_state value so that
the experiment is repeatable.
"""
x_train, x_test, y_train, y_test = train_test_split(input_data, output_var, test_size = 0.3, random_state = 42) # TASK: Complete the code
# Train the pipeline
model.fit(x_train, y_train)
# Optional: Train with cross-validation and/or parameter grid search
# SCORING/EVALUATION
####################
# Fit the model on the test data
pred_test = model.predict(x_test)
# Display the results of the metrics
"""TASK:
Calculate the RMSE and Coeff of Determination between the actual and predicted sale prices.
Name your variables rmse and r2 respectively.
"""
rmse = np.sqrt(mean_squared_error(pred_test, y_test))
r2 = model.score(x_test, y_test)
print("Results on Test Data")
print("####################")
print("RMSE: {:.2f}".format(rmse))
print("R2 Score: {:.5f}".format(r2))
# Compare actual vs predicted values
"""TASK:
Create a new dataframe which combines the actual and predicted Sale Prices from the test dataset. You
may also add columns with other information such as difference, abs diff, %tage difference etc.
Name your variable compare
"""
compare = input_data
compare['SalePrice_Pred'] = pred_test
display_df_info('Actual vs Predicted Comparison', compare)
# Save the model
with open('my_model_lr.joblib', 'wb') as fo:
joblib.dump(model, fo)
if __name__ == '__main__':
main()
This is the error that i get:
--------------------------------------------------------------------------- TypeError Traceback (most recent call last)in 94 95 if __name__ == '__main__': ---> 96 main() in main() 52 53 # Train the pipeline ---> 54 model.fit(x_train, y_train) 55 56 # Optional: Train with cross-validation and/or parameter grid search /opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params) 328 """ 329 fit_params_steps = self._check_fit_params(**fit_params) --> 330 Xt = self._fit(X, y, **fit_params_steps) 331 with _print_elapsed_time('Pipeline', 332 self._log_message(len(self.steps) - 1)): /opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps) 290 cloned_transformer = clone(transformer) 291 # Fit or load from cache the current transformer --> 292 X, fitted_transformer = fit_transform_one_cached( 293 cloned_transformer, X, y, None, 294 message_clsname='Pipeline', /opt/anaconda3/lib/python3.8/site-packages/joblib/memory.py in __call__(self, *args, **kwargs) 350 351 def __call__(self, *args, **kwargs): --> 352 return self.func(*args, **kwargs) 353 354 def call_and_shelve(self, *args, **kwargs): /opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params) 738 with _print_elapsed_time(message_clsname, message): 739 if hasattr(transformer, 'fit_transform'): --> 740 res = transformer.fit_transform(X, y, **fit_params) 741 else: 742 res = transformer.fit(X, y, **fit_params).transform(X) /opt/anaconda3/lib/python3.8/site-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y) 525 # set n_features_in_ attribute 526 self._check_n_features(X, reset=True) --> 527 self._validate_transformers() 528 self._validate_column_callables(X) 529 self._validate_remainder(X) /opt/anaconda3/lib/python3.8/site-packages/sklearn/compose/_column_transformer.py in _validate_transformers(self) 285 if (not (hasattr(t, "fit") or hasattr(t, "fit_transform")) or not 286 hasattr(t, "transform")): --> 287 raise TypeError("All estimators should implement fit and " 288 "transform, or can be 'drop' or 'passthrough' " 289 "specifiers. '%s' (type %s) doesn't." % TypeError: All estimators should implement fit and transform, or can be 'drop' or 'passthrough' specifiers. 'TASK: Define transformers' (type ) doesn't.
Hi,
The error is because you have not used any transformer for the numerical features (see the preprocess variable in main). The task is to replace the string,

Latest Post: Welcome to the revamped Kelaberetiv! Our newest member: SLim Recent Posts Unread Posts Tags
Forum Icons: Forum contains no unread posts Forum contains unread posts
Topic Icons: Not Replied Replied Active Hot Sticky Unapproved Solved Private Closed