# Candidate regressors, keyed by their display name.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'Support Vector Machine': SVR(),
    'K-Nearest Neighbors': KNeighborsRegressor()
}

reg_pipelines = {}
resultados_supervisado = []

# Scorers for 5-fold CV; sklearn reports error metrics negated ("neg_*"),
# so they are sign-flipped when collected below.
cv_scoring = {'R2': 'r2', 'MAE': 'neg_mean_absolute_error', 'RMSE': 'neg_root_mean_squared_error'}

for model_name, estimator in models.items():
    # Bundle preprocessing with the estimator so each CV fold re-fits the
    # preprocessor on its own training split (no leakage).
    pipe = Pipeline(steps=[('preprocessor', preprocessor), ('model', estimator)])
    # Final fit on the full training set — reused later for test-set predictions.
    pipe.fit(X_train, y_train)
    reg_pipelines[model_name] = pipe

    scores = cross_validate(pipe, X_train, y_train, cv=5, scoring=cv_scoring)
    resultados_supervisado.append({
        'Model': model_name,
        'R2 (train)': scores['test_R2'].mean(),
        'MAE (train)': -scores['test_MAE'].mean(),    # flip sign back to a positive error
        'RMSE (train)': -scores['test_RMSE'].mean(),  # flip sign back to a positive error
    })

df_train_results = pd.DataFrame(resultados_supervisado).set_index('Model')
# ---- Evaluation on the held-out test set ----
resultados_test = []
for model_name, fitted_pipe in reg_pipelines.items():
    y_pred = fitted_pipe.predict(X_test)
    resultados_test.append({
        'Model': model_name,
        'R2 (test)': r2_score(y_test, y_pred),
        'MAE (test)': mean_absolute_error(y_test, y_pred),
        'RMSE (test)': np.sqrt(mean_squared_error(y_test, y_pred)),
    })
df_test_results = pd.DataFrame(resultados_test).set_index('Model')

# Combine test and CV metrics. Both frames are indexed by 'Model', so use
# `join`, which aligns on the index and keeps 'Model' as the index.
# (The previous `merge(on='Model')` targeted an index level, which is fragile
# and can demote the key to a plain column — breaking index-based plotting.)
df_results = df_test_results.join(df_train_results)
# ---- Grouped bar charts comparing train (CV) vs test per metric ----
fig = make_subplots(rows=1, cols=3, subplot_titles=("R2 Score", "MAE", "RMSE"))
for panel, metric in enumerate(['R2', 'MAE', 'RMSE'], start=1):
    # One train bar and one test bar per model, side by side in each panel.
    for split in ('train', 'test'):
        fig.add_trace(
            go.Bar(
                name=f'{metric} {split}',
                x=df_results.index,
                y=df_results[f'{metric} ({split})'],
            ),
            row=1, col=panel,
        )
fig.update_layout(title_text="Model Comparison", barmode='group')
fig.show()
display(df_results)