from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
import hvplot.pandas
np.random.seed(42)
%matplotlib inline
plt.rcParams['figure.figsize'] = (10,6)
Nov 12, 2020
Final Project Groups and Topic Ideas
https://canvas.upenn.edu/courses/1533812/discussion_topics/6292804
We'll load data compiled from two data sources:
# Load the data
data = pd.read_csv("./data/gdp_vs_satisfaction.csv")
data.head()
|   | Country | life_satisfaction | gdp_per_capita |
|---|---------|-------------------|----------------|
| 0 | Australia | 7.3 | 50961.87 |
| 1 | Austria | 7.1 | 43724.03 |
| 2 | Belgium | 6.9 | 40106.63 |
| 3 | Brazil | 6.4 | 8670.00 |
| 4 | Canada | 7.4 | 43331.96 |
# Linear models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
# Model selection
from sklearn.model_selection import train_test_split
# Pipelines
from sklearn.pipeline import make_pipeline
# Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
First step: set up the test/train split of input data:
# Do a 70/30 train/test split
train_set, test_set = train_test_split(data, test_size=0.3, random_state=42)
# Features
X_train = train_set['gdp_per_capita'].values
X_train = X_train[:, np.newaxis]
X_test = test_set['gdp_per_capita'].values
X_test = X_test[:, np.newaxis]
# Labels
y_train = train_set['life_satisfaction'].values
y_test = test_set['life_satisfaction'].values
As we increase the polynomial degree (add more and more polynomial features), two things happen: the training score keeps improving, while the test score gets worse and worse.
This is the classic case of overfitting: our model does not generalize well at all.
Ridge adds regularization to the linear regression least squares model. Remember, regularization penalizes large parameter values and complex fits.
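For reference, Ridge minimizes the usual least squares cost plus an L2 penalty on the coefficients, with the strength of the penalty controlled by the alpha keyword:

$$
\min_w \; \lVert X w - y \rVert_2^2 \;+\; \alpha \, \lVert w \rVert_2^2
$$

Larger alpha values shrink the coefficients toward zero and produce smoother, less complex fits.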
Let's gain some intuition:

- Important: as a baseline, we'll fit a LinearRegression model that scales the input features with StandardScaler
- We'll then fit Ridge models that apply both StandardScaler and PolynomialFeatures(degree=3) pre-processing to the features

Set up a grid of GDP per capita points to make predictions for:
# The values we want to predict (ranging from our min to max GDP per capita)
gdp_pred = np.linspace(1e3, 1.1e5, 100)
# Sklearn needs the second axis!
X_pred = gdp_pred[:, np.newaxis]
# Create a pre-processing pipeline
# This scales and adds polynomial features up to degree = 3
pipe = make_pipeline(StandardScaler(), PolynomialFeatures(degree=3))
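As a quick sanity check (not part of the original analysis), we can look at what the pipeline produces: for a single input column, PolynomialFeatures(degree=3) with its default include_bias=True yields four columns, a constant plus the scaled feature raised to powers 1 through 3.
# Sketch: inspect the transformed feature matrix produced by the pipeline
X_poly = pipe.fit_transform(X_train)
print(X_poly.shape)        # (n_samples, 4): bias, x, x^2, x^3
print(X_poly[:2].round(2)) # first two transformed rows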
# BASELINE: Setup and fit a linear model (with scaled features)
linear = LinearRegression()
scaler = StandardScaler()
linear.fit(scaler.fit_transform(X_train), y_train)
with plt.style.context("fivethirtyeight"):
    fig, ax = plt.subplots(figsize=(10, 6))

    ## Plot the data
    ax.scatter(
        data["gdp_per_capita"] / 1e5,
        data["life_satisfaction"],
        label="Data",
        s=100,
        zorder=10,
        color="#666666",
    )

    ## Evaluate the linear fit
    print("Linear fit")
    training_score = linear.score(scaler.fit_transform(X_train), y_train)
    print(f"Training Score = {training_score}")

    test_score = linear.score(scaler.fit_transform(X_test), y_test)
    print(f"Test Score = {test_score}")
    print()

    ## Plot the linear fit
    ax.plot(
        X_pred / 1e5,
        linear.predict(scaler.fit_transform(X_pred)),
        color="k",
        label="Linear fit",
    )

    ## Ridge regression: linear model with regularization
    # Plot the predicted values for each alpha
    for alpha in [0, 10, 100, 1e5]:
        print(f"alpha = {alpha}")

        # Create our Ridge model with this alpha
        ridge = Ridge(alpha=alpha)

        # Fit the model on the training set
        # NOTE: Use the pipeline that includes polynomial features
        ridge.fit(pipe.fit_transform(X_train), y_train)

        # Evaluate on the training set
        training_score = ridge.score(pipe.fit_transform(X_train), y_train)
        print(f"Training Score = {training_score}")

        # Evaluate on the test set
        test_score = ridge.score(pipe.fit_transform(X_test), y_test)
        print(f"Test Score = {test_score}")

        # Plot the ridge results
        y_pred = ridge.predict(pipe.fit_transform(X_pred))
        ax.plot(X_pred / 1e5, y_pred, label=f"alpha = {alpha}")

        print()

    # Plot formatting
    ax.legend(ncol=2, loc=0)
    ax.set_ylim(4, 8)
    ax.set_xlabel("GDP Per Capita ($\\times$ $10^5$)")
    ax.set_ylabel("Life Satisfaction")
Linear fit
Training Score = 0.4638100579740341
Test Score = 0.3595958514715957

alpha = 0
Training Score = 0.6458898101593085
Test Score = 0.5597457659851046

alpha = 10
Training Score = 0.5120282691427858
Test Score = 0.3833564210378835

alpha = 100
Training Score = 0.1815398751108913
Test Score = -0.05242399995626967

alpha = 100000.0
Training Score = 0.0020235571180508005
Test Score = -0.26129559971586125
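To see the shrinkage directly, we can inspect the fitted polynomial coefficients for each alpha (a quick sketch reusing the pipe defined above); the coefficients move toward zero as alpha grows:
# Sketch: how do the fitted Ridge coefficients change with alpha?
for alpha in [0, 10, 100, 1e5]:
    ridge = Ridge(alpha=alpha).fit(pipe.fit_transform(X_train), y_train)
    print(f"alpha = {alpha}: coefficients = {ridge.coef_.round(3)}")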
Note that the alpha = 0 case includes no regularization and is equivalent to fitting LinearRegression() with the polynomial features; as alpha increases, the regularization eventually washes out the fit entirely.
More feature engineering!
In this case, I've done the hard work for you and pulled additional country properties from the OECD's website.
data2 = pd.read_csv("./data/gdp_vs_satisfaction_more_features.csv")
data2.head()
|   | Country | life_satisfaction | GDP per capita | Air pollution | Employment rate | Feeling safe walking alone at night | Homicide rate | Life expectancy | Quality of support network | Voter turnout | Water quality |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Australia | 7.3 | 50961.87 | 5.0 | 73.0 | 63.5 | 1.1 | 82.5 | 95.0 | 91.0 | 93.0 |
| 1 | Austria | 7.1 | 43724.03 | 16.0 | 72.0 | 80.6 | 0.5 | 81.7 | 92.0 | 80.0 | 92.0 |
| 2 | Belgium | 6.9 | 40106.63 | 15.0 | 63.0 | 70.1 | 1.0 | 81.5 | 91.0 | 89.0 | 84.0 |
| 3 | Brazil | 6.4 | 8670.00 | 10.0 | 61.0 | 35.6 | 26.7 | 74.8 | 90.0 | 79.0 | 73.0 |
| 4 | Canada | 7.4 | 43331.96 | 7.0 | 73.0 | 82.2 | 1.3 | 81.9 | 93.0 | 68.0 | 91.0 |
We'll move beyond simple linear regression and see if we can get a better fit.
A decision tree learns decision rules from the input features:

- For a specific corner of the input feature space, the decision tree predicts an output target value
- Decision trees can be very deep with many nodes, which will lead to overfitting your dataset!
A random forest fits many decision trees, each on a random subset of the training data, and averages their predictions. This is an example of ensemble learning: combining multiple estimators to achieve better overall accuracy than any one model could achieve.
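As a rough illustration of the averaging idea (a sketch on hypothetical toy data, separate from the analysis below), we can mimic the bootstrap-and-average part of a random forest by hand with sklearn's DecisionTreeRegressor:
# Sketch: average several decision trees, each fit on a bootstrap sample of toy data
from sklearn.tree import DecisionTreeRegressor

rng = np.random.RandomState(42)
X_toy = np.sort(rng.uniform(0, 10, size=(100, 1)), axis=0)      # hypothetical feature
y_toy = np.sin(X_toy[:, 0]) + rng.normal(scale=0.3, size=100)   # noisy toy target

predictions = []
for i in range(10):
    # Bootstrap: sample rows with replacement
    idx = rng.randint(0, len(X_toy), size=len(X_toy))
    tree = DecisionTreeRegressor(max_depth=3, random_state=i)
    tree.fit(X_toy[idx], y_toy[idx])
    predictions.append(tree.predict(X_toy))

# The "forest" prediction is just the average over the individual trees
forest_pred = np.mean(predictions, axis=0)
Averaging many noisy trees reduces the variance of the final prediction, which is why a RandomForestRegressor usually generalizes better than a single deep tree.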
from sklearn.ensemble import RandomForestRegressor
Let's split our data into training and test sets again:
# Split the data 70/30
train_set, test_set = train_test_split(data2, test_size=0.3, random_state=42)
# the target labels
y_train = train_set["life_satisfaction"].values
y_test = test_set["life_satisfaction"].values
# The features
feature_cols = [col for col in data2.columns if col not in ["life_satisfaction", "Country"]]
X_train = train_set[feature_cols].values
X_test = test_set[feature_cols].values
import seaborn as sns
sns.heatmap(
train_set[feature_cols].corr(),
cmap="coolwarm",
annot=True,
vmin=-1,
vmax=1
)
<AxesSubplot:>
New: Pipelines support models as the last step!

- The Pipeline behaves just like a model, but it runs the transformations beforehand
- Call the fit() function of the pipeline instead of the model

Establish a baseline with a linear model:
# Linear model pipeline with two steps
linear_pipe = make_pipeline(StandardScaler(), LinearRegression())
# Fit the pipeline
# NEW: This applies all of the transformations, and then fits the model
print("Linear regression")
linear_pipe.fit(X_train, y_train)
# NEW: Print the training score
training_score = linear_pipe.score(X_train, y_train)
print(f"Training Score = {training_score}")
# NEW: Print the test score
test_score = linear_pipe.score(X_test, y_test)
print(f"Test Score = {test_score}")
Linear regression
Training Score = 0.7553332657461678
Test Score = 0.647886559044683
Now fit a random forest:
# Random forest model pipeline with two steps
forest_pipe = make_pipeline(
StandardScaler(), RandomForestRegressor(n_estimators=100, max_depth=2, random_state=42)
)
# Fit a random forest
print("Random forest")
forest_pipe.fit(X_train, y_train)
# Print the training score
training_score = forest_pipe.score(X_train, y_train)
print(f"Training Score = {training_score}")
# Print the test score
test_score = forest_pipe.score(X_test, y_test)
print(f"Test Score = {test_score}")
Random forest
Training Score = 0.8460559576380231
Test Score = 0.862756366860489
Because random forests are an ensemble method with multiple estimators, the algorithm can learn which features help improve the fit the most.

- The relative importance of each feature is stored in the feature_importances_ attribute
- It's available after calling fit()!

# What are the named steps?
forest_pipe.named_steps
{'standardscaler': StandardScaler(), 'randomforestregressor': RandomForestRegressor(max_depth=2, random_state=42)}
# Get the forest model
forest_model = forest_pipe['randomforestregressor']
forest_model.feature_importances_
array([0.67746013, 0.0038663 , 0.13108609, 0.06579352, 0.00985913, 0.01767323, 0.02635605, 0.00601776, 0.06188779])
# Make the dataframe
importance = pd.DataFrame(
{"Feature": feature_cols, "Importance": forest_model.feature_importances_}
).sort_values("Importance")
importance
|   | Feature | Importance |
|---|---------|------------|
| 1 | Air pollution | 0.003866 |
| 7 | Voter turnout | 0.006018 |
| 4 | Homicide rate | 0.009859 |
| 5 | Life expectancy | 0.017673 |
| 6 | Quality of support network | 0.026356 |
| 8 | Water quality | 0.061888 |
| 3 | Feeling safe walking alone at night | 0.065794 |
| 2 | Employment rate | 0.131086 |
| 0 | GDP per capita | 0.677460 |
# Plot
importance.hvplot.barh(
x="Feature", y="Importance", title="Does Money Make You Happier?"
)
For more information, see the scikit-learn docs.

The cross_val_score() function will automatically partition the training set into k folds, fit the model on each training fold, and return the score evaluated on the corresponding held-out fold. It takes a Pipeline object, the training features, and the training labels as arguments.
from sklearn.model_selection import cross_val_score
cross_val_score?
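Roughly speaking, cross_val_score() does something like the following under the hood (a simplified sketch using KFold; the real implementation adds scoring options, groups, and parallelism, and the helper name here is just for illustration):
# Sketch: a simplified version of what cross_val_score does internally
from sklearn.model_selection import KFold
from sklearn.base import clone

def manual_cross_val_score(pipeline, X, y, cv=3):
    scores = []
    for train_idx, val_idx in KFold(n_splits=cv).split(X):
        # Fit a fresh copy of the pipeline on this fold's training subset
        model = clone(pipeline)
        model.fit(X[train_idx], y[train_idx])
        # Score on the held-out fold
        scores.append(model.score(X[val_idx], y[val_idx]))
    return np.array(scores)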
# Make a linear pipeline
linear_pipe = make_pipeline(StandardScaler(), LinearRegression())
# Run the 3-fold cross validation
scores = cross_val_score(
linear_pipe,
X_train,
y_train,
cv=3,
)
# Report
print("R^2 scores = ", scores)
print("Scores mean = ", scores.mean())
print("Score std dev = ", scores.std())
R^2 scores = [ 0.02064625 -0.84773581 -0.53652985]
Scores mean = -0.4545398042994612
Score std dev = 0.35922474493059214
forest_pipe['randomforestregressor'].score?
Object `score` not found.
model = forest_pipe['randomforestregressor']
model.score?
# Make a random forest pipeline
forest_pipe = make_pipeline(
StandardScaler(), RandomForestRegressor(n_estimators=100, random_state=42)
)
# Run the 3-fold cross validation
scores = cross_val_score(
forest_pipe,
X_train,
y_train,
cv=3,
)
# Report
print("R^2 scores = ", scores)
print("Scores mean = ", scores.mean())
print("Score std dev = ", scores.std())
R^2 scores = [0.51950656 0.78033403 0.66526384]
Scores mean = 0.655034812029183
Score std dev = 0.10672774411781198
n_estimators is a model hyperparameter. Unlike the internal model parameters, which are learned from the training data (via the fit() method), hyperparameters must be chosen by the user before fitting.

Optimizing hyperparameters is when cross validation becomes very important: it lets us compare different settings without touching the test set.
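For example, we could compare a few n_estimators settings by hand with cross_val_score; this is a quick sketch of the kind of loop that the grid search introduced next automates for us:
# Sketch: manually compare a few hyperparameter values with 3-fold cross validation
for n in [10, 50, 100]:
    candidate = make_pipeline(
        StandardScaler(), RandomForestRegressor(n_estimators=n, random_state=42)
    )
    scores = cross_val_score(candidate, X_train, y_train, cv=3)
    print(f"n_estimators = {n}: mean CV score = {scores.mean():.3f}")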
from sklearn.model_selection import GridSearchCV
Let's do a search over the n_estimators parameter and the max_depth parameter:
# Create our regression pipeline
pipe = make_pipeline(StandardScaler(), RandomForestRegressor(random_state=42))
pipe
Pipeline(steps=[('standardscaler', StandardScaler()), ('randomforestregressor', RandomForestRegressor(random_state=42))])
Parameters of individual pipeline steps are referenced in the grid with the syntax "[step name]__[parameter name]" (note the double underscore):
model_step = "randomforestregressor"
param_grid = {
f"{model_step}__n_estimators": [5, 10, 15, 20, 30, 50, 100, 200],
f"{model_step}__max_depth": [2, 5, 7, 9, 13, 21, 33, 51],
}
param_grid
{'randomforestregressor__n_estimators': [5, 10, 15, 20, 30, 50, 100, 200], 'randomforestregressor__max_depth': [2, 5, 7, 9, 13, 21, 33, 51]}
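If you're ever unsure of the exact names to use, every tunable parameter of the pipeline can be listed with get_params() (a quick check, not part of the original analysis):
# List the valid parameter names for the grid search
list(pipe.get_params().keys())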
# Create the grid and use 3-fold CV
grid = GridSearchCV(pipe, param_grid, cv=3)
# Run the search
grid.fit(X_train, y_train)
GridSearchCV(cv=3, estimator=Pipeline(steps=[('standardscaler', StandardScaler()), ('randomforestregressor', RandomForestRegressor(random_state=42))]), param_grid={'randomforestregressor__max_depth': [2, 5, 7, 9, 13, 21, 33, 51], 'randomforestregressor__n_estimators': [5, 10, 15, 20, 30, 50, 100, 200]})
# The best estimator
grid.best_estimator_
Pipeline(steps=[('standardscaler', StandardScaler()), ('randomforestregressor', RandomForestRegressor(max_depth=7, random_state=42))])
# The best hyper parameters
grid.best_params_
{'randomforestregressor__max_depth': 7, 'randomforestregressor__n_estimators': 100}
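Beyond the single best combination, the full table of cross-validation results is stored on the grid object; as a quick sketch, you can load grid.cv_results_ into a DataFrame to compare all of the combinations:
# Sketch: inspect the full set of grid search results
results = pd.DataFrame(grid.cv_results_)
results[
    [
        "param_randomforestregressor__n_estimators",
        "param_randomforestregressor__max_depth",
        "mean_test_score",
    ]
].sort_values("mean_test_score", ascending=False).head()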
We'll define a helper utility function to calculate the accuracy in terms of the mean absolute percent error
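For reference, the function below uses the following definitions, where $y_i$ are the true test values and $\hat{y}_i$ the predictions:

$$
\mathrm{MAPE} = \frac{100}{n} \sum_{i=1}^{n} \frac{\lvert \hat{y}_i - y_i \rvert}{y_i},
\qquad
\mathrm{Accuracy} = 100 - \mathrm{MAPE}
$$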
def evaluate(model, X_test, y_test):
    """
    Given a model and test features/targets, print out the
    mean absolute error and accuracy
    """
    # Make the predictions
    predictions = model.predict(X_test)

    # Absolute error
    errors = abs(predictions - y_test)
    avg_error = np.mean(errors)

    # Mean absolute percentage error
    mape = 100 * np.mean(errors / y_test)

    # Accuracy
    accuracy = 100 - mape

    print("Model Performance")
    print(f"Average Absolute Error: {avg_error:0.4f}")
    print(f"Accuracy = {accuracy:0.2f}%.")

    return accuracy
# Setup the pipeline
linear = make_pipeline(StandardScaler(), LinearRegression())
# Fit on train set
linear.fit(X_train, y_train)
# Evaluate on test set
evaluate(linear, X_test, y_test)
Model Performance
Average Absolute Error: 0.3281
Accuracy = 94.93%.
94.92864894582036
# Initialize the pipeline
base_model = make_pipeline(StandardScaler(), RandomForestRegressor(random_state=42))
# Fit the training set
base_model.fit(X_train, y_train)
# Evaluate on the test set
base_accuracy = evaluate(base_model, X_test, y_test)
Model Performance
Average Absolute Error: 0.2291
Accuracy = 96.47%.
Small improvement!
# Evaluate the best random forest model
best_random = grid.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)
# What's the improvement?
improvement = 100 * (random_accuracy - base_accuracy) / base_accuracy
print(f'Improvement of {improvement:0.4f}%.')
Model Performance
Average Absolute Error: 0.2288
Accuracy = 96.47%.
Improvement of 0.0025%.
Key takeaways for model tuning: use cross_val_score to evaluate a model on the training set, and GridSearchCV to optimize its hyperparameters.
In this part, we'll use a random forest model and housing data from the Office of Property Assessment to predict residential sale prices in Philadelphia
Note: We'll focus on the first two components in this analysis (and in assignment #7)
Too often, these models perpetuate inequality: low-value homes are over-assessed and high-value homes are under-assessed
Let's download data for properties in Philadelphia that had their last sale during 2019.
Sources:
import carto2gpd
# the CARTO API url
carto_url = "https://phl.carto.com/api/v2/sql"
# The table name
table_name = "opa_properties_public"
# Only pull 2019 sales for single family residential properties
where = "sale_date >= '2019-01-01' and sale_date < '2020-01-01' and category_code_description = 'Single Family'"
# Run the query
salesRaw = carto2gpd.get(carto_url, table_name, where=where)
salesRaw.head()
|   | geometry | cartodb_id | assessment_date | basements | beginning_point | book_and_page | building_code | building_code_description | category_code | category_code_description | ... | type_heater | unfinished | unit | utility | view_type | year_built | year_built_estimate | zip_code | zoning | objectid |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | POINT (-75.14854 39.93144) | 9 | None | 0 | 54'7" E OF AMERICAN | 3547465 | R30 | ROW B/GAR 2 STY MASONRY | 1 | Single Family | ... | A | None | None | None | I | 1960 | Y | 191475336 | RSA5 | 782244809 |
| 1 | POINT (-75.14721 39.93033) | 21 | None | C | 40'4 1/2" W HOWARD ST | 3517081 | O50 | ROW 3 STY MASONRY | 1 | Single Family | ... | A | None | None | None | I | 1920 | Y | 191476128 | RSA5 | 782244835 |
| 2 | POINT (-75.14927 39.93033) | 54 | None | D | 182'11" W PHILIP | 3527524 | O50 | ROW 3 STY MASONRY | 1 | Single Family | ... | A | None | None | None | I | 1920 | None | 191476037 | RSA5 | 782244867 |
| 3 | POINT (-75.14871 39.92927) | 102 | None | C | 83'W 2ND ST | 3550739 | O30 | ROW 2 STY MASONRY | 1 | Single Family | ... | H | None | None | None | I | 1900 | Y | 191476003 | RSA5 | 782244971 |
| 4 | POINT (-75.14876 39.93011) | 116 | None | A | 28 FT W PHILIP | 3534307 | O30 | ROW 2 STY MASONRY | 1 | Single Family | ... | A | None | None | None | I | 1920 | Y | 191476012 | RSA5 | 782244873 |
5 rows × 77 columns
len(salesRaw)
25955
import missingno as msno
# We'll visualize the first half of columns
# and then the second half
ncol = len(salesRaw.columns)
fields1 = salesRaw.columns[:ncol//2]
fields2 = salesRaw.columns[ncol//2:]
# The first half of columns
msno.bar(salesRaw[fields1]);
# The second half of columns
msno.bar(salesRaw[fields2]);
# The feature columns we want to use
cols = [
"sale_price",
"total_livable_area",
"total_area",
"garage_spaces",
"fireplaces",
"number_of_bathrooms",
"number_of_bedrooms",
"number_stories",
"exterior_condition",
"zip_code",
]
# Trim to these columns and remove NaNs
sales = salesRaw[cols].dropna()
# Trim zip code to only the first five digits
sales['zip_code'] = sales['zip_code'].astype(str).str.slice(0, 5)
# Trim very low and very high sales
valid = (sales['sale_price'] > 3000) & (sales['sale_price'] < 1e6)
sales = sales.loc[valid]
len(sales)
20134
# Split the data 70/30
train_set, test_set = train_test_split(sales, test_size=0.3, random_state=42)
# the target labels: log of sale price
y_train = np.log(train_set["sale_price"])
y_test = np.log(test_set["sale_price"])
# The features
feature_cols = [
"total_livable_area",
"total_area",
"garage_spaces",
"fireplaces",
"number_of_bathrooms",
"number_of_bedrooms",
"number_stories",
]
X_train = train_set[feature_cols].values
X_test = test_set[feature_cols].values
# Make a random forest pipeline
forest = make_pipeline(
StandardScaler(), RandomForestRegressor(n_estimators=100, random_state=42)
)
# Run the 10-fold cross validation
scores = cross_val_score(
forest,
X_train,
y_train,
cv=10,
)
# Report
print("R^2 scores = ", scores)
print("Scores mean = ", scores.mean())
print("Score std dev = ", scores.std())
R^2 scores = [0.30717128 0.2842306 0.29924934 0.26703964 0.33972801 0.27736425 0.35993026 0.29179749 0.28254819 0.27334465]
Scores mean = 0.2982403717905361
Score std dev = 0.02849696303924513
# Fit on the training data
forest.fit(X_train, y_train)
# What's the test score?
forest.score(X_test, y_test)
0.3180679726974225
The test score is consistent with the cross-validation scores, so the model appears to generalize reasonably well, even though the overall fit is still modest.
Note: we should also be optimizing hyperparameters to see if we can find additional improvements!
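Because we trained on the log of the sale price, the model's predictions come back on the log scale; to interpret them in dollars we need to exponentiate. A quick sketch comparing a few test-set predictions to the actual prices:
# Sketch: convert log-scale predictions back to dollars for a few test samples
predicted_prices = np.exp(forest.predict(X_test[:5]))
actual_prices = test_set["sale_price"].values[:5]

for pred, actual in zip(predicted_prices, actual_prices):
    print(f"predicted = ${pred:,.0f}, actual = ${actual:,.0f}")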
# Extract the regressor from the pipeline
regressor = forest["randomforestregressor"]
# Create the data frame of importances
importance = pd.DataFrame(
{"Feature": feature_cols, "Importance": regressor.feature_importances_}
)
importance.hvplot.barh(x="Feature", y="Importance")