from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
import hvplot.pandas
np.random.seed(42)
%matplotlib inline
plt.rcParams['figure.figsize'] = (10,6)
Nov 12, 2020
Final Project Groups and Topic Ideas
https://canvas.upenn.edu/courses/1533812/discussion_topics/6292804
We'll load data compiled from two data sources:
# Load the data
data = pd.read_csv("./data/gdp_vs_satisfaction.csv")
data.head()
|   | Country | life_satisfaction | gdp_per_capita |
|---|---------|-------------------|----------------|
| 0 | Australia | 7.3 | 50961.87 |
| 1 | Austria | 7.1 | 43724.03 |
| 2 | Belgium | 6.9 | 40106.63 |
| 3 | Brazil | 6.4 | 8670.00 |
| 4 | Canada | 7.4 | 43331.96 |
# Linear models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
# Model selection
from sklearn.model_selection import train_test_split
# Pipelines
from sklearn.pipeline import make_pipeline
# Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
First step: set up the test/train split of input data:
# Do a 70/30 train/test split
train_set, test_set = train_test_split(data, test_size=0.3, random_state=42)
# Features
X_train = train_set['gdp_per_capita'].values
X_train = X_train[:, np.newaxis]
X_test = test_set['gdp_per_capita'].values
X_test = X_test[:, np.newaxis]
# Labels
y_train = train_set['life_satisfaction'].values
y_test = test_set['life_satisfaction'].values
As we increase the polynomial degree (add more and more polynomial features), two things happen: the training score keeps improving, while the test score gets worse and worse.
This is the classic case of overfitting: our model does not generalize well at all.
Ridge adds regularization to the linear regression least squares model. Remember, regularization penalizes large parameter values and complex fits.
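For reference, Ridge minimizes the usual least squares cost plus an L2 penalty on the coefficients, with the strength of the penalty controlled by the alpha keyword:

$$
\min_w \; \lVert X w - y \rVert_2^2 \;+\; \alpha \, \lVert w \rVert_2^2
$$

Larger alpha values shrink the coefficients toward zero and produce smoother, less complex fits.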
Let's gain some intuition:

- Important: as a baseline, we'll fit a LinearRegression model that scales the input features with StandardScaler
- We'll then fit Ridge models that apply both StandardScaler and PolynomialFeatures(degree=3) pre-processing to the features

Set up a grid of GDP per capita points to make predictions for:
# The values we want to predict (ranging from our min to max GDP per capita)
gdp_pred = np.linspace(1e3, 1.1e5, 100)
# Sklearn needs the second axis!
X_pred = gdp_pred[:, np.newaxis]
# Create a pre-processing pipeline
# This scales and adds polynomial features up to degree = 3
pipe = make_pipeline(StandardScaler(), PolynomialFeatures(degree=3))
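As a quick sanity check (not part of the original analysis), we can look at what the pipeline produces: for a single input column, PolynomialFeatures(degree=3) with its default include_bias=True yields four columns, a constant plus the scaled feature raised to powers 1 through 3.
# Sketch: inspect the transformed feature matrix produced by the pipeline
X_poly = pipe.fit_transform(X_train)
print(X_poly.shape)        # (n_samples, 4): bias, x, x^2, x^3
print(X_poly[:2].round(2)) # first two transformed rows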
# BASELINE: Setup and fit a linear model (with scaled features)
linear = LinearRegression()
scaler = StandardScaler()
linear.fit(scaler.fit_transform(X_train), y_train)
with plt.style.context("fivethirtyeight"):
    fig, ax = plt.subplots(figsize=(10, 6))

    ## Plot the data
    ax.scatter(
        data["gdp_per_capita"] / 1e5,
        data["life_satisfaction"],
        label="Data",
        s=100,
        zorder=10,
        color="#666666",
    )

    ## Evaluate the linear fit
    print("Linear fit")
    training_score = linear.score(scaler.fit_transform(X_train), y_train)
    print(f"Training Score = {training_score}")

    test_score = linear.score(scaler.fit_transform(X_test), y_test)
    print(f"Test Score = {test_score}")
    print()

    ## Plot the linear fit
    ax.plot(
        X_pred / 1e5,
        linear.predict(scaler.fit_transform(X_pred)),
        color="k",
        label="Linear fit",
    )

    ## Ridge regression: linear model with regularization
    # Plot the predicted values for each alpha
    for alpha in [0, 10, 100, 1e5]:
        print(f"alpha = {alpha}")

        # Create our Ridge model with this alpha
        ridge = Ridge(alpha=alpha)

        # Fit the model on the training set
        # NOTE: Use the pipeline that includes polynomial features
        ridge.fit(pipe.fit_transform(X_train), y_train)

        # Evaluate on the training set
        training_score = ridge.score(pipe.fit_transform(X_train), y_train)
        print(f"Training Score = {training_score}")

        # Evaluate on the test set
        test_score = ridge.score(pipe.fit_transform(X_test), y_test)
        print(f"Test Score = {test_score}")

        # Plot the ridge results
        y_pred = ridge.predict(pipe.fit_transform(X_pred))
        ax.plot(X_pred / 1e5, y_pred, label=f"alpha = {alpha}")

        print()

    # Plot formatting
    ax.legend(ncol=2, loc=0)
    ax.set_ylim(4, 8)
    ax.set_xlabel("GDP Per Capita ($\\times$ $10^5$)")
    ax.set_ylabel("Life Satisfaction")
Linear fit
Training Score = 0.4638100579740341
Test Score = 0.3595958514715957

alpha = 0
Training Score = 0.6458898101593085
Test Score = 0.5597457659851046

alpha = 10
Training Score = 0.5120282691427858
Test Score = 0.3833564210378835

alpha = 100
Training Score = 0.1815398751108913
Test Score = -0.05242399995626967

alpha = 100000.0
Training Score = 0.0020235571180508005
Test Score = -0.26129559971586125
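To see the shrinkage directly, we can inspect the fitted polynomial coefficients for each alpha (a quick sketch reusing the pipe defined above); the coefficients move toward zero as alpha grows:
# Sketch: how do the fitted Ridge coefficients change with alpha?
for alpha in [0, 10, 100, 1e5]:
    ridge = Ridge(alpha=alpha).fit(pipe.fit_transform(X_train), y_train)
    print(f"alpha = {alpha}: coefficients = {ridge.coef_.round(3)}")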
Note that the alpha = 0 case includes no regularization and is equivalent to fitting LinearRegression() with the polynomial features; as alpha increases, the regularization eventually washes out the fit entirely.
More feature engineering!
In this case, I've done the hard work for you and pulled additional country properties from the OECD's website.
data2 = pd.read_csv("./data/gdp_vs_satisfaction_more_features.csv")
data2.head()
|   | Country | life_satisfaction | GDP per capita | Air pollution | Employment rate | Feeling safe walking alone at night | Homicide rate | Life expectancy | Quality of support network | Voter turnout | Water quality |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Australia | 7.3 | 50961.87 | 5.0 | 73.0 | 63.5 | 1.1 | 82.5 | 95.0 | 91.0 | 93.0 |
| 1 | Austria | 7.1 | 43724.03 | 16.0 | 72.0 | 80.6 | 0.5 | 81.7 | 92.0 | 80.0 | 92.0 |
| 2 | Belgium | 6.9 | 40106.63 | 15.0 | 63.0 | 70.1 | 1.0 | 81.5 | 91.0 | 89.0 | 84.0 |
| 3 | Brazil | 6.4 | 8670.00 | 10.0 | 61.0 | 35.6 | 26.7 | 74.8 | 90.0 | 79.0 | 73.0 |
| 4 | Canada | 7.4 | 43331.96 | 7.0 | 73.0 | 82.2 | 1.3 | 81.9 | 93.0 | 68.0 | 91.0 |
We'll move beyond simple linear regression and see if we can get a better fit.
A decision tree learns decision rules from the input features:

- For a specific corner of the input feature space, the decision tree predicts an output target value
- Decision trees can be very deep with many nodes, which will lead to overfitting your dataset!
A random forest fits many decision trees, each on a random subset of the training data, and averages their predictions. This is an example of ensemble learning: combining multiple estimators to achieve better overall accuracy than any one model could achieve.
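As a rough illustration of the averaging idea (a sketch on hypothetical toy data, separate from the analysis below), we can mimic the bootstrap-and-average part of a random forest by hand with sklearn's DecisionTreeRegressor:
# Sketch: average several decision trees, each fit on a bootstrap sample of toy data
from sklearn.tree import DecisionTreeRegressor

rng = np.random.RandomState(42)
X_toy = np.sort(rng.uniform(0, 10, size=(100, 1)), axis=0)      # hypothetical feature
y_toy = np.sin(X_toy[:, 0]) + rng.normal(scale=0.3, size=100)   # noisy toy target

predictions = []
for i in range(10):
    # Bootstrap: sample rows with replacement
    idx = rng.randint(0, len(X_toy), size=len(X_toy))
    tree = DecisionTreeRegressor(max_depth=3, random_state=i)
    tree.fit(X_toy[idx], y_toy[idx])
    predictions.append(tree.predict(X_toy))

# The "forest" prediction is just the average over the individual trees
forest_pred = np.mean(predictions, axis=0)
Averaging many noisy trees reduces the variance of the final prediction, which is why a RandomForestRegressor usually generalizes better than a single deep tree.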
from sklearn.ensemble import RandomForestRegressor
Let's split our data into training and test sets again:
# Split the data 70/30
train_set, test_set = train_test_split(data2, test_size=0.3, random_state=42)
# the target labels
y_train = train_set["life_satisfaction"].values
y_test = test_set["life_satisfaction"].values
# The features
feature_cols = [col for col in data2.columns if col not in ["life_satisfaction", "Country"]]
X_train = train_set[feature_cols].values
X_test = test_set[feature_cols].values
import seaborn as sns
sns.heatmap(
train_set[feature_cols].corr(),
cmap="coolwarm",
annot=True,
vmin=-1,
vmax=1
)
<AxesSubplot:>
New: Pipelines support models as the last step!

- The Pipeline behaves just like a model, but it runs the transformations beforehand
- Call the fit() function of the pipeline instead of the model

Establish a baseline with a linear model:
# Linear model pipeline with two steps
linear_pipe = make_pipeline(StandardScaler(), LinearRegression())
# Fit the pipeline
# NEW: This applies all of the transformations, and then fits the model
print("Linear regression")
linear_pipe.fit(X_train, y_train)
# NEW: Print the training score
training_score = linear_pipe.score(X_train, y_train)
print(f"Training Score = {training_score}")
# NEW: Print the test score
test_score = linear_pipe.score(X_test, y_test)
print(f"Test Score = {test_score}")
Linear regression
Training Score = 0.7553332657461678
Test Score = 0.647886559044683
Now fit a random forest:
# Random forest model pipeline with two steps
forest_pipe = make_pipeline(
StandardScaler(), RandomForestRegressor(n_estimators=100, max_depth=2, random_state=42)
)
# Fit a random forest
print("Random forest")
forest_pipe.fit(X_train, y_train)
# Print the training score
training_score = forest_pipe.score(X_train, y_train)
print(f"Training Score = {training_score}")
# Print the test score
test_score = forest_pipe.score(X_test, y_test)
print(f"Test Score = {test_score}")
Random forest
Training Score = 0.8460559576380231
Test Score = 0.862756366860489
Because random forests are an ensemble method with multiple estimators, the algorithm can learn which features help improve the fit the most.

- The relative importance of each feature is stored in the feature_importances_ attribute
- It's available after calling fit()!

# What are the named steps?
forest_pipe.named_steps
{'standardscaler': StandardScaler(), 'randomforestregressor': RandomForestRegressor(max_depth=2, random_state=42)}
# Get the forest model
forest_model = forest_pipe['randomforestregressor']
forest_model.feature_importances_
array([0.67746013, 0.0038663 , 0.13108609, 0.06579352, 0.00985913, 0.01767323, 0.02635605, 0.00601776, 0.06188779])
# Make the dataframe
importance = pd.DataFrame(
{"Feature": feature_cols, "Importance": forest_model.feature_importances_}
).sort_values("Importance")
importance
|   | Feature | Importance |
|---|---------|------------|
| 1 | Air pollution | 0.003866 |
| 7 | Voter turnout | 0.006018 |
| 4 | Homicide rate | 0.009859 |
| 5 | Life expectancy | 0.017673 |
| 6 | Quality of support network | 0.026356 |
| 8 | Water quality | 0.061888 |
| 3 | Feeling safe walking alone at night | 0.065794 |
| 2 | Employment rate | 0.131086 |
| 0 | GDP per capita | 0.677460 |
# Plot
importance.hvplot.barh(
x="Feature", y="Importance", title="Does Money Make You Happier?"
)
For more information, see the scikit-learn docs.

The cross_val_score() function will automatically partition the training set into k folds, fit the model on each training fold, and return the score evaluated on the corresponding held-out fold. It takes a Pipeline object, the training features, and the training labels as arguments.
from sklearn.model_selection import cross_val_score
cross_val_score?
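Roughly speaking, cross_val_score() does something like the following under the hood (a simplified sketch using KFold; the real implementation adds scoring options, groups, and parallelism, and the helper name here is just for illustration):
# Sketch: a simplified version of what cross_val_score does internally
from sklearn.model_selection import KFold
from sklearn.base import clone

def manual_cross_val_score(pipeline, X, y, cv=3):
    scores = []
    for train_idx, val_idx in KFold(n_splits=cv).split(X):
        # Fit a fresh copy of the pipeline on this fold's training subset
        model = clone(pipeline)
        model.fit(X[train_idx], y[train_idx])
        # Score on the held-out fold
        scores.append(model.score(X[val_idx], y[val_idx]))
    return np.array(scores)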
# Make a linear pipeline
linear_pipe = make_pipeline(StandardScaler(), LinearRegression())
# Run the 3-fold cross validation
scores = cross_val_score(
linear_pipe,
X_train,
y_train,
cv=3,
)
# Report
print("R^2 scores = ", scores)
print("Scores mean = ", scores.mean())
print("Score std dev = ", scores.std())
R^2 scores = [ 0.02064625 -0.84773581 -0.53652985]
Scores mean = -0.4545398042994612
Score std dev = 0.35922474493059214
forest_pipe['randomforestregressor'].score?
Object `score` not found.
model = forest_pipe['randomforestregressor']
model.score?
# Make a random forest pipeline
forest_pipe = make_pipeline(
StandardScaler(), RandomForestRegressor(n_estimators=100, random_state=42)
)
# Run the 3-fold cross validation
scores = cross_val_score(
forest_pipe,
X_train,
y_train,
cv=3,
)
# Report
print("R^2 scores = ", scores)
print("Scores mean = ", scores.mean())
print("Score std dev = ", scores.std())
R^2 scores = [0.51950656 0.78033403 0.66526384]
Scores mean = 0.655034812029183
Score std dev = 0.10672774411781198
n_estimators is a model hyperparameter. Unlike the internal model parameters, which are learned from the training data (via the fit() method), hyperparameters must be chosen by the user before fitting.

Optimizing hyperparameters is when cross validation becomes very important: it lets us compare different settings without touching the test set.
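For example, we could compare a few n_estimators settings by hand with cross_val_score; this is a quick sketch of the kind of loop that the grid search introduced next automates for us:
# Sketch: manually compare a few hyperparameter values with 3-fold cross validation
for n in [10, 50, 100]:
    candidate = make_pipeline(
        StandardScaler(), RandomForestRegressor(n_estimators=n, random_state=42)
    )
    scores = cross_val_score(candidate, X_train, y_train, cv=3)
    print(f"n_estimators = {n}: mean CV score = {scores.mean():.3f}")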
from sklearn.model_selection import GridSearchCV
Let's do a search over the n_estimators parameter and the max_depth parameter:
# Create our regression pipeline
pipe = make_pipeline(StandardScaler(), RandomForestRegressor(random_state=42))
pipe
Pipeline(steps=[('standardscaler', StandardScaler()), ('randomforestregressor', RandomForestRegressor(random_state=42))])
Parameters of individual pipeline steps are referenced in the grid with the syntax "[step name]__[parameter name]" (note the double underscore):
model_step = "randomforestregressor"
param_grid = {
f"{model_step}__n_estimators": [5, 10, 15, 20, 30, 50, 100, 200],
f"{model_step}__max_depth": [2, 5, 7, 9, 13, 21, 33, 51],
}
param_grid
{'randomforestregressor__n_estimators': [5, 10, 15, 20, 30, 50, 100, 200], 'randomforestregressor__max_depth': [2, 5, 7, 9, 13, 21, 33, 51]}
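If you're ever unsure of the exact names to use, every tunable parameter of the pipeline can be listed with get_params() (a quick check, not part of the original analysis):
# List the valid parameter names for the grid search
list(pipe.get_params().keys())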
# Create the grid and use 3-fold CV
grid = GridSearchCV(pipe, param_grid, cv=3)
# Run the search
grid.fit(X_train, y_train)
GridSearchCV(cv=3, estimator=Pipeline(steps=[('standardscaler', StandardScaler()), ('randomforestregressor', RandomForestRegressor(random_state=42))]), param_grid={'randomforestregressor__max_depth': [2, 5, 7, 9, 13, 21, 33, 51], 'randomforestregressor__n_estimators': [5, 10, 15, 20, 30, 50, 100, 200]})
# The best estimator
grid.best_estimator_
Pipeline(steps=[('standardscaler', StandardScaler()), ('randomforestregressor', RandomForestRegressor(max_depth=7, random_state=42))])
# The best hyper parameters
grid.best_params_
{'randomforestregressor__max_depth': 7, 'randomforestregressor__n_estimators': 100}
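Beyond the single best combination, the full table of cross-validation results is stored on the grid object; as a quick sketch, you can load grid.cv_results_ into a DataFrame to compare all of the combinations:
# Sketch: inspect the full set of grid search results
results = pd.DataFrame(grid.cv_results_)
results[
    [
        "param_randomforestregressor__n_estimators",
        "param_randomforestregressor__max_depth",
        "mean_test_score",
    ]
].sort_values("mean_test_score", ascending=False).head()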
We'll define a helper utility function to calculate the accuracy in terms of the mean absolute percent error
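For reference, the function below uses the following definitions, where $y_i$ are the true test values and $\hat{y}_i$ the predictions:

$$
\mathrm{MAPE} = \frac{100}{n} \sum_{i=1}^{n} \frac{\lvert \hat{y}_i - y_i \rvert}{y_i},
\qquad
\mathrm{Accuracy} = 100 - \mathrm{MAPE}
$$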
def evaluate(model, X_test, y_test):
    """
    Given a model and test features/targets, print out the
    mean absolute error and accuracy
    """
    # Make the predictions
    predictions = model.predict(X_test)

    # Absolute error
    errors = abs(predictions - y_test)
    avg_error = np.mean(errors)

    # Mean absolute percentage error
    mape = 100 * np.mean(errors / y_test)

    # Accuracy
    accuracy = 100 - mape

    print("Model Performance")
    print(f"Average Absolute Error: {avg_error:0.4f}")
    print(f"Accuracy = {accuracy:0.2f}%.")

    return accuracy
# Setup the pipeline
linear = make_pipeline(StandardScaler(), LinearRegression())
# Fit on train set
linear.fit(X_train, y_train)
# Evaluate on test set
evaluate(linear, X_test, y_test)
Model Performance
Average Absolute Error: 0.3281
Accuracy = 94.93%.
94.92864894582036
# Initialize the pipeline
base_model = make_pipeline(StandardScaler(), RandomForestRegressor(random_state=42))
# Fit the training set
base_model.fit(X_train, y_train)
# Evaluate on the test set
base_accuracy = evaluate(base_model, X_test, y_test)
Model Performance
Average Absolute Error: 0.2291
Accuracy = 96.47%.
Small improvement!
# Evaluate the best random forest model
best_random = grid.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)
# What's the improvement?
improvement = 100 * (random_accuracy - base_accuracy) / base_accuracy
print(f'Improvement of {improvement:0.4f}%.')
Model Performance
Average Absolute Error: 0.2288
Accuracy = 96.47%.
Improvement of 0.0025%.
Key takeaways for model tuning: use cross_val_score to evaluate a model on the training set, and GridSearchCV to optimize its hyperparameters.
In this part, we'll use a random forest model and housing data from the Office of Property Assessment to predict residential sale prices in Philadelphia
Note: We'll focus on the first two components in this analysis (and in assignment #7)
Too often, these models perpetuate inequality: low-value homes are over-assessed and high-value homes are under-assessed
Let's download data for properties in Philadelphia that had their last sale during 2019.
Sources:
import carto2gpd
# the CARTO API url
carto_url = "https://phl.carto.com/api/v2/sql"
# The table name
table_name = "opa_properties_public"
# Only pull 2019 sales for single family residential properties
where = "sale_date >= '2019-01-01' and sale_date < '2020-01-01' and category_code_description = 'Single Family'"
# Run the query
salesRaw = carto2gpd.get(carto_url, table_name, where=where)
salesRaw.head()
|   | geometry | cartodb_id | assessment_date | basements | beginning_point | book_and_page | building_code | building_code_description | category_code | category_code_description | ... | type_heater | unfinished | unit | utility | view_type | year_built | year_built_estimate | zip_code | zoning | objectid |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | POINT (-75.14854 39.93144) | 9 | None | 0 | 54'7" E OF AMERICAN | 3547465 | R30 | ROW B/GAR 2 STY MASONRY | 1 | Single Family | ... | A | None | None | None | I | 1960 | Y | 191475336 | RSA5 | 782244809 |
| 1 | POINT (-75.14721 39.93033) | 21 | None | C | 40'4 1/2" W HOWARD ST | 3517081 | O50 | ROW 3 STY MASONRY | 1 | Single Family | ... | A | None | None | None | I | 1920 | Y | 191476128 | RSA5 | 782244835 |
| 2 | POINT (-75.14927 39.93033) | 54 | None | D | 182'11" W PHILIP | 3527524 | O50 | ROW 3 STY MASONRY | 1 | Single Family | ... | A | None | None | None | I | 1920 | None | 191476037 | RSA5 | 782244867 |
| 3 | POINT (-75.14871 39.92927) | 102 | None | C | 83'W 2ND ST | 3550739 | O30 | ROW 2 STY MASONRY | 1 | Single Family | ... | H | None | None | None | I | 1900 | Y | 191476003 | RSA5 | 782244971 |
| 4 | POINT (-75.14876 39.93011) | 116 | None | A | 28 FT W PHILIP | 3534307 | O30 | ROW 2 STY MASONRY | 1 | Single Family | ... | A | None | None | None | I | 1920 | Y | 191476012 | RSA5 | 782244873 |
5 rows × 77 columns
len(salesRaw)
25955
import missingno as msno
# We'll visualize the first half of columns
# and then the second half
ncol = len(salesRaw.columns)
fields1 = salesRaw.columns[:ncol//2]
fields2 = salesRaw.columns[ncol//2:]
# The first half of columns
msno.bar(salesRaw[fields1]);
# The second half of columns
msno.bar(salesRaw[fields2]);
# The feature columns we want to use
cols = [
"sale_price",
"total_livable_area",
"total_area",
"garage_spaces",
"fireplaces",
"number_of_bathrooms",
"number_of_bedrooms",
"number_stories",
"exterior_condition",
"zip_code",
]
# Trim to these columns and remove NaNs
sales = salesRaw[cols].dropna()
# Trim zip code to only the first five digits
sales['zip_code'] = sales['zip_code'].astype(str).str.slice(0, 5)
# Trim very low and very high sales
valid = (sales['sale_price'] > 3000) & (sales['sale_price'] < 1e6)
sales = sales.loc[valid]
len(sales)
20134
# Split the data 70/30
train_set, test_set = train_test_split(sales, test_size=0.3, random_state=42)
# the target labels: log of sale price
y_train = np.log(train_set["sale_price"])
y_test = np.log(test_set["sale_price"])
# The features
feature_cols = [
"total_livable_area",
"total_area",
"garage_spaces",
"fireplaces",
"number_of_bathrooms",
"number_of_bedrooms",
"number_stories",
]
X_train = train_set[feature_cols].values
X_test = test_set[feature_cols].values
# Make a random forest pipeline
forest = make_pipeline(
StandardScaler(), RandomForestRegressor(n_estimators=100, random_state=42)
)
# Run the 10-fold cross validation
scores = cross_val_score(
forest,
X_train,
y_train,
cv=10,
)
# Report
print("R^2 scores = ", scores)
print("Scores mean = ", scores.mean())
print("Score std dev = ", scores.std())
R^2 scores = [0.30717128 0.2842306 0.29924934 0.26703964 0.33972801 0.27736425 0.35993026 0.29179749 0.28254819 0.27334465]
Scores mean = 0.2982403717905361
Score std dev = 0.02849696303924513
# Fit on the training data
forest.fit(X_train, y_train)
# What's the test score?
forest.score(X_test, y_test)
0.3180679726974225
The test score is consistent with the cross-validation scores, so the model appears to generalize reasonably well, even though the overall fit is still modest.
Note: we should also be optimizing hyperparameters to see if we can find additional improvements!
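Because we trained on the log of the sale price, the model's predictions come back on the log scale; to interpret them in dollars we need to exponentiate. A quick sketch comparing a few test-set predictions to the actual prices:
# Sketch: convert log-scale predictions back to dollars for a few test samples
predicted_prices = np.exp(forest.predict(X_test[:5]))
actual_prices = test_set["sale_price"].values[:5]

for pred, actual in zip(predicted_prices, actual_prices):
    print(f"predicted = ${pred:,.0f}, actual = ${actual:,.0f}")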
# Extract the regressor from the pipeline
regressor = forest["randomforestregressor"]
# Create the data frame of importances
importance = pd.DataFrame(
{"Feature": feature_cols, "Importance": regressor.feature_importances_}
)
importance.hvplot.barh(x="Feature", y="Importance")