from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
import hvplot.pandas
np.random.seed(42)
%matplotlib inline
pd.options.display.max_columns = 999
plt.rcParams['figure.figsize'] = (10,6)
Nov 17, 2020
The Thanksgiving holiday will alter our schedule a bit; the Canvas calendar and course home page schedule should reflect these changes.
Focus: much more hands-on experience with feature engineering and adding spatially based features
First, let's set up all of the imports we'll need from scikit-learn:
# Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
# Model selection
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
# Pipelines
from sklearn.pipeline import make_pipeline
# Preprocessing
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
Let's download data for single-family properties in Philadelphia that had their last sale during 2019.
Sources:
import carto2gpd
# the CARTO API url
carto_url = "https://phl.carto.com/api/v2/sql"
# The table name
table_name = "opa_properties_public"
# Only pull 2019 sales for single family residential properties
where = "sale_date >= '2019-01-01' AND sale_date < '2020-01-01'"
where = where + " AND category_code_description = 'Single Family'"
# Run the query
salesRaw = carto2gpd.get(carto_url, table_name, where=where)
# Optional: put it in a reproducible order for test/training splits later
salesRaw = salesRaw.sort_values("parcel_number")
salesRaw.head()
(Output: the first five rows of salesRaw — one point geometry per sale plus roughly 75 OPA columns, including sale_date, sale_price, total_livable_area, number_of_bedrooms, exterior_condition, zip_code, and parcel_number.)
len(salesRaw)
25955
# The feature columns we want to use
cols = [
"sale_price",
"total_livable_area",
"total_area",
"garage_spaces",
"fireplaces",
"number_of_bathrooms",
"number_of_bedrooms",
"number_stories",
"exterior_condition",
"zip_code",
"geometry"
]
# Trim to these columns and remove NaNs
sales = salesRaw[cols].dropna()
# Trim zip code to only the first five digits
sales['zip_code'] = sales['zip_code'].astype(str).str.slice(0, 5)
# Trim very low and very high sales
valid = (sales['sale_price'] > 3000) & (sales['sale_price'] < 1e6)
sales = sales.loc[valid]
len(sales)
20131
# Split the data 70/30
train_set, test_set = train_test_split(sales,
test_size=0.3,
random_state=42)
# the target labels: log of sale price
y_train = np.log(train_set["sale_price"])
y_test = np.log(test_set["sale_price"])
# The features
feature_cols = [
"total_livable_area",
"total_area",
"garage_spaces",
"fireplaces",
"number_of_bathrooms",
"number_of_bedrooms",
"number_stories",
]
X_train = train_set[feature_cols].values
X_test = test_set[feature_cols].values
Run a linear regression model as a baseline:
# Make a linear model pipeline
linear_pipeline = make_pipeline(StandardScaler(), LinearRegression())
# Fit on the training data
linear_pipeline.fit(X_train, y_train)
# What's the test score?
linear_pipeline.score(X_test, y_test)
0.17734906621926338
Run cross-validation on a random forest model:
# Make a random forest pipeline
forest_pipeline = make_pipeline(
StandardScaler(), RandomForestRegressor(n_estimators=100, random_state=42)
)
# Run the 10-fold cross validation
scores = cross_val_score(
forest_pipeline,
X_train,
y_train,
cv=10,
)
# Report
print("R^2 scores = ", scores)
print("Scores mean = ", scores.mean())
print("Score std dev = ", scores.std())
R^2 scores =  [0.28774608 0.30557794 0.31837843 0.35032902 0.28971327 0.3553058 0.32448587 0.29367637 0.31938367 0.29449505]
Scores mean =  0.31390915011126685
Score std dev =  0.02308144298765
# Fit on the training data
forest_pipeline.fit(X_train, y_train)
# What's the test score?
forest_pipeline.score(X_test, y_test)
0.3012744612160241
The model appears to generalize reasonably well
Note: we should also be optimizing hyperparameters to see if we can find additional improvements!
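To do that, we could use the GridSearchCV object we imported earlier. Below is a minimal sketch; the parameter grid values are arbitrary examples, not tuned recommendations.
# A minimal sketch of a hyperparameter grid search over the forest pipeline
# NOTE: the values in the grid below are arbitrary examples
param_grid = {
    "randomforestregressor__n_estimators": [50, 100, 200],
    "randomforestregressor__max_depth": [None, 10, 20],
}
grid = GridSearchCV(forest_pipeline, param_grid, cv=3)
grid.fit(X_train, y_train)
print("best params = ", grid.best_params_)
print("best CV score = ", grid.best_score_)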
# Extract the regressor from the pipeline
forest_model = forest_pipeline["randomforestregressor"]
# Create the data frame of importances
importance = pd.DataFrame(
{"Feature": feature_cols, "Importance": forest_model.feature_importances_}
).sort_values("Importance")
importance.hvplot.barh(x="Feature", y="Importance")
We can use a technique called one-hot encoding
Steps:
- The OneHotEncoder object is a preprocessor that will perform the vectorization step
- The ColumnTransformer object will help us apply different transformers to numerical and categorical columns
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
Let's try out the example data of colors:
# Example data of colors
colors = np.array(["red", "green", "blue", "red"])
colors = colors[:, np.newaxis]
colors.shape
(4, 1)
colors
array([['red'], ['green'], ['blue'], ['red']], dtype='<U5')
# Initialize the OHE transformer
ohe = OneHotEncoder()
# Fit the transformer and then transform the colors
ohe.fit_transform(colors).toarray()
array([[0., 0., 1.], [0., 1., 0.], [1., 0., 0.], [0., 0., 1.]])
# The corresponding category for each column
ohe.categories_
[array(['blue', 'green', 'red'], dtype='<U5')]
Let's apply separate transformers for our numerical and categorical columns:
# Numerical columns
num_cols = [
"total_livable_area",
"total_area",
"garage_spaces",
"fireplaces",
"number_of_bathrooms",
"number_of_bedrooms",
"number_stories",
]
# Categorical columns
cat_cols = ["exterior_condition", "zip_code"]
# Set up the column transformer with two transformers
# ----> Scale the numerical columns
# ----> One-hot encode the categorical columns
transformer = ColumnTransformer(
transformers=[
("num", StandardScaler(), num_cols),
("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
]
)
Note: the handle_unknown='ignore' parameter ensures that if categories show up in our test set that were not present in our training set, no error will be raised; the unknown category is simply encoded as all zeros.
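As a quick sanity check (a minimal sketch reusing the colors example from above), an encoder fit with handle_unknown='ignore' encodes an unseen category as a row of all zeros instead of raising an error:
# Fit on the original colors, then transform a value the encoder has never seen
ohe_ignore = OneHotEncoder(handle_unknown="ignore")
ohe_ignore.fit(colors)
# 'purple' was not in the training data -> its row is all zeros; 'red' is encoded normally
ohe_ignore.transform(np.array([["purple"], ["red"]])).toarray()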
Initialize the pipeline object, using the column transformer and the random forest regressor
# Initialize the pipeline
# NOTE: only use 10 estimators here so it will run in a reasonable time
pipe = make_pipeline(
transformer, RandomForestRegressor(n_estimators=10,
random_state=42)
)
Now, let's fit the model.
Important: we pass the train_set and test_set DataFrames directly, rather than the X_train and X_test numpy arrays, because the ColumnTransformer selects columns by name.
# Fit the training set
pipe.fit(train_set, y_train);
# What's the test score?
pipe.score(test_set, y_test)
0.559083987329791
$R^2$ of ~0.30 improved to $R^2$ of ~0.56!
Takeaway: neighborhood-based effects play a crucial role in determining housing prices.
Side Note: to fully validate the model we should run $k$-fold cross validation and optimize hyperparameters of the model as well...
This will be part of assignment #7
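For reference, cross-validation works directly on the full pipeline and the training DataFrame; a minimal sketch (not the full assignment workflow):
# 5-fold cross-validation of the full pipeline on the training DataFrame
cv_scores = cross_val_score(pipe, train_set, y_train, cv=5)
print("mean CV R^2 = ", cv_scores.mean())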
But first, we need to know the column names! The one-hot encoder created a column for each category type...
# The column transformer...
transformer
ColumnTransformer(transformers=[('num', StandardScaler(), ['total_livable_area', 'total_area', 'garage_spaces', 'fireplaces', 'number_of_bathrooms', 'number_of_bedrooms', 'number_stories']), ('cat', OneHotEncoder(handle_unknown='ignore'), ['exterior_condition', 'zip_code'])])
# The steps in the column transformer
transformer.named_transformers_
{'num': StandardScaler(), 'cat': OneHotEncoder(handle_unknown='ignore'), 'remainder': 'drop'}
# The one-hot step
ohe = transformer.named_transformers_['cat']
ohe
OneHotEncoder(handle_unknown='ignore')
# One column for each category type!
ohe_cols = ohe.get_feature_names()
ohe_cols
array(['x0_0', 'x0_1', 'x0_2', 'x0_3', 'x0_4', 'x0_5', 'x0_6', 'x0_7', 'x1_19102', 'x1_19103', 'x1_19104', 'x1_19106', 'x1_19107', 'x1_19111', 'x1_19114', 'x1_19115', 'x1_19116', 'x1_19118', 'x1_19119', 'x1_19120', 'x1_19121', 'x1_19122', 'x1_19123', 'x1_19124', 'x1_19125', 'x1_19126', 'x1_19127', 'x1_19128', 'x1_19129', 'x1_19130', 'x1_19131', 'x1_19132', 'x1_19133', 'x1_19134', 'x1_19135', 'x1_19136', 'x1_19137', 'x1_19138', 'x1_19139', 'x1_19140', 'x1_19141', 'x1_19142', 'x1_19143', 'x1_19144', 'x1_19145', 'x1_19146', 'x1_19147', 'x1_19148', 'x1_19149', 'x1_19150', 'x1_19151', 'x1_19152', 'x1_19153', 'x1_19154'], dtype=object)
# Full list of columns is numerical + one-hot
features = num_cols + list(ohe_cols)
features
['total_livable_area', 'total_area', 'garage_spaces', 'fireplaces', 'number_of_bathrooms', 'number_of_bedrooms', 'number_stories', 'x0_0', 'x0_1', 'x0_2', 'x0_3', 'x0_4', 'x0_5', 'x0_6', 'x0_7', 'x1_19102', 'x1_19103', 'x1_19104', 'x1_19106', 'x1_19107', 'x1_19111', 'x1_19114', 'x1_19115', 'x1_19116', 'x1_19118', 'x1_19119', 'x1_19120', 'x1_19121', 'x1_19122', 'x1_19123', 'x1_19124', 'x1_19125', 'x1_19126', 'x1_19127', 'x1_19128', 'x1_19129', 'x1_19130', 'x1_19131', 'x1_19132', 'x1_19133', 'x1_19134', 'x1_19135', 'x1_19136', 'x1_19137', 'x1_19138', 'x1_19139', 'x1_19140', 'x1_19141', 'x1_19142', 'x1_19143', 'x1_19144', 'x1_19145', 'x1_19146', 'x1_19147', 'x1_19148', 'x1_19149', 'x1_19150', 'x1_19151', 'x1_19152', 'x1_19153', 'x1_19154']
random_forest = pipe["randomforestregressor"]
# Create the dataframe with importances
importance = pd.DataFrame(
{"Feature": features, "Importance": random_forest.feature_importances_}
)
importance.head(n=20)
Feature | Importance | |
---|---|---|
0 | total_livable_area | 0.158573 |
1 | total_area | 0.192858 |
2 | garage_spaces | 0.012014 |
3 | fireplaces | 0.002742 |
4 | number_of_bathrooms | 0.169654 |
5 | number_of_bedrooms | 0.014315 |
6 | number_stories | 0.017482 |
7 | x0_0 | 0.000236 |
8 | x0_1 | 0.006463 |
9 | x0_2 | 0.024824 |
10 | x0_3 | 0.029688 |
11 | x0_4 | 0.012139 |
12 | x0_5 | 0.006955 |
13 | x0_6 | 0.004804 |
14 | x0_7 | 0.014958 |
15 | x1_19102 | 0.001416 |
16 | x1_19103 | 0.003051 |
17 | x1_19104 | 0.002927 |
18 | x1_19106 | 0.003286 |
19 | x1_19107 | 0.001287 |
# Sort by importance and get the top 30
# SORT IN DESCENDING ORDER
importance = importance.sort_values("Importance", ascending=False).iloc[:30]
# Plot
importance.hvplot.barh(x="Feature", y="Importance", height=700, flip_yaxis=True)
Garbage in, garbage out
Takeaway: If your input features are poorly designed (for example, completely unrelated to the thing you want to predict), then no matter how good your machine learning model is or how well you "train" it, the model will never be able to translate the features into a predicted value.
Yes, let's add distance-based features
The strategy
Distance from each sale to nearby features of interest. First up: 311 requests for graffiti removal.
Source: https://www.opendataphilly.org/dataset/311-service-and-information-requests
We'll only pull data from 2019.
# the 311 table
table_name = "public_cases_fc"
# Peek at the first row of data
carto2gpd.get(carto_url, table_name, limit=1)
(Output: a single row showing the available columns — cartodb_id, objectid, service_request_id, status, status_notes, service_name, service_code, agency_responsible, service_notice, requested_datetime, updated_datetime, expected_datetime, address, zipcode, media_url, lat, lon, and geometry.)
# Select only graffiti requests from 2019
where_2019 = "requested_datetime >= '01-01-2019' and requested_datetime < '01-01-2020'"
where_grafitti = "service_name = 'Graffiti Removal'"
where = f"{where_2019} and {where_grafitti}"
# Pull the subset we want
graffiti = carto2gpd.get(carto_url, table_name, where=where)
# Remove rows with missing geometries
graffiti = graffiti.loc[graffiti.geometry.notnull()]
len(graffiti)
16704
graffiti.head()
(Output: the first five rows of the graffiti GeoDataFrame — closed "Graffiti Removal" requests from 2019 with addresses and point geometries.)
We will need to: (1) convert both datasets to the same CRS in meters (EPSG:3857), and (2) extract the x/y coordinates from the geometry column.
# Do the CRS conversion
sales_3857 = sales.to_crs(epsg=3857)
graffiti_3857 = graffiti.to_crs(epsg=3857)
def get_xy_from_geometry(df):
"""
Return a numpy array with two columns, where the
first holds the `x` geometry coordinate and the second
column holds the `y` geometry coordinate
"""
x = df.geometry.x
y = df.geometry.y
return np.column_stack((x, y)) # stack as columns
# Extract x/y for sales
salesXY = get_xy_from_geometry(sales_3857)
# Extract x/y for graffiti calls
graffitiXY = get_xy_from_geometry(graffiti_3857)
salesXY.shape
(20131, 2)
graffitiXY.shape
(16704, 2)
For this, we will use the $k$-nearest neighbors algorithm from scikit-learn. For each sale, we will find the $k$ nearest graffiti calls and record the distances to them.
from sklearn.neighbors import NearestNeighbors
# STEP 1: Initialize the algorithm
k = 5
nbrs = NearestNeighbors(n_neighbors=k)
# STEP 2: Fit the algorithm on the "neighbors" dataset
nbrs.fit(graffitiXY)
# STEP 3: Get distances for sale to neighbors
grafDists, grafIndices = nbrs.kneighbors(salesXY)
Note: I am using k=5 here without any real justification. In practice, you would want to try a few different k values to try to identify the best value to use.
- grafDists: for each sale, the distances to the 5 nearest graffiti calls
- grafIndices: for each sale, the index of each of the 5 neighbors in the original dataset
print("length of sales = ", len(salesXY))
print("shape of grafDists = ", grafDists.shape)
print("shape of grafIndices = ", grafIndices.shape)
length of sales =  20131
shape of grafDists =  (20131, 5)
shape of grafIndices =  (20131, 5)
# The distances from the first sale to the 5 nearest neighbors
grafDists[0]
array([ 63.06999367, 80.52485949, 125.34776876, 164.90764118, 167.68676032])
# The coordinates for the first sale
x0, y0 = salesXY[0]
x0, y0
(-8365497.20665795, 4855985.04901042)
# The indices for the 5 nearest graffiti calls
grafIndices[0]
array([9734, 6213, 1154, 1608, 1449])
# the graffiti neighbors
sale0_neighbors = graffitiXY[grafIndices[0]]
sale0_neighbors
array([[-8365557.65314145, 4856003.05030855], [-8365555.31543214, 4856040.79507179], [-8365412.26988648, 4855892.86545449], [-8365551.75320844, 4856140.67421387], [-8365531.27042213, 4856149.23947774]])
# Access the first and second column for x/y values
neighbors_x = sale0_neighbors[:,0]
neighbors_y = sale0_neighbors[:,1]
# The x/y differences between neighbors and first sale coordinates
dx = (neighbors_x - x0)
dy = (neighbors_y - y0)
# The Euclidean dist
manual_dists = (dx**2 + dy**2) ** 0.5
manual_dists
array([ 63.06999367, 80.52485949, 125.34776876, 164.90764118, 167.68676032])
grafDists[0]
array([ 63.06999367, 80.52485949, 125.34776876, 164.90764118, 167.68676032])
We'll average over the column axis: axis=1
# Average distance to neighbors
avgGrafDist = grafDists.mean(axis=1)
# Set zero distances to be small, but nonzero
# IMPORTANT: THIS WILL AVOID INF DISTANCES WHEN DOING THE LOG
avgGrafDist[avgGrafDist==0] = 1e-5
# Calculate log of distances
sales['logDistGraffiti'] = np.log10(avgGrafDist)
sales.head()
sale_price | total_livable_area | total_area | garage_spaces | fireplaces | number_of_bathrooms | number_of_bedrooms | number_stories | exterior_condition | zip_code | geometry | logDistGraffiti | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | 340000 | 1266 | 1622.70 | 1 | 0 | 2 | 3 | 2 | 4 | 19147 | POINT (-75.14854 39.93144) | 2.080292 |
1 | 397500 | 1436 | 739.68 | 0 | 0 | 1 | 3 | 3 | 2 | 19147 | POINT (-75.14927 39.93033) | 2.232137 |
4 | 285000 | 868 | 546.00 | 0 | 0 | 1 | 3 | 2 | 2 | 19147 | POINT (-75.14876 39.93011) | 2.202199 |
6 | 277500 | 890 | 683.70 | 0 | 0 | 1 | 2 | 2 | 3 | 19147 | POINT (-75.14852 39.92954) | 2.092351 |
7 | 453610 | 1320 | 896.00 | 0 | 0 | 2 | 3 | 3 | 2 | 19147 | POINT (-75.14871 39.92962) | 2.076465 |
# Load the City Limits to plot too
import esri2gpd
# From OpenDataPhilly's page
url = "https://services.arcgis.com/fLeGjb7u4uXqeF9q/arcgis/rest/services/City_Limits/FeatureServer/0"
city_limits = esri2gpd.get(url).to_crs(epsg=3857)
fig, ax = plt.subplots(figsize=(10, 10), facecolor=plt.get_cmap("viridis")(0))
# Plot the log of the Graffiti distance
x = salesXY[:, 0]
y = salesXY[:, 1]
ax.hexbin(x, y, C=sales["logDistGraffiti"].values, gridsize=60)
# Plot the city limits
city_limits.plot(ax=ax, facecolor="none", edgecolor="white", linewidth=4)
ax.set_axis_off()
ax.set_aspect("equal")
Use the osmnx package to get subway stops in Philly — we can use the ox.geometries_from_polygon() function.
- station=subway: see the OSM Wiki for details on this tag
import osmnx as ox
# Get the geometry from the city limits
city_limits_outline = city_limits.to_crs(epsg=4326).squeeze().geometry
city_limits_outline
# Get the subway stops within the city limits
subway = ox.geometries_from_polygon(city_limits_outline, tags={"station": "subway"})
# Convert to 3857 (meters)
subway = subway.to_crs(epsg=3857)
subway.head()
(Output: the first five rows of the subway GeoDataFrame — OSM station nodes for SEPTA and PATCO with name, operator, station=subway tags, and point geometries in EPSG:3857.)
fig, ax = plt.subplots(figsize=(6,6))
# Plot the subway locations
subway.plot(ax=ax, markersize=10, color='crimson')
# City limits, too
city_limits.plot(ax=ax, facecolor='none', edgecolor='black', linewidth=4)
ax.set_axis_off()
The stops on the Market-Frankford and Broad St. subway lines!
We'll use $k=1$ to get the distance to the nearest stop.
# STEP 1: x/y coordinates of subway stops (in EPGS=3857)
subwayXY = get_xy_from_geometry(subway.to_crs(epsg=3857))
# STEP 2: Initialize the algorithm
nbrs = NearestNeighbors(n_neighbors=1)
# STEP 3: Fit the algorithm on the "neighbors" dataset
nbrs.fit(subwayXY)
# STEP 4: Get distances for sale to neighbors
subwayDists, subwayIndices = nbrs.kneighbors(salesXY)
# STEP 5: add back to the original dataset
sales["logDistSubway"] = np.log10(subwayDists[:, 0])
fig, ax = plt.subplots(figsize=(10,10), facecolor=plt.get_cmap('viridis')(0))
# Plot the log of the subway distance
x = salesXY[:,0]
y = salesXY[:,1]
ax.hexbin(x, y, C=sales['logDistSubway'].values, gridsize=60)
# Plot the city limits
city_limits.plot(ax=ax, facecolor='none', edgecolor='white', linewidth=4)
ax.set_axis_off()
ax.set_aspect("equal")
Looks like it worked!
Let's have a look at the correlations of numerical columns:
import seaborn as sns
cols = [
"total_livable_area",
"total_area",
"garage_spaces",
"fireplaces",
"number_of_bathrooms",
"number_of_bedrooms",
"number_stories",
"logDistGraffiti", # NEW
"logDistSubway", # NEW
"sale_price"
]
sns.heatmap(sales[cols].corr(), cmap='coolwarm', annot=True, vmin=-1, vmax=1);
# Numerical columns
num_cols = [
"total_livable_area",
"total_area",
"garage_spaces",
"fireplaces",
"number_of_bathrooms",
"number_of_bedrooms",
"number_stories",
"logDistGraffiti", # NEW
"logDistSubway" # NEW
]
# Categorical columns
cat_cols = ["exterior_condition", "zip_code"]
# Set up the column transformer with two transformers
transformer = ColumnTransformer(
transformers=[
("num", StandardScaler(), num_cols),
("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
]
)
# Initialize the pipeline
# NOTE: only use 20 estimators here so it will run in a reasonable time
pipe = make_pipeline(
transformer, RandomForestRegressor(n_estimators=20, random_state=42)
)
# Split the data 70/30
train_set, test_set = train_test_split(sales, test_size=0.3, random_state=42)
# the target labels
y_train = np.log(train_set["sale_price"])
y_test = np.log(test_set["sale_price"])
# Fit the training set
# REMINDER: use the training dataframe objects here rather than numpy array
pipe.fit(train_set, y_train);
# What's the test score?
# REMINDER: use the test dataframe rather than numpy array
pipe.score(test_set, y_test)
0.6163490515951564
$R^2$ of ~0.56 improved to $R^2$ of ~0.62!
def plot_feature_importances(pipeline, num_cols, transformer, top=20, **kwargs):
"""
Utility function to plot the feature importances from the input
random forest regressor.
Parameters
----------
pipeline :
the pipeline object
num_cols :
list of the numerical columns
transformer :
the transformer preprocessing step
top : optional
the number of importances to plot
**kwargs : optional
extra keywords passed to the hvplot function
"""
# The one-hot step
ohe = transformer.named_transformers_["cat"]
# One column for each category type!
ohe_cols = ohe.get_feature_names()
# Full list of columns is numerical + one-hot
features = num_cols + list(ohe_cols)
# The regressor
regressor = pipeline["randomforestregressor"]
# Create the dataframe with importances
importance = pd.DataFrame(
{"Feature": features, "Importance": regressor.feature_importances_}
)
# Sort importance in descending order and get the top
importance = importance.sort_values("Importance", ascending=False).iloc[:top]
# Plot
return importance.hvplot.barh(
x="Feature", y="Importance", flip_yaxis=True, **kwargs
)
plot_feature_importances(pipe, num_cols, transformer, top=30, height=500)
Modify the get_xy_from_geometry()
function to use the "centroid" of the geometry column.
Note: you can take the centroid of a Point() or Polygon() object. For a Point(), you just get the x/y coordinates back.
def get_xy_from_geometry(df):
"""
Return a numpy array with two columns, where the
first holds the `x` geometry coordinate and the second
column holds the `y` geometry coordinate
"""
# NEW: use the centroid.x and centroid.y to support Polygon() and Point() geometries
x = df.geometry.centroid.x
y = df.geometry.centroid.y
return np.column_stack((x, y)) # stack as columns
New feature: Distance to the nearest university/college
New feature: Distance to the nearest park centroid
Notes
- Get the x and y coordinates of the park centroids and calculate the distance to these centroids.
- You can use the geometry.centroid.x and geometry.centroid.y values to access these coordinates.
- The sketch after this list of exercises illustrates the general pattern for these distance-based features.
New feature: Distance to City Hall.
Notes
New feature: Distance to the 5 nearest new construction permits from 2019
Notes
- Select permits where permitdescription equals 'NEW CONSTRUCTION PERMIT'
- Select permits from 2019 using the permitissuedate column
New feature: Distance to the 5 nearest aggravated assaults in 2019
Notes
- Select crimes where Text_General_Code equals 'Aggravated Assault No Firearm' or 'Aggravated Assault Firearm'
- Select incidents from 2019 using the dispatch_date column
New feature: Distance to the 5 nearest abandoned vehicle 311 calls in 2019
Notes
- Select 311 requests where service_name equals 'Abandoned Vehicle'
- Select requests from 2019 using the requested_datetime column
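All of these features follow the same recipe we used for the graffiti and subway distances. Below is a minimal sketch of that general pattern, assuming the relevant dataset has already been downloaded and filtered into a GeoDataFrame called neighbors_gdf (a placeholder name).
# Generic pattern for a new distance-based feature (neighbors_gdf is a placeholder)
# Get x/y coordinates in meters, using centroids so Polygon geometries work too
neighborsXY = get_xy_from_geometry(neighbors_gdf.to_crs(epsg=3857))
# Fit the k nearest neighbors algorithm (k=1 for "nearest", k=5 for "5 nearest")
k = 5
nbrs = NearestNeighbors(n_neighbors=k)
nbrs.fit(neighborsXY)
# Distances from each sale to its k nearest neighbors
dists, _ = nbrs.kneighbors(salesXY)
# Average over the neighbors, guard against zero distances, and take the log
avgDist = dists.mean(axis=1)
avgDist[avgDist == 0] = 1e-5
sales["logDistNewFeature"] = np.log10(avgDist)  # hypothetical column name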