from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
import hvplot.pandas
import esri2gpd
import carto2gpd
import seaborn as sns
np.random.seed(42)
%matplotlib inline
pd.options.display.max_columns = 999
plt.rcParams['figure.figsize'] = (10,6)
Nov 19, 2020
The Thanksgiving holiday will alter our schedule a bit; the Canvas calendar and course home page schedule reflect these changes.
We'll start with an exercise to add additional distance-based features to our housing price model...
The strategy
Distance from each sale to:
- the 5 nearest graffiti 311 calls
- the nearest subway station
- the nearest university or college
- the nearest park centroid
- City Hall
- the 5 nearest new construction permits from 2019
- the 5 nearest aggravated assaults in 2019
- the 5 nearest abandoned vehicle 311 calls in 2019
# Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
# Model selection
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
# Pipelines
from sklearn.pipeline import make_pipeline
# Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
# Neighbors
from sklearn.neighbors import NearestNeighbors
Load and clean the data:
# the CARTO API url
carto_url = "https://phl.carto.com/api/v2/sql"
# The table name
table_name = "opa_properties_public"
# Only pull 2019 sales for single family residential properties
where = "sale_date >= '2019-01-01' AND sale_date < '2020-01-01'"
where = where + " AND category_code_description = 'Single Family'"
# Run the query
salesRaw = carto2gpd.get(carto_url, table_name, where=where)
# Optional: put it in a reproducible order for test/training splits later
salesRaw = salesRaw.sort_values("parcel_number")
# The feature columns we want to use
cols = [
"sale_price",
"total_livable_area",
"total_area",
"garage_spaces",
"fireplaces",
"number_of_bathrooms",
"number_of_bedrooms",
"number_stories",
"exterior_condition",
"zip_code",
"geometry"
]
# Trim to these columns and remove NaNs
sales = salesRaw[cols].dropna()
# Trim zip code to only the first five digits
sales['zip_code'] = sales['zip_code'].astype(str).str.slice(0, 5)
# Trim very low and very high sales
valid = (sales['sale_price'] > 3000) & (sales['sale_price'] < 1e6)
sales = sales.loc[valid]
len(sales)
20131
Add new distance-based features:
def get_xy_from_geometry(df):
"""
Return a numpy array with two columns, where the
first holds the `x` geometry coordinate and the second
column holds the `y` geometry coordinate
Note: this works with both Point() and Polygon() objects.
"""
# NEW: use the centroid.x and centroid.y to support Polygon() and Point() geometries
x = df.geometry.centroid.x
y = df.geometry.centroid.y
return np.column_stack((x, y)) # stack as columns
# Convert to meters and EPSG=3857
sales_3857 = sales.to_crs(epsg=3857)
# Extract x/y for sales
salesXY = get_xy_from_geometry(sales_3857)
Source: https://www.opendataphilly.org/dataset/311-service-and-information-requests
Download the data:
# Select only those for graffiti and in 2019
where = "requested_datetime >= '2019-01-01' AND requested_datetime < '2020-01-01'"
where = where + " AND service_name = 'Graffiti Removal'"
# Pull the subset we want
graffiti = carto2gpd.get(carto_url, "public_cases_fc", where=where)
# Remove rows with missing geometries
graffiti = graffiti.loc[graffiti.geometry.notnull()]
Get the x/y coordinates for the graffiti calls:
# Convert to meters in EPSG=3857
graffiti_3857 = graffiti.to_crs(epsg=3857)
# Extract x/y for graffiti calls
graffitiXY = get_xy_from_geometry(graffiti_3857)
Run the neighbors algorithm to calculate the new feature:
# STEP 1: Initialize the algorithm
nbrs = NearestNeighbors(n_neighbors=5)
# STEP 2: Fit the algorithm on the "neighbors" dataset
nbrs.fit(graffitiXY)
# STEP 3: Get distances from each sale to its neighbors
grafDists, grafIndices = nbrs.kneighbors(salesXY)
# STEP 4: Get average distance to neighbors
avgGrafDist = grafDists.mean(axis=1)
# Set zero distances to be small, but nonzero
# IMPORTANT: THIS WILL AVOID INF DISTANCES WHEN DOING THE LOG
avgGrafDist[avgGrafDist==0] = 1e-5
# STEP 5: Add the new feature
sales['logDistGraffiti'] = np.log10(avgGrafDist)
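Since we'll repeat this five-step recipe for several different datasets, it can help to wrap it in a small helper function. A minimal sketch (the function name and signature are our own, not part of scikit-learn):
def log_mean_neighbor_dist(targetXY, neighborXY, k):
    """
    Return the log10 of the average distance from each
    target point to its k nearest neighbor points.
    """
    nbrs = NearestNeighbors(n_neighbors=k)
    nbrs.fit(neighborXY)
    dists, _ = nbrs.kneighbors(targetXY)
    avg = dists.mean(axis=1)
    avg[avg == 0] = 1e-5  # avoid log10(0)
    return np.log10(avg)
# Equivalent to the steps above:
# sales["logDistGraffiti"] = log_mean_neighbor_dist(salesXY, graffitiXY, k=5)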
Use the osmnx package to get subway stops in Philly; we can use the ox.geometries_from_polygon() function. We'll select elements tagged with station=subway (see the OSM Wiki for details).
import osmnx as ox
Get the geometry polygon for the Philadelphia city limits:
# Download the Philadelphia city limits
url = "https://services.arcgis.com/fLeGjb7u4uXqeF9q/arcgis/rest/services/City_Limits/FeatureServer/0"
city_limits = esri2gpd.get(url).to_crs(epsg=3857)
# Get the geometry from the city limits
city_limits_outline = city_limits.to_crs(epsg=4326).squeeze().geometry
city_limits_outline
Use osmnx to query OpenStreetMap to get all subway stations within the city limits:
# Get the subway stops within the city limits
subway = ox.geometries_from_polygon(city_limits_outline, tags={"station": "subway"})
# Convert to 3857 (meters)
subway = subway.to_crs(epsg=3857)
Get the distance to the nearest subway station ($k=1$):
# STEP 1: x/y coordinates of subway stops (in EPSG=3857)
subwayXY = get_xy_from_geometry(subway.to_crs(epsg=3857))
# STEP 2: Initialize the algorithm
nbrs = NearestNeighbors(n_neighbors=1)
# STEP 3: Fit the algorithm on the "neighbors" dataset
nbrs.fit(subwayXY)
# STEP 4: Get distances from each sale to its neighbors
subwayDists, subwayIndices = nbrs.kneighbors(salesXY)
# STEP 5: add back to the original dataset
sales["logDistSubway"] = np.log10(subwayDists.mean(axis=1))
sales.head()
sale_price | total_livable_area | total_area | garage_spaces | fireplaces | number_of_bathrooms | number_of_bedrooms | number_stories | exterior_condition | zip_code | geometry | logDistGraffiti | logDistSubway | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | 340000 | 1266 | 1622.70 | 1 | 0 | 2 | 3 | 2 | 4 | 19147 | POINT (-75.14854 39.93144) | 2.080292 | 3.338733 |
1 | 397500 | 1436 | 739.68 | 0 | 0 | 1 | 3 | 3 | 2 | 19147 | POINT (-75.14927 39.93033) | 2.232137 | 3.330483 |
4 | 285000 | 868 | 546.00 | 0 | 0 | 1 | 3 | 2 | 2 | 19147 | POINT (-75.14876 39.93011) | 2.202199 | 3.341756 |
6 | 277500 | 890 | 683.70 | 0 | 0 | 1 | 2 | 2 | 3 | 19147 | POINT (-75.14852 39.92954) | 2.092351 | 3.347019 |
7 | 453610 | 1320 | 896.00 | 0 | 0 | 2 | 3 | 3 | 2 | 19147 | POINT (-75.14871 39.92962) | 2.076465 | 3.342907 |
# Numerical columns
num_cols = [
"total_livable_area",
"total_area",
"garage_spaces",
"fireplaces",
"number_of_bathrooms",
"number_of_bedrooms",
"number_stories",
"logDistGraffiti", # NEW
"logDistSubway" # NEW
]
# Categorical columns
cat_cols = ["exterior_condition", "zip_code"]
# Set up the column transformer with two transformers
transformer = ColumnTransformer(
transformers=[
("num", StandardScaler(), num_cols),
("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
]
)
# Initialize the pipeline
# NOTE: only use 20 estimators here so it will run in a reasonable time
pipe = make_pipeline(
transformer, RandomForestRegressor(n_estimators=20, random_state=42)
)
# Split the data 70/30
train_set, test_set = train_test_split(sales, test_size=0.3, random_state=42)
# the target labels
y_train = np.log(train_set["sale_price"])
y_test = np.log(test_set["sale_price"])
# Fit the training set
# REMINDER: use the training dataframe objects here rather than numpy array
pipe.fit(train_set, y_train);
# What's the test score?
# REMINDER: use the test dataframe rather than numpy array
pipe.score(test_set, y_test)
0.6163490515951564
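As a quick sanity check on this single test score, we could also use cross_val_score (imported above) to get a distribution of scores across folds of the training set; a minimal sketch:
# 5-fold cross-validation of the full pipeline on the training set
cv_scores = cross_val_score(pipe, train_set, y_train, cv=5)
print(cv_scores.mean(), cv_scores.std())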
New feature: Distance to the nearest university/college
# Get the data
url = "https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/Universities_Colleges/FeatureServer/0"
univs = esri2gpd.get(url)
# Get the X/Y
univXY = get_xy_from_geometry(univs.to_crs(epsg=3857))
# Run the k nearest algorithm
nbrs = NearestNeighbors(n_neighbors=1)
nbrs.fit(univXY)
univDists, _ = nbrs.kneighbors(salesXY)
# Add the new feature
sales['logDistUniv'] = np.log10(univDists.mean(axis=1))
fig, ax = plt.subplots(figsize=(10,10), facecolor=plt.get_cmap('viridis')(0))
x = salesXY[:,0]
y = salesXY[:,1]
ax.hexbin(x, y, C=np.log10(univDists.mean(axis=1)), gridsize=60)
# Plot the city limits
city_limits.plot(ax=ax, facecolor='none', edgecolor='white', linewidth=4)
ax.set_axis_off()
ax.set_aspect("equal")
ax.set_title("Distance to Nearest University/College", fontsize=16, color='white');
New feature: Distance to the nearest park centroid
Notes
We'll use the x and y coordinates of the park centroids and calculate the distance to these centroids. Use the geometry.centroid.x and geometry.centroid.y values to access these coordinates.
# Get the data
url = "https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/PPR_Assets/FeatureServer/0"
parks = esri2gpd.get(url)
# Get the X/Y
parksXY = get_xy_from_geometry(parks.to_crs(epsg=3857))
# Run the k nearest algorithm
nbrs = NearestNeighbors(n_neighbors=1)
nbrs.fit(parksXY)
parksDists, _ = nbrs.kneighbors(salesXY)
# Add the new feature
sales["logDistParks"] = np.log10(parksDists.mean(axis=1))
fig, ax = plt.subplots(figsize=(10, 10), facecolor=plt.get_cmap("viridis")(0))
x = salesXY[:, 0]
y = salesXY[:, 1]
ax.hexbin(x, y, C=np.log10(parksDists.mean(axis=1)), gridsize=60)
# Plot the city limits
city_limits.plot(ax=ax, facecolor="none", edgecolor="white", linewidth=4)
ax.set_axis_off()
ax.set_aspect("equal")
ax.set_title("Distance to Nearest Park", fontsize=16, color="white");
New feature: Distance to City Hall
Notes
Select City Hall from the city landmarks dataset by querying for NAME = 'City Hall' and FEAT_TYPE = 'Municipal Building'.
# Get the data
url = "https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/CITY_LANDMARKS/FeatureServer/0"
cityHall = esri2gpd.get(
url, where="NAME = 'City Hall' AND FEAT_TYPE = 'Municipal Building'"
)
# Get the X/Y
cityHallXY = get_xy_from_geometry(cityHall.to_crs(epsg=3857))
# Run the k nearest algorithm
nbrs = NearestNeighbors(n_neighbors=1)
nbrs.fit(cityHallXY)
cityHallDist, _ = nbrs.kneighbors(salesXY)
# Add the new feature
sales["logDistCityHall"] = np.log10(cityHallDist.mean(axis=1))
fig, ax = plt.subplots(figsize=(10, 10), facecolor=plt.get_cmap("viridis")(0))
x = salesXY[:, 0]
y = salesXY[:, 1]
ax.hexbin(x, y, C=np.log10(cityHallDist.mean(axis=1)), gridsize=60)
# Plot the city limits
city_limits.plot(ax=ax, facecolor="none", edgecolor="white", linewidth=4)
ax.set_axis_off()
ax.set_aspect("equal")
ax.set_title("Distance to City Hall", fontsize=16, color="white");
New feature: Distance to the 5 nearest new construction permits from 2019
Notes
- Select rows where permitdescription equals 'NEW CONSTRUCTION PERMIT'
- Select permits from 2019 using the permitissuedate column
# Table name
table_name = "permits"
# Where clause
where = "permitissuedate >= '2019-01-01' AND permitissuedate < '2020-01-01'"
where = where + " AND permitdescription='NEW CONSTRUCTION PERMIT'"
# Query
permits = carto2gpd.get(carto_url, table_name, where=where)
# Remove missing
permits = permits.loc[permits.geometry.notnull()]
# Get the X/Y
permitsXY = get_xy_from_geometry(permits.to_crs(epsg=3857))
# Run the k nearest algorithm
nbrs = NearestNeighbors(n_neighbors=5)
nbrs.fit(permitsXY)
permitsDist, _ = nbrs.kneighbors(salesXY)
# Add the new feature
sales["logDistPermits"] = np.log10(permitsDist.mean(axis=1))
fig, ax = plt.subplots(figsize=(10, 10), facecolor=plt.get_cmap("viridis")(0))
x = salesXY[:, 0]
y = salesXY[:, 1]
ax.hexbin(x, y, C=np.log10(permitsDist.mean(axis=1)), gridsize=60)
# Plot the city limits
city_limits.plot(ax=ax, facecolor="none", edgecolor="white", linewidth=4)
ax.set_axis_off()
ax.set_aspect("equal")
ax.set_title("Distance to 5 Closest Building Permits", fontsize=16, color="white");
New feature: Distance to the 5 nearest aggravated assaults in 2019
Notes
- Select rows where Text_General_Code equals 'Aggravated Assault No Firearm' or 'Aggravated Assault Firearm'
- Select incidents from 2019 using the dispatch_date column
# Table name
table_name = "incidents_part1_part2"
# Where selection
where = "dispatch_date >= '2019-01-01' AND dispatch_date < '2020-01-01'"
where = where + " AND Text_General_Code IN ('Aggravated Assault No Firearm', 'Aggravated Assault Firearm')"
# Query
assaults = carto2gpd.get(carto_url, table_name, where=where)
# Remove missing
assaults = assaults.loc[assaults.geometry.notnull()]
# Get the X/Y
assaultsXY = get_xy_from_geometry(assaults.to_crs(epsg=3857))
# Run the k nearest algorithm
nbrs = NearestNeighbors(n_neighbors=5)
nbrs.fit(assaultsXY)
assaultDists, _ = nbrs.kneighbors(salesXY)
# Add the new feature
sales['logDistAssaults'] = np.log10(assaultDists.mean(axis=1))
fig, ax = plt.subplots(figsize=(10, 10), facecolor=plt.get_cmap("viridis")(0))
x = salesXY[:, 0]
y = salesXY[:, 1]
ax.hexbin(x, y, C=np.log10(assaultDists.mean(axis=1)), gridsize=60)
# Plot the city limits
city_limits.plot(ax=ax, facecolor="none", edgecolor="white", linewidth=4)
ax.set_axis_off()
ax.set_aspect("equal")
ax.set_title("Distance to 5 Closest Assaults", fontsize=16, color="white");
New feature: Distance to the 5 nearest abandoned vehicle 311 calls in 2019
Notes
- Select rows where service_name equals 'Abandoned Vehicle'
- Select calls from 2019 using the requested_datetime column
# Table name
table_name = "public_cases_fc"
# Where selection
where = "requested_datetime >= '2019-01-01' AND requested_datetime < '2020-01-01'"
where = where + " AND service_name = 'Abandoned Vehicle'"
# Query
cars = carto2gpd.get(carto_url, table_name, where=where)
# Remove missing
cars = cars.loc[cars.geometry.notnull()]
# Get the X/Y
carsXY = get_xy_from_geometry(cars.to_crs(epsg=3857))
# Run the k nearest algorithm
nbrs = NearestNeighbors(n_neighbors=5)
nbrs.fit(carsXY)
carDists, _ = nbrs.kneighbors(salesXY)
# Handle any sales that have 0 distances
carDists[carDists == 0] = 1e-5 # a small, arbitrary value
# Add the new feature
sales["logDistCars"] = np.log10(carDists.mean(axis=1))
fig, ax = plt.subplots(figsize=(10, 10), facecolor=plt.get_cmap("viridis")(0))
x = salesXY[:, 0]
y = salesXY[:, 1]
ax.hexbin(x, y, C=np.log10(carDists.mean(axis=1)), gridsize=60)
# Plot the city limits
city_limits.plot(ax=ax, facecolor="none", edgecolor="white", linewidth=4)
ax.set_axis_off()
ax.set_aspect("equal")
ax.set_title(
    "Distance to 5 Closest Abandoned Vehicle 311 Calls", fontsize=16, color="white"
);
# Numerical columns
num_cols = [
"total_livable_area",
"total_area",
"garage_spaces",
"fireplaces",
"number_of_bathrooms",
"number_of_bedrooms",
"number_stories",
"logDistGraffiti", # NEW
"logDistSubway", # NEW
"logDistUniv", # NEW
"logDistParks", # NEW
"logDistCityHall", # NEW
"logDistPermits", # NEW
"logDistAssaults", # NEW
"logDistCars" # NEW
]
# Categorical columns
cat_cols = ["exterior_condition", "zip_code"]
# Set up the column transformer with two transformers
transformer = ColumnTransformer(
transformers=[
("num", StandardScaler(), num_cols),
("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
]
)
# Two steps in pipeline: preprocessor and then regressor
pipe = make_pipeline(
transformer, RandomForestRegressor(n_estimators=20, random_state=42)
)
# Split the data 70/30
train_set, test_set = train_test_split(sales, test_size=0.3, random_state=42)
# the target labels
y_train = np.log(train_set["sale_price"])
y_test = np.log(test_set["sale_price"])
# Fit the training set
pipe.fit(train_set, y_train);
# What's the test score?
pipe.score(test_set, y_test)
0.6549287880700467
# The one-hot step
ohe = transformer.named_transformers_["cat"]
# One column for each category type!
ohe_cols = ohe.get_feature_names()
# Full list of columns is numerical + one-hot
features = num_cols + list(ohe_cols)
# The regressor
regressor = pipe["randomforestregressor"]
# Create the dataframe with importances
importance = pd.DataFrame(
{"Feature": features, "Importance": regressor.feature_importances_}
)
# Sort importance in descending order and get the top
importance = importance.sort_values("Importance", ascending=False).iloc[:30]
# Plot
importance.hvplot.barh(
x="Feature", y="Importance", flip_yaxis=True, height=500
)
The technical problem: predict bikeshare trip counts for the Indego bikeshare in Philadelphia
For more info, see this blog post from Pew
Most important: adding new stations in new areas will not affect the demand for existing stations.
This allows the results from the predictive model for demand, built on existing stations, to translate to new stations.
The key assumption is that the bikeshare is not yet at full capacity, and riders in new areas will not decrease the demand in other areas.
Typically, this is a pretty safe assumption. But I encourage you to use historical data to verify it!
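One minimal sketch of such a check, using the all_trips data loaded later in this notebook (with its start_time and start_station columns): compare each existing station's trip totals year over year, and see whether demand held steady as new stations opened.
# Hypothetical check: year-over-year trip totals per station
trips = all_trips.copy()
trips["year"] = pd.to_datetime(trips["start_time"]).dt.year
yearly = trips.groupby(["start_station", "year"]).size().unstack("year")
# For stations active in both years, how did 2019 compare to 2018?
both_years = yearly.dropna()
print((both_years[2019] / both_years[2018]).describe())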
The data page also includes the live status of all of the stations in the system.
API endpoint: http://www.rideindego.com/stations/json
Two important columns:
- totalDocks: the total number of docks at each station
- kioskId: the unique identifier for each station
import requests
# API endpoint
API_endpoint = "http://www.rideindego.com/stations/json/"
# Get the JSON
stations_geojson = requests.get(API_endpoint).json()
# Convert to a GeoDataFrame
stations = gpd.GeoDataFrame.from_features(stations_geojson, crs="EPSG:4326")
stations.head()
geometry | id | name | coordinates | totalDocks | docksAvailable | bikesAvailable | classicBikesAvailable | smartBikesAvailable | electricBikesAvailable | rewardBikesAvailable | rewardDocksAvailable | kioskStatus | kioskPublicStatus | kioskConnectionStatus | kioskType | addressStreet | addressCity | addressState | addressZipCode | bikes | closeTime | eventEnd | eventStart | isEventBased | isVirtual | kioskId | notes | openTime | publicText | timeZone | trikesAvailable | latitude | longitude | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | POINT (-75.16374 39.95378) | 3004 | Municipal Services Building Plaza | [-75.16374, 39.95378] | 30 | 22 | 8 | 8 | 0 | 0 | 8 | 22 | FullService | Active | Active | 1 | 1401 John F. Kennedy Blvd. | Philadelphia | PA | 19102 | [{'dockNumber': 14, 'isElectric': False, 'isAv... | None | None | None | False | False | 3004 | None | None | None | 0 | 39.95378 | -75.16374 | |
1 | POINT (-75.14403 39.94733) | 3005 | Welcome Park, NPS | [-75.14403, 39.94733] | 13 | 7 | 5 | 5 | 0 | 0 | 5 | 7 | FullService | Active | Active | 1 | 191 S. 2nd St. | Philadelphia | PA | 19106 | [{'dockNumber': 1, 'isElectric': False, 'isAva... | None | None | None | False | False | 3005 | None | None | None | 0 | 39.94733 | -75.14403 | |
2 | POINT (-75.20311 39.95220) | 3006 | 40th & Spruce | [-75.20311, 39.9522] | 17 | 13 | 3 | 3 | 0 | 0 | 3 | 13 | FullService | Active | Active | 1 | 246 S. 40th St. | Philadelphia | PA | 19104 | [{'dockNumber': 9, 'isElectric': False, 'isAva... | None | None | None | False | False | 3006 | None | None | None | 0 | 39.95220 | -75.20311 | |
3 | POINT (-75.15993 39.94517) | 3007 | 11th & Pine, Kahn Park | [-75.15993, 39.94517] | 20 | 4 | 16 | 10 | 0 | 6 | 16 | 4 | FullService | Active | Active | 1 | 328 S. 11th St. | Philadelphia | PA | 19107 | [{'dockNumber': 1, 'isElectric': True, 'isAvai... | None | None | None | False | False | 3007 | None | None | None | 0 | 39.94517 | -75.15993 | |
4 | POINT (-75.15055 39.98078) | 3008 | Temple University Station | [-75.15055, 39.98078] | 19 | 15 | 4 | 4 | 0 | 0 | 4 | 15 | FullService | Active | Active | 1 | 1076 Berks Street | Philadelphia | PA | 19122 | [{'dockNumber': 6, 'isElectric': False, 'isAva... | None | None | None | False | False | 3008 | None | None | None | 0 | 39.98078 | -75.15055 |
import contextily as ctx
fig, ax = plt.subplots(figsize=(15, 10))
# stations
stations_3857 = stations.to_crs(epsg=3857)
stations_3857.plot(ax=ax, column='totalDocks', legend=True)
# plot the basemap underneath
ctx.add_basemap(ax=ax, crs=stations_3857.crs, source=ctx.providers.CartoDB.DarkMatter)
ax.set_axis_off()
all_trips = pd.read_csv("./data/indego-trips-2018-2019.csv.tar.gz")
all_trips.head()
trip_id | duration | start_time | end_time | start_station | start_lat | start_lon | end_station | end_lat | end_lon | bike_id | plan_duration | trip_route_category | passholder_type | bike_type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 223869188 | 18 | 2018-01-01 00:24:00 | 2018-01-01 00:42:00 | 3124 | 39.952950 | -75.139793 | 3073 | 39.961430 | -75.152420 | 3708 | 30.0 | One Way | Indego30 | NaN |
1 | 223905597 | 572 | 2018-01-01 00:38:00 | 2018-01-01 10:10:00 | 3023 | 39.950481 | -75.172859 | 3066 | 39.945610 | -75.173477 | 3288 | 365.0 | One Way | Indego365 | NaN |
2 | 223872811 | 22 | 2018-01-01 00:48:00 | 2018-01-01 01:10:00 | 3026 | 39.941380 | -75.145638 | 3023 | 39.950481 | -75.172859 | 11735 | 30.0 | One Way | Indego30 | NaN |
3 | 223872810 | 21 | 2018-01-01 01:03:00 | 2018-01-01 01:24:00 | 3045 | 39.947922 | -75.162369 | 3037 | 39.954239 | -75.161377 | 5202 | 30.0 | One Way | Indego30 | NaN |
4 | 223872809 | 4 | 2018-01-01 01:05:00 | 2018-01-01 01:09:00 | 3115 | 39.972630 | -75.167572 | 3058 | 39.967159 | -75.170013 | 5142 | 30.0 | One Way | Indego30 | NaN |
start_trips = (
all_trips.groupby("start_station").size().reset_index(name="total_start_trips")
)
start_trips.head()
start_station | total_start_trips | |
---|---|---|
0 | 3000 | 469 |
1 | 3004 | 13712 |
2 | 3005 | 8253 |
3 | 3006 | 8843 |
4 | 3007 | 20323 |
# Now merge in the geometry for each station
bike_data = (
stations[["geometry", "kioskId"]]
.merge(start_trips, left_on="kioskId", right_on="start_station")
.to_crs(epsg=3857)
)
Let's plot it...
fig, ax = plt.subplots(figsize=(10,10))
# stations
bike_data.plot(ax=ax, column='total_start_trips', legend=True)
# plot the basemap underneath
ctx.add_basemap(ax=ax, crs=bike_data.crs, source=ctx.providers.CartoDB.DarkMatter)
ax.set_axis_off()
Trips are clearly concentrated in Center City...
There are lots of possible options. Generally speaking, the possible features fall into a few different categories: internal station characteristics (e.g., dock counts), neighborhood demographics, nearby amenities, the street network, and the rest of the bikeshare system itself.
Let's add a few from each category...
Let's use the number of docks per station:
bike_data = bike_data.merge(stations[["kioskId", "totalDocks"]], on="kioskId")
bike_data.head()
geometry | kioskId | start_station | total_start_trips | totalDocks | |
---|---|---|---|---|---|
0 | POINT (-8367189.263 4859227.987) | 3004 | 3004 | 13712 | 30 |
1 | POINT (-8364995.156 4858291.368) | 3005 | 3005 | 8253 | 13 |
2 | POINT (-8371571.911 4858998.543) | 3006 | 3006 | 8843 | 17 |
3 | POINT (-8366765.136 4857977.729) | 3007 | 3007 | 20323 | 20 |
4 | POINT (-8365720.959 4863149.674) | 3008 | 3008 | 4887 | 19 |
We'll try out percent commuting by car first.
import cenpy
acs = cenpy.remote.APIConnection("ACSDT5Y2018")
acs.varslike("B08134").sort_index().iloc[:15]
label | concept | predicateType | group | limit | predicateOnly | attributes | required | |
---|---|---|---|---|---|---|---|---|
B08134_001E | Estimate!!Total | MEANS OF TRANSPORTATION TO WORK BY TRAVEL TIME... | int | B08134 | 0 | NaN | B08134_001EA,B08134_001M,B08134_001MA | NaN |
B08134_002E | Estimate!!Total!!Less than 10 minutes | MEANS OF TRANSPORTATION TO WORK BY TRAVEL TIME... | int | B08134 | 0 | NaN | B08134_002EA,B08134_002M,B08134_002MA | NaN |
B08134_003E | Estimate!!Total!!10 to 14 minutes | MEANS OF TRANSPORTATION TO WORK BY TRAVEL TIME... | int | B08134 | 0 | NaN | B08134_003EA,B08134_003M,B08134_003MA | NaN |
B08134_004E | Estimate!!Total!!15 to 19 minutes | MEANS OF TRANSPORTATION TO WORK BY TRAVEL TIME... | int | B08134 | 0 | NaN | B08134_004EA,B08134_004M,B08134_004MA | NaN |
B08134_005E | Estimate!!Total!!20 to 24 minutes | MEANS OF TRANSPORTATION TO WORK BY TRAVEL TIME... | int | B08134 | 0 | NaN | B08134_005EA,B08134_005M,B08134_005MA | NaN |
B08134_006E | Estimate!!Total!!25 to 29 minutes | MEANS OF TRANSPORTATION TO WORK BY TRAVEL TIME... | int | B08134 | 0 | NaN | B08134_006EA,B08134_006M,B08134_006MA | NaN |
B08134_007E | Estimate!!Total!!30 to 34 minutes | MEANS OF TRANSPORTATION TO WORK BY TRAVEL TIME... | int | B08134 | 0 | NaN | B08134_007EA,B08134_007M,B08134_007MA | NaN |
B08134_008E | Estimate!!Total!!35 to 44 minutes | MEANS OF TRANSPORTATION TO WORK BY TRAVEL TIME... | int | B08134 | 0 | NaN | B08134_008EA,B08134_008M,B08134_008MA | NaN |
B08134_009E | Estimate!!Total!!45 to 59 minutes | MEANS OF TRANSPORTATION TO WORK BY TRAVEL TIME... | int | B08134 | 0 | NaN | B08134_009EA,B08134_009M,B08134_009MA | NaN |
B08134_010E | Estimate!!Total!!60 or more minutes | MEANS OF TRANSPORTATION TO WORK BY TRAVEL TIME... | int | B08134 | 0 | NaN | B08134_010EA,B08134_010M,B08134_010MA | NaN |
B08134_011E | Estimate!!Total!!Car, truck, or van | MEANS OF TRANSPORTATION TO WORK BY TRAVEL TIME... | int | B08134 | 0 | NaN | B08134_011EA,B08134_011M,B08134_011MA | NaN |
B08134_012E | Estimate!!Total!!Car, truck, or van!!Less than... | MEANS OF TRANSPORTATION TO WORK BY TRAVEL TIME... | int | B08134 | 0 | NaN | B08134_012EA,B08134_012M,B08134_012MA | NaN |
B08134_013E | Estimate!!Total!!Car, truck, or van!!10 to 14 ... | MEANS OF TRANSPORTATION TO WORK BY TRAVEL TIME... | int | B08134 | 0 | NaN | B08134_013EA,B08134_013M,B08134_013MA | NaN |
B08134_014E | Estimate!!Total!!Car, truck, or van!!15 to 19 ... | MEANS OF TRANSPORTATION TO WORK BY TRAVEL TIME... | int | B08134 | 0 | NaN | B08134_014EA,B08134_014M,B08134_014MA | NaN |
B08134_015E | Estimate!!Total!!Car, truck, or van!!20 to 24 ... | MEANS OF TRANSPORTATION TO WORK BY TRAVEL TIME... | int | B08134 | 0 | NaN | B08134_015EA,B08134_015M,B08134_015MA | NaN |
# Means of Transportation to Work by Travel Time to Work
variables = ["NAME", "B08134_001E", "B08134_011E"]
# Pull data by block group
census_data = acs.query(
cols=variables,
geo_unit="block group:*",
geo_filter={"state": "42",
"county": "101",
"tract": "*"},
)
# Convert the data to floats
for col in variables[1:]:
census_data[col] = census_data[col].astype(float)
# The percent commuting by car
census_data["percent_car"] = census_data["B08134_011E"] / census_data["B08134_001E"]
census_data.head()
NAME | B08134_001E | B08134_011E | state | county | tract | block group | percent_car | |
---|---|---|---|---|---|---|---|---|
0 | Block Group 3, Census Tract 288, Philadelphia ... | 436.0 | 252.0 | 42 | 101 | 028800 | 3 | 0.577982 |
1 | Block Group 1, Census Tract 288, Philadelphia ... | 389.0 | 182.0 | 42 | 101 | 028800 | 1 | 0.467866 |
2 | Block Group 2, Census Tract 298, Philadelphia ... | 141.0 | 99.0 | 42 | 101 | 029800 | 2 | 0.702128 |
3 | Block Group 4, Census Tract 298, Philadelphia ... | 488.0 | 416.0 | 42 | 101 | 029800 | 4 | 0.852459 |
4 | Block Group 3, Census Tract 298, Philadelphia ... | 231.0 | 165.0 | 42 | 101 | 029800 | 3 | 0.714286 |
Merge with block group geometries for Philadelphia:
acs.set_mapservice("tigerWMS_ACS2018")
Connection to American Community Survey: 1-Year Estimates: Detailed Tables 5-Year(ID: https://api.census.gov/data/id/ACSDT5Y2018) With MapServer: Census Current (2018) WMS
acs.mapservice.layers[10]
(ESRILayer) Census Block Groups
# Use SQL to return geometries only for Philadelphia County in PA
where_clause = "STATE = 42 AND COUNTY = 101"
# Query for block groups
block_groups = acs.mapservice.layers[10].query(where=where_clause)
census_data = block_groups.merge(
census_data,
left_on=["STATE", "COUNTY", "TRACT", "BLKGRP"],
right_on=["state", "county", "tract", "block group"],
)
Finally, let's merge the census data into our dataframe of features by spatially joining the stations and the block groups:
Each station gets the census data associated with the block group the station is within.
bike_data = gpd.sjoin(
bike_data,
census_data.to_crs(bike_data.crs)[["geometry", "percent_car"]],
op="within",
).drop(labels=['index_right'], axis=1)
bike_data.head()
geometry | kioskId | start_station | total_start_trips | totalDocks | percent_car | |
---|---|---|---|---|---|---|
0 | POINT (-8367189.263 4859227.987) | 3004 | 3004 | 13712 | 30 | 0.342105 |
17 | POINT (-8367777.030 4859245.413) | 3021 | 3021 | 27615 | 34 | 0.342105 |
80 | POINT (-8367386.298 4859137.951) | 3108 | 3108 | 22954 | 22 | 0.342105 |
130 | POINT (-8367616.730 4858873.658) | 3202 | 3202 | 2683 | 21 | 0.342105 |
133 | POINT (-8367644.560 4859267.196) | 3205 | 3205 | 128 | 20 | 0.342105 |
Note: scikit-learn contains preprocessing transformers to do more advanced imputation methods. The documentation contains a good description of the options.
In particular, the SimpleImputer object (see the docs) can fill values based on the median, mean, most frequent value, or a constant value.
missing = bike_data['percent_car'].isnull()
bike_data.loc[missing, 'percent_car'] = bike_data['percent_car'].median()
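Equivalently, a minimal sketch of the same median fill using SimpleImputer:
from sklearn.impute import SimpleImputer
# Fill missing percent_car values with the median of the column
imputer = SimpleImputer(strategy="median")
bike_data[["percent_car"]] = imputer.fit_transform(bike_data[["percent_car"]])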
Let's add two new features: the distance to nearby restaurants, and whether the station is within the Center City business district.
Search https://wiki.openstreetmap.org/wiki/Map_Features for the OSM identifier for restaurants: amenity=restaurant.
restaurants = ox.geometries_from_polygon(
city_limits_outline, tags={"amenity": "restaurant"}
).to_crs(epsg=3857)
restaurants.head()
unique_id | osmid | element_type | amenity | created_by | name | source | wheelchair | geometry | cuisine | addr:housenumber | addr:street | craft | microbrewery | outdoor_seating | addr:city | addr:postcode | phone | diet:vegetarian | opening_hours | website | description | addr:state | addr:housename | designation | diet:vegan | fax | bar | takeaway | smoking | alt_name | short_name | name:zh | brand | brand:wikidata | brand:wikipedia | official_name | drive_in | internet_access | payment:cash | delivery | payment:bitcoin | diet:halal | addr:country | air_conditioning | capacity | note | contact:phone | operator | diet:pescetarian | lgbtq | payment:credit_cards | payment:debit_cards | reservation | source:cuisine | diet:kosher | wheelchair:description | amenity_1 | wikidata | name:en | name:ca | internet_access:fee | level | toilets | start_date | payment:american_express | payment:discover_card | payment:mastercard | payment:visa | internet_access:description | internet_access:ssid | addr:unit | opening_hours:covid19 | nodes | building | historic | ship:type | wikipedia | tourism | building:levels | area | building:material | name:ja | name:zn | nonsquare | kitchen_hours | ||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | node/333786044 | 333786044 | node | restaurant | Potlatch 0.10f | Sam's Morning Glory | knowledge | limited | POINT (-8366653.605 4857351.702) | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | node/333786448 | 333786448 | node | restaurant | Potlatch 0.10f | Supper | knowledge | NaN | POINT (-8366545.636 4857610.625) | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | node/566683522 | 566683522 | node | restaurant | NaN | Spring Garden Pizza & Restaurant | NaN | NaN | POINT (-8366499.795 4860429.601) | pizza | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | node/596230881 | 596230881 | node | restaurant | NaN | NaN | NaN | NaN | POINT (-8369759.051 4873925.895) | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | node/596245658 | 596245658 | node | restaurant | NaN | Earth Bread & Brewery | NaN | NaN | POINT (-8370151.987 4874548.737) | NaN | 7136 | Germantown Ave | brewery | yes | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Get x/y values for the stations:
stationsXY = get_xy_from_geometry(bike_data)
# STEP 1: x/y coordinates of restaurants (in EPSG=3857)
restsXY = get_xy_from_geometry(restaurants.to_crs(epsg=3857))
# STEP 2: Initialize the algorithm
nbrs = NearestNeighbors(n_neighbors=10)
# STEP 3: Fit the algorithm on the "neighbors" dataset
nbrs.fit(restsXY)
# STEP 4: Get distances for stations to neighbors
restsDists, restsIndices = nbrs.kneighbors(stationsXY)
# STEP 5: add back to the original dataset
bike_data['logDistRests'] = np.log10(restsDists.mean(axis=1))
bike_data.head()
geometry | kioskId | start_station | total_start_trips | totalDocks | percent_car | logDistRests | |
---|---|---|---|---|---|---|---|
0 | POINT (-8367189.263 4859227.987) | 3004 | 3004 | 13712 | 30 | 0.342105 | 2.593406 |
17 | POINT (-8367777.030 4859245.413) | 3021 | 3021 | 27615 | 34 | 0.342105 | 2.569178 |
80 | POINT (-8367386.298 4859137.951) | 3108 | 3108 | 22954 | 22 | 0.342105 | 2.598887 |
130 | POINT (-8367616.730 4858873.658) | 3202 | 3202 | 2683 | 21 | 0.342105 | 2.249772 |
133 | POINT (-8367644.560 4859267.196) | 3205 | 3205 | 128 | 20 | 0.342105 | 2.619373 |
fig, ax = plt.subplots(figsize=(10,10))
# stations
bike_data.plot(ax=ax, column='logDistRests', legend=True)
# plot the basemap underneath
ctx.add_basemap(ax=ax, crs=bike_data.crs, source=ctx.providers.CartoDB.DarkMatter)
ax.set_axis_off()
The Center City business district boundary is available from OpenDataPhilly:
url = "http://data.phl.opendata.arcgis.com/datasets/95366b115d93443eae4cc6f498cb3ca3_0.geojson"
ccbd = gpd.read_file(url).to_crs(epsg=3857)
ccbd_geo = ccbd.squeeze().geometry
ccbd_geo
# Add the new feature: 1 if within CC and 0 otherwise
bike_data['within_centerCity'] = bike_data.geometry.within(ccbd_geo).astype(int)
# Get the bounding box of the stations in lat/lng coordinates
xmin, ymin, xmax, ymax = bike_data.to_crs(epsg=4326).total_bounds
ox.graph_from_bbox?
Create the graph object from the bounding box:
G = ox.graph_from_bbox(ymax, ymin, xmax, xmin, network_type='bike')
G
<networkx.classes.multidigraph.MultiDiGraph at 0x7fa043f17390>
Let's plot the graph using the built-in plotting from osmnx:
ox.plot_graph(G, figsize=(12, 12), node_size=0);
Now extract the nodes (intersections) from the graph into a GeoDataFrame:
intersections = ox.graph_to_gdfs(G, nodes=True, edges=False).to_crs(epsg=3857)
intersections.head()
y | x | osmid | highway | geometry | |
---|---|---|---|---|---|
109727072 | 39.920114 | -75.176365 | 109727072 | NaN | POINT (-8368594.627 4854340.318) |
109727084 | 39.918876 | -75.176639 | 109727084 | NaN | POINT (-8368625.162 4854160.598) |
109727165 | 39.917601 | -75.176933 | 109727165 | NaN | POINT (-8368657.901 4853975.496) |
109727173 | 39.917476 | -75.176959 | 109727173 | NaN | POINT (-8368660.784 4853957.411) |
109727183 | 39.917126 | -75.177038 | 109727183 | NaN | POINT (-8368669.590 4853906.554) |
Now, compute the average distance to the 10 nearest intersections:
# STEP 1: x/y coordinates of intersections (in EPSG=3857)
intersectionsXY = get_xy_from_geometry(intersections.to_crs(epsg=3857))
# STEP 2: Initialize the algorithm
nbrs = NearestNeighbors(n_neighbors=10)
# STEP 3: Fit the algorithm on the "neighbors" dataset
nbrs.fit(intersectionsXY)
# STEP 4: Get distances for stations to neighbors
interDists, interIndices = nbrs.kneighbors(stationsXY)
# STEP 5: add back to the original dataset
bike_data['logIntersectionDists'] = np.log10(interDists.mean(axis=1))
Let's plot the stations, coloring by the new feature (distance to intersections):
fig, ax = plt.subplots(figsize=(10,10))
# stations
bike_data.plot(ax=ax, column='logIntersectionDists', legend=True)
# plot the basemap underneath
ctx.add_basemap(ax=ax, crs=bike_data.crs, source=ctx.providers.CartoDB.DarkMatter)
ax.set_axis_off()
We will add two new features: the distance to the 5 nearest stations, and the average trip total of those neighboring stations (a spatial lag of the target variable).
First, find the nearest 5 stations:
N = 5
k = N + 1  # each station matches itself first, so ask for one extra neighbor
nbrs = NearestNeighbors(n_neighbors=k)
nbrs.fit(stationsXY)
stationDists, stationIndices = nbrs.kneighbors(stationsXY)
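We can verify that the first match for each station is indeed itself, at distance zero:
# The first neighbor of each station should be itself (distance ~0)
assert np.allclose(stationDists[:, 0], 0)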
The log of the distances to the 5 nearest stations:
bike_data["logStationDists"] = np.log10(stationDists[:, 1:].mean(axis=1))
fig, ax = plt.subplots(figsize=(10,10))
# stations
bike_data.plot(ax=ax, column='logStationDists', legend=True)
# plot the basemap underneath
ctx.add_basemap(ax=ax, crs=bike_data.crs, source=ctx.providers.CartoDB.DarkMatter)
ax.set_axis_off()
Now, let's add the trip counts of the 5 neighboring stations:
Use the indices returned by the NearestNeighbors() algorithm to identify which stations are the 5 neighbors in the original data
stationIndices[:, 1:]
array([[  2,  47, 129,   4,  22],
       [  4,  88,   5, 129,   3],
       [  0,   4, 129,   3,   1],
       ...,
       [ 41,  10,  66,  21, 111]])
# the total trips for the stations from original data frame
total_start_trips = bike_data['total_start_trips'].values
# get the trips for the 5 nearest neighbors (ignoring first match)
neighboring_trips = total_start_trips[stationIndices[:, 1:]]
neighboring_trips
array([[22954, 11139,  1418,   128, 16455],
       [  128, 17070,  2248,  1418,  2683],
       [13712,   128,  1418,  2683, 27615],
       ...,
       [ 9262,  4887, 12182,  2561,  3108]])
# Add to features
bike_data['laggedTrips'] = neighboring_trips.mean(axis=1)
Do the neighboring stations' trip totals correlate with a station's own trips? Yes! We can use seaborn to make a quick plot:
sns.regplot(x=bike_data['laggedTrips'], y=bike_data['total_start_trips']);
Again, use seaborn to investigate.
Remember: we don't want to include multiple features that are highly correlated in our model.
feature_cols = [
"total_start_trips",
"totalDocks",
"percent_car",
"logDistRests",
"within_centerCity",
"logIntersectionDists",
"logStationDists",
"laggedTrips",
]
sns.heatmap(
bike_data[feature_cols].corr(), cmap="coolwarm", annot=True, vmin=-1, vmax=1
)
<AxesSubplot:>
Just as before, the plan is to: split the data into training and test sets, fit the model on the training set, and evaluate its performance on the test set.
We'll use a 60%/40% split, due to the relatively small number of stations.
# Remove unnecessary columns
bike_features = bike_data.drop(labels=["geometry", "kioskId", "start_station"], axis=1)
# Split the data
train_set, test_set = train_test_split(
bike_features,
test_size=0.4,
random_state=12345,
)
# the target labels: log of total start trips
y_train = np.log(train_set["total_start_trips"])
y_test = np.log(test_set["total_start_trips"])
# Extract out only the features we want to use
feature_cols = [
"totalDocks",
"percent_car",
"logDistRests",
"within_centerCity",
"logIntersectionDists",
"logStationDists",
"laggedTrips",
]
train_set = train_set[feature_cols]
test_set = test_set[feature_cols]
Let's run a simple grid search to try to optimize our hyperparameters.
# Setup the pipeline with a standard scaler
pipe = make_pipeline(
StandardScaler(), RandomForestRegressor(random_state=42)
)
Try out a few different values for two of the main parameters for random forests: n_estimators and max_depth:
model_name = "randomforestregressor"
param_grid = {
f"{model_name}__n_estimators": [5, 10, 15, 20, 30],
f"{model_name}__max_depth": [None, 2, 5, 7, 9, 13],
}
param_grid
{'randomforestregressor__n_estimators': [5, 10, 15, 20, 30], 'randomforestregressor__max_depth': [None, 2, 5, 7, 9, 13]}
Important: just like last week, we will need to prefix the parameter name with the name of the pipeline step, in this case, "randomforestregressor".
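If you're unsure what a step is named, the pipeline itself can tell you; for example:
# List the pipeline's step names (these are the prefixes for param_grid)
print(list(pipe.named_steps.keys()))
# ['standardscaler', 'randomforestregressor']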
# Create the grid and use 5-fold CV
grid = GridSearchCV(pipe, param_grid, cv=5)
# Run the search
grid.fit(train_set, y_train);
grid.best_params_
{'randomforestregressor__max_depth': 2, 'randomforestregressor__n_estimators': 10}
# Evaluate the best random forest model
best_random = grid.best_estimator_
grid.score(test_set, y_test)
0.18716529448176544
# Set up a linear pipeline
linear = make_pipeline(StandardScaler(), LinearRegression())
# Fit on train set
linear.fit(train_set, y_train)
# Evaluate on test set
linear.score(test_set, y_test)
0.13285784403476963
From our earlier correlation analysis, we should expect the most important features to be the distance to neighboring stations, the lagged trip counts, and the distance to restaurants.
# The best model
regressor = grid.best_estimator_["randomforestregressor"]
# Create the dataframe with importances
importance = pd.DataFrame(
{"Feature": train_set.columns, "Importance": regressor.feature_importances_}
)
# Sort importance in descending order and get the top
importance = importance.sort_values("Importance", ascending=False)
# Plot
importance.hvplot.barh(x="Feature", y="Importance", flip_yaxis=True, height=300)
importance
Feature | Importance | |
---|---|---|
5 | logStationDists | 0.314664 |
6 | laggedTrips | 0.268413 |
2 | logDistRests | 0.225963 |
4 | logIntersectionDists | 0.101272 |
0 | totalDocks | 0.089687 |
1 | percent_car | 0.000000 |
3 | within_centerCity | 0.000000 |
We'll plot the predicted and actual trip values. Use the test set index (test_set.index) to get the data from the original data frame (bike_data); this ensures we have the geometry info (not used in the modeling) for our test data set:
test_set.index
Int64Index([ 48, 120, 9, 18, 88, 133, 22, 73, 34, 91, 84, 44, 32, 100, 131, 103, 10, 85, 11, 90, 95, 93, 68, 70, 40, 50, 49, 57, 3, 25, 27, 114, 39, 128, 116, 29, 109, 72, 45, 12, 23, 63, 89, 124, 47, 113, 99, 104, 20, 75, 31, 80, 1, 79, 78, 135], dtype='int64')
# Extract the test data from the original dataset
# This will include the geometry data
X = bike_data.loc[test_set.index]
# test data extracted from our original data frame
X.head()
geometry | kioskId | start_station | total_start_trips | totalDocks | percent_car | logDistRests | within_centerCity | logIntersectionDists | logStationDists | laggedTrips | |
---|---|---|---|---|---|---|---|---|---|---|---|
48 | POINT (-8366907.625 4860485.663) | 3059 | 3059 | 13813 | 21 | 0.383838 | 2.450591 | 0 | 1.700616 | 2.829790 | 9414.6 |
120 | POINT (-8366648.250 4858924.483) | 3185 | 3185 | 8328 | 21 | 0.174432 | 2.271648 | 1 | 2.094480 | 2.539369 | 13569.0 |
9 | POINT (-8365428.189 4860591.687) | 3013 | 3013 | 9276 | 17 | 0.565826 | 2.687592 | 0 | 1.931635 | 2.964495 | 5110.2 |
18 | POINT (-8369358.880 4859364.493) | 3022 | 3022 | 20303 | 21 | 0.227273 | 2.818733 | 0 | 1.820301 | 2.774140 | 13132.0 |
88 | POINT (-8373896.262 4862805.375) | 3117 | 3117 | 1524 | 19 | 0.835106 | 3.400058 | 0 | 1.916505 | 3.419593 | 3140.2 |
test_set.head()
totalDocks | percent_car | logDistRests | within_centerCity | logIntersectionDists | logStationDists | laggedTrips | |
---|---|---|---|---|---|---|---|
48 | 21 | 0.383838 | 2.450591 | 0 | 1.700616 | 2.829790 | 9414.6 |
120 | 21 | 0.174432 | 2.271648 | 1 | 2.094480 | 2.539369 | 13569.0 |
9 | 17 | 0.565826 | 2.687592 | 0 | 1.931635 | 2.964495 | 5110.2 |
18 | 21 | 0.227273 | 2.818733 | 0 | 1.820301 | 2.774140 | 13132.0 |
88 | 19 | 0.835106 | 3.400058 | 0 | 1.916505 | 3.419593 | 3140.2 |
Now, make our predictions, and convert them from log to raw counts:
# Predictions for log of total trip counts
log_predictions = grid.best_estimator_.predict(test_set)
# Convert the predicted test values from log
X['prediction'] = np.exp(log_predictions)
Let's make the plot with side-by-side panels for actual and predicted:
# Plot two columns
fig, axs = plt.subplots(ncols=2, figsize=(10,10))
# Predicted values
X.plot(ax=axs[0], column='prediction')
ctx.add_basemap(ax=axs[0], crs=X.crs, source=ctx.providers.CartoDB.DarkMatter)
axs[0].set_title("Predicted Trip Counts")
# Actual values
X.plot(ax=axs[1], column='total_start_trips')
ctx.add_basemap(ax=axs[1], crs=X.crs, source=ctx.providers.CartoDB.DarkMatter)
axs[1].set_title("Actual Trip Counts")
axs[0].set_axis_off()
axs[1].set_axis_off()
Does the model capture the overall spatial trend? Broadly, yes. But this is a classic example of underfitting, for a few reasons: the dataset is small (fewer than 150 stations), and the grid search selected a very simple best-fit model (a max depth of only 2).
When trying to improve the accuracy of the model, another option is incorporating additional data. In this case, we can look to other cities and include trip data from those cities as well; many bikeshare systems publish open trip data in a similar format.