from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
import hvplot.pandas
import esri2gpd
import carto2gpd
import seaborn as sns
np.random.seed(42)
%matplotlib inline
pd.options.display.max_columns = 999
plt.rcParams['figure.figsize'] = (10,6)
Nov 19, 2020
The Thanksgiving holiday will alter our schedule a bit; the Canvas calendar and course home page schedule reflect these changes.
We'll start with an exercise to add additional distance-based features to our housing price model...
The strategy
Distance from each sale to:
- the 5 nearest graffiti 311 calls
- the nearest subway station
- the nearest university or college
- the nearest park centroid
- City Hall
- the 5 nearest new construction permits from 2019
- the 5 nearest aggravated assaults in 2019
- the 5 nearest abandoned vehicle 311 calls in 2019
# Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
# Model selection
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
# Pipelines
from sklearn.pipeline import make_pipeline
# Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
# Neighbors
from sklearn.neighbors import NearestNeighbors
Load and clean the data:
# the CARTO API url
carto_url = "https://phl.carto.com/api/v2/sql"
# The table name
table_name = "opa_properties_public"
# Only pull 2019 sales for single family residential properties
where = "sale_date >= '2019-01-01' AND sale_date < '2020-01-01'"
where = where + " AND category_code_description = 'Single Family'"
# Run the query
salesRaw = carto2gpd.get(carto_url, table_name, where=where)
# Optional: put it in a reproducible order for test/training splits later
salesRaw = salesRaw.sort_values("parcel_number")
# The feature columns we want to use
cols = [
"sale_price",
"total_livable_area",
"total_area",
"garage_spaces",
"fireplaces",
"number_of_bathrooms",
"number_of_bedrooms",
"number_stories",
"exterior_condition",
"zip_code",
"geometry"
]
# Trim to these columns and remove NaNs
sales = salesRaw[cols].dropna()
# Trim zip code to only the first five digits
sales['zip_code'] = sales['zip_code'].astype(str).str.slice(0, 5)
# Trim very low and very high sales
valid = (sales['sale_price'] > 3000) & (sales['sale_price'] < 1e6)
sales = sales.loc[valid]
len(sales)
20131
Add new distance-based features:
def get_xy_from_geometry(df):
"""
Return a numpy array with two columns, where the
first holds the `x` geometry coordinate and the second
column holds the `y` geometry coordinate
Note: this works with both Point() and Polygon() objects.
"""
# NEW: use the centroid.x and centroid.y to support Polygon() and Point() geometries
x = df.geometry.centroid.x
y = df.geometry.centroid.y
return np.column_stack((x, y)) # stack as columns
# Convert to meters and EPSG=3857
sales_3857 = sales.to_crs(epsg=3857)
# Extract x/y for sales
salesXY = get_xy_from_geometry(sales_3857)
Source: https://www.opendataphilly.org/dataset/311-service-and-information-requests
Download the data:
# Select only those for graffiti and in 2019
where = "requested_datetime >= '2019-01-01' AND requested_datetime < '2020-01-01'"
where = where + " AND service_name = 'Graffiti Removal'"
# Pull the subset we want
graffiti = carto2gpd.get(carto_url, "public_cases_fc", where=where)
# Remove rows with missing geometries
graffiti = graffiti.loc[graffiti.geometry.notnull()]
Get the x/y coordinates for the graffiti calls:
# Convert to meters in EPSG=3857
graffiti_3857 = graffiti.to_crs(epsg=3857)
# Extract x/y for graffiti calls
graffitiXY = get_xy_from_geometry(graffiti_3857)
Run the neighbors algorithm to calculate the new feature:
# STEP 1: Initialize the algorithm
nbrs = NearestNeighbors(n_neighbors=5)
# STEP 2: Fit the algorithm on the "neighbors" dataset
nbrs.fit(graffitiXY)
# STEP 3: Get distances from each sale to its neighbors
grafDists, grafIndices = nbrs.kneighbors(salesXY)
# STEP 4: Get average distance to neighbors
avgGrafDist = grafDists.mean(axis=1)
# Set zero distances to be small, but nonzero
# IMPORTANT: THIS WILL AVOID INF DISTANCES WHEN DOING THE LOG
avgGrafDist[avgGrafDist==0] = 1e-5
# STEP 5: Add the new feature
sales['logDistGraffiti'] = np.log10(avgGrafDist)
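Since we'll repeat this five-step recipe for several different datasets, it can help to wrap it in a small helper function. A minimal sketch (the function name and signature are our own, not part of scikit-learn):
def log_mean_neighbor_dist(targetXY, neighborXY, k):
    """
    Return the log10 of the average distance from each
    target point to its k nearest neighbor points.
    """
    nbrs = NearestNeighbors(n_neighbors=k)
    nbrs.fit(neighborXY)
    dists, _ = nbrs.kneighbors(targetXY)
    avg = dists.mean(axis=1)
    avg[avg == 0] = 1e-5  # avoid log10(0)
    return np.log10(avg)
# Equivalent to the steps above:
# sales["logDistGraffiti"] = log_mean_neighbor_dist(salesXY, graffitiXY, k=5)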
Use the osmnx package to get subway stops in Philly; we can use the ox.geometries_from_polygon() function. We'll select elements tagged with station=subway (see the OSM Wiki for details).
import osmnx as ox
Get the geometry polygon for the Philadelphia city limits:
# Download the Philadelphia city limits
url = "https://services.arcgis.com/fLeGjb7u4uXqeF9q/arcgis/rest/services/City_Limits/FeatureServer/0"
city_limits = esri2gpd.get(url).to_crs(epsg=3857)
# Get the geometry from the city limits
city_limits_outline = city_limits.to_crs(epsg=4326).squeeze().geometry
city_limits_outline
Use osmnx to query OpenStreetMap to get all subway stations within the city limits:
# Get the subway stops within the city limits
subway = ox.geometries_from_polygon(city_limits_outline, tags={"station": "subway"})
# Convert to 3857 (meters)
subway = subway.to_crs(epsg=3857)
Get the distance to the nearest subway station ($k=1$):
# STEP 1: x/y coordinates of subway stops (in EPSG=3857)
subwayXY = get_xy_from_geometry(subway.to_crs(epsg=3857))
# STEP 2: Initialize the algorithm
nbrs = NearestNeighbors(n_neighbors=1)
# STEP 3: Fit the algorithm on the "neighbors" dataset
nbrs.fit(subwayXY)
# STEP 4: Get distances from each sale to its neighbors
subwayDists, subwayIndices = nbrs.kneighbors(salesXY)
# STEP 5: add back to the original dataset
sales["logDistSubway"] = np.log10(subwayDists.mean(axis=1))
sales.head()
sale_price | total_livable_area | total_area | garage_spaces | fireplaces | number_of_bathrooms | number_of_bedrooms | number_stories | exterior_condition | zip_code | geometry | logDistGraffiti | logDistSubway | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | 340000 | 1266 | 1622.70 | 1 | 0 | 2 | 3 | 2 | 4 | 19147 | POINT (-75.14854 39.93144) | 2.080292 | 3.338733 |
1 | 397500 | 1436 | 739.68 | 0 | 0 | 1 | 3 | 3 | 2 | 19147 | POINT (-75.14927 39.93033) | 2.232137 | 3.330483 |
4 | 285000 | 868 | 546.00 | 0 | 0 | 1 | 3 | 2 | 2 | 19147 | POINT (-75.14876 39.93011) | 2.202199 | 3.341756 |
6 | 277500 | 890 | 683.70 | 0 | 0 | 1 | 2 | 2 | 3 | 19147 | POINT (-75.14852 39.92954) | 2.092351 | 3.347019 |
7 | 453610 | 1320 | 896.00 | 0 | 0 | 2 | 3 | 3 | 2 | 19147 | POINT (-75.14871 39.92962) | 2.076465 | 3.342907 |
# Numerical columns
num_cols = [
"total_livable_area",
"total_area",
"garage_spaces",
"fireplaces",
"number_of_bathrooms",
"number_of_bedrooms",
"number_stories",
"logDistGraffiti", # NEW
"logDistSubway" # NEW
]
# Categorical columns
cat_cols = ["exterior_condition", "zip_code"]
# Set up the column transformer with two transformers
transformer = ColumnTransformer(
transformers=[
("num", StandardScaler(), num_cols),
("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
]
)
# Initialize the pipeline
# NOTE: only use 20 estimators here so it will run in a reasonable time
pipe = make_pipeline(
transformer, RandomForestRegressor(n_estimators=20, random_state=42)
)
# Split the data 70/30
train_set, test_set = train_test_split(sales, test_size=0.3, random_state=42)
# the target labels
y_train = np.log(train_set["sale_price"])
y_test = np.log(test_set["sale_price"])
# Fit the training set
# REMINDER: use the training dataframe objects here rather than numpy array
pipe.fit(train_set, y_train);
# What's the test score?
# REMINDER: use the test dataframe rather than numpy array
pipe.score(test_set, y_test)
0.6163490515951564
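As a quick sanity check on this single test score, we could also use cross_val_score (imported above) to get a distribution of scores across folds of the training set; a minimal sketch:
# 5-fold cross-validation of the full pipeline on the training set
cv_scores = cross_val_score(pipe, train_set, y_train, cv=5)
print(cv_scores.mean(), cv_scores.std())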
New feature: Distance to the nearest university/college
# Get the data
url = "https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/Universities_Colleges/FeatureServer/0"
univs = esri2gpd.get(url)
# Get the X/Y
univXY = get_xy_from_geometry(univs.to_crs(epsg=3857))
# Run the k nearest algorithm
nbrs = NearestNeighbors(n_neighbors=1)
nbrs.fit(univXY)
univDists, _ = nbrs.kneighbors(salesXY)
# Add the new feature
sales['logDistUniv'] = np.log10(univDists.mean(axis=1))
fig, ax = plt.subplots(figsize=(10,10), facecolor=plt.get_cmap('viridis')(0))
x = salesXY[:,0]
y = salesXY[:,1]
ax.hexbin(x, y, C=np.log10(univDists.mean(axis=1)), gridsize=60)
# Plot the city limits
city_limits.plot(ax=ax, facecolor='none', edgecolor='white', linewidth=4)
ax.set_axis_off()
ax.set_aspect("equal")
ax.set_title("Distance to Nearest University/College", fontsize=16, color='white');
New feature: Distance to the nearest park centroid
Notes
We'll use the x and y coordinates of the park centroids and calculate the distance to these centroids. Use the geometry.centroid.x and geometry.centroid.y values to access these coordinates.
# Get the data
url = "https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/PPR_Assets/FeatureServer/0"
parks = esri2gpd.get(url)
# Get the X/Y
parksXY = get_xy_from_geometry(parks.to_crs(epsg=3857))
# Run the k nearest algorithm
nbrs = NearestNeighbors(n_neighbors=1)
nbrs.fit(parksXY)
parksDists, _ = nbrs.kneighbors(salesXY)
# Add the new feature
sales["logDistParks"] = np.log10(parksDists.mean(axis=1))
fig, ax = plt.subplots(figsize=(10, 10), facecolor=plt.get_cmap("viridis")(0))
x = salesXY[:, 0]
y = salesXY[:, 1]
ax.hexbin(x, y, C=np.log10(parksDists.mean(axis=1)), gridsize=60)
# Plot the city limits
city_limits.plot(ax=ax, facecolor="none", edgecolor="white", linewidth=4)
ax.set_axis_off()
ax.set_aspect("equal")
ax.set_title("Distance to Nearest Park", fontsize=16, color="white");
New feature: Distance to City Hall
Notes
Select City Hall from the city landmarks dataset by querying for NAME = 'City Hall' and FEAT_TYPE = 'Municipal Building'.
# Get the data
url = "https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/CITY_LANDMARKS/FeatureServer/0"
cityHall = esri2gpd.get(
url, where="NAME = 'City Hall' AND FEAT_TYPE = 'Municipal Building'"
)
# Get the X/Y
cityHallXY = get_xy_from_geometry(cityHall.to_crs(epsg=3857))
# Run the k nearest algorithm
nbrs = NearestNeighbors(n_neighbors=1)
nbrs.fit(cityHallXY)
cityHallDist, _ = nbrs.kneighbors(salesXY)
# Add the new feature
sales["logDistCityHall"] = np.log10(cityHallDist.mean(axis=1))
fig, ax = plt.subplots(figsize=(10, 10), facecolor=plt.get_cmap("viridis")(0))
x = salesXY[:, 0]
y = salesXY[:, 1]
ax.hexbin(x, y, C=np.log10(cityHallDist.mean(axis=1)), gridsize=60)
# Plot the city limits
city_limits.plot(ax=ax, facecolor="none", edgecolor="white", linewidth=4)
ax.set_axis_off()
ax.set_aspect("equal")
ax.set_title("Distance to City Hall", fontsize=16, color="white");
New feature: Distance to the 5 nearest new construction permits from 2019
Notes
- Select rows where permitdescription equals 'NEW CONSTRUCTION PERMIT'
- Select permits from 2019 using the permitissuedate column
# Table name
table_name = "permits"
# Where clause
where = "permitissuedate >= '2019-01-01' AND permitissuedate < '2020-01-01'"
where = where + " AND permitdescription='NEW CONSTRUCTION PERMIT'"
# Query
permits = carto2gpd.get(carto_url, table_name, where=where)
# Remove missing
permits = permits.loc[permits.geometry.notnull()]
# Get the X/Y
permitsXY = get_xy_from_geometry(permits.to_crs(epsg=3857))
# Run the k nearest algorithm
nbrs = NearestNeighbors(n_neighbors=5)
nbrs.fit(permitsXY)
permitsDist, _ = nbrs.kneighbors(salesXY)
# Add the new feature
sales["logDistPermits"] = np.log10(permitsDist.mean(axis=1))
fig, ax = plt.subplots(figsize=(10, 10), facecolor=plt.get_cmap("viridis")(0))
x = salesXY[:, 0]
y = salesXY[:, 1]
ax.hexbin(x, y, C=np.log10(permitsDist.mean(axis=1)), gridsize=60)
# Plot the city limits
city_limits.plot(ax=ax, facecolor="none", edgecolor="white", linewidth=4)
ax.set_axis_off()
ax.set_aspect("equal")
ax.set_title("Distance to 5 Closest Building Permits", fontsize=16, color="white");
New feature: Distance to the 5 nearest aggravated assaults in 2019
Notes
- Select rows where Text_General_Code equals 'Aggravated Assault No Firearm' or 'Aggravated Assault Firearm'
- Select incidents from 2019 using the dispatch_date column
# Table name
table_name = "incidents_part1_part2"
# Where selection
where = "dispatch_date >= '2019-01-01' AND dispatch_date < '2020-01-01'"
where = where + " AND Text_General_Code IN ('Aggravated Assault No Firearm', 'Aggravated Assault Firearm')"
# Query
assaults = carto2gpd.get(carto_url, table_name, where=where)
# Remove missing
assaults = assaults.loc[assaults.geometry.notnull()]
# Get the X/Y
assaultsXY = get_xy_from_geometry(assaults.to_crs(epsg=3857))
# Run the k nearest algorithm
nbrs = NearestNeighbors(n_neighbors=5)
nbrs.fit(assaultsXY)
assaultDists, _ = nbrs.kneighbors(salesXY)
# Add the new feature
sales['logDistAssaults'] = np.log10(assaultDists.mean(axis=1))
fig, ax = plt.subplots(figsize=(10, 10), facecolor=plt.get_cmap("viridis")(0))
x = salesXY[:, 0]
y = salesXY[:, 1]
ax.hexbin(x, y, C=np.log10(assaultDists.mean(axis=1)), gridsize=60)
# Plot the city limits
city_limits.plot(ax=ax, facecolor="none", edgecolor="white", linewidth=4)
ax.set_axis_off()
ax.set_aspect("equal")
ax.set_title("Distance to 5 Closest Assaults", fontsize=16, color="white");
New feature: Distance to the 5 nearest abandoned vehicle 311 calls in 2019
Notes
- Select rows where service_name equals 'Abandoned Vehicle'
- Select calls from 2019 using the requested_datetime column
# Table name
table_name = "public_cases_fc"
# Where selection
where = "requested_datetime >= '2019-01-01' AND requested_datetime < '2020-01-01'"
where = where + " AND service_name = 'Abandoned Vehicle'"
# Query
cars = carto2gpd.get(carto_url, table_name, where=where)
# Remove missing
cars = cars.loc[cars.geometry.notnull()]
# Get the X/Y
carsXY = get_xy_from_geometry(cars.to_crs(epsg=3857))
# Run the k nearest algorithm
nbrs = NearestNeighbors(n_neighbors=5)
nbrs.fit(carsXY)
carDists, _ = nbrs.kneighbors(salesXY)
# Handle any sales that have 0 distances
carDists[carDists == 0] = 1e-5 # a small, arbitrary value
# Add the new feature
sales["logDistCars"] = np.log10(carDists.mean(axis=1))
fig, ax = plt.subplots(figsize=(10, 10), facecolor=plt.get_cmap("viridis")(0))
x = salesXY[:, 0]
y = salesXY[:, 1]
ax.hexbin(x, y, C=np.log10(carDists.mean(axis=1)), gridsize=60)
# Plot the city limits
city_limits.plot(ax=ax, facecolor="none", edgecolor="white", linewidth=4)
ax.set_axis_off()
ax.set_aspect("equal")
ax.set_title(
    "Distance to 5 Closest Abandoned Vehicle 311 Calls", fontsize=16, color="white"
);
# Numerical columns
num_cols = [
"total_livable_area",
"total_area",
"garage_spaces",
"fireplaces",
"number_of_bathrooms",
"number_of_bedrooms",
"number_stories",
"logDistGraffiti", # NEW
"logDistSubway", # NEW
"logDistUniv", # NEW
"logDistParks", # NEW
"logDistCityHall", # NEW
"logDistPermits", # NEW
"logDistAssaults", # NEW
"logDistCars" # NEW
]
# Categorical columns
cat_cols = ["exterior_condition", "zip_code"]
# Set up the column transformer with two transformers
transformer = ColumnTransformer(
transformers=[
("num", StandardScaler(), num_cols),
("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
]
)
# Two steps in pipeline: preprocessor and then regressor
pipe = make_pipeline(
transformer, RandomForestRegressor(n_estimators=20, random_state=42)
)
# Split the data 70/30
train_set, test_set = train_test_split(sales, test_size=0.3, random_state=42)
# the target labels
y_train = np.log(train_set["sale_price"])
y_test = np.log(test_set["sale_price"])
# Fit the training set
pipe.fit(train_set, y_train);
# What's the test score?
pipe.score(test_set, y_test)
0.6549287880700467
# The one-hot step
ohe = transformer.named_transformers_["cat"]
# One column for each category type!
ohe_cols = ohe.get_feature_names()
# Full list of columns is numerical + one-hot
features = num_cols + list(ohe_cols)
# The regressor
regressor = pipe["randomforestregressor"]
# Create the dataframe with importances
importance = pd.DataFrame(
{"Feature": features, "Importance": regressor.feature_importances_}
)
# Sort importance in descending order and get the top
importance = importance.sort_values("Importance", ascending=False).iloc[:30]
# Plot
importance.hvplot.barh(
x="Feature", y="Importance", flip_yaxis=True, height=500
)
The technical problem: predict bikeshare trip counts for the Indego bikeshare in Philadelphia
For more info, see this blog post from Pew
Most important: adding new stations in new areas will not affect the demand for existing stations.
This allows the results from the predictive model for demand, built on existing stations, to translate to new stations.
The key assumption is that the bikeshare is not yet at full capacity, and riders in new areas will not decrease the demand in other areas.
Typically, this is a pretty safe assumption. But I encourage you to use historical data to verify it!
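One minimal sketch of such a check, using the all_trips data loaded later in this notebook (with its start_time and start_station columns): compare each existing station's trip totals year over year, and see whether demand held steady as new stations opened.
# Hypothetical check: year-over-year trip totals per station
trips = all_trips.copy()
trips["year"] = pd.to_datetime(trips["start_time"]).dt.year
yearly = trips.groupby(["start_station", "year"]).size().unstack("year")
# For stations active in both years, how did 2019 compare to 2018?
both_years = yearly.dropna()
print((both_years[2019] / both_years[2018]).describe())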
The data page also includes the live status of all of the stations in the system.
API endpoint: http://www.rideindego.com/stations/json
Two important columns:
- totalDocks: the total number of docks at each station
- kioskId: the unique identifier for each station
import requests
# API endpoint
API_endpoint = "http://www.rideindego.com/stations/json/"
# Get the JSON
stations_geojson = requests.get(API_endpoint).json()
# Convert to a GeoDataFrame
stations = gpd.GeoDataFrame.from_features(stations_geojson, crs="EPSG:4326")
stations.head()
geometry | id | name | coordinates | totalDocks | docksAvailable | bikesAvailable | classicBikesAvailable | smartBikesAvailable | electricBikesAvailable | rewardBikesAvailable | rewardDocksAvailable | kioskStatus | kioskPublicStatus | kioskConnectionStatus | kioskType | addressStreet | addressCity | addressState | addressZipCode | bikes | closeTime | eventEnd | eventStart | isEventBased | isVirtual | kioskId | notes | openTime | publicText | timeZone | trikesAvailable | latitude | longitude | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | POINT (-75.16374 39.95378) | 3004 | Municipal Services Building Plaza | [-75.16374, 39.95378] | 30 | 22 | 8 | 8 | 0 | 0 | 8 | 22 | FullService | Active | Active | 1 | 1401 John F. Kennedy Blvd. | Philadelphia | PA | 19102 | [{'dockNumber': 14, 'isElectric': False, 'isAv... | None | None | None | False | False | 3004 | None | None | None | 0 | 39.95378 | -75.16374 | |
1 | POINT (-75.14403 39.94733) | 3005 | Welcome Park, NPS | [-75.14403, 39.94733] | 13 | 7 | 5 | 5 | 0 | 0 | 5 | 7 | FullService | Active | Active | 1 | 191 S. 2nd St. | Philadelphia | PA | 19106 | [{'dockNumber': 1, 'isElectric': False, 'isAva... | None | None | None | False | False | 3005 | None | None | None | 0 | 39.94733 | -75.14403 | |
2 | POINT (-75.20311 39.95220) | 3006 | 40th & Spruce | [-75.20311, 39.9522] | 17 | 13 | 3 | 3 | 0 | 0 | 3 | 13 | FullService | Active | Active | 1 | 246 S. 40th St. | Philadelphia | PA | 19104 | [{'dockNumber': 9, 'isElectric': False, 'isAva... | None | None | None | False | False | 3006 | None | None | None | 0 | 39.95220 | -75.20311 | |
3 | POINT (-75.15993 39.94517) | 3007 | 11th & Pine, Kahn Park | [-75.15993, 39.94517] | 20 | 4 | 16 | 10 | 0 | 6 | 16 | 4 | FullService | Active | Active | 1 | 328 S. 11th St. | Philadelphia | PA | 19107 | [{'dockNumber': 1, 'isElectric': True, 'isAvai... | None | None | None | False | False | 3007 | None | None | None | 0 | 39.94517 | -75.15993 | |
4 | POINT (-75.15055 39.98078) | 3008 | Temple University Station | [-75.15055, 39.98078] | 19 | 15 | 4 | 4 | 0 | 0 | 4 | 15 | FullService | Active | Active | 1 | 1076 Berks Street | Philadelphia | PA | 19122 | [{'dockNumber': 6, 'isElectric': False, 'isAva... | None | None | None | False | False | 3008 | None | None | None | 0 | 39.98078 | -75.15055 |
import contextily as ctx
fig, ax = plt.subplots(figsize=(15, 10))
# stations
stations_3857 = stations.to_crs(epsg=3857)
stations_3857.plot(ax=ax, column='totalDocks', legend=True)
# plot the basemap underneath
ctx.add_basemap(ax=ax, crs=stations_3857.crs, source=ctx.providers.CartoDB.DarkMatter)
ax.set_axis_off()
all_trips = pd.read_csv("./data/indego-trips-2018-2019.csv.tar.gz")
all_trips.head()
trip_id | duration | start_time | end_time | start_station | start_lat | start_lon | end_station | end_lat | end_lon | bike_id | plan_duration | trip_route_category | passholder_type | bike_type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 223869188 | 18 | 2018-01-01 00:24:00 | 2018-01-01 00:42:00 | 3124 | 39.952950 | -75.139793 | 3073 | 39.961430 | -75.152420 | 3708 | 30.0 | One Way | Indego30 | NaN |
1 | 223905597 | 572 | 2018-01-01 00:38:00 | 2018-01-01 10:10:00 | 3023 | 39.950481 | -75.172859 | 3066 | 39.945610 | -75.173477 | 3288 | 365.0 | One Way | Indego365 | NaN |
2 | 223872811 | 22 | 2018-01-01 00:48:00 | 2018-01-01 01:10:00 | 3026 | 39.941380 | -75.145638 | 3023 | 39.950481 | -75.172859 | 11735 | 30.0 | One Way | Indego30 | NaN |
3 | 223872810 | 21 | 2018-01-01 01:03:00 | 2018-01-01 01:24:00 | 3045 | 39.947922 | -75.162369 | 3037 | 39.954239 | -75.161377 | 5202 | 30.0 | One Way | Indego30 | NaN |
4 | 223872809 | 4 | 2018-01-01 01:05:00 | 2018-01-01 01:09:00 | 3115 | 39.972630 | -75.167572 | 3058 | 39.967159 | -75.170013 | 5142 | 30.0 | One Way | Indego30 | NaN |
start_trips = (
all_trips.groupby("start_station").size().reset_index(name="total_start_trips")
)
start_trips.head()
start_station | total_start_trips | |
---|---|---|
0 | 3000 | 469 |
1 | 3004 | 13712 |
2 | 3005 | 8253 |
3 | 3006 | 8843 |
4 | 3007 | 20323 |
# Now merge in the geometry for each station
bike_data = (
stations[["geometry", "kioskId"]]
.merge(start_trips, left_on="kioskId", right_on="start_station")
.to_crs(epsg=3857)
)
Let's plot it...
fig, ax = plt.subplots(figsize=(10,10))
# stations
bike_data.plot(ax=ax, column='total_start_trips', legend=True)
# plot the basemap underneath
ctx.add_basemap(ax=ax, crs=bike_data.crs, source=ctx.providers.CartoDB.DarkMatter)
ax.set_axis_off()
Trips are clearly concentrated in Center City...
There are lots of possible options. Generally speaking, the possible features fall into a few different categories: internal station characteristics (e.g., dock counts), neighborhood demographics, nearby amenities, the street network, and the rest of the bikeshare system itself.
Let's add a few from each category...
Let's use the number of docks per station:
bike_data = bike_data.merge(stations[["kioskId", "totalDocks"]], on="kioskId")
bike_data.head()
geometry | kioskId | start_station | total_start_trips | totalDocks | |
---|---|---|---|---|---|
0 | POINT (-8367189.263 4859227.987) | 3004 | 3004 | 13712 | 30 |
1 | POINT (-8364995.156 4858291.368) | 3005 | 3005 | 8253 | 13 |
2 | POINT (-8371571.911 4858998.543) | 3006 | 3006 | 8843 | 17 |
3 | POINT (-8366765.136 4857977.729) | 3007 | 3007 | 20323 | 20 |
4 | POINT (-8365720.959 4863149.674) | 3008 | 3008 | 4887 | 19 |
We'll try out percent commuting by car first.
import cenpy
acs = cenpy.remote.APIConnection("ACSDT5Y2018")
acs.varslike("B08134").sort_index().iloc[:15]
label | concept | predicateType | group | limit | predicateOnly | attributes | required | |
---|---|---|---|---|---|---|---|---|
B08134_001E | Estimate!!Total | MEANS OF TRANSPORTATION TO WORK BY TRAVEL TIME... | int | B08134 | 0 | NaN | B08134_001EA,B08134_001M,B08134_001MA | NaN |
B08134_002E | Estimate!!Total!!Less than 10 minutes | MEANS OF TRANSPORTATION TO WORK BY TRAVEL TIME... | int | B08134 | 0 | NaN | B08134_002EA,B08134_002M,B08134_002MA | NaN |
B08134_003E | Estimate!!Total!!10 to 14 minutes | MEANS OF TRANSPORTATION TO WORK BY TRAVEL TIME... | int | B08134 | 0 | NaN | B08134_003EA,B08134_003M,B08134_003MA | NaN |
B08134_004E | Estimate!!Total!!15 to 19 minutes | MEANS OF TRANSPORTATION TO WORK BY TRAVEL TIME... | int | B08134 | 0 | NaN | B08134_004EA,B08134_004M,B08134_004MA | NaN |
B08134_005E | Estimate!!Total!!20 to 24 minutes | MEANS OF TRANSPORTATION TO WORK BY TRAVEL TIME... | int | B08134 | 0 | NaN | B08134_005EA,B08134_005M,B08134_005MA | NaN |
B08134_006E | Estimate!!Total!!25 to 29 minutes | MEANS OF TRANSPORTATION TO WORK BY TRAVEL TIME... | int | B08134 | 0 | NaN | B08134_006EA,B08134_006M,B08134_006MA | NaN |
B08134_007E | Estimate!!Total!!30 to 34 minutes | MEANS OF TRANSPORTATION TO WORK BY TRAVEL TIME... | int | B08134 | 0 | NaN | B08134_007EA,B08134_007M,B08134_007MA | NaN |
B08134_008E | Estimate!!Total!!35 to 44 minutes | MEANS OF TRANSPORTATION TO WORK BY TRAVEL TIME... | int | B08134 | 0 | NaN | B08134_008EA,B08134_008M,B08134_008MA | NaN |
B08134_009E | Estimate!!Total!!45 to 59 minutes | MEANS OF TRANSPORTATION TO WORK BY TRAVEL TIME... | int | B08134 | 0 | NaN | B08134_009EA,B08134_009M,B08134_009MA | NaN |
B08134_010E | Estimate!!Total!!60 or more minutes | MEANS OF TRANSPORTATION TO WORK BY TRAVEL TIME... | int | B08134 | 0 | NaN | B08134_010EA,B08134_010M,B08134_010MA | NaN |
B08134_011E | Estimate!!Total!!Car, truck, or van | MEANS OF TRANSPORTATION TO WORK BY TRAVEL TIME... | int | B08134 | 0 | NaN | B08134_011EA,B08134_011M,B08134_011MA | NaN |
B08134_012E | Estimate!!Total!!Car, truck, or van!!Less than... | MEANS OF TRANSPORTATION TO WORK BY TRAVEL TIME... | int | B08134 | 0 | NaN | B08134_012EA,B08134_012M,B08134_012MA | NaN |
B08134_013E | Estimate!!Total!!Car, truck, or van!!10 to 14 ... | MEANS OF TRANSPORTATION TO WORK BY TRAVEL TIME... | int | B08134 | 0 | NaN | B08134_013EA,B08134_013M,B08134_013MA | NaN |
B08134_014E | Estimate!!Total!!Car, truck, or van!!15 to 19 ... | MEANS OF TRANSPORTATION TO WORK BY TRAVEL TIME... | int | B08134 | 0 | NaN | B08134_014EA,B08134_014M,B08134_014MA | NaN |
B08134_015E | Estimate!!Total!!Car, truck, or van!!20 to 24 ... | MEANS OF TRANSPORTATION TO WORK BY TRAVEL TIME... | int | B08134 | 0 | NaN | B08134_015EA,B08134_015M,B08134_015MA | NaN |
# Means of Transportation to Work by Travel Time to Work
variables = ["NAME", "B08134_001E", "B08134_011E"]
# Pull data by block group
census_data = acs.query(
cols=variables,
geo_unit="block group:*",
geo_filter={"state": "42",
"county": "101",
"tract": "*"},
)
# Convert the data to floats
for col in variables[1:]:
census_data[col] = census_data[col].astype(float)
# The percent commuting by car
census_data["percent_car"] = census_data["B08134_011E"] / census_data["B08134_001E"]
census_data.head()
NAME | B08134_001E | B08134_011E | state | county | tract | block group | percent_car | |
---|---|---|---|---|---|---|---|---|
0 | Block Group 3, Census Tract 288, Philadelphia ... | 436.0 | 252.0 | 42 | 101 | 028800 | 3 | 0.577982 |
1 | Block Group 1, Census Tract 288, Philadelphia ... | 389.0 | 182.0 | 42 | 101 | 028800 | 1 | 0.467866 |
2 | Block Group 2, Census Tract 298, Philadelphia ... | 141.0 | 99.0 | 42 | 101 | 029800 | 2 | 0.702128 |
3 | Block Group 4, Census Tract 298, Philadelphia ... | 488.0 | 416.0 | 42 | 101 | 029800 | 4 | 0.852459 |
4 | Block Group 3, Census Tract 298, Philadelphia ... | 231.0 | 165.0 | 42 | 101 | 029800 | 3 | 0.714286 |
Merge with block group geometries for Philadelphia:
acs.set_mapservice("tigerWMS_ACS2018")
Connection to American Community Survey: 1-Year Estimates: Detailed Tables 5-Year(ID: https://api.census.gov/data/id/ACSDT5Y2018) With MapServer: Census Current (2018) WMS
acs.mapservice.layers[10]
(ESRILayer) Census Block Groups
# Use SQL to return geometries only for Philadelphia County in PA
where_clause = "STATE = 42 AND COUNTY = 101"
# Query for block groups
block_groups = acs.mapservice.layers[10].query(where=where_clause)
census_data = block_groups.merge(
census_data,
left_on=["STATE", "COUNTY", "TRACT", "BLKGRP"],
right_on=["state", "county", "tract", "block group"],
)
Finally, let's merge the census data into our dataframe of features by spatially joining the stations and the block groups:
Each station gets the census data associated with the block group the station is within.
bike_data = gpd.sjoin(
bike_data,
census_data.to_crs(bike_data.crs)[["geometry", "percent_car"]],
op="within",
).drop(labels=['index_right'], axis=1)
bike_data.head()
geometry | kioskId | start_station | total_start_trips | totalDocks | percent_car | |
---|---|---|---|---|---|---|
0 | POINT (-8367189.263 4859227.987) | 3004 | 3004 | 13712 | 30 | 0.342105 |
17 | POINT (-8367777.030 4859245.413) | 3021 | 3021 | 27615 | 34 | 0.342105 |
80 | POINT (-8367386.298 4859137.951) | 3108 | 3108 | 22954 | 22 | 0.342105 |
130 | POINT (-8367616.730 4858873.658) | 3202 | 3202 | 2683 | 21 | 0.342105 |
133 | POINT (-8367644.560 4859267.196) | 3205 | 3205 | 128 | 20 | 0.342105 |
Note: scikit-learn contains preprocessing transformers to do more advanced imputation methods. The documentation contains a good description of the options.
In particular, the SimpleImputer object (see the docs) can fill values based on the median, mean, most frequent value, or a constant value.
missing = bike_data['percent_car'].isnull()
bike_data.loc[missing, 'percent_car'] = bike_data['percent_car'].median()
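Equivalently, a minimal sketch of the same median fill using SimpleImputer:
from sklearn.impute import SimpleImputer
# Fill missing percent_car values with the median of the column
imputer = SimpleImputer(strategy="median")
bike_data[["percent_car"]] = imputer.fit_transform(bike_data[["percent_car"]])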
Let's add two new features: the distance to nearby restaurants, and whether the station is within the Center City business district.
Search https://wiki.openstreetmap.org/wiki/Map_Features for the OSM identifier for restaurants: amenity=restaurant.
restaurants = ox.geometries_from_polygon(
city_limits_outline, tags={"amenity": "restaurant"}
).to_crs(epsg=3857)
restaurants.head()
unique_id | osmid | element_type | amenity | created_by | name | source | wheelchair | geometry | cuisine | addr:housenumber | addr:street | craft | microbrewery | outdoor_seating | addr:city | addr:postcode | phone | diet:vegetarian | opening_hours | website | description | addr:state | addr:housename | designation | diet:vegan | fax | bar | takeaway | smoking | alt_name | short_name | name:zh | brand | brand:wikidata | brand:wikipedia | official_name | drive_in | internet_access | payment:cash | delivery | payment:bitcoin | diet:halal | addr:country | air_conditioning | capacity | note | contact:phone | operator | diet:pescetarian | lgbtq | payment:credit_cards | payment:debit_cards | reservation | source:cuisine | diet:kosher | wheelchair:description | amenity_1 | wikidata | name:en | name:ca | internet_access:fee | level | toilets | start_date | payment:american_express | payment:discover_card | payment:mastercard | payment:visa | internet_access:description | internet_access:ssid | addr:unit | opening_hours:covid19 | nodes | building | historic | ship:type | wikipedia | tourism | building:levels | area | building:material | name:ja | name:zn | nonsquare | kitchen_hours | ||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | node/333786044 | 333786044 | node | restaurant | Potlatch 0.10f | Sam's Morning Glory | knowledge | limited | POINT (-8366653.605 4857351.702) | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | node/333786448 | 333786448 | node | restaurant | Potlatch 0.10f | Supper | knowledge | NaN | POINT (-8366545.636 4857610.625) | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | node/566683522 | 566683522 | node | restaurant | NaN | Spring Garden Pizza & Restaurant | NaN | NaN | POINT (-8366499.795 4860429.601) | pizza | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | node/596230881 | 596230881 | node | restaurant | NaN | NaN | NaN | NaN | POINT (-8369759.051 4873925.895) | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | node/596245658 | 596245658 | node | restaurant | NaN | Earth Bread & Brewery | NaN | NaN | POINT (-8370151.987 4874548.737) | NaN | 7136 | Germantown Ave | brewery | yes | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Get x/y values for the stations:
stationsXY = get_xy_from_geometry(bike_data)
# STEP 1: x/y coordinates of restaurants (in EPSG=3857)
restsXY = get_xy_from_geometry(restaurants.to_crs(epsg=3857))
# STEP 2: Initialize the algorithm
nbrs = NearestNeighbors(n_neighbors=10)
# STEP 3: Fit the algorithm on the "neighbors" dataset
nbrs.fit(restsXY)
# STEP 4: Get distances for stations to neighbors
restsDists, restsIndices = nbrs.kneighbors(stationsXY)
# STEP 5: add back to the original dataset
bike_data['logDistRests'] = np.log10(restsDists.mean(axis=1))
bike_data.head()
geometry | kioskId | start_station | total_start_trips | totalDocks | percent_car | logDistRests | |
---|---|---|---|---|---|---|---|
0 | POINT (-8367189.263 4859227.987) | 3004 | 3004 | 13712 | 30 | 0.342105 | 2.593406 |
17 | POINT (-8367777.030 4859245.413) | 3021 | 3021 | 27615 | 34 | 0.342105 | 2.569178 |
80 | POINT (-8367386.298 4859137.951) | 3108 | 3108 | 22954 | 22 | 0.342105 | 2.598887 |
130 | POINT (-8367616.730 4858873.658) | 3202 | 3202 | 2683 | 21 | 0.342105 | 2.249772 |
133 | POINT (-8367644.560 4859267.196) | 3205 | 3205 | 128 | 20 | 0.342105 | 2.619373 |
fig, ax = plt.subplots(figsize=(10,10))
# stations
bike_data.plot(ax=ax, column='logDistRests', legend=True)
# plot the basemap underneath
ctx.add_basemap(ax=ax, crs=bike_data.crs, source=ctx.providers.CartoDB.DarkMatter)
ax.set_axis_off()
The Center City business district boundary is available from OpenDataPhilly:
url = "http://data.phl.opendata.arcgis.com/datasets/95366b115d93443eae4cc6f498cb3ca3_0.geojson"
ccbd = gpd.read_file(url).to_crs(epsg=3857)
ccbd_geo = ccbd.squeeze().geometry
ccbd_geo
# Add the new feature: 1 if within CC and 0 otherwise
bike_data['within_centerCity'] = bike_data.geometry.within(ccbd_geo).astype(int)
# Get the bounding box of the stations in lat/lng coordinates
xmin, ymin, xmax, ymax = bike_data.to_crs(epsg=4326).total_bounds
ox.graph_from_bbox?
Create the graph object from the bounding box:
G = ox.graph_from_bbox(ymax, ymin, xmax, xmin, network_type='bike')
G
<networkx.classes.multidigraph.MultiDiGraph at 0x7fa043f17390>
Let's plot the graph using the built-in plotting from osmnx:
ox.plot_graph(G, figsize=(12, 12), node_size=0);
Now extract the nodes (intersections) from the graph into a GeoDataFrame:
intersections = ox.graph_to_gdfs(G, nodes=True, edges=False).to_crs(epsg=3857)
intersections.head()
y | x | osmid | highway | geometry | |
---|---|---|---|---|---|
109727072 | 39.920114 | -75.176365 | 109727072 | NaN | POINT (-8368594.627 4854340.318) |
109727084 | 39.918876 | -75.176639 | 109727084 | NaN | POINT (-8368625.162 4854160.598) |
109727165 | 39.917601 | -75.176933 | 109727165 | NaN | POINT (-8368657.901 4853975.496) |
109727173 | 39.917476 | -75.176959 | 109727173 | NaN | POINT (-8368660.784 4853957.411) |
109727183 | 39.917126 | -75.177038 | 109727183 | NaN | POINT (-8368669.590 4853906.554) |
Now, compute the average distance to the 10 nearest intersections:
# STEP 1: x/y coordinates of intersections (in EPSG=3857)
intersectionsXY = get_xy_from_geometry(intersections.to_crs(epsg=3857))
# STEP 2: Initialize the algorithm
nbrs = NearestNeighbors(n_neighbors=10)
# STEP 3: Fit the algorithm on the "neighbors" dataset
nbrs.fit(intersectionsXY)
# STEP 4: Get distances for stations to neighbors
interDists, interIndices = nbrs.kneighbors(stationsXY)
# STEP 5: add back to the original dataset
bike_data['logIntersectionDists'] = np.log10(interDists.mean(axis=1))
Let's plot the stations, coloring by the new feature (distance to intersections):
fig, ax = plt.subplots(figsize=(10,10))
# stations
bike_data.plot(ax=ax, column='logIntersectionDists', legend=True)
# plot the basemap underneath
ctx.add_basemap(ax=ax, crs=bike_data.crs, source=ctx.providers.CartoDB.DarkMatter)
ax.set_axis_off()
We will add two new features: the distance to the 5 nearest stations, and the average trip total of those neighboring stations (a spatial lag of the target variable).
First, find the nearest 5 stations:
N = 5
k = N + 1  # each station matches itself first, so ask for one extra neighbor
nbrs = NearestNeighbors(n_neighbors=k)
nbrs.fit(stationsXY)
stationDists, stationIndices = nbrs.kneighbors(stationsXY)
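We can verify that the first match for each station is indeed itself, at distance zero:
# The first neighbor of each station should be itself (distance ~0)
assert np.allclose(stationDists[:, 0], 0)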
The log of the distances to the 5 nearest stations:
bike_data["logStationDists"] = np.log10(stationDists[:, 1:].mean(axis=1))
fig, ax = plt.subplots(figsize=(10,10))
# stations
bike_data.plot(ax=ax, column='logStationDists', legend=True)
# plot the basemap underneath
ctx.add_basemap(ax=ax, crs=bike_data.crs, source=ctx.providers.CartoDB.DarkMatter)
ax.set_axis_off()
Now, let's add the trip counts of the 5 neighboring stations:
Use the indices returned by the NearestNeighbors() algorithm to identify which stations are the 5 neighbors in the original data
stationIndices[:, 1:]
array([[  2,  47, 129,   4,  22],
       [  4,  88,   5, 129,   3],
       [  0,   4, 129,   3,   1],
       ...,
       [ 41,  10,  66,  21, 111]])
# the total trips for the stations from original data frame
total_start_trips = bike_data['total_start_trips'].values
# get the trips for the 5 nearest neighbors (ignoring first match)
neighboring_trips = total_start_trips[stationIndices[:, 1:]]
neighboring_trips
array([[22954, 11139,  1418,   128, 16455],
       [  128, 17070,  2248,  1418,  2683],
       [13712,   128,  1418,  2683, 27615],
       ...,
       [ 9262,  4887, 12182,  2561,  3108]])
# Add to features
bike_data['laggedTrips'] = neighboring_trips.mean(axis=1)
Do the neighboring stations' trip totals correlate with a station's own trips? Yes! We can use seaborn to make a quick plot:
sns.regplot(x=bike_data['laggedTrips'], y=bike_data['total_start_trips']);
Again, use seaborn to investigate.
Remember: we don't want to include multiple features that are highly correlated in our model.
feature_cols = [
"total_start_trips",
"totalDocks",
"percent_car",
"logDistRests",
"within_centerCity",
"logIntersectionDists",
"logStationDists",
"laggedTrips",
]
sns.heatmap(
bike_data[feature_cols].corr(), cmap="coolwarm", annot=True, vmin=-1, vmax=1
)
<AxesSubplot:>
Just as before, the plan is to: split the data into training and test sets, fit the model on the training set, and evaluate its performance on the test set.
We'll use a 60%/40% split, due to the relatively small number of stations.
# Remove unnecessary columns
bike_features = bike_data.drop(labels=["geometry", "kioskId", "start_station"], axis=1)
# Split the data
train_set, test_set = train_test_split(
bike_features,
test_size=0.4,
random_state=12345,
)
# the target labels: log of total start trips
y_train = np.log(train_set["total_start_trips"])
y_test = np.log(test_set["total_start_trips"])
# Extract out only the features we want to use
feature_cols = [
"totalDocks",
"percent_car",
"logDistRests",
"within_centerCity",
"logIntersectionDists",
"logStationDists",
"laggedTrips",
]
train_set = train_set[feature_cols]
test_set = test_set[feature_cols]
Let's run a simple grid search to try to optimize our hyperparameters.
# Setup the pipeline with a standard scaler
pipe = make_pipeline(
StandardScaler(), RandomForestRegressor(random_state=42)
)
Try out a few different values for two of the main parameters for random forests: n_estimators and max_depth:
model_name = "randomforestregressor"
param_grid = {
f"{model_name}__n_estimators": [5, 10, 15, 20, 30],
f"{model_name}__max_depth": [None, 2, 5, 7, 9, 13],
}
param_grid
{'randomforestregressor__n_estimators': [5, 10, 15, 20, 30], 'randomforestregressor__max_depth': [None, 2, 5, 7, 9, 13]}
Important: just like last week, we will need to prefix the parameter name with the name of the pipeline step, in this case, "randomforestregressor".
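If you're unsure what a step is named, the pipeline itself can tell you; for example:
# List the pipeline's step names (these are the prefixes for param_grid)
print(list(pipe.named_steps.keys()))
# ['standardscaler', 'randomforestregressor']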
# Create the grid and use 5-fold CV
grid = GridSearchCV(pipe, param_grid, cv=5)
# Run the search
grid.fit(train_set, y_train);
grid.best_params_
{'randomforestregressor__max_depth': 2, 'randomforestregressor__n_estimators': 10}
# Evaluate the best random forest model
best_random = grid.best_estimator_
grid.score(test_set, y_test)
0.18716529448176544
# Set up a linear pipeline
linear = make_pipeline(StandardScaler(), LinearRegression())
# Fit on train set
linear.fit(train_set, y_train)
# Evaluate on test set
linear.score(test_set, y_test)
0.13285784403476963
From our earlier correlation analysis, we should expect the most important features to be the distance to neighboring stations, the lagged trip counts, and the distance to restaurants.
# The best model
regressor = grid.best_estimator_["randomforestregressor"]
# Create the dataframe with importances
importance = pd.DataFrame(
{"Feature": train_set.columns, "Importance": regressor.feature_importances_}
)
# Sort importance in descending order and get the top
importance = importance.sort_values("Importance", ascending=False)
# Plot
importance.hvplot.barh(x="Feature", y="Importance", flip_yaxis=True, height=300)
importance
Feature | Importance | |
---|---|---|
5 | logStationDists | 0.314664 |
6 | laggedTrips | 0.268413 |
2 | logDistRests | 0.225963 |
4 | logIntersectionDists | 0.101272 |
0 | totalDocks | 0.089687 |
1 | percent_car | 0.000000 |
3 | within_centerCity | 0.000000 |
We'll plot the predicted and actual trip values. Use the test set index (test_set.index) to get the data from the original data frame (bike_data); this ensures we have the geometry info (not used in the modeling) for our test data set:
test_set.index
Int64Index([ 48, 120, 9, 18, 88, 133, 22, 73, 34, 91, 84, 44, 32, 100, 131, 103, 10, 85, 11, 90, 95, 93, 68, 70, 40, 50, 49, 57, 3, 25, 27, 114, 39, 128, 116, 29, 109, 72, 45, 12, 23, 63, 89, 124, 47, 113, 99, 104, 20, 75, 31, 80, 1, 79, 78, 135], dtype='int64')
# Extract the test data from the original dataset
# This will include the geometry data
X = bike_data.loc[test_set.index]
# test data extracted from our original data frame
X.head()
geometry | kioskId | start_station | total_start_trips | totalDocks | percent_car | logDistRests | within_centerCity | logIntersectionDists | logStationDists | laggedTrips | |
---|---|---|---|---|---|---|---|---|---|---|---|
48 | POINT (-8366907.625 4860485.663) | 3059 | 3059 | 13813 | 21 | 0.383838 | 2.450591 | 0 | 1.700616 | 2.829790 | 9414.6 |
120 | POINT (-8366648.250 4858924.483) | 3185 | 3185 | 8328 | 21 | 0.174432 | 2.271648 | 1 | 2.094480 | 2.539369 | 13569.0 |
9 | POINT (-8365428.189 4860591.687) | 3013 | 3013 | 9276 | 17 | 0.565826 | 2.687592 | 0 | 1.931635 | 2.964495 | 5110.2 |
18 | POINT (-8369358.880 4859364.493) | 3022 | 3022 | 20303 | 21 | 0.227273 | 2.818733 | 0 | 1.820301 | 2.774140 | 13132.0 |
88 | POINT (-8373896.262 4862805.375) | 3117 | 3117 | 1524 | 19 | 0.835106 | 3.400058 | 0 | 1.916505 | 3.419593 | 3140.2 |
test_set.head()
totalDocks | percent_car | logDistRests | within_centerCity | logIntersectionDists | logStationDists | laggedTrips | |
---|---|---|---|---|---|---|---|
48 | 21 | 0.383838 | 2.450591 | 0 | 1.700616 | 2.829790 | 9414.6 |
120 | 21 | 0.174432 | 2.271648 | 1 | 2.094480 | 2.539369 | 13569.0 |
9 | 17 | 0.565826 | 2.687592 | 0 | 1.931635 | 2.964495 | 5110.2 |
18 | 21 | 0.227273 | 2.818733 | 0 | 1.820301 | 2.774140 | 13132.0 |
88 | 19 | 0.835106 | 3.400058 | 0 | 1.916505 | 3.419593 | 3140.2 |
Now, make our predictions, and convert them from log to raw counts:
# Predictions for log of total trip counts
log_predictions = grid.best_estimator_.predict(test_set)
# Convert the predicted test values from log
X['prediction'] = np.exp(log_predictions)
Let's make the plot with side-by-side panels for actual and predicted:
# Plot two columns
fig, axs = plt.subplots(ncols=2, figsize=(10,10))
# Predicted values
X.plot(ax=axs[0], column='prediction')
ctx.add_basemap(ax=axs[0], crs=X.crs, source=ctx.providers.CartoDB.DarkMatter)
axs[0].set_title("Predicted Trip Counts")
# Actual values
X.plot(ax=axs[1], column='total_start_trips')
ctx.add_basemap(ax=axs[1], crs=X.crs, source=ctx.providers.CartoDB.DarkMatter)
axs[1].set_title("Actual Trip Counts")
axs[0].set_axis_off()
axs[1].set_axis_off()
Does the model capture the overall spatial trend? Broadly, yes. But this is a classic example of underfitting, for a few reasons: the dataset is small (fewer than 150 stations), and the grid search selected a very simple best-fit model (a max depth of only 2).
When trying to improve the accuracy of the model, another option is incorporating additional data. In this case, we can look to other cities and include trip data from those cities as well; many bikeshare systems publish open trip data in a similar format.