from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
import hvplot.pandas
np.random.seed(42)
%matplotlib inline
pd.options.display.max_columns = 999
plt.rcParams['figure.figsize'] = (10,6)
Nov 17, 2020
The Thanksgiving holiday will alter our schedule a bit; the Canvas calendar and course home page schedule should reflect these changes.
Focus: much more hands-on experience with feature engineering and adding spatially based features
First, let's set up all of the imports we'll need from scikit-learn:
# Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
# Model selection
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
# Pipelines
from sklearn.pipeline import make_pipeline
# Preprocessing
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
Let's download data for single-family properties in Philadelphia that had their last sale during 2019.
Sources:
import carto2gpd
# the CARTO API url
carto_url = "https://phl.carto.com/api/v2/sql"
# The table name
table_name = "opa_properties_public"
# Only pull 2019 sales for single family residential properties
where = "sale_date >= '2019-01-01' AND sale_date < '2020-01-01'"
where = where + " AND category_code_description = 'Single Family'"
# Run the query
salesRaw = carto2gpd.get(carto_url, table_name, where=where)
# Optional: put it in a reproducible order for test/training splits later
salesRaw = salesRaw.sort_values("parcel_number")
salesRaw.head()
(Output: the first five rows of salesRaw — one point geometry per sale plus roughly 75 OPA columns, including sale_date, sale_price, total_livable_area, number_of_bedrooms, exterior_condition, zip_code, and parcel_number.)
len(salesRaw)
25955
# The feature columns we want to use
cols = [
"sale_price",
"total_livable_area",
"total_area",
"garage_spaces",
"fireplaces",
"number_of_bathrooms",
"number_of_bedrooms",
"number_stories",
"exterior_condition",
"zip_code",
"geometry"
]
# Trim to these columns and remove NaNs
sales = salesRaw[cols].dropna()
# Trim zip code to only the first five digits
sales['zip_code'] = sales['zip_code'].astype(str).str.slice(0, 5)
# Trim very low and very high sales
valid = (sales['sale_price'] > 3000) & (sales['sale_price'] < 1e6)
sales = sales.loc[valid]
len(sales)
20131
# Split the data 70/30
train_set, test_set = train_test_split(sales,
test_size=0.3,
random_state=42)
# the target labels: log of sale price
y_train = np.log(train_set["sale_price"])
y_test = np.log(test_set["sale_price"])
# The features
feature_cols = [
"total_livable_area",
"total_area",
"garage_spaces",
"fireplaces",
"number_of_bathrooms",
"number_of_bedrooms",
"number_stories",
]
X_train = train_set[feature_cols].values
X_test = test_set[feature_cols].values
Run a linear regression model as a baseline:
# Make a linear model pipeline
linear_pipeline = make_pipeline(StandardScaler(), LinearRegression())
# Fit on the training data
linear_pipeline.fit(X_train, y_train)
# What's the test score?
linear_pipeline.score(X_test, y_test)
0.17734906621926338
Run cross-validation on a random forest model:
# Make a random forest pipeline
forest_pipeline = make_pipeline(
StandardScaler(), RandomForestRegressor(n_estimators=100, random_state=42)
)
# Run the 10-fold cross validation
scores = cross_val_score(
forest_pipeline,
X_train,
y_train,
cv=10,
)
# Report
print("R^2 scores = ", scores)
print("Scores mean = ", scores.mean())
print("Score std dev = ", scores.std())
R^2 scores =  [0.28774608 0.30557794 0.31837843 0.35032902 0.28971327 0.3553058 0.32448587 0.29367637 0.31938367 0.29449505]
Scores mean =  0.31390915011126685
Score std dev =  0.02308144298765
# Fit on the training data
forest_pipeline.fit(X_train, y_train)
# What's the test score?
forest_pipeline.score(X_test, y_test)
0.3012744612160241
The model appears to generalize reasonably well
Note: we should also be optimizing hyperparameters to see if we can find additional improvements!
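To do that, we could use the GridSearchCV object we imported earlier. Below is a minimal sketch; the parameter grid values are arbitrary examples, not tuned recommendations.
# A minimal sketch of a hyperparameter grid search over the forest pipeline
# NOTE: the values in the grid below are arbitrary examples
param_grid = {
    "randomforestregressor__n_estimators": [50, 100, 200],
    "randomforestregressor__max_depth": [None, 10, 20],
}
grid = GridSearchCV(forest_pipeline, param_grid, cv=3)
grid.fit(X_train, y_train)
print("best params = ", grid.best_params_)
print("best CV score = ", grid.best_score_)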
# Extract the regressor from the pipeline
forest_model = forest_pipeline["randomforestregressor"]
# Create the data frame of importances
importance = pd.DataFrame(
{"Feature": feature_cols, "Importance": forest_model.feature_importances_}
).sort_values("Importance")
importance.hvplot.barh(x="Feature", y="Importance")
We can use a technique called one-hot encoding
Steps:
- The OneHotEncoder object is a preprocessor that will perform the vectorization step
- The ColumnTransformer object will help us apply different transformers to numerical and categorical columns
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
Let's try out the example data of colors:
# Example data of colors
colors = np.array(["red", "green", "blue", "red"])
colors = colors[:, np.newaxis]
colors.shape
(4, 1)
colors
array([['red'], ['green'], ['blue'], ['red']], dtype='<U5')
# Initialize the OHE transformer
ohe = OneHotEncoder()
# Fit the transformer and then transform the colors
ohe.fit_transform(colors).toarray()
array([[0., 0., 1.], [0., 1., 0.], [1., 0., 0.], [0., 0., 1.]])
# The corresponding category for each column
ohe.categories_
[array(['blue', 'green', 'red'], dtype='<U5')]
Let's apply separate transformers for our numerical and categorical columns:
# Numerical columns
num_cols = [
"total_livable_area",
"total_area",
"garage_spaces",
"fireplaces",
"number_of_bathrooms",
"number_of_bedrooms",
"number_stories",
]
# Categorical columns
cat_cols = ["exterior_condition", "zip_code"]
# Set up the column transformer with two transformers
# ----> Scale the numerical columns
# ----> One-hot encode the categorical columns
transformer = ColumnTransformer(
transformers=[
("num", StandardScaler(), num_cols),
("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
]
)
Note: the handle_unknown='ignore' parameter ensures that if categories show up in our test set that were not present in our training set, no error will be raised; the unknown category is simply encoded as all zeros.
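As a quick sanity check (a minimal sketch reusing the colors example from above), an encoder fit with handle_unknown='ignore' encodes an unseen category as a row of all zeros instead of raising an error:
# Fit on the original colors, then transform a value the encoder has never seen
ohe_ignore = OneHotEncoder(handle_unknown="ignore")
ohe_ignore.fit(colors)
# 'purple' was not in the training data -> its row is all zeros; 'red' is encoded normally
ohe_ignore.transform(np.array([["purple"], ["red"]])).toarray()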
Initialize the pipeline object, using the column transformer and the random forest regressor
# Initialize the pipeline
# NOTE: only use 10 estimators here so it will run in a reasonable time
pipe = make_pipeline(
transformer, RandomForestRegressor(n_estimators=10,
random_state=42)
)
Now, let's fit the model.
Important: we pass the train_set and test_set DataFrames directly, rather than the X_train and X_test numpy arrays, because the ColumnTransformer selects columns by name.
# Fit the training set
pipe.fit(train_set, y_train);
# What's the test score?
pipe.score(test_set, y_test)
0.559083987329791
$R^2$ of ~0.30 improved to $R^2$ of ~0.56!
Takeaway: neighborhood-based effects play a crucial role in determining housing prices.
Side Note: to fully validate the model we should run $k$-fold cross validation and optimize hyperparameters of the model as well...
This will be part of assignment #7
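For reference, cross-validation works directly on the full pipeline and the training DataFrame; a minimal sketch (not the full assignment workflow):
# 5-fold cross-validation of the full pipeline on the training DataFrame
cv_scores = cross_val_score(pipe, train_set, y_train, cv=5)
print("mean CV R^2 = ", cv_scores.mean())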
But first, we need to know the column names! The one-hot encoder created a column for each category type...
# The column transformer...
transformer
ColumnTransformer(transformers=[('num', StandardScaler(), ['total_livable_area', 'total_area', 'garage_spaces', 'fireplaces', 'number_of_bathrooms', 'number_of_bedrooms', 'number_stories']), ('cat', OneHotEncoder(handle_unknown='ignore'), ['exterior_condition', 'zip_code'])])
# The steps in the column transformer
transformer.named_transformers_
{'num': StandardScaler(), 'cat': OneHotEncoder(handle_unknown='ignore'), 'remainder': 'drop'}
# The one-hot step
ohe = transformer.named_transformers_['cat']
ohe
OneHotEncoder(handle_unknown='ignore')
# One column for each category type!
ohe_cols = ohe.get_feature_names()
ohe_cols
array(['x0_0', 'x0_1', 'x0_2', 'x0_3', 'x0_4', 'x0_5', 'x0_6', 'x0_7', 'x1_19102', 'x1_19103', 'x1_19104', 'x1_19106', 'x1_19107', 'x1_19111', 'x1_19114', 'x1_19115', 'x1_19116', 'x1_19118', 'x1_19119', 'x1_19120', 'x1_19121', 'x1_19122', 'x1_19123', 'x1_19124', 'x1_19125', 'x1_19126', 'x1_19127', 'x1_19128', 'x1_19129', 'x1_19130', 'x1_19131', 'x1_19132', 'x1_19133', 'x1_19134', 'x1_19135', 'x1_19136', 'x1_19137', 'x1_19138', 'x1_19139', 'x1_19140', 'x1_19141', 'x1_19142', 'x1_19143', 'x1_19144', 'x1_19145', 'x1_19146', 'x1_19147', 'x1_19148', 'x1_19149', 'x1_19150', 'x1_19151', 'x1_19152', 'x1_19153', 'x1_19154'], dtype=object)
# Full list of columns is numerical + one-hot
features = num_cols + list(ohe_cols)
features
['total_livable_area', 'total_area', 'garage_spaces', 'fireplaces', 'number_of_bathrooms', 'number_of_bedrooms', 'number_stories', 'x0_0', 'x0_1', 'x0_2', 'x0_3', 'x0_4', 'x0_5', 'x0_6', 'x0_7', 'x1_19102', 'x1_19103', 'x1_19104', 'x1_19106', 'x1_19107', 'x1_19111', 'x1_19114', 'x1_19115', 'x1_19116', 'x1_19118', 'x1_19119', 'x1_19120', 'x1_19121', 'x1_19122', 'x1_19123', 'x1_19124', 'x1_19125', 'x1_19126', 'x1_19127', 'x1_19128', 'x1_19129', 'x1_19130', 'x1_19131', 'x1_19132', 'x1_19133', 'x1_19134', 'x1_19135', 'x1_19136', 'x1_19137', 'x1_19138', 'x1_19139', 'x1_19140', 'x1_19141', 'x1_19142', 'x1_19143', 'x1_19144', 'x1_19145', 'x1_19146', 'x1_19147', 'x1_19148', 'x1_19149', 'x1_19150', 'x1_19151', 'x1_19152', 'x1_19153', 'x1_19154']
random_forest = pipe["randomforestregressor"]
# Create the dataframe with importances
importance = pd.DataFrame(
{"Feature": features, "Importance": random_forest.feature_importances_}
)
importance.head(n=20)
Feature | Importance | |
---|---|---|
0 | total_livable_area | 0.158573 |
1 | total_area | 0.192858 |
2 | garage_spaces | 0.012014 |
3 | fireplaces | 0.002742 |
4 | number_of_bathrooms | 0.169654 |
5 | number_of_bedrooms | 0.014315 |
6 | number_stories | 0.017482 |
7 | x0_0 | 0.000236 |
8 | x0_1 | 0.006463 |
9 | x0_2 | 0.024824 |
10 | x0_3 | 0.029688 |
11 | x0_4 | 0.012139 |
12 | x0_5 | 0.006955 |
13 | x0_6 | 0.004804 |
14 | x0_7 | 0.014958 |
15 | x1_19102 | 0.001416 |
16 | x1_19103 | 0.003051 |
17 | x1_19104 | 0.002927 |
18 | x1_19106 | 0.003286 |
19 | x1_19107 | 0.001287 |
# Sort by importance and get the top 30
# SORT IN DESCENDING ORDER
importance = importance.sort_values("Importance", ascending=False).iloc[:30]
# Plot
importance.hvplot.barh(x="Feature", y="Importance", height=700, flip_yaxis=True)
Garbage in, garbage out
Takeaway: If your input features are poorly designed (for example, completely unrelated to the thing you want to predict), then no matter how good your machine learning model is or how well you "train" it, the model will never be able to translate the features into a predicted value.
Yes, let's add distance-based features
The strategy
Distance from each sale to nearby features of interest. First up: 311 requests for graffiti removal.
Source: https://www.opendataphilly.org/dataset/311-service-and-information-requests
We'll only pull data from 2019.
# the 311 table
table_name = "public_cases_fc"
# Peek at the first row of data
carto2gpd.get(carto_url, table_name, limit=1)
(Output: a single row showing the available columns — cartodb_id, objectid, service_request_id, status, status_notes, service_name, service_code, agency_responsible, service_notice, requested_datetime, updated_datetime, expected_datetime, address, zipcode, media_url, lat, lon, and geometry.)
# Select only graffiti requests from 2019
where_2019 = "requested_datetime >= '01-01-2019' and requested_datetime < '01-01-2020'"
where_grafitti = "service_name = 'Graffiti Removal'"
where = f"{where_2019} and {where_grafitti}"
# Pull the subset we want
graffiti = carto2gpd.get(carto_url, table_name, where=where)
# Remove rows with missing geometries
graffiti = graffiti.loc[graffiti.geometry.notnull()]
len(graffiti)
16704
graffiti.head()
(Output: the first five rows of the graffiti GeoDataFrame — closed "Graffiti Removal" requests from 2019 with addresses and point geometries.)
We will need to: (1) convert both datasets to the same CRS in meters (EPSG:3857), and (2) extract the x/y coordinates from the geometry column.
# Do the CRS conversion
sales_3857 = sales.to_crs(epsg=3857)
graffiti_3857 = graffiti.to_crs(epsg=3857)
def get_xy_from_geometry(df):
"""
Return a numpy array with two columns, where the
first holds the `x` geometry coordinate and the second
column holds the `y` geometry coordinate
"""
x = df.geometry.x
y = df.geometry.y
return np.column_stack((x, y)) # stack as columns
# Extract x/y for sales
salesXY = get_xy_from_geometry(sales_3857)
# Extract x/y for graffiti calls
graffitiXY = get_xy_from_geometry(graffiti_3857)
salesXY.shape
(20131, 2)
graffitiXY.shape
(16704, 2)
For this, we will use the $k$-nearest neighbors algorithm from scikit-learn. For each sale, we will find the $k$ nearest graffiti calls and record the distances to them.
from sklearn.neighbors import NearestNeighbors
# STEP 1: Initialize the algorithm
k = 5
nbrs = NearestNeighbors(n_neighbors=k)
# STEP 2: Fit the algorithm on the "neighbors" dataset
nbrs.fit(graffitiXY)
# STEP 3: Get distances for sale to neighbors
grafDists, grafIndices = nbrs.kneighbors(salesXY)
Note: I am using k=5 here without any real justification. In practice, you would want to try a few different k values to try to identify the best value to use.
- grafDists: for each sale, the distances to the 5 nearest graffiti calls
- grafIndices: for each sale, the index of each of the 5 neighbors in the original dataset
print("length of sales = ", len(salesXY))
print("shape of grafDists = ", grafDists.shape)
print("shape of grafIndices = ", grafIndices.shape)
length of sales =  20131
shape of grafDists =  (20131, 5)
shape of grafIndices =  (20131, 5)
# The distances from the first sale to the 5 nearest neighbors
grafDists[0]
array([ 63.06999367, 80.52485949, 125.34776876, 164.90764118, 167.68676032])
# The coordinates for the first sale
x0, y0 = salesXY[0]
x0, y0
(-8365497.20665795, 4855985.04901042)
# The indices for the 5 nearest graffiti calls
grafIndices[0]
array([9734, 6213, 1154, 1608, 1449])
# the graffiti neighbors
sale0_neighbors = graffitiXY[grafIndices[0]]
sale0_neighbors
array([[-8365557.65314145, 4856003.05030855], [-8365555.31543214, 4856040.79507179], [-8365412.26988648, 4855892.86545449], [-8365551.75320844, 4856140.67421387], [-8365531.27042213, 4856149.23947774]])
# Access the first and second column for x/y values
neighbors_x = sale0_neighbors[:,0]
neighbors_y = sale0_neighbors[:,1]
# The x/y differences between neighbors and first sale coordinates
dx = (neighbors_x - x0)
dy = (neighbors_y - y0)
# The Euclidean dist
manual_dists = (dx**2 + dy**2) ** 0.5
manual_dists
array([ 63.06999367, 80.52485949, 125.34776876, 164.90764118, 167.68676032])
grafDists[0]
array([ 63.06999367, 80.52485949, 125.34776876, 164.90764118, 167.68676032])
We'll average over the column axis: axis=1
# Average distance to neighbors
avgGrafDist = grafDists.mean(axis=1)
# Set zero distances to be small, but nonzero
# IMPORTANT: THIS WILL AVOID INF DISTANCES WHEN DOING THE LOG
avgGrafDist[avgGrafDist==0] = 1e-5
# Calculate log of distances
sales['logDistGraffiti'] = np.log10(avgGrafDist)
sales.head()
sale_price | total_livable_area | total_area | garage_spaces | fireplaces | number_of_bathrooms | number_of_bedrooms | number_stories | exterior_condition | zip_code | geometry | logDistGraffiti | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | 340000 | 1266 | 1622.70 | 1 | 0 | 2 | 3 | 2 | 4 | 19147 | POINT (-75.14854 39.93144) | 2.080292 |
1 | 397500 | 1436 | 739.68 | 0 | 0 | 1 | 3 | 3 | 2 | 19147 | POINT (-75.14927 39.93033) | 2.232137 |
4 | 285000 | 868 | 546.00 | 0 | 0 | 1 | 3 | 2 | 2 | 19147 | POINT (-75.14876 39.93011) | 2.202199 |
6 | 277500 | 890 | 683.70 | 0 | 0 | 1 | 2 | 2 | 3 | 19147 | POINT (-75.14852 39.92954) | 2.092351 |
7 | 453610 | 1320 | 896.00 | 0 | 0 | 2 | 3 | 3 | 2 | 19147 | POINT (-75.14871 39.92962) | 2.076465 |
# Load the City Limits to plot too
import esri2gpd
# From OpenDataPhilly's page
url = "https://services.arcgis.com/fLeGjb7u4uXqeF9q/arcgis/rest/services/City_Limits/FeatureServer/0"
city_limits = esri2gpd.get(url).to_crs(epsg=3857)
fig, ax = plt.subplots(figsize=(10, 10), facecolor=plt.get_cmap("viridis")(0))
# Plot the log of the Graffiti distance
x = salesXY[:, 0]
y = salesXY[:, 1]
ax.hexbin(x, y, C=sales["logDistGraffiti"].values, gridsize=60)
# Plot the city limits
city_limits.plot(ax=ax, facecolor="none", edgecolor="white", linewidth=4)
ax.set_axis_off()
ax.set_aspect("equal")
Use the osmnx package to get subway stops in Philly — we can use the ox.geometries_from_polygon() function.
- station=subway: see the OSM Wiki for details on this tag
import osmnx as ox
# Get the geometry from the city limits
city_limits_outline = city_limits.to_crs(epsg=4326).squeeze().geometry
city_limits_outline
# Get the subway stops within the city limits
subway = ox.geometries_from_polygon(city_limits_outline, tags={"station": "subway"})
# Convert to 3857 (meters)
subway = subway.to_crs(epsg=3857)
subway.head()
(Output: the first five rows of the subway GeoDataFrame — OSM station nodes for SEPTA and PATCO with name, operator, station=subway tags, and point geometries in EPSG:3857.)
fig, ax = plt.subplots(figsize=(6,6))
# Plot the subway locations
subway.plot(ax=ax, markersize=10, color='crimson')
# City limits, too
city_limits.plot(ax=ax, facecolor='none', edgecolor='black', linewidth=4)
ax.set_axis_off()
The stops on the Market-Frankford and Broad St. subway lines!
We'll use $k=1$ to get the distance to the nearest stop.
# STEP 1: x/y coordinates of subway stops (in EPGS=3857)
subwayXY = get_xy_from_geometry(subway.to_crs(epsg=3857))
# STEP 2: Initialize the algorithm
nbrs = NearestNeighbors(n_neighbors=1)
# STEP 3: Fit the algorithm on the "neighbors" dataset
nbrs.fit(subwayXY)
# STEP 4: Get distances for sale to neighbors
subwayDists, subwayIndices = nbrs.kneighbors(salesXY)
# STEP 5: add back to the original dataset
sales["logDistSubway"] = np.log10(subwayDists[:, 0])
fig, ax = plt.subplots(figsize=(10,10), facecolor=plt.get_cmap('viridis')(0))
# Plot the log of the subway distance
x = salesXY[:,0]
y = salesXY[:,1]
ax.hexbin(x, y, C=sales['logDistSubway'].values, gridsize=60)
# Plot the city limits
city_limits.plot(ax=ax, facecolor='none', edgecolor='white', linewidth=4)
ax.set_axis_off()
ax.set_aspect("equal")
Looks like it worked!
Let's have a look at the correlations of numerical columns:
import seaborn as sns
cols = [
"total_livable_area",
"total_area",
"garage_spaces",
"fireplaces",
"number_of_bathrooms",
"number_of_bedrooms",
"number_stories",
"logDistGraffiti", # NEW
"logDistSubway", # NEW
"sale_price"
]
sns.heatmap(sales[cols].corr(), cmap='coolwarm', annot=True, vmin=-1, vmax=1);
# Numerical columns
num_cols = [
"total_livable_area",
"total_area",
"garage_spaces",
"fireplaces",
"number_of_bathrooms",
"number_of_bedrooms",
"number_stories",
"logDistGraffiti", # NEW
"logDistSubway" # NEW
]
# Categorical columns
cat_cols = ["exterior_condition", "zip_code"]
# Set up the column transformer with two transformers
transformer = ColumnTransformer(
transformers=[
("num", StandardScaler(), num_cols),
("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
]
)
# Initialize the pipeline
# NOTE: only use 20 estimators here so it will run in a reasonable time
pipe = make_pipeline(
transformer, RandomForestRegressor(n_estimators=20, random_state=42)
)
# Split the data 70/30
train_set, test_set = train_test_split(sales, test_size=0.3, random_state=42)
# the target labels
y_train = np.log(train_set["sale_price"])
y_test = np.log(test_set["sale_price"])
# Fit the training set
# REMINDER: use the training dataframe objects here rather than numpy array
pipe.fit(train_set, y_train);
# What's the test score?
# REMINDER: use the test dataframe rather than numpy array
pipe.score(test_set, y_test)
0.6163490515951564
$R^2$ of ~0.56 improved to $R^2$ of ~0.62!
def plot_feature_importances(pipeline, num_cols, transformer, top=20, **kwargs):
"""
Utility function to plot the feature importances from the input
random forest regressor.
Parameters
----------
pipeline :
the pipeline object
num_cols :
list of the numerical columns
transformer :
the transformer preprocessing step
top : optional
the number of importances to plot
**kwargs : optional
extra keywords passed to the hvplot function
"""
# The one-hot step
ohe = transformer.named_transformers_["cat"]
# One column for each category type!
ohe_cols = ohe.get_feature_names()
# Full list of columns is numerical + one-hot
features = num_cols + list(ohe_cols)
# The regressor
regressor = pipeline["randomforestregressor"]
# Create the dataframe with importances
importance = pd.DataFrame(
{"Feature": features, "Importance": regressor.feature_importances_}
)
# Sort importance in descending order and get the top
importance = importance.sort_values("Importance", ascending=False).iloc[:top]
# Plot
return importance.hvplot.barh(
x="Feature", y="Importance", flip_yaxis=True, **kwargs
)
plot_feature_importances(pipe, num_cols, transformer, top=30, height=500)
Modify the get_xy_from_geometry()
function to use the "centroid" of the geometry column.
Note: you can take the centroid of a Point() or Polygon() object. For a Point(), you just get the x/y coordinates back.
def get_xy_from_geometry(df):
"""
Return a numpy array with two columns, where the
first holds the `x` geometry coordinate and the second
column holds the `y` geometry coordinate
"""
# NEW: use the centroid.x and centroid.y to support Polygon() and Point() geometries
x = df.geometry.centroid.x
y = df.geometry.centroid.y
return np.column_stack((x, y)) # stack as columns
New feature: Distance to the nearest university/college
New feature: Distance to the nearest park centroid
Notes
- Get the x and y coordinates of the park centroids and calculate the distance to these centroids.
- You can use the geometry.centroid.x and geometry.centroid.y values to access these coordinates.
- The sketch after this list of exercises illustrates the general pattern for these distance-based features.
New feature: Distance to City Hall.
Notes
New feature: Distance to the 5 nearest new construction permits from 2019
Notes
- Select permits where permitdescription equals 'NEW CONSTRUCTION PERMIT'
- Select permits from 2019 using the permitissuedate column
New feature: Distance to the 5 nearest aggravated assaults in 2019
Notes
- Select crimes where Text_General_Code equals 'Aggravated Assault No Firearm' or 'Aggravated Assault Firearm'
- Select incidents from 2019 using the dispatch_date column
New feature: Distance to the 5 nearest abandoned vehicle 311 calls in 2019
Notes
- Select 311 requests where service_name equals 'Abandoned Vehicle'
- Select requests from 2019 using the requested_datetime column
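All of these features follow the same recipe we used for the graffiti and subway distances. Below is a minimal sketch of that general pattern, assuming the relevant dataset has already been downloaded and filtered into a GeoDataFrame called neighbors_gdf (a placeholder name).
# Generic pattern for a new distance-based feature (neighbors_gdf is a placeholder)
# Get x/y coordinates in meters, using centroids so Polygon geometries work too
neighborsXY = get_xy_from_geometry(neighbors_gdf.to_crs(epsg=3857))
# Fit the k nearest neighbors algorithm (k=1 for "nearest", k=5 for "5 nearest")
k = 5
nbrs = NearestNeighbors(n_neighbors=k)
nbrs.fit(neighborsXY)
# Distances from each sale to its k nearest neighbors
dists, _ = nbrs.kneighbors(salesXY)
# Average over the neighbors, guard against zero distances, and take the log
avgDist = dists.mean(axis=1)
avgDist[avgDist == 0] = 1e-5
sales["logDistNewFeature"] = np.log10(avgDist)  # hypothetical column name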