In [1]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
np.random.seed(42)
%matplotlib inline
In [2]:
plt.rcParams['figure.figsize'] = (10,6)

Week 10: Clustering Analysis in Python

Nov 3, 2020

Housekeeping

  • Assignment #5 (optional) due on Thursday (11/5)
  • Assignment #6 (optional) will be assigned on Thursday
  • Remember, you have to do one of homeworks #4, #5, or #6

Recap

  • Last week: urban street networks + interactive web maps
  • New tools: OSMnx, Pandana, and Folium

Where we left off last week: choropleth maps in Folium

Example: A map of households without internet for US counties

Two ways:

  • The easy way: folium.Choropleth
  • The hard way: folium.GeoJson
In [179]:
import folium

Load the data, remove counties with no households and add our new column:

In [180]:
# Load CSV data from the data/ folder
census_data = pd.read_csv("./data/internet_avail_census.csv", dtype={"geoid": str})

# Remove counties with no households
valid = census_data['universe'] > 0
census_data = census_data.loc[valid]

# Calculate the percent without internet
census_data['percent_no_internet'] = census_data['no_internet'] / census_data['universe']
In [181]:
census_data.head()
Out[181]:
NAME universe no_internet state county geoid percent_no_internet
0 Washington County, Mississippi 18299.0 6166.0 28 151 28151 0.336958
1 Perry County, Mississippi 4563.0 1415.0 28 111 28111 0.310103
2 Choctaw County, Mississippi 3164.0 1167.0 28 19 28019 0.368837
3 Itawamba County, Mississippi 8706.0 1970.0 28 57 28057 0.226281
4 Carroll County, Mississippi 3658.0 1218.0 28 15 28015 0.332969

Load the counties geometry data too:

In [182]:
# Load counties GeoJSOn from the data/ folder
counties = gpd.read_file("./data/us-counties-10m.geojson")
In [183]:
counties.head()
Out[183]:
id geometry
0 53073 MULTIPOLYGON (((-120.85361 49.00011, -120.7674...
1 30105 POLYGON ((-106.11238 48.99904, -106.15187 48.8...
2 30029 POLYGON ((-114.06985 48.99904, -114.05908 48.8...
3 16021 POLYGON ((-116.04755 49.00065, -116.04755 48.5...
4 30071 POLYGON ((-107.17840 49.00011, -107.20712 48.9...

The hard way: use folium.GeoJson

  • The good:
    • More customizable, and can add user interaction
  • The bad:

The steps involved

  1. Join data and geometry features into a single GeoDataFrame
  2. Define a function to style features based on data values
  3. Create GeoJSON layer and add it to the map

Step 1: Join the census data and features

Note: this is different than using folium.Choropleth, where data and features are stored in two separate data frames.

In [184]:
# Merge the county geometries with census data
# Left column: "id"
# Right column: "geoid"
census_joined = counties.merge(census_data, left_on="id", right_on="geoid")
In [185]:
census_joined.head()
Out[185]:
id geometry NAME universe no_internet state county geoid percent_no_internet
0 53073 MULTIPOLYGON (((-120.85361 49.00011, -120.7674... Whatcom County, Washington 85008.0 9189.0 53 73 53073 0.108096
1 30105 POLYGON ((-106.11238 48.99904, -106.15187 48.8... Valley County, Montana 3436.0 672.0 30 105 30105 0.195576
2 30029 POLYGON ((-114.06985 48.99904, -114.05908 48.8... Flathead County, Montana 38252.0 5662.0 30 29 30029 0.148018
3 16021 POLYGON ((-116.04755 49.00065, -116.04755 48.5... Boundary County, Idaho 4605.0 1004.0 16 21 16021 0.218024
4 30071 POLYGON ((-107.17840 49.00011, -107.20712 48.9... Phillips County, Montana 1770.0 484.0 30 71 30071 0.273446

Step 2: Normalize the data column to 0 to 1

  • We will use a matplotlib color map that requires data to be between 0 and 1
  • Normalize our "percent_no_internet" column to be between 0 and 1
In [186]:
# Minimum
min_val = census_joined['percent_no_internet'].min()

# Maximum
max_val = census_joined['percent_no_internet'].max()

# Calculate a normalized column
normalized = (census_joined['percent_no_internet'] - min_val) / (max_val - min_val)

# Add to the dataframe
census_joined['percent_no_internet_normalized'] = normalized

Step 3: Define our style functions

  • Create a matplotlib colormap object using plt.get_cmap()
  • Color map objects are functions: give the function a number between 0 and 1 and it will return a corresponding color from the color map
  • Based on the feature data, evaluate the color map and convert to a hex string
In [187]:
import matplotlib.colors as mcolors
In [188]:
# Use a red-purple colorbrewer color scheme
cmap = plt.get_cmap('RdPu')
In [189]:
# The minimum value of the color map as an RGB tuple
cmap(0)
Out[189]:
(1.0, 0.9686274509803922, 0.9529411764705882, 1.0)
In [190]:
# The minimum value of the color map as a hex string
mcolors.rgb2hex(cmap(0.0))
Out[190]:
'#fff7f3'
In [191]:
# The maximum value of the color map as a hex string
mcolors.rgb2hex(cmap(1.0))
Out[191]:
'#49006a'
In [192]:
def get_style(feature):
    """
    Given an input GeoJSON feature, return a style dict.
    
    Notes
    -----
    The color in the style dict is determined by the 
    "percent_no_internet_normalized" column in the 
    input "feature".
    """
    # Get the data value from the feature
    value = feature['properties']['percent_no_internet_normalized']
    
    # Evaluate the color map
    # NOTE: value must between 0 and 1
    rgb_color = cmap(value) # this is an RGB tuple
    
    # Convert to hex string
    color = mcolors.rgb2hex(rgb_color)
    
    # Return the style dictionary
    return {'weight': 0.5, 'color': color, 'fillColor': color, "fillOpacity": 0.75}
In [193]:
def get_highlighted_style(feature):
    """
    Return a style dict to use when the user highlights a 
    feature with the mouse.
    """
    
    return {"weight": 3, "color": "black"}

Step 4: Convert our data to GeoJSON

  • Tip: To limit the amount of data Folium has to process, it's best to trim our GeoDataFrame to only the columns we'll need before converting to GeoJSON
  • You can use the .to_json() function to convert to a GeoJSON string
In [197]:
needed_cols = ['NAME', 'percent_no_internet', 'percent_no_internet_normalized', 'geometry']
census_json = census_joined[needed_cols].to_json()
In [198]:
# STEP 1: Initialize the map
m = folium.Map(location=[40, -98], zoom_start=4)

# STEP 2: Add the GeoJson to the map
folium.GeoJson(
    census_json, # The geometry + data columns in GeoJSON format
    style_function=get_style, # The style function to color counties differently
    highlight_function=get_highlighted_style, 
    tooltip=folium.GeoJsonTooltip(['NAME', 'percent_no_internet'])
).add_to(m)



# avoid a rendering bug by saving as HTML and re-loading
m.save('percent_no_internet.html')

And viola!

The hard way is harder, but we have a tooltip and highlight interactivity!

In [199]:
from IPython.display import IFrame
In [200]:
IFrame('percent_no_internet.html', width=800, height=500)
Out[200]:

At-home exercise: Can we repeat this with altair?

Try to replicate the above interactive map exactly (minus the background tiles). This includes:

  • Using the red-purple colorbrewer scheme
  • Having a tooltip with the percentage and county name

Note: Altair's syntax is similar to the folium.Choropleth syntax — you should pass the counties GeoDataFrame to the alt.Chart() and then use the transform_lookup() to merge those geometries to the census data and pull in the census data column we need ("percent_without_internet").

Hints

In [201]:
import altair as alt
In [202]:
# Initialize the chart with the counties data
alt.Chart(counties).mark_geoshape(stroke="white", strokeWidth=0.25).encode(
    # Encode the color 
    color=alt.Color(
        "percent_no_internet:Q",
        title="Percent Without Internet",
        scale=alt.Scale(scheme="redpurple"),
        legend=alt.Legend(format=".0%")
    ),
    # Tooltip
    tooltip=[
        alt.Tooltip("NAME:N", title="Name"),
        alt.Tooltip("percent_no_internet:Q", title="Percent Without Internet", format=".1%"),
    ],
).transform_lookup(
    lookup="id", # The column name in the counties data to match on
    from_=alt.LookupData(census_data, "geoid", ["percent_no_internet", "NAME"]), # Match census data on "geoid"
).project(
    type="albersUsa"
).properties(
    width=700, height=500
)
Out[202]:

Leaflet/Folium plugins

One of leaflet's strengths: a rich set of open-source plugins

https://leafletjs.com/plugins.html

Many of these are available in Folium!

Example: Heatmap

In [203]:
from folium.plugins import HeatMap
In [204]:
HeatMap?

Example: A heatmap of new construction permits in Philadelphia in the last 30 days

New construction in Philadelphia requires a building permit, which we can pull from Open Data Philly.

Step 1: Download the data from CARTO

The "current_date" variable in SQL databases

You can use the pre-defined "current_date" variable to get the current date. For example, to get the permits from the past 30 days, we could do:

SELECT * FROM permits WHERE permitissuedate >= current_date - 30

Selecting only new construction permits

To select new construction permits, you can use the "permitdescription" field. There are two relevant categories:

  • "RESIDENTIAL BUILDING PERMIT"
  • "COMMERCIAL BUILDING PERMIT"

We can use the SQL IN command (documentation) to easily select rows that have these categories.

In [205]:
import carto2gpd
In [208]:
# API URL
url = "https://phl.carto.com/api/v2/sql"

# Table name on CARTO
table_name = "permits"

# The where clause, with two parts
DAYS = 30
where = f"permitissuedate >= current_date - {DAYS}"
where += " and permitdescription IN ('RESIDENTIAL BUILDING PERMIT', 'COMMERCIAL BUILDING PERMIT')"

where
Out[208]:
"permitissuedate >= current_date - 30 and permitdescription IN ('RESIDENTIAL BUILDING PERMIT', 'COMMERCIAL BUILDING PERMIT')"
In [212]:
# Run the query
permits = carto2gpd.get(url, table_name, where=where)
In [213]:
len(permits)
Out[213]:
774
In [214]:
permits.head()
Out[214]:
geometry cartodb_id objectid permitnumber addressobjectid parcel_id_num permittype permitdescription commercialorresidential typeofwork ... address unit_type unit_num zip censustract council_district opa_owner systemofrecord geocode_x geocode_y
0 POINT (-75.09823 39.99006) 4524 5031 RP-2020-010982 15333418 86114 RESIDENTIAL BUILDING RESIDENTIAL BUILDING PERMIT RESIDENTIAL NEW CONSTRUCTION ... 2500 E TIOGA ST None None 19134-5410 379 1 RICHMOND SQUARE ASSOCIATE ECLIPSE 2.711417e+06 250392.041887
1 POINT (-75.17882 40.05173) 4525 5032 RP-2020-013321 132652476 444786 RESIDENTIAL BUILDING RESIDENTIAL BUILDING PERMIT RESIDENTIAL ADDITION AND/OR ALTERATION ... 6465 MUSGRAVE ST None None 19119-2332 252 8 OMOWALE MANU, OMOWALE JACQUELINE ECLIPSE 2.688192e+06 272179.430982
2 POINT (-75.17833 39.93475) 4675 5357 1053816 128023588 1343565 RESIDENTIAL BUILDING RESIDENTIAL BUILDING PERMIT RESIDENTIAL NEW CONSTRUCTION ... 1336 S CAPITOL ST None None 19146-4307 31 2 REDEVELOPMENT AUTHORITY ECLIPSE 2.689571e+06 229588.497715
3 POINT (-75.16049 39.92359) 4879 5020 RP-2020-003363 135434285 514438 RESIDENTIAL BUILDING RESIDENTIAL BUILDING PERMIT RESIDENTIAL ADDITION AND/OR ALTERATION ... 2035 S DARIEN ST None None 19148-2338 41.01 1 NOVELLI DOMENIC, NOVELLI RUTH ECLIPSE 2.694689e+06 225669.572285
4 POINT (-75.16101 39.99039) 4902 5023 RP-2020-006048 132131445 230070 RESIDENTIAL BUILDING RESIDENTIAL BUILDING PERMIT RESIDENTIAL ADDITION AND/OR ALTERATION ... 2410 N COLORADO ST None None 19132-4309 167.01 5 TOLIVER BROTHERS LLC ECLIPSE 2.693832e+06 249990.520054

5 rows × 32 columns

Step 2: Remove missing geometries

Some permits don't have locations — use the .geometry.notnull() function to trim the data frame to those incidents with valid geometries.

In [215]:
permits = permits.loc[permits.geometry.notnull()].copy()

Step 3: Extract out the lat/lng coordinates

Note: We need an array of (latitude, longitude) pairs. Be careful about the order!

In [219]:
# Extract the lat and longitude from the geometery column
permits['lat'] = permits.geometry.y
permits['lng'] = permits.geometry.x
In [220]:
permits.head()
Out[220]:
geometry cartodb_id objectid permitnumber addressobjectid parcel_id_num permittype permitdescription commercialorresidential typeofwork ... unit_num zip censustract council_district opa_owner systemofrecord geocode_x geocode_y lat lng
0 POINT (-75.09823 39.99006) 4524 5031 RP-2020-010982 15333418 86114 RESIDENTIAL BUILDING RESIDENTIAL BUILDING PERMIT RESIDENTIAL NEW CONSTRUCTION ... None 19134-5410 379 1 RICHMOND SQUARE ASSOCIATE ECLIPSE 2.711417e+06 250392.041887 39.990062 -75.098232
1 POINT (-75.17882 40.05173) 4525 5032 RP-2020-013321 132652476 444786 RESIDENTIAL BUILDING RESIDENTIAL BUILDING PERMIT RESIDENTIAL ADDITION AND/OR ALTERATION ... None 19119-2332 252 8 OMOWALE MANU, OMOWALE JACQUELINE ECLIPSE 2.688192e+06 272179.430982 40.051730 -75.178822
2 POINT (-75.17833 39.93475) 4675 5357 1053816 128023588 1343565 RESIDENTIAL BUILDING RESIDENTIAL BUILDING PERMIT RESIDENTIAL NEW CONSTRUCTION ... None 19146-4307 31 2 REDEVELOPMENT AUTHORITY ECLIPSE 2.689571e+06 229588.497715 39.934752 -75.178325
3 POINT (-75.16049 39.92359) 4879 5020 RP-2020-003363 135434285 514438 RESIDENTIAL BUILDING RESIDENTIAL BUILDING PERMIT RESIDENTIAL ADDITION AND/OR ALTERATION ... None 19148-2338 41.01 1 NOVELLI DOMENIC, NOVELLI RUTH ECLIPSE 2.694689e+06 225669.572285 39.923588 -75.160494
4 POINT (-75.16101 39.99039) 4902 5023 RP-2020-006048 132131445 230070 RESIDENTIAL BUILDING RESIDENTIAL BUILDING PERMIT RESIDENTIAL ADDITION AND/OR ALTERATION ... None 19132-4309 167.01 5 TOLIVER BROTHERS LLC ECLIPSE 2.693832e+06 249990.520054 39.990392 -75.161006

5 rows × 34 columns

In [221]:
# Split out the residential and commercial
residential = permits.query("permitdescription == 'RESIDENTIAL BUILDING PERMIT'")
commercial = permits.query("permitdescription == 'COMMERCIAL BUILDING PERMIT'")
In [222]:
# Make a NumPy array (use the "values" attribute)
residential_coords = residential[['lat', 'lng']].values
commercial_coords = commercial[['lat', 'lng']].values
In [223]:
commercial_coords[:5]
Out[223]:
array([[ 39.960741, -75.169325],
       [ 39.981448, -75.132109],
       [ 39.95098 , -75.172382],
       [ 39.974696, -75.120228],
       [ 39.94724 , -75.153649]])

Step 4: Make a Folium map and add a HeatMap

The HeatMap takes the list of coordinates: the first column is latitude and the second column longitude

Commercial building permits

In [224]:
# Initialize map
m = folium.Map(
    location=[39.99, -75.13],
    tiles='Cartodb Positron',
    zoom_start=12
)


# Add heat map coordinates
HeatMap(commercial_coords).add_to(m)

m
Out[224]:
Make this Notebook Trusted to load map: File -> Trust Notebook

Residential building permits

In [225]:
# Initialize map
m = folium.Map(
    location=[39.99, -75.13],
    tiles='Cartodb Positron',
    zoom_start=12
)


# Add heat map
HeatMap(residential_coords).add_to(m)

m
Out[225]:
Make this Notebook Trusted to load map: File -> Trust Notebook

Takeaways

Commercial construction concentrated in the greater Center City area while residential construction is primarily outside of Center City...

That's it for interactive maps w/ Folium...

Now on to clustering...

Clustering in Python

  • Both spatial and non-spatial datasets
  • Two new techniques:
    • Non-spatial: K-means
    • Spatial: DBSCAN
  • Two labs/exercises this week:
    1. Grouping Philadelphia neighborhoods by AirBnb listings
    2. Identifying clusters in taxi rides in NYC

"Machine learning"

  • The computer learns patterns and properties of an input data set without the user specifying them beforehand
  • Can be both supervised and unsupervised

Supervised

  • Example: classification
  • Given a training set of labeled data, learn to assign labels to new data

Unsupervised

  • Example: clustering
  • Identify structure / clusters in data without any prior knowledge

Machine learning in Python: scikit-learn

  • State-of-the-art machine learning in Python
  • Easy to use, lots of functionality

Clustering is just one (of many) features

https://scikit-learn.org/stable/

Note: We will focus on clustering algorithms today and discuss a few other machine learning techniques in the next two weeks. If there is a specific scikit-learn use case we won't cover, I'm open to ideas for incorporating it as part of the final project.

Part 1: Non-spatial clustering

The goal

Partition a dataset into groups that have a similar set of attributes, or features, within the group and a dissimilar set of features between groups.

Minimize the intra-cluster variance and maximize the inter-cluster variance of features.

Some intuition

K-Means clustering

  • Simple but robust clustering algorithm
  • Widely used
  • Important: user must specify the number of clusters
  • Cannot be used to find density-based clusters

A good introduction

Andrew Ng's Coursera lecture

How does it work?

Minimizes the intra-cluster variance: minimizes the sum of the squared distances between all points in a cluster and the cluster centroid

K-means in action

Example: Clustering countries by health and income

  • Health expectancy in years vs. GDP per capita and population for 187 countries (as of 2015)
  • Data from Gapminder
In [226]:
import altair as alt
from vega_datasets import data as vega_data

Read the data from a URL:

In [227]:
gapminder = pd.read_csv(vega_data.gapminder_health_income.url)
gapminder.head()
Out[227]:
country income health population
0 Afghanistan 1925 57.63 32526562
1 Albania 10620 76.00 2896679
2 Algeria 13434 76.50 39666519
3 Andorra 46577 84.10 70473
4 Angola 7615 61.00 25021974

Plot it with altair

In [228]:
alt.Chart(gapminder).mark_circle().encode(
    alt.X("income:Q", scale=alt.Scale(type="log")),
    alt.Y("health:Q", scale=alt.Scale(zero=False)),
    size='population:Q',
    tooltip=list(gapminder.columns),
).interactive()
Out[228]:

K-Means with scikit-learn

In [229]:
from sklearn.cluster import KMeans

Let's start with 5 clusters

In [230]:
kmeans = KMeans(n_clusters=5)

Lot's of optional parameters, but n_clusters is the most important:

In [231]:
 kmeans
Out[231]:
KMeans(n_clusters=5)

Let's fit just income first

Use the fit() function

In [232]:
kmeans.fit(gapminder[['income']])
Out[232]:
KMeans(n_clusters=5)

Extract the cluster labels

Use the labels_ attribute

In [233]:
gapminder['label'] = kmeans.labels_

How big are our clusters?

In [234]:
gapminder.groupby('label').size()
Out[234]:
label
0     49
1      6
2    107
3     24
4      1
dtype: int64

Plot it again, coloring by our labels

In [235]:
alt.Chart(gapminder).mark_circle().encode(
    alt.X('income:Q', scale=alt.Scale(type='log')),
    alt.Y('health:Q', scale=alt.Scale(zero=False)),
    size='population:Q',
    color=alt.Color('label:N', scale=alt.Scale(scheme='dark2')),
    tooltip=list(gapminder.columns)
).interactive()
Out[235]:

Calculate average income by group

In [236]:
gapminder.groupby("label")['income'].mean().sort_values()
Out[236]:
label
2      5355.102804
0     21198.102041
3     42835.500000
1     74966.166667
4    132877.000000
Name: income, dtype: float64

Data is nicely partitioned into income levels

How about health, income, and population?

In [237]:
# Fit all three columns
kmeans.fit(gapminder[['income', 'health', 'population']])

# Extract the labels
gapminder['label'] = kmeans.labels_

Plot the new labels

In [238]:
alt.Chart(gapminder).mark_circle().encode(
    alt.X('income:Q', scale=alt.Scale(type='log')),
    alt.Y('health:Q', scale=alt.Scale(zero=False)),
    size='population:Q',
    color=alt.Color('label:N', scale=alt.Scale(scheme='dark2')),
    tooltip=list(gapminder.columns)
).interactive()
Out[238]:

It....didn't work that well

What's wrong?

K-means is distance-based, but our features have wildly different distance scales

scikit-learn to the rescue: pre-processing

  • Scikit-learn has a utility to normalize features with an average of zero and a variance of 1
  • Use the StandardScaler class
In [239]:
from sklearn.preprocessing import StandardScaler
In [240]:
scaler = StandardScaler()

Use the fit_transform() function to scale your features

In [241]:
gapminder_scaled = scaler.fit_transform(gapminder[['income', 'health', 'population']])

Important: The fit_transform() function converts the DataFrame to a numpy array:

In [242]:
# fit_transform() converts the data into a numpy array
gapminder_scaled[:5]
Out[242]:
array([[-0.79481258, -1.8171424 , -0.04592039],
       [-0.34333373,  0.55986273, -0.25325837],
       [-0.1972197 ,  0.62456075,  0.00404216],
       [ 1.52369617,  1.6079706 , -0.27303503],
       [-0.49936524, -1.38107777, -0.09843447]])
In [243]:
# mean of zero
gapminder_scaled.mean(axis=0)
Out[243]:
array([ 8.07434927e-17, -1.70511258e-15, -1.89984689e-17])
In [244]:
# variance of one
gapminder_scaled.std(axis=0)
Out[244]:
array([1., 1., 1.])

Now fit the scaled features

In [245]:
# Perform the fit
kmeans.fit(gapminder_scaled)

# Extract the labels
gapminder['label'] = kmeans.labels_
In [246]:
alt.Chart(gapminder).mark_circle().encode(
    alt.X('income:Q', scale=alt.Scale(type='log')),
    alt.Y('health:Q', scale=alt.Scale(zero=False)),
    size='population:Q',
    color=alt.Color('label:N', scale=alt.Scale(scheme='dark2')),
    tooltip=list(gapminder.columns)
).interactive()
Out[246]:
In [247]:
# Number of countries per cluster
gapminder.groupby("label").size()
Out[247]:
label
0    61
1    33
2     2
3    86
4     5
dtype: int64
In [248]:
# Average population per cluster
gapminder.groupby("label")['population'].mean().sort_values() / 1e6
Out[248]:
label
4       2.544302
0      21.441296
3      26.239937
1      31.674060
2    1343.549735
Name: population, dtype: float64
In [250]:
# Average life expectancy per cluster
gapminder.groupby("label")['health'].mean().sort_values()
Out[250]:
label
0    62.232951
2    71.850000
3    74.313837
1    80.830303
4    80.920000
Name: health, dtype: float64
In [251]:
# Average income per cluster
gapminder.groupby("label")['income'].mean().sort_values() / 1e3
Out[251]:
label
0     4.150623
2     9.618500
3    13.229907
1    41.048818
4    91.524200
Name: income, dtype: float64
In [252]:
gapminder.loc[gapminder['label']==4]
Out[252]:
country income health population label
24 Brunei 73003 78.7 423188 4
88 Kuwait 82633 80.7 3892115 4
97 Luxembourg 88314 81.1 567110 4
134 Qatar 132877 82.0 2235355 4
145 Singapore 80794 82.1 5603740 4
In [253]:
gapminder.loc[gapminder['label']==2]
Out[253]:
country income health population label
35 China 13334 76.9 1376048943 2
75 India 5903 66.8 1311050527 2
In [ ]: