from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
np.random.seed(42)
%matplotlib inline
plt.rcParams['figure.figsize'] = (10,6)
Nov 5, 2020
The goal
Partition a dataset into groups that have a similar set of attributes, or features, within the group and a dissimilar set of features between groups.
Minimize the intra-cluster variance and maximize the inter-cluster variance of features.
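To make that objective concrete, here is a minimal sketch (using hypothetical toy data, not part of the lecture) that computes the within-cluster sum of squares by hand and compares it to scikit-learn's inertia_ attribute:
import numpy as np
from sklearn.cluster import KMeans
# Hypothetical toy data: two well-separated blobs in 2D
rng = np.random.RandomState(0)
X = np.vstack([
    rng.normal(loc=0.0, scale=0.5, size=(50, 2)),
    rng.normal(loc=5.0, scale=0.5, size=(50, 2)),
])
km = KMeans(n_clusters=2, random_state=0).fit(X)
# Intra-cluster variance: sum of squared distances from each point
# to its assigned cluster center
intra = sum(
    ((X[km.labels_ == k] - center) ** 2).sum()
    for k, center in enumerate(km.cluster_centers_)
)
print(intra, km.inertia_)  # the two numbers should agree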
import altair as alt
from vega_datasets import data as vega_data
Read the data from a URL:
gapminder = pd.read_csv(vega_data.gapminder_health_income.url)
gapminder.head()
| | country | income | health | population |
---|---|---|---|---|
0 | Afghanistan | 1925 | 57.63 | 32526562 |
1 | Albania | 10620 | 76.00 | 2896679 |
2 | Algeria | 13434 | 76.50 | 39666519 |
3 | Andorra | 46577 | 84.10 | 70473 |
4 | Angola | 7615 | 61.00 | 25021974 |
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=5)
Use the StandardScaler class to scale the features:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
Use the fit_transform() function to scale your features:
gapminder_scaled = scaler.fit_transform(gapminder[['income', 'health', 'population']])
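As a quick sanity check (not part of the original notebook), each column of the scaled array should now have roughly zero mean and unit variance:
# Each column should have ~0 mean and ~1 standard deviation after scaling
print(gapminder_scaled.mean(axis=0).round(2))
print(gapminder_scaled.std(axis=0).round(2))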
Now fit the K-Means model on the scaled features:
# Perform the fit
kmeans.fit(gapminder_scaled)
# Extract the labels
gapminder['label'] = kmeans.labels_
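To help interpret the clusters in the original units, one option (a sketch, not shown in the original) is to convert the fitted cluster centers back from scaled units with the scaler's inverse_transform():
# Cluster centers are in scaled units; convert them back to the original units
centers = pd.DataFrame(
    scaler.inverse_transform(kmeans.cluster_centers_),
    columns=['income', 'health', 'population'],
)
centers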
alt.Chart(gapminder).mark_circle().encode(
alt.X('income:Q', scale=alt.Scale(type='log')),
alt.Y('health:Q', scale=alt.Scale(zero=False)),
size='population:Q',
color=alt.Color('label:N', scale=alt.Scale(scheme='dark2')),
tooltip=list(gapminder.columns)
).interactive()
I've extracted Airbnb statistics for Philadelphia neighborhoods from Tom Slee's website.
The data includes average price per person, overall satisfaction, and number of listings.
Original research study: How Airbnb's Data Hid the Facts in New York City
The data is available in CSV format ("philly_airbnb_by_neighborhoods.csv") in the "data/" folder of the repository.
airbnb = pd.read_csv("data/philly_airbnb_by_neighborhoods.csv")
airbnb.head()
| | neighborhood | price_per_person | overall_satisfaction | N |
---|---|---|---|---|
0 | ALLEGHENY_WEST | 120.791667 | 4.666667 | 23 |
1 | BELLA_VISTA | 87.407920 | 3.158333 | 204 |
2 | BELMONT | 69.425000 | 3.250000 | 11 |
3 | BREWERYTOWN | 71.788188 | 1.943182 | 142 |
4 | BUSTLETON | 55.833333 | 1.250000 | 19 |
Run a K-Means fit (with 5 clusters) on three features: price_per_person, overall_satisfaction, and N.
# Initialize the Kmeans object
kmeans = KMeans(n_clusters=5, random_state=42)
# Scale the data features we want
scaler = StandardScaler()
scaled_data = scaler.fit_transform(airbnb[['price_per_person', 'overall_satisfaction', 'N']])
# Run the fit!
kmeans.fit(scaled_data)
# Save the cluster labels
airbnb['label'] = kmeans.labels_
# New column "label"!
airbnb.head()
| | neighborhood | price_per_person | overall_satisfaction | N | label |
---|---|---|---|---|---|
0 | ALLEGHENY_WEST | 120.791667 | 4.666667 | 23 | 1 |
1 | BELLA_VISTA | 87.407920 | 3.158333 | 204 | 1 |
2 | BELMONT | 69.425000 | 3.250000 | 11 | 1 |
3 | BREWERYTOWN | 71.788188 | 1.943182 | 142 | 1 |
4 | BUSTLETON | 55.833333 | 1.250000 | 19 | 0 |
To gain some insight into our clusters, after calculating the K-Means labels, we can:
- group by the label column
- calculate the mean() of each of our features
airbnb.groupby('label', as_index=False).size()
| | label | size |
---|---|---|
0 | 0 | 19 |
1 | 1 | 69 |
2 | 2 | 1 |
3 | 3 | 1 |
4 | 4 | 15 |
airbnb.groupby('label', as_index=False).mean().sort_values(by='price_per_person')
| | label | price_per_person | overall_satisfaction | N |
---|---|---|---|---|
1 | 1 | 73.199020 | 3.137213 | 76.550725 |
0 | 0 | 79.250011 | 0.697461 | 23.473684 |
4 | 4 | 116.601261 | 2.936508 | 389.933333 |
2 | 2 | 136.263996 | 3.000924 | 1499.000000 |
3 | 3 | 387.626984 | 5.000000 | 31.000000 |
airbnb.loc[airbnb['label'] == 2]
| | neighborhood | price_per_person | overall_satisfaction | N | label |
---|---|---|---|---|---|
75 | RITTENHOUSE | 136.263996 | 3.000924 | 1499 | 2 |
airbnb.loc[airbnb['label'] == 3]
| | neighborhood | price_per_person | overall_satisfaction | N | label |
---|---|---|---|---|---|
78 | SHARSWOOD | 387.626984 | 5.0 | 31 | 3 |
airbnb.loc[airbnb['label'] == 4]
| | neighborhood | price_per_person | overall_satisfaction | N | label |
---|---|---|---|---|---|
16 | EAST_PARK | 193.388889 | 2.714286 | 42 | 4 |
19 | FAIRMOUNT | 144.764110 | 2.903614 | 463 | 4 |
20 | FISHTOWN | 59.283468 | 2.963816 | 477 | 4 |
23 | FRANCISVILLE | 124.795795 | 3.164062 | 300 | 4 |
35 | GRADUATE_HOSPITAL | 106.420417 | 3.180791 | 649 | 4 |
47 | LOGAN_SQUARE | 145.439414 | 3.139241 | 510 | 4 |
57 | NORTHERN_LIBERTIES | 145.004866 | 3.095506 | 367 | 4 |
60 | OLD_CITY | 111.708084 | 2.756637 | 352 | 4 |
70 | POINT_BREEZE | 63.801072 | 2.759542 | 435 | 4 |
72 | QUEEN_VILLAGE | 106.405744 | 3.125000 | 248 | 4 |
79 | SOCIETY_HILL | 133.598667 | 3.118421 | 165 | 4 |
82 | SPRING_GARDEN | 157.125692 | 3.454023 | 413 | 4 |
83 | SPRUCE_HILL | 48.095512 | 2.377358 | 399 | 4 |
88 | UNIVERSITY_CITY | 82.228062 | 2.231579 | 326 | 4 |
91 | WASHINGTON_SQUARE | 126.959118 | 3.063745 | 703 | 4 |
Use the neighborhood geometries in ./data/philly_neighborhoods.geojson to plot the clusters on a map. The categorical=True and legend=True keywords will be useful here.
hoods = gpd.read_file("./data/philly_neighborhoods.geojson")
hoods.head()
| | Name | geometry |
---|---|---|
0 | LAWNDALE | POLYGON ((-75.08616 40.05013, -75.08893 40.044... |
1 | ASTON_WOODBRIDGE | POLYGON ((-75.00860 40.05369, -75.00861 40.053... |
2 | CARROLL_PARK | POLYGON ((-75.22673 39.97720, -75.22022 39.974... |
3 | CHESTNUT_HILL | POLYGON ((-75.21278 40.08637, -75.21272 40.086... |
4 | BURNHOLME | POLYGON ((-75.08768 40.06861, -75.08758 40.068... |
airbnb.head()
| | neighborhood | price_per_person | overall_satisfaction | N | label |
---|---|---|---|---|---|
0 | ALLEGHENY_WEST | 120.791667 | 4.666667 | 23 | 1 |
1 | BELLA_VISTA | 87.407920 | 3.158333 | 204 | 1 |
2 | BELMONT | 69.425000 | 3.250000 | 11 | 1 |
3 | BREWERYTOWN | 71.788188 | 1.943182 | 142 | 1 |
4 | BUSTLETON | 55.833333 | 1.250000 | 19 | 0 |
# do the merge
airbnb2 = hoods.merge(airbnb, left_on='Name', right_on='neighborhood', how='left')
# assign -1 to the neighborhoods without any listings
airbnb2['label'] = airbnb2['label'].fillna(-1)
# plot the data
airbnb2 = airbnb2.to_crs(epsg=3857)
# setup the figure
f, ax = plt.subplots(figsize=(10, 8))
# plot, coloring by label column
# specify categorical data and add legend
airbnb2.plot(
column="label",
cmap="Dark2",
categorical=True,
legend=True,
edgecolor="k",
lw=0.5,
ax=ax,
)
ax.set_axis_off()
plt.axis("equal");
Use altair to plot the clustering results on a map, with a tooltip showing the neighborhood name and cluster label.
Hint: See week 3B's lecture on interactive choropleths with altair.
# plot map, where variables are nested within `properties`
alt.Chart(airbnb2.to_crs(epsg=4326)).mark_geoshape().properties(
width=500, height=400,
).encode(
tooltip=["Name:N", "label:N"],
color=alt.Color("label:N", scale=alt.Scale(scheme="Dark2")),
)
Cluster #1 seems like the best bang for your buck: it has the lowest average price per person along with above-average satisfaction!
Now on to the more traditional view of "clustering"...
"Density-Based Spatial Clustering of Applications with Noise"
As an example, take min_samples = 4:
- Core points have at least min_samples (4) points (including the point itself) within a distance of eps.
- Border points are within a distance of eps of a core point, so they are part of the cluster, but they do not have min_samples points within a distance of eps, so they are not core points.
- Noise points are farther than eps from any of the cluster points.
A higher min_samples or a lower eps requires a higher density to form a cluster.
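Before applying it to real data, a tiny illustration (hypothetical points, not from the lecture) shows how eps and min_samples separate core, border, and noise points:
import numpy as np
from sklearn.cluster import DBSCAN
# Hypothetical example: a tight group of five points, one nearby point,
# and one far-away point
pts = np.array([
    [0, 0], [0, 1], [1, 0], [1, 1], [0.5, 0.5],  # tight group: core points
    [2.2, 0.5],                                  # near the group: border point
    [10, 10],                                    # isolated: noise
])
db = DBSCAN(eps=1.5, min_samples=4).fit(pts)
print(db.labels_)               # cluster labels; -1 marks the noise point
print(db.core_sample_indices_)  # indices of the five core points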
Now let's load some real point data; each point has x and y coordinates in units of meters:
coords = gpd.read_file('./data/osm_gps_philadelphia.geojson')
coords.head()
| | x | y | geometry |
---|---|---|---|
0 | -8370750.5 | 4865303.0 | POINT (-8370750.500 4865303.000) |
1 | -8368298.0 | 4859096.5 | POINT (-8368298.000 4859096.500) |
2 | -8365991.0 | 4860380.0 | POINT (-8365991.000 4860380.000) |
3 | -8372306.5 | 4868231.0 | POINT (-8372306.500 4868231.000) |
4 | -8376768.5 | 4864341.0 | POINT (-8376768.500 4864341.000) |
num_points = len(coords)
print(f"Total number of points = {num_points}")
Total number of points = 52358
from sklearn.cluster import dbscan
dbscan?
# some parameters to start with
eps = 50 # in meters
min_samples = 50
cores, labels = dbscan(coords[["x", "y"]], eps=eps, min_samples=min_samples)
The function returns two objects, which we call cores and labels. cores contains the indices of each point classified as a core sample.
# The first 5 elements
cores[:5]
array([ 1, 4, 6, 10, 12])
The length of cores tells you how many core samples we have:
num_cores = len(cores)
print(f"Number of core samples = {num_cores}")
Number of core samples = 19370
The labels array tells you the cluster number each point belongs to. Points classified as noise receive a cluster label of -1:
# The first 5 elements
labels[:5]
array([-1, 0, -1, -1, 1])
The labels array is the same length as our input data, so we can add it as a column in our original data frame:
# Add our labels to the original data
coords['label'] = labels
coords.head()
| | x | y | geometry | label |
---|---|---|---|---|
0 | -8370750.5 | 4865303.0 | POINT (-8370750.500 4865303.000) | -1 |
1 | -8368298.0 | 4859096.5 | POINT (-8368298.000 4859096.500) | 0 |
2 | -8365991.0 | 4860380.0 | POINT (-8365991.000 4860380.000) | -1 |
3 | -8372306.5 | 4868231.0 | POINT (-8372306.500 4868231.000) | -1 |
4 | -8376768.5 | 4864341.0 | POINT (-8376768.500 4864341.000) | 1 |
The number of clusters is the number of unique labels minus one (because noise has a label of -1)
num_clusters = coords['label'].nunique() - 1
print(f"number of clusters = {num_clusters}")
number of clusters = 87
We can group by the label column to get the size of each cluster:
cluster_sizes = coords.groupby('label', as_index=False).size()
cluster_sizes
| | label | size |
---|---|---|
0 | -1 | 29054 |
1 | 0 | 113 |
2 | 1 | 4073 |
3 | 2 | 1787 |
4 | 3 | 3370 |
... | ... | ... |
83 | 82 | 54 |
84 | 83 | 56 |
85 | 84 | 50 |
86 | 85 | 53 |
87 | 86 | 50 |
88 rows × 2 columns
# All points get assigned a cluster label (-1 reserved for noise)
cluster_sizes['size'].sum() == num_points
True
The number of noise points is the size of the cluster with label "-1":
num_noise = cluster_sizes.iloc[0]['size']
print(f"number of noise points = {num_noise}")
number of noise points = 29054
If points aren't noise or core samples, they must be edges:
num_edges = num_points - num_cores - num_noise
print(f"Number of edge points = {num_edges}")
Number of edge points = 3934
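Equivalently (a quick cross-check, not in the original), the edge points can be identified directly as the labeled points that are not core samples:
# Boolean mask of core samples, built from the returned core indices
is_core = np.zeros(num_points, dtype=bool)
is_core[cores] = True
# Edge points: assigned to a cluster (label != -1) but not core samples
is_edge = (labels != -1) & ~is_core
print(f"Number of edge points = {is_edge.sum()}")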
Plot the noise samples in grey and the mean x and mean y value of each cluster in red:
# Setup figure and axis
f, ax = plt.subplots(figsize=(10, 10), facecolor="black")
# Plot the noise samples in grey
noise = coords.loc[coords["label"] == -1]
ax.scatter(noise["x"], noise["y"], c="grey", s=5, linewidth=0)
# Loop over each cluster number
for label_num in range(0, num_clusters):
# Extract the samples with this label number
this_cluster = coords.loc[coords["label"] == label_num]
# Calculate the mean (x,y) point for this cluster
x_mean = this_cluster["x"].mean()
y_mean = this_cluster["y"].mean()
# Plot this centroid point in red
ax.scatter(x_mean, y_mean, linewidth=0, color="red")
# Format
ax.set_axis_off()
ax.set_aspect("equal")
DBSCAN can identify high-density clusters using more than just spatial coordinates, as long as the features are properly normalized.
I've extracted data for taxi pickups and drop-offs occurring in the Williamsburg neighborhood of NYC from the NYC taxi open data.
The data includes the trip locations along with features like the hour of the day and the trip distance.
Goal: identify clusters of similar taxi rides that are not only clustered spatially, but also clustered for features like hour of day and trip distance.
Inspired by this CARTO blog post
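A minimal sketch of the general approach (the file path and column names below are hypothetical, since the taxi dataset's exact schema isn't shown here): standardize the spatial coordinates together with the non-spatial features, then run DBSCAN on the combined feature matrix.
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import dbscan
# Hypothetical path and schema: a table of taxi trips with projected x/y
# coordinates, an hour-of-day column, and a trip distance column
taxi = pd.read_csv("data/williamsburg_taxi_trips.csv")  # hypothetical file
features = ["x", "y", "hour", "trip_distance"]          # hypothetical columns
# Scale all features so spatial and non-spatial dimensions are comparable
scaled_features = StandardScaler().fit_transform(taxi[features])
# eps and min_samples are example values and would need tuning
cores, labels = dbscan(scaled_features, eps=0.25, min_samples=50)
taxi["label"] = labels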