Sep 29, 2020
import geopandas as gpd
import pandas as pd
import numpy as np
from shapely.geometry import Point
from matplotlib import pyplot as plt
import seaborn as sns
import hvplot.pandas
import holoviews as hv
hv.extension("bokeh")
%matplotlib inline
# UNCOMMENT TO SEE ALL ROWS/COLUMNS IN DATAFRAMES
# pd.options.display.max_columns = 999
# pd.options.display.max_rows = 999
Or, how to pull data from the web using Python
Note: when accessing data via an API, many services will require you to register for an API key, which prevents individual users from overloading the service with requests.
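When a key is required, you typically pass it along as one of the request parameters. A minimal sketch (the endpoint and the "api_key" parameter name are hypothetical; each service documents its own):
import requests

# hypothetical service: pass the registered key as a query parameter
response = requests.get(
    "https://api.example.com/v1/data",    # hypothetical endpoint
    params={"api_key": "YOUR-KEY-HERE"},  # parameter name varies by service
)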
This is an API for near-real-time data about earthquakes, and data is provided in GeoJSON format over the web.
The API has a separate endpoint for each version of the data that users might want. No authentication is required.
API documentation:
http://earthquake.usgs.gov/earthquakes/feed/v1.0/geojson.php
Sample API endpoint, for magnitude 4.5+ earthquakes in past day:
http://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/4.5_day.geojson
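The feed URLs all follow the same pattern, combining a minimum magnitude and a time window. A small helper to build them (the naming convention is inferred from the feed documentation above):
def usgs_feed_url(magnitude="2.5", period="week"):
    """Return the GeoJSON feed URL for quakes above `magnitude` in the past `period`."""
    base = "http://earthquake.usgs.gov/earthquakes/feed/v1.0/summary"
    return f"{base}/{magnitude}_{period}.geojson"

usgs_feed_url("4.5", "day")  # reproduces the sample endpoint above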
Simply pass the URL to the gpd.read_file() function:
# Download data on magnitude 2.5+ quakes from the past week
endpoint_url = "http://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/2.5_week.geojson"
df = gpd.read_file(endpoint_url)
df.head()
fig, ax = plt.subplots(figsize=(10, 10))
# plot the country outlines
world = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))
world.to_crs(epsg=3857).plot(ax=ax, facecolor="none", edgecolor="black")
# plot the earthquakes
df.to_crs(epsg=3857).plot(ax=ax, color="crimson")
ax.set_axis_off()
A GeoService is a standardized API for serving geospatial data (including GeoJSON) over the web.
Documentation: http://geoservices.github.io/
OpenDataPhilly provides GeoService API endpoints for the geospatial datasets hosted on its platform.
# base URL
url = "https://services.arcgis.com/fLeGjb7u4uXqeF9q/arcgis/rest/services/Zipcodes_Poly/FeatureServer/0/"
esri2gpd
import esri2gpd
zip_codes = esri2gpd.get(url)
zip_codes.head()
zip_codes.crs
fig, ax = plt.subplots(figsize=(6, 6))
zip_codes.to_crs(epsg=3857).plot(ax=ax, facecolor="none", edgecolor="black")
ax.set_axis_off()
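For reference, esri2gpd is doing a small amount of work for us: GeoService endpoints expose a query route that can return GeoJSON directly. A rough sketch of the equivalent raw request (the where, outFields, and f parameters come from the GeoServices spec linked above):
import requests

# select all rows and all fields, returned as GeoJSON
params = {"where": "1=1", "outFields": "*", "f": "geojson"}
r = requests.get(url + "query", params=params)

# build a GeoDataFrame from the returned GeoJSON features
zip_codes_raw = gpd.GeoDataFrame.from_features(r.json()["features"])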
Note: the "API documentation" links on OpenDataPhilly point to the documentation for the CARTO database.
For example: shooting victims in Philadelphia
https://phl.carto.com/api/v2/sql?q=SELECT+*,+ST_Y(the_geom)+AS+lat,+ST_X(the_geom)+AS+lng+FROM+shootings&filename=shootings&format=csv&skipfields=cartodb_id
The CARTO databases can be queried using SQL. This allows you to select specific data from the larger database.
CARTO API documentation: https://carto.com/developers/sql-api/
SQL documentation: https://www.postgresql.org/docs/9.1/sql.html
SELECT [field names] FROM [table name] WHERE [query]
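For example, using column names from the shootings table queried below:
# select the geometry and fatal flag for fatal incidents only
query = "SELECT the_geom, fatal FROM shootings WHERE fatal = 1"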
We'll use Python's requests library to query the API endpoint with our desired query.
import requests
# the API end point
API_endpoint = "https://phl.carto.com/api/v2/sql"
# the query
query = "SELECT * FROM shootings" # table name is "shootings"
# desired format of the returned features
output_format = 'geojson'
# fields to skip
skipfields = ["cartodb_id"]
# all of our request parameters
params = dict(q=query, format=output_format, skipfields=skipfields)
params
# make the request
r = requests.get(API_endpoint, params=params)
r
# Get the returned data in JSON format
# This is a dictionary
features = r.json()
# What are the keys?
list(features.keys())
features['type']
# Let's look at the first feature
features['features'][0]
Use the GeoDataFrame.from_features() function to create a GeoDataFrame:
shootings = gpd.GeoDataFrame.from_features(features, crs="EPSG:4326")
shootings.head()
# make sure we remove missing geometries
shootings = shootings.dropna(subset=['geometry'])
# convert to Web Mercator (EPSG:3857) for plotting
shootings = shootings.to_crs(epsg=3857)
A quick plot with geopandas to show the shootings as points
fig, ax = plt.subplots(figsize=(6, 6))
# ZIP codes
zip_codes.to_crs(epsg=3857).plot(ax=ax, facecolor="none", edgecolor="black")
# Shootings
shootings.plot(ax=ax, color="crimson")
ax.set_axis_off()
# initialize the axes
fig, ax = plt.subplots(figsize=(10, 10), facecolor=plt.get_cmap('viridis')(0))
# extract the x/y coordinates (already in Web Mercator) for the hexbins
x = shootings.geometry.x
y = shootings.geometry.y
ax.hexbin(x, y, gridsize=40, mincnt=1, cmap='viridis')
# overlay the ZIP codes
zip_codes.to_crs(epsg=3857).plot(ax=ax,
facecolor='none',
linewidth=0.5,
edgecolor='white')
ax.set_axis_off()
The COUNT function can be applied to count all rows:
query = "SELECT COUNT(*) FROM shootings"
params = dict(q=query)
r = requests.get(API_endpoint, params=params)
r.json()
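The count comes back nested in the returned JSON; the SQL API places query results under the "rows" key:
# extract the count from the response
r.json()["rows"][0]["count"]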
Important: it's always good to check how many rows you might be downloading beforehand.
The LIMIT keyword limits the number of returned rows. It is very useful for taking a quick look at the format of a database:
# select the first 5
query = "SELECT * FROM shootings LIMIT 5"
params = dict(q=query, format="geojson")
r = requests.get(API_endpoint, params=params)
df = gpd.GeoDataFrame.from_features(r.json(), crs="EPSG:4326")
df
shootings.head()
query = "SELECT * FROM shootings WHERE fatal = 1"
# Make the request
params = dict(q=query, format="geojson")
r = requests.get(API_endpoint, params=params)
# Make the GeoDataFrame
fatal = gpd.GeoDataFrame.from_features(r.json(), crs="EPSG:4326")
print("number of fatal shootings = ", len(fatal))
query = "SELECT * FROM shootings WHERE date_ > '1/1/20'"
# Make the request
params = dict(q=query, format="geojson")
r = requests.get(API_endpoint, params=params)
# Make the GeoDataFrame
this_year = gpd.GeoDataFrame.from_features(r.json(), crs="EPSG:4326")
print("number of shootings this year = ", len(this_year))
carto2gpd
The get() function will query the database. The get_size() function will use COUNT() to get the total number of rows.
import carto2gpd
where = "date_ > '1/1/20' and fatal = 1.0"
df = carto2gpd.get(API_endpoint, 'shootings', where=where)
df.head()
# Limit results to the first 5
df = carto2gpd.get(API_endpoint, 'shootings', limit=5)
df
size = carto2gpd.get_size(API_endpoint, 'shootings')
print(size)
DateTime objects
shootings['date'] = pd.to_datetime(shootings['date_'])
shootings["Month"] = shootings["date"].dt.month
shootings["Day of Week"] = shootings["date"].dt.dayofweek # Monday is 0, Sunday is 6
Use the familiar groupby --> size() pattern to count shootings by month and day of week:
count = shootings.groupby(['Month', 'Day of Week']).size()
count = count.reset_index(name='Count')
count.head()
hvplot
# Remember 0 is Monday and 6 is Sunday
count.hvplot.heatmap(
x="Day of Week",
y="Month",
C="Count",
cmap="viridis",
width=400,
height=500,
flip_yaxis=True,
)
Several 3rd party options with easier interfaces for accessing census data
The census provides web-based documentation:
https://www.census.gov/data/developers/guidance/api-user-guide.html
Several packages provide easier Python interfaces to census data based on the census API.
We'll focus on cenpy - "Explore and download data from Census APIs".
Let's make a racial dot map for Philadelphia in Python!
# First step: import cenpy
import cenpy
Functions to help you explore the Census API from Python:
cenpy.explorer.available: returns information about available datasets in the Census API
cenpy.explorer.explain: explains a specific Census dataset
cenpy.explorer.fips_table: gets a table of FIPS codes for a specific geography
available = cenpy.explorer.available()
available.head()
We can use the pandas filter() function to search for specific identifiers in the dataframe.
In this case, let's search for the American Community Survey datasets. We'll match index labels using regular expressions.
In particular, we'll search for labels that start with "ACS". In the language of regular expressions, we'll use the "^" to mean "match labels that start with"
For more info on regular expressions, the documentation for the re module is a good place to start.
# Return a dataframe of all datasets that start with "ACS"
# Axis=0 means to filter the index labels!
available.filter(regex="^ACS", axis=0)
Many flavors of ACS datasets are available — we want to use the detailed tables version, specifically the 5-year survey.
The relevant identifiers start with: "ACSDT5Y".
# Return a dataframe of all datasets that start with "ACSDT5Y"
available.filter(regex="^ACSDT5Y", axis=0)
Let's use the latest available data (2018). We can use the explain() function to print out a description of the dataset:
cenpy.explorer.explain("ACSDT5Y2018")
Use the cenpy.remote.APIConnection object, and pass it the name of the dataset:
acs = cenpy.remote.APIConnection("ACSDT5Y2018")
The .variables attribute stores the available variables (across all Census tables). We can use the varslike() function to search the variables dataframe (it's just a simple wrapper around the pandas filter() function).
len(acs.variables)
acs.variables.head(n=10)
We're interested in variables about Hispanic origin broken down by race. Let's see if we can find variables whose "concept" matches "HISPANIC OR LATINO ORIGIN BY RACE":
acs.varslike?
acs.varslike("HISPANIC OR LATINO ORIGIN BY RACE", by='concept').sort_index() # searches along concept column
It looks like the table we want is "B03002" — we can also easily filter for all variables in this table
variables = [
"NAME",
"B03002_001E", # Total
"B03002_003E", # Not Hispanic, White
"B03002_004E", # Not Hispanic, Black
"B03002_005E", # Not Hispanic, American Indian
"B03002_006E", # Not Hispanic, Asian
"B03002_007E", # Not Hispanic, Native Hawaiian
"B03002_008E", # Not Hispanic, Other
"B03002_009E", # Not Hispanic, Two or More Races
"B03002_012E", # Hispanic
]
Note: we've also included the "NAME" variable, which returns the name of the Census geography we are querying for.
The Census API uses a hierarchy of geographies when requesting data.
For example, you cannot just request data for a specific county — you need to specify the state and the county.
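For example, here is a minimal sketch of a county-level request, which must also specify the state ("B01003_001E", the ACS total-population variable, is used purely for illustration; the PA and Philadelphia County FIPS codes are looked up properly below):
philly_pop = acs.query(
    cols=["NAME", "B01003_001E"],  # total population (illustrative)
    geo_unit="county:101",         # Philadelphia County
    geo_filter={"state": "42"},    # must be scoped to Pennsylvania
)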
The .geographies attribute
This allows you to see the geographies available for the dataset:
acs.geographies['fips']
For the racial dot map, we'll use the most granular available geography: block group.
The hierarchy is: state --> county --> tract --> block group. We can use the "*" wildcard for tracts and block groups, but we'll need to know the FIPS codes for PA and Philadelphia County.
counties = cenpy.explorer.fips_table("COUNTY")
counties.head()
# Trim to just Philadelphia
# Search for rows where name contains "Philadelphia"
counties.loc[ counties[3].str.contains("Philadelphia") ]
For Philadelphia County, the FIPS codes are:
philly_county_code = "101"
pa_state_code = "42"
You can also look up FIPS codes on Google! Wikipedia is usually a trustworthy source...
We'll use the .query() function, which takes the following arguments:
cols - the list of variables desired from the dataset
geo_unit - string denoting the smallest geographic unit; syntax is "name:FIPS"
geo_filter - dictionary containing groupings of geo_units, if required by the hierarchy
philly_demo_data = acs.query(
    cols=variables,
    geo_unit="block group:*",
    geo_filter={"state": pa_state_code,
                "county": philly_county_code,
                "tract": "*"},
)
philly_demo_data.head()
for variable in variables:
    # Convert all variables EXCEPT for NAME
    if variable != "NAME":
        philly_demo_data[variable] = philly_demo_data[variable].astype(float)
If you forget to include required parts of the geography hierarchy, you'll get an error!
acs.query(
cols=variables,
geo_unit="block group:*",
geo_filter={"state": pa_state_code},
)
cenpy includes an interface to the Census Bureau's TIGER shapefile database.
cenpy.tiger.available()
Set the ACS2018 database as the desired GeoService
acs.set_mapservice("tigerWMS_ACS2018")
The map service has many different layers — select the layer for our desired geography
acs.mapservice.layers
acs.mapservice.layers[10]
Query the service using the .query()
function.
We can use a SQL where clause to request block groups only for Philadelphia County by specifying the other geographies in the hierarchy (in this case, state and county).
acs.mapservice.layers[10].variables
# Use SQL to return geometries only for Philadelphia County in PA
where_clause = f"STATE = {pa_state_code} AND COUNTY = {philly_county_code}"
# Query for block groups
philly_block_groups = acs.mapservice.layers[10].query(where=where_clause)
philly_block_groups.head()
Merge based on multiple columns: the state, county, tract, and block group IDs.
The relevant columns are "STATE", "COUNTY", "TRACT", and "BLKGRP" in the geometry data and "state", "county", "tract", and "block group" in the demographic data:
philly_demo_final = philly_block_groups.merge(
philly_demo_data,
left_on=["STATE", "COUNTY", "TRACT", "BLKGRP"],
right_on=["state", "county", "tract", "block group"],
)
philly_demo_final.head()
Using geopandas...
# Check the CRS
philly_demo_final.crs
fig, ax = plt.subplots(figsize=(10,10))
# Plot the choropleth
philly_demo_final.plot(ax=ax, column='B03002_001E', legend=True)
# Format
ax.set_title("Population of Philadelphia by Block Group", fontsize=16)
ax.set_axis_off()
Using hvplot...
cols = ['NAME_x', 'B03002_001E', 'geometry']
philly_demo_final[cols].hvplot(c='B03002_001E',
geo=True,
crs=3857,
legend=True,
width=600,
height=400,
cmap='viridis')
# Rename columns
philly_demo_final = philly_demo_final.rename(
columns={
"B03002_001E": "Total", # Total
"B03002_003E": "White", # Not Hispanic, White
"B03002_004E": "Black", # Not Hispanic, Black
"B03002_005E": "AI/AN", # Not Hispanic, American Indian
"B03002_006E": "Asian", # Not Hispanic, Asian
"B03002_007E": "NH/PI", # Not Hispanic, Native Hawaiian
"B03002_008E": "Other_", # Not Hispanic, Other
"B03002_009E": "Two Plus", # Not Hispanic, Two or More Races
"B03002_012E": "Hispanic", # Hispanic
}
)
# Add an "Other" column
cols = ['AI/AN', 'NH/PI','Other_', 'Two Plus']
philly_demo_final['Other'] = philly_demo_final[cols].sum(axis=1)
Given a polygon, create randomly distributed points that fall within the polygon.
def random_points_in_polygon(number, polygon):
    """
    Generate the specified number of randomly placed
    points within the given polygon.
    """
    points = []
    min_x, min_y, max_x, max_y = polygon.bounds
    i = 0
    while i < number:
        point = Point(np.random.uniform(min_x, max_x), np.random.uniform(min_y, max_y))
        # keep the point (and count it) only if it falls inside the polygon
        if polygon.contains(point):
            points.append(point)
            i += 1
    return points
# get the first block group polygon in the data set
geo = philly_demo_final.iloc[0].geometry
geo
fig, ax = plt.subplots(figsize=(6, 6))
# Generate some random points
random_points = random_points_in_polygon(100, geo)
# Plot random points
gpd.GeoSeries(random_points).plot(ax=ax, markersize=20, color="red")
# Plot boundary of block group
gpd.GeoSeries([geo]).plot(ax=ax, facecolor="none", edgecolor="black")
ax.set_axis_off()
def generate_dot_map(data, people_per_dot):
    """
    Given a GeoDataFrame with demographic columns, generate a dot
    map according to the population in each geometry.
    """
    results = []
    for field in ["White", "Hispanic", "Black", "Asian", "Other"]:
        # generate random points
        pts = data.apply(
            lambda row: random_points_in_polygon(
                row[field] / people_per_dot, row["geometry"]
            ),
            axis=1,
        )
        # combine into single GeoSeries
        pts = gpd.GeoSeries(pts.apply(pd.Series).stack(), crs=data["geometry"].crs)
        pts.name = "geometry"
        # make into a GeoDataFrame
        pts = gpd.GeoDataFrame(pts)
        pts["field"] = field
        # save
        results.append(pts)
    return gpd.GeoDataFrame(pd.concat(results), crs=data["geometry"].crs).reset_index(
        drop=True
    )
dot_map = generate_dot_map(philly_demo_final, people_per_dot=50)
print("number of points = ", len(dot_map))
dot_map.head()
# set up a custom color map from ColorBrewer
from matplotlib.colors import ListedColormap
cmap = ListedColormap(
["#3a833c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#ffff33"]
)
# plot the dot map
dot_map_3857 = dot_map.to_crs(epsg=3857)
fig, ax = plt.subplots(figsize=(10, 10), facecolor="#cfcfcf")
# Plot
dot_map_3857.plot(
ax=ax,
column="field",
categorical=True,
legend=True,
alpha=1,
markersize=0.5,
cmap=cmap,
)
# format
ax.set_title("Philadelphia, PA", fontsize=16)
ax.text(
0.5, 0.95, "1 dot = 50 people", fontsize=12, transform=ax.transAxes, ha="center"
)
ax.set_axis_off()