In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import pysal as ps
import seaborn as sns
sns.set_context("talk")
import matplotlib.pyplot as plt
import sqlite3
from shapely import wkb
%matplotlib inline


# Prep & Clean the data¶

We'll use a dataset I've built for this express purpose, gathering from data sources discussed in the Appendices of the competition document.

In [2]:
eng = sqlite3.connect('brexit_and_migrants.sql')
allofit = pd.read_sql('select * from allofit', eng)
allofit['geometry'] = allofit.the_geom.apply(lambda x: wkb.loads(x, hex=True))
allofit = gpd.GeoDataFrame(allofit)
allofit.columns = [col.replace(' - ', '') for col in allofit.columns]
allofit = allofit[allofit.lad15nm != 'Isles of Scilly'] # missing significant data here


And, we'll also read in the CDRC dataset, which I've also included in the sqlite database and joined up with LSOAs in England.

In [3]:
lsoas = pd.read_sql('select * from lsoa_cdrc_data', eng)
lsoas['geometry'] = lsoas.the_geom.apply(lambda x: wkb.loads(x, hex=True))
lsoas = gpd.GeoDataFrame(lsoas.drop('the_geom', axis=1))


Just to show, the lsoas:

In [4]:
f = plt.figure(figsize=(6,10))
lsoas.plot(color='lightgrey', linewidth=.05, edgecolor='k', ax=plt.gca())
plt.axis('off')
plt.show()


and the local authorities:

In [5]:
f = plt.figure(figsize=(6,10))
allofit.plot(color='lightgrey', linewidth=.1, edgecolor='k', ax=plt.gca())
plt.axis('off')
plt.show()


If you look carefully, you'll see I'm missing a few local authorities. But, altogether, I've got:

In [6]:
allofit.shape[0]

Out[6]:
371

of them.

### Merging the two datasets¶

Now, I'm going to merge up the data I have for the LAs with the LSOAs using geopandas, merging together the two by centroid containment in a spatial join:

In [7]:
from geopandas.tools import sjoin

In [8]:
joined = sjoin(allofit, lsoas.assign(geomdupe=lsoas.geometry,
centroids=lsoas.centroid)[['geometry', 'centroids', 'lsoa11cd']]\
.set_geometry('centroids').rename(columns=dict(geometry='lsoa_geometry')),
op='contains') # "trick" geopandas into taking the LSOA geometry by
# making it no longer called "geometry," and instead
# index geometries on the "centroids" column.


After the join (which you may not want to attempt if you don't have > 4 GB of memory in your computer, we can see that the assignment worked:

In [9]:
plt.figure(figsize=(6,8))
joined.set_geometry('lsoa_geometry').plot('Area Name', ax=plt.gca())
plt.axis('off')

Out[9]:
(105783.99201779434, 681788.2980403295, -20871.136885052583, 689838.7203237317)

# Creating the Divergences¶

In this step, I'll be computing the population volatility components. First, though, I'll bring in some tools from scikit-learn and scipy, as well as a progressbar indicator to keep track of longrunning computations.

In [10]:
import sklearn.metrics as skm
from tqdm import tqdm
from scipy import stats as st


To get a list of sorted years & ethnicities, I'm going to exploit some regularities in column structure to parse out the relevant combination of ethnicity-years in the CDRC data:

In [11]:
years = sorted(list(set([col.split('y')[-1]
for col in lsoas.filter(regex='_y[1234567890]')])))
ethnicities = sorted(list(set([col.split('_')[0]
for col in lsoas.filter(regex='_y[1234567890]')])))


Now, for each year, i'll

1. take that year and the next year,
2. grab all the ethnicities in both years,
3. re-normalize the distributions so they sum to one,
4. and then compute the earth-movers distance between the two vectors.
In [12]:
for i,year in tqdm(enumerate(years[:-1])):
next_year = years[i+1]

result = np.empty(lsoas.shape[0])

thisyear_distribution = lsoas[[col + '_y{}'.format(year) for col in ethnicities]].copy()
isnull = np.isnan(thisyear_distribution.values).all(axis=1)
thisyear_distribution.dropna(inplace=True)

thisyear_distribution_normed = (thisyear_distribution.values) / thisyear_distribution.values.sum(axis=1)[:,None]

nextyear_distribution = lsoas[[col + '_y{}'.format(next_year) for col in ethnicities]]
nextyear_distribution_normed = (nextyear_distribution.values ) / nextyear_distribution.values.sum(axis=1)[:,None]

distances = list(map(lambda x:
st.wasserstein_distance(*x), #earthmovers distance/Wasserman distance
#                     st.energy_distance(*x), #another useful divergence
#                     st.entropy(*x) # kullback-leibler divergence
zip(thisyear_distribution_normed, nextyear_distribution_normed)))

# Here, I account for missing data distances ahead of time.
# This lets us compute the divergences only for the data we have.
result[~isnull] = distances
result[isnull] = np.nan

# if, somehow, this fails to account for all infinite divergences,
# then something is wrong with our algorithm, since the distances
# are not sane. Thus, I check explicitly.
if np.isinf(result).any():
print("divergence!")
break

# finally, I assign the volatility for y,yt+1 (named referring to the second year)
# back to the lsoa dataframe
lsoas['distmove_{}'.format(next_year)] = result


0it [00:00, ?it/s]/home/lw17329/anaconda/envs/ana/lib/python3.5/site-packages/ipykernel_launcher.py:10: RuntimeWarning: invalid value encountered in true_divide
# Remove the CWD from sys.path while we load stuff.
/home/lw17329/anaconda/envs/ana/lib/python3.5/site-packages/ipykernel_launcher.py:13: RuntimeWarning: invalid value encountered in true_divide
del sys.path[0]
19it [00:15,  1.20it/s]


### Population Volatility average over five years¶

Now, I compute the average volatility over the five years, as well as the log mean volatility:

In [13]:
lsoas = lsoas.assign(fiveyear_log_distchange = np.log(lsoas.filter(regex='^distmove_201[012345]')+.0001).mean(axis=1))
lsoas = lsoas.assign(fiveyear_distchange = lsoas.filter(regex='^distmove_201[012345]').mean(axis=1))


#### And plot the volatility in a pretty figure:¶

In [14]:
from legendgram import legendgram as lg
import palettable as pltt

In [15]:
f,ax = plt.subplots(1,1,figsize=(12,16))

lsoas.plot('fiveyear_log_distchange', cmap='plasma', ax=ax,
linewidth=0, edgecolor=None, legend=False,
)
axnew = lg(f,ax, lsoas.fiveyear_distchange,
np.geomspace(0.0005,.01,num=9),
pltt.matplotlib.Plasma_9, legend_size=(1,.075), bins=100)
axnew.set_xlim(0,.006)
#ax.set_title("Five-Year Population Volatility", fontsize=20)
ax.axis('off')
axnew.set_xticklabels(np.linspace(0,.006, num=7), fontsize=30)
axnew.set_xlabel('Mean $d_{w}, 2011-2016$', fontsize=55)
plt.show()