%load_ext watermark
%watermark -v -u -n -t -z -a 'Samira Kumar' -p numpy,pandas,scipy,matplotlib,sklearn,fbprophet,bokeh,geopy,geoviews,holoviews
1: VISUALISATIONS
2: CRIME ALERT
3: FB PROPHET TO PREDICT CRIMES IN CHICAGO
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
import holoviews as hv
import geoviews as gv
import geoviews.tile_sources as gts
from bokeh.io import output_file, save, show
# Dropped NA values and other outliers (some locations fell outside Chicago) and saved the cleaned dataset; a sketch of that step follows the preview below
df = pd.read_csv('cleaned_file.csv')
df.head()
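The cleaning step itself is not reproduced in this notebook. A minimal sketch of what it might have looked like is below; the raw filename and the bounding-box values are assumptions, not taken from the original workflow.
# Sketch only: raw filename and bounding-box values are assumptions
raw = pd.read_csv('raw_crimes.csv')  # hypothetical raw export from the Chicago data portal
raw = raw.dropna(subset=['Latitude', 'Longitude'])
inside_chicago = (raw['Latitude'].between(41.6, 42.1) &
                  raw['Longitude'].between(-87.95, -87.5))
raw[inside_chicago].to_csv('cleaned_file.csv', index=False)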
The police beat and ward shapefiles used below can be downloaded from https://data.cityofchicago.org/browse?tags=shapefiles
beats_df=df.groupby(['Beat']).size().reset_index(name='Total_Crimes')
beats_df=beats_df.rename(columns={'Beat':'beat_num'})
beats_df.head()
%%opts Polygons (cmap='YlOrRd')
hv.extension('bokeh')
geometries = gpd.read_file('geo_export_3b3b25c2-a600-40c3-a663-2f7ad8dc2b9c.shp')
geometries['beat_num']=geometries['beat_num'].apply(int)
gdf = gpd.GeoDataFrame(pd.merge(beats_df, geometries))
plot_opts = dict(tools=['hover'], width=750, height=700, color_index='Total_Crimes',
colorbar=True, toolbar='above', xaxis=None, yaxis=None)
plot = gts.CartoLight * gv.Polygons(gdf, vdims=['beat_num', 'Total_Crimes'], label='Chicago Crime Police Beat Map').opts(plot=plot_opts, style=dict(alpha=0.7))
# gv.renderer('bokeh').save(plot, 'beat_crimes')
plot
%%opts Polygons (cmap='YlOrRd')
ward_df=df.groupby(['Ward']).size().reset_index(name='Total_Crimes')
ward_df=ward_df.rename(columns={'Ward':'ward'})
hv.extension('bokeh')
wards_shape = gpd.read_file('Boundaries - Wards (2015-)/geo_export_7fe30167-754d-4ed5-947f-c515456d9762.shp')
wards_shape['ward']=wards_shape['ward'].apply(int)
gdf = gpd.GeoDataFrame(pd.merge(ward_df, wards_shape))
plot_opts = dict(tools=['hover'], width=750, height=700, color_index='Total_Crimes',
colorbar=True, toolbar='above', xaxis=None, yaxis=None)
plot = gts.CartoLight * gv.Polygons(gdf, vdims=['ward', 'Total_Crimes'], label='Chicago Crime Ward Map').opts(plot=plot_opts, style=dict(alpha=0.7))
# gv.renderer('bokeh').save(plot, 'ward_crimes')
plot
To build a crime alert, we cluster a sample of 500,000 crimes into 200 clusters with k-means. The number of clusters is a tunable parameter: the more clusters, the more localized the alerts and the better the alert system.
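As a rough sanity check on the choice of 200 clusters, one option is to watch how the k-means inertia (within-cluster sum of squared distances) falls as the number of clusters grows. The subsample size and candidate counts below are illustrative assumptions, not values used in the original analysis.
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
# Illustrative elbow check on a smaller subsample (sizes and counts are assumptions)
coords = df.sample(50000)[['Longitude', 'Latitude']]
ks = [25, 50, 100, 200, 400]
inertias = [KMeans(n_clusters=k, init='k-means++').fit(coords).inertia_ for k in ks]
plt.plot(ks, inertias, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Within-cluster sum of squares (inertia)')
plt.title('Elbow check for the cluster count')
plt.show()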
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.metrics import pairwise_distances
data=df.sample(500000).copy()
ml = KMeans(n_clusters=200, init='k-means++')
ml.fit(data[['Longitude', 'Latitude']])
cluster = ml.cluster_centers_
cluster[:10]
X = data[['Longitude','Latitude']].values
predictions = ml.predict(X)  # use the already-fitted model; fit_predict would re-fit and shift the centers stored above
kclustered = pd.concat([data.reset_index(),
pd.DataFrame({'Cluster':predictions})],
axis=1)
kclustered.drop('index', axis=1, inplace=True)
centers = ml.cluster_centers_
kcenters=pd.DataFrame(centers)
kcenters=kcenters.rename(columns={0:'Longitude',1:'Latitude'})
kcenters['Total Crimes']=kclustered.groupby('Cluster')['ID'].count().reset_index()['ID']
kcenters
from geopy.geocoders import Nominatim
geolocator=Nominatim(timeout=3)
address=[]
for index,row in kcenters.iterrows():
rev_location=geolocator.reverse(np.array([row.Latitude, row.Longitude]))
address.append(rev_location.address)
kcenters['Address']=address
kcenters.head()
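Reverse-geocoding each of the 200 centers goes through Nominatim's public endpoint, which asks for roughly one request per second. If the loop above runs into timeouts, a throttled variant using geopy's RateLimiter (available in geopy 1.16+) might look like this sketch.
# Sketch: throttle the reverse-geocoding calls to respect Nominatim's usage policy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
geolocator = Nominatim(timeout=3)  # newer geopy releases also require a user_agent string
reverse = RateLimiter(geolocator.reverse, min_delay_seconds=1)
kcenters['Address'] = [reverse((row.Latitude, row.Longitude)).address
                       for _, row in kcenters.iterrows()]
kcenters.head()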
import folium
m = folium.Map(location=[41.8781,-87.64], zoom_start=11)
for i in range(0,len(kcenters)):
folium.Circle(
location=[kcenters.iloc[i]['Latitude'], kcenters.iloc[i]['Longitude']],
popup = (
"<b>Location:</b> {loc}</br></br>"
"<b>Crimes: </b> {crime}<br>"
).format(loc=str(kcenters.iloc[i]['Address']), crime=str(kcenters.iloc[i]['Total Crimes'])),
radius=kcenters.iloc[i]['Total Crimes']/15,
color='red',
fill=True,
fill_color='red',
fill_opacity=0.5
).add_to(m)
folium.TileLayer('cartodbpositron').add_to(m)
m.save('clustered_200.html')
m
data['cluster'] = ml.predict(data[['Longitude','Latitude']])
data[['ID','Latitude','Longitude','Block','cluster']].sample(10)
from scipy.spatial import Voronoi
def voronoi_polygons_2d(vor, radius=None):
"""
Reconstruct infinite voronoi regions in a 2D diagram to finite
regions.
Parameters:
vor : Voronoi
Input diagram.
radius : float, optional
Distance to 'points at infinity'.
Returns:
regions : list of tuples
Indices of the vertices in each revised Voronoi region.
vertices : list of tuples
Coordinates of the revised Voronoi vertices. Same as the coordinates
of the input vertices, with 'points at infinity' appended to the
end.
"""
if vor.points.shape[1] != 2:
raise ValueError("Requires 2D input")
new_regions = []
new_vertices = vor.vertices.tolist()
center = vor.points.mean(axis=0)
if radius is None:
radius = vor.points.ptp() * 2  # overall extent of the points, used as a stand-in for 'infinity'
# Construct a map containing all ridges for a given point
all_ridges = {}
for (p1, p2), (v1, v2) in zip(vor.ridge_points, vor.ridge_vertices):
all_ridges.setdefault(p1, []).append((p2, v1, v2))
all_ridges.setdefault(p2, []).append((p1, v1, v2))
# Reconstruct infinite regions
for p1, region in enumerate(vor.point_region):
vertices = vor.regions[region]
if all([v >= 0 for v in vertices]):
# finite region
new_regions.append(vertices)
continue
# reconstruct a non-finite region
ridges = all_ridges[p1]
new_region = [v for v in vertices if v >= 0]
for p2, v1, v2 in ridges:
if v2 < 0:
v1, v2 = v2, v1
if v1 >= 0:
# finite ridge: already in the region
continue
# Compute the missing endpoint of an infinite ridge
t = vor.points[p2] - vor.points[p1] # tangent
t /= np.linalg.norm(t)
n = np.array([-t[1], t[0]]) # normal
midpoint = vor.points[[p1, p2]].mean(axis=0)
direction = np.sign(np.dot(midpoint - center, n)) * n
far_point = vor.vertices[v2] + direction * radius
new_region.append(len(new_vertices))
new_vertices.append(far_point.tolist())
# sort region counterclockwise
vs = np.asarray([new_vertices[v] for v in new_region])
c = vs.mean(axis=0)
angles = np.arctan2(vs[:,1] - c[1], vs[:,0] - c[0])
new_region = np.array(new_region)[np.argsort(angles)]
# finish
new_regions.append(new_region.tolist())
return new_regions, np.asarray(new_vertices)
# make up data points
points = cluster
# compute Voronoi tesselation
vor = Voronoi(points)
# compute regions
regions, vertices = voronoi_polygons_2d(vor)
# prepare figure
plt.style.use('seaborn-white')
fig = plt.figure()
fig.set_size_inches(20,20)
#geomap
# centroids
plt.plot(points[:,0], points[:,1], 'wo',markersize=10)
# colorize
for region in regions:
polygon = vertices[region]
plt.fill(*zip(*polygon), alpha=0.4)
plt.scatter(data['Longitude'],data['Latitude'],c='red')
plt.xlim(vor.min_bound[0] - 0.1, vor.max_bound[0] + 0.1)
plt.ylim(vor.min_bound[1] - 0.1, vor.max_bound[1] + 0.1)
plt.show()
global_df = data.groupby(['cluster', 'Block']).size().reset_index()
global_df.columns = ['cluster', 'Block', 'count']
global_df.head()
topcrimes_df=global_df.sort_values('count',ascending=False)
topcrimes_df.groupby(['cluster'])['count'].max().reset_index()
# topcrimes_df is already sorted by count, so dropping duplicate clusters keeps only the most crime-heavy block within each cluster (a quick cross-check follows the preview below)
topcrimes_df=topcrimes_df.drop_duplicates(['cluster'])
topcrimes_df.to_csv('topcrimes_df.csv')
topcrimes_df.head(20)
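As an optional cross-check of the step above, picking each cluster's highest-count row directly with groupby/idxmax should select the same clusters (ties in the counts could pick a different block, so only the cluster sets are compared here).
# Optional cross-check: the idxmax route should agree with sort + drop_duplicates
top_by_idxmax = global_df.loc[global_df.groupby('cluster')['count'].idxmax()]
assert set(top_by_idxmax['cluster']) == set(topcrimes_df['cluster'])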
from IPython.core.display import display, HTML
import requests, json
def get_crime_url(location):
# text = requests.utils.quote(location)
url = "http://maps.google.com/maps?q={},{}".format(location[1],location[0])
return url
def crime_alert_closest(lon, lat):
cluster = ml.predict(np.array([lon, lat]).reshape(1, -1))[0]
crime_block = str(topcrimes_df[topcrimes_df['cluster']==cluster].iloc[0]['Block'])
count = topcrimes_df[topcrimes_df['Block']==crime_block].iloc[0]['count']
# location=np.array([lon,lat])
location=df[df['Block']==crime_block][['Longitude','Latitude']].mean().values
url = get_crime_url(location)
if url:
crime_html = '<a href="{}">{}</a>'.format(url, crime_block)
else:
crime_html = crime_block
msg = "The most violent block closest to your location is {} and the total crimes in that block is {}".format(crime_html,count)
return display(HTML(msg))
crime_alert_closest( -87.6, 41.8)
crime_alert_closest(-88.627, 39.7)
crime_alert_closest(-87.62923881, 41.88393292)
from pygeocoder import Geocoder
from geopy.geocoders import Nominatim
def get_crime_url(location):
url = "http://maps.google.com/maps?q={},{}".format(location.item(0),location.item(1))
return url
def crime_alert(crime_lon, crime_lat, person_lon, person_lat):
msg=[]
cluster_crime = ml.predict(np.array([crime_lon,crime_lat]).reshape(1, -1))[0]
cluster_person = ml.predict(np.array([person_lon,person_lat]).reshape(1, -1))[0]
crime_block = str(topcrimes_df[topcrimes_df['cluster']==cluster_crime].iloc[0]['Block'])
# location=data[data['Block']==crime_block][['Longitude','Latitude']].mean().values
geolocator = Nominatim()
location_add=np.array([crime_lat, crime_lon])
rev_location = geolocator.reverse(location_add)
address=(rev_location.address)
url = get_crime_url(location_add)
if url:
crime_html = '<a href="{}">{}</a>'.format(url, address)
else:
crime_html = address
if cluster_crime==cluster_person:
msg = "There is a crime near your location, at {}".format(crime_html)
else:
msg='No crimes around you now'
return display(HTML(msg))
crime_alert(-87.627877, 41.931080,-85.62923881, 41.88393292)
crime_alert(-87.75, 41.88393292,-87.739, 41.892)
crime_alert(-87.789, 41.97,-87.79, 41.975)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
clean_df=pd.DataFrame(pd.read_csv('cleaned_file.csv'))
df_date_group=clean_df.groupby('dates').size().reset_index(name='Freq')
df_date_group['dates']=pd.to_datetime(df_date_group['dates'])
# Drop the handful of 2017 dates so the 2017-2018 window is left empty for the model to forecast
df_date_group=df_date_group.drop(df_date_group.index[1827:1845])
df_date_group.tail()
from fbprophet import Prophet
crime_model = Prophet(interval_width=0.95)
crime_data = df_date_group.rename(columns={'dates': 'ds', 'Freq': 'y'})
crime_model.fit(crime_data)
crime_forecast = crime_model.make_future_dataframe(periods=365, freq='D')
crime_forecast = crime_model.predict(crime_forecast)
fig = crime_model.plot(crime_forecast, xlabel='Date', ylabel='Crimes')  # Prophet creates its own figure; resize it directly
fig.set_size_inches(20, 6)
plt.title('Crimes');
crime_model.plot_components(crime_forecast);
crime_forecast[crime_forecast['ds']>='2017-01-01']['yhat'].sum()
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool,ColumnDataSource,RangeTool,LegendItem,Legend
from bokeh.layouts import column
from bokeh.io import output_file,save
from bokeh.layouts import row,gridplot
from numpy import histogram, linspace
from scipy.stats.kde import gaussian_kde
output_notebook()
source_prophet = ColumnDataSource(data=dict(date=crime_forecast.ds,y=crime_forecast.yhat))
source_original = ColumnDataSource(data=dict(date=df_date_group.dates.dt.date,y=df_date_group.Freq))
prophet_forecast = figure(title='Total Crimes over the years (Predicted value in Red)',width=950, height=450,
tools='save,wheel_zoom,pan,reset,box_zoom',x_axis_type='datetime',sizing_mode="scale_width",x_range=(crime_forecast.ds.min(), crime_forecast.ds.max()))
prophet_forecast.scatter('date','y',source=source_original,line_width=2,
color='black',fill_alpha=0.5,size=2,legend='Actual Crimes')
prophet_forecast.line(crime_forecast.iloc[0:1827].ds,crime_forecast.iloc[0:1827].yhat,
line_width=2,color='blue',legend='Actual Value Trend')
prophet_forecast.line(crime_forecast.iloc[-365:].ds,crime_forecast.iloc[-365:].yhat,
line_width=2,color='red',line_alpha=0.3,legend='Predicted Value')
prophet_forecast.line(crime_forecast.ds,crime_forecast.yhat_lower,
line_width=2,color='lightblue',line_alpha=0.3,legend='95% Confidence Interval')
prophet_forecast.line(crime_forecast.ds,crime_forecast.yhat_upper,
line_width=2,color='lightblue',line_alpha=0.3)
prophet_forecast.xaxis.axis_label = "Year"
prophet_forecast.yaxis.axis_label = "Total Crimes"
prophet_forecast.xgrid.grid_line_color = None
prophet_forecast.ygrid.grid_line_color = None
prophet_train = figure(title="Date Selection",
plot_height=100, plot_width=950, y_range=prophet_forecast.y_range,
x_axis_type="datetime", y_axis_type=None,
tools="", toolbar_location=None, background_fill_color="#efefef")
range_tool_prophet = RangeTool(x_range=prophet_forecast.x_range)
range_tool_prophet.overlay.fill_color = "navy"
range_tool_prophet.overlay.fill_alpha = 0.2
prophet_train.scatter('date', 'y', source=source_original,size=1)
prophet_train.line('date', 'y', source=source_prophet)
prophet_train.ygrid.grid_line_color = None
prophet_train.add_tools(range_tool_prophet)
prophet_train.toolbar.active_multi = range_tool_prophet
show(column(prophet_forecast,prophet_train))
# output_file("FBProphet_Output.html", title="FBProphet Output")
# save(column(prophet_forecast, prophet_train))
from fbprophet import Prophet
crime_model = Prophet(interval_width=0.95)
crime_data = df_date_group.rename(columns={'dates': 'ds', 'Freq': 'y'})
crime_model.fit(crime_data)
crime_forecast = crime_model.make_future_dataframe(periods=730, freq='D')
crime_forecast = crime_model.predict(crime_forecast)
fig = crime_model.plot(crime_forecast, xlabel='Date', ylabel='Crimes')  # Prophet creates its own figure; resize it directly
fig.set_size_inches(20, 6)
plt.title('Crimes');
crime_forecast[(crime_forecast['ds']>='2017-01-01')&(crime_forecast['ds']<='2018-12-02')]['yhat'].sum()