In [ ]:
import wget
import glob
import zipfile

import numpy as np
import pandas as pd
import geopandas as gpd
from sklearn import preprocessing

import matplotlib.pyplot as plt
from shapely.geometry import Point

Match Turnstile "Unit" with Station ID & Location

In [ ]:
# Download MTA station data with station IDs and keep only name, stop ID, and LAT/LON.
# BUG FIX: the original path was an empty string, which raises on read.
# NOTE(review): the drop list below exactly matches the columns of the official
# MTA Stations.csv, so that file is presumably the intended input — TODO confirm URL.
station_ids = pd.read_csv("http://web.mta.info/developers/data/nyct/subway/Stations.csv")
station_ids = station_ids.drop(['Station ID', 'Complex ID', 'Borough', 'Line', 'Structure', 'Daytime Routes', 'Division'], axis=1)
# Remaining columns (GTFS Stop ID, Stop Name, GTFS Latitude, GTFS Longitude) renamed positionally
station_ids.columns = ['STOP_ID', 'STATION', 'LAT', 'LON']
station_ids.STOP_ID = station_ids.STOP_ID.astype(str)

# Volunteer-collected data matching GTFS stop IDs with turnstile UNITs
unit_ids = pd.read_csv("data/station_data/unit_ids.csv")
unit_ids = unit_ids.drop(['comments', 'previous errors', 'Line Name', 'Division', 'Station', 'Booth'], axis=1)
unit_ids.columns = ['UNIT', 'STOP_ID']
# Cast both join/ID columns to str so the merge key types agree across files
unit_ids = unit_ids.astype({'STOP_ID': str, 'UNIT': str})

# Merge the LAT/LON and names onto the UNITs via the shared stop ID, then persist
station_ids_with_unit_ids = station_ids.merge(unit_ids, on="STOP_ID")
station_ids_with_unit_ids = station_ids_with_unit_ids.drop(['STOP_ID'], axis=1)
station_ids_with_unit_ids.to_csv("data/station_data/stations.csv", index=False)

Combine and Clean All Turnstile Data

In [ ]:
# Open all downloaded text data, combine into one DataFrame, and clean.
path = "data/text_data"
allFiles = glob.glob(path + "/*.txt")

# Combine it all into one DataFrame.
# BUG FIX: the original loop read each file into `df` but never appended it,
# so `file_list` stayed empty and pd.concat raised on an empty list.
file_list = []
for file in allFiles:
    file_list.append(pd.read_csv(file, index_col=None, header=0))
turnstile_data = pd.concat(file_list, axis=0, ignore_index=True)

# Keep C/A and SCP for now: the cumulative ENTRIES/EXITS counters are
# per physical turnstile (C/A + UNIT + SCP), so we must difference within
# each turnstile, not across the whole concatenated frame as before.
turnstile_data.drop(["DESC", "LINENAME"], axis=1, inplace=True)
turnstile_data.columns = ['CA', 'UNIT', 'SCP', 'STATION', 'DIV', 'DATE', 'TIME', 'ENTRIES', 'EXITS']

# Remove non-subway lines: Staten Island Railway (SRT/RIT) and the PATH (PTH)
turnstile_data = turnstile_data[~turnstile_data.DIV.isin(["SRT", "RIT", "PTH"])]
turnstile_data.drop(["DIV"], axis=1, inplace=True)

# Reformat the date and time into a single pandas datetime column
turnstile_data['DATETIME'] = pd.to_datetime(turnstile_data.DATE + ' ' + turnstile_data.TIME, format='%m/%d/%Y %H:%M:%S')
turnstile_data.drop(["DATE", "TIME", "STATION"], axis=1, inplace=True)

# BUG FIX: the counters are cumulative *per turnstile*; the original applied
# .diff() to the whole frame, mixing counters from unrelated turnstiles and
# files. Sort each turnstile's readings by time and difference within it.
turnstile_data = turnstile_data.sort_values(['CA', 'UNIT', 'SCP', 'DATETIME'])
per_turnstile = turnstile_data.groupby(['CA', 'UNIT', 'SCP'])
turnstile_data.ENTRIES = per_turnstile.ENTRIES.diff()
turnstile_data.EXITS = per_turnstile.EXITS.diff()
turnstile_data.drop(["CA", "SCP"], axis=1, inplace=True)

# Remove negative numbers (counter resets/replacements) and extreme outliers
turnstile_data = turnstile_data[turnstile_data.ENTRIES >= 0]
turnstile_data = turnstile_data[turnstile_data.EXITS >= 0]
turnstile_data = turnstile_data[turnstile_data.ENTRIES <= turnstile_data.ENTRIES.quantile(0.99)]
turnstile_data = turnstile_data[turnstile_data.EXITS <= turnstile_data.EXITS.quantile(0.99)]

# Write out cleaned data to csv (columns: UNIT, ENTRIES, EXITS, DATETIME)
turnstile_data.to_csv("data/cleaned_data/nonaveraged_turnstile_data.csv", index=False)

Group, Average, and Scale the Data based on Hour and Day of Week

In [ ]:
# Read in the cleaned data, reformat the datetime, then group and aggregate by station and time.
turnstile_data = pd.read_csv("data/cleaned_data/nonaveraged_turnstile_data.csv")
# BUG FIX: pandas serializes datetimes as 'YYYY-MM-DD HH:MM:SS' (dashes);
# the original '%Y/%m/%d %H:%M:%S' format (slashes) would fail to parse.
turnstile_data['DATETIME'] = pd.to_datetime(turnstile_data['DATETIME'], format='%Y-%m-%d %H:%M:%S')
turnstile_data = turnstile_data.groupby(['UNIT', 'DATETIME'], as_index=True).aggregate('sum')

# Readings arrive every few hours at station-specific times, leaving gaps that
# don't line up across stations — resample each station onto a common hourly
# grid and linearly interpolate the gaps.
turnstile_data = turnstile_data.groupby(level=0).apply(lambda x: x.reset_index(level=0, drop=True).resample("1H").interpolate(method='linear'))
# BUG FIX: after the groupby/resample, UNIT and DATETIME live in the MultiIndex,
# so `turnstile_data.DATETIME` below would raise. Restore them as columns.
turnstile_data = turnstile_data.reset_index()

# Average over the whole period so instead of individual days we have one
# mean value per (station, day-of-week, hour-of-day).
turnstile_data['HOUR'] = turnstile_data.DATETIME.dt.hour
turnstile_data['DAYOFWEEK'] = turnstile_data.DATETIME.dt.dayofweek
turnstile_data = turnstile_data.drop(['DATETIME'], axis=1)
turnstile_data = turnstile_data.groupby(['UNIT', 'DAYOFWEEK', 'HOUR'], as_index=False).mean()

# Min-max scale entries and exits across all stations/times so absolute ridership
# volume (e.g. general population density) doesn't dominate the comparison.
entries = turnstile_data[['ENTRIES']].values.astype(float)
entries_min_max_scaler = preprocessing.MinMaxScaler()
turnstile_data['NORMALIZED_ENTRIES'] = entries_min_max_scaler.fit_transform(entries)

exits = turnstile_data[['EXITS']].values.astype(float)
exits_min_max_scaler = preprocessing.MinMaxScaler()
turnstile_data['NORMALIZED_EXITS'] = exits_min_max_scaler.fit_transform(exits)

# INTENSITY > 0 means relatively more exits than entries at that time
turnstile_data['INTENSITY'] = turnstile_data['NORMALIZED_EXITS'] - turnstile_data['NORMALIZED_ENTRIES']
turnstile_data = turnstile_data.drop(['NORMALIZED_ENTRIES', 'NORMALIZED_EXITS', 'ENTRIES', 'EXITS'], axis=1)

# Merge with the stations data to attach names and coordinates
station_ids_with_unit_ids = pd.read_csv("data/station_data/stations.csv")
turnstile_data = turnstile_data.merge(station_ids_with_unit_ids, on="UNIT")
turnstile_data = turnstile_data.drop(['UNIT'], axis=1)
turnstile_data = turnstile_data.drop_duplicates()

# Write out the averaged data to csv
turnstile_data.to_csv("data/cleaned_data/averaged_turnstile_data.csv", index=False)

# Turn the lat and lon into point geometry and write out a shapefile
turnstile_data['COORDINATES'] = [Point(xy) for xy in zip(turnstile_data.LON.astype(float), turnstile_data.LAT.astype(float))]
turnstile_data = turnstile_data.drop(['LON', 'LAT'], axis=1)
# BUG FIX: the original line assigned a dict to a function call (SyntaxError);
# the CRS is a GeoDataFrame constructor argument.
turnstile_data = gpd.GeoDataFrame(turnstile_data, geometry='COORDINATES', crs='EPSG:4326')
turnstile_data.to_file(driver='ESRI Shapefile', filename="data/shapefiles/nonpivoted_averaged_turnstile_data.shp")

Pivot the data so that each observation is a station and each column a specific time in the week

In [ ]:
# Read in the averaged data
turnstile_data = pd.read_csv("data/cleaned_data/averaged_turnstile_data.csv")

# Rename the days of the week and zero-pad the hours for better readability.
# NOTE(review): the original dict literals were truncated in this copy of the
# notebook; these mappings are reconstructed so the resulting TIME labels
# (e.g. 'MON0800') fit the shapefile's 10-character field-name limit — TODO confirm.
daysofweek = {0: 'MON', 1: 'TUE', 2: 'WED', 3: 'THU', 4: 'FRI', 5: 'SAT', 6: 'SUN'}
times = {str(h): str(h).zfill(2) for h in range(24)}
turnstile_data.DAYOFWEEK = turnstile_data.DAYOFWEEK.replace(daysofweek)
turnstile_data['HOUR'] = turnstile_data['HOUR'].astype(str).replace(times)
turnstile_data['TIME'] = turnstile_data.DAYOFWEEK.astype('str') + turnstile_data.HOUR.astype('str') + "00"
turnstile_data = turnstile_data.drop(['DAYOFWEEK', 'HOUR'], axis=1)

# Pivot: one row per station, one column per hour-of-week
turnstile_data = pd.pivot_table(turnstile_data, values='INTENSITY', index='STATION', columns='TIME')

# Merge the pivoted data with station names and coordinates
stations = pd.read_csv('data/station_data/stations.csv')
stations = stations.drop(['UNIT'], axis=1)
stations = stations.drop_duplicates('STATION')
stations = stations.set_index('STATION')
turnstile_data = pd.merge(turnstile_data.reset_index(), stations, on='STATION').set_index('STATION')
turnstile_data = turnstile_data.reset_index()

# Write out the pivoted data to a shapefile
turnstile_data['COORDINATES'] = list(zip(turnstile_data.LON, turnstile_data.LAT))
turnstile_data = turnstile_data.drop(['LON', 'LAT'], axis=1)
turnstile_data['COORDINATES'] = turnstile_data['COORDINATES'].apply(Point)
# BUG FIX: the original line assigned a dict to a function call (SyntaxError);
# the CRS is a GeoDataFrame constructor argument.
turnstile_data = gpd.GeoDataFrame(turnstile_data, geometry='COORDINATES', crs='EPSG:4326')
turnstile_data.to_file(driver='ESRI Shapefile', filename="data/shapefiles/pivoted_averaged_turnstile_data.shp")