import glob
import pandas as pd
import geopandas as gpd
from sklearn import preprocessing
import matplotlib.pyplot as plt
from shapely.geometry import Point
#Download MTA station data with Station IDs and keep only Station Name, ID, and LAT LON
station_ids = pd.read_csv("http://web.mta.info/developers/data/nyct/subway/Stations.csv")
station_ids = station_ids.drop(['Station ID', 'Complex ID', 'Borough', 'Line', 'Structure', 'Daytime Routes', 'Division'], axis=1)
station_ids.columns = ['STOP_ID', 'STATION', 'LAT', 'LON']
station_ids.STOP_ID = station_ids.STOP_ID.astype(str)
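#GTFS stop IDs mix numeric and alphanumeric values, so cast the merge keys to str on both sides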
#Load volunteer-compiled data matching station IDs to turnstile UNITs
unit_ids = pd.read_csv("data/station_data/unit_ids.csv")
unit_ids = unit_ids.drop(['comments', 'previous errors', 'Line Name', 'Division', 'Station', 'Booth'], axis=1)
unit_ids.columns = ['UNIT', 'STOP_ID']
unit_ids.STOP_ID = unit_ids.STOP_ID.astype(str)
unit_ids.UNIT = unit_ids.UNIT.astype(str)
#Merge the LAT LON and NAMES with the UNITS on STATION IDs
station_ids_with_unit_ids = station_ids.merge(unit_ids, on="STOP_ID")
station_ids_with_unit_ids = station_ids_with_unit_ids.drop(['STOP_ID'], axis=1)
station_ids_with_unit_ids.to_csv("data/station_data/stations.csv", index=False)
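#Optional sanity check (illustrative): how many units matched coordinates
#depends on the vintage of the two input files
print(f"{station_ids_with_unit_ids.UNIT.nunique()} units matched to "
      f"{station_ids_with_unit_ids.STATION.nunique()} stations")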
#Open all downloaded text data and combine it into one DataFrame
path = "data/text_data"
all_files = glob.glob(path + "/*.txt")
file_list = []
for file in all_files:
    df = pd.read_csv(file, index_col=None, header=0)
    file_list.append(df)
turnstile_data = pd.concat(file_list, axis=0, ignore_index=True)
#Drop extraneous columns and remove non-subway lines (the PATH and the Staten Island Railway).
#Keep C/A and SCP for now: together with UNIT they identify the physical turnstile,
#which matters when differencing the cumulative counters below
turnstile_data.drop(["DESC", "LINENAME"], axis=1, inplace=True)
turnstile_data.columns = ['CA', 'UNIT', 'SCP', 'STATION', 'DIV', 'DATE', 'TIME', 'ENTRIES', 'EXITS']
turnstile_data = turnstile_data[~turnstile_data.DIV.isin(["SRT", "RIT", "PTH"])]
turnstile_data.drop(["DIV"], axis=1, inplace=True)
#Combine the date and time strings into a single pandas datetime column
turnstile_data['DATETIME'] = turnstile_data.DATE + ' ' + turnstile_data.TIME
turnstile_data['DATETIME'] = pd.to_datetime(turnstile_data['DATETIME'], format='%m/%d/%Y %H:%M:%S')
turnstile_data.drop(["DATE", "TIME", "STATION"], axis=1, inplace=True)
#The ENTRIES and EXITS counters are cumulative per physical turnstile, so sort
#chronologically and difference within each (C/A, UNIT, SCP) group rather than
#across the whole column, then drop the turnstile identifiers
turnstile_data = turnstile_data.sort_values(['CA', 'UNIT', 'SCP', 'DATETIME'])
turnstile_data[['ENTRIES', 'EXITS']] = turnstile_data.groupby(['CA', 'UNIT', 'SCP'])[['ENTRIES', 'EXITS']].diff()
turnstile_data = turnstile_data.drop(['CA', 'SCP'], axis=1)
#Remove negative numbers and extreme outliers
turnstile_data = turnstile_data[turnstile_data.ENTRIES >= 0]
turnstile_data = turnstile_data[turnstile_data.EXITS >= 0]
turnstile_data = turnstile_data[turnstile_data.ENTRIES <= turnstile_data.ENTRIES.quantile(.99)]
turnstile_data = turnstile_data[turnstile_data.EXITS <= turnstile_data.EXITS.quantile(0.99)]
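#The first reading of each turnstile diffs to NaN and counter resets produce
#negative values; both fail the >= 0 filters above. The 99th-percentile caps
#trim the huge jumps that counter replacements can leave behind.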
#Write out cleaned data to csv
turnstile_data.to_csv("data/cleaned_data/nonaveraged_turnstile_data.csv", index=False)
#Read in data, reformat datetime, and then groupby and aggregate by station and time
turnstile_data = pd.read_csv("data/cleaned_data/nonaveraged_turnstile_data.csv")
turnstile_data['DATETIME'] = pd.to_datetime(turnstile_data['DATETIME'], format='%Y-%m-%d %H:%M:%S')
turnstile_data = turnstile_data.groupby(['UNIT', 'DATETIME'], as_index=True).aggregate('sum')
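#Summing here combines the per-turnstile counts that share a unit and timestamp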
#Readings arrive only every few hours and the reporting times differ from unit to unit,
#so resample each unit onto an hourly grid and linearly interpolate the gaps
turnstile_data = turnstile_data.groupby(level=0).apply(lambda x: x.reset_index(level=0, drop=True).resample("1H").interpolate(method='linear'))
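#E.g. readings at 08:00 and 12:00 become five hourly rows (08:00 through 12:00)
#with the counts in between filled in linearly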
#Average over the whole period so that each unit has one value per hour of each day of the week
turnstile_data.reset_index(inplace=True)
turnstile_data['HOUR'] = turnstile_data.DATETIME.dt.hour
turnstile_data['DAYOFWEEK'] = turnstile_data.DATETIME.dt.dayofweek
turnstile_data = turnstile_data.drop(['DATETIME'], axis=1)
turnstile_data = turnstile_data.groupby(['UNIT', 'DAYOFWEEK', 'HOUR'], as_index=False).mean()
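#Each unit now has up to 7 days x 24 hours = 168 averaged rows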
#Min-max normalize entries and exits onto a common [0, 1] scale so the two counts
#can be compared directly before differencing
#MinMaxScaler rescales each column independently, so one scaler covers both
scaler = preprocessing.MinMaxScaler()
turnstile_data[['NORMALIZED_ENTRIES', 'NORMALIZED_EXITS']] = scaler.fit_transform(
    turnstile_data[['ENTRIES', 'EXITS']].values.astype(float))
turnstile_data['INTENSITY'] = turnstile_data['NORMALIZED_EXITS'] - turnstile_data['NORMALIZED_ENTRIES']
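#INTENSITY lies in [-1, 1]: positive where normalized exits dominate, negative where entries dominate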
turnstile_data = turnstile_data.drop(['NORMALIZED_ENTRIES', 'NORMALIZED_EXITS', 'ENTRIES', 'EXITS'], axis=1)
#Merge with the stations data to attach station names and coordinates
station_ids_with_unit_ids = pd.read_csv("data/station_data/stations.csv")
turnstile_data = turnstile_data.merge(station_ids_with_unit_ids, on="UNIT")
turnstile_data = turnstile_data.drop(['UNIT'], axis=1)
turnstile_data = turnstile_data.drop_duplicates()
#Write out the data to csv
turnstile_data.to_csv("data/cleaned_data/averaged_turnstile_data.csv", index=False)
#Turn the lat and lon into coordinates
turnstile_data['COORDINATES'] = list(zip(turnstile_data.LON.astype(float), turnstile_data.LAT.astype(float)))
turnstile_data = turnstile_data.drop(['LON', 'LAT'], axis=1)
#Turn coordinates into a geometry and write out a shapefile
turnstile_data['COORDINATES'] = turnstile_data['COORDINATES'].apply(Point)
turnstile_data = gpd.GeoDataFrame(turnstile_data, geometry='COORDINATES')
turnstile_data = turnstile_data.set_crs(epsg=4326)
turnstile_data.to_file(driver = 'ESRI Shapefile', filename= "data/shapefiles/nonpivoted_averaged_turnstile_data.shp")
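#Note: the ESRI Shapefile format truncates field names to 10 characters, so keep column names short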
#Read in the data
turnstile_data = pd.read_csv("data/cleaned_data/averaged_turnstile_data.csv")
#Relabel days of the week and zero-pad hours so the combined labels sort correctly;
#pandas dt.dayofweek runs Monday=0 through Sunday=6
daysofweek = {
    0: '00MON',
    1: '01TUE',
    2: '02WED',
    3: '03THU',
    4: '04FRI',
    5: '05SAT',
    6: '06SUN'}
turnstile_data.DAYOFWEEK = turnstile_data.DAYOFWEEK.replace(daysofweek)
turnstile_data['HOUR'] = turnstile_data['HOUR'].astype(int).astype(str).str.zfill(2)
turnstile_data['TIME'] = turnstile_data.DAYOFWEEK.astype('str') + turnstile_data.HOUR.astype('str') + "00"
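#This yields sortable labels like '00MON0800' (Monday 08:00) through '06SUN2300'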
turnstile_data = turnstile_data.drop(['DAYOFWEEK', 'HOUR'], axis=1)
#Pivot the data
turnstile_data = pd.pivot_table(turnstile_data, values='INTENSITY', index='STATION', columns='TIME')
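#pivot_table defaults to aggfunc='mean', so stations served by several units are averaged into one row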
#Merge pivoted data with station name and coordinates
stations = pd.read_csv('data/station_data/stations.csv')
stations = stations.drop(['UNIT'], axis=1)
stations = stations.drop_duplicates('STATION')
turnstile_data = pd.merge(turnstile_data.reset_index(), stations, on='STATION')
#Write out pivoted data to shapefile
turnstile_data['COORDINATES'] = list(zip(turnstile_data.LON, turnstile_data.LAT))
turnstile_data = turnstile_data.drop(['LON', 'LAT'], axis=1)
turnstile_data['COORDINATES'] = turnstile_data['COORDINATES'].apply(Point)
turnstile_data = gpd.GeoDataFrame(turnstile_data, geometry='COORDINATES')
turnstile_data = turnstile_data.set_crs(epsg=4326)
turnstile_data.to_file(driver = 'ESRI Shapefile', filename= "data/shapefiles/pivoted_averaged_turnstile_data.shp")
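#Optional quick look (illustrative): map a single averaged time slice directly
#from the GeoDataFrame; the column label '00MON0800' (Monday 08:00) follows the
#naming scheme built above
ax = turnstile_data.plot(column='00MON0800', markersize=5, legend=True)
ax.set_title('Average relative intensity, Monday 08:00')
plt.show()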