forked from msr-ds3/subway-flow
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLineNameRetriever.R
166 lines (138 loc) · 7.52 KB
/
LineNameRetriever.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
library(dplyr)
setwd("/home/ewahmed/Desktop/SubwayData")
#Reading in the train sequences file
stop_times <- read.table("stop_times.txt",header=TRUE,
sep=",",fill=TRUE,quote = "",row.names = NULL,
stringsAsFactors = FALSE)
#Getting rid of columns I dont need (stop_headsign, pickup_type, dropoff_type, shape_dist_traveled)
stop_times <- data.frame(stop_times[,c(1,2,3,4,5)])
#Reading in train stations with their stop_ids file
stops <- read.table("modifiedstops.txt",header=TRUE,
sep=",",fill=TRUE,quote = "",row.names = NULL,
stringsAsFactors = FALSE)
#Joining the two data frames so that we know which trains go to which stops (only had ids before)
stop_times_names <- inner_join(stop_times,stops)
#Read in the information of the trips
trips <- read.table("trips.txt",header=TRUE,
sep=",",fill=TRUE,quote = "",row.names = NULL,
stringsAsFactors = FALSE)
#Joining with trips so we know which trains we are actually looking at
trains_names <- inner_join(stop_times_names,trips)
#Getting rid of unnecessary columns
trains_names <- data.frame(trains_names[,c(1,2,3,4,5,6,7)])
#Need to convert into Hour/Minute/Second format to subtract the arrival time from departure (to get duration)
trains_names$arrival_time <- as.POSIXct(trains_names$arrival_time, format='%H:%M:%S')
trains_names$departure_time <- as.POSIXct(trains_names$departure_time, format='%H:%M:%S')
#Subtracting times: column n - column n-1
trains_names <- mutate(trains_names, TravelTime = departure_time - lag(arrival_time))
#Getting rid of both departure and arrival time since we have TravelTime now
trains_names <- data.frame(trains_names[,c(7,6,4,5,8)])
#Combine trips with their stops to get format of B01 B02 etc
zero<- "0"
trains_names$stop_sequence <- paste(zero,trains_names$stop_sequence,sep="")
trains_names$TrainStop <- paste(trains_names$route_id,trains_names$stop_sequence,sep="")
#Getting rid of duplicate train info that we do not need
trains_names$counter=1
trains_names <- trains_names %>% group_by(TrainStop) %>% mutate(cdf = cumsum(counter))
unique_train_stops <- filter(trains_names,cdf==1)
#Renaming features and reformatting data frame... yet again
unique_train_stops<- data.frame(unique_train_stops[,c(1,6,2,5,3,4)])
names(unique_train_stops) <- c('Train','TrainStop','Station','TravelTime','StopID','Stop')
#Clean the duration so that we get rid of the weird durations (-7k etc)
unique_train_stops[unique_train_stops$Stop == "01",]$TravelTime=0
trains_names[trains_names$stop_sequence == "01",]$TravelTime=0
#Getting rid of North and south on StopIDs
unique_train_stops$StopID= substr(unique_train_stops$StopID,0,3)
trains_names$stop_id= substr(trains_names$stop_id,0,3)
#Combining Stations + StopIDs as one feature
unique_train_stops$Station <- paste(unique_train_stops$Station,unique_train_stops$StopID, sep="")
trains_names$X.stop_name. <- paste(trains_names$X.stop_name.,trains_names$stop_id,sep="")
#Getting the LineName so we can later join with Turnstile data
unique_train_stops$LineName = ""
linename_extraction <- unique_train_stops[order(unique_train_stops[,5], unique_train_stops[,1]), ]
linename_extraction$counter=1
linename_extraction <- linename_extraction %>% group_by(StopID) %>% mutate(cdf = cumsum(counter))
n <- 1:length(linename_extraction$Train)
for (i in seq (along=n)){
if(linename_extraction$cdf[i] ==1){
linename_extraction$LineName[i] = linename_extraction$Train[i]
i
}
}
n <- 1:length(linename_extraction$Train)
for (i in seq (along=n)){
if(linename_extraction$cdf[i] > 1){
if (linename_extraction$LineName[i-1]==linename_extraction$Train[i]){
linename_extraction$LineName[i] = linename_extraction$LineName[i-1]
}
else{
linename_extraction$LineName[i] = paste(linename_extraction$LineName[i-1],linename_extraction$Train[i],sep="")
i
}
}
}
maxcdf<- linename_extraction %>% group_by(StopID) %>% summarize(maxcdf = max(cdf))
linename_extraction <- inner_join(linename_extraction,maxcdf)
#Getting only the full linenames, not the buildup (ex. 1, 12, 123 vs just 123 each time)
unique_stopids <- filter(linename_extraction, cdf == maxcdf)
unique_stopids <- data.frame(unique_stopids[,c(5,7)])
#Reordering so that we have all trains and their stops with the stop_ids (even though this is a little inaccurate)
unique_train_lines <- unique_train_stops
unique_train_lines$LineName = NULL
unique_train_lines<- inner_join(unique_train_lines,unique_stopids)
#Loading unique transfer data (differentstopids)
setwd("/home/ewahmed/subway-flow/CleanTransfers/")
transfers <- read.table("differentstopids.txt",header=TRUE,
sep=",",fill=TRUE,quote = "",row.names = NULL,
stringsAsFactors = FALSE)
#Renaming so we can join the transfer data to the train_stopid_lines dataframe
names(transfers)<- c('StopID','TransferID','TransferTime')
names(stops)<- c('TransferID','StationName')
transfers <- inner_join(transfers, stops)
#Combining Stations + TransferIDs as one feature
transfers$StationName <- paste(transfers$StationName,transfers$TransferID, sep="")
#joining
transfers_stopids_lines <- inner_join(unique_train_lines,transfers)
#joining the two data frames together
transfer_lines <- train_stopid_lines
transfer_lines$Station=NULL
transfer_lines$Train=NULL
names(transfer_lines) <- c('TransferID','LineName2')
transfers_stopids_lines <- left_join(transfers_stopids_lines,transfer_lines)
transfers_stopids_lines <- transfers_stopids_lines %>% group_by(Train) %>% arrange(TransferID)
transfers_stopids_lines$Train= NULL
transfers_stopids_lines<- transfers_stopids_lines[!duplicated(transfers_stopids_lines),]
#joining the linenames together so we can merge with turnstile data
transfers_stopids_lines$LineName3 = ""
transfers_stopids_lines <- transfers_stopids_lines %>% group_by(StopID) %>% arrange(LineName2)
transfers_stopids_lines$counter=1
transfers_stopids_lines <- transfers_stopids_lines %>% group_by(StopID) %>% mutate(cdf = cumsum(counter))
n <- 1:length(transfers_stopids_lines$TransferID)
for (i in seq (along=n)){
if(transfers_stopids_lines$cdf[i] > 1){
transfers_stopids_lines$LineName3[i] = paste(transfers_stopids_lines$LineName3[i-1],transfers_stopids_lines$LineName2[i],sep="")
}
else{
transfers_stopids_lines$LineName3[i] = paste(transfers_stopids_lines$LineName[i],transfers_stopids_lines$LineName2[i],sep="")
i
}
}
maxcdf<- transfers_stopids_lines %>% group_by(StopID) %>% summarize(maxcdf = max(cdf))
transfers_stopids_lines <- inner_join(transfers_stopids_lines,maxcdf)
transfers_stopids_lines <- filter(transfers_stopids_lines,cdf==maxcdf)
transfers_lines <- data.frame(transfers_stopids_lines[,c(1,2,7)])
names(transfers_lines)<-c('Station','StopID','LineName')
all_stopid_line2 <- data.frame(train_stopid_lines[,c(3,4)])
transfers_lines$Station=NULL
transfers_lines<- rbind(all_stopid_line2,transfers_lines)
transfers_lines$counter = 1
transfers_lines <- transfers_lines %>% group_by(StopID) %>% mutate(cdf = cumsum(counter))
maxcdf<- transfers_lines %>% group_by(StopID) %>% summarize(maxcdf = max(cdf))
transfers_lines <- inner_join(transfers_lines,maxcdf)
transfers_lines<- filter(transfers_lines, cdf == maxcdf)
transfers_lines <- data.frame(transfers_lines[,c(1,2)])
names(transfers_lines)<- c('stop_id','line_name')
transfers_lines <- inner_join(transfers_lines,stops)
names(transfers_lines)<- c('stop_id','line_name','google_station')
#Export as R file - change the dir/file name per needs
write.csv(transfers_lines, "/home/ewahmed/subway-flow/OldGoogleLineNames.csv")