-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathData Cleaning .sql
58 lines (52 loc) · 1.7 KB
/
Data Cleaning .sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
-- Data Cleaning
DROP TABLE IF EXISTS `GDA.cleaned_combined_data`;
-- creating new table with clean data
CREATE TABLE IF NOT EXISTS `GDA.cleaned_combined_data` AS (
SELECT
a.ride_id, rideable_type, started_at, ended_at,
ride_length,
CASE EXTRACT(DAYOFWEEK FROM started_at)
WHEN 1 THEN 'SUN'
WHEN 2 THEN 'MON'
WHEN 3 THEN 'TUES'
WHEN 4 THEN 'WED'
WHEN 5 THEN 'THURS'
WHEN 6 THEN 'FRI'
WHEN 7 THEN 'SAT'
END AS day_of_week,
CASE EXTRACT(MONTH FROM started_at)
WHEN 1 THEN 'JAN'
WHEN 2 THEN 'FEB'
WHEN 3 THEN 'MAR'
WHEN 4 THEN 'APR'
WHEN 5 THEN 'MAY'
WHEN 6 THEN 'JUN'
WHEN 7 THEN 'JUL'
WHEN 8 THEN 'AUG'
WHEN 9 THEN 'SEP'
WHEN 10 THEN 'OCT'
WHEN 11 THEN 'NOV'
WHEN 12 THEN 'DEC'
END AS month,
start_station_name, end_station_name,
start_lat, start_lng, end_lat, end_lng, member_casual
FROM `GDA.2023_combined_data` a
JOIN (
SELECT ride_id, (
EXTRACT(HOUR FROM (ended_at - started_at)) * 60 +
EXTRACT(MINUTE FROM (ended_at - started_at)) +
EXTRACT(SECOND FROM (ended_at - started_at)) / 60) AS ride_length
FROM `GDA.2023_combined_data`
) b
ON a.ride_id = b.ride_id
WHERE
start_station_name IS NOT NULL AND
end_station_name IS NOT NULL AND
end_lat IS NOT NULL AND
end_lng IS NOT NULL AND
ride_length > 1 AND ride_length < 1440
);
ALTER TABLE `GDA.cleaned_combined_data` -- set ride_id as primary key
ADD PRIMARY KEY(ride_id) NOT ENFORCED;
SELECT COUNT(ride_id) AS no_of_rows -- returned 4243432 rows so 1476445 rows removed
FROM `GDA.cleaned_combined_data`;