# Install packages if not already installed
# (you can comment these out after first run)
# install.packages(c("tidyverse","lubridate","janitor","ggplot2","dplyr"))
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(ggplot2)
library(dplyr)
df1 <- read_csv("divvy_trips_2019_q1.csv")
## Rows: 365069 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): start_time, end_time, from_station_name, to_station_name, usertype,...
## dbl (5): trip_id, bikeid, from_station_id, to_station_id, birthyear
## num (1): tripduration
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df2 <- read_csv("divvy_trips_2020_q1.csv")
## Rows: 426887 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, started_at, ended_at, start_station_name, e...
## dbl (6): start_station_id, end_station_id, start_lat, start_lng, end_lat, en...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
I standardized the column names in the Q1 2020 data frame to match those in the Q1 2019 data frame. Specifically, I renamed the “member” and “casual” labels in Q1 2020 to align with the “Subscriber” and “Customer” labels used in Q1 2019. After ensuring the column names and user type labels were consistent, I combined the common columns from both data frames into a single data frame called bike_rides.
# Rename 2020 dataset columns to match 2019
df2_clean <- df2 %>%
rename(
usertype = member_casual,
trip_id = ride_id,
start_time = started_at,
end_time = ended_at,
bikeid = rideable_type,
from_station_id = start_station_id,
from_station_name = start_station_name,
to_station_id = end_station_id,
to_station_name = end_station_name
) %>%
mutate(usertype = ifelse(usertype == "member", "Subscriber", "Customer"))
# Select only common columns
common_cols <- intersect(colnames(df1), colnames(df2_clean))
df1_common <- df1 %>% select(all_of(common_cols)) %>%
mutate(trip_id = as.character(trip_id), bikeid = as.character(bikeid))
df2_common <- df2_clean %>% select(all_of(common_cols))
# Merge
bike_rides <- bind_rows(df1_common, df2_common)
I removed any empty rows to ensure the dataset only included complete trip records.
# Remove empty rows
bike_rides <- janitor::remove_empty(bike_rides,which = c("rows"))
I converted the start and end time columns into proper date-time format, then calculated the trip duration in minutes by subtracting the start time from the end time. I rounded trip duration to two decimal places for consistency. I also extracted the day of the week from the start time to allow daily ride pattern analysis.
bike_rides <- bike_rides %>%
mutate(
start_time = ymd_hms(start_time),
end_time = ymd_hms(end_time),
trip_duration = round(as.numeric(difftime(end_time, start_time, units = "mins")), 2),
day_of_week = weekdays(start_time)
)
I calculated the average, maximum, and minimum trip duration for each usertype.
svc_time <- summarise(bike_rides,
min_time_subscriber = min(trip_duration[usertype == "Subscriber"], na.rm = TRUE),
max_time_subscriber = max(trip_duration[usertype == "Subscriber"], na.rm = TRUE),
mean_time_subscriber = mean(trip_duration[usertype == "Subscriber"], na.rm = TRUE),
min_time_customer = min(trip_duration[usertype == "Customer"], na.rm = TRUE),
max_time_customer = max(trip_duration[usertype == "Customer"], na.rm = TRUE),
mean_time_customer = mean(trip_duration[usertype == "Customer"], na.rm = TRUE)
)
svc_time
## # A tibble: 1 × 6
## min_time_subscriber max_time_subscriber mean_time_subscriber min_time_customer
## <dbl> <dbl> <dbl> <dbl>
## 1 0.02 101607. 13.3 -9.2
## # ℹ 2 more variables: max_time_customer <dbl>, mean_time_customer <dbl>
I examined the distribution of trip duration using high quantiles (95th, 99th, 99.5th, and 99.9th percentiles) to identify extreme outliers. Based on this, I restricted trips to a reasonable range, filtering out trips shorter than 1 minute and longer than 120 minutes.
quantile(bike_rides$trip_duration, probs = c(0.95, 0.99, 0.995, 0.999))
## 95% 99% 99.5% 99.9%
## 33.7300 77.1245 118.6845 890.1035
# Filter unreasonable trips
bike_rides <- bike_rides %>% filter(trip_duration > 1, trip_duration < 120)
I checked the consistency between station IDs and station names to ensure that each station ID maps to exactly one station name and vice versa, preventing issues caused by mismatched or duplicated entries.
# Check station id consistency in from_station_id column
id_conflicts <- bike_rides %>%
select(from_station_id, from_station_name) %>%
distinct() %>%
group_by(from_station_id) %>%
filter(n_distinct(from_station_name) > 1)
# Check station name consistency in from_station_name column
name_conflicts <- bike_rides %>%
select(from_station_id, from_station_name) %>%
distinct() %>%
group_by(from_station_name) %>%
filter(n_distinct(from_station_id) > 1)
# Check station id consistency in to_station_id column
id_conflicts <- bike_rides %>%
select(to_station_id, to_station_name) %>%
distinct() %>%
group_by(to_station_id) %>%
filter(n_distinct(to_station_name) > 1)
# Check station name consistency in to_station_name column
name_conflicts <- bike_rides %>%
select(to_station_id, to_station_name) %>%
distinct() %>%
group_by(to_station_name) %>%
filter(n_distinct(to_station_id) > 1)
I cleaned the inconsistencies in the station names and ids by removing unnecessary symbols, words, and empty spaces in all of the names. I then manually corrected a few remaining mismatches by ensuring that specific station IDs were assigned to their correct names.
# Remove extra symbols, words, and spaces around station names
bike_rides <- bike_rides %>%
mutate(
from_station_name = from_station_name %>%
str_replace_all("\\(\\*\\)", "") %>%
str_replace_all("\\(Temp\\)", "") %>%
str_squish(),
to_station_name = to_station_name %>%
str_replace_all("\\(\\*\\)", "") %>%
str_replace_all("\\(Temp\\)", "") %>%
str_squish()
) %>%
# Fix remaining name inconsistencies manually
mutate(
from_station_name = case_when(
from_station_id == 217 ~ "Elizabeth St & Fulton St",
from_station_id == 19 ~ "Throop St & Taylor St",
from_station_id == 208 ~ "Laflin St & Cullerton St",
from_station_id == 286 ~ "Franklin St & Adams St",
TRUE ~ from_station_name
),
to_station_name = case_when(
to_station_id == 217 ~ "Elizabeth St & Fulton St",
to_station_id == 19 ~ "Throop St & Taylor St",
to_station_id == 208 ~ "Laflin St & Cullerton St",
to_station_id == 286 ~ "Franklin St & Adams St",
TRUE ~ to_station_name
)
)
I created a time of day column using categorization of the hour in the start time to allow for further analysis.
bike_rides <- mutate(bike_rides, start_hour = lubridate::hour(start_time),
time_of_day = case_when(
start_hour >= 5 & start_hour < 12 ~ "Morning",
start_hour >= 12 & start_hour < 18 ~ "Afternoon",
start_hour >= 18 & start_hour < 22 ~ "Evening",
TRUE ~ "Night"
)
)
The dataset has been successfully cleaned and standardized. Inconsistent station names and IDs were corrected, extreme outliers in trip duration were removed, and new features such as trip duration, day of week, and time of day were created. The data is now clean and ready for analysis and visualization.
write.csv(bike_rides, "bike_rides.csv")
Upload bike_rides.csv into tableau
Double click bike_rides.csv
Open new worksheet
Columns: Trip Duration [Rename to Duration (minutes)]
Rows: CNTD(Trip Id) [Rename to Frequency]
Filters:
Usertype: Customer
Duration(min): 1-60
Title: “Distribution of Trip Duration for Customers (Trimmed to 1 hour)”
Duplicate Worksheet 1
Change Usertype filter to Subscriber
Title: “Distribution of Trip Duration for Subscribers (Trimmed to 1 hour)”
Create calculation
Text: Calculation1
Filter: Usertype: Customer
Title: “Average Customer Duration”
Duplicate worksheet 3
Change Usertype filter to Subscriber
Title: “Average Subscriber Duration”
Drag Worksheet 1 to bottom left
Drag Worksheet 2 to top left
Drag Worksheet 3 to bottom right
Drag Worksheet 4 to top right
Add bottom right caption “Data From: Q1 2019 and Q1 2020” to Worksheet 1
Columns: Time of Day
Rows: CNTD(Trip Id) [Rename to Frequency]
Filter: Usertype: Subscriber
Title: “Frequency of Subscriber Trips by Time of Day”
Duplicate Worksheet 5
Change Usertype filter to Customer
Title: “Frequency of Customer Trips by Time of Day”
Drag Worksheet 5 to left
Drag Worksheet 6 to right
Add bottom right caption “Data From: Q1 2019 and Q1 2020” to Worksheet 6
Columns: Day of Week
Rows: CNTD(Trip Id) [Rename to Frequency]
Filter: Usertype: Subscriber
Title: “Frequency of Subscriber Trips by Day of Week”
Duplicate Worksheet 7
Change Usertype filter to Customer
Title: “Frequency of Customer Trips by Day of Week”
Drag Worksheet 7 to top
Drag Worksheet 8 to bottom
Add bottom right caption “Data From: Q1 2019 and Q1 2020” to Worksheet 8
Columns: Time of Day
Rows: Day of Week
Filter: Usertype: Subscriber
Color: CNT(Trip Id): Blue-Teal
Title: “Subscriber Trip Frequency by Time of Day per Day of Week”
Duplicate Worksheet 9
Change Usertype filter to Subscriber
Change Color to Orange
Title: “Customer Trip Frequency by Time of Day per Day of Week”
Drag Worksheet 9 to left
Drag Worksheet 10 to right
Add bottom right caption “Data From: Q1 2019 and Q1 2020” to Worksheet 10
Hide legend
Columns: CNT(Trip Id) [Rename to Frequency]
Rows: From Station Name
Filter:
Usertype: Subscriber
From Station Name>Top/Bottom>By Field>Top 10, Field: Trip Id, Aggregation: Count
Title: “Subscriber Trip Frequency by From Station Name (Top 10)”
Duplicate Worksheet 11
Change filter to Customer
Color: Orange
Title: “Customer Trip Frequency by From Station Name (Top 10)”
Drag Worksheet 11 to top
Drag Worksheet 12 to bottom
Add bottom right caption “Data From: Q1 2019 and Q1 2020” to Worksheet 12
Columns: CNT(Trip Id) [Rename to Frequency]
Rows: to Station Name
Filter:
Usertype: Subscriber
To Station Name>Top/Bottom>By Field>Top 10, Field: Trip Id, Aggregation: Count
Title: “Subscriber Trip Frequency by To Station Name (Top 10)”
Duplicate Worksheet 13
Change filter to Customer
Color: Orange
Title: “Customer Trip Frequency by To Station Name (Top 10)”
Drag Worksheet 13 to top
Drag Worksheet 14 to bottom
Add bottom right caption “Data From: Q1 2019 and Q1 2020” to Worksheet 14
Columns: Time of Day
Rows: Day of Week
Filter:
Usertype: Subscriber
From Station Name>Top/Bottom>By Field>Top 10, Field: Trip Id, Aggregation: Count
Color: CNT(Trip Id): Blue-Teal
Title: “Subscriber Trip Frequency by Time of Day per Day of Week (Top 10 From Stations)”
Duplicate Worksheet 15
Change Usertype filter to Customer
Color: Orange
Title: “Customer Trip Frequency by Time of Day per Day of Week (Top 10 From Stations)”
Columns: Time of Day
Rows: Day of Week
Filter:
Usertype: Subscriber
To Station Name>Top/Bottom>By Field>Top 10, Field: Trip Id, Aggregation: Count
Color: CNT(Trip Id): Blue-Teal
Title: “Subscriber Trip Frequency by Time of Day per Day of Week (Top 10 To Stations)”
Duplicate Worksheet 17
Change Usertype filter to Customer
Color: Orange
Title: “Customer Trip Frequency by Time of Day per Day of Week (Top 10 To Stations)”
Drag Worksheet 15 to top left
Drag Worksheet 16 to top right
Drag Worksheet 17 to bottom left
Drag Worksheet 18 to bottom right
Add bottom right caption “Data From: Q1 2019 and Q1 2020” to Worksheet 18
Hide legend
The Tableau dashboards revealed distinct ridership patterns between subscribers and customers. Subscribers showed the highest trip frequency during weekday mornings and afternoons, aligning with common commuting hours, while their weekend usage was relatively low. In contrast, customers had the greatest activity during weekend afternoons, with significantly fewer rides on weekdays. Heatmaps by time of day and day of week reinforced these trends, and station analysis showed that subscribers’ trips clustered around transit-accessible downtown stations, whereas customers more often started and ended rides near popular leisure destinations. Together, these findings highlight a clear work-oriented usage among subscribers and leisure-oriented behavior among customers.
You can view my full presentation here:
Work
vs. Leisure