2017 Yearly Analysis

The 2015 data gives us a benchmark on which to base our analysis, as 2015 was the company's first year in business.

Preparing Data for Analysis:

library("readxl")
library("ggplot2")
library("dygraphs")
# Load the four quarterly 2017 Divvy trip workbooks and stack them into one
# table, `Trips`, which the rest of the analysis reads.
# NOTE(review): the working directory is hard-coded to a local Windows path;
# it is kept because later code may rely on it, but a project-relative path
# would make the report portable.
setwd("D:/Case Study")

# Paths to the Q1-Q4 workbooks under the working directory.
paths2017 <- file.path(
  getwd(), "Yearly_Data", "2017", "Quarterly",
  paste0("Divvy_Trips_2017_Q", 1:4, ".xlsx")
)
AllPaths <- c(paths2017)

# Fail fast, naming every missing workbook, before any large file is read.
missing_paths <- AllPaths[!file.exists(AllPaths)]
if (length(missing_paths) > 0) {
  stop(
    "Missing trip workbook(s): ", paste(missing_paths, collapse = ", "),
    call. = FALSE
  )
}

# Read each workbook once and bind them in a single step; growing the table
# with rbind() inside a loop re-copies all accumulated rows on every pass.
# The first eleven column types are guessed by readxl; the twelfth column is
# forced to numeric.
Trips <- do.call(
  rbind,
  lapply(
    AllPaths,
    function(path) {
      read_excel(path, col_types = c(rep("guess", 11), "numeric"))
    }
  )
)

Summary of Data:

# Per-column summary statistics for the combined trip table.
summary(Trips)
##     trip_id          start_time          end_time             bikeid    
##  Min.   :12979230   Length:3198080     Length:3198080     Min.   :   1  
##  1st Qu.:14052951   Class :character   Class :character   1st Qu.:1817  
##  Median :15643448   Mode  :character   Mode  :character   Median :3605  
##  Mean   :15326739                                         Mean   :3451  
##  3rd Qu.:16580314                                         3rd Qu.:5103  
##  Max.   :17536701                                         Max.   :6471  
##                                                                         
##   tripduration     from_station_id from_station_name  to_station_id  
##  Min.   :   60.0   Min.   :  2.0   Length:3198080     Min.   :  2.0  
##  1st Qu.:  392.0   1st Qu.: 74.0   Class :character   1st Qu.: 74.0  
##  Median :  664.0   Median :156.0   Mode  :character   Median :156.0  
##  Mean   :  926.2   Mean   :180.6                      Mean   :180.7  
##  3rd Qu.: 1113.0   3rd Qu.:268.0                      3rd Qu.:268.0  
##  Max.   :86338.0   Max.   :626.0                      Max.   :626.0  
##                                                                      
##  to_station_name      usertype            gender            birthyear     
##  Length:3198080     Length:3198080     Length:3198080     Min.   :1899    
##  Class :character   Class :character   Class :character   1st Qu.:1976    
##  Mode  :character   Mode  :character   Mode  :character   Median :1985    
##                                                           Mean   :1982    
##                                                           3rd Qu.:1990    
##                                                           Max.   :2017    
##                                                           NA's   :639854

Analysis Metrics:

  • Customer-Sub Ratio
  • Gender Demographic
  • Trip Duration

Customer-Sub Ratio:

# Share of rides by user type, drawn as a pie chart.
# Any non-missing usertype other than "Subscriber" is counted as a customer;
# rows with a missing usertype are left out of both counts.
UserTypeCol <- Trips$usertype

# Vectorized comparison instead of a row-by-row loop over every trip.
is_subscriber <- UserTypeCol == "Subscriber"
Subs <- sum(is_subscriber, na.rm = TRUE)
Customers <- sum(!is_subscriber, na.rm = TRUE)
total_rides <- Subs + Customers

pie(
  c(Subs, Customers),
  labels = c(
    paste("Subscribers = ", round(Subs * 100 / total_rides, 2), "%"),
    paste("Customers = ", round(Customers * 100 / total_rides, 2), "%")
  )
)

Gender Demographic:

# Bar chart of trip counts per gender, with bars filled by user type.
ggplot(Trips, aes(x = gender, fill = usertype)) +
  geom_bar(stat = "count")

Trip Duration:

# Smoothed trend of trip duration plotted against trip id; the smoothing
# method is left to geom_smooth()'s default choice.
ggplot(Trips, mapping = aes(x = trip_id, y = tripduration)) +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'