# Analysis of Bike Trips in 2019

## Setup
Install the one missing package (`chart_studio`) and import all libraries.

In [1]:
!pip install chart_studio

Collecting chart_studio
[?25l  Downloading https://files.pythonhosted.org/packages/b9/3f/d2f3f506ba1aaf109f549f8b01d1483cd3e324c5ebe6b206acee66efdf46/chart_studio-1.0.0-py3-none-any.whl (76kB)
[K    100% |████████████████████████████████| 81kB 18.3MB/s ta 0:00:01
Installing collected packages: chart-studio
Successfully installed chart-studio-1.0.0
[33mYou are using pip version 10.0.1, however version 19.3.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
import numpy as np
import pandas as pd
import os
import chart_studio.plotly as py
import plotly.graph_objects as go
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

## Download
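Download the monthly trip data archives for January–October 2019 (the months covered by the `[01-10]` range in the URL), unzip them, and delete the archives.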

In [3]:
# Fetch one archive per month: curl expands the `[01-10]` range in the URL into
# 201901 ... 201910, and -O saves each download under its remote file name.
!curl -O "https://s3.amazonaws.com/tripdata/2019[01-10]-citibike-tripdata.csv.zip"
# Extract every archive in the working directory; -o overwrites existing files
# without prompting, so the cell can be re-run.
!unzip -o "*.zip"
# Delete the archives once extracted. NOTE: this removes every .zip in the
# working directory, not only the ones downloaded above.
!rm *.zip


[1/10]: https://s3.amazonaws.com/tripdata/201901-citibike-tripdata.csv.zip --> 201901-citibike-tripdata.csv.zip
--_curl_--https://s3.amazonaws.com/tripdata/201901-citibike-tripdata.csv.zip
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 39.5M  100 39.5M    0     0  34.9M      0  0:00:01  0:00:01 --:--:-- 34.9M

[2/10]: https://s3.amazonaws.com/tripdata/201902-citibike-tripdata.csv.zip --> 201902-citibike-tripdata.csv.zip
--_curl_--https://s3.amazonaws.com/tripdata/201902-citibike-tripdata.csv.zip
100 38.5M  100 38.5M    0     0  42.5M      0 --:--:-- --:--:-- --:--:-- 60.5M

[3/10]: https://s3.amazonaws.com/tripdata/201903-citibike-tripdata.csv.zip --> 201903-citibike-tripdata.csv.zip
--_curl_--https://s3.amazonaws.com/tripdata/201903-citibike-tripdata.csv.zip
100 54.5M  100 54.5M    0     0  52.5M      0  0:00:01  0:00:01 --:--:-- 52.1M

[4/10]: https://s3.amazonaws.com/t

## Create DataFrame
Read 2019 trip data into a DataFrame.

In [4]:
# List the CSV files in the working directory in pure Python rather than
# capturing `!ls *.csv`: that form needs a POSIX shell and, when nothing
# matches, can yield the shell's error text instead of an empty list.
# Hidden files are skipped to mirror what the shell glob would match.
files = sorted(f for f in os.listdir('.')
               if f.endswith('.csv') and not f.startswith('.'))
if not files:
    raise FileNotFoundError('No .csv files found in the working directory; '
                            'run the download cell first.')

# Read one frame per file and stack them; `keys=files` adds the source file
# name as the outer level of the resulting index.
df = pd.concat([pd.read_csv(f, header=0, skip_blank_lines=True) for f in files], keys=files)
df.head()

Unnamed: 0,Unnamed: 1,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
201901-citibike-tripdata.csv,0,320,2019-01-01 00:01:47.4010,2019-01-01 00:07:07.5810,3160.0,Central Park West & W 76 St,40.778968,-73.973747,3283.0,W 89 St & Columbus Ave,40.788221,-73.970416,15839,Subscriber,1971,1
201901-citibike-tripdata.csv,1,316,2019-01-01 00:04:43.7360,2019-01-01 00:10:00.6080,519.0,Pershing Square North,40.751873,-73.977706,518.0,E 39 St & 2 Ave,40.747804,-73.973442,32723,Subscriber,1964,1
201901-citibike-tripdata.csv,2,591,2019-01-01 00:06:03.9970,2019-01-01 00:15:55.4380,3171.0,Amsterdam Ave & W 82 St,40.785247,-73.976673,3154.0,E 77 St & 3 Ave,40.773142,-73.958562,27451,Subscriber,1987,1
201901-citibike-tripdata.csv,3,2719,2019-01-01 00:07:03.5450,2019-01-01 00:52:22.6500,504.0,1 Ave & E 16 St,40.732219,-73.981656,3709.0,W 15 St & 6 Ave,40.738046,-73.99643,21579,Subscriber,1990,1
201901-citibike-tripdata.csv,4,303,2019-01-01 00:07:35.9450,2019-01-01 00:12:39.5020,229.0,Great Jones St,40.727434,-73.99379,503.0,E 20 St & Park Ave,40.738274,-73.98752,35379,Subscriber,1979,1


In [5]:
# Summary statistics for the numeric columns. In the recorded output, note the
# extremes: `birth year` has a minimum of 1857 and `tripduration` a maximum of
# 3,812,666 against a median of 626, so both columns contain outliers.
df.describe()

Unnamed: 0,tripduration,start station id,start station latitude,start station longitude,end station id,end station latitude,end station longitude,bikeid,birth year,gender
count,18117780.0,18117600.0,18117780.0,18117780.0,18117600.0,18117780.0,18117780.0,18117780.0,18117780.0,18117780.0
mean,997.5303,1663.64,40.73724,-73.98201,1655.954,40.73693,-73.98221,29318.36,1980.243,1.162605
std,10709.81,1475.535,0.0304923,0.01977536,1475.292,0.03037217,0.0198618,7402.559,12.08459,0.5421212
min,61.0,72.0,40.6554,-74.02535,72.0,40.6554,-74.08364,14529.0,1857.0,0.0
25%,367.0,382.0,40.71755,-73.9953,380.0,40.71749,-73.99595,25165.0,1970.0,1.0
50%,626.0,514.0,40.73782,-73.98518,513.0,40.73705,-73.98565,30677.0,1983.0,1.0
75%,1101.0,3295.0,40.75725,-73.97121,3295.0,40.75715,-73.97137,34407.0,1990.0,1.0
max,3812666.0,3891.0,40.869,-73.878,3891.0,40.869,-73.878,42068.0,2003.0,2.0


## Find Most Popular Destinations
Group by `end station name` and plot the 30 stations where the most trips end.

In [13]:
# Count trips per destination station, then keep the 30 largest counts as a
# two-column frame: `end station name`, `trips`.
trips_per_end_station = df.groupby(['end station name']).size()
df_end_station = trips_per_end_station.nlargest(30).reset_index(name='trips')

# Bar chart: station names along x, trip counts along y.
bar_end_station = go.Bar(
    x=df_end_station['end station name'],
    y=df_end_station['trips'],
)
layout_end_station = go.Layout(title='Most Popular Destinations')
fig_end_station = go.Figure(
    data=[bar_end_station],
    layout=layout_end_station,
)
iplot(fig_end_station)

## Find Most Popular Starts
Group by `start station name` and plot the 30 stations where the most trips start.

In [14]:
# Count trips per origin station, then keep the 30 largest counts as a
# two-column frame: `start station name`, `trips`.
trips_per_start_station = df.groupby(['start station name']).size()
df_start_station = trips_per_start_station.nlargest(30).reset_index(name='trips')

# Bar chart: station names along x, trip counts along y.
bar_start_station = go.Bar(
    x=df_start_station['start station name'],
    y=df_start_station['trips'],
)
layout_start_station = go.Layout(title='Most Popular Starts')
fig_start_station = go.Figure(
    data=[bar_start_station],
    layout=layout_start_station,
)
iplot(fig_start_station)

## Create Bins for Part of Day
Label each trip with a `part of day` bin, derived from the hour it started, to see whether certain times of day have more rides.

In [8]:
def bin_part_of_day(hour):
    """Return the part-of-day label for an hour of the day.

    Hours below 2 are labelled 'night'; the hours from 2 up to (but not
    including) 22 fall into five four-hour bins; anything that matches no
    bin (e.g. 22 and later) is labelled 'night' as well.
    """
    # (exclusive upper bound, label) pairs, checked in ascending order.
    upper_bounds = (
        (2, 'night'),
        (6, 'before dawn'),
        (10, 'morning'),
        (14, 'midday'),
        (18, 'afternoon'),
        (22, 'evening'),
    )
    for upper, label in upper_bounds:
        if hour < upper:
            return label
    # No bound matched, so this is the late-night tail of the day.
    return 'night'
    
# Convert `starttime` to datetime with a single vectorized call. The previous
# element-wise `.apply(pd.to_datetime)` parsed every value separately, which is
# needlessly slow on a frame of this size (about 18 million rows).
df['starttime'] = pd.to_datetime(df['starttime'])
# Label each trip by the hour it started; `.dt.hour` extracts the hour for the
# whole column at once, so only the binning itself runs per row.
df['part of day'] = df['starttime'].dt.hour.apply(bin_part_of_day)
df.head()

Unnamed: 0,Unnamed: 1,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender,part of day
201901-citibike-tripdata.csv,0,320,2019-01-01 00:01:47.401,2019-01-01 00:07:07.5810,3160.0,Central Park West & W 76 St,40.778968,-73.973747,3283.0,W 89 St & Columbus Ave,40.788221,-73.970416,15839,Subscriber,1971,1,night
201901-citibike-tripdata.csv,1,316,2019-01-01 00:04:43.736,2019-01-01 00:10:00.6080,519.0,Pershing Square North,40.751873,-73.977706,518.0,E 39 St & 2 Ave,40.747804,-73.973442,32723,Subscriber,1964,1,night
201901-citibike-tripdata.csv,2,591,2019-01-01 00:06:03.997,2019-01-01 00:15:55.4380,3171.0,Amsterdam Ave & W 82 St,40.785247,-73.976673,3154.0,E 77 St & 3 Ave,40.773142,-73.958562,27451,Subscriber,1987,1,night
201901-citibike-tripdata.csv,3,2719,2019-01-01 00:07:03.545,2019-01-01 00:52:22.6500,504.0,1 Ave & E 16 St,40.732219,-73.981656,3709.0,W 15 St & 6 Ave,40.738046,-73.99643,21579,Subscriber,1990,1,night
201901-citibike-tripdata.csv,4,303,2019-01-01 00:07:35.945,2019-01-01 00:12:39.5020,229.0,Great Jones St,40.727434,-73.99379,503.0,E 20 St & Park Ave,40.738274,-73.98752,35379,Subscriber,1979,1,night


In [17]:
# Trips per part-of-day label.
df_partofday = df.groupby(['part of day']).size().reset_index(name='trips')
bar_partofday = go.Bar(x=df_partofday['part of day'],
                       y=df_partofday['trips'])
# groupby returns the labels in alphabetical order, which scrambles the time
# axis (afternoon, before dawn, evening, ...). Pin the x-axis categories to
# chronological order so the chart reads left to right through the day.
part_of_day_order = ['before dawn', 'morning', 'midday',
                     'afternoon', 'evening', 'night']
layout_partofday = go.Layout(title='Rides by Part of Day',
                             xaxis=dict(categoryorder='array',
                                        categoryarray=part_of_day_order))
fig_partofday = go.Figure(data=[bar_partofday],
                          layout=layout_partofday)
iplot(fig_partofday)

## Plot by weekend
Plot to see whether rides by part of day differ between weekdays and weekends.

In [10]:
# True for Monday-Friday (dayofweek 0-4), False for Saturday and Sunday.
# Computed on the whole column at once instead of one Python call per row.
df['weekday'] = df['starttime'].dt.dayofweek < 5
df_weekday_partofday = df.groupby(['weekday','part of day']).size().reset_index(name='trips')

# Split the aggregated counts into weekday rows and weekend rows.
is_weekday = df_weekday_partofday['weekday']
df_wdyonly = df_weekday_partofday[is_weekday]
df_wkdonly = df_weekday_partofday[~is_weekday]

# groupby orders the labels alphabetically; pin both x-axes to chronological
# order so the two charts read left to right through the day and can be
# compared bar for bar.
part_of_day_order = ['before dawn', 'morning', 'midday',
                     'afternoon', 'evening', 'night']
xaxis_by_time = dict(categoryorder='array', categoryarray=part_of_day_order)

bar_df_wdyonly = go.Bar(x=df_wdyonly['part of day'],
                        y=df_wdyonly['trips'])
layout_wdyonly = go.Layout(title='Weekday Rides by Part of Day',
                           xaxis=xaxis_by_time)
fig_wdyonly = go.Figure(data=[bar_df_wdyonly],
                        layout=layout_wdyonly)

bar_df_wkdonly = go.Bar(x=df_wkdonly['part of day'],
                        y=df_wkdonly['trips'])
layout_wkdonly = go.Layout(title='Weekend Rides by Part of Day',
                           xaxis=xaxis_by_time)
fig_wkdonly = go.Figure(data=[bar_df_wkdonly],
                        layout=layout_wkdonly)

iplot(fig_wdyonly)
iplot(fig_wkdonly)

## Find Most Popular Month by Rides
Chart rides by month.

In [11]:
# Derive the month number and its abbreviated name with the vectorized `.dt`
# accessor instead of one Python call per row; unlike the per-row
# `x.strftime(...)`, `.dt.strftime` also does not raise on missing timestamps.
df['month of year'] = df['starttime'].dt.month
df['month name'] = df['starttime'].dt.strftime('%b')
# Group on the month number first so the rows (and therefore the bars) come
# out in calendar order rather than alphabetically by month name.
df_monthofyr = df.groupby(['month of year','month name']).size().reset_index(name='trips')
bar_monthofyr = go.Bar(x=df_monthofyr['month name'],
                       y=df_monthofyr['trips'])
layout_monthofyr = go.Layout(title='Rides by Month')
fig_monthofyr = go.Figure(data=[bar_monthofyr],
                          layout=layout_monthofyr)
iplot(fig_monthofyr)