# Uncomment the code given below, and run the line of code to install featuretools library

!pip install featuretools==0.27.0

Requirement already satisfied: featuretools==0.27.0 in /usr/local/lib/python3.10/dist-packages (0.27.0)
Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.10/dist-packages (from featuretools==0.27.0) (1.11.4)
Requirement already satisfied: numpy>=1.16.6 in /usr/local/lib/python3.10/dist-packages (from featuretools==0.27.0) (1.25.2)
Requirement already satisfied: pandas<2.0.0,>=1.2.0 in /usr/local/lib/python3.10/dist-packages (from featuretools==0.27.0) (1.5.3)
Requirement already satisfied: tqdm>=4.32.0 in /usr/local/lib/python3.10/dist-packages (from featuretools==0.27.0) (4.66.4)
Requirement already satisfied: pyyaml>=5.4 in /usr/local/lib/python3.10/dist-packages (from featuretools==0.27.0) (6.0.1)
Requirement already satisfied: cloudpickle>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from featuretools==0.27.0) (2.2.1)
Requirement already satisfied: distributed>=2.12.0 in /usr/local/lib/python3.10/dist-packages (from featuretools==0.27.0) (2023.8.1)
Requirement already satisfied: dask[dataframe]>=2.12.0 in /usr/local/lib/python3.10/dist-packages (from featuretools==0.27.0) (2023.8.1)
Requirement already satisfied: psutil>=5.6.6 in /usr/local/lib/python3.10/dist-packages (from featuretools==0.27.0) (5.9.5)
Requirement already satisfied: click>=7.0.0 in /usr/local/lib/python3.10/dist-packages (from featuretools==0.27.0) (8.1.7)
Requirement already satisfied: fsspec>=2021.09.0 in /usr/local/lib/python3.10/dist-packages (from dask[dataframe]>=2.12.0->featuretools==0.27.0) (2023.6.0)
Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from dask[dataframe]>=2.12.0->featuretools==0.27.0) (24.0)
Requirement already satisfied: partd>=1.2.0 in /usr/local/lib/python3.10/dist-packages (from dask[dataframe]>=2.12.0->featuretools==0.27.0) (1.4.2)
Requirement already satisfied: toolz>=0.10.0 in /usr/local/lib/python3.10/dist-packages (from dask[dataframe]>=2.12.0->featuretools==0.27.0) (0.12.1)
Requirement already satisfied: importlib-metadata>=4.13.0 in /usr/local/lib/python3.10/dist-packages (from dask[dataframe]>=2.12.0->featuretools==0.27.0) (7.1.0)
Requirement already satisfied: jinja2>=2.10.3 in /usr/local/lib/python3.10/dist-packages (from distributed>=2.12.0->featuretools==0.27.0) (3.1.4)
Requirement already satisfied: locket>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from distributed>=2.12.0->featuretools==0.27.0) (1.0.0)
Requirement already satisfied: msgpack>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from distributed>=2.12.0->featuretools==0.27.0) (1.0.8)
Requirement already satisfied: sortedcontainers>=2.0.5 in /usr/local/lib/python3.10/dist-packages (from distributed>=2.12.0->featuretools==0.27.0) (2.4.0)
Requirement already satisfied: tblib>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from distributed>=2.12.0->featuretools==0.27.0) (3.0.0)
Requirement already satisfied: tornado>=6.0.4 in /usr/local/lib/python3.10/dist-packages (from distributed>=2.12.0->featuretools==0.27.0) (6.3.3)
Requirement already satisfied: urllib3>=1.24.3 in /usr/local/lib/python3.10/dist-packages (from distributed>=2.12.0->featuretools==0.27.0) (2.0.7)
Requirement already satisfied: zict>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from distributed>=2.12.0->featuretools==0.27.0) (3.0.0)
Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas<2.0.0,>=1.2.0->featuretools==0.27.0) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas<2.0.0,>=1.2.0->featuretools==0.27.0) (2023.4)
Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.10/dist-packages (from importlib-metadata>=4.13.0->dask[dataframe]>=2.12.0->featuretools==0.27.0) (3.18.1)
Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2>=2.10.3->distributed>=2.12.0->featuretools==0.27.0) (2.1.5)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas<2.0.0,>=1.2.0->featuretools==0.27.0) (1.16.0)

# Basic libraries of python for numeric and dataframe computations
import numpy as np
import pandas as pd

# Basic library for data visualization
import matplotlib.pyplot as plt

# Slightly advanced library for data visualization
import seaborn as sns

# Featuretools for automated feature engineering
import featuretools as ft

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# Importing the evaluation metric and the regressors (linear, random forest, decision tree) used to make predictions
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

# Importing primitives
from featuretools.primitives import (Minute, Hour, Day, Month,
                                     Weekday, IsWeekend, Count, Sum, Mean, Median, Std, Min, Max)

# Used to ignore the warning given as output of the code
import warnings
warnings.filterwarnings("ignore")

print(ft.__version__)
%load_ext autoreload
%autoreload 2

0.27.0

# To preview the n rows (five by default) that have the fewest missing values.
def preview(df, n=5):
    """Return the ``n`` rows of ``df`` that contain the fewest null values.

    Rows are ranked by their per-row null count in ascending order; the
    first ``n`` labels of that ranking are then looked up in ``df``.
    """
    nulls_per_row = df.isnull().sum(axis=1)
    best_labels = nulls_per_row.sort_values().head(n).index
    return df.loc[best_labels]

# To see the feature importance of variables in the final model
def feature_importances(model, feature_names, n=10):
    """Print the ``n`` highest-scoring entries of ``model.feature_importances_``.

    ``feature_names`` is paired positionally with the importances, the pairs
    are ordered by descending importance, and one numbered line is printed
    per pair (rank, name, importance rounded to three decimals).
    """
    ranked = sorted(zip(feature_names, model.feature_importances_),
                    key=lambda pair: -pair[1])
    for rank, (name, score) in enumerate(ranked[:n], start=1):
        print("%d: Feature: %s, %.3f" % (rank, name, score))

# To generate train and test dataset
def get_train_test_fm(feature_matrix, percentage, target='trip_duration'):
    """Split a feature matrix into imputed train/test features and labels.

    The split is positional, not random: the first
    ``int(nrows * percentage)`` rows become the training set and the
    remaining rows become the test set.  Missing feature values are filled
    by a default-configured ``SimpleImputer`` that is fitted on the training
    rows only and then applied unchanged to the test rows.

    Args:
        feature_matrix: DataFrame holding the feature columns plus the
            label column named by ``target``.
        percentage: Fraction of the rows (taken from the top) used for
            training.
        target: Name of the label column.  Defaults to ``'trip_duration'``,
            the column this function previously hard-coded.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)``.  The ``X_*`` values
        are whatever the imputer returns; the ``y_*`` values are the
        ``target`` column of the corresponding rows.
    """
    nrows = feature_matrix.shape[0]
    n_train = int(nrows * percentage)
    n_test = nrows - n_train

    train_rows = feature_matrix.head(n_train)
    test_rows = feature_matrix.tail(n_test)

    y_train = train_rows[target]
    y_test = test_rows[target]

    # Fit the imputer on the training rows only so that no statistic of the
    # test rows influences the values used to fill the gaps.
    imp = SimpleImputer()
    X_train = imp.fit_transform(train_rows.drop([target], axis=1))
    X_test = imp.transform(test_rows.drop([target], axis=1))

    return (X_train, y_train, X_test, y_test)

def column_string(n):
    """Convert a positive integer into spreadsheet-style column letters.

    1 -> "A", 26 -> "Z", 27 -> "AA", and so on.  Values of ``n`` that are
    zero or negative yield an empty string because the loop never runs.
    """
    letters = []
    while n > 0:
        # Subtract one first so that 26 maps to "Z" instead of rolling over.
        n, offset = divmod(n - 1, 26)
        letters.append(chr(ord("A") + offset))
    # Letters were produced least-significant first.
    return "".join(reversed(letters))

# To compute features using automated feature engineering.
def compute_features(features, cutoff_time):
    """Calculate and encode the feature matrix for a list of feature definitions.

    Relies on the module-level ``entities`` and ``relationships`` objects,
    which must be defined before this function is called.

    Args:
        features: List of feature definitions (as produced by ``ft.dfs`` with
            ``features_only=True``).  The list is not modified.
        cutoff_time: Passed through as ``cutoff_time`` to
            ``ft.calculate_feature_matrix``.

    Returns:
        The feature matrix after ``ft.encode_features`` has encoded
        ``pickup_neighborhood`` and ``dropoff_neighborhood``.
    """
    # Shuffle so we don't see encoded features in the front or back.
    # Work on a copy so the caller's list keeps its original order.
    shuffled = list(features)
    np.random.shuffle(shuffled)
    feature_matrix = ft.calculate_feature_matrix(shuffled,
                                                 cutoff_time=cutoff_time,
                                                 approximate='36d',
                                                 verbose=True,
                                                 entities=entities,
                                                 relationships=relationships)
    print("Finishing computing...")
    # encode_features also returns the encoded feature definitions; only the
    # matrix is needed here.
    feature_matrix, _ = ft.encode_features(feature_matrix, shuffled,
                                           to_encode=["pickup_neighborhood", "dropoff_neighborhood"],
                                           include_unknown=False)
    return feature_matrix

# Google Colab only: mount the user's Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive

# If you are using Google Colab then while reading the files using 'pd.read_csv()', replace the location of CSV files with the exact location of the files in your drive folder.
# Trip records: both timestamp columns are parsed as datetimes while reading.
# NOTE(review): the trips.info() listing further down shows no 'vendor_id'
# column - confirm whether that dtype entry is still needed.
trips = pd.read_csv(
    'BUS_SERVICE.csv',
    encoding='utf-8',
    parse_dates=["pickup_datetime", "dropoff_datetime"],
    dtype={'vendor_id': "category", 'passenger_count': 'int64'},
)

# Store every payment_type value as a string.
trips["payment_type"] = trips["payment_type"].map(str)

# Discard rows that have no trip_duration.
trips = trips.dropna(subset=['trip_duration'])

# Neighborhood lookup tables for the pickup and drop-off side.
pickup_neighborhoods = pd.read_csv("pickup_neighborhoods.csv", encoding='utf-8')
dropoff_neighborhoods = pd.read_csv("dropoff_neighborhoods.csv", encoding='utf-8')

# Show the ten trips with the fewest missing values.
preview(trips, 10)

# Drop the rows where at least one element is missing.
trips = trips.dropna()

# Remove the drop-off timestamp column.
trips = trips.drop(columns="dropoff_datetime")

trips.head()

# Summarize column dtypes and non-null counts of the cleaned dataset
trips.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1452 entries, 0 to 1499
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   id                    1452 non-null   int64         
 1   Transport_service     1452 non-null   int64         
 2   pickup_datetime       1452 non-null   datetime64[ns]
 3   passenger_count       1452 non-null   int64         
 4   trip_distance         1452 non-null   float64       
 5   pickup_longitude      1452 non-null   float64       
 6   pickup_latitude       1452 non-null   float64       
 7   dropoff_longitude     1452 non-null   float64       
 8   dropoff_latitude      1452 non-null   float64       
 9   payment_type          1452 non-null   object        
 10  trip_duration         1452 non-null   int64         
 11  pickup_neighborhood   1452 non-null   object        
 12  dropoff_neighborhood  1452 non-null   object        
dtypes: datetime64[ns](1), float64(5), int64(4), object(3)
memory usage: 158.8+ KB

# Check the number of unique values in each column
trips.nunique()

id                      1452
Transport_service          4
pickup_datetime          259
passenger_count            4
trip_distance            531
pickup_longitude        1338
pickup_latitude         1418
dropoff_longitude       1331
dropoff_latitude        1416
payment_type               3
trip_duration            969
pickup_neighborhood       47
dropoff_neighborhood      49
dtype: int64

# Checking the descriptive stats of the data (transposed: one row per column)
trips.describe().T

# Checking the rows where trip distance is 0
trips[trips['trip_distance']==0]

# Replace zero trip distances with the median of the trip_distance column.
# NOTE(review): the median is computed while the zero rows are still part of
# the column - confirm that this is the intended replacement value.
trips['trip_distance']=trips['trip_distance'].replace(0,trips['trip_distance'].median())

# Sanity check: per-column count of rows that still have a zero trip distance.
trips[trips['trip_distance']==0].count()

id                      0
Transport_service       0
pickup_datetime         0
passenger_count         0
trip_distance           0
pickup_longitude        0
pickup_latitude         0
dropoff_longitude       0
dropoff_latitude        0
payment_type            0
trip_duration           0
pickup_neighborhood     0
dropoff_neighborhood    0
dtype: int64

# Rows whose trip duration is 0.
trips[trips['trip_duration']==0]

# Histogram grid produced by DataFrame.hist for the trips table.
trips.hist(figsize=(12,12))
plt.show()

# Box plot of trip_distance.
sns.boxplot(x = trips['trip_distance'])
plt.show()

# Number of trips per passenger_count value (bars colored by the same column).
plt.figure(figsize=(10,5))
sns.countplot(x='passenger_count', hue='passenger_count', data=trips, palette="bright")
plt.show()

# Share of trips per passenger_count value (fractions instead of counts).
trips.passenger_count.value_counts(normalize=True)

2    0.263085
4    0.251377
3    0.244490
1    0.241047
Name: passenger_count, dtype: float64

trips.pickup_neighborhood.value_counts().sort_values(ascending=False).plot(kind='bar' ,figsize=(20,8))

<Axes: >

trips.dropoff_neighborhood.value_counts().sort_values(ascending=False).plot(kind='bar' ,figsize=(20,8))

<Axes: >

sns.scatterplot(x = trips['trip_distance'], y = trips['trip_duration'])

<Axes: xlabel='trip_distance', ylabel='trip_duration'>

sns.countplot(x = trips['passenger_count'], hue=trips['payment_type'])

<Axes: xlabel='passenger_count', ylabel='count'>

# Entity definitions handed to featuretools. Each value starts with the
# DataFrame followed by its index column; the "trips" entry additionally names
# 'pickup_datetime' (presumably used as the time index - confirm against the
# featuretools 0.27 documentation).
entities = {
        "trips": (trips, "id", 'pickup_datetime' ),
        "pickup_neighborhoods": (pickup_neighborhoods, "neighborhood_id"),
        "dropoff_neighborhoods": (dropoff_neighborhoods, "neighborhood_id"),
        }

# Links each neighborhood table's neighborhood_id to the matching column of
# trips (pickup_neighborhood / dropoff_neighborhood).
relationships = [("pickup_neighborhoods", "neighborhood_id", "trips", "pickup_neighborhood"),
                 ("dropoff_neighborhoods", "neighborhood_id", "trips", "dropoff_neighborhood")]

# One cutoff timestamp per trip: the trip id together with its pickup time.
cutoff_time = trips[['id', 'pickup_datetime']]
preview(cutoff_time, 10)

# Round 1: IsWeekend is the only transform primitive; no aggregation primitives.
trans_primitives = [IsWeekend]

# features_only=True: the call yields a list of feature definitions (printed
# below), not a computed matrix. The raw latitude/longitude columns of trips
# are excluded through ignore_variables.
features = ft.dfs(entities=entities,
                  relationships=relationships,
                  target_entity="trips",
                  trans_primitives=trans_primitives,
                  agg_primitives=[],
                  ignore_variables={"trips": ["pickup_latitude", "pickup_longitude",
                                              "dropoff_latitude", "dropoff_longitude"]},
                  features_only=True)

print ("Number of features: %d" % len(features))
features

Number of features: 12

[<Feature: Transport_service>,
 <Feature: passenger_count>,
 <Feature: trip_distance>,
 <Feature: payment_type>,
 <Feature: trip_duration>,
 <Feature: pickup_neighborhood>,
 <Feature: dropoff_neighborhood>,
 <Feature: IS_WEEKEND(pickup_datetime)>,
 <Feature: pickup_neighborhoods.latitude>,
 <Feature: pickup_neighborhoods.longitude>,
 <Feature: dropoff_neighborhoods.latitude>,
 <Feature: dropoff_neighborhoods.longitude>]

def compute_features(features, cutoff_time):
    """Calculate and encode the feature matrix for a list of feature definitions.

    Relies on the module-level ``entities`` and ``relationships`` objects,
    which must be defined before this function is called.

    Args:
        features: List of feature definitions (as produced by ``ft.dfs`` with
            ``features_only=True``).  The list is not modified.
        cutoff_time: Passed through as ``cutoff_time`` to
            ``ft.calculate_feature_matrix``.

    Returns:
        The feature matrix after ``ft.encode_features`` has encoded
        ``pickup_neighborhood`` and ``dropoff_neighborhood``.
    """
    # Shuffle so we don't see encoded features in the front or back.
    # Work on a copy so the caller's list keeps its original order.
    shuffled = list(features)
    np.random.shuffle(shuffled)
    feature_matrix = ft.calculate_feature_matrix(shuffled,
                                                 cutoff_time=cutoff_time,
                                                 approximate='36d',
                                                 verbose=True,
                                                 entities=entities,
                                                 relationships=relationships)
    print("Finishing computing...")
    # encode_features also returns the encoded feature definitions; only the
    # matrix is needed here.
    feature_matrix, _ = ft.encode_features(feature_matrix, shuffled,
                                           to_encode=["pickup_neighborhood", "dropoff_neighborhood"],
                                           include_unknown=False)
    return feature_matrix

feature_matrix1 = compute_features(features, cutoff_time)

Elapsed: 00:00 | Progress: 100%|██████████
Finishing computing...

preview(feature_matrix1, 5)

feature_matrix1.shape

(1452, 30)

# Distribution of the label after a square-root transform.
plt.hist(np.sqrt(trips['trip_duration']))
plt.show()

# Distribution of the label after a natural-log transform.
plt.hist(np.log(trips['trip_duration']))
plt.show()

# Split the feature matrix into train features, train labels, test features
# and test labels (first 75% of the rows for training, the rest for testing).
X_train, y_train, X_test, y_test = get_train_test_fm(feature_matrix1,.75)
# The models are trained and evaluated on the square root of trip_duration.
y_train = np.sqrt(y_train)
y_test = np.sqrt(y_test)

# RMSE
def rmse(predictions, targets):
    """Return the root of the mean squared difference between targets and predictions."""
    residuals = targets - predictions
    mean_squared = (residuals ** 2).mean()
    return np.sqrt(mean_squared)

# MAE
def mae(predictions, targets):
    """Return the mean of the absolute differences between targets and predictions."""
    residuals = targets - predictions
    absolute_errors = np.abs(residuals)
    return np.mean(absolute_errors)


# Model Performance on test and train data
def model_pref(model, x_train, x_test, y_train,y_test):
    """Print a two-row table of R-squared, RMSE and MAE for a fitted model.

    The first row ("Train") scores the model's predictions for ``x_train``
    against ``y_train`` (in-sample); the second row ("Test") scores the
    predictions for ``x_test`` against ``y_test``.  Nothing is returned.
    """
    splits = (("Train", x_train, y_train), ("Test", x_test, y_test))
    report = {"Data": [], "RSquared": [], "RMSE": [], "MAE": []}

    for label, inputs, observed in splits:
        predicted = model.predict(inputs)
        report["Data"].append(label)
        report["RSquared"].append(r2_score(observed, predicted))
        report["RMSE"].append(rmse(predicted, observed))
        report["MAE"].append(mae(predicted, observed))

    print(pd.DataFrame(report))

# Defining the model
lr1=LinearRegression()

# Fitting the model
lr1.fit(X_train,y_train)

LinearRegression()

LinearRegression()

model_pref(lr1, X_train, X_test,y_train,y_test)

    Data  RSquared      RMSE       MAE
0  Train  0.662819  5.246785  4.052007
1   Test  0.583352  5.583601  4.172844

dt = DecisionTreeRegressor()

dt.fit(X_train,y_train)

DecisionTreeRegressor()

DecisionTreeRegressor()

model_pref(dt, X_train, X_test,y_train,y_test)

    Data  RSquared      RMSE       MAE
0  Train  1.000000  0.000000  0.000000
1   Test  0.512846  6.037579  4.578264

dt_pruned = DecisionTreeRegressor(max_depth = 3)

dt_pruned.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=3)

DecisionTreeRegressor(max_depth=3)

model_pref(dt_pruned, X_train, X_test,y_train,y_test)

    Data  RSquared      RMSE       MAE
0  Train  0.748675  4.529805  3.404459
1   Test  0.681276  4.883566  3.780489

rf = RandomForestRegressor(n_estimators = 60, max_depth = 4)

rf.fit(X_train, y_train)

RandomForestRegressor(max_depth=4, n_estimators=60)

RandomForestRegressor(max_depth=4, n_estimators=60)

model_pref(rf, X_train, X_test, y_train, y_test)

    Data  RSquared      RMSE       MAE
0  Train  0.811360  3.924453  2.893268
1   Test  0.732499  4.473960  3.529435

# Round 2: add the date-part transform primitives (minute, hour, day, month,
# weekday) next to IsWeekend; still no aggregation primitives.
trans_primitives = [Minute, Hour, Day, Month, Weekday, IsWeekend]

# Same dfs call as in round 1 apart from the longer transform primitive list.
features = ft.dfs(entities=entities,
                  relationships=relationships,
                  target_entity="trips",
                  trans_primitives=trans_primitives,
                  agg_primitives=[],
                  ignore_variables={"trips": ["pickup_latitude", "pickup_longitude",
                                              "dropoff_latitude", "dropoff_longitude"]},
                  features_only=True)

print ("Number of features: %d" % len(features))
features

Number of features: 17

[<Feature: Transport_service>,
 <Feature: passenger_count>,
 <Feature: trip_distance>,
 <Feature: payment_type>,
 <Feature: trip_duration>,
 <Feature: pickup_neighborhood>,
 <Feature: dropoff_neighborhood>,
 <Feature: DAY(pickup_datetime)>,
 <Feature: HOUR(pickup_datetime)>,
 <Feature: IS_WEEKEND(pickup_datetime)>,
 <Feature: MINUTE(pickup_datetime)>,
 <Feature: MONTH(pickup_datetime)>,
 <Feature: WEEKDAY(pickup_datetime)>,
 <Feature: pickup_neighborhoods.latitude>,
 <Feature: pickup_neighborhoods.longitude>,
 <Feature: dropoff_neighborhoods.latitude>,
 <Feature: dropoff_neighborhoods.longitude>]

feature_matrix2 = compute_features(features, cutoff_time)

Elapsed: 00:00 | Progress: 100%|██████████
Finishing computing...

feature_matrix2.shape

(1452, 35)

feature_matrix2.head()

# Split the feature matrix into train features, train labels, test features
# and test labels (first 75% of the rows for training, the rest for testing).
X_train2, y_train2, X_test2, y_test2 = get_train_test_fm(feature_matrix2,.75)
# The models are trained and evaluated on the square root of trip_duration.
y_train2 = np.sqrt(y_train2)
y_test2 = np.sqrt(y_test2)

lr2=LinearRegression()

lr2.fit(X_train2,y_train2)

LinearRegression()

LinearRegression()

model_pref(lr2, X_train2, X_test2,y_train2,y_test2)

    Data  RSquared      RMSE       MAE
0  Train  0.663515  5.241371  4.060878
1   Test  0.602172  5.456041  4.065442

dt2=DecisionTreeRegressor()

dt2.fit(X_train2,y_train2)

DecisionTreeRegressor()

DecisionTreeRegressor()

model_pref(dt2, X_train2, X_test2,y_train2,y_test2)

    Data  RSquared      RMSE       MAE
0  Train  1.000000  0.000000  0.000000
1   Test  0.443529  6.452839  4.780904

dt_pruned2=DecisionTreeRegressor(max_depth=4)

dt_pruned2.fit(X_train2,y_train2)

DecisionTreeRegressor(max_depth=4)

DecisionTreeRegressor(max_depth=4)

model_pref(dt_pruned2, X_train2, X_test2,y_train2,y_test2)

    Data  RSquared      RMSE       MAE
0  Train  0.782259  4.216308  3.142310
1   Test  0.707492  4.678412  3.642763

rf2=RandomForestRegressor(n_estimators=60,max_depth=4)

rf2.fit(X_train2,y_train2)

RandomForestRegressor(max_depth=4, n_estimators=60)

RandomForestRegressor(max_depth=4, n_estimators=60)

model_pref(rf2, X_train2, X_test2,y_train2,y_test2)

    Data  RSquared      RMSE       MAE
0  Train  0.812512  3.912451  2.905518
1   Test  0.743528  4.380765  3.457879

# Round 3: same transform primitives as round 2, plus aggregation primitives
# that are applied to the trips of each pickup / drop-off neighborhood.
trans_primitives = [Minute, Hour, Day, Month, Weekday, IsWeekend]
aggregation_primitives = [Count, Sum, Mean, Median, Std, Max, Min]

# NOTE(review): the resulting feature list contains aggregates of
# trips.trip_duration, i.e. of the label itself. Whether the cutoff times
# keep not-yet-finished trips out of those aggregates should be confirmed.
features = ft.dfs(entities=entities,
                  relationships=relationships,
                  target_entity="trips",
                  trans_primitives=trans_primitives,
                  agg_primitives=aggregation_primitives,
                  ignore_variables={"trips": ["pickup_latitude", "pickup_longitude",
                                              "dropoff_latitude", "dropoff_longitude"]},
                  features_only=True)

print ("Number of features: %d" % len(features))
features

Number of features: 67

[<Feature: Transport_service>,
 <Feature: passenger_count>,
 <Feature: trip_distance>,
 <Feature: payment_type>,
 <Feature: trip_duration>,
 <Feature: pickup_neighborhood>,
 <Feature: dropoff_neighborhood>,
 <Feature: DAY(pickup_datetime)>,
 <Feature: HOUR(pickup_datetime)>,
 <Feature: IS_WEEKEND(pickup_datetime)>,
 <Feature: MINUTE(pickup_datetime)>,
 <Feature: MONTH(pickup_datetime)>,
 <Feature: WEEKDAY(pickup_datetime)>,
 <Feature: pickup_neighborhoods.latitude>,
 <Feature: pickup_neighborhoods.longitude>,
 <Feature: dropoff_neighborhoods.latitude>,
 <Feature: dropoff_neighborhoods.longitude>,
 <Feature: pickup_neighborhoods.COUNT(trips)>,
 <Feature: pickup_neighborhoods.MAX(trips.Transport_service)>,
 <Feature: pickup_neighborhoods.MAX(trips.passenger_count)>,
 <Feature: pickup_neighborhoods.MAX(trips.trip_distance)>,
 <Feature: pickup_neighborhoods.MAX(trips.trip_duration)>,
 <Feature: pickup_neighborhoods.MEAN(trips.Transport_service)>,
 <Feature: pickup_neighborhoods.MEAN(trips.passenger_count)>,
 <Feature: pickup_neighborhoods.MEAN(trips.trip_distance)>,
 <Feature: pickup_neighborhoods.MEAN(trips.trip_duration)>,
 <Feature: pickup_neighborhoods.MEDIAN(trips.Transport_service)>,
 <Feature: pickup_neighborhoods.MEDIAN(trips.passenger_count)>,
 <Feature: pickup_neighborhoods.MEDIAN(trips.trip_distance)>,
 <Feature: pickup_neighborhoods.MEDIAN(trips.trip_duration)>,
 <Feature: pickup_neighborhoods.MIN(trips.Transport_service)>,
 <Feature: pickup_neighborhoods.MIN(trips.passenger_count)>,
 <Feature: pickup_neighborhoods.MIN(trips.trip_distance)>,
 <Feature: pickup_neighborhoods.MIN(trips.trip_duration)>,
 <Feature: pickup_neighborhoods.STD(trips.Transport_service)>,
 <Feature: pickup_neighborhoods.STD(trips.passenger_count)>,
 <Feature: pickup_neighborhoods.STD(trips.trip_distance)>,
 <Feature: pickup_neighborhoods.STD(trips.trip_duration)>,
 <Feature: pickup_neighborhoods.SUM(trips.Transport_service)>,
 <Feature: pickup_neighborhoods.SUM(trips.passenger_count)>,
 <Feature: pickup_neighborhoods.SUM(trips.trip_distance)>,
 <Feature: pickup_neighborhoods.SUM(trips.trip_duration)>,
 <Feature: dropoff_neighborhoods.COUNT(trips)>,
 <Feature: dropoff_neighborhoods.MAX(trips.Transport_service)>,
 <Feature: dropoff_neighborhoods.MAX(trips.passenger_count)>,
 <Feature: dropoff_neighborhoods.MAX(trips.trip_distance)>,
 <Feature: dropoff_neighborhoods.MAX(trips.trip_duration)>,
 <Feature: dropoff_neighborhoods.MEAN(trips.Transport_service)>,
 <Feature: dropoff_neighborhoods.MEAN(trips.passenger_count)>,
 <Feature: dropoff_neighborhoods.MEAN(trips.trip_distance)>,
 <Feature: dropoff_neighborhoods.MEAN(trips.trip_duration)>,
 <Feature: dropoff_neighborhoods.MEDIAN(trips.Transport_service)>,
 <Feature: dropoff_neighborhoods.MEDIAN(trips.passenger_count)>,
 <Feature: dropoff_neighborhoods.MEDIAN(trips.trip_distance)>,
 <Feature: dropoff_neighborhoods.MEDIAN(trips.trip_duration)>,
 <Feature: dropoff_neighborhoods.MIN(trips.Transport_service)>,
 <Feature: dropoff_neighborhoods.MIN(trips.passenger_count)>,
 <Feature: dropoff_neighborhoods.MIN(trips.trip_distance)>,
 <Feature: dropoff_neighborhoods.MIN(trips.trip_duration)>,
 <Feature: dropoff_neighborhoods.STD(trips.Transport_service)>,
 <Feature: dropoff_neighborhoods.STD(trips.passenger_count)>,
 <Feature: dropoff_neighborhoods.STD(trips.trip_distance)>,
 <Feature: dropoff_neighborhoods.STD(trips.trip_duration)>,
 <Feature: dropoff_neighborhoods.SUM(trips.Transport_service)>,
 <Feature: dropoff_neighborhoods.SUM(trips.passenger_count)>,
 <Feature: dropoff_neighborhoods.SUM(trips.trip_distance)>,
 <Feature: dropoff_neighborhoods.SUM(trips.trip_duration)>]

feature_matrix3 = compute_features(features, cutoff_time)

Elapsed: 00:00 | Progress: 100%|██████████
Finishing computing...

feature_matrix3.head()

# Split the feature matrix into train features, train labels, test features
# and test labels (first 75% of the rows for training, the rest for testing).
X_train3, y_train3, X_test3, y_test3 = get_train_test_fm(feature_matrix3,.75)
# The models are trained and evaluated on the square root of trip_duration.
y_train3 = np.sqrt(y_train3)
y_test3 = np.sqrt(y_test3)

lr3=LinearRegression()

lr3.fit(X_train3,y_train3)

LinearRegression()

LinearRegression()

model_pref(lr3, X_train3, X_test3,y_train3,y_test3)

    Data  RSquared      RMSE       MAE
0  Train  0.663515  5.241371  4.060878
1   Test  0.602172  5.456041  4.065442

dt3=DecisionTreeRegressor()

dt3.fit(X_train3,y_train3)

DecisionTreeRegressor()

DecisionTreeRegressor()

model_pref(dt3, X_train3, X_test3,y_train3,y_test3)

    Data  RSquared      RMSE       MAE
0  Train  1.000000  0.000000  0.000000
1   Test  0.460613  6.353015  4.615836

dt_pruned3=DecisionTreeRegressor(max_depth=4)

dt_pruned3.fit(X_train3,y_train3)

DecisionTreeRegressor(max_depth=4)

DecisionTreeRegressor(max_depth=4)

model_pref(dt_pruned3, X_train3, X_test3,y_train3,y_test3)

    Data  RSquared      RMSE       MAE
0  Train  0.782259  4.216308  3.142310
1   Test  0.707492  4.678412  3.642763

rf3=RandomForestRegressor(n_estimators=60,max_depth=4)

rf3.fit(X_train3,y_train3)

RandomForestRegressor(max_depth=4, n_estimators=60)

RandomForestRegressor(max_depth=4, n_estimators=60)

model_pref(rf3, X_train3, X_test3,y_train3,y_test3)

    Data  RSquared      RMSE       MAE
0  Train  0.813560  3.901502  2.890251
1   Test  0.736628  4.439299  3.495446

# Predict on the round-2 test features with the round-2 random forest.
y_pred = rf2.predict(X_test2)
y_pred = y_pred**2 # Undo the sqrt we took earlier
# NOTE(review): [5:] shows every prediction except the first five; if a short
# preview was intended, [:5] would be the slice to use - confirm.
y_pred[5:]

array([1310.36581152,  491.96042817, 1154.01351154,  243.32842874,
       1410.35055774,  561.2419432 ,  637.20920833,  503.2504506 ,
       1538.82853773,  484.05824117, 1394.49283966,  252.7510008 ,
       1353.40355912,  397.88158635, 1728.70687041,  233.78503969,
       1099.38463292,  910.58300441, 1319.7395679 , 1858.61927511,
        602.21764906,  601.86946076, 1338.97923505,  825.11346544,
        486.95738007,  725.9025401 ,  387.62002221,  491.96042817,
        496.66511726, 1354.02235091,  264.66038492, 1404.97665601,
        498.04986964,  617.99999973,  982.69574926,  894.28316602,
       1406.22735429,  621.50646651, 1892.87597261,  610.01039359,
        302.87073866,  325.35322518,  646.85785694,  369.16753615,
        500.71891929,  646.71277962,  322.65800154,  649.122199  ,
        520.57075049,  491.96042817, 1818.95966758,  510.59388032,
        877.8462939 , 1548.97284034,  176.90428711,  230.10084969,
       1427.53836385, 1110.73160568, 1203.80060748,  631.28167863,
        649.0998901 ,  516.75113737,  990.39959202, 1417.0572914 ,
        404.51076264,  610.24798745,  643.91111265, 1309.74581498,
        611.79253966,  351.30080104,  847.14230659, 1123.62897593,
        632.52337598,  894.10551094,  749.1143638 ,  375.91976865,
       1912.78014281,  572.73265399,  640.95028067, 1238.83437943,
       1377.66808894,  998.12980684,  480.0388719 , 1192.56244415,
        880.58247992,  256.63161924,  427.83627759, 1531.66245904,
        500.64257866, 1441.35455951,  254.67956115, 1442.73369715,
        839.88891217,  537.58957045, 1300.48150491,  177.57269597,
        643.5663259 ,  649.7673518 , 1854.84141421,  640.45602086,
       1423.34083247, 1156.11382805,  636.11969233,  257.95698149,
        623.44379007,  697.82051786,  505.71840729, 1417.47548491,
        542.88191805,  640.45602086,  176.90428711,  481.89094592,
       1021.15891961, 1287.52741503,  838.45730444,  999.22770103,
        879.04879502,  330.88142458,  843.83388981,  445.49007045,
        617.85102455,  517.55484488,  614.52352994, 1392.46073453,
       1328.85468045,  646.84663826, 1109.23190147,  592.15335663,
        607.63847198,  682.77156102, 1436.10289607,  247.37724867,
       1507.79167314,  616.44007748,  802.04085881,  505.30975223,
       1619.5831465 , 1808.16162674,  347.36158096,  703.68474258,
        645.15075547,  610.24798745,  645.17083231, 1476.20173011,
       1342.46477937,  608.94805392,  447.15513386, 1420.48381063,
        367.12032433,  642.04543651, 1516.11804244, 1493.45644371,
        328.96409029, 1094.27121435, 1292.67201844,  367.38189656,
       1388.13793297, 1422.34905934, 1482.91983273,  608.47461888,
        392.79430515, 1635.75128668, 1166.32929069, 1068.82598204,
       1400.45869195,  310.58567883,  632.49258352, 1817.43617694,
        656.76808479,  456.70387598, 1444.68179689,  594.66928287,
        247.37724867, 1301.06705702,  996.84841588, 1708.56964099,
        538.02531975,  543.78781146,  890.92781763, 1687.82966743,
        228.24925846,  331.68546996, 1280.5902066 , 1277.86393324,
       1057.70943672, 1016.12302295,  500.71891929,  640.45602086,
        538.02531975,  543.78781146, 1338.70136976,  778.08787221,
       1050.24863046, 1029.22536208,  650.00155276, 1582.11491721,
        650.12912768, 1302.10432574, 1099.38463292, 1991.01959432,
       1344.72785149,  546.50703829, 1258.68225127,  853.03229982,
        247.37724867, 1588.69009032,  475.63344281, 1367.67267035,
       1183.43128158,  931.12292994, 1186.6527806 , 1175.73630144,
       1259.26135699, 1904.0942177 ,  636.72305414,  580.21536995,
       1230.05608682,  738.16130892, 1874.27931182, 1018.10592225,
       1134.28577523,  560.31161614,  274.35562467,  644.28267725,
       1923.40069714, 1311.28966178,  947.03466865,  998.12980684,
       1365.08792443, 1165.21673249,  752.55785354, 1283.35901432,
        754.34567111,  269.63379022, 1613.40832841, 1127.1083275 ,
       2000.11127237,  752.33553322,  437.56840338,  934.57033787,
        369.16753615, 1321.08319189,  605.37417045, 1287.83072554,
       1529.65323776, 1243.5914223 , 1171.06618796, 1241.96340264,
       1387.61485922,  615.27899835, 1175.32752408,  371.28047949,
       1130.37512189, 1021.10853735, 1028.18921084,  652.23607244,
        791.4256209 , 1441.33883581,  352.97887505, 1119.17414436,
        561.30280144, 1828.46860356,  220.81943374,  252.7510008 ,
        472.28136178, 1472.49834578,  495.30448814,  843.83388981,
        564.42211338,  591.79586021, 1835.03706854,  617.85102455,
       1124.7429811 , 2126.9000051 , 1306.72199624, 1287.52741503,
        235.46710728,  595.33800201,  214.93186295, 1149.83523057,
       1917.70807174,  898.40397198,  709.69652386, 1485.71940103,
        183.456568  ,  940.89287993,  666.17875197, 1845.2716145 ,
        893.31274116, 1059.1464748 ,  977.91886643,  760.19962659,
        639.63734265, 1383.82460288, 1489.59786038,  475.85697462,
        338.10327704,  352.97887505,  778.08787221,  709.0335948 ,
       1310.22777474,  610.94729887,  503.2504506 ,  259.04733498,
       1498.83889837,  572.16928787,  642.20928047, 1103.53133658,
        393.36737217,  581.98773237,  497.66756678,  367.38189656,
        728.87554217, 1463.51454129,  643.26416509,  743.86808514,
       1818.41731015, 1018.10592225, 1879.57549046,  488.07633697,
       1083.36944796,  894.87192928, 1274.56900385, 1594.70189723,
        791.97230501,  645.50678619, 1604.78510535, 1325.70035392,
        529.14738571, 1004.92286016,  337.74796079,  786.18164329,
        658.49177559, 1794.77387604,  604.21764497, 1226.3501858 ,
       1627.35496913,  176.90428711,  593.31768405,  224.03962767,
        642.05801073, 1101.31565444,  500.71891929, 1102.07530103,
       1043.68338894, 1263.25970077,  635.88774213,  645.64051997,
       1051.83221269, 1098.74055456,  395.60691755, 1694.78109429,
        740.93565661,  843.83388981,  176.90428711,  546.50703829,
        554.51729509,  472.05208293])

feature_importances(rf2, feature_matrix2.drop(['trip_duration'],axis=1).columns, n=10)

1: Feature: trip_distance, 0.941
2: Feature: dropoff_neighborhoods.longitude, 0.017
3: Feature: dropoff_neighborhoods.latitude, 0.015
4: Feature: pickup_neighborhoods.longitude, 0.014
5: Feature: pickup_neighborhoods.latitude, 0.005
6: Feature: MINUTE(pickup_datetime), 0.001
7: Feature: dropoff_neighborhood = AT, 0.001
8: Feature: dropoff_neighborhood = P, 0.001
9: Feature: pickup_neighborhood = AD, 0.001
10: Feature: HOUR(pickup_datetime), 0.001

# Convert notebook to html
!jupyter nbconvert --to html "/content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week_Nine_Predictive_Engineering/Case_Studies/New_York_City_Bus_Ride/Practice_Case_Study_NYC_BUS_Case_Study.ipynb"

	id	Transport_service	pickup_datetime	dropoff_datetime	passenger_count	trip_distance	pickup_longitude	pickup_latitude	dropoff_longitude	dropoff_latitude	payment_type	trip_duration	pickup_neighborhood	dropoff_neighborhood
0	0	3	2019-01-01 00:00:00	2019-01-01 00:06:00	1	1.32	-73.961258	40.796200	-73.950050	40.787312	3	387	AH	C
997	997	4	2019-01-01 02:38:00	2019-01-01 02:52:00	3	5.57	-73.952347	40.824081	-73.996429	40.759918	1	828	AL	P
996	996	4	2019-01-01 02:38:00	2019-01-01 02:41:00	2	0.20	-73.980171	40.745308	-73.984192	40.745892	2	184	Y	AO
995	995	3	2019-01-01 02:38:00	2019-01-01 02:52:00	1	2.83	-73.991966	40.759331	-74.004707	40.724442	2	815	P	AB
994	994	2	2019-01-01 02:38:00	2019-01-01 02:49:00	2	2.48	-73.961441	40.694302	-73.959938	40.662540	1	655	V	AF
993	993	3	2019-01-01 02:38:00	2019-01-01 02:40:00	4	0.72	-73.974014	40.782967	-73.969894	40.788155	2	162	I	U
992	992	2	2019-01-01 02:38:00	2019-01-01 03:13:00	3	10.50	-73.956985	40.766346	-73.966179	40.674366	3	2131	K	V
991	991	4	2019-01-01 02:38:00	2019-01-01 02:52:00	3	6.80	-73.911682	40.775295	-73.903908	40.817696	2	898	W	S
990	990	1	2019-01-01 02:38:00	2019-01-01 03:22:00	1	13.80	-73.976143	40.775990	-73.940956	40.676426	2	2693	AV	AW
989	989	2	2019-01-01 02:38:00	2019-01-01 03:08:00	1	9.00	-73.981331	40.780663	-73.848824	40.722755	3	1841	AV	B

	id	Transport_service	pickup_datetime	passenger_count	trip_distance	pickup_longitude	pickup_latitude	dropoff_longitude	dropoff_latitude	payment_type	trip_duration	pickup_neighborhood	dropoff_neighborhood
0	0	3	2019-01-01 00:00:00	1	1.32	-73.961258	40.796200	-73.950050	40.787312	3	387	AH	C
1	1	1	2019-01-01 00:01:00	3	13.70	-73.956169	40.707756	-73.939949	40.839558	3	1568	Z	S
2	2	4	2019-01-01 00:01:00	1	5.30	-73.993103	40.752632	-73.953903	40.816540	1	1219	D	AL
3	3	2	2019-01-01 00:01:00	2	7.19	-73.983009	40.731419	-73.930969	40.808460	2	873	AT	J
4	4	1	2019-01-01 00:02:00	2	2.90	-74.004631	40.747234	-73.976395	40.777237	1	1091	AG	AV

	count	mean	std	min	25%	50%	75%	max
id	1452.0	749.218320	434.555493	0.000000	369.750000	749.500000	1125.250000	1499.000000
Transport_service	1452.0	2.488292	1.123890	1.000000	1.000000	2.000000	4.000000	4.000000
passenger_count	1452.0	2.506198	1.111603	1.000000	2.000000	2.000000	4.000000	4.000000
trip_distance	1452.0	3.173320	2.798477	0.000000	1.300000	2.250000	4.260000	17.990000
pickup_longitude	1452.0	-73.973295	0.025778	-74.017311	-73.989311	-73.979614	-73.961092	-73.781807
pickup_latitude	1452.0	40.750311	0.029513	40.638889	40.733079	40.751900	40.768478	40.847500
dropoff_longitude	1452.0	-73.965157	0.034889	-74.024834	-73.988117	-73.974369	-73.950407	-73.770943
dropoff_latitude	1452.0	40.750691	0.038322	40.633560	40.729924	40.751728	40.774967	40.848629
trip_duration	1452.0	828.261019	528.279643	18.000000	428.000000	708.000000	1116.500000	3201.000000

	id	Transport_service	pickup_datetime	passenger_count	pickup_longitude	pickup_latitude	dropoff_longitude	dropoff_latitude	payment_type	trip_duration	pickup_neighborhood	dropoff_neighborhood
880	880	2	2019-01-01 02:15:00	4	-74.002586	40.750298	-74.002861	40.750446	2	36	AG	AG
1116	1116	1	2019-01-01 03:01:00	1	-73.987831	40.728558	-73.988747	40.727280	3	151	H	H
1455	1455	1	2019-01-01 04:09:00	1	-73.985893	40.763649	-73.985741	40.763672	2	80	AR	AR
1488	1488	2	2019-01-01 04:16:00	3	-74.014198	40.709988	-74.014198	40.709988	2	18	AU	AU

	id	pickup_datetime
0	0	2019-01-01 00:00:00
1004	1004	2019-01-01 02:39:00
1003	1003	2019-01-01 02:39:00
1002	1002	2019-01-01 02:39:00
1001	1001	2019-01-01 02:39:00
1000	1000	2019-01-01 02:39:00
999	999	2019-01-01 02:39:00
998	998	2019-01-01 02:39:00
997	997	2019-01-01 02:38:00
1005	1005	2019-01-01 02:39:00

	pickup_neighborhoods.latitude	passenger_count	trip_duration	IS_WEEKEND(pickup_datetime)	trip_distance	pickup_neighborhoods.longitude	dropoff_neighborhood = AM	dropoff_neighborhood = Y	dropoff_neighborhood = AT	dropoff_neighborhood = C	...	pickup_neighborhood = AM	pickup_neighborhood = AC	pickup_neighborhood = P	pickup_neighborhood = AO	pickup_neighborhood = N	pickup_neighborhood = I	pickup_neighborhood = R	dropoff_neighborhoods.latitude	payment_type	Transport_service
id
0	40.804349	1	387	False	1.32	-73.961716	False	False	False	True	...	False	False	False	False	False	False	False	40.783780	3	3
1004	40.744928	1	1215	False	4.00	-73.919159	False	False	False	False	...	False	False	False	False	False	False	False	40.766575	1	4
1003	40.715828	4	215	False	0.63	-73.954298	False	False	False	False	...	False	False	False	False	False	False	False	40.715828	1	3
1002	40.729670	3	561	False	3.20	-73.981693	True	False	False	False	...	False	False	False	False	False	False	False	40.775357	2	2
1001	40.721435	3	1576	False	8.28	-73.998366	False	False	False	False	...	False	False	False	False	False	False	False	40.793597	2	1

	payment_type	dropoff_neighborhoods.longitude	pickup_neighborhood = AD	pickup_neighborhood = AR	pickup_neighborhood = AT	pickup_neighborhood = AM	pickup_neighborhood = AC	pickup_neighborhood = P	pickup_neighborhood = AO	pickup_neighborhood = N	...	dropoff_neighborhood = P	dropoff_neighborhood = I	dropoff_neighborhood = N	IS_WEEKEND(pickup_datetime)	dropoff_neighborhoods.latitude	WEEKDAY(pickup_datetime)	Transport_service	DAY(pickup_datetime)	passenger_count	MINUTE(pickup_datetime)
id
0	3	-73.953145	False	False	False	False	False	False	False	False	...	False	False	False	False	40.783780	1	3	1	1	0
1	3	-73.934381	False	False	False	False	False	False	False	False	...	False	False	False	False	40.836792	1	1	1	3	1
2	1	-73.948046	False	False	False	False	False	False	False	False	...	False	False	False	False	40.818445	1	4	1	1	1
3	2	-73.940427	False	False	True	False	False	False	False	False	...	False	False	False	False	40.799573	1	2	1	2	1
4	1	-73.982322	False	False	False	False	False	False	False	False	...	False	False	False	False	40.776270	1	1	1	2	2

Practice Project - Predictive Analytics: New York City Bus Ride Duration Prediction¶

Objective¶

Dataset¶

We will do the following steps:¶

Import the necessary libraries¶

Load the datasets¶

View the Datasets¶

Let's check the first five rows of the data¶

Check the number of unique values in the dataset.¶

Question 1 : Check summary statistics of the dataset¶

Checking for the rows for which trip_distance is 0¶

Replacing the 0 values with median of the trip distance¶

Question 2: Univariate Analysis¶

Question 2.1: Build histogram for numerical columns¶

Question 2.2 Plotting countplot for passenger_count¶

Question 2.3 Plotting countplot for pickup_neighborhood and dropoff_neighborhood¶

Bivariate analysis¶

Plot a scatter plot for trip distance and trip duration¶

Step 2: Prepare the Data¶

Question 3: Define entities and relationships for the Deep Feature Synthesis (2 Marks)¶

Step 3: Create baseline features using Deep Feature Synthesis¶

Question 4: Creating a baseline model with only 1 transform primitive¶

Question 4.1: Define the transform primitive for weekend and define features using DFS¶

Question 4.2: Compute features and define the feature matrix¶

Model Building¶

Transforming the duration variable on sqrt and log¶

Splitting the data into train and test¶

Question 4.3 Build Linear regression using only weekend transform primitive¶

Check the performance of the model¶

Question 4.4 Building decision tree using only weekend transform primitive¶

Check the performance of the model¶

Question 4.5 Building Pruned decision tree using only weekend transform primitive¶

Check the performance of the model¶

Question 4.6 Building Random Forest using only weekend transform primitive¶

Check the performance of the model¶

Step 4: Adding more Transform Primitives and creating new model¶

Question 5: Create models with more transform primitives¶

Question 5.1 Define more transform primitives and define features using DFS¶

Question 5.2 Compute features and define feature matrix¶

Question 5.3 Building Linear regression using more transform primitive¶

Check the performance of the model¶

Question 5.4 Building Decision tree using more transform primitive¶

Check the performance of the model¶

Question 5.5 Building Pruned Decision tree using more transform primitive¶

Check the performance of the model¶

Question 5.6 Building Random Forest using more transform primitive¶

Check the performance of the model¶

Step 5: Add Aggregation Primitives¶

Question 6: Create models with transform and aggregate primitives¶

Question 6.1: Define more transform and aggregate primitives and define features using DFS¶

Question 6.2: Compute features and define the feature matrix¶

Question 6.3 Building Linear regression model with transform and aggregate primitive.¶

Check the performance of the model¶

Question 6.4 Building Decision tree with transform and aggregate primitive.¶

Check the performance of the model¶

Question 6.5 Building Pruned Decision tree with transform and aggregate primitive.¶

Check the performance of the model¶

Question 6.6 Building Random Forest with transform and aggregate primitive.¶

Check the performance of the model¶

Question 6.7 How do these aggregate transforms impact performance? How do they impact training time?¶

Question 7: What are some important features based on model2 and how can they affect the duration of the rides?¶

Question 7: What are some important features based on `model2` and how can they affect the duration of the rides?¶