import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

Load Clean Data

# Read the cleaned trip records and keep a 10,000-row random subset drawn
# with a fixed seed, so every run works on the same rows.
data = pd.read_csv('trip_Nov_1to14_clean.csv').sample(n=10000, random_state=0)

Train Test Split

from sklearn.model_selection import train_test_split

# Columns excluded from the feature matrix.
to_drop = ['trips_pooled', 'shared_trip_authorized']

X = data.drop(to_drop, axis=1)
# NOTE(review): the target vector `y` is not assigned anywhere in this
# section; it must already exist (presumably one of the dropped columns)
# before the split below runs -- confirm.

# 70% train / 30% hold-out, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y)

# Halve the 30% hold-out into test and validation sets.  The split must be
# taken from the hold-out (X_test, y_test): splitting the training data here
# would make every "validation" and "test" row one the models were fitted on.
# Stratified like the first split so both halves keep the class balance.
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, test_size=0.5, random_state=0, stratify=y_test)


from sklearn.dummy import DummyClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import log_loss

# Baseline: always predict the most frequent class.
dummy = DummyClassifier(strategy='most_frequent', random_state=0).fit(X_train, y_train)

# Predict the training and validation sets separately so the "train" and
# "test" lines below really describe different data.  (The "test" lines report
# the validation split, matching the accuracy line.)
dummy_train_pred = dummy.predict(X_train)
dummy_pred = dummy.predict(X_val)
dummy_probs = dummy.predict_proba(X_val)

# Log-loss is computed on the same split as every other "test" metric.
print('log_loss:', log_loss(y_val, dummy_probs))
print('train score accuracy', dummy.score(X_train, y_train))
print('test score accuracy', dummy.score(X_val, y_val))

print('train score precision', precision_score(y_train, dummy_train_pred))
print('test score precision', precision_score(y_val, dummy_pred))
print('train score recall', recall_score(y_train, dummy_train_pred))
print('test score recall', recall_score(y_val, dummy_pred))
print(classification_report(y_val, dummy_pred, labels=[0, 1]))
log_loss: 3.789397181613058
train score accuracy 0.8925714285714286
test score accuracy 0.8948571428571429
train score precision 0.0
test score precision 0.0
train score recall 0.0
test score recall 0.0

              precision    recall  f1-score   support

           0       0.89      1.00      0.94      3132
           1       0.00      0.00      0.00       368

    accuracy                           0.89      3500
   macro avg       0.45      0.50      0.47      3500
weighted avg       0.80      0.89      0.85      3500

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import log_loss

# Depth-limited random forest; seeded (like the rest of the file) so the
# printed numbers are reproducible from run to run.
rf = RandomForestClassifier(max_depth=7, random_state=0).fit(X_train, y_train)

# Predict the training and validation sets separately so the "train" and
# "test" lines below describe different data.  (The "test" lines report the
# validation split, matching the accuracy line.)
rf_train_pred = rf.predict(X_train)
rf_pred = rf.predict(X_val)
rf_probs = rf.predict_proba(X_val)

# The probabilities come from X_val, so they must be scored against y_val.
print('log_loss:', log_loss(y_val, rf_probs))
print('train score accuracy', rf.score(X_train, y_train))
print('test score accuracy', rf.score(X_val, y_val))

print('train score precision', precision_score(y_train, rf_train_pred))
print('test score precision', precision_score(y_val, rf_pred))
print('train score recall', recall_score(y_train, rf_train_pred))
print('test score recall', recall_score(y_val, rf_pred))
print(classification_report(y_val, rf_pred, labels=[0, 1]))
log_loss: 0.4744675393660643
train score accuracy 0.9634285714285714
test score accuracy 0.964
train score precision 1.0
test score precision 1.0
train score recall 0.657608695652174
test score recall 0.657608695652174

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      3132
           1       1.00      0.66      0.79       368

    accuracy                           0.96      3500
   macro avg       0.98      0.83      0.89      3500
weighted avg       0.97      0.96      0.96      3500

from sklearn.inspection import permutation_importance

# Feature importances of the fitted forest as a Series indexed by the
# feature-matrix column names.  These come from the estimator's
# `feature_importances_` attribute; `permutation_importance` is imported
# above but not called here.
feat_importances = pd.Series(rf.feature_importances_, index=X.columns)

Cross Validation

import pandas as pd
import numpy as np 
import random

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

import warnings
# Suppress these four warning categories for the rest of the session.
for _category in (FutureWarning, DeprecationWarning, RuntimeWarning, UserWarning):
    warnings.filterwarnings('ignore', category=_category)

import joblib
# Draw 10,000 rows from `data` with a fixed seed.
# NOTE(review): `data` was already reduced to 10,000 rows when it was loaded,
# so this most likely returns the same rows in a different order.  Any label
# vector built from `data` would then no longer line up positionally with
# features built from `trip_small` -- confirm.
trip_small = data.sample(n=10000, random_state=0)
from sklearn.model_selection import train_test_split

# Columns excluded from the feature matrix.
to_drop = ['trips_pooled', 'shared_trip_authorized']

X = trip_small.drop(to_drop, axis=1)
# NOTE(review): `y` is not assigned anywhere in this section; it must already
# exist and line up row-for-row with `X` (i.e. be taken from `trip_small`)
# before the split below runs -- confirm.

from sklearn.feature_selection import SelectKBest, chi2

# Split BEFORE selecting features so the selector never sees hold-out labels.
# 70% train / 30% hold-out, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y)

# Halve the 30% hold-out into test and validation sets.  The split must be
# taken from the hold-out (X_test, y_test): splitting the training data here
# would make every "validation" and "test" row one the models were fitted on.
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, test_size=0.5, random_state=0, stratify=y_test)

# Choose the 5 best features by chi-squared score using the training rows
# only, then apply that same column selection to every split.
selector = SelectKBest(chi2, k=5).fit(X_train, y_train)
X_train = selector.transform(X_train)
X_val = selector.transform(X_val)
X_test = selector.transform(X_test)

# Reduced version of the full feature matrix, kept under its original name
# for any later code that refers to `X_new`.
X_new = selector.transform(X)
from sklearn.ensemble import RandomForestClassifier
# Metrics computed for every grid candidate; `refit='recall'` below selects
# and refits the candidate with the best mean recall.
scoring = ['precision', 'f1', 'recall']

# Seeded so the search result is reproducible.
rf = RandomForestClassifier(random_state=0)

# Only settings that change the fitted model belong in the grid.  `n_jobs`
# merely controls parallelism (and 0 is not a valid value), so it is passed to
# the search itself instead of multiplying the number of candidates.
parameters = {
    'n_estimators': [5, 10, 50, 100, 200],
    'max_depth': [1, 2, 3, 5],
}

cv = GridSearchCV(rf, parameters, cv=5, refit='recall', scoring=scoring, n_jobs=-1)
cv.fit(X_train, y_train)

# Persist the refitted best model for the model-selection step.
joblib.dump(cv.best_estimator_, 'models/rf_model.pkl')
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=5, max_features=5,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
                       oob_score=False, random_state=None, verbose=0,
from sklearn.linear_model import LogisticRegression
# Metrics computed for every grid candidate; `refit='recall'` below selects
# and refits the candidate with the best mean recall.
scoring = ['precision', 'f1', 'recall']

# L1-penalised logistic regression with the liblinear solver.  liblinear
# ignores `n_jobs`, so parallelism is requested on the grid search instead.
lr = LogisticRegression(max_iter=1000, penalty='l1', solver='liblinear')

# Inverse regularisation strengths to try.
parameters = {
    'C': [0.1, 1],
}

cv = GridSearchCV(lr, parameters, cv=5, refit='recall', scoring=scoring, n_jobs=-1)
cv.fit(X_train, y_train)

# Persist the refitted best model for the model-selection step.
joblib.dump(cv.best_estimator_, 'models/lr_model.pkl')
from sklearn.svm import SVC
# Metrics computed for every grid candidate; `refit='recall'` below selects
# and refits the candidate with the best mean recall.
scoring = ['precision', 'f1', 'recall']
svc = SVC()

# Kernel type and regularisation strength to try.
parameters = {
    'kernel': ['rbf', 'linear'],
    'C': [0.1, 1, 10],
}

cv = GridSearchCV(svc, parameters, cv=5, refit='recall', scoring=scoring)
cv.fit(X_train, y_train)

# Persist the refitted best model for the model-selection step.
joblib.dump(cv.best_estimator_, 'models/svc_model.pkl')
from sklearn.ensemble import GradientBoostingClassifier
# Metrics computed for every grid candidate; `refit='recall'` below selects
# and refits the candidate with the best mean recall.
scoring = ['precision', 'f1', 'recall']

gb = GradientBoostingClassifier()

# Ensemble size, tree depth and shrinkage values to try.
parameters = {
    'n_estimators': [50, 100],
    'max_depth': [3, 7],
    'learning_rate': [0.01, 0.1, 1],
}

cv = GridSearchCV(gb, parameters, cv=5, refit='recall', scoring=scoring)
cv.fit(X_train, y_train)

# Persist the refitted best model for the model-selection step.
joblib.dump(cv.best_estimator_, 'models/gb_model.pkl')

Model Selection

# Reload the tuned estimators, keyed by short model name.  They are read from
# 'models/', the directory every `joblib.dump` call in this file writes to, so
# the comparison uses the models that were just tuned.
# NOTE: joblib files are pickles -- only load files you created yourself.
models = {
    mdl: joblib.load(f'models/{mdl}_model.pkl')
    for mdl in ['rf', 'lr', 'svc', 'gb']
}
import numpy as np
from time import time

from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import log_loss

def evaluate_model(name, model, X_val, y_val):
    """Print accuracy, precision, recall and F1 of ``model`` on one data set.

    Args:
        name: Label printed at the start of every metric line.
        model: Fitted estimator exposing ``predict(X)`` and ``score(X, y)``.
        X_val: Feature matrix to evaluate on.
        y_val: True labels for ``X_val``.

    Returns:
        None. One line per metric is written to stdout, each followed by the
        prediction latency in seconds (rounded to two decimals).
    """
    # Time only the prediction call so "Latency" reflects inference cost and
    # not the metric computations that follow.
    start = time()
    pred = model.predict(X_val)
    latency = round(time() - start, 2)

    accuracy = model.score(X_val, y_val)
    precision = precision_score(y_val, pred)
    recall = recall_score(y_val, pred)
    # NOTE: F1 is support-weighted across classes, whereas precision and
    # recall above use the default (positive-class) averaging.
    f1 = f1_score(y_val, pred, average='weighted')

    metrics = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
    }
    for metric, value in metrics.items():
        print(f'{name} -- {metric}: {value} Latency: {latency}')
# Print the metrics of every loaded model on the validation split.
for name, mdl in models.items():
    evaluate_model(name, mdl, X_val, y_val)

Final Model

# Print the metrics of the gradient-boosting model on the test split.
evaluate_model('gb', models['gb'], X_test, y_test)
from sklearn.ensemble import GradientBoostingClassifier
# Precision level that is looked up on the precision-recall curve below.
# NOTE(review): "percision" is a misspelling of "precision".
percision_point = 0.75
# NOTE(review): presumably a false-positive-rate level for the ROC curve; it
# is not referenced in the code visible here -- confirm it is still needed.
fpr_point = 0.16
from sklearn.metrics import precision_recall_curve, roc_curve, auc

# Refit gradient boosting with fixed hyper-parameters on the training split
# and take its decision scores for the test split.
# NOTE(review): despite the `y_lr` name these are gradient-boosting scores,
# not logistic-regression output; the name is kept so code that refers to it
# keeps working.
y_lr = GradientBoostingClassifier(
    max_depth=7, learning_rate=0.1, n_estimators=100
).fit(X_train, y_train).decision_function(X_test)

precision, recall, thresholds = precision_recall_curve(y_test, y_lr)

# Point on the curve whose decision threshold is nearest to 0.
closest_zero = np.argmin(np.abs(thresholds))
closest_zero_p = precision[closest_zero]
closest_zero_r = recall[closest_zero]

# Positions on the curve whose precision matches the target level.
# NOTE(review): np.isclose uses tight default tolerances, so this may well be
# empty; the result is not used in the code visible here.
precision_index = np.where(np.isclose(precision, percision_point))

# Precision-recall curve, with the threshold-0 point circled in red.
plt.figure(figsize=(6, 6))
plt.xlim([0.0, 1.01])
plt.ylim([0.0, 1.01])
plt.plot(precision, recall, label='Precision-Recall Curve')
plt.plot(closest_zero_p, closest_zero_r, 'o', markersize = 12, fillstyle = 'none', c='r', mew=3)
plt.xlabel('Precision', fontsize=16)
plt.ylabel('Recall', fontsize=16)
# Title and legend added so the curve's label is actually displayed, matching
# the ROC figure below.
plt.title('Precision-Recall curve', fontsize=16)
plt.legend(loc='lower left', fontsize=13)

fpr, tpr, _ = roc_curve(y_test, y_lr)
roc_auc = auc(fpr, tpr)

# ROC curve with the chance diagonal drawn as a dashed reference line.
plt.figure(figsize=(6, 6))
plt.xlim([-0.01, 1.00])
plt.ylim([-0.01, 1.01])
plt.plot(fpr, tpr, lw=3, label='Gradient Boosting ROC curve (area = {:0.2f})'.format(roc_auc))
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.title('ROC curve', fontsize=16)
plt.legend(loc='lower right', fontsize=13)
plt.plot([0, 1], [0, 1], color='navy', lw=3, linestyle='--')