import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
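# Load the cleaned Nov 1-14 trip data and work on a 10k-row subsample to keep runtimes manageable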
data = pd.read_csv('trip_Nov_1to14_clean.csv')
data = data.sample(n=10000, random_state=0)
from sklearn.model_selection import train_test_split
to_drop = ['trips_pooled', 'shared_trip_authorized']
X = data.drop(to_drop, axis=1)
y = data['shared_trip_authorized']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)
# Split the held-out 30% in half: a validation set for model selection and a final test set.
# (Splitting X_train here instead would make the validation and test sets subsets of the
# training data, leaking training rows into evaluation.)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=0, stratify=y_test)
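# Sanity check on the resulting 70/15/15 split
print(X_train.shape, X_val.shape, X_test.shape)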
# import numpy as np
# import matplotlib.pyplot as plt
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.svm import SVC
# from sklearn.model_selection import learning_curve
# from sklearn.model_selection import ShuffleSplit
# def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
#                         n_jobs=None, train_sizes=np.linspace(0.1, 1.0, 5)):
# """
# Generate 3 plots: the test and training learning curve, the training
# samples vs fit times curve, the fit times vs score curve.
# Parameters
# ----------
# estimator : object type that implements the "fit" and "predict" methods
# An object of that type which is cloned for each validation.
# title : string
# Title for the chart.
# X : array-like, shape (n_samples, n_features)
# Training vector, where n_samples is the number of samples and
# n_features is the number of features.
# y : array-like, shape (n_samples) or (n_samples, n_features), optional
# Target relative to X for classification or regression;
# None for unsupervised learning.
# axes : array of 3 axes, optional (default=None)
# Axes to use for plotting the curves.
# ylim : tuple, shape (ymin, ymax), optional
# Defines minimum and maximum y-values plotted.
# cv : int, cross-validation generator or an iterable, optional
# Determines the cross-validation splitting strategy.
# Possible inputs for cv are:
# - None, to use the default 5-fold cross-validation,
# - integer, to specify the number of folds.
# - :term:`CV splitter`,
# - An iterable yielding (train, test) splits as arrays of indices.
# For integer/None inputs, if ``y`` is binary or multiclass,
# :class:`StratifiedKFold` is used. If the estimator is not a classifier
# or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.
# Refer :ref:`User Guide <cross_validation>` for the various
# cross-validators that can be used here.
# n_jobs : int or None, optional (default=None)
# Number of jobs to run in parallel.
# ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
# ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
# for more details.
# train_sizes : array-like, shape (n_ticks,), dtype float or int
# Relative or absolute numbers of training examples that will be used to
# generate the learning curve. If the dtype is float, it is regarded as a
# fraction of the maximum size of the training set (that is determined
# by the selected validation method), i.e. it has to be within (0, 1].
# Otherwise it is interpreted as absolute sizes of the training sets.
# Note that for classification the number of samples usually has to
# be big enough to contain at least one sample from each class.
# (default: np.linspace(0.1, 1.0, 5))
# """
#     if axes is None:
#         _, axes = plt.subplots(1, 3, figsize=(20, 5))
#     axes[0].set_title(title)
#     if ylim is not None:
#         axes[0].set_ylim(*ylim)
#     axes[0].set_xlabel("Training examples")
#     axes[0].set_ylabel("Score")
#     train_sizes, train_scores, test_scores, fit_times, _ = \
#         learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,
#                        train_sizes=train_sizes,
#                        return_times=True)
#     train_scores_mean = np.mean(train_scores, axis=1)
#     train_scores_std = np.std(train_scores, axis=1)
#     test_scores_mean = np.mean(test_scores, axis=1)
#     test_scores_std = np.std(test_scores, axis=1)
#     fit_times_mean = np.mean(fit_times, axis=1)
#     fit_times_std = np.std(fit_times, axis=1)
#     # Plot learning curve
#     axes[0].grid()
#     axes[0].fill_between(train_sizes, train_scores_mean - train_scores_std,
#                          train_scores_mean + train_scores_std, alpha=0.1,
#                          color="r")
#     axes[0].fill_between(train_sizes, test_scores_mean - test_scores_std,
#                          test_scores_mean + test_scores_std, alpha=0.1,
#                          color="g")
#     axes[0].plot(train_sizes, train_scores_mean, 'o-', color="r",
#                  label="Training score")
#     axes[0].plot(train_sizes, test_scores_mean, 'o-', color="g",
#                  label="Cross-validation score")
#     axes[0].legend(loc="best")
#     # Plot n_samples vs fit_times
#     axes[1].grid()
#     axes[1].plot(train_sizes, fit_times_mean, 'o-')
#     axes[1].fill_between(train_sizes, fit_times_mean - fit_times_std,
#                          fit_times_mean + fit_times_std, alpha=0.1)
#     axes[1].set_xlabel("Training examples")
#     axes[1].set_ylabel("fit_times")
#     axes[1].set_title("Scalability of the model")
#     # Plot fit_time vs score
#     axes[2].grid()
#     axes[2].plot(fit_times_mean, test_scores_mean, 'o-')
#     axes[2].fill_between(fit_times_mean, test_scores_mean - test_scores_std,
#                          test_scores_mean + test_scores_std, alpha=0.1)
#     axes[2].set_xlabel("fit_times")
#     axes[2].set_ylabel("Score")
#     axes[2].set_title("Performance of the model")
#     return plt
# fig, axes = plt.subplots(3, 2, figsize=(10, 15))
# title = "Learning Curves (RandomForest Classifier)"
# # Cross-validation with 20 shuffled splits to get smoother mean test and train
# # score curves, each time with 20% of the data randomly held out as a validation set.
# cv = ShuffleSplit(n_splits=20, test_size=0.2, random_state=0)
# estimator = RandomForestClassifier()
# plot_learning_curve(estimator, title, X, y, axes=axes[:, 0], ylim=(0.7, 1.01),
# cv=cv, n_jobs=4)
# title = "Learning Curves (GradientBoostingClassifier)"
# cv = ShuffleSplit(n_splits=20, test_size=0.2, random_state=0)
# estimator = GradientBoostingClassifier()
# plot_learning_curve(estimator, title, X, y, axes=axes[:, 1], ylim=(0.7, 1.01),
# cv=cv, n_jobs=4)
# plt.show()
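# Baseline: a majority-class dummy classifier anchors all later metric comparisons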
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import log_loss
dummy = DummyClassifier(strategy='most_frequent', random_state=0).fit(X_train, y_train)
dummy_pred = dummy.predict(X_val)
dummy_probs = dummy.predict_proba(X_val)
print('log_loss:', log_loss(y_val, dummy_probs))
print('--'*12)
print('train accuracy', dummy.score(X_train, y_train))
print('validation accuracy', dummy.score(X_val, y_val))
print('--'*12)
# zero_division=0 avoids an undefined precision when the majority class is negative
print('validation precision', precision_score(y_val, dummy_pred, zero_division=0))
print('validation recall', recall_score(y_val, dummy_pred))
print('--'*12)
print('\n')
print(classification_report(y_val, dummy_pred, labels=[0, 1], zero_division=0))
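# The classification report hides the raw counts; a confusion matrix makes the
# majority-class behavior explicit (a minimal sketch on the same validation predictions):
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_val, dummy_pred, labels=[0, 1]))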
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import log_loss
rf = RandomForestClassifier(max_depth=7).fit(X_train, y_train)
rf_pred = rf.predict(X_val)
rf_probs = rf.predict_proba(X_val)
print('log_loss:', log_loss(y_val, rf_probs))
print('--'*12)
print('train accuracy', rf.score(X_train, y_train))
print('validation accuracy', rf.score(X_val, y_val))
print('--'*12)
print('validation precision', precision_score(y_val, rf_pred))
print('validation recall', recall_score(y_val, rf_pred))
print('--'*12)
print('\n')
print(classification_report(y_val, rf_pred, labels=[0, 1]))
from sklearn.inspection import permutation_importance
# Impurity-based importances from the fitted random forest
feat_importances = pd.Series(rf.feature_importances_, index=X.columns)
feat_importances.nlargest(10).plot(kind='barh')
plt.show()
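# Impurity-based importances can favor high-cardinality features; permutation importance
# on the validation set is a useful cross-check using the import above
# (a sketch; n_repeats=5 is an arbitrary choice):
perm = permutation_importance(rf, X_val, y_val, n_repeats=5, random_state=0)
pd.Series(perm.importances_mean, index=X.columns).nlargest(10).plot(kind='barh')
plt.show()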
import pandas as pd
import numpy as np
import random
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=RuntimeWarning)
warnings.filterwarnings('ignore', category=UserWarning)
import joblib
# data was already subsampled to 10k rows above, so this re-draw returns the same rows reshuffled
trip_small = data.sample(n=10000, random_state=0)
from sklearn.model_selection import train_test_split
to_drop = ['trips_pooled', 'shared_trip_authorized']
X = trip_small.drop(to_drop, axis=1)
y = trip_small['shared_trip_authorized']
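# Keep only the 5 features most associated with the target by chi-squared score
# (chi2 requires non-negative feature values)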
from sklearn.feature_selection import SelectKBest, chi2
X_new = SelectKBest(chi2, k=5).fit_transform(X, y)
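# SelectKBest returns a bare array, dropping column names; refitting the selector
# recovers which features were kept (a small diagnostic sketch, duplicating the fit above):
selector = SelectKBest(chi2, k=5).fit(X, y)
print('selected features:', list(X.columns[selector.get_support()]))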
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=0, stratify=y)
# As above, split the held-out 30% into equal validation and test sets rather than
# re-splitting the training data
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=0, stratify=y_test)
from sklearn.ensemble import RandomForestClassifier
scoring = ['precision','f1', 'recall']
# n_jobs is a parallelism setting, not a hyperparameter (and n_jobs=0 is invalid),
# so it belongs on the estimator rather than in the grid
rf = RandomForestClassifier(n_jobs=-1)
parameters = {
    'n_estimators': [5, 10, 50, 100, 200],
    'max_depth': [1, 2, 3, 5],
    'max_features': [1, 2, 3, 4, 5]
}
cv = GridSearchCV(rf, parameters, cv=5, refit='recall', scoring=scoring)
cv.fit(X_train,y_train)
print(cv.best_estimator_)
joblib.dump(cv.best_estimator_, 'models/rf_model.pkl')
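# Beyond the single best estimator, cv_results_ records every grid point; a quick
# sketch of the top settings ranked by mean cross-validated recall (the refit metric):
results = pd.DataFrame(cv.cv_results_)
print(results.sort_values('mean_test_recall', ascending=False)[['params', 'mean_test_recall', 'mean_test_precision']].head())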
from sklearn.linear_model import LogisticRegression
scoring = ['precision','f1', 'recall']
lr = LogisticRegression(max_iter=1000, penalty='l1', solver='liblinear')  # liblinear ignores n_jobs
parameters = {
    'C': [0.1, 1]
}
cv = GridSearchCV(lr, parameters, cv=5, refit='recall', scoring=scoring)
cv.fit(X_train,y_train)
print(cv.best_estimator_)
joblib.dump(cv.best_estimator_, 'models/lr_model.pkl')
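# SVC next; kernel SVM training scales superlinearly with sample count, so the 10k-row subsample keeps this feasible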
from sklearn.svm import SVC
scoring = ['precision','f1', 'recall']
svc = SVC()
parameters = {
    'kernel': ['rbf', 'linear'],
    'C': [0.1, 1, 10]
}
cv = GridSearchCV(svc, parameters, cv=5, refit='recall', scoring=scoring)
cv.fit(X_train,y_train)
print(cv.best_estimator_)
joblib.dump(cv.best_estimator_, 'models/svc_model.pkl')
from sklearn.ensemble import GradientBoostingClassifier
scoring = ['precision','f1', 'recall']
gb = GradientBoostingClassifier()
parameters = {
    'n_estimators': [50, 100],
    'max_depth': [3, 7],
    'learning_rate': [0.01, 0.1, 1]
}
cv = GridSearchCV(gb, parameters, cv=5, refit='recall', scoring=scoring)
cv.fit(X_train,y_train)
print(cv.best_estimator_)
joblib.dump(cv.best_estimator_, 'models/gb_model.pkl')
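# Reload the four tuned models for side-by-side comparison on the validation set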
models = {}
for mdl in ['rf', 'lr', 'svc', 'gb']:
    # Load from the same 'models/' directory the grid searches saved to above
    models[mdl] = joblib.load(f'models/{mdl}_model.pkl')
import numpy as np
from time import time
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import log_loss
def evaluate_model(name, model, X_val, y_val):
    # Time only the prediction step, not the metric computations
    start = time()
    pred = model.predict(X_val)
    end = time()
    # log_loss is skipped: SVC exposes predict_proba only when trained with probability=True
    accuracy = model.score(X_val, y_val)
    precision = precision_score(y_val, pred)
    recall = recall_score(y_val, pred)
    f1 = f1_score(y_val, pred)  # binary-averaged, consistent with precision/recall above
    print(f'{name} -- accuracy: {accuracy}')
    print(f'{name} -- precision: {precision}')
    print(f'{name} -- recall: {recall}')
    print(f'{name} -- f1_score: {f1}')
    print(f'{name} -- prediction latency: {round(end - start, 2)}s')
    print('--'*12)
for name, mdl in models.items():
    evaluate_model(name, mdl, X_val, y_val)
evaluate_model('gb', models['gb'], X_test, y_test)
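# Threshold analysis for the gradient boosting model: precision-recall and ROC curves on the held-out test set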
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_recall_curve, roc_curve, auc
precision_point = 0.75  # target precision to read the PR curve at
fpr_point = 0.16        # target false positive rate, marked on the ROC curve below
# Refit the selected gradient boosting settings and take continuous decision scores on the test set
gb_scores = GradientBoostingClassifier(max_depth=7, learning_rate=0.1, n_estimators=100).fit(X_train, y_train).decision_function(X_test)
precision, recall, thresholds = precision_recall_curve(y_test, gb_scores)
# Operating point whose decision threshold is closest to zero (the default 0/1 cutoff)
closest_zero = np.argmin(np.abs(thresholds))
closest_zero_p = precision[closest_zero]
closest_zero_r = recall[closest_zero]
# Recall achievable at roughly the target precision
precision_index = np.argmin(np.abs(precision - precision_point))
print(f'recall at ~{precision_point:.2f} precision: {recall[precision_index]:.3f}')
plt.figure(figsize=(6, 6))
plt.xlim([0.0, 1.01])
plt.ylim([0.0, 1.01])
plt.plot(precision, recall, label='Precision-Recall Curve')
plt.plot(closest_zero_p, closest_zero_r, 'o', markersize=12, fillstyle='none', c='r', mew=3)
plt.xlabel('Precision', fontsize=16)
plt.ylabel('Recall', fontsize=16)
plt.gca().set_aspect('equal')
# Save before show(): show() clears the active figure in most backends
plt.savefig('images/precision_recall_curve.png')
plt.show()
fpr, tpr, _ = roc_curve(y_test, gb_scores)
roc_auc = auc(fpr, tpr)
# Operating point nearest the target false positive rate defined above
fpr_index = np.argmin(np.abs(fpr - fpr_point))
plt.figure(figsize=(6, 6))
plt.xlim([-0.01, 1.00])
plt.ylim([-0.01, 1.01])
plt.plot(fpr, tpr, lw=3, label='Gradient Boosting ROC curve (area = {:0.2f})'.format(roc_auc))
plt.plot(fpr[fpr_index], tpr[fpr_index], 'o', markersize=12, fillstyle='none', c='r', mew=3)
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.title('ROC curve', fontsize=16)
plt.legend(loc='lower right', fontsize=13)
plt.plot([0, 1], [0, 1], color='navy', lw=3, linestyle='--')
plt.gca().set_aspect('equal')
plt.savefig('images/roc_curve.png')
plt.show()