import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Show up to 50 columns and 50 rows when pandas renders a frame
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50)
# Plot styling: seaborn "ticks" axes style, then matplotlib's fivethirtyeight style sheet
sns.set_style("ticks")
plt.style.use('fivethirtyeight')
import warnings
# Silence every warning for the rest of the session
# NOTE(review): this also hides deprecation warnings raised by the libraries used below
warnings.filterwarnings("ignore")
# IPython magics: draw figures inline in the notebook, at retina resolution
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# Default figure size and axis-label / title font sizes for every later plot
plt.rcParams["figure.figsize"] = (8,6)
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['axes.titlesize'] = 18
# Load the session-level conversion data from a CSV shared on Google Drive
DATA_URL = "https://drive.google.com/uc?export=download&id=1LU5be_H1TD2Pp1OmI202to3YyKo9AzFY"
df = pd.read_csv(DATA_URL)
# First rows and (rows, columns) of the loaded frame
df.head()
df.shape
Feature
country : user country based on the IP address
age : user age. Self-reported at sign-up step
new_user : whether the user created the account during this session or already had an account and simply came back to the site
source : marketing channel source
total_pages_visited: number of total pages visited during the session. This can be seen as a proxy for time spent on site and engagement
converted: this is our label. 1 means they converted within the session, 0 means they left without buying anything.
The company goal is to increase conversion rate: # conversions / total sessions
Goal: identify erroneous data and deal with it. This is a crucial first step.
# Summary statistics of the numeric columns
df.describe()
# Frequency of each level of the two categorical columns
df['country'].value_counts()
df['source'].value_counts()
Quick observations:
Anomaly data:
# Inspect the reported ages, largest first, and the rows claiming an age above 100
sorted(df['age'].unique(), reverse=True)
df.loc[df['age'] > 100]
# Keep only the rows whose age is below 110
df = df.loc[df['age'] < 110]
# Mean of `converted` (the conversion rate) per country
df.groupby('country')[['converted']].mean()
# conversion rate by country
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#sns.set_style("ticks")
# Set the canvas size: one wide figure holding two side-by-side panels
fig = plt.figure(figsize=(12, 6))

# Left panel: session counts per country, split by outcome, on a log y-axis
ax1 = fig.add_subplot(1, 2, 1)
sns.countplot(data=df, x='country', hue='converted', ax=ax1);
ax1.set_title("Distribution of Conversion by Country");
ax1.set_ylabel('log count');
ax1.set_yscale('log');

# Right panel: mean of `converted` per country
ax2 = fig.add_subplot(1, 2, 2)
sns.barplot(data=df, x='country', y='converted', ax=ax2);
ax2.set_title("Mean Converstion by Country");
Observation:
# Age: distribution per outcome (left panel) and mean conversion by age (right panel)
fig = plt.figure(figsize = (12, 6))

ax1 = fig.add_subplot(121)
# sns.distplot is deprecated; histplot(..., kde=True, stat="density") with
# kde_kws=dict(cut=3) is the documented replacement for the density histogram
# plus KDE curve that distplot drew. Axes are passed explicitly instead of
# relying on matplotlib's "current axes".
sns.histplot(x=df.loc[df['converted'] == 0, 'age'], kde=True, stat="density",
             kde_kws=dict(cut=3), label='converted 0', ax=ax1);
sns.histplot(x=df.loc[df['converted'] == 1, 'age'], kde=True, stat="density",
             kde_kws=dict(cut=3), label='converted 1', ax=ax1);
ax1.set(title="Distribution of Conversion by User Age", xlim=(10, 80));
ax1.legend()

ax2 = fig.add_subplot(122)
# One row per distinct age holding the mean of `converted` at that age
grouped_data = df[['age', 'converted']].groupby('age').mean().reset_index()
# `markers=True` only takes effect together with a `style` variable, so it drew
# no markers here; `marker='o'` is forwarded to matplotlib and marks each age.
sns.lineplot(x='age', y='converted', data=grouped_data, marker='o', ax=ax2)
ax2.set(title="Mean Converstion by User Age");
Observation:
# User type (`new_user` flag): counts per outcome (left) and mean conversion (right)
fig = plt.figure(figsize=(12, 6))

# Left panel: session counts per user type, split by outcome, on a log y-axis
ax1 = fig.add_subplot(1, 2, 1)
sns.countplot(data=df, x='new_user', hue='converted', ax=ax1);
ax1.set_title("Distribution of Conversion by user type");
ax1.set_ylabel('log count');
ax1.set_yscale('log');

# Right panel: mean of `converted` per user type
ax2 = fig.add_subplot(1, 2, 2)
sns.barplot(data=df, x='new_user', y='converted', ax=ax2);
ax2.set_title("Mean Converstion by user type");
Observation:
# Marketing source: counts per outcome (left) and mean conversion (right)
fig = plt.figure(figsize=(12, 6))

# Left panel: session counts per source, split by outcome, on a log y-axis
ax1 = fig.add_subplot(1, 2, 1)
sns.countplot(data=df, x='source', hue='converted', ax=ax1);
ax1.set_title("Distribution of Conversion by Source");
ax1.set_ylabel('log count');
ax1.set_yscale('log');

# Right panel: mean of `converted` per source
ax2 = fig.add_subplot(1, 2, 2)
sns.barplot(data=df, x='source', y='converted', ax=ax2);
ax2.set_title("Mean Converstion by Source");

# Session counts indexed by (converted, source), then pivoted so that each
# outcome becomes a column and each source a row
conversion_by_source = df.groupby(['converted', 'source']).size()
conversion_by_source = conversion_by_source.unstack(level=0)
conversion_by_source
# Stacked bars: one bar per source, one segment per outcome
conversion_by_source.plot.bar(stacked=True);
Observation:
# Pages visited: counts per outcome (left) and mean conversion (right)
fig = plt.figure(figsize=(12, 6))

# Left panel: session counts per page count, split by outcome, on a log y-axis
ax1 = fig.add_subplot(1, 2, 1)
sns.countplot(data=df, x='total_pages_visited', hue='converted', ax=ax1);
ax1.set_title("Distribution of Conversion by Total_Pages_Visited");
ax1.set_ylabel('log count');
ax1.set_yscale('log');

# Right panel: mean of `converted` at each page count
ax2 = fig.add_subplot(1, 2, 2)
sns.lineplot(data=df, x="total_pages_visited", y="converted", estimator="mean", ax=ax2);
ax2.set_title("Mean Converstion by Total_Pages_Visited");
Observation:
df.head()
# Column dtypes
df.dtypes
# Number of missing values in each column
df.isna().sum()
# One-hot encode the categorical columns; drop_first=True omits the first level
# of each so that level acts as the implicit baseline
df_cleaned = pd.get_dummies(df, drop_first=True)
df_cleaned.head()
# Separate the label (`converted`) from the predictors
target = df_cleaned['converted']
features = df_cleaned.drop(columns='converted')
# Class balance of the target
target.value_counts()
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=1
)
# Baseline random forest fitted on every feature
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(
    n_jobs=-1,
    oob_score=True,
    random_state=0,
)
# fit() returns the fitted estimator, so `model` and `rf` refer to the same object
model = rf.fit(x_train, y_train)
# Predicted probability of class 1 (converted) for each row of both splits
train_preds = rf.predict_proba(x_train)[:, 1]
test_preds = rf.predict_proba(x_test)[:, 1]
# AUC is the main evaluation metric: area under the ROC curve of each split
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_train, train_preds)
auc_train = metrics.auc(fpr, tpr)
print("Training Set AUC:",auc_train)
fpr, tpr, thresholds = metrics.roc_curve(y_test, test_preds)
auc_test = metrics.auc(fpr, tpr)
print("Test Set AUC:",auc_test)
# create ROC curve for the random forest model
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score
# ROC coordinates for the training and the test predictions
false_positive_rate, true_positive_rate, threshold = roc_curve(y_train, train_preds)
false_positive_rate_test, true_positive_rate_test, threshold_test = roc_curve(y_test, test_preds)
# plot ROC curve
plt.title("ROC Curve for Random Forest Model")
plt.plot(
    false_positive_rate,
    true_positive_rate,
    label='Train ROC Curve (area = %0.3f)' % roc_auc_score(y_train, train_preds),
)
plt.plot(
    false_positive_rate_test,
    true_positive_rate_test,
    label='Test ROC Curve (area = %0.3f)' % roc_auc_score(y_test, test_preds),
)
# Dashed diagonal from (0, 0) to (1, 1)
plt.plot([0, 1], ls="--")
# Grey guides: vertical segment at x = 0, then horizontal segment at y = 1
plt.plot([0, 0], [1, 0], c=".7")
plt.plot([1, 1], c=".7")
plt.ylabel("True Positive Rate")
plt.xlabel("False Positive Rate")
plt.legend()
plt.show()
# Importance of each training column according to the baseline forest
feat_importances = pd.Series(rf.feature_importances_, index=x_train.columns)
plt.figure(figsize=(12, 6))
# Horizontal bars, sorted ascending by importance
feat_importances.sort_values().plot.barh()
plt.title("Feature Importance")
plt.show()
Observation:
Retrain the model:
# Remove the page-count feature from both splits before refitting
x_train_new = x_train.drop('total_pages_visited', axis = 1)
x_test_new = x_test.drop('total_pages_visited', axis = 1)
# train a random forest model
from sklearn.ensemble import RandomForestClassifier
# Kept so NumPy's global RNG is still seeded for any later code that draws from it
np.random.seed(4684)
# random_state pins the forest's randomness on the estimator itself (as the
# first model `rf` already does), so the fit is repeatable even when this cell
# is re-run without re-seeding the global RNG first.
# class_weight gives the positive class (1) ten times the weight of class 0.
rf2 = RandomForestClassifier(max_features=3,
                             n_estimators=100,
                             class_weight={0:1, 1:10},
                             max_depth=20,
                             oob_score=True,
                             n_jobs=-1,
                             random_state=4684)
# Train model; fit() returns the estimator, so `model2` is `rf2`
model2 = rf2.fit(x_train_new, y_train)
# Predicted probability of class 1 (converted) for both splits
train_preds2 = model2.predict_proba(x_train_new)[:, 1]
test_preds2 = model2.predict_proba(x_test_new)[:, 1]
# AUC on the training and the test split
fpr, tpr, thresholds = metrics.roc_curve(y_train, train_preds2)
auc_train = metrics.auc(fpr, tpr)
print("Training Set AUC:",auc_train)
fpr, tpr, thresholds = metrics.roc_curve(y_test, test_preds2)
auc_test = metrics.auc(fpr, tpr)
print("Test Set AUC:",auc_test)
Observation:
# ROC coordinates of the retrained forest on the training and the test split
false_positive_rate, true_positive_rate, threshold = roc_curve(y_train, train_preds2)
false_positive_rate_test, true_positive_rate_test, threshold_test = roc_curve(y_test, test_preds2)
# plot ROC curve
plt.title("ROC Curve for Random Forest Model")
plt.plot(
    false_positive_rate,
    true_positive_rate,
    label='Train ROC Curve (area = %0.3f)' % roc_auc_score(y_train, train_preds2),
)
plt.plot(
    false_positive_rate_test,
    true_positive_rate_test,
    label='Test ROC Curve (area = %0.3f)' % roc_auc_score(y_test, test_preds2),
)
# Dashed diagonal from (0, 0) to (1, 1)
plt.plot([0, 1], ls="--")
# Grey guides: vertical segment at x = 0, then horizontal segment at y = 1
plt.plot([0, 0], [1, 0], c=".7")
plt.plot([1, 1], c=".7")
plt.ylabel("True Positive Rate")
plt.xlabel("False Positive Rate")
plt.legend()
plt.show()
## Accuracy
# Hard 0/1 labels from the retrained model's probabilities at a 0.5 cut-off
test_preds_outcome = np.where(test_preds2 > 0.5, 1, 0)
from sklearn.metrics import accuracy_score
# accuracy_score's signature is (y_true, y_pred); the arguments were swapped.
# Accuracy is symmetric, so the value is unchanged, but the call now matches the API.
accuracy = accuracy_score(y_test, test_preds_outcome)
accuracy
# confusion matrix
from sklearn.metrics import confusion_matrix
# pandas' crosstab gives a labelled, more readable confusion matrix;
# margins=True appends the row and column totals
cm = pd.crosstab(y_test, test_preds_outcome,
                 rownames=['Actual'], colnames=['Predicted'], margins=True)
cm
## Choose a threshold
# Pick the ROC point that maximises (true positive rate - false positive rate)
# NOTE(review): the threshold is tuned on the test split, so metrics computed
# with it on that same split will be optimistic; consider a validation split
fpr, tpr, thresholds = metrics.roc_curve(y_test, test_preds2)
optimal_idx = (tpr - fpr).argmax()
optimal_threshold = thresholds[optimal_idx]
optimal_threshold
# Importance of each remaining column according to the retrained forest
feat_importances = pd.Series(rf2.feature_importances_, index=x_train_new.columns)
# Horizontal bars, sorted ascending by importance
feat_importances.sort_values().plot.barh()
plt.show()
# partial dependence plots
from pdpbox import pdp, info_plots


def _partial_dependence(feature, index_attr="display_columns"):
    """Compute the partial dependence of `rf2` on `feature` over `x_train_new`.

    `feature` is a single column name, or a list of one-hot column names that
    together encode one categorical variable. `index_attr` names the attribute
    of the pdpbox result used to label the values: "display_columns" for
    categorical/binary features, "feature_grids" for numeric ones.

    Returns the raw pdpbox result and a pandas Series of its `pdp` values.
    """
    isolated = pdp.pdp_isolate(model=rf2,
                               dataset=x_train_new,
                               model_features=list(x_train_new),
                               feature=feature,
                               num_grid_points=50)
    return isolated, pd.Series(isolated.pdp, index=getattr(isolated, index_attr))


# country (one-hot columns; the level removed by drop_first=True is not listed)
pdp_iso, pdp_dataset = _partial_dependence(['country_Germany', 'country_UK', 'country_US'])
pdp_dataset.sort_values(ascending=False).plot(kind='bar', title='Country')
plt.show()

# source (one-hot columns; the level removed by drop_first=True is not listed)
pdp_iso, pdp_dataset = _partial_dependence(['source_Direct', 'source_Seo'])
pdp_dataset.sort_values(ascending=False).plot(kind='bar', title='source')
plt.show()

# new_user (this cell was previously mislabelled "#source")
pdp_iso, pdp_dataset = _partial_dependence('new_user')
pdp_dataset.sort_values(ascending=False).plot(kind='bar', title='New User')
plt.show()

# age (numeric: values are indexed by the grid points and drawn as a line)
pdp_iso, pdp_dataset = _partial_dependence('age', index_attr="feature_grids")
pdp_dataset.plot(title='Age')
plt.show()
Insights:
# tree segmentation
# Let’s now build a simple decision tree and check the 2 or 3 most important segments:
import graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
import pydotplus
from IPython.display import Image
from IPython.display import display
# Shallow tree (depth 2) with the same 1:10 class weighting as the retrained forest;
# a split is only made if it lowers impurity by at least 0.001
tree = DecisionTreeClassifier(max_depth=2, class_weight={0:1, 1:10}, min_impurity_decrease=0.001)
tree.fit(x_train_new, y_train)
# Visualize the tree: export to DOT text, then render to PNG through pydotplus.
# feature_names is passed as a plain list of strings: export_graphviz documents a
# list here, and some scikit-learn releases reject a pandas Index in this argument.
dot_data = export_graphviz(tree,
                           out_file=None,
                           feature_names=list(x_train_new.columns),
                           proportion=True,
                           rotate=True,
                           filled=True)
graph = pydotplus.graph_from_dot_data(dot_data)
display(Image(graph.create_png()))
A simple small tree confirms exactly the random forest findings.
Which features matter and how to improve:
The site is working very well for young users. Definitely let’s tell marketing to advertise and use channels which are more likely to reach young people.
The site is working very well for Germany in terms of conversion. But the summary showed that there are few Germans coming to the site: way less than UK, despite a larger population. Again, marketing should get more Germans. Big opportunity.
Users with old accounts do much better. Targeted emails with offers to bring them back to the site could be a good idea to try.
Maybe go through the UI and figure out why older users perform so poorly? From ~30 y/o conversion clearly starts dropping. A good actionable metric here is conversion rate for people >=30 yr old. Building a team whose goal is to increase that number would be interesting.
Something is wrong with the Chinese version of the site. It is either poorly translated, doesn’t fit the local culture, or maybe some payment issue. Given how many users are based in China, fixing this should be a top priority. Huge opportunity.