import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#sns.set_style("ticks")
sns.set_style("white")
#plt.style.use('fivethirtyeight')
plt.style.use('ggplot')
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.rcParams["figure.figsize"] = (10,6)
plt.rcParams['axes.labelsize'] = 18
plt.rcParams['axes.titlesize'] = 22
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50)
df = pd.read_csv('data/ad_table.csv')
df.head()
df.info()
df.describe()
Observations: date is read in as a string and needs to be converted to a datetime type, and total_revenue has a negative minimum that deserves a closer look.
# convert date to datetime type
df['date'] = pd.to_datetime(df['date'])
df[df['total_revenue']<0]
# There are 4 events with negative revenue
# We decide to remove those values
df = df[df['total_revenue']>=0]
# check if the funnel makes sense
len(df.query('shown<clicked | clicked<converted'))
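If this count is 0, every row respects shown >= clicked >= converted. As a small sketch (same dataframe, not part of the original analysis), the check can also be turned into an explicit assertion:
# sanity-check sketch: the funnel should never increase from impressions to clicks to conversions
bad_rows = df.query('shown < clicked or clicked < converted')
assert bad_rows.empty, f"{len(bad_rows)} rows violate shown >= clicked >= converted"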
# check the shown trend over time by ad group
import matplotlib.dates as mdates
sns.set_style("white")
g = sns.lineplot(x='date', y='shown', hue='ad', data=df, legend=False)
g.xaxis.set_major_locator(mdates.MonthLocator())
g.xaxis.set_major_formatter(mdates.DateFormatter('%b %d'))
plt.title('Ad impressions');
Observation: some rows have zero impressions recorded; we remove those before looking at the trends.
# let's remove the zero values from shown data
df = df[df['shown']>0]
g = sns.lineplot(x='date', y='shown', hue='ad', data=df, legend=False)
g.xaxis.set_major_locator(mdates.MonthLocator())
g.xaxis.set_major_formatter(mdates.DateFormatter('%b %d'))
plt.title('Ad impressions');
# check the click trend
g = sns.lineplot(x='date', y='clicked', hue='ad', data=df, legend=False)
g.xaxis.set_major_locator(mdates.MonthLocator())
g.xaxis.set_major_formatter(mdates.DateFormatter('%b %d'))
plt.title('Ad clicked');
Observation: similarly, some rows have zero clicks recorded; we remove those as well.
df = df[df['clicked']>0]
g = sns.lineplot(x='date', y='clicked', hue='ad', data=df, legend=False)
g.xaxis.set_major_locator(mdates.MonthLocator())
g.xaxis.set_major_formatter(mdates.DateFormatter('%b %d'))
plt.title('Ad clicked');
# check the conversion trend
g = sns.lineplot(x='date', y='converted', hue='ad', data=df, legend=False)
g.xaxis.set_major_locator(mdates.MonthLocator())
g.xaxis.set_major_formatter(mdates.DateFormatter('%b %d'))
plt.title('Ad conversions');
# check out the final data
df.describe()
There are obviously tons of ad-related metrics, all plausible and useful in different situations depending on a company's main business focus. A few examples (a quick computation sketch follows the list):
Click-through rate (CTR): # of clicks / # of impressions, i.e. of all people seeing the ad, how many click on it?
Conversion rate (CR): # of conversions / # of impressions, i.e. of all people seeing the ad, how many end up becoming customers?
Profits: revenue from conversions - ad costs (clicks times average cost per click)
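As a quick illustration (a sketch based on the column names in this dataset, not part of the original analysis), these metrics can be computed per row as follows:
# illustrative per-row metrics derived from the funnel columns
df_metrics = df.assign(
    ctr=df['clicked'] / df['shown'],                                        # click-through rate
    conversion_rate=df['converted'] / df['shown'],                          # conversion rate
    profit=df['total_revenue'] - df['clicked'] * df['avg_cost_per_click'])  # revenue minus ad spend
df_metrics[['date', 'ad', 'ctr', 'conversion_rate', 'profit']].head()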
My choice of metrics in this project: CTR and profit, aggregated per ad group.
# check CTR & profits by ad groups
data_rank = df.groupby('ad').apply(
    lambda x: pd.Series({
        'CTR': (x['clicked'] / x['shown']).mean(),
        'profits': (x['total_revenue'] - x['clicked'] * x['avg_cost_per_click']).mean(),
        'count': x['shown'].mean()
    })
).reset_index()
data_rank.head()
data_rank['profits'].describe()
# check the distribution of profits
sns.histplot(data_rank['profits'], kde=True, bins=20);
# barplot: average impressions ('count') by ad group
sns.barplot(x = 'ad', y='count', data = data_rank)
plt.xticks(rotation=45, ha='right');
# plot CTR vs. profits
plt.figure(figsize=(10,8))
plt.scatter(data_rank['profits'], data_rank['CTR'])
plt.axvline(x=-5000)
plt.xlabel('profits')
plt.ylabel('CTR')
plt.show()
# Get the 5 ad groups with the highest CTR among those with profit above -5k
data_rank[data_rank['profits']>-5000].sort_values(by='CTR',ascending = False).head()
# check the shown trend for ad_group_1
data_ad1 = df[df['ad']=='ad_group_1'][["date", "shown"]]
#plot the time series trend
plt.figure(figsize=(10,6))
g=sns.lineplot(x="date", y="shown", data=data_ad1)
g.xaxis.set_major_locator(mdates.MonthLocator())
g.xaxis.set_major_formatter(mdates.DateFormatter('%b %d'))
g.set_title("Ad group 1")
plt.show()
#change column names and index
data_ad1['ds'] = data_ad1['date']
data_ad1.rename({'shown': 'y'}, axis=1, inplace=True)
data_ad1.set_index('date', inplace=True)
data_ad1.head()
# current max day
data_ad1['ds'].max()
# Let's build a time series model using Prophet.
import fbprophet
#build the model
ts = fbprophet.Prophet()
ts.fit(data_ad1)
#make predictions until Dec 15. This is the number of days for which we need a prediction to get to Dec 15
days_predictions = (pd.to_datetime("2015-12-15")-data_ad1['ds'].max()).days
future_data = ts.make_future_dataframe(periods = days_predictions)
predictions = ts.predict(future_data)
#let's plot the prediction
ts.plot(predictions)
plt.show()
#plot the time series component
ts.plot_components(predictions)
plt.show()
Observations:
# fit the model to all groups and make predictions
#data set to store results
predictions_all_groups = pd.DataFrame({'ad_group': df['ad'].unique(),
                                       'date': pd.to_datetime("2015-12-15")})
prediction_dec_15 = []
predictions_all_groups.head()
# loop through all groups
for level in predictions_all_groups.ad_group:
    # subset data, only keep 1 ad group and the columns needed
    tmp = df.loc[df['ad']==level][["date", "shown"]]
    # change column names and index
    tmp['ds'] = tmp['date']
    tmp.rename({'shown': 'y'}, axis=1, inplace=True)
    tmp.set_index('date', inplace=True)
    # build the model
    ts_tmp = fbprophet.Prophet()
    ts_tmp.fit(tmp)
    # make predictions until Dec 15. This is the number of days for which we need a prediction to get to Dec 15
    days_predictions_tmp = (pd.to_datetime("2015-12-15") - tmp['ds'].max()).days
    future_data_tmp = ts_tmp.make_future_dataframe(periods=days_predictions_tmp)
    predictions_tmp = ts_tmp.predict(future_data_tmp)
    # store results for the last day, i.e. the Dec 15 prediction
    prediction_dec_15.append(round(predictions_tmp['yhat'].iat[-1]))
# prediction results
predictions_all_groups['predictions'] = prediction_dec_15
predictions_all_groups
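Taking the last row with .iat[-1] works because make_future_dataframe extends the history by exactly days_predictions_tmp days, so the final row falls on Dec 15. An equivalent, more explicit lookup (a sketch reusing predictions_tmp from the last loop iteration) filters the forecast by date instead:
# explicit lookup of the Dec 15 forecast rather than relying on row position
dec_15_forecast = predictions_tmp.loc[predictions_tmp['ds'] == pd.to_datetime('2015-12-15'), 'yhat']
round(dec_15_forecast.iloc[0])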
# plot the overall avg_cost_per_click
g=sns.lineplot(x="date", y="avg_cost_per_click",hue="ad", data=df, legend="")
g.xaxis.set_major_locator(mdates.MonthLocator())
g.xaxis.set_major_formatter(mdates.DateFormatter('%b %d'))
plt.title("Avg Cost per Click") ;
It is not easy to detect a trend in the plot above, though the overall trends seem to be flat.
Strategy: for each ad group, regress avg_cost_per_click on time (days since the group's first observation) and classify the trend as going up, going down, or flat based on the sign of the slope and whether its p-value clears a Bonferroni-corrected significance threshold.
import statsmodels.api as sm
# store results
coefficient = []
p_value = []
# Loop through all ad groups
for level in df['ad'].unique():
    # subset dataset and keep the variables we care about
    data_reg = df.loc[df['ad']==level][["date", "avg_cost_per_click"]]
    # replace dates with numbers: 0 is the minimum date, 1 is the minimum date + 1 day, and so on
    data_reg['date'] = (data_reg['date'] - data_reg['date'].min()).dt.days
    # add intercept
    data_reg['intercept'] = 1
    # build the regression
    lr = sm.OLS(data_reg['avg_cost_per_click'], data_reg.drop('avg_cost_per_click', axis=1)).fit()
    # save the date coefficient (slope) and its p-value
    coefficient.append(lr.params[0])
    p_value.append(lr.pvalues[0])
# put all together in a dataframe
lr_output = pd.DataFrame({
    'ad_group': df['ad'].unique(),
    'coefficients': coefficient,
    'p_value': p_value})
# classify based on the rule described above.
# Note: we use the Bonferroni rule for significance
lr_output['cluster'] = np.where((lr_output['coefficients'] > 0) & (lr_output['p_value'] < 0.05/40), 'going_up',
                       np.where((lr_output['coefficients'] < 0) & (lr_output['p_value'] < 0.05/40), 'going_down',
                                'flat'))
lr_output
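The 0.05/40 threshold presumably reflects one test per ad group. A variant that avoids hard-coding the number of tests (a sketch, assuming the same 5% family-wise alpha) derives it from the data:
# derive the Bonferroni-corrected threshold from the number of ad groups instead of hard-coding 40
alpha_bonferroni = 0.05 / df['ad'].nunique()
lr_output['cluster'] = np.where((lr_output['coefficients'] > 0) & (lr_output['p_value'] < alpha_bonferroni), 'going_up',
                       np.where((lr_output['coefficients'] < 0) & (lr_output['p_value'] < alpha_bonferroni), 'going_down',
                                'flat'))
lr_output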
Insights: