import pandas as pd
import numpy as np
import plotly.express as px
import mytools
#| echo: false
import plotly.io as pio
# 设置默认渲染器为 SVG
pio.renderers.default = "svg"
网站首屏设计AB测试
研究方法
  df1 = pd.read_excel(R"data\ABtest_actions_full.xlsx")
解释性研究是关于现象或事物之间因果关系的研究。它在描述性研究的基础上,进一步探寻“为什么”:对变量之间的关系进行分析,以确定它们之间是否存在相关,并进而判断它们之间是否存在因果关系。
在定量研究方法中,解释性研究通常是首先提出研究假设,然后从理论假设出发,设计出调查方案(收集资料的方案)并采用各种调查方法去收集经验材料,最后通过对资料的分析来验证假设,达到对社会现象进行理论解释的目的。
明确数据分析目标
本案例属于解释性研究,也可以认为是评价性研究。
某在线教育机构,需要评价网站首页改版是否有效,进行了在线AB测试,在一段时间内,将访问用户随机分成2部分,分别访问原首页和改版后首页,搜集用户的浏览、点击和报名行为。
研究假设(H1):改版后的报名比例高于改版前的报名比例。 原假设(H0):改版后的报名比例小于或等于改版前的报名比例。
数据获取
本研究采用在线AB实验法,在XXX与XX期间,对访问网站首页的用户进行随机分组分流,50%的用户访问原首页,50%的用户访问新首页,共获得样本数据XXX个。
数据清理
查看所有空白值
# Keep only the rows of df1 that contain at least one missing value.
temp = df1[df1.isnull().any(axis=1)]
temp| timestamp | id | group | action | duration | |
|---|---|---|---|---|---|
| 12 | 2016-09-24 22:43:35.120 | 701620 | control | view | NaN | 
| 227 | 2016-09-30 11:57:44.059 | 802405 | control | view | NaN | 
| 236 | 2016-09-30 17:17:07.709 | 524941 | control | view | NaN | 
| 261 | 2016-10-01 08:40:31.380 | 381758 | control | view | NaN | 
| 367 | 2016-10-03 20:08:57.011 | 757165 | control | enroll | NaN | 
| 605 | 2016-10-10 11:26:20.715 | 678491 | control | view | NaN | 
| 731 | 2016-10-14 04:19:46.021 | 878418 | control | view | NaN | 
| 1145 | 2016-10-27 04:48:21.083 | 767153 | control | view | NaN | 
| 1212 | 2016-10-29 09:37:41.649 | 196115 | control | enroll | NaN | 
| 1238 | 2016-10-29 23:41:44.643 | 542999 | control | view | NaN | 
| 1311 | 2016-11-01 05:42:50.439 | 599303 | control | view | NaN | 
| 1322 | 2016-11-01 10:07:53.009 | 281907 | control | view | NaN | 
| 1663 | 2016-11-11 13:56:17.708 | 742859 | control | view | NaN | 
| 1768 | 2016-11-14 12:04:15.515 | 860051 | control | view | NaN | 
| 1966 | 2016-11-20 09:06:10.553 | 864971 | control | view | NaN | 
| 2457 | 2016-12-04 22:55:30.055 | 443391 | control | enroll | NaN | 
| 2576 | 2016-12-08 11:34:10.808 | 928935 | control | view | NaN | 
| 2700 | 2016-12-11 18:23:52.131 | 417569 | control | view | NaN | 
| 2766 | 2016-12-13 10:17:36.711 | 496865 | control | view | NaN | 
| 2834 | 2016-12-14 17:24:26.749 | 429709 | control | enroll | NaN | 
| 3660 | 2017-01-07 08:14:30.071 | 631083 | control | enroll | NaN | 
| 3754 | 2017-01-10 01:45:10.342 | 403964 | control | view | NaN | 
| 3758 | 2017-01-10 03:04:52.819 | 883792 | control | view | NaN | 
| 3798 | 2017-01-11 05:16:49.001 | 910942 | control | view | NaN | 
| 3965 | 2017-01-15 20:17:33.700 | 508697 | control | view | NaN | 
## 删除空值
df2 = df1.dropna()## 查看重复值
df2[df2.duplicated(subset=['id','group','action'],keep='first')][['id','group','action']]| id | group | action | |
|---|---|---|---|
| 5 | 261869 | experiment | view | 
| 6 | 226546 | experiment | view | 
| 7 | 286353 | experiment | view | 
| 8 | 842279 | experiment | view | 
| 142 | 711838 | experiment | view | 
| 971 | 724590 | experiment | view | 
| 1013 | 314669 | experiment | view | 
| 1351 | 381744 | experiment | view | 
| 1765 | 831767 | experiment | view | 
| 1902 | 655009 | experiment | view | 
| 2293 | 645047 | experiment | view | 
| 2704 | 510055 | experiment | view | 
| 2919 | 661526 | experiment | view | 
| 2952 | 885859 | experiment | view | 
| 3559 | 661528 | experiment | view | 
| 3653 | 191559 | experiment | view | 
# 删除重复值
df3 = df2.drop_duplicates(subset=['id'],keep='last')
查看变量类型
df3.dtypes.to_frame()| 0 | |
|---|---|
| timestamp | datetime64[ns] | 
| id | int64 | 
| group | object | 
| action | object | 
| duration | float64 | 
# Set explicit dtypes: the two label columns become categorical.
df4 = df3.astype(dict.fromkeys(['group', 'action'], 'category'))
df4.dtypes.to_frame()| 0 | |
|---|---|
| timestamp | datetime64[ns] | 
| id | int64 | 
| group | category | 
| action | category | 
| duration | float64 | 
## 异常值查找
df4['duration'].describe()count    4028.000000
mean      123.402057
std        72.587800
min         0.013856
25%        67.080495
50%       118.487843
75%       172.580542
max       421.567520
Name: duration, dtype: float64
# Box plot of `duration`, used here to look for outliers.
fig = px.box(df4, y="duration")
fig.show()Unable to display output for mime type(s): application/vnd.plotly.v1+json
# Histogram of `duration` to inspect the shape of its distribution.
fig = px.histogram(df4, x="duration")
fig.show()Unable to display output for mime type(s): application/vnd.plotly.v1+json
# 数据清理完毕
df = df4.copy()
数据分析
描述统计
先描述样本背景,对样本质量进行评价。再描述样本特征信息、样本基本现状,最后描述样本基本态度及其他维度。
描述统计分析也应该有理论依据或概念合理的分类。
推论统计
可进行一些相关性、差异性分析以及回归分析。合理分析变量之间的相关性。
## Sample size: number of rows in the cleaned data set
N = df.shape[0]
print(N)4028
mytools.gen_percent_table(df,'group')| group | 个数 | 百分比 | |
|---|---|---|---|
| 0 | experiment | 2079 | 51.61 | 
| 1 | control | 1949 | 48.39 | 
| 2 | 总和 | 4028 | 100.00 | 
# Build the table used for plotting: one row per (group, action) pair with
# its row count stored in the '频数' column.
# `group` and `action` were cast to categorical during cleaning, so
# `observed` is passed explicitly: its default differs between pandas
# versions, and relying on it raises a FutureWarning on pandas 2.x.
# observed=False keeps every category combination, including empty ones.
sun_df = (
    df.groupby(["group", "action"], observed=False)
    .size()
    .reset_index(name='频数')
)
sun_df| group | action | 频数 | |
|---|---|---|---|
| 0 | control | enroll | 370 | 
| 1 | control | view | 1579 | 
| 2 | experiment | enroll | 438 | 
| 3 | experiment | view | 1641 | 
# Index the counts by (group, action), then express each count as a
# percentage of its group's total.
temp = sun_df.set_index(['group', 'action'])
group_totals = temp.groupby('group').sum()
temp['%'] = 100 * (temp / group_totals)
temp.round(2)| 频数 | % | ||
|---|---|---|---|
| group | action | ||
| control | enroll | 370 | 18.98 | 
| view | 1579 | 81.02 | |
| experiment | enroll | 438 | 21.07 | 
| view | 1641 | 78.93 | 
sun_df = temp.reset_index()
sun_df| group | action | 频数 | % | |
|---|---|---|---|---|
| 0 | control | enroll | 370 | 18.984094 | 
| 1 | control | view | 1579 | 81.015906 | 
| 2 | experiment | enroll | 438 | 21.067821 | 
| 3 | experiment | view | 1641 | 78.932179 | 
# Bar chart of the within-group percentage: groups on the x axis (also used
# for colour), one facet column per action.
fig = px.bar(sun_df, x="group", y="%", color="group", facet_col="action")
fig.show()Unable to display output for mime type(s): application/vnd.plotly.v1+json
# Pie chart of the cleaned records split by `action`.
fig = px.pie(df,names="action")
fig.show()Unable to display output for mime type(s): application/vnd.plotly.v1+json
# Sunburst over the hierarchy group -> action, with wedges sized by the
# within-group percentage column.
fig = px.sunburst(sun_df, path=['group', 'action'], values='%')
fig.show()Unable to display output for mime type(s): application/vnd.plotly.v1+json
### Bivariate analysis
# Cross-tabulate action (rows) by group (columns), normalised within each
# column and with a '合计' margin, then scale the fractions to percent.
share = pd.crosstab(
    index=df['action'],
    columns=df['group'],
    normalize='columns',
    margins=True,
    margins_name='合计',
)
result = share * 100
result.round(2)| group | control | experiment | 合计 | 
|---|---|---|---|
| action | |||
| enroll | 18.98 | 21.07 | 20.06 | 
| view | 81.02 | 78.93 | 79.94 | 
可通过Z检验,对两个比例值是否存在统计显著性差异进行检验。
# Group sizes: n1 for the control group, n2 for the experiment group.
n1, n2 = (
    df.query(expr).shape[0]
    for expr in ('group =="control"', 'group =="experiment"')
)
# Number of enrolments in each group, turned into enrolment proportions.
enrolled_control = df.query('group =="control" and action =="enroll"').shape[0]
enrolled_experiment = df.query('group =="experiment" and action =="enroll"').shape[0]
p1, p2 = enrolled_control / n1, enrolled_experiment / n2
# NOTE(review): two_prop_equal_test lives in the project-local `mytools`
# module; its argument order and the tail of the returned p-value are
# assumed from this call site — confirm against its definition.
z, p_value = mytools.two_prop_equal_test(n1, n2, p1, p2)
print(z, p_value)-1.6503840862221497 0.049432201929688536
通过对AB测试的结果进行分析,发现实验组的报名比例(21.07%)高于控制组(18.98%)。按研究假设进行单侧Z检验,z=-1.65,p=0.049,在0.05的显著性水平下具有统计显著性,因此拒绝原假设,接受研究假设,即:改版后的首页较原版有利于提高学员报名率。