本文最早发表于 CSDN,发表时间为 2023-04-17。
本案例数据链接(数据是本人业余时间模拟数据,需要的自行下载):
数据点我下载
数据来源为工作中接触到的某公司后台数据,在完成工作相关分析后,本人对该部分数据虚拟重建用以复盘整理
学习是为了不落后,
整理则是为了不忘记。
通过本文您将学习到:
- 假设检验 相关知识
- python (numpy 、pandas 、 scipy )
- abtest原则
- 实验过程
- 指标确定原则
背景:
某线上查价平台最近上线了一个新指导价格的产品功能,需要评估该功能对供应商提供的价格是否产生影响,以确定该项目是否有继续推广的价值。
由于每个商品会有 10~15 个供应商报价,最后用户会在这 10~15 个价格中选择一个成交,本次实验的目的是检验该功能上线前后供应商报价的集中程度是否发生变化,以及报价均价是否受到影响。
确认检验指标:
清洗函数:
import numpy as np
import pandas as pd
# Load the quote exports of group A and group B from the working directory.
pd_a, pd_b = (
    pd.read_csv(csv_path)
    for csv_path in (r'a_vx_captain_data.csv', r'b_vx_captain_data.csv')
)
def df_get_ssa_s_xbarbar(df_wat, level_3_name, random_n, *,
                         cities=None, min_price=40, random_state=26):
    """Clean one experiment group's quotes and summarise them for the tests.

    The input frame is copied, its 19 columns are relabelled positionally,
    and the rows are restricted to the selected cities and third-level
    category. Price outliers are removed, a random subset of orders is
    drawn, and summary statistics of the remaining quotes are returned.

    Args:
        df_wat: Raw quote table with exactly 19 columns in the order listed
            in ``column_names`` below. It is not modified.
        level_3_name: Value of the third-level category column to keep.
        random_n: Number of distinct orders to sample after cleaning.
        cities: City names to keep; defaults to the seven cities used by
            the analysis.
        min_price: Lowest quote price that is kept.
        random_state: Seed for the order sampling.

    Returns:
        A tuple ``(mean, ssa_s, ss, d, abtest_df)``:
        mean is the mean of the sampled quotes; ssa_s is the square root of
        the mean squared deviation of each quote's order mean from that
        overall mean; ss is the population standard deviation of the
        sampled quotes; d is the number of sampled quotes; abtest_df is a
        NumPy array of the sampled quote prices.

    Raises:
        ValueError: If fewer than ``random_n`` orders survive the cleaning.
    """
    column_names = [
        'global_order_trace_id', 'stat_date', 'order_no', 'order_goods_cnt',
        'user_sp_type_name', 'is_show', 'min_proposed_price',
        'max_proposed_price', 'first_hint_time', 'order_division_id',
        'settle_amt', 'order_settle_time',
        '省', '市', '区县', '一级类目', '二级类目', '三级类目', '报价价格',
    ]
    if cities is None:
        cities = ['北京市', '深圳市', '苏州市', '重庆市', '杭州市', '上海市', '广州市']

    # Work on a copy so the caller's frame keeps its own column labels.
    quotes = df_wat.copy()
    quotes.columns = column_names

    # Mean quote of each order, attached to every quote row of that order.
    # It is computed over all of the order's quotes, before any filtering.
    order_means = (
        quotes.groupby('order_no')['报价价格']
        .mean()
        .rename('xbar')
        .reset_index()
    )
    quotes = quotes.merge(order_means, on='order_no')

    selected = quotes[
        quotes['市'].isin(cities) & (quotes['三级类目'] == level_3_name)
    ]
    print('清洗前订单数', len(selected['order_no'].unique()))
    print('清洗前报价数', len(selected))

    # Outlier removal: keep prices from the business floor up to three
    # standard deviations above the mean. The bound must include the mean;
    # comparing against 3 * std alone discards almost every quote whenever
    # the spread is small relative to the price level.
    prices = selected['报价价格']
    upper_bound = prices.mean() + 3 * np.std(prices)
    cleaned = selected[(prices >= min_price) & (prices <= upper_bound)]

    # Sample whole orders (not single quotes) so each kept order retains
    # all of its surviving quotes.
    order_ids = cleaned['order_no'].unique()
    if random_n > len(order_ids):
        raise ValueError(
            'random_n=%s exceeds the %s orders left after cleaning'
            % (random_n, len(order_ids))
        )
    sampled_ids = (
        pd.DataFrame({'order_no': order_ids})
        .sample(random_n, random_state=random_state)
        .order_no.tolist()
    )
    sample = cleaned[cleaned['order_no'].isin(sampled_ids)]

    abtest_df = sample['报价价格'].values
    mean = sample['报价价格'].mean()
    d = len(sample)
    # Sum of squared deviations of each row's order mean from the overall
    # mean, then scaled by the number of quote rows.
    ssa = ((sample['xbar'] - mean) ** 2).sum()
    ssa_s = np.sqrt(ssa / d)
    ss = np.std(sample['报价价格'])

    print('清洗后订单数', len(sample['order_no'].unique()))
    print('清洗后报价数', d)
    print('ssa_s:%s,\nxbarbar:%s,\nss:%s'%(ssa_s,mean,ss))
    return mean, ssa_s, ss, d, abtest_df
#26
外部主函数 :
# Clean group A and group B with the same category and the same number of
# sampled orders, printing a separator line after each group's report.
group_results = []
for group_frame in (pd_a, pd_b):
    group_results.append(df_get_ssa_s_xbarbar(group_frame, '笔记本', 2343))
    print('********************************************************')
(
    (mean_1, ssa_s_1, ss_1, d_1, abtest_df_1),
    (mean_2, ssa_s_2, ss_2, d_2, abtest_df_2),
) = group_results
# Two-sided F test at the 5% level on the ratio of the two groups' squared
# ssa_s values, using (number of quotes - 1) as each degree of freedom.
from scipy.stats import f

F = ssa_s_1**2 / ssa_s_2**2
fr = f.ppf(0.975, d_1 - 1, d_2 - 1)  # upper critical value
fl = f.ppf(0.025, d_1 - 1, d_2 - 1)  # lower critical value
# Each f-string stays on one physical line: a replacement field split across
# lines is a SyntaxError before Python 3.12.
print(f'实验数据F值为:{F}')
print(f'alpha=0.975的双尾检验对应的临界值为:{fr}')
print(f'alpha=0.025的双尾检验对应的临界值为:{fl}')
if F > fr:
    print('由于F>fr,落在右侧拒绝域,拒绝H0,接受H1,两组数据有显著性差异,a组数据方差比b组高,实验成功!!')
elif F < fl:
    print('由于F<fl,落在左侧拒绝域,拒绝H0,接受H1,两组数据有显著性差异,a组数据方差比b组低,实验成功!!')
else:
    print('由于fl<=F<=fr接受H0,拒绝H1,两组数据无差异,实验失败!!')
print('********************************************************')
# Two-sample Z test (two-sided, 5% level) on the difference between the
# mean quote of group A and the mean quote of group B.
from scipy import stats

# Standard error of the difference between the two group means.
σ_bak = np.sqrt(np.std(abtest_df_1)**2 / d_1 + np.std(abtest_df_2)**2 / d_2)
Z = (np.mean(abtest_df_1) - np.mean(abtest_df_2)) / σ_bak
# Standard-normal CDF at Z. NOTE: this is a one-sided cumulative
# probability, not a two-sided p-value; the decision below compares Z with
# the critical values instead.
p_z = stats.norm.cdf(Z, 0, 1)
z_test_l = stats.norm.ppf(0.025, 0, 1)  # lower critical value
z_test_r = stats.norm.ppf(0.975, 0, 1)  # upper critical value
# Each f-string stays on one physical line: a replacement field split across
# lines is a SyntaxError before Python 3.12.
print(f'实验数据Z值为:{Z}')
print(f'alpha=0.975的双尾检验对应的临界值为:{z_test_r}')
print(f'alpha=0.025的双尾检验对应的临界值为:{z_test_l}')
print(f':Z_test对应的 Z_p值为:{p_z}' )
if Z > z_test_r:
    print('由于Z>z_test_r,落在右侧拒绝域,拒绝H0,接受H1,两组数据有显著性差异!!A组数据比B组数据均值高')
elif Z < z_test_l:
    print(' 由于Z<z_test_l,落在左侧拒绝域,拒绝H0,接受H1,两组数据有显著性差异,B组数据比A组数据均值高')
else:
    print('由于0.025<=P_z<=0.975,接受H0,拒绝H1,两组数据无差异,实验失败!!')