干货:
import numpy as np
np.set_printoptions(suppress=True) # print NumPy values without scientific notation
import pandas as pd
pd.set_option('display.float_format', lambda x : '%.2f' % x) # display floats with two decimals instead of scientific notation
# --- Construction ---
df = pd.DataFrame() # an empty DataFrame
df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) # from a nested list (3x3), default integer labels
# From a nested list with explicit row labels (index) and column labels (columns)
df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=["r1", "r2", "r3"], columns=["A", "B", "C"])
#     A  B  C
# r1  1  2  3
# r2  4  5  6
# r3  7  8  9
# The left-most column is the index; the top row holds the column labels.
# --- Selection ---
# A column can be fetched with dict-style access.
print(df["A"]) # the selected column is a Series and is printed vertically
# Fetching a row needs the .loc accessor.
print(df.loc["r1"]) # the selected row is also a Series and is also printed vertically
# In other words, the column labels of row r1 become the index of the new Series.
# A single element can be read as "take one Series out of df, then look up one label in it".
print(df["A"].loc["r1"])
print(df.loc["r1"].loc["A"])
# Or address the cell directly (note the order: row label first, then column label).
print(df.loc["r1"]["A"])
print(df.loc["r1", "A"])
# --- Adding columns ---
# From a plain list, matched to the rows by position.
df["D"] = [1, 1, 1] # the list must supply one value per row
# From a Series built from a label->value dict; values are aligned on the index labels.
df["E"] = pd.Series({"r2":2, "r1":1, "r3":3}) # labels may be missing; missing ones become NaN
# --- Filtering ---
# By index or column labels:
# keep only the listed rows or columns (naming index= or columns= explicitly).
print(df.reindex(index=["r2", "r1"]))
print(df.reindex(columns=["A", "C"]))
# The keyword can be omitted; axis selects between rows (default) and columns (axis=1).
print(df.reindex(["r1", "r2"]))
print(df.reindex(["A", "C"], axis=1))
# Exclude the listed rows or columns.
print(df.drop(["r1"]))
print(df.drop(["A"], axis=1))
# --- Sorting ---
# NOTE: the four calls below return a new DataFrame; the result is not kept here.
# Sort by values; ascending toggles the direction and defaults to ascending.
df.sort_values("A", ascending=False) # order the rows by column "A"; the other columns move with their rows
df.sort_values("r1", ascending=False, axis=1) # axis=1: order the columns by the values in row "r1"
# Sort by labels.
df.sort_index(ascending=False) # order the rows by index label
df.sort_index(ascending=False, axis=1) # axis=1: order the columns by column label
# --- Aggregation ---
# Called on a DataFrame these aggregate each column; pass axis=1 to aggregate each row instead.
df.sum()
df.min()
df.max()
df.mean()
df.count()
# General-purpose fallback: apply (author's warning: it is very slow).
# apply visits every column, calls the function with that column as x,
# and collects the return values into a Series keyed by the column labels.
# axis=1 switches it to row-wise operation.
a = df.apply(lambda x: x.min())
# --- Arithmetic ---
# The ordinary + - * / % operators work directly on columns.
df["A"] = df["A"] + df["B"]
df["A"] = df["A"].sub(df["B"])
# Operands are aligned on their index labels before the operation is applied.
# Multiplication and division are element-wise too (not matrix multiplication).
# A label that exists on only one side is treated as NaN on the other,
# and any arithmetic involving NaN yields NaN.
print(df.T) # transpose; row and column labels are swapped along with the data
# --- Conversion ---
print(df.values) # DataFrame -> 2-D array
print(df["A"].to_list()) # Series -> list
# To dictionaries.
print(df.to_dict())
print(df["A"].to_dict())
# To CSV.
print(df["A"].to_csv()) # author's note: pandas emitted a warning discouraging Series.to_csv here (presumably version-specific; verify)
df.to_csv("./tmp.csv", # output path
index=False, header=False, # whether to write the row labels and the column names
float_format="%.2f", date_format="%Y-%m-%d %H:%M:%S", # output formats for floats and datetimes
sep=",", na_rep="None") # field separator, and the text written in place of NaN
# df = pd.read_csv() # read data back from CSV; pandas has a family of other readers, left for later
# Reference link kept by the author: https://www.cnblogs.com/datablog/p/6127000.html
之前用到的原始代码(已经被榨出干货了,没啥东西了):
# 太丑陋了还是不要看了。。。
#
# -*- coding:utf-8 -*-
import sys
import numpy as np
np.set_printoptions(suppress=True)
import matplotlib.pyplot as plt
import pandas as pd
#pd.set_option('display.max_columns', 10000, 'display.max_rows', 10000)
pd.set_option('display.float_format',lambda x : '%.2f' % x)
import math
import ujson
import time
# Names of the price sources. They are used both as the keys looked up in
# each JSON record and as the DataFrame column labels; "ctrip" is the
# baseline every other source is compared against.
api_src_vec = [
    "ctrip",
    "meiyaApi",
    "travelzen",
    "pkFareApi",
    "igola",
    "51bookApi",
    "tuniuApi",
    "flightroutesApi",
    "qunarApi",
]
# Bucket bounds are enumerated as integers in units of 1/acc, so that a
# fractional bucket width such as 0.1 can be stepped with range().
acc=1000
# Width of one bucket, expressed as a ratio.
step=0.1


def get_group_name(x, index_vec=None):
    """Return the bucket label for ratio ``x``, or list every bucket label.

    Buckets cover [-1, 1) in slices of width ``step``. The lowest bucket is
    open towards minus infinity and is labelled ``"(...,upper)"``; the highest
    is open towards plus infinity and is labelled ``"[lower,...)"``; every
    other bucket is labelled ``"[lower,upper)"``. Bounds are printed with
    three decimals.

    Args:
        x: The value to classify. Ignored when ``index_vec`` is given.
        index_vec: Optional list. When it is not None, no classification is
            done; instead every bucket label is appended to it in ascending
            order.

    Returns:
        The label of the bucket containing ``x``, or ``""`` when
        ``index_vec`` was supplied.

    Notes:
        * Classification and label listing share one label function, so a
          value at or above the lower bound of the highest bucket always
          receives the open-ended label that the listing produces.
        * Bounds are derived from integer steps (``k / acc``) rather than by
          repeatedly adding ``step`` in floating point, so neighbouring
          buckets share exactly the same boundary value.
        * NaN compares false against every bound and therefore falls through
          to the highest bucket; callers should filter NaN out beforehand.
    """
    width = int(step * acc)
    lower_bounds = range(-1 * acc, 1 * acc, width)
    lowest = lower_bounds[0]
    highest = lower_bounds[-1]

    def _label(k):
        # k is a bucket's lower bound in units of 1/acc.
        if k == lowest:
            return "(...,%.3f)" % ((k + width) / acc)
        if k == highest:
            return "[%.3f,...)" % (k / acc)
        return "[%.3f,%.3f)" % (k / acc, (k + width) / acc)

    if index_vec is not None:
        index_vec.extend(_label(k) for k in lower_bounds)
        return ""

    # Buckets are contiguous and ascending, so the first bucket whose upper
    # bound exceeds x is the one that contains it.
    for k in lower_bounds[:-1]:
        if x < (k + width) / acc:
            return _label(k)
    return _label(highest)
# Number of sub-intervals the [min(a, b), max(a, b)) span is divided into.
num=1000


def get_group_name_limit(x,a,b):
    """Return the label of the sub-interval between ``a`` and ``b`` holding ``x``.

    The span between ``min(a, b)`` and ``max(a, b)`` (rounded to three
    decimals) is walked in slices of ``span / num``. Lower bounds are
    enumerated as integers in units of 1e-6 and converted back to floats.

    Args:
        x: The value to locate.
        a: One end of the span.
        b: The other end of the span; the order of ``a`` and ``b`` is
            irrelevant.

    Returns:
        ``"[lower,upper)"`` with three decimals for the first slice that
        satisfies ``lower <= x < upper``, or ``None`` when no slice matches.
        ``None`` is also returned when the rounded span is too small to give
        a non-zero slice stride (for example ``a == b``); without this guard
        ``range()`` would be called with a step of zero.

    NOTE(review): the upper bound is rounded to three decimals, so for spans
    well below 1.0 it can round back onto the lower bound and leave a slice
    empty. That behaviour is unchanged here; confirm the intended precision.
    """
    ma = max(a, b)
    mi = min(a, b)
    span = round(ma - mi, 3)
    stride = round(span * 1000)
    if stride == 0:
        # Empty (or sub-resolution) span: nothing can be contained in it.
        return None
    for k in range(math.ceil(mi * num * 1000), math.floor(ma * num * 1000), stride):
        l = k / num / 1000
        r = round(l + span / num, 3)
        if l <= x < r:
            return "[%.3f,%.3f)" % (l, r)
    return None
#tmp_vec= []
#print(get_group_name(0.5, tmp_vec))
#print(tmp_vec)
#sys.exit()
def main():
    """Load the comparison data from ``/cmp_data`` and produce the bucket report.

    Only the first line of the file is parsed. It is expected to be a JSON
    object with three nested key levels: query id -> key -> source name ->
    value. Each (query id, key) pair becomes one matrix row with one cell per
    entry of ``api_src_vec``; a source missing from a record is stored as
    ``None``. The matrix is then handed to ``get_report``.

    Raises:
        ValueError: If the file yields no rows at all. Previously this case
            failed later with an unrelated error.
    """
    data = {}
    with open("/cmp_data") as f:
        for line in f:
            data = ujson.loads(line.strip())
            break

    # Cap on the number of query ids processed. The value is far above any
    # realistic input; lower it to sample a subset while debugging.
    max_queries = 30000000000

    matrix = []
    for seen, qid in enumerate(data, start=1):
        if seen > max_queries:
            break
        for key in data[qid]:
            record = data[qid][key]
            row = [record[src] if src in record else None for src in api_src_vec]
            matrix.append(np.array(row))

    if not matrix:
        raise ValueError("/cmp_data contains no records to report on")

    get_report(np.array(matrix))
def get_report(matrix):
    """Bucket every source's ratio against ctrip and write the share table.

    For each source column the ratio ``(ctrip - source) / ctrip`` is computed
    per row, so a positive ratio means the source value is lower than
    ctrip's. Each defined ratio is assigned a bucket label by
    ``get_group_name`` and the rows per bucket are counted. Three summary
    rows are appended ("sum", "succ_sum", "fail_sum"), every row except
    "sum" is converted to a percentage string of the column total, a "desc"
    column with a human-readable description is inserted, and the table is
    printed and written to ``./finally.csv``.

    Args:
        matrix: 2-D array-like with one column per entry of ``api_src_vec``;
            missing values are ``None``/NaN.

    Notes:
        * The descriptions are listed in the same ascending order as the
          bucket labels, so each bucket is paired with the description of
          its own range.
        * A source without any comparable row has a column total of zero;
          its shares are reported as "0.00%" instead of dividing by zero.
    """
    df = pd.DataFrame(
        matrix,
        index=["[%d]" % row for row in range(len(matrix))],
        columns=api_src_vec,
    )

    # The bucket labels are the same for every source: build them once.
    new_index = []
    get_group_name(0, new_index)

    counts_by_src = {}
    for src in df.columns:
        ratio = (df["ctrip"] - df[src]) / df["ctrip"]
        # Rows where either value is missing give an undefined ratio and are
        # left out of the counts.
        labels = ratio[ratio.notna()].apply(lambda value: get_group_name(value))
        # Buckets with no rows are reported as 0; labels outside new_index
        # are ignored.
        counts_by_src[src] = labels.value_counts().reindex(new_index, fill_value=0)

    # Fix the row order explicitly; the positional slices below rely on it.
    ret_count = pd.DataFrame(counts_by_src, index=new_index)

    # Summary rows. The first 10 buckets hold negative ratios, the next 10
    # hold ratios >= 0.
    totals = ret_count.sum()
    succ_totals = ret_count.iloc[10:20].sum()
    fail_totals = ret_count.iloc[:10].sum()
    ret_count.loc['sum'] = totals
    ret_count.loc['succ_sum'] = succ_totals
    ret_count.loc['fail_sum'] = fail_totals

    # Convert every row except "sum" into a percentage of the column total.
    # The frame is cast to object first so strings can sit next to the counts.
    counts = ret_count
    ret_count = counts.astype(object)
    for idx in ret_count.index:
        if idx == "sum":
            continue
        for col in ret_count.columns:
            cur = int(counts.loc[idx, col])
            total = int(totals[col])
            if total:
                ret_count.loc[idx, col] = "%.2f%%" % (cur * 100.0 / total)
            else:
                ret_count.loc[idx, col] = "0.00%"
    print(ret_count)

    app_idx = ["sum", "succ_sum", "fail_sum"]
    # One description per row, in the same order as new_index + app_idx.
    desc_vec = [
        "比ctrip劣势0.9及更多",
        "比ctrip劣势0.8-0.9",
        "比ctrip劣势0.7-0.8",
        "比ctrip劣势0.6-0.7",
        "比ctrip劣势0.5-0.6",
        "比ctrip劣势0.4-0.5",
        "比ctrip劣势0.3-0.4",
        "比ctrip劣势0.2-0.3",
        "比ctrip劣势0.1-0.2",
        "比ctrip劣势0.1以内",
        "比ctrip优势0.1以内(包括相等)",
        "比ctrip优势0.1-0.2",
        "比ctrip优势0.2-0.3",
        "比ctrip优势0.3-0.4",
        "比ctrip优势0.4-0.5",
        "比ctrip优势0.5-0.6",
        "比ctrip优势0.6-0.7",
        "比ctrip优势0.7-0.8",
        "比ctrip优势0.8-0.9",
        "比ctrip优势0.9及更多",
        "总数量",
        "优势总数量(包括相等)",
        "劣势总数量",
    ]
    ret_count.insert(1, "desc", pd.Series(dict(zip(new_index + app_idx, desc_vec))))

    # Output.
    print(ret_count)
    ret_count.to_csv("./finally.csv")
#df = pd.DataFrame([['Snow','M',22],['Tyrion','M',32],['Sansa','F',18],['Arya','F',14]], index=["q","w","e","r"], columns=['name','gender','age'])
#print(df)
#df.loc["sum"] = df.apply(lambda x : x[2:].sum())
#df.loc["sum"] = dict(zip(df.columns, [0]*len(df.columns) ) )
#for a in df.columns:
# for b in df.index:
# df.loc[b,a] = 0
#print(df)
#sys.exit()
def demo():
    """Demonstrate dropping rows whose column value is in a given set.

    Builds a 3x3 object-dtype frame, computes a boolean mask that is False
    for rows whose "A" value is ``None`` or ``4``, prints the mask, and
    returns the rows that survive the filter.

    Returns:
        The filtered DataFrame (rows whose "A" value is neither ``None``
        nor ``4``).
    """
    values = np.array([[None, 2, 3], [4, 5, 6], [7, 8, 9]])
    df1 = pd.DataFrame(values, index=['row0', 'row1', 'row2'], columns=list('ABC'))
    # isin() marks the rows to remove; ~ inverts the mask so it marks the
    # rows to keep.
    keep = ~df1['A'].isin([None, 4])
    print(keep)
    return df1[keep]
def draw():
    """Plot the share table stored in ``./finally.csv``.

    The file written by ``get_report`` stores shares as strings such as
    "12.34%" and carries a textual "desc" column, none of which is numeric.
    The values are therefore parsed into floats before plotting.
    """
    df = pd.read_csv("./finally.csv", index_col=0)
    print(df)

    # Drop the free-text description column if it is present.
    shares = df.drop(columns=["desc"], errors="ignore")
    # The "sum" row holds raw counts rather than percentages; leave it out
    # so the plot keeps a single scale.
    shares = shares.drop(index=["sum"], errors="ignore")
    # Strip the trailing "%" and convert; unparseable cells become NaN.
    shares = shares.apply(
        lambda col: pd.to_numeric(col.astype(str).str.rstrip("%"), errors="coerce")
    )

    shares.plot()
    plt.show()
import tqdm
def cheapest_count():
    """Count how often each source matches the lowest non-ctrip value.

    Reads the first line of ``/cmp_data`` (a JSON object with three nested
    key levels: query id -> key -> source name -> value) and builds one row
    per (query id, key) pair with one cell per entry of ``api_src_vec``;
    missing sources are stored as NaN. For every row the minimum over all
    sources except "ctrip" is stored in the "min_price(api)" column. The
    summary table then has these rows:

    * "sum": number of non-missing values per source,
    * "cheapest_sum": zeros (placeholder row kept for the output layout),
    * "cheapest_sum(api)": rows where the source equals "min_price(api)",
    * "cheapest_sum_ratio(api)": the previous row divided by "sum",
      formatted as a percentage string.

    The table is printed and written to ``~/Desktop/ret_df.csv``.

    Raises:
        ValueError: If the file yields no rows at all.
    """
    data = {}
    with open("/cmp_data") as f:
        for line in f:
            data = ujson.loads(line.strip())
            break

    # Cap on the number of query ids processed. The value is far above any
    # realistic input; lower it to sample a subset while debugging.
    max_queries = 30000000000

    rows = []
    for seen, qid in enumerate(tqdm.tqdm(data.keys()), start=1):
        if seen > max_queries:
            break
        for key in data[qid]:
            record = data[qid][key]
            row = [record[src] if src in record else np.nan for src in api_src_vec]
            rows.append(np.array(row))

    if not rows:
        raise ValueError("/cmp_data contains no records to count")

    df = pd.DataFrame(np.array(rows), columns=api_src_vec)

    ret_df = pd.DataFrame(columns=api_src_vec)
    ret_df.loc["sum"] = df.count()
    ret_df.loc["cheapest_sum"] = dict.fromkeys(ret_df.columns, 0)
    ret_df.loc["cheapest_sum(api)"] = dict.fromkeys(ret_df.columns, 0)

    # Row-wise minimum over every source except ctrip; missing values are
    # skipped. Computed in one vectorised call instead of a per-row apply.
    df["min_price(api)"] = df.drop(columns=["ctrip"]).min(axis=1)
    print(df)
    print(ret_df)
    print("=========")

    # Only the source columns are counted, not the derived minimum column.
    # The cell is written with a single .loc call: chained indexing
    # (ret_df.loc[row][col] = ...) may assign to a temporary copy.
    for src in tqdm.tqdm(api_src_vec):
        is_cheapest = df[src].eq(df["min_price(api)"])
        ret_df.loc["cheapest_sum(api)", src] = int(is_cheapest.sum())

    ratio = ret_df.loc["cheapest_sum(api)"].div(ret_df.loc["sum"])
    ret_df.loc["cheapest_sum_ratio(api)"] = ratio.apply(lambda x: "%.2f%%" % (x * 100))

    print(ret_df)
    ret_df.to_csv("~/Desktop/ret_df.csv")
#a = [ [1,2,3], [4,5,6], [7,8,9] ]
#df = pd.DataFrame(a, index = ["a","b","c"], columns=["q","w","e"])
#print(df)
#df.apply(lambda x: print(x.index),axis=1)
#sys.exit()
if __name__ == "__main__":
    # Script entry point. Only the cheapest-value summary is enabled; the
    # other entry points are left here, disabled, for manual switching:
    #   main()
    #   draw()
    #   demo()
    cheapest_count()