0、
1、编程环境
python3
- 编辑器:PyCharm
- 环境:anaconda3
使用的工具包
#用于调用excel文件的
import openpyxl
#用于正则比较的包
import re
2、主函数
if __name__ == "__main__":
    # NOTE: every bare `n` below is a placeholder and must be replaced with the
    # real row/column number before this block can run.
    # NOTE: in this listing the helper functions are defined after this block;
    # in a runnable script this block has to come after those definitions.

    # "targetSheet" is the worksheet that receives the statistics,
    # "dataSheet" is the worksheet that holds the data.
    # Load the Excel workbooks.
    wb_targetSheet = openpyxl.load_workbook('D:/target.xlsx')
    wb_dataSheet = openpyxl.load_workbook('D:/data.xlsx')

    # Pick the worksheets.
    ws_targetSheet = getWorksheets(wb_targetSheet, "Sheetname")
    ws_dataSheet = getWorksheets(wb_dataSheet, "Sheetname")

    # Statistics sheet columns, in order:
    #   column_origin_targetSheet - column holding the original results
    #   column_output_targetSheet - column receiving the misrecognition summary
    target_column = targetColumn(n, n)

    # Data sheet columns, in order:
    #   column_origin_dataSheet - column holding the original results
    #   column_tag_dataSheet    - column holding the annotated results
    data_column = dataColumn(n, n)

    # Statistics sheet columns for the numeric counters:
    #   column_number[0]  all        total number of extractions
    #   column_number[1]  true       number recognised correctly
    #   column_number[2]  true_per   share recognised correctly
    #   column_number[3]  false      number recognised wrongly
    #   column_number[4]  false_per  share recognised wrongly
    #   column_number[5]  none       number of invalid entries
    #   column_number[6]  none_per   share of invalid entries
    column_number = param_number_return(n, n, n, n, n, n, n)

    # Run the statistics over the whole sheet. Arguments, in order:
    # ws_targetSheet, ws_dataSheet, targetSheet_row (number of rows of original
    # results in the statistics sheet), dataSheet_row (number of rows of
    # annotated results in the data sheet), target_column, data_column,
    # column_number. The return value lists the rows that could not be matched.
    err_str = Statistics(ws_targetSheet, ws_dataSheet, n, n, target_column, data_column, column_number)

    # Write the unmatched rows into row `err_statistics` of the statistics sheet.
    err_statistics = n
    ws_targetSheet.cell(row=err_statistics, column=1).value = "原始结果无法识别行:"
    ws_targetSheet.cell(row=err_statistics, column=2).value = err_str

    # Save the statistics workbook; nothing written above is persisted until
    # this call runs.
    wb_targetSheet.save('D:/target.xlsx')
3、整表处理函数
#输出全部误识别结果统计到指定xlsx的指定Sheet
#targetSheet_row:输出表的行数
#dataSheet_row:数据表的行数
def Statistics(ws_targetSheet, ws_dataSheet, targetSheet_row, dataSheet_row, target_column, data_column, column_number):
    """Run the per-row analysis for the whole statistics sheet.

    Calls ``getIdentificationNumber`` once for every statistics-sheet row
    from 2 up to and including ``targetSheet_row``.

    Args:
        ws_targetSheet: worksheet that receives the statistics.
        ws_dataSheet: worksheet that holds the data rows.
        targetSheet_row: last row of the statistics sheet to process.
        dataSheet_row: last row of the data sheet to scan.
        target_column: ``(origin column, output column)`` of the statistics
            sheet.
        data_column: ``(origin column, tag column)`` of the data sheet.
        column_number: seven statistics-sheet columns, in the order
            all, true, true_per, false, false_per, none, none_per.

    Returns:
        str: the entries collected by ``getIdentificationNumber`` (row
        numbers of rows with no matching data), joined with '、'; an empty
        string when nothing was collected.
    """
    column_origin_targetSheet = target_column[0]
    column_output_targetSheet = target_column[1]
    column_origin_dataSheet = data_column[0]
    column_tag_dataSheet = data_column[1]
    # Local names deliberately avoid shadowing the builtin ``all``.
    (col_all, col_true, col_true_per, col_false,
     col_false_per, col_none, col_none_per) = (column_number[k] for k in range(7))

    # ``err`` is used as an int-keyed array; ``err_index`` is the next free slot.
    err = {}
    err_index = 0
    for row in range(2, targetSheet_row + 1):
        err, err_index = getIdentificationNumber(
            ws_targetSheet, ws_dataSheet, row, dataSheet_row,
            column_origin_targetSheet, column_output_targetSheet,
            column_origin_dataSheet, column_tag_dataSheet,
            col_all, col_true, col_true_per, col_false,
            col_false_per, col_none, col_none_per,
            err, err_index)

    return '、'.join(str(err[k]) for k in range(err_index))
4、统计表单条数据分析
#单条原始识别结果的误识别分析
def getIdentificationNumber(ws_targetSheet, ws_dataSheet, targetSheet_row, dataSheet_row, column_origin_targetSheet, column_output_targetSheet, column_origin_dataSheet, column_tag_dataSheet, all, true, true_per, false, false_per, none, none_per, err, err_index):
    """Analyse one original result of the statistics sheet against the data sheet.

    Every data-sheet row from 2 to ``dataSheet_row`` whose origin column equals
    (as text) the original result in ``targetSheet_row`` is classified as:

    * correct - the tag equals the original result,
    * invalid - the tag is "/",
    * wrong   - anything else; wrong tags are tallied per distinct value.

    The counts and their shares are written into the columns ``all``, ``true``,
    ``true_per``, ``false``, ``false_per``, ``none`` and ``none_per`` of the
    statistics row. The wrong tags, ordered by descending count and formatted
    as ``tag(count)`` joined with '、', are written into
    ``column_output_targetSheet``.

    Args:
        ws_targetSheet: statistics worksheet (read and written).
        ws_dataSheet: data worksheet (read only).
        targetSheet_row: statistics-sheet row being analysed.
        dataSheet_row: last data-sheet row to scan.
        column_origin_targetSheet: statistics column with the original result.
        column_output_targetSheet: statistics column for the summary string.
        column_origin_dataSheet: data column with the original result.
        column_tag_dataSheet: data column with the annotated result.
        all, true, true_per, false, false_per, none, none_per: statistics
            columns for the counters and their shares.
        err: int-keyed dict collecting rows with no matching data row.
        err_index: next free key in ``err``.

    Returns:
        tuple: ``(err, err_index)``, extended by this row when no data row
        matched.
    """
    origin_cell = ws_targetSheet.cell(row=targetSheet_row, column=column_origin_targetSheet)

    # A value that looks like a time ("xx:yy:zz") is rewritten in place to
    # "<second character>:<middle part>".
    # NOTE(review): this drops the first character, so "12:30:00" becomes
    # "2:30" - confirm that is intended for two-digit leading parts.
    m = re.search(r'(\w)(\w)(:)(\w+)(:)(\w+)', str(origin_cell.value))
    if m is not None:
        origin_cell.value = m.group(2) + m.group(3) + m.group(4)

    # The original result does not change while the data sheet is scanned, so
    # it is read and converted once.
    origin_text = str(origin_cell.value)
    print('【' + str(targetSheet_row) + '】' + origin_text)

    # Wrong tags in a flat pair layout (as expected by quickSort/reverseSort):
    #   tally[2 * k]     - the tag value
    #   tally[2 * k + 1] - how often it occurred
    tally = {}
    slot_of_tag = {}  # tag value -> k, replaces a linear scan over tally
    number_true = 0
    number_false = 0
    number_none = 0

    for i in range(2, dataSheet_row + 1):
        if str(ws_dataSheet.cell(row=i, column=column_origin_dataSheet).value) != origin_text:
            continue
        tag = ws_dataSheet.cell(row=i, column=column_tag_dataSheet).value
        tag_text = str(tag)
        if tag_text == origin_text:
            number_true += 1
        elif tag_text == "/":
            number_none += 1
        else:
            number_false += 1
            slot = slot_of_tag.get(tag)
            if slot is None:
                slot = len(slot_of_tag)
                slot_of_tag[tag] = slot
                tally[2 * slot] = tag
                tally[2 * slot + 1] = 1
            else:
                tally[2 * slot + 1] += 1

    last_slot = len(slot_of_tag) - 1  # -1 when there are no wrong tags

    number_all = number_true + number_none + number_false
    print(number_all, number_true, number_false, number_none)
    if number_all != 0:
        stats = (
            number_all,
            number_true, number_true / number_all,
            number_false, number_false / number_all,
            number_none, number_none / number_all,
        )
    else:
        # No data row refers to this original result: remember the row.
        err[err_index] = str(targetSheet_row)
        err_index += 1
        stats = (0, 0, 0, 0, 0, 0, 0)
    for column, value in zip((all, true, true_per, false, false_per, none, none_per), stats):
        ws_targetSheet.cell(row=targetSheet_row, column=column).value = value

    # Sort ascending by count, then reverse to get descending order.
    tally = quickSort(arr=tally, left=0, right=last_slot)
    tally = reverseSort(arr=tally, left=0, right=last_slot)

    a_str = '、'.join(
        str(tally[2 * k]) + '(' + str(tally[2 * k + 1]) + ')'
        for k in range(last_slot + 1)
    )
    print(a_str + '\n')
    ws_targetSheet.cell(row=targetSheet_row, column=column_output_targetSheet).value = a_str
    return err, err_index
5、快速排序算法
# Quicksort over a flat array of pairs: arr[2*k] is the item, arr[2*k+1] its sort key.
def quickSort(arr, left, right):
    """Sort the pairs of *arr* in place, ascending by their second entry.

    *arr* stores pairs flat: ``arr[2 * k]`` is the item and ``arr[2 * k + 1]``
    the value it is ordered by. ``left`` and ``right`` are inclusive pair
    indices; a non-numeric bound (for example ``None``) selects the first or
    the last pair respectively.

    Returns:
        The same *arr* object, sorted between ``left`` and ``right``.
    """
    left = 0 if not isinstance(left, (int, float)) else left
    # Two entries per pair, so the last pair index is len(arr) // 2 - 1.
    right = len(arr) // 2 - 1 if not isinstance(right, (int, float)) else right
    if left < right:
        partitionIndex = partition(arr, left, right)
        quickSort(arr, left, partitionIndex - 1)
        quickSort(arr, partitionIndex + 1, right)
    return arr


# Divide and conquer step used by quickSort.
def partition(arr, left, right):
    """Partition the pairs ``left..right`` around the pair at ``left``.

    Pairs whose second entry is smaller than the pivot's are moved in front
    of it. Returns the pair index at which the pivot ends up.
    """
    pivot = left
    index = pivot + 1
    i = index
    while i <= right:
        if arr[2 * i + 1] < arr[2 * pivot + 1]:
            # Move both halves of the pair together.
            swap(arr, 2 * i, 2 * index)
            swap(arr, 2 * i + 1, 2 * index + 1)
            index += 1
        i += 1
    swap(arr, 2 * pivot + 1, 2 * (index - 1) + 1)
    swap(arr, 2 * pivot, 2 * (index - 1))
    return index - 1


# Element swap used by partition.
def swap(arr, i, j):
    """Exchange ``arr[i]`` and ``arr[j]`` in place."""
    arr[i], arr[j] = arr[j], arr[i]
6、其他辅助函数
#倒序处理,从大到小排列
def reverseSort(arr, left, right):
    """Reverse the order of the pairs ``left..right`` of *arr* in place.

    *arr* stores pairs flat (``arr[2 * k]`` and ``arr[2 * k + 1]`` belong
    together); both halves of a pair are moved together. Applied to an
    ascending sequence this yields a descending one.

    Returns:
        The same *arr* object.
    """
    while left < right:
        arr[2 * left], arr[2 * right] = arr[2 * right], arr[2 * left]
        arr[2 * left + 1], arr[2 * right + 1] = arr[2 * right + 1], arr[2 * left + 1]
        left += 1
        right -= 1
    return arr
#从xlsx获得单张Sheet
def getWorksheets(wb, sheetname):
    """Return the sheet of workbook *wb* stored under the name *sheetname*."""
    return wb[sheetname]
#返回统计表的文本所处第几列
def targetColumn(column_origin_targetSheet, column_output_targetSheet):
    """Bundle the two statistics-sheet columns into a tuple.

    Returns:
        tuple: ``(column_origin_targetSheet, column_output_targetSheet)``.
    """
    return column_origin_targetSheet, column_output_targetSheet
#返回数据表的文本所处第几列
def dataColumn(column_origin_dataSheet, column_tag_dataSheet):
    """Bundle the two data-sheet columns into a tuple.

    Returns:
        tuple: ``(column_origin_dataSheet, column_tag_dataSheet)``.
    """
    return column_origin_dataSheet, column_tag_dataSheet
#返回统计表的各类数字统计所在列
def param_number_return(all, true, true_per, false, false_per, none, none_per):
    """Bundle the seven counter columns of the statistics sheet into a tuple.

    The parameter names are kept as they are because callers may pass them by
    keyword; note that ``all`` shadows the builtin inside this function.

    Returns:
        tuple: ``(all, true, true_per, false, false_per, none, none_per)``.
    """
    return all, true, true_per, false, false_per, none, none_per