# coding=utf-8
import pandas as pd
from pandas import DataFrame, Series
import re
def rule(item_str):
"""
对内容的规则
:param item_str: str
:return:
"""
item_str = re.sub('\s', "", item_str)
item_str = item_str.replace(",", ",").replace("(", "(").replace(")", ")").replace('"', "“")
return item_str
def get_csv(file_name, csv_filename, col_li):
"""
获取分类模型需要的csv
:param file_name: 原始excel文件
:param csv_filename: 转换后的csv文件
:param col_li: [] 【text, label】对应的列名
:return:
"""
df = pd.read_excel(file_name)[col_li].dropna()
# df["item_info"] = [for i in range()]
# df["工作内容"] = df["工作内容"]
# df["item_info"] = ["" for i in range(len(df[col_li[1]]))]
# 处理为需写入csv的数据
csv_key = ['text', 'label', 'item_info']
data = df.values.tolist()
data_list = []
for i in data:
text_content = rule(str(i[0]))
label_content = rule(str(i[1]))
if text_content in ["", "nan", "None"] or label_content in ["", "nan", "None"]:
continue
# 风险等级为所队,错误数据
# if label_content == "所队":
# continue
item_info = ""
data_list.append([text_content, label_content, item_info])
df2 = DataFrame(data=data_list, columns=csv_key)
df2.to_csv(csv_filename, index=False)
return "ok"
if __name__ == '__main__':
excel_path = r"../files_excl/作业风险数据.xlsx"
# excel_path = r"../files_excl/4-5月全部计划V2.xlsx"
# csv_path = r"../files_csv/feng_xian.csv"
# text_label_li = ["工作内容", "风险等级"]
csv_path = r"../files_csv/guan_kong.csv"
text_label_li = ["工作内容", "管控层级"]
res = get_csv(excel_path, csv_path, text_label_li)
print(res)
# coding=utf-8
import codecs
import xlrd
import csv
import re
def rule(item_str):
"""
对内容的规则
:param item_str: str
:return:
"""
item_str = re.sub('\s', "", item_str)
item_str = item_str.replace(",", ",").replace("(", "(").replace(")", ")").replace('"', "“")
return item_str
def xlsx_to_csv(file_path, csv_filename, col_li):
"""
用于分类模型, excel转csv
:param file_path: 文件路径
:param csv_filename: 转换后的csv文件名
:param col_li: 需写到csv列名【text,label】
:return:
"""
workbook = xlrd.open_workbook(file_path)
table = workbook.sheet_by_index(0)
row_num = table.nrows # sheet行数
# col_num = table.ncols # sheet列数
head_li = table.row_values(0, start_colx=0, end_colx=None) # 返回由该行中所有单元格的数据组成的列表
text_index = head_li.index(col_li[0])
label_index = head_li.index(col_li[1])
# print(text_index, label_index)
# print(head_li)
data_list = [["text", "label", "item_info"]]
for row in range(1, row_num):
item_li = table.row_values(row, start_colx=0, end_colx=None)
text_content = rule(str(item_li[text_index]))
label_content = rule(str(item_li[label_index]))
if text_content in ["", "nan", "None"] or label_content in ["", "nan", "None"]:
continue
# 风险等级为所队,错误数据
# if label_content == "所队":
# continue
item_info = ""
data_list.append([text_content, label_content, item_info])
# 写入csv
with codecs.open(csv_filename, "w", encoding="utf-8") as f:
csv_writer = csv.writer(f)
for row_content in data_list:
csv_writer.writerow(row_content)
return "ok"
if __name__ == '__main__':
excel_path = r"../files_excl/作业风险数据.xlsx"
# excel_path = r"../files_excl/4-5月全部计划V2.xlsx"
# csv_path = r"../files_csv/feng_xian.csv"
# text, label
# text_label_li = ["工作内容", "风险等级"]
csv_path = r"../files_csv/guan_kong.csv"
text_label_li = ["工作内容", "管控层级"]
res = xlsx_to_csv(excel_path, csv_path, text_label_li)
print(res)