import wget
import os
import pandas as pd
def main():
    """Download audio listed in a spreadsheet and write ``text.raw`` / ``wav.scp``.

    Reads the ``transfer_label_content`` (transcript) and ``voice`` (audio URL)
    columns of the first ``sub_lines`` rows of the xlsx file, downloads every
    audio file into ``wav_slice/`` and writes one line per row:

    * ``text.raw``: ``<utt> <transcript>``
    * ``wav.scp``:  ``<utt> <absolute path of the downloaded file>``

    ``utt`` is the last path component of the URL up to its first dot.
    Both output files are recreated on every run. Rows whose transcript or
    URL cell is empty are skipped.
    """
    xlsx_file = "yourfilename.xlsx"
    wav_scp = "wav.scp"
    text_raw = "text.raw"
    wav_output_dir = "wav_slice"
    sub_lines = 100  # maximum number of spreadsheet rows to process

    print("reading xlsx file...")
    df = pd.read_excel(xlsx_file)
    print("read xlsx finished.")

    if not os.path.exists(wav_output_dir):
        os.makedirs(wav_output_dir)
        print("创建路径:{}".format(wav_output_dir))

    # Remove outputs left over from a previous run (same order as before:
    # wav.scp first, then text.raw).
    for stale_file in (wav_scp, text_raw):
        if os.path.exists(stale_file):
            os.remove(stale_file)
            print("删除文件:{}".format(stale_file))

    cwd = os.getcwd()
    print("当前路径:{}".format(cwd))

    # Positional, end-exclusive slice: exactly ``sub_lines`` rows. The label
    # slice ``.loc[:sub_lines]`` is end-inclusive and selected one row too many.
    data = df[['transfer_label_content', 'voice']].iloc[:sub_lines]
    print("总共有{}条数据".format(len(data)))

    with open(text_raw, 'w', encoding='utf-8') as f1, \
            open(wav_scp, 'w', encoding='utf-8') as f2:
        for text, url in data.values:
            print("text:{}, url:{}".format(text, url))

            # Empty spreadsheet cells are read as NaN; there is nothing to
            # download or label for such a row.
            if pd.isna(text) or pd.isna(url):
                print("skip row with missing text or url")
                continue

            url = str(url)
            # wget.download returns the path it actually wrote. That name can
            # differ from the URL's last component (query string dropped,
            # server-supplied filename, " (n)" appended when the file already
            # exists), so wav.scp must reference the returned path.
            saved_path = wget.download(url, out=wav_output_dir)

            utt = url.split("/")[-1].split(".")[0]
            wav_path = os.path.abspath(saved_path)

            # .format() also copes with non-string transcript cells (numbers).
            f1.write("{} {}\n".format(utt, text))
            f2.write("{} {}\n".format(utt, wav_path))
# Run the download/export only when executed as a script, not on import.
if __name__ == "__main__":
    main()
- 读取 Excel 的各种操作参考:https://blog.csdn.net/L_Jane_H/article/details/125586000
  写得很清楚、详细。
- 下载数据直接用wget.download()就可以了