#!/bin/bash
# Copyright 2016 Tsinghua University (Author: Dong Wang, Xuewei Zhang). Apache 2.0.
# 2016 LeSpeech (Author: Xingyu Na)
#
# This script prepares the data directory for the thchs30 recipe.
# It reads the corpus and generates wav.scp and the transcriptions.
#
# Arguments (the caller passes them as: $H $thchs):
#   $1  H      working directory; data/{train,dev,test} is created inside it
#   $2  thchs  corpus directory, i.e. the one that holds the train, dev and
#              test folders
if [ $# -lt 2 ]; then
  echo "Usage: $0 <work-dir> <corpus-dir>" >&2
  exit 1
fi
dir=$1         # first argument: working directory
corpus_dir=$2  # second argument: corpus directory
# The script changes directory below, so anchor a relative corpus path to the
# directory the caller started in. Absolute paths are left untouched.
case $corpus_dir in
  /*) ;;
  *) corpus_dir=$PWD/$corpus_dir ;;
esac
# Switch to the working directory; nothing below makes sense if this fails.
cd "$dir" || exit 1
# Later steps cd to $dir/data/<set>; keep the absolute path so that this still
# resolves when a relative working directory was given.
dir=$PWD
echo "creating data/{train,dev,test}"
# Create the two-level layout data/train, data/dev and data/test.
mkdir -p data/{train,dev,test} || exit 1
# Create wav.scp, utt2spk, word.txt, phone.txt and text for each data set.
# Everything runs in a subshell so the directory changes stay local to it.
(
# Expected wav base name: <speaker letter><speaker number>_<utterance number>,
# e.g. A2_0.
name_re='^([A-Z])([0-9]+)_([0-9]+)$'
for x in train dev test; do
  echo "cleaning data/$x"
  # Stop if the cd fails; otherwise the rm below would delete files in
  # whatever directory we happen to be in.
  cd "$dir/data/$x" || exit 1
  # Remove the output of any earlier run.
  rm -rf wav.scp utt2spk spk2utt word.txt phone.txt text
  echo "preparing scps and text in data/$x"
  if [ ! -d "$corpus_dir/$x" ]; then
    echo "$0: no such directory: $corpus_dir/$x" >&2
    exit 1
  fi
  # find lists the wav files of this set and sort -u orders and de-duplicates
  # the list. The base name is then taken with parameter expansion
  # (.../train/A2_0.wav -> A2_0), which avoids the xargs/basename pipeline
  # whose GNU-only `xargs -i` form used to break on macOS.
  while IFS= read -r wav; do
    nn=${wav##*/}
    nn=${nn%.wav}
    if ! [[ $nn =~ $name_re ]]; then
      echo "$0: skipping unexpected wav file name: $wav" >&2
      continue
    fi
    spk_char=${BASH_REMATCH[1]}         # A2_0 -> A
    # 10# forces base 10, so zero-padded numbers such as 08 are not
    # rejected as invalid octal.
    spk_num=$((10#${BASH_REMATCH[2]}))  # A2_0 -> 2
    utt_num=$((10#${BASH_REMATCH[3]}))  # A2_0 -> 0
    printf -v spkid '%s%.2d' "$spk_char" "$spk_num"  # A02
    printf -v uttid '%s_%.3d' "$spkid" "$utt_num"    # A02_000
    trn=$corpus_dir/data/$nn.wav.trn
    if [ ! -f "$trn" ]; then
      echo "$0: missing transcription file: $trn" >&2
      exit 1
    fi
    echo "$uttid $corpus_dir/$x/$nn.wav" >> wav.scp
    echo "$uttid $spkid" >> utt2spk
    # The command substitutions are left unquoted on purpose: word splitting
    # collapses runs of whitespace in the transcription line.
    echo "$uttid" $(sed -n 1p "$trn") >> word.txt   # line 1 of the .trn file
    echo "$uttid" $(sed -n 3p "$trn") >> phone.txt  # line 3 of the .trn file
  done < <(find "$corpus_dir/$x" -name "*.wav" | sort -u)
  # text is the sorted word-level transcription; each file is sorted in place.
  # Every step is checked so that a failure in any set aborts the script.
  cp word.txt text || exit 1
  sort -o wav.scp wav.scp || exit 1
  sort -o utt2spk utt2spk || exit 1
  sort -o text text || exit 1
  sort -o phone.txt phone.txt || exit 1
done
) || exit 1
# Convert each data set's utt2spk into spk2utt with the utils helper script.
# All paths are relative, so they resolve against the current directory.
# Abort on failure instead of leaving an empty or truncated spk2utt behind.
for x in train dev test; do
  utils/utt2spk_to_spk2utt.pl "data/$x/utt2spk" > "data/$x/spk2utt" || exit 1
done
echo "creating test_phone for phone decoding"
# data/test_phone is a fresh copy of data/test in which text is replaced by
# the contents of phone.txt. The subshell keeps the cd local; it stops with
# status 1 at the first step that fails.
(
  rm -rf data/test_phone || exit 1
  cp -R data/test data/test_phone || exit 1
  cd data/test_phone || exit 1
  rm text || exit 1
  cp phone.txt text || exit 1
)