实现要求:
现有 TtrainCACD.txt 文件,记录了2000人的照片地址信息,共约16万行,每行格式如下:
'/home/d201/cropcacd/52_Christopher_Meloni_0001.jpg 41 41'
其中第一部分是照片地址,第二部分是身份号(即某人是第41号),第三部分是年龄。
现要求重新排序:每两行数据组成一对,每对的两行都必须来自同一个人,最后将所有对随机打乱后写入新的txt文件中。注:如果某个人的照片数为奇数,则删除其中一张无法配对的照片。
代码实现:
from random import shuffle
import time
class Guoxuezhang():
    """Build randomly ordered, same-identity line pairs from a list file.

    Each record line is expected to hold whitespace-separated fields of the
    form ``<image path> <identity id> <age>``; only the second field (the
    identity id) is interpreted here.
    """

    def __init__(self, tips):
        """Store ``tips``, a caller-supplied message.

        The methods of this class do not read ``self.tips``.
        """
        self.tips = tips

    def get_new_list(self, txt_path):
        """Read ``txt_path`` and return shuffled pairs of same-identity lines.

        Lines are grouped by their second field. Within a group with an odd
        number of lines the first line is dropped; the remaining lines are
        split into two halves and zipped, so entry ``i`` of the first half is
        paired with entry ``i`` of the second half. A group with a single
        line therefore contributes no pair.

        Args:
            txt_path: Path of the input list file.

        Returns:
            A list of 2-tuples of lines (each ending in a newline), in random
            order. Both lines of a tuple carry the same identity id.
        """
        with open(txt_path) as f:
            lines = f.readlines()
        print(len(lines))

        # Group in a single pass instead of rescanning every line once per
        # identity; the dict keeps first-appearance order of the ids.
        lines_by_person = {}
        for line in lines:
            fields = line.split()
            if len(fields) < 2:
                # Blank or truncated line: there is no identity id to group by.
                continue
            if not line.endswith('\n'):
                # A final line without a newline would otherwise be glued to
                # whatever record is written after it.
                line += '\n'
            lines_by_person.setdefault(fields[1], []).append(line)

        person_all = []
        for person_id, person_single in lines_by_person.items():
            if len(person_single) == 1:
                print('the person_id is : ', person_id)
            if len(person_single) % 2 != 0:
                # Odd count: drop the first line so everything left can pair.
                person_single = person_single[1:]
            half_length = len(person_single) // 2
            person_all.extend(
                zip(person_single[:half_length], person_single[half_length:])
            )

        # Shuffle whole pairs so the two lines of a pair stay adjacent.
        shuffle(person_all)
        return person_all

    def get_new_txt(self, person_all_shuffle, new_txt_path):
        """Write every line of every pair to ``new_txt_path``, in order.

        Args:
            person_all_shuffle: Iterable of line tuples, as returned by
                ``get_new_list``. Lines are written verbatim.
            new_txt_path: Path of the output file; it is overwritten.
        """
        with open(new_txt_path, 'w') as f_new:
            for pair in person_all_shuffle:
                f_new.writelines(pair)
if __name__ == '__main__':
    # Hard-coded input list and the destination for the shuffled pairs.
    source_file = '/home/jianghusanren/Pictures/TtrainCACD.txt'
    target_file = '/home/jianghusanren/Pictures/TtrainCACD_new.txt'

    started = time.time()
    worker = Guoxuezhang('The process has done!')
    # Pair and shuffle the lines, then write them straight to the new file.
    worker.get_new_txt(worker.get_new_list(source_file), target_file)
    # Print the elapsed wall-clock time in seconds.
    print(time.time() - started)
结果如下图: