Generate a file path table for the file-association analysis step to read

import numpy as np
import pandas as pd
import re
import os
from tqdm import tqdm
from subprocess import getoutput

def filesize_float(filepath):
    '''
    Return the file size in MB as a float.
    '''
    return os.path.getsize(filepath) / float(1024 ** 2)

def filesize_to_MB(filepath):
    '''
    Return the file size as a formatted string in MB.
    '''
    return "%.10f MB" % (os.path.getsize(filepath) / float(1024 ** 2))

def filesize_to_GB(filepath):
    '''
    Return the file size as a formatted string in GB.
    '''
    return "%.10f GB" % (os.path.getsize(filepath) / float(1024 ** 3))

def filesize_to_TB(filepath):
    '''
    Return the file size as a formatted string in TB.
    '''
    return "%.10f TB" % (os.path.getsize(filepath) / float(1024 ** 4))

def getfilename(filepath):
    '''
    Extract the file name from a path-like string.
    Splitting on '/', '|' or ',' means a log entry of the form
    "dir/name.ext,<size> MB" yields the file name at index -2.
    '''
    return re.split("[/|,]", filepath)[-2]

def get_file_type_key(filename, types=['.py', 'txt', 'log', 'DS_Store', 'dp']):
    '''
    Guess the file type (extension) from the file name;
    returns "empty" when the name has no '.' separator.
    The `types` argument is currently unused.
    '''
    init_name_key_list = getfilename(filename).split('.')
    if len(init_name_key_list) > 1:
        return init_name_key_list[-1]
    else:
        return "empty"
    
def listfile_in_log_function(dirpath=os.getcwd(), save_path="filepaths.log"):
    print(getoutput("figlet Start searching for the path within the folder:"))
    counter = 0          # running total size in MB
    detGB = 0            # GB accumulated since the last progress message
    L = [0]              # running total size in GB, one entry per file
    time = 0             # number of files seen
    f = open(save_path, "w+")
    for dir_, folder, files in tqdm(os.walk(dirpath)):
        for file in files:
            time += 1
            temp = os.path.join(dir_, file)
            check_file = os.path.isfile(temp)
            if check_file:
                size_mb = os.path.getsize(temp) / float(1024 * 1024)
                filesize = "%.10f MB" % (size_mb)
                counter += size_mb
                L.append(counter / 1024)
                detGB += L[-1] - L[-2]
                if detGB >= 1:
                    print("Another 1 GB accumulated:", "%.10f GB" % (detGB),
                          "total size so far:", "%.10f GB" % (counter / 1024),
                          "files so far:", "%.0f" % (time))
                    detGB = 0
                # each line written looks like: {'<path>,<size> MB'}
                f.writelines(repr({temp + "," + filesize}) + "\n")
    f.close()
    end = "end {} / {}".format("%.10f GB" % (counter / 1024), "%.10f MB" % (counter))
    print(getoutput("figlet Path storage end !"))
    print(end)
    return end

    
def loadfile_to_infotable(logpath="filepaths.log",
                          columnsnumber=40,
                          save_path="filepath_table.csv"):
    print(getoutput("figlet preprocess all file path data"))
    counter = 0          # running total size in MB
    detGB = 0            # GB accumulated since the last progress message
    L = [0]              # running total size in GB, one entry per file
    time = 0             # number of log lines processed
    result = []
    with open(logpath, 'r') as logfile:
        files = logfile.readlines()
    for file in files:
        time += 1
        init_vector = np.zeros(columnsnumber).astype(str)
        # each log line looks like {'<path>,<size> MB'}; index 2 of this split
        # recovers the path (assuming the path contains no spaces or commas)
        temp = re.split(r"[}|,|{]|\n| |'", file)[2]
        check_file = os.path.isfile(temp)
        if check_file:
            '''progress reporting'''
            filesize_float = os.path.getsize(temp) / float(1024 ** 2)
            counter += filesize_float
            L.append(counter / 1024)
            detGB += L[-1] - L[-2]
            if detGB >= 1:
                print("Another 1 GB accumulated:", "%.10f GB" % (detGB),
                      "total size so far:", "%.10f GB" % (counter / 1024),
                      "files so far:", "%.0f" % (time))
                detGB = 0
            '''arrange into a table row'''
            init_clearing_path_level1 = os.path.split(temp)
            dirpath, filename = init_clearing_path_level1[0], init_clearing_path_level1[-1]
            level3 = -1
            abspath_list = temp.split(os.sep)
            abspath_ = abspath_list[:-1]           # directory components only
            file_name = abspath_list[-1].split('.')
            file_path = abspath_ + file_name[:-1]  # directories + file stem
            for dir_s in file_path:
                level3 += 1
                init_vector[level3] = dir_s
            init_vector[columnsnumber - 1] = file_name[-1]         # extension
            init_vector[columnsnumber - 2] = filesize_to_MB(temp)  # size
            init_vector[columnsnumber - 3] = temp                  # full path
        result.append(init_vector)
        if time % 1000 == 0:
            print(pd.DataFrame(init_vector).T)
    table = pd.DataFrame(np.array(result))
    table.to_csv(save_path)
    print(getoutput("figlet preprocess is complete !"))
    return table

if __name__ == "__main__":
    listfile_in_log_function()
    #print(getoutput("figlet save path is OK !"))
    print(loadfile_to_infotable())
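Once filepath_table.csv has been produced by the script above, it can be read back with pandas for the kind of file-association analysis mentioned in the title. The sketch below is only a minimal illustration under the default columnsnumber=40 layout (the last three columns hold the full path, the size in MB, and the extension); the per-extension size summary is a hypothetical analysis step, not part of the original script.

import pandas as pd

# Load the table written by loadfile_to_infotable(); the unnamed first CSV
# column is the row index, the remaining 40 columns are the path/size/extension slots.
table = pd.read_csv("filepath_table.csv", index_col=0)

# With columnsnumber=40: position 37 -> full path, 38 -> "<size> MB", 39 -> extension.
paths = table.iloc[:, 37]        # full paths, e.g. for joining with other metadata
sizes_mb = table.iloc[:, 38].astype(str).str.replace(" MB", "", regex=False).astype(float)
extensions = table.iloc[:, 39]

# Hypothetical aggregation: total size per extension, largest first.
summary = sizes_mb.groupby(extensions).sum().sort_values(ascending=False)
print(summary.head(20))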
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

def listfile_function(dirpath=os.getcwd(), n=1000):
    counter = 0
    result = []
    for dir_, folder, files in tqdm(os.walk(dirpath)):
        for file in files:
            # a fresh vector per file, otherwise every row from the same folder
            # would reference the same array and end up identical
            init_vector = np.zeros(5000).astype(str)
            temp = os.path.join(dir_, file)
            check_file = os.path.isfile(temp)
            if check_file:
                init_clearing_path_level1 = os.path.split(temp)
                dir_part, filename = init_clearing_path_level1[0], init_clearing_path_level1[-1]
                level3 = -1
                abspath_list = temp.split(os.sep)
                abspath_ = abspath_list[:-1]           # directory components only
                file_name = abspath_list[-1].split('.')
                file_path = abspath_ + file_name[:-1]  # directories + file stem
                for dir_s in file_path:
                    level3 += 1
                    init_vector[level3] = dir_s
                init_vector[4999] = file_name[-1]  # extension
                init_vector[4998] = "%.10f MB" % (os.path.getsize(temp) / float(1024 * 1024))  # size
            result.append(init_vector)
            counter += check_file
            if counter % n == 0:
                print(pd.DataFrame(init_vector).T)
    table = pd.DataFrame(np.array(result))
    table.to_csv("filepath_table.csv")
    return table
            
listfile_function()

Compute file sizes and keep writing them to a log (suitable for large file collections)

import os
from tqdm import tqdm

def listfile_in_log_function(dirpath=os.getcwd()):
    counter = 0          # running total size in MB
    detGB = 0            # GB accumulated since the last progress message
    L = [0]              # running total size in GB, one entry per file
    time = 0             # number of files seen
    f = open("filepaths.log", "w+")
    for dir_, folder, files in tqdm(os.walk(dirpath)):
        for file in files:
            time += 1
            temp = os.path.join(dir_, file)
            print(temp)
            check_file = os.path.isfile(temp)
            if check_file:
                filesize_float = os.path.getsize(temp) / float(1024 * 1024)
                filesize = "%.10f MB" % (filesize_float)
                counter += filesize_float
                L.append(counter / 1024)
                detGB += L[-1] - L[-2]
                if detGB >= 1:
                    print("Another 1 GB accumulated:", "%.10f GB" % (detGB),
                          "total size so far:", "%.10f GB" % (counter / 1024),
                          "files so far:", "%.0f" % (time))
                    detGB = 0
                # each line written looks like: {'<path>,<size> MB'}
                f.writelines(repr({temp + "," + filesize}) + "\n")
    f.close()
    print("end", "%.10f GB" % (counter / 1024), "%.10f MB" % (counter))
    
            
listfile_in_log_function()
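For completeness, here is one possible way to read filepaths.log back. Each line was written as the repr of a one-element set containing "<path>,<size> MB", so ast.literal_eval from the standard library can recover the entry, and splitting on the last comma keeps paths that contain spaces or commas intact. This is only a sketch of a reader, not part of the original post, which parses the log with a regular expression instead.

import ast

with open("filepaths.log", "r") as log:
    for line in log:
        # each line looks like {'<path>,<size> MB'}: literal_eval gives back the set
        entry = next(iter(ast.literal_eval(line.strip())))
        # split on the last comma so commas inside the path do not break parsing
        path, size = entry.rsplit(",", 1)
        print(path, size)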


Reposted from blog.csdn.net/weixin_43069769/article/details/109418587