序列模式挖掘——PrefixSpan算法实战

序列模式PrefixSpan

在这里插入图片描述

def read(filename):
    """Load the comma-separated sequences stored in a text file.

    For every space found in a line, the text that follows that space is
    collected when it contains at least one comma.  Lines with no such text
    (for example a header row) contribute nothing.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(sequences, count)`` tuple: the list of extracted strings and the
        length of that list.
    """
    S = []
    with open(filename, 'r') as input1:
        for line in input1:
            # Remove only the line terminator.  Unconditionally slicing off the
            # last character would truncate the final item of a last line that
            # does not end with a newline.
            elements = line.rstrip('\n')
            for i, ch in enumerate(elements):
                if ch != ' ':
                    continue
                tail = elements[i + 1:]
                if ',' in tail:
                    S.append(tail)
    return S, len(S)

在这里插入图片描述

def union_str(string):
    """Collect the distinct single-character items occurring in the sequences.

    Args:
        string: Iterable of comma-separated sequence strings.

    Returns:
        Sorted list of the distinct characters found, commas excluded.
    """
    # Sorting gives a reproducible order; iterating a set of strings directly
    # yields an order that can differ from one interpreter run to the next.
    return sorted(set("".join(string).replace(",", "")))

def dd(string, da):
    """Return the suffix of ``string`` that follows the prefix item ``da``.

    ``string`` is a comma-separated sequence and ``da`` is the prefix item.

    * Single-character ``da``: the first occurrence that is either at index 0,
      or is neither the last character nor directly preceded by ``"_"``, is
      replaced by ``"_"``; every ``"_,"`` is then removed from the result.
      Examples: ``('ad,c,bc,ae', 'a') -> '_d,c,bc,ae'`` and
      ``('a,abc,ac,d,cf', 'a') -> 'abc,ac,d,cf'``.
    * Two-character ``da`` (such as ``'_b'``): for every position where it
      matches, the remainder of the string is taken and all copies of the
      three characters starting at the match are deleted from it.  The scan
      does not stop, so the last match determines the result.

    Returns:
        The suffix string, or an empty list when no usable occurrence exists
        (including when ``da`` only appears as the final character).
    """
    suffix = []
    last_index = len(string) - 1
    for pos, ch in enumerate(string):
        if ch == da:
            usable = pos == 0 or (pos != last_index and string[pos - 1] != "_")
            if usable:
                # Mark the matched item with "_" and drop it entirely when it
                # was the only item left in its itemset ("_,").
                suffix = string[pos:].replace(da, "_", 1).replace("_,", "")
                break
            # Otherwise the match is at the very end or follows "_": skip it.
        elif string[pos:pos + 2] == da:
            matched_block = string[pos:pos + 3]
            suffix = string[pos:].replace(matched_block, "")
            # Trace output emitted each time the two-character branch matches.
            print(4)
    return suffix

在这里插入图片描述

def _collect_suffixes(sequences, target):
    """Return the non-empty results of ``dd(sequence, target)`` for each sequence."""
    suffixes = []
    for item in sequences:
        suffix = dd(item, target)
        if len(suffix) > 0:
            suffixes.append(suffix)
    return suffixes


def first_level(un, S, new_list=None):
    """Project the sequences in ``S`` onto each prefix and report its support.

    Args:
        un: Prefix items to project on; only used when ``new_list`` is None.
        S: Sequences (comma-separated strings) to project.
        new_list: Optional list of extended prefixes.  For each entry the last
            character is the item to project on: entries containing ``','``
            use that character as is, the others use it with a leading
            ``'_'``.

    Returns:
        ``(dict1, r_1_di)`` where ``dict1`` maps each key to its list of
        non-empty suffixes and ``r_1_di`` maps the same key to the number of
        suffixes.  With ``new_list`` the key is the last character of the
        entry, so entries sharing a last character overwrite one another
        (the later entry wins).  Both dicts are empty when there is nothing
        to iterate over.
    """
    # Initialise both results up front so that an empty ``un`` / ``new_list``
    # returns empty dicts instead of failing on an unbound local.
    dict1 = {}
    r_1_di = {}
    if new_list is None:
        for lab in un:
            list_all = _collect_suffixes(S, lab)
            print("以",lab,'前缀:\n',"后缀为",list_all,"support:",len(list_all))
            dict1[lab] = list_all
            r_1_di[lab] = len(list_all)
    else:
        for lab1 in new_list:
            last_item = lab1[-1]
            # dd() is called once per sequence with the form selected by the
            # prefix shape; the results match testing both forms in turn.
            target = last_item if ',' in lab1 else '_' + last_item
            list_all = _collect_suffixes(S, target)
            print("以",lab1,'前缀:\n',"后缀为",list_all,"support:",len(list_all))
            dict1[last_item] = list_all
            r_1_di[last_item] = len(list_all)
    return dict1, r_1_di

在这里插入图片描述

def two_level(dict1, r_1_di):
    """Keep only the prefixes that have more than one suffix.

    Args:
        dict1: Mapping of prefix -> list of suffix strings.
        r_1_di: Mapping of prefix -> number of suffixes.

    Returns:
        ``(dict2, dict3, name)``:

        * ``dict2`` -- the entries of ``dict1`` whose suffix list has more
          than one element, with every suffix equal to ``'_'`` removed.  The
          length test is applied before the ``'_'`` entries are removed.
        * ``dict3`` -- the entries of ``r_1_di`` whose key is in ``dict2``;
          ``r_1_di`` itself is not modified.
        * ``name`` -- the keys of ``dict2`` in order.
    """
    dict2 = {
        key: [suffix for suffix in value if suffix != '_']
        for key, value in dict1.items()
        if len(value) > 1
    }
    name = list(dict2)
    kept = set(name)
    dict3 = {key: count for key, count in r_1_di.items() if key in kept}
    return dict2, dict3, name

在这里插入图片描述

def match_dict(x, min_support=2):
    """Return the keys of ``x`` whose count reaches the minimum support.

    Args:
        x: Mapping of item -> occurrence count.
        min_support: Smallest count that qualifies.  The default of 2 matches
            the previously hard-coded ``count > 1`` test for integer counts.

    Returns:
        List of qualifying keys, in the mapping's iteration order.
    """
    return [key for key, count in x.items() if count >= min_support]
def mm(list_all):
    """Find the candidate next-level prefix items among a prefix's suffixes.

    Each suffix is split on ``','``.  Itemsets that start with ``'_'`` are
    counted as whole strings; every other itemset is counted character by
    character.  Counts are total occurrences over all suffixes.  The
    candidates are whatever ``match_dict`` keeps from each count table.

    Args:
        list_all: List of suffix strings belonging to one prefix.

    Returns:
        List of candidate items: plain single characters first, followed by
        the ``'_'``-prefixed itemsets, each group in first-seen order.
    """
    plain_counts = {}
    underscore_counts = {}
    for suffix in list_all:
        for itemset in suffix.split(','):
            if itemset.startswith("_"):
                # Keep "_xy" style itemsets intact instead of splitting them.
                underscore_counts[itemset] = underscore_counts.get(itemset, 0) + 1
            else:
                for symbol in itemset:
                    plain_counts[symbol] = plain_counts.get(symbol, 0) + 1
    # The two tables are kept separate so that, when no plain itemset exists,
    # underscore itemsets cannot leak into the plain-item result.
    list_x = match_dict(plain_counts)
    list_x.extend(match_dict(underscore_counts))
    return list_x
def dss(dict2):
    """Recursively extend each prefix in ``dict2`` by one more item.

    For every ``prefix -> suffixes`` entry the candidate items are obtained
    from ``mm``, projected with ``first_level`` and filtered with
    ``two_level``.  Progress is reported with ``print``.

    Args:
        dict2: Mapping of prefix -> list of suffix strings.

    Returns:
        ``(dict2_2, dict11_2)`` as produced by ``two_level`` for the last
        entry processed; two empty dicts when that entry had no candidates or
        when ``dict2`` is empty.
    """
    # Defaults keep the return value defined when ``dict2`` has no entries.
    dict2_2 = {}
    dict11_2 = {}
    for key, value in dict2.items():
        print(key,'下需要处理的序列数',str(len(value)))
        list_x = mm(value)
        if not list_x:
            print('没有支持度大于2的下一级前缀','\n')
            dict2_2 = {}
            dict11_2 = {}
            continue
        # "_x..." candidates join the current itemset (key + second character);
        # plain candidates start a new itemset (key + ',' + candidate).
        new_list = [
            key + item[1] if item.startswith("_") else key + ',' + item
            for item in list_x
        ]
        dict1_2, r_1_di_2 = first_level(list_x, value, new_list)
        dict2_2, dict11_2, _ = two_level(dict1_2, r_1_di_2)
        # NOTE(review): recursion only happens when more than one prefix
        # survives; a single surviving prefix is not extended -- confirm that
        # this is intended.
        if len(dict2_2) > 1:
            print("接上一步，继续处理下一级：")
            dss(dict2_2)
            print("******处理完毕*******",'\n')
        else:
            print("-----没有下一级处理-----")
    return dict2_2, dict11_2

在这里插入图片描述

from collections import defaultdict as de
# Driver script: load the sequences, build the first-level projections and
# then extend the retained prefixes recursively.
# read() returns the extracted sequences together with their count.
S,support=read("PrefixSpan.txt")
print("需要处理的",support,"个序列，它们是：")
print("序列存在列表中：",S)
# Distinct single-character items found across all sequences.
un=union_str(S)
# NOTE: un[0] raises IndexError if no item was found.
print("长度为",str(len(un[0])),"的前缀是：",un)
# Prefix -> suffixes, and prefix -> number of suffixes.
dict1,r_1_di=first_level(un,S,None)
print('\n')
# Keep only the prefixes that have more than one suffix.
dict2,dict11,dict2_k=two_level(dict1,r_1_di)
#print(dict2,dict11)
# NOTE: dict2_k[0] raises IndexError if no prefix was kept.
print("长度为",str(len(dict2_k[0])),"支持度大于2的前缀是：",dict2_k,'\n')
# Recursive extension of the retained prefixes.
dict2_2,dict11_2=dss(dict2)

**整理课件不易，走过路过觉得课程内容不错，请帮忙点赞、收藏！Thanks♪(･ω･)ﾉ** **如需转载，请注明出处。**

参考文献

(1) https://www.cnblogs.com/pinard/p/6323182.html

序列模式挖掘——PrefixSpan算法实战

序列模式PrefixSpan

参考文献

猜你喜欢