使用泊松方法检查序列异常

import sys
from math import sqrt
def pearson_distance(vector1, vector2) :
    if len(vector1) == 1:
        return abs(vector1[0]-vector2[0])
    """
    Calculate distance between two vectors using pearson method
    See more : http://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient
    """
    '''
    print('vector1')
    print(vector1)
    print('vector2')
    print(vector2)
    '''
    sum1 = sum(vector1)
    sum2 = sum(vector2)

    sum1Sq = sum([pow(v,2) for v in vector1])
    sum2Sq = sum([pow(v,2) for v in vector2])

    pSum = sum([vector1[i] * vector2[i] for i in range(len(vector1))])

    num = pSum - (sum1*sum2/len(vector1))
    den = sqrt((sum1Sq - pow(sum1,2)/len(vector1)) * (sum2Sq - pow(sum2,2)/len(vector1)))

    if den == 0 : return 0.0
    return 1.0 - num/den
    
def pick_outlier(vlist,vdict):
    vlen = len(vdict[vlist[0]])
    avg = [0.0 for i in range(vlen) ]
    #print('avg init')
    #print(avg)
    for x in vdict:
        vec = vdict[x]
        for i in range(len(vec)) :
            avg[i] += vec[i]
    avg = [ item / len(vlist) for item in avg]
    #print('avg done')
    #print(avg)
    
    max1=0
    max1no=0
    max2=0
    max2no=0
    for i in range(1,len(vlist)-1):
        #print(i+1,abs(vlist[i]-vlist[i-1])+abs(vlist[i]-vlist[i+1]),abs(vlist[i]-median(vlist)),round(abs(vlist[i]-avg(vlist)),2))
        pd1 = pearson_distance(vdict[vlist[i]],vdict[vlist[i-1]])
        pd2 = pearson_distance(vdict[vlist[i]],vdict[vlist[i+1]])
        #print('pd1 pd2:',pd1,pd2)
        if pd1+pd2>max1:
            max1=pd1+pd2
            max1no=i+1
        avgpd = pearson_distance(vdict[vlist[i]],avg)    
        #print('avgpd:',avgpd)
        if avgpd>max2:
            max2=avgpd
            max2no=i+1
            
    print('pick max1 max2:',max1no,max2no,vlist[max1no-1],vlist[max2no-1])            
    if max1no == max2no:
        return max1no
    else:
        return -1    
    
fd = open(sys.argv[1])
vlist=[]
vdict={}
for v in fd:
    v = v.replace('\r','').replace('\n','')
    tmpv = v.split()
    vlist.append(tmpv[0])
    vdict[str(tmpv[0])]=[]
    for i in range(1,len(tmpv)):
        vdict[str(tmpv[0])].append(float(tmpv[i]))
fd.close()


roundno = 1
pickno = pick_outlier(vlist,vdict)
print(len(vlist),len(vdict))
while pickno != -1:
    print('round:',roundno,' pick outlier:',pickno)    
    roundno+=1
    '''
    if roundno>1:
        break
    '''    
    vdict.pop(vlist[pickno-1])
    vlist.pop(pickno-1)
    print(len(vlist),len(vdict))
    pickno = pick_outlier(vlist,vdict)
    

猜你喜欢

转载自blog.csdn.net/b0207191/article/details/95333891