Clustering is an example of unsupervised learning; you can look up the formal definition yourself, so let's get straight to the point. First, a word about the data: I collected it from a foreign website, and it consists of the last statements made by death row inmates before their execution, together with some personal information about each inmate. It is provided for reference only.
First, how the data was scraped. This example uses urllib2, bs4 and sgmllib (SGMLParser): urllib2 downloads the pages, while bs4 and SGMLParser parse them, and the results are written to a file. The full scraper is listed below.
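Before diving into the full scraper, here is a minimal sketch of that fetch/parse split. The index URL is the real one used later in the code; the selector and the printout are only illustrative.

# coding=utf-8
# Minimal sketch (Python 2): urllib2 fetches the raw HTML, BeautifulSoup parses it.
import urllib2
from bs4 import BeautifulSoup

url = 'http://www.tdcj.state.tx.us/death_row/dr_executed_offenders.html'
page = urllib2.urlopen(url).read()   # download the page
soup = BeautifulSoup(page, 'lxml')   # parse the HTML

# print the cells of the first table row, just to confirm parsing works
for row in soup.find_all('tr')[:1]:
    print [cell.get_text(strip=True) for cell in row.find_all(['th', 'td'])]

The full scraper below does the same fetching and parsing, but additionally walks the table row by row with an SGMLParser subclass and follows the per-offender detail pages.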
# coding=utf-8
import urllib2
from bs4 import BeautifulSoup
from sgmllib import SGMLParser
class FirstParser(SGMLParser):
def __init__(self):
SGMLParser.__init__(self)
self.__start_tbody = False
self.__start_tr = False
self.__start_td = False
self.__start_th = False
self.__start_a = False
self.__td_state = 0
self.__tr_value = []
self.data = []
def start_tbody(self, attr):
self.__start_tbody = True
def end_tbody(self):
self.__start_tbody = False
def start_tr(self, attrs):
if self.__start_tbody:
self.__start_tr = True
def end_tr(self):
if self.__start_tbody and self.__start_tr:
self.data.append(self.__tr_value)
self.__tr_value = []
self.__start_tr = False
def start_th(self, attrs):
if self.__start_tbody and self.__start_tr:
self.__start_th = True
def end_th(self):
if self.__start_tbody and self.__start_tr and self.__start_th:
self.__start_th = False
def start_td(self, attrs):
if self.__start_tbody and self.__start_tr:
self.__start_td = True
self.__td_state += 1
def end_td(self):
if self.__start_tbody and self.__start_tr and self.__start_td:
self.__start_td = False
self.__td_state = 0
def start_a(self, attrs):
if self.__start_tbody and self.__start_tr:
self.__tr_value.append(attrs[0][1])
# print attrs
self.__start_a = True
def end_a(self):
if self.__start_tbody and self.__start_tr and self.__start_td:
self.__start_a = False
def handle_data(self, data):
if self.__start_tbody and self.__start_tr and \
(self.__start_td or self.__start_th):
if self.__start_th:
self.__tr_value.append(data)
if self.__start_td:
# if self.__td_state != 2 or self.__td_state != 3:
self.__tr_value.append(data)
def read_first(page):
soup = BeautifulSoup(page, 'lxml')
value = []
for row in soup.find_all('tbody'):
tbody = row.find_all('tr')
print len(tbody)
for index, r in enumerate(tbody):
t = []
if index == 0:
for k in r.find_all('th'):
t.append(k.string)
else:
for k in r.find_all('td'):
t.append(k.string)
value.append(t)
return value
def download_second(url):
url = 'http://www.tdcj.state.tx.us/death_row/' + url
page = urllib2.urlopen(url).read()
page = page.replace('<br />', '')
soup = BeautifulSoup(page, 'lxml')
vl = []
v2 = []
for row in soup.find('table').find_all('tr'):
td = row.find_all('td')
vl.append(fun_replace(td[len(td) - 1].string))
p = soup.find_all('p')
for row in p[1:]:
temp = []
if len(row.find_all('span')) > 0:
# temp.append(fun_replace(str(row.find_all('span')[0].string)))
try:
temp.append(fun_replace(str(row.text.split('\r\n')[1].strip())))
except:
temp.append('')
else:
# temp.append(row.string)
temp.append('')
v2.append(temp)
return [vl, v2]
def download_three(url):
url = 'http://www.tdcj.state.tx.us/death_row/' + url
page = urllib2.urlopen(url).read()
soup = BeautifulSoup(page, 'lxml')
p = soup.find_all('p')
v1 = []
if len(p) >= 6:
for index, row in enumerate(p):
if index % 2 == 1:
v1.append([fun_replace(p[index].string),
fun_replace(p[index + 1].string)])
if index >= 5:
break
return v1
def fun_replace(s):
return s.replace(',', '.') if s is not None else ''
def down_first():
url = 'http://www.tdcj.state.tx.us/death_row/dr_executed_offenders.html'
# page = urllib2.urlopen(url).read()
page = open('first.html').read()
first = FirstParser()
first.feed(page)
value = first.data
with open('first.txt', 'a+') as f:
for index, row in enumerate(value):
print row[0]
if index == 0:
continue
value = 'Execution,Name,TDCJ Number,Date of Birth,Date Received,' \
'Age (when Received),Education Level (Highest Grade Completed),' \
'Date of Offense,Age (at the time of Offense),County,Race,Gender,' \
'Hair Color,Height,Weight,Eye Color,Native County,Native State,' \
'Prior Occupation,Prior Prison Record,Summary of Incident,Co-Defendants,' \
'Race and Gender of Victim,Date of Execution,Offender,Last Statement,' \
'Last Name,First Name,Race,County\n'
f.write(value)
else:
try:
se = download_second(row[1])
th = download_three(row[3])
value = row[0] + ',' + se[0][0] + ',' + se[0][1] + ',' + se[0][2] + ',' + se[0][3] \
+ ',' + se[0][4] + ',' + se[0][5] \
+ ',' + se[0][6] + ',' + se[0][7] + ',' + se[0][8] + ',' + se[0][9] + ',' + se[0][10] \
+ ',' + se[0][11] + ',' + se[0][12] + ',' + se[0][13] + ',' + se[0][14] \
+ ',' + se[0][15] + ',' + se[0][16] + ',' + se[1][0][0] + ',' + se[1][1][0] + ',' \
+ se[1][2][0] + ',' + se[1][3][0] + ',' + se[1][4][0] + ',' + th[0][1] + ',' + th[1][0] \
+ ',' + th[2][0] + ',' + row[3] + ',' + row[4] + ',' + row[-2] + ',' + row[-1] + '\n'
f.write(value.encode('utf-8'))
except BaseException as e:
print e
# break
down_first()
I do not recommend scraping the data directly with the code above: there are some traps, since quite a lot of the information is presented as images and cannot be extracted. You can simply download the data file that comes with this post and use that instead.
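If you just want to sanity-check the downloaded file, a quick look like the following is enough (first.txt is the file name used throughout the code; the counts you see will of course depend on the data you downloaded):

# coding=utf-8
# Quick sanity check of the comma-separated data file written by the scraper.
with open('first.txt') as f:
    lines = f.readlines()
print 'rows (including the header):', len(lines)
print 'columns:', len(lines[0].strip().split(','))
print 'header:', lines[0].strip()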
Now let's move on to the clustering analysis.
# coding=utf-8
import math
import random
import re
def height(s):
if s.find('\'') != -1:
t = s.replace('"', '').split('\'')
elif s.find('ft') != -1:
t = s.replace('.', '').replace('in', '').split('ft')
elif s.find('-') != -1:
t = s.split('-')
elif len(s.strip()) == 0:
t = ['5', '11']
else:
t = ['5', '11']
v = [float(t[0].strip()), float(t[1].strip() if len(t[1].strip()) != 0 else '0')]
return round((12 * v[0] + v[1]) * 30.48 / 12, 2)
def grade(s):
p = re.match(r'\d+', s)
if p is None:
return 12
else:
sp = p.span()
return int(s[sp[0]:sp[1]])
def load_dataset():
dataSet = []
labels = []
titles = []
with open('first.txt', 'r+') as f:
for index, row in enumerate(f.readlines()):
if index == 0:
titles = [row for row in row.strip().split(',')]
else:
t = [row for row in row.strip().split(',')]
dataSet.append([int(t[5]), grade(t[6]), height(t[13]), float(t[14].replace('lbs.', ''))])
labels.append([row.strip() for index, row in enumerate(t) if index not in (14, 13, 6, 5)])
return dataSet, labels, titles
def pearson(v1, v2):
    '''
    Pearson correlation distance between two vectors
    (1.0 minus Pearson r, so smaller means more similar).
    :param v1: first feature vector
    :param v2: second feature vector
    :return: distance in the range [0, 2]
    '''
sum1 = sum(v1)
sum2 = sum(v2)
sum1Sq = sum([pow(x, 2) for x in v1])
sum2Sq = sum([pow(x, 2) for x in v2])
psum = sum([v1[index] * v2[index] for index in range(len(v1))])
    # compute Pearson r from the sums above
num = psum - (sum1 * sum2 / len(v1))
den = math.sqrt((sum1Sq - pow(sum1, 2) / len(v1)) * (sum2Sq - pow(sum2, 2) / len(v1)))
if den == 0: return 0
return 1.0 - num / den
def euclidean(v1, v2):
    '''
    Euclidean distance between two vectors.
    :param v1: first feature vector
    :param v2: second feature vector
    :return: distance (>= 0)
    '''
return math.sqrt(sum([pow(v1[i] - v2[i], 2) for i in range(len(v1))]))
class bicluster:
def __init__(self, vec, left=None, right=None, distance=0.0, id=None):
self.left = left
self.right = right
self.vec = vec
self.id = id
self.distance = distance
def hcluster(rows, distance=pearson):
    '''
    Hierarchical (agglomerative) clustering: repeatedly merge the two
    closest clusters until only one remains, and return the root.
    :param rows: list of feature vectors
    :param distance: distance function
    :return: root bicluster of the tree
    '''
distances = {}
currentclustid = -1
clust = [bicluster(rows[i], id=i) for i in range(len(rows))]
while len(clust) > 1:
lowestpair = (0, 1)
closest = distance(clust[0].vec, clust[1].vec)
for i in range(len(clust)):
for j in range(i + 1, len(clust)):
if (clust[i].id, clust[j].id) not in distances:
distances[(clust[i].id, clust[j].id)] = \
distance(clust[i].vec, clust[j].vec)
d = distances[(clust[i].id, clust[j].id)]
if d < closest:
closest = d
lowestpair = (i, j)
mergevec = [
(clust[lowestpair[0]].vec[i] + clust[lowestpair[1]].vec[i]) / 2.0
for i in range(len(clust[0].vec))]
newcluster = bicluster(mergevec, left=clust[lowestpair[0]],
right=clust[lowestpair[1]],
distance=closest, id=currentclustid)
currentclustid -= 1
del clust[lowestpair[1]]
del clust[lowestpair[0]]
clust.append(newcluster)
return clust[0]
def find(clust, labels, data, distance=pearson):
    '''
    Walk down the cluster tree and return the label of the leaf
    whose vector is closest to `data`.
    :param clust: root bicluster returned by hcluster
    :param labels: label rows loaded alongside the dataset
    :param data: feature vector to look up
    :param distance: distance function
    :return: the label row of the closest leaf
    '''
    while True:
        left = clust.left
        right = clust.right
        if left is None and right is None:
            # reached a leaf; its id is the index of an original row
            return labels[clust.id]
        elif left is None:
            # only the right child exists, so descend into it
            clust = right
        elif right is None:
            # only the left child exists, so descend into it
            clust = left
        else:
            # descend into whichever child is closer to the query vector
            ls = distance(left.vec, data)
            rs = distance(right.vec, data)
            clust = left if ls <= rs else right
def kcluster(rows, distance=pearson, k=4):
    '''
    K-means clustering.
    :param rows: list of feature vectors
    :param distance: distance function
    :param k: number of clusters
    :return: (bestmatches, clusters), the row indices per cluster and the centroids
    '''
ranges = [(min([row[i] for row in rows]), max([row[i] for row in rows])) \
for i in range(len(rows[0]))]
clusters = [[random.random() * (ranges[i][1] - ranges[i][0]) + ranges[i][0] \
for i in range(len(rows[0]))] for j in range(k)]
lastmatches = None
for t in range(100):
# print 'Iteration %d' % t
bestmatches = [[] for i in range(k)]
for j in range(len(rows)):
row = rows[j]
bestmatch = 0
for i in range(k):
d = distance(clusters[i], row)
if d < distance(clusters[bestmatch], row): bestmatch = i
bestmatches[bestmatch].append(j)
if bestmatches == lastmatches: break
lastmatches = bestmatches
for i in range(k):
avgs = [0.0] * len(rows[0])
if len(bestmatches[i]) > 0:
for rowid in bestmatches[i]:
for m in range(len(rows[rowid])):
avgs[m] += rows[rowid][m]
for j in range(len(avgs)):
avgs[j] /= len(bestmatches[i])
clusters[i] = avgs
return bestmatches, clusters
def find_k(bestmatches, clusters, dataSet, labels, data, distance=pearson):
    '''
    Find the centroid closest to `data`, then the closest row inside
    that cluster, and return its label row and feature vector.
    '''
    # locate the centroid nearest to the query vector
    best = 0
    best_value = float('inf')
    for i in range(len(clusters)):
        t1 = distance(clusters[i], data)
        if t1 < best_value:
            best = i
            best_value = t1
    # inside that cluster, locate the nearest original row
    best1 = -1
    best_value1 = float('inf')
    for row in bestmatches[best]:
        t1 = distance(dataSet[row], data)
        if t1 < best_value1:
            best1 = row
            best_value1 = t1
    return labels[best1], dataSet[best1]
If anything is unclear, see the comments in the code. There are two distance measures, the Pearson correlation coefficient and the Euclidean distance, and two clustering methods, hierarchical clustering and k-means. The distance function is passed in as a parameter, which makes the code easy to extend later, as the short sketch below shows.
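For example, because hcluster and kcluster take the distance function as a parameter, you can plug in your own metric without touching the clustering code. A minimal sketch, where the Manhattan distance is my own addition and not part of analysis.py:

# coding=utf-8
from analysis import *

def manhattan(v1, v2):
    '''Manhattan (city-block) distance, an extra metric not in analysis.py.'''
    return sum([abs(v1[i] - v2[i]) for i in range(len(v1))])

dataSet, labels, titles = load_dataset()
# hierarchical clustering with the custom metric
clust = hcluster(dataSet, distance=manhattan)
print find(clust, labels, [26, 16, 176, 160], distance=manhattan)
# k-means with the custom metric
bestmatches, clusters = kcluster(dataSet, distance=manhattan, k=4)
print find_k(bestmatches, clusters, dataSet, labels, [26, 16, 176, 160], manhattan)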
Below is the test program.
# coding=utf-8
from analysis import *
from show import *
dataSet, labels, titles = load_dataset()
# Test hierarchical clustering with the Pearson correlation distance
clust = hcluster(dataSet, distance=pearson)
result = find(clust, labels, [26, 16, 176, 160], distance=pearson)
print result
# Test hierarchical clustering with the Euclidean distance
# clust = hcluster(dataSet, distance=euclidean)
# result = find(clust, labels, [26, 16, 176, 160], distance=euclidean)
# print result
# Print the clustering result as an indented hierarchy
# printclust(clust)
# K-means clustering with the Pearson correlation distance
# bestmatches, clusters = kcluster(dataSet, pearson, 4)
# result = find_k(bestmatches, clusters, dataSet, labels, [26, 16, 176, 160], pearson)
# print result
I am still a beginner, so this is shared for learning together, and pointers from more experienced readers are very welcome. In the next post I will cover classification with KNN.