代码来源于:https://github.com/hLvMxM/Learning_Data_Mining_with_Python/blob/master/Chapter 1/ch1_affinity.ipynb
其中注释是在自己学习中加上去的,
便于初学者看懂
分析文本为:affinity_dataset.txt
0 0 1 1 1
1 1 0 1 0
1 0 1 1 0
0 0 1 1 1
0 1 0 0 1
0 1 0 0 0
1 0 0 0 1
1 0 0 0 1
0 0 0 1 1
0 0 1 1 1
1 1 0 0 1
0 1 0 0 0
0 0 0 0 1
0 0 1 0 1
0 1 0 0 1
0 0 1 1 1
1 0 0 0 1
0 0 1 1 1
1 1 0 0 0
0 1 0 0 0
0 0 1 0 0
0 1 0 0 1
0 1 0 0 0
0 1 0 0 1
0 0 1 1 1
0 0 1 1 0
0 0 1 0 1
0 0 0 0 1
0 1 0 0 0
0 1 0 1 0
1 1 1 0 1
1 1 0 0 1
0 0 1 1 1
0 0 1 0 1
0 0 1 1 1
0 0 1 1 0
0 1 1 0 1
0 0 1 1 0
0 1 0 0 1
0 0 0 0 1
0 0 1 0 1
1 1 0 1 1
1 0 0 0 1
0 0 1 1 1
0 1 0 0 0
0 1 0 1 1
0 1 0 0 0
0 1 0 0 0
0 0 1 1 0
0 0 1 1 1
0 1 0 1 0
0 1 1 0 0
0 0 1 1 0
0 0 1 1 1
1 0 0 0 0
0 1 0 1 0
1 0 0 0 1
0 1 0 0 0
0 0 0 0 1
0 0 1 1 1
0 1 1 1 1
1 1 0 0 0
0 0 1 0 1
1 0 0 0 1
1 1 0 0 0
0 1 1 0 0
0 0 0 0 1
0 1 0 0 0
0 0 1 1 1
0 1 0 0 1
1 0 0 0 1
1 0 0 0 1
0 1 0 0 1
0 0 1 1 1
1 0 1 0 1
1 1 0 0 1
0 1 0 0 1
1 1 1 0 1
0 0 1 1 1
1 0 0 0 0
0 0 1 1 1
1 1 0 1 0
0 0 1 0 0
0 0 1 0 1
0 1 0 0 0
1 1 0 0 0
0 0 0 1 0
0 0 0 1 1
0 1 0 0 0
0 1 0 0 0
1 1 0 0 1
0 0 1 0 0
0 1 0 0 1
1 1 0 1 0
1 0 0 0 1
0 1 0 0 0
0 0 1 1 0
0 1 1 0 0
0 0 1 1 0
0 0 0 0 1
代码为:
# @Time : 2018/12/3 上午10:13
# @Author : 郑超
# @Desc :
# In [1]:
import numpy as np
from operator import itemgetter
from collections import defaultdict
# Load the transaction matrix: each row is one shopper's basket, each column a
# product (1 = bought, 0 = not bought).
dataset_filename = "affinity_dataset.txt"
X = np.loadtxt(dataset_filename)
n_samples, n_features = X.shape  # number of transactions / number of products
# Column index -> product name.
features = ["bread", "milk", "cheese", "apples", "bananas"]

# For each candidate rule "if a person buys <premise> they also buy <conclusion>":
valid_rules = defaultdict(int)    # times the rule held (both products bought together)
invalid_rules = defaultdict(int)  # times it failed (premise bought, conclusion not)
num_occurences = defaultdict(int) # times the premise product was bought at all

for sample in X:
    for premise in range(n_features):
        if sample[premise] == 0:
            continue  # premise product not in this basket; rule not applicable
        num_occurences[premise] += 1
        for conclusion in range(n_features):
            if premise == conclusion:
                continue  # a product trivially co-occurs with itself; skip
            if sample[conclusion] == 1:
                valid_rules[(premise, conclusion)] += 1
            else:
                invalid_rules[(premise, conclusion)] += 1

# Support = number of times the rule held in the dataset.
support = valid_rules
# Confidence = support / occurrences of the premise on its own.
confidence = defaultdict(float)
for premise, conclusion in valid_rules.keys():
    confidence[(premise, conclusion)] = valid_rules[(premise, conclusion)] / num_occurences[premise]
# In [8]:
# Print every discovered rule together with its confidence and support.
for premise, conclusion in confidence:  # dict keys are (premise, conclusion) tuples
    premise_name = features[premise]
    conclusion_name = features[conclusion]
    print("Rule: If a person buys {0} they will also buy {1}".format(premise_name, conclusion_name))
    print(" - Confidence: {0:.3f}".format(confidence[(premise, conclusion)]))
    print(" - Support: {0}".format(support[(premise, conclusion)]))
    print("")
def print_rule(premise, conclusion, support, confidence, features):
    """Print one association rule with its confidence and support.

    premise / conclusion are feature (column) indices; support and
    confidence are dicts keyed by (premise, conclusion) tuples;
    features maps a column index to its product name.
    """
    premise_name = features[premise]
    conclusion_name = features[conclusion]
    print("Rule: If a person buys {0} they will also buy {1}".format(premise_name, conclusion_name))
    print(" - Confidence: {0:.3f}".format(confidence[(premise, conclusion)]))
    print(" - Support: {0}\n".format(support[(premise, conclusion)]))
"""输出支持度最高的前五个元素"""
sorted_support = sorted(support.items(), key=itemgetter(1), reverse=True) # 使支持度字典按照value 来进行倒叙排练
for index in range(5):
print("Rule #{0}".format(index + 1))
premise, conclusion = sorted_support[index][0]
print_rule(premise, conclusion, support, confidence, features)
print("*" * 60)
"""输出置信度最高的前五个元素"""
sorted_confidence = sorted(confidence.items(), key=itemgetter(1), reverse=True) # 使支持度字典按照value 来进行倒叙排练
for index in range(5):
print("Rule #{0}".format(index + 1))
premise, conclusion = sorted_confidence[index][0]
print_rule(premise, conclusion, support, confidence, features)