1 # coding: utf-8
2
# Apriori algorithm implemented in Python
4
5 # In[1]:
6
7
# Import the required libraries
9 from numpy import *
10
11
12 # In[2]:
13
14
def loadDataSet():
    """Return the small hard-coded transaction list used by the demos.

    Each inner list is one transaction made of integer item ids. A fresh
    list is built on every call.
    """
    transactions = [
        [1, 3, 4],
        [2, 3, 5],
        [1, 2, 3, 5],
        [2, 5],
    ]
    return transactions
17
18
19 # In[3]:
20
21
def createC1(dataSet):
    """Build the candidate 1-itemsets (C1) from a collection of transactions.

    Args:
        dataSet: Iterable of transactions, each an iterable of hashable,
            mutually orderable items.

    Returns:
        A list of single-element frozensets, one per distinct item, ordered
        by ascending item value.
    """
    # A set de-duplicates in O(1) per item instead of scanning a list.
    distinctItems = {item for transaction in dataSet for item in transaction}
    # Return a real list rather than a lazy map object: under Python 3 a map
    # iterator can only be walked once, but the candidates are iterated once
    # per transaction when support is counted.
    return [frozenset([item]) for item in sorted(distinctItems)]
30
31
32 # In[4]:
33
34
# Compute the support of each candidate in Ck over the data set D and return
# the candidates whose support is at least minSupport.
def scanD(D, Ck, minSupport):
    """Count candidate itemsets over the transactions and filter by support.

    Args:
        D: Iterable of transactions (anything accepted by
            ``frozenset.issubset``, e.g. sets).
        Ck: Iterable of candidate itemsets (frozensets).
        minSupport: Minimum fraction of transactions that must contain a
            candidate for it to be reported as frequent.

    Returns:
        A ``(retList, supportData)`` tuple. ``retList`` holds the candidates
        with support >= ``minSupport``, in reverse order of first occurrence.
        ``supportData`` maps every candidate found in at least one
        transaction to its support, including those below the threshold.
    """
    # Materialize both inputs: the candidates are walked once per
    # transaction, so a one-shot iterator (e.g. a Python 3 ``map`` object)
    # would be exhausted after the first transaction and undercounted.
    transactions = list(D)
    candidates = list(Ck)

    ssCnt = {}
    for tid in transactions:
        for can in candidates:
            if can.issubset(tid):
                ssCnt[can] = ssCnt.get(can, 0) + 1

    # With no transactions ssCnt is empty, so the division below never runs.
    numItems = float(len(transactions))
    retList = []
    supportData = {}
    for key, count in ssCnt.items():
        support = count / numItems
        if support >= minSupport:
            retList.append(key)
        supportData[key] = support
    # Append-then-reverse yields the same order as prepending each hit,
    # without the O(n) cost of inserting at the front every time.
    retList.reverse()
    return retList, supportData
54
55
56 # In[15]:
57
58
def aprioriGen(Lk, k):
    """Generate candidate k-itemsets by joining pairs of itemsets from Lk.

    Two itemsets are joined when their first ``k - 2`` items, taken in
    sorted order, are equal.

    Args:
        Lk: List of frozensets (expected to be frequent (k-1)-itemsets)
            whose items are mutually orderable.
        k: Size of the itemsets to generate.

    Returns:
        A list with the union of every matching pair ``(Lk[i], Lk[j])``,
        ``i < j``, in pair order.
    """
    # Sort BEFORE slicing. Set iteration order is arbitrary, so slicing the
    # unsorted items and sorting afterwards can pick different "first k-2"
    # items for sets that do share a sorted prefix, dropping valid joins.
    # Computing the prefixes once also avoids redoing the work per pair.
    prefixes = [sorted(itemset)[:k - 2] for itemset in Lk]

    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i + 1, lenLk):
            if prefixes[i] == prefixes[j]:
                retList.append(Lk[i] | Lk[j])
    return retList
71
72
73
74 # In[14]:
75
76
def apriori(dataSet, minSupport=0.5):
    """Find all frequent itemsets in dataSet with the Apriori algorithm.

    Args:
        dataSet: List of transactions, each an iterable of items.
        minSupport: Minimum support (fraction of transactions) an itemset
            needs to be kept.

    Returns:
        A ``(L, supportData)`` tuple. ``L[0]`` is the list returned by the
        scan over the 1-item candidates, and each further entry is the
        non-empty list returned by the scan over the next candidate size.
        ``supportData`` merges the support dictionaries from every scan.
    """
    # Materialize the candidates: they are iterated once per transaction
    # during the scan, so a lazy iterator would be exhausted too early.
    C1 = list(createC1(dataSet))
    D = list(map(set, dataSet))
    print('D:', D)
    L1, supportData = scanD(D, C1, minSupport)
    L = [L1]
    k = 2
    # L[k - 2] is always the most recently added level.
    while L[k - 2]:
        Ck = aprioriGen(L[k - 2], k)
        Lk, supK = scanD(D, Ck, minSupport)
        supportData.update(supK)
        if not Lk:
            # No frequent itemsets of size k: stop without appending an
            # empty level.
            break
        L.append(Lk)
        k += 1
    return L, supportData
93
94
95 # In[19]:
96
97
def calConf(freqSet, H, supportData, brl, minCOnf=0.7):
    """Evaluate candidate rule consequents for one frequent itemset.

    For each consequent in ``H`` the rule
    ``(freqSet - conseq) --> conseq`` is scored with
    ``support(freqSet) / support(freqSet - conseq)``.

    Args:
        freqSet: The frequent itemset (frozenset) the rules are drawn from.
        H: Iterable of candidate consequents (frozensets).
        supportData: Mapping from itemset to support; must contain
            ``freqSet`` and every ``freqSet - conseq``.
        brl: List that receives ``(antecedent, consequent, confidence)``
            tuples for accepted rules (mutated in place).
        minCOnf: Minimum confidence for a rule to be accepted. The
            parameter name is kept exactly as declared so that existing
            keyword callers keep working.

    Returns:
        The consequents from ``H`` whose rule met the confidence threshold.

    Raises:
        KeyError: If a required itemset is missing from ``supportData``.
    """
    prunedH = []
    for conseq in H:
        antecedent = freqSet - conseq
        conf = supportData[freqSet] / supportData[antecedent]
        if conf >= minCOnf:
            print(antecedent, '-->', conseq, 'conf', conf)
            brl.append((antecedent, conseq, conf))
            prunedH.append(conseq)
    return prunedH
107
108
109 # In[21]:
110
111
def rulesFromConseq(freqSet, H, supportData, brl, minCOnf=0.7):
    """Recursively grow rule consequents for one frequent itemset.

    The consequents in ``H`` (all of size ``m``) are joined into
    consequents of size ``m + 1``, which are scored by ``calConf``.
    Recursion continues while more than one consequent survives. The
    consequents in ``H`` themselves are not scored here.

    Args:
        freqSet: The frequent itemset (frozenset) the rules are drawn from.
        H: Non-empty list of candidate consequents (frozensets).
        supportData: Mapping from itemset to support.
        brl: List that accepted rules are appended to (mutated in place).
        minCOnf: Minimum confidence for a rule to be accepted. The
            parameter name is kept exactly as declared so that existing
            keyword callers keep working.
    """
    m = len(H[0])
    # A consequent of size m + 1 must leave at least one antecedent item.
    if len(freqSet) > (m + 1):
        Hmpl = aprioriGen(H, m + 1)
        Hmpl = calConf(freqSet, Hmpl, supportData, brl, minCOnf)
        print('Hmpl=', Hmpl)
        print('len(Hmpl)=', len(Hmpl), 'len(freqSet)=', len(freqSet))
        # At least two surviving consequents are needed to join further.
        if len(Hmpl) > 1:
            rulesFromConseq(freqSet, Hmpl, supportData, brl, minCOnf)
121
122
123 # In[9]:
124
125
def generateRules(L, supportData, minCOnf=0.7):
    """Generate association rules from the frequent itemsets in L.

    Args:
        L: List of itemset lists as returned by ``apriori``. ``L[0]`` is
            skipped; rules are built from ``L[1]`` onwards.
        supportData: Mapping from itemset to support.
        minCOnf: Minimum confidence for a rule to be reported. The
            parameter name is kept exactly as declared because callers pass
            it by keyword.

    Returns:
        A list of ``(antecedent, consequent, confidence)`` tuples.
    """
    bigRuleList = []
    for i in range(1, len(L)):
        for freqSet in L[i]:
            H1 = [frozenset([item]) for item in freqSet]
            if i > 1:
                # NOTE(review): on this path H1 only seeds the joins in
                # rulesFromConseq; the one-item consequents are never scored,
                # so no rules with a single-item consequent are emitted for
                # these itemsets. Confirm this is intended.
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minCOnf)
            else:
                calConf(freqSet, H1, supportData, bigRuleList, minCOnf)
    return bigRuleList
136
137
138 # In[10]:
139
140
def testApriori():
    """Run apriori on the sample data at two support thresholds and print the results."""
    dataSet = loadDataSet()
    print('dataSet:', dataSet)
    L1, supportData1 = apriori(dataSet, minSupport=0.7)
    print('L(0.7):', L1)
    print('supportData(0.7):', supportData1)
    print('------------------------------------------')
    L2, supportData2 = apriori(dataSet, minSupport=0.5)
    print('L(0.5):', L2)
    # Print the actual support dictionary, matching the 0.7 output above,
    # rather than a string that merely contains the variable's name.
    print('supportData(0.5):', supportData2)
    print('------------------------------------------')
152
153
154 # In[11]:
155
156
def testGenerateRules():
    """Mine the sample data at support 0.2 and print the generated rules."""
    dataSet = loadDataSet()
    L1, supportData1 = apriori(dataSet, minSupport=0.2)
    print('L(0.2):', L1)
    print('minSupport(0.2):', supportData1)
    # NOTE(review): confidence is support(set) / support(subset) and cannot
    # exceed 1, so a threshold of 1.1 yields no rules. Confirm the intended
    # value.
    rules = generateRules(L1, supportData1, minCOnf=1.1)
    print('Rules:', rules)
164
165
166 # In[12]:
167
168
def main():
    """Run the frequent-itemset demo, then the rule-generation demo."""
    for demo in (testApriori, testGenerateRules):
        demo()
172
173
174 # In[22]:
175
176
# Script entry point: run the demos only when executed directly, not on import.
if __name__ == "__main__":
    main()