Merge pull request #63 from jiangzhonglian/master

更新 9.树回归的注释 	更新 11.apriori算法注释 	更新 12.fpGrowth注释说明
This commit is contained in:
片刻
2017-04-03 00:34:28 +08:00
committed by GitHub
4 changed files with 27 additions and 17 deletions

View File

@@ -103,6 +103,7 @@ def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
bestS, bestIndex, bestValue = inf, 0, 0
# 循环处理每一列对应的feature值
for featIndex in range(n-1):
# [0]表示这一列的[所有行],不要[0]就是一个array[[所有行]]
for splitVal in set(dataSet[:, featIndex].T.tolist()[0]):
# 对该列进行分组然后组内的成员的val值进行 二元切分
mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
@@ -236,7 +237,7 @@ def linearSolve(dataSet):
# 如果矩阵的逆不存在,会造成程序异常
if linalg.det(xTx) == 0.0:
raise NameError('This matrix is singular, cannot do inverse,\ntry increasing the second value of ops')
# 最小二乘法求最优解
# 最小二乘法求最优解: w0*1+w1*x1=y
ws = xTx.I * (X.T * Y)
return ws, X, Y
@@ -291,7 +292,9 @@ if __name__ == "__main__":
# # 回归树
# myDat = loadDataSet('testData/RT_data1.txt')
# # myDat = loadDataSet('testData/RT_data2.txt')
# # print 'myDat=', myDat
# myMat = mat(myDat)
# # print 'myMat=', myMat
# myTree = createTree(myMat)
# print myTree
@@ -301,7 +304,7 @@ if __name__ == "__main__":
# myTree = createTree(myMat, ops=(0, 1))
# print myTree
# # 2.后剪枝就是:通过测试数据,对预测模型进行合并判断
# # 2. 后剪枝就是:通过测试数据,对预测模型进行合并判断
# myDatTest = loadDataSet('testData/RT_data3test.txt')
# myMat2Test = mat(myDatTest)
# myFinalTree = prune(myTree, myMat2Test)
@@ -330,11 +333,11 @@ if __name__ == "__main__":
print myTree2
print "模型树:", corrcoef(yHat2, testMat[:, 1],rowvar=0)[0, 1]
# # 线性回归
# ws, X, Y = linearSolve(trainMat)
# print ws
# m = len(testMat[:, 0])
# yHat3 = mat(zeros((m, 1)))
# for i in range(shape(testMat)[0]):
# yHat3[i] = testMat[i, 0]*ws[1, 0] + ws[0, 0]
# print "线性回归:", corrcoef(yHat3, testMat[:, 1],rowvar=0)[0, 1]
# 线性回归
ws, X, Y = linearSolve(trainMat)
print ws
m = len(testMat[:, 0])
yHat3 = mat(zeros((m, 1)))
for i in range(shape(testMat)[0]):
yHat3[i] = testMat[i, 0]*ws[1, 0] + ws[0, 0]
print "线性回归:", corrcoef(yHat3, testMat[:, 1],rowvar=0)[0, 1]

View File

@@ -100,7 +100,6 @@ def main(root):
# 退出按钮
Button(root, text="退出", fg="black", command=quit).grid(row=1, column=2)
# 创建一个画板 canvas
reDraw.f = Figure(figsize=(5, 4), dpi=100)
reDraw.canvas = FigureCanvasTkAgg(reDraw.f, master=root)

View File

@@ -43,7 +43,7 @@ def scanD(D, Ck, minSupport):
Args:
D 原始数据集, D用来判断CK中的元素是否存在于原数据D中
Ck 合并后的数据集
Ck 所有key的元素集合
Returns:
retList 支持度大于阈值的集合
supportData 全量key的字典集合
@@ -141,6 +141,8 @@ def apriori(dataSet, minSupport=0.5):
if len(Lk) == 0:
break
# Lk表示满足频繁子项的集合,L的元素在不断增加,例如:
# L=[[set(1), set(2), set(3)]]
# L=[[set(1), set(2), set(3)], [set(1, 2), set(2, 3)]]
L.append(Lk)
k += 1
# print 'k=', k, len(L[k-2])
@@ -157,7 +159,7 @@ def calcConf(freqSet, H, supportData, brl, minConf=0.7):
brl bigRuleList的空数组
minConf 置信度的阈值
Returns:
prunedH 记录 信度大于阈值的集合
prunedH 记录 可信度大于阈值的集合
"""
# 记录 可信度大于阈值的集合
prunedH = []
@@ -188,7 +190,7 @@ def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
"""
# H[0]是freqSet的元素组合的第一个元素
m = len(H[0])
# 判断freqSet的长度是否>组合的长度+1, 避免过度匹配 例如:计算过一边{1,2,3} 和 {1, 2} {1, 3},就没必要再计算了 {1,2,3}和{1,2,3}的组合关系
# 判断freqSet的长度是否>组合的长度+1, 避免过度匹配 例如:计算过一遍{1,2,3} 和 {1, 2} {1, 3},就没必要再进一步合并来计算 {1,2,3}和{1,2,3}的组合关系
if (len(freqSet) > (m + 1)):
print 'freqSet******************', len(freqSet), m + 1, freqSet, H, H[0]
# 合并数据集集合组合为2/3/..n的集合
@@ -209,7 +211,7 @@ def generateRules(L, supportData, minConf=0.7):
Args:
L 频繁项集的全集
supportData 所有元素和支持度的全集
minConf 信度的阈值
minConf 置信度的阈值
Returns:
bigRuleList 关于 (A->B+置信度) 3个字段的组合
"""
@@ -217,7 +219,9 @@ def generateRules(L, supportData, minConf=0.7):
# 循环L频繁项集所有的统一大小组合2/../n个的组合从第2组开始
for i in range(1, len(L)):
# 获取频繁项集中每个组合的所有元素
# [[frozenset([1]), frozenset([3]), frozenset([2]), frozenset([5])], [frozenset([1, 3]), frozenset([2, 5]), frozenset([2, 3]), frozenset([3, 5])], [frozenset([2, 3, 5])]]
for freqSet in L[i]:
# 假设freqSet=frozenset([1, 3]), 则H1=[frozenset([1]), frozenset([3])]
# 组合总的元素并遍历子元素并转化为冻结的set集合再存放到list列表中
H1 = [frozenset([item]) for item in freqSet]
# 2个的组合走else, 2个以上的组合走if

View File

@@ -41,6 +41,7 @@ def loadSimpDat():
['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
['z'],
['r', 'x', 'n', 'o', 's'],
# ['r', 'x', 'n', 'o', 's'],
['y', 'r', 'x', 'z', 'q', 't', 'p'],
['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]
return simpDat
@@ -49,7 +50,10 @@ def loadSimpDat():
def createInitSet(dataSet):
    """Count how many times each transaction occurs in dataSet.

    Args:
        dataSet     iterable of transactions; each transaction is an
                    iterable of hashable items
    Returns:
        retDict     dict mapping frozenset(transaction) -> occurrence count
    """
    retDict = {}
    for trans in dataSet:
        # frozenset is hashable, so it can serve as the dict key; item
        # order inside a transaction is irrelevant to the key.
        key = frozenset(trans)
        # dict.get accumulates duplicates in a single lookup and works on
        # both Python 2 and Python 3 (dict.has_key was removed in 3).
        retDict[key] = retDict.get(key, 0) + 1
    return retDict
@@ -193,7 +197,7 @@ def findPrefixPath(basePat, treeNode):
# prefixPath[1:] 变frozenset后字母就变无序了
# condPats[frozenset(prefixPath)] = treeNode.count
condPats[frozenset(prefixPath[1:])] = treeNode.count
# 递归,寻找改节点的一个 相同值的链接节点
# 递归,寻找该节点的下一个 相同值的链接节点
treeNode = treeNode.nodeLink
# print treeNode
return condPats