Mirror of https://github.com/apachecn/ailearning.git (synced 2026-04-13 18:01:04 +08:00)
Commit: slim down the git repository
38  src/py2.x/dl/activators.py  Normal file
@@ -0,0 +1,38 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-


import numpy as np


# Rectified linear unit: forward clips negatives to 0; backward is the
# derivative expressed in terms of the forward output.
class ReluActivator(object):
    def forward(self, weighted_input):
        # return weighted_input
        return max(0, weighted_input)

    def backward(self, output):
        return 1 if output > 0 else 0


# Identity activation: passes its input through unchanged.
class IdentityActivator(object):
    def forward(self, weighted_input):
        return weighted_input

    def backward(self, output):
        return 1


# Sigmoid activation: backward takes the *output* y and returns y * (1 - y).
class SigmoidActivator(object):
    def forward(self, weighted_input):
        return 1.0 / (1.0 + np.exp(-weighted_input))

    def backward(self, output):
        return output * (1 - output)


# Tanh activation: backward takes the *output* y and returns 1 - y^2.
class TanhActivator(object):
    def forward(self, weighted_input):
        return 2.0 / (1.0 + np.exp(-2 * weighted_input)) - 1.0

    def backward(self, output):
        return 1 - output * output
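Each activator pairs a forward map with its derivative expressed through the forward output. A minimal sketch (not part of the file) that checks SigmoidActivator.backward against a centered numerical derivative, assuming this module is importable as activators:

import numpy as np
from activators import SigmoidActivator

act = SigmoidActivator()
x = 0.5
y = act.forward(x)
eps = 1e-6
numeric = (act.forward(x + eps) - act.forward(x - eps)) / (2 * eps)
print(act.backward(y), numeric)  # both ~0.2350, i.e. y * (1 - y)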
863  src/py2.x/dl/bp.py  Normal file
@@ -0,0 +1,863 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-


from __future__ import print_function
import random
from numpy import *


# the sigmoid function
def sigmoid(inX):
    '''
    Desc:
        Implementation of the sigmoid function
    Args:
        inX --- the input value (scalar or vector)
    Returns:
        the result of applying the sigmoid function to the input
    '''
    return 1.0 / (1 + exp(-inX))


# node class of the neural network
class Node(object):
    '''
    Desc:
        A node of the neural network
    '''
    def __init__(self, layer_index, node_index):
        '''
        Desc:
            Initialize a node
        Args:
            layer_index --- index of the layer this node belongs to
            node_index --- index of the node within its layer
        Returns:
            None
        '''
        # the layer this node sits in
        self.layer_index = layer_index
        # the index of this node within the layer
        self.node_index = node_index
        # downstream connections, i.e. which nodes of the next layer this node feeds
        self.downstream = []
        # upstream connections, i.e. which nodes of the previous layer feed this node
        self.upstream = []
        # the output of this node
        self.output = 0
        # the error term (delta) of this node
        self.delta = 0

    def set_output(self, output):
        '''
        Desc:
            Set the node's output
        Args:
            output --- the node's output value
        Returns:
            None
        '''
        self.output = output

    def append_downstream_connection(self, conn):
        '''
        Desc:
            Add a connection to a downstream node
        Args:
            conn --- the connection to a downstream node
        Returns:
            None
        '''
        # append conn to the downstream list
        self.downstream.append(conn)

    def append_upstream_connection(self, conn):
        '''
        Desc:
            Add a connection to an upstream node
        Args:
            conn --- the connection to an upstream node
        Returns:
            None
        '''
        # append conn to the upstream list
        self.upstream.append(conn)

    def calc_output(self):
        '''
        Desc:
            Compute the node's output according to output = sigmoid(wTx)
        Args:
            None
        Returns:
            None
        '''
        # use reduce() to sum the weighted upstream outputs
        output = reduce(lambda ret, conn: ret + conn.upstream_node.output * conn.weight, self.upstream, 0)
        # apply the sigmoid function to the weighted sum to get this node's output
        self.output = sigmoid(output)

    def calc_hidden_layer_delta(self):
        '''
        Desc:
            Compute the delta of a hidden-layer node
        Args:
            None
        Returns:
            None
        '''
        # compute the hidden-layer delta according to Equation 4 of
        # https://www.zybuluo.com/hanbingtao/note/476663
        downstream_delta = reduce(lambda ret, conn: ret + conn.downstream_node.delta * conn.weight, self.downstream, 0.0)
        # compute this node's delta
        self.delta = self.output * (1 - self.output) * downstream_delta

    def calc_output_layer_delta(self, label):
        '''
        Desc:
            Compute the delta of an output-layer node
        Args:
            label --- the true label of the input vector, not the computed value
        Returns:
            None
        '''
        # the output-layer delta
        self.delta = self.output * (1 - self.output) * (label - self.output)

    def __str__(self):
        '''
        Desc:
            Render the node's information as a string
        Args:
            None
        Returns:
            the node's information
        '''
        # format: layer-node, its output, and its delta
        node_str = '%u-%u: output: %f delta: %f' % (self.layer_index, self.node_index, self.output, self.delta)
        # downstream connections
        downstream_str = reduce(lambda ret, conn: ret + '\n\t' + str(conn), self.downstream, '')
        # upstream connections
        upstream_str = reduce(lambda ret, conn: ret + '\n\t' + str(conn), self.upstream, '')
        # combine this node's info with its downstream and upstream connections
        return node_str + '\n\tdownstream:' + downstream_str + '\n\tupstream:' + upstream_str
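# The two delta rules above are the chain rule written out for a sigmoid
# unit; a small numeric sketch with assumed values:
# output-layer node: output y = 0.8, true label t = 1.0
y, t = 0.8, 1.0
delta_out = y * (1 - y) * (t - y)              # 0.8 * 0.2 * 0.2 = 0.032
# hidden node with output 0.6 whose only downstream connection has
# weight 0.5 and downstream delta 0.032
delta_hidden = 0.6 * (1 - 0.6) * (0.5 * 0.032) # 0.00384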
# ConstNode: a node whose output is always 1 (needed to compute the bias term wb)
class ConstNode(object):
    '''
    Desc:
        Constant node, i.e. the bias term in the computation
    '''
    def __init__(self, layer_index, node_index):
        '''
        Desc:
            Initialize the node
        Args:
            layer_index --- index of the layer this node belongs to
            node_index --- index of the node
        Returns:
            None
        '''
        self.layer_index = layer_index
        self.node_index = node_index
        self.downstream = []
        self.output = 1


    def append_downstream_connection(self, conn):
        '''
        Desc:
            Add a connection to a downstream node
        Args:
            conn --- the connection to a downstream node
        Returns:
            None
        '''
        # append conn to the downstream list
        self.downstream.append(conn)


    def calc_hidden_layer_delta(self):
        '''
        Desc:
            Compute the hidden-layer delta
        Args:
            None
        Returns:
            None
        '''
        # sum the downstream deltas according to Equation 4
        downstream_delta = reduce(lambda ret, conn: ret + conn.downstream_node.delta * conn.weight, self.downstream, 0.0)
        # compute this node's delta
        self.delta = self.output * (1 - self.output) * downstream_delta


    def __str__(self):
        '''
        Desc:
            Render the node's information as a string
        Args:
            None
        Returns:
            the node's information
        '''
        # format: layer-node and its constant output
        node_str = '%u-%u: output: 1' % (self.layer_index, self.node_index)
        # downstream connections
        downstream_str = reduce(lambda ret, conn: ret + '\n\t' + str(conn), self.downstream, '')
        # combine this node's info with its downstream connections
        return node_str + '\n\tdownstream:' + downstream_str


# Layer object: initializes one layer of the network and, as the collection
# of its Node objects, provides operations over that collection
class Layer(object):
    '''
    Desc:
        Layer class of the neural network
    '''

    def __init__(self, layer_index, node_count):
        '''
        Desc:
            Initialize a layer
        Args:
            layer_index --- index of the layer
            node_count --- number of nodes in the layer
        Returns:
            None
        '''
        # index of the layer
        self.layer_index = layer_index
        # list of nodes in the layer
        self.nodes = []
        # add the Node objects
        for i in range(node_count):
            self.nodes.append(Node(layer_index, i))
        # add the ConstNode (bias node) as the last node
        self.nodes.append(ConstNode(layer_index, node_count))

    def set_output(self, data):
        '''
        Desc:
            Set the layer's output; used when the layer is the input layer
        Args:
            data --- list of output values
        Returns:
            None
        '''
        # set the output of each node in the input layer
        for i in range(len(data)):
            self.nodes[i].set_output(data[i])

    def calc_output(self):
        '''
        Desc:
            Compute the layer's output vector
        Args:
            None
        Returns:
            None
        '''
        # iterate over all nodes except the last one (the constant bias node b)
        # and call each node's calc_output method
        for node in self.nodes[:-1]:
            node.calc_output()

    def dump(self):
        '''
        Desc:
            Print the layer's information
        Args:
            None
        Returns:
            None
        '''
        # print the information of every node in the layer
        for node in self.nodes:
            print(node)


# Connection object: records a connection's weight and the upstream and
# downstream nodes it links
class Connection(object):
    '''
    Desc:
        Connection object; records the weight and the linked upstream and
        downstream nodes (note: Connection, singular, without the s)
    '''
    def __init__(self, upstream_node, downstream_node):
        '''
        Desc:
            Initialize a Connection object
        Args:
            upstream_node --- the upstream node
            downstream_node --- the downstream node
        Returns:
            None
        '''
        # the upstream node
        self.upstream_node = upstream_node
        # the downstream node
        self.downstream_node = downstream_node
        # the weight, initialized to a random value between -0.1 and 0.1
        self.weight = random.uniform(-0.1, 0.1)
        # the gradient, initialized to 0.0
        self.gradient = 0.0

    def calc_gradient(self):
        '''
        Desc:
            Compute the gradient
        Args:
            None
        Returns:
            None
        '''
        # gradient = downstream node's delta * upstream node's output
        self.gradient = self.downstream_node.delta * self.upstream_node.output

    def update_weight(self, rate):
        '''
        Desc:
            Update the weight by gradient descent
        Args:
            rate --- the learning rate (also called the step size)
        Returns:
            None
        '''
        # compute the gradient first
        self.calc_gradient()
        # then update the weight by gradient descent
        self.weight += rate * self.gradient

    def get_gradient(self):
        '''
        Desc:
            Get the current gradient
        Args:
            None
        Returns:
            the current gradient
        '''
        return self.gradient

    def __str__(self):
        '''
        Desc:
            Render the connection's information as a string
        Args:
            None
        Returns:
            the connection's information
        '''
        # format: (upstream layer-node) -> (downstream layer-node) = weight
        return '(%u-%u) -> (%u-%u) = %f' % (
            self.upstream_node.layer_index,
            self.upstream_node.node_index,
            self.downstream_node.layer_index,
            self.downstream_node.node_index,
            self.weight)



# Connections object: provides operations over the collection of Connection objects
class Connections(object):
    '''
    Desc:
        Connections object (plural, with the s): operations over a
        collection of Connection objects
    '''
    def __init__(self):
        '''
        Desc:
            Initialize a Connections object
        Args:
            None
        Returns:
            None
        '''
        # initialize an empty list
        self.connections = []

    def add_connection(self, connection):
        '''
        Desc:
            Append a connection to the collection
        Args:
            connection --- the connection to add
        Returns:
            None
        '''
        self.connections.append(connection)

    def dump(self):
        '''
        Desc:
            Print the information of every connection
        Args:
            None
        Returns:
            None
        '''
        for conn in self.connections:
            print(conn)


# Network object: provides the public API
class Network(object):
    '''
    Desc:
        Network class
    '''
    def __init__(self, layers):
        '''
        Desc:
            Initialize a fully connected neural network
        Args:
            layers --- an array describing the number of nodes in each layer
        Returns:
            None
        '''
        # initialize the connections with a Connections object
        self.connections = Connections()
        # initialize the layers
        self.layers = []
        # number of layers in the network
        layer_count = len(layers)
        # number of nodes
        node_count = 0
        # create every layer and add it to layers
        for i in range(layer_count):
            self.layers.append(Layer(i, layers[i]))
        # for every layer except the output layer, create the connections
        # to the next layer and register them
        for layer in range(layer_count - 1):
            connections = [Connection(upstream_node, downstream_node) for upstream_node in self.layers[layer].nodes for downstream_node in self.layers[layer + 1].nodes[:-1]]
            # register every connection
            for conn in connections:
                self.connections.add_connection(conn)
                # register conn as an upstream connection of its downstream node
                conn.downstream_node.append_upstream_connection(conn)
                # register conn as a downstream connection of its upstream node
                conn.upstream_node.append_downstream_connection(conn)


    def train(self, labels, data_set, rate, epoch):
        '''
        Desc:
            Train the neural network
        Args:
            labels --- array of training labels, one element per sample
            data_set --- 2-D array of training features, one row per sample
            rate --- the learning rate
            epoch --- the number of iterations
        Returns:
            None
        '''
        # iterate epoch times
        for i in range(epoch):
            # iterate over every training sample
            for d in range(len(data_set)):
                # train on this single sample
                self.train_one_sample(labels[d], data_set[d], rate)
                # print 'sample %d training finished' % d

    def train_one_sample(self, label, sample, rate):
        '''
        Desc:
            Internal method: train the network with one sample
        Args:
            label --- the sample's label
            sample --- the sample's features
            rate --- the learning rate
        Returns:
            None
        '''
        # run the network's predict method on this sample
        self.predict(sample)
        # compute the deltas for this sample
        self.calc_delta(label)
        # update the weights
        self.update_weight(rate)

    def calc_delta(self, label):
        '''
        Desc:
            Compute the delta of every node
        Args:
            label --- the sample's true value, i.e. its label
        Returns:
            None
        '''
        # get all nodes of the output layer
        output_nodes = self.layers[-1].nodes
        # iterate over the label components
        for i in range(len(label)):
            # compute the output-layer deltas
            output_nodes[i].calc_output_layer_delta(label[i])
        # the slice [-2::-1] walks the layers backwards starting from the
        # second-to-last element, e.g. aaa = [1,2,3,4,5,6,7,8,9] gives
        # aaa[-2::-1] == [8, 7, 6, 5, 4, 3, 2, 1]
        # in effect: traverse every layer except the output layer in reverse order
        for layer in self.layers[-2::-1]:
            # iterate over every node in the layer
            for node in layer.nodes:
                # compute the hidden-layer delta
                node.calc_hidden_layer_delta()

    def update_weight(self, rate):
        '''
        Desc:
            Update the weight of every connection
        Args:
            rate --- the learning rate
        Returns:
            None
        '''
        # traverse every layer except the output layer in forward order
        for layer in self.layers[:-1]:
            # iterate over every node in the layer
            for node in layer.nodes:
                # iterate over the node's downstream connections
                for conn in node.downstream:
                    # update the connection's weight
                    conn.update_weight(rate)

    def calc_gradient(self):
        '''
        Desc:
            Compute the gradient of every connection
        Args:
            None
        Returns:
            None
        '''
        # traverse every layer except the output layer in forward order
        for layer in self.layers[:-1]:
            # iterate over every node in the layer
            for node in layer.nodes:
                # iterate over the node's downstream connections
                for conn in node.downstream:
                    # compute the gradient
                    conn.calc_gradient()

    def get_gradient(self, label, sample):
        '''
        Desc:
            Get the gradient of every connection for one sample
        Args:
            label --- the sample's label
            sample --- the sample's features
        Returns:
            None
        '''
        # call predict() to run the sample's features through the network
        self.predict(sample)
        # compute the deltas
        self.calc_delta(label)
        # compute the gradients
        self.calc_gradient()

    def predict(self, sample):
        '''
        Desc:
            Predict the output for an input sample
        Args:
            sample --- array of sample features, i.e. the network's input vector
        Returns:
            the network's output computed by the forward pass
        '''
        # set the input layer's outputs to the sample's input vector,
        # i.e. the input layer passes its input through unchanged
        self.layers[0].set_output(sample)
        # iterate from the layer after the input layer to the last layer
        for i in range(1, len(self.layers)):
            # compute the layer's output
            self.layers[i].calc_output()
        # return the computed outputs, i.e. the predictions
        return map(lambda node: node.output, self.layers[-1].nodes[:-1])

    def dump(self):
        '''
        Desc:
            Print the network's information
        Args:
            None
        Returns:
            None
        '''
        # iterate over all layers
        for layer in self.layers:
            # print every layer's information
            layer.dump()


# # ------------------------- At this point the neural network itself is complete. Below we also cover the corresponding gradient-check algorithm; first, a recap of the classes above and what they do ------------------------
'''
1. Node class: records and maintains a node's own information and its upstream and downstream connections, and implements the computation of the output value and the error term. Fields and methods:
    layer_index --- index of the layer the node belongs to
    node_index --- index of the node
    downstream --- downstream connections
    upstream ---- upstream connections
    output ---- the node's output value
    delta ------ the node's error term

2. ConstNode class, the bias term: implements a node whose output is always 1 (used when computing the bias term). Fields:
    layer_index --- index of the layer the node belongs to
    node_index ---- index of the node
    downstream ---- downstream connections
    no upstream connections are recorded, because a bias node's output does not depend on any upstream output
    output ----- the bias node's output

3. Layer class: initializes one layer. It acts as the collection of Node objects and provides operations over that collection; in other words, a Layer contains a set of Nodes.
    layer_index ---- index of the layer
    node_count ----- number of nodes the layer contains
    def set_output() -- sets the layer's output; used when the layer is the input layer
    def calc_output -- computes the layer's output vector by calling each Node's output computation

4. Connection class: records a connection's weight and the upstream and downstream nodes it links:
    upstream_node --- the connection's upstream node
    downstream_node -- the connection's downstream node
    weight -------- random.uniform(-0.1, 0.1), initialized to a small random number
    gradient -------- 0.0, the gradient, initialized to 0.0
    def calc_gradient() --- computes the gradient as the downstream node's delta times the upstream node's output
    def get_gradient() ---- returns the current gradient
    def update_weight() --- updates the weight by gradient descent

5. Connections class: operations over the collection of Connection objects:
    def add_connection() --- adds a connection

6. Network class: provides the public API:
    connections --- a Connections object
    layers -------- the network's layers
    layer_count --- the number of layers
    node_count --- the number of nodes
    def train() --- trains the network
    def train_one_sample() --- trains the network on one sample
    def calc_delta() --- computes the error terms
    def update_weight() --- updates every connection's weight
    def calc_gradient() --- computes every connection's gradient
    def get_gradient() --- gets every connection's gradient for one sample
    def predict() --- predicts the output for an input sample
'''


# #-------------------------------------- Recap done. Some points may still be unclear; keep reading ---------------------------------------------

class Normalizer(object):
    '''
    Desc:
        Normalization utility class
    '''
    def __init__(self):
        '''
        Desc:
            Initialization
        Args:
            None
        Returns:
            None
        '''
        # hexadecimal masks used to test individual bits:
        # 0x1 ---- 00000001
        # 0x2 ---- 00000010
        # 0x4 ---- 00000100
        # 0x8 ---- 00001000
        # 0x10 --- 00010000
        # 0x20 --- 00100000
        # 0x40 --- 01000000
        # 0x80 --- 10000000
        self.mask = [0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80]

    def norm(self, number):
        '''
        Desc:
            Normalize a number
        Args:
            number --- the number to normalize
        Returns:
            the normalized data
        '''
        # treat the number as an 8-bit vector: a bit that is set becomes 0.9
        # and a bit that is clear becomes 0.1; in other words, 0.9 stands
        # for 1 and 0.1 stands for 0
        return map(lambda m: 0.9 if number & m else 0.1, self.mask)

    def denorm(self, vec):
        '''
        Desc:
            Denormalize an output vector
        Args:
            vec --- the output vector
        Returns:
            the final predicted number
        '''
        # binarize: values above 0.5 become 1, the rest become 0
        binary = map(lambda i: 1 if i > 0.5 else 0, vec)
        # weight each bit by its mask
        for i in range(len(self.mask)):
            binary[i] = binary[i] * self.mask[i]
        # sum the bits to reconstruct the predicted number
        return reduce(lambda x,y: x + y, binary)
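# The encoding is easiest to see on a concrete byte; a sketch with an
# assumed value (Python 2, where map returns a list):
n = Normalizer()
v = n.norm(5)      # 5 = 00000101 -> [0.9, 0.1, 0.9, 0.1, 0.1, 0.1, 0.1, 0.1]
print(n.denorm(v)) # 5: the round trip recovers the byte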
def mean_square_error(vec1, vec2):
    '''
    Desc:
        Compute the mean squared error
    Args:
        vec1 --- the first vector
        vec2 --- the second vector
    Returns:
        the value of 1/2 * (x-y)^2 summed over the components
    '''
    return 0.5 * reduce(lambda a, b: a + b, map(lambda v: (v[0] - v[1]) * (v[0] - v[1]), zip(vec1, vec2)))



def gradient_check(network, sample_feature, sample_label):
    '''
    Desc:
        Gradient check
    Args:
        network --- the neural network object
        sample_feature --- the sample's features
        sample_label --- the sample's label
    Returns:
        None
    '''
    # the network error function
    network_error = lambda vec1, vec2: 0.5 * reduce(lambda a, b: a + b, map(lambda v: (v[0] - v[1]) * (v[0] - v[1]), zip(vec1, vec2)))

    # get the gradient of every connection for the current sample
    # (note the argument order: get_gradient expects the label first)
    network.get_gradient(sample_label, sample_feature)

    # check the gradient of every weight
    for conn in network.connections.connections:
        # get the gradient stored on this connection
        actual_gradient = conn.get_gradient()

        # add a small value to the weight and compute the network error
        epsilon = 0.0001
        conn.weight += epsilon
        error1 = network_error(network.predict(sample_feature), sample_label)

        # subtract a small value and compute the network error
        conn.weight -= 2 * epsilon # we added epsilon above, so subtract twice
        error2 = network_error(network.predict(sample_feature), sample_label)

        # compute the expected gradient according to Equation 6
        expected_gradient = (error2 - error1) / (2 * epsilon)
        # restore the original weight
        conn.weight += epsilon

        # print both values
        print('expected gradient: \t%f\nactual gradient: \t%f' % (expected_gradient, actual_gradient))
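# The check above is the standard centered difference; it compares against
# (error2 - error1) because the gradient stored on a connection points
# along the descent direction. On a toy function with a known derivative
# the same idea reads (a standalone sketch):
f = lambda w: w * w                          # toy 'error', derivative 2w
w, eps = 3.0, 1e-4
print((f(w + eps) - f(w - eps)) / (2 * eps)) # ~6.0 == 2 * w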
def train_data_set():
    '''
    Desc:
        Build the training data set
    Args:
        None
    Returns:
        labels --- the label of each sample in the training set
        data_set --- the training samples themselves
    '''
    # create a Normalizer instance
    normalizer = Normalizer()
    # initialize lists to hold the data
    data_set = []
    labels = []
    # from 0 to 256 with a step of 8
    for i in range(0, 256, 8):
        # normalize a random byte with the normalizer's norm method
        n = normalizer.norm(int(random.uniform(0, 256)))
        # append n to data_set
        data_set.append(n)
        # append n to labels as well (the network learns the identity map)
        labels.append(n)
    # return both lists
    return labels, data_set


def train(network):
    '''
    Desc:
        Train our neural network
    Args:
        network --- the neural network object
    Returns:
        None
    '''
    # build the training data set
    labels, data_set = train_data_set()
    # call the network's train method
    network.train(labels, data_set, 0.3, 50)


def test(network, data):
    '''
    Desc:
        Test our fully connected neural network
    Args:
        network --- the neural network object
        data ------ the test data
    Returns:
        None
    '''
    # create a Normalizer instance
    normalizer = Normalizer()
    # normalize the data with the norm method
    norm_data = normalizer.norm(data)
    # predict on the test data
    predict_data = network.predict(norm_data)
    # print the result
    print('\ttestdata(%u)\tpredict(%u)' % (data, normalizer.denorm(predict_data)))


def correct_ratio(network):
    '''
    Desc:
        Compute the accuracy of our neural network
    Args:
        network --- the neural network object
    Returns:
        None
    '''
    normalizer = Normalizer()
    correct = 0.0
    for i in range(256):
        if normalizer.denorm(network.predict(normalizer.norm(i))) == i:
            correct += 1.0
    print('correct_ratio: %.2f%%' % (correct / 256 * 100))


def gradient_check_test():
    '''
    Desc:
        Gradient-check test
    Args:
        None
    Returns:
        None
    '''
    # build a 3-layer network with 2 nodes per layer
    net = Network([2, 2, 2])
    # the sample's features
    sample_feature = [0.9, 0.1]
    # the sample's label
    sample_label = [0.9, 0.1]
    # run the gradient check to verify the implementation
    gradient_check(net, sample_feature, sample_label)


if __name__ == '__main__':
    '''
    Desc:
        Main function
    Args:
        None
    Returns:
        None
    '''
    # build a network with 8 input nodes, 3 hidden nodes, and 8 output nodes
    net = Network([8, 3, 8])
    # train the network
    train(net)
    # print the network's information
    net.dump()
    # print the network's accuracy
    correct_ratio(net)
466  src/py2.x/dl/cnn.py  Normal file
@@ -0,0 +1,466 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-


from __future__ import print_function
import numpy as np
from activators import ReluActivator, IdentityActivator


# get the convolution patch
def get_patch(input_array, i, j, filter_width,
              filter_height, stride):
    '''
    Get the region of the input array for the current convolution step,
    automatically handling both 2-D and 3-D inputs
    '''
    start_i = i * stride
    start_j = j * stride
    if input_array.ndim == 2:
        return input_array[
            start_i : start_i + filter_height,
            start_j : start_j + filter_width]
    elif input_array.ndim == 3:
        return input_array[:,
            start_i : start_i + filter_height,
            start_j : start_j + filter_width]


# get the index of the maximum value of a 2-D region
def get_max_index(array):
    max_i = 0
    max_j = 0
    max_value = array[0,0]
    for i in range(array.shape[0]):
        for j in range(array.shape[1]):
            if array[i,j] > max_value:
                max_value = array[i,j]
                max_i, max_j = i, j
    return max_i, max_j


# compute a convolution
def conv(input_array,
         kernel_array,
         output_array,
         stride, bias):
    '''
    Compute a convolution, automatically handling both 2-D and 3-D inputs;
    the conv function implements convolution of 2-D and 3-D arrays
    '''
    channel_number = input_array.ndim
    output_width = output_array.shape[1]
    output_height = output_array.shape[0]
    kernel_width = kernel_array.shape[-1]
    kernel_height = kernel_array.shape[-2]
    for i in range(output_height):
        for j in range(output_width):
            output_array[i][j] = (
                get_patch(input_array, i, j, kernel_width,
                          kernel_height, stride) * kernel_array
                ).sum() + bias
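# A minimal usage sketch of conv() with assumed values (2-D input,
# stride 1, zero bias):
inp = np.array([[1,2,3],[4,5,6],[7,8,9]], dtype=np.float64)
kernel = np.ones((2,2))
out = np.zeros((2,2))
conv(inp, kernel, out, 1, 0)
# out == [[12,16],[24,28]]: each entry is the sum over one 2x2 patch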
# add zero padding around an array
def padding(input_array, zp):
    '''
    Add zero padding to an array, automatically handling both 2-D and 3-D inputs
    '''
    if zp == 0:
        return input_array
    else:
        if input_array.ndim == 3:
            input_width = input_array.shape[2]
            input_height = input_array.shape[1]
            input_depth = input_array.shape[0]
            padded_array = np.zeros((
                input_depth,
                input_height + 2 * zp,
                input_width + 2 * zp))
            padded_array[:,
                zp : zp + input_height,
                zp : zp + input_width] = input_array
            return padded_array
        elif input_array.ndim == 2:
            input_width = input_array.shape[1]
            input_height = input_array.shape[0]
            padded_array = np.zeros((
                input_height + 2 * zp,
                input_width + 2 * zp))
            padded_array[zp : zp + input_height,
                zp : zp + input_width] = input_array
            return padded_array


# apply an element-wise operation to a numpy array
def element_wise_op(array, op):
    '''
    Desc:
        element_wise_op applies an operation to every element of a numpy
        array and writes the result back into the array
    '''
    for i in np.nditer(array,
                       op_flags=['readwrite']):
        i[...] = op(i)


class Filter(object):
    '''
    Desc:
        The Filter class holds the parameters and gradients of a convolutional
        layer and updates the parameters by gradient descent.
        The parameters follow the usual initialization strategy: the weights
        are initialized to small random values and the bias to 0.
    '''
    def __init__(self, width, height, depth):
        self.weights = np.random.uniform(-1e-4, 1e-4,
            (depth, height, width))
        self.bias = 0
        self.weights_grad = np.zeros(
            self.weights.shape)
        self.bias_grad = 0

    def __repr__(self):
        return 'filter weights:\n%s\nbias:\n%s' % (
            repr(self.weights), repr(self.bias))

    def get_weights(self):
        return self.weights

    def get_bias(self):
        return self.bias

    def update(self, learning_rate):
        self.weights -= learning_rate * self.weights_grad
        self.bias -= learning_rate * self.bias_grad


class ConvLayer(object):
    '''
    Desc:
        The ConvLayer class implements a convolutional layer. The constructor
        below initializes the layer; its hyperparameters are set through the
        constructor arguments.
    '''
    def __init__(self, input_width, input_height,
                 channel_number, filter_width,
                 filter_height, filter_number,
                 zero_padding, stride, activator,
                 learning_rate):
        self.input_width = input_width
        self.input_height = input_height
        self.channel_number = channel_number
        self.filter_width = filter_width
        self.filter_height = filter_height
        self.filter_number = filter_number
        self.zero_padding = zero_padding
        self.stride = stride
        self.output_width = \
            ConvLayer.calculate_output_size(
            self.input_width, filter_width, zero_padding,
            stride)
        self.output_height = \
            ConvLayer.calculate_output_size(
            self.input_height, filter_height, zero_padding,
            stride)
        self.output_array = np.zeros((self.filter_number,
            self.output_height, self.output_width))
        self.filters = []
        for i in range(filter_number):
            self.filters.append(Filter(filter_width,
                filter_height, self.channel_number))
        self.activator = activator
        self.learning_rate = learning_rate

    def forward(self, input_array):
        '''
        Desc:
            Compute the layer's output; the result is stored in
            self.output_array. The forward method implements the forward
            pass of the convolutional layer, i.e. it computes the layer's
            output from its input
        '''
        self.input_array = input_array
        self.padded_input_array = padding(input_array,
            self.zero_padding)
        for f in range(self.filter_number):
            filter = self.filters[f]
            conv(self.padded_input_array,
                 filter.get_weights(), self.output_array[f],
                 self.stride, filter.get_bias())
        element_wise_op(self.output_array,
                        self.activator.forward)

    def backward(self, input_array, sensitivity_array,
                 activator):
        '''
        Compute the error terms passed to the previous layer and the
        gradient of every weight;
        the previous layer's error terms are stored in self.delta_array,
        the gradients in each Filter object's weights_grad
        '''
        self.forward(input_array)
        self.bp_sensitivity_map(sensitivity_array,
                                activator)
        self.bp_gradient(sensitivity_array)

    def update(self):
        '''
        Update the weights by gradient descent
        '''
        for filter in self.filters:
            filter.update(self.learning_rate)

    def bp_sensitivity_map(self, sensitivity_array,
                           activator):
        '''
        Compute the sensitivity map passed to the previous layer
        sensitivity_array: this layer's sensitivity map
        activator: the previous layer's activation function
        '''
        # handle the convolution stride by expanding the original sensitivity map
        expanded_array = self.expand_sensitivity_map(
            sensitivity_array)
        # full convolution: zero-pad the sensitivity map
        # (the zero-padding cells of the original input also receive a
        # residual, but it does not need to be propagated further,
        # so it is simply not computed)
        expanded_width = expanded_array.shape[2]
        zp = (self.input_width +
              self.filter_width - 1 - expanded_width) / 2
        padded_array = padding(expanded_array, zp)
        # initialize delta_array, which holds the sensitivity map
        # passed to the previous layer
        self.delta_array = self.create_delta_array()
        # for a layer with several filters, the sensitivity map passed to
        # the previous layer is the sum of the sensitivity maps of all filters
        for f in range(self.filter_number):
            filter = self.filters[f]
            # rotate the filter weights by 180 degrees
            flipped_weights = np.array(map(
                lambda i: np.rot90(i, 2),
                filter.get_weights()))
            # compute the delta_array contributed by one filter
            delta_array = self.create_delta_array()
            for d in range(delta_array.shape[0]):
                conv(padded_array[f], flipped_weights[d],
                     delta_array[d], 1, 0)
            self.delta_array += delta_array
        # multiply element-wise by the derivative of the activation function
        derivative_array = np.array(self.input_array)
        element_wise_op(derivative_array,
                        activator.backward)
        self.delta_array *= derivative_array

    def bp_gradient(self, sensitivity_array):
        # handle the convolution stride by expanding the original sensitivity map
        expanded_array = self.expand_sensitivity_map(
            sensitivity_array)
        for f in range(self.filter_number):
            # compute the gradient of every weight
            filter = self.filters[f]
            for d in range(filter.weights.shape[0]):
                conv(self.padded_input_array[d],
                     expanded_array[f],
                     filter.weights_grad[d], 1, 0)
            # compute the gradient of the bias term
            filter.bias_grad = expanded_array[f].sum()

    def expand_sensitivity_map(self, sensitivity_array):
        depth = sensitivity_array.shape[0]
        # determine the size of the expanded sensitivity map,
        # i.e. the size the sensitivity map would have with stride 1
        expanded_width = (self.input_width -
            self.filter_width + 2 * self.zero_padding + 1)
        expanded_height = (self.input_height -
            self.filter_height + 2 * self.zero_padding + 1)
        # build the new sensitivity map
        expand_array = np.zeros((depth, expanded_height,
                                 expanded_width))
        # copy the error values over from the original sensitivity map
        for i in range(self.output_height):
            for j in range(self.output_width):
                i_pos = i * self.stride
                j_pos = j * self.stride
                expand_array[:,i_pos,j_pos] = \
                    sensitivity_array[:,i,j]
        return expand_array

    def create_delta_array(self):
        return np.zeros((self.channel_number,
            self.input_height, self.input_width))

    @staticmethod
    def calculate_output_size(input_size, filter_size, zero_padding, stride):
        '''
        Desc:
            Determine the size of the convolutional layer's output
        '''
        return (input_size - filter_size +
                2 * zero_padding) / stride + 1
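# For the layer built in init_test() below (5x5 input, 3x3 filters, zero
# padding 1, stride 2), the formula gives (5 - 3 + 2*1) / 2 + 1 = 3, so
# each output feature map is 3x3:
print(ConvLayer.calculate_output_size(5, 3, 1, 2))  # 3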
class MaxPoolingLayer(object):
    def __init__(self, input_width, input_height,
                 channel_number, filter_width,
                 filter_height, stride):
        self.input_width = input_width
        self.input_height = input_height
        self.channel_number = channel_number
        self.filter_width = filter_width
        self.filter_height = filter_height
        self.stride = stride
        self.output_width = (input_width -
            filter_width) / self.stride + 1
        self.output_height = (input_height -
            filter_height) / self.stride + 1
        self.output_array = np.zeros((self.channel_number,
            self.output_height, self.output_width))

    def forward(self, input_array):
        # each output cell is the maximum of its pooling window
        for d in range(self.channel_number):
            for i in range(self.output_height):
                for j in range(self.output_width):
                    self.output_array[d,i,j] = (
                        get_patch(input_array[d], i, j,
                            self.filter_width,
                            self.filter_height,
                            self.stride).max())

    def backward(self, input_array, sensitivity_array):
        # route each sensitivity value back to the position of the
        # maximum inside its pooling window
        self.delta_array = np.zeros(input_array.shape)
        for d in range(self.channel_number):
            for i in range(self.output_height):
                for j in range(self.output_width):
                    patch_array = get_patch(
                        input_array[d], i, j,
                        self.filter_width,
                        self.filter_height,
                        self.stride)
                    k, l = get_max_index(patch_array)
                    self.delta_array[d,
                        i * self.stride + k,
                        j * self.stride + l] = \
                        sensitivity_array[d,i,j]


def init_test():
    a = np.array(
        [[[0,1,1,0,2],
          [2,2,2,2,1],
          [1,0,0,2,0],
          [0,1,1,0,0],
          [1,2,0,0,2]],
         [[1,0,2,2,0],
          [0,0,0,2,0],
          [1,2,1,2,1],
          [1,0,0,0,0],
          [1,2,1,1,1]],
         [[2,1,2,0,0],
          [1,0,0,1,0],
          [0,2,1,0,1],
          [0,1,2,2,2],
          [2,1,0,0,1]]])
    b = np.array(
        [[[0,1,1],
          [2,2,2],
          [1,0,0]],
         [[1,0,2],
          [0,0,0],
          [1,2,1]]])
    cl = ConvLayer(5,5,3,3,3,2,1,2,IdentityActivator(),0.001)
    cl.filters[0].weights = np.array(
        [[[-1,1,0],
          [0,1,0],
          [0,1,1]],
         [[-1,-1,0],
          [0,0,0],
          [0,-1,0]],
         [[0,0,-1],
          [0,1,0],
          [1,-1,-1]]], dtype=np.float64)
    cl.filters[0].bias=1
    cl.filters[1].weights = np.array(
        [[[1,1,-1],
          [-1,-1,1],
          [0,-1,1]],
         [[0,1,0],
          [-1,0,-1],
          [-1,1,0]],
         [[-1,0,0],
          [-1,0,1],
          [-1,0,0]]], dtype=np.float64)
    return a, b, cl


def test():
    a, b, cl = init_test()
    cl.forward(a)
    print(cl.output_array)


def test_bp():
    a, b, cl = init_test()
    cl.backward(a, b, IdentityActivator())
    cl.update()
    print(cl.filters[0])
    print(cl.filters[1])


def gradient_check():
    '''
    Gradient check
    '''
    # design an error function: the sum of all output values
    error_function = lambda o: o.sum()

    # compute the forward pass
    a, b, cl = init_test()
    cl.forward(a)

    # build the sensitivity map
    sensitivity_array = np.ones(cl.output_array.shape,
                                dtype=np.float64)
    # compute the gradients
    cl.backward(a, sensitivity_array,
                IdentityActivator())
    # check the gradients
    epsilon = 10e-4
    for d in range(cl.filters[0].weights_grad.shape[0]):
        for i in range(cl.filters[0].weights_grad.shape[1]):
            for j in range(cl.filters[0].weights_grad.shape[2]):
                cl.filters[0].weights[d,i,j] += epsilon
                cl.forward(a)
                err1 = error_function(cl.output_array)
                cl.filters[0].weights[d,i,j] -= 2*epsilon
                cl.forward(a)
                err2 = error_function(cl.output_array)
                expect_grad = (err1 - err2) / (2 * epsilon)
                cl.filters[0].weights[d,i,j] += epsilon
                print('weights(%d,%d,%d): expected - actual %f - %f' % (
                    d, i, j, expect_grad, cl.filters[0].weights_grad[d,i,j]))


def init_pool_test():
    a = np.array(
        [[[1,1,2,4],
          [5,6,7,8],
          [3,2,1,0],
          [1,2,3,4]],
         [[0,1,2,3],
          [4,5,6,7],
          [8,9,0,1],
          [3,4,5,6]]], dtype=np.float64)

    b = np.array(
        [[[1,2],
          [2,4]],
         [[3,5],
          [8,2]]], dtype=np.float64)

    mpl = MaxPoolingLayer(4,4,2,2,2,2)

    return a, b, mpl


def test_pool():
    a, b, mpl = init_pool_test()
    mpl.forward(a)
    print('input array:\n%s\noutput array:\n%s' % (a, mpl.output_array))


def test_pool_bp():
    a, b, mpl = init_pool_test()
    mpl.backward(a, b)
    print('input array:\n%s\nsensitivity array:\n%s\ndelta array:\n%s' % (a, b, mpl.delta_array))
232  src/py2.x/dl/fc.py  Normal file
@@ -0,0 +1,232 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-


from __future__ import print_function
import random
import numpy as np
from activators import SigmoidActivator, IdentityActivator

try:
    reduce # Python 2
except NameError: # Python 3
    from functools import reduce


# fully connected layer
class FullConnectedLayer(object):
    def __init__(self, input_size, output_size,
                 activator):
        '''
        Constructor
        input_size: dimension of this layer's input vector
        output_size: dimension of this layer's output vector
        activator: the activation function
        '''
        self.input_size = input_size
        self.output_size = output_size
        self.activator = activator
        # weight matrix W
        self.W = np.random.uniform(-0.1, 0.1,
            (output_size, input_size))
        # bias term b
        self.b = np.zeros((output_size, 1))
        # output vector
        self.output = np.zeros((output_size, 1))

    def forward(self, input_array):
        '''
        Forward pass
        input_array: the input vector; its dimension must equal input_size
        '''
        # Equation 2
        self.input = input_array
        self.output = self.activator.forward(
            np.dot(self.W, input_array) + self.b)

    def backward(self, delta_array):
        '''
        Backward pass: compute the gradients of W and b
        delta_array: the error terms passed down from the layer above
        '''
        # Equation 8
        self.delta = self.activator.backward(self.input) * np.dot(
            self.W.T, delta_array)
        self.W_grad = np.dot(delta_array, self.input.T)
        self.b_grad = delta_array

    def update(self, learning_rate):
        '''
        Update the weights by gradient descent
        '''
        self.W += learning_rate * self.W_grad
        self.b += learning_rate * self.b_grad

    def dump(self):
        print('W: %s\nb:%s' % (self.W, self.b))
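# A shape sketch with assumed sizes; W is (output_size, input_size) and
# b, output and delta are column vectors:
fc = FullConnectedLayer(3, 2, SigmoidActivator())
x = np.ones((3, 1))
fc.forward(x)                 # output = sigmoid(W x + b), shape (2, 1)
fc.backward(np.ones((2, 1)))  # W_grad: (2, 1) dot (1, 3) -> shape (2, 3)
print(fc.output.shape, fc.W_grad.shape)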
# neural network class
class Network(object):
    def __init__(self, layers):
        '''
        Constructor
        '''
        self.layers = []
        for i in range(len(layers) - 1):
            self.layers.append(
                FullConnectedLayer(
                    layers[i], layers[i+1],
                    SigmoidActivator()
                )
            )

    def predict(self, sample):
        '''
        Predict with the neural network
        sample: the input sample
        '''
        output = sample
        for layer in self.layers:
            layer.forward(output)
            output = layer.output
        return output

    def train(self, labels, data_set, rate, epoch):
        '''
        Training function
        labels: the sample labels
        data_set: the input samples
        rate: the learning rate
        epoch: the number of training epochs
        '''
        for i in range(epoch):
            for d in range(len(data_set)):
                self.train_one_sample(labels[d],
                    data_set[d], rate)

    def train_one_sample(self, label, sample, rate):
        self.predict(sample)
        self.calc_gradient(label)
        self.update_weight(rate)

    def calc_gradient(self, label):
        delta = self.layers[-1].activator.backward(
            self.layers[-1].output
        ) * (label - self.layers[-1].output)
        for layer in self.layers[::-1]:
            layer.backward(delta)
            delta = layer.delta
        return delta

    def update_weight(self, rate):
        for layer in self.layers:
            layer.update(rate)

    def dump(self):
        for layer in self.layers:
            layer.dump()

    def loss(self, output, label):
        return 0.5 * ((label - output) * (label - output)).sum()

    def gradient_check(self, sample_feature, sample_label):
        '''
        Gradient check
        sample_feature: the sample's features
        sample_label: the sample's label
        '''

        # get the gradient of every connection for the current sample
        self.predict(sample_feature)
        self.calc_gradient(sample_label)

        # check the gradients
        epsilon = 10e-4
        for fc in self.layers:
            for i in range(fc.W.shape[0]):
                for j in range(fc.W.shape[1]):
                    fc.W[i,j] += epsilon
                    output = self.predict(sample_feature)
                    err1 = self.loss(output, sample_label)
                    fc.W[i,j] -= 2*epsilon
                    output = self.predict(sample_feature)
                    err2 = self.loss(output, sample_label)
                    expect_grad = (err1 - err2) / (2 * epsilon)
                    fc.W[i,j] += epsilon
                    print('weights(%d,%d): expected - actual %.4e - %.4e' % (
                        i, j, expect_grad, fc.W_grad[i,j]))


from bp import train_data_set


def transpose(args):
    return map(
        lambda arg: map(
            lambda line: np.array(line).reshape(len(line), 1)
            , arg)
        , args
    )


class Normalizer(object):
    def __init__(self):
        self.mask = [
            0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
        ]

    def norm(self, number):
        data = map(lambda m: 0.9 if number & m else 0.1, self.mask)
        return np.array(data).reshape(8, 1)

    def denorm(self, vec):
        binary = map(lambda i: 1 if i > 0.5 else 0, vec[:,0])
        for i in range(len(self.mask)):
            binary[i] = binary[i] * self.mask[i]
        return reduce(lambda x,y: x + y, binary)


# note: this local definition shadows the train_data_set imported from bp above
def train_data_set():
    normalizer = Normalizer()
    data_set = []
    labels = []
    for i in range(0, 256):
        n = normalizer.norm(i)
        data_set.append(n)
        labels.append(n)
    return labels, data_set


def correct_ratio(network):
    normalizer = Normalizer()
    correct = 0.0
    for i in range(256):
        if normalizer.denorm(network.predict(normalizer.norm(i))) == i:
            correct += 1.0
    print('correct_ratio: %.2f%%' % (correct / 256 * 100))


def test():
    labels, data_set = transpose(train_data_set())
    net = Network([8, 3, 8])
    rate = 0.5
    mini_batch = 20
    epoch = 10
    for i in range(epoch):
        # each outer epoch runs the full data set mini_batch times
        net.train(labels, data_set, rate, mini_batch)
        print('after epoch %d loss: %f' % (
            (i + 1),
            net.loss(net.predict(data_set[-1]), labels[-1])
        ))
        rate /= 2
    correct_ratio(net)


def gradient_check():
    '''
    Gradient check
    '''
    labels, data_set = transpose(train_data_set())
    net = Network([8, 3, 8])
    net.gradient_check(data_set[0], labels[0])
    return net
122  src/py2.x/dl/linear_unit.py  Normal file
@@ -0,0 +1,122 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-

# import the Perceptron class
from __future__ import print_function
from perceptron import Perceptron


# define the activation function f (the identity)
f = lambda x: x

class LinearUnit(Perceptron):
    '''
    Desc:
        Linear unit class
    Args:
        Perceptron --- the perceptron base class
    Returns:
        None
    '''
    def __init__(self, input_num):
        '''
        Desc:
            Initialize the linear unit and set the number of input parameters
        Args:
            input_num --- the number of input parameters
        Returns:
            None
        '''
        # initialize the perceptron base class with the number of input
        # parameters input_num and the activation function f
        Perceptron.__init__(self, input_num, f)


# build a simple data set
def get_training_dataset():
    '''
    Desc:
        Build a simple training data set
    Args:
        None
    Returns:
        input_vecs --- the features of the training data set
        labels --- the label of each training sample, in one-to-one correspondence
    '''
    # build the data set: a list of input vectors, each one a number of years worked
    input_vecs = [[5], [3], [8], [1.4], [10.1]]
    # the expected outputs, i.e. the label of each input vector:
    # the salary corresponding to the years worked
    labels = [5500, 2300, 7600, 1800, 11400]
    return input_vecs, labels
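# For comparison (not part of the tutorial): the least-squares line through
# the same five points, computed directly with numpy; a trained linear unit
# should approach it as training converges:
import numpy as np
xs, ys = get_training_dataset()
A = np.hstack([np.array(xs), np.ones((len(xs), 1))])
w, b = np.linalg.lstsq(A, np.array(ys))[0]
print(w, b)  # slope and intercept of the best-fit line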
# train the linear unit with our training data set
def train_linear_unit():
    '''
    Desc:
        Train our linear unit with the training data set
    Args:
        None
    Returns:
        lu --- the trained linear unit
    '''
    # create the linear unit; the number of input features is 1 (years worked)
    lu = LinearUnit(1)
    # build the data set
    input_vecs, labels = get_training_dataset()
    # train for 10 iterations with a learning rate of 0.01
    lu.train(input_vecs, labels, 10, 0.01)
    # return the trained linear unit
    return lu


# plot the result
def plot(linear_unit):
    '''
    Desc:
        Plot how the trained linear unit fits the data
    Args:
        linear_unit --- the trained linear unit
    Returns:
        None
    '''
    # import the plotting library
    import matplotlib.pyplot as plt
    # get the training data: features input_vecs and their labels
    input_vecs, labels = get_training_dataset()
    # figure() creates a Figure object, the whole window shown to the user,
    # which contains the subplots
    fig = plt.figure()
    # create the first plot of a 1x1 grid inside the figure
    ax = fig.add_subplot(111)
    # scatter(x, y) draws a scatter plot; x and y are sequences of equal length
    ax.scatter(map(lambda x: x[0], input_vecs), labels)
    # the trained weights
    weights = linear_unit.weights
    # the trained bias term
    bias = linear_unit.bias
    # range(start, stop, step) runs from start to stop with the given step
    x = range(0, 12, 1)
    # compute the linear unit's output for each input
    y = map(lambda x: weights[0] * x + bias, x)
    # draw the fitted line
    ax.plot(x, y)
    # show the final plot
    plt.show()


if __name__ == '__main__':
    '''
    Desc:
        main: train our linear unit and make predictions
    Args:
        None
    Returns:
        None
    '''
    # first train the linear unit
    linear_unit = train_linear_unit()
    # print the learned weights and bias
    print(linear_unit)
    # test it
    print('Work 3.4 years, monthly salary = %.2f' % linear_unit.predict([3.4]))
    print('Work 15 years, monthly salary = %.2f' % linear_unit.predict([15]))
    print('Work 1.5 years, monthly salary = %.2f' % linear_unit.predict([1.5]))
    print('Work 6.3 years, monthly salary = %.2f' % linear_unit.predict([6.3]))
    plot(linear_unit)
334  src/py2.x/dl/lstm.py  Normal file
@@ -0,0 +1,334 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-


from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
from cnn import element_wise_op
from activators import SigmoidActivator, TanhActivator, IdentityActivator


class LstmLayer(object):
    def __init__(self, input_width, state_width,
                 learning_rate):
        self.input_width = input_width
        self.state_width = state_width
        self.learning_rate = learning_rate
        # activation function of the gates
        self.gate_activator = SigmoidActivator()
        # activation function of the output
        self.output_activator = TanhActivator()
        # the current time step, initialized to t0
        self.times = 0
        # the cell state vectors c at each time step
        self.c_list = self.init_state_vec()
        # the output vectors h at each time step
        self.h_list = self.init_state_vec()
        # the forget gates f at each time step
        self.f_list = self.init_state_vec()
        # the input gates i at each time step
        self.i_list = self.init_state_vec()
        # the output gates o at each time step
        self.o_list = self.init_state_vec()
        # the candidate states c~ at each time step
        self.ct_list = self.init_state_vec()
        # forget gate weight matrices Wfh, Wfx and bias bf
        self.Wfh, self.Wfx, self.bf = (
            self.init_weight_mat())
        # input gate weight matrices Wih, Wix and bias bi
        self.Wih, self.Wix, self.bi = (
            self.init_weight_mat())
        # output gate weight matrices Woh, Wox and bias bo
        self.Woh, self.Wox, self.bo = (
            self.init_weight_mat())
        # cell state weight matrices Wch, Wcx and bias bc
        self.Wch, self.Wcx, self.bc = (
            self.init_weight_mat())

    def init_state_vec(self):
        '''
        Initialize a list of state vectors
        '''
        state_vec_list = []
        state_vec_list.append(np.zeros(
            (self.state_width, 1)))
        return state_vec_list

    def init_weight_mat(self):
        '''
        Initialize a set of weight matrices
        '''
        Wh = np.random.uniform(-1e-4, 1e-4,
            (self.state_width, self.state_width))
        Wx = np.random.uniform(-1e-4, 1e-4,
            (self.state_width, self.input_width))
        b = np.zeros((self.state_width, 1))
        return Wh, Wx, b

    def forward(self, x):
        '''
        Forward pass according to Equations 1-6
        '''
        self.times += 1
        # forget gate
        fg = self.calc_gate(x, self.Wfx, self.Wfh,
            self.bf, self.gate_activator)
        self.f_list.append(fg)
        # input gate
        ig = self.calc_gate(x, self.Wix, self.Wih,
            self.bi, self.gate_activator)
        self.i_list.append(ig)
        # output gate
        og = self.calc_gate(x, self.Wox, self.Woh,
            self.bo, self.gate_activator)
        self.o_list.append(og)
        # candidate state
        ct = self.calc_gate(x, self.Wcx, self.Wch,
            self.bc, self.output_activator)
        self.ct_list.append(ct)
        # cell state
        c = fg * self.c_list[self.times - 1] + ig * ct
        self.c_list.append(c)
        # output
        h = og * self.output_activator.forward(c)
        self.h_list.append(h)

    def calc_gate(self, x, Wx, Wh, b, activator):
        '''
        Compute one gate
        '''
        h = self.h_list[self.times - 1] # the previous LSTM output
        net = np.dot(Wh, h) + np.dot(Wx, x) + b
        gate = activator.forward(net)
        return gate
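    # In symbols, calc_gate computes gate = activator(Wh . h_prev + Wx . x + b),
    # where h_prev is the previous output h_{t-1} and x the current input x_t;
    # shapes: Wh (state_width, state_width), Wx (state_width, input_width),
    # b and the returned gate (state_width, 1).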
    def backward(self, x, delta_h, activator):
        '''
        The LSTM training algorithm
        '''
        self.calc_delta(delta_h, activator)
        self.calc_gradient(x)

    def update(self):
        '''
        Update the weights by gradient descent
        '''
        self.Wfh -= self.learning_rate * self.Wfh_grad
        self.Wfx -= self.learning_rate * self.Wfx_grad
        self.bf -= self.learning_rate * self.bf_grad
        self.Wih -= self.learning_rate * self.Wih_grad
        self.Wix -= self.learning_rate * self.Wix_grad
        self.bi -= self.learning_rate * self.bi_grad
        self.Woh -= self.learning_rate * self.Woh_grad
        self.Wox -= self.learning_rate * self.Wox_grad
        self.bo -= self.learning_rate * self.bo_grad
        self.Wch -= self.learning_rate * self.Wch_grad
        self.Wcx -= self.learning_rate * self.Wcx_grad
        self.bc -= self.learning_rate * self.bc_grad

    def calc_delta(self, delta_h, activator):
        # initialize the error terms at every time step
        self.delta_h_list = self.init_delta()  # output error terms
        self.delta_o_list = self.init_delta()  # output gate error terms
        self.delta_i_list = self.init_delta()  # input gate error terms
        self.delta_f_list = self.init_delta()  # forget gate error terms
        self.delta_ct_list = self.init_delta() # candidate state error terms

        # store the error term of the current time step passed down
        # from the layer above
        self.delta_h_list[-1] = delta_h

        # iteratively compute the error term of every time step
        for k in range(self.times, 0, -1):
            self.calc_delta_k(k)

    def init_delta(self):
        '''
        Initialize a list of error terms
        '''
        delta_list = []
        for i in range(self.times + 1):
            delta_list.append(np.zeros(
                (self.state_width, 1)))
        return delta_list

    def calc_delta_k(self, k):
        '''
        From delta_h at time k, compute delta_f, delta_i, delta_o and
        delta_ct at time k, and delta_h at time k-1
        '''
        # fetch the forward-pass values of time k
        ig = self.i_list[k]
        og = self.o_list[k]
        fg = self.f_list[k]
        ct = self.ct_list[k]
        c = self.c_list[k]
        c_prev = self.c_list[k-1]
        tanh_c = self.output_activator.forward(c)
        delta_k = self.delta_h_list[k]

        # compute delta_o according to Equation 9
        delta_o = (delta_k * tanh_c *
                   self.gate_activator.backward(og))
        delta_f = (delta_k * og *
                   (1 - tanh_c * tanh_c) * c_prev *
                   self.gate_activator.backward(fg))
        delta_i = (delta_k * og *
                   (1 - tanh_c * tanh_c) * ct *
                   self.gate_activator.backward(ig))
        delta_ct = (delta_k * og *
                    (1 - tanh_c * tanh_c) * ig *
                    self.output_activator.backward(ct))
        delta_h_prev = (
            np.dot(delta_o.transpose(), self.Woh) +
            np.dot(delta_i.transpose(), self.Wih) +
            np.dot(delta_f.transpose(), self.Wfh) +
            np.dot(delta_ct.transpose(), self.Wch)
        ).transpose()

        # store all delta values
        self.delta_h_list[k-1] = delta_h_prev
        self.delta_f_list[k] = delta_f
        self.delta_i_list[k] = delta_i
        self.delta_o_list[k] = delta_o
        self.delta_ct_list[k] = delta_ct

    def calc_gradient(self, x):
        # initialize the forget gate weight gradient matrices and bias gradient
        self.Wfh_grad, self.Wfx_grad, self.bf_grad = (
            self.init_weight_gradient_mat())
        # initialize the input gate weight gradient matrices and bias gradient
        self.Wih_grad, self.Wix_grad, self.bi_grad = (
            self.init_weight_gradient_mat())
        # initialize the output gate weight gradient matrices and bias gradient
        self.Woh_grad, self.Wox_grad, self.bo_grad = (
            self.init_weight_gradient_mat())
        # initialize the cell state weight gradient matrices and bias gradient
        self.Wch_grad, self.Wcx_grad, self.bc_grad = (
            self.init_weight_gradient_mat())

        # compute the weight gradients with respect to the previous output h
        for t in range(self.times, 0, -1):
            # compute the gradients of each time step
            (Wfh_grad, bf_grad,
             Wih_grad, bi_grad,
             Woh_grad, bo_grad,
             Wch_grad, bc_grad) = (
                self.calc_gradient_t(t))
            # the actual gradient is the sum over all time steps
            self.Wfh_grad += Wfh_grad
            self.bf_grad += bf_grad
            self.Wih_grad += Wih_grad
            self.bi_grad += bi_grad
            self.Woh_grad += Woh_grad
            self.bo_grad += bo_grad
            self.Wch_grad += Wch_grad
            self.bc_grad += bc_grad

        # compute the weight gradients with respect to the current input x
        xt = x.transpose()
        self.Wfx_grad = np.dot(self.delta_f_list[-1], xt)
        self.Wix_grad = np.dot(self.delta_i_list[-1], xt)
        self.Wox_grad = np.dot(self.delta_o_list[-1], xt)
        self.Wcx_grad = np.dot(self.delta_ct_list[-1], xt)

    def init_weight_gradient_mat(self):
        '''
        Initialize a set of weight gradient matrices
        '''
        Wh_grad = np.zeros((self.state_width,
            self.state_width))
        Wx_grad = np.zeros((self.state_width,
            self.input_width))
        b_grad = np.zeros((self.state_width, 1))
        return Wh_grad, Wx_grad, b_grad

    def calc_gradient_t(self, t):
        '''
        Compute the weight gradients of time step t
        '''
        h_prev = self.h_list[t-1].transpose()
        Wfh_grad = np.dot(self.delta_f_list[t], h_prev)
        bf_grad = self.delta_f_list[t]
        Wih_grad = np.dot(self.delta_i_list[t], h_prev)
        bi_grad = self.delta_i_list[t]
        Woh_grad = np.dot(self.delta_o_list[t], h_prev)
        bo_grad = self.delta_o_list[t]
        Wch_grad = np.dot(self.delta_ct_list[t], h_prev)
        bc_grad = self.delta_ct_list[t]
        return Wfh_grad, bf_grad, Wih_grad, bi_grad, \
               Woh_grad, bo_grad, Wch_grad, bc_grad

    def reset_state(self):
        # reset the current time step to t0
        self.times = 0
        # the cell state vectors c at each time step
        self.c_list = self.init_state_vec()
        # the output vectors h at each time step
        self.h_list = self.init_state_vec()
        # the forget gates f at each time step
        self.f_list = self.init_state_vec()
        # the input gates i at each time step
        self.i_list = self.init_state_vec()
        # the output gates o at each time step
        self.o_list = self.init_state_vec()
        # the candidate states c~ at each time step
        self.ct_list = self.init_state_vec()


def data_set():
    x = [np.array([[1], [2], [3]]),
         np.array([[2], [3], [4]])]
    d = np.array([[1], [2]])
    return x, d


def gradient_check():
    '''
    Gradient check
    '''
    # design an error function: the sum of all output values
    error_function = lambda o: o.sum()

    lstm = LstmLayer(3, 2, 1e-3)

    # compute the forward pass
    x, d = data_set()
    lstm.forward(x[0])
    lstm.forward(x[1])

    # build the sensitivity map
    sensitivity_array = np.ones(lstm.h_list[-1].shape,
                                dtype=np.float64)
    # compute the gradients
    lstm.backward(x[1], sensitivity_array, IdentityActivator())

    # check the gradients
    epsilon = 10e-4
    for i in range(lstm.Wfh.shape[0]):
        for j in range(lstm.Wfh.shape[1]):
            lstm.Wfh[i,j] += epsilon
            lstm.reset_state()
            lstm.forward(x[0])
            lstm.forward(x[1])
            err1 = error_function(lstm.h_list[-1])
            lstm.Wfh[i,j] -= 2*epsilon
            lstm.reset_state()
            lstm.forward(x[0])
            lstm.forward(x[1])
            err2 = error_function(lstm.h_list[-1])
            expect_grad = (err1 - err2) / (2 * epsilon)
            lstm.Wfh[i,j] += epsilon
            print('weights(%d,%d): expected - actual %.4e - %.4e' % (
                i, j, expect_grad, lstm.Wfh_grad[i,j]))
    return lstm


def test():
    l = LstmLayer(3, 2, 1e-3)
    x, d = data_set()
    l.forward(x[0])
    l.forward(x[1])
    l.backward(x[1], d, IdentityActivator())
    return l
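A minimal smoke test of the layer (a sketch; the shapes follow the constructor arguments used in test() above):

if __name__ == '__main__':
    layer = test()
    print(layer.h_list[-1].shape)  # (2, 1): state_width x 1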
177  src/py2.x/dl/mnist.py  Normal file
@@ -0,0 +1,177 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-

from __future__ import print_function
import struct
from fc import *
from datetime import datetime


# base class for data loaders
class Loader(object):
    def __init__(self, path, count):
        '''
        Initialize the loader
        path: path of the data file
        count: number of samples in the file
        '''
        self.path = path
        self.count = count

    def get_file_content(self):
        '''
        Read the file's content
        '''
        f = open(self.path, 'rb')
        content = f.read()
        f.close()
        return content

    def to_int(self, byte):
        '''
        Convert an unsigned byte character to an integer
        '''
        return struct.unpack('B', byte)[0]


# image data loader
class ImageLoader(Loader):
    def get_picture(self, content, index):
        '''
        Internal method: read one image from the file content
        '''
        # each image is 28 * 28 bytes; the file starts with a 16-byte header
        start = index * 28 * 28 + 16
        picture = []
        for i in range(28):
            picture.append([])
            for j in range(28):
                picture[i].append(
                    self.to_int(content[start + i * 28 + j]))
        return picture

    def get_one_sample(self, picture):
        '''
        Internal method: flatten an image into a sample input vector
        '''
        sample = []
        for i in range(28):
            for j in range(28):
                sample.append(picture[i][j])
        return sample

    def load(self):
        '''
        Load the data file and build the input vector of every sample
        '''
        content = self.get_file_content()
        data_set = []
        for index in range(self.count):
            data_set.append(
                self.get_one_sample(
                    self.get_picture(content, index)))
        return data_set


# label data loader
class LabelLoader(Loader):
    def load(self):
        '''
        Load the data file and build the label vector of every sample
        '''
        content = self.get_file_content()
        labels = []
        for index in range(self.count):
            # the label file starts with an 8-byte header
            labels.append(self.norm(content[index + 8]))
        return labels

    def norm(self, label):
        '''
        Internal method: convert one label value to a 10-dimensional label vector
        '''
        label_vec = []
        label_value = self.to_int(label)
        for i in range(10):
            if i == label_value:
                label_vec.append(0.9)
            else:
                label_vec.append(0.1)
        return label_vec
def get_training_data_set():
|
||||
'''
|
||||
获得训练数据集
|
||||
'''
|
||||
image_loader = ImageLoader('train-images-idx3-ubyte', 60000)
|
||||
label_loader = LabelLoader('train-labels-idx1-ubyte', 60000)
|
||||
return image_loader.load(), label_loader.load()
|
||||
|
||||
|
||||
def get_test_data_set():
|
||||
'''
|
||||
获得测试数据集
|
||||
'''
|
||||
image_loader = ImageLoader('t10k-images-idx3-ubyte', 10000)
|
||||
label_loader = LabelLoader('t10k-labels-idx1-ubyte', 10000)
|
||||
return image_loader.load(), label_loader.load()
|
||||
|
||||
|
||||
def show(sample):
|
||||
str = ''
|
||||
for i in range(28):
|
||||
for j in range(28):
|
||||
if sample[i*28+j] != 0:
|
||||
str += '*'
|
||||
else:
|
||||
str += ' '
|
||||
str += '\n'
|
||||
print(str)
|
||||
|
||||
|
||||
def get_result(vec):
|
||||
max_value_index = 0
|
||||
max_value = 0
|
||||
for i in range(len(vec)):
|
||||
if vec[i] > max_value:
|
||||
max_value = vec[i]
|
||||
max_value_index = i
|
||||
return max_value_index
|
||||
|
||||
|
||||
def evaluate(network, test_data_set, test_labels):
|
||||
error = 0
|
||||
total = len(test_data_set)
|
||||
|
||||
for i in range(total):
|
||||
label = get_result(test_labels[i])
|
||||
predict = get_result(network.predict(test_data_set[i]))
|
||||
if label != predict:
|
||||
error += 1
|
||||
return float(error) / float(total)
|
||||
|
||||
|
||||
def now():
|
||||
return datetime.now().strftime('%c')
|
||||
|
||||
|
||||
def train_and_evaluate():
|
||||
last_error_ratio = 1.0
|
||||
epoch = 0
|
||||
train_data_set, train_labels = transpose(get_training_data_set())
|
||||
test_data_set, test_labels = transpose(get_test_data_set())
|
||||
network = Network([784, 100, 10])
|
||||
while True:
|
||||
epoch += 1
|
||||
network.train(train_labels, train_data_set, 0.01, 1)
|
||||
print('%s epoch %d finished, loss %f' % (now(), epoch,
|
||||
network.loss(train_labels[-1], network.predict(train_data_set[-1]))))
|
||||
if epoch % 2 == 0:
|
||||
error_ratio = evaluate(network, test_data_set, test_labels)
|
||||
print('%s after epoch %d, error ratio is %f' % (now(), epoch, error_ratio))
|
||||
if error_ratio > last_error_ratio:
|
||||
break
|
||||
else:
|
||||
last_error_ratio = error_ratio
|
||||
|
||||
if __name__ == '__main__':
|
||||
train_and_evaluate()
|
||||
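The magic offsets above (16 for images, 8 for labels) are the IDX header sizes of the MNIST files. A minimal sketch of reading those headers explicitly with struct; parse_idx_header is a hypothetical helper, not part of mnist.py:

import struct

def parse_idx_header(content):
    # hypothetical helper: the first 4 bytes are a magic number whose last
    # byte is the number of dimensions; each dimension size follows as a
    # big-endian 32-bit integer, so label files carry an 8-byte header
    # (1 dimension) and image files a 16-byte header (3 dimensions)
    magic, = struct.unpack('>i', content[:4])
    dims = magic & 0xff
    sizes = struct.unpack('>' + 'i' * dims, content[4:4 + 4 * dims])
    return dims, sizes  # total header length is 4 + 4*dims bytes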
187
src/py2.x/dl/perceptron.py
Normal file
@@ -0,0 +1,187 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-

# Neuron / perceptron

from __future__ import print_function
class Perceptron():
    '''
    Desc:
        Perceptron class
    Args:
        None
    Returns:
        None
    '''

    def __init__(self, input_num, activator):
        '''
        Desc:
            Initialize the perceptron
        Args:
            input_num --- number of inputs
            activator --- activation function
        Returns:
            None
        '''
        # the activation function
        self.activator = activator
        # initialize the weight vector to 0
        self.weights = [0.0 for _ in range(input_num)]
        # initialize the bias to 0
        self.bias = 0.0

    def __str__(self):
        '''
        Desc:
            Print the perceptron's information
        Args:
            None
        Returns:
            a string with the learned weights and bias
        '''
        return('weights\t:%s\nbias\t:%f\n' % (self.weights, self.bias))

    def predict(self, input_vec):
        '''
        Desc:
            Take an input vector and return the perceptron's output
        Args:
            input_vec --- input vector
        Returns:
            the perceptron's output
        '''
        # Apply the activation function to the weighted sum of the inputs.
        # reduce() is a Python 2 built-in; since Python 3 it lives in the functools module.
        # reduce() applies a two-argument function cumulatively from left to right,
        # merging a sequence into a single value, e.g. reduce(lambda x,y: x+y, [1,2,3,4,5])
        # computes ((((1+2)+3)+4)+5).
        # map() applies a function f to every element of a list and returns a new list;
        # e.g. with f computing squares, map(f, [1,2,3,4,5]) ===> [1,4,9,16,25]
        # zip() takes any number of sequences (including 0 or 1) and returns a list of tuples;
        # e.g. x = [1,2,3] y = [4,5,6] z = [7,8,9] xyz = zip(x, y, z) ===> [(1,4,7), (2,5,8), (3,6,9)]
        return self.activator(reduce(lambda a, b: a + b, map(lambda (x, w): x * w, zip(input_vec, self.weights)), 0.0) + self.bias)

    def train(self, input_vecs, labels, iteration, rate):
        '''
        Desc:
            Train on a set of vectors and their labels, for a given number
            of iterations and a given learning rate
        Args:
            input_vecs --- input vectors
            labels --- labels of the data
            iteration --- number of training iterations
            rate --- learning rate
        Returns:
            None
        '''
        for i in range(iteration):
            self._one_iteration(input_vecs, labels, rate)

    def _one_iteration(self, input_vecs, labels, rate):
        '''
        Desc:
            One iteration of the training process
        Args:
            input_vecs --- input vectors
            labels --- labels of the data
            rate --- learning rate
        Returns:
            None
        '''
        # zip() pairs each input vector with its label, e.g.
        # x = [1,2,3] y = [4,5,6] z = [7,8,9] xyz = zip(x, y, z) ===> [(1,4,7), (2,5,8), (3,6,9)]
        samples = zip(input_vecs, labels)
        # update the weights for each sample according to the perceptron rule
        for (input_vec, label) in samples:
            # compute the perceptron's output under the current weights
            output = self.predict(input_vec)
            # update the weights (the original code assigned the None return
            # value back to output, which served no purpose)
            self._update_weights(input_vec, output, label, rate)

    def _update_weights(self, input_vec, output, label, rate):
        '''
        Desc:
            Update the weights according to the perceptron rule
        Args:
            input_vec --- input vector
            output --- output computed by the perceptron rule
            label --- label of the input vector
            rate --- learning rate
        Returns:
            None
        '''
        # update the weights by the perceptron rule
        delta = label - output
        # map() applies a function to every element of a list; zip() pairs up
        # the inputs with the current weights (see the notes in predict above)
        self.weights = map(lambda (x, w): w + rate * delta * x, zip(input_vec, self.weights))
        # update the bias
        self.bias += rate * delta


def f(x):
    '''
    Desc:
        Define the activation function f
    Args:
        x --- input
    Returns:
        (a step function) 1 if x > 0, otherwise 0
    '''
    return 1 if x > 0 else 0


def get_training_dataset():
    '''
    Desc:
        Build the training data set from the AND truth table
    Args:
        None
    Returns:
        input_vecs --- input vectors
        labels --- labels of the input vectors
    '''
    # training data: the list of input vectors
    input_vecs = [[1,1],[0,0],[1,0],[0,1]]
    # expected outputs; one label per input vector above, in the same order
    labels = [1, 0, 0, 0]
    return input_vecs, labels


def train_and_perceptron():
    '''
    Desc:
        Train our perceptron on the AND truth table
    Args:
        None
    Returns:
        p --- the trained perceptron
    '''
    # create a perceptron with 2 inputs (AND is a binary function), activation f
    p = Perceptron(2, f)
    # train for 10 iterations with a learning rate of 0.1
    input_vecs, labels = get_training_dataset()
    p.train(input_vecs, labels, 10, 0.1)
    # return the trained perceptron
    return p


if __name__ == '__main__':
    '''
    Desc:
        Main entry: use the trained perceptron returned above to predict
    Args:
        None
    Returns:
        None
    '''
    # train the AND perceptron
    and_perceptron = train_and_perceptron()
    # print the learned weights
    print(and_perceptron)
    # test
    print('1 and 1 = %d' % and_perceptron.predict([1, 1]))
    print('0 and 0 = %d' % and_perceptron.predict([0, 0]))
    print('1 and 0 = %d' % and_perceptron.predict([1, 0]))
    print('0 and 1 = %d' % and_perceptron.predict([0, 1]))
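predict and _update_weights use Python 2-only tuple-parameter lambdas (lambda (x, w): ...), which Python 3 removed. A minimal sketch of an equivalent Python 3 computation; predict_py3 is a hypothetical free function, not part of perceptron.py:

def predict_py3(perceptron, input_vec):
    # same computation as Perceptron.predict, written without the
    # Python 2-only tuple-parameter lambda
    weighted_sum = sum(x * w for x, w in zip(input_vec, perceptron.weights))
    return perceptron.activator(weighted_sum + perceptron.bias)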
185
src/py2.x/dl/recursive.py
Normal file
@@ -0,0 +1,185 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-


from __future__ import print_function
import numpy as np
from activators import IdentityActivator


class TreeNode(object):
    def __init__(self, data, children=[], children_data=[]):
        self.parent = None
        self.children = children
        self.children_data = children_data
        self.data = data
        for child in children:
            child.parent = self


# Recursive (tree-structured) neural network implementation
class RecursiveLayer(object):
    def __init__(self, node_width, child_count,
                 activator, learning_rate):
        '''
        Recursive neural network constructor
        node_width: dimension of each node's vector
        child_count: number of children per parent node
        activator: activation function object
        learning_rate: learning rate for gradient descent
        '''
        self.node_width = node_width
        self.child_count = child_count
        self.activator = activator
        self.learning_rate = learning_rate
        # weight matrix W
        self.W = np.random.uniform(-1e-4, 1e-4,
                                   (node_width, node_width * child_count))
        # bias term b
        self.b = np.zeros((node_width, 1))
        # root node of the tree built by the recursive network
        self.root = None

    def forward(self, *children):
        '''
        Forward pass
        '''
        children_data = self.concatenate(children)
        parent_data = self.activator.forward(
            np.dot(self.W, children_data) + self.b
        )
        self.root = TreeNode(parent_data, children,
                             children_data)

    def backward(self, parent_delta):
        '''
        BPTS back-propagation algorithm
        '''
        self.calc_delta(parent_delta, self.root)
        self.W_grad, self.b_grad = self.calc_gradient(self.root)

    def update(self):
        '''
        Update the weights with SGD
        '''
        self.W -= self.learning_rate * self.W_grad
        self.b -= self.learning_rate * self.b_grad

    def reset_state(self):
        self.root = None

    def concatenate(self, tree_nodes):
        '''
        Concatenate the data of the tree nodes into one long vector
        '''
        concat = np.zeros((0,1))
        for node in tree_nodes:
            concat = np.concatenate((concat, node.data))
        return concat

    def calc_delta(self, parent_delta, parent):
        '''
        Compute the delta of each node
        '''
        parent.delta = parent_delta
        if parent.children:
            # compute each child's delta according to Eq. 2
            children_delta = np.dot(self.W.T, parent_delta) * (
                self.activator.backward(parent.children_data)
            )
            # slices = [(child index, start of the child's delta, end of the child's delta)]
            slices = [(i, i * self.node_width,
                       (i + 1) * self.node_width)
                      for i in range(self.child_count)]
            # recursively call calc_delta for each child
            for s in slices:
                self.calc_delta(children_delta[s[1]:s[2]],
                                parent.children[s[0]])

    def calc_gradient(self, parent):
        '''
        Compute the weight gradient of each node and sum them up
        to obtain the final gradient
        '''
        W_grad = np.zeros((self.node_width,
                           self.node_width * self.child_count))
        b_grad = np.zeros((self.node_width, 1))
        if not parent.children:
            return W_grad, b_grad
        parent.W_grad = np.dot(parent.delta, parent.children_data.T)
        parent.b_grad = parent.delta
        W_grad += parent.W_grad
        b_grad += parent.b_grad
        for child in parent.children:
            W, b = self.calc_gradient(child)
            W_grad += W
            b_grad += b
        return W_grad, b_grad

    def dump(self, **kwArgs):
        print('root.data: %s' % self.root.data)
        print('root.children_data: %s' % self.root.children_data)
        if 'dump_grad' in kwArgs:
            print('W_grad: %s' % self.W_grad)
            print('b_grad: %s' % self.b_grad)


def data_set():
    children = [
        TreeNode(np.array([[1],[2]])),
        TreeNode(np.array([[3],[4]])),
        TreeNode(np.array([[5],[6]]))
    ]
    d = np.array([[0.5],[0.8]])
    return children, d


def gradient_check():
    '''
    Gradient check
    '''
    # design an error function that sums all output components
    error_function = lambda o: o.sum()

    rnn = RecursiveLayer(2, 2, IdentityActivator(), 1e-3)

    # compute the forward pass
    x, d = data_set()
    rnn.forward(x[0], x[1])
    rnn.forward(rnn.root, x[2])

    # build the sensitivity map
    sensitivity_array = np.ones((rnn.node_width, 1),
                                dtype=np.float64)
    # compute the gradients
    rnn.backward(sensitivity_array)

    # check the gradients
    epsilon = 10e-4
    for i in range(rnn.W.shape[0]):
        for j in range(rnn.W.shape[1]):
            rnn.W[i,j] += epsilon
            rnn.reset_state()
            rnn.forward(x[0], x[1])
            rnn.forward(rnn.root, x[2])
            err1 = error_function(rnn.root.data)
            rnn.W[i,j] -= 2*epsilon
            rnn.reset_state()
            rnn.forward(x[0], x[1])
            rnn.forward(rnn.root, x[2])
            err2 = error_function(rnn.root.data)
            expect_grad = (err1 - err2) / (2 * epsilon)
            rnn.W[i,j] += epsilon
            print('weights(%d,%d): expected - actual %.4e - %.4e' % (
                i, j, expect_grad, rnn.W_grad[i,j]))
    return rnn


def test():
    children, d = data_set()
    rnn = RecursiveLayer(2, 2, IdentityActivator(), 1e-3)
    rnn.forward(children[0], children[1])
    rnn.dump()
    rnn.forward(rnn.root, children[2])
    rnn.dump()
    rnn.backward(d)
    rnn.dump(dump_grad='true')
    return rnn
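calc_delta above distributes a parent's error term over its children. As a sketch of the relation it implements, with \(\delta_p\) the parent delta, \(\mathbf{c}\) the concatenated children vector, and \(f'\) the activator's derivative:

\[ \delta_c = \left(W^T \delta_p\right) \circ f'(\mathbf{c}) \]

The node_width-sized slices of \(\delta_c\) are then the individual children's deltas, which is exactly what the slices list encodes.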
161
src/py2.x/dl/rnn.py
Normal file
@@ -0,0 +1,161 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-

from __future__ import print_function
import numpy as np
from cnn import element_wise_op
from activators import ReluActivator, IdentityActivator


try:
    reduce  # Python 2
except NameError:  # Python 3
    from functools import reduce


class RecurrentLayer(object):
    '''
    Desc:
        RecurrentLayer implements a recurrent layer. The constructor below
        initializes one; the layer's hyperparameters can be set through it.
        Note that a recurrent layer has two weight matrices, U and W.
    '''
    def __init__(self, input_width, state_width,
                 activator, learning_rate):
        self.input_width = input_width
        self.state_width = state_width
        self.activator = activator
        self.learning_rate = learning_rate
        self.times = 0        # current time step, initialized to t0
        self.state_list = []  # states at each time step
        self.state_list.append(np.zeros(
            (state_width, 1)))            # initialize s0
        self.U = np.random.uniform(-1e-4, 1e-4,
                                   (state_width, input_width))  # initialize U
        self.W = np.random.uniform(-1e-4, 1e-4,
                                   (state_width, state_width))  # initialize W

    def forward(self, input_array):
        '''
        Desc:
            Forward pass of the recurrent layer
        '''
        self.times += 1
        state = (np.dot(self.U, input_array) +
                 np.dot(self.W, self.state_list[-1]))
        element_wise_op(state, self.activator.forward)
        self.state_list.append(state)

    def backward(self, sensitivity_array,
                 activator):
        '''
        BPTT algorithm
        '''
        self.calc_delta(sensitivity_array, activator)
        self.calc_gradient()

    def update(self):
        '''
        Update the weights by gradient descent
        '''
        self.W -= self.learning_rate * self.gradient

    def calc_delta(self, sensitivity_array, activator):
        self.delta_list = []  # error terms at each time step
        for i in range(self.times):
            self.delta_list.append(np.zeros(
                (self.state_width, 1)))
        self.delta_list.append(sensitivity_array)
        # iteratively compute the error term at each time step
        for k in range(self.times - 1, 0, -1):
            self.calc_delta_k(k, activator)

    def calc_delta_k(self, k, activator):
        '''
        Compute the delta at time k from the delta at time k+1
        '''
        state = self.state_list[k+1].copy()
        element_wise_op(self.state_list[k+1],
                        activator.backward)
        self.delta_list[k] = np.dot(
            np.dot(self.delta_list[k+1].T, self.W),
            np.diag(state[:,0])).T

    def calc_gradient(self):
        self.gradient_list = []  # weight gradients at each time step
        for t in range(self.times + 1):
            self.gradient_list.append(np.zeros(
                (self.state_width, self.state_width)))
        for t in range(self.times, 0, -1):
            self.calc_gradient_t(t)
        # the actual gradient is the sum of the gradients over all time steps
        self.gradient = reduce(
            lambda a, b: a + b, self.gradient_list,
            self.gradient_list[0])  # [0] is initialized to 0 and never modified

    def calc_gradient_t(self, t):
        '''
        Compute the weight gradient at time step t
        '''
        gradient = np.dot(self.delta_list[t],
                          self.state_list[t-1].T)
        self.gradient_list[t] = gradient

    def reset_state(self):
        self.times = 0        # current time step, initialized to t0
        self.state_list = []  # states at each time step
        self.state_list.append(np.zeros(
            (self.state_width, 1)))       # initialize s0


def data_set():
    x = [np.array([[1], [2], [3]]),
         np.array([[2], [3], [4]])]
    d = np.array([[1], [2]])
    return x, d


def gradient_check():
    '''
    Gradient check
    '''
    # design an error function that sums all output components
    error_function = lambda o: o.sum()

    rl = RecurrentLayer(3, 2, IdentityActivator(), 1e-3)

    # compute the forward pass
    x, d = data_set()
    rl.forward(x[0])
    rl.forward(x[1])

    # build the sensitivity map
    sensitivity_array = np.ones(rl.state_list[-1].shape,
                                dtype=np.float64)
    # compute the gradients
    rl.backward(sensitivity_array, IdentityActivator())

    # check the gradients
    epsilon = 10e-4
    for i in range(rl.W.shape[0]):
        for j in range(rl.W.shape[1]):
            rl.W[i,j] += epsilon
            rl.reset_state()
            rl.forward(x[0])
            rl.forward(x[1])
            err1 = error_function(rl.state_list[-1])
            rl.W[i,j] -= 2*epsilon
            rl.reset_state()
            rl.forward(x[0])
            rl.forward(x[1])
            err2 = error_function(rl.state_list[-1])
            expect_grad = (err1 - err2) / (2 * epsilon)
            rl.W[i,j] += epsilon
            print('weights(%d,%d): expected - actual %f - %f' % (
                i, j, expect_grad, rl.gradient[i,j]))


def test():
    l = RecurrentLayer(3, 2, ReluActivator(), 1e-3)
    x, d = data_set()
    l.forward(x[0])
    l.forward(x[1])
    l.backward(d, ReluActivator())
    return l
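calc_delta_k steps the error term one time step backwards. The BPTT recurrence it is based on, as a sketch with \(f'\) the activation derivative and \(\mathbf{s}_k\) the state at time k, is:

\[ \delta_k^T = \delta_{k+1}^T \, W \, \mathrm{diag}\!\left[f'(\mathbf{s}_k)\right] \]

Note that with the IdentityActivator used in gradient_check, \(f' \equiv 1\), so the diagonal factor is trivial and the check cannot distinguish how the state derivative is formed.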
59
src/py2.x/ml/1.MLFoundation/NumPy.py
Normal file
@@ -0,0 +1,59 @@
#!/usr/bin/python
# coding:utf-8

'''
Created on 2017-05-18
Update on 2017-05-18
Author: Peter Harrington/1988/片刻
GitHub: https://github.com/apachecn/AiLearning
'''
from __future__ import print_function

from numpy import random, mat, eye

'''
# The difference between NumPy matrices and arrays
NumPy has two distinct data types:
1. matrix
2. array
Similarity:
    both handle numeric elements laid out in rows and columns
Differences:
1. The same operation on the two data types can give different results.
2. NumPy's matrix type is the equivalent of matrices in MATLAB.
'''

# generate a 4*4 random array
randArray = random.rand(4, 4)

# conversion: turn the array into a matrix
randMat = mat(randArray)
'''
.I computes the matrix inverse (e.g. via elementary row operations)
Meaning: the inverse is a tool for judging similarity. Multiplying the inverse
of A by a column vector p yields a column vector q whose i-th component
measures the similarity between p and the i-th column vector of A.
Reference links:
    https://www.zhihu.com/question/33258489
    http://blog.csdn.net/vernice/article/details/48506027
.T transposes the matrix (rows and columns swapped)
    equivalent to: .transpose()
.A returns the underlying array of the matrix
Reference link:
    http://blog.csdn.net/qq403977698/article/details/47254539
'''
invRandMat = randMat.I
TraRandMat = randMat.T
ArrRandMat = randMat.A
# print the results
print('randArray=(%s) \n' % type(randArray), randArray)
print('randMat=(%s) \n' % type(randMat), randMat)
print('invRandMat=(%s) \n' % type(invRandMat), invRandMat)
print('TraRandMat=(%s) \n' % type(TraRandMat), TraRandMat)
print('ArrRandMat=(%s) \n' % type(ArrRandMat), ArrRandMat)
# product of the matrix and its inverse (should be the identity matrix:
# ones on the diagonal of the 4*4 matrix, zeros elsewhere in theory)
myEye = randMat*invRandMat
# the error
print(myEye - eye(4))

'''
If the code above runs without problems, NumPy is installed correctly.
'''
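A minimal sketch of the "same operation, different result" point above: * is elementwise on an ndarray but a matrix product on a matrix (the values here are just illustrative):

from numpy import array, mat

a = array([[1, 2], [3, 4]])
m = mat(a)
print(a * a)  # elementwise square:   [[ 1  4], [ 9 16]]
print(m * m)  # matrix product a.dot(a): [[ 7 10], [15 22]]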
165
src/py2.x/ml/10.kmeans/kMeans.py
Normal file
@@ -0,0 +1,165 @@
#!/usr/bin/python
# coding:utf8
'''
Created on Feb 16, 2011
Update on 2017-05-18
k Means Clustering for Ch10 of Machine Learning in Action
Author: Peter Harrington/那伊抹微笑
GitHub: https://github.com/apachecn/AiLearning
'''
from __future__ import print_function
from numpy import *


# Build a matrix from a text file: load the file, then parse it
def loadDataSet(fileName):  # general function to parse tab-delimited floats
    dataSet = []
    fr = open(fileName)
    for line in fr.readlines():
        curLine = line.strip().split('\t')
        fltLine = map(float, curLine)  # map all elements to float
        dataSet.append(fltLine)
    return dataSet


# Euclidean distance between two vectors (choose a metric to suit the scenario)
def distEclud(vecA, vecB):
    return sqrt(sum(power(vecA - vecB, 2)))  # la.norm(vecA-vecB)


# Build a set of k random centroids for the given data set. The random centroids
# must lie within the bounds of the data set, which is achieved by finding the
# min and max of each dimension, then generating random numbers in 0~1.0 and
# scaling them by the range plus the min, keeping each point inside the bounds.
def randCent(dataMat, k):
    n = shape(dataMat)[1]  # number of columns
    centroids = mat(zeros((k, n)))  # matrix holding the k centroids
    for j in range(n):  # create random cluster centroids within the bounds of each dimension
        minJ = min(dataMat[:, j])  # minimum
        rangeJ = float(max(dataMat[:, j]) - minJ)  # range = max - min
        centroids[:, j] = mat(minJ + rangeJ * random.rand(k, 1))  # generate randomly
    return centroids


# k-means clustering algorithm
# The algorithm creates k centroids, assigns every point to its nearest centroid,
# then recomputes the centroids. This repeats until the cluster assignments of
# the data points no longer change.
# (Runs may differ because of the random centroids; the overall result still
# holds when the data is similar enough, though it can get stuck in a local minimum.)
def kMeans(dataMat, k, distMeas=distEclud, createCent=randCent):
    m = shape(dataMat)[0]  # number of rows
    clusterAssment = mat(zeros(
        (m, 2)))  # matrix with as many rows as dataMat and two columns, storing the cluster assignments
    centroids = createCent(dataMat, k)  # create k random centroids
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        for i in range(m):  # loop over every data point and assign it to the nearest centroid
            minDist = inf
            minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :],
                                  dataMat[i, :])  # distance from the point to the centroid
                if distJI < minDist:  # if smaller than minDist, update minDist and the index of the nearest centroid
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:  # the cluster assignment changed
                clusterChanged = True  # clusters changed
            clusterAssment[
                i, :] = minIndex, minDist**2  # update the assignment to the nearest centroid's index and the squared minDist
        print(centroids)
        for cent in range(k):  # update the centroids
            ptsInClust = dataMat[nonzero(
                clusterAssment[:, 0].A == cent)[0]]  # all points in this cluster
            centroids[cent, :] = mean(
                ptsInClust, axis=0)  # set the centroid to the mean of all points in the cluster
    return centroids, clusterAssment


# Bisecting k-means clustering: an optimization on top of kMeans that avoids
# getting stuck in a local minimum
def biKMeans(dataMat, k, distMeas=distEclud):
    m = shape(dataMat)[0]
    clusterAssment = mat(zeros((m, 2)))  # stores each point's cluster assignment and squared error
    centroid0 = mean(dataMat, axis=0).tolist()[0]  # initial centroid: the mean of all data points
    centList = [centroid0]  # list initialized with a single centroid
    for j in range(m):  # squared error of every data point w.r.t. the initial centroid
        clusterAssment[j, 1] = distMeas(mat(centroid0), dataMat[j, :])**2
    while (len(centList) < k):  # while there are fewer than k centroids
        lowestSSE = inf
        for i in range(len(centList)):  # for each centroid
            ptsInCurrCluster = dataMat[nonzero(
                clusterAssment[:, 0].A == i)[0], :]  # all data points in cluster i
            centroidMat, splitClustAss = kMeans(
                ptsInCurrCluster, 2, distMeas)  # bisect cluster i with kMeans
            sseSplit = sum(splitClustAss[:, 1])  # total squared error of the split result
            sseNotSplit = sum(
                clusterAssment[nonzero(clusterAssment[:, 0].A != i)[0],
                               1])  # total squared error of the points not involved in the split
            print("sseSplit, and notSplit: ", sseSplit, sseNotSplit)
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        # record the best cluster split
        bestClustAss[nonzero(bestClustAss[:, 0].A == 1)[0], 0] = len(
            centList)  # the bisecting kMeans labels clusters 0 and 1; remap 1 to a new index (any other number would do)
        bestClustAss[nonzero(bestClustAss[:, 0].A == 0)[0],
                     0] = bestCentToSplit  # remap 0 to the index of the split centroid
        print('the bestCentToSplit is: ', bestCentToSplit)
        print('the len of bestClustAss is: ', len(bestClustAss))
        # update the centroid list
        centList[bestCentToSplit] = bestNewCents[0, :].tolist()[
            0]  # replace the i-th centroid with the first centroid produced by the bisecting kMeans
        centList.append(
            bestNewCents[1, :].tolist()[0])  # append the second centroid of bestNewCents
        clusterAssment[nonzero(clusterAssment[:, 0].A == bestCentToSplit)[
            0], :] = bestClustAss  # reassign the points (and SSE) of the best split cluster
    return mat(centList), clusterAssment


def testBasicFunc():
    # load the test data set
    dataMat = mat(loadDataSet('data/10.KMeans/testSet.txt'))

    # check that randCent() runs correctly.
    # first, look at the min and max values in the matrix
    print('min(dataMat[:, 0])=', min(dataMat[:, 0]))
    print('min(dataMat[:, 1])=', min(dataMat[:, 1]))
    print('max(dataMat[:, 1])=', max(dataMat[:, 1]))
    print('max(dataMat[:, 0])=', max(dataMat[:, 0]))

    # then see whether randCent() generates values between min and max
    print('randCent(dataMat, 2)=', randCent(dataMat, 2))

    # finally test the distance function
    print(' distEclud(dataMat[0], dataMat[1])=', distEclud(dataMat[0], dataMat[1]))


def testKMeans():
    # load the test data set
    dataMat = mat(loadDataSet('data/10.KMeans/testSet.txt'))

    # The algorithm creates k centroids, assigns every point to its nearest
    # centroid, then recomputes the centroids. This repeats until the cluster
    # assignments no longer change. (Runs may differ because of the random
    # centroids; the overall result holds when the data is similar enough.)
    myCentroids, clustAssing = kMeans(dataMat, 4)

    print('centroids=', myCentroids)


def testBiKMeans():
    # load the test data set
    dataMat = mat(loadDataSet('data/10.KMeans/testSet2.txt'))

    centList, myNewAssments = biKMeans(dataMat, 3)

    print('centList=', centList)


if __name__ == "__main__":

    # test the basic functions
    # testBasicFunc()

    # test the kMeans function
    # testKMeans()

    # test the bisecting biKMeans function
    testBiKMeans()
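biKMeans picks the split that minimizes the total squared error. As a sketch in the usual k-means notation, the quantity sseSplit + sseNotSplit approximates after each candidate split is

\[ SSE = \sum_{i=1}^{m} \min_{j} \left\lVert x_i - \mu_j \right\rVert^2 \]

with \(x_i\) the data points and \(\mu_j\) the current centroids; the split with the lowest combined SSE is kept.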
24
src/py2.x/ml/10.kmeans/kMeansSklearn.py
Normal file
@@ -0,0 +1,24 @@
# -*- coding:UTF-8 -*-

import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# load the data set
dataMat = []
fr = open("data/10.KMeans/testSet.txt")  # note: this is a relative path; make sure to run from the MachineLearning directory.
for line in fr.readlines():
    curLine = line.strip().split('\t')
    fltLine = map(float, curLine)  # map all elements to float
    dataMat.append(fltLine)

# train the model
km = KMeans(n_clusters=4)  # initialize
km.fit(dataMat)  # fit
km_pred = km.predict(dataMat)  # predict
centers = km.cluster_centers_  # centroids

# visualize the result
plt.scatter(np.array(dataMat)[:, 1], np.array(dataMat)[:, 0], c=km_pred)
plt.scatter(centers[:, 1], centers[:, 0], c="r")
plt.show()
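Under Python 3, map(float, curLine) returns a lazy iterator rather than a list, so dataMat would hold map objects and km.fit would fail. A minimal Python 3-safe sketch of that line, assuming the same file format:

fltLine = list(map(float, curLine))  # materialize the floats under Python 3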
43
src/py2.x/ml/10.kmeans/test.txt
Normal file
@@ -0,0 +1,43 @@
# import
>>> import kMeans
>>> from numpy import *

# build a matrix from the text file: load the test data set
>>> datMat=mat(kMeans.loadDataSet('testSet.txt'))

# check that randCent() runs correctly.
# first, look at the min and max values in the matrix
>>> min(datMat[:,0])
matrix([[-5.379713]])
>>> min(datMat[:,1])
matrix([[-4.232586]])
>>> max(datMat[:,1])
matrix([[ 5.1904]])
>>> max(datMat[:,0])
matrix([[ 4.838138]])

# then see whether randCent() generates values between min and max
>>> kMeans.randCent(datMat, 2)
matrix([[-3.59997714, -1.43558065],
        [-3.03744979,  4.35541488]])

# finally test the distance function
>>> kMeans.distEclud(datMat[0], datMat[1])
5.184632816681332

# The algorithm creates k centroids, assigns every point to its nearest
# centroid, then recomputes the centroids. This repeats until the cluster
# assignments no longer change. (Runs may differ because of the random
# centroids; the overall result holds when the data is similar enough.)
>>> myCentroids, clustAssing = kMeans.kMeans(datMat, 4)
[[ 0.15357605 -0.94962877]
 [ 3.3593825   1.05965957]
 [-2.41900657  3.30513371]
 [-2.80505526 -3.73280289]]
[[ 2.35622556 -3.02056425]
 [ 2.95373358  2.32801413]
 [-2.46154315  2.78737555]
 [-3.38237045 -2.9473363 ]]
[[ 2.65077367 -2.79019029]
 [ 2.6265299   3.10868015]
 [-2.46154315  2.78737555]
 [-3.53973889 -2.89384326]]
371
src/py2.x/ml/11.Apriori/apriori.py
Normal file
@@ -0,0 +1,371 @@
#!/usr/bin/python
# coding: utf8

'''
Created on Mar 24, 2011
Update on 2017-05-18
Ch 11 code
Author: Peter/片刻
GitHub: https://github.com/apachecn/AiLearning
'''
from __future__ import print_function
print(__doc__)
from numpy import *


# load the data set
def loadDataSet():
    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]


# Create the set C1: deduplicate dataSet, sort it into a list, then convert
# every element to a frozenset
def createC1(dataSet):
    """createC1 (create the set C1)

    Args:
        dataSet   the raw data set
    Returns:
        frozenset a list in frozenset format
    """

    C1 = []
    for transaction in dataSet:
        for item in transaction:
            if not [item] in C1:
                # iterate over all elements; append any not yet seen in C1
                C1.append([item])
    # sort the array in ascending order
    # print 'before sort=', C1
    C1.sort()
    # a frozenset is an immutable set: its elements cannot change,
    # so it can be used as a dictionary key
    # print 'after sort=', C1
    # print 'frozenset=', map(frozenset, C1)
    return map(frozenset, C1)


# Compute the support of the candidate set Ck in the data set D and return
# the items whose support exceeds the minimum support (minSupport)
def scanD(D, Ck, minSupport):
    """scanD (compute the support of the candidate set Ck in the data set D
    and return the items whose support exceeds minSupport)

    Args:
        D           the data set
        Ck          list of candidate item sets
        minSupport  minimum support
    Returns:
        retList     the sets with support greater than minSupport
        supportData support data of the candidate item sets
    """

    # ssCnt temporarily stores the counts of the candidate sets in Ck,
    # e.g. a->10, b->5, c->8
    ssCnt = {}
    for tid in D:
        for can in Ck:
            # s.issubset(t) tests whether every element of s is in t
            if can.issubset(tid):
                if can not in ssCnt:
                    ssCnt[can] = 1
                else:
                    ssCnt[can] += 1
    numItems = float(len(D))  # number of transactions in D
    retList = []
    supportData = {}
    for key in ssCnt:
        # support = count of the candidate (key) / number of transactions
        support = ssCnt[key]/numItems
        if support >= minSupport:
            # insert at the front of retList; only sets that satisfy the
            # minimum support are stored here
            retList.insert(0, key)
        # store every candidate (key) with its support
        supportData[key] = support
    return retList, supportData


# Given the list of frequent item sets Lk and the target item count k,
# produce all possible candidate item sets Ck
def aprioriGen(Lk, k):
    """aprioriGen (given the list of frequent item sets Lk and the item count k,
    produce the candidate item sets Ck.
    For example: with {0},{1},{2} as input and k = 2 the output is {0,1}, {0,2}, {1,2};
    with {0,1},{0,2},{1,2} as input and k = 3 the output is {0,1,2}.
    It only needs one pass; there is no need to generate every combination and
    deduplicate afterwards, which makes this the more efficient algorithm.)

    Args:
        Lk  list of frequent item sets
        k   item count of the returned sets (merge when the first k-2 elements match)
    Returns:
        retList  data set of pairwise-merged elements
    """

    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i+1, lenLk):
            L1 = list(Lk[i])[: k-2]
            L2 = list(Lk[j])[: k-2]
            # print '-----i=', i, k-2, Lk, Lk[i], list(Lk[i])[: k-2]
            # print '-----j=', j, k-2, Lk, Lk[j], list(Lk[j])[: k-2]
            L1.sort()
            L2.sort()
            # on the first pass L1 and L2 are empty, so elements merge directly,
            # producing the pairwise-merged data set
            # if first k-2 elements are equal
            if L1 == L2:
                # set union
                # print 'union=', Lk[i] | Lk[j], Lk[i], Lk[j]
                retList.append(Lk[i] | Lk[j])
    return retList


# Find the candidate item sets in dataSet with support >= the minimum support,
# together with their supports: these are our frequent item sets.
def apriori(dataSet, minSupport=0.5):
    """apriori (first build the set C1, then scan the data set to check whether
    these one-element item sets satisfy the minimum support. Those that do form
    the set L1. The elements of L1 are combined into C2, C2 is filtered into L2,
    and so on, until Ck has length 0, at which point the supports of all
    frequent item sets have been found.)

    Args:
        dataSet     the raw data set
        minSupport  the support threshold
    Returns:
        L           all frequent item sets
        supportData all items together with their supports
    """
    # C1: deduplicate dataSet, sort, put into a list, convert elements to frozenset
    C1 = createC1(dataSet)
    # print 'C1: ', C1
    # convert every transaction to a set and collect them
    D = map(set, dataSet)
    # print 'D=', D
    # compute the support of C1 in D and keep the items with support > minSupport
    L1, supportData = scanD(D, C1, minSupport)
    # print "L1=", L1, "\n", "outcome: ", supportData

    # L gains one more level of nesting: L is a list of lists
    L = [L1]
    k = 2
    # Check whether the (k-2)-th entry of L is non-empty. On the first pass
    # L is [[frozenset([1]), frozenset([3]), frozenset([2]), frozenset([5])]],
    # so L[k-2]=L[0]=[frozenset([1]), frozenset([3]), frozenset([2]), frozenset([5])];
    # k += 1 happens at the end of the loop
    while (len(L[k-2]) > 0):
        # print 'k=', k, L, L[k-2]
        # e.g. with {0},{1},{2} as input and k = 2 the output is {0,1}, {0,2}, {1,2};
        # with {0,1},{0,2},{1,2} as input and k = 3 the output is {0,1,2}
        Ck = aprioriGen(L[k-2], k)
        # print 'Ck', Ck

        # compute the support of Ck in D and keep the items with support > minSupport
        Lk, supK = scanD(D, Ck, minSupport)
        # store the supports of all candidate sets; new keys are appended,
        # existing keys are updated
        supportData.update(supK)
        if len(Lk) == 0:
            break
        # Lk holds the sets that satisfy the frequency requirement; L keeps growing, e.g.:
        # l=[[set(1), set(2), set(3)]]
        # l=[[set(1), set(2), set(3)], [set(1, 2), set(2, 3)]]
        L.append(Lk)
        k += 1
        # print 'k=', k, len(L[k-2])
    return L, supportData


# compute the confidence
def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    """calcConf (for a two-element frequent item set, compute the confidence,
    e.g. {1,2}/{1} or {1,2}/{2}, and check whether it meets the threshold)

    Args:
        freqSet     an element of the frequent item sets, e.g. frozenset([1, 3])
        H           the set of elements of the frequent item set, e.g. [frozenset([1]), frozenset([3])]
        supportData dictionary of the supports of all elements
        brl         empty array for the association rules
        minConf     minimum confidence
    Returns:
        prunedH     the sets whose confidence exceeds the threshold
    """
    # record the sets whose confidence exceeds the minimum confidence (minConf)
    prunedH = []
    # suppose freqSet = frozenset([1, 3]) and H = [frozenset([1]), frozenset([3])];
    # we need the confidence of frozenset([1]) -> frozenset([3]) and of
    # frozenset([3]) -> frozenset([1])
    for conseq in H:

        # print 'confData=', freqSet, H, conseq, freqSet-conseq
        # By definition, conf(a -> b) = support(a | b) / support(a).
        # With freqSet = frozenset([1, 3]) and conseq = frozenset([1]), the
        # confidence of frozenset([1]) -> frozenset([3]) is
        # supportData[freqSet]/supportData[freqSet-conseq]
        # = supportData[frozenset([1, 3])] / supportData[frozenset([1])]
        conf = supportData[freqSet]/supportData[freqSet-conseq]
        if conf >= minConf:
            # whoever buys the freqSet-conseq set also buys the conseq set
            # (freqSet-conseq and conseq together make up the full set)
            print(freqSet-conseq, '-->', conseq, 'conf:', conf)
            brl.append((freqSet-conseq, conseq, conf))
            prunedH.append(conseq)
    return prunedH


# recursively generate rules from frequent item sets
def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    """rulesFromConseq

    Args:
        freqSet     an element of the frequent item sets, e.g. frozenset([2, 3, 5])
        H           the set of elements of the frequent item set, e.g. [frozenset([2]), frozenset([3]), frozenset([5])]
        supportData dictionary of the supports of all elements
        brl         array of association rules
        minConf     minimum confidence
    """
    # H[0] is the first combination of freqSet's elements; all elements of H have
    # the same length, controlled by the m + 1 in aprioriGen(H, m+1)
    # as the function recurses, the length of H[0] grows 1 2 3 ...
    # suppose freqSet = frozenset([2, 3, 5]) and H = [frozenset([2]), frozenset([3]), frozenset([5])];
    # then m = len(H[0]) takes the values 1 2 across the recursion
    # at m = 2 the recursion stops. If it recursed once more, H[0] = frozenset([2, 3, 5])
    # and freqSet = frozenset([2, 3, 5]), and there would be no point computing
    # the association rule between freqSet and H[0].
    m = len(H[0])
    if (len(freqSet) > (m + 1)):
        # print 'freqSet******************', len(freqSet), m + 1, freqSet, H, H[0]
        # generate all combinations of length m+1 from H; with H = [frozenset([2]), frozenset([3]), frozenset([5])]
        # the first recursive call produces [frozenset([2, 3]), frozenset([2, 5]), frozenset([3, 5])]
        # there is no second call: the recursion guard exits before it
        Hmp1 = aprioriGen(H, m+1)
        # keep the sets whose confidence exceeds the minimum confidence
        Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
        print('Hmp1=', Hmp1)
        print('len(Hmp1)=', len(Hmp1), 'len(freqSet)=', len(freqSet))
        # if any sets still exceed the minimum confidence, keep recursing;
        # otherwise stop
        if (len(Hmp1) > 1):
            # print '----------------------', Hmp1
            # print len(freqSet), len(Hmp1[0]) + 1
            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)


# generate the association rules
def generateRules(L, supportData, minConf=0.7):
    """generateRules

    Args:
        L           list of frequent item sets
        supportData dictionary of the supports of the frequent item sets
        minConf     minimum confidence
    Returns:
        bigRuleList list of confidence rules (triples of the form A->B plus the confidence)
    """
    bigRuleList = []
    # suppose L = [[frozenset([1]), frozenset([3]), frozenset([2]), frozenset([5])], [frozenset([1, 3]), frozenset([2, 5]), frozenset([2, 3]), frozenset([3, 5])], [frozenset([2, 3, 5])]]
    for i in range(1, len(L)):
        # get all elements of every combination in the frequent item sets
        for freqSet in L[i]:
            # suppose freqSet = frozenset([1, 3]) and H1 = [frozenset([1]), frozenset([3])]
            # iterate over the elements, convert them to frozensets, and collect them in a list
            H1 = [frozenset([item]) for item in freqSet]
            # combinations of 2 take the else branch; combinations of more than 2 take the if branch
            if (i > 1):
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
            else:
                calcConf(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList


def getActionIds():
    from time import sleep
    from votesmart import votesmart
    # votesmart.apikey = 'get your api key first'
    votesmart.apikey = 'a7fa40adec6f4a77178799fae4441030'
    actionIdList = []
    billTitleList = []
    fr = open('data/11.Apriori/recent20bills.txt')
    for line in fr.readlines():
        billNum = int(line.split('\t')[0])
        try:
            billDetail = votesmart.votes.getBill(billNum)  # api call
            for action in billDetail.actions:
                if action.level == 'House' and (action.stage == 'Passage' or action.stage == 'Amendment Vote'):
                    actionId = int(action.actionId)
                    print('bill: %d has actionId: %d' % (billNum, actionId))
                    actionIdList.append(actionId)
                    billTitleList.append(line.strip().split('\t')[1])
        except:
            print("problem getting bill %d" % billNum)
        sleep(1)  # delay to be polite
    return actionIdList, billTitleList


def getTransList(actionIdList, billTitleList):  # this will return a list of lists containing ints
    # these imports were missing here in the original; sleep and votesmart
    # were only imported inside getActionIds
    from time import sleep
    from votesmart import votesmart
    itemMeaning = ['Republican', 'Democratic']  # list of what each item stands for
    for billTitle in billTitleList:  # fill up itemMeaning list
        itemMeaning.append('%s -- Nay' % billTitle)
        itemMeaning.append('%s -- Yea' % billTitle)
    transDict = {}  # list of items in each transaction (politician)
    voteCount = 2
    for actionId in actionIdList:
        sleep(3)
        print('getting votes for actionId: %d' % actionId)
        try:
            voteList = votesmart.votes.getBillActionVotes(actionId)
            for vote in voteList:
                if vote.candidateName not in transDict:
                    transDict[vote.candidateName] = []
                    if vote.officeParties == 'Democratic':
                        transDict[vote.candidateName].append(1)
                    elif vote.officeParties == 'Republican':
                        transDict[vote.candidateName].append(0)
                if vote.action == 'Nay':
                    transDict[vote.candidateName].append(voteCount)
                elif vote.action == 'Yea':
                    transDict[vote.candidateName].append(voteCount + 1)
        except:
            print("problem getting actionId: %d" % actionId)
        voteCount += 2
    return transDict, itemMeaning


# currently unused
# def pntRules(ruleList, itemMeaning):
#     for ruleTup in ruleList:
#         for item in ruleTup[0]:
#             print itemMeaning[item]
#         print " -------->"
#         for item in ruleTup[1]:
#             print itemMeaning[item]
#         print "confidence: %f" % ruleTup[2]
#         print  # print a blank line


def testApriori():
    # load the test data set
    dataSet = loadDataSet()
    print('dataSet: ', dataSet)

    # generate the frequent item sets and their supports with Apriori
    L1, supportData1 = apriori(dataSet, minSupport=0.7)
    print('L(0.7): ', L1)
    print('supportData(0.7): ', supportData1)

    print('->->->->->->->->->->->->->->->->->->->->->->->->->->->->')

    # generate the frequent item sets and their supports with Apriori
    L2, supportData2 = apriori(dataSet, minSupport=0.5)
    print('L(0.5): ', L2)
    print('supportData(0.5): ', supportData2)


def testGenerateRules():
    # load the test data set
    dataSet = loadDataSet()
    print('dataSet: ', dataSet)

    # generate the frequent item sets and their supports with Apriori
    L1, supportData1 = apriori(dataSet, minSupport=0.5)
    print('L(0.7): ', L1)
    print('supportData(0.7): ', supportData1)

    # generate the association rules
    rules = generateRules(L1, supportData1, minConf=0.5)
    print('rules: ', rules)


def main():
    # test the Apriori algorithm
    testApriori()

    # generate the association rules
    # testGenerateRules()

    # # project case study
    # # build the transaction data set from US congressional voting records
    # actionIdList, billTitleList = getActionIds()
    # # test the first two
    # # transDict, itemMeaning = getTransList(actionIdList[: 2], billTitleList[: 2])
    # # transDict maps each action_id to its choices, e.g. [1, 2, 3]
    # transDict, itemMeaning = getTransList(actionIdList, billTitleList)
    # # get the full data set
    # dataSet = [transDict[key] for key in transDict.keys()]
    # L, supportData = apriori(dataSet, minSupport=0.3)
    # rules = generateRules(L, supportData, minConf=0.95)
    # print rules

    # # project case study
    # # find common traits of poisonous mushrooms
    # # get the full data set
    # dataSet = [line.split() for line in open("data/11.Apriori/mushroom.dat").readlines()]
    # L, supportData = apriori(dataSet, minSupport=0.3)
    # # 2 marks poisonous mushrooms, 1 marks edible ones
    # # find the frequent item sets containing 2: items that co-occur frequently
    # # with poisonous mushrooms may indicate poison as well
    # for item in L[1]:
    #     if item.intersection('2'):
    #         print item

    # for item in L[2]:
    #     if item.intersection('2'):
    #         print item

if __name__ == "__main__":
    main()
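The two thresholds this file filters on are, in the usual association-rule notation (a sketch matching scanD and calcConf, with D the set of transactions T):

\[ \mathrm{support}(A) = \frac{\left|\{T \in D : A \subseteq T\}\right|}{|D|}, \qquad \mathrm{conf}(A \Rightarrow B) = \frac{\mathrm{support}(A \cup B)}{\mathrm{support}(A)} \]

scanD keeps item sets with support >= minSupport; calcConf keeps rules with confidence >= minConf.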
344
src/py2.x/ml/12.FrequentPattemTree/fpGrowth.py
Normal file
@@ -0,0 +1,344 @@
#!/usr/bin/python
# coding:utf8

'''
Created on Jun 14, 2011
Update on 2017-05-18
FP-Growth FP means frequent pattern
the FP-Growth algorithm needs:
1. FP-tree (class treeNode)
2. header table (use dict)
This finds frequent itemsets similar to apriori but does not find association rules.
Author: Peter/片刻
GitHub: https://github.com/apachecn/AiLearning
'''
from __future__ import print_function
print(__doc__)


class treeNode:
    def __init__(self, nameValue, numOccur, parentNode):
        self.name = nameValue
        self.count = numOccur
        self.nodeLink = None
        # needs to be updated
        self.parent = parentNode
        self.children = {}

    def inc(self, numOccur):
        """inc (increase the count variable by the given amount)
        """
        self.count += numOccur

    def disp(self, ind=1):
        """disp (display the tree as text)
        """
        print(' '*ind, self.name, ' ', self.count)
        for child in self.children.values():
            child.disp(ind+1)


def loadSimpDat():
    simpDat = [['r', 'z', 'h', 'j', 'p'],
               ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
               ['z'],
               ['r', 'x', 'n', 'o', 's'],
               # ['r', 'x', 'n', 'o', 's'],
               ['y', 'r', 'x', 'z', 'q', 't', 'p'],
               ['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]
    return simpDat


def createInitSet(dataSet):
    retDict = {}
    for trans in dataSet:
        if frozenset(trans) not in retDict:
            retDict[frozenset(trans)] = 1
        else:
            retDict[frozenset(trans)] += 1
    return retDict


# this version does not use recursion
def updateHeader(nodeToTest, targetNode):
    """updateHeader (update the header table, linking identical elements
    together: e.g. the r on the left points at the r on the right; a later
    occurrence of an element points at the one already seen)

    Start from the header's nodeLink and follow nodeLink to the end: that is
    the linked list.
    Performance: a very long list may hit the iteration limit.

    Args:
        nodeToTest  entries satisfying minSup {element: (value, treeNode)}
        targetNode  child node of the Tree object
    """
    # link identical elements, e.g. the r on the left points at the r on the right
    while (nodeToTest.nodeLink is not None):
        nodeToTest = nodeToTest.nodeLink
    nodeToTest.nodeLink = targetNode


def updateTree(items, inTree, headerTable, count):
    """updateTree (update the FP-tree; second pass over the data)

    # processes one transaction
    # adds under the largest key
    Args:
        items       element keys satisfying minSup, sorted in descending order
        inTree      an empty Tree object
        headerTable entries satisfying minSup {element: (value, treeNode)}
        count       occurrence count of each transaction in the raw data set
    """
    # Take the element with the highest count.
    # If it is already a key in inTree.children, accumulate its count;
    # otherwise add a new key to inTree.children with a fresh treeNode as value.
    if items[0] in inTree.children:
        # update: add count to the treeNode of the most frequent element
        inTree.children[items[0]].inc(count)
    else:
        # no such child node yet, so add one to inTree
        inTree.children[items[0]] = treeNode(items[0], count, inTree)
        # If the second slot of the minSup dict's value is null, record this
        # element's tree node there;
        # otherwise update the header links
        if headerTable[items[0]][1] is None:
            # headerTable records only the first occurrence of the node
            headerTable[items[0]][1] = inTree.children[items[0]]
        else:
            # effectively updates the nodeLink of the tree stored under the headerTable key
            updateHeader(headerTable[items[0]][1], inTree.children[items[0]])
    if len(items) > 1:
        # recurse: under items[0], add items[1] as a child node; count simply
        # accumulates through the loop, giving each node its final total
        updateTree(items[1:], inTree.children[items[0]], headerTable, count)


def createTree(dataSet, minSup=1):
    """createTree (build the FP-tree)

    Args:
        dataSet  sample data as dict{transaction: occurrence count}
        minSup   minimum support
    Returns:
        retTree     the FP-tree
        headerTable entries satisfying minSup {element: (value, treeNode)}
    """
    # dict{element: occurrence count} for support >= minSup
    headerTable = {}
    # loop over the dict{transaction: occurrence count} sample data
    for trans in dataSet:
        # loop over every transaction to get all of its elements
        # and count the total occurrences of each element
        for item in trans:
            # e.g. {'ababa': 3}  count(a)=3+3+3=9  count(b)=3+3=6
            headerTable[item] = headerTable.get(item, 0) + dataSet[trans]
    # remove the elements whose count is below the minimum support
    for k in headerTable.keys():
        if headerTable[k] < minSup:
            del(headerTable[k])

    # set of elements satisfying minSup
    freqItemSet = set(headerTable.keys())
    # if none exist, return None
    if len(freqItemSet) == 0:
        return None, None
    for k in headerTable:
        # format: dict{element key: [element count, None]}
        headerTable[k] = [headerTable[k], None]

    # create tree
    retTree = treeNode('Null Set', 1, None)
    # loop over the dict{transaction: occurrence count} sample data
    for tranSet, count in dataSet.items():
        # print 'tranSet, count=', tranSet, count
        # localD = dict{element key: total occurrence count}
        localD = {}
        for item in tranSet:
            # keep only the elements that satisfy minSup
            if item in freqItemSet:
                # print 'headerTable[item][0]=', headerTable[item][0], headerTable[item]
                localD[item] = headerTable[item][0]
        # print 'localD=', localD
        if len(localD) > 0:
            # p is a (key, value) pair, so this sorts by value, descending;
            # orderedItems takes the keys (the letters), in descending count order
            orderedItems = [v[0] for v in sorted(localD.items(), key=lambda p: p[1], reverse=True)]
            # print 'orderedItems=', orderedItems, 'headerTable', headerTable, '\n\n\n'
            # populate the tree: the first entry of orderedItems fills in the
            # first level of child nodes, in order
            updateTree(orderedItems, retTree, headerTable, count)

    return retTree, headerTable


def ascendTree(leafNode, prefixPath):
    """ascendTree (if a parent node exists, record the current node's name)

    Args:
        leafNode    the nodeTree of the queried node
        prefixPath  the node value being queried
    """
    if leafNode.parent is not None:
        prefixPath.append(leafNode.name)
        ascendTree(leafNode.parent, prefixPath)


def findPrefixPath(basePat, treeNode):
    """findPrefixPath (collect the conditional pattern base)

    Args:
        basePat   the node value being queried
        treeNode  the current nodeTree containing the queried node
    Returns:
        condPats  dict keyed by the reversed path excluding basePat, valued by the count
    """
    condPats = {}
    # walk along treeNode's links
    while treeNode is not None:
        prefixPath = []
        # ascend to the node's ancestors, i.e. find its frequent item set
        ascendTree(treeNode, prefixPath)
        # avoid adding an empty node for a lone element such as `Z`
        if len(prefixPath) > 1:
            # key: the reversed path excluding basePat; value: the count
            # prefixPath[1:] loses its letter order once turned into a frozenset
            # condPats[frozenset(prefixPath)] = treeNode.count
            condPats[frozenset(prefixPath[1:])] = treeNode.count
        # continue: find the next linked node with the same value
        treeNode = treeNode.nodeLink
        # print treeNode
    return condPats


def mineTree(inTree, headerTable, minSup, preFix, freqItemList):
    """mineTree (build conditional FP-trees)

    Args:
        inTree       myFPtree
        headerTable  entries satisfying minSup {element: (value, treeNode)}
        minSup       minimum support
        preFix       the stored record from the previous newFreqSet; it is only updated while myHead exists
        freqItemList list collecting the frequent item sets
    """
    # sort by value in ascending order to get the frequent item set keys;
    # the list starts with the keys of smallest support
    bigL = [v[0] for v in sorted(headerTable.items(), key=lambda p: p[1])]
    print('-----', sorted(headerTable.items(), key=lambda p: p[1]))
    print('bigL=', bigL)
    # iterate over the frequent item set keys from small to large,
    # recursively searching for the corresponding frequent item sets
    for basePat in bigL:
        # preFix is the stored record from the previous newFreqSet; it is only updated while myHead exists
        newFreqSet = preFix.copy()
        newFreqSet.add(basePat)
        print('newFreqSet=', newFreqSet, preFix)

        freqItemList.append(newFreqSet)
        print('freqItemList=', freqItemList)
        condPattBases = findPrefixPath(basePat, headerTable[basePat][1])
        print('condPattBases=', basePat, condPattBases)

        # build the conditional FP-tree
        myCondTree, myHead = createTree(condPattBases, minSup)
        print('myHead=', myHead)
        # mine the conditional FP-tree; a non-empty myHead means entries
        # satisfying minSup {element: (value, treeNode)} remain
        if myHead is not None:
            myCondTree.disp(1)
            print('\n\n\n')
            # recurse on myHead to find the frequent item sets
            mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList)
        print('\n\n\n')


# import twitter
# from time import sleep
# import re


# def getLotsOfTweets(searchStr):
#     """
#     fetch 100 pages of search results
#     """
#     CONSUMER_KEY = ''
#     CONSUMER_SECRET = ''
#     ACCESS_TOKEN_KEY = ''
#     ACCESS_TOKEN_SECRET = ''
#     api = twitter.Api(consumer_key=CONSUMER_KEY, consumer_secret=CONSUMER_SECRET, access_token_key=ACCESS_TOKEN_KEY, access_token_secret=ACCESS_TOKEN_SECRET)

#     # you can get 1500 results 15 pages * 100 per page
#     resultsPages = []
#     for i in range(1, 15):
#         print "fetching page %d" % i
#         searchResults = api.GetSearch(searchStr, per_page=100, page=i)
#         resultsPages.append(searchResults)
#         sleep(6)
#     return resultsPages


# def textParse(bigString):
#     """
#     parse page content
#     """
#     urlsRemoved = re.sub('(http:[/][/]|www.)([a-z]|[A-Z]|[0-9]|[/.]|[~])*', '', bigString)
#     listOfTokens = re.split(r'\W*', urlsRemoved)
#     return [tok.lower() for tok in listOfTokens if len(tok) > 2]


# def mineTweets(tweetArr, minSup=5):
#     """
#     get the frequent item sets
#     """
#     parsedList = []
#     for i in range(14):
#         for j in range(100):
#             parsedList.append(textParse(tweetArr[i][j].text))
#     initSet = createInitSet(parsedList)
#     myFPtree, myHeaderTab = createTree(initSet, minSup)
#     myFreqList = []
#     mineTree(myFPtree, myHeaderTab, minSup, set([]), myFreqList)
#     return myFreqList


if __name__ == "__main__":
    # rootNode = treeNode('pyramid', 9, None)
    # rootNode.children['eye'] = treeNode('eye', 13, None)
    # rootNode.children['phoenix'] = treeNode('phoenix', 3, None)
    # # display the tree as text
    # # print rootNode.disp()

    # load the sample data
    simpDat = loadSimpDat()
    # print simpDat, '\n'
    # format as frozensets and reload the sample data, counting every
    # transaction; format: {transaction: occurrence count}
    initSet = createInitSet(simpDat)
    print(initSet)

    # Build the FP-tree.
    # Input: sample data as dict{transaction: occurrence count} and the minimum support.
    # Output: the final FP-tree. A loop fills in the first-level nodes, then each
    # level recursively attaches the children of every transaction, i.e. the
    # branches; the so-called links simply point later nodes at existing ones.
    myFPtree, myHeaderTab = createTree(initSet, 3)
    myFPtree.disp()

    # extract the conditional pattern bases:
    # query the frequent subpatterns of tree nodes
    print('x --->', findPrefixPath('x', myHeaderTab['x'][1]))
    print('z --->', findPrefixPath('z', myHeaderTab['z'][1]))
    print('r --->', findPrefixPath('r', myHeaderTab['r'][1]))

    # build the conditional pattern bases
    freqItemList = []
    mineTree(myFPtree, myHeaderTab, 3, set([]), freqItemList)
    print(freqItemList)

    # # project work
    # # 1. twitter case study
    # # cannot run, since there is no twitter connection
    # lotsOtweets = getLotsOfTweets('RIMM')
    # listOfTerms = mineTweets(lotsOtweets, 20)
    # print len(listOfTerms)
    # for t in listOfTerms:
    #     print t

    # # 2. mining news site click streams, e.g. what else did readers of article 1 read?
    # parsedDat = [line.split() for line in open('data/12.FPGrowth/kosarak.dat').readlines()]
    # initSet = createInitSet(parsedDat)
    # myFPtree, myHeaderTab = createTree(initSet, 100000)

    # myFreList = []
    # mineTree(myFPtree, myHeaderTab, 100000, set([]), myFreList)
    # print myFreList
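createTree deletes from headerTable while iterating over headerTable.keys(). That is fine in Python 2, where keys() returns a list, but it raises RuntimeError in Python 3, where keys() is a live view. A minimal Python 3-safe sketch of the pruning loop:

for k in list(headerTable.keys()):  # snapshot the keys before deleting
    if headerTable[k] < minSup:
        del(headerTable[k])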
153
src/py2.x/ml/13.PCA/pca.py
Normal file
@@ -0,0 +1,153 @@
|
||||
#!/usr/bin/python
# coding: utf-8

'''
Created on Jun 1, 2011
Update on 2017-05-18
Author: Peter Harrington/片刻
GitHub: https://github.com/apachecn/AiLearning
'''
from __future__ import print_function
from numpy import *
import matplotlib.pyplot as plt
print(__doc__)


def loadDataSet(fileName, delim='\t'):
    fr = open(fileName)
    stringArr = [line.strip().split(delim) for line in fr.readlines()]
    datArr = [map(float, line) for line in stringArr]  # Python 2: map returns a list
    return mat(datArr)


def pca(dataMat, topNfeat=9999999):
    """pca

    Args:
        dataMat   original dataset matrix
        topNfeat  number of top features to keep
    Returns:
        lowDDataMat  dataset after dimensionality reduction
        reconMat     data reconstructed in the original space
    """

    # Mean of every column
    meanVals = mean(dataMat, axis=0)
    # print 'meanVals', meanVals

    # Subtract the mean from every sample
    meanRemoved = dataMat - meanVals
    # print 'meanRemoved=', meanRemoved

    # cov(X, Y) = [(x1-xmean)(y1-ymean) + (x2-xmean)(y2-ymean) + ... + (xn-xmean)(yn-ymean)] / (n-1)
    '''
    Variance:          (1-D) how much a single variable deviates from its mean
    Covariance:        (2-D) how two variables deviate from their means together
    Covariance matrix: (n-D) pairwise covariances of all dimensions

    cov(X, Y) > 0: X and Y are positively correlated (as X grows, Y grows; as X shrinks, Y shrinks)
    cov(X, Y) < 0: X and Y are negatively correlated
    cov(X, Y) = 0: X and Y are uncorrelated
    '''
    covMat = cov(meanRemoved, rowvar=0)

    # eigVals are the eigenvalues, eigVects the eigenvectors
    eigVals, eigVects = linalg.eig(mat(covMat))
    # print 'eigVals=', eigVals
    # print 'eigVects=', eigVects
    # Sort the eigenvalues ascending and return the indices of the sort;
    # reading that index array backwards yields the topNfeat largest eigenvectors
    '''
    >>> x = np.array([3, 1, 2])
    >>> np.argsort(x)
    array([1, 2, 0])   # x[1]=1 smallest, then x[2]=2, then x[0]=3
    >>> y = np.argsort(x)
    >>> y[::-1]
    array([0, 2, 1])
    >>> y[:-3:-1]
    array([0, 2])      # takes elements -1 and -2
    >>> y[:-6:-1]
    array([0, 2, 1])
    '''
    eigValInd = argsort(eigVals)
    # print 'eigValInd1=', eigValInd

    # -1 steps backwards: take the topNfeat largest eigenvalues
    # [from -1 down to -(topNfeat+1), exclusive]
    eigValInd = eigValInd[:-(topNfeat+1):-1]
    # print 'eigValInd2=', eigValInd
    # Reorder eigVects from the largest to the smallest eigenvalue
    redEigVects = eigVects[:, eigValInd]
    # print 'redEigVects=', redEigVects.T
    # Project the data onto the new space
    # print "---", shape(meanRemoved), shape(redEigVects)
    lowDDataMat = meanRemoved * redEigVects
    reconMat = (lowDDataMat * redEigVects.T) + meanVals
    # print 'lowDDataMat=', lowDDataMat
    # print 'reconMat=', reconMat
    return lowDDataMat, reconMat


def replaceNanWithMean():
    datMat = loadDataSet('data/13.PCA/secom.data', ' ')
    numFeat = shape(datMat)[1]
    for i in range(numFeat):
        # Mean of the non-NaN values in this column
        # (.A returns the array underlying the matrix)
        meanVal = mean(datMat[nonzero(~isnan(datMat[:, i].A))[0], i])
        # Replace the NaN values with that mean
        datMat[nonzero(isnan(datMat[:, i].A))[0], i] = meanVal
    return datMat


def show_picture(dataMat, reconMat):
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(dataMat[:, 0].flatten().A[0], dataMat[:, 1].flatten().A[0], marker='^', s=90)
    ax.scatter(reconMat[:, 0].flatten().A[0], reconMat[:, 1].flatten().A[0], marker='o', s=50, c='red')
    plt.show()


def analyse_data(dataMat):
    meanVals = mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals
    covMat = cov(meanRemoved, rowvar=0)
    eigvals, eigVects = linalg.eig(mat(covMat))
    eigValInd = argsort(eigvals)

    topNfeat = 20
    eigValInd = eigValInd[:-(topNfeat+1):-1]
    cov_all_score = float(sum(eigvals))
    sum_cov_score = 0
    for i in range(0, len(eigValInd)):
        line_cov_score = float(eigvals[eigValInd[i]])
        sum_cov_score += line_cov_score
        '''
        More than 20% of the eigenvalues turn out to be 0. This means those
        features are copies of other features: they can be expressed through
        the others and carry no extra information.

        The first ~15 values have magnitude above 10^5; after that the values
        become very small. Only a handful of features are important, and their
        importance drops off quickly.

        Finally, a few eigenvalues are slightly negative; these come from
        numerical error and should be rounded to 0.
        '''
        print('principal component: %s, variance ratio: %s%%, cumulative variance ratio: %s%%' % (format(i+1, '2.0f'), format(line_cov_score/cov_all_score*100, '4.2f'), format(sum_cov_score/cov_all_score*100, '4.1f')))


if __name__ == "__main__":
    # # Load the data and convert it to float
    # dataMat = loadDataSet('data/13.PCA/testSet.txt')
    # # Keep only 1 eigenvector
    # lowDmat, reconMat = pca(dataMat, 1)
    # # Keeping 2 eigenvectors reproduces the original data unchanged
    # # lowDmat, reconMat = pca(dataMat, 2)
    # # print shape(lowDmat)
    # show_picture(dataMat, reconMat)

    # Reduce the dimensionality of the semiconductor manufacturing data with PCA
    dataMat = replaceNanWithMean()
    print(shape(dataMat))
    # Analyse the data
    analyse_data(dataMat)
    # lowDmat, reconMat = pca(dataMat, 20)
    # print shape(lowDmat)
    # show_picture(dataMat, reconMat)
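The eigendecomposition route in pca() and an SVD of the centered data span the same principal subspace, so the reconstructions agree. A minimal standalone NumPy sketch (toy random data and variable names of my own, not part of pca.py) that checks this:

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(100, 3).dot(np.diag([2.0, 1.0, 0.1]))  # toy data with one weak direction
Xc = X - X.mean(axis=0)

# Eigendecomposition of the covariance matrix (what pca() does)
eigvals, eigvecs = np.linalg.eigh(np.cov(Xc, rowvar=False))
top = eigvecs[:, np.argsort(eigvals)[::-1][:2]]       # top-2 principal axes
recon_eig = Xc.dot(top).dot(top.T)

# SVD of the centered data: right singular vectors span the same subspace
_, _, VT = np.linalg.svd(Xc, full_matrices=False)
recon_svd = Xc.dot(VT[:2].T).dot(VT[:2])

print(np.allclose(recon_eig, recon_svd))  # True: same 2-D principal subspace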
363
src/py2.x/ml/14.SVD/svdRecommend.py
Normal file
@@ -0,0 +1,363 @@
#!/usr/bin/python
# coding: utf-8

'''
Created on Mar 8, 2011
Update on 2017-05-18
Author: Peter Harrington/山上有课树/片刻
GitHub: https://github.com/apachecn/AiLearning
'''
from __future__ import print_function
from numpy import linalg as la
from numpy import *


def loadExData3():
    # Dish matrix used to show how SVD improves recommendations
    return [[2, 0, 0, 4, 4, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5],
            [0, 0, 0, 0, 0, 0, 0, 1, 0, 4, 0],
            [3, 3, 4, 0, 3, 0, 0, 2, 2, 0, 0],
            [5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, 0, 5, 0, 0, 5, 0],
            [4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 5],
            [0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4],
            [0, 0, 0, 0, 0, 0, 5, 0, 0, 5, 0],
            [0, 0, 0, 3, 0, 0, 0, 0, 4, 5, 0],
            [1, 1, 2, 1, 1, 2, 1, 0, 4, 5, 0]]


def loadExData2():
    # Example matrix given in the book
    return [[0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 5],
            [0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 3],
            [0, 0, 0, 0, 4, 0, 0, 1, 0, 4, 0],
            [3, 3, 4, 0, 0, 0, 0, 2, 2, 0, 0],
            [5, 4, 5, 0, 0, 0, 0, 5, 5, 0, 0],
            [0, 0, 0, 0, 5, 0, 1, 0, 0, 5, 0],
            [4, 3, 4, 0, 0, 0, 0, 5, 5, 0, 1],
            [0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 4],
            [0, 0, 0, 2, 0, 2, 5, 0, 0, 1, 2],
            [0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 0],
            [1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0]]


def loadExData():
    """
    # Example matrix for the recommendation engine
    return [[4, 4, 0, 2, 2],
            [4, 0, 0, 3, 3],
            [4, 0, 0, 1, 1],
            [1, 1, 1, 2, 0],
            [2, 2, 2, 0, 0],
            [1, 1, 1, 0, 0],
            [5, 5, 5, 0, 0]]
    """
    # # Original matrix
    # return [[1, 1, 1, 0, 0],
    #         [2, 2, 2, 0, 0],
    #         [1, 1, 1, 0, 0],
    #         [5, 5, 5, 0, 0],
    #         [1, 1, 0, 2, 2],
    #         [0, 0, 0, 3, 3],
    #         [0, 0, 0, 1, 1]]

    # Original matrix
    return [[0, -1.6, 0.6],
            [0, 1.2, 0.8],
            [0, 0, 0],
            [0, 0, 0]]


# Similarity measures; inA and inB are assumed to be column vectors
# Based on the Euclidean distance
def ecludSim(inA, inB):
    return 1.0/(1.0 + la.norm(inA - inB))


# pearsSim() checks whether there are 3 or more points;
# corrcoef computes the Pearson correlation in [-1, 1], rescaled here to [0, 1]
def pearsSim(inA, inB):
    # With fewer than 3 points the vectors are trivially fully correlated: return 1.0
    if len(inA) < 3:
        return 1.0
    return 0.5 + 0.5 * corrcoef(inA, inB, rowvar=0)[0][1]


# Cosine similarity: 0 for a 90 degree angle, 1.0 when the vectors point the same way
def cosSim(inA, inB):
    num = float(inA.T*inB)
    denom = la.norm(inA)*la.norm(inB)
    return 0.5 + 0.5*(num/denom)


# Item-similarity based recommendation engine
def standEst(dataMat, user, simMeas, item):
    """standEst(for an item the user has not rated, compute its similarity to the
    items rated by users who rated both, then combine those similarities into a score)

    Args:
        dataMat  training dataset
        user     user index
        simMeas  similarity measure
        item     index of the unrated item
    Returns:
        ratSimTotal/simTotal  estimated rating (a value between 0 and 5)
    """
    # Number of items in the dataset
    n = shape(dataMat)[1]
    # Initialise the two accumulators
    simTotal = 0.0
    ratSimTotal = 0.0
    # Loop over the items the user has rated and compare each to the target item
    for j in range(n):
        userRating = dataMat[user, j]
        # Skip items the user has not rated
        if userRating == 0:
            continue
        # Find the users who rated both items:
        # overLap holds the row indices where both item columns are non-zero
        # (logical_and computes the element-wise truth value of x1 and x2)
        overLap = nonzero(logical_and(dataMat[:, item].A > 0, dataMat[:, j].A > 0))[0]
        # No overlap means no shared evidence: similarity is 0, move on
        if len(overLap) == 0:
            similarity = 0
        # Otherwise compute the similarity on the overlapping ratings
        else:
            similarity = simMeas(dataMat[overLap, item], dataMat[overLap, j])
        # print 'the %d and %d similarity is : %f' % (item, j, similarity)
        # Accumulate the similarities, weighting each one by the user's rating
        # (similarity: item similarity; userRating: the user's rating of item j)
        simTotal += similarity
        ratSimTotal += similarity * userRating
    if simTotal == 0:
        return 0
    # Normalise by the total similarity so the score stays between 0 and 5;
    # these scores are used to rank the predictions
    else:
        return ratSimTotal/simTotal


# SVD-based rating estimation
# Passed to recommend() in place of standEst(); builds a rating estimate for the given user and item
def svdEst(dataMat, user, simMeas, item):
    """svdEst( )

    Args:
        dataMat  training dataset
        user     user index
        simMeas  similarity measure
        item     index of the unrated item
    Returns:
        ratSimTotal/simTotal  estimated rating (a value between 0 and 5)
    """
    # Number of items
    n = shape(dataMat)[1]
    # Decompose the dataset with SVD
    simTotal = 0.0
    ratSimTotal = 0.0
    # Singular value decomposition.
    # After the SVD we keep only the singular values holding 90% of the energy,
    # stored as a NumPy array
    U, Sigma, VT = la.svd(dataMat)

    # # Analyse how many values of Sigma to keep
    # analyse_data(Sigma, 20)

    # To use the singular values in matrix algebra, build a diagonal matrix from them
    Sig4 = mat(eye(4) * Sigma[: 4])

    # Use U to map the items into the low-dimensional space
    # (items + 4 main features)
    xformedItems = dataMat.T * U[:, :4] * Sig4.I
    print('dataMat', shape(dataMat))
    print('U[:, :4]', shape(U[:, :4]))
    print('Sig4.I', shape(Sig4.I))
    print('VT[:4, :]', shape(VT[:4, :]))
    print('xformedItems', shape(xformedItems))

    # For the given user, loop over the elements in the user's row,
    # just like the for loop in standEst(), except that here the similarity
    # is computed in the low-dimensional space
    for j in range(n):
        userRating = dataMat[user, j]
        if userRating == 0 or j == item:
            continue
        # The similarity measure is passed in as a parameter
        similarity = simMeas(xformedItems[item, :].T, xformedItems[j, :].T)
        # This print shows the progress of the similarity computation; remove it if too noisy
        print('the %d and %d similarity is: %f' % (item, j, similarity))
        # Accumulate the similarities
        simTotal += similarity
        # Accumulate similarity times rating
        ratSimTotal += similarity * userRating
    if simTotal == 0:
        return 0
    else:
        # Estimated rating
        return ratSimTotal/simTotal


# recommend() is the recommendation engine; by default it calls standEst() and
# returns the top N recommendations. N defaults to 3; the remaining parameters
# pick the similarity measure and the estimation method
def recommend(dataMat, user, N=3, simMeas=cosSim, estMethod=standEst):
    """recommend( )

    Args:
        dataMat    training dataset
        user       user index
        simMeas    similarity measure
        estMethod  estimation method to use
    Returns:
        the top N recommendations
    """
    # Find the unrated items:
    # build the list of items the given user has not rated
    unratedItems = nonzero(dataMat[user, :].A == 0)[1]
    # If there is nothing unrated, leave the function
    if len(unratedItems) == 0:
        return 'you rated everything'
    # (item index, estimated rating) pairs
    itemScores = []
    # Loop over the unrated items
    for item in unratedItems:
        # Estimate this item's rating
        estimatedScore = estMethod(dataMat, user, simMeas, item)
        itemScores.append((item, estimatedScore))
    # Sort by estimated rating, descending, and return the top N unrated items
    return sorted(itemScores, key=lambda jj: jj[1], reverse=True)[: N]


def analyse_data(Sigma, loopNum=20):
    """analyse_data(analyse how many values of Sigma to keep)

    Args:
        Sigma    the singular values
        loopNum  number of iterations
    """
    # Total variance (total energy)
    Sig2 = Sigma**2
    SigmaSum = sum(Sig2)
    for i in range(loopNum):
        SigmaI = sum(Sig2[:i+1])
        '''
        Pick the number of singular values to keep according to your own use case.

        Keeping 80% ~ 90% of the matrix energy usually retains the important
        features and removes the noise.
        '''
        print('principal component: %s, variance ratio: %s%%' % (format(i+1, '2.0f'), format(SigmaI/SigmaSum*100, '4.2f')))


# Image compression helpers
# Load and convert the data
def imgLoadData(filename):
    myl = []
    # Open the text file and read the characters in as an array
    for line in open(filename).readlines():
        newRow = []
        for i in range(32):
            newRow.append(int(line[i]))
        myl.append(newRow)
    # Once loaded, the matrix can be printed to the screen
    myMat = mat(myl)
    return myMat


# Print a matrix
def printMat(inMat, thresh=0.8):
    # The matrix holds floats, so define light and dark values: walk every
    # element and print 1 when it exceeds the threshold, 0 otherwise
    for i in range(32):
        for k in range(32):
            if float(inMat[i, k]) > thresh:
                print(1, end=' ')
            else:
                print(0, end=' ')
        print('')


# Image compression: reconstruct the image from any given number of singular values
def imgCompress(numSV=3, thresh=0.8):
    """imgCompress( )

    Args:
        numSV   number of singular values to keep
        thresh  threshold used when printing
    """
    # Build the matrix
    myMat = imgLoadData('data/14.SVD/0_5.txt')

    print("****original matrix****")
    # SVD-decompose the original image and reconstruct it
    printMat(myMat, thresh)

    # The reconstruction goes through SigRecon, built from Sigma.
    # Sigma is diagonal, so create an all-zero matrix and fill the leading
    # singular values onto the diagonal.
    U, Sigma, VT = la.svd(myMat)
    # SigRecon = mat(zeros((numSV, numSV)))
    # for k in range(numSV):
    #     SigRecon[k, k] = Sigma[k]

    # Analyse how many values of Sigma to keep
    analyse_data(Sigma, 20)

    SigRecon = mat(eye(numSV) * Sigma[: numSV])
    reconMat = U[:, :numSV] * SigRecon * VT[:numSV, :]
    print("****reconstructed matrix using %d singular values *****" % numSV)
    printMat(reconMat, thresh)


if __name__ == "__main__":

    # # SVD-decompose a matrix (SVD in Python)
    # Data = loadExData()
    # print 'Data:', Data
    # U, Sigma, VT = linalg.svd(Data)
    # # Print Sigma: the first 3 values (9.72140007e+00, 5.29397912e+00, 6.84226362e-01)
    # # are much larger than the rest; the last two are tiny (the exact output may
    # # vary by machine) and can be dropped
    # print 'U:', U
    # print 'Sigma', Sigma
    # print 'VT:', VT
    # print 'VT:', VT.T

    # # Rebuild a 3x3 matrix Sig3
    # Sig3 = mat([[Sigma[0], 0, 0], [0, Sigma[1], 0], [0, 0, Sigma[2]]])
    # print U[:, :3] * Sig3 * VT[:3, :]

    """
    # Euclidean distance
    myMat = mat(loadExData())
    # print myMat
    print ecludSim(myMat[:, 0], myMat[:, 4])
    print ecludSim(myMat[:, 0], myMat[:, 0])

    # Cosine similarity
    print cosSim(myMat[:, 0], myMat[:, 4])
    print cosSim(myMat[:, 0], myMat[:, 0])

    # Pearson correlation
    print pearsSim(myMat[:, 0], myMat[:, 4])
    print pearsSim(myMat[:, 0], myMat[:, 0])
    """

    # Similarity-based recommendation
    myMat = mat(loadExData3())
    # print myMat
    # First way to compute the similarity
    print(recommend(myMat, 1, estMethod=svdEst))
    # Second way to compute the similarity
    print(recommend(myMat, 1, estMethod=svdEst, simMeas=pearsSim))

    # Default recommendation (restaurant dish example)
    print(recommend(myMat, 2))

    """
    # Using SVD to improve recommendations
    U, Sigma, VT = la.svd(mat(loadExData2()))
    print Sigma               # inspect the SVD to see how many dimensions are needed
    Sig2 = Sigma**2           # how many singular values reach 90% of the total energy?
    print sum(Sig2)           # total energy
    print sum(Sig2) * 0.9     # 90% of the total energy
    print sum(Sig2[: 2])      # energy in the first two values
    print sum(Sig2[: 3])      # two values fall short of 90%, so try the first three
    # this exceeds 90% of the total energy, which is enough
    """

    # Compress an image
    # imgCompress(2)
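The 90%-energy rule used by analyse_data() amounts to finding the smallest k with sum(Sigma[:k]**2) >= 0.9 * sum(Sigma**2). A minimal standalone NumPy sketch (a random toy matrix of my own, not the book's data):

import numpy as np

rng = np.random.RandomState(1)
A = rng.randn(11, 4).dot(rng.randn(4, 11))   # rank-4 toy matrix
Sigma = np.linalg.svd(A, compute_uv=False)

energy = Sigma**2
cum = np.cumsum(energy) / energy.sum()
k = int(np.searchsorted(cum, 0.9)) + 1       # smallest k reaching 90% of the energy
print(k, cum[:k])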
60
src/py2.x/ml/15.BigData_MapReduce/mrMean.py
Normal file
@@ -0,0 +1,60 @@
#!/usr/bin/python
# coding:utf8
'''
Created on 2017-04-07
Update on 2017-06-20
Author: Peter/ApacheCN-xy/片刻
GitHub: https://github.com/apachecn/AiLearning
'''
from mrjob.job import MRJob


class MRmean(MRJob):
    def __init__(self, *args, **kwargs):  # initialise the accumulators
        super(MRmean, self).__init__(*args, **kwargs)
        self.inCount = 0
        self.inSum = 0
        self.inSqSum = 0

    # Consume the input stream
    def map(self, key, val):  # takes exactly 2 arguments; accumulates the sum and the sum of squares
        if False:
            yield
        inVal = float(val)
        self.inCount += 1
        self.inSum += inVal
        self.inSqSum += inVal*inVal

    # Runs once all the input has arrived
    def map_final(self):  # emit the count, the mean and the mean of the squares
        mn = self.inSum/self.inCount
        mnSq = self.inSqSum/self.inCount
        yield (1, [self.inCount, mn, mnSq])

    def reduce(self, key, packedValues):
        cumN, cumVal, cumSumSq = 0.0, 0.0, 0.0
        for valArr in packedValues:  # pull the values off the input stream
            nj = float(valArr[0])
            cumN += nj
            cumVal += nj*float(valArr[1])
            cumSumSq += nj*float(valArr[2])
        mean = cumVal/cumN
        var = (cumSumSq - 2*mean*cumVal + cumN*mean*mean)/cumN
        yield (mean, var)  # emit the global mean and variance

    def steps(self):
        """
        The steps method defines the stages to execute.
        The order does not have to follow the strict map-reduce pattern, e.g.:
        1. map-reduce-reduce-reduce
        2. map-reduce-map-reduce-map-reduce
        Inside steps(), tell mrjob which mapper and reducer to use; without it,
        mrjob falls back to the default mapper and reducer methods.

        State can be shared between mapper and mapper_final, but neither mapper
        nor mapper_final can share state with the reducer.
        """
        return ([self.mr(mapper=self.map, mapper_final=self.map_final, reducer=self.reduce,)])


if __name__ == '__main__':
    MRmean.run()
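The reducer's variance formula is E[x^2] - mean^2 written over merged partitions: var = (sum_x2 - 2*mean*sum_x + n*mean^2) / n. A minimal pure-Python sketch (toy numbers, no mrjob) that merges two mapper outputs of the form [count, mean, mean of squares] and checks the result against a direct computation:

# Two "mapper" partitions of a toy dataset
part1 = [1.0, 2.0, 3.0]
part2 = [4.0, 5.0]
packed = []
for part in (part1, part2):
    n = len(part)
    packed.append([n, sum(part)/n, sum(x*x for x in part)/n])

# The reducer-side merge, mirroring MRmean.reduce()
cumN = cumVal = cumSumSq = 0.0
for n, mn, mnSq in packed:
    cumN += n
    cumVal += n*mn
    cumSumSq += n*mnSq
mean = cumVal/cumN
var = (cumSumSq - 2*mean*cumVal + cumN*mean*mean)/cumN

data = part1 + part2
assert abs(mean - sum(data)/len(data)) < 1e-12
assert abs(var - sum((x-mean)**2 for x in data)/len(data)) < 1e-12
print(mean, var)  # 3.0 2.0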
41
src/py2.x/ml/15.BigData_MapReduce/mrMeanMapper.py
Normal file
@@ -0,0 +1,41 @@
#!/usr/bin/python
# coding:utf8
'''
Created on 2017-04-06
Update on 2017-06-20
Machine Learning in Action Chapter 18
Map Reduce Job for Hadoop Streaming
Author: Peter/ApacheCN-xy/片刻
GitHub: https://github.com/apachecn/AiLearning
'''
from __future__ import print_function
import sys
from numpy import mat, mean, power


'''
This mapper reads all the input line by line, builds the corresponding floats,
takes the length of the list and creates a NumPy matrix. It then squares all
the values and finally emits the mean and the mean of the squares. Those
values are used downstream to compute the global mean and variance.

Args:
    file  input data
Return:
'''


def read_input(file):
    for line in file:
        yield line.rstrip()  # a generator: yields one value at a time, saving memory


input = read_input(sys.stdin)             # generator over the input lines
input = [float(line) for line in input]   # convert the values to float
numInputs = len(input)                    # number of values, i.e. lines in the input file
input = mat(input)                        # turn the list into a matrix
sqInput = power(input, 2)                 # square every value

# Emit the count, the mean of the n values, and the mean of their squares.
# The first line goes to stdout and becomes the reducer's input.
# The second line goes to stderr as a status report to the master node,
# showing that this worker is still alive.
# Note: it is good practice to send status reports to stderr; if a task
# reports nothing for 10 minutes, Hadoop kills it.
print("%d\t%f\t%f" % (numInputs, mean(input), mean(sqInput)))  # compute the means
print("map report: still alive", file=sys.stderr)
47
src/py2.x/ml/15.BigData_MapReduce/mrMeanReducer.py
Normal file
@@ -0,0 +1,47 @@
#!/usr/bin/python
# coding:utf8

'''
Created on 2017-04-06
Update on 2017-06-20
Machine Learning in Action Chapter 18
Map Reduce Job for Hadoop Streaming
Author: Peter/ApacheCN-xy/片刻
GitHub: https://github.com/apachecn/AiLearning
'''
from __future__ import print_function
import sys


'''
The mappers take the raw input and emit intermediate values for the reducer.
Many mappers run in parallel, so their outputs have to be merged into one
value, i.e. the intermediate key/value pairs are combined here.
'''


def read_input(file):
    for line in file:
        yield line.rstrip()  # yields every line of the input file


input = read_input(sys.stdin)  # generator over the input lines

# Split each input line into its fields, giving a list of lists
mapperOut = [line.split('\t') for line in input]
# Each mapper line holds: the count, the mean, and the mean of squares of its n values
print(mapperOut)

# Accumulate the total count, the total sum, and the total sum of squares
cumN, cumVal, cumSumSq = 0.0, 0.0, 0.0
for instance in mapperOut:
    nj = float(instance[0])
    cumN += nj
    cumVal += nj*float(instance[1])
    cumSumSq += nj*float(instance[2])

# Compute the global mean (varSum is the expanded form of the variance)
mean_ = cumVal/cumN
varSum = (cumSumSq - 2*mean_*cumVal + cumN*mean_*mean_)/cumN
# Emit the total count, the mean, and the variance
print("total: %d\tmean: %f\tvariance: %f" % (cumN, mean_, varSum))
print("reduce report: still alive", file=sys.stderr)
95
src/py2.x/ml/15.BigData_MapReduce/mrSVM.py
Normal file
@@ -0,0 +1,95 @@
#!/usr/bin/python
# coding:utf8
'''
Created on 2017-04-07
Update on 2017-06-20
MapReduce version of Pegasos SVM
Using mrjob to automate job flow
Author: Peter/ApacheCN-xy/片刻
GitHub: https://github.com/apachecn/AiLearning
'''
from mrjob.job import MRJob

import pickle
from numpy import *


class MRsvm(MRJob):
    DEFAULT_INPUT_PROTOCOL = 'json_value'

    def __init__(self, *args, **kwargs):
        super(MRsvm, self).__init__(*args, **kwargs)
        self.data = pickle.load(open('/opt/git/MachineLearnidata/15.BigData_MapReduce/svmDat27'))
        self.w = 0
        self.eta = 0.69
        self.dataList = []
        self.k = self.options.batchsize
        self.numMappers = 1
        self.t = 1  # iteration number

    def configure_options(self):
        super(MRsvm, self).configure_options()
        self.add_passthrough_option(
            '--iterations', dest='iterations', default=2, type='int',
            help='T: number of iterations to run')
        self.add_passthrough_option(
            '--batchsize', dest='batchsize', default=100, type='int',
            help='k: number of data points in a batch')

    def map(self, mapperId, inVals):  # takes exactly 2 arguments
        # input: nodeId, ('w', w-vector) OR nodeId, ('x', int)
        if False:
            yield
        if inVals[0] == 'w':  # accumulate the w vector
            self.w = inVals[1]
        elif inVals[0] == 'x':
            self.dataList.append(inVals[1])  # accumulate the data points to process
        elif inVals[0] == 't':  # iteration number
            self.t = inVals[1]
        else:
            self.eta = inVals  # debug only; eta is not used in map

    def map_fin(self):
        labels = self.data[:, -1]
        X = self.data[:, :-1]  # split the data back into X and the labels
        if self.w == 0:
            self.w = [0.001] * shape(X)[1]  # initialise w on the first iteration
        for index in self.dataList:
            p = mat(self.w)*X[index, :].T  # calc p=w*dataSet[key].T
            if labels[index]*p < 1.0:
                yield (1, ['u', index])  # make sure everything shares the same key
        yield (1, ['w', self.w])         # so it all lands on the same reducer
        yield (1, ['t', self.t])

    def reduce(self, _, packedVals):
        for valArr in packedVals:  # pull the values off the input stream
            if valArr[0] == 'u':
                self.dataList.append(valArr[1])
            elif valArr[0] == 'w':
                self.w = valArr[1]
            elif valArr[0] == 't':
                self.t = valArr[1]

        labels = self.data[:, -1]
        X = self.data[:, 0:-1]
        wMat = mat(self.w)
        wDelta = mat(zeros(len(self.w)))

        for index in self.dataList:
            wDelta += float(labels[index]) * X[index, :]  # wDelta += label*dataSet
        eta = 1.0/(2.0*self.t)  # calc new: eta
        # calc new: w = (1.0 - 1/t)*w + (eta/k)*wDelta
        wMat = (1.0 - 1.0/self.t)*wMat + (eta/self.k)*wDelta
        for mapperNum in range(1, self.numMappers+1):
            yield (mapperNum, ['w', wMat.tolist()[0]])  # emit w
            if self.t < self.options.iterations:
                yield (mapperNum, ['t', self.t+1])      # increment t
                for j in range(self.k/self.numMappers):  # emit random ints so the mappers sample iid
                    yield (mapperNum, ['x', random.randint(shape(self.data)[0])])

    def steps(self):
        return ([self.mr(mapper=self.map, reducer=self.reduce, mapper_final=self.map_fin)] * self.options.iterations)


if __name__ == '__main__':
    MRsvm.run()
13
src/py2.x/ml/15.BigData_MapReduce/mrSVMkickStart.py
Normal file
@@ -0,0 +1,13 @@
'''
Created on Feb 27, 2011

Author: Peter
'''
from mrjob.protocol import JSONProtocol
from numpy import *

fw = open('kickStart2.txt', 'w')
for i in [1]:
    for j in range(100):
        fw.write('["x", %d]\n' % random.randint(200))
fw.close()
112
src/py2.x/ml/15.BigData_MapReduce/pegasos.py
Normal file
@@ -0,0 +1,112 @@
#!/usr/bin/python
# coding:utf8
'''
Created on 2017-04-07
Sequential Pegasos
the input T is k*T in Batch Pegasos
Author: Peter/ApacheCN-xy
'''
from __future__ import print_function
from numpy import *


def loadDataSet(fileName):
    dataMat = []
    labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr = line.strip().split('\t')
        # dataMat.append([float(lineArr[0]), float(lineArr[1]), float(lineArr[2])])
        dataMat.append([float(lineArr[0]), float(lineArr[1])])
        labelMat.append(float(lineArr[2]))
    return dataMat, labelMat


def seqPegasos(dataSet, labels, lam, T):
    m, n = shape(dataSet)
    w = zeros(n)
    for t in range(1, T+1):
        i = random.randint(m)
        eta = 1.0/(lam*t)
        p = predict(w, dataSet[i, :])
        if labels[i]*p < 1:
            w = (1.0 - 1/t)*w + eta*labels[i]*dataSet[i, :]
        else:
            w = (1.0 - 1/t)*w
        print(w)
    return w


def predict(w, x):
    return w*x.T  # the predicted value of y


def batchPegasos(dataSet, labels, lam, T, k):
    """batchPegasos()

    Args:
        dataSet  feature set
        labels   class labels
        lam      regularisation constant
        T        number of iterations
        k        batch size
    Returns:
        w        weight vector
    """
    m, n = shape(dataSet)
    w = zeros(n)  # weight vector
    dataIndex = range(m)
    for t in range(1, T+1):
        wDelta = mat(zeros(n))  # reset wDelta

        # eta is the learning rate: how far each update moves the weights
        # (the step size of the stochastic gradient; shrinking it over time helps the fit).
        # The inputs T and k set the number of iterations and the batch size;
        # eta is recomputed in every one of the T iterations
        eta = 1.0/(lam*t)
        random.shuffle(dataIndex)
        for j in range(k):  # inner batch loop: accumulate the misclassified points, then update the weights once
            i = dataIndex[j]
            p = predict(w, dataSet[i, :])  # mapper code

            # A prediction with the right sign and |margin| >= 1 is fine,
            # since the target margin is 1; anything else counts as an error
            # and contributes to the accumulated update of w.
            if labels[i]*p < 1:  # mapper code
                wDelta += labels[i]*dataSet[i, :].A  # accumulate the changes
        # w is optimised by repeated stochastic-gradient steps
        w = (1.0 - 1/t)*w + (eta/k)*wDelta  # apply the update at every t
        # print '-----', w
    # print '++++++', w
    return w


datArr, labelList = loadDataSet('data/15.BigData_MapReduce/testSet.txt')
datMat = mat(datArr)
# finalWs = seqPegasos(datMat, labelList, 2, 5000)
finalWs = batchPegasos(datMat, labelList, 2, 50, 100)
print(finalWs)

import matplotlib
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
x1 = []
y1 = []
xm1 = []
ym1 = []
for i in range(len(labelList)):
    if labelList[i] == 1.0:
        x1.append(datMat[i, 0])
        y1.append(datMat[i, 1])
    else:
        xm1.append(datMat[i, 0])
        ym1.append(datMat[i, 1])
ax.scatter(x1, y1, marker='s', s=90)
ax.scatter(xm1, ym1, marker='o', s=50, c='red')
x = arange(-6.0, 8.0, 0.1)
y = (-finalWs[0, 0]*x - 0)/finalWs[0, 1]
# y2 = (0.43799*x)/0.12316
y2 = (0.498442*x)/0.092387  # 2 iterations
ax.plot(x, y)
ax.plot(x, y2, 'g-.')
ax.axis([-6, 8, -4, 5])
ax.legend(('50 Iterations', '2 Iterations'))
plt.show()
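One batch update of Pegasos is w <- (1 - 1/t)*w + (eta/k) * sum(y_i * x_i) over the margin violators, with eta = 1/(lam*t). A minimal NumPy sketch (hand-picked toy points and t=1, not the testSet.txt data used above) performing one such step:

import numpy as np

lam, t, k = 2.0, 1.0, 2
w = np.zeros(2)
X = np.array([[1.0, 2.0], [-2.0, -1.0]])   # toy batch of k=2 points
y = np.array([1.0, -1.0])

eta = 1.0 / (lam * t)
# accumulate y_i * x_i over the points violating the margin (y_i * <w, x_i> < 1)
wDelta = sum(y[i] * X[i] for i in range(k) if y[i] * w.dot(X[i]) < 1.0)
w = (1.0 - 1.0/t) * w + (eta / k) * wDelta
print(w)  # both points violate at w=0, so w = (0.5/2) * ((1,2) + (2,1)) = [0.75, 0.75]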
60
src/py2.x/ml/15.BigData_MapReduce/proximalSVM.py
Normal file
@@ -0,0 +1,60 @@
#!/usr/bin/python
# coding:utf8
'''
Created on 2011-02-25
Update on 2017-06-20
Author: Peter/ApacheCN-xy/片刻
GitHub: https://github.com/apachecn/AiLearning
'''
from __future__ import print_function
import base64
import pickle

import numpy


def map(key, value):
    # input key = class for one training example, e.g. "-1.0"
    classes = [float(item) for item in key.split(",")]  # e.g. [-1.0]
    D = numpy.diag(classes)

    # input value = feature vector for one training example, e.g. "3.0, 7.0, 2.0"
    featurematrix = [float(item) for item in value.split(",")]
    A = numpy.matrix(featurematrix)

    # create matrix E and vector e
    e = numpy.matrix(numpy.ones(len(A)).reshape(len(A), 1))
    E = numpy.matrix(numpy.append(A, -e, axis=1))

    # create a tuple with the values to be used by reducer
    # and encode it with base64 to avoid potential trouble with '\t' and '\n' used
    # as default separators in Hadoop Streaming
    producedvalue = base64.b64encode(pickle.dumps((E.T*E, E.T*D*e)))

    # note: a single constant key "producedkey" sends to only one reducer
    # somewhat "atypical" due to low degree of parallelism on reducer side
    print("producedkey\t%s" % (producedvalue))


def reduce(key, values, mu=0.1):
    sumETE = None
    sumETDe = None

    # key isn't used, so ignoring it with _ (underscore).
    for _, value in values:
        # unpickle values
        ETE, ETDe = pickle.loads(base64.b64decode(value))
        if sumETE is None:
            # create the I/mu with correct dimensions
            sumETE = numpy.matrix(numpy.eye(ETE.shape[1])/mu)
        sumETE += ETE

        if sumETDe is None:
            # create sumETDe with correct dimensions
            sumETDe = ETDe
        else:
            sumETDe += ETDe

    # note: omega = result[:-1] and gamma = result[-1]
    # but printing entire vector as output
    result = sumETE.I*sumETDe
    print("%s\t%s" % (key, str(result.tolist())))
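The reducer above solves the proximal SVM normal equations (I/mu + E^T E) [omega; gamma] = E^T D e, where E = [A, -e]. A minimal NumPy sketch solving that system directly; the toy points, labels and mu below are arbitrary choices of mine:

import numpy as np

mu = 0.1
A = np.array([[3.0, 7.0],
              [1.0, 1.0],
              [-2.0, -3.0]])          # one row per training example
d = np.array([1.0, 1.0, -1.0])        # class labels
D = np.diag(d)
e = np.ones((len(A), 1))
E = np.hstack([A, -e])                # E = [A, -e]

lhs = np.eye(E.shape[1]) / mu + E.T.dot(E)
rhs = E.T.dot(D).dot(e)
result = np.linalg.solve(lhs, rhs)
omega, gamma = result[:-1].ravel(), result[-1, 0]
print(omega, gamma)                   # separating plane: omega . x = gamma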
25
src/py2.x/ml/15.BigData_MapReduce/py27dbg.py
Normal file
@@ -0,0 +1,25 @@
'''
Created on Feb 27, 2011
MapReduce version of Pegasos SVM
Using mrjob to automate job flow
Author: Peter
'''
from mrjob.job import MRJob

import pickle
from numpy import *


class MRsvm(MRJob):

    def map(self, mapperId, inVals):  # needs exactly 2 arguments
        if False: yield
        yield (1, 22)

    def reduce(self, _, packedVals):
        yield "debug"

    def steps(self):
        return ([self.mr(mapper=self.map, reducer=self.reduce)])


if __name__ == '__main__':
    MRsvm.run()
32
src/py2.x/ml/15.BigData_MapReduce/wc.py
Normal file
@@ -0,0 +1,32 @@
#!/usr/bin/python
# coding:utf8
from mrjob.job import MRJob


class MRWordCountUtility(MRJob):

    def __init__(self, *args, **kwargs):
        super(MRWordCountUtility, self).__init__(*args, **kwargs)
        self.chars = 0
        self.words = 0
        self.lines = 0

    def mapper(self, _, line):
        if False:
            yield  # I'm a generator!

        self.chars += len(line) + 1  # +1 for newline
        self.words += sum(1 for word in line.split() if word.strip())
        self.lines += 1

    def mapper_final(self):
        yield('chars', self.chars)
        yield('words', self.words)
        yield('lines', self.lines)

    def reducer(self, key, values):
        yield(key, sum(values))


if __name__ == '__main__':
    MRWordCountUtility.run()
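The same counts can be sanity-checked locally without mrjob. A small sketch with a hypothetical in-memory input of my own, not a Hadoop job:

text = ["hello world", "one two three"]
chars = sum(len(line) + 1 for line in text)   # +1 per newline, as in the mapper
words = sum(len(line.split()) for line in text)
lines = len(text)
print(chars, words, lines)  # 26 5 2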
298
src/py2.x/ml/2.KNN/kNN.py
Normal file
@@ -0,0 +1,298 @@
#!/usr/bin/env python
# coding: utf-8
'''
Created on Sep 16, 2010
Update on 2017-05-18
Author: Peter Harrington/羊三/小瑶
GitHub: https://github.com/apachecn/AiLearning
'''
from __future__ import print_function
from numpy import *
# import the scientific computing package numpy and the operator module
import operator
from os import listdir
from collections import Counter


def createDataSet():
    """
    Create the dataset and its labels.

    Usage:
    import kNN
    group, labels = kNN.createDataSet()
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels


def classify0(inX, dataSet, labels, k):
    """
    inX:     input vector to classify, e.g. [1, 2, 3]
    dataSet: training sample set, e.g. [[1, 2, 3], [1, 2, 0]]
    labels:  label vector
    k:       number of nearest neighbours to use
    Note: labels must have as many entries as dataSet has rows;
    the program uses the Euclidean distance.

    To classify a point, call e.g.
    kNN.classify0([0, 0], group, labels, 3)
    """

    # -----------First implementation of classify0()--------------------------------------------
    # 1. Distance computation
    dataSetSize = dataSet.shape[0]
    # tile builds a matrix the same shape as the training set, then the training set is subtracted
    """
    tile: the first tuple entry (3) is the number of row copies,
    the second (1 or 2) how many times inx is repeated within each row

    In [8]: tile(inx, (3, 1))
    Out[8]:
    array([[1, 2, 3],
           [1, 2, 3],
           [1, 2, 3]])

    In [9]: tile(inx, (3, 2))
    Out[9]:
    array([[1, 2, 3, 1, 2, 3],
           [1, 2, 3, 1, 2, 3],
           [1, 2, 3, 1, 2, 3]])
    """
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    """
    Euclidean distance between points:
    row 1: distance from the query point to the 1st point of dataSet
    row 2: distance from the query point to the 2nd point of dataSet
    ...
    row N: distance from the query point to the Nth point of dataSet

    [[1,2,3],[1,2,3]] - [[1,2,3],[1,2,0]]
    (A1-A2)^2 + (B1-B2)^2 + (C1-C2)^2
    """
    # square
    sqDiffMat = diffMat ** 2
    # sum every row of the matrix
    sqDistances = sqDiffMat.sum(axis=1)
    # square root
    distances = sqDistances ** 0.5
    # Sort the distances ascending and return the corresponding index positions.
    # argsort() sorts the elements of x ascending and outputs their indices,
    # e.g. for y = x.argsort(): if x[3] = -1 is smallest then y[0] = 3;
    # if x[5] = 9 is largest then y[5] = 5.
    # print 'distances=', distances
    sortedDistIndicies = distances.argsort()
    # print 'distances.argsort()=', sortedDistIndicies

    # 2. Pick the k points with the smallest distances
    classCount = {}
    for i in range(k):
        # the class of this sample
        voteIlabel = labels[sortedDistIndicies[i]]
        # increment that class's count in the dict.
        # dict.get(k, d) returns dict[k] when k is present and d otherwise, e.g.
        # l = {5: 2, 3: 4}; l.get(3, 0) returns 4 and l.get(1, 0) returns 0
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # 3. Sort and return the most frequent class.
    # dict.items() returns the (key, value) pairs as a list, e.g.
    #   {'Name': 'Zara', 'Age': 7}.items() -> [('Age', 7), ('Name', 'Zara')]
    # sorted's key=operator.itemgetter(1) sorts by the second tuple element
    # (itemgetter(0) would sort by the first; itemgetter(1, 0) sorts by the
    # second and then the first, giving a multi-level sort).
    # sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    # return sortedClassCount[0][0]
    # 3. Or simply use max to return the key with the largest value
    maxClassCount = max(classCount, key=classCount.get)
    return maxClassCount


# ------------------------------------------------------------------------------------------------------------------------------------------
# Second implementation of classify0()

# """
# 1. Compute the distances
#
# Euclidean distance between points, exactly as above.
#
# inx - dataset uses numpy broadcasting,
# see https://docs.scipy.org/doc/numpy-1.13.0/user/basics.broadcasting.html
# np.sum() is documented at
# https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.sum.html
# """
# dist = np.sum((inx - dataset)**2, axis=1)**0.5

# """
# 2. The labels of the k nearest neighbours
#
# The distances are sorted with numpy's argsort, see
# https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.sort.html#numpy.sort
# argsort returns indices, so the k nearest are taken with [0 : k];
# their labels are collected in the list k_labels.
# """
# k_labels = [labels[index] for index in dist.argsort()[0 : k]]
# """
# 3. The most frequent label is the predicted class
#
# collections.Counter counts the label occurrences; most_common returns the most
# frequent label as a tuple, e.g. [('label1', 2)], so [0][0] extracts the label.
# """
# label = Counter(k_labels).most_common(1)[0][0]
# return label

# ------------------------------------------------------------------------------------------------------------------------------------------


def test1():
    """
    Demo of the first example.
    """
    group, labels = createDataSet()
    print(str(group))
    print(str(labels))
    print(classify0([0.1, 0.1], group, labels, 3))


# ----------------------------------------------------------------------------------------
def file2matrix(filename):
    """
    Import the training data.
    :param filename: path to the data file
    :return: the data matrix returnMat and the class vector classLabelVector
    """
    fr = open(filename)
    # number of data lines in the file
    numberOfLines = len(fr.readlines())
    # build the corresponding empty matrix,
    # e.g. zeros((2, 3)) creates a 2*3 matrix filled with 0
    returnMat = zeros((numberOfLines, 3))  # prepare matrix to return
    classLabelVector = []  # prepare labels return
    fr = open(filename)
    index = 0
    for line in fr.readlines():
        # str.strip([chars]) removes the given leading/trailing characters
        line = line.strip()
        # split on '\t'
        listFromLine = line.split('\t')
        # the feature columns
        returnMat[index, :] = listFromLine[0:3]
        # the class column, i.e. the label
        classLabelVector.append(int(listFromLine[-1]))
        index += 1
    # return the data matrix and the class vector
    return returnMat, classLabelVector


def autoNorm(dataSet):
    """
    Normalise the features, removing the effect of differing magnitudes.
    :param dataSet: the dataset
    :return: the normalised dataset normDataSet, plus ranges and minVals
             (the range and the minimum; not used by the callers)

    Normalisation formula:
        Y = (X - Xmin) / (Xmax - Xmin)
    min and max are the smallest and largest value of each feature; this maps
    every numeric feature into the interval [0, 1].
    """
    # per-feature minimum, maximum and range
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    # range
    ranges = maxVals - minVals
    # -------First implementation---start-------------------------
    normDataSet = zeros(shape(dataSet))
    m = dataSet.shape[0]
    # matrix of differences from the minimum
    normDataSet = dataSet - tile(minVals, (m, 1))
    # divide the differences by the ranges
    normDataSet = normDataSet / tile(ranges, (m, 1))  # element wise divide
    # -------First implementation---end---------------------------------------------

    # # -------Second implementation---start---------------------------------------
    # norm_dataset = (dataset - minvalue) / ranges
    # # -------Second implementation---end---------------------------------------------
    return normDataSet, ranges, minVals


def datingClassTest():
    """
    Test on the dating-site data.
    :return: error count
    """
    # fraction of the data used for testing (training fraction = 1 - hoRatio)
    hoRatio = 0.1  # hold out part for testing, keep the rest as samples
    # load the data from the file
    datingDataMat, datingLabels = file2matrix('data/2.KNN/datingTestSet2.txt')  # load data set from file
    # normalise the data
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # m is the number of rows, i.e. the first matrix dimension
    m = normMat.shape[0]
    # number of test samples; rows numTestVecs:m are the training samples
    numTestVecs = int(m * hoRatio)
    print('numTestVecs=', numTestVecs)
    errorCount = 0.0
    for i in range(numTestVecs):
        # classify the test sample
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :], datingLabels[numTestVecs:m], 3)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i]))
        if (classifierResult != datingLabels[i]): errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
    print(errorCount)


def img2vector(filename):
    """
    Convert image data into a vector.
    :param filename: image file; the input images are 32 * 32
    :return: a 1-D matrix
    The function creates a 1 * 1024 NumPy array, opens the given file,
    loops over the first 32 lines, stores the first 32 character values of
    each line in the array, and finally returns the array.
    """
    returnVect = zeros((1, 1024))
    fr = open(filename)
    for i in range(32):
        lineStr = fr.readline()
        for j in range(32):
            returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect


def handwritingClassTest():
    # 1. Load the training data
    hwLabels = []
    trainingFileList = listdir('data/2.KNN/trainingDigits')  # load the training set
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    # hwLabels stores the 0~9 label of each index; trainingMat stores the image vector at each index
    for i in range(m):
        fileNameStr = trainingFileList[i]
        fileStr = fileNameStr.split('.')[0]  # take off .txt
        classNumStr = int(fileStr.split('_')[0])
        hwLabels.append(classNumStr)
        # reshape the 32*32 matrix into a 1*1024 matrix
        trainingMat[i, :] = img2vector('data/2.KNN/trainingDigits/%s' % fileNameStr)

    # 2. Load the test data
    testFileList = listdir('data/2.KNN/testDigits')  # iterate through the test set
    errorCount = 0.0
    mTest = len(testFileList)
    for i in range(mTest):
        fileNameStr = testFileList[i]
        fileStr = fileNameStr.split('.')[0]  # take off .txt
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector('data/2.KNN/testDigits/%s' % fileNameStr)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, classNumStr))
        if (classifierResult != classNumStr): errorCount += 1.0
    print("\nthe total number of errors is: %d" % errorCount)
    print("\nthe total error rate is: %f" % (errorCount / float(mTest)))


if __name__ == '__main__':
    # test1()
    # datingClassTest()
    handwritingClassTest()
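The commented-out "second way" in classify0() can be exercised as a standalone function. A minimal sketch, assuming NumPy and collections.Counter as imported in the file; the function name knn_classify is mine, not the module's:

import numpy as np
from collections import Counter

def knn_classify(inx, dataset, labels, k):
    # broadcasting computes all Euclidean distances at once
    dist = np.sum((inx - dataset)**2, axis=1)**0.5
    # labels of the k nearest neighbours
    k_labels = [labels[index] for index in dist.argsort()[0:k]]
    # the most frequent label wins
    return Counter(k_labels).most_common(1)[0][0]

group = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
labels = ['A', 'A', 'B', 'B']
print(knn_classify(np.array([0.1, 0.1]), group, labels, 3))  # 'B'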
70
src/py2.x/ml/2.KNN/sklearn-knn-demo.py
Normal file
@@ -0,0 +1,70 @@
#!/usr/bin/python
# coding:utf8

"""
Created on 2017-06-28
Updated on 2017-06-28
KNN: the k-nearest-neighbour algorithm
Author: 小瑶
GitHub: https://github.com/apachecn/AiLearning
"""
from __future__ import print_function
print(__doc__)


import numpy as np
import matplotlib.pyplot as plt
from numpy import *
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets

n_neighbors = 3

# Import some data to play with
# iris = datasets.load_iris()
# X = iris.data[:, :2]  # take only the first two features; a 2-D dataset would avoid this ugly slice
# y = iris.target

# print 'X=', type(X), X
# print 'y=', type(y), y

X = array([[-1.0, -1.1], [-1.0, -1.0], [0, 0], [1.0, 1.1], [2.0, 2.0], [2.0, 2.1]])
y = array([0, 0, 0, 1, 1, 1])

# print 'X=', type(X), X
# print 'y=', type(y), y

h = .02  # step size of the mesh

# Create the colour maps
# cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
# cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00'])

for weights in ['uniform', 'distance']:
    # Create a kNN classifier instance and fit the data
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
    clf.fit(X, y)

    # Plot the decision boundary: assign a colour to each point
    # of the mesh [x_min, x_max]x[y_min, y_max]
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a colour plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

    # Plot the training points
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("2-Class classification (k = %i, weights = '%s')"
              % (n_neighbors, weights))

plt.show()
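After fitting, the classifier can score new points directly. A small self-contained follow-on sketch (the query point coordinates are arbitrary choices of mine):

import numpy as np
from sklearn import neighbors

X = np.array([[-1.0, -1.1], [-1.0, -1.0], [0, 0], [1.0, 1.1], [2.0, 2.0], [2.0, 2.1]])
y = np.array([0, 0, 0, 1, 1, 1])
clf = neighbors.KNeighborsClassifier(3, weights='uniform')
clf.fit(X, y)
print(clf.predict([[1.5, 1.5]]))        # [1]: all three nearest neighbours are class 1
print(clf.predict_proba([[1.5, 1.5]]))  # per-class vote fractions, e.g. [[0. 1.]]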
118
src/py2.x/ml/3.DecisionTree/DTSklearn.py
Normal file
@@ -0,0 +1,118 @@
#!/usr/bin/python
# coding: utf8
# Original post: http://blog.csdn.net/lsldd/article/details/41223147
# GitHub: https://github.com/apachecn/AiLearning
from __future__ import print_function
import numpy as np
from sklearn import tree
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split


def createDataSet():
    ''' Read the data. '''
    data = []
    labels = []
    with open("data/3.DecisionTree/data.txt") as ifile:
        for line in ifile:
            # features: height, weight; label: fat/thin
            tokens = line.strip().split(' ')
            data.append([float(tk) for tk in tokens[:-1]])
            labels.append(tokens[-1])
    # feature data
    x = np.array(data)
    # class label data
    labels = np.array(labels)
    # label data for the predictions
    y = np.zeros(labels.shape)

    ''' Convert the labels to 0/1. '''
    y[labels == 'fat'] = 1
    print(data, '-------', x, '-------', labels, '-------', y)
    return x, y


def predict_train(x_train, y_train):
    '''
    Train the decision tree using information entropy as the split criterion.
    Reference: http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier
    '''
    clf = tree.DecisionTreeClassifier(criterion='entropy')
    # print(clf)
    clf.fit(x_train, y_train)
    ''' The coefficients reflect each feature's influence; larger values mean a bigger role in the classification. '''
    print('feature_importances_: %s' % clf.feature_importances_)

    '''Print the test results.'''
    y_pre = clf.predict(x_train)
    # print(x_train)
    print(y_pre)
    print(y_train)
    print(np.mean(y_pre == y_train))
    return y_pre, clf


def show_precision_recall(x, y, clf, y_train, y_pre):
    '''
    Precision and recall.
    Reference: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_curve.html#sklearn.metrics.precision_recall_curve
    '''
    precision, recall, thresholds = precision_recall_curve(y_train, y_pre)
    # predictions over the full dataset
    answer = clf.predict_proba(x)[:, 1]

    '''
    Show precision and recall:
    precision
    recall
    f1-score   a combined score of precision and recall
    support    the number of samples compared
    Reference: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html#sklearn.metrics.classification_report
    '''
    # target_names follows the label classes of y
    target_names = ['thin', 'fat']
    print(classification_report(y, answer, target_names=target_names))
    print(answer)
    print(y)


def show_pdf(clf):
    '''
    Visual output:
    write the decision tree structure to a file, see http://sklearn.lzjqsdd.com/modules/tree.html

    Error on Mac: pydotplus.graphviz.InvocationException: GraphViz's executables not found
    Fix: sudo brew install graphviz
    Writing reference: http://www.jianshu.com/p/59b510bafb4d
    '''
    # with open("testResult/tree.dot", 'w') as f:
    #     from sklearn.externals.six import StringIO
    #     tree.export_graphviz(clf, out_file=f)

    import pydotplus
    from sklearn.externals.six import StringIO
    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("output/3.DecisionTree/tree.pdf")

    # from IPython.display import Image
    # Image(graph.create_png())


if __name__ == '__main__':
    x, y = createDataSet()

    ''' Split into training and test data: 80% training, 20% test. '''
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    print('split data:', x_train, x_test, y_train, y_test)

    # get the predictions on the training split
    y_pre, clf = predict_train(x_train, y_train)

    # show precision and recall
    show_precision_recall(x, y, clf, y_train, y_pre)

    # visual output
    show_pdf(clf)
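DTSklearn above splits on information entropy, and DecisionTree.py below implements that entropy by hand in calcShannonEnt(). For the five-sample fish dataset used there (2 'yes', 3 'no'), H = -(2/5)*log2(2/5) - (3/5)*log2(3/5) = 0.971. A small standalone check in plain Python, importing nothing from either file:

from math import log
from collections import Counter

dataSet = [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
counts = Counter(row[-1] for row in dataSet)           # {'yes': 2, 'no': 3}
n = float(len(dataSet))
entropy = -sum((c/n) * log(c/n, 2) for c in counts.values())
print(entropy)  # 0.9709505944546686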
390
src/py2.x/ml/3.DecisionTree/DecisionTree.py
Executable file
@@ -0,0 +1,390 @@
#!/usr/bin/python
|
||||
# coding:utf-8
|
||||
'''
|
||||
Created on Oct 12, 2010
|
||||
Update on 2017-05-18
|
||||
Decision Tree Source Code for Machine Learning in Action Ch. 3
|
||||
Author: Peter Harrington/片刻
|
||||
GitHub: https://github.com/apachecn/AiLearning
|
||||
'''
|
||||
from __future__ import print_function
|
||||
print(__doc__)
|
||||
import operator
|
||||
from math import log
|
||||
import decisionTreePlot as dtPlot
|
||||
from collections import Counter
|
||||
|
||||
|
||||
def createDataSet():
|
||||
"""DateSet 基础数据集
|
||||
|
||||
Args:
|
||||
无需传入参数
|
||||
Returns:
|
||||
返回数据集和对应的label标签
|
||||
"""
|
||||
dataSet = [[1, 1, 'yes'],
|
||||
[1, 1, 'yes'],
|
||||
[1, 0, 'no'],
|
||||
[0, 1, 'no'],
|
||||
[0, 1, 'no']]
|
||||
# dataSet = [['yes'],
|
||||
# ['yes'],
|
||||
# ['no'],
|
||||
# ['no'],
|
||||
# ['no']]
|
||||
# labels 露出水面 脚蹼
|
||||
labels = ['no surfacing', 'flippers']
|
||||
# change to discrete values
|
||||
return dataSet, labels
|
||||
|
||||
|
||||
def calcShannonEnt(dataSet):
|
||||
"""calcShannonEnt(calculate Shannon entropy 计算给定数据集的香农熵)
|
||||
|
||||
Args:
|
||||
dataSet 数据集
|
||||
Returns:
|
||||
返回 每一组feature下的某个分类下,香农熵的信息期望
|
||||
"""
|
||||
# -----------计算香农熵的第一种实现方式start--------------------------------------------------------------------------------
|
||||
# 求list的长度,表示计算参与训练的数据量
|
||||
numEntries = len(dataSet)
|
||||
# 下面输出我们测试的数据集的一些信息
|
||||
# 例如:<type 'list'> numEntries: 5 是下面的代码的输出
|
||||
# print type(dataSet), 'numEntries: ', numEntries
|
||||
|
||||
# 计算分类标签label出现的次数
|
||||
labelCounts = {}
|
||||
# the the number of unique elements and their occurance
|
||||
for featVec in dataSet:
|
||||
# 将当前实例的标签存储,即每一行数据的最后一个数据代表的是标签
|
||||
currentLabel = featVec[-1]
|
||||
# 为所有可能的分类创建字典,如果当前的键值不存在,则扩展字典并将当前键值加入字典。每个键值都记录了当前类别出现的次数。
|
||||
if currentLabel not in labelCounts.keys():
|
||||
labelCounts[currentLabel] = 0
|
||||
labelCounts[currentLabel] += 1
|
||||
# print '-----', featVec, labelCounts
|
||||
|
||||
# 对于label标签的占比,求出label标签的香农熵
|
||||
shannonEnt = 0.0
|
||||
for key in labelCounts:
|
||||
# 使用所有类标签的发生频率计算类别出现的概率。
|
||||
prob = float(labelCounts[key])/numEntries
|
||||
# log base 2
|
||||
# 计算香农熵,以 2 为底求对数
|
||||
shannonEnt -= prob * log(prob, 2)
|
||||
# print '---', prob, prob * log(prob, 2), shannonEnt
|
||||
# -----------计算香农熵的第一种实现方式end--------------------------------------------------------------------------------
|
||||
|
||||
# # -----------计算香农熵的第二种实现方式start--------------------------------------------------------------------------------
|
||||
# # 统计标签出现的次数
|
||||
# label_count = Counter(data[-1] for data in dataSet)
|
||||
# # 计算概率
|
||||
# probs = [p[1] / len(dataSet) for p in label_count.items()]
|
||||
# # 计算香农熵
|
||||
# shannonEnt = sum([-p * log(p, 2) for p in probs])
|
||||
# # -----------计算香农熵的第二种实现方式end--------------------------------------------------------------------------------
|
||||
return shannonEnt
|
||||
|
||||
|
||||
def splitDataSet(dataSet, index, value):
|
||||
"""splitDataSet(通过遍历dataSet数据集,求出index对应的colnum列的值为value的行)
|
||||
就是依据index列进行分类,如果index列的数据等于 value的时候,就要将 index 划分到我们创建的新的数据集中
|
||||
Args:
|
||||
dataSet 数据集 待划分的数据集
|
||||
index 表示每一行的index列 划分数据集的特征
|
||||
value 表示index列对应的value值 需要返回的特征的值。
|
||||
Returns:
|
||||
index列为value的数据集【该数据集需要排除index列】
|
||||
"""
|
||||
# -----------切分数据集的第一种方式 start------------------------------------
|
||||
retDataSet = []
|
||||
for featVec in dataSet:
|
||||
# index列为value的数据集【该数据集需要排除index列】
|
||||
# 判断index列的值是否为value
|
||||
if featVec[index] == value:
|
||||
# chop out index used for splitting
|
||||
# [:index]表示前index行,即若 index 为2,就是取 featVec 的前 index 行
|
||||
reducedFeatVec = featVec[:index]
|
||||
'''
|
||||
请百度查询一下: extend和append的区别
|
||||
list.append(object) 向列表中添加一个对象object
|
||||
list.extend(sequence) 把一个序列seq的内容添加到列表中
|
||||
1、使用append的时候,是将new_media看作一个对象,整体打包添加到music_media对象中。
|
||||
2、使用extend的时候,是将new_media看作一个序列,将这个序列和music_media序列合并,并放在其后面。
|
||||
result = []
|
||||
result.extend([1,2,3])
|
||||
print result
|
||||
result.append([4,5,6])
|
||||
print result
|
||||
result.extend([7,8,9])
|
||||
print result
|
||||
结果:
|
||||
[1, 2, 3]
|
||||
[1, 2, 3, [4, 5, 6]]
|
||||
[1, 2, 3, [4, 5, 6], 7, 8, 9]
|
||||
'''
|
||||
reducedFeatVec.extend(featVec[index+1:])
|
||||
# [index+1:]表示从跳过 index 的 index+1行,取接下来的数据
|
||||
# 收集结果值 index列为value的行【该行需要排除index列】
|
||||
retDataSet.append(reducedFeatVec)
|
||||
# -----------切分数据集的第一种方式 end------------------------------------
|
||||
|
||||
# # -----------切分数据集的第二种方式 start------------------------------------
|
||||
# retDataSet = [data for data in dataSet for i, v in enumerate(data) if i == axis and v == value]
|
||||
# # -----------切分数据集的第二种方式 end------------------------------------
|
||||
return retDataSet
|
||||
|
||||
|
||||
def chooseBestFeatureToSplit(dataSet):
|
||||
"""chooseBestFeatureToSplit(选择最好的特征)
|
||||
|
||||
Args:
|
||||
dataSet 数据集
|
||||
Returns:
|
||||
bestFeature 最优的特征列
|
||||
"""
|
||||
|
||||
# -----------选择最优特征的第一种方式 start------------------------------------
|
||||
# 求第一行有多少列的 Feature, 最后一列是label列嘛
|
||||
numFeatures = len(dataSet[0]) - 1
|
||||
# label的信息熵
|
||||
baseEntropy = calcShannonEnt(dataSet)
|
||||
# 最优的信息增益值, 和最优的Featurn编号
|
||||
bestInfoGain, bestFeature = 0.0, -1
|
||||
# iterate over all the features
|
||||
for i in range(numFeatures):
|
||||
# create a list of all the examples of this feature
|
||||
# 获取每一个实例的第i+1个feature,组成list集合
|
||||
featList = [example[i] for example in dataSet]
|
||||
# get a set of unique values
|
||||
# 获取剔重后的集合,使用set对list数据进行去重
|
||||
uniqueVals = set(featList)
|
||||
# 创建一个临时的信息熵
|
||||
newEntropy = 0.0
|
||||
# 遍历某一列的value集合,计算该列的信息熵
|
||||
# 遍历当前特征中的所有唯一属性值,对每个唯一属性值划分一次数据集,计算数据集的新熵值,并对所有唯一特征值得到的熵求和。
|
||||
for value in uniqueVals:
|
||||
subDataSet = splitDataSet(dataSet, i, value)
|
||||
prob = len(subDataSet)/float(len(dataSet))
|
||||
newEntropy += prob * calcShannonEnt(subDataSet)
|
||||
# gain[信息增益]: 划分数据集前后的信息变化, 获取信息熵最大的值
|
||||
# 信息增益是熵的减少或者是数据无序度的减少。最后,比较所有特征中的信息增益,返回最好特征划分的索引值。
|
||||
infoGain = baseEntropy - newEntropy
|
||||
print('infoGain=', infoGain, 'bestFeature=', i, baseEntropy, newEntropy)
|
||||
if (infoGain > bestInfoGain):
|
||||
bestInfoGain = infoGain
|
||||
bestFeature = i
|
||||
return bestFeature
|
||||
# -----------选择最优特征的第一种方式 end------------------------------------
|
||||
|
||||
# # -----------选择最优特征的第二种方式 start------------------------------------
|
||||
# # 计算初始香农熵
|
||||
# base_entropy = calcShannonEnt(dataSet)
|
||||
# best_info_gain = 0
|
||||
# best_feature = -1
|
||||
# # 遍历每一个特征
|
||||
# for i in range(len(dataSet[0]) - 1):
|
||||
# # 对当前特征进行统计
|
||||
# feature_count = Counter([data[i] for data in dataSet])
|
||||
# # 计算分割后的香农熵
|
||||
# new_entropy = sum(feature[1] / float(len(dataSet)) * calcShannonEnt(splitDataSet(dataSet, i, feature[0])) \
|
||||
# for feature in feature_count.items())
|
||||
# # 更新值
|
||||
# info_gain = base_entropy - new_entropy
|
||||
# print('No. {0} feature info gain is {1:.3f}'.format(i, info_gain))
|
||||
# if info_gain > best_info_gain:
|
||||
# best_info_gain = info_gain
|
||||
# best_feature = i
|
||||
# return best_feature
|
||||
# # -----------选择最优特征的第二种方式 end------------------------------------
|
||||
|
||||
|
||||
def majorityCnt(classList):
|
||||
"""majorityCnt(选择出现次数最多的一个结果)
|
||||
|
||||
Args:
|
||||
classList label列的集合
|
||||
Returns:
|
||||
bestFeature 最优的特征列
|
||||
"""
|
||||
# -----------majorityCnt的第一种方式 start------------------------------------
|
||||
classCount = {}
|
||||
for vote in classList:
|
||||
if vote not in classCount.keys():
|
||||
classCount[vote] = 0
|
||||
classCount[vote] += 1
|
||||
# 倒叙排列classCount得到一个字典集合,然后取出第一个就是结果(yes/no),即出现次数最多的结果
|
||||
sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True)
|
||||
# print 'sortedClassCount:', sortedClassCount
|
||||
return sortedClassCount[0][0]
|
||||
# -----------majorityCnt的第一种方式 end------------------------------------
|
||||
|
||||
# # -----------majorityCnt的第二种方式 start------------------------------------
|
||||
# major_label = Counter(classList).most_common(1)[0]
|
||||
# return major_label
|
||||
# # -----------majorityCnt的第二种方式 end------------------------------------
|
||||
|
||||
|
||||
def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]
    # First stopping condition: every label in the last column is identical,
    # i.e. there is only one class, so return that label directly.
    # count() returns how many times a value occurs in the list.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Second stopping condition: all features have been used up and the data
    # set still cannot be split into pure groups, so fall back to the
    # majority label.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)

    # choose the best column to split on and look up its label name
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    # initialise myTree
    myTree = {bestFeatLabel: {}}
    # Note: labels is a mutable list passed by reference, so this del also
    # mutates the caller's list; afterwards the caller's copy is missing an
    # element and e.g. classify() fails with "'no surfacing' is not in list".
    # That is why fishTest() passes copy.deepcopy(labels) below.
    del(labels[bestFeat])
    # take the best column and branch on its values
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        # the remaining labels
        subLabels = labels[:]
        # recurse with createTree() on the subset of the data with this value
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
        # print 'myTree', value, myTree
    return myTree

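# Illustrative sketch (not part of the original file): the nested dict that
# createTree produces for the toy fish data; each key is a feature label,
# each sub-key a feature value, each non-dict value a class label.
# import copy
# myDat, labels = createDataSet()
# createTree(myDat, copy.deepcopy(labels))
# -> {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
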
def classify(inputTree, featLabels, testVec):
    """classify(classify one input vector with the tree)

    Args:
        inputTree   decision tree model
        featLabels  names of the feature labels
        testVec     the input vector to classify
    Returns:
        classLabel  the predicted class; map it through the labels to get a
                    readable name
    """
    # the key of the tree's root node
    firstStr = inputTree.keys()[0]
    # the subtree under the root
    secondDict = inputTree[firstStr]
    # find where the root's label sits in featLabels, which tells us which
    # position of testVec to compare against the tree
    featIndex = featLabels.index(firstStr)
    # follow the branch that matches the test vector's value at that position
    key = testVec[featIndex]
    valueOfFeat = secondDict[key]
    print('+++', firstStr, 'xxx', secondDict, '---', key, '>>>', valueOfFeat)
    # has the branch ended? if valueOfFeat is a dict the tree continues, so
    # recurse; otherwise we reached a leaf and valueOfFeat is the class label
    if isinstance(valueOfFeat, dict):
        classLabel = classify(valueOfFeat, featLabels, testVec)
    else:
        classLabel = valueOfFeat
    return classLabel

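# Illustrative walk-through (not part of the original file), on a toy tree of
# the shape createTree builds above: classify descends one dict level per
# feature test.
# tree = {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
# classify(tree, ['no surfacing', 'flippers'], [1, 0])   # -> 'no'
# classify(tree, ['no surfacing', 'flippers'], [1, 1])   # -> 'yes'
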
def storeTree(inputTree, filename):
    import pickle
    # -------------- first way: start --------------
    fw = open(filename, 'wb')
    pickle.dump(inputTree, fw)
    fw.close()
    # -------------- first way: end --------------

    # -------------- second way: start --------------
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)
    # -------------- second way: end --------------


def grabTree(filename):
    import pickle
    fr = open(filename, 'rb')
    return pickle.load(fr)

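# Illustrative round trip (not part of the original file; the file name is
# hypothetical): persist a tree with storeTree and read it back with grabTree.
# tree = {'no surfacing': {0: 'no', 1: 'yes'}}
# storeTree(tree, 'classifierStorage.txt')
# grabTree('classifierStorage.txt') == tree   # -> True
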
def fishTest():
    # 1. build the data set and its class labels
    myDat, labels = createDataSet()
    # print myDat, labels

    # Shannon entropy of the class labels
    # calcShannonEnt(myDat)

    # # data sets where column 0 equals 1/0, with column 0 removed
    # print '1---', splitDataSet(myDat, 0, 1)
    # print '0---', splitDataSet(myDat, 0, 0)

    # # the column with the best information gain
    # print chooseBestFeatureToSplit(myDat)

    import copy
    myTree = createTree(myDat, copy.deepcopy(labels))
    print(myTree)
    # [1, 1] are the branch values to follow in the tree; print the
    # resulting class
    print(classify(myTree, labels, [1, 1]))

    # height of the tree
    print(get_tree_height(myTree))

    # visualise the tree
    dtPlot.createPlot(myTree)

def ContactLensesTest():
    """
    Desc:
        test code that predicts the contact lens type
    Args:
        none
    Returns:
        none
    """
    # load the contact-lens text data
    fr = open('data/3.DecisionTree/lenses.txt')
    # parse the tab-separated feature data
    lenses = [inst.strip().split('\t') for inst in fr.readlines()]
    # the labels that go with the data
    lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']
    # build the lens-prediction decision tree with the code above
    lensesTree = createTree(lenses, lensesLabels)
    print(lensesTree)
    # visualise the tree
    dtPlot.createPlot(lensesTree)

def get_tree_height(tree):
    """
    Desc:
        recursively compute the height of a decision tree
    Args:
        tree
    Returns:
        the height of the tree
    """
    # a leaf contributes height 1
    if not isinstance(tree, dict):
        return 1

    child_trees = tree.values()[0].values()

    # recurse into each subtree and keep the maximum height
    max_height = 0
    for child_tree in child_trees:
        child_tree_height = get_tree_height(child_tree)
        if child_tree_height > max_height:
            max_height = child_tree_height

    return max_height + 1

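# Illustrative check (not part of the original file): by this definition the
# two-level toy tree has height 3 (root test, inner test, leaf).
# get_tree_height({'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}})  # -> 3
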
if __name__ == "__main__":
    fishTest()
    # ContactLensesTest()
138 src/py2.x/ml/3.DecisionTree/decisionTreePlot.py Normal file
@@ -0,0 +1,138 @@
#!/usr/bin/python
# coding:utf8

'''
Created on Oct 14, 2010
Update on 2018-01-04
Decision Tree Source Code for Machine Learning in Action Ch. 3
Author: Peter Harrington/jiangzhonglian/zh0ng
'''
import matplotlib.pyplot as plt


# text box and arrow formats: 'sawtooth' draws a wavy box, 'round4' a rounded
# box; fc sets the fill shade, from 0.1 (dark) to 0.9 (light)
decisionNode = dict(boxstyle="sawtooth", fc="0.8")
leafNode = dict(boxstyle="round4", fc="0.8")
arrow_args = dict(arrowstyle="<-")

def getNumLeafs(myTree):
    numLeafs = 0
    firstStr = myTree.keys()[0]
    secondDict = myTree[firstStr]
    # walk the branches under the root
    for key in secondDict.keys():
        # recurse into dict children; anything else is a leaf, so count 1
        if type(secondDict[key]) is dict:
            numLeafs += getNumLeafs(secondDict[key])
        else:
            numLeafs += 1
    return numLeafs


def getTreeDepth(myTree):
    maxDepth = 0
    firstStr = myTree.keys()[0]
    secondDict = myTree[firstStr]
    # walk the branches under the root
    for key in secondDict.keys():
        # a dict child adds one level to its branch; a leaf has depth 1
        if type(secondDict[key]) is dict:
            thisDepth = 1 + getTreeDepth(secondDict[key])
        else:
            thisDepth = 1
        # keep the deepest branch
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth

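# Illustrative sketch (not part of the original file): leaf count and depth
# for the first canned tree from retrieveTree below.
# tree = {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
# getNumLeafs(tree)    # -> 3, the leaves 'no', 'no', 'yes'
# getTreeDepth(tree)   # -> 2, two feature tests on the deepest path
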
def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction', xytext=centerPt,
                            textcoords='axes fraction', va="center", ha="center", bbox=nodeType, arrowprops=arrow_args)


def plotMidText(cntrPt, parentPt, txtString):
    xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
    yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString, va="center", ha="center", rotation=30)

def plotTree(myTree, parentPt, nodeTxt):
    # number of leaves under this subtree
    numLeafs = getNumLeafs(myTree)
    # depth of this subtree
    # depth = getTreeDepth(myTree)

    # centre of this subtree's root, which we connect to parentPt; the
    # x coordinate simplifies from (numLeafs-1.)/plotTree.totalW/2 + 1./plotTree.totalW
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW, plotTree.yOff)
    # print cntrPt
    # label the edge to the parent
    plotMidText(cntrPt, parentPt, nodeTxt)

    firstStr = myTree.keys()[0]
    # draw the decision node; on the first call cntrPt equals parentPt
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    # the subtree under the root
    secondDict = myTree[firstStr]
    # y of the next level = current top minus one level height; 1.0 is the
    # full tree height
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD
    for key in secondDict.keys():
        # dict children are decision nodes, so recurse
        if type(secondDict[key]) is dict:
            plotTree(secondDict[key], cntrPt, str(key))
        else:
            # leaves get the next leaf slot along the x axis
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW
            # draw the leaf node
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
            # and label its edge
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD

def createPlot(inTree):
    # create a figure
    fig = plt.figure(1, facecolor='green')
    fig.clf()

    axprops = dict(xticks=[], yticks=[])
    # one row, one column; createPlot.ax1 is the first (only) subplot
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)

    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    # xOff tracks the x coordinate one leaf slot to the left of the next
    # unplotted leaf. The totalW leaves are spread evenly over [0, 1], so the
    # leftmost leaf sits at 0.5/plotTree.totalW, and xOff starts one leaf
    # spacing to its left, i.e. at -0.5/plotTree.totalW.
    plotTree.xOff = -0.5 / plotTree.totalW
    # the root is drawn at y=1.0 and the bottom of the tree at y=0
    plotTree.yOff = 1.0
    # the second argument is the root's coordinate
    plotTree(inTree, (0.5, 1.0), '')
    plt.show()

# # test plot
# def createPlot():
#     fig = plt.figure(1, facecolor='white')
#     fig.clf()
#     # ticks for demo purposes
#     createPlot.ax1 = plt.subplot(111, frameon=False)
#     plotNode('a decision node', (0.5, 0.1), (0.1, 0.5), decisionNode)
#     plotNode('a leaf node', (0.8, 0.1), (0.3, 0.8), leafNode)
#     plt.show()


# canned test trees
def retrieveTree(i):
    listOfTrees = [
        {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}},
        {'no surfacing': {0: 'no', 1: {'flippers': {0: {'head': {0: 'no', 1: 'yes'}}, 1: 'no'}}}}
    ]
    return listOfTrees[i]


# plot one of the test trees
# myTree = retrieveTree(1)
# createPlot(myTree)
59 src/py2.x/ml/3.DecisionTree/skelearn_dts_regressor_demo.py Normal file
@@ -0,0 +1,59 @@
#!/usr/bin/python
# coding:utf8

"""
Created on 2017-06-29
Updated on 2017-06-29
DecisionTree: decision tree
Author: 小瑶
GitHub: https://github.com/apachecn/AiLearning
"""
from __future__ import print_function

print(__doc__)

# required models and libraries
import numpy as np
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt

# build a random data set
# see https://docs.scipy.org/doc/numpy-1.6.0/reference/generated/numpy.random.mtrand.RandomState.html
rng = np.random.RandomState(1)
# print 'lalalalala===', rng
# rand(80, 1) draws uniform random values into an 80-row, 1-column array;
# sort() orders them along axis 0
X = np.sort(5 * rng.rand(80, 1), axis=0)
# print 'X=', X
y = np.sin(X).ravel()
# print 'y=', y
# add noise to every 5th target
y[::5] += 3 * (0.5 - rng.rand(16))
# print 'yyy=', y

# fit the regression model
# regr_1 = DecisionTreeRegressor(max_depth=2)
# keep max_depth=5 and add min_samples_leaf=6, which improves the fit further
# (the original assigned regr_2 twice, which silently dropped max_depth=5)
regr_2 = DecisionTreeRegressor(max_depth=5, min_samples_leaf=6)
# regr_3 = DecisionTreeRegressor(max_depth=4)
# regr_1.fit(X, y)
regr_2.fit(X, y)
# regr_3.fit(X, y)

# predict
X_test = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]
# y_1 = regr_1.predict(X_test)
y_2 = regr_2.predict(X_test)
# y_3 = regr_3.predict(X_test)

# plot the results
plt.figure()
plt.scatter(X, y, c="darkorange", label="data")
# plt.plot(X_test, y_1, color="cornflowerblue", label="max_depth=2", linewidth=2)
plt.plot(X_test, y_2, color="yellowgreen", label="max_depth=5", linewidth=2)
# plt.plot(X_test, y_3, color="red", label="max_depth=3", linewidth=2)
plt.xlabel("data")
plt.ylabel("target")
plt.title("Decision Tree Regression")
plt.legend()
plt.show()
62 src/py2.x/ml/3.DecisionTree/sklearn_dts_classify_demo.py Normal file
@@ -0,0 +1,62 @@
#!/usr/bin/python
# coding:utf-8

"""
Created on 2017-06-29
Updated on 2017-06-29
DecisionTree: decision tree
Author: 小瑶
GitHub: https://github.com/apachecn/AiLearning
"""
from __future__ import print_function
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

# parameters
n_classes = 3
plot_colors = "bry"
plot_step = 0.02

# load the data
iris = load_iris()

for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]):
    # use only the two corresponding features
    X = iris.data[:, pair]
    y = iris.target

    # train
    clf = DecisionTreeClassifier().fit(X, y)

    # plot the decision boundary
    plt.subplot(2, 3, pairidx + 1)

    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))

    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)

    plt.xlabel(iris.feature_names[pair[0]])
    plt.ylabel(iris.feature_names[pair[1]])
    plt.axis("tight")

    # plot the training points
    for i, color in zip(range(n_classes), plot_colors):
        idx = np.where(y == i)
        plt.scatter(X[idx, 0], X[idx, 1], c=color, label=iris.target_names[i],
                    cmap=plt.cm.Paired)

    plt.axis("tight")

plt.suptitle("Decision surface of a decision tree using paired features")
plt.legend()
plt.show()
356 src/py2.x/ml/4.NaiveBayes/bayes.py Executable file
@@ -0,0 +1,356 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
'''
Created on Oct 19, 2010
Update on 2017-05-18
Author: Peter Harrington/羊三/小瑶
GitHub: https://github.com/apachecn/AiLearning
'''
from __future__ import print_function
from numpy import *
"""
p(xy)=p(x|y)p(y)=p(y|x)p(x)
p(x|y)=p(y|x)p(x)/p(y)
"""


# Project case 1: filter abusive posts on a community message board


def loadDataSet():
    """
    create the data set
    :return: word lists postingList and their class labels classVec
    """
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],  # [0,0,1,1,1......]
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 is abusive, 0 not
    return postingList, classVec


def createVocabList(dataSet):
    """
    collect the set of all words
    :param dataSet: the data set
    :return: the vocabulary, i.e. a word list with no duplicates
    """
    vocabSet = set([])  # create empty set
    for document in dataSet:
        # the | operator takes the union of two sets
        vocabSet = vocabSet | set(document)  # union of the two sets
    return list(vocabSet)


def setOfWords2Vec(vocabList, inputSet):
    """
    mark which vocabulary words occur in the input document
    :param vocabList: the vocabulary
    :param inputSet: the input document
    :return: a 0/1 vector [0,1,0,1...] where 1 means the corresponding
             vocabulary word occurs in the input document
    """
    # a vector as long as the vocabulary, initialised to all zeros
    returnVec = [0] * len(vocabList)  # [0,0......]
    # for every word in the document, set the matching vocabulary slot to 1
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec

def _trainNB0(trainMatrix, trainCategory):
    """
    train on the data, original (unsmoothed) version
    :param trainMatrix: document-word matrix [[1,0,1,1,1....],[],[]...]
    :param trainCategory: class label per document [0,1,1,0....], same length
           as the matrix; 1 marks an abusive document, 0 a normal one
    :return:
    """
    # number of documents
    numTrainDocs = len(trainMatrix)
    # number of words
    numWords = len(trainMatrix[0])
    # prior probability of an abusive document: summing trainCategory counts
    # its 1s, i.e. the number of abusive documents; divide by the total count
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # per-word occurrence counts
    p0Num = zeros(numWords)  # [0,0,0,.....]
    p1Num = zeros(numWords)  # [0,0,0,.....]

    # total word counts per class over the whole data set
    p0Denom = 0.0
    p1Denom = 0.0
    for i in range(numTrainDocs):
        # for abusive documents, accumulate the word counts into class 1
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]  # [0,1,1,....] -> [0,1,1,...]
            p1Denom += sum(trainMatrix[i])
        else:
            # otherwise accumulate them into class 0
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # class 1 (abusive): the list [P(F1|C1),P(F2|C1),P(F3|C1),...],
    # i.e. each word's share of the class-1 word count
    p1Vect = p1Num / p1Denom  # [1,2,3,5]/90 -> [1/90,...]
    # class 0 (normal): the list [P(F1|C0),P(F2|C0),P(F3|C0),...],
    # i.e. each word's share of the class-0 word count
    p0Vect = p0Num / p0Denom
    return p0Vect, p1Vect, pAbusive

def trainNB0(trainMatrix, trainCategory):
    """
    train on the data, smoothed version
    :param trainMatrix: document-word matrix
    :param trainCategory: class label per document
    :return:
    """
    # total number of documents
    numTrainDocs = len(trainMatrix)
    # total number of words
    numWords = len(trainMatrix[0])
    # prior probability of an abusive document
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # per-word occurrence counts
    # p0Num: counts for the normal class
    # p1Num: counts for the abusive class
    # initialise every count to 1 so that no word has probability 0, which
    # would zero out the whole product later
    p0Num = ones(numWords)  # [0,0......] -> [1,1,1,1,1.....]
    p1Num = ones(numWords)

    # total word counts per class; the denominators start at 2.0 to match the
    # smoothing above and avoid division by zero (the value can be tuned to
    # the sample / observed data)
    # p0Denom: total for the normal class
    # p1Denom: total for the abusive class
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            # accumulate abusive word counts
            p1Num += trainMatrix[i]
            # and the per-document totals
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # class 1 (abusive): the list [log(P(F1|C1)),log(P(F2|C1)),...]
    p1Vect = log(p1Num / p1Denom)
    # class 0 (normal): the list [log(P(F1|C0)),log(P(F2|C0)),...]
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive

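# Illustrative sketch (not part of the original file): the effect of the
# ones()/2.0 initialisation above. With raw counts a word never seen in a
# class gets P(w|c) = 0 and zeroes out the whole product; with the smoothed
# counts it gets a small but non-zero probability.
def _smoothingDemo():
    counts = ones(3)            # word counts start at 1, not 0
    counts += [4, 0, 1]         # observed counts of three words in one class
    denom = 2.0 + 5             # 2.0 base plus the 5 observed words
    print(log(counts / denom))  # log P(w|c); no -inf entries
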
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """
    How it works:
    # turn the product into a sum by taking logs
    product: P(C|F1F2...Fn) = P(F1F2...Fn|C)P(C)/P(F1F2...Fn)
    sum: P(F1|C)*P(F2|C)....P(Fn|C)*P(C) -> log(P(F1|C))+log(P(F2|C))+....+log(P(Fn|C))+log(P(C))
    :param vec2Classify: the vector to classify, e.g. [0,1,1,1,1...]
    :param p0Vec: class 0 (normal) list [log(P(F1|C0)),log(P(F2|C0)),...]
    :param p1Vec: class 1 (abusive) list [log(P(F1|C1)),log(P(F2|C1)),...]
    :param pClass1: prior probability of class 1 (abusive)
    :return: class 1 or 0
    """
    # evaluate log(P(F1|C))+log(P(F2|C))+....+log(P(Fn|C))+log(P(C))
    # vec2Classify * p1Vec is an element-wise NumPy product: it keeps exactly
    # the log probabilities of the words that occur in the document, so the
    # sum is the log likelihood of the document under each class
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

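# Illustrative numeric check (not part of the original file; the probability
# vectors are made up): comparing log posteriors is equivalent to comparing
# posteriors because log is monotonic.
def _classifyNBDemo():
    vec = array([1, 0, 1])                     # words 0 and 2 occur
    p1Vec = log(array([0.5, 0.2, 0.3]))        # hypothetical P(w|C1)
    p0Vec = log(array([0.1, 0.6, 0.3]))        # hypothetical P(w|C0)
    print(classifyNB(vec, p0Vec, p1Vec, 0.5))  # -> 1, class 1 is more likely
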
def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec

def testingNB():
    """
    test the naive Bayes classifier
    """
    # 1. load the data set
    listOPosts, listClasses = loadDataSet()
    # 2. build the vocabulary
    myVocabList = createVocabList(listOPosts)
    # 3. build the document-word occurrence matrix
    trainMat = []
    for postinDoc in listOPosts:
        # each row is a len(myVocabList) vector of 0/1 occurrence flags
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    # 4. train
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
    # 5. test
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))

# ------------------------------------------------------------------------------------------
# Project case 2: filter spam e-mail with naive Bayes


# tokenise text
def textParse(bigString):
    '''
    Desc:
        take one big string and parse it into a list of tokens
    Args:
        bigString -- the big string
    Returns:
        the list of lower-cased tokens longer than 2 characters
    '''
    import re
    # split the sentence on runs of non-word characters (anything that is not
    # a letter or digit); \W+ avoids the empty-pattern splits that the
    # original \W* produced
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

def spamTest():
    '''
    Desc:
        run the Bayes spam classifier end to end.
    Args:
        none
    Returns:
        none; classifies every mail in the test set, counts the
        misclassified ones and prints the overall error rate.
    '''
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        # parse the spam mails and label them 1
        wordList = textParse(open('data/4.NaiveBayes/email/spam/%d.txt' % i).read())
        docList.append(wordList)
        classList.append(1)
        # parse the ham mails and label them 0
        wordList = textParse(open('data/4.NaiveBayes/email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    # build the vocabulary
    vocabList = createVocabList(docList)
    trainingSet = range(50)
    testSet = []
    # hold out 10 random mails for testing
    for i in range(10):
        # random.uniform(x, y) draws a random real number in the range x - y
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the errorCount is: ', errorCount)
    print('the testSet length is :', len(testSet))
    print('the error rate is :', float(errorCount)/len(testSet))


def testParseTest():
    print(textParse(open('data/4.NaiveBayes/email/ham/1.txt').read()))

# -----------------------------------------------------------------------------------
# Project case 3: infer regional attitudes from personal ads with naive Bayes


# turn a document into a word-count vector
def setOfWords2VecMN(vocabList, inputSet):
    returnVec = [0] * len(vocabList)  # a vector of zeros
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec


# parse a document (duplicate of textParse above)
def textParse(bigString):
    import re
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]


# RSS-feed classifier and high-frequency-word removal
def calcMostFreq(vocabList, fullText):
    import operator
    freqDict = {}
    for token in vocabList:  # for every word in the vocabulary
        freqDict[token] = fullText.count(token)  # count its occurrences in the text
    # sort the dictionary by count, highest first
    sortedFreq = sorted(freqDict.iteritems(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:30]  # return the 30 most frequent words


def localWords(feed1, feed0):
    import feedparser
    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])  # one RSS entry at a time
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    top30Words = calcMostFreq(vocabList, fullText)
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])  # drop the most frequent words
    trainingSet = range(2 * minLen)
    testSet = []
    for i in range(20):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is:', float(errorCount)/len(testSet))
    return vocabList, p0V, p1V


# display the most characteristic words of each region
def getTopWords(ny, sf):
    import operator
    vocabList, p0V, p1V = localWords(ny, sf)
    topNY = []
    topSF = []
    for i in range(len(p0V)):
        if p0V[i] > -6.0:
            topSF.append((vocabList[i], p0V[i]))
        if p1V[i] > -6.0:
            topNY.append((vocabList[i], p1V[i]))
    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
    print("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
    for item in sortedSF:
        print(item[0])
    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
    print("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**")
    for item in sortedNY:
        print(item[0])


if __name__ == "__main__":
    # testingNB()
    spamTest()
    # laTest()
46 src/py2.x/ml/4.NaiveBayes/sklearn-nb-demo.py Normal file
@@ -0,0 +1,46 @@
#!/usr/bin/python
# coding:utf8

"""
Created on 2017-06-28
Updated on 2017-06-28
NaiveBayes: naive Bayes
Author: 小瑶
GitHub: https://github.com/apachecn/AiLearning
"""
from __future__ import print_function


# GaussianNB: Gaussian naive Bayes
import numpy as np
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
Y = np.array([1, 1, 1, 2, 2, 2])
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X, Y)
print(clf.predict([[-0.8, -1]]))
clf_pf = GaussianNB()
clf_pf.partial_fit(X, Y, np.unique(Y))
print(clf_pf.predict([[-0.8, -1]]))

# MultinomialNB: multinomial naive Bayes
'''
import numpy as np
X = np.random.randint(5, size=(6, 100))
y = np.array([1, 2, 3, 4, 5, 6])
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X, y)
print clf.predict(X[2:3])
'''

# BernoulliNB: Bernoulli naive Bayes
'''
import numpy as np
X = np.random.randint(2, size=(6, 100))
Y = np.array([1, 2, 3, 4, 4, 5])
from sklearn.naive_bayes import BernoulliNB
clf = BernoulliNB()
clf.fit(X, Y)
print clf.predict(X[2:3])
'''
298 src/py2.x/ml/5.Logistic/logistic.py Normal file
@@ -0,0 +1,298 @@
#!/usr/bin/python
# coding: utf8
'''
Created on Oct 27, 2010
Update on 2017-05-18
Logistic Regression Working Module
Author: Peter Harrington/羊三/小瑶
GitHub: https://github.com/apachecn/AiLearning
'''
from __future__ import print_function
from numpy import *
import matplotlib.pyplot as plt

# ---------------------------------------------------------------------------
# Logistic regression classification on a simple data set


# parse the data
def loadDataSet(file_name):
    '''
    Desc:
        load and parse the data
    Args:
        file_name -- path of the file to parse
    Returns:
        dataMat -- the raw feature data
        labelMat -- the raw labels, i.e. the class of each sample
    '''
    # dataMat holds the raw data, labelMat its labels
    dataMat = []
    labelMat = []
    fr = open(file_name)
    for line in fr.readlines():
        lineArr = line.strip().split()
        if len(lineArr) == 1:
            continue  # skip empty lines
        # to simplify the arithmetic, prepend X0 = 1.0 to every row, folding
        # the bias into the weights
        dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
        labelMat.append(int(lineArr[2]))
    return dataMat, labelMat


# sigmoid step-like function
def sigmoid(inX):
    # return 1.0 / (1 + exp(-inX))

    # tanh is a rescaled sigmoid; unlike sigmoid it is zero-centred, which
    # often works better in practice
    return 2 * 1.0 / (1 + exp(-2 * inX)) - 1

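# Illustrative check (not part of the original file): the return line above
# is exactly tanh, since tanh(x) = 2/(1+exp(-2x)) - 1 = 2*sigmoid(2x) - 1.
def _sigmoidIdentityDemo():
    x = arange(-3.0, 3.0, 0.5)
    print(allclose(sigmoid(x), tanh(x)))  # -> True
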
# The standard approach.
# First argument dataMatIn: a 2-D NumPy array; each column is a feature, each
# row a training sample.
# Second argument classLabels: the class labels as a 1*100 row vector; for the
# matrix arithmetic it is transposed into a column vector labelMat.
def gradAscent(dataMatIn, classLabels):
    '''
    Desc:
        plain (batch) gradient ascent
    Args:
        dataMatIn -- the input feature list
        classLabels -- the input class labels
    Returns:
        array(weights) -- the fitted regression coefficients
    '''
    # convert to a matrix [[1,1,2],[1,1,2]....]
    dataMatrix = mat(dataMatIn)  # convert to a NumPy matrix
    # convert [[0,1,0,1,0,1.....]] to a matrix and transpose it into a column
    # vector [[0],[1],[0].....]; transpose() swaps rows and columns
    labelMat = mat(classLabels).transpose()
    # m -> number of samples, n -> number of features
    m, n = shape(dataMatrix)
    # print m, n, '__'*10, shape(dataMatrix.transpose()), '__'*100
    # alpha is the step size towards the target
    alpha = 0.001
    # number of iterations
    maxCycles = 500
    # weights are the regression coefficients; ones((n, 1)) creates an n*1
    # matrix of ones, here n=3 -> [[1],[1],[1]]
    weights = ones((n, 1))
    for k in range(maxCycles):  # heavy on matrix operations
        # m*3 matrix times 3*1 column = m*1 column: the model's predicted
        # value for every sample (on what matrix multiplication means, see
        # https://www.zhihu.com/question/21351965/answer/31050145)
        h = sigmoid(dataMatrix * weights)  # matrix multiplication
        # labelMat holds the true values
        error = (labelMat - h)  # vector subtraction
        # 0.001 * (3*m) * (m*1): the per-feature error, i.e. how far to move
        # each coefficient x1, x2, ..., xn
        weights = weights + alpha * dataMatrix.transpose() * error  # the gradient step
    return array(weights)

# Stochastic gradient ascent.
# Batch gradient ascent walks the whole data set on every update, which is
# expensive; stochastic gradient ascent updates the coefficients with one
# sample at a time.
def stocGradAscent0(dataMatrix, classLabels):
    '''
    Desc:
        stochastic gradient ascent; updates the coefficients one sample at a time
    Args:
        dataMatrix -- the feature columns (all but the last column)
        classLabels -- the class labels (the last column)
    Returns:
        weights -- the fitted regression coefficients
    '''
    m, n = shape(dataMatrix)
    alpha = 0.01
    # ones() creates an array of all ones
    weights = ones(n)  # initialise a length-n array, all elements 1
    for i in range(m):
        # sum(dataMatrix[i]*weights) evaluates f(x) = a1*x1 + b2*x2 + .. + nn*xn;
        # here h is a plain number, not a matrix
        h = sigmoid(sum(dataMatrix[i] * weights))
        # print 'dataMatrix[i]===', dataMatrix[i]
        # the gap between the true class and the prediction drives the update
        error = classLabels[i] - h
        # 0.01 * (1*1) * (1*n)
        # print weights, "*" * 10, dataMatrix[i], "*" * 10, error
        weights = weights + alpha * error * dataMatrix[i]
    return weights


# stochastic gradient ascent with random sample selection
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    '''
    Desc:
        improved stochastic gradient ascent; updates with one randomly chosen sample
    Args:
        dataMatrix -- the feature columns (all but the last column)
        classLabels -- the class labels (the last column)
        numIter=150 -- number of iterations
    Returns:
        weights -- the fitted regression coefficients
    '''
    m, n = shape(dataMatrix)
    weights = ones(n)  # one coefficient per column, all initialised to 1
    # run numIter passes and watch for convergence
    for j in range(numIter):
        # [0, 1, 2 .. m-1]
        dataIndex = range(m)
        for i in range(m):
            # alpha shrinks as i and j grow but never reaches 0, thanks to
            # the constant term 0.0001, so later samples still matter
            alpha = 4 / (1.0 + j + i) + 0.0001
            # pick a random index between 0 and len(dataIndex);
            # random.uniform(x, y) draws a random real number in [x, y]
            randIndex = int(random.uniform(0, len(dataIndex)))
            # sum(dataMatrix[i]*weights) evaluates f(x) = a1*x1 + b2*x2 + .. + nn*xn
            h = sigmoid(sum(dataMatrix[dataIndex[randIndex]] * weights))
            error = classLabels[dataIndex[randIndex]] - h
            # print weights, '__h=%s' % h, '__'*20, alpha, '__'*20, error
            weights = weights + alpha * error * dataMatrix[dataIndex[randIndex]]
            del (dataIndex[randIndex])
    return weights

# visualise the result
def plotBestFit(dataArr, labelMat, weights):
    '''
    Desc:
        plot the data and the fitted decision boundary
    Args:
        dataArr: the sample features
        labelMat: the sample class labels, i.e. the target variable
        weights: the regression coefficients
    Returns:
        None
    '''
    n = shape(dataArr)[0]
    xcord1 = []
    ycord1 = []
    xcord2 = []
    ycord2 = []
    for i in range(n):
        if int(labelMat[i]) == 1:
            xcord1.append(dataArr[i, 1])
            ycord1.append(dataArr[i, 2])
        else:
            xcord2.append(dataArr[i, 1])
            ycord2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    x = arange(-3.0, 3.0, 0.1)
    """
    Where y comes from:
    each row was loaded as dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])]),
    so the model is f(x) = w0*x0 + w1*x1 + w2*x2, with x0 fixed at 1 and x2
    being the y axis of this plot. The decision boundary is where f(x) = 0:
        w0 + w1*x + w2*y = 0  =>  y = (-w0 - w1*x) / w2
    """
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.show()

def simpleTest():
    # 1. collect and prepare the data
    dataMat, labelMat = loadDataSet("data/5.Logistic/TestSet.txt")

    # print dataMat, '---\n', labelMat
    # 2. train the model, i.e. the coefficients (a1, b2, .., nn).T of
    #    f(x) = a1*x1 + b2*x2 + .. + nn*xn
    # arrays are not broadcast like matrices here; array multiplication is
    # element-wise
    dataArr = array(dataMat)
    # print dataArr
    # weights = gradAscent(dataArr, labelMat)
    # weights = stocGradAscent0(dataArr, labelMat)
    weights = stocGradAscent1(dataArr, labelMat)
    # print '*'*30, weights

    # visualise the data
    plotBestFit(dataArr, labelMat, weights)


# --------------------------------------------------------------------------------
# Predicting horse fatalities from colic symptoms
# classifier: evaluate the sigmoid from the coefficients and a feature vector
def classifyVector(inX, weights):
    '''
    Desc:
        the final classifier: compute the sigmoid of the feature vector times
        the coefficients; return 1 if it exceeds 0.5, else 0
    Args:
        inX -- the feature vector
        weights -- coefficients from gradient ascent / stochastic gradient ascent
    Returns:
        1 if the computed prob is greater than 0.5,
        otherwise 0
    '''
    prob = sigmoid(sum(inX * weights))
    if prob > 0.5: return 1.0
    else: return 0.0


# open the training and test sets and normalise their format
def colicTest():
    '''
    Desc:
        open the training and test sets and normalise their format
    Args:
        None
    Returns:
        errorRate -- the classification error rate
    '''
    frTrain = open('data/5.Logistic/horseColicTraining.txt')
    frTest = open('data/5.Logistic/horseColicTest.txt')
    trainingSet = []
    trainingLabels = []
    # parse the training features and labels:
    # trainingSet holds the features, trainingLabels the matching classes
    for line in frTrain.readlines():
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        trainingSet.append(lineArr)
        trainingLabels.append(float(currLine[21]))
    # fit the best coefficients on this data with the improved stochastic
    # gradient ascent
    trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 500)
    # trainWeights = stocGradAscent0(array(trainingSet), trainingLabels)
    errorCount = 0
    numTestVec = 0.0
    # run the test set through the classifier; count the misclassified
    # samples and compute the final error rate
    for line in frTest.readlines():
        numTestVec += 1.0
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        if int(classifyVector(array(lineArr), trainWeights)) != int(currLine[21]):
            errorCount += 1
    errorRate = (float(errorCount) / numTestVec)
    print("the error rate of this test is: %f" % errorRate)
    return errorRate


# call colicTest() 10 times and average the results
def multiTest():
    numTests = 10
    errorSum = 0.0
    for k in range(numTests):
        errorSum += colicTest()
    print("after %d iterations the average error rate is: %f" % (numTests, errorSum / float(numTests)))


if __name__ == "__main__":
    simpleTest()
    # multiTest()
282 src/py2.x/ml/5.Logistic/sklearn_logisticRegression_demo.py Normal file
@@ -0,0 +1,282 @@
#!/usr/bin/python
# coding: utf8

'''
Created on Oct 27, 2010
Update on 2017-05-18
Logistic Regression Working Module
Author: 小瑶
GitHub: https://github.com/apachecn/AiLearning
scikit-learn example: http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
'''
# __future__ imports must precede all other statements, so this moved up from
# the middle of the file
from __future__ import print_function

# L1 Penalty and Sparsity in Logistic Regression
'''
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from sklearn.preprocessing import StandardScaler

digits = datasets.load_digits()

X, y = digits.data, digits.target
X = StandardScaler().fit_transform(X)

# classify the digits as small (0-4) vs large (5-9)
y = (y > 4).astype(np.int)


# sweep the regularisation parameter
for i, C in enumerate((100, 1, 0.01)):
    # loosen the tolerance for a shorter training time
    clf_l1_LR = LogisticRegression(C=C, penalty='l1', tol=0.01)
    clf_l2_LR = LogisticRegression(C=C, penalty='l2', tol=0.01)
    clf_l1_LR.fit(X, y)
    clf_l2_LR.fit(X, y)

    coef_l1_LR = clf_l1_LR.coef_.ravel()
    coef_l2_LR = clf_l2_LR.coef_.ravel()

    # coef_l1_LR contains zeros due to the
    # L1 sparsity inducing norm

    sparsity_l1_LR = np.mean(coef_l1_LR == 0) * 100
    sparsity_l2_LR = np.mean(coef_l2_LR == 0) * 100

    print("C=%.2f" % C)
    print("Sparsity with L1 penalty: %.2f%%" % sparsity_l1_LR)
    print("score with L1 penalty: %.4f" % clf_l1_LR.score(X, y))
    print("Sparsity with L2 penalty: %.2f%%" % sparsity_l2_LR)
    print("score with L2 penalty: %.4f" % clf_l2_LR.score(X, y))

    l1_plot = plt.subplot(3, 2, 2 * i + 1)
    l2_plot = plt.subplot(3, 2, 2 * (i + 1))
    if i == 0:
        l1_plot.set_title("L1 penalty")
        l2_plot.set_title("L2 penalty")

    l1_plot.imshow(np.abs(coef_l1_LR.reshape(8, 8)), interpolation='nearest',
                   cmap='binary', vmax=1, vmin=0)
    l2_plot.imshow(np.abs(coef_l2_LR.reshape(8, 8)), interpolation='nearest',
                   cmap='binary', vmax=1, vmin=0)
    plt.text(-8, 3, "C = %.2f" % C)

    l1_plot.set_xticks(())
    l1_plot.set_yticks(())
    l2_plot.set_xticks(())
    l2_plot.set_yticks(())

plt.show()
'''

# Regularization path of L1-penalised logistic regression
'''
print(__doc__)

from datetime import datetime
import numpy as np
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn import datasets
from sklearn.svm import l1_min_c

iris = datasets.load_iris()
X = iris.data
y = iris.target

X = X[y != 2]
y = y[y != 2]

X -= np.mean(X, 0)

cs = l1_min_c(X, y, loss='log') * np.logspace(0, 3)


print("Computing regularization path ...")
start = datetime.now()
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
coefs_ = []
for c in cs:
    clf.set_params(C=c)
    clf.fit(X, y)
    coefs_.append(clf.coef_.ravel().copy())
print("This took ", datetime.now() - start)

coefs_ = np.array(coefs_)
plt.plot(np.log10(cs), coefs_)
ymin, ymax = plt.ylim()
plt.xlabel('log(C)')
plt.ylabel('Coefficients')
plt.title('Logistic Regression Path')
plt.axis('tight')
plt.show()
'''

# Plot multinomial and One-vs-Rest Logistic Regression
'''
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression

# make a 3-class data set for classification
centers = [[-5, 0], [0, 1.5], [5, -1]]
X, y = make_blobs(n_samples=1000, centers=centers, random_state=40)
transformation = [[0.4, 0.2], [-0.4, 1.2]]
X = np.dot(X, transformation)

for multi_class in ('multinomial', 'ovr'):
    clf = LogisticRegression(solver='sag', max_iter=100, random_state=42,
                             multi_class=multi_class).fit(X, y)

    # print the training score
    print("training score : %.3f (%s)" % (clf.score(X, y), multi_class))

    # create a mesh to plot on
    h = .02  # step size of the mesh
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # plot the decision boundary by assigning a colour to every point of the
    # mesh [x_min, x_max]x[y_min, y_max]
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    # put the result into a colour plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
    plt.title("Decision surface of LogisticRegression (%s)" % multi_class)
    plt.axis('tight')

    # also plot the training points
    colors = "bry"
    for i, color in zip(clf.classes_, colors):
        idx = np.where(y == i)
        plt.scatter(X[idx, 0], X[idx, 1], c=color, cmap=plt.cm.Paired)

    # plot the three one-vs-rest classifiers
    xmin, xmax = plt.xlim()
    ymin, ymax = plt.ylim()
    coef = clf.coef_
    intercept = clf.intercept_

    def plot_hyperplane(c, color):
        def line(x0):
            return (-(x0 * coef[c, 0]) - intercept[c]) / coef[c, 1]
        plt.plot([xmin, xmax], [line(xmin), line(xmax)],
                 ls="--", color=color)

    for i, color in zip(clf.classes_, colors):
        plot_hyperplane(i, color)

plt.show()
'''

# Logistic Regression 3-class Classifier

print(__doc__)

import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model, datasets

# import some data to play with
iris = datasets.load_iris()
# we only take the first two features of the samples
X = iris.data[:, :2]
Y = iris.target

h = .02  # step size of the mesh

logreg = linear_model.LogisticRegression(C=1e5)

# create an instance of the classifier and fit the data
logreg.fit(X, Y)

# plot the decision boundary by assigning a colour to every point of the
# mesh [x_min, x_max]x[y_min, y_max]
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = logreg.predict(np.c_[xx.ravel(), yy.ravel()])

# put the result into a colour plot
Z = Z.reshape(xx.shape)
plt.figure(1, figsize=(4, 3))
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

# also plot the training points
plt.scatter(X[:, 0], X[:, 1], c=Y, edgecolors='k', cmap=plt.cm.Paired)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')

plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())
plt.yticks(())

plt.show()

# Logistic function
# similar to the sigmoid step-like function from the logistic regression chapter

'''
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt

from sklearn import linear_model

# the test set: a straight line with some Gaussian noise
xmin, xmax = -5, 5
n_samples = 100
np.random.seed(0)
X = np.random.normal(size=n_samples)
y = (X > 0).astype(np.float)
X[X > 0] *= 4
X += .3 * np.random.normal(size=n_samples)

X = X[:, np.newaxis]
# run the classifier
clf = linear_model.LogisticRegression(C=1e5)
clf.fit(X, y)

# and plot the result
plt.figure(1, figsize=(4, 3))
plt.clf()
plt.scatter(X.ravel(), y, color='black', zorder=20)
X_test = np.linspace(-5, 10, 300)


def model(x):
    return 1 / (1 + np.exp(-x))
loss = model(X_test * clf.coef_ + clf.intercept_).ravel()
plt.plot(X_test, loss, color='red', linewidth=3)

ols = linear_model.LinearRegression()
ols.fit(X, y)
plt.plot(X_test, ols.coef_ * X_test + ols.intercept_, linewidth=1)
plt.axhline(.5, color='.5')

plt.ylabel('y')
plt.xlabel('X')
plt.xticks(range(-5, 10))
plt.yticks([0, 0.5, 1])
plt.ylim(-.25, 1.25)
plt.xlim(-4, 10)
plt.legend(('Logistic Regression Model', 'Linear Regression Model'),
           loc="lower right", fontsize='small')
plt.show()
'''
83 src/py2.x/ml/6.SVM/sklearn-svm-demo.py Normal file
@@ -0,0 +1,83 @@
#!/usr/bin/python
# coding:utf8

"""
Created on 2017-06-28
Updated on 2017-06-28
SVM: maximum-margin separating hyperplane
Author: 片刻
GitHub: https://github.com/apachecn/AiLearning
sklearn SVM translation: http://cwiki.apachecn.org/pages/viewpage.action?pageId=10031359
"""
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm

print(__doc__)


# create 40 separable points
np.random.seed(0)
# X = np.r_[np.random.randn(20, 2) - [2, 2], np.random.randn(20, 2) + [2, 2]]
# Y = [0] * 20 + [1] * 20


def loadDataSet(fileName):
    """
    parse the file line by line to get each line's class label and the full data matrix
    Args:
        fileName   the file name
    Returns:
        dataMat    the data matrix
        labelMat   the class labels
    """
    dataMat = []
    labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr = line.strip().split('\t')
        dataMat.append([float(lineArr[0]), float(lineArr[1])])
        labelMat.append(float(lineArr[2]))
    return dataMat, labelMat


X, Y = loadDataSet('data/6.SVM/testSet.txt')
X = np.mat(X)

print(("X=", X))
print(("Y=", Y))

# fit an SVM model
clf = svm.SVC(kernel='linear')
clf.fit(X, Y)

# get the separating hyperplane
w = clf.coef_[0]
# slope
a = -w[0] / w[1]
# 50 evenly spaced samples (num defaults to 50)
# xx = np.linspace(-5, 5)  # , num=50)
xx = np.linspace(-2, 10)  # , num=50)
# the 2-D line equation
yy = a * xx - (clf.intercept_[0]) / w[1]
print(("yy=", yy))

# plot the parallels to the separating hyperplane that pass through the support vectors
print(("support_vectors_=", clf.support_vectors_))
b = clf.support_vectors_[0]
yy_down = a * xx + (b[1] - a * b[0])
b = clf.support_vectors_[-1]
yy_up = a * xx + (b[1] - a * b[0])

# plot the line, the points, and the nearest vectors to the plane
plt.plot(xx, yy, 'k-')
plt.plot(xx, yy_down, 'k--')
plt.plot(xx, yy_up, 'k--')

plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=80, facecolors='none')
plt.scatter([X[:, 0]], [X[:, 1]], c=Y, cmap=plt.cm.Paired)

plt.axis('tight')
plt.show()
536 src/py2.x/ml/6.SVM/svm-complete.py Normal file
@@ -0,0 +1,536 @@
#!/usr/bin/python
# coding:utf8

"""
Created on Nov 4, 2010
Update on 2017-05-18
Chapter 5 source file for Machine Learing in Action
Author: Peter/geekidentity/片刻
GitHub: https://github.com/apachecn/AiLearning
"""
from __future__ import print_function
from numpy import *
import matplotlib.pyplot as plt


class optStruct:
    """
    a data structure that keeps all the important values together
    """
    def __init__(self, dataMatIn, classLabels, C, toler, kTup):
        """
        Args:
            dataMatIn    the data set
            classLabels  the class labels
            C            the slack constant; it allows some points to sit on
                         the wrong side of the separating plane, trading off
                         maximising the margin against keeping most
                         functional margins below 1.0. Tune it for different
                         results.
            toler        the error tolerance
            kTup         tuple holding the kernel information
        """
        self.X = dataMatIn
        self.labelMat = classLabels
        self.C = C
        self.tol = toler

        # number of data rows
        self.m = shape(dataMatIn)[0]
        self.alphas = mat(zeros((self.m, 1)))
        self.b = 0

        # error cache: the first column flags whether the cached value is
        # valid, the second holds the actual E value
        self.eCache = mat(zeros((self.m, 2)))

        # an m*m matrix
        self.K = mat(zeros((self.m, self.m)))
        for i in range(self.m):
            self.K[:, i] = kernelTrans(self.X, self.X[i, :], kTup)


def kernelTrans(X, A, kTup):  # calc the kernel or transform data to a higher dimensional space
    """
    kernel transform
    Args:
        X     the dataMatIn data set
        A     the i-th row of the data set
        kTup  the kernel information
    Returns:
        K     the kernel column for A against every row of X
    """
    m, n = shape(X)
    K = mat(zeros((m, 1)))
    if kTup[0] == 'lin':
        # linear kernel: m*n * n*1 = m*1
        K = X * A.T
    elif kTup[0] == 'rbf':
        for j in range(m):
            deltaRow = X[j, :] - A
            K[j] = deltaRow * deltaRow.T
        # the Gaussian version of the radial basis function
        K = exp(K / (-1 * kTup[1] ** 2))  # divide in NumPy is element-wise not matrix like Matlab
    else:
        raise NameError('Houston We Have a Problem -- That Kernel is not recognized')
    return K

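# Illustrative sketch (not part of the original file): the RBF value computed
# by kernelTrans is exp(-||x - a||^2 / sigma^2); note this code divides by
# sigma^2, not the more common 2*sigma^2.
def _rbfDemo():
    x = mat([[1.0, 2.0]])
    a = mat([[2.0, 0.0]])
    sigma = 1.3
    # ||x - a||^2 = 1 + 4 = 5, so the value is exp(-5/1.69) ~ 0.0519
    print(kernelTrans(x, a, ('rbf', sigma)))
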
def loadDataSet(fileName):
    """loadDataSet(parse the file line by line to get each line's class label and the full data matrix)

    Args:
        fileName   the file name
    Returns:
        dataMat    the data matrix
        labelMat   the class labels
    """
    dataMat = []
    labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr = line.strip().split('\t')
        dataMat.append([float(lineArr[0]), float(lineArr[1])])
        labelMat.append(float(lineArr[2]))
    return dataMat, labelMat

def calcEk(oS, k):
    """calcEk(compute Ek, the error: predicted value minus true value)

    This shows up often in the full SMO algorithm, hence its own helper.
    Args:
        oS  the optStruct object
        k   a specific row
    Returns:
        Ek  the error of the prediction against the true label
    """
    fXk = float(multiply(oS.alphas, oS.labelMat).T * oS.K[:, k] + oS.b)
    Ek = fXk - float(oS.labelMat[k])
    return Ek


def selectJrand(i, m):
    """
    pick a random integer
    Args:
        i  the index of the first alpha
        m  the total number of alphas
    Returns:
        j  a random integer in 0..m that differs from i
    """
    j = i
    while j == i:
        j = int(random.uniform(0, m))
    return j

def selectJ(i, oS, Ei):  # this is the second choice -heurstic, and calcs Ej
    """selectJ(return the best j and Ej)

    The heuristic for the inner loop:
    it picks the second (inner-loop) alpha so that each optimisation step
    takes the largest possible step size.
    The error depends on the first alpha's Ei and index i.
    Args:
        i   the specific i-th row
        oS  the optStruct object
        Ei  the error of prediction i against its true label
    Returns:
        j   the chosen j-th row
        Ej  the error of prediction j against its true label
    """
    maxK = -1
    maxDeltaE = 0
    Ej = 0
    # first mark Ei as valid in the cache; valid means it has been computed
    oS.eCache[i] = [1, Ei]

    # nonzero(oS.eCache[:, 0].A) returns the (row, column) indices of the
    # non-zero entries, e.g. rows [0, 2, 4, 5, ...] all in column 0; take [0]
    # for the row list, i.e. the alphas with a valid cached E value
    validEcacheList = nonzero(oS.eCache[:, 0].A)[0]
    if (len(validEcacheList)) > 1:
        for k in validEcacheList:  # loop over the valid values and pick the one with the largest change
            if k == i:
                continue  # don't calc for i, waste of time

            # Ek, the error: predicted value minus true value
            Ek = calcEk(oS, k)
            deltaE = abs(Ei - Ek)
            if (deltaE > maxDeltaE):
                # choose the j with the largest step size
                maxK = k
                maxDeltaE = deltaE
                Ej = Ek
        return maxK, Ej
    else:  # on the first pass, just pick a random alpha
        j = selectJrand(i, oS.m)

        # Ej, the error: predicted value minus true value
        Ej = calcEk(oS, j)
    return j, Ej

def updateEk(oS, k):
    """updateEk(compute the error value and store it in the cache)

    Used after optimising an alpha value.
    Args:
        oS  the optStruct object
        k   the row number
    """
    # the error: predicted value minus true value
    Ek = calcEk(oS, k)
    oS.eCache[k] = [1, Ek]


def clipAlpha(aj, H, L):
    """clipAlpha(clip aj so that L <= aj <= H)
    Args:
        aj  the value to clip
        H   the upper bound
        L   the lower bound
    Returns:
        aj  the clipped value
    """
    if aj > H:
        aj = H
    if L > aj:
        aj = L
    return aj

def innerL(i, oS):
    """innerL
    the inner loop
    Args:
        i   a specific row
        oS  the optStruct object
    Returns:
        0   no optimal value found
        1   an optimal value was found and cached in oS
    """
    # Ei, the error: predicted value minus true value
    Ei = calcEk(oS, i)

    # Constraints (the KKT conditions are a technique for finding the global
    # minimum of a given function over a constrained domain):
    # 0 <= alphas[i] <= C, but alphas stuck at the boundary values 0 and C
    # cannot be optimised, since every step must raise one alpha and lower
    # another. labelMat[i]*Ei measures the error; only optimise when it
    # exceeds toler (in absolute value, hence the two signed checks).
    '''
    # check whether the training sample (xi, yi) satisfies the KKT conditions
    yi*f(i) >= 1 and alpha = 0 (outside the boundary)
    yi*f(i) == 1 and 0<alpha< C (on the boundary)
    yi*f(i) <= 1 and alpha = C (between the boundary)
    '''
    if ((oS.labelMat[i] * Ei < -oS.tol) and (oS.alphas[i] < oS.C)) or ((oS.labelMat[i] * Ei > oS.tol) and (oS.alphas[i] > 0)):
        # pick the j with the largest error, for a more visible improvement
        j, Ej = selectJ(i, oS, Ei)
        alphaIold = oS.alphas[i].copy()
        alphaJold = oS.alphas[j].copy()

        # L and H clamp alphas[j] into 0..C; if L == H nothing can change, so
        # return 0
        if (oS.labelMat[i] != oS.labelMat[j]):
            L = max(0, oS.alphas[j] - oS.alphas[i])
            H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i])
        else:
            L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C)
            H = min(oS.C, oS.alphas[j] + oS.alphas[i])
        if L == H:
            # print("L==H")
            return 0

        # eta is the optimal amount to change alphas[j]; if eta >= 0, bail
        # out of this iteration of the for loop (see Li Hang, "Statistical
        # Learning Methods", pp. 125-128, sequential minimal optimization)
        eta = 2.0 * oS.K[i, j] - oS.K[i, i] - oS.K[j, j]  # changed for kernel
        if eta >= 0:
            print("eta>=0")
            return 0

        # compute a new alphas[j]
        oS.alphas[j] -= oS.labelMat[j] * (Ei - Ej) / eta
        # and clip it into [L, H] with the helper
        oS.alphas[j] = clipAlpha(oS.alphas[j], H, L)
        # refresh the error cache
        updateEk(oS, j)

        # if alpha[j] barely moved, give up on this pair
        if (abs(oS.alphas[j] - alphaJold) < 0.00001):
            # print("j not moving enough")
            return 0

        # move alphas[i] by the same amount in the opposite direction
        oS.alphas[i] += oS.labelMat[j] * oS.labelMat[i] * (alphaJold - oS.alphas[j])
        # refresh the error cache
        updateEk(oS, i)

        # after optimising alpha[i] and alpha[j], recompute the constant b.
        # w = sum over i of ai*yi*xi  =>  b = yi - sum over i of ai*yi*(xi*xj)
        # hence: b1 - b = (y1-y) - sum yi*(a1-a)*(xi*x1);
        # two terms are subtracted because exactly the two variables i and j
        # changed inside the sum
        b1 = oS.b - Ei - oS.labelMat[i] * (oS.alphas[i] - alphaIold) * oS.K[i, i] - oS.labelMat[j] * (oS.alphas[j] - alphaJold) * oS.K[i, j]
        b2 = oS.b - Ej - oS.labelMat[i] * (oS.alphas[i] - alphaIold) * oS.K[i, j] - oS.labelMat[j] * (oS.alphas[j] - alphaJold) * oS.K[j, j]
        if (0 < oS.alphas[i]) and (oS.C > oS.alphas[i]):
            oS.b = b1
        elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]):
            oS.b = b2
        else:
            oS.b = (b1 + b2) / 2.0
        return 1
    else:
        return 0

def smoP(dataMatIn, classLabels, C, toler, maxIter, kTup=('lin', 0)):
    """
    Outer loop of the full Platt SMO. Similar to smoSimple, but with more exit conditions.
    Args:
        dataMatIn    the dataset
        classLabels  class labels
        C            slack constant; allows some points to sit on the wrong side of the
                     separating surface. It trades off maximizing the margin against
                     keeping most functional margins below 1.0; tune it for different results.
        toler        tolerance on the KKT violation
        maxIter      maximum number of passes before exiting
        kTup         tuple describing the kernel
    Returns:
        b       the model's constant term
        alphas  the Lagrange multipliers
    """

    # build an optStruct object
    oS = optStruct(mat(dataMatIn), mat(classLabels).transpose(), C, toler, kTup)
    iter = 0
    entireSet = True
    alphaPairsChanged = 0

    # Loop while iter < maxIter and (some alpha pair changed or a full pass is due).
    while (iter < maxIter) and ((alphaPairsChanged > 0) or (entireSet)):
        alphaPairsChanged = 0

        # When entireSet is True (or no non-bound alphas are left), scan for alpha
        # pairs over the whole set; otherwise take the else branch.
        if entireSet:
            # go over every possible alpha in the dataset
            for i in range(oS.m):
                # count one whenever a pair of alphas changed
                alphaPairsChanged += innerL(i, oS)
                # print("fullSet, iter: %d i:%d, pairs changed %d" % (iter, i, alphaPairsChanged))
            iter += 1

        # Otherwise optimize only over the existing non-bound alphas.
        else:
            # go over the non-bound alphas, i.e. values not sitting at 0 or C
            nonBoundIs = nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0]
            for i in nonBoundIs:
                alphaPairsChanged += innerL(i, oS)
                # print("non-bound, iter: %d i:%d, pairs changed %d" % (iter, i, alphaPairsChanged))
            iter += 1

        # Alternate: after a full pass switch to non-bound passes; if a non-bound pass
        # changes nothing, go back to a full pass. Exit when a full pass finds nothing.
        if entireSet:
            entireSet = False  # toggle entire set loop
        elif (alphaPairsChanged == 0):
            entireSet = True
        print("iteration number: %d" % iter)
    return oS.b, oS.alphas
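
# A hedged usage sketch (not in the original file): kTup selects the kernel,
# e.g. ('lin', 0) for a plain dot product or ('rbf', 1.3) for a Gaussian kernel
# of width 1.3, matching how testRbf below calls smoP. The data path follows
# the layout used elsewhere in this file.
# dataArr, labelArr = loadDataSet('data/6.SVM/testSetRBF.txt')
# b, alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, ('rbf', 1.3))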
def calcWs(alphas, dataArr, classLabels):
    """
    Compute w from the alphas.
    Args:
        alphas       Lagrange multipliers
        dataArr      feature dataset
        classLabels  target variable dataset
    Returns:
        w   regression coefficients
    """
    X = mat(dataArr)
    labelMat = mat(classLabels).transpose()
    m, n = shape(X)
    w = zeros((n, 1))
    for i in range(m):
        w += multiply(alphas[i] * labelMat[i], X[i, :].T)
    return w
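
# A minimal sketch (not in the original file) of how w and b classify a new
# point once calcWs has run; only valid for the linear kernel, where the
# decision function collapses to f(x) = x*w + b.
# ws = calcWs(alphas, dataArr, labelArr)
# fx = mat(dataArr)[0] * mat(ws) + b   # > 0 predicts class +1, < 0 predicts -1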
def testRbf(k1=1.3):
    dataArr, labelArr = loadDataSet('data/6.SVM/testSetRBF.txt')
    b, alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, ('rbf', k1))  # C=200 important
    datMat = mat(dataArr)
    labelMat = mat(labelArr).transpose()
    svInd = nonzero(alphas.A > 0)[0]
    sVs = datMat[svInd]  # get matrix of only support vectors
    labelSV = labelMat[svInd]
    print("there are %d Support Vectors" % shape(sVs)[0])
    m, n = shape(datMat)
    errorCount = 0
    for i in range(m):
        kernelEval = kernelTrans(sVs, datMat[i, :], ('rbf', k1))

        # analogous to svm-simple: fXi = float(multiply(alphas, labelMat).T*(dataMatrix*dataMatrix[i, :].T)) + b
        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
        if sign(predict) != sign(labelArr[i]):
            errorCount += 1
    print("the training error rate is: %f" % (float(errorCount) / m))

    dataArr, labelArr = loadDataSet('data/6.SVM/testSetRBF2.txt')
    errorCount = 0
    datMat = mat(dataArr)
    labelMat = mat(labelArr).transpose()
    m, n = shape(datMat)
    for i in range(m):
        kernelEval = kernelTrans(sVs, datMat[i, :], ('rbf', k1))
        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
        if sign(predict) != sign(labelArr[i]):
            errorCount += 1
    print("the test error rate is: %f" % (float(errorCount) / m))
def img2vector(filename):
    # flatten a 32x32 text image into a 1x1024 row vector
    returnVect = zeros((1, 1024))
    fr = open(filename)
    for i in range(32):
        lineStr = fr.readline()
        for j in range(32):
            returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect
def loadImages(dirName):
    from os import listdir
    hwLabels = []
    print(dirName)
    trainingFileList = listdir(dirName)  # load the training set
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]
        fileStr = fileNameStr.split('.')[0]  # take off .txt
        classNumStr = int(fileStr.split('_')[0])
        # binary problem: digit 9 is the negative class, everything else is positive
        if classNumStr == 9:
            hwLabels.append(-1)
        else:
            hwLabels.append(1)
        trainingMat[i, :] = img2vector('%s/%s' % (dirName, fileNameStr))
    return trainingMat, hwLabels
def testDigits(kTup=('rbf', 10)):

    # 1. load the training data
    dataArr, labelArr = loadImages('data/6.SVM/trainingDigits')
    b, alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, kTup)
    datMat = mat(dataArr)
    labelMat = mat(labelArr).transpose()
    svInd = nonzero(alphas.A > 0)[0]
    sVs = datMat[svInd]
    labelSV = labelMat[svInd]
    # print("there are %d Support Vectors" % shape(sVs)[0])
    m, n = shape(datMat)
    errorCount = 0
    for i in range(m):
        kernelEval = kernelTrans(sVs, datMat[i, :], kTup)
        # 1*m * m*1 = 1*1, a single prediction
        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
        if sign(predict) != sign(labelArr[i]): errorCount += 1
    print("the training error rate is: %f" % (float(errorCount) / m))

    # 2. load the test data
    dataArr, labelArr = loadImages('data/6.SVM/testDigits')
    errorCount = 0
    datMat = mat(dataArr)
    labelMat = mat(labelArr).transpose()
    m, n = shape(datMat)
    for i in range(m):
        kernelEval = kernelTrans(sVs, datMat[i, :], kTup)
        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
        if sign(predict) != sign(labelArr[i]): errorCount += 1
    print("the test error rate is: %f" % (float(errorCount) / m))
def plotfig_SVM(xArr, yArr, ws, b, alphas):
    """
    References:
    http://blog.csdn.net/maoersong/article/details/24315633
    http://www.cnblogs.com/JustForCS/p/5283489.html
    http://blog.csdn.net/kkxgx/article/details/6951959
    """

    xMat = mat(xArr)
    yMat = mat(yArr)

    # b is a matrix; converted to an array its shape is (1, 1), so index [0] to get shape (1,)
    b = array(b)[0]
    fig = plt.figure()
    ax = fig.add_subplot(111)

    # note the use of flatten
    ax.scatter(xMat[:, 0].flatten().A[0], xMat[:, 1].flatten().A[0])

    # the x range depends on the span of dataArr[:, 0] in the original dataset
    x = arange(-1.0, 10.0, 0.1)

    # from x.w + b = 0, i.e. w0*x1 + w1*x2 + b = 0, solve for x2 (the y value)
    y = (-b - ws[0, 0] * x) / ws[1, 0]
    ax.plot(x, y)

    for i in range(shape(yMat[0, :])[1]):
        if yMat[0, i] > 0:
            ax.plot(xMat[i, 0], xMat[i, 1], 'cx')
        else:
            ax.plot(xMat[i, 0], xMat[i, 1], 'kp')

    # mark the support vectors in red
    for i in range(100):
        if alphas[i] > 0.0:
            ax.plot(xMat[i, 0], xMat[i, 1], 'ro')
    plt.show()
if __name__ == "__main__":
|
||||
|
||||
# 无核函数的测试
|
||||
# 获取特征和目标变量
|
||||
dataArr, labelArr = loadDataSet('data/6.SVM/testSet.txt')
|
||||
# print labelArr
|
||||
|
||||
# b是常量值, alphas是拉格朗日乘子
|
||||
b, alphas = smoP(dataArr, labelArr, 0.6, 0.001, 40)
|
||||
print('/n/n/n')
|
||||
print('b=', b)
|
||||
print('alphas[alphas>0]=', alphas[alphas > 0])
|
||||
print('shape(alphas[alphas > 0])=', shape(alphas[alphas > 0]))
|
||||
for i in range(100):
|
||||
if alphas[i] > 0:
|
||||
print(dataArr[i], labelArr[i])
|
||||
# 画图
|
||||
ws = calcWs(alphas, dataArr, labelArr)
|
||||
plotfig_SVM(dataArr, labelArr, ws, b, alphas)
|
||||
|
||||
# 有核函数的测试
|
||||
testRbf(0.8)
|
||||
|
||||
# # 项目实战
|
||||
# # 示例:手写识别问题回顾
|
||||
# testDigits(('rbf', 0.1))
|
||||
# testDigits(('rbf', 5))
|
||||
# testDigits(('rbf', 10))
|
||||
# testDigits(('rbf', 50))
|
||||
# testDigits(('rbf', 100))
|
||||
# testDigits(('lin'))
|
||||
375
src/py2.x/ml/6.SVM/svm-complete_Non-Kernel.py
Normal file
@@ -0,0 +1,375 @@
#!/usr/bin/python
# coding:utf8

"""
Created on Nov 4, 2010
Update on 2017-05-18
Chapter 6 source file for Machine Learning in Action
Author: Peter/geekidentity/片刻
GitHub: https://github.com/apachecn/AiLearning
"""
from __future__ import print_function
from numpy import *
import matplotlib.pyplot as plt


class optStruct:
    def __init__(self, dataMatIn, classLabels, C, toler):  # initialize the structure with the parameters
        self.X = dataMatIn
        self.labelMat = classLabels
        self.C = C
        self.tol = toler
        self.m = shape(dataMatIn)[0]
        self.alphas = mat(zeros((self.m, 1)))
        self.b = 0
        self.eCache = mat(zeros((self.m, 2)))  # first column is valid flag
def loadDataSet(fileName):
    """loadDataSet(parse the file line by line to get each row's class label and the full data matrix)

    Args:
        fileName  name of the file
    Returns:
        dataMat   data matrix
        labelMat  class labels
    """
    dataMat = []
    labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr = line.strip().split('\t')
        dataMat.append([float(lineArr[0]), float(lineArr[1])])
        labelMat.append(float(lineArr[2]))
    return dataMat, labelMat
def selectJrand(i, m):
    """
    Pick a random integer index different from i.
    Args:
        i  index of the first alpha
        m  total number of alphas
    Returns:
        j  a random integer in [0, m) with j != i
    """
    j = i
    while j == i:
        j = int(random.uniform(0, m))
    return j
def clipAlpha(aj, H, L):
    """clipAlpha(clip aj so that L <= aj <= H)
    Args:
        aj  value to clip
        H   upper bound
        L   lower bound
    Returns:
        aj  the clipped value
    """
    if aj > H:
        aj = H
    if L > aj:
        aj = L
    return aj
def calcEk(oS, k):
    """calcEk(compute the Ek error: predicted value minus true value)

    This shows up many times in the full SMO algorithm, hence a separate helper.
    Args:
        oS  the optStruct object
        k   a specific row
    Returns:
        Ek  the error of the prediction for row k against the true label
    """
    fXk = float(multiply(oS.alphas, oS.labelMat).T * (oS.X * oS.X[k, :].T)) + oS.b
    Ek = fXk - float(oS.labelMat[k])
    return Ek
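
# A small sketch (not in the original file) of the decision function calcEk
# evaluates, f(xk) = sum_i alphas[i]*y[i]*<x[i], xk> + b, spelled out with an
# explicit loop over two made-up samples.
def _fx_sketch():
    X = mat([[1.0, 2.0], [2.0, 0.0]])
    y = [1.0, -1.0]
    alphas = [0.5, 0.5]
    b = 0.1
    xk = mat([[1.0, 1.0]])
    fx = b
    for i in range(2):
        fx += alphas[i] * y[i] * float(X[i] * xk.T)  # dot product <x[i], xk>
    return fx  # 0.5*1*3.0 + 0.5*(-1)*2.0 + 0.1 = 0.6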
def selectJ(i, oS, Ei):  # this is the second-choice heuristic; also computes Ej
    """selectJ(return the best j and its error Ej)

    Heuristic for the inner loop: choose the second alpha so that each
    optimization step takes the largest possible step size, i.e. maximize
    |Ei - Ej|. The choice depends on the first alpha's index i and error Ei.
    Args:
        i   row of the first alpha
        oS  the optStruct object
        Ei  error of the prediction for row i
    Returns:
        j   the chosen row
        Ej  error of the prediction for row j
    """
    maxK = -1
    maxDeltaE = 0
    Ej = 0
    # Mark Ei as valid in the cache; valid means it has been computed.
    oS.eCache[i] = [1, Ei]

    # print('oS.eCache[%s]=%s' % (i, oS.eCache[i]))
    # print('oS.eCache[:, 0].A=%s' % oS.eCache[:, 0].A.T)
    # """
    # # nonzero returns the (row, column) indices of the non-zero entries, e.g.
    # nonzero(oS.eCache[:, 0].A) = (
    #     rows:    array([ 0,  2,  4,  5,  8, 10, 17, 18, 20, 21, 23, 25, 26, 29, 30, 39, 46, 52, 54, 55, 62, 69, 70, 76, 79, 82, 94, 97]),
    #     columns: array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
    # )
    # """
    # print('nonzero(oS.eCache[:, 0].A)=', nonzero(oS.eCache[:, 0].A))
    # # take the list of rows
    # print('nonzero(oS.eCache[:, 0].A)[0]=', nonzero(oS.eCache[:, 0].A)[0])
    # rows with a non-zero (valid) cached error, i.e. the corresponding alphas
    validEcacheList = nonzero(oS.eCache[:, 0].A)[0]
    if (len(validEcacheList)) > 1:
        for k in validEcacheList:  # loop over the valid errors and pick the one giving the biggest change
            if k == i:
                continue  # don't calc for i, waste of time

            # Ek error: predicted value - true value
            Ek = calcEk(oS, k)
            deltaE = abs(Ei - Ek)
            if (deltaE > maxDeltaE):
                maxK = k
                maxDeltaE = deltaE
                Ej = Ek
        return maxK, Ej
    else:  # on the first pass there are no valid cached errors yet, so pick a random alpha
        j = selectJrand(i, oS.m)

        # Ek error: predicted value - true value
        Ej = calcEk(oS, j)
    return j, Ej
def updateEk(oS, k):  # after any alpha has changed, update the new value in the cache
    """updateEk(compute the error for row k and store it in the cache)

    Used after an alpha value has been optimized.
    Args:
        oS  the optStruct object
        k   row index of the sample
    """

    # error = predicted value - true value
    Ek = calcEk(oS, k)
    oS.eCache[k] = [1, Ek]
def innerL(i, oS):
    """innerL
    Inner loop of the full Platt SMO.
    Args:
        i   index of the first alpha (a specific row)
        oS  the optStruct object
    Returns:
        0   no pair of alphas was changed
        1   a pair of alphas was optimized and the error cache (oS.eCache) was updated
    """

    # Ek error: predicted value - true value
    Ei = calcEk(oS, i)

    # Constraint check (the KKT conditions characterize the optimum of the constrained problem).
    # 0 <= alphas[i] <= C; the boundary values 0 and C cannot be optimized further, because
    # optimizing a pair requires increasing one alpha while decreasing the other.
    # labelMat[i]*Ei measures the violation: only optimize if it exceeds toler
    # (sign aside, think of it as an absolute deviation).
    '''
    # A training sample (xi, yi) satisfies the KKT conditions when:
    yi*f(i) >= 1 and alpha = 0   (outside the margin)
    yi*f(i) == 1 and 0 < alpha < C   (on the margin boundary)
    yi*f(i) <= 1 and alpha = C   (inside the margin)
    '''
    if ((oS.labelMat[i] * Ei < -oS.tol) and (oS.alphas[i] < oS.C)) or ((oS.labelMat[i] * Ei > oS.tol) and (oS.alphas[i] > 0)):
        # Pick the j with the largest error difference; this gives a bigger step.
        j, Ej = selectJ(i, oS, Ei)
        alphaIold = oS.alphas[i].copy()
        alphaJold = oS.alphas[j].copy()

        # L and H bound alphas[j] inside [0, C]. If L == H nothing can change; return 0.
        if (oS.labelMat[i] != oS.labelMat[j]):
            L = max(0, oS.alphas[j] - oS.alphas[i])
            H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i])
        else:
            L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C)
            H = min(oS.C, oS.alphas[j] + oS.alphas[i])
        if L == H:
            print("L==H")
            return 0

        # eta determines the optimal change of alphas[j]; if eta >= 0 skip this pair
        # in the current pass.
        # See Li Hang, "Statistical Learning Methods", pp. 125-128 (SMO).
        eta = 2.0 * oS.X[i, :] * oS.X[j, :].T - oS.X[i, :] * oS.X[i, :].T - oS.X[j, :] * oS.X[j, :].T
        if eta >= 0:
            print("eta>=0")
            return 0

        # compute the new alphas[j] ...
        oS.alphas[j] -= oS.labelMat[j] * (Ei - Ej) / eta
        # ... and clip it into [L, H] with the helper function
        oS.alphas[j] = clipAlpha(oS.alphas[j], H, L)
        # update the error cache
        updateEk(oS, j)

        # If alpha[j] barely moved, give up on this pair.
        if (abs(oS.alphas[j] - alphaJold) < 0.00001):
            print("j not moving enough")
            return 0

        # Change alphas[i] by the same amount as alphas[j], but in the opposite direction.
        oS.alphas[i] += oS.labelMat[j] * oS.labelMat[i] * (alphaJold - oS.alphas[j])
        # update the error cache
        updateEk(oS, i)

        # After optimizing alpha[i] and alpha[j], recompute the constant b.
        # From w = Σ[1~n] ai*yi*xi it follows that b = yj - Σ[1~n] ai*yi*(xi·xj),
        # so: b1 - b = (y1 - y) - Σ[1~n] yi*(a1 - a)*(xi·x1).
        # Two correction terms appear because exactly two alphas (i and j) changed.
        b1 = oS.b - Ei - oS.labelMat[i] * (oS.alphas[i] - alphaIold) * oS.X[i, :] * oS.X[i, :].T - oS.labelMat[j] * (oS.alphas[j] - alphaJold) * oS.X[i, :] * oS.X[j, :].T
        b2 = oS.b - Ej - oS.labelMat[i] * (oS.alphas[i] - alphaIold) * oS.X[i, :] * oS.X[j, :].T - oS.labelMat[j] * (oS.alphas[j] - alphaJold) * oS.X[j, :] * oS.X[j, :].T
        if (0 < oS.alphas[i]) and (oS.C > oS.alphas[i]):
            oS.b = b1
        elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]):
            oS.b = b2
        else:
            oS.b = (b1 + b2) / 2.0
        return 1
    else:
        return 0
def smoP(dataMatIn, classLabels, C, toler, maxIter):
    """
    Outer loop of the full Platt SMO. Similar to smoSimple, but with more exit conditions.
    Args:
        dataMatIn    the dataset
        classLabels  class labels
        C            slack constant; allows some points to sit on the wrong side of the
                     separating surface. It trades off maximizing the margin against
                     keeping most functional margins below 1.0; tune it for different results.
        toler        tolerance on the KKT violation
        maxIter      maximum number of passes before exiting
    Returns:
        b       the model's constant term
        alphas  the Lagrange multipliers
    """

    # build an optStruct object
    oS = optStruct(mat(dataMatIn), mat(classLabels).transpose(), C, toler)
    iter = 0
    entireSet = True
    alphaPairsChanged = 0

    # Loop while iter < maxIter and (some alpha pair changed or a full pass is due);
    # the loop ends when the iterations run out, or when a full pass over all alphas
    # changes no pair at all.
    while (iter < maxIter) and ((alphaPairsChanged > 0) or (entireSet)):
        alphaPairsChanged = 0

        # When entireSet is True (or no non-bound alphas are left), scan for alpha
        # pairs over the whole set; otherwise take the else branch.
        if entireSet:
            # go over every possible alpha in the dataset
            for i in range(oS.m):
                # count one whenever a pair of alphas changed
                alphaPairsChanged += innerL(i, oS)
                print("fullSet, iter: %d i:%d, pairs changed %d" % (iter, i, alphaPairsChanged))
            iter += 1
        # Otherwise optimize only over the existing non-bound alphas.
        else:
            # go over the non-bound alphas, i.e. values not sitting at 0 or C
            nonBoundIs = nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0]
            for i in nonBoundIs:
                alphaPairsChanged += innerL(i, oS)
                print("non-bound, iter: %d i:%d, pairs changed %d" % (iter, i, alphaPairsChanged))
            iter += 1

        # Alternate: after a full pass switch to non-bound passes; if a non-bound pass
        # changes nothing, go back to a full pass. Exit when a full pass finds nothing.
        if entireSet:
            entireSet = False  # toggle entire set loop
        elif (alphaPairsChanged == 0):
            entireSet = True
        print("iteration number: %d" % iter)
    return oS.b, oS.alphas
def calcWs(alphas, dataArr, classLabels):
    """
    Compute w from the alphas.
    Args:
        alphas       Lagrange multipliers
        dataArr      feature dataset
        classLabels  target variable dataset
    Returns:
        w   regression coefficients
    """
    X = mat(dataArr)
    labelMat = mat(classLabels).transpose()
    m, n = shape(X)
    w = zeros((n, 1))
    for i in range(m):
        w += multiply(alphas[i] * labelMat[i], X[i, :].T)
    return w
def plotfig_SVM(xArr, yArr, ws, b, alphas):
    """
    References:
    http://blog.csdn.net/maoersong/article/details/24315633
    http://www.cnblogs.com/JustForCS/p/5283489.html
    http://blog.csdn.net/kkxgx/article/details/6951959
    """

    xMat = mat(xArr)
    yMat = mat(yArr)

    # b is a matrix; converted to an array its shape is (1, 1), so index [0] to get shape (1,)
    b = array(b)[0]
    fig = plt.figure()
    ax = fig.add_subplot(111)

    # note the use of flatten
    ax.scatter(xMat[:, 0].flatten().A[0], xMat[:, 1].flatten().A[0])

    # the x range depends on the span of dataArr[:, 0] in the original dataset
    x = arange(-1.0, 10.0, 0.1)

    # from x.w + b = 0, i.e. w0*x1 + w1*x2 + b = 0, solve for x2 (the y value)
    y = (-b - ws[0, 0] * x) / ws[1, 0]
    ax.plot(x, y)

    for i in range(shape(yMat[0, :])[1]):
        if yMat[0, i] > 0:
            ax.plot(xMat[i, 0], xMat[i, 1], 'cx')
        else:
            ax.plot(xMat[i, 0], xMat[i, 1], 'kp')

    # mark the support vectors in red
    for i in range(100):
        if alphas[i] > 0.0:
            ax.plot(xMat[i, 0], xMat[i, 1], 'ro')
    plt.show()
if __name__ == "__main__":
|
||||
# 获取特征和目标变量
|
||||
dataArr, labelArr = loadDataSet('data/6.SVM/testSet.txt')
|
||||
# print labelArr
|
||||
|
||||
# b是常量值, alphas是拉格朗日乘子
|
||||
b, alphas = smoP(dataArr, labelArr, 0.6, 0.001, 40)
|
||||
print('/n/n/n')
|
||||
print('b=', b)
|
||||
print('alphas[alphas>0]=', alphas[alphas > 0])
|
||||
print('shape(alphas[alphas > 0])=', shape(alphas[alphas > 0]))
|
||||
for i in range(100):
|
||||
if alphas[i] > 0:
|
||||
print(dataArr[i], labelArr[i])
|
||||
# 画图
|
||||
ws = calcWs(alphas, dataArr, labelArr)
|
||||
plotfig_SVM(dataArr, labelArr, ws, b, alphas)
|
||||
255
src/py2.x/ml/6.SVM/svm-simple.py
Normal file
@@ -0,0 +1,255 @@
#!/usr/bin/python
# coding:utf8

"""
Created on Nov 4, 2010
Update on 2017-05-18
Chapter 6 source file for Machine Learning in Action
Author: Peter/geekidentity/片刻
GitHub: https://github.com/apachecn/AiLearning
"""
from __future__ import print_function
from numpy import *
import matplotlib.pyplot as plt
def loadDataSet(fileName):
    """
    Parse the file line by line to get each row's class label and the full feature matrix.
    Args:
        fileName  name of the file
    Returns:
        dataMat   feature matrix
        labelMat  class labels
    """
    dataMat = []
    labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr = line.strip().split('\t')
        dataMat.append([float(lineArr[0]), float(lineArr[1])])
        labelMat.append(float(lineArr[2]))
    return dataMat, labelMat
def selectJrand(i, m):
    """
    Pick a random integer index different from i.
    Args:
        i  index of the first alpha
        m  total number of alphas
    Returns:
        j  a random integer in [0, m) with j != i
    """
    j = i
    while j == i:
        j = int(random.uniform(0, m))
    return j
def clipAlpha(aj, H, L):
    """clipAlpha(clip aj so that L <= aj <= H)
    Args:
        aj  value to clip
        H   upper bound
        L   lower bound
    Returns:
        aj  the clipped value
    """
    if aj > H:
        aj = H
    if L > aj:
        aj = L
    return aj
def smoSimple(dataMatIn, classLabels, C, toler, maxIter):
    """smoSimple

    Args:
        dataMatIn    the dataset
        classLabels  class labels
        C            slack constant; allows some points to sit on the wrong side of the
                     separating surface. It trades off maximizing the margin against
                     keeping most functional margins below 1.0; tune it for different results.
        toler        tolerance on the KKT violation (how much slack the check allows
                     before a pair is considered for optimization)
        maxIter      maximum number of passes before exiting
    Returns:
        b       the model's constant term
        alphas  the Lagrange multipliers
    """
    dataMatrix = mat(dataMatIn)
    # transpose the matrix; same as .T
    labelMat = mat(classLabels).transpose()
    m, n = shape(dataMatrix)

    # initialize b and the alphas (the alphas act a bit like weights)
    b = 0
    alphas = mat(zeros((m, 1)))

    # number of full passes over the data without any alpha changing
    iter = 0
    while (iter < maxIter):
        # w = calcWs(alphas, dataMatIn, classLabels)
        # print("w:", w)

        # Track whether any alpha was optimized; reset to 0 each pass,
        # then scan the whole set in order.
        alphaPairsChanged = 0
        for i in range(m):
            # print('alphas=', alphas)
            # print('labelMat=', labelMat)
            # print('multiply(alphas, labelMat)=', multiply(alphas, labelMat))
            # predicted class: y = w^T*x[i] + b, where w = Σ(1~n) a[n]*label[n]*x[n]
            fXi = float(multiply(alphas, labelMat).T*(dataMatrix*dataMatrix[i, :].T)) + b
            # compare prediction with the true label to get the error Ei
            Ei = fXi - float(labelMat[i])

            # Constraint check (the KKT conditions characterize the optimum of the constrained problem).
            # 0 <= alphas[i] <= C; the boundary values 0 and C cannot be optimized further, because
            # optimizing a pair requires increasing one alpha while decreasing the other.
            # labelMat[i]*Ei measures the violation: only optimize if it exceeds toler
            # (sign aside, think of it as an absolute deviation).
            '''
            # A training sample (xi, yi) satisfies the KKT conditions when:
            yi*f(i) >= 1 and alpha = 0   (outside the margin)
            yi*f(i) == 1 and 0 < alpha < C   (on the margin boundary)
            yi*f(i) <= 1 and alpha = C   (inside the margin)
            '''
            if ((labelMat[i]*Ei < -toler) and (alphas[i] < C)) or ((labelMat[i]*Ei > toler) and (alphas[i] > 0)):

                # this sample qualifies for optimization: pick a random second point j != i
                j = selectJrand(i, m)
                # prediction and error for j
                fXj = float(multiply(alphas, labelMat).T*(dataMatrix*dataMatrix[j, :].T)) + b
                Ej = fXj - float(labelMat[j])
                alphaIold = alphas[i].copy()
                alphaJold = alphas[j].copy()

                # L and H bound alphas[j] inside [0, C]; if L == H, continue with the next i.
                # labelMat[i] != labelMat[j] means opposite labels, so subtract; equal labels, add.
                if (labelMat[i] != labelMat[j]):
                    L = max(0, alphas[j] - alphas[i])
                    H = min(C, C + alphas[j] - alphas[i])
                else:
                    L = max(0, alphas[j] + alphas[i] - C)
                    H = min(C, alphas[j] + alphas[i])
                # if the bounds coincide, there is nothing to optimize
                if L == H:
                    print("L==H")
                    continue

                # eta determines the optimal change of alphas[j]; if eta >= 0 skip this
                # sample in the current pass.
                # See Li Hang, "Statistical Learning Methods", pp. 125-128 (SMO).
                eta = 2.0 * dataMatrix[i, :]*dataMatrix[j, :].T - dataMatrix[i, :]*dataMatrix[i, :].T - dataMatrix[j, :]*dataMatrix[j, :].T
                if eta >= 0:
                    print("eta>=0")
                    continue

                # compute the new alphas[j] ...
                alphas[j] -= labelMat[j]*(Ei - Ej)/eta
                # ... and clip it into [L, H] with the helper function
                alphas[j] = clipAlpha(alphas[j], H, L)
                # if alpha[j] barely moved, continue with the next i
                if (abs(alphas[j] - alphaJold) < 0.00001):
                    print("j not moving enough")
                    continue
                # change alphas[i] by the same amount in the opposite direction
                alphas[i] += labelMat[j]*labelMat[i]*(alphaJold - alphas[j])
                # After optimizing alpha[i] and alpha[j], recompute the constant b.
                # From w = Σ[1~n] ai*yi*xi it follows that b = yj - Σ[1~n] ai*yi*(xi·xj),
                # so: b1 - b = (y1 - y) - Σ[1~n] yi*(a1 - a)*(xi·x1).
                # Two correction terms appear because exactly two alphas (i and j) changed.
                b1 = b - Ei - labelMat[i]*(alphas[i]-alphaIold)*dataMatrix[i, :]*dataMatrix[i, :].T - labelMat[j]*(alphas[j]-alphaJold)*dataMatrix[i, :]*dataMatrix[j, :].T
                b2 = b - Ej - labelMat[i]*(alphas[i]-alphaIold)*dataMatrix[i, :]*dataMatrix[j, :].T - labelMat[j]*(alphas[j]-alphaJold)*dataMatrix[j, :]*dataMatrix[j, :].T
                if (0 < alphas[i]) and (C > alphas[i]):
                    b = b1
                elif (0 < alphas[j]) and (C > alphas[j]):
                    b = b2
                else:
                    b = (b1 + b2)/2.0
                alphaPairsChanged += 1
                print("iter: %d i:%d, pairs changed %d" % (iter, i, alphaPairsChanged))
        # After the for loop: if no alpha was updated, count one quiet pass; otherwise
        # reset the counter. Exit only after maxIter consecutive passes without change.
        if (alphaPairsChanged == 0):
            iter += 1
        else:
            iter = 0
        print("iteration number: %d" % iter)
    return b, alphas
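
# A tiny numeric sketch (not in the original file) of the L/H box bounds used
# inside smoSimple, with made-up alphas and C: for opposite labels the pair
# moves along alpha_j - alpha_i = const, for equal labels along alpha_j + alpha_i = const.
def _lh_bounds_sketch():
    C, ai, aj = 0.6, 0.1, 0.5
    # opposite labels: L = max(0, aj - ai) = 0.4, H = min(C, C + aj - ai) = 0.6
    L1, H1 = max(0, aj - ai), min(C, C + aj - ai)
    # equal labels: L = max(0, aj + ai - C) = 0.0, H = min(C, aj + ai) = 0.6
    L2, H2 = max(0, aj + ai - C), min(C, aj + ai)
    return (L1, H1), (L2, H2)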
def calcWs(alphas, dataArr, classLabels):
    """
    Compute w from the alphas.
    Args:
        alphas       Lagrange multipliers
        dataArr      feature dataset
        classLabels  target variable dataset
    Returns:
        w   regression coefficients
    """
    X = mat(dataArr)
    labelMat = mat(classLabels).transpose()
    m, n = shape(X)
    w = zeros((n, 1))
    for i in range(m):
        w += multiply(alphas[i] * labelMat[i], X[i, :].T)
    return w
def plotfig_SVM(xMat, yMat, ws, b, alphas):
    """
    References:
    http://blog.csdn.net/maoersong/article/details/24315633
    http://www.cnblogs.com/JustForCS/p/5283489.html
    http://blog.csdn.net/kkxgx/article/details/6951959
    """

    xMat = mat(xMat)
    yMat = mat(yMat)

    # b is a matrix; converted to an array its shape is (1, 1), so index [0] to get shape (1,)
    b = array(b)[0]
    fig = plt.figure()
    ax = fig.add_subplot(111)

    # note the use of flatten
    ax.scatter(xMat[:, 0].flatten().A[0], xMat[:, 1].flatten().A[0])

    # the x range depends on the span of dataArr[:, 0] in the original dataset
    x = arange(-1.0, 10.0, 0.1)

    # from x.w + b = 0, i.e. w0*x1 + w1*x2 + b = 0, solve for x2 (the y value)
    y = (-b - ws[0, 0] * x) / ws[1, 0]
    ax.plot(x, y)

    for i in range(shape(yMat[0, :])[1]):
        if yMat[0, i] > 0:
            ax.plot(xMat[i, 0], xMat[i, 1], 'cx')
        else:
            ax.plot(xMat[i, 0], xMat[i, 1], 'kp')

    # mark the support vectors in red
    for i in range(100):
        if alphas[i] > 0.0:
            ax.plot(xMat[i, 0], xMat[i, 1], 'ro')
    plt.show()
if __name__ == "__main__":
|
||||
# 获取特征和目标变量
|
||||
dataArr, labelArr = loadDataSet('data/6.SVM/testSet.txt')
|
||||
# print labelArr
|
||||
|
||||
# b是常量值, alphas是拉格朗日乘子
|
||||
b, alphas = smoSimple(dataArr, labelArr, 0.6, 0.001, 40)
|
||||
print('/n/n/n')
|
||||
print('b=', b)
|
||||
print('alphas[alphas>0]=', alphas[alphas > 0])
|
||||
print('shape(alphas[alphas > 0])=', shape(alphas[alphas > 0]))
|
||||
for i in range(100):
|
||||
if alphas[i] > 0:
|
||||
print(dataArr[i], labelArr[i])
|
||||
# 画图
|
||||
ws = calcWs(alphas, dataArr, labelArr)
|
||||
plotfig_SVM(dataArr, labelArr, ws, b, alphas)
|
||||
312
src/py2.x/ml/7.AdaBoost/adaboost.py
Normal file
@@ -0,0 +1,312 @@
#!/usr/bin/python
# coding:utf8

'''
Created on Nov 28, 2010
Update on 2017-05-18
Adaboost is short for Adaptive Boosting
Author: Peter/片刻
GitHub: https://github.com/apachecn/AiLearning
'''
from __future__ import print_function
from numpy import *
def loadSimpData():
    """ toy dataset
    Returns:
        dataArr   feature dataset
        labelArr  class labels for the features
    """
    dataArr = array([[1., 2.1], [2., 1.1], [1.3, 1.], [1., 1.], [2., 1.]])
    labelArr = [1.0, 1.0, -1.0, -1.0, 1.0]
    return dataArr, labelArr
# general function to parse tab-delimited floats
def loadDataSet(fileName):
    # get number of fields
    numFeat = len(open(fileName).readline().split('\t'))
    dataArr = []
    labelArr = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr = []
        curLine = line.strip().split('\t')
        for i in range(numFeat-1):
            lineArr.append(float(curLine[i]))
        dataArr.append(lineArr)
        labelArr.append(float(curLine[-1]))
    return dataArr, labelArr
def stumpClassify(dataMat, dimen, threshVal, threshIneq):
    """stumpClassify(classify the dataset by thresholding one feature column)

    Args:
        dataMat     data matrix
        dimen       feature column index
        threshVal   threshold to compare the column against
        threshIneq  'lt' flips the side <= threshVal to -1, 'gt' flips the side > threshVal
    Returns:
        retArray    the resulting class assignments
    """
    # default everything to +1
    retArray = ones((shape(dataMat)[0], 1))
    # dataMat[:, dimen] is the whole column dimen
    # print('-----', threshIneq, dataMat[:, dimen], threshVal)
    if threshIneq == 'lt':
        retArray[dataMat[:, dimen] <= threshVal] = -1.0
    else:
        retArray[dataMat[:, dimen] > threshVal] = -1.0
    return retArray
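
# A quick sketch (not in the original file) of stumpClassify on a made-up
# one-feature matrix: with 'lt', every row whose feature is <= 2.0 gets -1.
# stumpClassify(mat([[1.], [2.], [3.]]), 0, 2.0, 'lt')  # -> [[-1.], [-1.], [1.]]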
def buildStump(dataArr, labelArr, D):
    """buildStump(find the best decision stump)

    Args:
        dataArr   feature dataset
        labelArr  class labels
        D         current sample weight vector
    Returns:
        bestStump    the best stump found (feature, threshold, inequality)
        minError     its weighted error rate
        bestClasEst  its class estimates on the training set
    """
    # convert the data
    dataMat = mat(dataArr)
    labelMat = mat(labelArr).T
    # m rows, n columns
    m, n = shape(dataMat)

    # initialization
    numSteps = 10.0
    bestStump = {}
    bestClasEst = mat(zeros((m, 1)))
    # start with an infinite minimum error
    minError = inf

    # Loop over every feature column, cut its range into numSteps segments and
    # try the left end of each segment as the split point.
    for i in range(n):
        rangeMin = dataMat[:, i].min()
        rangeMax = dataMat[:, i].max()
        # print('rangeMin=%s, rangeMax=%s' % (rangeMin, rangeMax))
        # step size of each segment
        stepSize = (rangeMax-rangeMin)/numSteps
        # j = -1 gives a threshold of rangeMin - stepSize (below all samples) and
        # j = numSteps gives rangeMax, so every cut point in between gets tried
        for j in range(-1, int(numSteps)+1):
            # go over less than and greater than
            for inequal in ['lt', 'gt']:
                threshVal = (rangeMin + float(j) * stepSize)
                # classify with this one-level stump to get the predicted classes
                predictedVals = stumpClassify(dataMat, i, threshVal, inequal)
                # print(predictedVals)
                errArr = mat(ones((m, 1)))
                # 0 where correct, 1 where wrong
                errArr[predictedVals == labelMat] = 0
                # Weighted error: sum of D over the misclassified samples. With five
                # samples of weight 0.2 each: all right -> 0.0, three wrong -> 0.6, all wrong -> 1.0.
                weightedError = D.T*errArr
                '''
                dim            the feature column
                threshVal      the stump's split value
                inequal        which side of the threshold gets flipped to -1
                weightedError  the overall weighted error rate
                bestClasEst    the best predictions so far
                '''
                # print("split: dim %d, thresh %.2f, thresh ineqal: %s, the weighted error is %.3f" % (i, threshVal, inequal, weightedError))
                if weightedError < minError:
                    minError = weightedError
                    bestClasEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal

    # bestStump records which column to split on, the comparison direction and the threshold
    return bestStump, minError, bestClasEst
def adaBoostTrainDS(dataArr, labelArr, numIt=40):
    """adaBoostTrainDS(AdaBoost training with decision stumps)

    Args:
        dataArr   feature dataset
        labelArr  class labels
        numIt     maximum number of boosting iterations (weak classifiers)
    Returns:
        weakClassArr  the ensemble of weak classifiers
        aggClassEst   the aggregated class estimates
    """
    weakClassArr = []
    m = shape(dataArr)[0]
    # initialize D, the sample weight vector, uniformly to 1/m
    D = mat(ones((m, 1))/m)
    aggClassEst = mat(zeros((m, 1)))
    for i in range(numIt):
        # fit the best stump under the current weights
        bestStump, error, classEst = buildStump(dataArr, labelArr, D)

        # alpha is this weak classifier's vote weight (the ensemble output is the weighted
        # sum of the votes); max(error, 1e-16) guards against division by zero
        alpha = float(0.5*log((1.0-error)/max(error, 1e-16)))
        bestStump['alpha'] = alpha
        # store Stump Params in Array
        weakClassArr.append(bestStump)

        # print("alpha=%s, classEst=%s, bestStump=%s, error=%s " % (alpha, classEst.T, bestStump, error))
        # correct prediction: label*estimate = +1, so the exponent becomes -alpha
        # wrong prediction:   label*estimate = -1, so the exponent becomes +alpha
        expon = multiply(-1*alpha*mat(labelArr).T, classEst)
        # print('labelArr=', labelArr)
        # print('classEst=', classEst.T)
        # print('product: ', multiply(mat(labelArr).T, classEst).T)
        # print('expon=', expon.T)
        # Reweight: D_i <- D_i * e^expon, then renormalize. The weights of
        # misclassified samples grow, those of correct ones shrink.
        D = multiply(D, exp(expon))
        D = D/D.sum()
        # print("D: ", D.T)

        # accumulate this round's weighted votes on top of the previous rounds
        # print('this round:', alpha*classEst.T)
        aggClassEst += alpha*classEst
        # print("aggregated aggClassEst: ", aggClassEst.T)
        # sign() maps positive to 1, zero to 0, negative to -1; compare against the
        # true labels: entries are 1 where the ensemble is (still) wrong, 0 where right
        aggErrors = multiply(sign(aggClassEst) != mat(labelArr).T, ones((m, 1)))
        errorRate = aggErrors.sum()/m
        # print("total error=%s " % (errorRate))
        if errorRate == 0.0:
            break
    return weakClassArr, aggClassEst
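
# A numeric check (not in the original file) of the vote-weight formula used
# above, alpha = 0.5*ln((1-error)/error): a stump with 20% weighted error gets
# alpha = 0.5*ln(0.8/0.2) = 0.5*ln(4) ~ 0.693, and alpha shrinks toward 0 as the
# error approaches 50% (a random guesser contributes nothing).
# 0.5 * log((1.0 - 0.2) / 0.2)   # ~0.693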
def adaClassify(datToClass, classifierArr):
    # aggregate the votes the same way the last aggClassEst in adaBoostTrainDS does
    dataMat = mat(datToClass)
    m = shape(dataMat)[0]
    aggClassEst = mat(zeros((m, 1)))

    # loop over the weak classifiers
    for i in range(len(classifierArr)):
        # With the trained stumps in hand, evaluate each one on the data and
        # accumulate alpha * its prediction into the weighted sum.
        classEst = stumpClassify(dataMat, classifierArr[i]['dim'], classifierArr[i]['thresh'], classifierArr[i]['ineq'])
        aggClassEst += classifierArr[i]['alpha']*classEst
        # print(aggClassEst)
    return sign(aggClassEst)
def plotROC(predStrengths, classLabels):
    """plotROC(plot the ROC curve and compute the area under it, AUC)

    Args:
        predStrengths  the final prediction strengths (weighted votes)
        classLabels    the true class labels
    """
    print('predStrengths=', predStrengths)
    print('classLabels=', classLabels)

    import matplotlib.pyplot as plt
    # variable to calculate AUC
    ySum = 0.0
    # number of positive samples
    numPosClas = sum(array(classLabels) == 1.0)
    # y step: one unit of true-positive rate per positive sample
    yStep = 1/float(numPosClas)
    # x step: one unit of false-positive rate per negative sample
    xStep = 1/float(len(classLabels)-numPosClas)
    # argsort returns the indices that sort the array ascending
    # get sorted index, it's reverse
    sortedIndicies = predStrengths.argsort()
    # check that the strengths are sorted ascending
    print('sortedIndicies=', sortedIndicies, predStrengths[0, 176], predStrengths.min(), predStrengths[0, 293], predStrengths.max())

    # set up the figure
    fig = plt.figure()
    fig.clf()
    ax = plt.subplot(111)
    # cursor position
    cur = (1.0, 1.0)
    # loop through all the values, drawing a line segment at each point
    for index in sortedIndicies.tolist()[0]:
        if classLabels[index] == 1.0:
            delX = 0
            delY = yStep
        else:
            delX = xStep
            delY = 0
            ySum += cur[1]
        # draw line from cur to (cur[0]-delX, cur[1]-delY)
        # draw and connect the points (x1, x2, y1, y2)
        print(cur[0], cur[0]-delX, cur[1], cur[1]-delY)
        ax.plot([cur[0], cur[0]-delX], [cur[1], cur[1]-delY], c='b')
        cur = (cur[0]-delX, cur[1]-delY)
    # dashed diagonal of a random classifier
    ax.plot([0, 1], [0, 1], 'b--')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve for AdaBoost horse colic detection system')
    # axis range of the plot (x1, x2, y1, y2)
    ax.axis([0, 1, 0, 1])
    plt.show()
    '''
    Reference: http://blog.csdn.net/wenyusuran/article/details/39056013
    To compute the AUC we accumulate the areas of many small rectangles.
    They all have width xStep, so we can sum their heights first and multiply
    by xStep once at the end. The running height sum (ySum) grows by the current
    height each time the curve moves one step along the x axis.
    '''
    print("the Area Under the Curve is: ", ySum*xStep)
if __name__ == "__main__":
|
||||
# # 我们要将5个点进行分类
|
||||
# dataArr, labelArr = loadSimpData()
|
||||
# print 'dataArr', dataArr, 'labelArr', labelArr
|
||||
|
||||
# # D表示最初值,对1进行均分为5份,平均每一个初始的概率都为0.2
|
||||
# # D的目的是为了计算错误概率: weightedError = D.T*errArr
|
||||
# D = mat(ones((5, 1))/5)
|
||||
# print 'D=', D.T
|
||||
|
||||
# # bestStump, minError, bestClasEst = buildStump(dataArr, labelArr, D)
|
||||
# # print 'bestStump=', bestStump
|
||||
# # print 'minError=', minError
|
||||
# # print 'bestClasEst=', bestClasEst.T
|
||||
|
||||
# # 分类器:weakClassArr
|
||||
# # 历史累计的分类结果集
|
||||
# weakClassArr, aggClassEst = adaBoostTrainDS(dataArr, labelArr, 9)
|
||||
# print '\nweakClassArr=', weakClassArr, '\naggClassEst=', aggClassEst.T
|
||||
|
||||
# """
|
||||
# 发现:
|
||||
# 分类的权重值:最大的值,为alpha的加和,最小值为-最大值
|
||||
# 特征的权重值:如果一个值误判的几率越小,那么D的特征权重越少
|
||||
# """
|
||||
|
||||
# # 测试数据的分类结果, 观测:aggClassEst分类的最终权重
|
||||
# print adaClassify([0, 0], weakClassArr).T
|
||||
# print adaClassify([[5, 5], [0, 0]], weakClassArr).T
|
||||
|
||||
# 马疝病数据集
|
||||
# 训练集合
|
||||
dataArr, labelArr = loadDataSet("data/7.AdaBoost/horseColicTraining2.txt")
|
||||
weakClassArr, aggClassEst = adaBoostTrainDS(dataArr, labelArr, 40)
|
||||
print(weakClassArr, '\n-----\n', aggClassEst.T)
|
||||
# 计算ROC下面的AUC的面积大小
|
||||
plotROC(aggClassEst.T, labelArr)
|
||||
# 测试集合
|
||||
dataArrTest, labelArrTest = loadDataSet("data/7.AdaBoost/horseColicTest2.txt")
|
||||
m = shape(dataArrTest)[0]
|
||||
predicting10 = adaClassify(dataArrTest, weakClassArr)
|
||||
errArr = mat(ones((m, 1)))
|
||||
# 测试:计算总样本数,错误样本数,错误率
|
||||
print(m, errArr[predicting10 != mat(labelArrTest).T].sum(), errArr[predicting10 != mat(labelArrTest).T].sum()/m)
|
||||
62
src/py2.x/ml/7.AdaBoost/sklearn-adaboost-demo.py
Normal file
@@ -0,0 +1,62 @@
#!/usr/bin/python
# coding:utf8
"""
Created on 2017-07-10
Updated on 2017-07-10
Author: 片刻/Noel Dawe
GitHub: https://github.com/apachecn/AiLearning
sklearn AdaBoost translation: http://cwiki.apachecn.org/pages/viewpage.action?pageId=10813457
"""
from __future__ import print_function

import matplotlib.pyplot as plt
# importing necessary libraries
import numpy as np
from sklearn import metrics
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

print(__doc__)


# Create the dataset
rng = np.random.RandomState(1)
X = np.linspace(0, 6, 100)[:, np.newaxis]
y = np.sin(X).ravel() + np.sin(6 * X).ravel() + rng.normal(0, 0.1, X.shape[0])
# dataArr, labelArr = loadDataSet("data/7.AdaBoost/horseColicTraining2.txt")


# Fit regression models: a single tree vs. a boosted ensemble of 300 trees
regr_1 = DecisionTreeRegressor(max_depth=4)
regr_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), n_estimators=300, random_state=rng)

regr_1.fit(X, y)
regr_2.fit(X, y)

# Predict
y_1 = regr_1.predict(X)
y_2 = regr_2.predict(X)

# Plot the results
plt.figure()
plt.scatter(X, y, c="k", label="training samples")
plt.plot(X, y_1, c="g", label="n_estimators=1", linewidth=2)
plt.plot(X, y_2, c="r", label="n_estimators=300", linewidth=2)
plt.xlabel("data")
plt.ylabel("target")
plt.title("Boosted Decision Tree Regression")
plt.legend()
plt.show()

print('y---', type(y[0]), len(y), y[:4])
print('y_1---', type(y_1[0]), len(y_1), y_1[:4])
print('y_2---', type(y_2[0]), len(y_2), y_2[:4])

# roc_auc_score applies to binary classification
y_true = np.array([0, 0, 1, 1])
y_scores = np.array([0.1, 0.4, 0.35, 0.8])
print('y_scores---', type(y_scores[0]), len(y_scores), y_scores)
print(metrics.roc_auc_score(y_true, y_scores))

# print("-" * 100)
# print(metrics.roc_auc_score(y[:1], y_2[:1]))
347
src/py2.x/ml/7.RandomForest/randomForest.py
Normal file
@@ -0,0 +1,347 @@
#!/usr/bin/python
# coding:utf8

'''
Created 2017-04-25
Update on 2017-05-18
Random Forest Algorithm on Sonar Dataset
Author: Flying_sfeng/片刻
GitHub: https://github.com/apachecn/AiLearning
---
Original source: http://www.tuicool.com/articles/iiUfeim
Flying_sfeng's blog: http://blog.csdn.net/flying_sfeng/article/details/64133822 (thanks to the author)
'''
from __future__ import print_function
from random import seed, randrange, random
# load the csv file
def loadDataSet(filename):
    dataset = []
    with open(filename, 'r') as fr:
        for line in fr.readlines():
            if not line:
                continue
            lineArr = []
            for featrue in line.split(','):
                # strip() removes the given characters from both ends of the string
                str_f = featrue.strip()

                # isdigit() is False for floats, so use isalpha() instead
                # if str_f.isdigit():  # check for a number
                if str_f.isalpha():  # a letter means this field is the label
                    # append the class label
                    lineArr.append(str_f)
                else:
                    # convert this column of the dataset to float
                    lineArr.append(float(str_f))
            dataset.append(lineArr)
    return dataset
def cross_validation_split(dataset, n_folds):
    """cross_validation_split(resample the dataset into n_folds folds; rows may be drawn
    repeatedly across folds, but each fold consists of separate draws)

    Args:
        dataset  the original dataset
        n_folds  number of folds to split the dataset into
    Returns:
        dataset_split  list of the n_folds resampled folds
    """
    dataset_split = list()
    dataset_copy = list(dataset)  # copy the dataset so the original is not modified
    fold_size = len(dataset) / n_folds
    for i in range(n_folds):
        fold = list()  # reset fold each iteration so folds are not duplicated into dataset_split
        while len(fold) < fold_size:  # 'while' keeps looping until the condition fails; 'if' would only check once
            # Sampling with replacement (bootstrap sampling): some rows appear several
            # times in a training set and some never do. This keeps the training sets
            # of the individual decision trees different from each other.
            index = randrange(len(dataset_copy))
            # pop() would remove and return the element at index (sampling without replacement)
            # fold.append(dataset_copy.pop(index))  # without replacement
            fold.append(dataset_copy[index])  # with replacement
        dataset_split.append(fold)
    # list of n_folds folds cut from dataset, used for cross validation
    return dataset_split
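
# A quick sketch (not in the original file) of what cross_validation_split
# returns for a made-up 6-row dataset and 3 folds: three folds of 2 rows each,
# drawn with replacement, so a row can show up in more than one fold.
# folds = cross_validation_split([[i] for i in range(6)], 3)
# e.g. folds == [[[4], [0]], [[4], [2]], [[5], [1]]]  (depends on the RNG state)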
# Split a dataset based on an attribute and an attribute value
def test_split(index, value, dataset):
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right
'''
Computing the Gini index: if the original dataset D is split into two parts D1 and D2, then
    Gini(D|split) = (|D1|/|D|) * Gini(D1) + (|D2|/|D|) * Gini(D2)
References:
    http://bbs.pinggu.org/thread-5986969-1-1.html
    http://www.cnblogs.com/pinard/p/6053344.html
The original article instead computed the unweighted sum:
    Gini(D|split) = Gini(D1) + Gini(D2)

# Calculate the Gini index for a split dataset
def gini_index(groups, class_values):  # intuitively the cost of the split: the purer the split, the smaller the gini
    gini = 0.0
    for class_value in class_values:  # class_values = [0, 1]
        for group in groups:  # groups = (left, right)
            size = len(group)
            if size == 0:
                continue
            proportion = [row[-1] for row in group].count(class_value) / float(size)
            gini += (proportion * (1.0 - proportion))
    return gini
'''


def gini_index(groups, class_values):  # intuitively the cost of the split: the purer the split, the smaller the gini
    gini = 0.0
    D = len(groups[0]) + len(groups[1])
    for class_value in class_values:  # class_values = [0, 1]
        for group in groups:  # groups = (left, right)
            size = len(group)
            if size == 0:
                continue
            proportion = [row[-1] for row in group].count(class_value) / float(size)
            gini += float(size)/D * (proportion * (1.0 - proportion))  # weight each group by its share of D
    return gini
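
# Two made-up splits (not in the original file) to sanity-check gini_index:
# a perfect split scores 0, a maximally mixed one scores 0.5.
# gini_index(([[0], [0]], [[1], [1]]), [0, 1])  # -> 0.0 (each group is pure)
# gini_index(([[0], [1]], [[0], [1]]), [0, 1])  # -> 0.5 (both groups are 50/50)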
# Find the best feature to split on: returns the feature index, the split value
# row[index], and the two resulting groups (left, right).
def get_split(dataset, n_features):
    class_values = list(set(row[-1] for row in dataset))  # class_values = [0, 1]
    b_index, b_value, b_score, b_groups = 999, 999, 999, None
    features = list()
    while len(features) < n_features:
        index = randrange(len(dataset[0])-1)  # collect n_features random feature indices (n_features is about the square root of the feature count)
        if index not in features:
            features.append(index)
    for index in features:  # only the n_features sampled features are considered, not all of them; this keeps the trees diverse
        for row in dataset:
            groups = test_split(index, row[index], dataset)  # try each row's value in column index as the split value and keep the best
            gini = gini_index(groups, class_values)
            # the more evenly a split mixes the classes, the larger the gini
            if gini < b_score:
                b_index, b_value, b_score, b_groups = index, row[index], gini, groups  # best feature b_index, split value b_value, groups b_groups; b_score is the misclassification cost
    # print(b_score)
    return {'index': b_index, 'value': b_value, 'groups': b_groups}
# Create a terminal node value: output the most common label in the group
def to_terminal(group):
    outcomes = [row[-1] for row in group]  # max() with a key function uses that function as the comparison criterion
    return max(set(outcomes), key=outcomes.count)  # the most frequent label in the group
# Create child splits for a node or make it terminal: recurse until classification ends
def split(node, max_depth, min_size, n_features, depth):  # e.g. max_depth=10, min_size=1, n_features=int(sqrt(len(dataset[0])-1))
    left, right = node['groups']
    del(node['groups'])
    # check for a no split
    if not left or not right:
        node['left'] = node['right'] = to_terminal(left + right)
        return
    # check for max depth
    if depth >= max_depth:  # at depth max_depth stop early and use the majority label; ending early limits overfitting
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return
    # process left child
    if len(left) <= min_size:
        node['left'] = to_terminal(left)
    else:
        node['left'] = get_split(left, n_features)  # node['left'] is a dict {'index': b_index, 'value': b_value, 'groups': b_groups}, so node is a nested dict
        split(node['left'], max_depth, min_size, n_features, depth+1)  # recurse; depth+1 tracks the recursion depth
    # process right child
    if len(right) <= min_size:
        node['right'] = to_terminal(right)
    else:
        node['right'] = get_split(right, n_features)
        split(node['right'], max_depth, min_size, n_features, depth+1)
# Build a decision tree
def build_tree(train, max_depth, min_size, n_features):
    """build_tree(build one decision tree)

    Args:
        train       training dataset
        max_depth   maximum tree depth; too deep overfits easily
        min_size    minimum size of a leaf node
        n_features  number of features to sample at each split
    Returns:
        root  the decision tree
    """

    # find the best first split and the related information
    root = get_split(train, n_features)

    # Recurse on the left and right groups. Once the best feature value has been
    # used for a split, reusing it further down adds nothing: e.g. once gender
    # has split off the male branch, gender carries no information inside it.
    split(root, max_depth, min_size, n_features, 1)
    return root
# Make a prediction with a decision tree
def predict(node, row):  # predict the class of one row
    if row[node['index']] < node['value']:
        if isinstance(node['left'], dict):  # isinstance is a Python builtin that checks whether an object has a given type
            return predict(node['left'], row)
        else:
            return node['left']
    else:
        if isinstance(node['right'], dict):
            return predict(node['right'], row)
        else:
            return node['right']
# Make a prediction with a list of bagged trees
def bagging_predict(trees, row):
    """bagging_predict(bagging prediction)

    Args:
        trees  the list of decision trees
        row    one row of the test dataset
    Returns:
        the most common prediction among the forest's trees
    """

    # let every tree predict the row, then take a simple majority vote
    predictions = [predict(tree, row) for tree in trees]
    return max(set(predictions), key=predictions.count)
# Create a random subsample from the dataset with replacement
def subsample(dataset, ratio):
    """subsample(draw a random training subsample)

    Args:
        dataset  training dataset
        ratio    fraction of the dataset to sample
    Returns:
        sample   the random training subsample
    """

    sample = list()
    # sample size proportional to the dataset; round() rounds to the nearest integer
    n_sample = round(len(dataset) * ratio)
    while len(sample) < n_sample:
        # Sampling with replacement (bootstrap sampling): some rows recur, some never
        # appear, which keeps each tree's training set different.
        index = randrange(len(dataset))
        sample.append(dataset[index])
    return sample
# Random Forest Algorithm
def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features):
    """random_forest(train a forest and predict the test set)

    Args:
        train        training dataset
        test         test dataset
        max_depth    maximum tree depth; too deep overfits easily
        min_size     minimum size of a leaf node
        sample_size  fraction of the training set sampled per tree
        n_trees      number of trees
        n_features   number of features to sample at each split
    Returns:
        predictions  the bagged prediction for every test row
    """

    trees = list()
    # n_trees is the number of trees in the forest
    for i in range(n_trees):
        # random training subsample; the randomness keeps the trees' training sets different
        sample = subsample(train, sample_size)
        # build one decision tree
        tree = build_tree(sample, max_depth, min_size, n_features)
        trees.append(tree)

    # bagged prediction for every row of the test set
    predictions = [bagging_predict(trees, row) for row in test]
    return predictions
# Calculate accuracy percentage
def accuracy_metric(actual, predicted):  # compare actual and predicted values to compute accuracy
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0
# Evaluate the algorithm's performance and return the model scores
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    """evaluate_algorithm(evaluate the algorithm's performance; returns the scores)

    Args:
        dataset    the original dataset
        algorithm  the algorithm to evaluate
        n_folds    number of folds
        *args      further arguments passed on to the algorithm
    Returns:
        scores     the model scores
    """

    # resample the dataset into n_folds folds (rows may repeat across folds)
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    # Each iteration holds one fold out as the test set and trains on the rest;
    # looping over all folds gives cross validation.
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        # flatten the list of folds into one train_set list, like a union all
        """
        In [20]: l1=[[1, 2, 'a'], [11, 22, 'b']]
        In [21]: l2=[[3, 4, 'c'], [33, 44, 'd']]
        In [22]: l=[]
        In [23]: l.append(l1)
        In [24]: l.append(l2)
        In [25]: l
        Out[25]: [[[1, 2, 'a'], [11, 22, 'b']], [[3, 4, 'c'], [33, 44, 'd']]]
        In [26]: sum(l, [])
        Out[26]: [[1, 2, 'a'], [11, 22, 'b'], [3, 4, 'c'], [33, 44, 'd']]
        """
        train_set = sum(train_set, [])
        test_set = list()
        # fold is the test set held out from the original dataset; hide its labels
        for row in fold:
            row_copy = list(row)
            row_copy[-1] = None
            test_set.append(row_copy)
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]

        # accuracy of the random forest's predictions
        accuracy = accuracy_metric(actual, predicted)
        scores.append(accuracy)
    return scores
if __name__ == '__main__':

    # load the data
    dataset = loadDataSet('data/7.RandomForest/sonar-all-data.txt')
    # print(dataset)

    n_folds = 5        # split into 5 folds for cross validation
    max_depth = 20     # tune yourself; deep trees overfit easily
    min_size = 1       # minimum number of rows in a leaf node
    sample_size = 1.0  # fraction of the training set sampled per tree
    # n_features = int(len(dataset[0]) - 1)
    n_features = 15    # tune yourself; trades accuracy against diversity
    for n_trees in [1, 10, 20, 30, 40, 50]:  # in theory, the more trees the better
        scores = evaluate_algorithm(dataset, random_forest, n_folds, max_depth, min_size, sample_size, n_trees, n_features)
        # seed the RNG so each run of this file produces the same random numbers
        seed(1)
        print('random=', random())
        print('Trees: %d' % n_trees)
        print('Scores: %s' % scores)
        print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))
606
src/py2.x/ml/8.Regression/regression.py
Normal file
@@ -0,0 +1,606 @@
#!/usr/bin/python
# coding:utf8
'''
Created on Jan 8, 2011
Update on 2017-05-18
Author: Peter Harrington/小瑶
GitHub: https://github.com/apachecn/AiLearning
'''
from __future__ import print_function

from numpy import *
import matplotlib.pylab as plt
def loadDataSet(fileName):
    """ load the data
    Parse tab-separated floats from a file.
    Returns:
        dataMat  : feature dataset
        labelMat : class labels, i.e. the target variable
    """
    # number of features, not counting the final target variable
    numFeat = len(open(fileName).readline().split('\t')) - 1
    dataMat = []
    labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        # read one line
        lineArr = []
        # strip the surrounding whitespace from the tab-separated fields
        curLine = line.strip().split('\t')
        # i runs over the numFeat feature columns
        for i in range(numFeat):
            # collect this row's feature values into a row vector
            lineArr.append(float(curLine[i]))
        # store the row's input features in dataMat
        dataMat.append(lineArr)
        # store the row's last field, the target variable, in labelMat
        labelMat.append(float(curLine[-1]))
    return dataMat, labelMat
def standRegres(xArr, yArr):
    '''
    Description:
        ordinary linear regression
    Args:
        xArr: input samples, one feature row per sample
        yArr: the target variable of each sample
    Returns:
        ws: the regression coefficients
    '''

    # mat() converts xArr/yArr to matrices; mat().T transposes the matrix
    xMat = mat(xArr)
    yMat = mat(yArr).T
    # matrix multiplication requires the left matrix's column count to equal the right's row count
    xTx = xMat.T * xMat
    # We need the inverse of xTx, so first check that it is invertible: its determinant must be non-zero.
    # linalg.det() computes the determinant; a zero determinant means the matrix has no inverse,
    # and the rest of the computation cannot proceed.
    if linalg.det(xTx) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    # ordinary least squares
    # http://cwiki.apachecn.org/pages/viewpage.action?pageId=5505133
    # the book's normal equation for the optimal w
    ws = xTx.I * (xMat.T * yMat)
    return ws
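
# A hedged usage sketch (not in the original file) with synthetic data: the
# first column of ones absorbs the intercept, so standRegres should recover
# ws ~ [[3.0], [2.0]] for y = 3 + 2x.
# xArr = [[1.0, float(i)] for i in range(10)]
# yArr = [3.0 + 2.0 * x[1] for x in xArr]
# print(standRegres(xArr, yArr).T)   # roughly [[3. 2.]]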
# locally weighted linear regression
def lwlr(testPoint, xArr, yArr, k=1.0):
    '''
    Description:
        Locally weighted linear regression: give each sample a weight that depends on its
        distance to the query point, then run ordinary least squares on the weighted samples.
    Args:
        testPoint: the query point
        xArr: the samples' feature data
        yArr: each sample's target variable
        k: kernel parameter controlling how fast the weights decay with distance
    Returns:
        testPoint * ws: the prediction at the query point
    Notes:
        The weight formula is w = exp(|x^(i) - x|^2 / (-2*k^2)).
        Intuition: x is the query point and x^(i) a sample; the closer a sample lies to the
        query point, the larger its weight. Here the query points are taken from the samples
        themselves. k is a bandwidth parameter: it controls the width of the bell-shaped
        weighting function, much like the standard deviation of a Gaussian.
        Algorithm: for a query point (say the i-th of m samples), loop over all m samples
        (including the i-th), compute each one's distance to the query point and hence its
        weight; w is thus a vector of m weights, written as a diagonal matrix.
    '''
    # mat() converts the arrays to matrices; mat().T transposes
    xMat = mat(xArr)
    yMat = mat(yArr).T
    # number of rows of xMat
    m = shape(xMat)[0]
    # eye() builds an identity matrix: the weight matrix starts with weight 1 per sample
    weights = mat(eye((m)))
    for j in range(m):
        # testPoint is a row vector
        # the distance between testPoint and sample j determines the sample's weight below
        diffMat = testPoint - xMat[j, :]
        # k controls the decay speed
        weights[j, j] = exp(diffMat * diffMat.T / (-2.0 * k**2))
    # weighted normal equation; weights holds the per-sample weights
    xTx = xMat.T * (weights * xMat)
    if linalg.det(xTx) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    # estimate of the regression coefficients
    ws = xTx.I * (xMat.T * (weights * yMat))
    return testPoint * ws


def lwlrTest(testArr, xArr, yArr, k=1.0):
    '''
    Description:
        Test locally weighted linear regression: call lwlr() on every point of the dataset.
    Args:
        testArr: all the query points
        xArr: the samples' feature data
        yArr: each sample's target variable
        k: controls how fast the kernel weights decay
    Returns:
        yHat: the predictions at the query points
    '''
    # number of query points
    m = shape(testArr)[0]
    # a length-m vector of zeros
    yHat = zeros(m)
    # run lwlr on every data point
    for i in range(m):
        yHat[i] = lwlr(testArr[i], xArr, yArr, k)
    # return the estimates
    return yHat
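
# A hedged usage sketch (not in the original file; the data path is assumed to
# follow the repo's data layout): with a large k every sample gets nearly the
# same weight and lwlr degenerates to ordinary least squares; with a small k
# only nearby samples matter, so the fit follows local structure and, taken too
# far, overfits.
# xArr, yArr = loadDataSet('data/8.Regression/ex0.txt')
# yHat_smooth = lwlrTest(xArr, xArr, yArr, k=1.0)    # close to standRegres
# yHat_wiggly = lwlrTest(xArr, xArr, yArr, k=0.01)   # hugs the local data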
def lwlrTestPlot(xArr, yArr, k=1.0):
    '''
    Description:
        Same as lwlrTest but sorts X first, which makes plotting easier.
    Args:
        xArr: the samples' feature data
        yArr: each sample's target variable (the actual values)
        k: kernel decay parameter, defaulting to the constant 1
    Return:
        yHat: the estimates at the sample points
        xCopy: a sorted copy of xArr
    '''
    # a zero vector the size of the target variable
    yHat = zeros(shape(yArr))
    # convert xArr to a matrix
    xCopy = mat(xArr)
    # sort it
    xCopy.sort(0)
    # run locally weighted regression at each sample point to estimate the target
    for i in range(shape(xArr)[0]):
        yHat[i] = lwlr(xCopy[i], xArr, yArr, k)
    return yHat, xCopy
def rssError(yArr, yHatArr):
|
||||
'''
|
||||
Desc:
|
||||
计算分析预测误差的大小
|
||||
Args:
|
||||
yArr:真实的目标变量
|
||||
yHatArr:预测得到的估计值
|
||||
Returns:
|
||||
计算真实值和估计值得到的值的平方和作为最后的返回值
|
||||
'''
|
||||
return ((yArr - yHatArr)**2).sum()
|
||||
|
||||
|
||||
def ridgeRegres(xMat, yMat, lam=0.2):
|
||||
'''
|
||||
Desc:
|
||||
这个函数实现了给定 lambda 下的岭回归求解。
|
||||
如果数据的特征比样本点还多,就不能再使用上面介绍的的线性回归和局部线性回归了,因为计算 (xTx)^(-1)会出现错误。
|
||||
如果特征比样本点还多(n > m),也就是说,输入数据的矩阵x不是满秩矩阵。非满秩矩阵在求逆时会出现问题。
|
||||
为了解决这个问题,我们下边讲一下:岭回归,这是我们要讲的第一种缩减方法。
|
||||
Args:
|
||||
xMat:样本的特征数据,即 feature
|
||||
yMat:每个样本对应的类别标签,即目标变量,实际值
|
||||
lam:引入的一个λ值,使得矩阵非奇异
|
||||
Returns:
|
||||
经过岭回归公式计算得到的回归系数
|
||||
'''
|
||||
|
||||
xTx = xMat.T * xMat
|
||||
# 岭回归就是在矩阵 xTx 上加一个 λI 从而使得矩阵非奇异,进而能对 xTx + λI 求逆
|
||||
denom = xTx + eye(shape(xMat)[1]) * lam
|
||||
# 检查行列式是否为零,即矩阵是否可逆,行列式为0的话就不可逆,不为0的话就是可逆。
|
||||
if linalg.det(denom) == 0.0:
|
||||
print("This matrix is singular, cannot do inverse")
|
||||
return
|
||||
ws = denom.I * (xMat.T * yMat)
|
||||
return ws
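

# --- Illustrative sketch (not part of the original file) ---------------------
# As lambda grows, the ridge penalty shrinks the coefficients towards zero;
# the toy matrices below are made up just to show the effect.
def _demo_ridge_shrinkage():
    xMat = mat([[1.0, 2.0], [2.0, 3.0], [3.0, 5.0], [4.0, 6.0]])
    yMat = mat([[1.0], [2.0], [3.0], [4.0]])
    for lam in (0.0, 0.1, 1.0, 10.0):
        ws = ridgeRegres(xMat, yMat, lam)
        print('lambda=%-5s ws=%s' % (lam, ws.T))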


def ridgeTest(xArr, yArr):
    '''
    Desc:
        ridgeTest() runs ridgeRegres() over a range of lambda values
    Args:
        xArr: sample features
        yArr: target values (class labels), the ground truth
    Returns:
        wMat: all the regression coefficient vectors, stacked in one matrix
    '''
    xMat = mat(xArr)
    yMat = mat(yArr).T
    # the mean of Y
    yMean = mean(yMat, 0)
    # centre Y by subtracting its mean
    yMat = yMat - yMean
    # standardise x: first compute the column means of xMat
    xMeans = mean(xMat, 0)
    # then the column variances of X
    xVar = var(xMat, 0)
    # subtract the mean from every feature and divide by the variance
    xMat = (xMat - xMeans) / xVar
    # call ridgeRegres() under 30 different lambdas
    numTestPts = 30
    # a 30 x n matrix of zeros (one coefficient vector per lambda)
    wMat = zeros((numTestPts, shape(xMat)[1]))
    for i in range(numTestPts):
        # exp() returns e^x, so the lambdas vary on an exponential scale
        ws = ridgeRegres(xMat, yMat, exp(i - 10))
        wMat[i, :] = ws.T
    return wMat
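

# --- Illustrative sketch (not part of the original file) ---------------------
# The grid above is exp(-10), exp(-9), ..., exp(19): lambda spans roughly
# 4.5e-5 to 1.8e8, so the 30 rows of wMat trace the coefficients from a
# nearly-unregularised OLS solution all the way down towards zero.
def _demo_ridge_lambda_grid():
    lambdas = [exp(i - 10) for i in range(30)]
    print('smallest lambda: %g, largest lambda: %g' % (lambdas[0], lambdas[-1]))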


def regularize(xMat):  # standardise, column by column
    inMat = xMat.copy()
    inMeans = mean(inMat, 0)   # compute the column means, then subtract them
    inVar = var(inMat, 0)      # compute the column variances, then divide by them
    inMat = (inMat - inMeans) / inVar
    return inMat


# forward stagewise regression
def stageWise(xArr, yArr, eps=0.01, numIt=100):
    xMat = mat(xArr)
    yMat = mat(yArr).T
    yMean = mean(yMat, 0)
    yMat = yMat - yMean  # the ys could be standardised too, which gives smaller coefs
    xMat = regularize(xMat)
    m, n = shape(xMat)
    # returnMat = zeros((numIt, n))  # test code, removed
    ws = zeros((n, 1))
    wsTest = ws.copy()
    wsMax = ws.copy()
    for i in range(numIt):
        print(ws.T)
        lowestError = inf
        for j in range(n):
            for sign in [-1, 1]:
                wsTest = ws.copy()
                wsTest[j] += eps * sign
                yTest = xMat * wsTest
                rssE = rssError(yMat.A, yTest.A)
                if rssE < lowestError:
                    lowestError = rssE
                    wsMax = wsTest
        ws = wsMax.copy()
        # returnMat[i, :] = ws.T
    # return returnMat
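

# --- Illustrative note (not part of the original file) -----------------------
# stageWise() greedily nudges one coefficient by +/-eps per iteration and
# keeps whatever lowers the residual sum of squares; with a small eps and
# many iterations it approximates the lasso path. As written above it only
# prints ws each iteration; a caller wanting the full coefficient history
# could re-enable the commented-out returnMat lines, e.g.:
#
#     xArr, yArr = loadDataSet('data/8.Regression/abalone.txt')
#     stageWise(xArr, yArr, eps=0.005, numIt=1000)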


# def scrapePage(inFile,outFile,yr,numPce,origPrc):
#     from BeautifulSoup import BeautifulSoup
#     fr = open(inFile); fw=open(outFile,'a') #a is append mode writing
#     soup = BeautifulSoup(fr.read())
#     i=1
#     currentRow = soup.findAll('table', r="%d" % i)
#     while(len(currentRow)!=0):
#         title = currentRow[0].findAll('a')[1].text
#         lwrTitle = title.lower()
#         if (lwrTitle.find('new') > -1) or (lwrTitle.find('nisb') > -1):
#             newFlag = 1.0
#         else:
#             newFlag = 0.0
#         soldUnicde = currentRow[0].findAll('td')[3].findAll('span')
#         if len(soldUnicde)==0:
#             print "item #%d did not sell" % i
#         else:
#             soldPrice = currentRow[0].findAll('td')[4]
#             priceStr = soldPrice.text
#             priceStr = priceStr.replace('$','') #strips out $
#             priceStr = priceStr.replace(',','') #strips out ,
#             if len(soldPrice)>1:
#                 priceStr = priceStr.replace('Free shipping', '') #strips out Free Shipping
#             print "%s\t%d\t%s" % (priceStr,newFlag,title)
#             fw.write("%d\t%d\t%d\t%f\t%s\n" % (yr,numPce,newFlag,origPrc,priceStr))
#         i += 1
#         currentRow = soup.findAll('table', r="%d" % i)
#     fw.close()

# --------------------------------------------------------------
# Predicting LEGO set prices ------ the original version. The Google API has
# since changed, so this code can no longer fetch data; it was replaced by
# the version further below, which needs the third-party scraping library
# BeautifulSoup (installation is simple, see below).


'''
from time import sleep
import json
import urllib2
def searchForSet(retX, retY, setNum, yr, numPce, origPrc):
    sleep(10)
    myAPIstr = 'AIzaSyD2cR2KFyx12hXu6PFU-wrWot3NXvko8vY'
    searchURL = 'https://www.googleapis.com/shopping/search/v1/public/products?key=%s&country=US&q=lego+%d&alt=json' % (myAPIstr, setNum)
    pg = urllib2.urlopen(searchURL)
    retDict = json.loads(pg.read())
    for i in range(len(retDict['items'])):
        try:
            currItem = retDict['items'][i]
            if currItem['product']['condition'] == 'new':
                newFlag = 1
            else: newFlag = 0
            listOfInv = currItem['product']['inventories']
            for item in listOfInv:
                sellingPrice = item['price']
                if sellingPrice > origPrc * 0.5:
                    print ("%d\t%d\t%d\t%f\t%f" % (yr,numPce,newFlag,origPrc, sellingPrice))
                    retX.append([yr, numPce, newFlag, origPrc])
                    retY.append(sellingPrice)
        except: print ('problem with item %d' % i)

def setDataCollect(retX, retY):
    searchForSet(retX, retY, 8288, 2006, 800, 49.99)
    searchForSet(retX, retY, 10030, 2002, 3096, 269.99)
    searchForSet(retX, retY, 10179, 2007, 5195, 499.99)
    searchForSet(retX, retY, 10181, 2007, 3428, 199.99)
    searchForSet(retX, retY, 10189, 2008, 5922, 299.99)
    searchForSet(retX, retY, 10196, 2009, 3263, 249.99)

def crossValidation(xArr,yArr,numVal=10):
    m = len(yArr)
    indexList = range(m)
    errorMat = zeros((numVal,30))  # create error mat: numVal rows, 30 columns
    for i in range(numVal):
        trainX=[]; trainY=[]
        testX = []; testY = []
        random.shuffle(indexList)
        for j in range(m):  # create training set based on first 90% of values in indexList
            if j < m*0.9:
                trainX.append(xArr[indexList[j]])
                trainY.append(yArr[indexList[j]])
            else:
                testX.append(xArr[indexList[j]])
                testY.append(yArr[indexList[j]])
        wMat = ridgeTest(trainX,trainY)  # get 30 weight vectors from ridge
        for k in range(30):  # loop over all of the ridge estimates
            matTestX = mat(testX); matTrainX=mat(trainX)
            meanTrain = mean(matTrainX,0)
            varTrain = var(matTrainX,0)
            matTestX = (matTestX-meanTrain)/varTrain  # regularize test with training params
            yEst = matTestX * mat(wMat[k,:]).T + mean(trainY)  # test ridge results and store
            errorMat[i,k]=rssError(yEst.T.A,array(testY))
            # print errorMat[i,k]
    meanErrors = mean(errorMat,0)  # calc avg performance of the different ridge weight vectors
    minMean = float(min(meanErrors))
    bestWeights = wMat[nonzero(meanErrors==minMean)]
    # can unregularize to get model
    # when we regularized we wrote Xreg = (x-meanX)/var(x)
    # we can now write in terms of x not Xreg: x*w/var(x) - meanX/var(x) +meanY
    xMat = mat(xArr); yMat=mat(yArr).T
    meanX = mean(xMat,0); varX = var(xMat,0)
    unReg = bestWeights/varX
    print ("the best model from Ridge Regression is:\n",unReg)
    print ("with constant term: ",-1*sum(multiply(meanX,unReg)) + mean(yMat))
'''

# ----------------------------------------------------------------------------
# Predicting LEGO set prices -- runnable version. The LEGO pages are stored
# under our input folder and scraped with BeautifulSoup.
# Prerequisite: install the third-party library BeautifulSoup, as follows:
# download beautifulsoup4-4.4.1.tar.gz from
# https://www.crummy.com/software/BeautifulSoup/bs4/download/4.4/
# unpack it, open a (Windows) cmd prompt inside the unpacked folder, and run
# the following two commands to finish the installation:
# python setup.py build
# python setup.py install
'''
from numpy import *
from bs4 import BeautifulSoup

# read the data from a page and fill the retX and retY lists
def scrapePage(retX, retY, inFile, yr, numPce, origPrc):

    # open and read the HTML file
    fr = open(inFile)
    soup = BeautifulSoup(fr.read())
    i=1

    # parse according to the structure of the HTML page
    currentRow = soup.findAll('table', r="%d" % i)
    while(len(currentRow)!=0):
        currentRow = soup.findAll('table', r="%d" % i)
        title = currentRow[0].findAll('a')[1].text
        lwrTitle = title.lower()

        # check whether the item is flagged as brand new
        if (lwrTitle.find('new') > -1) or (lwrTitle.find('nisb') > -1):
            newFlag = 1.0
        else:
            newFlag = 0.0

        # check whether the item is marked as sold; we only collect sold items
        soldUnicde = currentRow[0].findAll('td')[3].findAll('span')
        if len(soldUnicde)==0:
            print "item #%d did not sell" % i
        else:
            # parse the page to get the current price
            soldPrice = currentRow[0].findAll('td')[4]
            priceStr = soldPrice.text
            priceStr = priceStr.replace('$','') #strips out $
            priceStr = priceStr.replace(',','') #strips out ,
            if len(soldPrice)>1:
                priceStr = priceStr.replace('Free shipping', '')
            sellingPrice = float(priceStr)

            # drop incomplete sets (price below half the original price)
            if sellingPrice > origPrc * 0.5:
                print "%d\t%d\t%d\t%f\t%f" % (yr,numPce,newFlag,origPrc, sellingPrice)
                retX.append([yr, numPce, newFlag, origPrc])
                retY.append(sellingPrice)
        i += 1
        currentRow = soup.findAll('table', r="%d" % i)

# read the data for all six LEGO sets and build the data matrix
def setDataCollect(retX, retY):
    scrapePage(retX, retY, 'data/8.Regression/setHtml/lego8288.html', 2006, 800, 49.99)
    scrapePage(retX, retY, 'data/8.Regression/setHtml/lego10030.html', 2002, 3096, 269.99)
    scrapePage(retX, retY, 'data/8.Regression/setHtml/lego10179.html', 2007, 5195, 499.99)
    scrapePage(retX, retY, 'data/8.Regression/setHtml/lego10181.html', 2007, 3428, 199.99)
    scrapePage(retX, retY, 'data/8.Regression/setHtml/lego10189.html', 2008, 5922, 299.99)
    scrapePage(retX, retY, 'data/8.Regression/setHtml/lego10196.html', 2009, 3263, 249.99)


# cross-validate ridge regression
def crossValidation(xArr,yArr,numVal=10):
    # the number of data points; xArr and yArr have the same length
    m = len(yArr)
    indexList = range(m)
    errorMat = zeros((numVal,30))

    # main cross-validation loop
    for i in range(numVal):
        # randomly split the data into a training set (90%) and a test set (10%)
        trainX=[]; trainY=[]
        testX = []; testY = []

        # shuffle the indices
        random.shuffle(indexList)

        # split into training and test sets
        for j in range(m):
            if j < m*0.9:
                trainX.append(xArr[indexList[j]])
                trainY.append(yArr[indexList[j]])
            else:
                testX.append(xArr[indexList[j]])
                testY.append(yArr[indexList[j]])

        # get the matrix of regression coefficients
        wMat = ridgeTest(trainX,trainY)

        # loop over the 30 sets of ridge coefficients in the matrix
        for k in range(30):
            # read the training set and the test set
            matTestX = mat(testX); matTrainX=mat(trainX)
            # standardise the test data with the training parameters
            meanTrain = mean(matTrainX,0)
            varTrain = var(matTrainX,0)
            matTestX = (matTestX-meanTrain)/varTrain

            # evaluate the ridge fit and store the result
            yEst = matTestX * mat(wMat[k,:]).T + mean(trainY)

            # compute the error
            errorMat[i,k] = ((yEst.T.A-array(testY))**2).sum()

    # average error of each of the 30 ridge weight vectors
    meanErrors = mean(errorMat,0)
    minMean = float(min(meanErrors))
    bestWeights = wMat[nonzero(meanErrors==minMean)]

    # the data was standardised, so un-standardise to report the model in
    # terms of the raw inputs
    xMat = mat(xArr); yMat=mat(yArr).T
    meanX = mean(xMat,0); varX = var(xMat,0)
    unReg = bestWeights/varX

    # print the resulting model
    print "the best model from Ridge Regression is:\n",unReg
    print "with constant term: ",-1*sum(multiply(meanX,unReg)) + mean(yMat)
'''


# test for standRegression
def regression1():
    xArr, yArr = loadDataSet("data/8.Regression/data.txt")
    xMat = mat(xArr)
    yMat = mat(yArr)
    ws = standRegres(xArr, yArr)
    fig = plt.figure()
    # add_subplot(349) would split the canvas into 3 rows and 4 columns and
    # draw in the 9th cell (left to right, top to bottom); 111 uses the whole canvas
    ax = fig.add_subplot(111)
    # scatter: x is the second column of xMat, y is the first column of yMat
    ax.scatter(
        [xMat[:, 1].flatten()],
        [yMat.T[:, 0].flatten().A[0]])
    xCopy = xMat.copy()
    xCopy.sort(0)
    yHat = xCopy * ws
    ax.plot(xCopy[:, 1], yHat)
    plt.show()


# test for LWLR
def regression2():
    xArr, yArr = loadDataSet("data/8.Regression/data.txt")
    yHat = lwlrTest(xArr, xArr, yArr, 0.003)
    xMat = mat(xArr)
    # argsort() sorts the elements of x ascending and returns their indices
    srtInd = xMat[:, 1].argsort(0)
    xSort = xMat[srtInd][:, 0, :]
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(xSort[:, 1], yHat[srtInd])
    ax.scatter(
        [xMat[:, 1].flatten().A[0]], [mat(yArr).T.flatten().A[0]],
        s=2,
        c='red')
    plt.show()


# test on the abalone data set
def abaloneTest():
    '''
    Desc:
        predict the age of an abalone
    Args:
        None
    Returns:
        None
    '''
    # load the data
    abX, abY = loadDataSet("data/8.Regression/abalone.txt")
    # predict with different kernel bandwidths
    oldyHat01 = lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
    oldyHat1 = lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1)
    oldyHat10 = lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)
    # print the error of each kernel against the true values of the training set
    print(("old yHat01 error Size is :", rssError(abY[0:99], oldyHat01.T)))
    print(("old yHat1 error Size is :", rssError(abY[0:99], oldyHat1.T)))
    print(("old yHat10 error Size is :", rssError(abY[0:99], oldyHat10.T)))

    # print the error of each kernel against the true values of new (test) data
    newyHat01 = lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
    print(("new yHat01 error Size is :", rssError(abY[100:199], newyHat01.T)))
    newyHat1 = lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1)
    print(("new yHat1 error Size is :", rssError(abY[100:199], newyHat1.T)))
    newyHat10 = lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10)
    print(("new yHat10 error Size is :", rssError(abY[100:199], newyHat10.T)))

    # predict with plain linear regression and compare with the results above
    standWs = standRegres(abX[0:99], abY[0:99])
    standyHat = mat(abX[100:199]) * standWs
    print(("standRegress error Size is:", rssError(abY[100:199], standyHat.T.A)))


# test for ridgeRegression
def regression3():
    abX, abY = loadDataSet("data/8.Regression/abalone.txt")
    ridgeWeights = ridgeTest(abX, abY)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(ridgeWeights)
    plt.show()


# test for stageWise
def regression4():
    xArr, yArr = loadDataSet("data/8.Regression/abalone.txt")
    stageWise(xArr, yArr, 0.01, 200)
    xMat = mat(xArr)
    yMat = mat(yArr).T
    xMat = regularize(xMat)
    yM = mean(yMat, 0)
    yMat = yMat - yM
    weights = standRegres(xMat, yMat.T)
    print(weights.T)


# predict LEGO set prices
def regression5():
    lgX = []
    lgY = []

    setDataCollect(lgX, lgY)
    crossValidation(lgX, lgY, 10)


if __name__ == "__main__":
    regression1()
    # regression2()
    # abaloneTest()
    # regression3()
    # regression4()
    # regression5()
192
src/py2.x/ml/8.Regression/sklearn-regression-demo.py
Normal file
@@ -0,0 +1,192 @@
#!/usr/bin/python
# coding:utf8

'''
Created on Jan 8, 2011
Update on 2017-05-18
Author: Peter Harrington/小瑶
GitHub: https://github.com/apachecn/AiLearning
'''
from __future__ import print_function


# Isotonic Regression
print(__doc__)

# Author: Nelle Varoquaux <nelle.varoquaux@gmail.com>
#         Alexandre Gramfort <alexandre.gramfort@inria.fr>
# License: BSD

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection

from sklearn.linear_model import LinearRegression
from sklearn.isotonic import IsotonicRegression
from sklearn.utils import check_random_state

n = 100
x = np.arange(n)
rs = check_random_state(0)
y = rs.randint(-50, 50, size=(n,)) + 50. * np.log(1 + np.arange(n))

ir = IsotonicRegression()

y_ = ir.fit_transform(x, y)

lr = LinearRegression()
lr.fit(x[:, np.newaxis], y)  # x needs to be 2d for LinearRegression

segments = [[[i, y[i]], [i, y_[i]]] for i in range(n)]
lc = LineCollection(segments, zorder=0)
lc.set_array(np.ones(len(y)))
lc.set_linewidths(0.5 * np.ones(n))

fig = plt.figure()
plt.plot(x, y, 'r.', markersize=12)
plt.plot(x, y_, 'g.-', markersize=12)
plt.plot(x, lr.predict(x[:, np.newaxis]), 'b-')
plt.gca().add_collection(lc)
plt.legend(('Data', 'Isotonic Fit', 'Linear Fit'), loc='lower right')
plt.title('Isotonic regression')
plt.show()

# Kernel ridge regression

# 2.1 Comparison of kernel ridge regression and SVR

# Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
# License: BSD 3 clause

'''
from __future__ import division
import time

import numpy as np

from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn.kernel_ridge import KernelRidge
import matplotlib.pyplot as plt

rng = np.random.RandomState(0)

# generate sample data
X = 5 * rng.rand(10000, 1)
y = np.sin(X).ravel()

# add noise to the targets
y[::5] += 3 * (0.5 - rng.rand(X.shape[0] // 5))

X_plot = np.linspace(0, 5, 100000)[:, None]

# fit the regression models
train_size = 100
svr = GridSearchCV(SVR(kernel='rbf', gamma=0.1), cv=5,
                   param_grid={"C": [1e0, 1e1, 1e2, 1e3],
                               "gamma": np.logspace(-2, 2, 5)})

kr = GridSearchCV(KernelRidge(kernel='rbf', gamma=0.1), cv=5,
                  param_grid={"alpha": [1e0, 0.1, 1e-2, 1e-3],
                              "gamma": np.logspace(-2, 2, 5)})

t0 = time.time()
svr.fit(X[:train_size], y[:train_size])
svr_fit = time.time() - t0
print("SVR complexity and bandwidth selected and model fitted in %.3f s"
      % svr_fit)

t0 = time.time()
kr.fit(X[:train_size], y[:train_size])
kr_fit = time.time() - t0
print("KRR complexity and bandwidth selected and model fitted in %.3f s"
      % kr_fit)

sv_ratio = svr.best_estimator_.support_.shape[0] / train_size
print("Support vector ratio: %.3f" % sv_ratio)

t0 = time.time()
y_svr = svr.predict(X_plot)
svr_predict = time.time() - t0
print("SVR prediction for %d inputs in %.3f s"
      % (X_plot.shape[0], svr_predict))

t0 = time.time()
y_kr = kr.predict(X_plot)
kr_predict = time.time() - t0
print("KRR prediction for %d inputs in %.3f s"
      % (X_plot.shape[0], kr_predict))

# look at the results
sv_ind = svr.best_estimator_.support_
plt.scatter(X[sv_ind], y[sv_ind], c='r', s=50, label='SVR support vectors',
            zorder=2)
plt.scatter(X[:100], y[:100], c='k', label='data', zorder=1)
plt.hold('on')
plt.plot(X_plot, y_svr, c='r',
         label='SVR (fit: %.3fs, predict: %.3fs)' % (svr_fit, svr_predict))
plt.plot(X_plot, y_kr, c='g',
         label='KRR (fit: %.3fs, predict: %.3fs)' % (kr_fit, kr_predict))
plt.xlabel('data')
plt.ylabel('target')
plt.title('SVR versus Kernel Ridge')
plt.legend()

# visualise training and prediction times
plt.figure()

# generate sample data
X = 5 * rng.rand(10000, 1)
y = np.sin(X).ravel()
y[::5] += 3 * (0.5 - rng.rand(X.shape[0] // 5))
sizes = np.logspace(1, 4, 7, dtype=np.int)
for name, estimator in {"KRR": KernelRidge(kernel='rbf', alpha=0.1,
                                            gamma=10),
                        "SVR": SVR(kernel='rbf', C=1e1, gamma=10)}.items():
    train_time = []
    test_time = []
    for train_test_size in sizes:
        t0 = time.time()
        estimator.fit(X[:train_test_size], y[:train_test_size])
        train_time.append(time.time() - t0)

        t0 = time.time()
        estimator.predict(X_plot[:1000])
        test_time.append(time.time() - t0)

    plt.plot(sizes, train_time, 'o-', color="r" if name == "SVR" else "g",
             label="%s (train)" % name)
    plt.plot(sizes, test_time, 'o--', color="r" if name == "SVR" else "g",
             label="%s (test)" % name)

plt.xscale("log")
plt.yscale("log")
plt.xlabel("Train size")
plt.ylabel("Time (seconds)")
plt.title('Execution Time')
plt.legend(loc="best")

# visualise the learning curves
plt.figure()

svr = SVR(kernel='rbf', C=1e1, gamma=0.1)
kr = KernelRidge(kernel='rbf', alpha=0.1, gamma=0.1)
train_sizes, train_scores_svr, test_scores_svr = \
    learning_curve(svr, X[:100], y[:100], train_sizes=np.linspace(0.1, 1, 10),
                   scoring="neg_mean_squared_error", cv=10)
train_sizes_abs, train_scores_kr, test_scores_kr = \
    learning_curve(kr, X[:100], y[:100], train_sizes=np.linspace(0.1, 1, 10),
                   scoring="neg_mean_squared_error", cv=10)

plt.plot(train_sizes, -test_scores_svr.mean(1), 'o-', color="r",
         label="SVR")
plt.plot(train_sizes, -test_scores_kr.mean(1), 'o-', color="g",
         label="KRR")
plt.xlabel("Train size")
plt.ylabel("Mean Squared Error")
plt.title('Learning curves')
plt.legend(loc="best")

plt.show()
'''
106
src/py2.x/ml/9.RegTrees/RTSklearn.py
Normal file
@@ -0,0 +1,106 @@
#!/usr/bin/python
# coding:utf8

# '''
# Created on 2017-03-10
# Update on 2017-03-10
# author: jiangzhonglian
# content: regression tree
# '''

# print(__doc__)


# # Import the necessary modules and libraries
# import numpy as np
# from sklearn.tree import DecisionTreeRegressor
# import matplotlib.pyplot as plt


# # Create a random dataset
# rng = np.random.RandomState(1)
# X = np.sort(5 * rng.rand(80, 1), axis=0)
# y = np.sin(X).ravel()
# print X, '\n\n\n-----------\n\n\n', y
# y[::5] += 3 * (0.5 - rng.rand(16))


# # Fit regression model
# regr_1 = DecisionTreeRegressor(max_depth=2, min_samples_leaf=5)
# regr_2 = DecisionTreeRegressor(max_depth=5, min_samples_leaf=5)
# regr_1.fit(X, y)
# regr_2.fit(X, y)


# # Predict
# X_test = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]
# y_1 = regr_1.predict(X_test)
# y_2 = regr_2.predict(X_test)


# # Plot the results
# plt.figure()
# plt.scatter(X, y, c="darkorange", label="data")
# plt.plot(X_test, y_1, color="cornflowerblue", label="max_depth=2", linewidth=2)
# plt.plot(X_test, y_2, color="yellowgreen", label="max_depth=5", linewidth=2)
# plt.xlabel("data")
# plt.ylabel("target")
# plt.title("Decision Tree Regression")
# plt.legend()
# plt.show()


'''
Created on 2017-03-10
Update on 2017-03-10
author: jiangzhonglian
content: model tree
'''
from __future__ import print_function

print(__doc__)

# Author: Noel Dawe <noel.dawe@gmail.com>
#
# License: BSD 3 clause

# importing necessary libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

# Create the dataset
rng = np.random.RandomState(1)
X = np.linspace(0, 6, 100)[:, np.newaxis]
y = np.sin(X).ravel() + np.sin(6 * X).ravel() + rng.normal(0, 0.1, X.shape[0])

# Fit regression model
regr_1 = DecisionTreeRegressor(max_depth=4)

regr_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                           n_estimators=300, random_state=rng)

regr_1.fit(X, y)
regr_2.fit(X, y)

# Predict
y_1 = regr_1.predict(X)
y_2 = regr_2.predict(X)

# Plot the results
plt.figure()
plt.scatter(X, y, c="k", label="training samples")
plt.plot(X, y_1, c="g", label="n_estimators=1", linewidth=2)
plt.plot(X, y_2, c="r", label="n_estimators=300", linewidth=2)
plt.xlabel("data")
plt.ylabel("target")
plt.title("Boosted Decision Tree Regression")
plt.legend()
plt.show()
468
src/py2.x/ml/9.RegTrees/regTrees.py
Normal file
@@ -0,0 +1,468 @@
#!/usr/bin/python
# coding:utf8
'''
Created on Feb 4, 2011
Update on 2017-12-20
Tree-Based Regression Methods Source Code for Machine Learning in Action Ch. 9
Author: Peter Harrington/片刻/小瑶/zh0ng
GitHub: https://github.com/apachecn/AiLearning
'''
from __future__ import print_function
print(__doc__)
from numpy import *


# the parsed data is assumed to be tab-delimited and numeric
# general function to parse tab-delimited floats
def loadDataSet(fileName):
    """loadDataSet(parse each line and convert the fields to float)
    Desc: read a tab-delimited file and store each line as a list of floats
    Args:
        fileName  the file name
    Returns:
        dataMat   the data, one array-like row per line
    Raises:
    """
    # assume last column is target value
    dataMat = []
    fr = open(fileName)
    for line in fr.readlines():
        curLine = line.strip().split('\t')
        # map all elements to float()
        # for details on map(), see https://my.oschina.net/zyzzy/blog/115096
        fltLine = map(float, curLine)
        dataMat.append(fltLine)
    return dataMat


def binSplitDataSet(dataSet, feature, value):
    """binSplitDataSet(binary-split the data set on column `feature` at `value`)
    Description: given a feature and a value, split the data set into two
        subsets by array filtering and return both.
    Args:
        dataSet  the data set
        feature  the index of the feature column to split on
        value    the value to compare the feature column against
    Returns:
        mat0     the rows with feature <= value (the left side)
        mat1     the rows with feature > value (the right side)
    Raises:
    """
    # # test snippet
    # print 'dataSet[:, feature]=', dataSet[:, feature]
    # print 'nonzero(dataSet[:, feature] > value)[0]=', nonzero(dataSet[:, feature] > value)[0]
    # print 'nonzero(dataSet[:, feature] <= value)[0]=', nonzero(dataSet[:, feature] <= value)[0]

    # dataSet[:, feature] takes column `feature` of every row
    # nonzero(dataSet[:, feature] > value) returns the row indices where the condition holds
    mat0 = dataSet[nonzero(dataSet[:, feature] <= value)[0], :]
    mat1 = dataSet[nonzero(dataSet[:, feature] > value)[0], :]
    return mat0, mat1
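

# --- Illustrative sketch (not part of the original file) ---------------------
# binSplitDataSet on the 4x4 identity matrix, splitting feature 1 at 0.5:
# only row 1 has its second entry above 0.5, so it goes to mat1 and the
# other three rows go to mat0. The same check is also run in __main__ below.
def _demo_binSplit():
    testMat = mat(eye(4))
    mat0, mat1 = binSplitDataSet(testMat, 1, 0.5)
    print(shape(mat0), shape(mat1))   # (3, 4) (1, 4)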


# return the mean of the values in a leaf node
# returns the value used for each leaf
# i.e. regLeaf generates a leaf node by taking the mean: the cluster centre represents the group
def regLeaf(dataSet):
    return mean(dataSet[:, -1])


# total variance = variance * number of samples
# i.e. the squared error of this group of data: the tree's splits should put
# nearby values into the same group
def regErr(dataSet):
    # shape(dataSet)[0] is the number of rows
    return var(dataSet[:, -1]) * shape(dataSet)[0]


# 1. split the data set in the best possible way
# 2. generate the corresponding leaf node
def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """chooseBestSplit(find the best binary split, or generate a leaf node)

    Args:
        dataSet   the raw data set
        leafType  function that builds a leaf node
        errType   error function (total variance)
        ops       [tolerated error decrease, minimum samples per split]
    Returns:
        bestIndex the index of the chosen feature
        bestValue the best split value
    Raises:
    """
    # ops=(1, 4) matters a lot: it sets the thresholds at which splitting
    # stops, known as prepruning, which controls when the function halts and
    # guards against overfitting: splitting stops when the error decrease is
    # below tolS or a split would produce a set smaller than tolN.
    # minimum error decrease: if a split reduces the error by less, don't split
    tolS = ops[0]
    # minimum size: if a split would produce a smaller set, don't split
    tolN = ops[1]
    # if all target values (the last column) are identical, return a leaf and exit
    # .T transposes the data set
    # .tolist()[0] converts to a list and takes row 0
    if len(set(dataSet[:, -1].T.tolist()[0])) == 1:  # all samples in one group: no need to split
        # exit cond 1
        return None, leafType(dataSet)
    # the number of rows and columns
    m, n = shape(dataSet)
    # total variance with no split at all
    # the choice of the best feature is driven by Reduction in RSS error from mean
    S = errType(dataSet)
    # inf is positive infinity
    bestS, bestIndex, bestValue = inf, 0, 0
    # loop over every feature column
    for featIndex in range(n-1):  # for every feature
        # [0] keeps all rows of this column as a flat list; the line below
        # turns the column into a row and then into a plain list
        for splitVal in set(dataSet[:, featIndex].T.tolist()[0]):
            # binary-split the data on this column at this value
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            # check that both sides of the split are large enough
            if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):
                continue
            newS = errType(mat0) + errType(mat1)
            # if this split's error is acceptable, record the split point and the
            # new minimum error: a newS below bestS means a new best split
            if newS < bestS:
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS
    # check that the error decrease of the best split is large enough
    # if the decrease (S-bestS) is less than a threshold don't do the split
    if (S - bestS) < tolS:
        return None, leafType(dataSet)
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    # final sanity check on the resulting subsets:
    # if either set is smaller than tolN
    if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):  # the best split still gives a too-small set: make a leaf
        return None, leafType(dataSet)
    return bestIndex, bestValue


# assume dataSet is NumPy Mat so we can array filtering
# i.e. dataSet is assumed to be of NumPy mat type, so array filtering works
def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """createTree(build the regression tree)
    Description: recursive function; for a regression tree each leaf model is
        a constant, for a model tree each leaf model is a linear equation.
    Args:
        dataSet   the raw data set
        leafType  function that builds a leaf node
        errType   error function
        ops=(1, 4)  [tolerated error decrease, minimum samples per split]
    Returns:
        retTree   the resulting decision tree
    """
    # find the best split: the feature index and the optimal split value
    # choose the best split
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    # if the splitting hit a stop condition return val
    '''
    *** the final return values are the leftover vals, i.e. the sets smaller than tolN
    '''
    if feat is None:
        return val
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    # split into two sets: greater on the right, smaller on the left
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    # recurse: keep growing the tree in the left and right subtrees
    retTree['left'] = createTree(lSet, leafType, errType, ops)
    retTree['right'] = createTree(rSet, leafType, errType, ops)
    return retTree
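

# --- Illustrative sketch (not part of the original file) ---------------------
# createTree() returns a nested dict {'spInd': ..., 'spVal': ..., 'left': ...,
# 'right': ...}, where each branch is either another such dict or a leaf
# value. On the tiny made-up step data below it should split feature 0 at the
# left cluster's edge (0.3) and return one leaf mean per cluster.
def _demo_createTree():
    data = mat([[0.1, 1.0], [0.2, 1.1], [0.3, 0.9],
                [0.7, 5.0], [0.8, 5.2], [0.9, 4.8]])
    tree = createTree(data, ops=(1, 3))
    # e.g. {'spInd': 0, 'spVal': 0.3, 'left': 1.0, 'right': 5.0}
    print(tree)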


# check whether the node is a dict
def isTree(obj):
    """
    Desc:
        test whether the input variable is a tree, i.e. whether it is a dict
    Args:
        obj -- the input variable
    Returns:
        a boolean: True if obj is a dict, otherwise False
    """
    return (type(obj).__name__ == 'dict')


# compute the mean of the left and right branches
def getMean(tree):
    """
    Desc:
        descend the tree until two leaf nodes are found, then average them;
        this collapses the tree, returning its mean value.
    Args:
        tree -- the input tree
    Returns:
        the mean value of the tree's nodes
    """
    if isTree(tree['right']):
        tree['right'] = getMean(tree['right'])
    if isTree(tree['left']):
        tree['left'] = getMean(tree['left'])
    return (tree['left']+tree['right'])/2.0


# check whether branches should be merged
def prune(tree, testData):
    """
    Desc:
        walk down to the leaf nodes and use the test data to decide whether
        merging them would lower the test error
    Args:
        tree -- the tree to prune
        testData -- the test data used for pruning
    Returns:
        tree -- the pruned tree
    """
    # if the test data set is empty, just return the mean of the tree
    if shape(testData)[0] == 0:
        return getMean(tree)

    # if either branch is a dict (a subtree), split the test data
    if (isTree(tree['right']) or isTree(tree['left'])):
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
    # if the left branch is a subtree, recurse into it with the left data
    if isTree(tree['left']):
        tree['left'] = prune(tree['left'], lSet)
    # if the right branch is a subtree, recurse into it with the right data
    if isTree(tree['right']):
        tree['right'] = prune(tree['right'], rSet)

    # the steps above simply route the test data down the trained tree,
    # so each value ends up at its matching node

    # if neither side is a dict, both sides are leaves rather than subtrees,
    # so split the test data one last time, then:
    # * compute the total squared error without merging and compare it with
    #   the total squared error of the unsplit (merged) node
    # * if the merged error < the unmerged error, merge
    # note the return value: when a merge happens, the dict becomes a number
    if not isTree(tree['left']) and not isTree(tree['right']):
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
        # power(x, y) is x to the power y; here tree['left'] and tree['right'] are plain numbers
        errorNoMerge = sum(power(lSet[:, -1] - tree['left'], 2)) + sum(power(rSet[:, -1] - tree['right'], 2))
        treeMean = (tree['left'] + tree['right'])/2.0
        errorMerge = sum(power(testData[:, -1] - treeMean, 2))
        # if the merged error < the unmerged error, merge
        if errorMerge < errorNoMerge:
            print("merging")
            return treeMean
        # the two returns could be collapsed into one
        else:
            return tree
    else:
        return tree


# get the model's ws coefficients: f(x) = x0 + x1*feature1 + x2*feature2 ...
# create linear model and return coeficients
def modelLeaf(dataSet):
    """
    Desc:
        generate the leaf model once the data no longer needs splitting
    Args:
        dataSet -- the input data set
    Returns:
        the regression coefficients ws obtained via linearSolve
    """
    ws, X, Y = linearSolve(dataSet)
    return ws


# compute the error of the linear model
def modelErr(dataSet):
    """
    Desc:
        compute the error on the given data set
    Args:
        dataSet -- the input data set
    Returns:
        the squared error between yHat and Y, via linearSolve
    """
    ws, X, Y = linearSolve(dataSet)
    yHat = X * ws
    # print corrcoef(yHat, Y, rowvar=0)
    return sum(power(Y - yHat, 2))


# helper function used in two places
def linearSolve(dataSet):
    """
    Desc:
        format the data set into target Y and inputs X, then run simple
        linear regression to obtain ws
    Args:
        dataSet -- the input data
    Returns:
        ws -- the regression coefficients
        X -- the formatted inputs
        Y -- the formatted targets
    """
    m, n = shape(dataSet)
    # matrices of ones
    X = mat(ones((m, n)))
    Y = mat(ones((m, 1)))
    # column 0 of X stays 1: the constant (bias) term
    X[:, 1: n] = dataSet[:, 0: n-1]
    Y = dataSet[:, -1]

    # transpose times matrix
    xTx = X.T * X
    # if the matrix has no inverse, raise instead of crashing later
    if linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular, cannot do inverse,\ntry increasing the second value of ops')
    # least-squares solution of w0*1 + w1*x1 = y
    ws = xTx.I * (X.T * Y)
    return ws, X, Y


# regression-tree evaluation
# keeps two input parameters for consistency with modelTreeEval()
def regTreeEval(model, inDat):
    """
    Desc:
        evaluate a regression-tree leaf
    Args:
        model -- the leaf model, either a regression-tree or model-tree leaf;
                 here, a regression-tree leaf (a constant)
        inDat -- the input test data
    Returns:
        float(model) -- the leaf value converted to a float
    """
    return float(model)


# model-tree evaluation
# formats the input by prepending a column of ones to the data matrix,
# i.e. adds a bias term, just like in simple linear regression
def modelTreeEval(model, inDat):
    """
    Desc:
        evaluate a model-tree leaf
    Args:
        model -- the leaf model, either a regression-tree or model-tree leaf;
                 here, a model-tree leaf, i.e. regression coefficients
        inDat -- the input test data
    Returns:
        float(X * model) -- the prediction (test data times the regression
                            coefficients) converted to a float
    """
    n = shape(inDat)[1]
    X = mat(ones((1, n+1)))
    X[:, 1: n+1] = inDat
    # print X, model
    return float(X * model)


# compute the prediction for a single data point
# given the tree structure, this function produces one forecast per point.
# modelEval is a reference to the leaf-evaluation function; it selects the
# tree type so the right model is applied at the leaves. the function walks
# the tree top-down until it hits a leaf, then calls modelEval() on the input
# data; modelEval defaults to regTreeEval()
def treeForeCast(tree, inData, modelEval=regTreeEval):
    """
    Desc:
        forecast with a specific tree type, either a regression tree or a model tree
    Args:
        tree -- the trained tree model
        inData -- the input test data, a single row
        modelEval -- the tree type: regTreeEval (regression tree) or
                     modelTreeEval (model tree); defaults to regression tree
    Returns:
        the predicted value
    """
    if not isTree(tree):
        return modelEval(tree, inData)
    # the book uses inData[tree['spInd']], which only works when inData has a
    # single column and raises an exception otherwise
    if inData[0, tree['spInd']] <= tree['spVal']:
        # the if-else could be dropped, keeping only the if branch
        if isTree(tree['left']):
            return treeForeCast(tree['left'], inData, modelEval)
        else:
            return modelEval(tree['left'], inData)
    else:
        # likewise, the if-else could be dropped, keeping only the if branch
        if isTree(tree['right']):
            return treeForeCast(tree['right'], inData, modelEval)
        else:
            return modelEval(tree['right'], inData)


# forecast a whole test set
def createForeCast(tree, testData, modelEval=regTreeEval):
    """
    Desc:
        call treeForeCast for every test row; works for regression trees and
        model trees alike
    Args:
        tree -- the trained tree model
        testData -- the input test data
        modelEval -- the tree type: regTreeEval (regression tree) or
                     modelTreeEval (model tree); defaults to regression tree
    Returns:
        the matrix of predicted values
    """
    m = len(testData)
    yHat = mat(zeros((m, 1)))
    # print yHat
    for i in range(m):
        yHat[i, 0] = treeForeCast(tree, mat(testData[i]), modelEval)
        # print "yHat==>", yHat[i, 0]
    return yHat
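

# --- Illustrative sketch (not part of the original file) ---------------------
# End-to-end use of the functions above on the same made-up step data as in
# _demo_createTree: build a regression tree, then forecast with it.
def _demo_forecast():
    data = mat([[0.1, 1.0], [0.2, 1.1], [0.3, 0.9],
                [0.7, 5.0], [0.8, 5.2], [0.9, 4.8]])
    tree = createTree(data, ops=(1, 3))
    yHat = createForeCast(tree, mat([[0.15], [0.85]]))
    print(yHat.T)   # [[1. 5.]]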


if __name__ == "__main__":
    # test data set
    testMat = mat(eye(4))
    print(testMat)
    print(type(testMat))
    mat0, mat1 = binSplitDataSet(testMat, 1, 0.5)
    print(mat0, '\n-----------\n', mat1)

    # # regression tree
    # myDat = loadDataSet('data/9.RegTrees/data1.txt')
    # # myDat = loadDataSet('data/9.RegTrees/data2.txt')
    # # print 'myDat=', myDat
    # myMat = mat(myDat)
    # # print 'myMat=', myMat
    # myTree = createTree(myMat)
    # print myTree

    # # 1. prepruning: set the maximum error and the minimum element count in advance
    # myDat = loadDataSet('data/9.RegTrees/data3.txt')
    # myMat = mat(myDat)
    # myTree = createTree(myMat, ops=(0, 1))
    # print myTree

    # # 2. postpruning: use test data to decide whether to merge branches of the fitted model
    # myDatTest = loadDataSet('data/9.RegTrees/data3test.txt')
    # myMat2Test = mat(myDatTest)
    # myFinalTree = prune(myTree, myMat2Test)
    # print '\n\n\n-------------------'
    # print myFinalTree

    # # --------
    # # fitting a model tree
    # myDat = loadDataSet('data/9.RegTrees/data4.txt')
    # myMat = mat(myDat)
    # myTree = createTree(myMat, modelLeaf, modelErr)
    # print myTree

    # # # regression tree VS model tree VS linear regression
    # trainMat = mat(loadDataSet('data/9.RegTrees/bikeSpeedVsIq_train.txt'))
    # testMat = mat(loadDataSet('data/9.RegTrees/bikeSpeedVsIq_test.txt'))
    # # # regression tree
    # myTree1 = createTree(trainMat, ops=(1, 20))
    # print myTree1
    # yHat1 = createForeCast(myTree1, testMat[:, 0])
    # print "--------------\n"
    # # print yHat1
    # # print "ssss==>", testMat[:, 1]
    # # corrcoef returns the Pearson product-moment correlation coefficients
    # print "regTree:", corrcoef(yHat1, testMat[:, 1],rowvar=0)[0, 1]

    # # model tree
    # myTree2 = createTree(trainMat, modelLeaf, modelErr, ops=(1, 20))
    # yHat2 = createForeCast(myTree2, testMat[:, 0], modelTreeEval)
    # print myTree2
    # print "modelTree:", corrcoef(yHat2, testMat[:, 1],rowvar=0)[0, 1]

    # # linear regression
    # ws, X, Y = linearSolve(trainMat)
    # print ws
    # m = len(testMat[:, 0])
    # yHat3 = mat(zeros((m, 1)))
    # for i in range(shape(testMat)[0]):
    #     yHat3[i] = testMat[i, 0]*ws[1, 0] + ws[0, 0]
    # print "lr:", corrcoef(yHat3, testMat[:, 1],rowvar=0)[0, 1]
59
src/py2.x/ml/9.RegTrees/sklearn-regressTree-demo.py
Normal file
@@ -0,0 +1,59 @@
#!/usr/bin/python
# coding:utf8

"""
Created on 2017-07-13
Updated on 2017-07-13
RegressionTree: tree regression
Author: 小瑶
GitHub: https://github.com/apachecn/AiLearning
"""
from __future__ import print_function

print(__doc__)

# import the necessary models and libraries
import numpy as np
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt

# create a random data set
# see https://docs.scipy.org/doc/numpy-1.6.0/reference/generated/numpy.random.mtrand.RandomState.html
rng = np.random.RandomState(1)
# print 'lalalalala===', rng
# rand() draws random values of the given shape: rng.rand(80, 1) is an 80-row, 1-column matrix
# sort()
X = np.sort(5 * rng.rand(80, 1), axis=0)
# print 'X=', X
y = np.sin(X).ravel()
# print 'y=', y
y[::5] += 3 * (0.5 - rng.rand(16))
# print 'yyy=', y

# fit the regression model
# regr_1 = DecisionTreeRegressor(max_depth=2)
# keeping max_depth=5 and adding the min_samples_leaf=6 parameter improves the fit further
regr_2 = DecisionTreeRegressor(max_depth=5, min_samples_leaf=6)
# regr_3 = DecisionTreeRegressor(max_depth=4)
# regr_1.fit(X, y)
regr_2.fit(X, y)
# regr_3.fit(X, y)

# predict
X_test = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]
# y_1 = regr_1.predict(X_test)
y_2 = regr_2.predict(X_test)
# y_3 = regr_3.predict(X_test)

# plot the results
plt.figure()
plt.scatter(X, y, c="darkorange", label="data")
# plt.plot(X_test, y_1, color="cornflowerblue", label="max_depth=2", linewidth=2)
plt.plot(X_test, y_2, color="yellowgreen", label="max_depth=5", linewidth=2)
# plt.plot(X_test, y_3, color="red", label="max_depth=3", linewidth=2)
plt.xlabel("data")
plt.ylabel("target")
plt.title("Decision Tree Regression")
plt.legend()
plt.show()
124
src/py2.x/ml/9.RegTrees/treeExplore.py
Normal file
@@ -0,0 +1,124 @@
#!/usr/bin/python
# coding:utf8

'''
Created on 2017-03-08
Update on 2017-05-18
Tree-Based Regression Methods Source Code for Machine Learning in Action Ch. 9
Author: Peter/片刻
GitHub: https://github.com/apachecn/AiLearning
'''
from __future__ import print_function
import regTrees
from Tkinter import *
from numpy import *

import matplotlib
from matplotlib.figure import Figure
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
matplotlib.use('TkAgg')


def test_widget_text(root):
    mylabel = Label(root, text="helloworld")
    # tells the geometry manager where to place the widget; with no
    # arguments it defaults to row 0, column 0
    mylabel.grid()


# tolS: the error tolerance; tolN: the minimum number of leaf samples
def reDraw(tolS, tolN):
    # clear the figure
    reDraw.f.clf()
    reDraw.a = reDraw.f.add_subplot(111)

    # check whether the checkbox is ticked
    if chkBtnVar.get():
        if tolN < 2:
            tolN = 2
        myTree = regTrees.createTree(reDraw.rawDat, regTrees.modelLeaf, regTrees.modelErr, (tolS, tolN))
        yHat = regTrees.createForeCast(myTree, reDraw.testDat, regTrees.modelTreeEval)
    else:
        myTree = regTrees.createTree(reDraw.rawDat, ops=(tolS, tolN))
        yHat = regTrees.createForeCast(myTree, reDraw.testDat)

    # use scatter for data set
    reDraw.a.scatter(reDraw.rawDat[:, 0].A, reDraw.rawDat[:, 1].A, s=5)
    # use plot for yHat
    reDraw.a.plot(reDraw.testDat, yHat, linewidth=2.0, c='red')
    reDraw.canvas.show()


def getInputs():
    try:
        tolN = int(tolNentry.get())
    except:
        tolN = 10
        print("enter Integer for tolN")
        tolNentry.delete(0, END)
        tolNentry.insert(0, '10')
    try:
        tolS = float(tolSentry.get())
    except:
        tolS = 1.0
        print("enter Float for tolS")
        tolSentry.delete(0, END)
        tolSentry.insert(0, '1.0')
    return tolN, tolS


# draw a new tree
def drawNewTree():
    # get values from Entry boxes
    tolN, tolS = getInputs()
    reDraw(tolS, tolN)


def main(root):
    # title
    Label(root, text="Plot Place Holder").grid(row=0, columnspan=3)
    # input field 1: tolN, the minimum number of leaf samples
    Label(root, text="tolN").grid(row=1, column=0)
    global tolNentry
    tolNentry = Entry(root)
    tolNentry.grid(row=1, column=1)
    tolNentry.insert(0, '10')
    # input field 2: tolS, the error tolerance
    Label(root, text="tolS").grid(row=2, column=0)
    global tolSentry
    tolSentry = Entry(root)
    tolSentry.grid(row=2, column=1)
    # set the default value
    tolSentry.insert(0,'1.0')

    # the submit button
    Button(root, text="确定", command=drawNewTree).grid(row=1, column=2, rowspan=3)

    # the checkbox
    global chkBtnVar
    chkBtnVar = IntVar()
    chkBtn = Checkbutton(root, text="Model Tree", variable = chkBtnVar)
    chkBtn.grid(row=3, column=0, columnspan=2)

    # the quit button
    Button(root, text="退出", fg="black", command=quit).grid(row=1, column=2)

    # create a canvas
    reDraw.f = Figure(figsize=(5, 4), dpi=100)
    reDraw.canvas = FigureCanvasTkAgg(reDraw.f, master=root)
    reDraw.canvas.show()
    reDraw.canvas.get_tk_widget().grid(row=0, columnspan=3)

    reDraw.rawDat = mat(regTrees.loadDataSet('data/9.RegTrees/sine.txt'))
    reDraw.testDat = arange(min(reDraw.rawDat[:, 0]), max(reDraw.rawDat[:, 0]), 0.01)
    reDraw(1.0, 10)


if __name__ == "__main__":

    # create the root window
    root = Tk()
    # test_widget_text(root)
    main(root)

    # start the event loop
    root.mainloop()
38
src/py3.x/dl/activators.py
Normal file
@@ -0,0 +1,38 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-


import numpy as np


class ReluActivator(object):
    def forward(self, weighted_input):
        # return weighted_input
        return max(0, weighted_input)

    def backward(self, output):
        return 1 if output > 0 else 0


class IdentityActivator(object):
    def forward(self, weighted_input):
        return weighted_input

    def backward(self, output):
        return 1


class SigmoidActivator(object):
    def forward(self, weighted_input):
        return np.longfloat(1.0 / (1.0 + np.exp(-weighted_input)))

    def backward(self, output):
        return output * (1 - output)


class TanhActivator(object):
    def forward(self, weighted_input):
        return 2.0 / (1.0 + np.exp(-2 * weighted_input)) - 1.0

    def backward(self, output):
        return 1 - output * output
869
src/py3.x/dl/bp.py
Normal file
@@ -0,0 +1,869 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-

import random
from functools import reduce
from numpy import *

# the sigmoid function
def sigmoid(inX):
    '''
    Desc:
        implementation of the sigmoid function
    Args:
        inX --- the input vector
    Returns:
        the result of applying sigmoid to the input vector
    '''
    return 1.0 / (1 + exp(-inX))
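

# --- Illustrative sketch (not part of the original file) ---------------------
# The sigmoid's derivative can be written in terms of its own output,
#     sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x)),
# which is exactly the output * (1 - output) factor used in the delta
# computations below. A quick numerical check:
def _check_sigmoid_derivative(x=0.5, h=1e-6):
    numeric = (sigmoid(x + h) - sigmoid(x - h)) / (2 * h)
    analytic = sigmoid(x) * (1 - sigmoid(x))
    return abs(numeric - analytic) < 1e-8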


# the node class of the neural network
class Node(object):
    '''
    Desc:
        a node of the neural network
    '''
    def __init__(self, layer_index, node_index):
        '''
        Desc:
            initialise a node
        Args:
            layer_index --- the index of the layer the node belongs to
            node_index --- the index of the node within its layer
        Returns:
            None
        '''
        # the layer this node sits in
        self.layer_index = layer_index
        # the node's index within its layer
        self.node_index = node_index
        # the downstream connections: which nodes of the next layer this node feeds
        self.downstream = []
        # the upstream connections: which nodes feed into this node
        self.upstream = []
        # the node's output
        self.output = 0
        # the difference between the node's true value and its computed value
        self.delta = 0

    def set_output(self, output):
        '''
        Desc:
            set the node's output
        Args:
            output --- the node's output
        Returns:
            None
        '''
        self.output = output

    def append_downstream_connection(self, conn):
        '''
        Desc:
            add a connection to a downstream node
        Args:
            conn --- the connection to the downstream node
        Returns:
            None
        '''
        # append conn to the downstream list
        self.downstream.append(conn)

    def append_upstream_connection(self, conn):
        '''
        Desc:
            add a connection to an upstream node
        Args:
            conn ---- the connection to the upstream node
        Returns:
            None
        '''
        # append conn to the upstream list
        self.upstream.append(conn)

    def calc_output(self):
        '''
        Desc:
            compute the node's output via output = sigmoid(wTx)
        Args:
            None
        Returns:
            None
        '''
        # sum the weighted upstream outputs with reduce()
        output = reduce(lambda ret, conn: ret + conn.upstream_node.output * conn.weight, self.upstream, 0)
        # apply sigmoid to the weighted sum to get this node's output
        self.output = sigmoid(output)

    def calc_hidden_layer_delta(self):
        '''
        Desc:
            compute the delta of a hidden-layer node
        Args:
            None
        Returns:
            None
        '''
        # hidden-layer delta per eq. 4 of https://www.zybuluo.com/hanbingtao/note/476663
        downstream_delta = reduce(lambda ret, conn: ret + conn.downstream_node.delta * conn.weight, self.downstream, 0.0)
        # this node's delta
        self.delta = self.output * (1 - self.output) * downstream_delta

    def calc_output_layer_delta(self, label):
        '''
        Desc:
            compute the delta of an output-layer node
        Args:
            label --- the true label of the input vector, not the computed value
        Returns:
            None
        '''
        # the output-layer delta
        self.delta = self.output * (1 - self.output) * (label - self.output)

    def __str__(self):
        '''
        Desc:
            render the node's information as a string
        Args:
            None
        Returns:
            the node's information
        '''
        # format: layer index - node index, plus the output and the delta
        node_str = '%u-%u: output: %f delta: %f' % (self.layer_index, self.node_index, self.output, self.delta)
        # the downstream nodes
        downstream_str = reduce(lambda ret, conn: ret + '\n\t' + str(conn), self.downstream, '')
        # the upstream nodes
        upstream_str = reduce(lambda ret, conn: ret + '\n\t' + str(conn), self.upstream, '')
        # combine this node's info with its downstream and upstream info
        return node_str + '\n\tdownstream:' + downstream_str + '\n\tupstream:' + upstream_str


# ConstNode: a node whose output is always 1 (needed to compute the bias term wb)
class ConstNode(object):
    '''
    Desc:
        the constant node, i.e. the bias term of the computation
    '''
    def __init__(self, layer_index, node_index):
        '''
        Desc:
            initialise the node
        Args:
            layer_index --- the index of the layer the node belongs to
            node_index --- the index of the node
        Returns:
            None
        '''
        self.layer_index = layer_index
        self.node_index = node_index
        self.downstream = []
        self.output = 1

    def append_downstream_connection(self, conn):
        '''
        Desc:
            add a connection to a downstream node
        Args:
            conn --- the connection to the downstream node
        Returns:
            None
        '''
        # append conn to the downstream list
        self.downstream.append(conn)

    def calc_hidden_layer_delta(self):
        '''
        Desc:
            compute the hidden-layer delta
        Args:
            None
        Returns:
            None
        '''
        # sum the downstream deltas, per eq. 4
        downstream_delta = reduce(lambda ret, conn: ret + conn.downstream_node.delta * conn.weight, self.downstream, 0.0)
        # this node's delta
        self.delta = self.output * (1 - self.output) * downstream_delta

    def __str__(self):
        '''
        Desc:
            render the node's information as a string
        Args:
            None
        Returns:
            the node's information
        '''
        # format: layer index - node index and the constant output
        node_str = '%u-%u: output: 1' % (self.layer_index, self.node_index)
        # the downstream nodes
        downstream_str = reduce(lambda ret, conn: ret + '\n\t' + str(conn), self.downstream, '')
        # combine this node's info with its downstream info
        return node_str + '\n\tdownstream:' + downstream_str


# the layer object: initialises one layer and, as a collection of Nodes,
# provides operations on that collection
class Layer(object):
    '''
    Desc:
        the Layer class of the neural network
    '''

    def __init__(self, layer_index, node_count):
        '''
        Desc:
            initialise a layer of the network
        Args:
            layer_index --- the index of the layer
            node_count --- the number of nodes
        Returns:
            None
        '''
        # the layer index
        self.layer_index = layer_index
        # the list of nodes in this layer
        self.nodes = []
        # add the Node objects to nodes
        for i in range(node_count):
            self.nodes.append(Node(layer_index, i))
        # add the ConstNode to nodes as well
        self.nodes.append(ConstNode(layer_index, node_count))

    def set_output(self, data):
        '''
        Desc:
            set the layer's outputs; used when this layer is the input layer
        Args:
            data --- the list of output values
        Returns:
            None
        '''
        # set the output of every input-layer node
        for i in range(len(data)):
            self.nodes[i].set_output(data[i])

    def calc_output(self):
        '''
        Desc:
            compute the layer's output vector
        Args:
            None
        Returns:
            None
        '''
        # iterate over all nodes of this layer (except the last one, the
        # constant bias node b) and call each node's calc_output method
        for node in self.nodes[:-1]:
            node.calc_output()

    def dump(self):
        '''
        Desc:
            print the layer's information
        Args:
            None
        Returns:
            None
        '''
        # print the information of every node of the layer
        for node in self.nodes:
            print(node)


# the Connection object: stores a connection's weight together with the
# upstream and downstream nodes it links
class Connection(object):
    '''
    Desc:
        the Connection object, recording a connection's weight and the nodes
        it links; note: Connection, without an s, not the plural
    '''
    def __init__(self, upstream_node, downstream_node):
        '''
        Desc:
            initialise the Connection object
        Args:
            upstream_node --- the upstream node
            downstream_node --- the downstream node
        Returns:
            None
        '''
        # the upstream node
        self.upstream_node = upstream_node
        # the downstream node
        self.downstream_node = downstream_node
        # the weight, drawn uniformly from -0.1 to 0.1
        self.weight = random.uniform(-0.1, 0.1)
        # the gradient, initialised to 0.0
        self.gradient = 0.0

    def calc_gradient(self):
        '''
        Desc:
            compute the gradient
        Args:
            None
        Returns:
            None
        '''
        # gradient = downstream node's delta * upstream node's output
        self.gradient = self.downstream_node.delta * self.upstream_node.output

    def update_weight(self, rate):
        '''
        Desc:
            update the weight by gradient descent
        Args:
            rate --- the learning rate (step size)
        Returns:
            None
        '''
        # first compute the gradient
        self.calc_gradient()
        # then apply the gradient-descent update
        self.weight += rate * self.gradient
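        # note added for clarity (not in the original tutorial code): '+='
        # descends the squared-error loss here because
        # calc_output_layer_delta() already folds the error sign into delta
        # via delta = y * (1 - y) * (label - y)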

    def get_gradient(self):
        '''
        Desc:
            get the current gradient
        Args:
            None
        Returns:
            the current gradient
        '''
        return self.gradient

    def __str__(self):
        '''
        Desc:
            render the connection's information as a string
        Args:
            None
        Returns:
            the connection's information
        '''
        # format: (upstream layer-node) -> (downstream layer-node) = weight
        return '(%u-%u) -> (%u-%u) = %f' % (
            self.upstream_node.layer_index,
            self.upstream_node.node_index,
            self.downstream_node.layer_index,
            self.downstream_node.node_index,
            self.weight)


# the Connections object: operations on a collection of Connection objects
class Connections(object):
    '''
    Desc:
        the Connections object, providing operations on the Connection
        collection; mind the trailing s, don't confuse the two
    '''
    def __init__(self):
        '''
        Desc:
            initialise the Connections object
        Args:
            None
        Returns:
            None
        '''
        # initialise an empty list
        self.connections = []

    def add_connection(self, connection):
        '''
        Desc:
            append a connection to the connections list
        Args:
            None
        Returns:
            None
        '''
        self.connections.append(connection)

    def dump(self):
        '''
        Desc:
            print the information of all connections
        Args:
            None
        Returns:
            None
        '''
        for conn in self.connections:
            print(conn)


# the Network object, providing the public API
class Network(object):
    '''
    Desc:
        the Network class
    '''
    def __init__(self, layers):
        '''
        Desc:
            initialise a fully connected neural network
        Args:
            layers --- an array describing the number of nodes per layer
        Returns:
            None
        '''
        # initialise the connections with a Connections object
        self.connections = Connections()
        # initialise the layers
        self.layers = []
        # the number of layers of our network
        layer_count = len(layers)
        # the node count
        node_count = 0
        # add every layer to layers
        for i in range(layer_count):
            self.layers.append(Layer(i, layers[i]))
        # for every layer except the output layer, add the connection
        # information to the connections object
        for layer in range(layer_count - 1):
            connections = [Connection(upstream_node, downstream_node) for upstream_node in self.layers[layer].nodes for downstream_node in self.layers[layer + 1].nodes[:-1]]
            # register every conn in connections
            for conn in connections:
                self.connections.add_connection(conn)
                # register conn as an upstream connection of its downstream node
                conn.downstream_node.append_upstream_connection(conn)
                # register conn as a downstream connection of its upstream node
                conn.upstream_node.append_downstream_connection(conn)

    def train(self, labels, data_set, rate, epoch):
        '''
        Desc:
            train the neural network
        Args:
            labels --- array of training labels, one per sample
            data_set --- 2-d array of training features, one row per sample
            rate --- the learning rate
            epoch --- the number of iterations
        Returns:
            None
        '''
        # iterate epoch times
        for i in range(epoch):
            # iterate over every training sample
            for d in range(len(data_set)):
                # train on this single sample
                self.train_one_sample(labels[d], data_set[d], rate)
                # print 'sample %d training finished' % d

    def train_one_sample(self, label, sample, rate):
        '''
        Desc:
            internal method: train the network on one sample
        Args:
            label --- the sample's label
            sample --- the sample's features
            rate --- the learning rate
        Returns:
            None
        '''
        # run the Network's predict method on this sample
        self.predict(sample)
        # compute the deltas from this sample's result
        self.calc_delta(label)
        # update the weights
        self.update_weight(rate)

    def calc_delta(self, label):
        '''
        Desc:
            compute the delta of every node
        Args:
            label --- the sample's true value, i.e. its label
        Returns:
            None
        '''
        # get all the nodes of the output layer
        output_nodes = self.layers[-1].nodes
        # iterate over the label entries
        for i in range(len(label)):
            # compute the output-layer deltas
            output_nodes[i].calc_output_layer_delta(label[i])
        # slicing trick: [-2::-1] reverses layers starting from the
        # second-to-last element, e.g. aaa = [1,2,3,4,5,6,7,8,9] gives
        # aaa[-2::-1] == [8, 7, 6, 5, 4, 3, 2, 1];
        # effectively all layers except the output layer, in reverse order
        for layer in self.layers[-2::-1]:
            # iterate over every node of the layer
            for node in layer.nodes:
                # compute the hidden-layer deltas
                node.calc_hidden_layer_delta()

    def update_weight(self, rate):
        '''
        Desc:
            update the weight of every connection
        Args:
            rate --- the learning rate
        Returns:
            None
        '''
        # iterate over all layers except the output layer, in order
        for layer in self.layers[:-1]:
            # iterate over every node of the layer
            for node in layer.nodes:
                # iterate over the node's downstream connections
                for conn in node.downstream:
                    # update each connection's weight from its downstream node
                    conn.update_weight(rate)

    def calc_gradient(self):
        '''
        Desc:
            compute the gradient of every connection
        Args:
            None
        Returns:
            None
        '''
        # iterate over all layers except the output layer, in order
        for layer in self.layers[:-1]:
            # iterate over every node of the layer
            for node in layer.nodes:
|
||||
# 遍历节点的下游节点
|
||||
for conn in node.downstream:
|
||||
# 计算梯度
|
||||
conn.calc_gradient()
|
||||
|
||||
def get_gradient(self, label, sample):
|
||||
'''
|
||||
Desc:
|
||||
获得网络在一个样本下,每个连接上的梯度
|
||||
Args:
|
||||
label --- 样本标签
|
||||
sample --- 样本特征
|
||||
Returns:
|
||||
None
|
||||
'''
|
||||
# 调用 predict() 方法,利用样本的特征数据对样本进行预测
|
||||
self.predict(sample)
|
||||
# 计算 delta
|
||||
self.calc_delta(label)
|
||||
# 计算梯度
|
||||
self.calc_gradient()
|
||||
|
||||
def predict(self, sample):
|
||||
'''
|
||||
Desc:
|
||||
根据输入的样本预测输出值
|
||||
Args:
|
||||
sample --- 数组,样本的特征,也就是网络的输入向量
|
||||
Returns:
|
||||
使用我们的感知器规则计算网络的输出
|
||||
'''
|
||||
# 首先为输入层设置输出值output为样本的输入向量,即不发生任何变化
|
||||
self.layers[0].set_output(sample)
|
||||
# 遍历除去输入层开始到最后一层
|
||||
for i in range(1, len(self.layers)):
|
||||
# 计算 output
|
||||
self.layers[i].calc_output()
|
||||
# 将计算得到的输出,也就是我们的预测值返回
|
||||
return list(map(lambda node: node.output, self.layers[-1].nodes[:-1]))
|
||||
|
||||
def dump(self):
|
||||
'''
|
||||
Desc:
|
||||
打印出我们的网络信息
|
||||
Args:
|
||||
None
|
||||
Returns:
|
||||
None
|
||||
'''
|
||||
# 遍历所有的 layers
|
||||
for layer in self.layers:
|
||||
# 将所有的层的信息打印出来
|
||||
layer.dump()
|
||||
|
||||
|
||||
# # ------------------------- 至此,基本上我们把我们的神经网络实现完成。下面还会介绍对应的梯度检查算法,现在先回顾一下上面写到的类及它们的作用 ------------------------
'''
1、节点类 Node 的实现:负责记录和维护节点自身信息以及这个节点相关的上下游连接,实现输出值和误差项的计算。如下:
layer_index --- 节点所属的层的编号
node_index --- 节点的编号
downstream --- 下游节点
upstream ---- 上游节点
output ---- 节点的输出值
delta ------ 节点的误差项

2、ConstNode 类,偏置项类的实现:实现一个输出恒为 1 的节点(计算偏置项的时候会用到),如下:
layer_index --- 节点所属层的编号
node_index ---- 节点的编号
downstream ---- 下游节点
没有记录上游节点,因为偏置项的输出与上游节点的输出无关
output ----- 偏置项的输出

3、Layer 类,负责初始化一层。它是 Node 节点的集合对象,提供对 Node 集合的操作。也就是说,Layer 包含的是 Node 的集合。
layer_index ---- 层的编号
node_count ----- 层所包含的节点的个数
def set_output() -- 设置层的输出,当层是输入层时会用到
def calc_output() -- 计算层的输出向量,调用的是 Node 类的计算输出方法

4、Connection 类:负责记录连接的权重,以及这个连接所关联的上下游节点,如下:
upstream_node --- 连接的上游节点
downstream_node -- 连接的下游节点
weight -------- random.uniform(-0.1, 0.1) 初始化为一个很小的随机数
gradient -------- 梯度,初始化为 0.0
def calc_gradient() --- 计算梯度,使用下游节点的 delta 与上游节点的 output 相乘得到
def get_gradient() ---- 获取当前的梯度
def update_weight() --- 根据梯度下降算法更新权重

5、Connections 类:提供对 Connection 集合的操作,如下:
def add_connection() --- 添加一个 connection

6、Network 类:提供相应的 API,如下:
connections --- Connections 对象
layers -------- 神经网络的层
layer_count --- 神经网络的层数
node_count --- 节点个数
def train() --- 训练神经网络
def train_one_sample() --- 用一个样本训练网络
def calc_delta() --- 计算误差项
def update_weight() --- 更新每个连接的权重
def calc_gradient() --- 计算每个连接的梯度
def get_gradient() --- 获得网络在一个样本下,每个连接上的梯度
def predict() --- 根据输入的样本预测输出值
'''

# #-------------------------------------- 回顾完成了,有些问题可能还没有弄懂,没关系,我们接着往下看 ---------------------------------------------
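# A minimal usage sketch of the classes reviewed above (hypothetical numbers;
# it mirrors what main() at the bottom of this file does):
#   net = Network([8, 3, 8])                 # 8 input, 3 hidden, 8 output nodes
#   labels, data_set = train_data_set()
#   net.train(labels, data_set, 0.3, 10)     # rate=0.3, epoch=10
#   print(net.predict(data_set[0]))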
|
||||
class Normalizer(object):
|
||||
'''
|
||||
Desc:
|
||||
归一化工具类
|
||||
Args:
|
||||
object --- 对象
|
||||
Returns:
|
||||
None
|
||||
'''
|
||||
def __init__(self):
|
||||
'''
|
||||
Desc:
|
||||
初始化
|
||||
Args:
|
||||
None
|
||||
Returns:
|
||||
None
|
||||
'''
|
||||
# 初始化 16 进制的数,用来判断位的,分别是
|
||||
# 0x1 ---- 00000001
|
||||
# 0x2 ---- 00000010
|
||||
# 0x4 ---- 00000100
|
||||
# 0x8 ---- 00001000
|
||||
# 0x10 --- 00010000
|
||||
# 0x20 --- 00100000
|
||||
# 0x40 --- 01000000
|
||||
# 0x80 --- 10000000
|
||||
self.mask = [0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80]
|
||||
|
||||
def norm(self, number):
|
||||
'''
|
||||
Desc:
|
||||
对 number 进行规范化
|
||||
Args:
|
||||
number --- 要规范化的数据
|
||||
Returns:
|
||||
规范化之后的数据
|
||||
'''
|
||||
# 此方法就相当于判断一个 8 位的向量,哪一位上有数字,如果有就将这个数设置为 0.9 ,否则,设置为 0.1,通俗比较来说,就是我们这里用 0.9 表示 1,用 0.1 表示 0
|
||||
return list(map(lambda m: 0.9 if number & m else 0.1, self.mask))
|
||||
|
||||
def denorm(self, vec):
|
||||
'''
|
||||
Desc:
|
||||
对我们得到的向量进行反规范化
|
||||
Args:
|
||||
vec --- 得到的向量
|
||||
Returns:
|
||||
最终的预测结果
|
||||
'''
|
||||
# 进行二分类,大于 0.5 就设置为 1,小于 0.5 就设置为 0
|
||||
binary = list(map(lambda i: 1 if i > 0.5 else 0, vec))
|
||||
# 遍历 mask
|
||||
for i in range(len(self.mask)):
|
||||
binary[i] = binary[i] * self.mask[i]
|
||||
# 将结果相加得到最终的预测结果
|
||||
return reduce(lambda x,y: x + y, binary)
|
||||
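# Usage sketch for Normalizer (hypothetical numbers): norm() expands an integer
# into its 8 binary bits, mapping bit 1 -> 0.9 and bit 0 -> 0.1, and denorm()
# inverts that encoding:
#   Normalizer().norm(3)    # 3 = 0b00000011 -> [0.9, 0.9, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
#   Normalizer().denorm(Normalizer().norm(3))    # -> 3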
|
||||
|
||||
def mean_square_error(vec1, vec2):
|
||||
'''
|
||||
Desc:
|
||||
计算平均平方误差
|
||||
Args:
|
||||
vec1 --- 第一个数
|
||||
vec2 --- 第二个数
|
||||
Returns:
|
||||
返回 1/2 * (x-y)^2 计算得到的值
|
||||
'''
|
||||
return 0.5 * reduce(lambda a, b: a + b, map(lambda v: (v[0] - v[1]) * (v[0] - v[1]), zip(vec1, vec2)))
|
||||
|
||||
|
||||
|
||||
def gradient_check(network, sample_feature, sample_label):
|
||||
'''
|
||||
Desc:
|
||||
梯度检查
|
||||
Args:
|
||||
network --- 神经网络对象
|
||||
sample_feature --- 样本的特征
|
||||
sample_label --- 样本的标签
|
||||
Returns:
|
||||
None
|
||||
'''
|
||||
# 计算网络误差
|
||||
network_error = mean_square_error  # 与上面定义的 mean_square_error 相同,直接复用
|
||||
|
||||
# 获取网络在当前样本下每个连接的梯度
|
||||
network.get_gradient(sample_label, sample_feature)  # 注意 get_gradient(label, sample) 的参数顺序
|
||||
|
||||
# 对每个权重做梯度检查
|
||||
for conn in network.connections.connections:
|
||||
# 获取指定连接的梯度
|
||||
actual_gradient = conn.get_gradient()
|
||||
|
||||
# 增加一个很小的值,计算网络的误差
|
||||
epsilon = 0.0001
|
||||
conn.weight += epsilon
|
||||
error1 = network_error(network.predict(sample_feature), sample_label)
|
||||
|
||||
# 减去一个很小的值,计算网络的误差
|
||||
conn.weight -= 2 * epsilon # 刚才加过了一次,因此这里需要减去2倍
|
||||
error2 = network_error(network.predict(sample_feature), sample_label)
|
||||
|
||||
# 根据式6计算期望的梯度值
|
||||
expected_gradient = (error2 - error1) / (2 * epsilon)
|
||||
|
||||
# 打印
|
||||
print('expected gradient: \t%f\nactual gradient: \t%f' % (expected_gradient, actual_gradient))
|
||||
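# A note on the sign convention checked above: update_weight() does
# weight += rate * gradient, so the analytic gradient stored on a connection is
# -dE/dw. Its finite-difference estimate is therefore
# (error2 - error1) / (2 * epsilon), i.e. (E(w-eps) - E(w+eps)) / (2*eps),
# which is exactly what expected_gradient computes.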
|
||||
|
||||
def train_data_set():
|
||||
'''
|
||||
Desc:
|
||||
获取训练数据集
|
||||
Args:
|
||||
None
|
||||
Returns:
|
||||
labels --- 训练数据集每条数据对应的标签
data_set --- 训练数据集的特征数据,与 labels 一一对应
|
||||
'''
|
||||
# 调用 Normalizer() 类
|
||||
normalizer = Normalizer()
|
||||
# 初始化一个 list,用来存储后面的数据
|
||||
data_set = []
|
||||
labels = []
|
||||
# 0 到 256 ,其中以 8 为步长
|
||||
for i in range(0, 256, 8):
|
||||
# 调用 normalizer 对象的 norm 方法
|
||||
n = normalizer.norm(int(random.uniform(0, 256)))
|
||||
# 在 data_set 中 append n
|
||||
data_set.append(n)
|
||||
# 在 labels 中 append n
|
||||
labels.append(n)
|
||||
# 将它们返回
|
||||
return labels, data_set
|
||||
|
||||
|
||||
def train(network):
|
||||
'''
|
||||
Desc:
|
||||
使用我们的神经网络进行训练
|
||||
Args:
|
||||
network --- 神经网络对象
|
||||
Returns:
|
||||
None
|
||||
'''
|
||||
# 获取训练数据集
|
||||
labels, data_set = train_data_set()
|
||||
labels = list(labels)
data_set = list(data_set)
|
||||
# 调用 network 中的 train方法来训练我们的神经网络
|
||||
network.train(labels, data_set, 0.3, 50)
|
||||
|
||||
|
||||
def test(net,data):
|
||||
# 对单个整数样本进行测试:先规范化为 8 维向量,预测后再反规范化,打印预测结果
|
||||
|
||||
'''
|
||||
Desc:
|
||||
对我们的全连接神经网络进行测试
|
||||
Args:
|
||||
network --- 神经网络对象
|
||||
data ------ 测试数据集
|
||||
Returns:
|
||||
None
|
||||
'''
|
||||
# 调用 Normalizer() 类
|
||||
|
||||
normalizer = Normalizer()
|
||||
# 调用 norm 方法,对数据进行规范化
|
||||
norm_data = normalizer.norm(data)
|
||||
norm_data = list(norm_data)
|
||||
# 对测试数据进行预测
|
||||
predict_data = net.predict(norm_data)
|
||||
# 将结果打印出来
|
||||
print('\ttestdata(%u)\tpredict(%u)' % (data, normalizer.denorm(predict_data)))
|
||||
|
||||
|
||||
def correct_ratio(network):
|
||||
'''
|
||||
Desc:
|
||||
计算我们的神经网络的正确率
|
||||
Args:
|
||||
network --- 神经网络对象
|
||||
Returns:
|
||||
None
|
||||
'''
|
||||
normalizer = Normalizer()
|
||||
correct = 0.0
|
||||
for i in range(256):
|
||||
if normalizer.denorm(network.predict(normalizer.norm(i))) == i:
|
||||
correct += 1.0
|
||||
print('correct_ratio: %.2f%%' % (correct / 256 * 100))
|
||||
|
||||
|
||||
def gradient_check_test():
|
||||
'''
|
||||
Desc:
|
||||
梯度检查测试
|
||||
Args:
|
||||
None
|
||||
Returns:
|
||||
None
|
||||
'''
|
||||
# 创建一个有 3 层的网络,每层有 2 个节点
|
||||
net = Network([2, 2, 2])
|
||||
# 样本的特征
|
||||
sample_feature = [0.9, 0.1]
|
||||
# 样本对应的标签
|
||||
sample_label = [0.9, 0.1]
|
||||
# 使用梯度检查来查看是否正确
|
||||
gradient_check(net, sample_feature, sample_label)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
'''
|
||||
Desc:
|
||||
主函数
|
||||
Args:
|
||||
None
|
||||
Returns:
|
||||
None
|
||||
'''
|
||||
# 初始化一个神经网络,输入层 8 个节点,隐藏层 3 个节点,输出层 8 个节点
|
||||
net = Network([8, 3, 8])
|
||||
# 训练我们的神经网络
|
||||
train(net)
|
||||
# 将我们的神经网络的信息打印出来
|
||||
net.dump()
|
||||
# 打印出神经网络的正确率
|
||||
correct_ratio(net)
|
||||
457
src/py3.x/dl/cnn.py
Normal file
457
src/py3.x/dl/cnn.py
Normal file
@@ -0,0 +1,457 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: UTF-8 -*-
|
||||
|
||||
|
||||
import numpy as np
|
||||
from activators import ReluActivator, IdentityActivator
|
||||
|
||||
|
||||
# 获取卷积区域
|
||||
def get_patch(input_array, i, j, filter_width,
|
||||
filter_height, stride):
|
||||
'''
|
||||
从输入数组中获取本次卷积的区域,
|
||||
自动适配输入为2D和3D的情况
|
||||
'''
|
||||
start_i = i * stride
|
||||
start_j = j * stride
|
||||
if input_array.ndim == 2:
|
||||
return input_array[
|
||||
start_i: start_i + filter_height,
|
||||
start_j: start_j + filter_width]
|
||||
elif input_array.ndim == 3:
|
||||
return input_array[:,
|
||||
start_i: start_i + filter_height,
|
||||
start_j: start_j + filter_width]
|
||||
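# Index math sketch for get_patch (assumed numbers): with stride=2, i=1, j=0 and
# a 3x3 filter, it returns rows 2..4 and columns 0..2 of the input array.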
|
||||
|
||||
# 获取一个2D区域的最大值所在的索引
|
||||
def get_max_index(array):
|
||||
max_i = 0
|
||||
max_j = 0
|
||||
max_value = array[0, 0]
|
||||
for i in range(array.shape[0]):
|
||||
for j in range(array.shape[1]):
|
||||
if array[i, j] > max_value:
|
||||
max_value = array[i, j]
|
||||
max_i, max_j = i, j
|
||||
return max_i, max_j
|
||||
|
||||
|
||||
# 计算卷积
|
||||
def conv(input_array,
|
||||
kernel_array,
|
||||
output_array,
|
||||
stride, bias):
|
||||
'''
|
||||
计算卷积,自动适配输入为2D和3D的情况
|
||||
'''
|
||||
channel_number = input_array.ndim
|
||||
output_width = output_array.shape[1]
|
||||
output_height = output_array.shape[0]
|
||||
kernel_width = kernel_array.shape[-1]
|
||||
kernel_height = kernel_array.shape[-2]
|
||||
for i in range(output_height):
|
||||
for j in range(output_width):
|
||||
output_array[i][j] = (
|
||||
get_patch(input_array, i, j, kernel_width,
|
||||
kernel_height, stride) * kernel_array
|
||||
).sum() + bias
|
||||
|
||||
|
||||
# 为数组增加Zero padding
|
||||
def padding(input_array, zp):
|
||||
'''
|
||||
为数组增加Zero padding,自动适配输入为2D和3D的情况
|
||||
'''
|
||||
if zp == 0:
|
||||
return input_array
|
||||
else:
|
||||
if input_array.ndim == 3:
|
||||
input_width = input_array.shape[2]
|
||||
input_height = input_array.shape[1]
|
||||
input_depth = input_array.shape[0]
|
||||
padded_array = np.zeros((
|
||||
input_depth,
|
||||
input_height + 2 * zp,
|
||||
input_width + 2 * zp))
|
||||
padded_array[:,
|
||||
zp: zp + input_height,
|
||||
zp: zp + input_width] = input_array
|
||||
return padded_array
|
||||
elif input_array.ndim == 2:
|
||||
input_width = input_array.shape[1]
|
||||
input_height = input_array.shape[0]
|
||||
padded_array = np.zeros((
|
||||
input_height + 2 * zp,
|
||||
input_width + 2 * zp))
|
||||
padded_array[zp: zp + input_height,
|
||||
zp: zp + input_width] = input_array
|
||||
return padded_array
|
||||
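# Padding sketch (assumed numbers): a 3x3 2D input with zp=1 becomes a 5x5 array
# of zeros with the original values copied into rows/columns 1..3.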
|
||||
|
||||
# 对numpy数组进行element wise操作
|
||||
def element_wise_op(array, op):
|
||||
for i in np.nditer(array,
|
||||
op_flags=['readwrite']):
|
||||
i[...] = op(i)
|
||||
|
||||
|
||||
class Filter(object):
|
||||
def __init__(self, width, height, depth):
|
||||
self.weights = np.random.uniform(-1e-4, 1e-4,
|
||||
(depth, height, width))
|
||||
self.bias = 0
|
||||
self.weights_grad = np.zeros(
|
||||
self.weights.shape)
|
||||
self.bias_grad = 0
|
||||
|
||||
def __repr__(self):
|
||||
return 'filter weights:\n%s\nbias:\n%s' % (
|
||||
repr(self.weights), repr(self.bias))
|
||||
|
||||
def get_weights(self):
|
||||
return self.weights
|
||||
|
||||
def get_bias(self):
|
||||
return self.bias
|
||||
|
||||
def update(self, learning_rate):
|
||||
self.weights -= learning_rate * self.weights_grad
|
||||
self.bias -= learning_rate * self.bias_grad
|
||||
|
||||
|
||||
class ConvLayer(object):
|
||||
def __init__(self, input_width, input_height,
|
||||
channel_number, filter_width,
|
||||
filter_height, filter_number,
|
||||
zero_padding, stride, activator,
|
||||
learning_rate):
|
||||
self.input_width = input_width
|
||||
self.input_height = input_height
|
||||
self.channel_number = channel_number
|
||||
self.filter_width = filter_width
|
||||
self.filter_height = filter_height
|
||||
self.filter_number = filter_number
|
||||
self.zero_padding = zero_padding
|
||||
self.stride = stride
|
||||
self.output_width = \
|
||||
ConvLayer.calculate_output_size(
|
||||
self.input_width, filter_width, zero_padding,
|
||||
stride)
|
||||
self.output_height = \
|
||||
ConvLayer.calculate_output_size(
|
||||
self.input_height, filter_height, zero_padding,
|
||||
stride)
|
||||
self.output_array = np.zeros((self.filter_number,
|
||||
self.output_height, self.output_width))
|
||||
self.filters = []
|
||||
for i in range(filter_number):
|
||||
self.filters.append(Filter(filter_width,
|
||||
filter_height, self.channel_number))
|
||||
self.activator = activator
|
||||
self.learning_rate = learning_rate
|
||||
|
||||
def forward(self, input_array):
|
||||
'''
|
||||
计算卷积层的输出
|
||||
输出结果保存在self.output_array
|
||||
'''
|
||||
self.input_array = input_array
|
||||
self.padded_input_array = padding(input_array,
|
||||
self.zero_padding)
|
||||
for f in range(self.filter_number):
|
||||
filter = self.filters[f]
|
||||
conv(self.padded_input_array,
|
||||
filter.get_weights(), self.output_array[f],
|
||||
self.stride, filter.get_bias())
|
||||
element_wise_op(self.output_array,
|
||||
self.activator.forward)
|
||||
|
||||
def backward(self, input_array, sensitivity_array,
|
||||
activator):
|
||||
'''
|
||||
计算传递给前一层的误差项,以及计算每个权重的梯度
|
||||
前一层的误差项保存在self.delta_array
|
||||
梯度保存在Filter对象的weights_grad
|
||||
'''
|
||||
self.forward(input_array)
|
||||
self.bp_sensitivity_map(sensitivity_array,
|
||||
activator)
|
||||
self.bp_gradient(sensitivity_array)
|
||||
|
||||
def update(self):
|
||||
'''
|
||||
按照梯度下降,更新权重
|
||||
'''
|
||||
for filter in self.filters:
|
||||
filter.update(self.learning_rate)
|
||||
|
||||
def bp_sensitivity_map(self, sensitivity_array,
|
||||
activator):
|
||||
'''
|
||||
计算传递到上一层的sensitivity map
|
||||
sensitivity_array: 本层的sensitivity map
|
||||
activator: 上一层的激活函数
|
||||
'''
|
||||
# 处理卷积步长,对原始sensitivity map进行扩展
|
||||
expanded_array = self.expand_sensitivity_map(
|
||||
sensitivity_array)
|
||||
# full卷积,对sensitivitiy map进行zero padding
|
||||
# 虽然原始输入的zero padding单元也会获得残差
|
||||
# 但这个残差不需要继续向上传递,因此就不计算了
|
||||
expanded_width = expanded_array.shape[2]
|
||||
zp = (self.input_width +
|
||||
self.filter_width - 1 - expanded_width) // 2
|
||||
padded_array = padding(expanded_array, zp)
|
||||
# 初始化delta_array,用于保存传递到上一层的
|
||||
# sensitivity map
|
||||
self.delta_array = self.create_delta_array()
|
||||
# 对于具有多个filter的卷积层来说,最终传递到上一层的
|
||||
# sensitivity map相当于所有的filter的
|
||||
# sensitivity map之和
|
||||
for f in range(self.filter_number):
|
||||
filter = self.filters[f]
|
||||
# 将filter权重翻转180度
|
||||
flipped_weights = np.array(list(map(lambda i: np.rot90(i, 2), filter.get_weights())))
|
||||
# 计算与一个filter对应的delta_array
|
||||
delta_array = self.create_delta_array()
|
||||
for d in range(delta_array.shape[0]):
|
||||
conv(padded_array[f], flipped_weights[d],
|
||||
delta_array[d], 1, 0)
|
||||
self.delta_array += delta_array
|
||||
# 将计算结果与激活函数的偏导数做element-wise乘法操作
|
||||
derivative_array = np.array(self.input_array)
|
||||
element_wise_op(derivative_array,
|
||||
activator.backward)
|
||||
self.delta_array *= derivative_array
|
||||
|
||||
def bp_gradient(self, sensitivity_array):
|
||||
# 处理卷积步长,对原始sensitivity map进行扩展
|
||||
expanded_array = self.expand_sensitivity_map(
|
||||
sensitivity_array)
|
||||
for f in range(self.filter_number):
|
||||
# 计算每个权重的梯度
|
||||
filter = self.filters[f]
|
||||
for d in range(filter.weights.shape[0]):
|
||||
conv(self.padded_input_array[d],
|
||||
expanded_array[f],
|
||||
filter.weights_grad[d], 1, 0)
|
||||
# 计算偏置项的梯度
|
||||
filter.bias_grad = expanded_array[f].sum()
|
||||
|
||||
def expand_sensitivity_map(self, sensitivity_array):
|
||||
depth = sensitivity_array.shape[0]
|
||||
# 确定扩展后sensitivity map的大小
|
||||
# 计算stride为1时sensitivity map的大小
|
||||
expanded_width = (self.input_width -
|
||||
self.filter_width + 2 * self.zero_padding + 1)
|
||||
expanded_height = (self.input_height -
|
||||
self.filter_height + 2 * self.zero_padding + 1)
|
||||
# 构建新的sensitivity_map
|
||||
expand_array = np.zeros((depth, expanded_height,
|
||||
expanded_width))
|
||||
# 从原始sensitivity map拷贝误差值
|
||||
for i in range(self.output_height):
|
||||
for j in range(self.output_width):
|
||||
i_pos = i * self.stride
|
||||
j_pos = j * self.stride
|
||||
expand_array[:, i_pos, j_pos] = \
|
||||
sensitivity_array[:, i, j]
|
||||
return expand_array
|
||||
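# Expansion sketch (assumed numbers): with stride=2, a 2x2 sensitivity map is
# copied to positions (0,0), (0,2), (2,0), (2,2) of the stride-1-sized map;
# every other entry stays zero, which undoes the subsampling effect of stride.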
|
||||
def create_delta_array(self):
|
||||
return np.zeros((self.channel_number,
|
||||
self.input_height, self.input_width))
|
||||
|
||||
@staticmethod
|
||||
def calculate_output_size(input_size,
|
||||
filter_size, zero_padding, stride):
|
||||
return (input_size - filter_size +
|
||||
2 * zero_padding) // stride + 1
|
||||
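# Worked example for calculate_output_size, using the numbers from init_test
# below: input 5, filter 3, zero_padding 1, stride 2 gives
# (5 - 3 + 2*1) // 2 + 1 = 3, so each conv output map is 3x3.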
|
||||
|
||||
class MaxPoolingLayer(object):
|
||||
def __init__(self, input_width, input_height,
|
||||
channel_number, filter_width,
|
||||
filter_height, stride):
|
||||
self.input_width = input_width
|
||||
self.input_height = input_height
|
||||
self.channel_number = channel_number
|
||||
self.filter_width = filter_width
|
||||
self.filter_height = filter_height
|
||||
self.stride = stride
|
||||
self.output_width = (input_width -
|
||||
filter_width) // self.stride + 1
|
||||
self.output_height = (input_height -
|
||||
filter_height) // self.stride + 1
|
||||
self.output_array = np.zeros((self.channel_number,
|
||||
self.output_height, self.output_width))
|
||||
|
||||
def forward(self, input_array):
|
||||
for d in range(self.channel_number):
|
||||
for i in range(self.output_height):
|
||||
for j in range(self.output_width):
|
||||
self.output_array[d, i, j] = (
|
||||
get_patch(input_array[d], i, j,
|
||||
self.filter_width,
|
||||
self.filter_height,
|
||||
self.stride).max())
|
||||
|
||||
def backward(self, input_array, sensitivity_array):
|
||||
self.delta_array = np.zeros(input_array.shape)
|
||||
for d in range(self.channel_number):
|
||||
for i in range(self.output_height):
|
||||
for j in range(self.output_width):
|
||||
patch_array = get_patch(
|
||||
input_array[d], i, j,
|
||||
self.filter_width,
|
||||
self.filter_height,
|
||||
self.stride)
|
||||
k, l = get_max_index(patch_array)
|
||||
self.delta_array[d,
|
||||
i * self.stride + k,
|
||||
j * self.stride + l] = \
|
||||
sensitivity_array[d, i, j]
|
||||
|
||||
|
||||
def init_test():
|
||||
a = np.array(
|
||||
[[[0, 1, 1, 0, 2],
|
||||
[2, 2, 2, 2, 1],
|
||||
[1, 0, 0, 2, 0],
|
||||
[0, 1, 1, 0, 0],
|
||||
[1, 2, 0, 0, 2]],
|
||||
[[1, 0, 2, 2, 0],
|
||||
[0, 0, 0, 2, 0],
|
||||
[1, 2, 1, 2, 1],
|
||||
[1, 0, 0, 0, 0],
|
||||
[1, 2, 1, 1, 1]],
|
||||
[[2, 1, 2, 0, 0],
|
||||
[1, 0, 0, 1, 0],
|
||||
[0, 2, 1, 0, 1],
|
||||
[0, 1, 2, 2, 2],
|
||||
[2, 1, 0, 0, 1]]])
|
||||
b = np.array(
|
||||
[[[0, 1, 1],
|
||||
[2, 2, 2],
|
||||
[1, 0, 0]],
|
||||
[[1, 0, 2],
|
||||
[0, 0, 0],
|
||||
[1, 2, 1]]])
|
||||
cl = ConvLayer(5, 5, 3, 3, 3, 2, 1, 2, IdentityActivator(), 0.001)
|
||||
cl.filters[0].weights = np.array(
|
||||
[[[-1, 1, 0],
|
||||
[0, 1, 0],
|
||||
[0, 1, 1]],
|
||||
[[-1, -1, 0],
|
||||
[0, 0, 0],
|
||||
[0, -1, 0]],
|
||||
[[0, 0, -1],
|
||||
[0, 1, 0],
|
||||
[1, -1, -1]]], dtype=np.float64)
|
||||
cl.filters[0].bias = 1
|
||||
cl.filters[1].weights = np.array(
|
||||
[[[1, 1, -1],
|
||||
[-1, -1, 1],
|
||||
[0, -1, 1]],
|
||||
[[0, 1, 0],
|
||||
[-1, 0, -1],
|
||||
[-1, 1, 0]],
|
||||
[[-1, 0, 0],
|
||||
[-1, 0, 1],
|
||||
[-1, 0, 0]]], dtype=np.float64)
|
||||
return a, b, cl
|
||||
|
||||
|
||||
def test():
|
||||
a, b, cl = init_test()
|
||||
cl.forward(a)
|
||||
print(
|
||||
cl.output_array)
|
||||
|
||||
|
||||
def test_bp():
|
||||
a, b, cl = init_test()
|
||||
cl.backward(a, b, IdentityActivator())
|
||||
cl.update()
|
||||
print(
|
||||
cl.filters[0])
|
||||
print(
|
||||
cl.filters[1])
|
||||
|
||||
def gradient_check():
|
||||
'''
|
||||
梯度检查
|
||||
'''
|
||||
# 设计一个误差函数,取所有节点输出项之和
|
||||
error_function = lambda o: o.sum()
|
||||
|
||||
# 计算forward值
|
||||
a, b, cl = init_test()
|
||||
cl.forward(a)
|
||||
|
||||
# 求取sensitivity map
|
||||
sensitivity_array = np.ones(cl.output_array.shape,
|
||||
dtype=np.float64)
|
||||
# 计算梯度
|
||||
cl.backward(a, sensitivity_array,
|
||||
IdentityActivator())
|
||||
# 检查梯度
|
||||
epsilon = 10e-4
|
||||
for d in range(cl.filters[0].weights_grad.shape[0]):
|
||||
for i in range(cl.filters[0].weights_grad.shape[1]):
|
||||
for j in range(cl.filters[0].weights_grad.shape[2]):
|
||||
cl.filters[0].weights[d, i, j] += epsilon
|
||||
cl.forward(a)
|
||||
err1 = error_function(cl.output_array)
|
||||
cl.filters[0].weights[d, i, j] -= 2 * epsilon
|
||||
cl.forward(a)
|
||||
err2 = error_function(cl.output_array)
|
||||
expect_grad = (err1 - err2) / (2 * epsilon)
|
||||
cl.filters[0].weights[d, i, j] += epsilon
|
||||
print(
|
||||
'weights(%d,%d,%d): expected - actural %f - %f' % (
|
||||
d, i, j, expect_grad, cl.filters[0].weights_grad[d, i, j]))
|
||||
|
||||
|
||||
def init_pool_test():
|
||||
a = np.array(
|
||||
[[[1, 1, 2, 4],
|
||||
[5, 6, 7, 8],
|
||||
[3, 2, 1, 0],
|
||||
[1, 2, 3, 4]],
|
||||
[[0, 1, 2, 3],
|
||||
[4, 5, 6, 7],
|
||||
[8, 9, 0, 1],
|
||||
[3, 4, 5, 6]]], dtype=np.float64)
|
||||
|
||||
b = np.array(
|
||||
[[[1, 2],
|
||||
[2, 4]],
|
||||
[[3, 5],
|
||||
[8, 2]]], dtype=np.float64)
|
||||
|
||||
mpl = MaxPoolingLayer(4, 4, 2, 2, 2, 2)
|
||||
|
||||
return a, b, mpl
|
||||
|
||||
|
||||
def test_pool():
|
||||
a, b, mpl = init_pool_test()
|
||||
mpl.forward(a)
|
||||
print(
|
||||
'input array:\n%s\noutput array:\n%s' % (a,
|
||||
mpl.output_array))
|
||||
|
||||
|
||||
def test_pool_bp():
|
||||
a, b, mpl = init_pool_test()
|
||||
mpl.backward(a, b)
|
||||
print(
|
||||
'input array:\n%s\nsensitivity array:\n%s\ndelta array:\n%s' % (
|
||||
a, b, mpl.delta_array))
|
||||
|
||||
|
||||
if __name__=='__main__':
|
||||
gradient_check()
|
||||
229
src/py3.x/dl/fc.py
Normal file
229
src/py3.x/dl/fc.py
Normal file
@@ -0,0 +1,229 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: UTF-8 -*-
|
||||
|
||||
|
||||
import random
|
||||
import numpy as np
|
||||
from functools import reduce
|
||||
from activators import SigmoidActivator, IdentityActivator
|
||||
|
||||
|
||||
# 全连接层实现类
|
||||
class FullConnectedLayer(object):
|
||||
def __init__(self, input_size, output_size,
|
||||
activator):
|
||||
'''
|
||||
构造函数
|
||||
input_size: 本层输入向量的维度
|
||||
output_size: 本层输出向量的维度
|
||||
activator: 激活函数
|
||||
'''
|
||||
self.input_size = input_size
|
||||
self.output_size = output_size
|
||||
self.activator = activator
|
||||
# 权重数组W
|
||||
self.W = np.random.uniform(-0.1, 0.1,
|
||||
(output_size, input_size))
|
||||
# 偏置项b
|
||||
self.b = np.zeros((output_size, 1))
|
||||
# 输出向量
|
||||
self.output = np.zeros((output_size, 1))
|
||||
|
||||
def forward(self, input_array):
|
||||
'''
|
||||
前向计算
|
||||
input_array: 输入向量,维度必须等于input_size
|
||||
'''
|
||||
# 式2
|
||||
self.input = input_array
|
||||
self.output = self.activator.forward(
|
||||
np.dot(self.W, input_array) + self.b)
|
||||
|
||||
def backward(self, delta_array):
|
||||
'''
|
||||
反向计算W和b的梯度
|
||||
delta_array: 从上一层传递过来的误差项
|
||||
'''
|
||||
# 式8
|
||||
self.delta = self.activator.backward(self.input) * np.dot(
|
||||
self.W.T, delta_array)
|
||||
self.W_grad = np.dot(delta_array, self.input.T)
|
||||
self.b_grad = delta_array
|
||||
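# Restating the formulas implemented above: forward computes a = f(W.x + b)
# (式2), and backward computes delta = f'(x) * (W^T . delta_array),
# W_grad = delta_array . x^T and b_grad = delta_array (式8).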
|
||||
def update(self, learning_rate):
|
||||
'''
|
||||
使用梯度下降算法更新权重
|
||||
'''
|
||||
self.W += learning_rate * self.W_grad
|
||||
self.b += learning_rate * self.b_grad
|
||||
|
||||
def dump(self):
|
||||
print('W: %s\nb:%s' % (self.W, self.b))
|
||||
|
||||
|
||||
# 神经网络类
|
||||
class Network(object):
|
||||
def __init__(self, layers):
|
||||
'''
|
||||
构造函数
|
||||
'''
|
||||
self.layers = []
|
||||
for i in range(len(layers) - 1):
|
||||
self.layers.append(
|
||||
FullConnectedLayer(
|
||||
layers[i], layers[i+1],
|
||||
SigmoidActivator()
|
||||
)
|
||||
)
|
||||
|
||||
def predict(self, sample):
|
||||
'''
|
||||
使用神经网络实现预测
|
||||
sample: 输入样本
|
||||
'''
|
||||
output = sample
|
||||
for layer in self.layers:
|
||||
layer.forward(output)
|
||||
output = layer.output
|
||||
return output
|
||||
|
||||
def train(self, labels, data_set, rate, epoch):
|
||||
'''
|
||||
训练函数
|
||||
labels: 样本标签
|
||||
data_set: 输入样本
|
||||
rate: 学习速率
|
||||
epoch: 训练轮数
|
||||
'''
|
||||
for i in range(epoch):
|
||||
for d in range(len(list(data_set))):
|
||||
self.train_one_sample(labels[d],
|
||||
data_set[d], rate)
|
||||
|
||||
def train_one_sample(self, label, sample, rate):
|
||||
self.predict(sample)
|
||||
self.calc_gradient(label)
|
||||
self.update_weight(rate)
|
||||
|
||||
def calc_gradient(self, label):
|
||||
delta = self.layers[-1].activator.backward(
|
||||
self.layers[-1].output
|
||||
) * (label - self.layers[-1].output)
|
||||
for layer in self.layers[::-1]:
|
||||
layer.backward(delta)
|
||||
delta = layer.delta
|
||||
return delta
|
||||
|
||||
def update_weight(self, rate):
|
||||
for layer in self.layers:
|
||||
layer.update(rate)
|
||||
|
||||
def dump(self):
|
||||
for layer in self.layers:
|
||||
layer.dump()
|
||||
|
||||
def loss(self, output, label):
|
||||
return 0.5 * ((label - output) * (label - output)).sum()
|
||||
|
||||
def gradient_check(self, sample_feature, sample_label):
|
||||
'''
|
||||
梯度检查
|
||||
sample_feature: 样本的特征
|
||||
sample_label: 样本的标签
|
||||
'''
|
||||
|
||||
# 获取网络在当前样本下每个连接的梯度
|
||||
self.predict(sample_feature)
|
||||
self.calc_gradient(sample_label)
|
||||
|
||||
# 检查梯度
|
||||
epsilon = 10e-4
|
||||
for fc in self.layers:
|
||||
for i in range(fc.W.shape[0]):
|
||||
for j in range(fc.W.shape[1]):
|
||||
fc.W[i,j] += epsilon
|
||||
output = self.predict(sample_feature)
|
||||
err1 = self.loss(sample_label, output)
|
||||
fc.W[i,j] -= 2*epsilon
|
||||
output = self.predict(sample_feature)
|
||||
err2 = self.loss(sample_label, output)
|
||||
expect_grad = (err1 - err2) / (2 * epsilon)
|
||||
fc.W[i,j] += epsilon
|
||||
print('weights(%d,%d): expected - actural %.4e - %.4e' % (
|
||||
i, j, expect_grad, fc.W_grad[i,j]))
|
||||
|
||||
|
||||
from bp import train_data_set
|
||||
|
||||
|
||||
def transpose(args):
|
||||
return map(
|
||||
lambda arg: map(
|
||||
lambda line: np.array(line).reshape(len(line), 1)
|
||||
, arg)
|
||||
, args
|
||||
)
|
||||
|
||||
|
||||
class Normalizer(object):
|
||||
def __init__(self):
|
||||
self.mask = [
|
||||
0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
|
||||
]
|
||||
|
||||
def norm(self, number):
|
||||
data = list(map(lambda m: 0.9 if number & m else 0.1, self.mask))
|
||||
return np.array(data).reshape(8, 1)
|
||||
|
||||
def denorm(self, vec):
|
||||
binary = list(map(lambda i: 1 if i > 0.5 else 0, vec[:,0]))
|
||||
for i in range(len(self.mask)):
|
||||
binary[i] = binary[i] * self.mask[i]
|
||||
return reduce(lambda x,y: x + y, binary)
|
||||
|
||||
def train_data_set():
|
||||
normalizer = Normalizer()
|
||||
data_set = []
|
||||
labels = []
|
||||
for i in range(0, 256):
|
||||
n = normalizer.norm(i)
|
||||
data_set.append(n)
|
||||
labels.append(n)
|
||||
return labels, data_set
|
||||
|
||||
def correct_ratio(network):
|
||||
normalizer = Normalizer()
|
||||
correct = 0.0;
|
||||
for i in range(256):
|
||||
if normalizer.denorm(network.predict(normalizer.norm(i))) == i:
|
||||
correct += 1.0
|
||||
print('correct_ratio: %.2f%%' % (correct / 256 * 100))
|
||||
|
||||
|
||||
def test():
|
||||
labels, data_set = list(transpose(train_data_set()))
|
||||
labels=list(labels)
|
||||
data_set=list(data_set)
|
||||
net = Network([8, 3, 8])
|
||||
rate = 0.5
|
||||
mini_batch = 20
|
||||
epoch = 10
|
||||
for i in range(epoch):
|
||||
net.train(labels, list(data_set), rate, mini_batch)
|
||||
print('after epoch %d loss: %f' % (
|
||||
(i + 1),
|
||||
net.loss(labels[-1], net.predict(data_set[-1]))
|
||||
))
|
||||
rate /= 2
|
||||
correct_ratio(net)
|
||||
|
||||
|
||||
def gradient_check():
|
||||
'''
|
||||
梯度检查
|
||||
'''
|
||||
labels, data_set = transpose(train_data_set())
|
||||
net = Network([8, 3, 8])
|
||||
net.gradient_check(data_set[0], labels[0])
|
||||
return net
|
||||
175
src/py3.x/dl/linear_unit.py
Normal file
175
src/py3.x/dl/linear_unit.py
Normal file
@@ -0,0 +1,175 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: UTF-8 -*-
|
||||
|
||||
# 引入 Perceptron 类
|
||||
from perceptron import Perceptron
|
||||
|
||||
# 定义激活函数 f
|
||||
f = lambda x: x
|
||||
|
||||
class LinearUnit(Perceptron):
|
||||
'''
|
||||
Desc:
|
||||
线性单元类
|
||||
Args:
|
||||
Perceptron —— 感知器
|
||||
Returns:
|
||||
None
|
||||
'''
|
||||
def __init__(self, input_num):
|
||||
'''
|
||||
Desc:
|
||||
初始化线性单元,设置输入参数的个数
|
||||
Args:
|
||||
input_num —— 输入参数的个数
|
||||
Returns:
|
||||
None
|
||||
'''
|
||||
# 初始化我们的感知器类,设置输入参数的个数 input_num 和 激活函数 f
|
||||
Perceptron.__init__(self, input_num, f)
|
||||
|
||||
# 构造简单的数据集
|
||||
def get_training_dataset():
|
||||
'''
|
||||
Desc:
|
||||
构建一个简单的训练数据集
|
||||
Args:
|
||||
None
|
||||
Returns:
|
||||
input_vecs —— 训练数据集的特征部分
|
||||
labels —— 训练数据集的数据对应的标签,是一一对应的
|
||||
'''
|
||||
# 构建数据集,输入向量列表,每一项是工作年限
|
||||
input_vecs = [[5], [3], [8], [1.4], [10.1]]
|
||||
# 期望的输出列表,也就是输入向量的对应的标签,与工作年限对应的收入年薪
|
||||
labels = [5500, 2300, 7600, 1800, 11400]
|
||||
return input_vecs, labels
|
||||
|
||||
|
||||
# 使用我们的训练数据集对线性单元进行训练
|
||||
def train_linear_unit():
|
||||
'''
|
||||
Desc:
|
||||
使用训练数据集对我们的线性单元进行训练
|
||||
Args:
|
||||
None
|
||||
Returns:
|
||||
lu —— 返回训练好的线性单元
|
||||
'''
|
||||
# 创建感知器对象,输入参数的个数也就是特征数为 1(工作年限)
|
||||
lu = LinearUnit(1)
|
||||
# 获取构建的数据集
|
||||
input_vecs, labels = get_training_dataset()
|
||||
# 训练感知器,迭代 10 轮,学习率为 0.01
|
||||
lu.train(input_vecs, labels, 10, 0.01)
|
||||
# 返回训练好的线性单元
|
||||
return lu
|
||||
|
||||
|
||||
# 将图像画出来
|
||||
def plot(linear_unit):
|
||||
'''
|
||||
Desc:
|
||||
将我们训练好的线性单元对数据的分类情况作图画出来
|
||||
Args:
|
||||
linear_unit —— 训练好的线性单元
|
||||
Returns:
|
||||
None
|
||||
'''
|
||||
# 引入绘图的库
|
||||
import matplotlib.pyplot as plt
|
||||
# 获取训练数据:特征 input_vecs 与 对应的标签 labels
|
||||
input_vecs, labels = get_training_dataset()
|
||||
# figure() 创建一个 Figure 对象,与用户交互的整个窗口,这个 figure 中容纳着 subplots
|
||||
fig = plt.figure()
|
||||
# 在 figure 对象中创建 1行1列中的第一个图
|
||||
ax = fig.add_subplot(111)
|
||||
# scatter(x, y) 绘制散点图,其中的 x,y 是相同长度的数组序列
|
||||
|
||||
ax.scatter(list(map(lambda x: x[0], input_vecs)), labels)
|
||||
|
||||
# 设置权重
|
||||
weights = linear_unit.weights
|
||||
# 设置偏置项
|
||||
bias = linear_unit.bias
|
||||
|
||||
y1 = 0*linear_unit.weights[0]+linear_unit.bias
|
||||
y2 = 12*linear_unit.weights[0]+ linear_unit.bias
|
||||
# 将图画出来
|
||||
plt.plot([0,12],[y1,y2])
|
||||
|
||||
# 将最终的图展示出来
|
||||
plt.show()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
'''
|
||||
Desc:
|
||||
main 函数,训练我们的线性单元,并进行预测
|
||||
Args:
|
||||
None
|
||||
Returns:
|
||||
None
|
||||
'''
|
||||
# 首先训练我们的线性单元
|
||||
linear_unit = train_linear_unit()
|
||||
# 打印训练获得的权重 和 偏置
|
||||
print(linear_unit)
|
||||
# 测试
|
||||
print('Work 3.4 years, monthly salary = %.2f' % linear_unit.predict([3.4]))
|
||||
print('Work 15 years, monthly salary = %.2f' % linear_unit.predict([15]))
|
||||
print('Work 1.5 years, monthly salary = %.2f' % linear_unit.predict([1.5]))
|
||||
print('Work 6.3 years, monthly salary = %.2f' % linear_unit.predict([6.3]))
|
||||
plot(linear_unit)
|
||||
|
||||
from Perceptron import Perceptron
|
||||
from matplotlib import pyplot as plt
|
||||
#定义激活函数f
|
||||
f = lambda x: x
|
||||
class LinearUnit(Perceptron):
|
||||
def __init__(self, input_num):
|
||||
'''初始化线性单元,设置输入参数的个数'''
|
||||
Perceptron.__init__(self, input_num, f)
|
||||
|
||||
|
||||
def get_train_dataset():
|
||||
input_vecs = [[5],[3],[8],[1.4],[10.1]]
|
||||
labels = [5500,2300,7600,1800,11400]
|
||||
return input_vecs,labels
|
||||
|
||||
def train_linear_unit():
|
||||
lu = LinearUnit(1)
|
||||
input_vecs,labels = get_train_dataset()
|
||||
lu.train(input_vecs,labels,10,0.01)
|
||||
return lu
|
||||
|
||||
'''
|
||||
#画图模块
|
||||
def plot(linear_unit):
|
||||
import matplotlib.pyplot as plt
|
||||
input_vecs, labels = get_training_dataset()
|
||||
fig = plt.figure()
|
||||
ax = fig.add_subplot(111)
|
||||
ax.scatter(map(lambda x: x[0], input_vecs), labels)
|
||||
weights = linear_unit.weights
|
||||
bias = linear_unit.bias
|
||||
x = range(0,12,1)
|
||||
y = map(lambda x:weights[0] * x + bias, x)
|
||||
ax.plot(x, y)
|
||||
plt.show()
|
||||
'''
|
||||
|
||||
if __name__=='__main__':
|
||||
linear_unit = train_linear_unit()
|
||||
input_vecs,labels = get_train_dataset()
|
||||
print(linear_unit)
|
||||
print('Work 3.4 years, monthly salary = %.2f' % linear_unit.predict([3.4]))
|
||||
print('Work 15 years, monthly salary = %.2f' % linear_unit.predict([15]))
|
||||
print('Work 1.5 years, monthly salary = %.2f' % linear_unit.predict([1.5]))
|
||||
print('Work 6.3 years, monthly salary = %.2f' % linear_unit.predict([6.3]))
|
||||
print(linear_unit.weights)
|
||||
plt.scatter(input_vecs,labels)
|
||||
y1 = 0*linear_unit.weights[0]+linear_unit.bias
|
||||
y2 = 12*linear_unit.weights[0]+ linear_unit.bias
|
||||
plt.plot([0,12],[y1,y2])
|
||||
plt.show()
|
||||
336
src/py3.x/dl/lstm.py
Normal file
336
src/py3.x/dl/lstm.py
Normal file
@@ -0,0 +1,336 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: UTF-8 -*-
|
||||
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
from cnn import element_wise_op
|
||||
from activators import SigmoidActivator, TanhActivator, IdentityActivator
|
||||
|
||||
|
||||
class LstmLayer(object):
|
||||
def __init__(self, input_width, state_width,
|
||||
learning_rate):
|
||||
self.input_width = input_width
|
||||
self.state_width = state_width
|
||||
self.learning_rate = learning_rate
|
||||
# 门的激活函数
|
||||
self.gate_activator = SigmoidActivator()
|
||||
# 输出的激活函数
|
||||
self.output_activator = TanhActivator()
|
||||
# 当前时刻初始化为t0
|
||||
self.times = 0
|
||||
# 各个时刻的单元状态向量c
|
||||
self.c_list = self.init_state_vec()
|
||||
# 各个时刻的输出向量h
|
||||
self.h_list = self.init_state_vec()
|
||||
# 各个时刻的遗忘门f
|
||||
self.f_list = self.init_state_vec()
|
||||
# 各个时刻的输入门i
|
||||
self.i_list = self.init_state_vec()
|
||||
# 各个时刻的输出门o
|
||||
self.o_list = self.init_state_vec()
|
||||
# 各个时刻的即时状态c~
|
||||
self.ct_list = self.init_state_vec()
|
||||
# 遗忘门权重矩阵Wfh, Wfx, 偏置项bf
|
||||
self.Wfh, self.Wfx, self.bf = (
|
||||
self.init_weight_mat())
|
||||
# 输入门权重矩阵Wfh, Wfx, 偏置项bf
|
||||
self.Wih, self.Wix, self.bi = (
|
||||
self.init_weight_mat())
|
||||
# 输出门权重矩阵Wfh, Wfx, 偏置项bf
|
||||
self.Woh, self.Wox, self.bo = (
|
||||
self.init_weight_mat())
|
||||
# 单元状态权重矩阵Wfh, Wfx, 偏置项bf
|
||||
self.Wch, self.Wcx, self.bc = (
|
||||
self.init_weight_mat())
|
||||
|
||||
def init_state_vec(self):
|
||||
'''
|
||||
初始化保存状态的向量
|
||||
'''
|
||||
state_vec_list = []
|
||||
state_vec_list.append(np.zeros(
|
||||
(self.state_width, 1)))
|
||||
return state_vec_list
|
||||
|
||||
def init_weight_mat(self):
|
||||
'''
|
||||
初始化权重矩阵
|
||||
'''
|
||||
Wh = np.random.uniform(-1e-4, 1e-4,
|
||||
(self.state_width, self.state_width))
|
||||
Wx = np.random.uniform(-1e-4, 1e-4,
|
||||
(self.state_width, self.input_width))
|
||||
b = np.zeros((self.state_width, 1))
|
||||
return Wh, Wx, b
|
||||
|
||||
def forward(self, x):
|
||||
'''
|
||||
根据式1-式6进行前向计算
|
||||
'''
|
||||
self.times += 1
|
||||
# 遗忘门
|
||||
fg = self.calc_gate(x, self.Wfx, self.Wfh,
|
||||
self.bf, self.gate_activator)
|
||||
self.f_list.append(fg)
|
||||
# 输入门
|
||||
ig = self.calc_gate(x, self.Wix, self.Wih,
|
||||
self.bi, self.gate_activator)
|
||||
self.i_list.append(ig)
|
||||
# 输出门
|
||||
og = self.calc_gate(x, self.Wox, self.Woh,
|
||||
self.bo, self.gate_activator)
|
||||
self.o_list.append(og)
|
||||
# 即时状态
|
||||
ct = self.calc_gate(x, self.Wcx, self.Wch,
|
||||
self.bc, self.output_activator)
|
||||
self.ct_list.append(ct)
|
||||
# 单元状态
|
||||
c = fg * self.c_list[self.times - 1] + ig * ct
|
||||
self.c_list.append(c)
|
||||
# 输出
|
||||
h = og * self.output_activator.forward(c)
|
||||
self.h_list.append(h)
|
||||
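# The gate equations implemented by forward() above (式1-式6, restated):
#   f_t  = sigmoid(Wfh . h_{t-1} + Wfx . x_t + bf)    # forget gate
#   i_t  = sigmoid(Wih . h_{t-1} + Wix . x_t + bi)    # input gate
#   o_t  = sigmoid(Woh . h_{t-1} + Wox . x_t + bo)    # output gate
#   ct_t = tanh(Wch . h_{t-1} + Wcx . x_t + bc)       # candidate state c~
#   c_t  = f_t * c_{t-1} + i_t * ct_t
#   h_t  = o_t * tanh(c_t)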
|
||||
def calc_gate(self, x, Wx, Wh, b, activator):
|
||||
'''
|
||||
计算门
|
||||
'''
|
||||
h = self.h_list[self.times - 1] # 上次的LSTM输出
|
||||
net = np.dot(Wh, h) + np.dot(Wx, x) + b
|
||||
gate = activator.forward(net)
|
||||
return gate
|
||||
|
||||
|
||||
def backward(self, x, delta_h, activator):
|
||||
'''
|
||||
实现LSTM训练算法
|
||||
'''
|
||||
self.calc_delta(delta_h, activator)
|
||||
self.calc_gradient(x)
|
||||
|
||||
def update(self):
|
||||
'''
|
||||
按照梯度下降,更新权重
|
||||
'''
|
||||
self.Wfh -= self.learning_rate * self.Wfh_grad
self.Wfx -= self.learning_rate * self.Wfx_grad
self.bf -= self.learning_rate * self.bf_grad
self.Wih -= self.learning_rate * self.Wih_grad
self.Wix -= self.learning_rate * self.Wix_grad
self.bi -= self.learning_rate * self.bi_grad
self.Woh -= self.learning_rate * self.Woh_grad
self.Wox -= self.learning_rate * self.Wox_grad
self.bo -= self.learning_rate * self.bo_grad
self.Wch -= self.learning_rate * self.Wch_grad
self.Wcx -= self.learning_rate * self.Wcx_grad
self.bc -= self.learning_rate * self.bc_grad
|
||||
|
||||
def calc_delta(self, delta_h, activator):
|
||||
# 初始化各个时刻的误差项
|
||||
self.delta_h_list = self.init_delta() # 输出误差项
|
||||
self.delta_o_list = self.init_delta() # 输出门误差项
|
||||
self.delta_i_list = self.init_delta() # 输入门误差项
|
||||
self.delta_f_list = self.init_delta() # 遗忘门误差项
|
||||
self.delta_ct_list = self.init_delta() # 即时输出误差项
|
||||
|
||||
# 保存从上一层传递下来的当前时刻的误差项
|
||||
self.delta_h_list[-1] = delta_h
|
||||
|
||||
# 迭代计算每个时刻的误差项
|
||||
for k in range(self.times, 0, -1):
|
||||
self.calc_delta_k(k)
|
||||
|
||||
def init_delta(self):
|
||||
'''
|
||||
初始化误差项
|
||||
'''
|
||||
delta_list = []
|
||||
for i in range(self.times + 1):
|
||||
delta_list.append(np.zeros(
|
||||
(self.state_width, 1)))
|
||||
return delta_list
|
||||
|
||||
def calc_delta_k(self, k):
|
||||
'''
|
||||
根据k时刻的delta_h,计算k时刻的delta_f、
|
||||
delta_i、delta_o、delta_ct,以及k-1时刻的delta_h
|
||||
'''
|
||||
# 获得k时刻前向计算的值
|
||||
ig = self.i_list[k]
|
||||
og = self.o_list[k]
|
||||
fg = self.f_list[k]
|
||||
ct = self.ct_list[k]
|
||||
c = self.c_list[k]
|
||||
c_prev = self.c_list[k-1]
|
||||
tanh_c = self.output_activator.forward(c)
|
||||
delta_k = self.delta_h_list[k]
|
||||
|
||||
# 根据式9计算delta_o
|
||||
delta_o = (delta_k * tanh_c *
|
||||
self.gate_activator.backward(og))
|
||||
delta_f = (delta_k * og *
|
||||
(1 - tanh_c * tanh_c) * c_prev *
|
||||
self.gate_activator.backward(fg))
|
||||
delta_i = (delta_k * og *
|
||||
(1 - tanh_c * tanh_c) * ct *
|
||||
self.gate_activator.backward(ig))
|
||||
delta_ct = (delta_k * og *
|
||||
(1 - tanh_c * tanh_c) * ig *
|
||||
self.output_activator.backward(ct))
|
||||
delta_h_prev = (
|
||||
np.dot(delta_o.transpose(), self.Woh) +
|
||||
np.dot(delta_i.transpose(), self.Wih) +
|
||||
np.dot(delta_f.transpose(), self.Wfh) +
|
||||
np.dot(delta_ct.transpose(), self.Wch)
|
||||
).transpose()
|
||||
|
||||
# 保存全部delta值
|
||||
self.delta_h_list[k-1] = delta_h_prev
|
||||
self.delta_f_list[k] = delta_f
|
||||
self.delta_i_list[k] = delta_i
|
||||
self.delta_o_list[k] = delta_o
|
||||
self.delta_ct_list[k] = delta_ct
|
||||
|
||||
def calc_gradient(self, x):
|
||||
# 初始化遗忘门权重梯度矩阵和偏置项
|
||||
self.Wfh_grad, self.Wfx_grad, self.bf_grad = (
|
||||
self.init_weight_gradient_mat())
|
||||
# 初始化输入门权重梯度矩阵和偏置项
|
||||
self.Wih_grad, self.Wix_grad, self.bi_grad = (
|
||||
self.init_weight_gradient_mat())
|
||||
# 初始化输出门权重梯度矩阵和偏置项
|
||||
self.Woh_grad, self.Wox_grad, self.bo_grad = (
|
||||
self.init_weight_gradient_mat())
|
||||
# 初始化单元状态权重梯度矩阵和偏置项
|
||||
self.Wch_grad, self.Wcx_grad, self.bc_grad = (
|
||||
self.init_weight_gradient_mat())
|
||||
|
||||
# 计算对上一次输出h的权重梯度
|
||||
for t in range(self.times, 0, -1):
|
||||
# 计算各个时刻的梯度
|
||||
(Wfh_grad, bf_grad,
|
||||
Wih_grad, bi_grad,
|
||||
Woh_grad, bo_grad,
|
||||
Wch_grad, bc_grad) = (
|
||||
self.calc_gradient_t(t))
|
||||
# 实际梯度是各时刻梯度之和
|
||||
self.Wfh_grad += Wfh_grad
|
||||
self.bf_grad += bf_grad
|
||||
self.Wih_grad += Wih_grad
|
||||
self.bi_grad += bi_grad
|
||||
self.Woh_grad += Woh_grad
|
||||
self.bo_grad += bo_grad
|
||||
self.Wch_grad += Wch_grad
|
||||
self.bc_grad += bc_grad
|
||||
|
||||
# 计算对本次输入x的权重梯度
|
||||
xt = x.transpose()
|
||||
self.Wfx_grad = np.dot(self.delta_f_list[-1], xt)
|
||||
self.Wix_grad = np.dot(self.delta_i_list[-1], xt)
|
||||
self.Wox_grad = np.dot(self.delta_o_list[-1], xt)
|
||||
self.Wcx_grad = np.dot(self.delta_ct_list[-1], xt)
|
||||
|
||||
def init_weight_gradient_mat(self):
|
||||
'''
|
||||
初始化权重矩阵
|
||||
'''
|
||||
Wh_grad = np.zeros((self.state_width,
|
||||
self.state_width))
|
||||
Wx_grad = np.zeros((self.state_width,
|
||||
self.input_width))
|
||||
b_grad = np.zeros((self.state_width, 1))
|
||||
return Wh_grad, Wx_grad, b_grad
|
||||
|
||||
def calc_gradient_t(self, t):
|
||||
'''
|
||||
计算每个时刻t权重的梯度
|
||||
'''
|
||||
h_prev = self.h_list[t-1].transpose()
|
||||
Wfh_grad = np.dot(self.delta_f_list[t], h_prev)
|
||||
bf_grad = self.delta_f_list[t]
|
||||
Wih_grad = np.dot(self.delta_i_list[t], h_prev)
|
||||
bi_grad = self.delta_i_list[t]
|
||||
Woh_grad = np.dot(self.delta_o_list[t], h_prev)
|
||||
bo_grad = self.delta_o_list[t]
|
||||
Wch_grad = np.dot(self.delta_ct_list[t], h_prev)
|
||||
bc_grad = self.delta_ct_list[t]
|
||||
return Wfh_grad, bf_grad, Wih_grad, bi_grad, \
|
||||
Woh_grad, bo_grad, Wch_grad, bc_grad
|
||||
|
||||
def reset_state(self):
|
||||
# 当前时刻初始化为t0
|
||||
self.times = 0
|
||||
# 各个时刻的单元状态向量c
|
||||
self.c_list = self.init_state_vec()
|
||||
# 各个时刻的输出向量h
|
||||
self.h_list = self.init_state_vec()
|
||||
# 各个时刻的遗忘门f
|
||||
self.f_list = self.init_state_vec()
|
||||
# 各个时刻的输入门i
|
||||
self.i_list = self.init_state_vec()
|
||||
# 各个时刻的输出门o
|
||||
self.o_list = self.init_state_vec()
|
||||
# 各个时刻的即时状态c~
|
||||
self.ct_list = self.init_state_vec()
|
||||
|
||||
|
||||
def data_set():
|
||||
x = [np.array([[1], [2], [3]]),
|
||||
np.array([[2], [3], [4]])]
|
||||
d = np.array([[1], [2]])
|
||||
return x, d
|
||||
|
||||
|
||||
def gradient_check():
|
||||
'''
|
||||
梯度检查
|
||||
'''
|
||||
# 设计一个误差函数,取所有节点输出项之和
|
||||
error_function = lambda o: o.sum()
|
||||
|
||||
lstm = LstmLayer(3, 2, 1e-3)
|
||||
|
||||
# 计算forward值
|
||||
x, d = data_set()
|
||||
lstm.forward(x[0])
|
||||
lstm.forward(x[1])
|
||||
|
||||
# 求取sensitivity map
|
||||
sensitivity_array = np.ones(lstm.h_list[-1].shape,
|
||||
dtype=np.float64)
|
||||
# 计算梯度
|
||||
lstm.backward(x[1], sensitivity_array, IdentityActivator())
|
||||
|
||||
# 检查梯度
|
||||
epsilon = 10e-4
|
||||
for i in range(lstm.Wfh.shape[0]):
|
||||
for j in range(lstm.Wfh.shape[1]):
|
||||
lstm.Wfh[i,j] += epsilon
|
||||
lstm.reset_state()
|
||||
lstm.forward(x[0])
|
||||
lstm.forward(x[1])
|
||||
err1 = error_function(lstm.h_list[-1])
|
||||
lstm.Wfh[i,j] -= 2*epsilon
|
||||
lstm.reset_state()
|
||||
lstm.forward(x[0])
|
||||
lstm.forward(x[1])
|
||||
err2 = error_function(lstm.h_list[-1])
|
||||
expect_grad = (err1 - err2) / (2 * epsilon)
|
||||
lstm.Wfh[i,j] += epsilon
|
||||
print('weights(%d,%d): expected - actural %.4e - %.4e' % (
|
||||
i, j, expect_grad, lstm.Wfh_grad[i,j]))
|
||||
return lstm
|
||||
|
||||
|
||||
def test():
|
||||
l = LstmLayer(3, 2, 1e-3)
|
||||
x, d = data_set()
|
||||
l.forward(x[0])
|
||||
l.forward(x[1])
|
||||
l.backward(x[1], d, IdentityActivator())
|
||||
return l
|
||||
|
||||
def test_gradient_check():
|
||||
gradient_check()
|
||||
185
src/py3.x/dl/mnist.py
Normal file
185
src/py3.x/dl/mnist.py
Normal file
@@ -0,0 +1,185 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: UTF-8 -*-
|
||||
|
||||
import struct
|
||||
from fc import *
|
||||
from datetime import datetime
|
||||
import warnings
|
||||
# 忽略 sigmoid 计算时可能出现的数值溢出警告
|
||||
warnings.filterwarnings('ignore')
|
||||
|
||||
|
||||
# 数据加载器基类
|
||||
class Loader(object):
|
||||
def __init__(self, path, count):
|
||||
'''
|
||||
初始化加载器
|
||||
path: 数据文件路径
|
||||
count: 文件中的样本个数
|
||||
'''
|
||||
self.path = path
|
||||
self.count = count
|
||||
|
||||
def get_file_content(self):
|
||||
'''
|
||||
读取文件内容
|
||||
'''
|
||||
f = open(self.path, 'rb')
|
||||
content = f.read()
|
||||
f.close()
|
||||
return list(content)
|
||||
|
||||
def to_int(self, byte):
|
||||
'''
|
||||
将unsigned byte字符转换为整数
|
||||
'''
|
||||
#return struct.unpack('B', byte)[0]
|
||||
return byte
|
||||
|
||||
# 图像数据加载器
|
||||
class ImageLoader(Loader):
|
||||
def get_picture(self, content, index):
|
||||
'''
|
||||
内部函数,从文件中获取图像
|
||||
'''
|
||||
start = index * 28 * 28 + 16
|
||||
picture = []
|
||||
for i in range(28):
|
||||
picture.append([])
|
||||
for j in range(28):
|
||||
picture[i].append(
|
||||
self.to_int(content[start + i * 28 + j]))
|
||||
return picture
|
||||
|
||||
def get_one_sample(self, picture):
|
||||
'''
|
||||
内部函数,将图像转化为样本的输入向量
|
||||
'''
|
||||
sample = []
|
||||
for i in range(28):
|
||||
for j in range(28):
|
||||
sample.append(picture[i][j])
|
||||
return sample
|
||||
|
||||
def load(self):
|
||||
'''
|
||||
加载数据文件,获得全部样本的输入向量
|
||||
'''
|
||||
content = self.get_file_content()
|
||||
data_set = []
|
||||
for index in range(self.count):
|
||||
data_set.append(
|
||||
self.get_one_sample(
|
||||
self.get_picture(content, index)))
|
||||
return data_set
|
||||
|
||||
|
||||
# 标签数据加载器
|
||||
class LabelLoader(Loader):
|
||||
def load(self):
|
||||
'''
|
||||
加载数据文件,获得全部样本的标签向量
|
||||
'''
|
||||
content = self.get_file_content()
|
||||
labels = []
|
||||
for index in range(self.count):
|
||||
labels.append(self.norm(content[index + 8]))
|
||||
return labels
|
||||
|
||||
def norm(self, label):
|
||||
'''
|
||||
内部函数,将一个值转换为10维标签向量
|
||||
'''
|
||||
label_vec = []
|
||||
label_value = self.to_int(label)
|
||||
for i in range(10):
|
||||
if i == label_value:
|
||||
label_vec.append(0.9)
|
||||
else:
|
||||
label_vec.append(0.1)
|
||||
return label_vec
|
||||
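# Example of the encoding produced above: label 3 ->
# [0.1, 0.1, 0.1, 0.9, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]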
|
||||
|
||||
def get_training_data_set():
|
||||
'''
|
||||
获得训练数据集
|
||||
原文为60000的数据集,但训练速度过于缓慢,这里
|
||||
'''
|
||||
image_loader = ImageLoader('./data/train-images-idx3-ubyte', 60000)
|
||||
label_loader = LabelLoader('./data/train-labels-idx1-ubyte', 60000)
|
||||
return image_loader.load(), label_loader.load()
|
||||
|
||||
|
||||
def get_test_data_set():
|
||||
'''
|
||||
获得测试数据集
|
||||
'''
|
||||
image_loader = ImageLoader('./data/t10k-images-idx3-ubyte', 10000)
label_loader = LabelLoader('./data/t10k-labels-idx1-ubyte', 10000)
|
||||
return image_loader.load(), label_loader.load()
|
||||
|
||||
|
||||
def show(sample):
|
||||
str = ''
|
||||
for i in range(28):
|
||||
for j in range(28):
|
||||
if sample[i*28+j] != 0:
|
||||
str += '*'
|
||||
else:
|
||||
str += ' '
|
||||
str += '\n'
|
||||
print(str)
|
||||
|
||||
|
||||
def get_result(vec):
|
||||
max_value_index = 0
|
||||
max_value = 0
|
||||
vec = list(vec)
|
||||
for i in range(len(vec)):
|
||||
if vec[i] > max_value:
|
||||
max_value = vec[i]
|
||||
max_value_index = i
|
||||
return max_value_index
|
||||
|
||||
|
||||
def evaluate(network, test_data_set, test_labels):
|
||||
error = 0
|
||||
total = len(test_data_set)
|
||||
|
||||
for i in range(total):
|
||||
label = get_result(test_labels[i])
|
||||
predict = get_result(network.predict(test_data_set[i]))
|
||||
if label != predict:
|
||||
error += 1
|
||||
return float(error) / float(total)
|
||||
|
||||
|
||||
def now():
|
||||
return datetime.now().strftime('%c')
|
||||
|
||||
|
||||
def train_and_evaluate():
|
||||
last_error_ratio = 1.0
|
||||
epoch = 0
|
||||
train_data_set, train_labels = transpose(get_training_data_set())
|
||||
test_data_set, test_labels = transpose(get_test_data_set())
|
||||
train_data_set =list(train_data_set)
|
||||
train_labels = list(train_labels)
|
||||
test_data_set = list(test_data_set)
|
||||
test_labels = list(test_labels)
|
||||
network = Network([784, 100, 10])
|
||||
while True:
|
||||
epoch += 1
|
||||
network.train(train_labels, train_data_set, 0.01, 1)
|
||||
print('%s epoch %d finished, loss %f' % (now(), epoch,
|
||||
network.loss(train_labels[-1], network.predict(train_data_set[-1]))))
|
||||
if epoch % 2 == 0:
|
||||
error_ratio = evaluate(network, test_data_set, test_labels)
|
||||
print('%s after epoch %d, error ratio is %f' % (now(), epoch, error_ratio))
|
||||
if error_ratio > last_error_ratio:
|
||||
break
|
||||
else:
|
||||
last_error_ratio = error_ratio
|
||||
|
||||
if __name__ == '__main__':
|
||||
train_and_evaluate()
|
||||
199
src/py3.x/dl/perceptron.py
Normal file
199
src/py3.x/dl/perceptron.py
Normal file
@@ -0,0 +1,199 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: UTF-8 -*-
|
||||
|
||||
from functools import reduce
|
||||
|
||||
def add(x,y):
|
||||
return x+y
|
||||
|
||||
|
||||
class Perceptron(object):
|
||||
'''
|
||||
Desc:
|
||||
感知器类
|
||||
Args:
|
||||
None
|
||||
Returns:
|
||||
None
|
||||
'''
|
||||
def __init__(self,input_num,activator):
|
||||
'''
|
||||
Desc:
|
||||
初始化感知器
|
||||
Args:
|
||||
input_num —— 输入参数的个数
|
||||
activator —— 激活函数
|
||||
Returns:
|
||||
None
|
||||
'''
|
||||
# 设置的激活函数
|
||||
self.activator = activator
|
||||
# 权重向量初始化为 0
|
||||
self.weights = [0.0 for _ in range(input_num)]
|
||||
# 偏置项初始化为 0
|
||||
self.bias = 0.0
|
||||
|
||||
def __str__(self):
|
||||
'''
|
||||
Desc:
|
||||
将感知器信息打印出来
|
||||
Args:
|
||||
None
|
||||
Returns:
|
||||
None
|
||||
'''
|
||||
return 'weights\t:%s\nbias\t:%f\n' % (self.weights, self.bias)
|
||||
|
||||
def predict(self,input_vec):
|
||||
'''
|
||||
Desc:
|
||||
输入向量,输出感知器的计算结果
|
||||
Args:
|
||||
input_vec —— 输入向量
|
||||
Returns:
|
||||
感知器的计算结果
|
||||
'''
|
||||
# 将输入向量的计算结果返回
|
||||
# 调用 激活函数 activator ,将输入向量输入,计算感知器的结果
|
||||
# reduce() 函数是 python 2 的内置函数,从 python 3 开始移到了 functools 模块
|
||||
# reduce() 从左到右对一个序列的项累计地应用有两个参数的函数,以此合并序列到一个单一值,例如 reduce(lambda x,y: x+y, [1,2,3,4,5]) 计算的就是 ((((1+2)+3)+4)+5)
|
||||
# map() 接收一个函数 f 和一个 list ,并通过把函数 f 依次作用在 list 的每个元素上,得到一个新的 list 返回。比如我们的 f 函数是计算平方, map(f, [1,2,3,4,5]) ===> 返回 [1,4,9,16,25]
|
||||
# zip() 接收任意多个(包括 0 个和 1个)序列作为参数,返回一个 tuple 列表。例:x = [1,2,3] y = [4,5,6] z = [7,8,9] xyz = zip(x, y, z) ===> [(1,4,7), (2,5,8), (3,6,9)]
|
||||
|
||||
pack = zip(input_vec,self.weights)
|
||||
multi = []
|
||||
for (x,w) in pack:
|
||||
multi.append(x*w)
|
||||
activation = reduce(add, multi)
# 逐项相乘后用 reduce 求和,等价于向量内积 wTx,再加偏置送入激活函数
return self.activator(activation + self.bias)
|
||||
#还有一种更加简洁明了的写法,很清楚明白
|
||||
# return self.activator(sum([x*w for (x,w) in zip(input_vec,self.weights)])+self.bias)
|
||||
|
||||
def train(self,input_vecs,labels,iteration,rate):
|
||||
'''
|
||||
Desc:
|
||||
输入训练数据:一组向量、与每个向量对应的 label; 以及训练轮数、学习率
|
||||
Args:
|
||||
input_vec —— 输入向量
|
||||
labels —— 数据对应的标签
|
||||
iteration —— 训练的迭代轮数
|
||||
rate —— 学习率
|
||||
Returns:
|
||||
None
|
||||
'''
|
||||
for i in range(iteration):
|
||||
self._one_iteration(input_vecs,labels,rate)
|
||||
|
||||
def _one_iteration(self,input_vecs,labels,rate):
|
||||
'''
|
||||
Desc:
|
||||
训练过程的一次迭代过程
|
||||
Args:
|
||||
input_vecs —— 输入向量
|
||||
labels —— 数据对应的标签
|
||||
rate —— 学习率
|
||||
Returns:
|
||||
None
|
||||
'''
|
||||
# zip() 接收任意多个(包括 0 个和 1个)序列作为参数,返回一个 tuple 列表。例:x = [1,2,3] y = [4,5,6] z = [7,8,9] xyz = zip(x, y, z) ===> [(1,4,7), (2,5,8), (3,6,9)]
|
||||
samples = zip(input_vecs, labels)
|
||||
# 对每个样本,按照感知器规则更新权重
|
||||
for (input_vec, label) in samples:
|
||||
# 计算感知器在当前权重下的输出
|
||||
output = self.predict(input_vec)
|
||||
# 更新权重
|
||||
self._update_weights(input_vec, output, label, rate)
|
||||
|
||||
def _update_weights(self,input_vecs,output,labels,rate):
|
||||
'''
|
||||
Desc:
|
||||
按照感知器规则更新权重
|
||||
Args:
|
||||
input_vec —— 输入向量
|
||||
output —— 经过感知器规则计算得到的输出
|
||||
label —— 输入向量对应的标签
|
||||
rate —— 学习率
|
||||
Returns:
|
||||
None
|
||||
'''
|
||||
# 利用感知器规则更新权重
|
||||
|
||||
delta = labels -output
|
||||
# map() 接收一个函数 f 和一个 list ,并通过把函数 f 依次作用在 list 的每个元素上,得到一个新的 list 返回。比如我们的 f 函数是计算平方, map(f, [1,2,3,4,5]) ===> 返回 [1,4,9,16,25]
|
||||
# zip() 接收任意多个(包括 0 个和 1个)序列作为参数,返回一个 tuple 列表。例:x = [1,2,3] y = [4,5,6] z = [7,8,9] xyz = zip(x, y, z) ===> [(1,4,7), (2,5,8), (3,6,9)]
|
||||
# 此处python3必须对map函数进行list操作,不然 self.weights为map类型,最后无法打印出具体数值
|
||||
pack = zip(input_vecs,self.weights)
|
||||
tmp = []
|
||||
for (x,w) in pack:
|
||||
tmp.append(w+x*delta*rate)
|
||||
self.weights = tmp
|
||||
# 更新 bias
|
||||
self.bias = self.bias + delta*rate
|
||||
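# Worked example of the perceptron rule above (assumed numbers): with rate=0.1,
# input_vec=[1, 1], label=1 and output=0, delta=1, so each weight gains
# 0.1 * 1 * 1 = 0.1 and the bias gains 0.1.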
|
def f(x):
    '''
    Desc:
        Define the activation function f
    Args:
        x -- the weighted input (a scalar)
    Returns:
        (a step function) 1 if the input is greater than 0, otherwise 0
    '''
    if x > 0:
        return 1
    else:
        return 0


def get_training_dataset():
    '''
    Desc:
        Build the training set from the truth table of AND
    Args:
        None
    Returns:
        input_vecs -- the input vectors
        labels -- the label of each input vector
    '''
    # training inputs: a list of input vectors
    input_vecs = [[1, 1], [0, 0], [1, 0], [0, 1]]
    # expected outputs: the labels, in one-to-one correspondence with the inputs above
    labels = [1, 0, 0, 0]
    return input_vecs, labels


def train_and_perceptron():
    '''
    Desc:
        Train our perceptron on the truth table of AND
        (named to match the call below; the definition and the call must agree)
    Args:
        None
    Returns:
        p -- the trained perceptron
    '''
    # create a perceptron with 2 inputs (AND is a binary function) and activation f
    p = Perceptron(2, f)
    # train for 10 iterations at a learning rate of 0.1
    input_vecs, labels = get_training_dataset()
    p.train(input_vecs, labels, 10, 0.1)
    # return the trained perceptron
    return p


if __name__ == '__main__':
    '''
    Desc:
        Entry point: train a perceptron and use it for prediction
    Args:
        None
    Returns:
        None
    '''
    # train the AND perceptron
    and_perceptron = train_and_perceptron()
    # print the learned weights
    print(and_perceptron)
    # test
    print('1 and 1 = %d' % and_perceptron.predict([1, 1]))
    print('0 and 0 = %d' % and_perceptron.predict([0, 0]))
    print('1 and 0 = %d' % and_perceptron.predict([1, 0]))
    print('0 and 1 = %d' % and_perceptron.predict([0, 1]))
187
src/py3.x/dl/recursive.py
Normal file
@@ -0,0 +1,187 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-


import numpy as np
from activators import IdentityActivator


class TreeNode(object):
    def __init__(self, data, children=None, children_data=None):
        # avoid mutable default arguments, which would be shared across instances
        self.parent = None
        self.children = children if children is not None else []
        self.children_data = children_data if children_data is not None else []
        self.data = data
        for child in self.children:
            child.parent = self


# recursive (tree-structured) neural network implementation
class RecursiveLayer(object):
    def __init__(self, node_width, child_count,
                 activator, learning_rate):
        '''
        Recursive neural network constructor
        node_width: dimension of the vector at each node
        child_count: number of children per parent node
        activator: activation function object
        learning_rate: learning rate for gradient descent
        '''
        self.node_width = node_width
        self.child_count = child_count
        self.activator = activator
        self.learning_rate = learning_rate
        # weight matrix W
        self.W = np.random.uniform(-1e-4, 1e-4,
                                   (node_width, node_width * child_count))
        # bias term b
        self.b = np.zeros((node_width, 1))
        # root of the tree built by the recursive network
        self.root = None

    def forward(self, *children):
        '''
        forward pass
        '''
        children_data = self.concatenate(children)
        parent_data = self.activator.forward(
            np.dot(self.W, children_data) + self.b
        )
        self.root = TreeNode(parent_data, list(children), children_data)
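    # The forward pass above computes the standard recursive-NN composition:
    # parent = f(W . [c1; c2; ...; c_n] + b), where [c1; ...] is the column-wise
    # concatenation of the child vectors built by concatenate().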

    def backward(self, parent_delta):
        '''
        BPTS (backpropagation through structure)
        '''
        self.calc_delta(parent_delta, self.root)
        self.W_grad, self.b_grad = self.calc_gradient(self.root)

    def update(self):
        '''
        update the weights with SGD
        '''
        self.W -= self.learning_rate * self.W_grad
        self.b -= self.learning_rate * self.b_grad

    def reset_state(self):
        self.root = None

    def concatenate(self, tree_nodes):
        '''
        concatenate the data of the tree nodes into one long column vector
        '''
        concat = np.zeros((0, 1))
        for node in tree_nodes:
            concat = np.concatenate((concat, node.data))
        return concat

    def calc_delta(self, parent_delta, parent):
        '''
        compute the delta of every node
        '''
        parent.delta = parent_delta
        if parent.children:
            # compute each child's delta according to Equation 2
            children_delta = np.dot(self.W.T, parent_delta) * (
                self.activator.backward(parent.children_data)
            )
            # slices = [(child index, start of the child's delta, end of the child's delta)]
            slices = [(i, i * self.node_width,
                       (i + 1) * self.node_width)
                      for i in range(self.child_count)]
            # recurse into calc_delta for every child
            for s in slices:
                self.calc_delta(children_delta[s[1]:s[2]],
                                parent.children[s[0]])

    def calc_gradient(self, parent):
        '''
        compute the weight gradient at every node and sum them
        to obtain the final gradient
        '''
        W_grad = np.zeros((self.node_width,
                           self.node_width * self.child_count))
        b_grad = np.zeros((self.node_width, 1))
        if not parent.children:
            return W_grad, b_grad
        parent.W_grad = np.dot(parent.delta, parent.children_data.T)
        parent.b_grad = parent.delta
        W_grad += parent.W_grad
        b_grad += parent.b_grad
        for child in parent.children:
            W, b = self.calc_gradient(child)
            W_grad += W
            b_grad += b
        return W_grad, b_grad

    def dump(self, **kwArgs):
        print('root.data: %s' % self.root.data)
        print('root.children_data: %s' % self.root.children_data)
        if 'dump_grad' in kwArgs:
            print('W_grad: %s' % self.W_grad)
            print('b_grad: %s' % self.b_grad)


def data_set():
    children = [
        TreeNode(np.array([[1], [2]])),
        TreeNode(np.array([[3], [4]])),
        TreeNode(np.array([[5], [6]]))
    ]
    d = np.array([[0.5], [0.8]])
    return children, d


def gradient_check():
    '''
    gradient check
    '''
    # design an error function: the sum of all output components
    error_function = lambda o: o.sum()

    rnn = RecursiveLayer(2, 2, IdentityActivator(), 1e-3)

    # forward pass
    x, d = data_set()
    rnn.forward(x[0], x[1])
    rnn.forward(rnn.root, x[2])

    # build the sensitivity map
    sensitivity_array = np.ones((rnn.node_width, 1),
                                dtype=np.float64)
    # compute the gradient
    rnn.backward(sensitivity_array)

    # check the gradient numerically
    epsilon = 10e-4
    for i in range(rnn.W.shape[0]):
        for j in range(rnn.W.shape[1]):
            rnn.W[i, j] += epsilon
            rnn.reset_state()
            rnn.forward(x[0], x[1])
            rnn.forward(rnn.root, x[2])
            err1 = error_function(rnn.root.data)
            rnn.W[i, j] -= 2 * epsilon
            rnn.reset_state()
            rnn.forward(x[0], x[1])
            rnn.forward(rnn.root, x[2])
            err2 = error_function(rnn.root.data)
            expect_grad = (err1 - err2) / (2 * epsilon)
            rnn.W[i, j] += epsilon
            print('weights(%d,%d): expected - actual %.4e - %.4e' % (
                i, j, expect_grad, rnn.W_grad[i, j]))
    return rnn


def test():
    children, d = data_set()
    rnn = RecursiveLayer(2, 2, IdentityActivator(), 1e-3)
    rnn.forward(children[0], children[1])
    rnn.dump()
    rnn.forward(rnn.root, children[2])
    rnn.dump()
    rnn.backward(d)
    rnn.dump(dump_grad='true')
    return rnn


def test_gradient_check():
    gradient_check()
155
src/py3.x/dl/rnn.py
Normal file
@@ -0,0 +1,155 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-


import numpy as np
from cnn import element_wise_op
from functools import reduce
from activators import ReluActivator, IdentityActivator


class RecurrentLayer(object):
    def __init__(self, input_width, state_width,
                 activator, learning_rate):
        self.input_width = input_width
        self.state_width = state_width
        self.activator = activator
        self.learning_rate = learning_rate
        self.times = 0        # current time step, initialized to t0
        self.state_list = []  # stores the state of every time step
        self.state_list.append(np.zeros(
            (state_width, 1)))            # initialize s0
        self.U = np.random.uniform(-1e-4, 1e-4,
                                   (state_width, input_width))  # initialize U
        self.W = np.random.uniform(-1e-4, 1e-4,
                                   (state_width, state_width))  # initialize W

    def forward(self, input_array):
        '''
        forward pass according to Equation 2
        '''
        self.times += 1
        state = (np.dot(self.U, input_array) +
                 np.dot(self.W, self.state_list[-1]))
        element_wise_op(state, self.activator.forward)
        self.state_list.append(state)
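        # "Equation 2" referenced above is the standard RNN recurrence, which is
        # exactly what this method computes: s_t = f(U . x_t + W . s_{t-1}).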

    def backward(self, sensitivity_array,
                 activator):
        '''
        BPTT (backpropagation through time)
        '''
        self.calc_delta(sensitivity_array, activator)
        self.calc_gradient()

    def update(self):
        '''
        update the weights with gradient descent
        '''
        self.W -= self.learning_rate * self.gradient

    def calc_delta(self, sensitivity_array, activator):
        self.delta_list = []  # stores the error term of every time step
        for i in range(self.times):
            self.delta_list.append(np.zeros(
                (self.state_width, 1)))
        self.delta_list.append(sensitivity_array)
        # iterate backwards to compute the error term at every time step
        for k in range(self.times - 1, 0, -1):
            self.calc_delta_k(k, activator)

    def calc_delta_k(self, k, activator):
        '''
        compute the delta at time k from the delta at time k+1
        '''
        state = self.state_list[k + 1].copy()
        element_wise_op(self.state_list[k + 1],
                        activator.backward)
        self.delta_list[k] = np.dot(
            np.dot(self.delta_list[k + 1].T, self.W),
            np.diag(state[:, 0])).T

    def calc_gradient(self):
        self.gradient_list = []  # stores the weight gradient of every time step
        for t in range(self.times + 1):
            self.gradient_list.append(np.zeros(
                (self.state_width, self.state_width)))
        for t in range(self.times, 0, -1):
            self.calc_gradient_t(t)
        # the actual gradient is the sum of the per-step gradients
        self.gradient = reduce(
            lambda a, b: a + b, self.gradient_list,
            self.gradient_list[0])  # element [0] was initialized to 0 and never modified

    def calc_gradient_t(self, t):
        '''
        compute the weight gradient at time step t
        '''
        gradient = np.dot(self.delta_list[t],
                          self.state_list[t - 1].T)
        self.gradient_list[t] = gradient

    def reset_state(self):
        self.times = 0        # current time step, reset to t0
        self.state_list = []  # stores the state of every time step
        self.state_list.append(np.zeros(
            (self.state_width, 1)))       # initialize s0


def data_set():
    x = [np.array([[1], [2], [3]]),
         np.array([[2], [3], [4]])]
    d = np.array([[1], [2]])
    return x, d


def gradient_check():
    '''
    gradient check
    '''
    # design an error function: the sum of all output components
    error_function = lambda o: o.sum()

    rl = RecurrentLayer(3, 2, IdentityActivator(), 1e-3)

    # forward pass
    x, d = data_set()
    rl.forward(x[0])
    rl.forward(x[1])

    # build the sensitivity map
    sensitivity_array = np.ones(rl.state_list[-1].shape,
                                dtype=np.float64)
    # compute the gradient
    rl.backward(sensitivity_array, IdentityActivator())

    # check the gradient numerically
    epsilon = 10e-4
    for i in range(rl.W.shape[0]):
        for j in range(rl.W.shape[1]):
            rl.W[i, j] += epsilon
            rl.reset_state()
            rl.forward(x[0])
            rl.forward(x[1])
            err1 = error_function(rl.state_list[-1])
            rl.W[i, j] -= 2 * epsilon
            rl.reset_state()
            rl.forward(x[0])
            rl.forward(x[1])
            err2 = error_function(rl.state_list[-1])
            expect_grad = (err1 - err2) / (2 * epsilon)
            rl.W[i, j] += epsilon
            print('weights(%d,%d): expected - actual %f - %f' % (
                i, j, expect_grad, rl.gradient[i, j]))


def test():
    l = RecurrentLayer(3, 2, ReluActivator(), 1e-3)
    x, d = data_set()
    l.forward(x[0])
    l.forward(x[1])
    l.backward(d, ReluActivator())
    return l
58
src/py3.x/ml/1.MLFoundation/NumPy.py
Normal file
@@ -0,0 +1,58 @@
#!/usr/bin/python
# coding:utf-8

'''
Created on 2017-05-18
Update on 2017-11-17
Author: Peter Harrington/1988/片刻
GitHub: https://github.com/apachecn/AiLearning
'''

from numpy import random, mat, eye


'''
# The difference between NumPy matrices and arrays
NumPy has two distinct data types:
    1. matrix
    2. array
What they share:
    both hold numeric elements arranged in rows and columns
How they differ:
    1. The same operation on the two types can produce different results.
    2. NumPy's matrix type is the equivalent of MATLAB's matrices.
'''
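# A quick, hedged illustration of point 1 above (demoArr is a throwaway name,
# not part of the original script): on arrays '*' is element-wise, while on
# matrices '*' is the true matrix product, so the two results generally differ.
demoArr = random.rand(2, 2)
print('array * array (element-wise):\n', demoArr * demoArr)
print('mat * mat (matrix product):\n', mat(demoArr) * mat(demoArr))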

# generate a random 4*4 array
randArray = random.rand(4, 4)

# conversion: turn the array into a matrix
randMat = mat(randArray)
'''
.I inverts the matrix (e.g. via elementary row operations)
    Meaning: the inverse is a tool for judging similarity. Multiplying the inverse of A
    by a column vector p yields a column vector q whose i-th component measures how
    similar p is to the i-th column vector of A.
    Reference links:
        https://www.zhihu.com/question/33258489
        http://blog.csdn.net/vernice/article/details/48506027
.T transposes the matrix (rows and columns swapped)
    * equivalent to: .transpose()
.A returns the array underlying the matrix
    Reference link:
        http://blog.csdn.net/qq403977698/article/details/47254539
'''
invRandMat = randMat.I
TraRandMat = randMat.T
ArrRandMat = randMat.A
# print the results
print('randArray=(%s) \n' % type(randArray), randArray)
print('randMat=(%s) \n' % type(randMat), randMat)
print('invRandMat=(%s) \n' % type(invRandMat), invRandMat)
print('TraRandMat=(%s) \n' % type(TraRandMat), TraRandMat)
print('ArrRandMat=(%s) \n' % type(ArrRandMat), ArrRandMat)
# multiply the matrix by its inverse (in theory this gives the 4*4 identity matrix:
# ones on the diagonal, zeros everywhere else)
myEye = randMat*invRandMat
# the numerical error
print(myEye - eye(4))

'''
If the code above runs without problems, NumPy is installed correctly.
'''
24
src/py3.x/ml/10.kmeans/__init__.py
Normal file
@@ -0,0 +1,24 @@
#!/usr/bin/env python
__coding__ = "utf-8"
__author__ = "Ng WaiMing"

from training.action.unsupervised.kMeans import kMeans
from numpy import *

if __name__ == '__main__':
    # dataMat = mat(kMeans.loadDataSet('../../../../data/k-means/testSet.txt'))
    # print('min(dataMat[:, 0])', min(dataMat[:, 0]), '\n')
    # print('min(dataMat[:, 1])', min(dataMat[:, 1]), '\n')
    # print('max(dataMat[:, 0])', max(dataMat[:, 0]), '\n')
    # print('max(dataMat[:, 1])', max(dataMat[:, 1]), '\n')
    # print(kMeans.randCent(dataMat, 2),'\n')
    # print(kMeans.distEclud(dataMat[0],dataMat[1]))
    # centroids, clusterAssment = kMeans.kMeans(dataMat, 4)
    # print('centroids:\n', centroids, '\n')
    # print('clusterAssment:\n',clusterAssment, '\n')
    # dataMat3 = mat(kMeans.loadDataSet('../../../../data/k-means/testSet2.txt'))
    # centList, myNewAssments = kMeans.biKmeans(dataMat3, 3)
    # print('centList: \n', centList, '\n')
    fileName = '../../../../data/k-means/places.txt'
    imgName = '../../../../data/k-means/Portland.png'
    kMeans.clusterClubs(fileName=fileName, imgName=imgName, numClust=5)
52
src/py3.x/ml/10.kmeans/k-means.md
Normal file
@@ -0,0 +1,52 @@
### K-means clustering
- Pros: easy to implement
- Cons: may converge to a local minimum; converges slowly on large data sets
- Works with: numeric values
**k-means is an algorithm that discovers K clusters in a given data set. The number of clusters K is user-defined, and each cluster is described by its "centroid", the center of all the points in that cluster.**
### How K-means works
- First, K initial points are chosen at random as centroids. Every point in the data set is then assigned to a cluster: concretely, each point is given to the centroid it is closest to. Once this step is done, each centroid is updated to the mean of all the points assigned to it. A sketch of one such pass follows the pseudocode below.

- Pseudocode for the process above:

        create k points as initial centroids (often chosen at random)
        while the cluster assignment of any point has changed:
            for every data point:
                for every centroid:
                    compute the distance between the centroid and the point
                assign the point to the cluster with the nearest centroid
            for every cluster, compute the mean of its points and use it as the new centroid
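- A minimal NumPy sketch of one assignment/update pass (illustrative only; the names `assign` and `update` are hypothetical and not part of this repo):

        import numpy as np

        def assign(points, centroids):
            # index of the nearest centroid for every point (m x k distance table)
            d = ((points[:, None, :] - centroids[None, :, :]) ** 2).sum(-1)
            return d.argmin(axis=1)

        def update(points, idx, k):
            # move each centroid to the mean of its assigned points
            # (assumes no cluster ends up empty)
            return np.array([points[idx == j].mean(axis=0) for j in range(k)])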
### The general K-means pipeline
1. Collect: any method
2. Prepare: numeric values are needed to compute distances; nominal values can be mapped to binary values and then used for distance calculation
3. Analyze: any method
4. Train: does not apply; unsupervised learning has no training step
5. Test: run the clustering algorithm and inspect the result. Quantitative error measures such as the sum of squared errors can evaluate the outcome
6. Use: whatever the application calls for. Typically the cluster centroids can stand in for the whole cluster's data when making decisions
### Using post-processing to improve clustering
- The matrix holding the cluster assignments also stores the error of every point, i.e. the squared distance from the point to its cluster centroid. This error can be used to judge whether the user-chosen parameter K is right and whether the resulting clusters are good
- SSE (Sum of Squared Error): a measure of clustering quality.
- The smaller the SSE, the closer the points lie to their centroids and the better the clustering. Because the error is squared, points far from their center weigh more. One way that is guaranteed to lower the SSE is to increase the number of clusters, but that defeats the purpose: the goal of clustering is to improve cluster quality while keeping the number of clusters fixed
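- In symbols (the standard definition, matching the squared distances stored per point above): $SSE=\sum_{i=1}^{m}\lVert x_i-c_{k(i)}\rVert^2$, where $c_{k(i)}$ is the centroid of the cluster that point $x_i$ is assigned to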
- To keep the total number of clusters fixed, two clusters can be merged. On two-dimensional data the clustering is easy to visualize; in higher dimensions there are two quantifiable approaches: merge the two closest centroids, or merge the two centroids whose merger increases the SSE the least. The first approach computes the distance between all pairs of centroids and merges the closest pair. The second requires merging two clusters and then computing the total SSE; this has to be repeated over every possible pair of clusters until the best pair to merge is found
### Bisecting k-means
- Bisecting k-means exists to overcome k-means' tendency to converge to a local minimum. It starts with all points in a single cluster and splits that cluster in two. It then picks one of the clusters to split further, choosing the cluster whose split lowers the SSE the most. This SSE-based splitting repeats until the user-specified number of clusters is reached

- Pseudocode for bisecting k-means:

        treat all points as one cluster
        while the number of clusters is less than k:
            for every cluster:
                compute the total error
                run k-means (k=2) on the cluster
                compute the total error after splitting the cluster in two
            split the cluster whose split yields the lowest error

### Applying bisecting k-means to geographic data
1. Collect: use the Yahoo! PlaceFinder API to gather the data
2. Prepare: keep only the latitude and longitude
3. Analyze: use Matplotlib to build a 2-D plot with the clusters drawn over a map
4. Train: does not apply to unsupervised learning
5. Test: use the biKmeans() function
6. Use: the final output is a map with the clusters and their centers

### Related formulas
**Euclidean distance:** $d=\sqrt{(xA_0-xB_0)^2+(xA_1-xB_1)^2}$
228
src/py3.x/ml/10.kmeans/kMeans.py
Normal file
@@ -0,0 +1,228 @@
#!/usr/bin/env python
__coding__ = "utf-8"
__author__ = "Ng WaiMing"

from numpy import *
from time import sleep
import matplotlib
from matplotlib import pyplot as plt


def loadDataSet(fileName):
    '''
    load the data set
    :param fileName:
    :return:
    '''
    # initialize an empty list
    dataSet = []
    # open the file
    fr = open(fileName)
    # loop over every line in the file
    for line in fr.readlines():
        # split each line into fields
        curLine = line.strip().split('\t')
        # convert the fields to floats so they can be used in calculations later
        # fltLine = [float(x) for x in curLine]
        fltLine = list(map(float, curLine))  # map every element to float
        # append the row to dataSet
        dataSet.append(fltLine)
    # return dataSet
    return dataSet


def distEclud(vecA, vecB):
    '''
    Euclidean distance
    :param vecA:
    :param vecB:
    :return:
    '''
    return sqrt(sum(power(vecA - vecB, 2)))


def randCent(dataMat, k):
    '''
    Build a set of K random centroids for the given data set.
    The random centroids must lie within the bounds of the data set, which is
    ensured by finding the minimum and maximum of each dimension.
    Random values between 0 and 1.0 are then scaled by the range and shifted
    by the minimum, so the random points stay inside the data's bounds.
    :param dataMat:
    :param k:
    :return:
    '''
    # number of samples and features
    m, n = shape(dataMat)
    # initialize the centroids: a (k, n) matrix filled with zeros
    centroids = mat(zeros((k, n)))
    # loop over the features
    for j in range(n):
        # minimum of the column
        minJ = min(dataMat[:, j])
        # range of the column
        rangeJ = float(max(dataMat[:, j]) - minJ)
        # compute the column of centroid coordinates and assign it to centroids
        centroids[:, j] = mat(minJ + rangeJ * random.rand(k, 1))
    # return the centroids
    return centroids


def kMeans(dataMat, k, distMeas=distEclud, createCent=randCent):
    '''
    Create K centroids, assign each point to the nearest centroid, then recompute
    the centroids. This repeats until the cluster assignments stop changing.
    :param dataMat: the data set
    :param k: number of clusters
    :param distMeas: distance measure
    :param createCent: builds the initial centroids
    :return:
    '''
    # number of samples and features
    m, n = shape(dataMat)
    # initialize a matrix that stores the cluster assignment of every point
    # clusterAssment has two columns: the cluster index, and the error (the squared
    # distance from the point to its centroid, used later to evaluate the clustering)
    clusterAssment = mat(zeros((m, 2)))
    # create k random centroids
    centroids = createCent(dataMat, k)
    # flag deciding whether to keep iterating; True means continue
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        # find the nearest centroid for every point by looping over all points
        # and computing the distance from each point to every centroid
        for i in range(m):
            minDist = inf
            minIndex = -1
            for j in range(k):
                # distance from the point to the centroid,
                # using the formula passed in as distMeas (default distEclud)
                distJI = distMeas(centroids[j, :], dataMat[i, :])
                # if the distance beats minDist, update minDist and the nearest centroid's index
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            # if any point's cluster assignment changed, set the clusterChanged flag
            if clusterAssment[i, 0] != minIndex: clusterChanged = True
            # record the index of the nearest centroid and the squared minimum distance
            clusterAssment[i, :] = minIndex, minDist ** 2
        # print(centroids)
        # update the value of every centroid
        for cent in range(k):
            # filter out the points belonging to the given cluster
            ptsInClust = dataMat[nonzero(clusterAssment[:, 0].A == cent)[0]]
            # mean of those points; axis=0 averages down the columns of the matrix
            centroids[cent, :] = mean(ptsInClust, axis=0)
    # return the centroids and the point assignments
    return centroids, clusterAssment


def biKmeans(dataMat, k, distMeas=distEclud):
    '''
    Given a data set, the desired number of clusters, and a distance measure,
    return the clustering result
    :param dataMat:
    :param k:
    :param distMeas:
    :return:
    '''
    m, n = shape(dataMat)
    # a matrix storing the cluster assignment and squared error of every point
    clusterAssment = mat(zeros((m, 2)))
    # compute the centroid of the whole data set and keep all centroids in a list
    centroid0 = mean(dataMat, axis=0).tolist()[0]
    centList = [centroid0]
    # loop over all points to compute each point's error against the centroid
    for j in range(m):
        clusterAssment[j, 1] = distMeas(mat(centroid0), dataMat[j, :]) ** 2
    # keep splitting clusters until the desired number of clusters is reached
    while (len(centList) < k):
        # initialize the lowest SSE to infinity, for comparing SSE before and after a split
        lowestSSE = inf
        # the current number of clusters is the length of the centroid list;
        # loop over all clusters to decide which one is best to split
        for i in range(len(centList)):
            # treat all the points of the cluster as a small data set of its own
            ptsInCurrCluster = dataMat[nonzero(clusterAssment[:, 0].A == i)[0], :]
            # feed ptsInCurrCluster to kMeans with k=2;
            # kMeans produces two centroids (clusters) plus the error of each cluster
            centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeas)
            # the error of this split is the split error plus the error of the remaining data
            sseSplit = sum(splitClustAss[:, 1])
            sseNotSplit = sum(clusterAssment[nonzero(clusterAssment[:, 0].A != i)[0], 1])
            print('sseSplit, and notSplit: ', sseSplit, sseNotSplit)
            # if this split yields the lowest SSE so far, keep it
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        # apply the best split found
        # calling kMeans with k=2 yields two result clusters numbered 0 and 1
        bestClustAss[nonzero(bestClustAss[:, 0].A == 1)[0], 0] = len(centList)
        # renumber them to the final cluster indices
        bestClustAss[nonzero(bestClustAss[:, 0].A == 0)[0], 0] = bestCentToSplit
        print('the bestCentToSplit is: ', bestCentToSplit)
        print('the len of bestClustAss is: ', len(bestClustAss))
        # update the centroid list:
        # replace the i-th centroid with the first centroid produced by the 2-means split
        centList[bestCentToSplit] = bestNewCents[0, :].tolist()[0]
        # append the second centroid produced by the split
        centList.append(bestNewCents[1, :].tolist()[0])
        # reassign the points (and SSE values) of the cluster that was split
        clusterAssment[nonzero(clusterAssment[:, 0].A == bestCentToSplit)[0], :] = bestClustAss
    return mat(centList), clusterAssment


def distSLC(vecA, vecB):
    '''
    Return the distance between two points on the Earth's surface, in kilometers
    (the code multiplies by the Earth's radius of 6371 km, so the result is in
    kilometers, not miles).
    Given the longitude and latitude of two points, the spherical law of cosines
    gives the distance between them.
    :param vecA:
    :param vecB:
    :return:
    '''
    # longitude and latitude come in degrees, but sin() and cos() expect radians;
    # dividing the angle by 180 and multiplying by pi converts it to radians
    a = sin(vecA[0, 1] * pi / 180) * sin(vecB[0, 1] * pi / 180)
    b = cos(vecA[0, 1] * pi / 180) * cos(vecB[0, 1] * pi / 180) * \
        cos(pi * (vecB[0, 0] - vecA[0, 0]) / 180)
    return arccos(a + b) * 6371.0

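# Written out, the two lines above implement the spherical law of cosines:
#     d = R * arccos( sin(latA)*sin(latB) + cos(latA)*cos(latB)*cos(lonB - lonA) )
# with R = 6371 km and, in this file's layout, longitude at index 0 and latitude at index 1.
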
def clusterClubs(fileName, imgName, numClust=5):
    '''
    Bundle the text-file parsing, the clustering, and the plotting together
    :param fileName: path to the text data
    :param imgName: path to the map image
    :param numClust: desired number of clusters
    :return:
    '''
    # create an empty list
    datList = []
    # read columns 4 and 5 of the text file, which hold the latitude and the
    # longitude, and pack the values into datList
    for line in open(fileName).readlines():
        lineArr = line.split('\t')
        datList.append([float(lineArr[4]), float(lineArr[3])])
    datMat = mat(datList)
    # call biKmeans with distSLC as the distance measure used during clustering
    myCentroids, clustAssing = biKmeans(datMat, numClust, distMeas=distSLC)
    # create a figure and a rectangle deciding which part of the figure to draw on
    fig = plt.figure()
    rect = [0.1, 0.1, 0.8, 0.8]
    # build a list of marker shapes for the scatter plot
    scatterMarkers = ['s', 'o', '^', '8', 'p', 'd', 'v', 'h', '>', '<']
    axprops = dict(xticks=[], yticks=[])
    ax0 = fig.add_axes(rect, label='ax0', **axprops)
    # use imread to build a matrix from the image
    imgP = plt.imread(imgName)
    # draw that matrix with imshow
    ax0.imshow(imgP)
    # draw a second plot on the same figure, which allows two coordinate systems
    # without any scaling or shifting
    ax1 = fig.add_axes(rect, label='ax1', frameon=False)
    # loop over the clusters and draw each of them, picking the marker shape
    # from the scatterMarkers list built above
    for i in range(numClust):
        ptsInCurrCluster = datMat[nonzero(clustAssing[:, 0].A == i)[0], :]
        # index with i % len(scatterMarkers), so with more clusters than markers
        # the marker shapes simply cycle
        markerStyle = scatterMarkers[i % len(scatterMarkers)]
        ax1.scatter(ptsInCurrCluster[:, 0].flatten().A[0], ptsInCurrCluster[:, 1].flatten().A[0], marker=markerStyle,
                    s=90)
    # mark the cluster centers with a cross
    ax1.scatter(myCentroids[:, 0].flatten().A[0], myCentroids[:, 1].flatten().A[0], marker='+', s=300)
    plt.show()
24
src/py3.x/ml/10.kmeans/kMeansSklearn.py
Normal file
@@ -0,0 +1,24 @@
# -*- coding:UTF-8 -*-

import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# load the data set
dataMat = []
fr = open("data/10.KMeans/testSet.txt")  # note: this is a relative path; make sure to run from the MachineLearning directory.
for line in fr.readlines():
    curLine = line.strip().split('\t')
    fltLine = list(map(float, curLine))  # map every element to float
    dataMat.append(fltLine)

# train the model
km = KMeans(n_clusters=4)  # initialize
km.fit(dataMat)  # fit
km_pred = km.predict(dataMat)  # predict
centers = km.cluster_centers_  # centroids

# visualize the result
plt.scatter(np.array(dataMat)[:, 1], np.array(dataMat)[:, 0], c=km_pred)
plt.scatter(centers[:, 1], centers[:, 0], c="r")
plt.show()
43
src/py3.x/ml/10.kmeans/test.txt
Normal file
@@ -0,0 +1,43 @@
# import
>>> import kMeans
>>> from numpy import *

# build the matrix from the text file: load the test data set
>>> datMat=mat(kMeans.loadDataSet('testSet.txt'))

# check that the randCent() function runs correctly.
# first, look at the minimum and maximum values in the matrix
>>> min(datMat[:,0])
matrix([[-5.379713]])
>>> min(datMat[:,1])
matrix([[-4.232586]])
>>> max(datMat[:,1])
matrix([[ 5.1904]])
>>> max(datMat[:,0])
matrix([[ 4.838138]])

# then check whether randCent() produces values between min and max
>>> kMeans.randCent(datMat, 2)
matrix([[-3.59997714, -1.43558065],
        [-3.03744979,  4.35541488]])

# finally, test the distance computation
>>> kMeans.distEclud(datMat[0], datMat[1])
5.184632816681332

# The algorithm creates k centroids, assigns each point to the nearest centroid,
# then recomputes the centroids.
# This process repeats until the cluster assignments of the points stop changing.
# Output (results may differ between runs because of the random centroids, but the
# overall result is right, since the data is similar enough)
>>> myCentroids, clustAssing = kMeans.kMeans(datMat, 4)
[[ 0.15357605 -0.94962877]
 [ 3.3593825   1.05965957]
 [-2.41900657  3.30513371]
 [-2.80505526 -3.73280289]]
[[ 2.35622556 -3.02056425]
 [ 2.95373358  2.32801413]
 [-2.46154315  2.78737555]
 [-3.38237045 -2.9473363 ]]
[[ 2.65077367 -2.79019029]
 [ 2.6265299   3.10868015]
 [-2.46154315  2.78737555]
 [-3.53973889 -2.89384326]]
369
src/py3.x/ml/11.Apriori/apriori.py
Normal file
@@ -0,0 +1,369 @@
#!/usr/bin/python
# coding: utf8

'''
Created on Mar 24, 2011
Update on 2017-05-18
Ch 11 code
Author: Peter/片刻
GitHub: https://github.com/apachecn/AiLearning'''
print(__doc__)
from numpy import *


# load the data set
def loadDataSet():
    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]


# Build the candidate set C1: deduplicate dataSet, sort it, put it in a list,
# then turn every element into a frozenset
def createC1(dataSet):
    """createC1 (build the candidate set C1)

    Args:
        dataSet  the raw data set
    Returns:
        a list of frozensets
    """
    C1 = []
    for transaction in dataSet:
        for item in transaction:
            if [item] not in C1:
                # append every element that has not appeared in C1 yet
                C1.append([item])
    # sort the list in ascending order
    # print('before sort=', C1)
    C1.sort()
    # a frozenset is an immutable set, so it can be used as a dictionary key
    # print('after sort=', C1)
    # In Python 3, map() returns a one-shot iterator; scanD() walks the candidates
    # once per transaction, so the result must be materialized with list() here.
    return list(map(frozenset, C1))


# Compute the support of the candidate sets Ck over the data set D, and return
# the candidates whose support is at least the minimum support (minSupport)
def scanD(D, Ck, minSupport):
    """scanD (compute the support of the candidates Ck over the data set D and
    return the ones whose support is at least minSupport)

    Args:
        D           the data set
        Ck          list of candidate itemsets
        minSupport  minimum support
    Returns:
        retList     itemsets with support >= minSupport
        supportData support of every candidate itemset
    """
    # ssCnt temporarily holds the counts of the candidates, e.g. a->10, b->5, c->8
    ssCnt = {}
    for tid in D:
        for can in Ck:
            # s.issubset(t) tests whether every element of s is in t
            if can.issubset(tid):
                # dict.has_key() is Python 2 only; use the `in` operator instead
                if can not in ssCnt:
                    ssCnt[can] = 1
                else:
                    ssCnt[can] += 1
    numItems = float(len(D))  # number of transactions in D
    retList = []
    supportData = {}
    for key in ssCnt:
        # support = occurrences of the candidate (key) / number of transactions
        support = ssCnt[key]/numItems
        if support >= minSupport:
            # insert at the front of retList; only frequent itemsets are kept here
            retList.insert(0, key)
        # record every candidate (key) together with its support
        supportData[key] = support
    return retList, supportData


# Take the list of frequent itemsets Lk and the itemset size k, and produce all
# possible candidate itemsets Ck
def aprioriGen(Lk, k):
    """aprioriGen (take the frequent itemsets Lk and the itemset size k, and
    produce the candidate itemsets Ck.
    For example: with {0},{1},{2} as input and k = 2 the output is {0,1}, {0,2}, {1,2};
    with {0,1},{0,2},{1,2} as input and k = 3 the output is {0,1,2}.
    Each candidate is generated exactly once; there is no generate-then-deduplicate
    step, which makes this the more efficient algorithm.)

    Args:
        Lk  list of frequent itemsets
        k   size of the itemsets to return (sets merge when their first k-2 elements match)
    Returns:
        retList  the pairwise-merged itemsets
    """
    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i+1, lenLk):
            L1 = list(Lk[i])[: k-2]
            L2 = list(Lk[j])[: k-2]
            # print('-----i=', i, k-2, Lk, Lk[i], list(Lk[i])[: k-2])
            # print('-----j=', j, k-2, Lk, Lk[j], list(Lk[j])[: k-2])
            L1.sort()
            L2.sort()
            # on the first pass L1 and L2 are empty, so elements merge directly,
            # producing all pairwise unions
            # if first k-2 elements are equal
            if L1 == L2:
                # set union
                # print('union=', Lk[i] | Lk[j], Lk[i], Lk[j])
                retList.append(Lk[i] | Lk[j])
    return retList


# Find the candidate itemsets in dataSet whose support is >= the minimum support,
# together with their supports: the frequent itemsets.
def apriori(dataSet, minSupport=0.5):
    """apriori (first build the candidate set C1, then scan the data set to check
    which single-element itemsets meet the minimum support; those form L1. The
    elements of L1 are combined into C2, C2 is filtered into L2, and so on, until
    some Ck comes out empty. At that point the support of every frequent itemset
    has been found.)

    Args:
        dataSet     the raw data set
        minSupport  the support threshold
    Returns:
        L            all frequent itemsets
        supportData  support of every itemset encountered
    """
    # C1: dataSet deduplicated, sorted, put in a list, every element a frozenset
    C1 = createC1(dataSet)
    # print('C1: ', C1)
    # turn every transaction into a set, and materialize the result: scanD() is
    # called on D repeatedly below, and in Python 3 a bare map() iterator would
    # be exhausted after the first call
    D = list(map(set, dataSet))
    # print('D=', D)
    # compute the support of C1 over D and keep the itemsets above minSupport
    L1, supportData = scanD(D, C1, minSupport)
    # print("L1=", L1, "\n", "outcome: ", supportData)

    # L gets one extra level of nesting; L is a list of lists
    L = [L1]
    k = 2
    # Check whether entry k-2 of L is non-empty. On the first pass
    # L = [[frozenset([1]), frozenset([3]), frozenset([2]), frozenset([5])]], so
    # L[k-2] = L[0] = [frozenset([1]), frozenset([3]), frozenset([2]), frozenset([5])];
    # k += 1 happens at the bottom of the loop
    while (len(L[k-2]) > 0):
        # print('k=', k, L, L[k-2])
        Ck = aprioriGen(L[k-2], k)  # e.g. with {0},{1},{2} and k = 2 this yields {0,1}, {0,2}, {1,2}; with {0,1},{0,2},{1,2} and k = 3 it yields {0,1,2}
        # print('Ck', Ck)

        Lk, supK = scanD(D, Ck, minSupport)  # compute the support of Ck over D and keep the itemsets above minSupport
        # record the support of every candidate: update() adds new keys and overwrites existing ones
        supportData.update(supK)
        if len(Lk) == 0:
            break
        # Lk holds the frequent itemsets of size k; L keeps growing, e.g.:
        # l=[[set(1), set(2), set(3)]]
        # l=[[set(1), set(2), set(3)], [set(1, 2), set(2, 3)]]
        L.append(Lk)
        k += 1
        # print('k=', k, len(L[k-2]))
    return L, supportData

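# Worked example, hand-checked against loadDataSet() above with minSupport=0.5
# (4 transactions, so an itemset must appear in at least 2 of them):
#   singles -> {1}:0.5  {2}:0.75  {3}:0.75  {5}:0.75   ({4}:0.25 is dropped)
#   pairs   -> {1,3}:0.5  {2,3}:0.5  {2,5}:0.75  {3,5}:0.5
#   triples -> {2,3,5}:0.5
# and for the confidence computed below, e.g. conf({5} -> {2}) =
# support({2,5}) / support({5}) = 0.75 / 0.75 = 1.0.
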
# compute the confidence of rules
def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    """calcConf (for a frequent itemset of two or more elements, compute the
    confidence of its rules, e.g. {1,2}/{1} or {1,2}/{2}, and check the threshold)

    Args:
        freqSet      a frequent itemset, e.g. frozenset([1, 3])
        H            the candidate consequents, e.g. [frozenset([1]), frozenset([3])]
        supportData  support of every itemset
        brl          the (initially empty) list of association rules
        minConf      minimum confidence
    Returns:
        prunedH      the consequents whose confidence meets the threshold
    """
    # consequents whose confidence is at least the minimum confidence (minConf)
    prunedH = []
    for conseq in H:  # suppose freqSet = frozenset([1, 3]) and H = [frozenset([1]), frozenset([3])]; we need the confidence of frozenset([1]) -> frozenset([3]) and of frozenset([3]) -> frozenset([1])
        # print('confData=', freqSet, H, conseq, freqSet-conseq)
        conf = supportData[freqSet]/supportData[freqSet-conseq]  # confidence (not support) of a -> b = support(a | b) / support(a). With freqSet = frozenset([1, 3]) and conseq = frozenset([1]), the confidence of frozenset([1]) -> frozenset([3]) is support(a | b) / support(a) = supportData[freqSet]/supportData[freqSet-conseq] = supportData[frozenset([1, 3])] / supportData[frozenset([1])]
        if conf >= minConf:
            # whoever buys the freqSet-conseq items also buys the conseq items
            # (freqSet-conseq and conseq together make up the full itemset)
            print(freqSet-conseq, '-->', conseq, 'conf:', conf)
            brl.append((freqSet-conseq, conseq, conf))
            prunedH.append(conseq)
    return prunedH


# recursively generate rules from frequent itemsets
def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    """rulesFromConseq

    Args:
        freqSet      a frequent itemset, e.g. frozenset([2, 3, 5])
        H            candidate consequents drawn from freqSet, e.g. [frozenset([2]), frozenset([3]), frozenset([5])]
        supportData  support of every itemset
        brl          the list of association rules
        minConf      minimum confidence
    """
    # H[0] is the first of the combinations built from freqSet; all elements of H
    # have the same length, controlled by the m + 1 in aprioriGen(H, m+1)
    # as the function recurses, len(H[0]) grows 1, 2, 3, ...
    # suppose freqSet = frozenset([2, 3, 5]) and H = [frozenset([2]), frozenset([3]), frozenset([5])];
    # then m = len(H[0]) takes the values 1, 2 across the recursion
    # at m = 2 the recursion stops: one level deeper H[0] would equal frozenset([2, 3, 5]) = freqSet,
    # and there is no point computing rules from freqSet to itself.
    m = len(H[0])
    if (len(freqSet) > (m + 1)):
        # print('freqSet******************', len(freqSet), m + 1, freqSet, H, H[0])
        # generate all combinations of length m+1 from H; with H = [frozenset([2]), frozenset([3]), frozenset([5])]
        # the first recursive call generates [frozenset([2, 3]), frozenset([2, 5]), frozenset([3, 5])]
        # there is no second call: the recursion guard has already exited by then
        Hmp1 = aprioriGen(H, m+1)
        # keep the consequents whose confidence exceeds the threshold
        Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
        print('Hmp1=', Hmp1)
        print('len(Hmp1)=', len(Hmp1), 'len(freqSet)=', len(freqSet))
        # if some consequents still exceed the minimum confidence, recurse; otherwise stop
        if (len(Hmp1) > 1):
            # print('----------------------', Hmp1)
            # print(len(freqSet), len(Hmp1[0]) + 1)
            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)


# generate the association rules
def generateRules(L, supportData, minConf=0.7):
    """generateRules

    Args:
        L            list of frequent itemsets
        supportData  support of every frequent itemset
        minConf      minimum confidence
    Returns:
        bigRuleList  list of rules as (antecedent, consequent, confidence) triples
    """
    bigRuleList = []
    # suppose L = [[frozenset([1]), frozenset([3]), frozenset([2]), frozenset([5])], [frozenset([1, 3]), frozenset([2, 5]), frozenset([2, 3]), frozenset([3, 5])], [frozenset([2, 3, 5])]]
    for i in range(1, len(L)):
        # walk over every itemset of each size
        for freqSet in L[i]:
            # e.g. freqSet = frozenset([1, 3]), H1 = [frozenset([1]), frozenset([3])]
            # split the itemset into its single elements, each wrapped in a frozenset, collected in a list
            H1 = [frozenset([item]) for item in freqSet]
            # 2-element itemsets take the else branch; larger itemsets take the if branch
            if (i > 1):
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
            else:
                calcConf(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList


def getActionIds():
    from time import sleep
    from votesmart import votesmart
    # votesmart.apikey = 'get your api key first'
    votesmart.apikey = 'a7fa40adec6f4a77178799fae4441030'
    actionIdList = []
    billTitleList = []
    fr = open('data/11.Apriori/recent20bills.txt')
    for line in fr.readlines():
        billNum = int(line.split('\t')[0])
        try:
            billDetail = votesmart.votes.getBill(billNum)  # api call
            for action in billDetail.actions:
                if action.level == 'House' and (action.stage == 'Passage' or action.stage == 'Amendment Vote'):
                    actionId = int(action.actionId)
                    print('bill: %d has actionId: %d' % (billNum, actionId))
                    actionIdList.append(actionId)
                    billTitleList.append(line.strip().split('\t')[1])
        except:
            print("problem getting bill %d" % billNum)
        sleep(1)  # delay to be polite
    return actionIdList, billTitleList


def getTransList(actionIdList, billTitleList):  # this will return a list of lists containing ints
    # sleep and votesmart are imported here because getActionIds() only imports
    # them locally; without these imports this function would raise a NameError
    from time import sleep
    from votesmart import votesmart
    itemMeaning = ['Republican', 'Democratic']  # list of what each item stands for
    for billTitle in billTitleList:  # fill up itemMeaning list
        itemMeaning.append('%s -- Nay' % billTitle)
        itemMeaning.append('%s -- Yea' % billTitle)
    transDict = {}  # list of items in each transaction (politician)
    voteCount = 2
    for actionId in actionIdList:
        sleep(3)
        print('getting votes for actionId: %d' % actionId)
        try:
            voteList = votesmart.votes.getBillActionVotes(actionId)
            for vote in voteList:
                # dict.has_key() is Python 2 only; use the `in` operator instead
                if vote.candidateName not in transDict:
                    transDict[vote.candidateName] = []
                    if vote.officeParties == 'Democratic':
                        transDict[vote.candidateName].append(1)
                    elif vote.officeParties == 'Republican':
                        transDict[vote.candidateName].append(0)
                if vote.action == 'Nay':
                    transDict[vote.candidateName].append(voteCount)
                elif vote.action == 'Yea':
                    transDict[vote.candidateName].append(voteCount + 1)
        except:
            print("problem getting actionId: %d" % actionId)
        voteCount += 2
    return transDict, itemMeaning


# currently unused
# def pntRules(ruleList, itemMeaning):
#     for ruleTup in ruleList:
#         for item in ruleTup[0]:
#             print(itemMeaning[item])
#         print(" -------->")
#         for item in ruleTup[1]:
#             print(itemMeaning[item])
#         print("confidence: %f" % ruleTup[2])
#         print()  # print a blank line


def testApriori():
    # load the test data set
    dataSet = loadDataSet()
    print('dataSet: ', dataSet)

    # generate the frequent itemsets and their supports with Apriori
    L1, supportData1 = apriori(dataSet, minSupport=0.7)
    print('L(0.7): ', L1)
    print('supportData(0.7): ', supportData1)

    print('->->->->->->->->->->->->->->->->->->->->->->->->->->->->')

    # generate the frequent itemsets and their supports with Apriori
    L2, supportData2 = apriori(dataSet, minSupport=0.5)
    print('L(0.5): ', L2)
    print('supportData(0.5): ', supportData2)


def testGenerateRules():
    # load the test data set
    dataSet = loadDataSet()
    print('dataSet: ', dataSet)

    # generate the frequent itemsets and their supports with Apriori
    L1, supportData1 = apriori(dataSet, minSupport=0.5)
    print('L(0.5): ', L1)
    print('supportData(0.5): ', supportData1)

    # generate the association rules
    rules = generateRules(L1, supportData1, minConf=0.5)
    print('rules: ', rules)


def main():
    # test the Apriori algorithm
    # testApriori()

    # generate association rules
    # testGenerateRules()

    # # project: build a transaction data set from US congressional voting records
    # actionIdList, billTitleList = getActionIds()
    # # test the first two
    # transDict, itemMeaning = getTransList(actionIdList[: 2], billTitleList[: 2])
    # # transDict maps each politician to a list of item ids, e.g. [1, 2, 3]
    # transDict, itemMeaning = getTransList(actionIdList, billTitleList)
    # # get the full data set
    # dataSet = [transDict[key] for key in transDict.keys()]
    # L, supportData = apriori(dataSet, minSupport=0.3)
    # rules = generateRules(L, supportData, minConf=0.95)
    # print(rules)

    # project: discovering shared features of poisonous mushrooms
    # get the full data set
    dataSet = [line.split() for line in open("data/11.Apriori/mushroom.dat").readlines()]
    L, supportData = apriori(dataSet, minSupport=0.3)
    # feature 2 marks a poisonous mushroom, 1 an edible one
    # print the frequent itemsets containing 2: features frequently co-occurring with poisonous mushrooms
    for item in L[1]:
        if item.intersection('2'):
            print(item)

    for item in L[2]:
        if item.intersection('2'):
            print(item)


if __name__ == "__main__":
    main()
345
src/py3.x/ml/12.FrequentPattemTree/fpGrowth.py
Normal file
@@ -0,0 +1,345 @@
#!/usr/bin/python
# coding:utf8

'''
Created on Jun 14, 2011
Update on 2017-05-18
FP-Growth  FP means frequent pattern
the FP-Growth algorithm needs:
1. FP-tree (class treeNode)
2. header table (use dict)
This finds frequent itemsets similar to apriori but does not find association rules.
Author: Peter/片刻
GitHub: https://github.com/apachecn/AiLearning
'''
print(__doc__)


class treeNode:
    def __init__(self, nameValue, numOccur, parentNode):
        self.name = nameValue
        self.count = numOccur
        self.nodeLink = None
        # needs to be updated
        self.parent = parentNode
        self.children = {}

    def inc(self, numOccur):
        """inc (add the given value to the count)
        """
        self.count += numOccur

    def disp(self, ind=1):
        """disp (display the tree as text)
        """
        print(' '*ind, self.name, ' ', self.count)
        for child in self.children.values():
            child.disp(ind+1)


def loadSimpDat():
    simpDat = [['r', 'z', 'h', 'j', 'p'],
               ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
               ['z'],
               ['r', 'x', 'n', 'o', 's'],
               # ['r', 'x', 'n', 'o', 's'],
               ['y', 'r', 'x', 'z', 'q', 't', 'p'],
               ['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]
    return simpDat


def createInitSet(dataSet):
    retDict = {}
    for trans in dataSet:
        if frozenset(trans) not in retDict:
            retDict[frozenset(trans)] = 1
        else:
            retDict[frozenset(trans)] += 1
    return retDict

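# Example (derived by hand from loadSimpDat() above): all six transactions are
# distinct, so createInitSet(simpDat) maps each frozenset to a count of 1, e.g.
# {frozenset({'r', 'z', 'h', 'j', 'p'}): 1, frozenset({'z'}): 1, ...}.
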
# this version does not use recursion
def updateHeader(nodeToTest, targetNode):
    """updateHeader (update the header table, linking occurrences of the same item;
    e.g. the left-hand r points at the right-hand r: a later occurrence of an item
    points back at the one already in the tree)

    Start from the header entry's nodeLink and follow nodeLink until the end of
    the linked list is reached.
    Performance note: a very long list may hit the iteration limit.

    Args:
        nodeToTest  the header entry {item: (value, treeNode)} meeting minSup
        targetNode  a child node of the tree
    """
    # link occurrences of the same item, e.g. the left r points at the right r
    while (nodeToTest.nodeLink is not None):
        nodeToTest = nodeToTest.nodeLink
    nodeToTest.nodeLink = targetNode


def updateTree(items, inTree, headerTable, count):
    """updateTree (grow the FP-tree; second pass over the data)

    # handles one transaction
    # adds the item with the largest count first
    Args:
        items        the transaction's items meeting minSup, sorted by count (descending)
        inTree       the tree being grown
        headerTable  the header table {item: (value, treeNode)} meeting minSup
        count        how many times this transaction occurs in the original data set
    """
    # take the item with the highest count
    # if it is already in inTree.children, accumulate its count
    # if not, add it to inTree.children as a fresh treeNode
    if items[0] in inTree.children:
        # existing node: add count to the node's tally
        inTree.children[items[0]].inc(count)
    else:
        # no such child yet: create one under inTree
        inTree.children[items[0]] = treeNode(items[0], count, inTree)
        # if the header entry's second slot is still None, point it at this new node;
        # otherwise extend the header's linked list
        if headerTable[items[0]][1] is None:
            # the header table records only the first occurrence of a node
            headerTable[items[0]][1] = inTree.children[items[0]]
        else:
            # effectively updates the nodeLink of the header entry's tree node
            updateHeader(headerTable[items[0]][1], inTree.children[items[0]])
    if len(items) > 1:
        # recurse: items[1] becomes a child of items[0]; count keeps being accumulated
        # along the path, producing the final tallies of the nodes
        updateTree(items[1:], inTree.children[items[0]], headerTable, count)


def createTree(dataSet, minSup=1):
    """createTree (build the FP-tree)

    Args:
        dataSet  samples as a dict {transaction: occurrence count}
        minSup   minimum support
    Returns:
        retTree      the FP-tree
        headerTable  the header table {item: (value, treeNode)} meeting minSup
    """
    # dict {item: occurrence count} for items with support >= minSup
    headerTable = {}
    # loop over the {transaction: occurrence count} samples
    for trans in dataSet:
        # loop over all transactions and their items,
        # tallying the total occurrences of every item
        for item in trans:
            # e.g. with {'ababa': 3}: count(a) = 3+3+3 = 9, count(b) = 3+3 = 6
            headerTable[item] = headerTable.get(item, 0) + dataSet[trans]
    # drop the items whose count is below the minimum support
    for k in list(headerTable.keys()):  # in Python 3, .keys() is a view, not a list, and cannot be mutated while iterating
        if headerTable[k] < minSup:
            del(headerTable[k])

    # set of items meeting minSup
    freqItemSet = set(headerTable.keys())
    # if nothing survives, return None
    if len(freqItemSet) == 0:
        return None, None
    for k in headerTable:
        # reformat: dict {item: [count, None]}
        headerTable[k] = [headerTable[k], None]

    # create tree
    retTree = treeNode('Null Set', 1, None)
    # loop over the {transaction: occurrence count} samples
    for tranSet, count in dataSet.items():
        # print('tranSet, count=', tranSet, count)
        # localD = dict {item: total occurrence count}
        localD = {}
        for item in tranSet:
            # keep only the items meeting minSup
            if item in freqItemSet:
                # print('headerTable[item][0]=', headerTable[item][0], headerTable[item])
                localD[item] = headerTable[item][0]
        # print('localD=', localD)
        # sort each transaction's items, then grow branches onto the tree until it is full;
        # when an item reappears at the same rank, the branch's count is incremented, recursively
        if len(localD) > 0:
            # p = (key, value); sort by value, from largest to smallest
            # orderedItems keeps only the keys (the items themselves), ordered large to small
            orderedItems = [v[0] for v in sorted(localD.items(), key=lambda p: p[1], reverse=True)]
            # print('orderedItems=', orderedItems, 'headerTable', headerTable, '\n\n\n')
            # grow the tree, filling the first level of children in the order of orderedItems
            updateTree(orderedItems, retTree, headerTable, count)

    return retTree, headerTable


def ascendTree(leafNode, prefixPath):
    """ascendTree (record the node's name and climb to its parent, if any)

    Args:
        leafNode    the treeNode being inspected
        prefixPath  the path being collected
    """
    if leafNode.parent is not None:
        prefixPath.append(leafNode.name)
        ascendTree(leafNode.parent, prefixPath)


def findPrefixPath(basePat, treeNode):
    """findPrefixPath (collect the conditional pattern base)

    Args:
        basePat   the item being queried
        treeNode  the tree node where the query starts
    Returns:
        condPats  dict keyed by the prefix paths (excluding basePat), valued by their counts
    """
    condPats = {}
    # follow the treeNode's links
    while treeNode is not None:
        prefixPath = []
        # climb to the node's ancestors: this recovers the paths the item occurs on
        ascendTree(treeNode, prefixPath)
        # exclude the node itself; a parent exists only when the path is longer than 1
        if len(prefixPath) > 1:
            # use the path above basePat as the key, with the node's count as the value
            # once prefixPath[1:] becomes a frozenset, the items lose their order
            # condPats[frozenset(prefixPath)] = treeNode.count
            condPats[frozenset(prefixPath[1:])] = treeNode.count
        # move on to the next node carrying the same item, via the link
        treeNode = treeNode.nodeLink
        # print(treeNode)
    return condPats


def mineTree(inTree, headerTable, minSup, preFix, freqItemList):
    """mineTree (build and mine the conditional FP-trees)

    Args:
        inTree        myFPtree
        headerTable   the header table {item: (value, treeNode)} meeting minSup
        minSup        minimum support
        preFix        preFix carries the previous newFreqSet; it stops growing once myHead is empty
        freqItemList  the list that collects the frequent itemsets
    """
    # sort by value, from smallest to largest, to get the keys of the frequent itemsets
    # the list of item keys meeting the minimum support
    bigL = [v[0] for v in sorted(headerTable.items(), key=lambda p: p[1][0])]
    print('-----', sorted(headerTable.items(), key=lambda p: p[1][0]))
    print('bigL=', bigL)
    # walk the item keys from smallest to largest, recursively collecting the frequent itemsets
    for basePat in bigL:
        # preFix carries the previous newFreqSet; it stops growing once myHead is empty
        newFreqSet = preFix.copy()
        newFreqSet.add(basePat)
        print('newFreqSet=', newFreqSet, preFix)

        freqItemList.append(newFreqSet)
        print('freqItemList=', freqItemList)
        condPattBases = findPrefixPath(basePat, headerTable[basePat][1])
        print('condPattBases=', basePat, condPattBases)

        # build the conditional FP-tree
        myCondTree, myHead = createTree(condPattBases, minSup)
        print('myHead=', myHead)
        # mine the conditional FP-tree; a non-empty myHead means items still meet minSup
        if myHead is not None:
            myCondTree.disp(1)
            print('\n\n\n')
            # recurse into myHead to find further frequent itemsets
            mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList)
            print('\n\n\n')


# import twitter
# from time import sleep
# import re


# def getLotsOfTweets(searchStr):
#     """
#     fetch 100 result pages from the search
#     """
#     CONSUMER_KEY = ''
#     CONSUMER_SECRET = ''
#     ACCESS_TOKEN_KEY = ''
#     ACCESS_TOKEN_SECRET = ''
#     api = twitter.Api(consumer_key=CONSUMER_KEY, consumer_secret=CONSUMER_SECRET, access_token_key=ACCESS_TOKEN_KEY, access_token_secret=ACCESS_TOKEN_SECRET)

#     # you can get 1500 results 15 pages * 100 per page
#     resultsPages = []
#     for i in range(1, 15):
#         print("fetching page %d" % i)
#         searchResults = api.GetSearch(searchStr, per_page=100, page=i)
#         resultsPages.append(searchResults)
#         sleep(6)
#     return resultsPages


# def textParse(bigString):
#     """
#     parse the page content
#     """
#     urlsRemoved = re.sub('(http:[/][/]|www.)([a-z]|[A-Z]|[0-9]|[/.]|[~])*', '', bigString)
#     listOfTokens = re.split(r'\W*', urlsRemoved)
#     return [tok.lower() for tok in listOfTokens if len(tok) > 2]


# def mineTweets(tweetArr, minSup=5):
#     """
#     collect the frequent itemsets
#     """
#     parsedList = []
#     for i in range(14):
#         for j in range(100):
#             parsedList.append(textParse(tweetArr[i][j].text))
#     initSet = createInitSet(parsedList)
#     myFPtree, myHeaderTab = createTree(initSet, minSup)
#     myFreqList = []
#     mineTree(myFPtree, myHeaderTab, minSup, set([]), myFreqList)
#     return myFreqList


if __name__ == "__main__":
    # rootNode = treeNode('pyramid', 9, None)
    # rootNode.children['eye'] = treeNode('eye', 13, None)
    # rootNode.children['phoenix'] = treeNode('phoenix', 3, None)
    # # display the tree as text
    # # print(rootNode.disp())

    # load the sample data
    simpDat = loadSimpDat()
    # print(simpDat, '\n')
    # reformat the samples into frozensets and tally identical rows, format: {transaction: occurrence count}
    initSet = createInitSet(simpDat)
    print(initSet)

    # build the FP-tree
    # input: the {transaction: occurrence count} samples and the minimum support
    # output: the final FP-tree. The first level of nodes comes from a loop; each
    # level's children (the branches) are collected recursively per transaction, and
    # the header links simply point later occurrences at existing ones
    myFPtree, myHeaderTab = createTree(initSet, 3)
    myFPtree.disp()

    # extract the conditional pattern bases
    # query the frequent prefixes of tree nodes
    print('x --->', findPrefixPath('x', myHeaderTab['x'][1]))
    print('z --->', findPrefixPath('z', myHeaderTab['z'][1]))
    print('r --->', findPrefixPath('r', myHeaderTab['r'][1]))

    # build the conditional pattern bases
    freqItemList = []
    mineTree(myFPtree, myHeaderTab, 3, set([]), freqItemList)
    print("freqItemList: \n", freqItemList)

    # # project work
    # # 1. the twitter example
    # # cannot run here, since there is no twitter connection
    # lotsOtweets = getLotsOfTweets('RIMM')
    # listOfTerms = mineTweets(lotsOtweets, 20)
    # print(len(listOfTerms))
    # for t in listOfTerms:
    #     print(t)

    # # 2. mining a news site's click stream, e.g. what else did readers of article 1 read?
    # parsedDat = [line.split() for line in open('data/12.FPGrowth/kosarak.dat').readlines()]
    # initSet = createInitSet(parsedDat)
    # myFPtree, myHeaderTab = createTree(initSet, 100000)

    # myFreList = []
    # mineTree(myFPtree, myHeaderTab, 100000, set([]), myFreList)
    # print(myFreList)
153
src/py3.x/ml/13.PCA/pca.py
Normal file
@@ -0,0 +1,153 @@
#!/usr/bin/python
# coding:utf8

'''
Created on Jun 1, 2011
Update on 2017-12-20
Author: Peter Harrington/片刻
GitHub: https://github.com/apachecn/AiLearning
'''
from numpy import *
import matplotlib.pyplot as plt
print(__doc__)


def loadDataSet(fileName, delim='\t'):
    fr = open(fileName)
    stringArr = [line.strip().split(delim) for line in fr.readlines()]
    datArr = [list(map(float, line)) for line in stringArr]
    # note the difference from Python 2: map() must be wrapped in list(),
    # otherwise the result displays as something like <map at 0x3fed1d0>
    return mat(datArr)


def pca(dataMat, topNfeat=9999999):
    """pca

    Args:
        dataMat   the original data set as a matrix
        topNfeat  number of features to keep
    Returns:
        lowDDataMat  the data set reduced to lower dimension
        reconMat     the data reconstructed in the original space
    """
    # mean of every column
    meanVals = mean(dataMat, axis=0)
    # print('meanVals', meanVals)

    # subtract the mean from every vector
    meanRemoved = dataMat - meanVals
    # print('meanRemoved=', meanRemoved)

    # cov(X, Y) = [(x1-x̄)(y1-ȳ) + (x2-x̄)(y2-ȳ) + ... + (xn-x̄)(yn-ȳ)] / (n-1)
    '''
    variance: (1-D) measures how a random variable spreads
    covariance: (2-D) measures how two dimensions deviate from their means together
    covariance matrix: (multi-D) measures how all pairs of dimensions deviate from their means

    cov(X, Y) > 0 means X and Y are positively correlated
    (larger X goes with larger Y, smaller X with smaller Y);
    cov(X, Y) < 0 means X and Y are negatively correlated;
    cov(X, Y) = 0 means X and Y are uncorrelated.
    '''
    covMat = cov(meanRemoved, rowvar=0)

    # eigVals are the eigenvalues, eigVects the eigenvectors
    eigVals, eigVects = linalg.eig(mat(covMat))
    # print('eigVals=', eigVals)
    # print('eigVects=', eigVects)
    # argsort returns the indices that would sort the eigenvalues in ascending order;
    # reading that order backwards gives the topNfeat largest eigenvectors
    '''
    >>> x = np.array([3, 1, 2])
    >>> np.argsort(x)
    array([1, 2, 0])  # index 1 -> 1; index 2 -> 2; index 0 -> 3
    >>> y = np.argsort(x)
    >>> y[::-1]
    array([0, 2, 1])
    >>> y[:-3:-1]
    array([0, 2])  # takes elements -1 and -2
    >>> y[:-6:-1]
    array([0, 2, 1])
    '''
    eigValInd = argsort(eigVals)
    # print('eigValInd1=', eigValInd)

    # -1 walks backwards: take the topN eigenvalues
    # [from -1 down to -(topNfeat+1), exclusive of -(topNfeat+1) itself]
    eigValInd = eigValInd[:-(topNfeat+1):-1]
    # print('eigValInd2=', eigValInd)
    # reorder eigVects from largest to smallest eigenvalue
    redEigVects = eigVects[:, eigValInd]
    # print('redEigVects=', redEigVects.T)
    # project the data into the new space
    # print("---", shape(meanRemoved), shape(redEigVects))
    lowDDataMat = meanRemoved * redEigVects
    reconMat = (lowDDataMat * redEigVects.T) + meanVals
    # print('lowDDataMat=', lowDDataMat)
    # print('reconMat=', reconMat)
    return lowDDataMat, reconMat

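# Minimal usage sketch (hypothetical 2-D data, not one of this repo's data files):
#   X = mat([[1.0, 2.0], [2.0, 4.1], [3.0, 5.9]])
#   low, recon = pca(X, topNfeat=1)   # low is m x 1; recon lies back in the original space
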
def replaceNanWithMean():
    datMat = loadDataSet('data/13.PCA/secom.data', ' ')
    numFeat = shape(datMat)[1]
    for i in range(numFeat):
        # mean over the values that are not NaN
        # .A returns the array underlying the matrix
        meanVal = mean(datMat[nonzero(~isnan(datMat[:, i].A))[0], i])
        # replace the NaN values with that mean
        datMat[nonzero(isnan(datMat[:, i].A))[0], i] = meanVal
    return datMat


def show_picture(dataMat, reconMat):
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(dataMat[:, 0].flatten().A[0], dataMat[:, 1].flatten().A[0], marker='^', s=90)
    ax.scatter(reconMat[:, 0].flatten().A[0], reconMat[:, 1].flatten().A[0], marker='o', s=50, c='red')
    plt.show()


def analyse_data(dataMat):
    meanVals = mean(dataMat, axis=0)
    meanRemoved = dataMat-meanVals
    covMat = cov(meanRemoved, rowvar=0)
    eigvals, eigVects = linalg.eig(mat(covMat))
    eigValInd = argsort(eigvals)

    topNfeat = 20
    eigValInd = eigValInd[:-(topNfeat+1):-1]
    cov_all_score = float(sum(eigvals))
    sum_cov_score = 0
    for i in range(0, len(eigValInd)):
        line_cov_score = float(eigvals[eigValInd[i]])
        sum_cov_score += line_cov_score
        '''
        More than 20% of the eigenvalues turn out to be 0.
        That means those features are copies of other features: they can be
        expressed through other features and carry no extra information of their own.

        The first 15 or so values have magnitude above 10^5; after that the values
        become very small. This says that only a few features matter, and the
        importance of the remaining features falls off quickly.

        Finally, some small negative values show up; they stem from numerical
        error and should be rounded to 0.
        '''
        print('principal component: %s, variance ratio: %s%%, cumulative variance ratio: %s%%' % (format(i+1, '2.0f'), format(line_cov_score/cov_all_score*100, '4.2f'), format(sum_cov_score/cov_all_score*100, '4.1f')))


if __name__ == "__main__":
    # # load the data and convert it to float
    # dataMat = loadDataSet('data/13.PCA/testSet.txt')
    # # keep only 1 eigenvector
    # lowDmat, reconMat = pca(dataMat, 1)
    # # keeping 2 eigenvectors reproduces the original data, with no change at all
    # # lowDmat, reconMat = pca(dataMat, 2)
    # # print(shape(lowDmat))
    # show_picture(dataMat, reconMat)

    # reduce the dimensionality of the semiconductor manufacturing data with PCA
    dataMat = replaceNanWithMean()
    print(shape(dataMat))
    # analyze the data
    analyse_data(dataMat)
    # lowDmat, reconMat = pca(dataMat, 20)
    # print(shape(lowDmat))
    # show_picture(dataMat, reconMat)
352
src/py3.x/ml/14.SVD/svdRecommend.py
Normal file
@@ -0,0 +1,352 @@
#!/usr/bin/python
# coding: utf-8

'''
Created on Mar 8, 2011
Update on 2017-12-12
Author: Peter Harrington/山上有课树/片刻/marsjhao
GitHub: https://github.com/apachecn/AiLearning
'''
from numpy import linalg as la
from numpy import *


def loadExData3():
    # Dish matrix used to show how SVD improves recommendations
    return [[2, 0, 0, 4, 4, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5],
            [0, 0, 0, 0, 0, 0, 0, 1, 0, 4, 0],
            [3, 3, 4, 0, 3, 0, 0, 2, 2, 0, 0],
            [5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, 0, 5, 0, 0, 5, 0],
            [4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 5],
            [0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4],
            [0, 0, 0, 0, 0, 0, 5, 0, 0, 5, 0],
            [0, 0, 0, 3, 0, 0, 0, 0, 4, 5, 0],
            [1, 1, 2, 1, 1, 2, 1, 0, 4, 5, 0]]


def loadExData2():
    # Example matrix given in the book
    return [[0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 5],
            [0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 3],
            [0, 0, 0, 0, 4, 0, 0, 1, 0, 4, 0],
            [3, 3, 4, 0, 0, 0, 0, 2, 2, 0, 0],
            [5, 4, 5, 0, 0, 0, 0, 5, 5, 0, 0],
            [0, 0, 0, 0, 5, 0, 1, 0, 0, 5, 0],
            [4, 3, 4, 0, 0, 0, 0, 5, 5, 0, 1],
            [0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 4],
            [0, 0, 0, 2, 0, 2, 5, 0, 0, 1, 2],
            [0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 0],
            [1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0]]


def loadExData():
    """
    # Example matrix for the recommendation engine
    return [[4, 4, 0, 2, 2],
            [4, 0, 0, 3, 3],
            [4, 0, 0, 1, 1],
            [1, 1, 1, 2, 0],
            [2, 2, 2, 0, 0],
            [1, 1, 1, 0, 0],
            [5, 5, 5, 0, 0]]
    """
    # # Original matrix
    # return [[1, 1, 1, 0, 0],
    #         [2, 2, 2, 0, 0],
    #         [1, 1, 1, 0, 0],
    #         [5, 5, 5, 0, 0],
    #         [1, 1, 0, 2, 2],
    #         [0, 0, 0, 3, 3],
    #         [0, 0, 0, 1, 1]]

    # Original matrix
    return [[0, -1.6, 0.6],
            [0, 1.2, 0.8],
            [0, 0, 0],
            [0, 0, 0]]


# Similarity measures; inA and inB are assumed to be column vectors
# Based on Euclidean distance
def ecludSim(inA, inB):
    return 1.0 / (1.0 + la.norm(inA - inB))


# pearsSim() checks whether there are 3 or more points.
# corrcoef computes the Pearson correlation directly; its range [-1, 1] is rescaled to [0, 1]
def pearsSim(inA, inB):
    # With fewer than 3 points the function returns 1.0: the two vectors count as perfectly correlated.
    if len(inA) < 3:
        return 1.0
    return 0.5 + 0.5 * corrcoef(inA, inB, rowvar=0)[0][1]


# Cosine similarity: 0 when the angle is 90 degrees, 1.0 when the two vectors point the same way
def cosSim(inA, inB):
    num = float(inA.T*inB)
    denom = la.norm(inA)*la.norm(inB)
    return 0.5 + 0.5*(num/denom)
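# A quick worked check of the rescaling above (hypothetical vectors, not in the
# original file): for inA = [1, 0]^T and inB = [0, 1]^T the raw cosine is 0
# (orthogonal), so cosSim returns 0.5 + 0.5*0 = 0.5; for inB = inA it returns 1.0.
# >>> cosSim(mat([[1.], [0.]]), mat([[0.], [1.]]))
# 0.5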


# Item-similarity-based recommendation engine
def standEst(dataMat, user, simMeas, item):
    """standEst(for an item the user has not rated, compute its similarity to each item
    the user has rated, weight the user's ratings by those similarities, and combine
    them into an estimated score)

    Args:
        dataMat     training data set
        user        user index
        simMeas     similarity measure
        item        index of the unrated item
    Returns:
        ratSimTotal/simTotal    estimated rating (a value between 0 and 5)
    """
    # Number of items in the data set
    n = shape(dataMat)[1]
    # Initialise the two accumulators
    simTotal = 0.0
    ratSimTotal = 0.0
    # Loop over the items in the row (compare every item the user has rated with the target item)
    for j in range(n):
        userRating = dataMat[user, j]
        # Skip any item the user has not rated
        if userRating == 0:
            continue
        # Find the users who rated both items
        # overLap holds the row indices of the elements rated in both columns
        # logical_and computes the element-wise truth value of x1 and x2
        overLap = nonzero(logical_and(dataMat[:, item].A > 0, dataMat[:, j].A > 0))[0]
        # If there is no overlap, the two items share no raters; the similarity is 0
        if len(overLap) == 0:
            similarity = 0
        # Otherwise recompute the similarity on the overlapping ratings
        else:
            similarity = simMeas(dataMat[overLap, item], dataMat[overLap, j])
        # print('the %d and %d similarity is: %f' % (item, j, similarity))
        # Accumulate the similarities, weighting each by the user's own rating
        # similarity: item similarity, userRating: the user's rating
        simTotal += similarity
        ratSimTotal += similarity * userRating
    if simTotal == 0:
        return 0
    # Normalise the weighted ratings by the similarity total, so the final score stays
    # between 0 and 5; these scores are used to rank the predictions
    else:
        return ratSimTotal/simTotal
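
# A tiny worked example of the overlap logic above (hypothetical 3-user ratings for
# two items, not part of the original file): only user 0 rated both columns, so the
# similarity is computed on that single shared rating.
# >>> A = mat([[5.], [0.], [3.]]); B = mat([[4.], [2.], [0.]])
# >>> nonzero(logical_and(A.A > 0, B.A > 0))[0]
# array([0])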


# SVD-based rating estimation
# Inside recommend() this function replaces the call to standEst(); it builds a rating
# estimate for the given user and item
def svdEst(dataMat, user, simMeas, item):
    """svdEst( )

    Args:
        dataMat     training data set
        user        user index
        simMeas     similarity measure
        item        index of the unrated item
    Returns:
        ratSimTotal/simTotal    estimated rating (a value between 0 and 5)
    """
    # Number of items
    n = shape(dataMat)[1]
    # SVD-decompose the data set
    simTotal = 0.0
    ratSimTotal = 0.0
    # Singular value decomposition
    # After the SVD we keep only the singular values that hold 90% of the energy;
    # they come back as a NumPy array
    U, Sigma, VT = la.svd(dataMat)

    # # Analyse how many entries of Sigma to keep
    # analyse_data(Sigma, 20)

    # To do matrix arithmetic, the singular values must first be built into a diagonal matrix
    Sig4 = mat(eye(4) * Sigma[: 4])

    # Use the U matrix to project the items into the low-dimensional space
    # (items + the 4 main features)
    xformedItems = dataMat.T * U[:, :4] * Sig4.I
    print('dataMat', shape(dataMat))
    print('U[:, :4]', shape(U[:, :4]))
    print('Sig4.I', shape(Sig4.I))
    print('VT[:4, :]', shape(VT[:4, :]))
    print('xformedItems', shape(xformedItems))

    # For the given user, loop over the elements of that user's row
    # This mirrors the for loop in standEst(), except that the similarity is computed
    # in the low-dimensional space
    for j in range(n):
        userRating = dataMat[user, j]
        if userRating == 0 or j == item:
            continue
        # The similarity measure is passed in as a function argument
        similarity = simMeas(xformedItems[item, :].T, xformedItems[j, :].T)
        # This print tracks the progress of the similarity computation; remove it if it feels noisy
        print('the %d and %d similarity is: %f' % (item, j, similarity))
        # Accumulate the similarities
        simTotal += similarity
        # Accumulate the similarity-weighted ratings
        ratSimTotal += similarity * userRating
    if simTotal == 0:
        return 0
    else:
        # Compute the estimated rating
        return ratSimTotal/simTotal


# recommend() is the recommendation engine; by default it calls standEst() and produces
# the top-N recommendations. If N is not given, it defaults to 3. The other parameters
# select the similarity measure and the estimation method
def recommend(dataMat, user, N=3, simMeas=cosSim, estMethod=standEst):
    """recommend( )

    Args:
        dataMat     training data set
        user        user index
        simMeas     similarity measure
        estMethod   estimation method to use
    Returns:
        the final N recommendations
    """
    # Find the unrated items
    # Build a list of everything the given user has not rated yet
    unratedItems = nonzero(dataMat[user, :].A == 0)[1]
    # If every item is rated, leave the function
    if len(unratedItems) == 0:
        return 'you rated everything'
    # Item indices and their estimated scores
    itemScores = []
    # Loop over the unrated items
    for item in unratedItems:
        # Estimate the score of this item
        estimatedScore = estMethod(dataMat, user, simMeas, item)
        itemScores.append((item, estimatedScore))
    # Sort by estimated score in descending order; recommend the top N unrated items
    return sorted(itemScores, key=lambda jj: jj[1], reverse=True)[: N]


def analyse_data(Sigma, loopNum=20):
    """analyse_data(analyse how many entries of Sigma to keep)

    Args:
        Sigma       the singular values
        loopNum     number of loop iterations
    """
    # Total variance (total energy)
    Sig2 = Sigma**2
    SigmaSum = sum(Sig2)
    for i in range(loopNum):
        SigmaI = sum(Sig2[:i+1])
        '''
        Pick the number of singular values according to your own use case.
        Keeping 80% ~ 90% of the matrix energy usually preserves the important
        features and removes the noise.
        '''
        print('principal component: %s, variance ratio: %s%%' % (format(i+1, '2.0f'), format(SigmaI/SigmaSum*100, '4.2f')))
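

# A minimal sketch of turning the printout above into a choice of k (a hypothetical
# helper, not part of the original file): keep adding singular values until the
# requested share of the energy is covered.
def pick_k(Sigma, energy=0.9):
    Sig2 = Sigma**2
    total = sum(Sig2)
    running = 0.0
    for k in range(len(Sigma)):
        running += Sig2[k]
        if running / total >= energy:
            return k + 1  # number of singular values to keep
    return len(Sigma)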


# Image-compression functions
# Load and convert the data
def imgLoadData(filename):
    myl = []
    # Open the text file and read its characters in as rows of an array
    for line in open(filename).readlines():
        newRow = []
        for i in range(32):
            newRow.append(int(line[i]))
        myl.append(newRow)
    # Once loaded, the matrix can be printed to the screen
    myMat = mat(myl)
    return myMat


# Print a matrix
def printMat(inMat, thresh=0.8):
    # The matrix holds floats, so define light and dark: walk over every element and
    # print 1 when it exceeds the threshold, otherwise 0
    for i in range(32):
        for k in range(32):
            if float(inMat[i, k]) > thresh:
                print(1, end='')
            else:
                print(0, end='')
        print('')


# Image compression: rebuild the image from any given number of singular values
def imgCompress(numSV=3, thresh=0.8):
    """imgCompress( )

    Args:
        numSV       number of singular values to keep
        thresh      threshold used when printing
    """
    # Build the matrix
    myMat = imgLoadData('data/14.SVD/0_5.txt')

    print("****original matrix****")
    # SVD-decompose the original image and rebuild it
    printMat(myMat, thresh)

    # The rebuild works through SigRecon, reconstructed from Sigma
    # Sigma is diagonal, so build an all-zero matrix and fill the leading singular
    # values onto its diagonal
    U, Sigma, VT = la.svd(myMat)
    # SigRecon = mat(zeros((numSV, numSV)))
    # for k in range(numSV):
    #     SigRecon[k, k] = Sigma[k]

    # Analyse how many entries of Sigma to keep
    analyse_data(Sigma, 20)

    SigRecon = mat(eye(numSV) * Sigma[: numSV])
    reconMat = U[:, :numSV] * SigRecon * VT[:numSV, :]
    print("****reconstructed matrix using %d singular values *****" % numSV)
    printMat(reconMat, thresh)


if __name__ == "__main__":

    # # SVD-decompose a matrix (SVD in plain Python)
    # Data = loadExData()
    # print('Data:', Data)
    # U, Sigma, VT = linalg.svd(Data)
    # # Print Sigma: the first 3 values (9.72140007e+00, 5.29397912e+00, 6.84226362e-01)
    # # are much larger than the rest
    # # The last two values are tiny (the exact output may differ per machine) and can be dropped
    # print('U:', U)
    # print('Sigma', Sigma)
    # print('VT:', VT)
    # print('VT:', VT.T)

    # # Rebuild a 3x3 matrix Sig3
    # Sig3 = mat([[Sigma[0], 0, 0], [0, Sigma[1], 0], [0, 0, Sigma[2]]])
    # print(U[:, :3] * Sig3 * VT[:3, :])

    """
    # Euclidean distance
    myMat = mat(loadExData())
    # print(myMat)
    print(ecludSim(myMat[:, 0], myMat[:, 4]))
    print(ecludSim(myMat[:, 0], myMat[:, 0]))
    # Cosine similarity
    print(cosSim(myMat[:, 0], myMat[:, 4]))
    print(cosSim(myMat[:, 0], myMat[:, 0]))
    # Pearson correlation
    print(pearsSim(myMat[:, 0], myMat[:, 4]))
    print(pearsSim(myMat[:, 0], myMat[:, 0]))
    """

    # Similarity-based recommendation
    myMat = mat(loadExData3())
    # print(myMat)
    # First way of computing the similarity
    print(recommend(myMat, 1, estMethod=svdEst))
    # Second way of computing the similarity
    print(recommend(myMat, 1, estMethod=svdEst, simMeas=pearsSim))

    # Default recommendation (restaurant dish example)
    print(recommend(myMat, 2))

    """
    # Use SVD to improve the recommendations
    U, Sigma, VT = la.svd(mat(loadExData2()))
    print(Sigma)            # inspect the SVD to see how many dimensions are needed
    Sig2 = Sigma**2         # how many singular values reach 90% of the total energy?
    print(sum(Sig2))        # total energy
    print(sum(Sig2) * 0.9)  # 90% of the total energy
    print(sum(Sig2[: 2]))   # energy held by the first two values
    print(sum(Sig2[: 3]))   # two values fall short of 90%, so try the first three
    # that exceeds 90% of the total energy, which is enough
    """

    # Compress an image
    # imgCompress(2)
65
src/py3.x/ml/15.BigData_MapReduce/mrMean.py
Normal file
@@ -0,0 +1,65 @@
#!/usr/bin/python
# coding:utf-8

'''
Created on 2017-04-07
Update on 2017-11-17
Author: Peter/ApacheCN-xy/片刻
GitHub: https://github.com/apachecn/AiLearning
'''

from mrjob.job import MRJob
from mrjob.step import MRStep


class MRmean(MRJob):
    def __init__(self, *args, **kwargs):  # initialise the state
        super(MRmean, self).__init__(*args, **kwargs)
        self.inCount = 0
        self.inSum = 0
        self.inSqSum = 0

    # Receive the input stream
    def map(self, key, val):  # needs exactly 2 arguments; accumulate the sum and the sum of squares
        if False:
            yield
        inVal = float(val)
        self.inCount += 1
        self.inSum += inVal
        self.inSqSum += inVal*inVal

    # Runs after all the input has arrived
    def map_final(self):  # compute and emit the mean and the mean of the squares
        if self.inCount == 0:
            return
        mn = self.inSum/self.inCount
        mnSq = self.inSqSum/self.inCount
        yield (1, [self.inCount, mn, mnSq])

    def reduce(self, key, packedValues):
        cumN, cumVal, cumSumSq = 0.0, 0.0, 0.0
        for valArr in packedValues:  # pull the values off the input stream
            nj = float(valArr[0])
            cumN += nj
            cumVal += nj*float(valArr[1])
            cumSumSq += nj*float(valArr[2])
        mean = cumVal/cumN
        var = (cumSumSq - 2*mean*cumVal + cumN*mean*mean)/cumN
        yield (mean, var)  # emit the mean and the variance

    def steps(self):
        """
        The steps method defines the stages to execute.
        The order need not strictly follow the map-reduce pattern, e.g.:
        1. map-reduce-reduce-reduce
        2. map-reduce-map-reduce-map-reduce
        Inside steps, tell mrjob the names of the mapper and reducer. Without it,
        mrjob falls back to the default mapper and reducer methods.

        mapper and mapper_final can share state; neither mapper nor mapper_final
        can share state with the reducer.
        """
        return [MRStep(mapper=self.map, mapper_final=self.map_final, reducer=self.reduce)]


if __name__ == '__main__':
    MRmean.run()
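
# A minimal local run, assuming mrjob is installed and an inputFile.txt with one
# number per line exists (the file name is illustrative, not from the original source):
#   python mrMean.py < inputFile.txt
# mrjob runs map, map_final and reduce in-process and prints the (mean, variance)
# pair that reduce() yields.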
42
src/py3.x/ml/15.BigData_MapReduce/mrMeanMapper.py
Normal file
@@ -0,0 +1,42 @@
#!/usr/bin/python
# coding:utf-8

'''
Created on 2017-04-06
Update on 2017-11-17
Author: Peter/ApacheCN-xy/片刻
GitHub: https://github.com/apachecn/AiLearning
'''

from __future__ import print_function

import sys
from numpy import mat, mean, power

'''
This mapper reads the input line by line and builds a list of floats, then takes the
length of the list and builds a NumPy matrix. It squares all the values, and finally
emits the mean and the mean of the squares. Those values are later used to compute
the global mean and variance.

Args:
    file    input data
Return:
'''


def read_input(file):
    for line in file:
        yield line.rstrip()  # a generator: yields the next value on demand, saving memory


input = read_input(sys.stdin)  # build the list of input lines
input = [float(line) for line in input]  # convert the values to float
numInputs = len(input)  # number of values, i.e. number of lines in the input file
input = mat(input)  # turn the list into a matrix
sqInput = power(input, 2)  # square each entry of the matrix

# Emit the number of values, their mean, and the mean of their squares
# The first line goes to standard output, i.e. it becomes the reducer's input
# The second line goes to standard error: a status report telling the master node
# that this node is working normally.
# Note: sending reports to standard error is a good habit; if a task produces no
# report for 10 minutes, Hadoop kills it.
print("%d\t%f\t%f" % (numInputs, mean(input), mean(sqInput)))  # emit the statistics
print("map report: still alive", file=sys.stderr)
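# A minimal shell test of the mapper on its own, assuming the same illustrative
# one-number-per-line inputFile.txt as above (not from the original source):
#   cat inputFile.txt | python mrMeanMapper.py
# It prints one tab-separated "count  mean  meanOfSquares" line on stdout and the
# "still alive" report on stderr.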
47
src/py3.x/ml/15.BigData_MapReduce/mrMeanReducer.py
Normal file
@@ -0,0 +1,47 @@
#!/usr/bin/python
# coding:utf-8

'''
Created on 2017-04-06
Update on 2017-11-17
Author: Peter/ApacheCN-xy/片刻
GitHub: https://github.com/apachecn/AiLearning
'''

from __future__ import print_function

import sys

'''
The mapper takes the raw input and produces intermediate values for the reducer.
Many mappers run in parallel, so their outputs must be merged into a single value,
i.e. the intermediate key/value pairs are combined here.
'''


def read_input(file):
    for line in file:
        yield line.rstrip()  # yields every line of the input file


input = read_input(sys.stdin)  # build the list of input lines

# Split each input line into separate fields, stored as a list of lists
mapperOut = [line.split('\t') for line in input]
# Input fields: the number of values, their mean, and the mean of their squares
print(mapperOut)

# Accumulate the total sample count, the running sum, and the running sum of squares
cumN, cumVal, cumSumSq = 0.0, 0.0, 0.0
for instance in mapperOut:
    nj = float(instance[0])
    cumN += nj
    cumVal += nj*float(instance[1])
    cumSumSq += nj*float(instance[2])

# Compute the mean (varSum is the expanded form of the variance)
mean_ = cumVal/cumN
varSum = (cumSumSq - 2*mean_*cumVal + cumN*mean_*mean_)/cumN
# Emit the total count, the mean, and the variance
print("total count:%d\tmean:%f\tvariance:%f" % (cumN, mean_, varSum))
print("reduce report: still alive", file=sys.stderr)
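# Why the varSum formula works (a short derivation, not in the original file): with
# cumVal = N*mean and cumSumSq = sum(x_i^2), the expression
#   (cumSumSq - 2*mean*cumVal + N*mean^2) / N
# is the expansion of sum((x_i - mean)^2) / N, i.e. the population variance.
# Chained shell test, with the same illustrative inputFile.txt as above:
#   cat inputFile.txt | python mrMeanMapper.py | python mrMeanReducer.py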
95
src/py3.x/ml/15.BigData_MapReduce/mrSVM.py
Normal file
@@ -0,0 +1,95 @@
#!/usr/bin/python
# coding:utf-8

'''
Created on 2017-04-07
Update on 2017-11-17
Author: Peter/ApacheCN-xy/片刻
GitHub: https://github.com/apachecn/AiLearning
'''

import pickle
from numpy import *
from mrjob.job import MRJob
from mrjob.step import MRStep


class MRsvm(MRJob):
    DEFAULT_INPUT_PROTOCOL = 'json_value'

    def __init__(self, *args, **kwargs):
        super(MRsvm, self).__init__(*args, **kwargs)
        self.data = pickle.load(open('/opt/git/MachineLearnidata/15.BigData_MapReduce/svmDat27', 'rb'))
        self.w = 0
        self.eta = 0.69
        self.dataList = []
        self.k = self.options.batchsize
        self.numMappers = 1
        self.t = 1  # iteration number

    def configure_args(self):
        super(MRsvm, self).configure_args()
        self.add_passthru_arg(
            '--iterations', dest='iterations', default=2, type=int,
            help='T: number of iterations to run')
        self.add_passthru_arg(
            '--batchsize', dest='batchsize', default=100, type=int,
            help='k: number of data points in a batch')

    def map(self, mapperId, inVals):  # needs exactly 2 arguments
        # input: nodeId, ('w', w-vector) OR nodeId, ('x', int)
        if False:
            yield
        if inVals[0] == 'w':  # accumulate the w vector
            self.w = inVals[1]
        elif inVals[0] == 'x':
            self.dataList.append(inVals[1])  # accumulate the data points to process
        elif inVals[0] == 't':  # iteration number
            self.t = inVals[1]
        else:
            self.eta = inVals  # for debugging only; eta is not used in map

    def map_fin(self):
        labels = self.data[:, -1]
        X = self.data[:, :-1]  # reshape the data into X and the labels
        if self.w == 0:
            self.w = [0.001] * shape(X)[1]  # on the first iteration, initialise w
        for index in self.dataList:
            p = mat(self.w)*X[index, :].T  # calc p=w*dataSet[key].T
            if labels[index]*p < 1.0:
                yield (1, ['u', index])  # make sure everything shares the same key
        yield (1, ['w', self.w])  # so it all ends up in the same reducer
        yield (1, ['t', self.t])

    def reduce(self, _, packedVals):
        for valArr in packedVals:  # pull the values off the input stream
            if valArr[0] == 'u':
                self.dataList.append(valArr[1])
            elif valArr[0] == 'w':
                self.w = valArr[1]
            elif valArr[0] == 't':
                self.t = valArr[1]

        labels = self.data[:, -1]
        X = self.data[:, 0:-1]
        wMat = mat(self.w)
        wDelta = mat(zeros(len(self.w)))

        for index in self.dataList:
            wDelta += float(labels[index]) * X[index, :]  # wDelta += label*dataSet
        eta = 1.0/(2.0*self.t)  # calc new: eta
        # calc new: w = (1.0 - 1/t)*w + (eta/k)*wDelta
        wMat = (1.0 - 1.0/self.t)*wMat + (eta/self.k)*wDelta
        for mapperNum in range(1, self.numMappers+1):
            yield (mapperNum, ['w', wMat.tolist()[0]])  # emit w
            if self.t < self.options.iterations:
                yield (mapperNum, ['t', self.t+1])  # increment T
                for j in range(self.k // self.numMappers):  # emit random ints for mappers iid
                    yield (mapperNum, ['x', random.randint(shape(self.data)[0])])

    def steps(self):
        return [MRStep(mapper=self.map, reducer=self.reduce, mapper_final=self.map_fin)] * self.options.iterations


if __name__ == '__main__':
    MRsvm.run()
13
src/py3.x/ml/15.BigData_MapReduce/mrSVMkickStart.py
Normal file
@@ -0,0 +1,13 @@
'''
Created on Feb 27, 2011

Author: Peter
'''
from mrjob.protocol import JSONProtocol
from numpy import *

fw = open('kickStart2.txt', 'w')
for i in [1]:
    for j in range(100):
        fw.write('["x", %d]\n' % random.randint(200))
fw.close()
113
src/py3.x/ml/15.BigData_MapReduce/pegasos.py
Normal file
@@ -0,0 +1,113 @@
#!/usr/bin/python
# coding:utf-8

'''
Created on 2017-04-07
Update on 2017-11-17
Author: Peter/ApacheCN-xy/片刻
GitHub: https://github.com/apachecn/AiLearning
'''

from numpy import *


def loadDataSet(fileName):
    dataMat = []
    labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr = line.strip().split('\t')
        # dataMat.append([float(lineArr[0]), float(lineArr[1]), float(lineArr[2])])
        dataMat.append([float(lineArr[0]), float(lineArr[1])])
        labelMat.append(float(lineArr[2]))
    return dataMat, labelMat


def seqPegasos(dataSet, labels, lam, T):
    m, n = shape(dataSet)
    w = zeros(n)
    for t in range(1, T+1):
        i = random.randint(m)
        eta = 1.0/(lam*t)
        p = predict(w, dataSet[i, :])
        if labels[i]*p < 1:
            w = (1.0 - 1/t)*w + eta*labels[i]*dataSet[i, :]
        else:
            w = (1.0 - 1/t)*w
        print(w)
    return w


def predict(w, x):
    return w*x.T  # the predicted value of y


def batchPegasos(dataSet, labels, lam, T, k):
    """batchPegasos()

    Args:
        dataSet  feature set
        labels   class labels
        lam      regularisation constant
        T        number of iterations
        k        size of the working batch
    Returns:
        w        regression coefficients
    """
    m, n = shape(dataSet)
    w = zeros(n)  # regression coefficients
    dataIndex = list(range(m))
    for t in range(1, T+1):
        wDelta = mat(zeros(n))  # reset wDelta

        # eta is the learning rate: how far each update moves the weights.
        # (It can be read as the step size of the stochastic gradient; shrinking it over time helps the fit.)
        # The inputs T and k set the number of iterations and the size of the working batch;
        # eta is recomputed on every one of the T iterations
        eta = 1.0/(lam*t)
        random.shuffle(dataIndex)
        for j in range(k):  # inner batch loop over the training set: accumulate all the misclassified points, then update the weight vector
            i = dataIndex[j]
            p = predict(w, dataSet[i, :])  # mapper code

            # If the prediction is correct with margin >= 1 (the maximum margin is 1), accept it.
            # Otherwise count it as a mistake and fold it into the accumulated update of w.
            if labels[i]*p < 1:  # mapper code
                wDelta += labels[i]*dataSet[i, :].A  # accumulate the changes
        # w is optimised by these repeated stochastic-gradient steps
        w = (1.0 - 1/t)*w + (eta/k)*wDelta  # apply the update at each T
        # print('-----', w)
        # print('++++++', w)
    return w
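
# The update rule implemented above, written out (a restatement of the code, with the
# symbols named here for clarity): with step size eta = 1/(lam*t), each iteration sets
#   w <- (1 - 1/t) * w + (eta/k) * sum_{i in batch, y_i * (w . x_i) < 1} y_i * x_i
# i.e. the weights decay toward zero while the misclassified batch points push them back.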

datArr, labelList = loadDataSet('data/15.BigData_MapReduce/testSet.txt')
datMat = mat(datArr)
# finalWs = seqPegasos(datMat, labelList, 2, 5000)
finalWs = batchPegasos(datMat, labelList, 2, 50, 100)
print(finalWs)

import matplotlib
import matplotlib.pyplot as plt

fig = plt.figure()
ax = fig.add_subplot(111)
x1 = []
y1 = []
xm1 = []
ym1 = []
for i in range(len(labelList)):
    if labelList[i] == 1.0:
        x1.append(datMat[i, 0])
        y1.append(datMat[i, 1])
    else:
        xm1.append(datMat[i, 0])
        ym1.append(datMat[i, 1])
ax.scatter(x1, y1, marker='s', s=90)
ax.scatter(xm1, ym1, marker='o', s=50, c='red')
x = arange(-6.0, 8.0, 0.1)
y = (-finalWs[0, 0]*x - 0)/finalWs[0, 1]
# y2 = (0.43799*x)/0.12316
y2 = (0.498442*x)/0.092387  # 2 iterations
ax.plot(x, y)
ax.plot(x, y2, 'g-.')
ax.axis([-6, 8, -4, 5])
ax.legend(('50 Iterations', '2 Iterations'))
plt.show()
59
src/py3.x/ml/15.BigData_MapReduce/proximalSVM.py
Normal file
@@ -0,0 +1,59 @@
#!/usr/bin/python
# coding:utf8
'''
Created on 2011-02-25
Update on 2017-06-20
Author: Peter/ApacheCN-xy/片刻
GitHub: https://github.com/apachecn/AiLearning
'''
import base64
import pickle

import numpy


def map(key, value):
    # input key = class for one training example, e.g. "-1.0"
    classes = [float(item) for item in key.split(",")]  # e.g. [-1.0]
    D = numpy.diag(classes)

    # input value = feature vector for one training example, e.g. "3.0, 7.0, 2.0"
    featurematrix = [float(item) for item in value.split(",")]
    A = numpy.matrix(featurematrix)

    # create matrix E and vector e
    e = numpy.matrix(numpy.ones(len(A)).reshape(len(A), 1))
    E = numpy.matrix(numpy.append(A, -e, axis=1))

    # create a tuple with the values to be used by reducer
    # and encode it with base64 to avoid potential trouble with '\t' and '\n' used
    # as default separators in Hadoop Streaming
    producedvalue = base64.b64encode(pickle.dumps((E.T*E, E.T*D*e)))

    # note: a single constant key "producedkey" sends to only one reducer
    # somewhat "atypical" due to low degree of parallism on reducer side
    print("producedkey\t%s" % (producedvalue))


def reduce(key, values, mu=0.1):
    sumETE = None
    sumETDe = None

    # key isn't used, so ignoring it with _ (underscore).
    for _, value in values:
        # unpickle values
        ETE, ETDe = pickle.loads(base64.b64decode(value))
        if sumETE is None:
            # create the I/mu with correct dimensions
            sumETE = numpy.matrix(numpy.eye(ETE.shape[1])/mu)
        sumETE += ETE

        if sumETDe is None:
            # create sumETDe with correct dimensions
            sumETDe = ETDe
        else:
            sumETDe += ETDe

    # note: omega = result[:-1] and gamma = result[-1]
    # but printing entire vector as output
    result = sumETE.I*sumETDe
    print("%s\t%s" % (key, str(result.tolist())))
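# What the reducer solves, spelled out (a restatement of the code above): the proximal
# SVM has the closed-form solution
#   [omega; gamma] = (I/mu + sum(E^T E))^(-1) * sum(E^T D e)
# so each mapper ships its partial E^T E and E^T D e, and the reducer inverts the
# accumulated system once.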
25
src/py3.x/ml/15.BigData_MapReduce/py27dbg.py
Normal file
@@ -0,0 +1,25 @@
'''
Created on Feb 27, 2011
MapReduce version of Pegasos SVM
Using mrjob to automate job flow
Author: Peter
'''
from mrjob.job import MRJob

import pickle
from numpy import *


class MRsvm(MRJob):

    def map(self, mapperId, inVals):  # needs exactly 2 arguments
        if False:
            yield
        yield (1, 22)

    def reduce(self, _, packedVals):
        yield "fuck ass"

    def steps(self):
        return ([self.mr(mapper=self.map, reducer=self.reduce)])


if __name__ == '__main__':
    MRsvm.run()
32
src/py3.x/ml/15.BigData_MapReduce/wc.py
Normal file
@@ -0,0 +1,32 @@
#!/usr/bin/python
# coding:utf8
from mrjob.job import MRJob


class MRWordCountUtility(MRJob):

    def __init__(self, *args, **kwargs):
        super(MRWordCountUtility, self).__init__(*args, **kwargs)
        self.chars = 0
        self.words = 0
        self.lines = 0

    def mapper(self, _, line):
        if False:
            yield  # I'm a generator!

        self.chars += len(line) + 1  # +1 for newline
        self.words += sum(1 for word in line.split() if word.strip())
        self.lines += 1

    def mapper_final(self):
        yield('chars', self.chars)
        yield('words', self.words)
        yield('lines', self.lines)

    def reducer(self, key, values):
        yield(key, sum(values))


if __name__ == '__main__':
    MRWordCountUtility.run()
227
src/py3.x/ml/16.RecommenderSystems/RS-itemcf.py
Normal file
@@ -0,0 +1,227 @@
#!/usr/bin/python
# coding:utf8
'''
Created on 2015-06-22
Update on 2017-05-16
Author: Lockvictor/片刻
Source code for the collaborative-filtering algorithms from《推荐系统实践》
Reference: https://github.com/Lockvictor/MovieLens-RecSys
Updates: https://github.com/apachecn/AiLearning
'''
from __future__ import print_function
import sys
import math
import random
from operator import itemgetter

# Make the random data reproducible
random.seed(0)


class ItemBasedCF():
    ''' TopN recommendation - ItemBasedCF '''

    def __init__(self):
        self.trainset = {}
        self.testset = {}

        # n_sim_movie: top 20 similar movies, n_rec_movie: top 10 recommendations
        self.n_sim_movie = 20
        self.n_rec_movie = 10

        # movie_sim_mat: movie-to-movie similarities, movie_popular: per-movie rating counts, movie_count: total number of movies
        self.movie_sim_mat = {}
        self.movie_popular = {}
        self.movie_count = 0

        print('Similar movie number = %d' % self.n_sim_movie, file=sys.stderr)
        print('Recommended movie number = %d' % self.n_rec_movie, file=sys.stderr)

    @staticmethod
    def loadfile(filename):
        """loadfile(load the file and return a generator)

        Args:
            filename    the file name
        Returns:
            line        one line of data, stripped of line endings
        """
        fp = open(filename, 'r')
        for i, line in enumerate(fp):
            yield line.strip('\r\n')
            if i > 0 and i % 100000 == 0:
                print('loading %s(%s)' % (filename, i), file=sys.stderr)
        fp.close()
        print('load %s success' % filename, file=sys.stderr)

    def generate_dataset(self, filename, pivot=0.7):
        """generate_dataset(load the file and randomly split the data 7:3)

        Args:
            filename    the file name
            pivot       the split ratio
        """
        trainset_len = 0
        testset_len = 0

        for line in self.loadfile(filename):
            # user ID, movie, rating, timestamp
            # user, movie, rating, _ = line.split('::')
            user, movie, rating, _ = line.split('\t')
            # Compare a random draw against pivot, then initialise the user's entry
            if (random.random() < pivot):

                # dict.setdefault(key, default=None)
                # key -- the key to look up
                # default -- the value stored when the key is missing
                self.trainset.setdefault(user, {})
                self.trainset[user][movie] = int(rating)
                trainset_len += 1
            else:
                self.testset.setdefault(user, {})
                self.testset[user][movie] = int(rating)
                testset_len += 1

        print('split the data into a train set and a test set', file=sys.stderr)
        print('train set = %s' % trainset_len, file=sys.stderr)
        print('test set = %s' % testset_len, file=sys.stderr)

    def calc_movie_sim(self):
        """calc_movie_sim(compute the movie-to-movie similarities)"""

        print('counting movies number and popularity...', file=sys.stderr)

        # Count how often each movie appears across all users; user, movies
        for _, movies in self.trainset.items():
            for movie in movies:
                # count item popularity
                if movie not in self.movie_popular:
                    self.movie_popular[movie] = 0
                self.movie_popular[movie] += 1

        print('count movies number and popularity success', file=sys.stderr)

        # save the total number of movies
        self.movie_count = len(self.movie_popular)
        print('total movie number = %d' % self.movie_count, file=sys.stderr)

        # Count how often pairs of movies are rated by the same user
        itemsim_mat = self.movie_sim_mat
        print('building co-rated users matrix...', file=sys.stderr)
        # user, movies
        for _, movies in self.trainset.items():
            for m1 in movies:
                for m2 in movies:
                    if m1 == m2:
                        continue
                    itemsim_mat.setdefault(m1, {})
                    itemsim_mat[m1].setdefault(m2, 0)
                    itemsim_mat[m1][m2] += 1
        print('build co-rated users matrix success', file=sys.stderr)

        # calculate similarity matrix
        print('calculating movie similarity matrix...', file=sys.stderr)
        simfactor_count = 0
        PRINT_STEP = 2000000
        for m1, related_movies in itemsim_mat.items():
            for m2, count in related_movies.items():
                # Cosine similarity
                itemsim_mat[m1][m2] = count / math.sqrt(
                    self.movie_popular[m1] * self.movie_popular[m2])
                simfactor_count += 1
                # Progress report
                if simfactor_count % PRINT_STEP == 0:
                    print('calculating movie similarity factor(%d)' % simfactor_count, file=sys.stderr)

        print('calculate movie similarity matrix(similarity factor) success', file=sys.stderr)
        print('Total similarity factor number = %d' % simfactor_count, file=sys.stderr)
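
    # The similarity used above, written out (a restatement of the code): for movies
    # i and j,  sim(i, j) = |N(i) ∩ N(j)| / sqrt(|N(i)| * |N(j)|),  where N(i) is the
    # set of users who rated movie i; the loop counted the co-occurrences and
    # movie_popular holds |N(i)|.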

    # @profile
    def recommend(self, user):
        """recommend(take the top-K similar movies, rank the candidates by their summed
        similarity, and return the top N)

        Args:
            user        the user
        Returns:
            rec_movie   recommended movies, ordered by similarity from high to low
        """
        ''' Find K similar movies and recommend N movies. '''
        K = self.n_sim_movie
        N = self.n_rec_movie
        rank = {}
        watched_movies = self.trainset[user]

        # Walk the top-K similar movies for everything the user watched
        # rating = the movie's rating, w = the co-occurrence-based similarity
        # Profiling note: 98.2% of the time is spent in the sorted(...) line below
        for movie, rating in watched_movies.items():
            for related_movie, w in sorted(
                    self.movie_sim_mat[movie].items(),
                    key=itemgetter(1),
                    reverse=True)[0:K]:
                if related_movie in watched_movies:
                    continue
                rank.setdefault(related_movie, 0)
                rank[related_movie] += w * rating
        # return the N best movies
        return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N]

    def evaluate(self):
        ''' return precision, recall, coverage and popularity '''
        print('Evaluation start...', file=sys.stderr)

        # Number of recommendations returned
        N = self.n_rec_movie
        # variables for precision and recall
        # hit: the test set and the recommendation set agree (+1);
        # rec_count: recommendations per user; test_count: test-set movies per user
        hit = 0
        rec_count = 0
        test_count = 0
        # variables for coverage
        all_rec_movies = set()
        # variables for popularity
        popular_sum = 0

        # enumerate pairs each element with its index
        # Reference: http://blog.csdn.net/churximi/article/details/51648388
        for i, user in enumerate(self.trainset):
            if i > 0 and i % 500 == 0:
                print('recommended for %d users' % i, file=sys.stderr)
            test_movies = self.testset.get(user, {})
            rec_movies = self.recommend(user)

            # Compare the recommendation set against the test set; movie, w
            for movie, _ in rec_movies:
                if movie in test_movies:
                    hit += 1
                all_rec_movies.add(movie)
                # Sum the log of each recommended movie's popularity count
                popular_sum += math.log(1 + self.movie_popular[movie])
            rec_count += N
            test_count += len(test_movies)

        precision = hit / (1.0 * rec_count)
        recall = hit / (1.0 * test_count)
        coverage = len(all_rec_movies) / (1.0 * self.movie_count)
        popularity = popular_sum / (1.0 * rec_count)

        print('precision=%.4f \t recall=%.4f \t coverage=%.4f \t popularity=%.4f' % (
            precision, recall, coverage, popularity), file=sys.stderr)


if __name__ == '__main__':
    # ratingfile = 'data/16.RecommenderSystems/ml-1m/ratings.dat'
    ratingfile = 'data/16.RecommenderSystems/ml-100k/u.data'

    # Create the ItemCF object
    itemcf = ItemBasedCF()
    # Split the data 7:3 into a train set and a test set, stored in trainset and testset
    itemcf.generate_dataset(ratingfile, pivot=0.7)
    # Compute the movie-to-movie similarities
    itemcf.calc_movie_sim()
    # Evaluate the recommendations
    # itemcf.evaluate()
    # Inspect the recommendations for one user
    user = "2"
    print("recommendation", itemcf.recommend(user))
    print("---", itemcf.testset.get(user, {}))
190
src/py3.x/ml/16.RecommenderSystems/RS-sklearn-rating.py
Normal file
@@ -0,0 +1,190 @@
#!/usr/bin/python
# coding:utf8

from __future__ import print_function
import sys
import math
from operator import itemgetter

import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds
from sklearn import model_selection as cv  # cross_validation was renamed to model_selection in newer sklearn
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import pairwise_distances


def splitData(dataFile, test_size):
    # Load the data set
    header = ['user_id', 'item_id', 'rating', 'timestamp']
    df = pd.read_csv(dataFile, sep='\t', names=header)

    n_users = df.user_id.unique().shape[0]
    n_items = df.item_id.unique().shape[0]

    print('Number of users = ' + str(n_users) + ' | Number of movies = ' +
          str(n_items))
    train_data, test_data = cv.train_test_split(df, test_size=test_size)
    print("data sizes:", len(train_data), len(test_data))
    return df, n_users, n_items, train_data, test_data


def calc_similarity(n_users, n_items, train_data, test_data):
    # Build the user-item matrices, one for the training data and one for the test data
    train_data_matrix = np.zeros((n_users, n_items))
    for line in train_data.itertuples():
        train_data_matrix[line[1] - 1, line[2] - 1] = line[3]
    test_data_matrix = np.zeros((n_users, n_items))
    for line in test_data.itertuples():
        test_data_matrix[line[1] - 1, line[2] - 1] = line[3]

    # Use sklearn's pairwise_distances to compute the cosine measure
    # (note: pairwise_distances returns the cosine distance, i.e. 1 - similarity)
    print("1:", np.shape(train_data_matrix))    # rows: users, columns: movies
    print("2:", np.shape(train_data_matrix.T))  # rows: movies, columns: users

    user_similarity = pairwise_distances(train_data_matrix, metric="cosine")
    item_similarity = pairwise_distances(train_data_matrix.T, metric="cosine")

    print('counting the popular items...', file=sys.stderr)
    item_popular = {}
    # Count how often each movie appears across all users
    for i_index in range(n_items):
        if np.sum(train_data_matrix[:, i_index]) != 0:
            item_popular[i_index] = np.sum(train_data_matrix[:, i_index] != 0)
            # print("pop=", i_index, item_popular[i_index])

    # save the total number of items
    item_count = len(item_popular)
    print('total popular item count = %d' % item_count, file=sys.stderr)

    return train_data_matrix, test_data_matrix, user_similarity, item_similarity, item_popular


def predict(rating, similarity, type='user'):
    print(type)
    print("rating=", np.shape(rating))
    print("similarity=", np.shape(similarity))
    if type == 'user':
        # Per-user mean rating over all movies (axis=0 works on columns, axis=1 on rows)
        # print("rating=", np.shape(rating))
        mean_user_rating = rating.mean(axis=1)
        # On np.newaxis see: http://blog.csdn.net/xtingjie/article/details/72510834
        # print("mean_user_rating=", np.shape(mean_user_rating))
        # print("mean_user_rating.newaxis=", np.shape(mean_user_rating[:, np.newaxis]))
        rating_diff = (rating - mean_user_rating[:, np.newaxis])
        # print("rating=", rating[:3, :3])
        # print("mean_user_rating[:, np.newaxis]=", mean_user_rating[:, np.newaxis][:3, :3])
        # print("rating_diff=", rating_diff[:3, :3])

        # mean + user-user weights (943, 943) . user-movie rating diffs (943, 1682)
        # = combined user-movie scores (943, 1682); then divide by each user's total
        # weight over the other users = the final user-movie scores
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(
            rating_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        # combined score: user-movie ratings (943, 1682) . movie-movie weights (1682, 1682)
        # = combined user-movie scores (943, 1682); then divide by each movie's total
        # weight over the other movies = the final user-movie scores
        pred = rating.dot(similarity) / np.array(
            [np.abs(similarity).sum(axis=1)])
    return pred
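
# A tiny worked instance of the user-based formula above (hypothetical 2x2 numbers,
# not from the original file): with ratings [[4, 0], [0, 2]] the row means are 2 and 1
# and the diffs are [[2, -2], [-1, 1]]; each prediction is then
#   mean_u + sum_v sim(u, v) * diff(v, i) / sum_v |sim(u, v)|
# i.e. a user's own mean, shifted by the similarity-weighted deviations of the other users.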

def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return math.sqrt(mean_squared_error(prediction, ground_truth))


def evaluate(prediction, item_popular, name):
    hit = 0
    rec_count = 0
    test_count = 0
    popular_sum = 0
    all_rec_items = set()
    for u_index in range(n_users):
        items = np.where(train_data_matrix[u_index, :] == 0)[0]
        pre_items = sorted(
            dict(zip(items, prediction[u_index, items])).items(),
            key=itemgetter(1),
            reverse=True)[:20]
        test_items = np.where(test_data_matrix[u_index, :] != 0)[0]

        # Compare the recommendation set against the test set; item, w
        for item, _ in pre_items:
            if item in test_items:
                hit += 1
            all_rec_items.add(item)

            # Sum the log of each recommended item's popularity count
            if item in item_popular:
                popular_sum += math.log(1 + item_popular[item])

        rec_count += len(pre_items)
        test_count += len(test_items)

    precision = hit / (1.0 * rec_count)
    recall = hit / (1.0 * test_count)
    coverage = len(all_rec_items) / (1.0 * len(item_popular))
    popularity = popular_sum / (1.0 * rec_count)
    print('%s: precision=%.4f \t recall=%.4f \t coverage=%.4f \t popularity=%.4f' % (
        name, precision, recall, coverage, popularity), file=sys.stderr)


def recommend(u_index, prediction):
    items = np.where(train_data_matrix[u_index, :] == 0)[0]
    pre_items = sorted(
        dict(zip(items, prediction[u_index, items])).items(),
        key=itemgetter(1),
        reverse=True)[:10]
    test_items = np.where(test_data_matrix[u_index, :] != 0)[0]

    print('actual items:', test_items)
    print('recommended items:', [key for key, value in pre_items])


if __name__ == "__main__":

    # Memory-based collaborative filtering
    # ...
    # Split the data set
    # http://files.grouplens.org/datasets/movielens/ml-100k.zip
    dataFile = 'data/16.RecommenderSystems/ml-100k/u.data'
    df, n_users, n_items, train_data, test_data = splitData(
        dataFile, test_size=0.25)

    # Compute the similarities
    train_data_matrix, test_data_matrix, user_similarity, item_similarity, item_popular = calc_similarity(
        n_users, n_items, train_data, test_data)

    item_prediction = predict(train_data_matrix, item_similarity, type='item')
    user_prediction = predict(train_data_matrix, user_similarity, type='user')

    # Evaluation: root-mean-square error
    print(
        'Item based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))
    print(
        'User based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))

    # Model-based collaborative filtering
    # ...
    # Compute the sparsity of the MovieLens data set (n_users and n_items are constants,
    # so less user activity means less information: the sparser the matrix, the more
    # room there is to optimise)
    sparsity = round(1.0 - len(df) / float(n_users * n_items), 3)
    print('The sparsity level of MovieLen100K is ' + str(sparsity * 100) + '%')

    # Compute the largest k singular values/vectors of the sparse matrix
    u, s, vt = svds(train_data_matrix, k=15)
    s_diag_matrix = np.diag(s)
    svd_prediction = np.dot(np.dot(u, s_diag_matrix), vt)
    print("svd-shape:", np.shape(svd_prediction))
    print(
        'Model based CF RMSE: ' + str(rmse(svd_prediction, test_data_matrix)))
    """
    For the same amount of information, a smaller matrix carries more reliable signal.
    Hence user-cf beats item-cf here; and after the SVD, 15 dimensions already reach
    90%+ of the effect, so the signal is more reliable still and the results better.
    item-cf: 1682
    user-cf: 943
    svd: 15
    """
    evaluate(item_prediction, item_popular, 'item')
    evaluate(user_prediction, item_popular, 'user')
    evaluate(svd_prediction, item_popular, 'svd')

    # Recommendations
    recommend(1, svd_prediction)
238
src/py3.x/ml/16.RecommenderSystems/RS-usercf.py
Normal file
@@ -0,0 +1,238 @@
#!/usr/bin/python
# coding:utf8
'''
Created on 2015-06-22
Update on 2017-05-16
Author: Lockvictor/片刻
Source code for the collaborative-filtering algorithms from《推荐系统实践》
Reference: https://github.com/Lockvictor/MovieLens-RecSys
Updates: https://github.com/apachecn/AiLearning
'''
from __future__ import print_function
import sys
import math
import random
from operator import itemgetter
print(__doc__)
# Make the random data reproducible
random.seed(0)


class UserBasedCF():
    ''' TopN recommendation - UserBasedCF '''

    def __init__(self):
        self.trainset = {}
        self.testset = {}

        # n_sim_user: top 20 similar users, n_rec_movie: top 10 recommendations
        self.n_sim_user = 20
        self.n_rec_movie = 10

        # user_sim_mat: user-to-user similarities, movie_popular: per-movie rating counts, movie_count: total number of movies
        self.user_sim_mat = {}
        self.movie_popular = {}
        self.movie_count = 0

        print('similar user number = %d' % self.n_sim_user, file=sys.stderr)
        print('recommended movie number = %d' % self.n_rec_movie, file=sys.stderr)

    @staticmethod
    def loadfile(filename):
        """loadfile(load the file and return a generator)

        Args:
            filename    the file name
        Returns:
            line        one line of data, stripped of line endings
        """
        fp = open(filename, 'r')
        for i, line in enumerate(fp):
            yield line.strip('\r\n')
            if i > 0 and i % 100000 == 0:
                print('loading %s(%s)' % (filename, i), file=sys.stderr)
        fp.close()
        print('load %s success' % filename, file=sys.stderr)

    def generate_dataset(self, filename, pivot=0.7):
        """generate_dataset(load the file and randomly split the data 7:3)

        Args:
            filename    the file name
            pivot       the split ratio
        """
        trainset_len = 0
        testset_len = 0

        for line in self.loadfile(filename):
            # user ID, movie, rating, timestamp
            # user, movie, rating, timestamp = line.split('::')
            user, movie, rating, _ = line.split('\t')
            # Compare a random draw against pivot, then initialise the user's entry
            if (random.random() < pivot):

                # dict.setdefault(key, default=None)
                # key -- the key to look up
                # default -- the value stored when the key is missing
                self.trainset.setdefault(user, {})
                self.trainset[user][movie] = int(rating)
                trainset_len += 1
            else:
                self.testset.setdefault(user, {})
                self.testset[user][movie] = int(rating)
                testset_len += 1

        print('split the data into a train set and a test set', file=sys.stderr)
        print('train set = %s' % trainset_len, file=sys.stderr)
        print('test set = %s' % testset_len, file=sys.stderr)

    def calc_user_sim(self):
        """calc_user_sim(compute the user-to-user similarities)"""

        # build inverse table for item-users
        # key=movieID, value=list of userIDs who have seen this movie
        print('building movie-users inverse table...', file=sys.stderr)
        movie2users = dict()

        # Collect, per movie, the set of users who rated it,
        # and count how often each movie appears across all users
        for user, movies in self.trainset.items():
            for movie in movies:
                # inverse table for item-users
                if movie not in movie2users:
                    movie2users[movie] = set()
                movie2users[movie].add(user)
                # count item popularity at the same time
                if movie not in self.movie_popular:
                    self.movie_popular[movie] = 0
                self.movie_popular[movie] += 1

        print('build movie-users inverse table success', file=sys.stderr)

        # save the total movie number, which will be used in evaluation
        self.movie_count = len(movie2users)
        print('total movie number = %d' % self.movie_count, file=sys.stderr)

        usersim_mat = self.user_sim_mat
        # Count how often pairs of users rated the same movie
        print('building user co-rated movies matrix...', file=sys.stderr)

        for movie, users in movie2users.items():
            for u in users:
                for v in users:
                    if u == v:
                        continue
                    usersim_mat.setdefault(u, {})
                    usersim_mat[u].setdefault(v, 0)
                    usersim_mat[u][v] += 1
        print('build user co-rated movies matrix success', file=sys.stderr)

        # calculate similarity matrix
        print('calculating user similarity matrix...', file=sys.stderr)
        simfactor_count = 0
        PRINT_STEP = 2000000
        for u, related_users in usersim_mat.items():
            for v, count in related_users.items():
                # Cosine similarity
                usersim_mat[u][v] = count / math.sqrt(
                    len(self.trainset[u]) * len(self.trainset[v]))
                simfactor_count += 1
                # Progress report
                if simfactor_count % PRINT_STEP == 0:
                    print('calculating user similarity factor(%d)' % simfactor_count, file=sys.stderr)

        print('calculate user similarity matrix(similarity factor) success', file=sys.stderr)
        print('Total similarity factor number = %d' % simfactor_count, file=sys.stderr)

    # @profile
    def recommend(self, user):
        """recommend(take the top-K similar users, collect the movies they watched,
        rank the candidates by their summed similarity, and return the top N)

        Args:
            user        the user
        Returns:
            rec_movie   recommended movies, ordered by similarity from high to low
        """
        ''' Find K similar users and recommend N movies. '''
        K = self.n_sim_user
        N = self.n_rec_movie
        rank = dict()
        watched_movies = self.trainset[user]

        # Walk the top-K similar users
        # v = a similar user, wuv = the co-rating-based similarity; sort by wuv descending and keep K users
        # Profiling note: 50.4% of the time is spent in the sorted(...) line below
        for v, wuv in sorted(
                self.user_sim_mat[user].items(), key=itemgetter(1),
                reverse=True)[0:K]:
            for movie, rating in self.trainset[v].items():
                if movie in watched_movies:
                    continue
                # predict the user's "interest" for each movie
                rank.setdefault(movie, 0)
                rank[movie] += wuv * rating
        # return the N best movies
        """
        wuv
        precision=0.3766 	 recall=0.0759 	 coverage=0.3183 	 popularity=6.9194

        wuv * rating
        precision=0.3865 	 recall=0.0779 	 coverage=0.2681 	 popularity=7.0116
        """
        return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N]

    def evaluate(self):
        ''' return precision, recall, coverage and popularity '''
        print('Evaluation start...', file=sys.stderr)

        # Number of recommendations returned
        N = self.n_rec_movie
        # variables for precision and recall
        # hit: the test set and the recommendation set agree (+1);
        # rec_count: recommendations per user; test_count: test-set movies per user
        hit = 0
        rec_count = 0
        test_count = 0
        # variables for coverage
        all_rec_movies = set()
        # variables for popularity
        popular_sum = 0

        # enumerate pairs each element with its index
        # Reference: http://blog.csdn.net/churximi/article/details/51648388
        for i, user in enumerate(self.trainset):
            if i > 0 and i % 500 == 0:
                print('recommended for %d users' % i, file=sys.stderr)
            test_movies = self.testset.get(user, {})
            rec_movies = self.recommend(user)

            # Compare the recommendation set against the test set; movie, w
            for movie, _ in rec_movies:
                if movie in test_movies:
                    hit += 1
                all_rec_movies.add(movie)
                # Sum the log of each recommended movie's popularity count
                popular_sum += math.log(1 + self.movie_popular[movie])
            rec_count += N
            test_count += len(test_movies)

        precision = hit / (1.0 * rec_count)
        recall = hit / (1.0 * test_count)
        coverage = len(all_rec_movies) / (1.0 * self.movie_count)
        popularity = popular_sum / (1.0 * rec_count)

        print('precision=%.4f \t recall=%.4f \t coverage=%.4f \t popularity=%.4f' % (
            precision, recall, coverage, popularity), file=sys.stderr)


if __name__ == '__main__':
    # ratingfile = 'data/16.RecommenderSystems/ml-1m/ratings.dat'
    ratingfile = 'data/16.RecommenderSystems/ml-100k/u.data'

    # Create the UserCF object
    usercf = UserBasedCF()
    # Split the data 7:3 into a train set and a test set, stored in trainset and testset
    usercf.generate_dataset(ratingfile, pivot=0.7)
    # Compute the user-to-user similarities
    usercf.calc_user_sim()
    # Evaluate the recommendations
    usercf.evaluate()
28
src/py3.x/ml/16.RecommenderSystems/python/Recommender.py
Normal file
@@ -0,0 +1,28 @@
import numpy as np


# Hand-rolled Jaccard similarity; only valid for 0-1 matrices
def Jaccard(a, b):
    return 1.0*(a*b).sum()/(a+b-a*b).sum()
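
# A quick worked check (hypothetical 0-1 vectors, not from the original file):
# for a = [1, 0, 1] and b = [1, 1, 0], a*b = [1, 0, 0] (intersection, sum 1) and
# a + b - a*b = [1, 1, 1] (union, sum 3), so Jaccard(a, b) = 1/3.
# >>> Jaccard(np.array([1, 0, 1]), np.array([1, 1, 0]))
# 0.3333333333333333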


class Recommender():

    # Similarity matrix
    sim = None

    # Compute the similarity matrix
    def similarity(self, x, distance):
        y = np.ones((len(x), len(x)))
        for i in range(len(x)):
            for j in range(len(x)):
                y[i, j] = distance(x[i], x[j])
        return y

    # Fit
    def fit(self, x, distance=Jaccard):
        self.sim = self.similarity(x, distance)

    # Recommend: score every item by its similarity to what the user already has,
    # and mask out (via 1-a) the items the user has already consumed
    def recommend(self, a):
        return np.dot(self.sim, a)*(1-a)
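
# A minimal usage sketch (hypothetical 3-item, 3-user 0-1 matrix, not from the
# original file): fit on the item rows, then score the unseen items for one user.
#   rec = Recommender()
#   items = np.array([[1, 0, 1], [1, 1, 0], [0, 1, 1]])  # rows: items, columns: users
#   rec.fit(items)                     # builds the 3x3 Jaccard similarity matrix
#   print(rec.recommend(items[:, 0]))  # user 0 has items 0 and 1, so only item 2
#                                      # gets a score: sim(2,0) + sim(2,1) = 2/3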
@@ -0,0 +1,201 @@
|
||||
#!/usr/bin/python
|
||||
# coding:utf8
|
||||
'''
|
||||
Created on 2015-06-22
|
||||
Update on 2017-05-16
|
||||
Author: Lockvictor/片刻
|
||||
《推荐系统实践》协同过滤算法源代码
|
||||
参考地址:https://github.com/Lockvictor/MovieLens-RecSys
|
||||
更新地址:https://github.com/apachecn/AiLearning
|
||||
'''
|
||||
from __future__ import print_function
|
||||
import math
|
||||
import random
|
||||
import sys
|
||||
from operator import itemgetter
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn import cross_validation as cv
|
||||
from sklearn.metrics.pairwise import pairwise_distances
|
||||
|
||||
# 作用:使得随机数据可预测
|
||||
random.seed(0)
|
||||
|
||||
|
||||
class ItemBasedCF():
|
||||
''' TopN recommendation - ItemBasedCF '''
|
||||
|
||||
def __init__(self):
|
||||
# 拆分数据集
|
||||
self.train_mat = {}
|
||||
self.test_mat = {}
|
||||
|
||||
# 总用户数
|
||||
self.n_users = 0
|
||||
self.n_items = 0
|
||||
|
||||
# n_sim_user: top 20个用户, n_rec_item: top 10个推荐结果
|
||||
self.n_sim_item = 20
|
||||
self.n_rec_item = 10
|
||||
|
||||
# item_mat_similarity: 电影之间的相似度, item_popular: 电影的出现次数, item_count: 总电影数量
|
||||
self.item_mat_similarity = {}
|
||||
self.item_popular = {}
|
||||
self.item_count = 0
|
||||
|
||||
print('Similar item number = %d' % self.n_sim_item, file=sys.stderr)
|
||||
print('Recommended item number = %d' % self.n_rec_item, file=sys.stderr)
|
||||
|
||||
def splitData(self, dataFile, test_size):
|
||||
# 加载数据集
|
||||
header = ['user_id', 'item_id', 'rating', 'timestamp']
|
||||
df = pd.read_csv(dataFile, sep='\t', names=header)
|
||||
|
||||
self.n_users = df.user_id.unique().shape[0]
|
||||
self.n_items = df.item_id.unique().shape[0]
|
||||
|
||||
print('Number of users = ' + str(self.n_users) +
|
||||
' | Number of items = ' + str(self.n_items))
|
||||
|
||||
# 拆分数据集: 用户+电影
|
||||
self.train_data, self.test_data = cv.train_test_split(
|
||||
            df, test_size=test_size)
        print('train/test split succeeded', file=sys.stderr)
        print('len(train) = %s' % np.shape(self.train_data)[0], file=sys.stderr)
        print('len(test) = %s' % np.shape(self.test_data)[0], file=sys.stderr)

    def calc_similarity(self):
        # Build user-item rating matrices, one for the training data and one for the test data:
        self.train_mat = np.zeros((self.n_users, self.n_items))
        for line in self.train_data.itertuples():
            self.train_mat[int(line.user_id) - 1, int(line.item_id) - 1] = float(line.rating)
        self.test_mat = np.zeros((self.n_users, self.n_items))
        for line in self.test_data.itertuples():
            self.test_mat[int(line.user_id) - 1, int(line.item_id) - 1] = float(line.rating)

        # Use sklearn's pairwise_distances with the cosine metric.
        # Note: this returns cosine *distances* (1 - cosine similarity), not similarities;
        # see the remark after this file.
        print("1:", np.shape(np.mat(self.train_mat).T))  # rows: movies, cols: users
        # movie-by-movie matrix, shape (1682, 1682)
        self.item_mat_similarity = pairwise_distances(
            np.mat(self.train_mat).T, metric='cosine')
        print('item_mat_similarity=', np.shape(
            self.item_mat_similarity), file=sys.stderr)

        print('counting popular items...', file=sys.stderr)

        # Count, over all users, how many times each movie was rated in the training set
        for i_index in range(self.n_items):
            if np.sum(self.train_mat[:, i_index]) != 0:
                self.item_popular[i_index] = np.sum(
                    self.train_mat[:, i_index] != 0)

        # save the total number of items
        self.item_count = len(self.item_popular)
        print('total number of popular items = %d' % self.item_count, file=sys.stderr)

    # @profile
    def recommend(self, u_index):
        """Find the K most similar movies for each movie the user rated,
        accumulate similarity-weighted scores, and return the top-N movies.

        Args:
            u_index  user index (user_id - 1)
        Returns:
            rec_item  list of recommended movies, sorted by score descending
        """
        K = self.n_sim_item
        N = self.n_rec_item
        rank = {}
        i_items = np.where(self.train_mat[u_index, :] != 0)[0]
        watched_items = dict(zip(i_items, self.train_mat[u_index, i_items]))

        # Score candidates from the top-K neighbours of each watched movie.
        # rating = the user's rating, w = the neighbour's similarity weight.
        # Profiling note: nearly all of the runtime is spent in this loop.
        for i_item, rating in watched_items.items():
            i_other_items = np.where(
                self.item_mat_similarity[i_item, :] != 0)[0]
            for related_item, w in sorted(
                    dict(
                        zip(i_other_items, self.item_mat_similarity[
                            i_item, i_other_items])).items(),
                    key=itemgetter(1),
                    reverse=True)[0:K]:
                if related_item in watched_items:
                    continue
                rank.setdefault(related_item, 0)
                rank[related_item] += w * rating

        # return the N best items
        return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N]

    def evaluate(self):
        ''' return precision, recall, coverage and popularity '''
        print('Evaluation start...', file=sys.stderr)

        # variables for precision and recall:
        # hit counts recommendations that also appear in the test set,
        # rec_count sums recommendations per user,
        # test_count sums each user's test-set movies
        hit = 0
        rec_count = 0
        test_count = 0
        # variables for coverage
        all_rec_items = set()
        # variables for popularity
        popular_sum = 0

        # Evaluate on the first 50 users only, to keep the run short
        for u_index in range(50):
            if u_index > 0 and u_index % 10 == 0:
                print('recommended for %d users' % u_index, file=sys.stderr)
            print("u_index", u_index)

            # Compare the recommendations against the test set
            rec_items = self.recommend(u_index)
            print("rec_items=", rec_items)
            # item, w
            for item, _ in rec_items:
                if self.test_mat[u_index, item] != 0:
                    hit += 1
                    print("self.test_mat[%d, %d]=%s" %
                          (u_index, item, self.test_mat[u_index, item]))
                # collect every recommended item for the coverage metric
                # (missing in the original, which left coverage at 0)
                all_rec_items.add(item)
                # sum the log of item popularity for the novelty metric
                if item in self.item_popular:
                    popular_sum += math.log(1 + self.item_popular[item])

            rec_count += len(rec_items)
            test_count += np.sum(self.test_mat[u_index, :] != 0)

        print("-------", hit, rec_count)
        precision = hit / (1.0 * rec_count)
        recall = hit / (1.0 * test_count)
        coverage = len(all_rec_items) / (1.0 * self.item_count)
        popularity = popular_sum / (1.0 * rec_count)

        print('precision=%.4f \t recall=%.4f \t coverage=%.4f \t popularity=%.4f' % (
            precision, recall, coverage, popularity), file=sys.stderr)


if __name__ == '__main__':
    dataFile = 'data/16.RecommenderSystems/ml-100k/u.data'

    # Create the item-based CF object
    itemcf = ItemBasedCF()
    # Split the data 7:3 into a training set and a test set,
    # stored on the itemcf object as train_data and test_data
    itemcf.splitData(dataFile, test_size=0.3)
    # Compute item-item similarity
    itemcf.calc_similarity()
    # Evaluate the recommendations
    # itemcf.evaluate()
    # Inspect the recommendations for one user
    print("recommendations", itemcf.recommend(u_index=1))
    print("---", np.where(itemcf.test_mat[1, :] != 0)[0])
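One caveat worth flagging for the file above: sklearn's pairwise_distances with metric='cosine' returns cosine distances (1 - cosine similarity), so ranking neighbours by that value in descending order actually favours the least similar movies. A minimal sketch (invented toy data, not from the repo) of converting distances back to similarities before ranking:

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

# toy rating matrix: rows are users, columns are items (illustrative data)
train_mat = np.array([[5, 0, 3], [4, 0, 0], [0, 2, 5]], dtype=float)
# pairwise_distances returns 1 - cosine_similarity, so convert back
item_similarity = 1 - pairwise_distances(train_mat.T, metric='cosine')
print(np.round(item_similarity, 3))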
31
src/py3.x/ml/16.RecommenderSystems/sklearn-RS-demo-item.py
Normal file
@@ -0,0 +1,31 @@
#!/usr/bin/python
# coding:utf8

import numpy as np
from sklearn.decomposition import NMF
import matplotlib.pyplot as plt

RATE_MATRIX = np.array([[5, 5, 3, 0, 5, 5], [5, 0, 4, 0, 4, 4],
                        [0, 3, 0, 5, 4, 5], [5, 4, 3, 3, 5, 5]])

nmf = NMF(n_components=2)
user_distribution = nmf.fit_transform(RATE_MATRIX)
item_distribution = nmf.components_

# plot each item's position in the 2-D latent-topic space
item_distribution = item_distribution.T
plt.plot(item_distribution[:, 0], item_distribution[:, 1], "b*")
plt.xlim((-1, 3))
plt.ylim((-1, 3))

plt.title(u'the distribution of items (NMF)')
count = 1
for item in item_distribution:
    plt.text(
        item[0],
        item[1],
        'item ' + str(count),
        bbox=dict(facecolor='red', alpha=0.2),
    )
    count += 1

plt.show()
32
src/py3.x/ml/16.RecommenderSystems/sklearn-RS-demo-user.py
Normal file
@@ -0,0 +1,32 @@
#!/usr/bin/python
# coding:utf8

import numpy as np
from sklearn.decomposition import NMF
import matplotlib.pyplot as plt

RATE_MATRIX = np.array([[5, 5, 3, 0, 5, 5], [5, 0, 4, 0, 4, 4],
                        [0, 3, 0, 5, 4, 5], [5, 4, 3, 3, 5, 5]])

nmf = NMF(n_components=2)
user_distribution = nmf.fit_transform(RATE_MATRIX)
item_distribution = nmf.components_

users = ['Ben', 'Tom', 'John', 'Fred']
zip_data = zip(users, user_distribution)

plt.title(u'the distribution of users (NMF)')
plt.xlim((-1, 3))
plt.ylim((-1, 4))
for user_name, data in zip_data:
    plt.plot(data[0], data[1], "b*")
    plt.text(
        data[0],
        data[1],
        user_name,
        bbox=dict(facecolor='red', alpha=0.2),
    )

plt.show()
18
src/py3.x/ml/16.RecommenderSystems/sklearn-RS-demo.py
Normal file
@@ -0,0 +1,18 @@
#!/usr/bin/python
# coding:utf8

import numpy as np
from sklearn.decomposition import NMF
import matplotlib.pyplot as plt

RATE_MATRIX = np.array([[5, 5, 3, 0, 5, 5], [5, 0, 4, 0, 4, 4],
                        [0, 3, 0, 5, 4, 5], [5, 4, 3, 3, 5, 5]])

nmf = NMF(n_components=2)  # assume 2 latent topics
user_distribution = nmf.fit_transform(RATE_MATRIX)
item_distribution = nmf.components_

print('user topic distribution:')
print(user_distribution)
print('item topic distribution:')
print(item_distribution)
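As a quick check of the decomposition above (a sketch reusing the variables just printed), multiplying the two factors back together should approximately reproduce RATE_MATRIX; the residual norm measures how much two latent topics fail to explain:

approx = np.dot(user_distribution, item_distribution)
print(np.round(approx, 2))
print('reconstruction error:', np.linalg.norm(RATE_MATRIX - approx))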
73
src/py3.x/ml/16.RecommenderSystems/test_evaluation_model.py
Normal file
@@ -0,0 +1,73 @@
import math
import random

# Note: GetRecommendation(user, N) is assumed to be defined elsewhere in the project.


def SplitData(data, M, k, seed):
    test = []
    train = []
    random.seed(seed)
    for user, item in data:
        # randint(0, M) is inclusive of M, so this actually splits into M + 1 buckets
        if random.randint(0, M) == k:
            test.append([user, item])
        else:
            train.append([user, item])
    return train, test


# Precision
def Precision(train, test, N):
    hit = 0
    all = 0
    for user in train.keys():
        tu = test[user]
        rank = GetRecommendation(user, N)
        for item, pui in rank:
            if item in tu:
                hit += 1
        all += N
    return hit / (all * 1.0)


# Recall
def Recall(train, test, N):
    hit = 0
    all = 0
    for user in train.keys():
        tu = test[user]
        rank = GetRecommendation(user, N)
        for item, pui in rank:
            if item in tu:
                hit += 1
        all += len(tu)
    return hit / (all * 1.0)


# Coverage
def Coverage(train, test, N):
    recommend_items = set()
    all_items = set()
    for user in train.keys():
        for item in train[user].keys():
            all_items.add(item)
        rank = GetRecommendation(user, N)
        for item, pui in rank:
            recommend_items.add(item)
    return len(recommend_items) / (len(all_items) * 1.0)


# Novelty: mean log-popularity of the recommended items (lower means more novel)
def Popularity(train, test, N):
    item_popularity = dict()
    for user, items in train.items():
        for item in items.keys():
            if item not in item_popularity:
                item_popularity[item] = 0
            item_popularity[item] += 1
    ret = 0
    n = 0
    for user in train.keys():
        rank = GetRecommendation(user, N)
        for item, pui in rank:
            ret += math.log(1 + item_popularity[item])
            n += 1
    ret /= n * 1.0
    return ret
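Since GetRecommendation is only referenced above, here is a hedged usage sketch with a stub recommender showing how the metrics are meant to be called (the stub and the data are invented for illustration):

def GetRecommendation(user, N):
    # stub: always recommend items 0..N-1 with a constant score
    return [('item_%d' % i, 1.0) for i in range(N)]

train = {'u1': {'item_0': 1, 'item_9': 1}}
test = {'u1': {'item_1': 1}}
print(Precision(train, test, N=3))  # 1 hit out of 3 recommendations -> 0.333...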
16
src/py3.x/ml/16.RecommenderSystems/test_graph-based.py
Normal file
@@ -0,0 +1,16 @@
def PersonalRank(G, alpha, root):
    # rank[v] is the visit probability of node v for a random walk that
    # restarts at `root` with probability 1 - alpha
    rank = {x: 0 for x in G.keys()}
    rank[root] = 1
    for _ in range(20):
        tmp = {x: 0 for x in G.keys()}
        for i, ri in G.items():
            # j, wij
            for j, _ in ri.items():
                if j not in tmp:
                    tmp[j] = 0
                # continue the walk with probability alpha
                # (the original hard-coded 0.6 here instead of using alpha)
                tmp[j] += alpha * rank[i] / (1.0 * len(ri))
                if j == root:
                    tmp[j] += 1 - alpha
        rank = tmp
    return rank
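A small usage sketch on an invented bipartite graph (users A/B, items a/b; the dict-of-dicts encodes edges in both directions). After 20 iterations the nodes closest to the root carry the most probability mass:

G = {'A': {'a': 1, 'b': 1},
     'B': {'b': 1},
     'a': {'A': 1},
     'b': {'A': 1, 'B': 1}}
print(PersonalRank(G, alpha=0.8, root='A'))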
40
src/py3.x/ml/16.RecommenderSystems/test_lfm.py
Normal file
@@ -0,0 +1,40 @@
import random


# Negative sampling: keep the user's items as positives (label 1) and sample
# roughly an equal number of unvisited items from items_pool as negatives (label 0).
# Note: items_pool, InitModel and Predict are assumed to be defined elsewhere;
# the original signature had a stray `self` although this is a free function.
def RandSelectNegativeSamples(items):
    ret = {key: 1 for key in items}
    n = 0
    for i in range(0, len(items) * 3):
        item = items_pool[random.randint(0, len(items_pool) - 1)]
        if item in ret:
            continue
        ret[item] = 0
        n += 1
        if n > len(items):
            break
    return ret


def LatentFactorModel(user_items, F, N, alpha, _lambda):
    [P, Q] = InitModel(user_items, F)
    for step in range(0, N):
        for user, items in user_items.items():
            samples = RandSelectNegativeSamples(items)
            # stochastic gradient descent on the squared error with L2 regularisation
            for item, rui in samples.items():
                eui = rui - Predict(user, item)
                for f in range(0, F):
                    P[user][f] += alpha * (eui * Q[item][f] - _lambda * P[user][f])
                    Q[item][f] += alpha * (eui * P[user][f] - _lambda * Q[item][f])
        # decay the learning rate after each pass
        alpha *= 0.9
    return P, Q


def Recommend(user, P, Q):
    # Note: here Q is assumed to be indexed as Q[factor][item], i.e. transposed
    # relative to the Q[item][factor] layout used during training above.
    rank = dict()
    for f, puf in P[user].items():
        for i, qfi in Q[f].items():
            if i not in rank:
                rank[i] = 0
            rank[i] += puf * qfi
    return rank
65
src/py3.x/ml/16.RecommenderSystems/test_基于物品.py
Normal file
@@ -0,0 +1,65 @@
import math
from operator import itemgetter


def ItemSimilarity1(train):
    # calculate co-rated users between items
    # (the original iterated an undefined `users`; the inner loops should
    # run over the current user's items)
    C = dict()
    N = dict()
    for u, items in train.items():
        for i in items:
            N.setdefault(i, 0)
            N[i] += 1
            C.setdefault(i, dict())
            for j in items:
                if i == j:
                    continue
                C[i].setdefault(j, 0)
                C[i][j] += 1

    # calculate final similarity matrix W
    W = dict()
    for i, related_items in C.items():
        W.setdefault(i, dict())
        for j, cij in related_items.items():
            W[i][j] = cij / math.sqrt(N[i] * N[j])
    return W


def ItemSimilarity2(train):
    # calculate co-rated users between items, down-weighting very active users
    C = dict()
    N = dict()
    for u, items in train.items():
        for i in items:
            N.setdefault(i, 0)
            N[i] += 1
            C.setdefault(i, dict())
            for j in items:
                if i == j:
                    continue
                C[i].setdefault(j, 0)
                C[i][j] += 1 / math.log(1 + len(items) * 1.0)

    # calculate final similarity matrix W
    W = dict()
    for i, related_items in C.items():
        W.setdefault(i, dict())
        for j, cij in related_items.items():
            W[i][j] = cij / math.sqrt(N[i] * N[j])
    return W


def Recommendation1(train, user_id, W, K):
    rank = dict()
    ru = train[user_id]
    for i, pi in ru.items():
        for j, wj in sorted(W[i].items(), key=itemgetter(1), reverse=True)[0:K]:
            if j in ru:
                continue
            rank.setdefault(j, 0)
            rank[j] += pi * wj
    return rank


# Minimal record type so the book-style pseudocode below runs: each
# recommendation keeps its score and a per-source-item explanation.
class RecItem(object):
    def __init__(self):
        self.weight = 0
        self.reason = dict()


def Recommendation2(train, user_id, W, K):
    # like Recommendation1, but also records which watched item contributed
    # how much to each recommendation (for generating explanations)
    rank = dict()
    ru = train[user_id]
    for i, pi in ru.items():
        for j, wj in sorted(W[i].items(), key=itemgetter(1), reverse=True)[0:K]:
            if j in ru:
                continue
            if j not in rank:
                rank[j] = RecItem()
            rank[j].weight += pi * wj
            rank[j].reason[i] = pi * wj
    return rank
80
src/py3.x/ml/16.RecommenderSystems/test_基于用户.py
Normal file
@@ -0,0 +1,80 @@
import math
from operator import itemgetter


def UserSimilarity1(train):
    # Naive O(|U|^2) cosine similarity; train[u] is assumed to be a set of items here
    W = dict()
    for u in train.keys():
        W.setdefault(u, dict())
        for v in train.keys():
            if u == v:
                continue
            W[u][v] = len(train[u] & train[v])
            W[u][v] /= math.sqrt(len(train[u]) * len(train[v]) * 1.0)
    return W


def UserSimilarity2(train):
    # build inverse table for item_users
    item_users = dict()
    for u, items in train.items():
        for i in items.keys():
            if i not in item_users:
                item_users[i] = set()
            item_users[i].add(u)

    # calculate co-rated items between users
    C = dict()
    N = dict()
    for i, users in item_users.items():
        for u in users:
            N.setdefault(u, 0)
            N[u] += 1
            C.setdefault(u, dict())
            for v in users:
                if u == v:
                    continue
                C[u].setdefault(v, 0)
                C[u][v] += 1

    # calculate final similarity matrix W
    W = dict()
    for u, related_users in C.items():
        W.setdefault(u, dict())
        for v, cuv in related_users.items():
            W[u][v] = cuv / math.sqrt(N[u] * N[v])
    return W


def UserSimilarity3(train):
    # like UserSimilarity2, but down-weights popular items (User-IIF)
    # build inverse table for item_users
    item_users = dict()
    for u, items in train.items():
        for i in items.keys():
            if i not in item_users:
                item_users[i] = set()
            item_users[i].add(u)

    # calculate co-rated items between users
    C = dict()
    N = dict()
    for i, users in item_users.items():
        for u in users:
            N.setdefault(u, 0)
            N[u] += 1
            C.setdefault(u, dict())
            for v in users:
                if u == v:
                    continue
                C[u].setdefault(v, 0)
                C[u][v] += 1 / math.log(1 + len(users))

    # calculate final similarity matrix W
    W = dict()
    for u, related_users in C.items():
        W.setdefault(u, dict())
        for v, cuv in related_users.items():
            W[u][v] = cuv / math.sqrt(N[u] * N[v])
    return W


def Recommend(user, train, W, K):
    # K was an undeclared global in the original; it is taken as a parameter here
    rank = dict()
    interacted_items = train[user]
    for v, wuv in sorted(W[user].items(), key=itemgetter(1), reverse=True)[0:K]:
        for i, rvi in train[v].items():
            if i in interacted_items:
                # we should filter items the user interacted with before
                continue
            rank.setdefault(i, 0)
            rank[i] += wuv * rvi
    return rank
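A quick sanity check for UserSimilarity2 on invented data: user A rates two items, user B rates one, and they co-rate exactly one item, so the expected similarity is 1/sqrt(2*1) ≈ 0.707:

train = {'A': {'i1': 1, 'i2': 1}, 'B': {'i1': 1}}
print(UserSimilarity2(train))
# expected: {'A': {'B': 0.7071...}, 'B': {'A': 0.7071...}}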
325
src/py3.x/ml/2.KNN/kNN.py
Normal file
@@ -0,0 +1,325 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
Created on Sep 16, 2010
Update on 2017-05-18
Author: Peter Harrington/羊三/小瑶
GitHub: https://github.com/apachecn/AiLearning
'''

# import numpy and the operator module
from numpy import *
import operator
import os
from collections import Counter


def createDataSet():
    """
    Desc:
        Create a toy data set with labels
    Args:
        None
    Returns:
        group -- features of the training set
        labels -- labels of the training set
    Usage:
        import kNN
        group, labels = kNN.createDataSet()
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels


def classify0(inX, dataSet, labels, k):
    """
    Desc:
        kNN classification function
    Args:
        inX -- input vector to classify / test sample
        dataSet -- features of the training set
        labels -- labels of the training set
        k -- number of nearest neighbours to use
    Returns:
        sortedClassCount[0][0] -- predicted label for the input vector

    Note: labels must have as many entries as dataSet has rows;
    the program uses the Euclidean distance.

    Example call:
        kNN.classify0([0,0], group, labels, 3)
    """

    # -----------first implementation of classify0()-----------------------------------------------------------------------------------------
    # 1. distance computation
    dataSetSize = dataSet.shape[0]
    # tile the input so it lines up with every training sample, then subtract
    """
    tile: (3, 1) repeats inX 3 times along rows and once along columns

    In [8]: tile(inx, (3, 1))
    Out[8]:
    array([[1, 2, 3],
           [1, 2, 3],
           [1, 2, 3]])

    In [9]: tile(inx, (3, 2))
    Out[9]:
    array([[1, 2, 3, 1, 2, 3],
           [1, 2, 3, 1, 2, 3],
           [1, 2, 3, 1, 2, 3]])
    """
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    """
    Euclidean distance between points:
    row 1: distance from the input point to the 1st point of dataSet
    row 2: distance from the input point to the 2nd point of dataSet
    ...
    row N: distance from the input point to the Nth point of dataSet

    [[1,2,3],[1,2,3]] - [[1,2,3],[1,2,0]]
    (A1-A2)^2 + (B1-B2)^2 + (C1-C2)^2
    """
    # square the differences
    sqDiffMat = diffMat ** 2
    # sum each row
    sqDistances = sqDiffMat.sum(axis=1)
    # square root
    distances = sqDistances ** 0.5
    # argsort() returns the indices that would sort the distances ascending.
    # E.g. for x = array([3,0,2,1,4,5]), x[1] = 0 is smallest, so argsort()[0] == 1.
    sortedDistIndicies = distances.argsort()

    # 2. pick the k nearest points
    classCount = {}
    for i in range(k):
        # look up the label of this neighbour
        voteIlabel = labels[sortedDistIndicies[i]]
        # increment its count; dict.get(key, 0) returns 0 when the key is absent,
        # and the stored count otherwise
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # 3. sort by count and return the most frequent label
    # dict.items() returns (key, value) pairs, e.g. [('Age', 7), ('Name', 'Zara')];
    # key=operator.itemgetter(1) sorts by the second tuple element, i.e. the count
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


# ------------------------------------------------------------------------------------------------------------------------------------------
# second implementation of classify0()

# """
# 1. compute distances
#
# Euclidean distance between points, as above.
#
# inx - dataset uses numpy broadcasting, see https://docs.scipy.org/doc/numpy-1.13.0/user/basics.broadcasting.html
# np.sum() usage, see https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.sum.html
# """
# dist = np.sum((inx - dataset)**2, axis=1)**0.5

# """
# 2. labels of the k nearest points
#
# Sort the distances with numpy's argsort, see https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.sort.html#numpy.sort
# argsort returns indices, so take the first k with [0 : k]
# and collect the corresponding labels in k_labels
# """
# k_labels = [labels[index] for index in dist.argsort()[0 : k]]
# """
# 3. the most frequent label wins
#
# collections.Counter counts the labels; most_common(1) returns e.g. [('label1', 2)],
# so [0][0] extracts the label itself
# """
# label = Counter(k_labels).most_common(1)[0][0]
# return label

# ------------------------------------------------------------------------------------------------------------------------------------------


def test1():
    """
    Demo of the first example
    """
    group, labels = createDataSet()
    print(str(group))
    print(str(labels))
    print(classify0([0.1, 0.1], group, labels, 3))


# ----------------------------------------------------------------------------------------
def file2matrix(filename):
    """
    Load the training data
    :param filename: path to the data file
    :return: feature matrix returnMat and class vector classLabelVector
    """
    fr = open(filename, 'r')
    # number of data rows in the file
    numberOfLines = len(fr.readlines())
    # prepare an all-zero matrix of the right shape,
    # e.g. zeros((2, 3)) creates a 2x3 matrix of zeros
    returnMat = zeros((numberOfLines, 3))  # prepare matrix to return
    classLabelVector = []  # prepare labels return
    fr = open(filename, 'r')
    index = 0
    for line in fr.readlines():
        # str.strip([chars]) removes the given leading/trailing characters
        line = line.strip()
        # split on '\t'
        listFromLine = line.split('\t')
        # the first columns are the features
        returnMat[index] = listFromLine[0 : 3]
        # the last column is the class label
        classLabelVector.append(int(listFromLine[-1]))
        index += 1
    return returnMat, classLabelVector


def autoNorm(dataSet):
    """
    Desc:
        Normalise the features so differing scales do not dominate the distance
    Args:
        dataSet -- data set to normalise
    Returns:
        normDataSet -- normalised data set
        ranges -- per-feature ranges used for normalisation
        minVals -- per-feature minima

    Normalisation formula:
        Y = (X - Xmin) / (Xmax - Xmin)
    where min and max are the per-feature minimum and maximum;
    this maps every feature into the interval [0, 1].
    """
    # per-feature minimum, maximum and range
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    # range
    ranges = maxVals - minVals
    # -------first implementation---start-------------------------
    normDataSet = zeros(shape(dataSet))
    m = dataSet.shape[0]
    # subtract the per-feature minimum from every row
    normDataSet = dataSet - tile(minVals, (m, 1))
    # divide by the per-feature range
    normDataSet = normDataSet / tile(ranges, (m, 1))  # element wise divide
    # -------first implementation---end---------------------------------------------

    # # -------second implementation---start---------------------------------------
    # norm_dataset = (dataset - minvalue) / ranges
    # # -------second implementation---end---------------------------------------------
    return normDataSet, ranges, minVals


def datingClassTest():
    """
    Desc:
        Test against the dating-site data and print the number of
        misclassifications and the error rate
    Args:
        None
    Returns:
        None
    """
    # fraction of the data held out for testing (training fraction = 1 - hoRatio)
    hoRatio = 0.1
    # load the data set from file
    datingDataMat, datingLabels = file2matrix("data/2.KNN/datingTestSet2.txt")
    # normalise the data
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # m is the number of rows, i.e. the first matrix dimension
    m = normMat.shape[0]
    # number of test samples; rows numTestVecs..m are used for training
    numTestVecs = int(m * hoRatio)
    print('numTestVecs=', numTestVecs)
    errorCount = 0
    for i in range(numTestVecs):
        # classify one test sample
        classifierResult = classify0(normMat[i], normMat[numTestVecs : m], datingLabels[numTestVecs : m], 3)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i]))
        errorCount += classifierResult != datingLabels[i]
    print("the total error rate is: %f" % (errorCount / numTestVecs))
    print(errorCount)


def img2vector(filename):
    """
    Desc:
        Convert an image file to a vector
    Args:
        filename -- image file; the input images are 32 * 32
    Returns:
        returnVect -- the image flattened into a 1 * 1024 matrix

    Creates a 1 * 1024 NumPy array, opens the given file, reads the first
    32 lines, stores the first 32 characters of each line in the array,
    and returns the array.
    """
    returnVect = zeros((1, 1024))
    fr = open(filename, 'r')
    for i in range(32):
        lineStr = fr.readline()
        for j in range(32):
            returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect


def handwritingClassTest():
    """
    Desc:
        Handwritten-digit classifier; prints the number of
        misclassifications and the error rate
    Args:
        None
    Returns:
        None
    """
    # 1. load the training data
    hwLabels = []
    trainingFileList = os.listdir("data/2.KNN/trainingDigits")  # load the training set
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    # hwLabels holds the digit 0~9 of each row; trainingMat holds the image vectors
    for i in range(m):
        fileNameStr = trainingFileList[i]
        fileStr = fileNameStr.split('.')[0]  # take off .txt
        classNumStr = int(fileStr.split('_')[0])
        hwLabels.append(classNumStr)
        # flatten the 32*32 matrix into a 1*1024 matrix
        trainingMat[i] = img2vector('data/2.KNN/trainingDigits/%s' % fileNameStr)

    # 2. load the test data
    testFileList = os.listdir('data/2.KNN/testDigits')  # iterate through the test set
    errorCount = 0
    mTest = len(testFileList)
    for i in range(mTest):
        fileNameStr = testFileList[i]
        fileStr = fileNameStr.split('.')[0]  # take off .txt
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector('data/2.KNN/testDigits/%s' % fileNameStr)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, classNumStr))
        errorCount += classifierResult != classNumStr
    print("\nthe total number of errors is: %d" % errorCount)
    print("\nthe total error rate is: %f" % (errorCount / mTest))


if __name__ == '__main__':
    # test1()
    # datingClassTest()
    handwritingClassTest()
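The commented-out second implementation of classify0 relies on numpy broadcasting; a minimal standalone sketch of the same distance computation (variable names invented here):

import numpy as np

inx = np.array([0.1, 0.1])
dataset = np.array([[1.0, 1.1], [1.0, 1.0], [0.0, 0.0], [0.0, 0.1]])
# broadcasting subtracts inx from every row; sum the squares per row, then sqrt
dist = np.sum((inx - dataset) ** 2, axis=1) ** 0.5
print(dist.argsort()[:3])  # indices of the 3 nearest training samples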
69
src/py3.x/ml/2.KNN/sklearn-knn-demo.py
Normal file
@@ -0,0 +1,69 @@
#!/usr/bin/python
# -*- coding: UTF-8 -*-

"""
Created on 2017-06-28
Updated on 2017-06-28
KNN: k-nearest-neighbour algorithm
Author: 小瑶
GitHub: https://github.com/apachecn/AiLearning
"""
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt
from numpy import *
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets

n_neighbors = 3

# load some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features; a 2-D data set would avoid this ugly slicing
y = iris.target

# X = array([[-1.0, -1.1], [-1.0, -1.0], [0, 0], [1.0, 1.1], [2.0, 2.0], [2.0, 2.1]])
# y = array([0, 0, 0, 1, 1, 1])

h = .02  # step size of the mesh

# create the colour maps
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

# cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA'])
# cmap_bold = ListedColormap(['#FF0000', '#00FF00'])

for weights in ['uniform', 'distance']:
    # create a kNN classifier instance and fit the data
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
    clf.fit(X, y)

    # Plot the decision boundary by assigning a colour to each point
    # of the mesh over [x_min, x_max] x [y_min, y_max].
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # put the result into a colour plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

    # plot the training points
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("3-Class classification (k = %i, weights = '%s')"
              % (n_neighbors, weights))

plt.show()
117
src/py3.x/ml/3.DecisionTree/DTSklearn.py
Normal file
@@ -0,0 +1,117 @@
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# original link: http://blog.csdn.net/lsldd/article/details/41223147
# GitHub: https://github.com/apachecn/AiLearning
import numpy as np
from sklearn import tree
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split


def createDataSet():
    ''' read the data '''
    data = []
    labels = []
    with open("data/3.DecisionTree/data.txt") as ifile:
        for line in ifile:
            # features: height, weight; label: fat or thin
            tokens = line.strip().split(' ')
            data.append([float(tk) for tk in tokens[:-1]])
            labels.append(tokens[-1])
    # feature data
    x = np.array(data)
    # class labels
    labels = np.array(labels)
    # numeric target vector
    y = np.zeros(labels.shape)

    ''' convert the labels to 0/1 '''
    y[labels == 'fat'] = 1
    print(data, '-------', x, '-------', labels, '-------', y)
    return x, y


def predict_train(x_train, y_train):
    '''
    Train a decision tree using information entropy as the split criterion.
    Reference: http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier
    '''
    clf = tree.DecisionTreeClassifier(criterion='entropy')
    # print(clf)
    clf.fit(x_train, y_train)
    ''' Feature importances: the larger the value, the bigger the feature's role in the classification. '''
    print('feature_importances_: %s' % clf.feature_importances_)

    ''' print the training-set predictions '''
    y_pre = clf.predict(x_train)
    # print(x_train)
    print(y_pre)
    print(y_train)
    print(np.mean(y_pre == y_train))
    return y_pre, clf


def show_precision_recall(x, y, clf, y_train, y_pre):
    '''
    precision and recall
    Reference: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_curve.html#sklearn.metrics.precision_recall_curve
    '''
    precision, recall, thresholds = precision_recall_curve(y_train, y_pre)
    # predicted probability of the positive class for the full data set
    answer = clf.predict_proba(x)[:, 1]

    '''
    Show precision and recall:
    precision
    recall
    f1-score   combined score of precision and recall
    support    number of samples compared
    Reference: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html#sklearn.metrics.classification_report
    '''
    # target_names follows the label classes of y.
    # classification_report needs discrete predictions, so the probabilities
    # are thresholded at 0.5 here (the original passed raw probabilities).
    target_names = ['thin', 'fat']
    print(classification_report(y, answer > 0.5, target_names=target_names))
    print(answer)
    print(y)


def show_pdf(clf):
    '''
    Visualise the tree by writing its structure to a file:
    http://sklearn.lzjqsdd.com/modules/tree.html

    Mac error: pydotplus.graphviz.InvocationException: GraphViz's executables not found
    Fix: sudo brew install graphviz
    Reference: http://www.jianshu.com/p/59b510bafb4d
    '''
    # with open("testResult/tree.dot", 'w') as f:
    #     from sklearn.externals.six import StringIO
    #     tree.export_graphviz(clf, out_file=f)

    import pydotplus
    from sklearn.externals.six import StringIO
    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("../../../output/3.DecisionTree/tree.pdf")

    # from IPython.display import Image
    # Image(graph.create_png())


if __name__ == '__main__':
    x, y = createDataSet()

    ''' split into training and test data: 80% train, 20% test '''
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    print('split data:', x_train, x_test, y_train, y_test)

    # predictions on the training set
    y_pre, clf = predict_train(x_train, y_train)

    # show precision and recall
    show_precision_recall(x, y, clf, y_train, y_pre)

    # visualise the tree
    show_pdf(clf)
404
src/py3.x/ml/3.DecisionTree/DecisionTree.py
Executable file
@@ -0,0 +1,404 @@
#!/usr/bin/python
# -*- coding: UTF-8 -*-

'''
Created on Oct 12, 2010
Update on 2017-05-18
Decision Tree Source Code for Machine Learning in Action Ch. 3
Author: Peter Harrington/片刻
GitHub: https://github.com/apachecn/AiLearning
'''
print(__doc__)
import operator
from math import log
import decisionTreePlot as dtPlot
from collections import Counter


def createDataSet():
    """
    Desc:
        Create the data set
    Args:
        None
    Returns:
        the data set and the corresponding feature labels
    """
    # the first two columns of dataSet are features; the last column is the class label
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    # dataSet = [['yes'],
    #            ['yes'],
    #            ['no'],
    #            ['no'],
    #            ['no']]
    # labels: "no surfacing" and "flippers". Note: these are the *names* of the
    # features in dataSet, not the class labels / target variable
    labels = ['no surfacing', 'flippers']
    return dataSet, labels


def calcShannonEnt(dataSet):
    """
    Desc:
        calculate Shannon entropy of the data set's class labels
    Args:
        dataSet -- the data set
    Returns:
        shannonEnt -- the Shannon entropy (expected information) of the label distribution
    """
    # -----------first way to compute the entropy start--------------------------------------------------------------------------------
    # length of the list = number of rows participating in training
    numEntries = len(dataSet)
    # e.g. the line below would print: <type 'list'> numEntries: 5
    # print(type(dataSet), 'numEntries: ', numEntries)

    # count the occurrences of each class label
    labelCounts = {}
    for featVec in dataSet:
        # the last entry of each row is its class label
        currentLabel = featVec[-1]
        # create a dictionary entry for every possible class; each key counts
        # how often that class occurs
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
        # print('-----', featVec, labelCounts)

    # compute the Shannon entropy from the label proportions
    shannonEnt = 0.0
    for key in labelCounts:
        # probability of each class from its relative frequency
        prob = float(labelCounts[key])/numEntries
        # Shannon entropy uses log base 2
        shannonEnt -= prob * log(prob, 2)
        # print('---', prob, prob * log(prob, 2), shannonEnt)
    # -----------first way to compute the entropy end--------------------------------------------------------------------------------

    # # -----------second way to compute the entropy start--------------------------------------------------------------------------------
    # # count the label occurrences
    # label_count = Counter(data[-1] for data in dataSet)
    # # compute the probabilities
    # probs = [p[1] / len(dataSet) for p in label_count.items()]
    # # compute the Shannon entropy
    # shannonEnt = sum([-p * log(p, 2) for p in probs])
    # # -----------second way to compute the entropy end--------------------------------------------------------------------------------
    return shannonEnt


def splitDataSet(dataSet, index, value):
    """
    Desc:
        Split the data set: collect the rows whose `index` column equals `value`,
        dropping the `index` column from each collected row.
    Args:
        dataSet -- the data set to split
        index -- the feature column to split on
        value -- the feature value to select
    Returns:
        the rows with row[index] == value [with the index column removed]
    """
    # -----------first way to split start------------------------------------
    retDataSet = []
    for featVec in dataSet:
        # check whether the index column equals value
        if featVec[index] == value:
            # chop out index used for splitting:
            # [:index] takes the entries before the index column
            reducedFeatVec = featVec[:index]
            '''
            The difference between extend and append:
            list.append(object) adds object to the list as a single element
            list.extend(sequence) appends each element of sequence to the list
            result = []
            result.extend([1,2,3])   # [1, 2, 3]
            result.append([4,5,6])   # [1, 2, 3, [4, 5, 6]]
            result.extend([7,8,9])   # [1, 2, 3, [4, 5, 6], 7, 8, 9]
            '''
            # [index+1:] skips the index column and takes the rest
            reducedFeatVec.extend(featVec[index+1:])
            # collect the row (with the index column removed)
            retDataSet.append(reducedFeatVec)
    # -----------first way to split end------------------------------------

    # # -----------second way to split start------------------------------------
    # retDataSet = [data[:index] + data[index + 1:] for data in dataSet for i, v in enumerate(data) if i == index and v == value]
    # # -----------second way to split end------------------------------------
    return retDataSet


def chooseBestFeatureToSplit(dataSet):
    """
    Desc:
        Choose the best feature to split the data set on
    Args:
        dataSet -- the data set to split
    Returns:
        bestFeature -- index of the best feature column to split on
    """

    # -----------first way to choose the best feature start------------------------------------
    # number of feature columns in the first row; the last column is the label
    numFeatures = len(dataSet[0]) - 1
    # entropy of the label distribution
    baseEntropy = calcShannonEnt(dataSet)
    # best information gain and best feature index so far
    bestInfoGain, bestFeature = 0.0, -1
    # iterate over all the features
    for i in range(numFeatures):
        # create a list of all the examples of this feature:
        # collect the i-th feature value of every example
        featList = [example[i] for example in dataSet]
        # get a set of unique values by deduplicating the list with a set
        uniqueVals = set(featList)
        # entropy after splitting on this feature
        newEntropy = 0.0
        # For each unique value of the current feature, split the data set once,
        # compute the entropy of the subset, and sum the weighted subset entropies.
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet)/float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        # information gain: the change in entropy from the split, i.e. the
        # reduction in disorder. Keep the feature with the largest gain.
        infoGain = baseEntropy - newEntropy
        print('infoGain=', infoGain, 'bestFeature=', i, baseEntropy, newEntropy)
        if (infoGain > bestInfoGain):
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
    # -----------first way to choose the best feature end------------------------------------

    # # -----------second way to choose the best feature start------------------------------------
    # # initial Shannon entropy
    # base_entropy = calcShannonEnt(dataSet)
    # best_info_gain = 0
    # best_feature = -1
    # # iterate over every feature
    # for i in range(len(dataSet[0]) - 1):
    #     # count the values of the current feature
    #     feature_count = Counter([data[i] for data in dataSet])
    #     # entropy after the split
    #     new_entropy = sum(feature[1] / float(len(dataSet)) * calcShannonEnt(splitDataSet(dataSet, i, feature[0])) \
    #                       for feature in feature_count.items())
    #     # update the best gain
    #     info_gain = base_entropy - new_entropy
    #     print('No. {0} feature info gain is {1:.3f}'.format(i, info_gain))
    #     if info_gain > best_info_gain:
    #         best_info_gain = info_gain
    #         best_feature = i
    # return best_feature
    # # -----------second way to choose the best feature end------------------------------------


def majorityCnt(classList):
    """
    Desc:
        Pick the most frequent class label
    Args:
        classList -- list of class labels
    Returns:
        the most frequent label
    """
    # -----------first way for majorityCnt start------------------------------------
    classCount = {}
    for vote in classList:
        if vote not in classCount.keys():
            classCount[vote] = 0
        classCount[vote] += 1
    # sort classCount descending and take the first entry (yes/no),
    # i.e. the most frequent label
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    # print('sortedClassCount:', sortedClassCount)
    return sortedClassCount[0][0]
    # -----------first way for majorityCnt end------------------------------------

    # # -----------second way for majorityCnt start------------------------------------
    # major_label = Counter(classList).most_common(1)[0]
    # return major_label
    # # -----------second way for majorityCnt end------------------------------------


def createTree(dataSet, labels):
    """
    Desc:
        Build the decision tree
    Args:
        dataSet -- training data set to build the tree from
        labels -- names of the features in the training set, not the target variable
    Returns:
        myTree -- the finished decision tree
    """
    classList = [example[-1] for example in dataSet]
    # First stopping condition: all class labels are identical
    # (the first label occurs as often as the list is long), so return that label.
    # count() counts how often a value occurs in the list.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Second stopping condition: only one column remains, i.e. all features are
    # used up but the data set is still not pure; return the majority label.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)

    # choose the best column to split on, and look up its feature name
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    # initialise myTree
    myTree = {bestFeatLabel: {}}
    # Note: labels is a mutable list and Python passes it by reference, so this
    # del also mutates the caller's variable; a later example call may then fail
    # with "'no surfacing' is not in list" (hence the deepcopy in fishTest below)
    del(labels[bestFeat])
    # take the values of the best column and branch on them
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        # the remaining feature labels
        subLabels = labels[:]
        # recurse into createTree() on each split of the chosen feature
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
        # print('myTree', value, myTree)
    return myTree


def classify(inputTree, featLabels, testVec):
    """
    Desc:
        Classify new data
    Args:
        inputTree -- the trained decision tree model
        featLabels -- the feature names, not the target variable
        testVec -- the input data to classify
    Returns:
        classLabel -- the predicted class label
    """
    # key of the tree's root node
    firstStr = list(inputTree.keys())[0]
    # value under the root node
    secondDict = inputTree[firstStr]
    # find the root feature's position in featLabels, so we know which
    # component of testVec to compare against the tree first
    featIndex = featLabels.index(firstStr)
    # follow the branch matching the test vector's value for this feature
    key = testVec[featIndex]
    valueOfFeat = secondDict[key]
    print('+++', firstStr, 'xxx', secondDict, '---', key, '>>>', valueOfFeat)
    # has the branching ended? i.e. is valueOfFeat still a dict?
    if isinstance(valueOfFeat, dict):
        classLabel = classify(valueOfFeat, featLabels, testVec)
    else:
        classLabel = valueOfFeat
    return classLabel


def storeTree(inputTree, filename):
    """
    Desc:
        Store a trained decision tree model using the pickle module
    Args:
        inputTree -- the trained decision tree model
        filename -- name of the file to store it in
    Returns:
        None
    """
    import pickle
    # -------------- first way start --------------
    fw = open(filename, 'wb')
    pickle.dump(inputTree, fw)
    fw.close()
    # -------------- first way end --------------

    # -------------- second way start --------------
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)
    # -------------- second way end --------------


def grabTree(filename):
    """
    Desc:
        Restore a decision tree model previously stored with the pickle module
    Args:
        filename -- the file the decision tree model was stored in
    Returns:
        pickle.load(fr) -- the restored decision tree model
    """
    import pickle
    fr = open(filename, 'rb')
    return pickle.load(fr)


def fishTest():
    """
    Desc:
        Test classifying whether an animal is a fish, and draw the result
        with matplotlib
    Args:
        None
    Returns:
        None
    """
    # 1. create the data and the result labels
    myDat, labels = createDataSet()
    # print(myDat, labels)

    # Shannon entropy of the class labels
    # calcShannonEnt(myDat)

    # # rows whose column 0 is 1/0, with column 0 removed
    # print('1---', splitDataSet(myDat, 0, 1))
    # print('0---', splitDataSet(myDat, 0, 0))

    # # column with the best information gain
    # print(chooseBestFeatureToSplit(myDat))

    import copy
    myTree = createTree(myDat, copy.deepcopy(labels))
    print(myTree)
    # [1, 1] picks the branches to follow; print the resulting class
    print(classify(myTree, labels, [1, 1]))

    # visualise the tree
    dtPlot.createPlot(myTree)


def ContactLensesTest():
    """
    Desc:
        Predict the contact-lens type and draw the resulting tree
    Args:
        none
    Returns:
        none
    """

    # load the contact-lenses text data
    fr = open('data/3.DecisionTree/lenses.txt')
    # parse the data to obtain the features
    lenses = [inst.strip().split('\t') for inst in fr.readlines()]
    # the corresponding feature labels
    lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']
    # build the contact-lens decision tree with the createTree code above
    lensesTree = createTree(lenses, lensesLabels)
    print(lensesTree)
    # visualise the tree
    dtPlot.createPlot(lensesTree)


if __name__ == "__main__":
    # fishTest()
    ContactLensesTest()
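A hand check of calcShannonEnt on the fish data set above: with 2 'yes' and 3 'no' labels, the entropy is -(2/5)*log2(2/5) - (3/5)*log2(3/5) ≈ 0.971, which the function should reproduce:

from math import log

p_yes, p_no = 2 / 5.0, 3 / 5.0
print(-(p_yes * log(p_yes, 2) + p_no * log(p_no, 2)))  # ~0.9710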
137
src/py3.x/ml/3.DecisionTree/decisionTreePlot.py
Normal file
@@ -0,0 +1,137 @@
#!/usr/bin/python
# -*- coding: UTF-8 -*-

'''
Created on Oct 14, 2010
Update on 2017-02-27
Decision Tree Source Code for Machine Learning in Action Ch. 3
Author: Peter Harrington/jiangzhonglian
'''
import matplotlib.pyplot as plt

# Define the text-box and arrow styles
# (sawtooth = wavy box, round4 = rounded box; fc is the fill shade, 0.1~0.9 from dark to light)
decisionNode = dict(boxstyle="sawtooth", fc="0.8")
leafNode = dict(boxstyle="round4", fc="0.8")
arrow_args = dict(arrowstyle="<-")


def getNumLeafs(myTree):
    numLeafs = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    # walk the tree from the root
    for key in secondDict.keys():
        # recurse into dict children; everything else counts as a leaf
        if type(secondDict[key]) is dict:
            numLeafs += getNumLeafs(secondDict[key])
        else:
            numLeafs += 1
    return numLeafs


def getTreeDepth(myTree):
    maxDepth = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    # walk the tree from the root
    for key in secondDict.keys():
        # recurse into dict children to find the depth of each branch
        # ----------version 1 start---------------
        if type(secondDict[key]) is dict:
            thisDepth = 1 + getTreeDepth(secondDict[key])
        else:
            thisDepth = 1
        # ----------version 1 end---------------

        # ----------version 2 start--------------
        # thisDepth = 1 + getTreeDepth(secondDict[key]) if type(secondDict[key]) is dict else 1
        # ----------version 2 end--------------
        # keep the largest branch depth
        maxDepth = max(maxDepth, thisDepth)
    return maxDepth


def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction', xytext=centerPt, textcoords='axes fraction', va="center", ha="center", bbox=nodeType, arrowprops=arrow_args)


def plotMidText(cntrPt, parentPt, txtString):
    xMid = (parentPt[0] - cntrPt[0]) / 2 + cntrPt[0]
    yMid = (parentPt[1] - cntrPt[1]) / 2 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString, va="center", ha="center", rotation=30)


def plotTree(myTree, parentPt, nodeTxt):
    # number of leaf nodes
    numLeafs = getNumLeafs(myTree)
    # depth of the tree
    # depth = getTreeDepth(myTree)

    # find the first centre point, to be connected to parentPt by a line
    cntrPt = (plotTree.xOff + (1 + numLeafs) / 2 / plotTree.totalW, plotTree.yOff)
    # print(cntrPt)
    # print the corresponding edge label
    plotMidText(cntrPt, parentPt, nodeTxt)

    firstStr = list(myTree.keys())[0]
    # draw the decision node
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    # value under the root node
    secondDict = myTree[firstStr]
    # y position = current top minus one level's height [position of the next node]
    plotTree.yOff = plotTree.yOff - 1 / plotTree.totalD
    for key in secondDict.keys():
        # is this child an internal node?
        if type(secondDict[key]) is dict:
            # if so, recurse [recursion]
            plotTree(secondDict[key], cntrPt, str(key))
        else:
            # otherwise place the leaf half a slot further along
            plotTree.xOff = plotTree.xOff + 1 / plotTree.totalW
            # draw the leaf node at that position
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
            # and print the corresponding edge label
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1 / plotTree.totalD


def createPlot(inTree):
    # create a figure template
    fig = plt.figure(1, facecolor='green')
    fig.clf()

    axprops = dict(xticks=[], yticks=[])
    # a 1x1 grid of plots; createPlot.ax1 is the first subplot
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)

    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    # half a node's width
    plotTree.xOff = -0.5 / plotTree.totalW
    plotTree.yOff = 1.0
    plotTree(inTree, (0.5, 1.0), '')
    plt.show()


# # plotting test
# def createPlot():
#     fig = plt.figure(1, facecolor='white')
#     fig.clf()
#     # ticks for demo puropses
#     createPlot.ax1 = plt.subplot(111, frameon=False)
#     plotNode('a decision node', (0.5, 0.1), (0.1, 0.5), decisionNode)
#     plotNode('a leaf node', (0.8, 0.1), (0.3, 0.8), leafNode)
#     plt.show()


# test data sets
def retrieveTree(i):
    listOfTrees = [
        {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}},
        {'no surfacing': {0: 'no', 1: {'flippers': {0: {'head': {0: 'no', 1: 'yes'}}, 1: 'no'}}}}
    ]
    return listOfTrees[i]


# myTree = retrieveTree(1)
# createPlot(myTree)
58
src/py3.x/ml/3.DecisionTree/skelearn_dts_regressor_demo.py
Normal file
@@ -0,0 +1,58 @@
#!/usr/bin/python
# -*- coding: UTF-8 -*-

"""
Created on 2017-06-29
Updated on 2017-06-29
DecisionTree: decision tree
Author: 小瑶
GitHub: https://github.com/apachecn/AiLearning
"""

print(__doc__)

# import the required models and libraries
import numpy as np
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt

# create a random data set
# see https://docs.scipy.org/doc/numpy-1.6.0/reference/generated/numpy.random.mtrand.RandomState.html
rng = np.random.RandomState(1)
# rand() draws uniform random values of the given shape; rng.rand(80, 1) is 80 rows, 1 column
X = np.sort(5 * rng.rand(80, 1), axis=0)
# print('X=', X)
y = np.sin(X).ravel()
# print('y=', y)
# add noise to every 5th target value
y[::5] += 3 * (0.5 - rng.rand(16))

# fit the regression model
# regr_1 = DecisionTreeRegressor(max_depth=2)
# keeping max_depth=5 and adding min_samples_leaf=6 improves the fit further
# (the original assigned the two parameters in separate constructors, so the
# second assignment silently discarded max_depth)
regr_2 = DecisionTreeRegressor(max_depth=5, min_samples_leaf=6)
# regr_3 = DecisionTreeRegressor(max_depth=4)
# regr_1.fit(X, y)
regr_2.fit(X, y)
# regr_3.fit(X, y)

# predict
X_test = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]
# y_1 = regr_1.predict(X_test)
y_2 = regr_2.predict(X_test)
# y_3 = regr_3.predict(X_test)

# plot the results
plt.figure()
plt.scatter(X, y, c="darkorange", label="data")
# plt.plot(X_test, y_1, color="cornflowerblue", label="max_depth=2", linewidth=2)
plt.plot(X_test, y_2, color="yellowgreen", label="max_depth=5, min_samples_leaf=6", linewidth=2)
# plt.plot(X_test, y_3, color="red", label="max_depth=3", linewidth=2)
plt.xlabel("data")
plt.ylabel("target")
plt.title("Decision Tree Regression")
plt.legend()
plt.show()
61
src/py3.x/ml/3.DecisionTree/sklearn_dts_classify_demo.py
Normal file
@@ -0,0 +1,61 @@
#!/usr/bin/python
# -*- coding: UTF-8 -*-

"""
Created on 2017-06-29
Updated on 2017-06-29
DecisionTree: decision tree
Author: 小瑶
GitHub: https://github.com/apachecn/AiLearning
"""
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

# parameters
n_classes = 3
plot_colors = "bry"
plot_step = 0.02

# load the data
iris = load_iris()

for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]):
    # we only take the two corresponding features
    X = iris.data[:, pair]
    y = iris.target

    # train
    clf = DecisionTreeClassifier().fit(X, y)

    # plot the decision boundary
    plt.subplot(2, 3, pairidx + 1)

    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))

    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)

    plt.xlabel(iris.feature_names[pair[0]])
    plt.ylabel(iris.feature_names[pair[1]])
    plt.axis("tight")

    # plot the training points
    for i, color in zip(range(n_classes), plot_colors):
        idx = np.where(y == i)
        plt.scatter(X[idx, 0], X[idx, 1], c=color, label=iris.target_names[i],
                    cmap=plt.cm.Paired)

    plt.axis("tight")

plt.suptitle("Decision surface of a decision tree using paired features")
plt.legend()
plt.show()
Some files were not shown because too many files have changed in this diff