概述
1 处理数据
import numpy as np
# Preprocess data: read a tab-separated numeric dataset from disk.
def loadData(filename):
    """Load a text file of tab-separated numbers into a list of float rows.

    Args:
        filename: path to a file where each line holds tab-separated numeric
            fields.

    Returns:
        list[list[float]]: one inner list per line of the file.
    """
    dataSet = []
    # FIX: the original called strip('n').split('t'), which strips the
    # letters 'n'/'t' — the backslashes were lost in transcription; the
    # intent is newline-strip and tab-split.
    # 'with' guarantees the handle is closed even if float() raises.
    with open(filename) as fr:
        for line in fr:
            curLine = line.strip('\n').split('\t')
            dataSet.append(list(map(float, curLine)))
    return dataSet
# Multivariate Gaussian probability density.
def Gaussian(data, mean, cov):
    """Return the multivariate normal density N(data; mean, cov).

    When the covariance is (near-)singular, a small ridge (0.01 * I) is
    added before taking the determinant and inverse so both stay defined.

    Args:
        data: sample vector of length dim.
        mean: mean vector of length dim.
        cov: (dim, dim) covariance matrix.

    Returns:
        float: the pdf value at `data`.
    """
    dim = np.shape(cov)[0]           # dimensionality
    covdet = np.linalg.det(cov)      # |cov|
    # FIX: the original tested covdet == 0 exactly; float round-off almost
    # never yields an exact zero, so near-singular matrices slipped through.
    if abs(covdet) < 1e-12:
        covdet = np.linalg.det(cov + np.eye(dim) * 0.01)
        covinv = np.linalg.inv(cov + np.eye(dim) * 0.01)
    else:
        covinv = np.linalg.inv(cov)
    m = data - mean
    z = -0.5 * np.dot(np.dot(m, covinv), m)   # exponent term
    # 1 / sqrt((2*pi)^dim * |cov|) * exp(z)
    return 1.0 / (np.power(np.power(2 * np.pi, dim) * abs(covdet), 0.5)) * np.exp(z)
2 获取初始聚类中心
# Draw K random initial cluster means inside the data's bounding box,
# resampling until every pair is sufficiently far apart.
def GetInitialMeans(data, K, criterion):
    """Sample K mean vectors uniformly within each dimension's [min, max].

    Resamples the whole set until `isdistance` accepts it, i.e. no two
    means are closer than `criterion`.
    """
    dim = data.shape[1]
    # Per-dimension [lower, upper] bounds of the data.
    minmax = np.array([np.array([min(data[:, d]), max(data[:, d])]) for d in range(dim)])
    means = [[] for _ in range(K)]
    while True:
        for k in range(K):
            means[k] = np.array([
                np.random.random() * (minmax[j][1] - minmax[j][0]) + minmax[j][0]
                for j in range(dim)
            ])
        # Keep this draw only if all pairwise distances exceed the criterion.
        if isdistance(means, criterion):
            break
    return means
# Check whether a set of candidate means is sufficiently spread out.
def isdistance(means, criterion=0.03):
    """Return False if any two mean vectors are closer than `criterion`."""
    count = len(means)
    for a in range(count):
        for b in range(a + 1, count):
            if np.linalg.norm(means[a] - means[b]) < criterion:
                return False
    return True
3 GMM主程序
def GMM(data, K, ITER):
    """Fit a K-component Gaussian mixture to `data` with the EM algorithm.

    Args:
        data: (N, dim) array of samples.
        K: number of mixture components.
        ITER: upper bound on the number of EM iterations.

    Returns:
        (phi, means, convs): mixing weights, component means, and
        component covariance matrices.
    """
    N = data.shape[0]
    dim = data.shape[1]
    means = Kmeans(data, K)          # k-means supplies the initial means
    # Every component starts with the covariance of the whole dataset.
    convs = [np.cov(data.T) for _ in range(K)]
    phi = [1.0 / K] * K              # uniform initial mixing weights
    omega = [np.zeros(K) for i in range(N)]   # responsibilities
    loglikelyhood = 0
    oldloglikelyhood = 1
    # FIX: the original decremented ITER but never tested it, so a run that
    # failed to converge looped forever; honor the iteration cap as well.
    while np.abs(loglikelyhood - oldloglikelyhood) > 0.00001 and ITER > 0:
        oldloglikelyhood = loglikelyhood
        # E-step: omega[i][k] = P(component k | sample i).
        for i in range(N):
            res = [phi[k] * Gaussian(data[i], means[k], convs[k]) for k in range(K)]
            sumres = np.sum(res)
            for k in range(K):
                omega[i][k] = res[k] / sumres
        # M-step: re-estimate weights, means and covariances.
        for k in range(K):
            # Nk = effective number of samples assigned to component k.
            Nk = np.sum([omega[n][k] for n in range(N)])
            phi[k] = 1.0 * Nk / N
            means[k] = (1.0 / Nk) * np.sum([omega[n][k] * data[n] for n in range(N)], axis=0)
            xdiffs = data - means[k]
            # Weighted outer products of the deviations give the covariance.
            convs[k] = (1.0 / Nk) * np.sum(
                [omega[n][k] * xdiffs[n].reshape(dim, 1) * xdiffs[n] for n in range(N)], axis=0)
        # Data log-likelihood under the current parameters (convergence test).
        loglikelyhood = np.sum(
            [np.log(np.sum([phi[k] * Gaussian(data[n], means[k], convs[k]) for k in range(K)]))
             for n in range(N)])
        ITER -= 1
    return phi, means, convs
在GMM中用到的Kmeans算法如下:
# K均值算法,估计大约几个样本属于一个GMM
import copy
def Kmeans(data, K):
    """Run Lloyd's k-means on `data` and return the K cluster means.

    Used only to produce reasonable starting means for the GMM.
    """
    N = data.shape[0]      # number of samples
    dim = data.shape[1]    # sample dimensionality
    means = GetInitialMeans(data, K, 0.03)
    means_old = [np.zeros(dim) for _ in range(K)]
    # Iterate until the total movement of all means becomes negligible.
    while np.sum([np.linalg.norm(means_old[k] - means[k]) for k in range(K)]) > 0.0001:
        means_old = copy.deepcopy(means)
        # Counts start at 1 (not 0) — a deliberate guard against dividing by
        # zero for an empty cluster, at the cost of a slight shrink toward 0.
        numlog = [1] * K
        sumlog = [np.zeros(dim) for _ in range(K)]
        # Assignment step: accumulate each sample into its nearest cluster.
        for i in range(N):
            dislog = [np.linalg.norm(data[i] - means[k]) for k in range(K)]
            nearest = dislog.index(np.min(dislog))
            numlog[nearest] += 1
            sumlog[nearest] += data[i]
        # Update step: each mean becomes the average of its assigned samples.
        for k in range(K):
            means[k] = 1.0 / numlog[k] * sumlog[k]
    return means
def computeOmega(X, mu, sigma, phi, multiGaussian):
    """Compute the responsibility matrix gamma[i, j] = P(component j | x_i).

    Args:
        X: (n_samples, dim) data array.
        mu: per-component mean vectors.
        sigma: per-component covariances.
        phi: mixing weights, one per component.
        multiGaussian: callable(x, mean, cov) returning a density value.

    Returns:
        (n_samples, n_clusters) array whose rows sum to 1.
    """
    n_samples = X.shape[0]
    n_clusters = len(phi)
    gamma = np.zeros((n_samples, n_clusters))
    for i in range(n_samples):
        # Unnormalised joint weights phi_j * N(x_i; mu_j, sigma_j).
        weighted = np.array([phi[j] * multiGaussian(X[i], mu[j], sigma[j])
                             for j in range(n_clusters)])
        # Normalise so the responsibilities for sample i sum to one.
        gamma[i, :] = weighted / np.sum(weighted)
    return gamma
def predict(data, m, c, p):
    """Assign each sample to the mixture component with the highest responsibility.

    Args:
        data: samples (array-like, converted to ndarray).
        m, c, p: fitted means, covariances, and mixing weights from GMM().

    Returns:
        1-D array of winning component indices, one per sample.
    """
    responsibilities = computeOmega(np.array(data), m, c, p, Gaussian)
    cluster_results = np.argmax(responsibilities, axis=1)
    return cluster_results
4 测试身高体重数据集
# Test script: load the height/weight dataset and cluster it with a
# 2-component GMM.
d = []
# FIX: the original used strip('n').split('t'), stripping the letters
# 'n'/'t' — the backslashes were lost; the intent is newline/tab.
with open('d:/dataset1.txt', 'r') as f:
    for line in f:
        d.append(line.strip('\n').split('\t'))
d1 = np.array(d)
# All columns except the last are numeric features; the last is the label.
d2 = [list(map(float, i)) for i in d1[:, :-1]]
data = np.array(d2)
t = d1[:, -1]
p2, m2, c2 = GMM(data, 2, 50)
pt2 = predict(data, m2, c2, p2)
pt2
结果如下:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0,
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0], dtype=int64)
p2,m2,c2的值分别为:
p2
[0.7822324209405439, 0.21776757905945612]
m2
[array([170.5300851 , 59.69283016]), array([175.14653505, 73.79246884])]
c2
[array([[63.72099898, 54.66242696],
[54.66242696, 73.46041058]]), array([[ 21.00207568, 14.73278643],
[ 14.73278643, 140.29202475]])]
误差和错误率:
# Evaluation script: map labels 'f'->1, 'm'->0 and count mismatches
# against the predicted clusters.
t = d1[:, -1]
t[t == 'f'] = 1
t[t == 'm'] = 0
t = np.array(list(map(int, t)))
# FIX: replaced the manual `for i in t==pt2: if i==False: c+=1` loop with a
# vectorised mismatch count (same result, O(n) in C).
c = int(np.sum(t != pt2))
print('错误数为:', c)
print('错误率为:', round(c / len(t), 3))
结果为:
错误数为: 176
错误率为: 0.389
最后
以上就是幸福紫菜为你收集整理的python EM算法4(身高体重数据集)的全部内容,希望文章能够帮你解决python EM算法4(身高体重数据集)所遇到的程序开发问题。
如果觉得靠谱客网站的内容还不错,欢迎将靠谱客网站推荐给程序员好友。
本图文内容来源于网友提供,作为学习参考使用,或来自网络收集整理,版权属于原作者所有。
发表评论 取消回复