Overview
Chapter 5: Discriminant Analysis
- Class code with notes
- Homework 7
Class code with notes
import numpy as np
import pandas as pd
#### ---------------------------------------------
#### (1) Distance discriminant analysis
#### ---------------------------------------------
#### ---------------------------------------------
#### Distance discrimination for two populations
#### 1. Example 5.2.3
#### ---------------------------------------------
dat = pd.read_excel(r"F:\基础数学课\应用多元统计分析\examp5.2.3.xlsx",header = 1)
G1 = dat.iloc[:21,:-1]
G2 = dat.iloc[21:,:-1]
x1_bar = G1.mean()
x2_bar = G2.mean()
u_bar = (x1_bar + x2_bar)/2
S1 = G1.cov()
S2 = G2.cov()
n1 = G1.shape[0]
n2 = G2.shape[0]
Sp = (n1-1)/(n1+n2-2)*S1 + (n2-1)/(n1+n2-2)*S2
alpha = np.linalg.inv(Sp).dot(x1_bar-x2_bar)
#### Discriminant function W(X) = alpha.T.dot(X - u_bar)
def W(X):
    if alpha.T.dot(X-u_bar) > 0:
        return [1, "G1: bankrupt"]
    else:
        return [2, "G2: not bankrupt"]
Xnew = np.array([0.2,0.05,1.5,0.5])
W(Xnew)
## Classify all 46 companies
pred = dat.iloc[:,:-1].agg(W,axis=1)  # row-wise apply via agg()
## Compute the accuracy rate
1-sum(np.abs(pd.DataFrame(list(pred.values)).iloc[:,0] - dat.iloc[:,-1]))/dat.shape[0]
## Cross-validation
## Linear discriminant fitted on a given training set
def linear_dis2(data):
    G1 = data[data.iloc[:,-1] == 1].iloc[:,:-1]
    G2 = data[data.iloc[:,-1] == 2].iloc[:,:-1]
    x1_bar = G1.mean() ; x2_bar = G2.mean()
    u_bar = (x1_bar + x2_bar)/2
    S1 = G1.cov() ; S2 = G2.cov()
    n1 = G1.shape[0] ; n2 = G2.shape[0]
    Sp = (n1-1)/(n1+n2-2)*S1 + (n2-1)/(n1+n2-2)*S2
    alpha = np.linalg.inv(Sp).dot(x1_bar-x2_bar)
    #### Discriminant function W(X) = alpha.T.dot(X - u_bar)
    def W(X):
        if alpha.T.dot(X-u_bar) > 0:
            return [1, "G1: bankrupt"]
        else:
            return [2, "G2: not bankrupt"]
    return W
f2 = linear_dis2(dat)
f2(Xnew)
## Leave-one-out cross-validation
res = []
for i in range(46):
    data = np.delete(dat.values,i,0)
    data = pd.DataFrame(data)
    f_temp = linear_dis2(data)
    res.append(f_temp(dat.iloc[i,:-1].values))
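## (Sketch, not from the original notes) Leave-one-out accuracy:
## compare the predicted labels collected in res with the true labels,
## assuming the last column of dat holds the class labels 1/2.
loo_pred = np.array([r[0] for r in res])     # first element of each [label, name] pair
loo_true = dat.iloc[:,-1].to_numpy()
np.mean(loo_pred == loo_true)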
## Hold-out validation: random 80% / 20% train-test split
train_N = int(np.ceil(dat.shape[0]*0.8))
np.random.seed(123)
# np.random.randint(0,46,train_N)
# np.random.shuffle()
train_sample = np.random.choice(46,train_N,replace = False)
train_set = dat.iloc[train_sample,:]
test_sample = list(set(np.arange(46))-set(train_sample))
test_set = dat.iloc[test_sample,:]
f_train = linear_dis2(train_set)
f_train(test_set.iloc[0,:-1])
test_set.iloc[:,:-1].agg(f_train,axis=1)
test_set.iloc[:,-1]
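## (Sketch, not from the original notes) Hold-out accuracy on the random test split,
## again assuming the last column of test_set holds the true labels 1/2.
holdout_pred = np.array([row[0] for row in test_set.iloc[:,:-1].agg(f_train,axis=1)])
holdout_true = test_set.iloc[:,-1].to_numpy()
np.mean(holdout_pred == holdout_true)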
#### ---------------------------------------------
#### Distance discrimination for two populations
#### 2. iris
#### ---------------------------------------------
# Distance discriminant analysis
## Two populations
iris = pd.read_csv(r'F:\基础数学课\应用多元统计分析\iris.csv')
## Keep only the first two species (rows 0-99), since this is the two-population case
iris2 = iris.iloc[:100,:]
u1_hat,u2_hat = iris2.groupby('species').mean().to_numpy()
S1 = iris2.groupby('species').cov().to_numpy()[:4,]
S2 = iris2.groupby('species').cov().to_numpy()[4:,]
n1,n2 = iris2.groupby('species').count().iloc[:,0]
### n1,n2 = np.array(iris2.groupby('Species').count()['Sepal.Length'])
u_bar = 1/2*(u1_hat+u2_hat)
S = (n1-1)/(n1+n2-2)*S1 + (n2-1)/(n1+n2-2)*S2
alpha = np.linalg.inv(S).dot(u1_hat-u2_hat)
def f1(X):
    return alpha.T.dot(X-u_bar)
result = iris2.iloc[:,:-1].apply(f1,axis=1)>0
pd.DataFrame(result)
result.value_counts()
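## (Sketch, added for illustration) Apparent accuracy of the two-population rule,
## assuming rows 0-49 of iris2 belong to the first group and rows 50-99 to the second
## (the usual ordering of the iris data); W(X) > 0 is taken as "group 1".
pred_group = np.where(result.to_numpy(), 1, 2)
true_group = np.array([1]*50 + [2]*50)
np.mean(pred_group == true_group)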
#### ---------------------------------------------
#### Distance discrimination for several populations
#### 1. iris
#### ---------------------------------------------
iris = pd.read_csv(r"F:\基础数学课\应用多元统计分析\iris.csv")
iris.groupby('species').count()
train_data1 = iris.iloc[:40,:-1]
train_data2 = iris.iloc[50:90,:-1]
train_data3 = iris.iloc[100:140,:-1]
u1 = train_data1.mean()
u2 = train_data2.mean()
u3 = train_data3.mean()
S1 = train_data1.cov()
S2 = train_data2.cov()
S3 = train_data3.cov()
Sp = 1/3*(S1+S2+S3)
I1 = np.linalg.inv(Sp).dot(u1)
C1 = -0.5*u1.T.dot(np.linalg.inv(Sp)).dot(u1)
def W1(X):
    return I1.T.dot(X)+C1
I2 = np.linalg.inv(Sp).dot(u2)
C2 = -0.5*u2.T.dot(np.linalg.inv(Sp)).dot(u2)
def W2(X):
    return I2.T.dot(X)+C2
I3 = np.linalg.inv(Sp).dot(u3)
C3 = -0.5*u3.T.dot(np.linalg.inv(Sp)).dot(u3)
def W3(X):
    return I3.T.dot(X)+C3
data41 = iris.iloc[40,:-1]
### iris.iloc[40,:-1]
W1(data41)
W2(data41)
W3(data41)
np.where([2,9,4]==np.amax([2,9,4]))   # quick check: np.where returns the index of the maximum
def W(X):
    v = np.array([W1(X),W2(X),W3(X)])
    return np.where(v==np.amax(v))[0].tolist()[0]
W(data41)
test_data = pd.concat([iris.iloc[40:50],iris.iloc[90:100],iris.iloc[140:150]]).iloc[:,:-1]
test_data.agg(W,axis=1)
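## (Sketch, added for illustration) Accuracy on the three test blocks,
## assuming they come from groups 0, 1, 2 in that order; W returns 0-based indices.
pred_idx = test_data.agg(W,axis=1).to_numpy()
true_idx = np.repeat([0,1,2],10)
np.mean(pred_idx == true_idx)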
#### ---------------------------------------------
#### (2) Bayes discriminant analysis
#### ---------------------------------------------
iris = pd.read_csv(r"F:\基础数学课\应用多元统计分析\iris.csv")
q = np.array([0.1,0.3,0.6])
C = np.array([[0,3,9],[12,0,2],[20,40,0]])
### Assume each population follows a multivariate normal distribution
train_data1 = iris.iloc[:40,:-1]
train_data2 = iris.iloc[50:90,:-1]
train_data3 = iris.iloc[100:140,:-1]
u1 = train_data1.mean()
u2 = train_data2.mean()
u3 = train_data3.mean()
S1 = train_data1.cov()
S2 = train_data2.cov()
S3 = train_data3.cov()
Sp = 1/3*(S1+S2+S3)
def fx(x,u,S):
    # Density kernel only: the (2*pi)^(-p/2)|S|^(-1/2) constant cancels since Sp is shared by all groups
    return np.exp(-0.5*(x-u).T.dot(np.linalg.inv(S)).dot(x-u))
def f1x(x):
    return fx(x,u=u1,S=Sp)
def f2x(x):
    return fx(x,u=u2,S=Sp)
def f3x(x):
    return fx(x,u=u3,S=Sp)
def h(x,j):
    # Expected cost of assigning x to population j
    return np.sum(q*C[:,j-1]*np.array([f1x(x),f2x(x),f3x(x)]))
xnew = np.array([5,3,2,1])
h(xnew,1)
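## (Sketch, added for illustration; the name bayes_rule is not from the notes)
## Turn the expected-cost function h into a classification rule by choosing
## the population with the smallest expected cost.
def bayes_rule(x):
    costs = [h(x,1), h(x,2), h(x,3)]
    return int(np.argmin(costs)) + 1     # 1-based population index
bayes_rule(xnew)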
#### ---------------------------------------------
#### (3) Fisher discriminant analysis
#### ---------------------------------------------
## Covariance matrices are not assumed equal
iris = pd.read_csv(r"F:\基础数学课\应用多元统计分析\iris.csv")
iris.groupby('species').count()
train_data1 = iris.iloc[:40,:-1]
train_data2 = iris.iloc[50:90,:-1]
train_data3 = iris.iloc[100:140,:-1]
u1 = train_data1.mean()
u2 = train_data2.mean()
u3 = train_data3.mean()
S1 = train_data1.cov()
S2 = train_data2.cov()
S3 = train_data3.cov()
Sp = 1/3*(S1+S2+S3)
k = 3
p = 4
E = S1+S2+S3
M = pd.concat([pd.DataFrame(u1),pd.DataFrame(u2),pd.DataFrame(u3)],axis=1).T.values
B = M.T.dot(np.eye(3)-1/3*np.ones((3,3))).dot(M)
lamda, T = np.linalg.eig(np.linalg.inv(E).dot(B))
lamda = np.real(lamda) ; T = np.real(T)   # eig may return tiny imaginary parts
lamda_max = lamda.max()
# t1 = T[0]   # wrong: eigenvectors are the columns of T, not the rows
t1 = T[:, np.argmax(lamda)]   # eigenvector of the largest eigenvalue (eig does not sort)
# Apply to the test set
test_data1 = iris[40:50]
test_data2 = iris[90:100]
test_data3 = iris[140:150]
test_data = pd.concat([test_data1,test_data2,test_data3])
## Note: the species column has to be dropped before projecting, otherwise the dot product fails
def U(x):
    return t1.T.dot(x.T)
for i in range(test_data.shape[0]):
    X = test_data.iloc[i,:-1].to_numpy(dtype=float)
    print(U(X))
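## (Sketch, added for illustration) One way to turn the projected scores into class
## labels: assign each observation to the group whose projected mean t1.dot(u_i) is
## closest. The homework below implements the same idea as fisher_y1.
centers = np.array([t1.dot(u1), t1.dot(u2), t1.dot(u3)])
scores = test_data.iloc[:,:-1].apply(lambda x: t1.dot(x), axis=1).to_numpy()
np.argmin(np.abs(scores[:,None] - centers[None,:]), axis=1) + 1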
Homework 7
#### ---------------------------------------------
# Using the iris data set, take the first 40 rows of each class as the training set and the remaining 10 as the test set
# 1. Perform Fisher discriminant analysis and compute the classification accuracy
# 2. Perform Bayes discriminant analysis and compute the classification accuracy, assuming all three populations are normally distributed, with prior probabilities 0.2, 0.3, 0.5 and misclassification cost matrix [[0, 24, 40], [12, 0, 8], [14, 27, 0]]
#### ---------------------------------------------
import numpy as np
import pandas as pd
iris = pd.read_csv(r"F:\基础数学课\应用多元统计分析\iris.csv")
iris.groupby('species').count()
train_data1 = iris[:40].drop(columns='species')
train_data2 = iris[50:90].drop(columns='species')
train_data3 = iris[100:140].drop(columns='species')
test_data1 = iris[40:50].drop(columns='species')
test_data2 = iris[90:100].drop(columns='species')
test_data3 = iris[140:150].drop(columns='species')
test_data = pd.concat([test_data1,test_data2,test_data3])
iris = iris.drop(columns='species')
u1 = train_data1.mean().to_list()
u2 = train_data2.mean().to_list()
u3 = train_data3.mean().to_list()
S1 = train_data1.cov()
S2 = train_data2.cov()
S3 = train_data3.cov()
Sp = 1/3*(S1+S2+S3)
# Assume the three populations each follow a 4-dimensional normal distribution
k = 3
p = 4
#### ---------------------------------------------
#### 1. Fisher discriminant analysis and classification accuracy
#### ---------------------------------------------
E = S1+S2+S3
M = pd.concat([pd.DataFrame(u1),pd.DataFrame(u2),pd.DataFrame(u3)],axis=1).T.values
B = M.T.dot(np.eye(3)-1/3*np.ones((3,3))).dot(M)
lamda, T = np.linalg.eig(np.linalg.inv(E).dot(B))
lamda = np.real(lamda) ; T = np.real(T)   # eig may return tiny imaginary parts
lamda_max = lamda.max()
t1 = T[:, np.argmax(lamda)]   # eigenvector of the largest eigenvalue (eig does not sort)
# Use the first Fisher discriminant function from the textbook
def fisher_y1(X):
    dis1 = abs(t1.dot(X.T)-t1.dot(u1))
    dis2 = abs(t1.dot(X.T)-t1.dot(u2))
    dis3 = abs(t1.dot(X.T)-t1.dot(u3))
    dis = np.array([dis1,dis2,dis3])
    return np.where(dis == np.amin(dis))[0][0]+1
# Apply to the test set
Species_fisher = []
# Predict the species of each test observation
for i in range(30):
    X = np.array(test_data[i:i+1])[0]
    kind_fisher = fisher_y1(X)
    Species_fisher.append(kind_fisher)
# Compute the classification accuracy
real_species = np.hstack(([1]*10,[2]*10,[3]*10)).tolist()
fisher_accuracy = np.mean(np.array(real_species) == np.array(Species_fisher))
fisher_accuracy
pd.DataFrame(np.column_stack((real_species,Species_fisher)),columns=('real_species','Species_fisher'))
#### ---------------------------------------------
#### 2. Bayes discriminant analysis
#### ---------------------------------------------
# Prior probabilities
q = np.array([0.2,0.3,0.5])
# Misclassification cost matrix
C = np.array([[0,24,40],[12,0,8],[14,27,0]])
# Multivariate normal probability density function
def f(X,u,S):
    return ((2*np.pi)**(-p/2)*np.linalg.det(S)**(-0.5))*np.exp(-0.5*(X-u).T.dot(np.linalg.inv(S)).dot(X-u))
def f1(X):
    return f(X,u1,S1)
def f2(X):
    return f(X,u2,S2)
def f3(X):
    return f(X,u3,S3)
def F(X):
    return np.array((f1(X),f2(X),f3(X)))
# Discriminant function: assign X to the population with the smallest expected cost of misclassification (ECM)
def bayes_func(X):
    ECM1 = np.sum(q*F(X)*C[:,0])
    ECM2 = np.sum(q*F(X)*C[:,1])
    ECM3 = np.sum(q*F(X)*C[:,2])
    return int(np.argmin([ECM1,ECM2,ECM3]))+1
# Apply to the test set
Species_bayes = []
for i in range(30):
    X = np.array(test_data[i:i+1])[0]
    kind_bayes = bayes_func(X)
    Species_bayes.append(kind_bayes)
# Compute the classification accuracy
real_species = np.hstack(([1]*10,[2]*10,[3]*10)).tolist()
bayes_accuracy = np.mean(np.array(real_species) == np.array(Species_bayes))
bayes_accuracy
pd.DataFrame(np.column_stack((real_species,Species_bayes)),columns=('real_species','Species_bayes'))