# 概述 (Overview)
import matplotlib.pyplot as plt
import numpy as np
import csv
from numpy import genfromtxt
from scipy.stats import multivariate_normal
from sklearn.metrics import f1_score
# Plot settings: ggplot theme plus the font families and sizes shared by all figures.
plt.style.use('ggplot')
plt.rcParams.update({
    'font.family': 'serif',
    'font.serif': 'Ubuntu',
    'font.monospace': 'Ubuntu Mono',
    'font.size': 12,
    'axes.labelsize': 11,
    'axes.labelweight': 'bold',
    'axes.titlesize': 12,
    'xtick.labelsize': 9,
    'ytick.labelsize': 9,
    'legend.fontsize': 11,
    'figure.titlesize': 13,
})
# Read the three CSV files. Each file is opened in a `with` block so its handle
# is closed as soon as the rows have been parsed; newline="" is what the csv
# module expects for files it reads.
with open("train_server_data.csv", "r", newline="") as train_file:
    tr = list(csv.reader(train_file, delimiter=","))
with open("crossval_server_data.csv", "r", newline="") as crossval_file:
    cv = list(csv.reader(crossval_file, delimiter=","))
with open("test_server_data.csv", "r", newline="") as test_file:
    ts = list(csv.reader(test_file, delimiter=","))
# Convert the parsed string rows into float arrays: training, cross-validation
# and test data.
train_data = np.array(tr).astype("float")
crossval_data = np.array(cv).astype("float")
test_data = np.array(ts).astype("float")
# Disabled helper code (a genfromtxt-based CSV reader and a feature
# normalization routine) kept inside a bare string literal, so it is never
# executed.
'''
def read_dataset(filePath, delimiter=','):
return genfromtxt(filePath, delimiter=delimiter)
#特征归一化
def feature_normalize(dataset):
mu = np.mean(dataset, axis=0)
sigma = np.std(dataset, axis=0)
return (dataset - mu) / sigma
'''
# Gaussian parameter estimation
def estimate_gaussian(dataset):
    """Estimate the parameters of a Gaussian fitted to ``dataset``.

    Args:
        dataset: 2-D array with one sample per row and one feature per column.

    Returns:
        Tuple ``(mu, sigma)``: the per-feature mean and the covariance
        estimate computed by ``np.cov`` with features as variables.
    """
    mu = np.mean(dataset, axis=0)
    # rowvar=False treats each column as a variable, which is the same as
    # calling np.cov on the transposed data.
    sigma = np.cov(dataset, rowvar=False)
    return mu, sigma
# Multivariate Gaussian density
def multivariate_gaussian(dataset, mu, sigma):
    """Evaluate the multivariate normal density at every sample.

    Args:
        dataset: Points at which the density is evaluated, one per row.
        mu: Mean of the distribution.
        sigma: Covariance of the distribution.

    Returns:
        The value of ``multivariate_normal(mean=mu, cov=sigma).pdf(dataset)``.
    """
    distribution = multivariate_normal(mean=mu, cov=sigma)
    return distribution.pdf(dataset)
# Threshold selection
def select_threshold(probs, test_data):
    """Pick the density threshold that gives the best F1 score.

    Candidate thresholds are spaced 1/1000 of the density range apart,
    starting at the smallest density. A sample is predicted anomalous when its
    density is strictly below the candidate.

    Args:
        probs: Array of densities, one per sample.
        test_data: Ground-truth labels aligned with ``probs``; passed
            unchanged to ``f1_score(..., average='binary')``.

    Returns:
        Tuple ``(best_f1, best_epsilon)``. Both stay 0 when no candidate
        reaches a positive F1 score, which includes the case where all
        densities are equal.
    """
    best_epsilon = 0
    best_f1 = 0
    # Compute the range once with NumPy instead of re-scanning the array with
    # the Python builtins for every use.
    lowest = np.min(probs)
    highest = np.max(probs)
    stepsize = (highest - lowest) / 1000
    # With a zero (or NaN) step there is no candidate grid to scan, and
    # np.arange cannot be called with such a step.
    if not stepsize > 0:
        return best_f1, best_epsilon
    for epsilon in np.arange(lowest, highest, stepsize):
        predictions = probs < epsilon
        f = f1_score(test_data, predictions, average='binary')
        if f > best_f1:
            best_f1 = f
            # Store a plain float rather than a NumPy scalar/array object.
            best_epsilon = float(epsilon)
    return best_f1, best_epsilon
# Fit the Gaussian on the training data and score every training point.
mu, sigma = estimate_gaussian(train_data)
p = multivariate_gaussian(train_data, mu, sigma)
# Select the optimal value of epsilon using cross-validation.
p_cv = multivariate_gaussian(crossval_data, mu, sigma)
# NOTE(review): test_data is used here as the label vector for the
# cross-validation densities -- confirm that test_server_data.csv holds the
# ground-truth labels for the rows of crossval_server_data.csv.
fscore, ep = select_threshold(p_cv, test_data)
print(fscore, ep)
# Select the outliers: flat indices of the training points whose density is
# below the threshold. A 1-D index array lets the outliers be plotted as a
# single series instead of one line object per point.
outliers = np.flatnonzero(p < ep)
# Figure 1: the raw training points.
plt.figure(1)
raw_axes = plt.gca()
raw_axes.set_xlabel('motor1')
raw_axes.set_ylabel('motor2')
raw_axes.set_title('Datapoints of distribution')
raw_axes.plot(train_data[:,0], train_data[:,1],'b+')
plt.show()

# Figure 2: the same points with the detected outliers marked in red.
plt.figure(2)
outlier_axes = plt.gca()
outlier_axes.set_xlabel('motor1')
outlier_axes.set_ylabel('motor2')
outlier_axes.set_title('Detection of Outliers')
outlier_axes.plot(train_data[:,0],train_data[:,1],'bx')
outlier_axes.plot(train_data[outliers,0],train_data[outliers,1],'ro')
plt.show()
# Appears to be console output captured from a sample run (library warnings
# and the printed "fscore ep" pair), stored in a bare string literal so it has
# no effect at runtime.
'''
C:Usersz003tesjAppDataLocalProgramsPythonPython35libsite-packagessklearnmetricsclassification.py:1135:
UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.
'precision', 'predicted', average, warn_for)
0.875 9.036201327981212e-05
C:Usersz003tesjAppDataLocalProgramsPythonPython35libsite-packagesmatplotlibfont_manager.py:1320:
UserWarning: findfont: Font family ['serif'] not found. Falling back to DejaVu Sans
(prop.get_family(), self.defaultFamily[fontext]))
[Finished in 14.4s]
'''
# 最后
# 以上就是过时自行车为你收集整理的《基于高斯分布的异常检测代码实现》的全部内容,希望文章能够帮你解决实现基于高斯分布的异常检测代码时所遇到的程序开发问题。
# 如果觉得靠谱客网站的内容还不错,欢迎将靠谱客网站推荐给程序员好友。
# 本图文内容来源于网友提供,作为学习参考使用,或来自网络收集整理,版权属于原作者所有。
# 发表评论 取消回复