基于高斯分布的异常检测代码实现

113 阅读 0 评论 75 点赞

我是靠谱客的博主过时自行车，这篇文章主要介绍基于高斯分布的异常检测代码实现，现在分享给大家，希望可以做个参考。

复制代码

import matplotlib.pyplot as plt
import numpy as np
import csv
from numpy import genfromtxt
from scipy.stats import multivariate_normal
from sklearn.metrics import f1_score

#画图设置
plt.style.use('ggplot')

plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.serif'] = 'Ubuntu'
plt.rcParams['font.monospace'] = 'Ubuntu Mono'
plt.rcParams['font.size'] = 12
plt.rcParams['axes.labelsize'] = 11
plt.rcParams['axes.labelweight'] = 'bold'
plt.rcParams['axes.titlesize'] = 12
plt.rcParams['xtick.labelsize'] = 9
plt.rcParams['ytick.labelsize'] = 9
plt.rcParams['legend.fontsize'] = 11
plt.rcParams['figure.titlesize'] = 13

#读取文件
reader = csv.reader(open("train_server_data.csv", "r"), delimiter=",")
reader1 = csv.reader(open("crossval_server_data.csv", "r"), delimiter=",")
reader2 = csv.reader(open("test_server_data.csv", "r"), delimiter=",")
#转为list格式
tr = list(reader)
cv = list(reader1)
ts = list(reader2)
#得到训练，交叉，测试数据
train_data = np.array(tr[: :]).astype("float")
crossval_data = np.array(cv[: :]).astype("float")
test_data = np.array(ts[: :]).astype("float")

'''
def read_dataset(filePath, delimiter=','):
    return genfromtxt(filePath, delimiter=delimiter)

#特征归一化
def feature_normalize(dataset):
    mu = np.mean(dataset, axis=0)
    sigma = np.std(dataset, axis=0)
    return (dataset - mu) / sigma
'''
#高斯预测
def estimate_gaussian(dataset):
    mu = np.mean(dataset, axis=0)
    sigma = np.cov(dataset.T)  #求协方差，协方差表示两个变量在一起的水平。
    return mu, sigma

#多元高斯
def multivariate_gaussian(dataset, mu, sigma):
    p = multivariate_normal(mean=mu, cov=sigma)
    return p.pdf(dataset)

#选择阈值
def select_threshold(probs, test_data):
    best_epsilon = 0
    best_f1 = 0
    f = 0
    stepsize = (max(probs) - min(probs)) / 1000;
    epsilons = np.arange(min(probs), max(probs), stepsize)
    for epsilon in np.nditer(epsilons):
        predictions = (probs < epsilon)
        f = f1_score(test_data, predictions, average='binary')
        if f > best_f1:
            best_f1 = f
            best_epsilon = epsilon

return best_f1, best_epsilon

mu, sigma = estimate_gaussian(train_data)
p = multivariate_gaussian(train_data,mu,sigma)

#利用交叉熵寻找最佳阈值ep。selecting optimal value of epsilon using cross validation
p_cv = multivariate_gaussian(crossval_data,mu,sigma)
fscore, ep = select_threshold(p_cv,test_data)
print(fscore, ep)

#选择异常点
outliers = np.asarray(np.where(p < ep))

plt.figure(1)
plt.xlabel('motor1')
plt.ylabel('motor2')
plt.title('Datapoints of distribution')
plt.plot(train_data[:,0], train_data[:,1],'b+')
plt.show()

plt.figure(2)
plt.xlabel('motor1')
plt.ylabel('motor2')
plt.title('Detection of Outliers')
plt.plot(train_data[:,0],train_data[:,1],'bx')
plt.plot(train_data[outliers,0],train_data[outliers,1],'ro')  #把异常点标记红色
plt.show()

'''
C:Usersz003tesjAppDataLocalProgramsPythonPython35libsite-packagessklearnmetricsclassification.py:1135: 
UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
0.875 9.036201327981212e-05
C:Usersz003tesjAppDataLocalProgramsPythonPython35libsite-packagesmatplotlibfont_manager.py:1320: 
UserWarning: findfont: Font family ['serif'] not found. Falling back to DejaVu Sans
  (prop.get_family(), self.defaultFamily[fontext]))
[Finished in 14.4s]
'''

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114



import matplotlib.pyplot as plt
import numpy as np
import csv
from numpy import genfromtxt
from scipy.stats import multivariate_normal
from sklearn.metrics import f1_score

#画图设置
plt.style.use('ggplot')

plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.serif'] = 'Ubuntu'
plt.rcParams['font.monospace'] = 'Ubuntu Mono'
plt.rcParams['font.size'] = 12
plt.rcParams['axes.labelsize'] = 11
plt.rcParams['axes.labelweight'] = 'bold'
plt.rcParams['axes.titlesize'] = 12
plt.rcParams['xtick.labelsize'] = 9
plt.rcParams['ytick.labelsize'] = 9
plt.rcParams['legend.fontsize'] = 11
plt.rcParams['figure.titlesize'] = 13



#读取文件
reader = csv.reader(open("train_server_data.csv", "r"), delimiter=",")
reader1 = csv.reader(open("crossval_server_data.csv", "r"), delimiter=",")
reader2 = csv.reader(open("test_server_data.csv", "r"), delimiter=",")
#转为list格式
tr = list(reader)
cv = list(reader1)
ts = list(reader2)
#得到训练，交叉，测试数据
train_data = np.array(tr[: :]).astype("float")
crossval_data = np.array(cv[: :]).astype("float")
test_data = np.array(ts[: :]).astype("float")

'''
def read_dataset(filePath, delimiter=','):
    return genfromtxt(filePath, delimiter=delimiter)

#特征归一化
def feature_normalize(dataset):
    mu = np.mean(dataset, axis=0)
    sigma = np.std(dataset, axis=0)
    return (dataset - mu) / sigma
'''
#高斯预测
def estimate_gaussian(dataset):
    mu = np.mean(dataset, axis=0)
    sigma = np.cov(dataset.T)  #求协方差，协方差表示两个变量在一起的水平。
    return mu, sigma

#多元高斯
def multivariate_gaussian(dataset, mu, sigma):
    p = multivariate_normal(mean=mu, cov=sigma)
    return p.pdf(dataset)

#选择阈值
def select_threshold(probs, test_data):
    best_epsilon = 0
    best_f1 = 0
    f = 0
    stepsize = (max(probs) - min(probs)) / 1000;
    epsilons = np.arange(min(probs), max(probs), stepsize)
    for epsilon in np.nditer(epsilons):
        predictions = (probs < epsilon)
        f = f1_score(test_data, predictions, average='binary')
        if f > best_f1:
            best_f1 = f
            best_epsilon = epsilon

    return best_f1, best_epsilon

mu, sigma = estimate_gaussian(train_data)
p = multivariate_gaussian(train_data,mu,sigma)

#利用交叉熵寻找最佳阈值ep。selecting optimal value of epsilon using cross validation
p_cv = multivariate_gaussian(crossval_data,mu,sigma)
fscore, ep = select_threshold(p_cv,test_data)
print(fscore, ep)

#选择异常点
outliers = np.asarray(np.where(p < ep))

plt.figure(1)
plt.xlabel('motor1')
plt.ylabel('motor2')
plt.title('Datapoints of distribution')
plt.plot(train_data[:,0], train_data[:,1],'b+')
plt.show()

plt.figure(2)
plt.xlabel('motor1')
plt.ylabel('motor2')
plt.title('Detection of Outliers')
plt.plot(train_data[:,0],train_data[:,1],'bx')
plt.plot(train_data[outliers,0],train_data[outliers,1],'ro')  #把异常点标记红色
plt.show()

'''
C:Usersz003tesjAppDataLocalProgramsPythonPython35libsite-packagessklearnmetricsclassification.py:1135: 
UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
0.875 9.036201327981212e-05
C:Usersz003tesjAppDataLocalProgramsPythonPython35libsite-packagesmatplotlibfont_manager.py:1320: 
UserWarning: findfont: Font family ['serif'] not found. Falling back to DejaVu Sans
  (prop.get_family(), self.defaultFamily[fontext]))
[Finished in 14.4s]
'''