我是靠谱客的博主 自觉电灯胆,最近开发中收集的这篇文章主要介绍使用支持向量机算法区分黑白,觉得挺不错的,现在分享给大家,希望可以做个参考。

概述

本博客是学习《Web安全之机器学习入门》的笔记。

支持向量机算法可以在这里看:https://blog.csdn.net/qq_37865996/article/details/84555680

这个算法在这里的应用,自然是想找到一个超平面,划分普通用户和黑客,距离超平面最近的用户样本称为支持向量。在支持向量机中,很多时候样本都是线性不可分的,虽然增加维数可以解决这一问题,但是由此造成的“维数灾难”是很多人极其不想遇到的。而使用核函数可以有效避免这一问题,有关核函数可以查看:https://blog.csdn.net/kateyabc/article/details/79980880

比如通过下列代码

# Demo script: fit a linear SVM on two 2-D point clouds and plot the
# separating line together with two parallel lines through support vectors.
print(__doc__)

import numpy as np
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
from sklearn import svm

# Create 40 random points. np.r_ stacks the two matrices on top of each other
# (row-wise concatenation), which requires the same number of columns; this is
# similar to concat() in pandas.
np.random.seed(0)
X = np.r_[np.random.randn(20, 2) - [2, 2], np.random.randn(20, 2) + [2, 2]]
# Labels: the first 20 points are class 0, the last 20 are class 1.
Y = [0] * 20 + [1] * 20

# fit the model
clf = svm.SVC(kernel='linear')
clf.fit(X, Y)

# Build the separating hyperplane: w[0]*x + w[1]*y + intercept = 0 rewritten
# as y = a*x - intercept/w[1], where a is the slope.
w = clf.coef_[0]
a = -w[0] / w[1]
xx = np.linspace(-5, 5)# evenly spaced x values over [-5, 5]
yy = a * xx - (clf.intercept_[0]) / w[1]
# plot the parallels to the separating hyperplane that pass through the
# support vectors
# Same slope a, shifted so the line passes through the first support vector.
b = clf.support_vectors_[0]
yy_down = a * xx + (b[1] - a * b[0])
# Same slope a, shifted so the line passes through the last support vector.
b = clf.support_vectors_[-1]
yy_up = a * xx + (b[1] - a * b[0])

# Draw everything with matplotlib
plt.plot(xx, yy, 'k-')
plt.plot(xx, yy_down, 'k--')
plt.plot(xx, yy_up, 'k--')

# Support vectors are drawn as larger, unfilled markers; all samples are then
# drawn on top, coloured by label.
plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
            s=80, facecolors='none')
plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired)

plt.axis('tight')
plt.show()

结果:

1.识别XSS

关于识别XSS,我们可以从网络日志入手。

在特征化步骤中,经过宝书作者的处理,网络日志特征最后落到了url长度、url中包含第三方域名的个数、敏感字符的个数和敏感关键字的个数。

import re
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm
from sklearn.externals import  joblib
from sklearn.metrics import classification_report
from sklearn import metrics

# Module-level accumulators: x collects feature rows, y collects labels
# (etl() appends to both).
x = []
y = []
# Feature extractors: number of third-party domains in the URL, number of
# sensitive characters, and number of sensitive keywords.
def get_len(url):
    """Return the number of characters in ``url``."""
    length = len(url)
    return length
def get_url_count(url):
    """Return 1 when ``url`` contains an ``http://`` or ``https://`` scheme, else 0.

    The match is case-insensitive and may occur anywhere in the string.
    """
    found = re.search('(http://)|(https://)', url, re.IGNORECASE)
    return 1 if found else 0
def get_evil_char(url):
    """Count the sensitive characters in ``url``.

    The characters counted are ``<``, ``>``, ``,``, ``'``, ``"`` and ``/``.

    Args:
        url: The URL (or log line) to inspect.

    Returns:
        The number of occurrences of any of the sensitive characters.
    """
    # The double quote inside the character class must be escaped; without
    # the escape the string literal ends early and the module does not parse.
    return len(re.findall("[<>,'\"/]", url, re.IGNORECASE))
def get_evil_word(url):
    """Count the sensitive keywords in ``url`` (case-insensitive).

    Keywords: ``alert``, ``script=``, ``%3c``, ``%3e``, ``%20``, ``onerror``,
    ``onload``, ``eval``, ``src=`` and ``prompt``.

    Args:
        url: The URL (or log line) to inspect.

    Returns:
        The number of non-overlapping keyword occurrences.
    """
    # Every keyword is its own alternative. The previous pattern lacked the
    # "|" between "script=" and "%3c", so those two only matched when they
    # appeared back to back as "script=%3c".
    pattern = "alert|script=|%3c|%3e|%20|onerror|onload|eval|src=|prompt"
    return len(re.findall(pattern, url, re.IGNORECASE))
def get_last_char(url):
    """Return 1 when ``url`` ends with ``/``, else 0.

    The check uses the regex anchor ``$``, so a ``/`` directly before a
    trailing newline also counts.
    """
    ends_with_slash = re.search('/$', url, re.IGNORECASE) is not None
    return int(ends_with_slash)
def get_feature(url):
    """Build the five-element feature vector for ``url``.

    Order: length, URL-scheme flag, sensitive-character count,
    sensitive-keyword count, trailing-slash flag.
    """
    features = [
        get_len(url),
        get_url_count(url),
        get_evil_char(url),
        get_evil_word(url),
        get_last_char(url),
    ]
    return features

def do_metrics(y_test,y_pred):
    """Print accuracy, confusion matrix, precision, recall and F1 for a prediction.

    Each metric is printed as a label line followed by the value line.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels, in the same order as ``y_test``.
    """
    # Parenthesised single-argument print works as a statement on Python 2
    # and as a function call on Python 3, with identical output on both.
    print("metrics.accuracy_score:")
    print(metrics.accuracy_score(y_test, y_pred))
    print("metrics.confusion_matrix:")
    print(metrics.confusion_matrix(y_test, y_pred))
    print("metrics.precision_score:")
    print(metrics.precision_score(y_test, y_pred))
    print("metrics.recall_score:")
    print(metrics.recall_score(y_test, y_pred))
    print("metrics.f1_score:")
    print(metrics.f1_score(y_test,y_pred))

def etl(filename,data,isxss):
    """Read a file of URLs and append one feature row and one label per line.

    For every line, a four-element row (length, URL-scheme flag,
    sensitive-character count, sensitive-keyword count) is appended to
    ``data`` and a label is appended to the module-level list ``y``. Lines
    are passed to the extractors exactly as read, trailing newline included.

    Args:
        filename: Path of the text file, one sample per line.
        data: List that receives the feature rows (mutated in place).
        isxss: Truthy for XSS samples (label 1), falsy for benign ones (label 0).

    Returns:
        The same ``data`` list.
    """
    label = 1 if isxss else 0
    with open(filename) as handle:
        for record in handle:
            row = [
                get_len(record),
                get_url_count(record),
                get_evil_char(record),
                get_evil_word(record),
            ]
            data.append(row)
            y.append(label)
    return data

# Load the XSS samples (label 1) and the benign samples (label 0) into x / y.
etl('/Users/zhanglipeng/Data/xss-200000.txt',x,1)
etl('/Users/zhanglipeng/Data/good-xss-200000.txt',x,0)
# Split the data: 40% is held out as test samples and the rest is used for
# training (the ratio can be adjusted).
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.4, random_state=0)
# Train the model; the kernel used here is linear.
clf = svm.SVC(kernel='linear', C=1).fit(x_train, y_train)

# Predict on the held-out samples and print the evaluation metrics.
y_pred = clf.predict(x_test)
do_metrics(y_test, y_pred)

输出结果:

metrics.accuracy_score:

0.9979394698668074

metrics.confusion_matrix:

[[54092    73]

[   52  6447]]

metrics.precision_score:

0.9888036809815951

metrics.recall_score:

0.991998769041391

metrics.f1_score:

0.9903986481296566

可以看到无论是精确率还是召回率,支持向量机算法在这里的应用都显得很出色。

2.使用支持向量机算法区分僵尸网络DGA家族

根据书中作者在这里搜集的url,包含有cryptolocker、post-tovar-goz和alexa三个家族的域名。这里作者提出了四种方法来进行各自的统计和对比,见下图:

# -*- coding:utf-8 -*-

import sys
import urllib
import urlparse
import re
from hmmlearn import hmm
import numpy as np
from sklearn.externals import joblib
import HTMLParser
import nltk
import csv
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import os


# Minimum length a domain must have to be processed
MIN_LEN=10

# Number of HMM states
N=8
# Maximum-likelihood probability threshold
T=-50

# File name of the persisted model
FILE_MODEL="9-2.m"

def load_alexa(filename, min_len=None):
    """Load domains from a CSV file whose second column holds the domain.

    Args:
        filename: Path of the CSV file to read.
        min_len: Minimum domain length to keep. Defaults to the module-level
            ``MIN_LEN`` when omitted.

    Returns:
        The domains that are at least ``min_len`` characters long, in file
        order.
    """
    if min_len is None:
        min_len = MIN_LEN
    domain_list=[]
    # The context manager closes the file; previously the handle was left open.
    with open(filename) as f:
        for row in csv.reader(f):
            # Blank or single-column rows carry no domain column; skip them
            # instead of failing with IndexError.
            if len(row) < 2:
                continue
            domain=row[1]
            if len(domain) >= min_len:
                domain_list.append(domain)
    return domain_list

def domain2ver(domain):
    """Turn a domain string into a column of character codes.

    Returns a list with one single-element list ``[ord(ch)]`` per character,
    in order.
    """
    return [[ord(ch)] for ch in domain]

def train_hmm(domain_list):
    """Fit a Gaussian HMM on the character codes of the given domains and save it.

    Every domain is one observation sequence. The fitted model is written to
    ``FILE_MODEL`` with joblib.

    Args:
        domain_list: Iterable of domain strings used for training.

    Returns:
        The fitted ``hmm.GaussianHMM`` instance.
    """
    # A single [[0]] seed sequence of length 1 precedes the real data, as in
    # the original implementation, so the training input is unchanged.
    chunks = [np.array([[0]])]
    X_lens = [1]
    for domain in domain_list:
        np_ver = np.array(domain2ver(domain))
        chunks.append(np_ver)
        X_lens.append(len(np_ver))
    # Concatenate once at the end; concatenating inside the loop copied the
    # whole accumulated array on every iteration.
    X = np.concatenate(chunks)

    remodel = hmm.GaussianHMM(n_components=N, covariance_type="full", n_iter=100)
    remodel.fit(X,X_lens)
    joblib.dump(remodel, FILE_MODEL)

    return remodel
#Extract the domain names from a DGA feed file
def load_dga(filename, min_len=None):
    """Load domains from a comma-separated DGA feed file.

    The domain is the text before the first comma of each line, e.g.
    ``xsxqeadsbgvpdke.co.uk,Domain used by Cryptolocker - Flashback DGA for 13 Apr 2017,2017-04-13,``
    (format of http://osint.bambenekconsulting.com/manual/cl.txt).

    Args:
        filename: Path of the feed file, one record per line.
        min_len: Minimum domain length to keep. Defaults to the module-level
            ``MIN_LEN`` when omitted.

    Returns:
        The domains that are at least ``min_len`` characters long, in file
        order.
    """
    if min_len is None:
        min_len = MIN_LEN
    domain_list=[]
    with open(filename) as f:
        for line in f:
            # Strip surrounding whitespace: a line without any comma would
            # otherwise keep its trailing newline as part of the domain.
            domain=line.split(",", 1)[0].strip()
            if len(domain) >= min_len:
                domain_list.append(domain)
    return  domain_list

def test_dga(remodel,filename):
    """Score every domain of a DGA feed file with a trained model.

    Args:
        remodel: Trained model exposing ``score(observations)``.
        filename: Path of the DGA feed file, read with ``load_dga``.

    Returns:
        ``(lengths, scores)``: two parallel lists holding each domain's
        length and the model score of its character-code sequence.
    """
    lengths = []
    scores = []
    for name in load_dga(filename):
        observations = np.array(domain2ver(name))
        score = remodel.score(observations)
        lengths.append(len(name))
        scores.append(score)
    return lengths, scores

def test_alexa(remodel,filename):
    """Score every domain of an Alexa CSV file with a trained model.

    Args:
        remodel: Trained model exposing ``score(observations)``.
        filename: Path of the CSV file, read with ``load_alexa``.

    Returns:
        ``(lengths, scores)``: two parallel lists holding each domain's
        length and the model score of its character-code sequence.
    """
    lengths = []
    scores = []
    for name in load_alexa(filename):
        observations = np.array(domain2ver(name))
        score = remodel.score(observations)
        lengths.append(len(name))
        scores.append(score)
    return lengths, scores

def show_hmm():
    """Plot HMM score against domain length for the three domain families.

    The model is trained on the Alexa list when ``FILE_MODEL`` does not exist
    yet; it is then always loaded from ``FILE_MODEL``.
    """
    training_domains = load_alexa("/Users/zhanglipeng/Data/top-1000.csv")
    if not os.path.exists(FILE_MODEL):
        train_hmm(training_domains)
    remodel = joblib.load(FILE_MODEL)

    # Each entry: (x/y data, colour, legend label, marker), in drawing order.
    series = [
        (test_dga(remodel, "/Users/zhanglipeng/Data/dga-post-tovar-goz-1000.txt"),
         'b', "dga_post-tovar-goz", 'o'),
        (test_dga(remodel, "/Users/zhanglipeng/Data/dga-cryptolocke-1000.txt"),
         'g', "dga_cryptolock", 'v'),
        (test_alexa(remodel, "/Users/zhanglipeng/Data/test-top-1000.csv"),
         'r', "alexa", '*'),
    ]

    fig, ax = plt.subplots()
    ax.set_xlabel('Domain Length')
    ax.set_ylabel('HMM Score')
    for (xs, ys), colour, tag, shape in series:
        ax.scatter(xs, ys, color=colour, label=tag, marker=shape)
    ax.legend(loc='best')
    plt.show()

#Normal domains tend to have a higher share of vowels, so the vowel ratio can help tell them apart
def get_aeiou(domain_list):
    """Compute length and vowel ratio for every domain.

    Returns ``(lengths, ratios)``: two parallel lists, where each ratio is the
    number of ``a/e/i/o/u`` characters (case-insensitive) divided by the
    domain length.
    """
    lengths = []
    ratios = []
    for name in domain_list:
        size = len(name)
        lengths.append(size)
        vowels = sum(1 for ch in name.lower() if ch in 'aeiou')
        ratios.append(float(vowels) / size)
    return lengths, ratios
#The three families naturally differ in vowel ratio; fetch the data and compute it here
def show_aeiou():
    """Plot vowel ratio against domain length for the three domain families."""
    alexa_points = get_aeiou(load_alexa("/Users/zhanglipeng/Data/top-1000.csv"))
    cryptolocker_points = get_aeiou(load_dga("/Users/zhanglipeng/Data/dga-cryptolocke-1000.txt"))
    goz_points = get_aeiou(load_dga("/Users/zhanglipeng/Data/dga-post-tovar-goz-1000.txt"))

    # Domain length on the horizontal axis, vowel ratio on the vertical axis.
    fig, ax = plt.subplots()
    ax.set_xlabel('Domain Length')
    ax.set_ylabel('AEIOU Score')
    series = [
        (goz_points, 'b', "dga_post-tovar-goz", 'o'),
        (cryptolocker_points, 'g', "dga_cryptolock", 'v'),
        (alexa_points, 'r', "alexa", '*'),
    ]
    for (xs, ys), colour, tag, shape in series:
        ax.scatter(xs, ys, color=colour, label=tag, marker=shape)
    ax.legend(loc='best')
    plt.show()
#Ratio of distinct characters to domain length; a set is used because it holds no duplicates
def get_uniq_char_num(domain_list):
    """Compute length and distinct-character ratio for every domain.

    Returns ``(lengths, ratios)``: two parallel lists, where each ratio is the
    number of distinct characters divided by the domain length.
    """
    lengths = []
    ratios = []
    for name in domain_list:
        size = len(name)
        lengths.append(size)
        distinct = len(set(name))
        ratios.append(float(distinct) / size)
    return lengths, ratios
#Load the botnet and Alexa domains and compute the distinct-character ratio for each
def show_uniq_char_num():
    """Plot distinct-character ratio against domain length for the three families."""
    alexa_points = get_uniq_char_num(load_alexa("/Users/zhanglipeng/Data/top-1000.csv"))
    cryptolocker_points = get_uniq_char_num(load_dga("/Users/zhanglipeng/Data/dga-cryptolocke-1000.txt"))
    goz_points = get_uniq_char_num(load_dga("/Users/zhanglipeng/Data/dga-post-tovar-goz-1000.txt"))

    fig, ax = plt.subplots()
    ax.set_xlabel('Domain Length')
    ax.set_ylabel('UNIQ CHAR NUMBER')
    series = [
        (goz_points, 'b', "dga_post-tovar-goz", 'o'),
        (cryptolocker_points, 'g', "dga_cryptolock", 'v'),
        (alexa_points, 'r', "alexa", '*'),
    ]
    for (xs, ys), colour, tag, shape in series:
        ax.scatter(xs, ys, color=colour, label=tag, marker=shape)
    ax.legend(loc='best')
    plt.show()

#The Jaccard index is the size of the intersection of two sets divided by the size of their union.
def count2string_jarccard_index(a,b):
    """Return the Jaccard index of the character-bigram sets of two strings.

    Each string is padded with one space on both sides, so the first and last
    characters also form bigrams (``' g'`` and ``'e '`` for ``'google'``).

    Args:
        a: First string.
        b: Second string.

    Returns:
        ``len(A & B) / len(A | B)`` as a float between 0.0 and 1.0, where A
        and B are the bigram sets of ``a`` and ``b``.
    """
    def _bigrams(text):
        # Pad, then take every adjacent character pair. Building the set from
        # whole two-character slices keeps the leading bigram intact (calling
        # set() on a two-character string would split it into single
        # characters), and an empty string yields the single bigram '  '
        # instead of raising IndexError.
        padded = ' ' + text + ' '
        return {padded[i:i + 2] for i in range(len(padded) - 1)}

    x = _bigrams(a)
    y = _bigrams(b)

    # Intersection over union, per the definition; set difference (x - y)
    # would give 0.0 for identical strings.
    return (0.0 + len(x & y)) / len(x | y)

#Average Jaccard index between two collections of domains
def get_jarccard_index(a_list,b_list):
    """Average each domain's Jaccard index against a reference list.

    For every domain in ``a_list`` the result of
    ``count2string_jarccard_index`` against each domain of ``b_list`` is
    summed and divided by ``len(b_list)``.

    Returns:
        ``(lengths, averages)``: two parallel lists with each ``a_list``
        domain's length and its average index.
    """
    lengths = []
    averages = []
    for candidate in a_list:
        # Accumulate in an explicit left-to-right loop so the floating-point
        # result is exactly the sequential sum.
        total = 0.0
        for reference in b_list:
            total += count2string_jarccard_index(candidate, reference)
        lengths.append(len(candidate))
        averages.append(total / len(b_list))

    return lengths, averages

#Compute the average Jaccard index of each of the three families against the Alexa list
def show_jarccard_index():
    """Plot average Jaccard index against domain length for the three families.

    The Alexa list is the reference set for all three families, including
    itself.
    """
    alexa_domains = load_alexa("/Users/zhanglipeng/Data/top-1000.csv")
    alexa_points = get_jarccard_index(alexa_domains, alexa_domains)
    cryptolocker_domains = load_dga("/Users/zhanglipeng/Data/dga-cryptolocke-1000.txt")
    cryptolocker_points = get_jarccard_index(cryptolocker_domains, alexa_domains)
    goz_domains = load_dga("/Users/zhanglipeng/Data/dga-post-tovar-goz-1000.txt")
    goz_points = get_jarccard_index(goz_domains, alexa_domains)

    fig, ax = plt.subplots()
    ax.set_xlabel('Domain Length')
    ax.set_ylabel('JARCCARD INDEX')
    series = [
        (goz_points, 'b', "dga_post-tovar-goz", 'o'),
        (cryptolocker_points, 'g', "dga_cryptolock", 'v'),
        (alexa_points, 'r', "alexa", '*'),
    ]
    for (xs, ys), colour, tag, shape in series:
        ax.scatter(xs, ys, color=colour, label=tag, marker=shape)
    ax.legend(loc='lower right')
    plt.show()

# Script entry point: only the Jaccard-index plot is enabled; the other three
# visualizations are left commented out.
if __name__ == '__main__':
    #show_hmm()
    #show_aeiou()
    #show_uniq_char_num()
    show_jarccard_index()

其中一个:

 

学至此处,愈发觉得此书的价值无比。

最后

以上就是自觉电灯胆为你收集整理的使用支持向量机算法区分黑白的全部内容,希望文章能够帮你解决使用支持向量机算法区分黑白所遇到的程序开发问题。

如果觉得靠谱客网站的内容还不错,欢迎将靠谱客网站推荐给程序员好友。

本图文内容来源于网友提供,作为学习参考使用,或来自网络收集整理,版权属于原作者所有。
点赞(45)

评论列表共有 0 条评论

立即
投稿
返回
顶部