DGA域名指僵尸网络通过算法生成的随机性较高的域名,此类域名往往被攻击者用于构建自己的恶意软件基础设施,用于绕过安全产品的黑名单,从而规避安全设备的拦截以建立C2连接或DNS通道传输。
1.数据集:
本小节使用alexa前1000域名(679个样本:label标记为0)作为白样本,使用dga-cryptolocker(1000个样本:label标记为1)和dga-post-tovar-goz(1000个样本:label标记为2)作为黑样本。
def load_alexa(filename, min_len=None):
    """Load benign domains from an Alexa-style CSV file.

    The domain is read from the second column of every row.

    Args:
        filename: Path of the CSV file to read.
        min_len: Minimum domain length to keep. Defaults to the module-level
            ``MIN_LEN`` constant when omitted.

    Returns:
        A list of the domains whose length is at least ``min_len``, in file
        order.
    """
    if min_len is None:
        min_len = MIN_LEN
    # The context manager closes the file; newline="" is what the csv module
    # asks for when a file object is handed to csv.reader.
    with open(filename, newline="") as csv_file:
        return [
            row[1]
            for row in csv.reader(csv_file)
            # Blank or single-column rows have no domain column; skip them
            # instead of failing with IndexError.
            if len(row) > 1 and len(row[1]) >= min_len
        ]
def load_dga(filename, min_len=None):
    """Load DGA domains from a comma-separated feed file.

    The domain is the text before the first comma on each line, e.g.
    ``xsxqeadsbgvpdke.co.uk,Domain used by Cryptolocker ...,2017-04-13,``.

    Args:
        filename: Path of the feed file to read.
        min_len: Minimum domain length to keep. Defaults to the module-level
            ``MIN_LEN`` constant when omitted.

    Returns:
        A list of the domains whose length is at least ``min_len``, in file
        order.
    """
    if min_len is None:
        min_len = MIN_LEN
    domain_list = []
    with open(filename) as feed:
        for line in feed:
            # Strip the line ending first so it is neither counted in the
            # length check nor kept as part of the domain.
            line = line.strip()
            # Blank lines and "#" header/comment lines carry no domain.
            if not line or line.startswith("#"):
                continue
            domain = line.split(",", 1)[0].strip()
            if len(domain) >= min_len:
                domain_list.append(domain)
    return domain_list
def nb_dga():
    # Load the benign sample set and the two DGA families.
    alexa = load_alexa("../data/top-1000.csv")
    cryptolocker = load_dga("../data/dga-cryptolocke-1000.txt")
    tovar_goz = load_dga("../data/dga-post-tovar-goz-1000.txt")
    x_domain_list = np.concatenate((alexa, cryptolocker, tovar_goz))
    # One label per domain, in the same order as x_domain_list:
    # 0 for Alexa, 1 for the first DGA file, 2 for the second.
    y = np.concatenate((
        [0] * len(alexa),
        [1] * len(cryptolocker),
        [2] * len(tovar_goz),
    ))
2.特征化:
本小节使用2-gram对DGA域名进行切分,切割单元为单个字符(token_pattern=r"\w"),并将切分结果映射为词频向量,具体代码如下:
# Build count features over 2-grams only (ngram_range=(2, 2)); with
# token_pattern=r"\w" every single word character is one token, so each
# feature is a pair of adjacent characters.
cv = CountVectorizer(ngram_range=(2, 2), decode_error="ignore",
token_pattern=r"\w", min_df=1)
# Fit the vocabulary on the domains and convert the resulting count matrix
# to a dense array.
x= cv.fit_transform(x_domain_list).toarray()
3.训练样本:
# Cluster the feature vectors into two groups; random_state seeds KMeans.
model=KMeans(n_clusters=2, random_state=random_state)
# y_pred holds the cluster index assigned to each sample.
y_pred = model.fit_predict(x)
4.可视化:
使用TSNE将高维向量降维后绘图,其中预测为簇1的样本用o表示,其余样本用x表示(KMeans的簇编号是任意的,并不直接对应正常/DGA标签)
# Reduce the count vectors with t-SNE so they can be drawn as a scatter plot.
tsne = TSNE(learning_rate=100)
x = tsne.fit_transform(x)
# Draw each predicted cluster with a single scatter call. Calling scatter once
# per point is slow and advances the colour cycle on every call, so points of
# the same cluster end up in different colours.
in_cluster_1 = y_pred == 1
plt.scatter(x[in_cluster_1, 0], x[in_cluster_1, 1], marker='o')
plt.scatter(x[~in_cluster_1, 0], x[~in_cluster_1, 1], marker='x')
plt.show()
5.完整代码:
相比原作者提供的源码,新增了计算准确率的部分
# -*- coding:utf-8 -*-
import numpy as np
import csv
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
#处理域名的最小长度
MIN_LEN=10
# Random seed passed to KMeans
random_state = 170
def load_alexa(filename, min_len=None):
    """Load benign domains from an Alexa-style CSV file.

    The domain is read from the second column of every row.

    Args:
        filename: Path of the CSV file to read.
        min_len: Minimum domain length to keep. Defaults to the module-level
            ``MIN_LEN`` constant when omitted.

    Returns:
        A list of the domains whose length is at least ``min_len``, in file
        order.
    """
    if min_len is None:
        min_len = MIN_LEN
    # The context manager closes the file; newline="" is what the csv module
    # asks for when a file object is handed to csv.reader.
    with open(filename, newline="") as csv_file:
        return [
            row[1]
            for row in csv.reader(csv_file)
            # Blank or single-column rows have no domain column; skip them
            # instead of failing with IndexError.
            if len(row) > 1 and len(row[1]) >= min_len
        ]
def load_dga(filename, min_len=None):
    """Load DGA domains from a comma-separated feed file.

    The domain is the text before the first comma on each line. Example line
    (feed format of http://osint.bambenekconsulting.com/manual/cl.txt):
    ``xsxqeadsbgvpdke.co.uk,Domain used by Cryptolocker - Flashback DGA for
    13 Apr 2017,2017-04-13,``

    Args:
        filename: Path of the feed file to read.
        min_len: Minimum domain length to keep. Defaults to the module-level
            ``MIN_LEN`` constant when omitted.

    Returns:
        A list of the domains whose length is at least ``min_len``, in file
        order.
    """
    if min_len is None:
        min_len = MIN_LEN
    domain_list = []
    with open(filename) as feed:
        for line in feed:
            # Strip the line ending first so it is neither counted in the
            # length check nor kept as part of the domain.
            line = line.strip()
            # Blank lines and "#" header/comment lines carry no domain.
            if not line or line.startswith("#"):
                continue
            domain = line.split(",", 1)[0].strip()
            if len(domain) >= min_len:
                domain_list.append(domain)
    return domain_list
def kmeans_dga():
    """Cluster Alexa and DGA domains with KMeans, print accuracy and plot.

    Loads the Alexa list and two DGA feeds, turns every domain into
    character 2-gram counts, clusters the vectors into two groups, prints the
    percentage of samples whose cluster agrees with the benign/DGA label, and
    shows a t-SNE scatter plot of the clusters.
    """
    alexa = load_alexa("../data/dga/top-100.csv")
    cryptolocker = load_dga("../data/dga/dga-cryptolocke-50.txt")
    tovar_goz = load_dga("../data/dga/dga-post-tovar-goz-50.txt")
    x_domain_list = np.concatenate((alexa, cryptolocker, tovar_goz))
    # Benign domains are labelled 0; both DGA families share label 1.
    y = np.concatenate((
        [0] * len(alexa),
        [1] * len(cryptolocker),
        [1] * len(tovar_goz),
    ))

    # Character-level 2-gram counts: every word character is one token.
    cv = CountVectorizer(ngram_range=(2, 2), decode_error="ignore",
                         token_pattern=r"\w", min_df=1)
    x = cv.fit_transform(x_domain_list).toarray()

    model = KMeans(n_clusters=2, random_state=random_state)
    y_pred = model.fit_predict(x)

    # KMeans numbers its clusters arbitrarily, so cluster 0 may just as well
    # correspond to label 1. With two clusters and two labels the flipped
    # mapping scores exactly 1 - agreement, so report the better of the two.
    agreement = np.mean(y_pred == y)
    print(max(agreement, 1 - agreement) * 100)

    # Reduce the count vectors with t-SNE for plotting.
    tsne = TSNE(learning_rate=100)
    points = tsne.fit_transform(x)
    # One scatter call per cluster: a call per point is slow and advances the
    # colour cycle on every call.
    in_cluster_1 = y_pred == 1
    plt.scatter(points[in_cluster_1, 0], points[in_cluster_1, 1], marker='o')
    plt.scatter(points[~in_cluster_1, 0], points[~in_cluster_1, 1], marker='x')
    plt.show()
# Script entry point: run the clustering experiment when executed directly.
if __name__ == '__main__':
    kmeans_dga()
6.运行结果:
72.15189873417721
可视化如下
看起来效果不怎么地啊
7.测试场景2:
测试仅区分正常数据与cryptolocker家族的DGA域名,代码修改如下
def kmeans_dga():
    """Cluster Alexa domains against the Cryptolocker DGA family only.

    Turns every domain into character 2-gram counts, clusters the vectors into
    two groups with KMeans, prints the percentage of samples whose cluster
    agrees with the benign/DGA label, and shows a t-SNE scatter plot.
    """
    alexa = load_alexa("../data/dga/top-100.csv")
    cryptolocker = load_dga("../data/dga/dga-cryptolocke-50.txt")
    # The second DGA feed is not part of this scenario, so it is not loaded.
    x_domain_list = np.concatenate((alexa, cryptolocker))
    # Benign domains are labelled 0, DGA domains 1.
    y = np.concatenate(([0] * len(alexa), [1] * len(cryptolocker)))

    # Character-level 2-gram counts: every word character is one token.
    cv = CountVectorizer(ngram_range=(2, 2), decode_error="ignore",
                         token_pattern=r"\w", min_df=1)
    x = cv.fit_transform(x_domain_list).toarray()

    model = KMeans(n_clusters=2, random_state=random_state)
    y_pred = model.fit_predict(x)

    # KMeans numbers its clusters arbitrarily, so cluster 0 may just as well
    # correspond to label 1. With two clusters and two labels the flipped
    # mapping scores exactly 1 - agreement, so report the better of the two.
    agreement = np.mean(y_pred == y)
    print(max(agreement, 1 - agreement) * 100)

    # Reduce the count vectors with t-SNE for plotting.
    tsne = TSNE(learning_rate=100)
    points = tsne.fit_transform(x)
    # One scatter call per cluster: a call per point is slow and advances the
    # colour cycle on every call.
    in_cluster_1 = y_pred == 1
    plt.scatter(points[in_cluster_1, 0], points[in_cluster_1, 1], marker='o')
    plt.scatter(points[~in_cluster_1, 0], points[~in_cluster_1, 1], marker='x')
    plt.show()
测试结果如下所示,看起来也没有好到哪里去
82.4074074074074
可视化
8.测试多种场景:
将代码改为可配置组合,源码如下
# -*- coding:utf-8 -*-
import numpy as np
import csv
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
#处理域名的最小长度
MIN_LEN=10
# Random seed passed to KMeans
random_state = 170
def load_alexa(filename, min_len=None):
    """Load benign domains from an Alexa-style CSV file.

    The domain is read from the second column of every row.

    Args:
        filename: Path of the CSV file to read.
        min_len: Minimum domain length to keep. Defaults to the module-level
            ``MIN_LEN`` constant when omitted.

    Returns:
        A list of the domains whose length is at least ``min_len``, in file
        order.
    """
    if min_len is None:
        min_len = MIN_LEN
    # The context manager closes the file; newline="" is what the csv module
    # asks for when a file object is handed to csv.reader.
    with open(filename, newline="") as csv_file:
        return [
            row[1]
            for row in csv.reader(csv_file)
            # Blank or single-column rows have no domain column; skip them
            # instead of failing with IndexError.
            if len(row) > 1 and len(row[1]) >= min_len
        ]
def load_dga(filename, min_len=None):
    """Load DGA domains from a comma-separated feed file.

    The domain is the text before the first comma on each line. Example line
    (feed format of http://osint.bambenekconsulting.com/manual/cl.txt):
    ``xsxqeadsbgvpdke.co.uk,Domain used by Cryptolocker - Flashback DGA for
    13 Apr 2017,2017-04-13,``

    Args:
        filename: Path of the feed file to read.
        min_len: Minimum domain length to keep. Defaults to the module-level
            ``MIN_LEN`` constant when omitted.

    Returns:
        A list of the domains whose length is at least ``min_len``, in file
        order.
    """
    if min_len is None:
        min_len = MIN_LEN
    domain_list = []
    with open(filename) as feed:
        for line in feed:
            # Strip the line ending first so it is neither counted in the
            # length check nor kept as part of the domain.
            line = line.strip()
            # Blank lines and "#" header/comment lines carry no domain.
            if not line or line.startswith("#"):
                continue
            domain = line.split(",", 1)[0].strip()
            if len(domain) >= min_len:
                domain_list.append(domain)
    return domain_list
def kmeans_dga(domain_x=123, pic_show=False):
    """Cluster a chosen combination of domain sets with KMeans and score it.

    The three sets are 1 = Alexa, 2 = Cryptolocker DGA, 3 = post-Tovar GOZ DGA.

    Args:
        domain_x: Which sets to cluster. 12, 13 and 23 select the matching
            pair, where the first set gets label 0 and the second label 1.
            Any other value (including the default 123) uses all three sets,
            with Alexa labelled 0 and both DGA families labelled 1.
        pic_show: When true, also show a t-SNE scatter plot of the clusters.

    Prints ``domain_x`` followed by the percentage of samples whose cluster
    agrees with their label.
    """
    alexa = load_alexa("../data/dga/top-100.csv")
    cryptolocker = load_dga("../data/dga/dga-cryptolocke-50.txt")
    tovar_goz = load_dga("../data/dga/dga-post-tovar-goz-50.txt")

    # Each scenario is a sequence of (domains, label) groups.
    all_three = ((alexa, 0), (cryptolocker, 1), (tovar_goz, 1))
    scenarios = {
        12: ((alexa, 0), (cryptolocker, 1)),
        13: ((alexa, 0), (tovar_goz, 1)),
        23: ((cryptolocker, 0), (tovar_goz, 1)),
    }
    groups = scenarios.get(domain_x, all_three)
    x_domain_list = np.concatenate([domains for domains, _ in groups])
    y = np.concatenate([[label] * len(domains) for domains, label in groups])

    # Character-level 2-gram counts: every word character is one token.
    cv = CountVectorizer(ngram_range=(2, 2), decode_error="ignore",
                         token_pattern=r"\w", min_df=1)
    x = cv.fit_transform(x_domain_list).toarray()

    model = KMeans(n_clusters=2, random_state=random_state)
    y_pred = model.fit_predict(x)

    # KMeans numbers its clusters arbitrarily, so cluster 0 may just as well
    # correspond to label 1. With two clusters and two labels the flipped
    # mapping scores exactly 1 - agreement, so report the better of the two.
    agreement = np.mean(y_pred == y)
    score = max(agreement, 1 - agreement) * 100
    print(domain_x, score)

    if pic_show:
        # Reduce the count vectors with t-SNE for plotting.
        tsne = TSNE(learning_rate=100)
        points = tsne.fit_transform(x)
        # One scatter call per cluster: a call per point is slow and advances
        # the colour cycle on every call.
        in_cluster_1 = y_pred == 1
        plt.scatter(points[in_cluster_1, 0], points[in_cluster_1, 1], marker='o')
        plt.scatter(points[~in_cluster_1, 0], points[~in_cluster_1, 1], marker='x')
        plt.show()
# Run every scenario in turn: all three sets first, then each pair.
if __name__ == '__main__':
    for scenario in (123, 12, 13, 23):
        kmeans_dga(domain_x=scenario)
输出结果如下
123 72.15189873417721
12 82.4074074074074
13 33.33333333333333
23 60.0
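结果都不怎么样。需要注意的是,KMeans的簇编号(y_pred中的0或1)是任意分配的,并不保证与真实标签的0/1一一对应,直接比较y_pred与y会受簇编号影响:例如上面场景13的33.33%,在交换簇编号后相当于66.67%。但即便按最有利的对应关系来计算,聚类效果仍然不理想。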