N-BaIoT
数据工程概论作业实现,https://archive.ics.uci.edu/ml/datasets/detection_of_IoT_botnet_attacks_N_BaioT 数据集的异常检测。
原论文使用自编码器对该数据集进行异常检测,并从检测时间和正确率两方面进行了评估。
该数据集含有9个设备,检测9个设备的攻击。
本人使用pandas对数据进行处理,使用sklearn对模型进行评估,发现随机森林效果最好。最后使用混淆矩阵来进行结果分析。
1.读取数据
def dataPrepare():
    """Read the benign and attack traffic CSVs and merge them into one frame.

    The benign capture gets ``Class`` = 0 and every attack capture gets
    ``Class`` = 1. The frames are concatenated in the order they are read
    (benign first, then the Mirai files, then the Gafgyt files) with a fresh
    0..n-1 index, handed to ``showData`` and returned.

    Returns:
        The merged, labelled DataFrame.
    """
    # Benign traffic; the original author notes it comes from the
    # Danmini_Doorbell physical device.
    benign = pd.read_csv("benign_traffic.csv")
    benign['Class'] = 0

    # Attack captures, listed in the order they must appear in the result:
    # the five Mirai files first, then the five Gafgyt files.
    attack_paths = (
        "./mirai_attacks/udp.csv",
        "./mirai_attacks/ack.csv",
        "./mirai_attacks/scan.csv",
        "./mirai_attacks/syn.csv",
        "./mirai_attacks/udpplain.csv",
        "./gafgyt_attacks/combo.csv",
        "./gafgyt_attacks/junk.csv",
        "./gafgyt_attacks/scan.csv",
        "./gafgyt_attacks/tcp.csv",
        "./gafgyt_attacks/udp.csv",
    )

    frames = [benign]
    for path in attack_paths:
        attack = pd.read_csv(path)
        attack['Class'] = 1
        frames.append(attack)

    # Stack everything and renumber the rows.
    result = pd.concat(frames, ignore_index=True)
    showData(result)
    # result.to_csv('test_res.csv')  # uncomment to dump the merged frame
    return result
2.对数据集进行分析,并对数据集进行划分
def dataPreprocessing(data):
    """Report class balance, build a 1:1 under-sample and split both sets 70/30.

    Args:
        data: DataFrame holding the feature columns plus a ``Class`` column
            in which 0 marks benign rows and 1 marks attack rows.

    Returns:
        An 8-tuple ``(X_train, X_test, y_train, y_test,
        X_train_under_sample, X_test_under_sample,
        y_train_under_sample, y_test_under_sample)``. The first four come
        from the full data, the last four from the balanced under-sample.
        Every ``y`` is a single-column DataFrame.

    Raises:
        ValueError: propagated from ``np.random.choice`` when there are more
            benign rows than attack rows, because sampling is done without
            replacement.
    """
    # Count the rows of each class.
    print('------')
    # Series.value_counts replaces the deprecated top-level pd.value_counts.
    count_class = data['Class'].value_counts(sort=True).sort_index()
    print(count_class)
    print('------')

    # Separate the features X from the target column y.
    X = data.drop(columns='Class')
    y = data[['Class']]

    # Under-sample the attack class so that both classes end up 1:1.
    attack_rows = data[data.Class == 1]
    benign_rows = data[data.Class == 0]

    positive_sample_count = len(attack_rows)
    print("正样本个数为:", positive_sample_count)

    # Index labels of the benign (negative) rows.
    negative_sample_index = np.array(benign_rows.index)
    print("负样本在数据集中所对应的索引为(打印前5个):", negative_sample_index[:5])

    # Index labels of the attack (positive) rows.
    positive_sample_index = attack_rows.index

    # replace=False samples without replacement, so no attack row is drawn
    # twice; one attack row is drawn per benign row.
    random_positive_sample_index = np.random.choice(
        positive_sample_index, len(negative_sample_index), replace=False)
    # Variant kept from the original for anomaly-detection experiments:
    # negative_sample_index = np.random.choice(positive_sample_index, len(data[data.Class == 0]), replace=False)
    print("正样本在数据集中所对应的索引为(打印前5个):", random_positive_sample_index[:5])

    under_sample_index = np.concatenate(
        [random_positive_sample_index, negative_sample_index])
    # These are index labels, so select with .loc; .iloc would only be
    # correct when the frame carries a default 0..n-1 index.
    under_sample_data = data.loc[under_sample_index, :]
    X_under_sample = under_sample_data.drop(columns='Class')
    y_under_sample = under_sample_data[['Class']]

    under_sample_size = len(under_sample_data)
    print('下采样后,新数据集中,正样本所占比例:',
          len(under_sample_data[under_sample_data.Class == 1]) / under_sample_size)
    print('下采样后,新数据集中,负样本所占比例:',
          len(under_sample_data[under_sample_data.Class == 0]) / under_sample_size)
    print('下采样后,新数据集的样本个数为:', under_sample_size)

    # 70 % train / 30 % test split, with a fixed seed, for both data sets.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)
    (X_train_under_sample, X_test_under_sample,
     y_train_under_sample, y_test_under_sample) = train_test_split(
        X_under_sample,
        y_under_sample,
        test_size=0.3,
        random_state=0)
    print('训练集样本数:', len(X_train_under_sample))
    print('测试集样本数:', len(X_test_under_sample))

    return (X_train, X_test, y_train, y_test,
            X_train_under_sample, X_test_under_sample,
            y_train_under_sample, y_test_under_sample)
3.绘制混淆矩阵
def plot_confusion_matrix(confusion_matrix, classes):
    """Draw a confusion matrix as a heat map and print three summary metrics.

    Rows are labelled as the true class and columns as the predicted class.
    After the figure is shown, precision, recall and accuracy are printed
    from cells [0, 0], [0, 1], [1, 0] and [1, 1], so the printed metrics
    describe a two-class matrix.

    Args:
        confusion_matrix: 2-D array of counts indexed as [true, predicted].
        classes: tick labels, one per class, used on both axes.
    """
    # Heat map of the raw counts.
    plt.figure()
    plt.imshow(confusion_matrix, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('confusion matrix')
    plt.colorbar()

    tick_positions = np.arange(len(classes))
    plt.xticks(tick_positions, classes, rotation=0)
    plt.yticks(tick_positions, classes)

    # Write each count into its cell; cells above half the maximum are dark,
    # so their text is drawn in white to stay readable.
    half_max = confusion_matrix.max() / 2.
    n_rows = confusion_matrix.shape[0]
    n_cols = confusion_matrix.shape[1]
    for row in range(n_rows):
        for col in range(n_cols):
            cell = confusion_matrix[row, col]
            text_color = "white" if cell > half_max else "black"
            plt.text(col, row, cell,
                     horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

    # Name the four cells of the two-class matrix before computing metrics.
    true_neg = confusion_matrix[0, 0]
    false_pos = confusion_matrix[0, 1]
    false_neg = confusion_matrix[1, 0]
    true_pos = confusion_matrix[1, 1]

    print('查准率为:', true_pos / (true_pos + false_pos))
    print('召回率为:', true_pos / (true_pos + false_neg))
    print('准确率为:', (true_neg + true_pos) / (true_neg + false_pos + true_pos + false_neg))
    print('*********************************************************************************')
4.使用KFold对逻辑回归进行交叉验证
def Kfold_for_TrainModel(X_train_data, y_train_data):
    """Choose the logistic-regression ``C`` with the best 5-fold mean recall.

    For each candidate ``C`` an L1-penalised logistic regression is trained
    on four folds and scored on the remaining one. Recall and F1 are printed
    for each fold, and the mean recall over the five folds is printed for
    each candidate.

    Args:
        X_train_data: DataFrame of training features.
        y_train_data: single-column DataFrame of labels, row-aligned with
            ``X_train_data``.

    Returns:
        The candidate ``C`` with the highest mean recall (the first one on
        ties).
    """
    # Imported locally so the splitter is guaranteed to be the
    # model_selection one, whose n_splits / split() API is used below. The
    # removed cross_validation.KFold took (n, n_folds) and was iterated
    # directly.
    from sklearn.model_selection import KFold

    fold = KFold(n_splits=5, shuffle=False)
    # Candidate values for C, the inverse regularisation strength.
    c_params = [0.01, 0.1, 1, 10, 100]
    # Mean recall per candidate, in the same order as c_params.
    mean_recalls = []

    for c_param in c_params:
        print('-------------------------------------------')
        print('C参数为:', c_param)
        print('-------------------------------------------')
        print('')

        recall_list = []
        # split() yields (train positions, validation positions) per fold.
        for iteration, (train_idx, valid_idx) in enumerate(
                fold.split(X_train_data), start=1):
            # L1 penalty; liblinear is named explicitly because the current
            # default solver does not support L1.
            lr = LogisticRegression(C=c_param, penalty='l1',
                                    solver='liblinear', max_iter=10)
            # .ravel() flattens the single label column to 1-D.
            lr.fit(X_train_data.iloc[train_idx, :],
                   y_train_data.iloc[train_idx, :].values.ravel())

            # Score on the held-out fold.
            y_valid = y_train_data.iloc[valid_idx, :].values.ravel()
            y_undersample_pred = lr.predict(X_train_data.iloc[valid_idx, :])
            recall = recall_score(y_valid, y_undersample_pred)
            f1_score_value = f1_score(y_valid, y_undersample_pred)
            recall_list.append(recall)
            print('Iteration ', iteration, " 召回率为:", recall, "f1_score为", f1_score_value)

        mean_recall = np.mean(recall_list)
        print('')
        print('平均召回率为:', mean_recall)
        print('')
        mean_recalls.append(mean_recall)

    # argmax returns the first maximum, so ties go to the smaller C.
    best_c_param = c_params[int(np.argmax(mean_recalls))]
    return best_c_param
5.调用过程(未使用的方法已注释掉)
# Data preparation: load and merge the benign and attack captures.
result = dataPrepare()
# Pre-processing: full-data and under-sampled train/test splits.
X_train, X_test, y_train, y_test, X_train_under_sample, X_test_under_sample, y_train_under_sample, y_test_under_sample = dataPreprocessing(result)

# --- Disabled experiment: L1 logistic regression scored on the full test split ---
# # Model
# # best_c_param = Kfold_for_TrainModel(X_train_under_sample, y_train_under_sample)
# best_c_param=10
# # Evaluation
# lr = LogisticRegression(C = best_c_param, penalty = 'l1',max_iter=5)
# lr.fit(X_train_under_sample, y_train_under_sample.values.ravel())
#
# # Predict on the test set
# y_undersample_pred = lr.predict(X_test.values)
# # Build the confusion matrix
# conf_matrix = confusion_matrix(y_test,y_undersample_pred)
#
# np.set_printoptions(precision=2)
# class_names = [0,1]
#
# plot_confusion_matrix(conf_matrix
#                       , classes=class_names)

# --- Active experiment: random forest on the under-sampled split ---
# Model
# best_c_param = Kfold_for_TrainModel(X_train_under_sample, y_train_under_sample)
best_c_param = 10  # not read by the random forest below; only the logistic-regression variant uses it
# Evaluation. The variable keeps its historical name `lr`, although it now
# holds a random forest.
lr = RandomForestClassifier()
lr.fit(X_train_under_sample, y_train_under_sample.values.ravel())
# Predict on the under-sampled test set. The DataFrame is passed as-is so
# the input has the same form the model was fitted on.
y_undersample_pred = lr.predict(X_test_under_sample)
# Build the confusion matrix.
conf_matrix = confusion_matrix(y_test_under_sample, y_undersample_pred)
np.set_printoptions(precision=2)
class_names = [0, 1]
plot_confusion_matrix(conf_matrix, classes=class_names)

# --- Disabled experiment: Isolation Forest ---
# #isolationForest
# rng = np.random.RandomState(42)
# method=''
# clf = IsolationForest(max_samples="auto", random_state=rng)
# clf.fit(X_train_under_sample)
# pred_y = clf.predict(X_train_under_sample)
# # predict() gives -1 for anomalies and 1 for normal rows, so 1 is mapped
# # to 0 and -1 to 1 (anomaly) before the matrix is drawn
# print(pred_y)
# pred_y = [0 if x==1 else x for x in pred_y]
# pred_y = [1 if x==-1 else x for x in pred_y]
# print(pred_y)
# # Build the confusion matrix
# conf_matrix = confusion_matrix(y_train_under_sample,pred_y)
#
# np.set_printoptions(precision=2)
# class_names = [0,1]
#
# plot_confusion_matrix(conf_matrix
#                       , classes=class_names)
# #end

# --- Disabled experiment: One-Class SVM ---
# #oneSVM
# rng = np.random.RandomState(42)
# method=''
# clf = svm.OneClassSVM(nu=0.02, kernel="rbf",
#                       gamma=0.1)
# clf.fit(X_train_under_sample)
# pred_y = clf.predict(X_train_under_sample)
# # predict() gives -1 for anomalies and 1 for normal rows, so 1 is mapped
# # to 0 and -1 to 1 (anomaly) before the matrix is drawn
# print(pred_y)
# pred_y = [0 if x==1 else x for x in pred_y]
# pred_y = [1 if x==-1 else x for x in pred_y]
# print(pred_y)
# # Build the confusion matrix
# conf_matrix = confusion_matrix(y_train_under_sample,pred_y)
#
# np.set_printoptions(precision=2)
# class_names = [0,1]
#
# plot_confusion_matrix(conf_matrix
#                       , classes=class_names)
# #end
# Test on real data (nothing follows this note in the original)
结果:随机森林效果最好
github 代码:https://github.com/Hx96/N-BaIoT-Network-based-Detection-of-IoT-Botnet-Attacks-Using-Logistic-regression
文章最后发布于: 2019-05-02 17:28:27
相关阅读
浅谈setInterval(aa,1000)与setInterval(aa(),1000)的
一直有个疑惑,在定时器上调用某个方法时,加括号和不加括号有什么区别。今天做了个实验,发现,不加括号定时器会每秒执行一次,加了括号只
图片来源图虫:已授站长之家使用声明:本文来自于微信公众号运营研究社公众号(ID:U_quan),作者:陈维贤,授权站长之家转载发布。文章整理自
与E1000E和E1000相比,VMXNET3的网络性能更好。本文将解释虚拟网络适配器和第2部分之间的区别,并将演示通过选择半虚拟化适配器可以
我们生活中所说的真无线耳机其实就是两个“耳塞”,主体没有任何的可见线材,索尼真无线蓝牙降噪耳机WF-1000XM3就是如此
2018家用显示器多大好 6款1000左右23-27英寸显示器推
现在千元左右就能买到2K显示器了,但对于家庭用户来说,2K分辨率还是太超前了。一方面,千元左右的2K产品多是23英寸产品,点距太小用着会