liblinear是libsvm的线性核的改进版本,专门适用于百万数据量的分类。正好适用于我这次数据挖掘的实验。
liblinear的用法和libsvm很相似,我使用的是.exe文件,利用python的subprocess向控制台发送命令即可完成本次实验。
其中核心的两条命令是:
train train.txt
predict test.txt train.txt.model output.txt
由于是线性核,没有设置参数c、g
对于50W篇文章模型训练仅需340秒,50W篇文章的预测仅需6秒
"""Train a liblinear model, predict the test set, and report metrics.

The script shells out to the liblinear ``train`` and ``predict`` executables,
then compares the predicted labels in ``output.txt`` with the true labels in
``test.txt`` to compute accuracy, a confusion matrix, and per-class
precision / recall / F-measure.  The results are printed and also written to
``./finalresult.txt``.
"""
from subprocess import *
import time

# Shell commands sent to the console (liblinear executables).
TRAIN_CMD = "train train.txt"
PREDICT_CMD = "predict test.txt train.txt.model output.txt"

TEST_FILENAME = "test.txt"
PREDICT_FILENAME = "output.txt"
FINAL_RESULT_FILENAME = "./finalresult.txt"

# Numeric class label -> category name.
num_to_cate = {0:"it",1:"体育",2:"军事",3:"金融",4:"健康",5:"汽车",6:"房产",7:"文化",8:"教育",9:"娱乐"}

# Category names in label order; used as the keys of every result dict.
class_label = ["it","体育","军事","金融","健康","汽车","房产","文化","教育","娱乐"]


def run_command(cmd, quiet=False):
    """Run *cmd* through the shell and wait for it to finish.

    Args:
        cmd: Command line string; it is a fixed constant of this script,
            never user input, so ``shell=True`` is acceptable here.
        quiet: When true, the child's stdout is captured and discarded
            instead of being shown on the console.

    Raises:
        RuntimeError: If the command exits with a non-zero status, so the
            statistics are never computed from a stale or missing output.
    """
    process = Popen(cmd, shell=True, stdout=PIPE if quiet else None)
    process.communicate()
    if process.returncode != 0:
        raise RuntimeError(
            "command %r failed with exit code %s" % (cmd, process.returncode)
        )


def read_true_labels(filename):
    """Return the integer label that starts each non-blank line of *filename*.

    The label is the first whitespace-separated field, so multi-digit and
    signed labels are read correctly; blank lines are skipped.
    """
    labels = []
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            fields = line.split(None, 1)
            if fields:
                labels.append(int(fields[0]))
    return labels


def read_predicted_labels(filename):
    """Return every whitespace-separated token of *filename* as an int."""
    with open(filename, "r", encoding="utf-8") as f:
        return [int(token) for token in f.read().split()]


def safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is 0."""
    return numerator / denominator if denominator else 0.0


def evaluate(real_class, predict_class):
    """Compute accuracy, confusion matrix and per-class metrics.

    Args:
        real_class: True integer labels, one per sample.
        predict_class: Predicted integer labels, aligned with *real_class*.

    Returns:
        A dict with the keys ``"accuracy"`` (percentage), ``"mat"``
        (``mat[real_name][predicted_name]`` sample counts), ``"precision"``,
        ``"recall"`` and ``"F"`` (each a dict keyed by category name).

    Raises:
        ValueError: If the two label lists differ in length.
        KeyError: If a label is not a key of ``num_to_cate``.
    """
    if len(real_class) != len(predict_class):
        raise ValueError(
            "label count mismatch: %d true labels vs %d predictions"
            % (len(real_class), len(predict_class))
        )

    # One independent row per real category (no shared inner dict).
    mat = {name: dict.fromkeys(class_label, 0) for name in class_label}
    predicted_count = dict.fromkeys(class_label, 0)  # samples predicted as the class
    real_count = dict.fromkeys(class_label, 0)       # samples truly in the class
    true_count = dict.fromkeys(class_label, 0)       # correctly predicted samples

    for real, predict in zip(real_class, predict_class):
        real_name = num_to_cate[real]
        predict_name = num_to_cate[predict]
        mat[real_name][predict_name] += 1
        predicted_count[predict_name] += 1
        real_count[real_name] += 1
        if real == predict:
            true_count[predict_name] += 1

    accuracy = safe_ratio(sum(true_count.values()), len(real_class)) * 100

    # Counters start at 0 (not 1.0), so the ratios are exact; classes that
    # never occur get 0.0 instead of raising ZeroDivisionError.
    precision = {
        name: safe_ratio(true_count[name], predicted_count[name])
        for name in class_label
    }
    recall = {
        name: safe_ratio(true_count[name], real_count[name])
        for name in class_label
    }
    f_measure = {
        name: safe_ratio(
            2 * recall[name] * precision[name],
            precision[name] + recall[name],
        )
        for name in class_label
    }

    return {
        "accuracy": accuracy,
        "mat": mat,
        "precision": precision,
        "recall": recall,
        "F": f_measure,
    }


def save_results(filename, results):
    """Write the confusion matrix and all metrics in *results* to *filename*."""
    with open(filename, "w", encoding="utf-8") as f:
        for k, v in results["mat"].items():
            f.write(k + ":" + str(v) + "\n")

        f.write("\n")
        f.write("正确率为" + str(results["accuracy"]) + "%" + "\n\n")
        f.write("精确率为" + str(results["precision"]) + "\n\n")
        f.write("召回率为" + str(results["recall"]) + "\n\n")
        f.write("F测度为" + str(results["F"]) + "\n\n")


def main():
    """Run training, prediction, evaluation and result saving in order."""
    # Training: liblinear's own console output is discarded.
    start_time = time.time()
    print("训练")
    run_command(TRAIN_CMD, quiet=True)
    print("训练结束",str(time.time() - start_time))

    # Prediction: the predict tool's console output is left visible.
    start_time = time.time()
    print("预测")
    run_command(PREDICT_CMD)
    print("预测结束",str(time.time() - start_time))

    # Statistics: compare true labels of the test set with the predictions.
    start_time = time.time()
    print("统计")
    real_class = read_true_labels(TEST_FILENAME)
    predict_class = read_predicted_labels(PREDICT_FILENAME)
    results = evaluate(real_class, predict_class)
    print("正确率 为", str(results["accuracy"]) + "%")

    # Print the confusion matrix, one real category per line.
    for k, v in results["mat"].items():
        print(k + ":" + str(v))

    print("统计结束",str(time.time() - start_time))
    print("精确率为",str(results["precision"]))
    print("召回率为",str(results["recall"]))
    print("F测度为",str(results["F"]))

    print("保存结果")
    save_results(FINAL_RESULT_FILENAME, results)
    print("保存结果结束")


if __name__ == "__main__":
    main()


# Alternate category mapping left commented out by the original author
# (presumably for a different label set -- confirm before using):
# cate_to_num = {"it":0,"体育":1,"军事":2,"华人":3,"国内":4,"国际":5,"房产":6,"文娱":7,"社会":8,"财经":9}
# num_to_cate = {0:"it",1:"体育",2:"军事",3:"华人",4:"国内",5:"国际",6:"房产",7:"文娱",8:"社会",9:"财经"}