def get_data_in_a_file(original_path, save_path='all_email.txt'):
    """Collect every email found under ``original_path`` into one text file.

    The directory tree is walked recursively. Each regular file is read as
    GBK text (bytes that cannot be decoded are dropped), every line is passed
    through ``clean_str``, the cleaned text is tokenised with ``jieba.cut``
    and the non-blank tokens are appended to ``save_path`` as a single
    space-separated line per email.

    Args:
        original_path: Directory holding the raw email files, possibly nested
            in sub-directories.
        save_path: Output file. It is opened in append mode, so lines written
            by an earlier run are kept.

    Returns:
        None. The result is the content appended to ``save_path``.
    """
    # os.listdir() yields entries in arbitrary order; sorting makes the line
    # order of the output file reproducible across runs and platforms.
    # NOTE(review): downstream code appears to pair these lines positionally
    # with a separate label file -- confirm that file uses the same order.
    for name in sorted(os.listdir(original_path)):
        path = os.path.join(original_path, name)
        if os.path.isdir(path):
            get_data_in_a_file(path, save_path=save_path)
            continue

        # errors='ignore' drops bytes that are not valid GBK instead of raising.
        with open(path, 'r', encoding='gbk', errors='ignore') as src:
            text = ''.join(clean_str(line) for line in src)

        tokens = [word for word in jieba.cut(text) if word.strip() != '']
        with open(save_path, 'a', encoding='utf8') as dst:
            dst.write(' '.join(tokens) + '\n')
```python
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
import numpy as np
if __name__ == '__main__': np.random.seed(1) x, vectoring = get_data_tf_idf('all_email.txt') y = get_label_list('label.txt') index = np.arange(len(y)) np.random.shuffle(index) x = x[index] y = y[index] x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2) clf = LogisticRegression() clf.fit(x_train, y_train) y_pred = clf.predict(x_test) print('classification_report\n', metrics.classification_report(y_test, y_pred, digits=4)) print('Accuracy:', metrics.accuracy_score(y_test, y_pred)) ```
### 7. 综合测试结果
我们对 2000 条数据进行了测试，结果显示分类精度较高。然而，由于数据量较小，难以全面评估模型性能。
### 8. 其他模型方法
除了传统的机器学习方法，还可以构建深度学习模型。以下是 LSTM 模型的一个例子：
```python
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
def get_embedding_vectors(tokenizer, dim=100):
    """Build an embedding matrix for the tokenizer's vocabulary from GloVe.

    Reads ``data/glove.6B.{dim}d.txt``, treating the first whitespace-separated
    field of each line as the word and the remaining fields as its float32
    vector. The vector of every word present in ``tokenizer.word_index`` is
    then copied into the matrix row given by that word's index.

    Args:
        tokenizer: Object exposing a ``word_index`` mapping of word -> integer
            row index (presumably a fitted Keras ``Tokenizer`` -- confirm).
        dim: Vector dimensionality; it also selects which GloVe file is read.

    Returns:
        A NumPy array of shape ``(len(word_index) + 1, dim)``. Rows that no
        GloVe word was written to remain all-zero.
    """
    glove_vectors = {}
    with open(f'data/glove.6B.{dim}d.txt', encoding='utf8') as glove_file:
        for row in tqdm.tqdm(glove_file, 'Reading GloVe'):
            parts = row.split()
            glove_vectors[parts[0]] = np.asarray(parts[1:], dtype='float32')

    vocabulary = tokenizer.word_index
    # One extra row because the matrix is indexed directly by word index.
    matrix = np.zeros((len(vocabulary) + 1, dim))
    for token, position in vocabulary.items():
        vector = glove_vectors.get(token)
        if vector is None:
            # Word not covered by GloVe: leave its row as zeros.
            continue
        matrix[position] = vector
    return matrix