from scipy import stats
# Visualize the fitted Gaussian density together with the raw data.
# NOTE(review): `mu`, `cov` and `X` are not defined in this section; presumably
# they are the mean/covariance estimated from X earlier in the file - confirm.

# Dense evaluation grid; `pos` stacks the x and y grids along a third axis so
# every entry is one (x, y) coordinate pair.
x, y = np.mgrid[2.5:25:.01, 2.5:25:.01]
pos = np.dstack((x, y))
rv = stats.multivariate_normal(mu, cov)

fig, ax = plt.subplots()

# plot probability density
ax.contourf(x, y, rv.pdf(pos), cmap='Blues')

# plot original data points
# x/y are passed by keyword: seaborn >= 0.12 only accepts `data` positionally,
# so the former positional form raises TypeError there.
sns.regplot(
    x='Latency',
    y='Throughput',
    data=pd.DataFrame(X, columns=['Latency', 'Throughput']),
    fit_reg=False,
    ax=ax,
    scatter_kws={"s": 15, "alpha": 0.5},
)
plt.show()
2.3 Select the threshold ϵ
Use the training set X to fit the multivariate Gaussian model.
Use the cross-validation set (Xval, yval) to find the best ϵ, i.e. the one that yields the highest F-score.
from sklearn.metrics import f1_score, classification_report
def select_threshold(X, Xval, yval):
    """Find the best threshold epsilon using the cross-validation data.

    A multivariate normal is fitted to ``X`` (column means and ``np.cov``),
    the density of every row of ``Xval`` is evaluated under it, and 10000
    evenly spaced candidate thresholds between the smallest and largest of
    those densities are scored. A point is predicted as an anomaly (label 1)
    when its density is <= the candidate threshold.

    Args:
        X: training data used to fit the Gaussian model.
        Xval: cross-validation data.
        yval: ground-truth labels for ``Xval``, compared against the 0/1
            predictions with ``f1_score``.

    Returns:
        e: best epsilon, i.e. the candidate with the highest F-score
            (the first one if several tie, per ``np.argmax``).
        f-score: the F-score achieved by that epsilon.
    """
    # use the training data to create the multivariate model
    mu = X.mean(axis=0)
    cov = np.cov(X.T)
    multi_normal = stats.multivariate_normal(mu, cov)

    # use the CV data for fine-tuning the hyperparameter
    pval = multi_normal.pdf(Xval)

    # set up epsilon candidates
    epsilon = np.linspace(np.min(pval), np.max(pval), num=10000)

    # calculate the F-score of each candidate threshold
    fs = [f1_score(yval, (pval <= eps).astype('int')) for eps in epsilon]

    # find the best F-score
    argmax_fs = np.argmax(fs)
    return epsilon[argmax_fs], fs[argmax_fs]
# Pick the threshold on the cross-validation set: `e` is the selected epsilon
# and `fs` is the F-score that epsilon achieved.
e, fs = select_threshold(X, Xval, yval)
def predict(X, Xval, e, Xtest, ytest):
    """Predict anomalies in Xtest with the optimal epsilon and a model fitted on X and Xval.

    The training and cross-validation sets are concatenated row-wise and a
    multivariate normal is fitted to the combined data. A test point is
    predicted as an anomaly (label 1) when its density is <= ``e``. A
    classification report comparing ``ytest`` with the predictions is
    printed as a side effect.

    Args:
        X: training data.
        Xval: cross-validation data, stacked below ``X`` before fitting.
        e: density threshold (epsilon).
        Xtest: test data to score.
        ytest: ground-truth labels for ``Xtest``, used only for the report.

    Returns:
        multi_normal: multivariate normal model fitted on X and Xval
        y_predict: 0/1 prediction for each row of the test data
    """
    Xdata = np.concatenate((X, Xval), axis=0)
    mu = Xdata.mean(axis=0)
    cov = np.cov(Xdata.T)
    multi_normal = stats.multivariate_normal(mu, cov)

    # calculate probability of test data
    pval = multi_normal.pdf(Xtest)
    y_pred = (pval <= e).astype('int')

    print(classification_report(ytest, y_pred))

    return multi_normal, y_pred
# Refit the Gaussian on X and Xval combined and label the test points whose
# density is <= e; predict() also prints a classification report for ytest.
multi_normal, y_pred = predict(X, Xval, e, Xtest, ytest)
2.4 Visualize anomaly detection
# Plot the final model's density, the test points, and the predicted anomalies.
data = pd.DataFrame(Xtest, columns=['Latency', 'Throughput'])
data['y_pred'] = y_pred

# create a grid for graphing
x, y = np.mgrid[0:30:.01, 0:30:.01]
pos = np.dstack((x, y))

fig, ax = plt.subplots()

# plot probability density
ax.contourf(x, y, multi_normal.pdf(pos), cmap='Blues')

# plot the test data points
# x/y are passed by keyword: seaborn >= 0.12 only accepts `data` positionally,
# so the former positional form raises TypeError there.
sns.regplot(
    x='Latency',
    y='Throughput',
    data=data,
    fit_reg=False,
    ax=ax,
    scatter_kws={"s": 10, 'alpha': 0.4},
)

# mark the predicted anomalies in the test data (rows where y_pred == 1)
anamoly_data = data[data['y_pred'] == 1]
ax.scatter(anamoly_data['Latency'], anamoly_data['Throughput'], marker='x', s=50)

plt.show()