脚本背景:

我所在的公司是运营CDN业务的IDC公司,客户域名的流量图经常会出现毛刺,但是服务的域名非常多,每天挨个查看流量图耗时耗力。因此我用python写了个脚本,可以自动检测rrd文件里的异常数值并发送报警邮件。


由于我们的rrd文件是以服务域名命名的,所以先在相应的API上获取服务域名,然后根据域名扫描rrd文件。我设的是扫描半小时的数值,每10分钟执行一次,大概有2000来个rrd文件,执行一次6、7秒左右。


代码如下:


#!/usr/bin/env python
#coding:utf-8
from pyrrd.graph import DEF,CDEF,AREA
from pyrrd.graph import Graph
from pyrrd.graph import ColorAttributes
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.p_w_picpath import MIMEImage
from datetime import datetime
import calendar
import os
import time
import urllib2
import smtplib
import email
import sys
def graphrrd(files):
    """Render a 12-hour traffic graph for one RRD file.

    Plots the ``RX`` and ``TX`` data sources of *files*, each multiplied by
    0.026, as two areas and writes the image to the path held in the
    module-level ``p_w_picpath_dir``.

    Args:
        files: Path of the ``.rrd`` file to plot. The graph title is the
            file name without its directory and extension.
    """
    # Current time as a UTC epoch; the graph covers the 43200 s (12 h) before it.
    now_utc = calendar.timegm(datetime.utcnow().utctimetuple())

    def1 = DEF(rrdfile=files, vname='back', dsName='RX')
    def2 = DEF(rrdfile=files, vname='CDN', dsName='TX')
    cdef1 = CDEF(vname='back_flow', rpn='%s,0.026,*' % def1.vname)
    cdef2 = CDEF(vname='CDN_flow', rpn='%s,0.026,*' % def2.vname)
    area1 = AREA(defObj=cdef1, color='#002A97FF', legend='back_flow')
    area2 = AREA(defObj=cdef2, color='#00CF00FF', legend='CDN_flow')

    ca = ColorAttributes()
    ca.back = '#333333'
    ca.canvas = '#333333'
    ca.shadea = '#000000'
    ca.shadeb = '#111111'
    ca.mgrid = '#CCCCCC'
    ca.axis = '#FFFFFF'
    ca.frame = '#AAAAAA'
    ca.font = '#FFFFFF'
    ca.arrow = '#FFFFFF'

    graphfile = p_w_picpath_dir
    # Derive the title from the file name itself instead of a fixed-width
    # slice, so it stays correct if the RRD directory ever changes.
    title_url = os.path.splitext(os.path.basename(files))[0]

    # The palette above was previously built but never handed to the graph;
    # pass it so the configured colours are actually applied.
    g = Graph(graphfile, start=now_utc - 43200, end=now_utc,
              vertical_label='flow', title=title_url, color=ca)
    # area2 is listed before area1, so it is drawn first.
    g.data.extend([def1, def2, cdef1, cdef2, area2, area1])
    g.write()
def connect():
    """Open an authenticated SMTP session.

    Connects to the module-level ``smtpserver``, sends EHLO, and logs in
    with the module-level ``smtpuser`` / ``smtppass`` credentials.

    Returns:
        The logged-in ``smtplib.SMTP`` instance.
    """
    session = smtplib.SMTP(smtpserver)
    session.ehlo()
    session.login(smtpuser, smtppass)
    return session
def sendmessage(server, to, subj, content):
    """Send one alert mail with the rendered graph attached.

    Builds a ``multipart/related`` message whose first part is *content*
    (declared as UTF-8 HTML) and whose second part is the image read from
    the module-level ``p_w_picpath_dir`` path, then sends it from the
    module-level ``smtpuser`` address.

    Args:
        server: Object providing ``sendmail(from_addr, to_addr, message)``.
        to: Recipient address.
        subj: Subject line.
        content: Message body.

    A failure raised by ``server.sendmail`` is printed and swallowed so one
    bad recipient does not stop the run. Errors while reading the image
    file are not caught.
    """
    from email.utils import formatdate

    msg = MIMEMultipart('related')
    msg['Subject'] = subj
    msg['From'] = smtpuser
    msg['To'] = to
    msg['Date'] = formatdate()
    msg.attach(MIMEText(content, "html", "utf-8"))

    # Context manager: the handle is released even if reading or MIME
    # construction raises.
    with open(p_w_picpath_dir, 'rb') as fp:
        msg_image = MIMEImage(fp.read())
    # NOTE(review): the Content-ID is empty here; presumably an
    # angle-bracketed id was lost when this script was published - confirm.
    msg_image.add_header('Content-ID', '')
    msg.attach(msg_image)

    try:
        server.sendmail(smtpuser, to, msg.as_string())
    except Exception as ex:
        # Deliberately best-effort: report and carry on.
        print('%s %s' % (Exception, ex))
        print('Error - send failed')
def _fetch_recent_rows(rrd_file, n):
    """Return the last *n* lines of ``rrdtool fetch`` output, minus NaN and header lines."""
    import subprocess

    if n <= 0:
        return []
    try:
        # Argument list, no shell: the file path is never parsed by a shell.
        proc = subprocess.Popen(
            ['rrdtool', 'fetch', rrd_file, 'AVERAGE', '-s', '-1d'],
            stdout=subprocess.PIPE, universal_newlines=True)
        output = proc.communicate()[0]
    except OSError:
        # rrdtool could not be started: behave as if nothing came back.
        return []
    tail = output.splitlines(True)[-n:]
    # Same filtering the former `grep -v nan | grep -v RX` performed.
    return [line for line in tail if 'nan' not in line and 'RX' not in line]


def aver(rrd_file, n=6):
    """Load the most recent samples of *rrd_file* and return their averages.

    Fetches the last *n* lines of one day of AVERAGE data, stores every
    line longer than 25 characters in the module-level ``dict_data``
    (key: first 10 characters of the line, value: the whitespace-split
    fields after the first 12 characters), and averages the first three
    columns of everything in ``dict_data``.

    Args:
        rrd_file: Path of the RRD file to read.
        n: Number of trailing output lines to consider.

    Returns:
        ``[avg1, avg2, avg3]``, each sum divided by the number of fetched
        lines, or ``[]`` when fewer than ``n // 2`` lines were fetched or
        the second-column average is below 3500000000. Both ``[]`` cases
        are written to the log.
    """
    data = _fetch_recent_rows(rrd_file, n)
    # `not data` also protects the divisions below when n // 2 == 0.
    if not data or len(data) < n // 2:
        log("[ERRORS: %s] has not enough record ! please check it!!\n" % rrd_file)
        return []

    for line in data:
        if len(line) > 25:
            dict_data[line[:10]] = line.strip()[12:].split()

    totals = [0.0, 0.0, 0.0]
    for row in dict_data.values():
        try:
            # Convert the whole row first so a half-parsed row cannot
            # contribute to only some of the sums.
            values = [float(row[0]), float(row[1]), float(row[2])]
        except (ValueError, IndexError):
            log('%s %s\n' % (rrd_file, row))
            continue
        for idx, value in enumerate(values):
            totals[idx] += value

    count = len(data)
    if totals[1] / count < 3500000000:
        log('WARNING: %s was less then 200M\n' % rrd_file)
        return []
    return [total / count for total in totals]
def check(average):
    """Return the keys of ``dict_data`` whose second column is an outlier.

    Args:
        average: Reference value; a row is an outlier when its second
            column divided by *average* is greater than 1.6.

    Returns:
        List of the matching keys of the module-level ``dict_data``.
    """
    wrong_t = []
    for key, row in dict_data.items():
        try:
            ratio = float(row[1]) / average
        except (ValueError, IndexError):
            # aver() logs rows it cannot parse but leaves them in the
            # table; skip them here instead of crashing the whole run.
            continue
        if ratio > 1.6:
            wrong_t.append(key)
    return wrong_t


def update(rrd_file, t, aver1, aver2, aver3):
    """Record one anomalous sample and append its report to the mail body.

    Formats the ``dict_data[t]`` row into a report section, appends that
    section to the module-level ``text``, and writes a one-line summary
    through ``write_error``.

    Args:
        rrd_file: Path of the RRD file the sample came from.
        t: Key into ``dict_data``; interpreted as an epoch timestamp in
            seconds when rendering the human-readable time.
        aver1, aver2, aver3: Accepted for interface compatibility; unused.
    """
    global text

    # Format the timestamp in-process rather than spawning a shell to run
    # `date`; the result looks like "Thu Apr 25 17:30:00 CST 2013".
    try:
        errors_time = time.strftime('%a %b %d %H:%M:%S %Z %Y',
                                    time.localtime(int(t)))
    except ValueError:
        errors_time = ''

    row = dict_data[t]
    # File name without directory and extension (replaces a fixed [23:-4]
    # slice that only worked for one directory prefix).
    domain = os.path.splitext(os.path.basename(rrd_file))[0]

    content = (
        '\n'
        '\n'
        '%s 异常信息:\n'
        '    域名:        %s\n'
        '    时间:        %s\n'
        '    流量值:     回源带宽: %.2fM , cdn带宽 : %dM\n'
        '                \n'
        'rrd 异常信息:\n'
        '    路径:        %s\n'
        '    UTC 时间:    %s\n'
        '    异常值:     [%s], [%s], [%s]\n'
        '\n'
    ) % (
        domain,
        domain,
        errors_time,
        float(row[0]) * 8 / 300000000,
        int(float(row[1]) * 8 / 300000000),
        rrd_file,
        t,
        row[0],
        row[1],
        row[2],
    )
    write_error('[ %s ]: at[ %s(%s) ],the value was [%s] [%s] [%s] \n'
                % (rrd_file, errors_time, t, row[0], row[1], row[2]))
    text = text + content
def log(log_write):
    """Append *log_write* to ``rrd_alt1.log`` in the module-level ``rrd_bak`` directory.

    Args:
        log_write: Text to append; written as-is, no newline is added.
    """
    # Context manager: the file is closed even when the write fails.
    with open('%s/rrd_alt1.log' % rrd_bak, 'a') as f:
        f.write(log_write)
def write_error(log_write):
    """Append *log_write* to ``rrd_error1.log`` in the module-level ``rrd_bak`` directory.

    Args:
        log_write: Text to append; written as-is, no newline is added.
    """
    # The previous version referenced `f.close` without calling it, so the
    # file was never explicitly closed; the context manager fixes that.
    with open('%s/rrd_error1.log' % rrd_bak, 'a') as f:
        f.write(log_write)


def run_script(rrd_file):
    """Check one RRD file and mail an alert if it contains outliers.

    Computes the averages with ``aver``, finds outlier timestamps with
    ``check``, records each one with ``update``, renders the graph with
    ``graphrrd`` and, when the module-level ``text`` is non-empty, mails it
    to every address in the module-level ``to_all``.

    Args:
        rrd_file: Path of the RRD file to inspect.
    """
    global to_all
    global text

    aver_rrd = aver(rrd_file)
    if not aver_rrd:
        return

    wrong_time = check(aver_rrd[1])
    if not wrong_time:
        log('[%s] no errors !\n' % (rrd_file))
        return

    # Sorted so the report lists anomalies in a deterministic order rather
    # than in dict iteration order.
    for t in sorted(wrong_time):
        update(rrd_file, t, aver_rrd[0], aver_rrd[1], aver_rrd[2])

    # NOTE(review): the original line structure was lost; the graph is
    # assumed to be rendered once per file, after the loop - confirm.
    graphrrd(rrd_file)

    if text:
        for to in to_all:
            server = connect()
            try:
                sendmessage(server, to, subj, text)
            finally:
                # Each recipient gets its own connection; release it so
                # sessions do not pile up on the mail server.
                try:
                    server.quit()
                except smtplib.SMTPException:
                    server.close()
            log('sendmail to %s\n' % to)
if __name__ == '__main__':
    # Runtime configuration. The functions of this script read these names
    # as module-level globals.
    p_w_picpath_time = time.strftime("%d-%H-%M")
    rrd_dir = '/data/rrd/db/1/billing'
    rrd_bak = '/data/rrd/db/1/billing/bak'
    smtpserver = 'xxx'
    p_w_picpath_dir = '%s/rrdgraph_%s.png' % (rrd_bak, p_w_picpath_time)
    smtpuser = 'xxx'
    smtppass = 'yyy'
    to_all = ['xxx', 'yyy']
    subj = 'check the flow of CDN!!!!'

    # Single pass: fetch the name list, map each entry to an RRD path,
    # then examine every file that exists.
    local_time = time.strftime("%m-%d %H:%M:%S")
    url_list = ["%s/%s.rrd" % (rrd_dir, name.strip())
                for name in urllib2.urlopen('xxx').readlines()]

    log("-" * 60 + "\n")
    log("the script run time at %s \n" % local_time)

    while url_list:
        # Fresh per-file state shared with the checking functions.
        text = ''
        dict_data = {}
        rrd_file = url_list.pop()
        if os.path.exists(rrd_file):
            run_script(rrd_file)

    log("-" * 60 + "\n")

邮件截图

173433350.jpg