123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147 |
import sys
import os
import re
import json
import urllib.request
import urllib.error

try:
    # Third-party charset detector; optional so the module still works
    # without it (we fall back to utf-8/gbk guessing below).
    import chardet
except ImportError:
    chardet = None


def builtwith(url, headers=None, html=None, user_agent='builtwith'):
    """Detect the technology used to build a website.

    Parameters:
        url: the URL to analyse (always matched against URL fingerprints).
        headers: optional pre-fetched response headers; if None they are
            fetched from `url`.
        html: optional pre-fetched page HTML (str); if None it is downloaded.
        user_agent: User-Agent header sent when fetching.

    Returns a dict mapping category name -> list of detected app names.
    Network errors are reported to stdout and detection continues with
    whatever data is available (best-effort).

    >>> builtwith('http://wordpress.com')
    {u'blogs': [u'PHP', u'WordPress'], u'font-scripts': [u'Google Font API'], u'web-servers': [u'Nginx'], u'Javascript-frameworks': [u'Modernizr'], u'programming-languages': [u'PHP'], u'cms': [u'WordPress']}
    >>> builtwith('http://webscraping.com')
    {u'Javascript-frameworks': [u'jQuery', u'Modernizr'], u'web-frameworks': [u'Twitter Bootstrap'], u'web-servers': [u'Nginx']}
    >>> builtwith('http://microsoft.com')
    {u'Javascript-frameworks': [u'jQuery'], u'mobile-frameworks': [u'jQuery Mobile'], u'operating-systems': [u'Windows Server'], u'web-servers': [u'IIS']}
    >>> builtwith('http://jquery.com')
    {u'cdn': [u'CloudFlare'], u'web-servers': [u'Nginx'], u'Javascript-frameworks': [u'jQuery', u'Modernizr'], u'programming-languages': [u'PHP'], u'cms': [u'WordPress'], u'blogs': [u'PHP', u'WordPress']}
    >>> builtwith('http://joomla.org')
    {u'font-scripts': [u'Google Font API'], u'miscellaneous': [u'Gravatar'], u'web-servers': [u'LiteSpeed'], u'Javascript-frameworks': [u'jQuery'], u'programming-languages': [u'PHP'], u'web-frameworks': [u'Twitter Bootstrap'], u'cms': [u'Joomla'], u'video-players': [u'YouTube']}
    """
    techs = {}

    # check URL fingerprints (no download needed)
    for app_name, app_spec in data['apps'].items():
        if 'url' in app_spec:
            if contains(url, app_spec['url']):
                add_app(techs, app_name, app_spec)

    # download whatever content the caller did not supply
    if None in (headers, html):
        try:
            request = urllib.request.Request(
                url, None, {'User-Agent': user_agent})
            if html:
                # already have HTML so just need a HEAD request for headers
                request.get_method = lambda: 'HEAD'
            response = urllib.request.urlopen(request)
            if headers is None:
                headers = response.headers
            if html is None:
                html = _decode_html(response.read())
        except Exception as e:
            # best-effort: report and continue with partial data
            print('Error:', e)
            request = None

    # check headers fingerprints
    if headers:
        for app_name, app_spec in data['apps'].items():
            if 'headers' in app_spec:
                if contains_dict(headers, app_spec['headers']):
                    add_app(techs, app_name, app_spec)

    # check html / script fingerprints
    if html:
        for app_name, app_spec in data['apps'].items():
            for key in 'html', 'script':
                snippets = app_spec.get(key, [])
                if not isinstance(snippets, list):
                    snippets = [snippets]
                for snippet in snippets:
                    if contains(html, snippet):
                        add_app(techs, app_name, app_spec)
                        break

        # check meta tags
        # XXX add proper meta data parsing
        metas = dict(re.compile(
            '<meta[^>]*?name=[\'"]([^>]*?)[\'"][^>]*?'
            'content=[\'"]([^>]*?)[\'"][^>]*?>',
            re.IGNORECASE).findall(html))
        for app_name, app_spec in data['apps'].items():
            for name, content in app_spec.get('meta', {}).items():
                if name in metas:
                    if contains(metas[name], content):
                        add_app(techs, app_name, app_spec)
                        break

    return techs


parse = builtwith


def _decode_html(raw):
    """Best-effort decode of raw HTML bytes to text.

    Tries the chardet-detected encoding (when chardet is installed), then
    utf-8, then gbk; as a last resort decodes utf-8 with replacement so
    this never raises.
    """
    detected = None
    if chardet is not None:
        detected = chardet.detect(raw).get('encoding')
    for encoding in (detected, 'utf-8', 'gbk'):
        if encoding:
            try:
                return raw.decode(encoding)
            except (LookupError, UnicodeDecodeError):
                continue
    return raw.decode('utf-8', errors='replace')


def add_app(techs, app_name, app_spec):
    """Add this app (and any apps it implies) to the detected technology."""
    for category in get_categories(app_spec):
        if category not in techs:
            techs[category] = []
        if app_name not in techs[category]:
            techs[category].append(app_name)
            implies = app_spec.get('implies', [])
            if not isinstance(implies, list):
                implies = [implies]
            # recurse into implied apps (renamed so we don't shadow app_name)
            for implied_name in implies:
                add_app(techs, implied_name, data['apps'][implied_name])


def get_categories(app_spec):
    """Return category names for this app_spec."""
    return [data['categories'][str(c_id)] for c_id in app_spec['cats']]


def contains(v, regex):
    """Remove meta data from regex then check for a regex match."""
    return re.compile(regex.split('\\;')[0], flags=re.IGNORECASE).search(v)


def contains_dict(d1, d2):
    """Return True if d1 contains (regex-matches) all items in d2."""
    for k2, v2 in d2.items():
        v1 = d1.get(k2)
        if v1:
            if not contains(v1, v2):
                return False
        else:
            return False
    return True


def load_apps(filename='apps.json.py'):
    """Load apps from Wappalyzer JSON (https://github.com/ElbertF/Wappalyzer)."""
    # get the path of this filename relative to the current script
    # XXX add support to download update
    filename = os.path.join(os.getcwd(), os.path.dirname(__file__), filename)
    with open(filename) as f:
        return json.load(f)


data = load_apps()

if __name__ == '__main__':
    urls = sys.argv[1:]
    if urls:
        for url in urls:
            results = builtwith(url)
            for result in sorted(results.items()):
                print('%s: %s' % result)
    else:
        print('Usage: %s url1 [url2 url3 ...]' % sys.argv[0])