# -*- coding: utf-8 -*-
__author__ = "zlingh"
__date__ = "Date: 2014/01/28"
import re
import urllib,urllib2
#urllib:
#urllib2: The urllib2 module defines functions and classes which help in opening
#URLs (mostly HTTP) in a complex world — basic and digest authentication,
#redirections, COOKIEs and more.
def translate(text):
    """Translate *text* by scraping the Google Translate web page.

    Imitates a browser: the text is POSTed to the Google Translate page and
    the translation is extracted from the returned HTML.

    Args:
        text: The sentence to translate. The request uses the language pair
            ``zh-CN|en`` (Simplified Chinese to English).

    Returns:
        The text found after ``TRANSLATED_TEXT=`` in the returned page, with
        the terminating semicolon removed. Any quote characters around the
        value are left in place for the caller to strip.

    Raises:
        ValueError: If the returned page contains no ``TRANSLATED_TEXT=``
            marker.
    """
    values = {
        'hl': 'en',
        'ie': 'UTF-8',
        'text': text,
        # Source|target pair; swap the two codes to translate en -> zh-CN.
        'langpair': "'zh-CN'|'en'",
    }
    url = 'http://translate.google.cn/translate_t'
    data = urllib.urlencode(values)
    req = urllib2.Request(url, data)
    # Send a browser User-Agent so the request imitates a real browser.
    browser = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)'
    req.add_header('User-Agent', browser)
    # Send the request and read the whole page; always release the
    # connection, even if reading fails.
    response = urllib2.urlopen(req)
    try:
        html = response.read()
    finally:
        response.close()
    # The translation is whatever follows 'TRANSLATED_TEXT=' up to the next
    # ';'. The look-behind keeps the marker out of the match and '.*?' stops
    # at the first semicolon (non-greedy).
    p = re.compile(r"(?<=TRANSLATED_TEXT=).*?;")
    m = p.search(html)
    if m is None:
        raise ValueError("TRANSLATED_TEXT marker not found in the response page")
    return m.group(0).strip(';')
if __name__ == "__main__":
    # Demo: translate a few Chinese phrases and print source and result.
    # The source text could be read from a file instead,
    # e.g. open('c:\\text.txt', 'r').read().
    text_1 = ','.join(['北京天安门', '故宫', '长城', '社会主义'])
    print('%s' % text_1.decode('utf8'))
    # translate() leaves any quote characters around the value; strip them.
    text_2 = translate(text_1).strip("'")
    print('%s' % text_2.decode('utf8'))
二:利用xpath提取
在批量抓取网页内容时,我经常采用的做法是:1、得到目标内容在网页中的位置,即xpath路径;2、批量下载网页,然后利用xpath,取出每个网页中所需要的内容。
在这里,我们利用python模块lxml。
以谷歌翻译为例,我要批量抓取翻译内容,那么首先我要知道译文的xpath,代码如下:
import urllib,urllib2
import lxml
import lxml.html as HTML
import lxml.etree as etree
# Exploration script: request a translation and dump the XPath and text of
# every element in the returned page, so the XPath of the element holding the
# translation can be identified by eye.
lin = 'en'
lout = 'zh-CN'
text = 'my apple 123'
values = {'hl': 'zh-CN', 'ie': 'UTF-8', 'text': text, 'sl': lin, 'tl': lout}
url = 'http://translate.google.cn/translate_t'
data = urllib.urlencode(values)
req = urllib2.Request(url, data)
req.add_header('User-Agent', "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)")
response = urllib2.urlopen(req, timeout=10)
try:
    shtml = response.read()
finally:
    # Release the connection even if reading fails.
    response.close()
hdoc = HTML.fromstring(shtml)
htree = etree.ElementTree(hdoc)
for t in hdoc.iter():
    print(htree.getpath(t))
    print(t.text_content())
    # NOTE(review): the original indentation was lost; pausing after every
    # element is assumed here. Dedent this call to pause only once at the end.
    raw_input()
运行这段代码,发现译文“我的苹果123”的xpath为“/html/body/div[2]/div[2]/div[2]/div/div/div[2]/div”。
现在可以利用xpath取出译文内容。以下方法接受英文原文,然后调用google translate,返回中文译文。代码如下:
import urllib,urllib2
import lxml
import lxml.html as HTML
import lxml.etree as etree
def g_trans(str_text):
    """Translate *str_text* from English to Simplified Chinese.

    POSTs the text to the Google Translate page and reads the translation
    from a fixed XPath location in the returned HTML.

    Args:
        str_text: The English text to translate.

    Returns:
        The text content of the first element matching the hard-coded XPath.

    Raises:
        IndexError: If no element matches the XPath.
    """
    lin = 'en'
    lout = 'zh-CN'
    values = {'hl': 'zh-CN', 'ie': 'UTF-8', 'text': str_text, 'sl': lin, 'tl': lout}
    url = 'http://translate.google.cn/translate_t'
    data = urllib.urlencode(values)
    req = urllib2.Request(url, data)
    req.add_header('User-Agent', "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)")
    response = urllib2.urlopen(req, timeout=10)
    try:
        htree = HTML.parse(response)
    finally:
        # Release the connection even if parsing fails.
        response.close()
    emts = htree.xpath('/html/body/div[2]/div[2]/div[2]/div/div/div[2]/div')
    if not emts:
        # Same exception type as indexing the empty list, but with a message
        # that points at the actual cause.
        raise IndexError("translation element not found at the expected XPath")
    return emts[0].text_content()
三:解析div标签提取结果
import urllib,urllib2
import time
from sgmllib import SGMLParser
class URLLister(SGMLParser):
def __init__(self, result):
SGMLParser.__init__(self)
self.result = result
self.open = False
def start_div(self, attrs):
id = [v for k, v in attrs if k==‘id‘]
if ‘tts_button‘ in id:
self.open = True
def handle_data(self, text):
if self.open:
self.result.append(text)
self.open = False
def Translate(text, f, t):
    """Translate *text* from language *f* to language *t*.

    POSTs the text to the Google Translate page and feeds the returned HTML
    to ``URLLister``.

    Args:
        text: The text to translate.
        f: Source language code, e.g. ``'en'``.
        t: Target language code, e.g. ``'zh-CN'``; also sent as the ``hl``
            form field.

    Returns:
        The list of text chunks collected by ``URLLister``; it is empty when
        the parser collected nothing.
    """
    sentences = []
    values = {
        'hl': '%s' % t,
        'ie': 'UTF-8',
        'text': text,
        'langpair': "%s|%s" % (f, t),
    }
    url = 'http://translate.google.cn/translate_t'
    data = urllib.urlencode(values)
    req = urllib2.Request(url, data)
    req.add_header('User-Agent', "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)")
    response = urllib2.urlopen(req)
    try:
        page = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
    parser = URLLister(sentences)
    parser.feed(page)
    parser.close()
    return sentences


def TranlateForIgnorException(text, max_retries=10, retry_delay=2):
    """Translate *text* from English to Simplified Chinese, retrying on errors.

    Args:
        text: The English text to translate.
        max_retries: Number of retries after the first failed attempt.
        retry_delay: Seconds to sleep between attempts.

    Returns:
        The first collected translation chunk, or ``None`` when every attempt
        failed (including attempts that returned no chunk at all).
    """
    failures = 0
    while True:
        try:
            return Translate(text, "en", "zh-CN")[0]
        except Exception:
            # Best effort: swallow the failure and retry, but let
            # KeyboardInterrupt and SystemExit propagate.
            failures += 1
            if failures > max_retries:
                return None
            time.sleep(retry_delay)
if __name__ == "__main__":
    # Demo: translate an English word to Simplified Chinese and print the
    # first collected chunk. TranlateForIgnorException("This") is the
    # retrying alternative.
    c = 'hello'
    a = Translate(c, 'en', 'zh-CN')
    print(a[0].decode('utf8'))
另外:如果用上面的方法不容易在返回的网页中找到译文,也可以通过 http://translate.google.cn/?hl=en#zh-CN/en/饼干 这样的网页地址来提取。
下面是网上一个很牛的项目,用起来很方便,但是把中文翻译成英文时好像有点问题,我没有调试出来:
Goslate 免费谷歌翻译
http://zhuoqiang.me/goslate-free-google-translate-api.html