获取google,百度搜索排行榜的脚本
运行后,会在同文件夹下产生hotwords.txt.
update:
1. 原脚本分别写入2次文件,在google关键字包含¥符号时,此符号本身是gb2312的,解码utf-8没问题,再编码为gbk就出错.于是改成连接字符串,然后一次写入;
2. 正则加入group,可以简化掉一个循环.
#!/usr/bin/env python
import string
import urllib2
import re
def _fetch(url):
    """Return the raw response body of *url* as a byte string."""
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        # Release the connection even if read() raises.
        response.close()


def _extract_keywords(doc, pattern, source_encoding=None):
    """Return the keywords captured by *pattern* in *doc*, space-separated.

    The captured groups are URL-quoted; they are joined with ``%20`` and
    unquoted in one pass, which yields a space-separated string.

    If *source_encoding* is given, the unquoted bytes are decoded from that
    encoding and re-encoded as UTF-8; otherwise they are returned untouched.
    """
    keywords = re.findall(pattern, doc)
    text = urllib2.unquote('%20'.join(keywords))
    if source_encoding is not None:
        text = text.decode(source_encoding).encode('utf-8')
    return text


def main():
    """Fetch the Google and Baidu hot-keyword pages and save the keywords.

    Each page is downloaded, the keywords are pulled out of the search links
    with a regular expression, and all keywords are written in a single call
    to ``hotwords.txt`` in the current working directory as one
    space-separated line.

    Raises whatever ``urllib2.urlopen`` raises on network failure, and
    ``UnicodeDecodeError`` if a Baidu keyword is not valid GB2312.
    """
    # (url, regex whose single group captures the quoted keyword, encoding of
    # the unquoted keyword bytes or None to write them without transcoding).
    sources = (
        (r'http://www.google.cn/rebang/home',
         r'/search\?q=(.+?)&',
         None),
        (r'http://top.baidu.com/buzz/top_keyword.html',
         r'tn=baidutop10&wd=(.+?)"',
         'gb2312'),
        (r'http://top.baidu.com/buzz/weekhotspot.html',
         r'tn=baidutop10&wd=(.+?)"',
         'gb2312'),
    )

    # Download everything before parsing, so a network error aborts the run
    # before any output file is created.
    docs = [_fetch(url) for url, _pattern, _encoding in sources]

    sections = []
    for doc, (_url, pattern, encoding) in zip(docs, sources):
        if not doc:
            continue
        section = _extract_keywords(doc, pattern, encoding)
        if section:
            sections.append(section)

    # Join the per-site sections with the same separator used between
    # keywords, so the last keyword of one site is not fused with the first
    # keyword of the next.
    with open('hotwords.txt', 'w') as out:
        out.write(' '.join(sections))
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
No comments :
Post a Comment