获取google,百度搜索排行榜的脚本
运行后,会在当前工作目录下生成hotwords.txt.
update:
1. 原脚本分别写入2次文件,在google关键字包含¥符号时,此符号本身是gb2312的,解码utf-8没问题,再编码为gbk就出错.于是改成连接字符串,然后一次写入;
2. 正则加入group,可以简化掉一个循环.
#!/usr/bin/env python
# Fetch the Google and Baidu hot-search ranking pages, pull the keywords out
# of their search links, and write all of them to hotwords.txt (relative to
# the current working directory).
#
# NOTE: this is Python 2 code -- it relies on urllib2 and on byte strings.
import string
import urllib2
import re

_OUTPUT_FILE = 'hotwords.txt'

# One entry per page to scrape:
#   (page URL,
#    regex whose single group captures one URL-quoted keyword,
#    encoding the unquoted keyword bytes are converted from, or None to
#    write them unchanged -- Google's are presumably already UTF-8).
_SOURCES = (
    (r'http://www.google.cn/rebang/home',
     r'/search\?q=(.+?)&',
     None),
    (r'http://top.baidu.com/buzz/top_keyword.html',
     r'tn=baidutop10&wd=(.+?)"',
     'gb2312'),
    (r'http://top.baidu.com/buzz/weekhotspot.html',
     r'tn=baidutop10&wd=(.+?)"',
     'gb2312'),
)


def _fetch(url):
    """Return the raw body of *url*, closing the connection afterwards."""
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()


def _extract_keywords(doc, pattern, encoding):
    """Return every keyword matched by *pattern* in *doc* as one string.

    The captured groups are joined with '%20' before unquoting, so the
    keywords end up separated by single spaces.  When *encoding* is not
    None the unquoted bytes are decoded from it and re-encoded as UTF-8.
    """
    matches = re.findall(pattern, doc)
    keywords = urllib2.unquote('%20'.join(matches))
    if encoding is not None:
        keywords = keywords.decode(encoding).encode('utf-8')
    return keywords


def main():
    """Scrape every page in _SOURCES and write the keywords to one file.

    Pages with an empty body, or with no matching links, contribute
    nothing.  The file is written once, after all pages were processed, so
    a failed download or decode leaves no partial output behind.

    Raises:
        urllib2.URLError: if a page cannot be downloaded.
        UnicodeDecodeError: if a Baidu keyword is not valid gb2312.
    """
    # Collect the per-source strings in a list: the original accumulated
    # into a variable that was never initialised, which raised
    # UnboundLocalError on the first "+=".
    parts = []
    for url, pattern, encoding in _SOURCES:
        doc = _fetch(url)
        if not doc:
            continue
        keywords = _extract_keywords(doc, pattern, encoding)
        if keywords:
            parts.append(keywords)

    # Join the sources with the same separator used between keywords of
    # one source, so the last keyword of a page is not fused with the
    # first keyword of the next page.
    with open(_OUTPUT_FILE, 'w') as out:
        out.write(' '.join(parts))


if __name__ == '__main__':
    main()
No comments :
Post a Comment