Baidu Tieba Crawler

Published: 2016-12-07 05:47:19
This article walks through a small Baidu Tieba crawler written in Python 2. It pages through a forum's thread list, follows each thread link, crawls every page of replies with lxml, extracts the author metadata embedded in each floor's data-field JSON attribute, and writes the results to a tab-separated text file.

#!/usr/bin/env python
#coding=utf-8
import httplib2
import json
from lxml import etree

def replace(s):
    # Expand a relative thread link (/p/...) into an absolute URL.
    s = s.replace('/p/', 'http://tieba.baidu.com/p/')
    return s

def openhttp(url):
    # Fetch a page with httplib2, caching responses in the .cache directory.
    h2 = httplib2.Http('.cache')
    (resp2, html) = h2.request(url, 'GET')
    return html

def store_file(reply_sum):
    # Write one thread: the topic on its own line, then one
    # tab-separated line (floor, id, name, content, time) per floor.
    filehandle.write(reply_sum['topic'])
    filehandle.write('\n')
    try:
        for i in reply_sum['every_floor']:
            filehandle.write('%s\t%s\t%s\t%s\t%s\n' % (
                i['floor'], i['id'], i['name'], i['content'], i['time']))
    except Exception:
        pass  # skip floors that are missing expected fields

def parse_link(topic, link):
    # Crawl every page of one thread, e.g. http://tieba.baidu.com/p/2259628273?pn=2
    original_link = link
    sub_html = openhttp(link)
    now_page = etree.HTML(sub_html.decode('gbk'))
    # Page count shown next to the reply counter at the top of the thread.
    total_page = int(now_page.xpath(u'//*[@class="l_reply_num"]/span')[0].text)
    print "total pages: %d" % total_page
    print "topic: %s" % topic.encode('utf8')
    floor = 0
    reply_list = []
    reply_sum = {'topic': topic.encode('utf8')}
    n = 1
    while total_page >= n:
        link = original_link + '?pn=' + str(n)
        print "fetching: %s" % link
        sub_html = openhttp(link)
        now_page = etree.HTML(sub_html.decode('gbk'))
        # Every floor is a div whose class is "l_post " or "l_post noborder".
        replies = now_page.xpath(u'//*[@class="l_post "]|//*[@class="l_post noborder"]')
        for reply in replies:
            reply_dict = {}
            try:
                contents = reply.xpath(u'descendant::div[@class="d_post_content j_d_post_content"]')
                # Author and date metadata is embedded as JSON in the data-field attribute.
                author_data = json.loads(reply.attrib['data-field'])
                author_id = author_data["author"]["id"]
                author_name = author_data["author"]["name"]
                author_time = author_data["content"]["date"]
                for content in contents:
                    reply_dict['floor'] = floor + 1
                    reply_dict['id'] = author_id
                    reply_dict['name'] = author_name.encode('utf8')
                    reply_dict['content'] = content.text.encode('utf8')
                    reply_dict['time'] = author_time.encode('utf8')
            except Exception:
                pass  # floor without the expected structure (e.g. an ad block)
            reply_list.append(reply_dict)
            floor = floor + 1
        n = n + 1
    reply_sum['every_floor'] = reply_list
    store_file(reply_sum)

def main():
    pn = 0
    # Only the first list page is crawled (pn < 50); raise the bound to crawl more.
    while pn < 50:
        # Forum list page: kw is the forum keyword (GBK percent-encoded),
        # and pn pages through the list 50 threads at a time.
        url = 'http://tieba.baidu.com/f?kw=%B0%CD%C0%E5%B5%BA&pn=' + str(pn)
        print url
        main_html = openhttp(url)
        page = etree.HTML(main_html.decode('gbk'))
        # Thread title links on the list page.
        p = page.xpath(u'//a[@target="_blank"][@class="j_th_tit"]')
        for h in p:
            l = h.values()
            link = replace(l[0])   # first attribute: href
            topic = l[1]           # second attribute: title
            parse_link(topic, link)
            print topic.encode('utf8')
        pn = pn + 50

if __name__ == '__main__':
    filehandle = open('aaaaaaaaa.txt', 'w')
    main()
    filehandle.close()
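The heart of the script is the per-floor parsing: each reply div carries a data-field attribute whose value is a JSON blob holding the author and post date, while the visible text sits in a nested d_post_content div. The sketch below isolates that step against a hypothetical HTML fragment; the class names and JSON shape are taken from the script above, and real Tieba markup may have changed since, so treat it as an illustration rather than a guaranteed match for live pages.

# -*- coding: utf-8 -*-
# Minimal sketch of the data-field parsing step, run against a made-up fragment.
import json
from lxml import etree

SAMPLE = '''
<div class="l_post noborder"
     data-field='{"author": {"id": 123, "name": "alice"},
                  "content": {"date": "2016-12-07 05:47"}}'>
  <div class="d_post_content j_d_post_content">hello from floor one</div>
</div>
'''

page = etree.HTML(SAMPLE)
# Same two XPath patterns the crawler uses to find floors.
posts = page.xpath(u'//*[@class="l_post "]|//*[@class="l_post noborder"]')
for post in posts:
    meta = json.loads(post.attrib['data-field'])   # per-floor JSON metadata
    body = post.xpath(u'descendant::div[@class="d_post_content j_d_post_content"]')
    print meta['author']['name'], meta['content']['date'], body[0].text

Running it prints "alice 2016-12-07 05:47 hello from floor one", which is exactly the tuple the crawler stores per floor.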
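One non-obvious detail is the kw parameter in the list URL: it is the forum keyword percent-encoded as GBK rather than UTF-8, which is also why every fetched page is decoded with .decode('gbk'). A quick Python 2 check of this (the hard-coded keyword appears to decode to 巴厘岛, i.e. the Bali forum):

# -*- coding: utf-8 -*-
import urllib

raw = urllib.unquote('%B0%CD%C0%E5%B5%BA')    # GBK-encoded bytes
print raw.decode('gbk').encode('utf8')        # -> 巴厘岛
# Round-trip: encode a unicode keyword the way the list URL expects.
print urllib.quote(u'巴厘岛'.encode('gbk'))   # -> %B0%CD%C0%E5%B5%BA

To point the crawler at a different forum, substitute the output of the second line into the kw parameter.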
