# 百度贴吧爬虫案例 (Baidu Tieba crawler example)
#-*- coding:utf-8 -*-
# 作者:GeraTear
# 日期:2021年9月3日
# 说明: 百度贴吧爬虫
import urllib
import urllib2
def load_page(url,filename):
"""
发送请求,返回响应
"""
print "[INFO]正在爬取 %..." % filename
try:
response = urllib2.urlopen(url)
return response.read()
except:
print "[ERRoR]:%s 爬取失败" % filename
def write_page(html,filename):
print "[info] 正在保存 %s ..."% filename
with open(filename,'w') as f:
f.write(html)
def start_work(tieba_name,start_page,end_page):
base_url ="http://tieba.baidu.com/f?"
for page in range(start_page,end_page +1):
pn = (page -1) *50
dict_kw ={"kw":tieba_name,"pn":pn}
str_kw = urllib.urlencode(dict_kw)
full_url = base_url +str_kw
print full_url
print "\n爬取完成,谢谢使用"
if __name__ == "__main__":
    # Script entry point: prompt the user for the forum name and the
    # inclusive page range, then kick off the crawl.
    name = raw_input("请输入需要的爬取的贴吧名:")
    first = int(raw_input("请输入爬取的起始页"))
    last = int(raw_input('请输入爬取的结束页'))
    start_work(name, first, last)