百度贴吧爬虫

百度贴吧爬虫案例

#-*- coding:utf-8 -*-
#   作者:GeraTear
#   日期:2021年9月3日
#   说明: 百度贴吧爬虫
import urllib
import urllib2

def load_page(url,filename):
    """
    Send a request to ``url`` and return the response body.

    Args:
        url: Address to fetch; passed unchanged to ``urllib2.urlopen``.
        filename: Label used only in the progress and error messages.

    Returns:
        The value of ``response.read()`` on success, or ``None`` when the
        request or the read fails (an error line is printed in that case).
    """

    # The original format string was "%..." (an invalid conversion), which
    # raised ValueError before any request was sent.
    print("[INFO]正在爬取 %s ..." % filename)
    try:
        response = urllib2.urlopen(url)
        try:
            return response.read()
        finally:
            # Release the connection whether or not the read succeeded.
            response.close()
    except Exception:
        # Best-effort fetch: report the failure and carry on. Catching
        # Exception (not a bare except) lets Ctrl-C and SystemExit through.
        print("[ERRoR]:%s 爬取失败" % filename)
        return None

def write_page(html,filename):
    """
    Save page content to ``filename``, creating or overwriting the file.

    Args:
        html: Content to write; presumably the raw byte string returned by
            ``load_page`` (verify against the caller).
        filename: Path of the output file; also shown in the progress line.
    """
    print("[info] 正在保存 %s ..." % filename)
    # Binary mode stores the bytes exactly as given; text mode would rewrite
    # newline bytes on Windows.
    with open(filename, 'wb') as f:
        f.write(html)

def start_work(tieba_name,start_page,end_page):
    """
    Build and print the list-page URL for each requested page of a tieba.

    For every page number from ``start_page`` through ``end_page``
    (inclusive) the offset ``pn = (page - 1) * 50`` is computed, URL-encoded
    together with the forum name as the ``kw``/``pn`` query parameters,
    appended to the base URL, and printed. A closing message is printed
    after the loop.

    NOTE(review): this function only prints the URLs; it does not call
    load_page or write_page, so nothing is downloaded or saved. Confirm
    whether that is intended.

    Args:
        tieba_name: Forum name used as the ``kw`` query parameter.
        start_page: First page number to process.
        end_page: Last page number to process (inclusive).
    """
    url_prefix = "http://tieba.baidu.com/f?"

    for page_no in range(start_page, end_page + 1):
        # Offset advances by 50 per page: page 1 -> 0, page 2 -> 50, ...
        query = urllib.urlencode({"kw": tieba_name, "pn": (page_no - 1) * 50})
        print(url_prefix + query)

    print("\n爬取完成,谢谢使用")
if __name__ == "__main__":
    # Interactive entry point: ask for the forum name and the inclusive page
    # range, then hand them to start_work. The page numbers are converted
    # with int(), so non-numeric input raises ValueError.
    forum = raw_input("请输入需要的爬取的贴吧名:")
    first_page = int(raw_input("请输入爬取的起始页"))
    last_page = int(raw_input('请输入爬取的结束页'))
    start_work(forum, first_page, last_page)

上一篇:【图片识别】java 图片文字识别 ocr (转)


下一篇:JavaScript 时间格式