Python爬取妹子秀图片

Python爬取妹子秀

爬虫真好玩,就是牢饭也香hhh

初体验之爬个妹子秀图库的swmt栏,上面的小妹妹…嘶溜针不戳。
不多说,上代码。第一次写,基本上抄了这位佬的代码,感谢这位佬@https://www.kancloud.cn/@noahs
用的python3.7
会有bug,要是服务器的图片挂了就没法加载了,才疏学浅等会了再来更新代码好了(鸽了

# -*- coding: utf-8 -*-

import requests
from bs4 import BeautifulSoup
import re
import os
import sys
import time

class download_xiuaa():
    """Scraper for the xiuaa.com /swmt/ gallery section.

    Typical workflow: ask_url() collects gallery titles/links from the
    listing page, get_maxpage() resolves each gallery's absolute URL and
    page count, and get_ever_url() walks every page of one gallery and
    downloads its images via down_pic().
    """

    # Seconds to wait for any single HTTP request before giving up
    # (the original code had no timeout, so a stalled server hung forever).
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Set up the base listing URL, request headers, and gallery list."""
        self.baseurl = "https://xiuaa.com/swmt/"
        # Browser-like User-Agent so the site serves normal pages.
        self.head = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36 Edg/90.0.818.46"
         }
        # Each entry: {'title': ..., 'url': ...}; get_maxpage() adds 'maxpag'.
        self.title_list = []

    def ask_url(self):
        """Fetch the listing page and collect gallery titles and hrefs."""
        rq = requests.get(url=self.baseurl, headers=self.head,
                          timeout=self.REQUEST_TIMEOUT)
        bs = BeautifulSoup(rq.text, "html.parser")
        links = bs.find('div', id='container').find_all('a', target='_blank')

        for a in links:
            title = a.get_text()
            url = a.get('href')
            if title:  # anchors without text (e.g. thumbnail links) are skipped
                self.title_list.append({
                    'title': title,
                    'url': url
                })

    def get_maxpage(self):
        """Resolve each collected gallery's absolute URL and page count."""
        for thing in self.title_list:
            urls = 'https://www.xiuaa.com' + thing['url']
            rq = requests.get(url=urls, headers=self.head,
                              timeout=self.REQUEST_TIMEOUT)
            sp = BeautifulSoup(rq.text, 'html.parser')
            pager = sp.find('div', id='pager').find_all('a')
            # The first pager link's text holds the total page count.
            thing['url'] = urls
            thing['maxpag'] = int(pager[0].get_text())

    def get_ever_url(self, dic):
        """Walk every page of one gallery and download each page's image.

        dic must contain 'title', 'url' (a gallery URL whose last run of
        digits is the gallery id) and 'maxpag' (page count, int or
        numeric string).
        """
        print('下载:%s,\t 页数%s' % (dic['title'], dic['maxpag']))

        # Hoisted out of the loop: the gallery id is the LAST run of digits
        # in the URL (the original looped finditer and kept the last match).
        gallery_id = re.findall(r"\d+", dic['url'])[-1]
        maxpag = int(dic['maxpag'])

        for i in range(maxpag):
            # Page i of a gallery lives at .../swmt/<id>_<i>.html
            page_url = 'https://www.xiuaa.com/swmt/' + gallery_id + '_' + str(i) + '.html'

            rq = requests.get(url=page_url, headers=self.head,
                              timeout=self.REQUEST_TIMEOUT)
            bs = BeautifulSoup(rq.text, 'html.parser')
            img = bs.find('div', id='bigpic').find_all('img')[0]

            pic_url = img.get('src')
            # Use the last path component as the file name — robust even if
            # the URL depth changes (the original hard-coded split index 4).
            name = pic_url.rsplit('/', 1)[-1]

            self.down_pic(pic_url, dic['title'], name)
            time.sleep(2)  # be polite to the server between page fetches

            # Simple in-place progress bar on one terminal line.
            sys.stdout.write("\r")
            sys.stdout.write("%s%% | %s" % (int(i / maxpag * 100), i * '|'))
            sys.stdout.flush()

    def down_pic(self, pic_url, title, name):
        """Download one image into a directory named after the gallery title."""
        # makedirs(exist_ok=True) avoids the check-then-create race of
        # the original exists()/mkdir() pair.
        os.makedirs(title, exist_ok=True)
        rq = requests.get(url=pic_url, headers=self.head,
                          timeout=self.REQUEST_TIMEOUT)
        # 'with' closes the file automatically; no explicit close() needed.
        with open("%s/%s" % (title, name), 'wb') as f:
            f.write(rq.content)


if __name__ == '__main__':
    # Drive a single gallery download from a hand-built descriptor
    # (title, gallery URL, page count) instead of crawling the listing.
    downloader = download_xiuaa()

    gallery = {
        'title': '紧臀蓝裙美女Lucy黑丝美腿诱惑',
        'url': 'https://www.xiuaa.com/swmt/1823.html',
        'maxpag': '20',
    }

    downloader.get_ever_url(gallery)



上一篇:python37实现1-统计文本中出现次数最多的单词


下一篇:es6安装与配置