Scraping the Meizixiu Gallery with Python
Web scraping is great fun; just hope the prison food tastes good too, hhh.
For my first attempt I scraped the swmt section of the Meizixiu gallery. The girls on there... mmm, not bad at all.
Without further ado, here's the code. This is my first scraper and it's basically copied from this expert's code. Thanks to @https://www.kancloud.cn/@noahs
Written against Python 3.7.
There are still bugs: if a picture has been removed from the server, the download chokes. I'm still pretty green, so I'll update the code once I've learned more (read: probably never). A defensive patch is sketched at the end of this post.
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import re
import os
import sys
import time
class download_xiuaa():
    def __init__(self):  # base settings: target URL, request headers, and the album list
        self.baseurl = "https://xiuaa.com/swmt/"
        self.head = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36 Edg/90.0.818.46"
        }
        self.title_list = []
    def ask_url(self):  # fetch the index page and collect every album's title and link
        rq = requests.get(url=self.baseurl, headers=self.head)
        bs = BeautifulSoup(rq.text, "html.parser")
        links = bs.find('div', id='container').find_all('a', target='_blank')
        for i in links:
            title = i.get_text()
            url = i.get('href')
            if title:  # skip anchors that carry no text
                self.title_list.append({
                    'title': title,
                    'url': url
                })
    def get_maxpage(self):  # read the pager on each album page to get its page count
        for thing in self.title_list:
            urls = 'https://www.xiuaa.com' + thing['url']
            rq = requests.get(url=urls, headers=self.head)
            sp = BeautifulSoup(rq.text, 'html.parser')
            pager = sp.find('div', id='pager').find_all('a')
            maxpag = pager[0].get_text()  # the first pager link shows the total page count
            thing['url'] = urls
            thing['maxpag'] = int(maxpag)
    def get_ever_url(self, dic):  # walk every page of one album and grab the big picture on each
        print('Downloading: %s,\t pages: %s' % (dic['title'], dic['maxpag']))
        album_id = re.search(r"\d+", dic['url']).group()  # e.g. '1823' out of .../swmt/1823.html
        for i in range(int(dic['maxpag'])):
            # page URLs follow the pattern 1823_0.html, 1823_1.html, ...
            page_url = 'https://www.xiuaa.com/swmt/' + album_id + '_' + str(i) + '.html'
            rq = requests.get(url=page_url, headers=self.head)
            bs = BeautifulSoup(rq.text, 'html.parser')
            pic = bs.find('div', id='bigpic').find_all('img')[0]
            pic_url = pic.get('src')
            name = pic_url.split('/')[-1]  # the file name is the last path segment
            self.down_pic(pic_url, dic['title'], name)
            time.sleep(2)  # be gentle with the server
            sys.stdout.write("\r")
            sys.stdout.write("%s%% | %s" % (int(i / int(dic['maxpag']) * 100), i * '|'))
            sys.stdout.flush()
    def down_pic(self, pic_url, title, name):  # save one picture into a folder named after the album
        if not os.path.exists(title):
            os.mkdir(title)
        rq = requests.get(url=pic_url, headers=self.head)
        with open("%s/%s" % (title, name), 'wb') as f:
            f.write(rq.content)  # the with-block closes the file, no explicit close needed
if __name__ == '__main__':
    dx = download_xiuaa()
    dic = {'title': '紧臀蓝裙美女Lucy黑丝美腿诱惑', 'url': 'https://www.xiuaa.com/swmt/1823.html', 'maxpag': '20'}
    dx.get_ever_url(dic)
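The demo above only downloads one hard-coded album. The class is clearly meant to be chained: ask_url fills title_list from the index page, get_maxpage annotates each entry with its absolute URL and page count, and then each entry goes to get_ever_url. A minimal sketch of that full run (assuming the site's index markup still matches what ask_url expects):
if __name__ == '__main__':
    dx = download_xiuaa()
    dx.ask_url()       # collect every album's title and link from the index
    dx.get_maxpage()   # annotate each entry with its absolute URL and page count
    for album in dx.title_list:
        dx.get_ever_url(album)  # download the whole album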
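And here's the promised patch for the dead-image bug. When a picture has been removed, the server answers with an error instead of image bytes, so the script either crashes (if the page itself is gone) or saves an HTML error page to disk as a .jpg. A minimal, defensive rewrite of down_pic that checks the status code and skips broken pictures; the timeout value is my own guess, not something the site documents:
    def down_pic(self, pic_url, title, name):
        if not os.path.exists(title):
            os.mkdir(title)
        try:
            rq = requests.get(url=pic_url, headers=self.head, timeout=10)  # timeout is an assumption
        except requests.RequestException as e:
            print('skipping %s (%s)' % (pic_url, e))  # network trouble: move on
            return
        if rq.status_code != 200:
            print('skipping %s (HTTP %s)' % (pic_url, rq.status_code))  # picture gone from the server
            return
        with open("%s/%s" % (title, name), 'wb') as f:
            f.write(rq.content)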