使用python编写一个壁纸网站的简单爬虫

目标网站:http://www.netbian.com/

目的:实现对壁纸各分类的第一页壁纸的获取

一:分析网站,编写代码:

(ps:源代码在文章的最后)

1.获取网站目录部分的一大段代码,下一步再进行仔细匹配网址与标题.

 #coding=gbk
# Goal: download the full-size wallpapers of every category.
__author__ = 'CQC'
import urllib2
import urllib
import re
import os

# Make sure the download root folder exists.
root_dir = 'd:\\彼岸壁纸'
if not os.path.isdir(root_dir):
    os.makedirs(root_dir)

big_title = []  # category titles collected from the front page
headers = {'User-agent' : 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0'}

# Fetch the front page and cut out the category menu markup;
# code_menu.group(1) holds everything between the menu <ul> and its closing tag.
home_page = urllib2.urlopen(urllib2.Request('http://www.netbian.com/', headers=headers))
code_menu = re.search(re.compile('<ul class="menu">(.*?)</a></div>', re.S), home_page.read())

如图:

使用python编写一个壁纸网站的简单爬虫

2.进行分类的标题与链接的匹配。

 #目录标题
pat_menu_title = re.compile('<a href=".*?" title="(.*?)">',re.S)
menu_title = re.findall(pat_menu_title,code_menu.group(1))
for a_item in menu_title:
big_title.append(a_item)
print a_item #目录链接
pat_menu_link = re.compile('<a href="(.*?)" title=".*?">',re.S)
menu_link = re.findall(pat_menu_link,code_menu.group(1))

如下图所示:

使用python编写一个壁纸网站的简单爬虫

3.从爬取到的目录进入,获得该目录下所有壁纸的标题与链接.

 # Patterns used on every category page — compile once, outside the loop.
pat_title_son = re.compile('<img src=".*?" data-src=".*?" alt="(.*?)"/>', re.S)
pat_code_son = re.compile('<ul>(.*?)</ul>', re.S)
pat_link_son = re.compile('<li><a href="(.*?)" target="_blank"><img', re.S)

# Walk into every category page (the loop body continues in the next fragment).
j = 0  # index of the current category in big_title
for b_item in menu_link:
    url_menu = 'http://www.netbian.com/' + b_item
    request_son = urllib2.Request(url_menu, headers=headers)
    response_son = urllib2.urlopen(request_son)
    # Fetch the category page ONCE and reuse the HTML — the original code
    # downloaded the exact same URL a second time just to re-search it.
    html_son = response_son.read()

    # Wallpaper titles listed on this page.
    title_son = re.findall(pat_title_son, html_son)

    # Narrow to the thumbnail <ul> block, then collect the detail-page links.
    res_code_son = re.search(pat_code_son, html_son)
    res_link = re.findall(pat_link_son, res_code_son.group(1))

如下图所示:

使用python编写一个壁纸网站的简单爬虫

4.根据上一步爬取到的链接,合成真正的1080p壁纸链接.

因为我们从上图标题点进去后是这样:

使用python编写一个壁纸网站的简单爬虫

还需要点击下载按钮才能打开1080p壁纸的链接。为了方便,我们直接合成1080p壁纸的链接.

例如: http://www.netbian.com/desk/9805.htm

对应的1080p网址:http://www.netbian.com/desk/9805-1920x1080.htm

代码:

     i = 0
#显示进度
print big_title[j]
for d_item in res_link:
#获得大图下载链接
if d_item == 'http://www.mmmwu.com/':
pass
else:
new_link = 'http://www.netbian.com/' + d_item[:-4] + '-1920x1080.htm'
print new_link

(ps:由于‘美女’分类中的第一个标题链接到了其他网站,为了简单一点,所以我直接跳过了)

5.进入1080p壁纸链接,下载壁纸.

 request_real = urllib2.Request(new_link,headers = headers)
response_real = urllib2.urlopen(request_real)
pat_real = re.compile('<img src="(.*?)" alt=".*?"/></td></tr>') link_real = re.search(pat_real,response_real.read())
#跳过vip壁纸
if link_real:
fina_link = link_real.group(1)
#创建下载目录
path_final = 'd:\\彼岸壁纸\\' + big_title[j] + '\\'
if not os.path.isdir(path_final):
os.makedirs(path_final)
path_pic = path_final + title_son[i] + '.jpg'
f = open(path_pic,'wb')
data = urllib.urlopen(fina_link)
f.write(data.read())
f.close()
if not data:
print "Download Failed."
i += 1
print 'One menu download OK.'
j += 1

6.下载完成.

使用python编写一个壁纸网站的简单爬虫

二、所有的源代码。

 #coding=gbk
#目标:下载各目录的壁纸(大图)
__author__ = 'CQC'
import urllib2
import urllib
import re
import os #创建壁纸下载文件夹
path = 'd:\\彼岸壁纸'
if not os.path.isdir(path):
os.makedirs(path)
#目录
big_title = [] #首页打开
url = 'http://www.netbian.com/'
headers = {'User-agent' : 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0'}
request = urllib2.Request(url,headers = headers)
response = urllib2.urlopen(request) #首页目录源代码获取
pat_menu = re.compile('<ul class="menu">(.*?)</a></div>',re.S)
code_menu = re.search(pat_menu,response.read()) #目录标题
pat_menu_title = re.compile('<a href=".*?" title="(.*?)">',re.S)
menu_title = re.findall(pat_menu_title,code_menu.group(1))
for a_item in menu_title:
big_title.append(a_item)
print a_item #目录链接
pat_menu_link = re.compile('<a href="(.*?)" title=".*?">',re.S)
menu_link = re.findall(pat_menu_link,code_menu.group(1)) #进入目录
j = 0
for b_item in menu_link:
url_menu = 'http://www.netbian.com/' + b_item
request_son = urllib2.Request(url_menu,headers = headers)
response_son = urllib2.urlopen(request_son)
#获得每个目录的图片标题,链接 #获得子目录标题
title_son = []
pat_title_son = re.compile('<img src=".*?" data-src=".*?" alt="(.*?)"/>',re.S)
res_title = re.findall(pat_title_son,response_son.read())
for c_item in res_title:
title_son.append(c_item) #筛选出子目录代码
pat_code_son = re.compile('<ul>(.*?)</ul>',re.S)
middle_pattern = urllib2.Request(url_menu,headers = headers)
middle_response = urllib2.urlopen(middle_pattern)
res_code_son = re.search(pat_code_son,middle_response.read()) #获得子目录链接,合成大图网页链接
pat_link_son = re.compile('<li><a href="(.*?)" target="_blank"><img',re.S)
res_link = re.findall(pat_link_son,res_code_son.group(1))
i = 0
#显示进度
print big_title[j]
for d_item in res_link:
#获得大图下载链接
if d_item == 'http://www.mmmwu.com/':
pass
else:
new_link = 'http://www.netbian.com/' + d_item[:-4] + '-1920x1080.htm'
print new_link
request_real = urllib2.Request(new_link,headers = headers)
response_real = urllib2.urlopen(request_real)
pat_real = re.compile('<img src="(.*?)" alt=".*?"/></td></tr>') link_real = re.search(pat_real,response_real.read())
#跳过vip壁纸
if link_real:
fina_link = link_real.group(1)
#创建下载目录
path_final = 'd:\\彼岸壁纸\\' + big_title[j] + '\\'
if not os.path.isdir(path_final):
os.makedirs(path_final)
path_pic = path_final + title_son[i] + '.jpg'
f = open(path_pic,'wb')
data = urllib.urlopen(fina_link)
f.write(data.read())
f.close()
if not data:
print "Download Failed."
i += 1
print 'One menu download OK.'
j += 1
上一篇:Android Studio和SDK下载、安装和环境变量配置


下一篇:Swift基础之实现下拉变大和OC下拉变大上拉缩小Demo