A Short Summary of Python Crawlers

Some kinds of data have no ready-made dataset, so to collect training data for a neural network the natural choice was to gather it with a crawler. At first I used a dynamic-crawler script found online, but most of the images it fetched were duplicates and only a few were actually usable.

Dynamic crawler:

from lxml import etree
import requests
import re
import time
import os

local_path = '/home/path/'
if not os.path.exists(local_path):
    os.makedirs(local_path)

keyword = input('Enter the keyword to search images for: ')
first_url = 'http://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1530850407660_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&ctd=1530850407660%5E00_1651X792&word={}'.format(keyword)
want_download = input('Enter how many images to download: ')

page_num = 1
download_num = 0


# Get the image format from its URL
def get_format(pic_url):
    # The format is stored at the end of the URL, so extract it with split.
    # Some URLs do not end in a common image format; fall back to jpg in that case.
    t = pic_url.split('.')
    if t[-1].lower() not in ('bmp', 'gif', 'jpg', 'png'):
        pic_format = 'jpg'
    else:
        pic_format = t[-1]
    return pic_format


# Get the URL of the next result page
def get_next_page(page_url):
    global page_num
    html = requests.get(page_url).text
    with open('html_info.txt', 'w', encoding='utf-8') as h:
        h.write(html)
    selector = etree.HTML(html)
    try:
        msg = selector.xpath('//a[@class="n"]/@href')
        print(msg[0])
        next_page = 'http://image.baidu.com/' + msg[0]
        print('Now on page %d' % (page_num + 1))
    except Exception as e:
        print('No more pages')
        print(e)
        next_page = None
    page_num = page_num + 1
    return next_page


# Download and save the images
def download_img(pic_urls):
    count = 1
    global download_num
    for i in pic_urls:
        time.sleep(1)
        try:
            pic_format = get_format(i)
            pic = requests.get(i, timeout=15)
            # Save the image under its page/index name and format
            with open(local_path + 'page%d_%d.%s' % (page_num, count, pic_format), 'wb') as f:
                f.write(pic.content)
            # print('Downloaded image %s: %s' % (str(count), str(pic.url)))
            count = count + 1
            download_num = download_num + 1
        except Exception as e:
            # print('Failed to download image %s: %s' % (str(count), str(pic.url)))
            print(e)
            count = count + 1
            continue
        finally:
            if int(want_download) == download_num:
                return 0


# Extract the image URLs from a result page
def get_pic_urls(web_url):
    html = requests.get(web_url).text
    # Find the image addresses with a regular expression
    pic_urls = re.findall('"objURL":"(.*?)",', html, re.S)
    # Return the image addresses as a list
    return pic_urls


if __name__ == "__main__":
    while True:
        pic_urls = get_pic_urls(first_url)
        t = download_img(pic_urls)
        if t == 0:
            break
        next_url = get_next_page(first_url)
        if next_url is None:
            print('No more images')
            break
        pic_urls = get_pic_urls(next_url)
        t = download_img(pic_urls)
        if t == 0:
            break
        first_url = next_url
    # print('Downloaded %d images in total' % download_num)
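Since the main problem with this crawler turned out to be duplicate images, a quick way to quantify the damage is to hash the downloaded file bytes and count collisions. This is only a minimal sketch, assuming the images sit in the same local_path directory as above; count_exact_duplicates is a hypothetical helper, and it only catches byte-identical copies, not near-duplicates.

import hashlib
import os

def count_exact_duplicates(folder):
    # Map each MD5 digest of the file bytes to the first file seen with it,
    # and count every further file that produces the same digest.
    seen = {}
    duplicates = 0
    for name in os.listdir(folder):
        path = os.path.join(folder, name)
        if not os.path.isfile(path):
            continue
        with open(path, 'rb') as f:
            digest = hashlib.md5(f.read()).hexdigest()
        if digest in seen:
            duplicates += 1
        else:
            seen[digest] = name
    return duplicates

print('exact duplicates:', count_exact_duplicates('/home/path/'))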

To filter out the duplicate images, I then used a hashing algorithm (dHash) for deduplication:

# -*- coding: utf-8 -*-
"""
Use dHash to decide whether two photos are the same.
The hash is based on comparing gradients; the final hash-encoding step
can be omitted (and is omitted here).
By Guanpx
"""
import os
from PIL import Image


def picPostfix():  # the set of image file extensions
    postFix = set()
    postFix.update(['bmp', 'jpg', 'png', 'tiff', 'gif', 'pcx', 'tga', 'exif',
                    'fpx', 'svg', 'psd', 'cdr', 'pcd', 'dxf', 'ufo', 'eps', 'JPG', 'raw', 'jpeg'])
    return postFix


def getDiff(width, high, image):  # the image will be resized to width * high
    diff = []
    im = image.resize((width, high))
    imgray = im.convert('L')  # convert to grayscale for easier processing
    pixels = list(imgray.getdata())  # pixel values in the range 0-255
    for row in range(high):  # compare every pixel with its right-hand neighbour
        rowStart = row * width  # index of the first pixel in this row
        for index in range(width - 1):
            leftIndex = rowStart + index
            rightIndex = leftIndex + 1  # indices of the left/right pixels
            diff.append(pixels[leftIndex] > pixels[rightIndex])
    return diff  # the difference sequence; it could be converted into a hash code here


def getHamming(diff, diff2):  # brute-force Hamming distance between two sequences
    hamming_distance = 0
    for i in range(len(diff)):
        if diff[i] != diff2[i]:
            hamming_distance += 1
    return hamming_distance


if __name__ == '__main__':
    width = 32
    high = 32  # size after resizing
    dirName = "/home/yourpath"  # path of the album
    allDiff = []
    postFix = picPostfix()  # the set of image extensions
    dirList = os.listdir(dirName)
    cnt = 0
    for i in dirList:
        cnt += 1
        # print('number of files processed:', cnt)  # optional progress counter
        if str(i).split('.')[-1] in postFix:  # check whether the extension is an image format
            try:
                im = Image.open(os.path.join(dirName, i))
            except (OSError, IndexError) as err:
                os.remove(os.path.join(dirName, i))  # delete images that cannot be opened
                print('error: {}'.format(err))
            else:
                diff = getDiff(width, high, im)
                allDiff.append((str(i), diff))
    for i in range(len(allDiff)):
        for j in range(i + 1, len(allDiff)):
            ans = getHamming(allDiff[i][1], allDiff[j][1])
            if ans <= 5:  # Hamming distance threshold; set it according to your data
                print(allDiff[i][0], "and", allDiff[j][0], "maybe same photo...")
                result = dirName + "/" + allDiff[j][0]
                if os.path.exists(result):
                    os.remove(result)
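The comment in getDiff notes that the difference sequence could be converted into a hash code; that step is omitted above, but a minimal sketch of the packing could look like the following (diff_to_hash is a hypothetical name). Comparing two such hashes still amounts to a bitwise Hamming distance, so for this script the raw diff lists are just as convenient.

def diff_to_hash(diff):
    # Pack the list of booleans into one integer, then format it as hex;
    # each hex digit encodes 4 of the gradient-comparison bits.
    value = 0
    for bit in diff:
        value = (value << 1) | int(bit)
    width = (len(diff) + 3) // 4
    return '{:0{}x}'.format(value, width)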

After filtering with the hashing algorithm I found that it removed too many images, and the threshold was hard to control. I then tried a static-crawler approach instead; the results were quite good and there were few duplicates, so the deduplication step could be skipped.
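The threshold is indeed hard to pick blindly. Before discarding the dHash filter entirely, one option is to look at how the pairwise Hamming distances are distributed and choose the cutoff from that. This is a rough sketch that reuses getDiff and getHamming from the script above; distance_histogram is a hypothetical helper, and dirName is assumed to be the same album path.

from collections import Counter
import os
from PIL import Image

def distance_histogram(dirName, width=32, high=32):
    # Compute the dHash difference sequence for every readable image,
    # then bucket all pairwise Hamming distances in steps of 10.
    diffs = []
    for name in os.listdir(dirName):
        try:
            diffs.append(getDiff(width, high, Image.open(os.path.join(dirName, name))))
        except OSError:
            continue
    buckets = Counter()
    for i in range(len(diffs)):
        for j in range(i + 1, len(diffs)):
            buckets[getHamming(diffs[i], diffs[j]) // 10 * 10] += 1
    for start in sorted(buckets):
        print('%d-%d: %d pairs' % (start, start + 9, buckets[start]))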

Static crawler:

# -*- coding: utf-8 -*-
import os
import time
import requests


# Crawl Baidu Images: build the request parameters and fetch the JSON result pages
def getManyPages(keyword, pages):
    '''
    keyword: keyword of the images to download
    pages: number of result pages to request
    '''
    params = []
    for i in range(30, 30 * pages + 30, 30):
        params.append({
            'tn': 'resultjson_com',
            'ipn': 'rj',
            'ct': 201326592,
            'is': '',
            'fp': 'result',
            'queryWord': keyword,
            'cl': 2,
            'lm': -1,
            'ie': 'utf-8',
            'oe': 'utf-8',
            'adpicid': '',
            'st': -1,
            'z': '',
            'ic': 0,
            'word': keyword,
            's': '',
            'se': '',
            'tab': '',
            'width': '',
            'height': '',
            'face': 0,
            'istype': 2,
            'qc': '',
            'nc': 1,
            'fr': '',
            'pn': i,
            'rn': 30,
            'gsm': '1e',
            '': ''
        })
    url = 'https://image.baidu.com/search/acjson'
    urls = []
    for i in params:
        try:
            urls.append(requests.get(url, params=i).json().get('data'))
        except OSError as err:
            print('OS error : {}'.format(err))
        except IndexError as err:
            print('Index Error: {}'.format(err))
        except Exception:
            print('Other error')
    return urls


# Download and save the images
def getImg(dataList, localPath):
    '''
    dataList: list of result pages, each holding image entries with a thumbURL
    localPath: directory where the downloaded images are saved
    '''
    if not os.path.exists(localPath):  # create the save directory if it does not exist
        os.mkdir(localPath)
    x = 0
    for page in dataList:
        for i in page:
            if i.get('thumbURL') is not None:
                # print('downloading: %s' % i.get('thumbURL'))
                ir = requests.get(i.get('thumbURL'))
                with open(localPath + '/' + '%d.jpg' % x, 'wb') as f:
                    f.write(ir.content)
                x += 1
            else:
                print('image link does not exist')


# Download images according to the keywords (one sub-directory per keyword)
if __name__ == '__main__':
    father_path = "/home/yourpath/"
    t0 = time.time()
    for init in os.listdir(father_path):
        print('init is {}'.format(str(init)))
        for name in os.listdir(os.path.join(father_path, init)):
            print('name is {}'.format(str(name)))
            t1 = time.time()
            if not os.listdir(os.path.join(father_path, init, name)):  # only fill empty folders
                dataList = getManyPages(name, 30)
                getImg(dataList, os.path.join(father_path, init, name))
            t2 = time.time()
            print('cost time is', t2 - t1)
    t3 = time.time()
    print('total time is', t3 - t0)

    # Single-keyword usage:
    # dataList = getManyPages('keyword', page_number)  # arg 1: keyword, arg 2: number of pages
    # getImg(dataList, './file_path/')                 # arg 2: the save path