"""Scrape thumbnail images for the query '大数据' from Baidu image search
and save them under ./images/ (stopping after 350 files)."""
import json
import os
from urllib.parse import urlencode

import requests
from requests.exceptions import RequestException

def getindex(pn, rn):
    """Fetch one page of Baidu image-search results and return the JSON text."""
    data = {
        'tn': 'resultjson_com',
        'ipn': 'rj',
        'ct': '201326592',
        'is': '',
        'fp': 'result',
        'queryWord': '大数据',
        'cl': '2',
        'lm': '-1',
        'ie': 'utf-8',
        'oe': 'utf-8',
        'adpicid': '',
        'st': '',
        'z': '',
        'latest': '',
        'copyright': '',
        'word': '大数据',
        's': '',
        'se': '',
        'tab': '',
        'width': '',
        'height': '',
        'face': '0',
        'istype': '2',
        'qc': '',
        'nc': '1',
        'fr': '',
        'expermode': '',
        'force': '',
        'pn': pn,
        'rn': rn,
        'gsm': '1e',
        '1557667455319': '',
    }
    # URL-encode data: turn the dict into a query string appended to the URL.
    url = 'http://image.baidu.com/search/acjson?' + urlencode(data)
    try:
        # The request itself can raise, so it must sit inside the try block.
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print("Request failed")
        return None
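
# A minimal sketch of how the pagination parameters appear to work. This is
# an assumption inferred from how main() drives getindex(), not a documented
# API contract: 'pn' looks like the result offset and 'rn' the page size.
#
#   first_page = getindex(0, 30)    # results 0-29
#   second_page = getindex(30, 30)  # results 30-59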

def geturl(html):
    """Yield the thumbnail URL of every item in the result JSON."""
    data = json.loads(html)
    if data and 'data' in data:  # make sure the payload carries a 'data' list
        for item in data.get('data'):
            yield item.get('thumbURL')
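
# For reference, the JSON parsed above is assumed to look roughly like the
# snippet below (inferred from the fields geturl() reads; the trailing entry
# typically lacks 'thumbURL', which is why callers must tolerate None):
#
#   {"data": [{"thumbURL": "http://..."}, {"thumbURL": "http://..."}, {}]}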

def download(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0',
        'Referer': 'http://image.baidu.com/search',
    }
    req = requests.get(url, headers=headers)
    DIR = 'images'
    # Keep saving files until 350 have accumulated, then signal the caller.
    count = len([name for name in os.listdir(DIR)
                 if os.path.isfile(os.path.join(DIR, name))])
    if count < 350:
        with open(os.path.join(DIR, url.split('/')[-1]), 'wb') as f:
            f.write(req.content)
    else:
        return 0
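
# A possible refinement (a sketch only, not used above): reuse one
# requests.Session so the User-Agent/Referer headers and the underlying TCP
# connection are shared across downloads instead of re-created per image.
#
#   session = requests.Session()
#   session.headers.update({'Referer': 'http://image.baidu.com/search'})
#   req = session.get(url)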

def main():
    for i in range(70):
        pn = i * 30
        rn = 8
        html = getindex(pn, rn)
        if html is None:
            continue
        # The last item of each page has no thumbURL, so skip None values
        # (a generator cannot be sliced, so filter instead of [:-1]).
        for url in geturl(html):
            if url is None:
                continue
            if download(url) == 0:  # quota of 350 images reached
                return

if __name__ == "__main__":
    DIR = 'images'
    os.makedirs(DIR, exist_ok=True)  # don't crash if 'images' already exists
    main()