Python爬取百度图片
目录
解析
import urllib.request
import urllib.parse
import re
import os
# HTTP headers sent with every request. The "referer" entry is mandatory for
# this site and "User-Agent" makes the request look like a regular browser.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
    "referer": "http://image.baidu.com",
}

# Search-result page to scrape. The search keyword is hard-coded in the
# percent-encoded "word" / "oq" query parameters; to search for something
# else, substitute urllib.parse.quote(keyword, "utf-8") for those values.
url = "https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&hs=0&xthttps=111111&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E4%B9%94%E5%B8%83%E6%96%AF&oq=%E4%B9%94%E5%B8%83%E6%96%AF&rsp=-1"

SAVE_DIR = "D://text_pic"  # directory that receives the downloaded images
URL_LOG = "testpic.txt"    # every image URL found is appended to this file
PAGE_SIZE = 30             # step of the result offset between two requests
MAX_OFFSET = 3000          # the crawl stops before this result offset
TIMEOUT = 10               # seconds to wait for any single HTTP request

# Captures the value of a "thumbURL" field that ends in ".jpg". The value may
# not contain a double quote, so a match can never run past the end of one
# field into a later one.
THUMB_PATTERN = re.compile(r'thumbURL":"([^"]*?\.jpg)')


def extract_thumb_urls(html):
    """Return every ``.jpg`` thumbnail URL found in *html*, in page order."""
    return THUMB_PATTERN.findall(html)


def fetch_html(page_url):
    """Download *page_url* with the shared headers and decode it as UTF-8.

    Raises:
        OSError: the request failed (``urllib.error.URLError`` is a subclass).
        UnicodeDecodeError: the response body is not valid UTF-8.
    """
    request = urllib.request.Request(page_url, headers=header)
    with urllib.request.urlopen(request, timeout=TIMEOUT) as response:
        return response.read().decode("utf-8")


def download_image(image_url, target_path):
    """Save the resource at *image_url* to *target_path*.

    The request carries the same headers as the page requests. The body is
    read completely before the file is opened, so a failed transfer does not
    leave a truncated file behind.
    """
    request = urllib.request.Request(image_url, headers=header)
    with urllib.request.urlopen(request, timeout=TIMEOUT) as response:
        payload = response.read()
    with open(target_path, "wb") as out:
        out.write(payload)


def main():
    """Crawl the result pages, log the image URLs and download the images.

    Returns:
        The number of images that were saved successfully.
    """
    os.makedirs(SAVE_DIR, exist_ok=True)
    seen = set()  # image URLs already handled, so no image is fetched twice
    saved = 0
    with open(URL_LOG, "a", encoding="utf-8") as log:
        for offset in range(0, MAX_OFFSET, PAGE_SIZE):
            # NOTE(review): assumes the endpoint honours a "pn" result-offset
            # parameter -- TODO confirm. If it is ignored, every page repeats
            # the first one and the "nothing new" check below ends the crawl.
            page_url = "{base}&pn={pn}".format(base=url, pn=offset)
            try:
                html = fetch_html(page_url)
            except (OSError, ValueError):
                print("出错了")
                continue

            new_on_page = 0
            for image_url in extract_thumb_urls(html):
                if image_url in seen:
                    continue
                seen.add(image_url)
                new_on_page += 1
                print(image_url)
                log.write(image_url)
                log.write("\n")
                target = "D://text_pic/pic{num}.jpg".format(num=saved)
                try:
                    download_image(image_url, target)
                except (OSError, ValueError):
                    # One broken image must not abort the whole crawl.
                    print("出错了")
                    continue
                saved += 1

            if new_on_page == 0:
                # No unseen image on this page: the results are exhausted.
                break
    print("总共爬取的图片数为:" + str(saved))
    return saved


if __name__ == "__main__":
    main()
代码
import urllib.request
import urllib.parse
import re
import os

# HTTP headers sent with every request. The "referer" entry is mandatory for
# this site and "User-Agent" makes the request look like a regular browser.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
    "referer": "http://image.baidu.com",
}

# Search-result page to scrape. The search keyword is hard-coded in the
# percent-encoded "word" / "oq" query parameters; to search for something
# else, substitute urllib.parse.quote(keyword, "utf-8") for those values.
url = "https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&hs=0&xthttps=111111&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E4%B9%94%E5%B8%83%E6%96%AF&oq=%E4%B9%94%E5%B8%83%E6%96%AF&rsp=-1"

SAVE_DIR = "D://text_pic"  # directory that receives the downloaded images
URL_LOG = "testpic.txt"    # every image URL found is appended to this file
PAGE_SIZE = 30             # step of the result offset between two requests
MAX_OFFSET = 3000          # the crawl stops before this result offset
TIMEOUT = 10               # seconds to wait for any single HTTP request

# Captures the value of a "thumbURL" field that ends in ".jpg". The value may
# not contain a double quote, so a match can never run past the end of one
# field into a later one.
THUMB_PATTERN = re.compile(r'thumbURL":"([^"]*?\.jpg)')


def extract_thumb_urls(html):
    """Return every ``.jpg`` thumbnail URL found in *html*, in page order."""
    return THUMB_PATTERN.findall(html)


def fetch_html(page_url):
    """Download *page_url* with the shared headers and decode it as UTF-8.

    Raises:
        OSError: the request failed (``urllib.error.URLError`` is a subclass).
        UnicodeDecodeError: the response body is not valid UTF-8.
    """
    request = urllib.request.Request(page_url, headers=header)
    with urllib.request.urlopen(request, timeout=TIMEOUT) as response:
        return response.read().decode("utf-8")


def download_image(image_url, target_path):
    """Save the resource at *image_url* to *target_path*.

    The request carries the same headers as the page requests. The body is
    read completely before the file is opened, so a failed transfer does not
    leave a truncated file behind.
    """
    request = urllib.request.Request(image_url, headers=header)
    with urllib.request.urlopen(request, timeout=TIMEOUT) as response:
        payload = response.read()
    with open(target_path, "wb") as out:
        out.write(payload)


def main():
    """Crawl the result pages, log the image URLs and download the images.

    Returns:
        The number of images that were saved successfully.
    """
    os.makedirs(SAVE_DIR, exist_ok=True)
    seen = set()  # image URLs already handled, so no image is fetched twice
    saved = 0
    with open(URL_LOG, "a", encoding="utf-8") as log:
        for offset in range(0, MAX_OFFSET, PAGE_SIZE):
            # NOTE(review): assumes the endpoint honours a "pn" result-offset
            # parameter -- TODO confirm. If it is ignored, every page repeats
            # the first one and the "nothing new" check below ends the crawl.
            page_url = "{base}&pn={pn}".format(base=url, pn=offset)
            try:
                html = fetch_html(page_url)
            except (OSError, ValueError):
                print("出错了")
                continue

            new_on_page = 0
            for image_url in extract_thumb_urls(html):
                if image_url in seen:
                    continue
                seen.add(image_url)
                new_on_page += 1
                print(image_url)
                log.write(image_url)
                log.write("\n")
                target = "D://text_pic/pic{num}.jpg".format(num=saved)
                try:
                    download_image(image_url, target)
                except (OSError, ValueError):
                    # One broken image must not abort the whole crawl.
                    print("出错了")
                    continue
                saved += 1

            if new_on_page == 0:
                # No unseen image on this page: the results are exhausted.
                break
    print("总共爬取的图片数为:" + str(saved))
    return saved


if __name__ == "__main__":
    main()