Python爬取百度图片

Python爬取百度图片

目录

解析

"""Download the thumbnail images referenced by a Baidu image-search page.

The script fetches the search-result page at ``url``, extracts every
``thumbURL`` entry ending in ``.jpg``, appends each URL to ``testpic.txt``
and saves the image into ``D://text_pic`` as ``pic<N>.jpg``.
"""

import os
import re
import urllib.error
import urllib.parse
import urllib.request

# Request headers. Per the original author's note the referer is required;
# User-Agent makes the request look like it comes from a browser.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
    "referer": "http://image.baidu.com",
}

# Search-result page for a fixed keyword (already percent-encoded in `word=`).
url = "https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&hs=0&xthttps=111111&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E4%B9%94%E5%B8%83%E6%96%AF&oq=%E4%B9%94%E5%B8%83%E6%96%AF&rsp=-1"

SAVE_DIR = "D://text_pic"   # directory the images are written to
URL_LOG = "testpic.txt"     # every discovered image URL is appended here

# Compiled once; captures the URL that follows `thumbURL":"` up to `.jpg`.
THUMB_PATTERN = re.compile(r'thumbURL":"(.*?\.jpg)')


def extract_thumb_urls(html):
    """Return the ``.jpg`` thumbnail URLs found in *html*, in page order."""
    return THUMB_PATTERN.findall(html)


def fetch_html(page_url):
    """Fetch *page_url* with ``header`` and return its UTF-8 decoded body.

    Returns ``None`` (after printing the error) when the request fails or
    the body is not valid UTF-8, so the caller never sees undefined text.
    """
    request = urllib.request.Request(page_url, headers=header)
    try:
        with urllib.request.urlopen(request) as response:
            return response.read().decode("utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        print("出错了", exc)
        return None


def download_image(image_url, target_path):
    """Save *image_url* to *target_path*; return ``True`` on success.

    The download sends the same ``header`` as the page request, because
    ``urlretrieve`` would omit the referer the site is said to require.
    """
    request = urllib.request.Request(image_url, headers=header)
    try:
        with urllib.request.urlopen(request) as response:
            data = response.read()
    except OSError as exc:
        print("出错了", image_url, exc)
        return False
    with open(target_path, "wb") as out:
        out.write(data)
    return True


def main():
    """Crawl the result page and return the number of images saved."""
    os.makedirs(SAVE_DIR, exist_ok=True)
    seen = set()   # URLs already handled, so repeats are not downloaded again
    j = 0          # images saved so far; also the next file index
    n = 0          # request budget counter: at most 3000 / 30 = 100 requests
    with open(URL_LOG, "a") as log:
        while n < 3000:
            n += 30
            html = fetch_html(url)
            if html is None:
                break
            new_on_page = 0
            for image_url in extract_thumb_urls(html):
                if image_url in seen:
                    continue
                seen.add(image_url)
                new_on_page += 1
                print(image_url)
                log.write(image_url + "\n")
                target = os.path.join(SAVE_DIR, "pic{num}.jpg".format(num=j))
                if download_image(image_url, target):
                    j += 1
            # NOTE(review): `url` carries no page offset, so every request
            # returns the same page; stop once a page yields nothing new
            # instead of re-downloading identical images.
            if new_on_page == 0:
                break
    print("总共爬取的图片数为:" + str(j))
    return j


if __name__ == "__main__":
    main()

代码

"""Download the thumbnail images referenced by a Baidu image-search page.

The script fetches the search-result page at ``url``, extracts every
``thumbURL`` entry ending in ``.jpg``, appends each URL to ``testpic.txt``
and saves the image into ``D://text_pic`` as ``pic<N>.jpg``.
"""

import os
import re
import urllib.error
import urllib.parse
import urllib.request

# Request headers. Per the original author's note the referer is required;
# User-Agent makes the request look like it comes from a browser.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
    "referer": "http://image.baidu.com",
}

# Search-result page for a fixed keyword (already percent-encoded in `word=`).
url = "https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&hs=0&xthttps=111111&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E4%B9%94%E5%B8%83%E6%96%AF&oq=%E4%B9%94%E5%B8%83%E6%96%AF&rsp=-1"

SAVE_DIR = "D://text_pic"   # directory the images are written to
URL_LOG = "testpic.txt"     # every discovered image URL is appended here

# Compiled once; captures the URL that follows `thumbURL":"` up to `.jpg`.
THUMB_PATTERN = re.compile(r'thumbURL":"(.*?\.jpg)')


def extract_thumb_urls(html):
    """Return the ``.jpg`` thumbnail URLs found in *html*, in page order."""
    return THUMB_PATTERN.findall(html)


def fetch_html(page_url):
    """Fetch *page_url* with ``header`` and return its UTF-8 decoded body.

    Returns ``None`` (after printing the error) when the request fails or
    the body is not valid UTF-8, so the caller never sees undefined text.
    """
    request = urllib.request.Request(page_url, headers=header)
    try:
        with urllib.request.urlopen(request) as response:
            return response.read().decode("utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        print("出错了", exc)
        return None


def download_image(image_url, target_path):
    """Save *image_url* to *target_path*; return ``True`` on success.

    The download sends the same ``header`` as the page request, because
    ``urlretrieve`` would omit the referer the site is said to require.
    """
    request = urllib.request.Request(image_url, headers=header)
    try:
        with urllib.request.urlopen(request) as response:
            data = response.read()
    except OSError as exc:
        print("出错了", image_url, exc)
        return False
    with open(target_path, "wb") as out:
        out.write(data)
    return True


def main():
    """Crawl the result page and return the number of images saved."""
    os.makedirs(SAVE_DIR, exist_ok=True)
    seen = set()   # URLs already handled, so repeats are not downloaded again
    j = 0          # images saved so far; also the next file index
    n = 0          # request budget counter: at most 3000 / 30 = 100 requests
    with open(URL_LOG, "a") as log:
        while n < 3000:
            n += 30
            html = fetch_html(url)
            if html is None:
                break
            new_on_page = 0
            for image_url in extract_thumb_urls(html):
                if image_url in seen:
                    continue
                seen.add(image_url)
                new_on_page += 1
                print(image_url)
                log.write(image_url + "\n")
                target = os.path.join(SAVE_DIR, "pic{num}.jpg".format(num=j))
                if download_image(image_url, target):
                    j += 1
            # NOTE(review): `url` carries no page offset, so every request
            # returns the same page; stop once a page yields nothing new
            # instead of re-downloading identical images.
            if new_on_page == 0:
                break
    print("总共爬取的图片数为:" + str(j))
    return j


if __name__ == "__main__":
    main()
上一篇:python爬虫——requests


下一篇:200个python标准库