python 爬虫(五)

下载媒体文件

I. 使用 urllib.request.urlretrieve 方法可以下载文件，并保存为指定的文件名

# Example 1: download a single file — fetch the site logo's <img src> URL
# and save it locally with urllib.request.urlretrieve.
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup

with urlopen("http://www.pythonscraping.com") as html:
    bsObj = BeautifulSoup(html, 'html.parser')

# The logo image lives inside the <a id="logo"> element; take its src URL.
imageLocation = bsObj.find('a', {'id': "logo"}).find("img")["src"]
# Download the image and save it as logo.jpg in the current directory.
urlretrieve(imageLocation, "logo.jpg")
# Example 2: crawl a page and download every resource referenced by a
# src attribute (images, scripts, ...), mirroring its path layout locally.
import os
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Local directory that downloaded files are saved under.
downloadDirectory = "downloaded"
baseUrl = "http://pythonscraping.com"


def getAbsoluteURL(baseUrl, source):
    """Normalize *source* (a raw src attribute value) to an absolute URL.

    Returns the absolute URL when it belongs to *baseUrl*, or None for
    external resources, which the caller skips.
    """
    if source.startswith("http://www."):
        # Drop "http://www." and rebuild without the "www." prefix so the
        # result matches baseUrl.
        url = "http://" + source[11:]
    elif source.startswith("http://"):
        url = source
    elif source.startswith("www."):
        # BUG FIX: the original assigned source[4:] and then immediately
        # overwrote it with "http://" + source, keeping the "www." prefix —
        # the baseUrl check below then rejected every such link.
        url = "http://" + source[4:]
    else:
        # Relative path: resolve it against the site root.
        url = baseUrl + "/" + source
    if baseUrl not in url:
        # External resource — do not download.
        return None
    return url


def getDownLoadPath(baseUrl, absoluteUrl, downloadDirectory):
    """Map *absoluteUrl* to a save path under *downloadDirectory*,
    creating any missing intermediate directories, and return that path.
    """
    path = absoluteUrl.replace("www.", "")
    path = path.replace(baseUrl, "")
    # BUG FIX: the parameter was misspelled "downloadDirecory" and the body
    # silently fell back to the module-level global; use the parameter so
    # callers actually control the target directory.
    path = downloadDirectory + path
    # Strip any query string so the result is a usable filename.
    path = path.split("?")[0]
    directory = os.path.dirname(path)
    if not os.path.exists(directory):
        os.makedirs(directory)
    return path


if __name__ == "__main__":
    # Guarding the crawl keeps network I/O out of import time so the two
    # helpers above can be imported; running the file as a script is unchanged.
    html = urlopen("http://www.pythonscraping.com")
    bsObj = BeautifulSoup(html, "html.parser")
    # Every tag carrying a src attribute.
    downloadList = bsObj.find_all(src=True)
    for download in downloadList:
        fileUrl = getAbsoluteURL(baseUrl, download["src"])
        if fileUrl is not None:
            print(fileUrl)
            # Renamed from "dir", which shadowed the builtin.
            savePath = getDownLoadPath(baseUrl, fileUrl, downloadDirectory)
            print("save: " + savePath)
            urlretrieve(fileUrl, savePath)

  

上一篇:转:移动端html页面优化


下一篇:Stones(优先队列)