A Look at Multiprocessing Through a Web Crawler

Introduction

Writing English practical essays and compositions requires news material for reference, but the only good information-aggregation platform I could think of was the newspaper. So I decided to download the People's Daily (人民日报) in PDF form.

References

https://www.liaoxuefeng.com/wiki/1016959663602400/1017628290184064
https://blog.csdn.net/qq_38161040/article/details/88366427
https://blog.csdn.net/baidu_28479651/article/details/76158051?utm_source=blogxgwz7

Code: version 1

About 70% manual, 30% automatic: the folder has to be created and the download count adjusted by hand for every run.

# -*- coding: utf-8 -*-
# Download the PDF documents linked from my hand-written HTML page:
# file:///E:/ZjuTH/Documents/pythonCode/pythontest.html

import urllib.request
import re
import os

# open the url and read
def getHtml(url):
    page = urllib.request.urlopen(url)
    html = page.read()
    page.close()
    return html

# compile the regular expressions and find
# all stuff we need
def getUrl(html):
    reg = r'([A-Z]\d+)'  # matches IDs like G176200001
    url_re = re.compile(reg)
    url_lst = url_re.findall(html.decode('UTF-8'))  # list of all matches
    return url_lst
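
# Note: getHtml and getUrl are defined but never called below.
# They would combine like this to pull page IDs out of a listing page
# (hypothetical usage, not part of the original script):
#   ids = getUrl(getHtml("http://paper.people.com.cn/rmrb/"))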

def getFile(url):
    file_name = url.split('/')[-1]
    u = urllib.request.urlopen(url)
    f = open(file_name, 'wb')

    block_sz = 8192
    while True:
        buffer = u.read(block_sz)
        if not buffer:
            break

        f.write(buffer)
    f.close()
    print("Successfully downloaded " + file_name)


if __name__ == '__main__':
    # The page URLs follow a fixed pattern, for example:
    # http://paper.people.com.cn/rmrb/page/2020-03/26/02/rmrb2020032602.pdf
    # http://paper.people.com.cn/rmrb/page/2020-03/26/03/rmrb2020032603.pdf
    for i in range(20):
        if i + 1 < 10:
            getFile("http://paper.people.com.cn/rmrb/page/2020-03/07/0" + str(i + 1) + "/rmrb202003070" + str(i + 1) + ".pdf")
        else:
            getFile("http://paper.people.com.cn/rmrb/page/2020-03/07/" + str(i + 1) + "/rmrb20200307" + str(i + 1) + ".pdf")

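The duplicated if/else branches above exist only to zero-pad the page number. A minimal sketch of the same URL construction with zero-padded string formatting (page_url is a hypothetical helper, not part of the original script):

def page_url(year, month, day, page):
    # e.g. page_url(2020, 3, 7, 1) ->
    # http://paper.people.com.cn/rmrb/page/2020-03/07/01/rmrb2020030701.pdf
    return ("http://paper.people.com.cn/rmrb/page/"
            "{y:04d}-{m:02d}/{d:02d}/{p:02d}/"
            "rmrb{y:04d}{m:02d}{d:02d}{p:02d}.pdf").format(y=year, m=month, d=day, p=page)

for page in range(1, 21):
    getFile(page_url(2020, 3, 7, page))
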
Code: version 2 (automatic folder creation)

Downloading is slow, so there is a lot of waiting.

# -*- coding: utf-8 -*-
# Download the PDF documents linked from my hand-written HTML page:
# file:///E:/ZjuTH/Documents/pythonCode/pythontest.html

import urllib.request
import re
import os
import shutil

# open the url and read
def getHtml(url):
    page = urllib.request.urlopen(url)
    html = page.read()
    page.close()
    return html

# compile the regular expressions and find
# all stuff we need
def getUrl(html):
    reg = r'([A-Z]\d+)'  # matches IDs like G176200001
    url_re = re.compile(reg)
    url_lst = url_re.findall(html.decode('UTF-8'))  # list of all matches
    return url_lst

def getFile(url):
    file_name = url.split('/')[-1]
    u = urllib.request.urlopen(url)
    f = open(file_name, 'wb')

    block_sz = 8192
    while True:
        buffer = u.read(block_sz)
        if not buffer:
            break

        f.write(buffer)
    f.close()
    print("Successfully downloaded " + file_name)
    return file_name

if __name__ == '__main__':
    for i in range(29):
        data = str(i + 1)
        if i + 1 < 10:
            data = "0" + data
        folderName = "02" + data  # one folder per day, e.g. "0207"
        os.mkdir(folderName)

        for j in range(20):
            try:
                if j + 1 < 10:
                    fileName = "http://paper.people.com.cn/rmrb/page/2020-02/" + data + "/0" + str(j + 1) + "/rmrb202002" + data + "0" + str(j + 1) + ".pdf"
                else:
                    fileName = "http://paper.people.com.cn/rmrb/page/2020-02/" + data + "/" + str(j + 1) + "/rmrb202002" + data + str(j + 1) + ".pdf"
                tmp = getFile(fileName)
                shutil.move(tmp, folderName)
            except OSError:
                # Skip pages that do not exist (some issues have fewer than 20 pages).
                pass

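Downloading straight into the day's folder would remove the separate shutil.move step. A minimal sketch under the same URL scheme (save_page is a hypothetical helper; urlretrieve and makedirs are standard-library calls):

import os
import urllib.request

def save_page(url, folderName):
    # Write the PDF directly into the target folder instead of moving it later.
    os.makedirs(folderName, exist_ok=True)  # unlike os.mkdir, no error if it already exists
    file_name = url.split('/')[-1]
    urllib.request.urlretrieve(url, os.path.join(folderName, file_name))
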
Code: multiprocess download

Super satisfying.

# -*- coding: utf-8 -*-
# Download the PDF documents linked from my hand-written HTML page:
# file:///E:/ZjuTH/Documents/pythonCode/pythontest.html

import urllib.request
import re
import os
import shutil
from multiprocessing import Pool
import time

# open the url and read
def getHtml(url):
    page = urllib.request.urlopen(url)
    html = page.read()
    page.close()
    return html

# compile the regular expressions and find
# all stuff we need
def getUrl(html):
    reg = r'([A-Z]\d+)'  # matches IDs like G176200001
    url_re = re.compile(reg)
    url_lst = url_re.findall(html.decode('UTF-8'))  # list of all matches
    return url_lst

def getFile(url):
    file_name = url.split('/')[-1]
    u = urllib.request.urlopen(url)
    f = open(file_name, 'wb')

    block_sz = 8192
    while True:
        buffer = u.read(block_sz)
        if not buffer:
            break

        f.write(buffer)
    f.close()
    print("Successfully downloaded " + file_name)
    return file_name

def download(i):
    data = str(i + 1)
    if i + 1 < 10:
        data = "0" + data
    folderName = "01" + data  # one folder per day, e.g. "0107"
    os.mkdir(folderName)

    for j in range(20):
        try:
            if j + 1 < 10:
                fileName = "http://paper.people.com.cn/rmrb/page/2020-01/" + data + "/0" + str(j + 1) + "/rmrb202001" + data + "0" + str(j + 1) + ".pdf"
            else:
                fileName = "http://paper.people.com.cn/rmrb/page/2020-01/" + data + "/" + str(j + 1) + "/rmrb202001" + data + str(j + 1) + ".pdf"
            tmp = getFile(fileName)
            shutil.move(tmp, folderName)
        except OSError:
            # Skip pages that do not exist (some issues have fewer than 20 pages).
            pass

if __name__ == '__main__':
    p = Pool(31)  # one worker process per day of January 2020
    for i in range(31):
        p.apply_async(download, args=(i,))
    p.close()  # no more tasks will be submitted
    p.join()   # block until every worker has finished
    print('All subprocesses done.')

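A note on the Pool calls: close() tells the pool that no more tasks will be submitted, and join() then blocks until every queued download has finished, so the order close() then join() matters. Since downloading is I/O-bound rather than CPU-bound, a thread pool works just as well and avoids spawning 31 separate processes. A minimal sketch using the standard-library concurrent.futures with the download worker defined above:

from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=8) as pool:
    # One task per day; leaving the with-block waits for all of them to finish.
    pool.map(download, range(31))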