Day06 - 20210601 - Web Scraping and Multithreading

1. Writing Excel files

import openpyxl

# 1. Get a workbook object (a workbook corresponds to one Excel file)
# Create a new workbook (it has one worksheet by default)
# work_book = openpyxl.Workbook()
# Open an existing file
# work_book = openpyxl.load_workbook(file_path)


# work_book = openpyxl.Workbook()
work_book = openpyxl.load_workbook('files/test2.xlsx')

# 2. Get the names of all worksheets
all_names = work_book.sheetnames
print(all_names)

# 3. Get a worksheet by name
# sheet = work_book['Sheet']

# 4. Create a new worksheet
# workbook.create_sheet(sheet_name, index)
# work_book.create_sheet('学生表')
# work_book.create_sheet('学生表2', 0)

# 5. Remove a worksheet
# workbook.remove(sheet_object)
# work_book.remove(work_book[sheet_name])

# 6. Rename a worksheet
# sheet = work_book['学生表2']
# sheet = work_book.active
# sheet.title = 'Student'

# 7. Write data into a cell
# cell.value              -  read the cell's content
# cell.value = new_value  -  overwrite the cell's content
sheet = work_book['Student']

# 1) Getting a cell, method 1
# worksheet.cell(row: int, column: int)
cell1 = sheet.cell(1, 1)
# Overwrite the cell's content
# cell1.value = '姓名'

# 2) Getting a cell, method 2: index the sheet with an 'A1'-style string
cell2 = sheet['B1']
# cell2.value = '年龄'

# Clear the cell (assigning None also empties it)
cell2.value = ''


# 8. Save the file (changes reach disk only when save() is called)
work_book.save('./files/test2.xlsx')
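
For writing several rows at once, openpyxl's append() is handy; a minimal sketch (the file name files/students.xlsx is made up for this example):

import openpyxl

wb = openpyxl.Workbook()
sheet = wb.active
sheet.title = 'Student'

# append() writes one whole row per call, filling cells left to right
sheet.append(['name', 'age'])
sheet.append(['Tom', 18])
sheet.append(['Jerry', 20])

# nothing reaches disk until save() is called
wb.save('files/students.xlsx')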


2. Reading Excel files

import openpyxl

# 1. Open the file
wb = openpyxl.load_workbook('files/test1.xlsx')

# 2. Get a worksheet
# sheet = wb['学生表']
sheet = wb.active

# 3. Get cells
# 1) Get a single cell
# sheet.cell(row, column)  -  row and column numbers both start at 1
# sheet[position]          -  position is a string like 'A1' or 'B2'; the letter is the column, the number is the row
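
# For example, both calls below refer to the same cell
# (sheet is already defined above; each prints None if the cell is empty):
print(sheet.cell(1, 1).value)
print(sheet['A1'].value)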

# 2) Get cell objects row by row
# worksheet.iter_rows(min_row, max_row, min_col, max_col)
cells = sheet.iter_rows(1, 4, 1, 4)
print(list(cells))

row_4 = sheet.iter_rows(4, 4)
print(list(row_4))

cells = sheet.iter_rows(2, 4, 1, 2)
print(list(cells))

# 3) Get cell objects column by column
cells = sheet.iter_cols(1, 4, 1, 4)
print(list(cells))

all_scores = sheet.iter_cols(4, 4, 2, 4)
# print(list(all_scores))   # careful: this would exhaust the generator
# next() pulls the first (and here only) column out of the generator
for score_cell in next(all_scores):
    print(score_cell.value)
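
If you only need the values rather than Cell objects, iter_rows also accepts a values_only flag (available in openpyxl 2.6+); a minimal sketch reusing the sheet opened above:

# values_only=True yields plain tuples of values instead of Cell objects
for row in sheet.iter_rows(min_row=1, max_row=4, values_only=True):
    print(row)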

3. Selenium options

from selenium import webdriver
# from selenium.webdriver import ChromeOptions

url = 'https://www.jd.com'

# 1. Create an options object
options = webdriver.ChromeOptions()

# 2. Add option parameters
# 1) Hide the "Chrome is being controlled by automated test software" banner
options.add_experimental_option('excludeSwitches', ['enable-automation'])

# 2) Disable image loading (speeds up scraping)
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})

b = webdriver.Chrome(options=options)
b.get(url)
print(b.page_source)
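
Two more options that often come in handy. Note that add_argument calls, like the add_experimental_option calls above, must happen before webdriver.Chrome(options=options) is created; the User-Agent string here is just an example value:

# Run Chrome without opening a window (useful on servers)
options.add_argument('--headless')
# Send a regular browser User-Agent with every request
options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36')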




4. Switching into an iframe

from selenium import webdriver

url = 'https://mail.163.com/'

b = webdriver.Chrome()
b.get(url)

"""
有的时候会遇到这样的网页:一个网页对应的html标签嵌套了其他的html标签
(前端如果要实现嵌套的功能必须要将被嵌套的html放在iframe标签中),
如果需要爬取网页内容在嵌套的html里面,需要先让浏览器选中内容嵌套的html。
(浏览器对象默认选中的是最外面的html标签)
"""
# 1. Find the iframe tag that holds the embedded html
box = b.find_element_by_css_selector('#loginDiv>iframe')

# 2. Switch into the iframe
b.switch_to.frame(box)

print(b.page_source)
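
Once you are done inside the iframe, switch_to.default_content() returns the driver's focus to the outermost html document:

# Switch back to the top-level html
b.switch_to.default_content()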

5. Multithreading

# A process has one thread by default, called the main thread; every other
# thread (created manually) is a child thread.
# To use child threads in Python, create objects of the Thread class.

import time
from datetime import datetime
from threading import Thread
# Thread - the thread class; each Thread object is one child thread


def download(name):
    print(f'{name}: download started at {datetime.now()}')
    time.sleep(2)
    print(f'{name}: download finished at {datetime.now()}')


# 1. Downloading three movies in one thread (the main thread) takes about 6 seconds
# download('肖申克的救赎')
# download('霸王别姬')
# download('阿甘正传')

# 2. Download the three movies in three separate child threads
# 1) Create the thread objects
t1 = Thread(target=download, args=('肖申克的救赎',))
t2 = Thread(target=download, args=('霸王别姬',))
t3 = Thread(target=download, args=('阿甘正传',))

# 2) Start the threads
t1.start()
t2.start()
t3.start()
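
To confirm the downloads really overlap, wait for all three child threads with join() and time it; because the three 2-second sleeps run concurrently, the wait is roughly 2 seconds instead of 6:

start = time.time()
# join() blocks until the given thread has finished
for t in (t1, t2, t3):
    t.join()
print(f'all downloads done in about {time.time() - start:.1f}s')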

Homework

"""
Time: 2021/6/1 18:51
Author:Liu Bowen
"""
import requests
import time
from datetime import datetime
from re import findall
from json import loads
from threading import Thread

def get_proxy_ips():
    api = 'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=3ee6f035175f4b508d8a825da0fb3833&count=5&expiryDate=0&format=2&newLine=3'
    response = requests.get(api)
    if response.status_code == 200:
        if response.text[0] == '{':
            print('Failed to get proxies: extracting too frequently!')
        else:
            return response.text.split('\n')[:-1]
    else:
        print('Proxy IP request failed!')


def get_net_data_ip(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }
    while True:
        # Fetch 5 proxy IPs from the API
        ips = get_proxy_ips()
        # If none came back, wait a moment and retry
        if not ips:
            print('Failed to get proxy IPs!')
            time.sleep(1)
            continue

        # Take two IPs per round; popping twice from a one-element list
        # would raise IndexError, hence the >= 2 guard
        while len(ips) >= 2:
            ip1 = ips.pop()
            ip2 = ips.pop()
            print(ip1, ip2)
            proxies = {
                'http': ip1,
                'https': ip2
            }
            try:
                response = requests.get(url, headers=headers, proxies=proxies, timeout=3)
                if response.status_code == 200:
                    # print(response.text)
                    return response.text
                else:
                    print('Data request failed!')
            except (requests.exceptions.ProxyError, requests.exceptions.ConnectTimeout, requests.exceptions.ReadTimeout):
                print('Timed out, trying the next proxy')

def get_net_data(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    return response.text

def analysis_data(data):
    result = findall(r'window.__SEARCH_RESULT__\s=\s(\{.*?\})</script>', data)[0]
    result = loads(result)
    jobs = []
    for job in result['engine_search_result']:
        job_name = job.get('job_name', '')
        job_company = job.get('company_name', '')
        job_companytype = job.get('companytype_text', '')
        job_welf = job.get('jobwelf', '')
        # collect every job on the page; a return inside the loop would
        # stop after the first job
        jobs.append((job_name, job_company, job_companytype, job_welf))
    return jobs

def spider_51job(page1: int, page2: int):
    print(f'pages {page1}-{page2}: scraping started at {datetime.now()}')
    for page in range(page1, page2 + 1):
        url = f'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590,2,{page}.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
        data = get_net_data(url)
        # print the parsed jobs so each page's result is visible
        print(analysis_data(data))
    print(f'pages {page1}-{page2}: scraping finished at {datetime.now()}')
# if __name__ == '__main__':
#     spider_51job(1,1)
t1 = Thread(target=spider_51job, args=(1, 200))
t2 = Thread(target=spider_51job, args=(201, 400))
t3 = Thread(target=spider_51job, args=(401, 600))
t1.start()
t2.start()
t3.start()
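
Worth adding at the end: join the three threads so the main thread only reports completion (or runs any follow-up code, such as saving results to Excel) after all 600 pages have been scraped:

# Wait for all three page ranges to finish
for t in (t1, t2, t3):
    t.join()
print(f'all pages done at {datetime.now()}')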