"Python 3.7 Web Scraping Quick Start" study notes

1. Basic Modules
2. Crawling
3. File Handling
4. Practical Examples


1. Basic Modules

re module

1. Purpose: provides support for regular expressions

2. Common methods
(1) compile(): builds a pattern object from a string containing a regular expression and returns the compiled pattern object

import re
string = "A1.45, b5, 6.45, 8.82"
regex = re.compile(r"\d+\.?\d*")
print(regex.findall(string))

(2) match(): tries to match the pattern starting at the index given by pos; returns a Match object on success, or None if there is no match

(3) group() and groups(): the former returns the string captured by one or more groups; the latter returns all captured group strings as a tuple
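
A minimal sketch (not from the book) showing match() together with group() and groups():

import re
pattern = re.compile(r"(\w+) (\w+)")
m = pattern.match("hello world, hello python")  # match from the start of the string
print(m.group())     # the whole match: 'hello world'
print(m.group(1))    # the first group: 'hello'
print(m.group(2))    # the second group: 'world'
print(m.groups())    # all groups as a tuple: ('hello', 'world')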

(4) search(): scans the string for the first position where the pattern matches; a single match

import re
pattern = re.compile(r"\d+")
my_str = 'one12twothree34four'
m = pattern.search(my_str)
m = pattern.search(my_str, 10, 30)
print(m)
print(m.group())

(5) findall(): scans the whole string and returns every match as a list

import re
# find integers
pattern = re.compile(r'\d+')
result1 = pattern.findall('hello 123456 789')
result2 = pattern.findall('one1two2three3four4', 0, 10)
print(result1)
print(result2)
# find floating-point numbers
pattern = re.compile(r'\d+\.\d*')
result = pattern.findall("123.141, 'big', 232312, 3.14")
for item in result:
    print(item)

(6) finditer(): like findall(), but returns an iterator that yields a Match object for each result
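
A short sketch of finditer() (not from the book), reusing the numeric pattern from above:

import re
pattern = re.compile(r'\d+')
# iterate over Match objects one at a time instead of building a full list
for m in pattern.finditer('one1two2three3four4'):
    print(m.group(), m.span())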

(7) split(): splits the string at every match of the pattern and returns the pieces as a list

import re
p = re.compile(r'[\s,;]+')
result = p.split('a,b;; c  d')
print(result)

(8) sub(): substitution based on the regular expression

import re
p = re.compile(r'(\w+) (\w+)')
s = 'hello 123, hello 456'
print(p.sub(r'hello world', s))
print(p.sub(r'\2 \1', s))
def func(m):
    return 'hi' + ' ' + m.group(2)
print(p.sub(func, s))
print(p.sub(func, s, 1))

lxml module

1. Purpose: parses HTML/XML and extracts data

2. Install: pip install lxml

3. XPath syntax: a query language for locating information in XML and HTML documents

nodename            selects all child nodes of the named node
/                   selects from the root node
//                  selects matching nodes anywhere below the current node, regardless of position
.                   selects the current node
..                  selects the parent of the current node
@                   selects an attribute
*                   matches any element node
@*                  matches any attribute
[@attrib]           selects all elements that carry the given attribute
[@attrib='value']   selects all elements whose attribute equals the given value
[tag]               selects elements that have a direct child element named tag
[tag='text']        selects elements whose child element tag has the text "text"

4. Note: when scraping with Python, convert the page source to an element tree first (etree.HTML())

5. Examples

(1) Parse an HTML string with XPath

# import etree from the lxml module
from lxml import etree
# build an XPath-ready element tree (xxx is an HTML string you define yourself)
html = etree.HTML(xxx)
# serialize the parsed tree back to HTML source
result = etree.tostring(html, encoding='utf-8')

(2) Parse an HTML file with XPath

from lxml import etree
# parse the XML/HTML file
htmlEmt = etree.parse('xxx.xml')
# pretty_print gives nicely formatted output
result = etree.tostring(htmlEmt, pretty_print=True)

(3) Extract content from specific tags

from lxml import etree
htmlEmt = etree.parse('xxx.xml')
# get all <li> tags
result = htmlEmt.xpath("//li")
# get all class attributes under <li> tags
result = htmlEmt.xpath("//li//@class")
# get the <a> tags under <li> whose href is xxx.html
result = htmlEmt.xpath('//li/a[@href="xxx.html"]')
# get all <span> tags under <li> tags
result = htmlEmt.xpath('//li//span')
# get class attributes under <li>/<a>, excluding the <li> tags themselves
result = htmlEmt.xpath('//li/a//@class')
# get the href of the <a> inside the last <li>
result = htmlEmt.xpath('//li[last()]/a/@href')
# get the content of the second-to-last <li>'s <a>
result = htmlEmt.xpath('//li[last()-1]/a')
# get all tags whose class is xxx
result = htmlEmt.xpath('//*[@class="xxx"]')
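
To extract the text inside a node rather than the node object itself, append /text() to the expression; a small sketch continuing from the example above:

# text of every <a> under an <li> tag
result = htmlEmt.xpath('//li/a/text()')
print(result)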

BeautifulSoup library

1. Purpose: extracts data from web pages

2. Install: pip install BeautifulSoup4

3. Examples

(1) Create an object

from bs4 import BeautifulSoup
# parse a local HTML file with the lxml parser
soup = BeautifulSoup(open('xxx.html'), 'lxml')
print(soup.prettify())

(2) The four kinds of objects

1. Tag: an HTML tag

from bs4 import BeautifulSoup
soup = BeautifulSoup(open('xxx.html'), 'lxml')
print(soup.prettify())

# get tags by name
print(soup.title)
# get the <head> tag
print(soup.head)
# get the first hyperlink
print(soup.a)
# get the first paragraph tag
print(soup.p)

# the name attribute
print(soup.name)
# name of the head tag
print(soup.head.name)
# attributes of soup
print(soup.attrs)
# attributes of soup.p
print(soup.p.attrs)
# get a single attribute
print(soup.p['class'])
# get an attribute with get()
print(soup.p.get('class'))
# assign an attribute
soup.p['class'] = "newClass"
# delete an attribute
del soup.p['class']

2. NavigableString: use the string attribute to get the text inside a tag

from bs4 import BeautifulSoup
soup = BeautifulSoup(open('xxx.html'), 'lxml')
print(soup.prettify())
# print the text inside the tag
print(soup.title.string)

3. BeautifulSoup (the document object): represents the full content of a document

from bs4 import BeautifulSoup
soup = BeautifulSoup(open('xxx.html'), 'lxml')
# print the type of soup
print(type(soup))
# print soup's name
print(soup.name)
print(soup.attrs)

4. Comment: a special kind of NavigableString

from bs4 import BeautifulSoup
soup = BeautifulSoup(open('xxx.html'), 'lxml')
print(soup.a)
# the comment markers may be stripped from the output, so check the type first to see whether it really is a comment
print(soup.a.string)
print(type(soup.a.string))
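
A minimal sketch of that type check (assuming the first <a> in xxx.html wraps an HTML comment):

from bs4 import BeautifulSoup
from bs4.element import Comment
soup = BeautifulSoup(open('xxx.html'), 'lxml')
if isinstance(soup.a.string, Comment):
    # it is a comment, so the <!-- --> markers were stripped
    print("comment:", soup.a.string)
else:
    print("ordinary text:", soup.a.string)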

(3) Navigating and searching the document tree

from bs4 import BeautifulSoup
soup = BeautifulSoup(open('index.html'), 'lxml')

# traversing the document tree
# get child nodes directly
print(soup.head.contents)       # print the child nodes
print(soup.head.contents[0])    # get the first child

# use the children attribute to iterate over child nodes
print(soup.head.children)
for child in soup.p.children:
    print(child)

# use the descendants attribute to get every node below
for child in soup.descendants:
    print(child)

# use the strings attribute to get all pieces of text
for string in soup.strings:
    print(repr(string))

# use the parents attribute to get all ancestor nodes
content = soup.head.title.string
for parent in content.parents:
    print(parent.name)


# searching the document tree
# find the title tag
result = soup.find("title")
# find by text
result = soup.find(text="\n    Tillit\n   ")
# find by tag attribute
result = soup.find(id='link2')
print(type(result))
print(result)
print(result.string)

# find with a regular expression
import re
from bs4 import BeautifulSoup
email_id_example = """<br/>
<div>The below HTML has the information that has email ids.</div>
abc@example.com
<div>xyz@example.com</div>
<span>foo@example.com</span>
"""
soup = BeautifulSoup(email_id_example, 'lxml')
emailid_regexp = re.compile(r"\w+@\w+\.\w+")
first_email_id = soup.find(text=emailid_regexp)
print(first_email_id)

# find with a callback function
def is_secondary_consumers(tag):
    return tag.has_attr('name') and tag.get('name') == 'dromouse'
result = soup.find(is_secondary_consumers)
print(type(result))
print(result)
print(result.string)
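
find() returns only the first match; find_all() returns every match as a list. A minimal sketch (not from the book):

from bs4 import BeautifulSoup
soup = BeautifulSoup(open('index.html'), 'lxml')
# all hyperlinks in the document
links = soup.find_all("a")
# at most two paragraph tags
paragraphs = soup.find_all("p", limit=2)
print(len(links), len(paragraphs))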

(4) CSS selectors

from bs4 import BeautifulSoup
soup = BeautifulSoup(open('index.html'), 'lxml')

# select by tag name
result = soup.select("p")
# select by class name
result = soup.select(".sister")
# select by id
result = soup.select("#link3")
# combined (descendant) selector
result = soup.select("p #link1")
# select by attribute
result = soup.select("[class='title']")
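
select() returns a list of Tag objects, so the text still has to be pulled out of each one; a minimal sketch continuing from the example above:

for tag in soup.select("p"):
    # get_text() returns the text inside the tag
    print(tag.get_text())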

_thread module

1. Purpose: spawns threads and provides a basic synchronization primitive (the lock object)

2. Examples

(1) Create multiple threads with _thread

import _thread
from time import sleep, ctime

def loop0():
    print("loop 0 started at: ", ctime())
    sleep(4)
    print("loop 0 finished at: ", ctime())

def loop1():
    print("loop 1 started at: ", ctime())
    sleep(2)
    print("loop 1 finished at: ", ctime())

def main():
    print('starting at: ', ctime())
    _thread.start_new_thread(loop0, ())
    _thread.start_new_thread(loop1, ())
    sleep(6)
    print('all done at:', ctime())

if __name__ == '__main__':
    main()

(2) Use locks to manage threads

import _thread
from time import sleep, ctime
loops = [4, 2]

def loop(nloop, sec, lock):
    print('loop', nloop, 'started at: ', ctime())
    sleep(sec)
    print('loop', nloop, 'finished at: ', ctime())
    # release the lock
    lock.release()

def main():
    print('started at:', ctime())
    locks = []
    nloops = range(len(loops))
    for i in nloops:
        lock = _thread.allocate_lock()
        lock.acquire()
        locks.append(lock)
    for i in nloops:
        _thread.start_new_thread(loop, (i, loops[i], locks[i]))
    for i in nloops:
        while locks[i].locked():
            pass
    print('all tasks finished at:', ctime())

if __name__ == '__main__':
    main()

threading module

1. Purpose: higher-level thread support, including daemon threads (for example, a server thread that waits for client requests)
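
None of the examples below actually use a daemon thread, so here is a minimal sketch (not from the book); a daemon thread is killed automatically when the main thread exits:

import threading
from time import sleep, ctime

def background_task():
    # runs until the main thread exits
    while True:
        print('daemon thread running at:', ctime())
        sleep(1)

t = threading.Thread(target=background_task, daemon=True)
t.start()
sleep(3)  # the main thread works for three seconds
print('main thread done; the daemon thread exits with it')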

2. Examples

(1) Create multiple threads by passing a function as the target

import threading
from time import sleep, ctime

loops = [4, 2]

def loop(nloop, nsec):
    print('loop', nloop, 'started at:', ctime())
    sleep(nsec)
    print('loop', nloop, 'finished at:', ctime())

def main():
    print('program started at:', ctime())
    threads = []
    nloops = range(len(loops))
    for i in nloops:
        t = threading.Thread(target=loop, args=(i, loops[i]))  # create one thread per loop
        threads.append(t)
    for i in nloops:
        threads[i].start()  # start every thread
    for i in nloops:
        threads[i].join()  # join() makes the main thread wait until every thread has finished
    print('all tasks finished at:', ctime())

if __name__ == '__main__':
    main()

(2) Create multiple threads by passing an instance of a callable class

import threading
from time import sleep, ctime

loops = [4, 2]

class ThreadFunc(object):
    def __init__(self, func, args, name=''):
        self.name = name
        self.func = func
        self.args = args

    def __call__(self):
        self.func(*self.args)

def loop(nloop, nsec):
    print('loop', nloop, 'started at:', ctime())
    sleep(nsec)
    print('loop', nloop, 'finished at:', ctime())

def main():
    print('program started at:', ctime())
    threads = []
    nloops = range(len(loops))
    for i in nloops:
        # pass an instance of the callable class as the target
        t = threading.Thread(target=ThreadFunc(loop, (i, loops[i]), loop.__name__))
        threads.append(t)
    for i in nloops:
        threads[i].start()  # start every thread
    for i in nloops:
        threads[i].join()   # join() makes the main thread wait until every thread has finished
    print('all tasks finished at:', ctime())

if __name__ == '__main__':
    main()

(3) Create threads by subclassing Thread

import threading
from time import sleep, ctime

loops = [4, 2]

class MyThread(threading.Thread):
    def __init__(self, func, args, name=''):
        threading.Thread.__init__(self)
        self.name = name
        self.func = func
        self.args = args

    def run(self):
        self.func(*self.args)

def loop(nloop, nsec):
    print('loop', nloop, 'started at:', ctime())
    sleep(nsec)
    print('loop', nloop, 'finished at:', ctime())

def main():
    print('program started at:', ctime())
    threads = []
    nloops = range(len(loops))
    for i in nloops:
        t = MyThread(loop, (i, loops[i]), loop.__name__)
        threads.append(t)
    for i in nloops:
        threads[i].start()
    for i in nloops:
        threads[i].join()
    print('all tasks finished at:', ctime())

if __name__ == '__main__':
    main()

Tesseract library

1. Purpose: an open-source OCR (optical character recognition) library that converts images to text

2. Download: the Tesseract OCR engine itself must be installed separately; pytesseract is only a Python wrapper around it

3. Install: pip install pytesseract

4. Examples

(1) Verify the pytesseract module (requires a sample image; if it raises an error, edit tesseract_cmd = 'tesseract' in pytesseract.py and change it to your Tesseract install path)

from PIL import Image
import pytesseract

text = pytesseract.image_to_string(Image.open(r"image.png"))
print(text)
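
Instead of editing pytesseract.py, the Tesseract path can also be set directly in code; the path below is only an assumed Windows install location, so adjust it to your own installation:

from PIL import Image
import pytesseract

# point pytesseract at the Tesseract binary (assumed path; change it to your install)
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
text = pytesseract.image_to_string(Image.open(r"image.png"))
print(text)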

(2) Read a CAPTCHA from the web

import pytesseract
from urllib import request
from PIL import Image
import time

for i in range(20):
    captchaUrl = "https://passport.lagou.com/vcode/create"
    request.urlretrieve(captchaUrl, 'captcha.png')
    image = Image.open('captcha.png')
    text = pytesseract.image_to_string(image, lang='eng')
    print(text)
    # pause two seconds before the next request
    time.sleep(2)

PyQuery module

1. Purpose: a Python library that provides jQuery-like syntax for parsing and querying HTML/XML documents

2. Install: pip install pyquery

3. Examples

(1) Initialize a PyQuery object from a string

from pyquery import PyQuery as pq
html = '''
<div>
    <ul>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
</div>
'''
html_query = pq(html)
print(html_query('li'))

(2) Initialize a PyQuery object from an HTML file

from pyquery import PyQuery as pq
html_query = pq(filename='xxx.html')
print(html_query('.xxx'))

(3) Initialize a PyQuery object from a URL

from pyquery import PyQuery as pq
html_query = pq(url='https://www.baidu.com', encoding="utf-8")
print(html_query('title'))

(4) Basic CSS selector usage

from pyquery import PyQuery as pq
html ='''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
</div>
'''
html_query = pq(html)
re = html_query('#container .list li')
print(re)
print(type(re))

(5) Find nodes

from pyquery import PyQuery as pq
html ='''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
</div>
'''
html_query = pq(html)
items = html_query('.list')
print(type(items))
print(items)
lis = items.find('li')
print(type(lis))
print(lis)

(6) Iterate over the results and print them

from pyquery import PyQuery as pq  # import the pyquery module
html ='''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
</div>
'''
html_query = pq(html)
lis = html_query('li').items()
for li in lis:
    print(li)

(7) Get an element's attribute

from pyquery import PyQuery as pq					
html ='''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
</div>
'''
html_query = pq(html)
# get all hyperlinks
a = html_query('a')
for item in a.items():
    print(item.attr('href'))

(8) Get an element's text

from pyquery import PyQuery as pq
html ='''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
</div>
'''
html_query = pq(html)
a = html_query('[href="link3.html"]')
print(a.text())
print(a.html())

2. Crawling

Regular expressions

Definition: also known as a rule expression; used to search for and replace text that matches a given pattern (rule). A rule string built from predefined characters and character combinations expresses a filtering logic to apply to text.

Scrapy framework

1. Overview: an application framework for crawling websites and extracting structured data; used for data mining, information processing, and storing historical data

2. Components: engine, scheduler, downloader, spiders, item pipeline, downloader middlewares, spider middlewares, scheduler middlewares

3. Workflow

  1. The engine takes a URL from the scheduler for the next page to crawl
  2. The engine wraps the URL in a request and hands it to the downloader
  3. The downloader fetches the network resource and wraps it in a response
  4. The spider parses the response
  5. If items are parsed out, they are passed to the item pipeline for further processing
  6. If URLs are parsed out, they are handed back to the scheduler to wait for the next round of crawling

4. Install: pip install scrapy

5. Development workflow

  1. Create a project: scrapy startproject <name> [dir]
  2. Enter the project: cd xxx
  3. Create a spider: scrapy genspider [options] <name> <domain>
  4. Run a spider: scrapy crawl <spider>
  5. Store the results: the -o option (JSON, text, CSV, or XML format; see the example below)
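
For instance, with the meiju spider from the example that follows, the scraped items can be exported straight from the command line; Scrapy picks the output format from the file extension:

scrapy crawl meiju -o movies.json
scrapy crawl meiju -o movies.csv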

6. Example: crawling the meijutt (美剧天堂) site with Scrapy

1. Create the project: scrapy startproject movie
2. Enter the project: cd movie
3. Create the spider: scrapy genspider meiju meijutt.com
4. Configure items.py

import scrapy
class MovieItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    pass

5. Write meiju.py

import scrapy
from movie.items import MovieItem


class MeijuSpider(scrapy.Spider):
    name = 'meiju'
    allowed_domains = ['meijutt.com']
    start_urls = ['http://meijutt.com/new100.html']

    def parse(self, response):
        movies = response.xpath('//ul[@class="top-list fn-clear"]/li')
        for each_movie in movies:
            item = MovieItem()
            item['name'] = each_movie.xpath('./h5/a/@title').extract()[0]
            yield item
        pass

6. Configure settings.py

ITEM_PIPELINES = {
    'movie.pipelines.MoviePipeline': 100,
}

7. Write pipelines.py

from itemadapter import ItemAdapter
class MoviePipeline(object):
    def process_item(self, item, spider):
        with open("my_meiju.txt", 'a', encoding='utf-8') as fp:
            fp.write(item['name'] + '\n')
        return item

8. Run the spider

cd movie
scrapy crawl meiju

Crawler workflow

  1. Define the target: decide which site or scope to search
  2. Crawl: download all of the site's content
  3. Extract: strip out the useless data
  4. Process the data: store and use it

Scraping with regular expressions

1. Grabbing the content between tags

(1) Grab the content between <title> tags (<title>(.*?)</title>)

# grab the page title
import re
import urllib.request
url = "http://www.baidu.com/"
# open the URL
content = urllib.request.urlopen(url).read()
# find the title
title = re.findall(b'<title>(.*?)</title>', content)
print(str(title[0], 'utf-8'))
----------------------------------------------------
# grab the title with lookaround assertions
import re
import urllib.request
url = "http://www.baidu.com/"
content = urllib.request.urlopen(url).read()
# define the regex pattern
pat = b'(?<=<title>).*?(?=</title>)'
# compile the pattern
ex = re.compile(pat, re.M|re.S)
# search for a match
obj = re.search(ex, content)
title = obj.group()
print(str(title, 'utf-8'))

(2) Grab the content between hyperlink tags (<a href=URL>...</a>)

import re
content = """
<a href="http://news.baidu.com" name="tj_trnews" class="mnav">新闻</a>
<a href="http://www.hao123.com" name="tj_trhao123" class="mnav">hao123</a>
<a href="http://map.baidu.com" name="tj_trmap" class="mnav">地图</a>
<a href="http://v.baidu.com" name="tj_trvideo" class="mnav">视频</a>
"""
res = r"<a.*?href=.*?</a>"
urls = re.findall(res, content)
for u in urls:
    print(u)
# get the text between the <a> tags
res = r'<a .*?>(.*?)</a>'
texts = re.findall(res, content, re.S|re.M)
for t in texts:
    print(t)

(3) Grab the content between tr/th tags

import re
content = """
<html>
<head><title>表格</title></head>
<body>
    <table border=1>
        <tr><th>学号</th><th>姓名</th></tr>
        <tr><th>1001</th><th>张三</th></tr>
        <tr><th>1002</th><th>李四</th></tr>
    </table>
</body>
</html>
"""
res = r'<tr>(.*?)</tr>'
texts = re.findall(res, content, re.S|re.M)
for m in texts:
    print(m)
for m in texts:
    res_th = r'<th>(.*?)</th>'
    m_th = re.findall(res_th, m, re.S|re.M)
    for t in m_th:
        print(t)
# the sample table uses <th> cells, so match <th> pairs rather than <td>
res = r'<th>(.*?)</th><th>(.*?)</th>'
texts = re.findall(res, content, re.S|re.M)
for m in texts:
    print(m[0], m[1])

2. Grabbing parameters inside tags

(1) Grab the URL of a hyperlink tag (<a href=URL>link text</a>)

import re
content = """
<a href="http://news.baidu.com" name="tj_trnews" class="mnav">新闻</a>
<a href="http://www.hao123.com" name="tj_trhao123" class="mnav">hao123</a>
<a href="http://map.baidu.com" name="tj_trmap" class="mnav">地图</a>
<a href="http://v.baidu.com" name="tj_trvideo" class="mnav">视频</a>
"""
res = r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')"
urls = re.findall(res, content, re.I|re.S|re.M)
for url in urls:
    print(url)

(2) Grab the src of an image tag (<img src=image URL />)

import re
content = '''
<img alt = "Python" src="http://www.csdn.net/eastmount.jpg" /> 
'''
urls = re.findall('src="(.*?)"', content, re.I|re.S|re.M)
print(urls[0])

(3) Get the last component of a URL

urls = 'http://www.csdn.net/eastmount.jpg'
name = urls.split('/')[-1]
print(name)

(4) Extract cell content

import re
content = '''
<tr><td>1</td><td>王二<br /></td></tr>
<tr><td>2</td><td>张三</td></tr>
<tr><td>3</td><td><B>李四</B></td></tr>
'''
res = r'<td>(.*?)</td><td>(.*?)</td>'
texts = re.findall(res, content, re.S|re.M)
for m in texts:
    print(m[0], m[1])

(5) Filter the extracted content

import re
content = '''
<tr><td>1</td><td>王二<br /></td></tr>
<tr><td>2</td><td>张三</td></tr>
<tr><td>3</td><td><B>李四</B></td></tr>
'''
res = r'<td>(.*?)</td><td>(.*?)</td>'
texts = re.findall(res, content, re.S|re.M)
for m in texts:
    value0 = m[0].replace('<br />', '').replace(' ', '')
    value1 = m[1].replace('<br />', '').replace(' ', '')
    if '<B>' in value1:
        m_value = re.findall(r'<B>(.*?)</B>', value1, re.S|re.M)
        print(value0, m_value[0])
    else:
        print(value0, value1)

3. File Handling

JSON file handling

JSON: a lightweight data-interchange format; it stores and represents data as text that is completely independent of any programming language, and is essentially just a string

(1) Convert dicts and lists to JSON

import json
books = [
    {
        'title': '水浒传',
        'price': 10
    },
    {
        'title': '西游记',
        'price': 10
    }
]
# dumps: converts the list to a JSON string (ASCII-only by default; ensure_ascii=False turns that escaping off)
json_str = json.dumps(books, ensure_ascii=False)
print(json_str)

(2) Dump JSON data to a file

import json
books = [
    {
        'title': '水浒传',
        'price': 10
    },
    {
        'title': '西游记',
        'price': 10
    }
]
# dump: takes the data to write plus an open file object
with open('a.JSON', 'w') as fp:
    json.dump(books, fp)

(3) Load a JSON string into Python objects

import json
json_str = '[{"title": "水浒传", "price": 10}, {"title": "西游记", "price": 10}]'
# loads: parses a JSON string into Python objects
books = json.loads(json_str)
print(type(books))
print(books)

(4) Read JSON from a file

import json
with open('a.JSON', 'r', encoding='utf-8') as fp:
    json_str = json.load(fp)
print(json_str)

CSV file handling

CSV: comma-separated values (more generally, character-separated values); stores tabular data in plain-text form

(1) Read a CSV file

import csv
with open('data.csv', 'r', encoding='utf-8') as fp:
    reader = csv.reader(fp)
    titles = next(reader)
    for x in reader:
        print(x)

(2) Write data to a CSV file

import csv
headers = ['name', 'price', 'author']
values = [
    ('史记', 100, '司马迁'),
    ('儒林外史', 46, '吴敬梓'),
    ('时间简史', 60, '霍金')
]
with open ('book.csv', 'w', newline='') as fp:
    writer = csv.writer(fp)
    writer.writerow(headers)
    writer.writerows(values)

(3) A combined exercise

import csv
# read a file by column index
def read_csv_demo():
    with open('', 'r') as fp:
        reader = csv.reader(fp)
        next(reader)
        for x in reader:
            name = x[3]
            other = x[-1]
            print({'name': name, 'other': other})


# read a file with DictReader
def read_csv_demo2():
    with open('', 'r') as fp:
        reader = csv.DictReader(fp)
        for x in reader:
            value = {'name': x['name'], 'other': x['other']}
            print(value)


# write rows to a file with csv.writer
def read_csv_demo3():
    headers = ['username', 'age', 'height']
    values = [
        ('王二', '20', '140'),
        ('张三', '21', '143'),
        ('李四', '22', '142')
    ]
    with open('classroom.csv', 'w', encoding='utf-8', newline='') as fp:
        writer = csv.writer(fp)
        # write the header row
        writer.writerow(headers)
        # write the data rows
        writer.writerows(values)

# write rows to a file with DictWriter
def read_csv_demo4():
    headers = ['username', 'age', 'height']
    values = [
        {'username': '王二', 'age': '20', 'height': '140'},
        {'username': '张三', 'age': '21', 'height': '143'},
        {'username': '李四', 'age': '22', 'height': '142'}
    ]
    with open('classroom2.csv', 'w', encoding='utf-8', newline='') as fp:
        writer = csv.DictWriter(fp, headers)
        writer.writeheader()
        writer.writerows(values)

if __name__ == '__main__':
    read_csv_demo()
    read_csv_demo2()
    read_csv_demo3()
    read_csv_demo4()

4. Practical Examples

1. Scrape the Douban "now playing" page with lxml

import requests
from lxml import etree
# request headers
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"}
# request URL
url = "https://movie.douban.com/cinema/nowplaying/chongqing"
# send the request
rep = requests.get(url, headers=headers)
# response body as a Unicode string
text = rep.text
# parse into an HTML element tree
html = etree.HTML(text)
# find the <ul> node with class 'lists'
ul = html.xpath("//ul[@class='lists']")[0]
# all <li> tags under this <ul>
lis = ul.xpath("./li")
movies = []
for li in lis:
    title = li.xpath("@data-title")[0]
    score = li.xpath("@data-score")[0]
    region = li.xpath("@data-region")[0]
    actors = li.xpath("@data-actors")[0]
    director = li.xpath("@data-director")[0]
    liimg = li.xpath(".//img/@src")
    # assemble a dict for this movie
    movie = {
        "title": title,
        "score": score,
        "region": region,
        "actors": actors,
        "director": director,
        "liimg": liimg,
    }
    movies.append(movie)
print(movies)

2. Scrape the Qiushibaike (糗事百科) site

import requests
import re

# Qiushibaike spider class
class Spider:

    def loadPage(self, page):
        url = "https://www.qiushibaike.com/hot/page/" + str(page) + "/"
        user_agent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
        headers = {"User-Agent": user_agent}
        req = requests.get(url, headers=headers)
        pattern = re.compile(r'<div class="content">\n<span>(.*?)</span>', re.S)
        item_list = pattern.findall(req.text)
        return item_list


    # print one page of results
    def printOnePage(self, item_list, page):
        print("********* page %d crawled *********" % page)
        for item in item_list:
            print("===============")
            print(item)


if __name__ == "__main__":
    print("press Enter to start")
    input()
    mySpider = Spider()
    mySpider.printOnePage(mySpider.loadPage(1), 1)

3. Scrape Douban Top 250 movies with multiple processes (Process)

from multiprocessing import Process, Queue
import time
from lxml import etree
import requests


class DouBanSpider(Process):
    def __init__(self, url, q):
        # extend the parent class __init__
        super(DouBanSpider, self).__init__()
        self.url = url
        self.q = q
        # request headers
        self.headers = {
            'HOST': 'movie.douban.com',
            'Referer': 'https://movie.douban.com/top250?start=225&filter=',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36',
        }

    def run(self):
        # parse the page
        self.parse_page()

    # send the request and return the page source
    def send_request(self, url):
        i = 0
        # retry up to 3 times on error
        while i <= 3:
            try:
                print('requesting url: ' + url)
                return requests.get(url=url, headers=self.headers).content
            except Exception as e:
                print('%s%s' % (e, url))
                i += 1

    # parse the page source, extract movie titles and scores with XPath, and put them in the queue
    def parse_page(self):
        response = self.send_request(self.url)
        html = etree.HTML(response)
        # the movie entries on this page
        node_list = html.xpath("//div[@class='info']")
        for move in node_list:
            # movie title
            title = move.xpath('.//a/span/text()')[0]
            # movie score
            score = move.xpath('.//div[@class="bd"]//span[@class="rating_num"]/text()')[0]
            # put into the queue
            self.q.put(score + "\t" + title)


def main():
    q = Queue()
    base_url = 'https://movie.douban.com/top250?start='
    url_list = [base_url + str(num) for num in range(0, 225+1, 25)]
    # keep track of the processes
    Process_list = []
    # create and start the processes
    for url in url_list:
        p = DouBanSpider(url, q)
        p.start()
        Process_list.append(p)
    # the main process waits for the child processes to finish
    for i in Process_list:
        i.join()
    while not q.empty():
        print(q.get())


if __name__ == '__main__':
    start = time.time()
    main()
    print('elapsed: %s' % (time.time()-start))

4. Scrape Douban Top 250 movies with multiple threads (Thread)

from threading import Thread
from queue import Queue
import time
from lxml import etree
import requests


class DouBanSpider(Thread):
    def __init__(self, url, q):
        super(DouBanSpider, self).__init__()  # extend the parent class __init__
        self.url = url
        self.q = q
        self.headers = {
            'Cookie': 'll="118282"; bid=ctyiEarSLfw; ps=y; __yadk_uid=0Sr85yZ9d4bEeLKhv4w3695OFOPoedzC; dbcl2="155150959:OEu4dds1G1o"; as="https://sec.douban.com/b?r=https%3A%2F%2Fbook.douban.com%2F"; ck=fTrQ; _pk_id.100001.4cf6=c86baf05e448fb8d.1506160776.3.1507290432.1507283501.; _pk_ses.100001.4cf6=*; __utma=30149280.1633528206.1506160772.1507283346.1507290433.3; __utmb=30149280.0.10.1507290433; __utmc=30149280; __utmz=30149280.1506160772.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=223695111.1475767059.1506160772.1507283346.1507290433.3; __utmb=223695111.0.10.1507290433; __utmc=223695111; __utmz=223695111.1506160772.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); push_noty_num=0; push_doumail_num=0',
            'Host': 'movie.douban.com',
            'Referer': 'https://movie.douban.com/top250?start=225&filter=',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36',
        }

    def run(self):
        self.parse_page()

    def send_request(self, url):
        i = 0
        while i <= 3:  # retry up to 3 times on error
            try:
                print("requesting url: " + url)
                html = requests.get(url=url, headers=self.headers).content
            except Exception as e:
                print('%s%s' % (e, url))
                i += 1
            else:
                return html

    def parse_page(self):
        response = self.send_request(self.url)  # fetch the page source
        html = etree.HTML(response)

        # parse the source with XPath and extract movie titles and scores
        node_list = html.xpath("//div[@class='info']")
        for move in node_list:
            title = move.xpath('.//a/span/text()')[0]  # movie title
            score = move.xpath('.//div[@class="bd"]//span[@class="rating_num"]/text()')[0]  # score
            self.q.put(score + "\t" + title)  # add each movie's score and title to the queue


def main():
    # a queue to hold the data fetched by the threads
    q = Queue()
    base_url = 'https://movie.douban.com/top250?start='
    # build all page URLs
    url_list = [base_url + str(num) for num in range(0, 225 + 1, 25)]
    # keep track of the threads
    Thread_list = []
    # create and start the threads
    for url in url_list:
        p = DouBanSpider(url, q)
        p.start()
        Thread_list.append(p)
    # the main thread waits for the worker threads to finish
    for i in Thread_list:
        i.join()
    while not q.empty():
        print(q.get())


if __name__ == "__main__":
    start = time.time()
    main()
    print('elapsed: %s' % (time.time() - start))


5. Scrape the Weibo trending-topics list with PyQuery

from pyquery import PyQuery as pq
import requests
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
}
url = "https://s.weibo.com/top/summary"
response = requests.get(url, headers=headers)
content_all = response.content.decode('utf-8')
doc = pq(content_all)
items = doc('tbody')
content = items('.td-02').items()
for c in content:
    name = c('a').text()
    print(name)