1. Basic Modules
The re module
1. Purpose: provides regular-expression support.
2. Common methods
(1) compile(): builds a compiled Pattern object from a string containing a regular expression.
import re
string = "A1.45, b5, 6.45, 8.82"
regex = re.compile(r"\d+\.?\d*")
print(regex.findall(string))
(2) match(): attempts a match starting at the index given by pos (default 0); returns a Match object on success, or None if nothing matches.
(3) group() and groups(): the former returns the substring captured by one or more groups; the latter returns all captured substrings as a tuple.
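A small sketch of match() with pos together with group()/groups(); the pattern and sample string here are made up for illustration:
import re
pattern = re.compile(r'([a-z]+) (\d+)')
# match() anchors at pos (default 0); here it matches "abc 123" at the start
m = pattern.match('abc 123 def')
print(m.group())    # 'abc 123'        (the whole match)
print(m.group(1))   # 'abc'            (first captured group)
print(m.groups())   # ('abc', '123')   (all groups as a tuple)
# Starting the match at pos=8 fails (no "letters digits" there), so None is returned
print(pattern.match('abc 123 def', 8))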
(4) search(): scans the string and returns the first match (a single match).
import re
pattern = re.compile(r"\d+")
my_str = 'one12twothree34four'
# Search the whole string for the first match
m = pattern.search(my_str)
# Search again, restricted to the slice [10, 30)
m = pattern.search(my_str, 10, 30)
print(m)
print(m.group())
(5) findall(): scans the whole string and returns every match as a list.
import re
# Find integers
pattern = re.compile(r'\d+')
result1 = pattern.findall('hello 123456 789')
# pos/endpos restrict the search to the slice [0, 10)
result2 = pattern.findall('one1two2three3four4', 0, 10)
print(result1)
print(result2)
# Find floating-point numbers
pattern = re.compile(r'\d+\.\d*')
result = pattern.findall("123.141, 'big', 232312, 3.14")
for item in result:
    print(item)
(6) finditer(): like findall(), but returns an iterator that yields the matches one at a time as Match objects.
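A minimal sketch of finditer(), reusing the digit pattern from the example above:
import re
pattern = re.compile(r'\d+')
# finditer() yields Match objects lazily instead of building a list
for m in pattern.finditer('one12twothree34four'):
    print(m.group(), m.span())   # e.g. 12 (3, 5), then 34 (13, 15)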
(7) split(): splits the string at every match of the pattern and returns the pieces as a list.
import re
p = re.compile(r'[\s\,\;]+')
result = p.split('a,b;; c d')
print(result)
(8) sub(): substitutes matches with a replacement string or with the result of a function.
import re
p = re.compile(r'(\w+) (\w+)')
s = 'hello 123, hello 456'
# Replace every match with a fixed string
print(p.sub(r'hello world', s))
# Swap the two groups using backreferences
print(p.sub(r'\2 \1', s))
# Replace using a function that receives the Match object
def func(m):
    return 'hi' + ' ' + m.group(2)
print(p.sub(func, s))
# count=1 limits the replacement to the first match
print(p.sub(func, s, 1))
The lxml module
1. Purpose: parses HTML/XML documents and extracts data from them.
2. Installation: pip install lxml
3. XPath syntax: a query language for locating information in XML and HTML documents
| Expression | Description |
| --- | --- |
| nodename | Selects all child nodes of the named node |
| / | Selects from the root node |
| // | Selects matching nodes anywhere in the document, regardless of position |
| . | Selects the current node |
| .. | Selects the parent of the current node |
| @ | Selects attributes |
| * | Matches any element node |
| @* | Matches any attribute node |
| [@attrib] | Selects all elements that carry the given attribute |
| [@attrib='value'] | Selects all elements whose attribute equals the given value |
| tag | Selects direct child elements with the given tag |
| tag='text' | Selects elements with the given tag whose text equals 'text' |
4. Note: when scraping with Python, parse the markup into an element tree first (etree.HTML()).
5. Examples:
(1) Parsing an HTML string with XPath
# Import the lxml etree module
from lxml import etree
# Build an XPath-capable element tree from an HTML string (xxx is your own HTML string)
html = etree.HTML(xxx)
# Serialize the parsed tree back out
result = etree.tostring(html, encoding='utf-8')
(2) Parsing an HTML file with XPath
from lxml import etree
# Parse the XML/HTML file
htmlEmt = etree.parse('xxx.xml')
# pretty_print produces nicely indented output
result = etree.tostring(htmlEmt, pretty_print=True)
(3) Extracting content from specific tags
from lxml import etree
htmlEmt = etree.parse('xxx.xml')
# Get all <li> tags
result = htmlEmt.xpath("//li")
# Get all class attributes of <li> tags
result = htmlEmt.xpath("//li//@class")
# Get the <a> tags under <li> whose href is xxx.html
result = htmlEmt.xpath('//li/a[@href="xxx.html"]')
# Get all <span> tags under <li> tags
result = htmlEmt.xpath('//li//span')
# Get the class attributes under <li>/<a>, not of <li> itself
result = htmlEmt.xpath('//li/a//@class')
# Get the href of the <a> inside the last <li>
result = htmlEmt.xpath('//li[last()]/a/@href')
# Get the <a> element of the second-to-last <li>
result = htmlEmt.xpath('//li[last()-1]/a')
# Get all elements whose class is xxx
result = htmlEmt.xpath('//*[@class="xxx"]')
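The queries above return elements and attributes; to pull out the text itself, XPath's text() works the same way. A small sketch, assuming the same parsed htmlEmt as above:
# Text of every <a> that is a direct child of an <li>
result = htmlEmt.xpath('//li/a/text()')
# All text nodes anywhere below the <li> tags
result = htmlEmt.xpath('//li//text()')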
The BeautifulSoup library
1. Purpose: extracts data from web pages.
2. Installation: pip install BeautifulSoup4
3. Examples
(1) Creating the object
from bs4 import BeautifulSoup
# Parse a local HTML file with the lxml parser
soup = BeautifulSoup(open('xxx.html'), 'lxml')
print(soup.prettify())
(2) The four object types
1. Tag: an HTML tag
from bs4 import BeautifulSoup
soup = BeautifulSoup(open('xxx.html'), 'lxml')
print(soup.prettify())
# Access a Tag
print(soup.title)
# The <head> tag
print(soup.head)
# The first hyperlink
print(soup.a)
# The first paragraph
print(soup.p)
# The name attribute
print(soup.name)
# The name of the head tag
print(soup.head.name)
# Attributes of the soup object itself
print(soup.attrs)
# Attributes of soup.p
print(soup.p.attrs)
# Get a single attribute
print(soup.p['class'])
# get() also retrieves an attribute
print(soup.p.get('class'))
# Assign a new attribute value
soup.p['class'] = "newClass"
# Delete an attribute
del soup.p['class']
2. NavigableString: the text inside a tag; read it through the string attribute.
from bs4 import BeautifulSoup
soup = BeautifulSoup(open('xxx.html'), 'lxml')
print(soup.prettify())
# Print the text content of the tag
print(soup.title.string)
3. BeautifulSoup: the document object, representing the whole content of a document.
from bs4 import BeautifulSoup
soup = BeautifulSoup(open('xxx.html'), 'lxml')
# Type of the soup object
print(type(soup))
# Its name
print(soup.name)
print(soup.attrs)
4. Comment: a special NavigableString representing a comment.
from bs4 import BeautifulSoup
soup = BeautifulSoup(open('xxx.html'), 'lxml')
print(soup.a)
# .string strips the comment markers, so check the node's type first
# (see the type check sketched after this example)
print(soup.a.string)
print(type(soup.a.string))
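The type check mentioned above can be done with isinstance against bs4's Comment class; a minimal sketch, reusing the soup object from the example:
from bs4.element import Comment
# Only treat the value as normal text if it is not a comment node
if not isinstance(soup.a.string, Comment):
    print(soup.a.string)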
(3) Working with the document tree
from bs4 import BeautifulSoup
soup = BeautifulSoup(open('index.html'), 'lxml')
# Traversing the document tree
# Direct children
print(soup.head.contents)      # list of child nodes
print(soup.head.contents[0])   # first child
# The children attribute returns a generator over child nodes
print(soup.head.children)
for child in soup.p.children:
    print(child)
# The descendants attribute iterates over every node below this one
for child in soup.descendants:
    print(child)
# The strings attribute yields every piece of text in the document
for string in soup.strings:
    print(repr(string))
# The parents attribute walks up through all ancestors
content = soup.head.title.string
for parent in content.parents:
    print(parent.name)
# Searching the document tree
# Find the title tag
result = soup.find("title")
# Find by text content
result = soup.find(text="\n Tillit\n ")
# Find by tag attribute
result = soup.find(id='link2')
print(type(result))
print(result)
print(result.string)
# Find with a regular expression
import re
from bs4 import BeautifulSoup
email_id_example = """<br/>
<div>The below HTML has the information that has email ids.</div>
abc@example.com
<div>xyz@example.com</div>
<span>foo@example.com</span>
"""
soup = BeautifulSoup(email_id_example, 'lxml')
emailid_regexp = re.compile(r"\w+@\w+\.\w+")
first_email_id = soup.find(text=emailid_regexp)
print(first_email_id)
# Find with a callback function
def is_secondary_consumers(tag):
    return tag.has_attr('name') and tag.get('name') == 'dromouse'
result = soup.find(is_secondary_consumers)
print(type(result))
print(result)
print(result.string)
(4) CSS selectors
from bs4 import BeautifulSoup
soup = BeautifulSoup(open('index.html'), 'lxml')
# Select by tag name
result = soup.select("p")
# Select by class name
result = soup.select(".sister")
# Select by id
result = soup.select("#link3")
# Combined selector: #link1 elements inside <p>
result = soup.select("p #link1")
# Select by attribute
result = soup.select("[class='title']")
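select() returns a list of Tag objects, so reading text or attributes takes one more step. A small sketch, assuming at least one element matches:
# Take the first matched tag and read its text and an attribute
tags = soup.select("#link3")
if tags:
    print(tags[0].get_text())
    print(tags[0].get('href'))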
3. The _thread module
1. Purpose: spawns threads and provides basic synchronization primitives (lock objects).
2. Examples
(1) Creating threads with _thread
import _thread
from time import sleep, ctime

def loop0():
    print("loop 0 started at:", ctime())
    sleep(4)
    print("loop 0 finished at:", ctime())

def loop1():
    print("loop 1 started at:", ctime())
    sleep(2)
    print("loop 1 finished at:", ctime())

def main():
    print('starting at:', ctime())
    _thread.start_new_thread(loop0, ())
    _thread.start_new_thread(loop1, ())
    # Sleep long enough for both threads to finish
    sleep(6)
    print('all done at:', ctime())

if __name__ == '__main__':
    main()
(2) Managing threads with locks
import _thread
from time import sleep, ctime

loops = [4, 2]

def loop(nloop, sec, lock):
    print('loop', nloop, 'started at:', ctime())
    sleep(sec)
    print('loop', nloop, 'finished at:', ctime())
    # Release the lock so the main thread knows this loop is done
    lock.release()

def main():
    print('starting at:', ctime())
    locks = []
    nloops = range(len(loops))
    for i in nloops:
        # Create one lock per thread and acquire it up front
        lock = _thread.allocate_lock()
        lock.acquire()
        locks.append(lock)
    for i in nloops:
        _thread.start_new_thread(loop, (i, loops[i], locks[i]))
    for i in nloops:
        # Busy-wait until each thread releases its lock
        while locks[i].locked():
            pass
    print('all tasks finished at:', ctime())

if __name__ == '__main__':
    main()
4. The threading module
1. Purpose: higher-level thread support, including daemon threads (for example, a server thread that waits for client requests); a daemon sketch follows the examples below.
2. Examples
(1) Creating threads from a plain function
import threading
from time import sleep, ctime

loops = [4, 2]

def loop(nloop, nsec):
    print('loop', nloop, 'started at:', ctime())
    sleep(nsec)
    print('loop', nloop, 'finished at:', ctime())

def main():
    print('program started at:', ctime())
    threads = []
    nloops = range(len(loops))
    for i in nloops:
        # Create one Thread per loop, passing the function and its arguments
        t = threading.Thread(target=loop, args=(i, loops[i]))
        threads.append(t)
    for i in nloops:
        threads[i].start()   # start the threads
    for i in nloops:
        threads[i].join()    # join() makes the main thread wait until every thread has finished
    print('all tasks finished at:', ctime())

if __name__ == '__main__':
    main()
(2) Creating threads from a callable class instance
import threading
from time import sleep, ctime

loops = [4, 2]

class ThreadFunc(object):
    def __init__(self, func, args, name=''):
        self.name = name
        self.func = func
        self.args = args

    def __call__(self):
        self.func(*self.args)

def loop(nloop, nsec):
    print('loop', nloop, 'started at:', ctime())
    sleep(nsec)
    print('loop', nloop, 'finished at:', ctime())

def main():
    print('program started at:', ctime())
    threads = []
    nloops = range(len(loops))
    for i in nloops:
        # Pass a callable instance of the class as the thread target
        t = threading.Thread(target=ThreadFunc(loop, (i, loops[i]), loop.__name__))
        threads.append(t)
    for i in nloops:
        threads[i].start()   # start the threads
    for i in nloops:
        threads[i].join()    # wait for every thread to finish
    print('all tasks finished at:', ctime())

if __name__ == '__main__':
    main()
(3) Creating threads by subclassing threading.Thread
import threading
from time import sleep, ctime

loops = [4, 2]

class MyThread(threading.Thread):
    def __init__(self, func, args, name=''):
        threading.Thread.__init__(self)
        self.name = name
        self.func = func
        self.args = args

    def run(self):
        self.func(*self.args)

def loop(nloop, nsec):
    print('loop', nloop, 'started at:', ctime())
    sleep(nsec)
    print('loop', nloop, 'finished at:', ctime())

def main():
    print('program started at:', ctime())
    threads = []
    nloops = range(len(loops))
    for i in nloops:
        t = MyThread(loop, (i, loops[i]), loop.__name__)
        threads.append(t)
    for i in nloops:
        threads[i].start()
    for i in nloops:
        threads[i].join()
    print('all tasks finished at:', ctime())

if __name__ == '__main__':
    main()
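The daemon threads mentioned at the top of this section are not shown in the three examples above. A minimal sketch (the worker function here is made up): a daemon thread is killed automatically when the main thread exits, which suits background work such as a loop waiting for requests.
import threading
from time import sleep, ctime

def background_worker():
    # Runs forever; the daemon flag lets the program exit anyway
    while True:
        print('daemon heartbeat at:', ctime())
        sleep(1)

t = threading.Thread(target=background_worker, daemon=True)
t.start()
sleep(3)
print('main thread exiting; the daemon thread dies with it')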
5. The Tesseract library
1. Purpose: an open-source OCR engine that converts images to text.
2. Download: (download link)
3. Installation: pip install pytesseract
4. Examples
(1) Verifying the pytesseract module (needs a test image; if it reports that the Tesseract executable cannot be found, point pytesseract at your installation path instead of the default tesseract_cmd = 'tesseract' — see the sketch after this example)
from PIL import Image
import pytesseract
text = pytesseract.image_to_string(Image.open(r"image.png"))
print(text)
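Instead of editing pytesseract.py, the same effect can usually be achieved by setting tesseract_cmd at runtime; a sketch (the Windows path below is only an example of where Tesseract might be installed):
import pytesseract
from PIL import Image

# Point pytesseract at the tesseract executable (adjust to your installation)
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
text = pytesseract.image_to_string(Image.open("image.png"))
print(text)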
(2) Reading a CAPTCHA from the web
import pytesseract
from urllib import request
from PIL import Image
import time

for i in range(20):
    captchaUrl = "https://passport.lagou.com/vcode/create"
    # Download the CAPTCHA image to a local file
    request.urlretrieve(captchaUrl, 'captcha.png')
    image = Image.open('captcha.png')
    text = pytesseract.image_to_string(image, lang='eng')
    print(text)
    # Pause for two seconds before the next request
    time.sleep(2)
6. The PyQuery module
1. Purpose: a Python parsing library that brings jQuery-style selector syntax to HTML/XML documents.
2. Installation: pip install pyquery
3. Examples
(1) Initializing a PyQuery object from a string
from pyquery import PyQuery as pq
html = '''
<div>
<ul>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
html_query = pq(html)
print(html_query('li'))
(2) Initializing a PyQuery object from an HTML file
from pyquery import PyQuery as pq
html_query = pq(filename='xxx.html')
print(html_query('.xxx'))
(3) Initializing a PyQuery object from a URL
from pyquery import PyQuery as pq
html_query = pq(url='https://www.baidu.com', encoding="utf-8")
print(html_query('title'))
(4) Basic CSS selector usage
from pyquery import PyQuery as pq
html ='''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
html_query = pq(html)
re = html_query('#container .list li')
print(re)
print(type(re))
(5) Finding nodes
from pyquery import PyQuery as pq
html ='''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
html_query = pq(html)
items = html_query('.list')
print(type(items))
print(items)
lis = items.find('li')
print(type(lis))
print(lis)
(6) Iterating over the results
from pyquery import PyQuery as pq  # import the pyquery module
html ='''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
html_query = pq(html)
lis = html_query('li').items()
for li in lis:
    print(li)
(7) Getting attributes from matched elements
from pyquery import PyQuery as pq
html ='''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
html_query = pq(html)
# Get all hyperlinks
a = html_query('a')
for item in a.items():
    print(item.attr('href'))
(8) Getting the text of matched elements
from pyquery import PyQuery as pq
html ='''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
html_query = pq(html)
a = html_query ('[href="link3.html"]')
print(a.text())
print(a.html())
2. Crawlers
Regular expressions
Definition: also called "rule expressions", used to search for and replace text that matches a given pattern. A pattern is a rule string built from predefined characters and character combinations, and it expresses a filtering logic to apply to strings.
The Scrapy framework
1. Overview: an application framework for crawling web sites and extracting structured data, used for data mining, information processing, and archiving historical data.
2. Components: engine, scheduler, downloader, spiders, item pipelines, downloader middlewares, spider middlewares, scheduler middlewares.
3. Workflow
- The engine takes a URL from the scheduler for the next page to crawl
- The engine wraps the URL in a request and hands it to the downloader
- The downloader fetches the resource and wraps it in a response
- The spider parses the response
- Parsed items are sent to the item pipeline for further processing
- Parsed URLs are handed back to the scheduler to wait for the next crawl
4. Installation: pip install scrapy
5. Development workflow
- Create a project: scrapy startproject <name> [dir]
- Enter the project: cd xxx
- Create a spider: scrapy genspider [options] <name> <domain>
- Run the spider: scrapy crawl <spider>
- Store the output: the -o option (JSON, text, CSV, or XML format), as shown below
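For example, using the meiju spider from the walkthrough below, the scraped items could be exported to JSON with:
scrapy crawl meiju -o movies.json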
6. Example: scraping the Meijutt TV-series site with Scrapy
1. Create the project: scrapy startproject movie
2. Enter the project: cd movie
3. Create the spider: scrapy genspider meiju meijutt.com
4. Configure items.py
import scrapy

class MovieItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
5. Write meiju.py
import scrapy
from movie.items import MovieItem

class MeijuSpider(scrapy.Spider):
    name = 'meiju'
    allowed_domains = ['meijutt.com']
    start_urls = ['http://meijutt.com/new100.html']

    def parse(self, response):
        movies = response.xpath('//ul[@class="top-list fn-clear"]/li')
        for each_movie in movies:
            item = MovieItem()
            item['name'] = each_movie.xpath('./h5/a/@title').extract()[0]
            yield item
6. Configure settings.py
ITEM_PIPELINES = {
'movie.pipelines.MoviePipeline': 100,
}
Then write pipelines.py
from itemadapter import ItemAdapter

class MoviePipeline(object):
    def process_item(self, item, spider):
        # Append each movie name to a text file
        with open("my_meiju.txt", 'a', encoding='utf-8') as fp:
            fp.write(item['name'] + '\n')
        return item
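Opening the file once per item works but is wasteful; Scrapy pipelines also provide open_spider/close_spider hooks, so an alternative sketch (not part of the original example) is:
class MoviePipeline(object):
    def open_spider(self, spider):
        # Open the output file once when the spider starts
        self.fp = open("my_meiju.txt", 'a', encoding='utf-8')

    def process_item(self, item, spider):
        self.fp.write(item['name'] + '\n')
        return item

    def close_spider(self, spider):
        # Close the file when the spider finishes
        self.fp.close()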
7. Run the spider
cd movie
scrapy crawl meiju
Crawling steps
- Define the target: decide which site or scope to crawl
- Crawl: download all the relevant pages
- Extract: strip out the useless data and keep what you need
- Process: store and use the data (a minimal end-to-end sketch follows this list)
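A minimal end-to-end sketch of these four steps, reusing the title-grabbing pattern from the next section (the output file name is arbitrary):
import re
import urllib.request

# 1. Target: a single page
url = "http://www.baidu.com/"
# 2. Crawl: download the raw page
content = urllib.request.urlopen(url).read()
# 3. Extract: keep only the title text
title = re.findall(b'<title>(.*?)</title>', content)[0].decode('utf-8')
# 4. Process: store the result
with open('result.txt', 'w', encoding='utf-8') as fp:
    fp.write(title)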
Scraping pages
1. Extracting content between tags
(1) Content between <title> tags (<title>(.*?)</title>)
# Grab the page title
import re
import urllib.request

url = "http://www.baidu.com/"
# Open the URL and read the response body (bytes)
content = urllib.request.urlopen(url).read()
# Find the title
title = re.findall(b'<title>(.*?)</title>', content)
print(str(title[0], 'utf-8'))
----------------------------------------------------
# Grab the title using lookaround assertions
import re
import urllib.request

url = "http://www.baidu.com/"
content = urllib.request.urlopen(url).read()
# Pattern: the text between <title> and </title>, using lookbehind/lookahead
pat = b'(?<=<title>).*?(?=</title>)'
# Compile the pattern
ex = re.compile(pat, re.M|re.S)
# Search for the first match
obj = re.search(ex, content)
title = obj.group()
print(str(title, 'utf-8'))
(2) Extracting content between hyperlink tags (<a href=URL>...</a>)
import re
content = """
<a href="http://news.baidu.com" name="tj_trnews" class="mnav">新闻</a>
<a href="http://www.hao123.com" name="tj_trhao123" class="mnav">hao123</a>
<a href="http://map.baidu.com" name="tj_trmap" class="mnav">地图</a>
<a href="http://v.baidu.com" name="tj_trvideo" class="mnav">视频</a>
"""
# Match whole <a> elements
res = r"<a.*?href=.*?</a>"
urls = re.findall(res, content)
for u in urls:
    print(u)
# Extract the text between the hyperlink tags
res = r'<a .*?>(.*?)</a>'
texts = re.findall(res, content, re.S|re.M)
for t in texts:
    print(t)
(3) Extracting content between tr/td tags
import re
content = """
<html>
<head><title>表格</title></head>
<body>
<table border=1>
<tr><th>学号</th><th>姓名</th></tr>
<tr><td>1001</td><td>张三</td></tr>
<tr><td>1002</td><td>李四</td></tr>
</table>
</body>
</html>
"""
# Grab each table row
res = r'<tr>(.*?)</tr>'
texts = re.findall(res, content, re.S|re.M)
for m in texts:
    print(m)
# Grab the header cells from each row
for m in texts:
    res_th = r'<th>(.*?)</th>'
    m_th = re.findall(res_th, m, re.S|re.M)
    for t in m_th:
        print(t)
# Grab pairs of <td> cells directly
res = r'<td>(.*?)</td><td>(.*?)</td>'
texts = re.findall(res, content, re.S|re.M)
for m in texts:
    print(m[0], m[1])
2. Extracting attribute values from tags
(1) Extracting the URL from a hyperlink (<a href=URL>link text</a>)
import re
content = """
<a href="http://news.baidu.com" name="tj_trnews" class="mnav">新闻</a>
<a href="http://www.hao123.com" name="tj_trhao123" class="mnav">hao123</a>
<a href="http://map.baidu.com" name="tj_trmap" class="mnav">地图</a>
<a href="http://v.baidu.com" name="tj_trvideo" class="mnav">视频</a>
"""
# Match the value between href=" and " (or between single quotes)
res = r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')"
urls = re.findall(res, content, re.I|re.S|re.M)
for url in urls:
    print(url)
(2) Extracting the src of an image tag (<img src=... />)
import re
content = '''
<img alt = "Python" src="http://www.csdn.net/eastmount.jpg" />
'''
urls = re.findall('src="(.*?)"', content, re.I|re.S|re.M)
print(urls[0])
(3) Getting the last component of a URL
import re
content = '''
<img alt = "Python" src="http://www.csdn.net/eastmount.jpg" />
'''
# Split the URL on '/' and take the last component as the file name
url = 'http://www.csdn.net/eastmount.jpg'
name = url.split('/')[-1]
print(name)
(4) Extracting cell content
import re
content = '''
<tr><td>1</td><td>王二<br /></td></tr>
<tr><td>2</td><td>张三</td></tr>
<tr><td>3</td><td><B>李四</B></td></tr>
'''
res = r'<td>(.*?)</td><td>(.*?)</td>'
texts = re.findall(res, content, re.S|re.M)
for m in texts:
    print(m[0], m[1])
(5) Filtering the extracted content
import re
content = '''
<tr><td>1</td><td>王二<br /></td></tr>
<tr><td>2</td><td>张三</td></tr>
<tr><td>3</td><td><B>李四</B></td></tr>
'''
res = r'<td>(.*?)</td><td>(.*?)</td>'
texts = re.findall(res, content, re.S|re.M)
for m in texts:
    # Strip <br /> tags and spaces from both cells
    value0 = m[0].replace('<br />', '').replace(' ', '')
    value1 = m[1].replace('<br />', '').replace(' ', '')
    # If the cell is wrapped in <B>, extract the bold text
    if '<B>' in value1:
        m_value = re.findall(r'<B>(.*?)</B>', value1, re.S|re.M)
        print(value0, m_value[0])
    else:
        print(value0, value1)
3. File Handling
Working with JSON
JSON: a lightweight data-interchange format whose text representation is completely independent of any programming language; at bottom a JSON document is just a string.
(1) Converting dictionaries and lists to JSON
import json
books = [
    {
        'title': '水浒传',
        'price': 10
    },
    {
        'title': '西游记',
        'price': 10
    }
]
# dumps: converts the list to a JSON string (by default only ASCII is emitted; ensure_ascii=False turns that off)
json_str = json.dumps(books, ensure_ascii=False)
print(json_str)
(2) Dumping JSON data to a file
import json
books = [
    {
        'title': '水浒传',
        'price': 10
    },
    {
        'title': '西游记',
        'price': 10
    }
]
# dump: takes the data to store plus an open file object
with open('a.JSON', 'w') as fp:
    json.dump(books, fp)
(3) Loading a JSON string into Python objects
import json
json_str = '[{"title": "水浒传", "price": 10}, {"title": "西游记", "price": 10}]'
# loads: parses a JSON string into Python objects (the old encoding argument was removed in Python 3.9)
books = json.loads(json_str)
print(type(books))
print(books)
(4) Reading JSON from a file
import json
with open('a.JSON', 'r', encoding='utf-8') as fp:
    # load: parses JSON directly from an open file
    books = json.load(fp)
    print(books)
Working with CSV
CSV: comma-separated values (more generally, character-separated values); a plain-text format for storing tabular data.
(1) Reading a CSV file
import csv
with open('data.csv', 'r', encoding='utf-8') as fp:
    reader = csv.reader(fp)
    # The first row holds the column headers
    titles = next(reader)
    for x in reader:
        print(x)
(2) Writing data to a CSV file
import csv
headers = ['name', 'price', 'author']
values = [
    ('史记', 100, '司马迁'),
    ('儒林外史', 46, '吴敬梓'),
    ('时间简史', 60, '霍金')
]
with open('book.csv', 'w', newline='') as fp:
    writer = csv.writer(fp)
    writer.writerow(headers)
    writer.writerows(values)
(3) A combined exercise
import csv

# Read a file by column index
def read_csv_demo():
    with open('xxx.csv', 'r') as fp:
        reader = csv.reader(fp)
        next(reader)
        for x in reader:
            name = x[3]
            other = x[-1]
            print({'name': name, 'other': other})

# Read a file as dictionaries
def read_csv_demo2():
    with open('xxx.csv', 'r') as fp:
        reader = csv.DictReader(fp)
        for x in reader:
            value = {'name': x['name'], 'other': x['other']}
            print(value)

# Write rows with csv.writer
def read_csv_demo3():
    headers = ['username', 'age', 'height']
    values = [
        ('王二', '20', '140'),
        ('张三', '21', '143'),
        ('李四', '22', '142')
    ]
    with open('classroom.csv', 'w', encoding='utf-8', newline='') as fp:
        writer = csv.writer(fp)
        # Write the header row
        writer.writerow(headers)
        # Write the data rows
        writer.writerows(values)

# Write rows as dictionaries with csv.DictWriter
def read_csv_demo4():
    headers = ['username', 'age', 'height']
    values = [
        {'username': '王二', 'age': '20', 'height': '140'},
        {'username': '张三', 'age': '21', 'height': '143'},
        {'username': '李四', 'age': '22', 'height': '142'}
    ]
    with open('classroom2.csv', 'w', encoding='utf-8', newline='') as fp:
        writer = csv.DictWriter(fp, headers)
        writer.writeheader()
        writer.writerows(values)

if __name__ == '__main__':
    read_csv_demo()
    read_csv_demo2()
    read_csv_demo3()
    read_csv_demo4()
4. Practical Examples
1. Scraping Douban "now playing" movies with lxml
import requests
from lxml import etree

# Request headers
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"}
# Target URL
url = "https://movie.douban.com/cinema/nowplaying/chongqing"
# Send the request
rep = requests.get(url, headers=headers)
# The response body as a Unicode string
text = rep.text
# Parse into an HTML element tree
html = etree.HTML(text)
# Locate the <ul> element that holds the movie list
ul = html.xpath("//ul[@class='lists']")[0]
# All <li> elements under that <ul>
lis = ul.xpath("./li")
movies = []
for li in lis:
    title = li.xpath("@data-title")[0]
    score = li.xpath("@data-score")[0]
    region = li.xpath("@data-region")[0]
    actors = li.xpath("@data-actors")[0]
    director = li.xpath("@data-director")[0]
    liimg = li.xpath(".//img/@src")
    # Collect the fields into a dictionary
    movie = {
        "title": title,
        "score": score,
        "region": region,
        "actors": actors,
        "director": director,
        "liimg": liimg,
    }
    movies.append(movie)
print(movies)
2. Scraping Qiushibaike
import requests
import re

# Qiushibaike crawler class
class Spider:
    def loadPage(self, page):
        url = "https://www.qiushibaike.com/hot/page/" + str(page) + "/"
        user_agent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
        headers = {"User-Agent": user_agent}
        req = requests.get(url, headers=headers)
        # Extract each joke inside <div class="content"><span>...</span>
        pattern = re.compile(r'<div class="content">\n<span>(.*?)</span>', re.S)
        item_list = pattern.findall(req.text)
        return item_list

    # Print one page of results
    def printOnePage(self, item_list, page):
        print("********* Page %d scraped ******" % page)
        for item in item_list:
            print("===============")
            print(item)

if __name__ == "__main__":
    print("Press Enter to start")
    input()
    mySpider = Spider()
    mySpider.printOnePage(mySpider.loadPage(1), 1)
3. Scraping Douban Top 250 with multiple processes (Process)
from multiprocessing import Process, Queue
import time
from lxml import etree
import requests

class DouBanSpider(Process):
    def __init__(self, url, q):
        # Call the parent __init__ before adding our own attributes
        super(DouBanSpider, self).__init__()
        self.url = url
        self.q = q
        # Request headers
        self.headers = {
            'HOST': 'movie.douban.com',
            'Referer': 'https://movie.douban.com/top250?start=225&filter=',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36',
        }

    def run(self):
        # Parse the page assigned to this process
        self.parse_page()

    # Send the request and return the page source
    def send_request(self, url):
        i = 0
        # Retry up to three times on failure
        while i <= 3:
            try:
                print(u'requesting url: ' + url)
                return requests.get(url=url, headers=self.headers).content
            except Exception as e:
                print(u'%s%s' % (e, url))
                i += 1

    # Parse the page source with XPath and push title/score pairs onto the queue
    def parse_page(self):
        response = self.send_request(self.url)
        html = etree.HTML(response)
        # One page of movie entries
        node_list = html.xpath("//div[@class='info']")
        for move in node_list:
            # Movie title
            title = move.xpath('.//a/span/text()')[0]
            # Movie score
            score = move.xpath('.//div[@class="bd"]//span[@class="rating_num"]/text()')[0]
            # Put the result on the queue
            self.q.put(score + "\t" + title)

def main():
    q = Queue()
    base_url = 'https://movie.douban.com/top250?start='
    # The Top 250 is paged 25 at a time: start = 0, 25, ..., 225
    url_list = [base_url + str(num) for num in range(0, 225 + 1, 25)]
    # Keep track of the processes
    Process_list = []
    # Create and start one process per page
    for url in url_list:
        p = DouBanSpider(url, q)
        p.start()
        Process_list.append(p)
    # Wait for every child process to finish
    for i in Process_list:
        i.join()
    while not q.empty():
        print(q.get())

if __name__ == '__main__':
    start = time.time()
    main()
    print('elapsed: %s' % (time.time() - start))
4. Scraping Douban Top 250 with multiple threads (Thread)
from threading import Thread
from queue import Queue
import time
from lxml import etree
import requests

class DouBanSpider(Thread):
    def __init__(self, url, q):
        # Call the parent __init__ before adding our own attributes
        super(DouBanSpider, self).__init__()
        self.url = url
        self.q = q
        self.headers = {
            'Cookie': 'll="118282"; bid=ctyiEarSLfw; ps=y; __yadk_uid=0Sr85yZ9d4bEeLKhv4w3695OFOPoedzC; dbcl2="155150959:OEu4dds1G1o"; as="https://sec.douban.com/b?r=https%3A%2F%2Fbook.douban.com%2F"; ck=fTrQ; _pk_id.100001.4cf6=c86baf05e448fb8d.1506160776.3.1507290432.1507283501.; _pk_ses.100001.4cf6=*; __utma=30149280.1633528206.1506160772.1507283346.1507290433.3; __utmb=30149280.0.10.1507290433; __utmc=30149280; __utmz=30149280.1506160772.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=223695111.1475767059.1506160772.1507283346.1507290433.3; __utmb=223695111.0.10.1507290433; __utmc=223695111; __utmz=223695111.1506160772.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); push_noty_num=0; push_doumail_num=0',
            'Host': 'movie.douban.com',
            'Referer': 'https://movie.douban.com/top250?start=225&filter=',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36',
        }

    def run(self):
        self.parse_page()

    def send_request(self, url):
        i = 0
        while i <= 3:  # retry up to three times on failure
            try:
                print(u"requesting url: " + url)
                html = requests.get(url=url, headers=self.headers).content
            except Exception as e:
                print(u'%s%s' % (e, url))
                i += 1
            else:
                return html

    def parse_page(self):
        # Parse the page source with XPath and push title/score pairs onto the queue
        response = self.send_request(self.url)
        html = etree.HTML(response)
        node_list = html.xpath("//div[@class='info']")
        for move in node_list:
            title = move.xpath('.//a/span/text()')[0]  # movie title
            score = move.xpath('.//div[@class="bd"]//span[@class="rating_num"]/text()')[0]  # score
            self.q.put(score + "\t" + title)  # queue each movie's title and score

def main():
    # A queue to collect the data produced by the threads
    q = Queue()
    base_url = 'https://movie.douban.com/top250?start='
    # Build the URL of every page: start = 0, 25, ..., 225
    url_list = [base_url + str(num) for num in range(0, 225 + 1, 25)]
    # Keep track of the threads
    Thread_list = []
    # Create and start one thread per page
    for url in url_list:
        p = DouBanSpider(url, q)
        p.start()
        Thread_list.append(p)
    # Wait for every thread to finish
    for i in Thread_list:
        i.join()
    while not q.empty():
        print(q.get())

if __name__ == "__main__":
    start = time.time()
    main()
    print('elapsed: %s' % (time.time() - start))
5. Scraping the Weibo hot-search list with PyQuery
from pyquery import PyQuery as pq
import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
}
url = "https://s.weibo.com/top/summary"
response = requests.request('get', url, headers=headers)
content_all = response.content.decode('utf-8')
doc = pq(content_all)
# The hot-search entries live in the second column of the table body
items = doc('tbody')
content = items('.td-02').items()
for c in content:
    name = c('a').text()
    print(name)