Regular Expressions
Splitting a String
import re

# 1. Split a string, using 's' as the delimiter
one = 'asetfsgbd'
pattern = re.compile('s')
result = pattern.split(one)
print(result)  # ['a', 'etf', 'gbd']
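For a one-off split there is no need to compile the pattern first; re.split accepts it directly:

result = re.split('s', one)
print(result)  # ['a', 'etf', 'gbd']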
Matching Chinese Characters
# 2. Match Chinese characters
two = '<span class="title-prefix">吴亦凡</span>'
# [\u4e00-\u9fa5] is the Unicode range covering common CJK characters
# + matches one or more occurrences
pattern = re.compile('[\u4e00-\u9fa5]+')
result1 = pattern.findall(two)
print(result1)  # ['吴亦凡']
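The same text can also be pulled out by matching the surrounding tags with a non-greedy group, which works for any tag content, not just Chinese (a sketch assuming a single span pair as above):

pattern = re.compile(r'<span.*?>(.*?)</span>')
print(pattern.findall(two))  # ['吴亦凡']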
Scraping Baidu News
# Scrape Baidu News
import re
import requests

url = 'http://news.baidu.com/'
header = {
    "User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
}
# response.text guesses the encoding, which is unreliable; decode the raw bytes explicitly
data = requests.get(url, headers=header).content.decode("utf-8")
# Parse the data with a regex: we want each item's title and url
# Target markup looks like:
# <a href="https://news.cctv.com/2020/11/21/ARTIRIi4qRjRWa6JPoHjUcoj201121.shtml" target="_blank" class="a3" mon="ct=1&a=1&c=top&pn=0">*出席亚太经合组织*会议并发表讲话</a>
# pattern = re.compile('<a href="(.*?)" target="_blank" mon="(.*?)">(.*?)</a>')
pattern = re.compile('<a (.*?)</a>', re.S)  # re.S lets . match newlines as well
result = pattern.findall(data)
print(result)
with open('02news.html', 'w', encoding='utf-8') as f:
    f.write(data)
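The pattern above captures everything between '<a ' and '</a>', attributes included. A tighter variant, sketched under the assumption that href is the first attribute (as in the sample line above), captures the url and the link text separately:

pattern = re.compile(r'<a href="(.*?)"[^>]*>(.*?)</a>', re.S)
for href, title in pattern.findall(data):
    print(title, href)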
XPath Expressions
# XPath syntax (each rule is exercised in the sketch below):
# 1. Child node: /
# 2. Any descendant: //
# 3. Exact tag by attribute: //a[@attr="value"]
# 4. Text wrapped by a tag: text()
# 5. Attribute value: @href
# xpath() always returns a list
# Indices in XPath start at 1 and select among sibling tags at the same level
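A minimal, self-contained demo of the rules above, run against a hypothetical inline document:

from lxml import etree

doc = etree.HTML('<div><ul><li><a href="/a" class="hot">first</a></li>'
                 '<li><a href="/b">second</a></li></ul></div>')
print(doc.xpath('/html/body/div/ul/li/a/text()'))  # 1. child steps: ['first', 'second']
print(doc.xpath('//a/text()'))                     # 2. any descendant: ['first', 'second']
print(doc.xpath('//a[@class="hot"]/text()'))       # 3. attribute match: ['first']
print(doc.xpath('//a/@href'))                      # 5. attributes: ['/a', '/b']
print(doc.xpath('//li[1]/a/text()'))               # indexing starts at 1: ['first']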
# Scrape Baidu News with XPath
# xpath() is a parsing method provided by lxml, not a standalone library
from lxml import etree
import requests

url = 'http://news.baidu.com/'
header = {
    "User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
}
# response.text guesses the encoding, which is unreliable; decode the raw bytes explicitly
data = requests.get(url, headers=header).content.decode("utf-8")
# Parse with XPath: we want each item's title and url
# Target markup looks like:
# <a href="https://news.cctv.com/2020/11/21/ARTIRIi4qRjRWa6JPoHjUcoj201121.shtml" target="_blank" class="a3" mon="ct=1&a=1&c=top&pn=0">*出席亚太经合组织*会议并发表讲话</a>
# 1. Convert the HTML string into a parse tree
xpath_data = etree.HTML(data)
# 2. Call the xpath() method; alternative queries left as comments:
# result = xpath_data.xpath('/html/head/title/text()')
# result = xpath_data.xpath('//a/text()')
# result = xpath_data.xpath('//a[@mon="ct=1&a=2&c=top&pn=18"]/@href')
result = xpath_data.xpath('//li/a/text()')
print(result)
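To pair each title with its link, query text() and @href along the same path and zip the results; this sketch assumes every matched li/a carries both text and an href, otherwise the two lists drift out of alignment:

titles = xpath_data.xpath('//li/a/text()')
urls = xpath_data.xpath('//li/a/@href')
for title, href in zip(titles, urls):
    print(title, href)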
Scraping http://8btc.com
# Scrape http://8btc.com
import requests
from lxml import etree
import json

class BtcSpider(object):
    def __init__(self):
        self.base_url = 'http://8btc.com/forum-61-'
        self.headers = {
            "User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
        }
        self.data_list = []

    # 1. Send the request
    def get_response(self, url):
        response = requests.get(url, headers=self.headers)
        # The page encoding is declared in the HTML head (meta charset);
        # this site serves utf-8
        data = response.content.decode('utf-8')
        return data
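    # (Hedged alternative if the charset were unknown in advance: let requests
    #  guess it from the body with response.encoding = response.apparent_encoding,
    #  then read response.text.)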
    # 2. Parse the data
    def parse_data(self, data):
        # Use XPath to collect every news title and url on the current page
        # 1) Convert the HTML string into a parse tree
        x_data = etree.HTML(data)
        # 2) Query by XPath path. Paths can be written by hand, or copied from
        #    the browser (right-click > Copy XPath) and then adjusted
        title_list = x_data.xpath('//a[@class="link-dark-major font-bold bbt-block"]/text()')
        url_list = x_data.xpath('//a[@class="link-dark-major font-bold bbt-block"]/@href')
        for index, title in enumerate(title_list):
            news = {}
            print(index)
            print(title)
            news['name'] = title
            news['url'] = url_list[index]
            self.data_list.append(news)
    # 3. Save the data
    def save_data(self):
        # Serialize the list to a JSON string; ensure_ascii=False keeps the
        # Chinese titles readable in the output file
        data_str = json.dumps(self.data_list, ensure_ascii=False)
        with open('05btc.json', 'w', encoding='utf-8') as f:
            f.write(data_str)
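    # (Equivalent one-step alternative: json.dump(self.data_list, f, ensure_ascii=False)
    #  writes straight to the file object without the intermediate string.)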
    # 4. Run the spider
    def run(self):
        for i in range(1, 5):
            # 1. Build the full page URL
            url = self.base_url + str(i) + '.html'
            print(url)
            # 2. Send the request
            data = self.get_response(url)
            # 3. Parse the response
            self.parse_data(data)
            # 4. Save (rewrites 05btc.json each iteration with the cumulative list)
            self.save_data()

BtcSpider().run()
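Since save_data is called on every loop iteration, 05btc.json is rewritten each time with the cumulative list. A hedged variant (the subclass name PoliteBtcSpider is hypothetical) saves once after the loop and pauses between pages to throttle the requests:

import time

class PoliteBtcSpider(BtcSpider):
    def run(self):
        for i in range(1, 5):
            url = self.base_url + str(i) + '.html'
            data = self.get_response(url)
            self.parse_data(data)
            time.sleep(1)  # pause between pages
        self.save_data()  # write 05btc.json once, after all pages are parsed

PoliteBtcSpider().run()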