爬虫DAY6

正则表达式

拆分字符串
import re

# 1) Split a string: every 's' in the input acts as a delimiter.
one = 'asetfsgbd'
result = re.split('s', one)
print(result)
匹配中文
# 2) Pull the Chinese text out of an HTML fragment.
two = '<span class="title-prefix">吴亦凡</span>'
# [\u4e00-\u9fa5] covers the common CJK ideograph range; '+' = one or more in a row.
result1 = re.findall('[\u4e00-\u9fa5]+', two)
print(result1)
爬取百度新闻
# Scrape Baidu News and pull out every anchor tag with a regular expression.
import re
import requests

url = 'http://news.baidu.com/'
header = {
    "User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
}
# Decode the raw bytes ourselves: response.text only *guesses* the charset.
# timeout=10 so a stalled server cannot hang the script forever.
data = requests.get(url, headers=header, timeout=10).content.decode("utf-8")

# Parse with a regex. Target markup looks like:
# <a href="https://news.cctv.com/..." target="_blank" class="a3" mon="ct=1&amp;a=1&amp;c=top&amp;pn=0">headline</a>
# A stricter capture-group pattern would be:
#   '<a href="(.*?)" target="_blank" mon="(.*?)">(.*?)</a>'
# re.S lets '.' also match newlines inside the tag.
pattern = re.compile('<a (.*?)</a>', re.S)
result = pattern.findall(data)
print(result)

# Keep a local copy of the page for offline inspection.
with open('02news.html', 'w', encoding='utf-8') as f:
    f.write(data)

xpath表达式

# xpath cheat sheet:
#   /                      direct child node
#   //                     any depth
#   //a[@attr="value"]     tag with an exact attribute value
#   text()                 text wrapped by the tag
#   @href                  attribute value
# xpath() always returns a list; xpath indices start at 1, and an index
# only selects among sibling (same-level) tags.
#
# Scrape Baidu News again, this time parsing with lxml's xpath
# (xpath is a query language; lxml is the library that implements it).
from lxml import etree
import requests

url = 'http://news.baidu.com/'
header = {
    "User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
}
# Decode the raw bytes ourselves: response.text only *guesses* the charset.
# timeout=10 so a stalled server cannot hang the script forever.
data = requests.get(url, headers=header, timeout=10).content.decode("utf-8")

# Target markup looks like:
# <a href="https://news.cctv.com/..." target="_blank" class="a3" mon="ct=1&amp;a=1&amp;c=top&amp;pn=0">headline</a>

# 1. Build the parse tree from the HTML string.
xpath_data = etree.HTML(data)

# 2. Query it. Other queries tried during the lesson:
#   xpath_data.xpath('/html/head/title/text()')
#   xpath_data.xpath('//a/text()')
#   xpath_data.xpath('//a[@mon="ct=1&a=2&c=top&pn=18"]/@href')
result = xpath_data.xpath('//li/a/text()')

print(result)

爬取网站http://8btc.com

#爬取网站http://8btc.com
import requests
from lxml import etree
import json

class BtcSpider(object):
    """Scrape thread titles and URLs from the 8btc forum list pages.

    Fetches pages 1-4 of forum 61, extracts every thread link via xpath,
    and dumps the collected {'name', 'url'} records to 05btc.json.
    """

    def __init__(self):
        # List pages follow the pattern  http://8btc.com/forum-61-<page>.html
        self.base_url = 'http://8btc.com/forum-61-'
        self.headers = {
            "User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
        }
        # Accumulates {'name': ..., 'url': ...} dicts across all pages.
        self.data_list = []

    # 1. Fetch one page and return its decoded HTML.
    def get_response(self, url):
        # timeout=10 so a dead server cannot hang the spider forever.
        response = requests.get(url, headers=self.headers, timeout=10)
        # The page charset can be read from its <head><meta charset>; this
        # site serves utf-8, so decode the raw bytes ourselves rather than
        # trusting response.text's guess.
        data = response.content.decode('utf-8')
        return data

    # 2. Parse one page: collect every thread title and its link.
    def parse_data(self, data):
        # 1) Convert the HTML string into an lxml parse tree.
        x_data = etree.HTML(data)
        # 2) Query it. The xpath was written by hand; a browser's
        #    "copy xpath" output usually needs fixing up.
        title_list = x_data.xpath('//a[@class="link-dark-major font-bold bbt-block"]/text()')
        url_list = x_data.xpath('//a[@class="link-dark-major font-bold bbt-block"]/@href')

        # zip() pairs each title with its href and stops at the shorter
        # list, so a stray unmatched element can no longer raise the
        # IndexError that indexing url_list[index] could.
        for index, (title, link) in enumerate(zip(title_list, url_list)):
            print(index)
            print(title)
            self.data_list.append({'name': title, 'url': link})

    # 3. Persist everything scraped so far as JSON (list -> str).
    def save_data(self):
        # ensure_ascii=False keeps the Chinese titles human-readable
        # instead of \uXXXX escapes.
        data_str = json.dumps(self.data_list, ensure_ascii=False)
        with open('05btc.json', 'w', encoding='utf-8') as f:
            f.write(data_str)

    # 4. Drive the whole run: fetch and parse pages 1..4, then save.
    def run(self):
        for i in range(1, 5):
            # 1) Build the full page URL.
            url = self.base_url + str(i) + '.html'
            print(url)
            # 2) Fetch.
            data = self.get_response(url)
            # 3) Parse.
            self.parse_data(data)
        # 4) Save once, after all pages are collected.
        self.save_data()

BtcSpider().run()
上一篇:day6


下一篇:【LeetCode/力扣】#1160-拼写单词