Crawler Day 8


1. JSON

XML came before JSON as a data-interchange format; JSON is the lightweight format that has largely succeeded it.

Informally, a JSON document is just a dictionary or a list.

Formatting rules: 1. Comments are not allowed. 2. Keys (and string values) must be double-quoted. 3. No trailing commas. 4. The whole file contains exactly one top-level {} or []. The sketch below shows these rules being enforced.
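A minimal sketch (with made-up values) of how Python's json module enforces these rules:

import json

json.loads('{"name": "张三", "age": 20}')      # valid: double quotes, no trailing comma
# json.loads("{'name': '张三'}")               # single quotes -> json.JSONDecodeError
# json.loads('{"name": "张三",}')              # trailing comma -> json.JSONDecodeError
# json.loads('{"a": 1} {"b": 2}')             # two top-level values -> json.JSONDecodeError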

The json module's four core functions (loads, dumps, load, dump) are demonstrated below:

import json

# 1. Convert between JSON strings and dict/list

# JSON string ---> dict/list
data = '[{"name":"张三","age":20},{"name":"李四","age":22}]'
list_data = json.loads(data)
print(data)
print(list_data)

# dict/list ---> JSON string
list2 = [{"name":"张三","age":20},{"name":"李四","age":22}]
data_json = json.dumps(list2)
print(data_json)

# 2. Convert between file objects and dict/list

# dict/list ---> JSON string ---> written to a file
list2 = [{"name":"张三","age":20},{"name":"李四","age":22}]
str_data = json.dumps(list2)
with open('02json.json','w',encoding='utf-8') as f:
    f.write(str_data)

# dict/list ---> written straight to a file
list3 = [{"name":"张三","age":20},{"name":"李四","age":22}]
# fp stands for file pointer: json.dump expects an open file object, not a path
with open('02new.json','w') as fp:
    json.dump(list3, fp)

# read the file back: JSON ---> list/dict
with open('02new.json','r') as fp:
    result = json.load(fp)
print(result)
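Worth noting alongside these examples: json.dumps escapes non-ASCII characters by default, which is why the Chinese names come out as \uXXXX sequences in the file. A quick sketch of the two keyword arguments that control this:

import json

list2 = [{"name": "张三", "age": 20}]
print(json.dumps(list2))
# [{"name": "\u5f20\u4e09", "age": 20}]  <- non-ASCII escaped by default
print(json.dumps(list2, ensure_ascii=False, indent=2))
# keeps 张三 readable and pretty-prints with 2-space indentation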

2. CSV

# part2: csv
import json
import csv
# Goal: convert the JSON file into a CSV file

# 1. Open the source file and create the target file
json_fp = open('02new.json','r')
# newline='' prevents blank rows on Windows; utf-8 keeps the Chinese names intact
csv_fp = open('03csv.csv','w',newline='',encoding='utf-8')
# 2. Extract the header row and the body rows
data_list = json.load(json_fp)
sheet_title = data_list[0].keys()
sheet_data = []
for data in data_list:
    sheet_data.append(data.values())

# 3. Create a CSV writer
writer = csv.writer(csv_fp)
# 4. Write the header
writer.writerow(sheet_title)
# 5. Write the rows
writer.writerows(sheet_data)
# 6. Close both files
json_fp.close()
csv_fp.close()
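Since each row is already a dict, csv.DictWriter is an idiomatic alternative that skips the manual keys()/values() split; a sketch using the same file names as above:

import csv
import json

with open('02new.json', 'r') as json_fp, \
     open('03csv.csv', 'w', newline='', encoding='utf-8') as csv_fp:
    data_list = json.load(json_fp)
    writer = csv.DictWriter(csv_fp, fieldnames=list(data_list[0].keys()))
    writer.writeheader()          # header row comes from fieldnames
    writer.writerows(data_list)   # each dict becomes one CSV row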

Fetch the brief description of each book on https://www.allitebooks.in/page and write it to a txt file.

import requests
import random
from lxml import etree

class BookSpider(object):
    def __init__(self):
        self.base_url = "https://www.allitebooks.in/page/{}"
        ua1 = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
        ua2 = 'Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1'
        ua3 = 'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Mobile Safari/537.36'
        ua4 = 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1'
        self.headers = {
            'User-Agent': random.choice([ua1, ua2, ua3, ua4])
        }

    # 1. Build all the URLs
    def get_url_list(self):
        url_list = []          # create the list once, outside the loop
        for i in range(1, 2):  # only page 1 for now
            url = self.base_url.format(i)
            url_list.append(url)
        return url_list
    # 2. Send the request
    def send_request(self, url):
        response = requests.get(url, headers=self.headers)
        data = response.content.decode("utf-8")
        return data

    # 3. Parse the data with XPath
    def parse_data(self, data):
        xpath_data = etree.HTML(data)
        # extract all the book titles
        book_list = xpath_data.xpath('//h3[@class="entry-title td-module-title"]/a/@title')
        print(book_list)
        print(len(book_list))
        # extract each book's summary
        summary_list = xpath_data.xpath('//div[@class="td-excerpt"]/text()')
        print(summary_list)
        print(len(summary_list))
        self.save_data(data, book_list, summary_list)

    # 4. Save the data
    def save_data(self, data, book_list, summary_list):
        f = open('book.txt', 'w', encoding='utf-8')
        for index, book in enumerate(book_list):
            f.write(book)
            f.write(summary_list[index])
            f.write("\n")
        f.close()

    # Orchestrate the run
    def start(self):
        url_list = self.get_url_list()
        # loop over the URLs and request each one
        for url in url_list:
            data = self.send_request(url)
            self.parse_data(data)


BookSpider().start()
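One practical hedge worth adding: requests.get has no timeout by default, so a slow server can hang the spider indefinitely. A sketch of a hardened send_request (the 10-second value is an arbitrary choice):

    def send_request(self, url):
        # time out after 10 seconds instead of hanging forever,
        # and raise an exception on 4xx/5xx responses
        response = requests.get(url, headers=self.headers, timeout=10)
        response.raise_for_status()
        return response.content.decode("utf-8")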

Save the book titles and brief descriptions as JSON.

import requests
import random
from lxml import etree
import json

class BookSpider(object):
    def __init__(self):
        self.base_url = "https://www.allitebooks.in/page/{}"
        ua1 = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
        ua2 = 'Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1'
        ua3 = 'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Mobile Safari/537.36'
        ua4 = 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1'
        self.headers = {
            'User-Agent': random.choice([ua1, ua2, ua3, ua4])
        }
        self.data_list = []

    # 1. Build all the URLs
    def get_url_list(self):
        url_list = []
        for i in range(1,5):
            url = self.base_url.format(i)
            url_list.append(url)
        return url_list
    # 2. Send the request
    def send_request(self,url):
        response = requests.get(url,headers = self.headers)
        data = response.content.decode("utf-8")
        print(url)
        return data

    # 3. Parse the data with XPath
    def parse_data(self, data):
        xpath_data = etree.HTML(data)
        # extract all the book titles
        book_list = xpath_data.xpath('//h3[@class="entry-title td-module-title"]/a/@title')

        # extract each book's summary and pair it with the title by index
        summary_list = xpath_data.xpath('//div[@class="td-excerpt"]//text()')
        for index, book in enumerate(book_list):
            dict_data = {}
            dict_data['name'] = book
            dict_data['summary'] = summary_list[index]
            self.data_list.append(dict_data)


    # 4. Save the data
    def save_data(self):
        fp = open('book.json', 'w', encoding='utf-8')
        json.dump(self.data_list, fp)   # dump returns None; no need to assign it
        fp.close()

    # Orchestrate the run
    def start(self):
        url_list = self.get_url_list()
        # loop over the URLs and request each one
        for url in url_list:
            data = self.send_request(url)
            self.parse_data(data)
        self.save_data()


BookSpider().start()
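Note the change from /text() in the txt version to //text() here: /text() returns only the text nodes that are direct children of the div, while //text() also descends into nested tags, so the two can return lists of different lengths (which matters when pairing summaries with titles by index). A small sketch of the difference:

from lxml import etree

node = etree.HTML('<div class="td-excerpt">intro <b>bold</b> tail</div>')
print(node.xpath('//div[@class="td-excerpt"]/text()'))    # ['intro ', ' tail']
print(node.xpath('//div[@class="td-excerpt"]//text()'))   # ['intro ', 'bold', ' tail']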

Parse out each book's node first, then loop over the nodes to extract each book's details.

import requests
import random
from lxml import etree
import json
from bs4 import BeautifulSoup

class BookSpider(object):
    def __init__(self):
        self.base_url = "https://www.allitebooks.in/page/{}"
        ua1 = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
        ua2 = 'Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1'
        ua3 = 'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Mobile Safari/537.36'
        ua4 = 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1'
        self.headers = {
            'User-Agent': random.choice([ua1, ua2, ua3, ua4])
        }
        self.data_list = []

    # 1. Build all the URLs
    def get_url_list(self):
        url_list = []
        for i in range(1,5):
            url = self.base_url.format(i)
            url_list.append(url)
        return url_list
    # 2. Send the request
    def send_request(self,url):
        response = requests.get(url,headers = self.headers)
        data = response.content.decode("utf-8")
        print(url)
        return data

    # 3. Parse the data with XPath
    def parse_data(self, data):
        xpath_data = etree.HTML(data)
        # extract every book node first
        book_list = xpath_data.xpath('//div[@class="td_module_19 td_module_wrap td-animation-stack td-meta-info-hide"]')

        # then extract each book's details relative to its own node
        for book in book_list:
            book_dict = {}

            # 1. Book title (the leading '.' anchors the query to this book's node)
            book_dict['book_name'] = book.xpath('.//h3[@class="entry-title td-module-title"]/a/@title')[0]

            # 2. Book cover image URL
            book_dict['book_img_url'] = book.xpath('.//div[@class="td-module-thumb"]/a/img/@src')[0]

            # 3. Book summary
            book_dict['book_info'] = book.xpath('.//div[@class="td-excerpt"]/text()')[0]
            print(book_dict)

            self.data_list.append(book_dict)

    # 4. Save the data
    def save_data(self):
        fp = open('book.json', 'w', encoding='utf-8')
        json.dump(self.data_list, fp)
        fp.close()

    # Orchestrate the run
    def start(self):
        url_list = self.get_url_list()
        # loop over the URLs and request each one
        for url in url_list:
            data = self.send_request(url)
            self.parse_data(data)
        self.save_data()

BookSpider().start()
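The fix worth remembering in parse_data above: when xpath is called on an element node, a query starting with '.' stays relative to that node, while a query starting with '//' always searches the whole document, even from an element (the original version queried from the root and indexed by position, which breaks as soon as the lists get out of step). A minimal sketch:

from lxml import etree

root = etree.HTML('<article><h3><a title="Book A"></a></h3></article>'
                  '<article><h3><a title="Book B"></a></h3></article>')
for book in root.xpath('//article'):
    print(book.xpath('.//h3/a/@title'))   # ['Book A'], then ['Book B']
    print(book.xpath('//h3/a/@title'))    # no '.', so always the whole document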

Parsing the data with bs4

def parse_bs4_data(self, data):
    bs4_data = BeautifulSoup(data, 'lxml')
    # 1. Pull out every book node
    book_list = bs4_data.select('article')

    # 2. Extract each book's details
    for book in book_list:
        book_dict = {}
        # 1. Book title
        book_dict['book_name'] = book.select_one('.entry-title').get_text()

        # 2. Book cover image URL
        book_dict['book_img_url'] = book.select_one('.attachment-post-thumbnail').get('src')

        # 3. Book author (slice off the leading "By " prefix)
        book_dict['book_author'] = book.select_one('.entry-author').get_text()[3:]

        # 4. Book summary
        book_dict['book_info'] = book.select_one('.entry-summary p').get_text()
        print(book_dict)
        self.data_list.append(book_dict)
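One caveat with this approach: select_one returns None when nothing matches, so a chained .get_text() crashes on any page whose layout differs. A hedged pattern for the title lookup:

title_tag = book.select_one('.entry-title')
book_dict['book_name'] = title_tag.get_text() if title_tag is not None else ''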

My own version, written following the approach in the video:

import requests
import random
from lxml import etree
import json
from bs4 import BeautifulSoup

class BookSpider(object):
    def __init__(self):
        self.base_url = "https://www.allitebooks.in/page/{}"
        ua1 = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
        ua2 = 'Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1'
        ua3 = 'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Mobile Safari/537.36'
        ua4 = 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1'
        self.headers = {
            'User-Agent': random.choice([ua1, ua2, ua3, ua4])
        }
        self.data_list = []

    # 1. Build all the URLs
    def get_url_list(self):
        url_list = []
        for i in range(1,5):
            url = self.base_url.format(i)
            url_list.append(url)
        return url_list
    # 2. Send the request
    def send_request(self,url):
        response = requests.get(url,headers = self.headers)
        data = response.content.decode("utf-8")
        print(url)
        return data

    # 3. Parse the data with XPath
    def parse_data(self, data):
        xpath_data = etree.HTML(data)
        # extract every book node first
        book_list = xpath_data.xpath('//div[@class="td_module_19 td_module_wrap td-animation-stack td-meta-info-hide"]')

        # then extract each book's details relative to its own node
        for book in book_list:
            book_dict = {}

            # 1. Book title (the leading '.' anchors the query to this book's node)
            book_dict['book_name'] = book.xpath('.//h3[@class="entry-title td-module-title"]/a/@title')[0]

            # 2. Book cover image URL
            book_dict['book_img_url'] = book.xpath('.//div[@class="td-module-thumb"]/a/img/@src')[0]

            # 3. Book summary
            book_dict['book_info'] = book.xpath('.//div[@class="td-excerpt"]/text()')[0]
            print(book_dict)

            self.data_list.append(book_dict)

    def parse_bs4_data(self, data):
        bs4_data = BeautifulSoup(data, 'lxml')
        # 1. Pull out every book node
        book_list = bs4_data.select('article')

        # 2. Extract each book's details
        for book in book_list:
            book_dict = {}
            # 1. Book title: '.entry-title td-module-title' (with a space) is a
            #    descendant selector and matches nothing; grab the <a> inside the
            #    heading and read its title attribute instead
            book_dict['book_name'] = book.select_one('.entry-title a').get('title')

            # 2. Book cover image URL
            book_dict['book_img_url'] = book.select_one('.attachment-post-thumbnail').get('src')

            # 3. Book author (slice off the leading "By " prefix)
            book_dict['book_author'] = book.select_one('.entry-author').get_text()[3:]

            # 4. Book summary
            book_dict['book_info'] = book.select_one('.entry-summary p').get_text()
            print(book_dict)
            self.data_list.append(book_dict)



    # 4. Save the data
    def save_data(self):
        fp = open('book.json', 'w', encoding='utf-8')
        json.dump(self.data_list, fp)
        fp.close()

    # Orchestrate the run
    def start(self):
        url_list = self.get_url_list()
        # loop over the URLs and request each one
        for url in url_list:
            data = self.send_request(url)
            self.parse_data(data)
        self.save_data()

BookSpider().start()
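As written, start() only exercises the XPath parser; to run the bs4 version instead, swap the call inside the loop:

        for url in url_list:
            data = self.send_request(url)
            self.parse_bs4_data(data)   # BeautifulSoup parser instead of XPath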