1.json
XML came before JSON; JSON is the lighter-weight data-interchange format that largely replaced it for this role.
A JSON document can be understood simply as a dictionary (object) or a list (array).
Writing rules: 1. comments are not allowed 2. keys and strings must use double quotes 3. no trailing comma after the last item 4. the file must contain exactly one top-level {} or [] (see the sketch below)
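A minimal sketch of these rules in action (the sample strings are made up for illustration):

import json

# Well-formed JSON: double quotes, no comments, no trailing comma,
# exactly one top-level object or array.
good = '[{"name": "张三", "age": 20}]'
print(json.loads(good))

# Each violation below raises json.JSONDecodeError:
for bad in ("[{'name': '张三'}]",      # single quotes instead of double
            '[{"age": 20,}]'):         # trailing comma
    try:
        json.loads(bad)
    except json.JSONDecodeError as e:
        print("rejected:", e)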
The four json methods (loads, dumps, load, dump) are demonstrated below:
import json

# 1. Converting between strings and dict/list
# JSON string ---> dict/list
data = '[{"name":"张三","age":20},{"name":"李四","age":22}]'
list_data = json.loads(data)
print(data)
print(list_data)

# dict/list ---> JSON string
list2 = [{"name":"张三","age":20},{"name":"李四","age":22}]
data_json = json.dumps(list2)
print(data_json)

# 2. Converting between file objects and dict/list
list2 = [{"name":"张三","age":20},{"name":"李四","age":22}]
str_data = json.dumps(list2)
with open('02json.json','w',encoding='utf-8') as f:
    f.write(str_data)

# Writing a dict/list straight into a file
list3 = [{"name":"张三","age":20},{"name":"李四","age":22}]
# fp is a file object (file pointer), not a file path
json.dump(list3, open('02new.json','w',encoding='utf-8'))

# Reading a JSON file ---> dict/list
fp = open('02new.json','r',encoding='utf-8')
result = json.load(fp)
print(result)
fp.close()
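One detail worth knowing when the data contains Chinese: json.dumps escapes non-ASCII characters to \uXXXX by default. A short sketch using the standard ensure_ascii parameter to keep the text readable:

import json

list2 = [{"name": "张三", "age": 20}]
print(json.dumps(list2))                      # [{"name": "\u5f20\u4e09", "age": 20}]
print(json.dumps(list2, ensure_ascii=False))  # [{"name": "张三", "age": 20}]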
2.csv
# part 2: csv
import json
import csv

# Goal: convert the JSON file into a CSV file
# 1. Open the source file and create the target file
# (newline='' prevents blank rows on Windows)
json_fp = open('02new.json','r',encoding='utf-8')
csv_fp = open('03csv.csv','w',encoding='utf-8',newline='')

# 2. Extract the header and the rows
data_list = json.load(json_fp)
sheet_title = data_list[0].keys()
sheet_data = []
for data in data_list:
    sheet_data.append(data.values())

# 3. Create a csv writer
writer = csv.writer(csv_fp)
# 4. Write the header
writer.writerow(sheet_title)
# 5. Write the rows
writer.writerows(sheet_data)
# 6. Close both files
json_fp.close()
csv_fp.close()
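The reverse conversion works the same way with csv.DictReader, which yields one dict per row keyed by the header. A minimal sketch, assuming the 03csv.csv produced above (the output name 04back.json is made up):

import json
import csv

with open('03csv.csv','r',encoding='utf-8',newline='') as csv_fp:
    # note: every value comes back as a string, e.g. age is '20'
    rows = list(csv.DictReader(csv_fp))
with open('04back.json','w',encoding='utf-8') as json_fp:
    json.dump(rows, json_fp, ensure_ascii=False)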
Fetch the brief description of each book on https://www.allitebooks.in/page and write it to a txt file.
import requests
import random
from lxml import etree

class BookSpider(object):
    def __init__(self):
        self.base_url = "https://www.allitebooks.in/page/{}"
        ua1 = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
        ua2 = 'Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1'
        ua3 = 'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Mobile Safari/537.36'
        ua4 = 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1'
        self.headers = {
            'User-Agent': random.choice([ua1, ua2, ua3, ua4])
        }

    # 1. Build all the urls
    def get_url_list(self):
        url_list = []  # initialize before the loop, or each iteration discards the previous urls
        for i in range(1, 2):
            url = self.base_url.format(i)
            url_list.append(url)
        return url_list

    # 2. Send the requests
    def send_request(self, url):
        response = requests.get(url, headers=self.headers)
        data = response.content.decode("utf-8")
        return data

    # 3. Parse the data with xpath
    def parse_data(self, data):
        xpath_data = etree.HTML(data)
        # Extract every book title
        book_list = xpath_data.xpath('//h3[@class="entry-title td-module-title"]/a/@title')
        print(book_list)
        print(len(book_list))
        # Extract every book's summary
        summary_list = xpath_data.xpath('//div[@class="td-excerpt"]/text()')
        print(summary_list)
        print(len(summary_list))
        self.save_data(data, book_list, summary_list)

    # 4. Save the data
    def save_data(self, data, book_list, summary_list):
        # with open('book.html','w',encoding='utf-8') as f:
        #     f.write(data)
        f = open('book.txt', 'w', encoding='utf-8')
        for index, book in enumerate(book_list):
            f.write(book)
            f.write(summary_list[index])
            f.write("\n")
        f.close()

    # Orchestrate the whole run
    def start(self):
        url_list = self.get_url_list()
        # Loop over the urls and request each one
        for url in url_list:
            data = self.send_request(url)
            self.parse_data(data)

BookSpider().start()
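send_request above assumes every request succeeds. A hedged sketch of a more defensive drop-in replacement (timeout and raise_for_status are standard requests features):

    # 2. Send the requests, failing loudly instead of parsing an error page
    def send_request(self, url):
        response = requests.get(url, headers=self.headers, timeout=10)
        response.raise_for_status()  # raises on 4xx/5xx status codes
        return response.content.decode("utf-8")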
Save the book names and summaries as JSON.
import requests
import random
from lxml import etree
import json

class BookSpider(object):
    def __init__(self):
        self.base_url = "https://www.allitebooks.in/page/{}"
        ua1 = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
        ua2 = 'Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1'
        ua3 = 'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Mobile Safari/537.36'
        ua4 = 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1'
        self.headers = {
            'User-Agent': random.choice([ua1, ua2, ua3, ua4])
        }
        self.data_list = []

    # 1. Build all the urls
    def get_url_list(self):
        url_list = []
        for i in range(1, 5):
            url = self.base_url.format(i)
            url_list.append(url)
        return url_list

    # 2. Send the requests
    def send_request(self, url):
        response = requests.get(url, headers=self.headers)
        data = response.content.decode("utf-8")
        print(url)
        return data

    # 3. Parse the data with xpath
    def parse_data(self, data):
        xpath_data = etree.HTML(data)
        # Extract every book title
        book_list = xpath_data.xpath('//h3[@class="entry-title td-module-title"]/a/@title')
        # Extract every book's summary (assumes exactly one text node per book)
        summary_list = xpath_data.xpath('//div[@class="td-excerpt"]//text()')
        for index, book in enumerate(book_list):
            dict_data = {}
            dict_data['name'] = book
            dict_data['summary'] = summary_list[index]
            self.data_list.append(dict_data)

    # 4. Save the data
    def save_data(self):
        # json.dump returns None, so there is nothing to assign
        with open('book.json', 'w', encoding='utf-8') as fp:
            json.dump(self.data_list, fp, ensure_ascii=False)

    # Orchestrate the whole run
    def start(self):
        url_list = self.get_url_list()
        # Loop over the urls and request each one
        for url in url_list:
            data = self.send_request(url)
            self.parse_data(data)
        self.save_data()

BookSpider().start()
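Pairing the two lists by shared index raises an IndexError as soon as one book lacks a summary text node. A sketch of a zip-based alternative (zip stops at the shorter list, so mismatched pages are skipped rather than crashing):

        for book, summary in zip(book_list, summary_list):
            self.data_list.append({'name': book, 'summary': summary})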
Parse out the block for each book first, then loop over the blocks to extract each book's details.
import requests
import random
from lxml import etree
import json
from bs4 import BeautifulSoup

class BookSpider(object):
    def __init__(self):
        self.base_url = "https://www.allitebooks.in/page/{}"
        ua1 = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
        ua2 = 'Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1'
        ua3 = 'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Mobile Safari/537.36'
        ua4 = 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1'
        self.headers = {
            'User-Agent': random.choice([ua1, ua2, ua3, ua4])
        }
        self.data_list = []

    # 1. Build all the urls
    def get_url_list(self):
        url_list = []
        for i in range(1, 5):
            url = self.base_url.format(i)
            url_list.append(url)
        return url_list

    # 2. Send the requests
    def send_request(self, url):
        response = requests.get(url, headers=self.headers)
        data = response.content.decode("utf-8")
        print(url)
        return data

    # 3. Parse the data with xpath
    def parse_data(self, data):
        xpath_data = etree.HTML(data)
        # Extract the block for every book
        book_list = xpath_data.xpath('//div[@class="td_module_19 td_module_wrap td-animation-stack td-meta-info-hide"]')
        # Extract each book's information from its own block
        for book in book_list:
            book_dict = {}
            # 1. Book title (relative xpath, scoped to this book's block)
            book_dict['book_name'] = book.xpath('.//h3[@class="entry-title td-module-title"]/a/@title')[0]
            # 2. Book cover image url
            book_dict['book_img_url'] = book.xpath('.//div[@class="td-module-thumb"]/a/img/@src')[0]
            # 3. Book summary
            book_dict['book_info'] = book.xpath('.//div[@class="td-excerpt"]/text()')[0]
            print(book_dict)
            self.data_list.append(book_dict)

    # 4. Save the data
    def save_data(self):
        with open('book.json', 'w', encoding='utf-8') as fp:
            json.dump(self.data_list, fp, ensure_ascii=False)

    # Orchestrate the whole run
    def start(self):
        url_list = self.get_url_list()
        # Loop over the urls and request each one
        for url in url_list:
            data = self.send_request(url)
            self.parse_data(data)
        self.save_data()

BookSpider().start()
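The leading dot in the './/...' expressions matters: an xpath without it, even when run on a single book element, searches the whole document again. A minimal standalone sketch of the difference:

from lxml import etree

html = etree.HTML('<div><article><a>one</a></article><article><a>two</a></article></div>')
book = html.xpath('//article')[0]
print(book.xpath('.//a/text()'))  # ['one'] - scoped to this <article>
print(book.xpath('//a/text()'))   # ['one', 'two'] - escapes to the document root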
Parsing the data with bs4
    def parse_bs4_data(self, data):
        bs4_data = BeautifulSoup(data, 'lxml')
        # 1. Take out every book
        book_list = bs4_data.select('article')
        # 2. Extract each book's information
        for book in book_list:
            book_dict = {}
            # 1. Book title
            book_dict['book_name'] = book.select_one('.entry-title').get_text()
            # 2. Book cover image url
            book_dict['book_img_url'] = book.select_one('.attachment-post-thumbnail').get('src')
            # 3. Book author ([3:] drops the leading "By " prefix)
            book_dict['book_author'] = book.select_one('.entry-author').get_text()[3:]
            # 4. Book summary
            book_dict['book_info'] = book.select_one('.entry-summary p').get_text()
            print(book_dict)
            self.data_list.append(book_dict)
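One CSS-selector pitfall that is easy to hit here: a space means "descendant", so '.entry-title td-module-title' would look for a <td-module-title> tag inside .entry-title and match nothing. To require both classes on the same element, join them without a space. A minimal sketch:

from bs4 import BeautifulSoup

soup = BeautifulSoup('<h3 class="entry-title td-module-title"><a title="T">T</a></h3>', 'lxml')
print(soup.select_one('.entry-title.td-module-title'))  # matches the <h3>
print(soup.select_one('.entry-title td-module-title'))  # None - descendant selector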
My own version, written following the video's approach.
import requests
import random
from lxml import etree
import json
from bs4 import BeautifulSoup

class BookSpider(object):
    def __init__(self):
        self.base_url = "https://www.allitebooks.in/page/{}"
        ua1 = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
        ua2 = 'Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1'
        ua3 = 'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Mobile Safari/537.36'
        ua4 = 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1'
        self.headers = {
            'User-Agent': random.choice([ua1, ua2, ua3, ua4])
        }
        self.data_list = []

    # 1. Build all the urls
    def get_url_list(self):
        url_list = []
        for i in range(1, 5):
            url = self.base_url.format(i)
            url_list.append(url)
        return url_list

    # 2. Send the requests
    def send_request(self, url):
        response = requests.get(url, headers=self.headers)
        data = response.content.decode("utf-8")
        print(url)
        return data

    # 3. Parse the data with xpath
    def parse_data(self, data):
        xpath_data = etree.HTML(data)
        # Extract the block for every book
        book_list = xpath_data.xpath('//div[@class="td_module_19 td_module_wrap td-animation-stack td-meta-info-hide"]')
        # Extract each book's information from its own block
        for book in book_list:
            book_dict = {}
            # 1. Book title (relative xpath, scoped to this block)
            book_dict['book_name'] = book.xpath('.//h3[@class="entry-title td-module-title"]/a/@title')[0]
            # 2. Book cover image url
            book_dict['book_img_url'] = book.xpath('.//div[@class="td-module-thumb"]/a/img/@src')[0]
            # 3. Book summary
            book_dict['book_info'] = book.xpath('.//div[@class="td-excerpt"]/text()')[0]
            print(book_dict)
            self.data_list.append(book_dict)

    # 3b. Parse the same data with bs4
    def parse_bs4_data(self, data):
        bs4_data = BeautifulSoup(data, 'lxml')
        # 1. Take out every book
        book_list = bs4_data.select('article')
        # 2. Extract each book's information
        for book in book_list:
            book_dict = {}
            # 1. Book title: the title attribute lives on the <a> inside the heading
            book_dict['book_name'] = book.select_one('.entry-title a').get('title')
            # 2. Book cover image url
            book_dict['book_img_url'] = book.select_one('.attachment-post-thumbnail').get('src')
            # 3. Book author ([3:] drops the leading "By " prefix)
            book_dict['book_author'] = book.select_one('.entry-author').get_text()[3:]
            # 4. Book summary
            book_dict['book_info'] = book.select_one('.entry-summary p').get_text()
            print(book_dict)
            self.data_list.append(book_dict)

    # 4. Save the data
    def save_data(self):
        # json.dump returns None, so there is nothing to assign
        with open('book.json', 'w', encoding='utf-8') as fp:
            json.dump(self.data_list, fp, ensure_ascii=False)

    # Orchestrate the whole run
    def start(self):
        url_list = self.get_url_list()
        # Loop over the urls and request each one
        for url in url_list:
            data = self.send_request(url)
            self.parse_data(data)
        self.save_data()

BookSpider().start()
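As written, start() only ever calls the xpath parser, so parse_bs4_data is never exercised. Both parsers append to the same self.data_list, so switching is a one-line change inside the loop:

        for url in url_list:
            data = self.send_request(url)
            self.parse_bs4_data(data)  # instead of self.parse_data(data)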