import re
import ssl
import requests
import urllib3
from w3lib.html import remove_tags
# Browser-like User-Agent so the target site does not reject the scraper outright.
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
}
# Globally disable TLS certificate verification and silence the resulting
# urllib3 InsecureRequestWarning.
# NOTE(review): this weakens transport security for every HTTPS request made
# by this process — confirm that is intentional.
ssl._create_default_https_context = ssl._create_unverified_context
urllib3.disable_warnings()
# HTML character entity -> literal character table.
# NOTE(review): the original table had been mangled by an entity-decoding
# pass (keys such as "&lt;" had collapsed into a literal "<", and "&quot;"
# into a bare quote — a syntax error).  The table below restores the
# intended mapping; confirm against the original source if available.
CHAR_ENTITIES = {
    "&nbsp;": "",
    "&ensp;": "",
    "&emsp;": "",
    "&lt;": "<",
    "&gt;": ">",
    "&amp;": "&",
    "&quot;": '"',
    "&ldquo;": "“",
    "&rdquo;": "”",
    "&copy;": "©",
    "&reg;": "®",
    "&times;": "×",
    "&divide;": "÷",
}


def replace_entity(text):
    """Replace known HTML character entities in *text* with literal characters.

    Args:
        text: HTML fragment possibly containing character entities.

    Returns:
        The fragment with every entity in ``CHAR_ENTITIES`` substituted,
        applied in table order.
    """
    # Simple sequential replacement; the original rebuilt key/value lists
    # and called .index() per key, which was O(n) per lookup.
    for entity, replacement in CHAR_ENTITIES.items():
        text = text.replace(entity, replacement)
    return text
# Keep only <span>/<p>/<div>/<br>, strip styling, rewrite div/span to p.
def filter_tag(html_str):
    """Normalise scraped article HTML down to bare ``<p>`` markup.

    Keeps only p/span/div/br tags, strips style/class/id/align attributes
    and all whitespace, rewrites span and div tags into p tags, collapses
    runs of repeated <p> / </p>, swaps the source site's branding for ours,
    and finally decodes HTML character entities.

    Args:
        html_str: raw HTML fragment extracted from the article page.

    Returns:
        The cleaned HTML string, with doubled '>' collapsed and edges trimmed.
    """
    # Drop every tag except the structural ones we keep.
    html_str = remove_tags(html_str, which_ones=(), keep=("p", 'span', 'div', 'br'))
    # Whitespace runs (newline, tab, space).  \s+ instead of the original
    # \s*, which matched the empty string at every position.
    re_blank = r'\s+'
    # Inline presentation attributes to strip (all non-greedy).
    re_style = r'style=".*?"'
    re_class = r'class=".*?"'
    re_id = r'id=".*?"'
    # Fix: the original greedy align=".*" could swallow everything between
    # the first align=" and the LAST double quote (whitespace is already
    # stripped, so the whole document may be one line by this point).
    re_align = r'align=".*?"'
    # Runs of repeated <p> / </p> tags, e.g. <p><p><p> -> <p>.
    re_p_pre_repeat = "<p[><p]+p>"
    re_p_next_repeat = "</p[/p<>]+/p"
    html_str = re.sub(re_style, "", html_str)
    html_str = re.sub(re_blank, "", html_str)
    html_str = re.sub(re_class, "", html_str)
    html_str = re.sub(re_id, "", html_str)
    html_str = re.sub(re_align, "", html_str)
    # Re-brand the source site's name/domain, then normalise span/div to p.
    html_str = (html_str.replace("翡翠王朝", "九玉网")
                .replace("www.jaadee.com", "www.91yu.com")
                .replace("<span>", "<p>")
                .replace("</span>", "</p>")
                .replace("<div>", "<p>")
                .replace("</div>", "</p>"))
    html_str = re.sub(re_p_pre_repeat, "<p>", html_str)
    html_str = re.sub(re_p_next_repeat, "</p>", html_str)
    html_str = replace_entity(html_str)
    # Collapse doubled '>' left over from tag stripping and trim both ends.
    return html_str.replace(">>", ">").strip()
def get_data(content_url_list, domain):
    """Fetch one article (possibly split across pages) and build a record.

    Args:
        content_url_list: URLs of the article's pages; the first is used as
            the canonical article URL.
        domain: site domain stored alongside the record.

    Returns:
        ``[title, cleaned_content, url, domain, askreocrd]``.  Title is ""
        and content is empty when every page fails to download or parse
        (the original crashed with IndexError in that case).
    """
    info = {
        "url": content_url_list[0],
        "askreocrd": 0,
        "domain": domain,
    }
    title_list = []
    content_all_list = []
    for page_url in content_url_list:
        try:
            # timeout added: the original could hang forever on a dead host.
            resp = requests.get(page_url, headers=headers, timeout=15)
            resp.encoding = "utf-8"
            title = re.findall('<h1 class="post-title".*?>(.*?)<i.*?</h1>', resp.text)[0].strip()
            title_list.append(title)
            content_list = re.findall(
                r'<div class="entry" itemprop="articleBody">.*?<div class="clear"></div>(.*?)<div class="page5">',
                resp.text, re.S)
            content_all_list.append(content_list)
        except Exception as e:
            # Best-effort scraping: a failed page is skipped, not fatal.
            print(e)
    # First successfully parsed title wins; "" when every page failed.
    info["title"] = title_list[0] if title_list else ""
    # Flatten the per-page match lists, join, then strip ALL commas
    # (including ones inside the content — original behaviour preserved).
    flattened = [part for page in content_all_list for part in page]
    info["content"] = ','.join(flattened).replace(",", "")
    info_list = [info["title"], info["content"], info["url"], info["domain"], info["askreocrd"]]
    info_list[1] = filter_tag(info_list[1])
    return info_list
# Script entry point: scrape a two-page article from ydce.com and print the
# assembled record.  NOTE(review): this runs network requests at import time;
# consider guarding with `if __name__ == "__main__":`.
url = 'http://www.ydce.com'
domain = url.replace("http://", "")
content_url_list = ['http://www.ydce.com/news/27906.html', 'http://www.ydce.com/news/27906_2.html']
record = get_data(content_url_list, domain)
print(record)