import requests
import re

# Scrape the Baidu results page for "今日新鲜事" (today's news) and pull the
# realtime hot-search board out of the raw HTML with regular expressions.
url = "https://www.baidu.com/s?wd=%E4%BB%8A%E6%97%A5%E6%96%B0%E9%B2%9C%E4%BA%8B"
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36"
}
response = requests.get(url, headers=header)
data = response.content.decode()
print("实时热搜")
# Sample of the anchor markup the patterns below target:
# <a href="https://www.baidu.com/s?tn=SE_PcFYBssrd_na11wmj5&wd=%E6%B9%96%E5%8D%97%E5%90%89%E6%9E%97%E8%B4%B5%E5%B7%9E%E4%BA%91%E5%8D%97%E7%9C%81%E5%A7%94%E4%B9%A6%E8%AE%B0%E8%B0%83%E6%95%B4&rsv_dl=0_left_fyb_doodle" target="_blank" class="OP_LOG_LINK" data-click="{fm:'beha'}"> <div class="c-gap-top-small c-gap-bottom-small">
# Raw strings: \s inside a plain string literal is an invalid escape sequence
# (SyntaxWarning on modern Python); the patterns themselves are unchanged.
title_rule = re.compile(r'<span class="c-index.*</span>[\s\S]?\s*(.*)[\s\S]?')
link_rule = re.compile(r'<a href="(.*?)".*?class="OP_LOG_LINK" data-click="{fm:\'beha\'}">')
searchindex_rule = re.compile(r'<div class="op-hotboard-search-index">\s*(.*)\s*<i class="opr-toplist-st c-icon c-icon-down"></i>')
titles = title_rule.findall(data)
links = link_rule.findall(data)
searchindexs = searchindex_rule.findall(data)
# Print at most the top 5; bound by the actual match counts so a layout
# change on Baidu's side degrades gracefully instead of raising IndexError.
for i in range(min(5, len(titles), len(links), len(searchindexs))):
    print(titles[i])
    print(links[i])
    print(searchindexs[i])

# ---- xpath_pc: same scrape re-implemented with lxml XPath (PC UA) ----
import requests
from lxml import etree

# Scrape the PC rendering of the Baidu results page and extract the
# hot-search board with lxml XPath; also dumps the raw page for inspection.
url = "https://www.baidu.com/s?wd=%E4%BB%8A%E6%97%A5%E6%96%B0%E9%B2%9C%E4%BA%8B"
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36"
    # "User-Agent": "Mozilla/5.0 (Linux; Android 10; SM-G975U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.93 Mobile Safari/537.36"
}
response = requests.get(url, headers=header)
print(response.status_code)
data = response.content
# Save the fetched page so the XPath expressions can be debugged offline.
with open("baidu.html", "wb") as f:
    f.write(data)
x_data = etree.HTML(data)
print(x_data)
print("实时热搜")
result_href = x_data.xpath('//*[@class="op-hotboard-hotnews-body"]//@href')
result_title_raw = x_data.xpath('//*[@id="1"]/div[1]/div/ul/li/a/div/div[1]/text()')
result_description = x_data.xpath('//*[@id="1"]/div[1]/div/ul/li/a/div[2]/text()')
# print(result_title_raw)
print(len(result_href))
# print(result_href)
# print(len(result_title_raw))
# print(result_description)
print(len(result_description))
# The title text lives in every 3rd text node starting at index 1.
# Derive the bound from the actual node count instead of the hard-coded 45,
# so a different number of results no longer raises IndexError.
result_title_final = []
for i in range(1, len(result_title_raw), 3):
    result_title_final.append(result_title_raw[i].strip())
print(result_title_final)
print(len(result_title_final))
# zip stops at the shortest list, so mismatched title/link/description
# counts (common when the page layout shifts) cannot raise IndexError.
for title, href, description in zip(result_title_final, result_href, result_description):
    print(title)
    print(href.strip())
    print(description.strip() + "\n")

# ---- xpath_mobile: same scrape against the mobile page (mobile UA) ----
import requests
from lxml import etree

# Scrape the MOBILE rendering of the Baidu results page (mobile User-Agent)
# and extract the timely-news strip with lxml XPath.
url = "https://www.baidu.com/s?wd=%E4%BB%8A%E6%97%A5%E6%96%B0%E9%B2%9C%E4%BA%8B"
header = {
    # "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36"
    "User-Agent": "Mozilla/5.0 (Linux; Android 10; SM-G975U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.93 Mobile Safari/537.36"
}
response = requests.get(url, headers=header)
data = response.content
# Save the fetched page so the XPath expression can be debugged offline.
with open("baidu.html", "wb") as f:
    f.write(data)
x_data = etree.HTML(data)
print(x_data)
print("实时热搜")
# NOTE(review): this absolute XPath is tied to the mobile DOM layout and
# will silently return [] if Baidu changes the page structure.
news_timely = x_data.xpath('//*[@id="results"]/div[1]/div/article/section/div[2]/div/a/div/div[1]/div/span[2]/text()')
print(len(news_timely))
print(news_timely)

# ---- bs4: same scrape re-implemented with BeautifulSoup ----
import requests
from bs4 import BeautifulSoup

# Scrape the Baidu results page with BeautifulSoup and print three boards:
# realtime (top 5), today's (next 5) and seven-day (next 5) hot searches.
url = "https://www.baidu.com/s?wd=%E4%BB%8A%E6%97%A5%E6%96%B0%E9%B2%9C%E4%BA%8B"
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36"
}
response = requests.get(url, headers=header)
data = response.content
# Save the fetched page so the selectors can be debugged offline.
with open("baidu.html", "wb") as f:
    f.write(data)
soup = BeautifulSoup(data, "lxml")
# Hoist the document traversals out of the loops: the original re-ran
# soup.select(...) and soup.find_all(...) on every single iteration.
titles = soup.select('.op-hotboard-hotnews-list-title')
links = soup.find_all(attrs={"class": "OP_LOG_LINK", "data-click": "{fm:'beha'}"})
# Bound every range by the actual result counts so short/missing boards
# degrade gracefully instead of raising IndexError.
limit = min(len(titles), len(links))
print("实时热搜")
for i in range(0, min(5, limit)):
    result1 = titles[i].get_text(strip=True)
    result2 = links[i].get('href')
    print(result1 + result2)
print("今日热搜")
for i in range(5, min(10, limit)):
    result1 = titles[i].get_text().replace(" ", "").replace("\n", "")
    result2 = links[i].get('href')
    print(result1 + " 链接 " + result2)
print("七日热搜")
for i in range(10, min(15, limit)):
    result1 = titles[i].get_text().replace(" ", "").replace("\n", "")
    result2 = links[i].get('href')
    print(result1 + " 链接 " + result2)