requests+正则爬取豆瓣图书

 #requests+正则爬取豆瓣图书

 import requests
import re def get_html(url):
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36 LBBROWSER'}
response = requests.get(url,headers=headers)
html = response.text
return html def get_books(url): html = get_html(url)
pattern = re.compile(r'<li.*?cover.*?href="(.*?)".*?title="(.*?)".*?more-meta.*?author">(.*?)</span>.*?year">(.*?)</span>.*?</li>',re.S)
result = re.findall(pattern,html)
for rs in result:
link,book,name,data = rs
book = re.sub('\s','',book)#可用sub去掉换行空白等 print(link,book,name.strip(),data.strip())#也可用strip去掉换行空白 if __name__ == '__main__': url = 'https://book.douban.com/'
get_books(url)
上一篇:hdu 5497 Inversion 树状数组 逆序对,单点修改


下一篇:JavaScript栈和队列