思路:
1.查看网页源代码
2.抓取单页内容
3.使用正则表达式
4.抓取猫眼电影TOP100所有信息
查看网页源代码
使用Chrome浏览器:
抓取单页内容
通过以下代码获取网页源代码
import requests
from requests.exceptions import RequestException
import re
import os
# HTTP headers that main() hands to get_page() for every board-page request.
# NOTE(review): presumably these make the request look like it comes from a
# desktop Chrome browser visiting the board page - confirm they are still
# required by the site.
headers = {
    'Referer': 'https://maoyan.com/board/4',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/80.0.3987.132 Safari/537.36'
    ),
}
def get_page(url, headers, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        headers: HTTP headers forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The response text when the status code is 200; ``None`` for any
        other status code or when ``requests`` raises ``RequestException``
        (which includes timeouts).
    """
    # Only the network call can raise RequestException, so keep the try small.
    try:
        res = requests.get(url=url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if res.status_code == 200:
        return res.text
    return None
使用正则表达式提取信息
def parse_page(html):
    """Extract the movie entries from the source of one board page.

    Args:
        html: Page source as returned by ``get_page``. May be ``None`` or an
            empty string when the download failed; nothing is yielded then.

    Yields:
        One dict per ``<dd>...</dd>`` entry matched by the pattern, with the
        string fields ``index``, ``image``, ``title``, ``actor``, ``time``
        and ``score``.
    """
    if not html:
        # get_page() returns None on failure; re.findall would raise
        # TypeError on a non-string argument.
        return
    # Raw strings keep ``\d`` a regex escape instead of an invalid string
    # escape sequence. re.S lets ``.`` span the newlines inside an entry.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S,
    )
    for index, image, title, star, release, integer, fraction in pattern.findall(html):
        yield {
            'index': index,
            'image': image,
            'title': title,
            # Drops the first 3 characters after trimming whitespace
            # (presumably the cast label in front of the names - confirm
            # against the live page).
            'actor': star.strip()[3:],
            # Drops the first 5 characters after trimming whitespace
            # (presumably the release-time label - confirm).
            'time': release.strip()[5:],
            # The page splits the score into an integer and a fraction part.
            'score': integer + fraction,
        }
将爬取的信息写入文件,这里采用的是写入txt文本的方式
def write_file(txt):
    """Append ``txt`` as one line to ``result.txt`` in the working directory.

    Args:
        txt: Object to record; it is written with ``print``, so its ``str()``
            form followed by a newline ends up in the file.
    """
    # The context manager closes the file even if print() raises, which the
    # manual open()/close() pair did not guarantee.
    with open('result.txt', 'a', encoding='utf-8') as data:
        print(txt, file=data)
将电影所对应的图片进行下载
def save_picture(url, path, timeout=10):
    """Download ``url`` and store the response body at ``path``.

    The ``picture`` directory is created in the working directory if it does
    not exist yet. Nothing is written unless the server answers with status
    code 200.

    Args:
        url: Address of the image to download.
        path: Destination file; opened in binary write mode.
        timeout: Seconds to wait for the server before ``requests`` raises,
            so that a stalled download cannot hang the crawl forever.
    """
    # exist_ok avoids the race between a separate exists() check and makedirs().
    os.makedirs('picture', exist_ok=True)
    res = requests.get(url=url, timeout=timeout)
    if res.status_code != 200:
        return
    # The with-block closes the file on exit; no explicit close() is needed.
    with open(path, 'wb') as f:
        f.write(res.content)
    print(path, '下载成功')
代码实现如下
import requests
from requests.exceptions import RequestException
import re
import os
# Request headers passed by main() to get_page() on each page fetch.
# NOTE(review): they look like a desktop Chrome browser coming from the board
# page - verify whether the site still needs them.
headers = {
    'Referer': 'https://maoyan.com/board/4',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/80.0.3987.132 Safari/537.36'
    ),
}
def get_page(url, headers, timeout=10):
    """Fetch one page and return its source text.

    Args:
        url: Address of the page to request.
        headers: HTTP headers sent along with the request.
        timeout: Seconds ``requests.get`` may wait for the server. The call
            has no time limit unless one is given, so a stalled connection
            would otherwise hang the crawl.

    Returns:
        ``res.text`` when the status code is 200, otherwise ``None``.
        ``None`` is also returned when ``requests`` raises
        ``RequestException`` (timeouts included).
    """
    try:
        res = requests.get(url=url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    return res.text if res.status_code == 200 else None
def parse_page(html):
    """Yield one dict per movie entry found in a board page.

    Args:
        html: Page source text. ``None`` or an empty string (for example
            after a failed download) produces no items instead of an error.

    Yields:
        Dicts with the string fields ``index``, ``image``, ``title``,
        ``actor``, ``time`` and ``score``, one per ``<dd>...</dd>`` entry
        matched by the pattern.
    """
    if not html:
        # re.findall raises TypeError when given None.
        return
    # Raw string literals: ``\d`` must reach the regex engine untouched and
    # is an invalid escape sequence in an ordinary string literal.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S,  # entries span several lines, so "." must match newlines too
    )
    for item in pattern.findall(html):
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            # Trim whitespace, then cut the leading 3 characters
            # (presumably the cast label - confirm against the page).
            'actor': item[3].strip()[3:],
            # Trim whitespace, then cut the leading 5 characters
            # (presumably the release-time label - confirm).
            'time': item[4].strip()[5:],
            # Integer and fraction parts of the score are separate elements.
            'score': item[5] + item[6],
        }
def write_file(txt):
    """Append one record to ``result.txt``.

    Args:
        txt: Value to store. It is emitted through ``print``, i.e. as
            ``str(txt)`` plus a trailing newline, encoded as UTF-8.
    """
    # Using ``with`` guarantees the handle is released even when print()
    # fails; the previous open()/close() pair leaked it in that case.
    with open('result.txt', 'a', encoding='utf-8') as data:
        print(txt, file=data)
def save_picture(url, path, timeout=10):
    """Fetch the image at ``url`` and write its bytes to ``path``.

    Ensures a ``picture`` directory exists in the working directory first.
    The file is only written when the response status code is 200.

    Args:
        url: Address of the image.
        path: File to write the raw response content to.
        timeout: Seconds to wait for the server; bounds the request so a
            stalled download raises instead of blocking indefinitely.
    """
    # A single call replaces the exists()/makedirs() pair and its race window.
    os.makedirs('picture', exist_ok=True)
    res = requests.get(url=url, timeout=timeout)
    if res.status_code == 200:
        # Leaving the with-block closes the file; an extra close() is redundant.
        with open(path, 'wb') as f:
            f.write(res.content)
        print(path, '下载成功')
def main(offset):
    """Crawl one board page: record every entry and download its picture.

    Args:
        offset: Value placed in the ``offset`` query parameter of the board
            URL.
    """
    url = "https://maoyan.com/board/4?offset=" + str(offset)
    html = get_page(url, headers)
    if html is None:
        # get_page() returns None when the request fails or the status is
        # not 200; parsing None would raise TypeError and abort the crawl.
        print(url, '抓取失败')
        return
    for item in parse_page(html):
        write_file(item)
        # The title comes from the scraped page; a path separator in it would
        # point into a directory that does not exist.
        safe_title = item['title'].replace('/', '_').replace('\\', '_')
        save_picture(item['image'], 'picture/' + safe_title + '.jpg')
if __name__ == '__main__':
    # Ten calls to main(), with offsets 0, 10, ..., 90.
    for offset in range(0, 100, 10):
        main(offset)
运行结果