爬虫_豆瓣电影top250 (正则表达式)

套路和之前一样(请求页面 → 正则解析 → 写入文件),只是多线程还没有加上。

 import requests
import re
import json

# Browser-style User-Agent string sent with every request.
headers = (
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
)

# Compiled once at import time. Captures, in order: the ranking number, the
# movie page URL (without its trailing slash), the main title and the raw
# "other names" text. re.S lets '.' span the newlines between the tags.
_MOVIE_PATTERN = re.compile(
    r'.*?<em class="">(.*?)</em>.*?href="(.*?)/">.*?"title">(.*?)</span.*?other">(.*?)</span',
    re.S,
)


def get_one_page(url):
    """Download one page and return its HTML text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as a string when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or any request-level failure).
    """
    try:
        # Pass the real User-Agent value (not the literal string 'headers')
        # and bound the wait so a stalled connection cannot hang the script.
        response = requests.get(
            url, headers={'User-Agent': headers}, timeout=10
        )
    except requests.exceptions.RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def write_txt(content):
    """Append ``content`` to ``result.txt`` as a single JSON line.

    Args:
        content: Any JSON-serialisable object. Non-ASCII characters are
            written as-is (UTF-8) instead of being escaped.
    """
    # The ``with`` block closes the file; no explicit close() is needed.
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def parse_one_page(html):
    """Yield one dict per movie entry found in ``html``.

    Args:
        html: Page source as a string. ``None`` or an empty string (for
            example after a failed download) yields nothing.

    Yields:
        Dicts with the keys ``range``, ``movie_main_page``, ``movie_title``
        and ``other_name``.
    """
    if not html:
        return
    for rank, link, title, other in _MOVIE_PATTERN.findall(html):
        yield {
            'range': rank,
            'movie_main_page': link,
            'movie_title': title,
            # Drops the first 13 characters of the stripped text; presumably
            # the leading '&nbsp;/&nbsp;' separator (13 chars) -- confirm
            # against the live page markup.
            'other_name': other.strip()[13:],
        }


def main():
    """Fetch the ten listing pages (25 entries each), print and save each entry."""
    for start in range(0, 250, 25):
        url = 'https://movie.douban.com/top250?start=' + str(start)
        html = get_one_page(url)
        if html is None:
            # Download failed; skip this page rather than crash in the parser.
            continue
        for item in parse_one_page(html):
            print(item)
            write_txt(item)


if __name__ == '__main__':
    main()

运行结果

爬虫_豆瓣电影top250 (正则表达式)

上一篇:原生JS的使用,包括jquery和原生JS获取节点、jquery和原生JS修改属性的比较


下一篇:LVS + Keepalived 理论