"""
Scrape the Douban Movie Top 250 - complete example code.

Walks the paginated Top 250 listing starting at ``DOWNLOAD_URL`` and writes
one movie title per line to a UTF-8 file named ``movies`` in the current
working directory.
"""

import codecs

import requests
from bs4 import BeautifulSoup

DOWNLOAD_URL = 'http://movie.douban.com/top250/'

# Browser-like User-Agent header sent with every request.
_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'
}

# Seconds to wait for the server; without a timeout requests can block forever.
_REQUEST_TIMEOUT = 10


def download_page(url):
    """Fetch *url* and return the raw response body as bytes.

    Raises:
        requests.RequestException: on a network failure, a timeout, or an
            HTTP 4xx/5xx status.
    """
    response = requests.get(
        url, headers=_REQUEST_HEADERS, timeout=_REQUEST_TIMEOUT
    )
    # Surface HTTP errors here instead of parsing an error page as a listing.
    response.raise_for_status()
    return response.content


def parse_html(html):
    """Extract the movie titles and the next-page URL from one listing page.

    Args:
        html: Markup of a listing page (bytes or text).

    Returns:
        A ``(movie_names, next_url)`` tuple. ``movie_names`` is the list of
        titles found on the page (empty when the listing is absent) and
        ``next_url`` is the absolute URL of the following page, or ``None``
        when the page has no "next" link.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')

    movie_name_list = []
    movie_list_soup = soup.find('ol', attrs={'class': 'grid_view'})
    if movie_list_soup is not None:
        for movie_li in movie_list_soup.find_all('li'):
            detail = movie_li.find('div', attrs={'class': 'hd'})
            if detail is None:
                continue
            # Only the first ``span.title`` inside the header is used.
            title = detail.find('span', attrs={'class': 'title'})
            if title is not None:
                movie_name_list.append(title.getText())

    # ``span.next`` may be missing entirely, or present without an <a>
    # child; both cases mean there is no further page to fetch.
    next_span = soup.find('span', attrs={'class': 'next'})
    next_page = next_span.find('a') if next_span is not None else None
    if next_page is not None and next_page.get('href'):
        return movie_name_list, DOWNLOAD_URL + next_page['href']
    return movie_name_list, None


def main():
    """Crawl every listing page and write the titles to the ``movies`` file."""
    url = DOWNLOAD_URL

    with codecs.open('movies', 'wb', encoding='utf-8') as fp:
        # parse_html returns None as the next URL after the last page.
        while url:
            html = download_page(url)
            movies, url = parse_html(html)
            if movies:  # skip empty pages rather than writing a blank line
                fp.write(u'{movies}\n'.format(movies='\n'.join(movies)))


if __name__ == '__main__':
    main()
# Original article: https://zhuanlan.zhihu.com/p/20423182
# Thanks to the original author.