Python Crawler: Scraping Every Movie on Tencent Video [Hands-On Tutorial]

 

2019-06-27 · Category: Python crawler

A crawler written in Python that scrapes every movie listed on Tencent Video: it reads the category list on v.qq.com, walks each category's paginated listing pages, extracts each movie's title and URL, and stores the records in MongoDB.


# -*- coding: utf-8 -*-
import re
import urllib2
from bs4 import BeautifulSoup
import time
import pymongo

NUM = 0         # global: running count of movies scraped
m_type = u''    # global: current movie category
m_site = u'qq'  # global: movie site identifier

# Fetch the page content for the given URL
def gethtml(url):
    req = urllib2.Request(url)
    response = urllib2.urlopen(req)
    html = response.read()
    return html

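The stock urllib2 agent string gets rejected by some servers. If the plain gethtml() starts coming back with error pages, a variant along these lines helps; the header value is only an example, not something this post prescribes:

import urllib2

# Variant of gethtml() that sends a browser-like User-Agent header.
def gethtml_with_ua(url):
    headers = {'User-Agent': 'Mozilla/5.0'}  # example agent string, adjust as needed
    req = urllib2.Request(url, headers=headers)
    return urllib2.urlopen(req).read()
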
# Extract the movie categories from the category list page
def gettags(html):
    global m_type
    soup = BeautifulSoup(html)  # parse out the category block
    # <ul class="clearfix _group" gname="mi_type" gtype="1">
    tags_all = soup.find_all('ul', {'class': 'clearfix _group', 'gname': 'mi_type'})

    # <a _hot="tag.sub" class="_gtag _hotkey" href="http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html" title="动作" tvalue="0">动作</a>
    re_tags = r'<a _hot=\"tag\.sub\" class=\"_gtag _hotkey\" href=\"(.+?)\" title=\"(.+?)\" tvalue=\"(.+?)\">.+?</a>'
    p = re.compile(re_tags, re.DOTALL)

    tags = p.findall(str(tags_all[0]))
    tags_url = {}  # initialized up front so an empty dict is returned when nothing matches
    if tags:
        for tag in tags:
            tag_url = tag[0].decode('utf-8')
            m_type = tag[1].decode('utf-8')
            tags_url[m_type] = tag_url
    else:
        print "Not Find"
    return tags_url

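A quick offline way to see what re_tags captures is to run it against the sample anchor from the comment above. This snippet is a standalone check, not part of the crawler:

# -*- coding: utf-8 -*-
import re

re_tags = r'<a _hot=\"tag\.sub\" class=\"_gtag _hotkey\" href=\"(.+?)\" title=\"(.+?)\" tvalue=\"(.+?)\">.+?</a>'
sample = ('<a _hot="tag.sub" class="_gtag _hotkey" '
          'href="http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html" '
          'title="动作" tvalue="0">动作</a>')
# prints a list with one (href, title, tvalue) tuple
print(re.findall(re_tags, sample, re.DOTALL))
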
# Get the number of pages in a category
def get_pages(tag_url):
    tag_html = gethtml(tag_url)
    soup = BeautifulSoup(tag_html)  # parse out the pagination block
    # <div class="mod_pagenav" id="pager">
    div_page = soup.find_all('div', {'class': 'mod_pagenav', 'id': 'pager'})

    # <a class="c_txt6" href="http://v.qq.com/list/1_2_-1_-1_1_0_24_20_0_-1_0.html" title="25"><span>25</span></a>
    re_pages = r'<a class=.+?><span>(.+?)</span></a>'
    p = re.compile(re_pages, re.DOTALL)
    pages = p.findall(str(div_page[0]))
    if len(pages) > 1:
        # the last link is the "next page" button, so the one before it is the highest page number
        return pages[-2]
    else:
        return 1

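Why pages[-2]? In the pager block the final link is the "next page" button, so the second-to-last span holds the highest page number. A standalone check with made-up pagination markup in the site's format:

# -*- coding: utf-8 -*-
import re

nav = ('<a class="c_txt6" href="p1.html"><span>1</span></a>'
       '<a class="c_txt6" href="p2.html"><span>2</span></a>'
       '<a class="c_txt6" href="p25.html"><span>25</span></a>'
       '<a class="c_txt6" href="p2.html"><span>下一页</span></a>')
pages = re.findall(r'<a class=.+?><span>(.+?)</span></a>', nav, re.DOTALL)
print(pages[-2])  # -> 25, the highest page number
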
# Find the movie list blocks on a listing page and process each one
def getmovielist(html):
    soup = BeautifulSoup(html)

    # <ul class="mod_list_pic_130">
    divs = soup.find_all('ul', {'class': 'mod_list_pic_130'})
    for div_html in divs:
        div_html = str(div_html).replace('\n', '')
        getmovie(div_html)

# Extract every movie in a list block and store it in MongoDB
def getmovie(html):
    global NUM
    global m_type
    global m_site

    re_movie = r'<li><a class=\"mod_poster_130\" href=\"(.+?)\" target=\"_blank\" title=\"(.+?)\"><img.+?</li>'
    p = re.compile(re_movie, re.DOTALL)
    movies = p.findall(html)
    if movies:
        conn = pymongo.Connection('localhost', 27017)  # pre-3.x pymongo API
        movie_db = conn.dianying
        playlinks = movie_db.playlinks
        for movie in movies:
            NUM += 1  # count each movie once
            print "%s : %d" % ("=" * 70, NUM)
            values = dict(
                movie_title=movie[1],
                movie_url=movie[0],
                movie_site=m_site,
                movie_type=m_type
            )
            print values
            playlinks.insert(values)
            print "_" * 70

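pymongo.Connection and Collection.insert belong to the pre-3.x pymongo used in this listing; both were removed in pymongo 3. Under a current pymongo, the storage step would look roughly like this (database and collection names follow the listing; the record values are example data):

from pymongo import MongoClient

# example record in the same shape getmovie() builds
values = {
    'movie_title': u'Example Title',
    'movie_url': 'http://v.qq.com/example.html',
    'movie_site': u'qq',
    'movie_type': u'Action',
}

client = MongoClient('localhost', 27017)      # replaces pymongo.Connection
client.dianying.playlinks.insert_one(values)  # replaces Collection.insert
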
# Fetch a movie's detail page and extract its play links
def getmovieinfo(url):
    html = gethtml(url)
    soup = BeautifulSoup(html)

    # pack pack_album album_cover
    divs = soup.find_all('div', {'class': 'pack pack_album album_cover'})

    # <a href="http://www.tudou.com/albumplay/9NyofXc_lHI/32JqhiKJykI.html" target="new" title="《血滴子》独家纪录片" wl="1"> </a>
    re_info = r'<a href=\"(.+?)\" target=\"new\" title=\"(.+?)\" wl=\".+?\"> </a>'
    p_info = re.compile(re_info, re.DOTALL)
    m_info = p_info.findall(str(divs[0]))
    if not m_info:
        print "Not find movie info"
    return m_info

# Insert a movie-info record; expects a module-level MongoDB connection `conn`
# (note: neither this nor getmovieinfo() is called from the main flow below)
def insertdb(movieinfo):
    global conn
    movie_db = conn.dianying_at
    movies = movie_db.movies
    movies.insert(movieinfo)

if __name__ == "__main__":
    # entry page that lists all the categories
    tags_url = "http://v.qq.com/list/1_-1_-1_-1_1_0_0_20_0_-1_0.html"
    tags_html = gethtml(tags_url)
    tag_urls = gettags(tags_html)

    for url in tag_urls.items():
        print str(url[1]).encode('utf-8')  # url[0] is the category name, url[1] its URL
        maxpage = int(get_pages(str(url[1]).encode('utf-8')))
        print maxpage

        for x in range(0, maxpage):
            # e.g. http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html
            # the seventh underscore-separated field is the page index,
            # so strip the tail and rebuild it with the current page number
            m_url = str(url[1]).replace('0_20_0_-1_0.html', '')
            movie_url = "%s%d_20_0_-1_0.html" % (m_url, x)
            print movie_url
            movie_html = gethtml(movie_url.encode('utf-8'))
            getmovielist(movie_html)
            time.sleep(0.1)  # throttle requests slightly

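The listing is Python 2 throughout (urllib2, print statements, old pymongo). For anyone running Python 3, a minimal sketch of the fetch helper using the stdlib urllib.request would be:

# Python 3 sketch of gethtml(); urllib.request replaces urllib2.
from urllib.request import Request, urlopen

def gethtml(url):
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})  # example agent
    with urlopen(req) as response:
        return response.read().decode('utf-8', errors='replace')
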
And that's the whole thing. Did you follow all of the code above? If not, work through it function by function: fetch a page, pull out the categories, walk each category's pages, extract the movies, and write them to MongoDB. It clicks quickly with practice.