#!/usr/bin/python3
# coding=utf8
import time

import requests
from bs4 import BeautifulSoup
import pymysql

'''
Purpose: a certain video site has no search feature, so this Python crawler scrapes
every video title and magnet link on the site into a MySQL database; the database
can then be searched by keyword to find a film's download address.
Author: xiaoxiaohui
Date: 2019-10-03

Note 1: create the MySQL database and table first:
    mysql -uroot -pxxh123
    create database 4hucom;
    use 4hucom;
    -- the id column auto-increments
    CREATE TABLE `4hu_shoujixiaoshipin` (
        `id` INT(11) NOT NULL AUTO_INCREMENT,
        `biaoti` VARCHAR(380),
        `fabutime` VARCHAR(380),
        `lianjie` VARCHAR(380),
        PRIMARY KEY (id)
    );

Note 2: this was adapted quickly from an older crawler, so (1) the function name
get_house_info is carried over from a rental-listing crawler, and (2) the info key
'播放地址' actually holds the playback URL (renaming fabutime to bofangdizhi
would be clearer).
'''


def get_links(url):
    # Collect the detail-page links from one listing page.
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    links_div = soup.find_all('li', class_="col-md-2 col-sm-3 col-xs-4")
    links = ['http://www.网站名马赛克.com' + div.a.get('href') for div in links_div]
    return links


def get_house_info(item_url):
    # Scrape one detail page: film title, playback URL, and magnet link.
    response = requests.get(item_url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    # Note there are two 'playul' lists: links_div[0] is the playback list,
    # links_div[1] is the download list.
    links_div = soup.find_all('ul', class_="playul")
    lianjie_temp = 'http://www.网站名马赛克.com' + links_div[1].li.a.get('href')  # download page link
    lianjie = get_cililianjie(lianjie_temp)
    print(lianjie)
    links_div2 = soup.find_all('div', class_="detail-title fn-clear")
    biaoti = links_div2[0].text.strip()  # film title; .strip() removes surrounding whitespace
    fabutime = 'http://www.网站名马赛克.com' + links_div[0].li.a.get('href')  # playback URL
    info = {
        '影片名字': biaoti,
        '播放地址': fabutime,
        '下载链接': lianjie
    }
    return info


def get_cililianjie(url):
    # Follow the download page and extract the magnet link.
    response = requests.get(url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    links_div = soup.find_all('div', class_="download")
    lianjie = links_div[0].a.get('href')  # magnet link
    return lianjie


def get_db(setting):
    return pymysql.connect(**setting)


def insert(db, house):
    # Parameterized query: avoids SQL injection and breakage on quotes in titles.
    sql = 'insert into 4hu_shoujixiaoshipin (biaoti, fabutime, lianjie) values (%s, %s, %s)'
    cursor = db.cursor()
    cursor.execute(sql, (house['影片名字'], house['播放地址'], house['下载链接']))
    db.commit()


DATABASE = {
    'host': '127.0.0.1',
    'database': '4hucom',
    'user': 'root',
    'password': 'xxh123',
    'charset': 'utf8'  # with utf8mb4 the rows showed as mojibake in navicat.exe; utf8 displays correctly
}

db = get_db(DATABASE)  # connect to the database

# Loop over every listing page.
for yema in range(1, 44):
    if yema == 1:
        url = 'https://www.网站名马赛克.com/vod/html7/index.html'
    else:
        url = 'https://www.网站名马赛克.com/vod/html7/index_' + str(yema) + '.html'
    links = get_links(url)
    for item_url in links:
        time.sleep(1.0)  # be polite: one request per second
        house = get_house_info(item_url)
        print('Scraped one record: {}'.format(house['影片名字']))
        insert(db, house)  # insert the scraped record into the database