OSS's scalability and huge storage capacity can greatly cut the cost of expensive third-party storage such as ZOOM. But how do you move the recordings stored on ZOOM into OSS? It is actually quite simple: first collect the URLs of all the videos in ZOOM's cloud recording management page, then download them in batches into a local directory bound to OSS; a directory bound to OSS stays in sync with the bucket. Binding a directory to OSS can be done with the ossfs tool.
Below is the crawler script for the ZOOM videos, implemented in Python. It is also a very basic crawler, so readers can treat it as a small crawler primer.
# -*- coding: utf-8 -*-
import requests
import bs4
import urllib
import datetime
# import download as ds  # optional local download helper, only used by the commented-out calls below
# import dd              # optional local download helper, only used by the commented-out calls below
from lxml import html

session_requests = requests.session()
login_url = "https://www.zoom.us/signin"
# Login credentials for the sign-in page
payload = {
    "email": "xxx@qq.com",
    "password": "password"
}
# Open the login page, then post the credentials with the same session
result = session_requests.get(login_url)
tree = html.fromstring(result.text)
result = session_requests.post(
    login_url,
    data=payload,
    headers=dict(referer=login_url)
)
# Visit the first page of the recording management list
url = 'https://www.zoom.us/recording/management'
result = session_requests.get(
    url,
    headers=dict(referer=url)
)
# Parse the whole first page with BeautifulSoup
soup = bs4.BeautifulSoup(result.text, 'lxml')
# Get the pagination element to work out the number of pages
page = soup.find_all(class_="pagination")
page1 = bs4.BeautifulSoup(page.__str__(), 'lxml')  # re-parse the result and select the <li> items
pp = page1.select("li")
num = 1
# hrefs of the meeting pages and of the video downloads
hrefs = []
videoHrefs = []
while num <= pp.__len__() - 2:
    url = 'https://www.zoom.us/recording/management?p=' + num.__str__()
    # Request the page for this page number
    result = session_requests.get(
        url,
        headers=dict(referer=url)
    )
    soup = bs4.BeautifulSoup(result.text, 'lxml')
    # Get the <div> elements with the file-size class
    dv = soup.find_all(class_="list-col rec-filesize")
    # Filter out the <a> tags
    a = bs4.BeautifulSoup(dv.__str__(), 'lxml')
    ahref = a.find_all("a")
    # Collect all meeting hrefs
    for i in ahref:
        hrefs.append(i['href'])
        href = i['href']
        # Visit the meeting page to get the video address
        resultForVideo = session_requests.get(
            href,
            headers=dict(referer=url)
        )
        # Get the node with class "btn play-id"
        play = bs4.BeautifulSoup(resultForVideo.text, 'lxml')
        aPlay = play.find_all(class_="btn play-id")
        # Turn the play URL into a download URL
        str = aPlay[0]['href'].replace("play", "download")
        videoHrefs.append(str)
        print str
        filename = "D:/url/" + num.__str__() + ".mp4"
        print filename
        nowTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')  # current time
        print nowTime
        # ds.do_load_media(str, filename)
        # dd._downloader(str, filename)
        urllib.urlretrieve(str, filename)
        # urllib.request.urlretrieve(url, filename, Schedule)
    # Move on to the next page
    num = num + 1
print videoHrefs
print hrefs.__len__()
The above is the crawler side of the job. The download destination is filename, which is currently filename = "D:/url/" + num.__str__() + ".mp4".
That path has nothing to do with OSS, so it has to be changed to a directory bound to OSS. Binding a directory to OSS works as follows:
1. Download the ossfs installation package.
2. Install it with the commands below:
- On Ubuntu:
sudo apt-get update
sudo apt-get install gdebi-core
sudo gdebi your_ossfs_package
- On CentOS 6.5 and later:
sudo yum localinstall your_ossfs_package
- On CentOS 5:
sudo yum localinstall your_ossfs_package --nogpgcheck
3. Store the bucket name and the AccessKeyId/AccessKeySecret in the /etc/passwd-ossfs file. Note that the permissions of this file must be set correctly; 640 is recommended.
echo my-bucket:my-access-key-id:my-access-key-secret > /etc/passwd-ossfs
chmod 640 /etc/passwd-ossfs
4. Mount the OSS bucket to the target directory.
ossfs my-bucket my-mount-point -ourl=my-oss-endpoint
The name of the mount directory is entirely up to you. Once the bucket is mounted, just point filename in the script at that directory and you are done; a minimal sketch follows.
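As a minimal sketch: assuming the bucket has been mounted with ossfs at /mnt/oss-videos (the mount point name here is only an example, not anything fixed by ossfs), the download destination in the crawler script above simply becomes a path under that mount:
# Assumed example: the bucket is mounted at /mnt/oss-videos via ossfs,
# so anything written into this directory is synced into the bucket.
filename = "/mnt/oss-videos/" + num.__str__() + ".mp4"
In this variant no separate upload step is needed; ossfs takes care of pushing the downloaded files into the bucket.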
2018/06/04 update: added uploading to OSS via oss2 and switched to reading the accounts from a config file (an example of the a.txt format is sketched after the script):
# -*- coding: utf-8 -*-
import requests
import bs4
import urllib
import datetime
import os
import time
import oss2
from lxml import html

# Create the local directory for today's downloads
filedir = "C:/" + time.strftime("%Y-%m-%d")
folder = os.path.exists(filedir)
if not folder:
    os.mkdir(filedir)
else:
    print "--- There is this folder! ---"
    filedir = filedir + "-bk"
    os.mkdir(filedir)
# Read the a.txt config file (one account per line: email password)
with open('C:/Users/Administrator/Desktop/a.txt') as file_object:
    lines = file_object.readlines()
for line in lines:
    str = line.strip().split(" ")  # strip the trailing newline before splitting
    username = str[0]
    password = str[1]
    print("current account", username)
    session_requests = requests.session()
    login_url = "https://www.zoom.us/signin"
    payload = {
        "email": username,
        "password": password
    }
    result = session_requests.get(login_url)
    tree = html.fromstring(result.text)
    result = session_requests.post(
        login_url,
        data=payload,
        headers=dict(referer=login_url)
    )
    # Visit the first page of the recording management list
    url = 'https://www.zoom.us/recording/management'
    print url
    result = session_requests.get(
        url,
        headers=dict(referer=url)
    )
    # Parse the whole first page
    soup = bs4.BeautifulSoup(result.text, 'lxml')
    # Get the pagination element to work out the number of pages
    page = soup.find_all(class_="pagination")
    page1 = bs4.BeautifulSoup(page.__str__(), 'lxml')  # re-parse the result and select the <li> items
    pp = page1.select("li")
    num = 1
    vnum = 1
    page = 0
    if pp.__len__() == 0:
        page = 1
    else:
        pa = bs4.BeautifulSoup(pp[-2].__str__(), 'lxml')
        page = int(pa.find('a').text)
    print('page:', page)
    # hrefs of the meeting pages and the video file names
    hrefs = []
    videoHrefs = []
    while num <= page:
        url = 'https://www.zoom.us/recording/management?p=' + num.__str__()
        # Request the page for this page number
        result = session_requests.get(
            url,
            headers=dict(referer=url)
        )
        soup = bs4.BeautifulSoup(result.text, 'lxml')
        # Get the <div> elements with the file-size class
        dv = soup.find_all(class_="list-col rec-filesize")
        # Filter out the <a> tags
        a = bs4.BeautifulSoup(dv.__str__(), 'lxml')
        ahref = a.find_all("a")
        # Collect all meeting hrefs
        for i in ahref:
            hrefs.append(i['href'])
            href = i['href']
            # Visit the meeting page to get the video address
            resultForVideo = session_requests.get(
                href,
                headers=dict(referer=url)
            )
            # Get the node with class "btn play-id"
            play = bs4.BeautifulSoup(resultForVideo.text, 'lxml')
            aPlay = play.find_all(class_="btn play-id")
            # Turn the play URL into a download URL
            str = aPlay[0]['href'].replace("play", "download")
            # videoHrefs.append(str)
            print str
            # Extract the meeting topic to use as the file name
            metID = play.find_all(class_="control-label meeting-topic")
            st = metID.__str__().index('>') + 1
            en = metID.__str__().index('</')
            metID = metID.__str__()[st:en]
            # If a file with the same name already exists, append a suffix
            echo = ""
            if metID in videoHrefs:
                # for fnum in range(videoHrefs.__len__()):
                #     ss = videoHrefs[fnum][0:videoHrefs[i].__len__() - 3]
                #     if ss == metID:
                #         echo = videoHrefs[fnum]
                # metID = echo[0:echo.__len__()-3] + (int(echo[echo.__len__()-3:echo.__len__()]) + 1).__str__()
                metID = metID + "-002"
            # Record the (possibly suffixed) metID
            videoHrefs.append(metID)
            filename = filedir + "/" + metID.__str__() + ".mp4"
            print("fileNum", vnum)
            vnum = vnum + 1
            print filename
            nowTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')  # current time
            print nowTime
            # ds.do_load_media(str, filename)
            # dd._downloader(str, filename)
            urllib.urlretrieve(str, filename)
            # urllib.request.urlretrieve(url, filename, Schedule)
        # Move on to the next page
        num = num + 1
        print('page', num)
    print videoHrefs
    print hrefs.__len__()
    # Upload this account's videos to the bucket via oss2
    # Access key of the user ('aid' / 'keys' are placeholders)
    auth = oss2.Auth('aid', 'keys')
    # Internal endpoint; reachable from ECS instances in the same region
    bucket = oss2.Bucket(auth, 'oss-cn-shanghai-internal.aliyuncs.com', 'bucketName')
    # Directory in the bucket named after the current date
    nowTimeDir = time.strftime("%Y-%m-%d")
    # Upload each downloaded video to OSS
    ossNum = 1
    for videoPath in videoHrefs:
        ossPath = nowTimeDir + '/' + videoPath + ".mp4"
        localPath = filedir + "/" + videoPath.__str__() + ".mp4"
        bucket.put_object_from_file(ossPath, localPath)
        print("ossFile", ossNum)
        ossNum = ossNum + 1
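For reference, the a.txt config file read at the top of the script is assumed to contain one ZOOM account per line, with the email and the password separated by a single space (the accounts below are purely made up):
user-one@example.com firstPassword
user-two@example.com secondPassword
The script logs in with each account in turn, downloads that account's recordings into the dated local directory, and then uploads them to the bucket under a folder named after the current date.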