爬虫实现qq音乐歌单无vip批量下载
分享歌单链接
电脑网页无法获取歌单的完整信息,所以需要借助手机下载网页文件
利用下载网站实现批量下载
music.py
import requests
from fake_useragent import UserAgent
from lxml import html
from selenium import webdriver
from time import sleep
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
# 读取qq音乐分享文件
def get_html_file():
    """Ask the user for the saved share-page file and return its contents.

    The path is read from standard input (a bare file name works when the
    file sits in the working directory) and the file is decoded as UTF-8.

    Returns:
        The full text of the HTML file.
    """
    path = input('请输入html文件地址(项目目录下则直接输入文件名包括后缀名):')
    with open(path, mode='r', encoding='utf-8') as source:
        return source.read()
# 从html中获取歌曲的信息(歌名和歌手)
def get_music_name_and_singer(html_):
    """Pull the song entries (title and singer text) out of the share page.

    Args:
        html_: Markup of the saved QQ Music share page.

    Returns:
        The text nodes of every ``<span class="song_list__txt">`` element,
        in document order.
    """
    # Per the original author's note this selector targets single-artist
    # playlists; for mixed playlists the alternative that was left disabled
    # is '//p[@class="song_list__desc"]/text()'.
    document = html.etree.HTML(html_)
    return document.xpath('//span[@class="song_list__txt"]/text()')
# 从下载网站获取歌曲下载地址
def get_download_url(music_info):
    """Resolve the direct download link for one song via musictool.top.

    The search page fills in its result asynchronously, so it is rendered in
    headless Chrome and the final DOM is inspected for the ``j-src-btn``
    anchor.

    Args:
        music_info: Search text for the song (typically "singer title").

    Returns:
        The ``href`` of the download button, or ``None`` when the rendered
        page contains no such link.
    """
    from urllib.parse import quote

    # Percent-encode the search text so characters such as '&' or '#' in a
    # title cannot break the query string.
    url = 'https://www.musictool.top/?name={}&type=qq'.format(
        quote(str(music_info), safe='')
    )

    option = webdriver.ChromeOptions()
    option.add_argument('headless')
    dcap = dict(DesiredCapabilities.CHROME)
    dcap['chrome.page.settings.userAgent'] = UserAgent().chrome
    # NOTE(review): `chrome_options=` / `desired_capabilities=` are legacy
    # Selenium keyword names, and it is unclear whether Chrome honours the
    # capability key above -- confirm against the installed Selenium version.
    driver = webdriver.Chrome(chrome_options=option, desired_capabilities=dcap)
    try:
        driver.get(url)
        # Give the page's Ajax request time to populate the download button.
        sleep(6)
        page = driver.page_source
    finally:
        # Always shut the browser down; otherwise every song leaks a Chrome
        # process.
        driver.quit()

    # xpath() returns a list (empty when nothing matches), never None, so the
    # "not found" case has to be detected on the joined result.
    hrefs = html.etree.HTML(page).xpath('//a[@id="j-src-btn"]/@href')
    download_url = ''.join(hrefs)
    if not download_url:
        return None
    print(download_url)
    return download_url
# 下载歌曲,保存歌曲
def download_music(download_url, name):
    """Download one song and store it as ``music/<name>.mp3``.

    A failure is reported on stdout instead of raising, so that one bad song
    does not abort a batch run.

    Args:
        download_url: Direct URL of the audio file; ``None`` or an empty
            string means no link was found for the song.
        name: File name (without extension) to save the song under.

    Returns:
        None.
    """
    import os

    # Bail out before touching the network: without this return the request
    # below would be issued with an invalid URL and raise.
    if not download_url:
        print(name + '---下载失败')
        return

    headers = {
        'User-Agent': UserAgent().random
    }
    try:
        # A timeout keeps a stalled server from hanging the whole batch.
        response = requests.get(download_url, headers=headers, timeout=30)
    except requests.RequestException:
        print(name + '---下载失败')
        return

    if response.status_code != 200:
        print(name + '---下载失败')
        return

    # The target directory is not guaranteed to exist on a fresh checkout.
    os.makedirs('music', exist_ok=True)
    with open('music/' + name + '.mp3', 'wb') as f:
        f.write(response.content)
    print('下载完成---' + name)
    print('--------------------')
# 主方法,遍历歌曲信息执行下载歌曲
def main():
    """Run the batch download: read the share page, then fetch every song.

    Each playlist entry is looked up on the download site using its raw text,
    then saved under a file name derived from that text.
    """
    page = get_html_file()
    songs = get_music_name_and_singer(page)
    print(songs)
    for song in songs:
        print('开始下载---' + song)
        # The lookup uses the unmodified entry, e.g. '马良/孙茜茹 往后余生'.
        link = get_download_url(song)
        # Build the file name: drop the ' · ' separator and characters that
        # are awkward in paths, applied in this fixed order.
        file_stem = song
        for old, new in ((' · ', ' '), ('/', '_'), ('?', '')):
            file_stem = file_stem.replace(old, new)
        download_music(link, file_stem.strip())
# Start the batch download only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()
说明:
1.歌曲下载网站使用的是Ajax异步请求,所以不能通过常规方法爬取歌曲
2.采用selenium来获取网页完整的代码,从而获取歌曲的下载地址
3.爬取过程中UA很重要,一定要设置
4.访问速度一定不要太快,所以采用了sleep()方法来减慢爬取速度,防止被检测到电脑操作,从而报错