练手的小爬虫
爬取酷狗音乐飙升榜上的22首歌
网页地址:https://www.kugou.com/yy/html/rank.html
使用 requests 进行网络请求,获取相关数据。
使用 xpath(可借助浏览器插件定位节点)提取相应的网页信息。
项目结构:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66
| ''' 爬取酷狗音乐飙升榜上的22首歌 会员歌只有试听部分,不完整。 音频下载地址为,该文件所在目录的song文件夹里。 若song文件夹存在且不为空,则会将song文件删除后,重建song文件夹 若song不存在,则新建。 若song存在且为空,不执行任何操作 ''' import time import requests import os import json from lxml import etree
# Browser-like request headers passed to every requests.get() call in this
# script.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/109.0.0.0 Safari/537.36'
    ),
}
def down_song(url, name):
    """Download one audio file and save it as ``./song/<name>.mp3``.

    Args:
        url: URL of the audio file to fetch.
        name: Song title used as the file name. Characters that are not
            valid in file names on common file systems are replaced
            with ``_``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        OSError: If the target file cannot be written.
    """
    # Short pause before each download so requests are not fired back to back.
    time.sleep(0.5)
    # A raw title may contain path separators or reserved characters, which
    # would make open() fail or write outside the song folder.
    safe_name = ''.join('_' if ch in '\\/:*?"<>|' else ch for ch in name)
    # Without a timeout a stalled connection would block the script forever.
    resq = requests.get(url, headers=headers, timeout=30)
    # Do not save an HTTP error page under an .mp3 name.
    resq.raise_for_status()
    with open(f'./song/{safe_name}.mp3', 'wb') as f:
        f.write(resq.content)
# Prepare the output folder ./song:
#   - missing            -> create it
#   - exists, has files  -> delete it and create a fresh one
#   - exists, empty      -> leave it alone
try:
    os.mkdir('./song')
except FileExistsError:
    print('文件夹(song)已存在')
    # os.path.getsize() on a directory reports a platform-dependent value,
    # not whether the directory has entries, so check the listing instead.
    if os.listdir('./song'):
        try:
            import shutil
            shutil.rmtree('./song')
            os.mkdir('./song')
        except Exception as e:
            print('清空文件夹(song)失败', e)
            raise
except Exception as e:
    print('创建文件夹(song)失败', e)
    raise
# Fetch the ranking page and collect the data-eid attribute of every <li>
# element; these values are later appended to the song-info query URL.
index_url = 'https://www.kugou.com/yy/html/rank.html'
# A timeout keeps the script from hanging on a stalled connection.
resq = requests.get(index_url, headers=headers, timeout=30)
# Stop early on an HTTP error instead of parsing an error page.
resq.raise_for_status()
e = etree.HTML(resq.text)
xpath_query = '//li/@data-eid'
ids = e.xpath(xpath_query)
# For every id collected from the ranking page, ask the song-info endpoint
# for the playable URL and title, then download the audio file.
query_url = 'https://wwwapi.kugou.com/yy/index.php?r=play/getdata&mid=1&encode_album_audio_id='
for song_id in ids:
    # A timeout keeps one stalled request from blocking the whole run.
    resq = requests.get(f'{query_url}{song_id}', headers=headers, timeout=30)
    # A body that is not valid JSON raises json.JSONDecodeError and aborts
    # the run, exactly as before (the old bare ``except`` only re-raised and
    # its ``break`` was unreachable).
    json_data = json.loads(resq.text)
    data = json_data.get('data') if isinstance(json_data, dict) else None
    song_url = data.get('play_url') if isinstance(data, dict) else None
    if not song_url:
        # No usable play URL in this response: skip this entry rather than
        # crash and lose the remaining downloads.
        print(f"跳过:{song_id}(未获取到播放地址)")
        continue
    # Fall back to the id so a missing title does not abort the batch.
    audio_name = data.get('audio_name') or str(song_id)
    print(f"正在下载:{audio_name}")
    down_song(song_url, audio_name)
    print("下载成功")
|