Scraping music from Changba
# Imports
import requests
import re

# Profile page of the target Changba user
url = 'http://changba.com/u/461549830'

# Request the profile page and download every recording linked from it
def changba(url):
    res = requests.get(url)
    if res.status_code == 200:
        res_html = res.text
        # print(res.text)

        # Match the links to the individual song pages
        reg1 = '<a href="(/s/.{22})" style="color:#999;display:block;" target="_blank">'
        media = re.findall(reg1, res_html)
        # print(media)
        media_mid = []
        for i in range(len(media)):
            media_mid.append('http://changba.com' + media[i])
        print(media_mid)

        # Match the song titles (the text just before the work-detail div)
        reg2 = '(.*)<div class="userPage-work-detail">'
        name = re.findall(reg2, res_html)
        song_name = []
        for i in range(len(name)):
            song_name.append(name[i].strip())
        print(song_name)

        # Match the direct MP3 URL on each song page
        reg3 = r'http://\w{4,20}\.changba\.com/\d{10}\.mp3'
        for i in range(len(media_mid)):
            result = requests.get(media_mid[i]).text
            MP3_html = re.findall(reg3, result)
            print(MP3_html)
            if MP3_html:
                MP3 = requests.get(MP3_html[0])
                if MP3.status_code == 200:
                    with open(song_name[i] + '.mp3', 'wb') as f:
                        f.write(MP3.content)
            else:
                continue

changba(url)
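Song titles scraped from the page can contain characters that are not legal in Windows file names (slashes, colons, quotes, and so on), in which case open(song_name[i] + '.mp3', 'wb') raises an OSError. A minimal sanitizing sketch (safe_filename is a hypothetical helper, not part of the original script):

import re

def safe_filename(name, default='untitled'):
    # Replace characters that are not allowed in Windows file names with an underscore
    cleaned = re.sub(r'[\\/:*?"<>|]', '_', name).strip()
    return cleaned or default

# Usage inside the download loop:
# with open(safe_filename(song_name[i]) + '.mp3', 'wb') as f:
#     f.write(MP3.content)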
Scraping music from 全民K歌
from urllib import request
import re
import os
import json

# Personal homepage of the target user
url = "https://kg.qq.com/node/personal?uid=6a9d9a81222830833c"
html = request.urlopen(url).read().decode('utf-8')

# Grab the JSON song list embedded in the page; the slicing below is a bit crude,
# but it works (a cleaner extraction is sketched after this script)
data = re.findall(r'"ugclist":.*?],', html)
ugclists = data[0][10:-1]  # strip the leading '"ugclist":' (10 characters) and the trailing ','
print(len(data[0]))

for ugclist in json.loads(ugclists):  # json.loads turns the string into a list of dicts (key => value pairs)
    print(ugclist['shareid'])  # shareid is the crucial piece: the id of each recording
    print(ugclist['title'])    # the title of each song
    title = ugclist['title']
    shareid = ugclist['shareid']
    # The download address of each recording is this endpoint with the shareid appended
    data_url = "http://cgi.kg.qq.com/fcgi-bin/fcg_get_play_url?shareid=" + shareid
    # Directory in which to save the songs; otherwise they end up in the script's working directory.
    # The r prefix keeps the string raw, i.e. backslashes in it are not treated as escape sequences.
    path = r'C:/Users/HUAWEI/Desktop/le'
    file = os.path.join(path, title + '.m4a')  # join with a separator so files land inside the directory
    # Simple existence check, so that re-running the script does not overwrite or re-download files
    is_set = os.path.exists(file)
    # Skip the file if it already exists
    if not is_set:
        request.urlretrieve(data_url, file)  # urlretrieve downloads the remote data straight to disk
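The slicing data[0][10:-1] above works only because the literal "ugclist": is exactly ten characters long. A slightly more robust variant is sketched below, assuming the page still embeds the list as "ugclist":[...] followed by a comma; extract_ugclist is a hypothetical helper, not part of the original script.

import re
import json

def extract_ugclist(html):
    # Capture the JSON array that follows "ugclist": instead of slicing at fixed offsets
    m = re.search(r'"ugclist":(\[.*?\]),', html, re.S)
    return json.loads(m.group(1)) if m else []

# Usage: for ugclist in extract_ugclist(html): ...

A second variant of the 全民K歌 scraper, using urllib.request with a spoofed browser User-Agent, follows.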
import urllib.request
import requests
import re

path = "C://Users//HUAWEI//Desktop//kgqq//"
url = "https://kg.qq.com/node/personal?uid=6a9d9a81222830833c"

# Pretend to be a browser
headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)'}
req = urllib.request.Request(url, headers=headers)
# Send the request and get the response
res = urllib.request.urlopen(req)
# Read and decode the response body
html = res.read().decode("utf-8")
# print(len(html))

# Use a regex to pull the song page links and titles out of the HTML
pat = '<a href="(.*?)" .*? target="_blank">(.*?)</a>'
dlist = re.findall(pat, html)
# Print the matches
# for v in dlist:
#     print(v[1] + ":" + v[0])

# Match the direct audio URL on each song page (stop before the closing quote)
pat_music = r'http://[a-z][a-z]\.stream\.kg\.qq\.com[^"]*?\.m4a[^"]*'
for url in dlist:
    music = urllib.request.Request(url[0], headers=headers)
    res = urllib.request.urlopen(music)
    music_html = res.read().decode("utf-8")
    mus = re.findall(pat_music, music_html)
    if mus:
        # Note: the downloaded data is m4a audio even though the file is saved with an .mp3 extension
        MP3 = requests.get(mus[0])
        with open(path + url[1] + ".mp3", "wb") as f:
            f.write(MP3.content)
        if MP3.content:
            print(url[1] + ":" + url[0])
        else:
            print(url[1] + ":" + url[0] + " write error!")
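requests.get(mus[0]) reads the whole recording into memory before it is written out. For long recordings it can be friendlier to stream the download chunk by chunk; a minimal sketch under the same assumptions as the script above (download_stream is a hypothetical helper, not part of the original script):

import requests

def download_stream(audio_url, out_path, chunk_size=8192):
    # Stream the response so the whole file is never held in memory at once
    with requests.get(audio_url, stream=True, timeout=30) as resp:
        resp.raise_for_status()
        with open(out_path, 'wb') as f:
            for chunk in resp.iter_content(chunk_size=chunk_size):
                if chunk:
                    f.write(chunk)

# Usage inside the loop above:
# download_stream(mus[0], path + url[1] + ".mp3")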