# coding:utf-8
from urllib import request
import re
class Spider():
    """Scrape the streamer list at ``url`` and print it ranked by popularity.

    The page is downloaded, every ``video-info`` card is reduced to a
    ``{'name': ..., 'number': ...}`` dict, the popularity strings are
    converted to floats, and the result is printed in descending order.
    """

    # To scrape a different category, replace this with that category's URL.
    url = 'https://www.panda.tv/cate/kingglory'
    # Non-greedy on purpose: a greedy match would run on to the last </div>
    # of the page.
    root_pattern = r'<div class="video-info">(.*?)</div>'
    name_pattern = r'<span class="video-nickname" title="(.*?)">'
    number_pattern = r'<span class="video-number">[\s\S]*?</i>([\s\S]*?)</span>'
    # Anchors produced by the most recent parse (shared at class level).
    anchors = []

    def __get_content(self):
        """Download the page at ``Spider.url`` and return it as UTF-8 text."""
        # The context manager closes the HTTP response even if read() fails.
        with request.urlopen(Spider.url) as response:
            content = response.read()
        return str(content, encoding='utf-8')

    def __handle_content(self, htmls):
        """Extract one ``{'name', 'number'}`` dict per card found in *htmls*.

        The parsed list replaces the contents of ``Spider.anchors`` (instead
        of being appended to it) so that repeated runs do not accumulate
        duplicate or already-converted entries. Cards that lack either a
        name or a number are skipped.

        Returns:
            The list of anchors parsed from *htmls*.
        """
        parsed = []
        # re.S lets '.' match newlines as well, since a card spans lines.
        for html in re.findall(Spider.root_pattern, htmls, re.S):
            names = re.findall(Spider.name_pattern, html)
            numbers = re.findall(Spider.number_pattern, html)
            if not names or not numbers:
                continue
            parsed.append({
                'name': names[0],
                'number': numbers[0],
            })
        # Slice assignment keeps the identity of the shared list object.
        Spider.anchors[:] = parsed
        return parsed

    def sort(self, anchors):
        """Return *anchors* ordered by popularity, highest first.

        The ``'number'`` values of *anchors* are converted to floats in
        place (see ``keey_seed``) before sorting.
        """
        anchors = self.keey_seed(anchors)
        return sorted(anchors, key=lambda anchor: anchor['number'], reverse=True)

    @staticmethod
    def _to_number(value):
        """Convert one popularity value to a float.

        Values that are already numeric are passed through, which makes the
        conversion safe to apply more than once. A string containing '万'
        is read as its leading decimal number multiplied by 10000; any
        other string is handed to ``float`` unchanged apart from stripping.

        Raises:
            ValueError: if the string holds no parsable number.
        """
        if isinstance(value, (int, float)):
            return float(value)
        text = value.strip()
        if '万' not in text:
            return float(text)
        # Keep the fractional part: '1.5万' must become 15000, not 10000.
        match = re.search(r'\d+(?:\.\d+)?', text)
        if match is None:
            raise ValueError('no numeric value in popularity string: %r' % value)
        return float(match.group()) * 10000

    def keey_seed(self, anchors):
        """Convert every anchor's ``'number'`` to a float, in place.

        Returns:
            The same *anchors* list that was passed in.
        """
        for anchor in anchors:
            anchor['number'] = Spider._to_number(anchor['number'])
        return anchors

    def show(self, r):
        """Print one ``name :  number`` line per anchor in *r*."""
        for a in r:
            print(a['name'], ': ', a['number'])

    def run(self):
        """Fetch the page, parse it, rank the anchors and print the ranking."""
        contents = self.__get_content()
        self.__handle_content(contents)
        rank_list = self.sort(Spider.anchors)
        self.show(rank_list)
# Script entry: build a Spider and run it (fetch the page, parse it, rank the
# anchors and print them). These statements execute whenever the module is
# loaded, so loading it performs the network request.
spider = Spider()
spider.run()
# Output: (not included in this file)
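# Copyright notice: this is an original article by StrongbyTime, released under the CC 4.0 BY-SA license. When reposting, please include a link to the original source together with this notice.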