Python爬虫系列:爬取CSDN个人主页基本信息
1、背景
为了在嵌入式等设备上实时显示CSDN博客基本信息,写了该脚本,后面CSDN个人主页改版的话,爬虫可能不可用,当不可用时,博主可能会更新。
如果在使用过程中遇到问题,可以留言或者自己修改代码解决。
Python版本:Python3.9
依赖库:
bs4
requests
最近更新时间:2022.03.23
2、代码
#!/usr/bin/python3
# -*- coding: UTF-8 -*-
import requests
from bs4 import BeautifulSoup
import re
class csdn:
    """Scraper for the public statistics shown on a CSDN user's profile page.

    NOTE(review): this depends on CSDN's current page layout; the CSS
    selectors will silently stop matching if the site is redesigned.
    Class name kept lowercase for backward compatibility with existing
    callers (PEP 8 would use ``Csdn``).
    """

    def __init__(self, user_id: str):
        """
        Args:
            user_id: CSDN user id, e.g. ``'baidu_26678247'``.
        """
        self.user_id = user_id
        # CSDN serves profiles over HTTPS; plain http:// only worked via
        # a redirect, so request the final scheme directly.
        self.url_mainpage = "https://blog.csdn.net/" + self.user_id
        # Browser-like headers so the request is not rejected as a bot.
        self.headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
            "cache-control": "max-age=0",
            "sec-ch-ua": "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"99\", \"Google Chrome\";v=\"99\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"Windows\"",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "none",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36"
        }

    @staticmethod
    def _first_int(text):
        """Return the first (possibly comma-grouped) integer in *text*, or None."""
        match = re.search(r"\d+(,\d+)*", text)
        if match is None:
            return None
        return int(match.group(0).replace(",", ""))

    def get_info(self):
        """Fetch the profile page and collect every statistics section.

        Returns:
            dict: {
                'base_info':        {'ico_url', 'user_name', 'code_age'},
                'profile_info':     {'access', 'original', 'ranking', 'fans'},
                'user_achievement': {'like', 'comments', 'collection'},
                'force_info':       {'level', 'total', 'monthly'},
                'creation_process': [{'url', 'count', 'year'}, ...],
                'special_column':   [{'title', 'url', 'count'}, ...],
            }
            Fields that cannot be found on the page are simply absent
            from the corresponding sub-dict/entry.
        """
        self.user_info = {}
        # Timeout added so an unreachable host cannot hang an embedded
        # device forever (original had no timeout).
        self.res = requests.get(self.url_mainpage, headers=self.headers,
                                timeout=10)
        self.res.encoding = 'utf-8'
        self.soup = BeautifulSoup(self.res.text, "html.parser")
        if self.soup is None:  # defensive; BeautifulSoup() never returns None
            return None
        self.user_info["base_info"] = self.get_user_base_info(self.soup)
        self.user_info["profile_info"] = self.get_user_profile_info(self.soup)
        self.user_info["user_achievement"] = self.get_user_achievement(
            self.soup)
        self.user_info["force_info"] = self.get_force_info(self.soup)
        self.user_info["creation_process"] = self.get_creation_process(
            self.soup)
        self.user_info["special_column"] = self.get_special_column(self.soup)
        return self.user_info

    def get_user_base_info(self, soup):
        """Avatar URL, display name and 'code age' (码龄) badge text."""
        user_base_info = {}
        avatars = soup.select('.user-profile-avatar img')
        if avatars:
            user_base_info["ico_url"] = avatars[0]["src"]
        name_divs = soup.select('.user-profile-head-name div')
        if len(name_divs) > 2:
            user_base_info["user_name"] = name_divs[0].text.strip()
            user_base_info["code_age"] = name_divs[1].text.strip()
        return user_base_info

    def get_user_profile_info(self, soup):
        """Visit count, original-article count, ranking and fan count."""
        labels = {"访问": "access", "原创": "original",
                  "排名": "ranking", "粉丝": "fans"}
        user_profile_info = {}
        for item in soup.select('.user-profile-head-info-r-c ul li'):
            text = item.text.strip()
            value = self._first_int(text)
            if value is None:
                continue
            for label, key in labels.items():
                # BUG FIX: the original tested `text.find(label) > 0`,
                # which misses a label at index 0; membership is correct.
                if label in text:
                    user_profile_info[key] = value
                    break
        return user_profile_info

    def get_user_achievement(self, soup):
        """Like, comment and bookmark (收藏) totals."""
        labels = {"点赞": "like", "评论": "comments", "收藏": "collection"}
        user_achievement = {}
        for item in soup.select('.aside-common-box-achievement li'):
            text = item.text.strip()
            value = self._first_int(text)
            if value is None:
                continue
            for label, key in labels.items():
                # BUG FIX: same `find(...) > 0` index-0 issue as above.
                if label in text:
                    user_achievement[key] = value
                    break
        return user_achievement

    def get_force_info(self, soup):
        """'Force' (原力) level plus total and current-month scores."""
        force_info = {}
        top = soup.select(
            '.user-influence-list .influence-top .influence-left')
        if top:
            force_info["level"] = int(top[0].text.strip())
        bottom = soup.select(
            '.user-influence-list .influence-bottom .influence-left')
        if bottom:
            text = bottom[0].text.strip()
            match = re.search(r"总分\s*(\d+(,\d+)*)", text)
            if match is not None:
                force_info["total"] = int(match.group(1).replace(",", ""))
            match = re.search(r"当月\s*(\d+(,\d+)*)", text)
            if match is not None:
                force_info["monthly"] = int(match.group(1).replace(",", ""))
        return force_info

    def get_creation_process(self, soup):
        """Per-year posting history: [{'url', 'count', 'year'}, ...]."""
        creation_process = []
        for anchor in soup.select('.aside-common-box-create li a'):
            entry = {"url": anchor['href']}
            # BUG FIX: select() always returns a list, so the original
            # `is not None` guard was always true and `[0]` raised
            # IndexError on an empty result; test truthiness instead.
            counts = anchor.select('.count')
            if counts:
                match = re.search(r"(\d+(,\d+)*)篇", counts[0].text.strip())
                if match is not None:
                    entry["count"] = int(match.group(1).replace(",", ""))
            times = anchor.select('.time')
            if times:
                match = re.search(r"(\d+(,\d+)*)年", times[0].text.strip())
                if match is not None:
                    entry["year"] = int(match.group(1).replace(",", ""))
            creation_process.append(entry)
        return creation_process

    def get_special_column(self, soup):
        """Column (专栏) list: [{'title', 'url', 'count'}, ...]."""
        special_column = []
        for item in soup.select(
                '.user-special-column .aside-common-box-content ul li'):
            column = {}
            names = item.select('.special-column-name')
            # BUG FIX: `is not None` was always true; guard emptiness.
            if names:
                column["title"] = names[0].text.strip()
                column["url"] = names[0]["href"]
            nums = item.select('.special-column-num')
            if nums:
                match = re.search(r"(\d+(,\d+)*)篇", nums[0].text.strip())
                if match is not None:
                    column["count"] = int(match.group(1).replace(",", ""))
                else:
                    column["count"] = 0
            special_column.append(column)
        return special_column
3、用法
可以将上述代码集成到自己项目中,如通过MQTT定时向嵌入式设备发送采集到的信息并显示
''' sample '''
# BUG FIX: the original wrote `csdn = csdn(...)`, rebinding the class
# name to the instance and shadowing the class; use a distinct name.
scraper = csdn('baidu_26678247')
print(scraper.get_info())
爬取数据结构
{
'base_info': { # 基本信息
'ico_url': 'https://xxx.xxx', # 头像url
'user_name': 'SameWorld', # 用户名
'code_age': '码龄7年' # 码龄
},
'profile_info': { # 属性信息
'access': 362586, # 访问数
'original': 93, # 原创
'ranking': 10486, # 排名
'fans': 261 # 粉丝
},
'user_achievement': { # 个人成就
'like': 309, # 喜欢数
'comments': 138, # 评论数
'collection': 1559 # 收藏数
},
'force_info': { # 原力信息
'level': 5, # 原力等级
'total': 1055, # 总分
'monthly': 66 # 当月
},
'creation_process': [ # 创作历程
{
'url': 'https://xxx.xxx', # 链接
'count': 1, # 数量
'year': 2022 # 年份
},
# ...more
],
'special_column': [ # 专栏信息
{
'title': 'Linux', # 标题
'url': 'https://xxx.xxx', # 专栏链接
'count': 1 # 专栏文章数
},
# ...more
]
}
版权声明:本文为baidu_26678247原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。