# 功能实现:
# 对北邮人论坛所有版块进行关键词搜索,将帖子名称及链接打印出来。
'''
实现的功能:BYR 全部讨论区 超级搜索
作者:莹
时间:2019.11.13
2019年11月13日20:29:03复习了一下爬虫
简介:模拟了论坛的登录,所有版块对关键词的搜索,附带搜索结果网址(手机可直接打开不用登录,PC端需登录)
并在查询结果有多页时实现多页爬取(最多5页,可自行设置)。
'''
# Output file for the search results (UTF-8, overwritten on every run); the
# print(..., file=doc) calls further down write into it.
doc = open('shixi_out.txt','w',encoding='utf-8')
import requests
from urllib import parse
import re
import time
import math
# Forum account credentials -- replace the placeholders before running.
username = 'xxxxxx'
password = 'xxxxxx'

# Keyword to search for.
str1 = '实习'
# Percent-encode the keyword: by the URL standard only a subset of ASCII is
# allowed, so other characters (such as Chinese) have to be encoded.
str2 = parse.quote(str1)

# One session object carries the login request and every later request.
session = requests.Session()

# Forum login. The headers imitate the XMLHttpRequest that a desktop Chrome
# browser sends from the forum's front page.
headers_login = dict((
    ('origin', 'https://bbs.byr.cn'),
    ('accept-encoding', 'gzip, deflate, br'),
    ('accept-language', 'zh-CN,zh;q=0.9'),
    ('user-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'),
    ('content-type', 'application/x-www-form-urlencoded; charset=UTF-8'),
    ('accept', 'application/json, text/javascript, */*; q=0.01'),
    ('referer', 'https://bbs.byr.cn/'),
    ('authority', 'bbs.byr.cn'),
    ('x-requested-with', 'XMLHttpRequest'),
    ('dnt', '1'),
))
# Form fields expected by the login endpoint: account id and password.
data_login = dict(id=username, passwd=password)
response_login = session.post(
    'https://bbs.byr.cn/user/ajax_login.json',
    data=data_login,
    headers=headers_login,
)
# Uncomment to inspect the server's reply when troubleshooting the login:
# print(response_login.text)
# Purpose: collect the identifier of every board, large sections and the
#          boards inside them alike.
# Output:  board_list, a list with one (board id, board name) tuple per board
#          link found; per the original author's note these are the board's
#          English and Chinese names.

# Number of section pages to scan: /section/0 ... /section/9.
SECTION_COUNT = 10

# Built once -- neither the headers nor the query parameters depend on the
# section being fetched.
headers_section = {
    'dnt': '1',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
    'accept': '*/*',
    'referer': 'https://bbs.byr.cn/',
    'authority': 'bbs.byr.cn',
    'x-requested-with': 'XMLHttpRequest',
}
params_section = (
    ('_uid', username),
)

# Raw string so the pattern contains no invalid string escapes ('/' needs no
# escaping in a Python regex). Group 1 is the board id taken from the link
# target, group 2 is the link text.
compile_board = re.compile(r'<a href="/board/(.*?)">(.*?)</a>')

board_list = []
for i in range(SECTION_COUNT):
    response_section = session.get(
        'https://bbs.byr.cn/section/' + str(i),
        headers=headers_section,
        params=params_section,
    )
    # findall() returns one (id, name) tuple per board link on the page.
    board_list.extend(compile_board.findall(response_section.text))

# Uncomment to list every board with its index, e.g. to choose which index
# range to search:
# for index, board in enumerate(board_list):
#     print(index, board)
# Purpose: run the keyword search on each selected board.
# Inputs:  board_list (board id / name tuples), the encoded keyword str2 and
#          the index range of the boards to search.
# Output:  the title and URL of every thread found, written to `doc`.

# Half-open range of board_list indices to search. To search every board set
# BOARD_INDEX_START = 0 and BOARD_INDEX_STOP = len(board_list).
BOARD_INDEX_START = 83
BOARD_INDEX_STOP = 86
# The page count and the running result numbers below assume this page size.
RESULTS_PER_PAGE = 80
# Upper bound on the number of result pages fetched per board.
MAX_PAGES = 5
# Seconds to pause after each search request.
REQUEST_DELAY = 5
SITE_ROOT = 'https://bbs.byr.cn'
SEARCH_URL = SITE_ROOT + '/s/article'

# The same headers are sent for the first result page and for every
# follow-up page, so they are built once.
headers_search = {
    'dnt': '1',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
    'accept': '*/*',
    'referer': 'https://bbs.byr.cn/',
    'authority': 'bbs.byr.cn',
    'x-requested-with': 'XMLHttpRequest',
}

# One result row of the search table. Compiled once, as raw strings, so the
# pattern carries no invalid string escapes. Of the eleven groups only the
# first three are used below: running number, thread link, thread title.
re_extract = re.compile(
    r'<tr><td\sclass="title_8">(\S*?)\.</td>'
    r'<td\sclass="title_9"><a\shref="(\S*?)">([\S\s]*?)</a>.*?</td>'
    r'<td\sclass="title_10">(\S*?)</td>'
    r'<td\sclass="title_12">\|&ensp;<a\shref="(\S*?)"\sclass="c63f">(\S*?)</a></td>'
    r'<td\sclass="title_11\smiddle">(\S*?)</td>'
    r'<td\sclass="title_10"><a\shref="(\S*?)"\stitle="跳转至最后回复">(\S*?)</a></td>'
    r'<td\sclass="title_12">\|&ensp;<a\shref="(\S*?)"\sclass="c09f">(\S*?)</a></td>'
)
# Total number of matching threads, shown next to the pagination control.
re_topic_num = re.compile(r'<ul class="pagination"><li class="page-pre">主题数:<i>(.*?)</i>')

try:
    # Clamp the range so that a shorter board list (for example after a
    # failed login) yields fewer searches instead of an IndexError.
    for i in range(BOARD_INDEX_START, min(BOARD_INDEX_STOP, len(board_list))):
        board_id, board_name = board_list[i]

        # NOTE(review): str2 is already percent-encoded and requests encodes
        # query parameters again, so the keyword is sent double-encoded --
        # presumably what the site expects; confirm before changing.
        params_search = (
            ('t1', str2),
            ('au', ''),
            ('b', board_id),
            ('_uid', username),
        )
        response_search = session.get(SEARCH_URL, headers=headers_search, params=params_search)
        first_page = response_search.text

        # Board header line, then one line per thread: number, title, URL.
        print(f'{i}.{board_name}', file=doc)
        for row in re_extract.findall(first_page):
            print(f' ({row[0]}){row[2]} {SITE_ROOT}{row[1]}', file=doc)
        time.sleep(REQUEST_DELAY)

        # Work out the page count to decide whether more pages must be fetched.
        re_item_num_match = re_topic_num.findall(first_page)
        if not re_item_num_match:
            continue
        num_page = min(math.ceil(int(re_item_num_match[0]) / RESULTS_PER_PAGE), MAX_PAGES)

        # Page 1 is already written; an empty range means there is nothing more.
        for num_count in range(2, num_page + 1):
            # NOTE(review): `_uid` is sent twice here, unlike in the
            # first-page request; presumably an artifact of how the request
            # was captured -- confirm before simplifying.
            params_page = (
                ('t1', str2),
                ('au', ''),
                ('b', board_id),
                ('_uid', [username, username]),
                ('p', num_count),
            )
            response_page = session.get(SEARCH_URL, headers=headers_search, params=params_page)
            # The captured number is offset by the results of the earlier
            # pages before it is written out.
            offset = RESULTS_PER_PAGE * (num_count - 1)
            for row in re_extract.findall(response_page.text):
                print(f' ({int(row[0]) + offset}){row[2]} {SITE_ROOT}{row[1]}', file=doc)
            time.sleep(REQUEST_DELAY)
finally:
    # Close the output file even when a request or the parsing fails, so the
    # results written so far are flushed to disk.
    doc.close()
# 版权声明:本文为qq_34623223原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。