Tianya Forum — a Python Web Crawler for Downloading Forum Posts by Keyword (Part 1)

Preface

This article describes a crawler that searches the Tianya forum by keyword and downloads the matching posts, including their comments and replies.

Content

For each post, the crawler collects the title, the author's user id, the reply time, the reply count, the click count, the full post body, and every comment together with its replies.
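Each post becomes one record in the output file data.json. For reference, a single record has the shape sketched below (all values here are made up for illustration):

[
    {
        "title": "Post title",
        "main_id": "12345678",
        "time": "2020-03-01 12:00:00",
        "replycount": "56",
        "clickcount": "7890",
        "article": "Full text of the opening post...",
        "comments_replys": [
            {
                "comment": {"23456789": "Text of a comment"},
                "reply": {"34567890": "Text of a reply to that comment"}
            }
        ]
    }
]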

 

Code


import json
from time import sleep
from urllib.parse import quote

import requests
from pyquery import PyQuery as pq

page = 75               # number of search result pages to fetch
key_word = '新冠疫情'    # search keyword ("COVID-19 pandemic")


def parse_all_page(urls):
    """
    Parse every search result page, collect post URLs,
    and filter out posts that have no comments.
    :param urls: search result page URLs
    :return: content_urls
    """
    content_urls = []
    for url in urls:
        sleep(1)  # throttle requests
        print('Crawling:', url)
        doc = pq(requests.get(url=url, timeout=30).text)
        doc('.searchListOne li:last-child').remove()  # drop the trailing non-result li node
        lis = doc('.searchListOne li').items()  # generator over result nodes
        for li in lis:
            reply_count = li('.source span:last-child').text()  # number of comments
            a = li('a:first-child')
            content_url = a.attr('href')
            if reply_count and reply_count != '0':  # skip posts without comments
                content_urls.append(content_url)

    return content_urls


def parse_all_content(urls):
    """
    Fetch every post page and extract its metadata, body,
    comments, and comment replies, then write everything to data.json.
    :param urls: post URLs collected from the search pages
    :return: None
    """
    posts = []
    for i, url in enumerate(urls):
        print(i, 'Parsing:', url)
        try:
            post = {}
            doc = pq(requests.get(url=url, timeout=30).text)
            title = doc('.atl-head .atl-title').text()
            main_id = doc('.atl-head .atl-menu').attr('_host')  # post author's user id
            replytime = doc('.atl-head .atl-menu').attr('js_replytime')
            replycount = doc('.atl-head .atl-menu').attr('js_replycount')
            clickcount = doc('.atl-head .atl-menu').attr('js_clickcount')
            article = next(doc('.bbs-content').items()).text()  # first .bbs-content node is the post body
            post["title"] = str(title)
            post["main_id"] = main_id
            post["time"] = replytime
            post["replycount"] = replycount
            post["clickcount"] = clickcount
            post["article"] = article

            comments_replys = []
            comments = doc('.atl-main div:gt(1)').items()  # comment blocks after the banner ad
            for comment in comments:
                host_id = comment.attr('_hostid')  # comment author's user id
                comment_text = comment('.bbs-content').text()
                replies = comment('.item-reply-view li').items()  # replies under this comment

                reply_map = {}
                for reply in replies:
                    rid = reply.attr('_rid')  # replier's user id
                    rtext = reply('.ir-content').text()
                    # keep replies from third parties only; key by replier id
                    # (the original keyed by host_id, which overwrote earlier replies)
                    if rid and rid != main_id and rid != host_id:
                        reply_map[rid] = rtext

                if host_id:
                    # drop the quoted text above the dashed separator,
                    # keeping only the commenter's own words
                    delim = "----------------------------"
                    k = comment_text.rfind(delim)
                    if k != -1:
                        comment_text = comment_text[k + len(delim):].lstrip()
                    comments_replys.append({
                        'comment': {host_id: comment_text},
                        'reply': reply_map,
                    })
            post["comments_replys"] = comments_replys
            posts.append(post)
        except Exception as e:
            print('Failed to parse', url, ':', e)
            continue

    print(json.dumps(posts, ensure_ascii=False, indent=4))  # echo the collected data
    with open('data.json', 'w', encoding='utf-8') as f:
        json.dump(posts, f, ensure_ascii=False, indent=4)




def run(key, page):
    """
    :param key: search keyword
    :param page: number of search result pages to crawl
    :return: None
    """
    start_urls = []
    for p in range(1, page + 1):
        url = 'http://search.tianya.cn/bbs?q={}&pn={}'.format(quote(key), p)
        start_urls.append(url)
    content_urls = parse_all_page(start_urls)
    parse_all_content(content_urls)


if __name__ == '__main__':
    run(key_word, page)
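
After a run completes, data.json can be loaded back for inspection or analysis. A minimal sketch (assuming the file was produced by the run above):

import json

with open('data.json', encoding='utf-8') as f:
    posts = json.load(f)

print('posts crawled:', len(posts))
for post in posts[:3]:  # peek at the first few records
    print(post['title'], '| replies:', post['replycount'], '| clicks:', post['clickcount'])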

 


Copyright notice: this is an original article by weixin_43906500, released under the CC 4.0 BY-SA license. Please include a link to the original source and this notice when reposting.