Forum crawler: scraping tid, title, and uid

This is practice crawler code I wrote myself; I hope the approach is useful to others. My skills are still rough and I am still practicing. Keep at it!!

# -*- coding: utf-8 -*-
# @Time : 2021/8/10 16:56
# @Author : yz
# @File : 论坛2.py
# @Software : PyCharm
# Import the requests library; requests is a simple, easy-to-use HTTP library for Python, much more concise than urllib
import requests
from lxml import etree
import re


def main():
    head()  # write the header row before crawling
    url = "http://114.112.74.132:8089/forum.php?mod=viewthread&tid="
    askurl(url)


# Precompiled regex that pulls the numeric uid out of a profile link
findUid = re.compile(r'uid=(\d+)')
# Highest thread id (tid) to try
max_tid = 5846

# Check that the URL is reachable and the HTTP status code is 200
def get_url_content(url):
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        return None
    if response.status_code == 200:
        # The forum returns this message when a thread does not exist,
        # has been deleted, or is awaiting moderation
        if "抱歉,指定的主题不存在或已被删除或正在被审核" in response.text:
            return None
        # Otherwise return the parsed page source
        return etree.HTML(response.text)
    return None

# Parse a thread page and extract its title and the poster's uid
def parse_post_data(html_text, tid):
    title_list = html_text.xpath('//*[@id="thread_subject"]')
    title = title_list[0].text

    # The poster's profile link sits inside the favatar block of this thread;
    # pull the numeric uid out of its href with the precompiled regex
    uid_list = html_text.xpath('//*[@id="favatar%s"]/div[1]/div/a/@href' % tid)
    uid = findUid.search(str(uid_list[0])).group(1)

    post_content_info = {
        "tid": tid,
        "title": title,
        "uid": uid,
    }
    return post_content_info
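
# Illustration (assumed Discuz-style profile link; not taken from the original
# page): an href such as "home.php?mod=space&uid=12345" matches findUid and
# yields the uid "12345".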

# Write the header row of the output file
def head():
    with open(".\\luntan.txt", "w", encoding='utf-8') as f:
        f.write("tid,title,uid\n")

# Crawl every thread page and append the extracted fields to the output file
def askurl(url):
    # tid values start at 1; range(max_tid) would also stop one short of max_tid
    for i in range(1, max_tid + 1):
        html = get_url_content(url + str(i))
        if html is not None:
            post_info = parse_post_data(html, str(i))
            line = post_info["tid"] + ',' + post_info["title"] + ',' + post_info["uid"]
            print(line)
            with open(".\\luntan.txt", "a+", encoding='utf-8') as f:
                f.write(line + '\n')

if __name__ == "__main__":
    print("tid,title,uid")
    main()
    print("Crawl finished!!!")

Copyright notice: This is an original article by qq_43083688, released under the CC 4.0 BY-SA license. Please include a link to the original source and this notice when reposting.