【Python爬虫案例】2022年5月爬取《读者》文章,爬虫基础入门案例(BeautifulSoup)

爬虫入门,萌新报到

第一篇独立完成的爬虫,网址目前未见其他人爬过

入门作品代码略显冗余,欢迎大家批评指正!!

在这里插入图片描述

源码

"""
作者:菜菜的大鹏
每天起床第一句,大鹏世界最菜!
"""
import random
from bs4 import BeautifulSoup
import requests
import os


def getBaseHtml(url,headers):
    """Fetch the index page and crawl every issue linked from it.

    Downloads ``url``, collects every element whose class is
    ``zp-book-item``, prints how many were found, and calls
    ``getDetailUrl`` with each entry's link target and heading text.

    Args:
        url: Address of the index page.
        headers: Mapping of HTTP request headers (e.g. ``User-Agent``).

    Returns:
        None. A response whose status code is not 200 is ignored.
    """
    # The second positional parameter of requests.get is ``params``; the
    # headers must be passed by keyword or they end up in the query string.
    res = requests.get(url, headers=headers, timeout=5)
    if res.status_code != 200:
        return

    soup = BeautifulSoup(res.text, "html.parser")
    books = soup.find_all(attrs={'class': 'zp-book-item'})
    print("现有总期数为:",len(books))

    for item in books:
        link = item.a
        heading = item.h2
        if link is None or heading is None:
            continue  # entry has no link or no heading to name the issue

        href = link.get('href')
        dateName = heading.string
        # ``.string`` is None when the heading contains nested markup; the
        # name is later concatenated into messages and paths, so skip it.
        if not href or dateName is None:
            continue

        getDetailUrl(href, dateName)



def getDetailUrl(url,dateName):
    """Fetch one issue page and download every article listed on it.

    Picks a random entry from the module-level ``uas`` list as the
    ``User-Agent``, downloads ``url``, takes the ``<li>`` items of the first
    ``<ul>`` that has no ``class`` attribute, and calls ``getText`` for each
    item's link.

    Args:
        url: Address of the issue page.
        dateName: Name of the issue; used in progress messages and passed on
            to ``getText``.

    Returns:
        None. A response whose status code is not 200, or a page without a
        class-less ``<ul>``, is ignored.
    """
    headers = {
        'User-Agent': random.choice(uas),
    }
    # Pass headers by keyword: the second positional parameter of
    # requests.get is ``params``, not ``headers``.
    res = requests.get(url, headers=headers, timeout=5)
    if res.status_code != 200:
        return

    soup = BeautifulSoup(res.text, "html.parser")
    books = soup.find_all('ul', attrs={'class': None})
    if not books:
        return  # no article list on this page; nothing to crawl

    booklist = books[0].find_all('li')
    print(dateName+"文章数为:", len(booklist))

    for item in booklist:
        link = item.a
        if link is None:
            continue  # list item without a link

        href = link.get('href')
        title = link.get('title')
        # The title becomes the output file name, so it must be present.
        if not href or not title:
            continue

        getText(href, dateName, title)

    print("第" + dateName + "爬取完毕")



def getText(url,dateName,title):
    """Fetch one article page and hand its content element to ``saveText``.

    Picks a random entry from the module-level ``uas`` list as the
    ``User-Agent``, downloads ``url`` and looks up the first element whose
    class is ``contentbox``.

    Args:
        url: Address of the article page.
        dateName: Name of the issue the article belongs to.
        title: Title of the article.

    Returns:
        None. A response whose status code is not 200 is ignored.
    """
    headers = {
        'User-Agent': random.choice(uas),
    }
    # Pass headers by keyword: the second positional parameter of
    # requests.get is ``params``, not ``headers``.
    res = requests.get(url, headers=headers, timeout=5)
    if res.status_code != 200:
        return

    soup = BeautifulSoup(res.text, "html.parser")
    # ``find`` returns None when the page has no content box; saveText
    # checks for that itself.
    text = soup.find(attrs={'class': 'contentbox'})
    saveText(text, dateName, title)


# Characters that cannot appear in a file name on Windows and/or POSIX,
# each mapped to an underscore.
_UNSAFE_FILENAME_CHARS = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})


def _safe_filename(name):
    """Return ``name`` with path separators and reserved characters replaced by ``_``."""
    return name.translate(_UNSAFE_FILENAME_CHARS)


def saveText(text,dateName,title):
    """Write the paragraphs of an article to ``text/<dateName>/<title>.txt``.

    The file starts with the title on its own line, followed by the text of
    every ``<p>`` found in ``text``, each indented by two spaces. Characters
    that are not valid in file names are replaced by ``_`` in the directory
    and file name only; the title written inside the file is unchanged.

    Args:
        text: Element exposing ``find_all('p')``, or None. Nothing is
            written when it is None.
        dateName: Name of the issue; used as the sub-directory name.
        title: Title of the article; used as the file name.

    Returns:
        None.
    """
    if text is None:
        return

    messages = text.find_all('p')

    folder = os.path.join('text', _safe_filename(dateName))
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(folder, exist_ok=True)

    target = os.path.join(folder, _safe_filename(title) + '.txt')
    with open(target, "w", encoding='utf-8') as f:
        f.write(title+"\n")
        f.writelines("  "+message.text+'\n' for message in messages)

def LoadUserAgents(uafile):
    """Load user-agent entries from a file and return them in random order.

    The file is opened in binary mode, so every entry is a ``bytes`` object.
    Each line is stripped of surrounding whitespace and then loses its last
    character; lines that are empty after that are skipped.

    Args:
        uafile: Path of the file holding one user agent per line.

    Returns:
        A shuffled list of ``bytes`` user-agent entries.
    """
    uas = []
    with open(uafile, 'rb') as uaf:
        for line in uaf:
            # NOTE(review): the last character of each stripped line is
            # dropped, as before; presumably every line ends with a delimiter
            # character. Confirm against the real user_agents.txt.
            ua = line.strip()[:-1]
            # Test the processed value: a raw line always still contains its
            # newline, so testing it never filtered out blank lines.
            if ua:
                uas.append(ua)
    random.shuffle(uas)
    return uas

if __name__ == '__main__':
    # Script entry point. The user-agent pool is stored in the module-level
    # ``uas`` because the crawl helpers read it when building their headers.
    uas = LoadUserAgents("user_agents.txt")

    baseUrl = 'http://duzhe.1she.com/'
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/101.0.4951.67 Safari/537.36'
        ),
    }

    # Start the crawl from the site's index page.
    getBaseHtml(baseUrl, headers)

版权声明:本文为weixin_52144018原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。