学习札记之python:一只黄色的小爬虫

最近看小甲鱼的课程,动手实战了一下,成功了还是很开心的。
废话不多说,直接上源码(本人比较懒,所以没有注释)。

import urllib.request as ureq
import os
import time


def url_open(url):
    """Fetch *url* and return the raw response body as ``bytes``.

    Sends a desktop-browser User-Agent, plus a Referer equal to the URL
    itself — presumably the target site rejects requests without one
    (TODO: confirm against the site's behavior).

    :param url: absolute HTTP(S) URL to fetch.
    :return: response body as bytes (callers decode when they need text).
    :raises urllib.error.URLError: on network or HTTP failure.
    """
    req = ureq.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0')
    req.add_header('Referer', url)
    # Use a context manager so the connection is closed deterministically
    # instead of being leaked until garbage collection.
    with ureq.urlopen(req) as response:
        return response.read()


def get_pages(url):
    """Scrape the listing page at *url* and collect gallery descriptors.

    Scans the decoded HTML for every ``data-original=`` attribute and
    captures the text between the attribute's opening quote and the
    following ``' width`` marker. Each captured string later gets split
    on ``'`` by the caller to recover the gallery URL and title.

    :param url: listing-page URL to scan.
    :return: list of raw captured substrings (possibly empty).
    """
    marker = 'data-original='
    html = url_open(url).decode('utf-8')
    descriptors = []
    start = html.find(marker)
    while start != -1:
        end = html.find("' width", start)
        if end == -1:
            # No closing marker after this hit: skip past the attribute
            # (len(marker) + 1 also skips the opening quote) and keep going.
            end = start + 15
        else:
            descriptors.append(html[start + 15:end])
        start = html.find(marker, end)
    return descriptors


def find_imgs(url):
    """Return the address of the first ``.jpg`` image referenced on *url*.

    Scans the page for ``src=`` attributes and returns the first one whose
    value ends in ``.jpg`` within 255 characters of the attribute. Despite
    the plural-looking name (kept for caller compatibility), the return
    value is a single URL string.

    Bug fix: the original ``while b == -1`` loop never checked whether
    ``html.find('src=', ...)`` ran out of matches, so a page with no
    qualifying image could loop forever (and searched with a bogus
    negative start index). This version terminates and raises instead.

    :param url: gallery photo-page URL.
    :return: image URL string (the text between ``src="`` and ``.jpg``,
             inclusive of the ``.jpg`` suffix).
    :raises ValueError: if no ``src=...jpg`` pattern is found.
    """
    html = url_open(url).decode('utf-8')
    a = html.find('src=')
    while a != -1:
        b = html.find('.jpg', a, a + 255)
        if b != -1:
            # a + 5 skips 'src="'; b + 4 keeps the '.jpg' suffix.
            return html[a + 5:b + 4]
        a = html.find('src=', a + 5)
    raise ValueError('no .jpg image address found in page: %s' % url)


def save_img(img_addrs):
    """Download one image and save it in the current working directory.

    The filename is the last path component of the image URL.

    Bug fix: the original opened the output file *before* downloading, so
    a failed download left an empty file behind. Fetch first, then write.

    :param img_addrs: URL of a single image (string, despite the name).
    :raises urllib.error.URLError: if the download fails (no file is
            created in that case).
    """
    filename = img_addrs.split('/')[-1]
    img = url_open(img_addrs)
    with open(filename, 'wb') as f:
        f.write(img)


def maxnum(url, maxPageInd):
    """Extract the gallery's total page count from its pagination widget.

    Locates *maxPageInd* (a caller-built HTML snippet that immediately
    follows the last page number) in the page at *url*, then backs up to
    the nearest ``<span>`` tag and parses the digits in between.

    NOTE(review): relies on the site's exact pagination markup; if the
    snippet is absent the slice indices go negative and ``int`` will
    raise ``ValueError`` — confirm against live pages.

    :param url: gallery front-page URL.
    :param maxPageInd: HTML fragment anchoring the last page number.
    :return: total number of photo pages as an ``int``.
    """
    html = url_open(url).decode('utf-8')
    anchor = html.find(maxPageInd)
    # Search a little before the anchor for the <span> holding the number.
    span_at = html.find('<span>', anchor - 10)
    return int(html[span_at + 6:anchor])


def meizi():
    """Interactive entry point: crawl the listing page and download galleries.

    Prompts for an output folder, scrapes the site's front page for
    gallery links, then downloads every photo of every gallery into a
    per-gallery subfolder, sleeping 5 seconds between galleries to be
    polite to the server.

    Side effects: creates directories, changes the process CWD, writes
    image files, reads stdin, prints progress, and performs network I/O.
    """
    folder = input("请输入保存图包的文件夹名称:")

    try:
        os.mkdir(folder)
    except OSError:
        # Folder probably exists already; os.chdir below fails loudly
        # if it is truly unusable.
        pass
    os.chdir(folder)

    url = 'https://www.mzitu.com/'
    # NOTE(review): page_num is collected but never used — the scraper
    # always processes only the front listing page. TODO: honor the
    # requested page count once the site's pagination URL scheme is
    # confirmed.
    page_num = input("请输入你想要下载的页数:")
    print('开始获取页面信息...\n')
    page_nums = get_pages(url)
    print('检测完毕,当前页面共有%d个图包,即将分别爬取...\n' % len(page_nums))

    for count, each in enumerate(page_nums, start=1):
        # Each descriptor looks like <path>'...'<title>; split on the
        # quotes to recover the gallery id and its display name.
        temp = each.split("'")
        packnum = temp[0].split('/')[-1]
        name = packnum.split('_')[0]
        newfolder = temp[-1]
        page_url = url + name
        page_url2 = page_url + '/2'

        # Anchor snippet that sits right after the last page number in
        # the gallery's pagination widget.
        maxPageInd = "</span></a><a href='" + page_url2 + "'><span>"
        every_maxnum = maxnum(page_url, maxPageInd)
        print('第%d个图包的总图片数为%d张...\n' % (count, every_maxnum))

        os.mkdir(newfolder)
        os.chdir(newfolder)
        for i in range(every_maxnum):
            page_url3 = page_url + '/' + str(i + 1)
            print("正在获取第%d张图片地址..." % (i + 1))
            img_addrs = find_imgs(page_url3)
            save_img(img_addrs)

        os.chdir('..')
        print("等待5秒后,开始爬取下一个图包。")
        time.sleep(5)


# Run the interactive scraper only when executed as a script,
# not when imported as a module.
if __name__ == '__main__':
    meizi()


版权声明:本文为qq_41926906原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。