Crawler learning notes: saving images

Crawler learning notes: meiztu


# -*- coding: utf-8 -*-
import requests
import re
import os
import time
from scrapy import Selector
from fake_useragent import UserAgent

# Build the request headers with a randomized User-Agent
ua = UserAgent()
headers = {
	'Referer': '**********',
	'Sec-Fetch-Mode': 'no-cors',
	'User-Agent': ua.random
}
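# Note: the fixed Referer header is doing real work here. Image hosts
# commonly reject hotlinked requests that carry no (or a foreign) Referer,
# so dropping it is likely to turn the image downloads below into 403s.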

# Fetch each list page (1 to 248)
for first_i in range(1, 249):
	url = '*********/page/' + str(first_i) + '/'
	response_first = requests.get(url = url, headers = headers)
	print('response_first ---code===' + str(response_first.status_code))
	
	selector_first = Selector(text=response_first.text)
	# Use XPath to collect the links to the detail pages
	href_list_first = selector_first.xpath('//*[@id="pins"]/li/a/@href').getall()
	for href_first in href_list_first:
		print(href_first)
		
		response_first = requests.get(url = href_first, headers = headers)
		response_first.encoding = 'utf-8'
		selector_first = Selector(text=response_first.text)
		# Grab the detail page's title; it becomes the folder name later
		name = selector_first.xpath('/html/body/div[2]/div[1]/h2/text()').extract_first()
		# Strip characters that Windows does not allow in folder names
		name = re.sub(r'[\*"/:?\\|<>]', '', name)
		print(name)
		# Read the number of images from the detail page's pagination bar
		page = selector_first.xpath('/html/body/div[2]/div[1]/div[4]/a[5]/span/text()').extract_first()
		if page is None:
			# No pagination found on this detail page, skip it
			continue
		elif page == '下一页»':
			# The 5th anchor is the "next page" button, so the real last
			# page number sits in the 3rd anchor instead
			page = selector_first.xpath('/html/body/div[2]/div[1]/div[4]/a[3]/span/text()').extract_first()
			page = int(page) + 1
		else:
			page = int(page) + 1
			print(page)
		# Request each image page of this detail page
		for i in range(1, page):
			
			url_second = href_first + '/' + str(i)
			response_second = requests.get(url = url_second, headers = headers)
			print('response_second ---code===' + str(response_second.status_code))
			# The main photo is the <img class="blur"> tag; pull its src
			picture_url = re.findall('''<img class="blur" src="(.*?)"''', response_second.text)
			if picture_url:
				print(picture_url[0])
				response_picture = requests.get(url = picture_url[0], headers = headers)
				if response_picture.status_code == 200:
					print('response_picture ---code===' + str(response_picture.status_code))
					print('Downloading image ' + str(i) + ' of series ' + str(name) + ' from list page ' + str(first_i))
					
					# Create one folder per detail-page series
					os.makedirs(f'./首页/{name}', exist_ok=True)
					with open(f'./首页/{name}/{i}.jpg', 'wb') as f:
						# Write the image bytes to disk
						f.write(response_picture.content)
						print(str(name) + ' image ' + str(i) + ' saved')
					print('*' * 20)
					# Sleep one second per image to avoid an IP ban
					time.sleep(1)
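
The script pulls in three third-party packages: requests, scrapy (only for its Selector class) and fake_useragent. If any of them are missing, install them first:

pip install requests scrapy fake-useragent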

Since the script sleeps one second per image and there are a lot of images, a full crawl takes a long time. If anyone has a working IP proxy pool set up, I'd love to hear about it!
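
If you do get hold of a proxy pool, a minimal way to rotate it with requests looks roughly like the sketch below. The PROXY_POOL list and the get_with_proxy helper are placeholders of my own, not part of the script above; fill the list with proxies you actually control.

import random
import requests

# Placeholder pool; replace with real proxy addresses
PROXY_POOL = [
	'http://10.0.0.1:8888',
	'http://10.0.0.2:8888',
]

def get_with_proxy(url, headers, retries=3):
	# Try up to `retries` randomly chosen proxies before giving up
	for _ in range(retries):
		proxy = random.choice(PROXY_POOL)
		try:
			return requests.get(url, headers=headers,
				proxies={'http': proxy, 'https': proxy}, timeout=10)
		except requests.RequestException:
			continue  # this proxy failed, try another one
	# Every proxy failed: fall back to a direct request
	return requests.get(url, headers=headers, timeout=10)

With something like this in place, each requests.get call in the script can be swapped for get_with_proxy, and the time.sleep(1) can then be shortened or removed.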

