Web-scraping practice notes: meizitu
# -*- coding: utf-8 -*-
import requests
import re
import os
import time
from scrapy import Selector
from fake_useragent import UserAgent
# Build the request headers with a random User-Agent
ua = UserAgent()
headers = {
    'Referer': '**********',
    'Sec-Fetch-Mode': 'no-cors',
    'User-Agent': ua.random
}
# Fetch each list page
for first_i in range(1, 249):
    url = '*********/page/' + str(first_i) + '/'
    response_first = requests.get(url=url, headers=headers)
    print('response_first ---code===' + str(response_first.status_code))
    selector_first = Selector(response_first)
    # Collect the links to the detail pages with XPath
    href_list_first = selector_first.xpath('//*[@id="pins"]/li/a/@href').getall()
    for href_first in href_list_first:
        print(href_first)
        response_first = requests.get(url=href_first, headers=headers)
        response_first.encoding = 'utf-8'
        selector_first = Selector(response_first)
        # Grab the gallery title from the detail page; it becomes the folder name later
        name = selector_first.xpath('/html/body/div[2]/div[1]/h2/text()').extract_first()
        # Strip characters that Windows does not allow in folder names
        name = re.findall(r'[^\*"/:?\\|<>]', name)
        name = "".join(name)
        print(name)
        # Read the number of images in this gallery from the pagination bar
        page = selector_first.xpath('/html/body/div[2]/div[1]/div[4]/a[5]/span/text()').extract_first()
        if page is None:
            # No page count found; skip this gallery instead of crashing below
            continue
        elif page == '下一页»':
            # The fifth link is "next page", so the real count sits in the third link
            page = selector_first.xpath('/html/body/div[2]/div[1]/div[4]/a[3]/span/text()').extract_first()
            page = int(page) + 1
        else:
            page = int(page) + 1
        print(page)
        # Request every image page of this gallery
        for i in range(1, page):
            url_second = href_first + '/' + str(i)
            response_second = requests.get(url=url_second, headers=headers)
            print('response_second ---code===' + str(response_second.status_code))
            picture_url = re.findall('''<img class="blur" src="(.*?)"''', response_second.text)
            if picture_url:
                print(picture_url[0])
                response_picture = requests.get(url=picture_url[0], headers=headers)
                if response_picture.status_code == 200:
                    print('response_picture ---code===' + str(response_picture.status_code))
                    print('Fetching image ' + str(i) + ' of gallery ' + str(name) + ' from list page ' + str(first_i))
                    # Create one folder per gallery, named after the detail page title
                    if not os.path.exists(f'./首页/{name}'):
                        os.makedirs(f'./首页/{name}')
                    # Save the image ('wb' so a rerun overwrites instead of appending)
                    with open(f'./首页/{name}/{i}.jpg', 'wb') as f:
                        f.write(response_picture.content)
                    print(str(name) + ' image ' + str(i) + ' saved')
                    print('*' * 20)
                    # Sleep one second between images to avoid an IP ban
                    time.sleep(1)
Since the script downloads one image per second and there are a lot of images, it takes forever! If anyone has a working IP proxy pool, I'd love to hear about it!
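For reference, here is a minimal sketch of the proxy-rotation idea with requests. Everything in it is an assumption: the proxy addresses are placeholders for live proxies from your own pool, and get_with_proxy is a hypothetical helper, not part of the script above.

# Minimal sketch of proxy rotation with requests (an assumed setup, not the
# author's method): PROXIES holds placeholder addresses -- replace them with
# live proxies from your own pool.
import random
import requests

PROXIES = [
    'http://127.0.0.1:8001',  # placeholder proxy address
    'http://127.0.0.1:8002',  # placeholder proxy address
]

def get_with_proxy(url, headers, retries=3):
    # Try up to `retries` random proxies before giving up
    for _ in range(retries):
        proxy = random.choice(PROXIES)
        try:
            return requests.get(url, headers=headers,
                                proxies={'http': proxy, 'https': proxy},
                                timeout=10)
        except requests.RequestException:
            continue  # proxy dead or too slow, try another one
    return None

With a pool like this, each request goes out through a different IP, so the per-IP rate limit matters less and the one-second sleep could probably be shortened.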