python爬虫:requests+xpath,图片下载

一、示例

import requests
from lxml import html

# Fetch a page and parse it with lxml so it can be queried with XPath.
# NOTE: verify=False turns off TLS certificate verification for this request.
res = requests.get(url, verify=False, headers=self.headers, timeout=60)
# Fail fast on 4xx/5xx responses instead of parsing an error page, which
# would otherwise only surface later as an IndexError on the [0] lookups.
res.raise_for_status()
# Decode res.text as UTF-8.
res.encoding = 'utf-8'

# Parse the response body with etree
etree = html.etree
parse_html = etree.HTML(res.text)

# xpath() returns a list of matches
r_list = parse_html.xpath("//div[@class='view-content']//text()")    # text() selects the text of the matched elements

# An element returned by xpath() can be queried again with a relative path (./...)
# NOTE: each [0] raises IndexError when the expression matches nothing.
details = parse_html.xpath('//*[@id="block-surrey-content"]/div/ul')[0]
number_of_rooms = details.xpath('./li[1]/text()')[0]    # li[1] is the first li element; XPath positions start at 1
single_rooms = details.xpath('./li[2]/text()')[0]
shared_rooms = details.xpath('./li[3]/text()')[0]

# Collect the src attribute value of every img element as a list
images = parse_html.xpath('//img/@src')

二、下载图片

# Download one image and save it under static/images/news/ with a
# timestamp-based file name.
# NOTE: verify=False turns off TLS certificate verification for this request.
res = requests.get(image_url, verify=False, headers=self.headers, timeout=60)
# Fail fast on 4xx/5xx so an error response body is never saved as a .jpg file.
res.raise_for_status()

# NOTE: the name has one-second resolution, so two images saved within the
# same second get the same name and the later one overwrites the earlier one.
image_name = time.strftime('%Y%m%d%H%M%S') + '.jpg'
path = os.path.join(settings.BASE_DIR, 'static/images/news/%s' % image_name)
# Binary mode: res.content is the raw response bytes.
with open(path, 'wb') as f:
    f.write(res.content)

三、xpath补充

# div的class属性包含多个值时使用(contains是子串匹配,注意字符串要加引号)
//div[contains(@class, "message")]

xpath常用语法参考:https://blog.csdn.net/qq_1290259791/article/details/85864041


版权声明:本文为weixin_43667990原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。