from lxml import etree
import requests
# Scheme and host that the site's relative links are resolved against.
BASE_DOMIN = "http://dytt8.net"

# First listing page of the movie category crawled by this script.
url = BASE_DOMIN + "/html/gndy/dyzz/list_23_1.html"

# Request headers sent with every fetch; the User-Agent is a desktop Chrome
# string.
headers = dict([
    (
        'User-Agent',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/72.0.3626.96 Safari/537.36',
    ),
])
def get_detail_urls(url):
    """Collect the detail-page URLs linked from one listing page.

    Args:
        url: Address of a listing page.

    Returns:
        A list of absolute URLs, one per ``href`` found on an ``<a>`` inside
        a ``<table class="tbspan">``, each prefixed with ``BASE_DOMIN``.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=headers, timeout=10)
    # requests stores the page in ``response.text`` using an encoding it
    # guesses, and for this site the guess is wrong (garbled text). Decode
    # the raw bytes as GBK instead, as parse_detail_page does.
    # errors="replace" keeps the "never raises" behaviour that
    # ``response.text`` had.
    text = response.content.decode("gbk", errors="replace")
    html = etree.HTML(text)
    hrefs = html.xpath(".//table[@class='tbspan']//a/@href")
    # Return a real list (not a one-shot ``map`` iterator) so callers can
    # print it, measure it, or iterate it more than once.
    return [BASE_DOMIN + href for href in hrefs]
def parse_detail_page(url):
    """Fetch one movie detail page and extract its fields into a dict.

    Args:
        url: Absolute address of a detail page.

    Returns:
        A dict that always contains ``title``, ``cover``, ``screenshot`` and
        ``download_url`` (``None`` when the page lacks the element). For each
        "◎" heading found in the ``<div id="Zoom">`` text it also contains
        the matching key: ``year``, ``country``, ``category``,
        ``douban_rating``, ``duration``, ``director``, ``actors`` (list of
        str) or ``profile`` (str, paragraphs joined with newlines).
    """
    # Single-line fields: heading text -> key in the result dict.
    simple_fields = (
        ("◎年 代", "year"),
        ("◎产 地", "country"),
        ("◎类 别", "category"),
        ("◎豆瓣评分", "douban_rating"),
        ("◎片 长", "duration"),
        ("◎导 演", "director"),
    )

    def first(matches):
        """Return the first XPath match, or None when nothing matched."""
        return matches[0] if matches else None

    def strip_heading(line, heading):
        """Remove the heading text from a line and trim the remainder."""
        return line.replace(heading, "").strip()

    movie = {}
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=headers, timeout=10)
    # errors="replace" so a single undecodable byte does not abort the page.
    text = response.content.decode("gbk", errors="replace")
    html = etree.HTML(text)

    movie['title'] = first(
        html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")
    )

    # Compare against None explicitly: an lxml element with no children is
    # falsy, so a plain truth test would be wrong here.
    zoom = first(html.xpath("//div[@id='Zoom']"))
    imgs = zoom.xpath(".//img/@src") if zoom is not None else []
    infos = zoom.xpath(".//text()") if zoom is not None else []

    # The first image is taken as the cover and the second as the screenshot;
    # pages with fewer images get None instead of raising IndexError.
    movie['cover'] = imgs[0] if imgs else None
    movie['screenshot'] = imgs[1] if len(imgs) > 1 else None

    def following_lines(start):
        """Collect non-empty stripped text nodes from ``start`` up to the next "◎" heading."""
        lines = []
        for raw in infos[start:]:
            line = raw.strip()
            if line.startswith("◎"):
                break
            if line:
                lines.append(line)
        return lines

    for index, info in enumerate(infos):
        if info.startswith("◎主 演"):
            # The first actor shares the heading line; the rest follow as
            # separate text nodes until the next heading.
            lead = strip_heading(info, "◎主 演")
            movie["actors"] = [lead] + following_lines(index + 1)
        elif info.startswith("◎简 介"):
            # Accumulate every paragraph rather than overwriting the value
            # on each iteration, which kept only the last text node.
            # NOTE(review): when no further "◎" heading follows, this runs
            # to the end of the Zoom text nodes - confirm whether another
            # stop marker is needed for such pages.
            same_line = strip_heading(info, "◎简 介")
            paragraphs = [same_line] if same_line else []
            paragraphs.extend(following_lines(index + 1))
            movie["profile"] = "\n".join(paragraphs)
        else:
            for heading, key in simple_fields:
                if info.startswith(heading):
                    movie[key] = strip_heading(info, heading)
                    break

    movie["download_url"] = first(
        html.xpath("//td[@bgcolor='#fdfddf']/a/text()")
    )
    return movie
def spider(first_page=1, last_page=7):
    """Crawl a range of listing pages and parse every movie linked from them.

    Args:
        first_page: Number of the first listing page to fetch (default 1).
        last_page: Number of the last listing page to fetch, inclusive
            (default 7).

    Returns:
        A list with one dict per detail page, as produced by
        ``parse_detail_page``. The list is also printed once the crawl ends.
    """
    base_url = "http://dytt8.net/html/gndy/dyzz/list_23_{}.html"
    movies = []
    for page in range(first_page, last_page + 1):
        # Progress banner showing which listing page is being processed.
        print("*" * 20)
        print(page)
        print("*" * 20)
        for detail_url in get_detail_urls(base_url.format(page)):
            movies.append(parse_detail_page(detail_url))
    print(movies)
    # Hand the results back so callers can use them, not only read stdout.
    return movies
# Start the crawl only when this file is executed directly as a script;
# importing it as a module defines the functions without fetching anything.
if __name__ == "__main__":
    spider()
# 结果:
[{'title': '2018年冒险动作《罗宾汉》BD中英双字幕', 'cover': 'https://extraimage.net/images/2019/01/25/73af46b85071e3b8807e4ee6d822a659.jpg', 'screenshot': 'https://lookimg.com/images/2019/02/07/wfxR9.jpg', 'year': '2018', 'country': '美国', 'category': '动作/冒险', 'douban_rating': '5.4/10 from 729 users', 'duration': '116分钟', 'director': '奥图·巴瑟赫斯特 Otto Bathurst', 'actors': ['塔伦·埃格顿 Taron Egerton', '詹米·多南 Jamie Dornan', '本·门德尔森 Ben Mendelsohn', '保罗·安德森 Paul Anderson', '伊芙·休森 Eve Hewson', '杰米·福克斯 Jamie Foxx', '蒂姆·明钦 Tim Minchin', '乔什·赫德曼 Josh Herdman', '布乔恩·本辛森 Björn Bengtsson', '斯考特·格瑞南 Scot Greenan', '亚森·阿图 Yasen Atour', '罗德里克·希尔 Roderick Hill', '埃文特·斯特朗 Avant Strangel', '赖云 Yun Lai', '查理·文森特 Charlie Vincent'], 'profile': '由奥图·巴瑟赫斯特(《浴血黑帮》《黑镜》)执导,艾格顿饰演罗宾汉,剧情将发生在他成为传奇侠盗英雄之前,聚焦他作为十字军战士时期,加盟荒野沼地的一支反抗军大胆起义,扳倒腐败堕落的英国统治政权。杰米·福克斯将饰演罗宾汉的战友小约翰,伊芙·休森饰罗宾汉的情人梅德·玛丽安,杰米·多南饰威尔·斯加雷特,是罗宾汉同父异母/同母异父的兄弟。Joby Harold(《亚瑟王:斗兽争霸》)操刀剧本,将于莱昂纳多·迪卡普里奥、珍妮佛·戴维森等联合制片。', 'download_url': 'ftp://ygdy8:ygdy8@yg45.dydytt.net:3186/阳光电影www.ygdy8.com.罗宾汉.BD.720p.中英双字幕.mkv'}]
# 版权声明:本文为winnertakeall原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。