# !/usr/bin/python
# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import re
# Base address of the scraped site; filterMovie prepends it to the relative
# detail-page hrefs found on listing pages.
site = 'http://www.ygdy8.net'
# Running record number written in front of each saved movie; saveInfo
# increments it once per record.
lineNo = 1
class Movie:
    """One scraped film: title, detail-page URL, score and download link.

    ``str()`` and ``repr()`` both give ``name,<TAB>score分,<TAB>link``;
    the detail-page URL is stored but not part of the text form.
    """

    def __init__(self, name, url, score, link):
        self.name, self.url = name, url
        self.score, self.link = score, link

    def __str__(self):
        template = '%s,\t%s分,\t%s'
        fields = (self.name, self.score, self.link)
        return template % fields

    # repr() deliberately reuses the human-readable form.
    __repr__ = __str__
# Download and parse a web page
def getSoup(url, timeout=15):
    """Download ``url`` and return its HTML parsed into a BeautifulSoup tree.

    The request is sent with a desktop Firefox ``User-Agent`` header, and the
    response body is decoded as GB18030 before parsing with ``html.parser``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole crawl.

    Returns:
        A ``BeautifulSoup`` object for the downloaded page.

    Raises:
        requests.exceptions.RequestException: On connection failures or when
            the timeout expires.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Set the encoding before reading .text so the body is decoded as GB18030
    # rather than with whatever requests would pick on its own.
    response.encoding = 'gb18030'
    return BeautifulSoup(response.text, 'html.parser')
# Get the download link
def getDownloadLink(url):
    """Return the ``href`` of the first link in a detail page's download cell.

    The page at ``url`` is fetched with ``getSoup``; the download cell is the
    first ``<td>`` whose ``style`` attribute is ``WORD-WRAP: break-word``.
    """
    cell_style = {"style": "WORD-WRAP: break-word"}
    first_anchor = getSoup(url).find('td', attrs=cell_style).find('a')
    return first_anchor['href']
def filterMovie(url, min_score=7.8):
    """Collect the movies on one listing page whose score beats ``min_score``.

    Every ``<table class="tbspan">`` on the page is treated as one entry. An
    entry is kept when it has a ``<td>`` whose text matches ``IMDb``, that
    text contains a score of the form ``评分 <number>/10``, and the number is
    strictly greater than ``min_score``. For each kept entry the detail page
    is fetched through ``getDownloadLink`` to obtain the download link.

    Entries that cannot be processed (unparsable score, missing title link,
    failure while fetching the download link) are reported on stdout and
    skipped; they never abort the rest of the page.

    Args:
        url: Address of the listing page to scan.
        min_score: Exclusive lower bound for the score. Defaults to 7.8.

    Returns:
        A list of ``Movie`` objects, in page order; empty if nothing matched.
    """
    # Compile once per page instead of once per table.
    title_re = re.compile("《")
    imdb_re = re.compile("IMDb")
    score_re = re.compile(r"评分 (.+?)/10")

    resultList = []
    soup = getSoup(url)
    for table in soup.find_all('table', attrs={'class': 'tbspan'}):
        td = table.find('td', text=imdb_re)
        if td is None:
            continue
        scoreStr = score_re.findall(td.text)
        if not scoreStr:
            continue

        try:
            score = float(scoreStr[0])
        except ValueError:
            print('error!!', 'unparsable score %r on %s' % (scoreStr[0], url))
            continue
        # Written as "not >" so a NaN score is rejected, as a plain ">" test would.
        if not score > min_score:
            continue

        nameA = table.find('a', text=title_re)
        if nameA is None or nameA.get("href") is None:
            print('error!!', 'no title link for a qualifying entry on %s' % url)
            continue

        name = nameA.text
        # Separate name so the listing-page ``url`` argument is not clobbered.
        detail_url = site + nameA["href"]
        print('url:', detail_url)
        print('title:', name)
        print('score:', score)

        try:
            downloadLink = getDownloadLink(detail_url)
        except Exception as exc:
            # Deliberately broad: a network failure or an unexpected page
            # layout both just mean "skip this movie". Unlike a bare except,
            # KeyboardInterrupt and SystemExit still propagate.
            print('error!!', detail_url, repr(exc))
            continue

        resultList.append(Movie(name, detail_url, score, downloadLink))
    return resultList
# Save the movie information
def saveInfo(movieList):
    """Append every movie in ``movieList`` to ``data2018.txt``.

    Each record is written as ``(<n>)<str(movie)>`` on one line followed by a
    line of dashes, where ``<n>`` is the module-level counter ``lineNo``. The
    counter is incremented after each record, so numbering continues across
    calls. Each record's text is also echoed to stdout.

    The file is opened in append mode as UTF-8 so that non-ASCII titles are
    written the same way on every platform, and it is closed even if writing
    a record raises.

    Args:
        movieList: Iterable of objects whose ``str()`` is the record text.
    """
    global lineNo
    separator = '-----------------------------------------------------'
    with open('data2018.txt', 'a', encoding='utf-8') as fileObj:
        for movie in movieList:
            movie_str = str(movie)
            print("movie info:", movie_str)
            fileObj.write('(' + str(lineNo) + ')' + movie_str + '\n')
            fileObj.write(separator + '\n')
            lineNo += 1
def getPageResource(url):
    """Run ``filterMovie`` on ``url`` and pass a non-empty result to ``saveInfo``."""
    movies = filterMovie(url)
    if not movies:
        # Nothing matched on this page, so the output file is left untouched.
        return
    saveInfo(movies)
#url = 'http://www.ygdy8.net/html/gndy/oumei/list_7_1.html'
if __name__ == '__main__':
    # Visit list_23_1.html through list_23_176.html in ascending order.
    first_page, last_page = 1, 176
    for page in range(first_page, last_page + 1):
        getPageResource(
            'http://www.ygdy8.net/html/gndy/dyzz/list_23_' + str(page) + '.html'
        )