# Scrape movie download links from the Dytt site (电影天堂, www.ygdy8.net)

# !/usr/bin/python
# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import re

# Base address of the site; relative detail-page links are appended to it.
site = "http://www.ygdy8.net"
# Running record number written in front of each saved entry (see saveInfo).
lineNo = 1


class Movie:
    """One scraped movie: title, detail-page URL, score and download link."""

    def __init__(self, name, url, score, link):
        self.name, self.url = name, url
        self.score, self.link = score, link

    def __str__(self):
        # Only name, score and link are rendered; the detail-page URL is
        # stored on the instance but is not part of the text form.
        shown = (self.name, self.score, self.link)
        return '%s,\t%s分,\t%s' % shown

    # repr() produces exactly the same text as str().
    __repr__ = __str__

# Download a page and parse it
def getSoup(url, timeout=10):
    """Fetch ``url`` and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait on the server before giving up; handed
            straight to ``requests.get``. Without a timeout a stalled
            connection would block the whole crawl indefinitely.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        Whatever ``requests.get`` raises (for example on a connection
        failure or when ``timeout`` expires) is propagated to the caller.
    """
    # Browser-like User-Agent sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
    }
    r = requests.get(url, headers=headers, timeout=timeout)
    # Decode the body as gb18030 instead of the encoding requests would infer.
    r.encoding = 'gb18030'
    return BeautifulSoup(r.text, 'html.parser')

# Extract the download link from a movie detail page
def getDownloadLink(url):
    """Return the ``href`` of the first link inside the page's download cell.

    The download cell is the first ``<td>`` whose ``style`` attribute is
    exactly ``WORD-WRAP: break-word``.

    Args:
        url: Address of the movie detail page.

    Returns:
        The value of the ``href`` attribute of the first ``<a>`` in that cell.

    Raises:
        ValueError: if the page has no such cell, or the cell holds no
            ``<a>`` element carrying an ``href``.
    """
    soup = getSoup(url)
    downloadTd = soup.find('td', attrs={"style": "WORD-WRAP: break-word"})
    if downloadTd is None:
        # Fail with a message that names the page instead of an opaque
        # AttributeError on None.
        raise ValueError('no download cell found on %s' % url)
    downloadA = downloadTd.find('a')
    href = None if downloadA is None else downloadA.get('href')
    if href is None:
        raise ValueError('no download link found on %s' % url)
    return href

def filterMovie(url, minScore=7.8):
    """Collect the movies on one listing page whose score exceeds a threshold.

    Every ``<table class="tbspan">`` on the page is treated as one entry.
    An entry is kept when a ``<td>`` mentioning ``IMDb`` contains a score of
    the form ``评分 X/10`` with ``X`` strictly greater than ``minScore``.
    For each kept entry the detail page is fetched to obtain its download
    link. Entries that cannot be processed are reported on stdout and
    skipped, so one bad entry does not abort the page.

    Args:
        url: Address of the listing page.
        minScore: Exclusive lower bound for the score (defaults to 7.8).

    Returns:
        A list of ``Movie`` objects, in page order; empty if none qualify.

    Raises:
        Errors raised while downloading the listing page itself are
        propagated; only per-entry failures are caught.
    """
    resultList = []
    soup = getSoup(url)
    # Compile the patterns once instead of once per table.
    titlePattern = re.compile("《")
    imdbPattern = re.compile("IMDb")
    scorePattern = re.compile(r"评分 (.+?)/10")

    for table in soup.find_all('table', attrs={'class': 'tbspan'}):
        td = table.find('td', text=imdbPattern)
        if td is None:
            continue
        scoreStr = scorePattern.findall(td.text)
        if not scoreStr:
            continue
        try:
            score = float(scoreStr[0])
        except ValueError:
            print('error!!', 'unparsable score:', scoreStr[0])
            continue
        # Written as "not >" so that a NaN score is rejected as well.
        if not score > minScore:
            continue

        nameA = table.find('a', text=titlePattern)
        if nameA is None or nameA.get('href') is None:
            print('error!!', 'no title link for an entry on', url)
            continue
        name = nameA.text
        # Use a separate name so the listing-page ``url`` argument is not
        # overwritten while iterating.
        movieUrl = site + nameA['href']
        print('url:', movieUrl)
        print('title:', name)
        print('score:', score)
        try:
            downloadLink = getDownloadLink(movieUrl)
        except Exception as exc:
            # Best effort per entry: report why it failed and move on.
            # Unlike a bare ``except``, this lets KeyboardInterrupt and
            # SystemExit stop the crawl.
            print('error!!', movieUrl, exc)
            continue
        resultList.append(Movie(name, movieUrl, score, downloadLink))
    return resultList

# Save movie records
def saveInfo(movieList, path='data2018.txt'):
    """Append one numbered record per movie to a text file.

    Each record is the line ``(<lineNo>)<str(movie)>`` followed by a line of
    dashes. The module-level counter ``lineNo`` supplies the number and is
    incremented after every record, so numbering continues across calls.

    Args:
        movieList: Iterable of objects; each is rendered with ``str()``.
        path: File to append to (defaults to ``data2018.txt``).

    Returns:
        None.
    """
    global lineNo
    separator = '-----------------------------------------------------'
    # ``with`` closes the file even if a write fails; the explicit UTF-8
    # encoding keeps Chinese titles writable regardless of the OS locale.
    with open(path, 'a', encoding='utf-8') as fileObj:
        for movie in movieList:
            movie_str = str(movie)
            print("movie info:", movie_str)
            fileObj.write('(' + str(lineNo) + ')' + movie_str)
            fileObj.write('\n')
            fileObj.write(separator)
            fileObj.write('\n')
            lineNo += 1

def getPageResource(url):
    """Scrape one listing page and save the movies that pass the filter.

    Args:
        url: Address of the listing page to process.
    """
    movies = filterMovie(url)
    if not movies:
        # Nothing qualified on this page, so saveInfo is not called at all.
        return
    saveInfo(movies)

#url = 'http://www.ygdy8.net/html/gndy/oumei/list_7_1.html'

if __name__ == '__main__':
    # Walk listing pages 1 through 176 in order, one request chain per page.
    for index in range(1, 177):
        url = 'http://www.ygdy8.net/html/gndy/dyzz/list_23_%d.html' % index
        getPageResource(url)