Learning Python crawlers -- Pixiv crawler (4) -- code optimization

I wrote this crawler a while back mostly to grow my own wallpaper collection -- if you want something done, do it yourself...

Then I got busy with one thing after another and set the crawler aside. Picking it up again a few days ago, I found my old code pretty rough: lots of unnecessary variables, duplicated functionality... a headache to read.

So I carved out some time to tidy the little script up: stripped the unnecessary bits, unified the repeated parts, and got it down to roughly 200 lines...

#coding:UTF-8
__author__ = 'monburan'
__version__ = '0.3 optimization'

import os
import urllib
import urllib2
import cookielib
import re
from bs4 import BeautifulSoup

class Tools:                                                                             #helper utilities shared by the spider

    remove = re.compile('amp;')
    rmbig = re.compile('_big')
    make_m = re.compile('mode=medium')
    
    def removebig(self,x):
        x = re.sub(self.rmbig,"",x)
        return x.strip()

    def removesomething(self,x):
        x = re.sub(self.remove,"",x)
        return x.strip()

    def make_big_url(self,x):
        x = re.sub(self.make_m,"mode=manga_big",x)
        return x.strip()

    def Pic_Type(self,real_url):                                                   #determine the image format from its URL
        p_type = re.search(re.compile('png',re.S),real_url)
        if p_type == None:
            self.pic_type = 'jpg'
            return self.pic_type
        else:
            self.pic_type = 'png'
            return self.pic_type
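
    # A quick illustration of Pic_Type's behavior (the URL is a made-up example
    # of Pixiv's original-image URL shape, not a real address):
    #   Pic_Type('http://i.pixiv.net/img-original/img/2016/xx/123_p0.png') -> 'png'
    #   any URL without the substring 'png' anywhere in it                 -> 'jpg'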

    def Pic_Style_M(self,soupfile):                                             #pull every work URL out of a listing page with regexes, and hand them to the download code as a tuple of four lists
        single = re.findall(re.compile('<.*?work\s_work\s".*?href="(.*?)">',re.S),soupfile)
        multiple = re.findall(re.compile('<a.*?work\s_work\smultiple\s.*?href="(.*?)">',re.S),soupfile)
        video = re.findall(re.compile('<a.*?work\s_work\sugoku-illust\s.*?href="(.*?)">',re.S),soupfile)
        manga = re.findall(re.compile('<a.*?work\s_work\smanga\smultiple\s.*?href="(.*?)">',re.S),soupfile)
        return single,multiple,manga,video        
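
    # Sketch of how the tuple from Pic_Style_M is consumed (the variable names
    # here are illustrative, not part of the script):
    #   single, multiple, manga, video = Tools().Pic_Style_M(page_html)
    # An anchor whose class is "work _work multiple " lands in `multiple`, a
    # bare "work _work " anchor lands in `single`, and the callers below
    # dispatch each list to Single() or Multiple() accordingly.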

class Pixiv_Spider:

    def __init__(self):
        self.tool = Tools()
        self.dl_dir = ''
        self.pic_type = 'jpg'
        self.p_your_follow_url = 'http://www.pixiv.net/bookmark.php?type=user'      
        self.p_international_url = 'http://www.pixiv.net/ranking_area.php?type=detail&no=6'    

    def Login(self):                        	
        p_login_url = 'https://www.pixiv.net/login.php'        
        data = {                                    
                'mode':'login',
                'skip':1
                }
        data['pixiv_id'] = raw_input('pixiv id:')                                                #prompt for the login credentials and put them into data
        data['pass'] = raw_input('pixiv password:')
        p_login_data = urllib.urlencode(data)
        p_login_header = {                                                                       #request headers for the login POST
                'accept-language':'zh-cn,zh;q=0.8',
                'referer':'https://www.pixiv.net/login.php?return_to=0',
                'user-agent':'mozilla/5.0 (windows nt 10.0; win64; x64; rv:45.0) gecko/20100101 firefox/45.0'
                }
        request = urllib2.Request(                                                               #assemble the login request
                url = p_login_url,
                data = p_login_data,
                headers = p_login_header
                )
        try:
            cookie_file = 'cookie.txt'                                                           #log in, capture the session cookie, and save it to a local file
            cookie = cookielib.MozillaCookieJar(cookie_file)
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))         
            response = opener.open(request)             
            cookie.save(ignore_discard = True,ignore_expires = True)
        except urllib2.URLError,e:
            if hasattr(e,"reason"):
                print "ERROR!!!reason:",e.reason

    def Cookie_Login(self):                                                                      #reload the cookie saved by Login so the downloads can reuse the session
        cookie_login = cookielib.MozillaCookieJar()
        cookie_login.load('cookie.txt',ignore_discard = True,ignore_expires = True)
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_login))
        return opener
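
    # Cookie_Login works because MozillaCookieJar saves cookie.txt in the
    # Netscape/Mozilla text format that load() reads back. Typical reuse
    # (mirroring what Program_Start does at the bottom):
    #   opener = ps.Cookie_Login()
    #   html = opener.open('http://www.pixiv.net/bookmark.php?type=user').read()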
    
    def Download_Request(self,opener,make_url,real_url):                            #build the headers needed to fetch the raw image
        p_download_header = {
            'Accept-Language':'zh-CN,zh;q=0.8',
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:45.0) Gecko/20100101 Firefox/45.0'
            }
        p_download_header['Referer'] = self.tool.removebig(make_url)
        download_request = urllib2.Request(
            url = real_url.group(1),
            headers = p_download_header
            )
        decode_url = opener.open(download_request)
        return decode_url.read()
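
    # Note on the Referer above: Pixiv's image host rejects direct image
    # requests that lack a matching Referer header (it answers 403), so the
    # work's own page URL is set as Referer before fetching the raw file.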

    def Single(self,opener,pic,dl_dir):                                             #download a single-image work (dl_dir is accepted but not used yet; files land in the working directory)
        p_url = self.tool.removesomething('http://www.pixiv.net/' + pic)
        soup = BeautifulSoup(opener.open(p_url),'lxml')                              #lxml instead of the default html parser, for speed and fault tolerance
        p_id = (re.search(re.compile('(\d+)',re.S),p_url)).group(1)
        real_url = re.search(re.compile('.*?data-src="(.*?)"',re.S),str(soup.find_all("img",class_="original-image")))
        print 'find real url...\n' + real_url.group(1)
        p_type = self.tool.Pic_Type(real_url.group(1))
        file_pic = open('pixiv_' + p_id + '.' + p_type,'wb')                         #binary mode, otherwise the image bytes get mangled on Windows
        file_pic.write(self.Download_Request(opener,p_url,real_url))
        file_pic.close()
        print 'download ok...'

    def Multiple(self,opener,pic,dl_dir):                                            #download a multi-page work
        p_url = self.tool.removesomething('http://www.pixiv.net/' + pic)
        p_id = (re.search(re.compile('(\d+)',re.S),p_url)).group(1)
        soup = BeautifulSoup(opener.open(p_url),'lxml')
        result_pic_more = re.search(re.compile('</li><li>.*?\s(.*?)P</li>',re.S),str(soup.find_all("ul",class_="meta")))
        print 'found multiple: ' + result_pic_more.group(1) + 'P'
        for j in range(0,int(result_pic_more.group(1))):
            m_soup = BeautifulSoup(opener.open(self.tool.make_big_url(p_url)+'&page='+str(j)),'lxml')
            real_url = re.search(re.compile('<img.*?src="(.*?)"/>',re.S),str(m_soup.find_all("img")))
            p_type = self.tool.Pic_Type(real_url.group(1))
            print 'find real url...\n' + real_url.group(1)
            file_pic = open('pixiv_' + p_id + '_' + str(j) + '.' + p_type,'wb')      #binary mode here as well
            file_pic.write(self.Download_Request(opener,(self.tool.make_big_url(p_url)+'&page='+str(j)),real_url))
            file_pic.close()
            print 'download ok...'
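
    # The paging trick used above, shown on a hypothetical illust id:
    #   medium page:  http://www.pixiv.net/member_illust.php?mode=medium&illust_id=123
    #   page 0 view:  http://www.pixiv.net/member_illust.php?mode=manga_big&illust_id=123&page=0
    # make_big_url swaps mode=medium for mode=manga_big, and '&page=' + str(j)
    # then walks through every page of the work in turn.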

    def Choice_Pixiv(self,opener):
        print ('1.international 2.your follows')
        p_choice = raw_input()
        if (p_choice == '1'):
            try:
                p_page = opener.open(self.p_international_url)
                p_international = p_page.read()
                dl_dir = 'international'
                self.Pixiv_International(opener,p_international,dl_dir)
            except urllib2.URLError,e:
                if hasattr(e,"reason"):
                    print "ERROR!!!reason:",e.reason
        elif (p_choice == '2'):
            try:
                p_page = opener.open(self.p_your_follow_url)
                p_your_follow = p_page.read()
                self.Pixiv_Your_Follow(opener,p_your_follow)
            except urllib2.URLError,e:
                if hasattr(e,"reason"):
                    print "ERROR!!!reason:",e.reason

    def Pixiv_International(self,opener,p_international,dl_dir):                    #crawl the international ranking page
        pic = self.tool.Pic_Style_M(str(BeautifulSoup(p_international,'lxml')))
        print "This page have Single:"+str(len(pic[0]))+"Multiple:"+str(len(pic[1]))+"Manga:"+str(len(pic[2]))+"Video:"+str(len(pic[3]))
        for i in pic[0]:                                                 #使用列表节省了很多代码
            self.Single(opener,i,dl_dir)
        for j in pic[1]:
            self.Multiple(opener,j,dl_dir)
        for k in pic[2]:
            self.Multiple(opener,k,dl_dir)

    def Pixiv_Your_Follow(self,opener,p_your_follow):                                #walk every page of the users you follow
        soup = BeautifulSoup(p_your_follow,'lxml')
        user_num = re.search(re.compile('(\d+)',re.S),str(soup.find_all(class_="unit-count")))
        print 'you have ' + str(user_num.group(1)) + ' followed authors...'
        if int(user_num.group(1))%48 != 0:                                            #a follow page lists at most 48 users, so compute how many pages of follows there are
            u_p = int(user_num.group(1))/48 + 1
        else :
            u_p = int(user_num.group(1))/48
        for i in range(0,u_p):
            f_url = 'http://www.pixiv.net/bookmark.php?type=user&rest=show&p=' + str(i+1)
            self.User_Page(opener,f_url)
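
    # Worked example for the page math above (Python 2 integer division):
    #   50 follows -> 50 % 48 != 0 -> 50/48 + 1 = 2 pages (p=1, p=2)
    #   48 follows -> 48 % 48 == 0 -> 48/48     = 1 page
    # User_Page below applies the same formula with 20 works per listing page.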

    def User_Page(self,opener,f_url):                                                #crawl every work of every user on the current follow page
        soup = BeautifulSoup(opener.open(f_url),'lxml')
        uname_list = re.findall(re.compile('data-user_name="(.*?)"',re.S),str(soup.find_all(class_="userdata")))
        uid_list = re.findall(re.compile('data-user_id="(.*?)"',re.S),str(soup.find_all(class_="userdata")))
        for h in range(0,len(uid_list)):
            user_info = BeautifulSoup(opener.open('http://www.pixiv.net/member_illust.php?id=' + uid_list[h]),'lxml')
            pic_num = re.search(re.compile('(\d+)',re.S),str(user_info.find(class_="count-badge")))
            print 'author: ' + uname_list[h] + ' has ' + pic_num.group(1) + ' works'
            if (int(pic_num.group(1))%20)!=0:                                             #a works listing shows at most 20 works per page, so compute the page count
                p = (int(pic_num.group(1))/20) + 1
            else :
                p = int(pic_num.group(1))/20
            for i in range(1,p+1):
                pic = self.tool.Pic_Style_M(str(BeautifulSoup(opener.open(('http://www.pixiv.net/member_illust.php?id=' + uid_list[h]) + '&type=all&p=' + str(i)),'lxml')))
                print 'Page ' + str(i) + ' has Single: ' + str(len(pic[0])) + ' Multiple: ' + str(len(pic[1])) + ' Manga: ' + str(len(pic[2])) + ' Video: ' + str(len(pic[3]))
                for j in pic[0]:
                    self.Single(opener,j,uid_list[h])
                for k in pic[1]:
                    self.Multiple(opener,k,uid_list[h])
                for l in pic[2]:
                    self.Multiple(opener,l,uid_list[h])

    def Program_Start(self):                                                                     #kick the crawler off: log in, rebuild the opener from the cookie, then choose a mode
        self.Login()
        opener = self.Cookie_Login()
        self.Choice_Pixiv(opener)

if __name__ == '__main__':                                                                      #standard entry-point guard
    ps = Pixiv_Spider()
    ps.Program_Start()
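
For reference, a run looks roughly like this (the prompts come straight from Login and Choice_Pixiv; the id, the counts, and the script filename are placeholders):

$ python pixiv_spider.py
pixiv id:your_id
pixiv password:your_password
1.international 2.your follows
2
you have 96 followed authors...
author: somebody has 35 works
Page 1 has Single: 14 Multiple: 5 Manga: 1 Video: 0
find real url...
http://i.pixiv.net/img-original/img/.../12345678_p0.jpg
download ok...

Keep in mind the script is Python 2 (urllib2/cookielib/raw_input) and matches Pixiv's page markup as of the time of writing, so the regexes may need updating if the site changes.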

