import os
import threading
import urllib.request

from pyquery import PyQuery as pq
headers = {
    'Referer': 'http://www.mm131.com/1/1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
def getSiteSource(url):
    # Fetch the raw page source; return '' on any request failure
    try:
        req = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(req)
        return response.read()
    except Exception:
        return ''
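# A minimal usage sketch of getSiteSource (illustrative; any reachable URL
# behaves the same way -- callers below check for the empty-string failure case):
#   html = getSiteSource('http://www.mm131.com/')
#   if len(html) == 0:
#       pass  # request failed, skip this page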
class downLoadImg(threading.Thread):
    # One thread per list page: downloads every image set linked from that page
    dir = 'c:/mm131'  # root folder for downloaded images

    def __init__(self, catSiteUrl):
        threading.Thread.__init__(self)
        self.catSiteUrl = catSiteUrl
    def run(self):
        # Fetch the list page source and collect the detail-page links
        listPageCode = getSiteSource(self.catSiteUrl)
        doc = pq(listPageCode)
        dls = pq(doc('dl').filter('.public-box'))
        hrefs = dls('dl a')
        goodUrls = []
        for i in range(hrefs.length):
            if hrefs.eq(i).attr('target') == '_blank':
                goodUrls.append(hrefs.eq(i).attr('href'))
        for i in range(len(goodUrls)):
            baseUrl = goodUrls[i].replace('.html', '')
            for k in range(1, 70):  # assume at most 69 pages per image set
                if k == 1:
                    picInurl = goodUrls[i]
                else:
                    picInurl = baseUrl + '_' + str(k) + '.html'  # URL of page k of the set
                pcode = getSiteSource(picInurl)
                if len(pcode) == 0:
                    continue
                pcodepq = pq(pcode)
                img = pcodepq('div .content-pic a img').eq(0).attr('src')
                if img:  # attr() returns None when no image is found
                    imgInfo = img.split('/')
                    parent = imgInfo[4]
                    # Write the image into a folder named after its parent path segment
                    imgFold = self.dir + '/' + str(parent)
                    if not os.path.exists(imgFold):
                        os.makedirs(imgFold)
                    req = urllib.request.Request(img, headers=headers)
                    response = urllib.request.urlopen(req)
                    with open(imgFold + '/' + str(imgInfo[5]), 'wb') as f:
                        f.write(response.read())
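# A quick single-thread check of downLoadImg (listPageUrl is a hypothetical
# placeholder for any list-page URL collected by getSiteCat below):
#   t = downLoadImg(listPageUrl)
#   t.start()
#   t.join()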
# downLoadImg above fetches every image in one list page.
# getSiteCat below collects the URLs of the site's top-level categories.
class getSiteCat:
    siteUrl = 'http://www.mm131.com/'

    def getCat(self):
        urlList = []
        result = []
        html = getSiteSource(self.siteUrl)
        doc = pq(html)
        hrefs = doc('div').filter('.nav ul li a')
        for i in range(0, hrefs.length):
            href = hrefs.eq(i).attr('href')
            if self.siteUrl != href:
                urlList.append(href)
        for i in range(0, len(urlList)):
            sourceCode = getSiteSource(urlList[i])
            doc = pq(sourceCode)
            allHref = doc('a')
            catMaxPageNum = 0
            for k in range(allHref.length):
                # '末页' is the site's "last page" link; its href encodes the
                # category id and the total page count as list_<catId>_<pages>.html
                if allHref.eq(k).text() == '末页':
                    catMaxPageNumInfo = str(allHref.eq(k).attr('href').replace('list_', '').replace('.html', ''))
                    catMaxPageNumArr = catMaxPageNumInfo.split('_')
                    catMaxPageNum = catMaxPageNumArr[1]
                    temp = {'maxPageNum': catMaxPageNum, 'url': urlList[i], 'catId': catMaxPageNumArr[0]}
                    result.append(temp)
                    break
        return result
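    # getCat() returns one dict per category, shaped like
    # {'maxPageNum': '<pages>', 'url': '<category url>', 'catId': '<id>'}
    # (placeholders, derived from the '末页' href parsed above).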
    def buildAllUrl(self, allSiteCat: list):
        allUrlList = []
        for i in range(0, len(allSiteCat)):
            allUrlList.append(allSiteCat[i]['url'])
            # Pages 2..maxPageNum follow the list_<catId>_<page>.html pattern;
            # the +1 keeps the last page, which range() would otherwise drop
            for j in range(2, int(allSiteCat[i]['maxPageNum']) + 1):
                allUrlList.append(allSiteCat[i]['url'] + 'list_' + str(allSiteCat[i]['catId']) + '_' + str(j) + '.html')
        return allUrlList
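# buildAllUrl() expands each category into its full list of page URLs, i.e.
# (pattern only, ids are placeholders):
#   <category url>, <category url>list_<catId>_2.html, ..., <category url>list_<catId>_<maxPageNum>.html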
site = getSiteCat()
catList = site.getCat()
urlList = site.buildAllUrl(catList)
thList = []
# Tentatively 50 threads here (the original comment says "processes", but these
# are threads). Starting all 203 at once would not launch -- worth investigating
# why. The site currently has 203 list pages, so running 50 at a time over
# several passes also works.
for i in range(0, 50):
    threadtemp = downLoadImg(urlList[i])
    threadtemp.start()
    thList.append(threadtemp)
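# The note above suggests splitting the 203 list pages into batches instead of
# one thread per page. A minimal batching sketch (the batch size of 50 is an
# assumption matching the loop above):
#   for start in range(0, len(urlList), 50):
#       batch = [downLoadImg(u) for u in urlList[start:start + 50]]
#       for t in batch:
#           t.start()
#       for t in batch:
#           t.join()  # wait for this batch to finish before launching the next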