# -*- coding: utf-8 -*-
# github python下载_github 资源文件下载(python 爬虫)

import re

import os

import sys

import time

import threading

import socket

import urllib

import urllib2

# Address and port of a local HTTP proxy, kept as strings.
# NOTE(review): no code visible in this file reads these two values.
server = '127.0.0.1'
port = '8087'

# Default timeout, in seconds, for every socket created after this call.
timeout = 720
socket.setdefaulttimeout(timeout)

class timer(threading.Thread):
    """Worker thread that performs one DownloadFile() crawl.

    Args:
        num: Identifier stored as ``thread_num``; not used by the class itself.
        interval: Pause value handed to DownloadFile().
        dir: Output directory handed to DownloadFile().
        url: Start URL handed to DownloadFile().
    """

    def __init__(self, num, interval, dir, url):
        threading.Thread.__init__(self)
        self.thread_num = num
        self.interval = interval
        self.url = url
        self.dir = dir
        self.thread_stop = False

    def run(self):
        """Thread body: run a single crawl, then let the thread finish."""
        DownloadFile(self.interval, self.url, self.dir)

    def stop(self):
        """Set the ``thread_stop`` flag.

        NOTE: run() never reads this flag, so calling stop() does not
        interrupt a crawl that is already in progress.
        """
        self.thread_stop = True

def getContent(url, type):
    """Fetch ``url`` and return the response body.

    Args:
        url: Address to request.
        type: Content category. When it equals "img" and the reported
            Content-Length is below 15000, the body is not read.

    Returns:
        The body as returned by ``read()``; the string 'EOF' for a skipped
        small image; ``None`` when the request fails with an HTTPError or a
        URLError (a message is printed in that case).
    """
    from urllib2 import URLError, HTTPError

    print(">>start connecting:%s" % url)

    # An empty proxy mapping disables proxy auto-detection, so requests go
    # out directly.
    proxy = urllib2.ProxyHandler({})
    opener = urllib2.build_opener(proxy, urllib2.HTTPHandler)
    urllib2.install_opener(opener)

    try:
        urlHandler = urllib2.urlopen(url)
        try:
            # A missing or malformed Content-Length is treated as 0.
            try:
                length = int(urlHandler.info().get('Content-Length'))
            except (TypeError, ValueError):
                length = 0

            if type == "img" and length < 15000:
                print(" >>>>>>>>%d" % length)
                dataStr = 'EOF'
            else:
                print(" ++++++++%d" % length)
                dataStr = urlHandler.read()
        finally:
            # Release the connection even when read() raises.
            urlHandler.close()
    except HTTPError as e:
        # HTTPError is a subclass of URLError, so it must be handled first.
        print("The server couldn't fulfill the request.")
        # Two spaces after the colon keep the output of the original
        # comma-separated print statement.
        print('Error code:  %s' % (e.code,))
        return None
    except URLError as e:
        print('We failed to reach a server.')
        print('Reason:  %s' % (e.reason,))
        return None

    return dataStr

def _find_links(html, base):
    """Return the distinct suffixes of ``href="<base>..."`` links, in page order."""
    # re.escape keeps dots and other metacharacters in the path literal;
    # [^"\s]+ stops at the closing quote instead of running past it.
    pattern = re.compile(r'href="%s([^"\s]+)"' % re.escape(base))
    seen = set()
    links = []
    for suffix in pattern.findall(html):
        if suffix not in seen:
            seen.add(suffix)
            links.append(suffix)
    return links


def _save_file(path, data):
    """Write ``data`` to ``path``, creating missing parent directories."""
    parent = os.path.dirname(path)
    if parent and not os.path.isdir(parent):
        os.makedirs(parent)
    with open(path, 'wb') as out:
        out.write(data)


def DownloadFile(interval, url, dir):
    """Download the files linked from a GitHub tree page, then recurse.

    The page at ``url`` is fetched with getContent(). Every link under the
    matching ``/blob/master/`` path is downloaded from
    raw.githubusercontent.com and written below ``dir``. Every link under the
    page's own path is then crawled recursively.

    Args:
        interval: Seconds to sleep after each file request.
        url: Page to crawl, e.g. ``https://github.com/<user>/<repo>/tree/master/<path>``.
        dir: Prefix of the local output directory.

    Returns:
        None. Pages and files that getContent() fails to fetch are skipped.
    """
    page = getContent(url, "html")
    print("...:%s" % url)
    if page is None:
        # getContent() has already printed the failure; nothing to parse.
        return

    site_path = url.replace('https://github.com', '').strip()

    # download Files
    blob_base = site_path.replace('/tree/master/', '/blob/master/')
    raw_base = blob_base.replace('/blob/master/', '/master/')
    # Local directory: everything up to and including ".../blob/master/"
    # collapses to a single "/".
    local_dir = '%s%s' % (dir, re.sub(r'\S*/blob/master/', '/', blob_base))

    for file_name in _find_links(page, blob_base):
        # The name comes from remote HTML; refuse to climb out of local_dir.
        if '..' in file_name.split('/'):
            continue
        raw_url = '%s%s%s' % ('https://raw.githubusercontent.com', raw_base, file_name)
        data = getContent(raw_url, "html")
        if data is not None and data != 'EOF':
            _save_file('%s%s' % (local_dir, file_name), data)
        time.sleep(interval)

    # download recursive
    for sub_path in _find_links(page, site_path):
        DownloadFile(interval, '%s%s' % (url, sub_path), dir)

#https://raw.githubusercontent.com/vogella/vogella/master/de.vogella.rcp.editor.example/src/de/vogella/rcp/editor/example/Application.java

#https://raw.githubusercontent.com/clojure/clojure/master/src/jvm/clojure/lang/Util.java

#https://github.com/vogella/vogella/blob/master/de.vogella.rcp.editor.example/src/de/vogella/rcp/editor/example/Activator.java

# Start page of the crawl.
url_ = "https://github.com/vogella/vogella/tree/master/de.vogella.rcp.editor.example/src"

# Number of worker threads to launch; each one crawls the same start page.
n = 1

# Every worker is created with thread number 1, interval 1 and the output
# directory 'FILE'.
thread = [timer(1, 1, 'FILE', url_) for _ in range(n)]

for worker in thread:
    worker.start()

# 原文:http://blog.csdn.net/kevinkitty_love/article/details/23249603


# 版权声明:本文为weixin_32442555原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。