"""
服务端通过socket监听客户端发来的网址信息,然后运行selenium爬取对应网址网页,之后再通过socket将网页信息以.mht格式发送至客户端,客户端通过socket接收到网页信息后,以.mht格式文件保存到本地,再以浏览器打开该文件。这样,客户端即使无法访问外网,只需要能够和服务端通过socket连接,服务端能够访问外网,客户端就也能浏览外网网页。
"""
import socket
import webbrowser
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import re
# JavaScript source injected into the crawled page (HTML mode only).
# It walks every element with a src/href attribute, rewrites relative URLs to
# absolute ones (the self-assignment `e.src = e.src` forces the browser to
# resolve them), and for SCRIPT/IMG/LINK elements fetches the resource via
# XHR and replaces the URL with a base64 data URI so the saved HTML is
# self-contained. The string content is executed verbatim by the browser —
# do not reformat it.
js='''
///
//通过ajax获取script或img后生成Data URI
function loadDoc(uri,element) {
const xhttp = new XMLHttpRequest();
xhttp.onload = function() {
//console.log(this.response);
var reader = new FileReader();
reader.onloadend = function() {
var base64data = reader.result;
if(base64data){
if(element.hasAttribute("src")){
console.log(element);
element.src=base64data;
}
if(element.hasAttribute("href")){
console.log(element);
element.href=base64data;
}
console.log("base64 获取成功");
console.log(base64data);
}
}
reader.readAsDataURL(this.response);
}
xhttp.responseType = "blob";
xhttp.open("GET", uri, true);
xhttp.withCredentials = true;
xhttp.send();
}
//获取所有元素 并筛选出有src属性的
var es=document.getElementsByTagName('*');
for(var i=0;i<es.length;i++){
if(es[i].hasAttribute("src")||es[i].hasAttribute("href")){
//console.log(es[i].tagName);
//console.log(es[i].src);
if(es[i].hasAttribute("src")){
es[i].src=es[i].src;
es[i].src=es[i].src;
}
if(es[i].hasAttribute("href")){
es[i].href=es[i].href;
es[i].href=es[i].href;
}
if(["SCRIPT","IMG","LINK"].includes(es[i].tagName.toUpperCase())){
if(es[i].hasAttribute("src")){
console.log("src: "+es[i].src);
loadDoc(es[i].src,es[i]);
}
if(es[i].hasAttribute("href")){
console.log("href: "+es[i].href);
loadDoc(es[i].href,es[i]);
}
console.log(es[i].tagName);
}
}
}
///
'''
#-----------------------------------------------------------
#服务端
class socketCrawlerServer:
    '''
    Socket server side of the relay.

    Receives "<website>#####<HTML|MHT>" from a client, crawls the page with
    Selenium/Chrome, and sends the result back over the same connection:
    the page source (HTML mode, after inlining resources via the `js` script)
    or an MHTML snapshot captured through the Chrome DevTools Protocol.
    '''
    def __init__(self, host, port):
        # Address to bind the listening socket to.
        self.HOST = host
        self.PORT = port

    def start_server(self):
        '''Listen forever and serve one client connection at a time.

        Never returns; on a crawl failure the error string "后端故障" is sent
        to the client and the server keeps accepting new connections.
        '''
        # Local import: the file already depends on selenium; an explicit
        # import is more reliable than selenium.common.exceptions attribute
        # access, which only works because `webdriver` imported it first.
        from selenium.common.exceptions import WebDriverException
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.bind((self.HOST, self.PORT))
            # listen() marks the socket as passive once; it does not belong
            # inside the accept loop.
            s.listen()
            while True:
                conn, addr = s.accept()
                with conn:
                    print('Connected by', addr)
                    data = conn.recv(1024)
                    # Pre-bind so except/finally can tell whether Chrome
                    # actually started (webdriver.Chrome() itself may raise).
                    driver = None
                    try:
                        website_string = str(data.decode(encoding='UTF-8', errors='ignore'))
                        website, mode = website_string.split("#####")
                        # Prepend a scheme when the client omitted it.
                        if re.match(r'http(s)?://.*', website):
                            url = website
                        else:
                            url = "http://" + website
                        options = Options()
                        options.page_load_strategy = 'eager'
                        driver = webdriver.Chrome(options=options)
                        driver.maximize_window()
                        # Implicit wait (seconds) for element lookups.
                        driver.implicitly_wait(1)
                        driver.get(url)
                        if mode == 'HTML':
                            # Inline img/script/link resources as data URIs.
                            driver.execute_script(js)
                            # Give the asynchronous XHR/FileReader work in the
                            # injected script time to finish.
                            time.sleep(6)
                            res = driver.page_source
                            conn.sendall(res.encode(encoding='UTF-8', errors='ignore'))
                        elif mode == 'MHT':
                            # Capture the whole page as MHTML via CDP.
                            res = driver.execute_cdp_cmd('Page.captureSnapshot', {})
                            conn.sendall(res['data'].encode(encoding='UTF-8', errors='ignore'))
                        else:
                            # Unknown mode: reply instead of silently hanging
                            # the client.
                            conn.sendall("后端故障".encode())
                    # ValueError covers a malformed request (missing "#####"),
                    # which previously crashed the whole server loop.
                    except (WebDriverException, ValueError):
                        conn.sendall("后端故障".encode())
                    finally:
                        # Always release the browser, even on failure.
                        if driver is not None:
                            driver.quit()
#-----------------------------------------------------------
#客户端
from tkinter import *
from tkinter.ttk import *
class SocketCrawler:
    '''
    Socket client side of the relay.

    Sends a URL plus mode to the server, streams the reply into a local file
    (named after the URL) and opens it in the default browser. Also provides
    a minimal tkinter UI with a URL entry and one button per mode.
    '''
    def __init__(self, host, port):
        # Server address to connect to.
        self.HOST = host
        self.PORT = port

    def crawl(self, url, mode):
        '''Fetch *url* from the server in *mode* and open the saved file.

        mode: "HTML" (page source) or "MHT" (MHTML snapshot).
        Raises ValueError for any other mode (previously this produced an
        UnboundLocalError on save_file).
        '''
        if mode == "HTML":
            suffix = '.html'
        elif mode == "MHT":
            suffix = '.mht'
        else:
            raise ValueError("mode must be 'HTML' or 'MHT'")
        # Replace characters that are illegal in file names. Raw string: the
        # original '[\/\\:*?"<>|]' relied on the invalid escape '\/'.
        save_file = re.sub(r'[/\\:*?"<>|]', '_', url) + suffix
        url_string = url + "#####" + mode
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.connect((self.HOST, self.PORT))
            s.sendall(url_string.encode())
            # Context manager guarantees the file is closed even if recv or
            # decode raises mid-transfer.
            with open(save_file, 'wb') as f:
                # Stream until the server closes the connection.
                while True:
                    data = s.recv(1024)
                    if not data:
                        break
                    print(data.decode(encoding='UTF-8', errors='ignore'))
                    f.write(data)
        webbrowser.open(save_file)

    def ui(self):
        '''Show an always-on-top window with a URL entry and two fetch buttons.

        Blocks in the tkinter main loop until the window is closed.
        '''
        root = Tk()
        root.attributes("-topmost", True)
        root.title("访问网站")
        Style().configure("TEntry", foreground="#ff0000")
        entry_string = StringVar(root)
        entry = Entry(root, width=64, textvariable=entry_string)
        entry.pack(side=LEFT, padx=4, pady=4)
        button0 = Button(root, text="获取HTML", command=lambda: self.crawl(entry_string.get(), "HTML"))
        button0.pack(side=RIGHT, padx=4, pady=4)
        button1 = Button(root, text="获取MHT", command=lambda: self.crawl(entry_string.get(), "MHT"))
        button1.pack(side=RIGHT, padx=4, pady=4)
        root.mainloop()
#-----------------------------------------------------------
# Guard the entry point so importing this module does not launch the GUI.
if __name__ == "__main__":
    # Server side (run on a machine with internet access):
    # ss = socketCrawlerServer("", 8080)
    # ss.start_server()
    # Client side:
    sc = SocketCrawler("localhost", 8080)
    sc.ui()
# 版权声明:本文为MAILLIBIN原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。