python socket: implementing a remote-browser-like effect

The server listens on a socket for a URL sent by the client, crawls the corresponding page with Selenium, and sends the page back over the socket, either as raw HTML or as an .mht (MHTML) snapshot. The client receives the data, saves it to a local file, and opens that file in a browser. This way, even if the client cannot reach the external network, it only needs a socket connection to the server; as long as the server can reach the external network, the client can still view external web pages.
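Concretely, the request the client sends is a single string of the form url + "#####" + mode, where mode is either "HTML" or "MHT"; the server answers with the raw page bytes and signals the end of the response by closing the connection. A minimal sketch of the client side of one exchange (hypothetical URL, assuming s is an already-connected socket):

    s.sendall("https://example.com#####MHT".encode())
    chunks = []
    while True:
        chunk = s.recv(1024)          # b'' once the server has closed the connection
        if not chunk:
            break
        chunks.append(chunk)
    mhtml_bytes = b"".join(chunks)    # the MHTML (or HTML) payload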

import re
import socket
import time
import webbrowser

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options

js='''
///

//fetch the resource of a script/img/link via XHR, then rewrite its src/href as a base64 Data URI
function loadDoc(uri,element) {
    const xhttp = new XMLHttpRequest();
    xhttp.onload = function() {
        //console.log(this.response);
        var reader = new FileReader();    
        reader.onloadend = function() {
            var base64data = reader.result;      
          
            if(base64data){
                if(element.hasAttribute("src")){
                    console.log(element);
                    element.src=base64data;
                    
                }
                if(element.hasAttribute("href")){
                    console.log(element);
                    element.href=base64data;
                }
                console.log("base64 获取成功");
                console.log(base64data);
            }
        }
        reader.readAsDataURL(this.response);   
    }  

    xhttp.responseType = "blob";
    xhttp.open("GET", uri, true);      
    xhttp.withCredentials = true;
    xhttp.send();
}

//walk every element and pick out those with a src or href attribute
var es=document.getElementsByTagName('*'); 
for(var i=0;i<es.length;i++){
    if(es[i].hasAttribute("src")||es[i].hasAttribute("href")){
        //console.log(es[i].tagName);
        //console.log(es[i].src);
        if(es[i].hasAttribute("src")){
            //self-assignment rewrites a relative src to its absolute URL
            es[i].src=es[i].src;
        }
        if(es[i].hasAttribute("href")){
            //same trick for href
            es[i].href=es[i].href;
        }
        
        
        if(["SCRIPT","IMG","LINK"].includes(es[i].tagName.toUpperCase())){
            if(es[i].hasAttribute("src")){
                console.log("src: "+es[i].src);
                loadDoc(es[i].src,es[i]);
            }
            if(es[i].hasAttribute("href")){
                console.log("href: "+es[i].href);
                loadDoc(es[i].href,es[i]);
            } 
            console.log(es[i].tagName);
        }
    } 
}
///
'''
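#For reference: the Data URIs produced by reader.readAsDataURL above are just a MIME
#prefix plus a base64 payload, i.e. "data:<mime-type>;base64,<data>". An equivalent
#construction in Python would be (a standalone illustration, not used by this program):
#   import base64
#   payload = base64.b64encode(b"console.log('hi');").decode()
#   data_uri = "data:text/javascript;base64," + payload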


#-----------------------------------------------------------
#Server
class socketCrawlerServer:

    '''
        python socket server: crawls the requested page and sends it back to the client
    '''
    def __init__(self,host,port):

        self.HOST = host  
        self.PORT = port
        
    def start_server(self):
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:

            s.bind((self.HOST, self.PORT))   
            
            s.listen()
            while True:
                conn, addr = s.accept()
                with conn:
                    print('Connected by', addr)
                    data = conn.recv(1024)
                    #crawl the target page with selenium
                    driver=None
                    try:
                        website_string=str(data.decode(encoding='UTF-8',errors='ignore'))
                        website,mode=website_string.split("#####")
                        m=re.match( r'http(s)?://.*', website)
                        if m:
                            url=website
                        else:
                            url="http://"+website
                        #url="http://"+str(data.decode(encoding='UTF-8',errors='ignore'))
                        options = Options()
                        options.page_load_strategy = 'eager'
                        driver=webdriver.Chrome(options=options)
                        driver.maximize_window()
                        #implicit wait of 1 second
                        driver.implicitly_wait(1)
                        driver.get(url)
                        #run the js that rewrites img/script/css resources as base64 Data URIs
                        if mode=='HTML':
                            driver.execute_script(js)
                            time.sleep(6)
                            #print("page source after running the js", driver.page_source)
                            res=driver.page_source
                            driver.quit()
                            #send the page source to the client
                            conn.sendall(res.encode(encoding='UTF-8',errors='ignore'))
                        elif mode=='MHT':
                            #save the page as an MHTML snapshot; Page.captureSnapshot returns it in the 'data' field
                            res = driver.execute_cdp_cmd('Page.captureSnapshot', {})
                            driver.quit()
                            #send the MHTML data to the client
                            conn.sendall(res['data'].encode(encoding='UTF-8',errors='ignore'))
                    except WebDriverException:
                        #driver may still be None if webdriver.Chrome() itself failed to start
                        if driver is not None:
                            driver.quit()
                        conn.sendall("backend error".encode())
                        
                    
#-----------------------------------------------------------
#Client
from tkinter import *
from tkinter.ttk import *

class SocketCrawler:
    '''
        python socket client: fetches the page data from the server, saves it, and opens it in a browser
    '''    
    
    def __init__(self,host,port):
        self.HOST=host
        self.PORT=port
    def crawl(self,url,mode):
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.connect((self.HOST, self.PORT))

            #replace characters that are illegal in file names with '_'
            safe_name=re.sub(r'[\\/:*?"<>|]','_',url)
            if mode=="HTML":
                save_file=safe_name+'.html'
                url_string=url+"#####HTML"
            elif mode=="MHT":
                save_file=safe_name+'.mht'
                url_string=url+"#####MHT"

            #request format: url + "#####" + mode
            s.sendall(url_string.encode())
            
            #read until the server closes the connection, writing each chunk to the file
            with open(save_file,'wb') as f:
                while True:
                    data = s.recv(1024)
                    if not data:
                        break
                    print(data.decode(encoding='UTF-8',errors='ignore'))
                    f.write(data)

            #open the saved file in the local browser
            webbrowser.open(save_file)
            
    def ui(self):      
        root = Tk() 
        root.attributes("-topmost", True)
        root.title("Visit website")
        Style().configure("TEntry",foreground="#ff0000")
        entry_string =StringVar(root)
        
        entry= Entry(root, width=64,textvariable = entry_string)
        entry.pack(side=LEFT,padx=4,pady=4)
        
        button0 = Button(root, text ="Fetch HTML", command = lambda: self.crawl(entry_string.get(),"HTML"))
        button0.pack(side=RIGHT,padx=4,pady=4)
        
        button1 = Button(root, text ="Fetch MHT", command = lambda: self.crawl(entry_string.get(),"MHT"))
        button1.pack(side=RIGHT,padx=4,pady=4)
        
        root.mainloop()
        
        
        
#-----------------------------------------------------------

#run the server
#ss=socketCrawlerServer("",8080)   
#ss.start_server()

#run the client
sc=SocketCrawler("localhost",8080)
sc.ui()
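#To use this across two machines, uncomment the server lines above on the host that can
#reach the external network, and point SocketCrawler at that host's address from the
#restricted machine, e.g. SocketCrawler("<server-ip>", 8080), where "<server-ip>" is a
#placeholder for the real address; a working Chrome/chromedriver setup is assumed on the
#server side.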



