JAVA写的网页爬虫爬取必应词典的词汇音标

为了建立一个英语词汇音标库,需要从网上搜索词汇音标,由于有10万多个词汇需要处理,所以做个这个单词音标爬虫爬取必应网络词典网页上的单词音标。


package bingword;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javafx.application.Application;
import javafx.beans.value.ChangeListener;
import javafx.beans.value.ObservableValue;
import javafx.concurrent.Worker;
import javafx.event.ActionEvent;
import javafx.event.EventHandler;
import javafx.geometry.Insets;
import javafx.scene.Scene;
import javafx.scene.control.Button;
import javafx.scene.control.Label;
import javafx.scene.control.TextArea;
import javafx.scene.control.TextField;
import javafx.scene.control.Tooltip;
import javafx.scene.layout.GridPane;
import javafx.scene.text.Font;
import javafx.scene.web.WebEngine;
import javafx.scene.web.WebView;
import javafx.stage.FileChooser;
import javafx.stage.Stage;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;

public class BingWord extends Application {

    private final GridPane grid = new GridPane();
    private final TextField inputText = new TextField();
    private final Button btn = new Button();
    private final Button btn1 = new Button();
    private final Button btn2 = new Button();
    private final Button btnPause = new Button();
    private final WebView browser = new WebView();
    private final WebEngine webEngine = browser.getEngine();
    private final TextArea textArea = new TextArea();
    private final Label info = new Label();
    private int recordPosition = 1;
    private String currentWord = "";
    private Connection connection = null;
    private Statement statement;
    private ResultSet rs;
    private int maxNumber = 1;
    private Boolean pause = false;

    @Override
    public void start(Stage primaryStage) throws SQLException {

        //程序启动后就连接好SQLite数据库,或者新建一个dictionary.db数据库文件
        connection = DriverManager.getConnection("jdbc:sqlite:dictionary.db");
        statement = connection.createStatement();
        statement.setQueryTimeout(30);
        statement.executeUpdate("CREATE TABLE IF NOT EXISTS wordlist (id INTEGER PRIMARY KEY AUTOINCREMENT, word TEXT, pronunciation1 TEXT, pronunciation2 TEXT)");
        System.out.println("dictionary.db数据库已经连接");

        //文本框供手动设置开始行位置,默认值为第一行开始
        inputText.setText("1");

        //查找必应词典音标的启动按钮
        Tooltip tooltip=new Tooltip();
        tooltip.setText("请在右侧输入框内填入开始查找的行号");
        tooltip.setFont(new Font("Arial", 20));     
        
        btn.setTooltip(tooltip);
        btn.setText("开始查找音标行");
        btn.setOnAction(new EventHandler<ActionEvent>() {
            @Override
            public void handle(ActionEvent event) {
                //重启
                pause = false;
                //获取文本框中设定的起始行位置
                recordPosition = Integer.parseInt(inputText.getText());
                try {
                    //查询记录总条数
                    rs = statement.executeQuery("SELECT count(*) FROM wordlist;");
                    while (rs.next()) {
                        maxNumber = rs.getInt(1);
                        System.out.println("数据库中总单词量:" + maxNumber);
                    }
                    //单词音标搜索程序
                    search();
                } catch (SQLException ex) {
                    Logger.getLogger(BingWord.class.getName()).log(Level.SEVERE, null, ex);
                }

            }
        });

        //将EXCEL导入到SQLITE数据库中
        Tooltip tooltip1=new Tooltip();
        tooltip1.setText("将EXCEL表中第一列单词导入到SQLite数据库中");
        tooltip1.setFont(new Font("Arial", 20));          
        
        btn1.setTooltip(tooltip1);        
        btn1.setText("Excel -> SQLite");
        btn1.setOnAction(new EventHandler<ActionEvent>() {
            @Override
            public void handle(ActionEvent event) {
                System.out.println("将Excel中的单词导入dictionary.db数据库的wordlist表中");
                FileChooser fileChooser = new FileChooser();
                fileChooser.setTitle("打开EXCEL文件");
                fileChooser.getExtensionFilters().add(new FileChooser.ExtensionFilter("EXCEL", "*.xls"));
                File file = fileChooser.showOpenDialog(primaryStage);
                if (file == null) {
                    return;
                }
                String filePath = file.getPath();
                System.out.println(filePath);
                try {
                    //导入EXCEL文件
                    FileInputStream inputExcel = new FileInputStream(filePath);
                    HSSFWorkbook wb = new HSSFWorkbook(inputExcel);
                    //获取SHEET的张数
                    int sheetCount = wb.getNumberOfSheets();
                    System.out.println("导入的EXCEL表总共有工作表张数:" + sheetCount);
                    //循环操作各个工作表
                    for (int i = 0; i < sheetCount; i++) {
                        HSSFSheet sheet = wb.getSheetAt(i);
                        int rowCount = sheet.getLastRowNum();
                        System.out.println("sheet" + i + "总共有" + rowCount + "行");
                        for (int j = 0; j <= rowCount; j++) {
                            HSSFRow row = sheet.getRow(j);
                            if (row != null) {
                                if (row.getCell(0) != null) {
                                    row.getCell(0).setCellType(1);
                                    //循环读取各个工作表第一列
                                    String cellValue = row.getCell(0).getStringCellValue().trim();
                                    //先查询看单词是否存在
                                    rs = statement.executeQuery("SELECT * FROM wordlist WHERE word='" + cellValue + "';");
                                    System.out.println("SELECT * FROM wordlist WHERE word='" + cellValue + "';");

                                    if (rs.next()) {
                                        //如果单词已经存在则无需重复插入
                                    } else {
                                        //如果单词不存在就将该单词插入到单词列表中
                                        statement.executeUpdate("INSERT INTO wordlist (word) VALUES('" + cellValue + "');");
                                        System.out.println("INSERT INTO wordlist (word) VALUES('" + cellValue + "');");
                                    }
                                }
                            }
                        }
                    }
                    info.setText("EXCEL中的数据已经导入数据库完毕");
                    //记得打开的文件要及时关闭
                    wb.close();
                    inputExcel.close();
                } catch (FileNotFoundException ex) {
                    Logger.getLogger(BingWord.class.getName()).log(Level.SEVERE, null, ex);
                } catch (IOException | SQLException ex) {
                    Logger.getLogger(BingWord.class.getName()).log(Level.SEVERE, null, ex);
                }
            }
        });

        //将数据写入EXCEL文件保存,注意EXCEL字体需要设置为IPAPANNEW,否则音标会乱码
        Tooltip tooltip2=new Tooltip();
        tooltip2.setText("完整的单词列表和单词音标将从数据库输出到EXCEL表中");
        tooltip2.setFont(new Font("Arial", 20));  
        
        btn2.setTooltip(tooltip2);        
        btn2.setText("SQLite -> Excel");
        btn2.setOnAction(new EventHandler<ActionEvent>() {
            @Override
            public void handle(ActionEvent event) {
                System.out.println("将SQLITE数据库中的数据导出到EXCEL文件中");
                FileChooser fileChooser = new FileChooser();
                fileChooser.setTitle("保存到EXCEL文件");
                fileChooser.getExtensionFilters().add(new FileChooser.ExtensionFilter("EXCEL", "*.xls"));
                fileChooser.setInitialFileName("wordlist.xls");
                File file = fileChooser.showSaveDialog(primaryStage);
                if (file == null) {
                    return;
                }
                String filePath = file.getPath();
                System.out.println(filePath);
                try {
                    //获取总记录数目maxNumber
                    rs = statement.executeQuery("SELECT count(*) FROM wordlist;");
                    while (rs.next()) {
                        maxNumber = rs.getInt(1);
                        System.out.println("数据库中总单词量:" + maxNumber);
                    }
                    //计算总共需要多少个工作表来存数据
                    int maxRowNumber = 65536;
                    int sheetCount = (int) (Math.floor(maxNumber / (maxRowNumber + 1)) + 1);
                    //目标EXCEL文件输出流
                    FileOutputStream outputExcel = new FileOutputStream(filePath);
                    //创建EXCEL表
                    HSSFWorkbook wb = new HSSFWorkbook();
                    //循环创建各个工作表和单元格
                    for (int i = 0; i < sheetCount; i++) {
                        HSSFSheet sheet = wb.createSheet("wordlist" + i);
                        //设置列宽
                        sheet.setColumnWidth(1, 6400);
                        sheet.setColumnWidth(2, 6400);
                        sheet.setColumnWidth(3, 6400);
                        for (int j = 0; j < maxRowNumber; j++) {
                            HSSFRow row = sheet.createRow(j);
                            HSSFCell cell0 = row.createCell(0);
                            HSSFCell cell1 = row.createCell(1);
                            HSSFCell cell2 = row.createCell(2);
                            HSSFCell cell3 = row.createCell(3);
                            cell0.setCellType(1);
                            cell1.setCellType(1);
                            cell2.setCellType(1);
                            cell3.setCellType(1);
                        }
                    }
                    //将数据库记录存入Excel各个对应单元格
                    rs = statement.executeQuery("SELECT * FROM wordlist;");
                    while (rs.next()) {
                        int currentLineNumber = 0;
                        String currentWordSpell = "";
                        String currentPronunciation1 = "";
                        String currentPronunciation2 = "";

                        currentLineNumber = rs.getInt(1);

                        if (rs.getString(2) != null) {
                            currentWordSpell = rs.getString(2).trim();
                        }
                        if (rs.getString(3) != null) {
                            currentPronunciation1 = rs.getString(3).trim();
                        }
                        if (rs.getString(4) != null) {
                            currentPronunciation2 = rs.getString(4).trim();
                        }

                        //计算当前记录应该写入第几个工作表第几行
                        int targetSheetNumber = (int) (Math.floor(currentLineNumber / (maxRowNumber + 1)));
                        int targetRowNumber = (currentLineNumber - 1) % maxRowNumber;

                        //将数据库的记录写入相应单元格中
                        Row currentRow = wb.getSheetAt(targetSheetNumber).getRow(targetRowNumber);
                        currentRow.getCell(0).setCellValue(currentLineNumber + "");
                        currentRow.getCell(1).setCellValue(currentWordSpell);
                        currentRow.getCell(2).setCellValue(currentPronunciation1);
                        currentRow.getCell(3).setCellValue(currentPronunciation2);

                    }
                    //写入文件并关闭
                    wb.write(outputExcel);
                    wb.close();
                    outputExcel.close();
                    info.setText("数据已经保存到文件 " + filePath);

                } catch (FileNotFoundException ex) {
                    Logger.getLogger(BingWord.class.getName()).log(Level.SEVERE, null, ex);
                } catch (IOException | SQLException ex) {
                    Logger.getLogger(BingWord.class.getName()).log(Level.SEVERE, null, ex);
                }
            }
        });

        //暂停按钮设置
        Tooltip tooltip3=new Tooltip();
        tooltip3.setText("暂停查找循环");
        tooltip3.setFont(new Font("Arial", 20));    
        
        btnPause.setTooltip(tooltip3);        
        btnPause.setText("Pause");
        btnPause.setOnAction(new EventHandler<ActionEvent>() {
            @Override
            public void handle(ActionEvent event) {
                pause = true;
            }
        });

        //WEBVIEW浏览器监听加载成功事件
        webEngine.getLoadWorker().stateProperty()
                .addListener(new ChangeListener<Worker.State>() {
                    @Override
                    public void changed(ObservableValue<? extends Worker.State> ov,
                            Worker.State oldState, Worker.State newState) {

                        //如果点击了暂停按钮则跳出程序循环
                        if (pause == true) {
                            return;
                        }
                        //每次浏览器加载完毕则执行操作
                        if (newState == Worker.State.SUCCEEDED) {
                            //获取浏览器中网页的HTML
                            String docContent = webEngine.getDocument().getDocumentElement().getTextContent();
                            String resultString = "";

                            //用正则表达式搜索获取HTML中的美式音标
                            Pattern patternUS = Pattern.compile("美\\S\\[[^\\[\\]]+\\]");
                            Matcher matcherUS = patternUS.matcher(docContent);
                            if (matcherUS.find()) {
                                resultString = matcherUS.group(0).replaceAll("[美\\s\\[\\]]", "");
                                //把单个单引号替换成两个单引号避免SQL报错
                                resultString = resultString.replaceAll("'", "''");
                                System.out.println("PronunciationUS:" + resultString);
                                info.setText("PronunciationUS: " + resultString);
                                try {
                                    //如果找到了音标则写入数据库
                                    statement.executeUpdate("UPDATE wordlist SET pronunciation1='" + resultString + "' WHERE word='" + currentWord + "';");
                                    System.out.println("UPDATE wordlist SET pronunciation1='" + resultString + "' WHERE word='" + currentWord + "';");
                                } catch (SQLException ex) {
                                    Logger.getLogger(BingWord.class.getName()).log(Level.SEVERE, null, ex);
                                }
                            } else {
                                System.out.println("NO MATCH");
                                info.setText("NO MATCH");
                            }

                            //用正则表达式搜索获取HTML中的英式音标
                            Pattern patternUK = Pattern.compile("英\\S\\[[^\\[\\]]+\\]");
                            Matcher matcherUK = patternUK.matcher(docContent);
                            if (matcherUK.find()) {
                                resultString = matcherUK.group(0).replaceAll("[英\\s\\[\\]]", "");
                                //把单个单引号替换成两个单引号避免SQL报错
                                resultString = resultString.replaceAll("'", "''");
                                System.out.println("PronunciationUK: " + resultString);
                                info.setText("PronunciationUK: " + resultString);
                                try {
                                    //如果找到了音标则写入数据库
                                    statement.executeUpdate("UPDATE wordlist SET pronunciation2='" + resultString + "' WHERE word='" + currentWord + "';");
                                    System.out.println("UPDATE wordlist SET pronunciation2='" + resultString + "' WHERE word='" + currentWord + "';");
                                } catch (SQLException ex) {
                                    Logger.getLogger(BingWord.class.getName()).log(Level.SEVERE, null, ex);
                                }
                            } else {
                                System.out.println("NO MATCH");
                                info.setText("NO MATCH");
                            }

                            //进入查询接龙式循环
                            recordPosition++;
                            inputText.setText(recordPosition + "");

                            if (recordPosition > maxNumber) {
                                info.setText("大功告成");
                                return;
                            }

                            try {
                                //继续查询程序
                                search();
                            } catch (SQLException ex) {
                                Logger.getLogger(BingWord.class.getName()).log(Level.SEVERE, null, ex);
                            }

                        }
                    }

                });

        //整理控件布局并显示到场景中
        grid.add(btn, 0, 0);
        grid.add(inputText, 1, 0);
        grid.add(btn1, 2, 0);
        grid.add(btn2, 3, 0);
        grid.add(btnPause, 4, 0);
        grid.add(browser, 0, 1, 5, 1);
        grid.add(info, 0, 3, 5, 1);

        grid.setPadding(new Insets(10, 10, 10, 10));
        grid.setVgap(10);
        grid.setHgap(10);

        browser.setPrefSize(800, 600);
        
        Scene scene = new Scene(grid, 600, 400);

        scene.widthProperty().addListener(
                new ChangeListener() {
            @Override
            public void changed(ObservableValue observable,
                    Object oldValue, Object newValue) {
                Double width = (Double) newValue;
                browser.setPrefWidth(width - 20);
            }
        }
        );
        scene.heightProperty().addListener(
                new ChangeListener() {
            @Override
            public void changed(ObservableValue observable,
                    Object oldValue, Object newValue) {
                Double height = (Double) newValue;
                browser.setPrefHeight(height - 50);
            }
        }
        );

        primaryStage.setTitle("必应英汉词典音标爬虫 Benny");
        primaryStage.setScene(scene);
        primaryStage.show();
    }

    public static void main(String[] args) {
        launch(args);
    }

    public void search() throws SQLException {
        //按单词列表序号依次开始在必应词典网站中请求查询各个单词
        rs = statement.executeQuery("SELECT * FROM wordlist WHERE id=" + recordPosition);
        if (rs.next()) {
            currentWord = rs.getString(2);
            System.out.println("当前正在处理的单词是:" + currentWord);
            webEngine.load("http://cn.bing.com/dict/search?q=" + currentWord);
        }
    }

}



版权声明:本文为MAILLIBIN原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。