Java 敏感词过滤(DFA有穷自动机)

Java 敏感词过滤 DFA有穷自动机

初始化敏感词

敏感词存放在 Set 集合中;这里为了演示直接写死在代码里,实际使用时也可以改为从数据库查询。每个元素的格式为"敏感词 + 编码(id)",例如 "河南0.3300" 表示敏感词"河南",编码为"0.3300"。

 /**
     * Supplies the entries from which the lookup tree is built.
     *
     * <p>Each entry is a sensitive word immediately followed by its id; for example the
     * word "河南" with id "0.3300" is stored as "河南0.3300".
     *
     * @return a new set containing the three hard-coded entries
     */
    private static Set<String> getKeySet() {
        // word + id, written back to back
        final String[] entries = {"河南0.3300", "河北0.3200", "北京0.1000"};
        HashSet<String> keys = new HashSet<>(entries.length);
        for (String entry : entries) {
            keys.add(entry);
        }
        return keys;
    }

构建树

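将 set 集合中的数据构建成一棵树(字典树),方便按字符逐个查询。以上面三个敏感词为例,树的结构如下(原文此处为一张图片):

根节点
├── 河 (isEnd=0)
│   ├── 南 (isEnd=1, id=0.3300)
│   └── 北 (isEnd=1, id=0.3200)
└── 北 (isEnd=0)
    └── 京 (isEnd=1, id=0.1000)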

/**
     * Builds the lookup tree inside {@code cityMap} from the entries returned by
     * {@code getKeySet()} (the entries are also stored in {@code keySet}).
     *
     * <p>Every node is a map. A {@code Character} key leads to the child node for that
     * character; the {@code String} keys {@code "isEnd"} ("1" or "0") and {@code "id"}
     * record whether a word ends at the node and, if so, its id. The id of an entry starts
     * at the first position where {@code "0."} occurs; everything before it is the word.
     */
    // The tree mixes Character and String keys, so the maps have to be handled as raw types.
    @SuppressWarnings({"unchecked", "rawtypes"})
    public static void add() {
        keySet = getKeySet();
        for (String key : keySet) {
            // e.g. "北京0.1000"
            Map node = cityMap;
            for (int i = 0; i < key.length(); i++) {
                // startsWith(prefix, offset) simply yields false near the end of the string,
                // so an entry that ends with '0' no longer reads past the last character.
                if (key.startsWith("0.", i)) {
                    node.put("isEnd", "1");
                    node.put("id", key.substring(i));
                    break;
                }
                char ch = key.charAt(i);
                Map child = (Map) node.get(ch);
                if (child == null) {
                    child = new HashMap<>();
                    child.put("isEnd", "0");
                    node.put(ch, child);
                }
                node = child;
            }
        }
    }

查询

传入要匹配的字符串,如果其中包含某个敏感词,就返回该敏感词的 id(只返回最先匹配到的那一个敏感词,匹配到后立即结束)。

/**
     * Looks for a sensitive word inside {@code txt} and returns the id of the first one found.
     *
     * <p>The tree in {@code cityMap} is built on first use. Every position of the text is
     * tried as the start of a word, and the tree is followed character by character from
     * there; the search stops at the first node marked {@code "isEnd" = "1"}.
     *
     * @param txt the text to scan; may be {@code null}
     * @return the id of the first word found, or an empty string when {@code txt} is
     *         {@code null} or contains no word
     */
    @SuppressWarnings("rawtypes")
    public static String check(String txt) {
        if (cityMap.size() == 0) {
            add();
        }
        if (txt == null) {
            return "";
        }
        int length = txt.length();
        for (int start = 0; start < length; start++) {
            Map node = cityMap;
            // Restarting from every position also covers a partial match that runs into the
            // end of the text: the positions after its first character are still examined.
            for (int i = start; i < length; i++) {
                node = (Map) node.get(txt.charAt(i));
                if (node == null) {
                    break;
                }
                if ("1".equals(node.get("isEnd"))) {
                    return node.get("id").toString();
                }
            }
        }
        return "";
    }
}

完整代码如下

/**
 * @projectName XXXX
 * @package com.XXXX.system
 * @className com.XXXX.system.CityTree
 * @copyright Copyright 2021 XXXX, Inc All rights reserved.
 */
package com.XXXX.system;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

/**
 * CityTree
 *
 * <p>Builds a character tree (trie) from province/city entries and uses it to find which
 * entry's word occurs in a piece of text, e.g. to look up the province of a company from
 * its name.
 *
 * <p>Each entry is a word immediately followed by its id. The id starts at the first
 * occurrence of {@code "0."}; for example {@code "河南0.3300"} is the word "河南" with id
 * {@code "0.3300"}.
 *
 * <p>A tree is always built completely before it is stored in the volatile {@code root}
 * field and is not modified afterwards.
 *
 * @author XXX
 * @date 2022/10/17 17:38
 * @version 1.0
 */
public class CityTree {
    /** Marker that starts the id part of an entry. */
    private static final String ID_PREFIX = "0.";

    /** Value returned by {@link #check(String)} when no word is found. */
    private static final String NO_MATCH = "";

    /** Root of the current lookup tree; {@code null} until the first build. */
    private static volatile Node root;

    /** One tree node: children keyed by the next character, plus the id of a word ending here. */
    private static final class Node {
        private final Map<Character, Node> children = new HashMap<>();
        /** Id of the word that ends at this node, or {@code null} if no word ends here. */
        private String id;
    }

    /**
     * Builds the lookup tree from the built-in entries, replacing any tree built earlier.
     */
    public static void add() {
        add(getKeySet());
    }

    /**
     * Builds the lookup tree from the given entries, replacing any tree built earlier.
     *
     * <p>Entries that are {@code null}, contain no {@code "0."} id, or have no word in
     * front of the id are skipped.
     *
     * @param keys entries in the form word + id, e.g. {@code "北京0.1000"}
     * @throws IllegalArgumentException if {@code keys} is {@code null}
     */
    public static void add(Set<String> keys) {
        if (keys == null) {
            throw new IllegalArgumentException("keys must not be null");
        }
        Node newRoot = new Node();
        for (String key : keys) {
            insert(newRoot, key);
        }
        // Publish only after every entry has been inserted.
        root = newRoot;
    }

    /** Adds the word of one entry below {@code treeRoot} and stores its id on the last node. */
    private static void insert(Node treeRoot, String key) {
        if (key == null) {
            return;
        }
        // indexOf never reads past the end, so an entry ending in '0' is handled safely.
        int idStart = key.indexOf(ID_PREFIX);
        if (idStart <= 0) {
            // No id, or no word before the id: such an entry could never be matched.
            return;
        }
        Node node = treeRoot;
        for (int i = 0; i < idStart; i++) {
            char ch = key.charAt(i);
            Node child = node.children.get(ch);
            if (child == null) {
                child = new Node();
                node.children.put(ch, child);
            }
            node = child;
        }
        node.id = key.substring(idStart);
    }

    /**
     * Supplies the built-in entries.
     *
     * @return a new set containing the hard-coded entries
     */
    private static Set<String> getKeySet() {
        Set<String> set = new HashSet<>(3);
        // word "河南", id "0.3300"
        set.add("河南0.3300");
        set.add("河北0.3200");
        set.add("北京0.1000");
        return set;
    }

    /**
     * Looks for a known word inside {@code txt} and returns the id of the first one found.
     *
     * <p>The tree is built from the built-in entries on first use. Every position of the
     * text is tried as the start of a word; the search stops at the first word that ends.
     *
     * @param txt the text to scan; may be {@code null}
     * @return the id of the first word found, or an empty string when {@code txt} is
     *         {@code null} or contains no known word
     */
    public static String check(String txt) {
        if (txt == null) {
            return NO_MATCH;
        }
        Node tree = root;
        if (tree == null) {
            add();
            tree = root;
        }
        int length = txt.length();
        for (int start = 0; start < length; start++) {
            Node node = tree;
            // Restarting from every position also covers a partial match that runs into the
            // end of the text: the positions after its first character are still examined.
            for (int i = start; i < length; i++) {
                node = node.children.get(txt.charAt(i));
                if (node == null) {
                    break;
                }
                if (node.id != null) {
                    return node.id;
                }
            }
        }
        return NO_MATCH;
    }
}
 

后记

1.查询方法可根据不同的需求修改为匹配多个敏感词、修改敏感词等
2.构建树方法可修改为不要id,仅构建敏感词树


版权声明:本文为tjk12345原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。