JAVA ik es_ElasticSearch-IK分词使用踩坑总结

ES IK插件自带基础中文分词词典如下:

f178e59ffaf2

image

但是基础字典不能满足用户自定义需求,故需要IK支持词典自定义能力

** ES自定义分词词库有以下方式:**

方式一:用户在本地自定义词典

1.在elasticsearch-XXX(版本号)/plugins/ik/config目录下新增xx.dic文件,作为用户个性化词典,而后在plugins/ik/config目录的IKAnalyzer.cfg.xml配置文件内新增自定义扩展词典。

f178e59ffaf2

image

** 2. 将自定义的xx.dic文件名称填入 IKAnalyzer.cfg.xml 的 ext_dict 配置项(&lt;entry key="ext_dict"&gt;…&lt;/entry&gt;)中,如**

my.dic

** 3.配置之后需要重启ES服务才能生效**

4.测试

1)未加自定义词库

f178e59ffaf2

image

2)在个性化字典中加入个性化词,重启es集群,再次执行分词。

f178e59ffaf2

image

f178e59ffaf2

image

5.缺点:每次在个性化词典中新增词汇后,都需要重启es,一旦新增次数频繁会造成大量的时间浪费,且不利于es本身

方式二:配置远程自定义词典

1.在IKAnalyzer.cfg.xml配置文件中修改配置项

&lt;entry key="remote_ext_dict"&gt;location&lt;/entry&gt;

&lt;entry key="remote_ext_stopwords"&gt;location&lt;/entry&gt;

其中location是指一个 url,比如http://yoursite.com/getCustomDict,该请求只需满足以下两点即可完成分词热更新。

该 http 请求需要返回两个头部(header),一个是Last-Modified,一个是ETag,这两者都是字符串类型,只要有一个发生变化,该插件就会去抓取新的分词进而更新词库。

该 http 请求返回的内容格式是一行一个分词,换行符用\n即可。

满足上面两点要求可实现热更新分词,不需要重启 ES 实例。

可以将需自动更新的热词放在一个 UTF-8 编码的 .txt 文件里,放在 nginx 或其他简易 http server 下,当 .txt 文件修改时,http server 会在客户端请求该文件时自动返回相应的 Last-Modified 和 ETag,或者可以另外做一个工具来从业务系统提取相关词汇,并更新这个 .txt 文件。

2.IK同时支持多个词典文件加载,经过尝试使用*.dic 方式进行多文件匹配不可行

f178e59ffaf2

image

f178e59ffaf2

image

需使用";"来分隔不同dic文件路径

f178e59ffaf2

image

3.实例工具代码如下

private static final String EXT_DICT_PATH = "/data/soft/mydic";

/**
 * Serves the extension-dictionary file to the IK remote-dictionary poller.
 *
 * IK re-fetches the dictionary whenever the Last-Modified or ETag header
 * changes; here the content length is used as the change token, which only
 * works while an edit actually changes the file length — a content hash
 * would be a safer token.
 *
 * @param request  incoming poll request (unused)
 * @param response dictionary body: one word per line, UTF-8 plain text
 */
@RequestMapping(value="/getCustomDict.htm")
public void getCustomDict(HttpServletRequest request, HttpServletResponse response){
    try {
        File file = new File(EXT_DICT_PATH);
        String content = "";
        if (file.exists()) {
            // Files.readAllBytes reads fully and closes the stream itself,
            // replacing the manual FileInputStream loop (which leaked the
            // stream if read() threw before close()).
            content = new String(Files.readAllBytes(file.toPath()), StandardCharsets.UTF_8);
        }
        // Set headers and content type BEFORE touching the output stream,
        // so they cannot be ignored once the response is committed.
        response.setHeader("Last-Modified", String.valueOf(content.length()));
        response.setHeader("ETag", String.valueOf(content.length()));
        response.setContentType("text/plain; charset=utf-8");
        OutputStream out = response.getOutputStream();
        out.write(content.getBytes(StandardCharsets.UTF_8));
        out.flush();
        logger.info("{}这是读取数据值", content);
    } catch (Exception e) {
        e.printStackTrace();
    }
}

修改IKAnalyzer.cfg.xml配置文件

f178e59ffaf2

image

格式为ip:port/接口名称 or 远程服务器文件位置

方式三:修改IK源码,使用mysql数据库实现热更新

1.git clone IK分词插件源码到本地,checkout 至与当前es版本对应的分支

2.在Dictionary.java中新增一个方法

f178e59ffaf2

image

/**
 * Batch-loads stop words into the in-memory stop-word dictionary (hot update).
 *
 * Note: the parameter must be {@code Collection<String>}; with the raw type
 * the enhanced for-loop over {@code String} does not compile. The erasure is
 * unchanged, so existing callers are unaffected.
 *
 * @param words stop words to load; a null collection or null entries are skipped
 */
public void addStopWords(Collection<String> words) {
    if (words != null) {
        for (String word : words) {
            if (word != null) {
                // 批量加载词条到停用词词典中
                _StopWords.fillSegment(word.trim().toCharArray());
            }
        }
    }
}

3.新增ext包,增加三个java文件

f178e59ffaf2

image

** DBHelper.java**

package org.wltea.analyzer.ext;

import java.sql.Connection;

import java.sql.DriverManager;

import java.sql.PreparedStatement;

import java.sql.ResultSet;

import java.text.SimpleDateFormat;

import java.time.LocalDate;

import java.util.Arrays;

import java.util.Date;

import java.util.HashMap;

import java.util.List;

import java.util.Map;

import java.util.TimeZone;

import org.apache.logging.log4j.Logger;

import org.elasticsearch.common.logging.Loggers;

/**
 * Thin JDBC helper that reads dictionary words (extension / stop / synonym
 * columns) from a MySQL table for IK hot updates.
 *
 * The static connection settings are injected from elasticsearch.yml by
 * IkTokenizerFactory before the first scheduled load runs.
 */
public class DBHelper {

    Logger logger = Loggers.getLogger(DBHelper.class);

    // JDBC settings, populated from elasticsearch.yml by IkTokenizerFactory.
    public static String url = null;
    public static String dbUser = null;
    public static String dbPwd = null;
    public static String dbTable = null;

    private Connection conn;

    // Last import time per dictionary column; drives incremental loading.
    public static Map<String, Date> lastImportTimeMap = new HashMap<>();

    static {
        try {
            // 加载Mysql数据驱动(register the JDBC driver once per classloader)
            Class.forName("com.mysql.jdbc.Driver");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    private Connection getConn() throws Exception {
        try {
            // 创建数据连接
            conn = DriverManager.getConnection(url, dbUser, dbPwd);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return conn;
    }

    /**
     * Reads all non-deleted words from the given column.
     *
     * @param key    column name holding the words (ext_word / stop_word / synonym).
     *               A column name cannot be bound as a JDBC parameter; it comes
     *               from elasticsearch.yml, NOT from user input — keep it that way.
     * @param flag   when true, only rows updated since the last import are read
     * @param synony when present and non-empty, words are joined with '\n'
     *               (synonym format) instead of ','
     * @return joined word list, possibly empty
     * @throws Exception declared for API compatibility; internal errors are
     *                   currently caught and printed
     */
    public String getKey(String key, boolean flag, boolean... synony) throws Exception {
        StringBuilder data = new StringBuilder();
        StringBuilder sql = new StringBuilder("select * from " + dbTable + " where delete_type=0");
        // lastImportTime 最新更新时间
        Date lastImportTime = DBHelper.lastImportTimeMap.get(key);
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        boolean incremental = lastImportTime != null && flag;
        if (incremental) {
            // Bind the timestamp as a parameter instead of concatenating it
            // into the SQL string.
            sql.append(" and update_time > ?");
        }
        sql.append(" and " + key + " !=''");
        // Record the import time BEFORE running the query so rows written while
        // the query executes are still picked up on the next pass.
        lastImportTimeMap.put(key, new Date());
        // 如果打印出来的时间和本地时间不一样,则要注意JVM时区是否和服务器系统时区一致
        logger.warn("sql==={}", sql.toString());
        // try-with-resources closes rs/ps/connection in the correct reverse
        // order and avoids the NPE the old finally-block hit when the
        // connection could not be opened.
        try (Connection c = getConn();
             PreparedStatement ps = c.prepareStatement(sql.toString())) {
            if (incremental) {
                ps.setString(1, sdf.format(lastImportTime));
            }
            try (ResultSet rs = ps.executeQuery()) {
                while (rs.next()) {
                    String value = rs.getString(key);
                    // equivalent of StringUtils.isNotBlank without the
                    // commons-lang dependency
                    if (value != null && !value.trim().isEmpty()) {
                        // synonyms are newline-separated, other dictionaries
                        // comma-separated
                        data.append(value).append(synony != null && synony.length > 0 ? "\n" : ",");
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return data.toString();
    }

    public static void main(String[] args) throws Exception {
        DBHelper dbHelper = new DBHelper();
        String extWords = dbHelper.getKey("ext_word", true);
        List<String> extList = Arrays.asList(extWords.split(","));
        System.out.println(extList);
    }
}

** DBRunnable.java**

package org.wltea.analyzer.ext;

import java.util.Arrays;

import java.util.List;

import org.apache.logging.log4j.Logger;

import org.elasticsearch.common.logging.Loggers;

import org.wltea.analyzer.dic.Dictionary;

/**
 * Scheduled task that pulls new extension/stop words from the database and
 * merges them into the IK in-memory dictionaries.
 *
 * All exceptions are caught and logged: a throwable escaping run() would
 * silently cancel the ScheduledExecutorService's fixed-rate schedule.
 */
public class DBRunnable implements Runnable {

    Logger logger = Loggers.getLogger(DBRunnable.class);

    // Database column holding extension words.
    private String extField;
    // Database column holding stop words.
    private String stopField;

    public DBRunnable(String extField, String stopField) {
        super();
        this.extField = extField;
        this.stopField = stopField;
    }

    @Override
    public void run() {
        logger.warn("开始加载词库========");
        // 获取词库
        Dictionary dic = Dictionary.getSingleton();
        DBHelper dbHelper = new DBHelper();
        try {
            String extWords = dbHelper.getKey(extField, true);
            String stopWords = dbHelper.getKey(stopField, true);
            // plain blank-checks instead of the unresolved StringUtils helper
            if (extWords != null && !extWords.trim().isEmpty()) {
                List<String> extList = Arrays.asList(extWords.split(","));
                // 把扩展词加载到主词库中
                dic.addWords(extList);
                logger.warn("加载扩展词成功========");
                logger.warn("extWords为==={}", extWords);
            }
            if (stopWords != null && !stopWords.trim().isEmpty()) {
                List<String> stopList = Arrays.asList(stopWords.split(","));
                // 把停用词加载到停用词词库中
                dic.addStopWords(stopList);
                logger.warn("加载停用词成功========");
                logger.warn("stopWords为==={}", stopWords);
            }
        } catch (Exception e) {
            // pass the throwable as the last argument (no "{}" placeholder)
            // so the full stack trace is logged, not just e.toString()
            logger.warn("加载扩展词失败========", e);
        }
    }
}

4.在AnalysisIkPlugin.java中新增识别配置文件属性方法

f178e59ffaf2

image

@Override

public List> getSettings() {

Setting dbUrl=new Setting<>("dbUrl", "", Function.identity(), Setting.Property.NodeScope);

Setting dbUser = new Setting<>("dbUser", "", Function.identity(),Setting.Property.NodeScope);

Setting dbPwd = new Setting<>("dbPwd", "", Function.identity(),Setting.Property.NodeScope);

Setting dbTable = new Setting<>("dbTable", "", Function.identity(),Setting.Property.NodeScope);

Setting extField = new Setting<>("extField", "", Function.identity(),Setting.Property.NodeScope);

Setting stopField = new Setting<>("stopField", "", Function.identity(),Setting.Property.NodeScope);

Setting flushTime =Setting.intSetting("flushTime", 5, Setting.Property.NodeScope);

Setting autoReloadDic = Setting.boolSetting("autoReloadDic", false, Setting.Property.NodeScope);

return Arrays.asList(dbUrl,dbUser,dbPwd,dbTable,extField,stopField,flushTime,autoReloadDic);

}

5.修改IkTokenizerFactory.java的构造方法

f178e59ffaf2

image

/**
 * Builds the IK tokenizer factory and, on the first construction that sees
 * autoReloadDic=true plus a dbUrl, starts a single background scheduler that
 * periodically reloads dictionary words from MySQL.
 *
 * The "DBHelper.url is still blank" guard is what keeps this one-per-process:
 * every later factory instance skips the whole block.
 *
 * NOTE(review): the ScheduledExecutorService is never shut down, so its
 * thread outlives node close; storing the reference and hooking node
 * shutdown would fix that — TODO confirm against plugin lifecycle.
 */
public IkTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    configuration = new Configuration(env, settings);
    // 从es配置文件elasticsearch.yml中获取mysql信息
    Settings s = indexSettings.getSettings();
    String dbUrl = s.get("dbUrl");
    boolean autoReloadDic = s.getAsBoolean("autoReloadDic", false);
    // plain blank-checks instead of the unresolved StringUtils helper
    boolean notStartedYet = DBHelper.url == null || DBHelper.url.trim().isEmpty();
    boolean hasDbUrl = dbUrl != null && !dbUrl.trim().isEmpty();
    if (autoReloadDic && notStartedYet && hasDbUrl) {
        String dbUser = s.get("dbUser");
        String dbPwd = s.get("dbPwd");
        // 获取每隔多久从数据库更新信息,此处默认60S(注意与getSettings中声明的默认值5不一致)
        Integer flushTime = s.getAsInt("flushTime", 60);
        String dbTable = s.get("dbTable");
        DBHelper.dbTable = dbTable;
        DBHelper.dbUser = dbUser;
        DBHelper.dbPwd = dbPwd;
        // set url LAST: it doubles as the "already started" flag checked above
        DBHelper.url = dbUrl;
        logger.warn("dbUrl=========={}", dbUrl);
        String extField = s.get("extField");
        String stopField = s.get("stopField");
        logger.warn("extField=========={}", extField);
        logger.warn("stopField=========={}", stopField);
        ScheduledExecutorService scheduledExecutorService = Executors.newSingleThreadScheduledExecutor();
        scheduledExecutorService.scheduleAtFixedRate(new DBRunnable(extField, stopField), 0, flushTime, TimeUnit.SECONDS);
    }
}

6.修改pom文件,增加mysql驱动

f178e59ffaf2

image

执行mvn clean package 命令生成jar包

f178e59ffaf2

image

把修改后的jar放入plugins/ik目录下,替换原有文件

并同时放入mysql驱动包(版本最好同mvn中的驱动版本保持一致)

7.修改elasticsearch配置文件

vim config/elasticsearch.yml

在末尾增加

dbUrl: jdbc:mysql://数据库ip:3306/数据库名?useUnicode=true&amp;characterEncoding=utf8(代码中以dbUrl非空作为启用条件,必须配置)

dbUser: 用户名

dbPwd: 密码

dbTable: hot_words

extField: ext_word

stopField: stop_word

flushTime: 5

autoReloadDic: true

8.mysql建表

f178e59ffaf2

image

9.重启es

注:重启后es会产生如下错误

java.security.AccessControlException: access denied (java.net.SocketPermission "172.16.154.26:3306" connect,resolve)

目前的解决方式是在运行es的JDK环境内找到 %JAVA_HOME%/jre/lib/security/java.policy 文件

增加

permission java.net.SocketPermission "172.16.154.26:3306","accept";

permission java.net.SocketPermission "172.16.154.26:3306","listen";

permission java.net.SocketPermission "172.16.154.26:3306","resolve";

permission java.net.SocketPermission "172.16.154.26:3306","connect";

再次重启es后,成功


版权声明:本文为weixin_42351363原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。