Skip to content

Commit

Permalink
feat: add opensearch module
Browse files Browse the repository at this point in the history
  • Loading branch information
zhanglei committed Mar 10, 2024
1 parent ff24cdd commit 38bc426
Show file tree
Hide file tree
Showing 40 changed files with 777 additions and 335 deletions.
69 changes: 69 additions & 0 deletions core/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Build descriptor for the ik-core module: a plain jar that is a child of
  com.infinilabs:analysis-ik 1.0 and compiles at Java 1.8 source/target level.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>analysis-ik</artifactId>
<groupId>com.infinilabs</groupId>
<version>1.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>

<artifactId>ik-core</artifactId>
<packaging>jar</packaging>

<properties>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>

<dependencies>
<!--
  Lucene is declared with provided scope, so it is needed to compile but is not
  bundled with this artifact. The lucene.version property is not defined in this
  file; presumably it comes from the parent POM (confirm there).
-->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<scope>provided</scope>
<version>${lucene.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analysis-common</artifactId>
<scope>provided</scope>
<version>${lucene.version}</version>
</dependency>

<!-- No scope element here, so Maven's default (compile) scope applies and the jar is a transitive dependency of consumers. -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.13</version>
</dependency>

<!-- Logging API only, provided scope: the logging implementation is expected from the runtime environment. -->
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-api</artifactId>
<version>2.18.0</version>
<scope>provided</scope>
</dependency>

<!-- Test-only dependencies (Hamcrest matchers and JUnit 4). -->
<dependency>
<groupId>org.hamcrest</groupId>
<artifactId>hamcrest-core</artifactId>
<version>1.3</version>
<scope>test</scope>
</dependency>

<dependency>
<groupId>org.hamcrest</groupId>
<artifactId>hamcrest-library</artifactId>
<version>1.3</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
</dependencies>

</project>
45 changes: 45 additions & 0 deletions core/src/main/java/org/wltea/analyzer/cfg/Configuration.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
package org.wltea.analyzer.cfg;


import java.nio.file.Path;

/**
 * Base settings object for the IK analyzer.
 *
 * <p>Carries three boolean switches and delegates every path-related lookup to
 * concrete subclasses through abstract methods.
 */
public abstract class Configuration {

    /** Whether smart segmentation is enabled; starts out {@code false}. */
    protected boolean useSmart;

    /** Whether remote dictionary loading is enabled; starts out {@code false}. */
    protected boolean enableRemoteDict;

    /** Whether lowercase processing is enabled; starts out {@code true}. */
    protected boolean enableLowercase = true;

    /** Creates a configuration with the default switch values. */
    public Configuration() {
    }

    // ---- switches -------------------------------------------------------

    /**
     * @return {@code true} when smart segmentation is enabled
     */
    public boolean isUseSmart() {
        return this.useSmart;
    }

    /**
     * Turns smart segmentation on or off.
     *
     * @param useSmart the new value of the switch
     * @return this instance, so calls can be chained
     */
    public Configuration setUseSmart(boolean useSmart) {
        this.useSmart = useSmart;
        return this;
    }

    /**
     * @return {@code true} when remote dictionary loading is enabled
     */
    public boolean isEnableRemoteDict() {
        return this.enableRemoteDict;
    }

    /**
     * @return {@code true} when lowercase processing is enabled
     */
    public boolean isEnableLowercase() {
        return this.enableLowercase;
    }

    // ---- environment-specific lookups -----------------------------------

    /**
     * @return the configuration directory supplied by the subclass
     */
    public abstract Path getConfDir();

    /**
     * @return the configuration directory inside the plugin, as supplied by the
     *         subclass (NOTE(review): exact location is implementation-defined;
     *         confirm against the concrete subclasses)
     */
    public abstract Path getConfigInPluginDir();

    /**
     * Builds a {@link Path} from one or more name elements.
     *
     * @param first the first (or only) element of the path
     * @param more  optional further elements
     * @return the path produced by the subclass for these elements
     */
    public abstract Path getPath(String first, String... more);

    /**
     * Hook with an empty default implementation; subclasses may override it.
     *
     * <p>NOTE(review): callers appear to invoke this right before privileged
     * remote-dictionary access; confirm the intended contract with the
     * subclasses.
     */
    public void check() {
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ public final class IKSegmenter {
private List<ISegmenter> segmenters;
//分词歧义裁决器
private IKArbitrator arbitrator;
private Configuration configuration;
private Configuration configuration;


/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,17 +45,13 @@

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.elasticsearch.SpecialPermission;
import org.elasticsearch.core.PathUtils;
import org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin;
import org.wltea.analyzer.cfg.Configuration;
import org.apache.logging.log4j.Logger;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.help.ESPluginLoggerFactory;


Expand Down Expand Up @@ -103,7 +99,7 @@ public class Dictionary {
private Dictionary(Configuration cfg) {
this.configuration = cfg;
this.props = new Properties();
this.conf_dir = cfg.getEnvironment().configFile().resolve(AnalysisIkPlugin.PLUGIN_NAME);
this.conf_dir = cfg.getConfDir();
Path configFile = conf_dir.resolve(FILE_NAME);

InputStream input = null;
Expand Down Expand Up @@ -159,10 +155,10 @@ public static synchronized void initial(Configuration cfg) {
// 建立监控线程
for (String location : singleton.getRemoteExtDictionarys()) {
// 10 秒是初始延迟可以修改的 60是间隔时间 单位秒
pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
pool.scheduleAtFixedRate(new Monitor(location, cfg), 10, 60, TimeUnit.SECONDS);
}
for (String location : singleton.getRemoteExtStopWordDictionarys()) {
pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
pool.scheduleAtFixedRate(new Monitor(location, cfg), 10, 60, TimeUnit.SECONDS);
}
}

Expand Down Expand Up @@ -224,7 +220,7 @@ private List<String> getExtDictionarys() {
String[] filePaths = extDictCfg.split(";");
for (String filePath : filePaths) {
if (filePath != null && !"".equals(filePath.trim())) {
Path file = PathUtils.get(getDictRoot(), filePath.trim());
Path file = configuration.getPath(getDictRoot(), filePath.trim());
walkFileTree(extDictFiles, file);

}
Expand Down Expand Up @@ -257,7 +253,7 @@ private List<String> getExtStopWordDictionarys() {
String[] filePaths = extStopWordDictCfg.split(";");
for (String filePath : filePaths) {
if (filePath != null && !"".equals(filePath.trim())) {
Path file = PathUtils.get(getDictRoot(), filePath.trim());
Path file = configuration.getPath(getDictRoot(), filePath.trim());
walkFileTree(extStopWordDictFiles, file);

}
Expand Down Expand Up @@ -385,7 +381,7 @@ private void loadMainDict() {
_MainDict = new DictSegment((char) 0);

// 读取主词典文件
Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_MAIN);
Path file = configuration.getPath(getDictRoot(), Dictionary.PATH_DIC_MAIN);
loadDictFile(_MainDict, file, false, "Main Dict");
// 加载扩展词典
this.loadExtDict();
Expand All @@ -403,7 +399,7 @@ private void loadExtDict() {
for (String extDictName : extDictFiles) {
// 读取扩展词典文件
logger.info("[Dict Loading] " + extDictName);
Path file = PathUtils.get(extDictName);
Path file = configuration.getPath(extDictName);
loadDictFile(_MainDict, file, false, "Extra Dict");
}
}
Expand All @@ -416,7 +412,7 @@ private void loadRemoteExtDict() {
List<String> remoteExtDictFiles = getRemoteExtDictionarys();
for (String location : remoteExtDictFiles) {
logger.info("[Dict Loading] " + location);
List<String> lists = getRemoteWords(location);
List<String> lists = getRemoteWords(location, configuration::check);
// 如果找不到扩展的字典,则忽略
if (lists == null) {
logger.error("[Dict Loading] " + location + " load failed");
Expand All @@ -433,8 +429,8 @@ private void loadRemoteExtDict() {

}

private static List<String> getRemoteWords(String location) {
SpecialPermission.check();
private static List<String> getRemoteWords(String location, Runnable runnable) {
runnable.run();
return AccessController.doPrivileged((PrivilegedAction<List<String>>) () -> {
return getRemoteWordsUnprivileged(location);
});
Expand Down Expand Up @@ -496,7 +492,7 @@ private void loadStopWordDict() {
_StopWords = new DictSegment((char) 0);

// 读取主词典文件
Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_STOP);
Path file = configuration.getPath(getDictRoot(), Dictionary.PATH_DIC_STOP);
loadDictFile(_StopWords, file, false, "Main Stopwords");

// 加载扩展停止词典
Expand All @@ -506,7 +502,7 @@ private void loadStopWordDict() {
logger.info("[Dict Loading] " + extStopWordDictName);

// 读取扩展词典文件
file = PathUtils.get(extStopWordDictName);
file = configuration.getPath(extStopWordDictName);
loadDictFile(_StopWords, file, false, "Extra Stopwords");
}
}
Expand All @@ -515,7 +511,7 @@ private void loadStopWordDict() {
List<String> remoteExtStopWordDictFiles = getRemoteExtStopWordDictionarys();
for (String location : remoteExtStopWordDictFiles) {
logger.info("[Dict Loading] " + location);
List<String> lists = getRemoteWords(location);
List<String> lists = getRemoteWords(location, configuration::check);
// 如果找不到扩展的字典,则忽略
if (lists == null) {
logger.error("[Dict Loading] " + location + " load failed");
Expand All @@ -539,25 +535,25 @@ private void loadQuantifierDict() {
// 建立一个量词典实例
_QuantifierDict = new DictSegment((char) 0);
// 读取量词词典文件
Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_QUANTIFIER);
Path file = configuration.getPath(getDictRoot(), Dictionary.PATH_DIC_QUANTIFIER);
loadDictFile(_QuantifierDict, file, false, "Quantifier");
}

private void loadSurnameDict() {
DictSegment _SurnameDict = new DictSegment((char) 0);
Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_SURNAME);
Path file = configuration.getPath(getDictRoot(), Dictionary.PATH_DIC_SURNAME);
loadDictFile(_SurnameDict, file, true, "Surname");
}

private void loadSuffixDict() {
DictSegment _SuffixDict = new DictSegment((char) 0);
Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_SUFFIX);
Path file = configuration.getPath(getDictRoot(), Dictionary.PATH_DIC_SUFFIX);
loadDictFile(_SuffixDict, file, true, "Suffix");
}

private void loadPrepDict() {
DictSegment _PrepDict = new DictSegment((char) 0);
Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_PREP);
Path file = configuration.getPath(getDictRoot(), Dictionary.PATH_DIC_PREP);
loadDictFile(_PrepDict, file, true, "Preposition");
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.logging.log4j.Logger;
import org.elasticsearch.SpecialPermission;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.help.ESPluginLoggerFactory;

public class Monitor implements Runnable {
Expand All @@ -31,15 +31,20 @@ public class Monitor implements Runnable {
* 请求地址
*/
private String location;

private Configuration configuration;



public Monitor(String location) {
public Monitor(String location, Configuration cfg) {
this.location = location;
this.last_modified = null;
this.eTags = null;
this.configuration = cfg;
}

public void run() {
SpecialPermission.check();
configuration.check();
AccessController.doPrivileged((PrivilegedAction<Void>) () -> {
this.runUnprivileged();
return null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,12 @@
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

/**
Expand Down
63 changes: 63 additions & 0 deletions elasticsearch/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Build descriptor for the Elasticsearch flavour of the IK analysis plugin.
  It is a child of com.infinilabs:analysis-ik 1.0 and depends on the shared
  ik-core module; the artifact version follows the elasticsearch.version property.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <artifactId>analysis-ik</artifactId>
        <groupId>com.infinilabs</groupId>
        <version>1.0</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>

    <artifactId>elasticsearch-analysis-ik</artifactId>
    <version>${elasticsearch.version}</version>
    <description>IK Analyzer for Elasticsearch</description>
    <packaging>jar</packaging>


    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <!-- Assembly descriptor location, anchored to this module's base directory. -->
        <elasticsearch.assembly.descriptor>${project.basedir}/src/main/assemblies/plugin.xml</elasticsearch.assembly.descriptor>
        <elasticsearch.plugin.name>analysis-ik</elasticsearch.plugin.name>
        <elasticsearch.plugin.classname>com.infinilabs.ik.elasticsearch.AnalysisIkPlugin</elasticsearch.plugin.classname>
        <elasticsearch.plugin.jvm>true</elasticsearch.plugin.jvm>
    </properties>

    <dependencies>
        <!-- Engine-independent analyzer code, versioned together with the parent project. -->
        <dependency>
            <groupId>com.infinilabs</groupId>
            <artifactId>ik-core</artifactId>
            <version>${project.parent.version}</version>
        </dependency>
        <dependency>
            <groupId>org.elasticsearch</groupId>
            <artifactId>elasticsearch</artifactId>
            <version>${elasticsearch.version}</version>
            <scope>compile</scope>
        </dependency>

    </dependencies>
    <build>
        <plugins>
            <!-- Packages the plugin distribution into target/releases during the package phase. -->
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.6.0</version>
                <configuration>
                    <appendAssemblyId>false</appendAssemblyId>
                    <outputDirectory>${project.build.directory}/releases/</outputDirectory>
                    <descriptors>
                        <!--
                          Use the basedir-anchored property instead of a path relative to the
                          repository root, so the build does not depend on the directory
                          Maven is launched from.
                        -->
                        <descriptor>${elasticsearch.assembly.descriptor}</descriptor>
                    </descriptors>
                </configuration>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
Loading

0 comments on commit 38bc426

Please sign in to comment.