Skip to content

Commit 1b46839

Browse files
author
binbin.hou
committed
dd
1 parent 175604d commit 1b46839

File tree

9 files changed

+204
-7
lines changed

9 files changed

+204
-7
lines changed

CHANGELOG.md

+7
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,10 @@
1414
| 序号 | 变更类型 | 说明 | 时间 | 备注 |
1515
|:---|:---|:---|:---|:--|
1616
| 1 | A | 常见基础实现 | 2019-02-20 21:40:43 | |
17+
18+
19+
# release_1.1.0
20+
21+
| 序号 | 变更类型 | 说明 | 时间 | 备注 |
22+
|:---|:---|:---|:---|:--|
23+
| 1 | A | 添加拆字实现 | 2021-11-25 21:40:43 | |

README.md

+8-6
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ maven 3.x+
5050
<dependency>
5151
<groupId>com.github.houbb</groupId>
5252
<artifactId>nlp-hanzi-similar</artifactId>
53-
<version>1.0.0</version>
53+
<version>1.1.0</version>
5454
</dependency>
5555
```
5656

@@ -83,6 +83,7 @@ double rate = HanziSimilarBs.newInstance()
8383
.bushouRate(6)
8484
.bihuashuRate(2)
8585
.pinyinRate(1)
86+
.chaiziRate(8)
8687
.similar('末', '未');
8788
```
8889

@@ -127,11 +128,12 @@ HanziSimilarBs 中允许自定义的配置列表如下:
127128
| 10 | sijiaoRate | 四角编码权重 |
128129
| 12 | sijiaoData | 四角编码数据 |
129130
| 13 | sijiaoSimilar | 四角编码相似度策略 |
130-
| 14 | pinyinRate | 拼音权重 |
131-
| 15 | pinyinData | 拼音数据 |
132-
| 16 | pinyinSimilar | 拼音相似度策略 |
133-
| 17 | hanziSimilar | 汉字相似度核心策略 |
134-
| 18 | userDefineData | 用户自定义数据 |
131+
| 14 | pinyinData | 拼音数据 |
132+
| 15 | pinyinSimilar | 拼音相似度策略 |
133+
| 16 | hanziSimilar | 汉字相似度核心策略 |
134+
| 17 | userDefineData | 用户自定义数据 |
135+
| 18 | chaiziRate | 拆字比例 |
136+
| 19 | chaiziSimilar | 拆字相似度 |
135137

136138
所有的配置都可以基于接口,用户进行自定义。
137139

pom.xml

+10
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,11 @@
4848
<artifactId>pinyin</artifactId>
4949
<version>0.2.2</version>
5050
</dependency>
51+
<dependency>
52+
<groupId>com.github.houbb</groupId>
53+
<artifactId>nlp-chaizi</artifactId>
54+
<version>1.1.0</version>
55+
</dependency>
5156

5257
<!--============================== OTHER ==============================-->
5358
<dependency>
@@ -80,6 +85,11 @@
8085
<artifactId>pinyin</artifactId>
8186
</dependency>
8287

88+
<dependency>
89+
<groupId>com.github.houbb</groupId>
90+
<artifactId>nlp-chaizi</artifactId>
91+
</dependency>
92+
8393
<!--============================== OTHER ==============================-->
8494
<dependency>
8595
<groupId>junit</groupId>

src/main/java/com/github/houbb/nlp/hanzi/similar/api/IHanziSimilarContext.java

+12
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,18 @@ public interface IHanziSimilarContext {
8080
*/
8181
double pinyinRate();
8282

83+
/**
84+
* 拆字相似度计算
85+
* @return 相似度计算
86+
*/
87+
IHanziSimilar chaiziSimiar();
88+
89+
/**
90+
* 拆字比例
91+
* @return 比例
92+
*/
93+
double chaiziRate();
94+
8395
/**
8496
* 用户自定义 数据
8597
* @return 数据

src/main/java/com/github/houbb/nlp/hanzi/similar/bs/HanziSimilarBs.java

+25-1
Original file line numberDiff line numberDiff line change
@@ -95,8 +95,20 @@ public static HanziSimilarBs newInstance() {
9595
*/
9696
private IHanziSimilar pinyinSimilar = HanziSimilars.pinyin();
9797

98+
/**
99+
* 拆字占比
100+
*/
101+
private double chaiziRate = HanziSimilarRateConst.CHAIZI;
102+
103+
/**
104+
* 拆字相似度实现
105+
* @since 1.1.0
106+
*/
107+
private IHanziSimilar chaiziSimilar = HanziSimilars.chaizi();
108+
98109
/**
99110
* 核心实现
111+
* @since 1.1.0
100112
*/
101113
private IHanziSimilar hanziSimilar = Instances.singleton(HanziSimilar.class);
102114

@@ -183,6 +195,16 @@ public HanziSimilarBs pinyinSimilar(IHanziSimilar pinyinSimilar) {
183195
return this;
184196
}
185197

198+
public HanziSimilarBs chaiziRate(double chaiziRate) {
199+
this.chaiziRate = chaiziRate;
200+
return this;
201+
}
202+
203+
public HanziSimilarBs chaiziSimilar(IHanziSimilar chaiziSimilar) {
204+
this.chaiziSimilar = chaiziSimilar;
205+
return this;
206+
}
207+
186208
public HanziSimilarBs hanziSimilar(IHanziSimilar hanziSimilar) {
187209
this.hanziSimilar = hanziSimilar;
188210
return this;
@@ -222,7 +244,9 @@ private IHanziSimilarContext buildContext(char one, char two) {
222244
.sijiaoRate(sijiaoRate)
223245
.sijiaoSimilar(sijiaoSimilar)
224246
.pinyinRate(pinyinRate)
225-
.pinyinSimilar(pinyinSimilar);
247+
.pinyinSimilar(pinyinSimilar)
248+
.chaiziRate(chaiziRate)
249+
.chaiziSimiar(chaiziSimilar);
226250

227251
return context;
228252
}

src/main/java/com/github/houbb/nlp/hanzi/similar/constant/HanziSimilarRateConst.java

+5
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,11 @@ private HanziSimilarRateConst(){}
2020
*/
2121
public static final double SIJIAO = 8.0;
2222

23+
/**
24+
* 拆字
25+
*/
26+
public static final double CHAIZI = 6.0;
27+
2328
/**
2429
* 偏旁部首
2530
*/
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
package com.github.houbb.nlp.hanzi.similar.support.similar;
2+
3+
import com.github.houbb.heaven.util.lang.CharUtil;
4+
import com.github.houbb.heaven.util.util.ArrayPrimitiveUtil;
5+
import com.github.houbb.heaven.util.util.ArrayUtil;
6+
import com.github.houbb.nlp.hanzi.similar.api.IHanziSimilar;
7+
import com.github.houbb.nlp.hanzi.similar.api.IHanziSimilarContext;
8+
import com.github.houbb.nlp.hanzi.similar.util.ChaiziHelper;
9+
10+
import java.util.List;
11+
import java.util.Map;
12+
13+
/**
14+
* 拆字
15+
*
16+
* A = {A1, A2, ..., Am}
17+
* B = {B1, B2, ..., Bm}
18+
*
19+
* 每一个组成部分都有对应的笔画数(没有默认取1),所以有对应的权重。
20+
*
21+
* 得分应该如何计算呢?
22+
*
23+
* 长度:min(A, B) = m_AB
24+
* 然后遍历,遍历元素。比如以 A 为准。
25+
*
26+
* A1 和 B1 相同,score_1 = A1_n/A_n + B1_n/B_n;
27+
*
28+
* 如何归一化?
29+
*
30+
* @author binbin.hou
31+
* @since 1.1.0
32+
*/
33+
public class ChaiziSimilar implements IHanziSimilar {
34+
35+
@Override
36+
public double similar(IHanziSimilarContext similarContext) {
37+
String hanziOne = similarContext.charOne();
38+
String hanziTwo = similarContext.charTwo();
39+
40+
int numberOne = getNumber(hanziOne, similarContext);
41+
int numberTwo = getNumber(hanziTwo, similarContext);
42+
43+
// 拆分
44+
char[] charsOne = getSplitChars(hanziOne);
45+
char[] charsTwo = getSplitChars(hanziTwo);
46+
47+
int minLen = Math.min(charsOne.length, charsTwo.length);
48+
49+
// 比较
50+
double totalScore = 0.0;
51+
for(int i = 0; i < minLen; i++) {
52+
char iChar = charsOne[i];
53+
String textChar = iChar+"";
54+
if(ArrayPrimitiveUtil.contains(charsTwo, iChar)) {
55+
int textNumber = getNumber(textChar, similarContext);
56+
57+
double scoreOne = textNumber*1.0 / numberOne * 1.0;
58+
double scoreTwo = textNumber*1.0 / numberTwo * 1.0;
59+
60+
totalScore += (scoreOne + scoreTwo) / 2.0;
61+
}
62+
}
63+
64+
return totalScore * similarContext.chaiziRate();
65+
}
66+
67+
/**
68+
* 获取拆分后对应的拆分字符
69+
* @param charWord 字符
70+
* @return 结果
71+
*/
72+
private char[] getSplitChars(String charWord) {
73+
List<String> stringList = ChaiziHelper.chai(charWord.charAt(0));
74+
75+
// 这里应该选择哪一个是有讲究的。此处为了简单,默认选择第一个。
76+
String string = stringList.get(0);
77+
78+
return string.toCharArray();
79+
}
80+
81+
/**
82+
* 获取笔画数
83+
* @param text 文本
84+
* @param similarContext 上下文
85+
* @return 结果
86+
*/
87+
private int getNumber(String text, IHanziSimilarContext similarContext) {
88+
Map<String, Integer> map = similarContext.bihuashuData().dataMap();
89+
90+
Integer number = map.get(text);
91+
if(number == null) {
92+
return 1;
93+
}
94+
95+
return number;
96+
}
97+
98+
}

src/main/java/com/github/houbb/nlp/hanzi/similar/support/similar/HanziSimilarContext.java

+30
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,16 @@ public class HanziSimilarContext implements IHanziSimilarContext {
9595
*/
9696
private IHanziSimilar pinyinSimilar;
9797

98+
/**
99+
* 拆字相似度计算
100+
*/
101+
private IHanziSimilar chaiziSimiar;
102+
103+
/**
104+
* 拆字比例
105+
*/
106+
private double chaiziRate;
107+
98108
@Override
99109
public String charOne() {
100110
return charOne;
@@ -264,4 +274,24 @@ public HanziSimilarContext pinyinSimilar(IHanziSimilar pinyinSimilar) {
264274
this.pinyinSimilar = pinyinSimilar;
265275
return this;
266276
}
277+
278+
@Override
279+
public IHanziSimilar chaiziSimiar() {
280+
return chaiziSimiar;
281+
}
282+
283+
public HanziSimilarContext chaiziSimiar(IHanziSimilar chaiziSimiar) {
284+
this.chaiziSimiar = chaiziSimiar;
285+
return this;
286+
}
287+
288+
@Override
289+
public double chaiziRate() {
290+
return chaiziRate;
291+
}
292+
293+
public HanziSimilarContext chaiziRate(double chaiziRate) {
294+
this.chaiziRate = chaiziRate;
295+
return this;
296+
}
267297
}

src/main/java/com/github/houbb/nlp/hanzi/similar/support/similar/HanziSimilars.java

+9
Original file line numberDiff line numberDiff line change
@@ -51,4 +51,13 @@ public static IHanziSimilar sijiao() {
5151
return Instances.singleton(SijiaoSimilar.class);
5252
}
5353

54+
/**
55+
* 拆字
56+
* @return 实现
57+
* @since 1.1.0
58+
*/
59+
public static IHanziSimilar chaizi() {
60+
return Instances.singleton(ChaiziSimilar.class);
61+
}
62+
5463
}

0 commit comments

Comments
 (0)