-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
anyihao
committed
Sep 26, 2024
1 parent
01345c4
commit c444f50
Showing
69 changed files
with
35,117 additions
and
19,079 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
# Tencent VectorDB Sparse Encoder Java SDK | ||
|
||
Java SDK for [Tencent VectorDB Sparse Encoder](https://cloud.tencent.com/product/vdb). | ||
|
||
## Getting started | ||
|
||
|
||
### Prerequisites | ||
|
||
- Java 8 or higher | ||
- Apache Maven or Gradle/Grails | ||
|
||
### Install Java SDK | ||
|
||
You can use **Apache Maven** or **Gradle**/**Grails** to download the SDK. | ||
|
||
- Apache Maven | ||
|
||
```xml | ||
<dependency> | ||
<groupId>com.tencent.tcvectordb</groupId> | ||
<artifactId>tcvdb-text</artifactId> | ||
<version>1.0.0</version> | ||
</dependency> | ||
``` | ||
|
||
- Gradle/Grails | ||
|
||
```gradle | ||
implementation 'com.tencent.tcvectordb:tcvdb-text:1.0.0' | ||
``` | ||
|
||
### Examples | ||
|
||
Please refer to the [examples](src/main/java/com/tencent/tcvdbtext/example.java) folder for Java SDK examples. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the tcvdb-text artifact (packaged as a jar). -->
<project xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xmlns="http://maven.apache.org/POM/4.0.0"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.tencent.tcvectordb</groupId>
    <artifactId>tcvdb-text</artifactId>
    <version>1.0.0</version>
    <packaging>jar</packaging>
    <name>tcvdb-text</name>

    <dependencies>
        <dependency>
            <groupId>net.jpountz.lz4</groupId>
            <artifactId>lz4</artifactId>
            <version>1.3.0</version>
        </dependency>
        <dependency>
            <groupId>com.huaban</groupId>
            <artifactId>jieba-analysis</artifactId>
            <version>1.0.2</version>
        </dependency>
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>31.1-jre</version>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-core</artifactId>
            <version>2.14.1</version>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
            <version>2.14.1</version>
        </dependency>
    </dependencies>

    <properties>
        <!-- Java language level used by the compiler plugin below. -->
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.8.1</version>
                <configuration>
                    <source>${maven.compiler.source}</source>
                    <!-- Bytecode level follows the target property, not the source property. -->
                    <target>${maven.compiler.target}</target>
                </configuration>
            </plugin>
            <!-- Staging/deployment through the Nexus server identified as "ossrh". -->
            <plugin>
                <groupId>org.sonatype.plugins</groupId>
                <artifactId>nexus-staging-maven-plugin</artifactId>
                <version>1.6.13</version>
                <extensions>true</extensions>
                <configuration>
                    <serverId>ossrh</serverId>
                    <nexusUrl>https://oss.sonatype.org</nexusUrl>
                    <autoReleaseAfterClose>true</autoReleaseAfterClose>
                </configuration>
            </plugin>
            <!-- Attaches a sources jar to the build. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-source-plugin</artifactId>
                <version>2.2.1</version>
                <executions>
                    <execution>
                        <id>attach-sources</id>
                        <goals>
                            <goal>jar-no-fork</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <!-- Signs the artifacts with GPG during the verify phase. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-gpg-plugin</artifactId>
                <version>1.6</version>
                <executions>
                    <execution>
                        <id>sign-artifacts</id>
                        <phase>verify</phase>
                        <goals>
                            <goal>sign</goal>
                        </goals>
                        <configuration>
                            <!-- gpg.keyname is not defined in this POM; presumably supplied
                                 by settings.xml or on the command line (TODO confirm). -->
                            <keyname>${gpg.keyname}</keyname>
                            <passphraseServerId>${gpg.keyname}</passphraseServerId>
                            <gpgArguments>
                                <arg>--pinentry-mode</arg>
                                <arg>loopback</arg>
                            </gpgArguments>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

    <!-- The releases.* and snapshots.* properties are not defined in this POM;
         presumably they come from settings.xml or the command line (TODO confirm). -->
    <distributionManagement>
        <repository>
            <id>ossrh</id>
            <name>${releases.name}</name>
            <url>${releases.url}</url>
        </repository>
        <snapshotRepository>
            <id>${snapshots.id}</id>
            <name>${snapshots.name}</name>
            <url>${snapshots.url}</url>
        </snapshotRepository>
    </distributionManagement>
</project>
74 changes: 74 additions & 0 deletions
74
tcvdb_text/src/main/java/com/tencent/tcvdbtext/encoder/BaseSparseEncoder.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
/* | ||
*Copyright (c) 2024, Tencent. All rights reserved. | ||
* | ||
*Redistribution and use in source and binary forms, with or without | ||
*modification, are permitted provided that the following conditions are met: | ||
* | ||
* * Redistributions of source code must retain the above copyright notice, | ||
* this list of conditions and the following disclaimer. | ||
* * Redistributions in binary form must reproduce the above copyright | ||
* notice, this list of conditions and the following disclaimer in the | ||
* documentation and/or other materials provided with the distribution. | ||
* * Neither the name of elasticfaiss nor the names of its contributors may be used | ||
* to endorse or promote products derived from this software without | ||
* specific prior written permission. | ||
* | ||
*THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | ||
*AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
*IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
*ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS | ||
*BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | ||
*CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | ||
*SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | ||
*INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | ||
*CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
*ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF | ||
*THE POSSIBILITY OF SUCH DAMAGE. | ||
*/ | ||
package com.tencent.tcvdbtext.encoder; | ||
|
||
|
||
import org.apache.commons.lang3.tuple.Pair; | ||
|
||
import java.io.Serializable; | ||
import java.util.List; | ||
public interface BaseSparseEncoder extends Serializable { | ||
/** | ||
* Convert the given texts into its corresponding sparse vector representation. | ||
* @param texts:the texts to be encoded | ||
* @return List<List<Pair<Long, Float>>>: sparse vectors of each text | ||
*/ | ||
public List<List<Pair<Long, Float>>> encodeTexts(List<String> texts); | ||
|
||
/** | ||
* Convert the given query texts into its corresponding sparse vector representation. | ||
* @param texts: the query texts to be encoded | ||
* @return List<List<Pair<Long, Float>>>: sparse vectors of each query | ||
*/ | ||
public List<List<Pair<Long, Float>>> encodeQueries(List<String> texts); | ||
|
||
/** | ||
* Based on the given text corpus, calculate and adjust parameters such as term frequency and document count. | ||
* @param texts: the text to be fit and adjust parameters. | ||
*/ | ||
public void fitCorpus(List<String> texts); | ||
|
||
/** | ||
* Download the params of the encoder model to the local file | ||
* @param paramsFile: the file path to save the params | ||
*/ | ||
public void downloadParams(String paramsFile); | ||
|
||
/** | ||
* Set the params of the encoder model | ||
* @param paramsFile: the file path to load the params | ||
*/ | ||
public void setParams(String paramsFile); | ||
|
||
/** | ||
* Load the dictionary file used by the tokenizer. | ||
* @param dictFile: the file path to load the dict, txt format, words are separated by newline or space. | ||
*/ | ||
public void setDict(String dictFile); | ||
|
||
} |
121 changes: 121 additions & 0 deletions
121
tcvdb_text/src/main/java/com/tencent/tcvdbtext/encoder/Bm25Parameter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
/* | ||
*Copyright (c) 2024, Tencent. All rights reserved. | ||
* | ||
*Redistribution and use in source and binary forms, with or without | ||
*modification, are permitted provided that the following conditions are met: | ||
* | ||
* * Redistributions of source code must retain the above copyright notice, | ||
* this list of conditions and the following disclaimer. | ||
* * Redistributions in binary form must reproduce the above copyright | ||
* notice, this list of conditions and the following disclaimer in the | ||
* documentation and/or other materials provided with the distribution. | ||
* * Neither the name of elasticfaiss nor the names of its contributors may be used | ||
* to endorse or promote products derived from this software without | ||
* specific prior written permission. | ||
* | ||
*THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | ||
*AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
*IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
*ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS | ||
*BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | ||
*CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | ||
*SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | ||
*INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | ||
*CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
*ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF | ||
*THE POSSIBILITY OF SUCH DAMAGE. | ||
*/ | ||
package com.tencent.tcvdbtext.encoder; | ||
|
||
|
||
import com.fasterxml.jackson.annotation.JsonIgnoreProperties; | ||
import com.fasterxml.jackson.annotation.JsonInclude; | ||
import com.fasterxml.jackson.annotation.JsonProperty; | ||
|
||
import java.util.Map; | ||
|
||
@JsonIgnoreProperties(ignoreUnknown = true) | ||
@JsonInclude(JsonInclude.Include.NON_NULL) | ||
public class Bm25Parameter { | ||
private Double k1; | ||
private Double b; | ||
@JsonProperty("token_freq") | ||
private Map<String, Integer> tokenFreq; | ||
@JsonProperty("doc_count") | ||
private Integer docCount; | ||
@JsonProperty("average_doc_length") | ||
private Double averageDocLength; | ||
@JsonProperty("stop_words") | ||
private Boolean stopWords; | ||
@JsonProperty("lower_case") | ||
private Boolean lowerCase; | ||
@JsonProperty("dict_file") | ||
private String dictFile; | ||
|
||
public Double getK1() { | ||
return k1; | ||
} | ||
|
||
public void setK1(Double k1) { | ||
this.k1 = k1; | ||
} | ||
|
||
public Double getB() { | ||
return b; | ||
} | ||
|
||
public void setB(Double b) { | ||
this.b = b; | ||
} | ||
|
||
public Map<String, Integer> getTokenFreq() { | ||
return tokenFreq; | ||
} | ||
|
||
public void setTokenFreq(Map<String, Integer> tokenFreq) { | ||
this.tokenFreq = tokenFreq; | ||
} | ||
|
||
public Integer getDocCount() { | ||
return docCount; | ||
} | ||
|
||
public void setDocCount(Integer docCount) { | ||
this.docCount = docCount; | ||
} | ||
|
||
public Double getAverageDocLength() { | ||
return averageDocLength; | ||
} | ||
|
||
public void setAverageDocLength(Double averageDocLength) { | ||
this.averageDocLength = averageDocLength; | ||
} | ||
|
||
public Boolean getStopWords() { | ||
if(stopWords == null) { | ||
return true; | ||
} | ||
return stopWords; | ||
} | ||
|
||
public void setStopWords(Boolean stopWords) { | ||
this.stopWords = stopWords; | ||
} | ||
|
||
public Boolean getLowerCase() { | ||
return lowerCase; | ||
} | ||
|
||
public void setLowerCase(Boolean lowerCase) { | ||
this.lowerCase = lowerCase; | ||
} | ||
|
||
public String getDictFile() { | ||
return dictFile; | ||
} | ||
|
||
public void setDictFile(String dictFile) { | ||
this.dictFile = dictFile; | ||
} | ||
} |
Oops, something went wrong.