Skip to content

Commit

Permalink
2.0.3
Browse files Browse the repository at this point in the history
  • Loading branch information
anyihao committed Sep 26, 2024
1 parent 01345c4 commit c444f50
Show file tree
Hide file tree
Showing 69 changed files with 35,117 additions and 19,079 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,16 @@ You can use **Apache Maven** or **Gradle**/**Grails** to download the SDK.
<dependency>
<groupId>com.tencent.tcvectordb</groupId>
<artifactId>vectordatabase-sdk-java</artifactId>
<version>1.3.6</version>
<version>2.0.3</version>
</dependency>
```

- Gradle/Grails

```gradle
compile 'com.tencent.tcvectordb:vectordatabase-sdk-java:1.3.6'
compile 'com.tencent.tcvectordb:vectordatabase-sdk-java:2.0.3'
```

### Examples

Please refer to [examples](.tcvectordb/src/main/java/com/tencent/tcvectordb/examples) folder for Java SDK examples.
Please refer to [examples](./tcvectordb/src/main/java/com/tencent/tcvectordb/examples) folder for Java SDK examples.
35 changes: 35 additions & 0 deletions tcvdb_text/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Tencent VectorDB Sparse Encoder Java SDK

Java SDK for [Tencent VectorDB Sparse Encoder](https://cloud.tencent.com/product/vdb).

## Getting started


### Prerequisites

- Java 8 or higher
- Apache Maven or Gradle/Grails

### Install Java SDK

You can use **Apache Maven** or **Gradle**/**Grails** to download the SDK.

- Apache Maven

```xml
<dependency>
<groupId>com.tencent.tcvectordb</groupId>
<artifactId>tcvdb-text</artifactId>
<version>1.0.0</version>
</dependency>
```

- Gradle/Grails

```gradle
compile 'com.tencent.tcvectordb:tcvdb-text:1.0.0'
```

### Examples

Please refer to [example.java](src/main/java/com/tencent/tcvdbtext/example.java) for Java SDK examples.
116 changes: 116 additions & 0 deletions tcvdb_text/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Maven build descriptor for com.tencent.tcvectordb:tcvdb-text (jar packaging).

  Properties referenced below but NOT defined in this file must be supplied
  from outside (for example settings.xml or -D on the command line):
  gpg.keyname, releases.name, releases.url, snapshots.id, snapshots.name,
  snapshots.url.
-->
<project xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xmlns="http://maven.apache.org/POM/4.0.0"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.tencent.tcvectordb</groupId>
    <artifactId>tcvdb-text</artifactId>
    <version>1.0.0</version>
    <packaging>jar</packaging>
    <name>tcvdb-text</name>

    <!--
      NOTE(review): the encoder sources appear to import
      org.apache.commons.lang3.tuple.Pair, but commons-lang3 is not declared
      here. Presumably it is resolved transitively through another dependency;
      confirm, and consider declaring it directly.
    -->
    <dependencies>
        <dependency>
            <groupId>net.jpountz.lz4</groupId>
            <artifactId>lz4</artifactId>
            <version>1.3.0</version>
        </dependency>
        <dependency>
            <groupId>com.huaban</groupId>
            <artifactId>jieba-analysis</artifactId>
            <version>1.0.2</version>
        </dependency>
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>31.1-jre</version>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-core</artifactId>
            <version>2.14.1</version>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
            <version>2.14.1</version>
        </dependency>
    </dependencies>

    <!-- Java 8 language level for both source and bytecode; sources are UTF-8. -->
    <properties>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.8.1</version>
                <configuration>
                    <source>${maven.compiler.source}</source>
                    <!-- Bound to the target property (not the source property) so
                         the two levels can be changed independently. -->
                    <target>${maven.compiler.target}</target>
                </configuration>
            </plugin>
            <!-- Stages to oss.sonatype.org using the "ossrh" server entry; the
                 staging repository is released automatically once closed. -->
            <plugin>
                <groupId>org.sonatype.plugins</groupId>
                <artifactId>nexus-staging-maven-plugin</artifactId>
                <version>1.6.13</version>
                <extensions>true</extensions>
                <configuration>
                    <serverId>ossrh</serverId>
                    <nexusUrl>https://oss.sonatype.org</nexusUrl>
                    <autoReleaseAfterClose>true</autoReleaseAfterClose>
                </configuration>
            </plugin>
            <!-- Attaches a sources jar to the build (jar-no-fork goal). -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-source-plugin</artifactId>
                <version>2.2.1</version>
                <executions>
                    <execution>
                        <id>attach-sources</id>
                        <goals>
                            <goal>jar-no-fork</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <!-- Signs the artifacts during the verify phase. The gpg.keyname
                 property selects the key and is also used as the
                 passphraseServerId; gpg is invoked with loopback pinentry. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-gpg-plugin</artifactId>
                <version>1.6</version>
                <executions>
                    <execution>
                        <id>sign-artifacts</id>
                        <phase>verify</phase>
                        <goals>
                            <goal>sign</goal>
                        </goals>
                        <configuration>
                            <keyname>${gpg.keyname}</keyname>
                            <passphraseServerId>${gpg.keyname}</passphraseServerId>
                            <gpgArguments>
                                <arg>--pinentry-mode</arg>
                                <arg>loopback</arg>
                            </gpgArguments>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

    <!-- The release repository id is fixed to "ossrh", the same id used as
         serverId by the nexus-staging plugin above; the snapshot repository
         id, names and URLs come from externally supplied properties. -->
    <distributionManagement>
        <repository>
            <id>ossrh</id>
            <name>${releases.name}</name>
            <url>${releases.url}</url>
        </repository>
        <snapshotRepository>
            <id>${snapshots.id}</id>
            <name>${snapshots.name}</name>
            <url>${snapshots.url}</url>
        </snapshotRepository>
    </distributionManagement>
</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/*
*Copyright (c) 2024, Tencent. All rights reserved.
*
*Redistribution and use in source and binary forms, with or without
*modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of elasticfaiss nor the names of its contributors may be used
* to endorse or promote products derived from this software without
* specific prior written permission.
*
*THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
*AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
*IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
*ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
*BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
*CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
*SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
*INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
*CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
*ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
*THE POSSIBILITY OF SUCH DAMAGE.
*/
package com.tencent.tcvdbtext.encoder;


import org.apache.commons.lang3.tuple.Pair;

import java.io.Serializable;
import java.util.List;
/**
 * Contract for encoders that turn texts into sparse vectors.
 *
 * <p>Each sparse vector is a list of {@code Pair<Long, Float>} entries. The
 * interface extends {@link Serializable}, so implementations are expected to
 * be serializable.
 */
public interface BaseSparseEncoder extends Serializable {
    /**
     * Converts the given texts into their corresponding sparse vector representations.
     *
     * @param texts the texts to be encoded
     * @return the sparse vector of each text, as a {@code List<List<Pair<Long, Float>>>}
     */
    List<List<Pair<Long, Float>>> encodeTexts(List<String> texts);

    /**
     * Converts the given query texts into their corresponding sparse vector representations.
     *
     * @param texts the query texts to be encoded
     * @return the sparse vector of each query, as a {@code List<List<Pair<Long, Float>>>}
     */
    List<List<Pair<Long, Float>>> encodeQueries(List<String> texts);

    /**
     * Calculates and adjusts parameters such as term frequency and document count
     * based on the given text corpus.
     *
     * @param texts the texts used to fit and adjust the parameters
     */
    void fitCorpus(List<String> texts);

    /**
     * Downloads the params of the encoder model to a local file.
     *
     * @param paramsFile the file path to save the params to
     */
    void downloadParams(String paramsFile);

    /**
     * Sets the params of the encoder model.
     *
     * @param paramsFile the file path to load the params from
     */
    void setParams(String paramsFile);

    /**
     * Loads the dictionary file used by the tokenizer.
     *
     * @param dictFile the file path to load the dict from; txt format, words are
     *                 separated by newline or space
     */
    void setDict(String dictFile);

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
/*
*Copyright (c) 2024, Tencent. All rights reserved.
*
*Redistribution and use in source and binary forms, with or without
*modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of elasticfaiss nor the names of its contributors may be used
* to endorse or promote products derived from this software without
* specific prior written permission.
*
*THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
*AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
*IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
*ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
*BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
*CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
*SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
*INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
*CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
*ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
*THE POSSIBILITY OF SUCH DAMAGE.
*/
package com.tencent.tcvdbtext.encoder;


import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonProperty;

import java.util.Map;

/**
 * Mutable holder for BM25 encoder parameters, mapped to and from JSON through
 * Jackson annotations.
 *
 * <p>Unknown JSON properties are ignored when reading, and properties whose
 * value is {@code null} are omitted when writing. Fields carrying a
 * {@code @JsonProperty} annotation use the snake_case name given there.
 */
@JsonIgnoreProperties(ignoreUnknown = true)
@JsonInclude(JsonInclude.Include.NON_NULL)
public class Bm25Parameter {
    private Double k1;
    private Double b;
    @JsonProperty("token_freq")
    private Map<String, Integer> tokenFreq;
    @JsonProperty("doc_count")
    private Integer docCount;
    @JsonProperty("average_doc_length")
    private Double averageDocLength;
    @JsonProperty("stop_words")
    private Boolean stopWords;
    @JsonProperty("lower_case")
    private Boolean lowerCase;
    @JsonProperty("dict_file")
    private String dictFile;

    /** @return the {@code k1} parameter, or {@code null} if unset */
    public Double getK1() {
        return this.k1;
    }

    /** @param k1 the {@code k1} parameter to store */
    public void setK1(Double k1) {
        this.k1 = k1;
    }

    /** @return the {@code b} parameter, or {@code null} if unset */
    public Double getB() {
        return this.b;
    }

    /** @param b the {@code b} parameter to store */
    public void setB(Double b) {
        this.b = b;
    }

    /** @return the stored token-frequency map itself (not a copy), or {@code null} if unset */
    public Map<String, Integer> getTokenFreq() {
        return this.tokenFreq;
    }

    /** @param tokenFreq the token-frequency map to store (kept by reference) */
    public void setTokenFreq(Map<String, Integer> tokenFreq) {
        this.tokenFreq = tokenFreq;
    }

    /** @return the document count, or {@code null} if unset */
    public Integer getDocCount() {
        return this.docCount;
    }

    /** @param docCount the document count to store */
    public void setDocCount(Integer docCount) {
        this.docCount = docCount;
    }

    /** @return the average document length, or {@code null} if unset */
    public Double getAverageDocLength() {
        return this.averageDocLength;
    }

    /** @param averageDocLength the average document length to store */
    public void setAverageDocLength(Double averageDocLength) {
        this.averageDocLength = averageDocLength;
    }

    /**
     * Returns the stop-words flag, falling back to {@code true} when it was never set.
     *
     * @return the stored flag, or {@code true} if the stored value is {@code null}
     */
    public Boolean getStopWords() {
        // Both branches are of type Boolean, so no unboxing takes place here.
        return (this.stopWords != null) ? this.stopWords : Boolean.TRUE;
    }

    /** @param stopWords the stop-words flag to store; {@code null} restores the default */
    public void setStopWords(Boolean stopWords) {
        this.stopWords = stopWords;
    }

    /** @return the lower-case flag, or {@code null} if unset (no default is applied) */
    public Boolean getLowerCase() {
        return this.lowerCase;
    }

    /** @param lowerCase the lower-case flag to store */
    public void setLowerCase(Boolean lowerCase) {
        this.lowerCase = lowerCase;
    }

    /** @return the dictionary file path, or {@code null} if unset */
    public String getDictFile() {
        return this.dictFile;
    }

    /** @param dictFile the dictionary file path to store */
    public void setDictFile(String dictFile) {
        this.dictFile = dictFile;
    }
}
Loading

0 comments on commit c444f50

Please sign in to comment.