Skip to content

Commit 28ad60f

Browse files
committed
save file hash values to db
1 parent 7509d33 commit 28ad60f

File tree

11 files changed

+199
-259
lines changed

11 files changed

+199
-259
lines changed

src/main/java/com/apolloconfig/apollo/ai/qabot/api/VectorDBService.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,8 @@ void persistChunkEmbeddings(String fileRoot, List<String> chunks,
1010
List<Embedding> embeddings);
1111

1212
List<MarkdownSearchResult> search(List<List<Float>> searchVectors, int topK);
13+
14+
String queryFileHashValue(String fileRoot);
15+
16+
void persistFile(String fileRoot, String hashValue);
1317
}

src/main/java/com/apolloconfig/apollo/ai/qabot/config/MilvusConfig.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ public class MilvusConfig {
1313
private int port;
1414
private String collection;
1515

16+
private String fileCollection;
17+
1618
private boolean useZillzCloud;
1719

1820
private String zillizCloudUri;
@@ -31,6 +33,14 @@ public String getCollection() {
3133
return collection;
3234
}
3335

36+
public String getFileCollection() {
37+
return fileCollection;
38+
}
39+
40+
public void setFileCollection(String fileCollection) {
41+
this.fileCollection = fileCollection;
42+
}
43+
3444
public void setHost(String host) {
3545
this.host = host;
3646
}

src/main/java/com/apolloconfig/apollo/ai/qabot/controller/HelloController.java

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,7 @@ public String hello(@PathVariable String name) {
2626
ChatMessage userMessage = new ChatMessage(ChatMessageRole.USER.value(),
2727
"write a brief greeting for " + name);
2828

29-
String result = aiService.getCompletionFromMessages(
29+
return aiService.getCompletionFromMessages(
3030
Lists.newArrayList(systemMessage, userMessage));
31-
32-
return result;
3331
}
3432
}

src/main/java/com/apolloconfig/apollo/ai/qabot/markdown/MarkdownProcessor.java

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
import com.vladsch.flexmark.util.ast.Document;
1212
import com.vladsch.flexmark.util.ast.Node;
1313
import java.io.IOException;
14-
import java.nio.charset.StandardCharsets;
1514
import java.nio.file.Files;
1615
import java.nio.file.Path;
1716
import java.nio.file.Paths;
@@ -20,6 +19,7 @@
2019
import java.util.ArrayList;
2120
import java.util.List;
2221
import java.util.Map;
22+
import java.util.Objects;
2323
import java.util.stream.Stream;
2424
import org.slf4j.Logger;
2525
import org.slf4j.LoggerFactory;
@@ -38,7 +38,6 @@ public class MarkdownProcessor {
3838
private final MarkdownProcessorRetryConfig markdownProcessorRetryConfig;
3939
private final AiService aiService;
4040
private final VectorDBService vectorDBService;
41-
private final Map<String, String> hashValueMap = Maps.newConcurrentMap();
4241
private final BackOff backOff;
4342

4443
public MarkdownProcessor(MarkdownFilesConfig markdownFilesConfig,
@@ -120,11 +119,12 @@ private boolean processFileWithRetry(Path mdFile) throws IOException {
120119
boolean processFile(Path mdFile) throws IOException {
121120
String fileRoot = getMarkdownFileRoots(mdFile);
122121

123-
String markdownContent = new String(Files.readAllBytes(mdFile), StandardCharsets.UTF_8);
122+
String markdownContent = Files.readString(mdFile);
124123
String hashValue = computeHash(markdownContent);
125124

126-
// we could store the hash value in the database
127-
if (hashValueMap.containsKey(fileRoot) && hashValueMap.get(fileRoot).equals(hashValue)) {
125+
String fileHashValue = vectorDBService.queryFileHashValue(fileRoot);
126+
127+
if (Objects.equals(hashValue, fileHashValue)) {
128128
return false;
129129
}
130130

@@ -137,7 +137,7 @@ boolean processFile(Path mdFile) throws IOException {
137137

138138
vectorDBService.persistChunkEmbeddings(fileRoot, chunks, embeddings);
139139

140-
hashValueMap.put(fileRoot, hashValue);
140+
vectorDBService.persistFile(fileRoot, hashValue);
141141

142142
return true;
143143
}

src/main/java/com/apolloconfig/apollo/ai/qabot/milvus/MilvusService.java

Lines changed: 175 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
package com.apolloconfig.apollo.ai.qabot.milvus;
22

3-
import com.google.common.collect.Lists;
43
import com.apolloconfig.apollo.ai.qabot.api.VectorDBService;
54
import com.apolloconfig.apollo.ai.qabot.config.MilvusConfig;
65
import com.apolloconfig.apollo.ai.qabot.markdown.MarkdownSearchResult;
6+
import com.google.common.collect.Lists;
77
import com.theokanning.openai.embedding.Embedding;
88
import io.milvus.client.MilvusServiceClient;
99
import io.milvus.common.clientenum.ConsistencyLevelEnum;
@@ -32,6 +32,7 @@
3232
import java.util.Arrays;
3333
import java.util.Collections;
3434
import java.util.List;
35+
import java.util.Random;
3536
import java.util.stream.Collectors;
3637
import org.springframework.context.annotation.Profile;
3738
import org.springframework.stereotype.Service;
@@ -43,6 +44,7 @@ class MilvusService implements VectorDBService {
4344

4445
private final MilvusServiceClient milvusServiceClient;
4546
private final MilvusConfig milvusConfig;
47+
private final List<Float> dummyEmbeddings = Lists.newArrayList();
4648

4749
public MilvusService(MilvusConfig milvusConfig) {
4850
this.milvusConfig = milvusConfig;
@@ -160,7 +162,7 @@ private List<Long> queryChunkIdByFileRoot(String fileRoot) {
160162
R<RpcStatus> loadStatus = milvusServiceClient.loadCollection(
161163
loadCollectionParam);
162164

163-
List<String> query_output_fields = Arrays.asList("chunk_id");
165+
List<String> query_output_fields = List.of("chunk_id");
164166
QueryParam queryParam = QueryParam.newBuilder()
165167
.withCollectionName(milvusConfig.getCollection())
166168
.withConsistencyLevel(ConsistencyLevelEnum.STRONG)
@@ -169,6 +171,10 @@ private List<Long> queryChunkIdByFileRoot(String fileRoot) {
169171
.build();
170172
R<QueryResults> respQuery = milvusServiceClient.query(queryParam);
171173

174+
if (respQuery.getStatus() != Status.Success.getCode()) {
175+
throw new RuntimeException("Query failed: " + respQuery.getMessage());
176+
}
177+
172178
QueryResultsWrapper wrapperQuery = new QueryResultsWrapper(respQuery.getData());
173179
List<?> chunkIds = wrapperQuery.getFieldWrapper("chunk_id").getFieldData();
174180

@@ -180,8 +186,116 @@ private List<Long> queryChunkIdByFileRoot(String fileRoot) {
180186
.collect(Collectors.toList());
181187
}
182188

189+
@Override
190+
public String queryFileHashValue(String fileRoot) {
191+
LoadCollectionParam loadCollectionParam = LoadCollectionParam.newBuilder()
192+
.withCollectionName(milvusConfig.getFileCollection())
193+
.build();
194+
195+
R<RpcStatus> loadStatus = milvusServiceClient.loadCollection(
196+
loadCollectionParam);
197+
198+
List<String> query_output_fields = List.of("hash_value");
199+
QueryParam queryParam = QueryParam.newBuilder()
200+
.withCollectionName(milvusConfig.getFileCollection())
201+
.withConsistencyLevel(ConsistencyLevelEnum.STRONG)
202+
.withExpr(String.format("file_root in ['%s']", fileRoot))
203+
.withOutFields(query_output_fields)
204+
.build();
205+
R<QueryResults> respQuery = milvusServiceClient.query(queryParam);
206+
207+
if (respQuery.getStatus() != Status.Success.getCode()) {
208+
throw new RuntimeException("Query failed: " + respQuery.getMessage());
209+
}
210+
211+
QueryResultsWrapper wrapperQuery = new QueryResultsWrapper(respQuery.getData());
212+
List<?> hashValues = wrapperQuery.getFieldWrapper("hash_value").getFieldData();
213+
214+
if (CollectionUtils.isEmpty(hashValues)) {
215+
return null;
216+
}
217+
218+
return hashValues.get(0).toString();
219+
}
220+
221+
@Override
222+
public void persistFile(String fileRoot, String hashValue) {
223+
List<Long> currentFileIds = queryFileIdByFileRoot(fileRoot);
224+
225+
List<Field> fields = new ArrayList<>();
226+
fields.add(new InsertParam.Field("hash_value", List.of(hashValue)));
227+
fields.add(new InsertParam.Field("dummy_embedding", List.of(dummyEmbeddings)));
228+
fields.add(new InsertParam.Field("file_root", List.of(fileRoot)));
229+
230+
InsertParam insertParam = InsertParam.newBuilder()
231+
.withCollectionName(milvusConfig.getFileCollection())
232+
.withFields(fields)
233+
.build();
234+
milvusServiceClient.insert(insertParam);
235+
236+
deleteByFileIdList(currentFileIds);
237+
238+
FlushParam flushParam = FlushParam.newBuilder()
239+
.withCollectionNames(Lists.newArrayList(milvusConfig.getFileCollection()))
240+
.build();
241+
milvusServiceClient.flush(flushParam);
242+
}
243+
244+
private void deleteByFileIdList(List<Long> fileIds) {
245+
if (!fileIds.isEmpty()) {
246+
StringBuilder sb = new StringBuilder();
247+
sb.append("file_id in [");
248+
for (int i = 0; i < fileIds.size(); i++) {
249+
sb.append(fileIds.get(i));
250+
if (i != fileIds.size() - 1) {
251+
sb.append(",");
252+
}
253+
}
254+
sb.append("]");
255+
DeleteParam deleteParam = DeleteParam.newBuilder()
256+
.withCollectionName(milvusConfig.getFileCollection())
257+
.withExpr(sb.toString())
258+
.build();
259+
milvusServiceClient.delete(deleteParam);
260+
}
261+
}
262+
263+
private List<Long> queryFileIdByFileRoot(String fileRoot) {
264+
LoadCollectionParam loadCollectionParam = LoadCollectionParam.newBuilder()
265+
.withCollectionName(milvusConfig.getFileCollection())
266+
.build();
267+
268+
R<RpcStatus> loadStatus = milvusServiceClient.loadCollection(
269+
loadCollectionParam);
270+
271+
List<String> query_output_fields = List.of("file_id");
272+
QueryParam queryParam = QueryParam.newBuilder()
273+
.withCollectionName(milvusConfig.getFileCollection())
274+
.withConsistencyLevel(ConsistencyLevelEnum.STRONG)
275+
.withExpr(String.format("file_root in ['%s']", fileRoot))
276+
.withOutFields(query_output_fields)
277+
.build();
278+
R<QueryResults> respQuery = milvusServiceClient.query(queryParam);
279+
280+
if (respQuery.getStatus() != Status.Success.getCode()) {
281+
throw new RuntimeException("Query failed: " + respQuery.getMessage());
282+
}
283+
284+
QueryResultsWrapper wrapperQuery = new QueryResultsWrapper(respQuery.getData());
285+
List<?> fileIds = wrapperQuery.getFieldWrapper("file_id").getFieldData();
286+
287+
if (CollectionUtils.isEmpty(fileIds)) {
288+
return Collections.emptyList();
289+
}
290+
291+
return fileIds.stream().map(id -> Long.parseLong(id.toString()))
292+
.collect(Collectors.toList());
293+
}
294+
295+
183296
private void ensureCollections() {
184297
ensureChunkCollection();
298+
ensureFileCollection();
185299
}
186300

187301
private void ensureChunkCollection() {
@@ -239,5 +353,64 @@ private void ensureChunkCollection() {
239353

240354
}
241355

356+
private void ensureFileCollection() {
357+
// prepare dummy embedding data
358+
Random random = new Random();
359+
for (int i = 0; i < 1536; i++) {
360+
dummyEmbeddings.add(random.nextFloat());
361+
}
362+
363+
HasCollectionParam hasCollectionParam = HasCollectionParam.newBuilder()
364+
.withCollectionName(milvusConfig.getFileCollection())
365+
.build();
366+
367+
if (milvusServiceClient.hasCollection(hasCollectionParam).getData()) {
368+
return;
369+
}
370+
371+
FieldType fileId = FieldType.newBuilder()
372+
.withName("file_id")
373+
.withDataType(DataType.Int64)
374+
.withPrimaryKey(true)
375+
.withAutoID(true)
376+
.build();
377+
FieldType fileRoot = FieldType.newBuilder()
378+
.withName("file_root")
379+
.withDataType(DataType.VarChar)
380+
.withMaxLength(100)
381+
.build();
382+
FieldType hashValue = FieldType.newBuilder()
383+
.withName("hash_value")
384+
.withDataType(DataType.VarChar)
385+
.withMaxLength(3000)
386+
.build();
387+
// not used, just for compatibility
388+
FieldType dummyEmbedding = FieldType.newBuilder()
389+
.withName("dummy_embedding")
390+
.withDataType(DataType.FloatVector)
391+
.withDimension(1536)
392+
.build();
393+
CreateCollectionParam createCollectionReq = CreateCollectionParam.newBuilder()
394+
.withCollectionName(milvusConfig.getFileCollection())
395+
.withDescription("Files for QA Search")
396+
.addFieldType(fileId)
397+
.addFieldType(hashValue)
398+
.addFieldType(fileRoot)
399+
.addFieldType(dummyEmbedding)
400+
.build();
401+
402+
milvusServiceClient.createCollection(createCollectionReq);
403+
404+
// not used, just for compatibility
405+
milvusServiceClient.createIndex(
406+
CreateIndexParam.newBuilder()
407+
.withCollectionName(milvusConfig.getFileCollection())
408+
.withFieldName("dummy_embedding")
409+
.withIndexType(IndexType.FLAT)
410+
.withMetricType(MetricType.L2)
411+
.withSyncMode(Boolean.FALSE)
412+
.build()
413+
);
414+
}
242415

243416
}

src/main/resources/application.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ milvus:
5555
zillizCloudToken: xxxxxxxxxxxx
5656
# the milvus database collection name, no need to create it manually
5757
collection: docs
58+
fileCollection: files
5859

5960
qa:
6061
# the topK number of chunks retrieved from milvus database

src/test/java/com/apolloconfig/apollo/ai/qabot/QABotApplicationTests.java

Lines changed: 0 additions & 13 deletions
This file was deleted.

src/test/java/com/apolloconfig/apollo/ai/qabot/controller/MarkdownControllerTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@
55
import static org.mockito.Mockito.verify;
66
import static org.mockito.Mockito.when;
77

8-
import com.google.common.collect.Lists;
98
import com.apolloconfig.apollo.ai.qabot.config.MarkdownFilesConfig;
109
import com.apolloconfig.apollo.ai.qabot.markdown.MarkdownProcessor;
10+
import com.google.common.collect.Lists;
1111
import java.util.List;
1212
import org.junit.jupiter.api.BeforeEach;
1313
import org.junit.jupiter.api.Test;

src/test/java/com/apolloconfig/apollo/ai/qabot/controller/QAControllerTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,11 @@
1212
import static org.mockito.Mockito.verify;
1313
import static org.mockito.Mockito.when;
1414

15-
import com.google.common.collect.Lists;
1615
import com.apolloconfig.apollo.ai.qabot.api.AiService;
1716
import com.apolloconfig.apollo.ai.qabot.api.VectorDBService;
1817
import com.apolloconfig.apollo.ai.qabot.controller.QAController.Answer;
1918
import com.apolloconfig.apollo.ai.qabot.markdown.MarkdownSearchResult;
19+
import com.google.common.collect.Lists;
2020
import com.theokanning.openai.embedding.Embedding;
2121
import java.util.Collections;
2222
import java.util.List;

src/test/java/com/apolloconfig/apollo/ai/qabot/markdown/MarkdownProcessorTest.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
import static org.mockito.Mockito.doReturn;
77
import static org.mockito.Mockito.doThrow;
88
import static org.mockito.Mockito.mock;
9-
import static org.mockito.Mockito.spy;
109
import static org.mockito.Mockito.times;
1110
import static org.mockito.Mockito.verify;
1211
import static org.mockito.Mockito.when;

0 commit comments

Comments
 (0)