5 changes: 4 additions & 1 deletion api/settings.py
@@ -23,6 +23,7 @@
import rag.utils.es_conn
import rag.utils.infinity_conn
import rag.utils.opensearch_conn
import rag.utils.baidu_vdb_conn
from api.constants import RAG_FLOW_SERVICE_NAME
from api.utils import decrypt_database_config, get_base_config
from api.utils.file_utils import get_project_base_directory
@@ -184,10 +185,12 @@ def init_settings():
docStoreConn = rag.utils.infinity_conn.InfinityConnection()
elif lower_case_doc_engine == "opensearch":
docStoreConn = rag.utils.opensearch_conn.OSConnection()
elif lower_case_doc_engine == "baiduvdb":
docStoreConn = rag.utils.baidu_vdb_conn.BaiduVDBConnection()
else:
raise Exception(f"Not supported doc engine: {DOC_ENGINE}")

retrievaler = search.Dealer(dataStore=docStoreConn, docEngine=DOC_ENGINE)
from graphrag import search as kg_search

kg_retrievaler = kg_search.KGSearch(docStoreConn)
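With the branch above, the document store becomes a pure configuration choice. A minimal sketch of the wiring that results when `DOC_ENGINE` is `baiduvdb` (constructing the connection directly here is illustrative; in the PR it happens inside `init_settings()`):

```python
# Mirrors the dispatch in init_settings() for DOC_ENGINE == "baiduvdb".
# BaiduVDBConnection is the class this PR adds in rag/utils/baidu_vdb_conn.py.
import rag.utils.baidu_vdb_conn
from rag.nlp import search

docStoreConn = rag.utils.baidu_vdb_conn.BaiduVDBConnection()
# The engine name is now threaded into the retriever, so downstream query
# building can take the Mochow-specific path (see rag/nlp/query.py below).
retrievaler = search.Dealer(dataStore=docStoreConn, docEngine="baiduvdb")
```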
42 changes: 42 additions & 0 deletions conf/mochow_mapping.json
@@ -0,0 +1,42 @@
{
"id": {"fieldType": "STRING", "primaryKey": true, "partitionKey": true, "default": ""},
"doc_id": {"fieldType": "STRING", "default": ""},
"kb_id": {"fieldType": "ARRAY", "elementType": "STRING", "default": [""]},
"create_time": {"fieldType": "STRING", "default": ""},
"create_timestamp_flt": {"fieldType": "FLOAT", "default": 0.0},
"img_id": {"fieldType": "STRING", "default": ""},
"docnm_kwd": {"fieldType": "STRING", "default": ""},
"title_tks": {"fieldType": "TEXT", "default": "", "analyzer": "DEFAULT_ANALYZER"},
"title_sm_tks": {"fieldType": "TEXT", "default": "", "analyzer": "DEFAULT_ANALYZER"},
"name_kwd": {"fieldType": "STRING", "default": ""},
"important_kwd": {"fieldType": "ARRAY", "elementType": "STRING", "default": [""]},
"tag_kwd": {"fieldType": "ARRAY", "elementType": "STRING", "default": [""]},
"important_tks": {"fieldType": "TEXT", "default": "", "analyzer": "DEFAULT_ANALYZER"},
"question_kwd": {"fieldType": "ARRAY", "elementType": "STRING", "default": [""]},
"question_tks": {"fieldType": "TEXT", "default": "", "analyzer": "DEFAULT_ANALYZER"},
"content_with_weight": {"fieldType": "STRING", "default": ""},
"content_ltks": {"fieldType": "TEXT", "default": "", "analyzer": "DEFAULT_ANALYZER"},
"content_sm_ltks": {"fieldType": "TEXT", "default": "", "analyzer": "DEFAULT_ANALYZER"},
"authors_tks": {"fieldType": "TEXT", "default": "", "analyzer": "DEFAULT_ANALYZER"},
"authors_sm_tks": {"fieldType": "TEXT", "default": "", "analyzer": "DEFAULT_ANALYZER"},
"page_num_int": {"fieldType": "ARRAY", "elementType": "INT64", "default": [0]},
"top_int": {"fieldType": "ARRAY", "elementType": "INT64", "default": [0]},
"position_int": {"fieldType": "ARRAY", "elementType": "INT64", "default": [0]},
"weight_int": {"fieldType": "INT64", "default": 0},
"weight_flt": {"fieldType": "FLOAT", "default": 0.0},
"rank_int": {"fieldType": "INT64", "default": 0},
"rank_flt": {"fieldType": "FLOAT", "default": 0},
"available_int": {"fieldType": "INT64", "default": 1},
"knowledge_graph_kwd": {"fieldType": "STRING", "default": ""},
"entities_kwd": {"fieldType": "ARRAY", "elementType": "STRING", "default": [""]},
"pagerank_fea": {"fieldType": "INT64", "default": 0},
"tag_feas": {"fieldType": "STRING", "default": ""},

"from_entity_kwd": {"fieldType": "STRING", "default": ""},
"to_entity_kwd": {"fieldType": "STRING", "default": ""},
"entity_kwd": {"fieldType": "STRING", "default": ""},
"entity_type_kwd": {"fieldType": "STRING", "default": ""},
"source_id": {"fieldType": "ARRAY", "elementType": "STRING", "default": [""]},
"n_hop_with_weight": {"fieldType": "STRING", "default": ""},
"removed_kwd": {"fieldType": "STRING", "default": ""}
}
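Each key in this mapping is a RAGFlow chunk field and its value describes the Mochow column it maps to: a `fieldType` (`STRING`, `TEXT`, `ARRAY`, `INT64`, `FLOAT`), a default, and, for full-text `TEXT` columns, an analyzer. A sketch of how the connection might consume it, assuming the public `pymochow` client classes (`MochowClient`, `Configuration`, `BceCredentials`); the field-partitioning logic is illustrative, not the PR's actual code:

```python
import json

from pymochow import MochowClient
from pymochow.auth.bce_credentials import BceCredentials
from pymochow.configuration import Configuration

# Load the schema definition shipped with this PR.
with open("conf/mochow_mapping.json") as f:
    mapping = json.load(f)

# Illustrative split: the primary/partition key vs. full-text (TEXT) columns.
primary = [k for k, v in mapping.items() if v.get("primaryKey")]
fulltext = [k for k, v in mapping.items() if v["fieldType"] == "TEXT"]
print(primary)   # ['id']
print(fulltext)  # ['title_tks', 'title_sm_tks', 'important_tks', ...]

# Endpoint and credentials mirror the `baiduvdb` block in service_conf.yaml.
config = Configuration(
    credentials=BceCredentials("root", "mochow"),
    endpoint="http://127.0.0.1:5287",
)
client = MochowClient(config)
```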
5 changes: 5 additions & 0 deletions conf/service_conf.yaml
@@ -29,6 +29,11 @@ redis:
db: 1
password: 'infini_rag_flow'
host: 'localhost:6379'
baiduvdb:
endpoint: 'http://127.0.0.1:5287'
username: 'root'
password: 'mochow'
replication: 1
# postgres:
# name: 'rag_flow'
# user: 'rag_flow'
8 changes: 7 additions & 1 deletion docker/.env
@@ -53,7 +53,13 @@ INFINITY_THRIFT_PORT=23817
INFINITY_HTTP_PORT=23820
INFINITY_PSQL_PORT=5432

# Configuration for Baidu VDB (Mochow).
BAIDU_VDB_ENDPOINT=http://127.0.0.1:5287
BAIDU_VDB_USER=root
BAIDU_VDB_PASSWORD=password
BAIDU_VDB_REPLICATION=1

# The password for MySQL.
MYSQL_PASSWORD=infini_rag_flow
# The hostname where the MySQL service is exposed
MYSQL_HOST=mysql
5 changes: 5 additions & 0 deletions docker/service_conf.yaml.template
@@ -29,6 +29,11 @@ redis:
db: 1
password: '${REDIS_PASSWORD:-infini_rag_flow}'
host: '${REDIS_HOST:-redis}:6379'
baiduvdb:
endpoint: '${BAIDU_VDB_ENDPOINT:-http://127.0.0.1:5287}'
username: '${BAIDU_VDB_USER:-root}'
password: '${BAIDU_VDB_PASSWORD:-password}'
replication: '${BAIDU_VDB_REPLICATION:-1}'
# postgres:
# name: '${POSTGRES_DBNAME:-rag_flow}'
# user: '${POSTGRES_USER:-rag_flow}'
1 change: 1 addition & 0 deletions pyproject.toml
@@ -126,6 +126,7 @@ dependencies = [
"debugpy>=1.8.13",
"mcp>=1.9.4",
"opensearch-py==2.7.1",
"pymochow==2.2.9"
"pluginlib==0.9.4",
"click>=8.1.8",
"python-calamine>=0.4.0",
91 changes: 90 additions & 1 deletion rag/nlp/query.py
@@ -24,7 +24,7 @@


class FulltextQueryer:
def __init__(self, docEngine: str = "elasticsearch"):
self.tw = term_weight.Dealer()
self.syn = synonym.Dealer()
self.query_fields = [
@@ -36,6 +36,15 @@ def __init__(self):
"content_ltks^2",
"content_sm_ltks",
]
self.baidu_vdb_query_fields = [
"title_tks",
"title_sm_tks",
"important_tks",
"question_tks",
"content_ltks",
"content_sm_ltks",
]
self.docEngine = docEngine

@staticmethod
def subSpecialChar(line):
@@ -83,6 +92,8 @@ def add_space_between_eng_zh(txt):
return txt

def question(self, txt, tbl="qa", min_match: float = 0.6):
if self.docEngine == "baiduvdb":
return self.question_by_baidu_vdb(txt=txt)
txt = FulltextQueryer.add_space_between_eng_zh(txt)
txt = re.sub(
r"[ :|\r\n\t,,。??/`!!&^%%()\[\]{}<>]+",
@@ -216,6 +227,84 @@ def need_fine_grained_tokenize(tk):
), keywords
return None, keywords

def question_by_baidu_vdb(self, txt):
txt = re.sub(
r"[ :|\r\n\t,,。??/`!!&^%%()\[\]{}<>]+",
" ",
rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(txt.lower())),
).strip()
txt = FulltextQueryer.rmWWW(txt)

if not self.isChinese(txt):
txt = FulltextQueryer.rmWWW(txt)
tks = rag_tokenizer.tokenize(txt).split()
keywords = [t for t in tks if t]
tks_w = self.tw.weights(tks, preprocess=False)
tks_w = [(re.sub(r"[ \\\"'^]", "", tk), w) for tk, w in tks_w]
tks_w = [(re.sub(r"^[a-z0-9]$", "", tk), w) for tk, w in tks_w if tk]
tks_w = [(re.sub(r"^[\+-]", "", tk), w) for tk, w in tks_w if tk]
tks_w = [(tk.strip(), w) for tk, w in tks_w if tk.strip()]
for tk, w in tks_w[:256]:
syn = self.syn.lookup(tk)
syn = rag_tokenizer.tokenize(" ".join(syn)).split()
keywords.extend(syn)

return MatchTextExpr(
self.baidu_vdb_query_fields, txt, 100
), keywords

def need_fine_grained_tokenize(tk):
if len(tk) < 3:
return False
if re.match(r"[0-9a-z\.\+#_\*-]+$", tk):
return False
return True

txt = FulltextQueryer.rmWWW(txt)
keywords = []
for tt in self.tw.split(txt)[:256]: # .split():
if not tt:
continue
keywords.append(tt)
twts = self.tw.weights([tt])
syns = self.syn.lookup(tt)
if syns and len(keywords) < 32:
keywords.extend(syns)
logging.debug(json.dumps(twts, ensure_ascii=False))
for tk, w in sorted(twts, key=lambda x: x[1] * -1):
sm = (
rag_tokenizer.fine_grained_tokenize(tk).split()
if need_fine_grained_tokenize(tk)
else []
)
sm = [
re.sub(
r"[ ,\./;'\[\]\\`~!@#$%\^&\*\(\)=\+_<>\?:\"\{\}\|,。;‘’【】、!¥……()——《》?:“”-]+",
"",
m,
)
for m in sm
]
sm = [FulltextQueryer.subSpecialChar(m) for m in sm if len(m) > 1]
sm = [m for m in sm if len(m) > 1]

if len(keywords) < 32:
keywords.append(re.sub(r"[ \\\"']+", "", tk))
keywords.extend(sm)

tk_syns = self.syn.lookup(tk)
tk_syns = [FulltextQueryer.subSpecialChar(s) for s in tk_syns]
if len(keywords) < 32:
keywords.extend([s for s in tk_syns if s])
tk_syns = [rag_tokenizer.fine_grained_tokenize(s) for s in tk_syns if s]
tk_syns = [f"\"{s}\"" if s.find(" ") > 0 else s for s in tk_syns]

if len(keywords) >= 32:
break
return MatchTextExpr(
self.baidu_vdb_query_fields, txt, 100
), keywords

def hybrid_similarity(self, avec, bvecs, atks, btkss, tkweight=0.3, vtweight=0.7):
from sklearn.metrics.pairwise import cosine_similarity as CosineSimilarity
import numpy as np
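A usage sketch for the new code path (the class name, constructor argument, and return shape come from the diff above; the sample question, and the assumption that `MatchTextExpr` exposes its `fields` attribute, are illustrative):

```python
from rag.nlp.query import FulltextQueryer

# docEngine="baiduvdb" makes question() delegate to question_by_baidu_vdb().
qryr = FulltextQueryer(docEngine="baiduvdb")
match_expr, keywords = qryr.question("what is retrieval augmented generation")

# The expression targets baidu_vdb_query_fields; unlike the Elasticsearch
# list, these names carry no "^boost" suffixes, since that boosting syntax
# is Elasticsearch-specific.
print(match_expr.fields)  # ['title_tks', ..., 'content_sm_ltks']
print(keywords[:10])      # cleaned tokens plus synonym expansions
```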
5 changes: 3 additions & 2 deletions rag/nlp/search.py
@@ -30,9 +30,10 @@ def index_name(uid): return f"ragflow_{uid}"


class Dealer:
def __init__(self, dataStore: DocStoreConnection, docEngine: str = "elasticsearch"):
self.qryr = query.FulltextQueryer(docEngine=docEngine)
self.dataStore = dataStore
self.docEngine = docEngine

@dataclass
class SearchResult:
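The engine name rides along from `Dealer` into `FulltextQueryer`, so code that already holds a `Dealer` needs no other change. A quick check of that propagation (the stub store is hypothetical; a real `DocStoreConnection` would be passed in practice):

```python
from rag.nlp import search

class _StubStore:
    """Hypothetical stand-in for a DocStoreConnection implementation."""

dealer = search.Dealer(dataStore=_StubStore(), docEngine="baiduvdb")
assert dealer.docEngine == "baiduvdb"       # kept on the Dealer itself
assert dealer.qryr.docEngine == "baiduvdb"  # forwarded to the query builder
```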
3 changes: 3 additions & 0 deletions rag/settings.py
@@ -27,6 +27,7 @@

ES = {}
INFINITY = {}
BAIDUVDB = {}
AZURE = {}
S3 = {}
MINIO = {}
@@ -40,6 +41,8 @@
OS = get_base_config("os", {})
elif DOC_ENGINE == 'infinity':
INFINITY = get_base_config("infinity", {"uri": "infinity:23817"})
elif DOC_ENGINE == 'baiduvdb':
BAIDUVDB = get_base_config("baiduvdb", {})

if STORAGE_IMPL_TYPE in ['AZURE_SPN', 'AZURE_SAS']:
AZURE = get_base_config("azure", {})
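For reference, this is how the `baiduvdb` block from `service_conf.yaml` would surface at runtime through the existing helper (the `get_base_config` call is exactly the one in the diff; the `.get` defaults here are illustrative):

```python
from api.utils import get_base_config

BAIDUVDB = get_base_config("baiduvdb", {})
endpoint = BAIDUVDB.get("endpoint", "http://127.0.0.1:5287")
replication = int(BAIDUVDB.get("replication", 1))
```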
2 changes: 1 addition & 1 deletion rag/svr/task_executor.py
@@ -648,7 +648,7 @@ async def delete_image(kb_id, chunk_id):
if b % 128 == 0:
progress_callback(prog=0.8 + 0.1 * (b + 1) / len(chunks), msg="")
if doc_store_result:
error_message = f"Insert chunk error: {doc_store_result}, please check log file and Elasticsearch/Infinity status!"
error_message = f"Insert chunk error: {doc_store_result}, please check log file and Elasticsearch/Infinity/BaiduVDB status!"
progress_callback(-1, msg=error_message)
raise Exception(error_message)
chunk_ids = [chunk["id"] for chunk in chunks[:b + DOC_BULK_SIZE]]