5 changes: 4 additions & 1 deletion api/settings.py
@@ -23,6 +23,7 @@
import rag.utils.es_conn
import rag.utils.infinity_conn
import rag.utils.opensearch_conn
import rag.utils.baidu_vdb_conn
from api.constants import RAG_FLOW_SERVICE_NAME
from api.utils import decrypt_database_config, get_base_config
from api.utils.file_utils import get_project_base_directory
@@ -184,10 +185,12 @@ def init_settings():
docStoreConn = rag.utils.infinity_conn.InfinityConnection()
elif lower_case_doc_engine == "opensearch":
docStoreConn = rag.utils.opensearch_conn.OSConnection()
elif lower_case_doc_engine == "baiduvdb":
docStoreConn = rag.utils.baidu_vdb_conn.BaiduVDBConnection()
else:
raise Exception(f"Not supported doc engine: {DOC_ENGINE}")

retrievaler = search.Dealer(dataStore=docStoreConn, docEngine=DOC_ENGINE)
from graphrag import search as kg_search

kg_retrievaler = kg_search.KGSearch(docStoreConn)
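With the branch above, the document store becomes a pure configuration choice. A minimal sketch of the wiring that results when `DOC_ENGINE` is `baiduvdb` (constructing the connection directly here is illustrative; in the PR it happens inside `init_settings()`):

```python
# Mirrors the dispatch in init_settings() for DOC_ENGINE == "baiduvdb".
# BaiduVDBConnection is the class this PR adds in rag/utils/baidu_vdb_conn.py.
import rag.utils.baidu_vdb_conn
from rag.nlp import search

docStoreConn = rag.utils.baidu_vdb_conn.BaiduVDBConnection()
# The engine name is now threaded into the retriever, so downstream query
# building can take the Mochow-specific path (see rag/nlp/query.py below).
retrievaler = search.Dealer(dataStore=docStoreConn, docEngine="baiduvdb")
```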
42 changes: 42 additions & 0 deletions conf/mochow_mapping.json
@@ -0,0 +1,42 @@
{
"id": {"fieldType": "STRING", "primaryKey": true, "partitionKey": true, "default": ""},
"doc_id": {"fieldType": "STRING", "default": ""},
"kb_id": {"fieldType": "ARRAY", "elementType": "STRING", "default": [""]},
"create_time": {"fieldType": "STRING", "default": ""},
"create_timestamp_flt": {"fieldType": "FLOAT", "default": 0.0},
"img_id": {"fieldType": "STRING", "default": ""},
"docnm_kwd": {"fieldType": "STRING", "default": ""},
"title_tks": {"fieldType": "TEXT", "default": "", "analyzer": "DEFAULT_ANALYZER"},
"title_sm_tks": {"fieldType": "TEXT", "default": "", "analyzer": "DEFAULT_ANALYZER"},
"name_kwd": {"fieldType": "STRING", "default": ""},
"important_kwd": {"fieldType": "ARRAY", "elementType": "STRING", "default": [""]},
"tag_kwd": {"fieldType": "ARRAY", "elementType": "STRING", "default": [""]},
"important_tks": {"fieldType": "TEXT", "default": "", "analyzer": "DEFAULT_ANALYZER"},
"question_kwd": {"fieldType": "ARRAY", "elementType": "STRING", "default": [""]},
"question_tks": {"fieldType": "TEXT", "default": "", "analyzer": "DEFAULT_ANALYZER"},
"content_with_weight": {"fieldType": "STRING", "default": ""},
"content_ltks": {"fieldType": "TEXT", "default": "", "analyzer": "DEFAULT_ANALYZER"},
"content_sm_ltks": {"fieldType": "TEXT", "default": "", "analyzer": "DEFAULT_ANALYZER"},
"authors_tks": {"fieldType": "TEXT", "default": "", "analyzer": "DEFAULT_ANALYZER"},
"authors_sm_tks": {"fieldType": "TEXT", "default": "", "analyzer": "DEFAULT_ANALYZER"},
"page_num_int": {"fieldType": "ARRAY", "elementType": "INT64", "default": [0]},
"top_int": {"fieldType": "ARRAY", "elementType": "INT64", "default": [0]},
"position_int": {"fieldType": "ARRAY", "elementType": "INT64", "default": [0]},
"weight_int": {"fieldType": "INT64", "default": 0},
"weight_flt": {"fieldType": "FLOAT", "default": 0.0},
"rank_int": {"fieldType": "INT64", "default": 0},
"rank_flt": {"fieldType": "FLOAT", "default": 0},
"available_int": {"fieldType": "INT64", "default": 1},
"knowledge_graph_kwd": {"fieldType": "STRING", "default": ""},
"entities_kwd": {"fieldType": "ARRAY", "elementType": "STRING", "default": [""]},
"pagerank_fea": {"fieldType": "INT64", "default": 0},
"tag_feas": {"fieldType": "STRING", "default": ""},

"from_entity_kwd": {"fieldType": "STRING", "default": ""},
"to_entity_kwd": {"fieldType": "STRING", "default": ""},
"entity_kwd": {"fieldType": "STRING", "default": ""},
"entity_type_kwd": {"fieldType": "STRING", "default": ""},
"source_id": {"fieldType": "ARRAY", "elementType": "STRING", "default": [""]},
"n_hop_with_weight": {"fieldType": "STRING", "default": ""},
"removed_kwd": {"fieldType": "STRING", "default": ""}
}
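Each key in this mapping is a RAGFlow chunk field and its value describes the Mochow column it maps to: a `fieldType` (`STRING`, `TEXT`, `ARRAY`, `INT64`, `FLOAT`), a default, and, for full-text `TEXT` columns, an analyzer. A sketch of how the connection might consume it, assuming the public `pymochow` client classes (`MochowClient`, `Configuration`, `BceCredentials`); the field-partitioning logic is illustrative, not the PR's actual code:

```python
import json

from pymochow import MochowClient
from pymochow.auth.bce_credentials import BceCredentials
from pymochow.configuration import Configuration

# Load the schema definition shipped with this PR.
with open("conf/mochow_mapping.json") as f:
    mapping = json.load(f)

# Illustrative split: the primary/partition key vs. full-text (TEXT) columns.
primary = [k for k, v in mapping.items() if v.get("primaryKey")]
fulltext = [k for k, v in mapping.items() if v["fieldType"] == "TEXT"]
print(primary)   # ['id']
print(fulltext)  # ['title_tks', 'title_sm_tks', 'important_tks', ...]

# Endpoint and credentials mirror the `baiduvdb` block in service_conf.yaml.
config = Configuration(
    credentials=BceCredentials("root", "mochow"),
    endpoint="http://127.0.0.1:5287",
)
client = MochowClient(config)
```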
5 changes: 5 additions & 0 deletions conf/service_conf.yaml
@@ -29,6 +29,11 @@ redis:
db: 1
password: 'infini_rag_flow'
host: 'localhost:6379'
baiduvdb:
endpoint: 'http://127.0.0.1:5287'
username: 'root'
password: 'mochow'
replication: 1
# postgres:
# name: 'rag_flow'
# user: 'rag_flow'
8 changes: 7 additions & 1 deletion docker/.env
@@ -53,7 +53,13 @@ INFINITY_THRIFT_PORT=23817
INFINITY_HTTP_PORT=23820
INFINITY_PSQL_PORT=5432

# Configuration for Baidu VDB (Mochow).
BAIDU_VDB_ENDPOINT=http://127.0.0.1:5287
BAIDU_VDB_USER=root
BAIDU_VDB_PASSWORD=password
BAIDU_VDB_REPLICATION=1

# The password for MySQL.
MYSQL_PASSWORD=infini_rag_flow
# The hostname where the MySQL service is exposed
MYSQL_HOST=mysql
5 changes: 5 additions & 0 deletions docker/service_conf.yaml.template
@@ -29,6 +29,11 @@ redis:
db: 1
password: '${REDIS_PASSWORD:-infini_rag_flow}'
host: '${REDIS_HOST:-redis}:6379'
baiduvdb:
endpoint: '${BAIDU_VDB_ENDPOINT:-http://127.0.0.1:5287}'
username: '${BAIDU_VDB_USER:-root}'
password: '${BAIDU_VDB_PASSWORD:-password}'
replication: '${BAIDU_VDB_REPLICATION:-1}'
# postgres:
# name: '${POSTGRES_DBNAME:-rag_flow}'
# user: '${POSTGRES_USER:-rag_flow}'
1 change: 1 addition & 0 deletions pyproject.toml
@@ -126,6 +126,7 @@ dependencies = [
"debugpy>=1.8.13",
"mcp>=1.9.4",
"opensearch-py==2.7.1",
"pymochow==2.2.9"
"pluginlib==0.9.4",
"click>=8.1.8",
"python-calamine>=0.4.0",
91 changes: 90 additions & 1 deletion rag/nlp/query.py
@@ -24,7 +24,7 @@


class FulltextQueryer:
def __init__(self, docEngine: str = "elasticsearch"):
self.tw = term_weight.Dealer()
self.syn = synonym.Dealer()
self.query_fields = [
@@ -36,6 +36,15 @@ def __init__(self):
"content_ltks^2",
"content_sm_ltks",
]
self.baidu_vdb_query_fields = [
"title_tks",
"title_sm_tks",
"important_tks",
"question_tks",
"content_ltks",
"content_sm_ltks",
]
self.docEngine = docEngine

@staticmethod
def subSpecialChar(line):
@@ -83,6 +92,8 @@ def add_space_between_eng_zh(txt):
return txt

def question(self, txt, tbl="qa", min_match: float = 0.6):
if self.docEngine == "baiduvdb":
return self.question_by_baidu_vdb(txt=txt)
txt = FulltextQueryer.add_space_between_eng_zh(txt)
txt = re.sub(
r"[ :|\r\n\t,,。??/`!!&^%%()\[\]{}<>]+",
@@ -216,6 +227,84 @@ def need_fine_grained_tokenize(tk):
), keywords
return None, keywords

def question_by_baidu_vdb(self, txt):
txt = re.sub(
r"[ :|\r\n\t,,。??/`!!&^%%()\[\]{}<>]+",
" ",
rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(txt.lower())),
).strip()
txt = FulltextQueryer.rmWWW(txt)

if not self.isChinese(txt):
txt = FulltextQueryer.rmWWW(txt)
tks = rag_tokenizer.tokenize(txt).split()
keywords = [t for t in tks if t]
tks_w = self.tw.weights(tks, preprocess=False)
tks_w = [(re.sub(r"[ \\\"'^]", "", tk), w) for tk, w in tks_w]
tks_w = [(re.sub(r"^[a-z0-9]$", "", tk), w) for tk, w in tks_w if tk]
tks_w = [(re.sub(r"^[\+-]", "", tk), w) for tk, w in tks_w if tk]
tks_w = [(tk.strip(), w) for tk, w in tks_w if tk.strip()]
for tk, w in tks_w[:256]:
syn = self.syn.lookup(tk)
syn = rag_tokenizer.tokenize(" ".join(syn)).split()
keywords.extend(syn)

return MatchTextExpr(
self.baidu_vdb_query_fields, txt, 100
), keywords

def need_fine_grained_tokenize(tk):
if len(tk) < 3:
return False
if re.match(r"[0-9a-z\.\+#_\*-]+$", tk):
return False
return True

txt = FulltextQueryer.rmWWW(txt)
keywords = []
for tt in self.tw.split(txt)[:256]: # .split():
if not tt:
continue
keywords.append(tt)
twts = self.tw.weights([tt])
syns = self.syn.lookup(tt)
if syns and len(keywords) < 32:
keywords.extend(syns)
logging.debug(json.dumps(twts, ensure_ascii=False))
for tk, w in sorted(twts, key=lambda x: x[1] * -1):
sm = (
rag_tokenizer.fine_grained_tokenize(tk).split()
if need_fine_grained_tokenize(tk)
else []
)
sm = [
re.sub(
r"[ ,\./;'\[\]\\`~!@#$%\^&\*\(\)=\+_<>\?:\"\{\}\|,。;‘’【】、!¥……()——《》?:“”-]+",
"",
m,
)
for m in sm
]
sm = [FulltextQueryer.subSpecialChar(m) for m in sm if len(m) > 1]
sm = [m for m in sm if len(m) > 1]

if len(keywords) < 32:
keywords.append(re.sub(r"[ \\\"']+", "", tk))
keywords.extend(sm)

tk_syns = self.syn.lookup(tk)
tk_syns = [FulltextQueryer.subSpecialChar(s) for s in tk_syns]
if len(keywords) < 32:
keywords.extend([s for s in tk_syns if s])
tk_syns = [rag_tokenizer.fine_grained_tokenize(s) for s in tk_syns if s]
tk_syns = [f"\"{s}\"" if s.find(" ") > 0 else s for s in tk_syns]

if len(keywords) >= 32:
break
return MatchTextExpr(
self.baidu_vdb_query_fields, txt, 100
), keywords

def hybrid_similarity(self, avec, bvecs, atks, btkss, tkweight=0.3, vtweight=0.7):
from sklearn.metrics.pairwise import cosine_similarity as CosineSimilarity
import numpy as np
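A usage sketch for the new code path (the class name, constructor argument, and return shape come from the diff above; the sample question, and the assumption that `MatchTextExpr` exposes its `fields` attribute, are illustrative):

```python
from rag.nlp.query import FulltextQueryer

# docEngine="baiduvdb" makes question() delegate to question_by_baidu_vdb().
qryr = FulltextQueryer(docEngine="baiduvdb")
match_expr, keywords = qryr.question("what is retrieval augmented generation")

# The expression targets baidu_vdb_query_fields; unlike the Elasticsearch
# list, these names carry no "^boost" suffixes, since that boosting syntax
# is Elasticsearch-specific.
print(match_expr.fields)  # ['title_tks', ..., 'content_sm_ltks']
print(keywords[:10])      # cleaned tokens plus synonym expansions
```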
5 changes: 3 additions & 2 deletions rag/nlp/search.py
@@ -30,9 +30,10 @@ def index_name(uid): return f"ragflow_{uid}"


class Dealer:
def __init__(self, dataStore: DocStoreConnection, docEngine: str = "elasticsearch"):
self.qryr = query.FulltextQueryer(docEngine=docEngine)
self.dataStore = dataStore
self.docEngine = docEngine

@dataclass
class SearchResult:
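The engine name rides along from `Dealer` into `FulltextQueryer`, so code that already holds a `Dealer` needs no other change. A quick check of that propagation (the stub store is hypothetical; a real `DocStoreConnection` would be passed in practice):

```python
from rag.nlp import search

class _StubStore:
    """Hypothetical stand-in for a DocStoreConnection implementation."""

dealer = search.Dealer(dataStore=_StubStore(), docEngine="baiduvdb")
assert dealer.docEngine == "baiduvdb"       # kept on the Dealer itself
assert dealer.qryr.docEngine == "baiduvdb"  # forwarded to the query builder
```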
3 changes: 3 additions & 0 deletions rag/settings.py
@@ -27,6 +27,7 @@

ES = {}
INFINITY = {}
BAIDUVDB = {}
AZURE = {}
S3 = {}
MINIO = {}
@@ -40,6 +41,8 @@
OS = get_base_config("os", {})
elif DOC_ENGINE == 'infinity':
INFINITY = get_base_config("infinity", {"uri": "infinity:23817"})
elif DOC_ENGINE == 'baiduvdb':
BAIDUVDB = get_base_config("baiduvdb", {})

if STORAGE_IMPL_TYPE in ['AZURE_SPN', 'AZURE_SAS']:
AZURE = get_base_config("azure", {})
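For reference, this is how the `baiduvdb` block from `service_conf.yaml` would surface at runtime through the existing helper (the `get_base_config` call is exactly the one in the diff; the `.get` defaults here are illustrative):

```python
from api.utils import get_base_config

BAIDUVDB = get_base_config("baiduvdb", {})
endpoint = BAIDUVDB.get("endpoint", "http://127.0.0.1:5287")
replication = int(BAIDUVDB.get("replication", 1))
```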
2 changes: 1 addition & 1 deletion rag/svr/task_executor.py
@@ -648,7 +648,7 @@ async def delete_image(kb_id, chunk_id):
if b % 128 == 0:
progress_callback(prog=0.8 + 0.1 * (b + 1) / len(chunks), msg="")
if doc_store_result:
error_message = f"Insert chunk error: {doc_store_result}, please check log file and Elasticsearch/Infinity status!"
error_message = f"Insert chunk error: {doc_store_result}, please check log file and Elasticsearch/Infinity/BaiduVDB status!"
progress_callback(-1, msg=error_message)
raise Exception(error_message)
chunk_ids = [chunk["id"] for chunk in chunks[:b + DOC_BULK_SIZE]]