
Commit 1597a07

Merge commit '6e0fb055d18969eb923e719ad92ecac3a5c5d534'
* commit '6e0fb055d18969eb923e719ad92ecac3a5c5d534': (42 commits)
  chore: bump version to 0.15.1 (langgenius#12690)
  feat: add table of contents to Knowledge API doc (langgenius#12688)
  [fix] support feature restore (langgenius#12563)
  api tool support multiple env url (langgenius#12249)
  Add new integration with Opik Tracking tool (langgenius#11501)
  fix: add type hints for App model and improve error handling in audio services (langgenius#12677)
  fix: Update variable handling in VariableAssignerNode and clean up app_dsl_service (langgenius#12672)
  Revert "Feat/new saas billing" (langgenius#12673)
  fix(workflow): fix answer node stream processing in conditional branches (langgenius#12510)
  fix: ruff with statements (langgenius#12578)
  fix: ruff check for True if ... else (langgenius#12576)
  chore: Adjust translations to align with Taiwanese Mandarin conventions (langgenius#12633)
  Fix pandas indexing method for knowledge base imports (langgenius#12637) (langgenius#12638)
  Feat/new saas billing (langgenius#12591)
  improve the readability of the function generate_api_key (langgenius#12552)
  chore: translate i18n files (langgenius#12543)
  Feat/add knowledge include all filter (langgenius#12537)
  fix: Add datasets list access control and fix datasets config display issue (langgenius#12533)
  fix: sum costs return error value on overview page (langgenius#12534)
  feat: show workflow running status (langgenius#12531)
  ...

# Conflicts:
#   api/poetry.lock

2 parents 9a4ecc0 + 6e0fb05

147 files changed, +2930 -744 lines


.github/workflows/style.yml

+27
@@ -82,6 +82,33 @@ jobs:
         if: steps.changed-files.outputs.any_changed == 'true'
         run: yarn run lint
 
+  docker-compose-template:
+    name: Docker Compose Template
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Check changed files
+        id: changed-files
+        uses: tj-actions/changed-files@v45
+        with:
+          files: |
+            docker/generate_docker_compose
+            docker/.env.example
+            docker/docker-compose-template.yaml
+            docker/docker-compose.yaml
+
+      - name: Generate Docker Compose
+        if: steps.changed-files.outputs.any_changed == 'true'
+        run: |
+          cd docker
+          ./generate_docker_compose
+
+      - name: Check for changes
+        if: steps.changed-files.outputs.any_changed == 'true'
+        run: git diff --exit-code
 
   superlinter:
     name: SuperLinter

api/configs/middleware/vdb/milvus_config.py

+6
@@ -33,3 +33,9 @@ class MilvusConfig(BaseSettings):
         description="Name of the Milvus database to connect to (default is 'default')",
         default="default",
     )
+
+    MILVUS_ENABLE_HYBRID_SEARCH: bool = Field(
+        description="Enable hybrid search features (requires Milvus >= 2.5.0). Set to false for compatibility with "
+        "older versions",
+        default=True,
+    )
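
Since this is a pydantic BaseSettings field, the new flag is resolved from the environment variable of the same name, so deployments on Milvus < 2.5.0 can opt out without code changes. A minimal sketch of that mechanism (the demo class name is made up; assumes pydantic-settings is installed):

import os

from pydantic import Field
from pydantic_settings import BaseSettings


class MilvusDemoConfig(BaseSettings):
    """Illustrative stand-in for Dify's MilvusConfig."""

    # pydantic-settings resolves this field from the env var of the same name.
    MILVUS_ENABLE_HYBRID_SEARCH: bool = Field(
        description="Enable hybrid search features (requires Milvus >= 2.5.0)",
        default=True,
    )


os.environ["MILVUS_ENABLE_HYBRID_SEARCH"] = "false"  # opt out for older Milvus
print(MilvusDemoConfig().MILVUS_ENABLE_HYBRID_SEARCH)  # False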

api/configs/packaging/__init__.py

+1 -1

@@ -9,7 +9,7 @@ class PackagingInfo(BaseSettings):
 
     CURRENT_VERSION: str = Field(
         description="Dify version",
-        default="0.15.0",
+        default="0.15.1",
     )
 
     COMMIT_SHA: str = Field(

api/controllers/console/app/audio.py

+6 -2

@@ -22,7 +22,7 @@
 from core.errors.error import ModelCurrentlyNotSupportError, ProviderTokenNotInitError, QuotaExceededError
 from core.model_runtime.errors.invoke import InvokeError
 from libs.login import login_required
-from models.model import AppMode
+from models import App, AppMode
 from services.audio_service import AudioService
 from services.errors.audio import (
     AudioTooLargeServiceError,
@@ -79,7 +79,7 @@ class ChatMessageTextApi(Resource):
     @login_required
     @account_initialization_required
     @get_app_model
-    def post(self, app_model):
+    def post(self, app_model: App):
         from werkzeug.exceptions import InternalServerError
 
         try:
@@ -98,9 +98,13 @@ def post(self, app_model):
                 and app_model.workflow.features_dict
             ):
                 text_to_speech = app_model.workflow.features_dict.get("text_to_speech")
+                if text_to_speech is None:
+                    raise ValueError("TTS is not enabled")
                 voice = args.get("voice") or text_to_speech.get("voice")
             else:
                 try:
+                    if app_model.app_model_config is None:
+                        raise ValueError("AppModelConfig not found")
                     voice = args.get("voice") or app_model.app_model_config.text_to_speech_dict.get("voice")
                 except Exception:
                     voice = None

api/controllers/console/datasets/datasets.py

+4 -2

@@ -52,12 +52,12 @@ def get(self):
         # provider = request.args.get("provider", default="vendor")
         search = request.args.get("keyword", default=None, type=str)
         tag_ids = request.args.getlist("tag_ids")
-
+        include_all = request.args.get("include_all", default="false").lower() == "true"
         if ids:
             datasets, total = DatasetService.get_datasets_by_ids(ids, current_user.current_tenant_id)
         else:
             datasets, total = DatasetService.get_datasets(
-                page, limit, current_user.current_tenant_id, current_user, search, tag_ids
+                page, limit, current_user.current_tenant_id, current_user, search, tag_ids, include_all
             )
 
         # check embedding setting
@@ -640,6 +640,7 @@ def get(self):
                 | VectorType.MYSCALE
                 | VectorType.ORACLE
                 | VectorType.ELASTICSEARCH
+                | VectorType.ELASTICSEARCH_JA
                 | VectorType.PGVECTOR
                 | VectorType.TIDB_ON_QDRANT
                 | VectorType.LINDORM
@@ -683,6 +684,7 @@ def get(self, vector_type):
                 | VectorType.MYSCALE
                 | VectorType.ORACLE
                 | VectorType.ELASTICSEARCH
+                | VectorType.ELASTICSEARCH_JA
                 | VectorType.COUCHBASE
                 | VectorType.PGVECTOR
                 | VectorType.LINDORM

api/controllers/console/datasets/datasets_document.py

+2 -1

@@ -257,7 +257,8 @@ def post(self, dataset_id):
         parser.add_argument("original_document_id", type=str, required=False, location="json")
         parser.add_argument("doc_form", type=str, default="text_model", required=False, nullable=False, location="json")
         parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
-
+        parser.add_argument("embedding_model", type=str, required=False, nullable=True, location="json")
+        parser.add_argument("embedding_model_provider", type=str, required=False, nullable=True, location="json")
         parser.add_argument(
             "doc_language", type=str, default="English", required=False, nullable=False, location="json"
         )

api/controllers/console/datasets/datasets_segments.py

+2 -2

@@ -368,9 +368,9 @@ def post(self, dataset_id, document_id):
         result = []
         for index, row in df.iterrows():
             if document.doc_form == "qa_model":
-                data = {"content": row[0], "answer": row[1]}
+                data = {"content": row.iloc[0], "answer": row.iloc[1]}
             else:
-                data = {"content": row[0]}
+                data = {"content": row.iloc[0]}
             result.append(data)
         if len(result) == 0:
             raise ValueError("The CSV file is empty.")
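
The switch to .iloc fixes a real failure mode: indexing a pandas Series with row[0] is label-based, so it breaks (or emits a deprecation warning, depending on the pandas version) when the imported CSV has string column headers, whereas .iloc is always positional. A minimal illustration, with made-up column names:

import pandas as pd

# Hypothetical two-column import, mirroring the qa_model branch above.
df = pd.DataFrame(
    [["What is Dify?", "An LLM application platform."]],
    columns=["question", "answer"],
)

for _, row in df.iterrows():
    # row[0] would attempt a *label* lookup for 0 among "question"/"answer";
    # .iloc[0] is unambiguously positional and works with any headers.
    data = {"content": row.iloc[0], "answer": row.iloc[1]}
    print(data)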

api/controllers/console/explore/conversation.py

+1 -1

@@ -32,7 +32,7 @@ def get(self, installed_app):
 
         pinned = None
         if "pinned" in args and args["pinned"] is not None:
-            pinned = True if args["pinned"] == "true" else False
+            pinned = args["pinned"] == "true"
 
         try:
             with Session(db.engine) as session:

api/controllers/service_api/__init__.py

+1 -1

@@ -7,4 +7,4 @@
 
 from . import index
 from .app import app, audio, completion, conversation, file, message, workflow
-from .dataset import dataset, document, hit_testing, segment
+from .dataset import dataset, document, hit_testing, segment, upload_file

api/controllers/service_api/dataset/dataset.py

+4 -1

@@ -31,8 +31,11 @@ def get(self, tenant_id):
         # provider = request.args.get("provider", default="vendor")
         search = request.args.get("keyword", default=None, type=str)
         tag_ids = request.args.getlist("tag_ids")
+        include_all = request.args.get("include_all", default="false").lower() == "true"
 
-        datasets, total = DatasetService.get_datasets(page, limit, tenant_id, current_user, search, tag_ids)
+        datasets, total = DatasetService.get_datasets(
+            page, limit, tenant_id, current_user, search, tag_ids, include_all
+        )
         # check embedding setting
         provider_manager = ProviderManager()
         configurations = provider_manager.get_configurations(tenant_id=current_user.current_tenant_id)

api/controllers/service_api/dataset/upload_file.py

+54

@@ -0,0 +1,54 @@
+from werkzeug.exceptions import NotFound
+
+from controllers.service_api import api
+from controllers.service_api.wraps import (
+    DatasetApiResource,
+)
+from core.file import helpers as file_helpers
+from extensions.ext_database import db
+from models.dataset import Dataset
+from models.model import UploadFile
+from services.dataset_service import DocumentService
+
+
+class UploadFileApi(DatasetApiResource):
+    def get(self, tenant_id, dataset_id, document_id):
+        """Get upload file."""
+        # check dataset
+        dataset_id = str(dataset_id)
+        tenant_id = str(tenant_id)
+        dataset = db.session.query(Dataset).filter(Dataset.tenant_id == tenant_id, Dataset.id == dataset_id).first()
+        if not dataset:
+            raise NotFound("Dataset not found.")
+        # check document
+        document_id = str(document_id)
+        document = DocumentService.get_document(dataset.id, document_id)
+        if not document:
+            raise NotFound("Document not found.")
+        # check upload file
+        if document.data_source_type != "upload_file":
+            raise ValueError(f"Document data source type ({document.data_source_type}) is not upload_file.")
+        data_source_info = document.data_source_info_dict
+        if data_source_info and "upload_file_id" in data_source_info:
+            file_id = data_source_info["upload_file_id"]
+            upload_file = db.session.query(UploadFile).filter(UploadFile.id == file_id).first()
+            if not upload_file:
+                raise NotFound("UploadFile not found.")
+        else:
+            raise ValueError("Upload file id not found in document data source info.")
+
+        url = file_helpers.get_signed_file_url(upload_file_id=upload_file.id)
+        return {
+            "id": upload_file.id,
+            "name": upload_file.name,
+            "size": upload_file.size,
+            "extension": upload_file.extension,
+            "url": url,
+            "download_url": f"{url}&as_attachment=true",
+            "mime_type": upload_file.mime_type,
+            "created_by": upload_file.created_by,
+            "created_at": upload_file.created_at.timestamp(),
+        }, 200
+
+
+api.add_resource(UploadFileApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/upload-file")
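
A client can exercise the new route with a dataset-scoped API key; the response carries a signed URL plus a download_url that forces attachment disposition. A hedged sketch using requests (the base URL, key format, and IDs below are placeholder assumptions about a typical deployment, not values from this commit):

import requests

# Placeholder values: adjust to your deployment.
BASE_URL = "http://localhost:5001/v1"        # assumed service-api root
API_KEY = "dataset-xxxxxxxxxxxxxxxx"         # assumed dataset API key
DATASET_ID = "00000000-0000-0000-0000-000000000000"
DOCUMENT_ID = "00000000-0000-0000-0000-000000000001"

resp = requests.get(
    f"{BASE_URL}/datasets/{DATASET_ID}/documents/{DOCUMENT_ID}/upload-file",
    headers={"Authorization": f"Bearer {API_KEY}"},
)
resp.raise_for_status()
info = resp.json()
print(info["name"], info["size"], info["mime_type"])
print("signed url:", info["url"])
print("download:", info["download_url"])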

api/controllers/service_api/wraps.py

+1 -1

@@ -236,7 +236,7 @@ def create_or_update_end_user_for_user_id(app_model: App, user_id: Optional[str]
             tenant_id=app_model.tenant_id,
             app_id=app_model.id,
             type="service_api",
-            is_anonymous=True if user_id == "DEFAULT-USER" else False,
+            is_anonymous=user_id == "DEFAULT-USER",
             session_id=user_id,
         )
         db.session.add(end_user)

api/controllers/web/conversation.py

+1 -1

@@ -39,7 +39,7 @@ def get(self, app_model, end_user):
 
         pinned = None
         if "pinned" in args and args["pinned"] is not None:
-            pinned = True if args["pinned"] == "true" else False
+            pinned = args["pinned"] == "true"
 
         try:
             with Session(db.engine) as session:

api/core/indexing_runner.py

+14 -4

@@ -530,7 +530,6 @@ def _load(
         # chunk nodes by chunk size
         indexing_start_at = time.perf_counter()
         tokens = 0
-        chunk_size = 10
        if dataset_document.doc_form != IndexType.PARENT_CHILD_INDEX:
             # create keyword index
             create_keyword_thread = threading.Thread(
@@ -539,11 +538,22 @@ def _load(
             )
             create_keyword_thread.start()
 
+        max_workers = 10
         if dataset.indexing_technique == "high_quality":
-            with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
+            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
                 futures = []
-                for i in range(0, len(documents), chunk_size):
-                    chunk_documents = documents[i : i + chunk_size]
+
+                # Distribute documents into multiple groups based on the hash values of page_content.
+                # This is done to prevent multiple threads from processing the same document,
+                # thereby avoiding potential database insertion deadlocks.
+                document_groups: list[list[Document]] = [[] for _ in range(max_workers)]
+                for document in documents:
+                    hash = helper.generate_text_hash(document.page_content)
+                    group_index = int(hash, 16) % max_workers
+                    document_groups[group_index].append(document)
+                for chunk_documents in document_groups:
+                    if len(chunk_documents) == 0:
+                        continue
                     futures.append(
                         executor.submit(
                             self._process_chunk,
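
The grouping logic in this hunk is a general concurrency pattern: shard work items by a stable hash of their content, so identical items always land in the same worker. That serializes writes for duplicates and avoids the insert deadlocks the old fixed-size chunking could trigger. A self-contained sketch of the same idea (names and the sha256 choice are illustrative; the diff itself uses helper.generate_text_hash):

import hashlib
from concurrent.futures import ThreadPoolExecutor

MAX_WORKERS = 4


def shard_by_content(texts: list[str]) -> list[list[str]]:
    """Group texts so that identical texts always fall into the same group."""
    groups: list[list[str]] = [[] for _ in range(MAX_WORKERS)]
    for text in texts:
        digest = hashlib.sha256(text.encode("utf-8")).hexdigest()
        groups[int(digest, 16) % MAX_WORKERS].append(text)
    return groups


def index_group(group: list[str]) -> int:
    # Stand-in for _process_chunk: duplicates within a group run serially,
    # so two threads never insert the same content concurrently.
    return len(group)


texts = ["alpha", "beta", "alpha", "gamma"]
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = [executor.submit(index_group, g) for g in shard_by_content(texts) if g]
    print(sum(f.result() for f in futures))  # 4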

api/core/model_runtime/model_providers/__base/tokenizers/gpt2_tokenzier.py

+15 -5

@@ -1,7 +1,8 @@
+import logging
 from threading import Lock
 from typing import Any
 
-import tiktoken
+logger = logging.getLogger(__name__)
 
 _tokenizer: Any = None
 _lock = Lock()
@@ -33,9 +34,18 @@ def get_encoder() -> Any:
         if _tokenizer is None:
             # Try to use tiktoken to get the tokenizer because it is faster
             #
-            _tokenizer = tiktoken.get_encoding("gpt2")
-            # base_path = abspath(__file__)
-            # gpt2_tokenizer_path = join(dirname(base_path), "gpt2")
-            # _tokenizer = TransformerGPT2Tokenizer.from_pretrained(gpt2_tokenizer_path)
+            try:
+                import tiktoken
+
+                _tokenizer = tiktoken.get_encoding("gpt2")
+            except Exception:
+                from os.path import abspath, dirname, join
+
+                from transformers import GPT2Tokenizer as TransformerGPT2Tokenizer  # type: ignore
+
+                base_path = abspath(__file__)
+                gpt2_tokenizer_path = join(dirname(base_path), "gpt2")
+                _tokenizer = TransformerGPT2Tokenizer.from_pretrained(gpt2_tokenizer_path)
+                logger.info("Fallback to Transformers' GPT-2 tokenizer from tiktoken")
 
     return _tokenizer

api/core/model_runtime/model_providers/openai_api_compatible/llm/llm.py

+1 -4

@@ -377,10 +377,7 @@ def _generate(
             for tool in tools:
                 formatted_tools.append(helper.dump_model(PromptMessageFunction(function=tool)))
 
-            if prompt_messages[-1].role.value == "tool":
-                data["tools"] = None
-            else:
-                data["tools"] = formatted_tools
+            data["tools"] = formatted_tools
 
             if stop:
                 data["stop"] = stop

api/core/model_runtime/model_providers/openrouter/llm/claude-3-5-sonnet.yaml

+1

@@ -7,6 +7,7 @@ features:
   - vision
   - tool-call
   - stream-tool-call
+  - document
 model_properties:
   mode: chat
   context_size: 200000

api/core/model_runtime/schema_validators/common_validator.py

+1 -1

@@ -87,6 +87,6 @@ def _validate_credential_form_schema(
         if value.lower() not in {"true", "false"}:
             raise ValueError(f"Variable {credential_form_schema.variable} should be true or false")
 
-        value = True if value.lower() == "true" else False
+        value = value.lower() == "true"
 
         return value

api/core/ops/entities/config_entity.py

+32

@@ -6,6 +6,7 @@
 class TracingProviderEnum(Enum):
     LANGFUSE = "langfuse"
     LANGSMITH = "langsmith"
+    OPIK = "opik"
 
 
 class BaseTracingConfig(BaseModel):
@@ -56,5 +57,36 @@ def set_value(cls, v, info: ValidationInfo):
         return v
 
 
+class OpikConfig(BaseTracingConfig):
+    """
+    Model class for Opik tracing config.
+    """
+
+    api_key: str | None = None
+    project: str | None = None
+    workspace: str | None = None
+    url: str = "https://www.comet.com/opik/api/"
+
+    @field_validator("project")
+    @classmethod
+    def project_validator(cls, v, info: ValidationInfo):
+        if v is None or v == "":
+            v = "Default Project"
+
+        return v
+
+    @field_validator("url")
+    @classmethod
+    def url_validator(cls, v, info: ValidationInfo):
+        if v is None or v == "":
+            v = "https://www.comet.com/opik/api/"
+        if not v.startswith(("https://", "http://")):
+            raise ValueError("url must start with https:// or http://")
+        if not v.endswith("/api/"):
+            raise ValueError("url should end with /api/")
+
+        return v
+
+
 OPS_FILE_PATH = "ops_trace/"
 OPS_TRACE_FAILED_KEY = "FAILED_OPS_TRACE"
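
The two validators normalize empty values instead of rejecting them, so a mostly blank config still traces to the hosted Comet endpoint; only a malformed URL is refused. A quick behavioral sketch (assumes the classes from this diff are importable; the key and workspace values are made up):

from core.ops.entities.config_entity import OpikConfig  # path from this commit

cfg = OpikConfig(api_key="opik-key", project="", workspace="my-team", url="")
print(cfg.project)  # "Default Project" -- empty project is backfilled
print(cfg.url)      # "https://www.comet.com/opik/api/" -- empty url is backfilled

# A URL without the required /api/ suffix is rejected by url_validator:
try:
    OpikConfig(url="https://example.com/")
except ValueError as exc:  # pydantic's ValidationError subclasses ValueError
    print(exc)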

api/core/ops/opik_trace/__init__.py

Whitespace-only changes.
