master merge for 1.8.1 release #2391

Merged · 9 commits · Mar 11, 2025
1 change: 0 additions & 1 deletion dlt/common/configuration/providers/__init__.py
@@ -8,7 +8,6 @@
CONFIG_TOML,
SECRETS_TOML,
StringTomlProvider,
CustomLoaderDocProvider,
)
from .doc import CustomLoaderDocProvider
from .vault import SECRETS_TOML_KEY
2 changes: 2 additions & 0 deletions dlt/common/configuration/specs/aws_credentials.py
@@ -27,6 +27,8 @@ class AwsCredentialsWithoutDefaults(
profile_name: Optional[str] = None
region_name: Optional[str] = None
endpoint_url: Optional[str] = None
s3_url_style: Optional[str] = None
"""Only needed for duckdb sql_client s3 access, for minio this needs to be set to path for example."""

def to_s3fs_credentials(self) -> Dict[str, Optional[str]]:
"""Dict of keyword arguments that can be passed to s3fs"""
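
The new `s3_url_style` field is only consumed by the duckdb-based `sql_client` when it builds its S3 secret (see the `sql_client.py` change below). A hedged sketch of where it would typically be set for a local MinIO setup, using dlt's environment-variable config provider — bucket, keys, and host below are placeholders, not values from this PR:

```py
import os

# Hypothetical MinIO-backed filesystem destination; all values are illustrative.
os.environ["DESTINATION__FILESYSTEM__BUCKET_URL"] = "s3://my-bucket"
os.environ["DESTINATION__FILESYSTEM__CREDENTIALS__AWS_ACCESS_KEY_ID"] = "minioadmin"
os.environ["DESTINATION__FILESYSTEM__CREDENTIALS__AWS_SECRET_ACCESS_KEY"] = "minioadmin"
os.environ["DESTINATION__FILESYSTEM__CREDENTIALS__ENDPOINT_URL"] = "http://localhost:9000"
# New field from this PR: MinIO serves path-style URLs, so duckdb needs URL_STYLE 'path'.
os.environ["DESTINATION__FILESYSTEM__CREDENTIALS__S3_URL_STYLE"] = "path"
```
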
4 changes: 4 additions & 0 deletions dlt/common/libs/pydantic.py
@@ -164,6 +164,10 @@ def pydantic_to_table_schema_columns(
result[schema_key] = {
**hints,
"name": snake_case_naming_convention.make_path(name, hints["name"]),
# if the outer field containing the nested model is optional,
# then each field in the model itself has to be nullable as well,
# as otherwise we end up with flattened non-nullable optional nested fields
"nullable": hints["nullable"] or nullable,
}
elif data_type == "json" and skip_nested_types:
continue
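
The reasoning in the new comment — an optional nested model makes every flattened child column nullable — is exercised by `test_parent_nullable_means_children_nullable` added later in this PR. A minimal standalone sketch with hypothetical models (the `pydantic_to_table_schema_columns` import path is the file changed above; `DltConfig` is assumed to live in the same module):

```py
from typing import ClassVar, Optional
from pydantic import BaseModel

from dlt.common.libs.pydantic import DltConfig, pydantic_to_table_schema_columns

class Child(BaseModel):
    child_attribute: str  # not optional by itself

class Parent(BaseModel):
    optional_child: Optional[Child]  # outer field is optional ...
    non_optional_child: Child
    dlt_config: ClassVar[DltConfig] = {"skip_nested_types": True}

columns = pydantic_to_table_schema_columns(Parent)
# ... so its flattened child column becomes nullable, while the required one stays non-nullable.
assert columns["optional_child__child_attribute"]["nullable"]
assert columns["non_optional_child__child_attribute"]["nullable"] is False
```
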
10 changes: 3 additions & 7 deletions dlt/destinations/dataset/__init__.py
@@ -1,10 +1,6 @@
from dlt.destinations.dataset.factory import (
dataset,
)
from dlt.destinations.dataset.dataset import (
ReadableDBAPIDataset,
get_destination_clients,
)
from dlt.destinations.dataset.factory import dataset
from dlt.destinations.dataset.dataset import ReadableDBAPIDataset

from dlt.destinations.dataset.utils import (
get_destination_clients,
get_destination_client_initial_config,
1 change: 0 additions & 1 deletion dlt/destinations/impl/filesystem/filesystem.py
@@ -19,7 +19,6 @@
Dict,
)
from fsspec import AbstractFileSystem
from contextlib import contextmanager

import dlt
from dlt.common import logger, time, json, pendulum
20 changes: 14 additions & 6 deletions dlt/destinations/impl/filesystem/sql_client.py
@@ -112,11 +112,17 @@ def create_authentication(self, persistent: bool = False, secret_name: str = Non
session_token = (
"" if aws_creds.aws_session_token is None else aws_creds.aws_session_token
)
endpoint = (
aws_creds.endpoint_url.replace("https://", "")
if aws_creds.endpoint_url
else "s3.amazonaws.com"
)

use_ssl = "true"
endpoint = "s3.amazonaws.com"
if aws_creds.endpoint_url and "http://" in aws_creds.endpoint_url:
use_ssl = "false"
endpoint = aws_creds.endpoint_url.replace("http://", "")
elif aws_creds.endpoint_url and "https://" in aws_creds.endpoint_url:
endpoint = aws_creds.endpoint_url.replace("https://", "")

s3_url_style = aws_creds.s3_url_style or "vhost"

self._conn.sql(f"""
CREATE OR REPLACE {persistent_stmt} SECRET {secret_name} (
TYPE S3,
@@ -125,7 +131,9 @@ def create_authentication(self, persistent: bool = False, secret_name: str = Non
SESSION_TOKEN '{session_token}',
REGION '{aws_creds.region_name}',
ENDPOINT '{endpoint}',
SCOPE '{scope}'
SCOPE '{scope}',
URL_STYLE '{s3_url_style}',
USE_SSL {use_ssl}
);""")

# azure with storage account creds
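
The new endpoint handling replaces the single https-stripping expression: a plain-http endpoint (typical for a local MinIO) now also disables SSL, and the URL style falls back to duckdb's `vhost` default. A small standalone re-statement of that branch logic (not the dlt implementation itself), with made-up endpoint values:

```py
def derive_duckdb_s3_settings(endpoint_url, s3_url_style):
    # Mirrors the branching in create_authentication above.
    use_ssl = "true"
    endpoint = "s3.amazonaws.com"
    if endpoint_url and "http://" in endpoint_url:
        use_ssl = "false"
        endpoint = endpoint_url.replace("http://", "")
    elif endpoint_url and "https://" in endpoint_url:
        endpoint = endpoint_url.replace("https://", "")
    return endpoint, use_ssl, (s3_url_style or "vhost")

# Local MinIO without TLS:
assert derive_duckdb_s3_settings("http://localhost:9000", "path") == ("localhost:9000", "false", "path")
# Plain AWS S3 (no endpoint_url configured):
assert derive_duckdb_s3_settings(None, None) == ("s3.amazonaws.com", "true", "vhost")
```
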
1 change: 0 additions & 1 deletion dlt/destinations/impl/synapse/configuration.py
@@ -8,7 +8,6 @@
MsSqlCredentials,
MsSqlClientConfiguration,
)
from dlt.destinations.impl.mssql.configuration import MsSqlCredentials

from dlt.destinations.impl.synapse.synapse_adapter import TTableIndexType

3 changes: 1 addition & 2 deletions dlt/destinations/utils.py
@@ -1,6 +1,6 @@
import re

from typing import Any, List, Optional, Sequence, Tuple
from typing import Any, List, Dict, Type, Optional, Sequence, Tuple, cast

from dlt.common import logger
from dlt.common.destination.capabilities import DestinationCapabilitiesContext
@@ -15,7 +15,6 @@
is_nested_table,
pipeline_state_table,
)
from typing import Any, cast, Tuple, Dict, Type

from dlt.destinations.exceptions import DatabaseTransientException
from dlt.extract import DltResource, resource as make_resource, DltSource
4 changes: 2 additions & 2 deletions dlt/extract/reference.py
@@ -7,11 +7,11 @@
Any,
Generic,
Tuple,
cast,
overload,
ClassVar,
Type,
)
from typing_extensions import Self, TypeVar
from typing import Dict, Type, ClassVar

from dlt.common import logger
from dlt.common.configuration.specs import BaseConfiguration, known_sections
1 change: 0 additions & 1 deletion dlt/extract/source.py
@@ -12,7 +12,6 @@
Any,
)
from typing_extensions import Self
from typing import Dict

from dlt.common.configuration.resolve import inject_section
from dlt.common.configuration.specs import known_sections
1 change: 0 additions & 1 deletion dlt/extract/utils.py
@@ -39,7 +39,6 @@
)
from dlt.extract.items import (
TTableHintTemplate,
TDataItem,
TFunHintTemplate,
SupportsPipe,
)
1 change: 0 additions & 1 deletion dlt/pipeline/pipeline.py
@@ -73,7 +73,6 @@
WithStateSync,
JobClientBase,
DestinationClientStagingConfiguration,
DestinationClientStagingConfiguration,
)
from dlt.common.destination.dataset import SupportsReadableDataset
from dlt.common.destination.typing import TDatasetType
3 changes: 2 additions & 1 deletion dlt/sources/rest_api/__init__.py
@@ -315,7 +315,8 @@ def paginate_resource(
incremental_object, incremental_cursor_transform
)
)

# TODO: expand json as well. make sure you do not default to {} as this will
# generate empty body
path = expand_placeholders(path, format_kwargs)
params = expand_placeholders(params, format_kwargs)

4 changes: 3 additions & 1 deletion dlt/sources/rest_api/config_setup.py
@@ -733,7 +733,9 @@ def process_parent_data_item(
)
expanded_path = expand_placeholders(path, params_values)
expanded_params = expand_placeholders(params or {}, params_values)
expanded_json = expand_placeholders(request_json or {}, params_values)
expanded_json = (
None if request_json is None else expand_placeholders(request_json, params_values)
)

parent_resource_name = resolved_params[0].resolve_config["resource"]
parent_record = build_parent_record(item, parent_resource_name, include_from_parent)
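
This change keeps `expanded_json` as `None` when no body was configured: the old `request_json or {}` silently turned a missing body into an empty `{}`, which then gets sent as an empty JSON body on every child request. A minimal sketch of the intended behavior — the helper below is illustrative, not dlt's `expand_placeholders`, which also handles nested structures:

```py
from typing import Any, Dict, Optional

def expand_json(request_json: Optional[Dict[str, Any]], values: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    if request_json is None:
        return None  # no body configured -> no body is sent
    # expand "{placeholder}" strings against the resolved parent values
    return {k: (v.format(**values) if isinstance(v, str) else v) for k, v in request_json.items()}

assert expand_json(None, {"id": 12345}) is None                                  # stays body-less
assert expand_json({}, {"id": 12345}) == {}                                      # explicit empty body is kept
assert expand_json({"orig_id": "{id}"}, {"id": 12345}) == {"orig_id": "12345"}   # placeholders expanded
```
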
14 changes: 0 additions & 14 deletions dlt/sources/rest_api/typing.py
@@ -1,5 +1,3 @@
from dataclasses import dataclass, field

from typing import (
Any,
Callable,
@@ -22,17 +20,6 @@

from dataclasses import dataclass, field

from dlt.common import jsonpath
from dlt.common.typing import TSortOrder, TColumnNames
from dlt.common.schema.typing import (
TTableFormat,
TAnySchemaColumns,
TWriteDispositionConfig,
TSchemaContract,
)

from dlt.extract.items import TTableHintTemplate
from dlt.common.incremental.typing import LastValueFunc
from dlt.extract.resource import DltResource

from requests import Session
@@ -47,7 +34,6 @@
PageNumberPaginator,
SinglePagePaginator,
)
from dlt.sources.helpers.rest_client.typing import HTTPMethodBasic


try:
@@ -188,12 +188,11 @@ mode.
```py
@dlt.resource(write_disposition="merge")
def opportunity(
last_timestamp: Incremental[str] = dlt.sources.incremental(
last_timestamp: incremental[str] = dlt.sources.incremental(
"SystemModstamp", initial_value=None
)
) -> Iterator[Dict[str, Any]]:

yield from _get_records(
) -> Iterable[TDataItem]:
yield get_records(
client, "Opportunity", last_timestamp.last_value, "SystemModstamp"
)
```
2 changes: 1 addition & 1 deletion docs/website/docusaurus.config.js
@@ -14,7 +14,7 @@ const versions = {"current": {
noIndex: true
}}

let knownVersions = ["current"];
let knownVersions = [];
if (fs.existsSync("versions.json")) {
knownVersions = JSON.parse(fs.readFileSync("versions.json", 'utf8'));
}
27 changes: 12 additions & 15 deletions docs/website/tools/update_versions.js
@@ -2,9 +2,6 @@ const proc = require('child_process')
const fs = require('fs');
const semver = require('semver')

// disable versions for now
process.exit(0)

// const
const REPO_DIR = ".dlt-repo"
const REPO_DOCS_DIR = REPO_DIR + "/docs/website"
@@ -15,7 +12,7 @@ const VERSIONED_SIDEBARS_FOLDER = "versioned_sidebars"
const ENV_FILE = '.env'

// no doc versions below this version will be deployed
const MINIMUM_SEMVER_VERSION = "0.5.0"
const MINIMUM_SEMVER_VERSION = "1.0.0"

// clear old repo version
fs.rmSync(REPO_DIR, { recursive: true, force: true })
@@ -41,6 +38,7 @@ versions = semver.rsort(versions.filter(v => semver.gt(v, min_version)))
versions.filter(v => semver.prerelease(v) == null)

console.log(`Found ${versions.length} elligible versions`)
console.log(versions)
if (versions.length < 2) {
console.error("Sanity check failed, not enough elligble version tags found")
process.exit(1)
@@ -50,17 +48,16 @@ if (versions.length < 2) {
const envFileContent = `DOCUSAURUS_DLT_VERSION=${versions[0]}`;
fs.writeFileSync(ENV_FILE, envFileContent, 'utf8');

// go through the versions and find all newest versions of any major version
// the newest version is replace by the master branch here so the master head
// always is the "current" doc
// in the future the only other version to build is the minor version below the latest
const selectedVersions = ["master"];
let lastVersion = versions[0];
for (let ver of versions) {
if ( semver.major(ver) != semver.major(lastVersion)) {
selectedVersions.push(ver)
}
lastVersion = ver;
}
// let lastVersion = versions[0];
// for (let ver of versions) {
// console.log(semver.minor(ver))
// console.log(semver.minor(lastVersion))
// if ( semver.minor(ver) == (semver.minor(lastVersion) - 1)) {
// selectedVersions.push(ver)
// }
// }

console.log(`Will create docs versions for ${selectedVersions}`)

@@ -100,7 +97,7 @@ for (const version of selectedVersions) {

// build doc version, we also run preprocessing and markdown gen for each doc version
console.log(`Building docs...`)
proc.execSync(`cd ${REPO_DOCS_DIR} && npm run preprocess-docs && PYTHONPATH=. pydoc-markdown`)
proc.execSync(`cd ${REPO_DOCS_DIR} && npm run preprocess-docs && PYTHONPATH=. pydoc-markdown && python clean_pydoc_sidebar.py`)

console.log(`Snapshotting version...`)
proc.execSync(`cd ${REPO_DOCS_DIR} && npm run docusaurus docs:version ${version}`)
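
For readers of the selection loop above: the script keeps `master` (standing in for the newest major) plus the most recent tag of each older major version; a minor-version variant of the loop appears commented out. A rough Python re-statement of that selection, with an illustrative version list:

```py
# Assumes `sorted_versions` is already sorted newest-first, as in the JS above.
def select_doc_versions(sorted_versions):
    selected = ["master"]  # master replaces the newest major's docs
    last = sorted_versions[0]
    for ver in sorted_versions:
        if ver.split(".")[0] != last.split(".")[0]:  # major version changed
            selected.append(ver)
        last = ver
    return selected

assert select_doc_versions(["1.8.1", "1.8.0", "1.0.0", "0.5.4", "0.5.3"]) == ["master", "0.5.4"]
```
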
8 changes: 4 additions & 4 deletions poetry.lock

Generated file; diff not rendered.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "dlt"
version = "1.8.0"
version = "1.8.1"
description = "dlt is an open-source python-first scalable data loading library that does not require any backend to run."
authors = ["dltHub Inc. <[email protected]>"]
maintainers = [ "Marcin Rudolf <[email protected]>", "Adrian Brudaru <[email protected]>", "Anton Burnashev <[email protected]>", "David Scharf <[email protected]>" ]
12 changes: 12 additions & 0 deletions tests/libs/test_pydantic.py
@@ -743,3 +743,15 @@ class MyModel(BaseModel):

with pytest.raises(ValidationError):
m = MyModel(column_type={"data_type": "invalid_type"}) # type: ignore[typeddict-item]


def test_parent_nullable_means_children_nullable():
class MyParent(BaseModel):
optional_child: Optional[ChildModel]
non_optional_child: ChildModel
dlt_config: ClassVar[DltConfig] = {"skip_nested_types": True}

schema = pydantic_to_table_schema_columns(MyParent)

assert schema["optional_child__child_attribute"]["nullable"]
assert schema["non_optional_child__child_attribute"]["nullable"] is False
2 changes: 1 addition & 1 deletion tests/load/sources/rest_api/test_rest_api_source.py
@@ -56,7 +56,7 @@ def test_rest_api_source(destination_config: DestinationTestConfiguration, reque

assert table_counts.keys() == {"pokemon_list", "berry", "location"}

assert table_counts["pokemon_list"] == 1304
assert table_counts["pokemon_list"] == 1302
assert table_counts["berry"] == 64
assert table_counts["location"] == 1039

28 changes: 27 additions & 1 deletion tests/sources/rest_api/configurations/test_resolve_config.py
@@ -99,9 +99,33 @@ def test_process_parent_data_item() -> None:
include_from_parent=None,
)
assert bound_path == "dlt-hub/dlt/issues/12345/comments"
assert expanded_params == {}
assert expanded_params == {} # defaults to empty dict
assert request_json is None # defaults to None
assert parent_record == {}

# same but with empty params and json
bound_path, expanded_params, request_json, parent_record = process_parent_data_item(
path="dlt-hub/dlt/issues/{id}/comments",
item={"obj_id": 12345},
params={},
request_json={},
resolved_params=resolved_params,
)
# those got propagated
assert expanded_params == {}
assert request_json == {} # generates empty body!

# also test params and json
bound_path, expanded_params, request_json, parent_record = process_parent_data_item(
path="dlt-hub/dlt/issues/comments",
item={"obj_id": 12345},
params={"orig_id": "{id}"},
request_json={"orig_id": "{id}"},
resolved_params=resolved_params,
)
assert expanded_params == {"orig_id": "12345"}
assert request_json == {"orig_id": "12345"}

bound_path, expanded_params, request_json, parent_record = process_parent_data_item(
path="dlt-hub/dlt/issues/{id}/comments",
item={"obj_id": 12345},
@@ -137,11 +161,13 @@ def test_process_parent_data_item() -> None:
path="dlt-hub/dlt/issues/comments",
item={"obj_id": 12345, "obj_node": "node_1"},
params={"id": "{resources.issues.obj_id}"},
request_json={"id": "{resources.issues.obj_id}"},
resolved_params=resolved_params_reference,
include_from_parent=["obj_id", "obj_node"],
)
assert bound_path == "dlt-hub/dlt/issues/comments"
assert expanded_params == {"id": "12345"}
assert request_json == {"id": "12345"}

# Test nested data
resolved_param_nested = [