Attach availability to materialization #1318

Draft: wants to merge 5 commits into base: main
New file: Alembic migration "Associate availability and materialization" (revision b8ef80efd70c)
@@ -0,0 +1,124 @@
"""Associate availability and materialization

Revision ID: b8ef80efd70c
Revises: c3d5f327296c
Create Date: 2025-02-24 05:49:06.588675+00:00

"""

# pylint: disable=no-member, invalid-name, missing-function-docstring, unused-import, no-name-in-module
import json
import sqlalchemy as sa
from sqlalchemy.sql import table, column
from alembic import op
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision = "b8ef80efd70c"
down_revision = "c3d5f327296c"
branch_labels = None
depends_on = None


def upgrade():
with op.batch_alter_table("availabilitystate", schema=None) as batch_op:
batch_op.add_column(
sa.Column(
"custom_metadata",
postgresql.JSONB(astext_type=sa.Text()),
nullable=True,
),
)
batch_op.add_column(
sa.Column("materialization_id", sa.BigInteger(), nullable=True),
)
batch_op.create_foreign_key(
"fk_availability_materialization_id_materialization",
"materialization",
["materialization_id"],
["id"],
)

availabilitystate = table(
"availabilitystate",
column("id", sa.BigInteger()),
column("url", sa.String()),
column("links", postgresql.JSON),
column("custom_metadata", postgresql.JSONB),
)

# Move data from url and links to custom_metadata
connection = op.get_bind()
results = connection.execute(
sa.select(
availabilitystate.c.id,
availabilitystate.c.url,
availabilitystate.c.links,
),
).fetchall()

for row in results:
metadata = {}
if row.url:
metadata["url"] = row.url
if row.links:
metadata["links"] = row.links

if metadata:
connection.execute(
sa.update(availabilitystate)
.where(availabilitystate.c.id == row.id)
.values(custom_metadata=metadata),
)

with op.batch_alter_table("availabilitystate", schema=None) as batch_op:
batch_op.drop_column("url")
batch_op.drop_column("links")


def downgrade():
with op.batch_alter_table("availabilitystate", schema=None) as batch_op:
batch_op.add_column(
sa.Column(
"links",
postgresql.JSON(astext_type=sa.Text()),
autoincrement=False,
nullable=True,
),
)
batch_op.add_column(
sa.Column("url", sa.VARCHAR(), autoincrement=False, nullable=True),
)
batch_op.drop_constraint(
"fk_availability_materialization_id_materialization",
type_="foreignkey",
)

# Restore `url` and `links` from `custom_metadata`
availabilitystate = table(
"availabilitystate",
column("id", sa.BigInteger()),
column("custom_metadata", postgresql.JSONB),
column("url", sa.String()),
column("links", postgresql.JSON),
)

conn = op.get_bind()
results = conn.execute(
sa.select(availabilitystate.c.id, availabilitystate.c.custom_metadata),
).fetchall()

for row in results:
# JSONB values typically come back from the driver already deserialized,
# so only fall back to json.loads when a string payload appears.
raw_metadata = row.custom_metadata
if isinstance(raw_metadata, str):
    raw_metadata = json.loads(raw_metadata)
metadata = raw_metadata or {}
conn.execute(
sa.update(availabilitystate)
.where(availabilitystate.c.id == row.id)
.values(
url=metadata.get("url"),
links=metadata.get("links"),
),
)

with op.batch_alter_table("availabilitystate", schema=None) as batch_op:
batch_op.drop_column("materialization_id")
batch_op.drop_column("custom_metadata")
datajunction-server/datajunction_server/api/data.py (3 additions, 3 deletions)
@@ -49,7 +49,7 @@
router = SecureAPIRouter(tags=["data"])


@router.post("/data/{node_name}/availability/", name="Add Availability State to Node")
@router.post("/data/{node_name}/availability", name="Add Availability State to Node")
async def add_availability_state(
node_name: str,
data: AvailabilityStateBase,
@@ -129,7 +129,7 @@ async def add_availability_state(
schema_=data.schema_,
table=data.table,
valid_through_ts=data.valid_through_ts,
-url=data.url,
custom_metadata=data.custom_metadata,
min_temporal_partition=data.min_temporal_partition,
max_temporal_partition=data.max_temporal_partition,
partitions=[
@@ -138,7 +138,7 @@
],
categorical_partitions=data.categorical_partitions,
temporal_partitions=data.temporal_partitions,
-links=data.links,
materialization_id=data.materialization_id,
)
if node_revision.availability and not node_revision.availability.partitions:
node_revision.availability.partitions = []
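With the route and fields above, a client reporting availability now attaches materialization_id and custom_metadata instead of url/links. A hypothetical request, assuming a local server and made-up node name and values (the body mirrors AvailabilityStateBase):

import requests  # assumption: any HTTP client works; requests shown for brevity

response = requests.post(
    "http://localhost:8000/data/default.repair_orders/availability",
    json={
        "catalog": "warehouse",
        "schema_": "analytics",
        "table": "repair_orders_mat",
        "valid_through_ts": 1708750146,
        "materialization_id": 42,  # ties this availability to a materialization
        "custom_metadata": {"url": "s3://bucket/repair_orders_mat"},
    },
)
response.raise_for_status()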
datajunction-server/datajunction_server/database/availabilitystate.py
@@ -2,16 +2,20 @@

from datetime import datetime, timezone
from functools import partial
-from typing import Any, Dict, List, Optional
from typing import TYPE_CHECKING, Dict, List, Optional

import sqlalchemy as sa
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy import JSON, DateTime, ForeignKey
-from sqlalchemy.orm import Mapped, mapped_column
from sqlalchemy.orm import Mapped, mapped_column, relationship

from datajunction_server.database.base import Base
from datajunction_server.models.node import BuildCriteria, PartitionAvailability
from datajunction_server.typing import UTCDatetime

if TYPE_CHECKING:
from datajunction_server.database.materialization import Materialization


class AvailabilityState(Base):
"""
@@ -25,12 +29,31 @@ class AvailabilityState(Base):
primary_key=True,
)

# Identifying where the dataset lives
catalog: Mapped[str]
schema_: Mapped[Optional[str]] = mapped_column(nullable=True)
table: Mapped[str]

# Indicates data freshness
valid_through_ts: Mapped[int] = mapped_column(sa.BigInteger())
-url: Mapped[Optional[str]]
-links: Mapped[Optional[Dict[str, Any]]] = mapped_column(JSON, default=dict)

# Arbitrary JSON metadata. This can encompass any URLs associated with the materialized dataset
custom_metadata: Mapped[Optional[Dict]] = mapped_column(
JSONB,
default=dict,
)

# The materialization that this availability is associated with, if any
materialization_id: Mapped[Optional[int]] = mapped_column(
ForeignKey(
"materialization.id",
name="fk_availability_materialization_id_materialization",
),
)
materialization: Mapped[Optional["Materialization"]] = relationship(
back_populates="availability",
primaryjoin="Materialization.id==AvailabilityState.materialization_id",
)

# An ordered list of categorical partitions like ["country", "group_id"]
# or ["region_id", "age_group"]
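A sketch of how the new association might be used from the availability side, assuming an async session as used elsewhere in the codebase; the objects and values are illustrative. Assigning the relationship attribute (rather than the raw foreign key) lets back_populates keep both sides of the link in sync in memory:

availability = AvailabilityState(
    catalog="warehouse",
    table="repair_orders_mat",
    valid_through_ts=1708750146,
)
# Setting the relationship also appends this record to
# materialization.availability via back_populates, before any flush.
availability.materialization = materialization
session.add(availability)
await session.commit()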
datajunction-server/datajunction_server/database/materialization.py
@@ -19,6 +19,7 @@
from datajunction_server.database.backfill import Backfill
from datajunction_server.database.base import Base
from datajunction_server.database.column import Column
from datajunction_server.database.availabilitystate import AvailabilityState
from datajunction_server.models.materialization import (
DruidMeasuresCubeConfig,
GenericMaterializationConfig,
@@ -97,6 +98,13 @@ class Materialization(Base):
lazy="selectin",
)

availability: Mapped[List[AvailabilityState]] = relationship(
back_populates="materialization",
primaryjoin="Materialization.id==AvailabilityState.materialization_id",
cascade="all, delete",
lazy="selectin",
)

@classmethod
async def get_by_names(
cls,
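On this side of the link, lazy="selectin" loads the attached availability records together with the materialization, and cascade="all, delete" removes them when the materialization itself is deleted. A short sketch (session and object names illustrative):

# Availability records arrive with the materialization (selectin loading),
# so they can be iterated without extra queries:
for availability in materialization.availability:
    print(availability.table, availability.valid_through_ts)

# Deleting the parent cascades to its availability rows.
await session.delete(materialization)
await session.commit()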
datajunction-server/datajunction_server/internal/nodes.py (1 addition)
@@ -1570,6 +1570,7 @@ async def upsert_complex_dimension_link(
node = await Node.get_by_name(
session,
node_name,
raise_if_not_exists=True,
)
if node.type not in (NodeType.SOURCE, NodeType.DIMENSION, NodeType.TRANSFORM): # type: ignore
raise DJInvalidInputException(
@@ -149,6 +149,7 @@ def schedule(
)
return query_service_client.materialize_cube(
materialization_input=DruidCubeMaterializationInput(
id=materialization.id,
name=materialization.name,
cube=cube_config.cube,
dimensions=cube_config.dimensions,
@@ -398,6 +398,7 @@ class DruidCubeMaterializationInput(BaseModel):
Materialization info as passed to the query service.
"""

id: int | None = None
name: str

# Frozen cube info at the time of materialization
@@ -119,6 +119,7 @@ class MaterializationConfigOutput(BaseModel):
Output for materialization config.
"""

id: int
name: Optional[str]
config: Dict
schedule: str
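Together these two model changes let the materialization id round-trip: the scheduler passes materialization.id to the query service inside DruidCubeMaterializationInput, MaterializationConfigOutput exposes the same id to API clients, and whichever process lands the data can echo it back as materialization_id when posting availability. A sketch of that reporting step, with a hypothetical client and made-up values:

def report_availability(client, node_name, mat_input, table):
    """Hypothetical callback on the query-service side: record where the
    materialized data landed, tagged with the materialization id."""
    client.post(
        f"/data/{node_name}/availability",
        json={
            "catalog": "warehouse",
            "table": table,
            "valid_through_ts": 1708750146,
            "materialization_id": mat_input.id,  # echoed from the input payload
        },
    )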
datajunction-server/datajunction_server/models/node.py (2 additions, 2 deletions)
@@ -265,8 +265,8 @@ class AvailabilityStateBase(TemporalPartitionRange):
schema_: Optional[str] = Field(default=None)
table: str
valid_through_ts: int
-url: Optional[str]
-links: Optional[Dict[str, Any]] = Field(default={})
materialization_id: Optional[int] = Field(default=None)
custom_metadata: Optional[Dict[str, Any]] = Field(default={})

# An ordered list of categorical partitions like ["country", "group_id"]
# or ["region_id", "age_group"]