Skip to content

Commit b684446

Browse files
committed
Add changes necessary for services
1 parent 960e54e commit b684446

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

54 files changed

+721
-1300
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1616
- Move from in-line encoding to schema-based encoding with `Leaf._fields`
1717
- No need to define `_fields`
1818
- Use databackend to perform metadata duties
19+
- Add `db.create` and `db.insert` instead of `auto_schema`
1920

2021
#### New Features & Functionality
2122

plugins/ibis/plugin_test/config.yaml

-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
artifact_store: null
21
data_backend: sqlite://
32
auto_schema: false
43
force_apply: true

plugins/ibis/plugin_test/test_query.py

+14-3
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22

33
import numpy as np
44
import pytest
5+
from superduper.base.base import Base
56
from superduper.base.document import Document
7+
from superduper.components.listener import Listener
68
from superduper.components.table import Table
79

810

@@ -83,8 +85,12 @@ def test_filter(db):
8385
assert len(r) == uq[1][0]
8486

8587

88+
class documents(Base):
89+
this: 'str'
90+
91+
8692
def test_select_using_ids(db):
87-
db.cfg.auto_schema = True
93+
db.create(documents)
8894

8995
table = db["documents"]
9096
table.insert([{"this": f"is a test {i}", "id": str(i)} for i in range(4)])
@@ -102,12 +108,17 @@ def test_select_using_ids_of_outputs(db):
102108
def my_func(x):
103109
return x + ' ' + x
104110

105-
db.cfg.auto_schema = True
111+
db.create(documents)
106112

107113
table = db["documents"]
108114
table.insert([{"this": f"is a test {i}", "id": str(i)} for i in range(4)])
109115

110-
listener = my_func.to_listener(key='this', select=db['documents'].select())
116+
listener = Listener(
117+
'test',
118+
model=my_func,
119+
key='this',
120+
select=db['documents'].select(),
121+
)
111122
db.apply(listener)
112123

113124
q1 = db[listener.outputs].select()

plugins/ibis/plugin_test/test_sanity.py

+9-11
Original file line numberDiff line numberDiff line change
@@ -7,33 +7,31 @@ class New(Base):
77

88
def test(db):
99

10-
db.create(New)
11-
1210
data = [
13-
New('testa', a='a').dict(path=False),
14-
New('testb', a='b').dict(path=False),
15-
New('testc', a='c').dict(path=False),
11+
New(a='a'),
12+
New(a='b'),
13+
New(a='c').dict(path=False),
1614
]
1715

18-
db['New'].insert(data)
16+
db.insert(data)
1917

2018
loaded = db['New'].execute()
2119

2220
assert len(loaded) == 3
2321

2422
db['New'].update({'a': 'a'}, 'a', 'a2')
2523

26-
r = db['New'].get(identifier='testa')
24+
r = db['New'].get(a='a2')
2725

2826
assert r['a'] == 'a2'
2927

3028
r['a'] = 'a3'
3129

32-
db['New'].replace({'identifier': 'testa'}, r)
30+
db['New'].replace({'a': 'a2'}, r)
3331

34-
assert db['New'].get(identifier='testa')['a'] == 'a3'
32+
assert db['New'].get(a='a3')['a'] == 'a3'
3533

36-
db['New'].delete({'identifier': 'testa'})
34+
db['New'].delete({'a': 'a3'})
3735
assert len(db['New'].execute()) == 2
3836

39-
assert db['New'].get(identifier='testa') is None
37+
assert db['New'].get(a='a3') is None

plugins/ibis/superduper_ibis/data_backend.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -133,15 +133,15 @@ def check_output_dest(self, predict_id) -> bool:
133133
except (NoSuchTableError, ibis.IbisError):
134134
return False
135135

136-
def create_table_and_schema(self, identifier: str, schema: Schema):
136+
def create_table_and_schema(self, identifier: str, schema: Schema, primary_id: str):
137137
"""Create a schema in the data-backend.
138138
139139
:param identifier: The identifier of the table.
140140
:param mapping: The mapping of the schema.
141141
"""
142142
mapping = convert_schema_to_fields(schema)
143-
if "id" not in mapping:
144-
mapping["id"] = "string"
143+
if primary_id not in mapping:
144+
mapping[primary_id] = "string"
145145
try:
146146
mapping = self.db_helper.process_schema_types(mapping)
147147
t = self._create_table_with_retry(identifier, schema=ibis.schema(mapping))
-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
artifact_store: null
21
data_backend: mongomock://test_db
32
auto_schema: false
43
force_apply: true

plugins/mongodb/superduper_mongodb/data_backend.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ def check_output_dest(self, predict_id) -> bool:
111111
"""
112112
return self._database[f"{CFG.output_prefix}{predict_id}"].find_one() is not None
113113

114-
def create_table_and_schema(self, identifier: str, schema: Schema):
114+
def create_table_and_schema(self, identifier: str, schema: Schema, primary_id: str):
115115
"""Create a table and schema in the data backend.
116116
117117
:param identifier: The identifier for the table

plugins/openai/superduper_openai/model.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
import json
44
import os
55
import typing as t
6-
76
from functools import lru_cache as cache
7+
88
import numpy
99
import requests
1010
import tqdm

pyproject.toml

+2-1
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,8 @@ ignore = [
136136
"D401",
137137
"D102",
138138
"E402",
139-
"F403"
139+
"F403",
140+
"F401", # Removing lines of code
140141
]
141142
exclude = ["templates", "superduper/templates"]
142143

superduper/backends/base/cluster.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ def load_custom_plugins(self):
8383
from superduper import logging
8484

8585
if 'Plugin' in self.db.show('Table'):
86-
logging.info(f"Found custom plugins - loading...")
86+
logging.info("Found custom plugins - loading...")
8787
for plugin in self.db.show('Plugin'):
8888
logging.info(f"Loading plugin: {plugin}")
8989
plugin = self.db.load('Plugin', plugin)

superduper/backends/base/data_backend.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -75,11 +75,14 @@ def build_artifact_store(self):
7575
pass
7676

7777
@abstractmethod
78-
def create_table_and_schema(self, identifier: str, schema: 'Schema'):
78+
def create_table_and_schema(
79+
self, identifier: str, schema: 'Schema', primary_id: str
80+
):
7981
"""Create a schema in the data-backend.
8082
8183
:param identifier: The identifier of the schema.
8284
:param schema: The schema to create.
85+
:param primary_id: The primary id of the schema.
8386
"""
8487

8588
@abstractmethod

superduper/backends/base/metadata.py

+50-12
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,10 @@
44
import uuid
55

66
from superduper import logging
7-
from superduper.base.exceptions import DatabackendError
87
from superduper.base.base import Base
8+
from superduper.base.exceptions import DatabackendError
99
from superduper.components.cdc import CDC
10+
from superduper.components.component import Component
1011
from superduper.components.schema import Schema
1112
from superduper.components.table import Table
1213
from superduper.misc.importing import import_object
@@ -113,7 +114,7 @@ def __init__(self, db):
113114
).encode(),
114115
('Table', 'Job'): Table(
115116
identifier='Job',
116-
primary_id='uuid',
117+
primary_id='job_id',
117118
uuid='jkl',
118119
component=True,
119120
path='superduper.backends.base.metadata.Job',
@@ -127,7 +128,12 @@ def __init__(self, db):
127128
def init(self):
128129
"""Initialize the metadata store."""
129130
for cls in metaclasses.values():
130-
self.db.databackend.create_table_and_schema(cls.__name__, cls.class_schema)
131+
preset = self.preset_components.get(('Table', cls.__name__))
132+
self.db.databackend.create_table_and_schema(
133+
cls.__name__,
134+
cls.class_schema,
135+
primary_id=preset['primary_id'],
136+
)
131137

132138
def get_schema(self, table: str):
133139
"""Get the schema of a table.
@@ -162,10 +168,18 @@ def create(self, cls: t.Type[Base]):
162168
r = self.db['Table'].get(identifier=cls.__name__)
163169
except DatabackendError as e:
164170
if 'not found' in str(e):
165-
self.db.databackend.create_table_and_schema('Table', Table.class_schema)
166-
t = Table('Table', path='superduper.components.table.Table', primary_id='uuid', component=True)
171+
self.db.databackend.create_table_and_schema(
172+
'Table', Table.class_schema, 'uuid'
173+
)
174+
t = Table(
175+
'Table',
176+
path='superduper.components.table.Table',
177+
primary_id='uuid',
178+
component=True,
179+
)
180+
167181
r = self.db['Table'].insert(
168-
[t.dict(schema=True, path=False)],
182+
[t.dict(schema=True, path=False)],
169183
)
170184
else:
171185
raise e
@@ -175,8 +189,16 @@ def create(self, cls: t.Type[Base]):
175189
f'{cls.__name__} already exists in metadata with data: {r}'
176190
)
177191

178-
self.db.databackend.create_table_and_schema(cls.__name__, cls.class_schema)
179-
t = Table(identifier=cls.__name__, path=f'{cls.__module__}.{cls.__name__}', primary_id='uuid', component=True)
192+
pid = 'uuid' if issubclass(cls, Component) else self.db.databackend.id_field
193+
self.db.databackend.create_table_and_schema(
194+
cls.__name__, cls.class_schema, primary_id=pid
195+
)
196+
t = Table(
197+
identifier=cls.__name__,
198+
path=f'{cls.__module__}.{cls.__name__}',
199+
primary_id=pid,
200+
component=True,
201+
)
180202
self.db['Table'].insert([t.dict(path=False)])
181203
return t
182204

@@ -198,9 +220,17 @@ def create_entry(
198220
:param component: component name
199221
:param raw: whether to insert raw data
200222
"""
223+
path = None
201224
if component is None:
202225
path = info.pop('_path')
203226
component = path.rsplit('.', 1)[1]
227+
228+
metadata = self.db['Table'].get(component)
229+
if metadata is None:
230+
assert path is not None
231+
cls = import_object(path)
232+
self.create(cls)
233+
204234
self.db[component].insert([info], raw=raw)
205235

206236
def create_component(self, info: t.Dict):
@@ -339,10 +369,14 @@ def show_jobs(self, component: str, identifier: str):
339369
:param component: type of component
340370
:param identifier: identifier of component
341371
"""
342-
return self.db['Job'].filter(
343-
self.db['Job']['component'] == component,
344-
self.db['Job']['identifier'] == identifier,
345-
).distinct('job_id')
372+
return (
373+
self.db['Job']
374+
.filter(
375+
self.db['Job']['component'] == component,
376+
self.db['Job']['identifier'] == identifier,
377+
)
378+
.distinct('job_id')
379+
)
346380

347381
def show_components(self, component: str | None = None):
348382
"""
@@ -492,6 +526,10 @@ def get_component_by_uuid(self, component: str, uuid: str):
492526
if uuid in self.preset_uuids:
493527
return self.preset_uuids[uuid]
494528
r = self.db[component].get(uuid=uuid, raw=True)
529+
if r is None:
530+
raise NonExistentMetadataError(
531+
f'Object {uuid} does not exist in metadata for {component}'
532+
)
495533
path = self.db['Table'].get(identifier=component)['path']
496534
r['_path'] = path
497535
return r

superduper/backends/base/query.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,9 @@
1818

1919
from superduper import CFG, logging
2020
from superduper.backends.base.metadata import NonExistentMetadataError
21+
from superduper.base.base import Base
2122
from superduper.base.constant import KEY_BLOBS, KEY_BUILDS, KEY_FILES, KEY_PATH
2223
from superduper.base.document import Document, _unpack
23-
from superduper.base.base import Base
2424

2525
if t.TYPE_CHECKING:
2626
from superduper.base.datalayer import Datalayer

superduper/base/apply.py

+7-5
Original file line numberDiff line numberDiff line change
@@ -301,11 +301,9 @@ def replace_existing(x):
301301
# during the `.map` to the children
302302
# serializer.map...
303303
# this means replacing components with references
304-
serialized = object.dict().update(serialized)
304+
serialized = object.dict(metadata=False).update(serialized)
305305

306306
# this is necessary to prevent inconsistencies
307-
# this takes the difference between
308-
# the current and
309307
serialized = serialized.update(this_diff).encode(keep_schema=False)
310308

311309
# assign/ increment the version since
@@ -339,7 +337,7 @@ def replace_existing(x):
339337
# update the existing component with the change
340338
# data from the applied component
341339
serialized = (
342-
current.dict()
340+
current.dict(metadata=False)
343341
.update(serialized)
344342
.update(this_diff)
345343
.encode(keep_schema=False)
@@ -354,7 +352,11 @@ def replace_existing(x):
354352
)
355353
serialized['version'] = 0
356354

357-
serialized = object.dict().update(serialized)
355+
# TODO this was set to empty (no metadata specified) previously
356+
# Do we even need this?
357+
# The metadata components could "simple" sit quietly inside the component
358+
# and be applied when the component is applied
359+
serialized = object.dict(metadata=False).update(serialized)
358360

359361
# if the metadata includes components, which
360362
# need to be applied, do that now

0 commit comments

Comments
 (0)