Skip to content

Commit 437ffe4

Browse files
authored
Add genson integration for json generation (#1390)
This PR aims at integrating support of the `genson` package (in `generate.json`) to be able to use dynamic json schema generation as proposed in #1383.
1 parent ea4904a commit 437ffe4

File tree

4 files changed

+98
-1
lines changed

4 files changed

+98
-1
lines changed

docs/reference/generation/json.md

+73
Original file line numberDiff line numberDiff line change
@@ -103,3 +103,76 @@ print(add(**result))
103103
```
104104

105105
A great advantage of passing functions directly to specify the structure is that the structure of the LLM's output will change with the function's definition. No need to change the code in several places!
106+
107+
108+
## From a dynamic JSON schema builder - GenSON
109+
110+
Outlines integrates [GenSON](https://github.com/wolverdude/GenSON) builders so that JSON schemas can be declared dynamically. They can be used as follows:
111+
112+
```python
113+
from genson import SchemaBuilder
114+
115+
from outlines import models
116+
from outlines import generate
117+
118+
builder = SchemaBuilder()
119+
builder.add_schema({"type": "object", "properties": {}})
120+
builder.add_object({"name": "Toto", "age": 5})
121+
122+
model = models.transformers(
123+
"HuggingFaceTB/SmolLM2-135M",
124+
device="auto",
125+
)
126+
generator = generate.json(model, builder)
127+
128+
res = generator("Return a json of a young boy")
129+
print(res)
130+
# {"name": "Ben", "age": 10}
131+
```
132+
133+
Any time you update the schema through the builder, you need to redefine the Outlines generator to include these changes. Continuing the previous example:
134+
135+
```python
136+
from genson import SchemaBuilder
137+
138+
from outlines import models
139+
from outlines import generate
140+
141+
builder = SchemaBuilder()
142+
builder.add_schema({"type": "object", "properties": {}})
143+
builder.add_object({"name": "Toto", "age": 5})
144+
145+
model = models.transformers(
146+
"HuggingFaceTB/SmolLM2-135M",
147+
device="auto",
148+
)
149+
generator = generate.json(model, builder)
150+
151+
res = generator("Return a json of a young boy")
152+
print(res)
153+
# {"name": "Ben", "age": 10}
154+
155+
builder.add_object({"hobby": "sports"})
156+
generator = generate.json(model, builder)
157+
158+
res = generator("Return a json of a youg boy whose hobby is coding")
159+
print(res)
160+
# {"name": "Ben", "age": 10, "hobby": "coding"}
161+
```
162+
163+
!!! Note
164+
165+
Beware of [GenSON](https://github.com/wolverdude/GenSON)'s behavior when schemas are amended dynamically through its builder. Here is an example of how you could lose `required` information and generate JSON with missing fields:
166+
167+
```python
168+
builder = SchemaBuilder()
169+
builder.add_schema({"type": "object", "properties": {}})
170+
builder.add_object({"name": "Toto", "age": 5})
171+
172+
print(builder.to_schema())
173+
# {'$schema': 'http://json-schema.org/schema#', 'type': 'object', 'properties': {'name': {'type': 'string'}, 'age': {'type': 'integer'}}, 'required': ['age', 'name']}
174+
175+
builder.add_object({"hobby": "sport"})
176+
print(builder.to_schema())
177+
# {'$schema': 'http://json-schema.org/schema#', 'type': 'object', 'properties': {'name': {'type': 'string'}, 'age': {'type': 'integer'}, 'hobby': {'type': 'string'}}}
178+
```

outlines/generate/json.py

+6
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from functools import singledispatch
44
from typing import Callable, Optional, Union
55

6+
from genson import SchemaBuilder
67
from outlines_core.fsm.json_schema import build_regex_from_schema
78
from pydantic import BaseModel
89

@@ -55,6 +56,11 @@ def json(
5556
regex_str = build_regex_from_schema(schema, whitespace_pattern)
5657
generator = regex(model, regex_str, sampler)
5758
generator.format_sequence = lambda x: pyjson.loads(x)
59+
elif isinstance(schema_object, SchemaBuilder):
60+
schema = schema_object.to_json()
61+
regex_str = build_regex_from_schema(schema, whitespace_pattern)
62+
generator = regex(model, regex_str, sampler)
63+
generator.format_sequence = lambda x: pyjson.loads(x)
5864
elif callable(schema_object):
5965
schema = pyjson.dumps(get_schema_from_signature(schema_object))
6066
regex_str = build_regex_from_schema(schema, whitespace_pattern)

pyproject.toml

+3-1
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ dependencies = [
4141
"airportsdata",
4242
"torch",
4343
"outlines_core==0.1.26",
44+
"genson",
4445
]
4546
dynamic = ["version"]
4647

@@ -71,7 +72,7 @@ test = [
7172
"transformers",
7273
"pillow",
7374
"exllamav2",
74-
"jax"
75+
"jax",
7576
]
7677
serve = [
7778
"vllm>=0.3.0",
@@ -147,6 +148,7 @@ module = [
147148
"pycountry.*",
148149
"airportsdata.*",
149150
"outlines_core.*",
151+
"genson",
150152
]
151153
ignore_missing_imports = true
152154

tests/generate/test_generate.py

+16
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,22 @@ def test_generate_json(request, model_fixture, sample_schema):
276276
generator(**get_inputs(model_fixture), max_tokens=100)
277277

278278

279+
def test_integrate_genson_generate_json(request):
280+
from genson import SchemaBuilder
281+
282+
builder = SchemaBuilder()
283+
builder.add_schema({"type": "object", "properties": {}})
284+
builder.add_object({"name": "Toto", "age": 5})
285+
286+
model = request.getfixturevalue("model_transformers_opt125m")
287+
288+
generator = generate.json(model, builder)
289+
res = generator("Return a json of a young boy")
290+
291+
assert "name" in res
292+
assert "age" in res
293+
294+
279295
@pytest.mark.parametrize("model_fixture", ALL_MODEL_FIXTURES)
280296
@pytest.mark.parametrize("sample_choices", ALL_SAMPLE_CHOICES_FIXTURES)
281297
def test_generate_choice(request, model_fixture, sample_choices):

0 commit comments

Comments
 (0)