Skip to content

Commit f60b9b3

Browse files
serialization of columns added to the table definition
1 parent 85b3166 commit f60b9b3

File tree

4 files changed

+84
-68
lines changed

4 files changed

+84
-68
lines changed

CONTRIBUTING.md

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,25 @@ make spell_fix
6565
We use `pytest` to test our code. You can run the tests by running the following command:
6666

6767
```bash
68-
make tests
68+
make test_all
69+
```
70+
71+
If you prefer, you can run only the core tests with the command:
72+
73+
```bash
74+
make test_core
75+
```
76+
77+
or the extension tests with the command:
78+
79+
```bash
80+
make test_extensions
81+
```
82+
83+
You can also run the tests with coverage by running the following command:
84+
85+
```bash
86+
make test-coverage
6987
```
7088

7189
Make sure that all tests pass before submitting a pull request.

pandasai/data_loader/semantic_layer_schema.py

Lines changed: 58 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import re
22
from functools import partial
3-
from typing import Any, Dict, List, Optional, Union
3+
from typing import Any, Dict, List
44

55
import yaml
66
from pydantic import (
@@ -45,12 +45,10 @@ def __eq__(self, other):
4545

4646
class Column(BaseModel):
4747
name: str = Field(..., description="Name of the column.")
48-
type: Optional[str] = Field(None, description="Data type of the column.")
49-
description: Optional[str] = Field(None, description="Description of the column")
50-
expression: Optional[str] = Field(
51-
None, description="Aggregation expression (avg, min, max, sum)"
52-
)
53-
alias: Optional[str] = Field(None, description="Alias for the column")
48+
type: str | None = Field(None, description="Data type of the column.")
49+
description: str | None = Field(None, description="Description of the column")
50+
expression: str | None = Field(None, description="Aggregation expression (avg, min, max, sum)")
51+
alias: str | None = Field(None, description="Alias for the column")
5452

5553
@field_validator("type")
5654
@classmethod
@@ -63,7 +61,9 @@ def is_column_type_supported(cls, type: str) -> str:
6361

6462
@field_validator("expression")
6563
@classmethod
66-
def is_expression_valid(cls, expr: str) -> str:
64+
def is_expression_valid(cls, expr: str) -> str | None:
65+
if expr is None:
66+
return expr
6767
try:
6868
parse_one(expr)
6969
return expr
@@ -72,87 +72,83 @@ def is_expression_valid(cls, expr: str) -> str:
7272

7373

7474
class Relation(BaseModel):
75-
name: Optional[str] = Field(None, description="Name of the relationship.")
76-
description: Optional[str] = Field(
77-
None, description="Description of the relationship."
78-
)
79-
from_: str = Field(
80-
..., alias="from", description="Source column for the relationship."
81-
)
75+
name: str | None = Field(None, description="Name of the relationship.")
76+
description: str | None = Field(None, description="Description of the relationship.")
77+
from_: str = Field(..., alias="from", description="Source column for the relationship.")
8278
to: str = Field(..., description="Target column for the relationship.")
8379

8480

8581
class TransformationParams(BaseModel):
86-
column: Optional[str] = Field(None, description="Column to transform")
87-
value: Optional[Union[str, int, float, bool]] = Field(
82+
column: str | None = Field(None, description="Column to transform")
83+
value: str | int | float | bool | None = Field(
8884
None, description="Value for fill_na and other transformations"
8985
)
90-
mapping: Optional[Dict[str, str]] = Field(
86+
mapping: Dict[str, str] | None = Field(
9187
None, description="Mapping dictionary for map_values transformation"
9288
)
93-
format: Optional[str] = Field(None, description="Format string for date formatting")
94-
decimals: Optional[int] = Field(
89+
format: str | None = Field(None, description="Format string for date formatting")
90+
decimals: int | None = Field(
9591
None, description="Number of decimal places for rounding"
9692
)
97-
factor: Optional[Union[int, float]] = Field(None, description="Scaling factor")
98-
to_tz: Optional[str] = Field(None, description="Target timezone or format")
99-
from_tz: Optional[str] = Field(None, description="From timezone or format")
100-
errors: Optional[str] = Field(
93+
factor: int | float | None = Field(None, description="Scaling factor")
94+
to_tz: str | None = Field(None, description="Target timezone or format")
95+
from_tz: str | None = Field(None, description="From timezone or format")
96+
errors: str | None = Field(
10197
None, description="Error handling mode for numeric/datetime conversion"
10298
)
103-
old_value: Optional[Any] = Field(
99+
old_value: Any | None = Field(
104100
None, description="Old value for replace transformation"
105101
)
106-
new_value: Optional[Any] = Field(
102+
new_value: Any | None = Field(
107103
None, description="New value for replace transformation"
108104
)
109-
new_name: Optional[str] = Field(
105+
new_name: str | None = Field(
110106
None, description="New name for column in rename transformation"
111107
)
112-
pattern: Optional[str] = Field(
108+
pattern: str | None = Field(
113109
None, description="Pattern for extract transformation"
114110
)
115-
length: Optional[int] = Field(
111+
length: int | None = Field(
116112
None, description="Length for truncate transformation"
117113
)
118-
add_ellipsis: Optional[bool] = Field(
114+
add_ellipsis: bool | None = Field(
119115
True, description="Whether to add ellipsis in truncate"
120116
)
121-
width: Optional[int] = Field(None, description="Width for pad transformation")
122-
side: Optional[str] = Field("left", description="Side for pad transformation")
123-
pad_char: Optional[str] = Field(" ", description="Character for pad transformation")
124-
lower: Optional[Union[int, float]] = Field(None, description="Lower bound for clip")
125-
upper: Optional[Union[int, float]] = Field(None, description="Upper bound for clip")
126-
bins: Optional[Union[int, List[Union[int, float]]]] = Field(
117+
width: int | None = Field(None, description="Width for pad transformation")
118+
side: str | None = Field("left", description="Side for pad transformation")
119+
pad_char: str | None = Field(" ", description="Character for pad transformation")
120+
lower: int | float | None = Field(None, description="Lower bound for clip")
121+
upper: int | float | None = Field(None, description="Upper bound for clip")
122+
bins: int | List[int | float] | None = Field(
127123
None, description="Bins for binning"
128124
)
129-
labels: Optional[List[str]] = Field(None, description="Labels for bins")
130-
drop_first: Optional[bool] = Field(
125+
labels: List[str] | None = Field(None, description="Labels for bins")
126+
drop_first: bool | None = Field(
131127
True, description="Whether to drop first category in encoding"
132128
)
133-
drop_invalid: Optional[bool] = Field(
129+
drop_invalid: bool | None = Field(
134130
False, description="Whether to drop invalid values"
135131
)
136-
start_date: Optional[str] = Field(
132+
start_date: str | None = Field(
137133
None, description="Start date for date range validation"
138134
)
139-
end_date: Optional[str] = Field(
135+
end_date: str | None = Field(
140136
None, description="End date for date range validation"
141137
)
142-
country_code: Optional[str] = Field(
138+
country_code: str | None = Field(
143139
"+1", description="Country code for phone normalization"
144140
)
145-
columns: Optional[List[str]] = Field(
141+
columns: List[str] | None = Field(
146142
None, description="List of columns for multi-column operations"
147143
)
148-
keep: Optional[str] = Field("first", description="Which duplicates to keep")
149-
ref_table: Optional[Any] = Field(
144+
keep: str | None = Field("first", description="Which duplicates to keep")
145+
ref_table: Any | None = Field(
150146
None, description="Reference DataFrame for foreign key validation"
151147
)
152-
ref_column: Optional[str] = Field(
148+
ref_column: str | None = Field(
153149
None, description="Reference column for foreign key validation"
154150
)
155-
drop_negative: Optional[bool] = Field(
151+
drop_negative: bool | None = Field(
156152
False, description="Whether to drop negative values"
157153
)
158154

@@ -172,7 +168,7 @@ def validate_required_params(cls, values: dict) -> dict:
172168

173169
class Transformation(BaseModel):
174170
type: str = Field(..., description="Type of transformation to be applied.")
175-
params: Optional[TransformationParams] = Field(
171+
params: TransformationParams | None = Field(
176172
None, description="Parameters for the transformation."
177173
)
178174

@@ -195,11 +191,11 @@ def set_transform_type(cls, values: dict) -> dict:
195191

196192
class Source(BaseModel):
197193
type: str = Field(..., description="Type of the data source.")
198-
path: Optional[str] = Field(None, description="Path of the local data source.")
199-
connection: Optional[SQLConnectionConfig] = Field(
194+
path: str | None = Field(None, description="Path of the local data source.")
195+
connection: SQLConnectionConfig | None = Field(
200196
None, description="Connection object of the data source."
201197
)
202-
table: Optional[str] = Field(None, description="Table of the data source.")
198+
table: str | None = Field(None, description="Table of the data source.")
203199

204200
def is_compatible_source(self, source2: "Source"):
205201
"""
@@ -267,33 +263,33 @@ def is_format_supported(cls, format: str) -> str:
267263

268264
class SemanticLayerSchema(BaseModel):
269265
name: str = Field(..., description="Dataset name.")
270-
source: Optional[Source] = Field(None, description="Data source for your dataset.")
271-
view: Optional[bool] = Field(None, description="Whether table is a view")
272-
description: Optional[str] = Field(
266+
source: Source | None = Field(None, description="Data source for your dataset.")
267+
view: bool | None = Field(None, description="Whether table is a view")
268+
description: str | None = Field(
273269
None, description="Dataset’s contents and purpose description."
274270
)
275-
columns: Optional[List[Column]] = Field(
271+
columns: List[Column] | None = Field(
276272
None, description="Structure and metadata of your dataset’s columns"
277273
)
278-
relations: Optional[List[Relation]] = Field(
274+
relations: List[Relation] | None = Field(
279275
None, description="Relationships between columns and tables."
280276
)
281-
order_by: Optional[List[str]] = Field(
277+
order_by: List[str] | None = Field(
282278
None, description="Ordering criteria for the dataset."
283279
)
284-
limit: Optional[int] = Field(
280+
limit: int | None = Field(
285281
None, description="Maximum number of records to retrieve."
286282
)
287-
transformations: Optional[List[Transformation]] = Field(
283+
transformations: List[Transformation] | None = Field(
288284
None, description="List of transformations to apply to the data."
289285
)
290-
destination: Optional[Destination] = Field(
286+
destination: Destination | None = Field(
291287
None, description="Destination for saving the dataset."
292288
)
293-
update_frequency: Optional[str] = Field(
289+
update_frequency: str | None = Field(
294290
None, description="Frequency of dataset updates."
295291
)
296-
group_by: Optional[List[str]] = Field(
292+
group_by: List[str] | None = Field(
297293
None,
298294
description="List of columns to group by. Every non-aggregated column must be included in group_by.",
299295
)

pandasai/helpers/dataframe_serializer.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,10 @@ def serialize(cls, df: "DataFrame", dialect: str = "postgres") -> str:
2828
if df.schema.description is not None:
2929
dataframe_info += f' description="{df.schema.description}"'
3030

31+
if df.schema.columns:
32+
columns = [column.model_dump() for column in df.schema.columns]
33+
dataframe_info += f' columns="{json.dumps(columns, ensure_ascii=False)}"'
34+
3135
dataframe_info += f' dimensions="{df.rows_count}x{df.columns_count}">'
3236

3337
# Truncate long values

tests/unit_tests/helpers/test_dataframe_serializer.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
import pandas as pd
2-
31
from pandasai.helpers.dataframe_serializer import DataframeSerializer
42

53

@@ -8,7 +6,7 @@ def test_serialize_with_name_and_description(self, sample_df):
86
"""Test serialization with name and description attributes."""
97

108
result = DataframeSerializer.serialize(sample_df)
11-
expected = """<table dialect="postgres" table_name="table_6c30b42101939c7bdf95f4c1052d615c" dimensions="3x2">
9+
expected = """<table dialect="postgres" table_name="table_6c30b42101939c7bdf95f4c1052d615c" columns="[{"name": "A", "type": "integer", "description": null, "expression": null, "alias": null}, {"name": "B", "type": "integer", "description": null, "expression": null, "alias": null}]" dimensions="3x2">
1210
A,B
1311
1,4
1412
2,5
@@ -21,7 +19,7 @@ def test_serialize_with_name_and_description_with_dialect(self, sample_df):
2119
"""Test serialization with name and description attributes."""
2220

2321
result = DataframeSerializer.serialize(sample_df, dialect="mysql")
24-
expected = """<table dialect="mysql" table_name="table_6c30b42101939c7bdf95f4c1052d615c" dimensions="3x2">
22+
expected = """<table dialect="mysql" table_name="table_6c30b42101939c7bdf95f4c1052d615c" columns="[{"name": "A", "type": "integer", "description": null, "expression": null, "alias": null}, {"name": "B", "type": "integer", "description": null, "expression": null, "alias": null}]" dimensions="3x2">
2523
A,B
2624
1,4
2725
2,5
@@ -44,7 +42,7 @@ def test_serialize_with_dataframe_long_strings(self, sample_df):
4442
truncated_text = long_text[: DataframeSerializer.MAX_COLUMN_TEXT_LENGTH] + "…"
4543

4644
# Expected output
47-
expected = f"""<table dialect="mysql" table_name="table_6c30b42101939c7bdf95f4c1052d615c" dimensions="3x2">
45+
expected = f"""<table dialect="mysql" table_name="table_6c30b42101939c7bdf95f4c1052d615c" columns="[{{"name": "A", "type": "integer", "description": null, "expression": null, "alias": null}}, {{"name": "B", "type": "integer", "description": null, "expression": null, "alias": null}}]" dimensions="3x2">
4846
A,B
4947
{truncated_text},4
5048
2,5

0 commit comments

Comments
 (0)