diff --git a/README.md b/README.md index ae048b6..e30b71a 100644 --- a/README.md +++ b/README.md @@ -33,9 +33,11 @@ There are metadata objects for each of the following metadata types: |------------------|-------------------------------------------------| | document | `document_schema.ScriptSchemaDraft` | | geospatial | `geospatial_schema.GeospatialSchema` | +| image | `image_schema.ImageDataTypeSchema` | | indicator | `indicator_schema.TimeseriesSchema` | | indicators_db | `indicators_db_schema.TimeseriesDatabaseSchema` | -| microdata | `microdata_schema.MicrodataSchema` | +| microdata | `microdata_schema.MicrodataSchema` | +| resource |`resource_schema.Model` | | script | `script_schema.ResearchProjectSchemaDraft` | | table | `table_schema.Model` | | video | `video_schema.Model` | @@ -65,9 +67,6 @@ filename = mm.save_metadata_to_excel('indicator', object=indicator_metadata) updated_indicator_metadata = mm.read_metadata_from_excel(filename) ``` - -Note that the Excel write and save functions do not currently support Geospatial metadata. - The manager also offers a convenient way to get started creating metadata in pydantic by creating an empty pydantic object for a given metadata type which can then be updated as needed. 
```python @@ -98,7 +97,7 @@ Next update the pydantic schemas so that they match the latest json schemas by r Finally update the Excel sheets by running - `python pydantic_schemas/generators/generate_excel_files.py` + `python -m pydantic_schemas.generators.generate_excel_files` ## Versioning conventions for schemas diff --git a/excel_sheets/Document_metadata.xlsx b/excel_sheets/Document_metadata.xlsx index f9ec93d..2020cc3 100644 Binary files a/excel_sheets/Document_metadata.xlsx and b/excel_sheets/Document_metadata.xlsx differ diff --git a/excel_sheets/Geospatial_metadata.xlsx b/excel_sheets/Geospatial_metadata.xlsx new file mode 100644 index 0000000..77d68d8 Binary files /dev/null and b/excel_sheets/Geospatial_metadata.xlsx differ diff --git a/excel_sheets/Image_metadata.xlsx b/excel_sheets/Image_metadata.xlsx new file mode 100644 index 0000000..99809cc Binary files /dev/null and b/excel_sheets/Image_metadata.xlsx differ diff --git a/excel_sheets/Indicators_db_metadata.xlsx b/excel_sheets/Indicators_db_metadata.xlsx index 83e6331..a563383 100644 Binary files a/excel_sheets/Indicators_db_metadata.xlsx and b/excel_sheets/Indicators_db_metadata.xlsx differ diff --git a/excel_sheets/Microdata_metadata.xlsx b/excel_sheets/Microdata_metadata.xlsx index 6f1691e..b6d47d5 100644 Binary files a/excel_sheets/Microdata_metadata.xlsx and b/excel_sheets/Microdata_metadata.xlsx differ diff --git a/excel_sheets/Script_metadata.xlsx b/excel_sheets/Script_metadata.xlsx index 9038bdc..2b725d5 100644 Binary files a/excel_sheets/Script_metadata.xlsx and b/excel_sheets/Script_metadata.xlsx differ diff --git a/excel_sheets/Table_metadata.xlsx b/excel_sheets/Table_metadata.xlsx index 422a2cc..16cc3fc 100644 Binary files a/excel_sheets/Table_metadata.xlsx and b/excel_sheets/Table_metadata.xlsx differ diff --git a/excel_sheets/Video_metadata.xlsx b/excel_sheets/Video_metadata.xlsx index 6fc08f9..7a541a6 100644 Binary files a/excel_sheets/Video_metadata.xlsx and 
b/excel_sheets/Video_metadata.xlsx differ diff --git a/pydantic_schemas/document_schema.py b/pydantic_schemas/document_schema.py index e15b6ea..e21163f 100644 --- a/pydantic_schemas/document_schema.py +++ b/pydantic_schemas/document_schema.py @@ -4,7 +4,7 @@ from __future__ import annotations from enum import Enum -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional from pydantic import Extra, Field @@ -261,44 +261,6 @@ class Tag(SchemaBaseModel): tag_group: Optional[str] = Field(None, title="Tag group") -class ModelInfoItem(SchemaBaseModel): - source: Optional[str] = Field(None, title="Source") - author: Optional[str] = Field(None, title="Author") - version: Optional[str] = Field(None, title="Version") - model_id: Optional[str] = Field(None, title="Model Identifier") - nb_topics: Optional[float] = Field(None, title="Number of topics") - description: Optional[str] = Field(None, title="Description") - corpus: Optional[str] = Field(None, title="Corpus name") - uri: Optional[str] = Field(None, title="URI") - - -class TopicWord(SchemaBaseModel): - word: Optional[str] = Field(None, title="Word") - word_weight: Optional[float] = Field(None, title="Word weight") - - -class TopicDescriptionItem(SchemaBaseModel): - topic_id: Optional[Union[int, str]] = Field(None, title="Topic identifier") - topic_score: Optional[Union[float, str]] = Field(None, title="Topic score") - topic_label: Optional[str] = Field(None, title="Topic label") - topic_words: Optional[List[TopicWord]] = Field(None, description="Words", title="Topic words") - - -class LdaTopic(SchemaBaseModel): - class Config: - extra = Extra.forbid - - model_info: Optional[List[ModelInfoItem]] = Field(None, title="Model information") - topic_description: Optional[List[TopicDescriptionItem]] = Field(None, title="Topic information") - - -class Embedding(SchemaBaseModel): - id: str = Field(..., title="Vector Model ID") - description: Optional[str] = Field(None, title="Vector Model 
Description") - date: Optional[str] = Field(None, title="Date (YYYY-MM-DD)") - vector: Union[Dict[str, Any], List[Any]] = Field(..., title="Vector") - - class OriginDescription(SchemaBaseModel): harvest_date: Optional[str] = Field(None, description="Harvest date using UTC date format") altered: Optional[bool] = Field( @@ -587,6 +549,4 @@ class ScriptSchemaDraft(SchemaBaseModel): ) provenance: Optional[List[ProvenanceSchema]] = Field(None, description="Provenance") tags: Optional[List[Tag]] = Field(None, description="Tags", title="Tags") - lda_topics: Optional[List[LdaTopic]] = Field(None, description="LDA topics", title="LDA topics") - embeddings: Optional[List[Embedding]] = Field(None, description="Word embeddings", title="Word embeddings") additional: Optional[Dict[str, Any]] = Field(None, description="Additional metadata") diff --git a/pydantic_schemas/generators/generate_excel_files.py b/pydantic_schemas/generators/generate_excel_files.py index d1acbfb..a43725a 100644 --- a/pydantic_schemas/generators/generate_excel_files.py +++ b/pydantic_schemas/generators/generate_excel_files.py @@ -16,9 +16,9 @@ def compare_excel_files(file1, file2): # Check if both workbooks have the same sheets if sheets1 != sheets2: - print("Sheet names do not match") - print(f"File1 sheets: {sheets1}") - print(f"File2 sheets: {sheets2}") + # print("Sheet names do not match") + # print(f"File1 sheets: {sheets1}") + # print(f"File2 sheets: {sheets2}") return False # Iterate through each sheet @@ -62,9 +62,9 @@ def compare_excel_files(file1, file2): differences.append(f"Alignment: {ws1[cell_address].alignment} != {ws2[cell_address].alignment}") if differences: - print(f"Differences found at {sheet_name} {cell_address}:") - for difference in differences: - print(f" - {difference}") + # print(f"Differences found at {sheet_name} {cell_address}:") + # for difference in differences: + # print(f" - {difference}") return False return True @@ -73,8 +73,6 @@ def compare_excel_files(file1, file2): 
metadata_manager = MetadataManager() for metadata_name in metadata_manager.metadata_type_names: - if metadata_name in ["image", "geospatial"]: - continue filename = f"excel_sheets/{metadata_name.capitalize()}_metadata.xlsx" print(f"Writing {metadata_name} outline to {filename}") if os.path.exists(filename): diff --git a/pydantic_schemas/geospatial_schema.py b/pydantic_schemas/geospatial_schema.py index 3678503..d04b1c7 100644 --- a/pydantic_schemas/geospatial_schema.py +++ b/pydantic_schemas/geospatial_schema.py @@ -4,7 +4,7 @@ from __future__ import annotations from enum import Enum -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional from pydantic import Extra, Field, confloat @@ -493,44 +493,6 @@ class Tag(SchemaBaseModel): tag_group: Optional[str] = Field(None, title="Tag group") -class ModelInfoItem(SchemaBaseModel): - source: Optional[str] = Field(None, title="Source") - author: Optional[str] = Field(None, title="Author") - version: Optional[str] = Field(None, title="Version") - model_id: Optional[str] = Field(None, title="Model Identifier") - nb_topics: Optional[float] = Field(None, title="Number of topics") - description: Optional[str] = Field(None, title="Description") - corpus: Optional[str] = Field(None, title="Corpus name") - uri: Optional[str] = Field(None, title="URI") - - -class TopicWord(SchemaBaseModel): - word: Optional[str] = Field(None, title="Word") - word_weight: Optional[float] = Field(None, title="Word weight") - - -class TopicDescriptionItem(SchemaBaseModel): - topic_id: Optional[Union[int, str]] = Field(None, title="Topic identifier") - topic_score: Optional[Union[float, str]] = Field(None, title="Topic score") - topic_label: Optional[str] = Field(None, title="Topic label") - topic_words: Optional[List[TopicWord]] = Field(None, description="Words", title="Topic words") - - -class LdaTopic(SchemaBaseModel): - class Config: - extra = Extra.forbid - - model_info: Optional[List[ModelInfoItem]] = 
Field(None, title="Model information") - topic_description: Optional[List[TopicDescriptionItem]] = Field(None, title="Topic information") - - -class Embedding(SchemaBaseModel): - id: str = Field(..., title="Vector Model ID") - description: Optional[str] = Field(None, title="Vector Model Description") - date: Optional[str] = Field(None, title="Date (YYYY-MM-DD)") - vector: Dict[str, Any] = Field(..., title="Vector") - - class ResourceSchema(SchemaBaseModel): """ External resource schema @@ -1523,8 +1485,6 @@ class GeospatialSchema(SchemaBaseModel): description: Description = Field(..., title="Geospatial schema") provenance: Optional[List[ProvenanceSchema]] = Field(None, description="Provenance") tags: Optional[List[Tag]] = Field(None, description="Tags", title="Tags") - lda_topics: Optional[List[LdaTopic]] = Field(None, description="LDA topics", title="LDA topics") - embeddings: Optional[List[Embedding]] = Field(None, description="Word embeddings", title="Word embeddings") additional: Optional[Dict[str, Any]] = Field( None, description="Any additional metadata", title="Additional metadata" ) diff --git a/pydantic_schemas/image_schema.py b/pydantic_schemas/image_schema.py index a586e22..a71d648 100644 --- a/pydantic_schemas/image_schema.py +++ b/pydantic_schemas/image_schema.py @@ -5,9 +5,9 @@ from datetime import datetime from enum import Enum -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional -from pydantic import AnyUrl, Extra, Field, confloat, constr +from pydantic import AnyUrl, Extra, Field, confloat from .utils.schema_base_model import SchemaBaseModel @@ -71,44 +71,6 @@ class Tag(SchemaBaseModel): tag_group: Optional[str] = Field(None, title="Tag group") -class ModelInfoItem(SchemaBaseModel): - source: Optional[str] = Field(None, title="Source") - author: Optional[str] = Field(None, title="Author") - version: Optional[str] = Field(None, title="Version") - model_id: Optional[str] = Field(None, title="Model 
Identifier") - nb_topics: Optional[float] = Field(None, title="Number of topics") - description: Optional[str] = Field(None, title="Description") - corpus: Optional[str] = Field(None, title="Corpus name") - uri: Optional[str] = Field(None, title="URI") - - -class TopicWord(SchemaBaseModel): - word: Optional[str] = Field(None, title="Word") - word_weight: Optional[float] = Field(None, title="Word weight") - - -class TopicDescriptionItem(SchemaBaseModel): - topic_id: Optional[Union[int, str]] = Field(None, title="Topic identifier") - topic_score: Optional[Union[float, str]] = Field(None, title="Topic score") - topic_label: Optional[str] = Field(None, title="Topic label") - topic_words: Optional[List[TopicWord]] = Field(None, description="Words", title="Topic words") - - -class LdaTopic(SchemaBaseModel): - class Config: - extra = Extra.forbid - - model_info: Optional[List[ModelInfoItem]] = Field(None, title="Model information") - topic_description: Optional[List[TopicDescriptionItem]] = Field(None, title="Topic information") - - -class Embedding(SchemaBaseModel): - id: str = Field(..., title="Vector Model ID") - description: Optional[str] = Field(None, title="Vector Model Description") - date: Optional[str] = Field(None, title="Date (YYYY-MM-DD)") - vector: Dict[str, Any] = Field(..., title="Vector") - - class SceneCodesLabelledItem(SchemaBaseModel): code: Optional[str] = Field(None, description="Scene code as a string of 6 digits", title="Scene Code") label: Optional[str] = Field(None, description="Label", title="Scene Label") @@ -139,18 +101,6 @@ class Config: description: Optional[str] = None -class AltLangObject(SchemaBaseModel): - class Config: - extra = Extra.forbid - - __root__: Dict[ - constr( - 
regex=r"^(((([A-Za-z]{2,3}(-([A-Za-z]{3}(-[A-Za-z]{3}){0,2}))?)|[A-Za-z]{4}|[A-Za-z]{5,8})(-([A-Za-z]{4}))?(-([A-Za-z]{2}|[0-9]{3}))?(-([A-Za-z0-9]{5,8}|[0-9][A-Za-z0-9]{3}))*(-([0-9A-WY-Za-wy-z](-[A-Za-z0-9]{2,8})+))*(-(x(-[A-Za-z0-9]{1,8})+))?)|(x(-[A-Za-z0-9]{1,8})+))$" - ), - str, - ] = Field(..., description="Text in alternative languages") - - class ArtworkOrObject(SchemaBaseModel): class Config: extra = Extra.forbid @@ -1176,6 +1126,4 @@ class ImageDataTypeSchema(SchemaBaseModel): image_description: Optional[ImageDescription] = None provenance: Optional[List[ProvenanceSchema]] = Field(None, description="Provenance") tags: Optional[List[Tag]] = Field(None, description="Tags", title="Tags") - lda_topics: Optional[List[LdaTopic]] = Field(None, description="LDA topics", title="LDA topics") - embeddings: Optional[List[Embedding]] = Field(None, description="Word embeddings", title="Word embeddings") additional: Optional[Dict[str, Any]] = Field(None, description="Additional metadata") diff --git a/pydantic_schemas/metadata_manager.py b/pydantic_schemas/metadata_manager.py index 70f9c69..acd9434 100644 --- a/pydantic_schemas/metadata_manager.py +++ b/pydantic_schemas/metadata_manager.py @@ -4,9 +4,10 @@ from openpyxl import load_workbook from pydantic import BaseModel -from . import ( # image_schema, +from . 
import ( document_schema, geospatial_schema, + image_schema, indicator_schema, indicators_db_schema, microdata_schema, @@ -32,7 +33,7 @@ class MetadataManager: _TYPE_TO_SCHEMA = { "document": document_schema.ScriptSchemaDraft, "geospatial": geospatial_schema.GeospatialSchema, - # "image":image_schema.ImageDataTypeSchema, + "image": image_schema.ImageDataTypeSchema, "resource": resource_schema.Model, "script": script_schema.ResearchProjectSchemaDraft, "microdata": microdata_schema.MicrodataSchema, @@ -44,8 +45,8 @@ class MetadataManager: _TYPE_TO_WRITER = { "document": write_across_many_sheets, - # "geospatial":, - # "image":, + "geospatial": write_across_many_sheets, + "image": write_across_many_sheets, "resource": write_to_single_sheet, "script": write_across_many_sheets, "microdata": write_across_many_sheets, @@ -57,8 +58,8 @@ class MetadataManager: _TYPE_TO_READER = { "document": excel_doc_to_pydantic, - # "geospatial":, - # "image":, + "geospatial": excel_doc_to_pydantic, + "image": excel_doc_to_pydantic, "resource": excel_single_sheet_to_pydantic, "script": excel_doc_to_pydantic, "microdata": excel_doc_to_pydantic, @@ -126,8 +127,8 @@ def write_metadata_outline_to_excel( """ if isinstance(metadata_name_or_class, str): metadata_name = self.standardize_metadata_name(metadata_name_or_class) - if metadata_name == "geospatial": - raise NotImplementedError("Geospatial schema contains an infinite loop so cannot be written to excel") + # if metadata_name == "geospatial": + # raise NotImplementedError("Geospatial schema contains an infinite loop so cannot be written to excel") skeleton_object = self.create_metadata_outline(metadata_name, debug=False) writer = self._TYPE_TO_WRITER[metadata_name] if filename is None: @@ -154,6 +155,7 @@ def save_metadata_to_excel( object: BaseModel, filename: Optional[str] = None, title: Optional[str] = None, + verbose: bool = False, ) -> str: """ Save an Excel document of the given metadata object. 
@@ -176,8 +178,8 @@ def save_metadata_to_excel( """ if isinstance(metadata_name_or_class, str): metadata_name = self.standardize_metadata_name(metadata_name_or_class) - if metadata_name == "geospatial": - raise NotImplementedError("Geospatial schema contains an infinite loop so cannot be written to excel") + # if metadata_name == "geospatial": + # raise NotImplementedError("Geospatial schema contains an infinite loop so cannot be written to excel") schema = self.metadata_class_from_name(metadata_name) writer = self._TYPE_TO_WRITER[metadata_name] else: @@ -195,13 +197,11 @@ def save_metadata_to_excel( combined_dict = merge_dicts( skeleton_object.model_dump(), - object.model_dump(exclude_none=True, exclude_unset=True, exclude_defaults=True), + object.model_dump(exclude_none=False, exclude_unset=True, exclude_defaults=True), ) combined_dict = standardize_keys_in_dict(combined_dict) - new_ob = schema(**combined_dict) - - # writer = self._TYPE_TO_WRITER[metadata_name] - writer(filename, new_ob, metadata_name, title) + new_ob = schema.model_validate(combined_dict) + writer(filename, new_ob, metadata_name, title, verbose=verbose) return filename @staticmethod @@ -231,7 +231,9 @@ def _get_metadata_name_from_excel_file(filename: str) -> str: return cell_values[0] - def read_metadata_from_excel(self, filename: str, metadata_class: Optional[Type[BaseModel]] = None) -> BaseModel: + def read_metadata_from_excel( + self, filename: str, metadata_class: Optional[Type[BaseModel]] = None, verbose: bool = False + ) -> BaseModel: """ Read in metadata from an appropriately formatted Excel file as a pydantic object. If using standard metadata types (document, indicator, indicators_db, microdata, resource, script, table, video) then there is no need to pass in the metadata_class. But if using a template, then the class must be provided. 
@@ -255,25 +257,23 @@ def read_metadata_from_excel(self, filename: str, metadata_class: Optional[Type[ ) from e schema = metadata_class reader = excel_single_sheet_to_pydantic - read_object = reader(filename, schema) + read_object = reader(filename, schema, verbose=verbose) skeleton_object = self.create_metadata_outline(metadata_name_or_class=schema, debug=False) - read_object_dict = read_object.model_dump(exclude_none=True, exclude_unset=True, exclude_defaults=True) + read_object_dict = read_object.model_dump(exclude_none=False, exclude_unset=True, exclude_defaults=True) combined_dict = merge_dicts( skeleton_object.model_dump(), read_object_dict, ) combined_dict = standardize_keys_in_dict(combined_dict) - new_ob = schema(**combined_dict) + new_ob = schema.model_validate(combined_dict) return new_ob def _raise_if_unsupported_metadata_name(self, metadata_name: str): """ - If the type is specifically unsupported - geospatial or image - a NotImplementedError is raised + If the type is specifically unsupported a NotImplementedError is raised If the type is simply unknown then a ValueError is raised. """ - if metadata_name == "image": - raise NotImplementedError("Due to an issue with image metadata schema definition causing __root__ errors") if metadata_name not in self._TYPE_TO_SCHEMA.keys(): raise ValueError(f"'{metadata_name}' not supported. 
Must be: {list(self._TYPE_TO_SCHEMA.keys())}") diff --git a/pydantic_schemas/microdata_schema.py b/pydantic_schemas/microdata_schema.py index 036a238..fdf0c0a 100644 --- a/pydantic_schemas/microdata_schema.py +++ b/pydantic_schemas/microdata_schema.py @@ -38,44 +38,6 @@ class Tag(SchemaBaseModel): tag_group: Optional[str] = Field(None, title="Tag group") -class ModelInfoItem(SchemaBaseModel): - source: Optional[str] = Field(None, title="Source") - author: Optional[str] = Field(None, title="Author") - version: Optional[str] = Field(None, title="Version") - model_id: Optional[str] = Field(None, title="Model Identifier") - nb_topics: Optional[float] = Field(None, title="Number of topics") - description: Optional[str] = Field(None, title="Description") - corpus: Optional[str] = Field(None, title="Corpus name") - uri: Optional[str] = Field(None, title="URI") - - -class TopicWord(SchemaBaseModel): - word: Optional[str] = Field(None, title="Word") - word_weight: Optional[float] = Field(None, title="Word weight") - - -class TopicDescriptionItem(SchemaBaseModel): - topic_id: Optional[Union[int, str]] = Field(None, title="Topic identifier") - topic_score: Optional[Union[float, str]] = Field(None, title="Topic score") - topic_label: Optional[str] = Field(None, title="Topic label") - topic_words: Optional[List[TopicWord]] = Field(None, description="Words", title="Topic words") - - -class LdaTopic(SchemaBaseModel): - class Config: - extra = Extra.forbid - - model_info: Optional[List[ModelInfoItem]] = Field(None, title="Model information") - topic_description: Optional[List[TopicDescriptionItem]] = Field(None, title="Topic information") - - -class Embedding(SchemaBaseModel): - id: str = Field(..., title="Vector Model ID") - description: Optional[str] = Field(None, title="Vector Model Description") - date: Optional[str] = Field(None, title="Date (YYYY-MM-DD)") - vector: Dict[str, Any] = Field(..., title="Vector") - - class DatafileSchema(SchemaBaseModel): file_id: str = 
Field(..., title="File unique ID") file_name: str = Field(..., title="File name") @@ -1440,6 +1402,4 @@ class MicrodataSchema(DdiSchema): overwrite: Optional[Overwrite] = Field("no", description="Overwrite survey if already exists?") provenance: Optional[List[ProvenanceSchema]] = Field(None, description="Provenance") tags: Optional[List[Tag]] = Field(None, description="Tags", title="Tags (user-defined)") - lda_topics: Optional[List[LdaTopic]] = Field(None, description="LDA topics", title="LDA topics") - embeddings: Optional[List[Embedding]] = Field(None, description="Word embeddings", title="Word embeddings") additional: Optional[Dict[str, Any]] = Field(None, description="Additional metadata not covered by DDI elements") diff --git a/pydantic_schemas/script_schema.py b/pydantic_schemas/script_schema.py index 6af04b5..74b592a 100644 --- a/pydantic_schemas/script_schema.py +++ b/pydantic_schemas/script_schema.py @@ -4,7 +4,7 @@ from __future__ import annotations from enum import Enum -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional from pydantic import Extra, Field @@ -594,44 +594,6 @@ class Tag(SchemaBaseModel): tag_group: Optional[str] = Field(None, title="Tag group") -class ModelInfoItem(SchemaBaseModel): - source: Optional[str] = Field(None, title="Source") - author: Optional[str] = Field(None, title="Author") - version: Optional[str] = Field(None, title="Version") - model_id: Optional[str] = Field(None, title="Model Identifier") - nb_topics: Optional[float] = Field(None, title="Number of topics") - description: Optional[str] = Field(None, title="Description") - corpus: Optional[str] = Field(None, title="Corpus name") - uri: Optional[str] = Field(None, title="URI") - - -class TopicWord(SchemaBaseModel): - word: Optional[str] = Field(None, title="Word") - word_weight: Optional[float] = Field(None, title="Word weight") - - -class TopicDescriptionItem(SchemaBaseModel): - topic_id: Optional[Union[int, str]] = 
Field(None, title="Topic identifier") - topic_score: Optional[Union[float, str]] = Field(None, title="Topic score") - topic_label: Optional[str] = Field(None, title="Topic label") - topic_words: Optional[List[TopicWord]] = Field(None, description="Words", title="Topic words") - - -class LdaTopic(SchemaBaseModel): - class Config: - extra = Extra.forbid - - model_info: Optional[List[ModelInfoItem]] = Field(None, title="Model information") - topic_description: Optional[List[TopicDescriptionItem]] = Field(None, title="Topic information") - - -class Embedding(SchemaBaseModel): - id: str = Field(..., title="Vector Model ID") - description: Optional[str] = Field(None, title="Vector Model Description") - date: Optional[str] = Field(None, title="Date (YYYY-MM-DD)") - vector: Dict[str, Any] = Field(..., title="Vector") - - class OriginDescription(SchemaBaseModel): harvest_date: Optional[str] = Field(None, description="Harvest date using UTC date format") altered: Optional[bool] = Field( @@ -681,6 +643,4 @@ class ResearchProjectSchemaDraft(SchemaBaseModel): ) provenance: Optional[List[ProvenanceSchema]] = Field(None, description="Provenance") tags: Optional[List[Tag]] = Field(None, description="Tags", title="Tags (user-defined)") - lda_topics: Optional[List[LdaTopic]] = Field(None, description="LDA topics", title="LDA topics") - embeddings: Optional[List[Embedding]] = Field(None, description="Word embeddings", title="Word embeddings") additional: Optional[Dict[str, Any]] = Field(None, description="Additional metadata") diff --git a/pydantic_schemas/table_schema.py b/pydantic_schemas/table_schema.py index 4ea8836..172b401 100644 --- a/pydantic_schemas/table_schema.py +++ b/pydantic_schemas/table_schema.py @@ -4,7 +4,7 @@ from __future__ import annotations from enum import Enum -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional from pydantic import Extra, Field @@ -301,44 +301,6 @@ class Tag(SchemaBaseModel): tag_group: 
Optional[str] = Field(None, title="Tag group") -class ModelInfoItem(SchemaBaseModel): - source: Optional[str] = Field(None, title="Source") - author: Optional[str] = Field(None, title="Author") - version: Optional[str] = Field(None, title="Version") - model_id: Optional[str] = Field(None, title="Model Identifier") - nb_topics: Optional[float] = Field(None, title="Number of topics") - description: Optional[str] = Field(None, title="Description") - corpus: Optional[str] = Field(None, title="Corpus name") - uri: Optional[str] = Field(None, title="URI") - - -class TopicWord(SchemaBaseModel): - word: Optional[str] = Field(None, title="Word") - word_weight: Optional[float] = Field(None, title="Word weight") - - -class TopicDescriptionItem(SchemaBaseModel): - topic_id: Optional[Union[int, str]] = Field(None, title="Topic identifier") - topic_score: Optional[Union[float, str]] = Field(None, title="Topic score") - topic_label: Optional[str] = Field(None, title="Topic label") - topic_words: Optional[List[TopicWord]] = Field(None, description="Words", title="Topic words") - - -class LdaTopic(SchemaBaseModel): - class Config: - extra = Extra.forbid - - model_info: Optional[List[ModelInfoItem]] = Field(None, title="Model information") - topic_description: Optional[List[TopicDescriptionItem]] = Field(None, title="Topic information") - - -class Embedding(SchemaBaseModel): - id: str = Field(..., title="Vector Model ID") - description: Optional[str] = Field(None, title="Vector Model Description") - date: Optional[str] = Field(None, title="Date (YYYY-MM-DD)") - vector: Dict[str, Any] = Field(..., title="Vector") - - class OriginDescription(SchemaBaseModel): harvest_date: Optional[str] = Field(None, description="Harvest date using UTC date format") altered: Optional[bool] = Field( @@ -507,6 +469,4 @@ class Model(SchemaBaseModel): ) provenance: Optional[List[ProvenanceSchema]] = Field(None, description="Provenance") tags: Optional[List[Tag]] = Field(None, description="Tags", 
title="Tags") - lda_topics: Optional[List[LdaTopic]] = Field(None, description="LDA topics", title="LDA topics") - embeddings: Optional[List[Embedding]] = Field(None, description="Word embeddings", title="Word embeddings") additional: Optional[Dict[str, Any]] = Field(None, description="Additional metadata") diff --git a/pydantic_schemas/tests/test_metadata_manager.py b/pydantic_schemas/tests/test_metadata_manager.py index 426ab84..40a0b26 100644 --- a/pydantic_schemas/tests/test_metadata_manager.py +++ b/pydantic_schemas/tests/test_metadata_manager.py @@ -1,10 +1,147 @@ +import random +import string +from copy import copy + import pytest +from pydantic import BaseModel, ValidationError +from utils.quick_start import make_skeleton from pydantic_schemas.metadata_manager import MetadataManager +# Function to generate a random 4-character string +def random_string(length=4): + return "".join(random.choices(string.ascii_letters, k=length)) + + +# Recursive function to traverse and replace Nones or empty strings +def replace_nones_with_random(model: BaseModel): + assert isinstance(model, BaseModel), model + for field_name, field_value in model.__dict__.items(): + # If the field is None or an empty string, replace it with a random string + if field_value is None or field_value == "": + try: + show = field_value is not None or random.random() < 0.7 + setattr(model, field_name, random_string() if show else None) + except ValidationError: + continue + # If the field is another Pydantic model, recursively apply the function + elif isinstance(field_value, BaseModel): + replace_nones_with_random(field_value) + # If the field is a list of models, apply the function to each item + elif isinstance(field_value, list): + n_elements = random.choices([1, 4, 8])[0] + non_null_values = [random.random() < 0.7 for _ in range(n_elements)] + if not any(non_null_values): + continue + elif len(field_value) == 0: + try: + setattr( + model, field_name, [random_string() if non_null_values[i] 
else None for i in range(n_elements)] + ) + except ValidationError: + continue + elif isinstance(field_value[0], BaseModel): + try: + new_vals = [copy(field_value[0]) for i in range(n_elements)] + for v in new_vals: + replace_nones_with_random(v) + setattr( + model, + field_name, + new_vals, + ) + except ValidationError as e: + raise ValueError(f"{field_name}, {new_vals}") from e + # continue + else: + continue + # for item in field_value: + # if isinstance(item, BaseModel): + # replace_nones_with_random(item) + # If the field is a dict, apply the function to each value + elif isinstance(field_value, dict): + for key, item in field_value.items(): + if isinstance(item, BaseModel): + replace_nones_with_random(item) + + +def is_empty(m): + if isinstance(m, BaseModel): + iterabl = [v for _, v in m.model_dump().items()] + elif isinstance(m, dict): + if len(m) == 0: + return True + iterabl = [v for _, v in m.items()] + elif isinstance(m, list): + if len(m) == 0: + return True + iterabl = m + else: + return m is None + + for v in iterabl: + if isinstance(v, dict) or isinstance(v, BaseModel) or isinstance(v, list): + if is_empty(v) == False: + return False + elif v is not None: + return False + return True + + +# Recursive function to compare two Pydantic models +def compare_pydantic_models(model1: BaseModel, model2: BaseModel) -> bool: + # First, check if the two models are of the same type + if type(model1) is not type(model2): + assert False + + if not hasattr(model1, "model_fields"): + assert model1 == model2 + + # Traverse through the fields of the model + for field_name in model1.model_fields: + value1 = getattr(model1, field_name) + value2 = getattr(model2, field_name) + + # If values are different, return False + if value1 != value2: + # If both are BaseModel instances, compare recursively + if isinstance(value1, BaseModel) and isinstance(value2, BaseModel): + if not compare_pydantic_models(value1, value2): + assert False, field_name + # If both are lists, compare 
their elements + elif isinstance(value1, list) and isinstance(value2, list): + value1 = [v for v in value1 if is_empty(v) == False] + value2 = [v for v in value2 if is_empty(v) == False] + # remove empty basemodels + + assert len(value1) == len(value2) + for v1, v2 in zip(value1, value2): + if isinstance(v1, BaseModel) and isinstance(v2, BaseModel): + if not compare_pydantic_models(v1, v2): + assert False, field_name + elif v1 != v2: + assert False, field_name + elif isinstance(value1, list) and value2 is None: + continue + # If both are dicts, compare their items + elif isinstance(value1, dict) and isinstance(value2, dict): + assert value1.keys() == value2.keys() + for key in value1: + if isinstance(value1[key], BaseModel) and isinstance(value2[key], BaseModel): + if not compare_pydantic_models(value1[key], value2[key]): + assert False, field_name + else: + assert value1[key] == value2[key], field_name + else: + assert value1 == value2, field_name # For other types, if they are not equal, return False + + return True # All fields are equal + + @pytest.mark.parametrize( - "metadata_name", ["document", "script", "microdata", "table", "indicators_db", "indicator", "video"] + "metadata_name", + ["document", "script", "microdata", "table", "indicators_db", "indicator", "video", "geospatial", "image"], ) def test_metadata_by_name(tmpdir, metadata_name): mm = MetadataManager() @@ -15,22 +152,38 @@ def test_metadata_by_name(tmpdir, metadata_name): # Write empty metadata filename = mm.write_metadata_outline_to_excel( - metadata_name_or_class=metadata_name, filename=tmpdir.join(f"test_{metadata_name}.xlsx"), title=metadata_name + metadata_name_or_class=metadata_name, + filename=tmpdir.join(f"test_{metadata_name}_outline.xlsx"), + title=metadata_name, ) # Read the metadata back tmp = mm.read_metadata_from_excel(filename=filename) # Save the read metadata to a new file - filename2 = tmpdir.join(f"test_{metadata_name}_2.xlsx") + filename2 = 
tmpdir.join(f"test_{metadata_name}_save.xlsx") mm.save_metadata_to_excel(metadata_name_or_class=metadata_name, object=tmp, filename=filename2, title=metadata_name) - # make an outline object - mm.create_metadata_outline(metadata_name_or_class=metadata_name) + for i in range(10): + modl = mm.create_metadata_outline(metadata_name_or_class=metadata_name) + replace_nones_with_random(modl) + + # Write filled in metadata + filename3 = tmpdir.join(f"test_{metadata_name}_{i}.xlsx") + # filename3 = f"test_{metadata_name}_{i}.xlsx" + mm.save_metadata_to_excel( + metadata_name_or_class=metadata_name, object=modl, filename=filename3, title=metadata_name + ) + + # Read the metadata back + actual = mm.read_metadata_from_excel(filename=filename3) + compare_pydantic_models(modl, actual) + # assert modl == actual, actual @pytest.mark.parametrize( - "metadata_name", ["document", "script", "microdata", "table", "timeseries_db", "indicator", "video"] + "metadata_name", + ["document", "script", "microdata", "table", "timeseries_db", "indicator", "video", "geospatial", "image"], ) def test_metadata_by_class(tmpdir, metadata_name): mm = MetadataManager() @@ -64,6 +217,8 @@ def test_standardize_metadata_name(): "INdicator", "timeseries", "VIdeo", + "image", + "IMaGe", ] expecteds = [ @@ -79,14 +234,13 @@ def test_standardize_metadata_name(): "indicator", "indicator", "video", + "image", + "image", ] for inp, expected in zip(inputs, expecteds): actual = mm.standardize_metadata_name(inp) assert actual == expected, f"expected {expected} but got {actual}" - with pytest.raises(NotImplementedError): - mm.standardize_metadata_name("Image") - with pytest.raises(ValueError): mm.standardize_metadata_name("Bad-name") diff --git a/pydantic_schemas/tests/test_pydantic_to_excel.py b/pydantic_schemas/tests/test_pydantic_to_excel.py index e5884bc..4a24e11 100644 --- a/pydantic_schemas/tests/test_pydantic_to_excel.py +++ b/pydantic_schemas/tests/test_pydantic_to_excel.py @@ -1,17 +1,16 @@ import os from 
enum import Enum -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union import pandas as pd import pytest from pydantic import BaseModel, Field from pydantic_schemas.document_schema import ScriptSchemaDraft +from pydantic_schemas.geospatial_schema import GeospatialSchema +from pydantic_schemas.image_schema import ImageDataTypeSchema from pydantic_schemas.indicator_schema import TimeseriesSchema from pydantic_schemas.indicators_db_schema import TimeseriesDatabaseSchema - -# from pydantic_schemas.definitions.geospatial_schema import GeospatialSchema -# from pydantic_schemas.definitions.image_schema import ImageDataTypeSchema from pydantic_schemas.microdata_schema import MicrodataSchema from pydantic_schemas.script_schema import ResearchProjectSchemaDraft from pydantic_schemas.table_schema import Model as TableModel @@ -263,7 +262,9 @@ class ProductionAndCountries(BaseModel): filename, example_production_and_country, "ProductionAndCountries", "Production and Countries" ) - new_pandc = excel_sheet_to_pydantic(filename=filename, sheetname="metadata", model_type=ProductionAndCountries) + new_pandc = excel_sheet_to_pydantic( + filename=filename, sheetname="metadata", model_type=ProductionAndCountries, debug=True + ) assert new_pandc.production.idno is None assert new_pandc.production.title is None assert len(new_pandc.production.authors) == 4 @@ -324,7 +325,7 @@ class ProductionAndCountries(BaseModel): filename, example_production_and_country, "ProductionAndCountries", "Production and Countries" ) - new_pandc = excel_doc_to_pydantic(filename, ProductionAndCountries) + new_pandc = excel_doc_to_pydantic(filename, ProductionAndCountries, verbose=True) assert new_pandc.production.idno == "myidno" assert new_pandc.production.title is None assert len(new_pandc.production.authors) == 4 @@ -341,6 +342,46 @@ class ProductionAndCountries(BaseModel): assert new_pandc.single_val == "single" +def test_union_list(tmpdir): + class 
Method(BaseModel): + """ + Methodology and processing + """ + + study_class: Optional[Union[str, List[Any]]] = Field( + None, + description=( + "Generally used to give the data archive's class or study status number, which indicates the processing" + " status of the study. May also be used as a text field to describe processing status. Example: `DDA Class" + " C`, `Study is available from http://example.com` " + ), + title="Class of the Study", + ) + + class StudyDesc(BaseModel): + """ + Study Description + """ + + method: Optional[Method] = Field( + None, description="Methodology and processing", title="Methodology and Processing" + ) + + class MicrodataSchema(BaseModel): + """ + Schema for Microdata data type based on DDI 2.5 + """ + + study_desc: Optional[StudyDesc] = None + + ms = MicrodataSchema(study_desc=StudyDesc(method=Method(study_class=["a1", "b2"]))) + filename = tmpdir.join(f"integration_test_union_list_.xlsx") + write_across_many_sheets(filename, ms, "UnionList", "Looking at a union with a list") + + parsed_outp = excel_doc_to_pydantic(filename, MicrodataSchema) + assert parsed_outp == ms, parsed_outp + + def test_dictionaries(tmpdir): class SubDict(BaseModel): sub_additional: Optional[Dict[str, Any]] = Field(None, description="Additional metadata at a lower level") @@ -358,19 +399,120 @@ class WithDict(BaseModel): assert parsed_outp == wd, parsed_outp +def test_list_of_lists(tmpdir): + class Citation(BaseModel): + """ + A set of elements to describe a resource citation + """ + + title: Optional[str] = Field(None, description="Resource title", title="Title") + alternateTitle: Optional[List[str]] = Field( + None, description="Resource alternate title", title="Alternate Title" + ) + + class IdentificationInfo(BaseModel): + """ + Identification(s) of the resource + """ + + citation: Optional[Citation] = Field(None, description="Dataset citation", title="Citation") + + class LegalConstraints(BaseModel): + """ + Legal constraints associated to the resource 
+ """ + + useLimitation: Optional[List[str]] = None + accessConstraints: Optional[List[str]] = Field( + None, + description=( + "A restriction to access/use a resource. e.g. 'dataset'. Recommended code following the [ISO/TS" + " 19139](http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#MD_RestrictionCode) Restriction" + " codelist. Suggested values: {`copyright`, `patent`, `patentPending`, `trademark`, `license`," + " `intellectualPropertyRights`, `restricted`, `otherRestrictions`, `unrestricted`, `licenceUnrestricted`," + " `licenceEndUser`, `licenceDistributor`, `private`, `statutory`, `confidential`, `SBU`, `in-confidence`}" + ), + title="Access constraints", + ) + + class Constraints(BaseModel): + """ + Constraints associated to the resource + """ + + legalConstraints: Optional[LegalConstraints] = Field( + None, description="Legal constraints associated to the resource", title="Legal constraints" + ) + + class ServiceIdentification(BaseModel): + """ + Service identification + """ + + restrictions: Optional[List[Constraints]] = Field( + None, description="Constraints associated to the service", title="Service constraints" + ) + + class MetaDataOfVariousHierarchies(BaseModel): + citation: Optional[Citation] = None + identification_info: Optional[IdentificationInfo] = None + lst: Optional[List[str]] = (None,) + service_identification: Optional[ServiceIdentification] = None + + inp = MetaDataOfVariousHierarchies( + citation=Citation(title="topleveltitle", alternateTitle=[]), + identification_info=IdentificationInfo( + citation=Citation(title="citation_title", alternateTitle=["alt_title_1", "alt_title_2"]) + ), + lst=["a", "b", "c"], + service_identification=ServiceIdentification( + restrictions=[ + Constraints(legalConstraints=LegalConstraints(useLimitation=["s1", "s2"], accessConstraints=["s3"])) + ] + ), + ) + + # index = pd.MultiIndex.from_tuples([("identification_info", "citation", "title"), ("identification_info", "citation", "alternateTitle"), 
("service_identification", "restrictions", "legalConstraints", "useLimitation"), ("service_identification", "restrictions", "legalConstraints", "accessConstraints")]) + + # expected = pd.DataFrame([["citation_title", None], ["alt_title_1", "alt_title_2"], [[], None], [[], None]], index=index) + + filename = tmpdir.join(f"integration_test_list_of_lists_.xlsx") + # filename = "integration_test_list_of_lists_.xlsx" + if os.path.exists(filename): + os.remove(filename) + write_across_many_sheets(filename, inp, "ListOfLists", "Looking at lists of lists") + + expected = inp + expected.citation.alternateTitle = None + actual = excel_doc_to_pydantic(filename, MetaDataOfVariousHierarchies, verbose=True) + # assert actual == inp, actual + + # outp = pydantic_to_dataframe(inp) + # actual = outp[0] + # list_indices = outp[1] + # enums = outp[2] + assert expected.citation == actual.citation, actual.citation + assert expected.identification_info == actual.identification_info, actual.identification_info + assert expected.service_identification == actual.service_identification, actual.service_identification + assert expected.lst == actual.lst, actual.lst + assert expected == actual, actual + # assert list_indices == [1, 2, 3], list_indices + # assert enums == {}, enums + + NAME_TO_TYPE = { "Document": (ScriptSchemaDraft, write_across_many_sheets, excel_doc_to_pydantic), - # "Geospatial":GeospatialSchema, - # "Image":ImageDataTypeSchema, - "Survey": (MicrodataSchema, write_across_many_sheets, excel_doc_to_pydantic), + "Geospatial": (GeospatialSchema, write_across_many_sheets, excel_doc_to_pydantic), + "Image": (ImageDataTypeSchema, write_across_many_sheets, excel_doc_to_pydantic), + "Microdata": (MicrodataSchema, write_across_many_sheets, excel_doc_to_pydantic), "Script": (ResearchProjectSchemaDraft, write_across_many_sheets, excel_doc_to_pydantic), "Table": (TableModel, write_across_many_sheets, excel_doc_to_pydantic), - "Timeseries_DB": ( + "Indicator_DB": ( 
TimeseriesDatabaseSchema, write_to_single_sheet, excel_single_sheet_to_pydantic, ), # could be one sheet - "Timeseries": (TimeseriesSchema, write_across_many_sheets, excel_doc_to_pydantic), + "Indicator": (TimeseriesSchema, write_across_many_sheets, excel_doc_to_pydantic), "Video": (VideoModel, write_to_single_sheet, excel_single_sheet_to_pydantic), # could be one sheet } @@ -380,12 +522,14 @@ def test_write_real_skeleton(tmpdir, name, type_writer_reader): type, writer, reader = type_writer_reader # folder = "excel_sheets" filename = os.path.join(tmpdir, f"{name}_metadata.xlsx") + # filename = f"{name}_metadata_real_sckele.xlsx" if os.path.exists(filename): os.remove(filename) ob = make_skeleton(type) writer(filename, ob, name, f"{name} Metadata") - reader(filename, type) + reader(filename, type, verbose=True) + # assert False def test_demo(): @@ -414,6 +558,59 @@ class SubObject(BaseModel): a: str b: str + class Citation(BaseModel): + """ + A set of elements to describe a resource citation + """ + + title: Optional[str] = Field(None, description="Resource title", title="Title") + alternateTitle: Optional[List[str]] = Field( + None, description="Resource alternate title", title="Alternate Title" + ) + + class IdentificationInfo(BaseModel): + """ + Identification(s) of the resource + """ + + citation: Optional[Citation] = Field(None, description="Dataset citation", title="Citation") + + class LegalConstraints(BaseModel): + """ + Legal constraints associated to the resource + """ + + useLimitation: Optional[List[str]] = None + accessConstraints: Optional[List[str]] = Field( + None, + description=( + "A restriction to access/use a resource. e.g. 'dataset'. Recommended code following the [ISO/TS" + " 19139](http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#MD_RestrictionCode) Restriction" + " codelist. 
Suggested values: {`copyright`, `patent`, `patentPending`, `trademark`, `license`," + " `intellectualPropertyRights`, `restricted`, `otherRestrictions`, `unrestricted`, `licenceUnrestricted`," + " `licenceEndUser`, `licenceDistributor`, `private`, `statutory`, `confidential`, `SBU`, `in-confidence`}" + ), + title="Access constraints", + ) + + class Constraints(BaseModel): + """ + Constraints associated to the resource + """ + + legalConstraints: Optional[LegalConstraints] = Field( + None, description="Legal constraints associated to the resource", title="Legal constraints" + ) + + class ServiceIdentification(BaseModel): + """ + Service identification + """ + + restrictions: Optional[List[Constraints]] = Field( + None, description="Constraints associated to the service", title="Service constraints" + ) + class MetaDataOfVariousHierarchies(BaseModel): idno: Optional[str] = None database_name: Optional[str] = None @@ -423,6 +620,8 @@ class MetaDataOfVariousHierarchies(BaseModel): top_level_optional_list: Optional[List[str]] = None top_level_list_of_pydantic_objects: List[SubObject] dictionary: Dict[str, str] + identification_info: Optional[IdentificationInfo] = None + service_identification: Optional[ServiceIdentification] = None example = MetaDataOfVariousHierarchies( single_level_data=SingleLevelData(title="Metadata demo", author="FirstName LastName"), @@ -435,19 +634,17 @@ class MetaDataOfVariousHierarchies(BaseModel): organization="Example Org", ), top_level_list=["a", "b"], - top_level_list_of_pydantic_objects=[SubObject(a="a", b="b")], + top_level_list_of_pydantic_objects=[SubObject(a="asub", b="b")], dictionary={"example_key": "example_value"}, + identification_info=IdentificationInfo( + citation=Citation(title="citation_title", alternateTitle=["alt_title_1", "alt_title_2"]) + ), + service_identification=ServiceIdentification( + restrictions=[Constraints(legalConstraints=LegalConstraints(useLimitation=[], accessConstraints=[]))] + ), ) if 
os.path.exists(filename): os.remove(filename) - write_to_single_sheet(filename, example, "MetaDataOfVariousHierarchies", sheet_title) - - # current_row = create_sheet_and_write_title(filename, sheetname, sheet_title) - # current_row = write_nested_simple_pydantic_to_sheet(filename, sheetname, example, current_row + 1) - # worksheet = open_or_create_workbook(filename) - # correct_column_widths(worksheet, sheet_name=sheetname) - # shade_30_rows_and_protect_sheet(worksheet, sheetname, current_row + 1) - # shade_locked_cells(worksheet, sheetname) - # worksheet.save(filename) + write_to_single_sheet(filename, example, "MetaDataOfVariousHierarchies", sheet_title, verbose=True) diff --git a/pydantic_schemas/tests/test_pydantic_to_pandas.py b/pydantic_schemas/tests/test_pydantic_to_pandas.py new file mode 100644 index 0000000..e46696b --- /dev/null +++ b/pydantic_schemas/tests/test_pydantic_to_pandas.py @@ -0,0 +1,318 @@ +from typing import Any, Dict, List, Optional, Union + +import pandas as pd +from pydantic import BaseModel, Field +from utils.pydantic_to_excel import pydantic_to_dataframe +from utils.quick_start import make_skeleton + + +def test_simple(): + class Simple(BaseModel): + idno: str + title: Optional[str] = None + author: str + + simple_original = Simple(idno="AVal", author="CVal") + + expected = pd.DataFrame([["AVal"], [None], ["CVal"]], index=["idno", "title", "author"]) + outp = pydantic_to_dataframe(simple_original) + actual = outp[0] + list_indices = outp[1] + enums = outp[2] + assert expected.equals(actual), actual + assert list_indices == [], list_indices + assert enums == {}, enums + + +def test_simple_list(): + class Simple(BaseModel): + idno: str + title: str + authors: List[str] + + simple_original = Simple(idno="AVal", title="BVal", authors=["CVal"]) + + expected = pd.DataFrame([["AVal"], ["BVal"], ["CVal"]], index=["idno", "title", "authors"]) + outp = pydantic_to_dataframe(simple_original, debug=True) + actual = outp[0] + list_indices = outp[1] 
+ enums = outp[2] + print("actual", actual) + assert expected.equals(actual), actual + assert list_indices == [2], list_indices + assert enums == {}, enums + + class SimpleOptional(BaseModel): + idno: str + title: str + authors: Optional[List[str]] + + simple_original_optional = SimpleOptional(idno="AVal", title="BVal", authors=None) + + expected = pd.DataFrame([["AVal"], ["BVal"], [None]], index=["idno", "title", "authors"]) + outp = pydantic_to_dataframe(simple_original_optional) + actual = outp[0] + list_indices = outp[1] + enums = outp[2] + assert expected.equals(actual), actual + assert list_indices == [2], list_indices + assert enums == {}, enums + + simple_original_empty = SimpleOptional(idno="AVal", title="BVal", authors=[]) + + expected = pd.DataFrame([["AVal"], ["BVal"], [None]], index=["idno", "title", "authors"]) + outp = pydantic_to_dataframe(simple_original_empty) + actual = outp[0] + list_indices = outp[1] + enums = outp[2] + assert expected.equals(actual), actual + assert list_indices == [2], list_indices + assert enums == {}, enums + + +def test_subfield(): + class Production(BaseModel): + idno: str + title: str + author: str + + class Country(BaseModel): + name: str + initials: str + + class ProductionAndCountries(BaseModel): + production: Production + countries: Country + + inp = ProductionAndCountries( + production=Production(idno="AVal", title="BVal", author="CVal"), + countries=Country(name="MyCountry", initials="MC"), + ) + + index = pd.MultiIndex.from_tuples( + [ + ("production", "idno"), + ("production", "title"), + ("production", "author"), + ("countries", "name"), + ("countries", "initials"), + ] + ) + expected = pd.DataFrame([["AVal"], ["BVal"], ["CVal"], ["MyCountry"], ["MC"]], index=index) + outp = pydantic_to_dataframe(inp, debug=True) + actual = outp[0] + list_indices = outp[1] + enums = outp[2] + assert expected.equals(actual), actual + assert list_indices == [] + assert enums == {}, enums + + +def test_sublists(): + class 
Citation(BaseModel): + """ + A set of elements to describe a resource citation + """ + + title: Optional[str] = Field(None, description="Resource title", title="Title") + alternateTitle: Optional[List[str]] = Field( + None, description="Resource alternate title", title="Alternate Title" + ) + + class IdentificationInfo(BaseModel): + """ + Identification(s) of the resource + """ + + citation: Optional[Citation] = Field(None, description="Dataset citation", title="Citation") + + class MetaDataOfVariousHierarchies(BaseModel): + citation: Optional[Citation] = None + identification_info: Optional[IdentificationInfo] = None + lst: Optional[List[str]] = None + + inp = MetaDataOfVariousHierarchies( + citation=Citation(title="topleveltitle", alternateTitle=[]), + identification_info=IdentificationInfo( + citation=Citation(title="citation_title", alternateTitle=["alt_title_1", "alt_title_2"]) + ), + lst=["a", "b", "c"], + ) + + index = pd.MultiIndex.from_tuples( + [ + ("lst",), + ("citation", "title"), + ("citation", "alternateTitle"), + ("identification_info", "citation", "title"), + ("identification_info", "citation", "alternateTitle"), + ] + ) + expected = pd.DataFrame( + [["a", "b", "c"], ["topleveltitle"], [], ["citation_title"], ["alt_title_1", "alt_title_2"]], index=index + ) + outp = pydantic_to_dataframe(inp, debug=True) + actual = outp[0] + list_indices = outp[1] + enums = outp[2] + assert "lst" in actual.index + assert "citation" in actual.index + assert "identification_info" in actual.index + print("Gordon", expected.loc["lst"]) + print(actual.loc["lst"]) + assert expected.loc["lst"].equals(actual.loc["lst"]), actual.loc["lst"] + assert expected.equals(actual), actual + assert list_indices == [0, 2, 4], list_indices + assert enums == {}, enums + + +def test_list_of_lists(): + class Citation(BaseModel): + """ + A set of elements to describe a resource citation + """ + + title: Optional[str] = Field(None, description="Resource title", title="Title") + 
alternateTitle: Optional[List[str]] = Field( + None, description="Resource alternate title", title="Alternate Title" + ) + + class IdentificationInfo(BaseModel): + """ + Identification(s) of the resource + """ + + citation: Optional[Citation] = Field(None, description="Dataset citation", title="Citation") + + class LegalConstraints(BaseModel): + """ + Legal constraints associated to the resource + """ + + useLimitation: Optional[List[str]] = None + accessConstraints: Optional[List[str]] = Field( + None, + description=( + "A restriction to access/use a resource. e.g. 'dataset'. Recommended code following the [ISO/TS" + " 19139](http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#MD_RestrictionCode) Restriction" + " codelist. Suggested values: {`copyright`, `patent`, `patentPending`, `trademark`, `license`," + " `intellectualPropertyRights`, `restricted`, `otherRestrictions`, `unrestricted`, `licenceUnrestricted`," + " `licenceEndUser`, `licenceDistributor`, `private`, `statutory`, `confidential`, `SBU`, `in-confidence`}" + ), + title="Access constraints", + ) + + class Constraints(BaseModel): + """ + Constraints associated to the resource + """ + + legalConstraints: Optional[LegalConstraints] = Field( + None, description="Legal constraints associated to the resource", title="Legal constraints" + ) + + class ServiceIdentification(BaseModel): + """ + Service identification + """ + + restrictions: Optional[List[Constraints]] = Field( + None, description="Constraints associated to the service", title="Service constraints" + ) + + class MetaDataOfVariousHierarchies(BaseModel): + # citation: Optional[Citation] = None + identification_info: Optional[IdentificationInfo] = None + # lst: Optional[List[str]] = None, + service_identification: Optional[ServiceIdentification] = None + + inp = MetaDataOfVariousHierarchies( + # citation = Citation(title="topleveltitle", alternateTitle=[]), + identification_info=IdentificationInfo( + 
citation=Citation(title="citation_title", alternateTitle=["alt_title_1", "alt_title_2"]) + ), + # lst = ["a", 'b', 'c'], + service_identification=ServiceIdentification( + restrictions=[Constraints(legalConstraints=LegalConstraints(useLimitation=[], accessConstraints=[]))] + ), + ) + + index = pd.MultiIndex.from_tuples( + [ + ("identification_info", "citation", "title"), + ("identification_info", "citation", "alternateTitle"), + ("service_identification", "restrictions", "legalConstraints", "useLimitation"), + ("service_identification", "restrictions", "legalConstraints", "accessConstraints"), + ] + ) + + expected = pd.DataFrame( + [["citation_title", None], ["alt_title_1", "alt_title_2"], [[], None], [[], None]], index=index + ) + + outp = pydantic_to_dataframe(inp) + actual = outp[0] + list_indices = outp[1] + enums = outp[2] + assert expected.loc["identification_info"].equals(actual.loc["identification_info"]), actual.loc[ + "identification_info" + ] + assert expected.loc["service_identification"].equals(actual.loc["service_identification"]), actual.loc[ + "service_identification" + ] + assert expected.equals(actual), actual + assert list_indices == [1, 2, 3], list_indices + assert enums == {}, enums + + +def test_dictionary(): + class Embedding(BaseModel): + id: str = Field(..., title="Vector Model ID") + description: Optional[str] = Field(None, title="Vector Model Description") + date: Optional[str] = Field(None, title="Date (YYYY-MM-DD)") + vector: Union[Dict[str, Any], List[Any]] = Field(..., title="Vector") + + emb = make_skeleton(Embedding) + df, _, _ = pydantic_to_dataframe(emb, debug=True) + + emb = Embedding(id="sjc", description="ekjrv", date="2024-01-01", vector={"1": "a", "2": "b"}) + df, _, _ = pydantic_to_dataframe(emb, debug=True) + assert df.loc["id"].values[0][0] == "sjc", df.loc["id"] + assert df.loc["description"].values[0][0] == "ekjrv", df.loc["description"] + assert df.loc["date"].values[0][0] == "2024-01-01", df.loc["date"] + assert 
df.loc["vector"].loc["key"].values[0] == "1", df.loc["vector"].loc["key"] + assert df.loc["vector"].loc["key"].values[1] == "2", df.loc["vector"].loc["key"] + assert df.loc["vector"].loc["value"].values[0] == "a", df.loc["vector"].loc["value"] + assert df.loc["vector"].loc["value"].values[1] == "b", df.loc["vector"].loc["value"] + + emb = Embedding(id="sjc", description="ekjrv", date="2024-01-01", vector=[1, 2, 3]) + df, _, _ = pydantic_to_dataframe(emb, debug=True) + assert df.loc["id"].values[0] == "sjc", df.loc["id"] + assert df.loc["description"].values[0] == "ekjrv", df.loc["description"] + assert df.loc["date"].values[0] == "2024-01-01", df.loc["date"] + assert df.loc["vector"].values[0] == 1, df.loc["vector"] + assert df.loc["vector"].values[1] == 2, df.loc["vector"] + assert df.loc["vector"].values[2] == 3, df.loc["vector"] + + # # lists of embeddings + # TODO make a list of dicts work + # class Parent(BaseModel): + # embeddings: Optional[List[Embedding]] = Field(None, description="Word embeddings", title="Word embeddings") + + # emb = make_skeleton(Parent) + # df, _, _ = pydantic_to_dataframe(emb, debug=True) + + # emb = Parent(embeddings=[Embedding(id="sjc", description="ekjrv", date="2024-01-01", vector={"1": "a", "2": "b"})]) + # df, _, _ = pydantic_to_dataframe(emb, debug=True) + # assert df.loc["embeddings"].loc["id"].values[0][0] == "sjc", df.loc["embeddings"].loc["id"] + # assert df.loc["embeddings"].loc["description"].values[0][0] == "ekjrv", df.loc["embeddings"].loc["description"] + # assert df.loc["embeddings"].loc["date"].values[0][0] == "2024-01-01", df.loc["embeddings"].loc["date"] + # # assert False, df.loc["embeddings"] + # assert df.loc["embeddings"].loc["vector"].loc["key"].values[0] == "1", df.loc["embeddings"].loc["vector"].loc["key"] + # assert df.loc["embeddings"].loc["vector"].loc["key"].values[1] == "2", df.loc["embeddings"].loc["vector"].loc["key"] + # assert df.loc["embeddings"].loc["vector"].loc["value"].values[0] == "a", ( + # 
df.loc["embeddings"].loc["vector"].loc["value"] + # ) + # assert df.loc["embeddings"].loc["vector"].loc["value"].values[1] == "b", ( + # df.loc["embeddings"].loc["vector"].loc["value"] + # ) diff --git a/pydantic_schemas/tests/test_quick_start.py b/pydantic_schemas/tests/test_quick_start.py index 9d53dc2..8dd77fa 100644 --- a/pydantic_schemas/tests/test_quick_start.py +++ b/pydantic_schemas/tests/test_quick_start.py @@ -235,6 +235,28 @@ class BadFieldNames(BaseModel): assert actual == expected, actual +def test_limit_on_recurrence(tmpdir): + class Production(BaseModel): + idno: Optional[str] = None + title: Optional[str] = None + subtitle: Optional[str] = None + author: str + productions: Optional["Production"] = None # Forward reference + + Production.model_rebuild() + ob = make_skeleton(Production) + + class ProductionWithList(BaseModel): + idno: Optional[str] = None + title: Optional[str] = None + subtitle: Optional[str] = None + author: str + productions: Optional[List["Production"]] = None # Forward reference + + ProductionWithList.model_rebuild() + ob = make_skeleton(ProductionWithList) + + @pytest.mark.parametrize("n", [n for n in MetadataManager().metadata_type_names]) def test_actual_schemas(n): if n == "geospatial": diff --git a/pydantic_schemas/utils/excel_to_pydantic.py b/pydantic_schemas/utils/excel_to_pydantic.py index 124b2e7..aef3fd2 100644 --- a/pydantic_schemas/utils/excel_to_pydantic.py +++ b/pydantic_schemas/utils/excel_to_pydantic.py @@ -1,10 +1,11 @@ import json import warnings -from typing import Any, List, Optional, Type, Union, get_args +from typing import Annotated, Any, List, Optional, Type, Union, get_args, get_origin import numpy as np import pandas as pd from pydantic import BaseModel, create_model +from utils.pydantic_to_excel import pydantic_to_dataframe from .quick_start import make_skeleton from .utils import ( @@ -13,7 +14,9 @@ is_dict_annotation, is_list_annotation, is_optional_annotation, + is_optional_list, 
seperate_simple_from_pydantic, + standardize_keys_in_dict, subset_pydantic_model_type, ) @@ -72,11 +75,15 @@ def get_relevant_sub_frame(m: Type[BaseModel], df: pd.DataFrame, name_of_field: THis function obtains only that information that pertains to this model """ names = df.iloc[:, 0].values + if debug: + print(f"getting subframe for {m} or {name_of_field} given {names}") try: - name_of_class = m.model_json_schema()["title"] - + json_schema = m.model_json_schema() + if debug: + print(f"get relevant sub frame using json schema: {json_schema}") + name_of_class = json_schema["title"] idx, sze = find_string_and_count_nans(names, name_of_class) - except AttributeError: + except (AttributeError, KeyError): idx = -1 sze = 0 if idx < 0: @@ -88,9 +95,13 @@ def get_relevant_sub_frame(m: Type[BaseModel], df: pd.DataFrame, name_of_field: error_message += f"and '{name_of_field}' " error_message += f"not found in {names}" raise IndexError(error_message) + else: + if debug: + print(f"get relevant sub frame sze={sze}, idx={idx}") sub = df.iloc[idx : idx + sze + 1, 1:] - + if debug: + print(sub) sub = sub.dropna(how="all", axis=0) # drop all null rows sub = sub.dropna(how="all", axis=1) # drop all null columns if debug: @@ -104,7 +115,10 @@ def handle_optional(name, annotation, df, from_within_list: bool = False, debug= args = [a for a in get_args(annotation) if a is not type(None)] # assert len(args) == 1, f"handle_optional encountered {args}" if len(args) > 1: - if str in args: + list_args = [a for a in args if is_list_annotation(a)] + if len(list_args): + arg = list_args[0] + elif str in args: arg = str elif float in args: arg = float @@ -112,7 +126,7 @@ def handle_optional(name, annotation, df, from_within_list: bool = False, debug= arg = args[0] else: arg = args[0] - ret = annotation_switch(name, arg, df, from_within_list=from_within_list) + ret = annotation_switch(name, arg, df, from_within_list=from_within_list, debug=debug) if debug: print(f"optional ret: {ret}") 
print(f"isinstance(ret, list): {isinstance(ret, list)}") @@ -127,19 +141,36 @@ def handle_optional(name, annotation, df, from_within_list: bool = False, debug= def handle_list(name, anno, df, debug=False): subtype = get_subtype_of_optional_or_list(anno) + if debug: + print(f"handle_list found subtype: {subtype} from {anno} with name {name}\n{df}") if isinstance(subtype, type(BaseModel)): try: - subframe = get_relevant_sub_frame(subtype, df, name_of_field=name) + subframe = get_relevant_sub_frame(subtype, df, name_of_field=name, debug=debug) + if debug: + print(f"subframe\n{subframe}") except IndexError: return [] list_of_subs = [] - for c in subframe.columns[1:]: - subsubframe = subframe.loc[:, [subframe.columns[0], c]] + if debug: + print("handle list df received") + print(subframe) + print("handle list df expected except for the specific values") + print(pydantic_to_dataframe([make_skeleton(subtype)])[0]) + index_size = max( + [len(x) if isinstance(x, tuple) else 1 for x in pydantic_to_dataframe([make_skeleton(subtype)])[0].index] + ) + if debug: + print(f"measured index to have depth={index_size}") + ## need to figure out the index columns and the data columns rather than assuming that the zeroth column is the *only* index column + for c in list(range(len(subframe.columns)))[index_size:]: + subsubframe = subframe.iloc[ + :, list(range(index_size)) + [c] + ] # subframe.loc[:, [subframe.columns[:index_size], c]] if debug: print("subsubframe") print(subsubframe) print() - sub = instantiate_pydantic_object(model_type=subtype, df=subsubframe, from_within_list=True) + sub = instantiate_pydantic_object(model_type=subtype, df=subsubframe, from_within_list=True, debug=debug) if debug: print(f"instantiated: {sub}") list_of_subs.append(sub) @@ -156,19 +187,38 @@ def handle_list_within_list(name, anno, df, debug=False): if debug: print(f"handle_list_within_list {name}, {anno}") print(df) - values = df.set_index(df.columns[0]).loc[name, df.columns[1]] + 
print(df.set_index(df.columns[0]).loc[name]) + print(df.columns) + + df = df.dropna(axis=1, how="all") + if debug: + print("dropna") + print(df) + df = df.set_index(df.columns[0]) + if debug: + print("setting index") + print(df) + values = df.loc[name] + if debug: + print(f"getting entry for '{name}'") + print(values) + values = values.values[-1] # , df.columns[1] if debug: print(f"values: {values}, {type(values)}") if values is None: return [] values = json.loads(values.replace("'", '"').replace("None", "null")) + if debug: + print(f"decoded values:", values) if len(values) == 0: return [] sub_type = get_subtype_of_optional_or_list(anno) - if isinstance(values[0], dict) and annotation_contains_pydantic(sub_type): - return [sub_type(**v) for v in values] - elif not isinstance(values[0], dict) and not annotation_contains_pydantic(sub_type): - return [sub_type(v) for v in values] + is_dicts = any([isinstance(v, dict) for v in values]) + if is_dicts and annotation_contains_pydantic(sub_type): + return [sub_type(**standardize_keys_in_dict(v)) for v in values] + elif not is_dicts and not annotation_contains_pydantic(sub_type): + # return [sub_type(v) for v in values] + return values else: raise NotImplementedError(f"handle_list_within_list unexpected values - {name}, {anno}, {values}, {df}") @@ -218,30 +268,41 @@ def annotation_switch(name: str, anno, df: pd.DataFrame, from_within_list=False, if is_optional_annotation(anno): if debug: print("optional") - return handle_optional(name, anno, df, from_within_list=from_within_list) + return handle_optional(name, anno, df, from_within_list=from_within_list, debug=debug) elif is_dict_annotation(anno): return handle_dict(name, anno, df) elif is_list_annotation(anno): if from_within_list: if debug: print("list within a list") - return handle_list_within_list(name, anno, df) + return handle_list_within_list(name, anno, df, debug=debug) else: if debug: print("list") - return handle_list(name, anno, df) + return handle_list(name, 
anno, df, debug=debug) elif isinstance(anno, type(BaseModel)): if debug: print("pydantic") + print(anno) + print(name) + print(df) try: - sub = get_relevant_sub_frame(anno, df, name_of_field=name) + sub = get_relevant_sub_frame(anno, df, name_of_field=name, debug=debug) + if debug: + print("pydantic sub:") + print(sub) except IndexError: return make_skeleton(anno) - return instantiate_pydantic_object(anno, sub) + return instantiate_pydantic_object(anno, sub, from_within_list=from_within_list, debug=debug) elif len(get_args(anno)) == 0: if debug: print("builtin or enum") return handle_builtin_or_enum(name, anno, df) + elif get_origin(anno) is Annotated: + if debug: + print(f"got Annotated type: {anno}, treating as builtin or enum") + datatype = getattr(anno, "__origin__", None) + return handle_builtin_or_enum(name, datatype, df) else: raise NotImplementedError(anno) @@ -256,43 +317,55 @@ def instantiate_pydantic_object( anno = field_info.annotation if debug: print(f"Instantiating field {field_name}, anno {anno} and args {get_args(anno)}") - ret[field_name] = annotation_switch(field_name, anno, df, from_within_list=from_within_list) + ret[field_name] = annotation_switch(field_name, anno, df, from_within_list=from_within_list, debug=debug) if debug: print(ret[field_name]) print() - return model_type(**ret) + return model_type(**standardize_keys_in_dict(ret)) def excel_sheet_to_pydantic( filename: str, sheetname: str, model_type: Union[Type[BaseModel], Type[List[BaseModel]]], debug=False ): + if debug: + print(f"excel_sheet_to_pydantic, sheetname={sheetname}, model_type={model_type}") df = pd.read_excel(filename, sheet_name=sheetname, header=None) df = df.where(df.notnull(), None) if sheetname != "metadata": try: - df = get_relevant_sub_frame(model_type, df) + df = get_relevant_sub_frame(model_type, df, debug=debug) except (KeyError, IndexError): pass + if debug: + print("line 304", model_type) + print(df) if is_optional_annotation(model_type): - return 
handle_optional(df.iloc[0, 0], model_type, df) + if not annotation_contains_pydantic(model_type): + return handle_optional(df.iloc[0, 0], model_type, df, debug=debug) + else: + model_type = [x for x in get_args(model_type) if x is not type(None)][0] if is_list_annotation(model_type): - return handle_list(df.iloc[0, 0], model_type, df) + return handle_list(df.iloc[0, 0], model_type, df, debug=debug) + if debug: + print("getting children for", model_type) children = seperate_simple_from_pydantic(model_type) + if debug: + print(f"children: {children}") ret = {} if "simple" in children and len(children["simple"]): sub = get_relevant_sub_frame(model_type, df, name_of_field=df.iloc[0, 0]) simple_child_field_type = subset_pydantic_model_type(model_type, children["simple"]) - fields = instantiate_pydantic_object(simple_child_field_type, sub, debug=debug) + fields = instantiate_pydantic_object(simple_child_field_type, sub, from_within_list=False, debug=debug) for child in children["simple"]: ret[child] = getattr(fields, child) for name in children["pydantic"]: if debug: - print(f"Looking to get {name}") + print(f"sheet Looking to get {name}") anno = model_type.model_fields[name].annotation - ret[name] = annotation_switch(name, anno, df) + ret[name] = annotation_switch(name, anno, df, from_within_list=False, debug=debug) for k, v in ret.items(): if isinstance(v, list) or isinstance(v, np.ndarray): ret[k] = [elem for elem in v if elem is not None] diff --git a/pydantic_schemas/utils/pydantic_to_excel.py b/pydantic_schemas/utils/pydantic_to_excel.py index 5f621be..84a2e7b 100644 --- a/pydantic_schemas/utils/pydantic_to_excel.py +++ b/pydantic_schemas/utils/pydantic_to_excel.py @@ -3,7 +3,7 @@ import json import os from enum import Enum -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Union, get_args __version__ = importlib.metadata.version("metadataschemas") @@ -14,13 +14,16 @@ from openpyxl.worksheet.datavalidation import 
DataValidation from openpyxl.worksheet.protection import SheetProtection from openpyxl.worksheet.worksheet import Worksheet -from pydantic import BaseModel +from pydantic import AnyUrl, BaseModel from .utils import ( annotation_contains_dict, annotation_contains_list, assert_dict_annotation_is_strings_or_any, get_subtype_of_optional_or_list, + is_list_annotation, + is_optional_annotation, + is_union_annotation, seperate_simple_from_pydantic, subset_pydantic_model, ) @@ -150,6 +153,31 @@ def replace_row_with_multiple_rows(original_df, new_df, row_to_replace): return df_replaced +def count_lists(model_fields, idx: str): + """ + idx is a string name of a nested field seperated by dots like + "identification_info.citation.alternateTitle" + """ + n_lists = 0 + for part in idx.split("."): + try: + anno = model_fields[part].annotation + except KeyError: + raise KeyError(f"bad model fields given {idx}, for {part} of {model_fields}") + n_lists += annotation_contains_list(anno) + if is_optional_annotation(anno) or is_list_annotation(anno): + anno = get_subtype_of_optional_or_list(anno) + # if hasattr(anno, "model_fields"): + # model_fields = anno.model_fields + # else: + # break + if hasattr(anno, "model_fields"): + model_fields = anno.model_fields + else: + break + return n_lists, anno + + def pydantic_to_dataframe( ob: Union[BaseModel, List[BaseModel]], debug: bool = False, @@ -164,77 +192,137 @@ def pydantic_to_dataframe( if isinstance(ob, list): ob_dict = [elem.model_dump() for elem in ob] annotations = {k: v.annotation for k, v in ob[0].model_fields.items()} + model_fields = {k: v for k, v in ob[0].model_fields.items()} is_list_of_objects = True else: ob_dict = ob.model_dump() annotations = {k: v.annotation for k, v in ob.model_fields.items()} + model_fields = {k: v for k, v in ob.model_fields.items()} is_list_of_objects = False df = pd.json_normalize(ob_dict).T if debug: print("pydantic_to_dataframe") print(df) - # handle dictionaries - # for idx, field in 
ob_dict.items(): - # if annotation_contains_dict(annotations[idx]): - for fieldname, anno in annotations.items(): - if annotation_contains_dict(anno): - if debug: - print("Found a dictionary") - if is_list_of_objects: - continue - assert_dict_annotation_is_strings_or_any(anno) - field = ob_dict[fieldname] - if field is None or len(field) == 0: - dict_df = pd.DataFrame(["", ""], index=["key", "value"]) - else: - dict_df = pd.DataFrame([field.keys(), field.values()], index=["key", "value"]) - dict_df.index = dict_df.index.map(lambda x: f"{fieldname}.{x}") - df = df[~df.index.str.startswith(f"{fieldname}.")] - df = df[df.index != fieldname] - df = pd.concat([df, dict_df]) - i = 0 list_indices = [] + observed_dicts = set() enums = {} for idx in df.index: + if idx.split(".")[0] in observed_dicts: + continue if debug: - print(f"pydantic_to_dataframe::172 idx = {idx}, df = {df}") + print(f"pydantic_to_dataframe::202 idx = {idx}, df = {df}") vals = df.loc[idx] # [0] + number_of_lists, anno = count_lists(model_fields, idx) + number_of_lists = number_of_lists + int(is_list_of_objects) if debug: print(f"vals: {vals}") print(f'idx.split(".")[0]: {idx.split(".")[0]}') print(f'annotations[idx.split(".")[0]]: {annotations[idx.split(".")[0]]}') - # field = ob_dict[idx.split(".")[0]] + print(f"number of lists = {number_of_lists}") + print(f"anno = {anno}") - if annotation_contains_list(annotations[idx.split(".")[0]]) or annotation_contains_dict( - annotations[idx.split(".")[0]] - ): - if annotation_contains_list(annotations[idx.split(".")[0]]): - subtype = get_subtype_of_optional_or_list(annotations[idx.split(".")[0]]) + if annotation_contains_dict(anno): + if debug: + print(f"annotation contains dict, {ob_dict[idx.split('.')[0]]}") + fieldname = idx.split(".")[0] + # field = ob_dict[fieldname] + subdf = df[df.index.str.startswith(f"{fieldname}.")] + field = {"".join(i.split(".")[1:]): v[0] for i, v in zip(subdf.index, subdf.values)} + if debug: + print(f"field: {field}") + if 
is_union_annotation(anno) and (len(subdf) == 0 or (field is not None and not isinstance(field, dict))): + args = [a for a in get_args(anno) if a is not type(None)] + anno = [a for a in args if not annotation_contains_dict(a)][0] + if debug: + print(f"falling back to {anno}") else: - subtype = dict + if debug: + print("Found a dictionary") + if is_list_of_objects: + continue + assert_dict_annotation_is_strings_or_any(anno) + + if field is None or len(field) == 0: + dict_df = pd.DataFrame(["", ""], index=["key", "value"]) + else: + dict_df = pd.DataFrame([field.keys(), field.values()], index=["key", "value"]) + if debug: + print(f"created a dict_df:\n{dict_df}") + dict_df.index = dict_df.index.map(lambda x: f"{fieldname}.{x}") + df = df[~df.index.str.startswith(f"{fieldname}.")] + df = df[df.index != fieldname] + df = pd.concat([df, dict_df]) + list_indices += list(range(i, i + 2)) + i += 2 + observed_dicts.add(fieldname) + continue + + if number_of_lists >= 1: #: or annotation_contains_dict(annotations[idx.split(".")[0]]): + # if number_of_lists > 0: + subtype = anno + # else: + # subtype = dict if debug: print("subtype = ", subtype) print("isinstance(subtype, BaseModel)", isinstance(subtype, type(BaseModel))) - print("isinstance(subtype, dict)", isinstance(subtype, dict)) - if is_list_of_objects: + print("isinstance(subtype, dict)", annotation_contains_dict(subtype)) # isinstance(subtype, dict)) + if number_of_lists >= 2 or is_list_of_objects: # is_list_of_objects: if debug: print("list of lists") list_indices.append(i) i += 1 - elif isinstance(subtype, type(BaseModel)) or isinstance(subtype, dict): + # elif number_of_lists == 0: # dicts + # list_indices += list(range(i, i + 2)) + # if debug: + # print(list_indices) + # i += 2 + + elif isinstance(subtype, type(BaseModel)): # isinstance(subtype, type(dict)) + if debug: + print("list of base models", vals, vals[0]) + print("experiment:", vals[0]) + print("experiment:", pd.DataFrame(vals[0]).T) + if vals[0] is None: + 
vals[0] = [None] + elif isinstance(vals[0], list) and len(vals[0]) == 0: + vals[0] = [None] + sub = pd.json_normalize(vals[0]).T if debug: - print("list of base models", vals) - sub = pd.json_normalize(df.loc[idx].values[0]).reset_index(drop=True).T + print(sub) + # if len(sub.index) == 1: + # sub.index = [idx] + # else: sub.index = sub.index.map(lambda x: f"{idx}." + x) + if debug: + print(sub) df = replace_row_with_multiple_rows(df, sub, idx) + if debug: + print(df) + if debug: + print(list_indices) list_indices += list(range(i, i + len(sub))) + if debug: + print(list_indices) i += len(sub) + if debug: + print("done with basemodel subtype") else: if debug: print("list of builtins or else empty") - df = replace_row_with_multiple_rows(df, df.loc[idx].explode().to_frame().reset_index(drop=True).T, idx) + if vals[0] is None: + vals[0] = [None] + elif isinstance(vals[0], list) and len(vals[0]) == 0: + vals[0] = [None] + sub = pd.DataFrame(vals[0]).T + if len(sub.index) == 1: + sub.index = [idx] + else: + sub.index = sub.index.map(lambda x: f"{idx}." 
+ x) + df = replace_row_with_multiple_rows(df, sub, idx) + if debug: + print("new df:", df) list_indices.append(i) i += 1 else: @@ -272,6 +360,8 @@ def stringify_cell_element(elem): return str(elem.value) elif isinstance(elem, dict): return json.dumps(elem, default=stringify_enum) + elif isinstance(elem, AnyUrl): + return elem.unicode_string() else: return elem @@ -286,7 +376,6 @@ def write_pydantic_to_excel(ws, ob, row_number, debug=False): if all(map(lambda x: x is None, r)): continue r = [stringify_cell_element(val) for val in r] - # r = [str(val) if isinstance(val, list) else str(val.value) if isinstance(val, Enum) else val for val in r ] r = [""] + r if debug: print("about to append", r) @@ -376,7 +465,7 @@ def write_pydantic_to_sheet(worksheet: Worksheet, ob: BaseModel, current_row: in current_row += 1 child_object = getattr(ob, mfield) current_row, sub_list_rows, sub_list_enums = write_pydantic_to_excel( - ws=worksheet, ob=child_object, row_number=current_row + ws=worksheet, ob=child_object, row_number=current_row, debug=debug ) list_rows.update(sub_list_rows) enum_list_rows.update(sub_list_enums) diff --git a/pydantic_schemas/utils/quick_start.py b/pydantic_schemas/utils/quick_start.py index 13d11bd..f5155f7 100644 --- a/pydantic_schemas/utils/quick_start.py +++ b/pydantic_schemas/utils/quick_start.py @@ -9,6 +9,7 @@ from .utils import standardize_keys_in_dict DEFAULT_URL = "http://www.example.com" +MAX_DEPTH = 12 def _is_typing_annotation(annotation): @@ -43,79 +44,83 @@ def _filter_list_for_condition(args: List[Any], condition: Callable[[Any], bool] return [a for a in args if condition(a)] -def _is_pydantic_annotated_string(p, debug=False, indentation=""): +def _is_pydantic_annotated_string(p, debug=False, recursion_level=0): if typing.get_origin(p) is typing.Annotated: args = typing.get_args(p) if args[0] is str: if debug: - print(indentation, "Is Annotated String") + print(" " * recursion_level, "Is Annotated String") return True if debug: - 
print(indentation, f"Is Annotated but not a string {p}") + print(" " * recursion_level, f"Is Annotated but not a string {p}") return False -def _is_pydantic_annotated_float(p, debug=False, indentation=""): +def _is_pydantic_annotated_float(p, debug=False, recursion_level=0): if typing.get_origin(p) is typing.Annotated: args = typing.get_args(p) if args[0] is float: if debug: - print(indentation, "Is Annotated float") + print(" " * recursion_level, "Is Annotated float") return True if debug: - print(indentation, f"Is Annotated but not a float {p}") + print(" " * recursion_level, f"Is Annotated but not a float {p}") return False def _create_default_class_from_annotation( - p: Any, is_optional: bool = False, debug: bool = False, indentation: str = "" + p: Any, is_optional: bool = False, debug: bool = False, recursion_level: int = 0 ): if p is str: if debug: - print(indentation, "STR") + print(" " * recursion_level, "STR") if is_optional: return None else: return "" elif p is float: if debug: - print(indentation, "STR") + print(" " * recursion_level, "STR") if is_optional: return None else: raise ValueError("Cannot create default float as it's not optional") elif _is_enum_type(p): if debug: - print(indentation, "ENUM") + print(" " * recursion_level, "ENUM") if is_optional: return None else: return list(p)[0].value # get first value of the enum - elif _is_pydantic_subclass(p): + elif _is_pydantic_subclass(p) and recursion_level < MAX_DEPTH: if debug: - print(indentation, "pydantic CLASS") - return make_skeleton(p, debug=debug, indentation=indentation + " ") + print(" " * recursion_level, "pydantic CLASS") + return make_skeleton(p, debug=debug, recursion_level=recursion_level + 1) + elif _is_pydantic_subclass(p) and is_optional: + return None elif isinstance(p, type(AnyUrl)): return DEFAULT_URL else: raise ValueError(f"Unknown annotation: {p}") -def _create_default_from_list_of_args(args: List[Any], is_optional=True, debug=False, indentation=""): +def 
_create_default_from_list_of_args(args: List[Any], is_optional=True, debug=False, recursion_level=0): """ return None for built in types and enums, but create skeletons of pydantic or typed parameters """ + if is_optional and recursion_level >= MAX_DEPTH: + return None args = _filter_list_for_condition(args, lambda a: a is not type(None)) typed_args = _filter_list_for_condition(args, _is_typing_annotation) # _filter_list_for_typing_args(args) pydantic_args = _filter_list_for_condition(args, _is_pydantic_subclass) # _filter_for_pydantic_args(args) if debug: print( - indentation, + " " * recursion_level, f"LIST OF ARGS: {args}, LIST OF TYPED ARGS: {typed_args}, LIST_OF_PYDANTIC_ARGS: {pydantic_args}", ) if len(typed_args): if debug: - print(indentation, "moving to _create_default_from_typing_annotation") + print(" " * recursion_level, "moving to _create_default_from_typing_annotation") # because dicts are more complicated than lists, we should default to dicts typed_dicts = _filter_list_for_condition(typed_args, lambda p: getattr(p, "__origin__", None) is dict) typed_lists = _filter_list_for_condition(typed_args, lambda p: getattr(p, "__origin__", None) is list) @@ -126,25 +131,25 @@ def _create_default_from_list_of_args(args: List[Any], is_optional=True, debug=F else: chosen_type = typed_args[0] return _create_default_from_typing_annotation( - chosen_type, is_optional=is_optional, debug=debug, indentation=indentation + chosen_type, is_optional=is_optional, debug=debug, recursion_level=recursion_level ) elif len(pydantic_args): - return make_skeleton(pydantic_args[0], debug=debug, indentation=indentation + " ") + return make_skeleton(pydantic_args[0], debug=debug, recursion_level=recursion_level + 1) elif len(_filter_list_for_condition(args, lambda a: _is_builtin_type(a) or _is_enum_type(a))): if debug: - print(indentation, "all builtins or enums") + print(" " * recursion_level, "all builtins or enums") if is_optional: return None elif 
len(_filter_list_for_condition(args, lambda a: a is str)): return "" else: raise ValueError(f"Can't create a default of {args}") - elif len(args) == 1 and _is_pydantic_annotated_string(args[0], debug=debug, indentation=indentation): + elif len(args) == 1 and _is_pydantic_annotated_string(args[0], debug=debug, recursion_level=recursion_level): if is_optional: return None else: return "" - elif len(args) == 1 and _is_pydantic_annotated_float(args[0], debug=debug, indentation=indentation): + elif len(args) == 1 and _is_pydantic_annotated_float(args[0], debug=debug, recursion_level=recursion_level): if is_optional: return None else: @@ -158,9 +163,9 @@ def _create_default_from_list_of_args(args: List[Any], is_optional=True, debug=F raise ValueError(f"Can't create a default of {args}") -def _create_default_from_typing_annotation(p: Any, is_optional: bool = False, debug: bool = False, indentation=""): +def _create_default_from_typing_annotation(p: Any, is_optional: bool = False, debug: bool = False, recursion_level=0): if debug: - print(indentation, "_create_default_from_typing_annotation") + print(" " * recursion_level, "_create_default_from_typing_annotation") if p is typing.Any: return "" args = typing.get_args(p) @@ -169,46 +174,54 @@ def _create_default_from_typing_annotation(p: Any, is_optional: bool = False, de isOptional = type(None) in args if isOptional: if debug: - print(indentation, "isOPTIONAL") - return _create_default_from_list_of_args(args, is_optional=True, debug=debug, indentation=indentation) + print(" " * recursion_level, "isOPTIONAL") + if recursion_level >= MAX_DEPTH: + return None + return _create_default_from_list_of_args(args, is_optional=True, debug=debug, recursion_level=recursion_level) elif getattr(p, "__origin__", None) is list: if debug: - print(indentation, "isLIST") + print(" " * recursion_level, "isLIST") if _is_pydantic_subclass(args[0]): - return [make_skeleton(args[0], debug=debug, indentation=indentation + " ")] + return 
[make_skeleton(args[0], debug=debug, recursion_level=recursion_level + 1)] else: if is_optional: return [] else: - return [_create_default(args[0], is_optional=False, debug=debug, indentation=indentation + " ")] + return [_create_default(args[0], is_optional=False, debug=debug, recursion_level=recursion_level + 1)] elif getattr(p, "__origin__", None) is dict: if debug: - print(indentation, "isDICT") - k = _create_default(args[0], debug=debug, indentation=indentation + " ") - v = _create_default(args[1], debug=debug, indentation=indentation + " ") + print(" " * recursion_level, "isDICT") + k = _create_default(args[0], debug=debug, recursion_level=recursion_level + 1) + v = _create_default(args[1], debug=debug, recursion_level=recursion_level + 1) return {k: v} elif len(args) > 1: if debug: - print(indentation, "isUNION") - return _create_default_from_list_of_args(args, is_optional=is_optional, debug=debug, indentation=indentation) + print(" " * recursion_level, "isUNION") + return _create_default_from_list_of_args( + args, is_optional=is_optional, debug=debug, recursion_level=recursion_level + ) else: raise ValueError(f"Unknown typing {p}") -def _create_default(p: inspect.Parameter, is_optional: bool = False, debug: bool = False, indentation: str = ""): +def _create_default(p: inspect.Parameter, is_optional: bool = False, debug: bool = False, recursion_level: int = 0): if hasattr(p, "annotation"): p = p.annotation if inspect.isclass(p) and not _is_typing_annotation(p): if debug: - print(indentation, "CLASS") - return _create_default_class_from_annotation(p, is_optional=is_optional, debug=debug, indentation=indentation) + print(" " * recursion_level, "CLASS") + return _create_default_class_from_annotation( + p, is_optional=is_optional, debug=debug, recursion_level=recursion_level + ) elif _is_typing_annotation(p): if debug: - print(indentation, "TYPED") - return _create_default_from_typing_annotation(p, is_optional=is_optional, debug=debug, indentation=indentation) - 
elif _is_pydantic_annotated_string(p, debug=debug, indentation=indentation): + print(" " * recursion_level, "TYPED") + return _create_default_from_typing_annotation( + p, is_optional=is_optional, debug=debug, recursion_level=recursion_level + ) + elif _is_pydantic_annotated_string(p, debug=debug, recursion_level=recursion_level): if debug: - print(indentation, "ANNOTATED STRING") + print(" " * recursion_level, "ANNOTATED STRING") if is_optional: return None else: @@ -217,15 +230,15 @@ def _create_default(p: inspect.Parameter, is_optional: bool = False, debug: bool raise ValueError(f"Unknown parameter {p}") -def make_skeleton(cl: Type[BaseModel], debug=False, indentation=""): +def make_skeleton(cl: Type[BaseModel], debug=False, recursion_level=0): parameter_map = inspect.signature(cl).parameters # {'name': } param_values = {} for name, param in parameter_map.items(): if debug: - print(indentation, f"{param.name}: {param.annotation}") - param_values[name] = _create_default(param, debug=debug, indentation=indentation + " ") + print(" " * recursion_level, f"{param.name}: {param.annotation}") + param_values[name] = _create_default(param, debug=debug, recursion_level=recursion_level + 1) if debug: - print(indentation, f"Parameter: {name}, value: {param_values[name]}") + print(" " * recursion_level, f"Parameter: {name}, value: {param_values[name]}") param_values = standardize_keys_in_dict(param_values) return cl(**param_values) diff --git a/pydantic_schemas/utils/utils.py b/pydantic_schemas/utils/utils.py index a7b3d02..0ef2a71 100644 --- a/pydantic_schemas/utils/utils.py +++ b/pydantic_schemas/utils/utils.py @@ -51,6 +51,11 @@ def get_subtype_of_optional_or_list(anno: typing._UnionGenericAlias, debug=False return get_subtype_of_optional_or_list(arg, debug=debug) if len(args) == 1: return args[0] + elif len(args) > 1: + if str in args: + return str + else: + return args[0] else: raise NotImplementedError("Only optional lists optional builtin types implemented") @@ -68,7 
+73,7 @@ def _annotation_contains_generic( return True if is_optional_annotation(anno) or is_list_annotation(anno): # optional check is pointless given union check above subtype = get_subtype_of_optional_or_list(anno) - return checker(subtype) + return _annotation_contains_generic(subtype, checker=checker) return False @@ -85,6 +90,10 @@ def annotation_contains_pydantic(anno: typing._UnionGenericAlias) -> bool: def assert_dict_annotation_is_strings_or_any(anno): + if is_union_annotation(anno): + args = [a for a in typing.get_args(anno) if a is not type(None)] + args = [a for a in args if is_dict_annotation(a)] + anno = args[0] if is_dict_annotation(anno): args = typing.get_args(anno) for a in args: diff --git a/pydantic_schemas/video_schema.py b/pydantic_schemas/video_schema.py index fc23af7..42c0932 100644 --- a/pydantic_schemas/video_schema.py +++ b/pydantic_schemas/video_schema.py @@ -4,7 +4,7 @@ from __future__ import annotations from enum import Enum -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional from pydantic import Extra, Field @@ -246,44 +246,6 @@ class Tag(SchemaBaseModel): tag_group: Optional[str] = Field(None, title="Tag group") -class ModelInfoItem(SchemaBaseModel): - source: Optional[str] = Field(None, title="Source") - author: Optional[str] = Field(None, title="Author") - version: Optional[str] = Field(None, title="Version") - model_id: Optional[str] = Field(None, title="Model Identifier") - nb_topics: Optional[float] = Field(None, title="Number of topics") - description: Optional[str] = Field(None, title="Description") - corpus: Optional[str] = Field(None, title="Corpus name") - uri: Optional[str] = Field(None, title="URI") - - -class TopicWord(SchemaBaseModel): - word: Optional[str] = Field(None, title="Word") - word_weight: Optional[float] = Field(None, title="Word weight") - - -class TopicDescriptionItem(SchemaBaseModel): - topic_id: Optional[Union[int, str]] = Field(None, title="Topic 
identifier") - topic_score: Optional[Union[float, str]] = Field(None, title="Topic score") - topic_label: Optional[str] = Field(None, title="Topic label") - topic_words: Optional[List[TopicWord]] = Field(None, description="Words", title="Topic words") - - -class LdaTopic(SchemaBaseModel): - class Config: - extra = Extra.forbid - - model_info: Optional[List[ModelInfoItem]] = Field(None, title="Model information") - topic_description: Optional[List[TopicDescriptionItem]] = Field(None, title="Topic information") - - -class Embedding(SchemaBaseModel): - id: str = Field(..., title="Vector Model ID") - description: Optional[str] = Field(None, title="Vector Model Description") - date: Optional[str] = Field(None, title="Date (YYYY-MM-DD)") - vector: Dict[str, Any] = Field(..., title="Vector") - - class OriginDescription(SchemaBaseModel): harvest_date: Optional[str] = Field(None, description="Harvest date using UTC date format") altered: Optional[bool] = Field( @@ -331,6 +293,4 @@ class Model(SchemaBaseModel): ) provenance: Optional[List[ProvenanceSchema]] = Field(None, description="Provenance") tags: Optional[List[Tag]] = Field(None, description="Tags", title="Tags") - lda_topics: Optional[List[LdaTopic]] = Field(None, description="LDA topics", title="LDA topics") - embeddings: Optional[List[Embedding]] = Field(None, description="Word embeddings", title="Word embeddings") additional: Optional[Dict[str, Any]] = Field(None, description="Additional metadata") diff --git a/schemas/document-schema.json b/schemas/document-schema.json index e90581c..fe492d4 100644 --- a/schemas/document-schema.json +++ b/schemas/document-schema.json @@ -973,145 +973,6 @@ "tag" ] }, - "lda_topics": { - "type": "array", - "title": "LDA topics", - "description": "LDA topics", - "items": { - "type": "object", - "properties": { - "model_info": { - "type": "array", - "title": "Model information", - "items": { - "type": "object", - "properties": { - "source": { - "title": "Source", - "type": "string" - 
}, - "author": { - "title": "Author", - "type": "string" - }, - "version": { - "title": "Version", - "type": "string" - }, - "model_id": { - "title": "Model Identifier", - "type": "string" - }, - "nb_topics": { - "title": "Number of topics", - "type": "number" - }, - "description": { - "title": "Description", - "type": "string" - }, - "corpus": { - "title": "Corpus name", - "type": "string" - }, - "uri": { - "title": "URI", - "type": "string" - } - } - }, - "required": [ - "model_id" - ] - }, - "topic_description": { - "type": "array", - "title": "Topic information", - "items": { - "type": "object", - "properties": { - "topic_id": { - "title": "Topic identifier", - "type": [ - "integer", - "string" - ] - }, - "topic_score": { - "title": "Topic score", - "type": [ - "number", - "string" - ] - }, - "topic_label": { - "title": "Topic label", - "type": "string" - }, - "topic_words": { - "type": "array", - "title": "Topic words", - "description": "Words", - "items": { - "type": "object", - "properties": { - "word": { - "title": "Word", - "type": "string" - }, - "word_weight": { - "title": "Word weight", - "type": "number" - } - } - }, - "required": [ - "word" - ] - } - } - }, - "required": [ - "topic_id" - ] - } - }, - "additionalProperties": false - } - }, - "embeddings": { - "type": "array", - "title": "Word embeddings", - "description": "Word embeddings", - "items": { - "type": "object", - "properties": { - "id": { - "title": "Vector Model ID", - "type": "string" - }, - "description": { - "title": "Vector Model Description", - "type": "string" - }, - "date": { - "title": "Date (YYYY-MM-DD)", - "type": "string" - }, - "vector": { - "title": "Vector", - "type": [ - "object", - "array" - ] - } - }, - "required": [ - "id", - "vector" - ] - } - }, "additional": { "type": "object", "description": "Additional metadata", diff --git a/schemas/geospatial-schema.json b/schemas/geospatial-schema.json index 5f3082f..33ac6fe 100644 --- a/schemas/geospatial-schema.json +++ 
b/schemas/geospatial-schema.json @@ -2103,136 +2103,6 @@ "tag" ] }, - "lda_topics": { - "type": "array", - "title": "LDA topics", - "description": "LDA topics", - "items": { - "type": "object", - "properties": { - "model_info": { - "type": "array", - "title": "Model information", - "items": { - "type": "object", - "properties": { - "source": { - "title": "Source", - "type": "string" - }, - "author": { - "title": "Author", - "type": "string" - }, - "version": { - "title": "Version", - "type": "string" - }, - "model_id": { - "title": "Model Identifier", - "type": "string" - }, - "nb_topics": { - "title": "Number of topics", - "type": "number" - }, - "description": { - "title": "Description", - "type": "string" - }, - "corpus": { - "title": "Corpus name", - "type": "string" - }, - "uri": { - "title": "URI", - "type": "string" - } - } - }, - "required": [ - "model_id" - ] - }, - "topic_description": { - "type": "array", - "title": "Topic information", - "items": { - "type": "object", - "properties": { - "topic_id": { - "title": "Topic identifier", - "type": ["integer", "string"] - }, - "topic_score": { - "title": "Topic score", - "type": ["number", "string"] - }, - "topic_label": { - "title": "Topic label", - "type": "string" - }, - "topic_words": { - "type": "array", - "title": "Topic words", - "description": "Words", - "items": { - "type": "object", - "properties": { - "word": { - "title": "Word", - "type": "string" - }, - "word_weight": { - "title": "Word weight", - "type": "number" - } - } - }, - "required": [ - "word" - ] - } - } - }, - "required": [ - "topic_id" - ] - } - - }, - "additionalProperties": false - } - }, - "embeddings":{ - "type": "array", - "title": "Word embeddings", - "description": "Word embeddings", - "items": { - "type": "object", - "properties": { - "id": { - "title": "Vector Model ID", - "type": "string" - }, - "description": { - "title": "Vector Model Description", - "type": "string" - }, - "date": { - "title": "Date (YYYY-MM-DD)", - "type": 
"string" - }, - "vector": { - "title": "Vector", - "type": "object" - } - }, - "required": [ - "id","vector" - ] - } - }, "additional": { "title": "Additional metadata", "description": "Any additional metadata", diff --git a/schemas/image-schema.json b/schemas/image-schema.json index f161be6..b185bb3 100644 --- a/schemas/image-schema.json +++ b/schemas/image-schema.json @@ -196,136 +196,6 @@ "tag" ] }, - "lda_topics": { - "type": "array", - "title": "LDA topics", - "description": "LDA topics", - "items": { - "type": "object", - "properties": { - "model_info": { - "type": "array", - "title": "Model information", - "items": { - "type": "object", - "properties": { - "source": { - "title": "Source", - "type": "string" - }, - "author": { - "title": "Author", - "type": "string" - }, - "version": { - "title": "Version", - "type": "string" - }, - "model_id": { - "title": "Model Identifier", - "type": "string" - }, - "nb_topics": { - "title": "Number of topics", - "type": "number" - }, - "description": { - "title": "Description", - "type": "string" - }, - "corpus": { - "title": "Corpus name", - "type": "string" - }, - "uri": { - "title": "URI", - "type": "string" - } - } - }, - "required": [ - "model_id" - ] - }, - "topic_description": { - "type": "array", - "title": "Topic information", - "items": { - "type": "object", - "properties": { - "topic_id": { - "title": "Topic identifier", - "type": ["integer", "string"] - }, - "topic_score": { - "title": "Topic score", - "type": ["number", "string"] - }, - "topic_label": { - "title": "Topic label", - "type": "string" - }, - "topic_words": { - "type": "array", - "title": "Topic words", - "description": "Words", - "items": { - "type": "object", - "properties": { - "word": { - "title": "Word", - "type": "string" - }, - "word_weight": { - "title": "Word weight", - "type": "number" - } - } - }, - "required": [ - "word" - ] - } - } - }, - "required": [ - "topic_id" - ] - } - - }, - "additionalProperties": false - } - }, - 
"embeddings":{ - "type": "array", - "title": "Word embeddings", - "description": "Word embeddings", - "items": { - "type": "object", - "properties": { - "id": { - "title": "Vector Model ID", - "type": "string" - }, - "description": { - "title": "Vector Model Description", - "type": "string" - }, - "date": { - "title": "Date (YYYY-MM-DD)", - "type": "string" - }, - "vector": { - "title": "Vector", - "type": "object" - } - }, - "required": [ - "id","vector" - ] - } - }, "additional": { "type": "object", "description": "Additional metadata", diff --git a/schemas/iptc-phovidmdshared-schema.json b/schemas/iptc-phovidmdshared-schema.json index 52014c8..c059772 100644 --- a/schemas/iptc-phovidmdshared-schema.json +++ b/schemas/iptc-phovidmdshared-schema.json @@ -30,18 +30,6 @@ ], "additionalProperties": false }, - "AltLangObject": { - "description": "Text in alternative languages", - "type": "object", - "patternProperties": { - "^(((([A-Za-z]{2,3}(-([A-Za-z]{3}(-[A-Za-z]{3}){0,2}))?)|[A-Za-z]{4}|[A-Za-z]{5,8})(-([A-Za-z]{4}))?(-([A-Za-z]{2}|[0-9]{3}))?(-([A-Za-z0-9]{5,8}|[0-9][A-Za-z0-9]{3}))*(-([0-9A-WY-Za-wy-z](-[A-Za-z0-9]{2,8})+))*(-(x(-[A-Za-z0-9]{1,8})+))?)|(x(-[A-Za-z0-9]{1,8})+))$": { - "title": "Language tagged text", - "description": "Property name is a BCP47 language tag, property value a text using this language", - "type": "string" - } - }, - "additionalProperties": false - }, "AltLang": { "description": "Text in alternative languages", "type": "string" diff --git a/schemas/microdata-schema.json b/schemas/microdata-schema.json index e427c90..3a9bc5b 100644 --- a/schemas/microdata-schema.json +++ b/schemas/microdata-schema.json @@ -78,142 +78,6 @@ "tag" ] }, - "lda_topics": { - "type": "array", - "title": "LDA topics", - "description": "LDA topics", - "items": { - "type": "object", - "properties": { - "model_info": { - "type": "array", - "title": "Model information", - "items": { - "type": "object", - "properties": { - "source": { - "title": "Source", - 
"type": "string" - }, - "author": { - "title": "Author", - "type": "string" - }, - "version": { - "title": "Version", - "type": "string" - }, - "model_id": { - "title": "Model Identifier", - "type": "string" - }, - "nb_topics": { - "title": "Number of topics", - "type": "number" - }, - "description": { - "title": "Description", - "type": "string" - }, - "corpus": { - "title": "Corpus name", - "type": "string" - }, - "uri": { - "title": "URI", - "type": "string" - } - } - }, - "required": [ - "model_id" - ] - }, - "topic_description": { - "type": "array", - "title": "Topic information", - "items": { - "type": "object", - "properties": { - "topic_id": { - "title": "Topic identifier", - "type": [ - "integer", - "string" - ] - }, - "topic_score": { - "title": "Topic score", - "type": [ - "number", - "string" - ] - }, - "topic_label": { - "title": "Topic label", - "type": "string" - }, - "topic_words": { - "type": "array", - "title": "Topic words", - "description": "Words", - "items": { - "type": "object", - "properties": { - "word": { - "title": "Word", - "type": "string" - }, - "word_weight": { - "title": "Word weight", - "type": "number" - } - } - }, - "required": [ - "word" - ] - } - } - }, - "required": [ - "topic_id" - ] - } - }, - "additionalProperties": false - } - }, - "embeddings": { - "type": "array", - "title": "Word embeddings", - "description": "Word embeddings", - "items": { - "type": "object", - "properties": { - "id": { - "title": "Vector Model ID", - "type": "string" - }, - "description": { - "title": "Vector Model Description", - "type": "string" - }, - "date": { - "title": "Date (YYYY-MM-DD)", - "type": "string" - }, - "vector": { - "title": "Vector", - "type": "object" - } - }, - "required": [ - "id", - "vector" - ] - } - }, "additional": { "type": "object", "description": "Additional metadata not covered by DDI elements", diff --git a/schemas/script-schema.json b/schemas/script-schema.json index d2dbfc4..be1a0ac 100644 --- 
a/schemas/script-schema.json +++ b/schemas/script-schema.json @@ -1104,136 +1104,6 @@ "tag" ] }, - "lda_topics": { - "type": "array", - "title": "LDA topics", - "description": "LDA topics", - "items": { - "type": "object", - "properties": { - "model_info": { - "type": "array", - "title": "Model information", - "items": { - "type": "object", - "properties": { - "source": { - "title": "Source", - "type": "string" - }, - "author": { - "title": "Author", - "type": "string" - }, - "version": { - "title": "Version", - "type": "string" - }, - "model_id": { - "title": "Model Identifier", - "type": "string" - }, - "nb_topics": { - "title": "Number of topics", - "type": "number" - }, - "description": { - "title": "Description", - "type": "string" - }, - "corpus": { - "title": "Corpus name", - "type": "string" - }, - "uri": { - "title": "URI", - "type": "string" - } - } - }, - "required": [ - "model_id" - ] - }, - "topic_description": { - "type": "array", - "title": "Topic information", - "items": { - "type": "object", - "properties": { - "topic_id": { - "title": "Topic identifier", - "type": ["integer", "string"] - }, - "topic_score": { - "title": "Topic score", - "type": ["number", "string"] - }, - "topic_label": { - "title": "Topic label", - "type": "string" - }, - "topic_words": { - "type": "array", - "title": "Topic words", - "description": "Words", - "items": { - "type": "object", - "properties": { - "word": { - "title": "Word", - "type": "string" - }, - "word_weight": { - "title": "Word weight", - "type": "number" - } - } - }, - "required": [ - "word" - ] - } - } - }, - "required": [ - "topic_id" - ] - } - - }, - "additionalProperties": false - } - }, - "embeddings":{ - "type": "array", - "title": "Word embeddings", - "description": "Word embeddings", - "items": { - "type": "object", - "properties": { - "id": { - "title": "Vector Model ID", - "type": "string" - }, - "description": { - "title": "Vector Model Description", - "type": "string" - }, - "date": { - "title": 
"Date (YYYY-MM-DD)", - "type": "string" - }, - "vector": { - "title": "Vector", - "type": "object" - } - }, - "required": [ - "id","vector" - ] - } - }, "additional": { "type": "object", "description": "Additional metadata", diff --git a/schemas/table-schema.json b/schemas/table-schema.json index 530f9ca..ae532a4 100644 --- a/schemas/table-schema.json +++ b/schemas/table-schema.json @@ -987,142 +987,6 @@ "tag" ] }, - "lda_topics": { - "type": "array", - "title": "LDA topics", - "description": "LDA topics", - "items": { - "type": "object", - "properties": { - "model_info": { - "type": "array", - "title": "Model information", - "items": { - "type": "object", - "properties": { - "source": { - "title": "Source", - "type": "string" - }, - "author": { - "title": "Author", - "type": "string" - }, - "version": { - "title": "Version", - "type": "string" - }, - "model_id": { - "title": "Model Identifier", - "type": "string" - }, - "nb_topics": { - "title": "Number of topics", - "type": "number" - }, - "description": { - "title": "Description", - "type": "string" - }, - "corpus": { - "title": "Corpus name", - "type": "string" - }, - "uri": { - "title": "URI", - "type": "string" - } - } - }, - "required": [ - "model_id" - ] - }, - "topic_description": { - "type": "array", - "title": "Topic information", - "items": { - "type": "object", - "properties": { - "topic_id": { - "title": "Topic identifier", - "type": [ - "integer", - "string" - ] - }, - "topic_score": { - "title": "Topic score", - "type": [ - "number", - "string" - ] - }, - "topic_label": { - "title": "Topic label", - "type": "string" - }, - "topic_words": { - "type": "array", - "title": "Topic words", - "description": "Words", - "items": { - "type": "object", - "properties": { - "word": { - "title": "Word", - "type": "string" - }, - "word_weight": { - "title": "Word weight", - "type": "number" - } - } - }, - "required": [ - "word" - ] - } - } - }, - "required": [ - "topic_id" - ] - } - }, - "additionalProperties": 
false - } - }, - "embeddings": { - "type": "array", - "title": "Word embeddings", - "description": "Word embeddings", - "items": { - "type": "object", - "properties": { - "id": { - "title": "Vector Model ID", - "type": "string" - }, - "description": { - "title": "Vector Model Description", - "type": "string" - }, - "date": { - "title": "Date (YYYY-MM-DD)", - "type": "string" - }, - "vector": { - "title": "Vector", - "type": "object" - } - }, - "required": [ - "id", - "vector" - ] - } - }, "additional": { "type": "object", "description": "Additional metadata", diff --git a/schemas/video-schema.json b/schemas/video-schema.json index 4fd3aba..591de52 100644 --- a/schemas/video-schema.json +++ b/schemas/video-schema.json @@ -632,137 +632,6 @@ "tag" ] }, - "lda_topics": { - "type": "array", - "title": "LDA topics", - "description": "LDA topics", - "items": { - "type": "object", - "properties": { - "model_info": { - "type": "array", - "title": "Model information", - "items": { - "type": "object", - "properties": { - "source": { - "title": "Source", - "type": "string" - }, - "author": { - "title": "Author", - "type": "string" - }, - "version": { - "title": "Version", - "type": "string" - }, - "model_id": { - "title": "Model Identifier", - "type": "string" - }, - "nb_topics": { - "title": "Number of topics", - "type": "number" - }, - "description": { - "title": "Description", - "type": "string" - }, - "corpus": { - "title": "Corpus name", - "type": "string" - }, - "uri": { - "title": "URI", - "type": "string" - } - } - }, - "required": [ - "model_id" - ] - }, - "topic_description": { - "type": "array", - "title": "Topic information", - "items": { - "type": "object", - "properties": { - "topic_id": { - "title": "Topic identifier", - "type": ["integer", "string"] - }, - "topic_score": { - "title": "Topic score", - "type": ["number", "string"] - }, - "topic_label": { - "title": "Topic label", - "type": "string" - }, - "topic_words": { - "type": "array", - "title": "Topic 
words", - "description": "Words", - "items": { - "type": "object", - "properties": { - "word": { - "title": "Word", - "type": "string" - }, - "word_weight": { - "title": "Word weight", - "type": "number" - } - } - }, - "required": [ - "word" - ] - } - } - }, - "required": [ - "topic_id" - ] - } - - }, - "additionalProperties": false - } - }, - "embeddings":{ - "type": "array", - "title": "Word embeddings", - "description": "Word embeddings", - "items": { - "type": "object", - "properties": { - "id": { - "title": "Vector Model ID", - "type": "string" - }, - "description": { - "title": "Vector Model Description", - "type": "string" - }, - "date": { - "title": "Date (YYYY-MM-DD)", - "type": "string" - }, - "vector": { - "title": "Vector", - "type": "object" - } - }, - "required": [ - "id","vector" - ] - } - }, - "additional": { "type": "object", "description": "Additional metadata",