Skip to content

Commit

Permalink
include resource, delete Series, remove template stuff but make the m…
Browse files Browse the repository at this point in the history
…anager work with templates
  • Loading branch information
Gordon Blackadder committed Sep 5, 2024
1 parent d762c3f commit 05f04a4
Show file tree
Hide file tree
Showing 31 changed files with 434 additions and 518 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,8 @@ survey_metadata.study_desc.title_statement.idno = "project_idno"

To update the pydantic schemas so that they match the latest JSON schemas, run

`python pydantic_schemas\\generators\\generate_pydantic_schemas.py`
`python pydantic_schemas/generators/generate_pydantic_schemas.py`

Then, to update the Excel sheets, run

`python pydantic_schemas\\generators\\generate_excel_files.py`
`python pydantic_schemas/generators/generate_excel_files.py`
Binary file modified excel_sheets/Document_metadata.xlsx
Binary file not shown.
Binary file added excel_sheets/Resource_metadata.xlsx
Binary file not shown.
Binary file modified excel_sheets/Script_metadata.xlsx
Binary file not shown.
Binary file removed excel_sheets/Series_metadata.xlsx
Binary file not shown.
Binary file modified excel_sheets/Survey_metadata.xlsx
Binary file not shown.
Binary file modified excel_sheets/Table_metadata.xlsx
Binary file not shown.
Binary file modified excel_sheets/Timeseries_db_metadata.xlsx
Binary file not shown.
Binary file modified excel_sheets/Timeseries_metadata.xlsx
Binary file not shown.
Binary file modified excel_sheets/Video_metadata.xlsx
Binary file not shown.
4 changes: 2 additions & 2 deletions pydantic_schemas/document_schema.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# generated by datamodel-codegen:
# filename: document-schema.json
# timestamp: 2024-07-24T21:06:20+00:00
# timestamp: 2024-08-29T18:53:37+00:00

from __future__ import annotations

Expand All @@ -9,7 +9,7 @@

from pydantic import Extra, Field

from .schema_base_model import SchemaBaseModel
from .utils.schema_base_model import SchemaBaseModel


class Overwrite(Enum):
Expand Down
14 changes: 8 additions & 6 deletions pydantic_schemas/generators/generate_excel_files.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
import os

from pydantic_schemas.schema_interface import SchemaInterface
from pydantic_schemas.metadata_manager import MetadataManager

ei = SchemaInterface()
metadata_manager = MetadataManager()

for metadata_type in ei.list_metadata_types():
filename = f"excel_sheets/{metadata_type.capitalize()}_metadata.xlsx"
print(f"Writing {metadata_type} outline to {filename}")
for metadata_name in metadata_manager.metadata_type_names:
if metadata_name in ["image", "geospatial"]:
continue
filename = f"excel_sheets/{metadata_name.capitalize()}_metadata.xlsx"
print(f"Writing {metadata_name} outline to {filename}")
if os.path.exists(filename):
os.remove(filename)
ei.write_outline_metadata_to_excel(metadata_type=metadata_type, filename=filename)
metadata_manager.write_metadata_outline_to_excel(metadata_name_or_class=metadata_name, filename=filename)
3 changes: 2 additions & 1 deletion pydantic_schemas/generators/generate_pydantic_schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@
SCHEMA_DIR = "schemas"
OUTPUT_DIR = os.path.join("pydantic_schemas")
PYTHON_VERSION = "3.11"
BASE_CLASS = ".schema_base_model.SchemaBaseModel"
BASE_CLASS = ".utils.schema_base_model.SchemaBaseModel"
INPUTS = [
"document-schema.json",
"geospatial-schema.json",
"image-schema.json",
"microdata-schema.json",
"resource-schema.json",
"script-schema.json",
"table-schema.json",
"timeseries-db-schema.json",
Expand Down
4 changes: 2 additions & 2 deletions pydantic_schemas/geospatial_schema.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# generated by datamodel-codegen:
# filename: geospatial-schema.json
# timestamp: 2024-07-24T21:06:22+00:00
# timestamp: 2024-08-29T18:53:39+00:00

from __future__ import annotations

Expand All @@ -9,7 +9,7 @@

from pydantic import Extra, Field, confloat

from .schema_base_model import SchemaBaseModel
from .utils.schema_base_model import SchemaBaseModel


class Producer(SchemaBaseModel):
Expand Down
4 changes: 2 additions & 2 deletions pydantic_schemas/image_schema.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# generated by datamodel-codegen:
# filename: image-schema.json
# timestamp: 2024-07-24T21:06:23+00:00
# timestamp: 2024-08-29T18:53:41+00:00

from __future__ import annotations

Expand All @@ -10,7 +10,7 @@

from pydantic import AnyUrl, Extra, Field, confloat, constr

from .schema_base_model import SchemaBaseModel
from .utils.schema_base_model import SchemaBaseModel


class Overwrite(Enum):
Expand Down
273 changes: 273 additions & 0 deletions pydantic_schemas/metadata_manager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,273 @@
from copy import copy
from typing import Dict, Optional, Type, Union

from openpyxl import load_workbook
from pydantic import BaseModel

from . import ( # image_schema,
document_schema,
geospatial_schema,
microdata_schema,
resource_schema,
script_schema,
table_schema,
timeseries_db_schema,
timeseries_schema,
video_schema,
)
from .utils.excel_to_pydantic import excel_doc_to_pydantic, excel_single_sheet_to_pydantic
from .utils.pydantic_to_excel import write_across_many_sheets, write_to_single_sheet
from .utils.quick_start import make_skeleton
from .utils.utils import merge_dicts, standardize_keys_in_dict


class MetadataManager:
    """
    Interface with Excel for creating, saving and updating metadata.

    Supported metadata type names (the keys of ``_TYPE_TO_SCHEMA``):
        document, geospatial, resource, script, survey, table, timeseries,
        timeseries_db, video

    Geospatial has a schema but no Excel writer or reader, and image is
    disabled entirely, so neither can be written to or read from Excel.

    The manager also retrieves the pydantic model definition for each
    metadata type and accepts arbitrary pydantic classes ("templates"),
    which are always written to and read from a single sheet.
    """

    # Maps a standardized metadata name to its pydantic schema class.
    _TYPE_TO_SCHEMA = {
        "document": document_schema.ScriptSchemaDraft,
        "geospatial": geospatial_schema.GeospatialSchema,
        # "image":image_schema.ImageDataTypeSchema,
        "resource": resource_schema.Model,
        "script": script_schema.ResearchProjectSchemaDraft,
        "survey": microdata_schema.MicrodataSchema,
        "table": table_schema.Model,
        "timeseries": timeseries_schema.TimeseriesSchema,
        "timeseries_db": timeseries_db_schema.TimeseriesDatabaseSchema,
        "video": video_schema.Model,
    }

    # Maps a standardized metadata name to the function that writes it to Excel.
    _TYPE_TO_WRITER = {
        "document": write_across_many_sheets,
        # "geospatial":,
        # "image":,
        "resource": write_to_single_sheet,
        "script": write_across_many_sheets,
        "survey": write_across_many_sheets,
        "table": write_across_many_sheets,
        "timeseries": write_across_many_sheets,
        "timeseries_db": write_to_single_sheet,  # one sheet
        "video": write_to_single_sheet,  # one sheet
    }

    # Maps a standardized metadata name to the function that reads it from Excel.
    _TYPE_TO_READER = {
        "document": excel_doc_to_pydantic,
        # "geospatial":,
        # "image":,
        "resource": excel_single_sheet_to_pydantic,
        "script": excel_doc_to_pydantic,
        "survey": excel_doc_to_pydantic,
        "table": excel_doc_to_pydantic,
        "timeseries": excel_doc_to_pydantic,
        "timeseries_db": excel_single_sheet_to_pydantic,  # one sheet
        "video": excel_single_sheet_to_pydantic,  # one sheet
    }

    def metadata_class_from_name(self, metadata_name: str):
        """
        Return the pydantic schema class registered for ``metadata_name``.

        Args:
            metadata_name (str): a metadata type name; it is standardized first, so
                case and hyphen/underscore differences are tolerated.

        Raises:
            NotImplementedError: if the name is "image".
            ValueError: if the name is not a known metadata type.
        """
        metadata_name = self.standardize_metadata_name(metadata_name)
        schema = self._TYPE_TO_SCHEMA[metadata_name]
        # NOTE: copy.copy returns classes unchanged, so this is the registered class itself.
        return copy(schema)

    @property
    def metadata_type_names(self):
        """List of the metadata type names that have a registered schema."""
        return list(self._TYPE_TO_SCHEMA)

    def standardize_metadata_name(self, metadata_name: str) -> str:
        """
        Normalize a metadata type name to the form used as a key in the lookup tables.

        The name is lower-cased, hyphens become underscores, and the aliases
        "microdata" and "survey_microdata" are mapped to "survey".

        Raises:
            NotImplementedError: if the standardized name is "image".
            ValueError: if the standardized name is not a known metadata type.
        """
        metadata_name = metadata_name.lower().replace("-", "_")
        if metadata_name in ("microdata", "survey_microdata"):
            metadata_name = "survey"
        self._raise_if_unsupported_metadata_name(metadata_name=metadata_name)
        return metadata_name

    def create_metadata_outline(
        self, metadata_name_or_class: Union[str, Type[BaseModel]], debug: bool = False
    ) -> BaseModel:
        """
        Build a skeleton object for the given metadata type via ``make_skeleton``.

        Args:
            metadata_name_or_class (str or type[BaseModel]): a supported metadata type name,
                or a pydantic class (for instance a template) to build the skeleton from.
            debug (bool): passed through to ``make_skeleton``.

        Returns:
            BaseModel: the skeleton object produced by ``make_skeleton``.
        """
        if isinstance(metadata_name_or_class, str):
            schema = self.metadata_class_from_name(metadata_name_or_class)
        else:
            schema = metadata_name_or_class
        return make_skeleton(schema, debug=debug)

    @staticmethod
    def _default_filename_and_title(metadata_name: str, filename, title: Optional[str]):
        """Fill in the default filename and title and ensure the filename ends in '.xlsx'."""
        if filename is None:
            filename = f"{metadata_name}_metadata.xlsx"
        elif not str(filename).endswith(".xlsx"):
            # An f-string rather than `+=` so path-like filenames do not raise TypeError.
            filename = f"{filename}.xlsx"
        if title is None:
            title = f"{metadata_name.capitalize()} Metadata"
        return filename, title

    def write_metadata_outline_to_excel(
        self,
        metadata_name_or_class: Union[str, Type[BaseModel]],
        filename: Optional[str] = None,
        title: Optional[str] = None,
    ) -> str:
        """
        Create an Excel file formatted for writing the given metadata_name metadata.

        Args:
            metadata_name_or_class (str or type[BaseModel]): the name of a supported metadata type, currently:
                    document, resource, script, survey, table, timeseries, timeseries_db, video
                Currently not supported:
                    geospatial, image
                If passed as a BaseModel type, for instance this is what you would do with a template, then the
                writer defaults to a single page.
            filename (Optional[str]): The path to the Excel file. If None, defaults to {metadata_name}_metadata.xlsx
            title (Optional[str]): The title for the Excel sheet. If None, defaults to '{metadata_name} Metadata'

        Returns:
            str: filename of metadata file

        Raises:
            NotImplementedError: if the metadata type is geospatial or image.
            ValueError: if the metadata type name is unknown.

        Outputs:
            An Excel file into which metadata can be entered
        """
        if isinstance(metadata_name_or_class, str):
            metadata_name = self.standardize_metadata_name(metadata_name_or_class)
            if metadata_name == "geospatial":
                raise NotImplementedError("Geospatial schema contains an infinite loop so cannot be written to excel")
            writer = self._TYPE_TO_WRITER[metadata_name]
            skeleton_object = self.create_metadata_outline(metadata_name, debug=False)
        else:
            # Templates are named after the title of their JSON schema and use a single sheet.
            metadata_name = metadata_name_or_class.model_json_schema()["title"]
            writer = write_to_single_sheet
            skeleton_object = self.create_metadata_outline(metadata_name_or_class, debug=False)

        filename, title = self._default_filename_and_title(metadata_name, filename, title)
        writer(filename, skeleton_object, metadata_name, title)
        return filename

    def save_metadata_to_excel(
        self,
        metadata_name_or_class: Union[str, Type[BaseModel]],
        object: BaseModel,
        filename: Optional[str] = None,
        title: Optional[str] = None,
    ) -> str:
        """
        Save an Excel document of the given metadata object.

        Args:
            metadata_name_or_class (str or type[BaseModel]): the name of a supported metadata type, currently:
                    document, resource, script, survey, table, timeseries, timeseries_db, video
                Currently not supported:
                    geospatial, image
                If passed as a BaseModel type, for instance this is what you would do with a template, then the
                writer defaults to a single page.
            object (BaseModel): The pydantic object to save to the Excel file.
            filename (Optional[str]): The path to the Excel file. Defaults to {name}_metadata.xlsx
            title (Optional[str]): The title for the Excel sheet. Defaults to '{name} Metadata'

        Returns:
            str: filename of metadata file

        Raises:
            NotImplementedError: if the metadata type is geospatial or image.
            ValueError: if the metadata type name is unknown.

        Outputs:
            An Excel file containing the metadata from the pydantic object. This file can be updated as needed.
        """
        if isinstance(metadata_name_or_class, str):
            metadata_name = self.standardize_metadata_name(metadata_name_or_class)
            if metadata_name == "geospatial":
                raise NotImplementedError("Geospatial schema contains an infinite loop so cannot be written to excel")
            schema = self.metadata_class_from_name(metadata_name)
            writer = self._TYPE_TO_WRITER[metadata_name]
        else:
            # Templates keep the single-sheet writer; their schema title is not a key of
            # _TYPE_TO_WRITER, so it must not be looked up there.
            schema = metadata_name_or_class
            writer = write_to_single_sheet
            metadata_name = schema.model_json_schema()["title"]
        skeleton_object = self.create_metadata_outline(metadata_name_or_class=schema, debug=False)

        filename, title = self._default_filename_and_title(metadata_name, filename, title)

        # Overlay the explicitly-set values of `object` on top of the full skeleton.
        combined_dict = merge_dicts(
            skeleton_object.model_dump(),
            object.model_dump(exclude_none=True, exclude_unset=True, exclude_defaults=True),
        )
        combined_dict = standardize_keys_in_dict(combined_dict)
        new_ob = schema(**combined_dict)

        writer(filename, new_ob, metadata_name, title)
        return filename

    @staticmethod
    def _get_metadata_name_from_excel_file(filename: str) -> str:
        """
        Read the metadata type name from cell C1 of the 'metadata' sheet.

        The cell is expected to hold a string whose space-separated words are
        '<name> type metadata ...'; the first word is returned.

        Raises:
            ValueError: if the sheet is missing, the file cannot be read, or the cell
                is empty, not a string, or not in the expected format.
        """
        error_message = "Improperly formatted Excel file for metadata"
        workbook = load_workbook(filename)
        try:
            # Select the 'metadata' sheet and get the value of cell C1
            sheet = workbook["metadata"]
            type_info = sheet["C1"].value
        except KeyError as e:
            raise ValueError(f"Sheet 'metadata' not found. {error_message}") from e
        except Exception as e:
            raise ValueError(f"Error reading Excel file: {e}") from e
        finally:
            # Close the workbook
            workbook.close()

        if not type_info or not isinstance(type_info, str):
            raise ValueError(f"Cell C1 is empty or not a string. {error_message}")

        cell_values = type_info.split(" ")

        if len(cell_values) < 3 or cell_values[1] != "type" or cell_values[2] != "metadata":
            raise ValueError(f"Cell C1 is improperly formatted. {error_message}")

        return cell_values[0]

    def read_metadata_from_excel(self, filename: str, metadata_class: Optional[Type[BaseModel]] = None) -> BaseModel:
        """
        Read in metadata from an appropriately formatted Excel file as a pydantic object.

        If using a standard metadata type (document, resource, script, survey, table, timeseries,
        timeseries_db, video) then there is no need to pass in the metadata_class. But if using a
        template, then the class must be provided.

        Args:
            filename (str): The path to the Excel file.
            metadata_class (Optional type of BaseModel): A pydantic class type corresponding to the type
                used to write the Excel file

        Returns:
            BaseModel: a pydantic object containing the metadata from the file

        Raises:
            ValueError: if the file is improperly formatted, or if the metadata type named in the
                file is unknown and no metadata_class was given.
            NotImplementedError: if the metadata type named in the file is image.
        """
        metadata_name = self._get_metadata_name_from_excel_file(filename)
        try:
            metadata_name = self.standardize_metadata_name(metadata_name)
            schema = self._TYPE_TO_SCHEMA[metadata_name]
            reader = self._TYPE_TO_READER[metadata_name]
        except ValueError as e:
            # Unknown name: assume the file was written from a template, which needs its class.
            if metadata_class is None:
                raise ValueError(
                    f"'{metadata_name}' not supported. Must be: {list(self._TYPE_TO_SCHEMA.keys())} or try passing in the metadata_class"
                ) from e
            schema = metadata_class
            reader = excel_single_sheet_to_pydantic
        read_object = reader(filename, schema)

        skeleton_object = self.create_metadata_outline(metadata_name_or_class=schema, debug=False)

        # Overlay the values read from the file on top of the full skeleton.
        read_object_dict = read_object.model_dump(exclude_none=True, exclude_unset=True, exclude_defaults=True)
        combined_dict = merge_dicts(
            skeleton_object.model_dump(),
            read_object_dict,
        )
        combined_dict = standardize_keys_in_dict(combined_dict)
        new_ob = schema(**combined_dict)
        return new_ob

    def _raise_if_unsupported_metadata_name(self, metadata_name: str):
        """
        Validate an already-standardized metadata name.

        If the name is "image", which is specifically unsupported, a NotImplementedError is raised.
        If the name is simply unknown then a ValueError is raised.
        Geospatial passes this check; the Excel writers reject it separately.
        """
        if metadata_name == "image":
            raise NotImplementedError("Due to an issue with image metadata schema definition causing __root__ errors")
        if metadata_name not in self._TYPE_TO_SCHEMA:
            raise ValueError(f"'{metadata_name}' not supported. Must be: {list(self._TYPE_TO_SCHEMA.keys())}")
4 changes: 2 additions & 2 deletions pydantic_schemas/microdata_schema.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# generated by datamodel-codegen:
# filename: microdata-schema.json
# timestamp: 2024-07-24T21:06:25+00:00
# timestamp: 2024-08-29T18:53:43+00:00

from __future__ import annotations

Expand All @@ -9,7 +9,7 @@

from pydantic import Extra, Field, constr

from .schema_base_model import SchemaBaseModel
from .utils.schema_base_model import SchemaBaseModel


class AccessPolicy(Enum):
Expand Down
Loading

0 comments on commit 05f04a4

Please sign in to comment.