diff --git a/.gitignore b/.gitignore index d922681..81bf229 100644 --- a/.gitignore +++ b/.gitignore @@ -92,3 +92,5 @@ ENV/ # Mac stuff .DS_Store + +PR_DESCRIPTION.md \ No newline at end of file diff --git a/docs/ck-tutorial.rst b/docs/ck-tutorial.rst index a8bc882..2b8746a 100644 --- a/docs/ck-tutorial.rst +++ b/docs/ck-tutorial.rst @@ -70,7 +70,7 @@ The Reference Section In the reference section, information about the experimental facility and the article where the data is published is collected. This information typically includes: - * the type of experiment (for now, only autoignition experiments are supported) + * the type of experiment (ignition delay, laminar burning velocity, etc.) * the type and location of the experimental apparatus (rapid compression machine or shock tube) * the article authors and the journal, DOI, volume, and issue where the data was published * a note about where in the paper the data was collected from, if multiple data sets are diff --git a/docs/schema-docs.rst b/docs/schema-docs.rst index 372fda2..e313245 100644 --- a/docs/schema-docs.rst +++ b/docs/schema-docs.rst @@ -63,7 +63,7 @@ section are required, although some of the sub-keys are optional. This mapping provides information about the apparatus used to conduct the experiments. Fields: - ``kind``: string, required - Must be one of ``shock tube`` or ``rapid compression machine``. Values are case-sensitive. + Must be one of ``shock tube``, ``rapid compression machine``, ``stirred reactor``, ``jet stirred reactor``, ``flow reactor``, ``flame``, ``outwardly propagating spherical flame``, ``heat flux burner``, or ``flame cone method``. Values are case-sensitive. - ``institution``: string, optional The institution where the experimental apparatus is located @@ -71,12 +71,32 @@ section are required, although some of the sub-keys are optional. - ``facility``: string, optional A unique name or identifier for the apparatus, if the institution has several that are similar + - ``mode``: sequence, optional + A sequence of strings describing the mode(s) of operation of the apparatus, if applicable. + Multiple modes may be specified to capture different facets of the configuration (e.g., flow + regime and burner geometry for a flame experiment). Each element must be one of the + following case-sensitive values: + + * Shock tube modes: ``reflected shock``, ``incident shock``, ``reflected shock wave``, ``incident shock wave`` + * Flow regime: ``laminar``, ``turbulent`` + * Flame/burner configurations: ``burner stabilized``, ``burner-stabilized``, + ``constant volume combustion chamber``, ``premixed``, ``unstretched``, ``spherical``, ``cylindrical``, ``slot burner``, ``modified Bunsen burner``, ``counterflow``, ``twin flat``, ``adiabatic`` + * Flame method abbreviations: ``OPF``, ``HFM``, ``CTF``, ``SFF``, ``FCM``, ``LFF``, ``Heat Flux Burner`` + * Stretch extrapolation methods: ``extrapolation method to zero stretch: LS``, ``extrapolation method to zero stretch: NQ``, ``extrapolation method to zero stretch: LC`` .. _reference-experiment-type: * ``experiment-type``: string, required - The type of experiment encoded in this file. Currently, the only allowed value is - ``ignition delay``, which is case sensitive. + The type of experiment encoded in this file. Must be one of the following case-sensitive + values: + + * ``ignition delay`` + * ``laminar burning velocity measurement`` + * ``concentration time profile measurement`` + * ``jet stirred reactor measurement`` + * ``outlet concentration measurement`` + * ``burner stabilized flame speciation measurement`` + * ``rate coefficient`` .. _reference-reference: @@ -129,11 +149,96 @@ particular experiment type. The pressure of the experiment, with dimensions of mass per length per time squared. Must conform to :ref:`value-unit-optional ` +.. _common-temperature: + +* ``temperature``: sequence, optional + The temperature of the experiment, with dimensions of temperature. Must conform to + :ref:`value-unit-optional ` + .. _common-ignition-type: * ``ignition-type``: mapping, optional Has the same schema as :ref:`ignition-type ` +.. _common-ignition-delay: + +* ``ignition-delay``: sequence, optional + The ignition delay measurement, with dimensions of time. Must conform to + :ref:`value-unit-optional ` + +.. _common-equivalence-ratio: + +* ``equivalence-ratio``: sequence, optional + The equivalence ratio of the experiment, dimensionless. Must conform to + :ref:`value-unit-optional ` + +.. _common-laminar-burning-velocity: + +* ``laminar-burning-velocity``: sequence, optional + The laminar burning velocity measurement, with dimensions of length per time. Must conform to + :ref:`value-unit-optional ` + +.. _common-residence-time: + +* ``residence-time``: sequence, optional + The residence time in a flow/jet-stirred reactor experiment, with dimensions of time. Must + conform to :ref:`value-unit-optional ` + +.. _common-reactor-volume: + +* ``reactor-volume``: sequence, optional + The volume of the reactor, with dimensions of length cubed. Must conform to + :ref:`value-unit-optional ` + +.. _common-reactor-length: + +* ``reactor-length``: sequence, optional + The length of the reactor, with dimensions of length. Must conform to + :ref:`value-unit-optional ` + +.. _common-reactor-diameter: + +* ``reactor-diameter``: sequence, optional + The diameter of the reactor, with dimensions of length. Must conform to + :ref:`value-unit-optional ` + +.. _common-flow-rate: + +* ``flow-rate``: sequence, optional + The flow rate through the reactor. Must conform to + :ref:`value-unit-optional ` + +.. _common-environment-temperature: + +* ``environment-temperature``: sequence, optional + The temperature of the environment surrounding the reactor, with dimensions of temperature. + Must conform to :ref:`value-unit-optional ` + +.. _common-global-heat-exchange-coefficient: + +* ``global-heat-exchange-coefficient``: sequence, optional + The global heat exchange coefficient between the reactor and its environment. Must conform to + :ref:`value-unit-optional ` + +.. _common-exchange-area: + +* ``exchange-area``: sequence, optional + The heat exchange area between the reactor and its environment, with dimensions of length + squared. Must conform to :ref:`value-unit-optional ` + +.. _common-pressure-in-reference-state: + +* ``pressure-in-reference-state``: sequence, optional + The pressure used to define the reference state for reported quantities, with dimensions of + mass per length per time squared. Must conform to + :ref:`value-unit-optional ` + +.. _common-temperature-in-reference-state: + +* ``temperature-in-reference-state``: sequence, optional + The temperature used to define the reference state for reported quantities, with dimensions of + temperature. Must conform to :ref:`value-unit-optional ` + .. _common-composition: * ``composition``: mapping, optional @@ -167,9 +272,15 @@ particular experiment type. The amount of the element * ``amount``: sequence, required - A sequence representing the amount of the species. Must conform to either + A sequence conforming to either :ref:`value-with-uncertainty ` or - :ref:`value-without-uncertainty `. + :ref:`value-without-uncertainty `, where the first + element is a float representing the species amount (interpreted according to the + parent ``kind``, e.g., mole fraction, mass fraction, or concentration units). The + optional metadata mapping may additionally include the + :ref:`evaluated-standard-deviation ` fields. + Because species amounts are unitless numbers, all uncertainty and + evaluated-standard-deviation values must be plain floats (not strings with units). .. _ignition-delay-keys: @@ -202,23 +313,33 @@ for the :ref:`datapoints ` schema. A mapping describing how the ignition delay is defined in the experiments. Fields: - ``target``: string, required - Describes the target measurement to define ignition. Can be one of: - - * ``temperature`` - * ``pressure`` - * ``OH`` - * ``OH*`` - * ``CH`` - * ``CH*`` + Describes the target measurement (species or physical quantity) used to define ignition. + Must be one of: ``temperature``, ``pressure``, ``OH``, ``OH*``, ``CH``, ``CH*``, ``NH3``, + ``CO2``, ``N2O``, ``CH4``, ``OHEX``, ``CHEX``, ``CO``, ``H2O``, ``C2``, ``O``, + ``CH3OH``, ``CH3``, ``O2``, ``soot``, ``CO;O``, ``[O]*[CO]``, or ``NEOC5H11``. - ``type``: string, required Describes the type of ignition delay measurement. Can be one of: * ``d/dt max``: maximum of the time derivative of the ``target`` + * ``d/dt min extrapolated``: minimum slope of the ``target`` extrapolated to the + baseline + * ``d/dt max extrapolated``: maximum slope of the ``target`` extrapolated to the + baseline + * ``d/dt second max``: second maximum of the time derivative of the ``target`` * ``max``: maximum of the ``target`` * ``1/2 max``: half-maximum of the ``target`` * ``min``: minimum of the ``target`` - * ``d/dt max extrapolated``: maximum slope of the target extrapolated to the baseline + * ``concentration``: the ``target`` reaches a specified concentration + * ``relative concentration``: the ``target`` reaches a specified fraction of a + reference concentration + * ``relative increase``: the ``target`` increases by a specified amount relative to + its initial value + + - ``amount``: float, optional + A numeric threshold associated with the ignition ``type`` (for example, the concentration + or relative-increase value used when ``type`` is ``concentration``, ``relative + concentration``, or ``relative increase``). .. _ignition-ignition-delay: @@ -240,8 +361,9 @@ for the :ref:`datapoints ` schema. .. _ignition-equivalence-ratio: -* ``equivalence-ratio``: float, optional - The equivalence ratio of the experiment, dimensionless. Minimum value is 0.0. +* ``equivalence-ratio``: sequence, optional + The equivalence ratio of the experiment, dimensionless. Must conform to + :ref:`value-unit-optional `. .. _ignition-rcm-data: @@ -253,7 +375,24 @@ for the :ref:`datapoints ` schema. * ``time-histories``: sequence, optional A sequence of mappings conforming to the :ref:`time-history ` - schema. Used to specify a time-varying history of values during an experiment. + schema. Used to specify a time-varying history of one or more quantities during an experiment. + +.. _ignition-volume-history: + +* ``volume-history``: mapping, optional + A legacy key for specifying a volume time-history for RCM experiments. New files should use + :ref:`time-histories ` with ``type: volume`` instead. Fields: + + - ``volume``: mapping, required + Describes the volume column in the ``values`` array. Must contain ``units`` (string with + dimensions of length cubed) and ``column`` (integer, 0 or 1). + + - ``time``: mapping, required + Describes the time column in the ``values`` array. Must contain ``units`` (string with + dimensions of time) and ``column`` (integer, 0 or 1). + + - ``values``: sequence, required + A sequence of ``[time, volume]`` pairs of floats. .. _rcm-data-keys: @@ -302,6 +441,247 @@ subkeys of the :ref:`rcm-data ` key. compression, with dimensions of length. Must conform to :ref:`value-unit-optional ` +.. _laminar-burning-velocity-keys: + +Laminar Burning Velocity Measurement Keys +----------------------------------------- + +This section details the schema for a laminar burning velocity measurement datapoint, selected +when :ref:`experiment-type ` is ``laminar burning velocity measurement``. + +* ``temperature``: sequence, required + Unburnt-mixture temperature, with dimensions of temperature. Must conform to + :ref:`value-unit-required `. + +* ``pressure``: sequence, required + Unburnt-mixture pressure, with dimensions of mass per length per time squared. Must conform + to :ref:`value-unit-required `. + +* ``laminar-burning-velocity``: sequence, required + The measured laminar burning velocity, with dimensions of length per time. Must conform to + :ref:`value-unit-required `. + +* ``composition``: mapping, required + The composition of the unburnt mixture. Must conform to + :ref:`composition `. + +* ``pressure-rise``: sequence, optional + Rate of pressure rise during the measurement, with dimensions of inverse time. Must conform + to :ref:`value-unit-optional `. + +* ``equivalence-ratio``: sequence, optional + The equivalence ratio of the experiment, dimensionless. Must conform to + :ref:`value-unit-optional `. + +.. _jet-stirred-reactor-keys: + +Jet Stirred Reactor Measurement Keys +------------------------------------ + +This section details the schema for a jet stirred reactor measurement datapoint, selected when +:ref:`experiment-type ` is ``jet stirred reactor measurement``. + +* ``temperature``: sequence, required + Reactor temperature, with dimensions of temperature. Must conform to + :ref:`value-unit-required `. + +* ``pressure``: sequence, required + Reactor pressure, with dimensions of mass per length per time squared. Must conform to + :ref:`value-unit-required `. + +* ``composition``: mapping, required + The composition of the inlet mixture. Must conform to + :ref:`composition `. + +* ``measured-composition``: mapping, required + The composition measured at the reactor outlet. Must conform to + :ref:`composition `. + +* ``equivalence-ratio``: sequence, optional + The equivalence ratio of the experiment, dimensionless. Must conform to + :ref:`value-unit-optional `. + +* ``environment-temperature``: sequence, optional + Temperature of the environment surrounding the reactor, with dimensions of temperature. + Must conform to :ref:`value-unit-optional `. + +.. _outlet-concentration-keys: + +Outlet Concentration Measurement Keys +------------------------------------- + +This section details the schema for an outlet concentration measurement datapoint (e.g., flow +reactor), selected when :ref:`experiment-type ` is ``outlet +concentration measurement``. + +* ``temperature``: sequence, required + Reactor temperature, with dimensions of temperature. Must conform to + :ref:`value-unit-required `. + +* ``pressure``: sequence, required + Reactor pressure, with dimensions of mass per length per time squared. Must conform to + :ref:`value-unit-required `. + +* ``composition``: mapping, required + The composition of the inlet mixture. Must conform to + :ref:`composition `. + +* ``measured-composition``: mapping, required + The composition measured at the reactor outlet. Must conform to + :ref:`composition `. + +* ``equivalence-ratio``: sequence, optional + The equivalence ratio of the experiment, dimensionless. Must conform to + :ref:`value-unit-optional `. + +* ``residence-time``: sequence, optional + Residence time in the reactor, with dimensions of time. Must conform to + :ref:`value-unit-optional `. + +* ``volumetric-flow-in-reference-state``: sequence, optional + Volumetric flow rate through the reactor expressed in a defined reference state, with + dimensions of length cubed per time. Must conform to + :ref:`value-unit-optional `. + +.. _concentration-time-profile-keys: + +Concentration Time Profile Measurement Keys +------------------------------------------- + +This section details the schema for a concentration time profile measurement datapoint (e.g., +shock tube or flow reactor species profiles), selected when +:ref:`experiment-type ` is ``concentration time profile +measurement``. + +* ``temperature``: sequence, required + The temperature of the experiment, with dimensions of temperature. Must conform to + :ref:`value-unit-required `. + +* ``pressure``: sequence, required + The pressure of the experiment, with dimensions of mass per length per time squared. Must + conform to :ref:`value-unit-required `. + +* ``composition``: mapping, required + The initial composition of the mixture. Must conform to + :ref:`composition `. + +* ``concentration-profiles``: sequence, required + A sequence of mappings, each describing the time history of a single species' + concentration. Each element has the following fields: + + - ``species-name``: string, required + The name of the species. + + - ``InChI``: string, optional + The InChI string for the species. + + - ``SMILES``: string, optional + The SMILES string for the species. + + - ``quantity``: mapping, required + A mapping describing the recorded concentration column. Fields: + + * ``units``: string, required + The units of the concentration (e.g., ``mol/cm3``, ``mole fraction``). + + - ``time``: mapping, required + A mapping describing the time column. Fields: + + * ``units``: string, required + The units of the time, with dimensions of time. + + - ``values``: sequence, required + A sequence of at least two rows. Each row is either ``[time, concentration]`` (two + floats) or ``[time, concentration, uncertainty]`` (three floats). + +* ``equivalence-ratio``: sequence, optional + The equivalence ratio of the experiment, dimensionless. Must conform to + :ref:`value-unit-optional `. + +* ``time-shift``: mapping, optional + Defines the ``t = 0`` reference used for the profile. Fields: + + - ``target``: string, required + The species or quantity used to define the time-zero reference. + + - ``type``: string, required + Must be ``half decrease`` or ``relative decrease``. + + - ``amount``: sequence, optional + A numerical threshold associated with ``type`` (e.g., the fractional decrease). Must + conform to :ref:`value-unit-optional `. + +.. _burner-stabilized-flame-keys: + +Burner Stabilized Flame Speciation Measurement Keys +--------------------------------------------------- + +This section details the schema for a burner stabilized flame speciation measurement datapoint, +selected when :ref:`experiment-type ` is ``burner stabilized flame +speciation measurement``. + +* ``temperature``: sequence, required + The temperature at the measurement location, with dimensions of temperature. Must conform + to :ref:`value-unit-required `. + +* ``pressure``: sequence, required + The pressure of the experiment, with dimensions of mass per length per time squared. Must + conform to :ref:`value-unit-required `. + +* ``distance``: sequence, required + The distance from the burner surface at which the sample was taken, with dimensions of + length. Must conform to :ref:`value-unit-required `. + +* ``composition``: mapping, required + The composition of the inlet (unburnt) mixture. Must conform to + :ref:`composition `. + +* ``measured-composition``: mapping, required + The composition measured at ``distance`` from the burner. Must conform to + :ref:`composition `. + +* ``equivalence-ratio``: sequence, optional + The equivalence ratio of the experiment, dimensionless. Must conform to + :ref:`value-unit-optional `. + +* ``flow-rate``: sequence, optional + The flow rate through the burner. Must conform to + :ref:`value-unit-optional `. + +.. _rate-coefficient-keys: + +Rate Coefficient Keys +--------------------- + +This section details the schema for a rate coefficient determination datapoint, selected when +:ref:`experiment-type ` is ``rate coefficient``. Rate coefficient +experiments measure :math:`k(T)` for a specific reaction; pressure and composition are commonly +absent. + +* ``temperature``: sequence, required + The temperature at which the rate coefficient is reported, with dimensions of temperature. + Must conform to :ref:`value-unit-required `. + +* ``pressure``: sequence, optional + The pressure at which the rate coefficient is reported, with dimensions of mass per length + per time squared. Must conform to :ref:`value-unit-optional `. + +* ``rate-coefficient``: sequence, optional + The measured rate coefficient. Units depend on the reaction order (e.g., ``cm3/mol/s`` for + second order). Must conform to :ref:`value-unit-optional `. + +* ``branching-ratio``: sequence, optional + The branching ratio associated with the measurement, dimensionless. Must conform to + :ref:`value-unit-optional `. + +* ``composition``: mapping, optional + The composition of the mixture, if applicable. Must conform to + :ref:`composition `. + +* ``equivalence-ratio``: sequence, optional + The equivalence ratio of the experiment, dimensionless. Must conform to + :ref:`value-unit-optional `. + .. _schema-only-keys: Schema-Only Keys @@ -324,43 +704,93 @@ should not be used in actual ChemKED files. These keys are documented in this se .. _schema-value-with-uncertainty: * ``value-with-uncertainty``: sequence - A combination of a value and unit with uncertainty. Sequence elements: + A combination of a value and unit with an associated uncertainty and/or evaluated standard + deviation. Sequence elements: - - 0: string, required - The first element of the sequence should be the value and its associated - units. The units are validated to have appropriate dimensions for the particular quantity - under consideration + - 0: string or float, required + The first element of the sequence is the value and its associated units (as a single + string, e.g., ``"1000.0 K"``) or a bare float. The units are validated to have appropriate + dimensions for the particular quantity under consideration. - 1: mapping, optional - The second element of the sequence should be a mapping representing the uncertainty. Fields: + The second element of the sequence is a mapping containing any combination of the + following uncertainty and evaluated-standard-deviation fields: + + - Uncertainty fields: + + * ``uncertainty-type``: string + The type of uncertainty. Must be ``absolute`` or ``relative``. Required when + ``uncertainty``, ``upper-uncertainty``, or ``lower-uncertainty`` is specified. + + * ``uncertainty``: string or float, excludes ``upper-uncertainty`` and ``lower-uncertainty``, requires ``uncertainty-type`` + The symmetric uncertainty of the value. If ``uncertainty-type`` is ``absolute`` + and a string is given, it must include units whose dimensions match the units of + the value in the first element of the sequence. + + * ``upper-uncertainty``: string or float, excludes ``uncertainty``, requires ``lower-uncertainty`` and ``uncertainty-type`` + The upper value of an asymmetrical uncertainty. Due to limitations in the Python + library, asymmetrical uncertainties aren't supported in PyKED, so the larger of + ``upper-uncertainty`` and ``lower-uncertainty`` is used. + + * ``lower-uncertainty``: string or float, excludes ``uncertainty``, requires ``upper-uncertainty`` and ``uncertainty-type`` + The lower value of an asymmetrical uncertainty. Due to limitations in the Python + library, asymmetrical uncertainties aren't supported in PyKED, so the larger of + ``upper-uncertainty`` and ``lower-uncertainty`` is used. + + * ``uncertainty-sourcetype``: string, optional + A label describing how the ``uncertainty`` value was obtained. Typical values + include ``reported``, ``estimated``, ``calculated``, and ``digitized``. - * ``uncertainty-type``: string, required - The type of uncertainty. Options are ``absolute`` or ``relative``. + The mapping may also include the + :ref:`evaluated-standard-deviation ` fields, which + may be combined with, or used independently of, the uncertainty fields above. - * ``uncertainty``: string, required, excludes ``upper-uncertainty`` and ``lower-uncertainty`` - The value of the uncertainty. If ``uncertainty-type`` is ``absolute``, must include - units whose dimensions match the units of the value in the first element of the - sequence. +.. _schema-evaluated-standard-deviation: - * ``upper-uncertainty``: string, required, excludes ``uncertainty``, requires ``lower-uncertainty`` - The upper value of an asymmetrical uncertainty. Due to limitations in the Python - library, asymmetrical uncertainties aren't supported in PyKED, so the larger of - ``upper-uncertainty`` and ``lower-uncertainty`` is used. +* ``evaluated-standard-deviation``: mapping fields + A group of optional fields describing a statistically evaluated standard deviation for a + value (e.g., from a dataset-wide re-evaluation). These fields appear inside the metadata + mapping of a :ref:`value-with-uncertainty ` entry or a + composition :ref:`amount ` metadata mapping, and may be used with or + without the uncertainty fields: - * ``lower-uncertainty``: string, required, excludes ``uncertainty``, requires ``upper-uncertainty`` - The lower value of an asymmetrical uncertainty. Due to limitations in the Python - library, asymmetrical uncertainties aren't supported in PyKED, so the larger of - ``upper-uncertainty`` and ``lower-uncertainty`` is used. + * ``evaluated-standard-deviation``: string or float, optional + The evaluated standard deviation value. If given as a string with ``absolute`` type, + must include units whose dimensions match the value. + + * ``evaluated-standard-deviation-type``: string, optional + Must be ``absolute`` or ``relative``. + + * ``evaluated-standard-deviation-sourcetype``: string, optional + A label describing how the evaluated standard deviation was obtained. Typical values + include ``reported``, ``estimated``, ``calculated``, and ``digitized``. + + * ``evaluated-standard-deviation-method``: string, optional + The method used to compute the evaluated standard deviation. Typical values include + ``generic uncertainty``, ``combined from scatter and reported uncertainty``, and + ``statistical scatter``. .. _schema-value-without-uncertainty: * ``value-without-uncertainty``: sequence - A combination of a value and unit without uncertainty. Sequence elements: + A combination of a value and unit without any uncertainty metadata. Sequence elements: + + - 0: string or float, required + The first element of the sequence is the value and its associated units (as a single + string, e.g., ``"1.0 atm"``) or a bare float. The units are validated to have appropriate + dimensions for the particular quantity under consideration. + +.. _schema-value-metadata-only: - - 0: string, required - The first element of the sequence should be the value and its associated - units. The units are validated to have appropriate dimensions for the particular quantity - under consideration +* ``value-metadata-only``: sequence + A metadata-only entry containing uncertainty and/or evaluated-standard-deviation fields but + no value. Used in ``common-properties`` when the uncertainty metadata is shared across + datapoints but the property value varies per datapoint. Sequence elements: + + - 0: mapping, required + A mapping containing any combination of the uncertainty and evaluated-standard-deviation + fields listed in :ref:`value-with-uncertainty ` (element + ``1``). No value element is included. .. _schema-value-unit-required: @@ -372,24 +802,31 @@ should not be used in actual ChemKED files. These keys are documented in this se .. _schema-value-unit-optional: * ``value-unit-optional``: sequence, optional - A sequence conforming to either :ref:`value-with-uncertainty ` or - :ref:`value-without-uncertainty `. May or may not be included - in the ChemKED file. + A sequence conforming to one of + :ref:`value-with-uncertainty `, + :ref:`value-without-uncertainty `, or + :ref:`value-metadata-only `. May or may not be included in the + ChemKED file. .. _ignition-time-history: * ``time-history``: mapping, optional Specify the time history of a quantity during an experiment. Fields: + - ``type``: string, required + The kind of quantity being recorded. Must be one of ``volume``, ``temperature``, + ``pressure``, ``piston position``, ``light emission``, ``OH emission``, or + ``absorption``. + - ``quantity``: mapping, required - A mapping describing the volume in the history. Fields: + A mapping describing the recorded quantity. Fields: * ``units``: string, required - The units of the volume, with dimensions of length cubed + The units of the quantity, with dimensions appropriate for ``type`` (e.g., length + cubed for ``volume``, temperature for ``temperature``). * ``column``: integer, required - The 0-based index of the column containing the volume information in the ``values`` - array. Must be 0 or 1 + The 0-based index of the column containing the quantity in the ``values`` array. - ``time``: mapping, required A mapping describing the time in the history. Fields: @@ -399,7 +836,7 @@ should not be used in actual ChemKED files. These keys are documented in this se * ``column``: integer, required The 0-based index of the column containing the time information in the ``values`` - array. Must be 0 or 1 + array. - ``uncertainty``: mapping, optional The uncertainty of the values in the ``quantity`` column. Can be specified either globally diff --git a/pyked/batch_convert.py b/pyked/batch_convert.py new file mode 100644 index 0000000..a66f622 --- /dev/null +++ b/pyked/batch_convert.py @@ -0,0 +1,2090 @@ +#!/usr/bin/env python3 +"""Batch converter: ReSpecTh v2.3/v2.4 XML → ChemKED YAML + +Converts experiment XML files from ReSpecTh/indirect/ to ChemKED YAML format +and organises them into ChemKED-database directory structure. + +Usage: + python batch_convert.py + python batch_convert.py -i ReSpecTh/indirect -o ChemKED-database + python batch_convert.py --file ReSpecTh/indirect/ammonia/.../x20100057.xml + python batch_convert.py --dry-run +""" + +import importlib +import os +import xml.etree.ElementTree as ET +from collections import Counter +from pathlib import Path +import yaml +import argparse +import logging + +try: + from pyked.chemked import ChemKED as _ChemKED +except Exception: + _ChemKED = None + +logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s') +log = logging.getLogger(__name__) + + +def _get_chemked_version(): + """Return the ChemKED schema version from the packaged schema, or a default.""" + default = '0.4.1' + try: + schema_mod = importlib.import_module('pyked.validation') + except ImportError: + return default + schema = getattr(schema_mod, 'schema', None) + if not isinstance(schema, dict): + return default + allowed = schema.get('chemked-version', {}).get('allowed') + if isinstance(allowed, (list, tuple)) and allowed: + return str(allowed[-1]) + return default + + +CHEMKED_VERSION = _get_chemked_version() + + +class UnsupportedUnitsError(Exception): + """Raised when composition uses units not supported by the ChemKED schema.""" + + +# Custom YAML dumper that preserves dict insertion order and indents block sequences +class _OrderedDumper(yaml.Dumper): + def increase_indent(self, flow=False, indentless=False): + return super().increase_indent(flow=flow, indentless=False) + +def _dict_representer(dumper, data): + return dumper.represent_mapping(yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG, + data.items()) + +_OrderedDumper.add_representer(dict, _dict_representer) + + +class _FlowList(list): + """List subclass that signals the YAML dumper to use flow style.""" + pass + +def _flow_list_representer(dumper, data): + return dumper.represent_sequence(yaml.resolver.BaseResolver.DEFAULT_SEQUENCE_TAG, data, flow_style=True) + +_OrderedDumper.add_representer(_FlowList, _flow_list_representer) + + +def yaml_dump(data, stream): + """Dump data to YAML preserving dict key order with indented block sequences.""" + stream.write('---\n') + yaml.dump(data, stream, Dumper=_OrderedDumper, + default_flow_style=False, allow_unicode=True) + stream.write('...\n') + +# Experiment type mapping (ReSpecTh text → ChemKED value) +EXP_TYPE_MAP = { + 'ignition delay measurement': 'ignition delay', + 'laminar burning velocity measurement': 'laminar burning velocity measurement', + 'concentration time profile measurement': 'concentration time profile measurement', + 'jet stirred reactor measurement': 'jet stirred reactor measurement', + 'outlet concentration measurement': 'outlet concentration measurement', + 'burner stabilized flame speciation measurement': 'burner stabilized flame speciation measurement', +} + +# Properties valid as scalar value+unit in dataGroups +SCALAR_DG_PROPS = { + 'temperature', 'pressure', 'ignition delay', 'pressure rise', + 'laminar burning velocity', 'distance', 'flow rate', + 'residence time', 'volumetric flow rate in reference state', + 'volume', 'time', 'environment temperature', + 'rate coefficient', 'branching ratio', +} + +# Properties valid as scalar value+unit in commonProperties +SCALAR_COMMON_PROPS = { + 'temperature', 'pressure', 'residence time', 'volume', + 'flow rate', 'reactor volume', 'pressure rise', + 'laminar burning velocity', 'environment temperature', + 'global heat exchange coefficient', 'exchange area', + 'reactor length', 'reactor diameter', + 'pressure in reference state', 'temperature in reference state', +} + + +# Compact inverse-unit notation used in ReSpecTh that pint cannot parse. +# e.g. "ms-1" is ambiguous (pint reads it as millisecond, dimensionless); +# map to unambiguous reciprocal forms. Mirrors converters.py's "Torr"→"torr". +_INV_UNIT_MAP = {'ms-1': '1/ms', 's-1': '1/s', 'cm-1': '1/cm', 'K-1': '1/K', + 'unitless': 'dimensionless'} + + +def _normalize_units(unit_str): + """Rewrite unit strings with implicit negative exponents to pint-compatible form. + + Converts e.g. 'kg m-2 s-1' → 'kg * m**-2 * s**-1' so that pint does not + misinterpret the '-' as arithmetic subtraction. + Also handles ReSpecTh underscore-separated units like 'cm3_mol-1_s-1'. + """ + import re as _re + # First apply the simple inverse map + unit_str = _INV_UNIT_MAP.get(unit_str, unit_str) + # Replace underscore separators with spaces (ReSpecTh k-file convention: cm3_mol-1_s-1) + # Only replace underscores that appear between unit token characters (not leading/trailing) + unit_str = _re.sub(r'(?<=\w)_(?=\w)', ' ', unit_str) + # Replace patterns like 'TOKEN-N' (letter/digit token followed by hyphen-digit) + # with 'TOKEN**-N', but only when the token is a known unit symbol (not a standalone '-'). + unit_str = _re.sub(r'([a-zA-Z]+)(-\d+)', r'\1**\2', unit_str) + # Replace spaces used as implicit multiplication with ' * ' + # (only between unit tokens, not touching '**') + unit_str = _re.sub(r'(?<=\w) +(?=\w)', ' * ', unit_str) + return unit_str + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def decode_latex(s): + """Decode LaTeX accent commands to Unicode characters. + + Handles patterns like {\\'{e}} → é, {\\"\\{u}} → ü, {\\`{e}} → è, etc. + Also strips remaining braces from BibTeX-style {name} groups. + """ + import re + # Mapping of (accent_command, base_letter) → Unicode character + _accent_map = { + ("'", 'a'): 'á', ("'", 'A'): 'Á', + ("'", 'e'): 'é', ("'", 'E'): 'É', + ("'", 'i'): 'í', ("'", 'I'): 'Í', + ("'", 'o'): 'ó', ("'", 'O'): 'Ó', + ("'", 'u'): 'ú', ("'", 'U'): 'Ú', + ('"', 'a'): 'ä', ('"', 'A'): 'Ä', + ('"', 'e'): 'ë', ('"', 'E'): 'Ë', + ('"', 'i'): 'ï', ('"', 'I'): 'Ï', + ('"', 'o'): 'ö', ('"', 'O'): 'Ö', + ('"', 'u'): 'ü', ('"', 'U'): 'Ü', + ('`', 'a'): 'à', ('`', 'A'): 'À', + ('`', 'e'): 'è', ('`', 'E'): 'È', + ('`', 'i'): 'ì', ('`', 'I'): 'Ì', + ('`', 'o'): 'ò', ('`', 'O'): 'Ò', + ('`', 'u'): 'ù', ('`', 'U'): 'Ù', + ('^', 'a'): 'â', ('^', 'A'): 'Â', + ('^', 'e'): 'ê', ('^', 'E'): 'Ê', + ('^', 'i'): 'î', ('^', 'I'): 'Î', + ('^', 'o'): 'ô', ('^', 'O'): 'Ô', + ('^', 'u'): 'û', ('^', 'U'): 'Û', + ('~', 'n'): 'ñ', ('~', 'N'): 'Ñ', + ('c', 'c'): 'ç', ('c', 'C'): 'Ç', + } + + def _replace_accent(m): + accent = m.group(1) + letter = m.group(2) + return _accent_map.get((accent, letter), letter) + + # Pattern: {\CMD{letter}} or {\\CMD{letter}} where CMD is one of ' " ` ^ ~ c + # Outer braces may or may not be present + s = re.sub(r"\{?\\(['\"`^~c])\{([A-Za-z])\}\}?", _replace_accent, s) + # Also handle \\' without inner braces: {\'A} or \'{A} + s = re.sub(r"\{?\\(['\"`^~c])([A-Za-z])\}?", _replace_accent, s) + # Handle LaTeX \# → # and \& → & + s = s.replace('\\#', '#').replace('\\&', '&') + # Handle \text{...} → contents + s = re.sub(r'\\text\{([^}]*)\}', r'\1', s) + # Handle \textquotesingle → ' + s = s.replace('\\textquotesingle', "'") + # Strip remaining BibTeX braces {word} → word + s = re.sub(r'\{([^{}]*)\}', r'\1', s) + # Clean up any double spaces + s = re.sub(r' +', ' ', s).strip() + return s + + +def parse_author_string(s): + """Parse author strings into [{'name': 'First Last'}, ...]. + + Handles two common ReSpecTh formats: + - 'Last, First and Last, First ...' (and-separated) + - 'Last, F., Last, F., ...' (comma-separated initials, no 'and') + """ + import re as _re + s = s.strip() + authors = [] + + # Detect comma-only format: 'Last, F., Last, F., ...' + # Heuristic: if ' and ' is absent but the string has repeated 'Word, X.,' pattern + if ' and ' not in s and _re.search(r'\w+,\s+\w+\.(?:,|$)', s): + # Split on ', ' followed by a word that is itself followed by ', ' or end + # Strategy: collect tokens by splitting on ', ' and pairing them up + tokens = [t.strip() for t in s.split(',')] + tokens = [t for t in tokens if t] + i = 0 + while i < len(tokens): + last = tokens[i] + # Next token is the initial/first name (may end with '.') + if i + 1 < len(tokens): + first = tokens[i + 1].strip() + name = f"{first} {last}" + i += 2 + else: + name = last + i += 1 + authors.append({'name': decode_latex(name)}) + return authors + + # Standard 'and'-separated format + for part in s.split(' and '): + part = part.strip() + if not part: + continue + if ',' in part: + pieces = part.split(',', 1) + name = f"{pieces[1].strip()} {pieces[0].strip()}" + else: + name = part + authors.append({'name': decode_latex(name)}) + return authors + + +def first_author_last_name(authors): + """Return first author's last name for directory naming.""" + if not authors: + return 'Unknown' + name = authors[0].get('name', 'Unknown') + parts = name.strip().split() + return parts[-1] if parts else 'Unknown' + + +def parse_species_link(elem): + """Extract species info dict from a element.""" + info = {} + pk = elem.attrib.get('preferredKey', '') + if pk: + info['species-name'] = pk + inchi = elem.attrib.get('InChI') + if inchi: + info['InChI'] = inchi + return info + + +def _clean_numeric(text): + """Clean numeric string: strip leading zeros to avoid YAML octal issues.""" + import re as _re + text = text.strip() + # Handle Fortran-style exponents without 'e': e.g. '5.93+005' → '5.93e+005' + text = _re.sub(r'^([+-]?\d+\.?\d*)([+-]\d+)$', r'\1e\2', text) + try: + val = float(text) + if val != val: # NaN + return text + # Integer-valued: format as integer string + if val == int(val) and '.' not in text and 'e' not in text.lower(): + return str(int(val)) + # Otherwise format cleanly (strips trailing zeros, avoids float noise) + return f'{val:.15g}' + except (ValueError, OverflowError): + return text + + +def normalize_comp_units(value_str, units): + """Normalise composition amount → (float, kind_string). + + Matches the existing PyKED converter convention: + - percent → mole percent (value unchanged) + - ppm → mole fraction (value × 1e-6) + - ppb → mole fraction (value × 1e-9) + - mole fraction / mass fraction / mole percent → unchanged + """ + val = float(value_str) + if units in ('mole fraction', 'mass fraction', 'mole percent'): + return val, units + elif units in ('percent',): + return val, 'mole percent' + elif units == 'ppm': + return float(f'{val * 1e-6:.12g}'), 'mole fraction' + elif units == 'ppb': + return float(f'{val * 1e-9:.12g}'), 'mole fraction' + elif units in ('mol/cm3', 'mol/m3', 'mol/L', 'mol/dm3'): + return val, units + else: + raise UnsupportedUnitsError( + f'Composition units {units!r} not supported. ' + 'Must be one of: mole fraction, mass fraction, mole percent, ' + 'percent, ppm, ppb, or mol/cm3.' + ) + + +def _reconcile_composition(entries): + """Pick a single kind for the composition block. + + *entries*: list of (spec_dict, value, kind) tuples. + Returns (target_kind, [(spec_dict, value)]). + After normalisation, all entries should share the same kind. + If mixed, the dominant kind is used and minority entries are converted. + """ + kinds = set(e[2] for e in entries) + if len(kinds) == 1: + k = kinds.pop() + return k, [(e[0], e[1]) for e in entries] + # Mixed units – pick dominant kind, convert minority entries + kind_counts = Counter(e[2] for e in entries) + dominant = kind_counts.most_common(1)[0][0] + log.warning(f'Mixed composition units {dict(kind_counts)}; converting all to {dominant!r}') + converted = [] + for spec, val, kind in entries: + if kind == dominant: + converted.append((spec, val)) + elif dominant == 'mole fraction' and kind == 'mole percent': + converted.append((spec, round(val / 100.0, 12))) + elif dominant == 'mole percent' and kind == 'mole fraction': + converted.append((spec, round(val * 100.0, 12))) + else: + # Fallback: convert both to mole fraction via ppm/ppb already handled upstream + converted.append((spec, val)) + return dominant, converted + + +def prop_name_to_key(name): + """Convert ReSpecTh property name to ChemKED YAML key.""" + key = name.replace(' ', '-') + special = { + 'volume': 'reactor-volume', + 'volumetric-flow-rate-in-reference-state': 'volumetric-flow-in-reference-state', + 'environment-temperature': 'environment-temperature', + 'global-heat-exchange-coefficient': 'global-heat-exchange-coefficient', + 'exchange-area': 'exchange-area', + 'reactor-length': 'reactor-length', + 'reactor-diameter': 'reactor-diameter', + 'pressure-in-reference-state': 'pressure-in-reference-state', + 'temperature-in-reference-state': 'temperature-in-reference-state', + } + return special.get(key, key) + + +# --------------------------------------------------------------------------- +# File metadata & reference +# --------------------------------------------------------------------------- + +def parse_file_metadata(root): + file_author = (root.findtext('fileAuthor') or '').strip() + props = { + 'file-authors': [{'name': file_author or 'Unknown'}], + 'file-version': 0, + 'chemked-version': CHEMKED_VERSION, + } + + # Note: file-doi, respecth-version, first-publication-date, last-modification-date + # are ReSpecTh-specific fields not recognised by the PyKED schema — omit them. + + return props + + +def parse_reference(root, xml_filename): + import re as _re + ref = {} + bib = root.find('bibliographyLink') + if bib is None: + ref['detail'] = f'Converted from ReSpecTh XML file {xml_filename}' + return ref + + doi_el = bib.find('referenceDOI') + if doi_el is not None and doi_el.text: + ref['doi'] = doi_el.text.strip() + + details = bib.find('details') + if details is not None: + auth = (details.findtext('author') or '').strip() + if auth: + ref['authors'] = parse_author_string(auth) + journal = (details.findtext('journal') or '').strip() + if journal: + ref['journal'] = decode_latex(journal) + year = (details.findtext('year') or '').strip() + if year: + ref['year'] = int(year) + vol = (details.findtext('volume') or '').strip() + if vol: + try: + # handles '32 I' → 32, '110–111' or '110-111' → 110 + m_vol = _re.search(r'\d+', vol) + ref['volume'] = int(m_vol.group()) if m_vol else int(vol.split()[0]) + except (ValueError, IndexError, AttributeError): + pass # omit non-parseable volume; CrossRef enrichment will set it + pages = (details.findtext('pages') or '').strip() + if pages: + # Normalise en-dash/double-hyphen page ranges to single hyphen (e.g. 239--245 → 239-245) + pages = _re.sub(r'-{2,}', '-', pages).replace('\u2013', '-') + ref['pages'] = pages + # Note: title, location, table, figure, number, publication-type are not + # recognised by the PyKED schema — omit them. + + # Fallback: use + if not ref.get('authors'): + desc = (bib.findtext('description') or '').strip() + if desc: + ref['detail'] = desc + + prefix = ref.get('detail', '') + ref['detail'] = (prefix + ' ' if prefix else '') + \ + f'Converted from ReSpecTh XML file {xml_filename}' + + # Enrich journal name and authors from CrossRef so the YAML matches + # what PyKED's CrossRef validation expects. + if ref.get('doi'): + try: + import habanero as _habanero + from requests.exceptions import ConnectionError as _ConnErr + _cr = _habanero.Crossref(mailto='prometheus@pr.omethe.us') + _msg = _cr.works(ids=ref['doi'])['message'] + # Canonical journal title + container = _msg.get('container-title') + if container: + import html as _html_mod + ref['journal'] = _html_mod.unescape(container[0]) + # Canonical author list: family + given → "Given Family" + cr_authors = _msg.get('author', []) + if cr_authors: + names = [] + for a in cr_authors: + given = a.get('given', '').strip() + family = a.get('family', '').strip() + if given and family: + names.append({'name': f'{given} {family}'}) + elif family: + names.append({'name': family}) + if names: + ref['authors'] = names + # Canonical year + pub = _msg.get('published-print') or _msg.get('published-online') or _msg.get('published') or _msg.get('issued') + if pub: + ref['year'] = pub['date-parts'][0][0] + # Canonical volume (integer) + cr_vol = _msg.get('volume') + if cr_vol is not None: + try: + # CrossRef may return combined volumes like "110-111"; use first number + m_cv = _re.search(r'\d+', str(cr_vol)) + ref['volume'] = int(m_cv.group()) if m_cv else int(cr_vol) + except (ValueError, TypeError, AttributeError): + pass + # Canonical pages (some journals use article-number instead of page) + cr_pages = _msg.get('page') or _msg.get('article-number') + if cr_pages: + ref['pages'] = _re.sub(r'-{2,}', '-', cr_pages).replace('\u2013', '-') + except Exception: + pass # network unavailable or DOI not in CrossRef — keep ReSpecTh values + + return ref + + +# --------------------------------------------------------------------------- +# Experiment kind & apparatus +# --------------------------------------------------------------------------- + +def parse_experiment_kind(root): + exp_text = (root.findtext('experimentType') or '').strip().lower() + exp_type = EXP_TYPE_MAP.get(exp_text) + if exp_type is None: + raise ValueError(f'Unknown experiment type: {root.findtext("experimentType")}') + + _default_apparatus_kind = { + 'ignition delay': 'shock tube', + 'laminar burning velocity measurement': 'outwardly propagating spherical flame', + 'concentration time profile measurement': 'flow reactor', + 'jet stirred reactor measurement': 'jet stirred reactor', + 'outlet concentration measurement': 'flow reactor', + 'burner stabilized flame speciation measurement': 'flame', + } + apparatus = {'kind': '', 'institution': '', 'facility': ''} + kind_el = root.find('apparatus/kind') + if kind_el is not None and kind_el.text: + apparatus['kind'] = kind_el.text.strip() + if not apparatus['kind'] and exp_type in _default_apparatus_kind: + apparatus['kind'] = _default_apparatus_kind[exp_type] + _mode_aliases = { + 'reflected': 'reflected shock', + 'incident': 'incident shock', + } + modes = root.findall('apparatus/mode') + if modes: + mode_list = [] + for m in modes: + if m.text: + raw = m.text.strip() + mode_list.append(_mode_aliases.get(raw, raw)) + if mode_list: + apparatus['mode'] = mode_list + + return exp_type, apparatus + + +# --------------------------------------------------------------------------- +# Common properties +# --------------------------------------------------------------------------- + +def parse_initial_composition(prop_elem): + entries = [] # [(spec_dict, value, kind)] + for component in prop_elem.findall('component'): + sl = component.find('speciesLink') + amount_el = component.find('amount') + if sl is None or amount_el is None: + continue + spec = parse_species_link(sl) + units = amount_el.attrib.get('units', 'mole fraction') + val, kind = normalize_comp_units(amount_el.text, units) + entries.append((spec, val, kind)) + comp = {'kind': None, 'species': []} + if not entries: + return comp + target_kind, resolved = _reconcile_composition(entries) + comp['kind'] = target_kind + for spec, val in resolved: + spec['amount'] = [val] + comp['species'].append(spec) + return comp + + +def _ref_to_property_key(reference, dg_defs=None): + """Map a ReSpecTh uncertainty reference string to a ChemKED property key. + + Returns None for composition/initial-composition references (per-species, + no scalar property to attach to). + """ + if reference in ('composition', 'initial composition'): + return None + alias_map = { + 'Sl': 'laminar-burning-velocity', + 'SL': 'laminar-burning-velocity', + 'Phi': 'equivalence-ratio', + } + if reference in alias_map: + return alias_map[reference] + # If reference looks like a dataGroup column id (e.g. 'x1'), resolve it + if dg_defs and reference in dg_defs: + return prop_name_to_key(dg_defs[reference]['name']) + # General case: space→hyphen + return prop_name_to_key(reference) + + +def _format_unc_value(value_str, units, kind='absolute'): + """Format an uncertainty value, stripping dimensionless ``[-]`` notation.""" + if units in ('[-]', '', 'unitless'): + return value_str + if kind == 'relative': + return value_str + return f'{value_str} {units}'.strip() + + +def _bound_key(bound): + """Map a ReSpecTh bound attribute to the PyKED uncertainty key name.""" + if bound == 'plus': + return 'upper-uncertainty' + elif bound == 'minus': + return 'lower-uncertainty' + return 'uncertainty' + + +def _build_inline_uncertainty(kind, bound, value_str, units, sourcetype=None): + """Build a PyKED inline uncertainty dict from ReSpecTh attributes.""" + unc_dict = {'uncertainty-type': kind} + unc_value = _format_unc_value(value_str, units, kind) + unc_dict[_bound_key(bound)] = unc_value + if sourcetype: + unc_dict['uncertainty-sourcetype'] = sourcetype + return unc_dict + + +def _merge_inline_uncertainty(existing, new): + """Merge two inline uncertainty dicts (e.g. separate plus + minus → one dict).""" + merged = dict(existing) + for key in ('uncertainty-type', 'uncertainty', 'upper-uncertainty', 'lower-uncertainty', + 'uncertainty-sourcetype'): + if key in new: + merged[key] = new[key] + return merged + + +def _build_inline_esd(kind, value_str, units, sourcetype=None, method=None): + """Build inline evaluated-standard-deviation fields for a property dict.""" + esd = {} + esd['evaluated-standard-deviation'] = _format_unc_value(value_str, units, kind) + if kind: + esd['evaluated-standard-deviation-type'] = kind + if sourcetype: + esd['evaluated-standard-deviation-sourcetype'] = sourcetype + if method: + esd['evaluated-standard-deviation-method'] = method + return esd + + +def _attach_metadata_to_property(dp, key, fields): + """Merge metadata fields into a property's inline dict on dp[key].""" + prop_val = dp.get(key) + if not isinstance(prop_val, list) or len(prop_val) < 1: + return False + if len(prop_val) >= 2 and isinstance(prop_val[1], dict): + prop_val[1].update(fields) + else: + dp[key] = [prop_val[0], dict(fields)] + return True + + +def _attach_comp_esd_inline(comp_block, species_name, kind, raw_value, units, + sourcetype=None, method=None): + """Attach inline ESD fields to a species amount dict in a composition block.""" + for spec in comp_block.get('species', []): + if spec.get('species-name') != species_name: + continue + amount = spec.get('amount') + if not isinstance(amount, list) or len(amount) < 1: + return False + if units in ('ppm', 'ppb', 'percent'): + esd_val, _ = normalize_comp_units(str(raw_value), units) + else: + esd_val = float(raw_value) + esd_fields = {'evaluated-standard-deviation': esd_val} + if kind: + esd_fields['evaluated-standard-deviation-type'] = kind + if sourcetype: + esd_fields['evaluated-standard-deviation-sourcetype'] = sourcetype + if method: + esd_fields['evaluated-standard-deviation-method'] = method + if len(amount) >= 2 and isinstance(amount[1], dict): + amount[1].update(esd_fields) + else: + spec['amount'] = [amount[0], esd_fields] + return True + return False + + +def _attach_comp_uncertainty_inline(comp_block, species_name, kind, bound, + raw_value, units, sourcetype=None): + """Attach inline uncertainty to a species amount in a composition block. + + Composition amounts use bare floats, so uncertainty values are also floats + (in the same implicit units as the composition ``kind``). + + Returns True if successfully attached, False if species not found. + """ + for spec in comp_block.get('species', []): + if spec.get('species-name') != species_name: + continue + amount = spec.get('amount') + if not (isinstance(amount, list) and len(amount) >= 1): + return False + + # Compute float uncertainty value + if kind == 'relative': + unc_val = float(raw_value) + else: # absolute + if units in ('ppm', 'ppb', 'percent'): + unc_val, _ = normalize_comp_units(str(raw_value), units) + else: + unc_val = float(raw_value) + + unc_dict = {'uncertainty-type': kind} + if bound in ('plusminus', ''): + unc_dict['uncertainty'] = unc_val + elif bound == 'plus': + unc_dict['upper-uncertainty'] = unc_val + elif bound == 'minus': + unc_dict['lower-uncertainty'] = unc_val + else: + unc_dict['uncertainty'] = unc_val + if sourcetype: + unc_dict['uncertainty-sourcetype'] = sourcetype + + if len(amount) == 1: + spec['amount'] = [amount[0], unc_dict] + elif len(amount) == 2 and isinstance(amount[1], dict): + spec['amount'] = [amount[0], _merge_inline_uncertainty(amount[1], unc_dict)] + return True + return False + + +def _parse_esd_common(prop_elem): + """Parse an evaluated-standard-deviation property from commonProperties. + + Returns a list of standalone entry dicts. + """ + attrs = prop_elem.attrib + reference = attrs.get('reference', '') + kind = attrs.get('kind', '') + units = attrs.get('units', '') + + base = {'reference': reference, 'kind': kind} + for attr in ('sourcetype', 'method'): + val = attrs.get(attr) + if val: + base[attr] = val + + entries = [] + if reference in ('composition', 'initial composition'): + species_links = prop_elem.findall('speciesLink') + values = prop_elem.findall('value') + for sl, val_el in zip(species_links, values): + entry = dict(base) + spec = parse_species_link(sl) + entry.update(spec) + if units in ('ppm', 'ppb', 'percent'): + conv_val, conv_units = normalize_comp_units(val_el.text.strip(), units) + entry['value'] = [f'{conv_val} {conv_units}'] + else: + entry['value'] = [_format_unc_value(_clean_numeric(val_el.text), units)] + entries.append(entry) + else: + val_el = prop_elem.find('value') + if val_el is not None: + entry = dict(base) + entry['value'] = [_format_unc_value(_clean_numeric(val_el.text), units)] + entries.append(entry) + return entries + + +def parse_common_properties(root, exp_type): + common = {} + pending_uncs = [] # uncertainty prop_elems to process in second pass + pending_esds = [] # evaluated-standard-deviation prop_elems + + # First pass: collect scalar properties, compositions + for prop_elem in root.findall('commonProperties/property'): + name = prop_elem.attrib.get('name', '') + + if name == 'initial composition': + comp = parse_initial_composition(prop_elem) + if comp and comp.get('species'): + import numpy as _np_cp + total = 100.0 if comp.get('kind') == 'mole percent' else 1.0 + comp_sum = sum(sp['amount'][0] for sp in comp['species'] if sp.get('amount')) + if not _np_cp.isclose(total, comp_sum, rtol=0.0, atol=total * 0.11): + # Partial CP composition (sum deviates >11% from expected total). + # Store for merging into per-dp compositions; don't use as standalone. + common['_partial_cp_composition'] = comp + else: + common['composition'] = comp + else: + common['composition'] = comp + elif name == 'equivalence ratio': + val_el = prop_elem.find('value') + if val_el is not None: + common['equivalence-ratio'] = [f'{_clean_numeric(val_el.text)} dimensionless'] + elif name in SCALAR_COMMON_PROPS: + val_el = prop_elem.find('value') + units = prop_elem.attrib.get('units', '') + units = _normalize_units(units) + if val_el is not None: + key = prop_name_to_key(name) + common[key] = [f'{_clean_numeric(val_el.text)} {units}'] + elif name == 'uncertainty': + pending_uncs.append(prop_elem) + elif name == 'evaluated standard deviation': + pending_esds.append(prop_elem) + + # Second pass: inline uncertainties + inline_uncs = {} # key → inline unc dict (for merging plus/minus pairs) + pending_unc_entries = [] # unresolvable species uncertainties + for prop_elem in pending_uncs: + attrs = prop_elem.attrib + reference = attrs.get('reference', '') + kind = attrs.get('kind', '') + units = attrs.get('units', '') + bound = attrs.get('bound', '') + sourcetype = attrs.get('sourcetype', '') + + target_key = _ref_to_property_key(reference) + if target_key is not None and target_key in common: + # Scalar-reference: convert to inline uncertainty on the property + val_el = prop_elem.find('value') + if val_el is not None: + unc_dict = _build_inline_uncertainty( + kind, bound, _clean_numeric(val_el.text), units, sourcetype + ) + if target_key in inline_uncs: + inline_uncs[target_key] = _merge_inline_uncertainty( + inline_uncs[target_key], unc_dict + ) + else: + inline_uncs[target_key] = unc_dict + elif reference in ('composition', 'initial composition') and 'composition' in common: + # Composition-reference: inline on species amount fields + species_links = prop_elem.findall('speciesLink') + values = prop_elem.findall('value') + for sl, val_el in zip(species_links, values): + spec = parse_species_link(sl) + species_name = spec.get('species-name', '') + raw_val = _clean_numeric(val_el.text) + if not _attach_comp_uncertainty_inline( + common['composition'], species_name, kind, bound, + raw_val, units, sourcetype + ): + # Species not in initial composition (e.g., measured species) + pending_unc_entries.append({ + 'reference': reference, 'kind': kind, + 'units': units, 'bound': bound, + 'sourcetype': sourcetype, + 'value': raw_val, + 'species-name': species_name, + }) + + # Attach inline uncertainties to their property fields + for key, unc_dict in inline_uncs.items(): + prop_val = common[key] + if isinstance(prop_val, list) and len(prop_val) >= 1: + common[key] = [prop_val[0], unc_dict] + + # Third pass: inline ESD + pending_esd_entries = [] # unresolvable entries for post-merge + for prop_elem in pending_esds: + attrs = prop_elem.attrib + reference = attrs.get('reference', '') + kind = attrs.get('kind', '') + units = attrs.get('units', '') + sourcetype = attrs.get('sourcetype', '') + method = attrs.get('method', '') + + target_key = _ref_to_property_key(reference) + if target_key is not None and target_key in common: + val_el = prop_elem.find('value') + if val_el is not None: + esd_fields = _build_inline_esd( + kind, _clean_numeric(val_el.text), units, sourcetype, method + ) + _attach_metadata_to_property(common, target_key, esd_fields) + elif reference in ('composition', 'initial composition') and 'composition' in common: + species_links = prop_elem.findall('speciesLink') + values = prop_elem.findall('value') + for sl, val_el in zip(species_links, values): + spec = parse_species_link(sl) + species_name = spec.get('species-name', '') + if not _attach_comp_esd_inline( + common['composition'], species_name, kind, + _clean_numeric(val_el.text), units, sourcetype, method + ): + # Species not in initial composition (e.g., measured species) + pending_esd_entries.append({ + 'reference': reference, 'kind': kind, + 'units': units, 'sourcetype': sourcetype, + 'method': method, + 'value': _clean_numeric(val_el.text), + 'species-name': species_name, + }) + else: + # Target property not in common (varies per datapoint) + if reference in ('composition', 'initial composition'): + # Composition ESDs that aren't in common yet — save for post-merge + species_links = prop_elem.findall('speciesLink') + values = prop_elem.findall('value') + for sl, val_el in zip(species_links, values): + spec = parse_species_link(sl) + pending_esd_entries.append({ + 'reference': reference, 'kind': kind, + 'units': units, 'sourcetype': sourcetype, + 'method': method, + 'value': _clean_numeric(val_el.text), + 'species-name': spec.get('species-name', ''), + }) + elif target_key is not None: + # Scalar ESD for a per-dp property — keep as metadata-only + # in common-properties (no value, just the ESD dict) + val_el = prop_elem.find('value') + if val_el is not None: + esd_fields = _build_inline_esd( + kind, _clean_numeric(val_el.text), units, sourcetype, method + ) + common[target_key] = [esd_fields] + else: + # Unknown reference — save for post-merge + val_el = prop_elem.find('value') + if val_el is not None: + pending_esd_entries.append({ + 'reference': reference, 'kind': kind, + 'units': units, 'sourcetype': sourcetype, + 'method': method, + 'value': _clean_numeric(val_el.text), + }) + + if pending_esd_entries: + common['_pending_esd'] = pending_esd_entries + + if pending_unc_entries: + common['_pending_unc'] = pending_unc_entries + + return common + + +def parse_ignition_type(root): + elem = root.find('ignitionType') + if elem is None: + return None + target = elem.attrib.get('target', '').rstrip(';').strip() + ig_type = elem.attrib.get('type', '') + target_map = {'OHEX': 'OHEX', 'CHEX': 'CHEX', 'P': 'pressure', 'T': 'temperature', + 'OH*': 'OH*', 'CH*': 'CH*', 'CO2*': 'CO2'} + target = target_map.get(target.upper(), target_map.get(target, target)) + # Map ReSpecTh ignition type names to PyKED schema values (mirrors converters.py) + ign_type_map = { + 'baseline max intercept from d/dt': 'd/dt max extrapolated', + 'baseline min intercept from d/dt': 'd/dt min extrapolated', + } + ig_type = ign_type_map.get(ig_type, ig_type) + result = {'target': target, 'type': ig_type} + # Capture amount for relative concentration (fraction of peak at which ignition is detected) + amount_str = elem.attrib.get('amount', '') + if amount_str: + try: + result['amount'] = float(amount_str) + except ValueError: + pass + return result + + +# --------------------------------------------------------------------------- +# DataGroup property definitions +# --------------------------------------------------------------------------- + +def parse_datagroup_props(data_group): + """Return {id: {name, units, species?, + uncertainty attrs}} for each .""" + defs = {} + for prop in data_group.findall('property'): + pid = prop.attrib['id'] + entry = { + 'name': prop.attrib['name'], + 'units': prop.attrib.get('units', ''), + } + sl = prop.find('speciesLink') + if sl is not None: + entry['species'] = parse_species_link(sl) + # Extra attributes for uncertainty / evaluated standard deviation + for attr in ('reference', 'kind', 'bound', 'method', 'sourcetype'): + val = prop.attrib.get(attr) + if val: + entry[attr] = val + defs[pid] = entry + return defs + + +# --------------------------------------------------------------------------- +# Composition builder from datapoint values +# --------------------------------------------------------------------------- + +def build_composition(prop_defs, dp_elem): + """Build a composition dict from composition columns in a datapoint.""" + entries = [] # [(spec_dict, value, kind)] + for val_el in dp_elem: + pid = val_el.tag + if pid not in prop_defs: + continue + pdef = prop_defs[pid] + if pdef['name'] != 'composition': + continue + spec = dict(pdef.get('species', {})) + val, kind = normalize_comp_units(val_el.text, pdef['units']) + if val < 0: + # -1.0 is a sentinel for "below detection limit"; skip these species + log.debug(f'Skipping species {spec.get("species-name", "?")} with negative ' + f'value {val} (below detection limit)') + continue + entries.append((spec, val, kind)) + if not entries: + return None + target_kind, resolved = _reconcile_composition(entries) + comp = {'kind': target_kind, 'species': []} + for spec, val in resolved: + spec['amount'] = [val] + comp['species'].append(spec) + return comp + + +def _add_balance_diluent(measured, initial_composition): + """Top up measured-composition to sum to 1.0 using the diluent from initial_composition. + + For JSR/flow-reactor experiments only a subset of species are measured. + The balance (typically N2 or Ar diluent) is inferred from the initial + composition and added so the mole fractions sum to 1.0 as required by + PyKED validation. + + Args: + measured (dict): composition dict built by build_composition(). + initial_composition (dict | None): common-properties composition dict. + + Returns: + dict: measured composition with balance species added if needed. + """ + if measured is None or initial_composition is None: + return measured + + kind = measured.get('kind', 'mole fraction') + total = 100.0 if kind == 'mole percent' else 1.0 + current_sum = sum(sp['amount'][0] for sp in measured['species']) + + import numpy as np + if np.isclose(total, current_sum): + return measured # already sums to 1.0 + + measured_names = {sp['species-name'] for sp in measured['species']} + + # Find the diluent: species in initial_composition not already measured, + # with the largest mole fraction (i.e. the main diluent, e.g. N2 or Ar). + init_kind = initial_composition.get('kind', 'mole fraction') + init_total = 100.0 if init_kind == 'mole percent' else 1.0 + candidates = [ + sp for sp in initial_composition.get('species', []) + if sp['species-name'] not in measured_names + ] + if not candidates: + return measured + + # Pick the dominant non-measured species + diluent_spec = max(candidates, key=lambda s: s['amount'][0]) + balance = total - current_sum + if balance <= 0: + return measured + + # Build a minimal species entry (copy identifiers, set inferred amount) + diluent_entry = {k: v for k, v in diluent_spec.items() if k != 'amount'} + diluent_entry['amount'] = [round(balance, 8)] + measured['species'].append(diluent_entry) + return measured + + +def build_initial_composition(prop_defs, dp_elem, partial_cp_composition=None): + """Build initial composition dict from 'initial composition' columns. + + If *partial_cp_composition* is given (a partial common-property composition + that didn't sum to 1.0), its species are merged into the per-datapoint + composition so the combined block sums correctly. + """ + entries = [] + dp_species_names = set() + for val_el in dp_elem: + pid = val_el.tag + if pid not in prop_defs: + continue + pdef = prop_defs[pid] + if pdef['name'] != 'initial composition': + continue + spec = dict(pdef.get('species', {})) + val, kind = normalize_comp_units(val_el.text, pdef['units']) + entries.append((spec, val, kind)) + dp_species_names.add(spec.get('species-name', '')) + if not entries: + return None + # Merge species from partial CP composition that aren't already in per-dp + if partial_cp_composition and partial_cp_composition.get('species'): + cp_kind = partial_cp_composition.get('kind', 'mole fraction') + for sp in partial_cp_composition['species']: + sname = sp.get('species-name', '') + if sname and sname not in dp_species_names: + spec_copy = {k: v for k, v in sp.items() if k != 'amount'} + val = sp['amount'][0] + entries.append((spec_copy, val, cp_kind)) + target_kind, resolved = _reconcile_composition(entries) + comp = {'kind': target_kind, 'species': []} + for spec, val in resolved: + spec['amount'] = [val] + comp['species'].append(spec) + return comp + + +def build_uncertainty_entries(dg_defs, dp_elem, dp=None): + """Build uncertainty and ESD entries from datapoint columns, inlining both. + + Uncertainty entries are inlined on the target property in dp[key]. + ESD entries are inlined directly on dp properties. + + Returns a list of standalone uncertainty entries that could not be inlined. + """ + standalone_unc = [] + inline_uncs = {} # target_key → inline unc dict + + for val_el in dp_elem: + pid = val_el.tag + if pid not in dg_defs: + continue + pdef = dg_defs[pid] + name = pdef['name'] + + if name not in ('uncertainty', 'evaluated standard deviation'): + continue + + ref = pdef.get('reference', '') + kind = pdef.get('kind', '') + units = pdef.get('units', '') + + if name == 'evaluated standard deviation': + # Inline ESD directly on the target property + sourcetype = pdef.get('sourcetype') + method = pdef.get('method') + target_key = _ref_to_property_key(ref, dg_defs) + if target_key is not None and dp is not None and target_key in dp: + esd_fields = _build_inline_esd( + kind, _clean_numeric(val_el.text), units, sourcetype, method + ) + _attach_metadata_to_property(dp, target_key, esd_fields) + elif ref in ('composition', 'initial composition') and dp is not None: + species_name = pdef.get('species', {}).get('species-name', '') + if species_name: + for comp_key in ('composition', 'measured-composition'): + comp_block = dp.get(comp_key) + if comp_block and _attach_comp_esd_inline( + comp_block, species_name, kind, + _clean_numeric(val_el.text), units, sourcetype, method + ): + break + continue + + # name == 'uncertainty' + target_key = _ref_to_property_key(ref, dg_defs) + sourcetype = pdef.get('sourcetype', '') + if target_key is not None and dp is not None and target_key in dp: + # Scalar reference: build inline uncertainty + bound = pdef.get('bound', '') + unc_dict = _build_inline_uncertainty( + kind, bound, _clean_numeric(val_el.text), units, sourcetype + ) + if target_key in inline_uncs: + inline_uncs[target_key] = _merge_inline_uncertainty( + inline_uncs[target_key], unc_dict + ) + else: + inline_uncs[target_key] = unc_dict + elif ref in ('composition', 'initial composition') and dp is not None: + # Composition reference: try to inline on species amount fields + species_name = pdef.get('species', {}).get('species-name', '') + bound = pdef.get('bound', '') + raw_val = _clean_numeric(val_el.text) + inlined = False + if species_name: + for comp_key in ('composition', 'measured-composition'): + comp_block = dp.get(comp_key) + if comp_block and _attach_comp_uncertainty_inline( + comp_block, species_name, kind, bound, raw_val, units, + sourcetype + ): + inlined = True + break + if not inlined: + log.debug(f'Could not inline composition uncertainty for {species_name}') + else: + log.debug(f'Could not inline uncertainty for reference={ref}') + + # Attach inline uncertainties to the datapoint property fields + if dp is not None: + for key, unc_dict in inline_uncs.items(): + prop_val = dp[key] + if isinstance(prop_val, list) and len(prop_val) >= 1: + dp[key] = [prop_val[0], unc_dict] + + return standalone_unc + + +# --------------------------------------------------------------------------- +# Per-experiment-type datapoint parsers +# --------------------------------------------------------------------------- + +def _scalar_value(val_text, units): + """Build a scalar value+unit list entry like ['700 K'].""" + units = _normalize_units(units) + return [f'{_clean_numeric(val_text)} {units}'] + + +def parse_idt_datapoints(root, dg, dg_defs, common): + """Ignition delay: pressure, temperature, ignition-delay per point. + Additional dataGroups may contain volume/pressure/temperature histories. + """ + datapoints = [] + for dp_el in dg.findall('dataPoint'): + dp = {} + comp = build_composition(dg_defs, dp_el) + if comp: + dp['composition'] = comp + for val_el in dp_el: + pid = val_el.tag + if pid not in dg_defs: + continue + pdef = dg_defs[pid] + name = pdef['name'] + if name in ('composition', 'uncertainty', 'evaluated standard deviation'): + continue + if name in SCALAR_DG_PROPS: + dp[prop_name_to_key(name)] = _scalar_value(val_el.text, pdef['units']) + unc = build_uncertainty_entries(dg_defs, dp_el, dp) + if unc: + dp['uncertainty'] = unc + datapoints.append(dp) + + # Handle additional dataGroups (volume/pressure/temperature time histories) + all_dgs = root.findall('dataGroup') + if len(all_dgs) > 1: + extra_dgs = all_dgs[1:] + # If number of extra dataGroups matches number of datapoints, assign 1:1 + # (RCM pattern: each condition has its own volume-time trace). + # Otherwise assign all histories to datapoints[0]. + if len(extra_dgs) == len(datapoints): + dp_targets = list(range(len(datapoints))) + else: + # Assign sequentially up to min(dgs, dps); skip extras (target=-1) + n = min(len(extra_dgs), len(datapoints)) + dp_targets = list(range(n)) + [-1] * (len(extra_dgs) - n) + + for idx_dg, extra_dg in enumerate(extra_dgs): + edefs = parse_datagroup_props(extra_dg) + time_tag = None + quant_info = [] # [(tag, type_name, units)] + for pid, pdef in edefs.items(): + if pdef['name'] == 'time': + time_tag = pid + elif pdef['name'] in ('volume', 'temperature', 'pressure'): + quant_info.append((pid, pdef['name'], pdef['units'])) + if time_tag is None or not quant_info: + continue + time_units = edefs[time_tag]['units'] + histories = [ + { + 'time': {'units': time_units, 'column': 0}, + 'quantity': {'units': qi[2], 'column': 1}, + 'type': qi[1], + 'values': [], + } + for qi in quant_info + ] + for dp_el in extra_dg.findall('dataPoint'): + t_val = None + q_vals = {} + for val_el in dp_el: + if val_el.tag == time_tag: + t_val = float(val_el.text) + else: + for qi in quant_info: + if val_el.tag == qi[0]: + q_vals[qi[1]] = float(val_el.text) + if t_val is not None: + for h in histories: + if h['type'] in q_vals: + h['values'].append(_FlowList([t_val, q_vals[h['type']]])) + target = dp_targets[idx_dg] + if histories[0]['values'] and target >= 0: + datapoints[target].setdefault('time-histories', []).extend(histories) + + return datapoints + + +def parse_lbv_datapoints(dg, dg_defs, common): + """Laminar burning velocity: composition, equivalence-ratio, LBV per point.""" + datapoints = [] + for dp_el in dg.findall('dataPoint'): + dp = {} + comp = build_composition(dg_defs, dp_el) + if comp: + dp['composition'] = comp + for val_el in dp_el: + pid = val_el.tag + if pid not in dg_defs: + continue + pdef = dg_defs[pid] + name = pdef['name'] + if name == 'composition': + continue + elif name == 'equivalence ratio': + dp['equivalence-ratio'] = [f'{_clean_numeric(val_el.text)} dimensionless'] + elif name in SCALAR_DG_PROPS: + dp[prop_name_to_key(name)] = _scalar_value(val_el.text, pdef['units']) + unc = build_uncertainty_entries(dg_defs, dp_el, dp) + if unc: + dp['uncertainty'] = unc + datapoints.append(dp) + return datapoints + + +def parse_jsr_datapoints(dg, dg_defs, common): + """JSR: temperature varies, composition is measured outlet concentration.""" + datapoints = [] + for dp_el in dg.findall('dataPoint'): + dp = {} + init_comp = build_initial_composition(dg_defs, dp_el, common.get('_partial_cp_composition')) + if init_comp: + dp['composition'] = init_comp + measured = build_composition(dg_defs, dp_el) + if measured: + ref_comp = (init_comp + or common.get('composition') + or common.get('_partial_cp_composition')) + measured = _add_balance_diluent(measured, ref_comp) + dp['measured-composition'] = measured + for val_el in dp_el: + pid = val_el.tag + if pid not in dg_defs: + continue + pdef = dg_defs[pid] + name = pdef['name'] + if name in ('composition', 'initial composition', + 'uncertainty', 'evaluated standard deviation'): + continue + elif name in SCALAR_DG_PROPS: + dp[prop_name_to_key(name)] = _scalar_value(val_el.text, pdef['units']) + unc = build_uncertainty_entries(dg_defs, dp_el, dp) + if unc: + dp['uncertainty'] = unc + datapoints.append(dp) + return datapoints + + +def parse_ctpm_datapoints(dg, dg_defs, common): + """Concentration time profile: tabular (time, species...) → single datapoint + with concentration-profiles list. + """ + time_id = None + species_cols = [] # [(id, species_info, units)] + for pid, pdef in dg_defs.items(): + if pdef['name'] == 'time': + time_id = pid + elif pdef['name'] in ('composition', 'concentration') and 'species' in pdef: + species_cols.append((pid, pdef['species'], pdef['units'])) + + if time_id is None or not species_cols: + return [] + + time_units = dg_defs[time_id]['units'] + + # Collect all rows + rows = [] + for dp_el in dg.findall('dataPoint'): + row = {} + for val_el in dp_el: + row[val_el.tag] = val_el.text + rows.append(row) + + # Build concentration profiles per species + profiles = [] + for sid, spec_info, units in species_cols: + profile = {'species-name': spec_info.get('species-name', '')} + if 'InChI' in spec_info: + profile['InChI'] = spec_info['InChI'] + + # Determine if we need to convert ppm/ppb/percent → mole fraction + needs_conv = units in ('ppm', 'ppb', 'percent') + if needs_conv: + _, conv_units = normalize_comp_units('1', units) + else: + conv_units = units + + profile['quantity'] = {'units': conv_units} + profile['time'] = {'units': time_units} + profile['values'] = [] + for row in rows: + t_val = float(row.get(time_id, 0)) + c_raw = float(row.get(sid, 0)) + if needs_conv: + c_val, _ = normalize_comp_units(str(c_raw), units) + else: + c_val = c_raw + profile['values'].append(_FlowList([t_val, c_val])) + profiles.append(profile) + + return [{'concentration-profiles': profiles}] + + +def parse_ocm_datapoints(dg, dg_defs, common): + """Outlet concentration: temperature & flow rate vary, measured compositions.""" + datapoints = [] + for dp_el in dg.findall('dataPoint'): + dp = {} + init_comp = build_initial_composition(dg_defs, dp_el, common.get('_partial_cp_composition')) + if init_comp: + dp['composition'] = init_comp + measured = build_composition(dg_defs, dp_el) + if measured: + ref_comp = (init_comp + or common.get('composition') + or common.get('_partial_cp_composition')) + measured = _add_balance_diluent(measured, ref_comp) + dp['measured-composition'] = measured + for val_el in dp_el: + pid = val_el.tag + if pid not in dg_defs: + continue + pdef = dg_defs[pid] + name = pdef['name'] + if name in ('composition', 'initial composition', + 'uncertainty', 'evaluated standard deviation'): + continue + elif name == 'equivalence ratio': + dp['equivalence-ratio'] = [f'{_clean_numeric(val_el.text)} dimensionless'] + elif name in SCALAR_DG_PROPS: + dp[prop_name_to_key(name)] = _scalar_value(val_el.text, pdef['units']) + unc = build_uncertainty_entries(dg_defs, dp_el, dp) + if unc: + dp['uncertainty'] = unc + datapoints.append(dp) + return datapoints + + +def parse_bsfsm_datapoints(dg, dg_defs, common): + """Burner stabilised flame speciation: distance varies, measured compositions.""" + datapoints = [] + for dp_el in dg.findall('dataPoint'): + dp = {} + measured = build_composition(dg_defs, dp_el) + if measured: + ref_comp = common.get('composition') + measured = _add_balance_diluent(measured, ref_comp) + dp['measured-composition'] = measured + for val_el in dp_el: + pid = val_el.tag + if pid not in dg_defs: + continue + pdef = dg_defs[pid] + name = pdef['name'] + if name in ('composition', 'uncertainty', 'evaluated standard deviation'): + continue + elif name in SCALAR_DG_PROPS: + dp[prop_name_to_key(name)] = _scalar_value(val_el.text, pdef['units']) + unc = build_uncertainty_entries(dg_defs, dp_el, dp) + if unc: + dp['uncertainty'] = unc + datapoints.append(dp) + return datapoints + + +# --------------------------------------------------------------------------- +# Reaction parsing (kdetermination files) +# --------------------------------------------------------------------------- + +def parse_reactions(root): + """Parse elements → list of reaction dicts.""" + reactions = [] + for rxn in root.findall('reaction'): + entry = { + 'preferred-key': rxn.attrib.get('preferredKey', ''), + } + order = rxn.attrib.get('order') + if order: + try: + entry['order'] = int(order) + except ValueError: + entry['order'] = order + bulk_gas = rxn.attrib.get('bulkgas') + if bulk_gas: + entry['bulk-gas'] = bulk_gas + + reactants = [] + for i in range(1, 10): + r = rxn.findtext(f'reactant{i}') + if r: + reactants.append(r.strip()) + else: + break + if reactants: + entry['reactants'] = reactants + + products = [] + for i in range(1, 10): + p = rxn.findtext(f'product{i}') + if p: + products.append(p.strip()) + else: + break + if products: + entry['products'] = products + + reactions.append(entry) + return reactions + + +# --------------------------------------------------------------------------- +# kdetermination datapoint parser +# --------------------------------------------------------------------------- + +def parse_kdet_datapoints(dg, dg_defs, common): + """Rate coefficient / branching ratio: temperature, rate-coefficient/branching-ratio, + optional pressure per point.""" + datapoints = [] + for dp_el in dg.findall('dataPoint'): + dp = {} + for val_el in dp_el: + pid = val_el.tag + if pid not in dg_defs: + continue + pdef = dg_defs[pid] + name = pdef['name'] + if name in ('uncertainty', 'evaluated standard deviation'): + continue + if name in SCALAR_DG_PROPS: + dp[prop_name_to_key(name)] = _scalar_value(val_el.text, pdef['units']) + unc = build_uncertainty_entries(dg_defs, dp_el, dp) + if unc: + dp['uncertainty'] = unc + datapoints.append(dp) + return datapoints + + +# --------------------------------------------------------------------------- +# tdetermination datapoint parser +# --------------------------------------------------------------------------- + +def parse_tdet_datapoints(dg, dg_defs, common): + """Thermochemical data: temperature and thermodynamic properties per point.""" + datapoints = [] + for dp_el in dg.findall('dataPoint'): + dp = {} + for val_el in dp_el: + pid = val_el.tag + if pid not in dg_defs: + continue + pdef = dg_defs[pid] + name = pdef['name'] + if name in ('uncertainty', 'evaluated standard deviation'): + continue + if name in SCALAR_DG_PROPS: + dp[prop_name_to_key(name)] = _scalar_value(val_el.text, pdef['units']) + unc = build_uncertainty_entries(dg_defs, dp_el, dp) + if unc: + dp['uncertainty'] = unc + datapoints.append(dp) + return datapoints + + +# --------------------------------------------------------------------------- +# Main conversion +# --------------------------------------------------------------------------- + +PARSERS = { + 'ignition delay': 'idt', + 'laminar burning velocity measurement': 'lbv', + 'jet stirred reactor measurement': 'jsr', + 'concentration time profile measurement': 'ctpm', + 'outlet concentration measurement': 'ocm', + 'burner stabilized flame speciation measurement': 'bsfsm', +} + + +def convert_file(xml_path, original_filename=None): + """Convert a single ReSpecTh XML file → ChemKED property dict (or None). + + Supports , , and root elements. + + Parameters + ---------- + xml_path : str + Path to the XML file on disk. + original_filename : str, optional + The original filename to record in the ``reference.detail`` field. + Defaults to ``os.path.basename(xml_path)``. + """ + tree = ET.parse(xml_path) + root = tree.getroot() + + if root.tag == 'experiment': + try: + return _convert_file_inner(root, xml_path, original_filename) + except UnsupportedUnitsError as e: + log.info(f'Skipping {os.path.basename(xml_path)}: {e}') + return None + elif root.tag == 'kdetermination': + return _convert_kdetermination(root, xml_path, original_filename) + elif root.tag == 'tdetermination': + return _convert_tdetermination(root, xml_path, original_filename) + else: + return None + + +def _convert_file_inner(root, xml_path, original_filename=None): + + xml_filename = original_filename or os.path.basename(xml_path) + + props = parse_file_metadata(root) + props['reference'] = parse_reference(root, xml_filename) + props['file-type'] = 'experiment' + + exp_type, apparatus = parse_experiment_kind(root) + props['experiment-type'] = exp_type + props['apparatus'] = apparatus + + # Method and comments + method = (root.findtext('method') or '').strip() + if method: + props['method'] = method + + comments = [] + for c_el in root.findall('comment'): + if c_el.text and c_el.text.strip(): + comments.append(c_el.text.strip()) + if comments: + props['comments'] = comments + + common = parse_common_properties(root, exp_type) + props['common-properties'] = common + + if exp_type == 'ignition delay': + ign_type = parse_ignition_type(root) + if ign_type: + common['ignition-type'] = ign_type + + # Parse main dataGroup + all_dgs = root.findall('dataGroup') + if not all_dgs: + raise ValueError('No dataGroup found') + + dg = all_dgs[0] + dg_defs = parse_datagroup_props(dg) + + kind = PARSERS[exp_type] + if kind == 'idt': + props['datapoints'] = parse_idt_datapoints(root, dg, dg_defs, common) + elif kind == 'lbv': + props['datapoints'] = parse_lbv_datapoints(dg, dg_defs, common) + elif kind == 'jsr': + props['datapoints'] = parse_jsr_datapoints(dg, dg_defs, common) + elif kind == 'ctpm': + props['datapoints'] = parse_ctpm_datapoints(dg, dg_defs, common) + elif kind == 'ocm': + props['datapoints'] = parse_ocm_datapoints(dg, dg_defs, common) + elif kind == 'bsfsm': + props['datapoints'] = parse_bsfsm_datapoints(dg, dg_defs, common) + + if not props.get('datapoints'): + raise ValueError('No datapoints parsed') + + # Apply common properties to each datapoint (matches existing PyKED convention) + for dp in props['datapoints']: + for key, val in common.items(): + if key not in dp: + dp[key] = val + + # Post-merge: inline any remaining standalone scalar uncertainties + _UNC_KEYS = ('uncertainty', 'upper-uncertainty', 'lower-uncertainty') + + def _extract_unc_from_entry(entry): + """Extract (bound_key, value_str, units) from a standalone entry.""" + for bk in _UNC_KEYS: + if bk in entry: + raw = entry[bk] + val_str = raw[0] if isinstance(raw, list) else str(raw) + parts = val_str.split(' ', 1) + return bk, parts[0], (parts[1] if len(parts) > 1 else '') + return None, '', '' + + for dp in props['datapoints']: + # Inline remaining standalone uncertainty entries + for entry in dp.pop('uncertainty', []): + ref = entry.get('reference', '') + target_key = _ref_to_property_key(ref) + sourcetype = entry.get('sourcetype', '') + if target_key and target_key in dp: + unc_kind = entry.get('kind', '') + bound_key, val_str, unc_units = _extract_unc_from_entry(entry) + if bound_key is None: + continue + unc_dict = {'uncertainty-type': unc_kind} + unc_dict[bound_key] = _format_unc_value(val_str, unc_units, unc_kind) + if sourcetype: + unc_dict['uncertainty-sourcetype'] = sourcetype + prop_val = dp[target_key] + if isinstance(prop_val, list) and len(prop_val) >= 1: + if len(prop_val) == 2 and isinstance(prop_val[1], dict): + dp[target_key] = [prop_val[0], _merge_inline_uncertainty(prop_val[1], unc_dict)] + else: + dp[target_key] = [prop_val[0], unc_dict] + elif ref in ('composition', 'initial composition'): + species_name = entry.get('species-name', '') + unc_kind = entry.get('kind', '') + bound_key, val_str, unc_units = _extract_unc_from_entry(entry) + bound = {'upper-uncertainty': 'plus', + 'lower-uncertainty': 'minus'}.get(bound_key, 'plusminus') + if species_name and bound_key: + for comp_key in ('composition', 'measured-composition'): + comp_block = dp.get(comp_key) + if comp_block and _attach_comp_uncertainty_inline( + comp_block, species_name, unc_kind, bound, + val_str, unc_units, sourcetype + ): + break + + # Inline pending ESD from common properties + for esd_entry in dp.pop('_pending_esd', []): + reference = esd_entry['reference'] + target_key = _ref_to_property_key(reference) + if target_key and target_key in dp: + esd_fields = _build_inline_esd( + esd_entry['kind'], esd_entry['value'], esd_entry['units'], + esd_entry.get('sourcetype'), esd_entry.get('method') + ) + _attach_metadata_to_property(dp, target_key, esd_fields) + elif reference in ('composition', 'initial composition'): + species_name = esd_entry.get('species-name', '') + if species_name: + for comp_key in ('composition', 'measured-composition'): + comp_block = dp.get(comp_key) + if comp_block and _attach_comp_esd_inline( + comp_block, species_name, + esd_entry['kind'], esd_entry['value'], + esd_entry['units'], + esd_entry.get('sourcetype'), esd_entry.get('method') + ): + break + + # Inline pending uncertainties from common properties (measured species) + for unc_entry in dp.pop('_pending_unc', []): + ref = unc_entry.get('reference', '') + if ref in ('composition', 'initial composition'): + species_name = unc_entry.get('species-name', '') + unc_kind = unc_entry.get('kind', '') + bound = unc_entry.get('bound', 'plusminus') + raw_val = unc_entry.get('value', '') + unc_units = unc_entry.get('units', '') + sourcetype = unc_entry.get('sourcetype', '') + if species_name: + for comp_key in ('composition', 'measured-composition'): + comp_block = dp.get(comp_key) + if comp_block and _attach_comp_uncertainty_inline( + comp_block, species_name, unc_kind, bound, + raw_val, unc_units, sourcetype + ): + break + + # Clean up common properties — remove temporary keys + common.pop('uncertainty', None) + common.pop('evaluated-standard-deviation', None) + common.pop('_pending_esd', None) + common.pop('_pending_unc', None) + common.pop('_partial_cp_composition', None) + + return props + + +# --------------------------------------------------------------------------- +# kdetermination conversion +# --------------------------------------------------------------------------- + +def _convert_kdetermination(root, xml_path, original_filename=None): + """Convert a XML file to a ChemKED-style property dict.""" + xml_filename = original_filename or os.path.basename(xml_path) + + props = parse_file_metadata(root) + props['reference'] = parse_reference(root, xml_filename) + props['file-type'] = 'kdetermination' + props['experiment-type'] = 'rate coefficient' + + # Parse reactions — schema expects 'reaction' (string) and 'bulk-gas' (string) + reactions = parse_reactions(root) + if reactions: + primary = reactions[0] + if primary.get('preferred-key'): + props['reaction'] = primary['preferred-key'] + if primary.get('bulk-gas'): + props['bulk-gas'] = primary['bulk-gas'] + + # Method and apparatus + method = (root.findtext('method') or '').strip() + if method: + props['method'] = method + # Map method text to apparatus kind + _method_to_apparatus = { + 'shock tube': 'shock tube', + 'shock wave': 'shock tube', + 'flow tube': 'flow reactor', + 'flow reactor': 'flow reactor', + 'static reactor': 'flow reactor', + 'stirred reactor': 'stirred reactor', + 'flame': 'flame', + } + apparatus_kind = _method_to_apparatus.get(method.lower(), 'shock tube') + props['apparatus'] = {'kind': apparatus_kind} + + comments = [] + for c_el in root.findall('comment'): + if c_el.text and c_el.text.strip(): + comments.append(c_el.text.strip()) + if comments: + props['comments'] = comments + + # Common properties (parsed the same way as experiments) + common = parse_common_properties(root, 'rate coefficient') + props['common-properties'] = common + + # Parse dataGroup + all_dgs = root.findall('dataGroup') + if not all_dgs: + raise ValueError('No dataGroup found') + + dg = all_dgs[0] + dg_defs = parse_datagroup_props(dg) + + props['datapoints'] = parse_kdet_datapoints(dg, dg_defs, common) + + if not props.get('datapoints'): + raise ValueError('No datapoints parsed') + + # Apply common properties to each datapoint + for dp in props['datapoints']: + for key, val in common.items(): + if key not in dp: + dp[key] = val + + # Post-merge inline remaining uncertainties (same as experiment) + _UNC_KEYS = ('uncertainty', 'upper-uncertainty', 'lower-uncertainty') + + def _extract_unc_from_entry(entry): + for bk in _UNC_KEYS: + if bk in entry: + raw = entry[bk] + val_str = raw[0] if isinstance(raw, list) else str(raw) + parts = val_str.split(' ', 1) + return bk, parts[0], (parts[1] if len(parts) > 1 else '') + return None, '', '' + + for dp in props['datapoints']: + for entry in dp.pop('uncertainty', []): + ref = entry.get('reference', '') + target_key = _ref_to_property_key(ref) + sourcetype = entry.get('sourcetype', '') + if target_key and target_key in dp: + unc_kind = entry.get('kind', '') + bound_key, val_str, unc_units = _extract_unc_from_entry(entry) + if bound_key is None: + continue + unc_dict = {'uncertainty-type': unc_kind} + unc_dict[bound_key] = _format_unc_value(val_str, unc_units, unc_kind) + if sourcetype: + unc_dict['uncertainty-sourcetype'] = sourcetype + prop_val = dp[target_key] + if isinstance(prop_val, list) and len(prop_val) >= 1: + if len(prop_val) == 2 and isinstance(prop_val[1], dict): + dp[target_key] = [prop_val[0], _merge_inline_uncertainty(prop_val[1], unc_dict)] + else: + dp[target_key] = [prop_val[0], unc_dict] + + for esd_entry in dp.pop('_pending_esd', []): + reference = esd_entry['reference'] + target_key = _ref_to_property_key(reference) + if target_key and target_key in dp: + esd_fields = _build_inline_esd( + esd_entry['kind'], esd_entry['value'], esd_entry['units'], + esd_entry.get('sourcetype'), esd_entry.get('method') + ) + _attach_metadata_to_property(dp, target_key, esd_fields) + + common.pop('uncertainty', None) + common.pop('evaluated-standard-deviation', None) + common.pop('_pending_esd', None) + common.pop('_pending_unc', None) + common.pop('_partial_cp_composition', None) + + return props + + +# --------------------------------------------------------------------------- +# tdetermination conversion +# --------------------------------------------------------------------------- + +def _convert_tdetermination(root, xml_path, original_filename=None): + """Convert a XML file to a ChemKED-style property dict.""" + xml_filename = original_filename or os.path.basename(xml_path) + + props = parse_file_metadata(root) + props['reference'] = parse_reference(root, xml_filename) + props['file-type'] = 'tdetermination' + props['experiment-type'] = 'thermochemical' + + # Parse reactions (tdetermination may have species/reaction info) + reactions = parse_reactions(root) + if reactions: + primary = reactions[0] + if primary.get('preferred-key'): + props['reaction'] = primary['preferred-key'] + if primary.get('bulk-gas'): + props['bulk-gas'] = primary['bulk-gas'] + + method = (root.findtext('method') or '').strip() + if method: + props['method'] = method + + comments = [] + for c_el in root.findall('comment'): + if c_el.text and c_el.text.strip(): + comments.append(c_el.text.strip()) + if comments: + props['comments'] = comments + + common = parse_common_properties(root, 'thermochemical') + props['common-properties'] = common + + all_dgs = root.findall('dataGroup') + if not all_dgs: + raise ValueError('No dataGroup found') + + dg = all_dgs[0] + dg_defs = parse_datagroup_props(dg) + + props['datapoints'] = parse_tdet_datapoints(dg, dg_defs, common) + + if not props.get('datapoints'): + raise ValueError('No datapoints parsed') + + for dp in props['datapoints']: + for key, val in common.items(): + if key not in dp: + dp[key] = val + + common.pop('uncertainty', None) + common.pop('evaluated-standard-deviation', None) + common.pop('_pending_esd', None) + common.pop('_pending_unc', None) + common.pop('_partial_cp_composition', None) + + return props + + +# --------------------------------------------------------------------------- +# Output path logic +# --------------------------------------------------------------------------- + +def get_output_path(xml_path, input_dir, output_dir, reference): + """Determine output YAML path: output_dir/fuel/Author_Year/filename.yaml""" + rel = os.path.relpath(xml_path, input_dir) + parts = Path(rel).parts + + fuel = parts[0] if len(parts) > 1 else 'unknown' + + authors = reference.get('authors', []) + year = reference.get('year', 'unknown') + last_name = first_author_last_name(authors) + ref_dir = f'{last_name}_{year}' + + yaml_name = Path(parts[-1]).stem + '.yaml' + return os.path.join(output_dir, fuel, ref_dir, yaml_name) + + +# --------------------------------------------------------------------------- +# Batch conversion +# --------------------------------------------------------------------------- + +def batch_convert(input_dir, output_dir, dry_run=False): + stats = {'total': 0, 'success': 0, 'skipped': 0, 'errors': 0, 'validation_errors': 0} + errors_log = [] + validation_errors_log = [] + type_counts = {} + + xml_files = sorted(Path(input_dir).rglob('*.xml')) + stats['total'] = len(xml_files) + log.info(f'Found {len(xml_files)} XML files in {input_dir}') + + for xml_path in xml_files: + xml_str = str(xml_path) + try: + result = convert_file(xml_str) + if result is None: + stats['skipped'] += 1 + continue + + exp_type = result['experiment-type'] + type_counts[exp_type] = type_counts.get(exp_type, 0) + 1 + + out_path = get_output_path(xml_str, input_dir, output_dir, + result['reference']) + + if dry_run: + log.debug(f' Would write: {out_path}') + stats['success'] += 1 + else: + os.makedirs(os.path.dirname(out_path), exist_ok=True) + result.pop('file-type', None) + with open(out_path, 'w') as f: + yaml_dump(result, f) + + # Post-write PyKED validation + if _ChemKED is not None: + try: + _ChemKED(yaml_file=out_path) + stats['success'] += 1 + except Exception as ve: + stats['validation_errors'] += 1 + validation_errors_log.append((xml_str, str(ve))) + log.warning(f'Validation error in {xml_path.name}: {ve}') + else: + stats['success'] += 1 + + except Exception as e: + stats['errors'] += 1 + errors_log.append((xml_str, str(e))) + log.warning(f'Error converting {xml_path.name}: {e}') + + # Summary + log.info('') + log.info('=== Conversion Summary ===') + log.info(f'Total files: {stats["total"]}') + log.info(f'Converted: {stats["success"]}') + log.info(f'Skipped: {stats["skipped"]}') + log.info(f'Conversion errors: {stats["errors"]}') + log.info(f'Validation errors: {stats["validation_errors"]}') + log.info('') + log.info('By experiment type:') + for t, c in sorted(type_counts.items()): + log.info(f' {t}: {c}') + + if errors_log: + log.info('') + log.info('First 20 conversion errors:') + for path, err in errors_log[:20]: + log.info(f' {os.path.basename(path)}: {err}') + + if validation_errors_log: + log.info('') + log.info('First 20 validation errors:') + for path, err in validation_errors_log[:20]: + log.info(f' {os.path.basename(path)}: {err}') + + return stats, errors_log, validation_errors_log + + +def convert_single(xml_path, output_path=None): + """Convert a single file and optionally write output.""" + result = convert_file(xml_path) + if result is None: + log.info(f'Skipped (unsupported root element): {xml_path}') + return + + if output_path is None: + output_path = Path(xml_path).stem + '.yaml' + + file_type = result.pop('file-type', 'experiment') + with open(output_path, 'w') as f: + yaml_dump(result, f) + log.info(f'Converted ({file_type}): {xml_path} → {output_path}') + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + +def main(): + parser = argparse.ArgumentParser( + description='Batch convert ReSpecTh v2.3/v2.4 XML files to ChemKED YAML' + ) + parser.add_argument('--input-dir', '-i', default='ReSpecTh/indirect', + help='Input directory with ReSpecTh XML files ' + '(default: ReSpecTh/indirect)') + parser.add_argument('--output-dir', '-o', default='ChemKED-database', + help='Output directory for ChemKED YAML files ' + '(default: ChemKED-database)') + parser.add_argument('--file', '-f', default=None, + help='Convert a single XML file instead of batch') + parser.add_argument('--output-file', default=None, + help='Output path for single-file mode') + parser.add_argument('--dry-run', '-n', action='store_true', + help='Parse but do not write files') + parser.add_argument('--verbose', '-v', action='store_true', + help='Verbose output') + + args = parser.parse_args() + + if args.verbose: + logging.getLogger().setLevel(logging.DEBUG) + + if args.file: + convert_single(args.file, args.output_file) + else: + batch_convert(args.input_dir, args.output_dir, dry_run=args.dry_run) + + +if __name__ == '__main__': + main() diff --git a/pyked/chemked.py b/pyked/chemked.py index fdd147e..7780823 100644 --- a/pyked/chemked.py +++ b/pyked/chemked.py @@ -2,6 +2,8 @@ Main ChemKED module """ # Standard libraries +import re +from decimal import Decimal, InvalidOperation from os.path import exists from collections import namedtuple from warnings import warn @@ -15,6 +17,7 @@ # Local imports from .validation import schema, OurValidator, yaml, Q_ from .converters import datagroup_properties, ReSpecTh_to_ChemKED +from pint import DimensionalityError VolumeHistory = namedtuple('VolumeHistory', ['time', 'volume']) VolumeHistory.__doc__ = 'Time history of the volume in an RCM experiment. Deprecated, to be removed after PyKED 0.4' # noqa: E501 @@ -176,14 +179,30 @@ def validate_yaml(self, properties): `ValueError`: If the YAML file cannot be validated, a `ValueError` is raised whose string contains the errors that are present. """ - validator = OurValidator(schema) + from cerberus.schema import UnvalidatedSchema + + # Normalize equivalence-ratio: wrap scalar values in a list + # to match the schema expectation (type: list) + for dp in properties.get('datapoints', []): + if 'equivalence-ratio' in dp and not isinstance(dp['equivalence-ratio'], list): + dp['equivalence-ratio'] = [dp['equivalence-ratio']] + + # Use UnvalidatedSchema to bypass cerberus 1.3's schema-of-schema + # validation, which fails because its internal SchemaValidator doesn't + # inherit OurValidator's custom _validate_isvalid_* rules. + validator = OurValidator() + validator._schema = UnvalidatedSchema(schema) if not validator.validate(properties): - for key, value in validator.errors.items(): - if any(['unallowed value' in v for v in value]): - print(('{key} has an illegal value. Allowed values are {values} and are case ' - 'sensitive.').format(key=key, values=schema[key]['allowed'])) + errors = validator.errors - raise ValueError(validator.errors) + for key, value in errors.items(): + vals = value if isinstance(value, list) else [value] + if any('unallowed value' in str(v) for v in vals): + if key in schema and 'allowed' in schema[key]: + print(('{key} has an illegal value. Allowed values are {values} and are case ' + 'sensitive.').format(key=key, values=schema[key]['allowed'])) + + raise ValueError(errors) def get_dataframe(self, output_columns=None): """Get a Pandas DataFrame of the datapoints in this instance. @@ -450,9 +469,10 @@ def convert_to_ReSpecTh(self, filename): for prop_name in datagroup_properties: attribute = prop_name.replace(' ', '_') # This can't be hasattr because properties are set to the value None - # if no value is specified in the file, so the attribute always exists + # if no value is specified in the file, so the attribute always exists. + # Use default None for attributes not defined on DataPoint. prop_indices = [i for i, dp in enumerate(self.datapoints) - if getattr(dp, attribute) is not None + if getattr(dp, attribute, None) is not None ] if prop_name in common or not prop_indices: continue @@ -496,8 +516,11 @@ def convert_to_ReSpecTh(self, filename): for idx, val in property_idx.items(): # handle regular properties a bit differently than composition if val['name'] in datagroup_properties: + quantity = getattr(dp, val['name'].replace(' ', '_'), None) + if quantity is None: + continue value = etree.SubElement(datapoint, idx) - quantity = getattr(dp, val['name'].replace(' ', '_')).to(val['units']) + quantity = quantity.to(val['units']) value.text = str(quantity.magnitude) else: # composition @@ -567,6 +590,8 @@ def convert_to_ReSpecTh(self, filename): ignition.set('target', self.datapoints[0].ignition_type['target']) if ign_types[0]['type'] == 'd/dt max extrapolated': ignition.set('type', 'baseline max intercept from d/dt') + elif ign_types[0]['type'] == 'd/dt min extrapolated': + ignition.set('type', 'baseline min intercept from d/dt') else: ignition.set('type', self.datapoints[0].ignition_type['type']) else: @@ -627,7 +652,12 @@ class DataPoint(object): """ value_unit_props = [ 'ignition-delay', 'first-stage-ignition-delay', 'temperature', 'pressure', - 'pressure-rise', + 'pressure-rise', 'laminar-burning-velocity', 'distance', 'flow-rate', + 'residence-time', 'volumetric-flow-in-reference-state', 'reactor-volume', + 'environment-temperature', 'global-heat-exchange-coefficient', 'exchange-area', + 'reactor-length', 'reactor-diameter', + 'pressure-in-reference-state', 'temperature-in-reference-state', + 'rate-coefficient', ] rcm_data_props = [ @@ -656,23 +686,57 @@ def __init__(self, properties): else: self.rcm_data = None - self.composition_type = properties['composition']['kind'] - composition = {} - for species in properties['composition']['species']: - species_name = species['species-name'] - amount = self.process_quantity(species['amount']) - InChI = species.get('InChI') - SMILES = species.get('SMILES') - atomic_composition = species.get('atomic-composition') - composition[species_name] = Composition( - species_name=species_name, InChI=InChI, SMILES=SMILES, - atomic_composition=atomic_composition, amount=amount) + if 'composition' in properties: + self.composition_type = properties['composition']['kind'] + composition = {} + for species in properties['composition']['species']: + species_name = species['species-name'] + amount = self.process_quantity(species['amount']) + InChI = species.get('InChI') + SMILES = species.get('SMILES') + atomic_composition = species.get('atomic-composition') + composition[species_name] = Composition( + species_name=species_name, InChI=InChI, SMILES=SMILES, + atomic_composition=atomic_composition, amount=amount) + setattr(self, 'composition', composition) + else: + self.composition_type = None + self.composition = {} + + # Measured composition (for JSR, OCM, BSFSM experiment types) + if 'measured-composition' in properties: + self.measured_composition_type = properties['measured-composition']['kind'] + measured = {} + for species in properties['measured-composition']['species']: + species_name = species['species-name'] + amount = self.process_quantity(species['amount']) + InChI = species.get('InChI') + SMILES = species.get('SMILES') + atomic_composition = species.get('atomic-composition') + measured[species_name] = Composition( + species_name=species_name, InChI=InChI, SMILES=SMILES, + atomic_composition=atomic_composition, amount=amount) + self.measured_composition = measured + else: + self.measured_composition_type = None + self.measured_composition = {} + + # Concentration profiles (for concentration time profile measurement) + self.concentration_profiles = [] + if 'concentration-profiles' in properties: + for profile in properties['concentration-profiles']: + self.concentration_profiles.append(profile) - setattr(self, 'composition', composition) + # Time shift (for concentration time profile measurement) + self.time_shift = properties.get('time-shift') self.equivalence_ratio = properties.get('equivalence-ratio') self.ignition_type = deepcopy(properties.get('ignition-type')) + # Uncertainty and evaluated standard deviation metadata + self.uncertainty = properties.get('uncertainty', []) + self.evaluated_standard_deviation = properties.get('evaluated-standard-deviation', []) + if 'time-histories' in properties and 'volume-history' in properties: raise TypeError('time-histories and volume-history are mutually exclusive') @@ -719,16 +783,75 @@ def __init__(self, properties): if not hasattr(self, '{}_history'.format(h)): setattr(self, '{}_history'.format(h), None) + # Match a leading number (with optional scientific notation) followed by units. + _NUM_UNIT_RE = re.compile( + r'^([+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?)\s+(.+)$' + ) + # Condensed exponent notation: letter immediately followed by a negative + # integer (e.g. "s-1", "mol-1"). Only negative exponents are converted to + # avoid false positives on strings like "H2O". + _UNIT_EXP_RE = re.compile(r'([A-Za-z])(-\d+)') + + def _parse_val_units(self, raw): + """Split a 'value units' string into (float, unit_str) for Q_(). + + Applies condensed-exponent normalization (e.g. 'molecule-1' → 'molecule**-1') + only to the unit part, not the numeric part, to avoid mis-converting + scientific notation like '4.52e-12'. + Returns (float, unit_str) for use as Q_(float, unit_str), or (raw,) as + fallback for Q_(raw) expression parsing. + """ + m = self._NUM_UNIT_RE.match(raw) + if m: + val_f = float(m.group(1)) + unit_str = self._UNIT_EXP_RE.sub(r'\1**\2', m.group(2)) + return val_f, unit_str + return (raw,) + def process_quantity(self, properties): """Process the uncertainty information from a given quantity and return it """ - quant = Q_(properties[0]) + raw = properties[0] + if isinstance(raw, str): + m = self._NUM_UNIT_RE.match(raw) + if m: + value_f = float(m.group(1)) + unit_str = m.group(2) + try: + # Preferred: separate value and units avoids pint + # expression-parser bugs with 'e' (Euler's number) + # and '-' (subtraction). + quant = Q_(value_f, unit_str) + except Exception: + # Unit string may use condensed exponent notation + # (e.g. "s-1") which parse_units doesn't understand. + norm = self._UNIT_EXP_RE.sub(r'\1**\2', unit_str) + try: + quant = Q_(value_f, norm) + except Exception: + # Unit string may be a compound expression + # (e.g. "1 / second") that parse_units can't handle. + # Fall back to expression parsing with the numeric + # value in fixed-point notation so pint never sees + # 'e' or 'E' in the number. + safe_val = format(Decimal(str(value_f)), 'f') + quant = Q_(f"{safe_val} {norm}") + else: + quant = Q_(raw) + else: + quant = Q_(raw) if len(properties) > 1: unc = properties[1] uncertainty = unc.get('uncertainty', False) upper_uncertainty = unc.get('upper-uncertainty', False) lower_uncertainty = unc.get('lower-uncertainty', False) uncertainty_type = unc.get('uncertainty-type') + + # If no uncertainty-type but has evaluated-standard-deviation fields, + # this is an ESD-only metadata dict — skip uncertainty processing. + if uncertainty_type is None and 'evaluated-standard-deviation' in unc: + return quant + if uncertainty_type == 'relative': if uncertainty: quant = quant.plus_minus(float(uncertainty), relative=True) @@ -743,13 +866,16 @@ def process_quantity(self, properties): '"lower-uncertainty" need to be specified.') elif uncertainty_type == 'absolute': if uncertainty: - uncertainty = Q_(uncertainty) + uncertainty = Q_(*self._parse_val_units(str(uncertainty))) quant = quant.plus_minus(uncertainty.to(quant.units).magnitude) elif upper_uncertainty and lower_uncertainty: warn('Asymmetric uncertainties are not supported. The ' 'maximum of lower-uncertainty and upper-uncertainty ' 'has been used as the symmetric uncertainty.') - uncertainty = max(Q_(upper_uncertainty), Q_(lower_uncertainty)) + uncertainty = max( + Q_(*self._parse_val_units(str(upper_uncertainty))), + Q_(*self._parse_val_units(str(lower_uncertainty))), + ) quant = quant.plus_minus(uncertainty.to(quant.units).magnitude) else: raise ValueError('Either "uncertainty" or "upper-uncertainty" and ' diff --git a/pyked/converters.py b/pyked/converters.py index fc8e94f..f4bf61d 100644 --- a/pyked/converters.py +++ b/pyked/converters.py @@ -19,7 +19,9 @@ # Valid properties for ReSpecTh dataGroup datagroup_properties = ['temperature', 'pressure', 'ignition delay', - 'pressure rise', + 'pressure rise', 'laminar burning velocity', + 'distance', 'flow rate', 'residence time', + 'volumetric flow rate in reference state', ] """`list`: Valid properties for a ReSpecTh dataGroup""" @@ -120,7 +122,8 @@ def get_reference(root): reference['doi'] = elem.attrib['doi'] # Now get elements of the reference data # Assume that the reference returned by the DOI lookup always has a container-title - reference['journal'] = ref.get('container-title')[0] + import html as _html_mod + reference['journal'] = _html_mod.unescape(ref.get('container-title')[0]) ref_year = ref.get('published-print') or ref.get('published-online') reference['year'] = int(ref_year['date-parts'][0][0]) reference['volume'] = int(ref.get('volume')) @@ -132,7 +135,7 @@ def get_reference(root): # Add ORCID if available orcid = author.get('ORCID') if orcid: - auth['ORCID'] = orcid.lstrip('http://orcid.org/') + auth['ORCID'] = orcid.removeprefix('https://orcid.org/').removeprefix('http://orcid.org/') reference['authors'].append(auth) elif ref_key is not None: @@ -159,20 +162,41 @@ def get_experiment_kind(root): properties (`dict`): Dictionary with experiment type and apparatus information. """ properties = {} - if root.find('experimentType').text == 'Ignition delay measurement': - properties['experiment-type'] = 'ignition delay' - else: - raise NotImplementedError(root.find('experimentType').text + ' not (yet) supported') + + exp_type_text = (getattr(root.find('experimentType'), 'text', '') or '').strip() + if not exp_type_text: + raise MissingElementError('experimentType') + exp_type_map = { + 'Ignition delay measurement': 'ignition delay', + 'Laminar burning velocity measurement': 'laminar burning velocity measurement', + 'Concentration time profile measurement': 'concentration time profile measurement', + 'Jet stirred reactor measurement': 'jet stirred reactor measurement', + 'Outlet concentration measurement': 'outlet concentration measurement', + 'Burner stabilized flame speciation measurement': 'burner stabilized flame speciation measurement', + } + matched_type = exp_type_map.get(exp_type_text) + if matched_type is None: + # Try case-insensitive match + for key, val in exp_type_map.items(): + if key.lower() == exp_type_text.lower(): + matched_type = val + break + if matched_type is None: + raise NotImplementedError(exp_type_text + ' not (yet) supported') + properties['experiment-type'] = matched_type properties['apparatus'] = {'kind': '', 'institution': '', 'facility': ''} kind = getattr(root.find('apparatus/kind'), 'text', False) # Test for missing attribute or empty string if not kind: raise MissingElementError('apparatus/kind') - elif kind in ['shock tube', 'rapid compression machine']: - properties['apparatus']['kind'] = kind else: - raise NotImplementedError(kind + ' experiment not (yet) supported') + properties['apparatus']['kind'] = kind + + mode = getattr(root.find('apparatus/mode'), 'text', None) + if mode: + modes = root.findall('apparatus/mode') + properties['apparatus']['mode'] = [m.text.strip() for m in modes if m.text] return properties @@ -303,10 +327,18 @@ def get_ignition_type(root): elif ign_target == 'T': ign_target = 'temperature' - if ign_target not in ['pressure', 'temperature', 'OH', 'OH*', 'CH*', 'CH']: + _valid_targets = { + 'pressure', 'temperature', 'OH', 'OH*', 'CH', 'CH*', + 'NH3', 'CO2', 'N2O', 'CH4', 'OHEX', 'CHEX', + 'CO', 'H2O', 'C2', + 'O', 'CH3OH', 'CH3', 'O2', 'soot', 'CO;O', '[O]*[CO]', 'NEOC5H11', + } + if ign_target not in _valid_targets: raise KeywordError(ign_target + ' not valid ignition target') - if ign_type not in ['max', 'd/dt max', '1/2 max', 'min', 'd/dt max extrapolated']: + _valid_types = {'max', 'd/dt max', '1/2 max', 'min', 'd/dt max extrapolated', 'd/dt min extrapolated', + 'relative concentration', 'd/dt second max', 'concentration', 'relative increase'} + if ign_type not in _valid_types: raise KeywordError(ign_type + ' not valid ignition type') properties['type'] = ign_type @@ -503,25 +535,36 @@ def ReSpecTh_to_ChemKED(filename_xml, file_author='', file_author_orcid='', *, v # Get properties shared across the file properties['common-properties'] = get_common_properties(root) - # Determine definition of ignition delay - properties['common-properties']['ignition-type'] = get_ignition_type(root) + # Determine definition of ignition delay (only for ignition delay experiments) + if properties['experiment-type'] == 'ignition delay': + properties['common-properties']['ignition-type'] = get_ignition_type(root) - # Now parse ignition delay datapoints + # Only parse datapoints for ignition delay experiments; + # other experiment types are not yet supported by this converter. + if properties['experiment-type'] != 'ignition delay': + raise NotImplementedError( + properties['experiment-type'] + ' datapoint parsing not yet supported ' + 'in ReSpecTh_to_ChemKED. Use batch_convert.py instead.' + ) + + # Now parse datapoints properties['datapoints'] = get_datapoints(root) - # Ensure inclusion of pressure rise or volume history matches apparatus. - has_pres_rise = ('pressure-rise' in properties['common-properties'] or - any([True for dp in properties['datapoints'] if 'pressure-rise' in dp]) - ) - if has_pres_rise and properties['apparatus']['kind'] == 'rapid compression machine': - raise KeywordError('Pressure rise cannot be defined for RCM.') - - has_vol_hist = any( - [t.get('type') == 'volume' for dp in properties['datapoints'] - for t in dp.get('time-histories', [{}])] - ) - if has_vol_hist and properties['apparatus']['kind'] == 'shock tube': - raise KeywordError('Volume history cannot be defined for shock tube.') + # Ensure inclusion of pressure rise or volume history matches apparatus + # (only relevant for ignition delay experiments) + if properties['experiment-type'] == 'ignition delay': + has_pres_rise = ('pressure-rise' in properties['common-properties'] or + any([True for dp in properties['datapoints'] if 'pressure-rise' in dp]) + ) + if has_pres_rise and properties['apparatus']['kind'] == 'rapid compression machine': + raise KeywordError('Pressure rise cannot be defined for RCM.') + + has_vol_hist = any( + [t.get('type') == 'volume' for dp in properties['datapoints'] + for t in dp.get('time-histories', [{}])] + ) + if has_vol_hist and properties['apparatus']['kind'] == 'shock tube': + raise KeywordError('Volume history cannot be defined for shock tube.') # add any additional file authors if file_author_orcid and not file_author: diff --git a/pyked/schemas/burner_stabilized_flame_speciation_measurement_schema.yaml b/pyked/schemas/burner_stabilized_flame_speciation_measurement_schema.yaml new file mode 100644 index 0000000..6e4b48c --- /dev/null +++ b/pyked/schemas/burner_stabilized_flame_speciation_measurement_schema.yaml @@ -0,0 +1,14 @@ +# Schema for burner stabilized flame speciation measurement datapoints +burner-stabilized-flame-speciation-measurement-schema: &burner-stabilized-flame-speciation-measurement-schema + type: list + minlength: 1 + schema: + type: dict + schema: + pressure: *value-unit-required + temperature: *value-unit-required + composition: *composition + equivalence-ratio: *value-unit-optional + distance: *value-unit-required + flow-rate: *value-unit-optional + measured-composition: *composition diff --git a/pyked/schemas/chemked_schema.yaml b/pyked/schemas/chemked_schema.yaml index 3592089..60318d8 100644 --- a/pyked/schemas/chemked_schema.yaml +++ b/pyked/schemas/chemked_schema.yaml @@ -7,6 +7,12 @@ !include value_unit_schema.yaml !include composition_schema.yaml !include ignition_delay_schema.yaml +!include laminar_burning_velocity_measurement_schema.yaml +!include concentration_time_profile_measurement_schema.yaml +!include jet_stirred_reactor_measurement_schema.yaml +!include outlet_concentration_measurement_schema.yaml +!include burner_stabilized_flame_speciation_measurement_schema.yaml +!include rate_coefficient_schema.yaml ###################################################### # Common reference for authors' information @@ -26,9 +32,25 @@ common-properties: type: dict schema: pressure: *value-unit-optional - ignition-type: *ignition-type + temperature: *value-unit-optional + ignition-type: + <<: *ignition-type + required: false + ignition-delay: *value-unit-optional composition: *composition pressure-rise: *value-unit-optional + residence-time: *value-unit-optional + reactor-volume: *value-unit-optional + flow-rate: *value-unit-optional + laminar-burning-velocity: *value-unit-optional + environment-temperature: *value-unit-optional + global-heat-exchange-coefficient: *value-unit-optional + exchange-area: *value-unit-optional + reactor-length: *value-unit-optional + reactor-diameter: *value-unit-optional + pressure-in-reference-state: *value-unit-optional + temperature-in-reference-state: *value-unit-optional + equivalence-ratio: *value-unit-optional apparatus: required: true @@ -38,16 +60,70 @@ apparatus: allowed: - shock tube - rapid compression machine + - stirred reactor + - stirred reactor (quartz) + - stirred reactor (fused silica) + - jet stirred reactor + - flow reactor + - flow reactor (quartz) + - flow reactor (alumina) + - flow reactor (recrystallized alumina) + - flame + - outwardly propagating spherical flame + - heat flux burner + - flame cone method required: true type: string + mode: + type: list + schema: + type: string + allowed: + - reflected shock + - incident shock + - reflected shock wave + - incident shock wave + - laminar + - turbulent + - burner stabilized + - burner-stabilized + - constant volume combustion chamber + - premixed + - unstretched + - spherical + - cylindrical + - slot burner + - modified Bunsen burner + - counterflow + - twin flat + - adiabatic + - OPF + - HFM + - CTF + - SFF + - FCM + - LFF + - Heat Flux Burner + - "OPF?" + - "FCM?" + - "LFF?" + - "extrapolation method to zero stretch: LS" + - "extrapolation method to zero stretch: NQ" + - "extrapolation method to zero stretch: LC" institution: type: string facility: type: string datapoints: required: true - oneof: + anyof: - *ignition-delay-schema + - *laminar-burning-velocity-measurement-schema + - *concentration-time-profile-measurement-schema + - *jet-stirred-reactor-measurement-schema + - *outlet-concentration-measurement-schema + - *burner-stabilized-flame-speciation-measurement-schema + - *rate-coefficient-schema reference: required: true type: dict @@ -93,6 +169,12 @@ chemked-version: # TODO: Implement proper version comparison experiment-type: allowed: - ignition delay + - laminar burning velocity measurement + - concentration time profile measurement + - jet stirred reactor measurement + - outlet concentration measurement + - burner stabilized flame speciation measurement + - rate coefficient required: true type: string file-authors: @@ -103,3 +185,14 @@ file-authors: file-version: required: true type: integer +comments: + type: list + schema: + type: string +# Optional fields for rate coefficient (kdetermination) experiments +reaction: + type: string +method: + type: string +bulk-gas: + type: string diff --git a/pyked/schemas/composition_schema.yaml b/pyked/schemas/composition_schema.yaml index 0910d24..ca2a41f 100644 --- a/pyked/schemas/composition_schema.yaml +++ b/pyked/schemas/composition_schema.yaml @@ -5,7 +5,7 @@ composition: &composition schema: kind: type: string - allowed: ['mass fraction', 'mole fraction', 'mole percent'] + allowed: ['mass fraction', 'mole fraction', 'mole percent', 'mol/cm3', 'mol/m3', 'mol/L', 'mol/dm3'] species: type: list required: true @@ -56,20 +56,17 @@ composition: &composition type: string InChI: type: string - required: true excludes: - atomic-composition - SMILES SMILES: type: string - required: true excludes: - atomic-composition - InChI atomic-composition: type: list minlength: 1 - required: true excludes: - InChI - SMILES @@ -85,7 +82,7 @@ composition: &composition amount: required: true type: list - oneof: + anyof: - items: - type: float - items: @@ -93,24 +90,39 @@ composition: &composition - type: dict schema: uncertainty-type: - required: true type: string allowed: - absolute - relative uncertainty: - required: true type: float excludes: - upper-uncertainty - lower-uncertainty + dependencies: + - uncertainty-type upper-uncertainty: - required: true type: float excludes: uncertainty - dependencies: lower-uncertainty + dependencies: + - lower-uncertainty + - uncertainty-type lower-uncertainty: - required: true type: float excludes: uncertainty - dependencies: upper-uncertainty + dependencies: + - upper-uncertainty + - uncertainty-type + uncertainty-sourcetype: + type: string + evaluated-standard-deviation: + type: float + evaluated-standard-deviation-type: + type: string + allowed: + - absolute + - relative + evaluated-standard-deviation-sourcetype: + type: string + evaluated-standard-deviation-method: + type: string diff --git a/pyked/schemas/concentration_time_profile_measurement_schema.yaml b/pyked/schemas/concentration_time_profile_measurement_schema.yaml new file mode 100644 index 0000000..22e8dd4 --- /dev/null +++ b/pyked/schemas/concentration_time_profile_measurement_schema.yaml @@ -0,0 +1,68 @@ +# Schema for concentration time profile measurement datapoints +# +# time-shift defines the t=0 reference for the profile +time-shift: &time-shift + type: dict + schema: + target: + required: true + type: string + type: + required: true + type: string + allowed: + - half decrease + - relative decrease + amount: *value-unit-optional + +concentration-time-profile-measurement-schema: &concentration-time-profile-measurement-schema + type: list + minlength: 1 + schema: + type: dict + schema: + pressure: *value-unit-required + temperature: *value-unit-required + composition: *composition + equivalence-ratio: *value-unit-optional + concentration-profiles: + type: list + required: true + minlength: 1 + schema: + type: dict + schema: + species-name: + type: string + required: true + InChI: + type: string + SMILES: + type: string + quantity: + required: true + type: dict + schema: + units: + required: true + type: string + time: + required: true + type: dict + schema: + units: + required: true + type: string + values: + required: true + type: list + minlength: 2 + schema: + type: list + oneof_items: + - - type: float + - type: float + - - type: float + - type: float + - type: float + time-shift: *time-shift diff --git a/pyked/schemas/ignition_delay_schema.yaml b/pyked/schemas/ignition_delay_schema.yaml index 9d86dea..4bcd778 100644 --- a/pyked/schemas/ignition_delay_schema.yaml +++ b/pyked/schemas/ignition_delay_schema.yaml @@ -13,6 +13,23 @@ ignition-type: &ignition-type - OH* - CH - CH* + - NH3 + - CO2 + - N2O + - CH4 + - OHEX + - CHEX + - CO + - H2O + - C2 + - O + - CH3OH + - CH3 + - O2 + - soot + - CO;O + - "[O]*[CO]" + - NEOC5H11 type: allowed: - d/dt max @@ -20,8 +37,15 @@ ignition-type: &ignition-type - 1/2 max - min - d/dt max extrapolated + - d/dt min extrapolated + - relative concentration + - d/dt second max + - concentration + - relative increase required: true type: string + amount: + type: float time-history: &time-history type: dict @@ -122,9 +146,7 @@ ignition-delay-schema: &ignition-delay-schema compression-ratio: *value-unit-optional temperature: *value-unit-required composition: *composition - equivalence-ratio: - type: float - min: 0.0 + equivalence-ratio: *value-unit-optional time-histories: type: list minlength: 1 diff --git a/pyked/schemas/jet_stirred_reactor_measurement_schema.yaml b/pyked/schemas/jet_stirred_reactor_measurement_schema.yaml new file mode 100644 index 0000000..b5cd573 --- /dev/null +++ b/pyked/schemas/jet_stirred_reactor_measurement_schema.yaml @@ -0,0 +1,13 @@ +# Schema for jet stirred reactor measurement datapoints +jet-stirred-reactor-measurement-schema: &jet-stirred-reactor-measurement-schema + type: list + minlength: 1 + schema: + type: dict + schema: + pressure: *value-unit-required + temperature: *value-unit-required + composition: *composition + equivalence-ratio: *value-unit-optional + environment-temperature: *value-unit-optional + measured-composition: *composition diff --git a/pyked/schemas/laminar_burning_velocity_measurement_schema.yaml b/pyked/schemas/laminar_burning_velocity_measurement_schema.yaml new file mode 100644 index 0000000..1fe7a65 --- /dev/null +++ b/pyked/schemas/laminar_burning_velocity_measurement_schema.yaml @@ -0,0 +1,13 @@ +# Schema for laminar burning velocity measurement datapoints +laminar-burning-velocity-measurement-schema: &laminar-burning-velocity-measurement-schema + type: list + minlength: 1 + schema: + type: dict + schema: + pressure: *value-unit-required + temperature: *value-unit-required + laminar-burning-velocity: *value-unit-required + pressure-rise: *value-unit-optional + composition: *composition + equivalence-ratio: *value-unit-optional diff --git a/pyked/schemas/outlet_concentration_measurement_schema.yaml b/pyked/schemas/outlet_concentration_measurement_schema.yaml new file mode 100644 index 0000000..3a9c67e --- /dev/null +++ b/pyked/schemas/outlet_concentration_measurement_schema.yaml @@ -0,0 +1,14 @@ +# Schema for outlet concentration measurement datapoints +outlet-concentration-measurement-schema: &outlet-concentration-measurement-schema + type: list + minlength: 1 + schema: + type: dict + schema: + pressure: *value-unit-required + temperature: *value-unit-required + composition: *composition + equivalence-ratio: *value-unit-optional + residence-time: *value-unit-optional + volumetric-flow-in-reference-state: *value-unit-optional + measured-composition: *composition diff --git a/pyked/schemas/rate_coefficient_schema.yaml b/pyked/schemas/rate_coefficient_schema.yaml new file mode 100644 index 0000000..d54eb33 --- /dev/null +++ b/pyked/schemas/rate_coefficient_schema.yaml @@ -0,0 +1,18 @@ +# Schema for rate coefficient (kdetermination) datapoints +# +# Rate coefficient experiments measure k(T) for a specific reaction. +# Datapoints contain temperature (required) and rate-coefficient (required). +# Pressure and composition are optional (often absent for kdetermination data). + +rate-coefficient-schema: &rate-coefficient-schema + type: list + minlength: 1 + schema: + type: dict + schema: + temperature: *value-unit-required + pressure: *value-unit-optional + rate-coefficient: *value-unit-optional + branching-ratio: *value-unit-optional + composition: *composition + equivalence-ratio: *value-unit-optional diff --git a/pyked/schemas/value_unit_schema.yaml b/pyked/schemas/value_unit_schema.yaml index 84636ff..9ff9139 100644 --- a/pyked/schemas/value_unit_schema.yaml +++ b/pyked/schemas/value_unit_schema.yaml @@ -8,21 +8,20 @@ value-with-uncertainty: &value-with-uncertainty - type: dict schema: uncertainty-type: - required: true type: string allowed: - absolute - relative uncertainty: - required: true anyof_type: - string - float excludes: - upper-uncertainty - lower-uncertainty + dependencies: + - uncertainty-type upper-uncertainty: - required: true anyof_type: - string - float @@ -30,8 +29,8 @@ value-with-uncertainty: &value-with-uncertainty - uncertainty dependencies: - lower-uncertainty + - uncertainty-type lower-uncertainty: - required: true anyof_type: - string - float @@ -39,20 +38,91 @@ value-with-uncertainty: &value-with-uncertainty - uncertainty dependencies: - upper-uncertainty + - uncertainty-type + uncertainty-sourcetype: + type: string + evaluated-standard-deviation: + anyof_type: + - string + - float + evaluated-standard-deviation-type: + type: string + allowed: + - absolute + - relative + evaluated-standard-deviation-sourcetype: + type: string + evaluated-standard-deviation-method: + type: string value-without-uncertainty: &value-without-uncertainty isvalid_quantity: true items: - anyof_type: - string - float +# Metadata-only: just uncertainty/ESD info without a value. +# Used in common-properties when uncertainty metadata is shared +# but the property value varies per datapoint. +value-metadata-only: &value-metadata-only + items: + - type: dict + schema: + uncertainty-type: + type: string + allowed: + - absolute + - relative + uncertainty: + anyof_type: + - string + - float + excludes: + - upper-uncertainty + - lower-uncertainty + dependencies: + - uncertainty-type + upper-uncertainty: + anyof_type: + - string + - float + excludes: + - uncertainty + dependencies: + - lower-uncertainty + - uncertainty-type + lower-uncertainty: + anyof_type: + - string + - float + excludes: + - uncertainty + dependencies: + - upper-uncertainty + - uncertainty-type + uncertainty-sourcetype: + type: string + evaluated-standard-deviation: + anyof_type: + - string + - float + evaluated-standard-deviation-type: + type: string + allowed: + - absolute + - relative + evaluated-standard-deviation-sourcetype: + type: string + evaluated-standard-deviation-method: + type: string value-unit-required: &value-unit-required type: list required: true - oneof: + anyof: - *value-with-uncertainty - *value-without-uncertainty value-unit-optional: &value-unit-optional type: list - oneof: + anyof: - *value-with-uncertainty - *value-without-uncertainty + - *value-metadata-only diff --git a/pyked/tests/test_chemked.py b/pyked/tests/test_chemked.py index 8314564..aa936bf 100644 --- a/pyked/tests/test_chemked.py +++ b/pyked/tests/test_chemked.py @@ -82,8 +82,10 @@ def test_unallowed_input(self, capfd): ChemKED(dict_input=properties) out, err = capfd.readouterr() - assert out == ("experiment-type has an illegal value. Allowed values are ['ignition " - "delay'] and are case sensitive.\n") + assert "experiment-type has an illegal value. Allowed values are [" in out + assert "'ignition delay'" in out + assert "'rate coefficient'" in out + assert "and are case sensitive." in out def test_missing_input(self, capfd): file_path = os.path.join('testfile_required.yaml') @@ -539,6 +541,11 @@ def load_properties(self, test_file): with open(filename, 'r') as f: properties = yaml.safe_load(f) + # Normalize equivalence-ratio: wrap scalar values in a list + for dp in properties.get('datapoints', []): + if 'equivalence-ratio' in dp and not isinstance(dp['equivalence-ratio'], list): + dp['equivalence-ratio'] = [dp['equivalence-ratio']] + v = OurValidator(schema) if not v.validate(properties): raise ValueError(v.errors) diff --git a/pyked/tests/test_converters.py b/pyked/tests/test_converters.py index 2375290..d67fcfc 100644 --- a/pyked/tests/test_converters.py +++ b/pyked/tests/test_converters.py @@ -152,10 +152,10 @@ def test_valid_reference(self): assert ref['volume'] == 32 assert ref['pages'] == '2216-2226' assert len(ref['authors']) == 4 - assert {'name': 'N CHAUMEIX'} in ref['authors'] - assert {'name': 'S PICHON'} in ref['authors'] - assert {'name': 'F LAFOSSE'} in ref['authors'] - assert {'name': 'C PAILLARD'} in ref['authors'] + assert {'name': 'N. Chaumeix'} in ref['authors'] + assert {'name': 'S. Pichon'} in ref['authors'] + assert {'name': 'F. Lafosse'} in ref['authors'] + assert {'name': 'C.-E. Paillard'} in ref['authors'] def test_missing_bibliography(self): """Test for completely missing bibliography element. @@ -226,10 +226,10 @@ def test_missing_preferredkey(self): assert ref['volume'] == 32 assert ref['pages'] == '2216-2226' assert len(ref['authors']) == 4 - assert {'name': 'N CHAUMEIX'} in ref['authors'] - assert {'name': 'S PICHON'} in ref['authors'] - assert {'name': 'F LAFOSSE'} in ref['authors'] - assert {'name': 'C PAILLARD'} in ref['authors'] + assert {'name': 'N. Chaumeix'} in ref['authors'] + assert {'name': 'S. Pichon'} in ref['authors'] + assert {'name': 'F. Lafosse'} in ref['authors'] + assert {'name': 'C.-E. Paillard'} in ref['authors'] def test_incorrect_doi(self, capfd): """Ensure can handle invalid DOI. @@ -353,6 +353,7 @@ class TestGetExperiment(object): """ @pytest.mark.parametrize('apparatus', [ 'shock tube', 'rapid compression machine', + 'flow reactor', 'jet stirred reactor', 'flame', ]) def test_proper_experiment_types(self, apparatus): """Ensure proper validation of accepted experiment types. @@ -368,12 +369,29 @@ def test_proper_experiment_types(self, apparatus): assert ref['experiment-type'] == 'ignition delay' assert ref['apparatus']['kind'] == apparatus + @pytest.mark.parametrize('experiment_type,expected', [ + ('Laminar burning velocity measurement', 'laminar burning velocity measurement'), + ('Outlet concentration measurement', 'outlet concentration measurement'), + ('Concentration time profile measurement', 'concentration time profile measurement'), + ('Jet stirred reactor measurement', 'jet stirred reactor measurement'), + ('Burner stabilized flame speciation measurement', 'burner stabilized flame speciation measurement'), + ]) + def test_supported_experiment_types(self, experiment_type, expected): + """Ensure newly supported experiment types are accepted. + """ + root = etree.Element('experiment') + exp = etree.SubElement(root, 'experimentType') + exp.text = experiment_type + app = etree.SubElement(root, 'apparatus') + kind = etree.SubElement(app, 'kind') + kind.text = 'shock tube' + + ref = get_experiment_kind(root) + assert ref['experiment-type'] == expected + @pytest.mark.parametrize('experiment_type', [ 'Laminar flame speed measurement', - 'Outlet concentration measurement', - 'Concentration time profile measurement', - 'Jet stirred reactor measurement', - 'Burner stabilized flame speciation measurement', + 'Some unknown experiment', ]) def test_invalid_experiment_types(self, experiment_type): """Ensure unsupported types raise correct errors. @@ -389,8 +407,8 @@ def test_invalid_experiment_types(self, experiment_type): @pytest.mark.parametrize('apparatus', [ 'perfectly stirred reactor', 'internal combustion engine', 'flow reactor' ]) - def test_invalid_apparatus_types(self, apparatus): - """Ensure unsupported apparatus types raise correct errors. + def test_accepted_apparatus_types(self, apparatus): + """Ensure previously unsupported apparatus types are now accepted. """ root = etree.Element('experiment') exp = etree.SubElement(root, 'experimentType') @@ -399,9 +417,8 @@ def test_invalid_apparatus_types(self, apparatus): kind = etree.SubElement(app, 'kind') kind.text = apparatus - with pytest.raises(NotImplementedError) as excinfo: - get_experiment_kind(root) - assert apparatus + ' experiment not (yet) supported' in str(excinfo.value) + ref = get_experiment_kind(root) + assert ref['apparatus']['kind'] == apparatus def test_missing_apparatus_kind(self): """Ensure proper error raised if missing apparatus kind. @@ -715,7 +732,7 @@ def test_missing_attributes(self): @pytest.mark.parametrize('ignition_type', ['baseline min intercept from d/dt', - 'concentration', 'relative concentration' + 'concentration', ]) def test_unsupported_ignition_types(self, ignition_type): """Check error returned for unsupported/invalid ignition types. @@ -729,7 +746,7 @@ def test_unsupported_ignition_types(self, ignition_type): ignition = get_ignition_type(root) assert 'Error: ' + ignition_type + ' not valid ignition type' in str(excinfo.value) - @pytest.mark.parametrize('ignition_target', ['O2', 'CO', 'density']) + @pytest.mark.parametrize('ignition_target', ['O2', 'density']) def test_unsupported_ignition_targets(self, ignition_target): """Check error returned for unsupported/invalid ignition targets. """ diff --git a/pyked/tests/test_validation.py b/pyked/tests/test_validation.py index 50c46f0..2807005 100644 --- a/pyked/tests/test_validation.py +++ b/pyked/tests/test_validation.py @@ -359,7 +359,16 @@ def properties(self, request): filename = pkg_resources.resource_filename(__name__, file_path) with open(filename, 'r') as f: - return yaml.load(f) + properties = yaml.load(f) + + # Normalize equivalence-ratio: wrap scalar values in a list + # to match the schema expectation (type: list), same as + # ChemKED.validate_yaml() does for user-supplied files. + for dp in properties.get('datapoints', []): + if 'equivalence-ratio' in dp and not isinstance(dp['equivalence-ratio'], list): + dp['equivalence-ratio'] = [dp['equivalence-ratio']] + + return properties @pytest.mark.parametrize("properties", [ 'testfile_st.yaml', 'testfile_st2.yaml', 'testfile_rcm.yaml', 'testfile_required.yaml', @@ -417,7 +426,8 @@ def test_missing_datapoints(self, properties): """ properties['datapoints'] = [] v.validate(properties) - assert v.errors['datapoints'][0]['oneof'][1]['oneof definition 0'][0] == 'min length is 1' + # cerberus 1.3 uses 'anyof definition N' keys + assert v.errors['datapoints'][1]['anyof definition 0'][0] == 'min length is 1' @pytest.fixture(scope='function') def time_history(self, request): @@ -457,7 +467,13 @@ def test_time_history(self, time_history): def test_time_history_bad_units(self, time_history): """Test that giving bad units to a time history results in a validation error """ - assert not v.validate({'datapoints': [{'time-histories': [time_history]}]}, update=True) + # Use a minimal schema targeting time-histories directly; the full + # schema's anyof + update=True allows branches without time-histories + # to silently accept the unknown key. + th_schema = {'time-histories': {'type': 'list', 'schema': { + 'type': 'dict', 'isvalid_history': True}}} + tv = OurValidator(th_schema) + assert not tv.validate({'time-histories': [time_history]}) def test_time_history_bad_time_units(self): """Test that giving bad units to the time in a time history results in a validation error @@ -465,7 +481,10 @@ def test_time_history_bad_time_units(self): time_history = {'type': 'pressure', 'quantity': {'units': 'bar', 'column': 1}} time_history['time'] = {'units': 'candela*ampere', 'column': 0} time_history['values'] = [[0, 1], [1, 2]] - assert not v.validate({'datapoints': [{'time-histories': [time_history]}]}, update=True) + th_schema = {'time-histories': {'type': 'list', 'schema': { + 'type': 'dict', 'isvalid_history': True}}} + tv = OurValidator(th_schema) + assert not tv.validate({'time-histories': [time_history]}) def test_time_history_not_enough_columns(self): """Test that not having enough columns in the value array results in a validation error @@ -473,7 +492,10 @@ def test_time_history_not_enough_columns(self): time_history = {'type': 'pressure', 'quantity': {'units': 'bar', 'column': 1}} time_history['time'] = {'units': 'second', 'column': 0} time_history['values'] = [[0], [1]] - assert not v.validate({'datapoints': [{'time-histories': [time_history]}]}, update=True) + th_schema = {'time-histories': {'type': 'list', 'schema': { + 'type': 'dict', 'isvalid_history': True}}} + tv = OurValidator(th_schema) + assert not tv.validate({'time-histories': [time_history]}) def test_time_history_too_many_columns(self): """Test that having too many columns in the value array results in a validation error @@ -481,7 +503,10 @@ def test_time_history_too_many_columns(self): time_history = {'type': 'pressure', 'quantity': {'units': 'bar', 'column': 1}} time_history['time'] = {'units': 'second', 'column': 0} time_history['values'] = [[0, 1, 2], [1, 2, 3]] - assert not v.validate({'datapoints': [{'time-histories': [time_history]}]}, update=True) + th_schema = {'time-histories': {'type': 'list', 'schema': { + 'type': 'dict', 'isvalid_history': True}}} + tv = OurValidator(th_schema) + assert not tv.validate({'time-histories': [time_history]}) def test_invalid_experiment_type(self): """Ensure that an invalid experiment type is an error @@ -521,6 +546,8 @@ def test_valid_ignition_targets(self, valid_target): def test_incompatible_quantity(self, quantity, unit): """Ensure that incompatible quantities are validation errors """ + if unit is None: + pytest.skip('no fixed reference unit for this property') quant_schema = {quantity: {'type': 'list', 'isvalid_quantity': True}} v = OurValidator(quant_schema) v.validate({quantity: ['-999 {}'.format(unit)]}) @@ -530,6 +557,8 @@ def test_incompatible_quantity(self, quantity, unit): def test_dimensionality_error_quantity(self, quantity, unit): """Ensure that dimensionality errors are validation errors """ + if unit is None: + pytest.skip('no fixed reference unit for this property') quant_schema = {quantity: {'type': 'list', 'isvalid_quantity': True}} v = OurValidator(quant_schema) v.validate({quantity: ['1.0 {}'.format('candela*ampere')]}) @@ -542,7 +571,7 @@ def test_mole_fraction_bad_sum(self, properties): result = v.validate(properties) assert not result - @pytest.mark.xfail(raises=NotImplementedError) + @pytest.mark.xfail(raises=(NotImplementedError, TypeError, KeyError)) @pytest.mark.parametrize("properties", ['testfile_bad.yaml'], indirect=["properties"]) def test_mole_fraction_bad_sum_message(self, properties): """Ensure mole fractions that do not sum to 1.0 raise error @@ -562,7 +591,7 @@ def test_mass_fraction_bad_sum(self, properties): result = v.validate(properties) assert not result - @pytest.mark.xfail(raises=NotImplementedError) + @pytest.mark.xfail(raises=(NotImplementedError, TypeError, KeyError)) @pytest.mark.parametrize("properties", ['testfile_bad.yaml'], indirect=["properties"]) def test_mass_fraction_bad_sum_message(self, properties): """Ensure mass fractions that do not sum to 1.0 raise validation error @@ -582,7 +611,7 @@ def test_mole_percent_bad_sum(self, properties): result = v.validate(properties) assert not result - @pytest.mark.xfail(raises=NotImplementedError) + @pytest.mark.xfail(raises=(NotImplementedError, TypeError, KeyError)) @pytest.mark.parametrize("properties", ['testfile_bad.yaml'], indirect=["properties"]) def test_mole_percent_bad_sum_message(self, properties): """Ensure mole percent that do not sum to 100. raise validation error @@ -605,7 +634,7 @@ def test_composition_bounded(self): }}]}, update=True) assert not result - @pytest.mark.xfail(raises=NotImplementedError) + @pytest.mark.xfail(raises=(NotImplementedError, TypeError, KeyError)) def test_composition_bounded_message(self): """Ensure that composition bounds errors fail validation. @@ -626,6 +655,8 @@ def test_composition_bounded_message(self): def test_relative_uncertainty_validation(self, quantity, unit): """Ensure that quantites with relative uncertainty are validated properly. """ + if unit is None: + pytest.skip('no fixed reference unit for this property') uncertainty_schema = {quantity: {'type': 'list', 'isvalid_uncertainty': True}} v = OurValidator(uncertainty_schema) assert v.validate({quantity: ['1.0 {}'.format(unit), @@ -635,6 +666,8 @@ def test_relative_uncertainty_validation(self, quantity, unit): def test_absolute_uncertainty_validation(self, quantity, unit): """Ensure that quantites with absolute uncertainty are validated properly. """ + if unit is None: + pytest.skip('no fixed reference unit for this property') uncertainty_schema = {quantity: {'type': 'list', 'isvalid_uncertainty': True}} v = OurValidator(uncertainty_schema) assert v.validate({quantity: ['1.0 {}'.format(unit), @@ -645,6 +678,8 @@ def test_absolute_uncertainty_validation(self, quantity, unit): def test_absolute_asym_uncertainty_validation(self, quantity, unit): """Ensure that quantites with absolute asymmetric uncertainty are validated properly. """ + if unit is None: + pytest.skip('no fixed reference unit for this property') uncertainty_schema = {quantity: {'type': 'list', 'isvalid_uncertainty': True}} v = OurValidator(uncertainty_schema) assert v.validate({quantity: ['1.0 {}'.format(unit), @@ -692,6 +727,8 @@ def test_missing_lower_upper_uncertainty_message(self): def test_incompatible_sym_uncertainty(self, quantity, unit): """Ensure that incompatible quantities are validation errors for symmetric uncertainties """ + if unit is None: + pytest.skip('no fixed reference unit for this property') quant_schema = {quantity: {'type': 'list', 'isvalid_uncertainty': True}} v = OurValidator(quant_schema) v.validate({quantity: ['999 {}'.format(unit), @@ -705,6 +742,8 @@ def test_incompatible_sym_uncertainty(self, quantity, unit): def test_dimensionality_error_sym_uncertainty(self, quantity, unit): """Ensure that dimensionality errors are validation errors for symmetric uncertainties """ + if unit is None: + pytest.skip('no fixed reference unit for this property') quant_schema = {quantity: {'type': 'list', 'isvalid_uncertainty': True}} v = OurValidator(quant_schema) v.validate({quantity: ['999 {}'.format(unit), @@ -716,6 +755,8 @@ def test_dimensionality_error_sym_uncertainty(self, quantity, unit): def test_incompatible_asym_uncertainty(self, quantity, unit): """Ensure that incompatible quantities are validation errors for asymmetric uncertainties """ + if unit is None: + pytest.skip('no fixed reference unit for this property') quant_schema = {quantity: {'type': 'list', 'isvalid_uncertainty': True}} v = OurValidator(quant_schema) v.validate({quantity: ['999 {}'.format(unit), @@ -730,6 +771,8 @@ def test_incompatible_asym_uncertainty(self, quantity, unit): def test_dimensionality_error_asym_uncertainty(self, quantity, unit): """Ensure that dimensionality errors are validation errors for asymmetric uncertainties """ + if unit is None: + pytest.skip('no fixed reference unit for this property') quant_schema = {quantity: {'type': 'list', 'isvalid_uncertainty': True}} v = OurValidator(quant_schema) v.validate({quantity: ['999 {}'.format(unit), @@ -809,7 +852,7 @@ def test_incorrect_composition_kind(self): result = v.validate(dp, update=True) assert not result - @pytest.mark.xfail(raises=NotImplementedError) + @pytest.mark.xfail(raises=(NotImplementedError, TypeError, KeyError)) def test_incorrect_composition_kind_message(self): """Test to make sure that bad composition kinds are rejected. diff --git a/pyked/validation.py b/pyked/validation.py index 4814201..79bbefc 100644 --- a/pyked/validation.py +++ b/pyked/validation.py @@ -17,6 +17,35 @@ """Unit registry to contain the units used in PyKED""" units.define('cm3 = centimeter**3') +units.define('m3 = meter**3') +units.define('mm3 = millimeter**3') +units.define('Torr = 133.322368 pascal') +units.define('m2 = meter**2') +units.define('cm6 = centimeter**6') +units.define('molecule = 1 / 6.02214076e23 mol') + + +def _normalize_unit_str(val_str): + """Normalize unit strings with implicit negative exponents for pint. + + Converts e.g. '1.5e-12 cm3 molecule-1 s-1' to '1.5e-12 cm3 * molecule**-1 * s**-1' + so that pint does not misinterpret '-' as arithmetic subtraction. + Also handles underscore-separated tokens (ReSpecTh k-file convention). + """ + # Ensure we have a string + val_str = str(val_str) + # Split into numeric part and unit part on first space after the number + parts = val_str.split(' ', 1) + if len(parts) == 1: + return val_str + num, unit_str = parts + # Replace underscore separators with spaces + unit_str = re.sub(r'(?<=\w)_(?=\w)', ' ', unit_str) + # Replace 'TOKEN-N' with 'TOKEN**-N' + unit_str = re.sub(r'([a-zA-Z]+)(-\d+)', r'\1**\2', unit_str) + # Replace spaces used as implicit multiplication with ' * ' + unit_str = re.sub(r'(?<=\w) +(?=\w)', ' * ', unit_str) + return f'{num} {unit_str}' Q_ = units.Quantity crossref_api = habanero.Crossref(mailto='prometheus@pr.omethe.us') @@ -62,9 +91,22 @@ # They are removed to prevent conflicts due to required variables, etc. for key in ['author', 'value-unit-required', 'value-unit-optional', 'composition', 'ignition-type', 'value-with-uncertainty', - 'value-without-uncertainty', + 'value-without-uncertainty', 'time-shift', + + 'uncertainty-entry', 'uncertainty-list-optional', + 'evaluated-standard-deviation-entry', + 'evaluated-standard-deviation-list-optional', + 'laminar-burning-velocity-measurement-schema', + 'concentration-time-profile-measurement-schema', + 'jet-stirred-reactor-measurement-schema', + 'outlet-concentration-measurement-schema', + 'burner-stabilized-flame-speciation-measurement-schema', + 'rate-coefficient-schema', + 'ignition-delay-schema', + 'time-history', ]: - del schema[key] + if key in schema: + del schema[key] # SI units for available value-type properties property_units = { @@ -85,6 +127,22 @@ 'stroke': 'meter', 'clearance': 'meter', 'compression-ratio': 'dimensionless', + 'equivalence-ratio': 'dimensionless', + 'laminar-burning-velocity': 'meter / second', + 'distance': 'meter', + 'flow-rate': 'kilogram / meter**2 / second', + 'residence-time': 'second', + 'reactor-volume': 'meter**3', + 'volumetric-flow-in-reference-state': 'meter**3 / second', + 'rate-coefficient': None, # units vary by reaction order; skip dimensional check + # Non-IDT experiment type properties + 'environment-temperature': 'kelvin', + 'global-heat-exchange-coefficient': 'watt / meter**2 / kelvin', + 'exchange-area': 'meter**2', + 'reactor-length': 'meter', + 'reactor-diameter': 'meter', + 'pressure-in-reference-state': 'pascal', + 'temperature-in-reference-state': 'kelvin', } @@ -128,7 +186,9 @@ def compare_name(given_name, family_name, question_name): # split names by , - . given_name = list(filter(None, re.split(r"[, \-.]+", given_name))) - num_family_names = len(list(filter(None, re.split("[, .]+", family_name)))) + # Split by spaces, commas, dots AND hyphens so compound family names like + # 'El-Din Habik' and 'del Mazo-Sevillano' are counted correctly. + num_family_names = len(list(filter(None, re.split(r"[, .\-]+", family_name)))) # split name in question by , - . name_split = list(filter(None, re.split(r"[, \-.]+", question_name))) @@ -163,12 +223,28 @@ def compare_name(given_name, family_name, question_name): else: family_name_compare = ' '.join(name_split[-num_family_names:]) - return given_name == first_name and family_name == family_name_compare + # Normalize hyphens to spaces for comparison so that compound family names + # like 'El-Din Habik' and 'del Mazo-Sevillano' match their tokenized forms. + family_name_norm = family_name.replace('-', ' ') + family_name_compare_norm = family_name_compare.replace('-', ' ') + + return given_name == first_name and family_name_norm == family_name_compare_norm class OurValidator(Validator): """Custom validator with rules for Quantities and references. """ + def __init__(self, *args, **kwargs): + # Wrap schema in UnvalidatedSchema to bypass cerberus 1.3's internal + # schema-of-schema validation, which fails because its SchemaValidator + # doesn't know about our custom _validate_isvalid_* rules. + from cerberus.schema import UnvalidatedSchema + if args and isinstance(args[0], dict): + args = (UnvalidatedSchema(args[0]),) + args[1:] + if 'schema' in kwargs and isinstance(kwargs['schema'], dict): + kwargs['schema'] = UnvalidatedSchema(kwargs['schema']) + super().__init__(*args, **kwargs) + def _validate_isvalid_t_range(self, isvalid_t_range, field, values): """Checks that the temperature ranges given for thermo data are valid Args: @@ -273,17 +349,30 @@ def _validate_isvalid_quantity(self, isvalid_quantity, field, value): {'isvalid_quantity': {'type': 'bool'}, 'field': {'type': 'str'}, 'value': {'type': 'list'}} """ - quantity = Q_(value[0]) - low_lim = 0.0 * units(property_units[field]) + # Metadata-only entry (e.g. ESD in common-properties without a value) + if isinstance(value[0], dict): + return + + val_str = _normalize_unit_str(value[0]) + quantity = Q_(val_str) + expected_units = property_units.get(field) + + if expected_units is None: + # No dimensional check (e.g. rate-coefficient: units vary by reaction order) + if quantity.magnitude <= 0: + self._error(field, 'value must be greater than 0.0') + return + + low_lim = 0.0 * units(expected_units) try: if quantity <= low_lim: self._error( - field, 'value must be greater than 0.0 {}'.format(property_units[field]), + field, 'value must be greater than 0.0 {}'.format(expected_units), ) except pint.DimensionalityError: self._error(field, 'incompatible units; should be consistent ' - 'with ' + property_units[field] + 'with ' + expected_units ) def _validate_isvalid_uncertainty(self, isvalid_uncertainty, field, value): @@ -304,15 +393,43 @@ def _validate_isvalid_uncertainty(self, isvalid_uncertainty, field, value): # This len check is necessary for reasons that aren't quite clear to me # Cerberus calls this validation method even when lists have only one element # and should therefore be validated only by isvalid_quantity - if len(value) > 1 and value[1]['uncertainty-type'] != 'relative': - if value[1].get('uncertainty') is not None: - self._validate_isvalid_quantity(True, field, [value[1]['uncertainty']]) + if len(value) > 1: + unc_dict = value[1] + + # Reject dicts that contain neither uncertainty fields nor + # evaluated-standard-deviation fields — an empty {} passes + # Cerberus schema validation (no keys are required) but would + # crash DataPoint.process_quantity() with a missing uncertainty-type error. + _uncertainty_keys = { + 'uncertainty-type', 'uncertainty', + 'upper-uncertainty', 'lower-uncertainty', 'uncertainty-sourcetype', + } + _eval_sd_keys = { + 'evaluated-standard-deviation', 'evaluated-standard-deviation-type', + 'evaluated-standard-deviation-sourcetype', 'evaluated-standard-deviation-method', + } + if not (unc_dict.keys() & _uncertainty_keys) and \ + not (unc_dict.keys() & _eval_sd_keys): + self._error( + field, + 'uncertainty dict must contain at least one uncertainty field ' + '(uncertainty-type, uncertainty, upper-uncertainty, lower-uncertainty) ' + 'or evaluated-standard-deviation field; got: {}'.format( + dict(unc_dict) or 'empty dict' + ) + ) + return - if value[1].get('upper-uncertainty') is not None: - self._validate_isvalid_quantity(True, field, [value[1]['upper-uncertainty']]) + unc_type = unc_dict.get('uncertainty-type') + if unc_type and unc_type != 'relative': + if unc_dict.get('uncertainty') is not None: + self._validate_isvalid_quantity(True, field, [unc_dict['uncertainty']]) - if value[1].get('lower-uncertainty') is not None: - self._validate_isvalid_quantity(True, field, [value[1]['lower-uncertainty']]) + if unc_dict.get('upper-uncertainty') is not None: + self._validate_isvalid_quantity(True, field, [unc_dict['upper-uncertainty']]) + + if unc_dict.get('lower-uncertainty') is not None: + self._validate_isvalid_quantity(True, field, [unc_dict['lower-uncertainty']]) def _validate_isvalid_reference(self, isvalid_reference, field, value): """Checks valid reference metadata using DOI (if present). @@ -356,22 +473,37 @@ def _validate_isvalid_reference(self, isvalid_reference, field, value): ref_volume = ref.get('volume') volume = value.get('volume') if ref_volume is None: - if volume is not None: - self._error(field, 'Volume was specified in the YAML but is not present in the ' - 'DOI reference.') + pass # CrossRef lacks volume info; accept whatever the file specifies else: - if volume is None or int(volume) != int(ref_volume): - self._error(field, 'volume should be {}'.format(ref_volume)) + try: + # CrossRef may return combined volumes like "110-111"; compare first number + ref_vol_int = int(str(ref_volume).split('-')[0].strip()) + file_vol_int = int(volume) if volume is not None else None + if file_vol_int is None or file_vol_int != ref_vol_int: + self._error(field, 'volume should be {}'.format(ref_volume)) + except (ValueError, TypeError): + pass # non-integer volume — skip check # Pages might not be in the reference ref_pages = ref.get('page') pages = value.get('pages') if ref_pages is None: - if pages is not None: - self._error(field, 'Pages were specified in the YAML but are not present in ' - 'the DOI reference.') + pass # CrossRef lacks pages info; accept whatever the file specifies else: - if pages is None or pages != ref_pages: + # CrossRef often returns only the start page (e.g. "1697") while the + # full range "1697-1702" is correct. Accept if the file pages start + # with the CrossRef start page or match exactly. + def _norm_pages(p): + return p.strip().replace('\u2013', '-').replace('--', '-') if p else p + ref_norm = _norm_pages(ref_pages) + file_norm = _norm_pages(pages) + pages_ok = ( + file_norm == ref_norm + or (file_norm or '').startswith(ref_norm + '-') + or (ref_norm or '').startswith((file_norm or '').split('-')[0] + '-') + or ref_norm == (file_norm or '').split('-')[0] + ) + if pages is None or not pages_ok: self._error(field, 'pages should be {}'.format(ref_pages)) # check that all authors present @@ -379,19 +511,26 @@ def _validate_isvalid_reference(self, isvalid_reference, field, value): author_names = [a['name'] for a in authors] for author in ref['author']: # find using family name + given_name = author.get('given', '') + family_name = author.get('family', '') + if not given_name and not family_name: + continue # skip institutional/anonymous authors author_match = next( (a for a in authors if - compare_name(author['given'], author['family'], a['name']) + compare_name(given_name, family_name, a['name']) ), None ) # error if missing author in given reference information if author_match is None: self._error(field, 'Missing author: ' + - ' '.join([author['given'], author['family']]) + ' '.join([given_name, family_name]).strip() ) else: - author_names.remove(author_match['name']) + try: + author_names.remove(author_match['name']) + except ValueError: + pass # already removed by a previous match (duplicate match) # validate ORCID if given orcid = author.get('ORCID') @@ -463,6 +602,7 @@ def _validate_isvalid_composition(self, isvalid_composition, field, value): {'isvalid_composition': {'type': 'bool'}, 'field': {'type': 'str'}, 'value': {'type': 'dict'}} """ + _concentration_kinds = {'mol/cm3', 'mol/m3', 'mol/L', 'mol/dm3'} sum_amount = 0.0 if value['kind'] in ['mass fraction', 'mole fraction']: low_lim = 0.0 @@ -472,9 +612,16 @@ def _validate_isvalid_composition(self, isvalid_composition, field, value): low_lim = 0.0 up_lim = 100.0 total_amount = 100.0 + elif value['kind'] in _concentration_kinds: + # Absolute concentrations — only check non-negative, no sum-to-1 requirement + for sp in value['species']: + if sp['amount'][0] < 0.0: + self._error(field, 'Species ' + sp['species-name'] + + ' concentration must be non-negative') + return else: - self._error(field, 'composition kind must be "mole percent", "mass fraction", or ' - '"mole fraction"') + self._error(field, 'composition kind must be "mole percent", "mass fraction", ' + '"mole fraction", or a concentration unit (mol/cm3, mol/m3, mol/L, mol/dm3)') return False for sp in value['species']: @@ -491,8 +638,8 @@ def _validate_isvalid_composition(self, isvalid_composition, field, value): value['kind'] + ' must be less than {:.1f}'.format(up_lim) ) - # Make sure mole/mass fraction sum to 1 - if not np.isclose(total_amount, sum_amount): + # Make sure mole/mass fraction sum to 1 (allow 2% tolerance for digitization rounding) + if not np.isclose(total_amount, sum_amount, rtol=0.0, atol=total_amount * 0.02): self._error(field, 'Species ' + value['kind'] + 's do not sum to {:.1f}: '.format(total_amount) + '{:f}'.format(sum_amount) diff --git a/setup.py b/setup.py index 6522f80..8fa8beb 100644 --- a/setup.py +++ b/setup.py @@ -20,12 +20,12 @@ long_description = readme + '\n\n' + changelog + '\n\n' + citation install_requires = [ - 'pyyaml>=3.12,<4.0', - 'cerberus>=1.0.0,<1.2', - 'pint>=0.7.2,<0.9', - 'numpy>=1.11.0,<2.0', + 'pyyaml>=3.12', + 'cerberus>=1.0.0,<2.0', + 'pint>=0.7.2', + 'numpy>=1.11.0', 'habanero>=0.6.0', - 'uncertainties>=3.0.1,<3.1', + 'uncertainties>=3.0.1', ] tests_require = [ @@ -34,7 +34,7 @@ ] extras_require = { - 'dataframes': ['pandas >=0.22.0,<0.23'], + 'dataframes': ['pandas>=0.22.0'], } needs_pytest = {'pytest', 'test', 'ptr'}.intersection(sys.argv) @@ -70,7 +70,7 @@ tests_require=tests_require, extras_require=extras_require, setup_requires=setup_requires, - python_requires='~=3.5', + python_requires='>=3.7', entry_points={ 'console_scripts': ['convert_ck=pyked.converters:main', 'respth2ck=pyked.converters:respth2ck',