pdidev · jmorice91 · Oct 1, 2025 · Oct 1, 2025 · Oct 1, 2025 · Oct 1, 2025
@@ -42,9 +42,9 @@ The possible values for the keys are as follow:
   can be replaced inside the `DATA_SECTION`.
 * `datasets`: a key-value map associating a PDI type to string keys.
   Each string is the name of a dataset to create in the file on first
-  access, with the type described in the value. The string key can also be 
+  access, with the type described in the value. The string key is 
   a regular expression (regex), and be used to define "generic keys",
-  that can be used in `DATA_IO_DESC` for the keyword dataset.
+  that can be used in `DATA_IO_DESC` for the keyword dataset. The regex uses the Modified ECMAScript regular expression grammar.
 * `collision_policy`: a string identifying a \ref COLLISION_POLICY
 * `deflate`: an integer value (from 0 to 9) defining the default deflate (GNU
   gzip) compression level to use for datasets created in this file.
@@ -114,7 +114,8 @@ The possible values for the keys are as follow:
   It defaults to selecting the whole data.
 * `dataset_selection`: a `SELECTION_DESC` specifying the selection of
   data in the file data to write or read.
-  This is only valid if the dataset is defined in the datasets.
+  This is only valid if the dataset is explicitly defined in the `datasets`
+  section.
 * `attributes`: a key-value map specifying the set of attributes to read from
   (respectively, write to) the file when the associated dataset is read
   (respectively, written).

@@ -0,0 +1,79 @@
+/*******************************************************************************
+ * Copyright (C) 2025 Commissariat a l'energie atomique et aux energies alternatives (CEA)
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * * Neither the name of CEA nor the names of its contributors may be used to
+ *   endorse or promote products derived from this software without specific
+ *   prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ ******************************************************************************/
+
+
+#ifndef DECL_HDF5_DATASET_EXPLICIT_TYPE_H_
+#define DECL_HDF5_DATASET_EXPLICIT_TYPE_H_
+
+// #include <hdf5.h>
+// #ifdef H5_HAVE_PARALLEL
+// #include <mpi.h>
+// #endif
+
+#include <regex>
+#include <string>
+
+#include <paraconf.h>
+
+#include <pdi/pdi_fwd.h>
+
+//#include <pdi/context.h>
+//#include <pdi/expression.h>
+
+namespace decl_hdf5 {
+
+/// Information about the types that should be used to create datasets as provided in the Yaml file
+struct Dataset_explicit_type {
+	std::string m_definition; ///< definition from the YAML as a string for debugging purpose
+	std::regex m_regex; ///< the parsed regex that determines if the provided type applies (depend only on m_definition and regex grammar)
+	int m_begin_line; ///< begin line number in the YAML for debugging purposes
+	int m_end_line; ///< end line number in the YAML for debugging purposes
+	PDI::Datatype_template_sptr m_type; ///< the type to use for the dataset in case the regex matches
+
+	Dataset_explicit_type() = default;
+
+	Dataset_explicit_type(std::string def, int b_line, int e_line, std::regex regex, PDI::Datatype_template_sptr type)
+		: m_definition(def)
+		, m_begin_line(b_line)
+		, m_end_line(e_line)
+		, m_regex(regex)
+		, m_type(type)
+	{}
+
+	/// function to get the line where the dataset is defined in Yaml file
+	std::string get_msg_err_line() const
+	{
+		std::string result;
+		if (m_begin_line == m_end_line) {
+			result = " defined in line " + std::to_string(m_begin_line + 1);
+		} else {
+			result = " defined in lines " + std::to_string(m_begin_line + 1) + " - " + std::to_string(m_end_line);
+		}
+		return result;
+	}
+};
+
+} // namespace decl_hdf5
+
+#endif // DECL_HDF5_DATASET_EXPLICIT_TYPE_H_
@@ -42,6 +42,7 @@
 #include <pdi/datatype.h>
 #include <pdi/datatype_template.h>
 #include <pdi/error.h>
+#include <pdi/fmt.h>
 #include <pdi/paraconf_wrapper.h>
 #include <pdi/ref_any.h>
 #include <pdi/scalar_datatype.h>
@@ -70,6 +71,8 @@ using PDI::Type_error;
 using PDI::Value_error;
 using std::dynamic_pointer_cast;
 using std::function;
+using std::pair;
+using std::regex;
 using std::string;
 using std::stringstream;
 using std::tie;
@@ -212,7 +215,7 @@ void Dataset_op::fletcher(Context& ctx, Expression value)
 	}
 }
 
-void Dataset_op::execute(Context& ctx, hid_t h5_file, bool use_mpio, const unordered_map<string, Datatype_template_sptr>& dsets)
+void Dataset_op::execute(Context& ctx, hid_t h5_file, bool use_mpio, const std::vector<Dataset_explicit_type>& dsets)
 {
 	Raii_hid xfer_lst = make_raii_hid(H5Pcreate(H5P_DATASET_XFER), H5Pclose);
 #ifdef H5_HAVE_PARALLEL
@@ -346,7 +349,7 @@ hid_t Dataset_op::dataset_creation_plist(Context& ctx, const Datatype* dataset_t
 	return dset_plist;
 }
 
-void Dataset_op::do_write(Context& ctx, hid_t h5_file, hid_t write_lst, const unordered_map<string, Datatype_template_sptr>& dsets)
+void Dataset_op::do_write(Context& ctx, hid_t h5_file, hid_t write_lst, const std::vector<Dataset_explicit_type>& dsets)
 {
 	string dataset_name = m_dataset.to_string(ctx);
 	ctx.logger().trace("Preparing for writing `{}' dataset", dataset_name);
@@ -364,65 +367,62 @@ void Dataset_op::do_write(Context& ctx, hid_t h5_file, hid_t write_lst, const un
 	Datatype_sptr dataset_type;
 	Raii_hid h5_file_type, h5_file_space;
 
-	int counter_dataset_found = 0;
+	bool dataset_found = false;
+	Dataset_explicit_type dset_found;
 	ctx.logger().trace("search `{}' in the list of datasets section", dataset_name);
 
-	for (auto&& dsets_elem: dsets) {
-		// create regex from string
-		std::regex dsets_elem_regex(dsets_elem.first);
-		// try if dataset_name is including in regex
-		if (std::regex_match(dataset_name, dsets_elem_regex)) {
-			counter_dataset_found++;
-			ctx.logger().trace(" `{}' match an element of datasets(defined as regex) with value := `{}'", dataset_name, dsets_elem.first);
-		}
-	}
+	for (auto&& dsets_elem = dsets.begin(); dsets_elem != dsets.end(); ++dsets_elem) {
+		if (std::regex_match(dataset_name, dsets_elem->m_regex)) {
+			if (!dataset_found) {
+				dataset_found = true;
+				ctx.logger().trace(" `{}' match an element of datasets(defined as regex) with value := `{}'", dataset_name, dsets_elem->m_definition);
+				dset_found = *dsets_elem;
+			} else {
+				// if we found an other element in the list of datasets, we can't choose the right dataset
+				// (if the elements found have different size, subsize, type, ...)
+				// send a error a message to the user
+				std::vector<string> list_dataset_found;
+				list_dataset_found.emplace_back(dset_found.m_definition + dset_found.get_msg_err_line());
+				list_dataset_found.emplace_back(dsets_elem->m_definition + dsets_elem->get_msg_err_line());
+				++dsets_elem; // get the next element in the iterator on dsets
+				// loop over the rest of the elements in the iterator on dsets
+				for (dsets_elem; dsets_elem != dsets.end(); ++dsets_elem) {
+					if (std::regex_match(dataset_name, dsets_elem->m_regex)) {
+						list_dataset_found.emplace_back(dsets_elem->m_definition + dsets_elem->get_msg_err_line());
+					}
+				}
 
-	ctx.logger().trace("Found `{}' match(s) in the list of datasets section for `{}'", counter_dataset_found, dataset_name);
-
-	if (counter_dataset_found > 1) {
-		// if we found two or more element in the list of datasets, we can't choose the right dataset (if the elements found have different size, subsize, type, ...)
-		// send a error a message to the user
-		std::stringstream msg_dataset_found;
-		msg_dataset_found << "\nThe elements that match " << dataset_name << " are:" << std::endl;
-		for (auto&& dsets_elem: dsets) {
-			// create regex from string
-			std::regex dsets_elem_regex(dsets_elem.first);
-			// try if dataset_name is including in regex
-			if (std::regex_match(dataset_name, dsets_elem_regex)) {
-				msg_dataset_found << " - " << dsets_elem.first << std::endl;
+				// Remark: message error is defined outside Config_error because is too long.
+				const char* msg_config_error
+					= "Found `{0}' match(es) in the list of datasets section for `{1}'."
+					  " Cannot choose the right element in datasets.\n"
+					  "The elements that match `{1}' are:\n"
+					  " - {2}\n"
+					  "Attention: The elements are considered as a regex.";
+
+				throw Config_error{
+					m_dataset_selection.selection_tree(),
+					msg_config_error,
+					list_dataset_found.size(),
+					dataset_name,
+					fmt::join(list_dataset_found, "\n - ")
+				};
 			}
 		}
-		msg_dataset_found << "Attention: The elements are considered as a regex.";
-
-		throw Config_error{
-			m_dataset_selection.selection_tree(),
-			"Found `{}' match(s) in the list of datasets section for `{}'. Cannot choose the right element in datasets.{}",
-			counter_dataset_found,
-			dataset_name,
-			msg_dataset_found.str()
-		};
 	}
 
-	if (counter_dataset_found == 1) {
-		for (auto&& dataset_type_iter_regex = dsets.begin(); dataset_type_iter_regex != dsets.end(); ++dataset_type_iter_regex) {
-			std::regex dsets_elem_regex(dataset_type_iter_regex->first);
-			if (std::regex_match(dataset_name, dsets_elem_regex)) {
-				// we found the dataset
-				ctx.logger().trace("Get the regex in the list of datasets section := `{}'", dataset_type_iter_regex->first);
-				dataset_type = dataset_type_iter_regex->second->evaluate(ctx);
-				tie(h5_file_space, h5_file_type) = space(dataset_type);
-				ctx.logger().trace("Applying `{}' dataset selection", dataset_name);
-				m_dataset_selection.apply(ctx, h5_file_space, h5_mem_space);
-				break; // stop the "for" loop
-			}
-		}
+	if (dataset_found) {
+		ctx.logger().trace("Get the regex in the list of datasets section := `{}'", dset_found.m_definition);
+		dataset_type = dset_found.m_type->evaluate(ctx);
+		tie(h5_file_space, h5_file_type) = space(dataset_type);
+		ctx.logger().trace("Applying `{}' dataset selection", dataset_name);
+		m_dataset_selection.apply(ctx, h5_file_space, h5_mem_space);
 	} else {
 		if (!m_dataset_selection.size().empty()) {
-			throw Config_error{m_dataset_selection.selection_tree(), "Dataset selection is invalid in implicit dataset `{}'", dataset_name};
-		} else {
-			dataset_type = ref.type();
-			tie(h5_file_space, h5_file_type) = space(dataset_type, true);
+			throw Config_error{m_dataset_selection.selection_tree(), "Dataset selection is invalid for implicit dataset `{}'", dataset_name};
 		}
+		dataset_type = ref.type();
+		tie(h5_file_space, h5_file_type) = space(dataset_type, true);
 	}
 
 	ctx.logger().trace("Validating `{}' dataset dataspaces selection", dataset_name);

@@ -1,5 +1,5 @@
 /*******************************************************************************
- * Copyright (C) 2015-2024 Commissariat a l'energie atomique et aux energies alternatives (CEA)
+ * Copyright (C) 2015-2025 Commissariat a l'energie atomique et aux energies alternatives (CEA)
  * Copyright (C) 2021-2022 Institute of Bioorganic Chemistry Polish Academy of Science (PSNC)
  * All rights reserved.
  *
@@ -32,6 +32,7 @@
 #include <mpi.h>
 #endif
 
+#include <regex>
 #include <string>
 #include <unordered_map>
 
@@ -43,6 +44,7 @@
 
 #include "attribute_op.h"
 #include "collision_policy.h"
+#include "dataset_explicit_type.h"
 #include "selection.h"
 
 namespace decl_hdf5 {
@@ -186,14 +188,14 @@ class Dataset_op
 	 * \param ctx the context in which to operate
 	 * \param h5_file the already opened HDF5 file id
 	 * \param use_mpio whether the hdf5 read/write is parallel
-	 * \param dsets the type of the explicitly typed datasets
+	 * \param dsets the vector of the explicitly typed datasets defined in Yaml file.
 	 */
-	void execute(PDI::Context& ctx, hid_t h5_file, bool use_mpio, const std::unordered_map<std::string, PDI::Datatype_template_sptr>& dsets);
+	void execute(PDI::Context& ctx, hid_t h5_file, bool use_mpio, const std::vector<Dataset_explicit_type>& dsets);
 
 private:
 	void do_read(PDI::Context& ctx, hid_t h5_file, hid_t read_lst);
 
-	void do_write(PDI::Context& ctx, hid_t h5_file, hid_t xfer_lst, const std::unordered_map<std::string, PDI::Datatype_template_sptr>& dsets);
+	void do_write(PDI::Context& ctx, hid_t h5_file, hid_t xfer_lst, const std::vector<Dataset_explicit_type>& dsets);
 };
 
 } // namespace decl_hdf5

@@ -1,5 +1,5 @@
 /*******************************************************************************
- * Copyright (C) 2015-2024 Commissariat a l'energie atomique et aux energies alternatives (CEA)
+ * Copyright (C) 2015-2025 Commissariat a l'energie atomique et aux energies alternatives (CEA)
  * Copyright (C) 2021-2022 Institute of Bioorganic Chemistry Polish Academy of Science (PSNC)
  * All rights reserved.
  *
@@ -89,7 +89,19 @@ vector<File_op> File_op::parse(Context& ctx, PC_tree_t tree)
 #endif
 		} else if (key == "datasets") {
 			each(value, [&](PC_tree_t dset_name, PC_tree_t dset_type) {
-				template_op.m_datasets.emplace(to_string(dset_name), ctx.datatype(dset_type));
+				std::string dset_name_value = to_string(dset_name);
+				if (!dset_type.node || !dset_name.node) {
+					// the line numbers below are read through both nodes, so a missing node is reported as a configuration error
+					throw Config_error{key_tree, "Error in the definition of dataset `{}' in datasets section.", dset_name_value};
+				}
+				std::regex dset_regex(dset_name_value, std::regex::ECMAScript);
+				template_op.m_datasets.emplace_back(
+					dset_name_value,
+					dset_name.node->start_mark.line,
+					dset_type.node->end_mark.line,
+					dset_regex,
+					ctx.datatype(dset_type)
+				);
 			});
 		} else if (key == "deflate") {
 			deflate = value;
@@ -230,7 +242,7 @@ File_op::File_op(const File_op& other)
 	, m_dset_size_ops{other.m_dset_size_ops}
 {
 	for (auto&& dataset: other.m_datasets) {
-		m_datasets.emplace(dataset.first, dataset.second);
+		m_datasets.emplace_back(dataset);
 	}
 }
 

@@ -1,5 +1,5 @@
 /*******************************************************************************
- * Copyright (C) 2015-2021 Commissariat a l'energie atomique et aux energies alternatives (CEA)
+ * Copyright (C) 2015-2025 Commissariat a l'energie atomique et aux energies alternatives (CEA)
  * Copyright (C) 2021 Institute of Bioorganic Chemistry Polish Academy of Science (PSNC)
  * All rights reserved.
  *
@@ -32,6 +32,7 @@
 #include <mpi.h>
 #endif
 
+//#include <regex>
 #include <string>
 #include <unordered_map>
 #include <vector>
@@ -43,6 +44,7 @@
 
 #include "attribute_op.h"
 #include "collision_policy.h"
+#include "dataset_explicit_type.h"
 #include "dataset_op.h"
 
 namespace decl_hdf5 {
@@ -66,8 +68,8 @@ class File_op
 	PDI::Expression m_communicator;
 #endif
 
-	/// type of the datasets for which an explicit type is specified
-	std::unordered_map<std::string, PDI::Datatype_template_sptr> m_datasets;
+	/// type information for the datasets for which an explicit type is specified
+	std::vector<Dataset_explicit_type> m_datasets;
 
 	/// the dataset operations
 	std::vector<Dataset_op> m_dset_ops;