From a7edf676f1fa6ce23e1b96736f345722b448620d Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 16 Dec 2025 23:43:28 +0100 Subject: [PATCH 1/6] Allow secondary files for RNTupleTempSource --- Configuration/Applications/python/ConfigBuilder.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Configuration/Applications/python/ConfigBuilder.py b/Configuration/Applications/python/ConfigBuilder.py index ef849484150b8..d4693da4791c5 100644 --- a/Configuration/Applications/python/ConfigBuilder.py +++ b/Configuration/Applications/python/ConfigBuilder.py @@ -440,7 +440,7 @@ def _datasetname_and_maxfiles(entry): self.process.source.fileNames.append(self._options.dirin+entry) if self._options.secondfilein: if not hasattr(self.process.source,"secondaryFileNames"): - raise Exception("--secondfilein not compatible with "+self._options.filetype+"input type") + raise Exception("--secondfilein not compatible with "+self._options.filetype+" input type") for entry in self._options.secondfilein.split(','): print("entry",entry) if entry.startswith("filelist:"): @@ -459,8 +459,8 @@ def _datasetname_and_maxfiles(entry): filesFromOption(self) if self._options.filetype == "EDM_RNTUPLE": self.process.source=cms.Source("RNTupleTempSource", - fileNames = cms.untracked.vstring())#, 2ndary not supported yet - #secondaryFileNames= cms.untracked.vstring()) + fileNames = cms.untracked.vstring(), + secondaryFileNames= cms.untracked.vstring()) filesFromOption(self) elif self._options.filetype == "DAT": self.process.source=cms.Source("NewEventStreamFileReader",fileNames = cms.untracked.vstring()) From 282d27418ebde7283d121eb4dd451740c69499e4 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 16 Dec 2025 23:59:23 +0100 Subject: [PATCH 2/6] Clarify file format exception message --- FWIO/RNTupleTempInput/src/RootRNTuple.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/FWIO/RNTupleTempInput/src/RootRNTuple.cc 
b/FWIO/RNTupleTempInput/src/RootRNTuple.cc index be2e496bbb9b7..6291969518b99 100644 --- a/FWIO/RNTupleTempInput/src/RootRNTuple.cc +++ b/FWIO/RNTupleTempInput/src/RootRNTuple.cc @@ -78,8 +78,8 @@ namespace edm::rntuple_temp { } if (not reader_) { throw cms::Exception("WrongFileFormat") - << "The ROOT file does not contain a TTree named " << productTreeName - << "\n This is either not an edm ROOT file or is one that has been corrupted."; + << "The ROOT file does not contain a RNTuple named " << productTreeName + << "\n This is either not an edm RNTuple ROOT file or is one that has been corrupted."; } entries_ = reader_->GetNEntries(); } From 7d31460f00990d1062b901f6a83c68bc8552b0af Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 17 Dec 2025 17:17:43 +0100 Subject: [PATCH 3/6] Avoid using RNTuple output for steps before the last step using --secondfilein --- .../python/WorkFlowRunner.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/Configuration/PyReleaseValidation/python/WorkFlowRunner.py b/Configuration/PyReleaseValidation/python/WorkFlowRunner.py index cd9e9d0e54c90..b5ec1620a9459 100644 --- a/Configuration/PyReleaseValidation/python/WorkFlowRunner.py +++ b/Configuration/PyReleaseValidation/python/WorkFlowRunner.py @@ -117,6 +117,21 @@ def run(self): def closeCmd(i,ID): return ' > %s 2>&1; ' % ('step%d_'%(i,)+ID+'.log ',) + # For --secondfilein the primary and secondary files must have + # the same format (TTree or RNTuple). For now find the last + # step that uses --secondfilein, and use TTree for all steps + # up to that step. Theoretically we could identify the exact + # steps that need TTree output in this case, but given the way + # --secondfilein is being used now, and the deployment plan + # for RNTuple for HL-LHC, that complexity does not seem worth it. 
+ lastStepWithSecondFileIn = None + if self.useRNTuple: + for (istepmone,com) in enumerate(self.wf.cmds): + # I don't know what to do in case com is something else + if isinstance(com, str): + if "--secondfilein" in com: + lastStepWithSecondFileIn = istepmone+1 + inFile=None lumiRangeFile=None aborted=False @@ -192,7 +207,8 @@ def closeCmd(i,ID): cmd += com - if self.useRNTuple: + if self.useRNTuple and not \ + (lastStepWithSecondFileIn is not None and istep < lastStepWithSecondFileIn): cmd+=' --rntuple_out' if self.startFrom: steps = cmd.split("-s ")[1].split(" ")[0] From d6fb48148e29a03344025f776f7ae2217c9eb9db Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 17 Dec 2025 18:00:03 +0100 Subject: [PATCH 4/6] Use the file extension of previous step properly This approach is still not fully general. But it is sufficient to handle the case where an earlier step produces a TTree file (for reasons), and a later step produces an RNTuple file. --- .../python/WorkFlowRunner.py | 33 ++++++++++++------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/Configuration/PyReleaseValidation/python/WorkFlowRunner.py b/Configuration/PyReleaseValidation/python/WorkFlowRunner.py index b5ec1620a9459..d83ee48fb0eac 100644 --- a/Configuration/PyReleaseValidation/python/WorkFlowRunner.py +++ b/Configuration/PyReleaseValidation/python/WorkFlowRunner.py @@ -60,24 +60,35 @@ def doCmd(self, cmd): return ret @staticmethod - def replace_filein_extensions(command_line, extension): + def replace_filein_extensions(command_line, outputExtensionForStep, defaultExtension): # Pattern to match --filein followed by file:file.ext entries (comma-separated) filein_pattern = re.compile( r'(--filein\s+)((?:file:[a-zA-Z0-9_]+\.[a-z]+(?:,\s*)?)*)' ) - # Inner pattern to match individual file entries - file_pattern = re.compile(r'file:([a-zA-Z0-9_]+)\.[a-z]+') + # Inner patterns to match individual file entries + # For stepN naming need to know the N + file_pattern_step = 
re.compile(r'file:step([1-9][0-9]*)(_[a-zA-Z]+)?\.[a-z]+') # N may have several digits (e.g. step10) + # Some ALCA steps use special file names without stepN, those + # are assumed to use the default extension + file_pattern_gen = re.compile(r'file:([a-zA-Z0-9_]+)\.[a-z]+') def replace_filein_match(filein_match): filein_prefix = filein_match.group(1) file_list_str = filein_match.group(2) # Replace extensions in the file list - new_file_list = file_pattern.sub( - lambda m: 'file:{0}{1}'.format(m.group(1), extension), - file_list_str - ) + m = file_pattern_step.search(file_list_str) + if m: + new_file_list = file_pattern_step.sub( + lambda m: 'file:step{0}{1}{2}'.format(m.group(1), m.group(2) or "", outputExtensionForStep[int(m.group(1))]), + file_list_str + ) + else: + new_file_list = file_pattern_gen.sub( + lambda m: 'file:{0}{1}'.format(m.group(1), defaultExtension), + file_list_str + ) return filein_prefix + new_file_list @@ -243,16 +254,16 @@ def closeCmd(i,ID): if istep!=1 and not '--filein' in cmd and not 'premix_stage1' in cmd and not ("--fast" in cmd and "premix_stage2" in cmd): steps = cmd.split("-s ")[1].split(" ")[0] ## relying on the syntax: cmsDriver -s STEPS --otherFlags if "ALCA" not in steps: - cmd+=' --filein file:step%s%s '%(istep-1,extension) + cmd+=' --filein file:step%s%s '%(istep-1,outputExtensionForStep[istep-1]) elif "ALCA" in steps and "RECO" in steps: - cmd+=' --filein file:step%s%s '%(istep-1,extension) + cmd+=' --filein file:step%s%s '%(istep-1,outputExtensionForStep[istep-1]) elif self.recoOutput: cmd+=' --filein %s'%(self.recoOutput) else: - cmd+=' --filein file:step%s%s '%(istep-1,extension) + cmd+=' --filein file:step%s%s '%(istep-1,outputExtensionForStep[istep-1]) elif istep!=1 and '--filein' in cmd and '--filetype' not in cmd: # make sure correct extension is being used - cmd = self.replace_filein_extensions(cmd, extension) + cmd = self.replace_filein_extensions(cmd, outputExtensionForStep, extension) if not '--fileout' in com: cmd+=' --fileout file:step%s%s '%(istep,extension) 
if "RECO" in cmd: From 5a0d3c34c1238441c84e6751152e850dcc7952ec Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 17 Dec 2025 21:39:56 +0100 Subject: [PATCH 5/6] Handle also --pileup_input file extension --- Configuration/PyReleaseValidation/python/WorkFlowRunner.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Configuration/PyReleaseValidation/python/WorkFlowRunner.py b/Configuration/PyReleaseValidation/python/WorkFlowRunner.py index d83ee48fb0eac..2925e6507afed 100644 --- a/Configuration/PyReleaseValidation/python/WorkFlowRunner.py +++ b/Configuration/PyReleaseValidation/python/WorkFlowRunner.py @@ -60,10 +60,10 @@ def doCmd(self, cmd): return ret @staticmethod - def replace_filein_extensions(command_line, outputExtensionForStep, defaultExtension): + def replace_filein_extensions(command_line, outputExtensionForStep, defaultExtension, fileOption='--filein'): # Pattern to match --filein followed by file:file.ext entries (comma-separated) filein_pattern = re.compile( - r'(--filein\s+)((?:file:[a-zA-Z0-9_]+\.[a-z]+(?:,\s*)?)*)' + r'('+fileOption+r'\s+)((?:file:[a-zA-Z0-9_]+\.[a-z]+(?:,\s*)?)*)' ) # Inner patterns to match individual file entries @@ -264,6 +264,9 @@ def closeCmd(i,ID): elif istep!=1 and '--filein' in cmd and '--filetype' not in cmd: # make sure correct extension is being used cmd = self.replace_filein_extensions(cmd, outputExtensionForStep, extension) + if '--pileup_input' in cmd and '--filetype' not in cmd: + # make sure correct extension is being used + cmd = self.replace_filein_extensions(cmd, outputExtensionForStep, extension, fileOption='--pileup_input') if not '--fileout' in com: cmd+=' --fileout file:step%s%s '%(istep,extension) if "RECO" in cmd: From d673b8277c7fdb30658dfaae544de1905ec0c8bd Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 17 Dec 2025 23:06:38 +0100 Subject: [PATCH 6/6] Override MixingModule input with EmbeddedRNTupleTempSource Generalize --- 
.../Applications/python/ConfigBuilder.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/Configuration/Applications/python/ConfigBuilder.py b/Configuration/Applications/python/ConfigBuilder.py index d4693da4791c5..83dc0655c033e 100644 --- a/Configuration/Applications/python/ConfigBuilder.py +++ b/Configuration/Applications/python/ConfigBuilder.py @@ -817,6 +817,8 @@ def addStandardSequences(self): mixingDict['F']=(filesFromList(self._options.pileup_input[9:]))[0] else: mixingDict['F']=self._options.pileup_input.split(',') + + self.customizeMixingModuleForRNTuple(mixingDict.get('F', []), 'mix') specialization=defineMixing(mixingDict) for command in specialization: self.executeAndRemember(command) @@ -872,6 +874,20 @@ def addStandardSequences(self): else: self._options.inputCommands='keep *_randomEngineStateProducer_*_*,' + def customizeMixingModuleForRNTuple(self, files, mixingModuleLabel): + # Do we want a command-line option as well to switch the input type? + # Naively the 'filetype' looks attractive, but it would + # couple the primary Source and the SecSource to the same + # file format, which is not strictly necessary + useRNTuple= len(files) > 0 and files[0].lower().endswith(".rntpl") + if useRNTuple: + rntupleSrc = cms.SecSource("EmbeddedRNTupleTempSource") # same type as in the commands recorded below + mixingModule = getattr(self.process, mixingModuleLabel) + rntupleSrc.update_(mixingModule.input.parameters_()) + mixingModule.input = rntupleSrc + self.additionalCommands.append('rntupleSrc = cms.SecSource("EmbeddedRNTupleTempSource")') + self.additionalCommands.append(f'rntupleSrc.update_(process.{mixingModuleLabel}.input.parameters_())') + self.additionalCommands.append(f'process.{mixingModuleLabel}.input = rntupleSrc') def completeInputCommand(self): if self._options.inputEventContent: @@ -1591,6 +1607,8 @@ def prepare_DATAMIX(self, stepSpec = None): theFiles= (filesFromList(self._options.pileup_input[9:]))[0] else: theFiles=self._options.pileup_input.split(',') + + 
self.customizeMixingModuleForRNTuple(theFiles, 'mixData') #print theFiles self.executeAndRemember( "process.mixData.input.fileNames = cms.untracked.vstring(%s)"%( theFiles ) )