Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 42 additions & 1 deletion Configuration/PyReleaseValidation/python/MatrixInjector.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,25 @@ def __init__(self,opt,mode='init',options=''):
if(opt.batchName):
self.batchName = '__'+opt.batchName+'-'+self.batchTime

####################################
# Checking and setting up GPU attributes
####################################
# Mandatory
self.RequiresGPU = opt.RequiresGPU
if self.RequiresGPU not in ('forbidden','optional','required'):
print('RequiresGPU must be forbidden, optional, required. Now, set to forbidden.')
self.RequiresGPU = 'forbidden'
if self.RequiresGPU == 'optional':
print('Optional GPU is turned off for RelVals. Now, changing it to forbidden')
self.RequiresGPU = 'forbidden'
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if self.RequiresGPU == 'optional':
print('Optional GPU is turned off for RelVals. Now, changing it to forbidden')
self.RequiresGPU = 'forbidden'

otherwise, doesn't this completely defeat the purpose of "optional" ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we would like "optional" to keep its real meaning in RelVals as well, this block can be removed. I see no issue with removing it, and I will do so in the next commit.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, if we don't want to use "optional" in the RelVals, we should simply not use it as an option when creating and submitting them.

Additionally, what I do not understand is how "optional" could be used for any other use case if it is always removed by runTheMatrix?

self.GPUMemoryMB = opt.GPUMemoryMB
self.CUDACapabilities = opt.CUDACapabilities.split(',')
self.CUDARuntime = opt.CUDARuntime
# optional
self.GPUName = opt.GPUName
self.CUDADriverVersion = opt.CUDADriverVersion
self.CUDARuntimeVersion = opt.CUDARuntimeVersion

# WMagent url
if not self.wmagent:
# Overwrite with env variable
Expand Down Expand Up @@ -180,8 +199,24 @@ def __init__(self,opt,mode='init',options=''):
"nowmIO": {},
"Multicore" : opt.nThreads, # this is the per-taskchain Multicore; it's the default assigned to a task if it has no value specified
"EventStreams": self.numberOfStreams,
"KeepOutput" : False
"KeepOutput" : False,
"RequiresGPU" : None,
"GPUParams": None
}
# Default GPU parameters for a task. These three keys are always present;
# their values are the GPU attributes stored on self from the options.
self.defaultGPUParams={
    "GPUMemoryMB": self.GPUMemoryMB,
    "CUDACapabilities": self.CUDACapabilities,
    "CUDARuntime": self.CUDARuntime
    }

# The remaining parameters are optional: each one is merged into the
# defaults only when a non-empty value was supplied.
self.dictGPUName={"GPUName": self.GPUName}
if self.GPUName: self.defaultGPUParams.update(self.dictGPUName)

self.dictCUDADriverVersion={"CUDADriverVersion": self.CUDADriverVersion}
if self.CUDADriverVersion: self.defaultGPUParams.update(self.dictCUDADriverVersion)

self.dictCUDARuntimeVersion={"CUDARuntimeVersion": self.CUDARuntimeVersion}
# Fixed: the receiver was misspelled as 'elf', which raised NameError
# whenever a CUDARuntimeVersion was actually provided.
if self.CUDARuntimeVersion: self.defaultGPUParams.update(self.dictCUDARuntimeVersion)

self.chainDicts={}

Expand Down Expand Up @@ -408,6 +443,9 @@ def prepare(self, mReader, directories, mode='init'):
if setPrimaryDs:
chainDict['nowmTasklist'][-1]['PrimaryDataset']=setPrimaryDs
nextHasDSInput=None
if 'GPU' in step and self.RequiresGPU == 'required':
chainDict['nowmTasklist'][-1]['RequiresGPU'] = self.RequiresGPU
chainDict['nowmTasklist'][-1]['GPUParams']=json.dumps(self.defaultGPUParams)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand this part.
Why not simply

Suggested change
if 'GPU' in step and self.RequiresGPU == 'required':
chainDict['nowmTasklist'][-1]['RequiresGPU'] = self.RequiresGPU
chainDict['nowmTasklist'][-1]['GPUParams']=json.dumps(self.defaultGPUParams)
if self.RequiresGPU != 'forbidden':
chainDict['nowmTasklist'][-1]['RequiresGPU'] = self.RequiresGPU
chainDict['nowmTasklist'][-1]['GPUParams']=json.dumps(self.defaultGPUParams)

(and similarly below) ?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, why is this nested so deeply inside all the checks, instead of simply being done for all steps?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I coded it this way following the discussion in dmwm/WMCore#10393 (comment), to be flexible at the task/step level. In a taskchain, for example, one can run GEN-SIM in a non-GPU environment while running HLT in a GPU environment.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMHO that is not going to be maintainable - we cannot add "GPU" in the name of all the steps that we want to (potentially) run on a GPU-equipped node.

For example, soon enough the HLT step of any Run-3 workflow will be able to run on GPUs; so it could make sense to submit jobs with --gpu optional, but I doubt we want to rename everything by adding GPU to its name.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here's the proposed update to the default values and help messages.

I didn't check if it runs !

Thanks very much @fwyzard
I've implemented all the suggestions, but I have left the check for "GPU" in the step name in place for now. We can open a discussion with @amaltaro.

else:
#not first step and no inputDS
chainDict['nowmTasklist'].append(copy.deepcopy(self.defaultTask))
Expand All @@ -420,6 +458,9 @@ def prepare(self, mReader, directories, mode='init'):
chainDict['nowmTasklist'][-1]['LumisPerJob']=splitForThisWf
if step in wmsplit:
chainDict['nowmTasklist'][-1]['LumisPerJob']=wmsplit[step]
if 'GPU' in step and self.RequiresGPU == 'required':
chainDict['nowmTasklist'][-1]['RequiresGPU'] = self.RequiresGPU
chainDict['nowmTasklist'][-1]['GPUParams']=json.dumps(self.defaultGPUParams)

# change LumisPerJob for Hadronizer steps.
if 'Hadronizer' in step:
Expand Down
90 changes: 80 additions & 10 deletions Configuration/PyReleaseValidation/scripts/runTheMatrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,26 +120,31 @@ def runSelected(opt):
dest='memoryOffset',
default=3000
)

parser.add_option('--addMemPerCore',
help='increase of memory per each n > 1 core: memory(n_core) = memoryOffset + (n_core-1) * memPerCore',
dest='memPerCore',
default=1500
)

parser.add_option('-j','--nproc',
help='number of processes. 0 Will use 4 processes, not execute anything but create the wfs',
dest='nProcs',
default=4
)

parser.add_option('-t','--nThreads',
help='number of threads per process to use in cmsRun.',
dest='nThreads',
default=1
)

parser.add_option('--nStreams',
help='number of streams to use in cmsRun.',
dest='nStreams',
default=0
)

parser.add_option('--numberEventsInLuminosityBlock',
help='number of events in a luminosity block',
dest='numberEventsInLuminosityBlock',
Expand All @@ -152,119 +157,142 @@ def runSelected(opt):
default=False,
action='store_true'
)

parser.add_option('-e','--extended',
help='Show details of workflows, used with --show',
dest='extended',
default=False,
action='store_true'
)

parser.add_option('-s','--selected',
help='Run a pre-defined selected matrix of wf. Deprecated, please use -l limited',
dest='restricted',
default=False,
action='store_true'
)

parser.add_option('-l','--list',
help='Coma separated list of workflow to be shown or ran. Possible keys are also '+str(predefinedSet.keys())+'. and wild card like muon, or mc',
help='Comma separated list of workflow to be shown or ran. Possible keys are also '+str(predefinedSet.keys())+'. and wild card like muon, or mc',
dest='testList',
default=None
)

parser.add_option('-r','--raw',
help='Temporary dump the .txt needed for prodAgent interface. To be discontinued soon. Argument must be the name of the set (standard, pileup,...)',
dest='raw'
)

parser.add_option('-i','--useInput',
help='Use recyling where available. Either all, or a coma separated list of wf number.',
help='Use recyling where available. Either all, or a comma separated list of wf number.',
dest='useInput',
default=None
)

parser.add_option('-w','--what',
help='Specify the set to be used. Argument must be the name of a set (standard, pileup,...) or multiple sets separated by commas (--what standard,pileup )',
dest='what',
default='all'
)

parser.add_option('--step1',
help='Used with --raw. Limit the production to step1',
dest='step1Only',
default=False
)

parser.add_option('--maxSteps',
help='Only run maximum on maxSteps. Used when we are only interested in first n steps.',
dest='maxSteps',
default=9999,
type="int"
)

parser.add_option('--fromScratch',
help='Coma separated list of wf to be run without recycling. all is not supported as default.',
help='Comma separated list of wf to be run without recycling. all is not supported as default.',
dest='fromScratch',
default=None
)

parser.add_option('--refRelease',
help='Allow to modify the recycling dataset version',
dest='refRel',
default=None
)

parser.add_option('--wmcontrol',
help='Create the workflows for injection to WMAgent. In the WORKING. -wmcontrol init will create the the workflows, -wmcontrol test will dryRun a test, -wmcontrol submit will submit to wmagent',
choices=['init','test','submit','force'],
dest='wmcontrol',
default=None,
)

parser.add_option('--revertDqmio',
help='When submitting workflows to wmcontrol, force DQM outout to use pool and not DQMIO',
choices=['yes','no'],
dest='revertDqmio',
default='no',
)

parser.add_option('--optionswm',
help='Specify a few things for wm injection',
default='',
dest='wmoptions')

parser.add_option('--keep',
help='allow to specify for which coma separated steps the output is needed',
help='allow to specify for which comma separated steps the output is needed',
default=None)

parser.add_option('--label',
help='allow to give a special label to the output dataset name',
default='')

parser.add_option('--command',
help='provide a way to add additional command to all of the cmsDriver commands in the matrix',
dest='command',
action='append',
default=None
)

parser.add_option('--apply',
help='allow to use the --command only for 1 coma separeated',
help='allow to use the --command only for 1 comma separeated',
dest='apply',
default=None)

parser.add_option('--workflow',
help='define a workflow to be created or altered from the matrix',
action='append',
dest='workflow',
default=None
)

parser.add_option('--dryRun',
help='do not run the wf at all',
action='store_true',
dest='dryRun',
default=False
)

parser.add_option('--testbed',
help='workflow injection to cmswebtest (you need dedicated rqmgr account)',
dest='testbed',
default=False,
action='store_true'
)

parser.add_option('--noCafVeto',
help='Run from any source, ignoring the CAF label',
dest='cafVeto',
default=True,
action='store_false'
)

parser.add_option('--overWrite',
help='Change the content of a step for another. List of pairs.',
dest='overWrite',
default=None
)

parser.add_option('--noRun',
help='Remove all run list selection from wfs',
dest='noRun',
Expand Down Expand Up @@ -294,6 +322,7 @@ def runSelected(opt):
dest='dasSites',
default='T2_CH_CERN',
action='store')

parser.add_option('--interactive',
help="Open the Matrix interactive shell",
action='store_true',
Expand All @@ -305,6 +334,47 @@ def runSelected(opt):
default=None,
action='store')

parser.add_option('--gpu',
help='Use GPU workflow setup if available',
dest='gpuEnable',
default=False,
action='store_true')

parser.add_option('--RequiresGPU',
help='if GPU is required or not: forbidden (default, CPU-only), optional, required. For relvals, the GPU option will be turned off for optional.',
dest='RequiresGPU',
default='forbidden')

parser.add_option('--GPUMemoryMB',
help='to specify GPU memory. Default = 8000 MB (for RequiresGPU = required).',
dest='GPUMemoryMB',
default=8000)

parser.add_option('--CUDACapabilities',
help='to specify CUDA capabilities. Default = 6.0,6.1,6.2,7.0,7.2,7.5 (for RequiresGPU = required). Use comma to identify various CUDACapabilities',
dest='CUDACapabilities',
default='6.0,6.1,6.2,7.0,7.2,7.5')

parser.add_option('--CUDARuntime',
help='to specify major and minor CUDA runtime used to build the application. Default = 11.2 (for RequiresGPU = required). FIX ME TO MATCH WITH CMSSW.',
dest='CUDARuntime',
default='11.2')

parser.add_option('--GPUName',
help='to specify GPU class. This is an optional parameter.',
dest='GPUName',
default='')

parser.add_option('--CUDADriverVersion',
help='to specify CUDA driver version. This is an optional parameter.',
dest='CUDADriverVersion',
default='')

parser.add_option('--CUDARuntimeVersion',
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll double check, but I don't think this should ever be set on the client side ?

help='to specify CUDA runtime version. This is an optional parameter.',
dest='CUDARuntimeVersion',
default='')

opt,args = parser.parse_args()
if opt.command: opt.command = ' '.join(opt.command)
os.environ["CMSSW_DAS_QUERY_SITES"]=opt.dasSites
Expand Down Expand Up @@ -346,8 +416,6 @@ def stepOrIndex(s):
if opt.keep:
opt.keep=map(stepOrIndex,opt.keep.split(','))



if opt.testList:
testList=[]
for entry in opt.testList.split(','):
Expand All @@ -372,9 +440,11 @@ def stepOrIndex(s):
if opt.nProcs: opt.nProcs=int(opt.nProcs)
if opt.nThreads: opt.nThreads=int(opt.nThreads)
if opt.nStreams: opt.nStreams=int(opt.nStreams)
if (opt.numberEventsInLuminosityBlock): opt.numberEventsInLuminosityBlock=int(opt.numberEventsInLuminosityBlock)
if (opt.memoryOffset): opt.memoryOffset=int(opt.memoryOffset)
if (opt.memPerCore): opt.memPerCore=int(opt.memPerCore)
if opt.numberEventsInLuminosityBlock: opt.numberEventsInLuminosityBlock=int(opt.numberEventsInLuminosityBlock)
if opt.memoryOffset: opt.memoryOffset=int(opt.memoryOffset)
if opt.memPerCore: opt.memPerCore=int(opt.memPerCore)
if opt.GPUMemoryMB: opt.GPUMemoryMB=int(opt.GPUMemoryMB)
if opt.gpuEnable: opt.RequiresGPU="required"

if opt.wmcontrol:
performInjectionOptionTest(opt)
Expand Down