-
Notifications
You must be signed in to change notification settings - Fork 4.6k
Add GPU workflow to runTheMatrix #35263
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
cc07c62
7d8f062
5cec229
282edcf
ce3f561
b7862c5
7f17b26
6135782
1734e7f
6707ba0
bfdc428
de6a246
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -65,6 +65,25 @@ def __init__(self,opt,mode='init',options=''): | |||||||||||||
| if(opt.batchName): | ||||||||||||||
| self.batchName = '__'+opt.batchName+'-'+self.batchTime | ||||||||||||||
|
|
||||||||||||||
| #################################### | ||||||||||||||
| # Checking and setting up GPU attributes | ||||||||||||||
| #################################### | ||||||||||||||
| # Mandatory | ||||||||||||||
| self.RequiresGPU = opt.RequiresGPU | ||||||||||||||
| if self.RequiresGPU not in ('forbidden','optional','required'): | ||||||||||||||
| print('RequiresGPU must be forbidden, optional, required. Now, set to forbidden.') | ||||||||||||||
| self.RequiresGPU = 'forbidden' | ||||||||||||||
| if self.RequiresGPU == 'optional': | ||||||||||||||
| print('Optional GPU is turned off for RelVals. Now, changing it to forbidden') | ||||||||||||||
| self.RequiresGPU = 'forbidden' | ||||||||||||||
| self.GPUMemoryMB = opt.GPUMemoryMB | ||||||||||||||
| self.CUDACapabilities = opt.CUDACapabilities.split(',') | ||||||||||||||
| self.CUDARuntime = opt.CUDARuntime | ||||||||||||||
| # optional | ||||||||||||||
| self.GPUName = opt.GPUName | ||||||||||||||
| self.CUDADriverVersion = opt.CUDADriverVersion | ||||||||||||||
| self.CUDARuntimeVersion = opt.CUDARuntimeVersion | ||||||||||||||
|
|
||||||||||||||
| # WMagent url | ||||||||||||||
| if not self.wmagent: | ||||||||||||||
| # Overwrite with env variable | ||||||||||||||
|
|
@@ -180,8 +199,24 @@ def __init__(self,opt,mode='init',options=''): | |||||||||||||
| "nowmIO": {}, | ||||||||||||||
| "Multicore" : opt.nThreads, # this is the per-taskchain Multicore; it's the default assigned to a task if it has no value specified | ||||||||||||||
| "EventStreams": self.numberOfStreams, | ||||||||||||||
| "KeepOutput" : False | ||||||||||||||
| "KeepOutput" : False, | ||||||||||||||
| "RequiresGPU" : None, | ||||||||||||||
| "GPUParams": None | ||||||||||||||
| } | ||||||||||||||
| self.defaultGPUParams={ | ||||||||||||||
| "GPUMemoryMB": self.GPUMemoryMB, | ||||||||||||||
| "CUDACapabilities": self.CUDACapabilities, | ||||||||||||||
| "CUDARuntime": self.CUDARuntime | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| self.dictGPUName={"GPUName": self.GPUName} | ||||||||||||||
| if self.GPUName: self.defaultGPUParams.update(self.dictGPUName) | ||||||||||||||
srimanob marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||||||||||
|
|
||||||||||||||
| self.dictCUDADriverVersion={"CUDADriverVersion": self.CUDADriverVersion} | ||||||||||||||
| if self.CUDADriverVersion: self.defaultGPUParams.update(self.dictCUDADriverVersion) | ||||||||||||||
|
|
||||||||||||||
| self.dictCUDARuntimeVersion={"CUDARuntimeVersion": self.CUDARuntimeVersion} | ||||||||||||||
| if self.CUDARuntimeVersion: self.defaultGPUParams.update(self.dictCUDARuntimeVersion) | ||||||||||||||
|
|
||||||||||||||
| self.chainDicts={} | ||||||||||||||
|
|
||||||||||||||
|
|
@@ -408,6 +443,9 @@ def prepare(self, mReader, directories, mode='init'): | |||||||||||||
| if setPrimaryDs: | ||||||||||||||
| chainDict['nowmTasklist'][-1]['PrimaryDataset']=setPrimaryDs | ||||||||||||||
| nextHasDSInput=None | ||||||||||||||
| if 'GPU' in step and self.RequiresGPU == 'required': | ||||||||||||||
| chainDict['nowmTasklist'][-1]['RequiresGPU'] = self.RequiresGPU | ||||||||||||||
| chainDict['nowmTasklist'][-1]['GPUParams']=json.dumps(self.defaultGPUParams) | ||||||||||||||
|
||||||||||||||
| if 'GPU' in step and self.RequiresGPU == 'required': | |
| chainDict['nowmTasklist'][-1]['RequiresGPU'] = self.RequiresGPU | |
| chainDict['nowmTasklist'][-1]['GPUParams']=json.dumps(self.defaultGPUParams) | |
| if self.RequiresGPU != 'forbidden': | |
| chainDict['nowmTasklist'][-1]['RequiresGPU'] = self.RequiresGPU | |
| chainDict['nowmTasklist'][-1]['GPUParams']=json.dumps(self.defaultGPUParams) |
(and similarly below) ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also, why so nested in all the checks, instead of simply being done for all steps ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I coded it this way following the discussion in dmwm/WMCore#10393 (comment), to stay flexible at the task/step level. In the case of a taskchain, one can run GEN-SIM in a non-GPU environment while running HLT in a GPU environment, for example.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
IMHO that is not going to be maintainable - we cannot add "GPU" in the name of all the steps that we want to (potentially) run on a GPU-equipped node.
For example, soon enough the HLT step of any Run-3 workflow will be able to run on GPUs; so it could make sense to submit jobs with --gpu optional, but I doubt we want to rename everything by adding GPU to its name.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -120,26 +120,31 @@ def runSelected(opt): | |
| dest='memoryOffset', | ||
| default=3000 | ||
| ) | ||
|
|
||
| parser.add_option('--addMemPerCore', | ||
| help='increase of memory per each n > 1 core: memory(n_core) = memoryOffset + (n_core-1) * memPerCore', | ||
| dest='memPerCore', | ||
| default=1500 | ||
| ) | ||
|
|
||
| parser.add_option('-j','--nproc', | ||
| help='number of processes. 0 Will use 4 processes, not execute anything but create the wfs', | ||
| dest='nProcs', | ||
| default=4 | ||
| ) | ||
|
|
||
| parser.add_option('-t','--nThreads', | ||
| help='number of threads per process to use in cmsRun.', | ||
| dest='nThreads', | ||
| default=1 | ||
| ) | ||
|
|
||
| parser.add_option('--nStreams', | ||
| help='number of streams to use in cmsRun.', | ||
| dest='nStreams', | ||
| default=0 | ||
| ) | ||
|
|
||
| parser.add_option('--numberEventsInLuminosityBlock', | ||
| help='number of events in a luminosity block', | ||
| dest='numberEventsInLuminosityBlock', | ||
|
|
@@ -152,119 +157,142 @@ def runSelected(opt): | |
| default=False, | ||
| action='store_true' | ||
| ) | ||
|
|
||
| parser.add_option('-e','--extended', | ||
| help='Show details of workflows, used with --show', | ||
| dest='extended', | ||
| default=False, | ||
| action='store_true' | ||
| ) | ||
|
|
||
| parser.add_option('-s','--selected', | ||
| help='Run a pre-defined selected matrix of wf. Deprecated, please use -l limited', | ||
| dest='restricted', | ||
| default=False, | ||
| action='store_true' | ||
| ) | ||
|
|
||
| parser.add_option('-l','--list', | ||
| help='Coma separated list of workflow to be shown or ran. Possible keys are also '+str(predefinedSet.keys())+'. and wild card like muon, or mc', | ||
| help='Comma separated list of workflow to be shown or ran. Possible keys are also '+str(predefinedSet.keys())+'. and wild card like muon, or mc', | ||
| dest='testList', | ||
| default=None | ||
| ) | ||
|
|
||
| parser.add_option('-r','--raw', | ||
| help='Temporary dump the .txt needed for prodAgent interface. To be discontinued soon. Argument must be the name of the set (standard, pileup,...)', | ||
| dest='raw' | ||
| ) | ||
|
|
||
| parser.add_option('-i','--useInput', | ||
| help='Use recyling where available. Either all, or a coma separated list of wf number.', | ||
| help='Use recyling where available. Either all, or a comma separated list of wf number.', | ||
| dest='useInput', | ||
| default=None | ||
| ) | ||
|
|
||
| parser.add_option('-w','--what', | ||
| help='Specify the set to be used. Argument must be the name of a set (standard, pileup,...) or multiple sets separated by commas (--what standard,pileup )', | ||
| dest='what', | ||
| default='all' | ||
| ) | ||
|
|
||
| parser.add_option('--step1', | ||
| help='Used with --raw. Limit the production to step1', | ||
| dest='step1Only', | ||
| default=False | ||
| ) | ||
|
|
||
| parser.add_option('--maxSteps', | ||
| help='Only run maximum on maxSteps. Used when we are only interested in first n steps.', | ||
| dest='maxSteps', | ||
| default=9999, | ||
| type="int" | ||
| ) | ||
|
|
||
| parser.add_option('--fromScratch', | ||
| help='Coma separated list of wf to be run without recycling. all is not supported as default.', | ||
| help='Comma separated list of wf to be run without recycling. all is not supported as default.', | ||
| dest='fromScratch', | ||
| default=None | ||
| ) | ||
|
|
||
| parser.add_option('--refRelease', | ||
| help='Allow to modify the recycling dataset version', | ||
| dest='refRel', | ||
| default=None | ||
| ) | ||
|
|
||
| parser.add_option('--wmcontrol', | ||
| help='Create the workflows for injection to WMAgent. In the WORKING. -wmcontrol init will create the the workflows, -wmcontrol test will dryRun a test, -wmcontrol submit will submit to wmagent', | ||
| choices=['init','test','submit','force'], | ||
| dest='wmcontrol', | ||
| default=None, | ||
| ) | ||
|
|
||
| parser.add_option('--revertDqmio', | ||
| help='When submitting workflows to wmcontrol, force DQM outout to use pool and not DQMIO', | ||
| choices=['yes','no'], | ||
| dest='revertDqmio', | ||
| default='no', | ||
| ) | ||
|
|
||
| parser.add_option('--optionswm', | ||
| help='Specify a few things for wm injection', | ||
| default='', | ||
| dest='wmoptions') | ||
|
|
||
| parser.add_option('--keep', | ||
| help='allow to specify for which coma separated steps the output is needed', | ||
| help='allow to specify for which comma separated steps the output is needed', | ||
| default=None) | ||
|
|
||
| parser.add_option('--label', | ||
| help='allow to give a special label to the output dataset name', | ||
| default='') | ||
|
|
||
| parser.add_option('--command', | ||
| help='provide a way to add additional command to all of the cmsDriver commands in the matrix', | ||
| dest='command', | ||
| action='append', | ||
| default=None | ||
| ) | ||
|
|
||
| parser.add_option('--apply', | ||
| help='allow to use the --command only for 1 coma separeated', | ||
| help='allow to use the --command only for 1 comma separeated', | ||
| dest='apply', | ||
| default=None) | ||
|
|
||
| parser.add_option('--workflow', | ||
| help='define a workflow to be created or altered from the matrix', | ||
| action='append', | ||
| dest='workflow', | ||
| default=None | ||
| ) | ||
|
|
||
| parser.add_option('--dryRun', | ||
| help='do not run the wf at all', | ||
| action='store_true', | ||
| dest='dryRun', | ||
| default=False | ||
| ) | ||
|
|
||
| parser.add_option('--testbed', | ||
| help='workflow injection to cmswebtest (you need dedicated rqmgr account)', | ||
| dest='testbed', | ||
| default=False, | ||
| action='store_true' | ||
| ) | ||
|
|
||
| parser.add_option('--noCafVeto', | ||
| help='Run from any source, ignoring the CAF label', | ||
| dest='cafVeto', | ||
| default=True, | ||
| action='store_false' | ||
| ) | ||
|
|
||
| parser.add_option('--overWrite', | ||
| help='Change the content of a step for another. List of pairs.', | ||
| dest='overWrite', | ||
| default=None | ||
| ) | ||
|
|
||
| parser.add_option('--noRun', | ||
| help='Remove all run list selection from wfs', | ||
| dest='noRun', | ||
|
|
@@ -294,6 +322,7 @@ def runSelected(opt): | |
| dest='dasSites', | ||
| default='T2_CH_CERN', | ||
| action='store') | ||
|
|
||
| parser.add_option('--interactive', | ||
| help="Open the Matrix interactive shell", | ||
| action='store_true', | ||
|
|
@@ -305,6 +334,47 @@ def runSelected(opt): | |
| default=None, | ||
| action='store') | ||
|
|
||
| parser.add_option('--gpu', | ||
| help='Use GPU workflow setup if available', | ||
| dest='gpuEnable', | ||
| default=False, | ||
| action='store_true') | ||
|
|
||
| parser.add_option('--RequiresGPU', | ||
srimanob marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| help='if GPU is required or not: forbidden (default, CPU-only), optional, required. For relvals, the GPU option will be turned off for optional.', | ||
| dest='RequiresGPU', | ||
| default='forbidden') | ||
|
|
||
| parser.add_option('--GPUMemoryMB', | ||
| help='to specify GPU memory. Default = 8000 MB (for RequiresGPU = required).', | ||
| dest='GPUMemoryMB', | ||
| default=8000) | ||
|
|
||
| parser.add_option('--CUDACapabilities', | ||
| help='to specify CUDA capabilities. Default = 6.0,6.1,6.2,7.0,7.2,7.5 (for RequiresGPU = required). Use comma to identify various CUDACapabilities', | ||
| dest='CUDACapabilities', | ||
| default='6.0,6.1,6.2,7.0,7.2,7.5') | ||
|
|
||
| parser.add_option('--CUDARuntime', | ||
| help='to specify major and minor CUDA runtime used to build the application. Default = 11.2 (for RequiresGPU = required). FIX ME TO MATCH WITH CMSSW.', | ||
| dest='CUDARuntime', | ||
| default='11.2') | ||
|
|
||
| parser.add_option('--GPUName', | ||
| help='to specify GPU class. This is an optional parameter.', | ||
| dest='GPUName', | ||
| default='') | ||
|
|
||
| parser.add_option('--CUDADriverVersion', | ||
| help='to specify CUDA driver version. This is an optional parameter.', | ||
| dest='CUDADriverVersion', | ||
| default='') | ||
|
|
||
| parser.add_option('--CUDARuntimeVersion', | ||
|
||
| help='to specify CUDA runtime version. This is an optional parameter.', | ||
| dest='CUDARuntimeVersion', | ||
| default='') | ||
|
|
||
| opt,args = parser.parse_args() | ||
| if opt.command: opt.command = ' '.join(opt.command) | ||
| os.environ["CMSSW_DAS_QUERY_SITES"]=opt.dasSites | ||
|
|
@@ -346,8 +416,6 @@ def stepOrIndex(s): | |
| if opt.keep: | ||
| opt.keep=map(stepOrIndex,opt.keep.split(',')) | ||
|
|
||
|
|
||
|
|
||
| if opt.testList: | ||
| testList=[] | ||
| for entry in opt.testList.split(','): | ||
|
|
@@ -372,9 +440,11 @@ def stepOrIndex(s): | |
| if opt.nProcs: opt.nProcs=int(opt.nProcs) | ||
| if opt.nThreads: opt.nThreads=int(opt.nThreads) | ||
| if opt.nStreams: opt.nStreams=int(opt.nStreams) | ||
| if (opt.numberEventsInLuminosityBlock): opt.numberEventsInLuminosityBlock=int(opt.numberEventsInLuminosityBlock) | ||
| if (opt.memoryOffset): opt.memoryOffset=int(opt.memoryOffset) | ||
| if (opt.memPerCore): opt.memPerCore=int(opt.memPerCore) | ||
| if opt.numberEventsInLuminosityBlock: opt.numberEventsInLuminosityBlock=int(opt.numberEventsInLuminosityBlock) | ||
| if opt.memoryOffset: opt.memoryOffset=int(opt.memoryOffset) | ||
| if opt.memPerCore: opt.memPerCore=int(opt.memPerCore) | ||
| if opt.GPUMemoryMB: opt.GPUMemoryMB=int(opt.GPUMemoryMB) | ||
srimanob marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| if opt.gpuEnable: opt.RequiresGPU="required" | ||
|
|
||
| if opt.wmcontrol: | ||
| performInjectionOptionTest(opt) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
otherwise, doesn't this completely defeat the purpose of "optional" ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we would like "optional" to keep its real meaning in relvals as well, this can be removed. There is no issue with removing it; I will include that in the next commit.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Well, if we don't want to use "optional" in the RelVals, we should simply not use it as an option when creating and submitting them.
Additionally, what I do not understand is how "optional" could be used for any other use case if it is always removed by
runTheMatrix?