diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 0000000..e496c08 --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,22 @@ +name: SAMADhi + +on: [push] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + matrix: + python: [3.7, 3.8, 3.9] + + steps: + - uses: actions/checkout@v2 + - name: Setup Python + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python }} + - name: Install tox and SAMADhi + run: pip install tox + - name: Run tox + run: tox -e py diff --git a/.gitignore b/.gitignore index 93e2cbf..b65e671 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ python/__init__.py dist build eggs +.eggs parts bin var @@ -26,6 +27,7 @@ pip-log.txt .coverage .tox nosetests.xml +.pytest_cache # Translations *.mo diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..a8fe8fd --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,85 @@ +exclude: '(^tests/data/|^html|^data)' +repos: + +- repo: https://github.com/psf/black + rev: 21.9b0 + hooks: + - id: black + language_version: python3 + +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.0.1 + hooks: + - id: check-added-large-files + - id: check-case-conflict + - id: check-merge-conflict + - id: trailing-whitespace + exclude: '^ext/jetclasses.patch$' + - id: end-of-file-fixer + - id: mixed-line-ending + - id: check-yaml + - id: check-ast + - id: fix-byte-order-marker + #- id: check-builtin-literals + - id: check-toml + - id: debug-statements + +- repo: https://github.com/pre-commit/pygrep-hooks + rev: v1.9.0 + hooks: + - id: python-check-blanket-noqa + - id: python-check-blanket-type-ignore + - id: python-no-log-warn + - id: python-no-eval + - id: python-use-type-annotations + - id: rst-backticks + - id: rst-directive-colons + - id: rst-inline-touching-normal + +- repo: https://github.com/PyCQA/isort + rev: 5.9.3 + hooks: + - id: isort + exclude: '^examples/df_nano.py$' + +- repo: https://github.com/asottile/pyupgrade + rev: v2.29.0 + hooks: + - id: pyupgrade + args: ["--py36-plus"] + +- repo: https://github.com/asottile/setup-cfg-fmt + rev: v1.18.0 + hooks: + - id: setup-cfg-fmt + +- repo: https://github.com/asottile/yesqa + rev: v1.2.3 + hooks: + - id: yesqa + exclude: docs/conf.py + additional_dependencies: &flake8_dependencies + - flake8-bugbear + - flake8-print + +- repo: https://github.com/pycqa/flake8 + rev: 3.9.2 + hooks: + - id: flake8 + exclude: docs/conf.py + additional_dependencies: *flake8_dependencies + args: ['--ignore=E501,W503'] + +- repo: local + hooks: + - id: disallow-caps + name: Disallow improper capitalization + language: pygrep + entry: PyBind|Numpy|Cmake|CCache|Github|PyTest + exclude: .pre-commit-config.yaml + +- repo: https://github.com/mgedmin/check-manifest + rev: "0.47" + hooks: + - id: check-manifest + stages: [manual] diff --git a/.travis.yml b/.travis.yml index 6b82c6f..164870d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,4 +5,5 @@ python: - "2.7" - "3.6" - "3.7" -script: python --version +install: pip install tox-travis +script: tox diff --git a/README.md b/README.md index a4f1b78..9ae00a8 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ Samādhi in Hinduism, Buddhism, Jainism, Sikhism and yogic schools is a higher l This project is to develop a database to keep track of samples used by our group for CMS data analysis, and of (groups of) analysis results. -A python interface is provided via the STORM package. 
+A python interface is provided via the [peewee](http://docs.peewee-orm.com/en/latest/) package. Setup inside a CMSSW project area: ``` @@ -21,13 +21,13 @@ source installdeps_cmssw.sh ## only on first use scram b ``` -Standalone setup on ingrid: +For standalone use the python interface can be installed with setuptools or pip, +e.g. in a [virtual environment](https://packaging.python.org/tutorials/installing-packages/#creating-virtual-environments) with +```bash +python -m venv samadhi_env +source samadhi_env/bin/activate +pip install git+https://github.com/cp3-llbb/SAMADhi.git ``` -source setup_standalone.sh ## in every new shell -``` -this will create an install tree and symlink if needed, and otherwise only set some environment variables. -The python installation used can be customized with the `--python` option (e.g. `--python=/nfs/soft/python/python-2.7.5-sl6_amd64_gcc44/bin/python` on `ingrid-ui2`), -and the install tree location can be set with the `--install` option. To start the xataface interface in a docker image: diff --git a/documentation/SAMADhi_examples.py b/documentation/SAMADhi_examples.py index 126b9cf..ec38642 100644 --- a/documentation/SAMADhi_examples.py +++ b/documentation/SAMADhi_examples.py @@ -1,61 +1,80 @@ -from cp3_llbb.SAMADhi import SAMADhi +from cp3_llbb.SAMADhi.SAMADhi import SAMADhiDB, Sample + # Example method to generate a dictionary relating PAT name and luminosity -# This version is optimized and only load the needed columns. +# This version is optimized and only load the needed columns. # We also do an implicit join between dataset and sample. -def getPATlumi(sampletype=u"mc"): # can be u"mc", u"data", u"%" - dbstore = SAMADhi.DbStore() - pattuples = dbstore.find(SAMADhi.Sample,SAMADhi.Dataset.dataset_id==SAMADhi.Sample.source_dataset_id, - (SAMADhi.Sample.sampletype==u"PAT") & (SAMADhi.Dataset.datatype.like(sampletype))) - luminosities = pattuples.values(SAMADhi.Sample.name, SAMADhi.Sample.luminosity) - dictionary = {} - for name,lumi in luminosities: - dictionary[name]=lumi - return dictionary +def getPATlumi(sampletype="mc"): # can be "mc", "data", "%" + with SAMADhiDB() as db: + return { + smp.name: smp.luminosity + for smp in Sample.select(Sample.name, Sample.luminosity).where( + (Sample.sampletype == "PAT") & (Sample.source_dataset.datatype % sampletype) + ) + } + # Example method to access a PAT based on the path and access results and dataset -def getPAT(path=u"%"): - dbstore = SAMADhi.DbStore() - pattuples = dbstore.find(SAMADhi.Sample,(SAMADhi.Sample.sampletype==u"PAT") & (SAMADhi.Sample.path.like(path))) - for pattuple in pattuples: - print pattuple - print "results obtained from that sample:" - for res in pattuple.results: - print res - print "source dataset:" - print pattuple.source_dataset +def getPAT(path="%"): + with SAMADhiDB() as db: + for pattuple in Sample.select().where((Sample.sampletype == "PAT") & (Sample.path % path)): + print(pattuple) + print("results obtained from that sample:") + for res in pattuple.results: + print(res) + print("source dataset:") + print(pattuple.source_dataset) + # Example to access the weight of an event def getWeights(dataset, run, event): - dbstore = SAMADhi.DbStore() - event = dbstore.find(SAMADhi.Event,(SAMADhi.Event.run_number==run) & (SAMADhi.Event.event_number==event) & (SAMADhi.Event.dataset_id==dataset)) - theEvent = event.one() - for w in theEvent.weights: - print "weight for process %s (version %d): %g+/-%g"%(w.process.name,w.version,w.value,w.uncertainty) + dbstore = SAMADhi.DbStore() + event = 
dbstore.find( + SAMADhi.Event, + (SAMADhi.Event.run_number == run) + & (SAMADhi.Event.event_number == event) + & (SAMADhi.Event.dataset_id == dataset), + ) + theEvent = event.one() + for w in theEvent.weights: + print( + "weight for process %s (version %d): %g+/-%g" + % (w.process.name, w.version, w.value, w.uncertainty) + ) + # Get a single event weight # Note that I think that the getWeights above will be faster than n times this method. def getWeight(dataset, run, event, process, version=None): - dbstore = SAMADhi.DbStore() - weight = dbstore.find(SAMADhi.Weight, SAMADhi.Weight.event_id==SAMADhi.Event.event_id, - (SAMADhi.Event.run_number==run) & (SAMADhi.Event.event_number==event) & (SAMADhi.Event.dataset_id==dataset) & - (SAMADhi.Weight.madweight_process==process)) - if version is None: # take the most recent - w = weight.order_by(SAMADhi.Weight.version).last() - else: - w = weight.find(SAMADhi.Weight.version==version).one() - return (w.value, w.uncertainty) + dbstore = SAMADhi.DbStore() + weight = dbstore.find( + SAMADhi.Weight, + SAMADhi.Weight.event_id == SAMADhi.Event.event_id, + (SAMADhi.Event.run_number == run) + & (SAMADhi.Event.event_number == event) + & (SAMADhi.Event.dataset_id == dataset) + & (SAMADhi.Weight.madweight_process == process), + ) + if version is None: # take the most recent + w = weight.order_by(SAMADhi.Weight.version).last() + else: + w = weight.find(SAMADhi.Weight.version == version).one() + return (w.value, w.uncertainty) + # In the example above, you need the dataset id. It can be obtained this way # It could be combined in a complex query, but typically you will get this once # and avoid doing the joined query for every event. def dataset_id(dataset=None, pat=None): - dbstore = SAMADhi.DbStore() - if dataset is None and pat is not None: - dset = dbstore.find(SAMADhi.Dataset,SAMADhi.Dataset.dataset_id==SAMADhi.Sample.source_dataset_id,SAMADhi.Sample.name==pat) - elif dataset is not None and pat is None: - dset = dbstore.find(SAMADhi.Dataset,SAMADhi.Dataset.name==dataset) - else: - return 0 - return dset.one().dataset_id - + dbstore = SAMADhi.DbStore() + if dataset is None and pat is not None: + dset = dbstore.find( + SAMADhi.Dataset, + SAMADhi.Dataset.dataset_id == SAMADhi.Sample.source_dataset_id, + SAMADhi.Sample.name == pat, + ) + elif dataset is not None and pat is None: + dset = dbstore.find(SAMADhi.Dataset, SAMADhi.Dataset.name == dataset) + else: + return 0 + return dset.one().dataset_id diff --git a/installdeps_cmssw.sh b/installdeps_cmssw.sh index 977a17f..edc08d9 100644 --- a/installdeps_cmssw.sh +++ b/installdeps_cmssw.sh @@ -39,22 +39,24 @@ if [ $? 
-ne 0 ]; then python "${pipinstall}/get-pip.py" --prefix="${pipinstall}" --no-setuptools fi export PYTHONPATH="${pipinstall}/lib/python${pymajmin}/site-packages:${PYTHONPATH}" + python -m pip install --prefix="${pipinstall}" --upgrade Cython fi ## install dependencies installpath="${CMSSW_BASE}/install/samadhidep" -echo "--> Installing MySQL-python and storm" -python -m pip install --prefix="${installpath}" --ignore-installed --upgrade --upgrade-strategy=only-if-needed MySQL-python storm +echo "--> Installing peewee and pymysql" +NO_SQLITE=1 python -m pip install --prefix="${installpath}" --ignore-installed --upgrade peewee pymysql pytest pytest-console-scripts future # root_interface toolfile toolfile="${installpath}/samadhidep.xml" cat <"${toolfile}" - + + EOF_TOOLFILE diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..b5e6299 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,9 @@ +[build-system] +requires = ["setuptools", "wheel", "setuptools_scm[toml]>=6.0"] +build-backend = "setuptools.build_meta" + +[tool.setuptools_scm] + +[tool.black] +line-length = 100 +target-version = ['py37'] diff --git a/python/SAMADhi.py b/python/SAMADhi.py index 3d8b424..62fa2fd 100644 --- a/python/SAMADhi.py +++ b/python/SAMADhi.py @@ -1,304 +1,388 @@ -try: - from storm.locals import * -except ImportError as error: - raise ImportError("Could not import storm, please make sure to install the dependencies (source installdeps_cmssw.sh inside CMSSW, or SAMADhi/install_standalone.sh otherwise): {0}".format(error)) - -#db store connection - -def DbStore(credentials='~/.samadhi'): - """create a database object and returns the db store from STORM""" - +import datetime +import os +import warnings +from contextlib import contextmanager + +from peewee import * + +""" +Object representation of the SAMADhi database tables (based on peewee) + +Example: +>>> from cp3_llbb.SAMADhi.SAMADhi import * ## import models and SAMADhiDB +>>> with SAMADhiDB(): +>>> mySamples = Sample.select().where(Sample.author == "MYUSERNAME") +""" + +__all__ = ["loadCredentials", "SAMADhiDB"] ## models added below +_models = [] ## list for binding a database + +warnings.filterwarnings( + "ignore", module="peewee", category=UserWarning, message="Unable to determine MySQL version: .*" +) +if os.getenv("CMSSW_VERSION") is not None: + """Silence some warnings if inside CMSSW""" + for warnMod in ("pysqlite2.dbapi2", "peewee"): + warnings.filterwarnings( + "ignore", + module=warnMod, + category=DeprecationWarning, + message="Converters and adapters are deprecated. Please use only supported SQLite types. Any type mapping should happen in layer above this module.", + ) + + +def loadCredentials(path="~/.samadhi"): import json, os, stat - credentials = os.path.expanduser(credentials) - if not os.path.exists(credentials): - raise IOError('Credentials file %r not found.' % credentials) + credentials = os.path.expanduser(path) + if not os.path.exists(credentials): + raise OSError("Credentials file %r not found." % credentials) # Check permission mode = stat.S_IMODE(os.stat(credentials).st_mode) - if mode != int('400', 8): - raise IOError('Credentials file has wrong permission. Please execute \'chmod 400 %s\'' % credentials) + if mode != stat.S_IRUSR: + raise OSError( + "Credentials file has wrong permission. 
Please execute 'chmod 400 %s'" % credentials + ) - with open(credentials, 'r') as f: + with open(credentials) as f: data = json.load(f) - - login = data['login'] - password = data['password'] - hostname = data['hostname'] if 'hostname' in data else 'localhost' - database = data['database'] - - db_connection_string = "mysql://%s:%s@%s/%s" % (login, password, hostname, database) - return Store(create_database(db_connection_string)) - -#definition of the DB interface classes - -class Dataset(Storm): - """Table to represent one sample from DAS - on which we run the analysis""" - __storm_table__ = "dataset" - dataset_id = Int(primary=True) - name = Unicode() - nevents = Int() - dsize = Int() - process = Unicode() - xsection = Float() - cmssw_release = Unicode() - globaltag = Unicode() - datatype = Unicode() - user_comment = Unicode() - energy = Float() - creation_time = DateTime() - samples = ReferenceSet(dataset_id,"Sample.source_dataset_id") - - def __init__(self, name, datatype): - """Initialize a dataset by name and datatype. - Other attributes may be null and should be set separately""" - self.name = name - if datatype==u"mc" or datatype==u"data": - self.datatype = datatype - else: - raise ValueError('dataset type must be mc or data') - - def replaceBy(self, dataset): - """Replace one entry, but keep the same key""" - self.name = dataset.name - self.nevents = dataset.nevents - self.dsize = dataset.dsize - self.process = dataset.process - self.xsection = dataset.xsection - self.cmssw_release = dataset.cmssw_release - self.globaltag = dataset.globaltag - self.datatype = dataset.datatype - self.user_comment = dataset.user_comment - self.energy = dataset.energy - self.creation_time = dataset.creation_time - - def __str__(self): - result = "Dataset #%s:\n"%str(self.dataset_id) - result += " name: %s\n"%str(self.name) - result += " process: %s\n"%str(self.process) - result += " cross-section: %s\n"%str(self.xsection) - result += " number of events: %s\n"%str(self.nevents) - result += " size on disk: %s\n"%str(self.dsize) - result += " CMSSW release: %s\n"%str(self.cmssw_release) - result += " global tag: %s\n"%str(self.globaltag) - result += " type (data or mc): %s\n"%str(self.datatype) - result += " center-of-mass energy: %s TeV\n"%str(self.energy) - result += " creation time (on DAS): %s\n"%str(self.creation_time) - result += " comment: %s"%str(self.user_comment) - return result - -class Sample(Storm): - """Table to represent one processed sample, - typically a PATtupe, skim, RDS, CP, etc.""" - __storm_table__ = "sample" - sample_id = Int(primary=True) - name = Unicode() - path = Unicode() - sampletype = Unicode() - nevents_processed = Int() - nevents = Int() - normalization = Float() - event_weight_sum = Float() - extras_event_weight_sum = Unicode() # MEDIUMTEXT in MySQL - luminosity = Float() - processed_lumi = Unicode() # MEDIUMTEXT in MySQL - code_version = Unicode() - user_comment = Unicode() - author = Unicode() - creation_time = DateTime() - source_dataset_id = Int() - source_sample_id = Int() - source_dataset = Reference(source_dataset_id, "Dataset.dataset_id") - source_sample = Reference(source_sample_id, "Sample.sample_id") - derived_samples = ReferenceSet(sample_id,"Sample.source_sample_id") - results = ReferenceSet(sample_id,"SampleResult.sample_id","SampleResult.result_id","Result.result_id") - files = ReferenceSet(sample_id, "File.sample_id") - - SampleTypes = [ "PAT", "SKIM", "RDS", "LHCO", "NTUPLES", "HISTOS", "OTHER" ] - - def __init__(self, name, path, sampletype, 
nevents_processed): - """Initialize a dataset by name and datatype. - Other attributes may be null and should be set separately""" - self.name = name - self.path = path - self.nevents_processed = nevents_processed - if sampletype in self.SampleTypes: - self.sampletype = sampletype + if data.get("test", False): + if "database" not in data: + raise KeyError(f"Credentials json file at {credentials} does not contain 'database'") else: - raise ValueError('sample type %s is unkwown'%sampletype) - - def replaceBy(self, sample): - """Replace one entry, but keep the same key""" - self.name = sample.name - self.path = sample.path - self.sampletype = sample.sampletype - self.nevents_processed = sample.nevents_processed - self.nevents = sample.nevents - self.normalization = sample.normalization - self.event_weight_sum = sample.event_weight_sum - self.extras_event_weight_sum = sample.extras_event_weight_sum - self.luminosity = sample.luminosity - self.code_version = sample.code_version - self.user_comment = sample.user_comment - self.source_dataset_id = sample.source_dataset_id - self.source_sample_id = sample.source_sample_id - self.author = sample.author - self.creation_time = sample.creation_time - - def removeFiles(self, store): - store.find(File, File.sample_id == self.sample_id).remove() - self.files.clear() - - - def getLuminosity(self): - """Computes the sample (effective) luminosity""" - if self.luminosity is not None: - return self.luminosity - else: - if self.source_dataset is not None: - if self.source_dataset.datatype=="mc": - # for MC, it can be computed as Nevt/xsection - if self.nevents_processed is not None and self.source_dataset.xsection is not None: - return self.nevents_processed/self.source_dataset.xsection - else: - # for DATA, it can only be obtained from the parent sample - if self.source_sample is not None: - return self.source_sample.luminosity() - # in all other cases, it is impossible to compute a number. 
- return None - - def __str__(self): - result = "Sample #%s (created on %s by %s):\n"%(str(self.sample_id),str(self.creation_time),str(self.author)) - result += " name: %s\n"%str(self.name) - result += " path: %s\n"%str(self.path) - result += " type: %s\n"%str(self.sampletype) - result += " number of processed events: %s\n"%str(self.nevents_processed) - result += " number of events: %s\n"%str(self.nevents) - result += " normalization: %s\n"%str(self.normalization) - result += " sum of event weight: %s\n"%str(self.event_weight_sum) - if self.extras_event_weight_sum: - result += " has extras sum of event weight\n" - result += " (effective) luminosity: %s\n"%str(self.luminosity) - if self.processed_lumi: - result += " has processed luminosity sections information\n" - else: - result += " does not have processed luminosity sections information\n" - result += " code version: %s\n"%str(self.code_version) - result += " comment: %s\n"%str(self.user_comment) - result += " source dataset: %s\n"%str(self.source_dataset_id) - result += " source sample: %s\n"%str(self.source_sample_id) - if self.sample_id: - result += " %d files: \n" % (self.files.count()) - front_files = [] - last_file = None - if self.files.count() > 5: - c = 0 - for f in self.files: - if c < 3: - front_files.append(f) - - if c == self.files.count() - 1: - last_file = f - c += 1 - else: - front_files = self.files + for ky in ("login", "password", "database"): + if ky not in data: + raise KeyError(f"Credentials json file at {credentials} does not contain '{ky}'") + if "hostname" not in data: + data["hostname"] = "localhost" - for f in front_files: - result += " - %s (%d entries)\n" % (str(f.lfn), f.nevents) - if last_file: - result += " - ...\n" - result += " - %s (%d entries)\n" % (str(last_file.lfn), last_file.nevents) - else: - # No way to know if some files are here - result += " no files" - - return result - -class Result(Storm): - """Table to represent one physics result, - combining several samples.""" - __storm_table__ = "result" - result_id = Int(primary=True) - path = Unicode() - description = Unicode() - author = Unicode() - creation_time = DateTime() - analysis_id = Int() - analysis = Reference(analysis_id, "Analysis.analysis_id") - elog = Unicode() - samples = ReferenceSet(result_id,"SampleResult.result_id","SampleResult.sample_id","Sample.sample_id") - - def __init__(self,path): - self.path = path - - def replaceBy(self, result): - """Replace one entry, but keep the same key""" - self.path = result.path - self.description = result.description - self.author = result.author - self.analysis_id = result.analysis_id - self.elog = result.elog - - def __str__(self): - result = "Result in %s \n created on %s by %s\n "%(str(self.path),str(self.creation_time),str(self.author)) - result += "%s"%str(self.description) - if self.analysis is not None: - result += "\n part of analysis %s"%str(self.analysis.description) - if self.elog is not None: - result += "\n more details in %s"%str(self.elog) - return result - -class SampleResult(Storm): - """Many to many relationship between samples and results.""" - __storm_table__ = "sampleresult" - __storm_primary__ = "sample_id", "result_id" - sample_id = Int() - result_id = Int() - -class File(Storm): - __storm_table__ = "file" - id = Int(primary=True) - sample_id = Int() - lfn = Unicode() # Local file name: /store/ - pfn = Unicode() # Physical file name: srm:// or root:// - event_weight_sum = Float() - extras_event_weight_sum = Unicode() # MEDIUMTEXT in MySQL - nevents = Int() - - sample = 
Reference(sample_id, "Sample.sample_id") - - def __init__(self, lfn, pfn, event_weight_sum, extras_event_weight_sum, nevents): - self.lfn = lfn - self.pfn = pfn - self.event_weight_sum = event_weight_sum - self.extras_event_weight_sum = extras_event_weight_sum - self.nevents = nevents + return data + + +database = DatabaseProxy() + +# Code generated by: +# python -m pwiz -e mysql --host=cp3.irmp.ucl.ac.be --user=llbb --password --info llbb +# Peewee version: 3.9.4 +class BaseModel(Model): + class Meta: + database = database + + +class Analysis(BaseModel): + id = AutoField(column_name="analysis_id") + cadiline = TextField(null=True) + contact = TextField(null=True) + description = TextField(null=True) + + class Meta: + table_name = "analysis" + + def __str__(self): + return ( + "{0.description}\n" "{cadi}" "{contact}" " Number of associated results: {nresults:d}" + ).format( + self, + cadi=(f" CADI line: {self.cadiline}\n" if self.cadiline else ""), + contact=(f" Contact/Promotor: {self.contact}\n" if self.contact else ""), + nresults=self.results.count(), + ) + + +class Dataset(BaseModel): + """Table to represent one sample from DAS on which we run the analysis + + When creating a Dataset, at least the name and datatype (mc or data) attributes must be specified. + """ + + cmssw_release = CharField(null=True) + creation_time = DateTimeField(null=True) + id = AutoField(column_name="dataset_id") + datatype = CharField() + dsize = BigIntegerField(null=True) + energy = FloatField(null=True) + globaltag = CharField(null=True) + name = CharField(index=True) + nevents = IntegerField(null=True) + process = CharField(null=True) + user_comment = TextField(null=True) + xsection = FloatField(null=True) + + class Meta: + table_name = "dataset" + + @classmethod + def create(cls, **kwargs): + """Initialize a dataset by name and datatype. Other attributes may be null and should be set separately""" + for rK in ("name", "datatype"): + if rK not in kwargs: + raise RuntimeError( + f"Argument '{rK}' is required to construct {self.__class__.__name__}" + ) + if kwargs["datatype"] not in ("mc", "data"): + raise ValueError("dataset type must be mc or data, not {!r}".format(kwargs["datatype"])) + return super().create(**kwargs) + + def __str__(self): + return ( + "Dataset #{0.id:d}:\n" + " name: {0.name}\n" + " process: {0.process}\n" + " cross-section: {xsection}\n" + " number of events: {nevents}\n" + " size on disk: {dsize}\n" + " CMSSW release: {0.cmssw_release}\n" + " global tag: {0.globaltag}\n" + " type (data or mc): {0.datatype}\n" + " center-of-mass energy: {energy} TeV\n" + " creation time (on DAS): {0.creation_time!s}\n" + " comment: {0.user_comment}" + ).format( + self, + nevents=(f"{self.nevents:d}" if self.nevents is not None else "None"), + dsize=(f"{self.dsize:d}" if self.dsize is not None else "None"), + xsection=(f"{self.xsection:f}" if self.xsection is not None else "None"), + energy=(f"{self.energy:f}" if self.energy is not None else "None"), + ) + + +class Sample(BaseModel): + """Table to represent one processed sample, typically a PATtupe, skim, RDS, CP, etc. + + When creating a Sample, at least the name, path, sampletype (any of Sample.SampleTypes) + and nevents_processed attributes must be specified. 
+ """ + + author = TextField(null=True) + code_version = CharField(null=True) + creation_time = DateTimeField( + constraints=[SQL("DEFAULT CURRENT_TIMESTAMP")], default=datetime.datetime.now + ) + event_weight_sum = FloatField(null=True) + extras_event_weight_sum = TextField(null=True) + luminosity = FloatField(null=True) + name = CharField(index=True) + nevents = IntegerField(null=True) + nevents_processed = IntegerField(null=True) + normalization = FloatField(constraints=[SQL("DEFAULT 1")], default=1.0) + path = CharField() + processed_lumi = TextField(null=True) + id = AutoField(column_name="sample_id") + sampletype = CharField() + source_dataset = ForeignKeyField(Dataset, null=True, backref="samples") + source_sample = ForeignKeyField("self", null=True, backref="derived_samples") + user_comment = TextField(null=True) + + class Meta: + table_name = "sample" + + @property + def results(self): + return Result.select().join(SampleResult).join(Sample).where(Sample.id == self.id) + + SampleTypes = ["PAT", "SKIM", "RDS", "LHCO", "NTUPLES", "HISTOS", "OTHER"] + + @classmethod + def create(cls, **kwargs): + for rK in ("name", "path", "sampletype", "nevents_processed"): + if rK not in kwargs: + raise RuntimeError( + f"Argument '{rK}' is required to construct {self.__class__.__name__}" + ) + if kwargs["sampletype"] not in Sample.SampleTypes: + raise ValueError( + "sample type {} is unknown (need one of {})".format( + kwargs["sampletype"], ", ".join(Sample.SampleTypes) + ) + ) + return super().create(**kwargs) + + def removeFiles(self): + File.delete().where(File.sample == self).execute() + + def getLuminosity(self): + """Computes the sample (effective) luminosity""" + if self.luminosity is not None: + return self.luminosity + else: + if self.source_dataset is not None: + if self.source_dataset.datatype == "MC": + # for MC, it can be computed as Nevt/xsection + if ( + self.nevents_processed is not None + and self.source_dataset.xsection is not None + ): + return self.nevents_processed / self.source_dataset.xsection + else: + # for DATA, it can only be obtained from the parent sample + if self.source_sample is not None: + return self.source_sample.luminosity + ## in cases not treated above it is impossible to compute a number, so return None def __str__(self): - return "%s"%(self.lfn) - -class Analysis(Storm): - __storm_table__ = "analysis" - analysis_id = Int(primary=True) - description = Unicode() - cadiline = Unicode() - contact = Unicode() - results = ReferenceSet(analysis_id, "Result.analysis_id") - - def __init__(self,description): - self.description = description - - def replaceBy(self, analysis): - self.description = analysis.description - self.cadiline = analysis.cadiline - self.contact = analysis.contact - + return ( + "Sample #{0.id:d} (created on {0.creation_time!s} by {0.author})\n" + " name: {0.name}\n" + " path: {0.path}\n" + " type: {0.sampletype}\n" + " number of processed events: {0.nevents_processed:d}\n" + " number of events: {nevents}\n" + " normalization: {0.normalization}\n" + " sum of event weights: {0.event_weight_sum}\n" + "{sumw_extras}" + " (effective) luminosity: : {0.luminosity}\n" + " {hasproclumi} processed luminosity sections information\n" + " code version: {0.code_version}\n" + " comment: {0.user_comment}\n" + "{source_dataset}" + "{source_sample}" + " {files}" + ).format( + self, + nevents=(f"{self.nevents:d}" if self.nevents is not None else "none"), + sumw_extras=( + " has extras sum of event weight\n" if self.extras_event_weight_sum else "" + ), + 
hasproclumi=("has" if self.processed_lumi else "does not have"), + source_dataset=( + f" source dataset: {self.source_dataset.id:d}\n" + if self.source_dataset is not None + else "" + ), + source_sample=( + f" source sample: {self.source_sample.id:d}\n" + if self.source_sample is not None + else "" + ), + files=( + "{:d} files: \n - {}".format( + self.files.count(), + "\n - ".join( + ( + "{0.lfn} ({nevents} entries)".format( + fl, nevents=f"{fl.nevents:d}" if fl.nevents is not None else "no" + ) + for fl in self.files + ) + if self.files.count() < 6 + else ( + ["{0.lfn} ({0.nevents:d} entries)".format(fl) for fl in self.files[:3]] + + ["...", "{0.lfn} ({0.nevents:d} entries)".format(self.files[-1])] + ) + ), + ) + if self.id + else "no files" + ), + ) + + +class File(BaseModel): + """Table to represent a file (in a sample) + + When creating a File, at least the lfn, pfn, event_weight_sum and nevents attributes must be specified. + """ + + event_weight_sum = FloatField(null=True) + extras_event_weight_sum = TextField(null=True) + id = BigAutoField() + lfn = CharField() # Local file name: /store/ + nevents = BigIntegerField(null=True) + pfn = CharField() # Physical file name: srm:// or root:// + sample = ForeignKeyField(Sample, backref="files") + + class Meta: + table_name = "file" + + @classmethod + def create(cls, **kwargs): + for rK in ("lfn", "pfn", "event_weight_sum", "nevents"): + if rK not in kwargs: + raise RuntimeError( + f"Argument '{rK}' is required to construct {self.__class__.__name__}" + ) + return super().create(**kwargs) + def __str__(self): - result = "%s\n"%self.description - if self.cadiline is not None: - result += " CADI line: %s\n"%self.cadiline - if self.contact is not None: - result += " Contact/Promotor: %s\n"%self.contact - result += " Number of associated results: %d"%self.results.count() - return result + return self.lfn + +class Result(BaseModel): + """Table to represent one physics result, combining several samples. + + When creating a Result, at least the path attribute must be specified. 
+ """ + + analysis = ForeignKeyField(Analysis, null=True, backref="results") + author = TextField(null=True) + creation_time = DateTimeField( + constraints=[SQL("DEFAULT CURRENT_TIMESTAMP")], default=datetime.datetime.now + ) + description = TextField(null=True) + elog = CharField(null=True) + path = CharField(index=True) + id = AutoField(column_name="result_id") + + class Meta: + table_name = "result" + + @property + def samples(self): + return Sample.select().join(SampleResult).join(Result).where(Result.id == self.id) + + @classmethod + def create(cls, **kwargs): + for rK in ("path",): + if rK not in kwargs: + raise RuntimeError( + f"Argument '{rK}' is required to construct {self.__class__.__name__}" + ) + return super().create(**kwargs) + + def __str__(self): + return ( + "Result in {0.path}\n" + " created on {0.creation_time!s} by {0.author}" + "{desc}" + "{elog}" + ).format( + self, + desc=(f"\n part of analysis {self.analysis.description}" if self.analysis else ""), + elog=(f"\n more details in {self.elog}" if self.elog else ""), + ) + + +class SampleResult(BaseModel): + result = ForeignKeyField(Result, column_name="result_id") + sample = ForeignKeyField(Sample, column_name="sample_id") + + class Meta: + table_name = "sampleresult" + indexes = ((("sample", "result"), True),) + primary_key = CompositeKey("result", "sample") + + +# all models, for binding in SAMADhiDB and import +_models = [Analysis, Dataset, Sample, File, Result, SampleResult] +__all__ += _models + + +@contextmanager +def SAMADhiDB(credentials="~/.samadhi"): + """create a database object and returns the db handle from peewee""" + cred = loadCredentials(path=credentials) + if cred.get("test", False): + import os.path + + dbPath = cred["database"] + if not os.path.isabs(dbPath): + dbPath = os.path.join( + os.path.abspath(os.path.dirname(os.path.expanduser(credentials))), dbPath + ) + db = SqliteDatabase(dbPath) + else: + db = MySQLDatabase( + cred["database"], user=cred["login"], password=cred["password"], host=cred["hostname"] + ) + with db.bind_ctx(_models): + yield db diff --git a/python/das_import.py b/python/das_import.py index 9e217b5..7e27885 100755 --- a/python/das_import.py +++ b/python/das_import.py @@ -1,98 +1,85 @@ -import re +import datetime import json +import re import subprocess -from .SAMADhi import Dataset, DbStore -from .userPrompt import confirm +from .utils import confirm_transaction + def do_das_query(query): """ Execute das_client for the specified query, and return parsed JSON output """ - args = ['dasgoclient', '-json', '-format', 'json', '--query', query] + args = ["dasgoclient", "-json", "-query", query] result = subprocess.check_output(args) return json.loads(result) -def fillDataset(dataset, dct): - """ - Fill an instance of Dataset with values from a dictionnary - """ - import datetime - - # definition of the conversion key -> column - conversion = { "process": u'process', - "user_comment": u'comment', - "energy": u'energy', - "nevents": u'nevents', - "cmssw_release": u'release', - "dsize": u'file_size', - "globaltag": u'globalTag', - "xsection": u'xsection' } - - for column, key in conversion.items(): - setattr(dataset, column, dct[key]) - - # special cases - #dataset.creation_time = datetime.datetime.strptime(dct[u'creation_time'], "%Y-%m-%d %H:%M:%S") - dataset.creation_time = datetime.datetime.fromtimestamp(dct[u'creation_time']) - - return dataset def query_das(dataset): """ Do a DAS request for the given dataset and return the metadata collected """ - summary_query = "summary 
dataset=%s" % dataset - metadata_query = "dataset=%s" % dataset - release_query = "release dataset=%s" % dataset - config_query = "config dataset=%s system=dbs3" % dataset + summary_query = "summary dataset=%s" % dataset + metadata_query = "dataset=%s" % dataset + release_query = "release dataset=%s" % dataset + config_query = "config dataset=%s system=dbs3" % dataset summary_results = do_das_query(summary_query) metadata_results = do_das_query(metadata_query) release_results = do_das_query(release_query) config_results = do_das_query(config_query) - if not 'nresults' in summary_results: - raise Exception("Invalid DAS response") - - if summary_results['nresults'] > 1: - raise Exception("Error: more than one result for DAS query:%d"%summary_results['nresults']) - # Grab results from DAS metadata = {} - for d in metadata_results["data"][0]["dataset"]: + for d in next( + entry for entry in metadata_results if "dbs3:dataset_info" in entry["das"]["services"] + )["dataset"]: for key, value in d.items(): metadata[key] = value - for d in summary_results["data"][0]["summary"]: + for d in summary_results[0]["summary"]: for key, value in d.items(): metadata[key] = value # Set release in global tag - metadata.update({ - u'release': unicode(release_results["data"][0]["release"][0]["name"][0]), - u'globalTag': unicode(config_results["data"][0]["config"][0]["global_tag"]) - }) + metadata.update( + { + "release": release_results[0]["release"][0]["name"][0], + "globalTag": config_results[0]["config"][0]["global_tag"], + } + ) # Last chance for the global tag - for d in config_results["data"]: - if metadata[u'globalTag']==u'UNKNOWN': - metadata[u'globalTag']=unicode(d["config"][0]["global_tag"]) - if metadata[u'globalTag']==u'UNKNOWN': - del metadata[u'globalTag'] + for d in config_results: + if metadata["globalTag"] == "UNKNOWN": + metadata["globalTag"] = d["config"][0]["global_tag"] + if metadata["globalTag"] == "UNKNOWN": + del metadata["globalTag"] return metadata -def import_cms_dataset(dataset, process=None, energy=None, xsection=1.0, comment="", prompt=False): + +def import_cms_dataset( + dataset, + process=None, + energy=None, + xsection=1.0, + comment="", + assumeDefault=False, + credentials=None, +): """ Do a DAS request for the given dataset and insert it into SAMAdhi """ + if subprocess.call(["voms-proxy-info", "--exists", "--valid", "0:5"]) != 0: + raise RuntimeError("No valid proxy found (with at least 5 minutes left)") + # Guess default sane values for unspecifed parameters if not process: - splitString = dataset.split('/', 2) + splitString = dataset.split("/", 2) if len(splitString) > 1: process = splitString[1] @@ -102,41 +89,219 @@ def import_cms_dataset(dataset, process=None, energy=None, xsection=1.0, comment energy = float(energyRe.group(1)) metadata = query_das(dataset) + metadata.update( + {"process": process, "xsection": xsection, "energy": energy, "comment": comment} + ) + if not all(ky in metadata for ky in ("name", "datatype")): + raise RuntimeError(f"Could not find all required keys (name and datatype) in {metadata!s}") + + # definition of the conversion key -> column + column_conversion = { + "process": "process", + "user_comment": "comment", + "energy": "energy", + "nevents": "nevents", + "cmssw_release": "release", + "dsize": "file_size", + "globaltag": "globalTag", + "xsection": "xsection", + } + # columns of the dataset to create (if needed) + dset_columns = {col: metadata[key] for col, key in column_conversion.items()} + dset_columns["creation_time"] = ( + 
datetime.datetime.fromtimestamp(metadata["creation_time"]) + if "creation_time" in metadata + else None + ) + + from .SAMADhi import SAMADhiDB, Dataset + + with SAMADhiDB(credentials) as db: + existing = Dataset.get_or_none(Dataset.name == metadata["name"]) + with confirm_transaction( + db, + "Insert into the database?" if existing is None else "Update this dataset?", + assumeDefault=assumeDefault, + ): + dataset, created = Dataset.get_or_create( + name=metadata["name"], datatype=metadata["datatype"], defaults=dset_columns + ) + print(dataset) + return dataset + + +def main(args=None): + import argparse + + parser = argparse.ArgumentParser(description="Import CMS datasets into SAMADhi") + parser.add_argument("dataset", help="CMS dataset") + parser.add_argument("-p", "--process", help="Process name") + parser.add_argument("--xsection", type=float, default=1.0, help="Cross-section in pb") + parser.add_argument("--energy", type=float, dest="energy", help="CoM energy, in TeV") + parser.add_argument("--comment", default="", help="User defined comment") + parser.add_argument( + "--database", + default="~/.samadhi", + help="JSON Config file with database connection settings and credentials", + ) + parser.add_argument( + "-y", + "--continue", + dest="assumeDefault", + action="store_true", + help="Insert or replace without prompt for confirmation", + ) + args = parser.parse_args(args=args) + + import_cms_dataset( + args.dataset, + args.process, + args.energy, + args.xsection, + args.comment, + assumeDefault=args.assumeDefault, + credentials=args.database, + ) + + +def get_nanoFile_data(fileName): + from cppyy import gbl + + f = gbl.TFile.Open(fileName) + if not f: + print(f"Warning: could not open file {fileName}") + return None, None + eventsTree = f.Get("Events") + if (not eventsTree) or (not isinstance(eventsTree, gbl.TTree)): + print(f"No tree with name 'Events' found in {fileName}") + return None, None + entries = eventsTree.GetEntries() + runs = f.Get("Runs") + if (not runs) or (not isinstance(runs, gbl.TTree)): + print(f"No tree with name 'Runs' found in {fileName}") + return entries, None + sums = dict() + runs.GetEntry(0) + for lv in runs.GetListOfLeaves(): + lvn = lv.GetName() + if lvn != "run": + if lv.GetLeafCount(): + lvcn = lv.GetLeafCount().GetName() + if lvcn in sums: + del sums[lvcn] + sums[lvn] = [lv.GetValue(i) for i in range(lv.GetLeafCount().GetValueLong64())] + else: + sums[lvn] = lv.GetValue() + for entry in range(1, runs.GetEntries()): + runs.GetEntry(entry) + for cn, vals in sums.items(): + if hasattr(vals, "__iter__"): + entryvals = getattr(runs, cn) + ## warning and workaround (these should be consistent for all NanoAODs in a sample) + if len(vals) != len(entryvals): + logger.error( + f"Runs tree: array of sums {cn} has a different length in entry {entry:d}: {len(entryvals):d} (expected {len(vals):d})" + ) + for i in range(min(len(vals), len(entryvals))): + vals[i] += entryvals[i] + else: + sums[cn] += getattr(runs, cn) + return entries, sums + + +def import_nanoAOD_sample(args=None): + import argparse + + parser = argparse.ArgumentParser( + "Add a NanoAOD sample based on the DAS path and (optionally) cross-section" + ) + parser.add_argument("path", help="DAS path") + parser.add_argument("--xsection", default=1.0, type=float, help="Cross-section value") + parser.add_argument("--energy", default=13.0, type=float, help="CoM energy, in TeV") + parser.add_argument("-p", "--process", help="Process name") + parser.add_argument("--comment", default="", help="User defined 
comment") + parser.add_argument("--datasetcomment", default="", help="User defined comment") + parser.add_argument( + "--store", required=True, help="root path of the local CMS storage (e.g. /storage/data/cms)" + ) + parser.add_argument( + "--database", + default="~/.samadhi", + help="JSON Config file with database connection settings and credentials", + ) + parser.add_argument( + "-y", + "--continue", + dest="assumeDefault", + action="store_true", + help="Insert or replace without prompt for confirmation", + ) + args = parser.parse_args(args=args) + + if subprocess.call(["voms-proxy-info", "--exists", "--valid", "0:5"]) != 0: + raise RuntimeError("No valid proxy found (with at least 5 minutes left)") + + parent_results = do_das_query(f"parent dataset={args.path}") + if not (len(parent_results) == 1 and len(parent_results[0]["parent"]) == 1): + raise RuntimeError("Parent dataset query result has an unexpected format") + parent_name = parent_results[0]["parent"][0]["name"] + source_dataset = import_cms_dataset( + parent_name, + process=args.process, + energy=args.energy, + xsection=args.xsection, + comment=args.datasetcomment, + assumeDefault=args.assumeDefault, + credentials=args.database, + ) + + files_results = do_das_query(f"file dataset={args.path}") + nevents = sum(fr["file"][0]["nevents"] for fr in files_results) + + from .SAMADhi import Sample, File, SAMADhiDB + import os.path + + ## Next: the add_sample part + with SAMADhiDB(credentials=args.database) as db: + existing = Sample.get_or_none(Sample.name == args.path) + with confirm_transaction( + db, + "Insert into the database?" if existing is None else f"Replace existing {existing!s}?", + assumeDefault=args.assumeDefault, + ): + sample, created = Sample.get_or_create( + name=args.path, + path=args.path, + defaults={"sampletype": "NTUPLES", "nevents_processed": nevents}, + ) + sample.nevents = nevents + sample.normalization = 1.0 + sample.source_dataset = source_dataset + sample.source_sample = None + + sample_weight_sum = 0 + for fRes in files_results: + if len(fRes["file"]) != 1: + raise RuntimeError("File result from DAS query has an unexpected format") + fileInfo = fRes["file"][0] + pfn = os.path.join(args.store, fileInfo["name"].lstrip(os.path.sep)) + entries, weight_sums = get_nanoFile_data(pfn) + # print("For debug: nevents from DAS={0:d}, from file={1:d}".format(fileInfo["nevents"], entries)) + event_weight_sum = weight_sums["genEventSumw"] + # print("All event weight sums: {0!r}".format(weight_sums)) + sample_weight_sum += event_weight_sum + File.create( + lfn=fileInfo["name"], + pfn=pfn, + event_weight_sum=event_weight_sum, + nevents=(entries if entries is not None else 0), + sample=sample, + ) ## FIXME extras_event_weight_sum + + sample.event_weight_sum = sample_weight_sum + sample.luminosity = sample.getLuminosity() ## from xsection and sum of weights + sample.comment = args.comment + sample.author = "CMS" + sample.save() - metadata.update({ - u"process": unicode(process), - u"xsection": xsection, - u"energy": energy, - u"comment": unicode(comment) - }) - - # Connect to the database - dbstore = DbStore() - - # Check if the dataset is already in the dataset - update = False - dbResult = dbstore.find(Dataset, Dataset.name == unicode(metadata['name'])) - if (dbResult.is_empty()): - dataset = Dataset(metadata['name'], metadata['datatype']) - else: - update = True - dataset = dbResult.one() - - fillDataset(dataset, metadata) - - if prompt: - if not update: - dbstore.add(dataset) - dbstore.flush() - - print dataset - prompt = 
"Insert into the database?" if not update else "Update this dataset?" - if confirm(prompt=prompt, resp=True): - dbstore.commit() - else: - dbstore.rollback() - - else: - if not update: - dbstore.add(dataset) - dbstore.commit() + print(sample) diff --git a/python/dbAnalysis.py b/python/dbAnalysis.py new file mode 100755 index 0000000..b9e0bca --- /dev/null +++ b/python/dbAnalysis.py @@ -0,0 +1,599 @@ +#!/usr/bin/env python +""" Script to do basic checks to the database and output statistics on usage and issues """ + +import argparse +import errno +import json +import os +import re +from collections import defaultdict +from contextlib import contextmanager +from datetime import datetime + +import numpy as np + +from playhouse.shortcuts import model_to_dict + +from .das_import import query_das +from .SAMADhi import Analysis, Dataset +from .SAMADhi import File as SFile +from .SAMADhi import Result, SAMADhiDB, Sample + + +@contextmanager +def openRootFile(fileName, noOp=False, mode="update"): + if noOp: + yield + else: + from cppyy import gbl + + rootfile = gbl.TFile.Open(fileName, mode) + yield + rootfile.Write() + rootfile.Close() + + +def json_serialize(obj): + if isinstance(obj, datetime): + return obj.isoformat() + elif isinstance(obj, np.int64): + return str(obj) + else: + try: + return model_to_dict(obj) + except Exception as ex: + raise TypeError(f"Object {obj!r} could not be serialized: {ex}") + + +def saveReportJSON(jReport, outFileName, outDir=".", symlinkDir=None): + outFullName = os.path.join(outDir, outFileName) + with open(outFullName, "w") as outFile: + json.dump(jReport, outFile, default=json_serialize) + if symlinkDir: + force_symlink(outFullName, os.path.join(symlinkDir, outFileName)) + + +def main(args=None): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "-p", + "--path", + type=(lambda p: os.path.abspath(os.path.expandvars(os.path.expanduser(p)))), + default=datetime.now().strftime("%y%m%d-%H:%M:%S"), + help="Destination path", + ) + parser.add_argument("-b", "--basedir", help="Directory where the website will be installed") + parser.add_argument( + "-f", + "--full", + action="store_true", + dest="DAScrosscheck", + help="Full check: compares each Dataset entry to DAS and check for consistency (slow!)", + ) + parser.add_argument( + "-d", "--dry", action="store_true", dest="dryRun", help="Dry run: do no write to disk" + ) + parser.add_argument( + "--database", + default="~/.samadhi", + help="JSON Config file with database connection settings and credentials", + ) + args = parser.parse_args(args=args) + if not args.dryRun: + if os.path.exists(args.path): + raise OSError(errno.EEXIST, "Existing directory", args.path) + else: + os.makedirs(args.path) + + # connect to the MySQL database using default credentials + with SAMADhiDB(credentials=args.database) as db, openRootFile( + os.path.join(args.path, "analysisReport.root"), noOp=args.dryRun, mode="UPDATE" + ): + # run each of the checks and collect data + # collect general statistics + general = collectGeneralStats() + if not args.dryRun: + saveReportJSON( + general, + "stats.json", + outDir=args.path, + symlinkDir=os.path.join(args.basedir, "data"), + ) + # check datasets + datasets = { + "DatabaseInconsistencies": ( + checkDatasets() if args.DAScrosscheck else copyInconsistencies(args.basedir) + ), + "Orphans": findOrphanDatasets(), + "IncompleteData": checkDatasetsIntegrity(), + "DatasetsStatistics": analyzeDatasetsStatistics(writeRoot=(not args.dryRun)), + } + if not args.dryRun: + 
saveReportJSON( + datasets, + "DatasetsAnalysisReport.json", + outDir=args.path, + symlinkDir=os.path.join(args.basedir, "data"), + ) + # check samples + samples = { + "MissingDirSamples": checkSamplePath(), + "DatabaseInconsistencies": checkSampleConsistency(), + "SampleStatistics": analyzeSampleStatistics(writeRoot=(not args.dryRun)), + } + if not args.dryRun: + saveReportJSON( + samples, + "SamplesAnalysisReport.json", + outDir=args.path, + symlinkDir=os.path.join(args.basedir, "data"), + ) + # now, check results + results = { + "MissingDirSamples": checkResultPath(), + "DatabaseInconsistencies": checkResultConsistency(), + "SelectedResults": selectResults(os.path.join(args.basedir, "data")), + "ResultsStatistics": analyzeResultsStatistics(writeRoot=(not args.dryRun)), + } + if not args.dryRun: + saveReportJSON( + results, + "ResultsAnalysisReport.json", + outDir=args.path, + symlinkDir=os.path.join(args.basedir, "data"), + ) + # finally, some stats about Analysis objects + analyses = {"AnalysisStatistics": analyzeAnalysisStatistics(writeRoot=(not args.dryRun))} + if not args.dryRun: + saveReportJSON( + analyses, + "AnalysisAnalysisReport.json", + outDir=args.path, + symlinkDir=os.path.join(args.basedir, "data"), + ) + + +def collectGeneralStats(): + # get number of datasets, samples, results, analyses + result = { + "nDatasets": Dataset.select(Dataset.id).count(), + "nSamples": Sample.select(Sample.id).count(), + "nResults": Result.select(Result.id).count(), + "nAnalysis": Analysis.select(Analysis.id).count(), + } + print("\nGeneral statistics:") + print("======================") + for kt, num in result.items(): + print(f"{num:d} {kt[1:].lower()}") + return result + + +def checkDatasets(): + print("\nDatasets inconsistent with DAS:") + print("==================================") + result = [] + for dataset in Dataset.select(): + # query DAS to get the same dataset, by name + try: + metadata = query_das(dataset.name) + except: + result.append([dataset, "Inconsistent with DAS"]) + print( + "{0.name} (imported on {0.creation_time!s}) -- Error getting dataset in DAS".format( + dataset + ) + ) + continue + + # perform some checks: + try: + # release name either matches or is unknown in DAS + test1 = metadata["release"] == "unknown" or dataset.cmssw_release == metadata["release"] + # datatype matches + test2 = dataset.datatype == metadata["datatype"] + # nevents matches + test3 = dataset.nevents == metadata["nevents"] + # size matches + test4 = dataset.dsize == metadata["file_size"] + except: + result.append([dataset, "Inconsistent with DAS"]) + print("{0.name} (imported on {0.creation_time!s})".format(dataset)) + else: + if not (test1 and test2 and test3 and test4): + result.append([dataset, "Inconsistent with DAS"]) + print("{0.name} (imported on {0.creation_time!s})".format(dataset)) + return result + + +def findOrphanDatasets(): + print("\nOrphan Datasets:") + print("===================") + result = [] + for dataset in Dataset.select(): + if dataset.samples.count() == 0: + result.append(dataset) + print("{0.name} (imported on {0.creation_time!s})".format(dataset)) + if len(result) == 0: + print("None") + return result + + +def checkDatasetsIntegrity(): + print("\nDatasets integrity issues:") + print("===========================") + result = [] + for dataset in Dataset.select(): + if dataset.cmssw_release is None: + result.append([dataset, "missing CMSSW release"]) + print( + "{0.name} (imported on {0.creation_time!s}): missing CMSSW release".format(dataset) + ) + elif dataset.energy is 
None: + result.append([dataset, "missing Energy"]) + print("{0.name} (imported on {0.creation_time!s}): missing Energy".format(dataset)) + elif dataset.globaltag is None: + result.append([dataset, "missing Globaltag"]) + print("{0.name} (imported on {0.creation_time!s}): missing Globaltag".format(dataset)) + if len(result) == 0: + print("None") + return result + + +def makePie(uName, data, title=None, save=False): + from cppyy import gbl + + pie = gbl.TPie(f"{uName}Pie", title if title is not None else uName, len(data)) + for idx, (val, freq) in enumerate(data.items()): + pie.SetEntryVal(idx, freq) + pie.SetEntryLabel(idx, val) + pie.SetTextAngle(0) + pie.SetRadius(0.3) + pie.SetTextColor(1) + pie.SetTextFont(62) + pie.SetTextSize(0.03) + canvas = gbl.TCanvas(uName, "", 2) + pie.Draw("r") + if save: + gbl.gPad.Write() + + +def getFreqs(model, attName, addNoneTo=None): + from peewee import fn + + freqs = { + str(getattr(val, attName)): val.count + for val in model.select( + getattr(model, attName), fn.Count(model.id).alias("count") + ).group_by(getattr(model, attName)) + } + if addNoneTo is not None and None in freqs: + freqs[addNoneTo] = freqs.get(addNoneTo, 0) + freqs[None] + del freqs[None] + return freqs + + +def th1ToChart(histo): + return [ + [histo.GetBinCenter(ib), histo.GetBinContent(ib)] for ib in range(1, histo.GetNbinsX() + 1) + ] + + +def toTH1I(name, data, N, xMin, xMax, title=None): + from cppyy import gbl + + if title is None: + title = name + h = gbl.TH1I(name, title, N, xMin, xMax) + for x in data: + h.Fill(x) + return h + + +def toGraph(x, y=None): + from cppyy import gbl + + if y is None: + y = np.array(range(len(x) + 1)) + else: + assert len(x) == len(y) + gr = gbl.TGraph(len(x)) + for i, (x, y) in enumerate(zip(x, y)): + gr.SetPoint(i, x, y) + return gr + + +def analyzeDatasetsStatistics(writeRoot=False): + stats = {} + for prop in ("cmssw_release", "globaltag", "datatype", "energy"): + nDataset_by_prop = getFreqs(Dataset, prop, addNoneTo="Unknown") + stats[prop] = [[k, v] for k, v in nDataset_by_prop.items()] + makePie( + f"dataset{prop.capitalize()}", + nDataset_by_prop, + title=f"Datasets {prop}", + save=writeRoot, + ) + + dset_time, dset_nsamples, dset_nevents, dset_dsize = zip( + *( + ( + ( + int(dset.creation_time.strftime("%s")) * 1000 + if dset.creation_time is not None + else 0 + ), + dset.samples.count(), + (dset.nevents if dset.nevents is not None else 0), + (dset.dsize if dset.dsize is not None else 0), + ) + for dset in Dataset.select().order_by(Dataset.creation_time) + ) + ) + stats["datasetsNsamples"] = th1ToChart(toTH1I("dataseets_nsamples", dset_nsamples, 10, 0, 10)) + stats["datasetsNevents"] = th1ToChart(toTH1I("dataseets_nevents", dset_nevents, 100, 0, -100)) + stats["datasetsDsize"] = th1ToChart(toTH1I("dataseets_dsize", dset_dsize, 100, 0, -100)) + stats["datasetsTimeprof"] = [[tm, i + 1] for i, tm in enumerate(dset_time)] + if writeRoot: + toGraph(np.array(dset_time) / 1000.0).Write("datasetsTimeprof_graph") + + print("\nDatasets Statistics extracted.") + print("=================================") + + return stats + + +def checkResultPath(): + # get all samples + print("\nResults with missing path:") + print("===========================") + result = [] + for res in Result.select(): + # check that the path exists, and keep track of the sample if not the case. 
+ if not os.path.exists(res.path): + print("Result #{0.id} (created on {0.creation_time} by {0.author}):".format(res)) + print(f" missing path: {res.path}") + result.append(res) + if len(result) == 0: + print("None") + return result + + +def checkSamplePath(): + print("\nSamples with missing path:") + print("===========================") + result = [] + for sample in Sample.select(): + # check that the path exists, and keep track of the sample if not the case. + vpath = getSamplePath(sample) + for path in vpath: + if not os.path.exists(path): + print( + "Sample #{0.id:d} (created on {0.creation_time!s} by {0.author}):".format( + sample + ) + ) + print(f" missing path: {path}") + print(vpath) + result.append(sample) + break + if len(result) == 0: + print("None") + return result + + +def getSamplePath(sample): + # the path should be stored in sample.path + # if it is empty, look for files in that path + if sample.path == "": + vpath = set() + regex = r".*SFN=(.*)" + for f in SFile.select().where(SFile.sample.id == sample.id): + m = re.search(regex, f.pfn) + if m: + vpath.add(os.path.dirname(m.group(1))) + return list(vpath) + else: + return [sample.path] + + +def selectResults(symlinkDir): + # look for result records pointing to a ROOT file + # eventually further filter + print("\nSelected results:") + print("===========================") + result = [] + for res in Result.select(): + path = res.path + if os.path.exists(path) and os.path.isdir(path): + files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))] + if len(files) == 1: + path = os.path.join(path, f) + res.path = path + if os.path.exists(path) and os.path.isfile(path) and path.lower().endswith(".root"): + symlink = os.path.join(symlinkDir, f"res_{res.id}.root") + relpath = "../data/res_{0}.root" % (res.id) + force_symlink(path, symlink) + result.append([res, relpath]) + print("res #{0.id} (created on {0.creation_time} by {0.author}): ".format(res)) + print(symlink) + if len(result) == 0: + print("None") + return result + + +def checkResultConsistency(): + print("\nResults with missing source:") + print("=============================") + result = [] + for res in Result.select(): + # check that the source sample exists in the database. + # normaly, this should be protected already at the level of sql rules + for sample in res.samples: + if sample is None: + print( + "Result #{0.id:d} (created on {0.creation_time!s} by {0.author}):".format(res) + ) + print("inconsistent source sample") + result.append([res, "inconsistent source sample"]) + print(res) + break + if len(result) == 0: + print("None") + return result + + +def checkSampleConsistency(): + print("\nSamples with missing source:") + print("=============================") + result = [] + for sample in Sample.select(): + # check that either the source dataset or the source sample exists in the database. 
+ # normaly, this should be protected already at the level of sql rules + sourceDataset = sample.source_dataset + sourceSample = sample.source_sample + if sample.source_dataset_id is not None and sample.source_dataset is None: + print("Sample #{0.id} (created on {0.creation_time} by {0.author}".format(sample)) + print("inconsistent source dataset") + result.append([sample, "inconsistent source dataset"]) + print(sample) + if sample.source_sample_id is not None and sample.source_sample is None: + print("Sample #{0.id} (created on {0.creation_time} by {0.author}".format(sample)) + print("inconsistent source sample") + result.append([sample, "inconsistent source sample"]) + if len(result) == 0: + print("None") + return result + + +def analyzeAnalysisStatistics(writeRoot=False): + stats = {} + nAnalyses_by_contact = getFreqs(Analysis, "contact", addNoneTo="Unknown") + stats["analysisContacts"] = [[k, v] for k, v in nAnalyses_by_contact.items()] + makePie("analysisContact", nAnalyses_by_contact, title="Analysis contacts", save=writeRoot) + nResults_by_analysis = { + ana.description: len(ana.results) for ana in Analysis.select() if len(ana.results) > 0 + } + stats["analysisResults"] = [[k, v] for k, v in nResults_by_analysis.items()] + makePie("analysisResults", nResults_by_analysis, title="Analysis results", save=writeRoot) + + # stats to collect: group distribution (from CADI line) (pie) + cadiExpr = re.compile(r".*([A-Z]{3})-\d{2}-\d{3}") + nAnalyses_by_physicsgroup = defaultdict(int) + for analysis in Analysis.select(Analysis.cadiline): + m = cadiExpr.search(analysis.cadiline) + nAnalyses_by_physicsgroup[m.group(1) if m else "NONE"] += 1 + stats["physicsGroup"] = [[k, v] for k, v in nAnalyses_by_physicsgroup.items()] + makePie("physicsGroup", nAnalyses_by_physicsgroup, title="Physics groups", save=writeRoot) + + print("\nAnalysis Statistics extracted.") + print("================================") + + return stats + + +def analyzeResultsStatistics(writeRoot=False): + stats = {} + nResults_by_author = getFreqs(Result, "author", addNoneTo="Unknown") + stats["resultsAuthors"] = [[k, v] for k, v in nResults_by_author.items()] + + res_time, res_nsamples = zip( + *( + ( + ( + int(res.creation_time.strftime("%s")) * 1000 + if res.creation_time is not None + else 0 + ), + res.samples.count(), + ) + for res in Result.select().order_by(Result.creation_time) + ) + ) + stats["resultNsamples"] = th1ToChart(toTH1I("result_nsamples", res_nsamples, 20, 0, 20)) + if writeRoot: + toGraph(np.array(res_time) / 1000.0).Write("resultsTimeprof_graph") + + print("\nResults Statistics extracted.") + print("================================") + + return stats + + +def analyzeSampleStatistics(writeRoot=False): + stats = {} + nSamples_by_author = getFreqs(Sample, "author", addNoneTo="Unknown") + stats["sampleAuthors"] = [[k, v] for k, v in nSamples_by_author.items()] + makePie("sampleAuthors", nSamples_by_author, title="Sample authors", save=writeRoot) + nSamples_by_type = getFreqs(Sample, "sampletype", addNoneTo="Unknown") + stats["sampleTypes"] = [[k, v] for k, v in nSamples_by_type.items()] + makePie("sampleTypes", nSamples_by_type, title="Sample types", save=writeRoot) + + samples_time, sample_nevents, sample_nevents_processed = zip( + *( + ( + ( + int(smp.creation_time.strftime("%s")) * 1000 + if smp.creation_time is not None + else 0 + ), + (smp.nevents if smp.nevents is not None else 0), + (smp.nevents_processed if smp.nevents is not None else 0), + ) + for smp in Sample.select( + Sample.creation_time, 
Sample.nevents, Sample.nevents_processed
+            ).order_by(Sample.creation_time)
+        )
+    )
+    stats["sampleNevents"] = th1ToChart(toTH1I("sample_nevents", sample_nevents, 100, 0, -100))
+    stats["sampleNeventsProcessed"] = th1ToChart(
+        toTH1I("sample_nevents_processed", sample_nevents_processed, 100, 0, -100)
+    )
+    stats["sampleNeventsTimeprof"] = list(
+        list(row) for row in zip(samples_time, np.cumsum(np.array(sample_nevents)))
+    )
+    stats["sampleNeventsProcessedTimeprof"] = list(
+        list(row) for row in zip(samples_time, np.cumsum(np.array(sample_nevents_processed)))
+    )
+    # same [timestamp, running count] format as the other *Timeprof entries
+    stats["samplesTimeprof"] = [[tm, i + 1] for i, tm in enumerate(samples_time)]
+    if writeRoot:
+        samples_time_s = np.array(samples_time) / 1000.0
+        toGraph(samples_time_s, np.cumsum(sample_nevents)).Write("sampleNeventsTimeprof_graph")
+        toGraph(samples_time_s, np.cumsum(sample_nevents_processed)).Write(
+            "sampleNeventsProcessedTimeprof_graph"
+        )
+        toGraph(samples_time_s).Write("samplesTimeprof_graph")
+
+    print("\nSamples Statistics extracted.")
+    print("================================")
+
+    return stats
+
+
+def force_symlink(file1, file2):
+    try:
+        os.symlink(file1, file2)
+    except OSError as e:
+        if e.errno == errno.EEXIST:
+            os.remove(file2)
+            os.symlink(file1, file2)
+
+
+def copyInconsistencies(basedir):
+    # try to read inconsistencies from previous job
+    # the file must be there and must contain the relevant data
+    try:
+        with open(os.path.join(basedir, "data", "DatasetsAnalysisReport.json")) as jfile:
+            content = json.load(jfile)
+            return content["DatabaseInconsistencies"]
+    except OSError:
+        # no file. Return an empty list.
+        # This will happen if basedir is not (properly) set or if it is new.
+        print(
+            "No previous dataset analysis report found in path. The Database inconsistencies will be empty."
+        )
+        return []
+    except KeyError:
+        # no proper key. Return an empty list.
+        # This should not happen, so print a warning.
+        print("No DatabaseInconsistencies key in the previous json file ?!")
+        return []
diff --git a/python/luminosity.py b/python/luminosity.py
new file mode 100644
index 0000000..9dc9778
--- /dev/null
+++ b/python/luminosity.py
@@ -0,0 +1,179 @@
+import argparse
+import subprocess
+from itertools import chain
+
+
+"""
+Helper functions for computing the luminosity for a set of samples
+"""
+
+
+def parse_luminosity_csv(result):
+    """Parse the CSV file produced by brilcalc, and return the total recorded luminosity in /pb"""
+    import csv
+    from io import StringIO
+
+    # subprocess returns bytes; decode before feeding the csv reader
+    if isinstance(result, bytes):
+        result = result.decode()
+    f = StringIO(result)
+
+    lumi = 0
+    reader = csv.reader(f, delimiter=",")
+    for row in reader:
+        if not row:
+            continue
+
+        if row[0][0] == "#":
+            continue
+        lumi += float(row[-1])
+
+    return lumi / 1000.0 / 1000.0
+
+
+def compute_luminosity(sample, local=False, normtag=None, username=None):
+    print("Computing luminosity for %r" % str(sample.name))
+
+    lumi = 0
+    if not local:
+        print(
+            "Running brilcalc on lxplus... 
You'll probably need to enter your lxplus password in a moment" + ) + print("") + + cmds = [ + "brilcalc", + "lumi", + "--normtag", + normtag, + "--output-style", + "csv", + "-i", + '"%s"' % str(sample.processed_lumi.replace('"', "")), + ] + cmd = ( + 'export PATH="$HOME/.local/bin:/afs/cern.ch/cms/lumi/brilconda-1.1.7/bin:$PATH"; ' + + " ".join(cmds) + ) + ssh_cmds = ["ssh", "%s@lxplus.cern.ch" % username, cmd] + brilcalc_result = subprocess.check_output(ssh_cmds) + + lumi = parse_luminosity_csv(brilcalc_result) + else: + print("Running brilcalc locally...") + # FIXME one day + print("Error: running brilcalc locally is not supported for the moment.") + return 0 + + print("Sample luminosity: %.3f /pb" % lumi) + print("") + + # Update luminosity in the database + sample.luminosity = lumi + + return lumi + + +def install_brilcalc(local=False, username=None): + + if local: + print("Local installation of brilcalc is not supported.") + return + + print( + "Installing brilcalc on lxplus... You'll probably need to enter your lxplus password in a moment" + ) + + cmds = ["pip", "install", '--install-option="--prefix=$HOME/.local"', "--upgrade", "brilws"] + cmd = 'export PATH="$HOME/.local/bin:/afs/cern.ch/cms/lumi/brilconda-1.1.7/bin:$PATH"; %s' % ( + " ".join(cmds) + ) + ssh_cmds = ["ssh", "%s@lxplus.cern.ch" % username, cmd] + subprocess.call(ssh_cmds) + + +def update_brilcalc(local=False, username=None): + + if local: + print("Local installation of brilcalc is not supported.") + return + + print( + "Updating brilcalc on lxplus... You'll probably need to enter your lxplus password in a moment" + ) + + cmds = [ + "pip", + "install", + '--install-option="--prefix=$HOME/.local"', + "--upgrade", + "--force-reinstall", + "brilws", + ] + cmd = 'export PATH="$HOME/.local/bin:/afs/cern.ch/cms/lumi/brilconda-1.1.7/bin:$PATH"; %s' % ( + " ".join(cmds) + ) + ssh_cmds = ["ssh", "%s@lxplus.cern.ch" % username, cmd] + subprocess.call(ssh_cmds) + + +def compute_sample_luminosity(args=None): + parser = argparse.ArgumentParser(description="Compute luminosity of a set of samples") + parser.add_argument( + "-i", "--id", type=int, nargs="+", dest="ids", help="IDs of the samples", metavar="ID" + ) + parser.add_argument( + "--name", type=str, nargs="+", dest="names", help="Names of the samples", metavar="NAME" + ) + parser.add_argument( + "--local", action="store_true", help="Run brilcalc locally instead of on lxplus" + ) + parser.add_argument( + "--bootstrap", action="store_true", help="Install brilcalc. 
Needs to be done only once" + ) + parser.add_argument("--update", action="store_true", help="Update brilcalc") + parser.add_argument( + "-n", "--username", help="Remote lxplus username (local username by default)" + ) + parser.add_argument("-t", "--normtag", help="Normtag on /afs") + parser.add_argument( + "--database", + default="~/.samadhi", + help="JSON Config file with database connection settings and credentials", + ) + options = parser.parse_args(args=args) + + if ( + not options.bootstrap + and not options.update + and options.ids is None + and options.names is None + ): + parser.error("You must specify at least one sample id or sample name.") + if not options.bootstrap and not options.update and not options.normtag: + parser.error("You must specify a normtag file") + if options.ids is None: + options.ids = [] + if options.names is None: + options.names = [] + if options.username is None: + import pwd, os + + options.username = pwd.getpwuid(os.getuid()).pw_name + + from .SAMADhi import Sample, SAMADhiDB + from .utils import replaceWildcards + + if options.bootstrap: + install_brilcalc(local=options.local, username=options.username) + elif options.update: + update_brilcalc(local=options.local, username=options.username) + else: + with SAMADhiDB(credentials=args.database) as db: + for sample in chain( + (Sample.get_by_id(id_) for id_ in options.ids), + chain.from_iterable( + Sample.select().where(Sample.name % replaceWildcards(name, db=db)) + for name in options.names + ), + ): + compute_luminosity( + sample, normtag=options.normtag, local=options.local, username=options.username + ) diff --git a/python/scripts.py b/python/scripts.py new file mode 100644 index 0000000..eb871c2 --- /dev/null +++ b/python/scripts.py @@ -0,0 +1,547 @@ +import argparse +import glob +import os.path +from datetime import datetime + + +""" +Simple command-line SAMADhi utilities: search, interactive shell etc. 
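+
+Illustrative usage (assuming the functions below are installed as console scripts under the
+names mentioned in their docstrings, e.g. search_SAMADhi and iSAMADhi; the entry-point
+registration itself is not part of this module, and the sample names and ids are made up):
+
+    search_SAMADhi sample --name "TTJets*"   # list samples whose name matches a pattern
+    search_SAMADhi dataset -i 1234 --long    # detailed printout of dataset #1234
+    iSAMADhi                                 # open an IPython shell connected to the database
+
+All commands read the connection settings from ~/.samadhi unless --database is given.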
+""" + + +def interactive(args=None): + """iSAMADhi: Explore (and manipulate) the SAMADhi database in an IPython shell""" + parser = argparse.ArgumentParser( + description="Explore (and manipulate) the SAMADhi database in an IPython shell" + ) + parser.add_argument( + "--database", + default="~/.samadhi", + help="JSON Config file with database connection settings and credentials", + ) + args = parser.parse_args(args=args) + + from .SAMADhi import _models, SAMADhiDB + + import IPython + + for md in _models: + locals()[md.__name__] = md + with SAMADhiDB(credentials=args.database) as db: + IPython.embed( + banner1=( + "Interactively exploring SAMADhi database {database}\n" + "Available models: {models}\n" + "WARNING: by default your changes *will* be committed to the database" + ).format( + database="{}({}){}".format( + db.__class__.__name__, + db.database, + ( + " at {}".format(db.connect_params["host"]) + if "host" in db.connect_params + else "" + ), + ), + models=", ".join(md.__name__ for md in _models), + ) + ) + + +def search(args=None): + """search_SAMADhi: search for datasets, samples, results, or analyses""" + parser = argparse.ArgumentParser( + description="Search for datasets, samples, results or analyses in SAMADhi" + ) + parser.add_argument( + "type", + help="Object type to search for", + choices=["dataset", "sample", "result", "analysis"], + ) + parser.add_argument("-l", "--long", action="store_true", help="detailed output") + pquery = parser.add_mutually_exclusive_group(required=True) + pquery.add_argument("-n", "--name", help="filter on name") + pquery.add_argument( + "-p", + "--path", + help="filter on path", + type=(lambda pth: os.path.abspath(os.path.expandvars(os.path.expanduser(pth)))), + ) + pquery.add_argument("-i", "--id", type=int, help="filter on id") + parser.add_argument( + "--database", + default="~/.samadhi", + help="JSON Config file with database connection settings and credentials", + ) + args = parser.parse_args(args=args) + # more validation + if args.type in ("dataset", "analysis") and args.path: + parser.error(f"Cannot search {args.type} by path") + elif args.type == "result" and args.name: + parser.error("Cannot search results by name") + + from . import SAMADhi + from .SAMADhi import SAMADhiDB + from .utils import replaceWildcards + + objCls = getattr(SAMADhi, args.type.capitalize()) + + with SAMADhiDB(credentials=args.database) as db: + qry = objCls.select() + if args.id: + qry = qry.where(objCls.id == args.id) + elif args.name: + qry = qry.where(objCls.name % replaceWildcards(args.name, db=db)) + elif args.path: + qry = qry.where(objCls.path % replaceWildcards(args.path, db=db)) + results = qry.order_by(objCls.id) + + if args.long: + for entry in results: + print(str(entry)) + print(86 * "-") + else: + fmtStr = "{{0.id}}\t{{0.{0}}}".format( + "name" if args.type not in ("result", "analysis") else "description" + ) + for res in results: + print(fmtStr.format(res)) + + +def update_datasets_cross_section(args=None): + parser = argparse.ArgumentParser(description="Update cross-sections of datasets.") + parser.add_argument( + "regex", + type=str, + help=( + "Regular expression used to filter *samples*." + "Only '*' and '?' wildcards are supported. Take note that filtering is applied to samples, and not to datasets." 
+ ), + ) + parser.add_argument( + "-f", + "--force", + type=float, + help="For the cross-section of all datasets matching the regular expression to be this value", + metavar="XSEC", + ) + parser.add_argument("-w", "--write", action="store_true", help="Write changes to the database") + parser.add_argument( + "--database", + default="~/.samadhi", + help="JSON Config file with database connection settings and credentials", + ) + args = parser.parse_args(args) + + from .SAMADhi import Dataset, Sample, SAMADhiDB + from .utils import replaceWildcards, maybe_dryrun + + with SAMADhiDB(credentials=args.database) as db: + samples = Sample.select().where(Sample.name % replaceWildcards(args.regex, db=db)) + if samples.count() == 0: + print("No sample found.") + else: + with maybe_dryrun( + db, + dryRun=(not args.write), + dryMessage="Currently running in dry-run mode. If you are happy with the change, pass the '-w' flag to this script to store the changes into the database.", + ): + for sample in samples: + if sample.source_dataset.datatype == "data": + continue + # Consider a cross-section of one as a non-updated value + if ( + sample.source_dataset.xsection == 1 + or sample.source_dataset.xsection is None + ): + # Try to find a similar sample in the database, with the same center of mass energy + print(f"Updating cross-section of {sample.source_dataset.process}") + if args.force: + print(f" Forcing the cross-section to {args.force}") + sample.source_dataset.xsection = args.force + else: + possible_matches = Dataset.select().where( + (Dataset.process % sample.source_dataset.process) + & (Dataset.energy == sample.source_dataset.energy) + & (Dataset.id != sample.source_dataset.id) + ) + if possible_matches.count() == 0: + print("No match for this dataset found") + elif (possible_matches.count() > 1) and not all( + p.xsec == possible_matches[0].xsec for p in possible_matches + ): + print( + " Warning: more than one possible match found for this dataset, and they do not have the same cross-section. I do not know what to do..." + ) + else: + xsec = possible_matches[0].xsec + print(f" Updating with cross-section = {xsec}") + sample.source_dataset.xsection = xsec + + +def get_file_data(f_): + from cppyy import gbl + + f = gbl.TFile.Open(f_) + if not f: + return (None, None) + + weight_sum = f.Get("event_weight_sum") + if weight_sum: + weight_sum = weight_sum.GetVal() + else: + weight_sum = None + + entries = None + tree = f.Get("t") + if tree: + entries = tree.GetEntriesFast() + + return (weight_sum, entries) + + +def add_sample(args=None): + from .utils import ( + parsePath, + userFromPath, + timeFromPath, + confirm_transaction, + prompt_dataset, + prompt_sample, + ) + + parser = argparse.ArgumentParser(description="Add a sample to the database") + parser.add_argument("--name", help="specify sample name") + parser.add_argument( + "--processed", + type=int, + dest="nevents_processed", + help="number of processed events (from the input)", + ) + parser.add_argument("--nevents", type=int, help="number of events (in the sample)") + parser.add_argument("--norm", type=float, default=1.0, help="additional normalization factor") + parser.add_argument( + "--weight-sum", type=float, default=1.0, help="additional normalization factor" + ) + parser.add_argument("--lumi", type=float, help="sample (effective) luminosity") + parser.add_argument( + "--code_version", + default="", + help="version of the code used to process that sample (e.g. 
git tag or commit)", + ) + parser.add_argument("--comment", default="", help="comment about the dataset") + parser.add_argument("--source_dataset", type=int, help="reference to the source dataset") + parser.add_argument("--source_sample", type=int, help="reference to the source sample, if any") + parser.add_argument( + "-a", "--author", help="author of the result. If not specified, is taken from the path." + ) + parser.add_argument("--files", help="list of files (full path, comma-separated values)") + parser.add_argument( + "-t", + "--time", + help='result timestamp. If set to "path", timestamp will be taken from the path. Otherwise, it must be formated like YYYY-MM-DD HH:MM:SS. Default is current time.', + ) + parser.add_argument( + "--database", + default="~/.samadhi", + help="JSON Config file with database connection settings and credentials", + ) + parser.add_argument( + "-y", + "--continue", + dest="assumeDefault", + action="store_true", + help="Assume defaults instead of prompt", + ) + parser.add_argument( + "type", choices=["PAT", "SKIM", "RDS", "NTUPLES", "HISTOS"], help="Sample type" + ) + parser.add_argument("path", help="location of the sample on disk", type=parsePath) + args = parser.parse_args(args=args) + + if args.author is None: + args.author = userFromPath(args.path) + if args.time == "path": + args.time = timeFromPath(args.path) + elif args.time is not None: + args.time = datetime.strptime(args.time, "%Y-%m-%d %H:%M:%S") + else: + args.time = datetime.now() + if args.name is None: + args.name = next(tk for tk in reversed(args.path.split("/")) if len(tk)) + + from .SAMADhi import Dataset, Sample, File, SAMADhiDB + + with SAMADhiDB(credentials=args.database) as db: + existing = Sample.get_or_none(Sample.name == args.name) + with confirm_transaction( + db, + "Insert into the database?" 
if existing is None else f"Replace existing {existing!s}?", + assumeDefault=args.assumeDefault, + ): + sample, created = Sample.get_or_create( + name=args.name, + path=args.path, + defaults={"sampletype": args.type, "nevents_processed": args.nevents_processed}, + ) + sample.sampletype = args.type + sample.nevents_processed = args.nevents_processed + sample.nevents = args.nevents + sample.normalization = args.norm + sample.event_weight_sum = args.weight_sum + sample.luminosity = args.lumi + sample.code_version = args.code_version + sample.user_comment = args.comment + sample.source_dataset = ( + Dataset.get_or_none(Dataset.id == args.source_dataset) + if args.source_dataset is not None + else None + ) + sample.source_sample = ( + Sample.get_or_none(Sample.id == args.source_sample) + if args.source_sample is not None + else None + ) + sample.author = args.author + sample.creation_time = args.time + + if sample.source_dataset is None and not args.assumeDefault: + prompt_dataset(sample) ## TODO: check existence + if sample.source_sample is None and not args.assumeDefault: + prompt_sample(sample) ## TODO: check existence + + if sample.nevents_processed is None: + if sample.source_sample is not None: + sample.nevents_processed = sample.source_sample.nevents_processed + elif sample.source_dataset is not None: + sample.nevents_processed = sample.source_dataset.nevents + else: + print("Warning: Number of processed events not given, and no way to guess it.") + + if args.files is not None: + files = list(args.files.split(",")) + else: + files = glob.glob(os.path.join(sample.path, "*.root")) + if not files: + print(f"Warning: no root files found in {sample.path!r}") + for fName in files: + weight_sum, entries = get_file_data(fName) + File.create( + lfn=fName, + pfn=fName, + event_weight_sum=weight_sum, + nevents=(entries if entries is not None else 0), + sample=sample, + ) ## FIXME extras_event_weight_sum + + if sample.luminosity is None: + sample.luminosity = sample.getLuminosity() + sample.save() + + print(sample) + + +def add_result(args=None): + from .utils import parsePath, userFromPath, timeFromPath, confirm_transaction, prompt_samples + + parser = argparse.ArgumentParser(description="Add a result to the database") + parser.add_argument("path", type=parsePath) + parser.add_argument( + "-s", + "--sample", + dest="inputSamples", + help="comma separated list of samples used as input to produce that result", + ) + parser.add_argument("-d", "--description", help="description of the result") + parser.add_argument("-e", "--elog", help="elog with more details") + parser.add_argument("-A", "--analysis", type=int, help="analysis whose result belong to") + parser.add_argument( + "-a", "--author", help="author of the result. If not specified, is taken from the path" + ) + parser.add_argument( + "-t", + "--time", + help='result timestamp. If set to "path", timestamp will be taken from the path. 
Otherwise, it must be formated like YYYY-MM-DD HH:MM:SS', + ) + parser.add_argument( + "--database", + default="~/.samadhi", + help="JSON Config file with database connection settings and credentials", + ) + parser.add_argument( + "-y", + "--continue", + dest="assumeDefault", + action="store_true", + help="Assume defaults instead of prompt", + ) + args = parser.parse_args(args=args) + + if args.author is None: + args.author = userFromPath(args.path) + if args.time == "path": + time = timeFromPath(args.path) + elif args.time is not None: + time = datetime.strptime(args.time, "%Y-%m-%d %H:%M:%S") + else: + time = datetime.now() + + from .SAMADhi import Sample, Result, SampleResult, SAMADhiDB + + with SAMADhiDB(credentials=args.database) as db: + with confirm_transaction(db, "Insert into the database?", assumeDefault=args.assumeDefault): + result = Result.create( + path=args.path, + description=args.description, + author=args.author, + creation_time=time, + elog=args.elog, + analysis=args.analysis, + ) + if args.inputSamples is None: + inputSampleIDs = prompt_samples() + else: + inputSampleIDs = [int(x) for x in args.inputSamples.split(",")] + for smpId in inputSampleIDs: + smp = Sample.get_or_none(Sample.id == smpId) + if not smp: + print(f"Could not find sample #{smpId:d}") + else: + SampleResult.create(sample=smp, result=result) + print(result) + + +def splitWith(sequence, predicate): + trueList, falseList = [], [] + for element in sequence: + if predicate(element): + trueList.append(element) + else: + falseList.append(element) + + +def checkAndClean(args=None): + from .utils import parsePath, redirectOut, arg_loadJSON + + parser = argparse.ArgumentParser(description="Script to check samples for deletion") + parser.add_argument( + "-p", + "--path", + default="./", + type=parsePath, + help="Path to the json files with db analysis results", + ) + parser.add_argument("-o", "--output", default="-", help="Name of the output file") + parser.add_argument( + "-M", + "--cleanupMissing", + action="store_true", + help="Clean samples with missing path from the database.", + ) + parser.add_argument( + "-U", + "--cleanupUnreachable", + action="store_true", + help="Clean samples with unreachable path from the database", + ) + parser.add_argument( + "-D", + "--cleanupDatasets", + action="store_true", + help="Clean orphan datasets from the database", + ) + parser.add_argument( + "-w", + "--whitelist", + type=arg_loadJSON, + help="JSON file with sample whitelists per analysis.", + ) + parser.add_argument( + "-d", + "--dry-run", + action="store_true", + help="Dry run: do not write to file and/or touch the database.", + ) + parser.add_argument( + "--database", + default="~/.samadhi", + help="JSON Config file with database connection settings and credentials", + ) + args = parser.parse_args(args=args) + + from .SAMADhi import SAMADhiDB + + with redirectOut(args.output if not args.dry_run else "-"): + # open the sample analysis report and classify bad samples + samples_missing = arg_loadJSON(os.path.join(args.path, "SamplesAnalysisReport.json")).get( + "MissingDirSamples", [] + ) + smp_white, smp_nonWhite = splitWith( + samples_missing, + lambda smp: any(label in smp["name"] for v in args.whitelist.values() for label in v), + ) + smp_empty, smp_investigate = splitWith(smp_white, lambda smp: smp["path"] == "") + smp_empty_delete, smp_delete = splitWith(smp_nonwhite, lambda smp: smp["path"] == "") + # now clean orphan datasets + ds_orphan = arg_loadJSON(os.path.join(args.path, "DatasetsAnalysisReport.json")).get( 
+ "Orphans", [] + ) + ## print a summary now + print( + "\n\nWhitelisted sample with missing path. Investigate:\n{}".format( + "\n".join(smp["name"] for smp in smp_empty) + ) + ) + print( + "\n\nWhitelisted sample with unreachable path. Investigate:\n{}".format( + "\n".join(smp["name"] for smp in smp_investigate) + ) + ) + print( + "\n\nSamples to be deleted because of missing path:\n{}".format( + "\n".join(smp["name"] for smp in smp_empty_delete) + ) + ) + print( + "\n\nSamples to be deleted because of unreachable path:\n{}".format( + "\n".join(smp["name"] for smp in smp_delete) + ) + ) + ## actually perform the cleanup + with SAMADhiDB(credentials=args.database) as db: + with maybe_dryrun(db, dryRun=args.dry_run): + if opts.cleanupMissing: + for smp in smp_empty_delete: + sample = Sample.get_or_none( + (Sample.id == smp["id"]) & (Sample.name == smp["name"]) + ) + if sample is None: + print( + "Could not find sample #{id} {name}".format(smp["id"], smp["name"]) + ) + else: + smp.removeFiles() + smp.delete_instance() + if opts.cleanupUnreachable: + for smp in smp_delete: + sample = Sample.get_or_none( + (Sample.id == smp["id"]) & (Sample.name == smp["name"]) + ) + if sample is None: + print( + "Could not find sample #{id} {name}".format(smp["id"], smp["name"]) + ) + else: + sample.removeFiles() + sample.delete_instance() + if args.cleanupDatasets: + for ids in ds_orphan: + dataset = Dataset.get_or_none( + (Dataset.id == ids["id"]) & (Dataset.name == ids["name"]) + ) + if dataset is None: + print( + "Could not find dataset #{id} {name}".format(ids["id"], ids["name"]) + ) + else: + dataset.delete_instance() diff --git a/python/userPrompt.py b/python/userPrompt.py deleted file mode 100644 index 32e2199..0000000 --- a/python/userPrompt.py +++ /dev/null @@ -1,107 +0,0 @@ -from cp3_llbb.SAMADhi.SAMADhi import Sample, Dataset - -def confirm(prompt=None, resp=False): - """prompts for yes or no response from the user. Returns True for yes and - False for no. 'resp' should be set to the default value assumed by the caller when - user simply types ENTER. - >>> confirm(prompt='Create Directory?', resp=True) - Create Directory? [y]|n: - True - >>> confirm(prompt='Create Directory?', resp=False) - Create Directory? [n]|y: - False - >>> confirm(prompt='Create Directory?', resp=False) - Create Directory? [n]|y: y - True - """ - if prompt is None: - prompt = 'Confirm' - if resp: - prompt = '%s [%s]|%s: ' % (prompt, 'y', 'n') - else: - prompt = '%s [%s]|%s: ' % (prompt, 'n', 'y') - while True: - ans = raw_input(prompt) - if not ans: - return resp - if ans not in ['y', 'Y', 'n', 'N']: - print 'please enter y or n.' - continue - if ans == 'y' or ans == 'Y': - return True - if ans == 'n' or ans == 'N': - return False - -def parse_samples(inputString): - """parse a comma-separated list of samples""" - return [ int(x) for x in inputString.split(',') ] - -def prompt_samples(store): - """prompts for the source sample among the existing ones""" - print "No source sample defined." - print "Please select the samples associated with this result." 
- # full list of samples - print "Sample\t\tName" - check = store.find(Sample) - all_samples = check.values(Sample.sample_id,Sample.name) - for dset in all_samples: - print "%i\t\t%s"%(dset[0], dset[1]) - # prompt - while True: - try: - return parse_samples(raw_input("Comma-separated list of sample id [None]?")) - except: - continue - -def prompt_sample(sample,store): - """prompts for the source sample among the existing ones""" - print "Please select the sample associated with this sample." - # full list of samples - print "Sample\t\tName" - check = store.find(Sample) - all_samples = check.values(Sample.sample_id,Sample.name) - for dset in all_samples: - print "%i\t\t%s"%(dset[0], dset[1]) - # prompt - while True: - try: - ans = int(raw_input("Sample id [None]?")) - except: - sample.source_sample_id = None - return - check = store.find(Sample,Sample.sample_id==ans) - if check.is_empty(): continue - else: - sample.source_sample_id = ans - return - -def prompt_dataset(sample,store): - """prompts for the source dataset among the existing ones""" - print "Please select the dataset associated with this sample." - # full list of datasets - print "Dataset\t\tName" - check = store.find(Dataset) - all_datasets = check.values(Dataset.dataset_id,Dataset.name) - for dset in all_datasets: - print "%i\t\t%s"%(dset[0], dset[1]) - # datasets whose name contain the sample name - check = store.find(Dataset,Dataset.name.contains_string(sample.name)) - if not check.is_empty(): - print "Suggestions:" - print "Dataset\t\tName" - suggested_datasets = check.values(Dataset.dataset_id,Dataset.name) - for dset in suggested_datasets: - print "%i\t\t%s"%(dset[0], dset[1]) - # prompt - while True: - try: - ans = int(raw_input("Dataset id [None]?")) - except: - sample.source_dataset_id = None - return - check = store.find(Dataset,Dataset.dataset_id==ans) - if check.is_empty(): continue - else: - sample.source_dataset_id = ans - return - diff --git a/python/utils.py b/python/utils.py new file mode 100644 index 0000000..9cc6223 --- /dev/null +++ b/python/utils.py @@ -0,0 +1,202 @@ +from contextlib import contextmanager + + +def parsePath(pth): + """Expand (user and vars), and check that a path is a valid file or directory""" + import os.path + import argparse + + pth = os.path.abspath(os.path.expandvars(os.path.expanduser(pth))) + if not os.path.exists(pth) or not (os.path.isdir(pth) or os.path.isfile(pth)): + raise argparse.ArgumentError(f"{pth} is not an existing file or directory") + return pth + + +def userFromPath(pth): + """Get the username of the path owner""" + import os + from pwd import getpwuid + + return getpwuid(os.stat(pth).st_uid).pw_name + + +def timeFromPath(pth): + import os.path + from datetime import datetime + + return datetime.fromtimestamp(os.path.getctime(pth)) + + +def checkWriteable(pth): + """Expand path, and check that it is writeable and does not exist yet""" + import os, os.path + + pth = os.path.abspath(os.path.expandvars(os.path.expanduser(pth))) + if not os.access(pth, os.W_OK): + raise argparse.ArgumentError(f"Cannot write to {pth}") + if os.path.isfile(pth): + raise argparse.ArgumentError(f"File already exists: {pth}") + return pth + + +@contextmanager +def redirectOut(outArg): + """Redirect sys.stdout to file (if the argument is a writeable file that does not exist yet), + no-op if the argument is '-'""" + if outArg == "-": + yield + else: + outPth = checkWriteable(outArg) + import sys + + with open(outPth, "W") as outF: + bk_stdout = sys.stdout + sys.stdout = outF + yield + sys.stdout 
= bk_stdout + + +def arg_loadJSON(pth): + """Try to parse the JSON file (type for argparse argumet)""" + if pth: + import json + + with open(parsePath(pth)) as jsF: + return json.load(jsF) + else: + return dict() + + +def replaceWildcards(arg, db=None): + if db: + from peewee import SqliteDatabase + + if isinstance(db, SqliteDatabase): + return arg ## sqlite uses the usual * etc. + return arg.replace("*", "%").replace("?", "_") + + +def confirm(prompt=None, resp=False, assumeDefault=False): + """prompts for yes or no response from the user. Returns True for yes and + False for no. 'resp' should be set to the default value assumed by the caller when + user simply types ENTER. + >>> confirm(prompt='Create Directory?', resp=True) + Create Directory? [y]|n: + True + >>> confirm(prompt='Create Directory?', resp=False) + Create Directory? [n]|y: + False + >>> confirm(prompt='Create Directory?', resp=False) + Create Directory? [n]|y: y + True + """ + if prompt is None: + prompt = "Confirm" + if resp: + prompt = "{} [{}]|{}: ".format(prompt, "y", "n") + else: + prompt = "{} [{}]|{}: ".format(prompt, "n", "y") + if assumeDefault: + print("".join((prompt, ("y" if resp else "n")))) + return resp + while True: + ans = input(prompt) + if not ans: + return resp + if ans not in ["y", "Y", "n", "N"]: + print("please enter y or n.") + continue + if ans == "y" or ans == "Y": + return True + if ans == "n" or ans == "N": + return False + + +def prompt_samples(): + """prompts for the source sample among the existing ones""" + from .SAMADhi import Sample + + print("No source sample defined.") + print("Please select the samples associated with this result.") + # full list of samples + print("Sample\t\tName") + for smp in Sample.select(): + print("%i\t\t%s" % (smp.id, smp.name)) + # prompt + while True: + try: + return [int(x) for x in input("Comma-separated list of sample id [None]?").split(",")] + except: + continue + + +def prompt_sample(sample): + """prompts for the source sample among the existing ones""" + from .SAMADhi import Sample + + print("Please select the sample associated with this sample.") + # full list of samples + print("Sample\t\tName") + for smp in Sample.select(): + print("%i\t\t%s" % (smp.id, smp.name)) + # prompt + while True: + try: + ans = int(input("Sample id [None]?")) + except: + sample.source_sample = None + return + smp_db = Sample.get_or_none(Sample.id == ans) + if smp_db is not None: + sample.source_sample = smp_db + else: + continue + + +def prompt_dataset(sample): + """prompts for the source dataset among the existing ones""" + from .SAMADhi import Dataset + + print("Please select the dataset associated with this sample.") + # full list of datasets + print("Dataset\t\tName") + for ds in Dataset.select(): + print("%i\t\t%s" % (ds.id, ds.name)) + # datasets whose name contain the sample name + suggestions = Dataset.select().where(Dataset.name.contains(sample.name)) + if suggestions.count() > 0: + print("Suggestions:") + print("Dataset\t\tName") + suggested_datasets = check.values(Dataset.id, Dataset.name) + for ds in suggested_datasets: + print("%i\t\t%s" % (ds.id, ds.name)) + # prompt + while True: + try: + ans = int(input("Dataset id [None]?")) + except: + sample.source_dataset = None + return + dset_db = Dataset.get_or_none(Dataset.id == ans) + if dset_db is not None: + sample.source_dataset = smp_db + else: + continue + + +@contextmanager +def confirm_transaction(db, prompt, assumeDefault=False): + with db.atomic() as txn: + yield + answer = confirm(prompt=prompt, resp=True, 
assumeDefault=assumeDefault) + if not answer: + txn.rollback() + + +@contextmanager +def maybe_dryrun(db, dryMessage=None, dryRun=False): + with db.atomic() as txn: + yield + if dryRun: + print(dryMessage) + txn.rollback() diff --git a/scripts/SAMADhi_dbAnalysis.py b/scripts/SAMADhi_dbAnalysis.py deleted file mode 100755 index 3dcd189..0000000 --- a/scripts/SAMADhi_dbAnalysis.py +++ /dev/null @@ -1,745 +0,0 @@ -#!/usr/bin/env python - -# Script to do basic checks to the database and output statistics on usage and issues - -import os,errno,json -import re -import ROOT -ROOT.gROOT.SetBatch() -from optparse import OptionParser, OptionGroup -from datetime import date -from collections import defaultdict -from cp3_llbb.SAMADhi.SAMADhi import Analysis, Dataset, Sample, Result, DbStore -from cp3_llbb.SAMADhi.SAMADhi import File as SFile -from storm.info import get_cls_info -from datetime import datetime -from collections import defaultdict -from cp3_llbb.SAMADhi.das_import import query_das - -class MyOptionParser: - """ - Client option parser - """ - def __init__(self): - usage = "Usage: %prog [options]\n" - self.parser = OptionParser(usage=usage) - self.parser.add_option("-p","--path", action="store", type="string", - dest="path", default=datetime.now().strftime("%y%m%d-%H:%M:%S"), - help="Destination path") - self.parser.add_option("-b","--basedir", action="store", type="string", - dest="basedir", default="", - help="Directory where the website will be installed") - self.parser.add_option("-f","--full", action="store_true", - dest="DAScrosscheck", default=False, - help="Full check: compares each Dataset entry to DAS and check for consistency (slow!)") - self.parser.add_option("-d","--dry", action="store_true", - dest="dryRun", default=False, - help="Dry run: do no write to disk") - - def get_opt(self): - """ - Returns parse list of options - """ - opts, args = self.parser.parse_args() - if opts.path is not None: - opts.path = os.path.abspath(os.path.expandvars(os.path.expanduser(opts.path))) - if not opts.dryRun and os.path.exists(opts.path): - raise OSError(errno.EEXIST,"Existing directory",opts.path); - return opts - -def main(): - """Main function""" - # get the options - optmgr = MyOptionParser() - opts = optmgr.get_opt() - # connect to the MySQL database using default credentials - dbstore = DbStore() - # prepare the output directory - if not os.path.exists(opts.path) and not opts.dryRun: - os.makedirs(opts.path) - # run each of the checks and collect data - - # collect general statistics - outputDict = collectGeneralStats(dbstore,opts) - if not opts.dryRun: - with open(opts.path+'/stats.json', 'w') as outfile: - json.dump(outputDict, outfile, default=encode_storm_object) - force_symlink(opts.path+'/stats.json',opts.basedir+'/data/stats.json') - - # check datasets - outputDict = {} - outputDict["DatabaseInconsistencies"] = checkDatasets(dbstore,opts) if opts.DAScrosscheck else copyInconsistencies(opts.basedir) - dbstore = DbStore() # reconnect, since the checkDatasets may take very long... 
- outputDict["Orphans"] = findOrphanDatasets(dbstore,opts) - outputDict["IncompleteData"] = checkDatasetsIntegrity(dbstore,opts) - outputDict["DatasetsStatistics"] = analyzeDatasetsStatistics(dbstore,opts) - if not opts.dryRun: - with open(opts.path+'/DatasetsAnalysisReport.json', 'w') as outfile: - json.dump(outputDict, outfile, default=encode_storm_object) - force_symlink(opts.path+'/DatasetsAnalysisReport.json',opts.basedir+'/data/DatasetsAnalysisReport.json') - - # check samples - outputDict = {} - outputDict["MissingDirSamples"] = checkSamplePath(dbstore,opts) - outputDict["DatabaseInconsistencies"] = checkSampleConsistency(dbstore,opts) - outputDict["SampleStatistics"] = analyzeSampleStatistics(dbstore,opts) - if not opts.dryRun: - with open(opts.path+'/SamplesAnalysisReport.json', 'w') as outfile: - json.dump(outputDict, outfile, default=encode_storm_object) - force_symlink(opts.path+'/SamplesAnalysisReport.json',opts.basedir+'/data/SamplesAnalysisReport.json') - - # now, check results - outputDict = {} - outputDict["MissingDirSamples"] = checkResultPath(dbstore,opts) - outputDict["DatabaseInconsistencies"] = checkResultConsistency(dbstore,opts) - outputDict["SelectedResults"] = selectResults(dbstore,opts) - outputDict["ResultsStatistics"] = analyzeResultsStatistics(dbstore,opts) - if not opts.dryRun: - with open(opts.path+'/ResultsAnalysisReport.json', 'w') as outfile: - json.dump(outputDict, outfile, default=encode_storm_object) - force_symlink(opts.path+'/ResultsAnalysisReport.json',opts.basedir+'/data/ResultsAnalysisReport.json') - - # finally, some stats about Analysis objects - outputDict = {} - outputDict["AnalysisStatistics"] = analyzeAnalysisStatistics(dbstore,opts) - if not opts.dryRun: - with open(opts.path+'/AnalysisAnalysisReport.json', 'w') as outfile: - json.dump(outputDict, outfile, default=encode_storm_object) - force_symlink(opts.path+'/AnalysisAnalysisReport.json',opts.basedir+'/data/AnalysisAnalysisReport.json') - -def collectGeneralStats(dbstore,opts): - # get number of datasets, samples, results, analyses - result = {} - results = dbstore.find(Result) - samples = dbstore.find(Sample) - datasets = dbstore.find(Dataset) - analyses = dbstore.find(Analysis) - result["nDatasets"] = datasets.count() - result["nSamples"] = samples.count() - result["nResults"] = results.count() - result["nAnalyses"] = analyses.count() - print "\nGeneral statistics:" - print '======================' - print datasets.count(), " datasets" - print samples.count(), " samples" - print results.count(), " results" - return result - -def checkDatasets(dbstore,opts): - datasets = dbstore.find(Dataset) - print "\nDatasets inconsistent with DAS:" - print '==================================' - result = [] - for dataset in datasets: - # query DAS to get the same dataset, by name - metadata = {} - try: - metadata = query_das(dataset.name) - except: - result.append([dataset,"Inconsistent with DAS"]) - print "%s (imported on %s) -- Error getting dataset in DAS"%(str(dataset.name),str(dataset.creation_time)) - continue - - # perform some checks: - try: - # release name either matches or is unknown in DAS - test1 = str(metadata[u'release'])=="unknown" or dataset.cmssw_release == str(metadata[u'release']) - # datatype matches - test2 = dataset.datatype == metadata[u'datatype'] - # nevents matches - test3 = dataset.nevents == metadata[u'nevents'] - # size matches - test4 = dataset.dsize == metadata[u'file_size'] - except: - result.append([dataset,"Inconsistent with DAS"]) - print "%s (imported on 
%s)"%(str(dataset.name),str(dataset.creation_time)) - else: - if not(test1 and test2 and test3 and test4): - result.append([dataset,"Inconsistent with DAS"]) - print "%s (imported on %s)"%(str(dataset.name),str(dataset.creation_time)) - return result - -def findOrphanDatasets(dbstore,opts): - datasets = dbstore.find(Dataset) - print "\nOrphan Datasets:" - print '===================' - result = [] - for dataset in datasets: - if dataset.samples.count()==0: - result.append(dataset) - print "%s (imported on %s)"%(str(dataset.name),str(dataset.creation_time)) - if len(result)==0: - print "None" - return result - -def checkDatasetsIntegrity(dbstore,opts): - datasets = dbstore.find(Dataset) - print "\nDatasets integrity issues:" - print '===========================' - result = [] - for dataset in datasets: - if dataset.cmssw_release is None: - result.append([dataset,"missing CMSSW release"]) - print "%s (imported on %s): missing CMSSW release"%(str(dataset.name),str(dataset.creation_time)) - elif dataset.energy is None: - result.append([dataset,"missing Energy"]) - print "%s (imported on %s): missing Energy"%(str(dataset.name),str(dataset.creation_time)) - elif dataset.globaltag is None: - result.append([dataset,"missing Globaltag"]) - print "%s (imported on %s): missing Globaltag"%(str(dataset.name),str(dataset.creation_time)) - if len(result)==0: - print "None" - return result - - -def analyzeDatasetsStatistics(dbstore,opts): - # ROOT output - if not opts.dryRun: - rootfile = ROOT.TFile(opts.path+"/analysisReport.root","update") - stats = {} - # Releases used - output = dbstore.execute("select dataset.cmssw_release,COUNT(dataset.dataset_id) as numOfDataset FROM dataset GROUP BY cmssw_release") - stats["cmssw_release"] = output.get_all() - if None in stats["cmssw_release"]: - stats["cmssw_release"]["Unknown"] = stats["cmssw_release"][None] + stats["cmssw_release"].get("Unknown",0) - del stats["cmssw_release"][None] - releasePie = ROOT.TPie("datasetReleasePie","Datasets release",len(stats["cmssw_release"])) - for index,entry in enumerate(stats["cmssw_release"]): - releasePie.SetEntryVal(index,entry[1]) - releasePie.SetEntryLabel(index,"None" if entry[0] is None else entry[0]) - releasePie.SetTextAngle(0); - releasePie.SetRadius(0.3); - releasePie.SetTextColor(1); - releasePie.SetTextFont(62); - releasePie.SetTextSize(0.03); - canvas = ROOT.TCanvas("datasetRelease","",2) - releasePie.Draw("r") - if not opts.dryRun: - ROOT.gPad.Write() - # GlobalTag used - output = dbstore.execute("select dataset.globaltag,COUNT(dataset.dataset_id) as numOfDataset FROM dataset GROUP BY globaltag") - stats["globaltag"] = output.get_all() - if None in stats["globaltag"]: - stats["globaltag"]["Unknown"] = stats["globaltag"][None] + stats["globaltag"].get("Unknown",0) - del stats["globaltag"][None] - globaltagPie = ROOT.TPie("datasetGTPie","Datasets globaltag",len(stats["globaltag"])) - for index,entry in enumerate(stats["globaltag"]): - globaltagPie.SetEntryVal(index,entry[1]) - globaltagPie.SetEntryLabel(index,"None" if entry[0] is None else entry[0]) - globaltagPie.SetTextAngle(0); - globaltagPie.SetRadius(0.3); - globaltagPie.SetTextColor(1); - globaltagPie.SetTextFont(62); - globaltagPie.SetTextSize(0.03); - canvas = ROOT.TCanvas("datasetGT","",2) - globaltagPie.Draw("r") - if not opts.dryRun: - ROOT.gPad.Write() - # Datatype - output = dbstore.execute("select dataset.datatype,COUNT(dataset.dataset_id) as numOfDataset FROM dataset GROUP BY datatype") - stats["datatype"] = output.get_all() - if None in 
stats["datatype"]: - stats["datatype"]["Unknown"] = stats["datatype"][None] + stats["datatype"].get("Unknown",0) - del stats["datatype"][None] - datatypePie = ROOT.TPie("datasetTypePie","Datasets datatype",len(stats["datatype"])) - for index,entry in enumerate(stats["datatype"]): - datatypePie.SetEntryVal(index,entry[1]) - datatypePie.SetEntryLabel(index,"None" if entry[0] is None else entry[0]) - datatypePie.SetTextAngle(0); - datatypePie.SetRadius(0.3); - datatypePie.SetTextColor(1); - datatypePie.SetTextFont(62); - datatypePie.SetTextSize(0.03); - canvas = ROOT.TCanvas("datasetType","",2) - datatypePie.Draw("r") - if not opts.dryRun: - ROOT.gPad.Write() - # Energy - output = dbstore.execute("select dataset.energy,COUNT(dataset.dataset_id) as numOfDataset FROM dataset GROUP BY energy") - stats["energy"] = output.get_all() - energyPie = ROOT.TPie("datasetEnergyPie","Datasets energy",len(stats["energy"])) - for index,entry in enumerate(stats["energy"]): - energyPie.SetEntryVal(index,entry[1]) - energyPie.SetEntryLabel(index,"None" if entry[0] is None else str(entry[0])) - energyPie.SetTextAngle(0); - energyPie.SetRadius(0.3); - energyPie.SetTextColor(1); - energyPie.SetTextFont(62); - energyPie.SetTextSize(0.03); - canvas = ROOT.TCanvas("datasetEnergy","",2) - energyPie.Draw("r") - if not opts.dryRun: - ROOT.gPad.Write() - # get all datasets to loop - datasets = dbstore.find(Dataset) - datasets.order_by(Dataset.creation_time) - # time evolution of # datasets (still in db) - datasets_time = [[0,0]] - # various stats (histograms) - datasets_nsamples = ROOT.TH1I("dataseets_nsamples","datasets_nsamples",10,0,10) - datasets_nevents = ROOT.TH1I("dataseets_nevents", "datasets_nevents" ,100,0,-100) - datasets_dsize = ROOT.TH1I("dataseets_dsize", "datasets_dsize" ,100,0,-100) - # let's go... loop - for dataset in datasets: - # for Highcharts the time format is #seconds since epoch - time = int(dataset.creation_time.strftime("%s"))*1000 - datasets_time.append([time,datasets_time[-1][1]+1]) - datasets_nsamples.Fill(dataset.samples.count()) - datasets_nevents.Fill(dataset.nevents) - datasets_dsize.Fill(dataset.dsize) - # drop this: just to initialize the loop - datasets_time.pop(0) - # output - stats["datasetsTimeprof"] = datasets_time - datasetsTimeprof_graph = ROOT.TGraph(len(datasets_time)) - for i,s in enumerate(datasets_time): - datasetsTimeprof_graph.SetPoint(i,s[0]/1000,s[1]) - if not opts.dryRun: - datasetsTimeprof_graph.Write("datasetsTimeprof_graph") - data = [] - for bin in range(1,datasets_nsamples.GetNbinsX()+1): - data.append([datasets_nsamples.GetBinCenter(bin),datasets_nsamples.GetBinContent(bin)]) - stats["datasetsNsamples"] = data - data = [] - for bin in range(1,datasets_nevents.GetNbinsX()+1): - data.append([datasets_nevents.GetBinCenter(bin),datasets_nevents.GetBinContent(bin)]) - stats["datasetsNevents"] = data - data = [] - for bin in range(1,datasets_dsize.GetNbinsX()+1): - data.append([datasets_dsize.GetBinCenter(bin),datasets_dsize.GetBinContent(bin)]) - stats["datasetsDsize"] = data - # some printout - print "\nDatasets Statistics extracted." - print '=================================' - # ROOT output - if not opts.dryRun: - rootfile.Write(); - rootfile.Close(); - # JSON output - return stats - - -def checkResultPath(dbstore,opts): - # get all samples - result = dbstore.find(Result) - print "\nResults with missing path:" - print '===========================' - array = [] - for res in result: - # check that the path exists, and keep track of the sample if not the case. 
- if not os.path.exists(res.path): - print "Result #%s (created on %s by %s):"%(str(res.result_id),str(res.creation_time),str(res.author)), - print " missing path: %s" %res.path - array.append(res) - if len(array)==0: print "None" - return array - - -def checkSamplePath(dbstore,opts): - # get all samples - result = dbstore.find(Sample) - print "\nSamples with missing path:" - print '===========================' - array = [] - for sample in result: - # check that the path exists, and keep track of the sample if not the case. - vpath = getSamplePath(sample,dbstore) - for path in vpath: - if not os.path.exists(path): - print "Sample #%s (created on %s by %s):"%(str(sample.sample_id),str(sample.creation_time),str(sample.author)), - print " missing path: %s" %path - print vpath - array.append(sample) - break - if len(array)==0: print "None" - return array - -def getSamplePath(sample,dbstore): - vpath=[] - # the path should be stored in sample.path - # if it is empty, look for files in that path - if sample.path=="": - regex = r".*SFN=(.*)" - files = dbstore.find(SFile, SFile.sample_id==sample.sample_id) - for file in files: - m = re.search(regex,str(file.pfn)) - if m: vpath.append(os.path.dirname(m.group(1))) - vpath=list(set(vpath)) - return vpath - else: - return [sample.path] - -def selectResults(dbstore,opts): - # look for result records pointing to a ROOT file - # eventually further filter - results = dbstore.find(Result) - print "\nSelected results:" - print '===========================' - array = [] - for result in results: - path = result.path - if os.path.exists(path) and os.path.isdir(path): - files = [ f for f in os.listdir(path) if os.path.isfile(path+"/"+f) ] - if len(files)==1: - path = path+"/"+f - result.path = path - if os.path.exists(path) and os.path.isfile(path) and path.lower().endswith(".root"): - symlink = "%s/data/result_%s.root"%(opts.basedir,str(result.result_id)) - relpath = "../data/result_%s.root"%(str(result.result_id)) - force_symlink(path,symlink) - array.append([result,relpath]) - print "Result #%s (created on %s by %s): "%(str(result.result_id),str(result.creation_time),str(result.author)), - print symlink - - if len(array)==0: print "None" - return array - -def checkResultConsistency(dbstore,opts): - # get all samples - result = dbstore.find(Result) - print "\nResults with missing source:" - print '=============================' - array = [] - for res in result: - # check that the source sample exists in the database. - # normaly, this should be protected already at the level of sql rules - for sample in res.samples: - if sample is None: - print "Result #%s (created on %s by %s):"%(str(res.result_id),str(res.creation_time),str(res.author)), - print "inconsistent source sample" - array.append([res,"inconsistent source sample"]) - print res - break - if len(array)==0: print "None" - return array - - -def checkSampleConsistency(dbstore,opts): - # get all samples - result = dbstore.find(Sample) - print "\nSamples with missing source:" - print '=============================' - array = [] - for sample in result: - # check that either the source dataset or the source sample exists in the database. 
- # normaly, this should be protected already at the level of sql rules - sourceDataset = sample.source_dataset - sourceSample = sample.source_sample - if (sample.source_dataset_id is not None) and (sourceDataset is None): - print "Sample #%s (created on %s by %s):"%(str(sample.sample_id),str(sample.creation_time),str(sample.author)), - print "inconsistent source dataset" - array.append([sample,"inconsistent source dataset"]) - print sample - if (sample.source_sample_id is not None) and (sourceSample is None): - print "Sample #%s (created on %s by %s):"%(str(sample.sample_id),str(sample.creation_time),str(sample.author)), - print "inconsistent source sample" - array.append([sample,"inconsistent source sample"]) - if len(array)==0: print "None" - return array - -def analyzeAnalysisStatistics(dbstore,opts): - stats = {} - # ROOT output - if not opts.dryRun: - rootfile = ROOT.TFile(opts.path+"/analysisReport.root","update") - # contact - output = dbstore.execute("select analysis.contact,COUNT(analysis.analysis_id) as numOfAnalysis FROM analysis GROUP BY contact") - stats["analysisContacts"] = output.get_all() - if None in stats["analysisContacts"]: - stats["analysisContacts"]["Unknown"] = stats["analysisContacts"][None] + stats["analysisContacts"].get("Unknown",0) - del stats["analysisContacts"][None] - contactPie = ROOT.TPie("AnalysisContactPie","Analysis contacts",len(stats["analysisContacts"])) - for index,entry in enumerate(stats["analysisContacts"]): - contactPie.SetEntryVal(index,entry[1]) - contactPie.SetEntryLabel(index,"None" if entry[0] is None else entry[0]) - contactPie.SetTextAngle(0); - contactPie.SetRadius(0.3); - contactPie.SetTextColor(1); - contactPie.SetTextFont(62); - contactPie.SetTextSize(0.03); - canvas = ROOT.TCanvas("analysisContact","",2) - contactPie.Draw("r") - if not opts.dryRun: - ROOT.gPad.Write() - # analysis size in terms of results (pie) - output = dbstore.execute("select analysis.description,COUNT(result.result_id) as numOfResults FROM result INNER JOIN analysis ON result.analysis_id=analysis.analysis_id GROUP BY result.analysis_id;") - stats["analysisResults"] = output.get_all() - if None in stats["analysisResults"]: - stats["analysisResults"]["Unknown"] = stats["analysisResults"][None] + stats["analysisResults"].get("Unknown",0) - del stats["analysisResults"][None] - resultPie = ROOT.TPie("AnalysisResultsPie","Analysis results",len(stats["analysisResults"])) - for index,entry in enumerate(stats["analysisResults"]): - resultPie.SetEntryVal(index,entry[1]) - resultPie.SetEntryLabel(index,"None" if entry[0] is None else entry[0]) - resultPie.SetTextAngle(0); - resultPie.SetRadius(0.3); - resultPie.SetTextColor(1); - resultPie.SetTextFont(62); - resultPie.SetTextSize(0.03); - canvas = ROOT.TCanvas("analysisResults","",2) - resultPie.Draw("r") - if not opts.dryRun: - ROOT.gPad.Write() - # stats to collect: group distribution (from CADI line) (pie) - analyses = dbstore.find(Analysis) - regex = r".*([A-Z]{3})-\d{2}-\d{3}" - stats["physicsGroup"] = defaultdict(int) - for analysis in analyses: - m = re.search(regex,str(analysis.cadiline)) - physicsGroup = "NONE" - if m: - physicsGroup = m.group(1) - stats["physicsGroup"][physicsGroup] += 1 - stats["physicsGroup"] = dict(stats["physicsGroup"]) - if None in stats["physicsGroup"]: - stats["physicsGroup"]["Unknown"] = stats["physicsGroup"][None] + stats["physicsGroup"].get("Unknown",0) - del stats["physicsGroup"][None] - - # the end of the loop, we have all what we need to fill a pie chart. 
- physicsGroupPie = ROOT.TPie("physicsGroupPie","Physics groups",len(stats["physicsGroup"])) - for index,(group,count) in enumerate(stats["physicsGroup"].iteritems()): - physicsGroupPie.SetEntryVal(index,count) - physicsGroupPie.SetEntryLabel(index,group) - physicsGroupPie.SetTextAngle(0); - physicsGroupPie.SetRadius(0.3); - physicsGroupPie.SetTextColor(1); - physicsGroupPie.SetTextFont(62); - physicsGroupPie.SetTextSize(0.03); - canvas = ROOT.TCanvas("physicsGroup","",2) - physicsGroupPie.Draw("r") - if not opts.dryRun: - ROOT.gPad.Write() - # some printout - print "\nAnalysis Statistics extracted." - print '================================' - # ROOT output - if not opts.dryRun: - rootfile.Write(); - rootfile.Close(); - # JSON output - stats["physicsGroup"] = [ [a,b] for (a,b) in stats["physicsGroup"].items()] - return stats - -def analyzeResultsStatistics(dbstore,opts): - stats = {} - # ROOT output - if not opts.dryRun: - rootfile = ROOT.TFile(opts.path+"/analysisReport.root","update") - #authors statistics - output = dbstore.execute("select result.author,COUNT(result.result_id) as numOfResults FROM result GROUP BY author") - stats["resultsAuthors"] = output.get_all() - if None in stats["resultsAuthors"]: - stats["resultsAuthors"]["Unknown"] = stats["resultsAuthors"][None] + stats["resultsAuthors"].get("Unknown",0) - del stats["resultsAuthors"][None] - authorPie = ROOT.TPie("resultsAuthorsPie","Results authors",len(stats["resultsAuthors"])) - for index,entry in enumerate(stats["resultsAuthors"]): - authorPie.SetEntryVal(index,entry[1]) - authorPie.SetEntryLabel(index,"None" if entry[0] is None else entry[0]) - authorPie.SetTextAngle(0); - authorPie.SetRadius(0.3); - authorPie.SetTextColor(1); - authorPie.SetTextFont(62); - authorPie.SetTextSize(0.03); - canvas = ROOT.TCanvas("resultsAuthor","",2) - authorPie.Draw("r") - if not opts.dryRun: - ROOT.gPad.Write() - result_nsamples = ROOT.TH1I("result_nsamples","result_nsamples",20,0,20) - # get all samples to loop - results = dbstore.find(Result) - results.order_by(Result.creation_time) - # time evolution of # results (still in db) - results_time = [[0,0]] - # let's go... loop - for result in results: - # for Highcharts the time format is #seconds since epoch - time = int(result.creation_time.strftime("%s"))*1000 - results_time.append([time,results_time[-1][1]+1]) - result_nsamples.Fill(result.samples.count()) - # drop this: just to initialize the loop - results_time.pop(0) - # output - stats["resultsTimeprof"] = results_time - resultsTimeprof_graph = ROOT.TGraph(len(results_time)) - for i,s in enumerate(results_time): - resultsTimeprof_graph.SetPoint(i,s[0]/1000,s[1]) - if not opts.dryRun: - resultsTimeprof_graph.Write("resultsTimeprof_graph") - data = [] - for bin in range(1,result_nsamples.GetNbinsX()+1): - data.append([result_nsamples.GetBinCenter(bin),result_nsamples.GetBinContent(bin)]) - stats["resultNsamples"] = data - # some printout - print "\nResults Statistics extracted." 
- print '================================' - # ROOT output - if not opts.dryRun: - rootfile.Write(); - rootfile.Close(); - # JSON output - return stats - -def analyzeSampleStatistics(dbstore,opts): - stats = {} - # ROOT output - if not opts.dryRun: - rootfile = ROOT.TFile(opts.path+"/analysisReport.root","update") - #authors statistics - output = dbstore.execute("select sample.author,COUNT(sample.sample_id) as numOfSamples FROM sample GROUP BY author") - stats["sampleAuthors"] = output.get_all() - if None in stats["sampleAuthors"]: - stats["sampleAuthors"]["Unknown"] = stats["sampleAuthors"][None] + stats["sampleAuthors"].get("Unknown",0) - del stats["sampleAuthors"][None] - authorPie = ROOT.TPie("sampleAuthorsPie","Samples authors",len(stats["sampleAuthors"])) - for index,entry in enumerate(stats["sampleAuthors"]): - authorPie.SetEntryVal(index,entry[1]) - authorPie.SetEntryLabel(index,"None" if entry[0] is None else entry[0]) - authorPie.SetTextAngle(0); - authorPie.SetRadius(0.3); - authorPie.SetTextColor(1); - authorPie.SetTextFont(62); - authorPie.SetTextSize(0.03); - canvas = ROOT.TCanvas("sampleAuthor","",2) - authorPie.Draw("r") - if not opts.dryRun: - ROOT.gPad.Write() - #sample types statistics - output = dbstore.execute("select sample.sampletype,COUNT(sample.sample_id) as numOfSamples FROM sample GROUP BY sampletype") - stats["sampleTypes"] = output.get_all() - if None in stats["sampleTypes"]: - stats["sampleTypes"]["Unknown"] = stats["sampleTypes"][None] + stats["sampleTypes"].get("Unknown",0) - del stats["sampleTypes"][None] - typePie = ROOT.TPie("sampleTypesPie","Samples types",len(stats["sampleTypes"])) - for index,entry in enumerate(stats["sampleTypes"]): - typePie.SetEntryVal(index,entry[1]) - typePie.SetEntryLabel(index,"None" if entry[0] is None else entry[0]) - typePie.SetTextAngle(0); - typePie.SetRadius(0.3); - typePie.SetTextColor(1); - typePie.SetTextFont(62); - typePie.SetTextSize(0.03); - canvas = ROOT.TCanvas("sampleType","",2) - typePie.Draw("r") - if not opts.dryRun: - ROOT.gPad.Write() - # get all samples to loop - result = dbstore.find(Sample) - result.order_by(Sample.creation_time) - # events statistics - sample_nevents_processed = ROOT.TH1I("sample_nevents_processed","sample_nevents_processed",100,0,-100) - sample_nevents = ROOT.TH1I("sample_nevents","sample_nevents",100,0,-100) - # time evolution of statistics & # samples (still in db) - sample_nevents_processed_time = [[0,0]] - sample_nevents_time = [[0,0]] - samples_time = [[0,0]] - # let's go... 
loop - for sample in result: - # for Highcharts the time format is #seconds since epoch - time = int(sample.creation_time.strftime("%s"))*1000 - ne = 0 if sample.nevents is None else sample.nevents - np = 0 if sample.nevents_processed is None else sample.nevents_processed - sample_nevents_processed.Fill(np) - sample_nevents.Fill(ne) - sample_nevents_processed_time.append([time,sample_nevents_processed_time[-1][1]+np]) - sample_nevents_time.append([time,sample_nevents_time[-1][1]+ne]) - samples_time.append([time,samples_time[-1][1]+1]) - # drop this: just to initialize the loop - sample_nevents_processed_time.pop(0) - sample_nevents_time.pop(0) - samples_time.pop(0) - # output - stats["sampleNeventsTimeprof"] = sample_nevents_time - stats["sampleNeventsProcessedTimeprof"] = sample_nevents_processed_time - stats["samplesTimeprof"] = samples_time - sampleNeventsTimeprof_graph = ROOT.TGraph(len(sample_nevents_time)) - sampleNeventsProcessedTimeprof_graph = ROOT.TGraph(len(sample_nevents_processed_time)) - samplesTimeprof_graph = ROOT.TGraph(len(samples_time)) - for i,s in enumerate(sample_nevents_time): - sampleNeventsTimeprof_graph.SetPoint(i,s[0]/1000,s[1]) - for i,s in enumerate(sample_nevents_processed_time): - sampleNeventsProcessedTimeprof_graph.SetPoint(i,s[0]/1000,s[1]) - for i,s in enumerate(samples_time): - samplesTimeprof_graph.SetPoint(i,s[0]/1000,s[1]) - if not opts.dryRun: - sampleNeventsTimeprof_graph.Write("sampleNeventsTimeprof_graph") - sampleNeventsProcessedTimeprof_graph.Write("sampleNeventsProcessedTimeprof_graph") - samplesTimeprof_graph.Write("samplesTimeprof_graph") - # unfortunately, TBufferJSON is not available in CMSSW (no libRHttp) -> no easy way to export to JSON - # the JSON format for highcharts data is [ [x1,y1], [x2,y2], ... ] - data = [] - for bin in range(1,sample_nevents.GetNbinsX()+1): - data.append([sample_nevents.GetBinCenter(bin),sample_nevents.GetBinContent(bin)]) - stats["sampleNevents"] = data - data = [] - for bin in range(1,sample_nevents_processed.GetNbinsX()+1): - data.append([sample_nevents_processed.GetBinCenter(bin),sample_nevents_processed.GetBinContent(bin)]) - stats["sampleNeventsProcessed"] = data - # some printout - print "\nSamples Statistics extracted." - print '================================' - # ROOT output - if not opts.dryRun: - rootfile.Write(); - rootfile.Close(); - # JSON output - return stats - -# function to serialize the storm objects, -# from Jamu Kakar and Mario Zito at https://lists.ubuntu.com/archives/storm/2010-May/001286.html -def encode_storm_object(object): - ''' Serializes to JSON a Storm object - - Use: - from storm.info import get_cls_info - import json - ... - storm_object = get_storm_object() - print json.dumps(storm_object, default=encode_storm_object) - - Warnings: - Serializes objects containing Int, Date and Unicode data types - other datatypes are not tested. 
MUST be improved - ''' - if not hasattr(object, "__storm_table__"): - raise TypeError(repr(object) + " is not JSON serializable") - result = {} - cls_info = get_cls_info(object.__class__) - for name in cls_info.attributes.iterkeys(): - value= getattr(object, name) - if (isinstance(value, date)): - value= str(value) - result[name] = value - return result - -def force_symlink(file1, file2): - try: - os.symlink(file1, file2) - except OSError, e: - if e.errno == errno.EEXIST: - os.remove(file2) - os.symlink(file1, file2) - -def copyInconsistencies(basedir): - # try to read inconsistencies from previous job - # the file must be there and must contain the relevant data - try: - with open(basedir+'/data/DatasetsAnalysisReport.json') as jfile: - content = json.load(jfile) - return content["DatabaseInconsistencies"] - except IOError: - # no file. Return an empty string. - # This will happen if basedir is not (properly) set or if it is new. - print("No previous dataset analysis report found in path. The Database inconsistencies will be empty.") - return [] - except KeyError: - # no proper key. Return an empty string. - # This should not happen, so print a warning. - print("No DatabaseInconsistencies key in the previous json file ?!") - return [] - -# -# main -# -if __name__ == '__main__': - main() - diff --git a/scripts/add_result.py b/scripts/add_result.py deleted file mode 100755 index 9e87297..0000000 --- a/scripts/add_result.py +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python - -# Script to add a sample to the database - -import os -from pwd import getpwuid -from datetime import datetime -from optparse import OptionParser -from cp3_llbb.SAMADhi.SAMADhi import Analysis, Sample, Result, DbStore -from cp3_llbb.SAMADhi.userPrompt import confirm, prompt_samples, parse_samples - -class MyOptionParser: - """ - Client option parser - """ - def __init__(self): - usage = "Usage: %prog path [options]\n" - self.parser = OptionParser(usage=usage) - self.parser.add_option("-s", "--sample", action="store", type="string", - default=None, dest="inputSamples", - help="comma separated list of samples used as input to produce that result") - self.parser.add_option("-d", "--description", action="store", type="string", - default=None, dest="desc", - help="description of the result") - self.parser.add_option("-e", "--elog", action="store", type="string", - default=None, dest="elog", - help="elog with more details") - self.parser.add_option("-A", "--analysis", action="store", type="int", - default=None, dest="ana", - help="analysis whose result belong to") - self.parser.add_option("-a", "--author", action="store", type="string", - default=None, dest="author", - help="author of the result. If not specified, is taken from the path.") - self.parser.add_option("-t", "--time", action="store", type="string", - default=None, dest="time", - help="result timestamp. If set to \"path\", timestamp will be taken from the path. 
Otherwise, it must be formated like YYYY-MM-DD HH:MM:SS") - - def get_opt(self): - """ - Returns parse list of options - """ - opts, args = self.parser.parse_args() - # check that the path exists - if len(args) < 1: - self.parser.error("path is mandatory") - opts.path = os.path.abspath(os.path.expandvars(os.path.expanduser(args[0]))) - if not os.path.exists(opts.path) or not ( os.path.isdir(opts.path) or os.path.isfile(opts.path)) : - self.parser.error("%s is not an existing file or directory"%opts.path) - # set author - if opts.author is None: - opts.author = getpwuid(os.stat(opts.path).st_uid).pw_name - # set timestamp - if not opts.time is None: - if opts.time=="path": - opts.datetime = datetime.fromtimestamp(os.path.getctime(opts.path)) - else: - opts.datetime = datetime.strptime(opts.time,'%Y-%m-%d %H:%M:%S') - else: - opts.datetime = datetime.now() - return opts - -def main(): - """Main function""" - # get the options - optmgr = MyOptionParser() - opts = optmgr.get_opt() - # build the result from user input - result = Result(unicode(opts.path)) - result.description = unicode(opts.desc) - result.author = unicode(opts.author) - result.creation_time = opts.datetime - result.elog = unicode(opts.elog) - result.analysis_id = opts.ana - # connect to the MySQL database using default credentials - dbstore = DbStore() - # unless the source is set, prompt the user and present a list to make a choice - if opts.inputSamples is None: - inputSamples = prompt_samples(dbstore) - else: - inputSamples = parse_samples(opts.inputSamples) - # create and store the relations - samples = dbstore.find(Sample,Sample.sample_id.is_in(inputSamples)) - if samples.is_empty(): - dbstore.add(result) - else: - for sample in samples: - sample.results.add(result) - # flush (populates the analysis if needed) - dbstore.flush() - # print the resulting object and ask for confirmation - print result - if confirm(prompt="Insert into the database?", resp=True): - dbstore.commit() - -# -# main -# -if __name__ == '__main__': - main() diff --git a/scripts/add_sample.py b/scripts/add_sample.py deleted file mode 100755 index 4970793..0000000 --- a/scripts/add_sample.py +++ /dev/null @@ -1,199 +0,0 @@ -#!/usr/bin/env python - -# Script to add a sample to the database - -import os -import glob -from pwd import getpwuid -from optparse import OptionParser -from datetime import datetime -from cp3_llbb.SAMADhi.SAMADhi import Dataset, Sample, File, DbStore -from cp3_llbb.SAMADhi.userPrompt import confirm, prompt_dataset, prompt_sample - -def get_file_data_(f_): - import ROOT - - f = ROOT.TFile.Open(f_) - if not f: - return (None, None) - - weight_sum = f.Get("event_weight_sum") - if weight_sum: - weight_sum = weight_sum.GetVal() - else: - weight_sum = None - - entries = None - tree = f.Get("t") - if tree: - entries = tree.GetEntriesFast() - - return (weight_sum, entries) - - -class MyOptionParser: - """ - Client option parser - """ - def __init__(self): - usage = "Usage: %prog type path [options]\n" - usage += "where type is one of PAT, SKIM, RDS, NTUPLES, HISTOS, ...\n" - usage += " and path is the location of the sample on disk" - self.parser = OptionParser(usage=usage) - self.parser.add_option("--name", action="store", type="string", - default=None, dest="name", - help="specify sample name") - self.parser.add_option("--processed", action="store", type="int", - default=None, dest="nevents_processed", - help="number of processed events (from the input)") - self.parser.add_option("--nevents", action="store", type="int", - default=None, 
dest="nevents", - help="number of events (in the sample)") - self.parser.add_option("--norm", action="store", type="float", - default=1.0, dest="normalization", - help="additional normalization factor") - self.parser.add_option("--weight-sum", action="store", type="float", - default=1.0, dest="weight_sum", - help="additional normalization factor") - self.parser.add_option("--lumi", action="store", type="float", - default=None, dest="luminosity", - help="sample (effective) luminosity") - self.parser.add_option("--code_version", action="store", type="string", - default="", dest="code_version", - help="version of the code used to process that sample (e.g. git tag or commit)") - self.parser.add_option("--comment", action="store", type="string", - default="", dest="user_comment", - help="comment about the dataset") - self.parser.add_option("--source_dataset", action="store", type="int", - default=None, dest="source_dataset_id", - help="reference to the source dataset") - self.parser.add_option("--source_sample", action="store", type="int", - default=None, dest="source_sample_id", - help="reference to the source sample, if any") - self.parser.add_option("-a", "--author", action="store", type="string", - default=None, dest="author", - help="author of the result. If not specified, is taken from the path.") - self.parser.add_option("--files", action="store", type="string", - default="", dest="files", - help="list of files (full path, comma-separated values)") - self.parser.add_option("-t", "--time", action="store", type="string", - default=None, dest="time", - help="result timestamp. If set to \"path\", timestamp will be taken from the path. Otherwise, it must be formated like YYYY-MM-DD HH:MM:SS. Default is current time.") - - def get_opt(self): - """ - Returns parse list of options - """ - opts, args = self.parser.parse_args() - # mandatory arguments - if len(args) < 2: - self.parser.error("type and path are mandatory") - opts.sampletype = args[0] - opts.path = os.path.abspath(os.path.expandvars(os.path.expanduser(args[1]))) - # check path - if not os.path.exists(opts.path) or not ( os.path.isdir(opts.path) or os.path.isfile(opts.path)) : - self.parser.error("%s is not an existing directory"%opts.path) - # set author - if opts.author is None: - opts.author = getpwuid(os.stat(opts.path).st_uid).pw_name - # set timestamp - if not opts.time is None: - if opts.time=="path": - opts.datetime = datetime.fromtimestamp(os.path.getctime(opts.path)) - else: - opts.datetime = datetime.strptime(opts.time,'%Y-%m-%d %H:%M:%S') - else: - opts.datetime = datetime.now() - # set name - if opts.name is None: - if opts.path[-1]=='/': - opts.name = opts.path.split('/')[-2] - else: - opts.name = opts.path.split('/')[-1] - return opts - -def main(): - """Main function""" - # get the options - optmgr = MyOptionParser() - opts = optmgr.get_opt() - # build the sample from user input - sample = Sample(unicode(opts.name), unicode(opts.path), unicode(opts.sampletype), opts.nevents_processed) - sample.nevents = opts.nevents - sample.normalization = opts.normalization - sample.event_weight_sum = opts.weight_sum - sample.luminosity = opts.luminosity - sample.code_version = unicode(opts.code_version) - sample.user_comment = unicode(opts.user_comment) - sample.source_dataset_id = opts.source_dataset_id - sample.source_sample_id = opts.source_sample_id - sample.author = unicode(opts.author) - sample.creation_time = opts.datetime - # connect to the MySQL database using default credentials - dbstore = DbStore() - # unless the source 
is set, prompt the user and present a list to make a choice - if sample.source_dataset_id is None: - prompt_dataset(sample,dbstore) - if sample.source_sample_id is None: - prompt_sample(sample,dbstore) - # check that source sample and dataset exist - if sample.source_dataset_id is not None: - checkExisting = dbstore.find(Dataset,Dataset.dataset_id==sample.source_dataset_id) - if checkExisting.is_empty(): - raise IndexError("No dataset with such index: %d"%sample.source_dataset_id) - if sample.source_sample_id is not None: - checkExisting = dbstore.find(Sample,Sample.sample_id==sample.source_sample_id) - if checkExisting.is_empty(): - raise IndexError("No sample with such index: %d"%sample.source_sample_id) - # if opts.nevents is not set, take #events from source sample (if set) or from source dataset (if set) in that order - if sample.nevents_processed is None and sample.source_sample_id is not None: - sample.nevents_processed = dbstore.find(Sample,Sample.sample_id==sample.source_sample_id).one().nevents_processed - if sample.nevents_processed is None and sample.source_dataset_id is not None: - sample.nevents_processed = dbstore.find(Dataset,Dataset.dataset_id==sample.source_dataset_id).one().nevents - if sample.nevents_processed is None: - print "Warning: Number of processed events not given, and no way to guess it." - - # List input files - files = [] - if opts.files == "": - files = glob.glob(os.path.join(sample.path, '*.root')) - else: - files = unicode(opts.files).split(",") - if len(files) == 0: - print "Warning: no root files found in %r" % sample.path - - # Try to guess the number of events stored into the file, as well as the weight sum - for f in files: - (weight_sum, entries) = get_file_data_(f) - sample.files.add(File(f, f, weight_sum, entries)) - - # check that there is no existing entry - checkExisting = dbstore.find(Sample,Sample.name==sample.name) - if checkExisting.is_empty(): - print sample - if confirm(prompt="Insert into the database?", resp=True): - dbstore.add(sample) - # compute the luminosity, if possible - if sample.luminosity is None: - dbstore.flush() - sample.luminosity = sample.getLuminosity() - else: - existing = checkExisting.one() - prompt = "Replace existing " - prompt += str(existing) - prompt += "\nby new " - prompt += str(sample) - prompt += "\n?" 
- if confirm(prompt, resp=False): - existing.replaceBy(sample) - if existing.luminosity is None: - dbstore.flush() - existing.luminosity = existing.getLuminosity() - # commit - dbstore.commit() - -# -# main -# -if __name__ == '__main__': - main() diff --git a/scripts/checkAndClean.py b/scripts/checkAndClean.py deleted file mode 100755 index d9907a6..0000000 --- a/scripts/checkAndClean.py +++ /dev/null @@ -1,178 +0,0 @@ -#!/usr/bin/env python -import json -import os -import sys -from cp3_llbb.SAMADhi.SAMADhi import Analysis, Dataset, Sample, Result, DbStore -from cp3_llbb.SAMADhi.SAMADhi import File as SFile -from optparse import OptionParser, OptionGroup - -class MyOptionParser: - """ - Client option parser - """ - def __init__(self): - usage = "Usage: %prog [options]\n" - self.parser = OptionParser(usage=usage) - self.parser.add_option("-p","--path", action="store", type="string", - dest="path", default="./", - help="Path to the json files with db analysis results.") - self.parser.add_option("-o","--output", action="store", type="string", - dest="output", default="-", - help="Name of the output file.") - self.parser.add_option("-M","--cleanupMissing", action="store_true", - dest="cleanupMissing", default=False, - help="Clean samples with missing path from the database.") - self.parser.add_option("-U","--cleanupUnreachable", action="store_true", - dest="cleanupUnreachable", default=False, - help="Clean samples with unreachable path from the database.") - self.parser.add_option("-D","--cleanupDatasets", action="store_true", - dest="cleanupDatasets", default=False, - help="Clean orphan datasets from the database.") - self.parser.add_option("-w","--whitelist", action="store", type="string", - dest="whitelist", default=None, - help="JSON file with sample whitelists per analysis.") - self.parser.add_option("-d","--dry-run", action="store_true", - dest="dryrun", default=False, - help="Dry run: do not write to file and/or touch the database.") - - def get_opt(self): - """ - Returns parse list of options - """ - opts, args = self.parser.parse_args() - if opts.path is not None: - opts.path = os.path.abspath(os.path.expandvars(os.path.expanduser(opts.path))) - if opts.output == "-": - opts.output = sys.__stdout__ - else: - filepath = os.path.dirname(os.path.realpath(os.path.expanduser(opts.output))) - if not os.access(filepath,os.W_OK): - self.parser.error("Cannot write to %s"%filepath) - if os.path.isfile(opts.output): - self.parser.error("File already exists: %s"%opts.output) - if not opts.dryrun: - try: - opts.output = open(opts.output,"w") - except: - self.parser.error("Cannot write to %s"%opts.output) - else: - opts.output = sys.__stdout__ - try: - opts.whitelist = open(opts.whitelist) - except: - self.parser.error("Cannot open whitelist.") - return opts - -class StoreCleaner(): - """ - handle to the db store, with basic facilities to cleanup entries - """ - - def __init__(self): - self.dbstore = DbStore() - - def deleteSample(self,sample_id): - store = self.dbstore - # first remove the files associated with the sample - files = store.find(SFile,SFile.sample_id==sample_id) - for sampleFile in files: - store.remove(sampleFile) - # then remove the sample - sample = store.find(Sample,Sample.sample_id==sample_id).one() - print("deleting sample %d"%sample_id) - store.remove(sample) - - def deleteDataset(self,dataset_id): - store = self.dbstore - # simply delete the dataset - dataset = store.find(Dataset,Dataset.dataset_id==dataset_id).one() - print("deleting dataset %d"%dataset_id) - 
store.remove(dataset) - - def commit(self): - self.dbstore.commit() - - def rollback(self): - self.dbstore.rollback() - - -# Script to check samples for deletion - -def main(): - """Main function""" - # get the options - optmgr = MyOptionParser() - opts = optmgr.get_opt() - - # set stdout - sys.stdout = opts.output - - # whitelist with samples that we should not touch ever - if opts.whitelist is not None: - whitelist = json.load(opts.whitelist) - else: - whitelist = {} - - # utility class to clean the db - myCleaner = StoreCleaner() - - # open the sample analysis report and classify bad samples - samplesAnalysisReport = os.path.join(opts.path, "SamplesAnalysisReport.json") - with open(samplesAnalysisReport) as jfile: - data = json.load(jfile) - samples = data["MissingDirSamples"] - investigate = [] - delete = [] - empty = [] - empty_delete = [] - for sample in samples: - whitelisted = False - for v in whitelist.values(): - for label in v: - if label in sample["name"]: - whitelisted = True - if whitelisted: - if sample["path"]=="": - empty.append(sample) - else: - investigate.append(sample) - else: - if sample["path"]=="": - empty_delete.append(sample) - else: - delete.append(sample) - print("\n\nWhitelisted sample with missing path. Investigate:") - for sample in empty: - print(sample["name"]) - print("\n\nWhitelisted sample with unreachable path. Investigate:") - for sample in investigate: - print(sample["name"]) - print("\n\nSamples to be deleted because of missing path:") - for sample in empty_delete: - print(sample["name"]) - if opts.cleanupMissing : myCleaner.deleteSample(sample["sample_id"]) - print("\n\nSamples to be deleted because of unreachable path:") - for sample in delete: - print(sample["name"]) - if opts.cleanupUnreachable : myCleaner.deleteSample(sample["sample_id"]) - - # now clean orphan datasets - datasetsAnalysisReport = os.path.join(opts.path, "DatasetsAnalysisReport.json") - with open(datasetsAnalysisReport) as jfile: - data = json.load(jfile) - datasets = data["Orphans"] - for dataset in datasets: - if opts.cleanupDatasets : myCleaner.deleteDataset(dataset["dataset_id"]) - - # and commit - if not opts.dryrun: - myCleaner.commit() - else: - myCleaner.rollback() - -# -# main -# -if __name__ == '__main__': - main() - diff --git a/scripts/compute_sample_luminosity.py b/scripts/compute_sample_luminosity.py deleted file mode 100755 index 88f4907..0000000 --- a/scripts/compute_sample_luminosity.py +++ /dev/null @@ -1,156 +0,0 @@ -#!/usr/bin/env python -""" Simple script to compute the luminosity of a set of samples """ - -import subprocess -import argparse -from cp3_llbb.SAMADhi.SAMADhi import Sample, DbStore - -def get_options(): - parser = argparse.ArgumentParser(description='Compute luminosity of a set of samples.') - - parser.add_argument('-i', '--id', type=int, nargs='+', dest='ids', help='IDs of the samples', metavar='ID') - parser.add_argument('--name', type=str, nargs='+', dest='names', help='Names of the samples', metavar='NAME') - - parser.add_argument('--local', dest='local', action='store_true', help='Run brilcalc locally instead of on lxplus') - - parser.add_argument('--bootstrap', dest='bootstrap', action='store_true', help='Install brilcalc. 
Needs to be done only once') - - parser.add_argument('--update', dest='update', action='store_true', help='Update brilcalc') - - parser.add_argument('-n', '--username', dest='username', help='Remote lxplus username (local username by default)') - - parser.add_argument('-t', '--normtag', dest='normtag', help='Normtag on /afs') - - options = parser.parse_args() - - if not options.bootstrap and not options.update and options.ids is None and options.names is None: - parser.error('You must specify at least one sample id or sample name.') - - if not options.bootstrap and not options.update and not options.normtag: - parser.error('You must specify a normtag file') - - if options.ids is None: - options.ids = [] - - if options.names is None: - options.names = [] - - if options.username is None: - import pwd, os - options.username = pwd.getpwuid(os.getuid()).pw_name - - return options - -def get_sample(id, name): - - dbstore = DbStore() - - if id is not None: - result = dbstore.find(Sample, Sample.sample_id == id) - elif name is not None: - result = dbstore.find(Sample, Sample.name.like(unicode(name.replace('*', '%').replace('?', '_')))) - - return result.one() - -def parse_luminosity_csv(result): - """ Parse the CSV file produced by brilcalc, and return the total recorded luminosity in /pb """ - import csv - import StringIO - - f = StringIO.StringIO(result) - - lumi = 0 - reader = csv.reader(f, delimiter=',') - for row in reader: - if not row: - continue - - if row[0][0] == '#': - continue - lumi += float(row[-1]) - - return lumi / 1000. / 1000. - -def compute_luminosity(sample, options): - print("Computing luminosity for %r") % str(sample.name) - - lumi = 0 - if not options.local: - print("Running brilcalc on lxplus... You'll probably need to enter your lxplus password in a moment") - print('') - - cmds = ['brilcalc', 'lumi', '--normtag', options.normtag, '--output-style', 'csv', '-i', '"%s"' % str(sample.processed_lumi.replace('"', ''))] - cmd = 'export PATH="$HOME/.local/bin:/afs/cern.ch/cms/lumi/brilconda-1.1.7/bin:$PATH"; ' + ' '.join(cmds) - ssh_cmds = ['ssh', '%s@lxplus.cern.ch' % options.username, cmd] - brilcalc_result = subprocess.check_output(ssh_cmds) - - lumi = parse_luminosity_csv(brilcalc_result) - else: - print("Running brilcalc locally...") - # FIXME one day - print("Error: running brilcalc locally is not supported for the moment.") - return 0 - - print("Sample luminosity: %.3f /pb" % lumi) - print('') - - store = DbStore() - # Update luminosity in the database - store.find(Sample, Sample.sample_id == sample.sample_id).set(luminosity = lumi) - - store.commit() - - return lumi - -def install_brilcalc(options): - - if options.local: - print("Local installation of brilcalc is not supported.") - return - - print("Installing brilcalc on lxplus... You'll probably need to enter your lxplus password in a moment") - - cmds = ['pip', 'install', '--install-option="--prefix=$HOME/.local"', '--upgrade', 'brilws'] - cmd = 'export PATH="$HOME/.local/bin:/afs/cern.ch/cms/lumi/brilconda-1.1.7/bin:$PATH"; %s' % (" ".join(cmds)) - ssh_cmds = ['ssh', '%s@lxplus.cern.ch' % options.username, cmd] - subprocess.call(ssh_cmds) - -def update_brilcalc(options): - - if options.local: - print("Local installation of brilcalc is not supported.") - return - - print("Updating brilcalc on lxplus... 
You'll probably need to enter your lxplus password in a moment") - - cmds = ['pip', 'install', '--install-option="--prefix=$HOME/.local"', '--upgrade', '--force-reinstall', 'brilws'] - cmd = 'export PATH="$HOME/.local/bin:/afs/cern.ch/cms/lumi/brilconda-1.1.7/bin:$PATH"; %s' % (" ".join(cmds)) - ssh_cmds = ['ssh', '%s@lxplus.cern.ch' % options.username, cmd] - subprocess.call(ssh_cmds) - -def main(): - - options = get_options() - - if options.bootstrap: - install_brilcalc(options) - return - - if options.update: - update_brilcalc(options) - return - - for id_ in options.ids: - sample = get_sample(id_, None) - compute_luminosity(sample, options) - - for name in options.names: - sample = get_sample(None, name) - compute_luminosity(sample, options) - - -# -# main -# -if __name__ == '__main__': - main() diff --git a/scripts/das_import.py b/scripts/das_import.py deleted file mode 100755 index 9ed4a9d..0000000 --- a/scripts/das_import.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python - -import argparse - -from cp3_llbb.SAMADhi.das_import import import_cms_dataset - -def get_options(): - parser = argparse.ArgumentParser(description='Import CMS datasets into SAMADhi') - - parser.add_argument("-p", "--process", action="store", type=str, dest="process", help="Process name.") - - parser.add_argument("--xsection", action="store", type=float, default=1.0, dest="xsection", help="Cross-section in pb.") - - parser.add_argument("--energy", action="store", type=float, dest="energy", help="CoM energy, in TeV.") - - parser.add_argument("--comment", action="store", type=str, default="", dest="comment", help="User defined comment") - - parser.add_argument("dataset", action="store", type=str, nargs=1, help="CMS dataset") - - args = parser.parse_args() - - return args - -if __name__ == '__main__': - options = get_options() - import_cms_dataset(options.dataset[0], options.process, options.energy, options.xsection, options.comment, True) diff --git a/scripts/search_SAMADhi.py b/scripts/search_SAMADhi.py deleted file mode 100755 index b7189b9..0000000 --- a/scripts/search_SAMADhi.py +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env python - -# Script to add a sample to the database - -import os -from optparse import OptionParser -from cp3_llbb.SAMADhi.SAMADhi import Dataset, Sample, Result, DbStore, Analysis - -class MyOptionParser: - """ - Client option parser - """ - def __init__(self): - usage = "Usage: %prog type [options]\n" - usage += "Where type is one of dataset, sample, result, analysis" - self.parser = OptionParser(usage=usage) - self.parser.add_option("-l","--long", action="store_true", - dest="longOutput", default=False, - help="detailed output") - self.parser.add_option("-n","--name", action="store", type="string", - dest="name", default=None, - help="filter on name") - self.parser.add_option("-p","--path", action="store", type="string", - dest="path", default=None, - help="filter on path") - self.parser.add_option("-i","--id", action="store", type="int", - dest="objid", default=None, - help="filter on id") - - def get_opt(self): - """ - Returns parse list of options - """ - opts, args = self.parser.parse_args() - if len(args) == 0: - self.parser.error("must specify the type of item to search for") - if args[0] not in ["dataset","sample","result","analysis"]: - self.parser.error("type must be one of dataset, sample, result, analysis") - cnt = 0 - if opts.path is not None: - cnt +=1 - opts.path = os.path.abspath(os.path.expandvars(os.path.expanduser(opts.path))) - if opts.name is not None: cnt +=1 - 
if opts.objid is not None: cnt +=1 - if cnt>1: - self.parser.error("only one selection criteria may be applied") - if args[0]=="dataset" and opts.path is not None: - self.parser.error("cannot search dataset by path") - if args[0]=="result" and opts.name is not None: - self.parser.error("cannot search a result by name") - if args[0]=="analysis" and opts.path is not None: - self.parser.error("cannot search analysis by path") - opts.objtype = args[0] - return opts - -def main(): - """Main function""" - # get the options - optmgr = MyOptionParser() - opts = optmgr.get_opt() - # connect to the MySQL database using default credentials - dbstore = DbStore() - # build the query - if opts.objtype == "dataset": - objectClass = Dataset - objectId = Dataset.dataset_id - elif opts.objtype == "sample": - objectClass = Sample - objectId = Sample.sample_id - elif opts.objtype == "analysis": - objectClass = Analysis - objectId = Analysis.analysis_id - else: - objectClass = Result - objectId = Result.result_id - - if opts.objid is not None: - result = dbstore.find(objectClass, objectId==opts.objid) - elif opts.path is not None: - result = dbstore.find(objectClass, objectClass.path.like(unicode(opts.path.replace('*', '%').replace('?', '_')))) - elif opts.name is not None: - result = dbstore.find(objectClass, objectClass.name.like(unicode(opts.name.replace('*', '%').replace('?', '_')))) - else: - result = dbstore.find(objectClass) - - result = result.order_by(objectId) - # loop and print - if opts.longOutput: - for entry in result: - print entry - print "--------------------------------------------------------------------------------------" - else: - if opts.objtype != "result" and opts.objtype != "analysis": - data = result.values(objectId, objectClass.name) - else: - data = result.values(objectId, objectClass.description) - for dset in data: - print "%i\t%s"%(dset[0], dset[1]) - -# -# main -# -if __name__ == '__main__': - main() diff --git a/scripts/update_datasets_cross_section.py b/scripts/update_datasets_cross_section.py deleted file mode 100755 index 66c314c..0000000 --- a/scripts/update_datasets_cross_section.py +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env python -""" Simple script to compute the luminosity of a set of samples """ - -import subprocess -import argparse -from cp3_llbb.SAMADhi.SAMADhi import Dataset, Sample, DbStore -from storm.locals import Desc - -def get_options(): - parser = argparse.ArgumentParser(description='Update cross-sections of datasets.') - - parser.add_argument('regex', type=str, help='Regular expression used to filter *samples*. Only \'*\' and \'?\' wildcards are supported. 
Take note that filtering is applied to samples, and not to datasets.', metavar='REGEX') - - parser.add_argument('-f', '--force', type=float, dest='force', help='For the cross-section of all datasets matching the regular expression to be this value', metavar='XSEC') - - parser.add_argument('-w', '--write', dest='write', action='store_true', help='Write changes to the database') - - options = parser.parse_args() - - return options - - -dbstore = DbStore() - -def get_samples(name): - return dbstore.find(Sample, Sample.name.like(unicode(name.replace('*', '%').replace('?', '_')))) - -def main(): - options = get_options() - samples = get_samples(options.regex) - - if samples.count() == 0: - print("No sample found.") - return - - for sample in samples: - if sample.source_dataset.datatype == "data": - continue - - # Consider a cross-section of one as a non-updated value - if sample.source_dataset.xsection == 1: - # Try to find a similar sample in the database, with the same center of mass energy - print("Updating cross-section of {}".format(sample.source_dataset.process)) - - if options.force: - print(" Forcing the cross-section to {}".format(options.force)) - if options.write: - sample.source_dataset.xsection = options.force - else: - possible_matches = dbstore.find(Dataset, Dataset.process.like(sample.source_dataset.process), - Dataset.energy == sample.source_dataset.energy, - Dataset.dataset_id != sample.source_dataset.dataset_id) - - xsec = None - if possible_matches.count() == 0: - print(" No match for this dataset found.") - else: - for p in possible_matches.order_by(Desc(Dataset.dataset_id)): - if not xsec: - xsec = p.xsection - else: - if xsec != p.xsection: - print(" Warning: more than one possible match found for this dataset, and they do not have the same cross-section. I do not know what to do...") - xsec = None - break - if xsec: - print(" Updating with cross-section = {}".format(xsec)) - if options.write: - sample.source_dataset.xsection = xsec - - - if options.write: - dbstore.commit() - else: - print("Currently running in dry-run mode. 
If you are happy with the change, pass the '-w' flag to this script to store the changes into the database.")
-        dbstore.rollback()
-#
-# main
-#
-if __name__ == '__main__':
-    main()
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..83e967f
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,65 @@
+[metadata]
+name = SAMADhi
+description = SAmple MAnagement Database
+long_description = file: README.md
+long_description_content_type = text/markdown
+url = https://github.com/cp3-llbb/SAMADhi
+author = Christophe Delaere
+author_email = christophe.delaere@uclouvain.be
+license = unknown
+classifiers =
+    Development Status :: 4 - Beta
+    Intended Audience :: Developers
+    Intended Audience :: Science/Research
+    License :: Other/Proprietary License
+    Operating System :: OS Independent
+    Programming Language :: Python :: 3
+    Programming Language :: Python :: 3.6
+    Programming Language :: Python :: 3.7
+    Programming Language :: Python :: 3.8
+    Programming Language :: Python :: 3.9
+    Programming Language :: Python :: 3.10
+    Programming Language :: Python :: Implementation :: CPython
+    Topic :: Software Development :: Libraries :: Python Modules
keywords = database
+
+[options]
+packages =
+    cp3_llbb.SAMADhi
+install_requires =
+    peewee
+    pymysql
+python_requires = !=2.*, >=3.6
+package_dir =
+    cp3_llbb.SAMADhi = python
+setup_requires =
+    pytest-runner
+
+[options.entry_points]
+console_scripts =
+    search_SAMADhi = cp3_llbb.SAMADhi.scripts:search
+    iSAMADhi = cp3_llbb.SAMADhi.scripts:interactive
+    update_datasets_cross_section = cp3_llbb.SAMADhi.scripts:update_datasets_cross_section
+    add_sample = cp3_llbb.SAMADhi.scripts:add_sample
+    add_result = cp3_llbb.SAMADhi.scripts:add_result
+    checkAndClean = cp3_llbb.SAMADhi.scripts:checkAndClean
+    das_import = cp3_llbb.SAMADhi.das_import:main
+    das_import_nanoAOD = cp3_llbb.SAMADhi.das_import:import_nanoAOD_sample
+    compute_sample_luminosity = cp3_llbb.SAMADhi.luminosity:compute_sample_luminosity
+    SAMADhi_dbAnalysis = cp3_llbb.SAMADhi.dbAnalysis:main
+
+[options.extras_require]
+tests =
+    pytest
+    pytest-console-scripts
+
+[bdist_wheel]
+universal = 0
+
+[aliases]
+test = pytest
+
+[flake8]
+max-line-length = 100
+select = E,F,W,B,B950
+ignore = E501,W503
diff --git a/setup_standalone.sh b/setup_standalone.sh
deleted file mode 100644
index 12cde67..0000000
--- a/setup_standalone.sh
+++ /dev/null
@@ -1,119 +0,0 @@
-# no shebang, must be sourced
-
-# Creates a (symlinked) python install directory for SAMADhi and sets up environment variables,
-# such that the `from cp3_llbb.SAMADhi.SAMADhi ...` imports can also be used standalone on ingrid.
-# The python interpreter to use and the install path can be set through options - -## deduce source location from the script name -if [[ -z "${ZSH_NAME}" ]]; then - thisscript="$(readlink -f ${BASH_SOURCE})" -else - thisscript="$(readlink -f ${0})" -fi -samadhipath="$(dirname ${thisscript})" - -## option defaults -installpath="${samadhipath}/install" -python="$(which python)" -custom_python="" -## parse options -tmp_opts="$(getopt --longoptions=install:,python:,help --options=h -- $@)" -eval set -- "${tmp_opts}" -while true; do - case "${1}" in - --install) - installpath="${2}" - shift 2 ;; - --python) - python="${2}" - custom_python="yes" - shift 2 ;; - -h|--help) - echo "Usage: source install_standalone.sh [ --python=path_to_python_interpreter --install=./install ]" - shift - return 0 ;; - --) - shift; break ;; - esac -done - -echo "--> Install path: ${installpath}" -## prepend if necessary -function checkAndPrepend() -{ - local in_path="" - if [[ -z "${ZSH_NAME}" ]]; then - ## bash version - IFS=: local exp_path=${!1} - for apath in ${exp_path}; do - if [[ "${apath}" == "${2}" ]]; then - in_path="yes" - fi - done - else - ## zsh version - local exp_path="${(P)1}" - for apath in ${(s.:.)exp_path}; do - if [[ "${apath}" == "${2}" ]]; then - in_path="yes" - fi - done - fi - if [[ -z "${in_path}" ]]; then - export ${1}="${2}:${exp_path}" - echo "--> Added ${2} to ${1}" - fi -} -## pick up python interpreter -if [[ -n "${custom_python}" ]]; then - echo "--> Using python from ${python}" - pyinterpbindir="$(dirname ${python})" - pyinterprootdir="$(dirname ${pyinterpbindir})" - pyinterplibdir="${pyinterprootdir}/lib" - pyinterpsitedir="${pyinterplibdir}/python2.7/site-packages" - checkAndPrepend "LD_LIBRARY_PATH" "${pyinterplibdir}" - checkAndPrepend "PYTHONPATH" "${pyinterpsitedir}" -fi -pymajmin=$(${python} -c 'import sys; print(".".join(str(num) for num in sys.version_info[:2]))') -if [[ "${pymajmin}" != "2.7" ]]; then - echo "--> Only python 2.7 is supported, please pass a suitable interpreter using the --python option (found version ${pymajmin} for ${python})" - return 1 -fi -## install upgraded pip -if [[ ! -d "${installpath}" ]]; then - mkdir -p "${installpath}" - echo "--> upgrading pip from $(${python} -m pip --version)" - ${python} -m pip install --prefix="${installpath}" -I pip -fi -checkAndPrepend "LD_LIBRARY_PATH" "${installpath}/lib" -checkAndPrepend "LD_LIBRARY_PATH" "${installpath}/lib64" -pysitedir="${installpath}/lib/python${pymajmin}/site-packages" -checkAndPrepend "PYTHONPATH" "${pysitedir}" -checkAndPrepend "PYTHONPATH" "${installpath}/lib64/python${pymajmin}/site-packages" -( ${python} -c "import MySQLdb" > /dev/null 2> /dev/null ) || ${python} -m pip install --prefix="${installpath}" MySQL-python -( ${python} -c "import storm" > /dev/null 2> /dev/null ) || ${python} -m pip install --prefix="${installpath}" storm -( ${python} -c "import ROOT" > /dev/null 2> /dev/null ) || ${python} -m pip install --prefix="${installpath}" storm - -## Install SAMADhi -if [[ ! -d "${pysitedir}/cp3_llbb" ]]; then - mkdir -p "${pysitedir}/cp3_llbb/" -fi -## __init__.py for cp3_llbb -hatinitpy="${pysitedir}/cp3_llbb/__init__.py" -if [[ ! -f "${hatinitpy}" ]]; then - echo "" > "${hatinitpy}" -fi -## symlink -installpy="${pysitedir}/cp3_llbb/SAMADhi" -if [[ ! -a "${installpy}" ]]; then - ln -s "${samadhipath}/python" "${installpy}" - echo "--> Created symlink to SAMADhi" -elif [[ ! 
( -L "${installpy}" ) ]]; then - echo "--> ${installpy} exists, but is not a symlink" - return 1 -fi -## __init__.py for cp3_llbb/SAMADhi -pkginitpy="${installpy}/__init__.py" -if [[ ! -f "${pkginitpy}" ]]; then - echo "" > "${pkginitpy}" -fi diff --git a/tests/data/params.json b/tests/data/params.json new file mode 100644 index 0000000..e03810a --- /dev/null +++ b/tests/data/params.json @@ -0,0 +1,4 @@ +{ + "test" : 1, + "database" : "test.db" +} diff --git a/tests/data/test.db b/tests/data/test.db new file mode 100644 index 0000000..9e8571f Binary files /dev/null and b/tests/data/test.db differ diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..7ba0269 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,205 @@ +import distutils.spawn +import logging +import os +import os.path +import stat +import subprocess + +import pytest + +from pytest_console_scripts import script_runner + +logger = logging.getLogger(__name__) + + +testDBCred = os.path.join(os.path.dirname(__file__), "data", "params.json") +if stat.S_IMODE(os.stat(testDBCred).st_mode) != stat.S_IRUSR: + os.chmod(testDBCred, stat.S_IRUSR) ## set 400 +testDBArg = f"--database={testDBCred}" + +_hasROOT = False +try: + import cppyy + + _hasROOT = True +except ImportError: + pass +needROOT = pytest.mark.skipif(not _hasROOT, reason="Needs ROOT") +needGridProxy = pytest.mark.skipif( + (distutils.spawn.find_executable("voms-proxy-info") is None) + or (subprocess.call(["voms-proxy-info", "--exists", "--valid", "0:5"]) != 0), + reason="Needs a valid grid proxy", +) + + +@pytest.fixture +def tmptestdbcopy(tmpdir): + import shutil + + shutil.copy2( + os.path.join(os.path.dirname(__file__), "data", "params.json"), + str(tmpdir.join("params.json")), + ) + shutil.copy2( + os.path.join(os.path.dirname(__file__), "data", "test.db"), str(tmpdir.join("test.db")) + ) + yield "--database={}".format(str(tmpdir.join("params.json"))) + + +def checkSuccessOutLines(ret, nOut=None, nErr=None): + logger.info(ret.stdout) + if not ret.success: + logger.info(ret.stderr) + assert ret.success + if nOut is not None: + assert (nOut == 0 and len(ret.stdout.strip()) == 0) or len( + ret.stdout.strip().split("\n") + ) == nOut + if nErr is not None: + assert (nErr == 0 and len(ret.stderr.strip()) == 0) or len( + ret.stderr.strip().split("\n") + ) == nErr + + +def test_search_sample(script_runner): + checkSuccessOutLines( + script_runner.run("search_SAMADhi", "sample", "--name=test*", testDBArg), nOut=8, nErr=0 + ) + + +def test_add_sample(script_runner, tmptestdbcopy): + checkSuccessOutLines( + script_runner.run( + "add_sample", + "--continue", + tmptestdbcopy, + "NTUPLES", + "--name=test_cli_addSample_1", + "--processed=-1", + "--nevents=10", + "--norm=2.", + "--weight-sum=12.", + "--lumi=0.3", + "--code_version=0.1.0", + "--comment='testing add_sample'", + "--source_dataset=7", + "--source_sample=8", + "--author=pytest", + "/tmp", + ) + ) + checkSuccessOutLines( + script_runner.run( + "search_SAMADhi", tmptestdbcopy, "--long", "sample", "--name=test_cli_addSample_1" + ) + ) + checkSuccessOutLines( + script_runner.run("search_SAMADhi", tmptestdbcopy, "sample", "--name=test_cli_addSample_1"), + nOut=1, + ) + + +def test_add_sample_noconfirm(script_runner, tmptestdbcopy): + import io + + checkSuccessOutLines( + script_runner.run( + "add_sample", + tmptestdbcopy, + "NTUPLES", + "--name=test_cli_addSample_2", + "--processed=-1", + "--nevents=10", + "--norm=2.", + "--weight-sum=12.", + "--lumi=0.3", + "--code_version=0.1.0", + 
"--comment='testing add_sample'", + "--author=pytest", + "/tmp", + stdin=io.StringIO("\n\nn\n"), + ) + ) ## no source sample or dataset, no insert + checkSuccessOutLines( + script_runner.run("search_SAMADhi", tmptestdbcopy, "sample", "--name=test_cli_addSample_2"), + nOut=0, + ) + + +@needROOT +def test_add_sample_files(script_runner, tmptestdbcopy): + checkSuccessOutLines( + script_runner.run( + "add_sample", + "--continue", + tmptestdbcopy, + "NTUPLES", + "--name=test_cli_addSample_3", + "--processed=-1", + "--nevents=10", + "--norm=2.", + "--weight-sum=12.", + "--lumi=0.3", + "--code_version=0.1.0", + "--comment='testing add_sample'", + "--author=pytest", + "--files=/foo/bar/test_cli_addSample/1.root,/foo/bar/test_cli_addSample/2.root", + "/tmp", + ) + ) + checkSuccessOutLines( + script_runner.run("search_SAMADhi", tmptestdbcopy, "sample", "--name=test_cli_addSample_3"), + nOut=1, + ) + + +def test_add_result(script_runner, tmptestdbcopy): + checkSuccessOutLines( + script_runner.run( + "add_result", + "--continue", + tmptestdbcopy, + "--analysis=1", + "--sample=4,5,6,7,8", + "--description='testing add_result'", + "--author=pytest", + "--elog=TODO", + "/tmp", + stdin=b"n", + ) + ) + checkSuccessOutLines( + script_runner.run("search_SAMADhi", tmptestdbcopy, "result", "--path=/tmp"), nOut=1 + ) + + +@needGridProxy +def test_import_dataset(script_runner, tmptestdbcopy): + dasName = "/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIIAutumn18NanoAODv4-Nano14Dec2018_102X_upgrade2018_realistic_v16-v1/NANOAODSIM" + checkSuccessOutLines( + script_runner.run( + "das_import", + "--continue", + tmptestdbcopy, + "--energy=13", + "--xsection=6225.42", + "--process=DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXF-pythia8", + dasName, + ) + ) + checkSuccessOutLines( + script_runner.run("search_SAMADhi", tmptestdbcopy, "dataset", f"--name={dasName}"), nOut=1 + ) + + +@needGridProxy +def test_import_dataset_update_xsec(script_runner, tmptestdbcopy): + checkSuccessOutLines( + script_runner.run( + "update_datasets_cross_section", + tmptestdbcopy, + "--write", + "--force=1", + "test_modelRels_mc1", + ) + ) diff --git a/tests/test_cli_existing.py b/tests/test_cli_existing.py new file mode 100644 index 0000000..c3d0859 --- /dev/null +++ b/tests/test_cli_existing.py @@ -0,0 +1,42 @@ +import logging +import os +import os.path + +import pytest + +from pytest_console_scripts import script_runner + +logger = logging.getLogger(__name__) + +needCredentials = pytest.mark.skipif( + not os.path.isfile( + os.path.expandvars(os.path.expanduser(os.getenv("SAMADHI_CREDENTIALS", "~/.samadhi"))) + ), + reason="Needs valid SAMADhi credentials", +) +dbArg = ( + "--database={}".format(os.getenv("SAMADHI_CREDENTIALS")) + if os.getenv("SAMADHI_CREDENTIALS") is not None + else None +) + + +def checkSuccessOutLines(ret, nOut=None, nErr=None): + logger.info(ret.stdout) + assert ret.success + if nOut is not None: + assert (nOut == 0 and len(ret.stdout.strip()) == 0) or len( + ret.stdout.strip().split("\n") + ) == nOut + if nErr is not None: + assert (nErr == 0 and len(ret.stderr.strip()) == 0) or len( + ret.stderr.strip().split("\n") + ) == nErr + + +@needCredentials +def test_search_sample(script_runner): + args = ["search_SAMADhi", "dataset", "--name=/DoubleMuon/Run2016*-03Feb2017-v*/MINIAOD"] + if dbArg: + args.append(dbArg) + checkSuccessOutLines(script_runner.run(*args), nOut=5, nErr=0) diff --git a/tests/test_models.py b/tests/test_models.py new file mode 100644 index 0000000..5f0f83d --- /dev/null +++ b/tests/test_models.py @@ 
-0,0 +1,176 @@ +import logging + +import pytest + +logger = logging.getLogger(__name__) + + +@pytest.fixture(scope="module") +def sqlitetestdb(): + from cp3_llbb.SAMADhi.SAMADhi import _models as MODELS + from peewee import SqliteDatabase + + test_db = SqliteDatabase(":memory:") + test_db.bind(MODELS, bind_refs=False, bind_backrefs=False) + test_db.connect() + test_db.create_tables(MODELS) + yield + test_db.drop_tables(MODELS) + test_db.close() + + +def test_createAnalysis(sqlitetestdb): + from cp3_llbb.SAMADhi.SAMADhi import Analysis + + ana = Analysis.create( + cadiline="NP-20-001", contact="me ", description="Evidence for new physics" + ) + logger.info(str(ana)) + + +def test_createDataset_minimal(sqlitetestdb): + from cp3_llbb.SAMADhi.SAMADhi import Dataset + + dset = Dataset.create(name="test_createDataset_minimal dataset", datatype="mc") + logger.info(str(dset)) + + +def test_createDataset_fullNoRel(sqlitetestdb): + from cp3_llbb.SAMADhi.SAMADhi import Dataset + from datetime import datetime, timedelta + + dset = Dataset.create( + name="/NewPhysics/test_createDataset_fullNoRel/NANOAODSIM", + datatype="mc", + cmssw_release="CMSSW_10_6_0", + dsize=1024, + energy=14.0, + globaltag="mc_run2_106X_v0", + nevents=1000, + process="New Physics", + xsection=0.001, + user_comment="Your favourite sample", + creation_time=datetime.now() - timedelta(days=7), + ) + logger.info(str(dset)) + + +def test_createSample_minimal(sqlitetestdb): + from cp3_llbb.SAMADhi.SAMADhi import Sample + + smp = Sample.create( + name="test_createSample_minimal sample", + path="/test/sample/minimal", + sampletype="NTUPLES", + nevents_processed=1000, + ) + logger.info(str(smp)) + + +def test_createSample_fullNoRel(sqlitetestdb): + from cp3_llbb.SAMADhi.SAMADhi import Sample + + smp = Sample.create( + name="test_createSample_fullNoRel sample", + path="/test/sample/fullNoRel", + sampletype="NTUPLES", + nevents_processed=1000, + author="me ", + code_version="Framework_x.y.z_MyAnalysis_u.v.w", + event_weight_sum=1000.0, + extras_event_weight_sum="variations go here - not available", + luminosity=2.0, + nevents=215, + processed_lumi="almost all", + user_comment="hello world", + ) + logger.info(str(smp)) + + +def test_createFile(sqlitetestdb): + from cp3_llbb.SAMADhi.SAMADhi import Sample, File + + smp = Sample.create( + name="test_createFile_minimal sample", + path="/test/sample/minimal_for_fie", + sampletype="NTUPLES", + nevents_processed=1000, + ) + from cp3_llbb.SAMADhi.SAMADhi import File + + f = File.create( + lfn="/foo/bar/test_createFile_minimal.root", + pfn="/my/storage/foo/bar/test_createFile_minimal.root", + nevents=1, + event_weight_sum=1.0, + sample=smp, + ) + logger.info(str(f)) + + +def test_createResult_minimal(sqlitetestdb): + from cp3_llbb.SAMADhi.SAMADhi import Result + + result = Result.create(path="/my/home/test_minimal_result.pdf") + logger.info(str(result)) + + +def test_createResult_fullNoRel(sqlitetestdb): + from cp3_llbb.SAMADhi.SAMADhi import Result + + result = Result.create( + path="/my/home/test_fullNoRel_result.pdf", + author="me ", + description="An interesting result", + elog="TODO", + ) + logger.info(str(result)) + + +def test_modelRels(sqlitetestdb): + from cp3_llbb.SAMADhi.SAMADhi import Analysis, Dataset, Sample, File, Result + + ana = Analysis.create( + cadiline="NP-20-002", contact="me ", description="Measurement of XY->ZUVW" + ) + datasets = [ + Dataset.create(name=f"test_modelRels_data{i:d}", datatype="data") for i in range(3) + ] + [Dataset.create(name=f"test_modelRels_mc{i:d}", 
datatype="mc") for i in range(2)] + samples = [ + Sample.create( + name="test_modelRels_{}".format(ds.name.split("_")[-1]), + path=f"/test/sample/{ds.name}", + sampletype="NTUPLES", + nevents_processed=1000, + source_dataset=ds, + ) + for ds in datasets + ] + for smp in samples: + for i in range(4): + File.create( + sample=smp, + lfn=f"{smp.name}/{i:d}.root", + pfn=f"/store/me{smp.name}/{i:d}.root", + nevents=250, + event_weight_sum=250, + ) + res1 = Result.create( + analysis=ana, + author="me ", + description="Preliminary result", + elog="TODO", + path="/home/my/ana", + ) + res2 = Result.create( + analysis=ana, + author="me ", + description="Final result", + elog="TODO", + path="/home/my/paper", + ) + logger.info(str(ana)) + logger.info(str(datasets[-1])) + logger.info(str(samples[-1])) + logger.info(str(res1)) + logger.info(str(res2)) diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..6107e5f --- /dev/null +++ b/tox.ini @@ -0,0 +1,8 @@ +[tox] +envlist = py36,py37,py38,py39 +isolated_build = True +[testenv] +deps = + pytest + pytest-console-scripts +commands = pytest