From 218e27c88f4ac3a62985a37916d20e93aae85a15 Mon Sep 17 00:00:00 2001 From: Fernando Garzon Date: Mon, 11 May 2020 14:15:57 -0500 Subject: [PATCH 01/21] Json writer to be tested --- .../XfedKibana/XRDFED-kibana-probe_to_JSON.py | 466 ++++++++++++++++++ 1 file changed, 466 insertions(+) create mode 100755 AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON.py diff --git a/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON.py b/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON.py new file mode 100755 index 0000000..55dd8f8 --- /dev/null +++ b/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON.py @@ -0,0 +1,466 @@ +#!/usr/bin/python +# functional probe and SLS extractor for the "federation" xroot services +# highlights: +# - stateless (i.e. run from cron whenever needed) +# - will try to prevent parallel runs via lockfile +# - multithreaded, one thread per service to be tested +# - overall runtime cap at 10min +# - could extract some statistics from xroot directly, but these are ever-increasing counters +# Problems: +# - need to update the code whenever a service is addded/deleted/changed +# - uses "random" files on various Xroot services all over the world, these are (for now) the same as used by the experiments but these might change.. + +import xml.dom.minidom +import subprocess +import os +import sys +import signal +import re +import time +import Lemon.XMLAPI +import socket +import atexit +import threading +import tempfile +import json + +html_dir = '/var/www/html/aaa-probe/' # will create per-service xml files here + +#CERN_eosfile_rucio='/atlas/rucio/user/ivukotic:user.ivukotic.xrootd.cern-prod-1M' + +LOCKFILE='/var/lock/subsys/xrdfed-kibana-probe' + +class Alarm(Exception): + pass + +def alarm_handler(signum, frame): + print "ERROR: caught overall timeout after "+str(timeout_sec)+"s\n" + clear_lock() + sys.exit(2) + raise Alarm # should not reach this.. 
+ +def clear_lock(): + try: + os.unlink(LOCKFILE) + except Exception,e: + print "could not remove lockfile:"+str(e) + +def env_setup(): + # GSI + os.environ['X509_USER_CERT']='/root/.globus/slsprobe-cert.pem' + os.environ['X509_USER_KEY']='/root/.globus/slsprobe-key.pem' + #os.environ['X509_USER_CERT']='/etc/grid-security/hostcert.pem' + #os.environ['X509_USER_KEY']='/etc/grid-security/hostkey.pem' + #set this only if get_proxy() is not used and proxy is refreshed by other mechanism e.g. from cronttab + os.environ['X509_USER_PROXY']='/root/.globus/slsprobe.proxy' + # these work for SSL + #os.environ['XrdSecSSLUSERKEY']='/etc/grid-security/hostkey.pem ' + #os.environ['XrdSecSSLUSERCERT']='/etc/grid-security/hostcert.pem ' + # specifically tell it to *not* use Kerberos (in case we run this interactively) + os.environ['KRB5CCNAME']='FILE:/dev/null' + os.environ['PATH']=os.environ['PATH']+":/opt/globus/bin/" + + +def get_proxy(): + dev_null = open('/dev/null', 'rw') + (proxyfd,proxy)=tempfile.mkstemp(prefix='x509_xrdfed_',suffix='.pem') + os.close(proxyfd) + os.environ['X509_USER_PROXY']=proxy + + ret = subprocess.call(['grid-proxy-init','-pwstdin'], + stdin=dev_null, + ) # most of the proxy details will get printed in "xrdcp" output anyway + if ret > 0: + raise Exception("Cannot get X509 proxy") + dev_null.close() + + +def cleanup_proxy(): + try: + os.unlink(os.environ['X509_USER_PROXY']) + except Exception,e: + print "could not remove proxy file:"+str(e) + +def try_lock(): + # use "lockfile" directly since wayy easier + ret = subprocess.call(['lockfile','-5','-r2',LOCKFILE]) + if ret > 0: + print "could not create lockfile" + return False + return True + +""" + +def prepare_xml(servicename): + doc = xml.dom.minidom.Document() + serviceUpdate = doc.createElementNS("http://sls.cern.ch/SLS/XML/update","serviceupdate") + serviceUpdate.setAttribute("xmlns", "http://sls.cern.ch/SLS/XML/update") + doc.appendChild(serviceUpdate) + id = doc.createElement("id") + 
id.appendChild(doc.createTextNode(servicename)) + serviceUpdate.appendChild(id) + data = doc.createElement("data") + serviceUpdate.appendChild(data) + timeStampTmp = time.strftime("%Y-%m-%dT%H:%M:%S%z") + timeStamp=timeStampTmp[0:-2]+':'+timeStampTmp[-2:] # xml needs a colon, nobody else.. + tValue = doc.createElement("timestamp") + tValue.appendChild(doc.createTextNode(timeStamp)) + serviceUpdate.appendChild(tValue) + return (doc,data, serviceUpdate) + +""" + +def prepare_dictionary(servicename): + timeStampTmp = time.strftime("%Y-%m-%dT%H:%M:%S%z") + timeStamp=timeStampTmp[0:-2]+':'+timeStampTmp[-2:] + dic={'serviceupdate':{ + 'xmlns': "http://sls.cern.ch/SLS/XML/update" + 'data': '', + 'id':servicename, + 'timestamp': timeStamp + } + + + } + return dic +def dnsalias_to_nodes(redirector): + (host,port) = redirector.split(':') + all_hosts = [] + data=socket.getaddrinfo(host,port,0, 0, socket.SOL_TCP ) + for addr in data: + (family, socktype, proto, canonname, sockaddr) = addr + (hostname, aliaslist, ipaddrlist) = socket.gethostbyaddr(sockaddr[0]) + if not hostname in all_hosts: + all_hosts.append(hostname) + return all_hosts + +def xrdcp_test(redirector,file): + (errtext,out,err,elapsed) = run_xrd_commands("xrdcp", + ["-d","2", + "-f", + "-DIReadCacheSize","0", + "-DIRedirCntTimeout","180", + "root://"+redirector+'/'+file, + '/dev/null']) + return (errtext,err,elapsed) + +def xrd_info(redirector): + version = "(unknown)" + (errtext,out,err,elapsed) = run_xrd_commands("xrd", + [redirector, + "query","1", # 1:kXR_QStats + "a"]) # a_ll stats + if not errtext: + try: + dom = xml.dom.minidom.parseString(out) + root_node = dom.documentElement + if root_node.tagName == 'statistics': + v_attr = root_node.getAttributeNode('ver') + version = v_attr.nodeValue + except Exception,e: + errtext = "ERROR: cannot parse answer:"+str(e) + return (errtext,version,out) + +def run_xrd_commands(cmd,args): + dev_null = open('/dev/null', 'r') + errtxt = '' + elapsed = -1.0 + xrd_args = [ 
'perl','-e',"alarm 180; exec @ARGV", cmd, # one-line wrapper that *actually* kills the command + "-DIConnectTimeout","30", + "-DITransactionTimeout","60", + "-DIRequestTimeout","60" ] + args + try: + start = time.time() + proc = subprocess.Popen(xrd_args, + stdin=dev_null, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + (out, err) = proc.communicate() + ret = proc.returncode + elapsed = (time.time() - start) + err_redir_index = err.rfind('Received redirection to') + err_index3010 = err.rfind('(error code: 3010') # (permission denied) may be sort-of-OK - we are talking to final storage already - UK + err_index3005 = err.rfind('(error code: 3005') # (no user mapping) - INFN + if err_redir_index >= 0 and (err_index3010 >= 0 or err_index3005 >= 0): + errtxt = '' + else: + if(ret > 0): + errtxt = "client-side error - exit code "+str(ret)+"\n" + err_index = err.rfind('Last server error') + if err_index >= 0: + err_end_index=err.find("\n",err_index) + errtxt = errtxt + err[err_index:err_end_index] + except Exception,e: + errtext = errtxt + "Exception: "+str(e) + dev_null.close() + + return (errtxt,out,err,elapsed) + + + +def test_redirector(servicename, redirector, file_below=None, file_above=None, extra_notes=""): + servicename=servicename.upper() + #availability = 100.0 + availability = 'available' + availinfo = '' + need_xml_link=0 + + # prepare the dictionary. + dicci = prepare_dictionary(servicename) + #(doc, data, serviceUpdate) = prepare_xml(servicename) + notes_text = "Redirector:"+redirector+"
" + + # run the functional tests - first some simple check to get the version, if OK look for files + (err_info,version,dump_info) = xrd_info(redirector) + if(err_info): + #availability = 0.0 + availability = 'unavailable' + availinfo=availinfo+"
Error getting info from redirector
"+err_info + c="Detailed output for INFO for "+redirector+"\n"+ + err_info+"\n" + +dump_info + #serviceUpdate.appendChild(c) + dicci = {**dicci, **{'comment': c}} + numericvalue = { + + 'name': "xrdcp_below_time", + 'desc' : "Time to copy a file below redirector", + 'value': '0.000' + } + #nValue.setAttribute("name", "xrdcp_below_time") + #nValue.setAttribute("desc", "Time to copy a file below redirector") + #nValue.appendChild(doc.createTextNode("0.000")) + dicci['serviceupdate']['data'] = numericvalue + #data.appendChild(nValue) + + else: + availinfo="Version check: "+version+"\n" + if (file_below): + notes_text = notes_text + "File 'below': " + file_below + "
" + (err_below,dump_below,elapsed_below) = xrdcp_test(redirector, file_below) + if err_below : + #availability = 66.0 + availability = 'degraded' + availinfo=availinfo+"
Error below redirector
\n"+err_below + # sanitize the raw output in order to not trigger XML errors.. + dump_sane = re.sub('---*','__',dump_below) + c = "Detailed output for file BELOW "+redirector+":"+file_below+"\n"+ + err_below+"\n" + +dump_sane + dicci = {**dicci, **{'comment': c}} + need_xml_link=1 + else: + availinfo=availinfo+"
File below: OK
" + numericvalue = { + 'name': "xrdcp_below_time", + "desc": "Time to copy a file below redirector", + "elapsed_below": str(elapsed_below) + } + #nValue = doc.createElement("numericvalue") + #nValue.setAttribute("name", "xrdcp_below_time") + #nValue.setAttribute("desc", "Time to copy a file below redirector") + #nValue.appendChild(doc.createTextNode(str(elapsed_below))) + dicci['serviceupdate']['data'] = numericvalue + #data.appendChild(nValue) + else: + availinfo=availinfo+"
File below: not tested." + if(file_above): + notes_text = notes_text + "File 'elsewhere': " + file_above + "
" + (err_above,dump_above,elapsed_above) = xrdcp_test(redirector, file_above) + if err_above : + #We've changed availability from number to string so this below won't work; Marian commented out on 2015-11-06 + #availability = availability * 0.8 # less important if some remote site is failing.. + availinfo=availinfo+"
Error above redirector
"+err_above + # sanitize the raw output in order to not trigger XML errors.. in a comment. + dump_sane = re.sub('---*','__',dump_above) + c = "Detailed output for file ABOVE "+redirector+":"+file_above+"\n"+ + err_above+"\n" + +dump_sane + dicci = {**dicci, **{'comment': c}} + #serviceUpdate.appendChild(c) + need_xml_link=1 + else: + availinfo=availinfo+"
File above: OK
" + numericvalue = { + 'name': "xrdcp_below_time", + "desc": "Time to copy a file below redirector", + "elapsed_below": str(elapsed_below) + } + #nValue = doc.createElement("numericvalue") + #nValue.setAttribute("name", "xrdcp_above_time") + #nValue.setAttribute("desc", "Time to copy a file elsewhere in the federation") + #nValue.appendChild(doc.createTextNode(str(elapsed_above))) + dicci['serviceupdate']['data'] = numericvalue + #data.appendChild(nValue) + else: + availinfo=availinfo+"
File above: not tested." + + # save functional test info to XML + if need_xml_link: + myhostname = socket.gethostname() + notes_text = notes_text + "Details for failed test: http://" + myhostname + "/aaa-probe/" + servicename + ".xml
\n" + "Details for recently failed test : http://vocms039.cern.ch/aaa-probe/err/
\n" + availinfo = availinfo + "
" + notes_text + dicci = {**dicci, **{'status': str(availability)}} + dicci = {**dicci, **{'status': availinfo}} + #availabilityF = doc.createElement("status") + #availabilityF.appendChild(doc.createTextNode(str(availability))) + #serviceUpdate.appendChild(availabilityF) + #availabilityInfo = doc.createElement("availabilityinfo") + #availabilityInfo.appendChild(doc.createTextNode(availinfo)) + #serviceUpdate.appendChild(availabilityInfo) + + # commented out by Engin + #notes = doc.createElement("notes") + #notes.appendChild(doc.createTextNode(notes_text + extra_notes)) + #serviceUpdate.appendChild(notes) + + # collect LEMON stuff, save as numericvalues in XML + # Problem: need to add up for all the hosts in the alias + # 2015-11-09: commented out by MarianZ; note I removed functions getLemonMetric and collect_lemon + #collect_lemon(redirector=redirector,doc=doc,data=data) + + # try to get some "readable XML" that still is accepted by SLS: + #uglyXml = doc.toprettyxml(indent=" ",encoding="utf-8") + #text_re = re.compile('>\n\s+([^<>\s].*?)\n\s+\g<1> 1: + if sys.argv[1] == '-d': + debug=1 + if not try_lock(): + sys.exit(1) + if not os.path.exists(html_dir): + os.makedirs(html_dir) + env_setup() + # get a proxy cert + # get_proxy() + + timeout_sec = 10 * 60 # limit overall runtime to 10min + signal.signal(signal.SIGALRM, alarm_handler) + + ATLASLINK="%BR%Monitoring:%BR%\n http://atl-prod07.slac.stanford.edu:8080/display?page=xrd_report/aggregated/total_xrootd_lgn %BR%\n http://dashb-atlas-xrootd-transfers.cern.ch/ui %BR%\nhttp://dashb-atlas-ssb.cern.ch/dashboard/request.py/siteview#currentView=FAX+redirectors&highlight=false %BR%\n" + CMSLINK="%BR%Monitoring:%BR%\n http://xrootd.t2.ucsd.edu/dashboard/ %BR%\n http://dashb-cms-xrootd-transfers.cern.ch/ui %BR%\n" + #FILEABOVE="/store/test/xrootd/T2_US_Nebraska/store/mc/SAM/GenericTTbar/GEN-SIM-RECO/CMSSW_5_3_1_START53_V5-v1/0013/CE4D66EB-5AAE-E111-96D6-003048D37524.root" + 
#FILEBELOW="/store/test/xrootd/T2_CH_CERN/store/mc/SAM/GenericTTbar/GEN-SIM-RECO/CMSSW_5_3_1_START53_V5-v1/0013/CE4D66EB-5AAE-E111-96D6-003048D37524.root" + #FILEABOVE="/store/mc/SAM/GenericTTbar/GEN-SIM-RECO/CMSSW_5_3_1_START53_V5-v1/0013/CE4D66EB-5AAE-E111-96D6-003048D37524.root" + #FILEBELOW="/store/mc/SAM/GenericTTbar/GEN-SIM-RECO/CMSSW_5_3_1_START53_V5-v1/0013/CE4D66EB-5AAE-E111-96D6-003048D37524.root" + FILEABOVE="/store/mc/SAM/GenericTTbar/AODSIM/CMSSW_9_2_6_91X_mcRun1_realistic_v2-v1/00000/A64CCCF2-5C76-E711-B359-0CC47A78A3F8.root" + FILEBELOW="/store/mc/SAM/GenericTTbar/AODSIM/CMSSW_9_2_6_91X_mcRun1_realistic_v2-v1/00000/A64CCCF2-5C76-E711-B359-0CC47A78A3F8.root" + + services = { + "XRDFED_CMS-GLOBAL01-NEW":{'redirector':'cms-xrd-global01.cern.ch:1094', + 'file_below': FILEABOVE, + 'file_above': FILEBELOW, + 'extra_notes':CMSLINK}, + + "XRDFED_CMS-GLOBAL02-NEW":{'redirector':'cms-xrd-global02.cern.ch:1094', + 'file_below': FILEABOVE, + 'file_above': FILEBELOW, + 'extra_notes':CMSLINK}, + +# "XRDFED_CMS-GLOBAL01":{'redirector':'xrdcmsglobal01.cern.ch:1094', +# 'file_below': FILEABOVE, +# 'file_above': FILEBELOW, +# 'extra_notes':CMSLINK}, + +# "XRDFED_CMS-GLOBAL02":{'redirector':'xrdcmsglobal02.cern.ch:1094', +# 'file_below': FILEABOVE, +# 'file_above': FILEBELOW, +# 'extra_notes':CMSLINK}, + + "XRDFED_CMS-US-FNAL":{'redirector':'cmsxrootd2.fnal.gov:1094', + 'file_below': FILEABOVE, + 'file_above': FILEBELOW, + 'extra_notes':CMSLINK}, + + "XRDFED_CMS-US-UNL":{'redirector':'xrootd.unl.edu:1094', + 'file_below': FILEABOVE, + 'file_above': FILEBELOW, + 'extra_notes':CMSLINK}, + + "XRDFED_CMS-EU-BARI":{'redirector':'xrootd.ba.infn.it:1094', + 'file_below': FILEBELOW, + 'file_above': FILEABOVE, + 'extra_notes':CMSLINK}, + + "XRDFED_CMS-EU-LLR":{'redirector':'llrxrd-redir.in2p3.fr:1094', + 'file_below': FILEBELOW, + 'file_above': FILEABOVE, + 'extra_notes':CMSLINK}, + + "XRDFED_CMS-EU-PISA":{'redirector':'xrootd-redic.pi.infn.it:1094', + 'file_below': FILEBELOW, 
+ 'file_above': FILEABOVE, + 'extra_notes':CMSLINK}, + + "XRDFED_CMS-GLOBAL":{'redirector':'cms-xrd-global.cern.ch:1094', + 'file_below': FILEABOVE, + 'file_above': FILEBELOW, + 'extra_notes':CMSLINK}, + + "XRDFED_CMS-US":{'redirector':'cmsxrootd.fnal.gov:1094', + 'file_below': FILEABOVE, + 'file_above': FILEBELOW, + 'extra_notes':CMSLINK}, + + "XRDFED_CMS-EU":{'redirector':'xrootd-cms.infn.it:1094', + 'file_below': FILEBELOW, + 'file_above': FILEABOVE, + 'extra_notes':CMSLINK}, + + "XRDFED_CMS-EU-IPv6":{ 'redirector':'xrootd-cms-redir-01.cr.cnaf.infn.it:1094', + 'file_below': FILEBELOW, + 'file_above': FILEABOVE, + 'extra_notes':CMSLINK}, + "XRDFED_CMS-TRANSIT":{'redirector':'cms-xrd-transit.cern.ch:1094', + 'file_below': FILEBELOW, + 'file_above': FILEABOVE, + 'extra_notes':CMSLINK}, + + "XRDFED_CMS-TRANSIT01":{'redirector':'vocms031.cern.ch:1094', + 'file_below': FILEBELOW, + 'file_above': FILEABOVE, + 'extra_notes':CMSLINK}, + + "XRDFED_CMS-TRANSIT02":{'redirector':'vocms032.cern.ch:1094', + 'file_below': FILEBELOW, + 'file_above': FILEABOVE, + 'extra_notes':CMSLINK}, + + } + signal.alarm(timeout_sec) + try: + for xrd in services: + services[xrd].update(servicename=xrd) # would like to get the servicename= parameter for the function + if debug: + test_redirector(** services[xrd]) # sequentially + else: + t = threading.Thread(target=test_redirector, + kwargs = services[xrd]) # read: "run a thread with the test function and all the parameters above as arguments" + t.start() + + + except Alarm: + print "ERROR: caught overall timeout after "+str(timeout_sec)+"s\n" + clear_lock() + sys.exit(2) + signal.alarm(0) + # not cleaning up the proxy files (are shared via the ENV, and we don't want an extra thread to just remove that file, or wait for the individual tests to finish... 
+ +if __name__ == '__main__': + main() From caf18b4df2cc157db3d243d6cca4af71d85d4c6c Mon Sep 17 00:00:00 2001 From: Oscar Fernando Garzon Miguez Date: Tue, 9 Jun 2020 21:30:10 +0200 Subject: [PATCH 02/21] File mismatch script updated for failing workflows --- consistency_check/File_mismatch_WS.py | 133 ++++++++++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 consistency_check/File_mismatch_WS.py diff --git a/consistency_check/File_mismatch_WS.py b/consistency_check/File_mismatch_WS.py new file mode 100644 index 0000000..e304c81 --- /dev/null +++ b/consistency_check/File_mismatch_WS.py @@ -0,0 +1,133 @@ +import urllib.request as request +import urllib +import json +import requests +import os +import pandas +from pandas.io.json import json_normalize +from bs4 import BeautifulSoup as soup +#import urllib.request.urlopen as uReq +import warnings +warnings.filterwarnings('ignore') + +class x509RESTSession(object): + datasvc = "https://cmsweb.cern.ch/phedex/datasvc/json/prod" + datasvc_xml = "https://cmsweb.cern.ch/phedex/datasvc/xml/prod" + + def __init__(self): + self._session = requests.Session() + home = os.path.expanduser('~/') + # $ openssl rsa -in userkey_protected.pem -out userkey.pem; chmod 0600 userkey.pem + self._session.cert = (home+'.globus/usercert.pem', home+'.globus/userkey2.pem') + self._session.verify = os.getenv('X509_CERT_DIR') + + def data(self, dataset): + res = self._session.get("%s/data" % self.datasvc, params={'dataset': dataset}) + resjson = json.loads(res.content) + out = [] + for _instance in resjson["phedex"]["dbs"]: + for _dataset in _instance["dataset"]: + for _block in _dataset["block"]: + for _file in _block["file"]: + out.append( + { + "Dataset": _dataset["name"], + "File_name": _file["lfn"], + "File_checksum": _file["checksum"] + + } + ) + df = pandas.io.json.json_normalize(out) + return df + #format_dates(df, ["Time_file_was_created", "Time_block_was_created"]) + + def dbsinfo(self, dataset): + + res = 
self._session.get("https://cmsweb.cern.ch/dbs/prod/global/DBSReader/files?detail=1&dataset="+dataset) + resjson = json.loads(res.content) + out = [] + for _instance in resjson: + out.append( + { + "Dataset": _instance["dataset"], + "Is_valid": _instance["is_file_valid"], + "File_name": _instance["logical_file_name"], + "File_checksum": _instance["check_sum"], + "last_modified_by": _instance ["last_modified_by"] + + } + ) + + df = pandas.io.json.json_normalize(out) + return df + + def load_html(self, url): + uClient = request.urlopen(url) + web_site = uClient.read() + uClient.close() + page = soup(web_site, "html.parser") + return page + + def jsonmethod(self, method, **params): + return self.getjson(url=self.jsonurl.join(method), params=params) + +def get_datasets(web): + datasets = [] + for table in web.findAll('table'): + tr = table.findAll('tr') + for i in range(len(tr)): + if i > 0: + casilla = tr[i].findAll('td') + if (len(casilla[0].findAll('a')) > 0): + datasets.append(str(casilla[1].text)) + else: + datasets.append(str(casilla[0].text)) + return datasets + +def main(sesion, web): + datasets = get_datasets(web) + invalidate_in_phedex = [] + invalidate_in_dbs = [] + dataset_empty_dbs = [] + dataset_empty_phedex = [] + for _dataset in datasets: + phedex = sesion.data(dataset = _dataset) + dbs = sesion.dbsinfo(dataset = _dataset) + if dbs.empty or phedex.empty: + if dbs.empty: + dataset_empty_dbs.append(_dataset) + if phedex.empty: + dataset_empty_phedex.append(_dataset) + else: + dbs_valid = dbs.loc[dbs['Is_valid'] == 1] + if len(phedex['File_name']) == len(dbs_valid['File_name']): + pass + elif len(phedex['File_name']) > len(dbs_valid['File_name']): + invalidated_in_dbs_by_unified = dbs.loc[dbs['last_modified_by'].str.contains('unified')] + invalidated_in_dbs_by_unified = invalidated_in_dbs_by_unified.loc[invalidated_in_dbs_by_unified['Is_valid'] == 0] + phedex['invalidated_by_unified_in_dbs'] = 
phedex["File_name"].isin(invalidated_in_dbs_by_unified["File_name"]) + array = phedex.loc[phedex["invalidated_by_unified_in_dbs"] == True, 'File_name'].to_numpy() + for i in range(len(array)): + invalidate_in_phedex.append(array[i]) + elif len(phedex['File_name']) < len(dbs_valid['File_name']): + dbs_valid["in_phedex"] = dbs_valid["File_name"].isin(phedex["File_name"]) + array = dbs_valid.loc[dbs_valid["in_phedex"] == False, 'File_name'].to_numpy() + for i in range(len(array)): + invalidate_in_dbs.append(array[i]) + with open('invalidate_in_dbs.txt', 'w') as f: + for item in invalidate_in_dbs: + f.write("%s\n" % item) + with open('invalidate_in_phedex.txt', 'w') as f: + for item in invalidate_in_phedex: + f.write("%s\n" % item) + with open('dataset_empty_dbs.txt', 'w') as f: + for item in dataset_empty_dbs: + f.write("%s\n" % item) + with open('dataset_empty_phedex.txt', 'w') as f: + for item in dataset_empty_phedex: + f.write("%s\n" % item) + +if __name__ == '__main__': + sesion = x509RESTSession() + web_info = sesion.load_html(url='https://cms-unified.web.cern.ch/cms-unified/assistance.html') + main(sesion, web_info) From 311d6678ff01d4de9b5a7f3fa3ce30f4000d98f6 Mon Sep 17 00:00:00 2001 From: Oscar Fernando Garzon Miguez Date: Fri, 12 Jun 2020 18:12:47 +0200 Subject: [PATCH 03/21] Kibana probe is producing JSON files --- .../XRDFED-kibana-probe_to_JSON-Copy1.py | 338 ++++++++++++++++++ 1 file changed, 338 insertions(+) create mode 100644 AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy1.py diff --git a/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy1.py b/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy1.py new file mode 100644 index 0000000..ff061dc --- /dev/null +++ b/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy1.py @@ -0,0 +1,338 @@ +#!/usr/bin/python +# functional probe and SLS extractor for the "federation" xroot services +# highlights: +# - stateless (i.e. 
run from cron whenever needed) +# - will try to prevent parallel runs via lockfile +# - multithreaded, one thread per service to be tested +# - overall runtime cap at 10min +# - could extract some statistics from xroot directly, but these are ever-increasing counters +# Problems: +# - need to update the code whenever a service is addded/deleted/changed +# - uses "random" files on various Xroot services all over the world, these are (for now) the same as used by the experiments but these might change.. + +import xml.dom.minidom +import subprocess +import os +import sys +import signal +import re +import time +import Lemon.XMLAPI +import socket +import atexit +import threading +import tempfile +import json + +html_dir = '/var/www/html/aaa-probe/' # will create per-service json files here + +LOCKFILE='/var/lock/subsys/xrdfed-kibana-probe' + +class Alarm(Exception): + pass + +def alarm_handler(signum, frame): + print "ERROR: caught overall timeout after "+str(timeout_sec)+"s\n" + clear_lock() + sys.exit(2) + raise Alarm + +def clear_lock(): + try: + os.unlink(LOCKFILE) + except Exception,e: + print "could not remove lockfile:"+str(e) + +def env_setup(): + os.environ['X509_USER_CERT']='/root/.globus/slsprobe-cert.pem' + os.environ['X509_USER_KEY']='/root/.globus/slsprobe-key.pem' + os.environ['X509_USER_PROXY']='/root/.globus/slsprobe.proxy' + os.environ['KRB5CCNAME']='FILE:/dev/null' + os.environ['PATH']=os.environ['PATH']+":/opt/globus/bin/" + +def get_proxy(): + dev_null = open('/dev/null', 'rw') + (proxyfd,proxy)=tempfile.mkstemp(prefix='x509_xrdfed_',suffix='.pem') + os.close(proxyfd) + os.environ['X509_USER_PROXY']=proxy + ret = subprocess.call(['grid-proxy-init','-pwstdin'],stdin=dev_null,) + if ret > 0: + raise Exception("Cannot get X509 proxy") + dev_null.close() + +def cleanup_proxy(): + try: + os.unlink(os.environ['X509_USER_PROXY']) + except Exception,e: + print "could not remove proxy file:"+str(e) + +def try_lock(): + ret = 
subprocess.call(['lockfile','-5','-r2',LOCKFILE]) + if ret > 0: + print "could not create lockfile" + return False + return True + +def prepare_dictionary(servicename): + dic={'serviceName':servicename} + return dic +def dnsalias_to_nodes(redirector): + (host,port) = redirector.split(':') + all_hosts = [] + data=socket.getaddrinfo(host,port,0, 0, socket.SOL_TCP ) + for addr in data: + (family, socktype, proto, canonname, sockaddr) = addr + (hostname, aliaslist, ipaddrlist) = socket.gethostbyaddr(sockaddr[0]) + if not hostname in all_hosts: + all_hosts.append(hostname) + return all_hosts + +def xrdcp_test(redirector,file): + (errtext,out,err,elapsed) = run_xrd_commands("xrdcp", + ["-d","2", + "-f", + "-DIReadCacheSize","0", + "-DIRedirCntTimeout","180", + "root://"+redirector+'/'+file, + '/dev/null']) + return (errtext,err,elapsed) + +def xrd_info(redirector): + version = "(unknown)" + (errtext,out,err,elapsed) = run_xrd_commands("xrd", + [redirector, + "query","1", # 1:kXR_QStats + "a"]) # a_ll stats + if not errtext: + try: + dom = xml.dom.minidom.parseString(out) + root_node = dom.documentElement + if root_node.tagName == 'statistics': + v_attr = root_node.getAttributeNode('ver') + version = v_attr.nodeValue + except Exception,e: + errtext = "ERROR: cannot parse answer:"+str(e) + return (errtext,version,out) + +def run_xrd_commands(cmd,args): + dev_null = open('/dev/null', 'r') + errtxt = '' + elapsed = -1.0 + xrd_args = [ 'perl','-e',"alarm 180; exec @ARGV", cmd, # one-line wrapper that *actually* kills the command + "-DIConnectTimeout","30", + "-DITransactionTimeout","60", + "-DIRequestTimeout","60" ] + args + try: + start = time.time() + proc = subprocess.Popen(xrd_args, + stdin=dev_null, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + (out, err) = proc.communicate() + ret = proc.returncode + elapsed = (time.time() - start) + err_redir_index = err.rfind('Received redirection to') + err_index3010 = err.rfind('(error code: 3010') # (permission denied) may 
be sort-of-OK - we are talking to final storage already - UK + err_index3005 = err.rfind('(error code: 3005') # (no user mapping) - INFN + if err_redir_index >= 0 and (err_index3010 >= 0 or err_index3005 >= 0): + errtxt = '' + else: + if(ret > 0): + errtxt = "client-side error - exit code "+str(ret)+"\n" + err_index = err.rfind('Last server error') + if err_index >= 0: + err_end_index=err.find("\n",err_index) + errtxt = errtxt + err[err_index:err_end_index] + except Exception,e: + errtext = errtxt + "Exception: "+str(e) + dev_null.close() + + return (errtxt,out,err,elapsed) + +def test_redirector(servicename, redirector, file_below=None, file_above=None, extra_notes=""): + servicename=servicename.upper() + availability = 'available' + availinfo = '' + + # prepare the dictionary. + dicci = prepare_dictionary(servicename) + notes_text = "Redirector:"+redirector + + # run the functional tests - first some simple check to get the version, if OK look for files + (err_info,version,dump_info) = xrd_info(redirector) + if(err_info): + print('error_info_true') + availability = 'unavailable' + availinfo=availinfo+" Error getting info from redirector "+err_info + dicci["xrdcp_below_time"] = 0 + #dicci["status"] = "unavailable" + + else: + availinfo="Version check: "+version + if (file_below): + notes_text = notes_text + "File 'below': " + file_below + (err_below,dump_below,elapsed_below) = xrdcp_test(redirector, file_below) + if err_below: + availability = 'degraded' + availinfo=availinfo+" Error below redirector "+err_below + dump_sane = re.sub('---*','__',dump_below) + c = "Detailed output for file BELOW "+redirector+":"+file_below+" "+err_below+" "+dump_sane + #dicci['comment'] = c + else: + availinfo=availinfo+" File below: OK " + dicci['xrdcp_below_time'] = str(elapsed_below) + else: + availinfo=availinfo+" File below: not tested." 
+ if(file_above): + notes_text = notes_text + "File 'elsewhere': " + file_above + (err_above,dump_above,elapsed_above) = xrdcp_test(redirector, file_above) + if err_above : + #We've changed availability from number to string so this below won't work; Marian commented out on 2015-11-06 + #availability = availability * 0.8 # less important if some remote site is failing.. + availinfo=availinfo+" Error above redirector "+err_above + # sanitize the raw output in order to not trigger XML errors.. in a comment. + dump_sane = re.sub('---*','__',dump_above) + #c = "Detailed output for file ABOVE "+redirector+":"+file_above+"\n"+ + #err_above+"\n" + #+dump_sane + #dicci = {**dicci, **{'comment': c}} + #serviceUpdate.appendChild(c) + #need_xml_link=1 + else: + availinfo=availinfo+" File above: OK " + #nValue = doc.createElement("numericvalue") + #nValue.setAttribute("name", "xrdcp_above_time") + #nValue.setAttribute("desc", "Time to copy a file elsewhere in the federation") + #nValue.appendChild(doc.createTextNode(str(elapsed_above))) + dicci['xrdcp_above_time'] = str(elapsed_above) + #data.appendChild(nValue) + else: + availinfo=availinfo+" File above: not tested." + + # save functional test info to XML + #if need_xml_link: + # myhostname = socket.gethostname() + # notes_text = notes_text + "Details for failed test: http://" + myhostname + "/aaa-probe/" + servicename + ".xml
\n" + "Details for recently failed test : http://vocms039.cern.ch/aaa-probe/err/
\n" + availinfo = availinfo + " " + notes_text + dicci['status']= str(availability) + #dicci['availabilityinfo']=availinfo + return dicci + + +def main(): + debug = 0 + atexit.register(clear_lock) + if len(sys.argv) > 1: + if sys.argv[1] == '-d': + debug=1 + if not try_lock(): + sys.exit(1) + if not os.path.exists(html_dir): + os.makedirs(html_dir) + env_setup() + # get a proxy cert + # get_proxy() + + timeout_sec = 10 * 60 # limit overall runtime to 10min + signal.signal(signal.SIGALRM, alarm_handler) + + ATLASLINK="%BR%Monitoring:%BR%\n http://atl-prod07.slac.stanford.edu:8080/display?page=xrd_report/aggregated/total_xrootd_lgn %BR%\n http://dashb-atlas-xrootd-transfers.cern.ch/ui %BR%\nhttp://dashb-atlas-ssb.cern.ch/dashboard/request.py/siteview#currentView=FAX+redirectors&highlight=false %BR%\n" + CMSLINK="%BR%Monitoring:%BR%\n http://xrootd.t2.ucsd.edu/dashboard/ %BR%\n http://dashb-cms-xrootd-transfers.cern.ch/ui %BR%\n" + FILEABOVE="/store/mc/SAM/GenericTTbar/AODSIM/CMSSW_9_2_6_91X_mcRun1_realistic_v2-v1/00000/A64CCCF2-5C76-E711-B359-0CC47A78A3F8.root" + FILEBELOW="/store/mc/SAM/GenericTTbar/AODSIM/CMSSW_9_2_6_91X_mcRun1_realistic_v2-v1/00000/A64CCCF2-5C76-E711-B359-0CC47A78A3F8.root" + + services = { + "XRDFED_CMS-GLOBAL01-NEW":{'redirector':'cms-xrd-global01.cern.ch:1094', + 'file_below': FILEABOVE, + 'file_above': FILEBELOW, + 'extra_notes':CMSLINK}, + + "XRDFED_CMS-GLOBAL02-NEW":{'redirector':'cms-xrd-global02.cern.ch:1094', + 'file_below': FILEABOVE, + 'file_above': FILEBELOW, + 'extra_notes':CMSLINK}, + + "XRDFED_CMS-US-FNAL":{'redirector':'cmsxrootd2.fnal.gov:1094', + 'file_below': FILEABOVE, + 'file_above': FILEBELOW, + 'extra_notes':CMSLINK}, + + "XRDFED_CMS-US-UNL":{'redirector':'xrootd.unl.edu:1094', + 'file_below': FILEABOVE, + 'file_above': FILEBELOW, + 'extra_notes':CMSLINK}, + + "XRDFED_CMS-EU-BARI":{'redirector':'xrootd.ba.infn.it:1094', + 'file_below': FILEBELOW, + 'file_above': FILEABOVE, + 'extra_notes':CMSLINK}, + + 
"XRDFED_CMS-EU-LLR":{'redirector':'llrxrd-redir.in2p3.fr:1094', + 'file_below': FILEBELOW, + 'file_above': FILEABOVE, + 'extra_notes':CMSLINK}, + + "XRDFED_CMS-EU-PISA":{'redirector':'xrootd-redic.pi.infn.it:1094', + 'file_below': FILEBELOW, + 'file_above': FILEABOVE, + 'extra_notes':CMSLINK}, + + "XRDFED_CMS-GLOBAL":{'redirector':'cms-xrd-global.cern.ch:1094', + 'file_below': FILEABOVE, + 'file_above': FILEBELOW, + 'extra_notes':CMSLINK}, + + "XRDFED_CMS-US":{'redirector':'cmsxrootd.fnal.gov:1094', + 'file_below': FILEABOVE, + 'file_above': FILEBELOW, + 'extra_notes':CMSLINK}, + + "XRDFED_CMS-EU":{'redirector':'xrootd-cms.infn.it:1094', + 'file_below': FILEBELOW, + 'file_above': FILEABOVE, + 'extra_notes':CMSLINK}, + + "XRDFED_CMS-EU-IPv6":{ 'redirector':'xrootd-cms-redir-01.cr.cnaf.infn.it:1094', + 'file_below': FILEBELOW, + 'file_above': FILEABOVE, + 'extra_notes':CMSLINK}, + "XRDFED_CMS-TRANSIT":{'redirector':'cms-xrd-transit.cern.ch:1094', + 'file_below': FILEBELOW, + 'file_above': FILEABOVE, + 'extra_notes':CMSLINK}, + + "XRDFED_CMS-TRANSIT01":{'redirector':'vocms031.cern.ch:1094', + 'file_below': FILEBELOW, + 'file_above': FILEABOVE, + 'extra_notes':CMSLINK}, + + "XRDFED_CMS-TRANSIT02":{'redirector':'vocms032.cern.ch:1094', + 'file_below': FILEBELOW, + 'file_above': FILEABOVE, + 'extra_notes':CMSLINK}, + + } + signal.alarm(timeout_sec) + try: + with open('KIBANA_PROBES.js', 'w') as outfile: + for xrd in services: + services[xrd].update(servicename=xrd) + if debug: + dic = test_redirector(** services[xrd]) + json.dump(dic, outfile) + outfile.write('\n') + else: + t = threading.Thread(target=test_redirector, + kwargs = services[xrd]) # read: "run a thread with the test function and all the parameters above as arguments" + t.start() + + + except Alarm: + print "ERROR: caught overall timeout after "+str(timeout_sec)+"s\n" + clear_lock() + sys.exit(2) + signal.alarm(0) + # not cleaning up the proxy files (are shared via the ENV, and we don't want an extra thread 
to just remove that file, or wait for the individual tests to finish... + +if __name__ == '__main__': + main() From 170de026ca2e62c1d1ab3e0834f931b39bcf094a Mon Sep 17 00:00:00 2001 From: Oscar Fernando Garzon Miguez Date: Wed, 17 Jun 2020 16:58:22 +0200 Subject: [PATCH 04/21] File_mismatch_WS.py Modified for lxplus --- consistency_check/File_mismatch_WS.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/consistency_check/File_mismatch_WS.py b/consistency_check/File_mismatch_WS.py index e304c81..9ddceb0 100644 --- a/consistency_check/File_mismatch_WS.py +++ b/consistency_check/File_mismatch_WS.py @@ -86,6 +86,8 @@ def get_datasets(web): def main(sesion, web): datasets = get_datasets(web) + print(datasets) + print(len(datasets)) invalidate_in_phedex = [] invalidate_in_dbs = [] dataset_empty_dbs = [] From 839869f872b1b939d1eeda301f918c87789743f5 Mon Sep 17 00:00:00 2001 From: FernandoGarzon <61252446+FernandoGarzon@users.noreply.github.com> Date: Wed, 17 Jun 2020 11:41:06 -0500 Subject: [PATCH 05/21] Update README.md --- consistency_check/README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/consistency_check/README.md b/consistency_check/README.md index 9804fa2..6c910c2 100644 --- a/consistency_check/README.md +++ b/consistency_check/README.md @@ -18,3 +18,15 @@ twiki: https://twiki.cern.ch/twiki/bin/view/CMSPublic/CompOpsTransferTeamConsist ``` ~/TransferTeam/consistency_check/BDV/BDVParser.sh --verbose --db ~/param/DBParam:Prod/Meric --node T2_CH_CERN --day 20 --output output_dir ``` +#### How to run File_mismatch_WS.py + +* Log in lxplus7 vm as usual and create a proxy (See https://github.com/CMSCompOps/TransferTeam/blob/master/scripts/setup.sh). +* Create and activate a python3 environment. 
Update the pandas library and then run the script: +``` +$ python3 -m venv env +$ source ./env/bin/activate +$ pip3 install --upgrade pandas +$ python3 File_mismatch_WS.py +``` + +* You will get four files: invalidate_in_dbs.txt, invalidate_in_phedex.txt, dataset_empty_dbs.txt and dataset_empty_phedex.txt. Proceed as necessary. From e681222e182ab3972deb1eff55f3e723b1bc13e3 Mon Sep 17 00:00:00 2001 From: Oscar Fernando Garzon Miguez Date: Wed, 17 Jun 2020 18:44:05 +0200 Subject: [PATCH 06/21] File_mismatch_WS.py ready to run on lxplus7 --- consistency_check/File_mismatch_WS.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/consistency_check/File_mismatch_WS.py b/consistency_check/File_mismatch_WS.py index 9ddceb0..926fe8e 100644 --- a/consistency_check/File_mismatch_WS.py +++ b/consistency_check/File_mismatch_WS.py @@ -17,7 +17,7 @@ class x509RESTSession(object): def __init__(self): self._session = requests.Session() home = os.path.expanduser('~/') - # $ openssl rsa -in userkey_protected.pem -out userkey.pem; chmod 0600 userkey.pem + os.system("openssl rsa -in "+home+".globus/userkey.pem -out "+home+".globus/userkey2.pem; chmod 0600 "+home+".globus/userkey2.pem") self._session.cert = (home+'.globus/usercert.pem', home+'.globus/userkey2.pem') self._session.verify = os.getenv('X509_CERT_DIR') @@ -86,6 +86,8 @@ def get_datasets(web): def main(sesion, web): datasets = get_datasets(web) - print(datasets) - print(len(datasets)) invalidate_in_phedex = [] invalidate_in_dbs = [] dataset_empty_dbs = [] From b9433524f6b775369c1c173cd811c4cd025b7484 Mon Sep 17 00:00:00 2001 From: dmielaikaite Date: Thu, 18 Jun 2020 17:28:04 +0200 Subject: [PATCH 07/21] XRDFED-kibana-probe up to date as in vocms039 --- .../XRDFED-kibana-probe_to_JSON-Copy2.py | 344 ++++++++++++++++++ 1 file changed, 344 insertions(+) create mode 100644 AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py diff --git a/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py 
b/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py new file mode 100644 index 0000000..2c7d236 --- /dev/null +++ b/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py @@ -0,0 +1,344 @@ +#!/usr/bin/python +# functional probe and SLS extractor for the "federation" xroot services +# highlights: +# - stateless (i.e. run from cron whenever needed) +# - will try to prevent parallel runs via lockfile +# - multithreaded, one thread per service to be tested +# - overall runtime cap at 10min +# - could extract some statistics from xroot directly, but these are ever-increasing counters +# Problems: +# - need to update the code whenever a service is addded/deleted/changed +# - uses "random" files on various Xroot services all over the world, these are (for now) the same as used by the experiments but these might change.. + +import xml.dom.minidom +import subprocess +import os +import sys +import signal +import re +import time +import Lemon.XMLAPI +import socket +import atexit +import threading +import tempfile +import json +import shutil + +html_dir = '/root/ogarzonm/' # will create per-service json files here + +LOCKFILE='/var/lock/subsys/xrdfed-kibana-probe' + +class Alarm(Exception): + pass + +def alarm_handler(signum, frame): + print "ERROR: caught overall timeout after "+str(timeout_sec)+"s\n" + clear_lock() + sys.exit(2) + raise Alarm + +def clear_lock(): + try: + os.unlink(LOCKFILE) + except Exception,e: + print "could not remove lockfile:"+str(e) + +def env_setup(): + os.environ['X509_USER_CERT']='/root/.globus/slsprobe-cert.pem' + os.environ['X509_USER_KEY']='/root/.globus/slsprobe-key.pem' + os.environ['X509_USER_PROXY']='/root/.globus/slsprobe.proxy' + os.environ['KRB5CCNAME']='FILE:/dev/null' + os.environ['PATH']=os.environ['PATH']+":/opt/globus/bin/" + +def get_proxy(): + dev_null = open('/dev/null', 'rw') + (proxyfd,proxy)=tempfile.mkstemp(prefix='x509_xrdfed_',suffix='.pem') + os.close(proxyfd) + os.environ['X509_USER_PROXY']=proxy + ret = 
subprocess.call(['grid-proxy-init','-pwstdin'],stdin=dev_null,) + if ret > 0: + raise Exception("Cannot get X509 proxy") + dev_null.close() + +def cleanup_proxy(): + try: + os.unlink(os.environ['X509_USER_PROXY']) + except Exception,e: + print "could not remove proxy file:"+str(e) + +def try_lock(): + ret = subprocess.call(['lockfile','-5','-r2',LOCKFILE]) + if ret > 0: + print "could not create lockfile" + return False + return True + +def prepare_dictionary(servicename): + dic={'serviceName':servicename} + return dic +def dnsalias_to_nodes(redirector): + (host,port) = redirector.split(':') + all_hosts = [] + data=socket.getaddrinfo(host,port,0, 0, socket.SOL_TCP ) + for addr in data: + (family, socktype, proto, canonname, sockaddr) = addr + (hostname, aliaslist, ipaddrlist) = socket.gethostbyaddr(sockaddr[0]) + if not hostname in all_hosts: + all_hosts.append(hostname) + return all_hosts + +def xrdcp_test(redirector,file): + (errtext,out,err,elapsed) = run_xrd_commands("xrdcp", + ["-d","2", + "-f", + "-DIReadCacheSize","0", + "-DIRedirCntTimeout","180", + "root://"+redirector+'/'+file, + '/dev/null']) + return (errtext,err,elapsed) + +def xrd_info(redirector): + version = "(unknown)" + (errtext,out,err,elapsed) = run_xrd_commands("xrd", + [redirector, + "query","1", # 1:kXR_QStats + "a"]) # a_ll stats + if not errtext: + try: + dom = xml.dom.minidom.parseString(out) + root_node = dom.documentElement + if root_node.tagName == 'statistics': + v_attr = root_node.getAttributeNode('ver') + version = v_attr.nodeValue + except Exception,e: + errtext = "ERROR: cannot parse answer:"+str(e) + return (errtext,version,out) + +def run_xrd_commands(cmd,args): + dev_null = open('/dev/null', 'r') + errtxt = '' + elapsed = -1.0 + xrd_args = [ 'perl','-e',"alarm 180; exec @ARGV", cmd, # one-line wrapper that *actually* kills the command + "-DIConnectTimeout","30", + "-DITransactionTimeout","60", + "-DIRequestTimeout","60" ] + args + err = '' + out = '' + try: + ran_try = True + 
start = time.time() + proc = subprocess.Popen(xrd_args, + stdin=dev_null, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + (out, err) = proc.communicate() + + ret = proc.returncode + elapsed = (time.time() - start) + err_redir_index = err.rfind('Received redirection to') + err_index3010 = err.rfind('(error code: 3010') # (permission denied) may be sort-of-OK - we are talking to final storage already - UK + err_index3005 = err.rfind('(error code: 3005') # (no user mapping) - INFN + if err_redir_index >= 0 and (err_index3010 >= 0 or err_index3005 >= 0): + errtxt = '' + else: + if(ret > 0): + errtxt = "client-side error - exit code "+str(ret)+"\n" + err_index = err.rfind('Last server error') + if err_index >= 0: + err_end_index=err.find("\n",err_index) + errtxt = errtxt + err[err_index:err_end_index] + except Exception,e: + errtext = errtxt + "Exception: "+str(e) + dev_null.close() + return (errtxt,out,err,elapsed) + +def test_redirector(servicename, redirector, file_below=None, file_above=None, extra_notes=""): + servicename=servicename.upper() + availability = 'available' + availinfo = '' + + # prepare the dictionary. 
+ dicci = prepare_dictionary(servicename) + notes_text = "Redirector:"+redirector + + # run the functional tests - first some simple check to get the version, if OK look for files + (err_info,version,dump_info) = xrd_info(redirector) + if(err_info): + + availability = 'unavailable' + availinfo=availinfo+" Error getting info from redirector "+err_info + dicci["xrdcp_below_time"] = 0 + #dicci["status"] = "unavailable" + + else: + availinfo="Version check: "+version + if (file_below): + notes_text = notes_text + "File 'below': " + file_below + (err_below,dump_below,elapsed_below) = xrdcp_test(redirector, file_below) + if err_below: + availability = 'degraded' + availinfo=availinfo+" Error below redirector "+err_below + dump_sane = re.sub('---*','__',dump_below) + c = "Detailed output for file BELOW "+redirector+":"+file_below+" "+err_below+" "+dump_sane + #dicci['comment'] = c + else: + availinfo=availinfo+" File below: OK " + dicci['xrdcp_below_time'] = str(elapsed_below) + else: + availinfo=availinfo+" File below: not tested." + if(file_above): + notes_text = notes_text + "File 'elsewhere': " + file_above + (err_above,dump_above,elapsed_above) = xrdcp_test(redirector, file_above) + if err_above : + #We've changed availability from number to string so this below won't work; Marian commented out on 2015-11-06 + #availability = availability * 0.8 # less important if some remote site is failing.. + availinfo=availinfo+" Error above redirector "+err_above + # sanitize the raw output in order to not trigger XML errors.. in a comment. 
+ dump_sane = re.sub('---*','__',dump_above) + #c = "Detailed output for file ABOVE "+redirector+":"+file_above+"\n"+ + #err_above+"\n" + #+dump_sane + #dicci = {**dicci, **{'comment': c}} + #serviceUpdate.appendChild(c) + #need_xml_link=1 + else: + availinfo=availinfo+" File above: OK " + #nValue = doc.createElement("numericvalue") + #nValue.setAttribute("name", "xrdcp_above_time") + #nValue.setAttribute("desc", "Time to copy a file elsewhere in the federation") + #nValue.appendChild(doc.createTextNode(str(elapsed_above))) + dicci['xrdcp_above_time'] = str(elapsed_above) + #data.appendChild(nValue) + else: + availinfo=availinfo+" File above: not tested." + + # save functional test info to XML + #if need_xml_link: + # myhostname = socket.gethostname() + # notes_text = notes_text + "Details for failed test: http://" + myhostname + "/aaa-probe/" + servicename + ".xml
\n" + "Details for recently failed test : http://vocms039.cern.ch/aaa-probe/err/
\n" + availinfo = availinfo + " " + notes_text + dicci['status']= str(availability) + #dicci['availabilityinfo']=availinfo + with open(html_dir +'KIBANA_PROBES.json', 'a') as f: + json.dump(dicci, f) + f.write('\n') + + +def main(): + debug = 0 + atexit.register(clear_lock) + if len(sys.argv) > 1: + if sys.argv[1] == '-d': + debug=1 + if not try_lock(): + sys.exit(1) + if not os.path.exists(html_dir): + os.makedirs(html_dir) + env_setup() + # get a proxy cert + # get_proxy() + + timeout_sec = 10 * 60 # limit overall runtime to 10min + signal.signal(signal.SIGALRM, alarm_handler) + + ATLASLINK="%BR%Monitoring:%BR%\n http://atl-prod07.slac.stanford.edu:8080/display?page=xrd_report/aggregated/total_xrootd_lgn %BR%\n http://dashb-atlas-xrootd-transfers.cern.ch/ui %BR%\nhttp://dashb-atlas-ssb.cern.ch/dashboard/request.py/siteview#currentView=FAX+redirectors&highlight=false %BR%\n" + CMSLINK="%BR%Monitoring:%BR%\n http://xrootd.t2.ucsd.edu/dashboard/ %BR%\n http://dashb-cms-xrootd-transfers.cern.ch/ui %BR%\n" + FILEABOVE="/store/mc/SAM/GenericTTbar/AODSIM/CMSSW_9_2_6_91X_mcRun1_realistic_v2-v1/00000/A64CCCF2-5C76-E711-B359-0CC47A78A3F8.root" + FILEBELOW="/store/mc/SAM/GenericTTbar/AODSIM/CMSSW_9_2_6_91X_mcRun1_realistic_v2-v1/00000/A64CCCF2-5C76-E711-B359-0CC47A78A3F8.root" + + services = { + "XRDFED_CMS-GLOBAL01-NEW":{'redirector':'cms-xrd-global01.cern.ch:1094', + 'file_below': FILEABOVE, + 'file_above': FILEBELOW, + 'extra_notes':CMSLINK}, + + "XRDFED_CMS-GLOBAL02-NEW":{'redirector':'cms-xrd-global02.cern.ch:1094', + 'file_below': FILEABOVE, + 'file_above': FILEBELOW, + 'extra_notes':CMSLINK}, + + "XRDFED_CMS-US-FNAL":{'redirector':'cmsxrootd2.fnal.gov:1094', + 'file_below': FILEABOVE, + 'file_above': FILEBELOW, + 'extra_notes':CMSLINK}, + + "XRDFED_CMS-US-UNL":{'redirector':'xrootd.unl.edu:1094', + 'file_below': FILEABOVE, + 'file_above': FILEBELOW, + 'extra_notes':CMSLINK}, + + "XRDFED_CMS-EU-BARI":{'redirector':'xrootd.ba.infn.it:1094', + 'file_below': FILEBELOW, + 
'file_above': FILEABOVE, + 'extra_notes':CMSLINK}, + + "XRDFED_CMS-EU-LLR":{'redirector':'llrxrd-redir.in2p3.fr:1094', + 'file_below': FILEBELOW, + 'file_above': FILEABOVE, + 'extra_notes':CMSLINK}, + + "XRDFED_CMS-EU-PISA":{'redirector':'xrootd-redic.pi.infn.it:1094', + 'file_below': FILEBELOW, + 'file_above': FILEABOVE, + 'extra_notes':CMSLINK}, + + "XRDFED_CMS-GLOBAL":{'redirector':'cms-xrd-global.cern.ch:1094', + 'file_below': FILEABOVE, + 'file_above': FILEBELOW, + 'extra_notes':CMSLINK}, + + "XRDFED_CMS-US":{'redirector':'cmsxrootd.fnal.gov:1094', + 'file_below': FILEABOVE, + 'file_above': FILEBELOW, + 'extra_notes':CMSLINK}, + + "XRDFED_CMS-EU":{'redirector':'xrootd-cms.infn.it:1094', + 'file_below': FILEBELOW, + 'file_above': FILEABOVE, + 'extra_notes':CMSLINK}, + + "XRDFED_CMS-EU-IPv6":{ 'redirector':'xrootd-cms-redir-01.cr.cnaf.infn.it:1094', + 'file_below': FILEBELOW, + 'file_above': FILEABOVE, + 'extra_notes':CMSLINK}, + "XRDFED_CMS-TRANSIT":{'redirector':'cms-xrd-transit.cern.ch:1094', + 'file_below': FILEBELOW, + 'file_above': FILEABOVE, + 'extra_notes':CMSLINK}, + + "XRDFED_CMS-TRANSIT01":{'redirector':'vocms031.cern.ch:1094', + 'file_below': FILEBELOW, + 'file_above': FILEABOVE, + 'extra_notes':CMSLINK}, + + "XRDFED_CMS-TRANSIT02":{'redirector':'vocms032.cern.ch:1094', + 'file_below': FILEBELOW, + 'file_above': FILEABOVE, + 'extra_notes':CMSLINK}, + + } + signal.alarm(timeout_sec) + #shutil.copyfile(html_dir+'KIBANA_PROBES.json', html_dir+'KIBANA_PROBES_2.json') + os.remove(html_dir+'KIBANA_PROBES.json') + #shutil.copyfile(html_dir+'KIBANA_PROBES.json', html_dir+'KIBANA_PROBES_2.json') + try: + + for xrd in services: + services[xrd].update(servicename=xrd) + if debug: + test_redirector(** services[xrd]) + else: + t = threading.Thread(target=test_redirector, kwargs = services[xrd]) # read: "run a thread with the test function and all the parameters above as arguments" + t.start() + except Alarm: + print "ERROR: caught overall timeout after 
"+str(timeout_sec)+"s\n" + clear_lock() + sys.exit(2) + signal.alarm(0) + #shutil.copyfile(html_dir+'KIBANA_PROBES.json', html_dir+'KIBANA_PROBES_2.json') + # not cleaning up the proxy files (are shared via the ENV, and we don't want an extra thread to just remove that file, or wait for the individual tests to finish... + +if __name__ == '__main__': + main() + From 0bf44597bad9d9161acd49537d63f86429e316fb Mon Sep 17 00:00:00 2001 From: Fernando Garzon Date: Thu, 18 Jun 2020 10:53:06 -0500 Subject: [PATCH 08/21] Availinfo added when errors occur --- AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py b/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py index 2c7d236..008047a 100644 --- a/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py +++ b/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py @@ -157,6 +157,7 @@ def test_redirector(servicename, redirector, file_below=None, file_above=None, e servicename=servicename.upper() availability = 'available' availinfo = '' + c = 'No comment' # prepare the dictionary. dicci = prepare_dictionary(servicename) @@ -196,9 +197,9 @@ def test_redirector(servicename, redirector, file_below=None, file_above=None, e availinfo=availinfo+" Error above redirector "+err_above # sanitize the raw output in order to not trigger XML errors.. in a comment. dump_sane = re.sub('---*','__',dump_above) - #c = "Detailed output for file ABOVE "+redirector+":"+file_above+"\n"+ - #err_above+"\n" - #+dump_sane + c = "Detailed output for file ABOVE "+redirector+":"+file_above+" "+ + err_above+" " + +dump_sane #dicci = {**dicci, **{'comment': c}} #serviceUpdate.appendChild(c) #need_xml_link=1 @@ -219,6 +220,9 @@ def test_redirector(servicename, redirector, file_below=None, file_above=None, e # notes_text = notes_text + "Details for failed test: http://" + myhostname + "/aaa-probe/" + servicename + ".xml
\n" + "Details for recently failed test : http://vocms039.cern.ch/aaa-probe/err/
\n" availinfo = availinfo + " " + notes_text dicci['status']= str(availability) + if availability == 'unavailable' or if availability == 'degraded': + dicci ['availInfo'] = availinfo + dicci ['Comment'] = c #dicci['availabilityinfo']=availinfo with open(html_dir +'KIBANA_PROBES.json', 'a') as f: json.dump(dicci, f) From dfae6bf8905858cde5a726990dc78d41fa26474a Mon Sep 17 00:00:00 2001 From: dmielaikaite Date: Thu, 18 Jun 2020 18:05:23 +0200 Subject: [PATCH 09/21] Updated availinfo --- AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py b/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py index 008047a..c66d3f6 100644 --- a/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py +++ b/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py @@ -197,9 +197,7 @@ def test_redirector(servicename, redirector, file_below=None, file_above=None, e availinfo=availinfo+" Error above redirector "+err_above # sanitize the raw output in order to not trigger XML errors.. in a comment. dump_sane = re.sub('---*','__',dump_above) - c = "Detailed output for file ABOVE "+redirector+":"+file_above+" "+ - err_above+" " - +dump_sane + c = "Detailed output for file ABOVE "+redirector+":"+file_above+" "+err_above+" "+dump_sane #dicci = {**dicci, **{'comment': c}} #serviceUpdate.appendChild(c) #need_xml_link=1 @@ -220,7 +218,7 @@ def test_redirector(servicename, redirector, file_below=None, file_above=None, e # notes_text = notes_text + "Details for failed test: http://" + myhostname + "/aaa-probe/" + servicename + ".xml
\n" + "Details for recently failed test : http://vocms039.cern.ch/aaa-probe/err/
\n" availinfo = availinfo + " " + notes_text dicci['status']= str(availability) - if availability == 'unavailable' or if availability == 'degraded': + if availability == 'unavailable' or availability == 'degraded': dicci ['availInfo'] = availinfo dicci ['Comment'] = c #dicci['availabilityinfo']=availinfo From d1b0074b2eec50f196ed0306f207c3d93ca24e63 Mon Sep 17 00:00:00 2001 From: Fernando Garzon Date: Thu, 18 Jun 2020 11:14:22 -0500 Subject: [PATCH 10/21] Updated Probe --- AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py b/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py index c66d3f6..ed366a5 100644 --- a/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py +++ b/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py @@ -221,6 +221,8 @@ def test_redirector(servicename, redirector, file_below=None, file_above=None, e if availability == 'unavailable' or availability == 'degraded': dicci ['availInfo'] = availinfo dicci ['Comment'] = c + else: + dicci['availInfo'] = "Version check: "+ version #dicci['availabilityinfo']=availinfo with open(html_dir +'KIBANA_PROBES.json', 'a') as f: json.dump(dicci, f) From 0f8ac7eb336d1a7577d1c7dd3b21eacd13339195 Mon Sep 17 00:00:00 2001 From: Fernando Garzon Date: Thu, 18 Jun 2020 11:27:47 -0500 Subject: [PATCH 11/21] Host name added --- AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py b/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py index ed366a5..db084a4 100644 --- a/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py +++ b/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py @@ -161,7 +161,8 @@ def test_redirector(servicename, redirector, file_below=None, file_above=None, e # prepare the dictionary. 
dicci = prepare_dictionary(servicename) - notes_text = "Redirector:"+redirector + notes_text = "Redirector:"+redirector + dicci['Host Name'] = redirector # run the functional tests - first some simple check to get the version, if OK look for files (err_info,version,dump_info) = xrd_info(redirector) From 5b54a28533056c211b77e5e68ea38cc4a998d6b6 Mon Sep 17 00:00:00 2001 From: Fernando Garzon Date: Thu, 18 Jun 2020 11:41:03 -0500 Subject: [PATCH 12/21] Update --- AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py b/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py index db084a4..01180c8 100644 --- a/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py +++ b/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py @@ -75,7 +75,7 @@ def try_lock(): return True def prepare_dictionary(servicename): - dic={'serviceName':servicename} + dic={'service':servicename} return dic def dnsalias_to_nodes(redirector): (host,port) = redirector.split(':') @@ -161,8 +161,7 @@ def test_redirector(servicename, redirector, file_below=None, file_above=None, e # prepare the dictionary. 
dicci = prepare_dictionary(servicename) - notes_text = "Redirector:"+redirector - dicci['Host Name'] = redirector + dicci['Host'] = redirector.split(':')[0] # run the functional tests - first some simple check to get the version, if OK look for files (err_info,version,dump_info) = xrd_info(redirector) @@ -223,7 +222,7 @@ def test_redirector(servicename, redirector, file_below=None, file_above=None, e dicci ['availInfo'] = availinfo dicci ['Comment'] = c else: - dicci['availInfo'] = "Version check: "+ version + dicci['Version'] = version #dicci['availabilityinfo']=availinfo with open(html_dir +'KIBANA_PROBES.json', 'a') as f: json.dump(dicci, f) From 7aed0d27be6e43e5a878b36abc4cb398d207e700 Mon Sep 17 00:00:00 2001 From: Fernando Garzon Date: Thu, 18 Jun 2020 11:43:07 -0500 Subject: [PATCH 13/21] Update --- AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py b/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py index 01180c8..676a67c 100644 --- a/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py +++ b/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py @@ -155,6 +155,7 @@ def run_xrd_commands(cmd,args): def test_redirector(servicename, redirector, file_below=None, file_above=None, extra_notes=""): servicename=servicename.upper() + notes_text = "Redirector:"+redirector availability = 'available' availinfo = '' c = 'No comment' From 80a2e2b34f2d9bac6849e7bed0f66cd67e497911 Mon Sep 17 00:00:00 2001 From: Fernando Garzon Date: Thu, 25 Jun 2020 16:24:00 -0500 Subject: [PATCH 14/21] Script to verify files in DBS added --- .../Check_dbs_invalidations_file.py | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 consistency_check/Check_dbs_invalidations_file.py diff --git a/consistency_check/Check_dbs_invalidations_file.py b/consistency_check/Check_dbs_invalidations_file.py new file mode 100644 index 0000000..52a75e0 --- /dev/null +++ 
b/consistency_check/Check_dbs_invalidations_file.py @@ -0,0 +1,30 @@ +import urllib.request as request +import urllib +import json +import requests +import os +import pandas +from pandas.io.json import json_normalize +from bs4 import BeautifulSoup as soup +import warnings +warnings.filterwarnings('ignore') + +def file_read(fname): + content_array = [] + with open(fname) as f: + for line in f: + content_array.append(line) + return content_array + +def which_to_invalidate_in_dbs(fname): + invalidate = [] + files = file_read(fname) + for line_ in range(len(files)): + os.system("xrdfs cms-xrd-global.cern.ch locate -d -m "+line_+" > aux.txt") + sites = file_read("aux.txt") + if sites[0] == "[FATAL] Redirect limit has been reached": + invalidate.append(line_) + return invalidate + +def invalidate_in_dbs(list_of_files): + From 391fd076af606f3e937f230607ca8c4774446ed6 Mon Sep 17 00:00:00 2001 From: Fernando Garzon Date: Thu, 25 Jun 2020 16:32:15 -0500 Subject: [PATCH 15/21] Test script for invalidating files in DBS --- consistency_check/Check_dbs_invalidations_file.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/consistency_check/Check_dbs_invalidations_file.py b/consistency_check/Check_dbs_invalidations_file.py index 52a75e0..222747e 100644 --- a/consistency_check/Check_dbs_invalidations_file.py +++ b/consistency_check/Check_dbs_invalidations_file.py @@ -24,7 +24,14 @@ def which_to_invalidate_in_dbs(fname): sites = file_read("aux.txt") if sites[0] == "[FATAL] Redirect limit has been reached": invalidate.append(line_) + print(invalidate) return invalidate -def invalidate_in_dbs(list_of_files): +def main(list_of_files): + print('Files will be invalidated in DBS', list_of_files) + +if __name__ == '__main__': + + list_of_files = which_to_invalidate_in_dbs('invalidate_in_dbs.txt') + main(list_of_files) From 1a7298583a14e1a34671052a10229780f791b397 Mon Sep 17 00:00:00 2001 From: Oscar Fernando Garzon Miguez Date: Thu, 25 Jun 2020 23:36:36 +0200 
Subject: [PATCH 16/21] Test_2 --- consistency_check/Check_dbs_invalidations_file.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/consistency_check/Check_dbs_invalidations_file.py b/consistency_check/Check_dbs_invalidations_file.py index 222747e..719bff5 100644 --- a/consistency_check/Check_dbs_invalidations_file.py +++ b/consistency_check/Check_dbs_invalidations_file.py @@ -1,11 +1,4 @@ -import urllib.request as request -import urllib -import json -import requests import os -import pandas -from pandas.io.json import json_normalize -from bs4 import BeautifulSoup as soup import warnings warnings.filterwarnings('ignore') From 09066a02b1e87b33bda973383e0924f05d10a527 Mon Sep 17 00:00:00 2001 From: Fernando Garzon Date: Thu, 25 Jun 2020 16:37:52 -0500 Subject: [PATCH 17/21] Test_2 --- consistency_check/Check_dbs_invalidations_file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/consistency_check/Check_dbs_invalidations_file.py b/consistency_check/Check_dbs_invalidations_file.py index 719bff5..f12c76b 100644 --- a/consistency_check/Check_dbs_invalidations_file.py +++ b/consistency_check/Check_dbs_invalidations_file.py @@ -12,7 +12,7 @@ def file_read(fname): def which_to_invalidate_in_dbs(fname): invalidate = [] files = file_read(fname) - for line_ in range(len(files)): + for line_ in files: os.system("xrdfs cms-xrd-global.cern.ch locate -d -m "+line_+" > aux.txt") sites = file_read("aux.txt") if sites[0] == "[FATAL] Redirect limit has been reached": From c220a3bc68cee42341a0df4df3cdea08fc74fd5d Mon Sep 17 00:00:00 2001 From: Oscar Fernando Garzon Miguez Date: Fri, 10 Jul 2020 22:54:40 +0200 Subject: [PATCH 18/21] Set up environment for File mismatch script --- .../Check_dbs_invalidations_file.py | 17 ++++++++++------- consistency_check/File_mismatch_WS.py | 5 +++-- consistency_check/env_set_up.sh | 13 +++++++++++++ 3 files changed, 26 insertions(+), 9 deletions(-) create mode 100644 consistency_check/env_set_up.sh diff --git 
a/consistency_check/Check_dbs_invalidations_file.py b/consistency_check/Check_dbs_invalidations_file.py index f12c76b..4b4d818 100644 --- a/consistency_check/Check_dbs_invalidations_file.py +++ b/consistency_check/Check_dbs_invalidations_file.py @@ -13,18 +13,21 @@ def which_to_invalidate_in_dbs(fname): invalidate = [] files = file_read(fname) for line_ in files: + line_ = line_[:-1] os.system("xrdfs cms-xrd-global.cern.ch locate -d -m "+line_+" > aux.txt") sites = file_read("aux.txt") - if sites[0] == "[FATAL] Redirect limit has been reached": + #:print(sites[0]) + if len(sites) == 0: invalidate.append(line_) - print(invalidate) return invalidate -def main(list_of_files): - print('Files will be invalidated in DBS', list_of_files) - +def main(list_of_files_): + with open('Invalidate_in_DBS.txt', 'w') as f: + for item in list_of_files_: + f.write("%s\n" % item) + if __name__ == '__main__': - list_of_files = which_to_invalidate_in_dbs('invalidate_in_dbs.txt') + list_of_files = which_to_invalidate_in_dbs('check_in_dbs.txt') main(list_of_files) - + print(list_of_files) diff --git a/consistency_check/File_mismatch_WS.py b/consistency_check/File_mismatch_WS.py index 926fe8e..4d4621e 100644 --- a/consistency_check/File_mismatch_WS.py +++ b/consistency_check/File_mismatch_WS.py @@ -17,7 +17,7 @@ class x509RESTSession(object): def __init__(self): self._session = requests.Session() home = os.path.expanduser('~/') - os.system("openssl rsa -in "+home+".globus/userkey.pem -out "+home+".globus/userkey2.pem; chmod 0600 "+home+".globus/userkey2.pem") + #os.system("openssl rsa -in "+home+".globus/userkey.pem -out "+home+".globus/userkey2.pem; chmod 0600 "+home+".globus/userkey2.pem") self._session.cert = (home+'.globus/usercert.pem', home+'.globus/userkey2.pem') self._session.verify = os.getenv('X509_CERT_DIR') @@ -86,6 +86,7 @@ def get_datasets(web): def main(sesion, web): datasets = get_datasets(web) + print(datasets) invalidate_in_phedex = [] invalidate_in_dbs = [] 
dataset_empty_dbs = [] @@ -114,7 +115,7 @@ def main(sesion, web): array = dbs_valid.loc[dbs_valid["in_phedex"] == False, 'File_name'].to_numpy() for i in range(len(array)): invalidate_in_dbs.append(array[i]) - with open('invalidate_in_dbs.txt', 'w') as f: + with open('check_in_dbs.txt', 'w') as f: for item in invalidate_in_dbs: f.write("%s\n" % item) with open('invalidate_in_phedex.txt', 'w') as f: diff --git a/consistency_check/env_set_up.sh b/consistency_check/env_set_up.sh new file mode 100644 index 0000000..5eea30e --- /dev/null +++ b/consistency_check/env_set_up.sh @@ -0,0 +1,13 @@ +source ~/TransferTeam/scripts/setup.sh +cd /afs/cern.ch/user/o/ogarzonm/Fork_TransferTeam/TransferTeam/consistency_check/ +python3 -m venv env +source ./env/bin/activate +pip3 install --upgrade pandas +pip3 install requests +python3 File_mismatch_WS.py +python Check_dbs_invalidations_file.py +source ~/TransferTeam/scripts/setup_in_rucio.sh +awk '{system("rucio list-file-replicas cms:"$1)}' check_in_dbs.txt > files_in_rucio_test.txt +cut -f3 -d '|' files_in_rucio_test.txt > files_in_rucio.txt +sed -i 's/ //g' files_in_rucio.txt +diff check_in_dbs.txt files_in_rucio.txt | grep /store/ > files_not_yet_in_rucio.txt From 315b0945966de19a5847041f02b8f70e601f292e Mon Sep 17 00:00:00 2001 From: dmielaikaite Date: Sat, 11 Jul 2020 00:22:54 +0200 Subject: [PATCH 19/21] Kibana probe for json files ready to be deployed --- AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py b/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py index 676a67c..88f3c8e 100644 --- a/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py +++ b/AAAOps/XfedKibana/XRDFED-kibana-probe_to_JSON-Copy2.py @@ -90,7 +90,7 @@ def dnsalias_to_nodes(redirector): def xrdcp_test(redirector,file): (errtext,out,err,elapsed) = run_xrd_commands("xrdcp", - ["-d","2", + ["-d","1", "-f", "-DIReadCacheSize","0", 
"-DIRedirCntTimeout","180", From 4edd5984f7a5b3495248ce983ed9e7c1015d4cdf Mon Sep 17 00:00:00 2001 From: FernandoGarzon Date: Sat, 11 Jul 2020 00:27:51 +0200 Subject: [PATCH 20/21] Script added to check files in rucio --- consistency_check/Check_files_in_rucio.py | 130 ++++++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 consistency_check/Check_files_in_rucio.py diff --git a/consistency_check/Check_files_in_rucio.py b/consistency_check/Check_files_in_rucio.py new file mode 100644 index 0000000..cbc68aa --- /dev/null +++ b/consistency_check/Check_files_in_rucio.py @@ -0,0 +1,130 @@ +import urllib.request as request +import urllib +import json +import requests +import os +import pandas +from pandas.io.json import json_normalize +from bs4 import BeautifulSoup as soup +#import urllib.request.urlopen as uReq +import warnings +warnings.filterwarnings('ignore') + +class x509RESTSession(object): + datasvc = "https://cmsweb.cern.ch/phedex/datasvc/json/prod" + datasvc_xml = "https://cmsweb.cern.ch/phedex/datasvc/xml/prod" + + def __init__(self): + self._session = requests.Session() + home = os.path.expanduser('~/') + #os.system("openssl rsa -in "+home+".globus/userkey.pem -out "+home+".globus/userkey2.pem; chmod 0600 "+home+".globus/userkey2.pem") + self._session.cert = (home+'.globus/usercert.pem', home+'.globus/userkey2.pem') + self._session.verify = os.getenv('X509_CERT_DIR') + + def data(self, dataset): + res = self._session.get("%s/data" % self.datasvc, params={'dataset': dataset}) + resjson = json.loads(res.content) + out = [] + for _instance in resjson["phedex"]["dbs"]: + for _dataset in _instance["dataset"]: + for _block in _dataset["block"]: + for _file in _block["file"]: + out.append( + { + "Dataset": _dataset["name"], + "File_name": _file["lfn"], + "File_checksum": _file["checksum"] + + } + ) + df = pandas.io.json.json_normalize(out) + return df + #format_dates(df, ["Time_file_was_created", "Time_block_was_created"]) + + def dbsinfo(self, 
dataset): + + res = self._session.get("https://cmsweb.cern.ch/dbs/prod/global/DBSReader/files?detail=1&dataset="+dataset) + resjson = json.loads(res.content) + out = [] + for _instance in resjson: + out.append( + { + "Dataset": _instance["dataset"], + "Is_valid": _instance["is_file_valid"], + "File_name": _instance["logical_file_name"], + "File_checksum": _instance["check_sum"], + "last_modified_by": _instance ["last_modified_by"] + + } + ) + + df = pandas.io.json.json_normalize(out) + return df + + def load_html(self, url): + uClient = request.urlopen(url) + web_site = uClient.read() + uClient.close() + page = soup(web_site, "html.parser") + return page + + def jsonmethod(self, method, **params): + return self.getjson(url=self.jsonurl.join(method), params=params) + +def get_datasets(web): + datasets = [] + for table in web.findAll('table'): + tr = table.findAll('tr') + for i in range(len(tr)): + if i > 0: + casilla = tr[i].findAll('td') + if (len(casilla[0].findAll('a')) > 0): + datasets.append(str(casilla[1].text)) + else: + datasets.append(str(casilla[0].text)) + return datasets + +def main(sesion, web): + datasets = get_datasets(web) + print(datasets) + invalidate_in_phedex = [] + invalidate_in_dbs = [] + dataset_empty_dbs = [] + dataset_empty_phedex = [] + for _dataset in datasets: + phedex = sesion.data(dataset = _dataset) + dbs = sesion.dbsinfo(dataset = _dataset) + array = [] + if dbs.empty: + pass + else: + + invalidated_in_dbs_by_unified = dbs.loc[dbs['last_modified_by'].str.contains('ogarzonm')] + invalidated_in_dbs_by_unified = invalidated_in_dbs_by_unified.loc[dbs['File_name'].str.contains('NANOAOD')] + invalidated_in_dbs_by_unified = invalidated_in_dbs_by_unified.loc[invalidated_in_dbs_by_unified['Is_valid'] == 0] + array = invalidated_in_dbs_by_unified[['File_name']].to_numpy() + print(array) + for i in range(len(array)): + invalidate_in_phedex.append(array[i]) + + + + + + with open('check_in_dbs.txt', 'w') as f: + for item in invalidate_in_dbs: + 
Subject: [PATCH 21/21] Issue with minidom.parseString() method solved in XRDFED-kibana-probe.py