diff --git a/CHANGELOG.md b/CHANGELOG.md index 84e7b18..cb4b72a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,20 @@ ### Bug fixes +## spacesavers2 v0.12.0 + +### New features + +- `spacesavers2_pdq` is now counting inodes (not files) and including links and directories (#95, @kopardev) +- "pathlib.glob" is replaced with "os.scandir" for speedy folder traversing +- `--quite` option added to `spacesavers2_pdq` and `spacesavers2_catalog` to suppress progress bar output when running non-interactively eg. as a cronjob. This reduces size of .err file. + +### Bug fixes + +- `spacesavers2_pdq` now does NOT ignore links and folders (#93, @kopardev) +- `redirect` correctly captures intermediate non-zero exit codes +- "eval" statements removed from `spacesavers2_e2e` to accurately capture non-zero exit codes; makes sure e2e fails if catalog fails internally + ## spacesavers2 0.11.6 ### New features diff --git a/bin/redirect b/bin/redirect index e2f0809..443ed0b 100755 --- a/bin/redirect +++ b/bin/redirect @@ -52,6 +52,6 @@ else fi if [[ "$run" == "0" ]]; then - ${TOOLDIR}/${TOOLNAME} "$@" || true + ${TOOLDIR}/${TOOLNAME} "$@" || exit 1 conda deactivate 2>/dev/null fi diff --git a/docs/index.md b/docs/index.md index fc357a5..003a763 100644 --- a/docs/index.md +++ b/docs/index.md @@ -27,7 +27,6 @@ - [spacesavers2_catalog](catalog.md) - [spacesavers2_mimeo](mimeo.md) - [spacesavers2_grubbers](grubbers.md) -- [spacesavers2_blamematrix](blamematrix.md) - [spacesavers2_usurp](usurp.md) - [spacesavers2_e2e](e2e.md) - [spacesavers2_pdq](pdq.md) diff --git a/docs/pdq.md b/docs/pdq.md index c22675b..f91cf2d 100644 --- a/docs/pdq.md +++ b/docs/pdq.md @@ -5,7 +5,7 @@ pdq = Pretty Darn Quick This uses `glob` library to list all files in a user-provided folder recursively. For each user it gathers information like: - - total number of files + - total number of inodes - total number of bytes It is quick tool to gather datapoints to monitor filesystem usage. 
Typically, can be run once daily and compared with previous days run to find large changes. @@ -21,12 +21,12 @@ It is quick tool to gather datapoints to monitor filesystem usage. Typically, ca ```bash usage: spacesavers2_pdq [-h] -f FOLDER [-p THREADS] [-o OUTFILE] [-j JSON] [-v] -spacesavers2_pdq: get quick per user info (number of files and bytes). +spacesavers2_pdq: get quick per user info (number of inodes and bytes). options: -h, --help show this help message and exit -f FOLDER, --folder FOLDER - spacesavers2_pdq will be run on all files in this folder and its subfolders + spacesavers2_pdq will be run on all inodes in this folder and its subfolders -p THREADS, --threads THREADS number of threads to be used (default 4) -o OUTFILE, --outfile OUTFILE @@ -55,11 +55,11 @@ user3 1499 126442496 The 3 items in the line are as follows: -| Column | Description | Example | -| ------ | ------------------------ | ---------------------------------------------------------------------------------------------- | -| 1 | username | "user1" | -| 2 | total no. of files owned | 1386138 | -| 3 | total no. of bytes occupied | 6089531321856 | +| Column | Description | Example | +| ------ | --------------------------- | ------------- | +| 1 | username | "user1" | +| 2 | total no. of inodes owned | 1386138 | +| 3 | total no. 
of bytes occupied | 6089531321856 | ## JSON output @@ -67,15 +67,15 @@ Here is an example output: ``` { - "/data/CCBR_Pipeliner/Tools/spacesavers2": { - "37513": { - "username": "kopardevn", - "nfiles": 1267, + "/path/to/some/folder ": { + "1234": { + "username": "user1", + "ninodes": 1267, "nbytes": 96084992 }, - "60731": { - "username": "sovacoolkl", - "nfiles": 895, + "4356": { + "username": "user2", + "ninodes": 895, "nbytes": 89249280 } } diff --git a/spacesavers2_catalog b/spacesavers2_catalog index f01695d..f1836ae 100755 --- a/spacesavers2_catalog +++ b/spacesavers2_catalog @@ -18,6 +18,7 @@ from pathlib import Path def task(f): fd = FileDetails() + # print(f"Initiating {f}") fd.initialize( f, buffersize=args.buffersize, @@ -27,8 +28,28 @@ def task(f): bottomhash=args.bottomhash, st_block_byte_size=args.st_block_byte_size, ) + # print(f"Returning {f}") return fd +def process(fd,broken_links,outfh,geezerage,geezersize,geezers): + uid = fd.get_userid() + if fd.get_type() == "L": # broken link + if not uid in broken_links: broken_links[uid] = list() + broken_links[uid].append(fd.get_filepath()) + else: + result = "%s" % (fd) + if not result == "": + outfh.write(f"{result}\n") + if fd.get_type() == "f": + age = fd.get_age() + size = fd.get_size() + if age > geezerage and size > geezersize: + x = list() + x.append("{0:.2f} yrs".format(age/365)) + x.append(fd.get_size_human_readable()) + x.append(fd.get_filepath()) + if not uid in geezers: geezers[uid] = list() + geezers[uid].append("\t".join(x)) def main(): elog = textwrap.dedent( @@ -131,6 +152,14 @@ def main(): action=argparse.BooleanOptionalAction, help="output per-user geezer files list.", ) + parser.add_argument( + "-q", + "--quite", + dest="quite", + required=False, + action=argparse.BooleanOptionalAction, + help="Do not show progress", + ) parser.add_argument( "-a", "--geezerage", @@ -154,16 +183,17 @@ def main(): global args args = parser.parse_args() + tqdm_disable = False + if args.quite: tqdm_disable 
= True + global sed sed = dict() for s in args.se.split(","): sed[s] = 1 folder = args.folder - p = Path(folder) - files = [p] - files2 = p.glob("**/*") - files.extend(files2) + p = Path(folder).absolute() + dirs = [p] broken_links = dict() geezers = dict() @@ -174,25 +204,12 @@ def main(): outfh = sys.stdout with Pool(processes=args.threads) as pool: - for fd in tqdm.tqdm(pool.imap_unordered(task, files),total=len(files)): - uid = fd.get_userid() - if fd.get_type() == "L": # broken link - if not uid in broken_links: broken_links[uid] = list() - broken_links[uid].append(fd.get_filepath()) - else: - result = "%s" % (fd) - if not result == "": - outfh.write(f"{result}\n") - if fd.get_type() == "f": - age = fd.get_age() - size = fd.get_size() - if age > args.geezerage and size > args.geezersize: - x = list() - x.append("{0:.2f} yrs".format(age/365)) - x.append(fd.get_size_human_readable()) - x.append(fd.get_filepath()) - if not uid in geezers: geezers[uid] = list() - geezers[uid].append("\t".join(x)) + for fd in tqdm.tqdm(pool.imap_unordered(task, scantree(p,dirs)),disable=tqdm_disable): + process(fd,broken_links,outfh,args.geezerage,args.geezersize,geezers) + + with Pool(processes=args.threads) as pool: + for fd in tqdm.tqdm(pool.imap_unordered(task, dirs),disable=tqdm_disable): + process(fd,broken_links,outfh,args.geezerage,args.geezersize,geezers) if args.outfile: outfh.close() diff --git a/spacesavers2_e2e b/spacesavers2_e2e index 45b77de..d574326 100755 --- a/spacesavers2_e2e +++ b/spacesavers2_e2e @@ -3,6 +3,7 @@ # spacesavers2 end-to-end wrapper script #################################################################################### set -e -o pipefail +sleep_duration=10 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) @@ -44,10 +45,12 @@ outfile_blamematrix_err="${OUTFOLDER}/${PREFIX}.blamematrix.err" if [ ! 
-d $OUTFOLDER ];then mkdir -p $OUTFOLDER;fi +exit_code=0 # spacesavers2_catalog if [ -d $OUTFOLDER ];then echo "Running spacesavers2_catalog..." echo "Creating File: $outfile_catalog" +spacesavers2_catalog --version cmd=$( cat << EOF spacesavers2_catalog \ @@ -56,15 +59,21 @@ spacesavers2_catalog \ --outfile ${outfile_catalog} \ --bottomhash \ --brokenlink \ - --geezers \ - > ${outfile_catalog_log} 2> ${outfile_catalog_err} + --geezers --quite EOF ) echo $cmd -eval $cmd +$cmd > ${outfile_catalog_log} 2> ${outfile_catalog_err} +exit_code=$? +echo "ExitCode:$exit_code" +if [ $exit_code -ne 0 ];then + exit 1 +fi +else # exit if $OUTFOLDER does not exist + exit 1 fi -sleep 60 +sleep $sleep_duration # spacesavers2_mimeo echo "Running spacesavers2_mimeo..." @@ -73,6 +82,7 @@ if [ ! -f "${outfile_catalog}" ];then echo "Creation of ${outfile_catalog} FAILED!!" exit 1 fi +spacesavers2_mimeo --version cmd=$( cat << EOF spacesavers2_mimeo \ @@ -82,14 +92,19 @@ spacesavers2_mimeo \ --duplicatesonly \ --maxdepth $MAXDEPTH \ --p $PREFIX \ - --kronaplot \ - > ${outfile_mimeo_log} 2> ${outfile_mimeo_err} + --kronaplot + EOF ) echo $cmd -eval $cmd +$cmd > ${outfile_mimeo_log} 2> ${outfile_mimeo_err} +exit_code=$? +echo "ExitCode:$exit_code" +if [ $exit_code -ne 0 ];then + exit 1 +fi -sleep 60 +sleep $sleep_duration # spacesavers2_grubbers echo "Running spacesavers2_grubbers..." @@ -103,17 +118,23 @@ for filegz in `ls ${OUTFOLDER}/${PREFIX}*files.gz`;do outfile=`echo $filegz|sed "s/mimeo.files.gz/grubbers.tsv/g"` logfile=`echo $filegz|sed "s/mimeo.files.gz/grubbers.log/g"` errfile=`echo $filegz|sed "s/mimeo.files.gz/grubbers.err/g"` + spacesavers2_grubbers --version cmd=$( cat << EOF spacesavers2_grubbers \ --filesgz $filegz \ --limit $LIMIT \ - --outfile $outfile \ - > $logfile 2> $errfile + --outfile $outfile EOF ) echo $cmd - eval $cmd +$cmd > $logfile 2> $errfile +exit_code=$? 
+echo "ExitCode:$exit_code" +if [ $exit_code -ne 0 ];then + exit 1 +fi + done diff --git a/spacesavers2_pdq b/spacesavers2_pdq index 93c521e..a129eb5 100755 --- a/spacesavers2_pdq +++ b/spacesavers2_pdq @@ -16,6 +16,7 @@ from multiprocessing import Pool import argparse from pathlib import Path import json +import os def task(f): @@ -23,6 +24,12 @@ def task(f): fd.set(f) return fd +def process(fd): + # requires global bigdict + uid = fd.get_uid() + if not uid in bigdict: bigdict[uid]=dict() + inode = fd.get_inode() + if not inode in bigdict[uid]: bigdict[uid][inode]=fd.get_size() def main(): elog = textwrap.dedent( @@ -73,6 +80,14 @@ def main(): type=str, help="outfile file in JSON format.", ) + parser.add_argument( + "-q", + "--quite", + dest="quite", + required=False, + action=argparse.BooleanOptionalAction, + help="Do not show progress", + ) parser.add_argument("-v", "--version", action="version", version=__version__) global args @@ -80,9 +95,13 @@ def main(): folder = args.folder p = Path(folder).absolute() - files = [p] - files2 = p.glob("**/*") - files.extend(files2) + dirs = [p] + + tqdm_disable = False + if args.quite: tqdm_disable = True + # files = [p] + # files2 = p.glob("**/*") + # files.extend(files2) if args.outfile: outfh = open(args.outfile, 'w') @@ -92,30 +111,34 @@ def main(): if args.json: outjson = open(args.json, 'w') + global bigdict bigdict=dict() with Pool(processes=args.threads) as pool: - for fd in tqdm.tqdm(pool.imap_unordered(task, files),total=len(files)): - if not fd.is_file(): continue - uid = fd.get_uid() - if not uid in bigdict: bigdict[uid]=dict() - inode = fd.get_inode() - if not inode in bigdict[uid]: bigdict[uid][inode]=fd.get_size() + for fd in tqdm.tqdm(pool.imap_unordered(task, scantree(p,dirs)),disable=tqdm_disable): + if not fd.is_fld(): continue # its either a file or link or directory + process(fd) + # now loop through dirs + with Pool(processes=args.threads) as pool: + for fd in tqdm.tqdm(pool.imap_unordered(task, 
dirs),disable=tqdm_disable): + if not fd.is_fld(): continue # its either a file or link or directory + process(fd) + outdict=dict() outdict[str(p)]=dict() for uid in bigdict.keys(): username = get_username_groupname(uid) outdict[str(p)][str(uid)]=dict() - nfiles = len(bigdict[uid]) + ninodes = len(bigdict[uid]) nbytes = 0 for inode in bigdict[uid].keys(): nbytes += bigdict[uid][inode] outdict[str(p)][str(uid)]['username']=username - outdict[str(p)][str(uid)]['nfiles']=nfiles + outdict[str(p)][str(uid)]['ninodes']=ninodes outdict[str(p)][str(uid)]['nbytes']=nbytes - outfh.write(f"{username}\t{nfiles}\t{nbytes}\n") + outfh.write(f"{username}\t{ninodes}\t{nbytes}\n") if args.json: json.dump(outdict,outjson,indent=1) diff --git a/src/FileDetails.py b/src/FileDetails.py index 5c128fe..450d83f 100644 --- a/src/FileDetails.py +++ b/src/FileDetails.py @@ -81,52 +81,56 @@ def initialize(self,f,thresholdsize=THRESHOLDSIZE, buffersize=BUFFERSIZE, tb=TB, self.apath = Path(f).absolute() # path is of type PosixPath ext = self.apath.suffix self.fld = get_type(self.apath) # get if it is a file or directory or link or unknown or absent - st = self.apath.stat(follow_symlinks=False) # gather stat results - self.size = st.st_size # size in bytes - self.calculated_size = st.st_blocks * st_block_byte_size # st_blocks gives number of 512 bytes blocks used - self.calculated_size_human_readable = get_human_readable_size(self.calculated_size) - self.dev = st.st_dev # Device id - self.inode = st.st_ino # Inode - self.nlink = st.st_nlink # number of hardlinks - self.atime = convert_time_to_age(st.st_atime) # access time - self.mtime = convert_time_to_age(st.st_mtime) # modification time - self.ctime = convert_time_to_age(st.st_ctime) # change time - self.uid = st.st_uid # user id - self.gid = st.st_gid # group id - if self.fld == "f": - try: - with open(self.apath,'rb') as fh: - if ext in sed: - if self.size > tb: - data = fh.read(thresholdsize) - data = fh.read(buffersize) - self.xhash_top = 
xxhash.xxh128(data,seed=SEED).hexdigest() - if bottomhash: - fh.seek(-1 * buffersize,2) + try: + st = self.apath.stat(follow_symlinks=False) # gather stat results + self.size = st.st_size # size in bytes + self.calculated_size = st.st_blocks * st_block_byte_size # st_blocks gives number of 512 bytes blocks used + self.calculated_size_human_readable = get_human_readable_size(self.calculated_size) + self.dev = st.st_dev # Device id + self.inode = st.st_ino # Inode + self.nlink = st.st_nlink # number of hardlinks + self.atime = convert_time_to_age(st.st_atime) # access time + self.mtime = convert_time_to_age(st.st_mtime) # modification time + self.ctime = convert_time_to_age(st.st_ctime) # change time + self.uid = st.st_uid # user id + self.gid = st.st_gid # group id + if self.fld == "f": + try: + with open(self.apath,'rb') as fh: + if ext in sed: + if self.size > tb: + data = fh.read(thresholdsize) data = fh.read(buffersize) - self.xhash_bottom = xxhash.xxh128(data,seed=SEED).hexdigest() + self.xhash_top = xxhash.xxh128(data,seed=SEED).hexdigest() + if bottomhash: + fh.seek(-1 * buffersize,2) + data = fh.read(buffersize) + self.xhash_bottom = xxhash.xxh128(data,seed=SEED).hexdigest() + else: + self.xhash_bottom = self.xhash_top else: + data = fh.read() + self.xhash_top = xxhash.xxh128(data,seed=SEED).hexdigest() self.xhash_bottom = self.xhash_top else: - data = fh.read() - self.xhash_top = xxhash.xxh128(data,seed=SEED).hexdigest() - self.xhash_bottom = self.xhash_top - else: - if self.size > buffersize: - data = fh.read(buffersize) - self.xhash_top = xxhash.xxh128(data,seed=SEED).hexdigest() - if bottomhash: - fh.seek(-1 * buffersize,2) + if self.size > buffersize: data = fh.read(buffersize) - self.xhash_bottom = xxhash.xxh128(data,seed=SEED).hexdigest() + self.xhash_top = xxhash.xxh128(data,seed=SEED).hexdigest() + if bottomhash: + fh.seek(-1 * buffersize,2) + data = fh.read(buffersize) + self.xhash_bottom = xxhash.xxh128(data,seed=SEED).hexdigest() + else: + 
self.xhash_bottom = self.xhash_top else: + data = fh.read() + self.xhash_top = xxhash.xxh128(data,seed=SEED).hexdigest() self.xhash_bottom = self.xhash_top - else: - data = fh.read() - self.xhash_top = xxhash.xxh128(data,seed=SEED).hexdigest() - self.xhash_bottom = self.xhash_top - except: - sys.stderr.write("spacesavers2:{}:File cannot be read:{}\n".format(self.__class__.__name__,str(self.apath))) + except: + sys.stderr.write("spacesavers2:{}:File cannot be read:{}\n".format(self.__class__.__name__,str(self.apath))) + except: + sys.stderr.write("spacesavers2:{}:File probably recently deleted!:{}\n".format(self.__class__.__name__,str(self.apath))) + # print(f"Done with {self.apath}") def set(self,ls_line): original_ls_line=ls_line diff --git a/src/VERSION b/src/VERSION index e5cbde3..d33c3a2 100644 --- a/src/VERSION +++ b/src/VERSION @@ -1 +1 @@ -0.11.6 +0.12.0 \ No newline at end of file diff --git a/src/pdq.py b/src/pdq.py index 604780b..3a0bf5d 100644 --- a/src/pdq.py +++ b/src/pdq.py @@ -54,8 +54,10 @@ def get_uid(self): return self.uid def get_fld(self): return self.fld - def is_file(self): + def is_fld(self): if self.fld == "f": return True + if self.fld == "l": return True + if self.fld == "d": return True return False def get_inode(self): return self.inode diff --git a/src/utils.py b/src/utils.py index 68c4b51..56e43b4 100644 --- a/src/utils.py +++ b/src/utils.py @@ -5,6 +5,21 @@ import sys import time +def scantree(path,dirs): + # requires global dirs + """Recursively yield DirEntry objects for given directory.""" + try: + for entry in os.scandir(path): + if entry.is_dir(follow_symlinks=False): + # print(f"{entry.path} is DIR") + dirs.append(entry.path) + yield from scantree(entry.path,dirs) + else: + # print(f"{entry.path} is FILE") + yield entry.path + except: + return + def which(program): def is_exe(fpath):