From 99669142f8593a51357caa36a58fdec30d9d15bf Mon Sep 17 00:00:00 2001 From: kopardev Date: Wed, 28 Feb 2024 11:12:36 -0500 Subject: [PATCH 01/14] docs: update index ... remove blamematrix command link --- docs/index.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index fc357a5..003a763 100644 --- a/docs/index.md +++ b/docs/index.md @@ -27,7 +27,6 @@ - [spacesavers2_catalog](catalog.md) - [spacesavers2_mimeo](mimeo.md) - [spacesavers2_grubbers](grubbers.md) -- [spacesavers2_blamematrix](blamematrix.md) - [spacesavers2_usurp](usurp.md) - [spacesavers2_e2e](e2e.md) - [spacesavers2_pdq](pdq.md) From bf64b3dafbb4cf3458d7824fcffcfac45b796733 Mon Sep 17 00:00:00 2001 From: kopardev Date: Wed, 28 Feb 2024 11:31:04 -0500 Subject: [PATCH 02/14] feat: pdq counting links and folders; fix #93 --- spacesavers2_pdq | 8 ++++---- src/pdq.py | 4 +++- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/spacesavers2_pdq b/spacesavers2_pdq index 93c521e..d5b9c97 100755 --- a/spacesavers2_pdq +++ b/spacesavers2_pdq @@ -96,7 +96,7 @@ def main(): with Pool(processes=args.threads) as pool: for fd in tqdm.tqdm(pool.imap_unordered(task, files),total=len(files)): - if not fd.is_file(): continue + if not fd.is_fld(): continue # its either a file or link or directory uid = fd.get_uid() if not uid in bigdict: bigdict[uid]=dict() inode = fd.get_inode() @@ -108,14 +108,14 @@ def main(): for uid in bigdict.keys(): username = get_username_groupname(uid) outdict[str(p)][str(uid)]=dict() - nfiles = len(bigdict[uid]) + ninodes = len(bigdict[uid]) nbytes = 0 for inode in bigdict[uid].keys(): nbytes += bigdict[uid][inode] outdict[str(p)][str(uid)]['username']=username - outdict[str(p)][str(uid)]['nfiles']=nfiles + outdict[str(p)][str(uid)]['ninodes']=ninodes outdict[str(p)][str(uid)]['nbytes']=nbytes - outfh.write(f"{username}\t{nfiles}\t{nbytes}\n") + outfh.write(f"{username}\t{ninodes}\t{nbytes}\n") if args.json: json.dump(outdict,outjson,indent=1) diff 
--git a/src/pdq.py b/src/pdq.py index 604780b..3a0bf5d 100644 --- a/src/pdq.py +++ b/src/pdq.py @@ -54,8 +54,10 @@ def get_uid(self): return self.uid def get_fld(self): return self.fld - def is_file(self): + def is_fld(self): if self.fld == "f": return True + if self.fld == "l": return True + if self.fld == "d": return True return False def get_inode(self): return self.inode From e100700a84e3b4f3e9f8aab515aeec329c6af78d Mon Sep 17 00:00:00 2001 From: kopardev Date: Wed, 28 Feb 2024 11:31:53 -0500 Subject: [PATCH 03/14] docs: update pdq doc --- docs/pdq.md | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/pdq.md b/docs/pdq.md index c22675b..264f38a 100644 --- a/docs/pdq.md +++ b/docs/pdq.md @@ -5,7 +5,7 @@ pdq = Pretty Darn Quick This uses `glob` library to list all files in a user-provided folder recursively. For each user it gathers information like: - - total number of files + - total number of inodes - total number of bytes It is quick tool to gather datapoints to monitor filesystem usage. Typically, can be run once daily and compared with previous days run to find large changes. @@ -21,12 +21,12 @@ It is quick tool to gather datapoints to monitor filesystem usage. Typically, ca ```bash usage: spacesavers2_pdq [-h] -f FOLDER [-p THREADS] [-o OUTFILE] [-j JSON] [-v] -spacesavers2_pdq: get quick per user info (number of files and bytes). +spacesavers2_pdq: get quick per user info (number of inodes and bytes). 
options: -h, --help show this help message and exit -f FOLDER, --folder FOLDER - spacesavers2_pdq will be run on all files in this folder and its subfolders + spacesavers2_pdq will be run on all inodes in this folder and its subfolders -p THREADS, --threads THREADS number of threads to be used (default 4) -o OUTFILE, --outfile OUTFILE @@ -55,11 +55,11 @@ user3 1499 126442496 The 3 items in the line are as follows: -| Column | Description | Example | -| ------ | ------------------------ | ---------------------------------------------------------------------------------------------- | -| 1 | username | "user1" | -| 2 | total no. of files owned | 1386138 | -| 3 | total no. of bytes occupied | 6089531321856 | +| Column | Description | Example | +| ------ | --------------------------- | ------------- | +| 1 | username | "user1" | +| 2 | total no. of inodes owned | 1386138 | +| 3 | total no. of bytes occupied | 6089531321856 | ## JSON output @@ -67,14 +67,14 @@ Here is an example output: ``` { - "/data/CCBR_Pipeliner/Tools/spacesavers2": { - "37513": { - "username": "kopardevn", + "/path/to/some/folder": { + "1234": { + "username": "user1", "nfiles": 1267, "nbytes": 96084992 }, - "60731": { - "username": "sovacoolkl", + "4356": { + "username": "user2", "nfiles": 895, "nbytes": 89249280 } From 27244c26acfa0aa1215f235e9d8e1adb5af8484f Mon Sep 17 00:00:00 2001 From: kopardev Date: Wed, 28 Feb 2024 11:35:58 -0500 Subject: [PATCH 04/14] docs: changelog update, version increment --- CHANGELOG.md | 2 ++ src/VERSION | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 84e7b18..41d5d8c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,8 @@ ### Bug fixes +- `spacesavers2_pdq` not does NOT ignore links and folders (#93, @kopardev) + ## spacesavers2 0.11.6 ### New features diff --git a/src/VERSION b/src/VERSION index e5cbde3..47317ee 100644 --- a/src/VERSION +++ b/src/VERSION @@ -1 +1 @@ -0.11.6 +0.11.6-dev From 
5af66155f77b96e5aca908d959af608f3b0308bd Mon Sep 17 00:00:00 2001 From: kopardev Date: Wed, 28 Feb 2024 11:40:57 -0500 Subject: [PATCH 05/14] docs: nfiles are now ninodes --- docs/pdq.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/pdq.md b/docs/pdq.md index 264f38a..f91cf2d 100644 --- a/docs/pdq.md +++ b/docs/pdq.md @@ -67,15 +67,15 @@ Here is an example output: ``` { - "/path/to/some/folder": { + "/path/to/some/folder ": { "1234": { "username": "user1", - "nfiles": 1267, + "ninodes": 1267, "nbytes": 96084992 }, "4356": { "username": "user2", - "nfiles": 895, + "ninodes": 895, "nbytes": 89249280 } } From 1760823d64d582e87362733ae106c8939be2cad1 Mon Sep 17 00:00:00 2001 From: kopardev Date: Thu, 29 Feb 2024 14:59:45 -0500 Subject: [PATCH 06/14] refact: using scandir in place of glob... its faster --- spacesavers2_pdq | 41 +++++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/spacesavers2_pdq b/spacesavers2_pdq index d5b9c97..b6cae4d 100755 --- a/spacesavers2_pdq +++ b/spacesavers2_pdq @@ -16,6 +16,7 @@ from multiprocessing import Pool import argparse from pathlib import Path import json +import os def task(f): @@ -23,6 +24,12 @@ def task(f): fd.set(f) return fd +def process(fd): + # requires global bigdict + uid = fd.get_uid() + if not uid in bigdict: bigdict[uid]=dict() + inode = fd.get_inode() + if not inode in bigdict[uid]: bigdict[uid][inode]=fd.get_size() def main(): elog = textwrap.dedent( @@ -73,6 +80,14 @@ def main(): type=str, help="outfile file in JSON format.", ) + parser.add_argument( + "-q", + "--quite", + dest="quite", + required=False, + action=argparse.BooleanOptionalAction, + help="Do not show progress", + ) parser.add_argument("-v", "--version", action="version", version=__version__) global args @@ -80,9 +95,15 @@ def main(): folder = args.folder p = Path(folder).absolute() - files = [p] - files2 = p.glob("**/*") - files.extend(files2) + + global dirs + dirs = list() 
+ + tqdm_disable = False + if args.quite: tqdm_disable = True + # files = [p] + # files2 = p.glob("**/*") + # files.extend(files2) if args.outfile: outfh = open(args.outfile, 'w') @@ -92,16 +113,20 @@ def main(): if args.json: outjson = open(args.json, 'w') + global bigdict bigdict=dict() with Pool(processes=args.threads) as pool: - for fd in tqdm.tqdm(pool.imap_unordered(task, files),total=len(files)): + for fd in tqdm.tqdm(pool.imap_unordered(task, scantree(p)),disable=tqdm_disable): if not fd.is_fld(): continue # its either a file or link or directory - uid = fd.get_uid() - if not uid in bigdict: bigdict[uid]=dict() - inode = fd.get_inode() - if not inode in bigdict[uid]: bigdict[uid][inode]=fd.get_size() + process(fd) + # now loop through dirs + with Pool(processes=args.threads) as pool: + for fd in tqdm.tqdm(pool.imap_unordered(task, dirs),disable=tqdm_disable): + if not fd.is_fld(): continue # its either a file or link or directory + process(fd) + outdict=dict() outdict[str(p)]=dict() From 12655d7b0cd5af19289366944cfa6a539f500d6e Mon Sep 17 00:00:00 2001 From: kopardev Date: Thu, 29 Feb 2024 15:00:49 -0500 Subject: [PATCH 07/14] refact: using scandir instead of glob in catalog too; adding quite mode to supress progress bar --- spacesavers2_catalog | 18 +++++++++++++----- src/utils.py | 12 ++++++++++++ 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/spacesavers2_catalog b/spacesavers2_catalog index f01695d..324ac43 100755 --- a/spacesavers2_catalog +++ b/spacesavers2_catalog @@ -131,6 +131,14 @@ def main(): action=argparse.BooleanOptionalAction, help="output per-user geezer files list.", ) + parser.add_argument( + "-q", + "--quite", + dest="quite", + required=False, + action=argparse.BooleanOptionalAction, + help="Do not show progress", + ) parser.add_argument( "-a", "--geezerage", @@ -154,16 +162,16 @@ def main(): global args args = parser.parse_args() + tqdm_disable = False + if args.quite: tqdm_disable = True + global sed sed = dict() for s 
in args.se.split(","): sed[s] = 1 folder = args.folder - p = Path(folder) - files = [p] - files2 = p.glob("**/*") - files.extend(files2) + p = Path(folder).absolute() broken_links = dict() geezers = dict() @@ -174,7 +182,7 @@ def main(): outfh = sys.stdout with Pool(processes=args.threads) as pool: - for fd in tqdm.tqdm(pool.imap_unordered(task, files),total=len(files)): + for fd in tqdm.tqdm(pool.imap_unordered(task, scantree(p)),disable=tqdm_disable): uid = fd.get_userid() if fd.get_type() == "L": # broken link if not uid in broken_links: broken_links[uid] = list() diff --git a/src/utils.py b/src/utils.py index 68c4b51..fee38cf 100644 --- a/src/utils.py +++ b/src/utils.py @@ -5,6 +5,18 @@ import sys import time +def scantree(path): + # requires global dirs + """Recursively yield DirEntry objects for given directory.""" + for entry in os.scandir(path): + try: + if entry.is_dir(follow_symlinks=False): + dirs.append(entry.path) + yield from scantree(entry.path) + else: + yield entry.path + except: + return def which(program): def is_exe(fpath): From f19f98e6fd8e7096bd051354d78b1176bde7456b Mon Sep 17 00:00:00 2001 From: kopardev Date: Thu, 29 Feb 2024 18:40:44 -0500 Subject: [PATCH 08/14] refact: scantree need dirs ... 
no longer using global dirs variable --- src/utils.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/utils.py b/src/utils.py index fee38cf..56e43b4 100644 --- a/src/utils.py +++ b/src/utils.py @@ -5,18 +5,21 @@ import sys import time -def scantree(path): +def scantree(path,dirs): # requires global dirs """Recursively yield DirEntry objects for given directory.""" - for entry in os.scandir(path): - try: + try: + for entry in os.scandir(path): if entry.is_dir(follow_symlinks=False): + # print(f"{entry.path} is DIR") dirs.append(entry.path) - yield from scantree(entry.path) + yield from scantree(entry.path,dirs) else: + # print(f"{entry.path} is FILE") yield entry.path - except: - return + except: + return + def which(program): def is_exe(fpath): From a40280904117907e8d64a17bf7802af9f9e3d5b2 Mon Sep 17 00:00:00 2001 From: kopardev Date: Thu, 29 Feb 2024 18:41:42 -0500 Subject: [PATCH 09/14] fix: fix #95 --- src/FileDetails.py | 82 ++++++++++++++++++++++++---------------------- 1 file changed, 43 insertions(+), 39 deletions(-) diff --git a/src/FileDetails.py b/src/FileDetails.py index 5c128fe..450d83f 100644 --- a/src/FileDetails.py +++ b/src/FileDetails.py @@ -81,52 +81,56 @@ def initialize(self,f,thresholdsize=THRESHOLDSIZE, buffersize=BUFFERSIZE, tb=TB, self.apath = Path(f).absolute() # path is of type PosixPath ext = self.apath.suffix self.fld = get_type(self.apath) # get if it is a file or directory or link or unknown or absent - st = self.apath.stat(follow_symlinks=False) # gather stat results - self.size = st.st_size # size in bytes - self.calculated_size = st.st_blocks * st_block_byte_size # st_blocks gives number of 512 bytes blocks used - self.calculated_size_human_readable = get_human_readable_size(self.calculated_size) - self.dev = st.st_dev # Device id - self.inode = st.st_ino # Inode - self.nlink = st.st_nlink # number of hardlinks - self.atime = convert_time_to_age(st.st_atime) # access time - self.mtime = 
convert_time_to_age(st.st_mtime) # modification time - self.ctime = convert_time_to_age(st.st_ctime) # change time - self.uid = st.st_uid # user id - self.gid = st.st_gid # group id - if self.fld == "f": - try: - with open(self.apath,'rb') as fh: - if ext in sed: - if self.size > tb: - data = fh.read(thresholdsize) - data = fh.read(buffersize) - self.xhash_top = xxhash.xxh128(data,seed=SEED).hexdigest() - if bottomhash: - fh.seek(-1 * buffersize,2) + try: + st = self.apath.stat(follow_symlinks=False) # gather stat results + self.size = st.st_size # size in bytes + self.calculated_size = st.st_blocks * st_block_byte_size # st_blocks gives number of 512 bytes blocks used + self.calculated_size_human_readable = get_human_readable_size(self.calculated_size) + self.dev = st.st_dev # Device id + self.inode = st.st_ino # Inode + self.nlink = st.st_nlink # number of hardlinks + self.atime = convert_time_to_age(st.st_atime) # access time + self.mtime = convert_time_to_age(st.st_mtime) # modification time + self.ctime = convert_time_to_age(st.st_ctime) # change time + self.uid = st.st_uid # user id + self.gid = st.st_gid # group id + if self.fld == "f": + try: + with open(self.apath,'rb') as fh: + if ext in sed: + if self.size > tb: + data = fh.read(thresholdsize) data = fh.read(buffersize) - self.xhash_bottom = xxhash.xxh128(data,seed=SEED).hexdigest() + self.xhash_top = xxhash.xxh128(data,seed=SEED).hexdigest() + if bottomhash: + fh.seek(-1 * buffersize,2) + data = fh.read(buffersize) + self.xhash_bottom = xxhash.xxh128(data,seed=SEED).hexdigest() + else: + self.xhash_bottom = self.xhash_top else: + data = fh.read() + self.xhash_top = xxhash.xxh128(data,seed=SEED).hexdigest() self.xhash_bottom = self.xhash_top else: - data = fh.read() - self.xhash_top = xxhash.xxh128(data,seed=SEED).hexdigest() - self.xhash_bottom = self.xhash_top - else: - if self.size > buffersize: - data = fh.read(buffersize) - self.xhash_top = xxhash.xxh128(data,seed=SEED).hexdigest() - if bottomhash: 
- fh.seek(-1 * buffersize,2) + if self.size > buffersize: data = fh.read(buffersize) - self.xhash_bottom = xxhash.xxh128(data,seed=SEED).hexdigest() + self.xhash_top = xxhash.xxh128(data,seed=SEED).hexdigest() + if bottomhash: + fh.seek(-1 * buffersize,2) + data = fh.read(buffersize) + self.xhash_bottom = xxhash.xxh128(data,seed=SEED).hexdigest() + else: + self.xhash_bottom = self.xhash_top else: + data = fh.read() + self.xhash_top = xxhash.xxh128(data,seed=SEED).hexdigest() self.xhash_bottom = self.xhash_top - else: - data = fh.read() - self.xhash_top = xxhash.xxh128(data,seed=SEED).hexdigest() - self.xhash_bottom = self.xhash_top - except: - sys.stderr.write("spacesavers2:{}:File cannot be read:{}\n".format(self.__class__.__name__,str(self.apath))) + except: + sys.stderr.write("spacesavers2:{}:File cannot be read:{}\n".format(self.__class__.__name__,str(self.apath))) + except: + sys.stderr.write("spacesavers2:{}:File probably recently deleted!:{}\n".format(self.__class__.__name__,str(self.apath))) + # print(f"Done with {self.apath}") def set(self,ls_line): original_ls_line=ls_line From adc63fe4023b708a896c51f952f920fbdf373769 Mon Sep 17 00:00:00 2001 From: kopardev Date: Thu, 29 Feb 2024 18:42:47 -0500 Subject: [PATCH 10/14] fix: redirect now correctly denotes non-zero exit code --- bin/redirect | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/redirect b/bin/redirect index e2f0809..443ed0b 100755 --- a/bin/redirect +++ b/bin/redirect @@ -52,6 +52,6 @@ else fi if [[ "$run" == "0" ]]; then - ${TOOLDIR}/${TOOLNAME} "$@" || true + ${TOOLDIR}/${TOOLNAME} "$@" || exit 1 conda deactivate 2>/dev/null fi From 43bc5a8548a730b3202b40ccb39e13fc93fbfaeb Mon Sep 17 00:00:00 2001 From: kopardev Date: Thu, 29 Feb 2024 18:46:37 -0500 Subject: [PATCH 11/14] refact:process function added to reduce code redundancy --- spacesavers2_catalog | 47 ++++++++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 19 deletions(-) diff --git 
a/spacesavers2_catalog b/spacesavers2_catalog index 324ac43..f1836ae 100755 --- a/spacesavers2_catalog +++ b/spacesavers2_catalog @@ -18,6 +18,7 @@ from pathlib import Path def task(f): fd = FileDetails() + # print(f"Initiating {f}") fd.initialize( f, buffersize=args.buffersize, @@ -27,8 +28,28 @@ def task(f): bottomhash=args.bottomhash, st_block_byte_size=args.st_block_byte_size, ) + # print(f"Returning {f}") return fd +def process(fd,broken_links,outfh,geezerage,geezersize,geezers): + uid = fd.get_userid() + if fd.get_type() == "L": # broken link + if not uid in broken_links: broken_links[uid] = list() + broken_links[uid].append(fd.get_filepath()) + else: + result = "%s" % (fd) + if not result == "": + outfh.write(f"{result}\n") + if fd.get_type() == "f": + age = fd.get_age() + size = fd.get_size() + if age > geezerage and size > geezersize: + x = list() + x.append("{0:.2f} yrs".format(age/365)) + x.append(fd.get_size_human_readable()) + x.append(fd.get_filepath()) + if not uid in geezers: geezers[uid] = list() + geezers[uid].append("\t".join(x)) def main(): elog = textwrap.dedent( @@ -172,6 +193,7 @@ def main(): folder = args.folder p = Path(folder).absolute() + dirs = [p] broken_links = dict() geezers = dict() @@ -182,25 +204,12 @@ def main(): outfh = sys.stdout with Pool(processes=args.threads) as pool: - for fd in tqdm.tqdm(pool.imap_unordered(task, scantree(p)),disable=tqdm_disable): - uid = fd.get_userid() - if fd.get_type() == "L": # broken link - if not uid in broken_links: broken_links[uid] = list() - broken_links[uid].append(fd.get_filepath()) - else: - result = "%s" % (fd) - if not result == "": - outfh.write(f"{result}\n") - if fd.get_type() == "f": - age = fd.get_age() - size = fd.get_size() - if age > args.geezerage and size > args.geezersize: - x = list() - x.append("{0:.2f} yrs".format(age/365)) - x.append(fd.get_size_human_readable()) - x.append(fd.get_filepath()) - if not uid in geezers: geezers[uid] = list() - geezers[uid].append("\t".join(x)) 
+ for fd in tqdm.tqdm(pool.imap_unordered(task, scantree(p,dirs)),disable=tqdm_disable): + process(fd,broken_links,outfh,args.geezerage,args.geezersize,geezers) + + with Pool(processes=args.threads) as pool: + for fd in tqdm.tqdm(pool.imap_unordered(task, dirs),disable=tqdm_disable): + process(fd,broken_links,outfh,args.geezerage,args.geezersize,geezers) if args.outfile: outfh.close() From 9c9e9c31d380e596786025eadad47849081e9c6e Mon Sep 17 00:00:00 2001 From: kopardev Date: Thu, 29 Feb 2024 18:47:32 -0500 Subject: [PATCH 12/14] refact: eval statements removed to capture exit codes currectly --- spacesavers2_e2e | 43 ++++++++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/spacesavers2_e2e b/spacesavers2_e2e index 45b77de..d574326 100755 --- a/spacesavers2_e2e +++ b/spacesavers2_e2e @@ -3,6 +3,7 @@ # spacesavers2 end-to-end wrapper script #################################################################################### set -e -o pipefail +sleep_duration=10 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) @@ -44,10 +45,12 @@ outfile_blamematrix_err="${OUTFOLDER}/${PREFIX}.blamematrix.err" if [ ! -d $OUTFOLDER ];then mkdir -p $OUTFOLDER;fi +exit_code=0 # spacesavers2_catalog if [ -d $OUTFOLDER ];then echo "Running spacesavers2_catalog..." echo "Creating File: $outfile_catalog" +spacesavers2_catalog --version cmd=$( cat << EOF spacesavers2_catalog \ @@ -56,15 +59,21 @@ spacesavers2_catalog \ --outfile ${outfile_catalog} \ --bottomhash \ --brokenlink \ - --geezers \ - > ${outfile_catalog_log} 2> ${outfile_catalog_err} + --geezers --quite EOF ) echo $cmd -eval $cmd +$cmd > ${outfile_catalog_log} 2> ${outfile_catalog_err} +exit_code=$? +echo "ExitCode:$exit_code" +if [ $exit_code -ne 0 ];then + exit 1 +fi +else # exit if $OUTFOLDER does not exist + exit 1 fi -sleep 60 +sleep $sleep_duration # spacesavers2_mimeo echo "Running spacesavers2_mimeo..." @@ -73,6 +82,7 @@ if [ ! 
-f "${outfile_catalog}" ];then echo "Creation of ${outfile_catalog} FAILED!!" exit 1 fi +spacesavers2_mimeo --version cmd=$( cat << EOF spacesavers2_mimeo \ @@ -82,14 +92,19 @@ spacesavers2_mimeo \ --duplicatesonly \ --maxdepth $MAXDEPTH \ --p $PREFIX \ - --kronaplot \ - > ${outfile_mimeo_log} 2> ${outfile_mimeo_err} + --kronaplot + EOF ) echo $cmd -eval $cmd +$cmd > ${outfile_mimeo_log} 2> ${outfile_mimeo_err} +exit_code=$? +echo "ExitCode:$exit_code" +if [ $exit_code -ne 0 ];then + exit 1 +fi -sleep 60 +sleep $sleep_duration # spacesavers2_grubbers echo "Running spacesavers2_grubbers..." @@ -103,17 +118,23 @@ for filegz in `ls ${OUTFOLDER}/${PREFIX}*files.gz`;do outfile=`echo $filegz|sed "s/mimeo.files.gz/grubbers.tsv/g"` logfile=`echo $filegz|sed "s/mimeo.files.gz/grubbers.log/g"` errfile=`echo $filegz|sed "s/mimeo.files.gz/grubbers.err/g"` + spacesavers2_grubbers --version cmd=$( cat << EOF spacesavers2_grubbers \ --filesgz $filegz \ --limit $LIMIT \ - --outfile $outfile \ - > $logfile 2> $errfile + --outfile $outfile EOF ) echo $cmd - eval $cmd +$cmd > $logfile 2> $errfile +exit_code=$? 
+echo "ExitCode:$exit_code" +if [ $exit_code -ne 0 ];then + exit 1 +fi + done From 6ed82cfab84199d5fe9a76aae7d8b669aef38cdc Mon Sep 17 00:00:00 2001 From: kopardev Date: Thu, 29 Feb 2024 18:48:05 -0500 Subject: [PATCH 13/14] refact: adding the input folder to dirs list --- spacesavers2_pdq | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/spacesavers2_pdq b/spacesavers2_pdq index b6cae4d..a129eb5 100755 --- a/spacesavers2_pdq +++ b/spacesavers2_pdq @@ -95,9 +95,7 @@ def main(): folder = args.folder p = Path(folder).absolute() - - global dirs - dirs = list() + dirs = [p] tqdm_disable = False if args.quite: tqdm_disable = True @@ -117,7 +115,7 @@ def main(): bigdict=dict() with Pool(processes=args.threads) as pool: - for fd in tqdm.tqdm(pool.imap_unordered(task, scantree(p)),disable=tqdm_disable): + for fd in tqdm.tqdm(pool.imap_unordered(task, scantree(p,dirs)),disable=tqdm_disable): if not fd.is_fld(): continue # its either a file or link or directory process(fd) From fefa09e3eaacefe6b687460137ead580d376e5b1 Mon Sep 17 00:00:00 2001 From: kopardev Date: Thu, 29 Feb 2024 19:01:21 -0500 Subject: [PATCH 14/14] chore: changelog and version updated --- CHANGELOG.md | 12 ++++++++++++ src/VERSION | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 41d5d8c..cb4b72a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,19 @@ ### Bug fixes +## spacesavers2 v0.12.0 + +### New features + +- `spacesavers2_pdq` is now counting inodes (not files) and including links and directories (#95, @kopardev) +- "pathlib.glob" is replaced with "os.scandir" for speedy folder traversing +- `--quite` option added to `spacesavers2_pdq` and `spacesavers2_catalog` to suppress progress bar output when running non-interactively eg. as a cronjob. This reduces size of .err file. 
+ +### Bug fixes + - `spacesavers2_pdq` not does NOT ignore links and folders (#93, @kopardev) +- `redirect` correctly captures intermediate non-zero exit codes +- "eval" statements removed from `spacesavers2_e2e` to accurately capture non-zero exit codes; makes sure e2e fails if catalog fails internally ## spacesavers2 0.11.6 diff --git a/src/VERSION b/src/VERSION index 47317ee..d33c3a2 100644 --- a/src/VERSION +++ b/src/VERSION @@ -1 +1 @@ -0.11.6-dev +0.12.0 \ No newline at end of file