diff --git a/CHANGELOG.md b/CHANGELOG.md index 82e70fa..0a052f0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,16 @@ ### Bug fixes +## spacesavers2 v0.13.0 + +### New features + +- adding new commands `spacesavers2_pdq_create_db` and `spacesavers2_pdq_update_db` +- output TSV files from `spacesavers2_pdq` can be saved into a sqlite3 db with these commands +- future integration with Grafana will now be possible + +### Bug fixes + ## spacesavers2 v0.12.1 ### New features diff --git a/bin/spacesavers2_pdq_create_db b/bin/spacesavers2_pdq_create_db new file mode 120000 index 0000000..577f1ce --- /dev/null +++ b/bin/spacesavers2_pdq_create_db @@ -0,0 +1 @@ +redirect \ No newline at end of file diff --git a/bin/spacesavers2_pdq_update_db b/bin/spacesavers2_pdq_update_db new file mode 120000 index 0000000..577f1ce --- /dev/null +++ b/bin/spacesavers2_pdq_update_db @@ -0,0 +1 @@ +redirect \ No newline at end of file diff --git a/docs/assets/images/pdq_db_schema.png b/docs/assets/images/pdq_db_schema.png new file mode 100644 index 0000000..b01dee8 Binary files /dev/null and b/docs/assets/images/pdq_db_schema.png differ diff --git a/docs/pdq.md b/docs/pdq.md index f91cf2d..a9a237e 100644 --- a/docs/pdq.md +++ b/docs/pdq.md @@ -80,4 +80,9 @@ Here is an example output: } } } -``` \ No newline at end of file +``` + +`spacesavers2_pdq` creates TSV (or JSON) file per-datamount per-run (typically per-date). If run daily, this soon creates a lot of files to keep track of. 
Hence, it is best to save the data in a sqlite db using: + + - [`spacesavers2_pdq_create_db`](pdq_create_db.md) and + - [`spacesavers2_pdq_update_db`](pdq_update_db.md) \ No newline at end of file diff --git a/docs/pdq_create_db.md b/docs/pdq_create_db.md new file mode 100644 index 0000000..716eed2 --- /dev/null +++ b/docs/pdq_create_db.md @@ -0,0 +1,70 @@ +## spacesavers2_pdq_create_db + +pdq = Pretty Darn Quick + +[`spacesavers2_pdq`](pdq.md) creates TSV (or JSON) file per-datamount per-run (typically per-date). If run daily, this soon creates a lot of files to keep track of. Hence, it is best to save the data in a sqlite db. This command creates the basic schema for that db. The schema looks like this: + +![pdq schema](assets/images/pdq_db_schema.png) + +### Inputs + - `--filepath`: where to create the ".db" file. + - `--overwrite`: toggle to overwrite if the ".db" file already exists. + +```bash +usage: spacesavers2_pdq_create_db [-h] -f FILEPATH [-o | --overwrite | --no-overwrite] [-v] + +spacesavers2_pdq_create_db: create a sqlitedb file with the optimized schema. + +options: + -h, --help show this help message and exit + -f FILEPATH, --filepath FILEPATH + spacesavers2_pdq_create_db will create this sqlitedb file + -o, --overwrite, --no-overwrite + overwrite output file if it already exists. Use this with caution as it will delete existing file and its contents!! + -v, --version show program's version number and exit + +Version: + v0.13.0-dev +Example: + > spacesavers2_pdq_create_db -f /path/to/sqlitedbfile +``` + +### Output + +## db file + +sqlite ".db" file with 4 tables + +```bash +% sqlite3 pdq.db +SQLite version 3.26.0 2018-12-01 12:34:55 +Enter ".help" for usage hints. 
+sqlite> .table +datamounts datapoints dates users +sqlite> .schema +CREATE TABLE users ( + user_id INTEGER PRIMARY KEY, + username TEXT NOT NULL, + first_name TEXT NOT NULL, + last_name TEXT NOT NULL + ); +CREATE TABLE dates ( + date_int INTEGER PRIMARY KEY, + date_text TEXT UNIQUE NOT NULL + ); +CREATE TABLE datamounts ( + datamount_id INTEGER PRIMARY KEY, + datamount_name TEXT UNIQUE NOT NULL + ); +CREATE TABLE datapoints ( + datapoint_id INTEGER PRIMARY KEY, + date_int INTEGER, + datamount_id INTEGER, + user_id INTEGER, + ninodes INTEGER, + nbytes INTEGER, + FOREIGN KEY (user_id) REFERENCES users(user_id), + FOREIGN KEY (datamount_id) REFERENCES datamounts(datamount_id), + FOREIGN KEY (date_int) REFERENCES dates(date_int) + ); +``` diff --git a/docs/pdq_update_db.md b/docs/pdq_update_db.md new file mode 100644 index 0000000..98a59e3 --- /dev/null +++ b/docs/pdq_update_db.md @@ -0,0 +1,47 @@ +## spacesavers2_pdq_update_db + +pdq = Pretty Darn Quick + +[`spacesavers2_pdq`](pdq.md) creates TSV (or JSON) file per-datamount per-run (typically per-date). If run daily, this soon creates a lot of files to keep track of. Hence, it is best to save the data in a sqlite db. [`spacesavers2_pdq_create_db`](pdq_create_db.md) command creates the basic schema for that db. Then this command can be used to populate the database. + +![pdq schema](assets/images/pdq_db_schema.png) + +### Inputs + - `--tsv`: `.tsv` or `.tsv.gz` created using `spacesavers2_pdq` + - `--database`: `.db` file created using `spacesavers2_pdq_create_db` + - `--datamount`: eg. 
`CCBR` or `CCBR_Pipeliner` + - `--date`: integer date in YYYYMMDD format + +```bash +usage: spacesavers2_pdq_update_db [-h] -t TSV -o DATABASE -m DATAMOUNT -d DATE [-v] + +spacesavers2_pdq_create_db: update/append date from TSV to DB file + +options: + -h, --help show this help message and exit + -t TSV, --tsv TSV spacesavers2_pdq output TSV file + -o DATABASE, --database DATABASE + database file path (use spacesavers2_pdb_create_db to create if it does not exists.) + -m DATAMOUNT, --datamount DATAMOUNT + name of the datamount eg. CCBR or CCBR_Pipeliner + -d DATE, --date DATE date in YYYYMMDD integer format + -v, --version show program's version number and exit + +Version: + v0.13.0-dev +Example: + > spacesavers2_pdq_update_db -t /path/to/tsv -o /path/to/db -m datamount_name -d date +``` + +### Output + +## updated db file + +sqlite ".db" file with 4 tables is updated. + +> NOTE: +> +> - new users are automatically added to "users" table +> - new datamounts are automatically added to "datamounts" table +> - new dates are automatically added to "dates" table +> - if >0 datapoints exist in the ".db" for a (date + datamount) combination then a warning is displayed and no data is appended \ No newline at end of file diff --git a/extras/README.md b/extras/README.md new file mode 100644 index 0000000..0a23b94 --- /dev/null +++ b/extras/README.md @@ -0,0 +1 @@ +Location to store extra scripts! diff --git a/extras/create_and_append_db.sh b/extras/create_and_append_db.sh new file mode 100644 index 0000000..b485a82 --- /dev/null +++ b/extras/create_and_append_db.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# This script: +# 1. creates a sqlite3 database using `spacesavers2_pdq_create_db` +# 2. updates the database for "CCBR" mount related datapoints +# 3. 
updates the database for "CCBR_Pipeliner" mount related datapoints + +module load ccbrpipeliner/6 +BIN="/data/CCBR_Pipeliner/Tools/spacesavers2/pdq_db/bin" +DB="/data/CCBR_Pipeliner/userdata/spacesavers2_pdq/pdq.db" + +if [[ "1" == "0" ]];then +# Step 1. +${BIN}/spacesavers2_pdq_create_db -f $DB +fi + +# Step 2. +for f in `ls /data/CCBR_Pipeliner/userdata/spacesavers2_pdq/_data_CCBR.*.tsv*` +do + bn=$(basename $f) + echo $bn + dt=$(echo $bn|awk -F"." '{print $2}') + dm="CCBR" + ${BIN}/spacesavers2_pdq_update_db \ + --tsv $f \ + --database $DB \ + --datamount $dm --date $dt +done + +# Step 3. +for f in `ls /data/CCBR_Pipeliner/userdata/spacesavers2_pdq/_data_CCBR_Pipeliner.*.tsv*` +do + bn=$(basename $f) + echo $bn + dt=$(echo $bn|awk -F"." '{print $2}') + dm="CCBR_Pipeliner" + ${BIN}/spacesavers2_pdq_update_db \ + --tsv $f \ + --database $DB \ + --datamount $dm --date $dt +done diff --git a/mkdocs.yml b/mkdocs.yml index 3929983..3a4984c 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -104,3 +104,6 @@ nav: - grubbers: grubbers.md - usurp: usurp.md - e2e: e2e.md + - pdq: pdq.md + - pdq_create_db: pdq_create_db.md + - pdq_update_db: pdq_update_db.md diff --git a/spacesavers2_pdq_create_db b/spacesavers2_pdq_create_db new file mode 100755 index 0000000..7f18bb5 --- /dev/null +++ b/spacesavers2_pdq_create_db @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 +# pdq = pretty darn quick + +from src.VersionCheck import version_check +from src.VersionCheck import __version__ +from src.utils import * + +version_check() + +# import required modules +import os +import sqlite3 +import textwrap +import argparse +from pathlib import Path + +def main(): + """Parse the command line and create a new sqlite3 file containing the empty users, dates, datamounts and datapoints tables.""" + elog = textwrap.dedent( + """\ + Version: + {} + Example: + > spacesavers2_pdq_create_db -f /path/to/sqlitedbfile + """.format( + __version__ + ) + ) + parser = argparse.ArgumentParser( + description="spacesavers2_pdq_create_db: create a sqlitedb file with the optimized schema.", + epilog=elog, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) 
+ parser.add_argument( + "-f", + "--filepath", + dest="filepath", + required=True, + type=str, + help="spacesavers2_pdq_create_db will create this sqlitedb file", + ) + parser.add_argument( + "-o", + "--overwrite", + dest="overwrite", + required=False, + action=argparse.BooleanOptionalAction, + help="overwrite output file if it already exists. Use this with caution as it will delete existing file and its contents!!", + ) + parser.add_argument("-v", "--version", action="version", version=__version__) + + global args + args = parser.parse_args() + + filepath = args.filepath + p = Path(filepath).absolute() + pp = p.parents[0] + if not os.access(pp, os.W_OK): + exit("ERROR: {} folder exists but cannot be written to".format(pp)) + if os.path.exists(p): + if not args.overwrite: + exit("ERROR: {} file exists and overwrite argument is not selected!".format(p)) + if not os.access(p, os.W_OK): + exit("ERROR: {} file exists but is not writeable/appendable".format(p)) + if args.overwrite and os.access(p, os.W_OK): + os.remove(p) + + # Connect to the SQLite database (or create it if it doesn't exist) + conn = sqlite3.connect(p) + cursor = conn.cursor() + + # Create the "users" table + cursor.execute('''CREATE TABLE IF NOT EXISTS users ( + user_id INTEGER PRIMARY KEY, + username TEXT NOT NULL, + first_name TEXT NOT NULL, + last_name TEXT NOT NULL + )''') + + # Create the "dates" table + cursor.execute('''CREATE TABLE IF NOT EXISTS dates ( + date_int INTEGER PRIMARY KEY, + date_text TEXT UNIQUE NOT NULL + )''') + + # Create datamounts table + cursor.execute('''CREATE TABLE IF NOT EXISTS datamounts ( + datamount_id INTEGER PRIMARY KEY, + datamount_name TEXT UNIQUE NOT NULL + )''') + + + # Create the "datapoints" table with foreign keys into users, datamounts and dates + cursor.execute('''CREATE TABLE IF NOT EXISTS datapoints ( + datapoint_id INTEGER PRIMARY KEY, + date_int INTEGER, + datamount_id INTEGER, + user_id INTEGER, + ninodes INTEGER, + nbytes INTEGER, + FOREIGN KEY (user_id) REFERENCES 
users(user_id), + FOREIGN KEY (datamount_id) REFERENCES datamounts(datamount_id), + FOREIGN KEY (date_int) REFERENCES dates(date_int) + )''') + + # Commit changes and close the connection + conn.commit() + conn.close() + +if __name__ == "__main__": + main() diff --git a/spacesavers2_pdq_update_db b/spacesavers2_pdq_update_db new file mode 100755 index 0000000..8a1e125 --- /dev/null +++ b/spacesavers2_pdq_update_db @@ -0,0 +1,260 @@ +#!/usr/bin/env python3 +# pdq = pretty darn quick +# This script appends records from the supplied TSV file into the supplied sqlite3 db file + +from src.VersionCheck import version_check +from src.VersionCheck import __version__ +from src.utils import * + +version_check() + +# import required modules +import os +import sqlite3 +import textwrap +import argparse +from pathlib import Path +import subprocess +import pandas +import sys + +# data access functions + + +def get_full_name(uid): + """Return the GECOS (full name) field reported by `getent passwd` for uid, or "Unknown Unknown" when the lookup fails.""" + # argument list with shell=False: uid comes from the TSV and must never be interpreted by a shell + cmd = ["getent", "passwd", str(uid)] + results = subprocess.run(cmd, + shell=False, + text=True, + capture_output=True) + if results.returncode != 0: + return "Unknown Unknown" + else: + x = results.stdout + x = x.split(":") + if len(x) != 7: + return "Unknown Unknown" + full_name = x[4] + return full_name + +def convert_date_int_to_date_str(date_int): + # Convert integer to string + date_str = str(date_int) + + # Insert hyphens at appropriate positions + date_text = f"{date_str[:4]}-{date_str[4:6]}-{date_str[6:]}" + + return date_text + +def count_column_equals_value(cursor,column_name, value, table_name): + + # Execute a SELECT query to count the occurrences of the value in the column + query = f"SELECT COUNT(*) FROM {table_name} WHERE {column_name} = ?" 
+ cursor.execute(query, (value,)) + result = cursor.fetchone()[0] + + return result + +def count_two_columns_equals_two_values(cursor,column_name1, value1, column_name2, value2, table_name): + + # Execute a SELECT query to count the occurrences of the value in the column + query = f"SELECT COUNT(*) FROM {table_name} WHERE {column_name1} = ? AND {column_name2} = ?" + cursor.execute(query, (value1, value2,)) + result = cursor.fetchone()[0] + + return result + +def check_value_and_return_primary_key(cursor, value, column_name, table_name, primary_key_column): + + # Execute a SELECT query to check if the value exists in the column and return the primary key + query = f"SELECT {primary_key_column}, COUNT(*) FROM {table_name} WHERE {column_name} = ?" + cursor.execute(query, (value,)) + result = cursor.fetchone() + + # If the count is greater than 0, the value exists in the column + if result[1] > 0: + return result[0] # Return the primary key + else: + return None # Value not found + +def add_user(cursor,user_id,username): + """Insert a row into the users table, using the first and last word of the looked-up full name as first_name and last_name.""" + + full_name = get_full_name(user_id) + full_name = full_name.split(" ") + first_name = full_name[0] or "Unknown" # an empty GECOS field yields [""] + last_name = full_name[-1] if len(full_name) > 1 else "Unknown" # a one-word name has no separate last name; popping twice used to raise IndexError here + + # Execute INSERT statement + query = f"INSERT INTO users (user_id, username, first_name, last_name) VALUES (?, ?, ?, ?)" + cursor.execute(query, (user_id, username, first_name, last_name)) + +def add_date(cursor,date): + + date_text = convert_date_int_to_date_str(date) + + # Execute INSERT statement + query = f"INSERT INTO dates (date_int, date_text) VALUES (?, ?)" + cursor.execute(query, (int(date), date_text)) + +def add_datamount(cursor,datamount): + + # Execute INSERT statement + query = f"INSERT INTO datamounts (datamount_name) VALUES (?)" + cursor.execute(query,(datamount,)) + + # query = f"INSERT INTO datamounts (datamount_name) VALUES (\"{datamount}\")" + # cursor.execute(query) +def add_datapoint(cursor,datamount_id,date,user_id,ninodes,nbytes): + + query = f"INSERT INTO datapoints (date_int, 
datamount_id, user_id, ninodes, nbytes) VALUES (?, ?, ?, ?, ?)" + cursor.execute(query, (int(date),int(datamount_id),int(user_id),int(ninodes),int(nbytes))) + +########## + +def main(): + """Parse the command line and append the datapoints of one spacesavers2_pdq TSV file to the sqlite3 database.""" + elog = textwrap.dedent( + """\ + Version: + {} + Example: + > spacesavers2_pdq_update_db -t /path/to/tsv -o /path/to/db -m datamount_name -d date + """.format( + __version__ + ) + ) + parser = argparse.ArgumentParser( + description="spacesavers2_pdq_create_db: update/append date from TSV to DB file", + epilog=elog, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "-t", + "--tsv", + dest="tsv", + required=True, + type=str, + help="spacesavers2_pdq output TSV file", + ) + parser.add_argument( + "-o", + "--database", + dest="database", + required=True, + help="database file path (use spacesavers2_pdb_create_db to create if it does not exists.)", + ) + parser.add_argument( + "-m", + "--datamount", + dest="datamount", + required=True, + type=str, + help="name of the datamount eg. CCBR or CCBR_Pipeliner", + ) + parser.add_argument( + "-d", + "--date", + dest="date", + required=True, + type=int, + help="date in YYYYMMDD integer format", + ) + parser.add_argument("-v", "--version", action="version", version=__version__) + + global args + args = parser.parse_args() + + # check TSV file + tsv = args.tsv + tsv = Path(tsv).absolute() + if not os.access(tsv, os.R_OK): + exit("ERROR: {} file exists but cannot be read from".format(tsv)) + + # check db file + db = args.database + db = Path(db).absolute() + if not os.path.exists(db): + exit("ERROR: {} does not exist. 
Create it using spacesavers2_pdq_create_db".format(db)) + if not os.access(db, os.W_OK): + exit("ERROR: {} file exists but cannot be written to".format(db)) + + # check date format + if len(str(args.date)) != 8: + exit("ERROR: date {} needs to be in format YYYYMMDD!".format(args.date)) + + # Connect to the existing SQLite database (its existence was verified above) + conn = sqlite3.connect(db) + cursor = conn.cursor() + + # read TSV + df = pandas.read_csv(args.tsv, + header=0, + sep="\t") + + date = int(args.date) + datamount = args.datamount + + # check if datamount is new or known + ndatamounts = count_column_equals_value(cursor=cursor, + column_name="datamount_name", + value=datamount, + table_name="datamounts") + if ndatamounts == 0: # this datamount is new... so add it + add_datamount(cursor=cursor,datamount=datamount) + conn.commit() + datamount_id = check_value_and_return_primary_key(cursor=cursor, + column_name="datamount_name", + table_name="datamounts", + primary_key_column="datamount_id", + value=datamount) + + # check if date is new or known + ndate = count_column_equals_value(cursor=cursor,column_name="date_int",table_name="dates",value=date) + if ndate == 0: # date is new ... so add it + add_date(cursor=cursor,date=date) + conn.commit() + else: # date is known ... check if datapoints exist for this date+datamount combination + ndatapoints = count_two_columns_equals_two_values(cursor=cursor, + column_name1="date_int", + value1=date, + column_name2="datamount_id", + value2=datamount_id, + table_name="datapoints") + if ndatapoints != 0: # data already entered for this date+datamount combo ... 
nothing to do + print("WARNING: db already contains {} data points for date {} and datamount {}".format(ndatapoints,date,datamount)) + conn.close() + exit(0) + + count = 0 + # Iterate over rows and append data + for row in df.itertuples(index=False): + # ignore all_users rows + if row.uid == 0: continue + + # check if user is new or known + nuid = count_column_equals_value(cursor=cursor,column_name="user_id",table_name="users",value=row.uid) + if nuid == 0: # user does not exist so add user + add_user(cursor=cursor, + user_id=row.uid, + username=row.username) + + # add datapoint + add_datapoint(cursor=cursor, + date=date, + datamount_id=datamount_id, + user_id=row.uid, + ninodes=row.ninodes, + nbytes=row.nbytes) + # no per-row commit: all rows are committed together below, so a failed load leaves no partial data that would block a re-run + count += 1 + + # Commit changes and close the connection + conn.commit() + conn.close() + print(f"{count} new datapoints appended to database:{args.database}") + +if __name__ == "__main__": + main() diff --git a/src/VERSION b/src/VERSION index 34a8361..54d1a4f 100644 --- a/src/VERSION +++ b/src/VERSION @@ -1 +1 @@ -0.12.1 +0.13.0