From 205264cd3617996d223a9108f654a046014a4293 Mon Sep 17 00:00:00 2001 From: elsiehoffet-94 <56258130+elsiehoffet-94@users.noreply.github.com> Date: Thu, 16 Jul 2020 16:29:49 +0200 Subject: [PATCH 1/4] set context" --- pagai/services/database_explorer.py | 12 +++++++ test/test_column_completion.py | 50 +++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 test/test_column_completion.py diff --git a/pagai/services/database_explorer.py b/pagai/services/database_explorer.py index d0a9dab..c36051c 100644 --- a/pagai/services/database_explorer.py +++ b/pagai/services/database_explorer.py @@ -136,3 +136,15 @@ def get_db_schema(self, owner: str, driver=POSTGRES): for row in result: db_schema[row["table_name"]].append(row["column_name"]) return db_schema + + # TODO: write a function which takes a list of table names and a database schema as an argument, and returns a + # percentage of completion for all the columns in the given tables + # in the format {"table_A": {"col1": 0.9, "col2": 0.6}, "table_B": {"col3":0, "col4": 1}} + def get_column_completion(self, db_schema: defaultdict(list), tables: []): + + targeted_tables= dict((table_name, db_schema[table_name]) for table_name in tables) + for table_name,column_name in targeted_tables.items(): + # TODO: write sql query to compute completion percentage for each column + # TODO: connect and execute the query + # TODO: format and return result + return "" \ No newline at end of file diff --git a/test/test_column_completion.py b/test/test_column_completion.py new file mode 100644 index 0000000..8abbf4a --- /dev/null +++ b/test/test_column_completion.py @@ -0,0 +1,50 @@ +from os import getenv + +from pagai.services.database_explorer import POSTGRES, ORACLE, DatabaseExplorer +from pagai.errors import OperationOutcome +from sqlalchemy.exc import OperationalError +import json + +def get_col_completion(owner, table_list): + """ + TODO: add function description + """ + + # switch on the possible db models 
+ # if the db model is not supported, an error is raised. + db_drivers = {"POSTGRES": POSTGRES, "ORACLE": ORACLE} + + + credentials = { + 'model': 'ORACLE', + 'host': getenv('DB_HOST'), + 'port': int(getenv('DB_PORT', 1531)), + 'database': getenv('DB_NAME'), + 'login': getenv('DB_USER'), + 'password': getenv('DB_PASSWORD'), + } + print(credentials) + + db_model = "ORACLE" + if db_model not in db_drivers: + raise OperationOutcome(f"Database type {credentials.get('model')} is unknown") + + + try: + # Explore the Database + explorer = DatabaseExplorer(db_drivers[db_model], credentials) + # This returns the database schema, a dict of the form {"table_A": ["col1", "col2"], "table_B": ["col3", "col4"]} + schema = explorer.get_db_schema(owner=owner, driver=ORACLE) + col_completion = explorer.get_column_completion(db_schema=schema, tables=table_list) + return col_completion + except OperationalError as e: + if "could not connect to server" in str(e): + raise OperationOutcome(f"Could not connect to the database: {e}") + else: + raise OperationOutcome(e) + except Exception as e: + raise OperationOutcome(e) + +test = get_col_completion(owner='V500', table_list=["UF", "MALADE"]) + +print(test) \ No newline at end of file From 7df64d1010a1bafcda02f7cbbdef493815b902a6 Mon Sep 17 00:00:00 2001 From: Elsie Hoffet Date: Mon, 27 Jul 2020 10:50:20 +0000 Subject: [PATCH 2/4] add column completion to database explorer --- pagai/services/database_explorer.py | 32 ++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/pagai/services/database_explorer.py b/pagai/services/database_explorer.py index c36051c..56ce794 100644 --- a/pagai/services/database_explorer.py +++ b/pagai/services/database_explorer.py @@ -140,11 +140,29 @@ def get_db_schema(self, owner: str, driver=POSTGRES): # TODO: write a function which takes a list of table names and a database schema as an argument, and returns a # percentage of completion for all the columns in the given tables 
# in the format {"table_A": {"col1": 0.9, "col2": 0.6}, "table_B": {"col3":0, "col4": 1}} - def get_column_completion(self, db_schema: defaultdict(list), tables: []): + + def get_column_completion(self, db_schema: defaultdict(list), table: str, sort: bool): + """ + Returns the percentage of completion for all columns in the given table + """ + # result_display = {} + + # for column in db_schema[table] : + # sql_query = text(f"select round(count({column}) / count(*) * 100, 0) from {table}") - targeted_tables= dict((table_name, db_schema[table_name]) for table_name in tables) - for table_name,column_name in targeted_tables.items(): - # TODO: write sql query to compute completion percentage for each column - # TODO: connect and execute the query - # TODO: format and return result - return "" \ No newline at end of file + # with self._sql_engine.connect() as connection: + # result = connection.execute(sql_query).fetchall() + # result_display[column] = result[0][0] + # return result_display + # result_display = '' + query_tmp = "" + column_list = sorted(db_schema[table]) if sort else db_schema[table] + for column in column_list[:-1] : + query_tmp += f"round(100.0 * count({column}) / count(*), 0), " + query_tmp += f"round(100.0 * count({column_list[-1]}) / count(*), 0) " + sql_query = f"select {query_tmp} from {table}" + + with self._sql_engine.connect() as connection: + query_result = connection.execute(sql_query).fetchall() + result = list(zip(column_list, query_result[0])) + return result \ No newline at end of file From 2c7d12f63937b1bdf666e8905c64db9b6cad165a Mon Sep 17 00:00:00 2001 From: Elsie Hoffet Date: Mon, 27 Jul 2020 12:53:32 +0000 Subject: [PATCH 3/4] add user tests --- pagai/services/database_explorer.py | 15 +-------------- test/test_column_completion.py | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 26 deletions(-) diff --git a/pagai/services/database_explorer.py b/pagai/services/database_explorer.py index 56ce794..15f3673 100644 ---
a/pagai/services/database_explorer.py +++ b/pagai/services/database_explorer.py @@ -137,24 +137,11 @@ def get_db_schema(self, owner: str, driver=POSTGRES): db_schema[row["table_name"]].append(row["column_name"]) return db_schema - # TODO: write a function which takes a list of table names and a database schema as an argument, and returns a - # percentage of completion for all the columns in the given tables - # in the format {"table_A": {"col1": 0.9, "col2": 0.6}, "table_B": {"col3":0, "col4": 1}} def get_column_completion(self, db_schema: defaultdict(list), table: str, sort: bool): """ - Returns the percentage of completion for all columns in the given table + Returns a list of (column_name, percentage of completion) for all columns in a given table """ - # result_display = {} - - # for column in db_schema[table] : - # sql_query = text(f"select round(count({column}) / count(*) * 100, 0) from {table}") - - # with self._sql_engine.connect() as connection: - # result = connection.execute(sql_query).fetchall() - # result_display[column] = result[0][0] - # return result_display - # result_display = '' query_tmp = "" column_list = sorted(db_schema[table]) if sort else db_schema[table] for column in column_list[:-1] : diff --git a/test/test_column_completion.py b/test/test_column_completion.py index 8abbf4a..13229b8 100644 --- a/test/test_column_completion.py +++ b/test/test_column_completion.py @@ -5,16 +5,17 @@ from sqlalchemy.exc import OperationalError import json -def get_col_completion(owner, table_list): +def get_col_completion(owner, table_name, sorted): """ - TODO: add function description + Returns the percentage of completion for all columns in the given table """ # switch on the possible db models # if the db model is not supported, an error is raised. 
db_drivers = {"POSTGRES": POSTGRES, "ORACLE": ORACLE} - + with open("/home/arkhn/git/pagai/schema.json") as f: + schema=json.load(f) credentials = { 'model': 'ORACLE', 'host': getenv('DB_HOST'), @@ -23,20 +24,20 @@ def get_col_completion(owner, table_list): 'login': getenv('DB_USER'), 'password': getenv('DB_PASSWORD'), } - print(credentials) db_model = "ORACLE" if db_model not in db_drivers: raise OperationOutcome(f"Database type {credentials.get('model')} is unknown") - + result_display = "" try: - # Explore the Database explorer = DatabaseExplorer(db_drivers[db_model], credentials) - # This returns the database schema, a dict of the form {"table_A": ["col1", "col2"], "table_B": ["col3", "col4"]} - schema = explorer.get_db_schema(owner=owner, driver=ORACLE) - col_completion = explorer.get_column_completion(db_schema=schema, tables=table_list) - return col_completion + col_completion = explorer.get_column_completion(db_schema=schema, table=table_name, sort=sorted) + + # Format CSV friendly + for item in col_completion: + result_display += f"{item[0]}, {item[1]} \n" + return result_display except OperationalError as e: if "could not connect to server" in str(e): raise OperationOutcome(f"Could not connect to the database: {e}") @@ -45,6 +46,5 @@ def get_col_completion(owner, table_list): except Exception as e: raise OperationOutcome(e) -test = get_col_completion(owner='V500', table_list=["UF", "MALADE"]) -print(test) \ No newline at end of file +test = get_col_completion(owner='V500', table_name="UF", sorted= True) \ No newline at end of file From b64fbeec2e3250161670edd3488824fd7d0e23ce Mon Sep 17 00:00:00 2001 From: elsiehoffet-94 <56258130+elsiehoffet-94@users.noreply.github.com> Date: Mon, 27 Jul 2020 15:05:31 +0200 Subject: [PATCH 4/4] move user file --- test/test_column_completion.py => get_db_stats.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) rename test/test_column_completion.py => get_db_stats.py (83%) diff --git 
a/test/test_column_completion.py b/get_db_stats.py similarity index 83% rename from test/test_column_completion.py rename to get_db_stats.py index 13229b8..8677575 100644 --- a/test/test_column_completion.py +++ b/get_db_stats.py @@ -5,7 +5,7 @@ from sqlalchemy.exc import OperationalError import json -def get_col_completion(owner, table_name, sorted): +def get_col_completion(owner, table_name, sorted, db_model): """ Returns the percentage of completion for all columns in the given table """ @@ -14,10 +14,8 @@ def get_col_completion(owner, table_name, sorted): # if the db model is not supported, an error is raised. db_drivers = {"POSTGRES": POSTGRES, "ORACLE": ORACLE} - with open("/home/arkhn/git/pagai/schema.json") as f: - schema=json.load(f) credentials = { - 'model': 'ORACLE', + 'model': db_model, 'host': getenv('DB_HOST'), 'port': int(getenv('DB_PORT', 1531)), 'database': getenv('DB_NAME'), @@ -25,19 +23,21 @@ def get_col_completion(owner, table_name, sorted): 'password': getenv('DB_PASSWORD'), } - db_model = "ORACLE" + # db_model = "public" if db_model not in db_drivers: raise OperationOutcome(f"Database type {credentials.get('model')} is unknown") result_display = "" try: explorer = DatabaseExplorer(db_drivers[db_model], credentials) + schema = explorer.get_db_schema(owner=owner, driver=db_drivers[db_model]) col_completion = explorer.get_column_completion(db_schema=schema, table=table_name, sort=sorted) - # Format CSV friendly + # Return CSV friendly formatting for item in col_completion: result_display += f"{item[0]}, {item[1]} \n" return result_display + except OperationalError as e: if "could not connect to server" in str(e): raise OperationOutcome(f"Could not connect to the database: {e}") @@ -47,4 +47,3 @@ def get_col_completion(owner, table_name, sorted): raise OperationOutcome(e) -test = get_col_completion(owner='V500', table_name="UF", sorted= True) \ No newline at end of file