Skip to content
This repository has been archived by the owner on Mar 1, 2021. It is now read-only.

Get table completion percentage #46

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions get_db_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from os import getenv

from pagai.services.database_explorer import POSTGRES, ORACLE, DatabaseExplorer
from pagai.errors import OperationOutcome
from sqlalchemy.exc import OperationalError
import json

def get_col_completion(owner, table_name, sorted, db_model):
    """
    Return the percentage of completion for all columns in the given table.

    Connection settings are read from the DB_HOST, DB_PORT, DB_NAME, DB_USER
    and DB_PASSWORD environment variables.

    Args:
        owner: schema owner forwarded to ``DatabaseExplorer.get_db_schema``.
        table_name: name of the table whose columns are measured.
        sorted: forwarded as ``sort`` to ``get_column_completion``. The name
            shadows the builtin but is kept so keyword callers still work.
        db_model: key selecting the driver; must be "POSTGRES" or "ORACLE".

    Returns:
        A CSV-friendly string with one "<column>, <percentage> \\n" line per
        item returned by ``get_column_completion``.

    Raises:
        OperationOutcome: if ``db_model`` is not supported, if the database
            cannot be reached, or if any other error occurs while querying.
    """
    # NOTE(review): "Where will this be called? Do you think we should make
    # it a route (as for get_db_schema)?"
    # NOTE(review): "When do you think sorted will be useful?"

    # switch on the possible db models
    # if the db model is not supported, an error is raised.
    db_drivers = {"POSTGRES": POSTGRES, "ORACLE": ORACLE}

    credentials = {
        # NOTE(review): "public?" -- this looks like a schema name rather
        # than a database model; confirm what DatabaseExplorer expects here.
        'model': 'public',
        'host': getenv('DB_HOST'),
        'port': int(getenv('DB_PORT', 1531)),
        'database': getenv('DB_NAME'),
        'login': getenv('DB_USER'),
        'password': getenv('DB_PASSWORD'),
    }

    if db_model not in db_drivers:
        # Report the value that was actually rejected.
        raise OperationOutcome(f"Database type {db_model} is unknown")

    driver = db_drivers[db_model]
    try:
        explorer = DatabaseExplorer(driver, credentials)
        # Use the caller-supplied owner and the resolved driver (the previous
        # code referenced an undefined `db_config` and ignored `owner`).
        schema = explorer.get_db_schema(owner=owner, driver=driver)
        col_completion = explorer.get_column_completion(
            db_schema=schema, table=table_name, sort=sorted
        )

        # Return CSV friendly formatting
        return "".join(f"{item[0]}, {item[1]} \n" for item in col_completion)

    except OperationalError as e:
        if "could not connect to server" in str(e):
            raise OperationOutcome(f"Could not connect to the database: {e}") from e
        raise OperationOutcome(e) from e
    except Exception as e:
        # Boundary handler: surface every failure as an OperationOutcome while
        # keeping the original exception as the cause.
        raise OperationOutcome(e) from e


17 changes: 17 additions & 0 deletions pagai/services/database_explorer.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,3 +134,20 @@ def get_db_schema(self, owner: str, driver=POSTGRES):
for row in result:
db_schema[row["table_name"].lower()].append(row["column_name"].lower())
return db_schema


def get_column_completion(self, db_schema: defaultdict, table: str, sort: bool):
    """
    Return a list of (column_name, percentage of completion) for all columns in a given table.

    The percentage is the share of rows where the column is not NULL,
    rounded to 0 decimals. It is computed by one aggregate query run through
    ``self._sql_engine``.

    Args:
        db_schema: mapping of table name to its list of column names.
        table: table to inspect; must be a key of ``db_schema``.
        sort: when true, columns are reported in alphabetical order,
            otherwise in the order given by ``db_schema``.

    Returns:
        A list of ``(column_name, percentage)`` tuples. The list is empty
        when the table is unknown or has no columns. The percentage is NULL
        (``None``) when the table has no rows.
    """
    # .get() avoids inserting an empty entry when db_schema is a defaultdict.
    columns = db_schema.get(table) or []
    if not columns:
        return []
    column_list = sorted(columns) if sort else list(columns)

    # NOTE(review): "Do you think we could do that with sqlAlchemy?"
    # Identifiers are interpolated as-is; they come from db_schema, and the
    # guard above ensures `table` is one of its keys.
    # Multiplying by 100.0 first avoids integer division (which would yield
    # only 0 or 100); nullif() avoids a division by zero on an empty table.
    select_list = ", ".join(
        f"round(count({column}) * 100.0 / nullif(count(*), 0), 0)"
        for column in column_list
    )
    sql_query = f"select {select_list} from {table}"

    with self._sql_engine.connect() as connection:
        query_result = connection.execute(sql_query).fetchall()

    # Pair values with the same column order that was used in the query.
    return list(zip(column_list, query_result[0]))