Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

First Commit #16

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -133,3 +133,6 @@ dmypy.json

# Pyre type checker
.pyre/
oracle-database-xe-21c-1.0-1.ol7.x86_64.rpm
oracle-database-xe*.rpm
oracle-database-xe*
3 changes: 3 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"git.ignoreLimitWarning": true
}
70 changes: 70 additions & 0 deletions Project_Dep/Analyzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import sqlglot as sq
import streamlit as st
import re
import networkx as nx
import matplotlib.pyplot as plt
# from sqlglot import expressions
# Function to extract the created object and the referenced table names from the AST
# Default value reported when the parsed statement contains no CREATE node.
parent_object = ''


def extract_tables(parsed_query):
    """Collect the created object and every table name used in a statement.

    Args:
        parsed_query: sqlglot expression tree, e.g. the value returned by
            ``sq.parse_one``.

    Returns:
        tuple: ``(parent, tables)``. ``parent`` is the ``this`` argument of
        the last ``CREATE`` node visited, or the module-level
        ``parent_object`` default when the tree has no ``CREATE`` node.
        ``tables`` is the set of names of all ``Table`` nodes in the tree.
    """
    # Start from the module-level default. The previous version assigned to
    # ``parent_object`` inside the function, which made the name local and
    # raised UnboundLocalError for any statement without a CREATE node.
    created_object = parent_object
    tables = set()

    for node in parsed_query.walk():
        if isinstance(node, sq.expressions.Create):
            created_object = node.this
        if isinstance(node, sq.expressions.Table):
            tables.add(node.name)

    return created_object, tables

def read_query_from_path(file):
    """Load the SQL text stored at ``file`` and parse it with sqlglot.

    Args:
        file: Path of the SQL file to read.

    Returns:
        The expression tree produced by ``sq.parse_one`` for the file's
        contents.
    """
    with open(file, 'r') as sql_file:
        sql_text = sql_file.read()
    return sq.parse_one(sql_text)

# file='/workspaces/sfguide-data-engineering-with-snowpark-python-intro/Project Dep/view_demo.sql'
# NOTE(review): absolute, environment-specific path; the directory is spelled
# "Project Dep" here - confirm it matches the folder name on disk.
file='/workspaces/sfguide-data-engineering-with-snowpark-python-intro/Project Dep/proc.sql'

# Parse the statement once and extract the created object and table names.
# (An earlier extra read_query_from_path(file) call parsed the same file and
# discarded the result, so it has been removed.)
parent, tables = extract_tables(read_query_from_path(file))
print("Parent:", parent)
print("Tables:", tables)

# Not used by the code below; kept so the module-level name stays available.
dependencies = []

def create_hierarchy_graph(parent, tables):
    """Build a directed graph with one edge from ``parent`` to each table.

    Args:
        parent: Node used as the source of every edge.
        tables: Iterable of nodes that become the edge targets.

    Returns:
        The populated ``nx.DiGraph``.
    """
    graph = nx.DiGraph()

    # The parent is added first so it exists even when ``tables`` is empty.
    graph.add_node(parent)

    for child in tables:
        graph.add_node(child)
        graph.add_edge(parent, child)

    return graph
# Build the graph once and print it (previously a second, throwaway graph
# was constructed only for this print call).
G = create_hierarchy_graph(parent, tables)
print(G)

# Fixed seed keeps the layout identical between runs.
pos = nx.spring_layout(G, seed=42)

# Draw the graph onto an explicit Figure so that a concrete figure object,
# rather than the pyplot module itself, is handed to Streamlit.
fig = plt.figure(figsize=(8, 6))
nx.draw(G, pos, with_labels=True, node_size=3000, node_color='skyblue', font_size=10, font_color='black', font_weight='bold', edge_color='gray')
# NOTE(review): the title is hard-coded to v_department_summary although
# ``file`` above points at proc.sql - confirm the intended wording.
plt.title("Hierarchy of v_department_summary and Related Tables")
st.pyplot(fig)
123 changes: 123 additions & 0 deletions Project_Dep/Analyzer_Plotly.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
import sqlglot as sq
import streamlit as st
import re
import networkx as nx
import plotly.graph_objects as go

# Function to extract the created object and the referenced table names from the AST
def extract_tables(parsed_query):
    """Walk a parsed statement and gather its CREATE target and table names.

    Args:
        parsed_query: sqlglot expression tree to inspect.

    Returns:
        tuple: ``(created, table_names)`` where ``created`` is the ``this``
        argument of the last ``CREATE`` node visited (``None`` if there is
        none) and ``table_names`` is the set of names of all ``Table`` nodes.
    """
    created = None
    table_names = set()

    for expression in parsed_query.walk():
        if isinstance(expression, sq.expressions.Table):
            table_names.add(expression.name)
        if isinstance(expression, sq.expressions.Create):
            created = expression.this

    return created, table_names

def read_query_from_path(file):
    """Read the SQL file at ``file`` and return its sqlglot parse tree.

    Args:
        file: Path of the SQL file to read.

    Returns:
        The expression returned by ``sq.parse_one`` for the file's text.
    """
    with open(file, 'r') as handle:
        statement_text = handle.read()
    parsed = sq.parse_one(statement_text)
    return parsed

# Path of the SQL file to analyse.
# NOTE(review): absolute, environment-specific path; the directory is spelled
# "Project Dep" here - confirm it matches the folder name on disk.
file = '/workspaces/sfguide-data-engineering-with-snowpark-python-intro/Project Dep/view_demo.sql'

# Read the file and parse it into a sqlglot expression tree
parsed_query = read_query_from_path(file)

# Extract the CREATE target (the ``this`` expression of the CREATE node, not a
# plain string) and the set of table names found in the tree
parent, tables = extract_tables(parsed_query)

# Print extracted parent and tables for verification
print("Parent:", parent)
print("Tables:", tables)

# Function to create a hierarchy graph using NetworkX
def create_hierarchy_graph(parent, tables):
    """Return a directed graph linking ``parent`` to every entry of ``tables``.

    Args:
        parent: Node that every edge starts from.
        tables: Iterable of nodes that each receive an edge from ``parent``.

    Returns:
        The resulting ``nx.DiGraph``.
    """
    hierarchy = nx.DiGraph()
    hierarchy.add_node(parent)  # present even if there are no tables

    for table_name in tables:
        hierarchy.add_node(table_name)
        hierarchy.add_edge(parent, table_name)

    return hierarchy

# Build the module-level graph used for plotting: the extracted parent is
# connected by one directed edge to each extracted table name.
G = create_hierarchy_graph(parent, tables)

# Convert the NetworkX graph to a format compatible with Plotly
def networkx_to_plotly(G):
    """Lay out ``G`` and flatten nodes and edges into coordinate lists.

    Args:
        G: NetworkX graph to convert.

    Returns:
        tuple: ``(node_x, node_y, edge_x, edge_y)``. The node lists hold one
        coordinate per node, in the order of the layout dictionary. The edge
        lists hold the two endpoints of every edge followed by ``None``.
    """
    # Fixed seed keeps the layout identical between runs.
    pos = nx.spring_layout(G, seed=42)

    node_x = [x for x, _ in pos.values()]
    node_y = [y for _, y in pos.values()]

    edge_x, edge_y = [], []
    for source, target in G.edges():
        x0, y0 = pos[source]
        x1, y1 = pos[target]
        # The trailing None breaks the line after each edge. Without it a
        # single Scatter "lines" trace also joins the end of one edge to the
        # start of the next, drawing segments that are not edges of G.
        edge_x.extend((x0, x1, None))
        edge_y.extend((y0, y1, None))

    return node_x, node_y, edge_x, edge_y

# Convert the graph to flat coordinate lists for Plotly
node_x, node_y, edge_x, edge_y = networkx_to_plotly(G)

# Edge trace: thin grey lines, no hover information
edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines',
)

# Node trace: labelled markers coloured along the 'YlGnBu' scale
node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers+text',
    hoverinfo='text',
    marker=dict(
        showscale=True,
        colorscale='YlGnBu',  # Change this to another color scale if you prefer
        size=50,
        colorbar=dict(
            thickness=15,
            # Nested title dict replaces the deprecated ``titleside`` attribute.
            title=dict(text='Node Connections', side='right'),
            xanchor='left',
        ),
    ),
)

# Node labels. The parent node is whatever extract_tables returned (a sqlglot
# expression rather than a string), so every label is converted with str().
node_trace.text = [str(node) for node in G.nodes()]

# Spread the node colours evenly over [0, 1) by node position.
# NOTE(review): the colour bar is titled 'Node Connections' but the value is
# the node index divided by the node count, not a degree - confirm intent.
node_count = len(G.nodes())
node_trace.marker.color = [i / node_count for i in range(node_count)]

# Layout: hide grid and zero lines, keep a small top margin for the title
layout = go.Layout(
    # Nested title dict replaces the deprecated ``titlefont_size`` attribute.
    title=dict(
        text="Hierarchy of v_department_summary and Related Tables",
        font=dict(size=16),
    ),
    showlegend=False,
    hovermode='closest',
    margin=dict(b=0, l=0, r=0, t=40),
    xaxis=dict(showgrid=False, zeroline=False),
    yaxis=dict(showgrid=False, zeroline=False),
)

# Create the Plotly figure
fig = go.Figure(data=[edge_trace, node_trace], layout=layout)

# Show the interactive graph
fig.show()
66 changes: 66 additions & 0 deletions Project_Dep/Analyzer_pyplot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import sqlglot as sq
import streamlit as st
import re
import networkx as nx
import matplotlib.pyplot as plt
# from sqlglot import expressions
# Function to extract the created object and the referenced table names from the AST
def extract_tables(parsed_query):
    """Collect the created object and every table name used in a statement.

    Args:
        parsed_query: sqlglot expression tree, e.g. the value returned by
            ``sq.parse_one``.

    Returns:
        tuple: ``(parent_object, tables)``. ``parent_object`` is the ``this``
        argument of the last ``CREATE`` node visited, or ``None`` when the
        tree has no ``CREATE`` node. ``tables`` is the set of names of all
        ``Table`` nodes in the tree.
    """
    # Initialise up front: without this, a statement that contains no CREATE
    # node left the name unbound and the return raised UnboundLocalError.
    parent_object = None
    tables = set()

    for node in parsed_query.walk():
        if isinstance(node, sq.expressions.Create):
            parent_object = node.this
        if isinstance(node, sq.expressions.Table):
            tables.add(node.name)

    return parent_object, tables

def read_query_from_path(file):
    """Parse the SQL statement contained in the file at ``file``.

    Args:
        file: Path of the SQL file to read.

    Returns:
        The expression tree that ``sq.parse_one`` builds from the file text.
    """
    with open(file, 'r') as source:
        raw_sql = source.read()
    return sq.parse_one(raw_sql)

# NOTE(review): absolute, environment-specific path; the directory is spelled
# "Project Dep" here - confirm it matches the folder name on disk.
file='/workspaces/sfguide-data-engineering-with-snowpark-python-intro/Project Dep/view_demo.sql'

# Parse the statement once and extract the created object and table names.
# (An earlier extra read_query_from_path(file) call parsed the same file and
# discarded the result, so it has been removed.)
parent, tables = extract_tables(read_query_from_path(file))
print("Parent:", parent)
print("Tables:", tables)

# Not used by the code below; kept so the module-level name stays available.
dependencies = []

def create_hierarchy_graph(parent, tables):
    """Create a directed graph whose edges run from ``parent`` to each table.

    Args:
        parent: Source node shared by all edges.
        tables: Iterable of target nodes.

    Returns:
        The constructed ``nx.DiGraph``.
    """
    digraph = nx.DiGraph()

    # Register the root first; it stays in the graph even without children.
    digraph.add_node(parent)

    for target in tables:
        digraph.add_node(target)
        digraph.add_edge(parent, target)

    return digraph
# Build the graph once and print it (previously a second, throwaway graph
# was constructed only for this print call).
G = create_hierarchy_graph(parent, tables)
print(G)

# Fixed seed keeps the layout identical between runs.
pos = nx.spring_layout(G, seed=42)

# Draw the graph using matplotlib
plt.figure(figsize=(8, 6))
nx.draw(G, pos, with_labels=True, node_size=3000, node_color='skyblue', font_size=10, font_color='black', font_weight='bold', edge_color='gray')
plt.title("Hierarchy of v_department_summary and Related Tables")

# Save before showing: once plt.show() returns, the figure may already have
# been closed, in which case a later savefig() writes an empty image.
plt.savefig('plot.jpg')
plt.show()
122 changes: 122 additions & 0 deletions Project_Dep/gpt_Analyser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
import sqlglot
import graphviz

class SQLDependencyExtractor:
    """Extract table and column dependencies from one SQL statement."""

    def __init__(self, sql_query):
        """Store the SQL text and start with an empty dependency record.

        Args:
            sql_query: Text of the SQL statement to analyse.
        """
        self.sql_query = sql_query
        # "tables" maps a table name to the list of its referenced column
        # names; "lineage" is a flat list of {"table": ..., "column": ...}.
        self.dependencies = {"tables": {}, "lineage": []}

    def parse_sql(self):
        """Parse the SQL query into an Abstract Syntax Tree (AST).

        Returns:
            The expression returned by ``sqlglot.parse_one``. Parse errors
            raised by sqlglot are not caught here.
        """
        return sqlglot.parse_one(self.sql_query)

    def extract_dependencies(self, parsed):
        """Record the tables and qualified columns referenced by ``parsed``.

        The previous implementation defined a visitor that was never called
        and compared ``node.type`` against strings, so nothing was ever
        recorded. This version inspects ``Table`` and ``Column`` nodes
        directly.

        Args:
            parsed: sqlglot expression tree of the statement.

        Returns:
            None. Results are accumulated in ``self.dependencies``; each
            (table, column) pair is stored once, in first-seen order.
        """
        tables = self.dependencies["tables"]
        lineage = self.dependencies["lineage"]

        # Map each qualifier usable in the statement (the alias, or the bare
        # table name when there is no alias) to the real table name.
        # Limitation: if the same alias is reused in different scopes, the
        # last table seen wins.
        qualifier_to_table = {}
        for table in parsed.find_all(sqlglot.expressions.Table):
            table_name = table.name
            if not table_name:
                continue
            tables.setdefault(table_name, [])
            qualifier_to_table[table.alias_or_name] = table_name

        for column in parsed.find_all(sqlglot.expressions.Column):
            # Columns without a resolvable qualifier cannot be attributed to
            # a table and are skipped.
            table_name = qualifier_to_table.get(column.table)
            if table_name is None:
                continue
            column_name = column.name
            if column_name not in tables[table_name]:
                tables[table_name].append(column_name)
                lineage.append({"table": table_name, "column": column_name})

    # def generate_digraph(self, output_file="table_column_hierarchy"):
    #     """Generate a hierarchy diagram (Digraph) to visualize table-column dependencies."""
    #     graph = graphviz.Digraph(comment='Table-Column Dependency Graph')

    #     # Add table nodes
    #     for table in self.dependencies['tables']:
    #         graph.node(table, table)

    #     # Add column dependencies (edges)
    #     for lineage in self.dependencies['lineage']:
    #         graph.edge(lineage['table'], f"{lineage['table']}.{lineage['column']}")

    #     # Save and render the graph as a PNG file
    #     graph.render(output_file, format='png')

    def get_dependencies(self):
        """Parse the stored SQL, extract its dependencies and return them.

        Returns:
            dict: ``self.dependencies`` with the keys ``"tables"`` and
            ``"lineage"``.
        """
        parsed = self.parse_sql()
        # Echo the parsed statement, as the original implementation did.
        print(parsed)
        self.extract_dependencies(parsed)
        return self.dependencies

# def display_dependencies(self):
# """Display the extracted dependencies in a readable format."""
# print("Tables and Columns:")
# for table, columns in self.dependencies["tables"].items():
# print(f"Table: {table}")
# for column in columns:
# print(f" - Column: {column}")

# print("\nLineage (Table -> Column):")
# for line in self.dependencies["lineage"]:
# print(f"Table: {line['table']}, Column: {line['column']}")

# Example SQL statement: a CREATE OR REPLACE VIEW over six joined tables,
# used as the input for the extractor below.
sql_query = """
CREATE OR REPLACE VIEW v_department_summary AS
SELECT
d.department_id,
d.department_name,
d.manager_id,
l.city AS department_location,
c.country_name,
r.region_name,
COUNT(e.employee_id) AS total_employees,
AVG(e.salary) AS avg_salary,
LISTAGG(j.job_title, ', ') WITHIN GROUP (ORDER BY j.job_title) AS job_titles
FROM
employees e
JOIN
departments d ON e.department_id = d.department_id
JOIN
locations l ON d.location_id = l.location_id
JOIN
countries c ON l.country_id = c.country_id
JOIN
regions r ON c.region_id = r.region_id
JOIN
jobs j ON e.job_id = j.job_id
GROUP BY
d.department_id, d.department_name, d.manager_id, l.city, c.country_name, r.region_name
ORDER BY
d.department_name;
"""

# Run the extractor on the example statement and print the dictionary
# returned by get_dependencies()
extractor = SQLDependencyExtractor(sql_query)
print(extractor.get_dependencies())
# dependencies = extractor.get_dependencies()

# Display the dependencies (disabled)
# extractor.display_dependencies()

# Generate and render the Digraph to visualize the table-column hierarchy (disabled)
# extractor.generate_digraph(output_file="table_column_hierarchy")
# print("Dependency graph generated and saved as 'table_column_hierarchy.png'")
Loading