Snowflake-Labs · ashishsirt · Oct 15, 2024
diff --git a/.gitignore b/.gitignore
@@ -133,3 +133,6 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+oracle-database-xe-21c-1.0-1.ol7.x86_64.rpm
+oracle-database-xe*.rpm
+oracle-database-xe*
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+    "git.ignoreLimitWarning": true
+}
diff --git a/Project_Dep/Analyzer.py b/Project_Dep/Analyzer.py
@@ -0,0 +1,70 @@
+import sqlglot as sq
+import streamlit as st
+import re
+import networkx as nx
+import matplotlib.pyplot as plt
+# from sqlglot import expressions 
+# Function to extract tables and columns from the AST
+parent_object=''
+def extract_tables(parsed_query):
+    """Collect the CREATE target and the table names found in a parsed statement.
+
+    Returns ``(parent_object, tables)``: ``parent_object`` is ``this`` of the
+    last ``Create`` node the walk yields ('' when there is none) and
+    ``tables`` is the set of ``name`` values of every ``Table`` node.
+    """
+    # Local default: without it a statement with no CREATE hit UnboundLocalError.
+    parent_object = ''
+    tables = set()
+    for node in parsed_query.walk():
+        if isinstance(node, sq.expressions.Create):
+            parent_object = node.this
+        if isinstance(node, sq.expressions.Table):
+            tables.add(node.name)
+    return parent_object, tables
+
+def read_query_from_path(file):
+    """Read the SQL text stored at ``file`` and return ``sq.parse_one`` of it."""
+    with open(file, 'r') as handle:
+        sql_text = handle.read()
+    return sq.parse_one(sql_text)
+
+# NOTE(review): the path says 'Project Dep' while this file sits in Project_Dep/ - confirm.
+# file='/workspaces/sfguide-data-engineering-with-snowpark-python-intro/Project Dep/view_demo.sql'
+file='/workspaces/sfguide-data-engineering-with-snowpark-python-intro/Project Dep/proc.sql'
+
+# Parse the file a single time and pull out the CREATE target together
+# with the table names.
+parent, tables = extract_tables(read_query_from_path(file))
+print("Parent:", parent)
+print("Tables:", tables)
+
+dependencies=[]
+
+def create_hierarchy_graph(parent, tables):
+    """Return a DiGraph holding ``parent`` and one ``parent -> table`` edge per table.
+
+    ``parent`` is registered up front, so it is a node of the result even
+    when ``tables`` is empty.
+    """
+    hierarchy = nx.DiGraph()
+    hierarchy.add_node(parent)
+
+    for child in tables:
+        # add_edge registers a missing endpoint as a node before linking it.
+        hierarchy.add_edge(parent, child)
+    return hierarchy
+# Build the graph once and reuse it for both the printout and the drawing.
+G = create_hierarchy_graph(parent, tables)
+print(G)
+pos = nx.spring_layout(G, seed=42)  # You can experiment with different layouts like shell_layout, etc.
+
+# Draw the graph using matplotlib. The Figure is kept in a variable so that
+# Streamlit receives the figure itself instead of the pyplot module.
+fig = plt.figure(figsize=(8, 6))
+nx.draw(G, pos, with_labels=True, node_size=3000, node_color='skyblue', font_size=10, font_color='black', font_weight='bold', edge_color='gray')
+# NOTE(review): this title names v_department_summary regardless of the file analysed.
+plt.title("Hierarchy of v_department_summary and Related Tables")
+
+# Hand the figure to Streamlit for rendering.
+st.pyplot(fig)
diff --git a/Project_Dep/Analyzer_Plotly.py b/Project_Dep/Analyzer_Plotly.py
@@ -0,0 +1,123 @@
+import sqlglot as sq
+import streamlit as st
+import re
+import networkx as nx
+import plotly.graph_objects as go
+
+# Function to extract tables and columns from the AST
+def extract_tables(parsed_query):
+    """Scan a parsed statement for its CREATE target and the tables it touches.
+
+    Returns a ``(parent_object, tables)`` pair: ``parent_object`` is ``this``
+    of the last ``Create`` node the walk yields (``None`` if there is none)
+    and ``tables`` is the set of ``name`` values of all ``Table`` nodes.
+    """
+    exp = sq.expressions
+    created = None
+    table_names = set()
+    for item in parsed_query.walk():
+        if isinstance(item, exp.Create):
+            created = item.this
+        if isinstance(item, exp.Table):
+            table_names.add(item.name)
+    return created, table_names
+
+def read_query_from_path(file):
+    """Return what ``sq.parse_one`` produces for the SQL text stored at ``file``."""
+    with open(file, 'r') as source:
+        return sq.parse_one(source.read())
+
+# Hard-coded input file. NOTE(review): 'Project Dep' vs. this file's Project_Dep/ directory - confirm.
+file = '/workspaces/sfguide-data-engineering-with-snowpark-python-intro/Project Dep/view_demo.sql'
+
+# Read the file and parse its SQL with sqlglot
+parsed_query = read_query_from_path(file)
+
+# parent is the CREATE target (None when the statement has no CREATE); tables is a set of table names
+parent, tables = extract_tables(parsed_query)
+
+# Print both results for a quick visual check
+print("Parent:", parent)
+print("Tables:", tables)
+
+# Function to create a hierarchy graph using NetworkX
+def create_hierarchy_graph(parent, tables):
+    """Return a DiGraph with ``parent`` linked to every entry of ``tables``.
+
+    ``parent`` is added first, so it is a node even if ``tables`` is empty.
+    """
+    hierarchy = nx.DiGraph()
+    hierarchy.add_node(parent)
+    for child in tables:
+        # add_edge registers a missing endpoint as a node before linking it.
+        hierarchy.add_edge(parent, child)
+    return hierarchy
+
+# Build the dependency graph: one edge from the parent to each extracted table
+G = create_hierarchy_graph(parent, tables)
+
+# Convert the NetworkX graph to a format compatible with Plotly
+def networkx_to_plotly(G):
+    """Lay out ``G`` and flatten it into coordinate lists for Plotly scatter traces.
+
+    Returns ``(node_x, node_y, edge_x, edge_y)``. Node lists follow the order of
+    the layout dict; each edge contributes its two endpoints followed by ``None``.
+    """
+    pos = nx.spring_layout(G, seed=42)  # fixed seed: same layout on every run
+    node_x = [x for x, _ in pos.values()]
+    node_y = [y for _, y in pos.values()]
+    edge_x, edge_y = [], []
+    for src, dst in G.edges():
+        (x0, y0), (x1, y1) = pos[src], pos[dst]
+        # The trailing None breaks the line so that consecutive edges are not
+        # joined into one continuous polyline by a 'lines' trace.
+        edge_x += [x0, x1, None]
+        edge_y += [y0, y1, None]
+    return node_x, node_y, edge_x, edge_y
+
+# Convert to Plotly data
+node_x, node_y, edge_x, edge_y = networkx_to_plotly(G)
+
+# Create the edge trace (lines)
+edge_trace = go.Scatter(
+    x=edge_x, y=edge_y,
+    line=dict(width=0.5, color='#888'),
+    hoverinfo='none',
+    mode='lines'
+)
+
+# Create the node trace (points)
+node_trace = go.Scatter(
+    x=node_x, y=node_y,
+    mode='markers+text',
+    hoverinfo='text',
+    marker=dict(
+        showscale=True,
+        colorscale='YlGnBu',  # Change this to another color scale if you prefer
+        size=50,
+        # colorbar.title.side replaces the deprecated flat `titleside` shortcut.
+        colorbar=dict(thickness=15, title=dict(text='Node Connections', side='right'), xanchor='left')
+    )
+)
+
+# Add node labels (one per node, in G.nodes() order)
+node_trace.text = list(G.nodes())
+
+# Spread the colour values over [0, 1) by node position.
+# NOTE(review): the colorbar is titled 'Node Connections' but these values are not connection counts.
+node_trace.marker.color = [i / len(G.nodes()) for i in range(len(G.nodes()))]
+
+# Create the layout for the plot; the title font size is set through
+# title.font.size because the flat `titlefont_size` shortcut is deprecated.
+layout = go.Layout(
+    title=dict(text="Hierarchy of v_department_summary and Related Tables", font=dict(size=16)),
+    showlegend=False,
+    hovermode='closest',
+    margin=dict(b=0, l=0, r=0, t=40),
+    xaxis=dict(showgrid=False, zeroline=False),
+    yaxis=dict(showgrid=False, zeroline=False)
+)
+
+# Create the Plotly figure and show the interactive graph
+fig = go.Figure(data=[edge_trace, node_trace], layout=layout)
+fig.show()
diff --git a/Project_Dep/Analyzer_pyplot.py b/Project_Dep/Analyzer_pyplot.py
@@ -0,0 +1,66 @@
+import sqlglot as sq
+import streamlit as st
+import re
+import networkx as nx
+import matplotlib.pyplot as plt
+# from sqlglot import expressions 
+# Function to extract tables and columns from the AST
+def extract_tables(parsed_query):
+    """Collect the CREATE target and the table names found in a parsed statement.
+
+    Returns ``(parent_object, tables)``: ``parent_object`` is ``this`` of the
+    last ``Create`` node the walk yields (``None`` when there is none) and
+    ``tables`` is the set of ``name`` values of every ``Table`` node.
+    """
+    # Bound up front: a statement with no CREATE used to hit UnboundLocalError.
+    parent_object = None
+    tables = set()
+    for node in parsed_query.walk():
+        if isinstance(node, sq.expressions.Create):
+            parent_object = node.this
+        if isinstance(node, sq.expressions.Table):
+            tables.add(node.name)
+    return parent_object, tables
+
+def read_query_from_path(file):
+    """Load the SQL text found at ``file`` and hand it to ``sq.parse_one``."""
+    with open(file, 'r') as sql_file:
+        contents = sql_file.read()
+    return sq.parse_one(contents)
+
+# NOTE(review): the path says 'Project Dep' while this file sits in Project_Dep/ - confirm.
+file='/workspaces/sfguide-data-engineering-with-snowpark-python-intro/Project Dep/view_demo.sql'
+
+# Parse the file a single time and pull out the CREATE target together
+# with the table names.
+parent, tables = extract_tables(read_query_from_path(file))
+print("Parent:", parent)
+print("Tables:", tables)
+
+dependencies=[]
+
+def create_hierarchy_graph(parent, tables):
+    """Assemble the parent-to-table hierarchy as a ``networkx.DiGraph``.
+
+    The graph holds ``parent`` plus one ``parent -> table`` edge for each
+    entry of ``tables``; ``parent`` is a node even when ``tables`` is empty.
+    """
+    digraph = nx.DiGraph()
+    digraph.add_node(parent)
+
+    for member in tables:
+        # add_edge registers a missing endpoint as a node before linking it.
+        digraph.add_edge(parent, member)
+    return digraph
+# Build the graph once and reuse it for both the printout and the drawing.
+G = create_hierarchy_graph(parent, tables)
+print(G)
+
+pos = nx.spring_layout(G, seed=42)  # You can experiment with different layouts like shell_layout, etc.
+
+# Draw the graph using matplotlib
+plt.figure(figsize=(8, 6))
+nx.draw(G, pos, with_labels=True, node_size=3000, node_color='skyblue', font_size=10, font_color='black', font_weight='bold', edge_color='gray')
+plt.title("Hierarchy of v_department_summary and Related Tables")
+plt.savefig('plot.jpg')  # save before show(): the figure may be gone once the window is closed
+plt.show()
diff --git a/Project_Dep/gpt_Analyser.py b/Project_Dep/gpt_Analyser.py
@@ -0,0 +1,122 @@
+import sqlglot
+import graphviz
+
+class SQLDependencyExtractor:
+    """Extract table and column dependencies from a single SQL statement.
+
+    Results accumulate in ``self.dependencies``:
+    ``{"tables": {table: [columns]}, "lineage": [{"table": ..., "column": ...}]}``.
+    """
+
+    def __init__(self, sql_query):
+        self.sql_query = sql_query
+        self.dependencies = {"tables": {}, "lineage": []}
+
+    def parse_sql(self):
+        """Parse the SQL query into an Abstract Syntax Tree (AST).
+
+        Errors raised by ``sqlglot.parse_one`` propagate to the caller.
+        """
+        return sqlglot.parse_one(self.sql_query)
+
+    def extract_dependencies(self, parsed):
+        """Record every table, and every table-qualified column, found in ``parsed``.
+
+        Columns are attributed through their qualifier, which may be a table
+        alias; unqualified columns and unknown qualifiers are skipped. A column
+        referenced several times is recorded once per reference.
+        """
+        tables = self.dependencies['tables']
+        alias_to_table = {}
+
+        # Pass 1: register each table and remember the name columns use for it.
+        for table in parsed.find_all(sqlglot.expressions.Table):
+            if table.name:
+                tables.setdefault(table.name, [])
+                alias_to_table[table.alias_or_name] = table.name
+
+        # Pass 2: resolve each column's qualifier back to its table.
+        for column in parsed.find_all(sqlglot.expressions.Column):
+            table_name = alias_to_table.get(column.table)
+            if table_name is None:
+                # Unqualified column, or a qualifier that is not a known table.
+                continue
+            tables[table_name].append(column.name)
+            self.dependencies['lineage'].append({"table": table_name, "column": column.name})
+
+    # def generate_digraph(self, output_file="table_column_hierarchy"):
+    #     """Generate a hierarchy diagram (Digraph) to visualize table-column dependencies."""
+    #     graph = graphviz.Digraph(comment='Table-Column Dependency Graph')
+
+    #     # Add table nodes
+    #     for table in self.dependencies['tables']:
+    #         graph.node(table, table)
+
+    #     # Add column dependencies (edges)
+    #     for lineage in self.dependencies['lineage']:
+    #         graph.edge(lineage['table'], f"{lineage['table']}.{lineage['column']}")
+
+    #     # Save and render the graph as a PNG file
+    #     graph.render(output_file, format='png')
+
+    def get_dependencies(self):
+        """Parse the query, extract its dependencies and return them."""
+        parsed = self.parse_sql()
+        print(parsed)
+        self.extract_dependencies(parsed)
+        return self.dependencies
+
+    # def display_dependencies(self):
+    #     """Display the extracted dependencies in a readable format."""
+    #     print("Tables and Columns:")
+    #     for table, columns in self.dependencies["tables"].items():
+    #         print(f"Table: {table}")
+    #         for column in columns:
+    #             print(f"  - Column: {column}")
+
+    #     print("\nLineage (Table -> Column):")
+    #     for line in self.dependencies["lineage"]:
+    #         print(f"Table: {line['table']}, Column: {line['column']}")
+
+# Example SQL query: a CREATE OR REPLACE VIEW over six joined tables
+sql_query = """
+CREATE OR REPLACE VIEW v_department_summary AS
+SELECT
+    d.department_id,
+    d.department_name,
+    d.manager_id,
+    l.city AS department_location,
+    c.country_name,
+    r.region_name,
+    COUNT(e.employee_id) AS total_employees,
+    AVG(e.salary) AS avg_salary,
+    LISTAGG(j.job_title, ', ') WITHIN GROUP (ORDER BY j.job_title) AS job_titles
+FROM
+    employees e
+JOIN
+    departments d ON e.department_id = d.department_id
+JOIN
+    locations l ON d.location_id = l.location_id
+JOIN
+    countries c ON l.country_id = c.country_id
+JOIN
+    regions r ON c.region_id = r.region_id
+JOIN
+    jobs j ON e.job_id = j.job_id
+GROUP BY
+    d.department_id, d.department_name, d.manager_id, l.city, c.country_name, r.region_name
+ORDER BY
+    d.department_name;
+"""
+
+# Build an extractor for the example query and print the dependency dict it returns
+extractor = SQLDependencyExtractor(sql_query)
+print(extractor.get_dependencies())
+# dependencies = extractor.get_dependencies()
+
+# Display the dependencies
+# extractor.display_dependencies()
+
+# Generate and render the Digraph to visualize the table-column hierarchy
+# extractor.generate_digraph(output_file="table_column_hierarchy")
+# print("Dependency graph generated and saved as 'table_column_hierarchy.png'")