insight gpt

samarth30 · Dec 9, 2024 · 90d1b38 · 90d1b38
commit 90d1b38
Show file tree

Hide file tree

Showing 23 changed files with 259,600 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+env
+gpt_env
+.env
diff --git a/README.md b/README.md
@@ -0,0 +1 @@
+# Insight GPT
diff --git a/__init__.py b/__init__.py
diff --git a/__pycache__/boilerplate.cpython-310.pyc b/__pycache__/boilerplate.cpython-310.pyc
diff --git a/__pycache__/boilerplate.cpython-311.pyc b/__pycache__/boilerplate.cpython-311.pyc
diff --git a/__pycache__/main.cpython-310.pyc b/__pycache__/main.cpython-310.pyc
diff --git a/__pycache__/main.cpython-311.pyc b/__pycache__/main.cpython-311.pyc
diff --git a/__pycache__/prefix.cpython-310.pyc b/__pycache__/prefix.cpython-310.pyc
diff --git a/__pycache__/prefix.cpython-311.pyc b/__pycache__/prefix.cpython-311.pyc
diff --git a/__pycache__/tools.cpython-310.pyc b/__pycache__/tools.cpython-310.pyc
diff --git a/__pycache__/tools.cpython-311.pyc b/__pycache__/tools.cpython-311.pyc
diff --git a/boilerplate.py b/boilerplate.py
@@ -0,0 +1,128 @@
+# JavaScript snippet that creates a single Google Maps marker from a
+# `markerData` object (lat, lng, name, address).  main.py substitutes it into
+# the SQL_PREFIX prompt template.
+# NOTE: the literal previously opened with four quotes, which put a stray,
+# unbalanced `"` in front of `var marker`; it now opens with exactly three.
+marker_boilerplate = """var marker = new google.maps.Marker({
+position: {lat: markerData.lat, lng: markerData.lng},
+map: map,
+title: markerData.name + ' - ' + markerData.address, 
+label: markerData.name
+});
+"""
+
+# Example SQL handed to the model through the SQL_PREFIX prompt (see main.py).
+# The query pairs every non-blacklisted sale (cs1) with the most recent earlier
+# non-blacklisted sale of the same condo unit (cs2), computes the difference
+# between the two closing dates, and averages that difference over all
+# non-blacklisted units in buildings of the market named 'Brickell'.
+# NOTE(review): the unit of `delta_days` depends on the column type of
+# closing_date (days if it is a DATE column) -- confirm against the schema.
+holding_period_boilerplate = """
+
+WITH sale_deltas AS (
+    SELECT 
+        cs1.condo_unit_id, 
+        cs1.closing_date AS current_closing_date, 
+        cs2.closing_date AS previous_closing_date,
+        (cs1.closing_date - cs2.closing_date) AS delta_days
+    FROM 
+        core_condosale cs1
+    JOIN 
+        core_condosale cs2 
+    ON 
+        cs1.condo_unit_id = cs2.condo_unit_id 
+    WHERE 
+        cs1.closing_date > cs2.closing_date
+        AND cs1.blacklist = FALSE
+        AND cs2.blacklist = FALSE
+        AND cs2.closing_date = (
+            SELECT MAX(cs3.closing_date)
+            FROM core_condosale cs3
+            WHERE cs3.condo_unit_id = cs1.condo_unit_id
+            AND cs3.closing_date < cs1.closing_date
+            AND cs3.blacklist = FALSE
+        )
+        AND cs1.condo_unit_id IN (
+            SELECT id 
+            FROM core_condounit 
+            WHERE blacklist = FALSE 
+            AND building_id IN (
+                SELECT id 
+                FROM core_condobuilding 
+                WHERE market_id = (
+                    SELECT id FROM core_condomarket WHERE name = 'Brickell'
+                )
+            )
+        )
+)
+SELECT 
+    AVG(delta_days) AS average_delta
+FROM 
+    sale_deltas;
+
+
+
+"""
+
+# Example SQL handed to the model through the SQL_PREFIX prompt (see main.py).
+# Same shape as the general holding-period query: each non-blacklisted sale is
+# paired with the most recent earlier non-blacklisted sale of the same unit and
+# the closing-date differences are averaged for the 'Brickell' market.  The
+# only additional filter is `beds = 2` on core_condounit.
+two_bed_holding_period_boilerplate = """
+
+
+WITH sale_deltas AS (
+    SELECT 
+        cs1.condo_unit_id, 
+        cs1.closing_date AS current_closing_date, 
+        cs2.closing_date AS previous_closing_date,
+        (cs1.closing_date - cs2.closing_date) AS delta_days
+    FROM 
+        core_condosale cs1
+    JOIN 
+        core_condosale cs2 
+    ON 
+        cs1.condo_unit_id = cs2.condo_unit_id 
+    WHERE 
+        cs1.closing_date > cs2.closing_date
+        AND cs1.blacklist = FALSE
+        AND cs2.blacklist = FALSE
+        AND cs2.closing_date = (
+            SELECT MAX(cs3.closing_date)
+            FROM core_condosale cs3
+            WHERE cs3.condo_unit_id = cs1.condo_unit_id
+            AND cs3.closing_date < cs1.closing_date
+            AND cs3.blacklist = FALSE
+        )
+        AND cs1.condo_unit_id IN (
+            SELECT id 
+            FROM core_condounit 
+            WHERE blacklist = FALSE 
+            AND beds = 2
+            AND building_id IN (
+                SELECT id 
+                FROM core_condobuilding 
+                WHERE market_id = (
+                    SELECT id FROM core_condomarket WHERE name = 'Brickell'
+                )
+            )
+        )
+)
+SELECT 
+    AVG(delta_days) AS average_delta
+FROM 
+    sale_deltas;
+
+
+"""
+
+# JavaScript template handed to the model through the SQL_PREFIX prompt (see
+# main.py).  It defines `initMap`, which creates a Google map in the element
+# with id 'map' and adds one labelled marker per entry of `locations`.
+# `[average_lat]` / `[average_lng]` and the `locations` comment are
+# placeholders; nothing in this file fills them in.
+javascript_map_boilerplate = """
+
+function initMap() {
+    var locations = [
+        // Building and school markers will be listed here
+    ];
+
+    var map = new google.maps.Map(document.getElementById('map'), {
+        zoom: 13,
+        center: {lat: [average_lat], lng: [average_lng]}
+    });
+
+    locations.forEach(function(location) {
+        var marker = new google.maps.Marker({
+            position: {lat: location.lat, lng: location.lng},
+            map: map,
+            label: location.label
+        });
+    });
+}
+"""
+
+# Template for one building entry of a JavaScript `locations` array; the
+# bracketed names ([building.lat], [building.lon], ...) are placeholders.
+building_marker_format_boilerplate = "{lat: [building.lat], lng: [building.lon], label: '[building.alt_name] - [building.address]'}"
+
+# Template for one school entry of a JavaScript `locations` array.
+# NOTE(review): the [school.geometry.location.*] paths look like fields of a
+# Google Places result -- confirm against the tool that produces them.
+school_marker_format_boilerplate = "{lat: [school.geometry.location.lat], lng: [school.geometry.location.lng], label: '[school.name]'}"
diff --git a/examples.py b/examples.py
@@ -0,0 +1,2 @@
+# Example list for SQL prompting; currently empty and unused.  See the guide at
+# https://python.langchain.com/v0.2/docs/how_to/sql_prompting/
+examples = []
diff --git a/main.py b/main.py
@@ -0,0 +1,225 @@
+import ast
+import os
+import re
+
+import markdown
+from googlemaps import Client as GoogleMaps
+from langchain_community.utilities.sql_database import SQLDatabase
+from langchain_core.messages import HumanMessage, SystemMessage
+from langchain_openai import ChatOpenAI
+from langgraph.prebuilt import create_react_agent
+from markupsafe import Markup
+
+# for generating the pdf report, we receive reportlab code and execute it arbitrarily
+from reportlab.lib.pagesizes import letter
+from reportlab.pdfgen import canvas
+
+from boilerplate import (
+    building_marker_format_boilerplate,
+    holding_period_boilerplate,
+    javascript_map_boilerplate,
+    marker_boilerplate,
+    school_marker_format_boilerplate,
+    two_bed_holding_period_boilerplate,
+)
+from prefix import SQL_PREFIX
+from tools import setup_tools
+
+# Database credentials are read from the environment variables PG_USER,
+# PG_PASSWORD, PG_PORT and PG_DB; the server host is fixed to localhost.
+POSTGRES_USER = os.getenv("PG_USER")
+POSTGRES_PASSWORD = os.getenv("PG_PASSWORD")
+POSTGRES_PORT = os.getenv("PG_PORT")
+POSTGRES_DB = os.getenv("PG_DB")
+
+connection_string = (
+    f"postgresql://{POSTGRES_USER}:{POSTGRES_PASSWORD}"
+    f"@localhost:{POSTGRES_PORT}/{POSTGRES_DB}"
+)
+
+# Debug output.  The password is masked on purpose: the real connection
+# string must never be written to stdout or logs.
+print(
+    "Using connection string: "
+    f"postgresql://{POSTGRES_USER}:***@localhost:{POSTGRES_PORT}/{POSTGRES_DB}"
+)
+
+# Connect eagerly so a misconfigured environment fails at import time; the
+# non-secret settings are printed to help diagnose it, then the error is
+# re-raised unchanged.
+try:
+    db = SQLDatabase.from_uri(connection_string)
+except Exception as e:
+    print(f"Connection error: {e}")
+    print("Environment variables:")
+    print(f"PG_USER: {POSTGRES_USER}")
+    print(f"PG_DB: {POSTGRES_DB}")
+    print(f"PG_PORT: {POSTGRES_PORT}")
+    raise
+
+# Chat model shared by the agent and the tools.
+llm = ChatOpenAI(model="gpt-4o-mini")
+
+# Google Maps client, keyed by the GPLACES_API_KEY environment variable.
+gmaps = GoogleMaps(os.getenv("GPLACES_API_KEY"))
+
+
+# Build the agent's system prompt by filling the SQL_PREFIX template.
+prefix = SQL_PREFIX.format(
+    # Tables the database wrapper reports as usable.
+    table_names=db.get_usable_table_names(),
+    # Example SQL queries.
+    holding_period_boilerplate=holding_period_boilerplate,
+    two_bed_holding_period_boilerplate=two_bed_holding_period_boilerplate,
+    # Example JavaScript / marker snippets for map output.
+    javascript_map_boilerplate=javascript_map_boilerplate,
+    marker_boilerplate=marker_boilerplate,
+    building_marker_format_boilerplate=building_marker_format_boilerplate,
+    school_marker_format_boilerplate=school_marker_format_boilerplate,
+)
+
+# Wrapped as a SystemMessage so it can be passed to the agent constructor.
+system_message = SystemMessage(content=prefix)
+
+
+def query_as_list(db, query):
+    """Run ``query`` and return its distinct text values with numbers removed.
+
+    Args:
+        db: Object whose ``run(query)`` returns the result rows rendered as a
+            Python-literal string (a list of tuples).
+        query: SQL text to execute.
+
+    Returns:
+        A list (in arbitrary order) of the unique, non-empty values found in
+        the result, each with stand-alone digit runs removed and surrounding
+        whitespace stripped.  An empty result yields ``[]``.
+    """
+    raw = db.run(query)
+    # langchain's SQLDatabase.run returns "" when the query yields no rows;
+    # ast.literal_eval("") would raise, so treat that as "no values".
+    if not raw:
+        return []
+    rows = ast.literal_eval(raw)
+    cleaned = {
+        re.sub(r"\b\d+\b", "", value).strip()
+        for row in rows
+        for value in row
+        if value  # skip NULL / empty cells
+    }
+    return list(cleaned)
+
+
+# Distinct building addresses and alternative names, loaded once at import
+# time (query_as_list strips stand-alone numbers from each value).
+# NOTE(review): neither list is referenced again in the visible source.
+addresses = query_as_list(db, "SELECT address FROM core_condobuilding")
+alt_names = query_as_list(db, "SELECT alt_name FROM core_condobuilding")
+
+
+# Tool set for the agent, built from the database wrapper and the chat model.
+tools = setup_tools(db, llm)
+
+# Agent created by langgraph's create_react_agent; the system prompt is passed
+# via `messages_modifier`.
+# NOTE(review): check that the installed langgraph version still accepts the
+# `messages_modifier` keyword.
+agent_executor = create_react_agent(
+    llm, tools, messages_modifier=system_message)
+
+
+def print_sql_1(sql):
+    """Print ``sql`` to stdout under a "The SQL query is:" heading.
+
+    Returns None; the function is used only for its console output.
+    """
+    heading = "\nThe SQL query is:\n\n"
+    print(f"{heading}{sql}\n    ")
+
+
+# The five HTML entities that may appear in an escaped code block.
+_CODE_ENTITIES = {
+    "&quot;": '"',
+    "&amp;": "&",
+    "&lt;": "<",
+    "&gt;": ">",
+    "&#39;": "'",
+}
+
+
+def extract_and_remove_html(text):
+    """Split a model reply into HTML, display text and Python code.
+
+    The reply is checked for Python code first (either a rendered
+    ``<pre class="codehilite"><code class="language-python">`` block or a
+    Markdown ```` ```python ```` fence) and only then for an ```` ```html ````
+    fence.
+
+    Args:
+        text: The raw reply text.
+
+    Returns:
+        A 3-tuple ``(html, text, code)``:
+
+        * Python code found: ``(None, "PDF Generated!", code)`` where ``code``
+          is the captured source with HTML entities decoded.
+        * HTML fence found: ``(Markup(html), text_without_fence, False)``.
+        * Neither: ``(None, text, False)``.
+    """
+    # Pattern to match an HTML code block
+    html_pattern = r"```html\s*([\s\S]*?)\s*```"
+
+    # First look for any python code
+    python_pattern = (
+        r'<pre\s+class="codehilite"><code\s+class="language-python">(.*?)</code></pre>'
+    )
+    md_pattern = r"```python(.*?)```"
+    python_match = re.search(python_pattern, text, re.DOTALL | re.IGNORECASE)
+    md_match = re.search(md_pattern, text, re.DOTALL)
+    code_match = python_match or md_match
+    if code_match:
+        print(text)
+        # Decode all entities in ONE pass.  Chained str.replace calls decode
+        # twice (e.g. "&amp;lt;" -> "&lt;" -> "<"), corrupting code that
+        # legitimately contains an escaped entity.
+        code = re.sub(
+            r"&(?:quot|amp|lt|gt|#39);",
+            lambda m: _CODE_ENTITIES[m.group(0)],
+            code_match.group(1),
+        )
+        return None, "PDF Generated!", code
+
+    # Search for the HTML fence in the text
+    match = re.search(html_pattern, text, re.IGNORECASE)
+
+    if match:
+        # Extract the HTML code and drop the API-key placeholder script tag
+        html_code = match.group(1).strip()
+        cleaned_html = process_html(html_code)
+
+        # Remove the HTML code block from the original text
+        text_without_html = re.sub(
+            html_pattern, "", text, flags=re.IGNORECASE).strip()
+
+        # Return both the extracted HTML and the text without HTML
+        return Markup(cleaned_html), text_without_html, False
+    # If no HTML is found, return None for HTML and the original text
+    return None, text, False
+
+
+def process_markdown(text):
+    """Render Markdown ``text`` to HTML and return it wrapped in ``Markup``.
+
+    The "extra" and "codehilite" Markdown extensions are enabled; wrapping the
+    result in ``Markup`` keeps it from being auto-escaped.
+    """
+    rendered = markdown.markdown(text, extensions=["extra", "codehilite"])
+    return Markup(rendered)
+
+
+def process_html(text):
+    """Remove empty ``<script>`` tags whose attributes contain ``{gmaps_api_key}``.
+
+    Matching is case-insensitive; everything else in ``text`` is returned
+    unchanged.
+    """
+    placeholder_script = re.compile(
+        r"<script[^>]*\{gmaps_api_key\}[^>]*></script>", re.IGNORECASE
+    )
+    return placeholder_script.sub("", text)
+
+def detect_malicious_code(code):
+    """Return True if ``code`` matches any known-dangerous source pattern.
+
+    This is a regex denylist applied to the source text.  It is a best-effort
+    filter, NOT a sandbox: a denylist can always be bypassed (aliasing,
+    string building, attribute tricks), so passing this check does not make
+    the code safe to execute.
+
+    Args:
+        code: Python source text to inspect.
+
+    Returns:
+        True when a pattern matches (the pattern is printed), else False.
+    """
+    modules = "sys|subprocess|shlex|socket|ctypes|signal|multiprocessing"
+    # `\s*` before "(" and around "." closes trivial bypasses such as
+    # `eval (...)` or `os . system(...)`.
+    malicious_patterns = [
+        # Importing dangerous modules
+        rf'import\s+({modules})',
+        # Importing names straight out of dangerous modules (or os), which
+        # would otherwise dodge the `os.<name>` checks below
+        rf'from\s+(os|{modules})\s+import',
+        # Dynamic import / builtins access
+        r'__import__',
+        r'__builtins__',
+        # Dangerous os methods
+        r'os\s*\.\s*(system|popen|remove|rmdir|rename|chmod|chown|kill|fork)',
+        r'subprocess\s*\.\s*(Popen|run|call|check_output)',  # Subprocess methods
+        r'eval\s*\(',  # Use of eval()
+        r'exec\s*\(',  # Use of exec()
+        r'compile\s*\(',  # Use of compile()
+        r'shutil\s*\.\s*(copy|move|rmtree)',  # shutil file operations
+        r'socket\s*\.',  # Use of sockets for network access
+        r'requests\s*\.',  # Use of requests library
+        r'urllib\s*\.',  # Use of urllib library
+        r'getattr\s*\(', r'setattr\s*\(',  # Reflection
+        r'globals\s*\(', r'locals\s*\(',  # Accessing variable scopes
+        r'importlib\s*\.',  # Dynamic importing
+        r'input\s*\(',  # Use of input() for potentially malicious prompts
+        r'os\s*\.\s*exec',  # exec family in os module
+        # Use of ast.literal_eval() for dynamic evaluation
+        r'ast\s*\.\s*(literal_eval)',
+    ]
+
+    for pattern in malicious_patterns:
+        if re.search(pattern, code):
+            print(f"Potentially dangerous pattern detected: {pattern}")
+            return True
+    return False
+
+
+def process_question(prompted_question, conversation_history):
+    """Send a question to the agent and collect its renderable output.
+
+    Args:
+        prompted_question: The user's new question.
+        conversation_history: Sequence of dicts with ``"question"`` and
+            ``"answer"`` keys.  When empty or None the question is sent
+            on its own, without the conversation preamble.
+
+    Returns:
+        A list of ``Markup`` fragments in stream order: for every agent
+        message the Markdown-rendered text, followed by the extracted HTML
+        block when the message contained one.
+
+    Side effects:
+        Prints SQL tool calls and message contents to stdout and, when a
+        message contains Python code that passes ``detect_malicious_code``,
+        executes that code.
+    """
+    history = conversation_history or []
+    context = "\n".join(
+        f"Q: {entry['question']}\nA: {entry['answer']}" for entry in history
+    )
+    consolidated_prompt = f"""
+    Previous conversation:
+    {context}
+
+    New question: {prompted_question}
+
+    Please answer the new question, taking into account the context from the previous conversation if relevant.
+    """
+    prompt = consolidated_prompt if history else prompted_question
+
+    content = []
+    for s in agent_executor.stream({"messages": [HumanMessage(content=prompt)]}):
+
+        for msg in s.get("agent", {}).get("messages", []):
+            for call in msg.tool_calls:
+                if sql := call.get("args", {}).get("query", None):
+                    # print_sql_1 prints by itself and returns None; wrapping
+                    # it in print() would emit a stray "None" line.
+                    print_sql_1(sql)
+
+            print(msg.content)
+            html, stripped_text, code = extract_and_remove_html(msg.content)
+            if code:
+                if detect_malicious_code(code):
+                    # Nothing was executed, so do not claim a PDF exists.
+                    stripped_text = (
+                        "The generated report code was blocked by the "
+                        "safety check; no PDF was generated."
+                    )
+                else:
+                    # SECURITY: this executes model-generated code in the
+                    # server process.  The regex denylist above is not a
+                    # sandbox; run this in an isolated process/container if
+                    # the input can be influenced by untrusted users.
+                    exec(code)
+            content.append(process_markdown(stripped_text))
+            if html:
+                content.append(html)
+        print("----")
+
+    return content
-Original file line number
+Diff line change
@@ -0,0 +1,3 @@
+    env
+    gpt_env
+    .env
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		# currently unused, see guide in https://python.langchain.com/v0.2/docs/how_to/sql_prompting/
		examples = []