
Commit 7414617

updated to add scan & zip changes

1 parent 5cdb01f
File tree

4 files changed: +290 -9 lines changed

src/datacustomcode/cli.py

Lines changed: 29 additions & 0 deletions
@@ -68,6 +68,35 @@ def configure(
         login_url=login_url,
     ).update_ini(profile=profile)
 
+@cli.command()
+@click.option("--profile", default="default")
+@click.option("--path", default="payload")
+@click.option("--name", default="test_pkg")
+@click.option("--version", default="0.0.1")
+@click.option("--description", default="Custom Data Transform Code")
+def zip(profile: str, path: str, name: str, version: str, description: str):
+    from datacustomcode.credentials import Credentials
+    from datacustomcode.deploy import TransformationJobMetadata, zip, zip_and_upload_directory
+
+    logger.debug("Zipping project")
+
+    metadata = TransformationJobMetadata(
+        name=name,
+        version=version,
+        description=description,
+    )
+    try:
+        credentials = Credentials.from_ini(profile=profile)
+    except KeyError:
+        click.secho(
+            f"Error: Profile {profile} not found in credentials.ini. "
+            "Run `datacustomcode configure` to create a credentials profile.",
+            fg="red",
+        )
+        raise click.Abort() from None
+    zip(path, metadata, credentials, name)
+
+
 @cli.command()
 @click.option("--profile", default="default")

src/datacustomcode/deploy.py

Lines changed: 81 additions & 9 deletions
@@ -169,23 +169,41 @@ def prepare_dependency_archive(directory: str) -> None:
     archive_file = os.path.join(archives_dir, DEPENDENCIES_ARCHIVE_NAME)
     with tarfile.open(archive_file, "w:gz") as tar:
         for file in os.listdir(temp_dir):
+            # Exclude requirements.txt from the archive
+            if file == "requirements.txt":
+                continue
             tar.add(os.path.join(temp_dir, file), arcname=file)
 
     logger.debug(f"Dependencies downloaded and archived to {archive_file}")
 
 
-def zip_and_upload_directory(directory: str, file_upload_url: str) -> None:
-    file_upload_url = unescape(file_upload_url)
+def zip_and_upload_directory(directory: str, name: str) -> None:
+    # file_upload_url = unescape(file_upload_url)
 
     logger.debug(f"Zipping directory... {directory}")
-    shutil.make_archive(ZIP_FILE_NAME.rstrip(".zip"), "zip", directory)
 
-    logger.debug(f"Uploading deployment to {file_upload_url}")
-    with open(ZIP_FILE_NAME, "rb") as zip_file:
-        response = requests.put(
-            file_upload_url, data=zip_file, headers={"Content-Type": "application/zip"}
-        )
-        response.raise_for_status()
+    # Create a zip file excluding .DS_Store files
+    import zipfile
+
+    zip_filename = f"{name}.zip"
+    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
+        for root, dirs, files in os.walk(directory):
+            # Skip .DS_Store files when adding to zip
+            for file in files:
+                if file != '.DS_Store':
+                    file_path = os.path.join(root, file)
+                    # Preserve relative path structure in the zip file
+                    arcname = os.path.relpath(file_path, directory)
+                    zipf.write(file_path, arcname)
+
+    logger.debug(f"Created zip file: {zip_filename} (excluding .DS_Store files)")
+
+    # logger.debug(f"Uploading deployment to {file_upload_url}")
+    # with open(ZIP_FILE_NAME, "rb") as zip_file:
+    #     response = requests.put(
+    #         file_upload_url, data=zip_file, headers={"Content-Type": "application/zip"}
+    #     )
+    #     response.raise_for_status()
 
 
 class DeploymentsResponse(BaseModel):
@@ -324,6 +342,60 @@ def create_data_transform(
     response = _make_api_call(url, "POST", token=access_token.access_token, json=body)
     return response
 
+def has_nonempty_requirements_file(directory: str) -> bool:
+    """
+    Check whether a requirements.txt exists in the parent of the given
+    directory and has at least one non-comment line.
+
+    Args:
+        directory (str): The directory whose parent is checked for requirements.txt.
+
+    Returns:
+        bool: True if requirements.txt exists and has a non-comment line, False otherwise.
+    """
+    # Look for requirements.txt in the parent directory of the given directory
+    requirements_path = os.path.join(os.path.dirname(directory), "requirements.txt")
+    logger.debug(f"Checking for requirements file at {requirements_path}")
+
+    try:
+        if os.path.isfile(requirements_path):
+            with open(requirements_path, "r", encoding="utf-8") as f:
+                for line in f:
+                    # Non-empty if any line is not a comment (ignoring leading whitespace)
+                    if line.strip() and not line.lstrip().startswith('#'):
+                        return True
+    except Exception as e:
+        logger.error(f"Error reading requirements.txt: {e}")
+    return False
+
+
+def zip(
+    directory: str,
+    metadata: TransformationJobMetadata,
+    credentials: Credentials,
+    name: str,
+    callback=None,
+) -> AccessTokenResponse:
+    """Zip a data transform project directory for the DataCloud."""
+    access_token = _retrieve_access_token(credentials)
+
+    # prepare payload only if requirements.txt is non-empty
+    if has_nonempty_requirements_file(directory):
+        prepare_dependency_archive(directory)
+    else:
+        logger.info(
+            f"Skipping dependency archive: requirements.txt is missing or empty in {directory}"
+        )
+    # create_data_transform_config(directory)
+
+    # create deployment and upload payload
+    # deployment = create_deployment(access_token, metadata)
+    zip_and_upload_directory(directory, name)
+    # , deployment.fileUploadUrl)
+    # wait_for_deployment(access_token, metadata, callback)
+
+    # create data transform
+    # create_data_transform(directory, access_token, metadata)
+    return access_token
+
 
 def deploy_full(
     directory: str,
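
For reference, the new `zip` entry point can also be driven directly from Python rather than through the CLI. A minimal sketch, assuming a configured `default` profile and a payload directory named `payload` (both illustrative):

from datacustomcode.credentials import Credentials
from datacustomcode.deploy import TransformationJobMetadata, zip

# Assumes `datacustomcode configure` has already written a "default"
# profile; name/version/description mirror the CLI defaults above.
credentials = Credentials.from_ini(profile="default")
metadata = TransformationJobMetadata(
    name="test_pkg",
    version="0.0.1",
    description="Custom Data Transform Code",
)

# Builds the dependency archive only when a requirements.txt in the payload
# directory's parent has a non-comment line, then writes test_pkg.zip
# (excluding .DS_Store files). The upload and deployment steps inside zip()
# are currently commented out.
zip("payload", metadata, credentials, "test_pkg")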

src/datacustomcode/scan.py

Lines changed: 94 additions & 0 deletions
@@ -13,6 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import annotations
+import os
+from typing import Set
 
 import ast
 from typing import (
@@ -131,6 +133,98 @@ def found(self) -> DataAccessLayerCalls:
     )
 
 
+class ImportVisitor(ast.NodeVisitor):
+    """AST visitor that extracts external package imports from Python code."""
+
+    # Standard library modules that should be excluded from requirements
+    STANDARD_LIBS = {
+        "abc", "argparse", "ast", "asyncio", "base64", "collections", "configparser",
+        "contextlib", "copy", "csv", "datetime", "enum", "functools", "glob", "hashlib",
+        "http", "importlib", "inspect", "io", "itertools", "json", "logging", "math",
+        "os", "pathlib", "pickle", "random", "re", "shutil", "site", "socket", "sqlite3",
+        "string", "subprocess", "sys", "tempfile", "threading", "time", "traceback",
+        "typing", "uuid", "warnings", "xml", "zipfile",
+    }
+
+    # Additional packages to exclude from requirements.txt
+    EXCLUDED_PACKAGES = {
+        "datacustomcode",  # Internal package
+        "pyspark",  # Provided by the runtime environment
+    }
+
+    def __init__(self) -> None:
+        self.imports: Set[str] = set()
+
+    def visit_Import(self, node: ast.Import) -> None:
+        """Visit an import statement (e.g., import os, sys)."""
+        for name in node.names:
+            # Get the top-level package name
+            package = name.name.split('.')[0]
+            if (package not in self.STANDARD_LIBS and
+                    package not in self.EXCLUDED_PACKAGES and
+                    not package.startswith('_')):
+                self.imports.add(package)
+        self.generic_visit(node)
+
+    def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
+        """Visit a from-import statement (e.g., from os import path)."""
+        if node.module is not None:
+            # Get the top-level package name
+            package = node.module.split('.')[0]
+            if (package not in self.STANDARD_LIBS and
+                    package not in self.EXCLUDED_PACKAGES and
+                    not package.startswith('_')):
+                self.imports.add(package)
+        self.generic_visit(node)
+
+
+def scan_file_for_imports(file_path: str) -> Set[str]:
+    """Scan a Python file for external package imports."""
+    with open(file_path, "r") as f:
+        code = f.read()
+    tree = ast.parse(code)
+    visitor = ImportVisitor()
+    visitor.visit(tree)
+    return visitor.imports
+
+
+def write_requirements_file(file_path: str, output_dir: str | None = None) -> str:
+    """
+    Scan a Python file for imports and write them to requirements.txt.
+
+    Args:
+        file_path: Path to the Python file to scan.
+        output_dir: Directory where requirements.txt should be created
+            (defaults to the parent of the file's directory).
+
+    Returns:
+        Path to the generated requirements.txt file.
+    """
+    imports = scan_file_for_imports(file_path)
+
+    if not output_dir:
+        # Use the parent directory rather than the same directory as the file
+        file_dir = os.path.dirname(file_path)
+        output_dir = os.path.dirname(file_dir) if file_dir else "."
+
+    requirements_path = os.path.join(output_dir, "requirements.txt")
+
+    # If the file exists, read existing requirements and merge with new ones
+    existing_requirements = set()
+    if os.path.exists(requirements_path):
+        with open(requirements_path, "r") as f:
+            existing_requirements = {line.strip() for line in f if line.strip()}
+
+    # Merge existing requirements with newly discovered ones
+    all_requirements = existing_requirements.union(imports)
+
+    # Write the combined requirements
+    with open(requirements_path, "w") as f:
+        for package in sorted(all_requirements):
+            f.write(f"{package}\n")
+
+    return requirements_path
+
+
 def scan_file(file_path: str) -> DataAccessLayerCalls:
     """Scan a single Python file for Client read/write method calls."""
     with open(file_path, "r") as f:
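
As a quick check, the scan helpers can be run on their own. A minimal sketch, assuming a hypothetical entrypoint at payload/test_pkg/entrypoint.py that imports pandas (both the path and the import are illustrative):

from datacustomcode.scan import scan_file_for_imports, write_requirements_file

# ImportVisitor filters out stdlib modules, datacustomcode, and pyspark, so
# only genuinely external imports (e.g., pandas) survive.
imports = scan_file_for_imports("payload/test_pkg/entrypoint.py")
print(imports)  # e.g., {'pandas'}

# Writes (or merges into) requirements.txt in the parent of the file's
# directory (here payload/requirements.txt) and returns its path.
path = write_requirements_file("payload/test_pkg/entrypoint.py")
print(path)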
Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "253d95db-fdc6-4bbb-b75c-20b46639f2d3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datacustomcode.client import Client\n",
+    "from datacustomcode.io.writer.base import WriteMode\n",
+    "from pyspark.sql.functions import col, upper"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "debdfc62-489b-4ca8-af1d-56c60c0d32e7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client = Client()\n",
+    "\n",
+    "df = client.read_dlo(\"Account_Home__dll\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d96ad7c8-f5ba-44a7-a2ad-8597beb20cf4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Perform transformations on the DataFrame\n",
+    "df_upper1 = df.withColumn(\"Description__c\", upper(col(\"Description__c\")))\n",
+    "\n",
+    "# Drop specific columns related to relationships\n",
+    "df_upper1 = df_upper1.drop(\"KQ_ParentId__c\")\n",
+    "df_upper1 = df_upper1.drop(\"KQ_Id__c\")\n",
+    "\n",
+    "df_upper1.show()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6f823139-3a22-487f-a4a1-966c6269a708",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save the transformed DataFrame\n",
+    "dlo_name = 'Account_Home_copy__dll'\n",
+    "client.write_to_dlo(dlo_name, df_upper1, write_mode=WriteMode.APPEND)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "425f383b-09b4-45ee-957c-f215d7a2ccf2",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
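
For reference, a sketch of the notebook's logic as a plain entrypoint script, the form the scan helpers operate on; the path payload/test_pkg/entrypoint.py is illustrative:

from pyspark.sql.functions import col, upper

from datacustomcode.client import Client
from datacustomcode.io.writer.base import WriteMode

client = Client()
df = client.read_dlo("Account_Home__dll")

# Upper-case the description and drop the relationship key columns
df_upper1 = df.withColumn("Description__c", upper(col("Description__c")))
df_upper1 = df_upper1.drop("KQ_ParentId__c").drop("KQ_Id__c")

# Append the result to the copy DLO
client.write_to_dlo("Account_Home_copy__dll", df_upper1, write_mode=WriteMode.APPEND)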
