
Commit dd56855

Merge pull request #14 from forcedotcom/chandresh-updates
Add zip cli command, generate requirements.txt during scan, and add example jupyter notebook
2 parents: 1fc1da0 + 1d6fb56

File tree: 6 files changed, +562 −30 lines


src/datacustomcode/cli.py

Lines changed: 21 additions & 2 deletions
@@ -69,6 +69,15 @@ def configure(
     ).update_ini(profile=profile)
 
 
+@cli.command()
+@click.argument("path", default="payload")
+def zip(path: str):
+    from datacustomcode.deploy import zip
+
+    logger.debug("Zipping project")
+    zip(path)
+
+
 @cli.command()
 @click.option("--profile", default="default")
 @click.option("--path", default="payload")
@@ -127,8 +136,11 @@ def init(directory: str):
 @click.argument("filename")
 @click.option("--config")
 @click.option("--dry-run", is_flag=True)
-def scan(filename: str, config: str, dry_run: bool):
-    from datacustomcode.scan import dc_config_json_from_file
+@click.option(
+    "--no-requirements", is_flag=True, help="Skip generating requirements.txt file"
+)
+def scan(filename: str, config: str, dry_run: bool, no_requirements: bool):
+    from datacustomcode.scan import dc_config_json_from_file, write_requirements_file
 
     config_location = config or os.path.join(os.path.dirname(filename), "config.json")
     click.echo(
@@ -143,6 +155,13 @@ def scan(filename: str, config: str, dry_run: bool):
     with open(config_location, "w") as f:
         json.dump(config_json, f, indent=2)
 
+    if not no_requirements:
+        requirements_path = write_requirements_file(filename)
+        click.echo(
+            "Generated requirements file: "
+            + click.style(requirements_path, fg="blue", bold=True)
+        )
+
 
 @cli.command()
 @click.argument("entrypoint")
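The net effect on the CLI: packaging is now exposed as its own zip command, and scan gains an opt-out for requirements generation. A minimal sketch driving the new commands through click's test runner — this assumes the cli group is importable from datacustomcode.cli (as the decorators above suggest), and the payload/entrypoint paths are illustrative:

    # Sketch: exercise the new commands via click's CliRunner.
    # "payload" and "payload/entrypoint.py" are illustrative names.
    from click.testing import CliRunner

    from datacustomcode.cli import cli

    runner = CliRunner()

    # Package the payload directory into a zip without deploying
    print(runner.invoke(cli, ["zip", "payload"]).output)

    # Scan an entrypoint; writes config.json and requirements.txt by default
    print(runner.invoke(cli, ["scan", "payload/entrypoint.py"]).output)

    # Same scan, but skip requirements.txt generation
    print(runner.invoke(cli, ["scan", "payload/entrypoint.py", "--no-requirements"]).output)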

src/datacustomcode/deploy.py

Lines changed: 70 additions & 15 deletions
@@ -169,25 +169,14 @@ def prepare_dependency_archive(directory: str) -> None:
     archive_file = os.path.join(archives_dir, DEPENDENCIES_ARCHIVE_NAME)
     with tarfile.open(archive_file, "w:gz") as tar:
         for file in os.listdir(temp_dir):
+            # Exclude requirements.txt from the archive
+            if file == "requirements.txt":
+                continue
             tar.add(os.path.join(temp_dir, file), arcname=file)
 
     logger.debug(f"Dependencies downloaded and archived to {archive_file}")
 
 
-def zip_and_upload_directory(directory: str, file_upload_url: str) -> None:
-    file_upload_url = unescape(file_upload_url)
-
-    logger.debug(f"Zipping directory... {directory}")
-    shutil.make_archive(ZIP_FILE_NAME.rstrip(".zip"), "zip", directory)
-
-    logger.debug(f"Uploading deployment to {file_upload_url}")
-    with open(ZIP_FILE_NAME, "rb") as zip_file:
-        response = requests.put(
-            file_upload_url, data=zip_file, headers={"Content-Type": "application/zip"}
-        )
-        response.raise_for_status()
-
-
 class DeploymentsResponse(BaseModel):
     deploymentStatus: str
 
@@ -325,6 +314,71 @@ def create_data_transform(
     return response
 
 
+def has_nonempty_requirements_file(directory: str) -> bool:
+    """
+    Check if requirements.txt exists in the given directory and has at least
+    one non-comment line.
+    Args:
+        directory (str): The directory to check for requirements.txt.
+    Returns:
+        bool: True if requirements.txt exists and has a non-comment line,
+        False otherwise.
+    """
+    # Look for requirements.txt in the parent directory of the given directory
+    requirements_path = os.path.join(os.path.dirname(directory), "requirements.txt")
+
+    try:
+        if os.path.isfile(requirements_path):
+            with open(requirements_path, "r", encoding="utf-8") as f:
+                for line in f:
+                    # Consider non-empty if any line is not a comment (ignoring
+                    # leading whitespace)
+                    if line.strip() and not line.lstrip().startswith("#"):
+                        return True
+    except Exception as e:
+        logger.error(f"Error reading requirements.txt: {e}")
+    return False
+
+
+def upload_zip(file_upload_url: str) -> None:
+    file_upload_url = unescape(file_upload_url)
+    with open(ZIP_FILE_NAME, "rb") as zip_file:
+        response = requests.put(
+            file_upload_url, data=zip_file, headers={"Content-Type": "application/zip"}
+        )
+        response.raise_for_status()
+
+
+def zip(
+    directory: str,
+):
+    # Create a zip file excluding .DS_Store files
+    import zipfile
+
+    # Prepare the dependency archive only if requirements.txt is non-empty
+    if has_nonempty_requirements_file(directory):
+        prepare_dependency_archive(directory)
+    else:
+        logger.info(
+            f"Skipping dependency archive: requirements.txt is missing or empty "
+            f"in {directory}"
+        )
+
+    logger.debug(f"Zipping directory... {directory}")
+
+    with zipfile.ZipFile(ZIP_FILE_NAME, "w", zipfile.ZIP_DEFLATED) as zipf:
+        for root, dirs, files in os.walk(directory):
+            # Skip .DS_Store files when adding to the zip
+            for file in files:
+                if file != ".DS_Store":
+                    file_path = os.path.join(root, file)
+                    # Preserve relative path structure in the zip file
+                    arcname = os.path.relpath(file_path, directory)
+                    zipf.write(file_path, arcname)
+
+    logger.debug(f"Created zip file: {ZIP_FILE_NAME}")
+
+
 def deploy_full(
     directory: str,
     metadata: TransformationJobMetadata,
@@ -340,7 +394,8 @@ def deploy_full(
 
     # create deployment and upload payload
     deployment = create_deployment(access_token, metadata)
-    zip_and_upload_directory(directory, deployment.fileUploadUrl)
+    zip(directory)
+    upload_zip(deployment.fileUploadUrl)
     wait_for_deployment(access_token, metadata, callback)
 
     # create data transform
src/datacustomcode/scan.py

Lines changed: 134 additions & 0 deletions
@@ -15,9 +15,12 @@
 from __future__ import annotations
 
 import ast
+import os
 from typing import (
     Any,
+    ClassVar,
     Dict,
+    Set,
     Union,
 )
 
@@ -131,6 +134,137 @@ def found(self) -> DataAccessLayerCalls:
     )
 
 
+class ImportVisitor(ast.NodeVisitor):
+    """AST Visitor that extracts external package imports from Python code."""
+
+    # Standard library modules that should be excluded from requirements
+    STANDARD_LIBS: ClassVar[set[str]] = {
+        "abc",
+        "argparse",
+        "ast",
+        "asyncio",
+        "base64",
+        "collections",
+        "configparser",
+        "contextlib",
+        "copy",
+        "csv",
+        "datetime",
+        "enum",
+        "functools",
+        "glob",
+        "hashlib",
+        "http",
+        "importlib",
+        "inspect",
+        "io",
+        "itertools",
+        "json",
+        "logging",
+        "math",
+        "os",
+        "pathlib",
+        "pickle",
+        "random",
+        "re",
+        "shutil",
+        "site",
+        "socket",
+        "sqlite3",
+        "string",
+        "subprocess",
+        "sys",
+        "tempfile",
+        "threading",
+        "time",
+        "traceback",
+        "typing",
+        "uuid",
+        "warnings",
+        "xml",
+        "zipfile",
+    }
+
+    # Additional packages to exclude from requirements.txt
+    EXCLUDED_PACKAGES: ClassVar[set[str]] = {
+        "datacustomcode",  # Internal package
+        "pyspark",  # Provided by the runtime environment
+    }
+
+    def __init__(self) -> None:
+        self.imports: Set[str] = set()
+
+    def visit_Import(self, node: ast.Import) -> None:
+        """Visit an import statement (e.g., import os, sys)."""
+        for name in node.names:
+            # Get the top-level package name
+            package = name.name.split(".")[0]
+            if (
+                package not in self.STANDARD_LIBS
+                and package not in self.EXCLUDED_PACKAGES
+                and not package.startswith("_")
+            ):
+                self.imports.add(package)
+        self.generic_visit(node)
+
+    def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
+        """Visit a from-import statement (e.g., from os import path)."""
+        if node.module is not None:
+            # Get the top-level package
+            package = node.module.split(".")[0]
+            if (
+                package not in self.STANDARD_LIBS
+                and package not in self.EXCLUDED_PACKAGES
+                and not package.startswith("_")
+            ):
+                self.imports.add(package)
+        self.generic_visit(node)
+
+
+def scan_file_for_imports(file_path: str) -> Set[str]:
+    """Scan a Python file for external package imports."""
+    with open(file_path, "r") as f:
+        code = f.read()
+    tree = ast.parse(code)
+    visitor = ImportVisitor()
+    visitor.visit(tree)
+    return visitor.imports
+
+
+def write_requirements_file(file_path: str) -> str:
+    """
+    Scan a Python file for imports and write them to requirements.txt.
+
+    Args:
+        file_path: Path to the Python file to scan
+
+    Returns:
+        Path to the generated requirements.txt file
+    """
+    imports = scan_file_for_imports(file_path)
+
+    # Write requirements.txt in the parent directory of the Python file
+    file_dir = os.path.dirname(file_path)
+    parent_dir = os.path.dirname(file_dir) if file_dir else "."
+    requirements_path = os.path.join(parent_dir, "requirements.txt")
+
+    # If the file exists, read existing requirements and merge with new ones
+    existing_requirements = set()
+    if os.path.exists(requirements_path):
+        with open(requirements_path, "r") as f:
+            existing_requirements = {line.strip() for line in f if line.strip()}
+
+    # Merge existing requirements with newly discovered ones
+    all_requirements = existing_requirements.union(imports)
+
+    # Write the combined requirements
+    with open(requirements_path, "w") as f:
+        for package in sorted(all_requirements):
+            f.write(f"{package}\n")
+
+    return requirements_path
+
+
 def scan_file(file_path: str) -> DataAccessLayerCalls:
     """Scan a single Python file for Client read/write method calls."""
     with open(file_path, "r") as f:
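A minimal sketch of the scanner's behavior on a throwaway entrypoint (paths and package names are illustrative): stdlib modules and runtime-provided packages such as pyspark are filtered out, and requirements.txt lands one level above the entrypoint's directory.

    import os
    import tempfile

    from datacustomcode.scan import scan_file_for_imports, write_requirements_file

    with tempfile.TemporaryDirectory() as tmp:
        script = os.path.join(tmp, "payload", "entrypoint.py")
        os.makedirs(os.path.dirname(script))
        with open(script, "w") as f:
            f.write("import pandas\nfrom pyspark.sql import functions\nimport os\n")

        # os (stdlib) and pyspark (runtime-provided) are filtered out
        print(scan_file_for_imports(script))    # {'pandas'}

        # Written to the parent of payload/, i.e. <tmp>/requirements.txt
        print(write_requirements_file(script))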
Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datacustomcode.client import Client\n",
+    "from datacustomcode.io.writer.base import WriteMode\n",
+    "from pyspark.sql.functions import col, upper"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client = Client()\n",
+    "\n",
+    "df = client.read_dlo(\"Account_Home__dll\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Perform transformations on the DataFrame\n",
+    "df_upper1 = df.withColumn(\"Description__c\", upper(col(\"Description__c\")))\n",
+    "\n",
+    "# Drop specific columns related to relationships\n",
+    "df_upper1 = df_upper1.drop(\"KQ_ParentId__c\")\n",
+    "df_upper1 = df_upper1.drop(\"KQ_Id__c\")\n",
+    "\n",
+    "df_upper1.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save the transformed DataFrame\n",
+    "dlo_name = \"Account_Home_copy__dll\"\n",
+    "client.write_to_dlo(dlo_name, df_upper1, write_mode=WriteMode.APPEND)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
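For reference, the notebook's workflow rewritten as a plain entrypoint script — the form the scan and deploy commands operate on. This is a sketch; the DLO names are taken from the notebook and are unlikely to exist outside the example org.

    from pyspark.sql.functions import col, upper

    from datacustomcode.client import Client
    from datacustomcode.io.writer.base import WriteMode

    client = Client()
    df = client.read_dlo("Account_Home__dll")

    # Uppercase a text column, then drop the relationship key columns
    df_upper1 = df.withColumn("Description__c", upper(col("Description__c")))
    df_upper1 = df_upper1.drop("KQ_ParentId__c").drop("KQ_Id__c")

    client.write_to_dlo("Account_Home_copy__dll", df_upper1, write_mode=WriteMode.APPEND)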
