Skip to content

Commit 34b7596

Browse files
authored
Merge pull request #7 from codellm-devkit/6-better-more-performant-output-formatting
Add compression for output file with gzip and MessagePack. Add inbuil…
2 parents 9afad17 + 6c2b526 commit 34b7596

File tree

3 files changed

+145
-129
lines changed

3 files changed

+145
-129
lines changed

codeanalyzer/__main__.py

Lines changed: 46 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,18 @@
1-
import sys
2-
from contextlib import nullcontext
31
from pathlib import Path
42
from typing import Annotated, Optional
3+
from enum import Enum
54

65
import typer
76

87
from codeanalyzer.core import AnalyzerCore
98
from codeanalyzer.utils import _set_log_level, logger
109

1110

11+
class OutputFormat(str, Enum):
    """Output formats selectable through the ``-f/--format`` CLI option.

    The ``str`` mix-in makes every member compare equal to its plain string
    value (for example ``OutputFormat.JSON == "json"``).
    """

    # JSON text output.
    JSON = "json"
    # MessagePack binary output.
    MSGPACK = "msgpack"
14+
15+
1216
def main(
1317
input: Annotated[
1418
Path, typer.Option("-i", "--input", help="Path to the project root directory.")
@@ -17,6 +21,15 @@ def main(
1721
Optional[Path],
1822
typer.Option("-o", "--output", help="Output directory for artifacts."),
1923
] = None,
24+
format: Annotated[
25+
OutputFormat,
26+
typer.Option(
27+
"-f",
28+
"--format",
29+
help="Output format: json or msgpack.",
30+
case_sensitive=False,
31+
),
32+
] = OutputFormat.JSON,
2033
analysis_level: Annotated[
2134
int,
2235
typer.Option("-a", "--analysis-level", help="1: symbol table, 2: call graph."),
@@ -58,16 +71,40 @@ def main(
5871
input, analysis_level, using_codeql, rebuild_analysis, cache_dir, clear_cache
5972
) as analyzer:
6073
artifacts = analyzer.analyze()
61-
print_stream = sys.stdout
62-
stream_context = nullcontext(print_stream)
6374

64-
if output is not None:
75+
# Handle output based on format
76+
if output is None:
77+
# Output to stdout (only for JSON)
78+
if format == OutputFormat.JSON:
79+
print(artifacts.model_dump_json(separators=(",", ":")))
80+
else:
81+
logger.error(
82+
f"Format '{format.value}' requires an output directory (use -o/--output)"
83+
)
84+
raise typer.Exit(code=1)
85+
else:
86+
# Output to file
6587
output.mkdir(parents=True, exist_ok=True)
66-
output_file = output / "analysis.json"
67-
stream_context = output_file.open("w")
88+
_write_output(artifacts, output, format)
89+
90+
91+
def _write_output(artifacts, output_dir: Path, format: OutputFormat):
    """Write the analysis artifacts into ``output_dir`` in the requested format.

    Args:
        artifacts: Analysis result. Must provide ``model_dump_json()`` for JSON
            output, and ``to_msgpack_bytes()`` plus ``get_compression_ratio()``
            for MessagePack output.
        output_dir: Directory that receives the output file; it is not created
            here, so it must already exist.
        format: ``OutputFormat.JSON`` writes ``analysis.json``;
            ``OutputFormat.MSGPACK`` writes ``analysis.msgpack``.

    Raises:
        ValueError: If ``format`` is not one of the supported output formats.
    """
    if format == OutputFormat.JSON:
        output_file = output_dir / "analysis.json"
        # pydantic v2's model_dump_json() has no ``separators`` keyword (passing
        # one raises TypeError) and already emits compact JSON by default.
        # UTF-8 is pinned so the file does not depend on the platform's locale
        # encoding, which can fail on non-ASCII source text.
        output_file.write_text(artifacts.model_dump_json(), encoding="utf-8")
        logger.info(f"Analysis saved to {output_file}")

    elif format == OutputFormat.MSGPACK:
        output_file = output_dir / "analysis.msgpack"
        msgpack_data = artifacts.to_msgpack_bytes()
        output_file.write_bytes(msgpack_data)
        logger.info(f"Analysis saved to {output_file}")
        logger.info(
            f"Compression ratio: {artifacts.get_compression_ratio():.1%} of JSON size"
        )

    else:
        # Fail loudly instead of silently writing nothing if a new format is
        # added to OutputFormat without a matching writer here.
        raise ValueError(f"Unsupported output format: {format!r}")
71108

72109

73110
app = typer.Typer(

codeanalyzer/schema/py_schema.py

Lines changed: 98 additions & 120 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,85 @@
2323
import inspect
2424
from pathlib import Path
2525
from typing import Any, Dict, List, Optional
26+
import gzip
2627

2728
from pydantic import BaseModel
2829
from typing_extensions import Literal
30+
import msgpack
31+
32+
33+
def msgpk(cls):
    """
    Class decorator that adds MessagePack serialization methods to Pydantic models.

    Adds methods:
    - to_msgpack_bytes() -> bytes: Serialize to gzip-compressed MessagePack
    - from_msgpack_bytes(data: bytes) -> cls: Deserialize from that binary format
    - to_msgpack_dict() -> dict: Convert to msgpack-compatible dict
    - from_msgpack_dict(data: dict) -> cls: Create instance from msgpack dict
    - get_msgpack_size() -> int: Size in bytes of the compressed serialization
    - get_compression_ratio() -> float: Compressed size divided by JSON size

    The decorated class is expected to provide ``model_dump()``,
    ``model_dump_json()`` and ``model_validate()``.

    Returns:
        The same class object, with the methods above attached.
    """

    def _prepare_for_serialization(obj: Any) -> Any:
        """Recursively convert ``obj`` into values MessagePack can encode."""
        if isinstance(obj, Path):
            return str(obj)
        if isinstance(obj, dict):
            # Keys are converted too, so Path-keyed mappings become str-keyed.
            return {
                _prepare_for_serialization(k): _prepare_for_serialization(v)
                for k, v in obj.items()
            }
        if isinstance(obj, list):
            return [_prepare_for_serialization(item) for item in obj]
        if isinstance(obj, tuple):
            return tuple(_prepare_for_serialization(item) for item in obj)
        if isinstance(obj, (set, frozenset)):
            # MessagePack has no set type; emit a list (in set iteration order).
            return [_prepare_for_serialization(item) for item in obj]
        if hasattr(obj, "model_dump"):  # Pydantic model
            return _prepare_for_serialization(obj.model_dump())
        return obj

    def to_msgpack_bytes(self) -> bytes:
        """Serialize the model to compact binary format using MessagePack + gzip."""
        data = _prepare_for_serialization(self.model_dump())
        msgpack_data = msgpack.packb(data, use_bin_type=True)
        # mtime=0 keeps the current timestamp out of the gzip header, so the
        # same input does not produce different bytes on every run.
        return gzip.compress(msgpack_data, mtime=0)

    @classmethod
    def from_msgpack_bytes(cls_obj, data: bytes):
        """Deserialize from MessagePack + gzip binary format."""
        decompressed_data = gzip.decompress(data)
        obj_dict = msgpack.unpackb(decompressed_data, raw=False)
        return cls_obj.model_validate(obj_dict)

    def to_msgpack_dict(self) -> dict:
        """Convert to msgpack-compatible dictionary format."""
        return _prepare_for_serialization(self.model_dump())

    @classmethod
    def from_msgpack_dict(cls_obj, data: dict):
        """Create instance from msgpack-compatible dictionary."""
        return cls_obj.model_validate(data)

    def get_msgpack_size(self) -> int:
        """Get the size of the gzip-compressed msgpack serialization in bytes."""
        return len(self.to_msgpack_bytes())

    def get_compression_ratio(self) -> float:
        """Get compressed msgpack size as a fraction of the UTF-8 JSON size.

        Returns 1.0 when the JSON serialization is empty, to avoid dividing by zero.
        """
        json_size = len(self.model_dump_json().encode("utf-8"))
        msgpack_gzip_size = self.get_msgpack_size()
        return msgpack_gzip_size / json_size if json_size > 0 else 1.0

    # Attach the helpers to the class being decorated.
    cls.to_msgpack_bytes = to_msgpack_bytes
    cls.from_msgpack_bytes = from_msgpack_bytes
    cls.to_msgpack_dict = to_msgpack_dict
    cls.from_msgpack_dict = from_msgpack_dict
    cls.get_msgpack_size = get_msgpack_size
    cls.get_compression_ratio = get_compression_ratio

    return cls
29105

30106

31107
def builder(cls):
@@ -92,26 +168,9 @@ def build(self):
92168

93169

94170
@builder
171+
@msgpk
95172
class PyImport(BaseModel):
96-
"""Represents a Python import statement.
97-
98-
Attributes:
99-
module (str): The name of the module being imported.
100-
name (str): The name of the imported entity (e.g., function, class).
101-
alias (Optional[str]): An optional alias for the imported entity.
102-
start_line (int): The line number where the import statement starts.
103-
end_line (int): The line number where the import statement ends.
104-
start_column (int): The starting column of the import statement.
105-
end_column (int): The ending column of the import statement.
106-
107-
Example:
108-
- import numpy as np will be represented as:
109-
PyImport(module="numpy", name="np", alias="np", start_line=1, end_line=1, start_column=0, end_column=16)
110-
- from math import sqrt will be represented as:
111-
PyImport(module="math", name="sqrt", alias=None, start_line=2, end_line=2, start_column=0, end_column=20
112-
- from os.path import join as path_join will be represented as:
113-
PyImport(module="os.path", name="path_join", alias="join", start_line=3, end_line=3, start_column=0, end_column=30)
114-
"""
173+
"""Represents a Python import statement."""
115174

116175
module: str
117176
name: str
@@ -123,18 +182,9 @@ class PyImport(BaseModel):
123182

124183

125184
@builder
185+
@msgpk
126186
class PyComment(BaseModel):
127-
"""
128-
Represents a Python comment.
129-
130-
Attributes:
131-
content (str): The actual comment string (without the leading '#').
132-
start_line (int): The line number where the comment starts.
133-
end_line (int): The line number where the comment ends (same as start_line for single-line comments).
134-
start_column (int): The starting column of the comment.
135-
end_column (int): The ending column of the comment.
136-
is_docstring (bool): Whether this comment is actually a docstring (triple-quoted string).
137-
"""
187+
"""Represents a Python comment."""
138188

139189
content: str
140190
start_line: int = -1
@@ -145,20 +195,9 @@ class PyComment(BaseModel):
145195

146196

147197
@builder
198+
@msgpk
148199
class PySymbol(BaseModel):
149-
"""
150-
Represents a symbol used or declared in Python code.
151-
152-
Attributes:
153-
name (str): The name of the symbol (e.g., 'x', 'self.x', 'os.path').
154-
scope (Literal['local', 'nonlocal', 'global', 'class', 'module']): The scope where the symbol is accessed.
155-
kind (Literal['variable', 'parameter', 'attribute', 'function', 'class', 'module']): The kind of symbol.
156-
type (Optional[str]): Inferred or annotated type, if available.
157-
qualified_name (Optional[str]): Fully qualified name (e.g., 'self.x', 'os.path.join').
158-
is_builtin (bool): Whether this is a Python builtin.
159-
lineno (int): Line number where the symbol is accessed or declared.
160-
col_offset (int): Column offset.
161-
"""
200+
"""Represents a symbol used or declared in Python code."""
162201

163202
name: str
164203
scope: Literal["local", "nonlocal", "global", "class", "module"]
@@ -171,11 +210,9 @@ class PySymbol(BaseModel):
171210

172211

173212
@builder
213+
@msgpk
174214
class PyVariableDeclaration(BaseModel):
175-
"""Represents a Python variable declaration.
176-
177-
Attributes:
178-
"""
215+
"""Represents a Python variable declaration."""
179216

180217
name: str
181218
type: Optional[str]
@@ -189,18 +226,9 @@ class PyVariableDeclaration(BaseModel):
189226

190227

191228
@builder
229+
@msgpk
192230
class PyCallableParameter(BaseModel):
193-
"""Represents a parameter of a Python callable (function/method).
194-
195-
Attributes:
196-
name (str): The name of the parameter.
197-
type (str): The type of the parameter.
198-
default_value (str): The default value of the parameter, if any.
199-
start_line (int): The line number where the parameter is defined.
200-
end_line (int): The line number where the parameter definition ends.
201-
start_column (int): The column number where the parameter starts.
202-
end_column (int): The column number where the parameter ends.
203-
"""
231+
"""Represents a parameter of a Python callable (function/method)."""
204232

205233
name: str
206234
type: Optional[str] = None
@@ -212,10 +240,9 @@ class PyCallableParameter(BaseModel):
212240

213241

214242
@builder
243+
@msgpk
215244
class PyCallsite(BaseModel):
216-
"""
217-
Represents a Python call site (function or method invocation) with contextual metadata.
218-
"""
245+
"""Represents a Python call site (function or method invocation) with contextual metadata."""
219246

220247
method_name: str
221248
receiver_expr: Optional[str] = None
@@ -231,26 +258,9 @@ class PyCallsite(BaseModel):
231258

232259

233260
@builder
261+
@msgpk
234262
class PyCallable(BaseModel):
235-
"""Represents a Python callable (function/method).
236-
237-
Attributes:
238-
name (str): The name of the callable.
239-
signature (str): The fully qualified name of the callable (e.g., module.function_name).
240-
docstring (PyComment): The docstring of the callable.
241-
decorators (List[str]): List of decorators applied to the callable.
242-
parameters (List[PyCallableParameter]): List of parameters for the callable.
243-
return_type (Optional[str]): The type of the return value, if specified.
244-
code (str): The actual code of the callable.
245-
start_line (int): The line number where the callable is defined.
246-
end_line (int): The line number where the callable definition ends.
247-
code_start_line (int): The line number where the code block starts.
248-
accessed_symbols (List[str]): Symbols accessed within the callable.
249-
call_sites (List[str]): Call sites of this callable.
250-
is_entrypoint (bool): Whether this callable is an entry point.
251-
local_variables (List[PyVariableDeclaration]): Local variables within the callable.
252-
cyclomatic_complexity (int): Cyclomatic complexity of the callable.
253-
"""
263+
"""Represents a Python callable (function/method)."""
254264

255265
name: str
256266
path: str
@@ -274,16 +284,9 @@ def __hash__(self) -> int:
274284

275285

276286
@builder
287+
@msgpk
277288
class PyClassAttribute(BaseModel):
278-
"""Represents a Python class attribute.
279-
280-
Attributes:
281-
name (str): The name of the attribute.
282-
type (str): The type of the attribute.
283-
docstring (PyComment): The docstring of the attribute.
284-
start_line (int): The line number where the attribute is defined.
285-
end_line (int): The line number where the attribute definition ends.
286-
"""
289+
"""Represents a Python class attribute."""
287290

288291
name: str
289292
type: Optional[str] = None
@@ -293,20 +296,9 @@ class PyClassAttribute(BaseModel):
293296

294297

295298
@builder
299+
@msgpk
296300
class PyClass(BaseModel):
297-
"""Represents a Python class.
298-
299-
Attributes:
300-
name (str): The name of the class.
301-
signature (str): The fully qualified name of the class (e.g., module.class_name).
302-
docstring (PyComment): The docstring of the class.
303-
base_classes (List[str]): List of base class names.
304-
methods (Dict[str, PyCallable]): Mapping of method names to their callable representations.
305-
attributes (Dict[str, PyClassAttribute]): Mapping of attribute names to their variable declarations.
306-
inner_classes (Dict[str, "PyClass"]): Mapping of inner class names to their class representations.
307-
start_line (int): The line number where the class definition starts.
308-
end_line (int): The line number where the class definition ends.
309-
"""
301+
"""Represents a Python class."""
310302

311303
name: str
312304
signature: str # e.g., module.class_name
@@ -325,18 +317,9 @@ def __hash__(self):
325317

326318

327319
@builder
320+
@msgpk
328321
class PyModule(BaseModel):
329-
"""Represents a Python module.
330-
331-
Attributes:
332-
file_path (str): The file path of the module.
333-
module_name (str): The name of the module (e.g., module.submodule).
334-
imports (List[PyImport]): List of import statements in the module.
335-
comments (List[PyComment]): List of comments in the module.
336-
classes (Dict[str, PyClass]): Mapping of class names to their class representations.
337-
functions (Dict[str, PyCallable]): Mapping of function names to their callable representations.
338-
variables (List[PyVariableDeclaration]): List of variable declarations in the module.
339-
"""
322+
"""Represents a Python module."""
340323

341324
file_path: str
342325
module_name: str
@@ -348,13 +331,8 @@ class PyModule(BaseModel):
348331

349332

350333
@builder
334+
@msgpk
351335
class PyApplication(BaseModel):
352-
"""Represents a Python application.
353-
354-
Attributes:
355-
name (str): The name of the application.
356-
version (str): The version of the application.
357-
description (str): A brief description of the application.
358-
"""
336+
"""Represents a Python application."""
359337

360338
symbol_table: dict[Path, PyModule]

0 commit comments

Comments
 (0)