diff --git a/scalene/scalene-gui/scalene-gui.js b/scalene/scalene-gui/scalene-gui.js index f2e5918e4..188d56e6e 100644 --- a/scalene/scalene-gui/scalene-gui.js +++ b/scalene/scalene-gui/scalene-gui.js @@ -12,6 +12,84 @@ function vsNavigate(filename, lineno) { } catch {} } +function generateScaleneOptimizedCodeRequest( + context, + sourceCode, + line, + recommendedLibraries = [], + includeGpuOptimizations = false, +) { + // Default high-performance libraries known for their efficiency + const defaultLibraries = [ + "NumPy", + "Scikit-learn", + "Pandas", + "TensorFlow", + "PyTorch", + ]; + const highPerformanceLibraries = [ + ...new Set([...defaultLibraries, ...recommendedLibraries]), + ]; + + let promptParts = [ + "Optimize the following Python code to make it more efficient WITHOUT CHANGING ITS RESULTS.\n\n", + context.trim(), + "\n# Start of code\n", + sourceCode.trim(), + "\n# End of code\n\n", + "Rewrite the above Python code from 'Start of code' to 'End of code', aiming for clear and simple optimizations. ", + "Your output should consist only of valid Python code, with brief explanatory comments prefaced with #. ", + "Include a detailed explanatory comment before the code, starting with '# Proposed optimization:'. ", + "Leverage high-performance native libraries, especially those utilizing GPU, for significant performance improvements. ", + "Consider using the following other libraries, if appropriate:\n", + highPerformanceLibraries.map((e) => " import " + e).join("\n") + "\n", + "Eliminate as many for loops, while loops, and list or dict comprehensions as possible, replacing them with vectorized equivalents. ", +// "Consider GPU utilization, memory consumption, and copy volume when using GPU-accelerated libraries. ", +// "Low GPU utilization and high copy volume indicate inefficient use of such libraries. ", + "Quantify the expected speedup in terms of orders of magnitude if possible. ", + "Fix any errors in the optimized code. ", +// "Consider the peak amount of memory used per line and CPU utilization for targeted optimization. ", + // "Note on CPU utilization: Low utilization in libraries known for multi-threading/multi-processing indicates inefficiency.\n\n", + ]; + + // Conditional inclusion of GPU optimizations + if (includeGpuOptimizations) { + promptParts.push( + "Use GPU-accelerated libraries whenever it would substantially increase performance. ", + ); + } + + // Performance Insights + promptParts.push( + "Consider the following insights gathered from the Scalene profiler for optimization:\n", + ); + const total_cpu_percent = line.n_cpu_percent_python + line.n_cpu_percent_c + line.n_sys_percent; + + promptParts.push(`- CPU time: percent spent in the Python interpreter: ${(100*line.n_cpu_percent_python/total_cpu_percent).toFixed(2)}%\n`); + promptParts.push(`- CPU time: percent spent executing native code: ${(100*line.n_cpu_percent_c/total_cpu_percent).toFixed(2)}%\n`); + promptParts.push(`- CPU time: percent of system time: ${(100*line.n_sys_percent/total_cpu_percent).toFixed(2)}%\n`); + // `- CPU utilization: ${performanceMetrics.cpu_utilization}. 
Low utilization with high-core count might indicate inefficient use of multi-threaded/multi-process libraries.\n`, + promptParts.push(`- Core utilization: ${(100*line.n_core_utilization/total_cpu_percent).toFixed(2)}%\n`); + // `- Peak memory per line: Focus on lines with high memory usage, specifically ${performanceMetrics.peak_memory_per_line}.\n`, + promptParts.push(`- Peak memory usage: ${line.n_peak_mb.toFixed(0)}MB (${(100 * line.n_python_fraction).toFixed(2)}% Python memory)\n`); + // `- Copy volume: ${performanceMetrics.copy_volume} MB. High volume indicates inefficient data handling with GPU libraries.\n`, + if (line.n_copy_mb_s > 1) { + promptParts.push(`- Megabytes copied per second by memcpy/strcpy: ${line.n_copy_mb_s.toFixed(2)}\n`); + } + if (includeGpuOptimizations) { + // ` - GPU utilization: ${performanceMetrics.gpu_utilization}%. Low utilization indicates potential inefficiencies in GPU-accelerated library use.\n` + promptParts.push(`- GPU percent utilization: ${(100 * line.n_gpu_percent).toFixed(2)}%\n`); + // ` - GPU memory usage: ${performanceMetrics.gpu_memory} MB. Optimize to reduce unnecessary GPU memory consumption.\n` + // TODO GPU memory + } + promptParts.push(`Optimized code:`); + return promptParts.join(""); +} + +const recommendedLibraries = ["Cython", "Dask"]; // Add any domain-specific libraries here + +// const prompt = generateScaleneOptimizedCodeRequest(context, sourceCode, line, recommendedLibraries, true); + function extractPythonCodeBlock(markdown) { // Pattern to match code blocks optionally tagged with "python" // - ``` optionally followed by "python" @@ -354,9 +432,22 @@ function countSpaces(str) { return 0; } -async function optimizeCode(imports, code, context) { +async function optimizeCode(imports, code, line, context) { // Tailor prompt to request GPU optimizations or not. - const useGPUs = document.getElementById("use-gpu-checkbox").checked; // globalThis.profile.gpu; + const useGPUs = document.getElementById("use-gpu-checkbox").checked; // globalThis.profile.gpu; + + let recommendedLibraries = ["sklearn"]; + if (useGPUs) { + // Suggest cupy if we are using the GPU. + recommendedLibraries.push("cupy"); + } else { + // Suggest numpy otherwise. + recommendedLibraries.push("numpy"); + } + // TODO: remove anything already imported in imports + + const bigPrompt = generateScaleneOptimizedCodeRequest(context, code, line, recommendedLibraries, useGPUs); + const useGPUstring = useGPUs ? " or the GPU " : " "; // Check for a valid API key. // TODO: Add checks for Amazon / local @@ -411,10 +502,9 @@ async function optimizeCode(imports, code, context) { prompt = memoryEfficiencyPrompt; } - // const prompt = `Below is some Python code to optimize, from "Start of code" to "End of code":\n\n# Start of code\n\n${code}\n\n# End of code\n\nRewrite the above Python code to make it more efficient without changing the results. Assume the code has already executed these imports. Do NOT include them in the optimized code:\n\n${imports}\n\nUse fast native libraries if that would make it faster than pure Python. Your output should only consist of valid Python code. Output the resulting Python with brief explanations only included as comments prefaced with #. Include a detailed explanatory comment before the code, starting with the text "# Proposed optimization:". Make the code as clear and simple as possible, while also making it as fast and memory-efficient as possible. 
Use vectorized operations${useGPUstring}whenever it would substantially increase performance, and quantify the speedup in terms of orders of magnitude. If the performance is not likely to increase, leave the code unchanged. Check carefully by generating inputs to see that the output is identical for both the original and optimized versions. Correctly-optimized code:`; - - // const prev_prompt = `Below is some Python code to optimize:\n\n${code}\n\nRewrite the above Python code to make it more efficient while keeping the same semantics. Use fast native libraries if that would make it faster than pure Python. Your output should only consist of valid Python code. Output only the resulting Python with brief explanations only included as comments prefaced with #. Include a detailed explanatory comment before the code, starting with the text "# Proposed optimization:". Make the code as clear and simple as possible, while also making it as fast and memory-efficient as possible. Use vectorized operations or the GPU whenever it would substantially increase performance, and try to quantify the speedup in terms of orders of magnitude. If the performance is not likely to increase, leave the code unchanged. Your output should only consist of legal Python code. Format all comments to be less than 40 columns wide:\n\n`; - + // Just use big prompt maybe FIXME + prompt = bigPrompt; + // Use number of words in the original code as a proxy for the number of tokens. const numWords = code.match(/\b\w+\b/g).length; @@ -429,10 +519,11 @@ async function optimizeCode(imports, code, context) { return extractCode(result); } case "local": { - console.log("Running " + document.getElementById("service-select").value); - console.log(optimizePerformancePrompt_ollama); + console.log("Running " + document.getElementById("service-select").value); + console.log(prompt); +// console.log(optimizePerformancePrompt_ollama); const result = await sendPromptToOllama( - optimizePerformancePrompt_ollama, + prompt, // optimizePerformancePrompt_ollama, Math.max(numWords * 4, 500), document.getElementById("language-model-local").value, document.getElementById("local-ip").value, @@ -446,9 +537,9 @@ async function optimizeCode(imports, code, context) { } case "amazon": { console.log("Running " + document.getElementById("service-select").value); - console.log(optimizePerformancePrompt_ollama); + console.log(prompt); // optimizePerformancePrompt_ollama); const result = await sendPromptToAmazon( - optimizePerformancePrompt_ollama, + prompt, // optimizePerformancePrompt_ollama, Math.max(numWords * 4, 500), ); console.log( @@ -459,24 +550,24 @@ async function optimizeCode(imports, code, context) { } } -function proposeOptimizationRegion(filename, file_number, lineno) { - proposeOptimization(filename, file_number, lineno, { regions: true }); +function proposeOptimizationRegion(filename, file_number, line) { + proposeOptimization(filename, file_number, JSON.parse(decodeURIComponent(line)), { regions: true }); } -function proposeOptimizationLine(filename, file_number, lineno) { - proposeOptimization(filename, file_number, lineno, { regions: false }); +function proposeOptimizationLine(filename, file_number, line) { + proposeOptimization(filename, file_number, JSON.parse(decodeURIComponent(line)), { regions: false }); } -function proposeOptimization(filename, file_number, lineno, params) { +function proposeOptimization(filename, file_number, line, params) { filename = unescape(filename); const useRegion = params["regions"]; const prof = 
globalThis.profile; const this_file = prof.files[filename].lines; const imports = prof.files[filename].imports.join("\n"); - const start_region_line = this_file[lineno - 1]["start_region_line"]; - const end_region_line = this_file[lineno - 1]["end_region_line"]; + const start_region_line = this_file[line.lineno - 1]["start_region_line"]; + const end_region_line = this_file[line.lineno - 1]["end_region_line"]; let context; - const code_line = this_file[lineno - 1]["line"]; + const code_line = this_file[line.lineno - 1]["line"]; let code_region; if (useRegion) { code_region = this_file @@ -493,7 +584,10 @@ function proposeOptimization(filename, file_number, lineno, params) { } else { code_region = code_line; context = this_file - .slice(Math.max(0, lineno - 10), Math.min(lineno - 1, this_file.length)) + .slice( + Math.max(0, line.lineno - 10), + Math.min(line.lineno - 1, this_file.length), + ) .map((e) => e["line"]) .join(""); } @@ -501,7 +595,7 @@ function proposeOptimization(filename, file_number, lineno, params) { let leadingSpaceCount = countSpaces(code_line) + 3; // including the lightning bolt and explosion let indent = WhiteLightning + WhiteExplosion + " ".repeat(leadingSpaceCount - 1); - const elt = document.getElementById(`code-${file_number}-${lineno}`); + const elt = document.getElementById(`code-${file_number}-${line.lineno}`); (async () => { // TODO: check Amazon credentials const service = document.getElementById("service-select").value; @@ -530,7 +624,7 @@ function proposeOptimization(filename, file_number, lineno, params) { } } elt.innerHTML = `${indent}working...`; - let message = await optimizeCode(imports, code_region, context); + let message = await optimizeCode(imports, code_region, line, context); if (!message) { elt.innerHTML = ""; return; @@ -546,8 +640,8 @@ function proposeOptimization(filename, file_number, lineno, params) { ) .join("
"); // Display the proposed optimization, with click-to-copy functionality. - elt.innerHTML = `
${formattedCode}`; - thisElt = document.getElementById(`opt-${file_number}-${lineno}`); + elt.innerHTML = `
${formattedCode}`; + thisElt = document.getElementById(`opt-${file_number}-${line.lineno}`); thisElt.addEventListener("click", async (e) => { await copyOnClick(e, message); // After copying, briefly change the cursor back to the default to provide some visual feedback.. @@ -1322,12 +1416,30 @@ function makeProfileLine( const codeLine = Prism.highlight(line.line, Prism.languages.python, "python"); s += ``; - if (propose_optimizations && showExplosion) { - s += ` newLine.n_peak_mb) { + newLine.n_peak_mb = currline.n_peak_mb; + newLine.n_python_fraction = currline.n_python_fraction; + } + // TODO: + // GPU memory + newLine.n_core_utilization += (currline.n_cpu_percent_python + currline.n_cpu_percent_c) * currline.n_core_utilization; // weigh by percentage + } + newLine.n_copy_mb_s = mb_copied / prof.elapsed_time_sec; + s += `${regionOptimizationString}`; + )}', ${file_number}, '${encodeURIComponent(JSON.stringify(newLine))}'); event.preventDefault()">${regionOptimizationString}`; } else { s += regionOptimizationString; } @@ -1335,12 +1447,9 @@ function makeProfileLine( const lineOptimizationString = propose_optimizations ? `${Lightning}` : `${WhiteLightning}`; - if (propose_optimizations) { - s += `${lineOptimizationString}`; + if (propose_optimizations) { + s += `${lineOptimizationString}`; + // s += `${lineOptimizationString}`; } else { s += lineOptimizationString; } diff --git a/scalene/scalene_json.py b/scalene/scalene_json.py index 2d8afff9e..07d5264ff 100644 --- a/scalene/scalene_json.py +++ b/scalene/scalene_json.py @@ -286,6 +286,14 @@ def output_profiles( if result: program = Filename("[" + result.group(1) + "]") + # Process the stacks to normalize by total number of CPU samples. + for stk in stats.stacks.keys(): + (count, python_time, c_time, cpu_samples) = stats.stacks[stk] + stats.stacks[stk] = (count, + python_time / stats.total_cpu_samples, + c_time / stats.total_cpu_samples, + cpu_samples / stats.total_cpu_samples) + # Convert stacks into a representation suitable for JSON dumping. stks = [] for stk in stats.stacks.keys(): diff --git a/scalene/scalene_profiler.py b/scalene/scalene_profiler.py index a8a90e89a..b3c62f981 100644 --- a/scalene/scalene_profiler.py +++ b/scalene/scalene_profiler.py @@ -7,8 +7,7 @@ See the paper "docs/osdi23-berger.pdf" in this repository for technical details on Scalene's design. - by Emery Berger - https://emeryberger.com + by Emery Berger, Sam Stern, and Juan Altmayer Pizzorno usage: scalene test/testme.py usage help: scalene --help @@ -804,7 +803,7 @@ def cpu_signal_handler( def output_profile(program_args: Optional[List[str]] = None) -> bool: """Output the profile. 
Returns true iff there was any info reported the profile.""" # sourcery skip: inline-immediately-returned-variable - # print(Scalene.flamegraph_format(Scalene.__stats.stacks)) + # print(flamegraph_format(Scalene.__stats.stacks)) if Scalene.__args.json: json_output = Scalene.__json.output_profiles( Scalene.__program_being_profiled, @@ -1008,16 +1007,21 @@ def process_cpu_sample( main_thread_frame = new_frames[0][0] - if Scalene.__args.stacks: - add_stack( - main_thread_frame, Scalene.should_trace, Scalene.__stats.stacks - ) - average_python_time = python_time / total_frames average_c_time = c_time / total_frames average_gpu_time = gpu_time / total_frames average_cpu_time = (python_time + c_time) / total_frames + if Scalene.__args.stacks: + add_stack( + main_thread_frame, + Scalene.should_trace, + Scalene.__stats.stacks, + average_python_time, + average_c_time, + average_cpu_time + ) + # First, handle the main thread. Scalene.enter_function_meta(main_thread_frame, Scalene.__stats) fname = Filename(main_thread_frame.f_code.co_filename) @@ -1044,7 +1048,12 @@ def process_cpu_sample( for (frame, tident, orig_frame) in new_frames: if frame == main_thread_frame: continue - add_stack(frame, Scalene.should_trace, Scalene.__stats.stacks) + add_stack(frame, + Scalene.should_trace, + Scalene.__stats.stacks, + average_python_time, + average_c_time, + average_cpu_time) # In a thread. fname = Filename(frame.f_code.co_filename) @@ -1494,8 +1503,9 @@ def should_trace(filename: Filename, func: str) -> bool: if not Scalene.__args.profile_all: for n in sysconfig.get_scheme_names(): for p in sysconfig.get_path_names(): + the_path = sysconfig.get_path(p, n) libdir = str( - pathlib.Path(sysconfig.get_path(p, n)).resolve() + pathlib.Path(the_path).resolve() ) if libdir in resolved_filename: return False diff --git a/scalene/scalene_statistics.py b/scalene/scalene_statistics.py index 7d688a5b0..14d0b6031 100644 --- a/scalene/scalene_statistics.py +++ b/scalene/scalene_statistics.py @@ -40,7 +40,7 @@ def __init__(self) -> None: self.alloc_samples: int = 0 # full stacks taken during CPU samples, together with number of hits - self.stacks: Dict[Tuple[Any], int] = defaultdict(int) + self.stacks: Dict[Tuple[Any], Any] = defaultdict(None) # CPU samples for each location in the program # spent in the interpreter diff --git a/scalene/scalene_utility.py b/scalene/scalene_utility.py index 867495905..6f9498fcc 100644 --- a/scalene/scalene_utility.py +++ b/scalene/scalene_utility.py @@ -42,7 +42,10 @@ def __str__(self) -> str: def add_stack( frame: FrameType, should_trace: Callable[[Filename, str], bool], - stacks: Dict[Any, int], + stacks: Dict[Any, Any], + python_time: float, + c_time: float, + cpu_samples: float ) -> None: """Add one to the stack starting from this frame.""" stk: List[Tuple[str, str, int]] = list() @@ -51,7 +54,15 @@ def add_stack( if should_trace(Filename(f.f_code.co_filename), f.f_code.co_name): stk.insert(0, (f.f_code.co_filename, f.f_code.co_name, f.f_lineno)) f = f.f_back - stacks[tuple(stk)] += 1 + if tuple(stk) not in stacks: + stacks[tuple(stk)] = (1, python_time, c_time, cpu_samples) + else: + (prev_count, prev_python_time, prev_c_time, prev_cpu_samples) = stacks[tuple(stk)] + stacks[tuple(stk)] = (prev_count + 1, + prev_python_time + python_time, + prev_c_time + c_time, + prev_cpu_samples + cpu_samples) + # stacks[tuple(stk)] += 1 def on_stack( @@ -98,14 +109,14 @@ def get_fully_qualified_name(frame: FrameType) -> Filename: return fn_name -def flamegraph_format(stacks: Dict[Tuple[Any], int]) -> 
str: +def flamegraph_format(stacks: Dict[Tuple[Any], Any]) -> str: """Converts stacks to a string suitable for input to Brendan Gregg's flamegraph.pl script.""" output = "" for stk in stacks.keys(): for item in stk: (fname, fn_name, lineno) = item output += f"{fname} {fn_name}:{lineno};" - output += " " + str(stacks[stk]) + output += " " + str(stacks[stk][0]) output += "\n" return output
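
For reference, the stack bookkeeping introduced by the scalene_utility.py and scalene_json.py hunks boils down to the sketch below. This is illustrative only: record_sample and normalize are hypothetical stand-ins for add_stack and the normalization loop in output_profiles, not the shipped code.

    from typing import Dict, Tuple

    # A stack is a tuple of (filename, function name, line number) frames.
    Stack = Tuple[Tuple[str, str, int], ...]
    # Each stack now maps to (hit count, Python time, C time, CPU samples)
    # instead of a bare hit count.
    StackStats = Tuple[int, float, float, float]

    def record_sample(stacks: Dict[Stack, StackStats], stk: Stack,
                      python_time: float, c_time: float,
                      cpu_samples: float) -> None:
        # Accumulate per-stack totals on every CPU sample.
        count, py, c, cpu = stacks.get(stk, (0, 0.0, 0.0, 0.0))
        stacks[stk] = (count + 1, py + python_time, c + c_time,
                       cpu + cpu_samples)

    def normalize(stacks: Dict[Stack, StackStats],
                  total_cpu_samples: float) -> None:
        # Express the accumulated times as fractions of all CPU samples,
        # leaving the hit count untouched.
        for stk, (count, py, c, cpu) in stacks.items():
            stacks[stk] = (count, py / total_cpu_samples,
                           c / total_cpu_samples, cpu / total_cpu_samples)

Keeping the raw hit count in slot 0 is what lets flamegraph_format continue to emit str(stacks[stk][0]) as the sample weight, while the remaining fields are written out as fractions of total CPU samples in the JSON profile.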