[WIP] Feedback-driven prompting (plasma-umass#772)
* Extended stacks with Python and C time %.

* Prompt assembly.

* Updated flamegraph functionality, added credits.

* Updated flamegraph functionality.

* Removed old prompt stuff.

* Minor mod for clarity.

* Removed numba.
emeryberger authored Feb 18, 2024
1 parent 98210c7 commit 9b456c9
Showing 5 changed files with 188 additions and 50 deletions.
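The "Prompt assembly" and "Extended stacks" bullets above describe the core feedback loop: per-line measurements from the profiler are rendered into the LLM optimization prompt so the model can target the actual bottleneck. A minimal Python sketch of that idea (field names follow Scalene's JSON profile; the function and its assembly are illustrative, not the shipped code, which is the JavaScript in scalene-gui.js below):

    # Hedged sketch: render per-line profiler metrics into the prompt text.
    def build_feedback_prompt(source_code: str, line: dict) -> str:
        total = (line["n_cpu_percent_python"]
                 + line["n_cpu_percent_c"]
                 + line["n_sys_percent"])
        insights = [
            f"- CPU time in the Python interpreter: {100 * line['n_cpu_percent_python'] / total:.2f}%",
            f"- CPU time in native code: {100 * line['n_cpu_percent_c'] / total:.2f}%",
            f"- Peak memory usage: {line['n_peak_mb']:.0f}MB",
        ]
        return ("Optimize the following Python code WITHOUT CHANGING ITS RESULTS.\n\n"
                + source_code + "\n\nProfiler insights:\n"
                + "\n".join(insights) + "\nOptimized code:")

    print(build_feedback_prompt(
        "x = [i * i for i in range(10**6)]",
        {"n_cpu_percent_python": 12.0, "n_cpu_percent_c": 3.0,
         "n_sys_percent": 1.0, "n_peak_mb": 42.0}))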
179 changes: 144 additions & 35 deletions scalene/scalene-gui/scalene-gui.js
@@ -12,6 +12,84 @@ function vsNavigate(filename, lineno) {
} catch {}
}

function generateScaleneOptimizedCodeRequest(
context,
sourceCode,
line,
recommendedLibraries = [],
includeGpuOptimizations = false,
) {
// Default high-performance libraries known for their efficiency
const defaultLibraries = [
"NumPy",
"Scikit-learn",
"Pandas",
"TensorFlow",
"PyTorch",
];
const highPerformanceLibraries = [
...new Set([...defaultLibraries, ...recommendedLibraries]),
];

let promptParts = [
"Optimize the following Python code to make it more efficient WITHOUT CHANGING ITS RESULTS.\n\n",
context.trim(),
"\n# Start of code\n",
sourceCode.trim(),
"\n# End of code\n\n",
"Rewrite the above Python code from 'Start of code' to 'End of code', aiming for clear and simple optimizations. ",
"Your output should consist only of valid Python code, with brief explanatory comments prefaced with #. ",
"Include a detailed explanatory comment before the code, starting with '# Proposed optimization:'. ",
"Leverage high-performance native libraries, especially those utilizing GPU, for significant performance improvements. ",
"Consider using the following other libraries, if appropriate:\n",
highPerformanceLibraries.map((e) => " import " + e).join("\n") + "\n",
"Eliminate as many for loops, while loops, and list or dict comprehensions as possible, replacing them with vectorized equivalents. ",
// "Consider GPU utilization, memory consumption, and copy volume when using GPU-accelerated libraries. ",
// "Low GPU utilization and high copy volume indicate inefficient use of such libraries. ",
"Quantify the expected speedup in terms of orders of magnitude if possible. ",
"Fix any errors in the optimized code. ",
// "Consider the peak amount of memory used per line and CPU utilization for targeted optimization. ",
// "Note on CPU utilization: Low utilization in libraries known for multi-threading/multi-processing indicates inefficiency.\n\n",
];

// Conditional inclusion of GPU optimizations
if (includeGpuOptimizations) {
promptParts.push(
"Use GPU-accelerated libraries whenever it would substantially increase performance. ",
);
}

// Performance Insights
promptParts.push(
"Consider the following insights gathered from the Scalene profiler for optimization:\n",
);
const total_cpu_percent = line.n_cpu_percent_python + line.n_cpu_percent_c + line.n_sys_percent;

promptParts.push(`- CPU time: percent spent in the Python interpreter: ${(100*line.n_cpu_percent_python/total_cpu_percent).toFixed(2)}%\n`);
promptParts.push(`- CPU time: percent spent executing native code: ${(100*line.n_cpu_percent_c/total_cpu_percent).toFixed(2)}%\n`);
promptParts.push(`- CPU time: percent of system time: ${(100*line.n_sys_percent/total_cpu_percent).toFixed(2)}%\n`);
// `- CPU utilization: ${performanceMetrics.cpu_utilization}. Low utilization with high-core count might indicate inefficient use of multi-threaded/multi-process libraries.\n`,
promptParts.push(`- Core utilization: ${(100*line.n_core_utilization/total_cpu_percent).toFixed(2)}%\n`);
// `- Peak memory per line: Focus on lines with high memory usage, specifically ${performanceMetrics.peak_memory_per_line}.\n`,
promptParts.push(`- Peak memory usage: ${line.n_peak_mb.toFixed(0)}MB (${(100 * line.n_python_fraction).toFixed(2)}% Python memory)\n`);
// `- Copy volume: ${performanceMetrics.copy_volume} MB. High volume indicates inefficient data handling with GPU libraries.\n`,
if (line.n_copy_mb_s > 1) {
promptParts.push(`- Megabytes copied per second by memcpy/strcpy: ${line.n_copy_mb_s.toFixed(2)}\n`);
}
if (includeGpuOptimizations) {
// ` - GPU utilization: ${performanceMetrics.gpu_utilization}%. Low utilization indicates potential inefficiencies in GPU-accelerated library use.\n`
promptParts.push(`- GPU percent utilization: ${(100 * line.n_gpu_percent).toFixed(2)}%\n`);
// ` - GPU memory usage: ${performanceMetrics.gpu_memory} MB. Optimize to reduce unnecessary GPU memory consumption.\n`
// TODO GPU memory
}
promptParts.push(`Optimized code:`);
return promptParts.join("");
}
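Note that the insights above report each CPU component as a share of the line's own total, so the three shares always sum to 100%. A quick check of the arithmetic with illustrative numbers:

    n_cpu_percent_python, n_cpu_percent_c, n_sys_percent = 12.0, 3.0, 1.0
    total_cpu_percent = n_cpu_percent_python + n_cpu_percent_c + n_sys_percent  # 16.0
    print(f"{100 * n_cpu_percent_python / total_cpu_percent:.2f}%")  # 75.00%
    print(f"{100 * n_cpu_percent_c / total_cpu_percent:.2f}%")       # 18.75%
    print(f"{100 * n_sys_percent / total_cpu_percent:.2f}%")         # 6.25%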

const recommendedLibraries = ["Cython", "Dask"]; // Add any domain-specific libraries here

// const prompt = generateScaleneOptimizedCodeRequest(context, sourceCode, line, recommendedLibraries, true);

function extractPythonCodeBlock(markdown) {
// Pattern to match code blocks optionally tagged with "python"
// - ``` optionally followed by "python"
@@ -354,9 +432,22 @@ function countSpaces(str) {
return 0;
}

-async function optimizeCode(imports, code, context) {
+async function optimizeCode(imports, code, line, context) {
  // Tailor prompt to request GPU optimizations or not.
  const useGPUs = document.getElementById("use-gpu-checkbox").checked; // globalThis.profile.gpu;

let recommendedLibraries = ["sklearn"];
if (useGPUs) {
// Suggest cupy if we are using the GPU.
recommendedLibraries.push("cupy");
} else {
// Suggest numpy otherwise.
recommendedLibraries.push("numpy");
}
// TODO: remove anything already imported in imports

const bigPrompt = generateScaleneOptimizedCodeRequest(context, code, line, recommendedLibraries, useGPUs);

const useGPUstring = useGPUs ? " or the GPU " : " ";
// Check for a valid API key.
// TODO: Add checks for Amazon / local
@@ -411,10 +502,9 @@ async function optimizeCode(imports, code, context) {
prompt = memoryEfficiencyPrompt;
}

-  // const prompt = `Below is some Python code to optimize, from "Start of code" to "End of code":\n\n# Start of code\n\n${code}\n\n# End of code\n\nRewrite the above Python code to make it more efficient without changing the results. Assume the code has already executed these imports. Do NOT include them in the optimized code:\n\n${imports}\n\nUse fast native libraries if that would make it faster than pure Python. Your output should only consist of valid Python code. Output the resulting Python with brief explanations only included as comments prefaced with #. Include a detailed explanatory comment before the code, starting with the text "# Proposed optimization:". Make the code as clear and simple as possible, while also making it as fast and memory-efficient as possible. Use vectorized operations${useGPUstring}whenever it would substantially increase performance, and quantify the speedup in terms of orders of magnitude. If the performance is not likely to increase, leave the code unchanged. Check carefully by generating inputs to see that the output is identical for both the original and optimized versions. Correctly-optimized code:`;

-  // const prev_prompt = `Below is some Python code to optimize:\n\n${code}\n\nRewrite the above Python code to make it more efficient while keeping the same semantics. Use fast native libraries if that would make it faster than pure Python. Your output should only consist of valid Python code. Output only the resulting Python with brief explanations only included as comments prefaced with #. Include a detailed explanatory comment before the code, starting with the text "# Proposed optimization:". Make the code as clear and simple as possible, while also making it as fast and memory-efficient as possible. Use vectorized operations or the GPU whenever it would substantially increase performance, and try to quantify the speedup in terms of orders of magnitude. If the performance is not likely to increase, leave the code unchanged. Your output should only consist of legal Python code. Format all comments to be less than 40 columns wide:\n\n`;

+  // Just use big prompt maybe FIXME
+  prompt = bigPrompt;

// Use number of words in the original code as a proxy for the number of tokens.
const numWords = code.match(/\b\w+\b/g).length;
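The word count serves as a rough token proxy: the reply budget is about four tokens per source word, with a 500-token floor. The same heuristic in Python form, for illustration (the shipped code is the JavaScript above):

    import re

    def token_budget(code: str) -> int:
        # ~4 output tokens per source word, with a 500-token floor
        num_words = len(re.findall(r"\b\w+\b", code))
        return max(num_words * 4, 500)

    assert token_budget("x = x + 1") == 500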

@@ -429,10 +519,11 @@ async function optimizeCode(imports, code, context) {
return extractCode(result);
}
case "local": {
      console.log("Running " + document.getElementById("service-select").value);
-     console.log(optimizePerformancePrompt_ollama);
+     console.log(prompt);
+     // console.log(optimizePerformancePrompt_ollama);
      const result = await sendPromptToOllama(
-       optimizePerformancePrompt_ollama,
+       prompt, // optimizePerformancePrompt_ollama,
Math.max(numWords * 4, 500),
document.getElementById("language-model-local").value,
document.getElementById("local-ip").value,
@@ -446,9 +537,9 @@ async function optimizeCode(imports, code, context) {
}
case "amazon": {
console.log("Running " + document.getElementById("service-select").value);
-     console.log(optimizePerformancePrompt_ollama);
+     console.log(prompt); // optimizePerformancePrompt_ollama);
      const result = await sendPromptToAmazon(
-       optimizePerformancePrompt_ollama,
+       prompt, // optimizePerformancePrompt_ollama,
Math.max(numWords * 4, 500),
);
console.log(
@@ -459,24 +550,24 @@ async function optimizeCode(imports, code, context) {
}
}

-function proposeOptimizationRegion(filename, file_number, lineno) {
-  proposeOptimization(filename, file_number, lineno, { regions: true });
+function proposeOptimizationRegion(filename, file_number, line) {
+  proposeOptimization(filename, file_number, JSON.parse(decodeURIComponent(line)), { regions: true });
}

-function proposeOptimizationLine(filename, file_number, lineno) {
-  proposeOptimization(filename, file_number, lineno, { regions: false });
+function proposeOptimizationLine(filename, file_number, line) {
+  proposeOptimization(filename, file_number, JSON.parse(decodeURIComponent(line)), { regions: false });
}
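Instead of a bare line number, these handlers now receive the whole per-line metrics record, URI-encoded as JSON so it survives embedding in an HTML onclick attribute. The same round-trip in Python form, for illustration:

    import json
    from urllib.parse import quote, unquote

    line = {"lineno": 17, "n_cpu_percent_python": 12.0}
    encoded = quote(json.dumps(line))  # safe to embed in an HTML attribute
    assert json.loads(unquote(encoded)) == line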

-function proposeOptimization(filename, file_number, lineno, params) {
+function proposeOptimization(filename, file_number, line, params) {
filename = unescape(filename);
const useRegion = params["regions"];
const prof = globalThis.profile;
const this_file = prof.files[filename].lines;
const imports = prof.files[filename].imports.join("\n");
-  const start_region_line = this_file[lineno - 1]["start_region_line"];
-  const end_region_line = this_file[lineno - 1]["end_region_line"];
+  const start_region_line = this_file[line.lineno - 1]["start_region_line"];
+  const end_region_line = this_file[line.lineno - 1]["end_region_line"];
  let context;
-  const code_line = this_file[lineno - 1]["line"];
+  const code_line = this_file[line.lineno - 1]["line"];
let code_region;
if (useRegion) {
code_region = this_file
@@ -493,15 +584,18 @@ function proposeOptimization(filename, file_number, lineno, params) {
} else {
code_region = code_line;
context = this_file
-      .slice(Math.max(0, lineno - 10), Math.min(lineno - 1, this_file.length))
+      .slice(
+        Math.max(0, line.lineno - 10),
+        Math.min(line.lineno - 1, this_file.length),
+      )
.map((e) => e["line"])
.join("");
}
// Count the number of leading spaces to match indentation level on output
let leadingSpaceCount = countSpaces(code_line) + 3; // including the lightning bolt and explosion
let indent =
WhiteLightning + WhiteExplosion + " ".repeat(leadingSpaceCount - 1);
-  const elt = document.getElementById(`code-${file_number}-${lineno}`);
+  const elt = document.getElementById(`code-${file_number}-${line.lineno}`);
(async () => {
// TODO: check Amazon credentials
const service = document.getElementById("service-select").value;
@@ -530,7 +624,7 @@ function proposeOptimization(filename, file_number, lineno, params) {
}
}
elt.innerHTML = `<em>${indent}working...</em>`;
-    let message = await optimizeCode(imports, code_region, context);
+    let message = await optimizeCode(imports, code_region, line, context);
if (!message) {
elt.innerHTML = "";
return;
@@ -546,8 +640,8 @@ function proposeOptimization(filename, file_number, lineno, params) {
)
.join("<br />");
// Display the proposed optimization, with click-to-copy functionality.
-    elt.innerHTML = `<hr><span title="click to copy" style="cursor: copy" id="opt-${file_number}-${lineno}">${formattedCode}</span>`;
-    thisElt = document.getElementById(`opt-${file_number}-${lineno}`);
+    elt.innerHTML = `<hr><span title="click to copy" style="cursor: copy" id="opt-${file_number}-${line.lineno}">${formattedCode}</span>`;
+    thisElt = document.getElementById(`opt-${file_number}-${line.lineno}`);
thisElt.addEventListener("click", async (e) => {
await copyOnClick(e, message);
      // After copying, briefly change the cursor back to the default to provide some visual feedback.
@@ -1322,25 +1416,40 @@ function makeProfileLine(

const codeLine = Prism.highlight(line.line, Prism.languages.python, "python");
s += `<td style="height:10" align="left" bgcolor="whitesmoke" style="vertical-align: middle" data-sort="${line.lineno}">`;
-  if (propose_optimizations && showExplosion) {
-    s += `<span style="vertical-align: middle; cursor: pointer" title="Propose an optimization for the entire region starting here." onclick="proposeOptimizationRegion('${escape(
-      filename,
-    )}', ${file_number}, ${parseInt(
-      line.lineno,
-    )}); event.preventDefault()">${regionOptimizationString}</span>`;
+  let newLine = structuredClone(line);
+  // TODO: verify that this isn't double counting anything
+  if (propose_optimizations && showExplosion) {
+    // Construct a new line corresponding to this region.
+    let mb_copied = 0;
+    for (let lineno = start_region_line; lineno < end_region_line; lineno++) {
+      currline = prof["files"][filename]["lines"][lineno];
+      mb_copied += currline.n_copy_mb * prof.elapsed_time_sec;
+      newLine.n_cpu_percent_python += currline.n_cpu_percent_python;
+      newLine.n_cpu_percent_c += currline.n_cpu_percent_c;
+      newLine.n_sys_percent += currline.n_sys_percent;
+      newLine.n_gpu_percent += currline.n_gpu_percent;
+      if (currline.n_peak_mb > newLine.n_peak_mb) {
+        newLine.n_peak_mb = currline.n_peak_mb;
+        newLine.n_python_fraction = currline.n_python_fraction;
+      }
+      // TODO: GPU memory
+      newLine.n_core_utilization += (currline.n_cpu_percent_python + currline.n_cpu_percent_c) * currline.n_core_utilization; // weigh by percentage
+    }
+    newLine.n_copy_mb_s = mb_copied / prof.elapsed_time_sec;
+    s += `<span style="vertical-align: middle; cursor: pointer" title="Propose an optimization for the entire region starting here." onclick="proposeOptimizationRegion('${escape(
+      filename,
+    )}', ${file_number}, '${encodeURIComponent(JSON.stringify(newLine))}'); event.preventDefault()">${regionOptimizationString}</span>`;
} else {
s += regionOptimizationString;
}

const lineOptimizationString = propose_optimizations
? `${Lightning}`
: `${WhiteLightning}`;
  if (propose_optimizations) {
-    s += `<span style="vertical-align: middle; cursor: pointer" title="Propose an optimization for this line." onclick="proposeOptimizationLine('${escape(
-      filename,
-    )}', ${file_number}, ${parseInt(
-      line.lineno,
-    )}); event.preventDefault()">${lineOptimizationString}</span>`;
+    s += `<span style="vertical-align: middle; cursor: pointer" title="Propose an optimization for this line." onclick="proposeOptimizationLine('${escape(filename)}', ${file_number}, '${encodeURIComponent(JSON.stringify(line))}'); event.preventDefault()">${lineOptimizationString}</span>`;
+    // s += `<span style="vertical-align: middle; cursor: pointer" title="Propose an optimization for this line." onclick="proposeOptimizationLine('${escape(filename,)}', ${file_number}, ${JSON.stringify(line)}); event.preventDefault()">${lineOptimizationString}</span>`;
} else {
s += lineOptimizationString;
}
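For a region, the handler synthesizes a single record: percentages add across lines, peak memory takes the maximum, and core utilization accumulates weighted by each line's CPU share; the later division by total CPU percent in generateScaleneOptimizedCodeRequest then yields approximately a CPU-weighted mean. A worked Python example with made-up numbers:

    lines = [
        {"cpu_py": 10.0, "cpu_c": 5.0, "sys": 1.0, "core_util": 0.9, "peak_mb": 100.0},
        {"cpu_py": 2.0, "cpu_c": 1.0, "sys": 0.0, "core_util": 0.3, "peak_mb": 400.0},
    ]
    region = {"cpu_py": 0.0, "cpu_c": 0.0, "sys": 0.0, "core_util": 0.0, "peak_mb": 0.0}
    for ln in lines:
        region["cpu_py"] += ln["cpu_py"]
        region["cpu_c"] += ln["cpu_c"]
        region["sys"] += ln["sys"]
        region["peak_mb"] = max(region["peak_mb"], ln["peak_mb"])
        # weigh utilization by the line's share of CPU time
        region["core_util"] += (ln["cpu_py"] + ln["cpu_c"]) * ln["core_util"]

    total_cpu = region["cpu_py"] + region["cpu_c"] + region["sys"]  # 19.0
    print(region["core_util"] / total_cpu)  # ~0.758, the weighted utilization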
8 changes: 8 additions & 0 deletions scalene/scalene_json.py
@@ -286,6 +286,14 @@ def output_profiles(
if result:
program = Filename("[" + result.group(1) + "]")

# Process the stacks to normalize by total number of CPU samples.
for stk in stats.stacks.keys():
(count, python_time, c_time, cpu_samples) = stats.stacks[stk]
stats.stacks[stk] = (count,
python_time / stats.total_cpu_samples,
c_time / stats.total_cpu_samples,
cpu_samples / stats.total_cpu_samples)

# Convert stacks into a representation suitable for JSON dumping.
stks = []
for stk in stats.stacks.keys():
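The effect of this normalization pass, on illustrative numbers: raw per-stack tuples of (hit count, Python time, C time, CPU samples) become fractions of the program's total CPU samples.

    total_cpu_samples = 200.0
    stacks = {("main", "f", "g"): (10, 120.0, 30.0, 150.0)}
    for stk, (count, python_time, c_time, cpu_samples) in stacks.items():
        stacks[stk] = (count,
                       python_time / total_cpu_samples,   # 0.60
                       c_time / total_cpu_samples,        # 0.15
                       cpu_samples / total_cpu_samples)   # 0.75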
30 changes: 20 additions & 10 deletions scalene/scalene_profiler.py
@@ -7,8 +7,7 @@
See the paper "docs/osdi23-berger.pdf" in this repository for technical
details on Scalene's design.
-by Emery Berger
-https://emeryberger.com
+by Emery Berger, Sam Stern, and Juan Altmayer Pizzorno
usage: scalene test/testme.py
usage help: scalene --help
@@ -804,7 +803,7 @@ def cpu_signal_handler(
def output_profile(program_args: Optional[List[str]] = None) -> bool:
    """Output the profile. Returns true iff there was any info reported in the profile."""
# sourcery skip: inline-immediately-returned-variable
-    # print(Scalene.flamegraph_format(Scalene.__stats.stacks))
+    # print(flamegraph_format(Scalene.__stats.stacks))
if Scalene.__args.json:
json_output = Scalene.__json.output_profiles(
Scalene.__program_being_profiled,
@@ -1008,16 +1007,21 @@ def process_cpu_sample(

main_thread_frame = new_frames[0][0]

-            if Scalene.__args.stacks:
-                add_stack(
-                    main_thread_frame, Scalene.should_trace, Scalene.__stats.stacks
-                )

            average_python_time = python_time / total_frames
            average_c_time = c_time / total_frames
            average_gpu_time = gpu_time / total_frames
            average_cpu_time = (python_time + c_time) / total_frames

+            if Scalene.__args.stacks:
+                add_stack(
+                    main_thread_frame,
+                    Scalene.should_trace,
+                    Scalene.__stats.stacks,
+                    average_python_time,
+                    average_c_time,
+                    average_cpu_time
+                )

# First, handle the main thread.
Scalene.enter_function_meta(main_thread_frame, Scalene.__stats)
fname = Filename(main_thread_frame.f_code.co_filename)
@@ -1044,7 +1048,12 @@ def process_cpu_sample(
for (frame, tident, orig_frame) in new_frames:
if frame == main_thread_frame:
continue
-                add_stack(frame, Scalene.should_trace, Scalene.__stats.stacks)
+                add_stack(frame,
+                          Scalene.should_trace,
+                          Scalene.__stats.stacks,
+                          average_python_time,
+                          average_c_time,
+                          average_cpu_time)

# In a thread.
fname = Filename(frame.f_code.co_filename)
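The add_stack helper itself is not part of this diff. Based on how it is called here and on the (count, python_time, c_time, cpu_samples) tuples unpacked in scalene_json.py, a plausible sketch of its extended form might look like the following; treat the body as an assumption, not the actual implementation:

    from typing import Any, Callable, Dict, Tuple

    def add_stack(frame: Any,
                  should_trace: Callable[[str, str], bool],
                  stacks: Dict[Any, Tuple[int, float, float, float]],
                  python_time: float,
                  c_time: float,
                  cpu_samples: float) -> None:
        """Accumulate hit counts and sampled times for the stack rooted at frame."""
        stack = []
        while frame:
            if should_trace(frame.f_code.co_filename, frame.f_code.co_name):
                # build the stack root-first
                stack.insert(0, (frame.f_code.co_filename,
                                 frame.f_code.co_name,
                                 frame.f_lineno))
            frame = frame.f_back
        key = tuple(stack)
        count, py, c, cpu = stacks.get(key, (0, 0.0, 0.0, 0.0))
        stacks[key] = (count + 1, py + python_time, c + c_time, cpu + cpu_samples)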
@@ -1494,8 +1503,9 @@ def should_trace(filename: Filename, func: str) -> bool:
if not Scalene.__args.profile_all:
for n in sysconfig.get_scheme_names():
for p in sysconfig.get_path_names():
+                the_path = sysconfig.get_path(p, n)
                 libdir = str(
-                    pathlib.Path(sysconfig.get_path(p, n)).resolve()
+                    pathlib.Path(the_path).resolve()
)
if libdir in resolved_filename:
return False
2 changes: 1 addition & 1 deletion scalene/scalene_statistics.py
@@ -40,7 +40,7 @@ def __init__(self) -> None:
self.alloc_samples: int = 0

# full stacks taken during CPU samples, together with number of hits
-        self.stacks: Dict[Tuple[Any], int] = defaultdict(int)
+        self.stacks: Dict[Tuple[Any], Any] = defaultdict(None)
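One subtlety worth noting: defaultdict(None) has no default factory, so it behaves like a plain dict and raises KeyError on missing keys. With tuple values, callers must therefore initialize entries explicitly, e.g.:

    from collections import defaultdict

    stacks = defaultdict(None)  # no factory: missing keys raise KeyError
    count, py, c, cpu = stacks.get(("main",), (0, 0.0, 0.0, 0.0))
    stacks[("main",)] = (count + 1, py + 0.01, c + 0.002, cpu + 1.0)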

# CPU samples for each location in the program
# spent in the interpreter