Improve Statistic Generation (#981)

flowr-analysis · Sep 17, 2024 · 3a4a3c9 · 3a4a3c9 · github-actions · Sep 17, 2024
2 parents 96e3833 + f4a7a85
commit 3a4a3c9
Show file tree

Hide file tree

Showing 18 changed files with 405 additions and 256 deletions.
diff --git a/src/cli/common/scripts-info.ts b/src/cli/common/scripts-info.ts
@@ -16,10 +16,15 @@ import { asOptionName } from '../repl/commands/commands';
 
 
+/**
+ * Fields describing a single command-line tool: how it is presented to the user
+ * (name, description, usage example), which module is executed for it, and which
+ * command-line options it accepts.
+ */
 interface BaseScriptInformation extends MergeableRecord {
+	/** name of the tool to present to the user */
 	toolName:     string
+	/**
+	 * internal module name to fork/execute;
+	 * make sure to resolve the correct path to it with the help of `__dirname`
+	 */
 	target:       string
+	/** description of the tool for the user */
 	description:  string
+	/** example usage */
 	usageExample: string
+	/** command line options that are available */
 	options:      OptionDefinition[]
 }
 

diff --git a/src/cli/repl/execute.ts b/src/cli/repl/execute.ts
@@ -44,7 +44,7 @@ export function stdioCaptureProcessor(stdio: Stdio, onStdOutLine: (msg: string)
  *                      the output of the script, see {@link stdioCaptureProcessor}.
  * @param exitOnError - If set to `true`, the process will exit with the exit code of the script.
  */
-export async function waitOnScript(module: string, args: string[], io?: StdioProcessor, exitOnError = false): Promise<void> {
+export async function waitOnScript(module: string, args: readonly string[], io?: StdioProcessor, exitOnError = false): Promise<void> {
 	log.info(`starting script ${module} with args ${JSON.stringify(args)}`);
 	const child = cp.fork(module, args, {
 		silent: io !== undefined

diff --git a/src/cli/run-script.ts b/src/cli/run-script.ts
@@ -0,0 +1,13 @@
+import type { StdioProcessor } from './repl/execute';
+import { waitOnScript } from './repl/execute';
+import { scripts } from './common/scripts-info';
+import path from 'path';
+
+/**
+ * Path-safe helper of {@link waitOnScript} for other flowR scripts:
+ * the script is selected by its key in {@link scripts}, and its `target` is resolved
+ * relative to the directory of this module before it is handed to {@link waitOnScript}.
+ *
+ * @param name        - key of the script within {@link scripts}
+ * @param args        - arguments forwarded unchanged to the script
+ * @param io          - optional stdio processor, forwarded unchanged
+ * @param exitOnError - forwarded unchanged (defaults to `false`)
+ *
+ * @see waitOnScript
+ */
+export async function runScript(name: keyof typeof scripts, args: readonly string[], io?: StdioProcessor, exitOnError = false): Promise<void> {
+	const { target } = scripts[name];
+	const modulePath = path.resolve(__dirname, target);
+	return waitOnScript(modulePath, args, io, exitOnError);
+}
diff --git a/src/cli/script-core/statistics-core.ts b/src/cli/script-core/statistics-core.ts
@@ -0,0 +1,110 @@
+import path from 'path';
+import type { Arguments } from '../../util/parallel';
+import { LimitedThreadPool } from '../../util/parallel';
+import { allRFilesFrom } from '../../util/files';
+import { retrieveArchiveName, validateFeatures } from '../common/features';
+import fs from 'fs';
+import { initFileProvider } from '../../statistics/output/statistics-file';
+import { jsonReplacer } from '../../util/json';
+import { log } from '../../util/log';
+import type { StatsCliOptions } from '../statistics-app';
+import { getStatsForSingleFile } from './statistics-helper-core';
+import commandLineArgs from 'command-line-args';
+import { scripts } from '../common/scripts-info';
+import type { StatsHelperCliOptions } from '../statistics-helper-app';
+import { setFormatter, voidFormatter } from '../../util/ansi';
+
+const testRegex = /[^/]*\/test/i;
+const exampleRegex = /[^/]*\/example/i;
+
+/**
+ * Determines the prefix used for the output folder of a file.
+ *
+ * @param file - path of the file to classify
+ * @returns `'test-'` if the path matches {@link testRegex}, otherwise `'example-'` if it
+ *          matches {@link exampleRegex}, otherwise the empty string.
+ *          Both checks are case-insensitive and require a `/` directly in front of the keyword.
+ */
+function getPrefixForFile(file: string) {
+	// the test check takes precedence over the example check
+	if(testRegex.test(file)) {
+		return 'test-';
+	}
+	if(exampleRegex.test(file)) {
+		return 'example-';
+	}
+	return '';
+}
+
+/**
+ * Builds the suffix of the output folder name for a file: `--` followed by the path of `file`
+ * relative to `base`, with every path separator replaced by the fullwidth solidus `／`
+ * so that the result can be used as a single path segment.
+ *
+ * @param base - directory the path is made relative to (an empty string makes `path.relative`
+ *               resolve against the current working directory)
+ * @param file - path of the file
+ */
+function getSuffixForFile(base: string, file: string) {
+	const subpath = path.relative(base, file);
+	// `path.relative` produces the platform separator (`\` on Windows), so split on `path.sep`
+	// instead of a hard-coded `/` to flatten the path on every platform
+	return '--' + subpath.split(path.sep).join('／');
+}
+
+/**
+ * Iterates over all files produced by `allRFilesFrom(options.input)` and builds, for each file
+ * without an existing archive, the argument list for the per-file statistics helper.
+ * Progress is printed every `reportEvery` collected files; once the count reaches ten times
+ * that interval, the interval is multiplied by five.
+ *
+ * @param options    - cli options; `input` and `output-dir` are read
+ * @param verboseAdd - extra arguments appended to every argument list
+ * @param dumpJson   - extra arguments appended to every argument list (after the features)
+ * @param features   - extra arguments appended to every argument list (after `verboseAdd`)
+ * @returns one argument list per file that is not skipped
+ */
+async function collectFileArguments(options: StatsCliOptions, verboseAdd: readonly string[], dumpJson: readonly string[], features: readonly string[]) {
+	const collected: Arguments[] = [];
+	let accepted = 0;
+	let alreadyArchived = 0;
+	let reportEvery = 5000;
+
+	for await (const f of allRFilesFrom(options.input)) {
+		const file = f.content;
+		const hasSingleInput = options.input.length === 1;
+		const folderName = getPrefixForFile(file) + getSuffixForFile(hasSingleInput ? options.input[0] : '', file);
+		const outputDir = path.join(options['output-dir'], folderName);
+
+		const archive = retrieveArchiveName(outputDir);
+		if(fs.existsSync(archive)) {
+			console.log(`Archive ${archive} exists. Skip.`);
+			alreadyArchived += 1;
+			continue;
+		}
+
+		// NOTE(review): the root-dir fallback is the two-character string `""`, while the suffix base
+		// above falls back to the empty string; confirm that this asymmetry is intended
+		const rootDir = hasSingleInput ? options.input[0] : '""';
+		collected.push(['--input', file, '--output-dir', outputDir,'--compress', '--root-dir', rootDir, ...verboseAdd, ...features, ...dumpJson]);
+
+		accepted += 1;
+		if(accepted % reportEvery !== 0) {
+			continue;
+		}
+		console.log(`Collected ${accepted} files`);
+		if(accepted >= 10 * reportEvery) {
+			reportEvery *= 5;
+		}
+	}
+
+	console.log(`Total: ${accepted} files (${alreadyArchived} skipped with archive existing)`);
+	return collected;
+}
+
+/**
+ * Entry point of the statistics script: collects the per-file argument lists for all input files
+ * and runs the statistics helper for them, either in a {@link LimitedThreadPool}
+ * (if `options.parallel > 0`) or sequentially within this process.
+ *
+ * If `options.limit` is set, the collected files are shuffled uniformly at random so that the limit
+ * selects a random subset.
+ * Exits the process with code `0` if no input is given.
+ *
+ * @param options - parsed command-line options of the statistics script
+ */
+export async function flowrScriptGetStats(options: StatsCliOptions) {
+	if(options.input.length === 0) {
+		console.error('No input files given. Nothing to do. See \'--help\' if this is an error.');
+		process.exit(0);
+	}
+
+	if(options['no-ansi']) {
+		log.info('disabling ansi colors');
+		setFormatter(voidFormatter);
+	}
+
+	const processedFeatures = validateFeatures(options.features);
+	initFileProvider(options['output-dir']);
+	console.log(`Processing features: ${JSON.stringify(processedFeatures, jsonReplacer)}`);
+	console.log(`Using ${options.parallel} parallel executors`);
+
+	const verboseAdd = options.verbose ? ['--verbose'] : [];
+	const features = [...processedFeatures].flatMap(s => ['--features', s]);
+	const dumpJson = options['dump-json'] ? ['--dump-json'] : [];
+
+	// we do not use the limit argument to be able to pick the limit randomly
+	const args = await collectFileArguments(options, verboseAdd, dumpJson, features);
+
+	if(options.limit) {
+		console.log('Shuffle...');
+		log.info(`limiting to ${options.limit} files`);
+		// Fisher-Yates shuffle: sorting with a random comparator is biased and violates the
+		// consistency requirement of `Array.prototype.sort`
+		for(let i = args.length - 1; i > 0; i--) {
+			const j = Math.floor(Math.random() * (i + 1));
+			[args[i], args[j]] = [args[j], args[i]];
+		}
+	}
+	console.log('Prepare Pool...');
+
+	const limit = options.limit ?? args.length;
+
+	if(options.parallel > 0) {
+		const pool = new LimitedThreadPool(
+			// the helper app lives one directory above this module (see the import of its option type)
+			path.resolve(__dirname, '..', 'statistics-helper-app'),
+			args,
+			limit,
+			options.parallel
+		);
+		console.log('Run Pool...');
+		await pool.run();
+		const stats = pool.getStats();
+		console.log(`Processed ${stats.counter} files, skipped ${stats.skipped.length} files due to errors`);
+	} else {
+		console.log('Run Sequentially as parallel <= 0...');
+		// honor a positive limit in the sequential case as well; without one, process everything
+		const selected = options.limit && options.limit > 0 ? args.slice(0, options.limit) : args;
+		for(const arg of selected) {
+			await getStatsForSingleFile(commandLineArgs(scripts['stats-helper'].options, { argv: arg }) as StatsHelperCliOptions);
+		}
+	}
+}
diff --git a/src/cli/script-core/statistics-helper-core.ts b/src/cli/script-core/statistics-helper-core.ts
@@ -0,0 +1,90 @@
+import { retrieveArchiveName } from '../common/features';
+import fs from 'fs';
+import type { FeatureKey } from '../../statistics/features/feature';
+import { RShell } from '../../r-bridge/shell';
+import { initFileProvider, statisticsFileProvider } from '../../statistics/output/statistics-file';
+import { extractUsageStatistics, staticRequests } from '../../statistics/statistics';
+import { extractCFG } from '../../util/cfg/cfg';
+import { printStepResult, StepOutputFormat } from '../../core/print/print';
+import { PARSE_WITH_R_SHELL_STEP } from '../../core/steps/all/core/00-parse';
+import { NORMALIZE } from '../../core/steps/all/core/10-normalize';
+import { STATIC_DATAFLOW } from '../../core/steps/all/core/20-dataflow';
+import { jsonReplacer } from '../../util/json';
+import { log } from '../../util/log';
+import { guard } from '../../util/assert';
+import { date2string } from '../../util/time';
+import type { StatsHelperCliOptions } from '../statistics-helper-app';
+import { create } from 'tar';
+import { setFormatter, voidFormatter } from '../../util/ansi';
+
+
+/**
+ * Packs `folder` into the gzip-compressed tar archive `target` and removes the folder afterwards.
+ * The folder is only removed if the archive was created successfully.
+ * A failed compression is reported on the console together with its cause, but is not re-thrown,
+ * so the returned promise still resolves in that case.
+ *
+ * @param folder - folder to compress (and to delete on success)
+ * @param target - path of the archive file to create
+ */
+async function compressFolder(folder: string, target: string): Promise<void> {
+	try {
+		// eslint-disable-next-line @typescript-eslint/no-unsafe-call,@typescript-eslint/no-unsafe-member-access
+		await create({
+			gzip:          true,
+			file:          target,
+			portable:      true,
+			preservePaths: false
+		}, [folder]);
+	} catch(e: unknown) {
+		// keep the best-effort semantics, but do not hide why the compression failed
+		console.log(`failed to compress ${folder}: ${e instanceof Error ? e.message : String(e)}`);
+		return;
+	}
+	// now, remove the folder
+	fs.rmSync(folder, { recursive: true, force: true });
+}
+
+
+/**
+ * Extracts the usage statistics for a single input file and appends them to the statistics files
+ * below `options['output-dir']`.
+ *
+ * - With `options.compress`, nothing is done if the target archive already exists; otherwise the
+ *   output directory is compressed into that archive at the end.
+ * - With `options['dump-json']`, the parse, normalize, dataflow and cfg results are additionally
+ *   appended as JSON.
+ *
+ * The R shell created for the extraction is closed in every case, also if a step throws.
+ *
+ * @param options - parsed command-line options of the statistics helper
+ */
+export async function getStatsForSingleFile(options: StatsHelperCliOptions) {
+	if(options['no-ansi']) {
+		log.info('disabling ansi colors');
+		setFormatter(voidFormatter);
+	}
+
+	let target: string | undefined = undefined;
+	if(options.compress) {
+		target = retrieveArchiveName(options['output-dir']);
+		if(fs.existsSync(target)) {
+			console.log(`Archive ${target} exists. Skip.`);
+			// return instead of exiting the process: this function is also called in-process for
+			// several files in a row, and exiting here would abort all remaining files
+			return;
+		}
+	}
+
+	// assume correct
+	const processedFeatures = new Set<FeatureKey>(options.features as FeatureKey[]);
+
+	const shell = new RShell();
+	try {
+		initFileProvider(options['output-dir']);
+
+		await shell.obtainTmpDir();
+		const stats = await extractUsageStatistics(shell,
+			() => { /* do nothing */ },
+			processedFeatures,
+			staticRequests({ request: 'file', content: options.input }),
+			options['root-dir']
+		);
+		// console.warn(`skipped ${stats.meta.failedRequests.length} requests due to errors (run with logs to get more info)`)
+
+		if(stats.outputs.size === 1) {
+			if(options['dump-json']) {
+				const [, output] = [...stats.outputs.entries()][0];
+				const cfg = extractCFG(output.normalize);
+				statisticsFileProvider.append('output-json', 'parse', await printStepResult(PARSE_WITH_R_SHELL_STEP, output.parse, StepOutputFormat.Json));
+				statisticsFileProvider.append('output-json', 'normalize', await printStepResult(NORMALIZE, output.normalize, StepOutputFormat.Json));
+				statisticsFileProvider.append('output-json', 'dataflow', await printStepResult(STATIC_DATAFLOW, output.dataflow, StepOutputFormat.Json));
+				statisticsFileProvider.append('output-json', 'cfg', JSON.stringify(cfg, jsonReplacer));
+			}
+
+			statisticsFileProvider.append('meta', 'stats', JSON.stringify({ ...stats.meta, file: options.input }, jsonReplacer));
+			statisticsFileProvider.append('meta', 'features', JSON.stringify(stats.features, jsonReplacer));
+		} else {
+			log.error(`expected exactly one output vs. ${stats.outputs.size}, got: ${JSON.stringify([...stats.outputs.keys()], jsonReplacer, 2)}`);
+		}
+		if(options.compress) {
+			guard(target !== undefined, 'target must be defined given the compress option');
+			console.log(`[${date2string(new Date())}] Compressing ${options['output-dir']} to ${target}`);
+			await compressFolder(options['output-dir'], target);
+		}
+	} finally {
+		// release the shell even if the extraction, the output, or the compression throws
+		shell.close();
+	}
+}
diff --git a/src/cli/script-core/summarizer-core.ts b/src/cli/script-core/summarizer-core.ts
@@ -0,0 +1,56 @@
+import type { SummarizerCliOptions } from '../summarizer-app';
+import { StatisticsSummarizer } from '../../statistics/summarizer/summarizer';
+import { BenchmarkSummarizer } from '../../benchmark/summarizer/summarizer';
+import { detectSummarizationType } from '../../statistics/summarizer/auto-detect';
+import { SummarizerType } from '../../util/summarizer';
+import { allFeatureNames } from '../../statistics/features/feature';
+
+
+/**
+ * Creates the {@link BenchmarkSummarizer} for the given options.
+ * All output paths are derived from `outputBase`; the graph output is only configured
+ * if `options.graph` is set. Log messages go to `console.log`.
+ *
+ * @param options    - parsed command-line options of the summarizer (`graph` and `input` are read)
+ * @param outputBase - common prefix of all produced output paths
+ */
+function getBenchmarkSummarizer(options: SummarizerCliOptions, outputBase: string) {
+	const graphOutputPath = options.graph ? `${outputBase}-graph.json` : undefined;
+	const outputPath = `${outputBase}-ultimate.json`;
+	return new BenchmarkSummarizer({
+		graphOutputPath,
+		inputPath:              options.input,
+		intermediateOutputPath: outputBase,
+		outputPath,
+		logger:                 console.log
+	});
+}
+
+/**
+ * Creates the {@link StatisticsSummarizer} for the given options.
+ * The final and intermediate output paths are derived from `outputBase`,
+ * all features of `allFeatureNames` are used, and log messages go to `console.log`.
+ *
+ * @param options    - parsed command-line options of the summarizer (`input` and `project-skip` are read)
+ * @param outputBase - common prefix of all produced output paths
+ */
+function getStatisticsSummarizer(options: SummarizerCliOptions, outputBase: string) {
+	const outputPath = `${outputBase}-final`;
+	const intermediateOutputPath = `${outputBase}-intermediate/`;
+	return new StatisticsSummarizer({
+		inputPath:     options.input,
+		outputPath,
+		intermediateOutputPath,
+		projectSkip:   options['project-skip'],
+		featuresToUse: allFeatureNames,
+		logger:        console.log
+	});
+}
+
+
+/**
+ * Selects and creates the summarizer matching the requested type.
+ * With the type `'auto'`, the type is detected from `options.input` first.
+ * For any type other than benchmark or statistics an error is printed and the process exits with code `1`.
+ *
+ * @param options    - parsed command-line options of the summarizer
+ * @param outputBase - common prefix of all produced output paths
+ */
+async function retrieveSummarizer(options: SummarizerCliOptions, outputBase: string): Promise<StatisticsSummarizer | BenchmarkSummarizer> {
+	const type = options.type === 'auto' ? await detectSummarizationType(options.input) : options.type;
+
+	if(type === SummarizerType.Benchmark) {
+		console.log('Summarizing benchmark');
+		return getBenchmarkSummarizer(options, outputBase);
+	}
+	if(type === SummarizerType.Statistics) {
+		console.log('Summarizing statistics');
+		return getStatisticsSummarizer(options, outputBase);
+	}
+
+	console.error('Unknown type', type, 'either give "benchmark" or "statistics"');
+	process.exit(1);
+}
+
+/**
+ * Entry point of the summarizer script: derives the base path of all outputs, creates the matching
+ * summarizer, and runs its phases. The preparation phase is skipped with `options['ultimate-only']`.
+ *
+ * The output base is `options.output` (or `options.input` if no output is given) with a trailing
+ * `.json` or `/` replaced by `-summary`.
+ *
+ * @param options - parsed command-line options of the summarizer
+ */
+export async function flowrScriptSummarizer(options: SummarizerCliOptions) {
+	let outputBase = (options.output ?? options.input).replace(/\.json$|\/$/, '-summary');
+	if(outputBase === options.input) {
+		// nothing was replaced (e.g., a folder given without a trailing slash), so the base would be the
+		// input path itself, which the benchmark summarizer receives as its intermediate output path;
+		// append the suffix to keep inputs and outputs apart
+		outputBase = `${options.input}-summary`;
+	}
+	console.log(`Writing outputs to base ${outputBase}`);
+
+	const summarizer = await retrieveSummarizer(options, outputBase);
+
+	if(!options['ultimate-only']) {
+		await summarizer.preparationPhase(options.categorize);
+	}
+
+	await summarizer.summarizePhase();
+}
Benchmark suite	Current: `3a4a3c9`	Previous: `9d8b361`	Ratio
`Retrieve AST from R code`	`233.51630863636362` ms (`98.42728308061673`)	`238.12351204545453` ms (`103.59720843756357`)	`0.98`
`Normalize R AST`	`19.09238813636364` ms (`32.88451942624459`)	`19.968034227272728` ms (`34.84298543847825`)	`0.96`
`Produce dataflow information`	`37.29713595454545` ms (`79.6841893426286`)	`38.310942090909094` ms (`82.04448044777155`)	`0.97`
`Total per-file`	`807.1687020454546` ms (`1432.9006933100156`)	`811.1703915909092` ms (`1431.4404310276739`)	`1.00`
`Static slicing`	`2.34738781930876` ms (`1.6261119892174756`)	`2.258090287874194` ms (`1.2792808105316449`)	`1.04`
`Reconstruct code`	`0.22065341321404836` ms (`0.16889238590670527`)	`0.22489327849282828` ms (`0.17585774592637268`)	`0.98`
`Total per-slice`	`2.5841764538418106` ms (`1.6702238972102916`)	`2.4996261233332735` ms (`1.3278746913052974`)	`1.03`
`failed to reconstruct/re-parse`	`0` #	`0` #	`1`
`times hit threshold`	`0` #	`0` #	`1`
`reduction (characters)`	`0.7869360165281424` #	`0.7869360165281424` #	`1`
`reduction (normalized tokens)`	`0.7639690077689504` #	`0.7639690077689504` #	`1`
`memory (df-graph)`	`147.42458274147728` KiB (`358.6827375397903`)	`147.42458274147728` KiB (`358.6827375397903`)	`1`
Benchmark suite	Current: `3a4a3c9`	Previous: `9d8b361`	Ratio
`Retrieve AST from R code`	`233.44970384` ms (`41.73554223159988`)	`238.40722376` ms (`42.95412443307438`)	`0.98`
`Normalize R AST`	`21.90124094` ms (`16.569524093502846`)	`22.0872248` ms (`17.016890594916376`)	`0.99`
`Produce dataflow information`	`72.99715495999999` ms (`85.8570988657697`)	`74.60461736` ms (`88.95210983454488`)	`0.98`
`Total per-file`	`11064.453014379998` ms (`52688.75402251567`)	`11091.201449639999` ms (`52310.41942604725`)	`1.00`
`Static slicing`	`21.98562903373952` ms (`78.82751436203577`)	`22.047137876062838` ms (`78.30877993604865`)	`1.00`
`Reconstruct code`	`0.213731997402135` ms (`0.14176452099017856`)	`0.2327517832436913` ms (`0.14954480815603388`)	`0.92`
`Total per-slice`	`22.20681718216753` ms (`78.84859290915547`)	`22.287796325154986` ms (`78.33211951742135`)	`1.00`
`failed to reconstruct/re-parse`	`0` #	`0` #	`1`
`times hit threshold`	`0` #	`0` #	`1`
`reduction (characters)`	`0.8719618340615195` #	`0.8719618340615195` #	`1`
`reduction (normalized tokens)`	`0.810633662275233` #	`0.810633662275233` #	`1`
`memory (df-graph)`	`145.6434765625` KiB (`153.49028997815503`)	`145.6434765625` KiB (`153.49028997815503`)	`1`