Skip to content

Commit

Permalink
Improve Statistic Generation (#981)
Browse files Browse the repository at this point in the history
  • Loading branch information
EagleoutIce authored Sep 17, 2024
2 parents 96e3833 + f4a7a85 commit 3a4a3c9
Show file tree
Hide file tree
Showing 18 changed files with 405 additions and 256 deletions.
5 changes: 5 additions & 0 deletions src/cli/common/scripts-info.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,15 @@ import { asOptionName } from '../repl/commands/commands';


/**
 * Base description of a script: the name and description shown to the user,
 * the module that is launched for it, an example invocation, and the
 * command line options it accepts.
 */
interface BaseScriptInformation extends MergeableRecord {
/** name of the tool to present to the user */
toolName: string
/** internal module name to fork/execute, make sure to use the correct path to it with the help of `__dirname` */
target: string
/** description of the tool for the user */
description: string
/** example usage */
usageExample: string
/** command line options that are available */
options: OptionDefinition[]
}

Expand Down
2 changes: 1 addition & 1 deletion src/cli/repl/execute.ts
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ export function stdioCaptureProcessor(stdio: Stdio, onStdOutLine: (msg: string)
* the output of the script, see {@link stdioCaptureProcessor}.
* @param exitOnError - If set to `true`, the process will exit with the exit code of the script.
*/
export async function waitOnScript(module: string, args: string[], io?: StdioProcessor, exitOnError = false): Promise<void> {
export async function waitOnScript(module: string, args: readonly string[], io?: StdioProcessor, exitOnError = false): Promise<void> {
log.info(`starting script ${module} with args ${JSON.stringify(args)}`);
const child = cp.fork(module, args, {
silent: io !== undefined
Expand Down
13 changes: 13 additions & 0 deletions src/cli/run-script.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import type { StdioProcessor } from './repl/execute';
import { waitOnScript } from './repl/execute';
import { scripts } from './common/scripts-info';
import path from 'path';

/**
 * Launches another flowR script that is identified by its key in {@link scripts}.
 * The `target` registered for that script is resolved relative to the directory of this file,
 * all remaining arguments are handed to {@link waitOnScript} as they are.
 *
 * @param name        - key of the script within {@link scripts}
 * @param args        - arguments handed to the script
 * @param io          - optional stdio processor, passed on unchanged
 * @param exitOnError - passed on unchanged (defaults to `false`)
 *
 * @see waitOnScript
 */
export async function runScript(name: keyof typeof scripts, args: readonly string[], io?: StdioProcessor, exitOnError = false): Promise<void> {
	const { target } = scripts[name];
	const modulePath = path.resolve(__dirname, target);
	return waitOnScript(modulePath, args, io, exitOnError);
}
110 changes: 110 additions & 0 deletions src/cli/script-core/statistics-core.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
import path from 'path';
import type { Arguments } from '../../util/parallel';
import { LimitedThreadPool } from '../../util/parallel';
import { allRFilesFrom } from '../../util/files';
import { retrieveArchiveName, validateFeatures } from '../common/features';
import fs from 'fs';
import { initFileProvider } from '../../statistics/output/statistics-file';
import { jsonReplacer } from '../../util/json';
import { log } from '../../util/log';
import type { StatsCliOptions } from '../statistics-app';
import { getStatsForSingleFile } from './statistics-helper-core';
import commandLineArgs from 'command-line-args';
import { scripts } from '../common/scripts-info';
import type { StatsHelperCliOptions } from '../statistics-helper-app';
import { setFormatter, voidFormatter } from '../../util/ansi';

/** Matches if the path contains a `/` that is directly followed by `test` (case-insensitive). */
const testRegex = /[^/]*\/test/i;
/** Matches if the path contains a `/` that is directly followed by `example` (case-insensitive). */
const exampleRegex = /[^/]*\/example/i;

/**
 * Determines the prefix to use for the given file path:
 * `'test-'` if {@link testRegex} matches, otherwise `'example-'` if {@link exampleRegex} matches,
 * and the empty string if neither does. A test match takes precedence over an example match.
 */
function getPrefixForFile(file: string) {
	if(testRegex.test(file)) {
		return 'test-';
	}
	return exampleRegex.test(file) ? 'example-' : '';
}

/**
 * Builds the name suffix for `file`: `'--'` followed by the path of `file` relative to `base`,
 * with every `/` replaced so that the result no longer contains a `/`.
 *
 * @param base - path the file path is made relative to (see `path.relative`)
 * @param file - path of the file to create the suffix for
 * @returns the suffix, starting with `'--'`
 */
function getSuffixForFile(base: string, file: string) {
	const subpath = path.relative(base, file);
	// Replacing '/' with '/' would be a no-op; U+FF0F (fullwidth solidus) keeps the name readable
	// without being a separator. It is written as an escape so that it can not be confused with an ASCII slash.
	return '--' + subpath.replace(/\//g, '\uFF0F');
}

/**
 * Iterates over all files produced by `allRFilesFrom` for `options.input` and assembles
 * one argument list per file.
 * The output directory of a file is placed below `options['output-dir']` and named by
 * {@link getPrefixForFile} and {@link getSuffixForFile}. Files whose archive
 * (see `retrieveArchiveName`) already exists are reported and left out.
 * Progress is printed every 5000 collected files; the step size is multiplied by five
 * whenever the number of collected files reaches ten times the current step size.
 *
 * @param options    - cli options, `input` and `output-dir` are read
 * @param verboseAdd - extra arguments appended after the fixed per-file arguments
 * @param dumpJson   - extra arguments appended last
 * @param features   - extra arguments appended between `verboseAdd` and `dumpJson`
 * @returns the collected argument lists, one per file that was not skipped
 */
async function collectFileArguments(options: StatsCliOptions, verboseAdd: readonly string[], dumpJson: readonly string[], features: readonly string[]) {
	const collected: Arguments[] = [];
	let reportEvery = 5000;
	let skipped = 0;

	for await (const file of allRFilesFrom(options.input)) {
		const single = options.input.length === 1;
		// with several inputs there is no common root: the suffix is relative to '' and the root dir is passed as '""'
		const suffixBase = single ? options.input[0] : '';
		const rootDir = single ? options.input[0] : '""';

		const dirName = getPrefixForFile(file.content) + getSuffixForFile(suffixBase, file.content);
		const outputDir = path.join(options['output-dir'], dirName);
		const archive = retrieveArchiveName(outputDir);
		if(fs.existsSync(archive)) {
			console.log(`Archive ${archive} exists. Skip.`);
			skipped += 1;
			continue;
		}

		collected.push([
			'--input', file.content,
			'--output-dir', outputDir,
			'--compress',
			'--root-dir', rootDir,
			...verboseAdd,
			...features,
			...dumpJson
		]);

		if(collected.length % reportEvery !== 0) {
			continue;
		}
		console.log(`Collected ${collected.length} files`);
		if(collected.length >= 10 * reportEvery) {
			reportEvery *= 5;
		}
	}

	console.log(`Total: ${collected.length} files (${skipped} skipped with archive existing)`);
	return collected;
}

/**
 * Shuffles `items` in place with the Fisher–Yates algorithm.
 * Unlike sorting with a random comparator, this makes every permutation equally likely
 * (with respect to `Math.random`) and does not depend on the sort implementation.
 */
function shuffleInPlace<T>(items: T[]): void {
	for(let i = items.length - 1; i > 0; i--) {
		const j = Math.floor(Math.random() * (i + 1));
		[items[i], items[j]] = [items[j], items[i]];
	}
}

/**
 * Core of the statistics script: collects the argument lists for all input files
 * (see {@link collectFileArguments}) and processes them, either with a {@link LimitedThreadPool}
 * (if `options.parallel > 0`) or one after another within this process.
 *
 * - Exits the process with code `0` if no input is given.
 * - If `options.limit` is set, the collected files are shuffled first, so that the limit picks them randomly.
 *   The limit is respected by both the parallel and the sequential mode.
 *
 * @param options - the parsed cli options of the statistics script
 */
export async function flowrScriptGetStats(options: StatsCliOptions) {
	if(options.input.length === 0) {
		console.error('No input files given. Nothing to do. See \'--help\' if this is an error.');
		process.exit(0);
	}

	if(options['no-ansi']) {
		log.info('disabling ansi colors');
		setFormatter(voidFormatter);
	}

	const processedFeatures = validateFeatures(options.features);
	initFileProvider(options['output-dir']);
	console.log(`Processing features: ${JSON.stringify(processedFeatures, jsonReplacer)}`);
	console.log(`Using ${options.parallel} parallel executors`);

	const verboseAdd = options.verbose ? ['--verbose'] : [];
	const features = [...processedFeatures].flatMap(s => ['--features', s]);
	const dumpJson = options['dump-json'] ? ['--dump-json'] : [];

	// we do not use the limit argument to be able to pick the limit randomly
	const args = await collectFileArguments(options, verboseAdd, dumpJson, features);

	if(options.limit) {
		console.log('Shuffle...');
		log.info(`limiting to ${options.limit} files`);
		// shuffle and limit
		shuffleInPlace(args);
	}
	console.log('Prepare Pool...');

	const limit = options.limit ?? args.length;

	if(options.parallel > 0) {
		const pool = new LimitedThreadPool(
			// the helper app is located one directory above this file (it is imported from '../statistics-helper-app')
			path.resolve(__dirname, '..', 'statistics-helper-app'),
			args,
			limit,
			options.parallel
		);
		console.log('Run Pool...');
		await pool.run();
		const stats = pool.getStats();
		console.log(`Processed ${stats.counter} files, skipped ${stats.skipped.length} files due to errors`);
	} else {
		console.log('Run Sequentially as parallel <= 0...');
		// respect the limit in sequential mode as well, otherwise the shuffle above would be pointless
		for(const arg of args.slice(0, limit)) {
			await getStatsForSingleFile(commandLineArgs(scripts['stats-helper'].options, { argv: arg }) as StatsHelperCliOptions);
		}
	}
}
90 changes: 90 additions & 0 deletions src/cli/script-core/statistics-helper-core.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import { retrieveArchiveName } from '../common/features';
import fs from 'fs';
import type { FeatureKey } from '../../statistics/features/feature';
import { RShell } from '../../r-bridge/shell';
import { initFileProvider, statisticsFileProvider } from '../../statistics/output/statistics-file';
import { extractUsageStatistics, staticRequests } from '../../statistics/statistics';
import { extractCFG } from '../../util/cfg/cfg';
import { printStepResult, StepOutputFormat } from '../../core/print/print';
import { PARSE_WITH_R_SHELL_STEP } from '../../core/steps/all/core/00-parse';
import { NORMALIZE } from '../../core/steps/all/core/10-normalize';
import { STATIC_DATAFLOW } from '../../core/steps/all/core/20-dataflow';
import { jsonReplacer } from '../../util/json';
import { log } from '../../util/log';
import { guard } from '../../util/assert';
import { date2string } from '../../util/time';
import type { StatsHelperCliOptions } from '../statistics-helper-app';
import { create } from 'tar';
import { setFormatter, voidFormatter } from '../../util/ansi';


/**
 * Packs `folder` into a gzip-compressed archive written to `target` (using `create` of `tar`).
 * If the archive was created, `folder` is removed recursively afterwards.
 * If creating the archive fails, a message is printed and `folder` is kept;
 * the returned promise is fulfilled in that case as well.
 *
 * @param folder - the folder to compress (and to remove on success)
 * @param target - the file to write the archive to
 */
function compressFolder(folder: string, target: string) {
	const removeFolder = () => {
		// now, remove the folder
		fs.rmSync(folder, { recursive: true, force: true });
	};
	const reportFailure = () => {
		console.log(`failed to compress ${folder}`);
	};
	const archiveOptions = {
		gzip:          true,
		file:          target,
		portable:      true,
		preservePaths: false
	};
	// eslint-disable-next-line @typescript-eslint/no-unsafe-call,@typescript-eslint/no-unsafe-member-access
	return create(archiveOptions, [folder]).then(removeFolder, reportFailure);
}


/**
 * Extracts the usage statistics for the single file given by `options.input` and writes
 * them with the statistics file provider below `options['output-dir']`.
 *
 * - With `options.compress`, the output directory is compressed into the archive named by
 *   `retrieveArchiveName` afterwards; if that archive already exists, the process exits with code `0`.
 * - With `options['dump-json']`, the parse, normalize, dataflow and cfg results are written as json as well.
 * - The {@link RShell} created for the extraction is closed in any case, also if a step fails.
 *
 * @param options - the parsed cli options of the statistics helper
 */
export async function getStatsForSingleFile(options: StatsHelperCliOptions) {
	if(options['no-ansi']) {
		log.info('disabling ansi colors');
		setFormatter(voidFormatter);
	}

	let target: string | undefined = undefined;
	if(options.compress) {
		target = retrieveArchiveName(options['output-dir']);
		if(fs.existsSync(target)) {
			console.log(`Archive ${target} exists. Skip.`);
			// NOTE(review): this ends the whole process, also if this function is called in-process
			// by a caller that handles several files - confirm that this is intended
			process.exit(0);
		}
	}

	// assume correct
	const processedFeatures = new Set<FeatureKey>(options.features as FeatureKey[]);

	const shell = new RShell();
	// make sure that the shell is closed even if one of the following steps throws
	try {
		initFileProvider(options['output-dir']);

		await shell.obtainTmpDir();
		const stats = await extractUsageStatistics(shell,
			() => { /* do nothing */ },
			processedFeatures,
			staticRequests({ request: 'file', content: options.input }),
			options['root-dir']
		);
		// console.warn(`skipped ${stats.meta.failedRequests.length} requests due to errors (run with logs to get more info)`)

		if(stats.outputs.size === 1) {
			if(options['dump-json']) {
				const [, output] = [...stats.outputs.entries()][0];
				const cfg = extractCFG(output.normalize);
				statisticsFileProvider.append('output-json', 'parse', await printStepResult(PARSE_WITH_R_SHELL_STEP, output.parse, StepOutputFormat.Json));
				statisticsFileProvider.append('output-json', 'normalize', await printStepResult(NORMALIZE, output.normalize, StepOutputFormat.Json));
				statisticsFileProvider.append('output-json', 'dataflow', await printStepResult(STATIC_DATAFLOW, output.dataflow, StepOutputFormat.Json));
				statisticsFileProvider.append('output-json', 'cfg', JSON.stringify(cfg, jsonReplacer));
			}

			statisticsFileProvider.append('meta', 'stats', JSON.stringify({ ...stats.meta, file: options.input }, jsonReplacer));
			statisticsFileProvider.append('meta', 'features', JSON.stringify(stats.features, jsonReplacer));
		} else {
			log.error(`expected exactly one output vs. ${stats.outputs.size}, got: ${JSON.stringify([...stats.outputs.keys()], jsonReplacer, 2)}`);
		}
		if(options.compress) {
			guard(target !== undefined, 'target must be defined given the compress option');
			console.log(`[${date2string(new Date())}] Compressing ${options['output-dir']} to ${target}`);
			await compressFolder(options['output-dir'], target);
		}
	} finally {
		shell.close();
	}
}
56 changes: 56 additions & 0 deletions src/cli/script-core/summarizer-core.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import type { SummarizerCliOptions } from '../summarizer-app';
import { StatisticsSummarizer } from '../../statistics/summarizer/summarizer';
import { BenchmarkSummarizer } from '../../benchmark/summarizer/summarizer';
import { detectSummarizationType } from '../../statistics/summarizer/auto-detect';
import { SummarizerType } from '../../util/summarizer';
import { allFeatureNames } from '../../statistics/features/feature';


/**
 * Creates the {@link BenchmarkSummarizer} for the given options.
 * All output paths are derived from `outputBase`: it is used as is for the intermediate output,
 * with `-ultimate.json` appended for the final output and, only if `options.graph` is set,
 * with `-graph.json` appended for the graph output. Messages are logged with `console.log`.
 */
function getBenchmarkSummarizer(options: SummarizerCliOptions, outputBase: string) {
	const graphOutputPath = options.graph ? `${outputBase}-graph.json` : undefined;
	const outputPath = `${outputBase}-ultimate.json`;
	return new BenchmarkSummarizer({
		graphOutputPath,
		inputPath:              options.input,
		intermediateOutputPath: outputBase,
		outputPath,
		logger:                 console.log
	});
}

/**
 * Creates the {@link StatisticsSummarizer} for the given options.
 * The final output is written to `outputBase` with `-final` appended, the intermediate output
 * to `outputBase` with `-intermediate/` appended. All features (`allFeatureNames`) are used,
 * `options['project-skip']` is passed on, and messages are logged with `console.log`.
 */
function getStatisticsSummarizer(options: SummarizerCliOptions, outputBase: string) {
	const outputPath = `${outputBase}-final`;
	const intermediateOutputPath = `${outputBase}-intermediate/`;
	return new StatisticsSummarizer({
		inputPath:     options.input,
		outputPath,
		intermediateOutputPath,
		projectSkip:   options['project-skip'],
		featuresToUse: allFeatureNames,
		logger:        console.log
	});
}


/**
 * Selects and creates the summarizer that matches `options.type`.
 * With the type `'auto'`, the type is detected from `options.input` by `detectSummarizationType`.
 * For any type that is neither benchmark nor statistics, an error is printed and
 * the process exits with code `1`.
 *
 * @param options    - the parsed cli options of the summarizer
 * @param outputBase - base path from which the summarizers derive their output paths
 */
async function retrieveSummarizer(options: SummarizerCliOptions, outputBase: string): Promise<StatisticsSummarizer | BenchmarkSummarizer> {
	const type = options.type === 'auto' ? await detectSummarizationType(options.input) : options.type;
	switch(type) {
		case SummarizerType.Benchmark:
			console.log('Summarizing benchmark');
			return getBenchmarkSummarizer(options, outputBase);
		case SummarizerType.Statistics:
			console.log('Summarizing statistics');
			return getStatisticsSummarizer(options, outputBase);
		default:
			console.error('Unknown type', type, 'either give "benchmark" or "statistics"');
			process.exit(1);
	}
}

/**
 * Core of the summarizer script.
 * The output base is `options.output` (or `options.input` if no output is given), in which a trailing
 * `.json` or a trailing `/` is replaced by `-summary`. The preparation phase of the summarizer is
 * skipped if `options['ultimate-only']` is set, the summarize phase always runs.
 *
 * @param options - the parsed cli options of the summarizer
 */
export async function flowrScriptSummarizer(options: SummarizerCliOptions) {
	const basis = options.output ?? options.input;
	const outputBase = basis.replace(/\.json$|\/$/, '-summary');
	console.log(`Writing outputs to base ${outputBase}`);

	const summarizer = await retrieveSummarizer(options, outputBase);

	const ultimateOnly = options['ultimate-only'];
	if(!ultimateOnly) {
		await summarizer.preparationPhase(options.categorize);
	}

	await summarizer.summarizePhase();
}
Loading

2 comments on commit 3a4a3c9

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"artificial" Benchmark Suite

Benchmark suite Current: 3a4a3c9 Previous: 9d8b361 Ratio
Retrieve AST from R code 233.51630863636362 ms (98.42728308061673) 238.12351204545453 ms (103.59720843756357) 0.98
Normalize R AST 19.09238813636364 ms (32.88451942624459) 19.968034227272728 ms (34.84298543847825) 0.96
Produce dataflow information 37.29713595454545 ms (79.6841893426286) 38.310942090909094 ms (82.04448044777155) 0.97
Total per-file 807.1687020454546 ms (1432.9006933100156) 811.1703915909092 ms (1431.4404310276739) 1.00
Static slicing 2.34738781930876 ms (1.6261119892174756) 2.258090287874194 ms (1.2792808105316449) 1.04
Reconstruct code 0.22065341321404836 ms (0.16889238590670527) 0.22489327849282828 ms (0.17585774592637268) 0.98
Total per-slice 2.5841764538418106 ms (1.6702238972102916) 2.4996261233332735 ms (1.3278746913052974) 1.03
failed to reconstruct/re-parse 0 # 0 # 1
times hit threshold 0 # 0 # 1
reduction (characters) 0.7869360165281424 # 0.7869360165281424 # 1
reduction (normalized tokens) 0.7639690077689504 # 0.7639690077689504 # 1
memory (df-graph) 147.42458274147728 KiB (358.6827375397903) 147.42458274147728 KiB (358.6827375397903) 1

This comment was automatically generated by workflow using github-action-benchmark.

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"social-science" Benchmark Suite

Benchmark suite Current: 3a4a3c9 Previous: 9d8b361 Ratio
Retrieve AST from R code 233.44970384 ms (41.73554223159988) 238.40722376 ms (42.95412443307438) 0.98
Normalize R AST 21.90124094 ms (16.569524093502846) 22.0872248 ms (17.016890594916376) 0.99
Produce dataflow information 72.99715495999999 ms (85.8570988657697) 74.60461736 ms (88.95210983454488) 0.98
Total per-file 11064.453014379998 ms (52688.75402251567) 11091.201449639999 ms (52310.41942604725) 1.00
Static slicing 21.98562903373952 ms (78.82751436203577) 22.047137876062838 ms (78.30877993604865) 1.00
Reconstruct code 0.213731997402135 ms (0.14176452099017856) 0.2327517832436913 ms (0.14954480815603388) 0.92
Total per-slice 22.20681718216753 ms (78.84859290915547) 22.287796325154986 ms (78.33211951742135) 1.00
failed to reconstruct/re-parse 0 # 0 # 1
times hit threshold 0 # 0 # 1
reduction (characters) 0.8719618340615195 # 0.8719618340615195 # 1
reduction (normalized tokens) 0.810633662275233 # 0.810633662275233 # 1
memory (df-graph) 145.6434765625 KiB (153.49028997815503) 145.6434765625 KiB (153.49028997815503) 1

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.