Skip to content

Commit

Permalink
PipReport back to experimental, add pre-generated PipReport parsing (#…
Browse files Browse the repository at this point in the history
…1201)

* revert experiment graduation, bump threads, and enable fast deps

* put reqs back

* add ability for pip to detect pregenerated reports with a specific naming scheme

* better directory handling

* improve logging
  • Loading branch information
pauld-msft authored Jul 16, 2024
1 parent dd3f531 commit 024e2a5
Show file tree
Hide file tree
Showing 17 changed files with 10,159 additions and 36 deletions.
7 changes: 7 additions & 0 deletions docs/detectors/pip.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ Serialization specifications:
- https://peps.python.org/pep-0508/
- https://peps.python.org/pep-0301/

The detector can also use pre-generated installation reports located in the same directory as the `setup.py` or `requirements.txt` file,
as long as the report file name follows one of these patterns: `component-detection-pip-report.json` or `*.component-detection-pip-report.json`. When such a report is found, it is parsed instead of generating a new report with pip.

### Legacy Detection (PipDetector, SimplePipDetector)

Pip detection is performed by running the following code snippet on every *setup.py*:
Expand Down Expand Up @@ -60,3 +63,7 @@ The environment variable `PipReportOverrideBehavior` is used to override pip rep
- `SourceCodeScan`: Scan `setup.py` and `requirements.txt` files, and record components explicitly from the package files without hitting a remote feed. Does not compile a dependency graph.

The environment variable `PipReportSkipFallbackOnFailure` is used to skip the default fallback behavior if pip report fails. The default fallback behavior scans `setup.py` and `requirements.txt` files and records components explicitly from the package files without hitting a remote feed. It does not compile a dependency graph.

The environment variable `PipReportFileLevelTimeoutSeconds` is used to control the timeout limit, in seconds, for generating the PipReport for an individual file. If it is unset or is not a valid integer, only the overall timeout applies.

The environment variable `PipReportDisableFastDeps` is used to disable the fast deps feature in PipReport. Unless it is set to `true`, the report command is run with pip's `--use-feature=fast-deps` option.
2 changes: 1 addition & 1 deletion src/Microsoft.ComponentDetection.Contracts/ScanRequest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ public class ScanRequest
/// <param name="imagesToScan">Container images to scan.</param>
/// <param name="componentRecorder">Detector component recorder.</param>
/// <param name="maxThreads">Max number of threads to use for detection.</param>
public ScanRequest(DirectoryInfo sourceDirectory, ExcludeDirectoryPredicate directoryExclusionPredicate, ILogger logger, IDictionary<string, string> detectorArgs, IEnumerable<string> imagesToScan, IComponentRecorder componentRecorder, int maxThreads = 3)
public ScanRequest(DirectoryInfo sourceDirectory, ExcludeDirectoryPredicate directoryExclusionPredicate, ILogger logger, IDictionary<string, string> detectorArgs, IEnumerable<string> imagesToScan, IComponentRecorder componentRecorder, int maxThreads = 5)
{
this.SourceDirectory = sourceDirectory;
this.DirectoryExclusionPredicate = directoryExclusionPredicate;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ namespace Microsoft.ComponentDetection.Detectors.Pip;

public class PipCommandService : IPipCommandService
{
private const string PipReportDisableFastDepsEnvVar = "PipReportDisableFastDeps";

private readonly ICommandLineInvocationService commandLineInvocationService;
private readonly IPathUtilityService pathUtilityService;
private readonly IFileUtilityService fileUtilityService;
Expand Down Expand Up @@ -127,6 +129,11 @@ private async Task<bool> CanCommandBeLocatedAsync(string pipPath)
pipReportCommand += $" --index-url {this.environmentService.GetEnvironmentVariable("PIP_INDEX_URL")}";
}

if (!this.environmentService.DoesEnvironmentVariableExist(PipReportDisableFastDepsEnvVar) || !this.environmentService.IsEnvironmentVariableValueTrue(PipReportDisableFastDepsEnvVar))
{
pipReportCommand += $" --use-feature=fast-deps";
}

this.logger.LogDebug("PipReport: Generating pip installation report for {Path} with command: {Command}", formattedPath, pipReportCommand.RemoveSensitiveInformation());
command = await this.commandLineInvocationService.ExecuteCommandAsync(
pipExecutable,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ namespace Microsoft.ComponentDetection.Detectors.Pip;
using Microsoft.ComponentDetection.Contracts.TypedComponent;
using Microsoft.Extensions.Logging;

public class PipComponentDetector : FileComponentDetector, IDefaultOffComponentDetector
public class PipComponentDetector : FileComponentDetector
{
private readonly IPythonCommandService pythonCommandService;
private readonly IPythonResolver pythonResolver;
Expand All @@ -38,7 +38,7 @@ public PipComponentDetector(

public override IEnumerable<ComponentType> SupportedComponentTypes { get; } = new[] { ComponentType.Pip };

public override int Version { get; } = 11;
public override int Version { get; } = 12;

protected override async Task<IObservable<ProcessRequest>> OnPrepareDetectionAsync(IObservable<ProcessRequest> processRequests, IDictionary<string, string> detectorArgs)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,16 @@ namespace Microsoft.ComponentDetection.Detectors.Pip;
using Microsoft.ComponentDetection.Contracts.Internal;
using Microsoft.ComponentDetection.Contracts.TypedComponent;
using Microsoft.Extensions.Logging;
using Newtonsoft.Json;

public class PipReportComponentDetector : FileComponentDetector
public class PipReportComponentDetector : FileComponentDetector, IExperimentalDetector
{
// environment variables
private const string PipReportOverrideBehaviorEnvVar = "PipReportOverrideBehavior";
private const string PipReportSkipFallbackOnFailureEnvVar = "PipReportSkipFallbackOnFailure";
private const string PipReportFileLevelTimeoutSecondsEnvVar = "PipReportFileLevelTimeoutSeconds";

private static readonly IList<string> PipReportPreGeneratedFilePatterns = new List<string> { "*.component-detection-pip-report.json", "component-detection-pip-report.json" };

/// <summary>
/// The maximum version of the report specification that this detector can handle.
Expand All @@ -33,6 +38,7 @@ public class PipReportComponentDetector : FileComponentDetector
private readonly IEnvironmentVariableService envVarService;
private readonly IPythonCommandService pythonCommandService;
private readonly IPythonResolver pythonResolver;
private readonly IFileUtilityService fileUtilityService;

public PipReportComponentDetector(
IComponentStreamEnumerableFactory componentStreamEnumerableFactory,
Expand All @@ -41,6 +47,7 @@ public PipReportComponentDetector(
IEnvironmentVariableService envVarService,
IPythonCommandService pythonCommandService,
IPythonResolver pythonResolver,
IFileUtilityService fileUtilityService,
ILogger<PipReportComponentDetector> logger)
{
this.ComponentStreamEnumerableFactory = componentStreamEnumerableFactory;
Expand All @@ -49,6 +56,7 @@ public PipReportComponentDetector(
this.envVarService = envVarService;
this.pythonCommandService = pythonCommandService;
this.pythonResolver = pythonResolver;
this.fileUtilityService = fileUtilityService;
this.Logger = logger;
}

Expand All @@ -67,7 +75,7 @@ private enum PipReportOverrideBehavior

public override IEnumerable<ComponentType> SupportedComponentTypes { get; } = new[] { ComponentType.Pip };

public override int Version { get; } = 4;
public override int Version { get; } = 5;

protected override bool EnableParallelism { get; set; } = true;

Expand Down Expand Up @@ -117,7 +125,7 @@ protected override async Task OnFileFoundAsync(ProcessRequest processRequest, ID
var singleFileComponentRecorder = processRequest.SingleFileComponentRecorder;
var file = processRequest.ComponentStream;

FileInfo reportFile = null;
List<FileInfo> reportFiles = new();
try
{
var pipOverride = this.GetPipReportOverrideBehavior();
Expand Down Expand Up @@ -150,44 +158,103 @@ protected override async Task OnFileFoundAsync(ProcessRequest processRequest, ID
}

var stopwatch = Stopwatch.StartNew();
this.Logger.LogInformation("PipReport: Generating pip installation report for {File}", file.Location);

// Call pip executable to generate the installation report of a given project file.
(var report, reportFile) = await this.pipCommandService.GenerateInstallationReportAsync(file.Location, pipExePath, cancellationToken);
// Search for a pre-generated pip report file in the same directory as the file being scanned.
var fileParentDirectory = Path.GetDirectoryName(file.Location);
if (fileParentDirectory is null)
{
this.Logger.LogWarning("PipReport: Unable to determine parent directory for {File}.", file.Location);
return;
}

var fileParentDirectoryInfo = Directory.Exists(fileParentDirectory)
? new DirectoryInfo(fileParentDirectory)
: null;

// The report version is used to determine how to parse the report. If it is greater
// than the maximum supported version, there may be new fields and the parsing will fail.
if (!int.TryParse(report.Version, out var reportVersion) || reportVersion > MaxReportVersion.Major)
List<FileInfo> preGeneratedReportFiles = null;
if (fileParentDirectoryInfo is not null)
{
this.Logger.LogWarning(
"PipReport: The pip installation report version {ReportVersion} is not supported. The maximum supported version is {MaxVersion}.",
report.Version,
MaxReportVersion);
preGeneratedReportFiles = PipReportPreGeneratedFilePatterns
.SelectMany(pattern => fileParentDirectoryInfo.GetFiles(pattern))
.Where(file => File.Exists(file.FullName))
.ToList();
}

List<PipInstallationReport> reports = new();
if (preGeneratedReportFiles is not null && preGeneratedReportFiles.Any())
{
this.Logger.LogInformation("PipReport: Found pre-generated pip report(s) for {File}.", file.Location);

using var versionRecord = new InvalidParseVersionTelemetryRecord
foreach (var existingReport in preGeneratedReportFiles)
{
DetectorId = this.Id,
FilePath = file.Location,
Version = report.Version,
MaxVersion = MaxReportVersion.ToString(),
};
this.Logger.LogInformation("PipReport: Using pre-generated pip report '{ReportFile}' for package file '{File}'.", existingReport.FullName, file.Location);
var reportOutput = await this.fileUtilityService.ReadAllTextAsync(existingReport);
var report = JsonConvert.DeserializeObject<PipInstallationReport>(reportOutput);
reports.Add(report);
}
}
else
{
this.Logger.LogInformation("PipReport: Generating pip installation report for {File}", file.Location);

return;
// create linked cancellation token that will cancel if the file level timeout is reached, or if the parent token is cancelled.
// default to only using parent token if the env var is not set or is invalid
var childCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
if (this.envVarService.DoesEnvironmentVariableExist(PipReportFileLevelTimeoutSecondsEnvVar)
&& int.TryParse(this.envVarService.GetEnvironmentVariable(PipReportFileLevelTimeoutSecondsEnvVar), out var timeoutSeconds))
{
childCts.CancelAfter(TimeSpan.FromSeconds(timeoutSeconds));
}

// Call pip executable to generate the installation report of a given project file.
(var report, var reportFile) = await this.pipCommandService.GenerateInstallationReportAsync(file.Location, pipExePath, childCts.Token);
reports.Add(report);
reportFiles.Add(reportFile);
}

stopwatch.Stop();
this.Logger.LogInformation(
"PipReport: Generating pip installation report for {File} completed in {TotalSeconds} seconds with {PkgCount} detected packages.",
file.Location,
stopwatch.ElapsedMilliseconds / 1000.0,
report.InstallItems?.Length ?? 0);
if (!reports.Any())
{
this.Logger.LogWarning("PipReport: Failed to generate or find pip installation report for {File}.", file.Location);
return;
}

// Now that all installed packages are known, we can build a graph of the dependencies.
if (report.InstallItems is not null)
foreach (var report in reports)
{
var graph = this.BuildGraphFromInstallationReport(report);
this.RecordComponents(singleFileComponentRecorder, graph);
// The report version is used to determine how to parse the report. If it is greater
// than the maximum supported version, there may be new fields and the parsing will fail.
if (!int.TryParse(report.Version, out var reportVersion) || reportVersion > MaxReportVersion.Major)
{
this.Logger.LogWarning(
"PipReport: The pip installation report version {ReportVersion} is not supported. The maximum supported version is {MaxVersion}.",
report.Version,
MaxReportVersion);

using var versionRecord = new InvalidParseVersionTelemetryRecord
{
DetectorId = this.Id,
FilePath = file.Location,
Version = report.Version,
MaxVersion = MaxReportVersion.ToString(),
};

return;
}

this.Logger.LogInformation(
"PipReport: Pip installation report for {File} completed in {TotalSeconds} seconds with {PkgCount} detected packages.",
file.Location,
stopwatch.ElapsedMilliseconds / 1000.0,
report.InstallItems?.Length ?? 0);

// Now that all installed packages are known, we can build a graph of the dependencies.
if (report.InstallItems is not null)
{
var graph = this.BuildGraphFromInstallationReport(report);
this.RecordComponents(singleFileComponentRecorder, graph);
}
}

stopwatch.Stop();
}
catch (Exception e)
{
Expand All @@ -211,9 +278,12 @@ protected override async Task OnFileFoundAsync(ProcessRequest processRequest, ID
finally
{
// Clean up the report output JSON file so it isn't left on the machine.
if (reportFile is not null && reportFile.Exists)
foreach (var reportFile in reportFiles)
{
reportFile.Delete();
if (reportFile is not null && reportFile.Exists)
{
reportFile.Delete();
}
}
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
namespace Microsoft.ComponentDetection.Orchestrator.Experiments.Configs;

using Microsoft.ComponentDetection.Contracts;
using Microsoft.ComponentDetection.Detectors.Pip;

/// <summary>
/// Experiment configuration that validates the <see cref="PipReportComponentDetector"/>
/// (experiment group) against the <see cref="PipComponentDetector"/> (control group).
/// </summary>
public class PipReportExperiment : IExperimentConfiguration
{
    /// <summary>
    /// Gets the name of this experiment.
    /// </summary>
    public string Name
    {
        get { return "PipReport"; }
    }

    /// <summary>
    /// Determines whether the given detector belongs to the control group.
    /// </summary>
    /// <param name="componentDetector">The detector to classify.</param>
    /// <returns><c>true</c> when the detector is a <see cref="PipComponentDetector"/>; otherwise <c>false</c>.</returns>
    public bool IsInControlGroup(IComponentDetector componentDetector)
    {
        return componentDetector is PipComponentDetector;
    }

    /// <summary>
    /// Determines whether the given detector belongs to the experiment group.
    /// </summary>
    /// <param name="componentDetector">The detector to classify.</param>
    /// <returns><c>true</c> when the detector is a <see cref="PipReportComponentDetector"/>; otherwise <c>false</c>.</returns>
    public bool IsInExperimentGroup(IComponentDetector componentDetector)
    {
        return componentDetector is PipReportComponentDetector;
    }

    /// <summary>
    /// Indicates whether results should be recorded for the given detector.
    /// </summary>
    /// <param name="componentDetector">The detector that produced the results (not inspected).</param>
    /// <param name="numComponents">The number of components found (not inspected).</param>
    /// <returns>Always <c>true</c>.</returns>
    public bool ShouldRecord(IComponentDetector componentDetector, int numComponents)
    {
        return true;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ public static IServiceCollection AddComponentDetection(this IServiceCollection s
services.AddSingleton<IExperimentConfiguration, RustCliDetectorExperiment>();
services.AddSingleton<IExperimentConfiguration, VcpkgExperiment>();
services.AddSingleton<IExperimentConfiguration, GoDetectorReplaceExperiment>();
services.AddSingleton<IExperimentConfiguration, PipReportExperiment>();

// Detectors
// CocoaPods
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ namespace Microsoft.ComponentDetection.Orchestrator.Services;

public class DetectorProcessingService : IDetectorProcessingService
{
private const int DefaultMaxDetectionThreads = 3;
private const int DefaultMaxDetectionThreads = 5;
private const int ExperimentalTimeoutSeconds = 240; // 4 minutes
private const int ProcessTimeoutBufferSeconds = 5;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@
<None Update="Mocks\pip_report_single_pkg_bad_version.json">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</None>
<None Update="Mocks\test.component-detection-pip-report.json">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</None>
</ItemGroup>

</Project>
Loading

0 comments on commit 024e2a5

Please sign in to comment.