Add diff ingester, increase test coverage (#5168)
* Add diff ingester, increase test coverage

* Ignore dependency test data from Trivy scans

* Run make gen with latest tools
evankanderson authored Dec 10, 2024
1 parent ba80578 commit 4e29d37
Showing 10 changed files with 3,147 additions and 2,248 deletions.
1 change: 1 addition & 0 deletions .trivy.yml
@@ -14,3 +14,4 @@ scan:
skip-dirs:
- docs/
- ".cache/trivy/"
- internal/engine/ingester/deps/testdata/
12 changes: 12 additions & 0 deletions docs/docs/ref/proto.md

Some generated files are not rendered by default.

197 changes: 160 additions & 37 deletions internal/engine/ingester/deps/deps.go
@@ -5,12 +5,14 @@
package deps

import (
"cmp"
"context"
"errors"
"fmt"
"slices"

"github.com/go-git/go-billy/v5"
"github.com/go-git/go-billy/v5/helper/iofs"
"github.com/go-git/go-git/v5/plumbing"
"github.com/go-viper/mapstructure/v2"
"github.com/protobom/protobom/pkg/sbom"
"github.com/rs/zerolog"
@@ -19,6 +21,7 @@ import (
mdeps "github.com/mindersec/minder/internal/deps"
"github.com/mindersec/minder/internal/deps/scalibr"
engerrors "github.com/mindersec/minder/internal/engine/errors"
pbinternal "github.com/mindersec/minder/internal/proto"
pb "github.com/mindersec/minder/pkg/api/protobuf/go/minder/v1"
"github.com/mindersec/minder/pkg/engine/v1/interfaces"
"github.com/mindersec/minder/pkg/entities/v1/checkpoints"
@@ -38,11 +41,16 @@ type Deps struct {
extractor mdeps.Extractor
}

// Config is the set of parameters to the deps rule data ingest engine
type Config struct {
// RepoConfig is the set of parameters to the deps rule data ingest engine for repositories
type RepoConfig struct {
Branch string `json:"branch" yaml:"branch" mapstructure:"branch"`
}

// PullRequestConfig is the set of parameters to the deps rule data ingest engine for pull requests
type PullRequestConfig struct {
Filter string `json:"filter" yaml:"filter" mapstructure:"filter"`
}
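
As a hedged illustration (not part of this commit), the params map handed to the ingester could be decoded into these config structs roughly as follows; the branch and filter values are invented:

// Sketch only: assumes it lives in package deps alongside the structs above.
func exampleDecodeConfigs() error {
	// Hypothetical params as a rule definition might supply them.
	repoParams := map[string]any{"branch": "main"}
	prParams := map[string]any{"filter": "new"}

	repoCfg := &RepoConfig{}
	if err := mapstructure.Decode(repoParams, repoCfg); err != nil {
		return err
	}
	prCfg := &PullRequestConfig{}
	if err := mapstructure.Decode(prParams, prCfg); err != nil {
		return err
	}
	// repoCfg.Branch == "main", prCfg.Filter == "new"
	return nil
}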

// NewDepsIngester creates a new deps rule data ingest engine
func NewDepsIngester(cfg *pb.DepsType, gitprov provifv1.Git) (*Deps, error) {
if gitprov == nil {
@@ -76,6 +84,8 @@ func (gi *Deps) Ingest(ctx context.Context, ent protoreflect.ProtoMessage, param
switch entity := ent.(type) {
case *pb.Repository:
return gi.ingestRepository(ctx, entity, params)
case *pbinternal.PullRequest:
return gi.ingestPullRequest(ctx, entity, params)
default:
return nil, fmt.Errorf("deps is only supported for repositories")
}
@@ -85,7 +95,7 @@ func (gi *Deps) ingestRepository(ctx context.Context, repo *pb.Repository, param
var logger = zerolog.Ctx(ctx)
// the branch is left unset since we want to auto-discover it
// in case it's not explicitly set
userCfg := &Config{}
userCfg := &RepoConfig{}
if err := mapstructure.Decode(params, userCfg); err != nil {
return nil, fmt.Errorf("failed to read dependency ingester configuration from params: %w", err)
}
Expand All @@ -97,38 +107,13 @@ func (gi *Deps) ingestRepository(ctx context.Context, repo *pb.Repository, param
branch := gi.getBranch(repo, userCfg.Branch)
logger.Info().Interface("repo", repo).Msgf("extracting dependencies from %s#%s", repo.GetCloneUrl(), branch)

// We clone to the memfs go-billy filesystem driver, which doesn't
// allow for direct access to the underlying filesystem. This is
// because we want to be able to run this in a sandboxed environment
// where we don't have access to the underlying filesystem.
r, err := gi.gitprov.Clone(ctx, repo.GetCloneUrl(), branch)
if err != nil {
if errors.Is(err, provifv1.ErrProviderGitBranchNotFound) {
return nil, fmt.Errorf("%w: %s: branch %s", engerrors.ErrEvaluationFailed,
provifv1.ErrProviderGitBranchNotFound, branch)
} else if errors.Is(err, provifv1.ErrRepositoryEmpty) {
return nil, fmt.Errorf("%w: %s", engerrors.ErrEvaluationSkipped, provifv1.ErrRepositoryEmpty)
}
return nil, err
}

wt, err := r.Worktree()
if err != nil {
return nil, fmt.Errorf("could not get worktree: %w", err)
}

deps, err := gi.scanMemFs(ctx, wt.Filesystem)
deps, head, err := gi.scanFromUrl(ctx, repo.GetCloneUrl(), branch)
if err != nil {
return nil, fmt.Errorf("could not scan filesystem: %w", err)
}

logger.Debug().Interface("deps", deps).Msgf("Scanning successful: %d nodes found", len(deps.Nodes))

head, err := r.Head()
if err != nil {
return nil, fmt.Errorf("could not get head: %w", err)
}

hsh := head.Hash()

chkpoint := checkpoints.NewCheckpointV1Now().
@@ -163,16 +148,154 @@ func (gi *Deps) getBranch(repo *pb.Repository, userConfigBranch string) string {
return defaultBranch
}

// scanMemFs scans a billy memory filesystem for software dependencies.
func (gi *Deps) scanMemFs(ctx context.Context, memFS billy.Filesystem) (*sbom.NodeList, error) {
if memFS == nil {
return nil, fmt.Errorf("unable to scan dependencies, no active defined")
// ingestTypes returns a sorter function for the given filter type.
// items which compare equal are skipped in output.
var ingestTypes = map[string]func(*sbom.Node, *sbom.Node) int{
"new": func(base *sbom.Node, updated *sbom.Node) int {
return cmp.Compare(base.GetName(), updated.GetName())
},
"new_and_updated": func(base *sbom.Node, updated *sbom.Node) int {
return nodeSorter(base, updated)
},
"all": func(_ *sbom.Node, _ *sbom.Node) int {
return -1
},
}

func nodeSorter(a *sbom.Node, b *sbom.Node) int {
// If we compare by name and version first, we can avoid computing map keys.
res := cmp.Or(cmp.Compare(a.GetName(), b.GetName()),
cmp.Compare(a.GetVersion(), b.GetVersion()))
if res != 0 {
return res
}
// Same name and version, compare hashes. Go's shuffling map keys does not help here.
aHashes := make([]int32, 0, len(a.GetHashes()))
for algo := range a.GetHashes() {
aHashes = append(aHashes, algo)
}
slices.Sort(aHashes)
bHashes := make([]int32, 0, len(b.GetHashes()))
for algo := range b.GetHashes() {
bHashes = append(bHashes, algo)
}
slices.Sort(bHashes)
for i, algo := range aHashes {
if i >= len(bHashes) {
return 1
}
if r := cmp.Compare(algo, bHashes[i]); r != 0 {
return r
}
if r := cmp.Compare(a.GetHashes()[algo], b.GetHashes()[algo]); r != 0 {
return r
}
}
if len(aHashes) < len(bHashes) {
return -1
}
return 0
}

func filterNodes(base []*sbom.Node, updated []*sbom.Node, compare func(*sbom.Node, *sbom.Node) int) []*sbom.Node {
slices.SortFunc(base, nodeSorter)
slices.SortFunc(updated, nodeSorter)

ret := make([]*sbom.Node, 0, len(updated))

baseIdx, newIdx := 0, 0
for baseIdx < len(base) && newIdx < len(updated) {
cmpResult := compare(base[baseIdx], updated[newIdx])
if cmpResult < 0 {
baseIdx++
} else if cmpResult > 0 {
ret = append(ret, updated[newIdx])
newIdx++
} else {
newIdx++
}
}
if newIdx < len(updated) {
ret = append(ret, updated[newIdx:]...)
}
return ret
}
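
As a rough usage sketch (not part of this commit), the three filters defined in ingestTypes behave as follows when passed to filterNodes; the dependency names and versions below are invented:

// Sketch only: assumes it lives in package deps, next to filterNodes.
func exampleFilterNodes() {
	base := []*sbom.Node{
		{Name: "bar", Version: "2.0"},
		{Name: "foo", Version: "1.0"},
	}
	updated := []*sbom.Node{
		{Name: "foo", Version: "1.1"}, // version bumped relative to base
		{Name: "baz", Version: "0.1"}, // newly introduced dependency
	}

	// "new" compares by name only, so the bumped "foo" is treated as
	// already present and dropped: result is [baz@0.1].
	_ = filterNodes(base, updated, ingestTypes["new"])

	// "new_and_updated" compares name, version and hashes, so both the
	// new "baz" and the updated "foo" survive: [baz@0.1, foo@1.1].
	_ = filterNodes(base, updated, ingestTypes["new_and_updated"])

	// "all" never reports a match, so every node in updated is kept.
	_ = filterNodes(base, updated, ingestTypes["all"])
}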

func (gi *Deps) ingestPullRequest(
ctx context.Context, pr *pbinternal.PullRequest, params map[string]any) (*interfaces.Result, error) {
userCfg := &PullRequestConfig{}
if err := mapstructure.Decode(params, userCfg); err != nil {
return nil, fmt.Errorf("failed to read dependency ingester configuration from params: %w", err)
}
if _, ok := ingestTypes[userCfg.Filter]; !ok {
return nil, fmt.Errorf("invalid filter type: %s", userCfg.Filter)
}

if pr.GetBaseCloneUrl() == "" {
return nil, errors.New("could not get base clone url")
}
if pr.GetTargetCloneUrl() == "" {
return nil, errors.New("could not get head clone url")
}
baseDeps, _, err := gi.scanFromUrl(ctx, pr.GetBaseCloneUrl(), pr.GetBaseRef())
if err != nil {
return nil, fmt.Errorf("could not scan base filesystem: %w", err)
}
targetDeps, ref, err := gi.scanFromUrl(ctx, pr.GetTargetCloneUrl(), pr.GetTargetRef())
if err != nil {
return nil, fmt.Errorf("could not scan target filesystem: %w", err)
}

// Overwrite the target list of nodes with the result of filtering by desired match.
// We checked that the filter is valid at the top of the function.
targetDeps.Nodes = filterNodes(baseDeps.GetNodes(), targetDeps.GetNodes(), ingestTypes[userCfg.Filter])

chkpoint := checkpoints.NewCheckpointV1Now().
WithBranch(pr.GetTargetRef()).
WithCommitHash(ref.Hash().String())

return &interfaces.Result{
Object: map[string]any{
"node_list": targetDeps,
},
Checkpoint: chkpoint,
}, nil
}

func (gi *Deps) scanFromUrl(ctx context.Context, url string, branch string) (*sbom.NodeList, *plumbing.Reference, error) {
// We clone to the memfs go-billy filesystem driver, which doesn't
// allow for direct access to the underlying filesystem. This is
// because we want to be able to run this in a sandboxed environment
// where we don't have access to the underlying filesystem.
repo, err := gi.gitprov.Clone(ctx, url, branch)
if err != nil {
if errors.Is(err, provifv1.ErrProviderGitBranchNotFound) {
return nil, nil, fmt.Errorf("%w: %s: branch %s", engerrors.ErrEvaluationFailed,
provifv1.ErrProviderGitBranchNotFound, branch)
} else if errors.Is(err, provifv1.ErrRepositoryEmpty) {
return nil, nil, fmt.Errorf("%w: %s", engerrors.ErrEvaluationSkipped, provifv1.ErrRepositoryEmpty)
}
return nil, nil, err
}

wt, err := repo.Worktree()
if err != nil {
return nil, nil, fmt.Errorf("could not get worktree: %w", err)
}

if wt.Filesystem == nil {
return nil, nil, fmt.Errorf("could not get filesystem")
}

deps, err := gi.extractor.ScanFilesystem(ctx, iofs.New(wt.Filesystem))
if err != nil {
return nil, nil, fmt.Errorf("%T extractor: %w", gi.extractor, err)
}

nl, err := gi.extractor.ScanFilesystem(ctx, iofs.New(memFS))
ref, err := repo.Head()
if err != nil {
return nil, fmt.Errorf("%T extractor: %w", gi.extractor, err)
return nil, nil, fmt.Errorf("could not get head: %w", err)
}

return nl, err
return deps, ref, nil
}