From f213f22a3c74d40a761ceac7fb940d09a517ee13 Mon Sep 17 00:00:00 2001 From: GeekMasher Date: Mon, 3 Mar 2025 12:09:13 +0000 Subject: [PATCH 1/4] feat(python): Add initial model generator support --- .../new/internal/DataFlowDispatch.qll | 5 + .../modelgenerator/CaptureSummaryModels.ql | 13 ++ .../modelgenerator/internal/CaptureModels.qll | 201 ++++++++++++++++++ 3 files changed, 219 insertions(+) create mode 100644 python/ql/src/utils/modelgenerator/CaptureSummaryModels.ql create mode 100644 python/ql/src/utils/modelgenerator/internal/CaptureModels.qll diff --git a/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowDispatch.qll b/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowDispatch.qll index 1a38593bce48..a4471af2f70c 100644 --- a/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowDispatch.qll +++ b/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowDispatch.qll @@ -85,6 +85,11 @@ newtype TParameterPosition = /** A parameter position. */ class ParameterPosition extends TParameterPosition { + /** Gets the underlying integer position, if any. */ + int getPosition() { + this = TPositionalParameterPosition(result) + } + /** Holds if this position represents a `self`/`cls` parameter. */ predicate isSelf() { this = TSelfParameterPosition() } diff --git a/python/ql/src/utils/modelgenerator/CaptureSummaryModels.ql b/python/ql/src/utils/modelgenerator/CaptureSummaryModels.ql new file mode 100644 index 000000000000..e527761eaa5c --- /dev/null +++ b/python/ql/src/utils/modelgenerator/CaptureSummaryModels.ql @@ -0,0 +1,13 @@ +/** + * @name Capture summary models. + * @description Finds applicable summary models to be used by other queries. 
+ * @kind diagnostic + * @id python/utils/modelgenerator/summary-models + * @tags modelgenerator + */ + +import internal.CaptureModels + +from DataFlowSummaryTargetApi api, string flow +where flow = captureFlow(api) +select flow order by flow diff --git a/python/ql/src/utils/modelgenerator/internal/CaptureModels.qll b/python/ql/src/utils/modelgenerator/internal/CaptureModels.qll new file mode 100644 index 000000000000..296bc6a776fa --- /dev/null +++ b/python/ql/src/utils/modelgenerator/internal/CaptureModels.qll @@ -0,0 +1,201 @@ +/** + * Provides predicates related to capturing summary models of the Standard or a 3rd party library. + */ + +private import codeql.util.Unit +private import python as P +// DataFlow +private import semmle.python.dataflow.new.DataFlow +private import semmle.python.dataflow.new.internal.DataFlowImpl +private import semmle.python.dataflow.new.internal.DataFlowDispatch +private import semmle.python.dataflow.new.internal.DataFlowImplSpecific +private import semmle.python.dataflow.new.internal.TaintTrackingImplSpecific +// ApiGraph +private import semmle.python.frameworks.data.internal.ApiGraphModels as ExternalFlow +private import semmle.python.dataflow.new.internal.DataFlowImplCommon as DataFlowImplCommon +private import semmle.python.dataflow.new.internal.DataFlowPrivate as DataFlowPrivate +private import codeql.mad.modelgenerator.internal.ModelGeneratorImpl +private import modeling.ModelEditor + +module ModelGeneratorInput implements ModelGeneratorInputSig { + class Type = Unit; // P::Type ? 
+ + class Parameter = DataFlow::ParameterNode; + + // class Callable = Callable; + class Callable instanceof DataFlowCallable { + string toString() { result = super.toString() } + } + + class NodeExtended extends DataFlow::Node { + Callable getAsExprEnclosingCallable() { result = this.getEnclosingCallable() } + + Type getType() { any() } + + override Callable getEnclosingCallable() { result = super.getEnclosingCallable() } + + // override Callable getEnclosingCallable() { + // result = this.(DataFlow::Node).getEnclosingCallable().(DataFlowFunction).getScope() + // // result = this.(DataFlow::Node).getEnclosingCallable().(DataFlowFunction).getScope() + // // exists(P::Function func | + // // func.getScope() = this.(DataFlow::Node).getEnclosingCallable().getScope() + // // | + // // result = func + // // ) + // } + + Parameter asParameter() { result = this } + } + + private predicate relevant(Callable api) { any() } + + predicate isUninterestingForDataFlowModels(Callable api) { none() } + + predicate isUninterestingForHeuristicDataFlowModels(Callable api) { none() } + + class SourceOrSinkTargetApi extends Callable { + SourceOrSinkTargetApi() { relevant(this) } + } + + class SinkTargetApi extends SourceOrSinkTargetApi { } + + class SourceTargetApi extends SourceOrSinkTargetApi { } + + class SummaryTargetApi extends Callable { + private Callable lift; + + SummaryTargetApi() { + lift = this and + relevant(this) + } + + Callable lift() { result = lift } + + predicate isRelevant() { relevant(this) } + } + + // /** + // * ` + // */ + // private predicate qualifiedName(Callable c, string package, string type) { + // result = c. 
+ // } + + predicate isRelevantType(Type t) { any() } + + Type getUnderlyingContentType(DataFlow::ContentSet c) { result = any(Type t) and exists(c) } + + string qualifierString() { result = "Argument[this]" } + + string parameterAccess(Parameter p) { + // TODO: Implement this to support named parameters + result = "Argument[" + p.getPosition().toString() + "]" + // result = "param[]" + } + + string parameterContentAccess(Parameter p) { result = "Argument[]" } + + class InstanceParameterNode extends DataFlow::ParameterNode { + InstanceParameterNode() { this.getParameter().isSelf() } + } + + bindingset[c] + string paramReturnNodeAsOutput(Callable c, ParameterPosition pos) { + result = parameterAccess(c.(DataFlowCallable).getParameter(pos)) + } + + bindingset[c] + string paramReturnNodeAsContentOutput(Callable c, ParameterPosition pos) { + result = parameterContentAccess(c.(DataFlowCallable).getParameter(pos)) + or + pos.isSelf() and result = qualifierString() + } + + Callable returnNodeEnclosingCallable(DataFlow::Node ret) { + // TODO + result = DataFlowImplCommon::getNodeEnclosingCallable(ret) + } + + predicate isOwnInstanceAccessNode(DataFlowPrivate::ReturnNode node) { none() } + + predicate sinkModelSanitizer(DataFlow::Node node) { none() } + + predicate apiSource(DataFlow::Node source) { none() } + + predicate irrelevantSourceSinkApi(Callable source, SourceTargetApi api) { none() } + + string getInputArgument(DataFlow::Node source) { result = "getInputArgument(" + source + ")" } + + bindingset[kind] + predicate isRelevantSinkKind(string kind) { + not kind = "log-injection" and + not kind.matches("regex-use%") and + not kind = "file-content-store" + } + + bindingset[kind] + predicate isRelevantSourceKind(string kind) { any() } + + predicate containerContent(DataFlow::ContentSet c) { + // TODO + any() + } + + predicate isAdditionalContentFlowStep(DataFlow::Node node1, DataFlow::Node node2) { none() } + + predicate isField(DataFlow::ContentSet c) { any() } + + 
predicate isCallback(DataFlow::ContentSet c) { none() } + + string getSyntheticName(DataFlow::ContentSet c) { none() } + + string printContent(DataFlow::ContentSet c) { + // TODO + result = "Memeber[]" + // exists(Parameter param | + // param = c.(Public::ParameterNode).getParameter() + // | + // result = "Member[" + param.getName() + "]" + // ) + // exists(string name, string arg | + // name = "Member" and + // if arg = "" then result = name else result = "Memeber[" + arg + "]" + // ) + } + + /** + * - ["argparse.ArgumentParser", "Member[_parse_known_args,_read_args_from_files]", "Argument[0,arg_strings:]", "ReturnValue", "taint"] + */ + string partialModelRow(Callable api, int i) { + exists(Endpoint e | e = api.(DataFlowFunction).getScope() | + i = 0 and result = e.getNamespace() + or + i = 1 and result = e.getClass() + or + i = 2 and result = e.getFunctionName() + or + i = 3 and result = e.getParameters() + + ) + // and + // // i = 0 and qualifiedName(api, result, _) // package[.Class] + // i = 0 and result = api.(DataFlowCallable) + // or + // i = 1 and result = "1" // name + // or + // i = 2 and + // result = "2" + // TODO + // exists(Parameter p | p = api.getArg(_) | result = "Member[" + p.getName() + "]") // parameters + } + + string partialNeutralModelRow(Callable api, int i) { result = partialModelRow(api, i) } + + // TODO: Implement this when we want to generate sources. + predicate sourceNode(DataFlow::Node node, string kind) { none() } + + // TODO: Implement this when we want to generate sinks. + predicate sinkNode(DataFlow::Node node, string kind) { none() } +} + +import MakeModelGenerator From f108276cf65c2bc506f3c0a0bb2ef81486264527 Mon Sep 17 00:00:00 2001 From: yoff Date: Mon, 3 Mar 2025 13:43:07 +0100 Subject: [PATCH 2/4] Python: implement missing bits - Settle on `DataFlowCallable` as `Callable`. Alternative is to use (something extending) `Function`, but then we need to implement enclosing callable. - Reuse `Endpoint` from ModelEditor.qll. 
- clean up comments --- .../modelgenerator/internal/CaptureModels.qll | 67 +++---------------- 1 file changed, 10 insertions(+), 57 deletions(-) diff --git a/python/ql/src/utils/modelgenerator/internal/CaptureModels.qll b/python/ql/src/utils/modelgenerator/internal/CaptureModels.qll index 296bc6a776fa..e127f626534a 100644 --- a/python/ql/src/utils/modelgenerator/internal/CaptureModels.qll +++ b/python/ql/src/utils/modelgenerator/internal/CaptureModels.qll @@ -18,11 +18,10 @@ private import codeql.mad.modelgenerator.internal.ModelGeneratorImpl private import modeling.ModelEditor module ModelGeneratorInput implements ModelGeneratorInputSig { - class Type = Unit; // P::Type ? + class Type = Unit; class Parameter = DataFlow::ParameterNode; - // class Callable = Callable; class Callable instanceof DataFlowCallable { string toString() { result = super.toString() } } @@ -34,16 +33,6 @@ module ModelGeneratorInput implements ModelGeneratorInputSig Date: Mon, 3 Mar 2025 16:36:09 +0000 Subject: [PATCH 3/4] feat(python): Add / Update Capture models --- .../utils/modelgenerator/CaptureNeutralModels.ql | 13 +++++++++++++ .../src/utils/modelgenerator/CaptureSinkModels.ql | 13 +++++++++++++ .../src/utils/modelgenerator/CaptureSourceModels.ql | 13 +++++++++++++ .../utils/modelgenerator/CaptureSummaryModels.ql | 2 +- 4 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 python/ql/src/utils/modelgenerator/CaptureNeutralModels.ql create mode 100644 python/ql/src/utils/modelgenerator/CaptureSinkModels.ql create mode 100644 python/ql/src/utils/modelgenerator/CaptureSourceModels.ql diff --git a/python/ql/src/utils/modelgenerator/CaptureNeutralModels.ql b/python/ql/src/utils/modelgenerator/CaptureNeutralModels.ql new file mode 100644 index 000000000000..6c620f087213 --- /dev/null +++ b/python/ql/src/utils/modelgenerator/CaptureNeutralModels.ql @@ -0,0 +1,13 @@ +/** + * @name Capture neutral models. + * @description Finds neutral models to be used by other queries. 
+ * @kind diagnostic + * @id py/utils/modelgenerator/neutral-models + * @tags modelgenerator + */ + +import internal.CaptureModels + +from DataFlowSummaryTargetApi api, string noflow +where noflow = captureNoFlow(api) +select noflow order by noflow diff --git a/python/ql/src/utils/modelgenerator/CaptureSinkModels.ql b/python/ql/src/utils/modelgenerator/CaptureSinkModels.ql new file mode 100644 index 000000000000..f917262d8350 --- /dev/null +++ b/python/ql/src/utils/modelgenerator/CaptureSinkModels.ql @@ -0,0 +1,13 @@ +/** + * @name Capture sink models. + * @description Finds public methods that act as sinks as they flow into a known sink. + * @kind diagnostic + * @id py/utils/modelgenerator/sink-models + * @tags modelgenerator + */ + +import internal.CaptureModels + +from DataFlowSinkTargetApi api, string sink +where sink = captureSink(api) +select sink order by sink diff --git a/python/ql/src/utils/modelgenerator/CaptureSourceModels.ql b/python/ql/src/utils/modelgenerator/CaptureSourceModels.ql new file mode 100644 index 000000000000..a22fd647362e --- /dev/null +++ b/python/ql/src/utils/modelgenerator/CaptureSourceModels.ql @@ -0,0 +1,13 @@ +/** + * @name Capture source models. + * @description Finds APIs that act as sources as they expose already known sources. + * @kind diagnostic + * @id py/utils/modelgenerator/source-models + * @tags modelgenerator + */ + +import internal.CaptureModels + +from DataFlowSourceTargetApi api, string source +where source = captureSource(api) +select source order by source diff --git a/python/ql/src/utils/modelgenerator/CaptureSummaryModels.ql b/python/ql/src/utils/modelgenerator/CaptureSummaryModels.ql index e527761eaa5c..05fd7bf5dcc3 100644 --- a/python/ql/src/utils/modelgenerator/CaptureSummaryModels.ql +++ b/python/ql/src/utils/modelgenerator/CaptureSummaryModels.ql @@ -2,7 +2,7 @@ * @name Capture summary models. * @description Finds applicable summary models to be used by other queries. 
* @kind diagnostic - * @id python/utils/modelgenerator/summary-models + * @id py/utils/modelgenerator/summary-models * @tags modelgenerator */ From e0437e51a13bc320d531e3b88e770f549e306a47 Mon Sep 17 00:00:00 2001 From: GeekMasher Date: Tue, 11 Mar 2025 16:12:04 +0000 Subject: [PATCH 4/4] feat(python): Update source and sink summary queries --- python/ql/lib/modeling/Util.qll | 2 +- .../data/internal/ApiGraphModels.qll | 2 +- .../modelgenerator/internal/CaptureModels.qll | 142 +++++++++++++++--- 3 files changed, 124 insertions(+), 22 deletions(-) diff --git a/python/ql/lib/modeling/Util.qll b/python/ql/lib/modeling/Util.qll index e0bbc334e4cb..99c99f633368 100644 --- a/python/ql/lib/modeling/Util.qll +++ b/python/ql/lib/modeling/Util.qll @@ -11,7 +11,7 @@ private import semmle.python.filters.Tests */ class TestFile extends File { TestFile() { - this.getRelativePath().regexpMatch(".*(test|spec|examples).+") and + this.getRelativePath().regexpMatch(".*(test|spec|examples|__main__).+") and not this.getAbsolutePath().matches("%/ql/test/%") // allows our test cases to work } } diff --git a/python/ql/lib/semmle/python/frameworks/data/internal/ApiGraphModels.qll b/python/ql/lib/semmle/python/frameworks/data/internal/ApiGraphModels.qll index c04a85487acf..ed8f98cdcc9a 100644 --- a/python/ql/lib/semmle/python/frameworks/data/internal/ApiGraphModels.qll +++ b/python/ql/lib/semmle/python/frameworks/data/internal/ApiGraphModels.qll @@ -332,7 +332,7 @@ predicate sourceModel(string type, string path, string kind, string model) { } /** Holds if a sink model exists for the given parameters. 
*/ -private predicate sinkModel(string type, string path, string kind, string model) { +predicate sinkModel(string type, string path, string kind, string model) { any(DeprecationAdapter a).sinkModel(type, path, kind) and model = "SinkModelCsv" or diff --git a/python/ql/src/utils/modelgenerator/internal/CaptureModels.qll b/python/ql/src/utils/modelgenerator/internal/CaptureModels.qll index e127f626534a..c8f64c7df330 100644 --- a/python/ql/src/utils/modelgenerator/internal/CaptureModels.qll +++ b/python/ql/src/utils/modelgenerator/internal/CaptureModels.qll @@ -14,8 +14,15 @@ private import semmle.python.dataflow.new.internal.TaintTrackingImplSpecific private import semmle.python.frameworks.data.internal.ApiGraphModels as ExternalFlow private import semmle.python.dataflow.new.internal.DataFlowImplCommon as DataFlowImplCommon private import semmle.python.dataflow.new.internal.DataFlowPrivate as DataFlowPrivate +private import semmle.python.dataflow.new.TaintTracking private import codeql.mad.modelgenerator.internal.ModelGeneratorImpl private import modeling.ModelEditor +private import modeling.Util as ModelEditorUtil +// Concepts +private import semmle.python.Concepts +private import semmle.python.security.dataflow.CodeInjectionCustomizations +private import semmle.python.security.dataflow.ServerSideRequestForgeryCustomizations +private import semmle.python.security.dataflow.UnsafeDeserializationCustomizations module ModelGeneratorInput implements ModelGeneratorInputSig { class Type = Unit; @@ -36,19 +43,41 @@ module ModelGeneratorInput implements ModelGeneratorInputSig