From c1ecca3ca6b445a4261668d119cbf1f7661cb997 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 16 Apr 2026 07:56:48 +0000 Subject: [PATCH 1/5] Initial plan From 88b36c44df9bd9e7c0403090e186e71310c19c71 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 16 Apr 2026 08:34:36 +0000 Subject: [PATCH 2/5] Add shutil.unpack_archive and subprocess tar extraction as TarSlip sources and sinks - Add test cases for shutil.unpack_archive and subprocess.run(["tar", ...]) to tarslip.py - Add ShutilUnpackArchiveSource/Sink for shutil.unpack_archive calls with non-literal filenames - Add SubprocessTarExtractionSource/Sink for subprocess calls invoking tar with extraction flags - Update TarSlip.expected with expected test output for new cases Agent-Logs-Url: https://github.com/github/codeql/sessions/f31a3622-9b18-415f-85f1-62ec14a8319f Co-authored-by: hvitved <3667920+hvitved@users.noreply.github.com> --- .../dataflow/TarSlipCustomizations.qll | 69 +++++++++++++++++++ .../Security/CWE-022-TarSlip/TarSlip.expected | 6 ++ .../Security/CWE-022-TarSlip/tarslip.py | 13 ++++ 3 files changed, 88 insertions(+) diff --git a/python/ql/lib/semmle/python/security/dataflow/TarSlipCustomizations.qll b/python/ql/lib/semmle/python/security/dataflow/TarSlipCustomizations.qll index 2dbe2c542aee..4a0068990dcd 100644 --- a/python/ql/lib/semmle/python/security/dataflow/TarSlipCustomizations.qll +++ b/python/ql/lib/semmle/python/security/dataflow/TarSlipCustomizations.qll @@ -132,6 +132,75 @@ module TarSlip { } } + /** + * A call to `shutil.unpack_archive`, considered as a flow source. + * + * The archive filename is not hardcoded, so it may come from user input. + */ + class ShutilUnpackArchiveSource extends Source { + ShutilUnpackArchiveSource() { + this = API::moduleImport("shutil").getMember("unpack_archive").getACall() and + not this.(DataFlow::CallCfgNode).getArg(0).getALocalSource().asExpr() instanceof StringLiteral + } + } + + /** + * A call to `shutil.unpack_archive`, considered as a flow sink. + * + * The archive filename is not hardcoded, so it may come from user input. + */ + class ShutilUnpackArchiveSink extends Sink { + ShutilUnpackArchiveSink() { + this = API::moduleImport("shutil").getMember("unpack_archive").getACall() and + not this.(DataFlow::CallCfgNode).getArg(0).getALocalSource().asExpr() instanceof StringLiteral + } + } + + /** + * Holds if `call` is a subprocess call that invokes `tar` for archive extraction + * with at least one non-literal argument (the archive filename). + * + * Detects patterns like `subprocess.run(["tar", "-xf", untrusted_filename])`. + */ + private predicate isSubprocessTarExtraction(DataFlow::CallCfgNode call) { + exists(SequenceNode cmdList | + call = + API::moduleImport("subprocess") + .getMember(["run", "call", "check_call", "check_output", "Popen"]) + .getACall() and + cmdList = call.getArg(0).asCfgNode() and + // Command must be "tar" (possibly with a full path like "/usr/bin/tar") + cmdList.getElement(0).getNode().(StringLiteral).getText().matches("%tar") and + // At least one extraction-related flag must be present + exists(string flag | + flag = cmdList.getElement(_).getNode().(StringLiteral).getText() and + (flag.matches("%-x%") or flag = "--extract") + ) and + // At least one non-literal argument (the archive filename) + exists(int i | + i > 0 and + exists(cmdList.getElement(i)) and + not cmdList.getElement(i).getNode() instanceof StringLiteral + ) + ) + } + + /** + * A call to `subprocess` functions that invokes `tar` for archive extraction, + * considered as a flow source. + */ + class SubprocessTarExtractionSource extends Source { + SubprocessTarExtractionSource() { isSubprocessTarExtraction(this) } + } + + /** + * A call to `subprocess` functions that invokes `tar` for archive extraction, + * considered as a flow sink. + */ + class SubprocessTarExtractionSink extends Sink { + SubprocessTarExtractionSink() { isSubprocessTarExtraction(this) } + } + /** * Holds if `g` clears taint for `tarInfo`. * diff --git a/python/ql/test/query-tests/Security/CWE-022-TarSlip/TarSlip.expected b/python/ql/test/query-tests/Security/CWE-022-TarSlip/TarSlip.expected index 6f98ea1aae2b..73a257d13cb6 100644 --- a/python/ql/test/query-tests/Security/CWE-022-TarSlip/TarSlip.expected +++ b/python/ql/test/query-tests/Security/CWE-022-TarSlip/TarSlip.expected @@ -53,6 +53,9 @@ nodes | tarslip.py:112:1:112:3 | ControlFlowNode for tar | semmle.label | ControlFlowNode for tar | | tarslip.py:112:7:112:39 | ControlFlowNode for Attribute() | semmle.label | ControlFlowNode for Attribute() | | tarslip.py:113:24:113:26 | ControlFlowNode for tar | semmle.label | ControlFlowNode for tar | +| tarslip.py:122:1:122:49 | ControlFlowNode for Attribute() | semmle.label | ControlFlowNode for Attribute() | +| tarslip.py:126:1:126:51 | ControlFlowNode for Attribute() | semmle.label | ControlFlowNode for Attribute() | +| tarslip.py:127:1:127:58 | ControlFlowNode for Attribute() | semmle.label | ControlFlowNode for Attribute() | subpaths #select | tarslip.py:15:1:15:3 | ControlFlowNode for tar | tarslip.py:14:7:14:39 | ControlFlowNode for Attribute() | tarslip.py:15:1:15:3 | ControlFlowNode for tar | This file extraction depends on a $@. | tarslip.py:14:7:14:39 | ControlFlowNode for Attribute() | potentially untrusted source | @@ -64,3 +67,6 @@ subpaths | tarslip.py:96:17:96:21 | ControlFlowNode for entry | tarslip.py:94:7:94:39 | ControlFlowNode for Attribute() | tarslip.py:96:17:96:21 | ControlFlowNode for entry | This file extraction depends on a $@. | tarslip.py:94:7:94:39 | ControlFlowNode for Attribute() | potentially untrusted source | | tarslip.py:110:1:110:3 | ControlFlowNode for tar | tarslip.py:109:7:109:39 | ControlFlowNode for Attribute() | tarslip.py:110:1:110:3 | ControlFlowNode for tar | This file extraction depends on a $@. | tarslip.py:109:7:109:39 | ControlFlowNode for Attribute() | potentially untrusted source | | tarslip.py:113:24:113:26 | ControlFlowNode for tar | tarslip.py:112:7:112:39 | ControlFlowNode for Attribute() | tarslip.py:113:24:113:26 | ControlFlowNode for tar | This file extraction depends on a $@. | tarslip.py:112:7:112:39 | ControlFlowNode for Attribute() | potentially untrusted source | +| tarslip.py:122:1:122:49 | ControlFlowNode for Attribute() | tarslip.py:122:1:122:49 | ControlFlowNode for Attribute() | tarslip.py:122:1:122:49 | ControlFlowNode for Attribute() | This file extraction depends on a $@. | tarslip.py:122:1:122:49 | ControlFlowNode for Attribute() | potentially untrusted source | +| tarslip.py:126:1:126:51 | ControlFlowNode for Attribute() | tarslip.py:126:1:126:51 | ControlFlowNode for Attribute() | tarslip.py:126:1:126:51 | ControlFlowNode for Attribute() | This file extraction depends on a $@. | tarslip.py:126:1:126:51 | ControlFlowNode for Attribute() | potentially untrusted source | +| tarslip.py:127:1:127:58 | ControlFlowNode for Attribute() | tarslip.py:127:1:127:58 | ControlFlowNode for Attribute() | tarslip.py:127:1:127:58 | ControlFlowNode for Attribute() | This file extraction depends on a $@. | tarslip.py:127:1:127:58 | ControlFlowNode for Attribute() | potentially untrusted source | diff --git a/python/ql/test/query-tests/Security/CWE-022-TarSlip/tarslip.py b/python/ql/test/query-tests/Security/CWE-022-TarSlip/tarslip.py index 2c06d01adfd5..bf8e2d0e2642 100644 --- a/python/ql/test/query-tests/Security/CWE-022-TarSlip/tarslip.py +++ b/python/ql/test/query-tests/Security/CWE-022-TarSlip/tarslip.py @@ -114,3 +114,16 @@ def safemembers(members): tar = tarfile.open(unsafe_filename_tar) tar.extractall(members=safemembers(tar), filter=extraction_filter) # safe -- we assume `safemembers` makes up for the unsafe filter + +import shutil +import subprocess + +# shutil.unpack_archive +shutil.unpack_archive(unsafe_filename_tar, "out") # unsafe +shutil.unpack_archive("safe.tar", "out") # safe + +# subprocess tar extraction +subprocess.run(["tar", "-xf", unsafe_filename_tar]) # unsafe +subprocess.check_call(["tar", "-xf", unsafe_filename_tar]) # unsafe +subprocess.run(["tar", "-xf", "safe.tar"]) # safe +subprocess.run(["echo", unsafe_filename_tar]) # safe - not a tar extraction From 8efaa5daf154e9d50269f2eddd0d70d226601e7c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 16 Apr 2026 08:38:54 +0000 Subject: [PATCH 3/5] Fix imprecise patterns in isSubprocessTarExtraction predicate Use regexpMatch instead of matches to avoid false positives: - Command name: regexpMatch(\"(.*/)?tar\") to match only \"tar\" or paths ending in \"/tar\" - Extraction flag: regexpMatch(\"-[a-zA-Z]*x[a-zA-Z]*\") to match only single-dash flags containing x Agent-Logs-Url: https://github.com/github/codeql/sessions/f31a3622-9b18-415f-85f1-62ec14a8319f Co-authored-by: hvitved <3667920+hvitved@users.noreply.github.com> --- .../python/security/dataflow/TarSlipCustomizations.qll | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/ql/lib/semmle/python/security/dataflow/TarSlipCustomizations.qll b/python/ql/lib/semmle/python/security/dataflow/TarSlipCustomizations.qll index 4a0068990dcd..29c801a74395 100644 --- a/python/ql/lib/semmle/python/security/dataflow/TarSlipCustomizations.qll +++ b/python/ql/lib/semmle/python/security/dataflow/TarSlipCustomizations.qll @@ -169,12 +169,13 @@ module TarSlip { .getMember(["run", "call", "check_call", "check_output", "Popen"]) .getACall() and cmdList = call.getArg(0).asCfgNode() and - // Command must be "tar" (possibly with a full path like "/usr/bin/tar") - cmdList.getElement(0).getNode().(StringLiteral).getText().matches("%tar") and - // At least one extraction-related flag must be present + // Command must be "tar" or a full path ending in "/tar" (e.g. "/usr/bin/tar") + cmdList.getElement(0).getNode().(StringLiteral).getText().regexpMatch("(.*/)?tar") and + // At least one extraction-related flag must be present: + // single-dash flags containing 'x' (like -x, -xf, -xvf) or the long option --extract exists(string flag | flag = cmdList.getElement(_).getNode().(StringLiteral).getText() and - (flag.matches("%-x%") or flag = "--extract") + (flag.regexpMatch("-[a-zA-Z]*x[a-zA-Z]*") or flag = "--extract") ) and // At least one non-literal argument (the archive filename) exists(int i | From 6d03548d9dc7c3692fe386992988c7f4791a3547 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 16 Apr 2026 08:42:03 +0000 Subject: [PATCH 4/5] Improve clarity of subprocess tar extraction detection patterns Replace regexpMatch for command name with explicit equality check: - cmd = \"tar\" or cmd.matches(\"%/tar\") - clearly matches only exact \"tar\" or paths ending with \"/tar\" Keep flag check as regexpMatch since it naturally excludes double-dash flags Agent-Logs-Url: https://github.com/github/codeql/sessions/f31a3622-9b18-415f-85f1-62ec14a8319f Co-authored-by: hvitved <3667920+hvitved@users.noreply.github.com> --- .../python/security/dataflow/TarSlipCustomizations.qll | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/ql/lib/semmle/python/security/dataflow/TarSlipCustomizations.qll b/python/ql/lib/semmle/python/security/dataflow/TarSlipCustomizations.qll index 29c801a74395..2289afd59120 100644 --- a/python/ql/lib/semmle/python/security/dataflow/TarSlipCustomizations.qll +++ b/python/ql/lib/semmle/python/security/dataflow/TarSlipCustomizations.qll @@ -169,8 +169,11 @@ module TarSlip { .getMember(["run", "call", "check_call", "check_output", "Popen"]) .getACall() and cmdList = call.getArg(0).asCfgNode() and - // Command must be "tar" or a full path ending in "/tar" (e.g. "/usr/bin/tar") - cmdList.getElement(0).getNode().(StringLiteral).getText().regexpMatch("(.*/)?tar") and + // Command must be "tar" exactly or a path ending in "/tar" (e.g. "/usr/bin/tar") + exists(string cmd | + cmd = cmdList.getElement(0).getNode().(StringLiteral).getText() and + (cmd = "tar" or cmd.matches("%/tar")) + ) and // At least one extraction-related flag must be present: // single-dash flags containing 'x' (like -x, -xf, -xvf) or the long option --extract exists(string flag | From 50848834a88c70ca67adf1d9c1d6c422c5a3ae12 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 16 Apr 2026 10:11:37 +0000 Subject: [PATCH 5/5] Address reviewer feedback: fix sink modeling for shutil and subprocess - Remove ShutilUnpackArchiveSource (should not be a source) - Change ShutilUnpackArchiveSink to target getArg(0) (the filename arg, not the whole call); removes the now-redundant literal check - Remove SubprocessTarExtractionSource (should not be a source) - Change SubprocessTarExtractionSink to target the specific non-literal element in the command list (the filename), not the call itself - Remove private isSubprocessTarExtraction predicate (inlined into the sink) - Revert TarSlip.expected to its pre-PR state (the new sinks require proper source taint flow to fire) Agent-Logs-Url: https://github.com/github/codeql/sessions/833673da-f868-4c3b-8bff-62364ee0ed19 Co-authored-by: hvitved <3667920+hvitved@users.noreply.github.com> --- .../dataflow/TarSlipCustomizations.qll | 87 +++++++------------ .../Security/CWE-022-TarSlip/TarSlip.expected | 6 -- 2 files changed, 30 insertions(+), 63 deletions(-) diff --git a/python/ql/lib/semmle/python/security/dataflow/TarSlipCustomizations.qll b/python/ql/lib/semmle/python/security/dataflow/TarSlipCustomizations.qll index 2289afd59120..fb28100f8e94 100644 --- a/python/ql/lib/semmle/python/security/dataflow/TarSlipCustomizations.qll +++ b/python/ql/lib/semmle/python/security/dataflow/TarSlipCustomizations.qll @@ -132,77 +132,50 @@ module TarSlip { } } - /** - * A call to `shutil.unpack_archive`, considered as a flow source. - * - * The archive filename is not hardcoded, so it may come from user input. - */ - class ShutilUnpackArchiveSource extends Source { - ShutilUnpackArchiveSource() { - this = API::moduleImport("shutil").getMember("unpack_archive").getACall() and - not this.(DataFlow::CallCfgNode).getArg(0).getALocalSource().asExpr() instanceof StringLiteral - } - } - /** * A call to `shutil.unpack_archive`, considered as a flow sink. * - * The archive filename is not hardcoded, so it may come from user input. + * The first argument (the archive filename) is the sink. */ class ShutilUnpackArchiveSink extends Sink { ShutilUnpackArchiveSink() { - this = API::moduleImport("shutil").getMember("unpack_archive").getACall() and - not this.(DataFlow::CallCfgNode).getArg(0).getALocalSource().asExpr() instanceof StringLiteral + this = API::moduleImport("shutil").getMember("unpack_archive").getACall().getArg(0) } } - /** - * Holds if `call` is a subprocess call that invokes `tar` for archive extraction - * with at least one non-literal argument (the archive filename). - * - * Detects patterns like `subprocess.run(["tar", "-xf", untrusted_filename])`. - */ - private predicate isSubprocessTarExtraction(DataFlow::CallCfgNode call) { - exists(SequenceNode cmdList | - call = - API::moduleImport("subprocess") - .getMember(["run", "call", "check_call", "check_output", "Popen"]) - .getACall() and - cmdList = call.getArg(0).asCfgNode() and - // Command must be "tar" exactly or a path ending in "/tar" (e.g. "/usr/bin/tar") - exists(string cmd | - cmd = cmdList.getElement(0).getNode().(StringLiteral).getText() and - (cmd = "tar" or cmd.matches("%/tar")) - ) and - // At least one extraction-related flag must be present: - // single-dash flags containing 'x' (like -x, -xf, -xvf) or the long option --extract - exists(string flag | - flag = cmdList.getElement(_).getNode().(StringLiteral).getText() and - (flag.regexpMatch("-[a-zA-Z]*x[a-zA-Z]*") or flag = "--extract") - ) and - // At least one non-literal argument (the archive filename) - exists(int i | - i > 0 and - exists(cmdList.getElement(i)) and - not cmdList.getElement(i).getNode() instanceof StringLiteral - ) - ) - } - - /** - * A call to `subprocess` functions that invokes `tar` for archive extraction, - * considered as a flow source. - */ - class SubprocessTarExtractionSource extends Source { - SubprocessTarExtractionSource() { isSubprocessTarExtraction(this) } - } - /** * A call to `subprocess` functions that invokes `tar` for archive extraction, * considered as a flow sink. + * + * The sink is the non-literal element in the command list that corresponds + * to the archive filename (e.g. the `unsafe_filename` in + * `subprocess.run(["tar", "-xf", unsafe_filename])`). */ class SubprocessTarExtractionSink extends Sink { - SubprocessTarExtractionSink() { isSubprocessTarExtraction(this) } + SubprocessTarExtractionSink() { + exists(SequenceNode cmdList, DataFlow::CallCfgNode call, int i | + call = + API::moduleImport("subprocess") + .getMember(["run", "call", "check_call", "check_output", "Popen"]) + .getACall() and + cmdList = call.getArg(0).asCfgNode() and + // Command must be "tar" exactly or a path ending in "/tar" (e.g. "/usr/bin/tar") + exists(string cmd | + cmd = cmdList.getElement(0).getNode().(StringLiteral).getText() and + (cmd = "tar" or cmd.matches("%/tar")) + ) and + // At least one extraction-related flag must be present: + // single-dash flags containing 'x' (like -x, -xf, -xvf) or the long option --extract + exists(string flag | + flag = cmdList.getElement(_).getNode().(StringLiteral).getText() and + (flag.regexpMatch("-[a-zA-Z]*x[a-zA-Z]*") or flag = "--extract") + ) and + // The sink is the specific non-literal argument (the archive filename) + i > 0 and + not cmdList.getElement(i).getNode() instanceof StringLiteral and + this.asCfgNode() = cmdList.getElement(i) + ) + } } /** diff --git a/python/ql/test/query-tests/Security/CWE-022-TarSlip/TarSlip.expected b/python/ql/test/query-tests/Security/CWE-022-TarSlip/TarSlip.expected index 73a257d13cb6..6f98ea1aae2b 100644 --- a/python/ql/test/query-tests/Security/CWE-022-TarSlip/TarSlip.expected +++ b/python/ql/test/query-tests/Security/CWE-022-TarSlip/TarSlip.expected @@ -53,9 +53,6 @@ nodes | tarslip.py:112:1:112:3 | ControlFlowNode for tar | semmle.label | ControlFlowNode for tar | | tarslip.py:112:7:112:39 | ControlFlowNode for Attribute() | semmle.label | ControlFlowNode for Attribute() | | tarslip.py:113:24:113:26 | ControlFlowNode for tar | semmle.label | ControlFlowNode for tar | -| tarslip.py:122:1:122:49 | ControlFlowNode for Attribute() | semmle.label | ControlFlowNode for Attribute() | -| tarslip.py:126:1:126:51 | ControlFlowNode for Attribute() | semmle.label | ControlFlowNode for Attribute() | -| tarslip.py:127:1:127:58 | ControlFlowNode for Attribute() | semmle.label | ControlFlowNode for Attribute() | subpaths #select | tarslip.py:15:1:15:3 | ControlFlowNode for tar | tarslip.py:14:7:14:39 | ControlFlowNode for Attribute() | tarslip.py:15:1:15:3 | ControlFlowNode for tar | This file extraction depends on a $@. | tarslip.py:14:7:14:39 | ControlFlowNode for Attribute() | potentially untrusted source | @@ -67,6 +64,3 @@ subpaths | tarslip.py:96:17:96:21 | ControlFlowNode for entry | tarslip.py:94:7:94:39 | ControlFlowNode for Attribute() | tarslip.py:96:17:96:21 | ControlFlowNode for entry | This file extraction depends on a $@. | tarslip.py:94:7:94:39 | ControlFlowNode for Attribute() | potentially untrusted source | | tarslip.py:110:1:110:3 | ControlFlowNode for tar | tarslip.py:109:7:109:39 | ControlFlowNode for Attribute() | tarslip.py:110:1:110:3 | ControlFlowNode for tar | This file extraction depends on a $@. | tarslip.py:109:7:109:39 | ControlFlowNode for Attribute() | potentially untrusted source | | tarslip.py:113:24:113:26 | ControlFlowNode for tar | tarslip.py:112:7:112:39 | ControlFlowNode for Attribute() | tarslip.py:113:24:113:26 | ControlFlowNode for tar | This file extraction depends on a $@. | tarslip.py:112:7:112:39 | ControlFlowNode for Attribute() | potentially untrusted source | -| tarslip.py:122:1:122:49 | ControlFlowNode for Attribute() | tarslip.py:122:1:122:49 | ControlFlowNode for Attribute() | tarslip.py:122:1:122:49 | ControlFlowNode for Attribute() | This file extraction depends on a $@. | tarslip.py:122:1:122:49 | ControlFlowNode for Attribute() | potentially untrusted source | -| tarslip.py:126:1:126:51 | ControlFlowNode for Attribute() | tarslip.py:126:1:126:51 | ControlFlowNode for Attribute() | tarslip.py:126:1:126:51 | ControlFlowNode for Attribute() | This file extraction depends on a $@. | tarslip.py:126:1:126:51 | ControlFlowNode for Attribute() | potentially untrusted source | -| tarslip.py:127:1:127:58 | ControlFlowNode for Attribute() | tarslip.py:127:1:127:58 | ControlFlowNode for Attribute() | tarslip.py:127:1:127:58 | ControlFlowNode for Attribute() | This file extraction depends on a $@. | tarslip.py:127:1:127:58 | ControlFlowNode for Attribute() | potentially untrusted source |