Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions codeanalyzer/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,32 @@ def main(
case_sensitive=False,
),
] = OutputFormat.JSON,
analysis_level: Annotated[
int,
typer.Option("-a", "--analysis-level", help="1: symbol table, 2: call graph (requires --codeql), 3: taint analysis (requires --codeql)."),
] = 1,
using_codeql: Annotated[
bool, typer.Option("--codeql/--no-codeql", help="Enable CodeQL-based analysis.")
] = False,
taint_config: Annotated[
Optional[Path],
typer.Option(
"--taint-config",
help="Path to taint analysis configuration file (YAML or JSON). Used with --analysis-level 3.",
),
] = None,
taint_use_defaults: Annotated[
bool,
typer.Option(
"--taint-defaults/--no-taint-defaults",
help=(
"Controls which taint sources/sinks/sanitizers are active:\n\n"
" (no --taint-config) → built-in defaults only\n"
" --taint-config + --taint-defaults → union of defaults and custom config [default]\n"
" --taint-config + --no-taint-defaults → custom config only, replaces all defaults"
),
),
] = True,
using_ray: Annotated[
bool,
typer.Option("--ray/--no-ray", help="Enable Ray for distributed analysis."),
Expand Down Expand Up @@ -74,10 +97,24 @@ def main(
int, typer.Option("-v", count=True, help="Increase verbosity: -v, -vv, -vvv")
] = 0,
):
# Validate analysis level requirements
if analysis_level >= 2 and not using_codeql:
logger.error("Analysis levels 2 and 3 require --codeql flag")
raise typer.Exit(code=1)

if analysis_level >= 3 and taint_config and not taint_config.exists():
logger.error(f"Taint configuration file '{taint_config}' does not exist.")
raise typer.Exit(code=1)

if not taint_use_defaults and not taint_config:
logger.error("--no-taint-defaults requires --taint-config (otherwise nothing would be analyzed).")
raise typer.Exit(code=1)

options = AnalysisOptions(
input=input,
output=output,
format=format,
analysis_level=analysis_level,
using_codeql=using_codeql,
using_ray=using_ray,
rebuild_analysis=rebuild_analysis,
Expand All @@ -86,6 +123,8 @@ def main(
cache_dir=cache_dir,
clear_cache=clear_cache,
verbosity=verbosity,
taint_config=taint_config,
taint_use_defaults=taint_use_defaults,
)

_set_log_level(options.verbosity)
Expand Down
181 changes: 181 additions & 0 deletions codeanalyzer/config/taint_config_defaults.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
################################################################################
# Copyright IBM Corporation 2025
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################

"""Default taint analysis configuration.

Design
------
The generated CodeQL query uses CodeQL's built-in security models as the
primary detection layer — all 20 ``*Customizations`` modules shipped with
``codeql/python-all 7.x`` are imported, covering:

SQL Injection, Command Injection, Code Injection, Path Traversal,
Reflected XSS, LDAP Injection, XXE, SSRF, SSTI, Unsafe Deserialization,
Open Redirect, Log Injection, NoSQL Injection, XPath Injection,
Tar/Zip Slip, HTTP Header Injection, Cleartext Storage, Cleartext Logging,
Cookie Injection, Regular Expression Injection (ReDoS).

The patterns defined here are **supplementary** — they extend built-in
coverage with sources that are not modelled by CodeQL's ``RemoteFlowSource``:

Sources not in RemoteFlowSource:
- ``sys.argv`` — command-line arguments
- ``input()`` — interactive user input
- ``os.getenv()`` — environment variables
- ``os.environ.get()`` — environment variables
- ``requests.get()`` / ``requests.post()`` — the ``.text`` body of outbound HTTP responses used as data sources

Sinks:
- The default sinks list is intentionally empty — all common sinks are
covered by the built-in CodeQL models. Add project-specific sinks here
only when they are NOT covered by the built-ins.

Sanitizers:
- Common HTML/path/command sanitizers that CodeQL may not model as barriers.

Users can extend or override this configuration via a YAML/JSON file passed
with ``--taint-config``. All CodeQL patterns must use double-quoted strings.
"""

from codeanalyzer.schema.py_schema import (
TaintAnalysisConfig,
TaintSourceConfig,
TaintSinkConfig,
TaintSanitizerConfig,
)


def get_default_taint_config() -> TaintAnalysisConfig:
    """Return the default supplementary taint analysis configuration.

    The returned configuration contains only the patterns declared in this
    function: six sources, no sinks, six sanitizers, and the analysis
    options. Every ``pattern`` is a CodeQL expression stored as a plain
    string; nothing is evaluated or validated here.

    Each ``description`` is kept in step with what its ``pattern`` actually
    matches, so the text does not promise coverage the pattern lacks.

    Returns:
        TaintAnalysisConfig: A freshly constructed configuration object; a
        new instance (with new lists) is built on every call.
    """
    return TaintAnalysisConfig(
        sources=[
            # --- Sources not covered by CodeQL's RemoteFlowSource ---

            # Command-line arguments
            TaintSourceConfig(
                name="command_line_args",
                description="Command-line arguments via sys.argv",
                pattern='API::moduleImport("sys").getMember("argv")',
                source_type="command_line_argument",
            ),

            # Interactive user input
            TaintSourceConfig(
                name="user_input",
                description="Direct user input via input() function",
                pattern='API::builtin("input").getACall()',
                source_type="user_input",
            ),

            # Environment variables
            TaintSourceConfig(
                name="env_getenv",
                description="Environment variables via os.getenv",
                pattern='API::moduleImport("os").getMember("getenv").getACall()',
                source_type="environment_variable",
            ),
            TaintSourceConfig(
                name="env_environ_get",
                description="Environment variables via os.environ.get",
                pattern='API::moduleImport("os").getMember("environ").getMember("get").getACall()',
                source_type="environment_variable",
            ),

            # Outbound HTTP responses used as data sources (requests library).
            # Only the ``.text`` attribute of the response is matched by these
            # patterns; ``.json()`` and ``.content`` are NOT covered and would
            # need their own entries.
            TaintSourceConfig(
                name="requests_get_response",
                description="HTTP GET response body (requests.get().text)",
                pattern='API::moduleImport("requests").getMember("get").getReturn().getMember("text")',
                source_type="http_response",
            ),
            TaintSourceConfig(
                name="requests_post_response",
                description="HTTP POST response body (requests.post().text)",
                pattern='API::moduleImport("requests").getMember("post").getReturn().getMember("text")',
                source_type="http_response",
            ),
        ],

        sinks=[
            # Intentionally empty: the built-in CodeQL security models are
            # expected to cover the common sinks (SQL, command, code, path, XSS,
            # LDAP, XXE, SSRF, SSTI, deserialization, open redirect, log
            # injection, NoSQL, XPath, tar/zip slip, HTTP header injection,
            # cleartext storage/logging, cookie injection, ReDoS).
            #
            # Add project-specific sinks here only when they are NOT covered by
            # the built-ins.
        ],

        sanitizers=[
            # HTML / XSS sanitizers
            # NOTE(review): html.escape only rewrites HTML metacharacters
            # (&, <, >, quotes); it does not neutralise template syntax such as
            # "{{ ... }}". Confirm that listing "template_injection" here cannot
            # suppress genuine SSTI findings.
            TaintSanitizerConfig(
                name="html_escape",
                description="HTML escape function (html.escape)",
                pattern='API::moduleImport("html").getMember("escape").getACall()',
                sanitizes=["xss", "template_injection"],
            ),
            # Only markupsafe.escape() is matched. Markup() is deliberately not
            # described as a sanitizer: it marks a string as safe without
            # escaping it.
            TaintSanitizerConfig(
                name="markupsafe_escape",
                description="MarkupSafe escape()",
                pattern='API::moduleImport("markupsafe").getMember("escape").getACall()',
                sanitizes=["xss"],
            ),

            # Command injection sanitizers
            TaintSanitizerConfig(
                name="shlex_quote",
                description="Shell argument quoting via shlex.quote",
                pattern='API::moduleImport("shlex").getMember("quote").getACall()',
                sanitizes=["command_injection"],
            ),

            # Path traversal sanitizers
            # NOTE(review): normpath / abspath / resolve normalise a path but do
            # not confine it to a base directory; on their own they do not stop
            # traversal. Confirm these should act as full barriers rather than
            # as a normalisation step that still requires a prefix check.
            TaintSanitizerConfig(
                name="os_path_normpath",
                description="Path normalization via os.path.normpath",
                pattern='API::moduleImport("os").getMember("path").getMember("normpath").getACall()',
                sanitizes=["path_traversal"],
            ),
            TaintSanitizerConfig(
                name="os_path_abspath",
                description="Absolute path resolution via os.path.abspath",
                pattern='API::moduleImport("os").getMember("path").getMember("abspath").getACall()',
                sanitizes=["path_traversal"],
            ),
            TaintSanitizerConfig(
                name="pathlib_resolve",
                description="Path resolution via pathlib.Path.resolve()",
                pattern='API::moduleImport("pathlib").getMember("Path").getReturn().getMember("resolve").getACall()',
                sanitizes=["path_traversal"],
            ),
        ],

        # Analysis options
        max_path_length=10,
        include_implicit_flows=False,
        confidence_threshold="medium",
        exclude_files=[],
        exclude_functions=[],
        include_safe_flows=False,
        group_by_vulnerability=True,
    )
Loading