Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions cmd/mcpproxy/security_cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -2072,6 +2072,25 @@ func printFindingsList(findings []interface{}) {
}
}

// Deterministic-scanner transparency (Spec 076 US4): the combined
// confidence and the independent checks that contributed to this
// finding, so an operator can see WHY a tool was flagged and that
// agreement among checks raised its score.
if conf, ok := finding["confidence"].(float64); ok && conf > 0 {
fmt.Printf(" Confidence: %.2f\n", conf)
}
if rawSignals, ok := finding["signals"].([]interface{}); ok && len(rawSignals) > 0 {
names := make([]string, 0, len(rawSignals))
for _, s := range rawSignals {
if name, ok := s.(string); ok && name != "" {
names = append(names, name)
}
}
if len(names) > 0 {
fmt.Println(" Signals: " + strings.Join(names, ", "))
}
}

// Package info
if pkg != "" {
pkgLine := " Package: " + pkg
Expand Down
57 changes: 57 additions & 0 deletions cmd/mcpproxy/security_cmd_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,63 @@ func TestPrintFindingsListSkipsEmptyFields(t *testing.T) {
}
}

// TestPrintFindingsListRendersConfidenceAndSignals verifies Spec-076 US4
// (FR-010 / SC-007): the CLI surfaces each finding's combined confidence and
// the deterministic checks that contributed to it, so an operator can see WHY a
// tool was flagged.
func TestPrintFindingsListRendersConfidenceAndSignals(t *testing.T) {
out := captureStdout(t, func() {
printFindingsList([]interface{}{
map[string]interface{}{
"severity": "high",
"rule_id": "detect.unicode.hidden",
"title": "Hidden Unicode in tool description",
"location": "srv:exfiltrate",
"scanner": "tpa-descriptions",
"threat_level": "dangerous",
"threat_type": "tool_poisoning",
"confidence": 0.92,
"signals": []interface{}{"unicode.hidden", "directive.imperative"},
},
})
})

wants := []string{
"Confidence:",
"0.92",
"Signals:",
"unicode.hidden",
"directive.imperative",
}
for _, w := range wants {
if !strings.Contains(out, w) {
t.Errorf("output missing %q\n--- output ---\n%s", w, out)
}
}
}

// TestPrintFindingsListSkipsConfidenceWhenAbsent ensures plain CVE-style
// findings (no deterministic-scanner data) do not grow a noisy empty
// "Confidence:"/"Signals:" row.
func TestPrintFindingsListSkipsConfidenceWhenAbsent(t *testing.T) {
out := captureStdout(t, func() {
printFindingsList([]interface{}{
map[string]interface{}{
"severity": "high",
"rule_id": "CVE-2024-0001",
"title": "CVE-2024-0001",
"location": "node_modules/foo/index.js",
},
})
})
if strings.Contains(out, "Confidence:") {
t.Errorf("expected no 'Confidence:' row when confidence is absent; got:\n%s", out)
}
if strings.Contains(out, "Signals:") {
t.Errorf("expected no 'Signals:' row when signals are absent; got:\n%s", out)
}
}

// TestPrintScanContextSection verifies that the scan-context block renders
// the source method, container, and file/tool counts. This is the signal
// users need to triage a finding (e.g. is "tools.json:85" from a real Docker
Expand Down
8 changes: 7 additions & 1 deletion docs/cli/security-commands.md
Original file line number Diff line number Diff line change
Expand Up @@ -498,7 +498,13 @@ The `Scanners: X run, Y failed (names) of Z` line surfaces per-scanner failures

- `risk_score` — composite 0-100 score
- `summary` — severity counts (`critical`, `high`, `medium`, `low`, `info`, `dangerous`, `warnings`, `info_level`, `total`)
- `findings` — normalized findings across all scanners
- `findings` — normalized findings across all scanners. Findings from the
deterministic tool-scanner (Spec 076) additionally carry `confidence` (0.0–1.0
combined confidence) and `signals` (the independent check IDs that fired, e.g.
`unicode.hidden`, `directive.imperative`). When several independent checks
agree on one tool, that agreement **adds** to the composite `risk_score`
rather than being collapsed — the table report renders these as `Confidence:`
and `Signals:` lines under the finding.
- `reports` — per-scanner raw results (also includes SARIF when `?include_sarif=true` is passed to the REST endpoint)
- `scanner_statuses` — per-scanner execution records, each with `scanner_id`, `status`, `started_at`, `completed_at`, `duration_ms` (wall-clock execution time in milliseconds), `findings_count`, and `error`
- `scan_context` — source method, source path, scanned file list
Expand Down
49 changes: 37 additions & 12 deletions internal/security/scanner/sarif.go
Original file line number Diff line number Diff line change
Expand Up @@ -253,12 +253,20 @@ func categorizeFromRule(ruleID string, props map[string]any, rules map[string]SA
//
// This uses logarithmic diminishing returns so duplicate findings from multiple
// scanners don't inflate the score, while still reflecting cumulative risk.
// Findings are deduplicated by (rule_id + location) before scoring.
// Identical findings reported by several scanners (same rule_id + location) are
// deduplicated before scoring.
//
// Formula per category: category_score = weight * log2(1 + unique_count)
// - Dangerous: weight 25 (1 finding=25, 2=40, 4=58, 8=72, cap 80)
// - Warning: weight 6 (1 finding=6, 2=10, 4=15, 8=18, cap 25)
// - Info: weight 2 (1 finding=2, 2=3, 4=5, 8=6, cap 10)
// Spec 076 (FR-006, SC-007) — consensus is additive: the deterministic scanner
// emits ONE finding per tool whose Signals list every independent check that
// fired. A finding contributes its consensus weight (the count of distinct
// contributing signals, min 1) to its category, so a tool flagged by several
// checks raises the score instead of being collapsed to one. Findings from
// scanners that emit no per-signal data weigh 1, so legacy scoring is unchanged.
//
// Formula per category: category_score = weight * log2(1 + weighted_count)
// - Dangerous: weight 25 (1=25, 2=40, 4=58, 8=72, cap 80)
// - Warning: weight 6 (1=6, 2=10, 4=15, 8=18, cap 25)
// - Info: weight 2 (1=2, 2=3, 4=5, 8=6, cap 10)
//
// Note: This score is an experimental heuristic. There is no industry standard
// for aggregating multi-scanner MCP security findings into a single number.
Expand All @@ -282,24 +290,28 @@ func CalculateRiskScore(findings []ScanFinding) int {
seen[key] = true
}

// Consensus weight: independent signals on one tool ADD to the score
// (FR-006). A single-signal or signal-less finding weighs 1.
weight := consensusWeight(f)

switch f.ThreatLevel {
case ThreatLevelDangerous:
dangerousCount++
dangerousCount += weight
case ThreatLevelWarning:
warningCount++
warningCount += weight
case ThreatLevelInfo:
infoCount++
infoCount += weight
default:
// Unclassified: use severity as fallback
switch f.Severity {
case SeverityCritical:
dangerousCount++
dangerousCount += weight
case SeverityHigh:
warningCount++
warningCount += weight
case SeverityMedium:
warningCount++
warningCount += weight
case SeverityLow:
infoCount++
infoCount += weight
}
}
}
Expand Down Expand Up @@ -327,6 +339,19 @@ func CalculateRiskScore(findings []ScanFinding) int {
return score
}

// consensusWeight returns how much a single (deduplicated) finding contributes
// to its risk category. The deterministic scanner (Spec 076) aggregates every
// independent check that fired on a tool into one finding's Signals list, so a
// finding flagged by N distinct checks weighs N — agreement raises the score
// rather than being collapsed (FR-006). Findings with zero or one signal — and
// every legacy scanner finding, which carries none — weigh 1.
func consensusWeight(f ScanFinding) int {
if n := len(f.Signals); n > 1 {
return n
}
return 1
}

// SummarizeFindings produces a ReportSummary from findings
func SummarizeFindings(findings []ScanFinding) ReportSummary {
summary := ReportSummary{Total: len(findings)}
Expand Down
50 changes: 50 additions & 0 deletions internal/security/scanner/sarif_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,56 @@ func TestCalculateRiskScore(t *testing.T) {
}
}

// TestCalculateRiskScoreConsensusRaisesScore proves Spec-076 FR-006 / SC-007:
// independent signals on one tool ADD to the risk score instead of being
// collapsed, so a tool flagged by several checks scores higher than one flagged
// by a single check (consensus is visible). The deterministic scanner emits ONE
// ScanFinding per tool carrying every contributing check in Signals; the score
// must weigh that finding by its signal count.
func TestCalculateRiskScoreConsensusRaisesScore(t *testing.T) {
single := []ScanFinding{{
RuleID: "detect.unicode.hidden",
Location: "srv:tool_a",
ThreatLevel: ThreatLevelWarning,
Signals: []string{"unicode.hidden"},
}}
consensus := []ScanFinding{{
RuleID: "detect.unicode.hidden",
Location: "srv:tool_a",
ThreatLevel: ThreatLevelWarning,
Signals: []string{"unicode.hidden", "directive.imperative", "capability.mismatch"},
}}

singleScore := CalculateRiskScore(single)
consensusScore := CalculateRiskScore(consensus)

// single: 1 signal → warning count 1 → 6*log2(2)=6
if singleScore != 6 {
t.Errorf("single-signal score = %d, want 6", singleScore)
}
// consensus: 3 signals → warning count 3 → 6*log2(4)=12
if consensusScore != 12 {
t.Errorf("consensus score = %d, want 12", consensusScore)
}
if consensusScore <= singleScore {
t.Errorf("consensus score %d must exceed single-signal score %d", consensusScore, singleScore)
}
}

// TestCalculateRiskScoreCrossScannerDedupRetained guards that the consensus
// change does NOT break the legitimate cross-scanner de-duplication: the same
// finding (rule_id+location) reported by multiple scanners with no per-signal
// data still counts once. Only independent signals WITHIN a finding add.
func TestCalculateRiskScoreCrossScannerDedupRetained(t *testing.T) {
findings := []ScanFinding{
{RuleID: "MCP-TP-003", Location: "tool:add_numbers", ThreatLevel: ThreatLevelDangerous, Scanner: "scanner-a"},
{RuleID: "MCP-TP-003", Location: "tool:add_numbers", ThreatLevel: ThreatLevelDangerous, Scanner: "scanner-b"},
}
if got := CalculateRiskScore(findings); got != 25 {
t.Errorf("cross-scanner duplicate score = %d, want 25 (deduped to one dangerous)", got)
}
}

func TestSummarizeFindings(t *testing.T) {
findings := []ScanFinding{
{Severity: SeverityCritical},
Expand Down
30 changes: 30 additions & 0 deletions internal/security/scanner/types_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,36 @@ func TestScanFindingConfidenceSignalsRoundTrip(t *testing.T) {
}
}

// TestAggregatedReportSerializesConfidenceSignals confirms the Spec-076 US4
// transparency fields survive serialization at the REST scan-report level (the
// payload returned by GET /api/v1/.../scan-report), not just on a bare
// ScanFinding — operators consume confidence/signals through the report API.
func TestAggregatedReportSerializesConfidenceSignals(t *testing.T) {
report := AggregatedReport{
JobID: "job-1",
ServerName: "srv",
RiskScore: 42,
Findings: []ScanFinding{{
RuleID: "detect.unicode.hidden",
Severity: SeverityHigh,
ThreatLevel: ThreatLevelDangerous,
Location: "srv:exfiltrate",
Confidence: 0.92,
Signals: []string{"unicode.hidden", "directive.imperative"},
}},
}
data, err := json.Marshal(report)
if err != nil {
t.Fatalf("marshal: %v", err)
}
if !strings.Contains(string(data), `"confidence":0.92`) {
t.Errorf("report finding confidence not serialized: %s", data)
}
if !strings.Contains(string(data), `"signals":["unicode.hidden","directive.imperative"]`) {
t.Errorf("report finding signals not serialized: %s", data)
}
}

// TestScanFindingOmitEmpty ensures the new fields are omitempty so existing
// consumers and stored findings are byte-unaffected when they are unset.
func TestScanFindingOmitEmpty(t *testing.T) {
Expand Down
4 changes: 2 additions & 2 deletions specs/076-deterministic-tool-scanner/tasks.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,8 @@ Single Go module. New package `internal/security/detect/` (engine + `checks/`);

**Independent test**: a multi-signal tool yields a finding listing each check, carrying confidence, with severity rising by signal count.

- [ ] T020 [US4] Update the risk-score aggregation in `internal/security/scanner/` (types.go / sarif.go scoring) so independent signals on a tool ADD to the score rather than dedup by `(rule_id+location)`; write a scoring test proving consensus raises the score (FR-006, SC-007).
- [ ] T021 [P] [US4] Surface `confidence` + `signals` in the CLI report (`cmd/mcpproxy/security_cmd.go` printReportTable) and confirm they serialize in the REST scan report; add/update the report-rendering test.
- [x] T020 [US4] Update the risk-score aggregation in `internal/security/scanner/` (types.go / sarif.go scoring) so independent signals on a tool ADD to the score rather than dedup by `(rule_id+location)`; write a scoring test proving consensus raises the score (FR-006, SC-007).
- [x] T021 [P] [US4] Surface `confidence` + `signals` in the CLI report (`cmd/mcpproxy/security_cmd.go` printReportTable) and confirm they serialize in the REST scan report; add/update the report-rendering test.

**Checkpoint**: operator can see why a tool was flagged and how strongly.

Expand Down
Loading