Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 75 additions & 1 deletion pkg/sanitize/sanitize.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,12 @@ var policy *bluemonday.Policy
var policyOnce sync.Once

func Sanitize(input string) string {
return FilterHTMLTags(FilterCodeFenceMetadata(FilterInvisibleCharacters(input)))
s := FilterInvisibleCharacters(input)
s = FilterCodeFenceMetadata(s)
s = protectCodeAngles(s)
s = FilterHTMLTags(s)
s = restoreCodeAngles(s)
return s
}

// FilterInvisibleCharacters removes invisible or control characters that should not appear
Expand Down Expand Up @@ -207,3 +212,72 @@ func shouldRemoveRune(r rune) bool {

return false
}

// Placeholders used to shield angle brackets inside code regions from
// the HTML sanitizer. They must not look like HTML tags themselves and
// must be unlikely to appear in real content, which is why each one is
// delimited by NUL bytes.
const (
// codeLtPlaceholder temporarily stands in for "<" inside fenced code.
codeLtPlaceholder = "\x00CODELT\x00"
// codeGtPlaceholder temporarily stands in for ">" inside fenced code.
codeGtPlaceholder = "\x00CODEGT\x00"
)

// protectCodeAngles replaces < and > with unique placeholders inside
// fenced code blocks so that bluemonday does not strip them as HTML tags.
// This must run after FilterCodeFenceMetadata (which cleans fence info
// strings) and before FilterHTMLTags.
func protectCodeAngles(input string) string {
if input == "" {
return input
}

lines := strings.Split(input, "\n")
insideFence := false
currentFenceLen := 0

for i, line := range lines {
fenceIdx := strings.Index(line, "```")

if fenceIdx != -1 && !hasNonWhitespace(line[:fenceIdx]) {
fenceEnd := fenceIdx
for fenceEnd < len(line) && line[fenceEnd] == '`' {
Comment on lines +237 to +242
Copy link

Copilot AI Apr 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

protectCodeAngles re-implements fenced-code tracking instead of reusing sanitizeCodeFenceLine / FilterCodeFenceMetadata logic, and it uses different closing-fence semantics (>= currentFenceLen here vs exact-length matching in sanitizeCodeFenceLine). This divergence can cause the two passes to disagree about whether subsequent lines are inside a fence (e.g., with a longer closing fence), making future behavior brittle. Consider extracting a shared fence parser/toggler so both steps stay consistent (and updating one place if the fence rules change).

Copilot uses AI. Check for mistakes.
fenceEnd++
}
fenceLen := fenceEnd - fenceIdx

if fenceLen >= 3 {
if insideFence {
if currentFenceLen == 0 || fenceLen >= currentFenceLen {
// Valid closing fence (CommonMark: closing fence
// must be at least as long as the opening fence).
insideFence = false
currentFenceLen = 0
continue
}
// Fence length too short — still inside code.
} else {
// Opening fence.
insideFence = true
currentFenceLen = fenceLen
continue
}
}
}

if insideFence {
lines[i] = strings.ReplaceAll(
strings.ReplaceAll(line, "<", codeLtPlaceholder),
">", codeGtPlaceholder,
)
}
}

return strings.Join(lines, "\n")
}

// restoreCodeAngles reverses the placeholder substitution performed by
// protectCodeAngles.
func restoreCodeAngles(input string) string {
s := strings.ReplaceAll(input, codeLtPlaceholder, "<")
s = strings.ReplaceAll(s, codeGtPlaceholder, ">")
return s
Comment on lines +277 to +282
Copy link

Copilot AI Apr 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

restoreCodeAngles blindly replaces the placeholders everywhere in the sanitized output. Since FilterInvisibleCharacters does not remove NUL ("\x00"), a crafted input could include the placeholder sequences (e.g., "\x00CODELT\x00") outside a fenced block and have them converted into literal </> after FilterHTMLTags, potentially reintroducing HTML tags that the sanitizer would otherwise remove. Consider either (a) stripping \x00 in FilterInvisibleCharacters, and/or (b) making restoreCodeAngles only restore placeholders within fenced code blocks (mirroring protectCodeAngles) so user-supplied placeholder text cannot bypass HTML sanitization.

Copilot uses AI. Check for mistakes.
}
148 changes: 148 additions & 0 deletions pkg/sanitize/sanitize_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -300,3 +300,151 @@ func TestSanitizeRemovesInvisibleCodeFenceMetadata(t *testing.T) {
result := Sanitize(input)
assert.Equal(t, expected, result)
}

// TestProtectCodeAngles is a table-driven check of protectCodeAngles: angle
// brackets on lines inside backtick fences must become placeholders, while
// fence lines and text outside fences must come back unchanged.
func TestProtectCodeAngles(t *testing.T) {
	// Short aliases keep the expected strings readable.
	const lt, gt = codeLtPlaceholder, codeGtPlaceholder

	type protectCase struct {
		name  string
		input string
		want  string
	}

	cases := []protectCase{
		{name: "empty string", input: "", want: ""},
		{name: "no code blocks", input: "Hello <b>World</b>", want: "Hello <b>World</b>"},
		{
			name:  "fenced code block with angle brackets",
			input: "```\nvector<int> v;\n```",
			want:  "```\nvector" + lt + "int" + gt + " v;\n```",
		},
		{
			name:  "fenced code block with language tag",
			input: "```cpp\nmap<string, int> m;\n```",
			want:  "```cpp\nmap" + lt + "string, int" + gt + " m;\n```",
		},
		{
			name:  "multiple code blocks",
			input: "text\n```\na<b>c\n```\nmiddle\n```\nd<e>f\n```",
			want:  "text\n```\na" + lt + "b" + gt + "c\n```\nmiddle\n```\nd" + lt + "e" + gt + "f\n```",
		},
		{
			name:  "angle brackets outside code blocks preserved as-is",
			input: "Use <b>bold</b>\n```\ncode<T>\n```\nMore <em>text</em>",
			want:  "Use <b>bold</b>\n```\ncode" + lt + "T" + gt + "\n```\nMore <em>text</em>",
		},
		{
			name:  "four-backtick fence",
			input: "````\nfn foo<T>()\n````",
			want:  "````\nfn foo" + lt + "T" + gt + "()\n````",
		},
		{
			name:  "shorter fence inside code does not close block",
			input: "````\nline<A>\n```\nstill<B>\n````",
			want:  "````\nline" + lt + "A" + gt + "\n```\nstill" + lt + "B" + gt + "\n````",
		},
		{
			name:  "longer closing fence closes the block (CommonMark)",
			input: "```\ncode<T>\n````\noutside<b>text</b>",
			want:  "```\ncode" + lt + "T" + gt + "\n````\noutside<b>text</b>",
		},
		{
			name:  "unclosed fence protects remaining lines",
			input: "```\na<b>c\nmore<d>",
			want:  "```\na" + lt + "b" + gt + "c\nmore" + lt + "d" + gt,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			assert.Equal(t, tc.want, protectCodeAngles(tc.input))
		})
	}
}

// TestRestoreCodeAngles verifies that restoreCodeAngles turns placeholders
// back into literal angle brackets and leaves placeholder-free text alone.
func TestRestoreCodeAngles(t *testing.T) {
	type restoreCase struct {
		name  string
		input string
		want  string
	}

	cases := []restoreCase{
		{name: "empty string", input: "", want: ""},
		{name: "no placeholders", input: "Hello World", want: "Hello World"},
		{
			name:  "restores lt and gt",
			input: "vector" + codeLtPlaceholder + "int" + codeGtPlaceholder,
			want:  "vector<int>",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			assert.Equal(t, tc.want, restoreCodeAngles(tc.input))
		})
	}
}

func TestSanitizePreservesAngleBracketsInCodeBlocks(t *testing.T) {
tests := []struct {
name string
input string
expected string
}{
{
name: "issue 2202: template parameter in code block",
input: "```\nlet ptr: mut_raw_ptr<int> = raw_new int;\n```",
expected: "```\nlet ptr: mut_raw_ptr<int> = raw_new int;\n```",
},
{
name: "C++ template in code block",
input: "```cpp\nstd::vector<std::string> items;\n```",
expected: "```cpp\nstd::vector<std::string> items;\n```",
},
{
name: "HTML-like tags outside code blocks still sanitized",
input: "<script>alert(1)</script>\n```\nvector<int> v;\n```",
expected: "\n```\nvector<int> v;\n```",
},
{
name: "allowed HTML outside code blocks preserved",
input: "<b>bold</b>\n```\nfoo<T>()\n```",
expected: "<b>bold</b>\n```\nfoo<T>()\n```",
},
{
name: "multiple angle brackets in code",
input: "```\nMap<String, List<Integer>> m;\n```",
expected: "```\nMap<String, List<Integer>> m;\n```",
},
{
name: "script tags after code block still sanitized",
input: "```\nvector<int> v;\n```\n<script>alert(1)</script>",
expected: "```\nvector<int> v;\n```\n",
},
{
name: "longer closing fence does not leak protection",
input: "```\ncode<T>\n````\n<script>alert(1)</script>",
expected: "```\ncode<T>\n````\n",
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := Sanitize(tt.input)
assert.Equal(t, tt.expected, result)
})
}
}
Loading