Skip to content

Commit d12190b

Browse files
committed
fix: normalize 'ascii' encoding to 'utf-8' (ASCII is valid UTF-8)
1 parent a10792a commit d12190b

2 files changed

Lines changed: 24 additions & 1 deletion

File tree

src/services/file_service.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,13 +80,17 @@ def _detect_encoding(raw: bytes) -> str:
8080
8181
Module-level so it can be tested without instantiating FileService.
8282
Falls back to utf-8 if chardet is unavailable or confidence is too low.
83+
Normalizes 'ascii' to 'utf-8' because ASCII is a proper subset of UTF-8
84+
and saving as utf-8 is always safe for ASCII content.
8385
"""
8486
try:
8587
import chardet # optional dependency
8688

8789
result = chardet.detect(raw)
8890
if result["encoding"] and result["confidence"] >= _ENCODING_MIN_CONFIDENCE:
89-
return result["encoding"]
91+
detected = result["encoding"].lower()
92+
# ASCII is a valid subset of UTF-8; normalize to avoid confusing the UI.
93+
return "utf-8" if detected == "ascii" else result["encoding"]
9094
except ImportError:
9195
logger.debug("chardet not installed; defaulting to utf-8")
9296
return _ENCODING_FALLBACK

tests/unit/test_file_service.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,25 @@ def test_falls_back_to_utf8_for_undetectable_bytes(self):
9797
assert encoding == "utf-8"
9898

9999

100+
class TestDetectEncodingNormalization:
    """Check that _detect_encoding labels ASCII and UTF-8 input as 'utf-8'."""

    def test_ascii_content_returns_utf8(self):
        """Pure-ASCII input is reported as 'utf-8', never 'ascii'.

        chardet identifies plain ASCII bytes as 'ascii'; the function under
        test normalizes that label to 'utf-8' because ASCII is a proper
        subset of UTF-8, so saving such content as utf-8 is always safe.

        NOTE(review): per its docstring, _detect_encoding also falls back to
        utf-8 when chardet is unavailable, so this assertion only exercises
        the normalization branch when chardet is installed.
        """
        from src.services.file_service import _detect_encoding

        ascii_payload = b"hello world"  # every byte is below 0x80
        encoding = _detect_encoding(ascii_payload)
        assert encoding == "utf-8"

    def test_non_ascii_utf8_content_returns_utf8(self):
        """UTF-8 input containing accented characters is reported as 'utf-8'."""
        from src.services.file_service import _detect_encoding

        accented_text = "café résumé naïve"
        encoding = _detect_encoding(accented_text.encode("utf-8"))
        assert encoding == "utf-8"
117+
118+
100119
class TestAtomicSaveCleanup:
101120
"""Verify temp file is cleaned up when os.replace fails (lines 70-75)."""
102121

0 commit comments

Comments
 (0)