Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions sjsonnet/src-js/sjsonnet/stdlib/PlatformBase64.scala
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package sjsonnet.stdlib

import java.nio.charset.StandardCharsets

/**
* Scala.js implementation of base64 encode/decode. Delegates to java.util.Base64 (provided by
* Scala.js stdlib emulation).
Expand All @@ -9,6 +11,12 @@ object PlatformBase64 {
/** Base64-encodes `input` with the standard (padded) `java.util.Base64` encoder. */
def encodeToString(input: Array[Byte]): String = {
  val encoder = java.util.Base64.getEncoder
  encoder.encodeToString(input)
}

/**
 * Base64-encodes the bytes of `input`. Same contract as the JVM
 * `PlatformBase64.encodeStringToString`: the string is converted with ISO-8859-1 when `asciiSafe`
 * is set and with UTF-8 otherwise, then handed to the standard `java.util.Base64` encoder.
 */
def encodeStringToString(input: String, asciiSafe: Boolean): String = {
  val bytes =
    if (asciiSafe) input.getBytes(StandardCharsets.ISO_8859_1)
    else input.getBytes(StandardCharsets.UTF_8)
  java.util.Base64.getEncoder.encodeToString(bytes)
}

def decode(input: String): Array[Byte] = {
Base64Validation.requireStrictPadding(input)
java.util.Base64.getDecoder.decode(input)
Expand Down
16 changes: 16 additions & 0 deletions sjsonnet/src-jvm/sjsonnet/stdlib/PlatformBase64.scala
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package sjsonnet.stdlib

import java.nio.charset.StandardCharsets

/**
* JVM implementation of base64 encode/decode. Delegates to java.util.Base64 which has HotSpot
* intrinsics for high performance.
Expand All @@ -9,6 +11,20 @@ object PlatformBase64 {
/** Base64-encodes `input` with the standard (padded) `java.util.Base64` encoder. */
def encodeToString(input: Array[Byte]): String = {
  val encoder = java.util.Base64.getEncoder
  encoder.encodeToString(input)
}

/**
 * Base64-encode a `String` without the caller having to produce the byte array first.
 *
 * `asciiSafe` is a hot-path hint supplied by the caller: `true` means every char of `input` has
 * already been proven to be 7-bit ASCII. For such input ISO-8859-1 and UTF-8 yield byte-identical
 * output, so the cheaper ISO-8859-1 conversion is used (it avoids the UTF-8 encoder's scan of the
 * input; with compact strings on Java 9+ it is essentially an array copy). When `asciiSafe` is
 * `false` the string is converted with UTF-8.
 *
 * The intermediate `Array[Byte]` cannot be avoided here because the platform Base64 encoder
 * consumes a byte array; the larger saving is on the Scala Native implementation.
 *
 * @param input
 *   the string to encode
 * @param asciiSafe
 *   caller's guarantee that `input` is pure ASCII; selects the charset used for conversion
 * @return
 *   the padded base64 text of the converted bytes
 */
def encodeStringToString(input: String, asciiSafe: Boolean): String = {
  val raw = input.getBytes(if (asciiSafe) StandardCharsets.ISO_8859_1 else StandardCharsets.UTF_8)
  java.util.Base64.getEncoder.encodeToString(raw)
}

def decode(input: String): Array[Byte] = {
Base64Validation.requireStrictPadding(input)
java.util.Base64.getDecoder.decode(input)
Expand Down
43 changes: 43 additions & 0 deletions sjsonnet/src-native/sjsonnet/stdlib/PlatformBase64.scala
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,49 @@ object PlatformBase64 {
}
}

/**
 * Base64-encode a `String` without first materialising a heap `Array[Byte]` for the input.
 *
 * On Scala Native, `String.getBytes(UTF_8)` on ASCII-only input still walks every char looking
 * for non-ASCII codepoints and then allocates an equally long `Array[Byte]`, i.e. two passes over
 * the data before the encoder runs.
 *
 * `asciiSafe` is a hot-path contract. When `true`, the caller (e.g. `std.base64` applied to a
 * [[sjsonnet.Val.AsciiSafeStr]]) has already established that every char lies in 0x20-0x7F and is
 * neither a quote nor a backslash; the chars are then narrowed one by one straight into a
 * zone-allocated buffer, bypassing the UTF-8 codec and the heap array. When `false`, the
 * `getBytes(UTF_8)` path is used so non-ASCII strings stay correct.
 *
 * @param input
 *   the string to encode; an empty string yields an empty result
 * @param asciiSafe
 *   caller's guarantee that `input` satisfies the AsciiSafeStr contract described above
 * @return
 *   the base64 text produced by the encoder
 * @throws IllegalArgumentException
 *   if the encoded length would not fit in an `Int`
 */
def encodeStringToString(input: String, asciiSafe: Boolean): String = {
  if (input.isEmpty) ""
  else if (!asciiSafe) encodeToString(input.getBytes(java.nio.charset.StandardCharsets.UTF_8))
  else {
    val charCount = input.length
    // Computed in Long so the 4/3 expansion cannot wrap before the range check.
    val encodedLen = ((charCount.toLong + 2) / 3) * 4
    if (encodedLen > Int.MaxValue)
      throw new IllegalArgumentException("Input too large for base64 encoding")
    val outCapacity = encodedLen.toInt
    Zone.acquire { implicit z =>
      val src = alloc[Byte](charCount.toUSize)
      // Per the AsciiSafeStr contract (Parser.constructString + CharSWAR.isAsciiJsonSafe) every
      // char is within 0x20..0x7F, so its high byte is zero and `.toByte` loses nothing.
      var idx = 0
      while (idx < charCount) {
        !(src + idx.toUSize) = input.charAt(idx).toByte
        idx += 1
      }
      val dst = alloc[Byte]((outCapacity + 1).toUSize)
      val writtenPtr = alloc[CSize](1.toUSize)
      libbase64.base64_encode(src, charCount.toUSize, dst, writtenPtr, 0)
      // Copy out of the zone before it is released; only the bytes the encoder reported.
      val written = (!writtenPtr).toInt
      val encoded = new Array[Byte](written)
      memcpy(encoded.at(0), dst, written.toUSize)
      new String(encoded, "US-ASCII")
    }
  }
}

def decode(input: String): Array[Byte] = {
if (input.isEmpty) return Array.emptyByteArray
val srcBytes = input.getBytes("US-ASCII")
Expand Down
6 changes: 3 additions & 3 deletions sjsonnet/src/sjsonnet/Val.scala
Original file line number Diff line number Diff line change
Expand Up @@ -405,9 +405,9 @@ object Val {
}

/**
* String known to contain only printable ASCII (0x20-0x7E) with no characters requiring JSON
* escaping (no `"`, `\`, or control chars). [[ByteRenderer]] checks for this subclass to skip
* SWAR escape scanning and UTF-8 encoding, writing bytes directly.
* String known to contain only single-byte ASCII chars in the JSON-safe range 0x20-0x7F,
* excluding characters requiring JSON escaping (`"` and `\`). [[ByteRenderer]] checks for this
* subclass to skip SWAR escape scanning and UTF-8 encoding, writing bytes directly.
*
* Marker subclass instead of a boolean field saves 8 bytes per instance (boolean + alignment
* padding) — significant for string-heavy workloads where Val.Str instances number in millions.
Expand Down
12 changes: 10 additions & 2 deletions sjsonnet/src/sjsonnet/stdlib/EncodingModule.scala
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,16 @@ object EncodingModule extends AbstractFunctionModule {
*/
builtin("base64", "input") { (pos, _, input: Val) =>
(input match {
case Val.Str(_, value) =>
Val.Str.asciiSafe(pos, PlatformBase64.encodeToString(value.getBytes(UTF_8)))
case s: Val.Str =>
// For [[Val.AsciiSafeStr]] inputs every char fits in 0x20-0x7F, excluding quote and
// backslash (see Parser.constructString + CharSWAR.isAsciiJsonSafe), so the byte representation
// under ISO-8859-1, US-ASCII and UTF-8 is identical. Skipping `getBytes(UTF_8)` lets
// the Native fast path build the encoder input directly with a single char-to-byte
// loop into the zone-allocated buffer (avoiding both the UTF-8 pre-count scan and the
// intermediate heap Array[Byte]); on the JVM/JS side we still allocate the byte array
// but skip the encoder's pre-count branch. See docs/perf-gap-vs-jrsonnet.md.
val asciiSafe = s.isInstanceOf[Val.AsciiSafeStr]
Val.Str.asciiSafe(pos, PlatformBase64.encodeStringToString(s.str, asciiSafe))
case ba: Val.ByteArr =>
Val.Str.asciiSafe(pos, PlatformBase64.encodeToString(ba.rawBytes))
case arr: Val.Arr =>
Expand Down
43 changes: 43 additions & 0 deletions sjsonnet/test/src/sjsonnet/Base64Tests.scala
Original file line number Diff line number Diff line change
Expand Up @@ -546,5 +546,48 @@ object Base64Tests extends TestSuite {
assert(err.contains("Invalid base64"))
}
}

// ================================================================
// AsciiSafeStr fast path — large ASCII string literals are tagged
// by the parser as Val.AsciiSafeStr (see Parser.constructString),
// which triggers the platform fast path that skips getBytes(UTF_8).
// These tests pin the behaviour: the fast path must produce bytes
// identical to the slow path for any ASCII input, and large unicode
// inputs must continue to take the slow path correctly.
// ================================================================
test("asciiSafeFastPath") {
  test("largeAsciiLiteralRoundtrips") {
    // Roughly 2 KB of ASCII: comfortably over the parser's 1024-byte threshold, so the literal
    // is tagged AsciiSafeStr at parse time. Encoding then decoding must return the input.
    val literal = "Lorem ipsum dolor sit amet. " * 80
    assert(literal.length > 1024)
    val program = s"""local s = "$literal"; std.base64Decode(std.base64(s)) == s"""
    val outcome = eval(program)
    assert(outcome == ujson.True)
  }
  test("largeAsciiMatchesByteArrayPath") {
    // The string route and the byte-array route must agree, which shows the ASCII fast path
    // emits exactly the same bytes.
    val literal = "abcdef0123!@#$%^&*()" * 60
    assert(literal.length > 1024)
    val program =
      s"""local s = "$literal";
         |std.base64(s) == std.base64(std.encodeUTF8(s))
         |""".stripMargin
    val outcome = eval(program)
    assert(outcome == ujson.True)
  }
  test("largeUnicodeStillCorrect") {
    // A 1500-char non-ASCII string: it must not be treated as AsciiSafeStr (that tag is only
    // for pure-ASCII literals), and the result must match the byte-array route.
    val literal = "日本語テスト" * 250
    val program =
      s"""local s = "$literal";
         |std.base64(s) == std.base64(std.encodeUTF8(s))
         |""".stripMargin
    val outcome = eval(program)
    assert(outcome == ujson.True)
  }
}
}
}
Loading