databricks · He-Pin · May 23, 2026 · May 23, 2026
diff --git a/sjsonnet/src-js/sjsonnet/stdlib/PlatformBase64.scala b/sjsonnet/src-js/sjsonnet/stdlib/PlatformBase64.scala
@@ -1,5 +1,7 @@
 package sjsonnet.stdlib
 
+import java.nio.charset.StandardCharsets
+
 /**
  * Scala.js implementation of base64 encode/decode. Delegates to java.util.Base64 (provided by
  * Scala.js stdlib emulation).
@@ -9,6 +11,12 @@ object PlatformBase64 {
   def encodeToString(input: Array[Byte]): String =
     java.util.Base64.getEncoder.encodeToString(input)
 
+  /** Same contract as the JVM `PlatformBase64.encodeStringToString`. */
+  def encodeStringToString(input: String, asciiSafe: Boolean): String =
+    java.util.Base64.getEncoder.encodeToString(
+      input.getBytes(if (asciiSafe) StandardCharsets.ISO_8859_1 else StandardCharsets.UTF_8)
+    )
+
   def decode(input: String): Array[Byte] = {
     Base64Validation.requireStrictPadding(input)
     java.util.Base64.getDecoder.decode(input)

diff --git a/sjsonnet/src-jvm/sjsonnet/stdlib/PlatformBase64.scala b/sjsonnet/src-jvm/sjsonnet/stdlib/PlatformBase64.scala
@@ -1,5 +1,7 @@
 package sjsonnet.stdlib
 
+import java.nio.charset.StandardCharsets
+
 /**
  * JVM implementation of base64 encode/decode. Delegates to java.util.Base64 which has HotSpot
  * intrinsics for high performance.
@@ -9,6 +11,20 @@ object PlatformBase64 {
   def encodeToString(input: Array[Byte]): String =
     java.util.Base64.getEncoder.encodeToString(input)
 
+  /**
+   * Base64-encodes `input` straight from a `String`. `asciiSafe` is a hot-path hint from the
+   * caller: `true` promises every char is within 0x00–0x7F, where ISO-8859-1 and UTF-8 yield the
+   * same bytes, so ISO-8859-1 is used to skip the UTF-8 encoder's pre-count scan (on Java 9+ a
+   * LATIN1-tagged compact string makes that essentially an array copy). `false` selects UTF-8. The
+   * byte array itself is unavoidable: the platform Base64 encoder only takes `Array[Byte]`.
+   */
+  def encodeStringToString(input: String, asciiSafe: Boolean): String = {
+    val bytes =
+      if (asciiSafe) input.getBytes(StandardCharsets.ISO_8859_1)
+      else input.getBytes(StandardCharsets.UTF_8)
+    java.util.Base64.getEncoder.encodeToString(bytes)
+  }
+
   def decode(input: String): Array[Byte] = {
     Base64Validation.requireStrictPadding(input)
     java.util.Base64.getDecoder.decode(input)

diff --git a/sjsonnet/src-native/sjsonnet/stdlib/PlatformBase64.scala b/sjsonnet/src-native/sjsonnet/stdlib/PlatformBase64.scala
@@ -102,6 +102,49 @@ object PlatformBase64 {
     }
   }
 
+  /**
+   * Encode a `String` directly to base64 without materialising an intermediate `Array[Byte]` for
+   * the input side. On Scala Native, `String.getBytes(UTF_8)` for an ASCII-only input still has to
+   * walk every char checking for non-ASCII codepoints and then allocate an `Array[Byte]` of equal
+   * length — two full passes over the data before the SIMD encoder even sees it.
+   *
+   * The `asciiSafe` flag is a hot-path contract: when `true` the caller (e.g. `std.base64` for a
+   * [[sjsonnet.Val.AsciiSafeStr]] input) has already proven every char is in 0x20-0x7F, excluding
+   * quote and backslash, so chars are narrowed with `.toByte` straight into the zone-allocated
+   * source buffer, skipping both the UTF-8 codec and the heap `Array[Byte]`. The flag is trusted,
+   * not re-checked. When `false`, we keep the `getBytes(UTF_8)` slow path for correctness on
+   * non-ASCII strings. Returns `""` for empty input; throws `IllegalArgumentException` when the
+   * encoded length would exceed `Int.MaxValue`.
+   */
+  def encodeStringToString(input: String, asciiSafe: Boolean): String = {
+    if (input.isEmpty) return ""
+    if (!asciiSafe) return encodeToString(input.getBytes(java.nio.charset.StandardCharsets.UTF_8))
+
+    val len = input.length
+    val maxOutLen = ((len.toLong + 2) / 3) * 4
+    if (maxOutLen > Int.MaxValue)
+      throw new IllegalArgumentException("Input too large for base64 encoding")
+    val outSize = maxOutLen.toInt
+    Zone.acquire { implicit z =>
+      val srcPtr = alloc[Byte](len.toUSize)
+      // Narrow ASCII chars directly into the zone buffer. The AsciiSafeStr contract guarantees
+      // every char fits in 0x20..0x7F (minus quote/backslash) per Parser.constructString +
+      // CharSWAR.isAsciiJsonSafe, so the high byte of each Char is zero and `.toByte` is lossless.
+      var i = 0
+      while (i < len) {
+        !(srcPtr + i.toUSize) = input.charAt(i).toByte
+        i += 1
+      }
+      val outPtr = alloc[Byte]((outSize + 1).toUSize)
+      val outLenPtr = alloc[CSize](1.toUSize)
+      libbase64.base64_encode(srcPtr, len.toUSize, outPtr, outLenPtr, 0)
+      val actualLen = (!outLenPtr).toInt
+      val result = new Array[Byte](actualLen)
+      memcpy(result.at(0), outPtr, actualLen.toUSize)
+      new String(result, "US-ASCII")
+    }
+  }
+
   def decode(input: String): Array[Byte] = {
     if (input.isEmpty) return Array.emptyByteArray
     val srcBytes = input.getBytes("US-ASCII")

diff --git a/sjsonnet/src/sjsonnet/Val.scala b/sjsonnet/src/sjsonnet/Val.scala
@@ -405,9 +405,9 @@ object Val {
   }
 
   /**
-   * String known to contain only printable ASCII (0x20-0x7E) with no characters requiring JSON
-   * escaping (no `"`, `\`, or control chars). [[ByteRenderer]] checks for this subclass to skip
-   * SWAR escape scanning and UTF-8 encoding, writing bytes directly.
+   * String known to contain only single-byte ASCII chars in the JSON-safe range 0x20-0x7F,
+   * excluding characters requiring JSON escaping (`"` and `\`). [[ByteRenderer]] checks for this
+   * subclass to skip SWAR escape scanning and UTF-8 encoding, writing bytes directly.
    *
    * Marker subclass instead of a boolean field saves 8 bytes per instance (boolean + alignment
    * padding) — significant for string-heavy workloads where Val.Str instances number in millions.

diff --git a/sjsonnet/src/sjsonnet/stdlib/EncodingModule.scala b/sjsonnet/src/sjsonnet/stdlib/EncodingModule.scala
@@ -58,8 +58,16 @@ object EncodingModule extends AbstractFunctionModule {
      */
     builtin("base64", "input") { (pos, _, input: Val) =>
       (input match {
-        case Val.Str(_, value) =>
-          Val.Str.asciiSafe(pos, PlatformBase64.encodeToString(value.getBytes(UTF_8)))
+        case s: Val.Str =>
+          // For [[Val.AsciiSafeStr]] inputs every char fits in 0x20-0x7F, excluding quote and
+          // backslash (see Parser.constructString + CharSWAR.isAsciiJsonSafe), so the byte representation
+          // under ISO-8859-1, US-ASCII and UTF-8 is identical. Skipping `getBytes(UTF_8)` lets
+          // the Native fast path build the encoder input directly with a single char-to-byte
+          // loop into the zone-allocated buffer (avoiding both the UTF-8 pre-count scan and the
+          // intermediate heap Array[Byte]); on the JVM/JS side we still allocate the byte array
+          // but skip the encoder's pre-count branch. See docs/perf-gap-vs-jrsonnet.md.
+          val asciiSafe = s.isInstanceOf[Val.AsciiSafeStr]
+          Val.Str.asciiSafe(pos, PlatformBase64.encodeStringToString(s.str, asciiSafe))
         case ba: Val.ByteArr =>
           Val.Str.asciiSafe(pos, PlatformBase64.encodeToString(ba.rawBytes))
         case arr: Val.Arr =>

diff --git a/sjsonnet/test/src/sjsonnet/Base64Tests.scala b/sjsonnet/test/src/sjsonnet/Base64Tests.scala
@@ -546,5 +546,48 @@ object Base64Tests extends TestSuite {
         assert(err.contains("Invalid base64"))
       }
     }
+
+    // ================================================================
+    // AsciiSafeStr fast path — large ASCII string literals are tagged
+    // by the parser as Val.AsciiSafeStr (see Parser.constructString),
+    // which triggers the platform fast path that skips getBytes(UTF_8).
+    // These tests pin the behaviour: the fast path must produce bytes
+    // identical to the slow path for any ASCII input, and large unicode
+    // inputs must continue to take the slow path correctly.
+    // ================================================================
+    test("asciiSafeFastPath") {
+      test("largeAsciiLiteralRoundtrips") {
+        // 2 KB ASCII literal — well past the 1024-byte threshold in the
+        // parser, so it gets tagged as AsciiSafeStr at parse time.
+        val src = "Lorem ipsum dolor sit amet. " * 80
+        assert(src.length > 1024)
+        val r = eval(s"""local s = "$src"; std.base64Decode(std.base64(s)) == s""")
+        assert(r == ujson.True)
+      }
+      test("largeAsciiMatchesByteArrayPath") {
+        // Encoding the string and encoding its byte array must give the
+        // same result — proves the ASCII fast path is byte-identical.
+        val src = "abcdef0123!@#$%^&*()" * 60
+        assert(src.length > 1024)
+        val r = eval(
+          s"""local s = "$src";
+             |std.base64(s) == std.base64(std.encodeUTF8(s))
+             |""".stripMargin
+        )
+        assert(r == ujson.True)
+      }
+      test("largeUnicodeStillCorrect") {
+        // 1500-char non-ASCII literal — never tagged as AsciiSafeStr, so it must take the
+        // UTF-8 slow path and still match the result of encoding its byte array.
+        val src = "日本語テスト" * 250
+        assert(src.length > 1024)
+        val r = eval(
+          s"""local s = "$src";
+             |std.base64(s) == std.base64(std.encodeUTF8(s))
+             |""".stripMargin
+        )
+        assert(r == ujson.True)
+      }
+    }
   }
 }