From 720738c84ec8a60d7dbd7e1ea328ac5816e0d2f0 Mon Sep 17 00:00:00 2001 From: bubio Date: Thu, 11 Jun 2026 14:21:51 +0900 Subject: [PATCH] feat: move HSV filter to app-layer GPU shader; fix filter frame pacing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The NP2kai HSV-smooth video filter ran per emulated frame on the CPU. Under the catch-up / No-Wait frame scheduler it was applied to every generated frame (up to 16x), and even a single full-screen pass was ~27ms on a Ryzen 5 2600 — collapsing HSV-on frame rates to single digits on Windows and under No-Wait on macOS. Reimplement the filter as a GPU fragment-shader pass (blit_hsv.fs.{hlsl,metal, glsl}), selected at draw time via ui.display_hsv. The NP2kai core video filter is left disabled and the core source is unmodified, keeping upstream merges clean. HSV-on now holds a solid 60fps. Also: - frame(): only the last emulated frame of each host tick is drawn, so the expensive draw pass runs once per presented frame instead of once per emulated frame (the intermediate frames are never displayed). - FPS readout: average presented frames over a ~0.5s window instead of an instantaneous 1/dt sample, which beat against the ~60Hz draw gate into misleading 3-digit spikes. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- src/main.zig | 87 +++++++++++++++++--------- src/platform/linux.zig | 1 + src/platform/macos.zig | 1 + src/platform/shaders/blit_hsv.fs.glsl | 76 ++++++++++++++++++++++ src/platform/shaders/blit_hsv.fs.hlsl | 75 ++++++++++++++++++++++ src/platform/shaders/blit_hsv.fs.metal | 73 +++++++++++++++++++++ src/platform/windows.zig | 1 + src/ui.zig | 9 +-- 8 files changed, 287 insertions(+), 36 deletions(-) create mode 100644 src/platform/shaders/blit_hsv.fs.glsl create mode 100644 src/platform/shaders/blit_hsv.fs.hlsl create mode 100644 src/platform/shaders/blit_hsv.fs.metal diff --git a/src/main.zig b/src/main.zig index 2655b6f..4ce4c81 100644 --- a/src/main.zig +++ b/src/main.zig @@ -62,6 +62,10 @@ fn fbViewport(win_w: u32, win_h: u32) Viewport { const State = struct { pipeline: sg.Pipeline = .{}, + // Second pipeline whose fragment shader applies the HSV-smooth filter on the + // GPU; selected at draw time when ui.display_hsv is on. Keeping the filter in + // a shader means the NP2kai core needs no video-filter changes. + pipeline_hsv: sg.Pipeline = .{}, bindings: sg.Bindings = .{}, pass_action: sg.PassAction = .{}, image: sg.Image = .{}, @@ -82,7 +86,11 @@ var last_emu_ns: i128 = 0; var skip_counter: u32 = 0; const nowait_frames_per_tick: u32 = 16; var draw_fps: f32 = 0.0; -var last_draw_ns: i128 = 0; +// Windowed FPS: average presented-frame count over a ~0.5s wall-clock window. +// Replaces an instantaneous 1/Δt readout, which swung wildly (into the hundreds) +// whenever the host frame() rate beat against the ~60Hz draw gate. +var fps_draw_count: u32 = 0; +var fps_window_start_ns: i128 = 0; // 音声バッファ(Zig側で変換用に使用) var audio_buffer: [4096 * 2]f32 = undefined; @@ -135,13 +143,11 @@ export fn init() void { // because the reset's diskdrv_hddbind() binds drives from the config. 
configureHdds(expanded_disks); } - // Always load the HSV-smooth profile into np2cfg before pccore_init() reads - // it into the filter manager, so the Screen menu can toggle it live. The - // --video-filter flag only decides whether it starts on. UsaProject never - // reads np2kai's .cfg, so this is the only path that configures it. - const vf_on = if (parsed_opts) |o| o.video_filter else false; - cz.usa_setup_video_filter(if (vf_on) 1 else 0); - ui.display_hsv = vf_on; + // The HSV-smooth filter now lives in the app layer as a GPU fragment-shader + // pass (state.pipeline_hsv), so the NP2kai core filter stays disabled and + // the core source is left unmodified. The --video-filter flag only decides + // whether the shader filter starts on; the Screen menu toggles it live. + ui.display_hsv = if (parsed_opts) |o| o.video_filter else false; cz.pccore_init(); cz.pccore_reset(); if (parsed_opts) |opts| { @@ -195,16 +201,21 @@ export fn init() void { state.bindings.views[0] = state.view; state.bindings.samplers[0] = state.sampler; + const blit_attrs = init: { + var attrs: [16]sg.VertexAttrState = @splat(.{}); + attrs[0].format = .FLOAT3; + attrs[1].format = .FLOAT2; + break :init attrs; + }; state.pipeline = sg.makePipeline(.{ - .shader = makeBlitShader(), - .layout = .{ - .attrs = init: { - var attrs: [16]sg.VertexAttrState = @splat(.{}); - attrs[0].format = .FLOAT3; - attrs[1].format = .FLOAT2; - break :init attrs; - }, - }, + .shader = makeBlitShader(platform.os.shader_fs_source), + .layout = .{ .attrs = blit_attrs }, + .index_type = .UINT16, + }); + // HSV-smooth variant: same geometry/layout, filtering fragment shader. 
+ state.pipeline_hsv = sg.makePipeline(.{ + .shader = makeBlitShader(platform.os.shader_fs_hsv_source), + .layout = .{ .attrs = blit_attrs }, .index_type = .UINT16, }); @@ -321,14 +332,14 @@ fn setupDataDir() void { config.setDataDir(dir); } -fn makeBlitShader() sg.Shader { +fn makeBlitShader(fs_source: [*:0]const u8) sg.Shader { return sg.makeShader(.{ .vertex_func = .{ .source = platform.os.shader_vs_source, }, .fragment_func = .{ .entry = platform.os.shader_entry, - .source = platform.os.shader_fs_source, + .source = fs_source, }, .attrs = init: { var a: [16]sg.ShaderVertexAttr = @splat(.{}); @@ -390,28 +401,44 @@ export fn frame() void { const draw_skip = cz.usa_get_draw_skip(); var i: u32 = 0; while (i < frames) : (i += 1) { - const should_draw = blk: { - if (draw_skip <= 1) break :blk true; + // Only the final emulated frame of this host frame is ever uploaded + // to the GPU (a single updateImage() after the loop), so only it + // needs the expensive scrndraw_draw() + HSV filter pass. Intermediate + // frames just advance the CPU with draw=false; rendering them — and + // especially running the per-pixel HSV-smooth filter on them — is + // pure waste (16x under No-Wait, up to 4x under catch-up) that + // collapses the frame rate to single digits when the filter is on. + const is_last = (i + 1 == frames); + var should_draw = is_last; + // draw_skip is the user's frame-skip setting; apply it to the one + // presented frame per host tick so heavy load can drop whole frames. 
+ if (is_last and draw_skip > 1) { skip_counter += 1; if (skip_counter >= draw_skip) { skip_counter = 0; - break :blk true; + } else { + should_draw = false; } - break :blk false; - }; + } cz.pccore_exec(should_draw); cz.sound_sync(); if (should_draw) { cz.scrndraw_redraw(); - const draw_dt_ns = now - last_draw_ns; - if (draw_dt_ns > 0) { - draw_fps = @floatCast(1_000_000_000.0 / @as(f64, @floatFromInt(draw_dt_ns))); - } - last_draw_ns = now; + fps_draw_count += 1; } } } + // Publish smoothed FPS once per ~0.5s window. + if (fps_window_start_ns == 0) fps_window_start_ns = now; + const win_ns = now - fps_window_start_ns; + if (win_ns >= 500_000_000) { + const win_s = @as(f64, @floatFromInt(win_ns)) / 1_000_000_000.0; + draw_fps = @floatCast(@as(f64, @floatFromInt(fps_draw_count)) / win_s); + fps_draw_count = 0; + fps_window_start_ns = now; + } + pixel.rgb565BufferToRgba8(&fb_rgba, cz.pc98_framebuffer[0 .. FB_WIDTH * FB_HEIGHT]); var img_data = sg.ImageData{}; @@ -446,7 +473,7 @@ export fn frame() void { }); sg.beginPass(.{ .action = state.pass_action, .swapchain = sglue.swapchain() }); - sg.applyPipeline(state.pipeline); + sg.applyPipeline(if (ui.display_hsv) state.pipeline_hsv else state.pipeline); // Live scaling-filter choice from the Screen menu (nearest vs linear). 
state.bindings.samplers[0] = if (ui.display_scale_linear) state.sampler_linear else state.sampler; sg.applyBindings(state.bindings); diff --git a/src/platform/linux.zig b/src/platform/linux.zig index 06ac7c7..890e8af 100644 --- a/src/platform/linux.zig +++ b/src/platform/linux.zig @@ -3,6 +3,7 @@ const sapp = @import("sokol").app; pub const shader_vs_source = @embedFile("shaders/blit.vs.glsl"); pub const shader_fs_source = @embedFile("shaders/blit.fs.glsl"); +pub const shader_fs_hsv_source = @embedFile("shaders/blit_hsv.fs.glsl"); pub const shader_entry = "main"; pub const data_dir_template = "{s}/.local/share/{s}"; diff --git a/src/platform/macos.zig b/src/platform/macos.zig index 7c70b5f..4075a9b 100644 --- a/src/platform/macos.zig +++ b/src/platform/macos.zig @@ -3,6 +3,7 @@ const sapp = @import("sokol").app; pub const shader_vs_source = @embedFile("shaders/blit.vs.metal"); pub const shader_fs_source = @embedFile("shaders/blit.fs.metal"); +pub const shader_fs_hsv_source = @embedFile("shaders/blit_hsv.fs.metal"); pub const shader_entry = "_main"; pub const data_dir_template = "{s}/Library/Application Support/{s}"; diff --git a/src/platform/shaders/blit_hsv.fs.glsl b/src/platform/shaders/blit_hsv.fs.glsl new file mode 100644 index 0000000..e8e7c95 --- /dev/null +++ b/src/platform/shaders/blit_hsv.fs.glsl @@ -0,0 +1,76 @@ +#version 330 +// App-layer HSV-smooth filter (replaces NP2kai's core videofilter). For each +// output texel it averages the 3x3 neighbourhood in HSV space, keeping only +// neighbours within a hue/saturation/value tolerance of the centre — a faithful +// GPU port of VideoFilter_HSVSmooth (preset radius=15 -> 3x3, dH=30, dS=30, +// dV=90, weight type 0). Running it on the GPU keeps the emulator core +// unmodified and is effectively free versus the CPU implementation. 
+uniform sampler2D tex_smp; +in vec2 uv; +out vec4 frag_color; + +const float dHtol = 30.0; +const float dStol = 30.0 / 255.0; +const float dVtol = 90.0 / 255.0; + +// H in [0,360), S and V in [0,1]. +vec3 rgb2hsv(vec3 c) { + float mx = max(c.r, max(c.g, c.b)); + float mn = min(c.r, min(c.g, c.b)); + float d = mx - mn; + float h = 0.0; + if (d > 0.0) { + if (mx == c.r) h = mod((c.g - c.b) / d, 6.0); + else if (mx == c.g) h = (c.b - c.r) / d + 2.0; + else h = (c.r - c.g) / d + 4.0; + h *= 60.0; + if (h < 0.0) h += 360.0; + } + float s = (mx <= 0.0) ? 0.0 : d / mx; + return vec3(h, s, mx); +} + +vec3 hsv2rgb(vec3 c) { + float h = c.x, s = c.y, v = c.z; + float cc = v * s; + float x = cc * (1.0 - abs(mod(h / 60.0, 2.0) - 1.0)); + float m = v - cc; + vec3 rgb; + if (h < 60.0) rgb = vec3(cc, x, 0.0); + else if (h < 120.0) rgb = vec3(x, cc, 0.0); + else if (h < 180.0) rgb = vec3(0.0, cc, x); + else if (h < 240.0) rgb = vec3(0.0, x, cc); + else if (h < 300.0) rgb = vec3(x, 0.0, cc); + else rgb = vec3(cc, 0.0, x); + return rgb + m; +} + +void main() { + vec2 texel = 1.0 / vec2(textureSize(tex_smp, 0)); + vec3 C = rgb2hsv(texture(tex_smp, uv).rgb); + float sumH = 0.0, sumS = 0.0, sumV = 0.0, count = 0.0; + for (int dy = -1; dy <= 1; dy++) { + for (int dx = -1; dx <= 1; dx++) { + vec3 D = rgb2hsv(texture(tex_smp, uv + vec2(float(dx), float(dy)) * texel).rgb); + float dH = D.x - C.x; + if (dH > 180.0) dH -= 360.0; + else if (dH < -180.0) dH += 360.0; + if (D.y <= 0.0) dH = 0.0; + float dS = D.y - C.y; + float dV = D.z - C.z; + bool ok = true; + if (D.z > 0.0 && abs(dH) > dHtol) ok = false; + if (D.z > 0.0 && abs(dS) > dStol) ok = false; + if (abs(dV) > dVtol) ok = false; + if (ok) { + sumH += C.x + dH; + sumS += C.y + dS; + sumV += C.z + dV; + count += 1.0; + } + } + } + float H = mod(sumH / count, 360.0); + if (H < 0.0) H += 360.0; + frag_color = vec4(hsv2rgb(vec3(H, sumS / count, sumV / count)), 1.0); +} diff --git a/src/platform/shaders/blit_hsv.fs.hlsl 
b/src/platform/shaders/blit_hsv.fs.hlsl new file mode 100644 index 0000000..d7d28a8 --- /dev/null +++ b/src/platform/shaders/blit_hsv.fs.hlsl @@ -0,0 +1,75 @@ +// App-layer HSV-smooth filter (replaces NP2kai's core videofilter). See +// blit_hsv.fs.glsl for the algorithm notes; this is the D3D11/HLSL port. +Texture2D tex : register(t0); +SamplerState smp : register(s0); +struct fs_in { + float2 uv : TEXCOORD0; +}; + +static const float dHtol = 30.0; +static const float dStol = 30.0 / 255.0; +static const float dVtol = 90.0 / 255.0; + +// H in [0,360), S and V in [0,1]. +float3 rgb2hsv(float3 c) { + float mx = max(c.r, max(c.g, c.b)); + float mn = min(c.r, min(c.g, c.b)); + float d = mx - mn; + float h = 0.0; + if (d > 0.0) { + if (mx == c.r) h = fmod((c.g - c.b) / d, 6.0); + else if (mx == c.g) h = (c.b - c.r) / d + 2.0; + else h = (c.r - c.g) / d + 4.0; + h *= 60.0; + if (h < 0.0) h += 360.0; + } + float s = (mx <= 0.0) ? 0.0 : d / mx; + return float3(h, s, mx); +} + +float3 hsv2rgb(float3 c) { + float h = c.x, s = c.y, v = c.z; + float cc = v * s; + float x = cc * (1.0 - abs(fmod(h / 60.0, 2.0) - 1.0)); + float m = v - cc; + float3 rgb; + if (h < 60.0) rgb = float3(cc, x, 0.0); + else if (h < 120.0) rgb = float3(x, cc, 0.0); + else if (h < 180.0) rgb = float3(0.0, cc, x); + else if (h < 240.0) rgb = float3(0.0, x, cc); + else if (h < 300.0) rgb = float3(x, 0.0, cc); + else rgb = float3(cc, 0.0, x); + return rgb + m; +} + +float4 main(fs_in inp) : SV_Target0 { + float tw, th; + tex.GetDimensions(tw, th); + float2 texel = float2(1.0 / tw, 1.0 / th); + float3 C = rgb2hsv(tex.Sample(smp, inp.uv).rgb); + float sumH = 0.0, sumS = 0.0, sumV = 0.0, count = 0.0; + [unroll] for (int dy = -1; dy <= 1; dy++) { + [unroll] for (int dx = -1; dx <= 1; dx++) { + float3 D = rgb2hsv(tex.Sample(smp, inp.uv + float2(dx, dy) * texel).rgb); + float dH = D.x - C.x; + if (dH > 180.0) dH -= 360.0; + else if (dH < -180.0) dH += 360.0; + if (D.y <= 0.0) dH = 0.0; + float dS = D.y - C.y; + 
float dV = D.z - C.z; + bool ok = true; + if (D.z > 0.0 && abs(dH) > dHtol) ok = false; + if (D.z > 0.0 && abs(dS) > dStol) ok = false; + if (abs(dV) > dVtol) ok = false; + if (ok) { + sumH += C.x + dH; + sumS += C.y + dS; + sumV += C.z + dV; + count += 1.0; + } + } + } + float H = fmod(sumH / count, 360.0); + if (H < 0.0) H += 360.0; + return float4(hsv2rgb(float3(H, sumS / count, sumV / count)), 1.0); +} diff --git a/src/platform/shaders/blit_hsv.fs.metal b/src/platform/shaders/blit_hsv.fs.metal new file mode 100644 index 0000000..3bf867c --- /dev/null +++ b/src/platform/shaders/blit_hsv.fs.metal @@ -0,0 +1,73 @@ +// App-layer HSV-smooth filter (replaces NP2kai's core videofilter). See +// blit_hsv.fs.glsl for the algorithm notes; this is the Metal port. +#include <metal_stdlib> +using namespace metal; +struct fs_in { + float2 uv; +}; + +constant float dHtol = 30.0; +constant float dStol = 30.0 / 255.0; +constant float dVtol = 90.0 / 255.0; + +// H in [0,360), S and V in [0,1]. +static float3 rgb2hsv(float3 c) { + float mx = max(c.r, max(c.g, c.b)); + float mn = min(c.r, min(c.g, c.b)); + float d = mx - mn; + float h = 0.0; + if (d > 0.0) { + if (mx == c.r) h = fmod((c.g - c.b) / d, 6.0); + else if (mx == c.g) h = (c.b - c.r) / d + 2.0; + else h = (c.r - c.g) / d + 4.0; + h *= 60.0; + if (h < 0.0) h += 360.0; + } + float s = (mx <= 0.0) ? 
0.0 : d / mx; + return float3(h, s, mx); +} + +static float3 hsv2rgb(float3 c) { + float h = c.x, s = c.y, v = c.z; + float cc = v * s; + float x = cc * (1.0 - abs(fmod(h / 60.0, 2.0) - 1.0)); + float m = v - cc; + float3 rgb; + if (h < 60.0) rgb = float3(cc, x, 0.0); + else if (h < 120.0) rgb = float3(x, cc, 0.0); + else if (h < 180.0) rgb = float3(0.0, cc, x); + else if (h < 240.0) rgb = float3(0.0, x, cc); + else if (h < 300.0) rgb = float3(x, 0.0, cc); + else rgb = float3(cc, 0.0, x); + return rgb + m; +} + +fragment float4 _main(fs_in in [[stage_in]], texture2d<float> tex [[texture(0)]], sampler smp [[sampler(0)]]) { + float2 texel = float2(1.0 / tex.get_width(), 1.0 / tex.get_height()); + float3 C = rgb2hsv(tex.sample(smp, in.uv).rgb); + float sumH = 0.0, sumS = 0.0, sumV = 0.0, count = 0.0; + for (int dy = -1; dy <= 1; dy++) { + for (int dx = -1; dx <= 1; dx++) { + float3 D = rgb2hsv(tex.sample(smp, in.uv + float2(dx, dy) * texel).rgb); + float dH = D.x - C.x; + if (dH > 180.0) dH -= 360.0; + else if (dH < -180.0) dH += 360.0; + if (D.y <= 0.0) dH = 0.0; + float dS = D.y - C.y; + float dV = D.z - C.z; + bool ok = true; + if (D.z > 0.0 && abs(dH) > dHtol) ok = false; + if (D.z > 0.0 && abs(dS) > dStol) ok = false; + if (abs(dV) > dVtol) ok = false; + if (ok) { + sumH += C.x + dH; + sumS += C.y + dS; + sumV += C.z + dV; + count += 1.0; + } + } + } + float H = fmod(sumH / count, 360.0); + if (H < 0.0) H += 360.0; + return float4(hsv2rgb(float3(H, sumS / count, sumV / count)), 1.0); +} diff --git a/src/platform/windows.zig b/src/platform/windows.zig index 12e5d2f..5059259 100644 --- a/src/platform/windows.zig +++ b/src/platform/windows.zig @@ -3,6 +3,7 @@ const sapp = @import("sokol").app; pub const shader_vs_source = @embedFile("shaders/blit.vs.hlsl"); pub const shader_fs_source = @embedFile("shaders/blit.fs.hlsl"); +pub const shader_fs_hsv_source = @embedFile("shaders/blit_hsv.fs.hlsl"); pub const shader_entry = "main"; // Backslashes (not forward slashes): the path 
is handed to `explorer` to diff --git a/src/ui.zig b/src/ui.zig index 0e11a3d..4ee66fd 100644 --- a/src/ui.zig +++ b/src/ui.zig @@ -405,14 +405,11 @@ fn menuScreen(ctx: *c.nk_context) void { var linear_v: c_int = if (display_scale_linear) 1 else 0; _ = c.nk_checkbox_label(ctx, "Smooth Scaling", &linear_v); display_scale_linear = linear_v != 0; - // HSV Filter — NP2kai HSV-smooth on the emulated output; toggled live. + // HSV Filter — app-layer HSV-smooth, applied as a GPU shader pass in + // main.frame() when display_hsv is set. No core-side call needed. var hsv_v: c_int = if (display_hsv) 1 else 0; _ = c.nk_checkbox_label(ctx, "HSV Filter", &hsv_v); - const hsv_new = hsv_v != 0; - if (hsv_new != display_hsv) { - display_hsv = hsv_new; - cz.usa_set_video_filter(if (hsv_new) 1 else 0); - } + display_hsv = hsv_v != 0; c.nk_menu_end(ctx); } }