From f85187eff3485311ab630b0d8f919bca51cab8c5 Mon Sep 17 00:00:00 2001 From: Zachary Levy Date: Mon, 20 Apr 2026 20:27:40 -0700 Subject: [PATCH] Clean up memory management --- draw/README.md | 222 ++++++++++++++++++++++++++++++++++++++++++----- draw/draw.odin | 5 +- draw/shapes.odin | 6 ++ draw/text.odin | 2 + 4 files changed, 213 insertions(+), 22 deletions(-) diff --git a/draw/README.md b/draw/README.md index 5f9225a..1066a7e 100644 --- a/draw/README.md +++ b/draw/README.md @@ -81,32 +81,63 @@ shader contains both a 20-register RRect SDF and a 72-register frosted-glass blu — even trivial RRects — is allocated 72 registers. This directly reduces **occupancy** (the number of warps that can run simultaneously), which reduces the GPU's ability to hide memory latency. -Concrete example on a modern NVIDIA SM with 65,536 registers: +Concrete occupancy analysis on modern NVIDIA SMs, which have 65,536 32-bit registers and a +hardware-imposed maximum thread count per SM that varies by architecture (Volta/A100: 2,048; +consumer Ampere/Ada: 1,536). Occupancy is register-limited only when `65536 / regs_per_thread` falls +below the hardware thread cap; above that cap, occupancy is 100% regardless of register count. -| Register allocation | Max concurrent threads | Occupancy | -| ------------------------- | ---------------------- | --------- | -| 20 regs (RRect only) | 3,276 | ~100% | -| 48 regs (+ drop shadow) | 1,365 | ~42% | -| 72 regs (+ frosted glass) | 910 | ~28% | +On consumer Ampere/Ada GPUs (RTX 30xx/40xx, max 1,536 threads per SM): -For a 4K frame (3840×2160) at 1.5× overdraw (~12.4M fragments), running all fragments at 28% -occupancy instead of 100% roughly triples fragment shading time. 
At 4K this is severe: if the main -pipeline's fragment work at full occupancy takes ~2ms, a single unified shader containing the glass -branch would push it to ~6ms — consuming 72% of the 8.3ms budget available at 120 FPS and leaving -almost nothing for CPU work, uploads, and presentation. This is a per-frame multiplier, not a -per-primitive cost — it applies even when the heavy branch is never taken. +| Register allocation | Reg-limited threads | Actual (hw-capped) | Occupancy | +| ------------------------- | ------------------- | ------------------ | --------- | +| 20 regs (RRect only) | 3,276 | 1,536 | 100% | +| 32 regs | 2,048 | 1,536 | 100% | +| 48 regs (+ drop shadow) | 1,365 | 1,365 | ~89% | +| 72 regs (+ frosted glass) | 910 | 910 | ~59% | + +On Volta/A100 GPUs (max 2,048 threads per SM): + +| Register allocation | Reg-limited threads | Actual (hw-capped) | Occupancy | +| ------------------------- | ------------------- | ------------------ | --------- | +| 20 regs (RRect only) | 3,276 | 2,048 | 100% | +| 32 regs | 2,048 | 2,048 | 100% | +| 48 regs (+ drop shadow) | 1,365 | 1,365 | ~67% | +| 72 regs (+ frosted glass) | 910 | 910 | ~44% | + +The register cliff — where occupancy begins dropping — starts at ~43 regs/thread on consumer +Ampere/Ada (65536 / 1536) and ~32 regs/thread on Volta/A100 (65536 / 2048). Below the cliff, +adding registers has zero occupancy cost. + +The impact of reduced occupancy depends on whether the shader is memory-latency-bound (where +occupancy is critical for hiding latency) or ALU-bound (where it matters less). For the +backdrop-effects pipeline's frosted-glass shader, which performs multiple dependent texture reads, +59% occupancy (consumer) or 44% occupancy (Volta) meaningfully reduces the GPU's ability to hide +texture latency — roughly a 1.7× to 2.3× throughput reduction compared to full occupancy. 
At 4K with +1.5× overdraw (~12.4M fragments), if the main pipeline's fragment work at full occupancy takes ~2ms, +a single unified shader containing the glass branch would push it to ~3.4–4.6ms depending on +architecture. This is a per-frame multiplier, not a per-primitive cost — it applies even when the +heavy branch is never taken, because the compiler allocates registers for the worst-case path. + +**Note on Apple M3+ GPUs:** Apple's M3 GPU architecture introduces Dynamic Caching (register file +virtualization), which allocates registers dynamically at runtime based on actual usage rather than +worst-case declared usage. This significantly reduces the static register-pressure-to-occupancy +penalty described above. The tier split remains useful on Apple hardware for other reasons (keeping +the backdrop texture-copy out of the main render pass, isolating blur ALU complexity), but the +register-pressure argument specifically weakens on M3 and later. The three-pipeline split groups primitives by register footprint so that: -- Main pipeline (~20 regs): 90%+ of fragments run at near-full occupancy. -- Effects pipeline (~55 regs): shadow/glow fragments run at moderate occupancy; unavoidable given the - blur math complexity. -- Backdrop-effects pipeline (~75 regs): glass fragments run at low occupancy; also unavoidable, and - structurally separated anyway by the texture-copy requirement. +- Main pipeline (~20 regs): all fragments run at full occupancy on every architecture. +- Effects pipeline (~48–55 regs): shadow/glow fragments run at 67–89% occupancy depending on + architecture; unavoidable given the blur math complexity. +- Backdrop-effects pipeline (~72–75 regs): glass fragments run at 44–59% occupancy; also + unavoidable, and structurally separated anyway by the texture-copy requirement. This avoids the register-pressure tax of a single unified shader while keeping pipeline count minimal (3 vs. Zed GPUI's 7). 
The effects that drag occupancy down are isolated to the fragments that -actually need them. +actually need them. Crucially, all shape kinds within the main pipeline (SDF, tessellated, text) +cluster at 12–24 registers — well below the register cliff on every architecture — so unifying them +costs nothing in occupancy. **Why not per-primitive-type pipelines (GPUI's approach)?** Zed's GPUI uses 7 separate shader pairs: quad, shadow, underline, monochrome sprite, polychrome sprite, path, surface. This eliminates all @@ -160,9 +191,9 @@ in submission order: cheaper than the pipeline-switching alternative. The split we _do_ perform (main / effects / backdrop-effects) is motivated by register-pressure tier -boundaries where occupancy differences are catastrophic at 4K (see numbers above). Within a tier, -unified is strictly better by every measure: fewer draw calls, simpler Z-order, lower CPU overhead, -and negligible GPU-side branching cost. +boundaries where occupancy drops are significant at 4K (see numbers above). Within a tier, unified is +strictly better by every measure: fewer draw calls, simpler Z-order, lower CPU overhead, and +negligible GPU-side branching cost. **References:** @@ -172,6 +203,16 @@ and negligible GPU-side branching cost. 
https://github.com/zed-industries/zed/blob/cb6fc11/crates/gpui/src/platform/mac/shaders.metal - NVIDIA Nsight Graphics 2024.3 documentation on active-threads-per-warp and divergence analysis: https://developer.nvidia.com/blog/optimize-gpu-workloads-for-graphics-applications-with-nvidia-nsight-graphics/ +- NVIDIA Ampere GPU Architecture Tuning Guide — SM specs, max warps per SM (48 for cc 8.6, 64 for + cc 8.0), register file size (64K), occupancy factors: + https://docs.nvidia.com/cuda/ampere-tuning-guide/index.html +- NVIDIA Ada GPU Architecture Tuning Guide — SM specs, max warps per SM (48 for cc 8.9): + https://docs.nvidia.com/cuda/ada-tuning-guide/index.html +- CUDA Occupancy Calculation walkthrough (register allocation granularity, worked examples): + https://leimao.github.io/blog/CUDA-Occupancy-Calculation/ +- Apple M3 GPU architecture — Dynamic Caching (register file virtualization) eliminates static + worst-case register allocation, reducing the occupancy penalty for high-register shaders: + https://asplos.dev/wiki/m3-chip-explainer/gpu/index.html ### Why fragment shader branching is safe in this design @@ -539,6 +580,145 @@ changes. - Valve's original SDF text rendering paper (SIGGRAPH 2007): https://steamcdn-a.akamaihd.net/apps/valve/2007/SIGGRAPH2007_AlphaTestedMagnification.pdf +### Textures + +Textures plug into the existing main pipeline — no additional GPU pipeline, no shader rewrite. The +work is a resource layer (registration, upload, sampling, lifecycle) plus two textured-draw procs +that route into the existing tessellated and SDF paths respectively. + +#### Why draw owns registered textures + +A texture's GPU resource (the `^sdl.GPUTexture`, transfer buffer, shader resource view) is created +and destroyed by draw. The user provides raw bytes and a descriptor at registration time; draw +uploads synchronously and returns an opaque `Texture_Id` handle. The user can free their CPU-side +bytes immediately after `register_texture` returns. 
+ +This follows the model used by the RAD Debugger's render layer (`src/render/render_core.h` in +EpicGamesExt/raddebugger, MIT license), where `r_tex2d_alloc` takes `(kind, size, format, data)` +and returns an opaque handle that the renderer owns and releases. The single-owner model eliminates +an entire class of lifecycle bugs (double-free, use-after-free across subsystems, unclear cleanup +responsibility) that dual-ownership designs introduce. + +If advanced interop is ever needed (e.g., a future 3D pipeline or compute shader sharing the same +GPU texture), the clean extension is a borrowed-reference accessor (`get_gpu_texture(id)`) that +returns the underlying handle without transferring ownership. This is purely additive and does not +require changing the registration API. + +#### Why `Texture_Kind` exists + +`Texture_Kind` (Static / Dynamic / Stream) is a driver hint for update frequency, adopted from the +RAD Debugger's `R_ResourceKind`. It maps directly to SDL3 GPU usage patterns: + +- **Static**: uploaded once, never changes. Covers QR codes, decoded PNGs, icons — the 90% case. +- **Dynamic**: updatable via `update_texture_region`. Covers font atlas growth, procedural updates. +- **Stream**: frequent full re-uploads. Covers video playback, per-frame procedural generation. + +This costs one byte in the descriptor and lets the backend pick optimal memory placement without a +future API change. + +#### Why samplers are per-draw, not per-texture + +A sampler describes how to filter and address a texture during sampling — nearest vs bilinear, clamp +vs repeat. This is a property of the _draw_, not the texture. The same QR code texture should be +sampled with `Nearest_Clamp` when displayed at native resolution but could reasonably be sampled +with `Linear_Clamp` in a zoomed-out thumbnail. The same icon atlas might be sampled with +`Nearest_Clamp` for pixel art or `Linear_Clamp` for smooth scaling. 
+ +The RAD Debugger follows this pattern: `R_BatchGroup2DParams` carries `tex_sample_kind` alongside +the texture handle, chosen per batch group at draw time. We do the same — `Sampler_Preset` is a +parameter on the draw procs, not a field on `Texture_Desc`. + +Internally, draw keeps a small pool of pre-created `^sdl.GPUSampler` objects (one per preset, +lazily initialized). Sub-batch coalescing keys on `(kind, texture_id, sampler_preset)` — draws +with the same texture but different samplers produce separate draw calls, which is correct. + +#### Textured draw procs + +Textured rectangles route through the existing SDF path via `draw.rectangle_texture` and +`draw.rectangle_texture_corners`, mirroring `draw.rectangle` and `draw.rectangle_corners` exactly — +same parameters, same naming — with the color parameter replaced by a texture ID plus an optional +tint. + +An earlier iteration of this design considered a separate tessellated `draw.texture` proc for +"simple" fullscreen quads, on the theory that the tessellated path's lower register count (~16 regs +vs ~24 for the SDF textured branch) would improve occupancy at large fragment counts. Applying the +register-pressure analysis from the pipeline-strategy section above shows this is wrong: both 16 and +24 registers are well below the register cliff (~43 regs on consumer Ampere/Ada, ~32 on Volta/A100), +so both run at 100% occupancy. The remaining ALU difference (~15 extra instructions for the SDF +evaluation) amounts to ~20μs at 4K — below noise. Meanwhile, splitting into a separate pipeline +would add ~1–5μs per pipeline bind on the CPU side per scissor, matching or exceeding the GPU-side +savings. Within the main tier, unified remains strictly better. 
+ +The naming convention follows the existing shape API: `rectangle_texture` and +`rectangle_texture_corners` sit alongside `rectangle` and `rectangle_corners`, mirroring the +`rectangle_gradient` / `circle_gradient` pattern where the shape is the primary noun and the +modifier (gradient, texture) is secondary. This groups related procs together in autocomplete +(`rectangle_*`) and reads as natural English ("draw a rectangle with a texture"). + +Future per-shape texture variants (`circle_texture`, `ellipse_texture`, `polygon_texture`) are +reserved by this naming convention and require only a `Shape_Flag.Textured` bit plus a small +per-shape UV mapping function in the fragment shader. These are additive. + +#### What SDF anti-aliasing does and does not do for textured draws + +The SDF path anti-aliases the **shape's outer silhouette** — rounded-corner edges, rotated edges, +stroke outlines. It does not anti-alias or sharpen the texture content. Inside the shape, fragments +sample through the chosen `Sampler_Preset`, and image quality is whatever the sampler produces from +the source texels. A low-resolution texture displayed at a large size shows bilinear blur regardless +of which draw proc is used. This matches the current text-rendering model, where glyph sharpness +depends on how closely the display size matches the SDL_ttf atlas's rasterized size. + +#### Fit modes are a computation layer, not a renderer concept + +Standard image-fit behaviors (stretch, fill/cover, fit/contain, tile, center) are expressed as UV +sub-region computations on top of the `uv_rect` parameter that both textured-draw procs accept. The +renderer has no knowledge of fit modes — it samples whatever UV region it is given. + +A `fit_params` helper computes the appropriate `uv_rect`, sampler preset, and (for letterbox/fit +mode) shrunken inner rect from a `Fit_Mode` enum, the target rect, and the texture's pixel size. 
Users who need custom UV control (sprite atlas sub-regions, UV animation, nine-patch slicing) skip
the helper and compute `uv_rect` directly. This keeps the renderer primitive minimal while making
the common cases convenient.

#### Deferred release

`unregister_texture` does not immediately release the GPU texture. It queues the slot for release at
the end of the current frame, after `SubmitGPUCommandBuffer` has handed work to the GPU. This
prevents a race condition where a texture is freed while the GPU is still sampling from it in an
already-submitted command buffer. The same deferred-release pattern is applied to `clear_text_cache`
and `clear_text_cache_entry`, fixing a pre-existing latent bug where destroying a cached
`^sdl_ttf.Text` mid-frame could free an atlas texture still referenced by in-flight draw batches.

This pattern is standard in production renderers — the RAD Debugger's `r_tex2d_release` queues
textures onto a free list that is processed in `r_end_frame`, not at the call site.

#### Clay integration

Clay's `RenderCommandType.Image` is handled by dereferencing `imageData: rawptr` as a pointer to a
`Clay_Image_Data` struct containing a `Texture_Id`, `Fit_Mode`, and tint color. Routing mirrors the
existing rectangle handling: zero `cornerRadius` dispatches to `draw.rectangle_texture`, nonzero
dispatches to `draw.rectangle_texture_corners` — both SDF-path procs. A `fit_params` call computes
UVs from the fit mode before dispatch.

#### Deferred features

The following are plumbed in the descriptor but not implemented in phase 1:

- **Mipmaps**: `Texture_Desc.mip_levels` field exists; generation via SDL3 deferred.
- **Compressed formats**: `Texture_Desc.format` accepts BC/ASTC; upload path deferred.
- **Render-to-texture**: `Texture_Desc.usage` accepts `.COLOR_TARGET`; render-pass refactor deferred.
- **3D textures, arrays, cube maps**: `Texture_Desc.type` and `depth_or_layers` fields exist. 
+- **Additional samplers**: anisotropic, trilinear, clamp-to-border — additive enum values. +- **Atlas packing**: internal optimization for sub-batch coalescing; invisible to callers. +- **Per-shape texture variants**: `circle_texture`, `ellipse_texture`, etc. — reserved by naming. + +**References:** + +- RAD Debugger render layer (ownership model, deferred release, sampler-at-draw-time): + https://github.com/EpicGamesExt/raddebugger — `src/render/render_core.h`, `src/render/d3d11/render_d3d11.c` +- Casey Muratori, Handmade Hero day 472 — texture handling as a renderer-owned resource concern, + atlases as a separate layer above the renderer. + ## 3D rendering 3D pipeline architecture is under consideration and will be documented separately. The current diff --git a/draw/draw.odin b/draw/draw.odin index 0ed28b0..0cb0f82 100644 --- a/draw/draw.odin +++ b/draw/draw.odin @@ -265,6 +265,7 @@ measure_text_clay :: proc "c" ( context = GLOB.odin_context text := string(text.chars[:text.length]) c_text := strings.clone_to_cstring(text, context.temp_allocator) + defer delete(c_text, context.temp_allocator) width, height: c.int if !sdl_ttf.GetStringSize(get_font(config.fontId, config.fontSize), c_text, 0, &width, &height) { log.panicf("Failed to measure text: %s", sdl.GetError()) @@ -502,6 +503,7 @@ prepare_clay_batch :: proc( mouse_wheel_delta: [2]f32, frame_time: f32 = 0, custom_draw: Custom_Draw = nil, + temp_allocator := context.temp_allocator, ) { mouse_pos: [2]f32 mouse_flags := sdl.GetMouseState(&mouse_pos.x, &mouse_pos.y) @@ -541,7 +543,8 @@ prepare_clay_batch :: proc( case clay.RenderCommandType.Text: render_data := render_command.renderData.text txt := string(render_data.stringContents.chars[:render_data.stringContents.length]) - c_text := strings.clone_to_cstring(txt, context.temp_allocator) + c_text := strings.clone_to_cstring(txt, temp_allocator) + defer delete(c_text, temp_allocator) // Clay render-command IDs are derived via Clay's internal HashNumber 
(Jenkins-family) // and namespaced with .Clay so they can never collide with user-provided custom text IDs. sdl_text := cache_get_or_update( diff --git a/draw/shapes.odin b/draw/shapes.odin index 2b15f25..5a8b929 100644 --- a/draw/shapes.odin +++ b/draw/shapes.odin @@ -83,6 +83,7 @@ rectangle_gradient :: proc( temp_allocator := context.temp_allocator, ) { vertices := make([]Vertex, 6, temp_allocator) + defer delete(vertices, temp_allocator) corner_top_left := [2]f32{rect.x, rect.y} corner_top_right := [2]f32{rect.x + rect.width, rect.y} @@ -115,6 +116,7 @@ circle_sector :: proc( vertex_count := segment_count * 3 vertices := make([]Vertex, vertex_count, temp_allocator) + defer delete(vertices, temp_allocator) start_radians := math.to_radians(start_angle) end_radians := math.to_radians(end_angle) @@ -167,6 +169,7 @@ circle_gradient :: proc( vertex_count := segment_count * 3 vertices := make([]Vertex, vertex_count, temp_allocator) + defer delete(vertices, temp_allocator) step_angle := math.TAU / f32(segment_count) @@ -238,6 +241,7 @@ triangle_lines :: proc( temp_allocator := context.temp_allocator, ) { vertices := make([]Vertex, 18, temp_allocator) + defer delete(vertices, temp_allocator) write_offset := 0 if !needs_transform(origin, rotation) { @@ -273,6 +277,7 @@ triangle_fan :: proc( triangle_count := len(points) - 2 vertex_count := triangle_count * 3 vertices := make([]Vertex, vertex_count, temp_allocator) + defer delete(vertices, temp_allocator) if !needs_transform(origin, rotation) { for i in 1 ..< len(points) - 1 { @@ -312,6 +317,7 @@ triangle_strip :: proc( triangle_count := len(points) - 2 vertex_count := triangle_count * 3 vertices := make([]Vertex, vertex_count, temp_allocator) + defer delete(vertices, temp_allocator) if !needs_transform(origin, rotation) { for i in 0 ..< triangle_count { diff --git a/draw/text.odin b/draw/text.odin index 5ff7265..7400b33 100644 --- a/draw/text.odin +++ b/draw/text.odin @@ -139,6 +139,7 @@ text :: proc( temp_allocator := 
context.temp_allocator, ) { c_str := strings.clone_to_cstring(text_string, temp_allocator) + defer delete(c_str, temp_allocator) sdl_text: ^sdl_ttf.Text cached := false @@ -180,6 +181,7 @@ measure_text :: proc( allocator := context.temp_allocator, ) -> [2]f32 { c_str := strings.clone_to_cstring(text_string, allocator) + defer delete(c_str, allocator) width, height: c.int if !sdl_ttf.GetStringSize(get_font(font_id, font_size), c_str, 0, &width, &height) { log.panicf("Failed to measure text: %s", sdl.GetError())