From f85187eff3485311ab630b0d8f919bca51cab8c5 Mon Sep 17 00:00:00 2001 From: Zachary Levy Date: Mon, 20 Apr 2026 20:27:40 -0700 Subject: [PATCH] Clean up memory management --- draw/README.md | 222 ++++++++++++++++++++++++++++++++++++++++++----- draw/draw.odin | 5 +- draw/shapes.odin | 6 ++ draw/text.odin | 2 + 4 files changed, 213 insertions(+), 22 deletions(-) diff --git a/draw/README.md b/draw/README.md index 5f9225a..1066a7e 100644 --- a/draw/README.md +++ b/draw/README.md @@ -81,32 +81,63 @@ shader contains both a 20-register RRect SDF and a 72-register frosted-glass blu — even trivial RRects — is allocated 72 registers. This directly reduces **occupancy** (the number of warps that can run simultaneously), which reduces the GPU's ability to hide memory latency. -Concrete example on a modern NVIDIA SM with 65,536 registers: +Concrete occupancy analysis on modern NVIDIA SMs, which have 65,536 32-bit registers and a +hardware-imposed maximum thread count per SM that varies by architecture (Volta/A100: 2,048; +consumer Ampere/Ada: 1,536). Occupancy is register-limited only when `65536 / regs_per_thread` falls +below the hardware thread cap; above that cap, occupancy is 100% regardless of register count. -| Register allocation | Max concurrent threads | Occupancy | -| ------------------------- | ---------------------- | --------- | -| 20 regs (RRect only) | 3,276 | ~100% | -| 48 regs (+ drop shadow) | 1,365 | ~42% | -| 72 regs (+ frosted glass) | 910 | ~28% | +On consumer Ampere/Ada GPUs (RTX 30xx/40xx, max 1,536 threads per SM): -For a 4K frame (3840×2160) at 1.5× overdraw (~12.4M fragments), running all fragments at 28% -occupancy instead of 100% roughly triples fragment shading time. 
At 4K this is severe: if the main -pipeline's fragment work at full occupancy takes ~2ms, a single unified shader containing the glass -branch would push it to ~6ms — consuming 72% of the 8.3ms budget available at 120 FPS and leaving -almost nothing for CPU work, uploads, and presentation. This is a per-frame multiplier, not a -per-primitive cost — it applies even when the heavy branch is never taken. +| Register allocation | Reg-limited threads | Actual (hw-capped) | Occupancy | +| ------------------------- | ------------------- | ------------------ | --------- | +| 20 regs (RRect only) | 3,276 | 1,536 | 100% | +| 32 regs | 2,048 | 1,536 | 100% | +| 48 regs (+ drop shadow) | 1,365 | 1,365 | ~89% | +| 72 regs (+ frosted glass) | 910 | 910 | ~59% | + +On Volta/A100 GPUs (max 2,048 threads per SM): + +| Register allocation | Reg-limited threads | Actual (hw-capped) | Occupancy | +| ------------------------- | ------------------- | ------------------ | --------- | +| 20 regs (RRect only) | 3,276 | 2,048 | 100% | +| 32 regs | 2,048 | 2,048 | 100% | +| 48 regs (+ drop shadow) | 1,365 | 1,365 | ~67% | +| 72 regs (+ frosted glass) | 910 | 910 | ~44% | + +The register cliff — where occupancy begins dropping — starts at ~43 regs/thread on consumer +Ampere/Ada (65536 / 1536) and ~32 regs/thread on Volta/A100 (65536 / 2048). Below the cliff, +adding registers has zero occupancy cost. + +The impact of reduced occupancy depends on whether the shader is memory-latency-bound (where +occupancy is critical for hiding latency) or ALU-bound (where it matters less). For the +backdrop-effects pipeline's frosted-glass shader, which performs multiple dependent texture reads, +59% occupancy (consumer) or 44% occupancy (Volta) meaningfully reduces the GPU's ability to hide +texture latency — roughly a 1.7× to 2.3× throughput reduction compared to full occupancy. 
At 4K with +1.5× overdraw (~12.4M fragments), if the main pipeline's fragment work at full occupancy takes ~2ms, +a single unified shader containing the glass branch would push it to ~3.4–4.6ms depending on +architecture. This is a per-frame multiplier, not a per-primitive cost — it applies even when the +heavy branch is never taken, because the compiler allocates registers for the worst-case path. + +**Note on Apple M3+ GPUs:** Apple's M3 GPU architecture introduces Dynamic Caching (register file +virtualization), which allocates registers dynamically at runtime based on actual usage rather than +worst-case declared usage. This significantly reduces the static register-pressure-to-occupancy +penalty described above. The tier split remains useful on Apple hardware for other reasons (keeping +the backdrop texture-copy out of the main render pass, isolating blur ALU complexity), but the +register-pressure argument specifically weakens on M3 and later. The three-pipeline split groups primitives by register footprint so that: -- Main pipeline (~20 regs): 90%+ of fragments run at near-full occupancy. -- Effects pipeline (~55 regs): shadow/glow fragments run at moderate occupancy; unavoidable given the - blur math complexity. -- Backdrop-effects pipeline (~75 regs): glass fragments run at low occupancy; also unavoidable, and - structurally separated anyway by the texture-copy requirement. +- Main pipeline (~20 regs): all fragments run at full occupancy on every architecture. +- Effects pipeline (~48–55 regs): shadow/glow fragments run at 67–89% occupancy depending on + architecture; unavoidable given the blur math complexity. +- Backdrop-effects pipeline (~72–75 regs): glass fragments run at 44–59% occupancy; also + unavoidable, and structurally separated anyway by the texture-copy requirement. This avoids the register-pressure tax of a single unified shader while keeping pipeline count minimal (3 vs. Zed GPUI's 7). 
The effects that drag occupancy down are isolated to the fragments that -actually need them. +actually need them. Crucially, all shape kinds within the main pipeline (SDF, tessellated, text) +cluster at 12–24 registers — well below the register cliff on every architecture — so unifying them +costs nothing in occupancy. **Why not per-primitive-type pipelines (GPUI's approach)?** Zed's GPUI uses 7 separate shader pairs: quad, shadow, underline, monochrome sprite, polychrome sprite, path, surface. This eliminates all @@ -160,9 +191,9 @@ in submission order: cheaper than the pipeline-switching alternative. The split we _do_ perform (main / effects / backdrop-effects) is motivated by register-pressure tier -boundaries where occupancy differences are catastrophic at 4K (see numbers above). Within a tier, -unified is strictly better by every measure: fewer draw calls, simpler Z-order, lower CPU overhead, -and negligible GPU-side branching cost. +boundaries where occupancy drops are significant at 4K (see numbers above). Within a tier, unified is +strictly better by every measure: fewer draw calls, simpler Z-order, lower CPU overhead, and +negligible GPU-side branching cost. **References:** @@ -172,6 +203,16 @@ and negligible GPU-side branching cost. 
https://github.com/zed-industries/zed/blob/cb6fc11/crates/gpui/src/platform/mac/shaders.metal - NVIDIA Nsight Graphics 2024.3 documentation on active-threads-per-warp and divergence analysis: https://developer.nvidia.com/blog/optimize-gpu-workloads-for-graphics-applications-with-nvidia-nsight-graphics/ +- NVIDIA Ampere GPU Architecture Tuning Guide — SM specs, max warps per SM (48 for cc 8.6, 64 for + cc 8.0), register file size (64K), occupancy factors: + https://docs.nvidia.com/cuda/ampere-tuning-guide/index.html +- NVIDIA Ada GPU Architecture Tuning Guide — SM specs, max warps per SM (48 for cc 8.9): + https://docs.nvidia.com/cuda/ada-tuning-guide/index.html +- CUDA Occupancy Calculation walkthrough (register allocation granularity, worked examples): + https://leimao.github.io/blog/CUDA-Occupancy-Calculation/ +- Apple M3 GPU architecture — Dynamic Caching (register file virtualization) eliminates static + worst-case register allocation, reducing the occupancy penalty for high-register shaders: + https://asplos.dev/wiki/m3-chip-explainer/gpu/index.html ### Why fragment shader branching is safe in this design @@ -539,6 +580,145 @@ changes. - Valve's original SDF text rendering paper (SIGGRAPH 2007): https://steamcdn-a.akamaihd.net/apps/valve/2007/SIGGRAPH2007_AlphaTestedMagnification.pdf +### Textures + +Textures plug into the existing main pipeline — no additional GPU pipeline, no shader rewrite. The +work is a resource layer (registration, upload, sampling, lifecycle) plus two textured-draw procs +that route into the existing tessellated and SDF paths respectively. + +#### Why draw owns registered textures + +A texture's GPU resource (the `^sdl.GPUTexture`, transfer buffer, shader resource view) is created +and destroyed by draw. The user provides raw bytes and a descriptor at registration time; draw +uploads synchronously and returns an opaque `Texture_Id` handle. The user can free their CPU-side +bytes immediately after `register_texture` returns. 
+ +This follows the model used by the RAD Debugger's render layer (`src/render/render_core.h` in +EpicGamesExt/raddebugger, MIT license), where `r_tex2d_alloc` takes `(kind, size, format, data)` +and returns an opaque handle that the renderer owns and releases. The single-owner model eliminates +an entire class of lifecycle bugs (double-free, use-after-free across subsystems, unclear cleanup +responsibility) that dual-ownership designs introduce. + +If advanced interop is ever needed (e.g., a future 3D pipeline or compute shader sharing the same +GPU texture), the clean extension is a borrowed-reference accessor (`get_gpu_texture(id)`) that +returns the underlying handle without transferring ownership. This is purely additive and does not +require changing the registration API. + +#### Why `Texture_Kind` exists + +`Texture_Kind` (Static / Dynamic / Stream) is a driver hint for update frequency, adopted from the +RAD Debugger's `R_ResourceKind`. It maps directly to SDL3 GPU usage patterns: + +- **Static**: uploaded once, never changes. Covers QR codes, decoded PNGs, icons — the 90% case. +- **Dynamic**: updatable via `update_texture_region`. Covers font atlas growth, procedural updates. +- **Stream**: frequent full re-uploads. Covers video playback, per-frame procedural generation. + +This costs one byte in the descriptor and lets the backend pick optimal memory placement without a +future API change. + +#### Why samplers are per-draw, not per-texture + +A sampler describes how to filter and address a texture during sampling — nearest vs bilinear, clamp +vs repeat. This is a property of the _draw_, not the texture. The same QR code texture should be +sampled with `Nearest_Clamp` when displayed at native resolution but could reasonably be sampled +with `Linear_Clamp` in a zoomed-out thumbnail. The same icon atlas might be sampled with +`Nearest_Clamp` for pixel art or `Linear_Clamp` for smooth scaling. 
+ +The RAD Debugger follows this pattern: `R_BatchGroup2DParams` carries `tex_sample_kind` alongside +the texture handle, chosen per batch group at draw time. We do the same — `Sampler_Preset` is a +parameter on the draw procs, not a field on `Texture_Desc`. + +Internally, draw keeps a small pool of pre-created `^sdl.GPUSampler` objects (one per preset, +lazily initialized). Sub-batch coalescing keys on `(kind, texture_id, sampler_preset)` — draws +with the same texture but different samplers produce separate draw calls, which is correct. + +#### Textured draw procs + +Textured rectangles route through the existing SDF path via `draw.rectangle_texture` and +`draw.rectangle_texture_corners`, mirroring `draw.rectangle` and `draw.rectangle_corners` exactly — +same parameters, same naming — with the color parameter replaced by a texture ID plus an optional +tint. + +An earlier iteration of this design considered a separate tessellated `draw.texture` proc for +"simple" fullscreen quads, on the theory that the tessellated path's lower register count (~16 regs +vs ~24 for the SDF textured branch) would improve occupancy at large fragment counts. Applying the +register-pressure analysis from the pipeline-strategy section above shows this is wrong: both 16 and +24 registers are well below the register cliff (~43 regs on consumer Ampere/Ada, ~32 on Volta/A100), +so both run at 100% occupancy. The remaining ALU difference (~15 extra instructions for the SDF +evaluation) amounts to ~20μs at 4K — below noise. Meanwhile, splitting into a separate pipeline +would add ~1–5μs per pipeline bind on the CPU side per scissor, matching or exceeding the GPU-side +savings. Within the main tier, unified remains strictly better. 
+ +The naming convention follows the existing shape API: `rectangle_texture` and +`rectangle_texture_corners` sit alongside `rectangle` and `rectangle_corners`, mirroring the +`rectangle_gradient` / `circle_gradient` pattern where the shape is the primary noun and the +modifier (gradient, texture) is secondary. This groups related procs together in autocomplete +(`rectangle_*`) and reads as natural English ("draw a rectangle with a texture"). + +Future per-shape texture variants (`circle_texture`, `ellipse_texture`, `polygon_texture`) are +reserved by this naming convention and require only a `Shape_Flag.Textured` bit plus a small +per-shape UV mapping function in the fragment shader. These are additive. + +#### What SDF anti-aliasing does and does not do for textured draws + +The SDF path anti-aliases the **shape's outer silhouette** — rounded-corner edges, rotated edges, +stroke outlines. It does not anti-alias or sharpen the texture content. Inside the shape, fragments +sample through the chosen `Sampler_Preset`, and image quality is whatever the sampler produces from +the source texels. A low-resolution texture displayed at a large size shows bilinear blur regardless +of which draw proc is used. This matches the current text-rendering model, where glyph sharpness +depends on how closely the display size matches the SDL_ttf atlas's rasterized size. + +#### Fit modes are a computation layer, not a renderer concept + +Standard image-fit behaviors (stretch, fill/cover, fit/contain, tile, center) are expressed as UV +sub-region computations on top of the `uv_rect` parameter that both textured-draw procs accept. The +renderer has no knowledge of fit modes — it samples whatever UV region it is given. + +A `fit_params` helper computes the appropriate `uv_rect`, sampler preset, and (for letterbox/fit +mode) shrunken inner rect from a `Fit_Mode` enum, the target rect, and the texture's pixel size. 
Users who need custom UV control (sprite atlas sub-regions, UV animation, nine-patch slicing) skip
the helper and compute `uv_rect` directly. This keeps the renderer primitive minimal while making
the common cases convenient.

#### Deferred release

`unregister_texture` does not immediately release the GPU texture. It queues the slot for release at
the end of the current frame, after `SubmitGPUCommandBuffer` has handed work to the GPU. This
prevents a race condition where a texture is freed while the GPU is still sampling from it in an
already-submitted command buffer. The same deferred-release pattern is applied to `clear_text_cache`
and `clear_text_cache_entry`, fixing a pre-existing latent bug where destroying a cached
`^sdl_ttf.Text` mid-frame could free an atlas texture still referenced by in-flight draw batches.

This pattern is standard in production renderers — the RAD Debugger's `r_tex2d_release` queues
textures onto a free list that is processed in `r_end_frame`, not at the call site.

#### Clay integration

Clay's `RenderCommandType.Image` is handled by dereferencing `imageData: rawptr` as a pointer to a
`Clay_Image_Data` struct containing a `Texture_Id`, `Fit_Mode`, and tint color. Routing mirrors the
existing rectangle handling: zero `cornerRadius` dispatches to `draw.rectangle_texture`, nonzero
dispatches to `draw.rectangle_texture_corners` — both SDF-path procs. A `fit_params` call computes
UVs from the fit mode before dispatch.

#### Deferred features

The following are plumbed in the descriptor but not implemented in phase 1:

- **Mipmaps**: `Texture_Desc.mip_levels` field exists; generation via SDL3 deferred.
- **Compressed formats**: `Texture_Desc.format` accepts BC/ASTC; upload path deferred.
- **Render-to-texture**: `Texture_Desc.usage` accepts `.COLOR_TARGET`; render-pass refactor deferred.
- **3D textures, arrays, cube maps**: `Texture_Desc.type` and `depth_or_layers` fields exist. 
+- **Additional samplers**: anisotropic, trilinear, clamp-to-border — additive enum values. +- **Atlas packing**: internal optimization for sub-batch coalescing; invisible to callers. +- **Per-shape texture variants**: `circle_texture`, `ellipse_texture`, etc. — reserved by naming. + +**References:** + +- RAD Debugger render layer (ownership model, deferred release, sampler-at-draw-time): + https://github.com/EpicGamesExt/raddebugger — `src/render/render_core.h`, `src/render/d3d11/render_d3d11.c` +- Casey Muratori, Handmade Hero day 472 — texture handling as a renderer-owned resource concern, + atlases as a separate layer above the renderer. + ## 3D rendering 3D pipeline architecture is under consideration and will be documented separately. The current diff --git a/draw/draw.odin b/draw/draw.odin index 0ed28b0..0cb0f82 100644 --- a/draw/draw.odin +++ b/draw/draw.odin @@ -265,6 +265,7 @@ measure_text_clay :: proc "c" ( context = GLOB.odin_context text := string(text.chars[:text.length]) c_text := strings.clone_to_cstring(text, context.temp_allocator) + defer delete(c_text, context.temp_allocator) width, height: c.int if !sdl_ttf.GetStringSize(get_font(config.fontId, config.fontSize), c_text, 0, &width, &height) { log.panicf("Failed to measure text: %s", sdl.GetError()) @@ -502,6 +503,7 @@ prepare_clay_batch :: proc( mouse_wheel_delta: [2]f32, frame_time: f32 = 0, custom_draw: Custom_Draw = nil, + temp_allocator := context.temp_allocator, ) { mouse_pos: [2]f32 mouse_flags := sdl.GetMouseState(&mouse_pos.x, &mouse_pos.y) @@ -541,7 +543,8 @@ prepare_clay_batch :: proc( case clay.RenderCommandType.Text: render_data := render_command.renderData.text txt := string(render_data.stringContents.chars[:render_data.stringContents.length]) - c_text := strings.clone_to_cstring(txt, context.temp_allocator) + c_text := strings.clone_to_cstring(txt, temp_allocator) + defer delete(c_text, temp_allocator) // Clay render-command IDs are derived via Clay's internal HashNumber 
(Jenkins-family) // and namespaced with .Clay so they can never collide with user-provided custom text IDs. sdl_text := cache_get_or_update( diff --git a/draw/shapes.odin b/draw/shapes.odin index 2b15f25..5a8b929 100644 --- a/draw/shapes.odin +++ b/draw/shapes.odin @@ -83,6 +83,7 @@ rectangle_gradient :: proc( temp_allocator := context.temp_allocator, ) { vertices := make([]Vertex, 6, temp_allocator) + defer delete(vertices, temp_allocator) corner_top_left := [2]f32{rect.x, rect.y} corner_top_right := [2]f32{rect.x + rect.width, rect.y} @@ -115,6 +116,7 @@ circle_sector :: proc( vertex_count := segment_count * 3 vertices := make([]Vertex, vertex_count, temp_allocator) + defer delete(vertices, temp_allocator) start_radians := math.to_radians(start_angle) end_radians := math.to_radians(end_angle) @@ -167,6 +169,7 @@ circle_gradient :: proc( vertex_count := segment_count * 3 vertices := make([]Vertex, vertex_count, temp_allocator) + defer delete(vertices, temp_allocator) step_angle := math.TAU / f32(segment_count) @@ -238,6 +241,7 @@ triangle_lines :: proc( temp_allocator := context.temp_allocator, ) { vertices := make([]Vertex, 18, temp_allocator) + defer delete(vertices, temp_allocator) write_offset := 0 if !needs_transform(origin, rotation) { @@ -273,6 +277,7 @@ triangle_fan :: proc( triangle_count := len(points) - 2 vertex_count := triangle_count * 3 vertices := make([]Vertex, vertex_count, temp_allocator) + defer delete(vertices, temp_allocator) if !needs_transform(origin, rotation) { for i in 1 ..< len(points) - 1 { @@ -312,6 +317,7 @@ triangle_strip :: proc( triangle_count := len(points) - 2 vertex_count := triangle_count * 3 vertices := make([]Vertex, vertex_count, temp_allocator) + defer delete(vertices, temp_allocator) if !needs_transform(origin, rotation) { for i in 0 ..< triangle_count { diff --git a/draw/text.odin b/draw/text.odin index 5ff7265..7400b33 100644 --- a/draw/text.odin +++ b/draw/text.odin @@ -139,6 +139,7 @@ text :: proc( temp_allocator := 
context.temp_allocator, ) { c_str := strings.clone_to_cstring(text_string, temp_allocator) + defer delete(c_str, temp_allocator) sdl_text: ^sdl_ttf.Text cached := false @@ -180,6 +181,7 @@ measure_text :: proc( allocator := context.temp_allocator, ) -> [2]f32 { c_str := strings.clone_to_cstring(text_string, allocator) + defer delete(c_str, allocator) width, height: c.int if !sdl_ttf.GetStringSize(get_font(font_id, font_size), c_str, 0, &width, &height) { log.panicf("Failed to measure text: %s", sdl.GetError())