2026-04-20 20:14:57 +00:00
26 changed files with 5331 additions and 1 deletions
@@ -38,13 +38,33 @@
    "cwd": "$ZED_WORKTREE_ROOT",
  },
  // ---------------------------------------------------------------------------------------------------------------------
-  // ----- LMDB Examples ------------------------
+  // ----- Examples ------------------------
  // ---------------------------------------------------------------------------------------------------------------------
  {
    "label": "Run lmdb example",
    "command": "odin run vendor/lmdb/examples -debug -out=out/debug/lmdb-examples",
    "cwd": "$ZED_WORKTREE_ROOT",
  },
+  {
+    "label": "Run draw hellope-clay example",
+    "command": "odin run draw/examples -debug -out=out/debug/draw-examples -- hellope-clay",
+    "cwd": "$ZED_WORKTREE_ROOT",
+  },
+  {
+    "label": "Run draw hellope-shapes example",
+    "command": "odin run draw/examples -debug -out=out/debug/draw-examples -- hellope-shapes",
+    "cwd": "$ZED_WORKTREE_ROOT",
+  },
+  {
+    "label": "Run draw hellope-text example",
+    "command": "odin run draw/examples -debug -out=out/debug/draw-examples -- hellope-text",
+    "cwd": "$ZED_WORKTREE_ROOT",
+  },
+  {
+    "label": "Run draw hellope-custom example",
+    "command": "odin run draw/examples -debug -out=out/debug/draw-examples -- hellope-custom",
+    "cwd": "$ZED_WORKTREE_ROOT",
+  },
  // ---------------------------------------------------------------------------------------------------------------------
  // ----- Other ------------------------
  // ---------------------------------------------------------------------------------------------------------------------
@@ -1,3 +1,19 @@
 # LevLib

 Narya + BFPOWER unified Odin library collection.
+
+## Meta Tools
+
+The `meta/` package contains build tools that can be run from the project root:
+
+```
+odin run meta -- <command>
+```
+
+Running with no arguments prints available commands.
+
+### Commands
+
+| Command       | Description                                                                                                                                                                                   |
+| ------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `gen-shaders` | Compile all GLSL shaders in `draw/shaders/source/` to SPIR-V and Metal Shading Language, writing results to `draw/shaders/generated/`. Requires `glslangValidator` and `spirv-cross` on PATH. |
@@ -0,0 +1,580 @@
+# draw
+
+2D rendering library built on SDL3 GPU, providing a unified shape-drawing and text-rendering API with
+Clay UI integration.
+
+## Current state
+
+The renderer uses a single unified `Pipeline_2D_Base` (`TRIANGLELIST` pipeline) with two submission
+modes dispatched by a push constant:
+
+- **Mode 0 (Tessellated):** Vertex buffer contains real geometry. Used for text (indexed draws into
+  SDL_ttf atlas textures), axis-aligned sharp-corner rectangles (already optimal as 2 triangles),
+  per-vertex color gradients (`rectangle_gradient`, `circle_gradient`), angular-clipped circle
+  sectors (`circle_sector`), and arbitrary user geometry (`triangle`, `triangle_fan`,
+  `triangle_strip`). The fragment shader computes `out = color * texture(tex, uv)`.
+
+- **Mode 1 (SDF):** A static 6-vertex unit-quad buffer is drawn instanced, with per-primitive
+  `Primitive` structs uploaded each frame to a GPU storage buffer. The vertex shader reads
+  `primitives[gl_InstanceIndex]`, computes world-space position from unit quad corners + primitive
+  bounds. The fragment shader dispatches on `Shape_Kind` to evaluate the correct signed distance
+  function analytically.
+
+Seven SDF shape kinds are implemented:
+
+1. **RRect** — rounded rectangle with per-corner radii (iq's `sdRoundedBox`)
+2. **Circle** — filled or stroked circle
+3. **Ellipse** — exact signed-distance ellipse (iq's iterative `sdEllipse`)
+4. **Segment** — capsule-style line segment with rounded caps
+5. **Ring_Arc** — annular ring with angular clipping for arcs
+6. **NGon** — regular polygon with arbitrary side count and rotation
+7. **Polyline** — decomposed into independent `Segment` primitives per adjacent point pair
+
+All SDF shapes support fill and stroke modes via `Shape_Flags`, and produce mathematically exact
+curves with analytical anti-aliasing via `smoothstep` — no tessellation, no piecewise-linear
+approximation. A rounded rectangle is 1 primitive (64 bytes) instead of ~250 vertices (~5000 bytes).
+
+MSAA is opt-in (default `._1`, no MSAA) via `Init_Options.msaa_samples`. SDF rendering does not
+benefit from MSAA because fragment coverage is computed analytically. MSAA remains useful for text
+glyph edges and tessellated user geometry if desired.
+
+## 2D rendering pipeline plan
+
+This section documents the planned architecture for levlib's 2D rendering system. The design is driven
+by three goals: **draw quality** (mathematically exact curves with perfect anti-aliasing), **efficiency**
+(minimal vertex bandwidth, high GPU occupancy, low draw-call count), and **extensibility** (new
+primitives and effects can be added to the library without architectural changes).
+
+### Overview: three pipelines
+
+The 2D renderer will use three GPU pipelines, split by **register pressure compatibility** and
+**render-state requirements**:
+
+1. **Main pipeline** — shapes (SDF and tessellated) and text. Low register footprint (~18–22
+   registers per thread). Runs at high GPU occupancy. Handles 90%+ of all fragments in a typical
+   frame.
+
+2. **Effects pipeline** — drop shadows, inner shadows, outer glow, and similar ALU-bound blur
+   effects. Medium register footprint (~48–60 registers). Each effects primitive includes the base
+   shape's SDF so that it can draw both the effect and the shape in a single fragment pass, avoiding
+   redundant overdraw.
+
+3. **Backdrop-effects pipeline** — frosted glass, refraction, and any effect that samples the current
+   render target as input. High register footprint (~70–80 registers) and structurally requires a
+   `CopyGPUTextureToTexture` from the render target before drawing. Separated both for register
+   pressure and because the texture-copy requirement forces a render-pass-level state change.
+
+A typical UI frame with no effects uses 1 pipeline bind and 0 switches. A frame with drop shadows
+uses 2 pipelines and 1 switch. A frame with shadows and frosted glass uses all 3 pipelines and 2
+switches plus 1 texture copy. At ~5μs per pipeline bind on modern APIs, worst-case switching overhead
+is under 0.15% of an 8.3ms (120 FPS) frame budget.
+
+### Why three pipelines, not one or seven
+
+The natural question is whether we should use a single unified pipeline (fewer state changes, simpler
+code) or many per-primitive-type pipelines (no branching overhead, lean per-shader register usage).
+
+The dominant cost factor is **GPU register pressure**, not pipeline switching overhead or fragment
+shader branching. A GPU shader core has a fixed register pool shared among all concurrent threads. The
+compiler allocates registers pessimistically based on the worst-case path through the shader. If the
+shader contains both a 20-register RRect SDF and a 72-register frosted-glass blur, _every_ fragment
+— even trivial RRects — is allocated 72 registers. This directly reduces **occupancy** (the number of
+warps that can run simultaneously), which reduces the GPU's ability to hide memory latency.
+
+Concrete example on a modern NVIDIA SM with 65,536 registers:
+
+| Register allocation       | Max concurrent threads | Occupancy |
+| ------------------------- | ---------------------- | --------- |
+| 20 regs (RRect only)      | 3,276                  | ~100%     |
+| 48 regs (+ drop shadow)   | 1,365                  | ~42%      |
+| 72 regs (+ frosted glass) | 910                    | ~28%      |
+
+For a 4K frame (3840×2160) at 1.5× overdraw (~12.4M fragments), running all fragments at 28%
+occupancy instead of 100% roughly triples fragment shading time. At 4K this is severe: if the main
+pipeline's fragment work at full occupancy takes ~2ms, a single unified shader containing the glass
+branch would push it to ~6ms — consuming 72% of the 8.3ms budget available at 120 FPS and leaving
+almost nothing for CPU work, uploads, and presentation. This is a per-frame multiplier, not a
+per-primitive cost — it applies even when the heavy branch is never taken.
+
+The three-pipeline split groups primitives by register footprint so that:
+
+- Main pipeline (~20 regs): 90%+ of fragments run at near-full occupancy.
+- Effects pipeline (~55 regs): shadow/glow fragments run at moderate occupancy; unavoidable given the
+  blur math complexity.
+- Backdrop-effects pipeline (~75 regs): glass fragments run at low occupancy; also unavoidable, and
+  structurally separated anyway by the texture-copy requirement.
+
+This avoids the register-pressure tax of a single unified shader while keeping pipeline count minimal
+(3 vs. Zed GPUI's 7). The effects that drag occupancy down are isolated to the fragments that
+actually need them.
+
+**Why not per-primitive-type pipelines (GPUI's approach)?** Zed's GPUI uses 7 separate shader pairs:
+quad, shadow, underline, monochrome sprite, polychrome sprite, path, surface. This eliminates all
+branching and gives each shader minimal register usage. Three concrete costs make this approach wrong
+for our use case:
+
+**Draw call count scales with kind variety, not just scissor count.** With a unified pipeline,
+one instanced draw call per scissor covers all primitive kinds from a single storage buffer. With
+per-kind pipelines, each scissor requires one draw call and one pipeline bind per kind used. For a
+typical UI frame with 15 scissors and 3–4 primitive kinds per scissor, per-kind splitting produces
+~45–60 draw calls and pipeline binds; our unified approach produces ~15–20 draw calls and 1–5
+pipeline binds. At ~5μs each for CPU-side command encoding on modern APIs, per-kind splitting adds
+375–500μs of CPU overhead per frame — **4.5–6% of an 8.3ms (120 FPS) budget** — with no
+compensating GPU-side benefit, because the register-pressure savings within the simple-SDF tier are
+negligible (all members cluster at 12–22 registers).
+
+**Z-order preservation forces the API to expose layers.** With a single pipeline drawing all kinds
+from one storage buffer, submission order equals draw order — Clay's painter's-order render commands flow
+through without reordering. With separate pipelines per kind, primitives can only batch with
+same-kind neighbors, which means interleaved kinds (e.g., `[rrect, circle, text, rrect, text]`) must
+either issue one draw call per primitive (defeating batching entirely) or force the user to pre-sort
+by kind and reason about explicit layers. GPUI chose the latter, baking layer semantics into their
+API where each layer draws shadows before quads before glyphs. Our design avoids this constraint:
+submission order is draw order, no layer juggling required.
+
+**PSO compilation costs multiply.** Each pipeline takes 1–50ms to compile on Metal/Vulkan/D3D12 at
+first use. 7 pipelines is ~175ms cold startup; 3 pipelines is ~75ms. Adding state axes (MSAA
+variants, blend modes, color formats) multiplies combinatorially — a 2.3× larger variant matrix per
+additional axis with 7 pipelines vs 3.
+
+**Branching cost comparison: unified vs per-kind in the effects pipeline.** The effects pipeline is
+the strongest candidate for per-kind splitting because effect branches are heavier than shape
+branches (~80 instructions for drop shadow vs ~20 for an SDF). Even here, per-kind splitting loses.
+Consider a worst-case scissor with 15 drop-shadowed cards and 2 inner-shadowed elements interleaved
+in submission order:
+
+- _Unified effects pipeline (our plan):_ 1 pipeline bind, 1 instanced draw call. Category-3
+  divergence occurs at drop-shadow/inner-shadow boundaries where ~4 warps straddle per boundary × 2
+  boundaries = ~8 divergent warps out of ~19,924 total (0.04%). Each divergent warp pays ~80 extra
+  instructions. Total divergence cost: 8 × 32 × 80 / 12G inst/sec ≈ **1.7μs**.
+
+- _Per-kind effects pipelines (GPUI-style):_ 2 pipeline binds + 2 draw calls. But submission order
+  is `[drop, drop, inner, drop, drop, inner, drop, ...]` — the two inner-shadow primitives split the
+  drop-shadow run into three segments. To preserve Z-order, this requires 5 draw calls and 4 pipeline
+  switches, not 2. Cost: 5 × 5μs + 4 × 5μs = **45μs**.
+
+  The per-kind approach costs **26× more** than the unified approach's divergence penalty (45μs vs
+  1.7μs), while eliminating only 0.04% warp divergence that was already negligible. Even in the most
+  extreme stacked-effects scenario (10 cards each with both drop shadow and inner shadow, producing
+  ~60 boundary warps at ~80 extra instructions each), unified divergence costs ~13μs — still 3.5×
+  cheaper than the pipeline-switching alternative.
+
+The split we _do_ perform (main / effects / backdrop-effects) is motivated by register-pressure tier
+boundaries where occupancy differences are catastrophic at 4K (see numbers above). Within a tier,
+unified is strictly better by every measure: fewer draw calls, simpler Z-order, lower CPU overhead,
+and negligible GPU-side branching cost.
+
+**References:**
+
+- Zed GPUI blog post on their per-primitive pipeline architecture:
+  https://zed.dev/blog/videogame
+- Zed GPUI Metal shader source (7 shader pairs):
+  https://github.com/zed-industries/zed/blob/cb6fc11/crates/gpui/src/platform/mac/shaders.metal
+- NVIDIA Nsight Graphics 2024.3 documentation on active-threads-per-warp and divergence analysis:
+  https://developer.nvidia.com/blog/optimize-gpu-workloads-for-graphics-applications-with-nvidia-nsight-graphics/
+
+### Why fragment shader branching is safe in this design
+
+There is longstanding folklore that "branches in shaders are bad." This was true on pre-2010 hardware
+where shader cores had no branch instructions at all — compilers emitted code for both sides of every
+branch and used conditional select to pick the result. On modern GPUs (everything from ~2012 onward),
+this is no longer the case. Native dynamic branching is fully supported on all current hardware.
+However, branching _can_ still be costly in specific circumstances. Understanding which circumstances
+apply to our design — and which do not — is critical to justifying the unified-pipeline approach.
+
+#### How GPU branching works
+
+GPUs execute fragment shaders in **warps** (NVIDIA/Intel, 32 threads) or **wavefronts** (AMD, 32 or
+64 threads). All threads in a warp execute the same instruction simultaneously (SIMT model). When a
+branch condition evaluates the same way for every thread in a warp, the GPU simply jumps to the taken
+path and skips the other — **zero cost**, identical to a CPU branch. This is called a **uniform
+branch** or **warp-coherent branch**.
+
+When threads within the same warp disagree on which path to take, the warp must execute both paths
+sequentially, masking off threads that don't belong to the active path. This is called **warp
+divergence** and it causes the warp to pay the cost of both sides of the branch. In the worst case
+(50/50 split), throughput halves for that warp.
+
+There are four categories of branch condition in a fragment shader, ranked by cost:
+
+| Category                         | Condition source                                                  | GPU behavior                                                                                   | Cost                  |
+| -------------------------------- | ----------------------------------------------------------------- | ---------------------------------------------------------------------------------------------- | --------------------- |
+| **Compile-time constant**        | `#ifdef`, `const bool`                                            | Dead code eliminated by compiler                                                               | Zero                  |
+| **Uniform / push constant**      | Same value for entire draw call                                   | Warp-coherent; GPU skips dead path                                                             | Effectively zero      |
+| **Per-primitive `flat` varying** | Same value across all fragments of a primitive                    | Warp-coherent for all warps fully inside one primitive; divergent only at primitive boundaries | Near zero (see below) |
+| **Per-fragment varying**         | Different value per pixel (e.g., texture lookup, screen position) | Potentially divergent within every warp                                                        | Can be expensive      |
+
+#### Which category our branches fall into
+
+Our design has two branch points:
+
+1. **`mode` (push constant): tessellated vs. SDF.** This is category 2 — uniform per draw call.
+   Every thread in every warp of a draw call sees the same `mode` value. **Zero divergence, zero
+   cost.**
+
+2. **`shape_kind` (flat varying from storage buffer): which SDF to evaluate.** This is category 3.
+   The `flat` interpolation qualifier ensures that all fragments rasterized from one primitive's quad
+   receive the same `shape_kind` value. Divergence can only occur at the **boundary between two
+   adjacent primitives of different kinds**, where the rasterizer might pack fragments from both
+   primitives into the same warp.
+
+For category 3, the divergence analysis depends on primitive size:
+
+- **Large primitives** (buttons, panels, containers — 50+ pixels on a side): a 200×100 rect
+  produces ~20,000 fragments = ~625 warps. At most ~4 boundary warps might straddle a neighbor of a
+  different kind. Divergence rate: **0.6%** of warps.
+
+- **Small primitives** (icons, dots — 16×16): 256 fragments = ~8 warps. At most 2 boundary warps
+  diverge. Divergence rate: **25%** of warps for that primitive, but the primitive itself covers a
+  tiny fraction of the frame's total fragments.
+
+- **Worst realistic case**: a dense grid of alternating shape kinds (e.g., circle-rect-circle-rect
+  icons). Even here, the interior warps of each primitive are coherent. Only the edges diverge. Total
+  frame-level divergence is typically **1–3%** of all warps.
+
+At 1–3% divergence, the throughput impact is negligible. At 4K with 12.4M total fragments
+(~387,000 warps), divergent boundary warps number in the low thousands. Each divergent warp pays at
+most ~25 extra instructions (the cost of the longest untaken SDF branch). At ~12G instructions/sec
+on a mid-range GPU, that totals ~4μs — under 0.05% of an 8.3ms (120 FPS) frame budget. This is
+confirmed by production renderers that use exactly this pattern:
+
+- **vger / vger-rs** (Audulus): single pipeline, 11 primitive kinds dispatched by a `switch` on a
+  flat varying `prim_type`. Ships at 120 FPS on iPads. The author (Taylor Holliday) replaced nanovg
+  specifically because CPU-side tessellation was the bottleneck, not fragment branching:
+  https://github.com/audulus/vger-rs
+
+- **Randy Gaul's 2D renderer**: single pipeline with `shape_type` encoded as a vertex attribute.
+  Reports that warp divergence "really hasn't been an issue for any game I've seen so far" because
+  "games tend to draw a lot of the same shape type":
+  https://randygaul.github.io/graphics/2025/03/04/2D-Rendering-SDF-and-Atlases.html
+
+#### What kind of branching IS expensive
+
+For completeness, here are the cases where shader branching genuinely hurts — none of which apply to
+our design:
+
+1. **Per-fragment data-dependent branches with high divergence.** Example:
+   `if (texture(noise, uv).r > 0.5)` where the noise texture produces a random pattern. Every warp
+   has ~50% divergence. Every warp pays for both paths. This is the scenario the "branches are bad"
+   folklore warns about. We have no per-fragment data-dependent branches in the main pipeline.
+
+2. **Branches where both paths are very long.** If both sides of a branch are 500+ instructions,
+   divergent warps pay double a large cost. Our SDF functions are 10–25 instructions each. Even
+   fully divergent, the penalty is ~25 extra instructions — less than a single texture sample's
+   latency.
+
+3. **Branches that prevent compiler optimizations.** Some compilers cannot schedule instructions
+   across branch boundaries, reducing VLIW utilization on older architectures. Modern GPUs (NVIDIA
+   Volta+, AMD RDNA+, Apple M-series) use scalar+vector execution models where this is not a
+   concern.
+
+4. **Register pressure from the union of all branches.** This is the real cost, and it is why we
+   split heavy effects (shadows, glass) into separate pipelines. Within the main pipeline, all SDF
+   branches have similar register footprints (12–22 registers), so combining them causes negligible
+   occupancy loss.
+
+**References:**
+
+- ARM solidpixel blog on branches in mobile shaders — comprehensive taxonomy of branch execution
+  models across GPU generations, confirms uniform and warp-coherent branches are free on modern
+  hardware:
+  https://solidpixel.github.io/2021/12/09/branches_in_shaders.html
+- Peter Stefek's "A Note on Branching Within a Shader" — practical measurements showing that
+  warp-coherent branches have zero overhead on Pascal/Volta/Ampere, with clear explanation of the
+  SIMT divergence mechanism:
+  https://www.peterstefek.me/shader-branch.html
+- NVIDIA Volta architecture whitepaper — documents independent thread scheduling which allows
+  divergent threads to reconverge more efficiently than older architectures:
+  https://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf
+- Randy Gaul on warp divergence in practice with per-primitive shape_type branching:
+  https://randygaul.github.io/graphics/2025/03/04/2D-Rendering-SDF-and-Atlases.html
+
+### Main pipeline: SDF + tessellated (unified)
+
+The main pipeline serves two submission modes through a single `TRIANGLELIST` pipeline and a single
+vertex input layout, distinguished by a push constant:
+
+- **Tessellated mode** (`mode = 0`): direct vertex buffer with explicit geometry. Unchanged from
+  today. Used for text (SDL_ttf atlas sampling), polylines, triangle fans/strips, gradient-filled
+  shapes, and any user-provided raw vertex geometry.
+- **SDF mode** (`mode = 1`): shared unit-quad vertex buffer + GPU storage buffer of `Primitive`
+  structs, drawn instanced. Used for all shapes with closed-form signed distance functions.
+
+Both modes converge on the same fragment shader, which dispatches on a `shape_kind` discriminant
+carried either in the vertex data (tessellated, always `Solid = 0`) or in the storage-buffer
+primitive struct (SDF mode).
+
+#### Why SDF for shapes
+
+CPU-side adaptive tessellation for curved shapes (the current approach) has three problems:
+
+1. **Vertex bandwidth.** A rounded rectangle with four corner arcs produces ~250 vertices × 20 bytes
+   = 5 KB. An SDF rounded rectangle is one `Primitive` struct (64 bytes with padding) plus 4 shared
+   unit-quad vertices. That is roughly an 80× reduction per shape.
+
+2. **Quality.** Tessellated curves are piecewise-linear approximations. At high DPI or under
+   animation/zoom, faceting is visible at any practical segment count. SDF evaluation produces
+   mathematically exact boundaries with perfect anti-aliasing via `smoothstep` in the fragment
+   shader.
+
+3. **Feature cost.** Adding soft edges, outlines, stroke effects, or rounded-cap line segments
+   requires extensive per-shape tessellation code. With SDF, these are trivial fragment shader
+   operations: `abs(d) - thickness` for stroke, `smoothstep(-soft, soft, d)` for soft edges.
+
+**References:**
+
+- Inigo Quilez's 2D SDF primitive catalog (primary source for all SDF functions used):
+  https://iquilezles.org/articles/distfunctions2d/
+- Valve's 2007 SIGGRAPH paper on SDF for vector textures and glyphs (foundational reference):
+  https://steamcdn-a.akamaihd.net/apps/valve/2007/SIGGRAPH2007_AlphaTestedMagnification.pdf
+- Randy Gaul's practical writeup on SDF 2D rendering with shape-type branching, attribute layout,
+  warp divergence tradeoffs, and polyline rendering:
+  https://randygaul.github.io/graphics/2025/03/04/2D-Rendering-SDF-and-Atlases.html
+- Audulus vger-rs — production 2D renderer using a single unified pipeline with SDF type
+  discriminant, same architecture as this plan. Replaced nanovg, achieving 120 FPS where nanovg fell
+  to 30 FPS due to CPU-side tessellation:
+  https://github.com/audulus/vger-rs
+
+#### Storage-buffer instancing for SDF primitives
+
+SDF primitives are submitted via a GPU storage buffer indexed by `gl_InstanceIndex` in the vertex
+shader, rather than encoding per-primitive data redundantly in vertex attributes. This follows the
+pattern used by both Zed GPUI and vger-rs.
+
+Each SDF shape is described by a single `Primitive` struct (64 bytes with padding) in the storage
+buffer. The vertex shader reads `primitives[gl_InstanceIndex]`, computes the quad corner position
+from the unit vertex and the primitive's bounds, and passes shape parameters to the fragment shader
+via `flat` interpolated varyings.
+
+Compared to encoding per-primitive data in vertex attributes (the "fat vertex" approach), storage-
+buffer instancing eliminates the 4–6× data duplication across quad corners. A rounded rectangle costs
+64 bytes instead of 4 vertices × 40+ bytes = 160+ bytes.
+
+The tessellated path retains the existing direct vertex buffer layout (20 bytes/vertex, no storage
+buffer access). The vertex shader branch on `mode` (push constant) is warp-uniform — every invocation
+in a draw call has the same mode — so it is effectively free on all modern GPUs.
+
+#### Shape kinds
+
+Primitives in the main pipeline's storage buffer carry a `Shape_Kind` discriminant:
+
+| Kind       | SDF function                           | Notes                                                     |
+| ---------- | -------------------------------------- | --------------------------------------------------------- |
+| `RRect`    | `sdRoundedBox` (iq)                    | Per-corner radii. Covers all Clay rectangles and borders. |
+| `Circle`   | `sdCircle`                             | Filled and stroked.                                       |
+| `Ellipse`  | `sdEllipse`                            | Exact (iq's closed-form).                                 |
+| `Segment`  | `sdSegment` capsule                    | Rounded caps, correct sub-pixel thin lines.               |
+| `Ring_Arc` | `abs(sdCircle) - thickness` + arc mask | Rings, arcs, circle sectors unified.                      |
+| `NGon`     | `sdRegularPolygon`                     | Regular n-gon for n ≥ 5.                                  |
+
+The `Solid` kind (value 0) is reserved for the tessellated path, where `shape_kind` is implicitly
+zero because the fragment shader receives it from zero-initialized vertex attributes.
+
+Stroke/outline variants of each shape are handled by the `Shape_Flags` bit set rather than separate
+shape kinds. The fragment shader transforms `d = abs(d) - stroke_width` when the `Stroke` flag is
+set.
+
+**What stays tessellated:**
+
+- Text (SDL_ttf atlas, pending future MSDF evaluation)
+- `rectangle_gradient`, `circle_gradient` (per-vertex color interpolation)
+- `triangle_fan`, `triangle_strip` (arbitrary user-provided point lists)
+- `line_strip` / polylines (SDF polyline rendering is possible but complex; deferred)
+- Any raw vertex geometry submitted via `prepare_shape`
+
+The rule: if the shape has a closed-form SDF, it goes SDF. If it's described only by a vertex list or
+needs per-vertex color interpolation, it stays tessellated.
+
+### Effects pipeline
+
+The effects pipeline handles blur-based visual effects: drop shadows, inner shadows, outer glow, and
+similar. It uses the same storage-buffer instancing pattern as the main pipeline's SDF path, with a
+dedicated pipeline state object that has its own compiled fragment shader.
+
+#### Combined shape + effect rendering
+
+When a shape has an effect (e.g., a rounded rectangle with a drop shadow), the shape is drawn
+**once**, entirely in the effects pipeline. The effects fragment shader evaluates both the effect
+(blur math) and the base shape's SDF, compositing them in a single pass. The shape is not duplicated
+across pipelines.
+
+This avoids redundant overdraw. Consider a 200×100 rounded rect with a drop shadow offset by (5, 5)
+and blur sigma 10:
+
+- **Separate-primitive approach** (shape in main pipeline + shadow in effects pipeline): the shadow
+  quad covers ~230×130 = 29,900 pixels, the shape quad covers 200×100 = 20,000 pixels. The ~18,500
+  shadow fragments underneath the shape run the expensive blur shader only to be overwritten by the
+  shape. Total fragment invocations: ~49,900.
+
+- **Combined approach** (one primitive in effects pipeline): one quad covers ~230×130 = 29,900
+  pixels. The fragment shader evaluates the blur, then evaluates the shape SDF, composites the shape
+  on top. Total fragment invocations: ~29,900. The 20,000 shape-region fragments run the blur+shape
+  shader, but the shape SDF evaluation adds only ~15 ops to an ~80 op blur shader.
+
+The combined approach uses **~40% fewer fragment invocations** per effected shape (29,900 vs 49,900)
+in the common opaque case. The shape-region fragments pay a small additional cost for shape SDF
+evaluation in the effects shader (~15 ops), but this is far cheaper than running 18,500 fragments
+through the full blur shader (~80 ops each) and then discarding their output. For a UI with 10
+shadowed elements, the combined approach saves roughly 200,000 fragment invocations per frame.
+
+An `Effect_Flag.Draw_Base_Shape` flag controls whether the sharp shape layer composites on top
+(default true for drop shadow, always true for inner shadow). Standalone effects (e.g., a glow with
+no shape on top) clear this flag.
+
+Shapes without effects are submitted to the main pipeline as normal. Only shapes that have effects
+are routed to the effects pipeline.
+
+#### Drop shadow implementation
+
+Drop shadows use the analytical blurred-rounded-rectangle technique. Raph Levien's 2020 blog post
+describes an erf-based approximation that computes a Gaussian-blurred rounded rectangle in closed
+form along one axis and with a 4-sample numerical integration along the other. Total fragment cost is
+~80 FLOPs, one sqrt, no texture samples. This is the same technique used by Zed GPUI (via Evan
+Wallace's variant) and vger-rs.
+
+**References:**
+
+- Raph Levien's blurred rounded rectangles post (erf approximation, squircle contour refinement):
+  https://raphlinus.github.io/graphics/2020/04/21/blurred-rounded-rects.html
+- Evan Wallace's original WebGL implementation (used by Figma):
+  https://madebyevan.com/shaders/fast-rounded-rectangle-shadows/
+- Vello's implementation of blurred rounded rectangle as a gradient type:
+  https://github.com/linebender/vello/pull/665
+
+### Backdrop-effects pipeline
+
+The backdrop-effects pipeline handles effects that sample the current render target as input: frosted
+glass, refraction, mirror surfaces. It is structurally separated from the effects pipeline for two
+reasons:
+
+1. **Render-state requirement.** Before any backdrop-sampling fragment can run, the current render
+   target must be copied to a separate texture via `CopyGPUTextureToTexture`. This is a command-
+   buffer-level operation that cannot happen mid-render-pass. The copy naturally creates a pipeline
+   boundary.
+
+2. **Register pressure.** Backdrop-sampling shaders read from a texture with Gaussian kernel weights
+   (multiple texture fetches per fragment), pushing register usage to ~70–80. Including this in the
+   effects pipeline would reduce occupancy for all shadow/glow fragments from ~30% to ~20%, costing
+   measurable throughput on the common case.
+
+The backdrop-effects pipeline binds a secondary sampler pointing at the captured backdrop texture. When
+no backdrop effects are present in a frame, this pipeline is never bound and the texture copy never
+happens — zero cost.
+
+### Vertex layout
+
+The vertex struct is unchanged from the current 20-byte layout:
+
+```
+Vertex :: struct {
+    position: [2]f32,  //  0: screen-space position
+    uv:       [2]f32,  //  8: atlas UV (text) or unused (shapes)
+    color:    Color,   // 16: u8x4, GPU-normalized to float
+}
+```
+
+This layout is shared between the tessellated path and the SDF unit-quad vertices. For tessellated
+draws, `position` carries actual world-space geometry. For SDF draws, `position` carries unit-quad
+corners (0,0 to 1,1) and the vertex shader computes world-space position from the storage-buffer
+primitive's bounds.
+
+The `Primitive` struct for SDF shapes lives in the storage buffer, not in vertex attributes:
+
+```
+Primitive :: struct {
+    kind:   Shape_Kind,     //  0: enum u8
+    flags:  Shape_Flags,    //  1: bit_set[Shape_Flag; u8]
+    _pad:   u16,            //  2: reserved
+    bounds: [4]f32,         //  4: min_x, min_y, max_x, max_y
+    color:  Color,          // 20: u8x4
+    _pad2:  [3]u8,          // 24: alignment
+    params: Shape_Params,   // 28: raw union, 32 bytes
+}
+// Total: 60 bytes (padded to 64 for GPU alignment)
+```
+
+`Shape_Params` is a `#raw_union` with named variants per shape kind (`rrect`, `circle`, `segment`,
+etc.), ensuring type safety on the CPU side and zero-cost reinterpretation on the GPU side.
+
+### Draw submission order
+
+Within each scissor region, draws are issued in submission order to preserve the painter's algorithm:
+
+1. Bind **effects pipeline** → draw all queued effects primitives for this scissor (instanced, one
+   draw call). Each effects primitive includes its base shape and composites internally.
+2. Bind **main pipeline, tessellated mode** → draw all queued tessellated vertices (non-indexed for
+   shapes, indexed for text). Pipeline state unchanged from today.
+3. Bind **main pipeline, SDF mode** → draw all queued SDF primitives (instanced, one draw call).
+4. If backdrop effects are present: copy render target, bind **backdrop-effects pipeline** → draw
+   backdrop primitives.
+
+The exact ordering within a scissor may be refined based on actual Z-ordering requirements. The key
+invariant is that each primitive is drawn exactly once, in the pipeline that owns it.
+
+### Text rendering
+
+Text rendering currently uses SDL_ttf's GPU text engine, which rasterizes glyphs per `(font, size)`
+pair into bitmap atlases and emits indexed triangle data via `GetGPUTextDrawData`. This path is
+**unchanged** by the SDF migration — text continues to flow through the main pipeline's tessellated
+mode with `shape_kind = Solid`, sampling the SDL_ttf atlas texture.
+
+A future phase may evaluate MSDF (multi-channel signed distance field) text rendering, which would
+allow resolution-independent glyph rendering from a single small atlas per font. This would involve:
+
+- Offline atlas generation via Chlumský's msdf-atlas-gen tool.
+- Runtime glyph metrics via `vendor:stb/truetype` (already in the Odin distribution).
+- A new `Shape_Kind.MSDF_Glyph` variant in the main pipeline's fragment shader.
+- Potential removal of the SDL_ttf dependency.
+
+This is explicitly deferred. The SDF shape migration is independent of and does not block text
+changes.
+
+**References:**
+
+- Viktor Chlumský's MSDF master's thesis and msdfgen tool:
+  https://github.com/Chlumsky/msdfgen
+- MSDF atlas generator for font atlas packing:
+  https://github.com/Chlumsky/msdf-atlas-gen
+- Valve's original SDF text rendering paper (SIGGRAPH 2007):
+  https://steamcdn-a.akamaihd.net/apps/valve/2007/SIGGRAPH2007_AlphaTestedMagnification.pdf
+
+## 3D rendering
+
+3D pipeline architecture is under consideration and will be documented separately. The current
+expectation is that 3D rendering will use dedicated pipelines (separate from the 2D pipelines)
+sharing GPU resources (textures, samplers, command buffer lifecycle) with the 2D renderer.
+
+## Building shaders
+
+GLSL shader sources live in `shaders/source/`. Compiled outputs (SPIR-V and Metal Shading Language)
+are generated into `shaders/generated/` via the meta tool:
+
+```
+odin run meta -- gen-shaders
+```
+
+Requires `glslangValidator` and `spirv-cross` on PATH.
+
+### Shader format selection
+
+The library embeds shader bytecode per compile target — MSL + `main0` entry point on Darwin (via
+`spirv-cross --msl`, which renames `main` because it is reserved in Metal), SPIR-V + `main` entry
+point elsewhere. Three compile-time constants in `draw.odin` expose the build's shader configuration:
+
+| Constant                      | Type                      | Darwin    | Other      |
+| ----------------------------- | ------------------------- | --------- | ---------- |
+| `PLATFORM_SHADER_FORMAT_FLAG` | `sdl.GPUShaderFormatFlag` | `.MSL`    | `.SPIRV`   |
+| `PLATFORM_SHADER_FORMAT`      | `sdl.GPUShaderFormat`     | `{.MSL}`  | `{.SPIRV}` |
+| `SHADER_ENTRY`                | `cstring`                 | `"main0"` | `"main"`   |
+
+Pass `PLATFORM_SHADER_FORMAT` to `sdl.CreateGPUDevice` so SDL selects a backend compatible with the
+embedded bytecode:
+
+```
+gpu := sdl.CreateGPUDevice(draw.PLATFORM_SHADER_FORMAT, true, nil)
+```
+
+At init time the library calls `sdl.GetGPUShaderFormats(device)` to verify the active backend
+accepts `PLATFORM_SHADER_FORMAT_FLAG`. If it does not, `draw.init` returns `false` with a
+descriptive log message showing both the embedded and active format sets.
@@ -0,0 +1,938 @@
+package draw
+
+import "base:runtime"
+import "core:c"
+import "core:log"
+import "core:math"
+import "core:strings"
+import sdl "vendor:sdl3"
+import sdl_ttf "vendor:sdl3/ttf"
+
+import clay "../vendor/clay"
+
+// Per-platform shader selection, resolved at compile time.
+// Darwin builds embed the generated `.metal` files and use the `main0` entry point; every other
+// target embeds the generated `.spv` files and uses the `main` entry point.
+when ODIN_OS == .Darwin {
+	PLATFORM_SHADER_FORMAT_FLAG :: sdl.GPUShaderFormatFlag.MSL
+	SHADER_ENTRY :: cstring("main0")
+	BASE_VERT_2D_RAW :: #load("shaders/generated/base_2d.vert.metal")
+	BASE_FRAG_2D_RAW :: #load("shaders/generated/base_2d.frag.metal")
+} else {
+	PLATFORM_SHADER_FORMAT_FLAG :: sdl.GPUShaderFormatFlag.SPIRV
+	SHADER_ENTRY :: cstring("main")
+	BASE_VERT_2D_RAW :: #load("shaders/generated/base_2d.vert.spv")
+	BASE_FRAG_2D_RAW :: #load("shaders/generated/base_2d.frag.spv")
+}
+// Single-flag format set built from PLATFORM_SHADER_FORMAT_FLAG.
+PLATFORM_SHADER_FORMAT :: sdl.GPUShaderFormat{PLATFORM_SHADER_FORMAT_FLAG}
+
+// Initial capacities (in elements) reserved by `init` for the per-frame dynamic arrays.
+// BUFFER_INIT_SIZE covers the tmp_* staging arrays (vertices, indices, batches, primitives).
+BUFFER_INIT_SIZE :: 256
+// Initial capacity of GLOB.layers.
+INITIAL_LAYER_SIZE :: 5
+// Initial capacity of GLOB.scissors.
+INITIAL_SCISSOR_SIZE :: 10
+
+// ---------------------------------------------------------------------------------------------------------------------
+// ----- Color -------------------------
+// ---------------------------------------------------------------------------------------------------------------------
+
+// Four 8-bit channels. The predefined constants below are laid out as {red, green, blue, alpha}.
+Color :: distinct [4]u8
+
+// Opaque convenience colors (alpha = 255).
+BLACK :: Color{0, 0, 0, 255}
+WHITE :: Color{255, 255, 255, 255}
+RED :: Color{255, 0, 0, 255}
+GREEN :: Color{0, 255, 0, 255}
+BLUE :: Color{0, 0, 255, 255}
+// Fully transparent black (all channels zero).
+BLANK :: Color{0, 0, 0, 0}
+
+// Convert clay.Color ([4]c.float in 0–255 range) to Color.
+//
+// Each channel is clamped to [0, 255] before the cast so that out-of-range inputs saturate
+// instead of producing an unspecified (wrapped) u8. In-range values are truncated toward zero,
+// exactly as before.
+color_from_clay :: proc(clay_color: clay.Color) -> Color {
+	return Color {
+		u8(clamp(clay_color[0], 0, 255)),
+		u8(clamp(clay_color[1], 0, 255)),
+		u8(clamp(clay_color[2], 0, 255)),
+		u8(clamp(clay_color[3], 0, 255)),
+	}
+}
+
+// Normalizes an 8-bit-per-channel Color into four floats in the 0.0–1.0 range.
+// Handy wherever SDL expects float colors (e.g. the render-pass clear color).
+color_to_f32 :: proc(color: Color) -> [4]f32 {
+	INV :: 1.0 / 255.0
+	normalized: [4]f32
+	for channel, index in color {
+		normalized[index] = f32(channel) * INV
+	}
+	return normalized
+}
+
+// ---------------------------------------------------------------------------------------------------------------------
+// ----- Core types --------------------
+// ---------------------------------------------------------------------------------------------------------------------
+
+// Axis-aligned rectangle given by its origin (x, y) and extent (width, height).
+// Values are multiplied by GLOB.dpi_scaling when they are turned into scissor rects, i.e. they
+// are expressed in pre-DPI units.
+Rectangle :: struct {
+	x:      f32,
+	y:      f32,
+	width:  f32,
+	height: f32,
+}
+
+// Selects which staging array a Sub_Batch refers to and how it is drawn.
+Sub_Batch_Kind :: enum u8 {
+	Shapes, // non-indexed, white texture, mode 0
+	Text, // indexed, atlas texture, mode 0
+	SDF, // instanced unit quad, white texture, mode 1
+}
+
+// A contiguous run of same-kind draw data recorded in submission order.
+// The meaning of `offset` and `count` depends on `kind` (see the field comments).
+Sub_Batch :: struct {
+	kind:   Sub_Batch_Kind,
+	offset: u32, // Shapes: vertex offset; Text: text_batch index; SDF: primitive index
+	count:  u32, // Shapes: vertex count; Text: always 1; SDF: primitive count
+}
+
+// A group of scissors and sub-batches that is drawn together (one render pass per layer in `end`).
+// The start/len pairs describe ranges inside GLOB.tmp_sub_batches and GLOB.scissors.
+Layer :: struct {
+	bounds:          Rectangle, // pre-DPI bounds; also the offset applied to Clay bounding boxes
+	sub_batch_start: u32, // first sub-batch owned by this layer
+	sub_batch_len:   u32, // number of sub-batches owned by this layer
+	scissor_start:   u32, // first scissor owned by this layer
+	scissor_len:     u32, // number of scissors; the last one is the active scissor
+}
+
+// A clip rectangle plus the range of sub-batches (in GLOB.tmp_sub_batches) drawn under it.
+Scissor :: struct {
+	bounds:          sdl.Rect, // built from pre-DPI bounds multiplied by GLOB.dpi_scaling
+	sub_batch_start: u32, // first sub-batch drawn with this scissor
+	sub_batch_len:   u32, // number of sub-batches drawn with this scissor
+}
+
+// ---------------------------------------------------------------------------------------------------------------------
+// ----- Global state ------------------
+// ---------------------------------------------------------------------------------------------------------------------
+
+// The single renderer instance. Populated by `init`, released by `destroy`.
+GLOB: Global
+
+// All renderer state: GPU objects, per-frame staging arrays and bookkeeping.
+Global :: struct {
+	// Context installed at the top of the "c" callbacks handed to Clay.
+	odin_context:      runtime.Context,
+	pipeline_2d_base:  Pipeline_2D_Base,
+	text_cache:        Text_Cache,
+	// Per-frame layer/scissor lists; emptied by `clear_global`.
+	layers:            [dynamic]Layer,
+	scissors:          [dynamic]Scissor,
+	// Per-frame CPU staging arrays; emptied by `clear_global`.
+	tmp_shape_verts:   [dynamic]Vertex,
+	tmp_text_verts:    [dynamic]Vertex,
+	tmp_text_indices:  [dynamic]c.int,
+	tmp_text_batches:  [dynamic]TextBatch,
+	tmp_primitives:    [dynamic]Primitive,
+	tmp_sub_batches:   [dynamic]Sub_Batch,
+	tmp_uncached_text: [dynamic]^sdl_ttf.Text, // Uncached TTF_Text objects to destroy after end()
+	// Backing memory for the Clay arena, allocated in `init`.
+	clay_memory:       [^]u8,
+	// MSAA render target, created lazily by `ensure_msaa_texture`; nil until first needed.
+	msaa_texture:      ^sdl.GPUTexture,
+	// Index into `layers` of the most recently created layer.
+	curr_layer_index:  uint,
+	// High-water marks of the array lengths, maintained by `resize_global`.
+	max_layers:        int,
+	max_scissors:      int,
+	max_shape_verts:   int,
+	max_text_verts:    int,
+	max_text_indices:  int,
+	max_text_batches:  int,
+	max_primitives:    int,
+	max_sub_batches:   int,
+	// Window display scale captured in `init`.
+	dpi_scaling:       f32,
+	// Dimensions the current `msaa_texture` was created with.
+	msaa_width:        u32,
+	msaa_height:       u32,
+	sample_count:      sdl.GPUSampleCount,
+	// Highest Clay zIndex seen so far this frame (reset by `clear_global`).
+	clay_z_index:      i16,
+	// Reset to false by `clear_global`; NOTE(review): presumably set once the first render pass
+	// has cleared the target — confirm in `draw_layer`.
+	cleared:           bool,
+}
+
+// Optional settings for `init`. The zero value is what `init` uses by default.
+Init_Options :: struct {
+	// MSAA sample count. Default is ._1 (no MSAA). SDF rendering does not benefit from MSAA
+	// because SDF fragments compute coverage analytically via `smoothstep`. MSAA helps for
+	// text glyph edges and tessellated user geometry. Set to ._4 or ._8 for text-heavy UIs,
+	// or use `MSAA_MAX` to request the highest sample count the GPU supports for the swapchain
+	// format.
+	msaa_samples: sdl.GPUSampleCount,
+}
+
+// Sentinel value: when passed as msaa_samples, `init` will use the maximum MSAA sample count
+// supported by the GPU for the swapchain format (resolved through `max_sample_count`).
+MSAA_MAX :: sdl.GPUSampleCount(0xFF)
+
+// Initialize the renderer. Returns false if GPU pipeline or text engine creation fails.
+//
+// Inputs:
+// - device, window: the SDL GPU device and the window that will be rendered to.
+// - options: see `Init_Options`; `MSAA_MAX` is resolved to a concrete sample count here.
+// - allocator: backs every CPU-side array and the Clay arena. `destroy` frees the Clay arena
+//   with the allocator it is given, so pass the same one there.
+// - odin_context: context installed inside the "c" callbacks registered with Clay.
+//
+// On failure nothing is left allocated: the pipeline is destroyed if the text cache cannot be
+// created, and GLOB is only written once both have succeeded.
+@(require_results)
+init :: proc(
+	device: ^sdl.GPUDevice,
+	window: ^sdl.Window,
+	options: Init_Options = {},
+	allocator := context.allocator,
+	odin_context := context,
+) -> (
+	ok: bool,
+) {
+	min_memory_size: c.size_t = cast(c.size_t)clay.MinMemorySize()
+	resolved_sample_count := options.msaa_samples
+	if resolved_sample_count == MSAA_MAX {
+		resolved_sample_count = max_sample_count(device, window)
+	}
+
+	pipeline, pipeline_ok := create_pipeline_2d_base(device, window, resolved_sample_count)
+	if !pipeline_ok {
+		return false
+	}
+
+	text_cache, text_ok := init_text_cache(device, allocator)
+	if !text_ok {
+		destroy_pipeline_2d_base(device, &pipeline)
+		return false
+	}
+
+	// SDL reports 0.0 when the display scale cannot be queried. A zero scale would collapse every
+	// scissor rect and make text measurement divide by zero, so fall back to a neutral 1.0.
+	dpi_scaling := sdl.GetWindowDisplayScale(window)
+	if dpi_scaling <= 0 {
+		log.warnf("Failed to query window display scale, falling back to 1.0: %s", sdl.GetError())
+		dpi_scaling = 1.0
+	}
+
+	GLOB = Global {
+		layers            = make([dynamic]Layer, 0, INITIAL_LAYER_SIZE, allocator = allocator),
+		scissors          = make([dynamic]Scissor, 0, INITIAL_SCISSOR_SIZE, allocator = allocator),
+		tmp_shape_verts   = make([dynamic]Vertex, 0, BUFFER_INIT_SIZE, allocator = allocator),
+		tmp_text_verts    = make([dynamic]Vertex, 0, BUFFER_INIT_SIZE, allocator = allocator),
+		tmp_text_indices  = make([dynamic]c.int, 0, BUFFER_INIT_SIZE, allocator = allocator),
+		tmp_text_batches  = make([dynamic]TextBatch, 0, BUFFER_INIT_SIZE, allocator = allocator),
+		tmp_primitives    = make([dynamic]Primitive, 0, BUFFER_INIT_SIZE, allocator = allocator),
+		tmp_sub_batches   = make([dynamic]Sub_Batch, 0, BUFFER_INIT_SIZE, allocator = allocator),
+		tmp_uncached_text = make([dynamic]^sdl_ttf.Text, 0, 16, allocator = allocator),
+		odin_context      = odin_context,
+		dpi_scaling       = dpi_scaling,
+		clay_memory       = make([^]u8, min_memory_size, allocator = allocator),
+		sample_count      = resolved_sample_count,
+		pipeline_2d_base  = pipeline,
+		text_cache        = text_cache,
+	}
+	log.debug("Window DPI scaling:", GLOB.dpi_scaling)
+
+	// Hand Clay its arena and the window size, then register the text-measurement callback.
+	arena := clay.CreateArenaWithCapacityAndMemory(min_memory_size, GLOB.clay_memory)
+	window_width, window_height: c.int
+	sdl.GetWindowSize(window, &window_width, &window_height)
+
+	clay.Initialize(arena, {f32(window_width), f32(window_height)}, {handler = clay_error_handler})
+	clay.SetMeasureTextFunction(measure_text_clay, nil)
+	return true
+}
+
+// Records the largest length each per-frame array has reached and shrinks the array's capacity
+// down to that high-water mark.
+//
+// TODO Either every x frames nuke max values in case of edge cases where max gets set very high
+// or leave to application code to decide the right time for resize
+resize_global :: proc() {
+	GLOB.max_layers = max(GLOB.max_layers, len(GLOB.layers))
+	shrink(&GLOB.layers, GLOB.max_layers)
+
+	GLOB.max_scissors = max(GLOB.max_scissors, len(GLOB.scissors))
+	shrink(&GLOB.scissors, GLOB.max_scissors)
+
+	GLOB.max_shape_verts = max(GLOB.max_shape_verts, len(GLOB.tmp_shape_verts))
+	shrink(&GLOB.tmp_shape_verts, GLOB.max_shape_verts)
+
+	GLOB.max_text_verts = max(GLOB.max_text_verts, len(GLOB.tmp_text_verts))
+	shrink(&GLOB.tmp_text_verts, GLOB.max_text_verts)
+
+	GLOB.max_text_indices = max(GLOB.max_text_indices, len(GLOB.tmp_text_indices))
+	shrink(&GLOB.tmp_text_indices, GLOB.max_text_indices)
+
+	GLOB.max_text_batches = max(GLOB.max_text_batches, len(GLOB.tmp_text_batches))
+	shrink(&GLOB.tmp_text_batches, GLOB.max_text_batches)
+
+	GLOB.max_primitives = max(GLOB.max_primitives, len(GLOB.tmp_primitives))
+	shrink(&GLOB.tmp_primitives, GLOB.max_primitives)
+
+	GLOB.max_sub_batches = max(GLOB.max_sub_batches, len(GLOB.tmp_sub_batches))
+	shrink(&GLOB.tmp_sub_batches, GLOB.max_sub_batches)
+}
+
+// Releases everything `init` created: CPU arrays, leftover uncached text objects, the Clay arena,
+// the MSAA texture (if one was created), the 2D pipeline and the text cache.
+//
+// `allocator` is used to free the Clay arena, so it must be the allocator that was passed to
+// `init`. GLOB is not reset afterwards; call `init` again before reusing the renderer.
+destroy :: proc(device: ^sdl.GPUDevice, allocator := context.allocator) {
+	delete(GLOB.layers)
+	delete(GLOB.scissors)
+	delete(GLOB.tmp_shape_verts)
+	delete(GLOB.tmp_text_verts)
+	delete(GLOB.tmp_text_indices)
+	delete(GLOB.tmp_text_batches)
+	delete(GLOB.tmp_primitives)
+	delete(GLOB.tmp_sub_batches)
+	// Uncached text objects still pending from the last frame are destroyed before the text cache.
+	for ttf_text in GLOB.tmp_uncached_text do sdl_ttf.DestroyText(ttf_text)
+	delete(GLOB.tmp_uncached_text)
+	free(GLOB.clay_memory, allocator)
+	// The MSAA texture only exists if a frame was rendered with sample_count != ._1.
+	if GLOB.msaa_texture != nil {
+		sdl.ReleaseGPUTexture(device, GLOB.msaa_texture)
+	}
+	destroy_pipeline_2d_base(device, &GLOB.pipeline_2d_base)
+	destroy_text_cache()
+}
+
+// Internal. Resets all per-frame state; invoked at the top of `begin`.
+clear_global :: proc() {
+	// Uncached TTF_Text objects queued during the previous frame can go now that end() has
+	// submitted the draw data that referenced them.
+	for stale_text in GLOB.tmp_uncached_text {
+		sdl_ttf.DestroyText(stale_text)
+	}
+	clear(&GLOB.tmp_uncached_text)
+
+	// Empty every per-frame array (capacity is kept).
+	clear(&GLOB.layers)
+	clear(&GLOB.scissors)
+	clear(&GLOB.tmp_shape_verts)
+	clear(&GLOB.tmp_text_verts)
+	clear(&GLOB.tmp_text_indices)
+	clear(&GLOB.tmp_text_batches)
+	clear(&GLOB.tmp_primitives)
+	clear(&GLOB.tmp_sub_batches)
+
+	// Scalar bookkeeping back to its start-of-frame values.
+	GLOB.cleared = false
+	GLOB.clay_z_index = 0
+	GLOB.curr_layer_index = 0
+}
+
+// ---------------------------------------------------------------------------------------------------------------------
+// ----- Text measurement (Clay) -------
+// ---------------------------------------------------------------------------------------------------------------------
+
+// Clay text-measurement callback. Measures `text` with the font selected by `config` and reports
+// the size divided by the DPI scale. Panics if SDL_ttf cannot measure the string.
+@(private = "file")
+measure_text_clay :: proc "c" (
+	text: clay.StringSlice,
+	config: ^clay.TextElementConfig,
+	user_data: rawptr,
+) -> clay.Dimensions {
+	context = GLOB.odin_context
+
+	// The slice is turned into a temporary NUL-terminated copy, which lets 0 be passed as the length.
+	contents := string(text.chars[:text.length])
+	terminated := strings.clone_to_cstring(contents, context.temp_allocator)
+
+	pixel_width, pixel_height: c.int
+	measured := sdl_ttf.GetStringSize(
+		get_font(config.fontId, config.fontSize),
+		terminated,
+		0,
+		&pixel_width,
+		&pixel_height,
+	)
+	if !measured {
+		log.panicf("Failed to measure text: %s", sdl.GetError())
+	}
+
+	dimensions: clay.Dimensions
+	dimensions.width = f32(pixel_width) / GLOB.dpi_scaling
+	dimensions.height = f32(pixel_height) / GLOB.dpi_scaling
+	return dimensions
+}
+
+// ---------------------------------------------------------------------------------------------------------------------
+// ----- Frame lifecycle ---------------
+// ---------------------------------------------------------------------------------------------------------------------
+
+// Starts a new frame: wipes last frame's state, then records the root layer together with its
+// initial scissor. Returns the root `Layer` that primitives should be submitted to.
+begin :: proc(bounds: Rectangle) -> ^Layer {
+	clear_global()
+
+	// The root scissor covers the layer bounds, converted with the DPI scale.
+	scale := GLOB.dpi_scaling
+	root_scissor: Scissor
+	root_scissor.bounds.x = i32(bounds.x * scale)
+	root_scissor.bounds.y = i32(bounds.y * scale)
+	root_scissor.bounds.w = i32(bounds.width * scale)
+	root_scissor.bounds.h = i32(bounds.height * scale)
+	append(&GLOB.scissors, root_scissor)
+
+	append(&GLOB.layers, Layer{bounds = bounds, scissor_len = 1})
+	return &GLOB.layers[GLOB.curr_layer_index]
+}
+
+// Appends a layer that starts where `prev_layer` ends, together with its first scissor, and makes
+// it the current layer. `prev_layer` is expected to be the most recently created layer.
+//
+// Appending to GLOB.layers may reallocate the array, so layer pointers obtained earlier
+// (including `prev_layer`) should not be used after this call; use the returned pointer.
+new_layer :: proc(prev_layer: ^Layer, bounds: Rectangle) -> ^Layer {
+	// Read everything needed from prev_layer before the append below.
+	next: Layer
+	next.bounds = bounds
+	next.sub_batch_start = prev_layer.sub_batch_start + prev_layer.sub_batch_len
+	next.scissor_start = prev_layer.scissor_start + prev_layer.scissor_len
+	next.scissor_len = 1
+	append(&GLOB.layers, next)
+
+	GLOB.curr_layer_index += 1
+	log.debug("Added new layer; curr index", GLOB.curr_layer_index)
+
+	// The layer's first scissor covers its bounds and starts at the current end of the sub-batch list.
+	scale := GLOB.dpi_scaling
+	first_scissor: Scissor
+	first_scissor.sub_batch_start = u32(len(GLOB.tmp_sub_batches))
+	first_scissor.bounds.x = i32(bounds.x * scale)
+	first_scissor.bounds.y = i32(bounds.y * scale)
+	first_scissor.bounds.w = i32(bounds.width * scale)
+	first_scissor.bounds.h = i32(bounds.height * scale)
+	append(&GLOB.scissors, first_scissor)
+
+	return &GLOB.layers[GLOB.curr_layer_index]
+}
+
+// ---------------------------------------------------------------------------------------------------------------------
+// ----- Built-in primitive processing --
+// ---------------------------------------------------------------------------------------------------------------------
+
+// Queues tessellated shape vertices (colored triangles) on the layer's active scissor.
+// An empty slice is ignored.
+prepare_shape :: proc(layer: ^Layer, vertices: []Vertex) {
+	vertex_count := len(vertices)
+	if vertex_count == 0 {
+		return
+	}
+
+	first_vertex := u32(len(GLOB.tmp_shape_verts))
+	append(&GLOB.tmp_shape_verts, ..vertices)
+
+	// The active scissor is always the last one owned by the layer.
+	active_index := layer.scissor_start + layer.scissor_len - 1
+	append_or_extend_sub_batch(&GLOB.scissors[active_index], layer, .Shapes, first_vertex, u32(vertex_count))
+}
+
+// Queues a single SDF primitive on the layer's active scissor.
+prepare_sdf_primitive :: proc(layer: ^Layer, prim: Primitive) {
+	primitive_index := u32(len(GLOB.tmp_primitives))
+	append(&GLOB.tmp_primitives, prim)
+
+	// The active scissor is always the last one owned by the layer.
+	active_index := layer.scissor_start + layer.scissor_len - 1
+	append_or_extend_sub_batch(&GLOB.scissors[active_index], layer, .SDF, primitive_index, 1)
+}
+
+// Queues a text element on the layer's active scissor.
+// SDL_ttf's vertices are copied with the text position baked in, and its indices are copied as-is
+// for indexed drawing. Every atlas chunk SDL_ttf returns becomes its own TextBatch and sub-batch.
+prepare_text :: proc(layer: ^Layer, text: Text) {
+	first_chunk := sdl_ttf.GetGPUTextDrawData(text.sdl_text)
+	if first_chunk == nil do return // nil is normal for empty text
+
+	scissor := &GLOB.scissors[layer.scissor_start + layer.scissor_len - 1]
+
+	// Rounding the origin to whole physical pixels avoids the blur caused by sampling the atlas
+	// at sub-pixel offsets (and the off-by-one clip of the bottom row that accompanies it).
+	origin_x := math.round(text.position[0] * GLOB.dpi_scaling)
+	origin_y := math.round(text.position[1] * GLOB.dpi_scaling)
+
+	for chunk := first_chunk; chunk != nil; chunk = chunk.next {
+		// Capture the ranges before anything is appended for this chunk.
+		batch := TextBatch {
+			atlas_texture = chunk.atlas_texture,
+			vertex_start  = u32(len(GLOB.tmp_text_verts)),
+			vertex_count  = u32(chunk.num_vertices),
+			index_start   = u32(len(GLOB.tmp_text_indices)),
+			index_count   = u32(chunk.num_indices),
+		}
+
+		// Vertices: Y is negated, then the snapped origin is added.
+		for vertex_index in 0 ..< chunk.num_vertices {
+			glyph_pos := chunk.xy[vertex_index]
+			glyph_uv := chunk.uv[vertex_index]
+			vertex := Vertex {
+				position = {glyph_pos.x + origin_x, -glyph_pos.y + origin_y},
+				uv       = {glyph_uv.x, glyph_uv.y},
+				color    = text.color,
+			}
+			append(&GLOB.tmp_text_verts, vertex)
+		}
+
+		// Indices are taken verbatim.
+		append(&GLOB.tmp_text_indices, ..chunk.indices[:chunk.num_indices])
+
+		batch_index := u32(len(GLOB.tmp_text_batches))
+		append(&GLOB.tmp_text_batches, batch)
+
+		// Chunks may use different atlas textures, so each one gets its own sub-batch.
+		append_or_extend_sub_batch(scissor, layer, .Text, batch_index, 1)
+	}
+}
+
+// Queues a text element whose vertices are run through a 2D affine transform.
+// The high-level `text` proc uses this when rotation or a non-zero origin is requested.
+// NOTE: `transform` must already be expressed in physical (DPI-scaled) pixels — the caller scales
+// position and origin by GLOB.dpi_scaling before building it.
+prepare_text_transformed :: proc(layer: ^Layer, text: Text, transform: Transform_2D) {
+	first_chunk := sdl_ttf.GetGPUTextDrawData(text.sdl_text)
+	if first_chunk == nil do return
+
+	scissor := &GLOB.scissors[layer.scissor_start + layer.scissor_len - 1]
+
+	for chunk := first_chunk; chunk != nil; chunk = chunk.next {
+		// Capture the ranges before anything is appended for this chunk.
+		batch := TextBatch {
+			atlas_texture = chunk.atlas_texture,
+			vertex_start  = u32(len(GLOB.tmp_text_verts)),
+			vertex_count  = u32(chunk.num_vertices),
+			index_start   = u32(len(GLOB.tmp_text_indices)),
+			index_count   = u32(chunk.num_indices),
+		}
+
+		for vertex_index in 0 ..< chunk.num_vertices {
+			glyph_pos := chunk.xy[vertex_index]
+			glyph_uv := chunk.uv[vertex_index]
+			// Glyph positions from SDL_ttf are physical pixels relative to the text origin and
+			// the transform lives in the same space, so it is applied without any DPI conversion.
+			vertex := Vertex {
+				position = apply_transform(transform, {glyph_pos.x, -glyph_pos.y}),
+				uv       = {glyph_uv.x, glyph_uv.y},
+				color    = text.color,
+			}
+			append(&GLOB.tmp_text_verts, vertex)
+		}
+
+		append(&GLOB.tmp_text_indices, ..chunk.indices[:chunk.num_indices])
+
+		batch_index := u32(len(GLOB.tmp_text_batches))
+		append(&GLOB.tmp_text_batches, batch)
+
+		// One sub-batch per atlas chunk; text sub-batches are never merged.
+		append_or_extend_sub_batch(scissor, layer, .Text, batch_index, 1)
+	}
+}
+
+// Records `count` items of `kind` starting at `offset` for the given scissor.
+// When the scissor's most recent sub-batch has the same kind and ends exactly at `offset`, that
+// sub-batch is grown instead of adding a new one. Text sub-batches are never merged.
+@(private)
+append_or_extend_sub_batch :: proc(
+	scissor: ^Scissor,
+	layer: ^Layer,
+	kind: Sub_Batch_Kind,
+	offset: u32,
+	count: u32,
+) {
+	if scissor.sub_batch_len != 0 {
+		tail_index := scissor.sub_batch_start + scissor.sub_batch_len - 1
+		tail := &GLOB.tmp_sub_batches[tail_index]
+
+		same_kind := tail.kind == kind
+		mergeable := same_kind && kind != .Text && tail.offset + tail.count == offset
+		if mergeable {
+			tail.count += count
+			return
+		}
+	}
+
+	// No merge possible: start a fresh sub-batch and account for it on both owners.
+	append(&GLOB.tmp_sub_batches, Sub_Batch{kind = kind, offset = offset, count = count})
+	layer.sub_batch_len += 1
+	scissor.sub_batch_len += 1
+}
+
+// ---------------------------------------------------------------------------------------------------------------------
+// ----- Clay ------------------------
+// ---------------------------------------------------------------------------------------------------------------------
+
+// Error callback registered with Clay in `init`. Logs the error type and text; it does not abort.
+@(private = "file")
+clay_error_handler :: proc "c" (errorData: clay.ErrorData) {
+	// "c" procedures have no implicit Odin context; install the one captured by `init`.
+	context = GLOB.odin_context
+	log.error("Clay error:", errorData.errorType, errorData.errorText)
+}
+
+// Called for each Clay `RenderCommandType.Custom` render command that
+// `prepare_clay_batch` encounters.
+//
+// - `layer` is the layer the command belongs to (post-z-index promotion).
+// - `bounds` is already translated into the active layer's coordinate system
+//   and pre-DPI, matching what the built-in shape procs expect.
+// - `render_data` is Clay's `CustomRenderData` for the element, exposing
+//   `backgroundColor`, `cornerRadius`, and the `customData` pointer the caller
+//   attached to `clay.CustomElementConfig.customData`.
+//
+// The callback must not call `new_layer` or `prepare_clay_batch`.
+//
+// When `prepare_clay_batch` is given a nil callback (its default), Custom commands are skipped.
+Custom_Draw :: #type proc(layer: ^Layer, bounds: Rectangle, render_data: clay.CustomRenderData)
+
+// Input to `prepare_clay_batch`: the Clay render commands to process.
+ClayBatch :: struct {
+	// NOTE(review): not read by `prepare_clay_batch`, which offsets by the layer bounds instead.
+	bounds: Rectangle,
+	// Render commands to process, iterated in order.
+	cmds:   clay.ClayArray(clay.RenderCommand),
+}
+
+// Process Clay render commands into shape and text primitives.
+//
+// Inputs:
+// - base_layer: layer the commands start on; its bounds offset the pointer position and every
+//   command's bounding box.
+// - batch: the Clay render commands to process.
+// - mouse_wheel_delta, frame_time: forwarded to `clay.UpdateScrollContainers`.
+// - custom_draw: optional callback for `Custom` commands; nil skips them.
+//
+// Commands with a zIndex above the highest seen so far this frame are promoted to a new layer.
+// NOTE(review): `new_layer` appends to GLOB.layers, so `base_layer` held by the caller may be
+// stale after this proc returns if a promotion happened — confirm callers re-fetch their layer.
+prepare_clay_batch :: proc(
+	base_layer: ^Layer,
+	batch: ^ClayBatch,
+	mouse_wheel_delta: [2]f32,
+	frame_time: f32 = 0,
+	custom_draw: Custom_Draw = nil,
+) {
+	mouse_pos: [2]f32
+	mouse_flags := sdl.GetMouseState(&mouse_pos.x, &mouse_pos.y)
+
+	// Update clay internals
+	// The pointer position is made relative to the base layer's origin.
+	clay.SetPointerState(
+		clay.Vector2{mouse_pos.x - base_layer.bounds.x, mouse_pos.y - base_layer.bounds.y},
+		.LEFT in mouse_flags,
+	)
+	clay.UpdateScrollContainers(true, mouse_wheel_delta, frame_time)
+
+	layer := base_layer
+
+	// Parse render commands
+	for i in 0 ..< int(batch.cmds.length) {
+		render_command := clay.RenderCommandArray_Get(&batch.cmds, cast(i32)i)
+
+		// Translate bounding box of the primitive by the layer position
+		bounds := Rectangle {
+			x      = render_command.boundingBox.x + layer.bounds.x,
+			y      = render_command.boundingBox.y + layer.bounds.y,
+			width  = render_command.boundingBox.width,
+			height = render_command.boundingBox.height,
+		}
+
+		if render_command.zIndex > GLOB.clay_z_index {
+			log.debug("Higher zIndex found, creating new layer & setting z_index to", render_command.zIndex)
+			layer = new_layer(layer, bounds)
+			// Update bounds to new layer offset
+			// NOTE(review): the new layer's bounds already contain this command's bounding-box
+			// position, so the two lines below add that position a second time — confirm intended.
+			bounds.x = render_command.boundingBox.x + layer.bounds.x
+			bounds.y = render_command.boundingBox.y + layer.bounds.y
+			GLOB.clay_z_index = render_command.zIndex
+		}
+
+		switch (render_command.commandType) {
+		case clay.RenderCommandType.None:
+		case clay.RenderCommandType.Text:
+			render_data := render_command.renderData.text
+			txt := string(render_data.stringContents.chars[:render_data.stringContents.length])
+			c_text := strings.clone_to_cstring(txt, context.temp_allocator)
+			// Clay render-command IDs are derived via Clay's internal HashNumber (Jenkins-family)
+			// and namespaced with .Clay so they can never collide with user-provided custom text IDs.
+			sdl_text := cache_get_or_update(
+				Cache_Key{render_command.id, .Clay},
+				c_text,
+				get_font(render_data.fontId, render_data.fontSize),
+			)
+			prepare_text(layer, Text{sdl_text, {bounds.x, bounds.y}, color_from_clay(render_data.textColor)})
+		case clay.RenderCommandType.Image:
+			// Image commands are currently ignored.
+		case clay.RenderCommandType.ScissorStart:
+			// Zero-area clip rectangles are skipped entirely.
+			if bounds.width == 0 || bounds.height == 0 do continue
+
+			curr_scissor := &GLOB.scissors[layer.scissor_start + layer.scissor_len - 1]
+
+			if curr_scissor.sub_batch_len != 0 {
+				// Scissor has some content, need to make a new scissor
+				new := Scissor {
+					sub_batch_start = curr_scissor.sub_batch_start + curr_scissor.sub_batch_len,
+					bounds          = sdl.Rect {
+						c.int(bounds.x * GLOB.dpi_scaling),
+						c.int(bounds.y * GLOB.dpi_scaling),
+						c.int(bounds.width * GLOB.dpi_scaling),
+						c.int(bounds.height * GLOB.dpi_scaling),
+					},
+				}
+				append(&GLOB.scissors, new)
+				layer.scissor_len += 1
+			} else {
+				// Nothing has been drawn under the current scissor yet, so just retarget it.
+				curr_scissor.bounds = sdl.Rect {
+					c.int(bounds.x * GLOB.dpi_scaling),
+					c.int(bounds.y * GLOB.dpi_scaling),
+					c.int(bounds.width * GLOB.dpi_scaling),
+					c.int(bounds.height * GLOB.dpi_scaling),
+				}
+			}
+		case clay.RenderCommandType.ScissorEnd:
+			// NOTE(review): no-op — the active scissor keeps the clip rectangle from the last
+			// ScissorStart, so later commands on this layer stay clipped to it. Confirm intended.
+		case clay.RenderCommandType.Rectangle:
+			render_data := render_command.renderData.rectangle
+			cr := render_data.cornerRadius
+			color := color_from_clay(render_data.backgroundColor)
+			radii := [4]f32{cr.topLeft, cr.topRight, cr.bottomRight, cr.bottomLeft}
+
+			// Square corners take the plain rectangle path.
+			if radii == {0, 0, 0, 0} {
+				rectangle(layer, bounds, color)
+			} else {
+				rectangle_corners(layer, bounds, radii, color)
+			}
+		case clay.RenderCommandType.Border:
+			render_data := render_command.renderData.border
+			cr := render_data.cornerRadius
+			color := color_from_clay(render_data.color)
+			// Only the top border width is read; it is used as the thickness for all four sides.
+			thickness := f32(render_data.width.top)
+			radii := [4]f32{cr.topLeft, cr.topRight, cr.bottomRight, cr.bottomLeft}
+
+			if radii == {0, 0, 0, 0} {
+				rectangle_lines(layer, bounds, color, thickness)
+			} else {
+				rectangle_corners_lines(layer, bounds, radii, color, thickness)
+			}
+		case clay.RenderCommandType.Custom: if custom_draw != nil {
+					custom_draw(layer, bounds, render_command.renderData.custom)
+				}
+		}
+	}
+}
+
+// Render primitives. clear_color is the background fill before any layers are drawn.
+//
+// Sequence: acquire a command buffer, upload the staged data in a copy pass, acquire the
+// swapchain texture, draw every layer, resolve MSAA if enabled, then submit.
+// Panics if the command buffer or swapchain texture cannot be acquired, or if submission fails.
+// When the swapchain texture comes back nil, the frame is submitted without drawing.
+end :: proc(device: ^sdl.GPUDevice, window: ^sdl.Window, clear_color: Color = BLACK) {
+	cmd_buffer := sdl.AcquireGPUCommandBuffer(device)
+	if cmd_buffer == nil {
+		log.panicf("Failed to acquire GPU command buffer: %s", sdl.GetError())
+	}
+
+	// Upload primitives to GPU
+	copy_pass := sdl.BeginGPUCopyPass(cmd_buffer)
+	upload(device, copy_pass)
+	sdl.EndGPUCopyPass(copy_pass)
+
+	swapchain_texture: ^sdl.GPUTexture
+	width, height: u32
+	if !sdl.WaitAndAcquireGPUSwapchainTexture(cmd_buffer, window, &swapchain_texture, &width, &height) {
+		log.panicf("Failed to acquire swapchain texture: %s", sdl.GetError())
+	}
+
+	if swapchain_texture == nil {
+		// Window is minimized or not visible — submit and skip this frame
+		if !sdl.SubmitGPUCommandBuffer(cmd_buffer) {
+			log.panicf("Failed to submit GPU command buffer (minimized window): %s", sdl.GetError())
+		}
+		return
+	}
+
+	// With MSAA enabled, layers render into the multisampled texture instead of the swapchain.
+	use_msaa := GLOB.sample_count != ._1
+	render_texture := swapchain_texture
+
+	if use_msaa {
+		// (Re)creates the MSAA texture whenever the swapchain size has changed.
+		ensure_msaa_texture(device, sdl.GetGPUSwapchainTextureFormat(device, window), width, height)
+		render_texture = GLOB.msaa_texture
+	}
+
+	clear_color_f32 := color_to_f32(clear_color)
+
+	// Draw layers. One render pass per layer; sub-batches draw in submission order within each scissor.
+	for &layer, index in GLOB.layers {
+		log.debug("Drawing layer", index)
+		draw_layer(device, window, cmd_buffer, render_texture, width, height, clear_color_f32, &layer)
+	}
+
+	// Resolve MSAA render texture to the swapchain.
+	// The pass has no draws: it loads the multisampled contents and resolves them on store.
+	if use_msaa {
+		resolve_pass := sdl.BeginGPURenderPass(
+			cmd_buffer,
+			&sdl.GPUColorTargetInfo {
+				texture = render_texture,
+				load_op = .LOAD,
+				store_op = .RESOLVE,
+				resolve_texture = swapchain_texture,
+			},
+			1,
+			nil,
+		)
+		sdl.EndGPURenderPass(resolve_pass)
+	}
+
+	if !sdl.SubmitGPUCommandBuffer(cmd_buffer) {
+		log.panicf("Failed to submit GPU command buffer: %s", sdl.GetError())
+	}
+}
+
+// ---------------------------------------------------------------------------------------------------------------------
+// ----- MSAA --------------------------
+// ---------------------------------------------------------------------------------------------------------------------
+
+// Returns the highest MSAA sample count (8, 4 or 2) that the GPU supports for the window's
+// swapchain texture format, or ._1 when none of them is supported.
+max_sample_count :: proc(device: ^sdl.GPUDevice, window: ^sdl.Window) -> sdl.GPUSampleCount {
+	swapchain_format := sdl.GetGPUSwapchainTextureFormat(device, window)
+
+	// Candidates are probed from highest to lowest; the first supported one wins.
+	candidates := [3]sdl.GPUSampleCount{._8, ._4, ._2}
+	best := sdl.GPUSampleCount._1
+	for candidate in candidates {
+		if sdl.GPUTextureSupportsSampleCount(device, swapchain_format, candidate) {
+			best = candidate
+			break
+		}
+	}
+	return best
+}
+
+// Makes sure GLOB.msaa_texture exists with the requested dimensions.
+// An existing texture of the right size is kept; otherwise it is released and a new color-target
+// texture is created with GLOB.sample_count samples. Panics if creation fails.
+@(private = "file")
+ensure_msaa_texture :: proc(device: ^sdl.GPUDevice, format: sdl.GPUTextureFormat, width, height: u32) {
+	if GLOB.msaa_texture != nil {
+		size_matches := GLOB.msaa_width == width && GLOB.msaa_height == height
+		if size_matches {
+			return
+		}
+		// Size changed: drop the old texture before creating its replacement.
+		sdl.ReleaseGPUTexture(device, GLOB.msaa_texture)
+	}
+
+	create_info := sdl.GPUTextureCreateInfo {
+		type                 = .D2,
+		format               = format,
+		usage                = {.COLOR_TARGET},
+		width                = width,
+		height               = height,
+		layer_count_or_depth = 1,
+		num_levels           = 1,
+		sample_count         = GLOB.sample_count,
+	}
+	GLOB.msaa_texture = sdl.CreateGPUTexture(device, create_info)
+	if GLOB.msaa_texture == nil {
+		log.panicf("Failed to create MSAA texture (%dx%d): %s", width, height, sdl.GetError())
+	}
+
+	GLOB.msaa_width = width
+	GLOB.msaa_height = height
+}
+
+// ---------------------------------------------------------------------------------------------------------------------
+// ----- Utility -----------------------
+// ---------------------------------------------------------------------------------------------------------------------
+
+// Build an orthographic projection matrix.
+// x in [left, right] and y in [bottom, top] are mapped to [-1, 1]; z = -near maps to -1 and
+// z = -far maps to +1. No check is made for degenerate (zero-extent) ranges.
+ortho_rh :: proc(left: f32, right: f32, bottom: f32, top: f32, near: f32, far: f32) -> matrix[4, 4]f32 {
+	extent_x := right - left
+	extent_y := top - bottom
+	extent_z := far - near
+	return matrix[4, 4]f32{
+		2.0 / extent_x, 0.0, 0.0, -(right + left) / extent_x,
+		0.0, 2.0 / extent_y, 0.0, -(top + bottom) / extent_y,
+		0.0, 0.0, -2.0 / extent_z, -(far + near) / extent_z,
+		0.0, 0.0, 0.0, 1.0,
+	}
+}
+
+// Rendering mode selector sent to the vertex stage as part of Vertex_Uniforms.
+// Backed by u32 with explicit values, since the number itself is what gets uploaded.
+Draw_Mode :: enum u32 {
+	Tessellated = 0,
+	SDF         = 1,
+}
+
+// Uniform block pushed to vertex uniform slot 0 by push_globals.
+// NOTE(review): field order and sizes presumably mirror the shader-side declaration — confirm
+// against the shader before reordering.
+Vertex_Uniforms :: struct {
+	projection: matrix[4, 4]f32, // orthographic projection built by ortho_rh
+	scale:      f32, // filled from GLOB.dpi_scaling
+	mode:       Draw_Mode, // rendering mode selector
+}
+
+// Upload the per-draw globals (projection, DPI scale, rendering mode) as one uniform block in
+// vertex uniform slot 0.
+//
+// Parameters:
+//   cmd_buffer    – command buffer that receives the uniform data.
+//   width, height – extent of the projected area; (0, 0) is the top-left corner.
+//   mode          – rendering mode selector, defaults to .Tessellated.
+push_globals :: proc(
+	cmd_buffer: ^sdl.GPUCommandBuffer,
+	width: f32,
+	height: f32,
+	mode: Draw_Mode = .Tessellated,
+) {
+	uniforms: Vertex_Uniforms
+	// Argument order is (left, right, bottom, top, near, far); bottom = height and top = 0 flips Y
+	// so that y grows downwards from the top edge.
+	uniforms.projection = ortho_rh(0.0, f32(width), f32(height), 0.0, -1.0, 1.0)
+	uniforms.scale = GLOB.dpi_scaling
+	uniforms.mode = mode
+
+	sdl.PushGPUVertexUniformData(cmd_buffer, 0, &uniforms, size_of(Vertex_Uniforms))
+}
+
+// ---------------------------------------------------------------------------------------------------------------------
+// ----- Buffer ------------------------
+// ---------------------------------------------------------------------------------------------------------------------
+
+// A GPU buffer paired with an upload transfer buffer of the same byte size.
+// Created by create_buffer, resized by grow_buffer_if_needed, released by destroy_buffer.
+Buffer :: struct {
+	gpu:      ^sdl.GPUBuffer, // device-side buffer
+	transfer: ^sdl.GPUTransferBuffer, // upload staging buffer, created with usage = .UPLOAD
+	size:     u32, // size in bytes that both buffers were created with
+}
+
+// Create a GPU buffer together with an upload transfer buffer of the same size.
+//
+// Parameters:
+//   device    – GPU device that owns the buffers.
+//   size      – size in bytes of both buffers.
+//   gpu_usage – usage flags for the GPU-side buffer.
+//
+// Returns the populated Buffer and true on success. On failure the error is logged, anything
+// already created is released, and a zero Buffer with false is returned.
+@(require_results)
+create_buffer :: proc(
+	device: ^sdl.GPUDevice,
+	size: u32,
+	gpu_usage: sdl.GPUBufferUsageFlags,
+) -> (
+	buffer: Buffer,
+	ok: bool,
+) {
+	gpu_info := sdl.GPUBufferCreateInfo{usage = gpu_usage, size = size}
+	gpu_handle := sdl.CreateGPUBuffer(device, gpu_info)
+	if gpu_handle == nil {
+		log.errorf("Failed to create GPU buffer (size=%d): %s", size, sdl.GetError())
+		return {}, false
+	}
+
+	transfer_info := sdl.GPUTransferBufferCreateInfo{usage = .UPLOAD, size = size}
+	transfer_handle := sdl.CreateGPUTransferBuffer(device, transfer_info)
+	if transfer_handle == nil {
+		// Do not leak the GPU buffer that was already created.
+		sdl.ReleaseGPUBuffer(device, gpu_handle)
+		log.errorf("Failed to create GPU transfer buffer (size=%d): %s", size, sdl.GetError())
+		return {}, false
+	}
+
+	buffer = Buffer{gpu = gpu_handle, transfer = transfer_handle, size = size}
+	ok = true
+	return
+}
+
+// Ensure `buffer` can hold at least `new_size` bytes.
+// When the current size is too small, both underlying buffers are destroyed and re-created at
+// exactly `new_size`; existing contents are not carried over. Panics if re-creation fails.
+grow_buffer_if_needed :: proc(
+	device: ^sdl.GPUDevice,
+	buffer: ^Buffer,
+	new_size: u32,
+	gpu_usage: sdl.GPUBufferUsageFlags,
+) {
+	// Already large enough: nothing to do.
+	if new_size <= buffer.size do return
+
+	log.debug("Resizing buffer from", buffer.size, "to", new_size)
+	destroy_buffer(device, buffer)
+
+	gpu_info := sdl.GPUBufferCreateInfo{usage = gpu_usage, size = new_size}
+	buffer.gpu = sdl.CreateGPUBuffer(device, gpu_info)
+	if buffer.gpu == nil {
+		log.panicf("Failed to grow GPU buffer (new_size=%d): %s", new_size, sdl.GetError())
+	}
+
+	transfer_info := sdl.GPUTransferBufferCreateInfo{usage = .UPLOAD, size = new_size}
+	buffer.transfer = sdl.CreateGPUTransferBuffer(device, transfer_info)
+	if buffer.transfer == nil {
+		log.panicf("Failed to grow GPU transfer buffer (new_size=%d): %s", new_size, sdl.GetError())
+	}
+
+	buffer.size = new_size
+}
+
+// Release both underlying buffers and reset `buffer` to its zero value.
+//
+// Clearing the handles matters because other code in this package treats `buffer.gpu != nil` as
+// "this buffer is owned and must be destroyed"; leaving stale pointers behind would allow a second
+// destroy to release handles that are no longer valid. Resetting `size` to 0 also makes a later
+// grow_buffer_if_needed re-create the buffers instead of trusting a stale size.
+destroy_buffer :: proc(device: ^sdl.GPUDevice, buffer: ^Buffer) {
+	sdl.ReleaseGPUBuffer(device, buffer.gpu)
+	sdl.ReleaseGPUTransferBuffer(device, buffer.transfer)
+	buffer^ = {}
+}
+
+// ---------------------------------------------------------------------------------------------------------------------
+// ----- Transform ------------------------
+// ---------------------------------------------------------------------------------------------------------------------
+
+// 2x3 affine transform for 2D pivot-rotation.
+// Used internally by rotation-aware drawing procs.
+// apply_transform evaluates it as:
+//   x' = m00 * x + m01 * y + tx
+//   y' = m10 * x + m11 * y + ty
+Transform_2D :: struct {
+	m00, m01: f32, // row 0: rotation/scale
+	m10, m11: f32, // row 1: rotation/scale
+	tx, ty:   f32, // translation
+}
+
+// Build a transform that rotates a shape around a pivot and places that pivot in world space.
+//
+// Semantics (raylib-style):
+//   The local-space point equal to `origin` ends up exactly at `position`; every other point is
+//   rotated around it.
+//
+// Formula:  p_world = position + R(θ) · (p_local - origin)
+//           with R(θ) = [cos θ, -sin θ; sin θ, cos θ]
+//
+// Parameters:
+//   position     – world-space position where the pivot lands.
+//   origin       – pivot point in local space (measured from the shape's natural reference point).
+//   rotation_deg – rotation angle in degrees.
+//                  NOTE(review): R(θ) is counter-clockwise in a y-up frame; under a y-down
+//                  projection it would appear clockwise on screen — confirm the intended wording.
+//
+build_pivot_rotation :: proc(position: [2]f32, origin: [2]f32, rotation_deg: f32) -> Transform_2D {
+	theta := math.to_radians(rotation_deg)
+	c := math.cos(theta)
+	s := math.sin(theta)
+
+	// R(θ) · origin; subtracting it from `position` folds the "- origin" term into the translation.
+	rotated_origin := [2]f32{c * origin.x - s * origin.y, s * origin.x + c * origin.y}
+
+	result: Transform_2D
+	result.m00 = c
+	result.m01 = -s
+	result.m10 = s
+	result.m11 = c
+	result.tx = position.x - rotated_origin.x
+	result.ty = position.y - rotated_origin.y
+	return result
+}
+
+// Map a local-space point through `transform` and return the resulting world-space point.
+apply_transform :: #force_inline proc(transform: Transform_2D, point: [2]f32) -> [2]f32 {
+	world_x := transform.m00 * point.x + transform.m01 * point.y + transform.tx
+	world_y := transform.m10 * point.x + transform.m11 * point.y + transform.ty
+	return {world_x, world_y}
+}
+
+// Cheap check for callers to run BEFORE building a transform.
+// Returns false only when both the origin and the rotation are exactly zero, i.e. when the
+// transform would be the identity and can be skipped.
+needs_transform :: #force_inline proc(origin: [2]f32, rotation: f32) -> bool {
+	if rotation != 0 do return true
+	return origin.x != 0 || origin.y != 0
+}
+
+// ---------------------------------------------------------------------------------------------------------------------
+// ----- Procedure Groups ------------------------
+// ---------------------------------------------------------------------------------------------------------------------
+
+// Anchor helpers: one procedure group per anchor point, each dispatching on the argument types to
+// the rectangle, triangle, or text variant.
+
+center_of :: proc{center_of_rectangle, center_of_triangle, center_of_text}
+
+// Top row.
+top_left_of :: proc{top_left_of_rectangle, top_left_of_triangle, top_left_of_text}
+top_of :: proc{top_of_rectangle, top_of_triangle, top_of_text}
+top_right_of :: proc{top_right_of_rectangle, top_right_of_triangle, top_right_of_text}
+
+// Middle row.
+left_of :: proc{left_of_rectangle, left_of_triangle, left_of_text}
+right_of :: proc{right_of_rectangle, right_of_triangle, right_of_text}
+
+// Bottom row.
+bottom_left_of :: proc{bottom_left_of_rectangle, bottom_left_of_triangle, bottom_left_of_text}
+bottom_of :: proc{bottom_of_rectangle, bottom_of_triangle, bottom_of_text}
+bottom_right_of :: proc{bottom_right_of_rectangle, bottom_right_of_triangle, bottom_right_of_text}
@@ -0,0 +1,351 @@
+package examples
+
+import "../../draw"
+import "../../vendor/clay"
+import "core:math"
+import "core:os"
+import sdl "vendor:sdl3"
+
+// Raw bytes of the JetBrains Mono Regular font file, embedded at compile time via #load.
+JETBRAINS_MONO_REGULAR_RAW :: #load("fonts/JetBrainsMono-Regular.ttf")
+// Font id shared by the examples. Every example that draws text overwrites it with the result of
+// draw.register_font before its main loop.
+JETBRAINS_MONO_REGULAR: draw.Font_Id = max(draw.Font_Id) // Max so we crash if registration is forgotten
+
+// Shapes demo: opens a 500x500 window and, every frame, draws a set of static primitives followed
+// by rotating ones. Runs until a quit event arrives. The process exits with status 1 when SDL
+// init, claiming the window for the GPU device, or draw.init fails.
+hellope_shapes :: proc() {
+	WINDOW_SIZE :: 500
+	MOON_ORBIT_RADIUS :: 40
+
+	if !sdl.Init({.VIDEO}) do os.exit(1)
+	window := sdl.CreateWindow("Hellope!", WINDOW_SIZE, WINDOW_SIZE, {.HIGH_PIXEL_DENSITY})
+	gpu := sdl.CreateGPUDevice(draw.PLATFORM_SHADER_FORMAT, true, nil)
+	if !sdl.ClaimWindowForGPUDevice(gpu, window) do os.exit(1)
+	if !draw.init(gpu, window) do os.exit(1)
+
+	// Shared rotation angle in degrees; advanced by one degree per frame.
+	angle_deg: f32 = 0
+
+	for {
+		defer free_all(context.temp_allocator)
+
+		event: sdl.Event
+		for sdl.PollEvent(&event) {
+			if event.type == .QUIT do return
+		}
+
+		angle_deg += 1
+		layer := draw.begin({width = WINDOW_SIZE, height = WINDOW_SIZE})
+
+		// Full-window background.
+		draw.rectangle(layer, {0, 0, WINDOW_SIZE, WINDOW_SIZE}, {40, 40, 40, 255})
+
+		// ----- Static shapes (no rotation) -----
+		draw.rectangle(layer, {20, 20, 200, 120}, {80, 120, 200, 255})
+		draw.rectangle_lines(layer, {20, 20, 200, 120}, draw.WHITE, thickness = 2)
+		draw.rectangle(layer, {240, 20, 240, 120}, {200, 80, 80, 255}, roundness = 0.3)
+		draw.rectangle_gradient(
+			layer,
+			{20, 160, 460, 60},
+			{255, 0, 0, 255},
+			{0, 255, 0, 255},
+			{0, 0, 255, 255},
+			{255, 255, 0, 255},
+		)
+
+		// ----- Rotating shapes -----
+
+		// Filled rectangle plus outline, both spinning around the rectangle's center.
+		spinning_rect := draw.Rectangle{100, 320, 80, 50}
+		draw.rectangle(
+			layer,
+			spinning_rect,
+			{100, 200, 100, 255},
+			origin = draw.center_of(spinning_rect),
+			rotation = angle_deg,
+		)
+		draw.rectangle_lines(
+			layer,
+			spinning_rect,
+			draw.WHITE,
+			thickness = 2,
+			origin = draw.center_of(spinning_rect),
+			rotation = angle_deg,
+		)
+
+		// Rounded rectangle spinning around its center.
+		spinning_rounded := draw.Rectangle{230, 300, 100, 80}
+		draw.rectangle(
+			layer,
+			spinning_rounded,
+			{200, 100, 200, 255},
+			roundness = 0.4,
+			origin = draw.center_of(spinning_rounded),
+			rotation = angle_deg,
+		)
+
+		// Tilted ellipse spinning in place.
+		draw.ellipse(layer, {410, 340}, 50, 30, {255, 200, 50, 255}, rotation = angle_deg)
+
+		// Moon orbiting a stationary planet: the moon's pivot is offset by the orbit radius.
+		planet_center := [2]f32{100, 450}
+		moon_center := planet_center + {0, -MOON_ORBIT_RADIUS}
+		draw.circle(layer, planet_center, 8, {200, 200, 200, 255})
+		draw.circle(
+			layer,
+			moon_center,
+			5,
+			{100, 150, 255, 255},
+			origin = {0, MOON_ORBIT_RADIUS},
+			rotation = angle_deg,
+		)
+
+		// Ring arc spinning in place.
+		draw.ring(layer, {250, 450}, 15, 30, 0, 270, {100, 100, 220, 255}, rotation = angle_deg)
+
+		// Triangle spinning around its center.
+		tri_a := [2]f32{350, 420}
+		tri_b := [2]f32{420, 480}
+		tri_c := [2]f32{340, 480}
+		draw.triangle(
+			layer,
+			tri_a,
+			tri_b,
+			tri_c,
+			{220, 180, 60, 255},
+			origin = draw.center_of(tri_a, tri_b, tri_c),
+			rotation = angle_deg,
+		)
+
+		// Filled hexagon plus outline spinning in place (no origin offset is passed).
+		draw.polygon(layer, {460, 450}, 6, 30, {180, 100, 220, 255}, rotation = angle_deg)
+		draw.polygon_lines(layer, {460, 450}, 6, 30, draw.WHITE, rotation = angle_deg, thickness = 2)
+
+		draw.end(gpu, window)
+	}
+}
+
+// Text demo: opens a 600x600 window and, every frame, draws cached, uncached, measured, and
+// rotating text. Runs until a quit event arrives. The process exits with status 1 when SDL init,
+// claiming the window for the GPU device, or draw.init fails.
+hellope_text :: proc() {
+	WINDOW_SIZE :: 600
+	CENTER_X :: 300
+	FONT_SIZE :: u16(24)
+
+	// Ids passed to draw.text so that cached text objects are reused across frames.
+	HELLOPE_ID :: 1
+	ROTATING_SENTENCE_ID :: 2
+	MEASURED_ID :: 3
+	CORNER_SPIN_ID :: 4
+
+	// Strings that are both drawn and measured.
+	TITLE_TEXT :: "Hellope!"
+	SENTENCE_TEXT :: "Hellope World!"
+	MEASURED_TEXT :: "Measured!"
+
+	if !sdl.Init({.VIDEO}) do os.exit(1)
+	window := sdl.CreateWindow("Hellope!", WINDOW_SIZE, WINDOW_SIZE, {.HIGH_PIXEL_DENSITY})
+	gpu := sdl.CreateGPUDevice(draw.PLATFORM_SHADER_FORMAT, true, nil)
+	if !sdl.ClaimWindowForGPUDevice(gpu, window) do os.exit(1)
+	if !draw.init(gpu, window) do os.exit(1)
+	JETBRAINS_MONO_REGULAR = draw.register_font(JETBRAINS_MONO_REGULAR_RAW)
+
+	// Shared rotation angle in degrees; advanced by half a degree per frame.
+	angle_deg: f32 = 0
+
+	for {
+		defer free_all(context.temp_allocator)
+
+		event: sdl.Event
+		for sdl.PollEvent(&event) {
+			if event.type == .QUIT do return
+		}
+
+		angle_deg += 0.5
+		layer := draw.begin({width = WINDOW_SIZE, height = WINDOW_SIZE})
+
+		// Grey full-window background.
+		draw.rectangle(layer, {0, 0, WINDOW_SIZE, WINDOW_SIZE}, {127, 127, 127, 255})
+
+		// ----- Text API demos -----
+
+		// Cached text (has an id), centered on its anchor position.
+		draw.text(
+			layer,
+			TITLE_TEXT,
+			{CENTER_X, 80},
+			JETBRAINS_MONO_REGULAR,
+			FONT_SIZE,
+			color = draw.WHITE,
+			origin = draw.center_of(TITLE_TEXT, JETBRAINS_MONO_REGULAR, FONT_SIZE),
+			id = HELLOPE_ID,
+		)
+
+		// Multi-word sentence rotating around its own center.
+		draw.text(
+			layer,
+			SENTENCE_TEXT,
+			{CENTER_X, 250},
+			JETBRAINS_MONO_REGULAR,
+			FONT_SIZE,
+			color = {255, 200, 50, 255},
+			origin = draw.center_of(SENTENCE_TEXT, JETBRAINS_MONO_REGULAR, FONT_SIZE),
+			rotation = angle_deg,
+			id = ROTATING_SENTENCE_ID,
+		)
+
+		// Uncached text (no id) anchored at its top-left corner: the simplest call form.
+		draw.text(
+			layer,
+			"Top-left anchored",
+			{20, 450},
+			JETBRAINS_MONO_REGULAR,
+			FONT_SIZE,
+			color = draw.WHITE,
+		)
+
+		// Manual layout: measure first, draw a backing box, then draw the text on top of it.
+		measured := draw.measure_text(MEASURED_TEXT, JETBRAINS_MONO_REGULAR, FONT_SIZE)
+		draw.rectangle(layer, {CENTER_X - measured.x / 2, 380, measured.x, measured.y}, {60, 60, 60, 200})
+		draw.text(
+			layer,
+			MEASURED_TEXT,
+			{CENTER_X, 380},
+			JETBRAINS_MONO_REGULAR,
+			FONT_SIZE,
+			color = draw.WHITE,
+			origin = draw.top_of(MEASURED_TEXT, JETBRAINS_MONO_REGULAR, FONT_SIZE),
+			id = MEASURED_ID,
+		)
+
+		// Rotation without an origin offset: the text spins around its top-left corner.
+		draw.text(
+			layer,
+			"Corner spin",
+			{150, 530},
+			JETBRAINS_MONO_REGULAR,
+			FONT_SIZE,
+			color = {100, 200, 255, 255},
+			rotation = angle_deg,
+			id = CORNER_SPIN_ID,
+		)
+
+		draw.end(gpu, window)
+	}
+}
+
+// Clay demo: opens a 500x500 window and, every frame, lays out a single centered text element
+// with clay and hands the resulting render commands to the draw package. Runs until a quit event
+// arrives. The process exits with status 1 when SDL init, claiming the window for the GPU
+// device, or draw.init fails.
+hellope_clay :: proc() {
+	WINDOW_SIZE :: 500
+
+	if !sdl.Init({.VIDEO}) do os.exit(1)
+	window := sdl.CreateWindow("Hellope!", WINDOW_SIZE, WINDOW_SIZE, {.HIGH_PIXEL_DENSITY})
+	gpu := sdl.CreateGPUDevice(draw.PLATFORM_SHADER_FORMAT, true, nil)
+	if !sdl.ClaimWindowForGPUDevice(gpu, window) do os.exit(1)
+	if !draw.init(gpu, window) do os.exit(1)
+	JETBRAINS_MONO_REGULAR = draw.register_font(JETBRAINS_MONO_REGULAR_RAW)
+
+	// Declared outside the loop because clay.Text receives a pointer to it.
+	label_config: clay.TextElementConfig
+	label_config.fontId = JETBRAINS_MONO_REGULAR
+	label_config.fontSize = 36
+	label_config.textColor = {255, 255, 255, 255}
+
+	for {
+		defer free_all(context.temp_allocator)
+
+		event: sdl.Event
+		for sdl.PollEvent(&event) {
+			if event.type == .QUIT do return
+		}
+
+		layer := draw.begin({width = WINDOW_SIZE, height = WINDOW_SIZE})
+
+		// Lay out against the layer's bounds.
+		clay.SetLayoutDimensions({width = layer.bounds.width, height = layer.bounds.height})
+		clay.BeginLayout()
+		if clay.UI()(
+		{
+			id = clay.ID("outer"),
+			layout = {
+				sizing = {clay.SizingGrow({}), clay.SizingGrow({})},
+				childAlignment = {x = .Center, y = .Center},
+			},
+			backgroundColor = {127, 127, 127, 255},
+		},
+		) {
+			clay.Text("Hellope!", &label_config)
+		}
+
+		// Hand clay's render commands to the draw package.
+		batch := draw.ClayBatch {
+			bounds = layer.bounds,
+			cmds   = clay.EndLayout(),
+		}
+		draw.prepare_clay_batch(layer, &batch, {0, 0})
+
+		draw.end(gpu, window)
+	}
+}
+
+// Custom-element demo: opens a 600x400 window and, every frame, lays out a title plus two clay
+// custom elements that are rendered as animated gauges through a custom draw callback. Runs until
+// a quit event arrives. The process exits with status 1 when SDL init, claiming the window for
+// the GPU device, or draw.init fails.
+hellope_custom :: proc() {
+	WINDOW_WIDTH :: 600
+	WINDOW_HEIGHT :: 400
+
+	// Payload attached to a clay custom element through `customData`.
+	Gauge :: struct {
+		value: f32,
+		color: draw.Color,
+	}
+
+	// Callback given to draw.prepare_clay_batch; renders one Gauge inside `bounds`.
+	draw_custom :: proc(layer: ^draw.Layer, bounds: draw.Rectangle, render_data: clay.CustomRenderData) {
+		payload := cast(^Gauge)render_data.customData
+
+		// Track, using the element's clay background color.
+		draw.rectangle(layer, bounds, draw.color_from_clay(render_data.backgroundColor), roundness = 0.25)
+
+		// Filled portion: the element bounds with the width scaled by the gauge value.
+		filled := bounds
+		filled.width *= payload.value
+		draw.rectangle(layer, filled, payload.color, roundness = 0.25)
+
+		// Outline.
+		draw.rectangle_lines(layer, bounds, draw.WHITE, thickness = 2, roundness = 0.25)
+	}
+
+	if !sdl.Init({.VIDEO}) do os.exit(1)
+	window := sdl.CreateWindow("Hellope Custom!", WINDOW_WIDTH, WINDOW_HEIGHT, {.HIGH_PIXEL_DENSITY})
+	gpu := sdl.CreateGPUDevice(draw.PLATFORM_SHADER_FORMAT, true, nil)
+	if !sdl.ClaimWindowForGPUDevice(gpu, window) do os.exit(1)
+	if !draw.init(gpu, window) do os.exit(1)
+	JETBRAINS_MONO_REGULAR = draw.register_font(JETBRAINS_MONO_REGULAR_RAW)
+
+	// Declared outside the loop because clay.Text receives a pointer to it.
+	title_config: clay.TextElementConfig
+	title_config.fontId = JETBRAINS_MONO_REGULAR
+	title_config.fontSize = 24
+	title_config.textColor = {255, 255, 255, 255}
+
+	// The gauges outlive the loop body because clay stores pointers to them.
+	gauge := Gauge {
+		value = 0.73,
+		color = {50, 200, 100, 255},
+	}
+	gauge2 := Gauge {
+		value = 0.45,
+		color = {200, 100, 50, 255},
+	}
+
+	// Animation phase, advanced by one per frame.
+	phase: f32 = 0
+
+	for {
+		defer free_all(context.temp_allocator)
+
+		event: sdl.Event
+		for sdl.PollEvent(&event) {
+			if event.type == .QUIT do return
+		}
+
+		// Animate both gauges in the 0..1 range at slightly different rates.
+		phase += 1
+		gauge.value = (math.sin(phase * 0.02) + 1) * 0.5
+		gauge2.value = (math.cos(phase * 0.03) + 1) * 0.5
+
+		layer := draw.begin({width = WINDOW_WIDTH, height = WINDOW_HEIGHT})
+		clay.SetLayoutDimensions({width = layer.bounds.width, height = layer.bounds.height})
+		clay.BeginLayout()
+
+		if clay.UI()(
+		{
+			id = clay.ID("outer"),
+			layout = {
+				sizing = {clay.SizingGrow({}), clay.SizingGrow({})},
+				childAlignment = {x = .Center, y = .Center},
+				layoutDirection = .TopToBottom,
+				childGap = 20,
+			},
+			backgroundColor = {50, 50, 50, 255},
+		},
+		) {
+			if clay.UI()({id = clay.ID("title"), layout = {sizing = {clay.SizingFit({}), clay.SizingFit({})}}}) {
+				clay.Text("Custom Draw Demo", &title_config)
+			}
+
+			// First gauge element.
+			if clay.UI()(
+			{
+				id = clay.ID("gauge"),
+				layout = {sizing = {clay.SizingFixed(300), clay.SizingFixed(30)}},
+				custom = {customData = &gauge},
+				backgroundColor = {80, 80, 80, 255},
+			},
+			) {}
+
+			// Second gauge element.
+			if clay.UI()(
+			{
+				id = clay.ID("gauge2"),
+				layout = {sizing = {clay.SizingFixed(300), clay.SizingFixed(30)}},
+				custom = {customData = &gauge2},
+				backgroundColor = {80, 80, 80, 255},
+			},
+			) {}
+		}
+
+		batch := draw.ClayBatch {
+			bounds = layer.bounds,
+			cmds   = clay.EndLayout(),
+		}
+		draw.prepare_clay_batch(layer, &batch, {0, 0}, custom_draw = draw_custom)
+
+		draw.end(gpu, window)
+	}
+}
@@ -0,0 +1,74 @@
+package examples
+
+import "core:fmt"
+import "core:mem"
+import "core:os"
+
+// Entry point of the examples binary.
+// Installs a tracking allocator for the whole run, then dispatches to the example named by the
+// first command-line argument. With a missing or unknown name it prints usage and exits with
+// status 1.
+main :: proc() {
+	//----- Tracking allocator ----------------------------------
+	// This setup has to live directly in main's scope: both `context` assignments and `defer` are
+	// bound to the enclosing block, so wrapping it in a nested `{}` would restore the default
+	// allocator and print an empty report before any example had run.
+	tracking_temp_allocator := false
+	// Temp
+	track_temp: mem.Tracking_Allocator
+	if tracking_temp_allocator {
+		mem.tracking_allocator_init(&track_temp, context.temp_allocator)
+		context.temp_allocator = mem.tracking_allocator(&track_temp)
+	}
+	// Default
+	track: mem.Tracking_Allocator
+	mem.tracking_allocator_init(&track, context.allocator)
+	context.allocator = mem.tracking_allocator(&track)
+	// Log a warning about any memory that was not freed by the end of the program.
+	// This could be fine for some global state or it could be a memory leak.
+	// Note: the os.exit calls below terminate the process without running this report.
+	defer {
+		// Temp allocator
+		if tracking_temp_allocator {
+			if len(track_temp.allocation_map) > 0 {
+				fmt.eprintf("=== %v allocations not freed - temp allocator: ===\n", len(track_temp.allocation_map))
+				for _, entry in track_temp.allocation_map {
+					fmt.eprintf("- %v bytes @ %v\n", entry.size, entry.location)
+				}
+			}
+			if len(track_temp.bad_free_array) > 0 {
+				fmt.eprintf("=== %v incorrect frees - temp allocator: ===\n", len(track_temp.bad_free_array))
+				for entry in track_temp.bad_free_array {
+					fmt.eprintf("- %p @ %v\n", entry.memory, entry.location)
+				}
+			}
+			mem.tracking_allocator_destroy(&track_temp)
+		}
+		// Default allocator
+		if len(track.allocation_map) > 0 {
+			fmt.eprintf("=== %v allocations not freed - main allocator: ===\n", len(track.allocation_map))
+			for _, entry in track.allocation_map {
+				fmt.eprintf("- %v bytes @ %v\n", entry.size, entry.location)
+			}
+		}
+		if len(track.bad_free_array) > 0 {
+			fmt.eprintf("=== %v incorrect frees - main allocator: ===\n", len(track.bad_free_array))
+			for entry in track.bad_free_array {
+				fmt.eprintf("- %p @ %v\n", entry.memory, entry.location)
+			}
+		}
+		mem.tracking_allocator_destroy(&track)
+	}
+
+	//----- Example dispatch ----------------------------------
+	args := os.args
+	if len(args) < 2 {
+		fmt.eprintln("Usage: examples <example_name>")
+		fmt.eprintln("Available examples: hellope-shapes, hellope-text, hellope-clay, hellope-custom")
+		os.exit(1)
+	}
+
+	switch args[1] {
+	case "hellope-clay": hellope_clay()
+	case "hellope-custom": hellope_custom()
+	case "hellope-shapes": hellope_shapes()
+	case "hellope-text": hellope_text()
+	case:
+		fmt.eprintf("Unknown example: %v\n", args[1])
+		fmt.eprintln("Available examples: hellope-shapes, hellope-text, hellope-clay, hellope-custom")
+		os.exit(1)
+	}
+}
@@ -0,0 +1,662 @@
+package draw
+
+import "core:c"
+import "core:log"
+import "core:mem"
+import sdl "vendor:sdl3"
+
+// Per-vertex data for the 2D pipeline. The field order and sizes are what the vertex attribute
+// table in create_pipeline_2d_base describes (locations 0, 1, 2 at byte offsets 0, 8, 16), so
+// changing this struct requires changing that table as well.
+Vertex :: struct {
+	position: [2]f32, // attribute location 0, FLOAT2
+	uv:       [2]f32, // attribute location 1, FLOAT2
+	color:    Color, // attribute location 2, UBYTE4_NORM
+}
+
+// One batch of text geometry drawn with a single atlas texture.
+// NOTE(review): the start/count pairs presumably index into the shared vertex and index buffers
+// filled by `upload` — confirm against the code that records and draws the batches.
+TextBatch :: struct {
+	atlas_texture: ^sdl.GPUTexture, // glyph atlas sampled for this batch
+	vertex_start:  u32, // first vertex of the batch
+	vertex_count:  u32, // number of vertices in the batch
+	index_start:   u32, // first index of the batch
+	index_count:   u32, // number of indices in the batch
+}
+
+// ----------------------------------------------------------------------------------------------------------------
+// ----- SDF primitive types -----------
+// ----------------------------------------------------------------------------------------------------------------
+
+// Kind of SDF primitive. The numeric value is stored in the low 8 bits of Primitive.kind_flags
+// (see pack_kind_flags), so existing values must stay stable.
+Shape_Kind :: enum u8 {
+	Solid    = 0,
+	RRect    = 1,
+	Circle   = 2,
+	Ellipse  = 3,
+	Segment  = 4,
+	Ring_Arc = 5,
+	NGon     = 6,
+}
+
+// Per-primitive option flags; each enum value is a bit position inside Shape_Flags.
+Shape_Flag :: enum u8 {
+	Stroke,
+}
+
+// Bit set of Shape_Flag backed by a u8; stored in bits 8..15 of Primitive.kind_flags.
+Shape_Flags :: bit_set[Shape_Flag;u8]
+
+// Per-kind parameter blocks. Each variant is exactly eight f32 values (32 bytes); the unnamed `_`
+// fields pad the shorter variants so that every member of Shape_Params has the same size.
+// NOTE(review): `soft_px` and `stroke_px` look like edge softness and stroke width in pixels —
+// confirm against the shader.
+
+// Parameters for Shape_Kind.RRect.
+RRect_Params :: struct {
+	half_size: [2]f32,
+	radii:     [4]f32,
+	soft_px:   f32,
+	stroke_px: f32,
+}
+
+// Parameters for Shape_Kind.Circle.
+Circle_Params :: struct {
+	radius:    f32,
+	soft_px:   f32,
+	stroke_px: f32,
+	_:         [5]f32,
+}
+
+// Parameters for Shape_Kind.Ellipse.
+Ellipse_Params :: struct {
+	radii:     [2]f32,
+	soft_px:   f32,
+	stroke_px: f32,
+	_:         [4]f32,
+}
+
+// Parameters for Shape_Kind.Segment: end points `a` and `b` plus a width.
+Segment_Params :: struct {
+	a:       [2]f32,
+	b:       [2]f32,
+	width:   f32,
+	soft_px: f32,
+	_:       [2]f32,
+}
+
+// Parameters for Shape_Kind.Ring_Arc: inner/outer radius and an angular range in radians.
+Ring_Arc_Params :: struct {
+	inner_radius: f32,
+	outer_radius: f32,
+	start_rad:    f32,
+	end_rad:      f32,
+	soft_px:      f32,
+	_:            [3]f32,
+}
+
+// Parameters for Shape_Kind.NGon. `sides` is stored as f32 like every other parameter.
+NGon_Params :: struct {
+	radius:    f32,
+	rotation:  f32,
+	sides:     f32,
+	soft_px:   f32,
+	stroke_px: f32,
+	_:         [3]f32,
+}
+
+// Untagged union over all parameter blocks; which member is valid is determined by the kind
+// stored in Primitive.kind_flags. `raw` exposes the same bytes as eight plain floats.
+Shape_Params :: struct #raw_union {
+	rrect:    RRect_Params,
+	circle:   Circle_Params,
+	ellipse:  Ellipse_Params,
+	segment:  Segment_Params,
+	ring_arc: Ring_Arc_Params,
+	ngon:     NGon_Params,
+	raw:      [8]f32,
+}
+
+// Compile-time guard: the union must stay at two vec4s (32 bytes) to keep Primitive at 64 bytes.
+#assert(size_of(Shape_Params) == 32)
+
+// One SDF primitive instance as stored in the primitive storage buffer.
+// GPU layout: 64 bytes, std430-compatible. The shader declares this as a storage buffer struct.
+// The number at the start of each field comment is the field's byte offset.
+Primitive :: struct {
+	bounds:     [4]f32, //  0: min_x, min_y, max_x, max_y (world-space, pre-DPI)
+	color:      Color, // 16: u8x4, unpacked in shader via unpackUnorm4x8
+	kind_flags: u32, // 20: (kind as u32) | (flags as u32 << 8)
+	rotation:   f32, // 24: shader self-rotation in radians (used by RRect, Ellipse)
+	_pad:       f32, // 28: alignment to vec4 boundary
+	params:     Shape_Params, // 32: two vec4s of shape params
+}
+
+// Compile-time guard for the 64-byte layout described above.
+#assert(size_of(Primitive) == 64)
+
+// Pack a shape kind and its flags into the u32 stored in Primitive.kind_flags:
+// bits 0..7 hold the kind, bits 8..15 hold the flag bit set.
+pack_kind_flags :: #force_inline proc(kind: Shape_Kind, flags: Shape_Flags) -> u32 {
+	kind_bits := u32(kind)
+	// Reinterpret the bit set as its backing u8 before widening.
+	flag_bits := u32(transmute(u8)flags)
+	return kind_bits | (flag_bits << 8)
+}
+
+// GPU objects owned by the unified 2D pipeline; all of them are created by
+// create_pipeline_2d_base.
+Pipeline_2D_Base :: struct {
+	sdl_pipeline:     ^sdl.GPUGraphicsPipeline, // graphics pipeline built from the embedded shaders
+	vertex_buffer:    Buffer, // vertices: shapes first, then text (see upload)
+	index_buffer:     Buffer, // indices, used by text
+	unit_quad_buffer: ^sdl.GPUBuffer, // static 6-vertex unit quad (two triangles)
+	primitive_buffer: Buffer, // Primitive storage buffer for SDF instanced drawing
+	white_texture:    ^sdl.GPUTexture, // 1x1 white pixel texture
+	sampler:          ^sdl.GPUSampler, // sampler shared by shapes and text
+}
+
+// Create every GPU object of the unified 2D pipeline: the graphics pipeline itself, the dynamic
+// vertex / index / primitive buffers, the static unit quad buffer, the 1x1 white texture, and the
+// shared sampler. The white pixel and the unit quad are uploaded before returning.
+//
+// Parameters:
+//   device       – GPU device to create the objects on.
+//   window       – window whose swapchain texture format becomes the color target format.
+//   sample_count – multisample count baked into the pipeline.
+//
+// Returns the populated pipeline and true on success. On any failure the error is logged, every
+// object created so far is released by the deferred cleanup, and false is returned; the returned
+// struct must not be used in that case.
+@(private)
+create_pipeline_2d_base :: proc(
+	device: ^sdl.GPUDevice,
+	window: ^sdl.Window,
+	sample_count: sdl.GPUSampleCount,
+) -> (
+	pipeline: Pipeline_2D_Base,
+	ok: bool,
+) {
+	// On failure, clean up any partially-created resources
+	// (runs on every return path; `ok` is the named result, so it reflects the returned value)
+	defer if !ok {
+		if pipeline.sampler != nil do sdl.ReleaseGPUSampler(device, pipeline.sampler)
+		if pipeline.white_texture != nil do sdl.ReleaseGPUTexture(device, pipeline.white_texture)
+		if pipeline.unit_quad_buffer != nil do sdl.ReleaseGPUBuffer(device, pipeline.unit_quad_buffer)
+		if pipeline.primitive_buffer.gpu != nil do destroy_buffer(device, &pipeline.primitive_buffer)
+		if pipeline.index_buffer.gpu != nil do destroy_buffer(device, &pipeline.index_buffer)
+		if pipeline.vertex_buffer.gpu != nil do destroy_buffer(device, &pipeline.vertex_buffer)
+		if pipeline.sdl_pipeline != nil do sdl.ReleaseGPUGraphicsPipeline(device, pipeline.sdl_pipeline)
+	}
+
+	// The embedded shaders exist in one format only; bail out early if the device cannot use it.
+	active_shader_formats := sdl.GetGPUShaderFormats(device)
+	if PLATFORM_SHADER_FORMAT_FLAG not_in active_shader_formats {
+		log.errorf(
+			"draw: no embedded shader matches active GPU formats; this build supports %v but device reports %v",
+			PLATFORM_SHADER_FORMAT,
+			active_shader_formats,
+		)
+		return pipeline, false
+	}
+
+	log.debug("Loaded", len(BASE_VERT_2D_RAW), "vert bytes")
+	log.debug("Loaded", len(BASE_FRAG_2D_RAW), "frag bytes")
+
+	// Vertex stage: one uniform buffer (globals) and one storage buffer (primitives).
+	vert_info := sdl.GPUShaderCreateInfo {
+		code_size           = len(BASE_VERT_2D_RAW),
+		code                = raw_data(BASE_VERT_2D_RAW),
+		entrypoint          = SHADER_ENTRY,
+		format              = {PLATFORM_SHADER_FORMAT_FLAG},
+		stage               = .VERTEX,
+		num_uniform_buffers = 1,
+		num_storage_buffers = 1,
+	}
+
+	// Fragment stage: a single sampler.
+	frag_info := sdl.GPUShaderCreateInfo {
+		code_size    = len(BASE_FRAG_2D_RAW),
+		code         = raw_data(BASE_FRAG_2D_RAW),
+		entrypoint   = SHADER_ENTRY,
+		format       = {PLATFORM_SHADER_FORMAT_FLAG},
+		stage        = .FRAGMENT,
+		num_samplers = 1,
+	}
+
+	vert_shader := sdl.CreateGPUShader(device, vert_info)
+	if vert_shader == nil {
+		log.errorf("Could not create draw vertex shader: %s", sdl.GetError())
+		return pipeline, false
+	}
+
+	frag_shader := sdl.CreateGPUShader(device, frag_info)
+	if frag_shader == nil {
+		// The shaders are not part of `pipeline`, so the deferred cleanup does not cover them.
+		sdl.ReleaseGPUShader(device, vert_shader)
+		log.errorf("Could not create draw fragment shader: %s", sdl.GetError())
+		return pipeline, false
+	}
+
+	// Attribute offsets follow the field order of Vertex: position (8 bytes), uv (8 bytes), color.
+	vertex_attributes: [3]sdl.GPUVertexAttribute = {
+		// position (GLSL location 0)
+		sdl.GPUVertexAttribute{buffer_slot = 0, location = 0, format = .FLOAT2, offset = 0},
+		// uv (GLSL location 1)
+		sdl.GPUVertexAttribute{buffer_slot = 0, location = 1, format = .FLOAT2, offset = size_of([2]f32)},
+		// color (GLSL location 2, u8x4 normalized to float by GPU)
+		sdl.GPUVertexAttribute{buffer_slot = 0, location = 2, format = .UBYTE4_NORM, offset = size_of([2]f32) * 2},
+	}
+
+	// Single color target in the swapchain format with source-alpha blending.
+	pipeline_info := sdl.GPUGraphicsPipelineCreateInfo {
+		vertex_shader = vert_shader,
+		fragment_shader = frag_shader,
+		primitive_type = .TRIANGLELIST,
+		multisample_state = sdl.GPUMultisampleState{sample_count = sample_count},
+		target_info = sdl.GPUGraphicsPipelineTargetInfo {
+			color_target_descriptions = &sdl.GPUColorTargetDescription {
+				format = sdl.GetGPUSwapchainTextureFormat(device, window),
+				blend_state = sdl.GPUColorTargetBlendState {
+					enable_blend = true,
+					enable_color_write_mask = true,
+					src_color_blendfactor = .SRC_ALPHA,
+					dst_color_blendfactor = .ONE_MINUS_SRC_ALPHA,
+					color_blend_op = .ADD,
+					src_alpha_blendfactor = .SRC_ALPHA,
+					dst_alpha_blendfactor = .ONE_MINUS_SRC_ALPHA,
+					alpha_blend_op = .ADD,
+					color_write_mask = sdl.GPUColorComponentFlags{.R, .G, .B, .A},
+				},
+			},
+			num_color_targets = 1,
+		},
+		vertex_input_state = sdl.GPUVertexInputState {
+			vertex_buffer_descriptions = &sdl.GPUVertexBufferDescription {
+				slot = 0,
+				input_rate = .VERTEX,
+				pitch = size_of(Vertex),
+			},
+			num_vertex_buffers = 1,
+			vertex_attributes = raw_data(vertex_attributes[:]),
+			num_vertex_attributes = 3,
+		},
+	}
+
+	pipeline.sdl_pipeline = sdl.CreateGPUGraphicsPipeline(device, pipeline_info)
+	// Shaders are no longer needed regardless of pipeline creation success
+	sdl.ReleaseGPUShader(device, vert_shader)
+	sdl.ReleaseGPUShader(device, frag_shader)
+	if pipeline.sdl_pipeline == nil {
+		log.errorf("Failed to create draw graphics pipeline: %s", sdl.GetError())
+		return pipeline, false
+	}
+
+	// Create vertex buffer
+	vert_buf_ok: bool
+	pipeline.vertex_buffer, vert_buf_ok = create_buffer(
+		device,
+		size_of(Vertex) * BUFFER_INIT_SIZE,
+		sdl.GPUBufferUsageFlags{.VERTEX},
+	)
+	// create_buffer has already logged the reason for the failure
+	if !vert_buf_ok do return pipeline, false
+
+	// Create index buffer (used by text)
+	idx_buf_ok: bool
+	pipeline.index_buffer, idx_buf_ok = create_buffer(
+		device,
+		size_of(c.int) * BUFFER_INIT_SIZE,
+		sdl.GPUBufferUsageFlags{.INDEX},
+	)
+	if !idx_buf_ok do return pipeline, false
+
+	// Create primitive storage buffer (used by SDF instanced drawing)
+	prim_buf_ok: bool
+	pipeline.primitive_buffer, prim_buf_ok = create_buffer(
+		device,
+		size_of(Primitive) * BUFFER_INIT_SIZE,
+		sdl.GPUBufferUsageFlags{.GRAPHICS_STORAGE_READ},
+	)
+	if !prim_buf_ok do return pipeline, false
+
+	// Create static 6-vertex unit quad buffer (two triangles, TRIANGLELIST)
+	pipeline.unit_quad_buffer = sdl.CreateGPUBuffer(
+		device,
+		sdl.GPUBufferCreateInfo{usage = {.VERTEX}, size = 6 * size_of(Vertex)},
+	)
+	if pipeline.unit_quad_buffer == nil {
+		log.errorf("Failed to create unit quad buffer: %s", sdl.GetError())
+		return pipeline, false
+	}
+
+	// Create 1x1 white pixel texture
+	pipeline.white_texture = sdl.CreateGPUTexture(
+		device,
+		sdl.GPUTextureCreateInfo {
+			type = .D2,
+			format = .R8G8B8A8_UNORM,
+			usage = {.SAMPLER},
+			width = 1,
+			height = 1,
+			layer_count_or_depth = 1,
+			num_levels = 1,
+			sample_count = ._1,
+		},
+	)
+	if pipeline.white_texture == nil {
+		log.errorf("Failed to create white pixel texture: %s", sdl.GetError())
+		return pipeline, false
+	}
+
+	// Upload white pixel and unit quad data in a single command buffer
+	white_pixel := [4]u8{255, 255, 255, 255}
+	white_transfer_buf := sdl.CreateGPUTransferBuffer(
+		device,
+		sdl.GPUTransferBufferCreateInfo{usage = .UPLOAD, size = size_of(white_pixel)},
+	)
+	if white_transfer_buf == nil {
+		log.errorf("Failed to create white pixel transfer buffer: %s", sdl.GetError())
+		return pipeline, false
+	}
+	// The staging buffer is only needed for this upload; release it on every exit path.
+	defer sdl.ReleaseGPUTransferBuffer(device, white_transfer_buf)
+
+	white_ptr := sdl.MapGPUTransferBuffer(device, white_transfer_buf, false)
+	if white_ptr == nil {
+		log.errorf("Failed to map white pixel transfer buffer: %s", sdl.GetError())
+		return pipeline, false
+	}
+	mem.copy(white_ptr, &white_pixel, size_of(white_pixel))
+	sdl.UnmapGPUTransferBuffer(device, white_transfer_buf)
+
+	// Two triangles covering the unit square; uv and color are left at their zero values.
+	quad_verts := [6]Vertex {
+		{position = {0, 0}},
+		{position = {1, 0}},
+		{position = {0, 1}},
+		{position = {0, 1}},
+		{position = {1, 0}},
+		{position = {1, 1}},
+	}
+	quad_transfer_buf := sdl.CreateGPUTransferBuffer(
+		device,
+		sdl.GPUTransferBufferCreateInfo{usage = .UPLOAD, size = size_of(quad_verts)},
+	)
+	if quad_transfer_buf == nil {
+		log.errorf("Failed to create unit quad transfer buffer: %s", sdl.GetError())
+		return pipeline, false
+	}
+	// Same lifetime rule as the white pixel staging buffer above.
+	defer sdl.ReleaseGPUTransferBuffer(device, quad_transfer_buf)
+
+	quad_ptr := sdl.MapGPUTransferBuffer(device, quad_transfer_buf, false)
+	if quad_ptr == nil {
+		log.errorf("Failed to map unit quad transfer buffer: %s", sdl.GetError())
+		return pipeline, false
+	}
+	mem.copy(quad_ptr, &quad_verts, size_of(quad_verts))
+	sdl.UnmapGPUTransferBuffer(device, quad_transfer_buf)
+
+	// Record both uploads into one copy pass and submit it.
+	upload_cmd_buffer := sdl.AcquireGPUCommandBuffer(device)
+	if upload_cmd_buffer == nil {
+		log.errorf("Failed to acquire command buffer for init upload: %s", sdl.GetError())
+		return pipeline, false
+	}
+	upload_pass := sdl.BeginGPUCopyPass(upload_cmd_buffer)
+
+	sdl.UploadToGPUTexture(
+		upload_pass,
+		sdl.GPUTextureTransferInfo{transfer_buffer = white_transfer_buf},
+		sdl.GPUTextureRegion{texture = pipeline.white_texture, w = 1, h = 1, d = 1},
+		false,
+	)
+
+	sdl.UploadToGPUBuffer(
+		upload_pass,
+		sdl.GPUTransferBufferLocation{transfer_buffer = quad_transfer_buf},
+		sdl.GPUBufferRegion{buffer = pipeline.unit_quad_buffer, offset = 0, size = size_of(quad_verts)},
+		false,
+	)
+
+	sdl.EndGPUCopyPass(upload_pass)
+	if !sdl.SubmitGPUCommandBuffer(upload_cmd_buffer) {
+		log.errorf("Failed to submit init upload command buffer: %s", sdl.GetError())
+		return pipeline, false
+	}
+
+	log.debug("White pixel texture and unit quad buffer created and uploaded")
+
+	// Create sampler (shared by shapes and text)
+	pipeline.sampler = sdl.CreateGPUSampler(
+		device,
+		sdl.GPUSamplerCreateInfo {
+			min_filter = .LINEAR,
+			mag_filter = .LINEAR,
+			mipmap_mode = .LINEAR,
+			address_mode_u = .CLAMP_TO_EDGE,
+			address_mode_v = .CLAMP_TO_EDGE,
+			address_mode_w = .CLAMP_TO_EDGE,
+		},
+	)
+	if pipeline.sampler == nil {
+		log.errorf("Could not create GPU sampler: %s", sdl.GetError())
+		return pipeline, false
+	}
+
+	log.debug("Done creating unified draw pipeline")
+	return pipeline, true
+}
+
+// Copies this frame's CPU-side draw data into the GPU buffers of `GLOB.pipeline_2d_base`.
+//
+// Three independent uploads are recorded into `pass`; each is skipped when its source is empty:
+//   1. vertices   - `GLOB.tmp_shape_verts` immediately followed by `GLOB.tmp_text_verts`
+//   2. indices    - `GLOB.tmp_text_indices`
+//   3. primitives - `GLOB.tmp_primitives` (per-instance SDF data)
+//
+// Every transfer buffer is mapped with `cycle = true`. Everything that gets uploaded is rewritten
+// from offset 0 on each call, so no previous contents need to be preserved; cycling lets SDL hand
+// back unbound backing memory when an earlier copy that reads the buffer is still pending, instead
+// of letting the CPU overwrite staging data that is in flight.
+//
+// Panics if a transfer buffer cannot be mapped.
+@(private)
+upload :: proc(device: ^sdl.GPUDevice, pass: ^sdl.GPUCopyPass) {
+	// Upload vertices (shapes then text into one buffer)
+	shape_vert_count := u32(len(GLOB.tmp_shape_verts))
+	text_vert_count := u32(len(GLOB.tmp_text_verts))
+	total_vert_count := shape_vert_count + text_vert_count
+
+	if total_vert_count > 0 {
+		total_vert_size := total_vert_count * size_of(Vertex)
+		shape_vert_size := shape_vert_count * size_of(Vertex)
+		text_vert_size := text_vert_count * size_of(Vertex)
+
+		grow_buffer_if_needed(
+			device,
+			&GLOB.pipeline_2d_base.vertex_buffer,
+			total_vert_size,
+			sdl.GPUBufferUsageFlags{.VERTEX},
+		)
+
+		vert_array := sdl.MapGPUTransferBuffer(device, GLOB.pipeline_2d_base.vertex_buffer.transfer, true)
+		if vert_array == nil {
+			log.panicf("Failed to map vertex transfer buffer: %s", sdl.GetError())
+		}
+		if shape_vert_size > 0 {
+			mem.copy(vert_array, raw_data(GLOB.tmp_shape_verts), int(shape_vert_size))
+		}
+		if text_vert_size > 0 {
+			// Text vertices start right after the shape vertices.
+			mem.copy(
+				rawptr(uintptr(vert_array) + uintptr(shape_vert_size)),
+				raw_data(GLOB.tmp_text_verts),
+				int(text_vert_size),
+			)
+		}
+		sdl.UnmapGPUTransferBuffer(device, GLOB.pipeline_2d_base.vertex_buffer.transfer)
+
+		sdl.UploadToGPUBuffer(
+			pass,
+			sdl.GPUTransferBufferLocation{transfer_buffer = GLOB.pipeline_2d_base.vertex_buffer.transfer},
+			sdl.GPUBufferRegion{buffer = GLOB.pipeline_2d_base.vertex_buffer.gpu, offset = 0, size = total_vert_size},
+			false,
+		)
+	}
+
+	// Upload text indices
+	index_count := u32(len(GLOB.tmp_text_indices))
+	if index_count > 0 {
+		index_size := index_count * size_of(c.int)
+
+		grow_buffer_if_needed(
+			device,
+			&GLOB.pipeline_2d_base.index_buffer,
+			index_size,
+			sdl.GPUBufferUsageFlags{.INDEX},
+		)
+
+		idx_array := sdl.MapGPUTransferBuffer(device, GLOB.pipeline_2d_base.index_buffer.transfer, true)
+		if idx_array == nil {
+			log.panicf("Failed to map index transfer buffer: %s", sdl.GetError())
+		}
+		mem.copy(idx_array, raw_data(GLOB.tmp_text_indices), int(index_size))
+		sdl.UnmapGPUTransferBuffer(device, GLOB.pipeline_2d_base.index_buffer.transfer)
+
+		sdl.UploadToGPUBuffer(
+			pass,
+			sdl.GPUTransferBufferLocation{transfer_buffer = GLOB.pipeline_2d_base.index_buffer.transfer},
+			sdl.GPUBufferRegion{buffer = GLOB.pipeline_2d_base.index_buffer.gpu, offset = 0, size = index_size},
+			false,
+		)
+	}
+
+	// Upload SDF primitives
+	prim_count := u32(len(GLOB.tmp_primitives))
+	if prim_count > 0 {
+		prim_size := prim_count * size_of(Primitive)
+
+		grow_buffer_if_needed(
+			device,
+			&GLOB.pipeline_2d_base.primitive_buffer,
+			prim_size,
+			sdl.GPUBufferUsageFlags{.GRAPHICS_STORAGE_READ},
+		)
+
+		prim_array := sdl.MapGPUTransferBuffer(device, GLOB.pipeline_2d_base.primitive_buffer.transfer, true)
+		if prim_array == nil {
+			log.panicf("Failed to map primitive transfer buffer: %s", sdl.GetError())
+		}
+		mem.copy(prim_array, raw_data(GLOB.tmp_primitives), int(prim_size))
+		sdl.UnmapGPUTransferBuffer(device, GLOB.pipeline_2d_base.primitive_buffer.transfer)
+
+		sdl.UploadToGPUBuffer(
+			pass,
+			sdl.GPUTransferBufferLocation{transfer_buffer = GLOB.pipeline_2d_base.primitive_buffer.transfer},
+			sdl.GPUBufferRegion{buffer = GLOB.pipeline_2d_base.primitive_buffer.gpu, offset = 0, size = prim_size},
+			false,
+		)
+	}
+}
+
+// Records one render pass on `cmd_buffer` that draws every sub-batch of `layer` into
+// `render_texture`.
+//
+// `GLOB.cleared` tracks whether the target has already been cleared: a pass recorded while it is
+// false uses `.CLEAR` with `clear_color`, every later pass uses `.LOAD`. A layer without
+// sub-batches records at most an empty clearing pass and returns.
+//
+// Shader mode, bound vertex buffer and bound texture are tracked locally so that state is only
+// re-issued when a sub-batch needs something different from what is currently active.
+@(private)
+draw_layer :: proc(
+	device: ^sdl.GPUDevice,
+	window: ^sdl.Window,
+	cmd_buffer: ^sdl.GPUCommandBuffer,
+	render_texture: ^sdl.GPUTexture,
+	swapchain_width: u32,
+	swapchain_height: u32,
+	clear_color: [4]f32,
+	layer: ^Layer,
+) {
+	// One descriptor serves both the "clear only" pass and the real draw pass.
+	color_target := sdl.GPUColorTargetInfo {
+		texture     = render_texture,
+		clear_color = sdl.FColor{clear_color[0], clear_color[1], clear_color[2], clear_color[3]},
+		load_op     = GLOB.cleared ? .LOAD : .CLEAR,
+		store_op    = .STORE,
+	}
+
+	if layer.sub_batch_len == 0 {
+		// Nothing to draw; still clear the target if no earlier pass has done so.
+		if !GLOB.cleared {
+			clear_pass := sdl.BeginGPURenderPass(cmd_buffer, &color_target, 1, nil)
+			sdl.EndGPURenderPass(clear_pass)
+			GLOB.cleared = true
+		}
+		return
+	}
+
+	render_pass := sdl.BeginGPURenderPass(cmd_buffer, &color_target, 1, nil)
+	GLOB.cleared = true
+
+	sdl.BindGPUGraphicsPipeline(render_pass, GLOB.pipeline_2d_base.sdl_pipeline)
+
+	// Primitive storage buffer in vertex storage slot 0 (read by the vertex shader in SDF mode).
+	sdl.BindGPUVertexStorageBuffers(
+		render_pass,
+		0,
+		([^]^sdl.GPUBuffer)(&GLOB.pipeline_2d_base.primitive_buffer.gpu),
+		1,
+	)
+
+	// The index buffer is bound unconditionally; only the indexed text draws consume it.
+	sdl.BindGPUIndexBuffer(
+		render_pass,
+		sdl.GPUBufferBinding{buffer = GLOB.pipeline_2d_base.index_buffer.gpu, offset = 0},
+		._32BIT,
+	)
+
+	// Pipeline resources used throughout the loop
+	main_vert_buf := GLOB.pipeline_2d_base.vertex_buffer.gpu
+	unit_quad := GLOB.pipeline_2d_base.unit_quad_buffer
+	white_texture := GLOB.pipeline_2d_base.white_texture
+	sampler := GLOB.pipeline_2d_base.sampler
+	target_w := f32(swapchain_width)
+	target_h := f32(swapchain_height)
+
+	// Reusable binding descriptors
+	main_binding := sdl.GPUBufferBinding{buffer = main_vert_buf, offset = 0}
+	quad_binding := sdl.GPUBufferBinding{buffer = unit_quad, offset = 0}
+	white_binding := sdl.GPUTextureSamplerBinding{texture = white_texture, sampler = sampler}
+
+	// Starting state: tessellated mode, main vertex buffer, no texture bound yet
+	push_globals(cmd_buffer, target_w, target_h, .Tessellated)
+	sdl.BindGPUVertexBuffers(render_pass, 0, &main_binding, 1)
+
+	current_mode: Draw_Mode = .Tessellated
+	current_vert_buf := main_vert_buf
+	current_atlas: ^sdl.GPUTexture
+
+	// Text vertices are stored after all shape vertices in the shared vertex buffer
+	text_vertex_gpu_base := u32(len(GLOB.tmp_shape_verts))
+
+	layer_scissors := GLOB.scissors[layer.scissor_start:][:layer.scissor_len]
+	for &scissor in layer_scissors {
+		sdl.SetGPUScissor(render_pass, scissor.bounds)
+
+		scissor_batches := GLOB.tmp_sub_batches[scissor.sub_batch_start:][:scissor.sub_batch_len]
+		for &batch in scissor_batches {
+			switch batch.kind {
+			case .Shapes, .Text:
+				// Both tessellated kinds share the shader mode and the main vertex buffer.
+				if current_mode != .Tessellated {
+					push_globals(cmd_buffer, target_w, target_h, .Tessellated)
+					current_mode = .Tessellated
+				}
+				if current_vert_buf != main_vert_buf {
+					sdl.BindGPUVertexBuffers(render_pass, 0, &main_binding, 1)
+					current_vert_buf = main_vert_buf
+				}
+
+				if batch.kind == .Text {
+					// For text, `batch.offset` indexes the text batch that holds atlas and index range.
+					text_batch := &GLOB.tmp_text_batches[batch.offset]
+					if current_atlas != text_batch.atlas_texture {
+						atlas_binding := sdl.GPUTextureSamplerBinding {
+							texture = text_batch.atlas_texture,
+							sampler = sampler,
+						}
+						sdl.BindGPUFragmentSamplers(render_pass, 0, &atlas_binding, 1)
+						current_atlas = text_batch.atlas_texture
+					}
+					sdl.DrawGPUIndexedPrimitives(
+						render_pass,
+						text_batch.index_count,
+						1,
+						text_batch.index_start,
+						i32(text_vertex_gpu_base + text_batch.vertex_start),
+						0,
+					)
+				} else {
+					// Solid shapes sample the white texture.
+					if current_atlas != white_texture {
+						sdl.BindGPUFragmentSamplers(render_pass, 0, &white_binding, 1)
+						current_atlas = white_texture
+					}
+					sdl.DrawGPUPrimitives(render_pass, batch.count, 1, batch.offset, 0)
+				}
+
+			case .SDF:
+				if current_mode != .SDF {
+					push_globals(cmd_buffer, target_w, target_h, .SDF)
+					current_mode = .SDF
+				}
+				if current_vert_buf != unit_quad {
+					sdl.BindGPUVertexBuffers(render_pass, 0, &quad_binding, 1)
+					current_vert_buf = unit_quad
+				}
+				if current_atlas != white_texture {
+					sdl.BindGPUFragmentSamplers(render_pass, 0, &white_binding, 1)
+					current_atlas = white_texture
+				}
+				// Six unit-quad vertices per instance; `batch.offset` is the first instance.
+				sdl.DrawGPUPrimitives(render_pass, 6, batch.count, 0, batch.offset)
+			}
+		}
+	}
+
+	sdl.EndGPURenderPass(render_pass)
+}
+
+// Releases the GPU resources held by `pipeline` on `device`: the three growable buffers, the
+// unit quad buffer, the white texture, the sampler and the graphics pipeline.
+destroy_pipeline_2d_base :: proc(device: ^sdl.GPUDevice, pipeline: ^Pipeline_2D_Base) {
+	// Growable buffers: vertices, text indices, SDF primitives
+	destroy_buffer(device, &pipeline.vertex_buffer)
+	destroy_buffer(device, &pipeline.index_buffer)
+	destroy_buffer(device, &pipeline.primitive_buffer)
+
+	// The unit quad is only released when it exists
+	if quad := pipeline.unit_quad_buffer; quad != nil {
+		sdl.ReleaseGPUBuffer(device, quad)
+	}
+
+	// Shared texture, sampler and the pipeline object
+	sdl.ReleaseGPUTexture(device, pipeline.white_texture)
+	sdl.ReleaseGPUSampler(device, pipeline.sampler)
+	sdl.ReleaseGPUGraphicsPipeline(device, pipeline.sdl_pipeline)
+}
@@ -0,0 +1,296 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+// Implementation of the GLSL mod() function, which is slightly different than Metal fmod().
+// Computes x - y * floor(x / y); because floor() rounds toward negative infinity, a negative x
+// with a positive y wraps into [0, y) instead of staying negative.
+template<typename Tx, typename Ty>
+inline Tx mod(Tx x, Ty y)
+{
+    return x - y * floor(x / y);
+}
+
+// Fragment stage output: a single colour written to colour attachment 0.
+struct main0_out
+{
+    float4 out_color [[color(0)]];
+};
+
+// Fragment stage inputs, matched to the vertex stage outputs by user(locnN) location.
+struct main0_in
+{
+    float4 f_color [[user(locn0)]];
+    float2 f_local_or_uv [[user(locn1)]]; // texture UV when kind == 0, otherwise the position fed to the SDFs
+    float4 f_params [[user(locn2)]];
+    float4 f_params2 [[user(locn3)]];
+    uint f_kind_flags [[user(locn4)]]; // bits 0-7: kind, bits 8-15: flags (decoded in main0)
+    float f_rotation [[user(locn5), flat]];
+};
+
+// Rotates p by -angle (the matrix is built from cos(-angle) and sin(-angle)).
+static inline __attribute__((always_inline))
+float2 apply_rotation(thread const float2& p, thread const float& angle)
+{
+    float cr = cos(-angle);
+    float sr = sin(-angle);
+    return float2x2(float2(cr, sr), float2(-sr, cr)) * p;
+}
+
+// Signed distance from p to a box with half-size b and a separate radius per corner.
+// r holds four radii; the quadrant of p selects one: r.xy when p.x > 0 (else r.zw), then
+// .x when p.y > 0 (else .y). Note that r is overwritten while selecting.
+static inline __attribute__((always_inline))
+float sdRoundedBox(thread const float2& p, thread const float2& b, thread float4& r)
+{
+    float2 _61;
+    if (p.x > 0.0)
+    {
+        _61 = r.xy;
+    }
+    else
+    {
+        _61 = r.zw;
+    }
+    r.x = _61.x;
+    r.y = _61.y;
+    float _78;
+    if (p.y > 0.0)
+    {
+        _78 = r.x;
+    }
+    else
+    {
+        _78 = r.y;
+    }
+    r.x = _78;
+    // r.x now holds the radius of the corner nearest to p.
+    float2 q = (abs(p) - b) + float2(r.x);
+    return (fast::min(fast::max(q.x, q.y), 0.0) + length(fast::max(q, float2(0.0)))) - r.x;
+}
+
+// Turns a filled-shape distance into the distance to a band of width stroke_width centred on
+// the shape's outline (d == 0).
+static inline __attribute__((always_inline))
+float sdf_stroke(thread const float& d, thread const float& stroke_width)
+{
+    return abs(d) - (stroke_width * 0.5);
+}
+
+// Signed distance from p to a circle of radius r centred at the origin (negative inside).
+static inline __attribute__((always_inline))
+float sdCircle(thread const float2& p, thread const float& r)
+{
+    return length(p) - r;
+}
+
+// Signed distance from p to an origin-centred ellipse with semi-axes ab.
+// p and ab are modified in place (mirrored into the first quadrant and possibly swapped).
+static inline __attribute__((always_inline))
+float sdEllipse(thread float2& p, thread float2& ab)
+{
+    // The closed-form solution below divides by (ab.y^2 - ab.x^2), which is zero when both
+    // semi-axes are equal. Treat that case as a circle instead of producing NaN.
+    if (abs(ab.x - ab.y) < 0.001)
+    {
+        return length(p) - ab.x;
+    }
+    p = abs(p);
+    if (p.x > p.y)
+    {
+        p = p.yx;
+        ab = ab.yx;
+    }
+    float l = (ab.y * ab.y) - (ab.x * ab.x);
+    float m = (ab.x * p.x) / l;
+    float m2 = m * m;
+    float n = (ab.y * p.y) / l;
+    float n2 = n * n;
+    float c = ((m2 + n2) - 1.0) / 3.0;
+    float c3 = (c * c) * c;
+    float q = c3 + ((m2 * n2) * 2.0);
+    float d = c3 + (m2 * n2);
+    float g = m + (m * n2);
+    float co;
+    if (d < 0.0)
+    {
+        float h = acos(q / c3) / 3.0;
+        float s = cos(h);
+        float t = sin(h) * 1.73205077648162841796875;
+        float rx = sqrt(((-c) * ((s + t) + 2.0)) + m2);
+        float ry = sqrt(((-c) * ((s - t) + 2.0)) + m2);
+        co = (((ry + (sign(l) * rx)) + (abs(g) / (rx * ry))) - m) / 2.0;
+    }
+    else
+    {
+        float h_1 = ((2.0 * m) * n) * sqrt(d);
+        float s_1 = sign(q + h_1) * powr(abs(q + h_1), 0.3333333432674407958984375);
+        float u = sign(q - h_1) * powr(abs(q - h_1), 0.3333333432674407958984375);
+        float rx_1 = (((-s_1) - u) - (c * 4.0)) + (2.0 * m2);
+        float ry_1 = (s_1 - u) * 1.73205077648162841796875;
+        float rm = sqrt((rx_1 * rx_1) + (ry_1 * ry_1));
+        co = (((ry_1 / sqrt(rm - rx_1)) + ((2.0 * g) / rm)) - m) / 2.0;
+    }
+    // r is the closest point on the ellipse; the sign makes the distance negative inside.
+    float2 r = ab * float2(co, sqrt(1.0 - (co * co)));
+    return length(r - p) * sign(p.y - r.y);
+}
+
+// Unsigned distance from p to the line segment a-b.
+static inline __attribute__((always_inline))
+float sdSegment(thread const float2& p, thread const float2& a, thread const float2& b)
+{
+    float2 pa = p - a;
+    float2 ba = b - a;
+    float len_sq = dot(ba, ba);
+    // A zero-length segment (a == b) would divide 0 by 0; treat it as the single point a.
+    float h = (len_sq > 0.0) ? fast::clamp(dot(pa, ba) / len_sq, 0.0, 1.0) : 0.0;
+    return length(pa - (ba * h));
+}
+
+// Maps a signed distance to coverage: 1 for d <= -soft, 0 for d >= soft, smooth in between.
+static inline __attribute__((always_inline))
+float sdf_alpha(thread const float& d, thread const float& soft)
+{
+    return 1.0 - smoothstep(-soft, soft, d);
+}
+
+// Fragment entry point.
+// kind (low byte of f_kind_flags) selects the path: 0 = textured (f_local_or_uv is a UV),
+// 1..6 = signed-distance shapes evaluated at f_local_or_uv. Bit 0 of flags (second byte)
+// turns a filled shape into a stroke. Any other kind leaves d at its large default, which
+// yields zero alpha.
+fragment main0_out main0(main0_in in [[stage_in]], texture2d<float> tex [[texture(0)]], sampler texSmplr [[sampler(0)]])
+{
+    main0_out out = {};
+    uint kind = in.f_kind_flags & 255u;
+    uint flags = (in.f_kind_flags >> 8u) & 255u;
+    if (kind == 0u)
+    {
+        out.out_color = in.f_color * tex.sample(texSmplr, in.f_local_or_uv);
+        return out;
+    }
+    float d = 1000000015047466219876688855040.0;
+    float soft = 1.0;
+    if (kind == 1u)
+    {
+        // Rounded rectangle: half-size in params.xy, four corner radii in params.zw + params2.xy
+        float2 b = in.f_params.xy;
+        float4 r = float4(in.f_params.zw, in.f_params2.xy);
+        soft = fast::max(in.f_params2.z, 1.0);
+        float stroke_px = in.f_params2.w;
+        float2 p_local = in.f_local_or_uv;
+        if (in.f_rotation != 0.0)
+        {
+            p_local = apply_rotation(p_local, in.f_rotation);
+        }
+        d = sdRoundedBox(p_local, b, r);
+        if ((flags & 1u) != 0u)
+        {
+            d = sdf_stroke(d, stroke_px);
+        }
+    }
+    else if (kind == 2u)
+    {
+        // Circle
+        float radius = in.f_params.x;
+        soft = fast::max(in.f_params.y, 1.0);
+        float stroke_px_1 = in.f_params.z;
+        d = sdCircle(in.f_local_or_uv, radius);
+        if ((flags & 1u) != 0u)
+        {
+            d = sdf_stroke(d, stroke_px_1);
+        }
+    }
+    else if (kind == 3u)
+    {
+        // Ellipse: semi-axes in params.xy
+        float2 ab = in.f_params.xy;
+        soft = fast::max(in.f_params.z, 1.0);
+        float stroke_px_2 = in.f_params.w;
+        float2 p_local_1 = in.f_local_or_uv;
+        if (in.f_rotation != 0.0)
+        {
+            p_local_1 = apply_rotation(p_local_1, in.f_rotation);
+        }
+        d = sdEllipse(p_local_1, ab);
+        if ((flags & 1u) != 0u)
+        {
+            d = sdf_stroke(d, stroke_px_2);
+        }
+    }
+    else if (kind == 4u)
+    {
+        // Segment with thickness: endpoints in params.xy / params.zw, width in params2.x
+        float2 a = in.f_params.xy;
+        float2 b_1 = in.f_params.zw;
+        float width = in.f_params2.x;
+        soft = fast::max(in.f_params2.y, 1.0);
+        d = sdSegment(in.f_local_or_uv, a, b_1) - (width * 0.5);
+    }
+    else if (kind == 5u)
+    {
+        // Ring / arc: inner and outer radius, start and end angle in radians
+        float inner = in.f_params.x;
+        float outer = in.f_params.y;
+        float start_rad = in.f_params.z;
+        float end_rad = in.f_params.w;
+        soft = fast::max(in.f_params2.x, 1.0);
+        float r_1 = length(in.f_local_or_uv);
+        float d_ring = fast::max(inner - r_1, r_1 - outer);
+        float angle = precise::atan2(in.f_local_or_uv.y, in.f_local_or_uv.x);
+        if (angle < 0.0)
+        {
+            angle += 6.283185482025146484375;
+        }
+        float ang_start = mod(start_rad, 6.283185482025146484375);
+        float ang_end = mod(end_rad, 6.283185482025146484375);
+        float in_arc;
+        if (ang_end > ang_start)
+        {
+            in_arc = float((angle >= ang_start) && (angle <= ang_end));
+        }
+        else
+        {
+            // The arc crosses the 0 / 2*pi seam.
+            in_arc = float((angle >= ang_start) || (angle <= ang_end));
+        }
+        // Full-circle test on the unwrapped span: after wrapping both angles into [0, 2*pi)
+        // their difference only approaches 2*pi when the start angle is near zero, so the
+        // tolerance would not apply to a (nearly) full ring that starts anywhere else.
+        if (abs(end_rad - start_rad) >= 6.282185077667236328125)
+        {
+            in_arc = 1.0;
+        }
+        d = (in_arc > 0.5) ? d_ring : 1000000015047466219876688855040.0;
+    }
+    else if (kind == 6u)
+    {
+        // Regular N-gon: rotation comes from params.y, not from f_rotation
+        float radius_1 = in.f_params.x;
+        float rotation = in.f_params.y;
+        float sides = in.f_params.z;
+        soft = fast::max(in.f_params.w, 1.0);
+        float stroke_px_3 = in.f_params2.x;
+        float2 p = in.f_local_or_uv;
+        float c = cos(rotation);
+        float s = sin(rotation);
+        p = float2x2(float2(c, -s), float2(s, c)) * p;
+        float an = 3.1415927410125732421875 / sides;
+        float bn = mod(precise::atan2(p.y, p.x), 2.0 * an) - an;
+        d = (length(p) * cos(bn)) - radius_1;
+        if ((flags & 1u) != 0u)
+        {
+            d = sdf_stroke(d, stroke_px_3);
+        }
+    }
+    float alpha = sdf_alpha(d, soft);
+    out.out_color = float4(in.f_color.xyz, in.f_color.w * alpha);
+    return out;
+}
+
@@ -0,0 +1,94 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+// Per-draw uniforms bound at buffer(0) of the vertex stage.
+struct Uniforms
+{
+    float4x4 projection;
+    float dpi_scale; // multiplies positions before projection
+    uint mode; // 0 selects the per-vertex path in main0, anything else the instanced primitive path
+};
+
+// Thread-local copy of one storage-buffer element (filled field by field in main0).
+struct Primitive
+{
+    float4 bounds; // xy = min corner, zw = max corner (interpolated with mix() in main0)
+    uint color; // packed 8-bit RGBA, unpacked with unpack_unorm4x8_to_float
+    uint kind_flags;
+    float rotation;
+    float _pad;
+    float4 params;
+    float4 params2;
+};
+
+// Element type of the device-memory primitive array; same fields as Primitive.
+struct Primitive_1
+{
+    float4 bounds;
+    uint color;
+    uint kind_flags;
+    float rotation;
+    float _pad;
+    float4 params;
+    float4 params2;
+};
+
+// Storage buffer bound at buffer(1). main0 indexes primitives[] by instance id, so the
+// declared length of 1 is only a placeholder for the real element count.
+struct Primitives
+{
+    Primitive_1 primitives[1];
+};
+
+// Vertex stage outputs, consumed by the fragment stage through matching user(locnN) locations.
+struct main0_out
+{
+    float4 f_color [[user(locn0)]];
+    float2 f_local_or_uv [[user(locn1)]]; // UV in mode 0, centre-relative scaled position otherwise
+    float4 f_params [[user(locn2)]];
+    float4 f_params2 [[user(locn3)]];
+    uint f_kind_flags [[user(locn4)]];
+    float f_rotation [[user(locn5)]];
+    float4 gl_Position [[position]];
+};
+
+// Per-vertex attributes. In the instanced path only v_position is read (as the quad corner).
+struct main0_in
+{
+    float2 v_position [[attribute(0)]];
+    float2 v_uv [[attribute(1)]];
+    float4 v_color [[attribute(2)]];
+};
+
+// Vertex entry point with two paths selected by the `mode` uniform:
+//   mode == 0: pass the vertex attributes through and zero all primitive-specific outputs.
+//   otherwise: read the primitive for this instance and place the vertex inside its bounds,
+//              using v_position as the interpolation factor between the min and max corners.
+vertex main0_out main0(main0_in in [[stage_in]], constant Uniforms& _12 [[buffer(0)]], const device Primitives& _72 [[buffer(1)]], uint gl_InstanceIndex [[instance_id]])
+{
+    main0_out out = {};
+    if (_12.mode == 0u)
+    {
+        out.f_color = in.v_color;
+        out.f_local_or_uv = in.v_uv;
+        out.f_params = float4(0.0);
+        out.f_params2 = float4(0.0);
+        out.f_kind_flags = 0u;
+        out.f_rotation = 0.0;
+        out.gl_Position = _12.projection * float4(in.v_position * _12.dpi_scale, 0.0, 1.0);
+    }
+    else
+    {
+        // Copy the instance's primitive out of device memory.
+        Primitive p;
+        p.bounds = _72.primitives[int(gl_InstanceIndex)].bounds;
+        p.color = _72.primitives[int(gl_InstanceIndex)].color;
+        p.kind_flags = _72.primitives[int(gl_InstanceIndex)].kind_flags;
+        p.rotation = _72.primitives[int(gl_InstanceIndex)].rotation;
+        p._pad = _72.primitives[int(gl_InstanceIndex)]._pad;
+        p.params = _72.primitives[int(gl_InstanceIndex)].params;
+        p.params2 = _72.primitives[int(gl_InstanceIndex)].params2;
+        float2 corner = in.v_position;
+        float2 world_pos = mix(p.bounds.xy, p.bounds.zw, corner);
+        float2 center = (p.bounds.xy + p.bounds.zw) * 0.5;
+        out.f_color = unpack_unorm4x8_to_float(p.color);
+        // Position relative to the centre of the bounds, scaled by dpi_scale.
+        out.f_local_or_uv = (world_pos - center) * _12.dpi_scale;
+        out.f_params = p.params;
+        out.f_params2 = p.params2;
+        out.f_kind_flags = p.kind_flags;
+        out.f_rotation = p.rotation;
+        out.gl_Position = _12.projection * float4(world_pos * _12.dpi_scale, 0.0, 1.0);
+    }
+    return out;
+}
+
@@ -0,0 +1,210 @@
+#version 450 core
+
+// --- Inputs from vertex shader ---
+layout(location = 0) in vec4 f_color;
+layout(location = 1) in vec2 f_local_or_uv;
+layout(location = 2) in vec4 f_params;
+layout(location = 3) in vec4 f_params2;
+layout(location = 4) flat in uint f_kind_flags;
+layout(location = 5) flat in float f_rotation;
+
+// --- Output ---
+layout(location = 0) out vec4 out_color;
+
+// --- Texture sampler (for tessellated/text path) ---
+layout(set = 2, binding = 0) uniform sampler2D tex;
+
+// ---------------------------------------------------------------------------
+// SDF helper functions (Inigo Quilez)
+// All operate in physical pixel space — no dpi_scale needed here.
+// ---------------------------------------------------------------------------
+
+const float PI = 3.14159265358979;
+
+// Signed distance to an origin-centred circle of radius r (negative inside).
+float sdCircle(vec2 p, float r) {
+    float dist_to_center = length(p);
+    return dist_to_center - r;
+}
+
+// Signed distance to a box of half-size b with one radius per corner.
+// The quadrant of p picks the radius: r.xy for p.x > 0 (else r.zw), then .x for p.y > 0 (else .y).
+float sdRoundedBox(vec2 p, vec2 b, vec4 r) {
+    vec2 side_radii = (p.x > 0.0) ? r.xy : r.zw;
+    float corner_radius = (p.y > 0.0) ? side_radii.x : side_radii.y;
+    vec2 q = abs(p) - b + corner_radius;
+    float inside = min(max(q.x, q.y), 0.0);
+    float outside = length(max(q, vec2(0.0)));
+    return inside + outside - corner_radius;
+}
+
+// Unsigned distance from p to the line segment a-b.
+float sdSegment(vec2 p, vec2 a, vec2 b) {
+    vec2 pa = p - a, ba = b - a;
+    float len_sq = dot(ba, ba);
+    // A zero-length segment (a == b) would divide 0 by 0; treat it as the single point a.
+    float h = (len_sq > 0.0) ? clamp(dot(pa, ba) / len_sq, 0.0, 1.0) : 0.0;
+    return length(pa - ba * h);
+}
+
+// Signed distance from p to an origin-centred ellipse with semi-axes ab (negative inside).
+float sdEllipse(vec2 p, vec2 ab) {
+    // The closed-form solution below divides by (ab.y^2 - ab.x^2), which is zero when both
+    // semi-axes are equal. Treat that case as a circle instead of producing NaN.
+    if (abs(ab.x - ab.y) < 1e-3) {
+        return length(p) - ab.x;
+    }
+    // Mirror into the first quadrant and order the axes so that p.x <= p.y.
+    p = abs(p);
+    if (p.x > p.y) {
+        p = p.yx;
+        ab = ab.yx;
+    }
+    float l = ab.y * ab.y - ab.x * ab.x;
+    float m = ab.x * p.x / l;
+    float m2 = m * m;
+    float n = ab.y * p.y / l;
+    float n2 = n * n;
+    float c = (m2 + n2 - 1.0) / 3.0;
+    float c3 = c * c * c;
+    float q = c3 + m2 * n2 * 2.0;
+    float d = c3 + m2 * n2;
+    float g = m + m * n2;
+    float co;
+    if (d < 0.0) {
+        float h = acos(q / c3) / 3.0;
+        float s = cos(h);
+        float t = sin(h) * sqrt(3.0);
+        float rx = sqrt(-c * (s + t + 2.0) + m2);
+        float ry = sqrt(-c * (s - t + 2.0) + m2);
+        co = (ry + sign(l) * rx + abs(g) / (rx * ry) - m) / 2.0;
+    } else {
+        float h = 2.0 * m * n * sqrt(d);
+        float s = sign(q + h) * pow(abs(q + h), 1.0 / 3.0);
+        float u = sign(q - h) * pow(abs(q - h), 1.0 / 3.0);
+        float rx = -s - u - c * 4.0 + 2.0 * m2;
+        float ry = (s - u) * sqrt(3.0);
+        float rm = sqrt(rx * rx + ry * ry);
+        co = (ry / sqrt(rm - rx) + 2.0 * g / rm - m) / 2.0;
+    }
+    // r is the closest point on the ellipse; the sign makes the distance negative inside.
+    vec2 r = ab * vec2(co, sqrt(1.0 - co * co));
+    return length(r - p) * sign(p.y - r.y);
+}
+
+// Converts a signed distance to coverage: 1 at d <= -soft, 0 at d >= soft, smooth in between.
+float sdf_alpha(float d, float soft) {
+    float outside_amount = smoothstep(-soft, soft, d);
+    return 1.0 - outside_amount;
+}
+
+// Distance to a band of width stroke_width centred on the outline (d == 0) of a filled shape.
+float sdf_stroke(float d, float stroke_width) {
+    float half_width = stroke_width * 0.5;
+    return abs(d) - half_width;
+}
+
+// Inverse-rotates a 2D point: p is turned by -angle, i.e. the sampling frame is rotated
+// opposite to the shape so an unrotated SDF can be evaluated for a rotated shape.
+vec2 apply_rotation(vec2 p, float angle) {
+    float cr = cos(-angle);
+    float sr = sin(-angle);
+    // Column-major: first column (cr, sr), second column (-sr, cr).
+    mat2 inverse_rot = mat2(vec2(cr, sr), vec2(-sr, cr));
+    return inverse_rot * p;
+}
+
+// ---------------------------------------------------------------------------
+// main
+// ---------------------------------------------------------------------------
+
+// Fragment entry point. The low byte of f_kind_flags selects the path (0 = textured,
+// 1..6 = SDF shapes); bit 0 of the second byte turns a filled shape into a stroke.
+// An unknown kind leaves d at 1e30, which yields zero alpha.
+void main() {
+    uint kind = f_kind_flags & 0xFFu;
+    uint flags = (f_kind_flags >> 8u) & 0xFFu;
+
+    // -----------------------------------------------------------------------
+    // Kind 0: Tessellated path. Texture multiply for text atlas,
+    //         white pixel for solid shapes.
+    // -----------------------------------------------------------------------
+    if (kind == 0u) {
+        out_color = f_color * texture(tex, f_local_or_uv);
+        return;
+    }
+
+    // -----------------------------------------------------------------------
+    // SDF path. f_local_or_uv = shape-centered position in physical pixels.
+    // All dimensional params are already in physical pixels (CPU pre-scaled).
+    // -----------------------------------------------------------------------
+    float d = 1e30;
+    float soft = 1.0;
+
+    if (kind == 1u) {
+        // RRect: rounded box
+        vec2 b = f_params.xy; // half_size (phys px)
+        vec4 r = vec4(f_params.zw, f_params2.xy); // corner radii: tr, br, tl, bl
+        soft = max(f_params2.z, 1.0);
+        float stroke_px = f_params2.w;
+
+        vec2 p_local = f_local_or_uv;
+        if (f_rotation != 0.0) {
+            p_local = apply_rotation(p_local, f_rotation);
+        }
+
+        d = sdRoundedBox(p_local, b, r);
+        if ((flags & 1u) != 0u) d = sdf_stroke(d, stroke_px);
+    }
+    else if (kind == 2u) {
+        // Circle — rotationally symmetric, no rotation needed
+        float radius = f_params.x;
+        soft = max(f_params.y, 1.0);
+        float stroke_px = f_params.z;
+
+        d = sdCircle(f_local_or_uv, radius);
+        if ((flags & 1u) != 0u) d = sdf_stroke(d, stroke_px);
+    }
+    else if (kind == 3u) {
+        // Ellipse
+        vec2 ab = f_params.xy;
+        soft = max(f_params.z, 1.0);
+        float stroke_px = f_params.w;
+
+        vec2 p_local = f_local_or_uv;
+        if (f_rotation != 0.0) {
+            p_local = apply_rotation(p_local, f_rotation);
+        }
+
+        d = sdEllipse(p_local, ab);
+        if ((flags & 1u) != 0u) d = sdf_stroke(d, stroke_px);
+    }
+    else if (kind == 4u) {
+        // Segment (capsule line) — no rotation (excluded)
+        vec2 a = f_params.xy; // already in local physical pixels
+        vec2 b = f_params.zw;
+        float width = f_params2.x;
+        soft = max(f_params2.y, 1.0);
+
+        d = sdSegment(f_local_or_uv, a, b) - width * 0.5;
+    }
+    else if (kind == 5u) {
+        // Ring / Arc — rotation handled by CPU angle offset, no shader rotation
+        float inner = f_params.x;
+        float outer = f_params.y;
+        float start_rad = f_params.z;
+        float end_rad = f_params.w;
+        soft = max(f_params2.x, 1.0);
+
+        float r = length(f_local_or_uv);
+        float d_ring = max(inner - r, r - outer);
+
+        // Angular clip: everything is compared in [0, 2*PI)
+        float angle = atan(f_local_or_uv.y, f_local_or_uv.x);
+        if (angle < 0.0) angle += 2.0 * PI;
+        float ang_start = mod(start_rad, 2.0 * PI);
+        float ang_end = mod(end_rad, 2.0 * PI);
+
+        float in_arc;
+        if (ang_end > ang_start) {
+            in_arc = (angle >= ang_start && angle <= ang_end) ? 1.0 : 0.0;
+        } else {
+            // The arc crosses the 0 / 2*PI seam.
+            in_arc = (angle >= ang_start || angle <= ang_end) ? 1.0 : 0.0;
+        }
+
+        // Full-circle test on the unwrapped span: after wrapping both angles into [0, 2*PI)
+        // their difference only approaches 2*PI when the start angle is near zero, so the
+        // tolerance would not apply to a (nearly) full ring that starts anywhere else.
+        if (abs(end_rad - start_rad) >= 2.0 * PI - 0.001) in_arc = 1.0;
+
+        d = in_arc > 0.5 ? d_ring : 1e30;
+    }
+    else if (kind == 6u) {
+        // Regular N-gon — has its own rotation in params, no Primitive.rotation used
+        float radius = f_params.x;
+        float rotation = f_params.y;
+        float sides = f_params.z;
+        soft = max(f_params.w, 1.0);
+        float stroke_px = f_params2.x;
+
+        vec2 p = f_local_or_uv;
+        float c = cos(rotation), s = sin(rotation);
+        p = mat2(c, -s, s, c) * p;
+
+        // Fold the angle into one sector of width 2*an, then measure against that sector's edge.
+        float an = PI / sides;
+        float bn = mod(atan(p.y, p.x), 2.0 * an) - an;
+        d = length(p) * cos(bn) - radius;
+
+        if ((flags & 1u) != 0u) d = sdf_stroke(d, stroke_px);
+    }
+
+    // Alpha is not premultiplied into the colour channels.
+    float alpha = sdf_alpha(d, soft);
+    out_color = vec4(f_color.rgb, f_color.a * alpha);
+}
@@ -0,0 +1,67 @@
+#version 450 core
+
+// ---------- Vertex attributes (used in both modes) ----------
+layout(location = 0) in vec2 v_position;
+layout(location = 1) in vec2 v_uv;
+layout(location = 2) in vec4 v_color;
+
+// ---------- Outputs to fragment shader ----------
+layout(location = 0) out vec4 f_color;
+layout(location = 1) out vec2 f_local_or_uv;
+layout(location = 2) out vec4 f_params;
+layout(location = 3) out vec4 f_params2;
+layout(location = 4) flat out uint f_kind_flags;
+layout(location = 5) flat out float f_rotation;
+
+// ---------- Uniforms (single block — avoids spirv-cross reordering on Metal) ----------
+layout(set = 1, binding = 0) uniform Uniforms {
+    mat4 projection;
+    float dpi_scale;
+    uint mode; // 0 = tessellated, 1 = SDF
+};
+
+// ---------- SDF primitive storage buffer ----------
+// One SDF instance as stored in the `Primitives` storage buffer.
+// The byte ranges below add up to 64 bytes per element.
+struct Primitive {
+    vec4 bounds; // 0-15:  min_x, min_y, max_x, max_y
+    uint color; // 16-19: packed u8x4 (unpack with unpackUnorm4x8)
+    uint kind_flags; // 20-23: kind | (flags << 8)
+    float rotation; // 24-27: shader self-rotation in radians
+    float _pad; // 28-31: alignment padding
+    vec4 params; // 32-47: shape params part 1
+    vec4 params2; // 48-63: shape params part 2
+};
+
+layout(std430, set = 0, binding = 0) readonly buffer Primitives {
+    Primitive primitives[];
+};
+
+// ---------- Entry point ----------
+// Vertex entry point. `mode` selects between the instanced SDF path and the per-vertex
+// tessellated path; both scale positions by dpi_scale before applying the projection.
+void main() {
+    if (mode != 0u) {
+        // ---- SDF instanced quads: one Primitive per instance ----
+        Primitive prim = primitives[gl_InstanceIndex];
+
+        // v_position holds the unit quad corner in (0,0)-(1,1) and interpolates the bounds.
+        vec2 world_pos = mix(prim.bounds.xy, prim.bounds.zw, v_position);
+        vec2 center = 0.5 * (prim.bounds.xy + prim.bounds.zw);
+
+        gl_Position = projection * vec4(world_pos * dpi_scale, 0.0, 1.0);
+
+        f_color = unpackUnorm4x8(prim.color);
+        f_local_or_uv = (world_pos - center) * dpi_scale; // shape-centered physical pixels
+        f_params = prim.params;
+        f_params2 = prim.params2;
+        f_kind_flags = prim.kind_flags;
+        f_rotation = prim.rotation;
+        return;
+    }
+
+    // ---- Tessellated (legacy): pass attributes through, no primitive data ----
+    gl_Position = projection * vec4(v_position * dpi_scale, 0.0, 1.0);
+
+    f_color = v_color;
+    f_local_or_uv = v_uv;
+    f_params = vec4(0.0);
+    f_params2 = vec4(0.0);
+    f_kind_flags = 0u;
+    f_rotation = 0.0;
+}
@@ -0,0 +1,312 @@
+package draw
+
+import "core:c"
+import "core:log"
+import "core:strings"
+import sdl "vendor:sdl3"
+import sdl_ttf "vendor:sdl3/ttf"
+
+// Handle for a registered font. `register_font` hands these out and `get_font` uses the value
+// as an index into `Text_Cache.font_bytes`.
+Font_Id :: u16
+
+// Lookup key for opened SDL fonts: one `^sdl_ttf.Font` is kept per (font id, size) pair.
+Font_Key :: struct {
+	id:   Font_Id,
+	size: u16,
+}
+
+// Tags which subsystem owns a cached text id, so ids from the two sources never collide.
+Cache_Source :: enum u8 {
+	// Ids supplied by callers through the `text` proc's `id` parameter.
+	Custom,
+	// Ids used by the Clay render path.
+	Clay,
+}
+
+// Key into `Text_Cache.cache`: a u32 id qualified by the subsystem that issued it.
+Cache_Key :: struct {
+	id:     u32,
+	source: Cache_Source,
+}
+
+// All state owned by the text subsystem.
+Text_Cache :: struct {
+	// GPU text engine; `register_font` treats nil as "text system not initialized".
+	engine:     ^sdl_ttf.TextEngine,
+	// Font file data as passed to `register_font`, indexed by Font_Id.
+	font_bytes: [dynamic][]u8,
+	// Fonts opened lazily by `get_font`, one per (id, size).
+	sdl_fonts:  map[Font_Key]^sdl_ttf.Font,
+	// Text objects kept alive across frames, keyed by (id, source).
+	cache:      map[Cache_Key]^sdl_ttf.Text,
+}
+
+// Internal: returns the SDL_ttf font for `id` at point size `size`, opening it on first use.
+//
+// Opened fonts are cached per (id, size) in `GLOB.text_cache.sdl_fonts`. On a miss a new font is
+// opened over the bytes registered for `id` and configured for the current DPI scale.
+//
+// Asserts that `id` is inside the registered range, and panics (via `log.panicf`) if the font
+// bytes are missing or any SDL / SDL_ttf call fails.
+get_font :: proc(id: Font_Id, size: u16) -> ^sdl_ttf.Font {
+	assert(int(id) < len(GLOB.text_cache.font_bytes), "Invalid font ID.")
+	key := Font_Key{id, size}
+	if font := GLOB.text_cache.sdl_fonts[key]; font != nil {
+		return font
+	}
+
+	log.debug("Font with id:", id, "and size:", size, "not found. Adding..")
+
+	font_bytes := GLOB.text_cache.font_bytes[id]
+	if font_bytes == nil {
+		log.panicf("Font must first be registered with register_font before using (id=%d)", id)
+	}
+
+	font_io := sdl.IOFromConstMem(raw_data(font_bytes[:]), len(font_bytes))
+	if font_io == nil {
+		log.panicf("Failed to create IOStream for font id=%d: %s", id, sdl.GetError())
+	}
+
+	// `true` asks SDL_ttf to close `font_io` together with the font.
+	sdl_font := sdl_ttf.OpenFontIO(font_io, true, f32(size))
+	if sdl_font == nil {
+		log.panicf("Failed to create SDL font for font id=%d size=%d: %s", id, size, sdl.GetError())
+	}
+
+	// Scale the 72 DPI baseline *before* converting to an integer. Converting the scale factor
+	// first truncates fractional display scales (1.5 -> 1), so glyphs would be rasterized at the
+	// wrong size while `measure_text` still divides by the fractional factor.
+	dpi := i32(72 * GLOB.dpi_scaling + 0.5)
+	if !sdl_ttf.SetFontSizeDPI(sdl_font, f32(size), dpi, dpi) {
+		log.panicf("Failed to set font DPI for font id=%d size=%d: %s", id, size, sdl.GetError())
+	}
+
+	GLOB.text_cache.sdl_fonts[key] = sdl_font
+	return sdl_font
+}
+
+// Registers raw font file data and returns the `Font_Id` to use with `text` / `measure_text`.
+//
+// The slice is stored as-is (not copied), and `get_font` reads it later when a font is first
+// opened at a given size, so the caller must keep the memory alive.
+// Returns `ok = false` once every id representable by `Font_Id` has been handed out.
+// Panics if the text system has not been initialized.
+register_font :: proc(bytes: []u8) -> (id: Font_Id, ok: bool) #optional_ok {
+	if GLOB.text_cache.engine == nil {
+		log.panicf("Cannot register font: text system not initialized. Call init() first.")
+	}
+
+	id_space_exhausted := len(GLOB.text_cache.font_bytes) > int(max(Font_Id))
+	if id_space_exhausted {
+		return 0, false
+	}
+
+	log.debug("Registering font...")
+	append(&GLOB.text_cache.font_bytes, bytes)
+
+	// The new font sits at the end of the list; its index is its id.
+	id = Font_Id(len(GLOB.text_cache.font_bytes) - 1)
+	ok = true
+	return
+}
+
+// A text object queued for drawing, together with where and in what color to draw it.
+Text :: struct {
+	sdl_text: ^sdl_ttf.Text,
+	// The `text` proc passes {0, 0} here when a pivot/rotation transform carries the placement.
+	position: [2]f32,
+	color:    Color,
+}
+
+// ---------------------------------------------------------------------------------------------------------------------
+// ----- Text cache lookup -------------
+// ---------------------------------------------------------------------------------------------------------------------
+
+// Shared cache lookup/create/update logic used by both the `text` proc and the Clay render path.
+//
+// On a miss a new TTF_Text is created for `c_str` with `font` and stored under `key`.
+// On a hit the existing object is re-pointed at `font` and its string is replaced, so an entry
+// reused with a different font or size does not keep rendering with the stale font.
+//
+// Returns the cached (or newly created) TTF_Text pointer. Panics if an SDL_ttf call fails.
+@(private)
+cache_get_or_update :: proc(key: Cache_Key, c_str: cstring, font: ^sdl_ttf.Font) -> ^sdl_ttf.Text {
+	if existing, found := GLOB.text_cache.cache[key]; found {
+		// The cached object remembers the font it was created with; keep it in step with the
+		// font requested for this draw.
+		if !sdl_ttf.SetTextFont(existing, font) {
+			log.panicf("Failed to update SDL text font: %s", sdl.GetError())
+		}
+		if !sdl_ttf.SetTextString(existing, c_str, 0) {
+			log.panicf("Failed to update SDL text string: %s", sdl.GetError())
+		}
+		return existing
+	}
+
+	sdl_text := sdl_ttf.CreateText(GLOB.text_cache.engine, font, c_str, 0)
+	if sdl_text == nil {
+		log.panicf("Failed to create SDL text: %s", sdl.GetError())
+	}
+	GLOB.text_cache.cache[key] = sdl_text
+	return sdl_text
+}
+
+// ---------------------------------------------------------------------------------------------------------------------
+// ----- Text drawing ------------------
+// ---------------------------------------------------------------------------------------------------------------------
+
+// Draws `text_string` on `layer` at `position`, optionally rotated about `origin`.
+//
+// Caching:
+//   * `id == nil` (default): a fresh TTF_Text is created for this call and queued for
+//     destruction once the frame has been submitted — simple and leak-free, appropriate for
+//     HUDs and moderate UI (up to ~50 text elements per frame).
+//   * `id` set: the TTF_Text is kept across frames, keyed by the given u32. This avoids
+//     per-frame HarfBuzz shaping and allocation, which matters for text-heavy apps (editors,
+//     terminals, chat). The caller picks a unique id per logical text element and releases
+//     entries with `clear_text_cache` or `clear_text_cache_entry`. Custom ids live in a
+//     separate namespace from Clay text ids, so the two cannot collide.
+//
+// `origin` is in pixels from the text block's top-left corner (raylib convention); the point
+// whose local coordinates equal `origin` lands at `position` in world space.
+// `rotation` is in degrees, counter-clockwise.
+// `temp_allocator` is used for the NUL-terminated copy of `text_string`.
+text :: proc(
+	layer: ^Layer,
+	text_string: string,
+	position: [2]f32,
+	font_id: Font_Id,
+	font_size: u16 = 44,
+	color: Color = BLACK,
+	origin: [2]f32 = {0, 0},
+	rotation: f32 = 0,
+	id: Maybe(u32) = nil,
+	temp_allocator := context.temp_allocator,
+) {
+	c_str := strings.clone_to_cstring(text_string, temp_allocator)
+	font := get_font(font_id, font_size)
+
+	sdl_text: ^sdl_ttf.Text
+	cache_id, is_cached := id.?
+	if is_cached {
+		sdl_text = cache_get_or_update(Cache_Key{cache_id, .Custom}, c_str, font)
+	} else {
+		sdl_text = sdl_ttf.CreateText(GLOB.text_cache.engine, font, c_str, 0)
+		if sdl_text == nil {
+			log.panicf("Failed to create SDL text: %s", sdl.GetError())
+		}
+	}
+
+	if !needs_transform(origin, rotation) {
+		prepare_text(layer, Text{sdl_text, position, color})
+	} else {
+		// The transform carries the placement, so the text itself is queued at the origin.
+		scale := GLOB.dpi_scaling
+		pivot_transform := build_pivot_rotation(position * scale, origin * scale, rotation)
+		prepare_text_transformed(layer, Text{sdl_text, {0, 0}, color}, pivot_transform)
+	}
+
+	if !is_cached {
+		// Don't destroy now — the draw data (atlas texture, vertices) is still referenced
+		// by the batch buffers until end() submits to the GPU. Deferred to clear_global().
+		append(&GLOB.tmp_uncached_text, sdl_text)
+	}
+}
+
+// ---------------------------------------------------------------------------------------------------------------------
+// ----- Public text measurement -------
+// ---------------------------------------------------------------------------------------------------------------------
+
+// Returns the {width, height} of `text_string` in logical pixels (pre-DPI-scaling), measured
+// with the same font backend the renderer uses.
+// `allocator` is used for the NUL-terminated copy of the string. Panics if SDL_ttf fails.
+measure_text :: proc(
+	text_string: string,
+	font_id: Font_Id,
+	font_size: u16 = 44,
+	allocator := context.temp_allocator,
+) -> [2]f32 {
+	c_str := strings.clone_to_cstring(text_string, allocator)
+
+	pixel_width, pixel_height: c.int
+	measured := sdl_ttf.GetStringSize(get_font(font_id, font_size), c_str, 0, &pixel_width, &pixel_height)
+	if !measured {
+		log.panicf("Failed to measure text: %s", sdl.GetError())
+	}
+
+	// Convert the measured size back to logical pixels.
+	physical_size := [2]f32{f32(pixel_width), f32(pixel_height)}
+	return physical_size / GLOB.dpi_scaling
+}
+
+// ---------------------------------------------------------------------------------------------------------------------
+// ----- Text anchor helpers -----------
+// ---------------------------------------------------------------------------------------------------------------------
+
+// Anchor at the center of the measured text block, in logical pixels from its top-left corner.
+center_of_text :: proc(text_string: string, font_id: Font_Id, font_size: u16 = 44) -> [2]f32 {
+	return measure_text(text_string, font_id, font_size) * 0.5
+}
+
+// Anchor at the top-left corner of the text block. Always {0, 0}; the parameters are unused and
+// exist only so the signature matches the other anchor helpers.
+top_left_of_text :: proc(text_string: string, font_id: Font_Id, font_size: u16 = 44) -> [2]f32 {
+	top_left: [2]f32
+	return top_left
+}
+
+// Anchor at the middle of the text block's top edge.
+top_of_text :: proc(text_string: string, font_id: Font_Id, font_size: u16 = 44) -> [2]f32 {
+	extent := measure_text(text_string, font_id, font_size)
+	half_width := extent.x * 0.5
+	return {half_width, 0}
+}
+
+// Anchor at the top-right corner of the text block.
+top_right_of_text :: proc(text_string: string, font_id: Font_Id, font_size: u16 = 44) -> [2]f32 {
+	extent := measure_text(text_string, font_id, font_size)
+	return {extent.x, 0}
+}
+
+// Anchor at the middle of the text block's left edge.
+left_of_text :: proc(text_string: string, font_id: Font_Id, font_size: u16 = 44) -> [2]f32 {
+	extent := measure_text(text_string, font_id, font_size)
+	half_height := extent.y * 0.5
+	return {0, half_height}
+}
+
+// Anchor at the middle of the text block's right edge.
+right_of_text :: proc(text_string: string, font_id: Font_Id, font_size: u16 = 44) -> [2]f32 {
+	extent := measure_text(text_string, font_id, font_size)
+	half_height := extent.y * 0.5
+	return {extent.x, half_height}
+}
+
+// Anchor at the bottom-left corner of the text block.
+bottom_left_of_text :: proc(text_string: string, font_id: Font_Id, font_size: u16 = 44) -> [2]f32 {
+	extent := measure_text(text_string, font_id, font_size)
+	return {0, extent.y}
+}
+
+// Anchor at the middle of the text block's bottom edge.
+bottom_of_text :: proc(text_string: string, font_id: Font_Id, font_size: u16 = 44) -> [2]f32 {
+	extent := measure_text(text_string, font_id, font_size)
+	half_width := extent.x * 0.5
+	return {half_width, extent.y}
+}
+
+// Anchor at the bottom-right corner of the text block, i.e. its full measured size.
+bottom_right_of_text :: proc(text_string: string, font_id: Font_Id, font_size: u16 = 44) -> [2]f32 {
+	return measure_text(text_string, font_id, font_size)
+}
+
+// ---------------------------------------------------------------------------------------------------------------------
+// ----- Cache management --------------
+// ---------------------------------------------------------------------------------------------------------------------
+
+// Destroys every cached text object — custom and Clay entries alike — and empties the cache.
+// Call on scene transitions, view changes, or periodically in apps that keep producing new
+// cached text entries. Later draws that pass an `id` simply re-create their entries.
+// NOTE(review): cached objects drawn earlier in the current frame may still be referenced by
+// pending batches — confirm this is only called between frames.
+clear_text_cache :: proc() {
+	for _, cached_text in GLOB.text_cache.cache {
+		sdl_ttf.DestroyText(cached_text)
+	}
+	// Drop the now-dangling pointers; the map itself is kept for reuse.
+	clear_map(&GLOB.text_cache.cache)
+}
+
+// Destroys the cached custom text entry registered under `id` (the value passed as the `text`
+// proc's `id` parameter). Only custom entries are addressable; Clay entries are managed
+// internally. Does nothing when `id` is not in the cache.
+clear_text_cache_entry :: proc(id: u32) {
+	key := Cache_Key{id, .Custom}
+
+	cached_text, is_cached := GLOB.text_cache.cache[key]
+	if !is_cached {
+		return
+	}
+
+	sdl_ttf.DestroyText(cached_text)
+	delete_key(&GLOB.text_cache.cache, key)
+}
+
+// ---------------------------------------------------------------------------------------------------------------------
+// ----- Internal cache lifecycle ------
+// ---------------------------------------------------------------------------------------------------------------------
+
+// Initializes SDL_ttf and the GPU text engine for `device`, and returns a ready `Text_Cache`.
+//
+// Every container in the cache is created with `allocator`. Leaving `font_bytes` or `sdl_fonts`
+// zero-valued would make them allocate lazily from whatever `context.allocator` happens to be
+// active at the first insert, which need not be the allocator the caller asked for.
+//
+// Returns `ok = false` (after logging the SDL error) if SDL_ttf or the engine cannot be set up;
+// in that case SDL_ttf is left shut down and `text_cache` is zero-valued.
+@(private, require_results)
+init_text_cache :: proc(
+	device: ^sdl.GPUDevice,
+	allocator := context.allocator,
+) -> (
+	text_cache: Text_Cache,
+	ok: bool,
+) {
+	log.debug("Initializing text state")
+	if !sdl_ttf.Init() {
+		log.errorf("Failed to initialize SDL_ttf: %s", sdl.GetError())
+		return text_cache, false
+	}
+
+	engine := sdl_ttf.CreateGPUTextEngine(device)
+	if engine == nil {
+		log.errorf("Failed to create GPU text engine: %s", sdl.GetError())
+		// Undo the successful Init above so a failed setup leaves nothing behind.
+		sdl_ttf.Quit()
+		return text_cache, false
+	}
+	sdl_ttf.SetGPUTextEngineWinding(engine, .COUNTER_CLOCKWISE)
+
+	text_cache = Text_Cache {
+		engine     = engine,
+		font_bytes = make([dynamic][]u8, allocator = allocator),
+		sdl_fonts  = make(map[Font_Key]^sdl_ttf.Font, allocator = allocator),
+		cache      = make(map[Cache_Key]^sdl_ttf.Text, allocator = allocator),
+	}
+
+	log.debug("Done initializing text cache")
+	return text_cache, true
+}
+
+// Tears down the text subsystem: closes every opened font, destroys every cached text object,
+// frees the cache containers, destroys the GPU text engine and shuts SDL_ttf down.
+//
+// All fields of `GLOB.text_cache` are reset afterwards so no dangling handle survives. In
+// particular `engine` becomes nil again, which is what `register_font` checks to detect an
+// uninitialized text system.
+destroy_text_cache :: proc() {
+	for _, font in GLOB.text_cache.sdl_fonts {
+		sdl_ttf.CloseFont(font)
+	}
+	for _, sdl_text in GLOB.text_cache.cache {
+		sdl_ttf.DestroyText(sdl_text)
+	}
+	delete(GLOB.text_cache.sdl_fonts)
+	delete(GLOB.text_cache.font_bytes)
+	delete(GLOB.text_cache.cache)
+	sdl_ttf.DestroyGPUTextEngine(GLOB.text_cache.engine)
+	sdl_ttf.Quit()
+
+	// Everything above is released; forget the stale handles.
+	GLOB.text_cache.engine = nil
+	GLOB.text_cache.sdl_fonts = nil
+	GLOB.text_cache.font_bytes = nil
+	GLOB.text_cache.cache = nil
+}
@@ -0,0 +1,141 @@
+package meta
+
+import "core:fmt"
+import "core:os"
+import "core:strings"
+
+// Compiles every GLSL shader file (`.vert` / `.frag`) found directly in source_dir to both
+// SPIR-V (`<name>.spv`) and Metal Shading Language (`<name>.metal`) inside generated_dir.
+//
+// generated_dir is deleted and recreated on every run, so ALL of its previous contents are
+// discarded — not just files whose names match a current shader. Never point it at a directory
+// that holds anything other than generated output.
+//
+// Requires `glslangValidator` and `spirv-cross` on PATH.
+// Returns true only when every shader made it through both compilation stages; failures are
+// reported on stderr and the remaining shaders are still attempted.
+gen_shaders :: proc(source_dir, generated_dir: string) -> (success: bool) {
+	if !verify_shader_tool("glslangValidator") do return false
+	if !verify_shader_tool("spirv-cross") do return false
+
+	source_entries, read_err := os.read_all_directory_by_path(source_dir, context.temp_allocator)
+	if read_err != nil {
+		fmt.eprintfln("Failed to read shader source directory '%s': %v", source_dir, read_err)
+		return false
+	}
+	shader_names := make([dynamic]string, len = 0, cap = 24, allocator = context.temp_allocator)
+
+	for entry in source_entries {
+		// A directory that happens to be named like a shader is not a shader source; passing it
+		// to the compiler would only produce a confusing failure.
+		if entry.type == .Directory do continue
+
+		if strings.has_suffix(entry.name, ".vert") || strings.has_suffix(entry.name, ".frag") {
+			append(&shader_names, entry.name)
+		}
+	}
+
+	if len(shader_names) == 0 {
+		fmt.eprintfln("No shader source files (.vert, .frag) found in '%s'.", source_dir)
+		return false
+	}
+
+	// Start from an empty output directory so stale artifacts of removed shaders do not linger.
+	if os.exists(generated_dir) {
+		rmdir_err := os.remove_all(generated_dir)
+		if rmdir_err != nil {
+			fmt.eprintfln("Failed to remove old output directory '%s': %v", generated_dir, rmdir_err)
+			return false
+		}
+	}
+	mkdir_err := os.mkdir(generated_dir)
+	if mkdir_err != nil {
+		fmt.eprintfln("Failed to create output directory '%s': %v", generated_dir, mkdir_err)
+		return false
+	}
+
+	compiled_count := 0
+	for shader_name in shader_names {
+		source_path := fmt.tprintf("%s/%s", source_dir, shader_name)
+		spv_path := fmt.tprintf("%s/%s.spv", generated_dir, shader_name)
+		metal_path := fmt.tprintf("%s/%s.metal", generated_dir, shader_name)
+
+		fmt.printfln("[GLSL -> SPIR-V]  %s", shader_name)
+		if !compile_glsl_to_spirv(source_path, spv_path) do continue
+
+		// The MSL stage consumes the SPIR-V produced just above.
+		fmt.printfln("[SPIR-V -> MSL]   %s", shader_name)
+		if !compile_spirv_to_msl(spv_path, metal_path) do continue
+
+		compiled_count += 1
+	}
+
+	total := len(shader_names)
+	if compiled_count == total {
+		fmt.printfln("Successfully compiled all %d shaders.", total)
+		return true
+	}
+
+	fmt.eprintfln("%d of %d shaders failed to compile.", total - compiled_count, total)
+	return false
+}
+
+// Checks that `tool_name` can be launched by running `<tool_name> --version`.
+// Only the launch itself is checked; the tool's exit status and output are ignored.
+// On failure, prints an error plus install hints for the known tools and returns false.
+verify_shader_tool :: proc(tool_name: string) -> bool {
+	probe := os.Process_Desc{command = []string{tool_name, "--version"}}
+	_, _, _, launch_err := os.process_exec(probe, context.temp_allocator)
+	if launch_err == nil {
+		return true
+	}
+
+	fmt.eprintfln("Required tool '%s' not found on PATH.", tool_name)
+	switch tool_name {
+	case "glslangValidator":
+		fmt.eprintln("\tInstall the Vulkan SDK or the glslang package:")
+		fmt.eprintln("\t  macOS:   brew install glslang")
+		fmt.eprintln("\t  Arch:    sudo pacman -S glslang")
+		fmt.eprintln("\t  Debian:  sudo apt install glslang-tools")
+	case "spirv-cross":
+		fmt.eprintln("\tInstall SPIRV-Cross:")
+		fmt.eprintln("\t  macOS:   brew install spirv-cross")
+		fmt.eprintln("\t  Arch:    sudo pacman -S spirv-cross")
+		fmt.eprintln("\t  Debian:  sudo apt install spirv-cross")
+	}
+	return false
+}
+
+// Runs `glslangValidator -V <source_path> -o <output_path>`.
+// Returns false — after reporting the reason on stderr — if the tool cannot be launched or
+// exits unsuccessfully; in the latter case the tool's captured output is printed as well.
+compile_glsl_to_spirv :: proc(source_path, output_path: string) -> bool {
+	command := []string{"glslangValidator", "-V", source_path, "-o", output_path}
+	state, stdout_bytes, stderr_bytes, err := os.process_exec(os.Process_Desc{command = command}, context.temp_allocator)
+
+	switch {
+	case err != nil:
+		fmt.eprintfln("\tFailed to run glslangValidator for '%s': %v", source_path, err)
+		return false
+	case !state.success:
+		fmt.eprintfln("\tglslangValidator failed for '%s' (exit code %d):", source_path, state.exit_code)
+		print_tool_output(stdout_bytes, stderr_bytes)
+		return false
+	}
+
+	return true
+}
+
+// Runs `spirv-cross --msl <spv_path> --output <output_path>`.
+// Returns false — after reporting the reason on stderr — if the tool cannot be launched or
+// exits unsuccessfully; in the latter case the tool's captured output is printed as well.
+compile_spirv_to_msl :: proc(spv_path, output_path: string) -> bool {
+	command := []string{"spirv-cross", "--msl", spv_path, "--output", output_path}
+	state, stdout_bytes, stderr_bytes, err := os.process_exec(os.Process_Desc{command = command}, context.temp_allocator)
+
+	switch {
+	case err != nil:
+		fmt.eprintfln("\tFailed to run spirv-cross for '%s': %v", spv_path, err)
+		return false
+	case !state.success:
+		fmt.eprintfln("\tspirv-cross failed for '%s' (exit code %d):", spv_path, state.exit_code)
+		print_tool_output(stdout_bytes, stderr_bytes)
+		return false
+	}
+
+	return true
+}
+
+// Echoes a tool's captured output to stderr, tab-indented: stderr first, then stdout.
+// Trailing whitespace is trimmed and empty streams are skipped.
+print_tool_output :: proc(stdout_bytes, stderr_bytes: []u8) {
+	captured_err := strings.trim_right_space(string(stderr_bytes))
+	captured_out := strings.trim_right_space(string(stdout_bytes))
+
+	if len(captured_err) > 0 {
+		fmt.eprintfln("\t%s", captured_err)
+	}
+	if len(captured_out) > 0 {
+		fmt.eprintfln("\t%s", captured_out)
+	}
+}
@@ -0,0 +1,51 @@
+package meta
+
+import "core:fmt"
+import "core:os"
+
+// One sub-command of the meta tool.
+Command :: struct {
+	// Name matched against the first command-line argument.
+	name:        string,
+	// One-line summary shown by `print_usage`.
+	description: string,
+	// Executes the command; returning false makes `main` exit with status 1.
+	run:         proc() -> bool,
+}
+
+// Table of every available sub-command; `main` dispatches on it and `print_usage` lists it.
+COMMANDS :: []Command {
+	{
+		name = "gen-shaders",
+		description = "Compile GLSL shaders to SPIR-V and Metal Shading Language.",
+		run = proc() -> bool {
+			// Relative paths: these resolve against the current working directory.
+			return gen_shaders("draw/shaders/source", "draw/shaders/generated")
+		},
+	},
+}
+
+// Entry point: runs the command named by the first argument.
+// With no arguments the usage text is printed and the program returns normally; an unknown
+// command or a command that reports failure exits with status 1.
+main :: proc() {
+	args := os.args[1:]
+	if len(args) == 0 {
+		print_usage()
+		return
+	}
+
+	requested := args[0]
+	for entry in COMMANDS {
+		if entry.name != requested do continue
+
+		succeeded := entry.run()
+		if !succeeded do os.exit(1)
+		return
+	}
+
+	// Fell through the table: nothing matched.
+	fmt.eprintfln("Unknown command '%s'.", requested)
+	fmt.eprintln()
+	print_usage()
+	os.exit(1)
+}
+
+// Writes the usage banner and one line per entry of COMMANDS to stderr.
+print_usage :: proc() {
+	fmt.eprintln("Usage: meta <command>")
+	fmt.eprintln()
+	fmt.eprintln("Commands:")
+
+	for entry in COMMANDS {
+		// Name is left-aligned and padded to 20 columns so the descriptions line up.
+		fmt.eprintfln("  %-20s %s", entry.name, entry.description)
+	}
+}
@@ -0,0 +1,489 @@
+package clay
+
+import "core:c"
+
+// Pick the prebuilt Clay library to link for the current target OS / architecture.
+// NOTE(review): there is no fallback branch, so on any other target `Clay` is never declared —
+// presumably that surfaces as a compile error where the library is referenced; confirm.
+when ODIN_OS == .Windows {
+	foreign import Clay "windows/clay.lib"
+} else when ODIN_OS == .Linux {
+	foreign import Clay "linux/clay.a"
+} else when ODIN_OS == .Darwin {
+	// macOS ships separate archives for Apple Silicon and everything else.
+	when ODIN_ARCH == .arm64 {
+		foreign import Clay "macos-arm64/clay.a"
+	} else {
+		foreign import Clay "macos/clay.a"
+	}
+} else when ODIN_ARCH == .wasm32 || ODIN_ARCH == .wasm64p32 {
+	foreign import Clay "wasm/clay.o"
+}
+
+// Length-delimited string handed to Clay. `MakeString` builds one from an Odin string without
+// copying; `Text` additionally sets `isStaticallyAllocated` for compile-time string arguments.
+String :: struct {
+	isStaticallyAllocated: c.bool,
+	length: c.int32_t,
+	chars:  [^]c.char,
+}
+
+// String view passed to the text-measurement callback and carried in text render commands.
+// NOTE(review): `baseChars` presumably points at the start of the string this slice was cut
+// from — confirm against clay.h.
+StringSlice :: struct {
+	length: c.int32_t,
+	chars:  [^]c.char,
+	baseChars:  [^]c.char,
+}
+
+// Two C floats.
+Vector2 :: [2]c.float
+
+// A width/height pair.
+Dimensions :: struct {
+	width:  c.float,
+	height: c.float,
+}
+
+// Memory arena descriptor; returned by `CreateArenaWithCapacityAndMemory` and consumed by
+// `Initialize`.
+Arena :: struct {
+	nextAllocation: uintptr,
+	capacity:       c.size_t,
+	memory:         [^]c.char,
+}
+
+// Rectangle described by a position and a size.
+BoundingBox :: struct {
+	x:      c.float,
+	y:      c.float,
+	width:  c.float,
+	height: c.float,
+}
+
+// Four C floats.
+// NOTE(review): presumably RGBA; the value range is not visible here — confirm against clay.h.
+Color :: [4]c.float
+
+// One radius per corner.
+CornerRadius :: struct {
+	topLeft:     c.float,
+	topRight:    c.float,
+	bottomLeft:  c.float,
+	bottomRight: c.float,
+}
+
+// A border width paired with a color.
+BorderData :: struct {
+	width: u32,
+	color: Color,
+}
+
+// Element identifier as produced by `_HashString` (see `ID` / `ID_LOCAL`) and `GetElementId`.
+ElementId :: struct {
+	id:       u32,
+	offset:   u32,
+	baseId:   u32,
+	stringId: String,
+}
+
+// Integer type backing every enum in these bindings: u32 on Windows, u8 elsewhere.
+// NOTE(review): presumably chosen to match the enum size used by the prebuilt C library on each
+// platform — confirm before changing.
+when ODIN_OS == .Windows {
+	EnumBackingType :: u32
+} else {
+	EnumBackingType :: u8
+}
+
+// Kind of a `RenderCommand`, stored in its `commandType` field.
+RenderCommandType :: enum EnumBackingType {
+	None,
+	Rectangle,
+	Border,
+	Text,
+	Image,
+	ScissorStart,
+	ScissorEnd,
+	Custom,
+}
+
+// Configuration holding only a fill color.
+RectangleElementConfig :: struct {
+	color:        Color,
+}
+
+// Wrapping mode selectable through `TextElementConfig.wrapMode`.
+TextWrapMode :: enum EnumBackingType {
+	Words,
+	Newlines,
+	None,
+}
+
+// Alignment selectable through `TextElementConfig.textAlignment`.
+TextAlignment :: enum EnumBackingType {
+	Left,
+	Center,
+	Right,
+}
+
+// Text styling. Passed by pointer to `Text` / `TextDynamic` and to the measure-text callback;
+// `TextConfig` stores a value and returns the pointer.
+TextElementConfig :: struct {
+	userData:           rawptr,
+	textColor:          Color,
+	fontId:             u16,
+	fontSize:           u16,
+	letterSpacing:      u16,
+	lineHeight:         u16,
+	wrapMode:           TextWrapMode,
+	textAlignment:      TextAlignment,
+}
+
+// Aspect-ratio setting of an element (see `ElementDeclaration.aspectRatio`).
+AspectRatioElementConfig :: struct {
+	aspectRatio:        f32,
+}
+
+// Opaque image handle attached to an element; the same pointer type appears as
+// `ImageRenderData.imageData`.
+ImageElementConfig :: struct {
+	imageData:        rawptr,
+}
+
+// Opaque user payload attached to an element; the same pointer type appears as
+// `CustomRenderData.customData`.
+CustomElementConfig :: struct {
+	customData: rawptr,
+}
+
+// Border thickness per side, plus the thickness drawn between children.
+BorderWidth :: struct {
+	left: u16,
+	right: u16,
+	top: u16,
+	bottom: u16,
+	betweenChildren: u16,
+}
+
+// Border color and widths of an element.
+BorderElementConfig :: struct {
+	color: Color,
+	width: BorderWidth,
+}
+
+// Clipping / scrolling configuration of an element.
+ClipElementConfig :: struct {
+	horizontal:  bool, // clip overflowing elements on the "X" axis
+	vertical:    bool, // clip overflowing elements on the "Y" axis
+	childOffset: Vector2, // offsets the [X,Y] positions of all child elements, primarily for scrolling containers
+}
+
+// The nine attach points available to a floating element and to its parent.
+FloatingAttachPointType :: enum EnumBackingType {
+	LeftTop,
+	LeftCenter,
+	LeftBottom,
+	CenterTop,
+	CenterCenter,
+	CenterBottom,
+	RightTop,
+	RightCenter,
+	RightBottom,
+}
+
+// Attach point chosen on the floating element and on its parent.
+FloatingAttachPoints :: struct {
+	element: FloatingAttachPointType,
+	parent:  FloatingAttachPointType,
+}
+
+// Pointer-capture setting of a floating element.
+PointerCaptureMode :: enum EnumBackingType {
+	Capture,
+	Passthrough,
+}
+
+// What a floating element attaches to.
+FloatingAttachToElement :: enum EnumBackingType {
+	None,
+	Parent,
+	ElementWithId,
+	Root,
+}
+
+// What a floating element is clipped to.
+FloatingClipToElement :: enum EnumBackingType {
+	None,
+	AttachedParent,
+}
+
+// Floating-element configuration (see `ElementDeclaration.floating`).
+// NOTE(review): `parentId` is presumably only consulted when `attachTo` is `.ElementWithId` —
+// confirm against clay.h.
+FloatingElementConfig :: struct {
+	offset:             Vector2,
+	expand:             Dimensions,
+	parentId:           u32,
+	zIndex:             i16,
+	attachment:         FloatingAttachPoints,
+	pointerCaptureMode: PointerCaptureMode,
+	attachTo:           FloatingAttachToElement,
+	clipTo: 			FloatingClipToElement,
+}
+
+// Payload of a text render command.
+TextRenderData :: struct {
+	stringContents: StringSlice,
+	textColor: Color,
+	fontId: u16,
+	fontSize: u16,
+	letterSpacing: u16,
+	lineHeight: u16,
+}
+
+// Payload of a rectangle render command.
+RectangleRenderData :: struct {
+	backgroundColor: Color,
+	cornerRadius: CornerRadius,
+}
+
+// Payload of an image render command.
+ImageRenderData :: struct {
+	backgroundColor: Color,
+	cornerRadius: CornerRadius,
+	imageData: rawptr,
+}
+
+// Payload of a custom render command.
+CustomRenderData :: struct {
+	backgroundColor: Color,
+	cornerRadius: CornerRadius,
+	customData: rawptr,
+}
+
+// Payload of a border render command.
+BorderRenderData :: struct {
+	color: Color,
+	cornerRadius: CornerRadius,
+	width: BorderWidth,
+}
+
+// Untagged union of the per-kind payloads.
+// NOTE(review): the valid member is presumably selected by `RenderCommand.commandType` — confirm.
+RenderCommandData :: struct #raw_union {
+	rectangle: RectangleRenderData,
+	text: TextRenderData,
+	image: ImageRenderData,
+	custom: CustomRenderData,
+	border: BorderRenderData,
+}
+
+// One draw instruction; `EndLayout` returns an array of these.
+RenderCommand :: struct {
+	boundingBox:        BoundingBox,
+	renderData:         RenderCommandData,
+	userData:           rawptr,
+	id:                 u32,
+	zIndex:             i16,
+	commandType:        RenderCommandType,
+}
+
+// Result of `GetScrollContainerData`.
+ScrollContainerData :: struct {
+	// Note: This is a pointer to the real internal scroll position, mutating it may cause a change in final layout.
+	// Intended for use with external functionality that modifies scroll position, such as scroll bars or auto scrolling.
+	scrollPosition:            ^Vector2,
+	scrollContainerDimensions: Dimensions,
+	contentDimensions:         Dimensions,
+	config:                    ClipElementConfig,
+	// Indicates whether an actual scroll container matched the provided ID or if the default struct was returned.
+	found:                     bool,
+}
+
+// Result of `GetElementData`.
+// NOTE(review): `found` presumably reports whether an element matched the id, mirroring
+// `ScrollContainerData.found` — confirm.
+ElementData :: struct {
+	boundingBox: BoundingBox,
+	found:       bool,
+}
+
+// Pointer button state carried in `PointerData.state`.
+PointerDataInteractionState :: enum EnumBackingType {
+	PressedThisFrame,
+	Pressed,
+	ReleasedThisFrame,
+	Released,
+}
+
+// Pointer position and state handed to the `OnHover` callback.
+PointerData :: struct {
+	position: Vector2,
+	state:    PointerDataInteractionState,
+}
+
+// How an axis is sized; see the `SizingFit` / `SizingGrow` / `SizingPercent` / `SizingFixed`
+// constructors.
+SizingType :: enum EnumBackingType {
+	Fit,
+	Grow,
+	Percent,
+	Fixed,
+}
+
+// Lower and upper bound for one axis.
+SizingConstraintsMinMax :: struct {
+	min: c.float,
+	max: c.float,
+}
+
+// Untagged union: `sizePercent` is the member written by `SizingPercent`, `sizeMinMax` the one
+// written by the other sizing constructors.
+SizingConstraints :: struct #raw_union {
+	sizeMinMax:  SizingConstraintsMinMax,
+	sizePercent: c.float,
+}
+
+// Sizing rule for a single axis.
+SizingAxis :: struct {
+	// Note: `min` is used for CLAY_SIZING_PERCENT, slightly different to clay.h due to lack of C anonymous unions
+	constraints: SizingConstraints,
+	type:        SizingType,
+}
+
+// Sizing rules for both axes.
+Sizing :: struct {
+	width:  SizingAxis,
+	height: SizingAxis,
+}
+
+// Padding per side; `PaddingAll` fills all four with one value.
+Padding :: struct {
+	left: u16,
+	right: u16,
+	top: u16,
+	bottom: u16,
+}
+
+// Axis along which children are laid out.
+LayoutDirection :: enum EnumBackingType {
+	LeftToRight,
+	TopToBottom,
+}
+
+// Horizontal child alignment. Note the declaration order: Left, Right, Center.
+LayoutAlignmentX :: enum EnumBackingType {
+	Left,
+	Right,
+	Center,
+}
+
+// Vertical child alignment. Note the declaration order: Top, Bottom, Center.
+LayoutAlignmentY :: enum EnumBackingType {
+	Top,
+	Bottom,
+	Center,
+}
+
+// Child alignment on both axes.
+ChildAlignment :: struct {
+	x: LayoutAlignmentX,
+	y: LayoutAlignmentY,
+}
+
+// Layout settings of an element (see `ElementDeclaration.layout`).
+LayoutConfig :: struct {
+	sizing:          Sizing,
+	padding:         Padding,
+	childGap:        u16,
+	childAlignment:  ChildAlignment,
+	layoutDirection: LayoutDirection,
+}
+
+// Generic array header (capacity, length, pointer to the elements); `EndLayout` returns a
+// `ClayArray(RenderCommand)`.
+ClayArray :: struct($type: typeid) {
+	capacity:      i32,
+	length:        i32,
+	internalArray: [^]type,
+}
+
+// Full description of one element, passed by value to `_ConfigureOpenElement` (normally via
+// the proc returned from `UI`).
+ElementDeclaration :: struct {
+	id:              ElementId,
+	layout:          LayoutConfig,
+	backgroundColor: Color,
+	cornerRadius:    CornerRadius,
+	aspectRatio: 	 AspectRatioElementConfig,
+	image:           ImageElementConfig,
+	floating:        FloatingElementConfig,
+	custom:          CustomElementConfig,
+	clip:            ClipElementConfig,
+	border:          BorderElementConfig,
+	userData:        rawptr,
+}
+
+// Error categories reported through `ErrorData.errorType`.
+ErrorType :: enum EnumBackingType {
+	TextMeasurementFunctionNotProvided,
+	ArenaCapacityExceeded,
+	ElementsCapacityExceeded,
+	TextMeasurementCapacityExceeded,
+	DuplicateId,
+	FloatingContainerParentNotFound,
+	PercentageOver1,
+	InternalError,
+}
+
+// Argument passed to the error handler callback.
+ErrorData :: struct {
+	errorType: ErrorType,
+	errorText: String,
+	userData: rawptr,
+}
+
+// Error callback plus a user pointer, supplied to `Initialize`.
+// NOTE(review): `userData` is presumably echoed back in `ErrorData.userData` — confirm.
+ErrorHandler :: struct {
+	handler: proc "c" (errorData: ErrorData),
+	userData: rawptr,
+}
+
+Context :: struct {} // opaque structure, only use as a pointer
+
+// Public Clay entry points. `link_prefix` makes each name link against `Clay_<name>`, and every
+// procedure uses the C calling convention.
+@(link_prefix = "Clay_", default_calling_convention = "c")
+foreign Clay {
+	// Open / close an element; `UI` pairs these automatically.
+	_OpenElement :: proc() ---
+	_CloseElement :: proc() ---
+	MinMemorySize :: proc() -> u32 ---
+	CreateArenaWithCapacityAndMemory :: proc(capacity: c.size_t, offset: [^]u8) -> Arena ---
+	SetPointerState :: proc(position: Vector2, pointerDown: bool) ---
+	Initialize :: proc(arena: Arena, layoutDimensions: Dimensions, errorHandler: ErrorHandler) -> ^Context ---
+	GetCurrentContext :: proc() -> ^Context ---
+	SetCurrentContext :: proc(ctx: ^Context) ---
+	UpdateScrollContainers :: proc(enableDragScrolling: bool, scrollDelta: Vector2, deltaTime: c.float) ---
+	SetLayoutDimensions :: proc(dimensions: Dimensions) ---
+	BeginLayout :: proc() ---
+	EndLayout :: proc() -> ClayArray(RenderCommand) ---
+	GetElementId :: proc(id: String) -> ElementId ---
+	GetElementIdWithIndex :: proc(id: String, index: u32) -> ElementId ---
+	GetElementData :: proc(id: ElementId) -> ElementData ---
+	Hovered :: proc() -> bool ---
+	OnHover :: proc(onHoverFunction: proc "c" (id: ElementId, pointerData: PointerData, userData: rawptr), userData: rawptr) ---
+	PointerOver :: proc(id: ElementId) -> bool ---
+	GetScrollOffset :: proc() -> Vector2 ---
+	GetScrollContainerData :: proc(id: ElementId) -> ScrollContainerData ---
+	SetMeasureTextFunction :: proc(measureTextFunction: proc "c" (text: StringSlice, config: ^TextElementConfig, userData: rawptr) -> Dimensions, userData: rawptr) ---
+	SetQueryScrollOffsetFunction :: proc(queryScrollOffsetFunction: proc "c" (elementId: u32, userData: rawptr) -> Vector2, userData: rawptr) ---
+	RenderCommandArray_Get :: proc(array: ^ClayArray(RenderCommand), index: i32) -> ^RenderCommand ---
+	SetDebugModeEnabled :: proc(enabled: bool) ---
+	IsDebugModeEnabled :: proc() -> bool ---
+	SetCullingEnabled :: proc(enabled: bool) ---
+	GetMaxElementCount :: proc() -> i32 ---
+	SetMaxElementCount :: proc(maxElementCount: i32) ---
+	GetMaxMeasureTextCacheWordCount :: proc() -> i32 ---
+	SetMaxMeasureTextCacheWordCount :: proc(maxMeasureTextCacheWordCount: i32) ---
+	ResetMeasureTextCache :: proc() ---
+}
+
+// Package-private Clay entry points; callers outside this package go through the Odin wrappers
+// (`UI`, `Text`, `TextDynamic`, `TextConfig`, `ID`, `ID_LOCAL`) instead.
+@(link_prefix = "Clay_", default_calling_convention = "c", private)
+foreign Clay {
+	_ConfigureOpenElement :: proc(config: ElementDeclaration) ---
+	_HashString :: proc(key: String, offset: u32, seed: u32) -> ElementId ---
+	_OpenTextElement :: proc(text: String, textConfig: ^TextElementConfig) ---
+	_StoreTextElementConfig :: proc(config: TextElementConfig) -> ^TextElementConfig ---
+	_GetParentElementId :: proc() -> u32 ---
+}
+
+// Applies `config` to the currently open element.
+// Always returns true so that the call can be used as the condition of an `if` statement whose
+// body declares the element's children.
+ConfigureOpenElement :: proc(config: ElementDeclaration) -> bool {
+	_ConfigureOpenElement(config)
+	return true
+}
+
+// Opens a new element and returns the procedure used to configure it.
+// `deferred_none` schedules `_CloseElement` to run when the scope that called `UI` ends, so the
+// element is closed automatically after its children have been declared.
+@(deferred_none = _CloseElement)
+UI :: proc() -> proc (config: ElementDeclaration) -> bool {
+	_OpenElement()
+	return ConfigureOpenElement
+}
+
+// Declares a text element for a compile-time-known string.
+// The string is flagged as statically allocated before it is handed to Clay.
+Text :: proc($text: string, config: ^TextElementConfig) {
+	static_label := MakeString(text)
+	static_label.isStaticallyAllocated = true
+
+	_OpenTextElement(static_label, config)
+}
+
+// Declares a text element for a runtime string. Unlike `Text`, the string is not flagged as
+// statically allocated.
+TextDynamic :: proc(text: string, config: ^TextElementConfig) {
+	label := MakeString(text)
+	_OpenTextElement(label, config)
+}
+
+// Stores `config` inside Clay and returns the pointer Clay hands back, ready to be passed to
+// `Text` / `TextDynamic`.
+TextConfig :: proc(config: TextElementConfig) -> ^TextElementConfig {
+	stored := _StoreTextElementConfig(config)
+	return stored
+}
+
+// Padding with the same value on all four sides.
+PaddingAll :: proc(allPadding: u16) -> Padding {
+	return Padding{allPadding, allPadding, allPadding, allPadding}
+}
+
+// Border of `width` on all four outer sides, with nothing drawn between children.
+BorderOutside :: proc(width: u16) -> BorderWidth {
+	return BorderWidth{left = width, right = width, top = width, bottom = width, betweenChildren = 0}
+}
+
+// Border of `width` on all four outer sides and between children.
+BorderAll :: proc(width: u16) -> BorderWidth {
+	return BorderWidth{left = width, right = width, top = width, bottom = width, betweenChildren = width}
+}
+
+// The same radius on all four corners.
+CornerRadiusAll :: proc(radius: f32) -> CornerRadius {
+	return {topLeft = radius, topRight = radius, bottomLeft = radius, bottomRight = radius}
+}
+
+// Axis of type `.Fit`, constrained by `sizeMinMax`.
+SizingFit :: proc(sizeMinMax: SizingConstraintsMinMax) -> SizingAxis {
+	axis: SizingAxis
+	axis.type = .Fit
+	axis.constraints.sizeMinMax = sizeMinMax
+	return axis
+}
+
+// Axis of type `.Grow`, constrained by `sizeMinMax`.
+SizingGrow :: proc(sizeMinMax: SizingConstraintsMinMax) -> SizingAxis {
+	axis: SizingAxis
+	axis.type = .Grow
+	axis.constraints.sizeMinMax = sizeMinMax
+	return axis
+}
+
+// Axis of type `.Fixed`: both the minimum and the maximum are set to `size`.
+SizingFixed :: proc(size: c.float) -> SizingAxis {
+	axis: SizingAxis
+	axis.type = .Fixed
+	axis.constraints.sizeMinMax = SizingConstraintsMinMax{min = size, max = size}
+	return axis
+}
+
+// Axis of type `.Percent`, storing `sizePercent` in the constraints union.
+SizingPercent :: proc(sizePercent: c.float) -> SizingAxis {
+	axis: SizingAxis
+	axis.type = .Percent
+	axis.constraints.sizePercent = sizePercent
+	return axis
+}
+
+// Wraps an Odin string as a Clay `String` without copying: the result points at `label`'s
+// bytes, so `label` must stay alive for as long as Clay uses it.
+// `isStaticallyAllocated` is left false.
+MakeString :: proc(label: string) -> String {
+	wrapped: String
+	wrapped.chars = raw_data(label)
+	wrapped.length = cast(c.int)len(label)
+	return wrapped
+}
+
+// Element id hashed from `label` and `index`, using a seed of 0.
+ID :: proc(label: string, index: u32 = 0) -> ElementId {
+	key := MakeString(label)
+	return _HashString(key, index, 0)
+}
+
+// Element id hashed from `label` and `index`, seeded with the id of the current parent
+// element, so the same label yields different ids under different parents.
+ID_LOCAL :: proc(label: string, index: u32 = 0) -> ElementId {
+	key := MakeString(label)
+	parent_seed := _GetParentElementId()
+	return _HashString(key, index, parent_seed)
+}
@@ -0,0 +1,6 @@
+{
+	"$schema": "https://raw.githubusercontent.com/DanielGavin/ols/master/misc/odinfmt.schema.json",
+	"character_width": 180,
+	"sort_imports": true,
+	"tabs": false
+}