From b25139074e5e53799714d564d15d647f08a4dfb3 Mon Sep 17 00:00:00 2001 From: lucasmerlin Date: Fri, 22 May 2026 11:37:53 +0200 Subject: [PATCH] Speed up egui-wgpu screenshot readback The map_async callback fires on the render thread inside Queue::submit, so its CPU work (pixel swizzle + buffer copy) blocks the next frame's GPU dispatch. Move the swizzle/send to a dedicated worker thread so submit returns immediately. On wasm (no real threads, !Send Buffer) keep the work inline. While there, replace the per-byte runtime-indexed swizzle loop with chunks_exact and constant indices so LLVM can autovectorize, and add a fast path for Rgba8Unorm that just bytemuck::cast_slice's each row straight into the Vec. --- crates/egui-wgpu/src/capture.rs | 108 +++++++++++++++++++++++--------- 1 file changed, 78 insertions(+), 30 deletions(-) diff --git a/crates/egui-wgpu/src/capture.rs b/crates/egui-wgpu/src/capture.rs index 7eb1bd3ca..69db2a0c6 100644 --- a/crates/egui-wgpu/src/capture.rs +++ b/crates/egui-wgpu/src/capture.rs @@ -194,9 +194,12 @@ impl CaptureState { let format = self.texture.format(); let tex_extent = self.texture.size(); let padding = self.padding; - let to_rgba = match format { - wgpu::TextureFormat::Rgba8Unorm => [0, 1, 2, 3], - wgpu::TextureFormat::Bgra8Unorm => [2, 1, 0, 3], + // Dispatch on format once outside the hot loop. Using constant indices for the swizzle + // (instead of a runtime `[usize; 4]` table) lets LLVM auto-vectorize the per-pixel + // path, and the Rgba8 case skips the swizzle entirely via a row-wise bytemuck copy. + let is_bgra = match format { + wgpu::TextureFormat::Rgba8Unorm => false, + wgpu::TextureFormat::Bgra8Unorm => true, _ => { log::error!( "Screen can't be captured unless the surface format is Rgba8Unorm or Bgra8Unorm. Current surface format is {format:?}" @@ -209,39 +212,84 @@ impl CaptureState { log::error!("Failed to map buffer for reading: {err}"); return; } - let buffer_slice = buffer.slice(..); - - let mut pixels = Vec::with_capacity((tex_extent.width * tex_extent.height) as usize); - for padded_row in buffer_slice - .get_mapped_range() - .chunks(padding.padded_bytes_per_row as usize) + // The `map_async` callback is fired by wgpu on the render thread inside + // `Queue::submit`. The pixel swizzle on a full-screen buffer is CPU-bound and + // would block the next frame's GPU dispatch. Off-load to a worker thread so + // submit returns immediately. On wasm we can't move `Arc` across + // threads (and there are no real threads anyway), so we keep the work inline. + #[cfg(not(target_arch = "wasm32"))] { - let row = &padded_row[..padding.unpadded_bytes_per_row as usize]; - for color in row.chunks(4) { - pixels.push(epaint::Color32::from_rgba_premultiplied( - color[to_rgba[0]], - color[to_rgba[1]], - color[to_rgba[2]], - color[to_rgba[3]], - )); - } + let _ = std::thread::Builder::new() + .name("egui_wgpu_capture_decode".into()) + .spawn(move || { + decode_and_send( + buffer, padding, tex_extent, is_bgra, ctx, tx, data, viewport_id, + ); + }); } - buffer.unmap(); - - tx.send(( - viewport_id, - data, - ColorImage::new( - [tex_extent.width as usize, tex_extent.height as usize], - pixels, - ), - )) - .ok(); - ctx.request_repaint(); + #[cfg(target_arch = "wasm32")] + decode_and_send( + buffer, padding, tex_extent, is_bgra, ctx, tx, data, viewport_id, + ); }); } } +/// Map the captured buffer, swizzle BGRA → RGBA if needed, ship the resulting +/// [`ColorImage`] through `tx`, and wake the UI thread. Called either inline (wasm) or on +/// a dedicated worker thread (native) — never on the render thread, so the per-pixel work +/// can't backpressure GPU submission. +fn decode_and_send( + buffer: Arc, + padding: BufferPadding, + tex_extent: wgpu::Extent3d, + is_bgra: bool, + ctx: egui::Context, + tx: CaptureSender, + data: Vec, + viewport_id: ViewportId, +) { + let buffer_slice = buffer.slice(..); + let total_pixels = (tex_extent.width * tex_extent.height) as usize; + let mut pixels: Vec = Vec::with_capacity(total_pixels); + let padded_bpr = padding.padded_bytes_per_row as usize; + let unpadded_bpr = padding.unpadded_bytes_per_row as usize; + let mapped = buffer_slice.get_mapped_range(); + if is_bgra { + // Byte-by-byte swizzle BGRA → RGBA. `chunks_exact` is faster than `chunks` + // (no remainder/Option dance) and literal indices vectorize well. + for padded_row in mapped.chunks_exact(padded_bpr) { + let row = &padded_row[..unpadded_bpr]; + for px in row.chunks_exact(4) { + pixels.push(epaint::Color32::from_rgba_premultiplied( + px[2], px[1], px[0], px[3], + )); + } + } + } else { + // RGBA already → pure memcpy per row into the Vec via bytemuck. + // `Color32` is `#[repr(C)] [u8; 4]` with a `bytemuck::Pod` impl, so this is a + // straight reinterpretation. + for padded_row in mapped.chunks_exact(padded_bpr) { + let row = &padded_row[..unpadded_bpr]; + pixels.extend_from_slice(bytemuck::cast_slice::(row)); + } + } + drop(mapped); + buffer.unmap(); + + tx.send(( + viewport_id, + data, + ColorImage::new( + [tex_extent.width as usize, tex_extent.height as usize], + pixels, + ), + )) + .ok(); + ctx.request_repaint(); +} + #[derive(Copy, Clone)] struct BufferPadding { unpadded_bytes_per_row: u32,