1
0
mirror of https://github.com/emilk/egui.git synced 2026-06-26 14:49:06 -04:00

Speed up egui-wgpu screenshot readback

The map_async callback fires on the render thread inside Queue::submit,
so its CPU work (pixel swizzle + buffer copy) blocks the next frame's
GPU dispatch. Move the swizzle/send to a dedicated worker thread so
submit returns immediately. On wasm (no real threads, !Send Buffer) keep
the work inline.

While there, replace the per-byte runtime-indexed swizzle loop with
chunks_exact and constant indices so LLVM can autovectorize, and add a
fast path for Rgba8Unorm that just bytemuck::cast_slice's each row
straight into the Vec<Color32>.
This commit is contained in:
lucasmerlin
2026-05-22 11:37:53 +02:00
parent 82aaef3530
commit b25139074e

View File

@@ -194,9 +194,12 @@ impl CaptureState {
let format = self.texture.format();
let tex_extent = self.texture.size();
let padding = self.padding;
let to_rgba = match format {
wgpu::TextureFormat::Rgba8Unorm => [0, 1, 2, 3],
wgpu::TextureFormat::Bgra8Unorm => [2, 1, 0, 3],
// Dispatch on format once outside the hot loop. Using constant indices for the swizzle
// (instead of a runtime `[usize; 4]` table) lets LLVM auto-vectorize the per-pixel
// path, and the Rgba8 case skips the swizzle entirely via a row-wise bytemuck copy.
let is_bgra = match format {
wgpu::TextureFormat::Rgba8Unorm => false,
wgpu::TextureFormat::Bgra8Unorm => true,
_ => {
log::error!(
"Screen can't be captured unless the surface format is Rgba8Unorm or Bgra8Unorm. Current surface format is {format:?}"
@@ -209,23 +212,70 @@ impl CaptureState {
log::error!("Failed to map buffer for reading: {err}");
return;
}
let buffer_slice = buffer.slice(..);
let mut pixels = Vec::with_capacity((tex_extent.width * tex_extent.height) as usize);
for padded_row in buffer_slice
.get_mapped_range()
.chunks(padding.padded_bytes_per_row as usize)
// The `map_async` callback is fired by wgpu on the render thread inside
// `Queue::submit`. The pixel swizzle on a full-screen buffer is CPU-bound and
// would block the next frame's GPU dispatch. Off-load to a worker thread so
// submit returns immediately. On wasm we can't move `Arc<Buffer>` across
// threads (and there are no real threads anyway), so we keep the work inline.
#[cfg(not(target_arch = "wasm32"))]
{
let row = &padded_row[..padding.unpadded_bytes_per_row as usize];
for color in row.chunks(4) {
let _ = std::thread::Builder::new()
.name("egui_wgpu_capture_decode".into())
.spawn(move || {
decode_and_send(
buffer, padding, tex_extent, is_bgra, ctx, tx, data, viewport_id,
);
});
}
#[cfg(target_arch = "wasm32")]
decode_and_send(
buffer, padding, tex_extent, is_bgra, ctx, tx, data, viewport_id,
);
});
}
}
/// Map the captured buffer, swizzle BGRA → RGBA if needed, ship the resulting
/// [`ColorImage`] through `tx`, and wake the UI thread. Called either inline (wasm) or on
/// a dedicated worker thread (native) — never on the render thread, so the per-pixel work
/// can't backpressure GPU submission.
fn decode_and_send(
buffer: Arc<wgpu::Buffer>,
padding: BufferPadding,
tex_extent: wgpu::Extent3d,
is_bgra: bool,
ctx: egui::Context,
tx: CaptureSender,
data: Vec<UserData>,
viewport_id: ViewportId,
) {
let buffer_slice = buffer.slice(..);
let total_pixels = (tex_extent.width * tex_extent.height) as usize;
let mut pixels: Vec<epaint::Color32> = Vec::with_capacity(total_pixels);
let padded_bpr = padding.padded_bytes_per_row as usize;
let unpadded_bpr = padding.unpadded_bytes_per_row as usize;
let mapped = buffer_slice.get_mapped_range();
if is_bgra {
// Byte-by-byte swizzle BGRA → RGBA. `chunks_exact` is faster than `chunks`
// (no remainder/Option dance) and literal indices vectorize well.
for padded_row in mapped.chunks_exact(padded_bpr) {
let row = &padded_row[..unpadded_bpr];
for px in row.chunks_exact(4) {
pixels.push(epaint::Color32::from_rgba_premultiplied(
color[to_rgba[0]],
color[to_rgba[1]],
color[to_rgba[2]],
color[to_rgba[3]],
px[2], px[1], px[0], px[3],
));
}
}
} else {
// RGBA already → pure memcpy per row into the Vec<Color32> via bytemuck.
// `Color32` is `#[repr(C)] [u8; 4]` with a `bytemuck::Pod` impl, so this is a
// straight reinterpretation.
for padded_row in mapped.chunks_exact(padded_bpr) {
let row = &padded_row[..unpadded_bpr];
pixels.extend_from_slice(bytemuck::cast_slice::<u8, epaint::Color32>(row));
}
}
drop(mapped);
buffer.unmap();
tx.send((
@@ -238,8 +288,6 @@ impl CaptureState {
))
.ok();
ctx.request_repaint();
});
}
}
#[derive(Copy, Clone)]