diff --git a/winit-appkit/src/view.rs b/winit-appkit/src/view.rs index 818e46027..396f8d1af 100644 --- a/winit-appkit/src/view.rs +++ b/winit-appkit/src/view.rs @@ -273,27 +273,26 @@ define_class!( self.ivars().ime_state.set(ImeState::Ground); } + let string = string.to_string(); let cursor_range = if string.is_empty() { // An empty string basically means that there's no preedit, so indicate that by // sending a `None` cursor range. None } else { - // Clamp to string length to avoid NSRangeException from out-of-bounds - // indices sent by macOS IME (e.g. native Pinyin, see - // https://github.com/alacritty/alacritty/issues/8791). - let len = string.length(); - let location = selected_range.location.min(len); - let end = selected_range.end().min(len); - // Convert the selected range from UTF-16 indices to UTF-8 indices. - let sub_string_a = string.substringToIndex(location); - let sub_string_b = string.substringToIndex(end); - let lowerbound_utf8 = sub_string_a.len(); - let upperbound_utf8 = sub_string_b.len(); + // Convert the selected range from UTF-16 code unit indices to UTF-8 byte + // offsets. `utf16_to_utf8_offset` is defensive: it snaps an offset that would + // split a surrogate pair down to the character boundary and clamps an + // out-of-bounds offset to the string length, so no `NSRangeException` is + // possible and the resulting range can never be inverted (`lower <= upper`). + // IMEs are known to send both mid-surrogate and out-of-bounds offsets (e.g. + // native Pinyin, see https://github.com/alacritty/alacritty/issues/8791). + let lowerbound_utf8 = utf16_to_utf8_offset(&string, selected_range.location); + let upperbound_utf8 = utf16_to_utf8_offset(&string, selected_range.end()); Some((lowerbound_utf8, upperbound_utf8)) }; // Send WindowEvent for updating marked text - self.queue_event(WindowEvent::Ime(Ime::Preedit(string.to_string(), cursor_range))); + self.queue_event(WindowEvent::Ime(Ime::Preedit(string, cursor_range))); } #[unsafe(method(unmarkText))] @@ -1170,3 +1169,92 @@ fn replace_event(event: &NSEvent, option_as_alt: OptionAsAlt) -> Retained). +/// +/// The mapping is monotone non-decreasing, so applying it to the location and end of an +/// `NSRange` (where `location <= end`) can never produce an inverted byte range. +fn utf16_to_utf8_offset(s: &str, utf16_offset: usize) -> usize { + let mut utf16_pos = 0; + for (utf8_pos, ch) in s.char_indices() { + if utf16_pos >= utf16_offset { + return utf8_pos; + } + utf16_pos += ch.len_utf16(); + // The target offset lands strictly inside this character's UTF-16 representation, + // i.e. it splits a surrogate pair: snap down to the character boundary. + if utf16_pos > utf16_offset { + return utf8_pos; + } + } + s.len() +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Apply the UTF-16 -> UTF-8 conversion to both ends of a `selectedRange {loc, len}`, + /// mirroring what `set_marked_text` does for the emitted `Ime::Preedit` cursor range. + fn convert(s: &str, loc: usize, len: usize) -> (usize, usize) { + (utf16_to_utf8_offset(s, loc), utf16_to_utf8_offset(s, loc + len)) + } + + #[test] + fn mid_surrogate_offset_snaps_down() { + // "😀a": 😀 is one char = 2 UTF-16 units = 4 UTF-8 bytes; offset 1 is mid-pair. + assert_eq!(utf16_to_utf8_offset("\u{1F600}a", 1), 0); + // Offset 2 is the boundary just after the pair. + assert_eq!(utf16_to_utf8_offset("\u{1F600}a", 2), 4); + } + + #[test] + fn no_longer_inverted() { + // "a😀b" with selectedRange {1,1}: previously emitted (1, 0) -- lower > upper, a + // slice-panic vector. The boundary-snapping conversion keeps lower <= upper. + assert_eq!(convert("a\u{1F600}b", 1, 1), (1, 1)); + } + + #[test] + fn prefix_preserved_on_mid_pair_collapse() { + // "a😀b" with selectedRange {2,0}: previously collapsed to (0, 0), discarding the + // valid "a" prefix; now snaps to the char boundary after "a". + assert_eq!(convert("a\u{1F600}b", 2, 0), (1, 1)); + } + + #[test] + fn out_of_bounds_clamps_to_len() { + // Subsumes the #4494 `.min(len)` clamp: an out-of-bounds offset maps to the string + // length instead of triggering an NSRangeException. + assert_eq!(convert("\u{1F600}a", 99, 0), (5, 5)); + } + + #[test] + fn well_formed_inputs_are_identity() { + // The common case (well-formed boundary indices) must be byte-for-byte unchanged. + assert_eq!(convert("a\u{1F600}b", 3, 0), (5, 5)); + assert_eq!(convert("a\u{1F600}b", 4, 0), (6, 6)); + // BMP multi-byte (Japanese): each char is 1 UTF-16 unit and 3 UTF-8 bytes. + assert_eq!(convert("\u{3053}\u{3093}", 1, 1), (3, 6)); + } + + #[test] + fn monotone_non_decreasing() { + // Sweep every UTF-16 offset (including out-of-bounds) over a string mixing BMP and + // non-BMP characters and assert the conversion never goes backwards, which is what + // guarantees `lower <= upper` for any `NSRange`. + let s = "a\u{1F600}b\u{3053}\u{1F4A9}c"; + let mut prev = 0; + for off in 0..=20 { + let cur = utf16_to_utf8_offset(s, off); + assert!(cur >= prev, "non-monotone at offset {off}: {cur} < {prev}"); + assert!(cur <= s.len(), "offset {off} mapped past end: {cur} > {}", s.len()); + prev = cur; + } + } +} diff --git a/winit/src/changelog/unreleased.md b/winit/src/changelog/unreleased.md index e8d5e5f96..1c58b725c 100644 --- a/winit/src/changelog/unreleased.md +++ b/winit/src/changelog/unreleased.md @@ -66,3 +66,4 @@ changelog entry. - On Wayland, switch from using the `ahash` hashing algorithm to `foldhash`. - On macOS, fix borderless game presentation options not sticking after switching spaces. - On macOS, fix IME being locked on (regardless of requests to disable) after being enabled once. +- On macOS, fix a panic and incorrect cursor position in Ime::Preedit when the preedit string contains special characters (ie. emojis) caused by incorrect UTF-16 to UTF-8 offset conversion.