mirror of
https://github.com/rust-windowing/winit.git
synced 2026-06-26 14:49:07 -04:00
macOS: make Ime::Preedit cursor range surrogate-safe in setMarkedText
`setMarkedText:selectedRange:replacementRange:` converted the IME's UTF-16 `selectedRange` into UTF-8 byte offsets by taking `substringToIndex:` prefixes and measuring them with `NSString::len()` (`lengthOfBytesUsingEncoding:NSUTF8StringEncoding`). When an index falls inside a surrogate pair, the prefix ends in a lone high surrogate, which UTF-8 cannot represent, so `lengthOfBytesUsingEncoding:` returns 0 for the entire prefix and the offset silently collapses to 0.
This commit is contained in:
@@ -273,27 +273,26 @@ define_class!(
|
||||
self.ivars().ime_state.set(ImeState::Ground);
|
||||
}
|
||||
|
||||
let string = string.to_string();
|
||||
let cursor_range = if string.is_empty() {
|
||||
// An empty string basically means that there's no preedit, so indicate that by
|
||||
// sending a `None` cursor range.
|
||||
None
|
||||
} else {
|
||||
// Clamp to string length to avoid NSRangeException from out-of-bounds
|
||||
// indices sent by macOS IME (e.g. native Pinyin, see
|
||||
// https://github.com/alacritty/alacritty/issues/8791).
|
||||
let len = string.length();
|
||||
let location = selected_range.location.min(len);
|
||||
let end = selected_range.end().min(len);
|
||||
// Convert the selected range from UTF-16 indices to UTF-8 indices.
|
||||
let sub_string_a = string.substringToIndex(location);
|
||||
let sub_string_b = string.substringToIndex(end);
|
||||
let lowerbound_utf8 = sub_string_a.len();
|
||||
let upperbound_utf8 = sub_string_b.len();
|
||||
// Convert the selected range from UTF-16 code unit indices to UTF-8 byte
|
||||
// offsets. `utf16_to_utf8_offset` is defensive: it snaps an offset that would
|
||||
// split a surrogate pair down to the character boundary and clamps an
|
||||
// out-of-bounds offset to the string length, so no `NSRangeException` is
|
||||
// possible and the resulting range can never be inverted (`lower <= upper`).
|
||||
// IMEs are known to send both mid-surrogate and out-of-bounds offsets (e.g.
|
||||
// native Pinyin, see https://github.com/alacritty/alacritty/issues/8791).
|
||||
let lowerbound_utf8 = utf16_to_utf8_offset(&string, selected_range.location);
|
||||
let upperbound_utf8 = utf16_to_utf8_offset(&string, selected_range.end());
|
||||
Some((lowerbound_utf8, upperbound_utf8))
|
||||
};
|
||||
|
||||
// Send WindowEvent for updating marked text
|
||||
self.queue_event(WindowEvent::Ime(Ime::Preedit(string.to_string(), cursor_range)));
|
||||
self.queue_event(WindowEvent::Ime(Ime::Preedit(string, cursor_range)));
|
||||
}
|
||||
|
||||
#[unsafe(method(unmarkText))]
|
||||
@@ -1170,3 +1169,92 @@ fn replace_event(event: &NSEvent, option_as_alt: OptionAsAlt) -> Retained<NSEven
|
||||
event.copy()
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert a UTF-16 code unit offset into the corresponding UTF-8 byte offset within `s`.
|
||||
///
|
||||
/// IMEs are not required to send well-formed offsets, so this is defensive: an offset that
|
||||
/// would split a surrogate pair is snapped down to the start of that character, and an
|
||||
/// out-of-bounds offset is clamped to the end of the string (e.g. native Pinyin sends
|
||||
/// out-of-bounds indices, see <https://github.com/alacritty/alacritty/issues/8791>).
|
||||
///
|
||||
/// The mapping is monotone non-decreasing, so applying it to the location and end of an
|
||||
/// `NSRange` (where `location <= end`) can never produce an inverted byte range.
|
||||
fn utf16_to_utf8_offset(s: &str, utf16_offset: usize) -> usize {
|
||||
let mut utf16_pos = 0;
|
||||
for (utf8_pos, ch) in s.char_indices() {
|
||||
if utf16_pos >= utf16_offset {
|
||||
return utf8_pos;
|
||||
}
|
||||
utf16_pos += ch.len_utf16();
|
||||
// The target offset lands strictly inside this character's UTF-16 representation,
|
||||
// i.e. it splits a surrogate pair: snap down to the character boundary.
|
||||
if utf16_pos > utf16_offset {
|
||||
return utf8_pos;
|
||||
}
|
||||
}
|
||||
s.len()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
/// Apply the UTF-16 -> UTF-8 conversion to both ends of a `selectedRange {loc, len}`,
|
||||
/// mirroring what `set_marked_text` does for the emitted `Ime::Preedit` cursor range.
|
||||
fn convert(s: &str, loc: usize, len: usize) -> (usize, usize) {
|
||||
(utf16_to_utf8_offset(s, loc), utf16_to_utf8_offset(s, loc + len))
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn mid_surrogate_offset_snaps_down() {
|
||||
// "😀a": 😀 is one char = 2 UTF-16 units = 4 UTF-8 bytes; offset 1 is mid-pair.
|
||||
assert_eq!(utf16_to_utf8_offset("\u{1F600}a", 1), 0);
|
||||
// Offset 2 is the boundary just after the pair.
|
||||
assert_eq!(utf16_to_utf8_offset("\u{1F600}a", 2), 4);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn no_longer_inverted() {
|
||||
// "a😀b" with selectedRange {1,1}: previously emitted (1, 0) -- lower > upper, a
|
||||
// slice-panic vector. The boundary-snapping conversion keeps lower <= upper.
|
||||
assert_eq!(convert("a\u{1F600}b", 1, 1), (1, 1));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn prefix_preserved_on_mid_pair_collapse() {
|
||||
// "a😀b" with selectedRange {2,0}: previously collapsed to (0, 0), discarding the
|
||||
// valid "a" prefix; now snaps to the char boundary after "a".
|
||||
assert_eq!(convert("a\u{1F600}b", 2, 0), (1, 1));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn out_of_bounds_clamps_to_len() {
|
||||
// Subsumes the #4494 `.min(len)` clamp: an out-of-bounds offset maps to the string
|
||||
// length instead of triggering an NSRangeException.
|
||||
assert_eq!(convert("\u{1F600}a", 99, 0), (5, 5));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn well_formed_inputs_are_identity() {
|
||||
// The common case (well-formed boundary indices) must be byte-for-byte unchanged.
|
||||
assert_eq!(convert("a\u{1F600}b", 3, 0), (5, 5));
|
||||
assert_eq!(convert("a\u{1F600}b", 4, 0), (6, 6));
|
||||
// BMP multi-byte (Japanese): each char is 1 UTF-16 unit and 3 UTF-8 bytes.
|
||||
assert_eq!(convert("\u{3053}\u{3093}", 1, 1), (3, 6));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn monotone_non_decreasing() {
|
||||
// Sweep every UTF-16 offset (including out-of-bounds) over a string mixing BMP and
|
||||
// non-BMP characters and assert the conversion never goes backwards, which is what
|
||||
// guarantees `lower <= upper` for any `NSRange`.
|
||||
let s = "a\u{1F600}b\u{3053}\u{1F4A9}c";
|
||||
let mut prev = 0;
|
||||
for off in 0..=20 {
|
||||
let cur = utf16_to_utf8_offset(s, off);
|
||||
assert!(cur >= prev, "non-monotone at offset {off}: {cur} < {prev}");
|
||||
assert!(cur <= s.len(), "offset {off} mapped past end: {cur} > {}", s.len());
|
||||
prev = cur;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -66,3 +66,4 @@ changelog entry.
|
||||
- On Wayland, switch from using the `ahash` hashing algorithm to `foldhash`.
|
||||
- On macOS, fix borderless game presentation options not sticking after switching spaces.
|
||||
- On macOS, fix IME being locked on (regardless of requests to disable) after being enabled once.
|
||||
- On macOS, fix a panic and incorrect cursor position in Ime::Preedit when the preedit string contains special characters (ie. emojis) caused by incorrect UTF-16 to UTF-8 offset conversion.
|
||||
|
||||
Reference in New Issue
Block a user