macOS: make Ime::Preedit cursor range surrogate-safe in setMarkedText

`setMarkedText:selectedRange:replacementRange:` converted the IME's UTF-16
`selectedRange` into UTF-8 byte offsets by taking `substringToIndex:` prefixes
and measuring them with `NSString::len()`
(`lengthOfBytesUsingEncoding:NSUTF8StringEncoding`). When an index falls inside
a surrogate pair, the prefix ends in a lone high surrogate, which UTF-8 cannot
represent, so `lengthOfBytesUsingEncoding:` returns 0 for the entire prefix and
the offset silently collapses to 0.
This commit is contained in:
William-Selna
2026-06-19 18:34:43 +02:00
committed by GitHub
parent 7fe8c206ae
commit 4ef144b519
2 changed files with 101 additions and 12 deletions

View File

@@ -273,27 +273,26 @@ define_class!(
self.ivars().ime_state.set(ImeState::Ground); self.ivars().ime_state.set(ImeState::Ground);
} }
let string = string.to_string();
let cursor_range = if string.is_empty() { let cursor_range = if string.is_empty() {
// An empty string basically means that there's no preedit, so indicate that by // An empty string basically means that there's no preedit, so indicate that by
// sending a `None` cursor range. // sending a `None` cursor range.
None None
} else { } else {
// Clamp to string length to avoid NSRangeException from out-of-bounds // Convert the selected range from UTF-16 code unit indices to UTF-8 byte
// indices sent by macOS IME (e.g. native Pinyin, see // offsets. `utf16_to_utf8_offset` is defensive: it snaps an offset that would
// https://github.com/alacritty/alacritty/issues/8791). // split a surrogate pair down to the character boundary and clamps an
let len = string.length(); // out-of-bounds offset to the string length, so no `NSRangeException` is
let location = selected_range.location.min(len); // possible and the resulting range can never be inverted (`lower <= upper`).
let end = selected_range.end().min(len); // IMEs are known to send both mid-surrogate and out-of-bounds offsets (e.g.
// Convert the selected range from UTF-16 indices to UTF-8 indices. // native Pinyin, see https://github.com/alacritty/alacritty/issues/8791).
let sub_string_a = string.substringToIndex(location); let lowerbound_utf8 = utf16_to_utf8_offset(&string, selected_range.location);
let sub_string_b = string.substringToIndex(end); let upperbound_utf8 = utf16_to_utf8_offset(&string, selected_range.end());
let lowerbound_utf8 = sub_string_a.len();
let upperbound_utf8 = sub_string_b.len();
Some((lowerbound_utf8, upperbound_utf8)) Some((lowerbound_utf8, upperbound_utf8))
}; };
// Send WindowEvent for updating marked text // Send WindowEvent for updating marked text
self.queue_event(WindowEvent::Ime(Ime::Preedit(string.to_string(), cursor_range))); self.queue_event(WindowEvent::Ime(Ime::Preedit(string, cursor_range)));
} }
#[unsafe(method(unmarkText))] #[unsafe(method(unmarkText))]
@@ -1170,3 +1169,92 @@ fn replace_event(event: &NSEvent, option_as_alt: OptionAsAlt) -> Retained<NSEven
event.copy() event.copy()
} }
} }
/// Convert a UTF-16 code unit offset into the corresponding UTF-8 byte offset within `s`.
///
/// IMEs are not required to send well-formed offsets, so this is defensive: an offset that
/// would split a surrogate pair is snapped down to the start of that character, and an
/// out-of-bounds offset is clamped to the end of the string (e.g. native Pinyin sends
/// out-of-bounds indices, see <https://github.com/alacritty/alacritty/issues/8791>).
///
/// The mapping is monotone non-decreasing, so applying it to the location and end of an
/// `NSRange` (where `location <= end`) can never produce an inverted byte range.
fn utf16_to_utf8_offset(s: &str, utf16_offset: usize) -> usize {
let mut utf16_pos = 0;
for (utf8_pos, ch) in s.char_indices() {
if utf16_pos >= utf16_offset {
return utf8_pos;
}
utf16_pos += ch.len_utf16();
// The target offset lands strictly inside this character's UTF-16 representation,
// i.e. it splits a surrogate pair: snap down to the character boundary.
if utf16_pos > utf16_offset {
return utf8_pos;
}
}
s.len()
}
#[cfg(test)]
mod tests {
use super::*;
/// Apply the UTF-16 -> UTF-8 conversion to both ends of a `selectedRange {loc, len}`,
/// mirroring what `set_marked_text` does for the emitted `Ime::Preedit` cursor range.
fn convert(s: &str, loc: usize, len: usize) -> (usize, usize) {
(utf16_to_utf8_offset(s, loc), utf16_to_utf8_offset(s, loc + len))
}
#[test]
fn mid_surrogate_offset_snaps_down() {
// "😀a": 😀 is one char = 2 UTF-16 units = 4 UTF-8 bytes; offset 1 is mid-pair.
assert_eq!(utf16_to_utf8_offset("\u{1F600}a", 1), 0);
// Offset 2 is the boundary just after the pair.
assert_eq!(utf16_to_utf8_offset("\u{1F600}a", 2), 4);
}
#[test]
fn no_longer_inverted() {
// "a😀b" with selectedRange {1,1}: previously emitted (1, 0) -- lower > upper, a
// slice-panic vector. The boundary-snapping conversion keeps lower <= upper.
assert_eq!(convert("a\u{1F600}b", 1, 1), (1, 1));
}
#[test]
fn prefix_preserved_on_mid_pair_collapse() {
// "a😀b" with selectedRange {2,0}: previously collapsed to (0, 0), discarding the
// valid "a" prefix; now snaps to the char boundary after "a".
assert_eq!(convert("a\u{1F600}b", 2, 0), (1, 1));
}
#[test]
fn out_of_bounds_clamps_to_len() {
// Subsumes the #4494 `.min(len)` clamp: an out-of-bounds offset maps to the string
// length instead of triggering an NSRangeException.
assert_eq!(convert("\u{1F600}a", 99, 0), (5, 5));
}
#[test]
fn well_formed_inputs_are_identity() {
// The common case (well-formed boundary indices) must be byte-for-byte unchanged.
assert_eq!(convert("a\u{1F600}b", 3, 0), (5, 5));
assert_eq!(convert("a\u{1F600}b", 4, 0), (6, 6));
// BMP multi-byte (Japanese): each char is 1 UTF-16 unit and 3 UTF-8 bytes.
assert_eq!(convert("\u{3053}\u{3093}", 1, 1), (3, 6));
}
#[test]
fn monotone_non_decreasing() {
// Sweep every UTF-16 offset (including out-of-bounds) over a string mixing BMP and
// non-BMP characters and assert the conversion never goes backwards, which is what
// guarantees `lower <= upper` for any `NSRange`.
let s = "a\u{1F600}b\u{3053}\u{1F4A9}c";
let mut prev = 0;
for off in 0..=20 {
let cur = utf16_to_utf8_offset(s, off);
assert!(cur >= prev, "non-monotone at offset {off}: {cur} < {prev}");
assert!(cur <= s.len(), "offset {off} mapped past end: {cur} > {}", s.len());
prev = cur;
}
}
}

View File

@@ -66,3 +66,4 @@ changelog entry.
- On Wayland, switch from using the `ahash` hashing algorithm to `foldhash`. - On Wayland, switch from using the `ahash` hashing algorithm to `foldhash`.
- On macOS, fix borderless game presentation options not sticking after switching spaces. - On macOS, fix borderless game presentation options not sticking after switching spaces.
- On macOS, fix IME being locked on (regardless of requests to disable) after being enabled once. - On macOS, fix IME being locked on (regardless of requests to disable) after being enabled once.
- On macOS, fix a panic and incorrect cursor position in Ime::Preedit when the preedit string contains special characters (ie. emojis) caused by incorrect UTF-16 to UTF-8 offset conversion.