mirror of
https://github.com/rust-windowing/winit.git
synced 2026-06-26 14:49:07 -04:00
macOS: make Ime::Preedit cursor range surrogate-safe in setMarkedText
`setMarkedText:selectedRange:replacementRange:` converted the IME's UTF-16 `selectedRange` into UTF-8 byte offsets by taking `substringToIndex:` prefixes and measuring them with `NSString::len()` (`lengthOfBytesUsingEncoding:NSUTF8StringEncoding`). When an index falls inside a surrogate pair, the prefix ends in a lone high surrogate, which UTF-8 cannot represent, so `lengthOfBytesUsingEncoding:` returns 0 for the entire prefix and the offset silently collapses to 0.
This commit is contained in:
@@ -273,27 +273,26 @@ define_class!(
|
|||||||
self.ivars().ime_state.set(ImeState::Ground);
|
self.ivars().ime_state.set(ImeState::Ground);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let string = string.to_string();
|
||||||
let cursor_range = if string.is_empty() {
|
let cursor_range = if string.is_empty() {
|
||||||
// An empty string basically means that there's no preedit, so indicate that by
|
// An empty string basically means that there's no preedit, so indicate that by
|
||||||
// sending a `None` cursor range.
|
// sending a `None` cursor range.
|
||||||
None
|
None
|
||||||
} else {
|
} else {
|
||||||
// Clamp to string length to avoid NSRangeException from out-of-bounds
|
// Convert the selected range from UTF-16 code unit indices to UTF-8 byte
|
||||||
// indices sent by macOS IME (e.g. native Pinyin, see
|
// offsets. `utf16_to_utf8_offset` is defensive: it snaps an offset that would
|
||||||
// https://github.com/alacritty/alacritty/issues/8791).
|
// split a surrogate pair down to the character boundary and clamps an
|
||||||
let len = string.length();
|
// out-of-bounds offset to the string length, so no `NSRangeException` is
|
||||||
let location = selected_range.location.min(len);
|
// possible and the resulting range can never be inverted (`lower <= upper`).
|
||||||
let end = selected_range.end().min(len);
|
// IMEs are known to send both mid-surrogate and out-of-bounds offsets (e.g.
|
||||||
// Convert the selected range from UTF-16 indices to UTF-8 indices.
|
// native Pinyin, see https://github.com/alacritty/alacritty/issues/8791).
|
||||||
let sub_string_a = string.substringToIndex(location);
|
let lowerbound_utf8 = utf16_to_utf8_offset(&string, selected_range.location);
|
||||||
let sub_string_b = string.substringToIndex(end);
|
let upperbound_utf8 = utf16_to_utf8_offset(&string, selected_range.end());
|
||||||
let lowerbound_utf8 = sub_string_a.len();
|
|
||||||
let upperbound_utf8 = sub_string_b.len();
|
|
||||||
Some((lowerbound_utf8, upperbound_utf8))
|
Some((lowerbound_utf8, upperbound_utf8))
|
||||||
};
|
};
|
||||||
|
|
||||||
// Send WindowEvent for updating marked text
|
// Send WindowEvent for updating marked text
|
||||||
self.queue_event(WindowEvent::Ime(Ime::Preedit(string.to_string(), cursor_range)));
|
self.queue_event(WindowEvent::Ime(Ime::Preedit(string, cursor_range)));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[unsafe(method(unmarkText))]
|
#[unsafe(method(unmarkText))]
|
||||||
@@ -1170,3 +1169,92 @@ fn replace_event(event: &NSEvent, option_as_alt: OptionAsAlt) -> Retained<NSEven
|
|||||||
event.copy()
|
event.copy()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Map a UTF-16 code unit offset into `s` onto the matching UTF-8 byte offset.
///
/// The input is treated as untrusted, because IMEs do not always send well-formed offsets
/// (e.g. native Pinyin sends out-of-bounds indices, see
/// <https://github.com/alacritty/alacritty/issues/8791>):
///
/// - an offset that lands inside a surrogate pair is rounded down to the byte offset at
///   which that character starts;
/// - an offset past the end of the string yields `s.len()`.
///
/// The result always lies on a `char` boundary of `s` and never decreases as `utf16_offset`
/// grows, so converting the `location` and `end` of an `NSRange` (`location <= end`) cannot
/// produce an inverted byte range.
fn utf16_to_utf8_offset(s: &str, utf16_offset: usize) -> usize {
    // Running count of UTF-16 code units covered by the characters visited so far,
    // including the one currently being inspected.
    let mut units_covered = 0usize;

    s.char_indices()
        // The first character whose UTF-16 encoding extends past the target either starts
        // exactly at the target or contains it (a split surrogate pair). Both cases resolve
        // to that character's starting byte.
        .find(|(_, ch)| {
            units_covered += ch.len_utf16();
            units_covered > utf16_offset
        })
        // No character extends past the target: it sits at or beyond the end of the string.
        .map_or(s.len(), |(byte_start, _)| byte_start)
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// "😀a": the emoji takes UTF-16 units 0..2 and UTF-8 bytes 0..4; the string is 5 bytes.
    const EMOJI_A: &str = "\u{1F600}a";

    /// "a😀b": the emoji takes UTF-16 units 1..3 and UTF-8 bytes 1..5; the string is 6 bytes.
    const A_EMOJI_B: &str = "a\u{1F600}b";

    /// Convert both ends of a `selectedRange {loc, len}` from UTF-16 to UTF-8, the same way
    /// the emitted `Ime::Preedit` cursor range is built from the location and end offsets.
    fn convert(s: &str, loc: usize, len: usize) -> (usize, usize) {
        let lower = utf16_to_utf8_offset(s, loc);
        let upper = utf16_to_utf8_offset(s, loc + len);
        (lower, upper)
    }

    #[test]
    fn mid_surrogate_offset_snaps_down() {
        // UTF-16 offset 1 points between the two halves of the emoji's surrogate pair.
        assert_eq!(utf16_to_utf8_offset(EMOJI_A, 1), 0);
        // UTF-16 offset 2 is the well-formed boundary right after the pair.
        assert_eq!(utf16_to_utf8_offset(EMOJI_A, 2), 4);
    }

    #[test]
    fn no_longer_inverted() {
        // selectedRange {1,1} ends mid-pair. The old conversion emitted (1, 0), an inverted
        // range that could make a slice panic; snapping keeps lower <= upper.
        assert_eq!(convert(A_EMOJI_B, 1, 1), (1, 1));
    }

    #[test]
    fn prefix_preserved_on_mid_pair_collapse() {
        // selectedRange {2,0} sits mid-pair. The old conversion collapsed it to (0, 0) and
        // lost the valid "a" prefix; it now lands on the boundary right after "a".
        assert_eq!(convert(A_EMOJI_B, 2, 0), (1, 1));
    }

    #[test]
    fn out_of_bounds_clamps_to_len() {
        // Replaces the `.min(len)` clamp from #4494: an offset past the end resolves to the
        // string length rather than raising an NSRangeException.
        assert_eq!(convert(EMOJI_A, 99, 0), (5, 5));
    }

    #[test]
    fn well_formed_inputs_are_identity() {
        // Offsets already on character boundaries must convert exactly.
        assert_eq!(convert(A_EMOJI_B, 3, 0), (5, 5));
        assert_eq!(convert(A_EMOJI_B, 4, 0), (6, 6));
        // Japanese BMP text: 1 UTF-16 unit and 3 UTF-8 bytes per character.
        assert_eq!(convert("\u{3053}\u{3093}", 1, 1), (3, 6));
    }

    #[test]
    fn monotone_non_decreasing() {
        // Walk every UTF-16 offset, including ones past the end, across a string that mixes
        // BMP and non-BMP characters. The converted offset must never move backwards (this is
        // what guarantees `lower <= upper` for any `NSRange`) and must never exceed the length.
        let s = "a\u{1F600}b\u{3053}\u{1F4A9}c";
        let mut prev = 0;
        for (off, cur) in (0..=20).map(|off| (off, utf16_to_utf8_offset(s, off))) {
            assert!(cur >= prev, "non-monotone at offset {off}: {cur} < {prev}");
            assert!(cur <= s.len(), "offset {off} mapped past end: {cur} > {}", s.len());
            prev = cur;
        }
    }
}
|
||||||
|
|||||||
@@ -66,3 +66,4 @@ changelog entry.
|
|||||||
- On Wayland, switch from using the `ahash` hashing algorithm to `foldhash`.
|
- On Wayland, switch from using the `ahash` hashing algorithm to `foldhash`.
|
||||||
- On macOS, fix borderless game presentation options not sticking after switching spaces.
|
- On macOS, fix borderless game presentation options not sticking after switching spaces.
|
||||||
- On macOS, fix IME being locked on (regardless of requests to disable) after being enabled once.
|
- On macOS, fix IME being locked on (regardless of requests to disable) after being enabled once.
|
||||||
|
- On macOS, fix a panic and incorrect cursor position in `Ime::Preedit` when the preedit string contains characters outside the Basic Multilingual Plane (e.g. emoji), caused by incorrect UTF-16 to UTF-8 offset conversion.
|
||||||
|
|||||||
Reference in New Issue
Block a user