diff --git a/src/grapheme.rs b/src/grapheme.rs index 50d53b3..c006eb9 100644 --- a/src/grapheme.rs +++ b/src/grapheme.rs @@ -8,9 +8,8 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -use core::cmp; - use crate::tables::grapheme::GraphemeCat; +use core::cmp; /// External iterator for grapheme clusters and byte offsets. /// @@ -177,7 +176,11 @@ enum GraphemeState { Regional, /// The codepoint after is Extended_Pictographic, /// so whether it's a boundary depends on pre-context according to GB11. - Emoji, + Emoji { + /// Whether the ZWJ char has been seen already an only a "\p{Extended_Pictographic} Extend*" + /// part of GB11 has to be checked + seen_zwj: bool, + }, } /// Cursor-based segmenter for grapheme clusters. @@ -424,7 +427,7 @@ impl GraphemeCursor { match self.state { GraphemeState::InCbConsonant => self.handle_incb_consonant(chunk, chunk_start), GraphemeState::Regional => self.handle_regional(chunk, chunk_start), - GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start), + GraphemeState::Emoji { seen_zwj } => self.handle_emoji(chunk, chunk_start, seen_zwj), _ => { if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start { let ch = chunk.chars().next_back().unwrap(); @@ -532,13 +535,18 @@ impl GraphemeCursor { } #[inline] - fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) { + fn handle_emoji(&mut self, chunk: &str, chunk_start: usize, mut seen_zwj: bool) { + // \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic} use crate::tables::grapheme as gr; let mut iter = chunk.chars().rev(); - if let Some(ch) = iter.next() { - if self.grapheme_category(ch) != gr::GC_ZWJ { - self.decide(true); - return; + if !seen_zwj { + if let Some(ch) = iter.next() { + if self.grapheme_category(ch) != gr::GC_ZWJ { + self.decide(true); + return; + } else { + seen_zwj = true; + } } } for ch in iter { @@ -558,7 +566,7 @@ impl GraphemeCursor { self.decide(true); } else { self.pre_context_offset = Some(chunk_start); - self.state = GraphemeState::Emoji; + self.state = GraphemeState::Emoji { seen_zwj }; } } @@ -616,7 +624,9 @@ impl GraphemeCursor { match self.cat_after.unwrap() { gr::GC_InCB_Consonant => self.state = GraphemeState::InCbConsonant, gr::GC_Regional_Indicator => self.state = GraphemeState::Regional, - gr::GC_Extended_Pictographic => self.state = GraphemeState::Emoji, + gr::GC_Extended_Pictographic => { + self.state = GraphemeState::Emoji { seen_zwj: false } + } _ => need_pre_context = self.cat_before.is_none(), } if need_pre_context { @@ -647,7 +657,7 @@ impl GraphemeCursor { self.is_boundary_result() } PairResult::Emoji => { - self.handle_emoji(&chunk[..offset_in_chunk], chunk_start); + self.handle_emoji(&chunk[..offset_in_chunk], chunk_start, false); self.is_boundary_result() } } @@ -882,3 +892,105 @@ fn test_grapheme_cursor_prev_boundary_chunk_start() { ); assert_eq!(c.prev_boundary(&s[..2], 0), Ok(Some(1))); } + +#[test] +fn test_grapheme_cursor_boundary_with_zwj_on_chunk_start() { + use GraphemeIncomplete::*; + + let chunk0 = "👩"; // 4 bytes + let chunk1 = "\u{200d}🔬"; // 3 bytes + 4 bytes + + let full_len = chunk0.len() + chunk1.len(); + + let mut cur = GraphemeCursor::new(0, full_len, true); + assert_eq!(cur.next_boundary(chunk0, 0), Err(NextChunk)); + match cur.next_boundary(chunk1, chunk0.len()) { + Ok(res) => assert_eq!(res, Some(11)), + Err(PreContext(_)) => { + cur.provide_context(chunk0, 0); + assert_eq!(cur.next_boundary(chunk1, chunk0.len()), Ok(Some(11))); + } + _ => unreachable!(), + } +} + +#[test] +fn test_grapheme_cursor_emoji_no_zwj() { + use GraphemeIncomplete::*; + let chunk0 = "🍒"; // 4 bytes + let chunk1 = "🥑"; // 4 bytes + let full_len = chunk0.len() + chunk1.len(); + + let mut c = GraphemeCursor::new(0, full_len, true); + assert_eq!(c.next_boundary(chunk0, 0), Err(NextChunk)); + assert_eq!( + c.next_boundary(chunk1, chunk0.len()), + Err(PreContext(chunk0.len())) + ); + c.provide_context(chunk0, 0); + assert_eq!(c.next_boundary(chunk1, chunk0.len()), Ok(Some(4))); + assert_eq!(c.next_boundary(chunk1, chunk0.len()), Ok(Some(8))); + assert_eq!(c.next_boundary(chunk1, chunk0.len()), Ok(None)); +} + +#[test] +fn test_grapheme_cursor_emoji_chunk_boundary_before_zwj() { + use GraphemeIncomplete::*; + let chunk0 = "🍒"; // 4 bytes + let chunk1 = "\u{200d}🥑"; // 3 + 4 bytes + let full_len = chunk0.len() + chunk1.len(); // 11 + + let mut c = GraphemeCursor::new(0, full_len, true); + assert_eq!(c.next_boundary(chunk0, 0), Err(NextChunk)); + assert_eq!( + c.next_boundary(chunk1, chunk0.len()), + Err(PreContext(chunk0.len())) + ); + c.provide_context(chunk0, 0); + assert_eq!(c.next_boundary(chunk1, chunk0.len()), Ok(Some(11))); + assert_eq!(c.next_boundary(chunk1, chunk0.len()), Ok(None)); +} + +#[test] +fn test_grapheme_cursor_emoji_chunk_boundary_after_zwj() { + use GraphemeIncomplete::*; + let chunk0 = "🍒\u{200d}"; // 4 + 3 bytes + let chunk1 = "🥑"; // 4 bytes + let full_len = chunk0.len() + chunk1.len(); // 11 + + let mut c = GraphemeCursor::new(0, full_len, true); + assert_eq!(c.next_boundary(chunk0, 0), Err(NextChunk)); + assert_eq!( + c.next_boundary(chunk1, chunk0.len()), + Err(PreContext(chunk0.len())) + ); + c.provide_context(chunk0, 0); + assert_eq!(c.next_boundary(chunk1, chunk0.len()), Ok(Some(11))); + assert_eq!(c.next_boundary(chunk1, chunk0.len()), Ok(None)); +} + +#[test] +fn test_grapheme_cursor_emoji_zwj_across_chunks() { + use GraphemeIncomplete::*; + let chunk0 = "🍒"; // 4 bytes + let chunk1 = "\u{200d}"; // 3 bytes + let chunk2 = "🥑"; // 4 bytes + let full_len = chunk0.len() + chunk1.len() + chunk2.len(); // 11 + let chunk2_start = chunk0.len() + chunk1.len(); + + let mut c = GraphemeCursor::new(0, full_len, true); + assert_eq!(c.next_boundary(chunk0, 0), Err(NextChunk)); + assert_eq!(c.next_boundary(chunk1, chunk0.len()), Err(NextChunk)); + assert_eq!( + c.next_boundary(chunk2, chunk2_start), + Err(PreContext(chunk2_start)) + ); + c.provide_context(chunk1, chunk0.len()); + assert_eq!( + c.next_boundary(chunk2, chunk2_start), + Err(PreContext(chunk0.len())) + ); + c.provide_context(chunk0, 0); + assert_eq!(c.next_boundary(chunk2, chunk2_start), Ok(Some(11))); + assert_eq!(c.next_boundary(chunk2, chunk2_start), Ok(None)); +}