aboutsummaryrefslogtreecommitdiff
path: root/unicode-segmentation/src/grapheme.rs
diff options
context:
space:
mode:
Diffstat (limited to 'unicode-segmentation/src/grapheme.rs')
-rw-r--r--unicode-segmentation/src/grapheme.rs708
1 files changed, 0 insertions, 708 deletions
diff --git a/unicode-segmentation/src/grapheme.rs b/unicode-segmentation/src/grapheme.rs
deleted file mode 100644
index cde6526..0000000
--- a/unicode-segmentation/src/grapheme.rs
+++ /dev/null
@@ -1,708 +0,0 @@
-// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
-// file at the top-level directory of this distribution and at
-// http://rust-lang.org/COPYRIGHT.
-//
-// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
-// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
-// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
-// option. This file may not be copied, modified, or distributed
-// except according to those terms.
-
-use core::cmp;
-
-use tables::grapheme::GraphemeCat;
-
-/// External iterator for grapheme clusters and byte offsets.
-#[derive(Clone)]
-pub struct GraphemeIndices<'a> {
- start_offset: usize,
- iter: Graphemes<'a>,
-}
-
-impl<'a> GraphemeIndices<'a> {
- #[inline]
- /// View the underlying data (the part yet to be iterated) as a slice of the original string.
- ///
- /// ```rust
- /// # use unicode_segmentation::UnicodeSegmentation;
- /// let mut iter = "abc".grapheme_indices(true);
- /// assert_eq!(iter.as_str(), "abc");
- /// iter.next();
- /// assert_eq!(iter.as_str(), "bc");
- /// iter.next();
- /// iter.next();
- /// assert_eq!(iter.as_str(), "");
- /// ```
- pub fn as_str(&self) -> &'a str {
- self.iter.as_str()
- }
-}
-
-impl<'a> Iterator for GraphemeIndices<'a> {
- type Item = (usize, &'a str);
-
- #[inline]
- fn next(&mut self) -> Option<(usize, &'a str)> {
- self.iter.next().map(|s| (s.as_ptr() as usize - self.start_offset, s))
- }
-
- #[inline]
- fn size_hint(&self) -> (usize, Option<usize>) {
- self.iter.size_hint()
- }
-}
-
-impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
- #[inline]
- fn next_back(&mut self) -> Option<(usize, &'a str)> {
- self.iter.next_back().map(|s| (s.as_ptr() as usize - self.start_offset, s))
- }
-}
-
-/// External iterator for a string's
-/// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
-#[derive(Clone)]
-pub struct Graphemes<'a> {
- string: &'a str,
- cursor: GraphemeCursor,
- cursor_back: GraphemeCursor,
-}
-
-impl<'a> Graphemes<'a> {
- #[inline]
- /// View the underlying data (the part yet to be iterated) as a slice of the original string.
- ///
- /// ```rust
- /// # use unicode_segmentation::UnicodeSegmentation;
- /// let mut iter = "abc".graphemes(true);
- /// assert_eq!(iter.as_str(), "abc");
- /// iter.next();
- /// assert_eq!(iter.as_str(), "bc");
- /// iter.next();
- /// iter.next();
- /// assert_eq!(iter.as_str(), "");
- /// ```
- pub fn as_str(&self) -> &'a str {
- &self.string[self.cursor.cur_cursor()..self.cursor_back.cur_cursor()]
- }
-}
-
-impl<'a> Iterator for Graphemes<'a> {
- type Item = &'a str;
-
- #[inline]
- fn size_hint(&self) -> (usize, Option<usize>) {
- let slen = self.cursor_back.cur_cursor() - self.cursor.cur_cursor();
- (cmp::min(slen, 1), Some(slen))
- }
-
- #[inline]
- fn next(&mut self) -> Option<&'a str> {
- let start = self.cursor.cur_cursor();
- if start == self.cursor_back.cur_cursor() {
- return None;
- }
- let next = self.cursor.next_boundary(self.string, 0).unwrap().unwrap();
- Some(&self.string[start..next])
- }
-}
-
-impl<'a> DoubleEndedIterator for Graphemes<'a> {
- #[inline]
- fn next_back(&mut self) -> Option<&'a str> {
- let end = self.cursor_back.cur_cursor();
- if end == self.cursor.cur_cursor() {
- return None;
- }
- let prev = self.cursor_back.prev_boundary(self.string, 0).unwrap().unwrap();
- Some(&self.string[prev..end])
- }
-}
-
-#[inline]
-pub fn new_graphemes<'b>(s: &'b str, is_extended: bool) -> Graphemes<'b> {
- let len = s.len();
- Graphemes {
- string: s,
- cursor: GraphemeCursor::new(0, len, is_extended),
- cursor_back: GraphemeCursor::new(len, len, is_extended),
- }
-}
-
-#[inline]
-pub fn new_grapheme_indices<'b>(s: &'b str, is_extended: bool) -> GraphemeIndices<'b> {
- GraphemeIndices { start_offset: s.as_ptr() as usize, iter: new_graphemes(s, is_extended) }
-}
-
-// maybe unify with PairResult?
-// An enum describing information about a potential boundary.
-#[derive(PartialEq, Eq, Clone)]
-enum GraphemeState {
- // No information is known.
- Unknown,
- // It is known to not be a boundary.
- NotBreak,
- // It is known to be a boundary.
- Break,
- // The codepoint after is a Regional Indicator Symbol, so a boundary iff
- // it is preceded by an even number of RIS codepoints. (GB12, GB13)
- Regional,
- // The codepoint after is Extended_Pictographic,
- // so whether it's a boundary depends on pre-context according to GB11.
- Emoji,
-}
-
-/// Cursor-based segmenter for grapheme clusters.
-#[derive(Clone)]
-pub struct GraphemeCursor {
- // Current cursor position.
- offset: usize,
- // Total length of the string.
- len: usize,
- // A config flag indicating whether this cursor computes legacy or extended
- // grapheme cluster boundaries (enables GB9a and GB9b if set).
- is_extended: bool,
- // Information about the potential boundary at `offset`
- state: GraphemeState,
- // Category of codepoint immediately preceding cursor, if known.
- cat_before: Option<GraphemeCat>,
- // Category of codepoint immediately after cursor, if known.
- cat_after: Option<GraphemeCat>,
- // If set, at least one more codepoint immediately preceding this offset
- // is needed to resolve whether there's a boundary at `offset`.
- pre_context_offset: Option<usize>,
- // The number of RIS codepoints preceding `offset`. If `pre_context_offset`
- // is set, then counts the number of RIS between that and `offset`, otherwise
- // is an accurate count relative to the string.
- ris_count: Option<usize>,
- // Set if a call to `prev_boundary` or `next_boundary` was suspended due
- // to needing more input.
- resuming: bool,
-}
-
-/// An error return indicating that not enough content was available in the
-/// provided chunk to satisfy the query, and that more content must be provided.
-#[derive(PartialEq, Eq, Debug)]
-pub enum GraphemeIncomplete {
- /// More pre-context is needed. The caller should call `provide_context`
- /// with a chunk ending at the offset given, then retry the query. This
- /// will only be returned if the `chunk_start` parameter is nonzero.
- PreContext(usize),
-
- /// When requesting `prev_boundary`, the cursor is moving past the beginning
- /// of the current chunk, so the chunk before that is requested. This will
- /// only be returned if the `chunk_start` parameter is nonzero.
- PrevChunk,
-
- /// When requesting `next_boundary`, the cursor is moving past the end of the
- /// current chunk, so the chunk after that is requested. This will only be
- /// returned if the chunk ends before the `len` parameter provided on
- /// creation of the cursor.
- NextChunk, // requesting chunk following the one given
-
- /// An error returned when the chunk given does not contain the cursor position.
- InvalidOffset,
-}
-
-// An enum describing the result from lookup of a pair of categories.
-#[derive(PartialEq, Eq)]
-enum PairResult {
- NotBreak, // definitely not a break
- Break, // definitely a break
- Extended, // a break iff not in extended mode
- Regional, // a break if preceded by an even number of RIS
- Emoji, // a break if preceded by emoji base and (Extend)*
-}
-
-fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
- use tables::grapheme::GraphemeCat::*;
- use self::PairResult::*;
- match (before, after) {
- (GC_CR, GC_LF) => NotBreak, // GB3
- (GC_Control, _) => Break, // GB4
- (GC_CR, _) => Break, // GB4
- (GC_LF, _) => Break, // GB4
- (_, GC_Control) => Break, // GB5
- (_, GC_CR) => Break, // GB5
- (_, GC_LF) => Break, // GB5
- (GC_L, GC_L) => NotBreak, // GB6
- (GC_L, GC_V) => NotBreak, // GB6
- (GC_L, GC_LV) => NotBreak, // GB6
- (GC_L, GC_LVT) => NotBreak, // GB6
- (GC_LV, GC_V) => NotBreak, // GB7
- (GC_LV, GC_T) => NotBreak, // GB7
- (GC_V, GC_V) => NotBreak, // GB7
- (GC_V, GC_T) => NotBreak, // GB7
- (GC_LVT, GC_T) => NotBreak, // GB8
- (GC_T, GC_T) => NotBreak, // GB8
- (_, GC_Extend) => NotBreak, // GB9
- (_, GC_ZWJ) => NotBreak, // GB9
- (_, GC_SpacingMark) => Extended, // GB9a
- (GC_Prepend, _) => Extended, // GB9b
- (GC_ZWJ, GC_Extended_Pictographic) => Emoji, // GB11
- (GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13
- (_, _) => Break, // GB999
- }
-}
-
-impl GraphemeCursor {
- /// Create a new cursor. The string and initial offset are given at creation
- /// time, but the contents of the string are not. The `is_extended` parameter
- /// controls whether extended grapheme clusters are selected.
- ///
- /// The `offset` parameter must be on a codepoint boundary.
- ///
- /// ```rust
- /// # use unicode_segmentation::GraphemeCursor;
- /// let s = "हिन्दी";
- /// let mut legacy = GraphemeCursor::new(0, s.len(), false);
- /// assert_eq!(legacy.next_boundary(s, 0), Ok(Some("ह".len())));
- /// let mut extended = GraphemeCursor::new(0, s.len(), true);
- /// assert_eq!(extended.next_boundary(s, 0), Ok(Some("हि".len())));
- /// ```
- pub fn new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor {
- let state = if offset == 0 || offset == len {
- GraphemeState::Break
- } else {
- GraphemeState::Unknown
- };
- GraphemeCursor {
- offset: offset,
- len: len,
- state: state,
- is_extended: is_extended,
- cat_before: None,
- cat_after: None,
- pre_context_offset: None,
- ris_count: None,
- resuming: false,
- }
- }
-
- // Not sure I'm gonna keep this, the advantage over new() seems thin.
-
- /// Set the cursor to a new location in the same string.
- ///
- /// ```rust
- /// # use unicode_segmentation::GraphemeCursor;
- /// let s = "abcd";
- /// let mut cursor = GraphemeCursor::new(0, s.len(), false);
- /// assert_eq!(cursor.cur_cursor(), 0);
- /// cursor.set_cursor(2);
- /// assert_eq!(cursor.cur_cursor(), 2);
- /// ```
- pub fn set_cursor(&mut self, offset: usize) {
- if offset != self.offset {
- self.offset = offset;
- self.state = if offset == 0 || offset == self.len {
- GraphemeState::Break
- } else {
- GraphemeState::Unknown
- };
- // reset state derived from text around cursor
- self.cat_before = None;
- self.cat_after = None;
- self.ris_count = None;
- }
- }
-
- #[inline]
- /// The current offset of the cursor. Equal to the last value provided to
- /// `new()` or `set_cursor()`, or returned from `next_boundary()` or
- /// `prev_boundary()`.
- ///
- /// ```rust
- /// # use unicode_segmentation::GraphemeCursor;
- /// // Two flags (🇷🇸🇮🇴), each flag is two RIS codepoints, each RIS is 4 bytes.
- /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
- /// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
- /// assert_eq!(cursor.cur_cursor(), 4);
- /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
- /// assert_eq!(cursor.cur_cursor(), 8);
- /// ```
- pub fn cur_cursor(&self) -> usize {
- self.offset
- }
-
- /// Provide additional pre-context when it is needed to decide a boundary.
- /// The end of the chunk must coincide with the value given in the
- /// `GraphemeIncomplete::PreContext` request.
- ///
- /// ```rust
- /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
- /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
- /// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
- /// // Not enough pre-context to decide if there's a boundary between the two flags.
- /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(8)));
- /// // Provide one more Regional Indicator Symbol of pre-context
- /// cursor.provide_context(&flags[4..8], 4);
- /// // Still not enough context to decide.
- /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(4)));
- /// // Provide additional requested context.
- /// cursor.provide_context(&flags[0..4], 0);
- /// // That's enough to decide (it always is when context goes to the start of the string)
- /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Ok(true));
- /// ```
- pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) {
- use tables::grapheme as gr;
- assert!(chunk_start + chunk.len() == self.pre_context_offset.unwrap());
- self.pre_context_offset = None;
- if self.is_extended && chunk_start + chunk.len() == self.offset {
- let ch = chunk.chars().rev().next().unwrap();
- if gr::grapheme_category(ch) == gr::GC_Prepend {
- self.decide(false); // GB9b
- return;
- }
- }
- match self.state {
- GraphemeState::Regional => self.handle_regional(chunk, chunk_start),
- GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start),
- _ => if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start {
- let ch = chunk.chars().rev().next().unwrap();
- self.cat_before = Some(gr::grapheme_category(ch));
- },
- }
- }
-
- fn decide(&mut self, is_break: bool) {
- self.state = if is_break {
- GraphemeState::Break
- } else {
- GraphemeState::NotBreak
- };
- }
-
- fn decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete> {
- self.decide(is_break);
- Ok(is_break)
- }
-
- fn is_boundary_result(&self) -> Result<bool, GraphemeIncomplete> {
- if self.state == GraphemeState::Break {
- Ok(true)
- } else if self.state == GraphemeState::NotBreak {
- Ok(false)
- } else if let Some(pre_context_offset) = self.pre_context_offset {
- Err(GraphemeIncomplete::PreContext(pre_context_offset))
- } else {
- unreachable!("inconsistent state");
- }
- }
-
- fn handle_regional(&mut self, chunk: &str, chunk_start: usize) {
- use tables::grapheme as gr;
- let mut ris_count = self.ris_count.unwrap_or(0);
- for ch in chunk.chars().rev() {
- if gr::grapheme_category(ch) != gr::GC_Regional_Indicator {
- self.ris_count = Some(ris_count);
- self.decide((ris_count % 2) == 0);
- return;
- }
- ris_count += 1;
- }
- self.ris_count = Some(ris_count);
- if chunk_start == 0 {
- self.decide((ris_count % 2) == 0);
- return;
- }
- self.pre_context_offset = Some(chunk_start);
- self.state = GraphemeState::Regional;
- }
-
- fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
- use tables::grapheme as gr;
- let mut iter = chunk.chars().rev();
- if let Some(ch) = iter.next() {
- if gr::grapheme_category(ch) != gr::GC_ZWJ {
- self.decide(true);
- return;
- }
- }
- for ch in iter {
- match gr::grapheme_category(ch) {
- gr::GC_Extend => (),
- gr::GC_Extended_Pictographic => {
- self.decide(false);
- return;
- }
- _ => {
- self.decide(true);
- return;
- }
- }
- }
- if chunk_start == 0 {
- self.decide(true);
- return;
- }
- self.pre_context_offset = Some(chunk_start);
- self.state = GraphemeState::Emoji;
- }
-
- /// Determine whether the current cursor location is a grapheme cluster boundary.
- /// Only a part of the string need be supplied. If `chunk_start` is nonzero or
- /// the length of `chunk` is not equal to `len` on creation, then this method
- /// may return `GraphemeIncomplete::PreContext`. The caller should then
- /// call `provide_context` with the requested chunk, then retry calling this
- /// method.
- ///
- /// For partial chunks, if the cursor is not at the beginning or end of the
- /// string, the chunk should contain at least the codepoint following the cursor.
- /// If the string is nonempty, the chunk must be nonempty.
- ///
- /// All calls should have consistent chunk contents (ie, if a chunk provides
- /// content for a given slice, all further chunks covering that slice must have
- /// the same content for it).
- ///
- /// ```rust
- /// # use unicode_segmentation::GraphemeCursor;
- /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
- /// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
- /// assert_eq!(cursor.is_boundary(flags, 0), Ok(true));
- /// cursor.set_cursor(12);
- /// assert_eq!(cursor.is_boundary(flags, 0), Ok(false));
- /// ```
- pub fn is_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<bool, GraphemeIncomplete> {
- use tables::grapheme as gr;
- if self.state == GraphemeState::Break {
- return Ok(true)
- }
- if self.state == GraphemeState::NotBreak {
- return Ok(false)
- }
- if self.offset < chunk_start || self.offset >= chunk_start + chunk.len() {
- if self.offset > chunk_start + chunk.len() || self.cat_after.is_none() {
- return Err(GraphemeIncomplete::InvalidOffset)
- }
- }
- if let Some(pre_context_offset) = self.pre_context_offset {
- return Err(GraphemeIncomplete::PreContext(pre_context_offset));
- }
- let offset_in_chunk = self.offset - chunk_start;
- if self.cat_after.is_none() {
- let ch = chunk[offset_in_chunk..].chars().next().unwrap();
- self.cat_after = Some(gr::grapheme_category(ch));
- }
- if self.offset == chunk_start {
- let mut need_pre_context = true;
- match self.cat_after.unwrap() {
- gr::GC_Regional_Indicator => self.state = GraphemeState::Regional,
- gr::GC_Extended_Pictographic => self.state = GraphemeState::Emoji,
- _ => need_pre_context = self.cat_before.is_none(),
- }
- if need_pre_context {
- self.pre_context_offset = Some(chunk_start);
- return Err(GraphemeIncomplete::PreContext(chunk_start));
- }
- }
- if self.cat_before.is_none() {
- let ch = chunk[..offset_in_chunk].chars().rev().next().unwrap();
- self.cat_before = Some(gr::grapheme_category(ch));
- }
- match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) {
- PairResult::NotBreak => return self.decision(false),
- PairResult::Break => return self.decision(true),
- PairResult::Extended => {
- let is_extended = self.is_extended;
- return self.decision(!is_extended);
- }
- PairResult::Regional => {
- if let Some(ris_count) = self.ris_count {
- return self.decision((ris_count % 2) == 0);
- }
- self.handle_regional(&chunk[..offset_in_chunk], chunk_start);
- self.is_boundary_result()
- }
- PairResult::Emoji => {
- self.handle_emoji(&chunk[..offset_in_chunk], chunk_start);
- self.is_boundary_result()
- }
- }
- }
-
- /// Find the next boundary after the current cursor position. Only a part of
- /// the string need be supplied. If the chunk is incomplete, then this
- /// method might return `GraphemeIncomplete::PreContext` or
- /// `GraphemeIncomplete::NextChunk`. In the former case, the caller should
- /// call `provide_context` with the requested chunk, then retry. In the
- /// latter case, the caller should provide the chunk following the one
- /// given, then retry.
- ///
- /// See `is_boundary` for expectations on the provided chunk.
- ///
- /// ```rust
- /// # use unicode_segmentation::GraphemeCursor;
- /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
- /// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
- /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
- /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(16)));
- /// assert_eq!(cursor.next_boundary(flags, 0), Ok(None));
- /// ```
- ///
- /// And an example that uses partial strings:
- ///
- /// ```rust
- /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
- /// let s = "abcd";
- /// let mut cursor = GraphemeCursor::new(0, s.len(), false);
- /// assert_eq!(cursor.next_boundary(&s[..2], 0), Ok(Some(1)));
- /// assert_eq!(cursor.next_boundary(&s[..2], 0), Err(GraphemeIncomplete::NextChunk));
- /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(2)));
- /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(3)));
- /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(4)));
- /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None));
- /// ```
- pub fn next_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<Option<usize>, GraphemeIncomplete> {
- use tables::grapheme as gr;
- if self.offset == self.len {
- return Ok(None);
- }
- let mut iter = chunk[self.offset - chunk_start..].chars();
- let mut ch = iter.next().unwrap();
- loop {
- if self.resuming {
- if self.cat_after.is_none() {
- self.cat_after = Some(gr::grapheme_category(ch));
- }
- } else {
- self.offset += ch.len_utf8();
- self.state = GraphemeState::Unknown;
- self.cat_before = self.cat_after.take();
- if self.cat_before.is_none() {
- self.cat_before = Some(gr::grapheme_category(ch));
- }
- if self.cat_before.unwrap() == GraphemeCat::GC_Regional_Indicator {
- self.ris_count = self.ris_count.map(|c| c + 1);
- } else {
- self.ris_count = Some(0);
- }
- if let Some(next_ch) = iter.next() {
- ch = next_ch;
- self.cat_after = Some(gr::grapheme_category(ch));
- } else if self.offset == self.len {
- self.decide(true);
- } else {
- self.resuming = true;
- return Err(GraphemeIncomplete::NextChunk);
- }
- }
- self.resuming = true;
- if self.is_boundary(chunk, chunk_start)? {
- self.resuming = false;
- return Ok(Some(self.offset));
- }
- self.resuming = false;
- }
- }
-
- /// Find the previous boundary after the current cursor position. Only a part
- /// of the string need be supplied. If the chunk is incomplete, then this
- /// method might return `GraphemeIncomplete::PreContext` or
- /// `GraphemeIncomplete::PrevChunk`. In the former case, the caller should
- /// call `provide_context` with the requested chunk, then retry. In the
- /// latter case, the caller should provide the chunk preceding the one
- /// given, then retry.
- ///
- /// See `is_boundary` for expectations on the provided chunk.
- ///
- /// ```rust
- /// # use unicode_segmentation::GraphemeCursor;
- /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
- /// let mut cursor = GraphemeCursor::new(12, flags.len(), false);
- /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(8)));
- /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(0)));
- /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(None));
- /// ```
- ///
- /// And an example that uses partial strings (note the exact return is not
- /// guaranteed, and may be `PrevChunk` or `PreContext` arbitrarily):
- ///
- /// ```rust
- /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
- /// let s = "abcd";
- /// let mut cursor = GraphemeCursor::new(4, s.len(), false);
- /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Ok(Some(3)));
- /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Err(GraphemeIncomplete::PrevChunk));
- /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(2)));
- /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(1)));
- /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(0)));
- /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None));
- /// ```
- pub fn prev_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<Option<usize>, GraphemeIncomplete> {
- use tables::grapheme as gr;
- if self.offset == 0 {
- return Ok(None);
- }
- if self.offset == chunk_start {
- return Err(GraphemeIncomplete::PrevChunk);
- }
- let mut iter = chunk[..self.offset - chunk_start].chars().rev();
- let mut ch = iter.next().unwrap();
- loop {
- if self.offset == chunk_start {
- self.resuming = true;
- return Err(GraphemeIncomplete::PrevChunk);
- }
- if self.resuming {
- self.cat_before = Some(gr::grapheme_category(ch));
- } else {
- self.offset -= ch.len_utf8();
- self.cat_after = self.cat_before.take();
- self.state = GraphemeState::Unknown;
- if let Some(ris_count) = self.ris_count {
- self.ris_count = if ris_count > 0 { Some(ris_count - 1) } else { None };
- }
- if let Some(prev_ch) = iter.next() {
- ch = prev_ch;
- self.cat_before = Some(gr::grapheme_category(ch));
- } else if self.offset == 0 {
- self.decide(true);
- } else {
- self.resuming = true;
- self.cat_after = Some(gr::grapheme_category(ch));
- return Err(GraphemeIncomplete::PrevChunk);
- }
- }
- self.resuming = true;
- if self.is_boundary(chunk, chunk_start)? {
- self.resuming = false;
- return Ok(Some(self.offset));
- }
- self.resuming = false;
- }
- }
-}
-
-#[test]
-fn test_grapheme_cursor_ris_precontext() {
- let s = "\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}";
- let mut c = GraphemeCursor::new(8, s.len(), true);
- assert_eq!(c.is_boundary(&s[4..], 4), Err(GraphemeIncomplete::PreContext(4)));
- c.provide_context(&s[..4], 0);
- assert_eq!(c.is_boundary(&s[4..], 4), Ok(true));
-}
-
-#[test]
-fn test_grapheme_cursor_chunk_start_require_precontext() {
- let s = "\r\n";
- let mut c = GraphemeCursor::new(1, s.len(), true);
- assert_eq!(c.is_boundary(&s[1..], 1), Err(GraphemeIncomplete::PreContext(1)));
- c.provide_context(&s[..1], 0);
- assert_eq!(c.is_boundary(&s[1..], 1), Ok(false));
-}
-
-#[test]
-fn test_grapheme_cursor_prev_boundary() {
- let s = "abcd";
- let mut c = GraphemeCursor::new(3, s.len(), true);
- assert_eq!(c.prev_boundary(&s[2..], 2), Err(GraphemeIncomplete::PrevChunk));
- assert_eq!(c.prev_boundary(&s[..2], 0), Ok(Some(2)));
-}
-
-#[test]
-fn test_grapheme_cursor_prev_boundary_chunk_start() {
- let s = "abcd";
- let mut c = GraphemeCursor::new(2, s.len(), true);
- assert_eq!(c.prev_boundary(&s[2..], 2), Err(GraphemeIncomplete::PrevChunk));
- assert_eq!(c.prev_boundary(&s[..2], 0), Ok(Some(1)));
-}