diff options
Diffstat (limited to 'unicode-segmentation/src/word.rs')
-rw-r--r-- | unicode-segmentation/src/word.rs | 664 |
1 files changed, 0 insertions, 664 deletions
diff --git a/unicode-segmentation/src/word.rs b/unicode-segmentation/src/word.rs deleted file mode 100644 index 6e9c049..0000000 --- a/unicode-segmentation/src/word.rs +++ /dev/null @@ -1,664 +0,0 @@ -// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or -// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license -// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -use core::cmp; -use core::iter::Filter; - -use tables::word::WordCat; - -/// An iterator over the substrings of a string which, after splitting the string on -/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), -/// contain any characters with the -/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) -/// property, or with -/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). -pub struct UnicodeWords<'a> { - inner: Filter<UWordBounds<'a>, fn(&&str) -> bool>, -} - -impl<'a> Iterator for UnicodeWords<'a> { - type Item = &'a str; - - #[inline] - fn next(&mut self) -> Option<&'a str> { self.inner.next() } -} -impl<'a> DoubleEndedIterator for UnicodeWords<'a> { - #[inline] - fn next_back(&mut self) -> Option<&'a str> { self.inner.next_back() } -} - -/// External iterator for a string's -/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries). -#[derive(Clone)] -pub struct UWordBounds<'a> { - string: &'a str, - cat: Option<WordCat>, - catb: Option<WordCat>, -} - -/// External iterator for word boundaries and byte offsets. 
#[derive(Clone)]
pub struct UWordBoundIndices<'a> {
    // Address of the first byte of the original string, recorded at
    // construction time; segment offsets are computed relative to it.
    start_offset: usize,
    // The underlying boundary iterator that produces the segments.
    iter: UWordBounds<'a>,
}

impl<'a> UWordBoundIndices<'a> {
    #[inline]
    /// View the underlying data (the part yet to be iterated) as a slice of the original string.
    ///
    /// ```rust
    /// # use unicode_segmentation::UnicodeSegmentation;
    /// let mut iter = "Hello world".split_word_bound_indices();
    /// assert_eq!(iter.as_str(), "Hello world");
    /// iter.next();
    /// assert_eq!(iter.as_str(), " world");
    /// iter.next();
    /// assert_eq!(iter.as_str(), "world");
    /// ```
    pub fn as_str(&self) -> &'a str {
        self.iter.as_str()
    }
}

impl<'a> Iterator for UWordBoundIndices<'a> {
    type Item = (usize, &'a str);

    /// Returns the next segment from the front together with its byte offset
    /// in the original string.
    #[inline]
    fn next(&mut self) -> Option<(usize, &'a str)> {
        // Every segment is a sub-slice of the original string, so its offset
        // is the distance between its address and the recorded start address.
        self.iter.next().map(|s| (s.as_ptr() as usize - self.start_offset, s))
    }

    /// Forwards the hint of the underlying boundary iterator.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.iter.size_hint()
    }
}

impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> {
    /// Returns the next segment from the back together with its byte offset
    /// in the original string.
    #[inline]
    fn next_back(&mut self) -> Option<(usize, &'a str)> {
        self.iter.next_back().map(|s| (s.as_ptr() as usize - self.start_offset, s))
    }
}

// state machine for word boundary rules
//
// `Start` is the state before any character of the current segment has been
// classified. Most other variants are named after the word category that was
// seen last; `Regional` and `FormatExtend` carry extra detail (see below).
#[derive(Clone,Copy,PartialEq,Eq,Debug)]
enum UWordBoundsState {
    Start,
    Letter,
    HLetter,
    Numeric,
    Katakana,
    ExtendNumLet,
    Regional(RegionalState),
    FormatExtend(FormatExtendType),
    Zwj,
    Emoji,
    WSegSpace,
}

// subtypes for FormatExtend state in UWordBoundsState
//
// The `Require*` variants mark a tentative continuation (for example after a
// mid-word punctuation character): if the required category does not follow,
// the iterators rewind to the saved index. How `AcceptAny`, `AcceptNone` and
// `AcceptQLetter` are treated differs between the forward and backward
// iterators; see the `FormatExtend(t)` arms and the rewind checks there.
#[derive(Clone,Copy,PartialEq,Eq,Debug)]
enum FormatExtendType {
    AcceptAny,
    AcceptNone,
    RequireLetter,
    RequireHLetter,
    AcceptQLetter,
    RequireNumeric,
}

// Progress through a run of Regional_Indicator characters (rule WB13c).
// Forward iteration moves Start -> Half -> Full. Backward iteration starts in
// `Unknown` and resolves it by counting the indicators that precede the
// current position.
#[derive(Clone,Copy,PartialEq,Eq,Debug)]
enum RegionalState {
    Half,
    Full,
    Unknown,
}

// Returns true when the emoji table classifies `ch` as `EC_Extended_Pictographic`.
fn is_emoji(ch: char) -> bool {
    use tables::emoji;
    emoji::emoji_category(ch) == emoji::EmojiCat::EC_Extended_Pictographic
}

impl<'a> Iterator for UWordBounds<'a> {
    type Item = &'a str;

    /// Bounds the number of remaining segments: at least one if any bytes
    /// remain, and at most one per remaining byte.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let slen = self.string.len();
        (cmp::min(slen, 1), Some(slen))
    }

    /// Splits the next segment off the front of the remaining string, or
    /// returns `None` once the remaining string is empty.
    #[inline]
    fn next(&mut self) -> Option<&'a str> {
        use self::UWordBoundsState::*;
        use self::FormatExtendType::*;
        use tables::word as wd;
        if self.string.len() == 0 {
            return None;
        }

        // take_curr: whether the character at `idx` belongs to the segment
        // being returned. take_cat: whether `cat` may be cached for the next call.
        let mut take_curr = true;
        let mut take_cat = true;
        let mut idx = 0;
        // Position and category to rewind to when a `Require*` lookahead fails.
        let mut saveidx = 0;
        let mut state = Start;
        let mut cat = wd::WC_Any;
        let mut savecat = wd::WC_Any;

        // Whether or not the previous category was ZWJ
        // ZWJs get collapsed, so this handles precedence of WB3c over WB4
        let mut prev_zwj;
        // If extend/format/zwj were skipped. Handles precedence of WB3d over WB4
        let mut skipped_format_extend = false;
        for (curr, ch) in self.string.char_indices() {
            idx = curr;
            prev_zwj = cat == wd::WC_ZWJ;
            // if there's a category cached, grab it
            cat = match self.cat {
                None => wd::word_category(ch),
                _ => self.cat.take().unwrap()
            };
            take_cat = true;

            // handle rule WB4
            // just skip all format, extend, and zwj chars
            // note that Start is a special case: if there's a bunch of Format | Extend
            // characters at the beginning of a block of text, dump them out as one unit.
            //
            // (This is not obvious from the wording of UAX#29, but if you look at the
            // test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
            // then the "correct" interpretation of WB4 becomes apparent.)
            if state != Start {
                match cat {
                    wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => {
                        skipped_format_extend = true;
                        continue
                    }
                    _ => {}
                }
            }

            // rule WB3c
            // WB4 makes all ZWJs collapse into the previous state
            // but you can still be in a Zwj state if you started with Zwj
            //
            // This means that an EP + Zwj will collapse into EP, which is wrong,
            // since EP+EP is not a boundary but EP+ZWJ+EP is
            //
            // Thus, we separately keep track of whether or not the last character
            // was a ZWJ. This is an additional bit of state tracked outside of the
            // state enum; the state enum represents the last non-zwj state encountered.
            // When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
            // however we are in the previous state for the purposes of all other rules.
            if prev_zwj {
                if is_emoji(ch) {
                    state = Emoji;
                    continue;
                }
            }
            // Don't use `continue` in this match without updating `cat`
            state = match state {
                Start if cat == wd::WC_CR => {
                    // Also consume a directly following LF. The fixed `1`
                    // presumably relies on WC_LF covering only U+000A (one
                    // byte in UTF-8) -- TODO confirm against the tables.
                    idx += match self.get_next_cat(idx) {
                        Some(ncat) if ncat == wd::WC_LF => 1,       // rule WB3
                        _ => 0
                    };
                    break;  // rule WB3a
                },
                Start => match cat {
                    wd::WC_ALetter => Letter,           // rule WB5, WB6, WB9, WB13a
                    wd::WC_Hebrew_Letter => HLetter,    // rule WB5, WB6, WB7a, WB7b, WB9, WB13a
                    wd::WC_Numeric => Numeric,          // rule WB8, WB10, WB12, WB13a
                    wd::WC_Katakana => Katakana,        // rule WB13, WB13a
                    wd::WC_ExtendNumLet => ExtendNumLet,    // rule WB13a, WB13b
                    wd::WC_Regional_Indicator => Regional(RegionalState::Half),  // rule WB13c
                    wd::WC_LF | wd::WC_Newline => break,    // rule WB3a
                    wd::WC_ZWJ => Zwj,                      // rule WB3c
                    wd::WC_WSegSpace => WSegSpace,          // rule WB3d
                    _ => {
                        if let Some(ncat) = self.get_next_cat(idx) {                // rule WB4
                            if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ {
                                state = FormatExtend(AcceptNone);
                                // The next character's category was just
                                // looked up; cache it so the next loop
                                // iteration does not compute it again.
                                self.cat = Some(ncat);
                                continue;
                            }
                        }
                        break;                                                      // rule WB999
                    }
                },
                WSegSpace => match cat {
                    wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Zwj => {
                    // We already handle WB3c above.
                    take_curr = false;
                    break;
                }
                Letter | HLetter => match cat {
                    wd::WC_ALetter => Letter,                   // rule WB5
                    wd::WC_Hebrew_Letter => HLetter,            // rule WB5
                    wd::WC_Numeric => Numeric,                  // rule WB9
                    wd::WC_ExtendNumLet => ExtendNumLet,        // rule WB13a
                    wd::WC_Double_Quote if state == HLetter => {
                        savecat = cat;
                        saveidx = idx;
                        FormatExtend(RequireHLetter)                        // rule WB7b
                    },
                    wd::WC_Single_Quote if state == HLetter => {
                        FormatExtend(AcceptQLetter)                         // rule WB7a
                    },
                    wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
                        savecat = cat;
                        saveidx = idx;
                        FormatExtend(RequireLetter)                         // rule WB6
                    },
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Numeric => match cat {
                    wd::WC_Numeric => Numeric,                  // rule WB8
                    wd::WC_ALetter => Letter,                   // rule WB10
                    wd::WC_Hebrew_Letter => HLetter,            // rule WB10
                    wd::WC_ExtendNumLet => ExtendNumLet,        // rule WB13a
                    wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
                        savecat = cat;
                        saveidx = idx;
                        FormatExtend(RequireNumeric)            // rule WB12
                    },
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Katakana => match cat {
                    wd::WC_Katakana => Katakana,                // rule WB13
                    wd::WC_ExtendNumLet => ExtendNumLet,        // rule WB13a
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                ExtendNumLet => match cat {
                    wd::WC_ExtendNumLet => ExtendNumLet,        // rule WB13a
                    wd::WC_ALetter => Letter,                   // rule WB13b
                    wd::WC_Hebrew_Letter => HLetter,            // rule WB13b
                    wd::WC_Numeric => Numeric,                  // rule WB13b
                    wd::WC_Katakana => Katakana,                // rule WB13b
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Regional(RegionalState::Full) => {
                    // if it reaches here we've gone too far,
                    // a full flag can only compose with ZWJ/Extend/Format
                    // preceding it.
                    take_curr = false;
                    break;
                }
                Regional(RegionalState::Half) => match cat {
                    wd::WC_Regional_Indicator => Regional(RegionalState::Full),      // rule WB13c
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Regional(_) => unreachable!("RegionalState::Unknown should not occur on forward iteration"),
                Emoji => {
                    // We already handle WB3c above. If you've reached this point, the emoji sequence is over.
                    take_curr = false;
                    break;
                },
                FormatExtend(t) => match t {    // handle FormatExtends depending on what type
                    RequireNumeric if cat == wd::WC_Numeric => Numeric,     // rule WB11
                    RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter,   // rule WB7
                    RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a
                    RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter,   // rule WB7b
                    AcceptNone | AcceptQLetter => {
                        take_curr = false;  // emit all the Format|Extend characters
                        take_cat = false;
                        break;
                    },
                    _ => break      // rewind (in if statement below)
                }
            }
        }

        if let FormatExtend(t) = state {
            // we were looking for something and didn't find it; we have to back up
            if t == RequireLetter || t == RequireHLetter || t == RequireNumeric {
                idx = saveidx;
                cat = savecat;
                take_curr = false;
            }
        }

        // If the character at `idx` belongs to this segment, step past it and
        // drop any cached category. Otherwise that character starts the next
        // segment, and its category is kept for the next call when allowed.
        self.cat = if take_curr {
            idx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
            None
        } else if take_cat {
            Some(cat)
        } else {
            None
        };

        let retstr = &self.string[..idx];
        self.string = &self.string[idx..];
        Some(retstr)
    }
}

impl<'a> DoubleEndedIterator for UWordBounds<'a> {
    /// Splits the next segment off the back of the remaining string, or
    /// returns `None` once the remaining string is empty.
    #[inline]
    fn next_back(&mut self) -> Option<&'a str> {
        use self::UWordBoundsState::*;
        use self::FormatExtendType::*;
        use tables::word as wd;
        if self.string.len() == 0 {
            return None;
        }

        let mut take_curr = true;
        let mut take_cat = true;
        // Start at the byte index of the last character.
        let mut idx = self.string.len();
        idx -= self.string.chars().next_back().unwrap().len_utf8();
        // previdx: index of the character visited just before the current one
        // (i.e. the one after it in the string). saveidx/savestate: where to
        // return to when a tentative continuation has to be undone.
        let mut previdx = idx;
        let mut saveidx = idx;
        let mut state = Start;
        let mut savestate = Start;
        let mut cat = wd::WC_Any;

        let mut skipped_format_extend = false;

        for (curr, ch) in self.string.char_indices().rev() {
            previdx = idx;
            idx = curr;

            // if there's a category cached, grab it
            cat = match self.catb {
                None => wd::word_category(ch),
                _ => self.catb.take().unwrap()
            };
            take_cat = true;

            // backward iterator over word boundaries. Mostly the same as the forward
            // iterator, with two weirdnesses:
            // (1) If we encounter a single quote in the Start state, we have to check for a
            //     Hebrew Letter immediately before it.
            // (2) Format and Extend char handling takes some gymnastics.

            if cat == wd::WC_Extend
                || cat == wd::WC_Format
                || (cat == wd::WC_ZWJ && state != Zwj) { // WB3c has more priority so we should not
                                                         // fold in that case
                if match state {
                    FormatExtend(_) | Start => false,
                    _ => true
                } {
                    saveidx = previdx;
                    savestate = state;
                    state = FormatExtend(AcceptNone);
                }

                if state != Start {
                    continue;
                }
            } else if state == FormatExtend(AcceptNone) {
                // finished a scan of some Format|Extend chars, restore previous state
                state = savestate;
                previdx = saveidx;
                take_cat = false;
                skipped_format_extend = true;
            }

            // Don't use `continue` in this match without updating `catb`
            state = match state {
                Start | FormatExtend(AcceptAny) => match cat {
                    _ if is_emoji(ch) => Zwj,
                    wd::WC_ALetter => Letter,           // rule WB5, WB7, WB10, WB13b
                    wd::WC_Hebrew_Letter => HLetter,    // rule WB5, WB7, WB7c, WB10, WB13b
                    wd::WC_Numeric => Numeric,          // rule WB8, WB9, WB11, WB13b
                    wd::WC_Katakana => Katakana,                    // rule WB13, WB13b
                    wd::WC_ExtendNumLet => ExtendNumLet,                    // rule WB13a
                    wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c
                    // rule WB4:
                    wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny),
                    wd::WC_Single_Quote => {
                        saveidx = idx;
                        FormatExtend(AcceptQLetter)                         // rule WB7a
                    },
                    wd::WC_WSegSpace => WSegSpace,
                    wd::WC_CR | wd::WC_LF | wd::WC_Newline => {
                        if state == Start {
                            if cat == wd::WC_LF {
                                // Also consume a directly preceding CR. The
                                // fixed `1` presumably relies on WC_CR
                                // covering only U+000D (one byte in UTF-8)
                                // -- TODO confirm against the tables.
                                idx -= match self.get_prev_cat(idx) {
                                    Some(pcat) if pcat == wd::WC_CR => 1,   // rule WB3
                                    _ => 0
                                };
                            }
                        } else {
                            take_curr = false;
                        }
                        break;                                              // rule WB3a
                    },
                    _ => break                              // rule WB999
                },
                Zwj => match cat {                          // rule WB3c
                    wd::WC_ZWJ => {
                        FormatExtend(AcceptAny)
                    }
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                WSegSpace => match cat {                          // rule WB3d
                    wd::WC_WSegSpace if !skipped_format_extend => {
                        WSegSpace
                    }
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Letter | HLetter => match cat {
                    wd::WC_ALetter => Letter,               // rule WB5
                    wd::WC_Hebrew_Letter => HLetter,        // rule WB5
                    wd::WC_Numeric => Numeric,              // rule WB10
                    wd::WC_ExtendNumLet => ExtendNumLet,    // rule WB13b
                    wd::WC_Double_Quote if state == HLetter => {
                        saveidx = previdx;
                        FormatExtend(RequireHLetter)         // rule WB7c
                    },
                    wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
                        saveidx = previdx;
                        FormatExtend(RequireLetter)          // rule WB7
                    },
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Numeric => match cat {
                    wd::WC_Numeric => Numeric,              // rule WB8
                    wd::WC_ALetter => Letter,               // rule WB9
                    wd::WC_Hebrew_Letter => HLetter,        // rule WB9
                    wd::WC_ExtendNumLet => ExtendNumLet,    // rule WB13b
                    wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
                        saveidx = previdx;
                        FormatExtend(RequireNumeric)         // rule WB11
                    },
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Katakana => match cat {
                    wd::WC_Katakana => Katakana,            // rule WB13
                    wd::WC_ExtendNumLet => ExtendNumLet,    // rule WB13b
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                ExtendNumLet => match cat {
                    wd::WC_ExtendNumLet => ExtendNumLet,    // rule WB13a
                    wd::WC_ALetter => Letter,               // rule WB13a
                    wd::WC_Hebrew_Letter => HLetter,        // rule WB13a
                    wd::WC_Numeric => Numeric,              // rule WB13a
                    wd::WC_Katakana => Katakana,            // rule WB13a
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Regional(mut regional_state) => match cat {
                    // rule WB13c
                    wd::WC_Regional_Indicator => {
                        if regional_state == RegionalState::Unknown {
                            // Count the run of regional indicators that ends
                            // just before `previdx` (ignoring ZWJ/Extend/Format
                            // characters) to decide how the indicators pair up.
                            let count = self.string[..previdx]
                                            .chars().rev()
                                            .map(|c| wd::word_category(c))
                                            .filter(|&c| ! (c == wd::WC_ZWJ || c == wd::WC_Extend || c == wd::WC_Format))
                                            .take_while(|&c| c == wd::WC_Regional_Indicator)
                                            .count();
                            regional_state = if count % 2 == 0 {
                                RegionalState::Full
                            } else {
                                RegionalState::Half
                            };
                        }
                        if regional_state == RegionalState::Full {
                            take_curr = false;
                            break;
                        } else {
                            Regional(RegionalState::Full)
                        }
                    }
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Emoji => {
                    if is_emoji(ch) {           // rule WB3c
                        Zwj
                    } else {
                        take_curr = false;
                        break;
                    }
                },
                FormatExtend(t) => match t {
                    RequireNumeric if cat == wd::WC_Numeric => Numeric,          // rule WB12
                    RequireLetter if cat == wd::WC_ALetter => Letter,            // rule WB6
                    RequireLetter if cat == wd::WC_Hebrew_Letter => HLetter,     // rule WB6
                    AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter,     // rule WB7a
                    RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter,    // rule WB7b
                    _ => break  // backtracking will happen
                }
            }
        }

        if let FormatExtend(t) = state {
            // if we required something but didn't find it, backtrack
            if t == RequireLetter || t == RequireHLetter ||
                t == RequireNumeric || t == AcceptNone || t == AcceptQLetter {
                previdx = saveidx;
                take_cat = false;
                take_curr = false;
            }
        }

        // If the character at `idx` belongs to this segment, the segment
        // starts there. Otherwise it starts at `previdx`, and the category of
        // the character at `idx` is kept for the next call when allowed.
        self.catb = if take_curr {
            None
        } else {
            idx = previdx;
            if take_cat {
                Some(cat)
            } else {
                None
            }
        };

        let retstr = &self.string[idx..];
        self.string = &self.string[..idx];
        Some(retstr)
    }
}

impl<'a> UWordBounds<'a> {
    #[inline]
    /// View the underlying data (the part yet to be iterated) as a slice of the original string.
    ///
    /// ```rust
    /// # use unicode_segmentation::UnicodeSegmentation;
    /// let mut iter = "Hello world".split_word_bounds();
    /// assert_eq!(iter.as_str(), "Hello world");
    /// iter.next();
    /// assert_eq!(iter.as_str(), " world");
    /// iter.next();
    /// assert_eq!(iter.as_str(), "world");
    /// ```
    pub fn as_str(&self) -> &'a str {
        self.string
    }

    // Returns the word category of the character that follows the one
    // starting at byte `idx`, or `None` when that character is the last one.
    // Panics (via `unwrap`) if no character starts at `idx`.
    #[inline]
    fn get_next_cat(&self, idx: usize) -> Option<WordCat> {
        use tables::word as wd;
        let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
        if nidx < self.string.len() {
            let nch = self.string[nidx..].chars().next().unwrap();
            Some(wd::word_category(nch))
        } else {
            None
        }
    }

    // Returns the word category of the character that ends right before byte
    // `idx`, or `None` when `idx` is the start of the remaining string.
    #[inline]
    fn get_prev_cat(&self, idx: usize) -> Option<WordCat> {
        use tables::word as wd;
        if idx > 0 {
            let nch = self.string[..idx].chars().next_back().unwrap();
            Some(wd::word_category(nch))
        } else {
            None
        }
    }
}

/// Creates a word-boundary iterator over `s` with no cached categories.
#[inline]
pub fn new_word_bounds<'b>(s: &'b str) -> UWordBounds<'b> {
    UWordBounds { string: s, cat: None, catb: None }
}

/// Creates a word-boundary iterator over `s` that also yields byte offsets.
///
/// The address of `s` is recorded so that each segment's offset can later be
/// derived from its own address.
#[inline]
pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> {
    UWordBoundIndices { start_offset: s.as_ptr() as usize, iter: new_word_bounds(s) }
}

/// Creates an iterator over the word-boundary segments of `s` that contain at
/// least one character accepted by `tables::util::is_alphanumeric`.
#[inline]
pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
    use super::UnicodeSegmentation;
    use tables::util::is_alphanumeric;

    fn has_alphanumeric(s: &&str) -> bool { s.chars().any(|c| is_alphanumeric(c)) }
    let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer

    UnicodeWords { inner: s.split_word_bounds().filter(has_alphanumeric) }
}