aboutsummaryrefslogtreecommitdiff
path: root/unicode-segmentation/src/word.rs
diff options
context:
space:
mode:
Diffstat (limited to 'unicode-segmentation/src/word.rs')
-rw-r--r-- unicode-segmentation/src/word.rs 664
1 files changed, 0 insertions, 664 deletions
diff --git a/unicode-segmentation/src/word.rs b/unicode-segmentation/src/word.rs
deleted file mode 100644
index 6e9c049..0000000
--- a/unicode-segmentation/src/word.rs
+++ /dev/null
@@ -1,664 +0,0 @@
-// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
-// file at the top-level directory of this distribution and at
-// http://rust-lang.org/COPYRIGHT.
-//
-// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
-// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
-// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
-// option. This file may not be copied, modified, or distributed
-// except according to those terms.
-
-use core::cmp;
-use core::iter::Filter;
-
-use tables::word::WordCat;
-
-/// An iterator over the substrings of a string which, after splitting the string on
-/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
-/// contain any characters with the
-/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
-/// property, or with
-/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
-pub struct UnicodeWords<'a> {
- inner: Filter<UWordBounds<'a>, fn(&&str) -> bool>,
-}
-
-impl<'a> Iterator for UnicodeWords<'a> {
- type Item = &'a str;
-
- #[inline]
- fn next(&mut self) -> Option<&'a str> { self.inner.next() }
-}
-impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
- #[inline]
- fn next_back(&mut self) -> Option<&'a str> { self.inner.next_back() }
-}
-
-/// External iterator for a string's
-/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
-#[derive(Clone)]
-pub struct UWordBounds<'a> {
- string: &'a str,
- cat: Option<WordCat>,
- catb: Option<WordCat>,
-}
-
-/// External iterator for word boundaries and byte offsets.
-#[derive(Clone)]
-pub struct UWordBoundIndices<'a> {
- start_offset: usize,
- iter: UWordBounds<'a>,
-}
-
-impl<'a> UWordBoundIndices<'a> {
- #[inline]
- /// View the underlying data (the part yet to be iterated) as a slice of the original string.
- ///
- /// ```rust
- /// # use unicode_segmentation::UnicodeSegmentation;
- /// let mut iter = "Hello world".split_word_bound_indices();
- /// assert_eq!(iter.as_str(), "Hello world");
- /// iter.next();
- /// assert_eq!(iter.as_str(), " world");
- /// iter.next();
- /// assert_eq!(iter.as_str(), "world");
- /// ```
- pub fn as_str(&self) -> &'a str {
- self.iter.as_str()
- }
-}
-
-impl<'a> Iterator for UWordBoundIndices<'a> {
- type Item = (usize, &'a str);
-
- #[inline]
- fn next(&mut self) -> Option<(usize, &'a str)> {
- self.iter.next().map(|s| (s.as_ptr() as usize - self.start_offset, s))
- }
-
- #[inline]
- fn size_hint(&self) -> (usize, Option<usize>) {
- self.iter.size_hint()
- }
-}
-
-impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> {
- #[inline]
- fn next_back(&mut self) -> Option<(usize, &'a str)> {
- self.iter.next_back().map(|s| (s.as_ptr() as usize - self.start_offset, s))
- }
-}
-
-// state machine for word boundary rules
-#[derive(Clone,Copy,PartialEq,Eq,Debug)]
-enum UWordBoundsState {
- Start,
- Letter,
- HLetter,
- Numeric,
- Katakana,
- ExtendNumLet,
- Regional(RegionalState),
- FormatExtend(FormatExtendType),
- Zwj,
- Emoji,
- WSegSpace,
-}
-
-// subtypes for FormatExtend state in UWordBoundsState
-#[derive(Clone,Copy,PartialEq,Eq,Debug)]
-enum FormatExtendType {
- AcceptAny,
- AcceptNone,
- RequireLetter,
- RequireHLetter,
- AcceptQLetter,
- RequireNumeric,
-}
-
-#[derive(Clone,Copy,PartialEq,Eq,Debug)]
-enum RegionalState {
- Half,
- Full,
- Unknown,
-}
-
-fn is_emoji(ch: char) -> bool {
- use tables::emoji;
- emoji::emoji_category(ch) == emoji::EmojiCat::EC_Extended_Pictographic
-}
-
-impl<'a> Iterator for UWordBounds<'a> {
- type Item = &'a str;
-
- #[inline]
- fn size_hint(&self) -> (usize, Option<usize>) {
- let slen = self.string.len();
- (cmp::min(slen, 1), Some(slen))
- }
-
- #[inline]
- fn next(&mut self) -> Option<&'a str> {
- use self::UWordBoundsState::*;
- use self::FormatExtendType::*;
- use tables::word as wd;
- if self.string.len() == 0 {
- return None;
- }
-
- let mut take_curr = true;
- let mut take_cat = true;
- let mut idx = 0;
- let mut saveidx = 0;
- let mut state = Start;
- let mut cat = wd::WC_Any;
- let mut savecat = wd::WC_Any;
-
- // Whether or not the previous category was ZWJ
- // ZWJs get collapsed, so this handles precedence of WB3c over WB4
- let mut prev_zwj;
- // If extend/format/zwj were skipped. Handles precedence of WB3d over WB4
- let mut skipped_format_extend = false;
- for (curr, ch) in self.string.char_indices() {
- idx = curr;
- prev_zwj = cat == wd::WC_ZWJ;
- // if there's a category cached, grab it
- cat = match self.cat {
- None => wd::word_category(ch),
- _ => self.cat.take().unwrap()
- };
- take_cat = true;
-
- // handle rule WB4
- // just skip all format, extend, and zwj chars
- // note that Start is a special case: if there's a bunch of Format | Extend
- // characters at the beginning of a block of text, dump them out as one unit.
- //
- // (This is not obvious from the wording of UAX#29, but if you look at the
- // test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
- // then the "correct" interpretation of WB4 becomes apparent.)
- if state != Start {
- match cat {
- wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => {
- skipped_format_extend = true;
- continue
- }
- _ => {}
- }
- }
-
- // rule WB3c
- // WB4 makes all ZWJs collapse into the previous state
- // but you can still be in a Zwj state if you started with Zwj
- //
- // This means that an EP + Zwj will collapse into EP, which is wrong,
- // since EP+EP is a boundary but EP+ZWJ+EP is not
- //
- // Thus, we separately keep track of whether or not the last character
- // was a ZWJ. This is an additional bit of state tracked outside of the
- // state enum; the state enum represents the last non-zwj state encountered.
- // When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
- // however we are in the previous state for the purposes of all other rules.
- if prev_zwj {
- if is_emoji(ch) {
- state = Emoji;
- continue;
- }
- }
- // Don't use `continue` in this match without updating `cat`
- state = match state {
- Start if cat == wd::WC_CR => {
- idx += match self.get_next_cat(idx) {
- Some(ncat) if ncat == wd::WC_LF => 1, // rule WB3
- _ => 0
- };
- break; // rule WB3a
- },
- Start => match cat {
- wd::WC_ALetter => Letter, // rule WB5, WB6, WB9, WB13a
- wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB6, WB7a, WB7b, WB9, WB13a
- wd::WC_Numeric => Numeric, // rule WB8, WB10, WB12, WB13a
- wd::WC_Katakana => Katakana, // rule WB13, WB13a
- wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b
- wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c
- wd::WC_LF | wd::WC_Newline => break, // rule WB3a
- wd::WC_ZWJ => Zwj, // rule WB3c
- wd::WC_WSegSpace => WSegSpace, // rule WB3d
- _ => {
- if let Some(ncat) = self.get_next_cat(idx) { // rule WB4
- if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ {
- state = FormatExtend(AcceptNone);
- self.cat = Some(ncat);
- continue;
- }
- }
- break; // rule WB999
- }
- },
- WSegSpace => match cat {
- wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
- _ => {
- take_curr = false;
- break;
- }
- },
- Zwj => {
- // We already handle WB3c above.
- take_curr = false;
- break;
- }
- Letter | HLetter => match cat {
- wd::WC_ALetter => Letter, // rule WB5
- wd::WC_Hebrew_Letter => HLetter, // rule WB5
- wd::WC_Numeric => Numeric, // rule WB9
- wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
- wd::WC_Double_Quote if state == HLetter => {
- savecat = cat;
- saveidx = idx;
- FormatExtend(RequireHLetter) // rule WB7b
- },
- wd::WC_Single_Quote if state == HLetter => {
- FormatExtend(AcceptQLetter) // rule WB7a
- },
- wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
- savecat = cat;
- saveidx = idx;
- FormatExtend(RequireLetter) // rule WB6
- },
- _ => {
- take_curr = false;
- break;
- }
- },
- Numeric => match cat {
- wd::WC_Numeric => Numeric, // rule WB8
- wd::WC_ALetter => Letter, // rule WB10
- wd::WC_Hebrew_Letter => HLetter, // rule WB10
- wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
- wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
- savecat = cat;
- saveidx = idx;
- FormatExtend(RequireNumeric) // rule WB12
- },
- _ => {
- take_curr = false;
- break;
- }
- },
- Katakana => match cat {
- wd::WC_Katakana => Katakana, // rule WB13
- wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
- _ => {
- take_curr = false;
- break;
- }
- },
- ExtendNumLet => match cat {
- wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
- wd::WC_ALetter => Letter, // rule WB13b
- wd::WC_Hebrew_Letter => HLetter, // rule WB13b
- wd::WC_Numeric => Numeric, // rule WB13b
- wd::WC_Katakana => Katakana, // rule WB13b
- _ => {
- take_curr = false;
- break;
- }
- },
- Regional(RegionalState::Full) => {
- // if it reaches here we've gone too far,
- // a full flag can only compose with ZWJ/Extend/Format
- // following it.
- take_curr = false;
- break;
- }
- Regional(RegionalState::Half) => match cat {
- wd::WC_Regional_Indicator => Regional(RegionalState::Full), // rule WB13c
- _ => {
- take_curr = false;
- break;
- }
- },
- Regional(_) => unreachable!("RegionalState::Unknown should not occur on forward iteration"),
- Emoji => {
- // We already handle WB3c above. If you've reached this point, the emoji sequence is over.
- take_curr = false;
- break;
- },
- FormatExtend(t) => match t { // handle FormatExtends depending on what type
- RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB11
- RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter, // rule WB7
- RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a
- RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
- AcceptNone | AcceptQLetter => {
- take_curr = false; // emit all the Format|Extend characters
- take_cat = false;
- break;
- },
- _ => break // rewind (in if statement below)
- }
- }
- }
-
- if let FormatExtend(t) = state {
- // we were looking for something and didn't find it; we have to back up
- if t == RequireLetter || t == RequireHLetter || t == RequireNumeric {
- idx = saveidx;
- cat = savecat;
- take_curr = false;
- }
- }
-
- self.cat = if take_curr {
- idx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
- None
- } else if take_cat {
- Some(cat)
- } else {
- None
- };
-
- let retstr = &self.string[..idx];
- self.string = &self.string[idx..];
- Some(retstr)
- }
-}
-
-impl<'a> DoubleEndedIterator for UWordBounds<'a> {
- #[inline]
- fn next_back(&mut self) -> Option<&'a str> {
- use self::UWordBoundsState::*;
- use self::FormatExtendType::*;
- use tables::word as wd;
- if self.string.len() == 0 {
- return None;
- }
-
- let mut take_curr = true;
- let mut take_cat = true;
- let mut idx = self.string.len();
- idx -= self.string.chars().next_back().unwrap().len_utf8();
- let mut previdx = idx;
- let mut saveidx = idx;
- let mut state = Start;
- let mut savestate = Start;
- let mut cat = wd::WC_Any;
-
- let mut skipped_format_extend = false;
-
- for (curr, ch) in self.string.char_indices().rev() {
- previdx = idx;
- idx = curr;
-
- // if there's a category cached, grab it
- cat = match self.catb {
- None => wd::word_category(ch),
- _ => self.catb.take().unwrap()
- };
- take_cat = true;
-
- // backward iterator over word boundaries. Mostly the same as the forward
- // iterator, with two weirdnesses:
- // (1) If we encounter a single quote in the Start state, we have to check for a
- // Hebrew Letter immediately before it.
- // (2) Format and Extend char handling takes some gymnastics.
-
- if cat == wd::WC_Extend
- || cat == wd::WC_Format
- || (cat == wd::WC_ZWJ && state != Zwj) { // WB3c has more priority so we should not
- // fold in that case
- if match state {
- FormatExtend(_) | Start => false,
- _ => true
- } {
- saveidx = previdx;
- savestate = state;
- state = FormatExtend(AcceptNone);
- }
-
- if state != Start {
- continue;
- }
- } else if state == FormatExtend(AcceptNone) {
- // finished a scan of some Format|Extend chars, restore previous state
- state = savestate;
- previdx = saveidx;
- take_cat = false;
- skipped_format_extend = true;
- }
-
- // Don't use `continue` in this match without updating `catb`
- state = match state {
- Start | FormatExtend(AcceptAny) => match cat {
- _ if is_emoji(ch) => Zwj,
- wd::WC_ALetter => Letter, // rule WB5, WB7, WB10, WB13b
- wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB7, WB7c, WB10, WB13b
- wd::WC_Numeric => Numeric, // rule WB8, WB9, WB11, WB13b
- wd::WC_Katakana => Katakana, // rule WB13, WB13b
- wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
- wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c
- // rule WB4:
- wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny),
- wd::WC_Single_Quote => {
- saveidx = idx;
- FormatExtend(AcceptQLetter) // rule WB7a
- },
- wd::WC_WSegSpace => WSegSpace,
- wd::WC_CR | wd::WC_LF | wd::WC_Newline => {
- if state == Start {
- if cat == wd::WC_LF {
- idx -= match self.get_prev_cat(idx) {
- Some(pcat) if pcat == wd::WC_CR => 1, // rule WB3
- _ => 0
- };
- }
- } else {
- take_curr = false;
- }
- break; // rule WB3a
- },
- _ => break // rule WB999
- },
- Zwj => match cat { // rule WB3c
- wd::WC_ZWJ => {
- FormatExtend(AcceptAny)
- }
- _ => {
- take_curr = false;
- break;
- }
- },
- WSegSpace => match cat { // rule WB3d
- wd::WC_WSegSpace if !skipped_format_extend => {
- WSegSpace
- }
- _ => {
- take_curr = false;
- break;
- }
- },
- Letter | HLetter => match cat {
- wd::WC_ALetter => Letter, // rule WB5
- wd::WC_Hebrew_Letter => HLetter, // rule WB5
- wd::WC_Numeric => Numeric, // rule WB10
- wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
- wd::WC_Double_Quote if state == HLetter => {
- saveidx = previdx;
- FormatExtend(RequireHLetter) // rule WB7c
- },
- wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
- saveidx = previdx;
- FormatExtend(RequireLetter) // rule WB7
- },
- _ => {
- take_curr = false;
- break;
- }
- },
- Numeric => match cat {
- wd::WC_Numeric => Numeric, // rule WB8
- wd::WC_ALetter => Letter, // rule WB9
- wd::WC_Hebrew_Letter => HLetter, // rule WB9
- wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
- wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
- saveidx = previdx;
- FormatExtend(RequireNumeric) // rule WB11
- },
- _ => {
- take_curr = false;
- break;
- }
- },
- Katakana => match cat {
- wd::WC_Katakana => Katakana, // rule WB13
- wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
- _ => {
- take_curr = false;
- break;
- }
- },
- ExtendNumLet => match cat {
- wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
- wd::WC_ALetter => Letter, // rule WB13a
- wd::WC_Hebrew_Letter => HLetter, // rule WB13a
- wd::WC_Numeric => Numeric, // rule WB13a
- wd::WC_Katakana => Katakana, // rule WB13a
- _ => {
- take_curr = false;
- break;
- }
- },
- Regional(mut regional_state) => match cat {
- // rule WB13c
- wd::WC_Regional_Indicator => {
- if regional_state == RegionalState::Unknown {
- let count = self.string[..previdx]
- .chars().rev()
- .map(|c| wd::word_category(c))
- .filter(|&c| ! (c == wd::WC_ZWJ || c == wd::WC_Extend || c == wd::WC_Format))
- .take_while(|&c| c == wd::WC_Regional_Indicator)
- .count();
- regional_state = if count % 2 == 0 {
- RegionalState::Full
- } else {
- RegionalState::Half
- };
- }
- if regional_state == RegionalState::Full {
- take_curr = false;
- break;
- } else {
- Regional(RegionalState::Full)
- }
- }
- _ => {
- take_curr = false;
- break;
- }
- },
- Emoji => {
- if is_emoji(ch) { // rule WB3c
- Zwj
- } else {
- take_curr = false;
- break;
- }
- },
- FormatExtend(t) => match t {
- RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB12
- RequireLetter if cat == wd::WC_ALetter => Letter, // rule WB6
- RequireLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB6
- AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7a
- RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
- _ => break // backtrack will happen
- }
- }
- }
-
- if let FormatExtend(t) = state {
- // if we required something but didn't find it, backtrack
- if t == RequireLetter || t == RequireHLetter ||
- t == RequireNumeric || t == AcceptNone || t == AcceptQLetter {
- previdx = saveidx;
- take_cat = false;
- take_curr = false;
- }
- }
-
- self.catb = if take_curr {
- None
- } else {
- idx = previdx;
- if take_cat {
- Some(cat)
- } else {
- None
- }
- };
-
- let retstr = &self.string[idx..];
- self.string = &self.string[..idx];
- Some(retstr)
- }
-}
-
-impl<'a> UWordBounds<'a> {
- #[inline]
- /// View the underlying data (the part yet to be iterated) as a slice of the original string.
- ///
- /// ```rust
- /// # use unicode_segmentation::UnicodeSegmentation;
- /// let mut iter = "Hello world".split_word_bounds();
- /// assert_eq!(iter.as_str(), "Hello world");
- /// iter.next();
- /// assert_eq!(iter.as_str(), " world");
- /// iter.next();
- /// assert_eq!(iter.as_str(), "world");
- /// ```
- pub fn as_str(&self) -> &'a str {
- self.string
- }
-
- #[inline]
- fn get_next_cat(&self, idx: usize) -> Option<WordCat> {
- use tables::word as wd;
- let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
- if nidx < self.string.len() {
- let nch = self.string[nidx..].chars().next().unwrap();
- Some(wd::word_category(nch))
- } else {
- None
- }
- }
-
- #[inline]
- fn get_prev_cat(&self, idx: usize) -> Option<WordCat> {
- use tables::word as wd;
- if idx > 0 {
- let nch = self.string[..idx].chars().next_back().unwrap();
- Some(wd::word_category(nch))
- } else {
- None
- }
- }
-}
-
-#[inline]
-pub fn new_word_bounds<'b>(s: &'b str) -> UWordBounds<'b> {
- UWordBounds { string: s, cat: None, catb: None }
-}
-
-#[inline]
-pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> {
- UWordBoundIndices { start_offset: s.as_ptr() as usize, iter: new_word_bounds(s) }
-}
-
-#[inline]
-pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
- use super::UnicodeSegmentation;
- use tables::util::is_alphanumeric;
-
- fn has_alphanumeric(s: &&str) -> bool { s.chars().any(|c| is_alphanumeric(c)) }
- let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
-
- UnicodeWords { inner: s.split_word_bounds().filter(has_alphanumeric) }
-}