From 5e20a29b4fdc8a2d442d1093681b396dcb4b816b Mon Sep 17 00:00:00 2001 From: Robin Krahl Date: Tue, 7 Jan 2020 11:18:04 +0000 Subject: Add structopt dependency in version 0.3.7 This patch series replaces argparse with structopt in the argument handling code. As a first step, we need structopt as a dependency. Import subrepo structopt/:structopt at efbdda4753592e27bc430fb01f7b9650b2f3174d Import subrepo bitflags/:bitflags at 30668016aca6bd3b02c766e8347e0b4080d4c296 Import subrepo clap/:clap at 784524f7eb193e35f81082cc69454c8c21b948f7 Import subrepo heck/:heck at 093d56fbf001e1506e56dbfa38631d99b1066df1 Import subrepo proc-macro-error/:proc-macro-error at 6c4cfe79a622c5de8ae68557993542be46eacae2 Import subrepo proc-macro2/:proc-macro2 at d5d48eddca4566e5438e8a2cbed4a74e049544de Import subrepo quote/:quote at 727436c6c137b20f0f34dde5d8fda2679b9747ad Import subrepo rustversion/:rustversion at 0c5663313516263059ce9059ef81fc7a1cf655ca Import subrepo syn-mid/:syn-mid at 5d3d85414a9e6674e1857ec22a87b96e04a6851a Import subrepo syn/:syn at e87c27e87f6f4ef8919d0372bdb056d53ef0d8f3 Import subrepo textwrap/:textwrap at abcd618beae3f74841032aa5b53c1086b0a57ca2 Import subrepo unicode-segmentation/:unicode-segmentation at 637c9874c4fe0c205ff27787faf150a40295c6c3 Import subrepo unicode-width/:unicode-width at 3033826f8bf05e82724140a981d5941e48fce393 Import subrepo unicode-xid/:unicode-xid at 4baae9fffb156ba229665b972a9cd5991787ceb7 --- unicode-segmentation/src/word.rs | 664 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 664 insertions(+) create mode 100644 unicode-segmentation/src/word.rs (limited to 'unicode-segmentation/src/word.rs') diff --git a/unicode-segmentation/src/word.rs b/unicode-segmentation/src/word.rs new file mode 100644 index 0000000..6e9c049 --- /dev/null +++ b/unicode-segmentation/src/word.rs @@ -0,0 +1,664 @@ +// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use core::cmp; +use core::iter::Filter; + +use tables::word::WordCat; + +/// An iterator over the substrings of a string which, after splitting the string on +/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), +/// contain any characters with the +/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) +/// property, or with +/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). +pub struct UnicodeWords<'a> { + inner: Filter, fn(&&str) -> bool>, +} + +impl<'a> Iterator for UnicodeWords<'a> { + type Item = &'a str; + + #[inline] + fn next(&mut self) -> Option<&'a str> { self.inner.next() } +} +impl<'a> DoubleEndedIterator for UnicodeWords<'a> { + #[inline] + fn next_back(&mut self) -> Option<&'a str> { self.inner.next_back() } +} + +/// External iterator for a string's +/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries). +#[derive(Clone)] +pub struct UWordBounds<'a> { + string: &'a str, + cat: Option, + catb: Option, +} + +/// External iterator for word boundaries and byte offsets. +#[derive(Clone)] +pub struct UWordBoundIndices<'a> { + start_offset: usize, + iter: UWordBounds<'a>, +} + +impl<'a> UWordBoundIndices<'a> { + #[inline] + /// View the underlying data (the part yet to be iterated) as a slice of the original string. + /// + /// ```rust + /// # use unicode_segmentation::UnicodeSegmentation; + /// let mut iter = "Hello world".split_word_bound_indices(); + /// assert_eq!(iter.as_str(), "Hello world"); + /// iter.next(); + /// assert_eq!(iter.as_str(), " world"); + /// iter.next(); + /// assert_eq!(iter.as_str(), "world"); + /// ``` + pub fn as_str(&self) -> &'a str { + self.iter.as_str() + } +} + +impl<'a> Iterator for UWordBoundIndices<'a> { + type Item = (usize, &'a str); + + #[inline] + fn next(&mut self) -> Option<(usize, &'a str)> { + self.iter.next().map(|s| (s.as_ptr() as usize - self.start_offset, s)) + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + self.iter.size_hint() + } +} + +impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> { + #[inline] + fn next_back(&mut self) -> Option<(usize, &'a str)> { + self.iter.next_back().map(|s| (s.as_ptr() as usize - self.start_offset, s)) + } +} + +// state machine for word boundary rules +#[derive(Clone,Copy,PartialEq,Eq,Debug)] +enum UWordBoundsState { + Start, + Letter, + HLetter, + Numeric, + Katakana, + ExtendNumLet, + Regional(RegionalState), + FormatExtend(FormatExtendType), + Zwj, + Emoji, + WSegSpace, +} + +// subtypes for FormatExtend state in UWordBoundsState +#[derive(Clone,Copy,PartialEq,Eq,Debug)] +enum FormatExtendType { + AcceptAny, + AcceptNone, + RequireLetter, + RequireHLetter, + AcceptQLetter, + RequireNumeric, +} + +#[derive(Clone,Copy,PartialEq,Eq,Debug)] +enum RegionalState { + Half, + Full, + Unknown, +} + +fn is_emoji(ch: char) -> bool { + use tables::emoji; + emoji::emoji_category(ch) == emoji::EmojiCat::EC_Extended_Pictographic +} + +impl<'a> Iterator for UWordBounds<'a> { + type Item = &'a str; + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let slen = self.string.len(); + (cmp::min(slen, 1), Some(slen)) + } + + #[inline] + fn next(&mut self) -> Option<&'a str> { + use self::UWordBoundsState::*; + use self::FormatExtendType::*; + use tables::word as wd; + if self.string.len() == 0 { + return None; + } + + let mut take_curr = true; + let mut take_cat = true; + let mut idx = 0; + let mut saveidx = 0; + let mut state = Start; + let mut cat = wd::WC_Any; + let mut savecat = wd::WC_Any; + + // Whether or not the previous category was ZWJ + // ZWJs get collapsed, so this handles precedence of WB3c over WB4 + let mut prev_zwj; + // If extend/format/zwj were skipped. Handles precedence of WB3d over WB4 + let mut skipped_format_extend = false; + for (curr, ch) in self.string.char_indices() { + idx = curr; + prev_zwj = cat == wd::WC_ZWJ; + // if there's a category cached, grab it + cat = match self.cat { + None => wd::word_category(ch), + _ => self.cat.take().unwrap() + }; + take_cat = true; + + // handle rule WB4 + // just skip all format, extend, and zwj chars + // note that Start is a special case: if there's a bunch of Format | Extend + // characters at the beginning of a block of text, dump them out as one unit. + // + // (This is not obvious from the wording of UAX#29, but if you look at the + // test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt + // then the "correct" interpretation of WB4 becomes apparent.) + if state != Start { + match cat { + wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => { + skipped_format_extend = true; + continue + } + _ => {} + } + } + + // rule WB3c + // WB4 makes all ZWJs collapse into the previous state + // but you can still be in a Zwj state if you started with Zwj + // + // This means that an EP + Zwj will collapse into EP, which is wrong, + // since EP+EP is not a boundary but EP+ZWJ+EP is + // + // Thus, we separately keep track of whether or not the last character + // was a ZWJ. This is an additional bit of state tracked outside of the + // state enum; the state enum represents the last non-zwj state encountered. + // When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state, + // however we are in the previous state for the purposes of all other rules. + if prev_zwj { + if is_emoji(ch) { + state = Emoji; + continue; + } + } + // Don't use `continue` in this match without updating `cat` + state = match state { + Start if cat == wd::WC_CR => { + idx += match self.get_next_cat(idx) { + Some(ncat) if ncat == wd::WC_LF => 1, // rule WB3 + _ => 0 + }; + break; // rule WB3a + }, + Start => match cat { + wd::WC_ALetter => Letter, // rule WB5, WB6, WB9, WB13a + wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB6, WB7a, WB7b, WB9, WB13a + wd::WC_Numeric => Numeric, // rule WB8, WB10, WB12, WB13a + wd::WC_Katakana => Katakana, // rule WB13, WB13a + wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b + wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c + wd::WC_LF | wd::WC_Newline => break, // rule WB3a + wd::WC_ZWJ => Zwj, // rule WB3c + wd::WC_WSegSpace => WSegSpace, // rule WB3d + _ => { + if let Some(ncat) = self.get_next_cat(idx) { // rule WB4 + if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ { + state = FormatExtend(AcceptNone); + self.cat = Some(ncat); + continue; + } + } + break; // rule WB999 + } + }, + WSegSpace => match cat { + wd::WC_WSegSpace if !skipped_format_extend => WSegSpace, + _ => { + take_curr = false; + break; + } + }, + Zwj => { + // We already handle WB3c above. + take_curr = false; + break; + } + Letter | HLetter => match cat { + wd::WC_ALetter => Letter, // rule WB5 + wd::WC_Hebrew_Letter => HLetter, // rule WB5 + wd::WC_Numeric => Numeric, // rule WB9 + wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a + wd::WC_Double_Quote if state == HLetter => { + savecat = cat; + saveidx = idx; + FormatExtend(RequireHLetter) // rule WB7b + }, + wd::WC_Single_Quote if state == HLetter => { + FormatExtend(AcceptQLetter) // rule WB7a + }, + wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => { + savecat = cat; + saveidx = idx; + FormatExtend(RequireLetter) // rule WB6 + }, + _ => { + take_curr = false; + break; + } + }, + Numeric => match cat { + wd::WC_Numeric => Numeric, // rule WB8 + wd::WC_ALetter => Letter, // rule WB10 + wd::WC_Hebrew_Letter => HLetter, // rule WB10 + wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a + wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => { + savecat = cat; + saveidx = idx; + FormatExtend(RequireNumeric) // rule WB12 + }, + _ => { + take_curr = false; + break; + } + }, + Katakana => match cat { + wd::WC_Katakana => Katakana, // rule WB13 + wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a + _ => { + take_curr = false; + break; + } + }, + ExtendNumLet => match cat { + wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a + wd::WC_ALetter => Letter, // rule WB13b + wd::WC_Hebrew_Letter => HLetter, // rule WB13b + wd::WC_Numeric => Numeric, // rule WB13b + wd::WC_Katakana => Katakana, // rule WB13b + _ => { + take_curr = false; + break; + } + }, + Regional(RegionalState::Full) => { + // if it reaches here we've gone too far, + // a full flag can only compose with ZWJ/Extend/Format + // proceeding it. + take_curr = false; + break; + } + Regional(RegionalState::Half) => match cat { + wd::WC_Regional_Indicator => Regional(RegionalState::Full), // rule WB13c + _ => { + take_curr = false; + break; + } + }, + Regional(_) => unreachable!("RegionalState::Unknown should not occur on forward iteration"), + Emoji => { + // We already handle WB3c above. If you've reached this point, the emoji sequence is over. + take_curr = false; + break; + }, + FormatExtend(t) => match t { // handle FormatExtends depending on what type + RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB11 + RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter, // rule WB7 + RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a + RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b + AcceptNone | AcceptQLetter => { + take_curr = false; // emit all the Format|Extend characters + take_cat = false; + break; + }, + _ => break // rewind (in if statement below) + } + } + } + + if let FormatExtend(t) = state { + // we were looking for something and didn't find it; we have to back up + if t == RequireLetter || t == RequireHLetter || t == RequireNumeric { + idx = saveidx; + cat = savecat; + take_curr = false; + } + } + + self.cat = if take_curr { + idx = idx + self.string[idx..].chars().next().unwrap().len_utf8(); + None + } else if take_cat { + Some(cat) + } else { + None + }; + + let retstr = &self.string[..idx]; + self.string = &self.string[idx..]; + Some(retstr) + } +} + +impl<'a> DoubleEndedIterator for UWordBounds<'a> { + #[inline] + fn next_back(&mut self) -> Option<&'a str> { + use self::UWordBoundsState::*; + use self::FormatExtendType::*; + use tables::word as wd; + if self.string.len() == 0 { + return None; + } + + let mut take_curr = true; + let mut take_cat = true; + let mut idx = self.string.len(); + idx -= self.string.chars().next_back().unwrap().len_utf8(); + let mut previdx = idx; + let mut saveidx = idx; + let mut state = Start; + let mut savestate = Start; + let mut cat = wd::WC_Any; + + let mut skipped_format_extend = false; + + for (curr, ch) in self.string.char_indices().rev() { + previdx = idx; + idx = curr; + + // if there's a category cached, grab it + cat = match self.catb { + None => wd::word_category(ch), + _ => self.catb.take().unwrap() + }; + take_cat = true; + + // backward iterator over word boundaries. Mostly the same as the forward + // iterator, with two weirdnesses: + // (1) If we encounter a single quote in the Start state, we have to check for a + // Hebrew Letter immediately before it. + // (2) Format and Extend char handling takes some gymnastics. + + if cat == wd::WC_Extend + || cat == wd::WC_Format + || (cat == wd::WC_ZWJ && state != Zwj) { // WB3c has more priority so we should not + // fold in that case + if match state { + FormatExtend(_) | Start => false, + _ => true + } { + saveidx = previdx; + savestate = state; + state = FormatExtend(AcceptNone); + } + + if state != Start { + continue; + } + } else if state == FormatExtend(AcceptNone) { + // finished a scan of some Format|Extend chars, restore previous state + state = savestate; + previdx = saveidx; + take_cat = false; + skipped_format_extend = true; + } + + // Don't use `continue` in this match without updating `catb` + state = match state { + Start | FormatExtend(AcceptAny) => match cat { + _ if is_emoji(ch) => Zwj, + wd::WC_ALetter => Letter, // rule WB5, WB7, WB10, WB13b + wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB7, WB7c, WB10, WB13b + wd::WC_Numeric => Numeric, // rule WB8, WB9, WB11, WB13b + wd::WC_Katakana => Katakana, // rule WB13, WB13b + wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a + wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c + // rule WB4: + wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny), + wd::WC_Single_Quote => { + saveidx = idx; + FormatExtend(AcceptQLetter) // rule WB7a + }, + wd::WC_WSegSpace => WSegSpace, + wd::WC_CR | wd::WC_LF | wd::WC_Newline => { + if state == Start { + if cat == wd::WC_LF { + idx -= match self.get_prev_cat(idx) { + Some(pcat) if pcat == wd::WC_CR => 1, // rule WB3 + _ => 0 + }; + } + } else { + take_curr = false; + } + break; // rule WB3a + }, + _ => break // rule WB999 + }, + Zwj => match cat { // rule WB3c + wd::WC_ZWJ => { + FormatExtend(AcceptAny) + } + _ => { + take_curr = false; + break; + } + }, + WSegSpace => match cat { // rule WB3d + wd::WC_WSegSpace if !skipped_format_extend => { + WSegSpace + } + _ => { + take_curr = false; + break; + } + }, + Letter | HLetter => match cat { + wd::WC_ALetter => Letter, // rule WB5 + wd::WC_Hebrew_Letter => HLetter, // rule WB5 + wd::WC_Numeric => Numeric, // rule WB10 + wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b + wd::WC_Double_Quote if state == HLetter => { + saveidx = previdx; + FormatExtend(RequireHLetter) // rule WB7c + }, + wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => { + saveidx = previdx; + FormatExtend(RequireLetter) // rule WB7 + }, + _ => { + take_curr = false; + break; + } + }, + Numeric => match cat { + wd::WC_Numeric => Numeric, // rule WB8 + wd::WC_ALetter => Letter, // rule WB9 + wd::WC_Hebrew_Letter => HLetter, // rule WB9 + wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b + wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => { + saveidx = previdx; + FormatExtend(RequireNumeric) // rule WB11 + }, + _ => { + take_curr = false; + break; + } + }, + Katakana => match cat { + wd::WC_Katakana => Katakana, // rule WB13 + wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b + _ => { + take_curr = false; + break; + } + }, + ExtendNumLet => match cat { + wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a + wd::WC_ALetter => Letter, // rule WB13a + wd::WC_Hebrew_Letter => HLetter, // rule WB13a + wd::WC_Numeric => Numeric, // rule WB13a + wd::WC_Katakana => Katakana, // rule WB13a + _ => { + take_curr = false; + break; + } + }, + Regional(mut regional_state) => match cat { + // rule WB13c + wd::WC_Regional_Indicator => { + if regional_state == RegionalState::Unknown { + let count = self.string[..previdx] + .chars().rev() + .map(|c| wd::word_category(c)) + .filter(|&c| ! (c == wd::WC_ZWJ || c == wd::WC_Extend || c == wd::WC_Format)) + .take_while(|&c| c == wd::WC_Regional_Indicator) + .count(); + regional_state = if count % 2 == 0 { + RegionalState::Full + } else { + RegionalState::Half + }; + } + if regional_state == RegionalState::Full { + take_curr = false; + break; + } else { + Regional(RegionalState::Full) + } + } + _ => { + take_curr = false; + break; + } + }, + Emoji => { + if is_emoji(ch) { // rule WB3c + Zwj + } else { + take_curr = false; + break; + } + }, + FormatExtend(t) => match t { + RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB12 + RequireLetter if cat == wd::WC_ALetter => Letter, // rule WB6 + RequireLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB6 + AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7a + RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b + _ => break // backtrack will happens + } + } + } + + if let FormatExtend(t) = state { + // if we required something but didn't find it, backtrack + if t == RequireLetter || t == RequireHLetter || + t == RequireNumeric || t == AcceptNone || t == AcceptQLetter { + previdx = saveidx; + take_cat = false; + take_curr = false; + } + } + + self.catb = if take_curr { + None + } else { + idx = previdx; + if take_cat { + Some(cat) + } else { + None + } + }; + + let retstr = &self.string[idx..]; + self.string = &self.string[..idx]; + Some(retstr) + } +} + +impl<'a> UWordBounds<'a> { + #[inline] + /// View the underlying data (the part yet to be iterated) as a slice of the original string. + /// + /// ```rust + /// # use unicode_segmentation::UnicodeSegmentation; + /// let mut iter = "Hello world".split_word_bounds(); + /// assert_eq!(iter.as_str(), "Hello world"); + /// iter.next(); + /// assert_eq!(iter.as_str(), " world"); + /// iter.next(); + /// assert_eq!(iter.as_str(), "world"); + /// ``` + pub fn as_str(&self) -> &'a str { + self.string + } + + #[inline] + fn get_next_cat(&self, idx: usize) -> Option { + use tables::word as wd; + let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8(); + if nidx < self.string.len() { + let nch = self.string[nidx..].chars().next().unwrap(); + Some(wd::word_category(nch)) + } else { + None + } + } + + #[inline] + fn get_prev_cat(&self, idx: usize) -> Option { + use tables::word as wd; + if idx > 0 { + let nch = self.string[..idx].chars().next_back().unwrap(); + Some(wd::word_category(nch)) + } else { + None + } + } +} + +#[inline] +pub fn new_word_bounds<'b>(s: &'b str) -> UWordBounds<'b> { + UWordBounds { string: s, cat: None, catb: None } +} + +#[inline] +pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> { + UWordBoundIndices { start_offset: s.as_ptr() as usize, iter: new_word_bounds(s) } +} + +#[inline] +pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> { + use super::UnicodeSegmentation; + use tables::util::is_alphanumeric; + + fn has_alphanumeric(s: &&str) -> bool { s.chars().any(|c| is_alphanumeric(c)) } + let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer + + UnicodeWords { inner: s.split_word_bounds().filter(has_alphanumeric) } +} -- cgit v1.2.1