diff options
Diffstat (limited to 'unicode-segmentation/src/sentence.rs')
-rw-r--r-- | unicode-segmentation/src/sentence.rs | 373 |
1 files changed, 0 insertions, 373 deletions
diff --git a/unicode-segmentation/src/sentence.rs b/unicode-segmentation/src/sentence.rs deleted file mode 100644 index c16c927..0000000 --- a/unicode-segmentation/src/sentence.rs +++ /dev/null @@ -1,373 +0,0 @@ -// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or -// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license -// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -use core::cmp; -use core::iter::Filter; - -// All of the logic for forward iteration over sentences -mod fwd { - use tables::sentence::SentenceCat; - use core::cmp; - - // Describe a parsed part of source string as described in this table: - // https://unicode.org/reports/tr29/#Default_Sentence_Boundaries - #[derive(Clone, Copy, PartialEq, Eq)] - enum StatePart { - Sot, - Eot, - Other, - CR, - LF, - Sep, - ATerm, - UpperLower, - ClosePlus, - SpPlus, - STerm - } - - #[derive(Clone, PartialEq, Eq)] - struct SentenceBreaksState(pub [StatePart; 4]); - - const INITIAL_STATE: SentenceBreaksState = SentenceBreaksState([ - StatePart::Sot, - StatePart::Sot, - StatePart::Sot, - StatePart::Sot - ]); - - #[derive(Clone)] - pub struct SentenceBreaks<'a> { - pub string: &'a str, - pos: usize, - state: SentenceBreaksState - } - - impl SentenceBreaksState { - // Attempt to advance the internal state by one part - // Whitespace and some punctutation will be collapsed - fn next(&self, cat: SentenceCat) -> SentenceBreaksState { - let &SentenceBreaksState(parts) = self; - let parts = match (parts[3], cat) { - (StatePart::ClosePlus, SentenceCat::SC_Close) => parts, - (StatePart::SpPlus, SentenceCat::SC_Sp) => parts, - _ => [ - parts[1], - parts[2], - parts[3], - match cat { - SentenceCat::SC_CR => StatePart::CR, - SentenceCat::SC_LF => StatePart::LF, - SentenceCat::SC_Sep => StatePart::Sep, - SentenceCat::SC_ATerm => StatePart::ATerm, - SentenceCat::SC_Upper | - SentenceCat::SC_Lower => StatePart::UpperLower, - SentenceCat::SC_Close => StatePart::ClosePlus, - SentenceCat::SC_Sp => StatePart::SpPlus, - SentenceCat::SC_STerm => StatePart::STerm, - _ => StatePart::Other - } - ] - }; - SentenceBreaksState(parts) - } - - fn end(&self) -> SentenceBreaksState { - let &SentenceBreaksState(parts) = self; - SentenceBreaksState([ - parts[1], - parts[2], - parts[3], - StatePart::Eot - ]) - } - - // Helper function to check if state head matches a single `StatePart` - fn match1(&self, part: StatePart) -> bool { - let &SentenceBreaksState(parts) = self; - part == parts[3] - } - - // Helper function to check if first two `StateParts` in state match - // the given two - fn match2(&self, part1: StatePart, part2: StatePart) -> bool { - let &SentenceBreaksState(parts) = self; - part1 == parts[2] && part2 == parts[3] - } - } - - // https://unicode.org/reports/tr29/#SB8 - // TODO cache this, it is currently quadratic - fn match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool { - let &SentenceBreaksState(parts) = state; - let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 }; - if parts[idx] == StatePart::ClosePlus { idx -= 1 } - - if parts[idx] == StatePart::ATerm { - use tables::sentence as se; - - for next_char in ahead.chars() { - //( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower - match se::sentence_category(next_char) { - se::SC_Lower => return true, - se::SC_OLetter | - se::SC_Upper | - se::SC_Sep | se::SC_CR | se::SC_LF | - se::SC_STerm | se::SC_ATerm => return false, - _ => continue - } - } - } - - false - } - - // https://unicode.org/reports/tr29/#SB8a - fn match_sb8a(state: &SentenceBreaksState) -> bool { - // SATerm Close* Sp* - let &SentenceBreaksState(parts) = state; - let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 }; - if parts[idx] == StatePart::ClosePlus { idx -= 1 } - parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm - } - - // https://unicode.org/reports/tr29/#SB9 - fn match_sb9(state: &SentenceBreaksState) -> bool { - // SATerm Close* - let &SentenceBreaksState(parts) = state; - let idx = if parts[3] == StatePart::ClosePlus { 2 } else { 3 }; - parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm - } - - // https://unicode.org/reports/tr29/#SB11 - fn match_sb11(state: &SentenceBreaksState) -> bool { - // SATerm Close* Sp* ParaSep? - let &SentenceBreaksState(parts) = state; - let mut idx = match parts[3] { - StatePart::Sep | - StatePart::CR | - StatePart::LF => 2, - _ => 3 - }; - - if parts[idx] == StatePart::SpPlus { idx -= 1 } - if parts[idx] == StatePart::ClosePlus { idx -= 1} - - parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm - } - - impl<'a> Iterator for SentenceBreaks<'a> { - // Returns the index of the character which follows a break - type Item = usize; - - #[inline] - fn size_hint(&self) -> (usize, Option<usize>) { - let slen = self.string.len(); - // A sentence could be one character - (cmp::min(slen, 2), Some(slen + 1)) - } - - #[inline] - fn next(&mut self) -> Option<usize> { - use tables::sentence as se; - - for next_char in self.string[self.pos..].chars() { - let position_before = self.pos; - let state_before = self.state.clone(); - - let next_cat = se::sentence_category(next_char); - - self.pos += next_char.len_utf8(); - self.state = self.state.next(next_cat); - - match next_cat { - // SB1 https://unicode.org/reports/tr29/#SB1 - _ if state_before.match1(StatePart::Sot) => - return Some(position_before), - - // SB2 is handled when inner iterator (chars) is finished - - // SB3 https://unicode.org/reports/tr29/#SB3 - SentenceCat::SC_LF if state_before.match1(StatePart::CR) => - continue, - - // SB4 https://unicode.org/reports/tr29/#SB4 - _ if state_before.match1(StatePart::Sep) - || state_before.match1(StatePart::CR) - || state_before.match1(StatePart::LF) - => return Some(position_before), - - // SB5 https://unicode.org/reports/tr29/#SB5 - SentenceCat::SC_Extend | - SentenceCat::SC_Format => self.state = state_before, - - // SB6 https://unicode.org/reports/tr29/#SB6 - SentenceCat::SC_Numeric if state_before.match1(StatePart::ATerm) => - continue, - - // SB7 https://unicode.org/reports/tr29/#SB7 - SentenceCat::SC_Upper if state_before.match2(StatePart::UpperLower, StatePart::ATerm) => - continue, - - // SB8 https://unicode.org/reports/tr29/#SB8 - _ if match_sb8(&state_before, &self.string[position_before..]) => - continue, - - // SB8a https://unicode.org/reports/tr29/#SB8a - SentenceCat::SC_SContinue | - SentenceCat::SC_STerm | - SentenceCat::SC_ATerm if match_sb8a(&state_before) => - continue, - - // SB9 https://unicode.org/reports/tr29/#SB9 - SentenceCat::SC_Close | - SentenceCat::SC_Sp | - SentenceCat::SC_Sep | - SentenceCat::SC_CR | - SentenceCat::SC_LF if match_sb9(&state_before) => - continue, - - // SB10 https://unicode.org/reports/tr29/#SB10 - SentenceCat::SC_Sp | - SentenceCat::SC_Sep | - SentenceCat::SC_CR | - SentenceCat::SC_LF if match_sb8a(&state_before) => - continue, - - // SB11 https://unicode.org/reports/tr29/#SB11 - _ if match_sb11(&state_before) => - return Some(position_before), - - // SB998 https://unicode.org/reports/tr29/#SB998 - _ => continue - } - } - - // SB2 https://unicode.org/reports/tr29/#SB2 - if self.state.match1(StatePart::Sot) { - None - } else if self.state.match1(StatePart::Eot) { - None - } else { - self.state = self.state.end(); - Some(self.pos) - } - } - } - - pub fn new_sentence_breaks<'a>(source: &'a str) -> SentenceBreaks<'a> { - SentenceBreaks { string: source, pos: 0, state: INITIAL_STATE } - } - -} - -/// An iterator over the substrings of a string which, after splitting the string on -/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries), -/// contain any characters with the -/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) -/// property, or with -/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). -#[derive(Clone)] -pub struct UnicodeSentences<'a> { - inner: Filter<USentenceBounds<'a>, fn(&&str) -> bool>, -} - -/// External iterator for a string's -/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries). -#[derive(Clone)] -pub struct USentenceBounds<'a> { - iter: fwd::SentenceBreaks<'a>, - sentence_start: Option<usize> -} - -/// External iterator for sentence boundaries and byte offsets. -#[derive(Clone)] -pub struct USentenceBoundIndices<'a> { - start_offset: usize, - iter: USentenceBounds<'a>, -} - -#[inline] -pub fn new_sentence_bounds<'a>(source: &'a str) -> USentenceBounds<'a> { - USentenceBounds { - iter: fwd::new_sentence_breaks(source), - sentence_start: None - } -} - -#[inline] -pub fn new_sentence_bound_indices<'a>(source: &'a str) -> USentenceBoundIndices<'a> { - USentenceBoundIndices { - start_offset: source.as_ptr() as usize, - iter: new_sentence_bounds(source) - } -} - -#[inline] -pub fn new_unicode_sentences<'b>(s: &'b str) -> UnicodeSentences<'b> { - use super::UnicodeSegmentation; - use tables::util::is_alphanumeric; - - fn has_alphanumeric(s: &&str) -> bool { s.chars().any(|c| is_alphanumeric(c)) } - let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer - - UnicodeSentences { inner: s.split_sentence_bounds().filter(has_alphanumeric) } -} - -impl<'a> Iterator for UnicodeSentences<'a> { - type Item = &'a str; - - #[inline] - fn next(&mut self) -> Option<&'a str> { self.inner.next() } -} - -impl<'a> Iterator for USentenceBounds<'a> { - type Item = &'a str; - - #[inline] - fn size_hint(&self) -> (usize, Option<usize>) { - let (lower, upper) = self.iter.size_hint(); - (cmp::max(0, lower - 1), upper.map(|u| cmp::max(0, u - 1))) - } - - #[inline] - fn next(&mut self) -> Option<&'a str> { - if self.sentence_start == None { - if let Some(start_pos) = self.iter.next() { - self.sentence_start = Some(start_pos) - } else { - return None - } - } - - if let Some(break_pos) = self.iter.next() { - let start_pos = self.sentence_start.unwrap(); - let sentence = &self.iter.string[start_pos..break_pos]; - self.sentence_start = Some(break_pos); - Some(sentence) - } else { - None - } - } -} - -impl<'a> Iterator for USentenceBoundIndices<'a> { - type Item = (usize, &'a str); - - #[inline] - fn next(&mut self) -> Option<(usize, &'a str)> { - self.iter.next().map(|s| (s.as_ptr() as usize - self.start_offset, s)) - } - - #[inline] - fn size_hint(&self) -> (usize, Option<usize>) { - self.iter.size_hint() - } -} |