aboutsummaryrefslogtreecommitdiff
path: root/unicode-segmentation/src/sentence.rs
diff options
context:
space:
mode:
Diffstat (limited to 'unicode-segmentation/src/sentence.rs')
-rw-r--r--unicode-segmentation/src/sentence.rs373
1 files changed, 0 insertions, 373 deletions
diff --git a/unicode-segmentation/src/sentence.rs b/unicode-segmentation/src/sentence.rs
deleted file mode 100644
index c16c927..0000000
--- a/unicode-segmentation/src/sentence.rs
+++ /dev/null
@@ -1,373 +0,0 @@
-// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
-// file at the top-level directory of this distribution and at
-// http://rust-lang.org/COPYRIGHT.
-//
-// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
-// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
-// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
-// option. This file may not be copied, modified, or distributed
-// except according to those terms.
-
-use core::cmp;
-use core::iter::Filter;
-
-// All of the logic for forward iteration over sentences
-mod fwd {
- use tables::sentence::SentenceCat;
- use core::cmp;
-
- // Describe a parsed part of source string as described in this table:
- // https://unicode.org/reports/tr29/#Default_Sentence_Boundaries
- #[derive(Clone, Copy, PartialEq, Eq)]
- enum StatePart {
- Sot,
- Eot,
- Other,
- CR,
- LF,
- Sep,
- ATerm,
- UpperLower,
- ClosePlus,
- SpPlus,
- STerm
- }
-
- #[derive(Clone, PartialEq, Eq)]
- struct SentenceBreaksState(pub [StatePart; 4]);
-
- const INITIAL_STATE: SentenceBreaksState = SentenceBreaksState([
- StatePart::Sot,
- StatePart::Sot,
- StatePart::Sot,
- StatePart::Sot
- ]);
-
- #[derive(Clone)]
- pub struct SentenceBreaks<'a> {
- pub string: &'a str,
- pos: usize,
- state: SentenceBreaksState
- }
-
- impl SentenceBreaksState {
- // Attempt to advance the internal state by one part
- // Whitespace and some punctutation will be collapsed
- fn next(&self, cat: SentenceCat) -> SentenceBreaksState {
- let &SentenceBreaksState(parts) = self;
- let parts = match (parts[3], cat) {
- (StatePart::ClosePlus, SentenceCat::SC_Close) => parts,
- (StatePart::SpPlus, SentenceCat::SC_Sp) => parts,
- _ => [
- parts[1],
- parts[2],
- parts[3],
- match cat {
- SentenceCat::SC_CR => StatePart::CR,
- SentenceCat::SC_LF => StatePart::LF,
- SentenceCat::SC_Sep => StatePart::Sep,
- SentenceCat::SC_ATerm => StatePart::ATerm,
- SentenceCat::SC_Upper |
- SentenceCat::SC_Lower => StatePart::UpperLower,
- SentenceCat::SC_Close => StatePart::ClosePlus,
- SentenceCat::SC_Sp => StatePart::SpPlus,
- SentenceCat::SC_STerm => StatePart::STerm,
- _ => StatePart::Other
- }
- ]
- };
- SentenceBreaksState(parts)
- }
-
- fn end(&self) -> SentenceBreaksState {
- let &SentenceBreaksState(parts) = self;
- SentenceBreaksState([
- parts[1],
- parts[2],
- parts[3],
- StatePart::Eot
- ])
- }
-
- // Helper function to check if state head matches a single `StatePart`
- fn match1(&self, part: StatePart) -> bool {
- let &SentenceBreaksState(parts) = self;
- part == parts[3]
- }
-
- // Helper function to check if first two `StateParts` in state match
- // the given two
- fn match2(&self, part1: StatePart, part2: StatePart) -> bool {
- let &SentenceBreaksState(parts) = self;
- part1 == parts[2] && part2 == parts[3]
- }
- }
-
- // https://unicode.org/reports/tr29/#SB8
- // TODO cache this, it is currently quadratic
- fn match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool {
- let &SentenceBreaksState(parts) = state;
- let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
- if parts[idx] == StatePart::ClosePlus { idx -= 1 }
-
- if parts[idx] == StatePart::ATerm {
- use tables::sentence as se;
-
- for next_char in ahead.chars() {
- //( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
- match se::sentence_category(next_char) {
- se::SC_Lower => return true,
- se::SC_OLetter |
- se::SC_Upper |
- se::SC_Sep | se::SC_CR | se::SC_LF |
- se::SC_STerm | se::SC_ATerm => return false,
- _ => continue
- }
- }
- }
-
- false
- }
-
- // https://unicode.org/reports/tr29/#SB8a
- fn match_sb8a(state: &SentenceBreaksState) -> bool {
- // SATerm Close* Sp*
- let &SentenceBreaksState(parts) = state;
- let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
- if parts[idx] == StatePart::ClosePlus { idx -= 1 }
- parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
- }
-
- // https://unicode.org/reports/tr29/#SB9
- fn match_sb9(state: &SentenceBreaksState) -> bool {
- // SATerm Close*
- let &SentenceBreaksState(parts) = state;
- let idx = if parts[3] == StatePart::ClosePlus { 2 } else { 3 };
- parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
- }
-
- // https://unicode.org/reports/tr29/#SB11
- fn match_sb11(state: &SentenceBreaksState) -> bool {
- // SATerm Close* Sp* ParaSep?
- let &SentenceBreaksState(parts) = state;
- let mut idx = match parts[3] {
- StatePart::Sep |
- StatePart::CR |
- StatePart::LF => 2,
- _ => 3
- };
-
- if parts[idx] == StatePart::SpPlus { idx -= 1 }
- if parts[idx] == StatePart::ClosePlus { idx -= 1}
-
- parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
- }
-
- impl<'a> Iterator for SentenceBreaks<'a> {
- // Returns the index of the character which follows a break
- type Item = usize;
-
- #[inline]
- fn size_hint(&self) -> (usize, Option<usize>) {
- let slen = self.string.len();
- // A sentence could be one character
- (cmp::min(slen, 2), Some(slen + 1))
- }
-
- #[inline]
- fn next(&mut self) -> Option<usize> {
- use tables::sentence as se;
-
- for next_char in self.string[self.pos..].chars() {
- let position_before = self.pos;
- let state_before = self.state.clone();
-
- let next_cat = se::sentence_category(next_char);
-
- self.pos += next_char.len_utf8();
- self.state = self.state.next(next_cat);
-
- match next_cat {
- // SB1 https://unicode.org/reports/tr29/#SB1
- _ if state_before.match1(StatePart::Sot) =>
- return Some(position_before),
-
- // SB2 is handled when inner iterator (chars) is finished
-
- // SB3 https://unicode.org/reports/tr29/#SB3
- SentenceCat::SC_LF if state_before.match1(StatePart::CR) =>
- continue,
-
- // SB4 https://unicode.org/reports/tr29/#SB4
- _ if state_before.match1(StatePart::Sep)
- || state_before.match1(StatePart::CR)
- || state_before.match1(StatePart::LF)
- => return Some(position_before),
-
- // SB5 https://unicode.org/reports/tr29/#SB5
- SentenceCat::SC_Extend |
- SentenceCat::SC_Format => self.state = state_before,
-
- // SB6 https://unicode.org/reports/tr29/#SB6
- SentenceCat::SC_Numeric if state_before.match1(StatePart::ATerm) =>
- continue,
-
- // SB7 https://unicode.org/reports/tr29/#SB7
- SentenceCat::SC_Upper if state_before.match2(StatePart::UpperLower, StatePart::ATerm) =>
- continue,
-
- // SB8 https://unicode.org/reports/tr29/#SB8
- _ if match_sb8(&state_before, &self.string[position_before..]) =>
- continue,
-
- // SB8a https://unicode.org/reports/tr29/#SB8a
- SentenceCat::SC_SContinue |
- SentenceCat::SC_STerm |
- SentenceCat::SC_ATerm if match_sb8a(&state_before) =>
- continue,
-
- // SB9 https://unicode.org/reports/tr29/#SB9
- SentenceCat::SC_Close |
- SentenceCat::SC_Sp |
- SentenceCat::SC_Sep |
- SentenceCat::SC_CR |
- SentenceCat::SC_LF if match_sb9(&state_before) =>
- continue,
-
- // SB10 https://unicode.org/reports/tr29/#SB10
- SentenceCat::SC_Sp |
- SentenceCat::SC_Sep |
- SentenceCat::SC_CR |
- SentenceCat::SC_LF if match_sb8a(&state_before) =>
- continue,
-
- // SB11 https://unicode.org/reports/tr29/#SB11
- _ if match_sb11(&state_before) =>
- return Some(position_before),
-
- // SB998 https://unicode.org/reports/tr29/#SB998
- _ => continue
- }
- }
-
- // SB2 https://unicode.org/reports/tr29/#SB2
- if self.state.match1(StatePart::Sot) {
- None
- } else if self.state.match1(StatePart::Eot) {
- None
- } else {
- self.state = self.state.end();
- Some(self.pos)
- }
- }
- }
-
- pub fn new_sentence_breaks<'a>(source: &'a str) -> SentenceBreaks<'a> {
- SentenceBreaks { string: source, pos: 0, state: INITIAL_STATE }
- }
-
-}
-
-/// An iterator over the substrings of a string which, after splitting the string on
-/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries),
-/// contain any characters with the
-/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
-/// property, or with
-/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
-#[derive(Clone)]
-pub struct UnicodeSentences<'a> {
- inner: Filter<USentenceBounds<'a>, fn(&&str) -> bool>,
-}
-
-/// External iterator for a string's
-/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
-#[derive(Clone)]
-pub struct USentenceBounds<'a> {
- iter: fwd::SentenceBreaks<'a>,
- sentence_start: Option<usize>
-}
-
-/// External iterator for sentence boundaries and byte offsets.
-#[derive(Clone)]
-pub struct USentenceBoundIndices<'a> {
- start_offset: usize,
- iter: USentenceBounds<'a>,
-}
-
-#[inline]
-pub fn new_sentence_bounds<'a>(source: &'a str) -> USentenceBounds<'a> {
- USentenceBounds {
- iter: fwd::new_sentence_breaks(source),
- sentence_start: None
- }
-}
-
-#[inline]
-pub fn new_sentence_bound_indices<'a>(source: &'a str) -> USentenceBoundIndices<'a> {
- USentenceBoundIndices {
- start_offset: source.as_ptr() as usize,
- iter: new_sentence_bounds(source)
- }
-}
-
-#[inline]
-pub fn new_unicode_sentences<'b>(s: &'b str) -> UnicodeSentences<'b> {
- use super::UnicodeSegmentation;
- use tables::util::is_alphanumeric;
-
- fn has_alphanumeric(s: &&str) -> bool { s.chars().any(|c| is_alphanumeric(c)) }
- let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
-
- UnicodeSentences { inner: s.split_sentence_bounds().filter(has_alphanumeric) }
-}
-
-impl<'a> Iterator for UnicodeSentences<'a> {
- type Item = &'a str;
-
- #[inline]
- fn next(&mut self) -> Option<&'a str> { self.inner.next() }
-}
-
-impl<'a> Iterator for USentenceBounds<'a> {
- type Item = &'a str;
-
- #[inline]
- fn size_hint(&self) -> (usize, Option<usize>) {
- let (lower, upper) = self.iter.size_hint();
- (cmp::max(0, lower - 1), upper.map(|u| cmp::max(0, u - 1)))
- }
-
- #[inline]
- fn next(&mut self) -> Option<&'a str> {
- if self.sentence_start == None {
- if let Some(start_pos) = self.iter.next() {
- self.sentence_start = Some(start_pos)
- } else {
- return None
- }
- }
-
- if let Some(break_pos) = self.iter.next() {
- let start_pos = self.sentence_start.unwrap();
- let sentence = &self.iter.string[start_pos..break_pos];
- self.sentence_start = Some(break_pos);
- Some(sentence)
- } else {
- None
- }
- }
-}
-
-impl<'a> Iterator for USentenceBoundIndices<'a> {
- type Item = (usize, &'a str);
-
- #[inline]
- fn next(&mut self) -> Option<(usize, &'a str)> {
- self.iter.next().map(|s| (s.as_ptr() as usize - self.start_offset, s))
- }
-
- #[inline]
- fn size_hint(&self) -> (usize, Option<usize>) {
- self.iter.size_hint()
- }
-}