// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT // file at the top-level directory of this distribution and at // http://rust-lang.org/COPYRIGHT. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. use core::cmp; use core::iter::Filter; // All of the logic for forward iteration over sentences mod fwd { use tables::sentence::SentenceCat; use core::cmp; // Describe a parsed part of source string as described in this table: // https://unicode.org/reports/tr29/#Default_Sentence_Boundaries #[derive(Clone, Copy, PartialEq, Eq)] enum StatePart { Sot, Eot, Other, CR, LF, Sep, ATerm, UpperLower, ClosePlus, SpPlus, STerm } #[derive(Clone, PartialEq, Eq)] struct SentenceBreaksState(pub [StatePart; 4]); const INITIAL_STATE: SentenceBreaksState = SentenceBreaksState([ StatePart::Sot, StatePart::Sot, StatePart::Sot, StatePart::Sot ]); #[derive(Clone)] pub struct SentenceBreaks<'a> { pub string: &'a str, pos: usize, state: SentenceBreaksState } impl SentenceBreaksState { // Attempt to advance the internal state by one part // Whitespace and some punctutation will be collapsed fn next(&self, cat: SentenceCat) -> SentenceBreaksState { let &SentenceBreaksState(parts) = self; let parts = match (parts[3], cat) { (StatePart::ClosePlus, SentenceCat::SC_Close) => parts, (StatePart::SpPlus, SentenceCat::SC_Sp) => parts, _ => [ parts[1], parts[2], parts[3], match cat { SentenceCat::SC_CR => StatePart::CR, SentenceCat::SC_LF => StatePart::LF, SentenceCat::SC_Sep => StatePart::Sep, SentenceCat::SC_ATerm => StatePart::ATerm, SentenceCat::SC_Upper | SentenceCat::SC_Lower => StatePart::UpperLower, SentenceCat::SC_Close => StatePart::ClosePlus, SentenceCat::SC_Sp => StatePart::SpPlus, SentenceCat::SC_STerm => StatePart::STerm, _ => StatePart::Other } ] }; SentenceBreaksState(parts) } fn end(&self) -> SentenceBreaksState { let &SentenceBreaksState(parts) = self; SentenceBreaksState([ parts[1], parts[2], parts[3], StatePart::Eot ]) } // Helper function to check if state head matches a single `StatePart` fn match1(&self, part: StatePart) -> bool { let &SentenceBreaksState(parts) = self; part == parts[3] } // Helper function to check if first two `StateParts` in state match // the given two fn match2(&self, part1: StatePart, part2: StatePart) -> bool { let &SentenceBreaksState(parts) = self; part1 == parts[2] && part2 == parts[3] } } // https://unicode.org/reports/tr29/#SB8 // TODO cache this, it is currently quadratic fn match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool { let &SentenceBreaksState(parts) = state; let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 }; if parts[idx] == StatePart::ClosePlus { idx -= 1 } if parts[idx] == StatePart::ATerm { use tables::sentence as se; for next_char in ahead.chars() { //( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower match se::sentence_category(next_char) { se::SC_Lower => return true, se::SC_OLetter | se::SC_Upper | se::SC_Sep | se::SC_CR | se::SC_LF | se::SC_STerm | se::SC_ATerm => return false, _ => continue } } } false } // https://unicode.org/reports/tr29/#SB8a fn match_sb8a(state: &SentenceBreaksState) -> bool { // SATerm Close* Sp* let &SentenceBreaksState(parts) = state; let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 }; if parts[idx] == StatePart::ClosePlus { idx -= 1 } parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm } // https://unicode.org/reports/tr29/#SB9 fn match_sb9(state: &SentenceBreaksState) -> bool { // SATerm Close* let &SentenceBreaksState(parts) = state; let idx = if parts[3] == StatePart::ClosePlus { 2 } else { 3 }; parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm } // https://unicode.org/reports/tr29/#SB11 fn match_sb11(state: &SentenceBreaksState) -> bool { // SATerm Close* Sp* ParaSep? let &SentenceBreaksState(parts) = state; let mut idx = match parts[3] { StatePart::Sep | StatePart::CR | StatePart::LF => 2, _ => 3 }; if parts[idx] == StatePart::SpPlus { idx -= 1 } if parts[idx] == StatePart::ClosePlus { idx -= 1} parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm } impl<'a> Iterator for SentenceBreaks<'a> { // Returns the index of the character which follows a break type Item = usize; #[inline] fn size_hint(&self) -> (usize, Option) { let slen = self.string.len(); // A sentence could be one character (cmp::min(slen, 2), Some(slen + 1)) } #[inline] fn next(&mut self) -> Option { use tables::sentence as se; for next_char in self.string[self.pos..].chars() { let position_before = self.pos; let state_before = self.state.clone(); let next_cat = se::sentence_category(next_char); self.pos += next_char.len_utf8(); self.state = self.state.next(next_cat); match next_cat { // SB1 https://unicode.org/reports/tr29/#SB1 _ if state_before.match1(StatePart::Sot) => return Some(position_before), // SB2 is handled when inner iterator (chars) is finished // SB3 https://unicode.org/reports/tr29/#SB3 SentenceCat::SC_LF if state_before.match1(StatePart::CR) => continue, // SB4 https://unicode.org/reports/tr29/#SB4 _ if state_before.match1(StatePart::Sep) || state_before.match1(StatePart::CR) || state_before.match1(StatePart::LF) => return Some(position_before), // SB5 https://unicode.org/reports/tr29/#SB5 SentenceCat::SC_Extend | SentenceCat::SC_Format => self.state = state_before, // SB6 https://unicode.org/reports/tr29/#SB6 SentenceCat::SC_Numeric if state_before.match1(StatePart::ATerm) => continue, // SB7 https://unicode.org/reports/tr29/#SB7 SentenceCat::SC_Upper if state_before.match2(StatePart::UpperLower, StatePart::ATerm) => continue, // SB8 https://unicode.org/reports/tr29/#SB8 _ if match_sb8(&state_before, &self.string[position_before..]) => continue, // SB8a https://unicode.org/reports/tr29/#SB8a SentenceCat::SC_SContinue | SentenceCat::SC_STerm | SentenceCat::SC_ATerm if match_sb8a(&state_before) => continue, // SB9 https://unicode.org/reports/tr29/#SB9 SentenceCat::SC_Close | SentenceCat::SC_Sp | SentenceCat::SC_Sep | SentenceCat::SC_CR | SentenceCat::SC_LF if match_sb9(&state_before) => continue, // SB10 https://unicode.org/reports/tr29/#SB10 SentenceCat::SC_Sp | SentenceCat::SC_Sep | SentenceCat::SC_CR | SentenceCat::SC_LF if match_sb8a(&state_before) => continue, // SB11 https://unicode.org/reports/tr29/#SB11 _ if match_sb11(&state_before) => return Some(position_before), // SB998 https://unicode.org/reports/tr29/#SB998 _ => continue } } // SB2 https://unicode.org/reports/tr29/#SB2 if self.state.match1(StatePart::Sot) { None } else if self.state.match1(StatePart::Eot) { None } else { self.state = self.state.end(); Some(self.pos) } } } pub fn new_sentence_breaks<'a>(source: &'a str) -> SentenceBreaks<'a> { SentenceBreaks { string: source, pos: 0, state: INITIAL_STATE } } } /// An iterator over the substrings of a string which, after splitting the string on /// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries), /// contain any characters with the /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) /// property, or with /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). #[derive(Clone)] pub struct UnicodeSentences<'a> { inner: Filter, fn(&&str) -> bool>, } /// External iterator for a string's /// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries). #[derive(Clone)] pub struct USentenceBounds<'a> { iter: fwd::SentenceBreaks<'a>, sentence_start: Option } /// External iterator for sentence boundaries and byte offsets. #[derive(Clone)] pub struct USentenceBoundIndices<'a> { start_offset: usize, iter: USentenceBounds<'a>, } #[inline] pub fn new_sentence_bounds<'a>(source: &'a str) -> USentenceBounds<'a> { USentenceBounds { iter: fwd::new_sentence_breaks(source), sentence_start: None } } #[inline] pub fn new_sentence_bound_indices<'a>(source: &'a str) -> USentenceBoundIndices<'a> { USentenceBoundIndices { start_offset: source.as_ptr() as usize, iter: new_sentence_bounds(source) } } #[inline] pub fn new_unicode_sentences<'b>(s: &'b str) -> UnicodeSentences<'b> { use super::UnicodeSegmentation; use tables::util::is_alphanumeric; fn has_alphanumeric(s: &&str) -> bool { s.chars().any(|c| is_alphanumeric(c)) } let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer UnicodeSentences { inner: s.split_sentence_bounds().filter(has_alphanumeric) } } impl<'a> Iterator for UnicodeSentences<'a> { type Item = &'a str; #[inline] fn next(&mut self) -> Option<&'a str> { self.inner.next() } } impl<'a> Iterator for USentenceBounds<'a> { type Item = &'a str; #[inline] fn size_hint(&self) -> (usize, Option) { let (lower, upper) = self.iter.size_hint(); (cmp::max(0, lower - 1), upper.map(|u| cmp::max(0, u - 1))) } #[inline] fn next(&mut self) -> Option<&'a str> { if self.sentence_start == None { if let Some(start_pos) = self.iter.next() { self.sentence_start = Some(start_pos) } else { return None } } if let Some(break_pos) = self.iter.next() { let start_pos = self.sentence_start.unwrap(); let sentence = &self.iter.string[start_pos..break_pos]; self.sentence_start = Some(break_pos); Some(sentence) } else { None } } } impl<'a> Iterator for USentenceBoundIndices<'a> { type Item = (usize, &'a str); #[inline] fn next(&mut self) -> Option<(usize, &'a str)> { self.iter.next().map(|s| (s.as_ptr() as usize - self.start_offset, s)) } #[inline] fn size_hint(&self) -> (usize, Option) { self.iter.size_hint() } }