aboutsummaryrefslogtreecommitdiff
path: root/textwrap/src/splitting.rs
blob: f6b65afda1e186afc59a93b1fe3fcd7576545088 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
//! Word splitting functionality.
//!
//! To wrap text into lines, long words sometimes need to be split
//! across lines. The [`WordSplitter`] trait defines this
//! functionality. [`HyphenSplitter`] is the default implementation of
//! this treat: it will simply split words on existing hyphens.

#[cfg(feature = "hyphenation")]
use hyphenation::{Hyphenator, Standard};

/// An interface for splitting words.
///
/// When the [`wrap_iter`] method will try to fit text into a line, it
/// will eventually find a word that it too large the current text
/// width. It will then call the currently configured `WordSplitter` to
/// have it attempt to split the word into smaller parts. This trait
/// describes that functionality via the [`split`] method.
///
/// If the `textwrap` crate has been compiled with the `hyphenation`
/// feature enabled, you will find an implementation of `WordSplitter`
/// by the `hyphenation::language::Corpus` struct. Use this struct for
/// language-aware hyphenation. See the [`hyphenation` documentation]
/// for details.
///
/// [`wrap_iter`]: ../struct.Wrapper.html#method.wrap_iter
/// [`split`]: #tymethod.split
/// [`hyphenation` documentation]: https://docs.rs/hyphenation/
pub trait WordSplitter {
    /// Return all possible splits of word. Each split is a triple
    /// with a head, a hyphen, and a tail where `head + &hyphen +
    /// &tail == word`. The hyphen can be empty if there is already a
    /// hyphen in the head.
    ///
    /// The splits should go from smallest to longest and should
    /// include no split at all. So the word "technology" could be
    /// split into
    ///
    /// ```no_run
    /// vec![("tech", "-", "nology"),
    ///      ("technol", "-", "ogy"),
    ///      ("technolo", "-", "gy"),
    ///      ("technology", "", "")];
    /// ```
    fn split<'w>(&self, word: &'w str) -> Vec<(&'w str, &'w str, &'w str)>;
}

/// Use this as a [`Wrapper.splitter`] to avoid any kind of
/// hyphenation:
///
/// ```
/// use textwrap::{Wrapper, NoHyphenation};
///
/// let wrapper = Wrapper::with_splitter(8, NoHyphenation);
/// assert_eq!(wrapper.wrap("foo bar-baz"), vec!["foo", "bar-baz"]);
/// ```
///
/// [`Wrapper.splitter`]: ../struct.Wrapper.html#structfield.splitter
#[derive(Clone, Debug)]
pub struct NoHyphenation;

/// `NoHyphenation` implements `WordSplitter` by not splitting the
/// word at all.
impl WordSplitter for NoHyphenation {
    fn split<'w>(&self, word: &'w str) -> Vec<(&'w str, &'w str, &'w str)> {
        vec![(word, "", "")]
    }
}

/// Simple and default way to split words: splitting on existing
/// hyphens only.
///
/// You probably don't need to use this type since it's already used
/// by default by `Wrapper::new`.
#[derive(Clone, Debug)]
pub struct HyphenSplitter;

/// `HyphenSplitter` is the default `WordSplitter` used by
/// `Wrapper::new`. It will split words on any existing hyphens in the
/// word.
///
/// It will only use hyphens that are surrounded by alphanumeric
/// characters, which prevents a word like "--foo-bar" from being
/// split on the first or second hyphen.
impl WordSplitter for HyphenSplitter {
    fn split<'w>(&self, word: &'w str) -> Vec<(&'w str, &'w str, &'w str)> {
        let mut triples = Vec::new();
        // Split on hyphens, smallest split first. We only use hyphens
        // that are surrounded by alphanumeric characters. This is to
        // avoid splitting on repeated hyphens, such as those found in
        // --foo-bar.
        let mut char_indices = word.char_indices();
        // Early return if the word is empty.
        let mut prev = match char_indices.next() {
            None => return vec![(word, "", "")],
            Some((_, ch)) => ch,
        };

        // Find current word, or return early if the word only has a
        // single character.
        let (mut idx, mut cur) = match char_indices.next() {
            None => return vec![(word, "", "")],
            Some((idx, cur)) => (idx, cur),
        };

        for (i, next) in char_indices {
            if prev.is_alphanumeric() && cur == '-' && next.is_alphanumeric() {
                let (head, tail) = word.split_at(idx + 1);
                triples.push((head, "", tail));
            }
            prev = cur;
            idx = i;
            cur = next;
        }

        // Finally option is no split at all.
        triples.push((word, "", ""));

        triples
    }
}

/// A hyphenation dictionary can be used to do language-specific
/// hyphenation using patterns from the hyphenation crate.
#[cfg(feature = "hyphenation")]
impl WordSplitter for Standard {
    fn split<'w>(&self, word: &'w str) -> Vec<(&'w str, &'w str, &'w str)> {
        // Find splits based on language dictionary.
        let mut triples = Vec::new();
        for n in self.hyphenate(word).breaks {
            let (head, tail) = word.split_at(n);
            let hyphen = if head.ends_with('-') { "" } else { "-" };
            triples.push((head, hyphen, tail));
        }
        // Finally option is no split at all.
        triples.push((word, "", ""));

        triples
    }
}