1 files changed, 212 insertions, 0 deletions
diff --git a/unicode-segmentation/scripts/unicode_gen_breaktests.py b/unicode-segmentation/scripts/unicode_gen_breaktests.py
new file mode 100755
index 0000000..113afa9
--- /dev/null
+++ b/unicode-segmentation/scripts/unicode_gen_breaktests.py
@@ -0,0 +1,212 @@
+#!/usr/bin/env python
+# -*- coding: utf-8
+#
+# Copyright 2015 The Rust Project Developers. See the COPYRIGHT
+# file at the top-level directory of this distribution and at
+# http://rust-lang.org/COPYRIGHT.
+#
+# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+# option. This file may not be copied, modified, or distributed
+# except according to those terms.
+
+# This script uses the following Unicode tables:
+# - auxiliary/GraphemeBreakTest.txt
+# - auxiliary/WordBreakTest.txt
+#
+# Since this should not require frequent updates, we just store this
+# out-of-line and check the unicode.rs file into git.
+from __future__ import print_function
+
+import unicode, re, os, fileinput
+
+def load_test_data(f, optsplit=[]):
+    testRe1 = re.compile(r"^÷\s+([^\s].*[^\s])\s+÷\s+#\s+÷\s+\[0.2\].*?([÷×].*)\s+÷\s+\[0.3\]\s*$")
+
+    unicode.fetch(f)
+    data = []
+    for line in fileinput.input(os.path.basename(f)):
+        # lines that include a test start with the ÷ character
+        if len(line) < 2 or not line.startswith('÷'):
+            continue
+
+        m = testRe1.match(line)
+        if not m:
+            print("error: no match on line where test was expected: %s" % line)
+            continue
+
+        # process the characters in this test case
+        chars = process_split_string(m.group(1))
+        # skip test case if it contains invalid characters (viz., surrogates)
+        if not chars:
+            continue
+
+        # now process test cases
+        (chars, info) = process_split_info(m.group(2), chars, optsplit)
+
+        # make sure that we have break info for each break!
+        assert len(chars) - 1 == len(info)
+
+        data.append((chars, info))
+
+    return data
+
+def process_split_info(s, c, o):
+    outcs = []
+    outis = []
+    workcs = c.pop(0)
+
+    # are we on a × or a ÷?
+    isX = False
+    if s.startswith('×'):
+        isX = True
+
+    # find each instance of '(÷|×) [x.y] '
+    while s:
+        # find the currently considered rule number
+        sInd = s.index('[') + 1
+        eInd = s.index(']')
+
+        # if it's '× [a.b]' where 'a.b' is in o, then
+        # we consider it a split even though it's not
+        # marked as one
+        # if it's ÷ then it's always a split
+        if not isX or s[sInd:eInd] in o:
+            outis.append(s[sInd:eInd])
+            outcs.append(workcs)
+            workcs = c.pop(0)
+        else:
+            workcs.extend(c.pop(0))
+
+        idx = 1
+        while idx < len(s):
+            if s[idx:].startswith('×'):
+                isX = True
+                break
+            if s[idx:].startswith('÷'):
+                isX = False
+                break
+            idx += 1
+        s = s[idx:]
+
+    outcs.append(workcs)
+    return (outcs, outis)
+
+def process_split_string(s):
+    outls = []
+    workls = []
+
+    inls = s.split()
+
+    for i in inls:
+        if i == '÷' or i == '×':
+            outls.append(workls)
+            workls = []
+            continue
+
+        ival = int(i,16)
+
+        if unicode.is_surrogate(ival):
+            return []
+
+        workls.append(ival)
+
+    if workls:
+        outls.append(workls)
+
+    return outls
+
+def showfun(x):
+    outstr = '("'
+    for c in x[0]:
+        outstr += "\\u{%x}" % c
+    outstr += '",&['
+    xfirst = True
+    for xx in x[1:]:
+        if not xfirst:
+            outstr += '],&['
+        xfirst = False
+        sfirst = True
+        for sp in xx:
+            if not sfirst:
+                outstr += ','
+            sfirst = False
+            outstr += '"'
+            for c in sp:
+                outstr += "\\u{%x}" % c
+            outstr += '"'
+    outstr += '])'
+    return outstr
+
+def create_grapheme_data(f):
+    # rules 9.1 and 9.2 are for extended graphemes only
+    optsplits = ['9.1','9.2']
+    d = load_test_data("auxiliary/GraphemeBreakTest.txt", optsplits)
+
+    test_same = []
+    test_diff = []
+
+    for (c, i) in d:
+        allchars = [cn for s in c for cn in s]
+        extgraphs = []
+        extwork = []
+
+        extwork.extend(c[0])
+        for n in range(0,len(i)):
+            if i[n] in optsplits:
+                extwork.extend(c[n+1])
+            else:
+                extgraphs.append(extwork)
+                extwork = []
+                extwork.extend(c[n+1])
+
+        # these are the extended grapheme clusters
+        extgraphs.append(extwork)
+
+        if extgraphs == c:
+            test_same.append((allchars, c))
+        else:
+            test_diff.append((allchars, extgraphs, c))
+
+    stype = "&'static [(&'static str, &'static [&'static str])]"
+    dtype = "&'static [(&'static str, &'static [&'static str], &'static [&'static str])]"
+    f.write("    // official Unicode test data\n")
+    f.write("    // http://www.unicode.org/Public/%s/ucd/auxiliary/GraphemeBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
+    unicode.emit_table(f, "TEST_SAME", test_same, stype, True, showfun, True)
+    unicode.emit_table(f, "TEST_DIFF", test_diff, dtype, True, showfun, True)
+
+def create_words_data(f):
+    d = load_test_data("auxiliary/WordBreakTest.txt")
+
+    test = []
+
+    for (c, i) in d:
+        allchars = [cn for s in c for cn in s]
+        test.append((allchars, c))
+
+    wtype = "&'static [(&'static str, &'static [&'static str])]"
+    f.write("    // official Unicode test data\n")
+    f.write("    // http://www.unicode.org/Public/%s/ucd/auxiliary/WordBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
+    unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True)
+
+def create_sentence_data(f):
+    d = load_test_data("auxiliary/SentenceBreakTest.txt")
+
+    test = []
+
+    for (c, i) in d:
+        allchars = [cn for s in c for cn in s]
+        test.append((allchars, c))
+
+    wtype = "&'static [(&'static str, &'static [&'static str])]"
+    f.write("    // official Unicode test data\n")
+    f.write("    // http://www.unicode.org/Public/%s/ucd/auxiliary/SentenceBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
+    unicode.emit_table(f, "TEST_SENTENCE", test, wtype, True, showfun, True)
+
+if __name__ == "__main__":
+    with open("testdata.rs", "w") as rf:
+        rf.write(unicode.preamble)
+        create_grapheme_data(rf)
+        create_words_data(rf)
+        create_sentence_data(rf)