diff options
Diffstat (limited to 'unicode-segmentation/scripts/unicode_gen_breaktests.py')
-rwxr-xr-x | unicode-segmentation/scripts/unicode_gen_breaktests.py | 212 |
1 files changed, 0 insertions, 212 deletions
diff --git a/unicode-segmentation/scripts/unicode_gen_breaktests.py b/unicode-segmentation/scripts/unicode_gen_breaktests.py deleted file mode 100755 index 113afa9..0000000 --- a/unicode-segmentation/scripts/unicode_gen_breaktests.py +++ /dev/null @@ -1,212 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -# -# Copyright 2015 The Rust Project Developers. See the COPYRIGHT -# file at the top-level directory of this distribution and at -# http://rust-lang.org/COPYRIGHT. -# -# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or -# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license -# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your -# option. This file may not be copied, modified, or distributed -# except according to those terms. - -# This script uses the following Unicode tables: -# - auxiliary/GraphemeBreakTest.txt -# - auxiliary/WordBreakTest.txt -# -# Since this should not require frequent updates, we just store this -# out-of-line and check the unicode.rs file into git. -from __future__ import print_function - -import unicode, re, os, fileinput - -def load_test_data(f, optsplit=[]): - testRe1 = re.compile(r"^÷\s+([^\s].*[^\s])\s+÷\s+#\s+÷\s+\[0.2\].*?([÷×].*)\s+÷\s+\[0.3\]\s*$") - - unicode.fetch(f) - data = [] - for line in fileinput.input(os.path.basename(f)): - # lines that include a test start with the ÷ character - if len(line) < 2 or not line.startswith('÷'): - continue - - m = testRe1.match(line) - if not m: - print("error: no match on line where test was expected: %s" % line) - continue - - # process the characters in this test case - chars = process_split_string(m.group(1)) - # skip test case if it contains invalid characters (viz., surrogates) - if not chars: - continue - - # now process test cases - (chars, info) = process_split_info(m.group(2), chars, optsplit) - - # make sure that we have break info for each break! - assert len(chars) - 1 == len(info) - - data.append((chars, info)) - - return data - -def process_split_info(s, c, o): - outcs = [] - outis = [] - workcs = c.pop(0) - - # are we on a × or a ÷? - isX = False - if s.startswith('×'): - isX = True - - # find each instance of '(÷|×) [x.y] ' - while s: - # find the currently considered rule number - sInd = s.index('[') + 1 - eInd = s.index(']') - - # if it's '× [a.b]' where 'a.b' is in o, then - # we consider it a split even though it's not - # marked as one - # if it's ÷ then it's always a split - if not isX or s[sInd:eInd] in o: - outis.append(s[sInd:eInd]) - outcs.append(workcs) - workcs = c.pop(0) - else: - workcs.extend(c.pop(0)) - - idx = 1 - while idx < len(s): - if s[idx:].startswith('×'): - isX = True - break - if s[idx:].startswith('÷'): - isX = False - break - idx += 1 - s = s[idx:] - - outcs.append(workcs) - return (outcs, outis) - -def process_split_string(s): - outls = [] - workls = [] - - inls = s.split() - - for i in inls: - if i == '÷' or i == '×': - outls.append(workls) - workls = [] - continue - - ival = int(i,16) - - if unicode.is_surrogate(ival): - return [] - - workls.append(ival) - - if workls: - outls.append(workls) - - return outls - -def showfun(x): - outstr = '("' - for c in x[0]: - outstr += "\\u{%x}" % c - outstr += '",&[' - xfirst = True - for xx in x[1:]: - if not xfirst: - outstr += '],&[' - xfirst = False - sfirst = True - for sp in xx: - if not sfirst: - outstr += ',' - sfirst = False - outstr += '"' - for c in sp: - outstr += "\\u{%x}" % c - outstr += '"' - outstr += '])' - return outstr - -def create_grapheme_data(f): - # rules 9.1 and 9.2 are for extended graphemes only - optsplits = ['9.1','9.2'] - d = load_test_data("auxiliary/GraphemeBreakTest.txt", optsplits) - - test_same = [] - test_diff = [] - - for (c, i) in d: - allchars = [cn for s in c for cn in s] - extgraphs = [] - extwork = [] - - extwork.extend(c[0]) - for n in range(0,len(i)): - if i[n] in optsplits: - extwork.extend(c[n+1]) - else: - extgraphs.append(extwork) - extwork = [] - extwork.extend(c[n+1]) - - # these are the extended grapheme clusters - extgraphs.append(extwork) - - if extgraphs == c: - test_same.append((allchars, c)) - else: - test_diff.append((allchars, extgraphs, c)) - - stype = "&'static [(&'static str, &'static [&'static str])]" - dtype = "&'static [(&'static str, &'static [&'static str], &'static [&'static str])]" - f.write(" // official Unicode test data\n") - f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/GraphemeBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER) - unicode.emit_table(f, "TEST_SAME", test_same, stype, True, showfun, True) - unicode.emit_table(f, "TEST_DIFF", test_diff, dtype, True, showfun, True) - -def create_words_data(f): - d = load_test_data("auxiliary/WordBreakTest.txt") - - test = [] - - for (c, i) in d: - allchars = [cn for s in c for cn in s] - test.append((allchars, c)) - - wtype = "&'static [(&'static str, &'static [&'static str])]" - f.write(" // official Unicode test data\n") - f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/WordBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER) - unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True) - -def create_sentence_data(f): - d = load_test_data("auxiliary/SentenceBreakTest.txt") - - test = [] - - for (c, i) in d: - allchars = [cn for s in c for cn in s] - test.append((allchars, c)) - - wtype = "&'static [(&'static str, &'static [&'static str])]" - f.write(" // official Unicode test data\n") - f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/SentenceBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER) - unicode.emit_table(f, "TEST_SENTENCE", test, wtype, True, showfun, True) - -if __name__ == "__main__": - with open("testdata.rs", "w") as rf: - rf.write(unicode.preamble) - create_grapheme_data(rf) - create_words_data(rf) - create_sentence_data(rf) |