diff options
Diffstat (limited to 'unicode-width/scripts')
| -rwxr-xr-x | unicode-width/scripts/unicode.py | 321 | 
1 files changed, 321 insertions, 0 deletions
| diff --git a/unicode-width/scripts/unicode.py b/unicode-width/scripts/unicode.py new file mode 100755 index 0000000..5456ee3 --- /dev/null +++ b/unicode-width/scripts/unicode.py @@ -0,0 +1,321 @@ +#!/usr/bin/env python3 +# +# Copyright 2011-2015 The Rust Project Developers. See the COPYRIGHT +# file at the top-level directory of this distribution and at +# http://rust-lang.org/COPYRIGHT. +# +# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +# This script uses the following Unicode tables: +# - EastAsianWidth.txt +# - ReadMe.txt +# - UnicodeData.txt +# +# Since this should not require frequent updates, we just store this +# out-of-line and check the unicode.rs file into git. + +import fileinput, re, os, sys, operator + +preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly + +#![allow(missing_docs, non_upper_case_globals, non_snake_case)] +''' + +# Mapping taken from Table 12 from: +# http://www.unicode.org/reports/tr44/#General_Category_Values +expanded_categories = { +    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'], +    'Lm': ['L'], 'Lo': ['L'], +    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'], +    'Nd': ['N'], 'Nl': ['N'], 'No': ['No'], +    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'], +    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'], +    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'], +    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'], +    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'], +} + +# these are the surrogate codepoints, which are not valid rust characters +surrogate_codepoints = (0xd800, 0xdfff) + +def fetch(f): +    if not os.path.exists(os.path.basename(f)): +        os.system("curl -O http://www.unicode.org/Public/UNIDATA/%s" +                  % f) + +    if not os.path.exists(os.path.basename(f)): +        sys.stderr.write("cannot load %s" % f) +        exit(1) + +def is_surrogate(n): +    return surrogate_codepoints[0] <= n <= surrogate_codepoints[1] + +def load_unicode_data(f): +    fetch(f) +    gencats = {} + +    udict = {} +    range_start = -1 +    for line in fileinput.input(f): +        data = line.split(';') +        if len(data) != 15: +            continue +        cp = int(data[0], 16) +        if is_surrogate(cp): +            continue +        if range_start >= 0: +            for i in range(range_start, cp): +                udict[i] = data +            range_start = -1 +        if data[1].endswith(", First>"): +            range_start = cp +            continue +        udict[cp] = data + +    for code in udict: +        [code_org, name, gencat, combine, bidi, +         decomp, deci, digit, num, mirror, +         old, iso, upcase, lowcase, titlecase ] = udict[code] + +        # place letter in categories as appropriate +        for cat in [gencat, "Assigned"] + expanded_categories.get(gencat, []): +            if cat not in gencats: +                gencats[cat] = [] +            gencats[cat].append(code) + +    gencats = group_cats(gencats) + +    return gencats + +def group_cats(cats): +    cats_out = {} +    for cat in cats: +        cats_out[cat] = group_cat(cats[cat]) +    return cats_out + +def group_cat(cat): +    cat_out = [] +    letters = sorted(set(cat)) +    cur_start = letters.pop(0) +    cur_end = cur_start +    for letter in letters: +        assert letter > cur_end, \ +            "cur_end: %s, letter: %s" % (hex(cur_end), hex(letter)) +        if letter == cur_end + 1: +            cur_end = letter +        else: +            cat_out.append((cur_start, cur_end)) +            cur_start = cur_end = letter +    cat_out.append((cur_start, cur_end)) +    return cat_out + +def format_table_content(f, content, indent): +    line = " "*indent +    first = True +    for chunk in content.split(","): +        if len(line) + len(chunk) < 98: +            if first: +                line += chunk +            else: +                line += ", " + chunk +            first = False +        else: +            f.write(line + ",\n") +            line = " "*indent + chunk +    f.write(line) + +# load all widths of want_widths, except those in except_cats +def load_east_asian_width(want_widths, except_cats): +    f = "EastAsianWidth.txt" +    fetch(f) +    widths = {} +    re1 = re.compile("^([0-9A-F]+);(\w+) +# (\w+)") +    re2 = re.compile("^([0-9A-F]+)\.\.([0-9A-F]+);(\w+) +# (\w+)") + +    for line in fileinput.input(f): +        width = None +        d_lo = 0 +        d_hi = 0 +        cat = None +        m = re1.match(line) +        if m: +            d_lo = m.group(1) +            d_hi = m.group(1) +            width = m.group(2) +            cat = m.group(3) +        else: +            m = re2.match(line) +            if m: +                d_lo = m.group(1) +                d_hi = m.group(2) +                width = m.group(3) +                cat = m.group(4) +            else: +                continue +        if cat in except_cats or width not in want_widths: +            continue +        d_lo = int(d_lo, 16) +        d_hi = int(d_hi, 16) +        if width not in widths: +            widths[width] = [] +        widths[width].append((d_lo, d_hi)) +    return widths + +def escape_char(c): +    return "'\\u{%x}'" % c + +def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True, +        pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])), is_const=True): +    pub_string = "const" +    if not is_const: +        pub_string = "let" +    if is_pub: +        pub_string = "pub " + pub_string +    f.write("    %s %s: %s = &[\n" % (pub_string, name, t_type)) +    data = "" +    first = True +    for dat in t_data: +        if not first: +            data += "," +        first = False +        data += pfun(dat) +    format_table_content(f, data, 8) +    f.write("\n    ];\n\n") + +def emit_charwidth_module(f, width_table): +    f.write("pub mod charwidth {") +    f.write(""" +    use core::option::Option::{self, Some, None}; +    use core::result::Result::{Ok, Err}; + +    #[inline] +    fn bsearch_range_value_table(c: char, is_cjk: bool, r: &'static [(char, char, u8, u8)]) -> u8 { +        use core::cmp::Ordering::{Equal, Less, Greater}; +        match r.binary_search_by(|&(lo, hi, _, _)| { +            if lo <= c && c <= hi { Equal } +            else if hi < c { Less } +            else { Greater } +        }) { +            Ok(idx) => { +                let (_, _, r_ncjk, r_cjk) = r[idx]; +                if is_cjk { r_cjk } else { r_ncjk } +            } +            Err(_) => 1 +        } +    } +""") + +    f.write(""" +    #[inline] +    pub fn width(c: char, is_cjk: bool) -> Option<usize> { +        match c as usize { +            _c @ 0 => Some(0),          // null is zero width +            cu if cu < 0x20 => None,    // control sequences have no width +            cu if cu < 0x7F => Some(1), // ASCII +            cu if cu < 0xA0 => None,    // more control sequences +            _ => Some(bsearch_range_value_table(c, is_cjk, charwidth_table) as usize) +        } +    } + +""") + +    f.write("    // character width table. Based on Markus Kuhn's free wcwidth() implementation,\n") +    f.write("    //     http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c\n") +    emit_table(f, "charwidth_table", width_table, "&'static [(char, char, u8, u8)]", is_pub=False, +            pfun=lambda x: "(%s,%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2], x[3])) +    f.write("}\n\n") + +def remove_from_wtable(wtable, val): +    wtable_out = [] +    while wtable: +        if wtable[0][1] < val: +            wtable_out.append(wtable.pop(0)) +        elif wtable[0][0] > val: +            break +        else: +            (wt_lo, wt_hi, width, width_cjk) = wtable.pop(0) +            if wt_lo == wt_hi == val: +                continue +            elif wt_lo == val: +                wtable_out.append((wt_lo+1, wt_hi, width, width_cjk)) +            elif wt_hi == val: +                wtable_out.append((wt_lo, wt_hi-1, width, width_cjk)) +            else: +                wtable_out.append((wt_lo, val-1, width, width_cjk)) +                wtable_out.append((val+1, wt_hi, width, width_cjk)) +    if wtable: +        wtable_out.extend(wtable) +    return wtable_out + + + +def optimize_width_table(wtable): +    wtable_out = [] +    w_this = wtable.pop(0) +    while wtable: +        if w_this[1] == wtable[0][0] - 1 and w_this[2:3] == wtable[0][2:3]: +            w_tmp = wtable.pop(0) +            w_this = (w_this[0], w_tmp[1], w_tmp[2], w_tmp[3]) +        else: +            wtable_out.append(w_this) +            w_this = wtable.pop(0) +    wtable_out.append(w_this) +    return wtable_out + +if __name__ == "__main__": +    r = "tables.rs" +    if os.path.exists(r): +        os.remove(r) +    with open(r, "w") as rf: +        # write the file's preamble +        rf.write(preamble) + +        # download and parse all the data +        fetch("ReadMe.txt") +        with open("ReadMe.txt") as readme: +            pattern = "for Version (\d+)\.(\d+)\.(\d+) of the Unicode" +            unicode_version = re.search(pattern, readme.read()).groups() +        rf.write(""" +/// The version of [Unicode](http://www.unicode.org/) +/// that this version of unicode-width is based on. +pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s); + +""" % unicode_version) +        gencats = load_unicode_data("UnicodeData.txt") + +        ### character width module +        width_table = [] +        for zwcat in ["Me", "Mn", "Cf"]: +            width_table.extend([(lo_hi[0], lo_hi[1], 0, 0) for lo_hi in gencats[zwcat]]) +        width_table.append((4448, 4607, 0, 0)) + +        # get widths, except those that are explicitly marked zero-width above +        ea_widths = load_east_asian_width(["W", "F", "A"], ["Me", "Mn", "Cf"]) +        # these are doublewidth +        for dwcat in ["W", "F"]: +            width_table.extend([(lo_hi1[0], lo_hi1[1], 2, 2) for lo_hi1 in ea_widths[dwcat]]) +        width_table.extend([(lo_hi2[0], lo_hi2[1], 1, 2) for lo_hi2 in ea_widths["A"]]) + +        width_table.sort(key=lambda w: w[0]) + +        # soft hyphen is not zero width in preformatted text; it's used to indicate +        # a hyphen inserted to facilitate a linebreak. +        width_table = remove_from_wtable(width_table, 173) + +        # optimize the width table by collapsing adjacent entities when possible +        width_table = optimize_width_table(width_table) +        emit_charwidth_module(rf, width_table) | 
