OpenFPGA/libs/EXTERNAL/tcl8.6.12/tools/uniParse.tcl

417 lines
11 KiB
Tcl
Raw Normal View History

2022-06-07 11:15:20 -05:00
# uniParse.tcl --
#
# This program parses the UnicodeData file and generates the
# corresponding tclUniData.c file with compressed character
# data tables. The input to this program should be the latest
# UnicodeData file from:
# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
#
# Copyright (c) 1998-1999 Scriptics Corporation.
# All rights reserved.
namespace eval uni {
set shift 5; # number of bits of data within a page
# This value can be adjusted to find the
# best split to minimize table size
variable pMap; # map from page to page index, each entry is
# an index into the pages table, indexed by
# page number
variable pages; # map from page index to page info, each
# entry is a list of indices into the groups
# table, the list is indexed by the offset
variable groups; # list of character info values, indexed by
# group number, initialized with the
# unassigned character group
variable categories {
Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp
Cc Cf Co Cs Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
}; # Ordered list of character categories, must
# match the enumeration in the header file.
}
proc uni::getValue {items index} {
variable categories
# Extract character info
set category [lindex $items 2]
if {[scan [lindex $items 12] %x toupper] == 1} {
set toupper [expr {$index - $toupper}]
} else {
set toupper 0
}
if {[scan [lindex $items 13] %x tolower] == 1} {
set tolower [expr {$tolower - $index}]
} else {
set tolower 0
}
if {[scan [lindex $items 14] %x totitle] == 1} {
set totitle [expr {$index - $totitle}]
} elseif {$tolower} {
set totitle 0
} else {
set totitle $toupper
}
set categoryIndex [lsearch -exact $categories $category]
if {$categoryIndex < 0} {
error "Unexpected character category: $index($category)"
}
return [list $categoryIndex $toupper $tolower $totitle]
}
proc uni::getGroup {value} {
variable groups
set gIndex [lsearch -exact $groups $value]
if {$gIndex < 0} {
set gIndex [llength $groups]
lappend groups $value
}
return $gIndex
}
proc uni::addPage {info} {
variable pMap
variable pages
variable shift
set pIndex [lsearch -exact $pages $info]
if {$pIndex < 0} {
set pIndex [llength $pages]
lappend pages $info
}
lappend pMap [expr {$pIndex << $shift}]
return
}
proc uni::buildTables {data} {
variable shift
variable pMap {}
variable pages {}
variable groups {{0 0 0 0}}
variable next 0
set info {} ;# temporary page info
set mask [expr {(1 << $shift) - 1}]
foreach line [split $data \n] {
if {$line eq ""} {
if {!($next & $mask)} {
# next character is already on page boundary
continue
}
# fill remaining page
set line [format %X [expr {($next-1)|$mask}]]
append line ";;Cn;0;ON;;;;;N;;;;;\n"
}
set items [split $line \;]
scan [lindex $items 0] %x index
if {$index > 0x3FFFF} then {
# Ignore characters > plane 3
continue
}
set index [format %d $index]
set gIndex [getGroup [getValue $items $index]]
# Since the input table omits unassigned characters, these will
# show up as gaps in the index sequence. There are a few special cases
# where the gaps correspond to a uniform block of assigned characters.
# These are indicated as such in the character name.
# Enter all unassigned characters up to the current character.
if {($index > $next) \
&& ![regexp "Last>$" [lindex $items 1]]} {
for {} {$next < $index} {incr next} {
lappend info 0
if {($next & $mask) == $mask} {
addPage $info
set info {}
}
}
}
# Enter all assigned characters up to the current character
for {set i $next} {$i <= $index} {incr i} {
# Add the group index to the info for the current page
lappend info $gIndex
# If this is the last entry in the page, add the page
if {($i & $mask) == $mask} {
addPage $info
set info {}
}
}
set next [expr {$index + 1}]
}
return
}
proc uni::main {} {
global argc argv0 argv
variable pMap
variable pages
variable groups
variable shift
variable next
if {$argc != 2} {
puts stderr "\nusage: $argv0 <datafile> <outdir>\n"
exit 1
}
set f [open [lindex $argv 0] r]
set data [read $f]
close $f
buildTables $data
puts "X = [llength $pMap] Y= [llength $pages] A= [llength $groups]"
set size [expr {[llength $pMap]*2 + ([llength $pages]<<$shift)}]
puts "shift = $shift, space = $size"
set f [open [file join [lindex $argv 1] tclUniData.c] w]
fconfigure $f -translation lf -encoding utf-8
puts $f "/*
* tclUniData.c --
*
* Declarations of Unicode character information tables. This file is
* automatically generated by the tools/uniParse.tcl script. Do not
* modify this file by hand.
*
* Copyright (c) 1998 Scriptics Corporation.
* All rights reserved.
*/
/*
* A 16-bit Unicode character is split into two parts in order to index
* into the following tables. The lower OFFSET_BITS comprise an offset
* into a page of characters. The upper bits comprise the page number.
*/
#define OFFSET_BITS $shift
/*
* The pageMap is indexed by page number and returns an alternate page number
* that identifies a unique page of characters. Many Unicode characters map
* to the same alternate page number.
*/
static const unsigned short pageMap\[\] = {"
set line " "
set last [expr {[llength $pMap] - 1}]
for {set i 0} {$i <= $last} {incr i} {
if {$i == [expr {0x10000 >> $shift}]} {
set line [string trimright $line " \t,"]
puts $f $line
set lastpage [expr {[lindex $line end] >> $shift}]
puts stdout "lastpage: $lastpage"
puts $f "#if TCL_UTF_MAX > 3 || TCL_MAJOR_VERSION > 8 || TCL_MINOR_VERSION > 6"
set line " ,"
}
append line [lindex $pMap $i]
if {$i != $last} {
append line ", "
}
if {[string length $line] > 70} {
puts $f [string trimright $line]
set line " "
}
}
puts $f $line
puts $f "#endif /* TCL_UTF_MAX > 3 */"
puts $f "};
/*
* The groupMap is indexed by combining the alternate page number with
* the page offset and returns a group number that identifies a unique
* set of character attributes.
*/
static const unsigned char groupMap\[\] = {"
set line " "
set lasti [expr {[llength $pages] - 1}]
for {set i 0} {$i <= $lasti} {incr i} {
set page [lindex $pages $i]
set lastj [expr {[llength $page] - 1}]
if {$i == ($lastpage + 1)} {
puts $f [string trimright $line " \t,"]
puts $f "#if TCL_UTF_MAX > 3 || TCL_MAJOR_VERSION > 8 || TCL_MINOR_VERSION > 6"
set line " ,"
}
for {set j 0} {$j <= $lastj} {incr j} {
append line [lindex $page $j]
if {$j != $lastj || $i != $lasti} {
append line ", "
}
if {[string length $line] > 70} {
puts $f [string trimright $line]
set line " "
}
}
}
puts $f $line
puts $f "#endif /* TCL_UTF_MAX > 3 */"
puts $f "};
/*
* Each group represents a unique set of character attributes. The attributes
* are encoded into a 32-bit value as follows:
*
* Bits 0-4 Character category: see the constants listed below.
*
* Bits 5-7 Case delta type: 000 = identity
* 010 = add delta for lower
* 011 = add delta for lower, add 1 for title
* 100 = subtract delta for title/upper
* 101 = sub delta for upper, sub 1 for title
* 110 = sub delta for upper, add delta for lower
* 111 = subtract delta for upper
*
* Bits 8-31 Case delta: delta for case conversions. This should be the
* highest field so we can easily sign extend.
*/
static const int groups\[\] = {"
set line " "
set last [expr {[llength $groups] - 1}]
for {set i 0} {$i <= $last} {incr i} {
foreach {type toupper tolower totitle} [lindex $groups $i] {}
# Compute the case conversion type and delta
if {$totitle} {
if {$totitle == $toupper} {
# subtract delta for title or upper
set case 4
set delta $toupper
if {$tolower} {
error "New case conversion type needed: $toupper $tolower $totitle"
}
} elseif {$toupper} {
# subtract delta for upper, subtract 1 for title
set case 5
set delta $toupper
if {($totitle != 1) || $tolower} {
error "New case conversion type needed: $toupper $tolower $totitle"
}
} else {
# add delta for lower, add 1 for title
set case 3
set delta $tolower
if {$totitle != -1} {
error "New case conversion type needed: $toupper $tolower $totitle"
}
}
} elseif {$toupper} {
set delta $toupper
if {$tolower == $toupper} {
# subtract delta for upper, add delta for lower
set case 6
} elseif {!$tolower} {
# subtract delta for upper
set case 7
} else {
error "New case conversion type needed: $toupper $tolower $totitle"
}
} elseif {$tolower} {
# add delta for lower
set case 2
set delta $tolower
} else {
# noop
set case 0
set delta 0
}
append line [expr {($delta << 8) | ($case << 5) | $type}]
if {$i != $last} {
append line ", "
}
if {[string length $line] > 65} {
puts $f [string trimright $line]
set line " "
}
}
puts $f $line
puts -nonewline $f "};
#if TCL_UTF_MAX > 3 || TCL_MAJOR_VERSION > 8 || TCL_MINOR_VERSION > 6
# define UNICODE_OUT_OF_RANGE(ch) (((ch) & 0x1FFFFF) >= [format 0x%X $next])
#else
# define UNICODE_OUT_OF_RANGE(ch) (((ch) & 0x1F0000) != 0)
#endif
/*
* The following constants are used to determine the category of a
* Unicode character.
*/
enum {
UNASSIGNED,
UPPERCASE_LETTER,
LOWERCASE_LETTER,
TITLECASE_LETTER,
MODIFIER_LETTER,
OTHER_LETTER,
NON_SPACING_MARK,
ENCLOSING_MARK,
COMBINING_SPACING_MARK,
DECIMAL_DIGIT_NUMBER,
LETTER_NUMBER,
OTHER_NUMBER,
SPACE_SEPARATOR,
LINE_SEPARATOR,
PARAGRAPH_SEPARATOR,
CONTROL,
FORMAT,
PRIVATE_USE,
SURROGATE,
CONNECTOR_PUNCTUATION,
DASH_PUNCTUATION,
OPEN_PUNCTUATION,
CLOSE_PUNCTUATION,
INITIAL_QUOTE_PUNCTUATION,
FINAL_QUOTE_PUNCTUATION,
OTHER_PUNCTUATION,
MATH_SYMBOL,
CURRENCY_SYMBOL,
MODIFIER_SYMBOL,
OTHER_SYMBOL
};
/*
* The following macros extract the fields of the character info. The
* GetDelta() macro is complicated because we can't rely on the C compiler
* to do sign extension on right shifts.
*/
#define GetCaseType(info) (((info) & 0xE0) >> 5)
#define GetCategory(ch) (GetUniCharInfo(ch) & 0x1F)
#define GetDelta(info) ((info) >> 8)
/*
* This macro extracts the information about a character from the
* Unicode character tables.
*/
#if TCL_UTF_MAX > 3 || TCL_MAJOR_VERSION > 8 || TCL_MINOR_VERSION > 6
# define GetUniCharInfo(ch) (groups\[groupMap\[pageMap\[((ch) & 0x1FFFFF) >> OFFSET_BITS\] | ((ch) & ((1 << OFFSET_BITS)-1))\]\])
#else
# define GetUniCharInfo(ch) (groups\[groupMap\[pageMap\[((ch) & 0xFFFF) >> OFFSET_BITS\] | ((ch) & ((1 << OFFSET_BITS)-1))\]\])
#endif
"
close $f
}
uni::main
return