mirror of
				https://github.com/tursodatabase/libsql.git
				synced 2025-10-31 11:06:13 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			206 lines
		
	
	
		
			5.6 KiB
		
	
	
	
		
			Tcl
		
	
	
	
	
	
			
		
		
	
	
			206 lines
		
	
	
		
			5.6 KiB
		
	
	
	
		
			Tcl
		
	
	
	
	
	
| 
 | |
| #--------------------------------------------------------------------------
 | |
| # Parameter $zName must be a path to the file UnicodeData.txt. This command
 | |
| # reads the file and returns a list of mappings required to remove all
 | |
| # diacritical marks from a unicode string. Each mapping is itself a list
 | |
| # consisting of two elements - the unicode codepoint and the single ASCII
 | |
| # character that it should be replaced with, or an empty string if the 
 | |
| # codepoint should simply be removed from the input. Examples:
 | |
| #
 | |
| #   { 224 a  0 }     (replace codepoint 224 to "a")
 | |
| #   { 769 "" 0 }     (remove codepoint 769 from input)
 | |
| #
 | |
| # Mappings are only returned for non-upper case codepoints. It is assumed
 | |
| # that the input has already been folded to lower case.
 | |
| #
 | |
| # The third value in the list is always either 0 or 1. 0 if the 
 | |
| # UnicodeData.txt file maps the codepoint to a single ASCII character and
 | |
| # a diacritic, or 1 if the mapping is indirect. For example, consider the 
 | |
| # two entries:
 | |
| #
 | |
| # 1ECD;LATIN SMALL LETTER O WITH DOT BELOW;Ll;0;L;006F 0323;;;;N;;;1ECC;;1ECC
 | |
| # 1ED9;LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW;Ll;0;L;1ECD 0302;;;;N;;;1ED8;;1ED8
 | |
| #
 | |
| # The first codepoint is a direct mapping (as 006F is ASCII and 0323 is a 
 | |
| # diacritic). The second is an indirect mapping, as it maps to the
 | |
| # first codepoint plus 0302 (a diacritic).
 | |
| #
 | |
| proc rd_load_unicodedata_text {zName} {
 | |
|   global tl_lookup_table
 | |
| 
 | |
|   set fd [open $zName]
 | |
|   set lField {
 | |
|     code
 | |
|     character_name
 | |
|     general_category
 | |
|     canonical_combining_classes
 | |
|     bidirectional_category
 | |
|     character_decomposition_mapping
 | |
|     decimal_digit_value
 | |
|     digit_value
 | |
|     numeric_value
 | |
|     mirrored
 | |
|     unicode_1_name
 | |
|     iso10646_comment_field
 | |
|     uppercase_mapping
 | |
|     lowercase_mapping
 | |
|     titlecase_mapping
 | |
|   }
 | |
|   set lRet [list]
 | |
| 
 | |
|   while { ![eof $fd] } {
 | |
|     set line [gets $fd]
 | |
|     if {$line == ""} continue
 | |
| 
 | |
|     set fields [split $line ";"]
 | |
|     if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
 | |
|     foreach $lField $fields {}
 | |
|     if { [llength $character_decomposition_mapping]!=2
 | |
|       || [string is xdigit [lindex $character_decomposition_mapping 0]]==0
 | |
|     } {
 | |
|       continue
 | |
|     }
 | |
| 
 | |
|     set iCode  [expr "0x$code"]
 | |
|     set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"]
 | |
|     set iDia   [expr "0x[lindex $character_decomposition_mapping 1]"]
 | |
| 
 | |
|     # Filter out upper-case characters, as they will be mapped to their
 | |
|     # lower-case equivalents before this data is used.
 | |
|     if {[info exists tl_lookup_table($iCode)]} continue
 | |
| 
 | |
|     # Check if this is an indirect mapping. If so, set bIndirect to true
 | |
|     # and change $iAscii to the indirectly mappped ASCII character.
 | |
|     set bIndirect 0
 | |
|     if {[info exists dia($iDia)] && [info exists mapping($iAscii)]} {
 | |
|       set iAscii $mapping($iAscii)
 | |
|       set bIndirect 1
 | |
|     }
 | |
| 
 | |
|     if { ($iAscii >= 97 && $iAscii <= 122)
 | |
|       || ($iAscii >= 65 && $iAscii <= 90)
 | |
|     } {
 | |
|       lappend lRet [list $iCode [string tolower [format %c $iAscii]] $bIndirect]
 | |
|       set mapping($iCode) $iAscii
 | |
|       set dia($iDia) 1
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   foreach d [array names dia] {
 | |
|     lappend lRet [list $d "" 0]
 | |
|   }
 | |
|   set lRet [lsort -integer -index 0 $lRet]
 | |
| 
 | |
|   close $fd
 | |
|   set lRet
 | |
| }
 | |
| 
 | |
| #-------------------------------------------------------------------------
 | |
| # Parameter $zName must be a path to the file UnicodeData.txt. This command
 | |
| # reads the file and returns a list of codepoints (integers). The list
 | |
| # contains all codepoints in the UnicodeData.txt assigned to any "General
 | |
| # Category" that is not a "Letter" or "Number".
 | |
| #
 | |
| proc an_load_unicodedata_text {zName} {
 | |
|   set fd [open $zName]
 | |
|   set lField {
 | |
|     code
 | |
|     character_name
 | |
|     general_category
 | |
|     canonical_combining_classes
 | |
|     bidirectional_category
 | |
|     character_decomposition_mapping
 | |
|     decimal_digit_value
 | |
|     digit_value
 | |
|     numeric_value
 | |
|     mirrored
 | |
|     unicode_1_name
 | |
|     iso10646_comment_field
 | |
|     uppercase_mapping
 | |
|     lowercase_mapping
 | |
|     titlecase_mapping
 | |
|   }
 | |
|   set lRet [list]
 | |
| 
 | |
|   while { ![eof $fd] } {
 | |
|     set line [gets $fd]
 | |
|     if {$line == ""} continue
 | |
| 
 | |
|     set fields [split $line ";"]
 | |
|     if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
 | |
|     foreach $lField $fields {}
 | |
| 
 | |
|     set iCode [expr "0x$code"]
 | |
|     set bAlnum [expr {
 | |
|          [lsearch {L N} [string range $general_category 0 0]] >= 0
 | |
|       || $general_category=="Co"
 | |
|     }]
 | |
| 
 | |
|     if { !$bAlnum } { lappend lRet $iCode }
 | |
|   }
 | |
| 
 | |
|   close $fd
 | |
|   set lRet
 | |
| }
 | |
| 
 | |
| proc tl_load_casefolding_txt {zName} {
 | |
|   global tl_lookup_table
 | |
| 
 | |
|   set fd [open $zName]
 | |
|   while { ![eof $fd] } {
 | |
|     set line [gets $fd]
 | |
|     if {[string range $line 0 0] == "#"} continue
 | |
|     if {$line == ""} continue
 | |
| 
 | |
|     foreach x {a b c d} {unset -nocomplain $x}
 | |
|     foreach {a b c d} [split $line ";"] {}
 | |
| 
 | |
|     set a2 [list]
 | |
|     set c2 [list]
 | |
|     foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] }
 | |
|     foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] }
 | |
|     set b [string trim $b]
 | |
|     set d [string trim $d]
 | |
| 
 | |
|     if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 }
 | |
|   }
 | |
| }
 | |
| 
 | |
| proc cc_load_unicodedata_text {zName} {
 | |
|   set fd [open $zName]
 | |
|   set lField {
 | |
|     code
 | |
|     character_name
 | |
|     general_category
 | |
|     canonical_combining_classes
 | |
|     bidirectional_category
 | |
|     character_decomposition_mapping
 | |
|     decimal_digit_value
 | |
|     digit_value
 | |
|     numeric_value
 | |
|     mirrored
 | |
|     unicode_1_name
 | |
|     iso10646_comment_field
 | |
|     uppercase_mapping
 | |
|     lowercase_mapping
 | |
|     titlecase_mapping
 | |
|   }
 | |
|   set lRet [list]
 | |
| 
 | |
|   while { ![eof $fd] } {
 | |
|     set line [gets $fd]
 | |
|     if {$line == ""} continue
 | |
| 
 | |
|     set fields [split $line ";"]
 | |
|     if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
 | |
|     foreach $lField $fields {}
 | |
| 
 | |
|     lappend lRet [list $code $general_category]
 | |
|   }
 | |
| 
 | |
|   close $fd
 | |
|   set lRet
 | |
| }
 | |
| 
 | |
| 
 |