mirror of
				https://github.com/tursodatabase/libsql.git
				synced 2025-10-31 11:06:13 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			985 lines
		
	
	
		
			25 KiB
		
	
	
	
		
			Tcl
		
	
	
	
	
	
			
		
		
	
	
			985 lines
		
	
	
		
			25 KiB
		
	
	
	
		
			Tcl
		
	
	
	
	
	
| 
 | |
| source [file join [file dirname [info script]] parseunicode.tcl]
 | |
| 
 | |
| proc print_rd {map} {
 | |
|   global tl_lookup_table
 | |
|   set aChar [list]
 | |
|   set lRange [list]
 | |
| 
 | |
|   set nRange 1
 | |
|   set iFirst  [lindex $map 0 0]
 | |
|   set cPrev   [lindex $map 0 1]
 | |
|   set fPrev   [lindex $map 0 2]
 | |
| 
 | |
|   foreach m [lrange $map 1 end] {
 | |
|     foreach {i c f} $m {}
 | |
| 
 | |
|     if {$cPrev == $c && $fPrev==$f} {
 | |
|       for {set j [expr $iFirst+$nRange]} {$j<$i} {incr j} {
 | |
|         if {[info exists tl_lookup_table($j)]==0} break
 | |
|       }
 | |
| 
 | |
|       if {$j==$i} {
 | |
|         set nNew [expr {(1 + $i - $iFirst)}]
 | |
|         if {$nNew<=8} {
 | |
|           set nRange $nNew
 | |
|           continue
 | |
|         }
 | |
|       }
 | |
|     }
 | |
| 
 | |
|     lappend lRange [list $iFirst $nRange]
 | |
|     lappend aChar  $cPrev
 | |
|     lappend aFlag  $fPrev
 | |
| 
 | |
|     set iFirst $i
 | |
|     set cPrev  $c
 | |
|     set fPrev  $f
 | |
|     set nRange 1
 | |
|   }
 | |
|   lappend lRange [list $iFirst $nRange]
 | |
|   lappend aChar $cPrev
 | |
|   lappend aFlag $fPrev
 | |
| 
 | |
|   puts "/*"
 | |
|   puts "** If the argument is a codepoint corresponding to a lowercase letter"
 | |
|   puts "** in the ASCII range with a diacritic added, return the codepoint"
 | |
|   puts "** of the ASCII letter only. For example, if passed 235 - \"LATIN"
 | |
|   puts "** SMALL LETTER E WITH DIAERESIS\" - return 65 (\"LATIN SMALL LETTER"
 | |
|   puts "** E\"). The resuls of passing a codepoint that corresponds to an"
 | |
|   puts "** uppercase letter are undefined."
 | |
|   puts "*/"
 | |
|   puts "static int ${::remove_diacritic}(int c, int bComplex)\{"
 | |
|   puts "  unsigned short aDia\[\] = \{"
 | |
|   puts -nonewline "        0, "
 | |
|   set i 1
 | |
|   foreach r $lRange {
 | |
|     foreach {iCode nRange} $r {}
 | |
|     if {($i % 8)==0} {puts "" ; puts -nonewline "    " }
 | |
|     incr i
 | |
| 
 | |
|     puts -nonewline [format "%5d" [expr ($iCode<<3) + $nRange-1]]
 | |
|     puts -nonewline ", "
 | |
|   }
 | |
|   puts ""
 | |
|   puts "  \};"
 | |
|   puts "#define HIBIT ((unsigned char)0x80)"
 | |
|   puts "  unsigned char aChar\[\] = \{"
 | |
|   puts -nonewline "    '\\0',      "
 | |
|   set i 1
 | |
|   foreach c $aChar f $aFlag {
 | |
|     if { $f } {
 | |
|       set str "'$c'|HIBIT, "
 | |
|     } else {
 | |
|       set str "'$c',       "
 | |
|     }
 | |
|     if {$c == ""} { set str "'\\0',      " }
 | |
| 
 | |
|     if {($i % 6)==0} {puts "" ; puts -nonewline "    " }
 | |
|     incr i
 | |
|     puts -nonewline "$str"
 | |
|   }
 | |
|   puts ""
 | |
|   puts "  \};"
 | |
|   puts {
 | |
|   unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
 | |
|   int iRes = 0;
 | |
|   int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
 | |
|   int iLo = 0;
 | |
|   while( iHi>=iLo ){
 | |
|     int iTest = (iHi + iLo) / 2;
 | |
|     if( key >= aDia[iTest] ){
 | |
|       iRes = iTest;
 | |
|       iLo = iTest+1;
 | |
|     }else{
 | |
|       iHi = iTest-1;
 | |
|     }
 | |
|   }
 | |
|   assert( key>=aDia[iRes] );
 | |
|   if( bComplex==0 && (aChar[iRes] & 0x80) ) return c;
 | |
|   return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F);}
 | |
|   puts "\}"
 | |
| }
 | |
| 
 | |
| proc print_isdiacritic {zFunc map} {
 | |
| 
 | |
|   set lCode [list]
 | |
|   foreach m $map {
 | |
|     foreach {code char flag} $m {}
 | |
|     if {$flag} continue
 | |
|     if {$code && $char == ""} { lappend lCode $code }
 | |
|   }
 | |
|   set lCode [lsort -integer $lCode]
 | |
|   set iFirst [lindex $lCode 0]
 | |
|   set iLast [lindex $lCode end]
 | |
| 
 | |
|   set i1 0
 | |
|   set i2 0
 | |
| 
 | |
|   foreach c $lCode {
 | |
|     set i [expr $c - $iFirst]
 | |
|     if {$i < 32} {
 | |
|       set i1 [expr {$i1 | (1<<$i)}]
 | |
|     } else {
 | |
|       set i2 [expr {$i2 | (1<<($i-32))}]
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   puts "/*"
 | |
|   puts "** Return true if the argument interpreted as a unicode codepoint" 
 | |
|   puts "** is a diacritical modifier character."
 | |
|   puts "*/"
 | |
|   puts "int ${zFunc}\(int c)\{"
 | |
|   puts "  unsigned int mask0 = [format "0x%08X" $i1];"
 | |
|   puts "  unsigned int mask1 = [format "0x%08X" $i2];"
 | |
| 
 | |
|   puts "  if( c<$iFirst || c>$iLast ) return 0;"
 | |
|   puts "  return (c < $iFirst+32) ?"
 | |
|   puts "      (mask0 & ((unsigned int)1 << (c-$iFirst))) :"
 | |
|   puts "      (mask1 & ((unsigned int)1 << (c-$iFirst-32)));"
 | |
|   puts "\}"
 | |
| }
 | |
| 
 | |
| 
 | |
| #-------------------------------------------------------------------------
 | |
| 
 | |
| proc an_load_separator_ranges {} {
 | |
|   global unicodedata.txt
 | |
|   set lSep [an_load_unicodedata_text ${unicodedata.txt}]
 | |
|   unset -nocomplain iFirst 
 | |
|   unset -nocomplain nRange 
 | |
|   set lRange [list]
 | |
|   foreach sep $lSep {
 | |
|     if {0==[info exists iFirst]} {
 | |
|       set iFirst $sep
 | |
|       set nRange 1
 | |
|     } elseif { $sep == ($iFirst+$nRange) } {
 | |
|       incr nRange
 | |
|     } else {
 | |
|       lappend lRange [list $iFirst $nRange]
 | |
|       set iFirst $sep
 | |
|       set nRange 1
 | |
|     }
 | |
|   } 
 | |
|   lappend lRange [list $iFirst $nRange]
 | |
|   set lRange
 | |
| }
 | |
| 
 | |
| proc an_print_range_array {lRange} {
 | |
|   set iFirstMax 0
 | |
|   set nRangeMax 0
 | |
|   foreach range $lRange {
 | |
|     foreach {iFirst nRange} $range {}
 | |
|     if {$iFirst > $iFirstMax} {set iFirstMax $iFirst}
 | |
|     if {$nRange > $nRangeMax} {set nRangeMax $nRange}
 | |
|   }
 | |
|   if {$iFirstMax >= (1<<22)} {error "first-max is too large for format"}
 | |
|   if {$nRangeMax >= (1<<10)} {error "range-max is too large for format"}
 | |
| 
 | |
|   puts -nonewline "  "
 | |
|   puts [string trim {
 | |
|   /* Each unsigned integer in the following array corresponds to a contiguous
 | |
|   ** range of unicode codepoints that are not either letters or numbers (i.e.
 | |
|   ** codepoints for which this function should return 0).
 | |
|   **
 | |
|   ** The most significant 22 bits in each 32-bit value contain the first 
 | |
|   ** codepoint in the range. The least significant 10 bits are used to store
 | |
|   ** the size of the range (always at least 1). In other words, the value 
 | |
|   ** ((C<<22) + N) represents a range of N codepoints starting with codepoint 
 | |
|   ** C. It is not possible to represent a range larger than 1023 codepoints 
 | |
|   ** using this format.
 | |
|   */
 | |
|   }]
 | |
|   puts -nonewline "  static const unsigned int aEntry\[\] = \{"
 | |
|   set i 0
 | |
|   foreach range $lRange {
 | |
|     foreach {iFirst nRange} $range {}
 | |
|     set u32 [format "0x%08X" [expr ($iFirst<<10) + $nRange]]
 | |
| 
 | |
|     if {($i % 5)==0} {puts "" ; puts -nonewline "   "}
 | |
|     puts -nonewline " $u32,"
 | |
|     incr i
 | |
|   }
 | |
|   puts ""
 | |
|   puts "  \};"
 | |
| }
 | |
| 
 | |
| proc an_print_ascii_bitmap {lRange} {
 | |
|   foreach range $lRange {
 | |
|     foreach {iFirst nRange} $range {}
 | |
|     for {set i $iFirst} {$i < ($iFirst+$nRange)} {incr i} {
 | |
|       if {$i<=127} { set a($i) 1 }
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   set aAscii [list 0 0 0 0]
 | |
|   foreach key [array names a] {
 | |
|     set idx [expr $key >> 5]
 | |
|     lset aAscii $idx [expr [lindex $aAscii $idx] | (1 << ($key&0x001F))]
 | |
|   }
 | |
| 
 | |
|   puts "  static const unsigned int aAscii\[4\] = \{"
 | |
|   puts -nonewline "   "
 | |
|   foreach v $aAscii { puts -nonewline [format " 0x%08X," $v] }
 | |
|   puts ""
 | |
|   puts "  \};"
 | |
| }
 | |
| 
 | |
| proc print_isalnum {zFunc lRange} {
 | |
|   puts "/*"
 | |
|   puts "** Return true if the argument corresponds to a unicode codepoint"
 | |
|   puts "** classified as either a letter or a number. Otherwise false."
 | |
|   puts "**"
 | |
|   puts "** The results are undefined if the value passed to this function"
 | |
|   puts "** is less than zero."
 | |
|   puts "*/"
 | |
|   puts "int ${zFunc}\(int c)\{"
 | |
|   an_print_range_array $lRange
 | |
|   an_print_ascii_bitmap $lRange
 | |
|   puts {
 | |
|   if( (unsigned int)c<128 ){
 | |
|     return ( (aAscii[c >> 5] & ((unsigned int)1 << (c & 0x001F)))==0 );
 | |
|   }else if( (unsigned int)c<(1<<22) ){
 | |
|     unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
 | |
|     int iRes = 0;
 | |
|     int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
 | |
|     int iLo = 0;
 | |
|     while( iHi>=iLo ){
 | |
|       int iTest = (iHi + iLo) / 2;
 | |
|       if( key >= aEntry[iTest] ){
 | |
|         iRes = iTest;
 | |
|         iLo = iTest+1;
 | |
|       }else{
 | |
|         iHi = iTest-1;
 | |
|       }
 | |
|     }
 | |
|     assert( aEntry[0]<key );
 | |
|     assert( key>=aEntry[iRes] );
 | |
|     return (((unsigned int)c) >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));
 | |
|   }
 | |
|   return 1;}
 | |
|   puts "\}"
 | |
| }
 | |
| 
 | |
| proc print_test_isalnum {zFunc lRange} {
 | |
|   foreach range $lRange {
 | |
|     foreach {iFirst nRange} $range {}
 | |
|     for {set i $iFirst} {$i < ($iFirst+$nRange)} {incr i} { set a($i) 1 }
 | |
|   }
 | |
| 
 | |
|   puts "static int isalnum_test(int *piCode)\{"
 | |
|   puts -nonewline "  unsigned char aAlnum\[\] = \{"
 | |
|   for {set i 0} {$i < 70000} {incr i} {
 | |
|     if {($i % 32)==0} { puts "" ; puts -nonewline "    " }
 | |
|     set bFlag [expr ![info exists a($i)]]
 | |
|     puts -nonewline "${bFlag},"
 | |
|   }
 | |
|   puts ""
 | |
|   puts "  \};"
 | |
| 
 | |
|   puts -nonewline "  int aLargeSep\[\] = \{"
 | |
|   set i 0
 | |
|   foreach iSep [lsort -integer [array names a]] {
 | |
|     if {$iSep<70000} continue
 | |
|     if {($i % 8)==0} { puts "" ; puts -nonewline "   " }
 | |
|     puts -nonewline " $iSep,"
 | |
|     incr i
 | |
|   }
 | |
|   puts ""
 | |
|   puts "  \};"
 | |
|   puts -nonewline "  int aLargeOther\[\] = \{"
 | |
|   set i 0
 | |
|   foreach iSep [lsort -integer [array names a]] {
 | |
|     if {$iSep<70000} continue
 | |
|     if {[info exists a([expr $iSep-1])]==0} {
 | |
|       if {($i % 8)==0} { puts "" ; puts -nonewline "   " }
 | |
|       puts -nonewline " [expr $iSep-1],"
 | |
|       incr i
 | |
|     }
 | |
|     if {[info exists a([expr $iSep+1])]==0} {
 | |
|       if {($i % 8)==0} { puts "" ; puts -nonewline "   " }
 | |
|       puts -nonewline " [expr $iSep+1],"
 | |
|       incr i
 | |
|     }
 | |
|   }
 | |
|   puts ""
 | |
|   puts "  \};"
 | |
| 
 | |
|   puts [subst -nocommands {
 | |
|   int i;
 | |
|   for(i=0; i<sizeof(aAlnum)/sizeof(aAlnum[0]); i++){
 | |
|     if( ${zFunc}(i)!=aAlnum[i] ){
 | |
|       *piCode = i;
 | |
|       return 1;
 | |
|     }
 | |
|   }
 | |
|   for(i=0; i<sizeof(aLargeSep)/sizeof(aLargeSep[0]); i++){
 | |
|     if( ${zFunc}(aLargeSep[i])!=0 ){
 | |
|       *piCode = aLargeSep[i];
 | |
|       return 1;
 | |
|     }
 | |
|   }
 | |
|   for(i=0; i<sizeof(aLargeOther)/sizeof(aLargeOther[0]); i++){
 | |
|     if( ${zFunc}(aLargeOther[i])!=1 ){
 | |
|       *piCode = aLargeOther[i];
 | |
|       return 1;
 | |
|     }
 | |
|   }
 | |
|   }]
 | |
|   puts "  return 0;"
 | |
|   puts "\}"
 | |
| }
 | |
| 
 | |
| #-------------------------------------------------------------------------
 | |
| 
 | |
| proc tl_create_records {} {
 | |
|   global tl_lookup_table
 | |
| 
 | |
|   set iFirst ""
 | |
|   set nOff 0
 | |
|   set nRange 0
 | |
|   set nIncr 0
 | |
| 
 | |
|   set lRecord [list]
 | |
|   foreach code [lsort -integer [array names tl_lookup_table]] {
 | |
|     set mapping $tl_lookup_table($code)
 | |
|     if {$iFirst == ""} {
 | |
|       set iFirst $code
 | |
|       set nOff   [expr $mapping - $code]
 | |
|       set nRange 1
 | |
|       set nIncr 1
 | |
|     } else {
 | |
|       set diff [expr $code - ($iFirst + ($nIncr * ($nRange - 1)))]
 | |
|       if { $nRange==1 && ($diff==1 || $diff==2) } {
 | |
|         set nIncr $diff
 | |
|       }
 | |
| 
 | |
|       if {$diff != $nIncr || ($mapping - $code)!=$nOff} {
 | |
|         if { $nRange==1 } {set nIncr 1}
 | |
|         lappend lRecord [list $iFirst $nIncr $nRange $nOff]
 | |
|         set iFirst $code
 | |
|         set nOff   [expr $mapping - $code]
 | |
|         set nRange 1
 | |
|         set nIncr 1
 | |
|       } else {
 | |
|         incr nRange
 | |
|       }
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   lappend lRecord [list $iFirst $nIncr $nRange $nOff]
 | |
| 
 | |
|   set lRecord
 | |
| }
 | |
| 
 | |
| proc tl_print_table_header {} {
 | |
|   puts -nonewline "  "
 | |
|   puts [string trim {
 | |
|   /* Each entry in the following array defines a rule for folding a range
 | |
|   ** of codepoints to lower case. The rule applies to a range of nRange
 | |
|   ** codepoints starting at codepoint iCode.
 | |
|   **
 | |
|   ** If the least significant bit in flags is clear, then the rule applies
 | |
|   ** to all nRange codepoints (i.e. all nRange codepoints are upper case and
 | |
|   ** need to be folded). Or, if it is set, then the rule only applies to
 | |
|   ** every second codepoint in the range, starting with codepoint C.
 | |
|   **
 | |
|   ** The 7 most significant bits in flags are an index into the aiOff[]
 | |
|   ** array. If a specific codepoint C does require folding, then its lower
 | |
|   ** case equivalent is ((C + aiOff[flags>>1]) & 0xFFFF).
 | |
|   **
 | |
|   ** The contents of this array are generated by parsing the CaseFolding.txt
 | |
|   ** file distributed as part of the "Unicode Character Database". See
 | |
|   ** http://www.unicode.org for details.
 | |
|   */
 | |
|   }]
 | |
|   puts "  static const struct TableEntry \{"
 | |
|   puts "    unsigned short iCode;"
 | |
|   puts "    unsigned char flags;"
 | |
|   puts "    unsigned char nRange;"
 | |
|   puts "  \} aEntry\[\] = \{"
 | |
| }
 | |
| 
 | |
| proc tl_print_table_entry {togglevar entry liOff} {
 | |
|   upvar $togglevar t
 | |
|   foreach {iFirst nIncr nRange nOff} $entry {}
 | |
| 
 | |
|   if {$iFirst > (1<<16)} { return 1 }
 | |
| 
 | |
|   if {[info exists t]==0} {set t 0}
 | |
|   if {$t==0} { puts -nonewline "    " }
 | |
| 
 | |
|   set flags 0
 | |
|   if {$nIncr==2} { set flags 1 ; set nRange [expr $nRange * 2]}
 | |
|   if {$nOff<0}   { incr nOff [expr (1<<16)] }
 | |
| 
 | |
|   set idx [lsearch $liOff $nOff]
 | |
|   if {$idx<0} {error "malfunction generating aiOff"}
 | |
|   set flags [expr $flags + $idx*2]
 | |
| 
 | |
|   set txt "{$iFirst, $flags, $nRange},"
 | |
|   if {$t==2} {
 | |
|     puts $txt
 | |
|   } else {
 | |
|     puts -nonewline [format "% -23s" $txt]
 | |
|   }
 | |
|   set t [expr ($t+1)%3]
 | |
| 
 | |
|   return 0
 | |
| }
 | |
| 
 | |
| proc tl_print_table_footer {togglevar} {
 | |
|   upvar $togglevar t
 | |
|   if {$t!=0} {puts ""}
 | |
|   puts "  \};"
 | |
| }
 | |
| 
 | |
| proc tl_print_if_entry {entry} {
 | |
|   foreach {iFirst nIncr nRange nOff} $entry {}
 | |
|   if {$nIncr==2} {error "tl_print_if_entry needs improvement!"}
 | |
| 
 | |
|   puts "  else if( c>=$iFirst && c<[expr $iFirst+$nRange] )\{"
 | |
|   puts "    ret = c + $nOff;"
 | |
|   puts "  \}"
 | |
| }
 | |
| 
 | |
| proc tl_generate_ioff_table {lRecord} {
 | |
|   foreach entry $lRecord {
 | |
|     foreach {iFirst nIncr nRange iOff} $entry {}
 | |
|     if {$iOff<0}   { incr iOff [expr (1<<16)] }
 | |
|     if {[info exists a($iOff)]} continue
 | |
|     set a($iOff) 1
 | |
|   }
 | |
| 
 | |
|   set liOff [lsort -integer [array names a]]
 | |
|   if {[llength $liOff]>128} { error "Too many distinct ioffs" }
 | |
|   return $liOff
 | |
| }
 | |
| 
 | |
| proc tl_print_ioff_table {liOff} {
 | |
|   puts -nonewline "  static const unsigned short aiOff\[\] = \{"
 | |
|   set i 0
 | |
|   foreach off $liOff {
 | |
|     if {($i % 8)==0} {puts "" ; puts -nonewline "   "}
 | |
|     puts -nonewline [format "% -7s" "$off,"]
 | |
|     incr i
 | |
|   }
 | |
|   puts ""
 | |
|   puts "  \};"
 | |
| 
 | |
| }
 | |
| 
 | |
| proc print_fold {zFunc} {
 | |
| 
 | |
|   set lRecord [tl_create_records]
 | |
| 
 | |
|   set lHigh [list]
 | |
|   puts "/*"
 | |
|   puts "** Interpret the argument as a unicode codepoint. If the codepoint"
 | |
|   puts "** is an upper case character that has a lower case equivalent,"
 | |
|   puts "** return the codepoint corresponding to the lower case version."
 | |
|   puts "** Otherwise, return a copy of the argument."
 | |
|   puts "**"
 | |
|   puts "** The results are undefined if the value passed to this function"
 | |
|   puts "** is less than zero."
 | |
|   puts "*/"
 | |
|   puts "int ${zFunc}\(int c, int eRemoveDiacritic)\{"
 | |
| 
 | |
|   set liOff [tl_generate_ioff_table $lRecord]
 | |
|   tl_print_table_header
 | |
|   foreach entry $lRecord { 
 | |
|     if {[tl_print_table_entry toggle $entry $liOff]} { 
 | |
|       lappend lHigh $entry 
 | |
|     } 
 | |
|   }
 | |
|   tl_print_table_footer toggle
 | |
|   tl_print_ioff_table $liOff
 | |
| 
 | |
|   puts [subst -nocommands {
 | |
|   int ret = c;
 | |
| 
 | |
|   assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );
 | |
| 
 | |
|   if( c<128 ){
 | |
|     if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
 | |
|   }else if( c<65536 ){
 | |
|     const struct TableEntry *p;
 | |
|     int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
 | |
|     int iLo = 0;
 | |
|     int iRes = -1;
 | |
| 
 | |
|     assert( c>aEntry[0].iCode );
 | |
|     while( iHi>=iLo ){
 | |
|       int iTest = (iHi + iLo) / 2;
 | |
|       int cmp = (c - aEntry[iTest].iCode);
 | |
|       if( cmp>=0 ){
 | |
|         iRes = iTest;
 | |
|         iLo = iTest+1;
 | |
|       }else{
 | |
|         iHi = iTest-1;
 | |
|       }
 | |
|     }
 | |
| 
 | |
|     assert( iRes>=0 && c>=aEntry[iRes].iCode );
 | |
|     p = &aEntry[iRes];
 | |
|     if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
 | |
|       ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
 | |
|       assert( ret>0 );
 | |
|     }
 | |
| 
 | |
|     if( eRemoveDiacritic ){
 | |
|       ret = ${::remove_diacritic}(ret, eRemoveDiacritic==2);
 | |
|     }
 | |
|   }
 | |
|   }]
 | |
| 
 | |
|   foreach entry $lHigh {
 | |
|     tl_print_if_entry $entry
 | |
|   }
 | |
| 
 | |
|   puts ""
 | |
|   puts "  return ret;"
 | |
|   puts "\}"
 | |
| }
 | |
| 
 | |
| proc code {txt} {
 | |
|   set txt [string trimright $txt]
 | |
|   set txt [string trimleft $txt "\n"]
 | |
|   set n [expr {[string length $txt] - [string length [string trim $txt]]}]
 | |
|   set ret ""
 | |
|   foreach L [split $txt "\n"] {
 | |
|     append ret "[string range $L $n end]\n"
 | |
|   }
 | |
|   return [uplevel "subst -nocommands {$ret}"]
 | |
| }
 | |
| 
 | |
| proc intarray {lInt} {
 | |
|   set ret ""
 | |
|   set n [llength $lInt]
 | |
|   for {set i 0} {$i < $n} {incr i 10} {
 | |
|     append ret "\n    "
 | |
|     foreach int [lrange $lInt $i [expr $i+9]] {
 | |
|       append ret [format "%-7s" "$int, "]
 | |
|     }
 | |
|   }
 | |
|   append ret "\n  "
 | |
|   set ret
 | |
| }
 | |
| 
 | |
| proc categories_switch {Cvar first lSecond} {
 | |
|   upvar $Cvar C
 | |
|   set ret ""
 | |
|   append ret "case '$first':\n"
 | |
|   append ret "          switch( zCat\[1\] ){\n"
 | |
|   foreach s $lSecond {
 | |
|     append ret "            case '$s': aArray\[$C($first$s)\] = 1; break;\n"
 | |
|   }
 | |
|   append ret "            case '*': \n"
 | |
|   foreach s $lSecond {
 | |
|     append ret "              aArray\[$C($first$s)\] = 1;\n"
 | |
|   }
 | |
|   append ret "              break;\n"
 | |
|   append ret "            default: return 1;"
 | |
|   append ret "          }\n"
 | |
|   append ret "          break;\n"
 | |
| }
 | |
| 
 | |
| # Argument is a list. Each element of which is itself a list of two elements:
 | |
| #
 | |
| #   * the codepoint
 | |
| #   * the category
 | |
| #
 | |
| # List elements are sorted in order of codepoint.
 | |
| #
 | |
| proc print_categories {lMap} {
 | |
|   set categories {
 | |
|     Cc Cf Cn Cs
 | |
|     Ll Lm Lo Lt Lu
 | |
|     Mc Me Mn
 | |
|     Nd Nl No
 | |
|     Pc Pd Pe Pf Pi Po Ps
 | |
|     Sc Sk Sm So
 | |
|     Zl Zp Zs
 | |
| 
 | |
|     LC Co
 | |
|   }
 | |
| 
 | |
|   for {set i 0} {$i < [llength $categories]} {incr i} {
 | |
|     set C([lindex $categories $i]) [expr 1+$i]
 | |
|   }
 | |
| 
 | |
|   set caseC [categories_switch C C {c f n s o}]
 | |
|   set caseL [categories_switch C L {l m o t u C}]
 | |
|   set caseM [categories_switch C M {c e n}]
 | |
|   set caseN [categories_switch C N {d l o}]
 | |
|   set caseP [categories_switch C P {c d e f i o s}]
 | |
|   set caseS [categories_switch C S {c k m o}]
 | |
|   set caseZ [categories_switch C Z {l p s}]
 | |
| 
 | |
|   set nCat [expr [llength [array names C]] + 1]
 | |
|   puts [code {
 | |
|     int sqlite3Fts5UnicodeCatParse(const char *zCat, u8 *aArray){ 
 | |
|       aArray[0] = 1;
 | |
|       switch( zCat[0] ){
 | |
|         $caseC
 | |
|         $caseL
 | |
|         $caseM
 | |
|         $caseN
 | |
|         $caseP
 | |
|         $caseS
 | |
|         $caseZ
 | |
|       }
 | |
|       return 0;
 | |
|     }
 | |
|   }]
 | |
| 
 | |
|   set nRepeat 0
 | |
|   set first   [lindex $lMap 0 0]
 | |
|   set class   [lindex $lMap 0 1]
 | |
|   set prev -1
 | |
| 
 | |
|   set CASE(0) "Lu"
 | |
|   set CASE(1) "Ll"
 | |
| 
 | |
|   foreach m $lMap {
 | |
|     foreach {codepoint cl} $m {}
 | |
|     set codepoint [expr "0x$codepoint"]
 | |
|     if {$codepoint>=(1<<20)} continue
 | |
| 
 | |
|     set bNew 0
 | |
|     if {$codepoint!=($prev+1)} {
 | |
|       set bNew 1
 | |
|     } elseif {
 | |
|       $cl==$class || ($class=="LC" && $cl==$CASE([expr $nRepeat & 0x01]))
 | |
|     } {
 | |
|       incr nRepeat
 | |
|     } elseif {$class=="Lu" && $nRepeat==1 && $cl=="Ll"} {
 | |
|       set class LC
 | |
|       incr nRepeat
 | |
|     } else {
 | |
|       set bNew 1
 | |
|     }
 | |
|     if {$bNew} {
 | |
|       lappend lEntries [list $first $class $nRepeat]
 | |
|       set nRepeat 1
 | |
|       set first $codepoint
 | |
|       set class $cl
 | |
|     }
 | |
|     set prev $codepoint
 | |
|   }
 | |
|   if {$nRepeat>0} {
 | |
|     lappend lEntries [list $first $class $nRepeat]
 | |
|   }
 | |
| 
 | |
|   set aBlock [list 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 | |
|   set aMap [list]
 | |
|   foreach e $lEntries {
 | |
|     foreach {cp class nRepeat} $e {}
 | |
|     set block [expr ($cp>>16)]
 | |
|     if {$block>0 && [lindex $aBlock $block]==0} {
 | |
|       for {set i 1} {$i<=$block} {incr i} {
 | |
|         if {[lindex $aBlock $i]==0} {
 | |
|           lset aBlock $i [llength $aMap]
 | |
|         }
 | |
|       }
 | |
|     }
 | |
|     lappend aMap [expr {$cp & 0xFFFF}]
 | |
|     lappend aData [expr {($nRepeat << 5) + $C($class)}]
 | |
|   }
 | |
|   for {set i 1} {$i<[llength $aBlock]} {incr i} {
 | |
|     if {[lindex $aBlock $i]==0} {
 | |
|       lset aBlock $i [llength $aMap]
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   set aBlockArray [intarray $aBlock]
 | |
|   set aMapArray [intarray $aMap]
 | |
|   set aDataArray [intarray $aData]
 | |
|   puts [code {
 | |
|     static u16 aFts5UnicodeBlock[] = {$aBlockArray};
 | |
|     static u16 aFts5UnicodeMap[] = {$aMapArray};
 | |
|     static u16 aFts5UnicodeData[] = {$aDataArray};
 | |
| 
 | |
|     int sqlite3Fts5UnicodeCategory(u32 iCode) { 
 | |
|       int iRes = -1;
 | |
|       int iHi;
 | |
|       int iLo;
 | |
|       int ret;
 | |
|       u16 iKey;
 | |
| 
 | |
|       if( iCode>=(1<<20) ){
 | |
|         return 0;
 | |
|       }
 | |
|       iLo = aFts5UnicodeBlock[(iCode>>16)];
 | |
|       iHi = aFts5UnicodeBlock[1+(iCode>>16)];
 | |
|       iKey = (iCode & 0xFFFF);
 | |
|       while( iHi>iLo ){
 | |
|         int iTest = (iHi + iLo) / 2;
 | |
|         assert( iTest>=iLo && iTest<iHi );
 | |
|         if( iKey>=aFts5UnicodeMap[iTest] ){
 | |
|           iRes = iTest;
 | |
|           iLo = iTest+1;
 | |
|         }else{
 | |
|           iHi = iTest;
 | |
|         }
 | |
|       }
 | |
| 
 | |
|       if( iRes<0 ) return 0;
 | |
|       if( iKey>=(aFts5UnicodeMap[iRes]+(aFts5UnicodeData[iRes]>>5)) ) return 0;
 | |
|       ret = aFts5UnicodeData[iRes] & 0x1F;
 | |
|       if( ret!=$C(LC) ) return ret;
 | |
|       return ((iKey - aFts5UnicodeMap[iRes]) & 0x01) ? $C(Ll) : $C(Lu);
 | |
|     }
 | |
| 
 | |
|     void sqlite3Fts5UnicodeAscii(u8 *aArray, u8 *aAscii){
 | |
|       int i = 0;
 | |
|       int iTbl = 0;
 | |
|       while( i<128 ){
 | |
|         int bToken = aArray[ aFts5UnicodeData[iTbl] & 0x1F ];
 | |
|         int n = (aFts5UnicodeData[iTbl] >> 5) + i;
 | |
|         for(; i<128 && i<n; i++){
 | |
|           aAscii[i] = (u8)bToken;
 | |
|         }
 | |
|         iTbl++;
 | |
|       }
 | |
|       aAscii[0] = 0;                  /* 0x00 is never a token character */
 | |
|     }
 | |
|   }]
 | |
| }
 | |
| 
 | |
| proc print_test_categories {lMap} {
 | |
| 
 | |
|   set lCP [list]
 | |
|   foreach e $lMap {
 | |
|     foreach {cp cat} $e {}
 | |
|     if {[expr 0x$cp] < (1<<20)} {
 | |
|       lappend lCP "{0x$cp, \"$cat\"}, "
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   set aCP "\n"
 | |
|   for {set i 0} {$i < [llength $lCP]} {incr i 4} {
 | |
|     append aCP "    [join [lrange $lCP $i $i+3]]\n"
 | |
|   }
 | |
| 
 | |
| 
 | |
|   puts [code {
 | |
|     static int categories_test (int *piCode){
 | |
|       struct Codepoint {
 | |
|         int iCode;
 | |
|         const char *zCat;
 | |
|       } aCP[] = {$aCP};
 | |
|       int i;
 | |
|       int iCP = 0;
 | |
| 
 | |
|       for(i=0; i<1000000; i++){
 | |
|         u8 aArray[40];
 | |
|         int cat = 0;
 | |
|         int c = 0;
 | |
|         memset(aArray, 0, sizeof(aArray));
 | |
|         if( aCP[iCP].iCode==i ){
 | |
|           sqlite3Fts5UnicodeCatParse(aCP[iCP].zCat, aArray);
 | |
|           iCP++;
 | |
|         }else{
 | |
|           aArray[0] = 1;
 | |
|         }
 | |
| 
 | |
|         c = sqlite3Fts5UnicodeCategory((u32)i);
 | |
|         if( aArray[c]==0 ){
 | |
|           *piCode = i;
 | |
|           return 1;
 | |
|         }
 | |
|       }
 | |
| 
 | |
|       return 0;
 | |
|     }
 | |
|   }]
 | |
| }
 | |
| 
 | |
| proc print_fold_test {zFunc mappings} {
 | |
|   global tl_lookup_table
 | |
| 
 | |
|   foreach m $mappings {
 | |
|     set c [lindex $m 1]
 | |
|     if {$c == ""} {
 | |
|       set extra([lindex $m 0]) 0
 | |
|     } else {
 | |
|       scan $c %c i
 | |
|       set extra([lindex $m 0]) $i
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   puts "static int fold_test(int *piCode)\{"
 | |
|   puts -nonewline "  static int aLookup\[\] = \{"
 | |
|   for {set i 0} {$i < 70000} {incr i} {
 | |
| 
 | |
|     set expected $i
 | |
|     catch { set expected $tl_lookup_table($i) }
 | |
|     set expected2 $expected
 | |
|     catch { set expected2 $extra($expected2) }
 | |
| 
 | |
|     if {($i % 4)==0}  { puts "" ; puts -nonewline "    " }
 | |
|     puts -nonewline "$expected, $expected2, "
 | |
|   }
 | |
|   puts "  \};"
 | |
|   puts "  int i;"
 | |
|   puts "  for(i=0; i<sizeof(aLookup)/sizeof(aLookup\[0\]); i++)\{"
 | |
|   puts "    int iCode = (i/2);"
 | |
|   puts "    int bFlag = i & 0x0001;"
 | |
|   puts "    if( ${zFunc}\(iCode, bFlag)!=aLookup\[i\] )\{"
 | |
|   puts "      *piCode = iCode;"
 | |
|   puts "      return 1;"
 | |
|   puts "    \}"
 | |
|   puts "  \}"
 | |
|   puts "  return 0;"
 | |
|   puts "\}"
 | |
| }
 | |
| 
 | |
| 
 | |
| proc print_fileheader {} {
 | |
|   puts [string trim {
 | |
| /*
 | |
| ** 2012-05-25
 | |
| **
 | |
| ** The author disclaims copyright to this source code.  In place of
 | |
| ** a legal notice, here is a blessing:
 | |
| **
 | |
| **    May you do good and not evil.
 | |
| **    May you find forgiveness for yourself and forgive others.
 | |
| **    May you share freely, never taking more than you give.
 | |
| **
 | |
| ******************************************************************************
 | |
| */
 | |
| 
 | |
| /*
 | |
| ** DO NOT EDIT THIS MACHINE GENERATED FILE.
 | |
| */
 | |
|   }]
 | |
|   puts ""
 | |
|   if {$::generate_fts5_code} {
 | |
|     # no-op
 | |
|   } else {
 | |
|     puts "#ifndef SQLITE_DISABLE_FTS3_UNICODE"
 | |
|     puts "#if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)"
 | |
|   }
 | |
|   puts ""
 | |
|   puts "#include <assert.h>"
 | |
|   puts ""
 | |
| }
 | |
| 
 | |
| proc print_test_main {} {
 | |
|   puts ""
 | |
|   puts "#include <stdio.h>"
 | |
|   puts ""
 | |
|   puts "int main(int argc, char **argv)\{"
 | |
|   puts "  int r1, r2, r3;"
 | |
|   puts "  int code;"
 | |
|   puts "  r3 = 0;"
 | |
|   puts "  r1 = isalnum_test(&code);"
 | |
|   puts "  if( r1 ) printf(\"isalnum(): Problem with code %d\\n\",code);"
 | |
|   puts "  else printf(\"isalnum(): test passed\\n\");"
 | |
|   puts "  r2 = fold_test(&code);"
 | |
|   puts "  if( r2 ) printf(\"fold(): Problem with code %d\\n\",code);"
 | |
|   puts "  else printf(\"fold(): test passed\\n\");"
 | |
|   if {$::generate_fts5_code} {
 | |
|     puts "  r3 = categories_test(&code);"
 | |
|     puts "  if( r3 ) printf(\"categories(): Problem with code %d\\n\",code);"
 | |
|     puts "  else printf(\"categories(): test passed\\n\");"
 | |
|   }
 | |
|   puts "  return (r1 || r2 || r3);"
 | |
|   puts "\}"
 | |
| }
 | |
| 
 | |
| # Proces the command line arguments. Exit early if they are not to
 | |
| # our liking.
 | |
| #
 | |
| proc usage {} {
 | |
|   puts -nonewline stderr "Usage: $::argv0 ?-test? ?-fts5? "
 | |
|   puts            stderr "<CaseFolding.txt file> <UnicodeData.txt file>"
 | |
|   exit 1
 | |
| }
 | |
| if {[llength $argv]<2} usage
 | |
| set unicodedata.txt [lindex $argv end]
 | |
| set casefolding.txt [lindex $argv end-1]
 | |
| 
 | |
| set remove_diacritic remove_diacritic
 | |
| set generate_test_code 0
 | |
| set generate_fts5_code 0
 | |
| set function_prefix "sqlite3Fts"
 | |
| for {set i 0} {$i < [llength $argv]-2} {incr i} {
 | |
|   switch -- [lindex $argv $i] {
 | |
|     -test {
 | |
|       set generate_test_code 1
 | |
|     }
 | |
|     -fts5 {
 | |
|       set function_prefix sqlite3Fts5
 | |
|       set generate_fts5_code 1
 | |
|       set remove_diacritic fts5_remove_diacritic
 | |
|     }
 | |
|     default {
 | |
|       usage
 | |
|     }
 | |
|   }
 | |
| }
 | |
| 
 | |
| print_fileheader
 | |
| 
 | |
| if {$::generate_test_code} {
 | |
|   puts "typedef unsigned short int u16;"
 | |
|   puts "typedef unsigned char u8;"
 | |
|   puts "#include <string.h>"
 | |
| }
 | |
| 
 | |
| # Print the isalnum() function to stdout.
 | |
| #
 | |
| set lRange [an_load_separator_ranges]
 | |
| if {$generate_fts5_code==0} {
 | |
|   print_isalnum ${function_prefix}UnicodeIsalnum $lRange
 | |
| }
 | |
| 
 | |
| # Leave a gap between the two generated C functions.
 | |
| #
 | |
| puts ""
 | |
| puts ""
 | |
| 
 | |
| # Load the fold data. This is used by the [rd_XXX] commands
 | |
| # as well as [print_fold].
 | |
| tl_load_casefolding_txt ${casefolding.txt}
 | |
| 
 | |
| set mappings [rd_load_unicodedata_text ${unicodedata.txt}]
 | |
| print_rd $mappings
 | |
| puts ""
 | |
| puts ""
 | |
| print_isdiacritic ${function_prefix}UnicodeIsdiacritic $mappings
 | |
| puts ""
 | |
| puts ""
 | |
| 
 | |
| # Print the fold() function to stdout.
 | |
| #
 | |
| print_fold ${function_prefix}UnicodeFold
 | |
| 
 | |
| if {$generate_fts5_code} {
 | |
|   puts ""
 | |
|   puts ""
 | |
|   print_categories [cc_load_unicodedata_text ${unicodedata.txt}]
 | |
| }
 | |
| 
 | |
| # Print the test routines and main() function to stdout, if -test 
 | |
| # was specified.
 | |
| #
 | |
| if {$::generate_test_code} {
 | |
|   if {$generate_fts5_code==0} {
 | |
|     print_test_isalnum ${function_prefix}UnicodeIsalnum $lRange
 | |
|   }
 | |
|   print_fold_test ${function_prefix}UnicodeFold $mappings
 | |
|   print_test_categories [cc_load_unicodedata_text ${unicodedata.txt}]
 | |
|   print_test_main 
 | |
| }
 | |
| 
 | |
| if {$generate_fts5_code} {
 | |
|   # no-op
 | |
| } else {
 | |
|   puts "#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */"
 | |
|   puts "#endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */"
 | |
| }
 |