mirror of
https://github.com/tursodatabase/libsql.git
synced 2025-03-08 23:41:50 +00:00
A common complaint with libSQL is how to run extensions. The main mechanism, loading a .so, has a lot of issues around how those .so files are distributed. The most common extensions are the ones in the sqlean package. We can improve this experience by bundling them in our sqlite build. Not all SQLean extensions are kosher: some of them, like fileio, use the VFS. Others are deemed too complex. The extensions included here are a subset that we deem important enough, and low-risk enough, to just be a part of the main bundle.
611 lines
26 KiB
C
611 lines
26 KiB
C
// Originally from the spellfix SQLite extension, Public Domain
|
||
// https://www.sqlite.org/src/file/ext/misc/spellfix.c
|
||
// Modified by Anton Zhiyanov, https://github.com/nalgeon/sqlean/, MIT License
|
||
|
||
#include <stdlib.h>
|
||
|
||
#include "fuzzy/common.h"
|
||
|
||
extern const unsigned char midClass[];
|
||
extern const unsigned char initClass[];
|
||
extern const unsigned char className[];
|
||
|
||
/*
** Payload bits contributed by the first byte of a multi-byte UTF-8
** character, indexed by (first byte - 0xC0):
**
**   0xC0..0xDF  ->  low 5 bits of the byte
**   0xE0..0xEF  ->  low 4 bits of the byte
**   0xF0..0xF7  ->  low 3 bits of the byte
**   0xF8..0xFF  ->  low 2, 1 or 0 bits of the byte
*/
static const unsigned char translit_utf8_lookup[] = {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};

/*
** Decode the first UTF-8 character of the n-byte buffer z.
**
** The decoded value is returned and the number of bytes consumed is
** written to *pSize.
**
**   - If n <= 0 nothing is read: the result is 0 and *pSize is 0.
**   - A first byte below 0xC0 is returned unchanged with *pSize == 1.
**   - Otherwise the first byte's payload bits are taken from
**     translit_utf8_lookup[] and every following continuation byte
**     (10xxxxxx) that lies inside the buffer is folded in, 6 bits each.
**
** Malformed input is not rejected: a truncated sequence yields whatever
** bits were available, and an over-long run of continuation bytes is
** consumed in full. The value is accumulated in an unsigned int so that
** such a run wraps around instead of overflowing a signed int.
*/
static int utf8Read(const unsigned char* z, int n, int* pSize) {
    unsigned int c;
    int i;

    if (n <= 0) {
        *pSize = 0;
        return 0;
    }

    c = z[0];
    i = 1;
    if (c >= 0xc0) {
        c = translit_utf8_lookup[c - 0xc0];
        while (i < n && (z[i] & 0xc0) == 0x80) {
            c = (c << 6) + (0x3fu & z[i++]);
        }
    }
    *pSize = i;
    return (int)c;
}
|
||
|
||
/*
** One row of the transliteration table: the source character (cFrom)
** and its ASCII replacement, held as up to four bytes cTo0..cTo3.
** A 0x00 byte marks the end of a replacement shorter than four bytes.
*/
typedef struct Transliteration {
    unsigned short int cFrom; /* character to be replaced */
    unsigned char cTo0;       /* first replacement byte */
    unsigned char cTo1;       /* second replacement byte, or 0x00 */
    unsigned char cTo2;       /* third replacement byte, or 0x00 */
    unsigned char cTo3;       /* fourth replacement byte, or 0x00 */
} Transliteration;
|
||
|
||
/*
|
||
** Table of translations from unicode characters into ASCII.
|
||
*/
|
||
static const Transliteration translit[] = {
|
||
{0x00A0, 0x20, 0x00, 0x00, 0x00}, /* NBSP (U+00A0) to space */
|
||
{0x00B5, 0x75, 0x00, 0x00, 0x00}, /* µ to u */
|
||
{0x00C0, 0x41, 0x00, 0x00, 0x00}, /* À to A */
|
||
{0x00C1, 0x41, 0x00, 0x00, 0x00}, /* Á to A */
|
||
{0x00C2, 0x41, 0x00, 0x00, 0x00}, /* Â to A */
|
||
{0x00C3, 0x41, 0x00, 0x00, 0x00}, /* Ã to A */
|
||
{0x00C4, 0x41, 0x65, 0x00, 0x00}, /* Ä to Ae */
|
||
{0x00C5, 0x41, 0x61, 0x00, 0x00}, /* Å to Aa */
|
||
{0x00C6, 0x41, 0x45, 0x00, 0x00}, /* Æ to AE */
|
||
{0x00C7, 0x43, 0x00, 0x00, 0x00}, /* Ç to C */
|
||
{0x00C8, 0x45, 0x00, 0x00, 0x00}, /* È to E */
|
||
{0x00C9, 0x45, 0x00, 0x00, 0x00}, /* É to E */
|
||
{0x00CA, 0x45, 0x00, 0x00, 0x00}, /* Ê to E */
|
||
{0x00CB, 0x45, 0x00, 0x00, 0x00}, /* Ë to E */
|
||
{0x00CC, 0x49, 0x00, 0x00, 0x00}, /* Ì to I */
|
||
{0x00CD, 0x49, 0x00, 0x00, 0x00}, /* Í to I */
|
||
{0x00CE, 0x49, 0x00, 0x00, 0x00}, /* Î to I */
|
||
{0x00CF, 0x49, 0x00, 0x00, 0x00}, /* Ï to I */
|
||
{0x00D0, 0x44, 0x00, 0x00, 0x00}, /* Ð to D */
|
||
{0x00D1, 0x4E, 0x00, 0x00, 0x00}, /* Ñ to N */
|
||
{0x00D2, 0x4F, 0x00, 0x00, 0x00}, /* Ò to O */
|
||
{0x00D3, 0x4F, 0x00, 0x00, 0x00}, /* Ó to O */
|
||
{0x00D4, 0x4F, 0x00, 0x00, 0x00}, /* Ô to O */
|
||
{0x00D5, 0x4F, 0x00, 0x00, 0x00}, /* Õ to O */
|
||
{0x00D6, 0x4F, 0x65, 0x00, 0x00}, /* Ö to Oe */
|
||
{0x00D7, 0x78, 0x00, 0x00, 0x00}, /* × to x */
|
||
{0x00D8, 0x4F, 0x00, 0x00, 0x00}, /* Ø to O */
|
||
{0x00D9, 0x55, 0x00, 0x00, 0x00}, /* Ù to U */
|
||
{0x00DA, 0x55, 0x00, 0x00, 0x00}, /* Ú to U */
|
||
{0x00DB, 0x55, 0x00, 0x00, 0x00}, /* Û to U */
|
||
{0x00DC, 0x55, 0x65, 0x00, 0x00}, /* Ü to Ue */
|
||
{0x00DD, 0x59, 0x00, 0x00, 0x00}, /* Ý to Y */
|
||
{0x00DE, 0x54, 0x68, 0x00, 0x00}, /* Þ to Th */
|
||
{0x00DF, 0x73, 0x73, 0x00, 0x00}, /* ß to ss */
|
||
{0x00E0, 0x61, 0x00, 0x00, 0x00}, /* à to a */
|
||
{0x00E1, 0x61, 0x00, 0x00, 0x00}, /* á to a */
|
||
{0x00E2, 0x61, 0x00, 0x00, 0x00}, /* â to a */
|
||
{0x00E3, 0x61, 0x00, 0x00, 0x00}, /* ã to a */
|
||
{0x00E4, 0x61, 0x65, 0x00, 0x00}, /* ä to ae */
|
||
{0x00E5, 0x61, 0x61, 0x00, 0x00}, /* å to aa */
|
||
{0x00E6, 0x61, 0x65, 0x00, 0x00}, /* æ to ae */
|
||
{0x00E7, 0x63, 0x00, 0x00, 0x00}, /* ç to c */
|
||
{0x00E8, 0x65, 0x00, 0x00, 0x00}, /* è to e */
|
||
{0x00E9, 0x65, 0x00, 0x00, 0x00}, /* é to e */
|
||
{0x00EA, 0x65, 0x00, 0x00, 0x00}, /* ê to e */
|
||
{0x00EB, 0x65, 0x00, 0x00, 0x00}, /* ë to e */
|
||
{0x00EC, 0x69, 0x00, 0x00, 0x00}, /* ì to i */
|
||
{0x00ED, 0x69, 0x00, 0x00, 0x00}, /* í to i */
|
||
{0x00EE, 0x69, 0x00, 0x00, 0x00}, /* î to i */
|
||
{0x00EF, 0x69, 0x00, 0x00, 0x00}, /* ï to i */
|
||
{0x00F0, 0x64, 0x00, 0x00, 0x00}, /* ð to d */
|
||
{0x00F1, 0x6E, 0x00, 0x00, 0x00}, /* ñ to n */
|
||
{0x00F2, 0x6F, 0x00, 0x00, 0x00}, /* ò to o */
|
||
{0x00F3, 0x6F, 0x00, 0x00, 0x00}, /* ó to o */
|
||
{0x00F4, 0x6F, 0x00, 0x00, 0x00}, /* ô to o */
|
||
{0x00F5, 0x6F, 0x00, 0x00, 0x00}, /* õ to o */
|
||
{0x00F6, 0x6F, 0x65, 0x00, 0x00}, /* ö to oe */
|
||
{0x00F7, 0x3A, 0x00, 0x00, 0x00}, /* ÷ to : */
|
||
{0x00F8, 0x6F, 0x00, 0x00, 0x00}, /* ø to o */
|
||
{0x00F9, 0x75, 0x00, 0x00, 0x00}, /* ù to u */
|
||
{0x00FA, 0x75, 0x00, 0x00, 0x00}, /* ú to u */
|
||
{0x00FB, 0x75, 0x00, 0x00, 0x00}, /* û to u */
|
||
{0x00FC, 0x75, 0x65, 0x00, 0x00}, /* ü to ue */
|
||
{0x00FD, 0x79, 0x00, 0x00, 0x00}, /* ý to y */
|
||
{0x00FE, 0x74, 0x68, 0x00, 0x00}, /* þ to th */
|
||
{0x00FF, 0x79, 0x00, 0x00, 0x00}, /* ÿ to y */
|
||
{0x0100, 0x41, 0x00, 0x00, 0x00}, /* Ā to A */
|
||
{0x0101, 0x61, 0x00, 0x00, 0x00}, /* ā to a */
|
||
{0x0102, 0x41, 0x00, 0x00, 0x00}, /* Ă to A */
|
||
{0x0103, 0x61, 0x00, 0x00, 0x00}, /* ă to a */
|
||
{0x0104, 0x41, 0x00, 0x00, 0x00}, /* Ą to A */
|
||
{0x0105, 0x61, 0x00, 0x00, 0x00}, /* ą to a */
|
||
{0x0106, 0x43, 0x00, 0x00, 0x00}, /* Ć to C */
|
||
{0x0107, 0x63, 0x00, 0x00, 0x00}, /* ć to c */
|
||
{0x0108, 0x43, 0x68, 0x00, 0x00}, /* Ĉ to Ch */
|
||
{0x0109, 0x63, 0x68, 0x00, 0x00}, /* ĉ to ch */
|
||
{0x010A, 0x43, 0x00, 0x00, 0x00}, /* Ċ to C */
|
||
{0x010B, 0x63, 0x00, 0x00, 0x00}, /* ċ to c */
|
||
{0x010C, 0x43, 0x00, 0x00, 0x00}, /* Č to C */
|
||
{0x010D, 0x63, 0x00, 0x00, 0x00}, /* č to c */
|
||
{0x010E, 0x44, 0x00, 0x00, 0x00}, /* Ď to D */
|
||
{0x010F, 0x64, 0x00, 0x00, 0x00}, /* ď to d */
|
||
{0x0110, 0x44, 0x00, 0x00, 0x00}, /* Đ to D */
|
||
{0x0111, 0x64, 0x00, 0x00, 0x00}, /* đ to d */
|
||
{0x0112, 0x45, 0x00, 0x00, 0x00}, /* Ē to E */
|
||
{0x0113, 0x65, 0x00, 0x00, 0x00}, /* ē to e */
|
||
{0x0114, 0x45, 0x00, 0x00, 0x00}, /* Ĕ to E */
|
||
{0x0115, 0x65, 0x00, 0x00, 0x00}, /* ĕ to e */
|
||
{0x0116, 0x45, 0x00, 0x00, 0x00}, /* Ė to E */
|
||
{0x0117, 0x65, 0x00, 0x00, 0x00}, /* ė to e */
|
||
{0x0118, 0x45, 0x00, 0x00, 0x00}, /* Ę to E */
|
||
{0x0119, 0x65, 0x00, 0x00, 0x00}, /* ę to e */
|
||
{0x011A, 0x45, 0x00, 0x00, 0x00}, /* Ě to E */
|
||
{0x011B, 0x65, 0x00, 0x00, 0x00}, /* ě to e */
|
||
{0x011C, 0x47, 0x68, 0x00, 0x00}, /* Ĝ to Gh */
|
||
{0x011D, 0x67, 0x68, 0x00, 0x00}, /* ĝ to gh */
|
||
{0x011E, 0x47, 0x00, 0x00, 0x00}, /* Ğ to G */
|
||
{0x011F, 0x67, 0x00, 0x00, 0x00}, /* ğ to g */
|
||
{0x0120, 0x47, 0x00, 0x00, 0x00}, /* Ġ to G */
|
||
{0x0121, 0x67, 0x00, 0x00, 0x00}, /* ġ to g */
|
||
{0x0122, 0x47, 0x00, 0x00, 0x00}, /* Ģ to G */
|
||
{0x0123, 0x67, 0x00, 0x00, 0x00}, /* ģ to g */
|
||
{0x0124, 0x48, 0x68, 0x00, 0x00}, /* Ĥ to Hh */
|
||
{0x0125, 0x68, 0x68, 0x00, 0x00}, /* ĥ to hh */
|
||
{0x0126, 0x48, 0x00, 0x00, 0x00}, /* Ħ to H */
|
||
{0x0127, 0x68, 0x00, 0x00, 0x00}, /* ħ to h */
|
||
{0x0128, 0x49, 0x00, 0x00, 0x00}, /* Ĩ to I */
|
||
{0x0129, 0x69, 0x00, 0x00, 0x00}, /* ĩ to i */
|
||
{0x012A, 0x49, 0x00, 0x00, 0x00}, /* Ī to I */
|
||
{0x012B, 0x69, 0x00, 0x00, 0x00}, /* ī to i */
|
||
{0x012C, 0x49, 0x00, 0x00, 0x00}, /* Ĭ to I */
|
||
{0x012D, 0x69, 0x00, 0x00, 0x00}, /* ĭ to i */
|
||
{0x012E, 0x49, 0x00, 0x00, 0x00}, /* Į to I */
|
||
{0x012F, 0x69, 0x00, 0x00, 0x00}, /* į to i */
|
||
{0x0130, 0x49, 0x00, 0x00, 0x00}, /* İ to I */
|
||
{0x0131, 0x69, 0x00, 0x00, 0x00}, /* ı to i */
|
||
{0x0132, 0x49, 0x4A, 0x00, 0x00}, /* IJ to IJ */
|
||
{0x0133, 0x69, 0x6A, 0x00, 0x00}, /* ij to ij */
|
||
{0x0134, 0x4A, 0x68, 0x00, 0x00}, /* Ĵ to Jh */
|
||
{0x0135, 0x6A, 0x68, 0x00, 0x00}, /* ĵ to jh */
|
||
{0x0136, 0x4B, 0x00, 0x00, 0x00}, /* Ķ to K */
|
||
{0x0137, 0x6B, 0x00, 0x00, 0x00}, /* ķ to k */
|
||
{0x0138, 0x6B, 0x00, 0x00, 0x00}, /* ĸ to k */
|
||
{0x0139, 0x4C, 0x00, 0x00, 0x00}, /* Ĺ to L */
|
||
{0x013A, 0x6C, 0x00, 0x00, 0x00}, /* ĺ to l */
|
||
{0x013B, 0x4C, 0x00, 0x00, 0x00}, /* Ļ to L */
|
||
{0x013C, 0x6C, 0x00, 0x00, 0x00}, /* ļ to l */
|
||
{0x013D, 0x4C, 0x00, 0x00, 0x00}, /* Ľ to L */
|
||
{0x013E, 0x6C, 0x00, 0x00, 0x00}, /* ľ to l */
|
||
{0x013F, 0x4C, 0x2E, 0x00, 0x00}, /* Ŀ to L. */
|
||
{0x0140, 0x6C, 0x2E, 0x00, 0x00}, /* ŀ to l. */
|
||
{0x0141, 0x4C, 0x00, 0x00, 0x00}, /* Ł to L */
|
||
{0x0142, 0x6C, 0x00, 0x00, 0x00}, /* ł to l */
|
||
{0x0143, 0x4E, 0x00, 0x00, 0x00}, /* Ń to N */
|
||
{0x0144, 0x6E, 0x00, 0x00, 0x00}, /* ń to n */
|
||
{0x0145, 0x4E, 0x00, 0x00, 0x00}, /* Ņ to N */
|
||
{0x0146, 0x6E, 0x00, 0x00, 0x00}, /* ņ to n */
|
||
{0x0147, 0x4E, 0x00, 0x00, 0x00}, /* Ň to N */
|
||
{0x0148, 0x6E, 0x00, 0x00, 0x00}, /* ň to n */
|
||
{0x0149, 0x27, 0x6E, 0x00, 0x00}, /* ʼn to 'n */
|
||
{0x014A, 0x4E, 0x47, 0x00, 0x00}, /* Ŋ to NG */
|
||
{0x014B, 0x6E, 0x67, 0x00, 0x00}, /* ŋ to ng */
|
||
{0x014C, 0x4F, 0x00, 0x00, 0x00}, /* Ō to O */
|
||
{0x014D, 0x6F, 0x00, 0x00, 0x00}, /* ō to o */
|
||
{0x014E, 0x4F, 0x00, 0x00, 0x00}, /* Ŏ to O */
|
||
{0x014F, 0x6F, 0x00, 0x00, 0x00}, /* ŏ to o */
|
||
{0x0150, 0x4F, 0x00, 0x00, 0x00}, /* Ő to O */
|
||
{0x0151, 0x6F, 0x00, 0x00, 0x00}, /* ő to o */
|
||
{0x0152, 0x4F, 0x45, 0x00, 0x00}, /* Œ to OE */
|
||
{0x0153, 0x6F, 0x65, 0x00, 0x00}, /* œ to oe */
|
||
{0x0154, 0x52, 0x00, 0x00, 0x00}, /* Ŕ to R */
|
||
{0x0155, 0x72, 0x00, 0x00, 0x00}, /* ŕ to r */
|
||
{0x0156, 0x52, 0x00, 0x00, 0x00}, /* Ŗ to R */
|
||
{0x0157, 0x72, 0x00, 0x00, 0x00}, /* ŗ to r */
|
||
{0x0158, 0x52, 0x00, 0x00, 0x00}, /* Ř to R */
|
||
{0x0159, 0x72, 0x00, 0x00, 0x00}, /* ř to r */
|
||
{0x015A, 0x53, 0x00, 0x00, 0x00}, /* Ś to S */
|
||
{0x015B, 0x73, 0x00, 0x00, 0x00}, /* ś to s */
|
||
{0x015C, 0x53, 0x68, 0x00, 0x00}, /* Ŝ to Sh */
|
||
{0x015D, 0x73, 0x68, 0x00, 0x00}, /* ŝ to sh */
|
||
{0x015E, 0x53, 0x00, 0x00, 0x00}, /* Ş to S */
|
||
{0x015F, 0x73, 0x00, 0x00, 0x00}, /* ş to s */
|
||
{0x0160, 0x53, 0x00, 0x00, 0x00}, /* Š to S */
|
||
{0x0161, 0x73, 0x00, 0x00, 0x00}, /* š to s */
|
||
{0x0162, 0x54, 0x00, 0x00, 0x00}, /* Ţ to T */
|
||
{0x0163, 0x74, 0x00, 0x00, 0x00}, /* ţ to t */
|
||
{0x0164, 0x54, 0x00, 0x00, 0x00}, /* Ť to T */
|
||
{0x0165, 0x74, 0x00, 0x00, 0x00}, /* ť to t */
|
||
{0x0166, 0x54, 0x00, 0x00, 0x00}, /* Ŧ to T */
|
||
{0x0167, 0x74, 0x00, 0x00, 0x00}, /* ŧ to t */
|
||
{0x0168, 0x55, 0x00, 0x00, 0x00}, /* Ũ to U */
|
||
{0x0169, 0x75, 0x00, 0x00, 0x00}, /* ũ to u */
|
||
{0x016A, 0x55, 0x00, 0x00, 0x00}, /* Ū to U */
|
||
{0x016B, 0x75, 0x00, 0x00, 0x00}, /* ū to u */
|
||
{0x016C, 0x55, 0x00, 0x00, 0x00}, /* Ŭ to U */
|
||
{0x016D, 0x75, 0x00, 0x00, 0x00}, /* ŭ to u */
|
||
{0x016E, 0x55, 0x00, 0x00, 0x00}, /* Ů to U */
|
||
{0x016F, 0x75, 0x00, 0x00, 0x00}, /* ů to u */
|
||
{0x0170, 0x55, 0x00, 0x00, 0x00}, /* Ű to U */
|
||
{0x0171, 0x75, 0x00, 0x00, 0x00}, /* ű to u */
|
||
{0x0172, 0x55, 0x00, 0x00, 0x00}, /* Ų to U */
|
||
{0x0173, 0x75, 0x00, 0x00, 0x00}, /* ų to u */
|
||
{0x0174, 0x57, 0x00, 0x00, 0x00}, /* Ŵ to W */
|
||
{0x0175, 0x77, 0x00, 0x00, 0x00}, /* ŵ to w */
|
||
{0x0176, 0x59, 0x00, 0x00, 0x00}, /* Ŷ to Y */
|
||
{0x0177, 0x79, 0x00, 0x00, 0x00}, /* ŷ to y */
|
||
{0x0178, 0x59, 0x00, 0x00, 0x00}, /* Ÿ to Y */
|
||
{0x0179, 0x5A, 0x00, 0x00, 0x00}, /* Ź to Z */
|
||
{0x017A, 0x7A, 0x00, 0x00, 0x00}, /* ź to z */
|
||
{0x017B, 0x5A, 0x00, 0x00, 0x00}, /* Ż to Z */
|
||
{0x017C, 0x7A, 0x00, 0x00, 0x00}, /* ż to z */
|
||
{0x017D, 0x5A, 0x00, 0x00, 0x00}, /* Ž to Z */
|
||
{0x017E, 0x7A, 0x00, 0x00, 0x00}, /* ž to z */
|
||
{0x017F, 0x73, 0x00, 0x00, 0x00}, /* ſ to s */
|
||
{0x0192, 0x66, 0x00, 0x00, 0x00}, /* ƒ to f */
|
||
{0x0218, 0x53, 0x00, 0x00, 0x00}, /* Ș to S */
|
||
{0x0219, 0x73, 0x00, 0x00, 0x00}, /* ș to s */
|
||
{0x021A, 0x54, 0x00, 0x00, 0x00}, /* Ț to T */
|
||
{0x021B, 0x74, 0x00, 0x00, 0x00}, /* ț to t */
|
||
{0x0386, 0x41, 0x00, 0x00, 0x00}, /* Ά to A */
|
||
{0x0388, 0x45, 0x00, 0x00, 0x00}, /* Έ to E */
|
||
{0x0389, 0x49, 0x00, 0x00, 0x00}, /* Ή to I */
|
||
{0x038A, 0x49, 0x00, 0x00, 0x00}, /* Ί to I */
|
||
{0x038C, 0x4f, 0x00, 0x00, 0x00}, /* Ό to O */
|
||
{0x038E, 0x59, 0x00, 0x00, 0x00}, /* Ύ to Y */
|
||
{0x038F, 0x4f, 0x00, 0x00, 0x00}, /* Ώ to O */
|
||
{0x0390, 0x69, 0x00, 0x00, 0x00}, /* ΐ to i */
|
||
{0x0391, 0x41, 0x00, 0x00, 0x00}, /* Α to A */
|
||
{0x0392, 0x42, 0x00, 0x00, 0x00}, /* Β to B */
|
||
{0x0393, 0x47, 0x00, 0x00, 0x00}, /* Γ to G */
|
||
{0x0394, 0x44, 0x00, 0x00, 0x00}, /* Δ to D */
|
||
{0x0395, 0x45, 0x00, 0x00, 0x00}, /* Ε to E */
|
||
{0x0396, 0x5a, 0x00, 0x00, 0x00}, /* Ζ to Z */
|
||
{0x0397, 0x49, 0x00, 0x00, 0x00}, /* Η to I */
|
||
{0x0398, 0x54, 0x68, 0x00, 0x00}, /* Θ to Th */
|
||
{0x0399, 0x49, 0x00, 0x00, 0x00}, /* Ι to I */
|
||
{0x039A, 0x4b, 0x00, 0x00, 0x00}, /* Κ to K */
|
||
{0x039B, 0x4c, 0x00, 0x00, 0x00}, /* Λ to L */
|
||
{0x039C, 0x4d, 0x00, 0x00, 0x00}, /* Μ to M */
|
||
{0x039D, 0x4e, 0x00, 0x00, 0x00}, /* Ν to N */
|
||
{0x039E, 0x58, 0x00, 0x00, 0x00}, /* Ξ to X */
|
||
{0x039F, 0x4f, 0x00, 0x00, 0x00}, /* Ο to O */
|
||
{0x03A0, 0x50, 0x00, 0x00, 0x00}, /* Π to P */
|
||
{0x03A1, 0x52, 0x00, 0x00, 0x00}, /* Ρ to R */
|
||
{0x03A3, 0x53, 0x00, 0x00, 0x00}, /* Σ to S */
|
||
{0x03A4, 0x54, 0x00, 0x00, 0x00}, /* Τ to T */
|
||
{0x03A5, 0x59, 0x00, 0x00, 0x00}, /* Υ to Y */
|
||
{0x03A6, 0x46, 0x00, 0x00, 0x00}, /* Φ to F */
|
||
{0x03A7, 0x43, 0x68, 0x00, 0x00}, /* Χ to Ch */
|
||
{0x03A8, 0x50, 0x73, 0x00, 0x00}, /* Ψ to Ps */
|
||
{0x03A9, 0x4f, 0x00, 0x00, 0x00}, /* Ω to O */
|
||
{0x03AA, 0x49, 0x00, 0x00, 0x00}, /* Ϊ to I */
|
||
{0x03AB, 0x59, 0x00, 0x00, 0x00}, /* Ϋ to Y */
|
||
{0x03AC, 0x61, 0x00, 0x00, 0x00}, /* ά to a */
|
||
{0x03AD, 0x65, 0x00, 0x00, 0x00}, /* έ to e */
|
||
{0x03AE, 0x69, 0x00, 0x00, 0x00}, /* ή to i */
|
||
{0x03AF, 0x69, 0x00, 0x00, 0x00}, /* ί to i */
|
||
{0x03B1, 0x61, 0x00, 0x00, 0x00}, /* α to a */
|
||
{0x03B2, 0x62, 0x00, 0x00, 0x00}, /* β to b */
|
||
{0x03B3, 0x67, 0x00, 0x00, 0x00}, /* γ to g */
|
||
{0x03B4, 0x64, 0x00, 0x00, 0x00}, /* δ to d */
|
||
{0x03B5, 0x65, 0x00, 0x00, 0x00}, /* ε to e */
|
||
{0x03B6, 0x7a, 0x00, 0x00, 0x00}, /* ζ to z */
|
||
{0x03B7, 0x69, 0x00, 0x00, 0x00}, /* η to i */
|
||
{0x03B8, 0x74, 0x68, 0x00, 0x00}, /* θ to th */
|
||
{0x03B9, 0x69, 0x00, 0x00, 0x00}, /* ι to i */
|
||
{0x03BA, 0x6b, 0x00, 0x00, 0x00}, /* κ to k */
|
||
{0x03BB, 0x6c, 0x00, 0x00, 0x00}, /* λ to l */
|
||
{0x03BC, 0x6d, 0x00, 0x00, 0x00}, /* μ to m */
|
||
{0x03BD, 0x6e, 0x00, 0x00, 0x00}, /* ν to n */
|
||
{0x03BE, 0x78, 0x00, 0x00, 0x00}, /* ξ to x */
|
||
{0x03BF, 0x6f, 0x00, 0x00, 0x00}, /* ο to o */
|
||
{0x03C0, 0x70, 0x00, 0x00, 0x00}, /* π to p */
|
||
{0x03C1, 0x72, 0x00, 0x00, 0x00}, /* ρ to r */
|
||
{0x03C3, 0x73, 0x00, 0x00, 0x00}, /* σ to s */
|
||
{0x03C4, 0x74, 0x00, 0x00, 0x00}, /* τ to t */
|
||
{0x03C5, 0x79, 0x00, 0x00, 0x00}, /* υ to y */
|
||
{0x03C6, 0x66, 0x00, 0x00, 0x00}, /* φ to f */
|
||
{0x03C7, 0x63, 0x68, 0x00, 0x00}, /* χ to ch */
|
||
{0x03C8, 0x70, 0x73, 0x00, 0x00}, /* ψ to ps */
|
||
{0x03C9, 0x6f, 0x00, 0x00, 0x00}, /* ω to o */
|
||
{0x03CA, 0x69, 0x00, 0x00, 0x00}, /* ϊ to i */
|
||
{0x03CB, 0x79, 0x00, 0x00, 0x00}, /* ϋ to y */
|
||
{0x03CC, 0x6f, 0x00, 0x00, 0x00}, /* ό to o */
|
||
{0x03CD, 0x79, 0x00, 0x00, 0x00}, /* ύ to y */
|
||
{0x03CE, 0x69, 0x00, 0x00, 0x00}, /* ώ to i */
|
||
{0x0400, 0x45, 0x00, 0x00, 0x00}, /* Ѐ to E */
|
||
{0x0401, 0x45, 0x00, 0x00, 0x00}, /* Ё to E */
|
||
{0x0402, 0x44, 0x00, 0x00, 0x00}, /* Ђ to D */
|
||
{0x0403, 0x47, 0x00, 0x00, 0x00}, /* Ѓ to G */
|
||
{0x0404, 0x45, 0x00, 0x00, 0x00}, /* Є to E */
|
||
{0x0405, 0x5a, 0x00, 0x00, 0x00}, /* Ѕ to Z */
|
||
{0x0406, 0x49, 0x00, 0x00, 0x00}, /* І to I */
|
||
{0x0407, 0x49, 0x00, 0x00, 0x00}, /* Ї to I */
|
||
{0x0408, 0x4a, 0x00, 0x00, 0x00}, /* Ј to J */
|
||
{0x0409, 0x49, 0x00, 0x00, 0x00}, /* Љ to I */
|
||
{0x040A, 0x4e, 0x00, 0x00, 0x00}, /* Њ to N */
|
||
{0x040B, 0x44, 0x00, 0x00, 0x00}, /* Ћ to D */
|
||
{0x040C, 0x4b, 0x00, 0x00, 0x00}, /* Ќ to K */
|
||
{0x040D, 0x49, 0x00, 0x00, 0x00}, /* Ѝ to I */
|
||
{0x040E, 0x55, 0x00, 0x00, 0x00}, /* Ў to U */
|
||
{0x040F, 0x44, 0x00, 0x00, 0x00}, /* Џ to D */
|
||
{0x0410, 0x41, 0x00, 0x00, 0x00}, /* А to A */
|
||
{0x0411, 0x42, 0x00, 0x00, 0x00}, /* Б to B */
|
||
{0x0412, 0x56, 0x00, 0x00, 0x00}, /* В to V */
|
||
{0x0413, 0x47, 0x00, 0x00, 0x00}, /* Г to G */
|
||
{0x0414, 0x44, 0x00, 0x00, 0x00}, /* Д to D */
|
||
{0x0415, 0x45, 0x00, 0x00, 0x00}, /* Е to E */
|
||
{0x0416, 0x5a, 0x68, 0x00, 0x00}, /* Ж to Zh */
|
||
{0x0417, 0x5a, 0x00, 0x00, 0x00}, /* З to Z */
|
||
{0x0418, 0x49, 0x00, 0x00, 0x00}, /* И to I */
|
||
{0x0419, 0x49, 0x00, 0x00, 0x00}, /* Й to I */
|
||
{0x041A, 0x4b, 0x00, 0x00, 0x00}, /* К to K */
|
||
{0x041B, 0x4c, 0x00, 0x00, 0x00}, /* Л to L */
|
||
{0x041C, 0x4d, 0x00, 0x00, 0x00}, /* М to M */
|
||
{0x041D, 0x4e, 0x00, 0x00, 0x00}, /* Н to N */
|
||
{0x041E, 0x4f, 0x00, 0x00, 0x00}, /* О to O */
|
||
{0x041F, 0x50, 0x00, 0x00, 0x00}, /* П to P */
|
||
{0x0420, 0x52, 0x00, 0x00, 0x00}, /* Р to R */
|
||
{0x0421, 0x53, 0x00, 0x00, 0x00}, /* С to S */
|
||
{0x0422, 0x54, 0x00, 0x00, 0x00}, /* Т to T */
|
||
{0x0423, 0x55, 0x00, 0x00, 0x00}, /* У to U */
|
||
{0x0424, 0x46, 0x00, 0x00, 0x00}, /* Ф to F */
|
||
{0x0425, 0x4b, 0x68, 0x00, 0x00}, /* Х to Kh */
|
||
{0x0426, 0x54, 0x63, 0x00, 0x00}, /* Ц to Tc */
|
||
{0x0427, 0x43, 0x68, 0x00, 0x00}, /* Ч to Ch */
|
||
{0x0428, 0x53, 0x68, 0x00, 0x00}, /* Ш to Sh */
|
||
{0x0429, 0x53, 0x68, 0x63, 0x68}, /* Щ to Shch */
|
||
{0x042A, 0x61, 0x00, 0x00, 0x00}, /* Ъ to a (lowercase 0x61, unlike the neighbouring capitals) */
|
||
{0x042B, 0x59, 0x00, 0x00, 0x00}, /* Ы to Y */
|
||
{0x042C, 0x59, 0x00, 0x00, 0x00}, /* Ь to Y */
|
||
{0x042D, 0x45, 0x00, 0x00, 0x00}, /* Э to E */
|
||
{0x042E, 0x49, 0x75, 0x00, 0x00}, /* Ю to Iu */
|
||
{0x042F, 0x49, 0x61, 0x00, 0x00}, /* Я to Ia */
|
||
{0x0430, 0x61, 0x00, 0x00, 0x00}, /* а to a */
|
||
{0x0431, 0x62, 0x00, 0x00, 0x00}, /* б to b */
|
||
{0x0432, 0x76, 0x00, 0x00, 0x00}, /* в to v */
|
||
{0x0433, 0x67, 0x00, 0x00, 0x00}, /* г to g */
|
||
{0x0434, 0x64, 0x00, 0x00, 0x00}, /* д to d */
|
||
{0x0435, 0x65, 0x00, 0x00, 0x00}, /* е to e */
|
||
{0x0436, 0x7a, 0x68, 0x00, 0x00}, /* ж to zh */
|
||
{0x0437, 0x7a, 0x00, 0x00, 0x00}, /* з to z */
|
||
{0x0438, 0x69, 0x00, 0x00, 0x00}, /* и to i */
|
||
{0x0439, 0x69, 0x00, 0x00, 0x00}, /* й to i */
|
||
{0x043A, 0x6b, 0x00, 0x00, 0x00}, /* к to k */
|
||
{0x043B, 0x6c, 0x00, 0x00, 0x00}, /* л to l */
|
||
{0x043C, 0x6d, 0x00, 0x00, 0x00}, /* м to m */
|
||
{0x043D, 0x6e, 0x00, 0x00, 0x00}, /* н to n */
|
||
{0x043E, 0x6f, 0x00, 0x00, 0x00}, /* о to o */
|
||
{0x043F, 0x70, 0x00, 0x00, 0x00}, /* п to p */
|
||
{0x0440, 0x72, 0x00, 0x00, 0x00}, /* р to r */
|
||
{0x0441, 0x73, 0x00, 0x00, 0x00}, /* с to s */
|
||
{0x0442, 0x74, 0x00, 0x00, 0x00}, /* т to t */
|
||
{0x0443, 0x75, 0x00, 0x00, 0x00}, /* у to u */
|
||
{0x0444, 0x66, 0x00, 0x00, 0x00}, /* ф to f */
|
||
{0x0445, 0x6b, 0x68, 0x00, 0x00}, /* х to kh */
|
||
{0x0446, 0x74, 0x63, 0x00, 0x00}, /* ц to tc */
|
||
{0x0447, 0x63, 0x68, 0x00, 0x00}, /* ч to ch */
|
||
{0x0448, 0x73, 0x68, 0x00, 0x00}, /* ш to sh */
|
||
{0x0449, 0x73, 0x68, 0x63, 0x68}, /* щ to shch */
|
||
{0x044A, 0x61, 0x00, 0x00, 0x00}, /* ъ to a */
|
||
{0x044B, 0x79, 0x00, 0x00, 0x00}, /* ы to y */
|
||
{0x044C, 0x79, 0x00, 0x00, 0x00}, /* ь to y */
|
||
{0x044D, 0x65, 0x00, 0x00, 0x00}, /* э to e */
|
||
{0x044E, 0x69, 0x75, 0x00, 0x00}, /* ю to iu */
|
||
{0x044F, 0x69, 0x61, 0x00, 0x00}, /* я to ia */
|
||
{0x0450, 0x65, 0x00, 0x00, 0x00}, /* ѐ to e */
|
||
{0x0451, 0x65, 0x00, 0x00, 0x00}, /* ё to e */
|
||
{0x0452, 0x64, 0x00, 0x00, 0x00}, /* ђ to d */
|
||
{0x0453, 0x67, 0x00, 0x00, 0x00}, /* ѓ to g */
|
||
{0x0454, 0x65, 0x00, 0x00, 0x00}, /* є to e */
|
||
{0x0455, 0x7a, 0x00, 0x00, 0x00}, /* ѕ to z */
|
||
{0x0456, 0x69, 0x00, 0x00, 0x00}, /* і to i */
|
||
{0x0457, 0x69, 0x00, 0x00, 0x00}, /* ї to i */
|
||
{0x0458, 0x6a, 0x00, 0x00, 0x00}, /* ј to j */
|
||
{0x0459, 0x69, 0x00, 0x00, 0x00}, /* љ to i */
|
||
{0x045A, 0x6e, 0x00, 0x00, 0x00}, /* њ to n */
|
||
{0x045B, 0x64, 0x00, 0x00, 0x00}, /* ћ to d */
|
||
{0x045C, 0x6b, 0x00, 0x00, 0x00}, /* ќ to k */
|
||
{0x045D, 0x69, 0x00, 0x00, 0x00}, /* ѝ to i */
|
||
{0x045E, 0x75, 0x00, 0x00, 0x00}, /* ў to u */
|
||
{0x045F, 0x64, 0x00, 0x00, 0x00}, /* џ to d */
|
||
{0x1E02, 0x42, 0x00, 0x00, 0x00}, /* Ḃ to B */
|
||
{0x1E03, 0x62, 0x00, 0x00, 0x00}, /* ḃ to b */
|
||
{0x1E0A, 0x44, 0x00, 0x00, 0x00}, /* Ḋ to D */
|
||
{0x1E0B, 0x64, 0x00, 0x00, 0x00}, /* ḋ to d */
|
||
{0x1E1E, 0x46, 0x00, 0x00, 0x00}, /* Ḟ to F */
|
||
{0x1E1F, 0x66, 0x00, 0x00, 0x00}, /* ḟ to f */
|
||
{0x1E40, 0x4D, 0x00, 0x00, 0x00}, /* Ṁ to M */
|
||
{0x1E41, 0x6D, 0x00, 0x00, 0x00}, /* ṁ to m */
|
||
{0x1E56, 0x50, 0x00, 0x00, 0x00}, /* Ṗ to P */
|
||
{0x1E57, 0x70, 0x00, 0x00, 0x00}, /* ṗ to p */
|
||
{0x1E60, 0x53, 0x00, 0x00, 0x00}, /* Ṡ to S */
|
||
{0x1E61, 0x73, 0x00, 0x00, 0x00}, /* ṡ to s */
|
||
{0x1E6A, 0x54, 0x00, 0x00, 0x00}, /* Ṫ to T */
|
||
{0x1E6B, 0x74, 0x00, 0x00, 0x00}, /* ṫ to t */
|
||
{0x1E80, 0x57, 0x00, 0x00, 0x00}, /* Ẁ to W */
|
||
{0x1E81, 0x77, 0x00, 0x00, 0x00}, /* ẁ to w */
|
||
{0x1E82, 0x57, 0x00, 0x00, 0x00}, /* Ẃ to W */
|
||
{0x1E83, 0x77, 0x00, 0x00, 0x00}, /* ẃ to w */
|
||
{0x1E84, 0x57, 0x00, 0x00, 0x00}, /* Ẅ to W */
|
||
{0x1E85, 0x77, 0x00, 0x00, 0x00}, /* ẅ to w */
|
||
{0x1EF2, 0x59, 0x00, 0x00, 0x00}, /* Ỳ to Y */
|
||
{0x1EF3, 0x79, 0x00, 0x00, 0x00}, /* ỳ to y */
|
||
{0xFB00, 0x66, 0x66, 0x00, 0x00}, /* ff to ff */
|
||
{0xFB01, 0x66, 0x69, 0x00, 0x00}, /* fi to fi */
|
||
{0xFB02, 0x66, 0x6C, 0x00, 0x00}, /* fl to fl */
|
||
{0xFB05, 0x73, 0x74, 0x00, 0x00}, /* ſt to st */
|
||
{0xFB06, 0x73, 0x74, 0x00, 0x00}, /* st to st */
|
||
};
|
||
|
||
static const Transliteration* spellfixFindTranslit(int c, int* pxTop) {
|
||
*pxTop = (sizeof(translit) / sizeof(translit[0])) - 1;
|
||
return translit;
|
||
}
|
||
|
||
/*
|
||
** Convert the input string from UTF-8 into pure ASCII by converting
|
||
** all non-ASCII characters to some combination of characters in the
|
||
** ASCII subset.
|
||
**
|
||
** The returned string might contain more characters than the input.
|
||
**
|
||
** Space to hold the returned string comes from sqlite3_malloc() and
|
||
** should be freed by the caller.
|
||
*/
|
||
unsigned char* transliterate(const unsigned char* zIn, int nIn) {
|
||
unsigned char* zOut = malloc(nIn * 4 + 1);
|
||
int c, sz, nOut;
|
||
if (zOut == 0)
|
||
return 0;
|
||
nOut = 0;
|
||
while (nIn > 0) {
|
||
c = utf8Read(zIn, nIn, &sz);
|
||
zIn += sz;
|
||
nIn -= sz;
|
||
if (c <= 127) {
|
||
zOut[nOut++] = (unsigned char)c;
|
||
} else {
|
||
int xTop, xBtm, x;
|
||
const Transliteration* tbl = spellfixFindTranslit(c, &xTop);
|
||
xBtm = 0;
|
||
while (xTop >= xBtm) {
|
||
x = (xTop + xBtm) / 2;
|
||
if (tbl[x].cFrom == c) {
|
||
zOut[nOut++] = tbl[x].cTo0;
|
||
if (tbl[x].cTo1) {
|
||
zOut[nOut++] = tbl[x].cTo1;
|
||
if (tbl[x].cTo2) {
|
||
zOut[nOut++] = tbl[x].cTo2;
|
||
if (tbl[x].cTo3) {
|
||
zOut[nOut++] = tbl[x].cTo3;
|
||
}
|
||
}
|
||
}
|
||
c = 0;
|
||
break;
|
||
} else if (tbl[x].cFrom > c) {
|
||
xTop = x - 1;
|
||
} else {
|
||
xBtm = x + 1;
|
||
}
|
||
}
|
||
if (c)
|
||
zOut[nOut++] = '?';
|
||
}
|
||
}
|
||
zOut[nOut] = 0;
|
||
return zOut;
|
||
}
|
||
|
||
/*
|
||
** Return the number of characters in the shortest prefix of the input
|
||
** string that transliterates to an ASCII string nTrans bytes or longer.
|
||
** Or, if the transliteration of the input string is less than nTrans
|
||
** bytes in size, return the number of characters in the input string.
|
||
*/
|
||
int translen_to_charlen(const char* zIn, int nIn, int nTrans) {
|
||
int i, c, sz, nOut;
|
||
int nChar;
|
||
|
||
i = nOut = 0;
|
||
for (nChar = 0; i < nIn && nOut < nTrans; nChar++) {
|
||
c = utf8Read((const unsigned char*)&zIn[i], nIn - i, &sz);
|
||
i += sz;
|
||
|
||
nOut++;
|
||
if (c >= 128) {
|
||
int xTop, xBtm, x;
|
||
const Transliteration* tbl = spellfixFindTranslit(c, &xTop);
|
||
xBtm = 0;
|
||
while (xTop >= xBtm) {
|
||
x = (xTop + xBtm) / 2;
|
||
if (tbl[x].cFrom == c) {
|
||
if (tbl[x].cTo1) {
|
||
nOut++;
|
||
if (tbl[x].cTo2) {
|
||
nOut++;
|
||
if (tbl[x].cTo3) {
|
||
nOut++;
|
||
}
|
||
}
|
||
}
|
||
break;
|
||
} else if (tbl[x].cFrom > c) {
|
||
xTop = x - 1;
|
||
} else {
|
||
xBtm = x + 1;
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
return nChar;
|
||
}
|
||
|
||
/*
|
||
* Try to determine the dominant script used by the word zIn of length nIn
|
||
* and return its ISO 15924 numeric code.
|
||
*/
|
||
int script_code(const unsigned char* zIn, int nIn) {
|
||
int c, sz;
|
||
int scriptMask = 0;
|
||
int res;
|
||
int seenDigit = 0;
|
||
|
||
while (nIn > 0) {
|
||
c = utf8Read(zIn, nIn, &sz);
|
||
zIn += sz;
|
||
nIn -= sz;
|
||
if (c < 0x02af) {
|
||
if (c >= 0x80 || midClass[c & 0x7f] < CCLASS_DIGIT) {
|
||
scriptMask |= SCRIPT_LATIN;
|
||
} else if (c >= '0' && c <= '9') {
|
||
seenDigit = 1;
|
||
}
|
||
} else if (c >= 0x0400 && c <= 0x04ff) {
|
||
scriptMask |= SCRIPT_CYRILLIC;
|
||
} else if (c >= 0x0386 && c <= 0x03ce) {
|
||
scriptMask |= SCRIPT_GREEK;
|
||
} else if (c >= 0x0590 && c <= 0x05ff) {
|
||
scriptMask |= SCRIPT_HEBREW;
|
||
} else if (c >= 0x0600 && c <= 0x06ff) {
|
||
scriptMask |= SCRIPT_ARABIC;
|
||
}
|
||
}
|
||
if (scriptMask == 0 && seenDigit)
|
||
scriptMask = SCRIPT_LATIN;
|
||
switch (scriptMask) {
|
||
case 0:
|
||
res = 999;
|
||
break;
|
||
case SCRIPT_LATIN:
|
||
res = 215;
|
||
break;
|
||
case SCRIPT_CYRILLIC:
|
||
res = 220;
|
||
break;
|
||
case SCRIPT_GREEK:
|
||
res = 200;
|
||
break;
|
||
case SCRIPT_HEBREW:
|
||
res = 125;
|
||
break;
|
||
case SCRIPT_ARABIC:
|
||
res = 160;
|
||
break;
|
||
default:
|
||
res = 998;
|
||
break;
|
||
}
|
||
return res;
|
||
}
|