0
0
mirror of https://github.com/tursodatabase/libsql.git synced 2025-03-08 23:41:50 +00:00
Glauber Costa d3a156caf5 bundle SQLean extensions
A common complaint with libSQL is how to run extensions. The main
mechanism, loading a .so, has a lot of issues around how those .so
files are distributed.

The most common extensions are the ones in the sqlean package. We can
improve this experience by bundling them in our sqlite build.

Not all SQLean extensions are kosher: some of them, like fileio, use
the vfs. Others are deemed too complex.

The extensions included here are a subset that we deem important enough,
and low risk enough, to just be a part of the main bundle.
2025-01-16 22:25:16 -05:00

611 lines
26 KiB
C
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Originally from the spellfix SQLite extension, Public Domain
// https://www.sqlite.org/src/file/ext/misc/spellfix.c
// Modified by Anton Zhiyanov, https://github.com/nalgeon/sqlean/, MIT License
#include <stdlib.h>
#include "fuzzy/common.h"
extern const unsigned char midClass[];
extern const unsigned char initClass[];
extern const unsigned char className[];
/*
** Decoding aid for the lead byte of a multi-byte UTF-8 character.
** The table is indexed by (lead byte - 0xc0).  Each entry is the lead
** byte with its length-marker bits removed, i.e. the payload bits that
** the lead byte contributes to the decoded value.
*/
static const unsigned char translit_utf8_lookup[] = {
    /* 0xc0..0xc7 */ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    /* 0xc8..0xcf */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    /* 0xd0..0xd7 */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    /* 0xd8..0xdf */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    /* 0xe0..0xe7 */ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    /* 0xe8..0xef */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    /* 0xf0..0xf7 */ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    /* 0xf8..0xff */ 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};
/*
** Return the value of the first UTF-8 character in the string.
**
** z points to n bytes of text.  The number of bytes consumed is stored
** in *pSize.  A first byte below 0xc0 is returned as-is with a size of
** 1.  For a first byte of 0xc0 or above, every immediately following
** continuation byte (10xxxxxx) that lies within the first n bytes is
** folded into the value six bits at a time; no limit is placed on how
** many continuation bytes are accepted.
**
** The value is accumulated in an unsigned variable so that a malformed,
** over-long run of continuation bytes wraps around instead of
** overflowing a signed int, which would be undefined behaviour.
** Results for well-formed UTF-8 are unaffected.
*/
static int utf8Read(const unsigned char* z, int n, int* pSize) {
    unsigned int code;
    int used;
    /* All callers to this routine (in the current implementation)
    ** always have n>0. */
    if (NEVER(n == 0)) {
        *pSize = 0;
        return 0;
    }
    code = z[0];
    used = 1;
    if (code >= 0xc0) {
        /* Start from the payload bits of the lead byte, then append the
        ** low six bits of each continuation byte. */
        code = translit_utf8_lookup[code - 0xc0];
        while (used < n && (z[used] & 0xc0) == 0x80) {
            code = (code << 6) + (0x3fu & z[used++]);
        }
    }
    *pSize = used;
    return (int)code;
}
/*
** One entry of the transliteration table: a source code point and the
** ASCII bytes that replace it.  Readers of the table emit cTo0
** unconditionally and then cTo1, cTo2 and cTo3 in order, stopping at
** the first one that is zero.
*/
typedef struct Transliteration {
    unsigned short int cFrom; /* code point being replaced */
    unsigned char cTo0;       /* first replacement byte */
    unsigned char cTo1;       /* second replacement byte, or 0 */
    unsigned char cTo2;       /* third replacement byte, or 0 */
    unsigned char cTo3;       /* fourth replacement byte, or 0 */
} Transliteration;
/*
** Table of translations from unicode characters into ASCII.
**
** Each entry holds a code point (cFrom) followed by up to four ASCII
** replacement bytes (cTo0..cTo3); unused trailing bytes are 0x00.
**
** Entries are listed in ascending cFrom order.  transliterate() and
** translen_to_charlen() locate entries with a binary search on cFrom,
** so any new entry must be inserted at its sorted position.
*/
static const Transliteration translit[] = {
{0x00A0, 0x20, 0x00, 0x00, 0x00}, /* U+00A0 (no-break space) to space */
{0x00B5, 0x75, 0x00, 0x00, 0x00}, /* µ to u */
{0x00C0, 0x41, 0x00, 0x00, 0x00}, /* À to A */
{0x00C1, 0x41, 0x00, 0x00, 0x00}, /* Á to A */
{0x00C2, 0x41, 0x00, 0x00, 0x00}, /* Â to A */
{0x00C3, 0x41, 0x00, 0x00, 0x00}, /* Ã to A */
{0x00C4, 0x41, 0x65, 0x00, 0x00}, /* Ä to Ae */
{0x00C5, 0x41, 0x61, 0x00, 0x00}, /* Å to Aa */
{0x00C6, 0x41, 0x45, 0x00, 0x00}, /* Æ to AE */
{0x00C7, 0x43, 0x00, 0x00, 0x00}, /* Ç to C */
{0x00C8, 0x45, 0x00, 0x00, 0x00}, /* È to E */
{0x00C9, 0x45, 0x00, 0x00, 0x00}, /* É to E */
{0x00CA, 0x45, 0x00, 0x00, 0x00}, /* Ê to E */
{0x00CB, 0x45, 0x00, 0x00, 0x00}, /* Ë to E */
{0x00CC, 0x49, 0x00, 0x00, 0x00}, /* Ì to I */
{0x00CD, 0x49, 0x00, 0x00, 0x00}, /* Í to I */
{0x00CE, 0x49, 0x00, 0x00, 0x00}, /* Î to I */
{0x00CF, 0x49, 0x00, 0x00, 0x00}, /* Ï to I */
{0x00D0, 0x44, 0x00, 0x00, 0x00}, /* Ð to D */
{0x00D1, 0x4E, 0x00, 0x00, 0x00}, /* Ñ to N */
{0x00D2, 0x4F, 0x00, 0x00, 0x00}, /* Ò to O */
{0x00D3, 0x4F, 0x00, 0x00, 0x00}, /* Ó to O */
{0x00D4, 0x4F, 0x00, 0x00, 0x00}, /* Ô to O */
{0x00D5, 0x4F, 0x00, 0x00, 0x00}, /* Õ to O */
{0x00D6, 0x4F, 0x65, 0x00, 0x00}, /* Ö to Oe */
{0x00D7, 0x78, 0x00, 0x00, 0x00}, /* × to x */
{0x00D8, 0x4F, 0x00, 0x00, 0x00}, /* Ø to O */
{0x00D9, 0x55, 0x00, 0x00, 0x00}, /* Ù to U */
{0x00DA, 0x55, 0x00, 0x00, 0x00}, /* Ú to U */
{0x00DB, 0x55, 0x00, 0x00, 0x00}, /* Û to U */
{0x00DC, 0x55, 0x65, 0x00, 0x00}, /* Ü to Ue */
{0x00DD, 0x59, 0x00, 0x00, 0x00}, /* Ý to Y */
{0x00DE, 0x54, 0x68, 0x00, 0x00}, /* Þ to Th */
{0x00DF, 0x73, 0x73, 0x00, 0x00}, /* ß to ss */
{0x00E0, 0x61, 0x00, 0x00, 0x00}, /* à to a */
{0x00E1, 0x61, 0x00, 0x00, 0x00}, /* á to a */
{0x00E2, 0x61, 0x00, 0x00, 0x00}, /* â to a */
{0x00E3, 0x61, 0x00, 0x00, 0x00}, /* ã to a */
{0x00E4, 0x61, 0x65, 0x00, 0x00}, /* ä to ae */
{0x00E5, 0x61, 0x61, 0x00, 0x00}, /* å to aa */
{0x00E6, 0x61, 0x65, 0x00, 0x00}, /* æ to ae */
{0x00E7, 0x63, 0x00, 0x00, 0x00}, /* ç to c */
{0x00E8, 0x65, 0x00, 0x00, 0x00}, /* è to e */
{0x00E9, 0x65, 0x00, 0x00, 0x00}, /* é to e */
{0x00EA, 0x65, 0x00, 0x00, 0x00}, /* ê to e */
{0x00EB, 0x65, 0x00, 0x00, 0x00}, /* ë to e */
{0x00EC, 0x69, 0x00, 0x00, 0x00}, /* ì to i */
{0x00ED, 0x69, 0x00, 0x00, 0x00}, /* í to i */
{0x00EE, 0x69, 0x00, 0x00, 0x00}, /* î to i */
{0x00EF, 0x69, 0x00, 0x00, 0x00}, /* ï to i */
{0x00F0, 0x64, 0x00, 0x00, 0x00}, /* ð to d */
{0x00F1, 0x6E, 0x00, 0x00, 0x00}, /* ñ to n */
{0x00F2, 0x6F, 0x00, 0x00, 0x00}, /* ò to o */
{0x00F3, 0x6F, 0x00, 0x00, 0x00}, /* ó to o */
{0x00F4, 0x6F, 0x00, 0x00, 0x00}, /* ô to o */
{0x00F5, 0x6F, 0x00, 0x00, 0x00}, /* õ to o */
{0x00F6, 0x6F, 0x65, 0x00, 0x00}, /* ö to oe */
{0x00F7, 0x3A, 0x00, 0x00, 0x00}, /* ÷ to : */
{0x00F8, 0x6F, 0x00, 0x00, 0x00}, /* ø to o */
{0x00F9, 0x75, 0x00, 0x00, 0x00}, /* ù to u */
{0x00FA, 0x75, 0x00, 0x00, 0x00}, /* ú to u */
{0x00FB, 0x75, 0x00, 0x00, 0x00}, /* û to u */
{0x00FC, 0x75, 0x65, 0x00, 0x00}, /* ü to ue */
{0x00FD, 0x79, 0x00, 0x00, 0x00}, /* ý to y */
{0x00FE, 0x74, 0x68, 0x00, 0x00}, /* þ to th */
{0x00FF, 0x79, 0x00, 0x00, 0x00}, /* ÿ to y */
{0x0100, 0x41, 0x00, 0x00, 0x00}, /* Ā to A */
{0x0101, 0x61, 0x00, 0x00, 0x00}, /* ā to a */
{0x0102, 0x41, 0x00, 0x00, 0x00}, /* Ă to A */
{0x0103, 0x61, 0x00, 0x00, 0x00}, /* ă to a */
{0x0104, 0x41, 0x00, 0x00, 0x00}, /* Ą to A */
{0x0105, 0x61, 0x00, 0x00, 0x00}, /* ą to a */
{0x0106, 0x43, 0x00, 0x00, 0x00}, /* Ć to C */
{0x0107, 0x63, 0x00, 0x00, 0x00}, /* ć to c */
{0x0108, 0x43, 0x68, 0x00, 0x00}, /* Ĉ to Ch */
{0x0109, 0x63, 0x68, 0x00, 0x00}, /* ĉ to ch */
{0x010A, 0x43, 0x00, 0x00, 0x00}, /* Ċ to C */
{0x010B, 0x63, 0x00, 0x00, 0x00}, /* ċ to c */
{0x010C, 0x43, 0x00, 0x00, 0x00}, /* Č to C */
{0x010D, 0x63, 0x00, 0x00, 0x00}, /* č to c */
{0x010E, 0x44, 0x00, 0x00, 0x00}, /* Ď to D */
{0x010F, 0x64, 0x00, 0x00, 0x00}, /* ď to d */
{0x0110, 0x44, 0x00, 0x00, 0x00}, /* Đ to D */
{0x0111, 0x64, 0x00, 0x00, 0x00}, /* đ to d */
{0x0112, 0x45, 0x00, 0x00, 0x00}, /* Ē to E */
{0x0113, 0x65, 0x00, 0x00, 0x00}, /* ē to e */
{0x0114, 0x45, 0x00, 0x00, 0x00}, /* Ĕ to E */
{0x0115, 0x65, 0x00, 0x00, 0x00}, /* ĕ to e */
{0x0116, 0x45, 0x00, 0x00, 0x00}, /* Ė to E */
{0x0117, 0x65, 0x00, 0x00, 0x00}, /* ė to e */
{0x0118, 0x45, 0x00, 0x00, 0x00}, /* Ę to E */
{0x0119, 0x65, 0x00, 0x00, 0x00}, /* ę to e */
{0x011A, 0x45, 0x00, 0x00, 0x00}, /* Ě to E */
{0x011B, 0x65, 0x00, 0x00, 0x00}, /* ě to e */
{0x011C, 0x47, 0x68, 0x00, 0x00}, /* Ĝ to Gh */
{0x011D, 0x67, 0x68, 0x00, 0x00}, /* ĝ to gh */
{0x011E, 0x47, 0x00, 0x00, 0x00}, /* Ğ to G */
{0x011F, 0x67, 0x00, 0x00, 0x00}, /* ğ to g */
{0x0120, 0x47, 0x00, 0x00, 0x00}, /* Ġ to G */
{0x0121, 0x67, 0x00, 0x00, 0x00}, /* ġ to g */
{0x0122, 0x47, 0x00, 0x00, 0x00}, /* Ģ to G */
{0x0123, 0x67, 0x00, 0x00, 0x00}, /* ģ to g */
{0x0124, 0x48, 0x68, 0x00, 0x00}, /* Ĥ to Hh */
{0x0125, 0x68, 0x68, 0x00, 0x00}, /* ĥ to hh */
{0x0126, 0x48, 0x00, 0x00, 0x00}, /* Ħ to H */
{0x0127, 0x68, 0x00, 0x00, 0x00}, /* ħ to h */
{0x0128, 0x49, 0x00, 0x00, 0x00}, /* Ĩ to I */
{0x0129, 0x69, 0x00, 0x00, 0x00}, /* ĩ to i */
{0x012A, 0x49, 0x00, 0x00, 0x00}, /* Ī to I */
{0x012B, 0x69, 0x00, 0x00, 0x00}, /* ī to i */
{0x012C, 0x49, 0x00, 0x00, 0x00}, /* Ĭ to I */
{0x012D, 0x69, 0x00, 0x00, 0x00}, /* ĭ to i */
{0x012E, 0x49, 0x00, 0x00, 0x00}, /* Į to I */
{0x012F, 0x69, 0x00, 0x00, 0x00}, /* į to i */
{0x0130, 0x49, 0x00, 0x00, 0x00}, /* İ to I */
{0x0131, 0x69, 0x00, 0x00, 0x00}, /* ı to i */
{0x0132, 0x49, 0x4A, 0x00, 0x00}, /* U+0132 (IJ ligature) to IJ */
{0x0133, 0x69, 0x6A, 0x00, 0x00}, /* U+0133 (ij ligature) to ij */
{0x0134, 0x4A, 0x68, 0x00, 0x00}, /* Ĵ to Jh */
{0x0135, 0x6A, 0x68, 0x00, 0x00}, /* ĵ to jh */
{0x0136, 0x4B, 0x00, 0x00, 0x00}, /* Ķ to K */
{0x0137, 0x6B, 0x00, 0x00, 0x00}, /* ķ to k */
{0x0138, 0x6B, 0x00, 0x00, 0x00}, /* ĸ to k */
{0x0139, 0x4C, 0x00, 0x00, 0x00}, /* Ĺ to L */
{0x013A, 0x6C, 0x00, 0x00, 0x00}, /* ĺ to l */
{0x013B, 0x4C, 0x00, 0x00, 0x00}, /* Ļ to L */
{0x013C, 0x6C, 0x00, 0x00, 0x00}, /* ļ to l */
{0x013D, 0x4C, 0x00, 0x00, 0x00}, /* Ľ to L */
{0x013E, 0x6C, 0x00, 0x00, 0x00}, /* ľ to l */
{0x013F, 0x4C, 0x2E, 0x00, 0x00}, /* Ŀ to L. */
{0x0140, 0x6C, 0x2E, 0x00, 0x00}, /* ŀ to l. */
{0x0141, 0x4C, 0x00, 0x00, 0x00}, /* Ł to L */
{0x0142, 0x6C, 0x00, 0x00, 0x00}, /* ł to l */
{0x0143, 0x4E, 0x00, 0x00, 0x00}, /* Ń to N */
{0x0144, 0x6E, 0x00, 0x00, 0x00}, /* ń to n */
{0x0145, 0x4E, 0x00, 0x00, 0x00}, /* Ņ to N */
{0x0146, 0x6E, 0x00, 0x00, 0x00}, /* ņ to n */
{0x0147, 0x4E, 0x00, 0x00, 0x00}, /* Ň to N */
{0x0148, 0x6E, 0x00, 0x00, 0x00}, /* ň to n */
{0x0149, 0x27, 0x6E, 0x00, 0x00}, /* U+0149 (n preceded by apostrophe) to 'n */
{0x014A, 0x4E, 0x47, 0x00, 0x00}, /* Ŋ to NG */
{0x014B, 0x6E, 0x67, 0x00, 0x00}, /* ŋ to ng */
{0x014C, 0x4F, 0x00, 0x00, 0x00}, /* Ō to O */
{0x014D, 0x6F, 0x00, 0x00, 0x00}, /* ō to o */
{0x014E, 0x4F, 0x00, 0x00, 0x00}, /* Ŏ to O */
{0x014F, 0x6F, 0x00, 0x00, 0x00}, /* ŏ to o */
{0x0150, 0x4F, 0x00, 0x00, 0x00}, /* Ő to O */
{0x0151, 0x6F, 0x00, 0x00, 0x00}, /* ő to o */
{0x0152, 0x4F, 0x45, 0x00, 0x00}, /* Œ to OE */
{0x0153, 0x6F, 0x65, 0x00, 0x00}, /* œ to oe */
{0x0154, 0x52, 0x00, 0x00, 0x00}, /* Ŕ to R */
{0x0155, 0x72, 0x00, 0x00, 0x00}, /* ŕ to r */
{0x0156, 0x52, 0x00, 0x00, 0x00}, /* Ŗ to R */
{0x0157, 0x72, 0x00, 0x00, 0x00}, /* ŗ to r */
{0x0158, 0x52, 0x00, 0x00, 0x00}, /* Ř to R */
{0x0159, 0x72, 0x00, 0x00, 0x00}, /* ř to r */
{0x015A, 0x53, 0x00, 0x00, 0x00}, /* Ś to S */
{0x015B, 0x73, 0x00, 0x00, 0x00}, /* ś to s */
{0x015C, 0x53, 0x68, 0x00, 0x00}, /* Ŝ to Sh */
{0x015D, 0x73, 0x68, 0x00, 0x00}, /* ŝ to sh */
{0x015E, 0x53, 0x00, 0x00, 0x00}, /* Ş to S */
{0x015F, 0x73, 0x00, 0x00, 0x00}, /* ş to s */
{0x0160, 0x53, 0x00, 0x00, 0x00}, /* Š to S */
{0x0161, 0x73, 0x00, 0x00, 0x00}, /* š to s */
{0x0162, 0x54, 0x00, 0x00, 0x00}, /* Ţ to T */
{0x0163, 0x74, 0x00, 0x00, 0x00}, /* ţ to t */
{0x0164, 0x54, 0x00, 0x00, 0x00}, /* Ť to T */
{0x0165, 0x74, 0x00, 0x00, 0x00}, /* ť to t */
{0x0166, 0x54, 0x00, 0x00, 0x00}, /* Ŧ to T */
{0x0167, 0x74, 0x00, 0x00, 0x00}, /* ŧ to t */
{0x0168, 0x55, 0x00, 0x00, 0x00}, /* Ũ to U */
{0x0169, 0x75, 0x00, 0x00, 0x00}, /* ũ to u */
{0x016A, 0x55, 0x00, 0x00, 0x00}, /* Ū to U */
{0x016B, 0x75, 0x00, 0x00, 0x00}, /* ū to u */
{0x016C, 0x55, 0x00, 0x00, 0x00}, /* Ŭ to U */
{0x016D, 0x75, 0x00, 0x00, 0x00}, /* ŭ to u */
{0x016E, 0x55, 0x00, 0x00, 0x00}, /* Ů to U */
{0x016F, 0x75, 0x00, 0x00, 0x00}, /* ů to u */
{0x0170, 0x55, 0x00, 0x00, 0x00}, /* Ű to U */
{0x0171, 0x75, 0x00, 0x00, 0x00}, /* ű to u */
{0x0172, 0x55, 0x00, 0x00, 0x00}, /* Ų to U */
{0x0173, 0x75, 0x00, 0x00, 0x00}, /* ų to u */
{0x0174, 0x57, 0x00, 0x00, 0x00}, /* Ŵ to W */
{0x0175, 0x77, 0x00, 0x00, 0x00}, /* ŵ to w */
{0x0176, 0x59, 0x00, 0x00, 0x00}, /* Ŷ to Y */
{0x0177, 0x79, 0x00, 0x00, 0x00}, /* ŷ to y */
{0x0178, 0x59, 0x00, 0x00, 0x00}, /* Ÿ to Y */
{0x0179, 0x5A, 0x00, 0x00, 0x00}, /* Ź to Z */
{0x017A, 0x7A, 0x00, 0x00, 0x00}, /* ź to z */
{0x017B, 0x5A, 0x00, 0x00, 0x00}, /* Ż to Z */
{0x017C, 0x7A, 0x00, 0x00, 0x00}, /* ż to z */
{0x017D, 0x5A, 0x00, 0x00, 0x00}, /* Ž to Z */
{0x017E, 0x7A, 0x00, 0x00, 0x00}, /* ž to z */
{0x017F, 0x73, 0x00, 0x00, 0x00}, /* ſ to s */
{0x0192, 0x66, 0x00, 0x00, 0x00}, /* ƒ to f */
{0x0218, 0x53, 0x00, 0x00, 0x00}, /* Ș to S */
{0x0219, 0x73, 0x00, 0x00, 0x00}, /* ș to s */
{0x021A, 0x54, 0x00, 0x00, 0x00}, /* Ț to T */
{0x021B, 0x74, 0x00, 0x00, 0x00}, /* ț to t */
{0x0386, 0x41, 0x00, 0x00, 0x00}, /* Ά to A */
{0x0388, 0x45, 0x00, 0x00, 0x00}, /* Έ to E */
{0x0389, 0x49, 0x00, 0x00, 0x00}, /* Ή to I */
{0x038A, 0x49, 0x00, 0x00, 0x00}, /* Ί to I */
{0x038C, 0x4f, 0x00, 0x00, 0x00}, /* Ό to O */
{0x038E, 0x59, 0x00, 0x00, 0x00}, /* Ύ to Y */
{0x038F, 0x4f, 0x00, 0x00, 0x00}, /* Ώ to O */
{0x0390, 0x69, 0x00, 0x00, 0x00}, /* ΐ to i */
{0x0391, 0x41, 0x00, 0x00, 0x00}, /* Α to A */
{0x0392, 0x42, 0x00, 0x00, 0x00}, /* Β to B */
{0x0393, 0x47, 0x00, 0x00, 0x00}, /* Γ to G */
{0x0394, 0x44, 0x00, 0x00, 0x00}, /* Δ to D */
{0x0395, 0x45, 0x00, 0x00, 0x00}, /* Ε to E */
{0x0396, 0x5a, 0x00, 0x00, 0x00}, /* Ζ to Z */
{0x0397, 0x49, 0x00, 0x00, 0x00}, /* Η to I */
{0x0398, 0x54, 0x68, 0x00, 0x00}, /* Θ to Th */
{0x0399, 0x49, 0x00, 0x00, 0x00}, /* Ι to I */
{0x039A, 0x4b, 0x00, 0x00, 0x00}, /* Κ to K */
{0x039B, 0x4c, 0x00, 0x00, 0x00}, /* Λ to L */
{0x039C, 0x4d, 0x00, 0x00, 0x00}, /* Μ to M */
{0x039D, 0x4e, 0x00, 0x00, 0x00}, /* Ν to N */
{0x039E, 0x58, 0x00, 0x00, 0x00}, /* Ξ to X */
{0x039F, 0x4f, 0x00, 0x00, 0x00}, /* Ο to O */
{0x03A0, 0x50, 0x00, 0x00, 0x00}, /* Π to P */
{0x03A1, 0x52, 0x00, 0x00, 0x00}, /* Ρ to R */
{0x03A3, 0x53, 0x00, 0x00, 0x00}, /* Σ to S */
{0x03A4, 0x54, 0x00, 0x00, 0x00}, /* Τ to T */
{0x03A5, 0x59, 0x00, 0x00, 0x00}, /* Υ to Y */
{0x03A6, 0x46, 0x00, 0x00, 0x00}, /* Φ to F */
{0x03A7, 0x43, 0x68, 0x00, 0x00}, /* Χ to Ch */
{0x03A8, 0x50, 0x73, 0x00, 0x00}, /* Ψ to Ps */
{0x03A9, 0x4f, 0x00, 0x00, 0x00}, /* Ω to O */
{0x03AA, 0x49, 0x00, 0x00, 0x00}, /* Ϊ to I */
{0x03AB, 0x59, 0x00, 0x00, 0x00}, /* Ϋ to Y */
{0x03AC, 0x61, 0x00, 0x00, 0x00}, /* ά to a */
{0x03AD, 0x65, 0x00, 0x00, 0x00}, /* έ to e */
{0x03AE, 0x69, 0x00, 0x00, 0x00}, /* ή to i */
{0x03AF, 0x69, 0x00, 0x00, 0x00}, /* ί to i */
{0x03B1, 0x61, 0x00, 0x00, 0x00}, /* α to a */
{0x03B2, 0x62, 0x00, 0x00, 0x00}, /* β to b */
{0x03B3, 0x67, 0x00, 0x00, 0x00}, /* γ to g */
{0x03B4, 0x64, 0x00, 0x00, 0x00}, /* δ to d */
{0x03B5, 0x65, 0x00, 0x00, 0x00}, /* ε to e */
{0x03B6, 0x7a, 0x00, 0x00, 0x00}, /* ζ to z */
{0x03B7, 0x69, 0x00, 0x00, 0x00}, /* η to i */
{0x03B8, 0x74, 0x68, 0x00, 0x00}, /* θ to th */
{0x03B9, 0x69, 0x00, 0x00, 0x00}, /* ι to i */
{0x03BA, 0x6b, 0x00, 0x00, 0x00}, /* κ to k */
{0x03BB, 0x6c, 0x00, 0x00, 0x00}, /* λ to l */
{0x03BC, 0x6d, 0x00, 0x00, 0x00}, /* μ to m */
{0x03BD, 0x6e, 0x00, 0x00, 0x00}, /* ν to n */
{0x03BE, 0x78, 0x00, 0x00, 0x00}, /* ξ to x */
{0x03BF, 0x6f, 0x00, 0x00, 0x00}, /* ο to o */
{0x03C0, 0x70, 0x00, 0x00, 0x00}, /* π to p */
{0x03C1, 0x72, 0x00, 0x00, 0x00}, /* ρ to r */
{0x03C3, 0x73, 0x00, 0x00, 0x00}, /* σ to s */
{0x03C4, 0x74, 0x00, 0x00, 0x00}, /* τ to t */
{0x03C5, 0x79, 0x00, 0x00, 0x00}, /* υ to y */
{0x03C6, 0x66, 0x00, 0x00, 0x00}, /* φ to f */
{0x03C7, 0x63, 0x68, 0x00, 0x00}, /* χ to ch */
{0x03C8, 0x70, 0x73, 0x00, 0x00}, /* ψ to ps */
{0x03C9, 0x6f, 0x00, 0x00, 0x00}, /* ω to o */
{0x03CA, 0x69, 0x00, 0x00, 0x00}, /* ϊ to i */
{0x03CB, 0x79, 0x00, 0x00, 0x00}, /* ϋ to y */
{0x03CC, 0x6f, 0x00, 0x00, 0x00}, /* ό to o */
{0x03CD, 0x79, 0x00, 0x00, 0x00}, /* ύ to y */
{0x03CE, 0x69, 0x00, 0x00, 0x00}, /* ώ to i */
{0x0400, 0x45, 0x00, 0x00, 0x00}, /* Ѐ to E */
{0x0401, 0x45, 0x00, 0x00, 0x00}, /* Ё to E */
{0x0402, 0x44, 0x00, 0x00, 0x00}, /* Ђ to D */
{0x0403, 0x47, 0x00, 0x00, 0x00}, /* Ѓ to G */
{0x0404, 0x45, 0x00, 0x00, 0x00}, /* Є to E */
{0x0405, 0x5a, 0x00, 0x00, 0x00}, /* Ѕ to Z */
{0x0406, 0x49, 0x00, 0x00, 0x00}, /* І to I */
{0x0407, 0x49, 0x00, 0x00, 0x00}, /* Ї to I */
{0x0408, 0x4a, 0x00, 0x00, 0x00}, /* Ј to J */
{0x0409, 0x49, 0x00, 0x00, 0x00}, /* Љ to I */
{0x040A, 0x4e, 0x00, 0x00, 0x00}, /* Њ to N */
{0x040B, 0x44, 0x00, 0x00, 0x00}, /* Ћ to D */
{0x040C, 0x4b, 0x00, 0x00, 0x00}, /* Ќ to K */
{0x040D, 0x49, 0x00, 0x00, 0x00}, /* Ѝ to I */
{0x040E, 0x55, 0x00, 0x00, 0x00}, /* Ў to U */
{0x040F, 0x44, 0x00, 0x00, 0x00}, /* Џ to D */
{0x0410, 0x41, 0x00, 0x00, 0x00}, /* А to A */
{0x0411, 0x42, 0x00, 0x00, 0x00}, /* Б to B */
{0x0412, 0x56, 0x00, 0x00, 0x00}, /* В to V */
{0x0413, 0x47, 0x00, 0x00, 0x00}, /* Г to G */
{0x0414, 0x44, 0x00, 0x00, 0x00}, /* Д to D */
{0x0415, 0x45, 0x00, 0x00, 0x00}, /* Е to E */
{0x0416, 0x5a, 0x68, 0x00, 0x00}, /* Ж to Zh */
{0x0417, 0x5a, 0x00, 0x00, 0x00}, /* З to Z */
{0x0418, 0x49, 0x00, 0x00, 0x00}, /* И to I */
{0x0419, 0x49, 0x00, 0x00, 0x00}, /* Й to I */
{0x041A, 0x4b, 0x00, 0x00, 0x00}, /* К to K */
{0x041B, 0x4c, 0x00, 0x00, 0x00}, /* Л to L */
{0x041C, 0x4d, 0x00, 0x00, 0x00}, /* М to M */
{0x041D, 0x4e, 0x00, 0x00, 0x00}, /* Н to N */
{0x041E, 0x4f, 0x00, 0x00, 0x00}, /* О to O */
{0x041F, 0x50, 0x00, 0x00, 0x00}, /* П to P */
{0x0420, 0x52, 0x00, 0x00, 0x00}, /* Р to R */
{0x0421, 0x53, 0x00, 0x00, 0x00}, /* С to S */
{0x0422, 0x54, 0x00, 0x00, 0x00}, /* Т to T */
{0x0423, 0x55, 0x00, 0x00, 0x00}, /* У to U */
{0x0424, 0x46, 0x00, 0x00, 0x00}, /* Ф to F */
{0x0425, 0x4b, 0x68, 0x00, 0x00}, /* Х to Kh */
{0x0426, 0x54, 0x63, 0x00, 0x00}, /* Ц to Tc */
{0x0427, 0x43, 0x68, 0x00, 0x00}, /* Ч to Ch */
{0x0428, 0x53, 0x68, 0x00, 0x00}, /* Ш to Sh */
{0x0429, 0x53, 0x68, 0x63, 0x68}, /* Щ to Shch */
{0x042A, 0x61, 0x00, 0x00, 0x00}, /* Ъ (U+042A) to a -- lowercase, unlike the neighbouring capitals */
{0x042B, 0x59, 0x00, 0x00, 0x00}, /* Ы to Y */
{0x042C, 0x59, 0x00, 0x00, 0x00}, /* Ь (U+042C) to Y */
{0x042D, 0x45, 0x00, 0x00, 0x00}, /* Э to E */
{0x042E, 0x49, 0x75, 0x00, 0x00}, /* Ю to Iu */
{0x042F, 0x49, 0x61, 0x00, 0x00}, /* Я to Ia */
{0x0430, 0x61, 0x00, 0x00, 0x00}, /* а to a */
{0x0431, 0x62, 0x00, 0x00, 0x00}, /* б to b */
{0x0432, 0x76, 0x00, 0x00, 0x00}, /* в to v */
{0x0433, 0x67, 0x00, 0x00, 0x00}, /* г to g */
{0x0434, 0x64, 0x00, 0x00, 0x00}, /* д to d */
{0x0435, 0x65, 0x00, 0x00, 0x00}, /* е to e */
{0x0436, 0x7a, 0x68, 0x00, 0x00}, /* ж to zh */
{0x0437, 0x7a, 0x00, 0x00, 0x00}, /* з to z */
{0x0438, 0x69, 0x00, 0x00, 0x00}, /* и to i */
{0x0439, 0x69, 0x00, 0x00, 0x00}, /* й to i */
{0x043A, 0x6b, 0x00, 0x00, 0x00}, /* к to k */
{0x043B, 0x6c, 0x00, 0x00, 0x00}, /* л to l */
{0x043C, 0x6d, 0x00, 0x00, 0x00}, /* м to m */
{0x043D, 0x6e, 0x00, 0x00, 0x00}, /* н to n */
{0x043E, 0x6f, 0x00, 0x00, 0x00}, /* о to o */
{0x043F, 0x70, 0x00, 0x00, 0x00}, /* п to p */
{0x0440, 0x72, 0x00, 0x00, 0x00}, /* р to r */
{0x0441, 0x73, 0x00, 0x00, 0x00}, /* с to s */
{0x0442, 0x74, 0x00, 0x00, 0x00}, /* т to t */
{0x0443, 0x75, 0x00, 0x00, 0x00}, /* у to u */
{0x0444, 0x66, 0x00, 0x00, 0x00}, /* ф to f */
{0x0445, 0x6b, 0x68, 0x00, 0x00}, /* х to kh */
{0x0446, 0x74, 0x63, 0x00, 0x00}, /* ц to tc */
{0x0447, 0x63, 0x68, 0x00, 0x00}, /* ч to ch */
{0x0448, 0x73, 0x68, 0x00, 0x00}, /* ш to sh */
{0x0449, 0x73, 0x68, 0x63, 0x68}, /* щ to shch */
{0x044A, 0x61, 0x00, 0x00, 0x00}, /* ъ (U+044A) to a */
{0x044B, 0x79, 0x00, 0x00, 0x00}, /* ы to y */
{0x044C, 0x79, 0x00, 0x00, 0x00}, /* ь (U+044C) to y */
{0x044D, 0x65, 0x00, 0x00, 0x00}, /* э to e */
{0x044E, 0x69, 0x75, 0x00, 0x00}, /* ю to iu */
{0x044F, 0x69, 0x61, 0x00, 0x00}, /* я to ia */
{0x0450, 0x65, 0x00, 0x00, 0x00}, /* ѐ to e */
{0x0451, 0x65, 0x00, 0x00, 0x00}, /* ё to e */
{0x0452, 0x64, 0x00, 0x00, 0x00}, /* ђ to d */
{0x0453, 0x67, 0x00, 0x00, 0x00}, /* ѓ to g */
{0x0454, 0x65, 0x00, 0x00, 0x00}, /* є to e */
{0x0455, 0x7a, 0x00, 0x00, 0x00}, /* ѕ to z */
{0x0456, 0x69, 0x00, 0x00, 0x00}, /* і to i */
{0x0457, 0x69, 0x00, 0x00, 0x00}, /* ї to i */
{0x0458, 0x6a, 0x00, 0x00, 0x00}, /* ј to j */
{0x0459, 0x69, 0x00, 0x00, 0x00}, /* љ to i */
{0x045A, 0x6e, 0x00, 0x00, 0x00}, /* њ to n */
{0x045B, 0x64, 0x00, 0x00, 0x00}, /* ћ to d */
{0x045C, 0x6b, 0x00, 0x00, 0x00}, /* ќ to k */
{0x045D, 0x69, 0x00, 0x00, 0x00}, /* ѝ to i */
{0x045E, 0x75, 0x00, 0x00, 0x00}, /* ў to u */
{0x045F, 0x64, 0x00, 0x00, 0x00}, /* џ to d */
{0x1E02, 0x42, 0x00, 0x00, 0x00}, /* Ḃ to B */
{0x1E03, 0x62, 0x00, 0x00, 0x00}, /* ḃ to b */
{0x1E0A, 0x44, 0x00, 0x00, 0x00}, /* Ḋ to D */
{0x1E0B, 0x64, 0x00, 0x00, 0x00}, /* ḋ to d */
{0x1E1E, 0x46, 0x00, 0x00, 0x00}, /* Ḟ to F */
{0x1E1F, 0x66, 0x00, 0x00, 0x00}, /* ḟ to f */
{0x1E40, 0x4D, 0x00, 0x00, 0x00}, /* Ṁ to M */
{0x1E41, 0x6D, 0x00, 0x00, 0x00}, /* ṁ to m */
{0x1E56, 0x50, 0x00, 0x00, 0x00}, /* Ṗ to P */
{0x1E57, 0x70, 0x00, 0x00, 0x00}, /* ṗ to p */
{0x1E60, 0x53, 0x00, 0x00, 0x00}, /* Ṡ to S */
{0x1E61, 0x73, 0x00, 0x00, 0x00}, /* ṡ to s */
{0x1E6A, 0x54, 0x00, 0x00, 0x00}, /* Ṫ to T */
{0x1E6B, 0x74, 0x00, 0x00, 0x00}, /* ṫ to t */
{0x1E80, 0x57, 0x00, 0x00, 0x00}, /* Ẁ to W */
{0x1E81, 0x77, 0x00, 0x00, 0x00}, /* ẁ to w */
{0x1E82, 0x57, 0x00, 0x00, 0x00}, /* Ẃ to W */
{0x1E83, 0x77, 0x00, 0x00, 0x00}, /* ẃ to w */
{0x1E84, 0x57, 0x00, 0x00, 0x00}, /* Ẅ to W */
{0x1E85, 0x77, 0x00, 0x00, 0x00}, /* ẅ to w */
{0x1EF2, 0x59, 0x00, 0x00, 0x00}, /* Ỳ to Y */
{0x1EF3, 0x79, 0x00, 0x00, 0x00}, /* ỳ to y */
{0xFB00, 0x66, 0x66, 0x00, 0x00}, /* U+FB00 (ff ligature) to ff */
{0xFB01, 0x66, 0x69, 0x00, 0x00}, /* U+FB01 (fi ligature) to fi */
{0xFB02, 0x66, 0x6C, 0x00, 0x00}, /* U+FB02 (fl ligature) to fl */
{0xFB05, 0x73, 0x74, 0x00, 0x00}, /* U+FB05 (long-s t ligature) to st */
{0xFB06, 0x73, 0x74, 0x00, 0x00}, /* U+FB06 (st ligature) to st */
};
/*
** Hand back the transliteration table to search for character c and
** store the index of its last entry in *pxTop.  The same table is
** returned whatever the value of c.
*/
static const Transliteration* spellfixFindTranslit(int c, int* pxTop) {
    const size_t nEntry = sizeof(translit) / sizeof(translit[0]);

    (void)c; /* the table does not depend on the character */
    *pxTop = (int)(nEntry - 1);
    return translit;
}
/*
** Convert the input string from UTF-8 into pure ASCII by converting
** all non-ASCII characters to some combination of characters in the
** ASCII subset.  A non-ASCII character that has no entry in the
** transliteration table is replaced by '?'.
**
** zIn points to nIn bytes of UTF-8 text.  A negative nIn is treated as
** an empty input.
**
** The returned string is NUL-terminated and might contain more
** characters than the input: one input character can expand to as
** many as four output bytes.
**
** Space to hold the returned string comes from malloc() and must be
** released by the caller with free().  NULL is returned if the buffer
** cannot be allocated.
*/
unsigned char* transliterate(const unsigned char* zIn, int nIn) {
    unsigned char* zOut;
    size_t nAlloc;
    size_t nOut; /* size_t: the output can be longer than INT_MAX bytes */
    int c, sz;

    if (nIn < 0) {
        nIn = 0;
    }
    /* Worst case is four output bytes per input byte, plus the NUL.
    ** The size is computed in size_t because "nIn * 4 + 1" evaluated in
    ** int overflows (undefined behaviour) once nIn exceeds INT_MAX/4. */
    if ((size_t)nIn > ((size_t)-1 - 1) / 4) {
        return 0;
    }
    nAlloc = (size_t)nIn * 4 + 1;
    zOut = malloc(nAlloc);
    if (zOut == 0) {
        return 0;
    }
    nOut = 0;
    while (nIn > 0) {
        c = utf8Read(zIn, nIn, &sz);
        zIn += sz;
        nIn -= sz;
        if (c <= 127) {
            zOut[nOut++] = (unsigned char)c;
        } else {
            int xTop, xBtm, x;
            const Transliteration* tbl = spellfixFindTranslit(c, &xTop);
            /* Binary search of the table, which is sorted by cFrom. */
            xBtm = 0;
            while (xTop >= xBtm) {
                x = (xTop + xBtm) / 2;
                if (tbl[x].cFrom == c) {
                    /* Emit the replacement bytes up to the first zero. */
                    zOut[nOut++] = tbl[x].cTo0;
                    if (tbl[x].cTo1) {
                        zOut[nOut++] = tbl[x].cTo1;
                        if (tbl[x].cTo2) {
                            zOut[nOut++] = tbl[x].cTo2;
                            if (tbl[x].cTo3) {
                                zOut[nOut++] = tbl[x].cTo3;
                            }
                        }
                    }
                    c = 0; /* marks the character as handled */
                    break;
                } else if (tbl[x].cFrom > c) {
                    xTop = x - 1;
                } else {
                    xBtm = x + 1;
                }
            }
            if (c) {
                zOut[nOut++] = '?';
            }
        }
    }
    zOut[nOut] = 0;
    return zOut;
}
/*
** Count how many leading characters of zIn (nIn bytes of UTF-8) are
** needed for their transliteration to reach nTrans bytes or more.
** If the whole input transliterates to fewer than nTrans bytes, the
** result is the number of characters in the input.
*/
int translen_to_charlen(const char* zIn, int nIn, int nTrans) {
    int pos = 0;    /* byte offset of the next character in zIn */
    int nAscii = 0; /* transliterated bytes accounted for so far */
    int nChar = 0;  /* characters consumed so far */

    while (pos < nIn && nAscii < nTrans) {
        int width;
        int ch = utf8Read((const unsigned char*)(zIn + pos), nIn - pos, &width);
        int lo, hi;
        const Transliteration* tbl;

        pos += width;
        nChar++;
        nAscii++; /* every character accounts for at least one byte */
        if (ch < 128) {
            continue;
        }

        /* Non-ASCII: look the character up to see whether its
        ** replacement is longer than one byte. */
        tbl = spellfixFindTranslit(ch, &hi);
        lo = 0;
        while (lo <= hi) {
            int mid = (hi + lo) / 2;
            int key = tbl[mid].cFrom;
            if (key > ch) {
                hi = mid - 1;
            } else if (key < ch) {
                lo = mid + 1;
            } else {
                const Transliteration* hit = &tbl[mid];
                int extra = 0;
                if (hit->cTo1) {
                    extra = hit->cTo2 ? (hit->cTo3 ? 3 : 2) : 1;
                }
                nAscii += extra;
                break;
            }
        }
    }
    return nChar;
}
/*
 * Guess the dominant script of the word zIn (nIn bytes of UTF-8) and
 * return a numeric script code:
 *   215 Latin, 220 Cyrillic, 200 Greek, 125 Hebrew, 160 Arabic,
 *   999 when no character was classified,
 *   998 for any other mask value (e.g. more than one script was seen).
 * A word in which only digits were recognised is reported as Latin.
 */
int script_code(const unsigned char* zIn, int nIn) {
    const unsigned char* p = zIn;
    int left = nIn;
    int mask = 0;     /* OR of the SCRIPT_* flags seen so far */
    int sawDigit = 0; /* set once an ASCII digit has been seen */

    while (left > 0) {
        int len;
        int ch = utf8Read(p, left, &len);
        p += len;
        left -= len;

        if (ch < 0x02af) {
            /* Non-ASCII characters in this range count as Latin, as do
            ** ASCII characters that midClass ranks below CCLASS_DIGIT. */
            if (ch >= 0x80 || midClass[ch & 0x7f] < CCLASS_DIGIT) {
                mask |= SCRIPT_LATIN;
            } else if (ch >= '0' && ch <= '9') {
                sawDigit = 1;
            }
            continue;
        }
        if (ch >= 0x0400 && ch <= 0x04ff) {
            mask |= SCRIPT_CYRILLIC;
        } else if (ch >= 0x0386 && ch <= 0x03ce) {
            mask |= SCRIPT_GREEK;
        } else if (ch >= 0x0590 && ch <= 0x05ff) {
            mask |= SCRIPT_HEBREW;
        } else if (ch >= 0x0600 && ch <= 0x06ff) {
            mask |= SCRIPT_ARABIC;
        }
    }

    if (mask == 0 && sawDigit) {
        mask = SCRIPT_LATIN;
    }

    switch (mask) {
        case 0:
            return 999;
        case SCRIPT_LATIN:
            return 215;
        case SCRIPT_CYRILLIC:
            return 220;
        case SCRIPT_GREEK:
            return 200;
        case SCRIPT_HEBREW:
            return 125;
        case SCRIPT_ARABIC:
            return 160;
        default:
            return 998;
    }
}