mirror of
https://github.com/tursodatabase/libsql.git
synced 2025-05-03 10:22:14 +00:00
A common complain with libSQL is how to run extensions. The main mechanism, with a .so, has a lot of issues around how those .so are distributed. The most common extensions are the ones in the sqlean package. We can improve this experience by bundling them in our sqlite build. Not all SQLean extensions are kosher: some of them, like fileio, use the vfs. Others, are deemed too complex. The extensions included here are a subset that we deem important enough, and low risk enough, to just be a part of the main bundle.
274 lines
8.0 KiB
C
274 lines
8.0 KiB
C
// Originally from the spellfix SQLite exension, Public Domain
|
|
// https://www.sqlite.org/src/file/ext/misc/spellfix.c
|
|
// Modified by Anton Zhiyanov, https://github.com/nalgeon/sqlean/, MIT License
|
|
|
|
#include <assert.h>
|
|
#include <stdlib.h>
|
|
|
|
#include "fuzzy/common.h"
|
|
|
|
extern const unsigned char midClass[];
|
|
extern const unsigned char initClass[];
|
|
extern const unsigned char className[];
|
|
|
|
/*
|
|
** Return the character class number for a character given its
|
|
** context.
|
|
*/
|
|
static char characterClass(char cPrev, char c) {
|
|
return cPrev == 0 ? initClass[c & 0x7f] : midClass[c & 0x7f];
|
|
}
|
|
|
|
/*
|
|
** Return the cost of inserting or deleting character c immediately
|
|
** following character cPrev. If cPrev==0, that means c is the first
|
|
** character of the word.
|
|
*/
|
|
static int insertOrDeleteCost(char cPrev, char c, char cNext) {
|
|
char classC = characterClass(cPrev, c);
|
|
char classCprev;
|
|
|
|
if (classC == CCLASS_SILENT) {
|
|
/* Insert or delete "silent" characters such as H or W */
|
|
return 1;
|
|
}
|
|
if (cPrev == c) {
|
|
/* Repeated characters, or miss a repeat */
|
|
return 10;
|
|
}
|
|
if (classC == CCLASS_VOWEL && (cPrev == 'r' || cNext == 'r')) {
|
|
return 20; /* Insert a vowel before or after 'r' */
|
|
}
|
|
classCprev = characterClass(cPrev, cPrev);
|
|
if (classC == classCprev) {
|
|
if (classC == CCLASS_VOWEL) {
|
|
/* Remove or add a new vowel to a vowel cluster */
|
|
return 15;
|
|
} else {
|
|
/* Remove or add a consonant not in the same class */
|
|
return 50;
|
|
}
|
|
}
|
|
|
|
/* any other character insertion or deletion */
|
|
return 100;
|
|
}
|
|
|
|
/*
|
|
** Divide the insertion cost by this factor when appending to the
|
|
** end of the word.
|
|
*/
|
|
#define FINAL_INS_COST_DIV 4
|
|
|
|
/*
|
|
** Return the cost of substituting cTo in place of cFrom assuming
|
|
** the previous character is cPrev. If cPrev==0 then cTo is the first
|
|
** character of the word.
|
|
*/
|
|
static int substituteCost(char cPrev, char cFrom, char cTo) {
|
|
char classFrom, classTo;
|
|
if (cFrom == cTo) {
|
|
/* Exact match */
|
|
return 0;
|
|
}
|
|
if (cFrom == (cTo ^ 0x20) && ((cTo >= 'A' && cTo <= 'Z') || (cTo >= 'a' && cTo <= 'z'))) {
|
|
/* differ only in case */
|
|
return 0;
|
|
}
|
|
classFrom = characterClass(cPrev, cFrom);
|
|
classTo = characterClass(cPrev, cTo);
|
|
if (classFrom == classTo) {
|
|
/* Same character class */
|
|
return 40;
|
|
}
|
|
if (classFrom >= CCLASS_B && classFrom <= CCLASS_Y && classTo >= CCLASS_B &&
|
|
classTo <= CCLASS_Y) {
|
|
/* Convert from one consonant to another, but in a different class */
|
|
return 75;
|
|
}
|
|
/* Any other subsitution */
|
|
return 100;
|
|
}
|
|
|
|
/*
|
|
** Given two strings zA and zB which are pure ASCII, return the cost
|
|
** of transforming zA into zB. If zA ends with '*' assume that it is
|
|
** a prefix of zB and give only minimal penalty for extra characters
|
|
** on the end of zB.
|
|
**
|
|
** Smaller numbers mean a closer match.
|
|
**
|
|
** Negative values indicate an error:
|
|
** -1 One of the inputs is NULL
|
|
** -2 Non-ASCII characters on input
|
|
** -3 Unable to allocate memory
|
|
**
|
|
** If pnMatch is not NULL, then *pnMatch is set to the number of bytes
|
|
** of zB that matched the pattern in zA. If zA does not end with a '*',
|
|
** then this value is always the number of bytes in zB (i.e. strlen(zB)).
|
|
** If zA does end in a '*', then it is the number of bytes in the prefix
|
|
** of zB that was deemed to match zA.
|
|
*/
|
|
int edit_distance(const char* zA, const char* zB, int* pnMatch) {
|
|
int nA, nB; /* Number of characters in zA[] and zB[] */
|
|
int xA, xB; /* Loop counters for zA[] and zB[] */
|
|
char cA = 0, cB; /* Current character of zA and zB */
|
|
char cAprev, cBprev; /* Previous character of zA and zB */
|
|
char cAnext, cBnext; /* Next character in zA and zB */
|
|
int d; /* North-west cost value */
|
|
int dc = 0; /* North-west character value */
|
|
int res; /* Final result */
|
|
int* m; /* The cost matrix */
|
|
char* cx; /* Corresponding character values */
|
|
int* toFree = 0; /* Malloced space */
|
|
int nMatch = 0;
|
|
int mStack[60 + 15]; /* Stack space to use if not too much is needed */
|
|
|
|
/* Early out if either input is NULL */
|
|
if (zA == 0 || zB == 0)
|
|
return -1;
|
|
|
|
/* Skip any common prefix */
|
|
while (zA[0] && zA[0] == zB[0]) {
|
|
dc = zA[0];
|
|
zA++;
|
|
zB++;
|
|
nMatch++;
|
|
}
|
|
if (pnMatch)
|
|
*pnMatch = nMatch;
|
|
if (zA[0] == 0 && zB[0] == 0)
|
|
return 0;
|
|
|
|
#if 0
|
|
printf("A=\"%s\" B=\"%s\" dc=%c\n", zA, zB, dc?dc:' ');
|
|
#endif
|
|
|
|
/* Verify input strings and measure their lengths */
|
|
for (nA = 0; zA[nA]; nA++) {
|
|
if (zA[nA] & 0x80)
|
|
return -2;
|
|
}
|
|
for (nB = 0; zB[nB]; nB++) {
|
|
if (zB[nB] & 0x80)
|
|
return -2;
|
|
}
|
|
|
|
/* Special processing if either string is empty */
|
|
if (nA == 0) {
|
|
cBprev = (char)dc;
|
|
for (xB = res = 0; (cB = zB[xB]) != 0; xB++) {
|
|
res += insertOrDeleteCost(cBprev, cB, zB[xB + 1]) / FINAL_INS_COST_DIV;
|
|
cBprev = cB;
|
|
}
|
|
return res;
|
|
}
|
|
if (nB == 0) {
|
|
cAprev = (char)dc;
|
|
for (xA = res = 0; (cA = zA[xA]) != 0; xA++) {
|
|
res += insertOrDeleteCost(cAprev, cA, zA[xA + 1]);
|
|
cAprev = cA;
|
|
}
|
|
return res;
|
|
}
|
|
|
|
/* A is a prefix of B */
|
|
if (zA[0] == '*' && zA[1] == 0)
|
|
return 0;
|
|
|
|
/* Allocate and initialize the Wagner matrix */
|
|
if ((size_t)nB < (sizeof(mStack) * 4) / (sizeof(mStack[0]) * 5)) {
|
|
m = mStack;
|
|
} else {
|
|
m = toFree = malloc((nB + 1) * 5 * sizeof(m[0]) / 4);
|
|
if (m == 0)
|
|
return -3;
|
|
}
|
|
cx = (char*)&m[nB + 1];
|
|
|
|
/* Compute the Wagner edit distance */
|
|
m[0] = 0;
|
|
cx[0] = (char)dc;
|
|
cBprev = (char)dc;
|
|
for (xB = 1; xB <= nB; xB++) {
|
|
cBnext = zB[xB];
|
|
cB = zB[xB - 1];
|
|
cx[xB] = cB;
|
|
m[xB] = m[xB - 1] + insertOrDeleteCost(cBprev, cB, cBnext);
|
|
cBprev = cB;
|
|
}
|
|
cAprev = (char)dc;
|
|
for (xA = 1; xA <= nA; xA++) {
|
|
int lastA = (xA == nA);
|
|
cA = zA[xA - 1];
|
|
cAnext = zA[xA];
|
|
if (cA == '*' && lastA)
|
|
break;
|
|
d = m[0];
|
|
dc = cx[0];
|
|
m[0] = d + insertOrDeleteCost(cAprev, cA, cAnext);
|
|
cBprev = 0;
|
|
for (xB = 1; xB <= nB; xB++) {
|
|
int totalCost, insCost, delCost, subCost, ncx;
|
|
cB = zB[xB - 1];
|
|
cBnext = zB[xB];
|
|
|
|
/* Cost to insert cB */
|
|
insCost = insertOrDeleteCost(cx[xB - 1], cB, cBnext);
|
|
if (lastA)
|
|
insCost /= FINAL_INS_COST_DIV;
|
|
|
|
/* Cost to delete cA */
|
|
delCost = insertOrDeleteCost(cx[xB], cA, cBnext);
|
|
|
|
/* Cost to substitute cA->cB */
|
|
subCost = substituteCost(cx[xB - 1], cA, cB);
|
|
|
|
/* Best cost */
|
|
totalCost = insCost + m[xB - 1];
|
|
ncx = cB;
|
|
if ((delCost + m[xB]) < totalCost) {
|
|
totalCost = delCost + m[xB];
|
|
ncx = cA;
|
|
}
|
|
if ((subCost + d) < totalCost) {
|
|
totalCost = subCost + d;
|
|
}
|
|
|
|
#if 0
|
|
printf("%d,%d d=%4d u=%4d r=%4d dc=%c cA=%c cB=%c"
|
|
" ins=%4d del=%4d sub=%4d t=%4d ncx=%c\n",
|
|
xA, xB, d, m[xB], m[xB-1], dc?dc:' ', cA, cB,
|
|
insCost, delCost, subCost, totalCost, ncx?ncx:' ');
|
|
#endif
|
|
|
|
/* Update the matrix */
|
|
d = m[xB];
|
|
dc = cx[xB];
|
|
m[xB] = totalCost;
|
|
cx[xB] = (char)ncx;
|
|
cBprev = cB;
|
|
}
|
|
cAprev = cA;
|
|
}
|
|
|
|
/* Free the wagner matrix and return the result */
|
|
if (cA == '*') {
|
|
res = m[1];
|
|
for (xB = 1; xB <= nB; xB++) {
|
|
if (m[xB] < res) {
|
|
res = m[xB];
|
|
if (pnMatch)
|
|
*pnMatch = xB + nMatch;
|
|
}
|
|
}
|
|
} else {
|
|
res = m[nB];
|
|
/* In the current implementation, pnMatch is always NULL if zA does
|
|
** not end in "*" */
|
|
assert(pnMatch == 0);
|
|
}
|
|
free(toFree);
|
|
return res;
|
|
}
|