0
0
mirror of https://github.com/tursodatabase/libsql.git synced 2025-03-08 23:41:50 +00:00
Glauber Costa d3a156caf5 bundle SQLean extensions
A common complaint with libSQL is how to run extensions. The main
mechanism, with a .so, has a lot of issues around how those .so are
distributed.

The most common extensions are the ones in the sqlean package. We can
improve this experience by bundling them in our sqlite build.

Not all SQLean extensions are kosher: some of them, like fileio, use
the vfs. Others are deemed too complex.

The extensions included here are a subset that we deem important enough,
and low risk enough, to just be a part of the main bundle.
2025-01-16 22:25:16 -05:00

290 lines
11 KiB
C

// Copyright (c) 2023 Anton Zhiyanov, MIT License
// https://github.com/nalgeon/sqlean
// Fuzzy string matching and phonetics.
#include <assert.h>
#include <stdbool.h>
#include <stdlib.h>
#include "sqlite3ext.h"
SQLITE_EXTENSION_INIT3
#include "fuzzy/fuzzy.h"
// is_ascii reports whether the NUL-terminated byte string `str` contains
// only 7-bit ASCII bytes, i.e. no byte has its high bit (0x80) set.
// An empty string is considered ASCII.
static bool is_ascii(const unsigned char* str) {
    const unsigned char* cursor = str;
    while (*cursor != 0) {
        // Any byte >= 0x80 is outside the ASCII range.
        if (*cursor >= 0x80) {
            return false;
        }
        cursor++;
    }
    return true;
}
// Below are functions extracted from the
// https://github.com/Rostepher/libstrcmp/
// fuzzy_damlev is the SQL function computing the Damerau-Levenshtein
// distance between its two text arguments.
//
// Raises an SQL error when either argument reads back as NULL or contains
// non-ASCII bytes; otherwise sets the integer result returned by
// damerau_levenshtein().
static void fuzzy_damlev(sqlite3_context* context, int argc, sqlite3_value** argv) {
    assert(argc == 2);
    const unsigned char* lhs = sqlite3_value_text(argv[0]);
    const unsigned char* rhs = sqlite3_value_text(argv[1]);

    // Validate both inputs up front and funnel failures into one error exit.
    const char* problem = NULL;
    if (lhs == NULL || rhs == NULL) {
        problem = "arguments should not be NULL";
    } else if (!(is_ascii(lhs) && is_ascii(rhs))) {
        problem = "arguments should be ASCII strings";
    }
    if (problem != NULL) {
        sqlite3_result_error(context, problem, -1);
        return;
    }

    unsigned dist = damerau_levenshtein((const char*)lhs, (const char*)rhs);
    sqlite3_result_int(context, dist);
}
// fuzzy_hamming is the SQL function computing the Hamming distance between
// its two text arguments.
//
// Raises an SQL error when either argument reads back as NULL or contains
// non-ASCII bytes; otherwise sets the integer result returned by hamming().
static void fuzzy_hamming(sqlite3_context* context, int argc, sqlite3_value** argv) {
    assert(argc == 2);
    const unsigned char* lhs = sqlite3_value_text(argv[0]);
    const unsigned char* rhs = sqlite3_value_text(argv[1]);

    // Validate both inputs up front and funnel failures into one error exit.
    const char* problem = NULL;
    if (lhs == NULL || rhs == NULL) {
        problem = "arguments should not be NULL";
    } else if (!(is_ascii(lhs) && is_ascii(rhs))) {
        problem = "arguments should be ASCII strings";
    }
    if (problem != NULL) {
        sqlite3_result_error(context, problem, -1);
        return;
    }

    int dist = hamming((const char*)lhs, (const char*)rhs);
    sqlite3_result_int(context, dist);
}
// fuzzy_jarowin is the SQL function computing the Jaro-Winkler distance
// between its two text arguments.
//
// Raises an SQL error when either argument reads back as NULL or contains
// non-ASCII bytes; otherwise sets the floating-point result returned by
// jaro_winkler().
static void fuzzy_jarowin(sqlite3_context* context, int argc, sqlite3_value** argv) {
    assert(argc == 2);
    const unsigned char* lhs = sqlite3_value_text(argv[0]);
    const unsigned char* rhs = sqlite3_value_text(argv[1]);

    // Validate both inputs up front and funnel failures into one error exit.
    const char* problem = NULL;
    if (lhs == NULL || rhs == NULL) {
        problem = "arguments should not be NULL";
    } else if (!(is_ascii(lhs) && is_ascii(rhs))) {
        problem = "arguments should be ASCII strings";
    }
    if (problem != NULL) {
        sqlite3_result_error(context, problem, -1);
        return;
    }

    double score = jaro_winkler((const char*)lhs, (const char*)rhs);
    sqlite3_result_double(context, score);
}
// fuzzy_leven is the SQL function computing the Levenshtein distance
// between its two text arguments.
//
// Raises an SQL error when either argument reads back as NULL or contains
// non-ASCII bytes; otherwise sets the integer result returned by
// levenshtein().
static void fuzzy_leven(sqlite3_context* context, int argc, sqlite3_value** argv) {
    assert(argc == 2);
    const unsigned char* lhs = sqlite3_value_text(argv[0]);
    const unsigned char* rhs = sqlite3_value_text(argv[1]);

    // Validate both inputs up front and funnel failures into one error exit.
    const char* problem = NULL;
    if (lhs == NULL || rhs == NULL) {
        problem = "arguments should not be NULL";
    } else if (!(is_ascii(lhs) && is_ascii(rhs))) {
        problem = "arguments should be ASCII strings";
    }
    if (problem != NULL) {
        sqlite3_result_error(context, problem, -1);
        return;
    }

    unsigned dist = levenshtein((const char*)lhs, (const char*)rhs);
    sqlite3_result_int(context, dist);
}
// fuzzy_osadist is the SQL function computing the Optimal String Alignment
// distance between its two text arguments.
//
// Raises an SQL error when either argument reads back as NULL or contains
// non-ASCII bytes; otherwise sets the integer result returned by
// optimal_string_alignment().
static void fuzzy_osadist(sqlite3_context* context, int argc, sqlite3_value** argv) {
    assert(argc == 2);
    const unsigned char* lhs = sqlite3_value_text(argv[0]);
    const unsigned char* rhs = sqlite3_value_text(argv[1]);

    // Validate both inputs up front and funnel failures into one error exit.
    const char* problem = NULL;
    if (lhs == NULL || rhs == NULL) {
        problem = "arguments should not be NULL";
    } else if (!(is_ascii(lhs) && is_ascii(rhs))) {
        problem = "arguments should be ASCII strings";
    }
    if (problem != NULL) {
        sqlite3_result_error(context, problem, -1);
        return;
    }

    unsigned dist = optimal_string_alignment((const char*)lhs, (const char*)rhs);
    sqlite3_result_int(context, dist);
}
// fuzzy_soundex is the SQL function returning the Soundex code of its
// single text argument.
//
// When the argument reads back as NULL no result is set. A non-ASCII
// argument raises an SQL error. Otherwise the string returned by soundex()
// becomes the text result, and SQLite releases it with free().
static void fuzzy_soundex(sqlite3_context* context, int argc, sqlite3_value** argv) {
    assert(argc == 1);
    const unsigned char* text = sqlite3_value_text(argv[0]);
    if (text == NULL) {
        return;
    }

    if (is_ascii(text)) {
        char* code = soundex((const char*)text);
        sqlite3_result_text(context, code, -1, free);
    } else {
        sqlite3_result_error(context, "argument should be ASCII string", -1);
    }
}
// fuzzy_rsoundex is the SQL function returning the Refined Soundex code of
// its single text argument.
//
// When the argument reads back as NULL no result is set. A non-ASCII
// argument raises an SQL error. Otherwise the string returned by
// refined_soundex() becomes the text result, and SQLite releases it with
// free().
static void fuzzy_rsoundex(sqlite3_context* context, int argc, sqlite3_value** argv) {
    assert(argc == 1);
    const unsigned char* text = sqlite3_value_text(argv[0]);
    if (text == NULL) {
        return;
    }

    if (is_ascii(text)) {
        char* code = refined_soundex((const char*)text);
        sqlite3_result_text(context, code, -1, free);
    } else {
        sqlite3_result_error(context, "argument should be ASCII string", -1);
    }
}
// Below are functions extracted from the spellfix SQLite extension
// https://www.sqlite.org/src/file/ext/misc/spellfix.c
/*
** fuzzy_phonetic(X)
**
** SQL function that generates a "phonetic hash" from the text X by
** delegating to phonetic_hash(). According to the original spellfix
** description, the hash is built by these rules:
**
** * Map characters by character class
** * Omit double-letters
** * Omit vowels beside R and L
** * Omit T when followed by CH
** * Omit W when followed by R
** * Omit D when followed by J or G
** * Omit K in KN or G in GN at the beginning of a word
**
** If X reads back as NULL no result is set. If phonetic_hash() returns
** NULL, an out-of-memory error is reported to SQLite. Otherwise the hash
** becomes the text result.
**
** NOTE(review): the result buffer is released with free(), so
** phonetic_hash() is expected to allocate with malloc() rather than
** sqlite3_malloc() -- confirm against its implementation.
*/
static void fuzzy_phonetic(sqlite3_context* context, int argc, sqlite3_value** argv) {
    (void)argc;
    const unsigned char* input = sqlite3_value_text(argv[0]);
    if (input == NULL) {
        return;
    }

    // Fetch the byte length only after the text pointer has been obtained.
    const int input_len = sqlite3_value_bytes(argv[0]);
    unsigned char* hash = phonetic_hash(input, input_len);
    if (hash != NULL) {
        sqlite3_result_text(context, (char*)hash, -1, free);
        return;
    }
    sqlite3_result_error_nomem(context);
}
/*
** fuzzy_editdist(A,B)
**
** SQL function returning the cost of transforming string A into string B,
** as computed by edit_distance(). Both strings must be pure ASCII text.
** If A ends with '*' then it is assumed to be a prefix of B and extra
** characters on the end of B have minimal additional cost.
**
** Negative return values from edit_distance() are mapped to SQL errors:
**   -3     -> out of memory
**   -2     -> non-ASCII input
**   other  -> NULL input
*/
static void fuzzy_editdist(sqlite3_context* context, int argc, sqlite3_value** argv) {
    (void)argc;
    const char* from = (const char*)sqlite3_value_text(argv[0]);
    const char* to = (const char*)sqlite3_value_text(argv[1]);

    const int cost = edit_distance(from, to, 0);
    if (cost >= 0) {
        sqlite3_result_int(context, cost);
        return;
    }

    switch (cost) {
        case -3:
            sqlite3_result_error_nomem(context);
            break;
        case -2:
            sqlite3_result_error(context, "non-ASCII input to editdist()", -1);
            break;
        default:
            sqlite3_result_error(context, "NULL input to editdist()", -1);
            break;
    }
}
/*
** fuzzy_translit(X)
**
** SQL function that converts a string containing non-ASCII Roman
** characters into pure ASCII by delegating to transliterate().
**
** If transliterate() returns NULL, an out-of-memory error is reported to
** SQLite. Otherwise the converted string becomes the text result, and
** SQLite releases it with free().
*/
static void fuzzy_translit(sqlite3_context* context, int argc, sqlite3_value** argv) {
    (void)argc;
    const unsigned char* input = sqlite3_value_text(argv[0]);
    const int input_len = sqlite3_value_bytes(argv[0]);

    unsigned char* ascii = transliterate(input, input_len);
    if (ascii == NULL) {
        sqlite3_result_error_nomem(context);
        return;
    }
    sqlite3_result_text(context, (char*)ascii, -1, free);
}
/*
** fuzzy_script(X)
**
** SQL function that tries to determine the dominant script used by the
** word X and returns its ISO 15924 numeric code, as computed by
** script_code().
**
** The current implementation only understands the following scripts:
**
**    215 (Latin)
**    220 (Cyrillic)
**    200 (Greek)
**
** The result is 998 if X contains characters from two or more of the
** above scripts, or 999 if X contains no characters from any of them.
*/
static void fuzzy_script(sqlite3_context* context, int argc, sqlite3_value** argv) {
    (void)argc;
    const unsigned char* word = sqlite3_value_text(argv[0]);
    const int word_len = sqlite3_value_bytes(argv[0]);

    const int code = script_code(word, word_len);
    sqlite3_result_int(context, code);
}
// Below are custom functions
// fuzzy_caver is the SQL function returning the Caverphone code of its
// single text argument.
//
// When the argument reads back as NULL no result is set. A non-ASCII
// argument raises an SQL error. Otherwise the string returned by
// caverphone() becomes the text result, and SQLite releases it with free().
static void fuzzy_caver(sqlite3_context* context, int argc, sqlite3_value** argv) {
    assert(argc == 1);
    const unsigned char* text = sqlite3_value_text(argv[0]);
    if (text == NULL) {
        return;
    }

    if (is_ascii(text)) {
        char* code = caverphone((const char*)text);
        sqlite3_result_text(context, code, -1, free);
    } else {
        sqlite3_result_error(context, "argument should be ASCII string", -1);
    }
}
// fuzzy_init registers every fuzzy-matching SQL function on the database
// connection `db`, each under its `fuzzy_*` name and under a shorter alias.
//
// All functions are registered with the SQLITE_UTF8, SQLITE_INNOCUOUS and
// SQLITE_DETERMINISTIC flags.
//
// Registration is attempted for every function even when an earlier one
// fails, so as many functions as possible end up available.
//
// Returns SQLITE_OK when every registration succeeds; otherwise the result
// code of the first sqlite3_create_function() call that failed, instead of
// silently reporting success.
int fuzzy_init(sqlite3* db) {
    static const int flags = SQLITE_UTF8 | SQLITE_INNOCUOUS | SQLITE_DETERMINISTIC;
    int rc = SQLITE_OK;

// Registers one scalar function and records the first failure code in `rc`.
#define FUZZY_REGISTER(name, nargs, impl)                                          \
    do {                                                                           \
        int rc_one_ =                                                              \
            sqlite3_create_function(db, (name), (nargs), flags, 0, (impl), 0, 0);  \
        if (rc == SQLITE_OK) {                                                     \
            rc = rc_one_;                                                          \
        }                                                                          \
    } while (0)

    // libstrcmp
    FUZZY_REGISTER("fuzzy_damlev", 2, fuzzy_damlev);
    FUZZY_REGISTER("dlevenshtein", 2, fuzzy_damlev);
    FUZZY_REGISTER("fuzzy_hamming", 2, fuzzy_hamming);
    FUZZY_REGISTER("hamming", 2, fuzzy_hamming);
    FUZZY_REGISTER("fuzzy_jarowin", 2, fuzzy_jarowin);
    FUZZY_REGISTER("jaro_winkler", 2, fuzzy_jarowin);
    FUZZY_REGISTER("fuzzy_leven", 2, fuzzy_leven);
    FUZZY_REGISTER("levenshtein", 2, fuzzy_leven);
    FUZZY_REGISTER("fuzzy_osadist", 2, fuzzy_osadist);
    FUZZY_REGISTER("osa_distance", 2, fuzzy_osadist);
    FUZZY_REGISTER("fuzzy_soundex", 1, fuzzy_soundex);
    FUZZY_REGISTER("soundex", 1, fuzzy_soundex);
    FUZZY_REGISTER("fuzzy_rsoundex", 1, fuzzy_rsoundex);
    FUZZY_REGISTER("rsoundex", 1, fuzzy_rsoundex);

    // spellfix
    FUZZY_REGISTER("fuzzy_editdist", 2, fuzzy_editdist);
    FUZZY_REGISTER("edit_distance", 2, fuzzy_editdist);
    FUZZY_REGISTER("fuzzy_phonetic", 1, fuzzy_phonetic);
    FUZZY_REGISTER("phonetic_hash", 1, fuzzy_phonetic);
    FUZZY_REGISTER("fuzzy_script", 1, fuzzy_script);
    FUZZY_REGISTER("script_code", 1, fuzzy_script);
    FUZZY_REGISTER("fuzzy_translit", 1, fuzzy_translit);
    FUZZY_REGISTER("translit", 1, fuzzy_translit);

    // custom
    FUZZY_REGISTER("fuzzy_caver", 1, fuzzy_caver);
    FUZZY_REGISTER("caverphone", 1, fuzzy_caver);

#undef FUZZY_REGISTER

    return rc;
}