0
0
mirror of https://github.com/tursodatabase/libsql.git synced 2025-03-08 23:41:50 +00:00
Glauber Costa d3a156caf5 bundle SQLean extensions
A common complain with libSQL is how to run extensions. The main
mechanism, with a .so, has a lot of issues around how those .so are
distributed.

The most common extensions are the ones in the sqlean package. We can
improve this experience by bundling them in our sqlite build.

Not all SQLean extensions are kosher: some of them, like fileio, use
the vfs. Others, are deemed too complex.

The extensions included here are a subset that we deem important enough,
and low risk enough, to just be a part of the main bundle.
2025-01-16 22:25:16 -05:00

135 lines
3.8 KiB
C

// Copyright (c) 2014 Ross Bayer, MIT License
// https://github.com/Rostepher/libstrcmp
#include <assert.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include "fuzzy/common.h"
/// Calculates and returns the Jaro distance of two non NULL strings.
/// More information about the algorithm can be found here:
/// http://en.wikipedia.org/wiki/Jaro-Winkler_distance
///
/// @param str1 first non NULL string
/// @param str2 second non NULL string
///
/// @returns the jaro distance of str1 and str2
double jaro(const char* str1, const char* str2) {
// strings cannot be NULL
assert(str1 != NULL);
assert(str2 != NULL);
int str1_len = strlen(str1);
int str2_len = strlen(str2);
// if both strings are empty return 1
// if only one of the strings is empty return 0
if (str1_len == 0) {
return (str2_len == 0) ? 1.0 : 0.0;
}
// max distance between two chars to be considered matching
// floor() is ommitted due to integer division rules
int match_dist = (int)MAX(str1_len, str2_len) / 2 - 1;
// arrays of bools that signify if that char in the matcing string has a
// match
int* str1_matches = calloc(str1_len, sizeof(int));
int* str2_matches = calloc(str2_len, sizeof(int));
// number of matches and transpositions
double matches = 0.0;
double trans = 0.0;
// find the matches
for (int i = 0; i < str1_len; i++) {
// start and end take into account the match distance
int start = MAX(0, i - match_dist);
int end = MIN(i + match_dist + 1, str2_len);
for (int k = start; k < end; k++) {
// if str2 already has a match or str1 and str2 are not equal
// continue
if (str2_matches[k] || NOT_EQ(str1[i], str2[k])) {
continue;
}
// otherwise assume there is a match
str1_matches[i] = true;
str2_matches[k] = true;
matches++;
break;
}
}
// if there are no matches return 0
if (matches == 0) {
free(str1_matches);
free(str2_matches);
return 0.0;
}
// count transpositions
int k = 0;
for (int i = 0; i < str1_len; i++) {
// if there are no matches in str1 continue
if (!str1_matches[i]) {
continue;
}
// while there is no match in str2 increment k
while (!str2_matches[k]) {
k++;
}
// increment trans
if (NOT_EQ(str1[i], str2[k])) {
trans++;
}
k++;
}
// divide the number of transpositions by two as per the algorithm specs
// this division is valid because the counted transpositions include both
// instances of the transposed characters.
trans /= 2.0;
// free allocated memory
free(str1_matches);
free(str2_matches);
// return the jaro distance
return ((matches / str1_len) + (matches / str2_len) + ((matches - trans) / matches)) / 3.0;
}
/// Calculates and returns the Jaro-Winkler distance of two non NULL strings.
/// More information about the algorithm can be found here:
/// http://en.wikipedia.org/wiki/Jaro-Winkler_distance
///
/// @param str1 first non NULL string
/// @param str2 second non NULL string
///
/// @returns the jaro-winkler distance of str1 and str2
double jaro_winkler(const char* str1, const char* str2) {
// strings cannot be NULL
assert(str1 != NULL);
assert(str2 != NULL);
// compute the jaro distance
double dist = jaro(str1, str2);
// finds the number of common terms in the first 3 strings, max 3.
int prefix_length = 0;
if (strlen(str1) != 0 && strlen(str2) != 0) {
while (prefix_length < 3 && EQ(*str1++, *str2++)) {
prefix_length++;
}
}
// 0.1 is the default scaling factor
return dist + prefix_length * 0.1 * (1 - dist);
}