mirror of
https://github.com/tursodatabase/libsql.git
synced 2025-03-08 23:41:50 +00:00
A common complain with libSQL is how to run extensions. The main mechanism, with a .so, has a lot of issues around how those .so are distributed. The most common extensions are the ones in the sqlean package. We can improve this experience by bundling them in our sqlite build. Not all SQLean extensions are kosher: some of them, like fileio, use the vfs. Others, are deemed too complex. The extensions included here are a subset that we deem important enough, and low risk enough, to just be a part of the main bundle.
135 lines
3.8 KiB
C
135 lines
3.8 KiB
C
// Copyright (c) 2014 Ross Bayer, MIT License
|
|
// https://github.com/Rostepher/libstrcmp
|
|
|
|
#include <assert.h>
|
|
#include <stdbool.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
#include "fuzzy/common.h"
|
|
|
|
/// Calculates and returns the Jaro distance of two non NULL strings.
|
|
/// More information about the algorithm can be found here:
|
|
/// http://en.wikipedia.org/wiki/Jaro-Winkler_distance
|
|
///
|
|
/// @param str1 first non NULL string
|
|
/// @param str2 second non NULL string
|
|
///
|
|
/// @returns the jaro distance of str1 and str2
|
|
double jaro(const char* str1, const char* str2) {
|
|
// strings cannot be NULL
|
|
assert(str1 != NULL);
|
|
assert(str2 != NULL);
|
|
|
|
int str1_len = strlen(str1);
|
|
int str2_len = strlen(str2);
|
|
|
|
// if both strings are empty return 1
|
|
// if only one of the strings is empty return 0
|
|
if (str1_len == 0) {
|
|
return (str2_len == 0) ? 1.0 : 0.0;
|
|
}
|
|
|
|
// max distance between two chars to be considered matching
|
|
// floor() is ommitted due to integer division rules
|
|
int match_dist = (int)MAX(str1_len, str2_len) / 2 - 1;
|
|
|
|
// arrays of bools that signify if that char in the matcing string has a
|
|
// match
|
|
int* str1_matches = calloc(str1_len, sizeof(int));
|
|
int* str2_matches = calloc(str2_len, sizeof(int));
|
|
|
|
// number of matches and transpositions
|
|
double matches = 0.0;
|
|
double trans = 0.0;
|
|
|
|
// find the matches
|
|
for (int i = 0; i < str1_len; i++) {
|
|
// start and end take into account the match distance
|
|
int start = MAX(0, i - match_dist);
|
|
int end = MIN(i + match_dist + 1, str2_len);
|
|
|
|
for (int k = start; k < end; k++) {
|
|
// if str2 already has a match or str1 and str2 are not equal
|
|
// continue
|
|
if (str2_matches[k] || NOT_EQ(str1[i], str2[k])) {
|
|
continue;
|
|
}
|
|
|
|
// otherwise assume there is a match
|
|
str1_matches[i] = true;
|
|
str2_matches[k] = true;
|
|
matches++;
|
|
break;
|
|
}
|
|
}
|
|
|
|
// if there are no matches return 0
|
|
if (matches == 0) {
|
|
free(str1_matches);
|
|
free(str2_matches);
|
|
return 0.0;
|
|
}
|
|
|
|
// count transpositions
|
|
int k = 0;
|
|
for (int i = 0; i < str1_len; i++) {
|
|
// if there are no matches in str1 continue
|
|
if (!str1_matches[i]) {
|
|
continue;
|
|
}
|
|
|
|
// while there is no match in str2 increment k
|
|
while (!str2_matches[k]) {
|
|
k++;
|
|
}
|
|
|
|
// increment trans
|
|
if (NOT_EQ(str1[i], str2[k])) {
|
|
trans++;
|
|
}
|
|
|
|
k++;
|
|
}
|
|
|
|
// divide the number of transpositions by two as per the algorithm specs
|
|
// this division is valid because the counted transpositions include both
|
|
// instances of the transposed characters.
|
|
trans /= 2.0;
|
|
|
|
// free allocated memory
|
|
free(str1_matches);
|
|
free(str2_matches);
|
|
|
|
// return the jaro distance
|
|
return ((matches / str1_len) + (matches / str2_len) + ((matches - trans) / matches)) / 3.0;
|
|
}
|
|
|
|
/// Calculates and returns the Jaro-Winkler distance of two non NULL strings.
|
|
/// More information about the algorithm can be found here:
|
|
/// http://en.wikipedia.org/wiki/Jaro-Winkler_distance
|
|
///
|
|
/// @param str1 first non NULL string
|
|
/// @param str2 second non NULL string
|
|
///
|
|
/// @returns the jaro-winkler distance of str1 and str2
|
|
double jaro_winkler(const char* str1, const char* str2) {
|
|
// strings cannot be NULL
|
|
assert(str1 != NULL);
|
|
assert(str2 != NULL);
|
|
|
|
// compute the jaro distance
|
|
double dist = jaro(str1, str2);
|
|
|
|
// finds the number of common terms in the first 3 strings, max 3.
|
|
int prefix_length = 0;
|
|
if (strlen(str1) != 0 && strlen(str2) != 0) {
|
|
while (prefix_length < 3 && EQ(*str1++, *str2++)) {
|
|
prefix_length++;
|
|
}
|
|
}
|
|
|
|
// 0.1 is the default scaling factor
|
|
return dist + prefix_length * 0.1 * (1 - dist);
|
|
}
|