2012-05-25 17:50:19 +00:00
/*
* * 2012 May 24
* *
* * The author disclaims copyright to this source code . In place of
* * a legal notice , here is a blessing :
* *
* * May you do good and not evil .
* * May you find forgiveness for yourself and forgive others .
* * May you share freely , never taking more than you give .
* *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* *
* * Implementation of the " unicode " full - text - search tokenizer .
*/
2014-07-03 12:18:22 +00:00
# ifndef SQLITE_DISABLE_FTS3_UNICODE
2012-05-26 18:28:14 +00:00
2012-05-25 17:50:19 +00:00
# include "fts3Int.h"
# if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
# include <assert.h>
# include <stdlib.h>
# include <stdio.h>
# include <string.h>
# include "fts3_tokenizer.h"
/*
* * The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
* * from the sqlite3 source file utf . c . If this file is compiled as part
* * of the amalgamation , they are not required .
*/
# ifndef SQLITE_AMALGAMATION
/*
** Translation table consulted by READ_UTF8 for the first byte of a
** multi-byte UTF-8 sequence.  The table is indexed by (firstByte - 0xC0)
** and yields the low-order payload bits carried by that byte, i.e. the
** byte value with its leading length-marker bits stripped.  The entries
** for 0xFE and 0xFF are zero.
*/
static const unsigned char sqlite3Utf8Trans1[] = {
  /* First bytes 0xC0..0xCF */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  /* First bytes 0xD0..0xDF */
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  /* First bytes 0xE0..0xEF */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  /* First bytes 0xF0..0xF7 */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  /* First bytes 0xF8..0xFB, 0xFC..0xFD, then 0xFE and 0xFF */
  0x00, 0x01, 0x02, 0x03,   0x00, 0x01,   0x00, 0x00,
};
/*
** READ_UTF8(zIn, zTerm, c)
**
** Decode a single UTF-8 encoded codepoint starting at *zIn and store it
** in c, advancing zIn past the bytes consumed.
**
**   zIn    Modifiable pointer to unsigned bytes.  At least one byte is
**          always read, so the caller must ensure zIn!=zTerm beforehand.
**   zTerm  Pointer one past the last readable byte.  Continuation bytes
**          (10xxxxxx) are consumed only while zIn!=zTerm.
**   c      Unsigned integer lvalue that receives the codepoint.
**
** Values that decode to less than 0x80 from a multi-byte form, values in
** the range 0xD800..0xDFFF, and the values 0xFFFE and 0xFFFF are replaced
** by 0xFFFD.
**
** The body is wrapped in do{...}while(0) so that an invocation followed
** by a semicolon is exactly one statement and can be used safely as the
** body of an unbraced if/else.  Arguments are evaluated more than once.
*/
#define READ_UTF8(zIn, zTerm, c)                                 \
  do{                                                            \
    (c) = *((zIn)++);                                            \
    if( (c)>=0xc0 ){                                             \
      (c) = sqlite3Utf8Trans1[(c)-0xc0];                         \
      while( (zIn)!=(zTerm) && (*(zIn) & 0xc0)==0x80 ){          \
        (c) = ((c)<<6) + (0x3f & *((zIn)++));                    \
      }                                                          \
      if( (c)<0x80                                               \
       || ((c)&0xFFFFF800)==0xD800                               \
       || ((c)&0xFFFFFFFE)==0xFFFE ){ (c) = 0xFFFD; }            \
    }                                                            \
  }while(0)
/*
** WRITE_UTF8(zOut, c)
**
** Append the UTF-8 encoding of codepoint c at *zOut and advance zOut past
** the bytes written.  Between one and four bytes are stored:
**
**   c < 0x80        1 byte
**   c < 0x800       2 bytes
**   c < 0x10000     3 bytes
**   otherwise       4 bytes
**
** The caller must guarantee that four bytes of space are available.
**
** The body is wrapped in do{...}while(0) so that an invocation followed
** by a semicolon is exactly one statement and can be used safely as the
** body of an unbraced if/else.  Arguments are evaluated more than once.
*/
#define WRITE_UTF8(zOut, c)                                      \
  do{                                                            \
    if( (c)<0x00080 ){                                           \
      *(zOut)++ = (u8)((c)&0xFF);                                \
    }else if( (c)<0x00800 ){                                     \
      *(zOut)++ = 0xC0 + (u8)(((c)>>6)&0x1F);                    \
      *(zOut)++ = 0x80 + (u8)((c) & 0x3F);                       \
    }else if( (c)<0x10000 ){                                     \
      *(zOut)++ = 0xE0 + (u8)(((c)>>12)&0x0F);                   \
      *(zOut)++ = 0x80 + (u8)(((c)>>6) & 0x3F);                  \
      *(zOut)++ = 0x80 + (u8)((c) & 0x3F);                       \
    }else{                                                       \
      *(zOut)++ = 0xF0 + (u8)(((c)>>18) & 0x07);                 \
      *(zOut)++ = 0x80 + (u8)(((c)>>12) & 0x3F);                 \
      *(zOut)++ = 0x80 + (u8)(((c)>>6) & 0x3F);                  \
      *(zOut)++ = 0x80 + (u8)((c) & 0x3F);                       \
    }                                                            \
  }while(0)
# endif /* ifndef SQLITE_AMALGAMATION */
typedef struct unicode_tokenizer unicode_tokenizer;
typedef struct unicode_cursor unicode_cursor;

/*
** A tokenizer instance, as created by unicodeCreate().
**
** Pointers to this structure are cast to and from sqlite3_tokenizer*, so
** "base" has to remain the first member.
*/
struct unicode_tokenizer {
  sqlite3_tokenizer base;         /* Base object.  Must be first */
  int eRemoveDiacritic;           /* remove_diacritics= setting (0, 1 or 2).
                                  ** Passed to sqlite3FtsUnicodeFold() */
  int nException;                 /* Number of entries in aiException[] */
  int *aiException;               /* Ascending array of codepoints whose
                                  ** sqlite3FtsUnicodeIsalnum() result is
                                  ** inverted during tokenization */
};
/*
** A cursor used to walk through the tokens of one input string.  Created
** by unicodeOpen() and released by unicodeClose().
**
** Pointers to this structure are cast to and from
** sqlite3_tokenizer_cursor*, so "base" has to remain the first member.
*/
struct unicode_cursor {
  sqlite3_tokenizer_cursor base;  /* Base object.  Must be first */
  const unsigned char *aInput;    /* Text being tokenized (not owned) */
  int nInput;                     /* Number of bytes in aInput[] */
  int iOff;                       /* Byte offset in aInput[] at which the
                                  ** next call to unicodeNext() resumes */
  int iToken;                     /* Position number of the next token */
  char *zToken;                   /* Buffer holding the current token text */
  int nAlloc;                     /* Bytes allocated at zToken */
};
2012-06-07 15:53:48 +00:00
/*
** Release a tokenizer object that was allocated by unicodeCreate(),
** including its aiException[] array.  Passing a NULL pointer is a
** harmless no-op.  SQLITE_OK is always returned.
*/
static int unicodeDestroy(sqlite3_tokenizer *pTokenizer){
  unicode_tokenizer *pTok = (unicode_tokenizer *)pTokenizer;
  if( pTok==0 ) return SQLITE_OK;
  sqlite3_free(pTok->aiException);
  sqlite3_free(pTok);
  return SQLITE_OK;
}
/*
** As part of a tokenchars= or separators= option, the CREATE VIRTUAL TABLE
** statement has specified that the tokenizer for this table shall consider
** all characters in string zIn/nIn to be separators (if bAlnum==0) or
** token characters (if bAlnum==1).
**
** For each codepoint in the zIn/nIn string, this function checks whether
** sqlite3FtsUnicodeIsalnum() already returns the desired result.  If so,
** no action is taken.  Otherwise, the codepoint is added to the
** unicode_tokenizer.aiException[] array, which is kept in ascending order.
** For the purposes of tokenization, the return value of
** sqlite3FtsUnicodeIsalnum() is inverted for all codepoints in the
** aiException[] array.
**
** If a standalone diacritic mark (one that sqlite3FtsUnicodeIsdiacritic()
** identifies as a diacritic) occurs in the zIn/nIn string it is ignored.
** It is not possible to change the behavior of the tokenizer with respect
** to these codepoints.
**
** Returns SQLITE_OK on success or SQLITE_NOMEM if the array cannot be
** enlarged.  On SQLITE_NOMEM the tokenizer is left unmodified.
*/
static int unicodeAddExceptions(
  unicode_tokenizer *p,           /* Tokenizer to add exceptions to */
  int bAlnum,                     /* Replace Isalnum() return value with this */
  const char *zIn,                /* Array of characters to make exceptions */
  int nIn                         /* Length of zIn in bytes */
){
  const unsigned char *zFirst = (const unsigned char *)zIn;
  const unsigned char *zTerm = &zFirst[nIn];
  const unsigned char *zCsr;      /* Read position within zIn */
  unsigned int iCode;             /* Codepoint most recently decoded */
  int nAdd = 0;                   /* Number of exceptions to be added */
  int *aNew;                      /* Enlarged aiException[] array */
  int nNew;                       /* Number of valid entries in aNew[] */

  assert( bAlnum==0 || bAlnum==1 );

  /* Pass 1: count the codepoints that require an exception entry so that
  ** the array can be grown with a single allocation. */
  zCsr = zFirst;
  while( zCsr<zTerm ){
    READ_UTF8(zCsr, zTerm, iCode);
    assert( (sqlite3FtsUnicodeIsalnum((int)iCode) & 0xFFFFFFFE)==0 );
    if( sqlite3FtsUnicodeIsalnum((int)iCode)==bAlnum ) continue;
    if( sqlite3FtsUnicodeIsdiacritic((int)iCode) ) continue;
    nAdd++;
  }
  if( nAdd==0 ) return SQLITE_OK;

  aNew = sqlite3_realloc64(p->aiException, (p->nException+nAdd)*sizeof(int));
  if( aNew==0 ) return SQLITE_NOMEM;
  nNew = p->nException;

  /* Pass 2: decode the string again, inserting each qualifying codepoint
  ** at the position that keeps aNew[] sorted in ascending order. */
  zCsr = zFirst;
  while( zCsr<zTerm ){
    int iIns;                     /* Index at which iCode is inserted */

    READ_UTF8(zCsr, zTerm, iCode);
    if( sqlite3FtsUnicodeIsalnum((int)iCode)==bAlnum ) continue;
    if( sqlite3FtsUnicodeIsdiacritic((int)iCode) ) continue;

    iIns = 0;
    while( iIns<nNew && aNew[iIns]<(int)iCode ) iIns++;
    memmove(&aNew[iIns+1], &aNew[iIns], (size_t)(nNew-iIns)*sizeof(int));
    aNew[iIns] = (int)iCode;
    nNew++;
  }

  p->aiException = aNew;
  p->nException = nNew;
  return SQLITE_OK;
}
/*
** Return 1 if the p->aiException[] array contains the value iCode, or 0
** if it does not.  The array is binary-searched, so it must be sorted in
** ascending order.  An empty array always yields 0.
*/
static int unicodeIsException(unicode_tokenizer *p, int iCode){
  const int *aSorted = p->aiException;
  int iFirst = 0;                 /* Lowest index still in play */
  int iLast = p->nException - 1;  /* Highest index still in play */

  while( iFirst<=iLast ){
    int iMid = (iLast + iFirst) / 2;
    int iVal = aSorted[iMid];
    if( iVal==iCode ) return 1;
    if( iVal<iCode ){
      iFirst = iMid + 1;
    }else{
      iLast = iMid - 1;
    }
  }
  return 0;
}
/*
** Return 1 if, for the purposes of tokenization, codepoint iCode is
** considered a token character, or 0 if it is a separator.
**
** The answer is the result of sqlite3FtsUnicodeIsalnum(), flipped when
** iCode appears in the tokenizer's aiException[] array.
*/
static int unicodeIsAlnum(unicode_tokenizer *p, int iCode){
  int bBuiltin = sqlite3FtsUnicodeIsalnum(iCode);
  assert( (bBuiltin & 0xFFFFFFFE)==0 );
  return bBuiltin ^ unicodeIsException(p, iCode);
}
2012-05-25 17:50:19 +00:00
/*
** Create a new tokenizer instance.
**
** Each of the nArg strings in azArg[] must be one of the following
** options, matched exactly (no surrounding whitespace):
**
**   remove_diacritics=0
**   remove_diacritics=1      (the default)
**   remove_diacritics=2
**   tokenchars=<chars>       treat each of <chars> as a token character
**   separators=<chars>       treat each of <chars> as a separator
**
** On success SQLITE_OK is returned and *pp is set to the new tokenizer,
** which the caller eventually releases with unicodeDestroy().  If an
** argument is not recognized SQLITE_ERROR is returned, and if memory runs
** out while processing arguments SQLITE_NOMEM is returned; in both cases
** the partially built object is freed and *pp is set to NULL.  If the
** initial allocation fails SQLITE_NOMEM is returned and *pp is not
** written.
*/
static int unicodeCreate(
  int nArg,                       /* Size of array azArg[] */
  const char * const *azArg,      /* Tokenizer creation arguments */
  sqlite3_tokenizer **pp          /* OUT: New tokenizer handle */
){
  unicode_tokenizer *pNew;        /* New tokenizer object */
  int i;
  int rc = SQLITE_OK;

  pNew = (unicode_tokenizer *)sqlite3_malloc(sizeof(unicode_tokenizer));
  if( pNew==NULL ) return SQLITE_NOMEM;
  memset(pNew, 0, sizeof(unicode_tokenizer));
  pNew->eRemoveDiacritic = 1;

  /* The loop stops at the first argument that fails */
  for(i=0; rc==SQLITE_OK && i<nArg; i++){
    const char *z = azArg[i];
    int n = (int)strlen(z);

    /* The literals below must contain exactly the option text: 19 bytes
    ** for the remove_diacritics forms and an 11 byte prefix for the other
    ** two.  Any padding inside the quotes would make the memcmp() calls
    ** fail for every valid argument. */
    if( n==19 && memcmp("remove_diacritics=1", z, 19)==0 ){
      pNew->eRemoveDiacritic = 1;
    }
    else if( n==19 && memcmp("remove_diacritics=0", z, 19)==0 ){
      pNew->eRemoveDiacritic = 0;
    }
    else if( n==19 && memcmp("remove_diacritics=2", z, 19)==0 ){
      pNew->eRemoveDiacritic = 2;
    }
    else if( n>=11 && memcmp("tokenchars=", z, 11)==0 ){
      rc = unicodeAddExceptions(pNew, 1, &z[11], n-11);
    }
    else if( n>=11 && memcmp("separators=", z, 11)==0 ){
      rc = unicodeAddExceptions(pNew, 0, &z[11], n-11);
    }
    else{
      /* Unrecognized argument */
      rc = SQLITE_ERROR;
    }
  }

  if( rc!=SQLITE_OK ){
    unicodeDestroy((sqlite3_tokenizer *)pNew);
    pNew = 0;
  }
  *pp = (sqlite3_tokenizer *)pNew;
  return rc;
}
/*
** Prepare to begin tokenizing a particular string.  The input string to
** be tokenized is aInput[0..nInput-1].  A cursor used to incrementally
** tokenize this string is returned in *pp.
**
** If aInput is NULL the cursor is set up over an empty string.  If nInput
** is negative the length is taken from strlen(aInput).  The input text is
** not copied, so it must remain valid for as long as the cursor is used.
**
** Returns SQLITE_OK on success, or SQLITE_NOMEM if the cursor cannot be
** allocated, in which case *pp is not written.
**
** NOTE(review): this function does not set the cursor's base.pTokenizer
** field, which unicodeNext() reads; presumably the caller fills it in.
*/
static int unicodeOpen(
  sqlite3_tokenizer *p,           /* The tokenizer (unused) */
  const char *aInput,             /* Input string */
  int nInput,                     /* Size of string aInput in bytes */
  sqlite3_tokenizer_cursor **pp   /* OUT: New cursor object */
){
  unicode_cursor *pCsr;

  pCsr = (unicode_cursor *)sqlite3_malloc(sizeof(unicode_cursor));
  if( pCsr==0 ){
    return SQLITE_NOMEM;
  }
  memset(pCsr, 0, sizeof(unicode_cursor));

  if( aInput==0 ){
    /* Point at a valid (empty) string so that address arithmetic on
    ** aInput in unicodeNext() never involves a NULL pointer. */
    pCsr->aInput = (const unsigned char *)"";
    pCsr->nInput = 0;
  }else{
    pCsr->aInput = (const unsigned char *)aInput;
    pCsr->nInput = (nInput<0) ? (int)strlen(aInput) : nInput;
  }

  *pp = &pCsr->base;
  UNUSED_PARAMETER(p);
  return SQLITE_OK;
}
/*
** Close a tokenization cursor previously opened by a call to
** unicodeOpen() above.  The token buffer and the cursor object itself are
** both freed.  SQLITE_OK is always returned.
*/
static int unicodeClose(sqlite3_tokenizer_cursor *pCursor){
  unicode_cursor *pDoomed = (unicode_cursor *)pCursor;
  sqlite3_free(pDoomed->zToken);
  sqlite3_free(pDoomed);
  return SQLITE_OK;
}
/*
* * Extract the next token from a tokenization cursor . The cursor must
* * have been opened by a prior call to simpleOpen ( ) .
*/
static int unicodeNext (
2012-06-07 15:53:48 +00:00
sqlite3_tokenizer_cursor * pC , /* Cursor returned by simpleOpen */
2012-05-25 17:50:19 +00:00
const char * * paToken , /* OUT: Token text */
int * pnToken , /* OUT: Number of bytes at *paToken */
int * piStart , /* OUT: Starting offset of token */
int * piEnd , /* OUT: Ending offset of token */
int * piPos /* OUT: Position integer of token */
) {
2012-06-07 15:53:48 +00:00
unicode_cursor * pCsr = ( unicode_cursor * ) pC ;
unicode_tokenizer * p = ( ( unicode_tokenizer * ) pCsr - > base . pTokenizer ) ;
2017-03-20 18:53:32 +00:00
unsigned int iCode = 0 ;
2012-05-25 17:50:19 +00:00
char * zOut ;
const unsigned char * z = & pCsr - > aInput [ pCsr - > iOff ] ;
const unsigned char * zStart = z ;
const unsigned char * zEnd ;
const unsigned char * zTerm = & pCsr - > aInput [ pCsr - > nInput ] ;
/* Scan past any delimiter characters before the start of the next token.
* * Return SQLITE_DONE early if this takes us all the way to the end of
* * the input . */
while ( z < zTerm ) {
READ_UTF8 ( z , zTerm , iCode ) ;
2017-03-20 18:53:32 +00:00
if ( unicodeIsAlnum ( p , ( int ) iCode ) ) break ;
2012-05-25 17:50:19 +00:00
zStart = z ;
}
if ( zStart > = zTerm ) return SQLITE_DONE ;
zOut = pCsr - > zToken ;
do {
2012-06-06 19:30:38 +00:00
int iOut ;
2012-05-25 17:50:19 +00:00
/* Grow the output buffer if required. */
if ( ( zOut - pCsr - > zToken ) > = ( pCsr - > nAlloc - 4 ) ) {
2019-01-08 20:02:48 +00:00
char * zNew = sqlite3_realloc64 ( pCsr - > zToken , pCsr - > nAlloc + 64 ) ;
2012-05-25 17:50:19 +00:00
if ( ! zNew ) return SQLITE_NOMEM ;
zOut = & zNew [ zOut - pCsr - > zToken ] ;
pCsr - > zToken = zNew ;
2012-05-25 19:50:12 +00:00
pCsr - > nAlloc + = 64 ;
2012-05-25 17:50:19 +00:00
}
/* Write the folded case of the last character read to the output */
zEnd = z ;
2018-12-03 16:14:49 +00:00
iOut = sqlite3FtsUnicodeFold ( ( int ) iCode , p - > eRemoveDiacritic ) ;
2012-06-06 19:30:38 +00:00
if ( iOut ) {
WRITE_UTF8 ( zOut , iOut ) ;
}
2012-05-25 17:50:19 +00:00
/* If the cursor is not at EOF, read the next character */
if ( z > = zTerm ) break ;
READ_UTF8 ( z , zTerm , iCode ) ;
2017-03-20 18:53:32 +00:00
} while ( unicodeIsAlnum ( p , ( int ) iCode )
| | sqlite3FtsUnicodeIsdiacritic ( ( int ) iCode )
2012-06-06 19:30:38 +00:00
) ;
2012-05-25 17:50:19 +00:00
/* Set the output variables and return. */
2014-07-19 15:44:29 +00:00
pCsr - > iOff = ( int ) ( z - pCsr - > aInput ) ;
2012-05-25 17:50:19 +00:00
* paToken = pCsr - > zToken ;
2014-07-19 15:44:29 +00:00
* pnToken = ( int ) ( zOut - pCsr - > zToken ) ;
* piStart = ( int ) ( zStart - pCsr - > aInput ) ;
* piEnd = ( int ) ( zEnd - pCsr - > aInput ) ;
2012-05-25 17:50:19 +00:00
* piPos = pCsr - > iToken + + ;
return SQLITE_OK ;
}
/*
** Set *ppModule to a pointer to the sqlite3_tokenizer_module
** structure for the unicode tokenizer.
**
** The structure has static storage duration, so the pointer stays valid
** after this function returns.  Its members are initialized positionally,
** in the order the structure declares them.
*/
void sqlite3Fts3UnicodeTokenizer(sqlite3_tokenizer_module const **ppModule){
  static const sqlite3_tokenizer_module unicodeModule = {
    0,                            /* First member: left as zero */
    unicodeCreate,                /* Create a tokenizer instance */
    unicodeDestroy,               /* Destroy a tokenizer instance */
    unicodeOpen,                  /* Open a cursor on an input string */
    unicodeClose,                 /* Close a cursor */
    unicodeNext,                  /* Return the next token */
    0,                            /* Last member: not provided */
  };
  *ppModule = &unicodeModule;
}
# endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
2014-07-03 12:18:22 +00:00
# endif /* ifndef SQLITE_DISABLE_FTS3_UNICODE */