2007-08-20 17:37:02 +00:00
/*
* * 2006 Oct 10
* *
* * The author disclaims copyright to this source code . In place of
* * a legal notice , here is a blessing :
* *
* * May you do good and not evil .
* * May you find forgiveness for yourself and forgive others .
* * May you share freely , never taking more than you give .
* *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* *
* * Implementation of the " simple " full - text - search tokenizer .
*/
/*
* * The code in this file is only compiled if :
* *
* * * The FTS3 module is being built as an extension
* * ( in which case SQLITE_CORE is not defined ) , or
* *
* * * The FTS3 module is being built into the core of
* * SQLite ( in which case SQLITE_ENABLE_FTS3 is defined ) .
*/
2009-12-03 06:26:46 +00:00
# include "fts3Int.h"
2011-06-16 00:54:45 +00:00
# if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
2007-08-20 17:37:02 +00:00
# include <assert.h>
# include <stdlib.h>
# include <stdio.h>
# include <string.h>
# include "fts3_tokenizer.h"
/*
** Tokenizer object for the "simple" tokenizer.  The generic
** sqlite3_tokenizer header comes first so that a pointer to this
** structure can be handed out as a pointer to its base member.
*/
typedef struct simple_tokenizer {
  sqlite3_tokenizer base;      /* Generic tokenizer header; must be first */
  char delim[128];             /* delim[c]!=0 if ASCII byte c is a delimiter */
} simple_tokenizer;
/*
** Cursor used to walk through one input string and hand back its
** tokens one at a time.  As with simple_tokenizer, the generic header
** is the first member.
*/
typedef struct simple_tokenizer_cursor {
  sqlite3_tokenizer_cursor base;  /* Generic cursor header; must be first */
  const char *pInput;             /* Text being tokenized (not copied) */
  int nBytes;                     /* Number of bytes of pInput to scan */
  int iOffset;                    /* Byte offset of the next unread byte */
  int iToken;                     /* Position number of the next token */
  char *pToken;                   /* Buffer holding the current token */
  int nTokenAllocated;            /* Bytes allocated for the pToken buffer */
} simple_tokenizer_cursor;
/*
** Return 1 if byte c is flagged as a delimiter in tokenizer t, or 0 if
** it is not.  Bytes of 0x80 and above are never delimiters; the delim[]
** table only covers 7-bit values.
*/
static int simpleDelim(simple_tokenizer *t, unsigned char c){
  if( c>=0x80 ){
    return 0;
  }
  return t->delim[c]!=0;
}
2010-08-06 19:00:12 +00:00
/*
** Return 1 if x is an ASCII digit or an ASCII letter of either case,
** and 0 for every other value.  The test uses explicit character
** ranges only.
*/
static int fts3_isalnum(int x){
  if( x>='0' && x<='9' ) return 1;
  if( x>='A' && x<='Z' ) return 1;
  if( x>='a' && x<='z' ) return 1;
  return 0;
}
2007-08-20 17:37:02 +00:00
/*
** Create a new tokenizer instance and return it through *ppTokenizer.
**
** If argc is greater than 1, every byte of argv[1] is registered as a
** delimiter.  A byte of 0x80 or above in argv[1] is rejected: the new
** object is freed and SQLITE_ERROR is returned.  With argc of 1 or
** less, every ASCII value from 1 through 0x7f that is not a letter or
** a digit becomes a delimiter.
**
** Returns SQLITE_OK on success or SQLITE_NOMEM if the allocation fails.
*/
static int simpleCreate(
  int argc, const char * const *argv,
  sqlite3_tokenizer **ppTokenizer
){
  simple_tokenizer *pNew;

  pNew = (simple_tokenizer *)sqlite3_malloc(sizeof(*pNew));
  if( pNew==NULL ){
    return SQLITE_NOMEM;
  }
  memset(pNew, 0, sizeof(*pNew));

  /* TODO(shess) Delimiters need to remain the same from run to run,
  ** else we need to reindex.  One solution would be a meta-table to
  ** track such information in the database, then we'd only want this
  ** information on the initial create.
  */
  if( argc<=1 ){
    /* No delimiter list given: flag every non-alphanumeric ASCII value.
    ** Entry 0 is left clear by the memset() above. */
    int ch;
    for(ch=1; ch<0x80; ch++){
      pNew->delim[ch] = fts3_isalnum(ch) ? 0 : -1;
    }
  }else{
    const char *zDelim = argv[1];
    int nDelim = (int)strlen(zDelim);
    int k;
    for(k=0; k<nDelim; k++){
      unsigned char ch = zDelim[k];
      /* We explicitly don't support UTF-8 delimiters for now. */
      if( ch>=0x80 ){
        sqlite3_free(pNew);
        return SQLITE_ERROR;
      }
      pNew->delim[ch] = 1;
    }
  }

  *ppTokenizer = &pNew->base;
  return SQLITE_OK;
}
/*
** Destroy a tokenizer.  The object is released with sqlite3_free(),
** matching the sqlite3_malloc() call in simpleCreate().  Always
** returns SQLITE_OK.
*/
static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
  simple_tokenizer *pDoomed = (simple_tokenizer *)pTokenizer;
  sqlite3_free(pDoomed);
  return SQLITE_OK;
}
/*
** Prepare to begin tokenizing a particular string.  The input
** string to be tokenized is pInput[0..nBytes-1].  A cursor
** used to incrementally tokenize this string is returned in
** *ppCursor.
**
** A NULL pInput is treated as empty input.  A negative nBytes means
** the length is taken from strlen(pInput).  The input is not copied;
** the cursor only stores the pointer.
**
** The "base" member of the new cursor is not written here.
** NOTE(review): simpleNext() reads base.pTokenizer, so the caller is
** presumably expected to fill it in after this returns - confirm
** against the FTS3 core.
**
** Returns SQLITE_OK, or SQLITE_NOMEM if the cursor cannot be allocated.
*/
static int simpleOpen(
  sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
  const char *pInput, int nBytes,        /* String to be tokenized */
  sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
){
  simple_tokenizer_cursor *pCsr;

  UNUSED_PARAMETER(pTokenizer);

  pCsr = (simple_tokenizer_cursor *)sqlite3_malloc(sizeof(*pCsr));
  if( pCsr==NULL ){
    return SQLITE_NOMEM;
  }

  pCsr->pInput = pInput;
  pCsr->nBytes = (pInput==0) ? 0
               : (nBytes<0)  ? (int)strlen(pInput)
               :               nBytes;
  pCsr->iOffset = 0;                 /* start tokenizing at the beginning */
  pCsr->iToken = 0;
  pCsr->pToken = NULL;               /* no space allocated, yet. */
  pCsr->nTokenAllocated = 0;

  *ppCursor = &pCsr->base;
  return SQLITE_OK;
}
/*
** Close a tokenization cursor previously opened by a call to
** simpleOpen() above.  Both the token buffer and the cursor itself are
** released.  The token buffer pointer is still NULL if no token was
** ever produced.  Always returns SQLITE_OK.
*/
static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
  simple_tokenizer_cursor *pCsr = (simple_tokenizer_cursor *)pCursor;
  char *zBuf = pCsr->pToken;

  sqlite3_free(zBuf);
  sqlite3_free(pCsr);
  return SQLITE_OK;
}
/*
** Extract the next token from a tokenization cursor.  The cursor must
** have been opened by a prior call to simpleOpen().
**
** On success SQLITE_OK is returned and the OUT parameters are set:
**
**   *ppToken        Token text, with ASCII 'A'..'Z' folded to lower
**                   case.  It points into a buffer owned by the cursor
**                   that is overwritten by the next call.  Only
**                   *pnBytes bytes are written; no terminator is added.
**   *pnBytes        Number of bytes in the token.
**   *piStartOffset  Byte offset in the input of the token's first byte.
**   *piEndOffset    Byte offset just past the token's last byte.
**   *piPosition     Zero-based index of this token within the input.
**
** SQLITE_DONE is returned once the input is exhausted.  SQLITE_NOMEM
** is returned if the token buffer cannot be enlarged; the OUT
** parameters are not written in either of those cases.
*/
static int simpleNext(
  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by simpleOpen */
  const char **ppToken,               /* OUT: *ppToken is the token text */
  int *pnBytes,                       /* OUT: Number of bytes in token */
  int *piStartOffset,                 /* OUT: Starting offset of token */
  int *piEndOffset,                   /* OUT: Ending offset of token */
  int *piPosition                     /* OUT: Position integer of token */
){
  simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
  simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
  unsigned char *p = (unsigned char *)c->pInput;

  while( c->iOffset<c->nBytes ){
    int iStartOffset;

    /* Scan past delimiter characters */
    while( c->iOffset<c->nBytes && simpleDelim(t, p[c->iOffset]) ){
      c->iOffset++;
    }

    /* Count non-delimiter characters. */
    iStartOffset = c->iOffset;
    while( c->iOffset<c->nBytes && !simpleDelim(t, p[c->iOffset]) ){
      c->iOffset++;
    }

    if( c->iOffset>iStartOffset ){
      int i, n = c->iOffset-iStartOffset;
      if( n>c->nTokenAllocated ){
        /* Grow the token buffer, with some slack so that slightly
        ** longer tokens do not force another reallocation.  The new
        ** capacity is recorded only after the reallocation succeeds.
        ** Otherwise a failed call would leave nTokenAllocated larger
        ** than the buffer pToken still points at, and a later call on
        ** this cursor could write past the end of that buffer.
        */
        int nNew = n+20;
        char *pNew = sqlite3_realloc64(c->pToken, nNew);
        if( !pNew ) return SQLITE_NOMEM;
        c->pToken = pNew;
        c->nTokenAllocated = nNew;
      }
      for(i=0; i<n; i++){
        /* TODO(shess) This needs expansion to handle UTF-8
        ** case-insensitivity.
        */
        unsigned char ch = p[iStartOffset+i];
        c->pToken[i] = (char)((ch>='A' && ch<='Z') ? ch-'A'+'a' : ch);
      }
      *ppToken = c->pToken;
      *pnBytes = n;
      *piStartOffset = iStartOffset;
      *piEndOffset = c->iOffset;
      *piPosition = c->iToken++;

      return SQLITE_OK;
    }
  }
  return SQLITE_DONE;
}
/*
** The set of routines that implement the simple tokenizer.  The
** initializer is positional, so the entries must stay in the order in
** which sqlite3_tokenizer_module declares its members.
*/
static const sqlite3_tokenizer_module simpleTokenizerModule = {
  0,                 /* leading integer slot, left at zero */
  simpleCreate,      /* build a tokenizer object */
  simpleDestroy,     /* free a tokenizer object */
  simpleOpen,        /* open a cursor on an input string */
  simpleClose,       /* free a cursor */
  simpleNext,        /* fetch the next token from a cursor */
  0,                 /* trailing slot, not implemented by this tokenizer */
};
/*
** Hand back the simple tokenizer's method table.  Nothing is allocated
** here: *ppModule is set to the address of the file-scope constant
** simpleTokenizerModule.
*/
void sqlite3Fts3SimpleTokenizerModule(
  sqlite3_tokenizer_module const **ppModule
){
  *ppModule = &simpleTokenizerModule;
}
# endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */