mirror of
				https://github.com/tursodatabase/libsql.git
				synced 2025-11-04 07:08:56 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			136 lines
		
	
	
		
			5.2 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			136 lines
		
	
	
		
			5.2 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
 | 
						|
1. FTS3 Tokenizers
 | 
						|
 | 
						|
  When creating a new full-text table, FTS3 allows the user to select
 | 
						|
  the text tokenizer implementation to be used when indexing text
 | 
						|
  by specifying a "tokenize" clause as part of the CREATE VIRTUAL TABLE
 | 
						|
  statement:
 | 
						|
 | 
						|
    CREATE VIRTUAL TABLE <table-name> USING fts3(
 | 
						|
      <columns ...> [, tokenize <tokenizer-name> [<tokenizer-args>]]
 | 
						|
    );
 | 
						|
 | 
						|
  The built-in tokenizers (valid values to pass as <tokenizer-name>) are
 | 
						|
  "simple", "porter" and "unicode61".
 | 
						|
 | 
						|
  <tokenizer-args> should consist of zero or more white-space separated
 | 
						|
  arguments to pass to the selected tokenizer implementation. The 
 | 
						|
  interpretation of the arguments, if any, depends on the individual 
 | 
						|
  tokenizer.
 | 
						|
 | 
						|
2. Custom Tokenizers
 | 
						|
 | 
						|
  FTS3 allows users to provide custom tokenizer implementations. The 
 | 
						|
  interface used to create a new tokenizer is defined and described in 
 | 
						|
  the fts3_tokenizer.h source file.
 | 
						|
 | 
						|
  Registering a new FTS3 tokenizer is similar to registering a new 
 | 
						|
  virtual table module with SQLite. The user passes a pointer to a
 | 
						|
  structure containing pointers to various callback functions that
 | 
						|
  make up the implementation of the new tokenizer type. For tokenizers,
 | 
						|
  the structure (defined in fts3_tokenizer.h) is called
 | 
						|
  "sqlite3_tokenizer_module".
 | 
						|
 | 
						|
  FTS3 does not expose a C-function that users call to register new
 | 
						|
  tokenizer types with a database handle. Instead, the pointer must
 | 
						|
  be encoded as an SQL blob value and passed to FTS3 through the SQL
 | 
						|
  engine by evaluating a special scalar function, "fts3_tokenizer()".
 | 
						|
  The fts3_tokenizer() function may be called with one or two arguments,
 | 
						|
  as follows:
 | 
						|
 | 
						|
    SELECT fts3_tokenizer(<tokenizer-name>);
 | 
						|
    SELECT fts3_tokenizer(<tokenizer-name>, <sqlite3_tokenizer_module ptr>);
 | 
						|
  
 | 
						|
  Where <tokenizer-name> is a string identifying the tokenizer and
 | 
						|
  <sqlite3_tokenizer_module ptr> is a pointer to an sqlite3_tokenizer_module
 | 
						|
  structure encoded as an SQL blob. If the second argument is present,
 | 
						|
  it is registered as tokenizer <tokenizer-name> and a copy of it
 | 
						|
  is returned. If only one argument is passed, a pointer to the tokenizer
 | 
						|
  implementation currently registered as <tokenizer-name> is returned,
 | 
						|
  encoded as a blob. Or, if no such tokenizer exists, an SQL exception
 | 
						|
  (error) is raised.
 | 
						|
 | 
						|
  SECURITY: If the fts3 extension is used in an environment where potentially
 | 
						|
    malicious users may execute arbitrary SQL (e.g. Gears), they should be
 | 
						|
    prevented from invoking the fts3_tokenizer() function.  The
 | 
						|
    fts3_tokenizer() function is disabled by default. It is only enabled
 | 
						|
    by SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER. Do not enable it in
 | 
						|
    security sensitive environments.
 | 
						|
 | 
						|
  See "Sample code" below for an example of calling the fts3_tokenizer()
 | 
						|
  function from C code.
 | 
						|
 | 
						|
3. ICU Library Tokenizers
 | 
						|
 | 
						|
  If this extension is compiled with the SQLITE_ENABLE_ICU pre-processor 
 | 
						|
  symbol defined, then there exists a built-in tokenizer named "icu" 
 | 
						|
  implemented using the ICU library. The first argument passed to the
 | 
						|
  xCreate() method (see fts3_tokenizer.h) of this tokenizer may be
 | 
						|
  an ICU locale identifier. For example "tr_TR" for Turkish as used
 | 
						|
  in Turkey, or "en_AU" for English as used in Australia. For example:
 | 
						|
 | 
						|
    "CREATE VIRTUAL TABLE thai_text USING fts3(text, tokenize icu th_TH)"
 | 
						|
 | 
						|
  The ICU tokenizer implementation is very simple. It splits the input
 | 
						|
  text according to the ICU rules for finding word boundaries and discards
 | 
						|
  any tokens that consist entirely of white-space. This may be suitable
 | 
						|
  for some applications in some locales, but not all. If more complex
 | 
						|
  processing is required, for example to implement stemming or 
 | 
						|
  discard punctuation, this can be done by creating a tokenizer 
 | 
						|
  implementation that uses the ICU tokenizer as part of its implementation.
 | 
						|
 | 
						|
  When using the ICU tokenizer this way, it is safe to overwrite the
 | 
						|
  contents of the strings returned by the xNext() method (see
 | 
						|
  fts3_tokenizer.h).
 | 
						|
 | 
						|
4. Sample code
 | 
						|
 | 
						|
  The following two code samples illustrate the way C code should invoke
 | 
						|
  the fts3_tokenizer() scalar function:
 | 
						|
 | 
						|
      int registerTokenizer(
 | 
						|
        sqlite3 *db, 
 | 
						|
        char *zName, 
 | 
						|
        const sqlite3_tokenizer_module *p
 | 
						|
      ){
 | 
						|
        int rc;
 | 
						|
        sqlite3_stmt *pStmt;
 | 
						|
        const char zSql[] = "SELECT fts3_tokenizer(?, ?)";
 | 
						|
      
 | 
						|
        rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
 | 
						|
        if( rc!=SQLITE_OK ){
 | 
						|
          return rc;
 | 
						|
        }
 | 
						|
      
 | 
						|
        sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
 | 
						|
        sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
 | 
						|
        sqlite3_step(pStmt);
 | 
						|
      
 | 
						|
        return sqlite3_finalize(pStmt);
 | 
						|
      }
 | 
						|
      
 | 
						|
      int queryTokenizer(
 | 
						|
        sqlite3 *db, 
 | 
						|
        char *zName,  
 | 
						|
        const sqlite3_tokenizer_module **pp
 | 
						|
      ){
 | 
						|
        int rc;
 | 
						|
        sqlite3_stmt *pStmt;
 | 
						|
        const char zSql[] = "SELECT fts3_tokenizer(?)";
 | 
						|
      
 | 
						|
        *pp = 0;
 | 
						|
        rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
 | 
						|
        if( rc!=SQLITE_OK ){
 | 
						|
          return rc;
 | 
						|
        }
 | 
						|
      
 | 
						|
        sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
 | 
						|
        if( SQLITE_ROW==sqlite3_step(pStmt) ){
 | 
						|
          if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){
 | 
						|
            memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp));
 | 
						|
          }
 | 
						|
        }
 | 
						|
      
 | 
						|
        return sqlite3_finalize(pStmt);
 | 
						|
      }
 |