| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| #include "fts3Int.h" |
| #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) |
| #ifdef SQLITE_ENABLE_ICU |
|
|
| #include <assert.h> |
| #include <string.h> |
| #include "fts3_tokenizer.h" |
|
|
| #include <unicode/ubrk.h> |
| #include <unicode/ucol.h> |
| #include <unicode/ustring.h> |
| #include <unicode/utf16.h> |
|
|
/*
** Forward declarations of the two object types implemented in this file:
** the tokenizer itself and a cursor that iterates over one input string.
*/
typedef struct IcuTokenizer IcuTokenizer;
typedef struct IcuCursor IcuCursor;
|
|
| struct IcuTokenizer { |
| sqlite3_tokenizer base; |
| char *zLocale; |
| }; |
|
|
| struct IcuCursor { |
| sqlite3_tokenizer_cursor base; |
|
|
| UBreakIterator *pIter; |
| int nChar; |
| UChar *aChar; |
| int *aOffset; |
|
|
| int nBuffer; |
| char *zBuffer; |
|
|
| int iToken; |
| }; |
|
|
| |
| |
| |
| static int icuCreate( |
| int argc, |
| const char * const *argv, |
| sqlite3_tokenizer **ppTokenizer |
| ){ |
| IcuTokenizer *p; |
| int n = 0; |
|
|
| if( argc>0 ){ |
| n = strlen(argv[0])+1; |
| } |
| p = (IcuTokenizer *)sqlite3_malloc64(sizeof(IcuTokenizer)+n); |
| if( !p ){ |
| return SQLITE_NOMEM; |
| } |
| memset(p, 0, sizeof(IcuTokenizer)); |
|
|
| if( n ){ |
| p->zLocale = (char *)&p[1]; |
| memcpy(p->zLocale, argv[0], n); |
| } |
|
|
| *ppTokenizer = (sqlite3_tokenizer *)p; |
|
|
| return SQLITE_OK; |
| } |
|
|
| |
| |
| |
| static int icuDestroy(sqlite3_tokenizer *pTokenizer){ |
| IcuTokenizer *p = (IcuTokenizer *)pTokenizer; |
| sqlite3_free(p); |
| return SQLITE_OK; |
| } |
|
|
| |
| |
| |
| |
| |
| |
| static int icuOpen( |
| sqlite3_tokenizer *pTokenizer, |
| const char *zInput, |
| int nInput, |
| sqlite3_tokenizer_cursor **ppCursor |
| ){ |
| IcuTokenizer *p = (IcuTokenizer *)pTokenizer; |
| IcuCursor *pCsr; |
|
|
| const int32_t opt = U_FOLD_CASE_DEFAULT; |
| UErrorCode status = U_ZERO_ERROR; |
| int nChar; |
|
|
| UChar32 c; |
| int iInput = 0; |
| int iOut = 0; |
|
|
| *ppCursor = 0; |
|
|
| if( zInput==0 ){ |
| nInput = 0; |
| zInput = ""; |
| }else if( nInput<0 ){ |
| nInput = strlen(zInput); |
| } |
| nChar = nInput+1; |
| pCsr = (IcuCursor *)sqlite3_malloc64( |
| sizeof(IcuCursor) + |
| ((nChar+3)&~3) * sizeof(UChar) + |
| (nChar+1) * sizeof(int) |
| ); |
| if( !pCsr ){ |
| return SQLITE_NOMEM; |
| } |
| memset(pCsr, 0, sizeof(IcuCursor)); |
| pCsr->aChar = (UChar *)&pCsr[1]; |
| pCsr->aOffset = (int *)&pCsr->aChar[(nChar+3)&~3]; |
|
|
| pCsr->aOffset[iOut] = iInput; |
| U8_NEXT(zInput, iInput, nInput, c); |
| while( c>0 ){ |
| int isError = 0; |
| c = u_foldCase(c, opt); |
| U16_APPEND(pCsr->aChar, iOut, nChar, c, isError); |
| if( isError ){ |
| sqlite3_free(pCsr); |
| return SQLITE_ERROR; |
| } |
| pCsr->aOffset[iOut] = iInput; |
|
|
| if( iInput<nInput ){ |
| U8_NEXT(zInput, iInput, nInput, c); |
| }else{ |
| c = 0; |
| } |
| } |
|
|
| pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status); |
| if( !U_SUCCESS(status) ){ |
| sqlite3_free(pCsr); |
| return SQLITE_ERROR; |
| } |
| pCsr->nChar = iOut; |
|
|
| ubrk_first(pCsr->pIter); |
| *ppCursor = (sqlite3_tokenizer_cursor *)pCsr; |
| return SQLITE_OK; |
| } |
|
|
| |
| |
| |
| static int icuClose(sqlite3_tokenizer_cursor *pCursor){ |
| IcuCursor *pCsr = (IcuCursor *)pCursor; |
| ubrk_close(pCsr->pIter); |
| sqlite3_free(pCsr->zBuffer); |
| sqlite3_free(pCsr); |
| return SQLITE_OK; |
| } |
|
|
| |
| |
| |
| static int icuNext( |
| sqlite3_tokenizer_cursor *pCursor, |
| const char **ppToken, |
| int *pnBytes, |
| int *piStartOffset, |
| int *piEndOffset, |
| int *piPosition |
| ){ |
| IcuCursor *pCsr = (IcuCursor *)pCursor; |
|
|
| int iStart = 0; |
| int iEnd = 0; |
| int nByte = 0; |
|
|
| while( iStart==iEnd ){ |
| UChar32 c; |
|
|
| iStart = ubrk_current(pCsr->pIter); |
| iEnd = ubrk_next(pCsr->pIter); |
| if( iEnd==UBRK_DONE ){ |
| return SQLITE_DONE; |
| } |
|
|
| while( iStart<iEnd ){ |
| int iWhite = iStart; |
| U16_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c); |
| if( u_isspace(c) ){ |
| iStart = iWhite; |
| }else{ |
| break; |
| } |
| } |
| assert(iStart<=iEnd); |
| } |
|
|
| do { |
| UErrorCode status = U_ZERO_ERROR; |
| if( nByte ){ |
| char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte); |
| if( !zNew ){ |
| return SQLITE_NOMEM; |
| } |
| pCsr->zBuffer = zNew; |
| pCsr->nBuffer = nByte; |
| } |
|
|
| u_strToUTF8( |
| pCsr->zBuffer, pCsr->nBuffer, &nByte, |
| &pCsr->aChar[iStart], iEnd-iStart, |
| &status |
| ); |
| } while( nByte>pCsr->nBuffer ); |
|
|
| *ppToken = pCsr->zBuffer; |
| *pnBytes = nByte; |
| *piStartOffset = pCsr->aOffset[iStart]; |
| *piEndOffset = pCsr->aOffset[iEnd]; |
| *piPosition = pCsr->iToken++; |
|
|
| return SQLITE_OK; |
| } |
|
|
| |
| |
| |
| static const sqlite3_tokenizer_module icuTokenizerModule = { |
| 0, |
| icuCreate, |
| icuDestroy, |
| icuOpen, |
| icuClose, |
| icuNext, |
| 0, |
| }; |
|
|
| |
| |
| |
| void sqlite3Fts3IcuTokenizerModule( |
| sqlite3_tokenizer_module const**ppModule |
| ){ |
| *ppModule = &icuTokenizerModule; |
| } |
|
|
| #endif |
| #endif |
|
|