Commit 5f5cc579 authored by paul@teton.kitebird.com's avatar paul@teton.kitebird.com

Merge paul@work.mysql.com:/home/bk/mysql-4.0

into teton.kitebird.com:/home/paul/mysql-4.0
parents f917c8e0 e1a6d0e1
......@@ -36191,6 +36191,9 @@ others, but will not be excluded altogether, as it would be with the
@item *
An asterisk is the truncation operator. Unlike the other operators, it
should be @strong{appended} to the word, not prepended.
@item "
The phrase, that is enclosed in double quotes @code{"}, matches only
rows that contain this phrase @strong{literally, as it was typed}.
@end table
And here are some examples:
......@@ -36199,16 +36202,18 @@ And here are some examples:
@item apple banana
find rows that contain at least one of these words.
@item +apple +juice
... both words
... both words.
@item +apple macintosh
... word ``apple'', but rank it higher if it also contain ``macintosh''
... word ``apple'', but rank it higher if it also contain ``macintosh''.
@item +apple -macintosh
... word ``apple'' but not ``macintosh''
... word ``apple'' but not ``macintosh''.
@item +apple +(>pie <strudel)
... ``apple'' and ``pie'', or ``apple'' and ``strudel'' (in any
order), but rank ``apple pie'' higher than ``apple strudel''.
@item apple*
... ``apple'', ``apples'', ``applesauce'', and ``applet''
... ``apple'', ``apples'', ``applesauce'', and ``applet''.
@item "some words"
... ``some words of wisdom'', but not ``some noise words''.
@end table
@menu
......@@ -48928,6 +48933,8 @@ Our TODO section contains what we plan to have in 4.0. @xref{TODO MySQL 4.0}.
@itemize @bullet
@item
Boolean fulltext search now supports "phrase searches".
@item
New configure option @code{--without-query-cache}.
@item
Memory allocation strategy for 'root memory' changed. Block size now grows
......@@ -59,6 +59,7 @@ static double *nwghts=_nwghts+5; /* nwghts[i] = -0.5*1.5**i */
typedef struct st_ftb_expr FTB_EXPR;
struct st_ftb_expr {
FTB_EXPR *up;
byte *quot, *qend;
float weight;
uint flags;
my_off_t docid[2]; /* for index search and for scan */
......@@ -84,6 +85,7 @@ typedef struct st_ft_info {
struct _ft_vft *please;
MI_INFO *info;
uint keynr;
CHARSET_INFO *charset;
enum { UNINITIALIZED, READY, INDEX_SEARCH, INDEX_DONE /*, SCAN*/ } state;
uint with_scan;
FTB_EXPR *root;
......@@ -101,18 +103,18 @@ int FTB_WORD_cmp(void *v __attribute__((unused)), FTB_WORD *a, FTB_WORD *b)
return i;
}
int FTB_WORD_cmp_list(void *v __attribute__((unused)), FTB_WORD **a, FTB_WORD **b)
int FTB_WORD_cmp_list(CHARSET_INFO *cs, FTB_WORD **a, FTB_WORD **b)
{
/* ORDER BY word DESC, ndepth DESC */
int i=_mi_compare_text(default_charset_info, (*b)->word+1,(*b)->len-1,
(*a)->word+1,(*a)->len-1,0);
int i=_mi_compare_text(cs, (*b)->word+1,(*b)->len-1,
(*a)->word+1,(*a)->len-1,0);
if (!i)
i=CMP_NUM((*b)->ndepth,(*a)->ndepth);
return i;
}
void _ftb_parse_query(FTB *ftb, byte **start, byte *end,
FTB_EXPR *up, uint depth)
FTB_EXPR *up, uint depth)
{
byte res;
FTB_PARAM param;
......@@ -125,16 +127,17 @@ void _ftb_parse_query(FTB *ftb, byte **start, byte *end,
return;
param.prev=' ';
param.quot=up->quot;
while ((res=ft_get_word(start,end,&w,&param)))
{
int r=param.plusminus;
int r=param.plusminus;
float weight= (float) (param.pmsign ? nwghts : wghts)[(r>5)?5:((r<-5)?-5:r)];
switch (res) {
case 1: /* word found */
ftbw=(FTB_WORD *)alloc_root(&ftb->mem_root,
sizeof(FTB_WORD) +
(param.trunc ? MI_MAX_KEY_BUFF :
w.len+extra));
sizeof(FTB_WORD) +
(param.trunc ? MI_MAX_KEY_BUFF :
w.len+extra));
ftbw->len=w.len+1;
ftbw->flags=0;
if (param.yesno>0) ftbw->flags|=FTB_FLAG_YES;
......@@ -148,7 +151,7 @@ void _ftb_parse_query(FTB *ftb, byte **start, byte *end,
ftbw->word[0]=w.len;
if (param.yesno > 0) up->ythresh++;
queue_insert(& ftb->queue, (byte *)ftbw);
ftb->with_scan|=param.trunc;
ftb->with_scan|=(param.trunc & FTB_FLAG_TRUNC);
break;
case 2: /* left bracket */
ftbe=(FTB_EXPR *)alloc_root(&ftb->mem_root, sizeof(FTB_EXPR));
......@@ -159,10 +162,12 @@ void _ftb_parse_query(FTB *ftb, byte **start, byte *end,
ftbe->up=up;
ftbe->ythresh=ftbe->yweaks=0;
ftbe->docid[0]=ftbe->docid[1]=HA_POS_ERROR;
if ((ftbe->quot=param.quot)) ftb->with_scan|=2;
if (param.yesno > 0) up->ythresh++;
_ftb_parse_query(ftb, start, end, ftbe, depth+1);
break;
case 3: /* right bracket */
if (up->quot) up->qend=param.quot;
return;
}
}
......@@ -203,12 +208,12 @@ void _ftb_init_index_search(FT_INFO *ftb)
SEARCH_FIND | SEARCH_BIGGER, keyroot);
if (!r)
{
r=_mi_compare_text(default_charset_info,
r=_mi_compare_text(ftb->charset,
info->lastkey + (ftbw->flags&FTB_FLAG_TRUNC),
ftbw->len - (ftbw->flags&FTB_FLAG_TRUNC),
ftbw->word + (ftbw->flags&FTB_FLAG_TRUNC),
ftbw->len - (ftbw->flags&FTB_FLAG_TRUNC),
0);
0);
}
if (r) /* not found */
{
......@@ -241,6 +246,9 @@ FT_INFO * ft_init_boolean_search(MI_INFO *info, uint keynr, byte *query,
ftb->state=UNINITIALIZED;
ftb->info=info;
ftb->keynr=keynr;
ftb->charset= ((keynr==NO_SUCH_KEY) ?
default_charset_info :
info->s->keyinfo[keynr].seg->charset);
ftb->with_scan=0;
init_alloc_root(&ftb->mem_root, 1024, 1024);
......@@ -256,26 +264,49 @@ FT_INFO * ft_init_boolean_search(MI_INFO *info, uint keynr, byte *query,
ftbe->weight=1;
ftbe->flags=FTB_FLAG_YES;
ftbe->nos=1;
ftbe->up=0;
ftbe->quot=ftbe->up=0;
ftbe->ythresh=ftbe->yweaks=0;
ftbe->docid[0]=ftbe->docid[1]=HA_POS_ERROR;
ftb->root=ftbe;
_ftb_parse_query(ftb, &query, query+query_len, ftbe, 0);
ftb->list=(FTB_WORD **)alloc_root(&ftb->mem_root,
sizeof(FTB_WORD *)*ftb->queue.elements);
memcpy(ftb->list, ftb->queue.root, sizeof(FTB_WORD *)*ftb->queue.elements);
memcpy(ftb->list, ftb->queue.root+1, sizeof(FTB_WORD *)*ftb->queue.elements);
qsort2(ftb->list, ftb->queue.elements, sizeof(FTB_WORD *),
(qsort2_cmp)FTB_WORD_cmp_list, 0);
if (ftb->queue.elements<2) ftb->with_scan=0;
(qsort2_cmp)FTB_WORD_cmp_list, ftb->charset);
if (ftb->queue.elements<2) ftb->with_scan &= ~FTB_FLAG_TRUNC;
ftb->state=READY;
return ftb;
}
void _ftb_climb_the_tree(FTB_WORD *ftbw, uint mode)
/* returns 1 if str0 contain str1 */
int _ftb_strstr(const byte *s0, const byte *e0,
const byte *s1, const byte *e1,
CHARSET_INFO *cs)
{
const byte *p;
while (s0 < e0)
{
while (s0 < e0 && cs->to_upper[*s0++] != cs->to_upper[*s1])
/* no-op */;
if (s0 >= e0)
return 0;
p=s1+1;
while (s0 < e0 && p < e1 && cs->to_upper[*s0++] == cs->to_upper[*p++])
/* no-op */;
if (p >= e1)
return 1;
}
return 0;
}
void _ftb_climb_the_tree(FTB *ftb, FTB_WORD *ftbw, FT_SEG_ITERATOR *ftsi_orig)
{
FT_SEG_ITERATOR ftsi;
FTB_EXPR *ftbe;
float weight=ftbw->weight;
int yn=ftbw->flags, ythresh;
int yn=ftbw->flags, ythresh, mode=(ftsi_orig != 0);
my_off_t curdoc=ftbw->docid[mode];
for (ftbe=ftbw->up; ftbe; ftbe=ftbe->up)
......@@ -296,6 +327,20 @@ void _ftb_climb_the_tree(FTB_WORD *ftbw, uint mode)
{
yn=ftbe->flags;
weight=ftbe->cur_weight*ftbe->weight;
if (mode && ftbe->quot)
{
int not_found=1;
memcpy(&ftsi, ftsi_orig, sizeof(ftsi));
while (_mi_ft_segiterator(&ftsi) && not_found)
{
if (!ftsi.pos)
continue;
not_found = ! _ftb_strstr(ftsi.pos, ftsi.pos+ftsi.len,
ftbe->quot, ftbe->qend, ftb->charset);
}
if (not_found) break;
} /* ftbe->quot */
}
else
break;
......@@ -352,18 +397,18 @@ int ft_boolean_read_next(FT_INFO *ftb, char *record)
{
while (curdoc==(ftbw=(FTB_WORD *)queue_top(& ftb->queue))->docid[0])
{
_ftb_climb_the_tree(ftbw,0);
_ftb_climb_the_tree(ftb, ftbw, 0);
/* update queue */
r=_mi_search(info, keyinfo, (uchar*) ftbw->word, USE_WHOLE_KEY,
SEARCH_BIGGER , keyroot);
if (!r)
{
r=_mi_compare_text(default_charset_info,
r=_mi_compare_text(ftb->charset,
info->lastkey + (ftbw->flags&FTB_FLAG_TRUNC),
ftbw->len - (ftbw->flags&FTB_FLAG_TRUNC),
ftbw->word + (ftbw->flags&FTB_FLAG_TRUNC),
ftbw->len - (ftbw->flags&FTB_FLAG_TRUNC),
ftbw->len - (ftbw->flags&FTB_FLAG_TRUNC),
0);
}
if (r) /* not found */
......@@ -410,7 +455,7 @@ float ft_boolean_find_relevance(FT_INFO *ftb, byte *record, uint length)
FT_WORD word;
FTB_WORD *ftbw;
FTB_EXPR *ftbe;
FT_SEG_ITERATOR ftsi;
FT_SEG_ITERATOR ftsi, ftsi2;
const byte *end;
my_off_t docid=ftb->info->lastpos;
......@@ -419,17 +464,11 @@ float ft_boolean_find_relevance(FT_INFO *ftb, byte *record, uint length)
if (!ftb->queue.elements)
return 0;
#if NOT_USED
if (ftb->state == READY || ftb->state == INDEX_DONE)
ftb->state=SCAN;
else if (ftb->state != SCAN)
return -3.0;
#endif
if (ftb->keynr==NO_SUCH_KEY)
_mi_ft_segiterator_dummy_init(record, length, &ftsi);
else
_mi_ft_segiterator_init(ftb->info, ftb->keynr, record, &ftsi);
memcpy(&ftsi2, &ftsi, sizeof(ftsi));
while (_mi_ft_segiterator(&ftsi))
{
......@@ -443,7 +482,7 @@ float ft_boolean_find_relevance(FT_INFO *ftb, byte *record, uint length)
for (a=0, b=ftb->queue.elements, c=(a+b)/2; b-a>1; c=(a+b)/2)
{
ftbw=(FTB_WORD *)(ftb->list[c]);
if (_mi_compare_text(default_charset_info, word.pos,word.len,
if (_mi_compare_text(ftb->charset, word.pos,word.len,
(uchar*) ftbw->word+1,ftbw->len-1,
(ftbw->flags&FTB_FLAG_TRUNC) ) >0)
b=c;
......@@ -453,14 +492,14 @@ float ft_boolean_find_relevance(FT_INFO *ftb, byte *record, uint length)
for (; c>=0; c--)
{
ftbw=(FTB_WORD *)(ftb->list[c]);
if (_mi_compare_text(default_charset_info, word.pos,word.len,
if (_mi_compare_text(ftb->charset, word.pos,word.len,
(uchar*) ftbw->word+1,ftbw->len-1,
(ftbw->flags&FTB_FLAG_TRUNC) ))
break;
if (ftbw->docid[1] == docid)
continue;
ftbw->docid[1]=docid;
_ftb_climb_the_tree(ftbw,1);
_ftb_climb_the_tree(ftb, ftbw, &ftsi2);
}
}
}
......
......@@ -37,6 +37,7 @@ struct st_ft_info {
typedef struct st_all_in_one {
MI_INFO *info;
uint keynr;
CHARSET_INFO *charset;
uchar *keybuff;
MI_KEYDEF *keyinfo;
my_off_t key_root;
......@@ -93,7 +94,7 @@ static int walk_and_match(FT_WORD *word, uint32 count, ALL_IN_ONE *aio)
while(!r)
{
if (_mi_compare_text(default_charset_info,
if (_mi_compare_text(aio->charset,
aio->info->lastkey,keylen,
aio->keybuff,keylen,0)) break;
......@@ -184,8 +185,9 @@ FT_INFO *ft_init_nlq_search(MI_INFO *info, uint keynr, byte *query,
aio.info=info;
aio.keynr=keynr;
aio.keybuff=info->lastkey+info->s->base.max_key_length;
aio.keyinfo=info->s->keyinfo+keynr;
aio.charset=aio.keyinfo->seg->charset;
aio.keybuff=info->lastkey+info->s->base.max_key_length;
aio.key_root=info->s->state.key_root[keynr];
bzero(&allocated_wtree,sizeof(allocated_wtree));
......@@ -193,6 +195,7 @@ FT_INFO *ft_init_nlq_search(MI_INFO *info, uint keynr, byte *query,
init_tree(&aio.dtree,0,0,sizeof(FT_SUPERDOC),(qsort_cmp2)&FT_SUPERDOC_cmp,0,
NULL, NULL);
ft_parse_init(&allocated_wtree, aio.charset);
if(ft_parse(&allocated_wtree,query,query_len))
goto err;
......
......@@ -35,12 +35,10 @@ typedef struct st_ft_docstat {
} FT_DOCSTAT;
static int FT_WORD_cmp(void* cmp_arg, FT_WORD *w1, FT_WORD *w2)
static int FT_WORD_cmp(CHARSET_INFO* cs, FT_WORD *w1, FT_WORD *w2)
{
return _mi_compare_text(default_charset_info,
(uchar*) w1->pos, w1->len,
(uchar*) w2->pos, w2->len,
(my_bool) (cmp_arg != 0));
return _mi_compare_text(cs, (uchar*) w1->pos, w1->len,
(uchar*) w2->pos, w2->len, 0);
}
static int walk_and_copy(FT_WORD *word,uint32 count,FT_DOCSTAT *docstat)
......@@ -135,13 +133,20 @@ byte ft_get_word(byte **start, byte *end, FT_WORD *word, FTB_PARAM *param)
for (;doc<end;doc++)
{
if (true_word_char(*doc)) break;
if (*doc == FTB_LBR || *doc == FTB_RBR)
if (*doc == FTB_RQUOT && param->quot) {
param->quot=doc-1;
*start=doc+1;
return 3; /* FTB_RBR */
}
if ((*doc == FTB_LBR || *doc == FTB_RBR || *doc == FTB_LQUOT)
&& !param->quot)
{
/* param->prev=' '; */
*start=doc+1;
if (*doc == FTB_LQUOT) param->quot=*start;
return (*doc == FTB_RBR)+2;
}
if (param->prev == ' ')
if (param->prev == ' ' && !param->quot)
{
if (*doc == FTB_YES ) { param->yesno=+1; continue; } else
if (*doc == FTB_EGAL) { param->yesno= 0; continue; } else
......@@ -151,7 +156,8 @@ byte ft_get_word(byte **start, byte *end, FT_WORD *word, FTB_PARAM *param)
if (*doc == FTB_NEG ) { param->pmsign=!param->pmsign; continue; }
}
param->prev=*doc;
param->yesno=param->plusminus=param->pmsign=0;
param->yesno=(param->quot != 0);
param->plusminus=param->pmsign=0;
}
mwc=0;
......@@ -207,16 +213,17 @@ byte ft_simple_get_word(byte **start, byte *end, FT_WORD *word)
return 0;
}
void ft_parse_init(TREE *wtree, CHARSET_INFO *cs)
{
if (!is_tree_inited(wtree))
init_tree(wtree,0,0,sizeof(FT_WORD),(qsort_cmp2)&FT_WORD_cmp,0,NULL, cs);
}
int ft_parse(TREE *wtree, byte *doc, int doclen)
{
byte *end=doc+doclen;
FT_WORD w;
if (!is_tree_inited(wtree))
{
init_tree(wtree,0,0,sizeof(FT_WORD),(qsort_cmp2)&FT_WORD_cmp,0,NULL, NULL);
}
while (ft_simple_get_word(&doc,end,&w))
{
if (!tree_insert(wtree, &w, 0))
......
......@@ -90,16 +90,13 @@ uint _mi_ft_parse(TREE *parsed, MI_INFO *info, uint keynr, const byte *record)
FT_SEG_ITERATOR ftsi;
_mi_ft_segiterator_init(info, keynr, record, &ftsi);
ft_parse_init(parsed, info->s->keyinfo[keynr].seg->charset);
while (_mi_ft_segiterator(&ftsi))
if (ftsi.pos)
if (ft_parse(parsed, (byte *)ftsi.pos, ftsi.len))
return 1;
/* Handle the case where all columns are NULL */
if (!is_tree_inited(parsed) && ft_parse(parsed, (byte*) "", 0))
return 1;
else
return 0;
return 0;
}
FT_WORD * _mi_ft_parserecord(MI_INFO *info, uint keynr,
......@@ -153,6 +150,7 @@ static int _mi_ft_erase(MI_INFO *info, uint keynr, byte *keybuf, FT_WORD *wlist,
int _mi_ft_cmp(MI_INFO *info, uint keynr, const byte *rec1, const byte *rec2)
{
FT_SEG_ITERATOR ftsi1, ftsi2;
CHARSET_INFO *cs=info->s->keyinfo[keynr].seg->charset;
_mi_ft_segiterator_init(info, keynr, rec1, &ftsi1);
_mi_ft_segiterator_init(info, keynr, rec2, &ftsi2);
......@@ -160,9 +158,8 @@ int _mi_ft_cmp(MI_INFO *info, uint keynr, const byte *rec1, const byte *rec2)
{
if ((ftsi1.pos != ftsi2.pos) &&
(!ftsi1.pos || !ftsi2.pos ||
_mi_compare_text(default_charset_info,
(uchar*) ftsi1.pos,ftsi1.len,
(uchar*) ftsi2.pos,ftsi2.len,0)))
_mi_compare_text(cs, (uchar*) ftsi1.pos,ftsi1.len,
(uchar*) ftsi2.pos,ftsi2.len,0)))
return THOSE_TWO_DAMN_KEYS_ARE_REALLY_DIFFERENT;
}
return GEE_THEY_ARE_ABSOLUTELY_IDENTICAL;
......@@ -174,6 +171,7 @@ int _mi_ft_update(MI_INFO *info, uint keynr, byte *keybuf,
{
int error= -1;
FT_WORD *oldlist,*newlist, *old_word, *new_word;
CHARSET_INFO *cs=info->s->keyinfo[keynr].seg->charset;
uint key_length;
int cmp, cmp2;
......@@ -185,9 +183,8 @@ int _mi_ft_update(MI_INFO *info, uint keynr, byte *keybuf,
error=0;
while(old_word->pos && new_word->pos)
{
cmp=_mi_compare_text(default_charset_info,
(uchar*) old_word->pos,old_word->len,
(uchar*) new_word->pos,new_word->len,0);
cmp=_mi_compare_text(cs, (uchar*) old_word->pos,old_word->len,
(uchar*) new_word->pos,new_word->len,0);
cmp2= cmp ? 0 : (fabs(old_word->weight - new_word->weight) > 1.e-5);
if (cmp < 0 || cmp2)
......
......@@ -95,6 +95,8 @@ extern ulong collstat;
#define FTB_RBR (ft_boolean_syntax[6])
#define FTB_NEG (ft_boolean_syntax[7])
#define FTB_TRUNC (ft_boolean_syntax[8])
#define FTB_LQUOT (ft_boolean_syntax[10])
#define FTB_RQUOT (ft_boolean_syntax[11])
typedef struct st_ft_word {
byte * pos;
......@@ -111,6 +113,7 @@ typedef struct st_ftb_param {
int plusminus;
bool pmsign;
bool trunc;
byte *quot;
} FTB_PARAM;
int is_stopword(char *word, uint len);
......@@ -130,8 +133,9 @@ void _mi_ft_segiterator_init(MI_INFO *, uint, const byte *, FT_SEG_ITERATOR *);
void _mi_ft_segiterator_dummy_init(const byte *, uint, FT_SEG_ITERATOR *);
uint _mi_ft_segiterator(FT_SEG_ITERATOR *);
void ft_parse_init(TREE *, CHARSET_INFO *);
int ft_parse(TREE *, byte *, int);
FT_WORD * ft_linearize(/*MI_INFO *, uint, byte *, */TREE *);
FT_WORD * ft_linearize(TREE *);
FT_WORD * _mi_ft_parserecord(MI_INFO *, uint, byte *, const byte *);
uint _mi_ft_parse(TREE *parsed, MI_INFO *info, uint keynr, const byte *record);
......
......@@ -67,6 +67,9 @@ Full-text indexes are called collections 1
Only MyISAM tables support collections 2
Function MATCH ... AGAINST() is used to do a search 0
Full-text search in MySQL implements vector space model 0
select * from t1 where MATCH a,b AGAINST ('"Now sUPPort"' IN BOOLEAN MODE);
a b
MySQL has now support for full-text search
select * from t1 where MATCH a AGAINST ("search" IN BOOLEAN MODE);
a b
Full-text search in MySQL implements vector space model
......
......@@ -20,7 +20,6 @@ select * from t1 where MATCH(a,b) AGAINST ("indexes collections");
# UNION of fulltext's
select * from t1 where MATCH(a,b) AGAINST ("collections") UNION ALL select * from t1 where MATCH(a,b) AGAINST ("indexes");
# boolean search
select * from t1 where MATCH(a,b) AGAINST("support -collections" IN BOOLEAN MODE);
......@@ -34,6 +33,8 @@ select * from t1 where MATCH(a,b) AGAINST("+search -(support vector)" IN BOOLEAN
select *, MATCH(a,b) AGAINST("support collections" IN BOOLEAN MODE) as x from t1;
select *, MATCH(a,b) AGAINST("collections support" IN BOOLEAN MODE) as x from t1;
select * from t1 where MATCH a,b AGAINST ('"Now sUPPort"' IN BOOLEAN MODE);
# boolean w/o index:
select * from t1 where MATCH a AGAINST ("search" IN BOOLEAN MODE);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment