gen_lex_hash.cc 12.7 KB
Newer Older
bk@work.mysql.com's avatar
bk@work.mysql.com committed
1
/* Copyright (C) 2000 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
2

bk@work.mysql.com's avatar
bk@work.mysql.com committed
3 4 5 6
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.
7

bk@work.mysql.com's avatar
bk@work.mysql.com committed
8 9 10 11
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
12

bk@work.mysql.com's avatar
bk@work.mysql.com committed
13 14 15 16
   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */

17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73
/*

The idea of presented algorithm see in 
"The Art of Computer Programming" by Donald E. Knuth
Volume 3 "Sorting and searching"
(chapter 6.3 "Digital searching" - name and number of chapter 
   is back translation from Russian edition :))

as illustration of data structures, imagine next table:

static SYMBOL symbols[] = {
  { "ADD",              SYM(ADD),0,0},
  { "AND",              SYM(AND),0,0},
  { "DAY",              SYM(DAY_SYM),0,0},
};

for this structure, presented program generate next searching-structure:

+-----------+-+-+-+
|       len |1|2|3|
+-----------+-+-+-+
|first_char |0|0|a|
|last_char  |0|0|d|
|link       |0|0|+|
                 |
                 V
       +----------+-+-+-+--+
       |    1 char|a|b|c|d |
       +----------+-+-+-+--+
       |first_char|b|0|0|0 |
       |last_char |n|0|0|-1|
       |link      |+|0|0|+ |
                   |     |
                   |     V
                   |  symbols[2] ( "DAY" )
                   V
+----------+--+-+-+-+-+-+-+-+-+-+--+
|    2 char|d |e|f|j|h|i|j|k|l|m|n |
+----------+--+-+-+-+-+-+-+-+-+-+--+
|first_char|0 |0|0|0|0|0|0|0|0|0|0 |
|last_char |-1|0|0|0|0|0|0|0|0|0|-1|
|link      |+ |0|0|0|0|0|0|0|0|0|+ |
            |                    |
            V                    V
         symbols[0] ( "ADD" )  symbols[1] ( "AND" )

for optimization, link is the 16-bit index in 'symbols' or 'sql_functions'
or search-array..

So, we can read full search-structure as 32-bit word

TODO:
1. use instead to_upper_lex, special array 
   (substitute chars) without skip codes..
2. try use reverse order of comparing..
       
*/
bk@work.mysql.com's avatar
bk@work.mysql.com committed
74 75

#define NO_YACC_SYMBOLS
76 77 78
#include "my_global.h"
#include "my_sys.h"
#include "m_string.h"
bk@work.mysql.com's avatar
bk@work.mysql.com committed
79
#ifndef __GNU_LIBRARY__
monty@hundin.mysql.fi's avatar
monty@hundin.mysql.fi committed
80
#define __GNU_LIBRARY__				// Skip warnings in getopt.h
bk@work.mysql.com's avatar
bk@work.mysql.com committed
81
#endif
82
#include <my_getopt.h>
bk@work.mysql.com's avatar
bk@work.mysql.com committed
83 84 85
#include "mysql_version.h"
#include "lex.h"

86 87 88 89 90 91 92
static struct my_option my_long_options[] =
{
  {"help", '?', "Display help and exit",
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
  {"version", 'V', "Output version information and exit",
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
};
bk@work.mysql.com's avatar
bk@work.mysql.com committed
93

94 95 96 97 98 99 100 101 102
struct hash_lex_struct
{
  char first_char;
  char last_char;
  union{
    hash_lex_struct *char_tails;
    int iresult;
  };
  int ithis;
bk@work.mysql.com's avatar
bk@work.mysql.com committed
103
};
104 105 106
				
hash_lex_struct *get_hash_struct_by_len(hash_lex_struct **root_by_len, 
					    int len, int *max_len)
bk@work.mysql.com's avatar
bk@work.mysql.com committed
107
{
108 109 110 111 112 113 114 115 116
  if (*max_len<len){
    *root_by_len= (hash_lex_struct *)realloc((char*)*root_by_len,
			    sizeof(hash_lex_struct)*len);
    hash_lex_struct *cur, *end= *root_by_len + len;
    for (cur= *root_by_len + *max_len; cur<end; cur++)
      cur->first_char= 0;
    *max_len= len;
  }
  return (*root_by_len)+(len-1);
bk@work.mysql.com's avatar
bk@work.mysql.com committed
117 118
}

119 120
void insert_into_hash(hash_lex_struct *root, const char *name, 
		      int len_from_begin, int index, int function)
bk@work.mysql.com's avatar
bk@work.mysql.com committed
121
{
122
  hash_lex_struct *end, *cur, *tails;
bk@work.mysql.com's avatar
bk@work.mysql.com committed
123

124 125 126 127
  if (!root->first_char){
    root->first_char= -1;
    root->iresult= index;
    return;
bk@work.mysql.com's avatar
bk@work.mysql.com committed
128 129
  }

130 131 132 133 134 135 136 137 138 139 140
  if (root->first_char==-1){
    int index2= root->iresult;
    const char *name2= 
      (index2<0 ? sql_functions[-index2-1] : symbols[index2]).name + len_from_begin;
    root->first_char= name2[0];
    root->last_char= root->first_char;
    tails= (hash_lex_struct*)malloc(sizeof(hash_lex_struct));
    root->char_tails= tails;
    tails->first_char= -1;
    tails->iresult= index2;
  }
bk@work.mysql.com's avatar
bk@work.mysql.com committed
141

142 143 144 145 146 147 148 149 150 151 152 153 154 155 156
  size_t real_size= (root->last_char-root->first_char+1);

  if (root->first_char>(*name)){
    size_t new_size= root->last_char-(*name)+1;
    if (new_size<real_size) printf("error!!!!\n");
    tails= root->char_tails;
    tails= (hash_lex_struct*)realloc((char*)tails,
				       sizeof(hash_lex_struct)*new_size);
    root->char_tails= tails;
    memmove(tails+(new_size-real_size),tails,real_size*sizeof(hash_lex_struct));
    end= tails + new_size - real_size;
    for (cur= tails; cur<end; cur++)
      cur->first_char= 0;
    root->first_char= (*name);
  }
157

158 159 160 161 162 163 164 165 166 167 168
  if (root->last_char<(*name)){
    size_t new_size= (*name)-root->first_char+1;
    if (new_size<real_size) printf("error!!!!\n");
    tails= root->char_tails;
    tails= (hash_lex_struct*)realloc((char*)tails,
				    sizeof(hash_lex_struct)*new_size);
    root->char_tails= tails;
    end= tails + new_size;
    for (cur= tails+real_size; cur<end; cur++)
      cur->first_char= 0;
    root->last_char= (*name);
bk@work.mysql.com's avatar
bk@work.mysql.com committed
169
  }
170

171 172
  insert_into_hash (root->char_tails+(*name)-root->first_char,
		    name+1,len_from_begin+1,index,function);
bk@work.mysql.com's avatar
bk@work.mysql.com committed
173 174
}

175 176
hash_lex_struct *root_by_len= 0;
int max_len=0;
bk@work.mysql.com's avatar
bk@work.mysql.com committed
177

178 179 180 181
hash_lex_struct *root_by_len2= 0;
int max_len2=0;

void insert_symbols()
bk@work.mysql.com's avatar
bk@work.mysql.com committed
182
{
183 184 185 186 187 188
  size_t i= 0;
  SYMBOL *cur;
  for (cur= symbols; i<array_elements(symbols); cur++, i++){
    hash_lex_struct *root= 
      get_hash_struct_by_len(&root_by_len,cur->length,&max_len);
    insert_into_hash(root,cur->name,0,i,0);
bk@work.mysql.com's avatar
bk@work.mysql.com committed
189 190 191
  }
}

192
void insert_sql_functions()
bk@work.mysql.com's avatar
bk@work.mysql.com committed
193
{
194 195 196 197 198 199 200 201
  size_t i= 0;
  SYMBOL *cur;
  for (cur= sql_functions; i<array_elements(sql_functions); cur++, i++){
    hash_lex_struct *root= 
      get_hash_struct_by_len(&root_by_len,cur->length,&max_len);
    insert_into_hash(root,cur->name,0,-i-1,1);
  }
}
bk@work.mysql.com's avatar
bk@work.mysql.com committed
202

203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226
void generate_find_structs()
{
  root_by_len= 0;
  max_len=0;
  size_t i;

  SYMBOL *cur, *end= symbols + array_elements(symbols);
  for (cur= symbols; cur < end; cur++)
    cur->length=(uchar) strlen(cur->name);
  end= sql_functions + array_elements(sql_functions);
  for (cur= sql_functions; cur<end; cur++)
    cur->length=(uchar) strlen(cur->name);
  
  insert_symbols();

  root_by_len2= root_by_len;
  max_len2= max_len;

  root_by_len= 0;
  max_len= 0;

  insert_symbols();
  insert_sql_functions();
}
bk@work.mysql.com's avatar
bk@work.mysql.com committed
227

228 229
char *hash_map= 0;
int size_hash_map= 0;
bk@work.mysql.com's avatar
bk@work.mysql.com committed
230

231 232 233 234 235 236 237 238 239
void add_struct_to_map(hash_lex_struct *st)
{
  st->ithis= size_hash_map/4;
  size_hash_map+= 4;
  hash_map= (char*)realloc((char*)hash_map,size_hash_map);
  hash_map[size_hash_map-4]= st->first_char==-1 ? 0 : st->first_char;
  hash_map[size_hash_map-3]= 
    st->first_char==-1 || st->first_char==0 ? 0 : st->last_char;
  if (st->first_char==-1)
bk@work.mysql.com's avatar
bk@work.mysql.com committed
240
  {
241 242
    hash_map[size_hash_map-2]= ((unsigned int)(int16)st->iresult)&255;
    hash_map[size_hash_map-1]= ((unsigned int)(int16)st->iresult)>>8;
bk@work.mysql.com's avatar
bk@work.mysql.com committed
243
  }
244
  else if (st->first_char==0)
bk@work.mysql.com's avatar
bk@work.mysql.com committed
245
  {
246 247
    hash_map[size_hash_map-2]= ((unsigned int)(int16)array_elements(symbols))&255;
    hash_map[size_hash_map-1]= ((unsigned int)(int16)array_elements(symbols))>>8;
bk@work.mysql.com's avatar
bk@work.mysql.com committed
248
  }
249
};
bk@work.mysql.com's avatar
bk@work.mysql.com committed
250

251 252 253 254 255 256 257 258 259
void add_structs_to_map(hash_lex_struct *st, int len)
{
  hash_lex_struct *cur, *end= st+len;
  for (cur= st; cur<end; cur++)
    add_struct_to_map(cur);
  for (cur= st; cur<end; cur++)
    if (cur->first_char && cur->first_char!=-1)
      add_structs_to_map(cur->char_tails,cur->last_char-cur->first_char+1);
}
bk@work.mysql.com's avatar
bk@work.mysql.com committed
260

261 262 263 264 265 266 267 268 269
void set_links(hash_lex_struct *st, int len)
{
  hash_lex_struct *cur, *end= st+len;
  for (cur= st; cur<end; cur++)
    if (cur->first_char!=0 && cur->first_char!=-1){
      int ilink= cur->char_tails->ithis;
      hash_map[cur->ithis*4+2]= ilink%256;
      hash_map[cur->ithis*4+3]= ilink/256;
      set_links(cur->char_tails,cur->last_char-cur->first_char+1);
bk@work.mysql.com's avatar
bk@work.mysql.com committed
270 271 272
    }
}

273
void print_hash_map(const char *name)
bk@work.mysql.com's avatar
bk@work.mysql.com committed
274
{
275 276 277 278 279 280 281 282 283 284 285 286 287
  printf("uchar %s[%d]= {\n",name,size_hash_map);
  char *cur;
  int i;
  for (i=0, cur= hash_map; i<size_hash_map; i++, cur++){
    switch(i%4){
    case 0: case 1:
      if (!*cur)
	printf("0,   ");
      else
	printf("\'%c\', ",*cur);
      break;
    case 2: printf("%u, ",(uint)(uchar)*cur); break;
    case 3: printf("%u,\n",(uint)(uchar)*cur); break;
monty@hundin.mysql.fi's avatar
monty@hundin.mysql.fi committed
288
    }
bk@work.mysql.com's avatar
bk@work.mysql.com committed
289
  }
290 291
  printf("};\n");
}
bk@work.mysql.com's avatar
bk@work.mysql.com committed
292

293 294 295 296 297
void print_find_structs()
{
  add_structs_to_map(root_by_len,max_len);
  set_links(root_by_len,max_len);
  print_hash_map("sql_functions_map");
bk@work.mysql.com's avatar
bk@work.mysql.com committed
298

299 300
  hash_map= 0;
  size_hash_map= 0;
bk@work.mysql.com's avatar
bk@work.mysql.com committed
301

302
  printf("\n");
bk@work.mysql.com's avatar
bk@work.mysql.com committed
303

304 305 306 307
  add_structs_to_map(root_by_len2,max_len2);
  set_links(root_by_len2,max_len2);
  print_hash_map("symbols_map");
}
bk@work.mysql.com's avatar
bk@work.mysql.com committed
308 309 310

static void usage(int version)
{
311
  printf("%s  Ver 3.5 Distrib %s, for %s (%s)\n",
bk@work.mysql.com's avatar
bk@work.mysql.com committed
312 313 314
	 my_progname, MYSQL_SERVER_VERSION, SYSTEM_TYPE, MACHINE_TYPE);
  if (version)
    return;
315 316 317
  puts("Copyright (C) 2001 MySQL AB, by VVA and Monty");
  puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,\n\
and you are welcome to modify and redistribute it under the GPL license\n");
bk@work.mysql.com's avatar
bk@work.mysql.com committed
318
  puts("This program generates a perfect hashing function for the sql_lex.cc");
319 320 321
  printf("Usage: %s [OPTIONS]\n\n", my_progname);
  my_print_help(my_long_options);
}
bk@work.mysql.com's avatar
bk@work.mysql.com committed
322

323 324 325 326
static my_bool
get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
	       char *argument __attribute__((unused)))
{
327
  switch(optid) {
328 329 330 331 332 333 334 335 336
  case 'V':
    usage(1);
    exit(0);
  case 'I':
  case '?':
    usage(0);
    exit(0);
  }
  return 0;
bk@work.mysql.com's avatar
bk@work.mysql.com committed
337 338 339 340
}

static int get_options(int argc, char **argv)
{
341
  int ho_error;
bk@work.mysql.com's avatar
bk@work.mysql.com committed
342

343
  if ((ho_error=handle_options(&argc, &argv, my_long_options, get_one_option)))
344 345
    exit(ho_error);

bk@work.mysql.com's avatar
bk@work.mysql.com committed
346 347 348
  if (argc >= 1)
  {
    usage(0);
349
     exit(1);
bk@work.mysql.com's avatar
bk@work.mysql.com committed
350 351 352 353 354 355 356
  }
  return(0);
}

int main(int argc,char **argv)
{
  MY_INIT(argv[0]);
357

358
  if (get_options(argc,(char **) argv))
bk@work.mysql.com's avatar
bk@work.mysql.com committed
359 360
    exit(1);

monty@hundin.mysql.fi's avatar
monty@hundin.mysql.fi committed
361
  printf("/* Copyright (C) 2001 MySQL AB\n\
bk@work.mysql.com's avatar
bk@work.mysql.com committed
362 363 364 365 366 367 368 369 370 371
   This program is free software; you can redistribute it and/or modify\n\
   it under the terms of the GNU General Public License as published by\n\
   the Free Software Foundation; either version 2 of the License, or\n\
   (at your option) any later version.\n\n\
   This program is distributed in the hope that it will be useful,\n\
   but WITHOUT ANY WARRANTY; without even the implied warranty of\n\
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n\
   GNU General Public License for more details.\n\n\
   You should have received a copy of the GNU General Public License\n\
   along with this program; if not, write to the Free Software\n\
372 373
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307\
  USA */\n\n");
bk@work.mysql.com's avatar
bk@work.mysql.com committed
374

375 376
  printf("/* This code is generated by gen_lex_hash.cc that seeks for\
 a perfect\nhash function */\n\n");
bk@work.mysql.com's avatar
bk@work.mysql.com committed
377 378
  printf("#include \"lex.h\"\n\n");

379 380
  generate_find_structs();
  print_find_structs();
bk@work.mysql.com's avatar
bk@work.mysql.com committed
381

382 383
  printf("\nunsigned int sql_functions_max_len=%d;\n",max_len);
  printf("\nunsigned int symbols_max_len=%d;\n\n",max_len2);
384

385 386 387 388
  printf
(
"inline SYMBOL *get_hash_symbol(const char *s,\n\
                                    unsigned int len,bool function)\n\
bk@work.mysql.com's avatar
bk@work.mysql.com committed
389
{\n\
390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447
  register uchar *hash_map;\n\
  register const char *cur_str= s;\n\
  if (function){\n\
    if (len>sql_functions_max_len) return 0;\n\
    hash_map= sql_functions_map;\n\
    register uint32 cur_struct= uint4korr(hash_map+((len-1)*4));\n\
\n\
    for(;;){\n\
      register uchar first_char= (uchar)cur_struct;\n\
\n\
      if (first_char==0){\n\
        register int16 ires= (int16)(cur_struct>>16);\n\
        if (ires==array_elements(symbols)) return 0;\n\
        register SYMBOL *res;\n\
        if (ires>=0) \n\
          res= symbols+ires;\n\
        else\n\
          res= sql_functions-ires-1;\n\
        register uint count= cur_str-s;\n\
        return lex_casecmp(cur_str,res->name+count,len-count) ? 0 : res;\n\
      }\n\
\n\
      register uchar cur_char= (uchar)to_upper_lex[(uchar)*cur_str];\n\
      if (cur_char<first_char) return 0;\n\
      cur_struct>>=8;\n\
      if (cur_char>(uchar)cur_struct) return 0;\n\
\n\
      cur_struct>>=8;\n\
      cur_struct= uint4korr(hash_map+\n\
                        (((uint16)cur_struct + cur_char - first_char)*4));\n\
      cur_str++;\n\
    }\n\
  }else{\n\
    if (len>symbols_max_len) return 0;\n\
    hash_map= symbols_map;\n\
    register uint32 cur_struct= uint4korr(hash_map+((len-1)*4));\n\
\n\
    for(;;){\n\
      register uchar first_char= (uchar)cur_struct;\n\
\n\
      if (first_char==0){\n\
        register int16 ires= (int16)(cur_struct>>16);\n\
        if (ires==array_elements(symbols)) return 0;\n\
        register SYMBOL *res= symbols+ires;\n\
        register uint count= cur_str-s;\n\
        return lex_casecmp(cur_str,res->name+count,len-count)!=0 ? 0 : res;\n\
      }\n\
\n\
      register uchar cur_char= (uchar)to_upper_lex[(uchar)*cur_str];\n\
      if (cur_char<first_char) return 0;\n\
      cur_struct>>=8;\n\
      if (cur_char>(uchar)cur_struct) return 0;\n\
\n\
      cur_struct>>=8;\n\
      cur_struct= uint4korr(hash_map+\n\
                        (((uint16)cur_struct + cur_char - first_char)*4));\n\
      cur_str++;\n\
    }\n\
bk@work.mysql.com's avatar
bk@work.mysql.com committed
448
  }\n\
449 450
}\n"
);
bk@work.mysql.com's avatar
bk@work.mysql.com committed
451
}
452