Commit b3cedf63 authored by Vladislav Vaintroub's avatar Vladislav Vaintroub

MDEV-4742 - address review comments.

- Remove second optional parameter to natural_sort_key(), and all fraction
handling.

- Rename natsort_num2str() to natsort_encode_length() to show the intention
that it encodes string *lengths*, and not encode whitespaces and what not.

Handles lengths for which log10(len) >= 10,  even if they do not happen for
MariaDB Strings (where length is limited by 32bit, and log10(len) is <= 9)

- Do not let natural sort key grow past max_packet_length.


- Split Item_func_natural_sort_key::val_str() further and add
natsort_encode_numeric_string(), which contains comment on how
whitespaces are handled.

- Simplify, and speedup to_natsort_key() in common case, by removing
handling of weird charsets utf16/32, that encode numbers in several bytes.
In rare cases utf16/32 is used, we'll convert to utf8 prior to
creating keys, and back to original charset afterwards.
parent 5b29d407
......@@ -64,106 +64,45 @@ SHOW CREATE TABLE t2;
Table Create Table
t2 CREATE TABLE `t2` (
`c` varchar(30) CHARACTER SET latin1 COLLATE latin1_bin DEFAULT NULL,
`NATURAL_SORT_KEY(c)` varchar(45) CHARACTER SET latin1 COLLATE latin1_bin DEFAULT NULL
`NATURAL_SORT_KEY(c)` varchar(60) CHARACTER SET latin1 COLLATE latin1_bin DEFAULT NULL
) ENGINE=MyISAM DEFAULT CHARSET=latin1
DROP TABLE t1,t2;
SELECT RPAD(val,21,' ') value , RPAD(NATURAL_SORT_KEY(val),26,' ') sortkey , LENGTH(NATURAL_SORT_KEY(val)) - LENGTH(val) encoding_overhead
SELECT RPAD(val,28,' ') value , RPAD(NATURAL_SORT_KEY(val),35,' ') sortkey , LENGTH(NATURAL_SORT_KEY(val)) - LENGTH(val) encoding_overhead
FROM
(
SELECT 0 val
UNION VALUES ('0.0'),('0.1'), ('0,15'),('1.001'),('1.002'),('1.010'),('1.02'),('1.1'),('1.3'),('1'),('01'),('0001')
UNION SELECT CONCAT('1',repeat('0',seq)) FROM seq_1_to_20
UNION VALUES ('1'),('01'),('0001')
UNION SELECT CONCAT('1',repeat('0',seq)) FROM seq_1_to_27
) AS numbers ORDER BY sortkey;
value sortkey encoding_overhead
0 00 1
0,15 00,115 2
0.0 00.00 2
0.1 00.01 2
1 01 1
1.1 01.01 2
1.001 01.011 1
1.02 01.020 2
1.002 01.021 1
1.3 01.03 2
1.010 01.1100 2
01 010 1
0001 012 -1
10 110 1
100 2100 1
1000 31000 1
10000 410000 1
100000 5100000 1
1000000 61000000 1
10000000 710000000 1
100000000 8100000000 1
1000000000 901000000000 2
10000000000 9110000000000 2
100000000000 92100000000000 2
1000000000000 931000000000000 2
10000000000000 9410000000000000 2
100000000000000 95100000000000000 2
1000000000000000 961000000000000000 2
10000000000000000 9710000000000000000 2
100000000000000000 98100000000000000000 2
1000000000000000000 9901000000000000000000 3
10000000000000000000 9912010000000000000000000 5
100000000000000000000 99121100000000000000000000 5
# Disable fractions handling by passing NULL as second parameter to NATURAL_SORT_KEY
SELECT val value
FROM
(
SELECT 0 val
UNION VALUES ('0.1'),('1.001'),('1.002'),('1.010'),('1.02'),('1.1'),('1.3'),('1'), ('0,1'),('1,001'),('1,002'),('1,010'),('1,02'),('1,1'),('1,3'),('1'))
AS numbers ORDER BY NATURAL_SORT_KEY(val, NULL);
value
0
0,1
0.1
1
1,1
1,001
1,02
1,002
1,3
1,010
1.1
1.001
1.02
1.002
1.3
1.010
# Use ',' as decimal separator for NATURAL_SORT_KEY
SELECT val value, NATURAL_SORT_KEY(val,',') sortkey
FROM
(
SELECT 0 val
UNION VALUES ('0,1'),('1,001'),('1,002'),('1,010'),('1,02'),('1,1'),('1,3'),('1'))
AS numbers ORDER BY sortkey;
value sortkey
0 00
0,1 00,1
1 01
1,001 01,001
1,002 01,002
1,010 01,010
1,02 01,02
1,1 01,1
1,3 01,3
# Use '.' as decimal separator for NATURAL_SORT_KEY
SELECT val value,NATURAL_SORT_KEY(val,'.') sortkey
FROM
(
SELECT 0 val
UNION VALUES ('0.1'),('1.001'),('1.002'),('1.010'),('1.02'),('1.1'),('1.3'),('1'))
AS numbers ORDER BY sortkey;
value sortkey
0 00
0.1 00.1
1 01
1.001 01.001
1.002 01.002
1.010 01.010
1.02 01.02
1.1 01.1
1.3 01.3
SET NAMES DEFAULT;
0 000 2
1 010 2
01 011 1
0001 013 -1
10 1100 2
100 21000 2
1000 310000 2
10000 4100000 2
100000 51000000 2
1000000 610000000 2
10000000 7100000000 2
100000000 81000000000 2
1000000000 9010000000000 3
10000000000 91100000000000 3
100000000000 921000000000000 3
1000000000000 9310000000000000 3
10000000000000 94100000000000000 3
100000000000000 951000000000000000 3
1000000000000000 9610000000000000000 3
10000000000000000 97100000000000000000 3
100000000000000000 981000000000000000000 3
1000000000000000000 99010000000000000000000 4
10000000000000000000 991100000000000000000000 4
100000000000000000000 9921000000000000000000000 4
1000000000000000000000 99310000000000000000000000 4
10000000000000000000000 994100000000000000000000000 4
100000000000000000000000 9951000000000000000000000000 4
1000000000000000000000000 99610000000000000000000000000 4
10000000000000000000000000 997100000000000000000000000000 4
100000000000000000000000000 9981000000000000000000000000000 4
1000000000000000000000000000 99901271000000000000000000000000000 8
......@@ -29,36 +29,11 @@ CREATE TABLE t2 AS SELECT c, NATURAL_SORT_KEY(c) FROM t1 WHERE 0;
SHOW CREATE TABLE t2;
DROP TABLE t1,t2;
#Show encoding of numbers, including fractions, and leading whitespace.
SELECT RPAD(val,21,' ') value , RPAD(NATURAL_SORT_KEY(val),26,' ') sortkey , LENGTH(NATURAL_SORT_KEY(val)) - LENGTH(val) encoding_overhead
#Show encoding of numbers, with some leading whitespace.
SELECT RPAD(val,28,' ') value , RPAD(NATURAL_SORT_KEY(val),35,' ') sortkey , LENGTH(NATURAL_SORT_KEY(val)) - LENGTH(val) encoding_overhead
FROM
(
SELECT 0 val
UNION VALUES ('0.0'),('0.1'), ('0,15'),('1.001'),('1.002'),('1.010'),('1.02'),('1.1'),('1.3'),('1'),('01'),('0001')
UNION SELECT CONCAT('1',repeat('0',seq)) FROM seq_1_to_20
UNION VALUES ('1'),('01'),('0001')
UNION SELECT CONCAT('1',repeat('0',seq)) FROM seq_1_to_27
) AS numbers ORDER BY sortkey;
--echo # Disable fractions handling by passing NULL as second parameter to NATURAL_SORT_KEY
SELECT val value
FROM
(
SELECT 0 val
UNION VALUES ('0.1'),('1.001'),('1.002'),('1.010'),('1.02'),('1.1'),('1.3'),('1'), ('0,1'),('1,001'),('1,002'),('1,010'),('1,02'),('1,1'),('1,3'),('1'))
AS numbers ORDER BY NATURAL_SORT_KEY(val, NULL);
--echo # Use ',' as decimal separator for NATURAL_SORT_KEY
SELECT val value, NATURAL_SORT_KEY(val,',') sortkey
FROM
(
SELECT 0 val
UNION VALUES ('0,1'),('1,001'),('1,002'),('1,010'),('1,02'),('1,1'),('1,3'),('1'))
AS numbers ORDER BY sortkey;
--echo # Use '.' as decimal separator for NATURAL_SORT_KEY
SELECT val value,NATURAL_SORT_KEY(val,'.') sortkey
FROM
(
SELECT 0 val
UNION VALUES ('0.1'),('1.001'),('1.002'),('1.010'),('1.02'),('1.1'),('1.3'),('1'))
AS numbers ORDER BY sortkey;
SET NAMES DEFAULT;
......@@ -1622,11 +1622,10 @@ class Create_func_name_const : public Create_func_arg2
virtual ~Create_func_name_const() {}
};
class Create_func_natural_sort_key : public Create_native_func
class Create_func_natural_sort_key : public Create_func_arg1
{
public:
virtual Item *create_native(THD *thd, LEX_CSTRING *name,
List<Item> *item_list) override;
virtual Item *create_1_arg(THD *thd, Item *arg1) override;
static Create_func_natural_sort_key s_singleton;
protected:
Create_func_natural_sort_key() {}
......@@ -4664,33 +4663,9 @@ Create_func_md5::create_1_arg(THD *thd, Item *arg1)
Create_func_natural_sort_key Create_func_natural_sort_key::s_singleton;
Item *Create_func_natural_sort_key::create_native(THD *thd, LEX_CSTRING *name,
List<Item> *item_list)
Item *Create_func_natural_sort_key::create_1_arg(THD *thd, Item* arg1)
{
Item *func= NULL;
int arg_count= 0;
if (item_list != NULL)
arg_count= item_list->elements;
Item *param_1, *param_2;
switch (arg_count)
{
case 1:
param_1= item_list->pop();
func= new (thd->mem_root) Item_func_natural_sort_key(thd, param_1);
break;
case 2:
param_1= item_list->pop();
param_2= item_list->pop();
func= new (thd->mem_root) Item_func_natural_sort_key(thd, param_1, param_2);
break;
default:
my_error(ER_WRONG_PARAMCOUNT_TO_NATIVE_FCT, MYF(0), name->str);
break;
}
return func;
return new (thd->mem_root) Item_func_natural_sort_key(thd, arg1);
}
Create_func_monthname Create_func_monthname::s_singleton;
......
......@@ -5440,212 +5440,264 @@ String *Item_temptable_rowid::val_str(String *str)
}
/**
Here is how we generate a key suitable for lexicographic comparison
from a numeric string (representing positive number).
We prepend variable length prefix, that only encodes string length
The longer the string, the lexicographically larger is the prefix.
Rules for generating prefix is following.
1. Small length (1 to 9)
Prefix is single char '0' + length -1
This gives the range '0' - '8'
2. Medium length(10 to 18)
Prefix is 2 chars - concat('9', '0'+length-10)'
This gives range '90'-'98'
3. Large length(19)
Prefix is "990"
4. Huge length (20 to 2^32-1)
Prefix stars with '99', then log10 of the length is added as char
then the number itself is added
The range is '99120' - '9994294967295'
*/
Helper routine to encode length prefix
in natsort_encode_numeric_string().
#define NATSORT_BUFSIZE 16
static size_t natsort_num2str(size_t n, char *s)
{
if (n < 10)
{
s[0]= '0' + (char)n -1;
return 1;
}
The idea is so that bigger input numbers correspond
lexicographically bigger output strings.
if (n < 19)
{
s[0]= '9';
s[1]= '0'+ (char)n- 10;
return 2;
}
Note, that in real use the number would typically
small, as it only computes variable *length prefixes*.
@param[in] n - the number
@param[in] s - output buffer (should be at least 26 bytes large)
@return - length of encoding
Here is how encoding works
- n is from 0 to 8
Output string calculated as '0'+n (range '0' - '8')
- n is from 9 to 17
Output calculated as concat('9', '0' + n -9)'
Output range: '90'-'98'
if (n == 19)
-n is from 18 to 26
Output calculated as concat('99', '0' + n -18)'
Output range '990'-'998'
- n is from 28 to SIZE_T_MAX
Output starts with '999',
then log10(n) is encoded as 2-digit decimal number
then the number itself is added.
Example : for 28 key is concat('999', '01' , '28')
i.e '9990128'
Key legth is 5 + ceil(log10(n))
Output range is
(64bit)'9990128' - '9991918446744073709551615'
(32bit)'9990128' - '999094294967295'
*/
/* Largest length of encoded string.*/
#define NATSORT_BUFSIZE 26
static size_t natsort_encode_length(size_t n, char *s)
{
if (n < 27)
{
s[0]= '9';
s[1]= '9';
s[2]= '0';
return 3;
size_t n_nines= n / 9;
if (n_nines)
memset(s, '9', n_nines);
s[n_nines]= char(n % 9 + '0');
s[n_nines + 1]= 0;
return n_nines + 1;
}
/* Here, n > 19*/
s[0]= '9';
s[1]= '9';
memset(s, '9', 3);
size_t log10n= 0;
for (size_t tmp= n/10; tmp; tmp /= 10)
for (size_t tmp= n / 10; tmp; tmp/= 10)
log10n++;
DBUG_ASSERT(log10n && log10n < 10);
s[2]='0' + (char)log10n;
return ll2str(n,s+3,10,0) - s;
s[3]= '0' + (char) (log10n / 10);
s[4]= '0' + (char) (log10n % 10);
return longlong10_to_str(n, s + 5, 10) - s;
}
enum class NATSORT_ERR
{
SUCCESS= 0,
KEY_TOO_LARGE= 1,
ALLOC_ERROR= 2
};
/*
Encode numeric string for natural sorting.
@param[in] in - start of the numeric string
skipping leading zeros
@param[in] n_digits - length of the string,
in characters, not counting leading zeros.
@param[in] n_lead_zeros - leading zeros count.
@param[out] out - String to write to.
Note:
Special case, where there are only leading zeros
n_digits == 0 and n_leading_zeros > 0
will treated as-if in = "0", n_digits = 1, n_leading_zeros
is decremented.
The resulting encoding of the numeric string is then
CONCAT(natsort_encode_length(n_digits), in,
natsort_encode_length(n_leading_zeros))
*/
static NATSORT_ERR natsort_encode_numeric_string(const char *in,
size_t n_digits,
size_t n_leading_zeros,
String *out)
{
char buf[NATSORT_BUFSIZE];
DBUG_ASSERT(in);
DBUG_ASSERT(n_digits);
if (out->append(buf, natsort_encode_length(n_digits - 1, buf)))
return NATSORT_ERR::ALLOC_ERROR;
if (out->append(in, n_digits))
return NATSORT_ERR::ALLOC_ERROR;
if (out->append(buf, natsort_encode_length(n_leading_zeros, buf)))
return NATSORT_ERR::ALLOC_ERROR;
return NATSORT_ERR::SUCCESS;
}
/*
Calculate max size of the natsort key.
A digit expands to 3 chars - length_prefix, the digit, lead_zero_cnt(0)
With even length L=2N, the largest key corresponds to input string
in form REPEAT('<letter><digit>',N) and the length of a key is
3N + N = 4*N = 2*L
With odd input length L=2*N+1, largest key corresponds to
CONCAT(<digit>,REPEAT('<letter><digit>', N)
with key length is 3 + 4*N = 2*L+1
*/
static size_t natsort_max_key_size(size_t input_size)
{
return input_size * 2 + input_size % 2;
}
/**
Convert a string to natural sort key.
@param[in] in - input string
@param[out] s - output string
@param[out] out - output string
We assume that memory is preallocated for the output
string, so that appends do not fail.
The lead zeros count, if positive, is stored after
the numeric string.
Fractional parts are stored and compared as strings.
Even if they have numbers in them, they should just
be compared as strings.
*/
static void to_natsort_key(const String *in, String *s, my_wc_t decimal_sep)
static NATSORT_ERR to_natsort_key(const String *in, String *out,
size_t max_key_size= (size_t) -1)
{
CHARSET_INFO *cs= in->charset();
uchar *pos= (uchar *) in->ptr();
uchar *end= pos + in->length();
size_t num_len= 0;
size_t leading_zeros= 0;
uchar *num_start= nullptr;
bool in_fraction= false;
size_t fraction_len=0;
size_t n_digits= 0;
size_t n_lead_zeros= 0;
size_t num_start;
for (;;)
for (size_t pos= 0;; pos++)
{
my_wc_t c= 0;
int charlen= cs->mb_wc(&c, pos, end);
bool is_digit= charlen >= 1 && (c >= '0' && c <= '9');
if (!is_digit && (num_start || leading_zeros))
char c= pos < in->length() ? (*in)[pos] : 0;
bool is_digit= (c >= '0' && c <= '9');
if (!is_digit && (n_digits || n_lead_zeros))
{
/*
Handle end of digits run.
Write prefix, natsort_num2str(length) is without leading zeros
Write numeric value, without leading zeros
If there were leading zeros, write variable length suffix
*/
char buf[NATSORT_BUFSIZE];
if (!num_len)
/* Handle end of digits run.*/
if (!n_digits)
{
// number was zeros only
leading_zeros--;
num_len= 1;
/*We only have zeros.*/
n_lead_zeros--;
num_start= pos - 1;
n_digits= 1;
}
NATSORT_ERR err= natsort_encode_numeric_string(
in->ptr() + num_start, n_digits, n_lead_zeros, out);
if (err != NATSORT_ERR::SUCCESS)
return err;
s->append(buf, natsort_num2str(num_len, buf));
if (!num_start)
s->append('0');
else
s->append((const char *) num_start, pos - num_start, cs);
if (leading_zeros)
s->append(buf, natsort_num2str(leading_zeros, buf));
if (out->length() > max_key_size)
return NATSORT_ERR::KEY_TOO_LARGE;
/* Reset state.*/
num_len= 0;
num_start= nullptr;
leading_zeros= 0;
in_fraction= c == decimal_sep;
fraction_len= 0;
n_digits= 0;
num_start= size_t(-1);
n_lead_zeros= 0;
}
if (charlen < 1)
if (pos == in->length())
break;
if (is_digit && !in_fraction)
if (!is_digit)
{
/* Do not count leading zeros in num_len, and num_pos */
if (c != '0' || num_len)
{
num_len++;
if (!num_start)
num_start= pos;
}
else
leading_zeros++;
if (out->append(c))
return NATSORT_ERR::KEY_TOO_LARGE;
}
else
{
/* Append a non-digit, or fractional part of the number (handling the same as string) */
s->append((const char *) pos, charlen);
if (in_fraction)
{
if (!is_digit && fraction_len)
in_fraction= false;
fraction_len++;
}
}
pos+= charlen;
else if (c == '0' && !n_digits)
n_lead_zeros++;
else if (!n_digits++)
num_start= pos;
if (out->length() + n_digits > max_key_size)
return NATSORT_ERR::KEY_TOO_LARGE;
}
return NATSORT_ERR::SUCCESS;
}
String *Item_func_natural_sort_key::val_str(String *s)
String *Item_func_natural_sort_key::val_str(String *out)
{
if (args[0]->is_null())
{
null_value= true;
return nullptr;
}
NATSORT_ERR err= NATSORT_ERR::SUCCESS;
String *in= args[0]->val_str();
CHARSET_INFO *cs= in->charset();
ulong max_allowed_packet= current_thd->variables.max_allowed_packet;
size_t reserve_length= (size_t) in->length() + in->length() / 2 + 2;
if (s->alloc((uint32)reserve_length))
goto error_exit;
s->length(0);
s->set_charset(cs);
to_natsort_key(in, s, decimal_separator());
DBUG_ASSERT(s->length() < reserve_length);
size_t reserve_length= std::max(natsort_max_key_size(in->length()),
(size_t) max_allowed_packet);
uint errs;
String tmp;
/*
to_natsort_key() only support charsets where digits are represented by
a single byte in range 0x30-0x39. Almost everything is OK, just utf16/32
won't do. Full ASCII compatibility is not required, so that SJIS and SWE7
are fine.
*/
if (cs->mbminlen != 1)
{
if (!tmp.copy(in, &my_charset_utf8mb4_bin, &errs))
goto error_exit;
in= &tmp;
}
if (s->length() > max_allowed_packet)
if (out->alloc((uint32) reserve_length))
{
push_warning_printf(current_thd, Sql_condition::WARN_LEVEL_WARN,
ER_WARN_ALLOWED_PACKET_OVERFLOWED,
ER(ER_WARN_ALLOWED_PACKET_OVERFLOWED), func_name(),
max_allowed_packet);
err= NATSORT_ERR::ALLOC_ERROR;
goto error_exit;
}
return s;
error_exit:
null_value=true;
return nullptr;
}
out->length(0);
out->set_charset(in->charset());
err= to_natsort_key(in, out, max_allowed_packet / cs->mbminlen);
my_wc_t Item_func_natural_sort_key::decimal_separator()
{
if (m_decimal_separator != DECIMAL_SEP_UNDEFINED)
return m_decimal_separator;
if (err != NATSORT_ERR::SUCCESS)
{
if (err == NATSORT_ERR::KEY_TOO_LARGE)
{
push_warning_printf(current_thd, Sql_condition::WARN_LEVEL_WARN,
ER_WARN_ALLOWED_PACKET_OVERFLOWED,
ER(ER_WARN_ALLOWED_PACKET_OVERFLOWED), func_name(),
max_allowed_packet);
}
goto error_exit;
}
DBUG_ASSERT(arg_count > 1);
String *s= args[1]->val_str();
if (s && s->length())
if (cs->mbminlen != 1)
{
my_wc_t c;
if (s->charset()->mb_wc(&c, (const uchar *) s->ptr(),
(const uchar *) s->end()))
return c;
/* output string is now utf8, convert to input charset.*/
if (tmp.copy(out, cs, &errs) || out->copy(tmp))
goto error_exit;
}
return DECIMAL_SEP_NONE;
return out;
error_exit:
null_value= true;
return nullptr;
}
bool Item_func_natural_sort_key::fix_length_and_dec(void)
......@@ -5653,15 +5705,13 @@ bool Item_func_natural_sort_key::fix_length_and_dec(void)
if (agg_arg_charsets_for_string_result(collation, args, 1))
return true;
DBUG_ASSERT(collation.collation != NULL);
uint32 max_clen= args[0]->max_char_length();
fix_char_length(max_clen + max_clen/2 + ((max_clen <= 3)?1:0));
set_maybe_null();
uint32 max_char_len=
(uint32) natsort_max_key_size(args[0]->max_char_length());
fix_char_length(max_char_len);
if (arg_count> 1 && args[1]->can_eval_in_optimize())
{
/* Initialize decimal separator */
m_decimal_separator= decimal_separator();
}
set_maybe_null(args[0]->maybe_null() ||
max_char_len * collation.collation->mbmaxlen >
current_thd->variables.max_allowed_packet);
return false;
}
......
......@@ -273,17 +273,9 @@ class Item_func_aes_decrypt :public Item_aes_crypt
class Item_func_natural_sort_key : public Item_str_func
{
my_wc_t m_decimal_separator;
static constexpr my_wc_t DECIMAL_SEP_UNDEFINED=ULONG_MAX-1;
static constexpr my_wc_t DECIMAL_SEP_NONE= ULONG_MAX;
public:
Item_func_natural_sort_key(THD *thd, Item *a)
: Item_str_func(thd, a), m_decimal_separator(DECIMAL_SEP_NONE){}
Item_func_natural_sort_key(THD *thd, Item *a, Item *b)
: Item_str_func(thd, a, b), m_decimal_separator(DECIMAL_SEP_UNDEFINED){}
my_wc_t decimal_separator();
: Item_str_func(thd, a){};
String *val_str(String *) override;
LEX_CSTRING func_name_cstring() const override
{
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment