Commit 0c66f4a6 authored by V Narayanan's avatar V Narayanan

Bug#45803 Inaccurate estimates for partial key values with IBMDB2I

Some collations were causing IBMDB2I to report
inaccurate key range estimations to the optimizer
for LIKE clauses that select substrings. This can
be seen by running EXPLAIN. This problem primarily
affects multi-byte and unicode character sets.

This patch involves substantial changes to several
modules. There are a number of problems with the
character set and collation handling. These problems
have been or are being fixed,  and a comprehensive
test has been included which should provide much
better coverage than there was before. This test
is enabled only for IBM i 6.1, because that version
has support for the greatest number of collations.

mysql-test/suite/ibmdb2i/r/ibmdb2i_collations.result:
  Bug#45803 Inaccurate estimates for partial key values with IBMDB2I
  
  result file for test case.
mysql-test/suite/ibmdb2i/t/ibmdb2i_collations.test:
  Bug#45803 Inaccurate estimates for partial key values with IBMDB2I
  
  Tests for character sets and collations. This test
  is enabled only for IBM i 6.1, because that version
  has support for the greatest number of collations.
storage/ibmdb2i/db2i_conversion.cc:
  Bug#45803 Inaccurate estimates for partial key values with IBMDB2I
  
  - Added support in convertFieldChars to enable records_in_range
    to determine how many substitute characters were inserted and
    to suppress conversion warnings.
  
  - Fixed bug which was causing all multi-byte and Unicode fields
    to be created as UTF16 (CCSID 1200) fields in DB2. The corrected
    code will now create UCS2 fields as UCS2 (CCSID 13488), UTF8
    fields (except for utf8_general_ci) as UTF8 (CCSID 1208), and
    all other multi-byte or Unicode fields as UTF16.  This will only
    affect tables that are newly created through the IBMDB2I storage
    engine. Existing IBMDB2I tables will retain the original CCSID
    until recreated. The existing behavior is believed to be
    functionally correct, but it may negatively impact performance
    by causing unnecessary character conversion. Additionally, users
    accessing IBMDB2I tables through DB2 should be aware that mixing 
    tables created before and after this change may require extra type
    casts or other workarounds.  For this reason, users who have
    existing IBMDB2I tables using a Unicode collation other than
    utf8_general_ci are encouraged to recreate their tables (e.g.
    ALTER TABLE t1 ENGINE=IBMDB2I) in order to get the updated CCSIDs
    associated with their DB2 tables.
  
  - Improved error reporting for unsupported character sets by forcing
    a check for the iconv conversion table at table creation time,
    rather than at data access time.
storage/ibmdb2i/db2i_myconv.h:
  Bug#45803 Inaccurate estimates for partial key values with IBMDB2I
  
  Fix to set errno when iconv fails.
storage/ibmdb2i/db2i_rir.cc:
  Bug#45803 Inaccurate estimates for partial key values with IBMDB2I
  
  Significant improvements were made to the records_in_range code
  that handles partial length string data in keys for optimizer plan
  estimation. Previously, to obtain an estimate for a partial key
  value, the implementation would perform any necessary character
  conversion and then attempt to determine the unpadded length of
  the partial key by searching for the minimum or maximum sort
  character. While this algorithm was sufficient for most single-byte
  character sets, it did not treat Unicode and multi-byte strings
  correctly. Furthermore, due to an operating system limitation,
  partial keys having UTF8 collations (ICU sort sequences in DB2)
  could not be estimated with this method.
  
  With this patch, the code no longer attempts to explicitly determine
  the unpadded length of the key. Instead, the entire key is converted
  (if necessary), including padding, and then passed to the operating
  system for estimation. Depending on the source and target character
  sets and collations, additional logic is required to correctly
  handle cases in which MySQL uses unconvertible or differently
  -weighted values to pad the key. The bulk of the patch exists
  to implement this additional logic.
storage/ibmdb2i/ha_ibmdb2i.h:
  Bug#45803 Inaccurate estimates for partial key values with IBMDB2I
  
  The convertFieldChars declaration was updated to support additional 
  optional behaviors.
parent 6a14a235
This diff is collapsed.
source suite/ibmdb2i/include/have_ibmdb2i.inc;
source suite/ibmdb2i/include/have_i61.inc;
--disable_warnings
drop table if exists t1, ffd, fd;
--enable_warnings
--disable_abort_on_error
--error 0,255
exec system "DLTF QGPL/FFDOUT" > /dev/null;
--error 0,255
exec system "DLTF QGPL/FDOUT" > /dev/null;
--enable_abort_on_error
let $count= query_get_value(select count(*) from information_schema.COLLATIONS where COLLATION_NAME <> "binary", count(*),1);
while ($count)
{
let $collation = query_get_value(select COLLATION_NAME from information_schema.COLLATIONS where COLLATION_NAME <> "binary" order by COLLATION_NAME desc, COLLATION_NAME, $count);
error 0,1005,2504,2028;
eval CREATE TABLE t1 ($collation integer, c char(10), v varchar(20), index(c), index(v)) collate $collation engine=ibmdb2i;
if (!$mysql_errno)
{
insert into t1 (c,v) values ("abc","def"),("abcd", "def"),("abcde","defg"),("aaaa","bbbb");
insert into t1 select * from t1;
explain select c,v from t1 force index(c) where c like "ab%";
explain select c,v from t1 force index(v) where v like "de%";
drop table t1;
eval create table t1 ($collation char(10) primary key) collate $collation engine=ibmdb2i;
system system "DSPFFD FILE(\"test\"/\"t1\") OUTPUT(*OUTFILE) OUTFILE(QGPL/FFDOUT) OUTMBR(*FIRST *ADD)" > /dev/null;
system system "DSPFD FILE(\"test\"/\"t1\") TYPE(*SEQ) OUTPUT(*OUTFILE) OUTFILE(QGPL/FDOUT) OUTMBR(*FIRST *ADD)" > /dev/null;
drop table t1;
}
dec $count;
}
create table ffd (WHCHD1 CHAR(20), WHCSID decimal(5,0)) engine=ibmdb2i;
system system "CPYF FROMFILE(QGPL/FFDOUT) TOFILE(\"test\"/\"ffd\") mbropt(*replace) fmtopt(*drop *map)" > /dev/null;
create table fd (SQSSEQ CHAR(10)) engine=ibmdb2i;
system system "CPYF FROMFILE(QGPL/FDOUT) TOFILE(\"test\"/\"fd\") mbropt(*replace) fmtopt(*drop *map)" > /dev/null;
create temporary table intermed (row integer key auto_increment, cs char(30), ccsid integer);
insert into intermed (cs, ccsid) select * from ffd;
create temporary table intermed2 (row integer key auto_increment, srtseq char(10));
insert into intermed2 (srtseq) select * from fd;
select ccsid, cs, srtseq from intermed inner join intermed2 on intermed.row = intermed2.row;
drop table ffd, fd;
......@@ -137,7 +137,9 @@ int ha_ibmdb2i::convertFieldChars(enum_conversionDirection direction,
char* output,
size_t ilen,
size_t olen,
size_t* outDataLen)
size_t* outDataLen,
bool tacitErrors,
size_t* substChars)
{
DBUG_PRINT("ha_ibmdb2i::convertFieldChars",("Direction: %d; length = %d", direction, ilen));
......@@ -157,26 +159,26 @@ int ha_ibmdb2i::convertFieldChars(enum_conversionDirection direction,
size_t initOLen= olen;
size_t substitutedChars = 0;
int rc = iconv(conversion, (char**)&input, &ilen, &output, &olen, &substitutedChars );
if (outDataLen) *outDataLen = initOLen - olen;
if (substChars) *substChars = substitutedChars;
if (unlikely(rc < 0))
{
int er = errno;
if (er == EILSEQ)
{
getErrTxt(DB2I_ERR_ILL_CHAR, table->field[fieldID]->field_name);
if (!tacitErrors) getErrTxt(DB2I_ERR_ILL_CHAR, table->field[fieldID]->field_name);
return (DB2I_ERR_ILL_CHAR);
}
else
{
getErrTxt(DB2I_ERR_ICONV,er);
if (!tacitErrors) getErrTxt(DB2I_ERR_ICONV,er);
return (DB2I_ERR_ICONV);
}
}
if (unlikely(substitutedChars))
if (unlikely(substitutedChars) && (!tacitErrors))
{
warning(ha_thd(), DB2I_ERR_SUB_CHARS, table->field[fieldID]->field_name);
}
if (outDataLen) *outDataLen = initOLen - olen;
return (0);
}
......@@ -555,12 +557,12 @@ int ha_ibmdb2i::getFieldTypeMapping(Field* field,
return 1;
if (fieldCharSet->mbmaxlen > 1)
{
if (strncmp(fieldCharSet->name, "ucs2_", sizeof("ucs2_")) == 0 ) // UCS2
if (memcmp(fieldCharSet->name, "ucs2_", sizeof("ucs2_")-1) == 0 ) // UCS2
{
sprintf(stringBuildBuffer, "GRAPHIC(%d)", max(fieldLength / fieldCharSet->mbmaxlen, 1)); // Number of characters
db2Ccsid = 13488;
}
else if (strncmp(fieldCharSet->name, "utf8_", sizeof("utf8_")) == 0 &&
else if (memcmp(fieldCharSet->name, "utf8_", sizeof("utf8_")-1) == 0 &&
strcmp(fieldCharSet->name, "utf8_general_ci") != 0)
{
sprintf(stringBuildBuffer, "CHAR(%d)", max(fieldLength, 1)); // Number of bytes
......@@ -584,12 +586,12 @@ int ha_ibmdb2i::getFieldTypeMapping(Field* field,
{
if (fieldCharSet->mbmaxlen > 1)
{
if (strncmp(fieldCharSet->name, "ucs2_", sizeof("ucs2_")) == 0 ) // UCS2
if (memcmp(fieldCharSet->name, "ucs2_", sizeof("ucs2_")-1) == 0 ) // UCS2
{
sprintf(stringBuildBuffer, "VARGRAPHIC(%d)", max(fieldLength / fieldCharSet->mbmaxlen, 1)); // Number of characters
db2Ccsid = 13488;
}
else if (strncmp(fieldCharSet->name, "utf8_", sizeof("utf8_")) == 0 &&
else if (memcmp(fieldCharSet->name, "utf8_", sizeof("utf8_")-1) == 0 &&
strcmp(fieldCharSet->name, "utf8_general_ci") != 0)
{
sprintf(stringBuildBuffer, "VARCHAR(%d)", max(fieldLength, 1)); // Number of bytes
......@@ -611,12 +613,12 @@ int ha_ibmdb2i::getFieldTypeMapping(Field* field,
{
if (fieldCharSet->mbmaxlen > 1)
{
if (strncmp(fieldCharSet->name, "ucs2_", sizeof("ucs2_")) == 0 ) // UCS2
if (memcmp(fieldCharSet->name, "ucs2_", sizeof("ucs2_")-1) == 0 ) // UCS2
{
sprintf(stringBuildBuffer, "LONG VARGRAPHIC ");
db2Ccsid = 13488;
}
else if (strncmp(fieldCharSet->name, "utf8_", sizeof("utf8_")) == 0 &&
else if (memcmp(fieldCharSet->name, "utf8_", sizeof("utf8_")-1) == 0 &&
strcmp(fieldCharSet->name, "utf8_general_ci") != 0)
{
sprintf(stringBuildBuffer, "LONG VARCHAR ");
......@@ -639,12 +641,12 @@ int ha_ibmdb2i::getFieldTypeMapping(Field* field,
if (fieldCharSet->mbmaxlen > 1)
{
if (strncmp(fieldCharSet->name, "ucs2_", sizeof("ucs2_")) == 0 ) // UCS2
if (memcmp(fieldCharSet->name, "ucs2_", sizeof("ucs2_")-1) == 0 ) // UCS2
{
sprintf(stringBuildBuffer, "DBCLOB(%d)", max(fieldLength / fieldCharSet->mbmaxlen, 1)); // Number of characters
db2Ccsid = 13488;
}
else if (strncmp(fieldCharSet->name, "utf8_", sizeof("utf8_")) == 0 &&
else if (memcmp(fieldCharSet->name, "utf8_", sizeof("utf8_")-1) == 0 &&
strcmp(fieldCharSet->name, "utf8_general_ci") != 0)
{
sprintf(stringBuildBuffer, "CLOB(%d)", max(fieldLength, 1)); // Number of bytes
......@@ -671,11 +673,15 @@ int ha_ibmdb2i::getFieldTypeMapping(Field* field,
return rtnCode;
}
// Check whether there is a character conversion available.
iconv_t temp;
int32 rc = getConversion(toDB2, fieldCharSet, db2Ccsid, temp);
if (unlikely(rc))
return rc;
if (db2Ccsid != 1208 &&
db2Ccsid != 13488)
{
// Check whether there is a character conversion available.
iconv_t temp;
int32 rc = getConversion(toDB2, fieldCharSet, db2Ccsid, temp);
if (unlikely(rc))
return rc;
}
sprintf(stringBuildBuffer, " CCSID %d ", db2Ccsid);
mapping.append(stringBuildBuffer);
......
......@@ -220,6 +220,7 @@ INTERN size_t myconv_dmap(myconv_t cd,
} else {
*pOut=dmapS2S[*pIn];
if (*pOut == 0x00) {
errno=EILSEQ; /* 116 */
*outBytesLeft-=(*inBytesLeft-inLen);
*inBytesLeft=inLen;
*outBuf=pOut;
......
This diff is collapsed.
......@@ -383,7 +383,15 @@ private:
int32 prepareWriteBufferForLobs();
uint32 adjustLobBuffersForRead();
bool lobFieldsRequested();
int convertFieldChars(enum_conversionDirection direction, uint16 fieldID, const char* input, char* output, size_t ilen, size_t olen, size_t* outDataLen);
int convertFieldChars(enum_conversionDirection direction,
uint16 fieldID,
const char* input,
char* output,
size_t ilen,
size_t olen,
size_t* outDataLen,
bool tacitErrors=FALSE,
size_t* substChars=NULL);
/**
Fast integer log2 function
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment