Commit f48dc5cc authored by Alexander Barkov's avatar Alexander Barkov

Moving the conversion code from String::well_formed_copy()

to my_convert_fix() - a new function in /strings.
parent c4b268ad
...@@ -382,6 +382,16 @@ typedef struct ...@@ -382,6 +382,16 @@ typedef struct
} MY_STRCOPY_STATUS; } MY_STRCOPY_STATUS;
/*
A structure to return the statistics of a Unicode string conversion.
It is filled in by my_convert_fix().
*/
typedef struct
{
/*
Holds m_source_end_pos (where reading of the source stopped) and
m_well_formed_error_pos (the first badly formed byte sequence, or NULL).
*/
MY_STRCOPY_STATUS m_native_copy_status;
/* The first source character that could not be converted, or NULL. */
const char *m_cannot_convert_error_pos;
} MY_STRCONV_STATUS;
/* See strings/CHARSET_INFO.txt about information on this structure */ /* See strings/CHARSET_INFO.txt about information on this structure */
struct my_charset_handler_st struct my_charset_handler_st
{ {
...@@ -852,10 +862,38 @@ const MY_CONTRACTIONS *my_charset_get_contractions(CHARSET_INFO *cs, ...@@ -852,10 +862,38 @@ const MY_CONTRACTIONS *my_charset_get_contractions(CHARSET_INFO *cs,
extern size_t my_vsnprintf_ex(CHARSET_INFO *cs, char *to, size_t n, extern size_t my_vsnprintf_ex(CHARSET_INFO *cs, char *to, size_t n,
const char* fmt, va_list ap); const char* fmt, va_list ap);
/*
Convert a string between two character sets.
Bad byte sequences as well as characters that cannot be
encoded in the destination character set are replaced with '?'.
*/
uint32 my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs, uint32 my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
const char *from, uint32 from_length, const char *from, uint32 from_length,
CHARSET_INFO *from_cs, uint *errors); CHARSET_INFO *from_cs, uint *errors);
/*
Convert a string between two character sets.
Bad byte sequences as well as characters that cannot be
encoded in the destination character set are replaced with '?'.
Not more than "nchars" characters are copied.
Returns the number of bytes written to "dst".
Conversion statistics are returned in "status", which is set as follows:
- status->m_native_copy_status.m_source_end_pos - to the position
between (src) and (src+src_length), where the function stopped reading
the source string.
- status->m_native_copy_status.m_well_formed_error_pos - to the position
between (src) and (src+src_length), where the first badly formed byte
sequence was found, or to NULL if the string was well formed in the
given range.
- status->m_cannot_convert_error_pos - to the position
between (src) and (src+src_length), where the first character that
cannot be represented in the destination character set was found,
or to NULL if all characters in the given range were successfully
converted.
*/
size_t my_convert_fix(CHARSET_INFO *dstcs, char *dst, size_t dst_length,
CHARSET_INFO *srccs, const char *src, size_t src_length,
size_t nchars, MY_STRCONV_STATUS *status);
#define _MY_U 01 /* Upper case */ #define _MY_U 01 /* Upper case */
#define _MY_L 02 /* Lower case */ #define _MY_L 02 /* Lower case */
#define _MY_NMR 04 /* Numeral (digit) */ #define _MY_NMR 04 /* Numeral (digit) */
......
...@@ -914,8 +914,6 @@ String_copier::well_formed_copy(CHARSET_INFO *to_cs, ...@@ -914,8 +914,6 @@ String_copier::well_formed_copy(CHARSET_INFO *to_cs,
const char *from, uint from_length, const char *from, uint from_length,
uint nchars) uint nchars)
{ {
uint res;
if ((to_cs == &my_charset_bin) || if ((to_cs == &my_charset_bin) ||
(from_cs == &my_charset_bin) || (from_cs == &my_charset_bin) ||
(to_cs == from_cs) || (to_cs == from_cs) ||
...@@ -923,73 +921,10 @@ String_copier::well_formed_copy(CHARSET_INFO *to_cs, ...@@ -923,73 +921,10 @@ String_copier::well_formed_copy(CHARSET_INFO *to_cs,
{ {
m_cannot_convert_error_pos= NULL; m_cannot_convert_error_pos= NULL;
return to_cs->cset->copy_fix(to_cs, to, to_length, from, from_length, return to_cs->cset->copy_fix(to_cs, to, to_length, from, from_length,
nchars, this); nchars, &m_native_copy_status);
}
else
{
int cnvres;
my_wc_t wc;
my_charset_conv_mb_wc mb_wc= from_cs->cset->mb_wc;
my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb;
const uchar *from_end= (const uchar*) from + from_length;
uchar *to_end= (uchar*) to + to_length;
char *to_start= to;
m_well_formed_error_pos= NULL;
m_cannot_convert_error_pos= NULL;
for ( ; nchars; nchars--)
{
const char *from_prev= from;
if ((cnvres= (*mb_wc)(from_cs, &wc, (uchar*) from, from_end)) > 0)
from+= cnvres;
else if (cnvres == MY_CS_ILSEQ)
{
if (!m_well_formed_error_pos)
m_well_formed_error_pos= from;
from++;
wc= '?';
}
else if (cnvres > MY_CS_TOOSMALL)
{
/*
A correct multibyte sequence detected
But it doesn't have Unicode mapping.
*/
if (!m_cannot_convert_error_pos)
m_cannot_convert_error_pos= from;
from+= (-cnvres);
wc= '?';
}
else
{
if ((uchar *) from >= from_end)
break; // End of line
// Incomplete byte sequence
if (!m_well_formed_error_pos)
m_well_formed_error_pos= from;
from++;
wc= '?';
}
outp:
if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0)
to+= cnvres;
else if (cnvres == MY_CS_ILUNI && wc != '?')
{
if (!m_cannot_convert_error_pos)
m_cannot_convert_error_pos= from_prev;
wc= '?';
goto outp;
}
else
{
from= from_prev;
break;
}
}
m_source_end_pos= from;
res= (uint) (to - to_start);
} }
return res; return my_convert_fix(to_cs, to, to_length, from_cs, from, from_length,
nchars, this);
} }
......
...@@ -43,14 +43,13 @@ inline uint32 copy_and_convert(char *to, uint32 to_length, ...@@ -43,14 +43,13 @@ inline uint32 copy_and_convert(char *to, uint32 to_length,
} }
class String_copier: private MY_STRCOPY_STATUS class String_copier: private MY_STRCONV_STATUS
{ {
const char *m_cannot_convert_error_pos;
public: public:
const char *source_end_pos() const const char *source_end_pos() const
{ return m_source_end_pos; } { return m_native_copy_status.m_source_end_pos; }
const char *well_formed_error_pos() const const char *well_formed_error_pos() const
{ return m_well_formed_error_pos; } { return m_native_copy_status.m_well_formed_error_pos; }
const char *cannot_convert_error_pos() const const char *cannot_convert_error_pos() const
{ return m_cannot_convert_error_pos; } { return m_cannot_convert_error_pos; }
const char *most_important_error_pos() const const char *most_important_error_pos() const
......
...@@ -1161,3 +1161,76 @@ my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs, ...@@ -1161,3 +1161,76 @@ my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
DBUG_ASSERT(FALSE); // Should never get to here DBUG_ASSERT(FALSE); // Should never get to here
return 0; // Make compiler happy return 0; // Make compiler happy
} }
/*
  Convert at most "nchars" characters of "from" (encoded in from_cs)
  into the buffer "to" (encoded in to_cs).

  Every source character is decoded with from_cs->cset->mb_wc and
  encoded with to_cs->cset->wc_mb. Anything that cannot be decoded or
  encoded is written as '?', and the first position of each kind of
  problem is remembered in "status":
  - m_native_copy_status.m_well_formed_error_pos: first bad or
    incomplete byte sequence in the source, or NULL.
  - m_cannot_convert_error_pos: first character that could not be
    converted, or NULL.
  - m_native_copy_status.m_source_end_pos: where reading stopped.

  Neither character set may be my_charset_bin.

  Returns the number of bytes written to "to".
*/
size_t
my_convert_fix(CHARSET_INFO *to_cs, char *to, size_t to_length,
               CHARSET_INFO *from_cs, const char *from, size_t from_length,
               size_t nchars, MY_STRCONV_STATUS *status)
{
  my_charset_conv_mb_wc decode= from_cs->cset->mb_wc;
  my_charset_conv_wc_mb encode= to_cs->cset->wc_mb;
  const uchar *src_end= (const uchar*) from + from_length;
  uchar *dst_end= (uchar*) to + to_length;
  char *dst_begin= to;
  my_wc_t wc;

  DBUG_ASSERT(to_cs != &my_charset_bin);
  DBUG_ASSERT(from_cs != &my_charset_bin);

  status->m_native_copy_status.m_well_formed_error_pos= NULL;
  status->m_cannot_convert_error_pos= NULL;

  while (nchars)
  {
    /* Remembered so the character can be "un-read" if it cannot be written */
    const char *char_start= from;
    int rc= (*decode)(from_cs, &wc, (uchar*) from, src_end);

    if (rc > 0)
    {
      from+= rc;
    }
    else if (rc == MY_CS_ILSEQ)
    {
      /* Badly formed byte: skip one byte and emit '?' */
      if (status->m_native_copy_status.m_well_formed_error_pos == NULL)
        status->m_native_copy_status.m_well_formed_error_pos= from;
      from++;
      wc= '?';
    }
    else if (rc > MY_CS_TOOSMALL)
    {
      /*
        A correct multibyte sequence of (-rc) bytes was detected,
        but it has no Unicode mapping.
      */
      if (status->m_cannot_convert_error_pos == NULL)
        status->m_cannot_convert_error_pos= from;
      from-= rc;
      wc= '?';
    }
    else if ((uchar *) from >= src_end)
    {
      break;                                    /* End of the source */
    }
    else
    {
      /* Incomplete byte sequence: skip one byte and emit '?' */
      if (status->m_native_copy_status.m_well_formed_error_pos == NULL)
        status->m_native_copy_status.m_well_formed_error_pos= from;
      from++;
      wc= '?';
    }

    rc= (*encode)(to_cs, wc, (uchar*) to, dst_end);
    if (rc <= 0 && rc == MY_CS_ILUNI && wc != '?')
    {
      /*
        The destination character set cannot represent this character:
        note where it happened and try once more with '?'.
      */
      if (status->m_cannot_convert_error_pos == NULL)
        status->m_cannot_convert_error_pos= char_start;
      wc= '?';
      rc= (*encode)(to_cs, wc, (uchar*) to, dst_end);
    }
    if (rc <= 0)
    {
      /* Nothing could be written: report the character as not consumed */
      from= char_start;
      break;
    }
    to+= rc;
    nchars--;
  }

  status->m_native_copy_status.m_source_end_pos= from;
  return to - dst_begin;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment