From 02f8c163e65139ef3d2d7a9967611b30ac6c0f72 Mon Sep 17 00:00:00 2001
From: unknown <monty@mashka.mysql.fi>
Date: Sat, 24 Aug 2002 05:44:16 +0300
Subject: [PATCH] Give better error from reconnect(). Fixed hang in
 start_slave_threads() when thread dies quickly.

Docs/manual.texi:
  Changelog
client/mysqltest.c:
  Indentation cleanup
  More DBUG info
libmysql/libmysql.c:
  More DBUG info
  Give better error from reconnect()
mysql-test/r/rpl_rotate_logs.result:
  Update results
mysql-test/t/rpl_log_pos.test:
  Fix for fast machines
mysql-test/t/rpl_rotate_logs.test:
  Updated test to be more portable
scripts/mysql_zap.sh:
  Update for MacOSX
sql/mini_client.cc:
  Better error messages from reconnect.
  Indentation cleanups
sql/slave.cc:
  Fixed hang in start_slave_threads() when thread dies quickly.
sql/slave.h:
  Fixed hang in start_slave_threads() when thread dies quickly.
---
 Docs/manual.texi                    |  2 +
 client/mysqltest.c                  | 21 ++++----
 libmysql/libmysql.c                 | 29 ++++++----
 mysql-test/r/rpl_rotate_logs.result |  2 +-
 mysql-test/t/rpl_log_pos.test       |  3 +-
 mysql-test/t/rpl_rotate_logs.test   |  6 ++-
 scripts/mysql_zap.sh                |  2 +-
 sql/mini_client.cc                  | 13 ++---
 sql/slave.cc                        | 82 +++++++++++++++++------------
 sql/slave.h                         | 14 +++--
 10 files changed, 102 insertions(+), 72 deletions(-)

diff --git a/Docs/manual.texi b/Docs/manual.texi
index 543370f906..2209568ea4 100644
--- a/Docs/manual.texi
+++ b/Docs/manual.texi
@@ -50380,6 +50380,8 @@ Fixed some problems with @code{CREATE TABLE ... SELECT function()}.
 @code{mysqld} now has the option @code{--temp-pool} enabled by default as this
 gives better performance with some operating systems.
 @item
+Fixed hang in @code{CHANGE MASTER TO} if the slave thread died very quickly.
+@item
 Big cleanup in replication code (less logging, better error messages, etc..)
 @item
 If the @code{--code-file} option is specified, the server calls
diff --git a/client/mysqltest.c b/client/mysqltest.c
index cc253e21be..987a614a25 100644
--- a/client/mysqltest.c
+++ b/client/mysqltest.c
@@ -1042,11 +1042,11 @@ int do_let(struct st_query* q)
   if (!*p)
     die("Missing variable name in let\n");
   var_name = p;
-  while(*p && (*p != '=' || isspace(*p)))
+  while (*p && (*p != '=' || isspace(*p)))
     p++;
   var_name_end = p;
   if (*p == '=') p++;
-  while(*p && isspace(*p))
+  while (*p && isspace(*p))
     p++;
   var_val_start = p;
   return var_set(var_name, var_name_end, var_val_start, q->end);
@@ -1054,9 +1054,10 @@ int do_let(struct st_query* q)
 
 int do_rpl_probe(struct st_query* q __attribute__((unused)))
 {
+  DBUG_ENTER("do_rpl_probe");
   if (mysql_rpl_probe(&cur_con->mysql))
-    die("Failed in mysql_rpl_probe(): %s", mysql_error(&cur_con->mysql));
-  return 0;
+    die("Failed in mysql_rpl_probe(): '%s'", mysql_error(&cur_con->mysql));
+  DBUG_RETURN(0);
 }
 
 int do_enable_rpl_parse(struct st_query* q __attribute__((unused)))
@@ -1077,7 +1078,7 @@ int do_sleep(struct st_query* q, my_bool real_sleep)
   char* p=q->first_argument;
   struct timeval t;
   int dec_mul = 1000000;
-  while(*p && isspace(*p)) p++;
+  while (*p && isspace(*p)) p++;
   if (!*p)
     die("Missing argument in sleep\n");
   t.tv_usec = 0;
@@ -1097,7 +1098,7 @@ int do_sleep(struct st_query* q, my_bool real_sleep)
   else
   {
     t.tv_sec = atoi(p);
-    while(*p && *p != '.' && !isspace(*p))
+    while (*p && *p != '.' && !isspace(*p))
       p++;
     if (*p == '.')
     {
@@ -1308,7 +1309,7 @@ int select_connection(struct st_query* q)
   if (!*p)
     die("Missing connection name in connect\n");
   name = p;
-  while(*p && !isspace(*p))
+  while (*p && !isspace(*p))
     p++;
   *p = 0;
 
@@ -1334,7 +1335,7 @@ int close_connection(struct st_query* q)
   if (!*p)
     die("Missing connection name in connect\n");
   name = p;
-  while(*p && !isspace(*p))
+  while (*p && !isspace(*p))
     p++;
   *p = 0;
 
@@ -1790,7 +1791,7 @@ int read_query(struct st_query** q_ptr)
       }
     }
 
-    while(*p && isspace(*p)) p++ ;
+    while (*p && isspace(*p)) p++ ;
     if (*p == '@')
     {
       p++;
@@ -2503,7 +2504,7 @@ int main(int argc, char** argv)
     if (!processed)
     {
       current_line_inc = 0;
-      switch(q->type) {
+      switch (q->type) {
       case Q_WHILE: do_while(q); break;
       case Q_END_BLOCK: do_done(q); break;
       default: current_line_inc = 1; break;
diff --git a/libmysql/libmysql.c b/libmysql/libmysql.c
index 32148baf1d..40b3fb4cc6 100644
--- a/libmysql/libmysql.c
+++ b/libmysql/libmysql.c
@@ -1160,14 +1160,15 @@ static void expand_error(MYSQL* mysql, int error)
 static int get_master(MYSQL* mysql, MYSQL_RES* res, MYSQL_ROW row)
 {
   MYSQL* master;
+  DBUG_ENTER("get_master");
   if (mysql_num_fields(res) < 3)
-    return 1; /* safety */
+    DBUG_RETURN(1); /* safety */
 
   /* use the same username and password as the original connection */
   if (!(master = spawn_init(mysql, row[0], atoi(row[2]), 0, 0)))
-    return 1;
+    DBUG_RETURN(1);
   mysql->master = master;
-  return 0;
+  DBUG_RETURN(0);
 }
 
 
@@ -1183,18 +1184,19 @@ static int get_slaves_from_master(MYSQL* mysql)
   int error = 1;
   int has_auth_info;
   int port_ind;
+  DBUG_ENTER("get_slaves_from_master");
 
   if (!mysql->net.vio && !mysql_real_connect(mysql,0,0,0,0,0,0,0))
   {
     expand_error(mysql, CR_PROBE_MASTER_CONNECT);
-    return 1;
+    DBUG_RETURN(1);
   }
 
   if (mysql_query(mysql, "SHOW SLAVE HOSTS") ||
       !(res = mysql_store_result(mysql)))
   {
     expand_error(mysql, CR_PROBE_SLAVE_HOSTS);
-    return 1;
+    DBUG_RETURN(1);
   }
 
   switch (mysql_num_fields(res)) {
@@ -1238,15 +1240,17 @@ static int get_slaves_from_master(MYSQL* mysql)
 err:
   if (res)
    mysql_free_result(res);
-  return error;
+  DBUG_RETURN(error);
 }
 
 
 int STDCALL mysql_rpl_probe(MYSQL* mysql)
 {
-  MYSQL_RES* res = 0;
+  MYSQL_RES *res= 0;
   MYSQL_ROW row;
   int error = 1;
+  DBUG_ENTER("mysql_rpl_probe");
+
   /*
     First determine the replication role of the server we connected to
     the most reliable way to do this is to run SHOW SLAVE STATUS and see
@@ -1259,7 +1263,7 @@ int STDCALL mysql_rpl_probe(MYSQL* mysql)
       !(res = mysql_store_result(mysql)))
   {
     expand_error(mysql, CR_PROBE_SLAVE_STATUS);
-    return 1;
+    DBUG_RETURN(1);
   }
 
   row= mysql_fetch_row(res);
@@ -1284,7 +1288,7 @@ int STDCALL mysql_rpl_probe(MYSQL* mysql)
 err:
   if (res)
     mysql_free_result(res);
-  return error;
+  DBUG_RETURN(error);
 }
 
 
@@ -1979,7 +1983,11 @@ static my_bool mysql_reconnect(MYSQL *mysql)
   if (!mysql_real_connect(&tmp_mysql,mysql->host,mysql->user,mysql->passwd,
 			  mysql->db, mysql->port, mysql->unix_socket,
 			  mysql->client_flag))
+  {
+    mysql->net.last_errno= tmp_mysql.net.last_errno;
+    strmov(mysql->net.last_error, tmp_mysql.net.last_error);
     DBUG_RETURN(1);
+  }
   tmp_mysql.free_me=mysql->free_me;
   mysql->free_me=0;
   mysql_close(mysql);
@@ -2060,7 +2068,7 @@ mysql_close(MYSQL *mysql)
       mysql->status=MYSQL_STATUS_READY; /* Force command */
       mysql->reconnect=0;
       simple_command(mysql,COM_QUIT,NullS,0,1);
-      end_server(mysql);
+      end_server(mysql);			/* Sets mysql->net.vio= 0 */
     }
     my_free((gptr) mysql->host_info,MYF(MY_ALLOW_ZERO_PTR));
     my_free(mysql->user,MYF(MY_ALLOW_ZERO_PTR));
@@ -2082,7 +2090,6 @@ mysql_close(MYSQL *mysql)
     /* Clear pointers for better safety */
     mysql->host_info=mysql->user=mysql->passwd=mysql->db=0;
     bzero((char*) &mysql->options,sizeof(mysql->options));
-    mysql->net.vio = 0;
 
     /* free/close slave list */
     if (mysql->rpl_pivot)
diff --git a/mysql-test/r/rpl_rotate_logs.result b/mysql-test/r/rpl_rotate_logs.result
index 01e6d2c3a4..d440e157ed 100644
--- a/mysql-test/r/rpl_rotate_logs.result
+++ b/mysql-test/r/rpl_rotate_logs.result
@@ -76,7 +76,7 @@ a
 testing temporary tables part 2
 show slave status;
 Master_Host	Master_User	Master_Port	Connect_retry	Master_Log_File	Read_Master_Log_Pos	Relay_Log_File	Relay_Log_Pos	Relay_Master_Log_File	Slave_IO_Running	Slave_SQL_Running	Replicate_do_db	Replicate_ignore_db	Last_errno	Last_error	Skip_counter	Exec_master_log_pos	Relay_log_space
-127.0.0.1	root	MASTER_PORT	60	master-bin.006	838	slave-relay-bin.004	1816	master-bin.006	Yes	Yes			0		0	838	1816
+127.0.0.1	root	MASTER_PORT	60	master-bin.006	838	slave-relay-bin.001	8034	master-bin.006	Yes	Yes			0		0	838	8034
 lock tables t3 read;
 select count(*) from t3 where n >= 4;
 count(*)
diff --git a/mysql-test/t/rpl_log_pos.test b/mysql-test/t/rpl_log_pos.test
index cce52dc5da..f585fa233c 100644
--- a/mysql-test/t/rpl_log_pos.test
+++ b/mysql-test/t/rpl_log_pos.test
@@ -9,13 +9,14 @@ sync_with_master;
 --replace_result 3306 MASTER_PORT 9306 MASTER_PORT 3334 MASTER_PORT 3336 MASTER_PORT
 show slave status;
 change master to master_log_pos=73;
+sleep 5;
 slave stop;
 
 change master to master_log_pos=73;
 --replace_result 3306 MASTER_PORT 9306 MASTER_PORT 3334 MASTER_PORT 3336 MASTER_PORT
 show slave status;
 slave start;
-sleep 2;
+sleep 5;
 --replace_result 3306 MASTER_PORT 9306 MASTER_PORT 3334 MASTER_PORT 3336 MASTER_PORT
 show slave status;
 change master to master_log_pos=173;
diff --git a/mysql-test/t/rpl_rotate_logs.test b/mysql-test/t/rpl_rotate_logs.test
index fa0c38ae99..cea2f9008d 100644
--- a/mysql-test/t/rpl_rotate_logs.test
+++ b/mysql-test/t/rpl_rotate_logs.test
@@ -14,9 +14,11 @@ connect (slave,localhost,root,,test,0,slave.sock);
 system cat /dev/null > var/slave-data/master.info;
 system chmod 000 var/slave-data/master.info;
 connection slave;
-!slave start;
+--error 1201
+slave start;
 system chmod 600 var/slave-data/master.info;
-!slave start;
+--error 1201
+slave start;
 --replace_result 3306 MASTER_PORT 9306 MASTER_PORT 3334 MASTER_PORT 3336 MASTER_PORT
 !eval change master to master_host='127.0.0.1',master_port=$MASTER_MYPORT,
  master_user='root'; 
diff --git a/scripts/mysql_zap.sh b/scripts/mysql_zap.sh
index 312d15e34d..f485d16428 100644
--- a/scripts/mysql_zap.sh
+++ b/scripts/mysql_zap.sh
@@ -12,7 +12,7 @@ $opt_f= 0;
 $opt_t= 0;
 $opt_a = "";
 
-$BSD = -f '/vmunix' || $ENV{"OS"} eq "SunOS4";
+$BSD = -f '/vmunix' || $ENV{"OS"} eq "SunOS4" || $^O eq 'darwin';
 $LINUX = $^O eq 'linux';
 $pscmd = $BSD ? "/bin/ps -auxww" : $LINUX ? "/bin/ps axuw" : "/bin/ps -ef";
 
diff --git a/sql/mini_client.cc b/sql/mini_client.cc
index 743d522e4b..5bd88e9b09 100644
--- a/sql/mini_client.cc
+++ b/sql/mini_client.cc
@@ -414,10 +414,8 @@ my_bool mc_mysql_reconnect(MYSQL *mysql)
 			mysql->db, mysql->port, mysql->unix_socket,
 			mysql->client_flag, mysql->net.read_timeout))
   {
-#ifdef NOT_USED
-    mysql->net.last_errno=CR_RECONNECT_FAILED;
-    strmov(mysql->net.last_error, ER(mysql->net.last_errno));
-#endif
+    mysql->net.last_errno= tmp_mysql.net.last_errno;
+    strmov(mysql->net.last_error, tmp_mysql.net.last_error);
     DBUG_RETURN(1);
   }
   tmp_mysql.free_me=mysql->free_me;
@@ -888,7 +886,6 @@ mc_mysql_close(MYSQL *mysql)
     /* Clear pointers for better safety */
     mysql->host_info=mysql->user=mysql->passwd=mysql->db=0;
     bzero((char*) &mysql->options,sizeof(mysql->options));
-    mysql->net.vio = 0;
 #ifdef HAVE_OPENSSL
     mysql_ssl_clear(mysql);
 #endif /* HAVE_OPENSSL */
@@ -976,13 +973,13 @@ mc_unpack_fields(MYSQL_DATA *data,MEM_ROOT *alloc,uint fields,
   DBUG_RETURN(result);
 }
 
-int 
-mc_mysql_send_query(MYSQL* mysql, const char* query, uint length)
+int mc_mysql_send_query(MYSQL* mysql, const char* query, uint length)
 {
   return mc_simple_command(mysql, COM_QUERY, query, length, 1);
 }
 
-int  mc_mysql_read_query_result(MYSQL *mysql)
+
+int mc_mysql_read_query_result(MYSQL *mysql)
 {
   uchar *pos;
   ulong field_count;
diff --git a/sql/slave.cc b/sql/slave.cc
index 93a5c6171d..27e9030c00 100644
--- a/sql/slave.cc
+++ b/sql/slave.cc
@@ -443,14 +443,18 @@ int terminate_slave_thread(THD* thd, pthread_mutex_t* term_lock,
 }
 
 
-int start_slave_thread(pthread_handler h_func, pthread_mutex_t* start_lock,
+int start_slave_thread(pthread_handler h_func, pthread_mutex_t *start_lock,
 		       pthread_mutex_t *cond_lock,
-		       pthread_cond_t* start_cond,
-		       volatile bool* slave_running,
+		       pthread_cond_t *start_cond,
+		       volatile bool *slave_running,
+		       volatile ulong *slave_run_id,
 		       MASTER_INFO* mi)
 {
   pthread_t th;
+  ulong start_id;
   DBUG_ASSERT(mi->inited);
+  DBUG_ENTER("start_slave_thread");
+
   if (start_lock)
     pthread_mutex_lock(start_lock);
   if (!server_id)
@@ -460,7 +464,7 @@ int start_slave_thread(pthread_handler h_func, pthread_mutex_t* start_lock,
     if (start_lock)
       pthread_mutex_unlock(start_lock);
     sql_print_error("Server id not set, will not start slave");
-    return ER_BAD_SLAVE;
+    DBUG_RETURN(ER_BAD_SLAVE);
   }
   
   if (*slave_running)
@@ -469,39 +473,36 @@ int start_slave_thread(pthread_handler h_func, pthread_mutex_t* start_lock,
       pthread_cond_broadcast(start_cond);
     if (start_lock)
       pthread_mutex_unlock(start_lock);
-    return ER_SLAVE_MUST_STOP;
+    DBUG_RETURN(ER_SLAVE_MUST_STOP);
   }
+  start_id= *slave_run_id;
+  DBUG_PRINT("info",("Creating new slave thread"));
   if (pthread_create(&th, &connection_attrib, h_func, (void*)mi))
   {
     if (start_lock)
       pthread_mutex_unlock(start_lock);
-    return ER_SLAVE_THREAD;
+    DBUG_RETURN(ER_SLAVE_THREAD);
   }
   if (start_cond && cond_lock)
   {
     THD* thd = current_thd;
-    while (!*slave_running)
+    while (start_id == *slave_run_id)
     {
+      DBUG_PRINT("sleep",("Waiting for slave thread to start"));
       const char* old_msg = thd->enter_cond(start_cond,cond_lock,
 					    "Waiting for slave thread to start");
       pthread_cond_wait(start_cond,cond_lock);
       thd->exit_cond(old_msg);
-      /*
-	TODO: in a very rare case of init_slave_thread failing, it is
-	possible that we can get stuck here since slave_running will not
-	be set. We need to change slave_running to int and have -1 as
-	error code.
-      */
       if (thd->killed)
       {
 	pthread_mutex_unlock(cond_lock);
-	return ER_SERVER_SHUTDOWN;
+	DBUG_RETURN(ER_SERVER_SHUTDOWN);
       }
     }
   }
   if (start_lock)
     pthread_mutex_unlock(start_lock);
-  return 0;
+  DBUG_RETURN(0);
 }
 
 
@@ -535,13 +536,15 @@ int start_slave_threads(bool need_slave_mutex, bool wait_for_start,
 
   if (thread_mask & SLAVE_IO)
     error=start_slave_thread(handle_slave_io,lock_io,lock_cond_io,
-			     cond_io,&mi->slave_running,
+			     cond_io,
+			     &mi->slave_running, &mi->slave_run_id,
 			     mi);
   if (!error && (thread_mask & SLAVE_SQL))
   {
     error=start_slave_thread(handle_slave_sql,lock_sql,lock_cond_sql,
 			     cond_sql,
-			     &mi->rli.slave_running,mi);
+			     &mi->rli.slave_running, &mi->rli.slave_run_id,
+			     mi);
     if (error)
       terminate_slave_threads(mi, thread_mask & SLAVE_IO, 0);
   }
@@ -1807,23 +1810,30 @@ This may also be a network problem, or just a bug in the master or slave code.\
 /* slave I/O thread */
 pthread_handler_decl(handle_slave_io,arg)
 {
+  THD *thd; // needs to be first for thread_stack
+  MYSQL *mysql;
+  MASTER_INFO *mi = (MASTER_INFO*)arg; 
+  char llbuff[22];
+  uint retry_count;
+  
+  // needs to call my_thread_init(), otherwise we get a coredump in DBUG_ stuff
+  my_thread_init();
+
 #ifndef DBUG_OFF
 slave_begin:  
 #endif  
-  THD *thd; // needs to be first for thread_stack
-  MYSQL *mysql = NULL ;
-  MASTER_INFO* mi = (MASTER_INFO*)arg; 
-  char llbuff[22];
-  uint retry_count= 0;
   DBUG_ASSERT(mi->inited);
-  
+  mysql= NULL ;
+  retry_count= 0;
+
   pthread_mutex_lock(&mi->run_lock);
+  /* Inform waiting threads that slave has started */
+  mi->slave_run_id++;
+
 #ifndef DBUG_OFF  
   mi->events_till_abort = abort_slave_event_count;
 #endif  
   
-  // needs to call my_thread_init(), otherwise we get a coredump in DBUG_ stuff
-  my_thread_init();
   thd= new THD; // note that contructor of THD uses DBUG_ !
   DBUG_ENTER("handle_slave_io");
   THD_CHECK_SENTRY(thd);
@@ -2071,26 +2081,32 @@ err:
 
 pthread_handler_decl(handle_slave_sql,arg)
 {
-#ifndef DBUG_OFF
-slave_begin:  
-#endif  
   THD *thd;			/* needs to be first for thread_stack */
   char llbuff[22],llbuff1[22];
   RELAY_LOG_INFO* rli = &((MASTER_INFO*)arg)->rli; 
-  const char* errmsg=0;
+  const char *errmsg;
+
+  // needs to call my_thread_init(), otherwise we get a coredump in DBUG_ stuff
+  my_thread_init();
+
+#ifndef DBUG_OFF
+slave_begin:  
+#endif  
+
   DBUG_ASSERT(rli->inited);
   pthread_mutex_lock(&rli->run_lock);
   DBUG_ASSERT(!rli->slave_running);
+  errmsg= 0;
 #ifndef DBUG_OFF  
   rli->events_till_abort = abort_slave_event_count;
 #endif  
-  
-  // needs to call my_thread_init(), otherwise we get a coredump in DBUG_ stuff
-  my_thread_init();
-  thd = new THD; // note that contructor of THD uses DBUG_ !
   DBUG_ENTER("handle_slave_sql");
 
+  thd = new THD; // note that constructor of THD uses DBUG_ !
   THD_CHECK_SENTRY(thd);
+  /* Inform waiting threads that slave has started */
+  rli->slave_run_id++;
+
   pthread_detach_this_thread();
   if (init_slave_thread(thd, SLAVE_THD_SQL))
   {
diff --git a/sql/slave.h b/sql/slave.h
index 4be0178517..b527aceb43 100644
--- a/sql/slave.h
+++ b/sql/slave.h
@@ -154,6 +154,7 @@ typedef struct st_relay_log_info
   */
   volatile uint32 slave_skip_counter;
   volatile ulong abort_pos_wait;	/* Incremented on change master */
+  volatile ulong slave_run_id;		/* Incremented on slave start */
   pthread_mutex_t log_space_lock;
   pthread_cond_t log_space_cond;
   THD * sql_thd;
@@ -171,8 +172,8 @@ typedef struct st_relay_log_info
   
   st_relay_log_info()
   :info_fd(-1),cur_log_fd(-1), cur_log_old_open_count(0), abort_pos_wait(0),
-   inited(0), abort_slave(0), slave_running(0), log_pos_current(0),
-   skip_log_purge(0)
+   slave_run_id(0), inited(0), abort_slave(0), slave_running(0),
+   log_pos_current(0), skip_log_purge(0)
   {
     relay_log_name[0] = master_log_name[0] = 0;
     bzero(&info_file,sizeof(info_file));
@@ -283,11 +284,13 @@ typedef struct st_master_info
   bool inited;
   bool old_format;			/* master binlog is in 3.23 format */
   volatile bool abort_slave, slave_running;
+  volatile ulong slave_run_id;
   bool ignore_stop_event;
   
   
-  st_master_info():fd(-1), io_thd(0), inited(0), old_format(0),abort_slave(0),
-		   slave_running(0)
+  st_master_info()
+    :fd(-1), io_thd(0), inited(0), old_format(0),abort_slave(0),
+     slave_running(0), slave_run_id(0)
   {
     host[0] = 0; user[0] = 0; password[0] = 0;
     bzero(&file, sizeof(file));
@@ -360,7 +363,8 @@ int start_slave_threads(bool need_slave_mutex, bool wait_for_start,
 int start_slave_thread(pthread_handler h_func, pthread_mutex_t* start_lock,
 		       pthread_mutex_t *cond_lock,
 		       pthread_cond_t* start_cond,
-		       volatile bool* slave_running,
+		       volatile bool *slave_running,
+		       volatile ulong *slave_run_id,
 		       MASTER_INFO* mi);
 
 /* If fd is -1, dump to NET */
-- 
2.30.9