Fix for BUG#24415: Instance manager test im_daemon_life_cycle fails randomly.

The cause of im_daemon_life_cycle.imtest random failures was the following
behaviour of some implementations of LINUX threads: let's suppose that a
process has several threads (in LINUX threads, there is a separate process for
each thread). When the main process gets killed, the parent receives SIGCHLD
before all threads (child processes) die. In other words, the parent receives
SIGCHLD, when its child is not completely dead.

In terms of IM, that means that IM-angel receives SIGCHLD when IM-main is not dead
and still holds some resources. After receiving SIGCHLD, IM-angel restarts
IM-main, but IM-main failed to initialize, because previous instance (copy) of
IM-main still holds server socket (TCP-port).

Another problem here was that IM-angel restarted IM-main only if it was killed
by signal. If it exited with error, IM-angel thought it's intended / graceful
shutdown and exited itself.

So, when the second instance of IM-main failed to initialize, IM-angel thought
it's intended shutdown and quit.

The fix is
  1. to change IM-angel so that it restarts IM-main if it exited with error code;
  2. to change IM-main so that it returns proper exit code in case of failure.
parent 3c88372a
...@@ -10,7 +10,6 @@ ...@@ -10,7 +10,6 @@
# #
############################################################################## ##############################################################################
im_daemon_life_cycle : Bug#24415 see note: [19 Dec 23:17] Trudy Pelzer
ndb_load : Bug#17233 ndb_load : Bug#17233
user_limits : Bug#23921 random failure of user_limits.test user_limits : Bug#23921 random failure of user_limits.test
...@@ -210,10 +210,13 @@ void Listener_thread::run() ...@@ -210,10 +210,13 @@ void Listener_thread::run()
return; return;
err: err:
log_error("Listener: failed to initialize. Initiate shutdown...");
// we have to close the ip sockets in case of error // we have to close the ip sockets in case of error
for (i= 0; i < num_sockets; i++) for (i= 0; i < num_sockets; i++)
closesocket(sockets[i]); closesocket(sockets[i]);
thread_registry.set_error_status();
thread_registry.unregister_thread(&thread_info); thread_registry.unregister_thread(&thread_info);
thread_registry.request_shutdown(); thread_registry.request_shutdown();
my_thread_end(); my_thread_end();
......
...@@ -114,6 +114,9 @@ void stop_all(Guardian_thread *guardian, Thread_registry *registry) ...@@ -114,6 +114,9 @@ void stop_all(Guardian_thread *guardian, Thread_registry *registry)
pthread_cond_signal(&guardian->COND_guardian); pthread_cond_signal(&guardian->COND_guardian);
/* stop all threads */ /* stop all threads */
registry->deliver_shutdown(); registry->deliver_shutdown();
/* Set error status in the thread registry. */
registry->set_error_status();
} }
/* /*
...@@ -123,7 +126,7 @@ void stop_all(Guardian_thread *guardian, Thread_registry *registry) ...@@ -123,7 +126,7 @@ void stop_all(Guardian_thread *guardian, Thread_registry *registry)
architecture. architecture.
*/ */
void manager(const Options &options) int manager(const Options &options)
{ {
Thread_registry thread_registry; Thread_registry thread_registry;
/* /*
...@@ -145,10 +148,10 @@ void manager(const Options &options) ...@@ -145,10 +148,10 @@ void manager(const Options &options)
instance_map.guardian= &guardian_thread; instance_map.guardian= &guardian_thread;
if (instance_map.init() || user_map.init()) if (instance_map.init() || user_map.init())
return; return 1;
if (user_map.load(options.password_file_name)) if (user_map.load(options.password_file_name))
return; return 1;
/* write Instance Manager pid file */ /* write Instance Manager pid file */
...@@ -157,7 +160,7 @@ void manager(const Options &options) ...@@ -157,7 +160,7 @@ void manager(const Options &options)
(int) manager_pid); (int) manager_pid);
if (create_pid_file(options.pid_file_name, manager_pid)) if (create_pid_file(options.pid_file_name, manager_pid))
return; return 1;
/* /*
Initialize signals and alarm-infrastructure. Initialize signals and alarm-infrastructure.
...@@ -301,5 +304,6 @@ err: ...@@ -301,5 +304,6 @@ err:
end_thr_alarm(1); end_thr_alarm(1);
/* don't pthread_exit to kill all threads who did not shut down in time */ /* don't pthread_exit to kill all threads who did not shut down in time */
#endif #endif
}
return thread_registry.get_error_status() ? 1 : 0;
}
...@@ -18,7 +18,7 @@ ...@@ -18,7 +18,7 @@
struct Options; struct Options;
void manager(const Options &options); int manager(const Options &options);
int create_pid_file(const char *pid_file_name, int pid); int create_pid_file(const char *pid_file_name, int pid);
......
...@@ -110,8 +110,7 @@ int main(int argc, char *argv[]) ...@@ -110,8 +110,7 @@ int main(int argc, char *argv[])
else else
#endif #endif
manager(options); return_value= manager(options);
return_value= 0;
err: err:
options.cleanup(); options.cleanup();
...@@ -254,26 +253,23 @@ static void daemonize(const char *log_file_name) ...@@ -254,26 +253,23 @@ static void daemonize(const char *log_file_name)
enum { CHILD_OK= 0, CHILD_NEED_RESPAWN, CHILD_EXIT_ANGEL }; enum { CHILD_OK= 0, CHILD_NEED_RESPAWN, CHILD_EXIT_ANGEL };
static volatile sig_atomic_t child_status= CHILD_OK; static volatile sig_atomic_t child_status= CHILD_OK;
static volatile sig_atomic_t child_exit_code= 0;
/* /*
Signal handler for SIGCHLD: reap child, analyze child exit status, and set Signal handler for SIGCHLD: reap child, analyze child exit code, and set
child_status appropriately. child_status appropriately.
*/ */
void reap_child(int __attribute__((unused)) signo) void reap_child(int __attribute__((unused)) signo)
{ {
int child_exit_status; /* NOTE: As we have only one child, no need to cycle waitpid(). */
/* As we have only one child, no need to cycle waitpid */
if (waitpid(0, &child_exit_status, WNOHANG) > 0) int exit_code;
if (waitpid(0, &exit_code, WNOHANG) > 0)
{ {
if (WIFSIGNALED(child_exit_status)) child_exit_code= exit_code;
child_status= CHILD_NEED_RESPAWN; child_status= exit_code ? CHILD_NEED_RESPAWN : CHILD_EXIT_ANGEL;
else
/*
As reap_child is not called for SIGSTOP, we should be here only
if the child exited normally.
*/
child_status= CHILD_EXIT_ANGEL;
} }
} }
...@@ -353,7 +349,9 @@ spawn: ...@@ -353,7 +349,9 @@ spawn:
else if (child_status == CHILD_NEED_RESPAWN) else if (child_status == CHILD_NEED_RESPAWN)
{ {
child_status= CHILD_OK; child_status= CHILD_OK;
log_error("angel(): mysqlmanager exited abnormally: respawning..."); log_error("angel(): mysqlmanager exited abnormally (exit code: %d):"
"respawning...",
(int) child_exit_code);
sleep(1); /* don't respawn too fast */ sleep(1); /* don't respawn too fast */
goto spawn; goto spawn;
} }
......
...@@ -53,6 +53,7 @@ Thread_info::Thread_info(pthread_t thread_id_arg) : ...@@ -53,6 +53,7 @@ Thread_info::Thread_info(pthread_t thread_id_arg) :
Thread_registry::Thread_registry() : Thread_registry::Thread_registry() :
shutdown_in_progress(false) shutdown_in_progress(false)
,sigwait_thread_pid(pthread_self()) ,sigwait_thread_pid(pthread_self())
,error_status(FALSE)
{ {
pthread_mutex_init(&LOCK_thread_registry, 0); pthread_mutex_init(&LOCK_thread_registry, 0);
pthread_cond_init(&COND_thread_registry_is_empty, 0); pthread_cond_init(&COND_thread_registry_is_empty, 0);
...@@ -243,3 +244,23 @@ void Thread_registry::request_shutdown() ...@@ -243,3 +244,23 @@ void Thread_registry::request_shutdown()
{ {
pthread_kill(sigwait_thread_pid, SIGTERM); pthread_kill(sigwait_thread_pid, SIGTERM);
} }
int Thread_registry::get_error_status()
{
int ret_error_status;
pthread_mutex_lock(&LOCK_thread_registry);
ret_error_status= error_status;
pthread_mutex_unlock(&LOCK_thread_registry);
return ret_error_status;
}
void Thread_registry::set_error_status()
{
pthread_mutex_lock(&LOCK_thread_registry);
error_status= TRUE;
pthread_mutex_unlock(&LOCK_thread_registry);
}
...@@ -97,12 +97,16 @@ public: ...@@ -97,12 +97,16 @@ public:
pthread_mutex_t *mutex); pthread_mutex_t *mutex);
int cond_timedwait(Thread_info *info, pthread_cond_t *cond, int cond_timedwait(Thread_info *info, pthread_cond_t *cond,
pthread_mutex_t *mutex, struct timespec *wait_time); pthread_mutex_t *mutex, struct timespec *wait_time);
int get_error_status();
void set_error_status();
private: private:
Thread_info head; Thread_info head;
bool shutdown_in_progress; bool shutdown_in_progress;
pthread_mutex_t LOCK_thread_registry; pthread_mutex_t LOCK_thread_registry;
pthread_cond_t COND_thread_registry_is_empty; pthread_cond_t COND_thread_registry_is_empty;
pthread_t sigwait_thread_pid; pthread_t sigwait_thread_pid;
bool error_status;
}; };
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment