2 Monitor status of quagga daemons and restart if necessary.
4 Copyright (C) 2004 Andrew J. Schorr
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
26 #include <lib/version.h>
33 #define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
36 /* Macros to help randomize timers. */
37 #define JITTER(X) ((random() % ((X)+1))-((X)/2))
38 #define FUZZY(X) ((X)+JITTER((X)/20))
40 #define DEFAULT_PERIOD 5
41 #define DEFAULT_TIMEOUT 10
42 #define DEFAULT_RESTART_TIMEOUT 20
43 #define DEFAULT_LOGLEVEL LOG_INFO
44 #define DEFAULT_MIN_RESTART 60
45 #define DEFAULT_MAX_RESTART 600
46 #ifdef PATH_WATCHQUAGGA_PID
47 #define DEFAULT_PIDFILE PATH_WATCHQUAGGA_PID
49 #define DEFAULT_PIDFILE STATEDIR "/watchquagga.pid"
52 #define VTYDIR DAEMON_VTY_DIR
54 #define VTYDIR STATEDIR
57 #define PING_TOKEN "PING"
59 /* Needs to be global, referenced somewhere inside libzebra. */
60 struct thread_master *master;
66 MODE_SEPARATE_RESTART,
67 MODE_PHASED_ZEBRA_RESTART,
68 MODE_PHASED_ALL_RESTART
71 static const char *mode_str[] =
75 "individual daemon restart",
76 "phased zebra restart",
77 "phased global restart for any failure",
85 PHASE_ZEBRA_RESTART_PENDING,
86 PHASE_WAITING_ZEBRA_UP
89 static const char *phase_str[] =
93 "Waiting for other daemons to come down",
94 "Zebra restart job running",
95 "Waiting for zebra to come up",
99 #define PHASE_TIMEOUT (3*gs.restart_timeout)
108 struct thread *t_kill;
112 static struct global_state
115 restart_phase_t phase;
116 struct thread *t_phase_hanging;
120 long restart_timeout;
121 long min_restart_interval;
122 long max_restart_interval;
124 struct daemon *daemons;
125 const char *restart_command;
126 const char *start_command;
127 const char *stop_command;
128 struct restart_info restart;
129 int unresponsive_restart;
131 struct daemon *special; /* points to zebra when doing phased restart */
134 int numdown; /* # of daemons that are not UP or UNRESPONSIVE */
136 .mode = MODE_MONITOR,
139 .period = 1000*DEFAULT_PERIOD,
140 .timeout = DEFAULT_TIMEOUT,
141 .restart_timeout = DEFAULT_RESTART_TIMEOUT,
142 .loglevel = DEFAULT_LOGLEVEL,
143 .min_restart_interval = DEFAULT_MIN_RESTART,
144 .max_restart_interval = DEFAULT_MAX_RESTART,
158 (((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))
160 static const char *state_str[] =
171 daemon_state_t state;
173 struct timeval echo_sent;
175 struct thread *t_wakeup;
176 struct thread *t_read;
177 struct thread *t_write;
179 struct restart_info restart;
182 static const struct option longopts[] =
184 { "daemon", no_argument, NULL, 'd'},
185 { "statedir", required_argument, NULL, 'S'},
186 { "no-echo", no_argument, NULL, 'e'},
187 { "loglevel", required_argument, NULL, 'l'},
188 { "interval", required_argument, NULL, 'i'},
189 { "timeout", required_argument, NULL, 't'},
190 { "restart-timeout", required_argument, NULL, 'T'},
191 { "restart", required_argument, NULL, 'r'},
192 { "start-command", required_argument, NULL, 's'},
193 { "kill-command", required_argument, NULL, 'k'},
194 { "restart-all", required_argument, NULL, 'R'},
195 { "all-restart", no_argument, NULL, 'a'},
196 { "always-all-restart", no_argument, NULL, 'A'},
197 { "unresponsive-restart", no_argument, NULL, 'z'},
198 { "min-restart-interval", required_argument, NULL, 'm'},
199 { "max-restart-interval", required_argument, NULL, 'M'},
200 { "pid-file", required_argument, NULL, 'p'},
201 { "blank-string", required_argument, NULL, 'b'},
202 { "help", no_argument, NULL, 'h'},
203 { "version", no_argument, NULL, 'v'},
207 static int try_connect(struct daemon *dmn);
208 static int wakeup_send_echo(struct thread *t_wakeup);
209 static void try_restart(struct daemon *dmn);
210 static void phase_check(void);
213 usage(const char *progname, int status)
216 fprintf(stderr, "Try `%s --help' for more information.\n", progname);
219 printf("Usage : %s [OPTION...] <daemon name> ...\n\n\
220 Watchdog program to monitor status of quagga daemons and try to restart\n\
221 them if they are down or unresponsive. It determines whether a daemon is\n\
222 up based on whether it can connect to the daemon's vty unix stream socket.\n\
223 It then repeatedly sends echo commands over that socket to determine whether\n\
224 the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
225 on the socket connection and know immediately that the daemon is down.\n\n\
226 The daemons to be monitored should be listed on the command line.\n\n\
227 This program can run in one of 5 modes:\n\n\
229 Just monitor and report on status changes. Example:\n\
230 %s -d zebra ospfd bgpd\n\n\
232 Whenever any daemon hangs or crashes, use the given command to restart\n\
233 them all. Example:\n\
235 -R '/sbin/service zebra restart; /sbin/service ospfd restart' \\\n\
238 When any single daemon hangs or crashes, restart only the daemon that's\n\
239 in trouble using the supplied restart command. Example:\n\
240 %s -dz -r '/sbin/service %%s restart' zebra ospfd bgpd\n\n\
242 The same as the previous mode, except that there is special treatment when\n\
243 the zebra daemon is in trouble. In that case, a phased restart approach\n\
244 is used: 1. stop all other daemons; 2. restart zebra; 3. start the other\n\
246 %s -adz -r '/sbin/service %%s restart' \\\n\
247 -s '/sbin/service %%s start' \\\n\
248 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
250 This is the same as the previous mode, except that the phased restart\n\
251 procedure is used whenever any of the daemons hangs or crashes. Example:\n\
252 %s -Adz -r '/sbin/service %%s restart' \\\n\
253 -s '/sbin/service %%s start' \\\n\
254 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
255 As of this writing, it is believed that mode 2 [%s]\n\
256 is not safe, and mode 3 [%s] may not be safe with some of the\n\
257 routing daemons.\n\n\
258 In order to avoid attempting to restart the daemons in a fast loop,\n\
259 the -m and -M options allow you to control the minimum delay between\n\
260 restart commands. The minimum restart delay is recalculated each time\n\
261 a restart is attempted: if the time since the last restart attempt exceeds\n\
262 twice the -M value, then the restart delay is set to the -m value.\n\
263 Otherwise, the interval is doubled (but capped at the -M value).\n\n",
264 progname,mode_str[0],progname,mode_str[1],progname,mode_str[2],
265 progname,mode_str[3],progname,mode_str[4],progname,mode_str[2],
269 -d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
270 to syslog instead of stdout.\n\
271 -S, --statedir Set the vty socket directory (default is %s)\n\
272 -e, --no-echo Do not ping the daemons to test responsiveness (this\n\
273 option is necessary if the daemons do not support the\n\
275 -l, --loglevel Set the logging level (default is %d).\n\
276 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
277 but it can be set higher than %d if extra-verbose debugging\n\
278 messages are desired.\n\
279 -m, --min-restart-interval\n\
280 Set the minimum seconds to wait between invocations of daemon\n\
281 restart commands (default is %d).\n\
282 -M, --max-restart-interval\n\
283 Set the maximum seconds to wait between invocations of daemon\n\
284 restart commands (default is %d).\n\
285 -i, --interval Set the status polling interval in seconds (default is %d)\n\
286 -t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
287 -T, --restart-timeout\n\
288 Set the restart (kill) timeout in seconds (default is %d).\n\
289 If any background jobs are still running after this much\n\
290 time has elapsed, they will be killed.\n\
291 -r, --restart Supply a Bourne shell command to use to restart a single\n\
292 daemon. The command string should include '%%s' where the\n\
293 name of the daemon should be substituted.\n\
294 Note that -r and -R are incompatible.\n\
295 -s, --start-command\n\
296 Supply a Bourne shell to command to use to start a single\n\
297 daemon. The command string should include '%%s' where the\n\
298 name of the daemon should be substituted.\n\
299 -k, --kill-command\n\
300 Supply a Bourne shell to command to use to stop a single\n\
301 daemon. The command string should include '%%s' where the\n\
302 name of the daemon should be substituted.\n\
304 When one or more daemons is down, try to restart everything\n\
305 using the Bourne shell command supplied as the argument.\n\
306 Note that -r and -R are incompatible.\n\
307 -z, --unresponsive-restart\n\
308 When a daemon is unresponsive, treat it as being down for\n\
311 When zebra hangs or crashes, restart all daemons using\n\
312 this phased approach: 1. stop all other daemons; 2. restart\n\
313 zebra; 3. start other daemons. Requires -r, -s, and -k.\n\
314 -A, --always-all-restart\n\
315 When any daemon (not just zebra) hangs or crashes, use the\n\
316 same phased restart mechanism described above for -a.\n\
317 Requires -r, -s, and -k.\n\
318 -p, --pid-file Set process identifier file name\n\
320 -b, --blank-string\n\
321 When the supplied argument string is found in any of the\n\
322 various shell command arguments (-r, -s, -k, or -R), replace\n\
323 it with a space. This is an ugly hack to circumvent problems\n\
324 passing command-line arguments with embedded spaces.\n\
325 -v, --version Print program version\n\
326 -h, --help Display this help and exit\n",
327 VTYDIR,DEFAULT_LOGLEVEL,LOG_EMERG,LOG_DEBUG,LOG_DEBUG,
328 DEFAULT_MIN_RESTART,DEFAULT_MAX_RESTART,
329 DEFAULT_PERIOD,DEFAULT_TIMEOUT,DEFAULT_RESTART_TIMEOUT,
337 run_background(char *shell_cmd)
341 switch (child = fork())
344 zlog_err("fork failed, cannot run command [%s]: %s",
345 shell_cmd,safe_strerror(errno));
349 /* Use separate process group so child processes can be killed easily. */
350 if (setpgid(0,0) < 0)
351 zlog_warn("warning: setpgid(0,0) failed: %s",safe_strerror(errno));
355 char *const argv[4] = { shell, dashc, shell_cmd, NULL};
356 execv("/bin/sh", argv);
357 zlog_err("execv(/bin/sh -c '%s') failed: %s",
358 shell_cmd,safe_strerror(errno));
362 /* Parent process: we will reap the child later. */
363 zlog_err("Forked background command [pid %d]: %s",(int)child,shell_cmd);
/*
 * Compute the wall-clock time that has passed since start_time.
 *
 * result:     output buffer; overwritten with (current time - start_time).
 * start_time: an earlier timestamp, as filled in by gettimeofday().
 *
 * Returns result, so that callers can chain the call, e.g.
 * time_elapsed(&delay,&when)->tv_sec.  The microsecond field of the
 * result is never left negative.
 */
static struct timeval *
time_elapsed(struct timeval *result, const struct timeval *start_time)
{
  gettimeofday(result,NULL);
  result->tv_sec -= start_time->tv_sec;
  result->tv_usec -= start_time->tv_usec;
  while (result->tv_usec < 0)
    {
      /* Borrow one second to bring the microseconds back into range. */
      result->tv_usec += 1000000L;
      result->tv_sec--;
    }
  return result;
}
383 restart_kill(struct thread *t_kill)
385 struct restart_info *restart = THREAD_ARG(t_kill);
386 struct timeval delay;
388 time_elapsed(&delay,&restart->time);
389 zlog_warn("Warning: %s %s child process %d still running after "
390 "%ld seconds, sending signal %d",
391 restart->what,restart->name,(int)restart->pid, (long)delay.tv_sec,
392 (restart->kills ? SIGKILL : SIGTERM));
393 kill(-restart->pid,(restart->kills ? SIGKILL : SIGTERM));
395 restart->t_kill = thread_add_timer(master,restart_kill,restart,
400 static struct restart_info *
401 find_child(pid_t child)
403 if (gs.mode == MODE_GLOBAL_RESTART)
405 if (gs.restart.pid == child)
411 for (dmn = gs.daemons; dmn; dmn = dmn->next)
413 if (dmn->restart.pid == child)
414 return &dmn->restart;
427 struct restart_info *restart;
429 switch (child = waitpid(-1,&status,WNOHANG))
432 zlog_err("waitpid failed: %s",safe_strerror(errno));
435 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
439 if ((restart = find_child(child)) != NULL)
441 name = restart->name;
442 what = restart->what;
445 thread_cancel(restart->t_kill);
446 restart->t_kill = NULL;
447 /* Update restart time to reflect the time the command completed. */
448 gettimeofday(&restart->time,NULL);
452 zlog_err("waitpid returned status for an unknown child process %d",
457 if (WIFSTOPPED(status))
458 zlog_warn("warning: %s %s process %d is stopped",
459 what,name,(int)child);
460 else if (WIFSIGNALED(status))
461 zlog_warn("%s %s process %d terminated due to signal %d",
462 what,name,(int)child,WTERMSIG(status));
463 else if (WIFEXITED(status))
465 if (WEXITSTATUS(status) != 0)
466 zlog_warn("%s %s process %d exited with non-zero status %d",
467 what,name,(int)child,WEXITSTATUS(status));
469 zlog_debug("%s %s process %d exited normally",what,name,(int)child);
472 zlog_err("cannot interpret %s %s process %d wait status 0x%x",
473 what,name,(int)child,status);
478 run_job(struct restart_info *restart, const char *cmdtype, const char *command,
479 int force, int update_interval)
481 struct timeval delay;
483 if (gs.loglevel > LOG_DEBUG+1)
484 zlog_debug("attempting to %s %s",cmdtype,restart->name);
488 if (gs.loglevel > LOG_DEBUG+1)
489 zlog_debug("cannot %s %s, previous pid %d still running",
490 cmdtype,restart->name,(int)restart->pid);
494 /* Note: time_elapsed test must come before the force test, since we need
495 to make sure that delay is initialized for use below in updating the
497 if ((time_elapsed(&delay,&restart->time)->tv_sec < restart->interval) &&
500 if (gs.loglevel > LOG_DEBUG+1)
501 zlog_debug("postponing %s %s: "
502 "elapsed time %ld < retry interval %ld",
503 cmdtype,restart->name,(long)delay.tv_sec,restart->interval);
507 gettimeofday(&restart->time,NULL);
510 char cmd[strlen(command)+strlen(restart->name)+1];
511 snprintf(cmd,sizeof(cmd),command,restart->name);
512 if ((restart->pid = run_background(cmd)) > 0)
514 restart->t_kill = thread_add_timer(master,restart_kill,restart,
516 restart->what = cmdtype;
523 /* Calculate the new restart interval. */
526 if (delay.tv_sec > 2*gs.max_restart_interval)
527 restart->interval = gs.min_restart_interval;
528 else if ((restart->interval *= 2) > gs.max_restart_interval)
529 restart->interval = gs.max_restart_interval;
530 if (gs.loglevel > LOG_DEBUG+1)
531 zlog_debug("restart %s interval is now %ld",
532 restart->name,restart->interval);
537 #define SET_READ_HANDLER(DMN) \
538 (DMN)->t_read = thread_add_read(master,handle_read,(DMN),(DMN)->fd)
540 #define SET_WAKEUP_DOWN(DMN) \
541 (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_down,(DMN), \
544 #define SET_WAKEUP_UNRESPONSIVE(DMN) \
545 (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_unresponsive,(DMN), \
548 #define SET_WAKEUP_ECHO(DMN) \
549 (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_send_echo,(DMN), \
553 wakeup_down(struct thread *t_wakeup)
555 struct daemon *dmn = THREAD_ARG(t_wakeup);
557 dmn->t_wakeup = NULL;
558 if (try_connect(dmn) < 0)
559 SET_WAKEUP_DOWN(dmn);
560 if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
566 wakeup_init(struct thread *t_wakeup)
568 struct daemon *dmn = THREAD_ARG(t_wakeup);
570 dmn->t_wakeup = NULL;
571 if (try_connect(dmn) < 0)
573 SET_WAKEUP_DOWN(dmn);
574 zlog_err("%s state -> down : initial connection attempt failed",
576 dmn->state = DAEMON_DOWN;
582 daemon_down(struct daemon *dmn, const char *why)
584 if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
585 zlog_err("%s state -> down : %s",dmn->name,why);
586 else if (gs.loglevel > LOG_DEBUG)
587 zlog_debug("%s still down : %s",dmn->name,why);
590 dmn->state = DAEMON_DOWN;
596 THREAD_OFF(dmn->t_read);
597 THREAD_OFF(dmn->t_write);
598 THREAD_OFF(dmn->t_wakeup);
599 if (try_connect(dmn) < 0)
600 SET_WAKEUP_DOWN(dmn);
605 handle_read(struct thread *t_read)
607 struct daemon *dmn = THREAD_ARG(t_read);
608 static const char resp[sizeof(PING_TOKEN)+4] = PING_TOKEN "\n";
609 char buf[sizeof(resp)+100];
611 struct timeval delay;
614 if ((rc = read(dmn->fd,buf,sizeof(buf))) < 0)
618 if (ERRNO_IO_RETRY(errno))
620 /* Pretend it never happened. */
621 SET_READ_HANDLER(dmn);
624 snprintf(why,sizeof(why),"unexpected read error: %s",
625 safe_strerror(errno));
626 daemon_down(dmn,why);
631 daemon_down(dmn,"read returned EOF");
634 if (!dmn->echo_sent.tv_sec)
636 char why[sizeof(buf)+100];
637 snprintf(why,sizeof(why),"unexpected read returns %d bytes: %.*s",
638 (int)rc,(int)rc,buf);
639 daemon_down(dmn,why);
643 /* We are expecting an echo response: is there any chance that the
644 response would not be returned entirely in the first read? That
645 seems inconceivable... */
646 if ((rc != sizeof(resp)) || memcmp(buf,resp,sizeof(resp)))
648 char why[100+sizeof(buf)];
649 snprintf(why,sizeof(why),"read returned bad echo response of %d bytes "
650 "(expecting %u): %.*s",
651 (int)rc,(u_int)sizeof(resp),(int)rc,buf);
652 daemon_down(dmn,why);
656 time_elapsed(&delay,&dmn->echo_sent);
657 dmn->echo_sent.tv_sec = 0;
658 if (dmn->state == DAEMON_UNRESPONSIVE)
660 if (delay.tv_sec < gs.timeout)
662 dmn->state = DAEMON_UP;
663 zlog_warn("%s state -> up : echo response received after %ld.%06ld "
664 "seconds", dmn->name,
665 (long)delay.tv_sec, (long)delay.tv_usec);
668 zlog_warn("%s: slow echo response finally received after %ld.%06ld "
669 "seconds", dmn->name,
670 (long)delay.tv_sec, (long)delay.tv_usec);
672 else if (gs.loglevel > LOG_DEBUG+1)
673 zlog_debug("%s: echo response received after %ld.%06ld seconds",
674 dmn->name, (long)delay.tv_sec, (long)delay.tv_usec);
676 SET_READ_HANDLER(dmn);
678 thread_cancel(dmn->t_wakeup);
679 SET_WAKEUP_ECHO(dmn);
685 daemon_up(struct daemon *dmn, const char *why)
687 dmn->state = DAEMON_UP;
689 dmn->connect_tries = 0;
690 zlog_notice("%s state -> up : %s",dmn->name,why);
692 SET_WAKEUP_ECHO(dmn);
697 check_connect(struct thread *t_write)
699 struct daemon *dmn = THREAD_ARG(t_write);
701 socklen_t reslen = sizeof(sockerr);
704 if (getsockopt(dmn->fd,SOL_SOCKET,SO_ERROR,(char *)&sockerr,&reslen) < 0)
706 zlog_warn("%s: check_connect: getsockopt failed: %s",
707 dmn->name,safe_strerror(errno));
708 daemon_down(dmn,"getsockopt failed checking connection success");
711 if ((reslen == sizeof(sockerr)) && sockerr)
714 snprintf(why,sizeof(why),
715 "getsockopt reports that connection attempt failed: %s",
716 safe_strerror(sockerr));
717 daemon_down(dmn,why);
721 daemon_up(dmn,"delayed connect succeeded");
726 wakeup_connect_hanging(struct thread *t_wakeup)
728 struct daemon *dmn = THREAD_ARG(t_wakeup);
731 dmn->t_wakeup = NULL;
732 snprintf(why,sizeof(why),"connection attempt timed out after %ld seconds",
734 daemon_down(dmn,why);
738 /* Making connection to protocol daemon. */
740 try_connect(struct daemon *dmn)
743 struct sockaddr_un addr;
746 if (gs.loglevel > LOG_DEBUG+1)
747 zlog_debug("%s: attempting to connect",dmn->name);
748 dmn->connect_tries++;
750 memset (&addr, 0, sizeof (struct sockaddr_un));
751 addr.sun_family = AF_UNIX;
752 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty",
753 gs.vtydir,dmn->name);
754 #ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
755 len = addr.sun_len = SUN_LEN(&addr);
757 len = sizeof (addr.sun_family) + strlen (addr.sun_path);
758 #endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
760 /* Quick check to see if we might succeed before we go to the trouble
761 of creating a socket. */
762 if (access(addr.sun_path, W_OK) < 0)
765 zlog_err("%s: access to socket %s denied: %s",
766 dmn->name,addr.sun_path,safe_strerror(errno));
770 if ((sock = socket (AF_UNIX, SOCK_STREAM, 0)) < 0)
772 zlog_err("%s(%s): cannot make socket: %s",
773 __func__,addr.sun_path, safe_strerror(errno));
777 if (set_nonblocking(sock) < 0)
779 zlog_err("%s(%s): set_nonblocking(%d) failed",
780 __func__, addr.sun_path, sock);
785 if (connect (sock, (struct sockaddr *) &addr, len) < 0)
787 if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK))
789 if (gs.loglevel > LOG_DEBUG)
790 zlog_debug("%s(%s): connect failed: %s",
791 __func__,addr.sun_path, safe_strerror(errno));
795 if (gs.loglevel > LOG_DEBUG)
796 zlog_debug("%s: connection in progress",dmn->name);
797 dmn->state = DAEMON_CONNECTING;
799 dmn->t_write = thread_add_write(master,check_connect,dmn,dmn->fd);
800 dmn->t_wakeup = thread_add_timer(master,wakeup_connect_hanging,dmn,
802 SET_READ_HANDLER(dmn);
807 SET_READ_HANDLER(dmn);
808 daemon_up(dmn,"connect succeeded");
813 phase_hanging(struct thread *t_hanging)
815 gs.t_phase_hanging = NULL;
816 zlog_err("Phase [%s] hanging for %ld seconds, aborting phased restart",
817 phase_str[gs.phase],PHASE_TIMEOUT);
818 gs.phase = PHASE_NONE;
823 set_phase(restart_phase_t new_phase)
825 gs.phase = new_phase;
826 if (gs.t_phase_hanging)
827 thread_cancel(gs.t_phase_hanging);
828 gs.t_phase_hanging = thread_add_timer(master,phase_hanging,NULL,
839 case PHASE_STOPS_PENDING:
842 zlog_info("Phased restart: all routing daemon stop jobs have completed.");
843 set_phase(PHASE_WAITING_DOWN);
845 case PHASE_WAITING_DOWN:
846 if (gs.numdown+IS_UP(gs.special) < gs.numdaemons)
848 zlog_info("Phased restart: all routing daemons now down.");
849 run_job(&gs.special->restart,"restart",gs.restart_command,1,1);
850 set_phase(PHASE_ZEBRA_RESTART_PENDING);
852 case PHASE_ZEBRA_RESTART_PENDING:
853 if (gs.special->restart.pid)
855 zlog_info("Phased restart: %s restart job completed.",gs.special->name);
856 set_phase(PHASE_WAITING_ZEBRA_UP);
858 case PHASE_WAITING_ZEBRA_UP:
859 if (!IS_UP(gs.special))
861 zlog_info("Phased restart: %s is now up.",gs.special->name);
864 for (dmn = gs.daemons; dmn; dmn = dmn->next)
866 if (dmn != gs.special)
867 run_job(&dmn->restart,"start",gs.start_command,1,0);
870 gs.phase = PHASE_NONE;
871 THREAD_OFF(gs.t_phase_hanging);
872 zlog_notice("Phased global restart has completed.");
878 try_restart(struct daemon *dmn)
884 case MODE_GLOBAL_RESTART:
885 run_job(&gs.restart,"restart",gs.restart_command,0,1);
887 case MODE_SEPARATE_RESTART:
888 run_job(&dmn->restart,"restart",gs.restart_command,0,1);
890 case MODE_PHASED_ZEBRA_RESTART:
891 if (dmn != gs.special)
893 if ((gs.special->state == DAEMON_UP) && (gs.phase == PHASE_NONE))
894 run_job(&dmn->restart,"restart",gs.restart_command,0,1);
896 zlog_debug("%s: postponing restart attempt because master %s daemon "
897 "not up [%s], or phased restart in progress",
898 dmn->name,gs.special->name,state_str[gs.special->state]);
902 case MODE_PHASED_ALL_RESTART:
903 if ((gs.phase != PHASE_NONE) || gs.numpids)
905 if (gs.loglevel > LOG_DEBUG+1)
906 zlog_debug("postponing phased global restart: restart already in "
907 "progress [%s], or outstanding child processes [%d]",
908 phase_str[gs.phase],gs.numpids);
911 /* Is it too soon for a restart? */
913 struct timeval delay;
914 if (time_elapsed(&delay,&gs.special->restart.time)->tv_sec <
915 gs.special->restart.interval)
917 if (gs.loglevel > LOG_DEBUG+1)
918 zlog_debug("postponing phased global restart: "
919 "elapsed time %ld < retry interval %ld",
920 (long)delay.tv_sec,gs.special->restart.interval);
924 zlog_info("Phased restart: stopping all routing daemons.");
925 /* First step: stop all other daemons. */
926 for (dmn = gs.daemons; dmn; dmn = dmn->next)
928 if (dmn != gs.special)
929 run_job(&dmn->restart,"stop",gs.stop_command,1,1);
931 set_phase(PHASE_STOPS_PENDING);
934 zlog_err("error: unknown restart mode %d",gs.mode);
940 wakeup_unresponsive(struct thread *t_wakeup)
942 struct daemon *dmn = THREAD_ARG(t_wakeup);
944 dmn->t_wakeup = NULL;
945 if (dmn->state != DAEMON_UNRESPONSIVE)
946 zlog_err("%s: no longer unresponsive (now %s), "
947 "wakeup should have been cancelled!",
948 dmn->name,state_str[dmn->state]);
951 SET_WAKEUP_UNRESPONSIVE(dmn);
958 wakeup_no_answer(struct thread *t_wakeup)
960 struct daemon *dmn = THREAD_ARG(t_wakeup);
962 dmn->t_wakeup = NULL;
963 dmn->state = DAEMON_UNRESPONSIVE;
964 zlog_err("%s state -> unresponsive : no response yet to ping "
965 "sent %ld seconds ago",dmn->name,gs.timeout);
966 if (gs.unresponsive_restart)
968 SET_WAKEUP_UNRESPONSIVE(dmn);
975 wakeup_send_echo(struct thread *t_wakeup)
977 static const char echocmd[] = "echo " PING_TOKEN;
979 struct daemon *dmn = THREAD_ARG(t_wakeup);
981 dmn->t_wakeup = NULL;
982 if (((rc = write(dmn->fd,echocmd,sizeof(echocmd))) < 0) ||
983 ((size_t)rc != sizeof(echocmd)))
985 char why[100+sizeof(echocmd)];
986 snprintf(why,sizeof(why),"write '%s' returned %d instead of %u",
987 echocmd,(int)rc,(u_int)sizeof(echocmd));
988 daemon_down(dmn,why);
992 gettimeofday(&dmn->echo_sent,NULL);
993 dmn->t_wakeup = thread_add_timer(master,wakeup_no_answer,dmn,gs.timeout);
1001 zlog_notice("Terminating on signal");
/*
 * Check that a per-daemon shell command template can safely be used as a
 * format string with the daemon name as its only argument.
 *
 * Returns non-zero only if cmd contains exactly one '%' character and that
 * character is immediately followed by 's'.  Returns 0 otherwise, including
 * when cmd is NULL.
 */
static int
valid_command(const char *cmd)
{
  const char *p;

  if (cmd == NULL)
    return 0;
  if ((p = strchr(cmd,'%')) == NULL)
    return 0;
  /* The first '%' must introduce "%s", and no further '%' may follow. */
  return (p[1] == 's') && (strchr(p+1,'%') == NULL);
}
/* This is an ugly hack to circumvent problems with passing command-line
   arguments that contain spaces.  The fix is to use a configuration file.

   Returns a newly allocated copy of cmd in which every occurrence of
   blankstr has been replaced by a single space; the caller owns the
   returned string.  An empty blankstr leaves the command unchanged.
   If memory cannot be allocated, an error is printed and the program
   exits, because callers store the result without checking it. */
static char *
translate_blanks(const char *cmd, const char *blankstr)
{
  char *res;
  char *p;
  size_t cmdlen = strlen(cmd);
  size_t bslen = strlen(blankstr);

  if (!(res = malloc(cmdlen+1)))
    {
      perror("malloc");
      exit(1);
    }
  memcpy(res,cmd,cmdlen+1);

  /* An empty token matches at every position: nothing sensible to replace,
     and attempting it would never terminate. */
  if (bslen == 0)
    return res;

  p = res;
  while ((p = strstr(p,blankstr)) != NULL)
    {
      *p = ' ';
      if (bslen != 1)
        memmove(p+1,p+bslen,strlen(p+bslen)+1);
      /* Resume after the space just written, so that a token which itself
         begins with a space cannot be matched again at the same spot. */
      p++;
    }
  return res;
}
1037 main(int argc, char **argv)
1039 const char *progname;
1041 int daemon_mode = 0;
1042 const char *pidfile = DEFAULT_PIDFILE;
1043 const char *special = "zebra";
1044 const char *blankstr = NULL;
1045 static struct quagga_signal_t my_signals[] =
1057 .handler = sigchild,
1061 if ((progname = strrchr (argv[0], '/')) != NULL)
1066 gs.restart.name = "all";
1067 while ((opt = getopt_long(argc, argv, "aAb:dek:l:m:M:i:p:r:R:S:s:t:T:zvh",
1068 longopts, 0)) != EOF)
1075 if ((gs.mode != MODE_MONITOR) && (gs.mode != MODE_SEPARATE_RESTART))
1077 fputs("Ambiguous operating mode selected.\n",stderr);
1078 return usage(progname,1);
1080 gs.mode = MODE_PHASED_ZEBRA_RESTART;
1083 if ((gs.mode != MODE_MONITOR) && (gs.mode != MODE_SEPARATE_RESTART))
1085 fputs("Ambiguous operating mode selected.\n",stderr);
1086 return usage(progname,1);
1088 gs.mode = MODE_PHASED_ALL_RESTART;
1100 if (!valid_command(optarg))
1102 fprintf(stderr,"Invalid kill command, must contain '%%s': %s\n",
1104 return usage(progname,1);
1106 gs.stop_command = optarg;
1111 if ((sscanf(optarg,"%d%1s",&gs.loglevel,garbage) != 1) ||
1112 (gs.loglevel < LOG_EMERG))
1114 fprintf(stderr,"Invalid loglevel argument: %s\n",optarg);
1115 return usage(progname,1);
1122 if ((sscanf(optarg,"%ld%1s",
1123 &gs.min_restart_interval,garbage) != 1) ||
1124 (gs.min_restart_interval < 0))
1126 fprintf(stderr,"Invalid min_restart_interval argument: %s\n",
1128 return usage(progname,1);
1135 if ((sscanf(optarg,"%ld%1s",
1136 &gs.max_restart_interval,garbage) != 1) ||
1137 (gs.max_restart_interval < 0))
1139 fprintf(stderr,"Invalid max_restart_interval argument: %s\n",
1141 return usage(progname,1);
1149 if ((sscanf(optarg,"%d%1s",&period,garbage) != 1) ||
1152 fprintf(stderr,"Invalid interval argument: %s\n",optarg);
1153 return usage(progname,1);
1155 gs.period = 1000*period;
1162 if ((gs.mode == MODE_GLOBAL_RESTART) ||
1163 (gs.mode == MODE_SEPARATE_RESTART))
1165 fputs("Ambiguous operating mode selected.\n",stderr);
1166 return usage(progname,1);
1168 if (!valid_command(optarg))
1171 "Invalid restart command, must contain '%%s': %s\n",
1173 return usage(progname,1);
1175 gs.restart_command = optarg;
1176 if (gs.mode == MODE_MONITOR)
1177 gs.mode = MODE_SEPARATE_RESTART;
1180 if (gs.mode != MODE_MONITOR)
1182 fputs("Ambiguous operating mode selected.\n",stderr);
1183 return usage(progname,1);
1185 if (strchr(optarg,'%'))
1188 "Invalid restart-all arg, must not contain '%%s': %s\n",
1190 return usage(progname,1);
1192 gs.restart_command = optarg;
1193 gs.mode = MODE_GLOBAL_RESTART;
1196 if (!valid_command(optarg))
1198 fprintf(stderr,"Invalid start command, must contain '%%s': %s\n",
1200 return usage(progname,1);
1202 gs.start_command = optarg;
1210 if ((sscanf(optarg,"%ld%1s",&gs.timeout,garbage) != 1) ||
1213 fprintf(stderr,"Invalid timeout argument: %s\n",optarg);
1214 return usage(progname,1);
1221 if ((sscanf(optarg,"%ld%1s",&gs.restart_timeout,garbage) != 1) ||
1222 (gs.restart_timeout < 1))
1224 fprintf(stderr,"Invalid restart timeout argument: %s\n",optarg);
1225 return usage(progname,1);
1230 gs.unresponsive_restart = 1;
1233 printf ("%s version %s\n", progname, QUAGGA_VERSION);
1234 puts("Copyright 2004 Andrew J. Schorr");
1237 return usage(progname,0);
1239 fputs("Invalid option.\n",stderr);
1240 return usage(progname,1);
1244 if (gs.unresponsive_restart && (gs.mode == MODE_MONITOR))
1246 fputs("Option -z requires a -r or -R restart option.\n",stderr);
1247 return usage(progname,1);
1252 if (gs.restart_command || gs.start_command || gs.stop_command)
1254 fprintf(stderr,"No kill/(re)start commands needed for %s mode.\n",
1256 return usage(progname,1);
1259 case MODE_GLOBAL_RESTART:
1260 case MODE_SEPARATE_RESTART:
1261 if (!gs.restart_command || gs.start_command || gs.stop_command)
1263 fprintf(stderr,"No start/kill commands needed in [%s] mode.\n",
1265 return usage(progname,1);
1268 case MODE_PHASED_ZEBRA_RESTART:
1269 case MODE_PHASED_ALL_RESTART:
1270 if (!gs.restart_command || !gs.start_command || !gs.stop_command)
1273 "Need start, kill, and restart commands in [%s] mode.\n",
1275 return usage(progname,1);
1282 if (gs.restart_command)
1283 gs.restart_command = translate_blanks(gs.restart_command,blankstr);
1284 if (gs.start_command)
1285 gs.start_command = translate_blanks(gs.start_command,blankstr);
1286 if (gs.stop_command)
1287 gs.stop_command = translate_blanks(gs.stop_command,blankstr);
1290 gs.restart.interval = gs.min_restart_interval;
1291 master = thread_master_create();
1292 signal_init (master, array_size(my_signals), my_signals);
1293 srandom(time(NULL));
1297 struct daemon *tail = NULL;
1299 for (i = optind; i < argc; i++)
1303 if (!(dmn = (struct daemon *)calloc(1,sizeof(*dmn))))
1305 fprintf(stderr,"calloc(1,%u) failed: %s\n",
1306 (u_int)sizeof(*dmn), safe_strerror(errno));
1309 dmn->name = dmn->restart.name = argv[i];
1310 dmn->state = DAEMON_INIT;
1314 dmn->t_wakeup = thread_add_timer_msec(master,wakeup_init,dmn,
1315 100+(random() % 900));
1316 dmn->restart.interval = gs.min_restart_interval;
1323 if (((gs.mode == MODE_PHASED_ZEBRA_RESTART) ||
1324 (gs.mode == MODE_PHASED_ALL_RESTART)) &&
1325 !strcmp(dmn->name,special))
1331 fputs("Must specify one or more daemons to monitor.\n",stderr);
1332 return usage(progname,1);
1334 if (((gs.mode == MODE_PHASED_ZEBRA_RESTART) ||
1335 (gs.mode == MODE_PHASED_ALL_RESTART)) && !gs.special)
1337 fprintf(stderr,"In mode [%s], but cannot find master daemon %s\n",
1338 mode_str[gs.mode],special);
1339 return usage(progname,1);
1341 if (gs.special && (gs.numdaemons < 2))
1343 fprintf(stderr,"Mode [%s] does not make sense with only 1 daemon "
1344 "to watch.\n",mode_str[gs.mode]);
1345 return usage(progname,1);
1348 zlog_default = openzlog(progname, ZLOG_NONE,
1349 LOG_CONS|LOG_NDELAY|LOG_PID, LOG_DAEMON);
1350 zlog_set_level(NULL, ZLOG_DEST_MONITOR, ZLOG_DISABLED);
1353 zlog_set_level(NULL, ZLOG_DEST_SYSLOG, MIN(gs.loglevel,LOG_DEBUG));
1354 if (daemon (0, 0) < 0)
1356 fprintf(stderr, "Watchquagga daemon failed: %s", strerror(errno));
1361 zlog_set_level(NULL, ZLOG_DEST_STDOUT, MIN(gs.loglevel,LOG_DEBUG));
1363 /* Make sure we're not already running. */
1364 pid_output (pidfile);
1366 /* Announce which daemons are being monitored. */
1371 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1372 len += strlen(dmn->name)+1;
1378 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1382 strcpy(p,dmn->name);
1385 zlog_notice("%s %s watching [%s], mode [%s]",
1386 progname, QUAGGA_VERSION, buf, mode_str[gs.mode]);
1390 thread_main (master);