[Openais] defect 169 fixed up (and 172)

Steven Dake sdake at mvista.com
Tue Oct 26 23:58:16 PDT 2004


Mark,

I have a patch for defect 169 (assert on ifdown).  If the interface is
brought down during operation, the processor will enter a singleton
configuration and continue to operate.  If the interface is later
brought back up, the processor will attempt to join any other
configurations it can locate on the multicast address.  In the process I
found a pretty nasty bug (defect 172) which prevents two singleton
configurations from forming a single configuration, because the local
variables that are normally reset by the evs algorithm are not reset in
singleton configuration mode.

If you could give it a spin and let me know how it works for you, that'd
be cool.

There may still be a bug in this patch, because ckptbench sometimes
freezes during operation (meaning it lost some messages).

Thanks
-steve

-------------- next part --------------
Files defect-169.orig/exec/.gmi.c.swp and defect-169/exec/.gmi.c.swp differ
Files defect-169.orig/exec/.nfs00cecc2e00000024 and defect-169/exec/.nfs00cecc2e00000024 differ
Files defect-169.orig/exec/.nfs00cecc6e00000028 and defect-169/exec/.nfs00cecc6e00000028 differ
Files defect-169.orig/exec/aispoll.o and defect-169/exec/aispoll.o differ
Files defect-169.orig/exec/clm.o and defect-169/exec/clm.o differ
Files defect-169.orig/exec/crypto.o and defect-169/exec/crypto.o differ
diff -uNr defect-169.orig/exec/gmi.c defect-169/exec/gmi.c
--- defect-169.orig/exec/gmi.c	2004-10-25 12:15:12.000000000 -0700
+++ defect-169/exec/gmi.c	2004-10-26 16:50:09.000000000 -0700
@@ -97,6 +97,7 @@
 #define MAXIOVS						4
 #define RETRANSMIT_ENTRIES_MAX		50
 #define MISSING_MCAST_WINDOW		64
+
 #define TIMEOUT_STATE_GATHER		100
 #define TIMEOUT_TOKEN				1500
 #define TIMEOUT_TOKEN_RETRANSMIT	750	
@@ -105,6 +106,7 @@
 #define HOLE_LIST_MAX				MISSING_MCAST_WINDOW
 #define PRIORITY_MAX				4
 #define PACKET_SIZE_MAX				1500
+#define TIMEOUT_DOWNCHECK			1000
 
 /*
  * Authentication of messages
@@ -281,6 +283,8 @@
 
 poll_timer_handle timer_single_member = 0;
 
+poll_timer_handle timer_netif_check_timeout = 0;
+
 /*
  * Function called when new message received
  */
@@ -408,6 +412,8 @@
 
 static struct sockaddr_in gmi_bound_to;
 
+static struct gmi_interface *gmi_interfaces;
+
 static struct sockaddr_in memb_list[MAX_MEMBERS];
 static int memb_list_entries = 1;
 static int memb_list_entries_confchg = 1;
@@ -486,7 +492,14 @@
 static int message_handler_memb_form_token (struct sockaddr_in *, struct iovec *, int, int);
 static void memb_conf_id_build (struct memb_conf_id *, struct in_addr);
 static int recv_handler (poll_handle handle, int fd, int revents, void *data, unsigned int *prio);
-static int netif_determine (struct sockaddr_in *bindnet, struct sockaddr_in *bound_to);
+static int netif_determine (struct sockaddr_in *bindnet,
+	struct sockaddr_in *bound_to,
+	int *interface_up);
+
+static void netif_down_check (void);
+
+int netif_down_report_down = 1;
+
 static int gmi_build_sockets (struct sockaddr_in *sockaddr_mcast,
 	struct sockaddr_in *sockaddr_bindnet,
 	struct gmi_socket *sockets,
@@ -606,6 +619,7 @@
 	}
 
 	memcpy (&gmi_bound_to, &interfaces->boundto, sizeof (struct sockaddr_in));
+	gmi_interfaces = interfaces;
 
 	/*
 	 * This stuff depends on gmi_build_sockets
@@ -616,7 +630,7 @@
 
 	memcpy (&memb_form_token_conf_id, &memb_conf_id, sizeof (struct memb_conf_id));
 
-	memb_state_gather_enter ();
+	netif_down_check ();
 
 	memset (&memb_next, 0, sizeof (struct sockaddr_in));
 
@@ -1198,9 +1212,9 @@
 	plug_state = GMI_PLUG_PROCESSOR_PLUGGED;
 }
 
-
 static int netif_determine (struct sockaddr_in *bindnet,
-	struct sockaddr_in *bound_to)
+	struct sockaddr_in *bound_to,
+	int *interface_up)
 {
 	struct sockaddr_in *sockaddr_in;
 	int id_fd;
@@ -1210,10 +1224,11 @@
 	int i;
 	in_addr_t mask_addr;
 
+	id_fd = socket (AF_INET, SOCK_STREAM, 0);
+
 	/*
 	 * Generate list of local interfaces in ifc.ifc_req structure
 	 */
-	id_fd = socket (AF_INET, SOCK_STREAM, 0);
 	ifc.ifc_buf = 0;
 	do {
 		numreqs += 32;
@@ -1238,18 +1253,58 @@
 			(sockaddr_in->sin_addr.s_addr & mask_addr) ==
 			(bindnet->sin_addr.s_addr & mask_addr)) {
 
+		sockaddr_in = (struct sockaddr_in *)&ifc.ifc_ifcu.ifcu_req[i].ifr_ifru.ifru_addr;
 			bound_to->sin_addr.s_addr = sockaddr_in->sin_addr.s_addr;
 			res = i;
+
+			if (ioctl(id_fd, SIOCGIFFLAGS,
+				&ifc.ifc_ifcu.ifcu_req[i]) < 0) {
+				printf ("couldn't do ioctl\n");
+			}
+			*interface_up = ifc.ifc_ifcu.ifcu_req[i].ifr_ifru.ifru_flags & IFF_UP;
 			break; /* for */
 		}
 	}
 	free (ifc.ifc_buf);
 	close (id_fd);
 	
-printf ("res is %d\n", res);
 	return (res);
 }
 
+static void timer_function_netif_check_timeout (void *data)
+{
+	int interface_up;
+
+	if (netif_down_report_down) {
+		gmi_log_printf (gmi_log_level_warning,
+			"The network interface is down.\n");
+		netif_down_report_down = 0;
+	}
+
+	netif_determine (&gmi_interfaces[0].bindnet,
+		&gmi_interfaces[0].boundto,
+		&interface_up);
+	if (interface_up) {
+		gmi_log_printf (gmi_log_level_warning,
+			"The network interface is now up.\n");
+		memb_state_gather_enter ();
+		netif_down_report_down = 1;
+	} else {
+		poll_timer_add (*gmi_poll_handle, TIMEOUT_DOWNCHECK, 0,
+			timer_function_netif_check_timeout,
+			&timer_netif_check_timeout);
+	}
+}
+
+/*
+ * Check if an interface is down and reconfigure
+ * gmi waiting for it to come back up
+ */
+static void netif_down_check (void)
+{
+	timer_function_netif_check_timeout (0);
+}
+
 static int gmi_build_sockets (struct sockaddr_in *sockaddr_mcast,
 	struct sockaddr_in *sockaddr_bindnet,
 	struct gmi_socket *sockets,
@@ -1259,6 +1314,7 @@
 	struct sockaddr_in sockaddr_in;
 	char flag;
 	int res;
+	int up;
 	
 	memset (&mreq, 0, sizeof (struct ip_mreq));
 
@@ -1266,7 +1322,8 @@
 	 * Determine the ip address bound to and the interface name
 	 */
 	res = netif_determine (sockaddr_bindnet,
-		bound_to);
+		bound_to,
+		&up);
 
 	if (res == -1) {
 		return (-1);
@@ -1635,7 +1692,13 @@
 	 * If messages mcasted, deliver any new messages to pending queues
 	 */
 	if (fcc_mcast_current) {
-		if (gmi_pend_trans_item->mcast->header.seqid > gmi_highest_seq) {
+		/*
+		 * Only update the highest seq if we have more than
+		 * one processor in the configuration, otherwise the evs
+		 * algorithm will not recover
+		 */
+		if (memb_list_entries > 1 &&
+			gmi_pend_trans_item->mcast->header.seqid > gmi_highest_seq) {
 			gmi_highest_seq = gmi_pend_trans_item->mcast->header.seqid;
 		}
 		pending_queues_deliver ();
@@ -2199,7 +2262,13 @@
 	msg_orf_token.msg_flags = 0;
 	
 	res = sendmsg (gmi_sockets[0].token, &msg_orf_token, MSG_NOSIGNAL);
-	assert (res != -1);
+
+	/*
+	 * was the interface downed?
+	 */
+	if (res == -1 && errno == ENETUNREACH) {
+		netif_down_check ();
+	}
 }
 
 void timer_function_form_token_timeout (void *data)
@@ -2305,7 +2374,6 @@
 //}
 
 	res = sendmsg (gmi_sockets[0].token, &msg_orf_token, MSG_NOSIGNAL);
-	assert (res != -1);
 	
 	/*
 	 * res not used here errors are handled by algorithm
@@ -2448,6 +2516,20 @@
 	 		 */
 			recovery_plug_set ();
 
+			/*
+			 * Reset local variables if we are the only member
+			 */
+			gmi_arut = 0;
+			gmi_highest_seq = 0;
+			gmi_highest_seq_old = 0;
+			last_group_arut = 0;
+			last_released = 0;
+			gmi_barrier_seq = 0;
+
+			sq_reinit (&queue_rtr_items, 0);
+
+			memb_failed_list_entries = 0;
+
 			if (gmi_confchg_fn) {
 				/*
 				 * Determine nodes that left the configuration
@@ -2727,6 +2809,7 @@
 		 */
 		conf_desc->highest_seq = gmi_highest_seq;
 		conf_desc->arut = gmi_arut;
+printf ("setting arut to %d highest seq %d\n", gmi_arut, conf_desc->highest_seq);
 // TODO holes not currently implemented conf_desc->hole_list_entries = 0;
 		memcpy (&conf_desc->conf_id,
 			&memb_form_token_conf_id, sizeof (struct memb_conf_id));
Files defect-169.orig/exec/gmi.o and defect-169/exec/gmi.o differ
Files defect-169.orig/exec/hdb.o and defect-169/exec/hdb.o differ
Files defect-169.orig/exec/libgmi.a and defect-169/exec/libgmi.a differ
Files defect-169.orig/exec/libgmi.so and defect-169/exec/libgmi.so differ
Files defect-169.orig/exec/libgmi.so.1 and defect-169/exec/libgmi.so.1 differ
Files defect-169.orig/exec/libgmi.so.1.0 and defect-169/exec/libgmi.so.1.0 differ
Files defect-169.orig/exec/main.o and defect-169/exec/main.o differ
Files defect-169.orig/exec/tlist.o and defect-169/exec/tlist.o differ


More information about the Openais mailing list