[Openais] [PATCH 1/4] Resolve abort during simultaneous stopping of at least 4 nodes

Steven Dake sdake at redhat.com
Tue Mar 29 14:10:51 PDT 2011


Reviewed-by: Steven Dake <sdake at redhat.com>

On 03/29/2011 02:55 AM, Jan Friesse wrote:
> Backport of Corosync d99fba72e65545d8a3573b754525bd2ec8dcc540
> 
> Consider 5 nodes.
> 
> Nodes 3 and 4 are stopped (by random stopping); nodes 1, 2 and 5 form a new
> configuration, and during recovery nodes 1 and 2 are stopped (via service
> corosync stop).  This causes node 5 never to finish recovery within the timeout
> period, triggering a token loss in recovery.  Bug #623176 resolved an assert
> which happens because the full ring id was being restored.  The resolution
> to Bug #623176 was to not restore the full ring id, and instead operate
> (according to specifications) on the new ring id.  Unfortunately this exposes
> a problem whereby the restarting of nodes 1-4 generates the same ring id.
> This ring id gets to the recovery failed node 5 which is now in gather,
> and triggers a condition not accounted for in the original totem specification.
> 
> It appears later work from Dr. Agarwal's PhD dissertation considers this
> scenario.  That solution entails rejecting the regular token in the above
> condition.  Since the ring id is also used to make decisions for commit token
> acceptance, we must also take care to reject the regular token in all cases
> after transitioning from OPERATIONAL.
> 
> Signed-off-by: Jan Friesse <jfriesse at redhat.com>
> ---
>  branches/whitetank/exec/totemsrp.c |   12 ++++++++++++
>  1 files changed, 12 insertions(+), 0 deletions(-)
> 
> diff --git a/branches/whitetank/exec/totemsrp.c b/branches/whitetank/exec/totemsrp.c
> index 5f3c319..9fe79e7 100644
> --- a/branches/whitetank/exec/totemsrp.c
> +++ b/branches/whitetank/exec/totemsrp.c
> @@ -498,6 +498,8 @@ struct totemsrp_instance {
>  	unsigned int my_pbl;
>  
>  	unsigned int my_cbl;
> +
> +	uint32_t orf_token_discard;
>  };
>  
>  struct message_handlers {
> @@ -637,6 +639,8 @@ void totemsrp_instance_initialize (struct totemsrp_instance *instance)
>  	instance->my_high_seq_received = SEQNO_START_MSG;
>  
>  	instance->my_high_delivered = SEQNO_START_MSG;
> +
> +	instance->orf_token_discard = 0;
>  }
>  
>  void main_token_seqid_get (
> @@ -1461,6 +1465,7 @@ static void timer_function_orf_token_timeout (void *data)
>  			log_printf (instance->totemsrp_log_level_notice,
>  				"The token was lost in the RECOVERY state.\n");
>  			memb_recovery_state_token_loss (instance);
> +			instance->orf_token_discard = 1;
>  			break;
>  	}
>  }
> @@ -1711,6 +1716,8 @@ static void memb_state_gather_enter (
>  	struct totemsrp_instance *instance,
>  	int gather_from)
>  {
> +	instance->orf_token_discard = 1;
> +
>  	memb_set_merge (
>  		&instance->my_id, 1,
>  		instance->my_proc_list, &instance->my_proc_list_entries);
> @@ -1823,6 +1830,8 @@ static void memb_state_recovery_enter (
>  	log_printf (instance->totemsrp_log_level_notice,
>  		"entering RECOVERY state.\n");
>  
> +	instance->orf_token_discard = 0;
> +
>  	instance->my_high_ring_delivered = 0;
>  
>  	sq_reinit (&instance->recovery_sort_queue, SEQNO_START_MSG);
> @@ -3278,6 +3287,9 @@ static int message_handler_orf_token (
>  			/ 1000.0);
>  #endif
>  
> +	if (instance->orf_token_discard) {
> +		return (0);
> +	}
>  #ifdef TEST_DROP_ORF_TOKEN_PERCENTAGE
>  	if (random()%100 < TEST_DROP_ORF_TOKEN_PERCENTAGE) {
>  		return (0);



More information about the Openais mailing list