[Openais] firewire

ray klassen julius_ahenobarbus at yahoo.co.uk
Tue Mar 8 20:36:55 PST 2011


As requested

COROSYNC.CONF ********************************************************

# Please read the corosync.conf.5 manual page
compatibility: whitetank
cluster {
        name: HA

        clusternodes {
                clusternode {
                        votes: 1
                        nodeid: 1
                        name: ulmo
                }
                clusternode {
                        votes: 1
                        nodeid: 2
                        name: osse
                }
        }
        cman {
                expected_votes: 2
                cluster_id: 1
                nodename: osse
                two-node: 1
                max_queued: 10
        }
}
service {
        name: corosync_cman
        ver: 0
}
quorum {
        provider: quorum_cman
}

aisexec {
        user: root 
        group: root
}



totem {
    version: 2
 
     # How long before declaring a token lost (ms)
     token:          5000
 
     # How many token retransmits before forming a new configuration
     token_retransmits_before_loss_const: 20
 
     # How long to wait for join messages in the membership protocol (ms)
     join:           1000
 
     # How long to wait for consensus to be achieved before starting a
     # new round of membership configuration (ms)
     consensus:      7500
 
     # Turn off the virtual synchrony filter
     vsftype:        none
 
     # Number of messages that may be sent by one processor on receipt
     # of the token
     max_messages:   20
 
     # Disable encryption
     secauth:    off
 
     # How many threads to use for encryption/decryption
     threads:       0
     
     # Limit generated nodeids to 31-bits (positive signed integers)
     clear_node_high_bit: yes
     
     # Optionally assign a fixed node id (integer)
     # nodeid:         1234
    interface {
        member {
            memberaddr: 10.10.10.20
        }
        member {
            memberaddr: 10.10.10.10
        }
        ringnumber: 0
        bindnetaddr: 10.10.10.0
        mcastport: 5405
    }
    transport: udpu
}

logging {
    fileline: off
    to_logfile: yes
    to_syslog: yes
    logfile: /var/log/cluster/corosync.log
    debug: off
    timestamp: on
    logger_subsys {
        subsys: AMF
        debug: off
    }
}

amf {
     mode: disabled
 }
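
For anyone reading this in the archive later: as far as I know the dump below
is just what corosync-objctl prints when run with no arguments on a node where
the stack is already up, along the lines of:

    # dump the live object database and grab the config file
    corosync-objctl > /tmp/objctl-dump.txt
    cat /etc/corosync/corosync.conf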


COROSYNC-OBJCTL ****************************************************

compatibility=whitetank
cluster.name=HA
cluster.clusternodes.clusternode.votes=1
cluster.clusternodes.clusternode.nodeid=1
cluster.clusternodes.clusternode.name=ulmo
cluster.clusternodes.clusternode.votes=1
cluster.clusternodes.clusternode.nodeid=2
cluster.clusternodes.clusternode.name=osse
cluster.cman.expected_votes=2
cluster.cman.cluster_id=1
cluster.cman.nodename=osse
cluster.cman.two-node=1
cluster.cman.max_queued=10
service.name=corosync_cman
service.ver=0
quorum.provider=quorum_cman
aisexec.user=root
aisexec.group=root
totem.version=2
totem.token=5000
totem.token_retransmits_before_loss_const=20
totem.join=1000
totem.consensus=7500
totem.vsftype=none
totem.max_messages=20
totem.secauth=off
totem.threads=0
totem.clear_node_high_bit=yes
totem.transport=udpu
totem.interface.ringnumber=0
totem.interface.bindnetaddr=10.10.10.0
totem.interface.mcastport=5405
totem.interface.member.memberaddr=10.10.10.20
totem.interface.member.memberaddr=10.10.10.10
logging.fileline=off
logging.to_logfile=yes
logging.to_syslog=yes
logging.debug=off
logging.logfile=/var/log/cluster/corosync.log
logging.timestamp=on
logging.logger_subsys.subsys=AMF
logging.logger_subsys.debug=off
amf.mode=disabled
runtime.services.cman.service_id=9
runtime.services.evs.service_id=0
runtime.services.evs.0.tx=0
runtime.services.evs.0.rx=0
runtime.services.cfg.service_id=7
runtime.services.cfg.0.tx=0
runtime.services.cfg.0.rx=0
runtime.services.cfg.1.tx=0
runtime.services.cfg.1.rx=0
runtime.services.cfg.2.tx=0
runtime.services.cfg.2.rx=0
runtime.services.cfg.3.tx=0
runtime.services.cfg.3.rx=0
runtime.services.cpg.service_id=8
runtime.services.cpg.0.tx=5
runtime.services.cpg.0.rx=10
runtime.services.cpg.1.tx=0
runtime.services.cpg.1.rx=0
runtime.services.cpg.2.tx=0
runtime.services.cpg.2.rx=0
runtime.services.cpg.3.tx=40
runtime.services.cpg.3.rx=60
runtime.services.cpg.4.tx=0
runtime.services.cpg.4.rx=0
runtime.services.cpg.5.tx=1
runtime.services.cpg.5.rx=2
runtime.services.confdb.service_id=11
runtime.services.pload.service_id=13
runtime.services.pload.0.tx=0
runtime.services.pload.0.rx=0
runtime.services.pload.1.tx=0
runtime.services.pload.1.rx=0
runtime.services.quorum.service_id=12
runtime.connections.active=7
runtime.connections.closed=1
runtime.connections.pacemakerd:1506:0.service_id=7
runtime.connections.pacemakerd:1506:0.client_pid=1506
runtime.connections.pacemakerd:1506:0.responses=2
runtime.connections.pacemakerd:1506:0.dispatched=0
runtime.connections.pacemakerd:1506:0.requests=2
runtime.connections.pacemakerd:1506:0.sem_retry_count=0
runtime.connections.pacemakerd:1506:0.send_retry_count=0
runtime.connections.pacemakerd:1506:0.recv_retry_count=0
runtime.connections.pacemakerd:1506:0.flow_control=0
runtime.connections.pacemakerd:1506:0.flow_control_count=0
runtime.connections.pacemakerd:1506:0.queue_size=0
runtime.connections.pacemakerd:1506:14.service_id=8
runtime.connections.pacemakerd:1506:14.client_pid=1506
runtime.connections.pacemakerd:1506:14.responses=10
runtime.connections.pacemakerd:1506:14.dispatched=10
runtime.connections.pacemakerd:1506:14.requests=10
runtime.connections.pacemakerd:1506:14.sem_retry_count=0
runtime.connections.pacemakerd:1506:14.send_retry_count=0
runtime.connections.pacemakerd:1506:14.recv_retry_count=0
runtime.connections.pacemakerd:1506:14.flow_control=0
runtime.connections.pacemakerd:1506:14.flow_control_count=0
runtime.connections.pacemakerd:1506:14.queue_size=0
runtime.connections.stonithd:1510:15.service_id=8
runtime.connections.stonithd:1510:15.client_pid=1510
runtime.connections.stonithd:1510:15.responses=2
runtime.connections.stonithd:1510:15.dispatched=1
runtime.connections.stonithd:1510:15.requests=2
runtime.connections.stonithd:1510:15.sem_retry_count=0
runtime.connections.stonithd:1510:15.send_retry_count=0
runtime.connections.stonithd:1510:15.recv_retry_count=0
runtime.connections.stonithd:1510:15.flow_control=0
runtime.connections.stonithd:1510:15.flow_control_count=0
runtime.connections.stonithd:1510:15.queue_size=0
runtime.connections.attrd:1513:16.service_id=8
runtime.connections.attrd:1513:16.client_pid=1513
runtime.connections.attrd:1513:16.responses=5
runtime.connections.attrd:1513:16.dispatched=7
runtime.connections.attrd:1513:16.requests=5
runtime.connections.attrd:1513:16.sem_retry_count=0
runtime.connections.attrd:1513:16.send_retry_count=0
runtime.connections.attrd:1513:16.recv_retry_count=0
runtime.connections.attrd:1513:16.flow_control=0
runtime.connections.attrd:1513:16.flow_control_count=0
runtime.connections.attrd:1513:16.queue_size=0
runtime.connections.cib:1511:17.service_id=8
runtime.connections.cib:1511:17.client_pid=1511
runtime.connections.cib:1511:17.responses=21
runtime.connections.cib:1511:17.dispatched=22
runtime.connections.cib:1511:17.requests=21
runtime.connections.cib:1511:17.sem_retry_count=0
runtime.connections.cib:1511:17.send_retry_count=0
runtime.connections.cib:1511:17.recv_retry_count=0
runtime.connections.cib:1511:17.flow_control=0
runtime.connections.cib:1511:17.flow_control_count=0
runtime.connections.cib:1511:17.queue_size=0
runtime.connections.crmd:1515:18.service_id=8
runtime.connections.crmd:1515:18.client_pid=1515
runtime.connections.crmd:1515:18.responses=12
runtime.connections.crmd:1515:18.dispatched=15
runtime.connections.crmd:1515:18.requests=12
runtime.connections.crmd:1515:18.sem_retry_count=0
runtime.connections.crmd:1515:18.send_retry_count=0
runtime.connections.crmd:1515:18.recv_retry_count=0
runtime.connections.crmd:1515:18.flow_control=0
runtime.connections.crmd:1515:18.flow_control_count=0
runtime.connections.crmd:1515:18.queue_size=0
runtime.connections.corosync-objctl:1877:20.service_id=11
runtime.connections.corosync-objctl:1877:20.client_pid=1877
runtime.connections.corosync-objctl:1877:20.responses=278
runtime.connections.corosync-objctl:1877:20.dispatched=0
runtime.connections.corosync-objctl:1877:20.requests=281
runtime.connections.corosync-objctl:1877:20.sem_retry_count=0
runtime.connections.corosync-objctl:1877:20.send_retry_count=0
runtime.connections.corosync-objctl:1877:20.recv_retry_count=0
runtime.connections.corosync-objctl:1877:20.flow_control=0
runtime.connections.corosync-objctl:1877:20.flow_control_count=0
runtime.connections.corosync-objctl:1877:20.queue_size=0
runtime.totem.pg.mrp.srp.orf_token_tx=0
runtime.totem.pg.mrp.srp.orf_token_rx=142206
runtime.totem.pg.mrp.srp.memb_merge_detect_tx=0
runtime.totem.pg.mrp.srp.memb_merge_detect_rx=70047
runtime.totem.pg.mrp.srp.memb_join_tx=2
runtime.totem.pg.mrp.srp.memb_join_rx=3
runtime.totem.pg.mrp.srp.mcast_tx=77
runtime.totem.pg.mrp.srp.mcast_retx=0
runtime.totem.pg.mrp.srp.mcast_rx=115
runtime.totem.pg.mrp.srp.memb_commit_token_tx=2
runtime.totem.pg.mrp.srp.memb_commit_token_rx=2
runtime.totem.pg.mrp.srp.token_hold_cancel_tx=37
runtime.totem.pg.mrp.srp.token_hold_cancel_rx=58
runtime.totem.pg.mrp.srp.operational_entered=1
runtime.totem.pg.mrp.srp.operational_token_lost=0
runtime.totem.pg.mrp.srp.gather_entered=2
runtime.totem.pg.mrp.srp.gather_token_lost=0
runtime.totem.pg.mrp.srp.commit_entered=1
runtime.totem.pg.mrp.srp.commit_token_lost=0
runtime.totem.pg.mrp.srp.recovery_entered=1
runtime.totem.pg.mrp.srp.recovery_token_lost=0
runtime.totem.pg.mrp.srp.consensus_timeouts=0
runtime.totem.pg.mrp.srp.mtt_rx_token=196
runtime.totem.pg.mrp.srp.avg_token_workload=0
runtime.totem.pg.mrp.srp.avg_backlog_calc=0
runtime.totem.pg.mrp.srp.rx_msg_dropped=0
runtime.totem.pg.mrp.srp.members.168430090.ip=r(0) ip(10.10.10.10) 
runtime.totem.pg.mrp.srp.members.168430090.join_count=1
runtime.totem.pg.mrp.srp.members.168430090.status=joined
runtime.totem.pg.mrp.srp.members.336202250.ip=r(0) ip(10.10.10.20) 
runtime.totem.pg.mrp.srp.members.336202250.join_count=1
runtime.totem.pg.mrp.srp.members.336202250.status=joined
runtime.blackbox.dump_flight_data=no
runtime.blackbox.dump_state=no



(Thanks for taking the time with this...)

----- Original Message ----
From: Steven Dake <sdake at redhat.com>
To: ray klassen <julius_ahenobarbus at yahoo.co.uk>
Cc: openais at lists.linux-foundation.org
Sent: Tue, 8 March, 2011 13:56:45
Subject: Re: [Openais] firewire

This indicates you don't have a proper configuration.

Please send your corosync-objctl output and config file as requested
previously.

Thanks
-steve
On 03/08/2011 02:40 PM, ray klassen wrote:
> One other thing: in this configuration, corosync itself has to be shot in the
> head to stop. /etc/init.d/corosync stop results in something like "Waiting
> for corosync services to stop" and lines and lines of dots. kill -9 seems to
> be the only way.
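> Concretely, each time it comes down to roughly this (a sketch, assuming
> pidof is available):
>
>     # try the init script first
>     /etc/init.d/corosync stop
>     # if it just prints dots and never returns, kill it hard
>     kill -9 $(pidof corosync)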
> 
> 
> 
> 
> ----- Original Message ----
> From: ray klassen <julius_ahenobarbus at yahoo.co.uk>
> To: openais at lists.linux-foundation.org
> Sent: Tue, 8 March, 2011 13:12:27
> Subject: Re: [Openais] firewire
> 
> MCP is not really mentioned anywhere except ClusterGuy's blog (maybe you're
> him), but from that I'm assuming you mean starting pacemaker separately, via
> /etc/init.d/pacemaker. So I removed the /etc/corosync/services.d/pcmk file.
> I also added 'cman' (yum install cman -- for mailing list readers yet to
> come), following "alternative 2" from ClusterGuy's page on the MCP:
> http://theclusterguy.clusterlabs.org/post/907043024/introducing-the-pacemaker-master-control-process-for
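>
> For the archive, the whole change boils down to roughly this (a sketch from
> my Fedora 14 boxes, so treat the paths and package manager as assumptions):
>
>     # stop letting the corosync plugin spawn pacemaker
>     rm /etc/corosync/services.d/pcmk
>     # install cman and let it drive corosync ("alternative 2" from the blog post)
>     yum install cman
>     # bring the stack up: cman starts corosync, then pacemaker's MCP starts its daemons
>     service cman start
>     service pacemaker start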
> 
> 
> And it does work. I can now view a 'partition with quorum' in crm_mon, over
> firewire, with udpu.
> 
> 
> I just don't really know how it works. How does pacemaker communicate with
> the stack? Unix sockets? Shared memory? How does corosync communicate with
> the stack?
> 
> 
> 
> 
> 
> 
> 
> ----- Original Message ----
> From: Steven Dake <sdake at redhat.com>
> To: ray klassen <julius_ahenobarbus at yahoo.co.uk>
> Cc: openais at lists.linux-foundation.org
> Sent: Tue, 8 March, 2011 10:02:28
> Subject: Re: [Openais] firewire
> 
> First off, I'd recommend using the "MCP" process that is part of
> Pacemaker rather than the plugin.
> 
> Second, if you could run corosync-objctl and put the output on the list,
> along with your /etc/corosync/corosync.conf, that would be helpful.
> 
> Regards
> -steve
> 
> On 03/08/2011 09:19 AM, ray klassen wrote:
>> What I'm finding on further investigation is that all the pacemaker
>> child processes are dying on startup:
>>
>>
>> Mar 08 08:15:28 corosync [pcmk  ] ERROR: pcmk_wait_dispatch: Child
>> process lrmd exited (pid=6356, rc=100)
>> Mar 08 08:15:28 corosync [pcmk  ] notice: pcmk_wait_dispatch: Child
>> process lrmd no longer wishes to be respawned
>> Mar 08 08:15:28 corosync [pcmk  ] debug: send_cluster_id: Leaving
>> born-on unset: 308
>> Mar 08 08:15:28 corosync [pcmk  ] debug: send_cluster_id: Local update:
>> id=168430090, born=0, seq=308
>> Mar 08 08:15:28 corosync [pcmk  ] info: update_member: Node wwww.com now
>> has process list: 00000000000000000000000000111302 (1118978)
>> Mar 08 08:15:28 corosync [pcmk  ] ERROR: pcmk_wait_dispatch: Child
>> process cib exited (pid=6355, rc=100)
>> Mar 08 08:15:28 corosync [pcmk  ] notice: pcmk_wait_dispatch: Child
>> process cib no longer wishes to be respawned
>> Mar 08 08:15:28 corosync [pcmk  ] debug: send_cluster_id: Leaving
>> born-on unset: 308
>> Mar 08 08:15:28 corosync [pcmk  ] debug: send_cluster_id: Local update:
>> id=168430090, born=0, seq=308
>> Mar 08 08:15:28 corosync [pcmk  ] info: update_member: Node wwww.com now
>> has process list: 00000000000000000000000000111202 (1118722)
>> Mar 08 08:15:28 corosync [pcmk  ] ERROR: pcmk_wait_dispatch: Child
>> process crmd exited (pid=6359, rc=100)
>> Mar 08 08:15:28 corosync [pcmk  ] notice: pcmk_wait_dispatch: Child
>> process crmd no longer wishes to be respawned
>> Mar 08 08:15:28 corosync [pcmk  ] debug: send_cluster_id: Leaving
>> born-on unset: 308
>> Mar 08 08:15:28 corosync [pcmk  ] debug: send_cluster_id: Local update:
>> id=168430090, born=0, seq=308
>> Mar 08 08:15:28 corosync [pcmk  ] info: update_member: Node wwww.com now
>> has process list: 00000000000000000000000000111002 (1118210)
>> Mar 08 08:15:28 corosync [TOTEM ] mcasted message added to pending queue
>> Mar 08 08:15:28 corosync [pcmk  ] ERROR: pcmk_wait_dispatch: Child
>> process attrd exited (pid=6357, rc=100)
>> Mar 08 08:15:28 corosync [pcmk  ] notice: pcmk_wait_dispatch: Child
>> process attrd no longer wishes to be respawned
>> Mar 08 08:15:28 corosync [pcmk  ] debug: send_cluster_id: Leaving
>> born-on unset: 308
>> Mar 08 08:15:28 corosync [pcmk  ] debug: send_cluster_id: Local update:
>> id=168430090, born=0, seq=308
>> Mar 08 08:15:28 corosync [pcmk  ] info: update_member: Node wwww.com now
>> has process list: 00000000000000000000000000110002 (1114114)
>> Mar 08 08:15:28 corosync [pcmk  ] ERROR: pcmk_wait_dispatch: Child
>> process pengine exited (pid=6358, rc=100)
>> Mar 08 08:15:28 corosync [pcmk  ] notice: pcmk_wait_dispatch: Child
>> process pengine no longer wishes to be respawned
>> Mar 08 08:15:28 corosync [pcmk  ] debug: send_cluster_id: Leaving
>> born-on unset: 308
>> Mar 08 08:15:28 corosync [pcmk  ] debug: send_cluster_id: Local update:
>> id=168430090, born=0, seq=308
>> Mar 08 08:15:28 corosync [pcmk  ] info: update_member: Node wwww.com now
>> has process list: 00000000000000000000000000100002 (1048578)
>> Mar 08 08:15:28 corosync [TOTEM ] mcasted message added to pending queue
>> Mar 08 08:15:28 corosync [pcmk  ] ERROR: pcmk_wait_dispatch: Child
>> process stonith-ng exited (pid=6354, rc=100)
>> Mar 08 08:15:28 corosync [pcmk  ] notice: pcmk_wait_dispatch: Child
>> process stonith-ng no longer wishes to be respawned
>> Mar 08 08:15:28 corosync [pcmk  ] debug: send_cluster_id: Leaving
>> born-on unset: 308
>> Mar 08 08:15:28 corosync [pcmk  ] debug: send_cluster_id: Local update:
>> id=168430090, born=0, seq=308
>> Mar 08 08:15:28 corosync [pcmk  ] info: update_member: Node wwww.com now
>> has process list: 00000000000000000000000000000002 (2)
>> Mar 08 08:15:28 corosync [TOTEM ] mcasted message added to pending queue
>> Mar
>>
>>
>>
>> ------------------------------------------------------------------------
>> From: Dan Frincu <df.cluster at gmail.com>
>> To: openais at lists.linux-foundation.org
>> Sent: Tue, 8 March, 2011 2:45:00
>> Subject: Re: [Openais] firewire
>>
>>
>>
>> On Tue, Mar 8, 2011 at 2:07 AM, ray klassen
>> <julius_ahenobarbus at yahoo.co.uk> wrote:
>>
>>     well I have the 1.3.0 version of corosync seemingly happy with udpu and
>>     firewire. The logs report connection back and forth between the two boxes.
>>     But now crm_mon never connects. Does pacemaker not support udpu yet?
>>
>>
>> Pacemaker is the Cluster Resource Manager, so it doesn't really care
>> about the underlying method that the Messaging and Membership layer uses
>> to connect between nodes.
>>
>> I've had this issue (crm_mon not connecting) when I performed an upgrade
>> from openais-0.80 to corosync-1.3.0 with udpu; I eventually solved it by
>> rebooting the servers. In your case I doubt it's an upgrade between
>> versions of software, since you've reinstalled.
>>
>> My 2 cents.
>>  
>>
>>
>>     pacemaker-1.1.4-5.fc14.i686
>>     (I switched to fedora from debian to get the latest version of corosync)
>>
>>
>>
>>
>>     ----- Original Message ----
>>     From: Steven Dake <sdake at redhat.com>
>>     To: ray klassen <julius_ahenobarbus at yahoo.co.uk>
>>     Cc: openais at lists.linux-foundation.org
>>     Sent: Thu, 3 March, 2011 16:56:21
>>     Subject: Re: [Openais] firewire
>>
>>     On 03/03/2011 05:45 PM, ray klassen wrote:
>>     > Has anyone had any success running corosync with the firewire-net
>>     > module? I want to set up a two node router cluster with a dedicated
>>     > link between the routers.
>>
>>     > Only problem is, I've run out of ethernet ports, so I've got ip
>>     > configured on the firewire ports. Pinging is no problem between the
>>     > addresses. Funny thing is, on one of them (and they're really
>>     > identical) corosync starts up no problem at all and stays up. On the
>>     > other one corosync fails with "ERROR: ais_dispatch: Receiving message
>>     > body failed: (2) Library error: Resource temporarily unavailable (11)."
>>     >
>>     >
>>     > Reading up on the firewire-net mailing list's outstanding issues
>>     > turned up that multicast wasn't fully implemented, so my corosync.conf
>>     > files both say broadcast: yes instead of mcastaddr.
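>>     > I.e. each interface section ends up looking roughly like this (a
>>     > sketch with the addresses used elsewhere in this thread, not the
>>     > exact file):
>>     >
>>     >     interface {
>>     >             ringnumber: 0
>>     >             bindnetaddr: 10.10.10.0
>>     >             broadcast: yes
>>     >             mcastport: 5405
>>     >     }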
>>     >
>>     > Firewire-net was emitting "fwnet_write_complete: failed: 10" errors,
>>     > so I pulled down the latest vanilla kernel, 2.6.37.2, and am running
>>     > that, with far fewer of that error.
>>     >
>>     > otherwise versions are
>>     > Debian Squeeze
>>     > Corosync Version: 1.2.1-4
>>     > Pacemaker 1.0.9.1+hg15626-1
>>     >
>>     > Is this a hopeless case? I've got a debug log from corosync that
>>     > doesn't seem that helpful. If you want I can post that as well.
>>     >
>>     > Thanks
>>     >
>>
>>     I'm hesitant to suggest using firewire as a transport, as you're the
>>     first person who has ever tried it.  If multicast is broken on your
>>     hardware, you might try the "udpu" transport, which uses UDP unicast
>>     only (no multicast required).
>>
>>     Regards
>>     -steve
>>
>>     >
>>     >
>>     > _______________________________________________
>>     > Openais mailing list
>>     > Openais at lists.linux-foundation.org
>>     > https://lists.linux-foundation.org/mailman/listinfo/openais
>>
>>
>>
>>     _______________________________________________
>>     Openais mailing list
>>    Openais at lists.linux-foundation.org
>>    https://lists.linux-foundation.org/mailman/listinfo/openais
>>
>>
>>
>>
>> -- 
>> Dan Frincu
>> CCNA, RHCE
>>
>>
>>
>>
>> _______________________________________________
>> Openais mailing list
>> Openais at lists.linux-foundation.org
>> https://lists.linux-foundation.org/mailman/listinfo/openais
> 
> 
>      
> _______________________________________________
> Openais mailing list
> Openais at lists.linux-foundation.org
> https://lists.linux-foundation.org/mailman/listinfo/openais
> 
> 
> 
>      
> _______________________________________________
> Openais mailing list
> Openais at lists.linux-foundation.org
> https://lists.linux-foundation.org/mailman/listinfo/openais


      

