// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2005 Oracle. All rights reserved.
 */

/* This quorum hack is only here until we transition to some more rational
 * approach that is driven from userspace. Honest. No foolin'.
 *
 * Imagine two nodes lose network connectivity to each other but they're still
 * up and operating in every other way. Presumably a network timeout indicates
 * that a node is broken and should be recovered. They can't both recover each
 * other and both carry on without serialising their access to the file system.
 * They need to decide who is authoritative. Now extend that problem to
 * arbitrary groups of nodes losing connectivity between each other.
 *
 * So we declare that a node which has given up on connecting to a majority
 * of nodes who are still heartbeating will fence itself.
 *
 * There are huge opportunities for races here. After we give up on a node's
 * connection we need to wait long enough to give heartbeat an opportunity
 * to declare the node as truly dead. We also need to be careful with the
 * race between when we see a node start heartbeating and when we connect
 * to it.
 *
 * So nodes that are in this transition put a hold on the quorum decision
 * with a counter. As they fall out of this transition they drop the count
 * and if they're the last, they fire off the decision.
 */
#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/reboot.h>

#include "heartbeat.h"
#include "nodemanager.h"
#define MLOG_MASK_PREFIX ML_QUORUM
#include "masklog.h"
#include "quorum.h"

static struct o2quo_state {
	spinlock_t		qs_lock;
	struct work_struct	qs_work;
	int			qs_pending;
	int			qs_heartbeating;
	unsigned long		qs_hb_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
	int			qs_connected;
	unsigned long		qs_conn_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
	int			qs_holds;
	unsigned long		qs_hold_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
} o2quo_state;

/* this is horribly heavy-handed. It should instead flip the file
 * system RO and call some userspace script. */
static void o2quo_fence_self(void)
{
	/* panic spins with interrupts enabled. with preempt
	 * threads can still schedule, etc, etc */
	o2hb_stop_all_regions();

	switch (o2nm_single_cluster->cl_fence_method) {
	case O2NM_FENCE_PANIC:
		panic("*** ocfs2 is very sorry to be fencing this system by "
		      "panicking ***\n");
		break;
	default:
		WARN_ON(o2nm_single_cluster->cl_fence_method >=
			O2NM_FENCE_METHODS);
		fallthrough;
	case O2NM_FENCE_RESET:
		printk(KERN_ERR "*** ocfs2 is very sorry to be fencing this "
		       "system by restarting ***\n");
		emergency_restart();
		break;
	}
}
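
/*
 * Worked example of the rule described at the top of this file.  The
 * numbers are illustrative only, mirroring the checks made in
 * o2quo_make_decision() below:
 *
 *   qs_heartbeating == 5 (odd):  quorum = (5 + 1) / 2 = 3, so a node
 *      with qs_connected < 3 fences itself.
 *
 *   qs_heartbeating == 4 (even): quorum = 4 / 2 = 2, so a node with
 *      qs_connected < 2 fences itself; at exactly 2 it survives only if
 *      the lowest-numbered heartbeating node is among the nodes it can
 *      reach, so at most one half of a clean split carries on.
 */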

/* Indicate that a timeout occurred on a heartbeat region write. The
 * other nodes in the cluster may consider us dead at that time so we
 * want to "fence" ourselves so that we don't scribble on the disk
 * after they think they've recovered us. This can't solve all
 * problems related to writeout after recovery but this hack can at
 * least close some of those gaps. When we have real fencing, this can
 * go away as our node would be fenced externally before other nodes
 * begin recovery. */
void o2quo_disk_timeout(void)
{
	o2quo_fence_self();
}

static void o2quo_make_decision(struct work_struct *work)
{
	int quorum;
	int lowest_hb, lowest_reachable = 0, fence = 0;
	struct o2quo_state *qs = &o2quo_state;

	spin_lock_bh(&qs->qs_lock);

	lowest_hb = find_first_bit(qs->qs_hb_bm, O2NM_MAX_NODES);
	if (lowest_hb != O2NM_MAX_NODES)
		lowest_reachable = test_bit(lowest_hb, qs->qs_conn_bm);

	mlog(0, "heartbeating: %d, connected: %d, "
	     "lowest: %d (%sreachable)\n", qs->qs_heartbeating,
	     qs->qs_connected, lowest_hb, lowest_reachable ? "" : "un");

	if (!test_bit(o2nm_this_node(), qs->qs_hb_bm) ||
	    qs->qs_heartbeating == 1)
		goto out;

	if (qs->qs_heartbeating & 1) {
		/* the odd-numbered cluster case is straightforward --
		 * if we can't talk to the majority we're hosed */
		quorum = (qs->qs_heartbeating + 1) / 2;
		if (qs->qs_connected < quorum) {
			mlog(ML_ERROR, "fencing this node because it is "
			     "only connected to %u nodes and %u is needed "
			     "to make a quorum out of %u heartbeating nodes\n",
			     qs->qs_connected, quorum,
			     qs->qs_heartbeating);
			fence = 1;
		}
	} else {
		/* the even-numbered cluster adds the possibility of each half
		 * of the cluster being able to talk amongst themselves. in
		 * that case we're hosed if we can't talk to the group that has
		 * the lowest numbered node */
		quorum = qs->qs_heartbeating / 2;
		if (qs->qs_connected < quorum) {
			mlog(ML_ERROR, "fencing this node because it is "
			     "only connected to %u nodes and %u is needed "
			     "to make a quorum out of %u heartbeating nodes\n",
			     qs->qs_connected, quorum,
			     qs->qs_heartbeating);
			fence = 1;
		} else if ((qs->qs_connected == quorum) &&
			   !lowest_reachable) {
			mlog(ML_ERROR, "fencing this node because it is "
			     "connected to a half-quorum of %u out of %u "
			     "nodes which doesn't include the lowest active "
			     "node %u\n", quorum, qs->qs_heartbeating,
			     lowest_hb);
			fence = 1;
		}
	}

out:
	if (fence) {
		spin_unlock_bh(&qs->qs_lock);
		o2quo_fence_self();
	} else {
		mlog(ML_NOTICE, "not fencing this node, heartbeating: %d, "
		     "connected: %d, lowest: %d (%sreachable)\n",
		     qs->qs_heartbeating, qs->qs_connected, lowest_hb,
		     lowest_reachable ? "" : "un");
		spin_unlock_bh(&qs->qs_lock);
	}
}
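
/*
 * The decision above reduces to a pure function of three inputs.  The
 * sketch below is illustrative only -- it is not built (hence the #if 0)
 * and quorum_would_fence() is not a real symbol in this file.  It
 * restates the branches of o2quo_make_decision() without the locking,
 * the logging, and the early exit for qs_heartbeating == 1.
 */
#if 0
static int quorum_would_fence(int heartbeating, int connected,
			      int lowest_reachable)
{
	int quorum;

	if (heartbeating & 1) {
		/* odd cluster: fence when we see less than a majority */
		quorum = (heartbeating + 1) / 2;
		return connected < quorum;
	}

	/* even cluster: a half-quorum survives only if it includes the
	 * lowest-numbered heartbeating node */
	quorum = heartbeating / 2;
	if (connected < quorum)
		return 1;
	return connected == quorum && !lowest_reachable;
}
#endif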

static void o2quo_set_hold(struct o2quo_state *qs, u8 node)
{
	assert_spin_locked(&qs->qs_lock);

	if (!test_and_set_bit(node, qs->qs_hold_bm)) {
		qs->qs_holds++;
		mlog_bug_on_msg(qs->qs_holds == O2NM_MAX_NODES,
				"node %u\n", node);
		mlog(0, "node %u, %d total\n", node, qs->qs_holds);
	}
}

static void o2quo_clear_hold(struct o2quo_state *qs, u8 node)
{
	assert_spin_locked(&qs->qs_lock);

	if (test_and_clear_bit(node, qs->qs_hold_bm)) {
		mlog(0, "node %u, %d total\n", node, qs->qs_holds - 1);
		if (--qs->qs_holds == 0) {
			if (qs->qs_pending) {
				qs->qs_pending = 0;
				schedule_work(&qs->qs_work);
			}
		}
		mlog_bug_on_msg(qs->qs_holds < 0, "node %u, holds %d\n",
				node, qs->qs_holds);
	}
}

/* as a node comes up we delay the quorum decision until we know the fate of
 * the connection. the hold will be dropped in conn_up or hb_down. it might be
 * perpetuated by conn_err until hb_down. if we already have a conn, we might
 * be dropping a hold that conn_up got. */
void o2quo_hb_up(u8 node)
{
	struct o2quo_state *qs = &o2quo_state;

	spin_lock_bh(&qs->qs_lock);

	qs->qs_heartbeating++;
	mlog_bug_on_msg(qs->qs_heartbeating == O2NM_MAX_NODES,
			"node %u\n", node);
	mlog_bug_on_msg(test_bit(node, qs->qs_hb_bm), "node %u\n", node);
	set_bit(node, qs->qs_hb_bm);

	mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating);

	if (!test_bit(node, qs->qs_conn_bm))
		o2quo_set_hold(qs, node);
	else
		o2quo_clear_hold(qs, node);

	spin_unlock_bh(&qs->qs_lock);
}

/* hb going down releases any holds we might have had due to this node from
 * conn_up, conn_err, or hb_up */
void o2quo_hb_down(u8 node)
{
	struct o2quo_state *qs = &o2quo_state;

	spin_lock_bh(&qs->qs_lock);

	qs->qs_heartbeating--;
	mlog_bug_on_msg(qs->qs_heartbeating < 0,
			"node %u, %d heartbeating\n",
			node, qs->qs_heartbeating);
	mlog_bug_on_msg(!test_bit(node, qs->qs_hb_bm), "node %u\n", node);
	clear_bit(node, qs->qs_hb_bm);

	mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating);

	o2quo_clear_hold(qs, node);

	spin_unlock_bh(&qs->qs_lock);
}

/* this tells us that we've decided that the node is still heartbeating
 * even though we've lost its conn. it must only be called after conn_err
 * and indicates that we must now make a quorum decision in the future,
 * though we might be doing so after waiting for holds to drain. Here
 * we'll be dropping the hold from conn_err. */
void o2quo_hb_still_up(u8 node)
{
	struct o2quo_state *qs = &o2quo_state;

	spin_lock_bh(&qs->qs_lock);

	mlog(0, "node %u\n", node);

	qs->qs_pending = 1;
	o2quo_clear_hold(qs, node);

	spin_unlock_bh(&qs->qs_lock);
}

/* This is analogous to hb_up. as a node's connection comes up we delay the
 * quorum decision until we see it heartbeating. the hold will be dropped in
 * hb_up or hb_down. it might be perpetuated by conn_err until hb_down. if
 * it's already heartbeating we might be dropping a hold that conn_up got. */
void o2quo_conn_up(u8 node)
{
	struct o2quo_state *qs = &o2quo_state;

	spin_lock_bh(&qs->qs_lock);

	qs->qs_connected++;
	mlog_bug_on_msg(qs->qs_connected == O2NM_MAX_NODES,
			"node %u\n", node);
	mlog_bug_on_msg(test_bit(node, qs->qs_conn_bm), "node %u\n", node);
	set_bit(node, qs->qs_conn_bm);

	mlog(0, "node %u, %d total\n", node, qs->qs_connected);

	if (!test_bit(node, qs->qs_hb_bm))
		o2quo_set_hold(qs, node);
	else
		o2quo_clear_hold(qs, node);

	spin_unlock_bh(&qs->qs_lock);
}
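
/*
 * Informal summary of the hold protocol used by the callbacks in this
 * file: each node contributes at most one bit in qs_hold_bm, taken and
 * dropped in pairs:
 *
 *   hb_up    -> conn_up       hold taken while waiting for the connection
 *   conn_up  -> hb_up         hold taken while waiting for heartbeat
 *   conn_err -> hb_down       hold kept until the node is declared dead
 *   conn_err -> hb_still_up   hold dropped, and qs_pending is set so that
 *                             the last hold to drain schedules the decision
 *
 * hb_down also releases whatever hold the node still had, whichever of
 * the callbacks above took it.
 */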

/* we've decided that we won't ever be connecting to the node again. if it's
 * still heartbeating we grab a hold that will delay decisions until either the
 * node stops heartbeating from hb_down or the caller decides that the node is
 * still up and calls still_up */
void o2quo_conn_err(u8 node)
{
	struct o2quo_state *qs = &o2quo_state;

	spin_lock_bh(&qs->qs_lock);

	if (test_bit(node, qs->qs_conn_bm)) {
		qs->qs_connected--;
		mlog_bug_on_msg(qs->qs_connected < 0,
				"node %u, connected %d\n",
				node, qs->qs_connected);

		clear_bit(node, qs->qs_conn_bm);

		if (test_bit(node, qs->qs_hb_bm))
			o2quo_set_hold(qs, node);
	}

	mlog(0, "node %u, %d total\n", node, qs->qs_connected);

	spin_unlock_bh(&qs->qs_lock);
}

void o2quo_init(void)
{
	struct o2quo_state *qs = &o2quo_state;

	spin_lock_init(&qs->qs_lock);
	INIT_WORK(&qs->qs_work, o2quo_make_decision);
}

void o2quo_exit(void)
{
	struct o2quo_state *qs = &o2quo_state;

	flush_work(&qs->qs_work);
}
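
/*
 * One possible caller sequence for a remote node, pieced together from
 * the comments above.  The function below is illustrative only and is
 * not built; the real callers live in the heartbeat and networking code.
 */
#if 0
static void example_remote_node_lifetime(u8 node)
{
	o2quo_hb_up(node);	/* node seen writing disk heartbeat */
	o2quo_conn_up(node);	/* network connection established */

	o2quo_conn_err(node);	/* connection lost; a hold is taken while
				 * the node is still heartbeating */
	o2quo_hb_still_up(node);/* caller decides it's still alive: drop
				 * the hold, mark a decision as pending */
	o2quo_hb_down(node);	/* heartbeat finally stops */
}
#endif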