Merge branch 'cluster' into for-next

2026-05-01 15:00:59 -07:00 · 2015-04-22 08:00:20 +10:00
parent 47d68979cc 97f6cd39da
commit d51e4fe6d6
12 changed files with 1709 additions and 82 deletions
@@ -0,0 +1,176 @@
+The cluster MD is a shared-device RAID for a cluster.
+
+
+1. On-disk format
+
+Separate write-intent-bitmap are used for each cluster node.
+The bitmaps record all writes that may have been started on that node,
+and may not yet have finished. The on-disk layout is:
+
+0                    4k                     8k                    12k
+-------------------------------------------------------------------
+| idle                | md super            | bm super [0] + bits |
+| bm bits[0, contd]   | bm super[1] + bits  | bm bits[1, contd]   |
+| bm super[2] + bits  | bm bits [2, contd]  | bm super[3] + bits  |
+| bm bits [3, contd]  |                     |                     |
+
+During "normal" functioning we assume the filesystem ensures that only one
+node writes to any given block at a time, so a write
+request will
+ - set the appropriate bit (if not already set)
+ - commit the write to all mirrors
+ - schedule the bit to be cleared after a timeout.
+
+Reads are just handled normally.  It is up to the filesystem to
+ensure one node doesn't read from a location where another node (or the same
+node) is writing.
+
+
+2. DLM Locks for management
+
+There are two locks for managing the device:
+
+2.1 Bitmap lock resource (bm_lockres)
+
+ The bm_lockres protects individual node bitmaps. They are named in the
+ form bitmap001 for node 1, bitmap002 for node and so on. When a node
+ joins the cluster, it acquires the lock in PW mode and it stays so
+ during the lifetime the node is part of the cluster. The lock resource
+ number is based on the slot number returned by the DLM subsystem. Since
+ DLM starts node count from one and bitmap slots start from zero, one is
+ subtracted from the DLM slot number to arrive at the bitmap slot number.
+
+3. Communication
+
+Each node has to communicate with other nodes when starting or ending
+resync, and metadata superblock updates.
+
+3.1 Message Types
+
+ There are 3 types, of messages which are passed
+
+ 3.1.1 METADATA_UPDATED: informs other nodes that the metadata has been
+   updated, and the node must re-read the md superblock. This is performed
+   synchronously.
+
+ 3.1.2 RESYNC: informs other nodes that a resync is initiated or ended
+   so that each node may suspend or resume the region.
+
+3.2 Communication mechanism
+
+ The DLM LVB is used to communicate within nodes of the cluster. There
+ are three resources used for the purpose:
+
+  3.2.1 Token: The resource which protects the entire communication
+   system. The node having the token resource is allowed to
+   communicate.
+
+  3.2.2 Message: The lock resource which carries the data to
+   communicate.
+
+  3.2.3 Ack: The resource, acquiring which means the message has been
+   acknowledged by all nodes in the cluster. The BAST of the resource
+   is used to inform the receive node that a node wants to communicate.
+
+The algorithm is:
+
+ 1. receive status
+
+   sender                         receiver                   receiver
+   ACK:CR                          ACK:CR                     ACK:CR
+
+ 2. sender get EX of TOKEN
+    sender get EX of MESSAGE
+    sender                        receiver                 receiver
+    TOKEN:EX                       ACK:CR                   ACK:CR
+    MESSAGE:EX
+    ACK:CR
+
+    Sender checks that it still needs to send a message. Messages received
+    or other events that happened while waiting for the TOKEN may have made
+    this message inappropriate or redundant.
+
+ 3. sender write LVB.
+    sender down-convert MESSAGE from EX to CR
+    sender try to get EX of ACK
+    [ wait until all receiver has *processed* the MESSAGE ]
+
+                                     [ triggered by bast of ACK ]
+                                     receiver get CR of MESSAGE
+                                     receiver read LVB
+                                     receiver processes the message
+                                     [ wait finish ]
+                                     receiver release ACK
+
+   sender                         receiver                   receiver
+   TOKEN:EX                       MESSAGE:CR                 MESSAGE:CR
+   MESSAGE:CR
+   ACK:EX
+
+ 4. triggered by grant of EX on ACK (indicating all receivers have processed
+    message)
+    sender down-convert ACK from EX to CR
+    sender release MESSAGE
+    sender release TOKEN
+                               receiver upconvert to EX of MESSAGE
+                               receiver get CR of ACK
+                               receiver release MESSAGE
+
+   sender                      receiver                   receiver
+   ACK:CR                       ACK:CR                     ACK:CR
+
+
+4. Handling Failures
+
+4.1 Node Failure
+ When a node fails, the DLM informs the cluster with the slot. The node
+ starts a cluster recovery thread. The cluster recovery thread:
+	- acquires the bitmap<number> lock of the failed node
+	- opens the bitmap
+	- reads the bitmap of the failed node
+	- copies the set bitmap to local node
+	- cleans the bitmap of the failed node
+	- releases bitmap<number> lock of the failed node
+	- initiates resync of the bitmap on the current node
+
+ The resync process, is the regular md resync. However, in a clustered
+ environment when a resync is performed, it needs to tell other nodes
+ of the areas which are suspended. Before a resync starts, the node
+ send out RESYNC_START with the (lo,hi) range of the area which needs
+ to be suspended. Each node maintains a suspend_list, which contains
+ the list  of ranges which are currently suspended. On receiving
+ RESYNC_START, the node adds the range to the suspend_list. Similarly,
+ when the node performing resync finishes, it send RESYNC_FINISHED
+ to other nodes and other nodes remove the corresponding entry from
+ the suspend_list.
+
+ A helper function, should_suspend() can be used to check if a particular
+ I/O range should be suspended or not.
+
+4.2 Device Failure
+ Device failures are handled and communicated with the metadata update
+ routine.
+
+5. Adding a new Device
+For adding a new device, it is necessary that all nodes "see" the new device
+to be added. For this, the following algorithm is used:
+
+    1. Node 1 issues mdadm --manage /dev/mdX --add /dev/sdYY which issues
+       ioctl(ADD_NEW_DISC with disc.state set to MD_DISK_CLUSTER_ADD)
+    2. Node 1 sends NEWDISK with uuid and slot number
+    3. Other nodes issue kobject_uevent_env with uuid and slot number
+       (Steps 4,5 could be a udev rule)
+    4. In userspace, the node searches for the disk, perhaps
+       using blkid -t SUB_UUID=""
+    5. Other nodes issue either of the following depending on whether the disk
+       was found:
+       ioctl(ADD_NEW_DISK with disc.state set to MD_DISK_CANDIDATE and
+                disc.number set to slot number)
+       ioctl(CLUSTERED_DISK_NACK)
+    6. Other nodes drop lock on no-new-devs (CR) if device is found
+    7. Node 1 attempts EX lock on no-new-devs
+    8. If node 1 gets the lock, it sends METADATA_UPDATED after unmarking the disk
+       as SpareLocal
+    9. If not (get no-new-dev lock), it fails the operation and sends METADATA_UPDATED
+    10. Other nodes get the information whether a disk is added or not
+	by the following METADATA_UPDATED.
@@ -175,6 +175,22 @@ config MD_FAULTY

 	  In unsure, say N.

+
+config MD_CLUSTER
+	tristate "Cluster Support for MD (EXPERIMENTAL)"
+	depends on BLK_DEV_MD
+	depends on DLM
+	default n
+	---help---
+	Clustering support for MD devices. This enables locking and
+	synchronization across multiple systems on the cluster, so all
+	nodes in the cluster can access the MD devices simultaneously.
+
+	This brings the redundancy (and uptime) of RAID levels across the
+	nodes of the cluster.
+
+	If unsure, say N.
+
 source "drivers/md/bcache/Kconfig"

 config BLK_DEV_DM_BUILTIN
@@ -30,6 +30,7 @@ obj-$(CONFIG_MD_RAID10)		+= raid10.o
 obj-$(CONFIG_MD_RAID456)	+= raid456.o
 obj-$(CONFIG_MD_MULTIPATH)	+= multipath.o
 obj-$(CONFIG_MD_FAULTY)		+= faulty.o
+obj-$(CONFIG_MD_CLUSTER)	+= md-cluster.o
 obj-$(CONFIG_BCACHE)		+= bcache/
 obj-$(CONFIG_BLK_DEV_MD)	+= md-mod.o
 obj-$(CONFIG_BLK_DEV_DM)	+= dm-mod.o
@@ -205,6 +205,10 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 	struct block_device *bdev;
 	struct mddev *mddev = bitmap->mddev;
 	struct bitmap_storage *store = &bitmap->storage;
+	int node_offset = 0;
+
+	if (mddev_is_clustered(bitmap->mddev))
+		node_offset = bitmap->cluster_slot * store->file_pages;

 	while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
 		int size = PAGE_SIZE;
@@ -433,6 +437,7 @@ void bitmap_update_sb(struct bitmap *bitmap)
 	/* This might have been changed by a reshape */
 	sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
 	sb->chunksize = cpu_to_le32(bitmap->mddev->bitmap_info.chunksize);
+	sb->nodes = cpu_to_le32(bitmap->mddev->bitmap_info.nodes);
 	sb->sectors_reserved = cpu_to_le32(bitmap->mddev->
 					   bitmap_info.space);
 	kunmap_atomic(sb);
@@ -544,6 +549,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	bitmap_super_t *sb;
 	unsigned long chunksize, daemon_sleep, write_behind;
 	unsigned long long events;
+	int nodes = 0;
 	unsigned long sectors_reserved = 0;
 	int err = -EINVAL;
 	struct page *sb_page;
@@ -562,6 +568,22 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 		return -ENOMEM;
 	bitmap->storage.sb_page = sb_page;

+re_read:
+	/* If cluster_slot is set, the cluster is setup */
+	if (bitmap->cluster_slot >= 0) {
+		sector_t bm_blocks = bitmap->mddev->resync_max_sectors;
+
+		sector_div(bm_blocks,
+			   bitmap->mddev->bitmap_info.chunksize >> 9);
+		/* bits to bytes */
+		bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t);
+		/* to 4k blocks */
+		bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096);
+		bitmap->mddev->bitmap_info.offset += bitmap->cluster_slot * (bm_blocks << 3);
+		pr_info("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__,
+			bitmap->cluster_slot, (unsigned long long)bitmap->mddev->bitmap_info.offset);
+	}
+
 	if (bitmap->storage.file) {
 		loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host);
 		int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize;
@@ -577,12 +599,15 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	if (err)
 		return err;

+	err = -EINVAL;
 	sb = kmap_atomic(sb_page);

 	chunksize = le32_to_cpu(sb->chunksize);
 	daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
 	write_behind = le32_to_cpu(sb->write_behind);
 	sectors_reserved = le32_to_cpu(sb->sectors_reserved);
+	nodes = le32_to_cpu(sb->nodes);
+	strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64);

 	/* verify that the bitmap-specific fields are valid */
 	if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
@@ -619,7 +644,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 			goto out;
 		}
 		events = le64_to_cpu(sb->events);
-		if (events < bitmap->mddev->events) {
+		if (!nodes && (events < bitmap->mddev->events)) {
 			printk(KERN_INFO
 			       "%s: bitmap file is out of date (%llu < %llu) "
 			       "-- forcing full recovery\n",
@@ -634,20 +659,40 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN)
 		set_bit(BITMAP_HOSTENDIAN, &bitmap->flags);
 	bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
+	strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64);
 	err = 0;
+
 out:
 	kunmap_atomic(sb);
+	/* Assiging chunksize is required for "re_read" */
+	bitmap->mddev->bitmap_info.chunksize = chunksize;
+	if (nodes && (bitmap->cluster_slot < 0)) {
+		err = md_setup_cluster(bitmap->mddev, nodes);
+		if (err) {
+			pr_err("%s: Could not setup cluster service (%d)\n",
+					bmname(bitmap), err);
+			goto out_no_sb;
+		}
+		bitmap->cluster_slot = md_cluster_ops->slot_number(bitmap->mddev);
+		goto re_read;
+	}
+
+
 out_no_sb:
 	if (test_bit(BITMAP_STALE, &bitmap->flags))
 		bitmap->events_cleared = bitmap->mddev->events;
 	bitmap->mddev->bitmap_info.chunksize = chunksize;
 	bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
 	bitmap->mddev->bitmap_info.max_write_behind = write_behind;
+	bitmap->mddev->bitmap_info.nodes = nodes;
 	if (bitmap->mddev->bitmap_info.space == 0 ||
 	    bitmap->mddev->bitmap_info.space > sectors_reserved)
 		bitmap->mddev->bitmap_info.space = sectors_reserved;
-	if (err)
+	if (err) {
 		bitmap_print_sb(bitmap);
+		if (bitmap->cluster_slot < 0)
+			md_cluster_stop(bitmap->mddev);
+	}
 	return err;
 }

@@ -692,9 +737,10 @@ static inline struct page *filemap_get_page(struct bitmap_storage *store,
 }

 static int bitmap_storage_alloc(struct bitmap_storage *store,
-				unsigned long chunks, int with_super)
+				unsigned long chunks, int with_super,
+				int slot_number)
 {
-	int pnum;
+	int pnum, offset = 0;
 	unsigned long num_pages;
 	unsigned long bytes;

@@ -703,6 +749,7 @@ static int bitmap_storage_alloc(struct bitmap_storage *store,
 		bytes += sizeof(bitmap_super_t);

 	num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE);
+	offset = slot_number * (num_pages - 1);

 	store->filemap = kmalloc(sizeof(struct page *)
 				 * num_pages, GFP_KERNEL);
@@ -713,20 +760,22 @@ static int bitmap_storage_alloc(struct bitmap_storage *store,
 		store->sb_page = alloc_page(GFP_KERNEL|__GFP_ZERO);
 		if (store->sb_page == NULL)
 			return -ENOMEM;
-		store->sb_page->index = 0;
 	}
+
 	pnum = 0;
 	if (store->sb_page) {
 		store->filemap[0] = store->sb_page;
 		pnum = 1;
+		store->sb_page->index = offset;
 	}
+
 	for ( ; pnum < num_pages; pnum++) {
 		store->filemap[pnum] = alloc_page(GFP_KERNEL|__GFP_ZERO);
 		if (!store->filemap[pnum]) {
 			store->file_pages = pnum;
 			return -ENOMEM;
 		}
-		store->filemap[pnum]->index = pnum;
+		store->filemap[pnum]->index = pnum + offset;
 	}
 	store->file_pages = pnum;

@@ -885,6 +934,28 @@ static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
 	}
 }

+static int bitmap_file_test_bit(struct bitmap *bitmap, sector_t block)
+{
+	unsigned long bit;
+	struct page *page;
+	void *paddr;
+	unsigned long chunk = block >> bitmap->counts.chunkshift;
+	int set = 0;
+
+	page = filemap_get_page(&bitmap->storage, chunk);
+	if (!page)
+		return -EINVAL;
+	bit = file_page_offset(&bitmap->storage, chunk);
+	paddr = kmap_atomic(page);
+	if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
+		set = test_bit(bit, paddr);
+	else
+		set = test_bit_le(bit, paddr);
+	kunmap_atomic(paddr);
+	return set;
+}
+
+
 /* this gets called when the md device is ready to unplug its underlying
 * (slave) device queues -- before we let any writes go down, we need to
 * sync the dirty pages of the bitmap file to disk */
@@ -935,7 +1006,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n
 */
 static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 {
-	unsigned long i, chunks, index, oldindex, bit;
+	unsigned long i, chunks, index, oldindex, bit, node_offset = 0;
 	struct page *page = NULL;
 	unsigned long bit_cnt = 0;
 	struct file *file;
@@ -981,6 +1052,9 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 	if (!bitmap->mddev->bitmap_info.external)
 		offset = sizeof(bitmap_super_t);

+	if (mddev_is_clustered(bitmap->mddev))
+		node_offset = bitmap->cluster_slot * (DIV_ROUND_UP(store->bytes, PAGE_SIZE));
+
 	for (i = 0; i < chunks; i++) {
 		int b;
 		index = file_page_index(&bitmap->storage, i);
@@ -1001,7 +1075,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 					bitmap->mddev,
 					bitmap->mddev->bitmap_info.offset,
 					page,
-					index, count);
+					index + node_offset, count);

 			if (ret)
 				goto err;
@@ -1207,7 +1281,6 @@ void bitmap_daemon_work(struct mddev *mddev)
 	     j < bitmap->storage.file_pages
 		     && !test_bit(BITMAP_STALE, &bitmap->flags);
 	     j++) {
-
 		if (test_page_attr(bitmap, j,
 				   BITMAP_PAGE_DIRTY))
 			/* bitmap_unplug will handle the rest */
@@ -1530,11 +1603,13 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n
 		return;
 	}
 	if (!*bmc) {
-		*bmc = 2 | (needed ? NEEDED_MASK : 0);
+		*bmc = 2;
 		bitmap_count_page(&bitmap->counts, offset, 1);
 		bitmap_set_pending(&bitmap->counts, offset);
 		bitmap->allclean = 0;
 	}
+	if (needed)
+		*bmc |= NEEDED_MASK;
 	spin_unlock_irq(&bitmap->counts.lock);
 }

@@ -1591,6 +1666,10 @@ static void bitmap_free(struct bitmap *bitmap)
 	if (!bitmap) /* there was no bitmap */
 		return;

+	if (mddev_is_clustered(bitmap->mddev) && bitmap->mddev->cluster_info &&
+		bitmap->cluster_slot == md_cluster_ops->slot_number(bitmap->mddev))
+		md_cluster_stop(bitmap->mddev);
+
 	/* Shouldn't be needed - but just in case.... */
 	wait_event(bitmap->write_wait,
 		   atomic_read(&bitmap->pending_writes) == 0);
@@ -1636,7 +1715,7 @@ void bitmap_destroy(struct mddev *mddev)
 * initialize the bitmap structure
 * if this returns an error, bitmap_destroy must be called to do clean up
 */
-int bitmap_create(struct mddev *mddev)
+struct bitmap *bitmap_create(struct mddev *mddev, int slot)
 {
 	struct bitmap *bitmap;
 	sector_t blocks = mddev->resync_max_sectors;
@@ -1650,7 +1729,7 @@ int bitmap_create(struct mddev *mddev)

 	bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
 	if (!bitmap)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);

 	spin_lock_init(&bitmap->counts.lock);
 	atomic_set(&bitmap->pending_writes, 0);
@@ -1659,6 +1738,7 @@ int bitmap_create(struct mddev *mddev)
 	init_waitqueue_head(&bitmap->behind_wait);

 	bitmap->mddev = mddev;
+	bitmap->cluster_slot = slot;

 	if (mddev->kobj.sd)
 		bm = sysfs_get_dirent(mddev->kobj.sd, "bitmap");
@@ -1706,12 +1786,14 @@ int bitmap_create(struct mddev *mddev)
 	printk(KERN_INFO "created bitmap (%lu pages) for device %s\n",
 	       bitmap->counts.pages, bmname(bitmap));

-	mddev->bitmap = bitmap;
-	return test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0;
+	err = test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0;
+	if (err)
+		goto error;

+	return bitmap;
 error:
 	bitmap_free(bitmap);
-	return err;
+	return ERR_PTR(err);
 }

 int bitmap_load(struct mddev *mddev)
@@ -1765,6 +1847,60 @@ out:
 }
 EXPORT_SYMBOL_GPL(bitmap_load);

+/* Loads the bitmap associated with slot and copies the resync information
+ * to our bitmap
+ */
+int bitmap_copy_from_slot(struct mddev *mddev, int slot,
+		sector_t *low, sector_t *high, bool clear_bits)
+{
+	int rv = 0, i, j;
+	sector_t block, lo = 0, hi = 0;
+	struct bitmap_counts *counts;
+	struct bitmap *bitmap = bitmap_create(mddev, slot);
+
+	if (IS_ERR(bitmap))
+		return PTR_ERR(bitmap);
+
+	rv = bitmap_read_sb(bitmap);
+	if (rv)
+		goto err;
+
+	rv = bitmap_init_from_disk(bitmap, 0);
+	if (rv)
+		goto err;
+
+	counts = &bitmap->counts;
+	for (j = 0; j < counts->chunks; j++) {
+		block = (sector_t)j << counts->chunkshift;
+		if (bitmap_file_test_bit(bitmap, block)) {
+			if (!lo)
+				lo = block;
+			hi = block;
+			bitmap_file_clear_bit(bitmap, block);
+			bitmap_set_memory_bits(mddev->bitmap, block, 1);
+			bitmap_file_set_bit(mddev->bitmap, block);
+		}
+	}
+
+	if (clear_bits) {
+		bitmap_update_sb(bitmap);
+		/* Setting this for the ev_page should be enough.
+		 * And we do not require both write_all and PAGE_DIRT either
+		 */
+		for (i = 0; i < bitmap->storage.file_pages; i++)
+			set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
+		bitmap_write_all(bitmap);
+		bitmap_unplug(bitmap);
+	}
+	*low = lo;
+	*high = hi;
+err:
+	bitmap_free(bitmap);
+	return rv;
+}
+EXPORT_SYMBOL_GPL(bitmap_copy_from_slot);
+
+
 void bitmap_status(struct seq_file *seq, struct bitmap *bitmap)
 {
 	unsigned long chunk_kb;
@@ -1849,7 +1985,8 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
 	memset(&store, 0, sizeof(store));
 	if (bitmap->mddev->bitmap_info.offset || bitmap->mddev->bitmap_info.file)
 		ret = bitmap_storage_alloc(&store, chunks,
-					   !bitmap->mddev->bitmap_info.external);
+					   !bitmap->mddev->bitmap_info.external,
+					   bitmap->cluster_slot);
 	if (ret)
 		goto err;

@@ -2021,13 +2158,18 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
 				return -EINVAL;
 			mddev->bitmap_info.offset = offset;
 			if (mddev->pers) {
+				struct bitmap *bitmap;
 				mddev->pers->quiesce(mddev, 1);
-				rv = bitmap_create(mddev);
-				if (!rv)
+				bitmap = bitmap_create(mddev, -1);
+				if (IS_ERR(bitmap))
+					rv = PTR_ERR(bitmap);
+				else {
+					mddev->bitmap = bitmap;
 					rv = bitmap_load(mddev);
-				if (rv) {
-					bitmap_destroy(mddev);
-					mddev->bitmap_info.offset = 0;
+					if (rv) {
+						bitmap_destroy(mddev);
+						mddev->bitmap_info.offset = 0;
+					}
 				}
 				mddev->pers->quiesce(mddev, 0);
 				if (rv)
@@ -2186,6 +2328,8 @@ __ATTR(chunksize, S_IRUGO|S_IWUSR, chunksize_show, chunksize_store);

 static ssize_t metadata_show(struct mddev *mddev, char *page)
 {
+	if (mddev_is_clustered(mddev))
+		return sprintf(page, "clustered\n");
 	return sprintf(page, "%s\n", (mddev->bitmap_info.external
 				      ? "external" : "internal"));
 }
@@ -2198,7 +2342,8 @@ static ssize_t metadata_store(struct mddev *mddev, const char *buf, size_t len)
 		return -EBUSY;
 	if (strncmp(buf, "external", 8) == 0)
 		mddev->bitmap_info.external = 1;
-	else if (strncmp(buf, "internal", 8) == 0)
+	else if ((strncmp(buf, "internal", 8) == 0) ||
+			(strncmp(buf, "clustered", 9) == 0))
 		mddev->bitmap_info.external = 0;
 	else
 		return -EINVAL;
@@ -130,8 +130,9 @@ typedef struct bitmap_super_s {
 	__le32 write_behind; /* 60  number of outstanding write-behind writes */
 	__le32 sectors_reserved; /* 64 number of 512-byte sectors that are
 				  * reserved for the bitmap. */
-
-	__u8  pad[256 - 68]; /* set to zero */
+	__le32 nodes;        /* 68 the maximum number of nodes in cluster. */
+	__u8 cluster_name[64]; /* 72 cluster name to which this md belongs */
+	__u8  pad[256 - 136]; /* set to zero */
 } bitmap_super_t;

 /* notes:
@@ -226,12 +227,13 @@ struct bitmap {
 	wait_queue_head_t behind_wait;

 	struct kernfs_node *sysfs_can_clear;
+	int cluster_slot;		/* Slot offset for clustered env */
 };

 /* the bitmap API */

 /* these are used only by md/bitmap */
-int  bitmap_create(struct mddev *mddev);
+struct bitmap *bitmap_create(struct mddev *mddev, int slot);
 int bitmap_load(struct mddev *mddev);
 void bitmap_flush(struct mddev *mddev);
 void bitmap_destroy(struct mddev *mddev);
@@ -260,6 +262,8 @@ void bitmap_daemon_work(struct mddev *mddev);

 int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
 		  int chunksize, int init);
+int bitmap_copy_from_slot(struct mddev *mddev, int slot,
+				sector_t *lo, sector_t *hi, bool clear_bits);
 #endif

 #endif
@@ -0,0 +1,29 @@
+
+
+#ifndef _MD_CLUSTER_H
+#define _MD_CLUSTER_H
+
+#include "md.h"
+
+struct mddev;
+struct md_rdev;
+
+struct md_cluster_operations {
+	int (*join)(struct mddev *mddev, int nodes);
+	int (*leave)(struct mddev *mddev);
+	int (*slot_number)(struct mddev *mddev);
+	void (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi);
+	int (*resync_start)(struct mddev *mddev, sector_t lo, sector_t hi);
+	void (*resync_finish)(struct mddev *mddev);
+	int (*metadata_update_start)(struct mddev *mddev);
+	int (*metadata_update_finish)(struct mddev *mddev);
+	int (*metadata_update_cancel)(struct mddev *mddev);
+	int (*area_resyncing)(struct mddev *mddev, sector_t lo, sector_t hi);
+	int (*add_new_disk_start)(struct mddev *mddev, struct md_rdev *rdev);
+	int (*add_new_disk_finish)(struct mddev *mddev);
+	int (*new_disk_ack)(struct mddev *mddev, bool ack);
+	int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev);
+	int (*gather_bitmaps)(struct md_rdev *rdev);
+};
+
+#endif /* _MD_CLUSTER_H */
@@ -23,6 +23,7 @@
 #include <linux/timer.h>
 #include <linux/wait.h>
 #include <linux/workqueue.h>
+#include "md-cluster.h"

 #define MaxSector (~(sector_t)0)

@@ -170,6 +171,10 @@ enum flag_bits {
 				 * a want_replacement device with same
 				 * raid_disk number.
 				 */
+	Candidate,		/* For clustered environments only:
+				 * This device is seen locally but not
+				 * by the whole cluster
+				 */
 };

 #define BB_LEN_MASK	(0x00000000000001FFULL)
@@ -202,6 +207,8 @@ extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
 				int is_new);
 extern void md_ack_all_badblocks(struct badblocks *bb);

+struct md_cluster_info;
+
 struct mddev {
 	void				*private;
 	struct md_personality		*pers;
@@ -430,6 +437,8 @@ struct mddev {
 		unsigned long		daemon_sleep; /* how many jiffies between updates? */
 		unsigned long		max_write_behind; /* write-behind mode */
 		int			external;
+		int			nodes; /* Maximum number of nodes in the cluster */
+		char                    cluster_name[64]; /* Name of the cluster */
 	} bitmap_info;

 	atomic_t			max_corr_read_errors; /* max read retries */
@@ -448,6 +457,7 @@ struct mddev {
 	struct work_struct flush_work;
 	struct work_struct event_work;	/* used by dm to report failure event */
 	void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
+	struct md_cluster_info		*cluster_info;
 };

 static inline int __must_check mddev_lock(struct mddev *mddev)
@@ -608,6 +618,11 @@ static inline void safe_put_page(struct page *p)

 extern int register_md_personality(struct md_personality *p);
 extern int unregister_md_personality(struct md_personality *p);
+extern int register_md_cluster_operations(struct md_cluster_operations *ops,
+		struct module *module);
+extern int unregister_md_cluster_operations(void);
+extern int md_setup_cluster(struct mddev *mddev, int nodes);
+extern void md_cluster_stop(struct mddev *mddev);
 extern struct md_thread *md_register_thread(
 	void (*run)(struct md_thread *thread),
 	struct mddev *mddev,
@@ -654,6 +669,10 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
 				   struct mddev *mddev);

 extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule);
+extern void md_reload_sb(struct mddev *mddev);
+extern void md_update_sb(struct mddev *mddev, int force);
+extern void md_kick_rdev_from_array(struct md_rdev * rdev);
+struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
 static inline int mddev_check_plugged(struct mddev *mddev)
 {
 	return !!blk_check_plugged(md_unplug, mddev,
@@ -669,4 +688,9 @@ static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
 	}
 }

+extern struct md_cluster_operations *md_cluster_ops;
+static inline int mddev_is_clustered(struct mddev *mddev)
+{
+	return mddev->cluster_info && mddev->bitmap_info.nodes > 1;
+}
 #endif /* _MD_MD_H */
@@ -539,7 +539,13 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 	has_nonrot_disk = 0;
 	choose_next_idle = 0;

-	choose_first = (conf->mddev->recovery_cp < this_sector + sectors);
+	if ((conf->mddev->recovery_cp < this_sector + sectors) ||
+	    (mddev_is_clustered(conf->mddev) &&
+	    md_cluster_ops->area_resyncing(conf->mddev, this_sector,
+		    this_sector + sectors)))
+		choose_first = 1;
+	else
+		choose_first = 0;

 	for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
 		sector_t dist;
@@ -1102,8 +1108,10 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	md_write_start(mddev, bio); /* wait on superblock update early */

 	if (bio_data_dir(bio) == WRITE &&
-	    bio_end_sector(bio) > mddev->suspend_lo &&
-	    bio->bi_iter.bi_sector < mddev->suspend_hi) {
+	    ((bio_end_sector(bio) > mddev->suspend_lo &&
+	    bio->bi_iter.bi_sector < mddev->suspend_hi) ||
+	    (mddev_is_clustered(mddev) &&
+	     md_cluster_ops->area_resyncing(mddev, bio->bi_iter.bi_sector, bio_end_sector(bio))))) {
 		/* As the suspend_* range is controlled by
 		 * userspace, we want an interruptible
 		 * wait.
@@ -1114,7 +1122,10 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 			prepare_to_wait(&conf->wait_barrier,
 					&w, TASK_INTERRUPTIBLE);
 			if (bio_end_sector(bio) <= mddev->suspend_lo ||
-			    bio->bi_iter.bi_sector >= mddev->suspend_hi)
+			    bio->bi_iter.bi_sector >= mddev->suspend_hi ||
+			    (mddev_is_clustered(mddev) &&
+			     !md_cluster_ops->area_resyncing(mddev,
+				     bio->bi_iter.bi_sector, bio_end_sector(bio))))
 				break;
 			schedule();
 		}
@@ -1561,6 +1572,7 @@ static int raid1_spare_active(struct mddev *mddev)
 		struct md_rdev *rdev = conf->mirrors[i].rdev;
 		struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev;
 		if (repl
+		    && !test_bit(Candidate, &repl->flags)
 		    && repl->recovery_offset == MaxSector
 		    && !test_bit(Faulty, &repl->flags)
 		    && !test_and_set_bit(In_sync, &repl->flags)) {
@@ -78,6 +78,12 @@
 #define MD_DISK_ACTIVE		1 /* disk is running or spare disk */
 #define MD_DISK_SYNC		2 /* disk is in sync with the raid set */
 #define MD_DISK_REMOVED		3 /* disk is in sync with the raid set */
+#define MD_DISK_CLUSTER_ADD     4 /* Initiate a disk add across the cluster
+				   * For clustered enviroments only.
+				   */
+#define MD_DISK_CANDIDATE	5 /* disk is added as spare (local) until confirmed
+				   * For clustered enviroments only.
+				   */

 #define	MD_DISK_WRITEMOSTLY	9 /* disk is "write-mostly" is RAID1 config.
 				   * read requests will only be sent here in
@@ -101,6 +107,7 @@ typedef struct mdp_device_descriptor_s {
 #define MD_SB_CLEAN		0
 #define MD_SB_ERRORS		1

+#define	MD_SB_CLUSTERED		5 /* MD is clustered */
 #define	MD_SB_BITMAP_PRESENT	8 /* bitmap may be present nearby */

 /*
@@ -62,6 +62,7 @@
 #define STOP_ARRAY		_IO (MD_MAJOR, 0x32)
 #define STOP_ARRAY_RO		_IO (MD_MAJOR, 0x33)
 #define RESTART_ARRAY_RW	_IO (MD_MAJOR, 0x34)
+#define CLUSTERED_DISK_NACK	_IO (MD_MAJOR, 0x35)

 /* 63 partitions with the alternate major number (mdp) */
 #define MdpMinorShift 6