Merge tag 'md/4.4' of git://neil.brown.name/md

Pull md updates from Neil Brown:
 "Two major components to this update.

   1) The clustered-raid1 support from SUSE is nearly complete.  There
      are a few outstanding issues being worked on.  Maybe half a dozen
      patches will bring this to a usable state.

   2) The first stage of journalled-raid5 support from Facebook makes an
      appearance.  With a journal device configured (typically NVRAM or
      SSD), the "RAID5 write hole" should be closed - a crash during
      degraded operations cannot result in data corruption.

      The next stage will be to use the journal as a write-behind cache
      so that latency can be reduced and in some cases throughput
      increased by performing more full-stripe writes."

* tag 'md/4.4' of git://neil.brown.name/md: (66 commits)
  MD: when RAID journal is missing/faulty, block RESTART_ARRAY_RW
  MD: set journal disk ->raid_disk
  MD: kick out journal disk if it's not fresh
  raid5-cache: start raid5 readonly if journal is missing
  MD: add new bit to indicate raid array with journal
  raid5-cache: IO error handling
  raid5: journal disk can't be removed
  raid5-cache: add trim support for log
  MD: fix info output for journal disk
  raid5-cache: use bio chaining
  raid5-cache: small log->seq cleanup
  raid5-cache: new helper: r5_reserve_log_entry
  raid5-cache: inline r5l_alloc_io_unit into r5l_new_meta
  raid5-cache: take rdev->data_offset into account early on
  raid5-cache: refactor bio allocation
  raid5-cache: clean up r5l_get_meta
  raid5-cache: simplify state machine when caches flushes are not needed
  raid5-cache: factor out a helper to run all stripes for an I/O unit
  raid5-cache: rename flushed_ios to finished_ios
  raid5-cache: free I/O units earlier
  ...
This commit is contained in:
Linus Torvalds
2015-11-04 21:12:47 -08:00
14 changed files with 1991 additions and 312 deletions

View File

@@ -89,6 +89,12 @@
* read requests will only be sent here in
* dire need
*/
#define MD_DISK_JOURNAL 18 /* disk is used as the write journal in RAID-5/6 */
/*
 * Reserved disk role values. The three special roles (spare, faulty,
 * journal) are all numerically greater than MD_DISK_ROLE_MAX, so any role
 * value <= MD_DISK_ROLE_MAX denotes a regular disk.
 * NOTE(review): where these role values are stored is not visible in this
 * excerpt -- presumably the per-device role table of the superblock; confirm.
 */
#define MD_DISK_ROLE_SPARE 0xffff
#define MD_DISK_ROLE_FAULTY 0xfffe
#define MD_DISK_ROLE_JOURNAL 0xfffd
#define MD_DISK_ROLE_MAX 0xff00 /* max value of regular disk role */
typedef struct mdp_device_descriptor_s {
__u32 number; /* 0 Device number in the entire set */
@@ -252,7 +258,10 @@ struct mdp_superblock_1 {
__le64 data_offset; /* sector start of data, often 0 */
__le64 data_size; /* sectors in this device that can be used for data */
__le64 super_offset; /* sector start of this superblock */
__le64 recovery_offset;/* sectors before this offset (from data_offset) have been recovered */
union {
__le64 recovery_offset;/* sectors before this offset (from data_offset) have been recovered */
__le64 journal_tail;/* journal tail of journal device (from data_offset) */
};
__le32 dev_number; /* permanent identifier of this device - not role in raid */
__le32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */
__u8 device_uuid[16]; /* user-space setable, ignored by kernel */
@@ -302,6 +311,8 @@ struct mdp_superblock_1 {
#define MD_FEATURE_RECOVERY_BITMAP 128 /* recovery that is happening
* is guided by bitmap.
*/
/*
 * The two flags below are single-bit values (256, 512) and are both OR-ed
 * into MD_FEATURE_ALL.
 */
#define MD_FEATURE_CLUSTERED 256 /* clustered MD */
#define MD_FEATURE_JOURNAL 512 /* array has a journal device; support write cache */
#define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \
|MD_FEATURE_RECOVERY_OFFSET \
|MD_FEATURE_RESHAPE_ACTIVE \
@@ -310,6 +321,66 @@ struct mdp_superblock_1 {
|MD_FEATURE_RESHAPE_BACKWARDS \
|MD_FEATURE_NEW_OFFSET \
|MD_FEATURE_RECOVERY_BITMAP \
|MD_FEATURE_CLUSTERED \
|MD_FEATURE_JOURNAL \
)
/*
 * Header shared by the payload structures: it is the first member of both
 * struct r5l_payload_data_parity and struct r5l_payload_flush, and it is the
 * element type of r5l_meta_block.payloads[].
 *
 * Packed, so the header is exactly two 16-bit little-endian fields.
 */
struct r5l_payload_header {
	/* NOTE(review): presumably an enum r5l_payload_type value -- confirm. */
	__le16	type;
	/* NOTE(review): presumably R5LOG_PAYLOAD_FLAG_* of the given type -- confirm. */
	__le16	flags;
} __attribute__((__packed__));
/*
 * Payload kinds. The numeric values are spelled out explicitly and must not
 * be renumbered.
 * NOTE(review): presumably stored in r5l_payload_header.type -- confirm.
 */
enum r5l_payload_type {
	R5LOG_PAYLOAD_DATA	= 0,
	R5LOG_PAYLOAD_PARITY	= 1,
	R5LOG_PAYLOAD_FLUSH	= 2,
};
/*
 * Payload describing a run of data or parity.
 *
 * The structure is packed and ends in a flexible array member, so its total
 * length depends on how many checksum entries follow the fixed part.
 */
struct r5l_payload_data_parity {
	struct r5l_payload_header header;

	/* Size of the data/parity, in sectors. Each 4k has a checksum. */
	__le32 size;

	/*
	 * Sector address: for data it is the raid sector, for parity it is
	 * the stripe sector.
	 */
	__le64 location;

	/* Checksums, one per 4k of data/parity. */
	__le32 checksum[];
} __attribute__((__packed__));
/*
 * Flags for data/parity payloads.
 */
enum r5l_payload_data_parity_flag {
	/* The payload is a discard. */
	R5LOG_PAYLOAD_FLAG_DISCARD	= 1,

	/*
	 * The reshape flags are only set while there is reshape activity, and
	 * the data and parity of one stripe should carry the same flag:
	 *
	 *   RESHAPED  - reshape is running and this stripe finished reshape
	 *   RESHAPING - reshape is running and this stripe isn't reshaped
	 *
	 * NOTE(review): RESHAPING (3) is numerically DISCARD | RESHAPED
	 * (1 | 2), so the three values cannot be tested as independent bits.
	 * The values are kept as they are because they are part of the
	 * published format; confirm the intended encoding before relying on
	 * bitwise tests.
	 */
	R5LOG_PAYLOAD_FLAG_RESHAPED	= 2,
	R5LOG_PAYLOAD_FLAG_RESHAPING	= 3,
};
/*
 * Flush payload: a header followed by a variable-length list of 64-bit
 * little-endian entries.
 */
struct r5l_payload_flush {
	struct r5l_payload_header header;

	/* Size of flush_stripes[], in bytes (not an element count). */
	__le32 size;

	/* Flexible array member; its length in bytes is given by 'size'. */
	__le64 flush_stripes[];
} __attribute__((__packed__));
/*
 * Flags for flush payloads.
 */
enum r5l_payload_flush_flag {
	/* The data represents a whole stripe. */
	R5LOG_PAYLOAD_FLAG_FLUSH_STRIPE	= 1,
};
/*
 * Meta block: a fixed header followed by a variable-length sequence of
 * payloads.
 *
 * The structure is packed; the fixed part is 4 + 4 + 1 + 1 + 2 + 4 + 8 + 8
 * = 32 bytes, after which the payloads begin.
 */
struct r5l_meta_block {
	/* NOTE(review): presumably R5LOG_MAGIC -- confirm against the users. */
	__le32 magic;
	__le32 checksum;
	/* NOTE(review): presumably R5LOG_VERSION -- confirm against the users. */
	__u8 version;

	/*
	 * Padding; the names indicate these are expected to be zero. The
	 * "pading" spelling is part of the field names and is kept as is.
	 */
	__u8 __zero_pading_1;
	__le16 __zero_pading_2;

	/* Whole size of the block. */
	__le32 meta_size;

	__le64 seq;

	/* Current position, as a sector counted from rdev->data_offset. */
	__le64 position;

	/* Payloads follow the fixed header; each starts with a payload header. */
	struct r5l_payload_header payloads[];
} __attribute__((__packed__));
/*
 * Journal log format identification.
 * NOTE(review): presumably the expected values of r5l_meta_block.version and
 * r5l_meta_block.magic respectively -- confirm against the code that reads
 * and writes meta blocks.
 */
#define R5LOG_VERSION 0x1
#define R5LOG_MAGIC 0x6433c509
#endif