Well, I needed a way to watch the mdstat progress (because a disk just failed …).
#!/bin/bash
watch -n1 cat /proc/mdstat
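If you want the changing numbers highlighted, watch's -d flag does that as well (same one-liner, just one extra flag):

#!/bin/bash
# -d highlights whatever changed since the previous refresh,
# which makes the rebuild counters easier to spot.
watch -d -n1 cat /proc/mdstat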
Well, I recently had to flatten my archive NAS (well, only the OS part … *wheeeh*). Since I didn’t have the chance to back up the old settings, I had to redo everything from scratch … and this time I decided not to hack it together in a script but to do it the proper way.
I spent a while reading through the Internetz about the various settings until I stumbled upon a Fraunhofer wiki entry. From there I ended up writing these udev rules and sysctl configs …
# Settings from http://www.fhgfs.com/wiki/StorageServerTuning

# Set an appropriate IO scheduler for file servers.
KERNEL=="sd[a-z]", ATTR{queue/scheduler}="deadline"
KERNEL=="sd[a-i][a-z]", ATTR{queue/scheduler}="deadline"

# Give the IO scheduler more flexibility by increasing the number of
# schedulable requests.
KERNEL=="sd[a-z]", ATTR{queue/nr_requests}="4096"
KERNEL=="sd[a-i][a-z]", ATTR{queue/nr_requests}="4096"

# To improve throughput for sequential reads, increase the maximum amount of
# read-ahead data. The actual amount of read-ahead is adaptive, so using a
# high value here won't harm performance for small random access.
KERNEL=="sd[a-z]", ATTR{queue/read_ahead_kb}="73728"
KERNEL=="sd[a-i][a-z]", ATTR{queue/read_ahead_kb}="73728"
KERNEL=="sd[a-z]", RUN+="/sbin/blockdev --setra 73728 /dev/%k"
KERNEL=="sd[a-i][a-z]", RUN+="/sbin/blockdev --setra 73728 /dev/%k"

SUBSYSTEM=="block", KERNEL=="md[0-9]*", RUN+="/sbin/blockdev --setra 663552 /dev/%k"
SUBSYSTEM=="block", KERNEL=="md[0-9]*", ATTR{md/stripe_cache_size}="9216"

# Optimal performance for hardware RAID systems often depends on large IOs
# being sent to the device in a single large operation. Please refer to your
# hardware storage vendor for the corresponding optimal size of
# /sys/block/sdX/max_sectors_kb.
# It is typically good if this size can be increased to at least match your
# RAID stripe set size (i.e. chunk_size x number_of_disks):
KERNEL=="sd[a-z]", ATTR{queue/max_sectors_kb}="512"
KERNEL=="sd[a-i][a-z]", ATTR{queue/max_sectors_kb}="512"

KERNEL=="sd[a-z]", ATTR{device/queue_depth}="1"
KERNEL=="sd[a-i][a-z]", ATTR{device/queue_depth}="1"
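To apply the rules without a reboot, reloading udev and re-triggering the block devices should be enough; the spot check below just uses sda as an example device:

# Reload the udev rules and re-apply them to the block devices
udevadm control --reload-rules
udevadm trigger --subsystem-match=block

# Spot check on one disk (sda is only an example)
cat /sys/block/sda/queue/scheduler
blockdev --getra /dev/sda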
# Settings taken from http://www.fhgfs.com/wiki/StorageServerTuning

# To avoid long IO stalls (latencies) for write cache flushing in a production
# environment with very different workloads, you will typically want to limit
# the kernel dirty (write) cache size.
vm.dirty_background_ratio = 5
vm.dirty_ratio = 10

# Assigning slightly higher priority to inode caching helps to avoid disk seeks
# for inode loading
vm.vfs_cache_pressure = 50

# Buffering of file system data requires frequent memory allocation. Raising the
# amount of reserved kernel memory will enable faster and more reliable memory
# allocation in critical situations. Raise the corresponding value to 64MB if
# you have less than 8GB of memory, otherwise raise it to at least 256MB
vm.min_free_kbytes = 262144
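The sysctl part goes into a file under /etc/sysctl.d/ and can be loaded right away; the file name below is only an example:

# Load the settings without a reboot (file name is just an example)
sysctl -p /etc/sysctl.d/90-storage-tuning.conf

# Double-check one of the values
sysctl vm.dirty_ratio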
For now, I’m rather pleased with the results …
root:(charon.ka.heimdaheim.de) PWD:/
Wed Jul 09, 15:02:08 [0] > mdadm --detail /dev/md127
/dev/md127:
        Version : 1.2
  Creation Time : Sat Jan 26 18:35:19 2013
     Raid Level : raid5
     Array Size : 15626121216 (14902.23 GiB 16001.15 GB)
  Used Dev Size : 1953265152 (1862.78 GiB 2000.14 GB)
   Raid Devices : 9
  Total Devices : 10
    Persistence : Superblock is persistent

    Update Time : Wed Jul  9 15:03:28 2014
          State : clean
 Active Devices : 9
Working Devices : 10
 Failed Devices : 0
  Spare Devices : 1

         Layout : left-symmetric
     Chunk Size : 512K

           Name : charon:aggr1  (local to host charon)
           UUID : 6d11820f:04847070:2725c434:9ee39718
         Events : 11186

    Number   Major   Minor   RaidDevice State
       0       8      129        0      active sync   /dev/sdi1
       1       8       33        1      active sync   /dev/sdc1
       2       8       49        2      active sync   /dev/sdd1
       4       8       65        3      active sync   /dev/sde1
       5       8       17        4      active sync   /dev/sdb1
      10       8       97        5      active sync   /dev/sdg1
       9       8       81        6      active sync   /dev/sdf1
       8       8      161        7      active sync   /dev/sdk1
       7       8      145        8      active sync   /dev/sdj1

       6       8      113        -      spare   /dev/sdh1
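Out of curiosity, the md-specific knobs set by the udev rules can be checked directly (md127 as in the output above):

# Stripe cache size and read-ahead of the array itself
cat /sys/block/md127/md/stripe_cache_size
blockdev --getra /dev/md127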
And here’s the dd output:
root:(charon.ka.heimdaheim.de) PWD:/
Wed Jul 09, 14:57:32 [0] > dd if=/dev/zero of=/srv/smb/tmp bs=1G count=100 \
    oflag=direct
100+0 records in
100+0 records out
107374182400 bytes (107 GB) copied, 257.341 s, 417 MB/s
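For a rough read number, the same test in the other direction should work, reading the file written above back with O_DIRECT so the page cache doesn't skew the result:

# Read the test file back, bypassing the page cache
dd if=/srv/smb/tmp of=/dev/null bs=1G count=100 iflag=direct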
Well, at last I’m getting somewhere with my troubles. This only seems to happen when creating a RAID5 multiple device with four disks; it doesn’t happen with three.
The next thing I tried was to create a three-disk array, add the fourth disk as a spare and then grow the array onto that fourth disk (roughly the sequence sketched below). After that, all these errors appear again *yuck* So either I possess some rather faulty disks, or something else is fishy, since I’m already running another four-disk RAID5 array built from the old disks …
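A sketch of that three-disk-plus-spare-then-grow sequence, with placeholder device and array names:

# Create a three-disk RAID5 (device names are placeholders)
mdadm --create /dev/md0 --level=5 --raid-devices=3 /dev/sdb1 /dev/sdc1 /dev/sdd1

# Add the fourth disk as a spare
mdadm --add /dev/md0 /dev/sde1

# Grow the array onto the spare; the reshape runs in the background
mdadm --grow /dev/md0 --raid-devices=4

# Watch the reshape (or the errors) come in
cat /proc/mdstat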