    btrfs: reorder some members of struct btrfs_delayed_ref_head · 315dd5cc
    Filipe Manana authored
    Currently struct btrfs_delayed_ref_head has its 'bytenr' and 'href_node'
    members in different cache lines (even on a release, non-debug, kernel).
    This is not optimal because when iterating the red black tree of delayed
    ref heads to insert a new delayed ref head (htree_insert()) we have to
    pull in 2 cache lines for each delayed ref head we find along the path:
    one for the tree node (struct rb_node) and another one for the 'bytenr'
    field. The same applies when searching for an existing delayed ref head
    (find_ref_head()).
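    To make the access pattern concrete, below is a minimal user-space sketch
    (not the kernel code, which uses struct rb_node, rb_entry() and the cached
    rbtree helpers) of the lookup pattern find_ref_head() follows: every head
    visited along the tree path is touched both through its embedded node and
    through its 'bytenr' key, so keeping the two in the same cache line can
    halve the cache lines fetched per visited head.

      #include <stddef.h>
      #include <stdint.h>

      struct node {                     /* stand-in for struct rb_node */
              struct node *left, *right;
      };

      struct ref_head {                 /* stand-in for btrfs_delayed_ref_head */
              uint64_t bytenr;          /* search key */
              struct node href_node;    /* embedded tree linkage */
      };

      /* Simplified container_of(): go from the embedded node to its head. */
      #define head_of(n) \
              ((struct ref_head *)((char *)(n) - offsetof(struct ref_head, href_node)))

      static struct ref_head *find_head(struct node *root, uint64_t bytenr)
      {
              while (root) {
                      struct ref_head *h = head_of(root);

                      if (bytenr < h->bytenr)       /* reads the head's key... */
                              root = root->left;    /* ...and its node pointers */
                      else if (bytenr > h->bytenr)
                              root = root->right;
                      else
                              return h;
              }
              return NULL;
      }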
    On a release (non-debug) kernel, the structure also has two 4 byte holes,
    which make it 8 bytes longer than necessary. Its current layout is the
    following:
    
      struct btrfs_delayed_ref_head {
              u64                        bytenr;               /*     0     8 */
              u64                        num_bytes;            /*     8     8 */
              refcount_t                 refs;                 /*    16     4 */
    
              /* XXX 4 bytes hole, try to pack */
    
              struct mutex               mutex;                /*    24    32 */
              spinlock_t                 lock;                 /*    56     4 */
    
              /* XXX 4 bytes hole, try to pack */
    
              /* --- cacheline 1 boundary (64 bytes) --- */
              struct rb_root_cached      ref_tree;             /*    64    16 */
              struct list_head           ref_add_list;         /*    80    16 */
              struct rb_node             href_node __attribute__((__aligned__(8))); /*    96    24 */
              struct btrfs_delayed_extent_op * extent_op;      /*   120     8 */
              /* --- cacheline 2 boundary (128 bytes) --- */
              int                        total_ref_mod;        /*   128     4 */
              int                        ref_mod;              /*   132     4 */
              unsigned int               must_insert_reserved:1; /*   136: 0  4 */
              unsigned int               is_data:1;            /*   136: 1  4 */
              unsigned int               is_system:1;          /*   136: 2  4 */
              unsigned int               processing:1;         /*   136: 3  4 */
    
              /* size: 144, cachelines: 3, members: 15 */
              /* sum members: 128, holes: 2, sum holes: 8 */
              /* sum bitfield members: 4 bits (0 bytes) */
              /* padding: 4 */
              /* bit_padding: 28 bits */
              /* forced alignments: 1 */
              /* last cacheline: 16 bytes */
      } __attribute__((__aligned__(8)));
    
    This change reorders the 'href_node' and 'refs' members so that
    'href_node' is in the same cache line as the 'bytenr' field, while also
    eliminating the two holes and reducing the structure size from 144 bytes
    down to 136 bytes, so we can now have 30 ref heads per 4K page (on x86_64)
    instead of 28. The structure layout after this change is the following:
    
      struct btrfs_delayed_ref_head {
              u64                        bytenr;               /*     0     8 */
              u64                        num_bytes;            /*     8     8 */
              struct rb_node             href_node __attribute__((__aligned__(8))); /*    16    24 */
              struct mutex               mutex;                /*    40    32 */
              /* --- cacheline 1 boundary (64 bytes) was 8 bytes ago --- */
              refcount_t                 refs;                 /*    72     4 */
              spinlock_t                 lock;                 /*    76     4 */
              struct rb_root_cached      ref_tree;             /*    80    16 */
              struct list_head           ref_add_list;         /*    96    16 */
              struct btrfs_delayed_extent_op * extent_op;      /*   112     8 */
              int                        total_ref_mod;        /*   120     4 */
              int                        ref_mod;              /*   124     4 */
              /* --- cacheline 2 boundary (128 bytes) --- */
              unsigned int               must_insert_reserved:1; /*   128: 0  4 */
              unsigned int               is_data:1;            /*   128: 1  4 */
              unsigned int               is_system:1;          /*   128: 2  4 */
              unsigned int               processing:1;         /*   128: 3  4 */
    
              /* size: 136, cachelines: 3, members: 15 */
              /* padding: 4 */
              /* bit_padding: 28 bits */
              /* forced alignments: 1 */
              /* last cacheline: 8 bytes */
      } __attribute__((__aligned__(8)));
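    For reference, a declaration-order sketch matching the new pahole layout
    above (kernel comments omitted; the exact definition lives in
    delayed-ref.h):

      struct btrfs_delayed_ref_head {
              u64 bytenr;
              u64 num_bytes;
              /* Same cache line as 'bytenr' for cheaper rbtree walks. */
              struct rb_node href_node;
              struct mutex mutex;
              refcount_t refs;
              spinlock_t lock;
              struct rb_root_cached ref_tree;
              struct list_head ref_add_list;
              struct btrfs_delayed_extent_op *extent_op;
              int total_ref_mod;
              int ref_mod;
              unsigned int must_insert_reserved:1;
              unsigned int is_data:1;
              unsigned int is_system:1;
              unsigned int processing:1;
      };

    With the size down from 144 to 136 bytes, a 4K page fits 4096 / 136 = 30
    ref heads (truncated) instead of 4096 / 144 = 28, which is where the
    numbers quoted above come from.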
    
    Running the following fs_mark test shows a significant improvement.
    
      $ cat test.sh
      #!/bin/bash
    
      # 15G null block device
      DEV=/dev/nullb0
      MNT=/mnt/nullb0
      FILES=100000
      THREADS=$(nproc --all)
      FILE_SIZE=0
    
      echo "performance" | \
          tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
    
      mkfs.btrfs -f $DEV
      mount -o ssd $DEV $MNT
    
      OPTS="-S 0 -L 5 -n $FILES -s $FILE_SIZE -t $THREADS -k"
      for ((i = 1; i <= $THREADS; i++)); do
          OPTS="$OPTS -d $MNT/d$i"
      done
    
      fs_mark $OPTS
    
      umount $MNT
    
    Before this change:
    
    FSUse%        Count         Size    Files/sec     App Overhead
        10      1200000            0     112631.3         11928055
        16      2400000            0     189943.8         12140777
        23      3600000            0     150719.2         13178480
        50      4800000            0      99137.3         12504293
        53      6000000            0     111733.9         12670836
    
                        Total files/sec: 664165.5
    
    After this change:
    
    FSUse%        Count         Size    Files/sec     App Overhead
        10      1200000            0     148589.5         11565889
        16      2400000            0     227743.8         11561596
        23      3600000            0     191590.5         12550755
        30      4800000            0     179812.3         12629610
        53      6000000            0      92471.4         12352383
    
                        Total files/sec: 840207.5
    
    Measuring the execution times of htree_insert(), in nanoseconds, during
    those fs_mark runs:
    
    Before this change:
    
      Range:  0.000 - 940647.000; Mean: 619.733; Median: 548.000; Stddev: 1834.231
      Percentiles:  90th: 980.000; 95th: 1208.000; 99th: 2090.000
         0.000 -    6.384:       257 |
         6.384 -   26.259:       977 |
        26.259 -   99.635:      4963 |
        99.635 -  370.526:    136800 #############
       370.526 - 1370.603:    566110 #####################################################
      1370.603 - 5062.704:     24945 ##
      5062.704 - 18693.248:      944 |
      18693.248 - 69014.670:     211 |
      69014.670 - 254791.959:     30 |
      254791.959 - 940647.000:     4 |
    
    After this change:
    
      Range:  0.000 - 299200.000; Mean: 587.754; Median: 542.000; Stddev: 1030.422
      Percentiles:  90th: 918.000; 95th: 1113.000; 99th: 1987.000
         0.000 -    5.585:      163 |
         5.585 -   20.678:      452 |
        20.678 -   70.369:     1806 |
        70.369 -  233.965:    26268 ####
       233.965 -  772.564:   333519 #####################################################
       772.564 - 2545.771:    91820 ###############
      2545.771 - 8383.615:     2238 |
      8383.615 - 27603.280:     170 |
      27603.280 - 90879.297:     68 |
      90879.297 - 299200.000:    12 |
    
    The mean, the percentiles and the maximum times are all better, and the
    standard deviation is lower.

    Signed-off-by: Filipe Manana <fdmanana@suse.com>
    Reviewed-by: David Sterba <dsterba@suse.com>
    Signed-off-by: David Sterba <dsterba@suse.com>