-
Filipe Manana authored
Currently struct btrfs_delayed_ref_head has its 'bytenr' and 'href_node' members in different cache lines (even on a release, non-debug, kernel). This is not optimal because when iterating the red-black tree of delayed ref heads for inserting a new delayed ref head (htree_insert()) we have to pull in 2 cache lines of the delayed ref heads we find along a path, one for the tree node (struct rb_node) and another one for the 'bytenr' field. The same applies when searching for an existing delayed ref head (find_ref_head()). On a release (non-debug) kernel, the structure also has two 4-byte holes, which makes it 8 bytes longer than necessary. Its current layout is the following: struct btrfs_delayed_ref_head { u64 bytenr; /* 0 8 */ u64 num_bytes; /* 8 8 */ refcount_t refs; /* 16 4 */ /* XXX 4 bytes hole, try to pack */ struct mutex mutex; /* 24 32 */ spinlock_t lock; /* 56 4 */ /* XXX 4 bytes hole, try to pack */ /* --- cacheline 1 boundary (64 bytes) --- */ struct rb_root_cached ref_tree; /* 64 16 */ struct list_head ref_add_list; /* 80 16 */ struct rb_node href_node __attribute__((__aligned__(8))); /* 96 24 */ struct btrfs_delayed_extent_op * extent_op; /* 120 8 */ /* --- cacheline 2 boundary (128 bytes) --- */ int total_ref_mod; /* 128 4 */ int ref_mod; /* 132 4 */ unsigned int must_insert_reserved:1; /* 136: 0 4 */ unsigned int is_data:1; /* 136: 1 4 */ unsigned int is_system:1; /* 136: 2 4 */ unsigned int processing:1; /* 136: 3 4 */ /* size: 144, cachelines: 3, members: 15 */ /* sum members: 128, holes: 2, sum holes: 8 */ /* sum bitfield members: 4 bits (0 bytes) */ /* padding: 4 */ /* bit_padding: 28 bits */ /* forced alignments: 1 */ /* last cacheline: 16 bytes */ } __attribute__((__aligned__(8))); This change reorders the 'href_node' and 'refs' members so that we have the 'href_node' in the same cache line as the 'bytenr' field, while also eliminating the two holes and reducing the structure size from 144 bytes down to 136 bytes, so we can now have 30 ref heads per 4K 
page (on x86_64) instead of 28. The new structure layout after this change is now: struct btrfs_delayed_ref_head { u64 bytenr; /* 0 8 */ u64 num_bytes; /* 8 8 */ struct rb_node href_node __attribute__((__aligned__(8))); /* 16 24 */ struct mutex mutex; /* 40 32 */ /* --- cacheline 1 boundary (64 bytes) was 8 bytes ago --- */ refcount_t refs; /* 72 4 */ spinlock_t lock; /* 76 4 */ struct rb_root_cached ref_tree; /* 80 16 */ struct list_head ref_add_list; /* 96 16 */ struct btrfs_delayed_extent_op * extent_op; /* 112 8 */ int total_ref_mod; /* 120 4 */ int ref_mod; /* 124 4 */ /* --- cacheline 2 boundary (128 bytes) --- */ unsigned int must_insert_reserved:1; /* 128: 0 4 */ unsigned int is_data:1; /* 128: 1 4 */ unsigned int is_system:1; /* 128: 2 4 */ unsigned int processing:1; /* 128: 3 4 */ /* size: 136, cachelines: 3, members: 15 */ /* padding: 4 */ /* bit_padding: 28 bits */ /* forced alignments: 1 */ /* last cacheline: 8 bytes */ } __attribute__((__aligned__(8))); Running the following fs_mark test shows some significant improvement. 
$ cat test.sh #!/bin/bash # 15G null block device DEV=/dev/nullb0 MNT=/mnt/nullb0 FILES=100000 THREADS=$(nproc --all) FILE_SIZE=0 echo "performance" | \ tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor mkfs.btrfs -f $DEV mount -o ssd $DEV $MNT OPTS="-S 0 -L 5 -n $FILES -s $FILE_SIZE -t $THREADS -k" for ((i = 1; i <= $THREADS; i++)); do OPTS="$OPTS -d $MNT/d$i" done fs_mark $OPTS umount $MNT Before this change: FSUse% Count Size Files/sec App Overhead 10 1200000 0 112631.3 11928055 16 2400000 0 189943.8 12140777 23 3600000 0 150719.2 13178480 50 4800000 0 99137.3 12504293 53 6000000 0 111733.9 12670836 Total files/sec: 664165.5 After this change: FSUse% Count Size Files/sec App Overhead 10 1200000 0 148589.5 11565889 16 2400000 0 227743.8 11561596 23 3600000 0 191590.5 12550755 30 4800000 0 179812.3 12629610 53 6000000 0 92471.4 12352383 Total files/sec: 840207.5 Measuring the execution times of htree_insert(), in nanoseconds, during those fs_mark runs: Before this change: Range: 0.000 - 940647.000; Mean: 619.733; Median: 548.000; Stddev: 1834.231 Percentiles: 90th: 980.000; 95th: 1208.000; 99th: 2090.000 0.000 - 6.384: 257 | 6.384 - 26.259: 977 | 26.259 - 99.635: 4963 | 99.635 - 370.526: 136800 ############# 370.526 - 1370.603: 566110 ##################################################### 1370.603 - 5062.704: 24945 ## 5062.704 - 18693.248: 944 | 18693.248 - 69014.670: 211 | 69014.670 - 254791.959: 30 | 254791.959 - 940647.000: 4 | After this change: Range: 0.000 - 299200.000; Mean: 587.754; Median: 542.000; Stddev: 1030.422 Percentiles: 90th: 918.000; 95th: 1113.000; 99th: 1987.000 0.000 - 5.585: 163 | 5.585 - 20.678: 452 | 20.678 - 70.369: 1806 | 70.369 - 233.965: 26268 #### 233.965 - 772.564: 333519 ##################################################### 772.564 - 2545.771: 91820 ############### 2545.771 - 8383.615: 2238 | 8383.615 - 27603.280: 170 | 27603.280 - 90879.297: 68 | 90879.297 - 299200.000: 12 | Mean, percentiles, maximum times are all better, as 
well as the standard deviation. Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
315dd5cc