Commit b40c2e66 authored by Tino Reichardt's avatar Tino Reichardt Committed by Dave Kleikamp

fs/jfs: TRIM support for JFS Filesystem

This patch adds support for the two linux interfaces of the discard/TRIM
command for SSD devices and sparse/thinly-provisioned LUNs.

JFS will support batched discard via FITRIM ioctl and online discard
with the discard mount option.
Signed-off-by: default avatarTino Reichardt <list-jfs@mcmilk.de>
Signed-off-by: default avatarDave Kleikamp <dave.kleikamp@oracle.com>
parent fbcbe2b3
......@@ -3,6 +3,7 @@ IBM's Journaled File System (JFS) for Linux
JFS Homepage: http://jfs.sourceforge.net/
The following mount options are supported:
(*) == default
iocharset=name Character set to use for converting from Unicode to
ASCII. The default is to do no conversion. Use
......@@ -21,12 +22,12 @@ nointegrity Do not write to the journal. The primary use of this option
from backup media. The integrity of the volume is not
guaranteed if the system abnormally abends.
integrity Default. Commit metadata changes to the journal. Use this
option to remount a volume where the nointegrity option was
integrity(*) Commit metadata changes to the journal. Use this option to
remount a volume where the nointegrity option was
previously specified in order to restore normal behavior.
errors=continue Keep going on a filesystem error.
errors=remount-ro Default. Remount the filesystem read-only on an error.
errors=remount-ro(*) Remount the filesystem read-only on an error.
errors=panic Panic and halt the machine if an error occurs.
uid=value Override on-disk uid with specified value
......@@ -35,6 +36,18 @@ umask=value Override on-disk umask with specified octal value. For
directories, the execute bit will be set if the corresponding
read bit is set.
discard=minlen This enables/disables the use of discard/TRIM commands.
discard The discard/TRIM commands are sent to the underlying
nodiscard(*) block device when blocks are freed. This is useful for SSD
devices and sparse/thinly-provisioned LUNs. The FITRIM ioctl
command is also available together with the nodiscard option.
The value of minlen specifies the minimum blockcount, when
a TRIM command to the block device is considered usefull.
When no value is given to the discard option, it defaults to
64 blocks, which means 256KiB in JFS.
The minlen value of discard overrides the minlen value given
on an FITRIM ioctl().
Please send bugs, comments, cards and letters to shaggy@linux.vnet.ibm.com.
The JFS mailing list can be subscribed to by using the link labeled
......
......@@ -6,7 +6,7 @@ obj-$(CONFIG_JFS_FS) += jfs.o
jfs-y := super.o file.o inode.o namei.o jfs_mount.o jfs_umount.o \
jfs_xtree.o jfs_imap.o jfs_debug.o jfs_dmap.o \
jfs_unicode.o jfs_dtree.o jfs_inode.o \
jfs_unicode.o jfs_dtree.o jfs_inode.o jfs_discard.o \
jfs_extent.o symlink.o jfs_metapage.o \
jfs_logmgr.o jfs_txnmgr.o jfs_uniupr.o \
resize.o xattr.o ioctl.o
......
......@@ -11,13 +11,17 @@
#include <linux/mount.h>
#include <linux/time.h>
#include <linux/sched.h>
#include <linux/blkdev.h>
#include <asm/current.h>
#include <asm/uaccess.h>
#include "jfs_filsys.h"
#include "jfs_debug.h"
#include "jfs_incore.h"
#include "jfs_dinode.h"
#include "jfs_inode.h"
#include "jfs_dmap.h"
#include "jfs_discard.h"
static struct {
long jfs_flag;
......@@ -123,6 +127,40 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
mnt_drop_write_file(filp);
return err;
}
case FITRIM:
{
struct super_block *sb = inode->i_sb;
struct request_queue *q = bdev_get_queue(sb->s_bdev);
struct fstrim_range range;
s64 ret = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (!blk_queue_discard(q)) {
jfs_warn("FITRIM not supported on device");
return -EOPNOTSUPP;
}
if (copy_from_user(&range, (struct fstrim_range __user *)arg,
sizeof(range)))
return -EFAULT;
range.minlen = max_t(unsigned int, range.minlen,
q->limits.discard_granularity);
ret = jfs_ioc_trim(inode, &range);
if (ret < 0)
return ret;
if (copy_to_user((struct fstrim_range __user *)arg, &range,
sizeof(range)))
return -EFAULT;
return 0;
}
default:
return -ENOTTY;
}
......@@ -142,6 +180,9 @@ long jfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case JFS_IOC_SETFLAGS32:
cmd = JFS_IOC_SETFLAGS;
break;
case FITRIM:
cmd = FITRIM;
break;
}
return jfs_ioctl(filp, cmd, arg);
}
......
/*
* Copyright (C) Tino Reichardt, 2012
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
* the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include "jfs_incore.h"
#include "jfs_superblock.h"
#include "jfs_discard.h"
#include "jfs_dmap.h"
#include "jfs_debug.h"
/*
* NAME: jfs_issue_discard()
*
* FUNCTION: TRIM the specified block range on device, if supported
*
* PARAMETERS:
* ip - pointer to in-core inode
* blkno - starting block number to be trimmed (0..N)
* nblocks - number of blocks to be trimmed
*
* RETURN VALUES:
* none
*
* serialization: IREAD_LOCK(ipbmap) held on entry/exit;
*/
void jfs_issue_discard(struct inode *ip, u64 blkno, u64 nblocks)
{
struct super_block *sb = ip->i_sb;
int r = 0;
r = sb_issue_discard(sb, blkno, nblocks, GFP_NOFS, 0);
if (unlikely(r != 0)) {
jfs_err("JFS: sb_issue_discard" \
"(%p, %llu, %llu, GFP_NOFS, 0) = %d => failed!\n",
sb, (unsigned long long)blkno,
(unsigned long long)nblocks, r);
}
jfs_info("JFS: sb_issue_discard" \
"(%p, %llu, %llu, GFP_NOFS, 0) = %d\n",
sb, (unsigned long long)blkno,
(unsigned long long)nblocks, r);
return;
}
/*
* NAME: jfs_ioc_trim()
*
* FUNCTION: attempt to discard (TRIM) all free blocks from the
* filesystem.
*
* PARAMETERS:
* ip - pointer to in-core inode;
* range - the range, given by user space
*
* RETURN VALUES:
* 0 - success
* -EIO - i/o error
*/
int jfs_ioc_trim(struct inode *ip, struct fstrim_range *range)
{
struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
struct super_block *sb = ipbmap->i_sb;
int agno, agno_end;
s64 start, end, minlen;
u64 trimmed = 0;
/**
* convert byte values to block size of filesystem:
* start: First Byte to trim
* len: number of Bytes to trim from start
* minlen: minimum extent length in Bytes
*/
start = range->start >> sb->s_blocksize_bits;
if (start < 0)
start = 0;
end = start + (range->len >> sb->s_blocksize_bits) - 1;
if (end >= bmp->db_mapsize)
end = bmp->db_mapsize - 1;
minlen = range->minlen >> sb->s_blocksize_bits;
if (minlen <= 0)
minlen = 1;
/**
* we trim all ag's within the range
*/
agno = BLKTOAG(start, JFS_SBI(ip->i_sb));
agno_end = BLKTOAG(end, JFS_SBI(ip->i_sb));
while (agno <= agno_end) {
trimmed += dbDiscardAG(ip, agno, minlen);
agno++;
}
range->len = trimmed << sb->s_blocksize_bits;
return 0;
}
/*
* Copyright (C) Tino Reichardt, 2012
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
* the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#ifndef _H_JFS_DISCARD
#define _H_JFS_DISCARD
struct fstrim_range;
extern void jfs_issue_discard(struct inode *ip, u64 blkno, u64 nblocks);
extern int jfs_ioc_trim(struct inode *ip, struct fstrim_range *range);
#endif /* _H_JFS_DISCARD */
/*
* Copyright (C) International Business Machines Corp., 2000-2004
* Portions Copyright (C) Tino Reichardt, 2012
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
......@@ -25,6 +26,7 @@
#include "jfs_lock.h"
#include "jfs_metapage.h"
#include "jfs_debug.h"
#include "jfs_discard.h"
/*
* SERIALIZATION of the Block Allocation Map.
......@@ -104,7 +106,6 @@ static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
int nblocks);
static int dbMaxBud(u8 * cp);
s64 dbMapFileSizeToMapSize(struct inode *ipbmap);
static int blkstol2(s64 nb);
static int cntlz(u32 value);
......@@ -145,7 +146,6 @@ static const s8 budtab[256] = {
2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, -1
};
/*
* NAME: dbMount()
*
......@@ -310,7 +310,6 @@ int dbSync(struct inode *ipbmap)
return (0);
}
/*
* NAME: dbFree()
*
......@@ -337,6 +336,7 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
s64 lblkno, rem;
struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
struct super_block *sb = ipbmap->i_sb;
IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
......@@ -351,6 +351,13 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
return -EIO;
}
/**
* TRIM the blocks, when mounted with discard option
*/
if (JFS_SBI(sb)->flag & JFS_DISCARD)
if (JFS_SBI(sb)->minblks_trim <= nblocks)
jfs_issue_discard(ipbmap, blkno, nblocks);
/*
* free the blocks a dmap at a time.
*/
......@@ -1095,7 +1102,6 @@ static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks)
/* we were not successful */
release_metapage(mp);
return (rc);
}
......@@ -1589,6 +1595,117 @@ static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results)
}
/*
* NAME: dbDiscardAG()
*
* FUNCTION: attempt to discard (TRIM) all free blocks of specific AG
*
* algorithm:
* 1) allocate blocks, as large as possible and save them
* while holding IWRITE_LOCK on ipbmap
* 2) trim all these saved block/length values
* 3) mark the blocks free again
*
* benefit:
* - we work only on one ag at some time, minimizing how long we
* need to lock ipbmap
* - reading / writing the fs is possible most time, even on
* trimming
*
* downside:
* - we write two times to the dmapctl and dmap pages
* - but for me, this seems the best way, better ideas?
* /TR 2012
*
* PARAMETERS:
* ip - pointer to in-core inode
* agno - ag to trim
* minlen - minimum value of contiguous blocks
*
* RETURN VALUES:
* s64 - actual number of blocks trimmed
*/
s64 dbDiscardAG(struct inode *ip, int agno, s64 minlen)
{
struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
s64 nblocks, blkno;
u64 trimmed = 0;
int rc, l2nb;
struct super_block *sb = ipbmap->i_sb;
struct range2trim {
u64 blkno;
u64 nblocks;
} *totrim, *tt;
/* max blkno / nblocks pairs to trim */
int count = 0, range_cnt;
/* prevent others from writing new stuff here, while trimming */
IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP);
nblocks = bmp->db_agfree[agno];
range_cnt = nblocks;
do_div(range_cnt, (int)minlen);
range_cnt = min(range_cnt + 1, 32 * 1024);
totrim = kmalloc(sizeof(struct range2trim) * range_cnt, GFP_NOFS);
if (totrim == NULL) {
jfs_error(bmp->db_ipbmap->i_sb,
"dbDiscardAG: no memory for trim array");
IWRITE_UNLOCK(ipbmap);
return 0;
}
tt = totrim;
while (nblocks >= minlen) {
l2nb = BLKSTOL2(nblocks);
/* 0 = okay, -EIO = fatal, -ENOSPC -> try smaller block */
rc = dbAllocAG(bmp, agno, nblocks, l2nb, &blkno);
if (rc == 0) {
tt->blkno = blkno;
tt->nblocks = nblocks;
tt++; count++;
/* the whole ag is free, trim now */
if (bmp->db_agfree[agno] == 0)
break;
/* give a hint for the next while */
nblocks = bmp->db_agfree[agno];
continue;
} else if (rc == -ENOSPC) {
/* search for next smaller log2 block */
l2nb = BLKSTOL2(nblocks) - 1;
nblocks = 1 << l2nb;
} else {
/* Trim any already allocated blocks */
jfs_error(bmp->db_ipbmap->i_sb,
"dbDiscardAG: -EIO");
break;
}
/* check, if our trim array is full */
if (unlikely(count >= range_cnt - 1))
break;
}
IWRITE_UNLOCK(ipbmap);
tt->nblocks = 0; /* mark the current end */
for (tt = totrim; tt->nblocks != 0; tt++) {
/* when mounted with online discard, dbFree() will
* call jfs_issue_discard() itself */
if (!(JFS_SBI(sb)->flag & JFS_DISCARD))
jfs_issue_discard(ip, tt->blkno, tt->nblocks);
dbFree(ip, tt->blkno, tt->nblocks);
trimmed += tt->nblocks;
}
kfree(totrim);
return trimmed;
}
/*
* NAME: dbFindCtl()
*
......
......@@ -311,4 +311,6 @@ extern int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks);
extern int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks);
extern void dbFinalizeBmap(struct inode *ipbmap);
extern s64 dbMapFileSizeToMapSize(struct inode *ipbmap);
extern s64 dbDiscardAG(struct inode *ip, int agno, s64 minlen);
#endif /* _H_JFS_DMAP */
......@@ -45,6 +45,9 @@
/* mount time flag to disable journaling to disk */
#define JFS_NOINTEGRITY 0x00000040
/* mount time flag to enable TRIM to ssd disks */
#define JFS_DISCARD 0x00000080
/* commit option */
#define JFS_COMMIT 0x00000f00 /* commit option mask */
#define JFS_GROUPCOMMIT 0x00000100 /* group (of 1) commit */
......
......@@ -195,6 +195,7 @@ struct jfs_sb_info {
uint uid; /* uid to override on-disk uid */
uint gid; /* gid to override on-disk gid */
uint umask; /* umask to override on-disk umask */
uint minblks_trim; /* minimum blocks, for online trim */
};
/* jfs_sb_info commit_state */
......
......@@ -33,6 +33,7 @@
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/seq_file.h>
#include <linux/blkdev.h>
#include "jfs_incore.h"
#include "jfs_filsys.h"
......@@ -100,7 +101,7 @@ void jfs_error(struct super_block *sb, const char * function, ...)
vsnprintf(error_buf, sizeof(error_buf), function, args);
va_end(args);
printk(KERN_ERR "ERROR: (device %s): %s\n", sb->s_id, error_buf);
pr_err("ERROR: (device %s): %s\n", sb->s_id, error_buf);
jfs_handle_error(sb);
}
......@@ -197,7 +198,8 @@ static void jfs_put_super(struct super_block *sb)
enum {
Opt_integrity, Opt_nointegrity, Opt_iocharset, Opt_resize,
Opt_resize_nosize, Opt_errors, Opt_ignore, Opt_err, Opt_quota,
Opt_usrquota, Opt_grpquota, Opt_uid, Opt_gid, Opt_umask
Opt_usrquota, Opt_grpquota, Opt_uid, Opt_gid, Opt_umask,
Opt_discard, Opt_nodiscard, Opt_discard_minblk
};
static const match_table_t tokens = {
......@@ -214,6 +216,9 @@ static const match_table_t tokens = {
{Opt_uid, "uid=%u"},
{Opt_gid, "gid=%u"},
{Opt_umask, "umask=%u"},
{Opt_discard, "discard"},
{Opt_nodiscard, "nodiscard"},
{Opt_discard_minblk, "discard=%u"},
{Opt_err, NULL}
};
......@@ -255,8 +260,7 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
else {
nls_map = load_nls(args[0].from);
if (!nls_map) {
printk(KERN_ERR
"JFS: charset not found\n");
pr_err("JFS: charset not found\n");
goto cleanup;
}
}
......@@ -272,8 +276,7 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
*newLVSize = sb->s_bdev->bd_inode->i_size >>
sb->s_blocksize_bits;
if (*newLVSize == 0)
printk(KERN_ERR
"JFS: Cannot determine volume size\n");
pr_err("JFS: Cannot determine volume size\n");
break;
}
case Opt_errors:
......@@ -294,8 +297,7 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
*flag &= ~JFS_ERR_REMOUNT_RO;
*flag |= JFS_ERR_PANIC;
} else {
printk(KERN_ERR
"JFS: %s is an invalid error handler\n",
pr_err("JFS: %s is an invalid error handler\n",
errors);
goto cleanup;
}
......@@ -314,8 +316,7 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
case Opt_usrquota:
case Opt_grpquota:
case Opt_quota:
printk(KERN_ERR
"JFS: quota operations not supported\n");
pr_err("JFS: quota operations not supported\n");
break;
#endif
case Opt_uid:
......@@ -324,23 +325,61 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
sbi->uid = simple_strtoul(uid, &uid, 0);
break;
}
case Opt_gid:
{
char *gid = args[0].from;
sbi->gid = simple_strtoul(gid, &gid, 0);
break;
}
case Opt_umask:
{
char *umask = args[0].from;
sbi->umask = simple_strtoul(umask, &umask, 8);
if (sbi->umask & ~0777) {
printk(KERN_ERR
"JFS: Invalid value of umask\n");
pr_err("JFS: Invalid value of umask\n");
goto cleanup;
}
break;
}
case Opt_discard:
{
struct request_queue *q = bdev_get_queue(sb->s_bdev);
/* if set to 1, even copying files will cause
* trimming :O
* -> user has more control over the online trimming
*/
sbi->minblks_trim = 64;
if (blk_queue_discard(q)) {
*flag |= JFS_DISCARD;
} else {
pr_err("JFS: discard option " \
"not supported on device\n");
}
break;
}
case Opt_nodiscard:
*flag &= ~JFS_DISCARD;
break;
case Opt_discard_minblk:
{
struct request_queue *q = bdev_get_queue(sb->s_bdev);
char *minblks_trim = args[0].from;
if (blk_queue_discard(q)) {
*flag |= JFS_DISCARD;
sbi->minblks_trim = simple_strtoull(
minblks_trim, &minblks_trim, 0);
} else {
pr_err("JFS: discard option " \
"not supported on device\n");
}
break;
}
default:
printk("jfs: Unrecognized mount option \"%s\" "
" or missing value\n", p);
......@@ -374,8 +413,8 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
if (newLVSize) {
if (sb->s_flags & MS_RDONLY) {
printk(KERN_ERR
"JFS: resize requires volume to be mounted read-write\n");
pr_err("JFS: resize requires volume" \
" to be mounted read-write\n");
return -EROFS;
}
rc = jfs_extendfs(sb, newLVSize, 0);
......@@ -457,7 +496,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
#endif
if (newLVSize) {
printk(KERN_ERR "resize option for remount only\n");
pr_err("resize option for remount only\n");
goto out_kfree;
}
......@@ -625,6 +664,8 @@ static int jfs_show_options(struct seq_file *seq, struct dentry *root)
seq_printf(seq, ",umask=%03o", sbi->umask);
if (sbi->flag & JFS_NOINTEGRITY)
seq_puts(seq, ",nointegrity");
if (sbi->flag & JFS_DISCARD)
seq_printf(seq, ",discard=%u", sbi->minblks_trim);
if (sbi->nls_tab)
seq_printf(seq, ",iocharset=%s", sbi->nls_tab->charset);
if (sbi->flag & JFS_ERR_CONTINUE)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment