// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * io.c
 *
 * Buffer cache handling
 *
 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
 */

#include <linux/fs.h>
#include <linux/types.h>
#include <linux/highmem.h>
#include <linux/bio.h>

#include <cluster/masklog.h>

#include "ocfs2.h"

#include "alloc.h"
#include "inode.h"
#include "journal.h"
#include "uptodate.h"
#include "buffer_head_io.h"
#include "ocfs2_trace.h"

/*
 * Bits on bh->b_state used by ocfs2.
 *
 * These MUST be after the JBD2 bits.  Hence, we use BH_JBDPrivateStart.
 */
enum ocfs2_state_bits {
	BH_NeedsValidate = BH_JBDPrivateStart,
};

/* Expand the magic b_state functions */
BUFFER_FNS(NeedsValidate, needs_validate);
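/*
 * BUFFER_FNS() above generates the buffer_needs_validate(),
 * set_buffer_needs_validate() and clear_buffer_needs_validate() helpers
 * that ocfs2_read_blocks() uses to mark buffers whose validate()
 * callback must run once their read completes.
 */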

int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
		      struct ocfs2_caching_info *ci)
{
	int ret = 0;

	trace_ocfs2_write_block((unsigned long long)bh->b_blocknr, ci);

	BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO);
	BUG_ON(buffer_jbd(bh));

	/* No need to check for a soft read-only file system here.
	 * Non-journalled writes are only ever done on system files,
	 * which can get modified during recovery even if read-only. */
	if (ocfs2_is_hard_readonly(osb)) {
		ret = -EROFS;
		mlog_errno(ret);
		goto out;
	}

	ocfs2_metadata_cache_io_lock(ci);

	lock_buffer(bh);
	set_buffer_uptodate(bh);

	/* remove from dirty list before I/O. */
	clear_buffer_dirty(bh);

	get_bh(bh); /* for end_buffer_write_sync() */
	bh->b_end_io = end_buffer_write_sync;
	submit_bh(REQ_OP_WRITE, bh);

	wait_on_buffer(bh);

	if (buffer_uptodate(bh)) {
		ocfs2_set_buffer_uptodate(ci, bh);
	} else {
		/* We don't need to remove the clustered uptodate
		 * information for this bh as it's not marked locally
		 * uptodate. */
		ret = -EIO;
		mlog_errno(ret);
	}

	ocfs2_metadata_cache_io_unlock(ci);
out:
	return ret;
}
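
/*
 * Illustrative sketch only (not part of this file): a caller flushing a
 * single system-file block synchronously would do something like
 *
 *	status = ocfs2_write_block(osb, bh, INODE_CACHE(inode));
 *	if (status)
 *		mlog_errno(status);
 *
 * where INODE_CACHE() resolves the inode's ocfs2_caching_info; which
 * caching_info is passed depends entirely on the caller, the inode case
 * above is just one assumption.
 */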

/* The caller must provide a bhs[] whose entries are either all NULL or
 * all non-NULL, so that read failures are easier to handle.
 */
int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
			   unsigned int nr, struct buffer_head *bhs[])
{
	int status = 0;
	unsigned int i;
	struct buffer_head *bh;
	int new_bh = 0;

	trace_ocfs2_read_blocks_sync((unsigned long long)block, nr);

	if (!nr)
		goto bail;

	/* Don't put the buffer head and re-assign it to NULL if it was
	 * allocated outside, since the caller can't be aware of this
	 * alteration!
	 */
	new_bh = (bhs[0] == NULL);

	for (i = 0 ; i < nr ; i++) {
		if (bhs[i] == NULL) {
			bhs[i] = sb_getblk(osb->sb, block++);
			if (bhs[i] == NULL) {
				status = -ENOMEM;
				mlog_errno(status);
				break;
			}
		}
		bh = bhs[i];

		if (buffer_jbd(bh)) {
			trace_ocfs2_read_blocks_sync_jbd(
					(unsigned long long)bh->b_blocknr);
			continue;
		}

		if (buffer_dirty(bh)) {
			/* This should probably be a BUG, or
			 * at least return an error. */
			mlog(ML_ERROR,
			     "trying to sync read a dirty "
			     "buffer! (blocknr = %llu), skipping\n",
			     (unsigned long long)bh->b_blocknr);
			continue;
		}

		lock_buffer(bh);
		if (buffer_jbd(bh)) {
#ifdef CATCH_BH_JBD_RACES
			mlog(ML_ERROR,
			     "block %llu had the JBD bit set "
			     "while I was in lock_buffer!",
			     (unsigned long long)bh->b_blocknr);
			BUG();
#else
			unlock_buffer(bh);
			continue;
#endif
		}

		get_bh(bh); /* for end_buffer_read_sync() */
		bh->b_end_io = end_buffer_read_sync;
		submit_bh(REQ_OP_READ, bh);
	}

read_failure:
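	/*
	 * Wait for the reads submitted above to complete.  On error, put
	 * the buffer heads this function allocated itself (new_bh) so
	 * they are not leaked, and clear the uptodate bit on
	 * caller-supplied ones.
	 */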
	for (i = nr; i > 0; i--) {
		bh = bhs[i - 1];

		if (unlikely(status)) {
			if (new_bh && bh) {
				/* If middle bh fails, let previous bh
				 * finish its read and then put it to
				 * avoid bh leak
				 */
				if (!buffer_jbd(bh))
					wait_on_buffer(bh);
				put_bh(bh);
				bhs[i - 1] = NULL;
			} else if (bh && buffer_uptodate(bh)) {
				clear_buffer_uptodate(bh);
			}
			continue;
		}

		/* No need to wait on the buffer if it's managed by JBD. */
		if (!buffer_jbd(bh))
			wait_on_buffer(bh);

		if (!buffer_uptodate(bh)) {
			/* Status won't be cleared from here on out,
			 * so we can safely record this and loop back
			 * to cleanup the other buffers. */
			status = -EIO;
			goto read_failure;
		}
	}

bail:
	return status;
}

/* The caller must provide a bhs[] whose entries are either all NULL or
 * all non-NULL, so that read failures are easier to handle.
 */
int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
		      struct buffer_head *bhs[], int flags,
		      int (*validate)(struct super_block *sb,
				      struct buffer_head *bh))
{
	int status = 0;
	int i, ignore_cache = 0;
	struct buffer_head *bh;
	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
	int new_bh = 0;

	trace_ocfs2_read_blocks_begin(ci, (unsigned long long)block, nr, flags);

	BUG_ON(!ci);
	BUG_ON((flags & OCFS2_BH_READAHEAD) &&
	       (flags & OCFS2_BH_IGNORE_CACHE));

	if (bhs == NULL) {
		status = -EINVAL;
		mlog_errno(status);
		goto bail;
	}

	if (nr < 0) {
		mlog(ML_ERROR, "asked to read %d blocks!\n", nr);
		status = -EINVAL;
		mlog_errno(status);
		goto bail;
	}

	if (nr == 0) {
		status = 0;
		goto bail;
	}

	/* Don't put the buffer head and re-assign it to NULL if it was
	 * allocated outside, since the caller can't be aware of this
	 * alteration!
	 */
	new_bh = (bhs[0] == NULL);

	ocfs2_metadata_cache_io_lock(ci);
	for (i = 0 ; i < nr ; i++) {
		if (bhs[i] == NULL) {
			bhs[i] = sb_getblk(sb, block++);
			if (bhs[i] == NULL) {
				status = -ENOMEM;
				mlog_errno(status);
				/* Don't forget to put previous bh! */
				break;
			}
		}
		bh = bhs[i];
		ignore_cache = (flags & OCFS2_BH_IGNORE_CACHE);

		/* There are three read-ahead cases here which we need to
		 * be concerned with. All three assume a buffer has
		 * previously been submitted with OCFS2_BH_READAHEAD
		 * and it hasn't yet completed I/O.
		 *
		 * 1) The current request is sync to disk. This rarely
		 *    happens these days, and never when performance
		 *    matters - the code can just wait on the buffer
		 *    lock and re-submit.
		 *
		 * 2) The current request is cached, but not
		 *    readahead. ocfs2_buffer_uptodate() will return
		 *    false anyway, so we'll wind up waiting on the
		 *    buffer lock to do I/O. We re-check the request
		 *    after getting the lock to avoid a re-submit.
		 *
		 * 3) The current request is readahead (and so must
		 *    also be a caching one). We short circuit if the
		 *    buffer is locked (under I/O) and if it's in the
		 *    uptodate cache. The re-check from #2 catches the
		 *    case that the previous read-ahead completes just
		 *    before our is-it-in-flight check.
		 */

		if (!ignore_cache && !ocfs2_buffer_uptodate(ci, bh)) {
			trace_ocfs2_read_blocks_from_disk(
			     (unsigned long long)bh->b_blocknr,
			     (unsigned long long)ocfs2_metadata_cache_owner(ci));
			/* We're using ignore_cache here to say
			 * "go to disk" */
			ignore_cache = 1;
		}

		trace_ocfs2_read_blocks_bh((unsigned long long)bh->b_blocknr,
			ignore_cache, buffer_jbd(bh), buffer_dirty(bh));

		if (buffer_jbd(bh)) {
			continue;
		}

		if (ignore_cache) {
			if (buffer_dirty(bh)) {
				/* This should probably be a BUG, or
				 * at least return an error. */
				continue;
			}

			/* A read-ahead request was made - if the
			 * buffer is already under read-ahead from a
			 * previously submitted request then we are
			 * done here. */
			if ((flags & OCFS2_BH_READAHEAD)
			    && ocfs2_buffer_read_ahead(ci, bh))
				continue;

			lock_buffer(bh);
			if (buffer_jbd(bh)) {
#ifdef CATCH_BH_JBD_RACES
				mlog(ML_ERROR, "block %llu had the JBD bit set "
					       "while I was in lock_buffer!",
				     (unsigned long long)bh->b_blocknr);
				BUG();
#else
				unlock_buffer(bh);
				continue;
#endif
			}

			/* Re-check ocfs2_buffer_uptodate() as a
			 * previously read-ahead buffer may have
			 * completed I/O while we were waiting for the
			 * buffer lock. */
			if (!(flags & OCFS2_BH_IGNORE_CACHE)
			    && !(flags & OCFS2_BH_READAHEAD)
			    && ocfs2_buffer_uptodate(ci, bh)) {
				unlock_buffer(bh);
				continue;
			}

			get_bh(bh); /* for end_buffer_read_sync() */
			if (validate)
				set_buffer_needs_validate(bh);
			bh->b_end_io = end_buffer_read_sync;
			submit_bh(REQ_OP_READ, bh);
			continue;
		}
	}

read_failure:
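	/*
	 * Completion and error-unwind loop: wait for the reads issued
	 * above, run the validate() callback on freshly read buffers,
	 * and on error put the buffer heads allocated here while
	 * clearing the uptodate bit on the rest.  Read-ahead requests
	 * are left to complete asynchronously, but are still added to
	 * the uptodate cache below.
	 */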
	for (i = (nr - 1); i >= 0; i--) {
		bh = bhs[i];

		if (!(flags & OCFS2_BH_READAHEAD)) {
			if (unlikely(status)) {
				/* Clear the buffers on error, including those
				 * that have already been read successfully.
				 */
				if (new_bh && bh) {
					/* If middle bh fails, let previous bh
					 * finish its read and then put it to
					 * avoid bh leak
					 */
					if (!buffer_jbd(bh))
						wait_on_buffer(bh);
					put_bh(bh);
					bhs[i] = NULL;
				} else if (bh && buffer_uptodate(bh)) {
					clear_buffer_uptodate(bh);
				}
				continue;
			}
			/* We know this can't have changed as we hold the
			 * owner sem. Avoid doing any work on the bh if the
			 * journal has it. */
			if (!buffer_jbd(bh))
				wait_on_buffer(bh);

			if (!buffer_uptodate(bh)) {
				/* Status won't be cleared from here on out,
				 * so we can safely record this and loop back
				 * to cleanup the other buffers. Don't need to
				 * remove the clustered uptodate information
				 * for this bh as it's not marked locally
				 * uptodate. */
				status = -EIO;
				clear_buffer_needs_validate(bh);
				goto read_failure;
			}

			if (buffer_needs_validate(bh)) {
				/* We never set NeedsValidate if the
				 * buffer was held by the journal, so
				 * that better not have changed */
				BUG_ON(buffer_jbd(bh));
				clear_buffer_needs_validate(bh);
				status = validate(sb, bh);
				if (status)
					goto read_failure;
			}
		}

		/* Always set the buffer in the cache, even if it was
		 * a forced read, or read-ahead which hasn't yet
		 * completed. */
		if (bh)
			ocfs2_set_buffer_uptodate(ci, bh);
	}
	ocfs2_metadata_cache_io_unlock(ci);

	trace_ocfs2_read_blocks_end((unsigned long long)block, nr,
				    flags, ignore_cache);

bail:

	return status;
}
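
/*
 * Illustrative sketch only: a typical single-block metadata read with a
 * validate callback (e.g. something along the lines of
 * ocfs2_validate_inode_block()) would look like
 *
 *	struct buffer_head *bh = NULL;
 *
 *	status = ocfs2_read_blocks(INODE_CACHE(inode), blkno, 1, &bh, 0,
 *				   ocfs2_validate_inode_block);
 *	if (!status) {
 *		... use bh->b_data ...
 *		brelse(bh);
 *	}
 *
 * The single-block wrapper ocfs2_read_block() in buffer_head_io.h covers
 * exactly this case.
 */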

/* Check whether the blkno is the super block or one of the backups. */
static void ocfs2_check_super_or_backup(struct super_block *sb,
					sector_t blkno)
{
	int i;
	u64 backup_blkno;

	if (blkno == OCFS2_SUPER_BLOCK_BLKNO)
		return;

	for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
		backup_blkno = ocfs2_backup_super_blkno(sb, i);
		if (backup_blkno == blkno)
			return;
	}

	BUG();
}

/*
 * Writing the super block and its backups doesn't need to collaborate with
 * the journal, so we don't need to lock ip_io_mutex and ci doesn't need to
 * be passed into this function.
 */
int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
				struct buffer_head *bh)
{
	int ret = 0;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;

	BUG_ON(buffer_jbd(bh));
	ocfs2_check_super_or_backup(osb->sb, bh->b_blocknr);

	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) {
		ret = -EROFS;
		mlog_errno(ret);
		goto out;
	}

	lock_buffer(bh);
	set_buffer_uptodate(bh);

	/* remove from dirty list before I/O. */
	clear_buffer_dirty(bh);

	get_bh(bh); /* for end_buffer_write_sync() */
	bh->b_end_io = end_buffer_write_sync;
	ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &di->i_check);
	submit_bh(REQ_OP_WRITE, bh);

	wait_on_buffer(bh);

	if (!buffer_uptodate(bh)) {
		ret = -EIO;
		mlog_errno(ret);
	}

out:
	return ret;
}
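
/*
 * Illustrative sketch only: a caller holding the super block's
 * buffer_head (for example the one read at mount time) would push an
 * update to disk with
 *
 *	status = ocfs2_write_super_or_backup(osb, super_bh);
 *	if (status)
 *		mlog_errno(status);
 *
 * where super_bh names whatever buffer_head the caller tracks for the
 * super block or one of its backup locations.
 */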