Commit feaf222925cdfbc841a695fd30df8c6d0a694146
Exists in ti-lsk-linux-4.1.y and in 10 other branches
Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
Pull ext4 fixes from Ted Ts'o:
 "Ext4 bug fixes.  We also reserved code points for encryption and
  read-only images (for which the implementation is mostly just the
  reserved code point for a read-only feature :-)"

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
  ext4: fix indirect punch hole corruption
  ext4: ignore journal checksum on remount; don't fail
  ext4: remove duplicate remount check for JOURNAL_CHECKSUM change
  ext4: fix mmap data corruption in nodelalloc mode when blocksize < pagesize
  ext4: support read-only images
  ext4: change to use setup_timer() instead of init_timer()
  ext4: reserve codepoints used by the ext4 encryption feature
  jbd2: complain about descriptor block checksum errors
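One of the five files changed by this pull (fs/ext4/super.c, not reproduced below) carries the setup_timer() conversion named in the shortlog. As a hedged, self-contained sketch of what that cleanup looks like: struct timer_list, init_timer(), and setup_timer() are mocked here so the example compiles in userspace, and the callback name print_daily_error_info is borrowed from ext4's error-report timer; only the before/after shape of the call sites reflects the actual commit.

    /* Userspace mock of the init_timer() -> setup_timer() cleanup pattern. */
    #include <stdio.h>

    struct timer_list {                          /* mock of the kernel struct */
            void (*function)(unsigned long);
            unsigned long data;
    };

    static void init_timer(struct timer_list *t)
    {
            t->function = NULL;
            t->data = 0;
    }

    /* setup_timer() folds the three-step initialization into one call. */
    static void setup_timer(struct timer_list *t,
                            void (*fn)(unsigned long), unsigned long data)
    {
            init_timer(t);
            t->function = fn;
            t->data = data;
    }

    static void print_daily_error_info(unsigned long arg)
    {
            printf("error report, argument %#lx\n", arg);
    }

    int main(void)
    {
            struct timer_list t;

            /* before the commit: init_timer() plus two field assignments */
            init_timer(&t);
            t.function = print_daily_error_info;
            t.data = 0x1000UL;

            /* after the commit: a single setup_timer() call */
            setup_timer(&t, print_daily_error_info, 0x1000UL);

            t.function(t.data);                  /* fire the callback once */
            return 0;
    }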
Showing 5 changed files (inline diff)
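The most visible change in the header below reserves on-disk flag bit 0x00000800, formerly EXT4_ECOMPR_FL, as EXT4_ENCRYPT_FL, and renames the matching bit index EXT4_INODE_ECOMPR to EXT4_INODE_ENCRYPT; the header's existing CHECK_FLAG_VALUE() machinery then enforces at compile time that the hex value and the bit index stay in sync. Here is a minimal userspace sketch of that consistency check, with C11 _Static_assert standing in for the kernel's BUILD_BUG_ON() (names and values are taken from the diff; the standalone framing is an assumption):

    /* Sketch of ext4's flag-consistency check, outside the kernel. */
    #define EXT4_ENCRYPT_FL 0x00000800   /* on-disk flag bit (was ECOMPR) */

    enum { EXT4_INODE_ENCRYPT = 11 };    /* bit index for atomic set/get */

    /* Compilation fails if the hex value and the bit index ever diverge:
     * 1 << 11 == 0x800, so this assertion holds. */
    #define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
    _Static_assert(TEST_FLAG_VALUE(ENCRYPT), "EXT4_ENCRYPT_FL out of sync");

    int main(void) { return 0; }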
fs/ext4/ext4.h
1 | /* | 1 | /* |
2 | * ext4.h | 2 | * ext4.h |
3 | * | 3 | * |
4 | * Copyright (C) 1992, 1993, 1994, 1995 | 4 | * Copyright (C) 1992, 1993, 1994, 1995 |
5 | * Remy Card (card@masi.ibp.fr) | 5 | * Remy Card (card@masi.ibp.fr) |
6 | * Laboratoire MASI - Institut Blaise Pascal | 6 | * Laboratoire MASI - Institut Blaise Pascal |
7 | * Universite Pierre et Marie Curie (Paris VI) | 7 | * Universite Pierre et Marie Curie (Paris VI) |
8 | * | 8 | * |
9 | * from | 9 | * from |
10 | * | 10 | * |
11 | * linux/include/linux/minix_fs.h | 11 | * linux/include/linux/minix_fs.h |
12 | * | 12 | * |
13 | * Copyright (C) 1991, 1992 Linus Torvalds | 13 | * Copyright (C) 1991, 1992 Linus Torvalds |
14 | */ | 14 | */ |
15 | 15 | ||
16 | #ifndef _EXT4_H | 16 | #ifndef _EXT4_H |
17 | #define _EXT4_H | 17 | #define _EXT4_H |
18 | 18 | ||
19 | #include <linux/types.h> | 19 | #include <linux/types.h> |
20 | #include <linux/blkdev.h> | 20 | #include <linux/blkdev.h> |
21 | #include <linux/magic.h> | 21 | #include <linux/magic.h> |
22 | #include <linux/jbd2.h> | 22 | #include <linux/jbd2.h> |
23 | #include <linux/quota.h> | 23 | #include <linux/quota.h> |
24 | #include <linux/rwsem.h> | 24 | #include <linux/rwsem.h> |
25 | #include <linux/rbtree.h> | 25 | #include <linux/rbtree.h> |
26 | #include <linux/seqlock.h> | 26 | #include <linux/seqlock.h> |
27 | #include <linux/mutex.h> | 27 | #include <linux/mutex.h> |
28 | #include <linux/timer.h> | 28 | #include <linux/timer.h> |
29 | #include <linux/wait.h> | 29 | #include <linux/wait.h> |
30 | #include <linux/blockgroup_lock.h> | 30 | #include <linux/blockgroup_lock.h> |
31 | #include <linux/percpu_counter.h> | 31 | #include <linux/percpu_counter.h> |
32 | #include <linux/ratelimit.h> | 32 | #include <linux/ratelimit.h> |
33 | #include <crypto/hash.h> | 33 | #include <crypto/hash.h> |
34 | #include <linux/falloc.h> | 34 | #include <linux/falloc.h> |
35 | #ifdef __KERNEL__ | 35 | #ifdef __KERNEL__ |
36 | #include <linux/compat.h> | 36 | #include <linux/compat.h> |
37 | #endif | 37 | #endif |
38 | 38 | ||
39 | /* | 39 | /* |
40 | * The fourth extended filesystem constants/structures | 40 | * The fourth extended filesystem constants/structures |
41 | */ | 41 | */ |
42 | 42 | ||
43 | /* | 43 | /* |
44 | * Define EXT4FS_DEBUG to produce debug messages | 44 | * Define EXT4FS_DEBUG to produce debug messages |
45 | */ | 45 | */ |
46 | #undef EXT4FS_DEBUG | 46 | #undef EXT4FS_DEBUG |
47 | 47 | ||
48 | /* | 48 | /* |
49 | * Debug code | 49 | * Debug code |
50 | */ | 50 | */ |
51 | #ifdef EXT4FS_DEBUG | 51 | #ifdef EXT4FS_DEBUG |
52 | #define ext4_debug(f, a...) \ | 52 | #define ext4_debug(f, a...) \ |
53 | do { \ | 53 | do { \ |
54 | printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ | 54 | printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ |
55 | __FILE__, __LINE__, __func__); \ | 55 | __FILE__, __LINE__, __func__); \ |
56 | printk(KERN_DEBUG f, ## a); \ | 56 | printk(KERN_DEBUG f, ## a); \ |
57 | } while (0) | 57 | } while (0) |
58 | #else | 58 | #else |
59 | #define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) | 59 | #define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) |
60 | #endif | 60 | #endif |
61 | 61 | ||
62 | /* | 62 | /* |
63 | * Turn on EXT_DEBUG to get lots of info about extents operations. | 63 | * Turn on EXT_DEBUG to get lots of info about extents operations. |
64 | */ | 64 | */ |
65 | #define EXT_DEBUG__ | 65 | #define EXT_DEBUG__ |
66 | #ifdef EXT_DEBUG | 66 | #ifdef EXT_DEBUG |
67 | #define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) | 67 | #define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) |
68 | #else | 68 | #else |
69 | #define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) | 69 | #define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) |
70 | #endif | 70 | #endif |
71 | 71 | ||
72 | #define EXT4_ERROR_INODE(inode, fmt, a...) \ | 72 | #define EXT4_ERROR_INODE(inode, fmt, a...) \ |
73 | ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) | 73 | ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) |
74 | 74 | ||
75 | #define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \ | 75 | #define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \ |
76 | ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a) | 76 | ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a) |
77 | 77 | ||
78 | #define EXT4_ERROR_FILE(file, block, fmt, a...) \ | 78 | #define EXT4_ERROR_FILE(file, block, fmt, a...) \ |
79 | ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) | 79 | ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) |
80 | 80 | ||
81 | /* data type for block offset of block group */ | 81 | /* data type for block offset of block group */ |
82 | typedef int ext4_grpblk_t; | 82 | typedef int ext4_grpblk_t; |
83 | 83 | ||
84 | /* data type for filesystem-wide blocks number */ | 84 | /* data type for filesystem-wide blocks number */ |
85 | typedef unsigned long long ext4_fsblk_t; | 85 | typedef unsigned long long ext4_fsblk_t; |
86 | 86 | ||
87 | /* data type for file logical block number */ | 87 | /* data type for file logical block number */ |
88 | typedef __u32 ext4_lblk_t; | 88 | typedef __u32 ext4_lblk_t; |
89 | 89 | ||
90 | /* data type for block group number */ | 90 | /* data type for block group number */ |
91 | typedef unsigned int ext4_group_t; | 91 | typedef unsigned int ext4_group_t; |
92 | 92 | ||
93 | /* | 93 | /* |
94 | * Flags used in mballoc's allocation_context flags field. | 94 | * Flags used in mballoc's allocation_context flags field. |
95 | * | 95 | * |
96 | * Also used to show what's going on for debugging purposes when the | 96 | * Also used to show what's going on for debugging purposes when the |
97 | * flag field is exported via the traceport interface | 97 | * flag field is exported via the traceport interface |
98 | */ | 98 | */ |
99 | 99 | ||
100 | /* prefer goal again. length */ | 100 | /* prefer goal again. length */ |
101 | #define EXT4_MB_HINT_MERGE 0x0001 | 101 | #define EXT4_MB_HINT_MERGE 0x0001 |
102 | /* blocks already reserved */ | 102 | /* blocks already reserved */ |
103 | #define EXT4_MB_HINT_RESERVED 0x0002 | 103 | #define EXT4_MB_HINT_RESERVED 0x0002 |
104 | /* metadata is being allocated */ | 104 | /* metadata is being allocated */ |
105 | #define EXT4_MB_HINT_METADATA 0x0004 | 105 | #define EXT4_MB_HINT_METADATA 0x0004 |
106 | /* first blocks in the file */ | 106 | /* first blocks in the file */ |
107 | #define EXT4_MB_HINT_FIRST 0x0008 | 107 | #define EXT4_MB_HINT_FIRST 0x0008 |
108 | /* search for the best chunk */ | 108 | /* search for the best chunk */ |
109 | #define EXT4_MB_HINT_BEST 0x0010 | 109 | #define EXT4_MB_HINT_BEST 0x0010 |
110 | /* data is being allocated */ | 110 | /* data is being allocated */ |
111 | #define EXT4_MB_HINT_DATA 0x0020 | 111 | #define EXT4_MB_HINT_DATA 0x0020 |
112 | /* don't preallocate (for tails) */ | 112 | /* don't preallocate (for tails) */ |
113 | #define EXT4_MB_HINT_NOPREALLOC 0x0040 | 113 | #define EXT4_MB_HINT_NOPREALLOC 0x0040 |
114 | /* allocate for locality group */ | 114 | /* allocate for locality group */ |
115 | #define EXT4_MB_HINT_GROUP_ALLOC 0x0080 | 115 | #define EXT4_MB_HINT_GROUP_ALLOC 0x0080 |
116 | /* allocate goal blocks or none */ | 116 | /* allocate goal blocks or none */ |
117 | #define EXT4_MB_HINT_GOAL_ONLY 0x0100 | 117 | #define EXT4_MB_HINT_GOAL_ONLY 0x0100 |
118 | /* goal is meaningful */ | 118 | /* goal is meaningful */ |
119 | #define EXT4_MB_HINT_TRY_GOAL 0x0200 | 119 | #define EXT4_MB_HINT_TRY_GOAL 0x0200 |
120 | /* blocks already pre-reserved by delayed allocation */ | 120 | /* blocks already pre-reserved by delayed allocation */ |
121 | #define EXT4_MB_DELALLOC_RESERVED 0x0400 | 121 | #define EXT4_MB_DELALLOC_RESERVED 0x0400 |
122 | /* We are doing stream allocation */ | 122 | /* We are doing stream allocation */ |
123 | #define EXT4_MB_STREAM_ALLOC 0x0800 | 123 | #define EXT4_MB_STREAM_ALLOC 0x0800 |
124 | /* Use reserved root blocks if needed */ | 124 | /* Use reserved root blocks if needed */ |
125 | #define EXT4_MB_USE_ROOT_BLOCKS 0x1000 | 125 | #define EXT4_MB_USE_ROOT_BLOCKS 0x1000 |
126 | /* Use blocks from reserved pool */ | 126 | /* Use blocks from reserved pool */ |
127 | #define EXT4_MB_USE_RESERVED 0x2000 | 127 | #define EXT4_MB_USE_RESERVED 0x2000 |
128 | 128 | ||
129 | struct ext4_allocation_request { | 129 | struct ext4_allocation_request { |
130 | /* target inode for block we're allocating */ | 130 | /* target inode for block we're allocating */ |
131 | struct inode *inode; | 131 | struct inode *inode; |
132 | /* how many blocks we want to allocate */ | 132 | /* how many blocks we want to allocate */ |
133 | unsigned int len; | 133 | unsigned int len; |
134 | /* logical block in target inode */ | 134 | /* logical block in target inode */ |
135 | ext4_lblk_t logical; | 135 | ext4_lblk_t logical; |
136 | /* the closest logical allocated block to the left */ | 136 | /* the closest logical allocated block to the left */ |
137 | ext4_lblk_t lleft; | 137 | ext4_lblk_t lleft; |
138 | /* the closest logical allocated block to the right */ | 138 | /* the closest logical allocated block to the right */ |
139 | ext4_lblk_t lright; | 139 | ext4_lblk_t lright; |
140 | /* phys. target (a hint) */ | 140 | /* phys. target (a hint) */ |
141 | ext4_fsblk_t goal; | 141 | ext4_fsblk_t goal; |
142 | /* phys. block for the closest logical allocated block to the left */ | 142 | /* phys. block for the closest logical allocated block to the left */ |
143 | ext4_fsblk_t pleft; | 143 | ext4_fsblk_t pleft; |
144 | /* phys. block for the closest logical allocated block to the right */ | 144 | /* phys. block for the closest logical allocated block to the right */ |
145 | ext4_fsblk_t pright; | 145 | ext4_fsblk_t pright; |
146 | /* flags. see above EXT4_MB_HINT_* */ | 146 | /* flags. see above EXT4_MB_HINT_* */ |
147 | unsigned int flags; | 147 | unsigned int flags; |
148 | }; | 148 | }; |
149 | 149 | ||
150 | /* | 150 | /* |
151 | * Logical to physical block mapping, used by ext4_map_blocks() | 151 | * Logical to physical block mapping, used by ext4_map_blocks() |
152 | * | 152 | * |
153 | * This structure is used to pass requests into ext4_map_blocks() as | 153 | * This structure is used to pass requests into ext4_map_blocks() as |
154 | * well as to store the information returned by ext4_map_blocks(). It | 154 | * well as to store the information returned by ext4_map_blocks(). It |
155 | * takes less room on the stack than a struct buffer_head. | 155 | * takes less room on the stack than a struct buffer_head. |
156 | */ | 156 | */ |
157 | #define EXT4_MAP_NEW (1 << BH_New) | 157 | #define EXT4_MAP_NEW (1 << BH_New) |
158 | #define EXT4_MAP_MAPPED (1 << BH_Mapped) | 158 | #define EXT4_MAP_MAPPED (1 << BH_Mapped) |
159 | #define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten) | 159 | #define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten) |
160 | #define EXT4_MAP_BOUNDARY (1 << BH_Boundary) | 160 | #define EXT4_MAP_BOUNDARY (1 << BH_Boundary) |
161 | #define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ | 161 | #define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ |
162 | EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY) | 162 | EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY) |
163 | 163 | ||
164 | struct ext4_map_blocks { | 164 | struct ext4_map_blocks { |
165 | ext4_fsblk_t m_pblk; | 165 | ext4_fsblk_t m_pblk; |
166 | ext4_lblk_t m_lblk; | 166 | ext4_lblk_t m_lblk; |
167 | unsigned int m_len; | 167 | unsigned int m_len; |
168 | unsigned int m_flags; | 168 | unsigned int m_flags; |
169 | }; | 169 | }; |
170 | 170 | ||
171 | /* | 171 | /* |
172 | * Flags for ext4_io_end->flags | 172 | * Flags for ext4_io_end->flags |
173 | */ | 173 | */ |
174 | #define EXT4_IO_END_UNWRITTEN 0x0001 | 174 | #define EXT4_IO_END_UNWRITTEN 0x0001 |
175 | 175 | ||
176 | /* | 176 | /* |
177 | * For converting unwritten extents on a work queue. 'handle' is used for | 177 | * For converting unwritten extents on a work queue. 'handle' is used for |
178 | * buffered writeback. | 178 | * buffered writeback. |
179 | */ | 179 | */ |
180 | typedef struct ext4_io_end { | 180 | typedef struct ext4_io_end { |
181 | struct list_head list; /* per-file finished IO list */ | 181 | struct list_head list; /* per-file finished IO list */ |
182 | handle_t *handle; /* handle reserved for extent | 182 | handle_t *handle; /* handle reserved for extent |
183 | * conversion */ | 183 | * conversion */ |
184 | struct inode *inode; /* file being written to */ | 184 | struct inode *inode; /* file being written to */ |
185 | struct bio *bio; /* Linked list of completed | 185 | struct bio *bio; /* Linked list of completed |
186 | * bios covering the extent */ | 186 | * bios covering the extent */ |
187 | unsigned int flag; /* unwritten or not */ | 187 | unsigned int flag; /* unwritten or not */ |
188 | loff_t offset; /* offset in the file */ | 188 | loff_t offset; /* offset in the file */ |
189 | ssize_t size; /* size of the extent */ | 189 | ssize_t size; /* size of the extent */ |
190 | atomic_t count; /* reference counter */ | 190 | atomic_t count; /* reference counter */ |
191 | } ext4_io_end_t; | 191 | } ext4_io_end_t; |
192 | 192 | ||
193 | struct ext4_io_submit { | 193 | struct ext4_io_submit { |
194 | int io_op; | 194 | int io_op; |
195 | struct bio *io_bio; | 195 | struct bio *io_bio; |
196 | ext4_io_end_t *io_end; | 196 | ext4_io_end_t *io_end; |
197 | sector_t io_next_block; | 197 | sector_t io_next_block; |
198 | }; | 198 | }; |
199 | 199 | ||
200 | /* | 200 | /* |
201 | * Special inodes numbers | 201 | * Special inodes numbers |
202 | */ | 202 | */ |
203 | #define EXT4_BAD_INO 1 /* Bad blocks inode */ | 203 | #define EXT4_BAD_INO 1 /* Bad blocks inode */ |
204 | #define EXT4_ROOT_INO 2 /* Root inode */ | 204 | #define EXT4_ROOT_INO 2 /* Root inode */ |
205 | #define EXT4_USR_QUOTA_INO 3 /* User quota inode */ | 205 | #define EXT4_USR_QUOTA_INO 3 /* User quota inode */ |
206 | #define EXT4_GRP_QUOTA_INO 4 /* Group quota inode */ | 206 | #define EXT4_GRP_QUOTA_INO 4 /* Group quota inode */ |
207 | #define EXT4_BOOT_LOADER_INO 5 /* Boot loader inode */ | 207 | #define EXT4_BOOT_LOADER_INO 5 /* Boot loader inode */ |
208 | #define EXT4_UNDEL_DIR_INO 6 /* Undelete directory inode */ | 208 | #define EXT4_UNDEL_DIR_INO 6 /* Undelete directory inode */ |
209 | #define EXT4_RESIZE_INO 7 /* Reserved group descriptors inode */ | 209 | #define EXT4_RESIZE_INO 7 /* Reserved group descriptors inode */ |
210 | #define EXT4_JOURNAL_INO 8 /* Journal inode */ | 210 | #define EXT4_JOURNAL_INO 8 /* Journal inode */ |
211 | 211 | ||
212 | /* First non-reserved inode for old ext4 filesystems */ | 212 | /* First non-reserved inode for old ext4 filesystems */ |
213 | #define EXT4_GOOD_OLD_FIRST_INO 11 | 213 | #define EXT4_GOOD_OLD_FIRST_INO 11 |
214 | 214 | ||
215 | /* | 215 | /* |
216 | * Maximal count of links to a file | 216 | * Maximal count of links to a file |
217 | */ | 217 | */ |
218 | #define EXT4_LINK_MAX 65000 | 218 | #define EXT4_LINK_MAX 65000 |
219 | 219 | ||
220 | /* | 220 | /* |
221 | * Macro-instructions used to manage several block sizes | 221 | * Macro-instructions used to manage several block sizes |
222 | */ | 222 | */ |
223 | #define EXT4_MIN_BLOCK_SIZE 1024 | 223 | #define EXT4_MIN_BLOCK_SIZE 1024 |
224 | #define EXT4_MAX_BLOCK_SIZE 65536 | 224 | #define EXT4_MAX_BLOCK_SIZE 65536 |
225 | #define EXT4_MIN_BLOCK_LOG_SIZE 10 | 225 | #define EXT4_MIN_BLOCK_LOG_SIZE 10 |
226 | #define EXT4_MAX_BLOCK_LOG_SIZE 16 | 226 | #define EXT4_MAX_BLOCK_LOG_SIZE 16 |
227 | #ifdef __KERNEL__ | 227 | #ifdef __KERNEL__ |
228 | # define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize) | 228 | # define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize) |
229 | #else | 229 | #else |
230 | # define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) | 230 | # define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) |
231 | #endif | 231 | #endif |
232 | #define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32)) | 232 | #define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32)) |
233 | #define EXT4_CLUSTER_SIZE(s) (EXT4_BLOCK_SIZE(s) << \ | 233 | #define EXT4_CLUSTER_SIZE(s) (EXT4_BLOCK_SIZE(s) << \ |
234 | EXT4_SB(s)->s_cluster_bits) | 234 | EXT4_SB(s)->s_cluster_bits) |
235 | #ifdef __KERNEL__ | 235 | #ifdef __KERNEL__ |
236 | # define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) | 236 | # define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) |
237 | # define EXT4_CLUSTER_BITS(s) (EXT4_SB(s)->s_cluster_bits) | 237 | # define EXT4_CLUSTER_BITS(s) (EXT4_SB(s)->s_cluster_bits) |
238 | #else | 238 | #else |
239 | # define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) | 239 | # define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) |
240 | #endif | 240 | #endif |
241 | #ifdef __KERNEL__ | 241 | #ifdef __KERNEL__ |
242 | #define EXT4_ADDR_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_addr_per_block_bits) | 242 | #define EXT4_ADDR_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_addr_per_block_bits) |
243 | #define EXT4_INODE_SIZE(s) (EXT4_SB(s)->s_inode_size) | 243 | #define EXT4_INODE_SIZE(s) (EXT4_SB(s)->s_inode_size) |
244 | #define EXT4_FIRST_INO(s) (EXT4_SB(s)->s_first_ino) | 244 | #define EXT4_FIRST_INO(s) (EXT4_SB(s)->s_first_ino) |
245 | #else | 245 | #else |
246 | #define EXT4_INODE_SIZE(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \ | 246 | #define EXT4_INODE_SIZE(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \ |
247 | EXT4_GOOD_OLD_INODE_SIZE : \ | 247 | EXT4_GOOD_OLD_INODE_SIZE : \ |
248 | (s)->s_inode_size) | 248 | (s)->s_inode_size) |
249 | #define EXT4_FIRST_INO(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \ | 249 | #define EXT4_FIRST_INO(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \ |
250 | EXT4_GOOD_OLD_FIRST_INO : \ | 250 | EXT4_GOOD_OLD_FIRST_INO : \ |
251 | (s)->s_first_ino) | 251 | (s)->s_first_ino) |
252 | #endif | 252 | #endif |
253 | #define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits))) | 253 | #define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits))) |
254 | 254 | ||
255 | /* Translate a block number to a cluster number */ | 255 | /* Translate a block number to a cluster number */ |
256 | #define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) | 256 | #define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) |
257 | /* Translate a cluster number to a block number */ | 257 | /* Translate a cluster number to a block number */ |
258 | #define EXT4_C2B(sbi, cluster) ((cluster) << (sbi)->s_cluster_bits) | 258 | #define EXT4_C2B(sbi, cluster) ((cluster) << (sbi)->s_cluster_bits) |
259 | /* Translate # of blks to # of clusters */ | 259 | /* Translate # of blks to # of clusters */ |
260 | #define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \ | 260 | #define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \ |
261 | (sbi)->s_cluster_bits) | 261 | (sbi)->s_cluster_bits) |
262 | /* Mask out the low bits to get the starting block of the cluster */ | 262 | /* Mask out the low bits to get the starting block of the cluster */ |
263 | #define EXT4_PBLK_CMASK(s, pblk) ((pblk) & \ | 263 | #define EXT4_PBLK_CMASK(s, pblk) ((pblk) & \ |
264 | ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) | 264 | ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) |
265 | #define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \ | 265 | #define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \ |
266 | ~((ext4_lblk_t) (s)->s_cluster_ratio - 1)) | 266 | ~((ext4_lblk_t) (s)->s_cluster_ratio - 1)) |
267 | /* Get the cluster offset */ | 267 | /* Get the cluster offset */ |
268 | #define EXT4_PBLK_COFF(s, pblk) ((pblk) & \ | 268 | #define EXT4_PBLK_COFF(s, pblk) ((pblk) & \ |
269 | ((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) | 269 | ((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) |
270 | #define EXT4_LBLK_COFF(s, lblk) ((lblk) & \ | 270 | #define EXT4_LBLK_COFF(s, lblk) ((lblk) & \ |
271 | ((ext4_lblk_t) (s)->s_cluster_ratio - 1)) | 271 | ((ext4_lblk_t) (s)->s_cluster_ratio - 1)) |
272 | 272 | ||
273 | /* | 273 | /* |
274 | * Structure of a blocks group descriptor | 274 | * Structure of a blocks group descriptor |
275 | */ | 275 | */ |
276 | struct ext4_group_desc | 276 | struct ext4_group_desc |
277 | { | 277 | { |
278 | __le32 bg_block_bitmap_lo; /* Blocks bitmap block */ | 278 | __le32 bg_block_bitmap_lo; /* Blocks bitmap block */ |
279 | __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */ | 279 | __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */ |
280 | __le32 bg_inode_table_lo; /* Inodes table block */ | 280 | __le32 bg_inode_table_lo; /* Inodes table block */ |
281 | __le16 bg_free_blocks_count_lo;/* Free blocks count */ | 281 | __le16 bg_free_blocks_count_lo;/* Free blocks count */ |
282 | __le16 bg_free_inodes_count_lo;/* Free inodes count */ | 282 | __le16 bg_free_inodes_count_lo;/* Free inodes count */ |
283 | __le16 bg_used_dirs_count_lo; /* Directories count */ | 283 | __le16 bg_used_dirs_count_lo; /* Directories count */ |
284 | __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ | 284 | __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ |
285 | __le32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */ | 285 | __le32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */ |
286 | __le16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bbitmap) LE */ | 286 | __le16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bbitmap) LE */ |
287 | __le16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+ibitmap) LE */ | 287 | __le16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+ibitmap) LE */ |
288 | __le16 bg_itable_unused_lo; /* Unused inodes count */ | 288 | __le16 bg_itable_unused_lo; /* Unused inodes count */ |
289 | __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ | 289 | __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ |
290 | __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ | 290 | __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ |
291 | __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ | 291 | __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ |
292 | __le32 bg_inode_table_hi; /* Inodes table block MSB */ | 292 | __le32 bg_inode_table_hi; /* Inodes table block MSB */ |
293 | __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */ | 293 | __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */ |
294 | __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ | 294 | __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ |
295 | __le16 bg_used_dirs_count_hi; /* Directories count MSB */ | 295 | __le16 bg_used_dirs_count_hi; /* Directories count MSB */ |
296 | __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ | 296 | __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ |
297 | __le32 bg_exclude_bitmap_hi; /* Exclude bitmap block MSB */ | 297 | __le32 bg_exclude_bitmap_hi; /* Exclude bitmap block MSB */ |
298 | __le16 bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bbitmap) BE */ | 298 | __le16 bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bbitmap) BE */ |
299 | __le16 bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+ibitmap) BE */ | 299 | __le16 bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+ibitmap) BE */ |
300 | __u32 bg_reserved; | 300 | __u32 bg_reserved; |
301 | }; | 301 | }; |
302 | 302 | ||
303 | #define EXT4_BG_INODE_BITMAP_CSUM_HI_END \ | 303 | #define EXT4_BG_INODE_BITMAP_CSUM_HI_END \ |
304 | (offsetof(struct ext4_group_desc, bg_inode_bitmap_csum_hi) + \ | 304 | (offsetof(struct ext4_group_desc, bg_inode_bitmap_csum_hi) + \ |
305 | sizeof(__le16)) | 305 | sizeof(__le16)) |
306 | #define EXT4_BG_BLOCK_BITMAP_CSUM_HI_END \ | 306 | #define EXT4_BG_BLOCK_BITMAP_CSUM_HI_END \ |
307 | (offsetof(struct ext4_group_desc, bg_block_bitmap_csum_hi) + \ | 307 | (offsetof(struct ext4_group_desc, bg_block_bitmap_csum_hi) + \ |
308 | sizeof(__le16)) | 308 | sizeof(__le16)) |
309 | 309 | ||
310 | /* | 310 | /* |
311 | * Structure of a flex block group info | 311 | * Structure of a flex block group info |
312 | */ | 312 | */ |
313 | 313 | ||
314 | struct flex_groups { | 314 | struct flex_groups { |
315 | atomic64_t free_clusters; | 315 | atomic64_t free_clusters; |
316 | atomic_t free_inodes; | 316 | atomic_t free_inodes; |
317 | atomic_t used_dirs; | 317 | atomic_t used_dirs; |
318 | }; | 318 | }; |
319 | 319 | ||
320 | #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ | 320 | #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ |
321 | #define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ | 321 | #define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ |
322 | #define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ | 322 | #define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ |
323 | 323 | ||
324 | /* | 324 | /* |
325 | * Macro-instructions used to manage group descriptors | 325 | * Macro-instructions used to manage group descriptors |
326 | */ | 326 | */ |
327 | #define EXT4_MIN_DESC_SIZE 32 | 327 | #define EXT4_MIN_DESC_SIZE 32 |
328 | #define EXT4_MIN_DESC_SIZE_64BIT 64 | 328 | #define EXT4_MIN_DESC_SIZE_64BIT 64 |
329 | #define EXT4_MAX_DESC_SIZE EXT4_MIN_BLOCK_SIZE | 329 | #define EXT4_MAX_DESC_SIZE EXT4_MIN_BLOCK_SIZE |
330 | #define EXT4_DESC_SIZE(s) (EXT4_SB(s)->s_desc_size) | 330 | #define EXT4_DESC_SIZE(s) (EXT4_SB(s)->s_desc_size) |
331 | #ifdef __KERNEL__ | 331 | #ifdef __KERNEL__ |
332 | # define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group) | 332 | # define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group) |
333 | # define EXT4_CLUSTERS_PER_GROUP(s) (EXT4_SB(s)->s_clusters_per_group) | 333 | # define EXT4_CLUSTERS_PER_GROUP(s) (EXT4_SB(s)->s_clusters_per_group) |
334 | # define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block) | 334 | # define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block) |
335 | # define EXT4_INODES_PER_GROUP(s) (EXT4_SB(s)->s_inodes_per_group) | 335 | # define EXT4_INODES_PER_GROUP(s) (EXT4_SB(s)->s_inodes_per_group) |
336 | # define EXT4_DESC_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_desc_per_block_bits) | 336 | # define EXT4_DESC_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_desc_per_block_bits) |
337 | #else | 337 | #else |
338 | # define EXT4_BLOCKS_PER_GROUP(s) ((s)->s_blocks_per_group) | 338 | # define EXT4_BLOCKS_PER_GROUP(s) ((s)->s_blocks_per_group) |
339 | # define EXT4_DESC_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / EXT4_DESC_SIZE(s)) | 339 | # define EXT4_DESC_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / EXT4_DESC_SIZE(s)) |
340 | # define EXT4_INODES_PER_GROUP(s) ((s)->s_inodes_per_group) | 340 | # define EXT4_INODES_PER_GROUP(s) ((s)->s_inodes_per_group) |
341 | #endif | 341 | #endif |
342 | 342 | ||
343 | /* | 343 | /* |
344 | * Constants relative to the data blocks | 344 | * Constants relative to the data blocks |
345 | */ | 345 | */ |
346 | #define EXT4_NDIR_BLOCKS 12 | 346 | #define EXT4_NDIR_BLOCKS 12 |
347 | #define EXT4_IND_BLOCK EXT4_NDIR_BLOCKS | 347 | #define EXT4_IND_BLOCK EXT4_NDIR_BLOCKS |
348 | #define EXT4_DIND_BLOCK (EXT4_IND_BLOCK + 1) | 348 | #define EXT4_DIND_BLOCK (EXT4_IND_BLOCK + 1) |
349 | #define EXT4_TIND_BLOCK (EXT4_DIND_BLOCK + 1) | 349 | #define EXT4_TIND_BLOCK (EXT4_DIND_BLOCK + 1) |
350 | #define EXT4_N_BLOCKS (EXT4_TIND_BLOCK + 1) | 350 | #define EXT4_N_BLOCKS (EXT4_TIND_BLOCK + 1) |
351 | 351 | ||
352 | /* | 352 | /* |
353 | * Inode flags | 353 | * Inode flags |
354 | */ | 354 | */ |
355 | #define EXT4_SECRM_FL 0x00000001 /* Secure deletion */ | 355 | #define EXT4_SECRM_FL 0x00000001 /* Secure deletion */ |
356 | #define EXT4_UNRM_FL 0x00000002 /* Undelete */ | 356 | #define EXT4_UNRM_FL 0x00000002 /* Undelete */ |
357 | #define EXT4_COMPR_FL 0x00000004 /* Compress file */ | 357 | #define EXT4_COMPR_FL 0x00000004 /* Compress file */ |
358 | #define EXT4_SYNC_FL 0x00000008 /* Synchronous updates */ | 358 | #define EXT4_SYNC_FL 0x00000008 /* Synchronous updates */ |
359 | #define EXT4_IMMUTABLE_FL 0x00000010 /* Immutable file */ | 359 | #define EXT4_IMMUTABLE_FL 0x00000010 /* Immutable file */ |
360 | #define EXT4_APPEND_FL 0x00000020 /* writes to file may only append */ | 360 | #define EXT4_APPEND_FL 0x00000020 /* writes to file may only append */ |
361 | #define EXT4_NODUMP_FL 0x00000040 /* do not dump file */ | 361 | #define EXT4_NODUMP_FL 0x00000040 /* do not dump file */ |
362 | #define EXT4_NOATIME_FL 0x00000080 /* do not update atime */ | 362 | #define EXT4_NOATIME_FL 0x00000080 /* do not update atime */ |
363 | /* Reserved for compression usage... */ | 363 | /* Reserved for compression usage... */ |
364 | #define EXT4_DIRTY_FL 0x00000100 | 364 | #define EXT4_DIRTY_FL 0x00000100 |
365 | #define EXT4_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ | 365 | #define EXT4_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ |
366 | #define EXT4_NOCOMPR_FL 0x00000400 /* Don't compress */ | 366 | #define EXT4_NOCOMPR_FL 0x00000400 /* Don't compress */ |
367 | #define EXT4_ECOMPR_FL 0x00000800 /* Compression error */ | 367 | /* nb: was previously EXT2_ECOMPR_FL */ |
368 | #define EXT4_ENCRYPT_FL 0x00000800 /* encrypted file */ | ||
368 | /* End compression flags --- maybe not all used */ | 369 | /* End compression flags --- maybe not all used */ |
369 | #define EXT4_INDEX_FL 0x00001000 /* hash-indexed directory */ | 370 | #define EXT4_INDEX_FL 0x00001000 /* hash-indexed directory */ |
370 | #define EXT4_IMAGIC_FL 0x00002000 /* AFS directory */ | 371 | #define EXT4_IMAGIC_FL 0x00002000 /* AFS directory */ |
371 | #define EXT4_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ | 372 | #define EXT4_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ |
372 | #define EXT4_NOTAIL_FL 0x00008000 /* file tail should not be merged */ | 373 | #define EXT4_NOTAIL_FL 0x00008000 /* file tail should not be merged */ |
373 | #define EXT4_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ | 374 | #define EXT4_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ |
374 | #define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ | 375 | #define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ |
375 | #define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ | 376 | #define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ |
376 | #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ | 377 | #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ |
377 | #define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ | 378 | #define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ |
378 | #define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ | 379 | #define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ |
379 | #define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ | 380 | #define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ |
380 | #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ | 381 | #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ |
381 | 382 | ||
382 | #define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ | 383 | #define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ |
383 | #define EXT4_FL_USER_MODIFIABLE 0x004380FF /* User modifiable flags */ | 384 | #define EXT4_FL_USER_MODIFIABLE 0x004380FF /* User modifiable flags */ |
384 | 385 | ||
385 | /* Flags that should be inherited by new inodes from their parent. */ | 386 | /* Flags that should be inherited by new inodes from their parent. */ |
386 | #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ | 387 | #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ |
387 | EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ | 388 | EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ |
388 | EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ | 389 | EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ |
389 | EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL) | 390 | EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL) |
390 | 391 | ||
391 | /* Flags that are appropriate for regular files (all but dir-specific ones). */ | 392 | /* Flags that are appropriate for regular files (all but dir-specific ones). */ |
392 | #define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL)) | 393 | #define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL)) |
393 | 394 | ||
394 | /* Flags that are appropriate for non-directories/regular files. */ | 395 | /* Flags that are appropriate for non-directories/regular files. */ |
395 | #define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL) | 396 | #define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL) |
396 | 397 | ||
397 | /* Mask out flags that are inappropriate for the given type of inode. */ | 398 | /* Mask out flags that are inappropriate for the given type of inode. */ |
398 | static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) | 399 | static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) |
399 | { | 400 | { |
400 | if (S_ISDIR(mode)) | 401 | if (S_ISDIR(mode)) |
401 | return flags; | 402 | return flags; |
402 | else if (S_ISREG(mode)) | 403 | else if (S_ISREG(mode)) |
403 | return flags & EXT4_REG_FLMASK; | 404 | return flags & EXT4_REG_FLMASK; |
404 | else | 405 | else |
405 | return flags & EXT4_OTHER_FLMASK; | 406 | return flags & EXT4_OTHER_FLMASK; |
406 | } | 407 | } |
407 | 408 | ||
408 | /* | 409 | /* |
409 | * Inode flags used for atomic set/get | 410 | * Inode flags used for atomic set/get |
410 | */ | 411 | */ |
411 | enum { | 412 | enum { |
412 | EXT4_INODE_SECRM = 0, /* Secure deletion */ | 413 | EXT4_INODE_SECRM = 0, /* Secure deletion */ |
413 | EXT4_INODE_UNRM = 1, /* Undelete */ | 414 | EXT4_INODE_UNRM = 1, /* Undelete */ |
414 | EXT4_INODE_COMPR = 2, /* Compress file */ | 415 | EXT4_INODE_COMPR = 2, /* Compress file */ |
415 | EXT4_INODE_SYNC = 3, /* Synchronous updates */ | 416 | EXT4_INODE_SYNC = 3, /* Synchronous updates */ |
416 | EXT4_INODE_IMMUTABLE = 4, /* Immutable file */ | 417 | EXT4_INODE_IMMUTABLE = 4, /* Immutable file */ |
417 | EXT4_INODE_APPEND = 5, /* writes to file may only append */ | 418 | EXT4_INODE_APPEND = 5, /* writes to file may only append */ |
418 | EXT4_INODE_NODUMP = 6, /* do not dump file */ | 419 | EXT4_INODE_NODUMP = 6, /* do not dump file */ |
419 | EXT4_INODE_NOATIME = 7, /* do not update atime */ | 420 | EXT4_INODE_NOATIME = 7, /* do not update atime */ |
420 | /* Reserved for compression usage... */ | 421 | /* Reserved for compression usage... */ |
421 | EXT4_INODE_DIRTY = 8, | 422 | EXT4_INODE_DIRTY = 8, |
422 | EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */ | 423 | EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */ |
423 | EXT4_INODE_NOCOMPR = 10, /* Don't compress */ | 424 | EXT4_INODE_NOCOMPR = 10, /* Don't compress */ |
424 | EXT4_INODE_ECOMPR = 11, /* Compression error */ | 425 | EXT4_INODE_ENCRYPT = 11, /* Compression error */ |
425 | /* End compression flags --- maybe not all used */ | 426 | /* End compression flags --- maybe not all used */ |
426 | EXT4_INODE_INDEX = 12, /* hash-indexed directory */ | 427 | EXT4_INODE_INDEX = 12, /* hash-indexed directory */ |
427 | EXT4_INODE_IMAGIC = 13, /* AFS directory */ | 428 | EXT4_INODE_IMAGIC = 13, /* AFS directory */ |
428 | EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */ | 429 | EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */ |
429 | EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */ | 430 | EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */ |
430 | EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */ | 431 | EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */ |
431 | EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/ | 432 | EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/ |
432 | EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */ | 433 | EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */ |
433 | EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ | 434 | EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ |
434 | EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ | 435 | EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ |
435 | EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */ | 436 | EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */ |
436 | EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */ | 437 | EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */ |
437 | EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ | 438 | EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ |
438 | }; | 439 | }; |
439 | 440 | ||
440 | /* | 441 | /* |
441 | * Since it's pretty easy to mix up bit numbers and hex values, we use a | 442 | * Since it's pretty easy to mix up bit numbers and hex values, we use a |
442 | * build-time check to make sure that EXT4_XXX_FL is consistent with respect to | 443 | * build-time check to make sure that EXT4_XXX_FL is consistent with respect to |
443 | * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost | 444 | * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost |
444 | * any extra space in the compiled kernel image, otherwise, the build will fail. | 445 | * any extra space in the compiled kernel image, otherwise, the build will fail. |
445 | * It's important that these values are the same, since we are using | 446 | * It's important that these values are the same, since we are using |
446 | * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent | 447 | * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent |
447 | * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk | 448 | * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk |
448 | * values found in ext2, ext3 and ext4 filesystems, and of course the values | 449 | * values found in ext2, ext3 and ext4 filesystems, and of course the values |
449 | * defined in e2fsprogs. | 450 | * defined in e2fsprogs. |
450 | * | 451 | * |
451 | * It's not paranoia if the Murphy's Law really *is* out to get you. :-) | 452 | * It's not paranoia if the Murphy's Law really *is* out to get you. :-) |
452 | */ | 453 | */ |
453 | #define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) | 454 | #define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) |
454 | #define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG)) | 455 | #define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG)) |
455 | 456 | ||
456 | static inline void ext4_check_flag_values(void) | 457 | static inline void ext4_check_flag_values(void) |
457 | { | 458 | { |
458 | CHECK_FLAG_VALUE(SECRM); | 459 | CHECK_FLAG_VALUE(SECRM); |
459 | CHECK_FLAG_VALUE(UNRM); | 460 | CHECK_FLAG_VALUE(UNRM); |
460 | CHECK_FLAG_VALUE(COMPR); | 461 | CHECK_FLAG_VALUE(COMPR); |
461 | CHECK_FLAG_VALUE(SYNC); | 462 | CHECK_FLAG_VALUE(SYNC); |
462 | CHECK_FLAG_VALUE(IMMUTABLE); | 463 | CHECK_FLAG_VALUE(IMMUTABLE); |
463 | CHECK_FLAG_VALUE(APPEND); | 464 | CHECK_FLAG_VALUE(APPEND); |
464 | CHECK_FLAG_VALUE(NODUMP); | 465 | CHECK_FLAG_VALUE(NODUMP); |
465 | CHECK_FLAG_VALUE(NOATIME); | 466 | CHECK_FLAG_VALUE(NOATIME); |
466 | CHECK_FLAG_VALUE(DIRTY); | 467 | CHECK_FLAG_VALUE(DIRTY); |
467 | CHECK_FLAG_VALUE(COMPRBLK); | 468 | CHECK_FLAG_VALUE(COMPRBLK); |
468 | CHECK_FLAG_VALUE(NOCOMPR); | 469 | CHECK_FLAG_VALUE(NOCOMPR); |
469 | CHECK_FLAG_VALUE(ECOMPR); | 470 | CHECK_FLAG_VALUE(ENCRYPT); |
470 | CHECK_FLAG_VALUE(INDEX); | 471 | CHECK_FLAG_VALUE(INDEX); |
471 | CHECK_FLAG_VALUE(IMAGIC); | 472 | CHECK_FLAG_VALUE(IMAGIC); |
472 | CHECK_FLAG_VALUE(JOURNAL_DATA); | 473 | CHECK_FLAG_VALUE(JOURNAL_DATA); |
473 | CHECK_FLAG_VALUE(NOTAIL); | 474 | CHECK_FLAG_VALUE(NOTAIL); |
474 | CHECK_FLAG_VALUE(DIRSYNC); | 475 | CHECK_FLAG_VALUE(DIRSYNC); |
475 | CHECK_FLAG_VALUE(TOPDIR); | 476 | CHECK_FLAG_VALUE(TOPDIR); |
476 | CHECK_FLAG_VALUE(HUGE_FILE); | 477 | CHECK_FLAG_VALUE(HUGE_FILE); |
477 | CHECK_FLAG_VALUE(EXTENTS); | 478 | CHECK_FLAG_VALUE(EXTENTS); |
478 | CHECK_FLAG_VALUE(EA_INODE); | 479 | CHECK_FLAG_VALUE(EA_INODE); |
479 | CHECK_FLAG_VALUE(EOFBLOCKS); | 480 | CHECK_FLAG_VALUE(EOFBLOCKS); |
480 | CHECK_FLAG_VALUE(INLINE_DATA); | 481 | CHECK_FLAG_VALUE(INLINE_DATA); |
481 | CHECK_FLAG_VALUE(RESERVED); | 482 | CHECK_FLAG_VALUE(RESERVED); |
482 | } | 483 | } |
483 | 484 | ||
484 | /* Used to pass group descriptor data when online resize is done */ | 485 | /* Used to pass group descriptor data when online resize is done */ |
485 | struct ext4_new_group_input { | 486 | struct ext4_new_group_input { |
486 | __u32 group; /* Group number for this data */ | 487 | __u32 group; /* Group number for this data */ |
487 | __u64 block_bitmap; /* Absolute block number of block bitmap */ | 488 | __u64 block_bitmap; /* Absolute block number of block bitmap */ |
488 | __u64 inode_bitmap; /* Absolute block number of inode bitmap */ | 489 | __u64 inode_bitmap; /* Absolute block number of inode bitmap */ |
489 | __u64 inode_table; /* Absolute block number of inode table start */ | 490 | __u64 inode_table; /* Absolute block number of inode table start */ |
490 | __u32 blocks_count; /* Total number of blocks in this group */ | 491 | __u32 blocks_count; /* Total number of blocks in this group */ |
491 | __u16 reserved_blocks; /* Number of reserved blocks in this group */ | 492 | __u16 reserved_blocks; /* Number of reserved blocks in this group */ |
492 | __u16 unused; | 493 | __u16 unused; |
493 | }; | 494 | }; |
494 | 495 | ||
495 | #if defined(__KERNEL__) && defined(CONFIG_COMPAT) | 496 | #if defined(__KERNEL__) && defined(CONFIG_COMPAT) |
496 | struct compat_ext4_new_group_input { | 497 | struct compat_ext4_new_group_input { |
497 | u32 group; | 498 | u32 group; |
498 | compat_u64 block_bitmap; | 499 | compat_u64 block_bitmap; |
499 | compat_u64 inode_bitmap; | 500 | compat_u64 inode_bitmap; |
500 | compat_u64 inode_table; | 501 | compat_u64 inode_table; |
501 | u32 blocks_count; | 502 | u32 blocks_count; |
502 | u16 reserved_blocks; | 503 | u16 reserved_blocks; |
503 | u16 unused; | 504 | u16 unused; |
504 | }; | 505 | }; |
505 | #endif | 506 | #endif |
506 | 507 | ||
507 | /* The struct ext4_new_group_input in kernel space, with free_blocks_count */ | 508 | /* The struct ext4_new_group_input in kernel space, with free_blocks_count */ |
508 | struct ext4_new_group_data { | 509 | struct ext4_new_group_data { |
509 | __u32 group; | 510 | __u32 group; |
510 | __u64 block_bitmap; | 511 | __u64 block_bitmap; |
511 | __u64 inode_bitmap; | 512 | __u64 inode_bitmap; |
512 | __u64 inode_table; | 513 | __u64 inode_table; |
513 | __u32 blocks_count; | 514 | __u32 blocks_count; |
514 | __u16 reserved_blocks; | 515 | __u16 reserved_blocks; |
515 | __u16 unused; | 516 | __u16 unused; |
516 | __u32 free_blocks_count; | 517 | __u32 free_blocks_count; |
517 | }; | 518 | }; |
518 | 519 | ||
519 | /* Indexes used to index group tables in ext4_new_group_data */ | 520 | /* Indexes used to index group tables in ext4_new_group_data */ |
520 | enum { | 521 | enum { |
521 | BLOCK_BITMAP = 0, /* block bitmap */ | 522 | BLOCK_BITMAP = 0, /* block bitmap */ |
522 | INODE_BITMAP, /* inode bitmap */ | 523 | INODE_BITMAP, /* inode bitmap */ |
523 | INODE_TABLE, /* inode tables */ | 524 | INODE_TABLE, /* inode tables */ |
524 | GROUP_TABLE_COUNT, | 525 | GROUP_TABLE_COUNT, |
525 | }; | 526 | }; |
526 | 527 | ||
527 | /* | 528 | /* |
528 | * Flags used by ext4_map_blocks() | 529 | * Flags used by ext4_map_blocks() |
529 | */ | 530 | */ |
530 | /* Allocate any needed blocks and/or convert an unwritten | 531 | /* Allocate any needed blocks and/or convert an unwritten |
531 | extent to be an initialized ext4 */ | 532 | extent to be an initialized ext4 */ |
532 | #define EXT4_GET_BLOCKS_CREATE 0x0001 | 533 | #define EXT4_GET_BLOCKS_CREATE 0x0001 |
533 | /* Request the creation of an unwritten extent */ | 534 | /* Request the creation of an unwritten extent */ |
534 | #define EXT4_GET_BLOCKS_UNWRIT_EXT 0x0002 | 535 | #define EXT4_GET_BLOCKS_UNWRIT_EXT 0x0002 |
535 | #define EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT (EXT4_GET_BLOCKS_UNWRIT_EXT|\ | 536 | #define EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT (EXT4_GET_BLOCKS_UNWRIT_EXT|\ |
536 | EXT4_GET_BLOCKS_CREATE) | 537 | EXT4_GET_BLOCKS_CREATE) |
537 | /* Caller is from the delayed allocation writeout path | 538 | /* Caller is from the delayed allocation writeout path |
538 | * finally doing the actual allocation of delayed blocks */ | 539 | * finally doing the actual allocation of delayed blocks */ |
539 | #define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 | 540 | #define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 |
540 | /* caller is from the direct IO path, request to creation of an | 541 | /* caller is from the direct IO path, request to creation of an |
541 | unwritten extents if not allocated, split the unwritten | 542 | unwritten extents if not allocated, split the unwritten |
542 | extent if blocks has been preallocated already*/ | 543 | extent if blocks has been preallocated already*/ |
543 | #define EXT4_GET_BLOCKS_PRE_IO 0x0008 | 544 | #define EXT4_GET_BLOCKS_PRE_IO 0x0008 |
544 | #define EXT4_GET_BLOCKS_CONVERT 0x0010 | 545 | #define EXT4_GET_BLOCKS_CONVERT 0x0010 |
545 | #define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\ | 546 | #define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\ |
546 | EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) | 547 | EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) |
547 | /* Convert extent to initialized after IO complete */ | 548 | /* Convert extent to initialized after IO complete */ |
548 | #define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ | 549 | #define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ |
549 | EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) | 550 | EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) |
550 | /* Eventual metadata allocation (due to growing extent tree) | 551 | /* Eventual metadata allocation (due to growing extent tree) |
551 | * should not fail, so try to use reserved blocks for that.*/ | 552 | * should not fail, so try to use reserved blocks for that.*/ |
552 | #define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 | 553 | #define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 |
553 | /* Don't normalize allocation size (used for fallocate) */ | 554 | /* Don't normalize allocation size (used for fallocate) */ |
554 | #define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 | 555 | #define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 |
555 | /* Request will not result in inode size update (user for fallocate) */ | 556 | /* Request will not result in inode size update (user for fallocate) */ |
556 | #define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 | 557 | #define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 |
557 | /* Do not take i_data_sem locking in ext4_map_blocks */ | 558 | /* Do not take i_data_sem locking in ext4_map_blocks */ |
558 | #define EXT4_GET_BLOCKS_NO_LOCK 0x0100 | 559 | #define EXT4_GET_BLOCKS_NO_LOCK 0x0100 |
559 | /* Convert written extents to unwritten */ | 560 | /* Convert written extents to unwritten */ |
560 | #define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0200 | 561 | #define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0200 |
561 | 562 | ||
562 | /* | 563 | /* |
563 | * The bit position of these flags must not overlap with any of the | 564 | * The bit position of these flags must not overlap with any of the |
564 | * EXT4_GET_BLOCKS_*. They are used by ext4_find_extent(), | 565 | * EXT4_GET_BLOCKS_*. They are used by ext4_find_extent(), |
565 | * read_extent_tree_block(), ext4_split_extent_at(), | 566 | * read_extent_tree_block(), ext4_split_extent_at(), |
566 | * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf(). | 567 | * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf(). |
567 | * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be | 568 | * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be |
568 | * caching the extents when reading from the extent tree while a | 569 | * caching the extents when reading from the extent tree while a |
569 | * truncate or punch hole operation is in progress. | 570 | * truncate or punch hole operation is in progress. |
570 | */ | 571 | */ |
571 | #define EXT4_EX_NOCACHE 0x40000000 | 572 | #define EXT4_EX_NOCACHE 0x40000000 |
572 | #define EXT4_EX_FORCE_CACHE 0x20000000 | 573 | #define EXT4_EX_FORCE_CACHE 0x20000000 |
573 | 574 | ||
574 | /* | 575 | /* |
575 | * Flags used by ext4_free_blocks | 576 | * Flags used by ext4_free_blocks |
576 | */ | 577 | */ |
577 | #define EXT4_FREE_BLOCKS_METADATA 0x0001 | 578 | #define EXT4_FREE_BLOCKS_METADATA 0x0001 |
578 | #define EXT4_FREE_BLOCKS_FORGET 0x0002 | 579 | #define EXT4_FREE_BLOCKS_FORGET 0x0002 |
579 | #define EXT4_FREE_BLOCKS_VALIDATED 0x0004 | 580 | #define EXT4_FREE_BLOCKS_VALIDATED 0x0004 |
580 | #define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 | 581 | #define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 |
581 | #define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 | 582 | #define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 |
582 | #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 | 583 | #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 |
583 | 584 | ||
584 | /* | 585 | /* |
585 | * ioctl commands | 586 | * ioctl commands |
586 | */ | 587 | */ |
587 | #define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS | 588 | #define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS |
588 | #define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS | 589 | #define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS |
589 | #define EXT4_IOC_GETVERSION _IOR('f', 3, long) | 590 | #define EXT4_IOC_GETVERSION _IOR('f', 3, long) |
590 | #define EXT4_IOC_SETVERSION _IOW('f', 4, long) | 591 | #define EXT4_IOC_SETVERSION _IOW('f', 4, long) |
591 | #define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION | 592 | #define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION |
592 | #define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION | 593 | #define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION |
593 | #define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) | 594 | #define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) |
594 | #define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) | 595 | #define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) |
595 | #define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) | 596 | #define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) |
596 | #define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input) | 597 | #define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input) |
597 | #define EXT4_IOC_MIGRATE _IO('f', 9) | 598 | #define EXT4_IOC_MIGRATE _IO('f', 9) |
598 | /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */ | 599 | /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */ |
599 | /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ | 600 | /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ |
600 | #define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) | 601 | #define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) |
601 | #define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) | 602 | #define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) |
602 | #define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) | 603 | #define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) |
603 | #define EXT4_IOC_SWAP_BOOT _IO('f', 17) | 604 | #define EXT4_IOC_SWAP_BOOT _IO('f', 17) |
604 | #define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18) | 605 | #define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18) |
605 | 606 | ||
606 | #if defined(__KERNEL__) && defined(CONFIG_COMPAT) | 607 | #if defined(__KERNEL__) && defined(CONFIG_COMPAT) |
607 | /* | 608 | /* |
608 | * ioctl commands in 32 bit emulation | 609 | * ioctl commands in 32 bit emulation |
609 | */ | 610 | */ |
610 | #define EXT4_IOC32_GETFLAGS FS_IOC32_GETFLAGS | 611 | #define EXT4_IOC32_GETFLAGS FS_IOC32_GETFLAGS |
611 | #define EXT4_IOC32_SETFLAGS FS_IOC32_SETFLAGS | 612 | #define EXT4_IOC32_SETFLAGS FS_IOC32_SETFLAGS |
612 | #define EXT4_IOC32_GETVERSION _IOR('f', 3, int) | 613 | #define EXT4_IOC32_GETVERSION _IOR('f', 3, int) |
613 | #define EXT4_IOC32_SETVERSION _IOW('f', 4, int) | 614 | #define EXT4_IOC32_SETVERSION _IOW('f', 4, int) |
614 | #define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) | 615 | #define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) |
615 | #define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) | 616 | #define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) |
616 | #define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) | 617 | #define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) |
617 | #define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input) | 618 | #define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input) |
618 | #define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION | 619 | #define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION |
619 | #define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION | 620 | #define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION |
620 | #endif | 621 | #endif |
621 | 622 | ||
622 | /* Max physical block we can address w/o extents */ | 623 | /* Max physical block we can address w/o extents */ |
623 | #define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF | 624 | #define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF |
624 | 625 | ||
625 | /* | 626 | /* |
626 | * Structure of an inode on the disk | 627 | * Structure of an inode on the disk |
627 | */ | 628 | */ |
628 | struct ext4_inode { | 629 | struct ext4_inode { |
629 | __le16 i_mode; /* File mode */ | 630 | __le16 i_mode; /* File mode */ |
630 | __le16 i_uid; /* Low 16 bits of Owner Uid */ | 631 | __le16 i_uid; /* Low 16 bits of Owner Uid */ |
631 | __le32 i_size_lo; /* Size in bytes */ | 632 | __le32 i_size_lo; /* Size in bytes */ |
632 | __le32 i_atime; /* Access time */ | 633 | __le32 i_atime; /* Access time */ |
633 | __le32 i_ctime; /* Inode Change time */ | 634 | __le32 i_ctime; /* Inode Change time */ |
634 | __le32 i_mtime; /* Modification time */ | 635 | __le32 i_mtime; /* Modification time */ |
635 | __le32 i_dtime; /* Deletion Time */ | 636 | __le32 i_dtime; /* Deletion Time */ |
636 | __le16 i_gid; /* Low 16 bits of Group Id */ | 637 | __le16 i_gid; /* Low 16 bits of Group Id */ |
637 | __le16 i_links_count; /* Links count */ | 638 | __le16 i_links_count; /* Links count */ |
638 | __le32 i_blocks_lo; /* Blocks count */ | 639 | __le32 i_blocks_lo; /* Blocks count */ |
639 | __le32 i_flags; /* File flags */ | 640 | __le32 i_flags; /* File flags */ |
640 | union { | 641 | union { |
641 | struct { | 642 | struct { |
642 | __le32 l_i_version; | 643 | __le32 l_i_version; |
643 | } linux1; | 644 | } linux1; |
644 | struct { | 645 | struct { |
645 | __u32 h_i_translator; | 646 | __u32 h_i_translator; |
646 | } hurd1; | 647 | } hurd1; |
647 | struct { | 648 | struct { |
648 | __u32 m_i_reserved1; | 649 | __u32 m_i_reserved1; |
649 | } masix1; | 650 | } masix1; |
650 | } osd1; /* OS dependent 1 */ | 651 | } osd1; /* OS dependent 1 */ |
651 | __le32 i_block[EXT4_N_BLOCKS];/* Pointers to blocks */ | 652 | __le32 i_block[EXT4_N_BLOCKS];/* Pointers to blocks */ |
652 | __le32 i_generation; /* File version (for NFS) */ | 653 | __le32 i_generation; /* File version (for NFS) */ |
653 | __le32 i_file_acl_lo; /* File ACL */ | 654 | __le32 i_file_acl_lo; /* File ACL */ |
654 | __le32 i_size_high; | 655 | __le32 i_size_high; |
655 | __le32 i_obso_faddr; /* Obsoleted fragment address */ | 656 | __le32 i_obso_faddr; /* Obsoleted fragment address */ |
656 | union { | 657 | union { |
657 | struct { | 658 | struct { |
658 | __le16 l_i_blocks_high; /* were l_i_reserved1 */ | 659 | __le16 l_i_blocks_high; /* were l_i_reserved1 */ |
659 | __le16 l_i_file_acl_high; | 660 | __le16 l_i_file_acl_high; |
660 | __le16 l_i_uid_high; /* these 2 fields */ | 661 | __le16 l_i_uid_high; /* these 2 fields */ |
661 | __le16 l_i_gid_high; /* were reserved2[0] */ | 662 | __le16 l_i_gid_high; /* were reserved2[0] */ |
662 | __le16 l_i_checksum_lo;/* crc32c(uuid+inum+inode) LE */ | 663 | __le16 l_i_checksum_lo;/* crc32c(uuid+inum+inode) LE */ |
663 | __le16 l_i_reserved; | 664 | __le16 l_i_reserved; |
664 | } linux2; | 665 | } linux2; |
665 | struct { | 666 | struct { |
666 | __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ | 667 | __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ |
667 | __u16 h_i_mode_high; | 668 | __u16 h_i_mode_high; |
668 | __u16 h_i_uid_high; | 669 | __u16 h_i_uid_high; |
669 | __u16 h_i_gid_high; | 670 | __u16 h_i_gid_high; |
670 | __u32 h_i_author; | 671 | __u32 h_i_author; |
671 | } hurd2; | 672 | } hurd2; |
672 | struct { | 673 | struct { |
673 | __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ | 674 | __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ |
674 | __le16 m_i_file_acl_high; | 675 | __le16 m_i_file_acl_high; |
675 | __u32 m_i_reserved2[2]; | 676 | __u32 m_i_reserved2[2]; |
676 | } masix2; | 677 | } masix2; |
677 | } osd2; /* OS dependent 2 */ | 678 | } osd2; /* OS dependent 2 */ |
678 | __le16 i_extra_isize; | 679 | __le16 i_extra_isize; |
679 | __le16 i_checksum_hi; /* crc32c(uuid+inum+inode) BE */ | 680 | __le16 i_checksum_hi; /* crc32c(uuid+inum+inode) BE */ |
680 | __le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ | 681 | __le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ |
681 | __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */ | 682 | __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */ |
682 | __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ | 683 | __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ |
683 | __le32 i_crtime; /* File Creation time */ | 684 | __le32 i_crtime; /* File Creation time */ |
684 | __le32 i_crtime_extra; /* extra File Creation time (nsec << 2 | epoch) */ | 685 | __le32 i_crtime_extra; /* extra File Creation time (nsec << 2 | epoch) */ |
685 | __le32 i_version_hi; /* high 32 bits for 64-bit version */ | 686 | __le32 i_version_hi; /* high 32 bits for 64-bit version */ |
686 | }; | 687 | }; |
687 | 688 | ||
688 | struct move_extent { | 689 | struct move_extent { |
689 | __u32 reserved; /* should be zero */ | 690 | __u32 reserved; /* should be zero */ |
690 | __u32 donor_fd; /* donor file descriptor */ | 691 | __u32 donor_fd; /* donor file descriptor */ |
691 | __u64 orig_start; /* logical start offset in block for orig */ | 692 | __u64 orig_start; /* logical start offset in block for orig */ |
692 | __u64 donor_start; /* logical start offset in block for donor */ | 693 | __u64 donor_start; /* logical start offset in block for donor */ |
693 | __u64 len; /* block length to be moved */ | 694 | __u64 len; /* block length to be moved */ |
694 | __u64 moved_len; /* moved block length */ | 695 | __u64 moved_len; /* moved block length */ |
695 | }; | 696 | }; |
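struct move_extent is the argument block for ext4's online-defrag ioctl, EXT4_IOC_MOVE_EXT, declared with the other ioctls earlier in this header (e4defrag is the usual caller). A minimal userspace sketch of a call -- the helper name, descriptors, and ranges are all hypothetical, and error handling is trimmed:

    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/types.h>
    /* EXT4_IOC_MOVE_EXT and struct move_extent as declared in this header */

    int defrag_range(int orig_fd, int donor_fd, __u64 start, __u64 len)
    {
            struct move_extent me = {
                    .reserved    = 0,        /* must be zero */
                    .donor_fd    = donor_fd, /* donor supplies the replacement
                                              * extents; open it for writing */
                    .orig_start  = start,    /* logical block offset in orig */
                    .donor_start = start,    /* logical block offset in donor */
                    .len         = len,      /* number of blocks to exchange */
            };

            if (ioctl(orig_fd, EXT4_IOC_MOVE_EXT, &me) < 0)
                    return -1;               /* errno has the reason */

            printf("moved %llu of %llu blocks\n",
                   (unsigned long long)me.moved_len,
                   (unsigned long long)me.len);
            return 0;
    }

The kernel reports partial progress through moved_len, which is why that field exists even though the caller fills in everything else.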
696 | 697 | ||
697 | #define EXT4_EPOCH_BITS 2 | 698 | #define EXT4_EPOCH_BITS 2 |
698 | #define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) | 699 | #define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) |
699 | #define EXT4_NSEC_MASK (~0UL << EXT4_EPOCH_BITS) | 700 | #define EXT4_NSEC_MASK (~0UL << EXT4_EPOCH_BITS) |
700 | 701 | ||
701 | /* | 702 | /* |
702 | * Extended fields will fit into an inode if the filesystem was formatted | 703 | * Extended fields will fit into an inode if the filesystem was formatted |
703 | * with large inodes (-I 256 or larger) and there are not currently any EAs | 704 | * with large inodes (-I 256 or larger) and there are not currently any EAs |
704 | * consuming all of the available space. For new inodes we always reserve | 705 | * consuming all of the available space. For new inodes we always reserve |
705 | * enough space for the kernel's known extended fields, but for inodes | 706 | * enough space for the kernel's known extended fields, but for inodes |
706 | * created with an old kernel this might not have been the case. None of | 707 | * created with an old kernel this might not have been the case. None of |
707 | * the extended inode fields is critical for correct filesystem operation. | 708 | * the extended inode fields is critical for correct filesystem operation. |
708 | * This macro checks if a certain field fits in the inode. Note that | 709 | * This macro checks if a certain field fits in the inode. Note that |
709 | * inode-size = GOOD_OLD_INODE_SIZE + i_extra_isize | 710 | * inode-size = GOOD_OLD_INODE_SIZE + i_extra_isize |
710 | */ | 711 | */ |
711 | #define EXT4_FITS_IN_INODE(ext4_inode, einode, field) \ | 712 | #define EXT4_FITS_IN_INODE(ext4_inode, einode, field) \ |
712 | ((offsetof(typeof(*ext4_inode), field) + \ | 713 | ((offsetof(typeof(*ext4_inode), field) + \ |
713 | sizeof((ext4_inode)->field)) \ | 714 | sizeof((ext4_inode)->field)) \ |
714 | <= (EXT4_GOOD_OLD_INODE_SIZE + \ | 715 | <= (EXT4_GOOD_OLD_INODE_SIZE + \ |
715 | (einode)->i_extra_isize)) \ | 716 | (einode)->i_extra_isize)) \ |
716 | 717 | ||
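A worked example against the struct ext4_inode layout above (offsets computed by hand, so treat them as illustrative): i_crtime sits at byte offset 144, so EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime) reduces to 144 + 4 <= 128 + i_extra_isize, i.e. a creation time is only stored when i_extra_isize >= 20. Original 128-byte inodes therefore never carry one, while mkfs's 256-byte default with a typical i_extra_isize of 28 or 32 fits it comfortably.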
717 | static inline __le32 ext4_encode_extra_time(struct timespec *time) | 718 | static inline __le32 ext4_encode_extra_time(struct timespec *time) |
718 | { | 719 | { |
719 | return cpu_to_le32((sizeof(time->tv_sec) > 4 ? | 720 | return cpu_to_le32((sizeof(time->tv_sec) > 4 ? |
720 | (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) | | 721 | (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) | |
721 | ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK)); | 722 | ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK)); |
722 | } | 723 | } |
723 | 724 | ||
724 | static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) | 725 | static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) |
725 | { | 726 | { |
726 | if (sizeof(time->tv_sec) > 4) | 727 | if (sizeof(time->tv_sec) > 4) |
727 | time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) | 728 | time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) |
728 | << 32; | 729 | << 32; |
729 | time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; | 730 | time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; |
730 | } | 731 | } |
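A worked round trip of this encoding, assuming a 64-bit tv_sec (the values are chosen purely for illustration):

    /* Illustrative only: a post-2038 timestamp. */
    struct timespec ts = { .tv_sec = 0x123456789LL, .tv_nsec = 999999999 };
    __le32 extra = ext4_encode_extra_time(&ts);
    /* extra now packs:
     *   bits 0..1  = (0x123456789 >> 32) & EXT4_EPOCH_MASK = 0x1
     *   bits 2..31 = 999999999 (tv_nsec, shifted left by EXT4_EPOCH_BITS)
     */

    /* Decoding assumes tv_sec already holds the low 32 bits read from the
     * on-disk xtime field (see EXT4_INODE_GET_XTIME below); it ORs the
     * epoch bits back in and unpacks the nanoseconds: */
    struct timespec out = { .tv_sec = 0x23456789, .tv_nsec = 0 };
    ext4_decode_extra_time(&out, extra);
    /* out.tv_sec == 0x123456789, out.tv_nsec == 999999999 */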
731 | 732 | ||
732 | #define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ | 733 | #define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ |
733 | do { \ | 734 | do { \ |
734 | (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ | 735 | (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ |
735 | if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ | 736 | if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ |
736 | (raw_inode)->xtime ## _extra = \ | 737 | (raw_inode)->xtime ## _extra = \ |
737 | ext4_encode_extra_time(&(inode)->xtime); \ | 738 | ext4_encode_extra_time(&(inode)->xtime); \ |
738 | } while (0) | 739 | } while (0) |
739 | 740 | ||
740 | #define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ | 741 | #define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ |
741 | do { \ | 742 | do { \ |
742 | if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ | 743 | if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ |
743 | (raw_inode)->xtime = cpu_to_le32((einode)->xtime.tv_sec); \ | 744 | (raw_inode)->xtime = cpu_to_le32((einode)->xtime.tv_sec); \ |
744 | if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ | 745 | if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ |
745 | (raw_inode)->xtime ## _extra = \ | 746 | (raw_inode)->xtime ## _extra = \ |
746 | ext4_encode_extra_time(&(einode)->xtime); \ | 747 | ext4_encode_extra_time(&(einode)->xtime); \ |
747 | } while (0) | 748 | } while (0) |
748 | 749 | ||
749 | #define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \ | 750 | #define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \ |
750 | do { \ | 751 | do { \ |
751 | (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \ | 752 | (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \ |
752 | if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ | 753 | if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ |
753 | ext4_decode_extra_time(&(inode)->xtime, \ | 754 | ext4_decode_extra_time(&(inode)->xtime, \ |
754 | raw_inode->xtime ## _extra); \ | 755 | raw_inode->xtime ## _extra); \ |
755 | else \ | 756 | else \ |
756 | (inode)->xtime.tv_nsec = 0; \ | 757 | (inode)->xtime.tv_nsec = 0; \ |
757 | } while (0) | 758 | } while (0) |
758 | 759 | ||
759 | #define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ | 760 | #define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ |
760 | do { \ | 761 | do { \ |
761 | if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ | 762 | if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ |
762 | (einode)->xtime.tv_sec = \ | 763 | (einode)->xtime.tv_sec = \ |
763 | (signed)le32_to_cpu((raw_inode)->xtime); \ | 764 | (signed)le32_to_cpu((raw_inode)->xtime); \ |
764 | else \ | 765 | else \ |
765 | (einode)->xtime.tv_sec = 0; \ | 766 | (einode)->xtime.tv_sec = 0; \ |
766 | if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ | 767 | if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ |
767 | ext4_decode_extra_time(&(einode)->xtime, \ | 768 | ext4_decode_extra_time(&(einode)->xtime, \ |
768 | raw_inode->xtime ## _extra); \ | 769 | raw_inode->xtime ## _extra); \ |
769 | else \ | 770 | else \ |
770 | (einode)->xtime.tv_nsec = 0; \ | 771 | (einode)->xtime.tv_nsec = 0; \ |
771 | } while (0) | 772 | } while (0) |
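For reference, the token pasting means EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode) expands (roughly, ignoring the do/while wrapper) to:

    (inode)->i_ctime.tv_sec = (signed)le32_to_cpu((raw_inode)->i_ctime);
    if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), i_ctime_extra))
            ext4_decode_extra_time(&(inode)->i_ctime,
                                   raw_inode->i_ctime_extra);
    else
            (inode)->i_ctime.tv_nsec = 0;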
772 | 773 | ||
773 | #define i_disk_version osd1.linux1.l_i_version | 774 | #define i_disk_version osd1.linux1.l_i_version |
774 | 775 | ||
775 | #if defined(__KERNEL__) || defined(__linux__) | 776 | #if defined(__KERNEL__) || defined(__linux__) |
776 | #define i_reserved1 osd1.linux1.l_i_reserved1 | 777 | #define i_reserved1 osd1.linux1.l_i_reserved1 |
777 | #define i_file_acl_high osd2.linux2.l_i_file_acl_high | 778 | #define i_file_acl_high osd2.linux2.l_i_file_acl_high |
778 | #define i_blocks_high osd2.linux2.l_i_blocks_high | 779 | #define i_blocks_high osd2.linux2.l_i_blocks_high |
779 | #define i_uid_low i_uid | 780 | #define i_uid_low i_uid |
780 | #define i_gid_low i_gid | 781 | #define i_gid_low i_gid |
781 | #define i_uid_high osd2.linux2.l_i_uid_high | 782 | #define i_uid_high osd2.linux2.l_i_uid_high |
782 | #define i_gid_high osd2.linux2.l_i_gid_high | 783 | #define i_gid_high osd2.linux2.l_i_gid_high |
783 | #define i_checksum_lo osd2.linux2.l_i_checksum_lo | 784 | #define i_checksum_lo osd2.linux2.l_i_checksum_lo |
784 | 785 | ||
785 | #elif defined(__GNU__) | 786 | #elif defined(__GNU__) |
786 | 787 | ||
787 | #define i_translator osd1.hurd1.h_i_translator | 788 | #define i_translator osd1.hurd1.h_i_translator |
788 | #define i_uid_high osd2.hurd2.h_i_uid_high | 789 | #define i_uid_high osd2.hurd2.h_i_uid_high |
789 | #define i_gid_high osd2.hurd2.h_i_gid_high | 790 | #define i_gid_high osd2.hurd2.h_i_gid_high |
790 | #define i_author osd2.hurd2.h_i_author | 791 | #define i_author osd2.hurd2.h_i_author |
791 | 792 | ||
792 | #elif defined(__masix__) | 793 | #elif defined(__masix__) |
793 | 794 | ||
794 | #define i_reserved1 osd1.masix1.m_i_reserved1 | 795 | #define i_reserved1 osd1.masix1.m_i_reserved1 |
795 | #define i_file_acl_high osd2.masix2.m_i_file_acl_high | 796 | #define i_file_acl_high osd2.masix2.m_i_file_acl_high |
796 | #define i_reserved2 osd2.masix2.m_i_reserved2 | 797 | #define i_reserved2 osd2.masix2.m_i_reserved2 |
797 | 798 | ||
798 | #endif /* defined(__KERNEL__) || defined(__linux__) */ | 799 | #endif /* defined(__KERNEL__) || defined(__linux__) */ |
799 | 800 | ||
800 | #include "extents_status.h" | 801 | #include "extents_status.h" |
801 | 802 | ||
802 | /* | 803 | /* |
803 | * fourth extended file system inode data in memory | 804 | * fourth extended file system inode data in memory |
804 | */ | 805 | */ |
805 | struct ext4_inode_info { | 806 | struct ext4_inode_info { |
806 | __le32 i_data[15]; /* unconverted */ | 807 | __le32 i_data[15]; /* unconverted */ |
807 | __u32 i_dtime; | 808 | __u32 i_dtime; |
808 | ext4_fsblk_t i_file_acl; | 809 | ext4_fsblk_t i_file_acl; |
809 | 810 | ||
810 | /* | 811 | /* |
811 | * i_block_group is the number of the block group which contains | 812 | * i_block_group is the number of the block group which contains |
812 | * this file's inode. Constant across the lifetime of the inode, | 813 | * this file's inode. Constant across the lifetime of the inode, |
813 | * it is used for making block allocation decisions - we try to | 814 | * it is used for making block allocation decisions - we try to |
814 | * place a file's data blocks near its inode block, and new inodes | 815 | * place a file's data blocks near its inode block, and new inodes |
815 | * near to their parent directory's inode. | 816 | * near to their parent directory's inode. |
816 | */ | 817 | */ |
817 | ext4_group_t i_block_group; | 818 | ext4_group_t i_block_group; |
818 | ext4_lblk_t i_dir_start_lookup; | 819 | ext4_lblk_t i_dir_start_lookup; |
819 | #if (BITS_PER_LONG < 64) | 820 | #if (BITS_PER_LONG < 64) |
820 | unsigned long i_state_flags; /* Dynamic state flags */ | 821 | unsigned long i_state_flags; /* Dynamic state flags */ |
821 | #endif | 822 | #endif |
822 | unsigned long i_flags; | 823 | unsigned long i_flags; |
823 | 824 | ||
824 | /* | 825 | /* |
825 | * Extended attributes can be read independently of the main file | 826 | * Extended attributes can be read independently of the main file |
826 | * data. Taking i_mutex even when reading would cause contention | 827 | * data. Taking i_mutex even when reading would cause contention |
827 | * between readers of EAs and writers of regular file data, so | 828 | * between readers of EAs and writers of regular file data, so |
828 | * instead we synchronize on xattr_sem when reading or changing | 829 | * instead we synchronize on xattr_sem when reading or changing |
829 | * EAs. | 830 | * EAs. |
830 | */ | 831 | */ |
831 | struct rw_semaphore xattr_sem; | 832 | struct rw_semaphore xattr_sem; |
832 | 833 | ||
833 | struct list_head i_orphan; /* unlinked but open inodes */ | 834 | struct list_head i_orphan; /* unlinked but open inodes */ |
834 | 835 | ||
835 | /* | 836 | /* |
836 | * i_disksize keeps track of what the inode size is ON DISK, not | 837 | * i_disksize keeps track of what the inode size is ON DISK, not |
837 | * in memory. During truncate, i_size is set to the new size by | 838 | * in memory. During truncate, i_size is set to the new size by |
838 | * the VFS prior to calling ext4_truncate(), but the filesystem won't | 839 | * the VFS prior to calling ext4_truncate(), but the filesystem won't |
839 | * set i_disksize to 0 until the truncate is actually under way. | 840 | * set i_disksize to 0 until the truncate is actually under way. |
840 | * | 841 | * |
841 | * The intent is that i_disksize always represents the blocks which | 842 | * The intent is that i_disksize always represents the blocks which |
842 | * are used by this file. This allows recovery to restart truncate | 843 | * are used by this file. This allows recovery to restart truncate |
843 | * on orphans if we crash during truncate. We actually write i_disksize | 844 | * on orphans if we crash during truncate. We actually write i_disksize |
844 | * into the on-disk inode when writing inodes out, instead of i_size. | 845 | * into the on-disk inode when writing inodes out, instead of i_size. |
845 | * | 846 | * |
846 | * The only time when i_disksize and i_size may be different is when | 847 | * The only time when i_disksize and i_size may be different is when |
847 | * a truncate is in progress. The only things which change i_disksize | 848 | * a truncate is in progress. The only things which change i_disksize |
848 | * are ext4_get_block (growth) and ext4_truncate (shrinkth). | 849 | * are ext4_get_block (growth) and ext4_truncate (shrinkth). |
849 | */ | 850 | */ |
850 | loff_t i_disksize; | 851 | loff_t i_disksize; |
851 | 852 | ||
852 | /* | 853 | /* |
853 | * i_data_sem is for serialising ext4_truncate() against | 854 | * i_data_sem is for serialising ext4_truncate() against |
854 | * ext4_get_block(). In the 2.4 ext2 design, great chunks of inode's | 855 | * ext4_get_block(). In the 2.4 ext2 design, great chunks of inode's |
855 | * data tree are chopped off during truncate. We can't do that in | 856 | * data tree are chopped off during truncate. We can't do that in |
856 | * ext4 because whenever we perform intermediate commits during | 857 | * ext4 because whenever we perform intermediate commits during |
857 | * truncate, the inode and all the metadata blocks *must* be in a | 858 | * truncate, the inode and all the metadata blocks *must* be in a |
858 | * consistent state which allows truncation of the orphans to restart | 859 | * consistent state which allows truncation of the orphans to restart |
859 | * during recovery. Hence we must fix the get_block-vs-truncate race | 860 | * during recovery. Hence we must fix the get_block-vs-truncate race |
860 | * by other means, so we have i_data_sem. | 861 | * by other means, so we have i_data_sem. |
861 | */ | 862 | */ |
862 | struct rw_semaphore i_data_sem; | 863 | struct rw_semaphore i_data_sem; |
863 | struct inode vfs_inode; | 864 | struct inode vfs_inode; |
864 | struct jbd2_inode *jinode; | 865 | struct jbd2_inode *jinode; |
865 | 866 | ||
866 | spinlock_t i_raw_lock; /* protects updates to the raw inode */ | 867 | spinlock_t i_raw_lock; /* protects updates to the raw inode */ |
867 | 868 | ||
868 | /* | 869 | /* |
869 | * File creation time. Its function is the same as that of | 870 | * File creation time. Its function is the same as that of |
870 | * struct timespec i_{a,c,m}time in the generic inode. | 871 | * struct timespec i_{a,c,m}time in the generic inode. |
871 | */ | 872 | */ |
872 | struct timespec i_crtime; | 873 | struct timespec i_crtime; |
873 | 874 | ||
874 | /* mballoc */ | 875 | /* mballoc */ |
875 | struct list_head i_prealloc_list; | 876 | struct list_head i_prealloc_list; |
876 | spinlock_t i_prealloc_lock; | 877 | spinlock_t i_prealloc_lock; |
877 | 878 | ||
878 | /* extents status tree */ | 879 | /* extents status tree */ |
879 | struct ext4_es_tree i_es_tree; | 880 | struct ext4_es_tree i_es_tree; |
880 | rwlock_t i_es_lock; | 881 | rwlock_t i_es_lock; |
881 | struct list_head i_es_list; | 882 | struct list_head i_es_list; |
882 | unsigned int i_es_all_nr; /* protected by i_es_lock */ | 883 | unsigned int i_es_all_nr; /* protected by i_es_lock */ |
883 | unsigned int i_es_shk_nr; /* protected by i_es_lock */ | 884 | unsigned int i_es_shk_nr; /* protected by i_es_lock */ |
884 | ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for | 885 | ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for |
885 | extents to shrink. Protected by | 886 | extents to shrink. Protected by |
886 | i_es_lock */ | 887 | i_es_lock */ |
887 | 888 | ||
888 | /* ialloc */ | 889 | /* ialloc */ |
889 | ext4_group_t i_last_alloc_group; | 890 | ext4_group_t i_last_alloc_group; |
890 | 891 | ||
891 | /* allocation reservation info for delalloc */ | 892 | /* allocation reservation info for delalloc */ |
892 | /* In case of bigalloc, these refer to clusters rather than blocks */ | 893 | /* In case of bigalloc, these refer to clusters rather than blocks */ |
893 | unsigned int i_reserved_data_blocks; | 894 | unsigned int i_reserved_data_blocks; |
894 | unsigned int i_reserved_meta_blocks; | 895 | unsigned int i_reserved_meta_blocks; |
895 | unsigned int i_allocated_meta_blocks; | 896 | unsigned int i_allocated_meta_blocks; |
896 | ext4_lblk_t i_da_metadata_calc_last_lblock; | 897 | ext4_lblk_t i_da_metadata_calc_last_lblock; |
897 | int i_da_metadata_calc_len; | 898 | int i_da_metadata_calc_len; |
898 | 899 | ||
899 | /* on-disk additional length */ | 900 | /* on-disk additional length */ |
900 | __u16 i_extra_isize; | 901 | __u16 i_extra_isize; |
901 | 902 | ||
902 | /* Indicate the inline data space. */ | 903 | /* Indicate the inline data space. */ |
903 | u16 i_inline_off; | 904 | u16 i_inline_off; |
904 | u16 i_inline_size; | 905 | u16 i_inline_size; |
905 | 906 | ||
906 | #ifdef CONFIG_QUOTA | 907 | #ifdef CONFIG_QUOTA |
907 | /* quota space reservation, managed internally by quota code */ | 908 | /* quota space reservation, managed internally by quota code */ |
908 | qsize_t i_reserved_quota; | 909 | qsize_t i_reserved_quota; |
909 | #endif | 910 | #endif |
910 | 911 | ||
911 | /* Lock protecting lists below */ | 912 | /* Lock protecting lists below */ |
912 | spinlock_t i_completed_io_lock; | 913 | spinlock_t i_completed_io_lock; |
913 | /* | 914 | /* |
914 | * Completed IOs that need unwritten extents handling and have | 915 | * Completed IOs that need unwritten extents handling and have |
915 | * transaction reserved | 916 | * transaction reserved |
916 | */ | 917 | */ |
917 | struct list_head i_rsv_conversion_list; | 918 | struct list_head i_rsv_conversion_list; |
918 | /* | 919 | /* |
919 | * Completed IOs that need unwritten extents handling and don't have | 920 | * Completed IOs that need unwritten extents handling and don't have |
920 | * transaction reserved | 921 | * transaction reserved |
921 | */ | 922 | */ |
922 | atomic_t i_ioend_count; /* Number of outstanding io_end structs */ | 923 | atomic_t i_ioend_count; /* Number of outstanding io_end structs */ |
923 | atomic_t i_unwritten; /* Nr. of inflight conversions pending */ | 924 | atomic_t i_unwritten; /* Nr. of inflight conversions pending */ |
924 | struct work_struct i_rsv_conversion_work; | 925 | struct work_struct i_rsv_conversion_work; |
925 | 926 | ||
926 | spinlock_t i_block_reservation_lock; | 927 | spinlock_t i_block_reservation_lock; |
927 | 928 | ||
928 | /* | 929 | /* |
929 | * Transactions that contain inode's metadata needed to complete | 930 | * Transactions that contain inode's metadata needed to complete |
930 | * fsync and fdatasync, respectively. | 931 | * fsync and fdatasync, respectively. |
931 | */ | 932 | */ |
932 | tid_t i_sync_tid; | 933 | tid_t i_sync_tid; |
933 | tid_t i_datasync_tid; | 934 | tid_t i_datasync_tid; |
934 | 935 | ||
935 | #ifdef CONFIG_QUOTA | 936 | #ifdef CONFIG_QUOTA |
936 | struct dquot *i_dquot[MAXQUOTAS]; | 937 | struct dquot *i_dquot[MAXQUOTAS]; |
937 | #endif | 938 | #endif |
938 | 939 | ||
939 | /* Precomputed uuid+inum+igen checksum for seeding inode checksums */ | 940 | /* Precomputed uuid+inum+igen checksum for seeding inode checksums */ |
940 | __u32 i_csum_seed; | 941 | __u32 i_csum_seed; |
941 | }; | 942 | }; |
942 | 943 | ||
943 | /* | 944 | /* |
944 | * File system states | 945 | * File system states |
945 | */ | 946 | */ |
946 | #define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */ | 947 | #define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */ |
947 | #define EXT4_ERROR_FS 0x0002 /* Errors detected */ | 948 | #define EXT4_ERROR_FS 0x0002 /* Errors detected */ |
948 | #define EXT4_ORPHAN_FS 0x0004 /* Orphans being recovered */ | 949 | #define EXT4_ORPHAN_FS 0x0004 /* Orphans being recovered */ |
949 | 950 | ||
950 | /* | 951 | /* |
951 | * Misc. filesystem flags | 952 | * Misc. filesystem flags |
952 | */ | 953 | */ |
953 | #define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */ | 954 | #define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */ |
954 | #define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */ | 955 | #define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */ |
955 | #define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ | 956 | #define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ |
956 | 957 | ||
957 | /* | 958 | /* |
958 | * Mount flags set via mount options or defaults | 959 | * Mount flags set via mount options or defaults |
959 | */ | 960 | */ |
960 | #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ | 961 | #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ |
961 | #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ | 962 | #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ |
962 | #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ | 963 | #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ |
963 | #define EXT4_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ | 964 | #define EXT4_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ |
964 | #define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ | 965 | #define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ |
965 | #define EXT4_MOUNT_ERRORS_MASK 0x00070 | 966 | #define EXT4_MOUNT_ERRORS_MASK 0x00070 |
966 | #define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ | 967 | #define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ |
967 | #define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ | 968 | #define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ |
968 | #ifdef CONFIG_FS_DAX | 969 | #ifdef CONFIG_FS_DAX |
969 | #define EXT4_MOUNT_DAX 0x00200 /* Direct Access */ | 970 | #define EXT4_MOUNT_DAX 0x00200 /* Direct Access */ |
970 | #else | 971 | #else |
971 | #define EXT4_MOUNT_DAX 0 | 972 | #define EXT4_MOUNT_DAX 0 |
972 | #endif | 973 | #endif |
973 | #define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ | 974 | #define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ |
974 | #define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ | 975 | #define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ |
975 | #define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ | 976 | #define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ |
976 | #define EXT4_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */ | 977 | #define EXT4_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */ |
977 | #define EXT4_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */ | 978 | #define EXT4_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */ |
978 | #define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ | 979 | #define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ |
979 | #define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ | 980 | #define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ |
980 | #define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ | 981 | #define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ |
981 | #define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ | 982 | #define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ |
982 | #define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ | 983 | #define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ |
983 | #define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ | 984 | #define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ |
984 | #define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ | 985 | #define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ |
985 | #define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ | 986 | #define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ |
986 | #define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ | 987 | #define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ |
987 | #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ | 988 | #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ |
988 | #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ | 989 | #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ |
989 | #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ | 990 | #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ |
990 | #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ | 991 | #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ |
991 | #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ | 992 | #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ |
992 | #define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ | 993 | #define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ |
993 | #define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ | 994 | #define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ |
994 | 995 | ||
995 | /* | 996 | /* |
996 | * Mount flags set automatically (they cannot be set via mount options), | 997 | * Mount flags set automatically (they cannot be set via mount options), |
997 | * based on a per-filesystem feature or property, or in special cases such | 998 | * based on a per-filesystem feature or property, or in special cases such |
998 | * as distinguishing an explicit mount option from a default. | 999 | * as distinguishing an explicit mount option from a default. |
999 | */ | 1000 | */ |
1000 | #define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly | 1001 | #define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly |
1001 | specified delalloc */ | 1002 | specified delalloc */ |
1002 | #define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group | 1003 | #define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group |
1003 | size of blocksize * 8 | 1004 | size of blocksize * 8 |
1004 | blocks */ | 1005 | blocks */ |
1005 | #define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated | 1006 | #define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated |
1006 | file systems */ | 1007 | file systems */ |
1007 | 1008 | ||
1008 | #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ | 1009 | #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ |
1009 | ~EXT4_MOUNT_##opt | 1010 | ~EXT4_MOUNT_##opt |
1010 | #define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ | 1011 | #define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ |
1011 | EXT4_MOUNT_##opt | 1012 | EXT4_MOUNT_##opt |
1012 | #define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ | 1013 | #define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ |
1013 | EXT4_MOUNT_##opt) | 1014 | EXT4_MOUNT_##opt) |
1014 | 1015 | ||
1015 | #define clear_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 &= \ | 1016 | #define clear_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 &= \ |
1016 | ~EXT4_MOUNT2_##opt | 1017 | ~EXT4_MOUNT2_##opt |
1017 | #define set_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 |= \ | 1018 | #define set_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 |= \ |
1018 | EXT4_MOUNT2_##opt | 1019 | EXT4_MOUNT2_##opt |
1019 | #define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ | 1020 | #define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ |
1020 | EXT4_MOUNT2_##opt) | 1021 | EXT4_MOUNT2_##opt) |
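Callers pass only the flag suffix; the macros paste on the EXT4_MOUNT_/EXT4_MOUNT2_ prefix and pick the matching word in the superblock info. A sketch of the common pattern (not a specific call site):

    if (test_opt(sb, DELALLOC) && !test_opt2(sb, EXPLICIT_DELALLOC)) {
            /* delalloc is enabled, but only by default -- the user never
             * asked for it.  The two tests expand to:
             *   EXT4_SB(sb)->s_mount_opt  & EXT4_MOUNT_DELALLOC
             *   EXT4_SB(sb)->s_mount_opt2 & EXT4_MOUNT2_EXPLICIT_DELALLOC
             */
    }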
1021 | 1022 | ||
1022 | #define ext4_test_and_set_bit __test_and_set_bit_le | 1023 | #define ext4_test_and_set_bit __test_and_set_bit_le |
1023 | #define ext4_set_bit __set_bit_le | 1024 | #define ext4_set_bit __set_bit_le |
1024 | #define ext4_set_bit_atomic ext2_set_bit_atomic | 1025 | #define ext4_set_bit_atomic ext2_set_bit_atomic |
1025 | #define ext4_test_and_clear_bit __test_and_clear_bit_le | 1026 | #define ext4_test_and_clear_bit __test_and_clear_bit_le |
1026 | #define ext4_clear_bit __clear_bit_le | 1027 | #define ext4_clear_bit __clear_bit_le |
1027 | #define ext4_clear_bit_atomic ext2_clear_bit_atomic | 1028 | #define ext4_clear_bit_atomic ext2_clear_bit_atomic |
1028 | #define ext4_test_bit test_bit_le | 1029 | #define ext4_test_bit test_bit_le |
1029 | #define ext4_find_next_zero_bit find_next_zero_bit_le | 1030 | #define ext4_find_next_zero_bit find_next_zero_bit_le |
1030 | #define ext4_find_next_bit find_next_bit_le | 1031 | #define ext4_find_next_bit find_next_bit_le |
1031 | 1032 | ||
1032 | extern void ext4_set_bits(void *bm, int cur, int len); | 1033 | extern void ext4_set_bits(void *bm, int cur, int len); |
1033 | 1034 | ||
1034 | /* | 1035 | /* |
1035 | * Maximal mount counts between two filesystem checks | 1036 | * Maximal mount counts between two filesystem checks |
1036 | */ | 1037 | */ |
1037 | #define EXT4_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ | 1038 | #define EXT4_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ |
1038 | #define EXT4_DFL_CHECKINTERVAL 0 /* Don't use interval check */ | 1039 | #define EXT4_DFL_CHECKINTERVAL 0 /* Don't use interval check */ |
1039 | 1040 | ||
1040 | /* | 1041 | /* |
1041 | * Behaviour when detecting errors | 1042 | * Behaviour when detecting errors |
1042 | */ | 1043 | */ |
1043 | #define EXT4_ERRORS_CONTINUE 1 /* Continue execution */ | 1044 | #define EXT4_ERRORS_CONTINUE 1 /* Continue execution */ |
1044 | #define EXT4_ERRORS_RO 2 /* Remount fs read-only */ | 1045 | #define EXT4_ERRORS_RO 2 /* Remount fs read-only */ |
1045 | #define EXT4_ERRORS_PANIC 3 /* Panic */ | 1046 | #define EXT4_ERRORS_PANIC 3 /* Panic */ |
1046 | #define EXT4_ERRORS_DEFAULT EXT4_ERRORS_CONTINUE | 1047 | #define EXT4_ERRORS_DEFAULT EXT4_ERRORS_CONTINUE |
1047 | 1048 | ||
1048 | /* Metadata checksum algorithm codes */ | 1049 | /* Metadata checksum algorithm codes */ |
1049 | #define EXT4_CRC32C_CHKSUM 1 | 1050 | #define EXT4_CRC32C_CHKSUM 1 |
1050 | 1051 | ||
1052 | /* Encryption algorithms */ | ||
1053 | #define EXT4_ENCRYPTION_MODE_INVALID 0 | ||
1054 | #define EXT4_ENCRYPTION_MODE_AES_256_XTS 1 | ||
1055 | #define EXT4_ENCRYPTION_MODE_AES_256_GCM 2 | ||
1056 | #define EXT4_ENCRYPTION_MODE_AES_256_CBC 3 | ||
1057 | |||
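Per the merge description these are reserved codepoints only; the encryption implementation lands later. A superblock consumer could still bounds-check the s_encrypt_algos[4] array added to struct ext4_super_block below -- a hypothetical helper, not code from this commit:

    /* Hypothetical: reject superblocks advertising an unknown mode. */
    static bool ext4_encrypt_algos_valid(const struct ext4_super_block *es)
    {
            int i;

            for (i = 0; i < 4; i++)
                    if (es->s_encrypt_algos[i] > EXT4_ENCRYPTION_MODE_AES_256_CBC)
                            return false;
            return true;
    }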
1051 | /* | 1058 | /* |
1052 | * Structure of the super block | 1059 | * Structure of the super block |
1053 | */ | 1060 | */ |
1054 | struct ext4_super_block { | 1061 | struct ext4_super_block { |
1055 | /*00*/ __le32 s_inodes_count; /* Inodes count */ | 1062 | /*00*/ __le32 s_inodes_count; /* Inodes count */ |
1056 | __le32 s_blocks_count_lo; /* Blocks count */ | 1063 | __le32 s_blocks_count_lo; /* Blocks count */ |
1057 | __le32 s_r_blocks_count_lo; /* Reserved blocks count */ | 1064 | __le32 s_r_blocks_count_lo; /* Reserved blocks count */ |
1058 | __le32 s_free_blocks_count_lo; /* Free blocks count */ | 1065 | __le32 s_free_blocks_count_lo; /* Free blocks count */ |
1059 | /*10*/ __le32 s_free_inodes_count; /* Free inodes count */ | 1066 | /*10*/ __le32 s_free_inodes_count; /* Free inodes count */ |
1060 | __le32 s_first_data_block; /* First Data Block */ | 1067 | __le32 s_first_data_block; /* First Data Block */ |
1061 | __le32 s_log_block_size; /* Block size */ | 1068 | __le32 s_log_block_size; /* Block size */ |
1062 | __le32 s_log_cluster_size; /* Allocation cluster size */ | 1069 | __le32 s_log_cluster_size; /* Allocation cluster size */ |
1063 | /*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ | 1070 | /*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ |
1064 | __le32 s_clusters_per_group; /* # Clusters per group */ | 1071 | __le32 s_clusters_per_group; /* # Clusters per group */ |
1065 | __le32 s_inodes_per_group; /* # Inodes per group */ | 1072 | __le32 s_inodes_per_group; /* # Inodes per group */ |
1066 | __le32 s_mtime; /* Mount time */ | 1073 | __le32 s_mtime; /* Mount time */ |
1067 | /*30*/ __le32 s_wtime; /* Write time */ | 1074 | /*30*/ __le32 s_wtime; /* Write time */ |
1068 | __le16 s_mnt_count; /* Mount count */ | 1075 | __le16 s_mnt_count; /* Mount count */ |
1069 | __le16 s_max_mnt_count; /* Maximal mount count */ | 1076 | __le16 s_max_mnt_count; /* Maximal mount count */ |
1070 | __le16 s_magic; /* Magic signature */ | 1077 | __le16 s_magic; /* Magic signature */ |
1071 | __le16 s_state; /* File system state */ | 1078 | __le16 s_state; /* File system state */ |
1072 | __le16 s_errors; /* Behaviour when detecting errors */ | 1079 | __le16 s_errors; /* Behaviour when detecting errors */ |
1073 | __le16 s_minor_rev_level; /* minor revision level */ | 1080 | __le16 s_minor_rev_level; /* minor revision level */ |
1074 | /*40*/ __le32 s_lastcheck; /* time of last check */ | 1081 | /*40*/ __le32 s_lastcheck; /* time of last check */ |
1075 | __le32 s_checkinterval; /* max. time between checks */ | 1082 | __le32 s_checkinterval; /* max. time between checks */ |
1076 | __le32 s_creator_os; /* OS */ | 1083 | __le32 s_creator_os; /* OS */ |
1077 | __le32 s_rev_level; /* Revision level */ | 1084 | __le32 s_rev_level; /* Revision level */ |
1078 | /*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */ | 1085 | /*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */ |
1079 | __le16 s_def_resgid; /* Default gid for reserved blocks */ | 1086 | __le16 s_def_resgid; /* Default gid for reserved blocks */ |
1080 | /* | 1087 | /* |
1081 | * These fields are for EXT4_DYNAMIC_REV superblocks only. | 1088 | * These fields are for EXT4_DYNAMIC_REV superblocks only. |
1082 | * | 1089 | * |
1083 | * Note: the difference between the compatible feature set and | 1090 | * Note: the difference between the compatible feature set and |
1084 | * the incompatible feature set is that if there is a bit set | 1091 | * the incompatible feature set is that if there is a bit set |
1085 | * in the incompatible feature set that the kernel doesn't | 1092 | * in the incompatible feature set that the kernel doesn't |
1086 | * know about, it should refuse to mount the filesystem. | 1093 | * know about, it should refuse to mount the filesystem. |
1087 | * | 1094 | * |
1088 | * e2fsck's requirements are more strict; if it doesn't know | 1095 | * e2fsck's requirements are more strict; if it doesn't know |
1089 | * about a feature in either the compatible or incompatible | 1096 | * about a feature in either the compatible or incompatible |
1090 | * feature set, it must abort and not try to meddle with | 1097 | * feature set, it must abort and not try to meddle with |
1091 | * things it doesn't understand... | 1098 | * things it doesn't understand... |
1092 | */ | 1099 | */ |
1093 | __le32 s_first_ino; /* First non-reserved inode */ | 1100 | __le32 s_first_ino; /* First non-reserved inode */ |
1094 | __le16 s_inode_size; /* size of inode structure */ | 1101 | __le16 s_inode_size; /* size of inode structure */ |
1095 | __le16 s_block_group_nr; /* block group # of this superblock */ | 1102 | __le16 s_block_group_nr; /* block group # of this superblock */ |
1096 | __le32 s_feature_compat; /* compatible feature set */ | 1103 | __le32 s_feature_compat; /* compatible feature set */ |
1097 | /*60*/ __le32 s_feature_incompat; /* incompatible feature set */ | 1104 | /*60*/ __le32 s_feature_incompat; /* incompatible feature set */ |
1098 | __le32 s_feature_ro_compat; /* readonly-compatible feature set */ | 1105 | __le32 s_feature_ro_compat; /* readonly-compatible feature set */ |
1099 | /*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ | 1106 | /*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ |
1100 | /*78*/ char s_volume_name[16]; /* volume name */ | 1107 | /*78*/ char s_volume_name[16]; /* volume name */ |
1101 | /*88*/ char s_last_mounted[64]; /* directory where last mounted */ | 1108 | /*88*/ char s_last_mounted[64]; /* directory where last mounted */ |
1102 | /*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ | 1109 | /*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ |
1103 | /* | 1110 | /* |
1104 | * Performance hints. Directory preallocation should only | 1111 | * Performance hints. Directory preallocation should only |
1105 | * happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on. | 1112 | * happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on. |
1106 | */ | 1113 | */ |
1107 | __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ | 1114 | __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ |
1108 | __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ | 1115 | __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ |
1109 | __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */ | 1116 | __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */ |
1110 | /* | 1117 | /* |
1111 | * Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set. | 1118 | * Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set. |
1112 | */ | 1119 | */ |
1113 | /*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ | 1120 | /*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ |
1114 | /*E0*/ __le32 s_journal_inum; /* inode number of journal file */ | 1121 | /*E0*/ __le32 s_journal_inum; /* inode number of journal file */ |
1115 | __le32 s_journal_dev; /* device number of journal file */ | 1122 | __le32 s_journal_dev; /* device number of journal file */ |
1116 | __le32 s_last_orphan; /* start of list of inodes to delete */ | 1123 | __le32 s_last_orphan; /* start of list of inodes to delete */ |
1117 | __le32 s_hash_seed[4]; /* HTREE hash seed */ | 1124 | __le32 s_hash_seed[4]; /* HTREE hash seed */ |
1118 | __u8 s_def_hash_version; /* Default hash version to use */ | 1125 | __u8 s_def_hash_version; /* Default hash version to use */ |
1119 | __u8 s_jnl_backup_type; | 1126 | __u8 s_jnl_backup_type; |
1120 | __le16 s_desc_size; /* size of group descriptor */ | 1127 | __le16 s_desc_size; /* size of group descriptor */ |
1121 | /*100*/ __le32 s_default_mount_opts; | 1128 | /*100*/ __le32 s_default_mount_opts; |
1122 | __le32 s_first_meta_bg; /* First metablock block group */ | 1129 | __le32 s_first_meta_bg; /* First metablock block group */ |
1123 | __le32 s_mkfs_time; /* When the filesystem was created */ | 1130 | __le32 s_mkfs_time; /* When the filesystem was created */ |
1124 | __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ | 1131 | __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ |
1125 | /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */ | 1132 | /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */ |
1126 | /*150*/ __le32 s_blocks_count_hi; /* Blocks count */ | 1133 | /*150*/ __le32 s_blocks_count_hi; /* Blocks count */ |
1127 | __le32 s_r_blocks_count_hi; /* Reserved blocks count */ | 1134 | __le32 s_r_blocks_count_hi; /* Reserved blocks count */ |
1128 | __le32 s_free_blocks_count_hi; /* Free blocks count */ | 1135 | __le32 s_free_blocks_count_hi; /* Free blocks count */ |
1129 | __le16 s_min_extra_isize; /* All inodes have at least # bytes */ | 1136 | __le16 s_min_extra_isize; /* All inodes have at least # bytes */ |
1130 | __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ | 1137 | __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ |
1131 | __le32 s_flags; /* Miscellaneous flags */ | 1138 | __le32 s_flags; /* Miscellaneous flags */ |
1132 | __le16 s_raid_stride; /* RAID stride */ | 1139 | __le16 s_raid_stride; /* RAID stride */ |
1133 | __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ | 1140 | __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ |
1134 | __le64 s_mmp_block; /* Block for multi-mount protection */ | 1141 | __le64 s_mmp_block; /* Block for multi-mount protection */ |
1135 | __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ | 1142 | __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ |
1136 | __u8 s_log_groups_per_flex; /* FLEX_BG group size */ | 1143 | __u8 s_log_groups_per_flex; /* FLEX_BG group size */ |
1137 | __u8 s_checksum_type; /* metadata checksum algorithm used */ | 1144 | __u8 s_checksum_type; /* metadata checksum algorithm used */ |
1138 | __le16 s_reserved_pad; | 1145 | __le16 s_reserved_pad; |
1139 | __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ | 1146 | __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ |
1140 | __le32 s_snapshot_inum; /* Inode number of active snapshot */ | 1147 | __le32 s_snapshot_inum; /* Inode number of active snapshot */ |
1141 | __le32 s_snapshot_id; /* sequential ID of active snapshot */ | 1148 | __le32 s_snapshot_id; /* sequential ID of active snapshot */ |
1142 | __le64 s_snapshot_r_blocks_count; /* reserved blocks for active | 1149 | __le64 s_snapshot_r_blocks_count; /* reserved blocks for active |
1143 | snapshot's future use */ | 1150 | snapshot's future use */ |
1144 | __le32 s_snapshot_list; /* inode number of the head of the | 1151 | __le32 s_snapshot_list; /* inode number of the head of the |
1145 | on-disk snapshot list */ | 1152 | on-disk snapshot list */ |
1146 | #define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count) | 1153 | #define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count) |
1147 | __le32 s_error_count; /* number of fs errors */ | 1154 | __le32 s_error_count; /* number of fs errors */ |
1148 | __le32 s_first_error_time; /* first time an error happened */ | 1155 | __le32 s_first_error_time; /* first time an error happened */ |
1149 | __le32 s_first_error_ino; /* inode involved in first error */ | 1156 | __le32 s_first_error_ino; /* inode involved in first error */ |
1150 | __le64 s_first_error_block; /* block involved of first error */ | 1157 | __le64 s_first_error_block; /* block involved of first error */ |
1151 | __u8 s_first_error_func[32]; /* function where the error happened */ | 1158 | __u8 s_first_error_func[32]; /* function where the error happened */ |
1152 | __le32 s_first_error_line; /* line number where error happened */ | 1159 | __le32 s_first_error_line; /* line number where error happened */ |
1153 | __le32 s_last_error_time; /* most recent time of an error */ | 1160 | __le32 s_last_error_time; /* most recent time of an error */ |
1154 | __le32 s_last_error_ino; /* inode involved in last error */ | 1161 | __le32 s_last_error_ino; /* inode involved in last error */ |
1155 | __le32 s_last_error_line; /* line number where error happened */ | 1162 | __le32 s_last_error_line; /* line number where error happened */ |
1156 | __le64 s_last_error_block; /* block involved of last error */ | 1163 | __le64 s_last_error_block; /* block involved of last error */ |
1157 | __u8 s_last_error_func[32]; /* function where the error happened */ | 1164 | __u8 s_last_error_func[32]; /* function where the error happened */ |
1158 | #define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts) | 1165 | #define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts) |
1159 | __u8 s_mount_opts[64]; | 1166 | __u8 s_mount_opts[64]; |
1160 | __le32 s_usr_quota_inum; /* inode for tracking user quota */ | 1167 | __le32 s_usr_quota_inum; /* inode for tracking user quota */ |
1161 | __le32 s_grp_quota_inum; /* inode for tracking group quota */ | 1168 | __le32 s_grp_quota_inum; /* inode for tracking group quota */ |
1162 | __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ | 1169 | __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ |
1163 | __le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */ | 1170 | __le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */ |
1164 | __le32 s_reserved[106]; /* Padding to the end of the block */ | 1171 | __u8 s_encrypt_algos[4]; /* Encryption algorithms in use */ |
1172 | __le32 s_reserved[105]; /* Padding to the end of the block */ | ||
1165 | __le32 s_checksum; /* crc32c(superblock) */ | 1173 | __le32 s_checksum; /* crc32c(superblock) */ |
1166 | }; | 1174 | }; |
1167 | 1175 | ||
1168 | #define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) | 1176 | #define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) |
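The compat/incompat/ro_compat rules documented inside the struct above become concrete at mount time. A condensed sketch of that policy (the real check is ext4_feature_set_ok() in fs/ext4/super.c; the EXT4_HAS_*_FEATURE macros and *_SUPP masks are defined elsewhere in this header):

    static int feature_set_ok_sketch(struct super_block *sb, int readonly)
    {
            /* Any unknown incompat bit: refuse to mount at all. */
            if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP))
                    return 0;
            if (readonly)
                    return 1;
            /* Any unknown ro_compat bit: allow read-only mounts only. */
            if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP))
                    return 0;
            return 1;
    }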
1169 | 1177 | ||
1170 | #ifdef __KERNEL__ | 1178 | #ifdef __KERNEL__ |
1171 | 1179 | ||
1172 | /* | 1180 | /* |
1173 | * run-time mount flags | 1181 | * run-time mount flags |
1174 | */ | 1182 | */ |
1175 | #define EXT4_MF_MNTDIR_SAMPLED 0x0001 | 1183 | #define EXT4_MF_MNTDIR_SAMPLED 0x0001 |
1176 | #define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ | 1184 | #define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ |
1177 | 1185 | ||
1178 | /* Number of quota types we support */ | 1186 | /* Number of quota types we support */ |
1179 | #define EXT4_MAXQUOTAS 2 | 1187 | #define EXT4_MAXQUOTAS 2 |
1180 | 1188 | ||
1181 | /* | 1189 | /* |
1182 | * fourth extended-fs super-block data in memory | 1190 | * fourth extended-fs super-block data in memory |
1183 | */ | 1191 | */ |
1184 | struct ext4_sb_info { | 1192 | struct ext4_sb_info { |
1185 | unsigned long s_desc_size; /* Size of a group descriptor in bytes */ | 1193 | unsigned long s_desc_size; /* Size of a group descriptor in bytes */ |
1186 | unsigned long s_inodes_per_block;/* Number of inodes per block */ | 1194 | unsigned long s_inodes_per_block;/* Number of inodes per block */ |
1187 | unsigned long s_blocks_per_group;/* Number of blocks in a group */ | 1195 | unsigned long s_blocks_per_group;/* Number of blocks in a group */ |
1188 | unsigned long s_clusters_per_group; /* Number of clusters in a group */ | 1196 | unsigned long s_clusters_per_group; /* Number of clusters in a group */ |
1189 | unsigned long s_inodes_per_group;/* Number of inodes in a group */ | 1197 | unsigned long s_inodes_per_group;/* Number of inodes in a group */ |
1190 | unsigned long s_itb_per_group; /* Number of inode table blocks per group */ | 1198 | unsigned long s_itb_per_group; /* Number of inode table blocks per group */ |
1191 | unsigned long s_gdb_count; /* Number of group descriptor blocks */ | 1199 | unsigned long s_gdb_count; /* Number of group descriptor blocks */ |
1192 | unsigned long s_desc_per_block; /* Number of group descriptors per block */ | 1200 | unsigned long s_desc_per_block; /* Number of group descriptors per block */ |
1193 | ext4_group_t s_groups_count; /* Number of groups in the fs */ | 1201 | ext4_group_t s_groups_count; /* Number of groups in the fs */ |
1194 | ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ | 1202 | ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ |
1195 | unsigned long s_overhead; /* # of fs overhead clusters */ | 1203 | unsigned long s_overhead; /* # of fs overhead clusters */ |
1196 | unsigned int s_cluster_ratio; /* Number of blocks per cluster */ | 1204 | unsigned int s_cluster_ratio; /* Number of blocks per cluster */ |
1197 | unsigned int s_cluster_bits; /* log2 of s_cluster_ratio */ | 1205 | unsigned int s_cluster_bits; /* log2 of s_cluster_ratio */ |
1198 | loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ | 1206 | loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ |
1199 | struct buffer_head * s_sbh; /* Buffer containing the super block */ | 1207 | struct buffer_head * s_sbh; /* Buffer containing the super block */ |
1200 | struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ | 1208 | struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ |
1201 | struct buffer_head **s_group_desc; | 1209 | struct buffer_head **s_group_desc; |
1202 | unsigned int s_mount_opt; | 1210 | unsigned int s_mount_opt; |
1203 | unsigned int s_mount_opt2; | 1211 | unsigned int s_mount_opt2; |
1204 | unsigned int s_mount_flags; | 1212 | unsigned int s_mount_flags; |
1205 | unsigned int s_def_mount_opt; | 1213 | unsigned int s_def_mount_opt; |
1206 | ext4_fsblk_t s_sb_block; | 1214 | ext4_fsblk_t s_sb_block; |
1207 | atomic64_t s_resv_clusters; | 1215 | atomic64_t s_resv_clusters; |
1208 | kuid_t s_resuid; | 1216 | kuid_t s_resuid; |
1209 | kgid_t s_resgid; | 1217 | kgid_t s_resgid; |
1210 | unsigned short s_mount_state; | 1218 | unsigned short s_mount_state; |
1211 | unsigned short s_pad; | 1219 | unsigned short s_pad; |
1212 | int s_addr_per_block_bits; | 1220 | int s_addr_per_block_bits; |
1213 | int s_desc_per_block_bits; | 1221 | int s_desc_per_block_bits; |
1214 | int s_inode_size; | 1222 | int s_inode_size; |
1215 | int s_first_ino; | 1223 | int s_first_ino; |
1216 | unsigned int s_inode_readahead_blks; | 1224 | unsigned int s_inode_readahead_blks; |
1217 | unsigned int s_inode_goal; | 1225 | unsigned int s_inode_goal; |
1218 | spinlock_t s_next_gen_lock; | 1226 | spinlock_t s_next_gen_lock; |
1219 | u32 s_next_generation; | 1227 | u32 s_next_generation; |
1220 | u32 s_hash_seed[4]; | 1228 | u32 s_hash_seed[4]; |
1221 | int s_def_hash_version; | 1229 | int s_def_hash_version; |
1222 | int s_hash_unsigned; /* 3 if hash should be unsigned, 0 if not */ | 1230 | int s_hash_unsigned; /* 3 if hash should be unsigned, 0 if not */ |
1223 | struct percpu_counter s_freeclusters_counter; | 1231 | struct percpu_counter s_freeclusters_counter; |
1224 | struct percpu_counter s_freeinodes_counter; | 1232 | struct percpu_counter s_freeinodes_counter; |
1225 | struct percpu_counter s_dirs_counter; | 1233 | struct percpu_counter s_dirs_counter; |
1226 | struct percpu_counter s_dirtyclusters_counter; | 1234 | struct percpu_counter s_dirtyclusters_counter; |
1227 | struct blockgroup_lock *s_blockgroup_lock; | 1235 | struct blockgroup_lock *s_blockgroup_lock; |
1228 | struct proc_dir_entry *s_proc; | 1236 | struct proc_dir_entry *s_proc; |
1229 | struct kobject s_kobj; | 1237 | struct kobject s_kobj; |
1230 | struct completion s_kobj_unregister; | 1238 | struct completion s_kobj_unregister; |
1231 | struct super_block *s_sb; | 1239 | struct super_block *s_sb; |
1232 | 1240 | ||
1233 | /* Journaling */ | 1241 | /* Journaling */ |
1234 | struct journal_s *s_journal; | 1242 | struct journal_s *s_journal; |
1235 | struct list_head s_orphan; | 1243 | struct list_head s_orphan; |
1236 | struct mutex s_orphan_lock; | 1244 | struct mutex s_orphan_lock; |
1237 | unsigned long s_resize_flags; /* Flags indicating if there | 1245 | unsigned long s_resize_flags; /* Flags indicating if there |
1238 | is a resizer */ | 1246 | is a resizer */ |
1239 | unsigned long s_commit_interval; | 1247 | unsigned long s_commit_interval; |
1240 | u32 s_max_batch_time; | 1248 | u32 s_max_batch_time; |
1241 | u32 s_min_batch_time; | 1249 | u32 s_min_batch_time; |
1242 | struct block_device *journal_bdev; | 1250 | struct block_device *journal_bdev; |
1243 | #ifdef CONFIG_QUOTA | 1251 | #ifdef CONFIG_QUOTA |
1244 | char *s_qf_names[EXT4_MAXQUOTAS]; /* Names of quota files with journalled quota */ | 1252 | char *s_qf_names[EXT4_MAXQUOTAS]; /* Names of quota files with journalled quota */ |
1245 | int s_jquota_fmt; /* Format of quota to use */ | 1253 | int s_jquota_fmt; /* Format of quota to use */ |
1246 | #endif | 1254 | #endif |
1247 | unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ | 1255 | unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ |
1248 | struct rb_root system_blks; | 1256 | struct rb_root system_blks; |
1249 | 1257 | ||
1250 | #ifdef EXTENTS_STATS | 1258 | #ifdef EXTENTS_STATS |
1251 | /* ext4 extents stats */ | 1259 | /* ext4 extents stats */ |
1252 | unsigned long s_ext_min; | 1260 | unsigned long s_ext_min; |
1253 | unsigned long s_ext_max; | 1261 | unsigned long s_ext_max; |
1254 | unsigned long s_depth_max; | 1262 | unsigned long s_depth_max; |
1255 | spinlock_t s_ext_stats_lock; | 1263 | spinlock_t s_ext_stats_lock; |
1256 | unsigned long s_ext_blocks; | 1264 | unsigned long s_ext_blocks; |
1257 | unsigned long s_ext_extents; | 1265 | unsigned long s_ext_extents; |
1258 | #endif | 1266 | #endif |
1259 | 1267 | ||
1260 | /* for buddy allocator */ | 1268 | /* for buddy allocator */ |
1261 | struct ext4_group_info ***s_group_info; | 1269 | struct ext4_group_info ***s_group_info; |
1262 | struct inode *s_buddy_cache; | 1270 | struct inode *s_buddy_cache; |
1263 | spinlock_t s_md_lock; | 1271 | spinlock_t s_md_lock; |
1264 | unsigned short *s_mb_offsets; | 1272 | unsigned short *s_mb_offsets; |
1265 | unsigned int *s_mb_maxs; | 1273 | unsigned int *s_mb_maxs; |
1266 | unsigned int s_group_info_size; | 1274 | unsigned int s_group_info_size; |
1267 | 1275 | ||
1268 | /* tunables */ | 1276 | /* tunables */ |
1269 | unsigned long s_stripe; | 1277 | unsigned long s_stripe; |
1270 | unsigned int s_mb_stream_request; | 1278 | unsigned int s_mb_stream_request; |
1271 | unsigned int s_mb_max_to_scan; | 1279 | unsigned int s_mb_max_to_scan; |
1272 | unsigned int s_mb_min_to_scan; | 1280 | unsigned int s_mb_min_to_scan; |
1273 | unsigned int s_mb_stats; | 1281 | unsigned int s_mb_stats; |
1274 | unsigned int s_mb_order2_reqs; | 1282 | unsigned int s_mb_order2_reqs; |
1275 | unsigned int s_mb_group_prealloc; | 1283 | unsigned int s_mb_group_prealloc; |
1276 | unsigned int s_max_dir_size_kb; | 1284 | unsigned int s_max_dir_size_kb; |
1277 | /* where last allocation was done - for stream allocation */ | 1285 | /* where last allocation was done - for stream allocation */ |
1278 | unsigned long s_mb_last_group; | 1286 | unsigned long s_mb_last_group; |
1279 | unsigned long s_mb_last_start; | 1287 | unsigned long s_mb_last_start; |
1280 | 1288 | ||
1281 | /* stats for buddy allocator */ | 1289 | /* stats for buddy allocator */ |
1282 | atomic_t s_bal_reqs; /* number of reqs with len > 1 */ | 1290 | atomic_t s_bal_reqs; /* number of reqs with len > 1 */ |
1283 | atomic_t s_bal_success; /* we found long enough chunks */ | 1291 | atomic_t s_bal_success; /* we found long enough chunks */ |
1284 | atomic_t s_bal_allocated; /* in blocks */ | 1292 | atomic_t s_bal_allocated; /* in blocks */ |
1285 | atomic_t s_bal_ex_scanned; /* total extents scanned */ | 1293 | atomic_t s_bal_ex_scanned; /* total extents scanned */ |
1286 | atomic_t s_bal_goals; /* goal hits */ | 1294 | atomic_t s_bal_goals; /* goal hits */ |
1287 | atomic_t s_bal_breaks; /* too long searches */ | 1295 | atomic_t s_bal_breaks; /* too long searches */ |
1288 | atomic_t s_bal_2orders; /* 2^order hits */ | 1296 | atomic_t s_bal_2orders; /* 2^order hits */ |
1289 | spinlock_t s_bal_lock; | 1297 | spinlock_t s_bal_lock; |
1290 | unsigned long s_mb_buddies_generated; | 1298 | unsigned long s_mb_buddies_generated; |
1291 | unsigned long long s_mb_generation_time; | 1299 | unsigned long long s_mb_generation_time; |
1292 | atomic_t s_mb_lost_chunks; | 1300 | atomic_t s_mb_lost_chunks; |
1293 | atomic_t s_mb_preallocated; | 1301 | atomic_t s_mb_preallocated; |
1294 | atomic_t s_mb_discarded; | 1302 | atomic_t s_mb_discarded; |
1295 | atomic_t s_lock_busy; | 1303 | atomic_t s_lock_busy; |
1296 | 1304 | ||
1297 | /* locality groups */ | 1305 | /* locality groups */ |
1298 | struct ext4_locality_group __percpu *s_locality_groups; | 1306 | struct ext4_locality_group __percpu *s_locality_groups; |
1299 | 1307 | ||
1300 | /* for write statistics */ | 1308 | /* for write statistics */ |
1301 | unsigned long s_sectors_written_start; | 1309 | unsigned long s_sectors_written_start; |
1302 | u64 s_kbytes_written; | 1310 | u64 s_kbytes_written; |
1303 | 1311 | ||
1304 | /* the size of zero-out chunk */ | 1312 | /* the size of zero-out chunk */ |
1305 | unsigned int s_extent_max_zeroout_kb; | 1313 | unsigned int s_extent_max_zeroout_kb; |
1306 | 1314 | ||
1307 | unsigned int s_log_groups_per_flex; | 1315 | unsigned int s_log_groups_per_flex; |
1308 | struct flex_groups *s_flex_groups; | 1316 | struct flex_groups *s_flex_groups; |
1309 | ext4_group_t s_flex_groups_allocated; | 1317 | ext4_group_t s_flex_groups_allocated; |
1310 | 1318 | ||
1311 | /* workqueue for reserved extent conversions (buffered io) */ | 1319 | /* workqueue for reserved extent conversions (buffered io) */ |
1312 | struct workqueue_struct *rsv_conversion_wq; | 1320 | struct workqueue_struct *rsv_conversion_wq; |
1313 | 1321 | ||
1314 | /* timer for periodic error stats printing */ | 1322 | /* timer for periodic error stats printing */ |
1315 | struct timer_list s_err_report; | 1323 | struct timer_list s_err_report; |
1316 | 1324 | ||
1317 | /* Lazy inode table initialization info */ | 1325 | /* Lazy inode table initialization info */ |
1318 | struct ext4_li_request *s_li_request; | 1326 | struct ext4_li_request *s_li_request; |
1319 | /* Wait multiplier for lazy initialization thread */ | 1327 | /* Wait multiplier for lazy initialization thread */ |
1320 | unsigned int s_li_wait_mult; | 1328 | unsigned int s_li_wait_mult; |
1321 | 1329 | ||
1322 | /* Kernel thread for multiple mount protection */ | 1330 | /* Kernel thread for multiple mount protection */ |
1323 | struct task_struct *s_mmp_tsk; | 1331 | struct task_struct *s_mmp_tsk; |
1324 | 1332 | ||
1325 | /* record the last minlen when FITRIM is called. */ | 1333 | /* record the last minlen when FITRIM is called. */ |
1326 | atomic_t s_last_trim_minblks; | 1334 | atomic_t s_last_trim_minblks; |
1327 | 1335 | ||
1328 | /* Reference to checksum algorithm driver via cryptoapi */ | 1336 | /* Reference to checksum algorithm driver via cryptoapi */ |
1329 | struct crypto_shash *s_chksum_driver; | 1337 | struct crypto_shash *s_chksum_driver; |
1330 | 1338 | ||
1331 | /* Precomputed FS UUID checksum for seeding other checksums */ | 1339 | /* Precomputed FS UUID checksum for seeding other checksums */ |
1332 | __u32 s_csum_seed; | 1340 | __u32 s_csum_seed; |
1333 | 1341 | ||
1334 | /* Reclaim extents from extent status tree */ | 1342 | /* Reclaim extents from extent status tree */ |
1335 | struct shrinker s_es_shrinker; | 1343 | struct shrinker s_es_shrinker; |
1336 | struct list_head s_es_list; /* List of inodes with reclaimable extents */ | 1344 | struct list_head s_es_list; /* List of inodes with reclaimable extents */ |
1337 | long s_es_nr_inode; | 1345 | long s_es_nr_inode; |
1338 | struct ext4_es_stats s_es_stats; | 1346 | struct ext4_es_stats s_es_stats; |
1339 | struct mb_cache *s_mb_cache; | 1347 | struct mb_cache *s_mb_cache; |
1340 | spinlock_t s_es_lock ____cacheline_aligned_in_smp; | 1348 | spinlock_t s_es_lock ____cacheline_aligned_in_smp; |
1341 | 1349 | ||
1342 | /* Ratelimit ext4 messages. */ | 1350 | /* Ratelimit ext4 messages. */ |
1343 | struct ratelimit_state s_err_ratelimit_state; | 1351 | struct ratelimit_state s_err_ratelimit_state; |
1344 | struct ratelimit_state s_warning_ratelimit_state; | 1352 | struct ratelimit_state s_warning_ratelimit_state; |
1345 | struct ratelimit_state s_msg_ratelimit_state; | 1353 | struct ratelimit_state s_msg_ratelimit_state; |
1346 | }; | 1354 | }; |
1347 | 1355 | ||
1348 | static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) | 1356 | static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) |
1349 | { | 1357 | { |
1350 | return sb->s_fs_info; | 1358 | return sb->s_fs_info; |
1351 | } | 1359 | } |
1352 | static inline struct ext4_inode_info *EXT4_I(struct inode *inode) | 1360 | static inline struct ext4_inode_info *EXT4_I(struct inode *inode) |
1353 | { | 1361 | { |
1354 | return container_of(inode, struct ext4_inode_info, vfs_inode); | 1362 | return container_of(inode, struct ext4_inode_info, vfs_inode); |
1355 | } | 1363 | } |
1356 | 1364 | ||
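/*
 * Usage sketch: vfs_inode is embedded inside struct ext4_inode_info,
 * so EXT4_I() is a constant-offset container_of() with no lookup and
 * no failure path.  A typical (hypothetical) caller:
 *
 *	struct ext4_inode_info *ei = EXT4_I(inode);
 *	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 *	loff_t disksize = ei->i_disksize;		-- per-inode state
 *	ext4_group_t ngroups = sbi->s_groups_count;	-- per-fs state
 */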
1357 | static inline struct timespec ext4_current_time(struct inode *inode) | 1365 | static inline struct timespec ext4_current_time(struct inode *inode) |
1358 | { | 1366 | { |
1359 | return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ? | 1367 | return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ? |
1360 | current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; | 1368 | current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; |
1361 | } | 1369 | } |
1362 | 1370 | ||
1363 | static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) | 1371 | static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) |
1364 | { | 1372 | { |
1365 | return ino == EXT4_ROOT_INO || | 1373 | return ino == EXT4_ROOT_INO || |
1366 | ino == EXT4_USR_QUOTA_INO || | 1374 | ino == EXT4_USR_QUOTA_INO || |
1367 | ino == EXT4_GRP_QUOTA_INO || | 1375 | ino == EXT4_GRP_QUOTA_INO || |
1368 | ino == EXT4_BOOT_LOADER_INO || | 1376 | ino == EXT4_BOOT_LOADER_INO || |
1369 | ino == EXT4_JOURNAL_INO || | 1377 | ino == EXT4_JOURNAL_INO || |
1370 | ino == EXT4_RESIZE_INO || | 1378 | ino == EXT4_RESIZE_INO || |
1371 | (ino >= EXT4_FIRST_INO(sb) && | 1379 | (ino >= EXT4_FIRST_INO(sb) && |
1372 | ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); | 1380 | ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); |
1373 | } | 1381 | } |
1374 | 1382 | ||
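/*
 * Worked example, assuming the usual mke2fs layout where
 * EXT4_FIRST_INO(sb) == 11:
 *
 *	ext4_valid_inum(sb, 2)  -> 1	(EXT4_ROOT_INO)
 *	ext4_valid_inum(sb, 8)  -> 1	(EXT4_JOURNAL_INO)
 *	ext4_valid_inum(sb, 9)  -> 0	(reserved, but not listed above)
 *	ext4_valid_inum(sb, 11) -> 1	(first ordinary inode)
 */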
1375 | static inline void ext4_set_io_unwritten_flag(struct inode *inode, | 1383 | static inline void ext4_set_io_unwritten_flag(struct inode *inode, |
1376 | struct ext4_io_end *io_end) | 1384 | struct ext4_io_end *io_end) |
1377 | { | 1385 | { |
1378 | if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { | 1386 | if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { |
1379 | io_end->flag |= EXT4_IO_END_UNWRITTEN; | 1387 | io_end->flag |= EXT4_IO_END_UNWRITTEN; |
1380 | atomic_inc(&EXT4_I(inode)->i_unwritten); | 1388 | atomic_inc(&EXT4_I(inode)->i_unwritten); |
1381 | } | 1389 | } |
1382 | } | 1390 | } |
1383 | 1391 | ||
1384 | static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode) | 1392 | static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode) |
1385 | { | 1393 | { |
1386 | return inode->i_private; | 1394 | return inode->i_private; |
1387 | } | 1395 | } |
1388 | 1396 | ||
1389 | static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io) | 1397 | static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io) |
1390 | { | 1398 | { |
1391 | inode->i_private = io; | 1399 | inode->i_private = io; |
1392 | } | 1400 | } |
1393 | 1401 | ||
1394 | /* | 1402 | /* |
1395 | * Inode dynamic state flags | 1403 | * Inode dynamic state flags |
1396 | */ | 1404 | */ |
1397 | enum { | 1405 | enum { |
1398 | EXT4_STATE_JDATA, /* journaled data exists */ | 1406 | EXT4_STATE_JDATA, /* journaled data exists */ |
1399 | EXT4_STATE_NEW, /* inode is newly created */ | 1407 | EXT4_STATE_NEW, /* inode is newly created */ |
1400 | EXT4_STATE_XATTR, /* has in-inode xattrs */ | 1408 | EXT4_STATE_XATTR, /* has in-inode xattrs */ |
1401 | EXT4_STATE_NO_EXPAND, /* No space for expansion */ | 1409 | EXT4_STATE_NO_EXPAND, /* No space for expansion */ |
1402 | EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ | 1410 | EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ |
1403 | EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ | 1411 | EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ |
1404 | EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ | 1412 | EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ |
1405 | EXT4_STATE_NEWENTRY, /* File just added to dir */ | 1413 | EXT4_STATE_NEWENTRY, /* File just added to dir */ |
1406 | EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read | 1414 | EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read |
1407 | nolocking */ | 1415 | nolocking */ |
1408 | EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ | 1416 | EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ |
1409 | EXT4_STATE_ORDERED_MODE, /* data=ordered mode */ | 1417 | EXT4_STATE_ORDERED_MODE, /* data=ordered mode */ |
1410 | EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ | 1418 | EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ |
1411 | }; | 1419 | }; |
1412 | 1420 | ||
1413 | #define EXT4_INODE_BIT_FNS(name, field, offset) \ | 1421 | #define EXT4_INODE_BIT_FNS(name, field, offset) \ |
1414 | static inline int ext4_test_inode_##name(struct inode *inode, int bit) \ | 1422 | static inline int ext4_test_inode_##name(struct inode *inode, int bit) \ |
1415 | { \ | 1423 | { \ |
1416 | return test_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ | 1424 | return test_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ |
1417 | } \ | 1425 | } \ |
1418 | static inline void ext4_set_inode_##name(struct inode *inode, int bit) \ | 1426 | static inline void ext4_set_inode_##name(struct inode *inode, int bit) \ |
1419 | { \ | 1427 | { \ |
1420 | set_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ | 1428 | set_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ |
1421 | } \ | 1429 | } \ |
1422 | static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \ | 1430 | static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \ |
1423 | { \ | 1431 | { \ |
1424 | clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ | 1432 | clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ |
1425 | } | 1433 | } |
1426 | 1434 | ||
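/*
 * Expansion sketch: each EXT4_INODE_BIT_FNS(name, field, offset)
 * instantiation below generates an atomic test/set/clear triple over
 * EXT4_I(inode)->i_<field>, shifted by <offset>.  For example,
 *
 *	ext4_set_inode_flag(inode, EXT4_INODE_INDEX);
 *
 * is an atomic set_bit(EXT4_INODE_INDEX, &EXT4_I(inode)->i_flags);
 * on 64-bit builds the state variant packs its bits into the upper
 * 32 bits of the same word (offset 32).
 */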
1427 | /* Add these declarations here only so that these functions can be | 1435 | /* Add these declarations here only so that these functions can be |
1428 | * found by name. Otherwise, they are very hard to locate. */ | 1436 | * found by name. Otherwise, they are very hard to locate. */ |
1429 | static inline int ext4_test_inode_flag(struct inode *inode, int bit); | 1437 | static inline int ext4_test_inode_flag(struct inode *inode, int bit); |
1430 | static inline void ext4_set_inode_flag(struct inode *inode, int bit); | 1438 | static inline void ext4_set_inode_flag(struct inode *inode, int bit); |
1431 | static inline void ext4_clear_inode_flag(struct inode *inode, int bit); | 1439 | static inline void ext4_clear_inode_flag(struct inode *inode, int bit); |
1432 | EXT4_INODE_BIT_FNS(flag, flags, 0) | 1440 | EXT4_INODE_BIT_FNS(flag, flags, 0) |
1433 | 1441 | ||
1434 | /* Add these declarations here only so that these functions can be | 1442 | /* Add these declarations here only so that these functions can be |
1435 | * found by name. Otherwise, they are very hard to locate. */ | 1443 | * found by name. Otherwise, they are very hard to locate. */ |
1436 | static inline int ext4_test_inode_state(struct inode *inode, int bit); | 1444 | static inline int ext4_test_inode_state(struct inode *inode, int bit); |
1437 | static inline void ext4_set_inode_state(struct inode *inode, int bit); | 1445 | static inline void ext4_set_inode_state(struct inode *inode, int bit); |
1438 | static inline void ext4_clear_inode_state(struct inode *inode, int bit); | 1446 | static inline void ext4_clear_inode_state(struct inode *inode, int bit); |
1439 | #if (BITS_PER_LONG < 64) | 1447 | #if (BITS_PER_LONG < 64) |
1440 | EXT4_INODE_BIT_FNS(state, state_flags, 0) | 1448 | EXT4_INODE_BIT_FNS(state, state_flags, 0) |
1441 | 1449 | ||
1442 | static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) | 1450 | static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) |
1443 | { | 1451 | { |
1444 | (ei)->i_state_flags = 0; | 1452 | (ei)->i_state_flags = 0; |
1445 | } | 1453 | } |
1446 | #else | 1454 | #else |
1447 | EXT4_INODE_BIT_FNS(state, flags, 32) | 1455 | EXT4_INODE_BIT_FNS(state, flags, 32) |
1448 | 1456 | ||
1449 | static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) | 1457 | static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) |
1450 | { | 1458 | { |
1451 | /* We depend on the fact that callers will set i_flags */ | 1459 | /* We depend on the fact that callers will set i_flags */ |
1452 | } | 1460 | } |
1453 | #endif | 1461 | #endif |
1454 | #else | 1462 | #else |
1455 | /* Assume that user mode programs are passing in an ext4fs superblock, not | 1463 | /* Assume that user mode programs are passing in an ext4fs superblock, not |
1456 | * a kernel struct super_block. This will allow us to call the feature-test | 1464 | * a kernel struct super_block. This will allow us to call the feature-test |
1457 | * macros from user land. */ | 1465 | * macros from user land. */ |
1458 | #define EXT4_SB(sb) (sb) | 1466 | #define EXT4_SB(sb) (sb) |
1459 | #endif | 1467 | #endif |
1460 | 1468 | ||
1461 | #define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime | 1469 | #define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime |
1462 | 1470 | ||
1463 | /* | 1471 | /* |
1464 | * Codes for operating systems | 1472 | * Codes for operating systems |
1465 | */ | 1473 | */ |
1466 | #define EXT4_OS_LINUX 0 | 1474 | #define EXT4_OS_LINUX 0 |
1467 | #define EXT4_OS_HURD 1 | 1475 | #define EXT4_OS_HURD 1 |
1468 | #define EXT4_OS_MASIX 2 | 1476 | #define EXT4_OS_MASIX 2 |
1469 | #define EXT4_OS_FREEBSD 3 | 1477 | #define EXT4_OS_FREEBSD 3 |
1470 | #define EXT4_OS_LITES 4 | 1478 | #define EXT4_OS_LITES 4 |
1471 | 1479 | ||
1472 | /* | 1480 | /* |
1473 | * Revision levels | 1481 | * Revision levels |
1474 | */ | 1482 | */ |
1475 | #define EXT4_GOOD_OLD_REV 0 /* The good old (original) format */ | 1483 | #define EXT4_GOOD_OLD_REV 0 /* The good old (original) format */ |
1476 | #define EXT4_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ | 1484 | #define EXT4_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ |
1477 | 1485 | ||
1478 | #define EXT4_CURRENT_REV EXT4_GOOD_OLD_REV | 1486 | #define EXT4_CURRENT_REV EXT4_GOOD_OLD_REV |
1479 | #define EXT4_MAX_SUPP_REV EXT4_DYNAMIC_REV | 1487 | #define EXT4_MAX_SUPP_REV EXT4_DYNAMIC_REV |
1480 | 1488 | ||
1481 | #define EXT4_GOOD_OLD_INODE_SIZE 128 | 1489 | #define EXT4_GOOD_OLD_INODE_SIZE 128 |
1482 | 1490 | ||
1483 | /* | 1491 | /* |
1484 | * Feature set definitions | 1492 | * Feature set definitions |
1485 | */ | 1493 | */ |
1486 | 1494 | ||
1487 | #define EXT4_HAS_COMPAT_FEATURE(sb,mask) \ | 1495 | #define EXT4_HAS_COMPAT_FEATURE(sb,mask) \ |
1488 | ((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0) | 1496 | ((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0) |
1489 | #define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \ | 1497 | #define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \ |
1490 | ((EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) != 0) | 1498 | ((EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) != 0) |
1491 | #define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \ | 1499 | #define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \ |
1492 | ((EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) != 0) | 1500 | ((EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) != 0) |
1493 | #define EXT4_SET_COMPAT_FEATURE(sb,mask) \ | 1501 | #define EXT4_SET_COMPAT_FEATURE(sb,mask) \ |
1494 | EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) | 1502 | EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) |
1495 | #define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \ | 1503 | #define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \ |
1496 | EXT4_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask) | 1504 | EXT4_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask) |
1497 | #define EXT4_SET_INCOMPAT_FEATURE(sb,mask) \ | 1505 | #define EXT4_SET_INCOMPAT_FEATURE(sb,mask) \ |
1498 | EXT4_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask) | 1506 | EXT4_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask) |
1499 | #define EXT4_CLEAR_COMPAT_FEATURE(sb,mask) \ | 1507 | #define EXT4_CLEAR_COMPAT_FEATURE(sb,mask) \ |
1500 | EXT4_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask) | 1508 | EXT4_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask) |
1501 | #define EXT4_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ | 1509 | #define EXT4_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ |
1502 | EXT4_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask) | 1510 | EXT4_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask) |
1503 | #define EXT4_CLEAR_INCOMPAT_FEATURE(sb,mask) \ | 1511 | #define EXT4_CLEAR_INCOMPAT_FEATURE(sb,mask) \ |
1504 | EXT4_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask) | 1512 | EXT4_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask) |
1505 | 1513 | ||
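/*
 * Usage sketch (simplified from the mount path in super.c): an
 * unsupported RO_COMPAT bit, such as the READONLY bit reserved below,
 * downgrades the mount to read-only rather than failing it, roughly:
 *
 *	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_READONLY))
 *		sb->s_flags |= MS_RDONLY;
 */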
1506 | #define EXT4_FEATURE_COMPAT_DIR_PREALLOC 0x0001 | 1514 | #define EXT4_FEATURE_COMPAT_DIR_PREALLOC 0x0001 |
1507 | #define EXT4_FEATURE_COMPAT_IMAGIC_INODES 0x0002 | 1515 | #define EXT4_FEATURE_COMPAT_IMAGIC_INODES 0x0002 |
1508 | #define EXT4_FEATURE_COMPAT_HAS_JOURNAL 0x0004 | 1516 | #define EXT4_FEATURE_COMPAT_HAS_JOURNAL 0x0004 |
1509 | #define EXT4_FEATURE_COMPAT_EXT_ATTR 0x0008 | 1517 | #define EXT4_FEATURE_COMPAT_EXT_ATTR 0x0008 |
1510 | #define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010 | 1518 | #define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010 |
1511 | #define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020 | 1519 | #define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020 |
1512 | #define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200 | 1520 | #define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200 |
1513 | 1521 | ||
1514 | #define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 | 1522 | #define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 |
1515 | #define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 | 1523 | #define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 |
1516 | #define EXT4_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 | 1524 | #define EXT4_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 |
1517 | #define EXT4_FEATURE_RO_COMPAT_HUGE_FILE 0x0008 | 1525 | #define EXT4_FEATURE_RO_COMPAT_HUGE_FILE 0x0008 |
1518 | #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 | 1526 | #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 |
1519 | #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 | 1527 | #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 |
1520 | #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 | 1528 | #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 |
1521 | #define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 | 1529 | #define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 |
1522 | #define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 | 1530 | #define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 |
1523 | /* | 1531 | /* |
1524 | * METADATA_CSUM also enables group descriptor checksums (GDT_CSUM). When | 1532 | * METADATA_CSUM also enables group descriptor checksums (GDT_CSUM). When |
1525 | * METADATA_CSUM is set, group descriptor checksums use the same algorithm as | 1533 | * METADATA_CSUM is set, group descriptor checksums use the same algorithm as |
1526 | * all other data structures' checksums. However, the METADATA_CSUM and | 1534 | * all other data structures' checksums. However, the METADATA_CSUM and |
1527 | * GDT_CSUM bits are mutually exclusive. | 1535 | * GDT_CSUM bits are mutually exclusive. |
1528 | */ | 1536 | */ |
1529 | #define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 | 1537 | #define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 |
 | | 1538 | #define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000 |
1530 | 1539 | ||
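/*
 * A validity check following the rule above could look like this
 * (illustrative sketch only -- ext4_csum_features_consistent() is a
 * hypothetical helper, not part of this header):
 */
static inline int ext4_csum_features_consistent(struct super_block *sb)
{
	return !(EXT4_HAS_RO_COMPAT_FEATURE(sb,
			EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
		 EXT4_HAS_RO_COMPAT_FEATURE(sb,
			EXT4_FEATURE_RO_COMPAT_GDT_CSUM));
}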
1531 | #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 | 1540 | #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 |
1532 | #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 | 1541 | #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 |
1533 | #define EXT4_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ | 1542 | #define EXT4_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ |
1534 | #define EXT4_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ | 1543 | #define EXT4_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ |
1535 | #define EXT4_FEATURE_INCOMPAT_META_BG 0x0010 | 1544 | #define EXT4_FEATURE_INCOMPAT_META_BG 0x0010 |
1536 | #define EXT4_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ | 1545 | #define EXT4_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ |
1537 | #define EXT4_FEATURE_INCOMPAT_64BIT 0x0080 | 1546 | #define EXT4_FEATURE_INCOMPAT_64BIT 0x0080 |
1538 | #define EXT4_FEATURE_INCOMPAT_MMP 0x0100 | 1547 | #define EXT4_FEATURE_INCOMPAT_MMP 0x0100 |
1539 | #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 | 1548 | #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 |
1540 | #define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ | 1549 | #define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ |
1541 | #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ | 1550 | #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ |
1542 | #define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */ | 1551 | #define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */ |
1543 | #define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ | 1552 | #define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ |
1544 | #define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ | 1553 | #define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ |
 | | 1554 | #define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000 |
1545 | 1555 | ||
1546 | #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR | 1556 | #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR |
1547 | #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ | 1557 | #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ |
1548 | EXT4_FEATURE_INCOMPAT_META_BG) | 1558 | EXT4_FEATURE_INCOMPAT_META_BG) |
1549 | #define EXT2_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ | 1559 | #define EXT2_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ |
1550 | EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ | 1560 | EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ |
1551 | EXT4_FEATURE_RO_COMPAT_BTREE_DIR) | 1561 | EXT4_FEATURE_RO_COMPAT_BTREE_DIR) |
1552 | 1562 | ||
1553 | #define EXT3_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR | 1563 | #define EXT3_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR |
1554 | #define EXT3_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ | 1564 | #define EXT3_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ |
1555 | EXT4_FEATURE_INCOMPAT_RECOVER| \ | 1565 | EXT4_FEATURE_INCOMPAT_RECOVER| \ |
1556 | EXT4_FEATURE_INCOMPAT_META_BG) | 1566 | EXT4_FEATURE_INCOMPAT_META_BG) |
1557 | #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ | 1567 | #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ |
1558 | EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ | 1568 | EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ |
1559 | EXT4_FEATURE_RO_COMPAT_BTREE_DIR) | 1569 | EXT4_FEATURE_RO_COMPAT_BTREE_DIR) |
1560 | 1570 | ||
1561 | #define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR | 1571 | #define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR |
1562 | #define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ | 1572 | #define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ |
1563 | EXT4_FEATURE_INCOMPAT_RECOVER| \ | 1573 | EXT4_FEATURE_INCOMPAT_RECOVER| \ |
1564 | EXT4_FEATURE_INCOMPAT_META_BG| \ | 1574 | EXT4_FEATURE_INCOMPAT_META_BG| \ |
1565 | EXT4_FEATURE_INCOMPAT_EXTENTS| \ | 1575 | EXT4_FEATURE_INCOMPAT_EXTENTS| \ |
1566 | EXT4_FEATURE_INCOMPAT_64BIT| \ | 1576 | EXT4_FEATURE_INCOMPAT_64BIT| \ |
1567 | EXT4_FEATURE_INCOMPAT_FLEX_BG| \ | 1577 | EXT4_FEATURE_INCOMPAT_FLEX_BG| \ |
1568 | EXT4_FEATURE_INCOMPAT_MMP | \ | 1578 | EXT4_FEATURE_INCOMPAT_MMP | \ |
1569 | EXT4_FEATURE_INCOMPAT_INLINE_DATA) | 1579 | EXT4_FEATURE_INCOMPAT_INLINE_DATA) |
1570 | #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ | 1580 | #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ |
1571 | EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ | 1581 | EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ |
1572 | EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ | 1582 | EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ |
1573 | EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ | 1583 | EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ |
1574 | EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ | 1584 | EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ |
1575 | EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ | 1585 | EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ |
1576 | EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ | 1586 | EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ |
1577 | EXT4_FEATURE_RO_COMPAT_BIGALLOC |\ | 1587 | EXT4_FEATURE_RO_COMPAT_BIGALLOC |\ |
1578 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ | 1588 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ |
1579 | EXT4_FEATURE_RO_COMPAT_QUOTA) | 1589 | EXT4_FEATURE_RO_COMPAT_QUOTA) |
1580 | 1590 | ||
1581 | /* | 1591 | /* |
1582 | * Default values for user and/or group using reserved blocks | 1592 | * Default values for user and/or group using reserved blocks |
1583 | */ | 1593 | */ |
1584 | #define EXT4_DEF_RESUID 0 | 1594 | #define EXT4_DEF_RESUID 0 |
1585 | #define EXT4_DEF_RESGID 0 | 1595 | #define EXT4_DEF_RESGID 0 |
1586 | 1596 | ||
1587 | #define EXT4_DEF_INODE_READAHEAD_BLKS 32 | 1597 | #define EXT4_DEF_INODE_READAHEAD_BLKS 32 |
1588 | 1598 | ||
1589 | /* | 1599 | /* |
1590 | * Default mount options | 1600 | * Default mount options |
1591 | */ | 1601 | */ |
1592 | #define EXT4_DEFM_DEBUG 0x0001 | 1602 | #define EXT4_DEFM_DEBUG 0x0001 |
1593 | #define EXT4_DEFM_BSDGROUPS 0x0002 | 1603 | #define EXT4_DEFM_BSDGROUPS 0x0002 |
1594 | #define EXT4_DEFM_XATTR_USER 0x0004 | 1604 | #define EXT4_DEFM_XATTR_USER 0x0004 |
1595 | #define EXT4_DEFM_ACL 0x0008 | 1605 | #define EXT4_DEFM_ACL 0x0008 |
1596 | #define EXT4_DEFM_UID16 0x0010 | 1606 | #define EXT4_DEFM_UID16 0x0010 |
1597 | #define EXT4_DEFM_JMODE 0x0060 | 1607 | #define EXT4_DEFM_JMODE 0x0060 |
1598 | #define EXT4_DEFM_JMODE_DATA 0x0020 | 1608 | #define EXT4_DEFM_JMODE_DATA 0x0020 |
1599 | #define EXT4_DEFM_JMODE_ORDERED 0x0040 | 1609 | #define EXT4_DEFM_JMODE_ORDERED 0x0040 |
1600 | #define EXT4_DEFM_JMODE_WBACK 0x0060 | 1610 | #define EXT4_DEFM_JMODE_WBACK 0x0060 |
1601 | #define EXT4_DEFM_NOBARRIER 0x0100 | 1611 | #define EXT4_DEFM_NOBARRIER 0x0100 |
1602 | #define EXT4_DEFM_BLOCK_VALIDITY 0x0200 | 1612 | #define EXT4_DEFM_BLOCK_VALIDITY 0x0200 |
1603 | #define EXT4_DEFM_DISCARD 0x0400 | 1613 | #define EXT4_DEFM_DISCARD 0x0400 |
1604 | #define EXT4_DEFM_NODELALLOC 0x0800 | 1614 | #define EXT4_DEFM_NODELALLOC 0x0800 |
1605 | 1615 | ||
1606 | /* | 1616 | /* |
1607 | * Default journal batch times | 1617 | * Default journal batch times |
1608 | */ | 1618 | */ |
1609 | #define EXT4_DEF_MIN_BATCH_TIME 0 | 1619 | #define EXT4_DEF_MIN_BATCH_TIME 0 |
1610 | #define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ | 1620 | #define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ |
1611 | 1621 | ||
1612 | /* | 1622 | /* |
1613 | * Minimum number of groups in a flexgroup before we separate out | 1623 | * Minimum number of groups in a flexgroup before we separate out |
1614 | * directories into the first block group of a flexgroup | 1624 | * directories into the first block group of a flexgroup |
1615 | */ | 1625 | */ |
1616 | #define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4 | 1626 | #define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4 |
1617 | 1627 | ||
1618 | /* | 1628 | /* |
1619 | * Structure of a directory entry | 1629 | * Structure of a directory entry |
1620 | */ | 1630 | */ |
1621 | #define EXT4_NAME_LEN 255 | 1631 | #define EXT4_NAME_LEN 255 |
1622 | 1632 | ||
1623 | struct ext4_dir_entry { | 1633 | struct ext4_dir_entry { |
1624 | __le32 inode; /* Inode number */ | 1634 | __le32 inode; /* Inode number */ |
1625 | __le16 rec_len; /* Directory entry length */ | 1635 | __le16 rec_len; /* Directory entry length */ |
1626 | __le16 name_len; /* Name length */ | 1636 | __le16 name_len; /* Name length */ |
1627 | char name[EXT4_NAME_LEN]; /* File name */ | 1637 | char name[EXT4_NAME_LEN]; /* File name */ |
1628 | }; | 1638 | }; |
1629 | 1639 | ||
1630 | /* | 1640 | /* |
1631 | * The new version of the directory entry. Since EXT4 structures are | 1641 | * The new version of the directory entry. Since EXT4 structures are |
1632 | * stored in intel byte order, and the name_len field could never be | 1642 | * stored in intel byte order, and the name_len field could never be |
1633 | * bigger than 255 chars, it's safe to reclaim the extra byte for the | 1643 | * bigger than 255 chars, it's safe to reclaim the extra byte for the |
1634 | * file_type field. | 1644 | * file_type field. |
1635 | */ | 1645 | */ |
1636 | struct ext4_dir_entry_2 { | 1646 | struct ext4_dir_entry_2 { |
1637 | __le32 inode; /* Inode number */ | 1647 | __le32 inode; /* Inode number */ |
1638 | __le16 rec_len; /* Directory entry length */ | 1648 | __le16 rec_len; /* Directory entry length */ |
1639 | __u8 name_len; /* Name length */ | 1649 | __u8 name_len; /* Name length */ |
1640 | __u8 file_type; | 1650 | __u8 file_type; |
1641 | char name[EXT4_NAME_LEN]; /* File name */ | 1651 | char name[EXT4_NAME_LEN]; /* File name */ |
1642 | }; | 1652 | }; |
1643 | 1653 | ||
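/*
 * Layout check: shrinking name_len from __le16 to __u8 frees exactly
 * one byte for file_type, so both entry versions keep the same 8-byte
 * fixed header ahead of name[]:
 *
 *	offsetof(struct ext4_dir_entry_2, name_len)  == 6
 *	offsetof(struct ext4_dir_entry_2, file_type) == 7
 *	offsetof(struct ext4_dir_entry_2, name)      == 8
 */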
1644 | /* | 1654 | /* |
1645 | * This is a bogus directory entry at the end of each leaf block that | 1655 | * This is a bogus directory entry at the end of each leaf block that |
1646 | * records checksums. | 1656 | * records checksums. |
1647 | */ | 1657 | */ |
1648 | struct ext4_dir_entry_tail { | 1658 | struct ext4_dir_entry_tail { |
1649 | __le32 det_reserved_zero1; /* Pretend to be unused */ | 1659 | __le32 det_reserved_zero1; /* Pretend to be unused */ |
1650 | __le16 det_rec_len; /* 12 */ | 1660 | __le16 det_rec_len; /* 12 */ |
1651 | __u8 det_reserved_zero2; /* Zero name length */ | 1661 | __u8 det_reserved_zero2; /* Zero name length */ |
1652 | __u8 det_reserved_ft; /* 0xDE, fake file type */ | 1662 | __u8 det_reserved_ft; /* 0xDE, fake file type */ |
1653 | __le32 det_checksum; /* crc32c(uuid+inum+dirblock) */ | 1663 | __le32 det_checksum; /* crc32c(uuid+inum+dirblock) */ |
1654 | }; | 1664 | }; |
1655 | 1665 | ||
1656 | #define EXT4_DIRENT_TAIL(block, blocksize) \ | 1666 | #define EXT4_DIRENT_TAIL(block, blocksize) \ |
1657 | ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ | 1667 | ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ |
1658 | ((blocksize) - \ | 1668 | ((blocksize) - \ |
1659 | sizeof(struct ext4_dir_entry_tail)))) | 1669 | sizeof(struct ext4_dir_entry_tail)))) |
1660 | 1670 | ||
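/*
 * Usage sketch (modelled on initialize_dirent_tail() in namei.c): the
 * tail occupies the last 12 bytes of the block and masquerades as an
 * empty dirent so unaware tools skip it:
 *
 *	struct ext4_dir_entry_tail *t =
 *		EXT4_DIRENT_TAIL(bh->b_data, blocksize);
 *	memset(t, 0, sizeof(struct ext4_dir_entry_tail));
 *	t->det_rec_len = ext4_rec_len_to_disk(
 *			sizeof(struct ext4_dir_entry_tail), blocksize);
 *	t->det_reserved_ft = EXT4_FT_DIR_CSUM;
 */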
1661 | /* | 1671 | /* |
1662 | * Ext4 directory file types. Only the low 3 bits are used. The | 1672 | * Ext4 directory file types. Only the low 3 bits are used. The |
1663 | * other bits are reserved for now. | 1673 | * other bits are reserved for now. |
1664 | */ | 1674 | */ |
1665 | #define EXT4_FT_UNKNOWN 0 | 1675 | #define EXT4_FT_UNKNOWN 0 |
1666 | #define EXT4_FT_REG_FILE 1 | 1676 | #define EXT4_FT_REG_FILE 1 |
1667 | #define EXT4_FT_DIR 2 | 1677 | #define EXT4_FT_DIR 2 |
1668 | #define EXT4_FT_CHRDEV 3 | 1678 | #define EXT4_FT_CHRDEV 3 |
1669 | #define EXT4_FT_BLKDEV 4 | 1679 | #define EXT4_FT_BLKDEV 4 |
1670 | #define EXT4_FT_FIFO 5 | 1680 | #define EXT4_FT_FIFO 5 |
1671 | #define EXT4_FT_SOCK 6 | 1681 | #define EXT4_FT_SOCK 6 |
1672 | #define EXT4_FT_SYMLINK 7 | 1682 | #define EXT4_FT_SYMLINK 7 |
1673 | 1683 | ||
1674 | #define EXT4_FT_MAX 8 | 1684 | #define EXT4_FT_MAX 8 |
1675 | 1685 | ||
1676 | #define EXT4_FT_DIR_CSUM 0xDE | 1686 | #define EXT4_FT_DIR_CSUM 0xDE |
1677 | 1687 | ||
1678 | /* | 1688 | /* |
1679 | * EXT4_DIR_PAD defines the directory entries boundaries | 1689 | * EXT4_DIR_PAD defines the directory entries boundaries |
1680 | * | 1690 | * |
1681 | * NOTE: It must be a multiple of 4 | 1691 | * NOTE: It must be a multiple of 4 |
1682 | */ | 1692 | */ |
1683 | #define EXT4_DIR_PAD 4 | 1693 | #define EXT4_DIR_PAD 4 |
1684 | #define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) | 1694 | #define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) |
1685 | #define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ | 1695 | #define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ |
1686 | ~EXT4_DIR_ROUND) | 1696 | ~EXT4_DIR_ROUND) |
1687 | #define EXT4_MAX_REC_LEN ((1<<16)-1) | 1697 | #define EXT4_MAX_REC_LEN ((1<<16)-1) |
1688 | 1698 | ||
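/*
 * Worked example: a dirent needs 8 fixed bytes (inode + rec_len +
 * name_len + file_type) plus the name, rounded up to EXT4_DIR_PAD:
 *
 *	EXT4_DIR_REC_LEN(1)   == (1 + 8 + 3) & ~3   == 12
 *	EXT4_DIR_REC_LEN(5)   == (5 + 8 + 3) & ~3   == 16
 *	EXT4_DIR_REC_LEN(255) == (255 + 8 + 3) & ~3 == 264
 */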
1689 | /* | 1699 | /* |
1690 | * If we ever get support for fs block sizes > page_size, we'll need | 1700 | * If we ever get support for fs block sizes > page_size, we'll need |
1691 | * to remove the #if statements in the next two functions... | 1701 | * to remove the #if statements in the next two functions... |
1692 | */ | 1702 | */ |
1693 | static inline unsigned int | 1703 | static inline unsigned int |
1694 | ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) | 1704 | ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) |
1695 | { | 1705 | { |
1696 | unsigned len = le16_to_cpu(dlen); | 1706 | unsigned len = le16_to_cpu(dlen); |
1697 | 1707 | ||
1698 | #if (PAGE_CACHE_SIZE >= 65536) | 1708 | #if (PAGE_CACHE_SIZE >= 65536) |
1699 | if (len == EXT4_MAX_REC_LEN || len == 0) | 1709 | if (len == EXT4_MAX_REC_LEN || len == 0) |
1700 | return blocksize; | 1710 | return blocksize; |
1701 | return (len & 65532) | ((len & 3) << 16); | 1711 | return (len & 65532) | ((len & 3) << 16); |
1702 | #else | 1712 | #else |
1703 | return len; | 1713 | return len; |
1704 | #endif | 1714 | #endif |
1705 | } | 1715 | } |
1706 | 1716 | ||
1707 | static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) | 1717 | static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) |
1708 | { | 1718 | { |
1709 | if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)) | 1719 | if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)) |
1710 | BUG(); | 1720 | BUG(); |
1711 | #if (PAGE_CACHE_SIZE >= 65536) | 1721 | #if (PAGE_CACHE_SIZE >= 65536) |
1712 | if (len < 65536) | 1722 | if (len < 65536) |
1713 | return cpu_to_le16(len); | 1723 | return cpu_to_le16(len); |
1714 | if (len == blocksize) { | 1724 | if (len == blocksize) { |
1715 | if (blocksize == 65536) | 1725 | if (blocksize == 65536) |
1716 | return cpu_to_le16(EXT4_MAX_REC_LEN); | 1726 | return cpu_to_le16(EXT4_MAX_REC_LEN); |
1717 | else | 1727 | else |
1718 | return cpu_to_le16(0); | 1728 | return cpu_to_le16(0); |
1719 | } | 1729 | } |
1720 | return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); | 1730 | return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); |
1721 | #else | 1731 | #else |
1722 | return cpu_to_le16(len); | 1732 | return cpu_to_le16(len); |
1723 | #endif | 1733 | #endif |
1724 | } | 1734 | } |
1725 | 1735 | ||
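/*
 * Round-trip example on a PAGE_CACHE_SIZE >= 65536 build with 128KiB
 * blocks: rec_len is always a multiple of 4, so the low two bits are
 * free to carry bits 16-17 of the real length:
 *
 *	ext4_rec_len_to_disk(70000, 131072)  -> 4465
 *		(70000 & 65532) | ((70000 >> 16) & 3) == 4464 | 1
 *	ext4_rec_len_from_disk(4465, 131072) -> 70000
 *		(4465 & 65532) | ((4465 & 3) << 16)  == 4464 | 65536
 */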
1726 | /* | 1736 | /* |
1727 | * Hash Tree Directory indexing | 1737 | * Hash Tree Directory indexing |
1728 | * (c) Daniel Phillips, 2001 | 1738 | * (c) Daniel Phillips, 2001 |
1729 | */ | 1739 | */ |
1730 | 1740 | ||
1731 | #define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \ | 1741 | #define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \ |
1732 | EXT4_FEATURE_COMPAT_DIR_INDEX) && \ | 1742 | EXT4_FEATURE_COMPAT_DIR_INDEX) && \ |
1733 | ext4_test_inode_flag((dir), EXT4_INODE_INDEX)) | 1743 | ext4_test_inode_flag((dir), EXT4_INODE_INDEX)) |
1734 | #define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX) | 1744 | #define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX) |
1735 | #define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) | 1745 | #define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) |
1736 | 1746 | ||
1737 | /* Legal values for the dx_root hash_version field: */ | 1747 | /* Legal values for the dx_root hash_version field: */ |
1738 | 1748 | ||
1739 | #define DX_HASH_LEGACY 0 | 1749 | #define DX_HASH_LEGACY 0 |
1740 | #define DX_HASH_HALF_MD4 1 | 1750 | #define DX_HASH_HALF_MD4 1 |
1741 | #define DX_HASH_TEA 2 | 1751 | #define DX_HASH_TEA 2 |
1742 | #define DX_HASH_LEGACY_UNSIGNED 3 | 1752 | #define DX_HASH_LEGACY_UNSIGNED 3 |
1743 | #define DX_HASH_HALF_MD4_UNSIGNED 4 | 1753 | #define DX_HASH_HALF_MD4_UNSIGNED 4 |
1744 | #define DX_HASH_TEA_UNSIGNED 5 | 1754 | #define DX_HASH_TEA_UNSIGNED 5 |
1745 | 1755 | ||
1746 | static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, | 1756 | static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, |
1747 | const void *address, unsigned int length) | 1757 | const void *address, unsigned int length) |
1748 | { | 1758 | { |
1749 | struct { | 1759 | struct { |
1750 | struct shash_desc shash; | 1760 | struct shash_desc shash; |
1751 | char ctx[4]; | 1761 | char ctx[4]; |
1752 | } desc; | 1762 | } desc; |
1753 | int err; | 1763 | int err; |
1754 | 1764 | ||
1755 | BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver)!=sizeof(desc.ctx)); | 1765 | BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver)!=sizeof(desc.ctx)); |
1756 | 1766 | ||
1757 | desc.shash.tfm = sbi->s_chksum_driver; | 1767 | desc.shash.tfm = sbi->s_chksum_driver; |
1758 | desc.shash.flags = 0; | 1768 | desc.shash.flags = 0; |
1759 | *(u32 *)desc.ctx = crc; | 1769 | *(u32 *)desc.ctx = crc; |
1760 | 1770 | ||
1761 | err = crypto_shash_update(&desc.shash, address, length); | 1771 | err = crypto_shash_update(&desc.shash, address, length); |
1762 | BUG_ON(err); | 1772 | BUG_ON(err); |
1763 | 1773 | ||
1764 | return *(u32 *)desc.ctx; | 1774 | return *(u32 *)desc.ctx; |
1765 | } | 1775 | } |
1766 | 1776 | ||
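/*
 * Usage sketch: callers seed the crc32c with the precomputed UUID
 * checksum so blocks from different filesystems never verify against
 * each other, roughly:
 *
 *	csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)buf, len);
 */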
1767 | #ifdef __KERNEL__ | 1777 | #ifdef __KERNEL__ |
1768 | 1778 | ||
1769 | /* hash info structure used by the directory hash */ | 1779 | /* hash info structure used by the directory hash */ |
1770 | struct dx_hash_info | 1780 | struct dx_hash_info |
1771 | { | 1781 | { |
1772 | u32 hash; | 1782 | u32 hash; |
1773 | u32 minor_hash; | 1783 | u32 minor_hash; |
1774 | int hash_version; | 1784 | int hash_version; |
1775 | u32 *seed; | 1785 | u32 *seed; |
1776 | }; | 1786 | }; |
1777 | 1787 | ||
1778 | 1788 | ||
1779 | /* 32 and 64 bit signed EOF for dx directories */ | 1789 | /* 32 and 64 bit signed EOF for dx directories */ |
1780 | #define EXT4_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1) | 1790 | #define EXT4_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1) |
1781 | #define EXT4_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1) | 1791 | #define EXT4_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1) |
1782 | 1792 | ||
1783 | 1793 | ||
1784 | /* | 1794 | /* |
1785 | * Control parameters used by ext4_htree_next_block | 1795 | * Control parameters used by ext4_htree_next_block |
1786 | */ | 1796 | */ |
1787 | #define HASH_NB_ALWAYS 1 | 1797 | #define HASH_NB_ALWAYS 1 |
1788 | 1798 | ||
1789 | 1799 | ||
1790 | /* | 1800 | /* |
1791 | * Describe an inode's exact location on disk and in memory | 1801 | * Describe an inode's exact location on disk and in memory |
1792 | */ | 1802 | */ |
1793 | struct ext4_iloc | 1803 | struct ext4_iloc |
1794 | { | 1804 | { |
1795 | struct buffer_head *bh; | 1805 | struct buffer_head *bh; |
1796 | unsigned long offset; | 1806 | unsigned long offset; |
1797 | ext4_group_t block_group; | 1807 | ext4_group_t block_group; |
1798 | }; | 1808 | }; |
1799 | 1809 | ||
1800 | static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc) | 1810 | static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc) |
1801 | { | 1811 | { |
1802 | return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset); | 1812 | return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset); |
1803 | } | 1813 | } |
1804 | 1814 | ||
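/*
 * Usage sketch (simplified from inode.c): an iloc is filled in by
 * ext4_get_inode_loc() and then mapped straight onto the buffer:
 *
 *	struct ext4_iloc iloc;
 *	struct ext4_inode *raw;
 *
 *	err = ext4_get_inode_loc(inode, &iloc);
 *	if (!err)
 *		raw = ext4_raw_inode(&iloc);	-- points into iloc.bh
 */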
1805 | /* | 1815 | /* |
1806 | * This structure is stuffed into the struct file's private_data field | 1816 | * This structure is stuffed into the struct file's private_data field |
1807 | * for directories. It is where we put information so that we can do | 1817 | * for directories. It is where we put information so that we can do |
1808 | * readdir operations in hash tree order. | 1818 | * readdir operations in hash tree order. |
1809 | */ | 1819 | */ |
1810 | struct dir_private_info { | 1820 | struct dir_private_info { |
1811 | struct rb_root root; | 1821 | struct rb_root root; |
1812 | struct rb_node *curr_node; | 1822 | struct rb_node *curr_node; |
1813 | struct fname *extra_fname; | 1823 | struct fname *extra_fname; |
1814 | loff_t last_pos; | 1824 | loff_t last_pos; |
1815 | __u32 curr_hash; | 1825 | __u32 curr_hash; |
1816 | __u32 curr_minor_hash; | 1826 | __u32 curr_minor_hash; |
1817 | __u32 next_hash; | 1827 | __u32 next_hash; |
1818 | }; | 1828 | }; |
1819 | 1829 | ||
1820 | /* calculate the first block number of the group */ | 1830 | /* calculate the first block number of the group */ |
1821 | static inline ext4_fsblk_t | 1831 | static inline ext4_fsblk_t |
1822 | ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) | 1832 | ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) |
1823 | { | 1833 | { |
1824 | return group_no * (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) + | 1834 | return group_no * (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) + |
1825 | le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); | 1835 | le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); |
1826 | } | 1836 | } |
1827 | 1837 | ||
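/*
 * Worked example: with 4KiB blocks (32768 blocks per group,
 * s_first_data_block == 0) group 3 starts at block 3 * 32768 = 98304;
 * with 1KiB blocks s_first_data_block is 1, so every group shifts up
 * by one block and group 0 starts at block 1.
 */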
1828 | /* | 1838 | /* |
1829 | * Special error return code only used by dx_probe() and its callers. | 1839 | * Special error return code only used by dx_probe() and its callers. |
1830 | */ | 1840 | */ |
1831 | #define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1)) | 1841 | #define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1)) |
1832 | 1842 | ||
1833 | /* | 1843 | /* |
1834 | * Timeout and state flag for lazy initialization inode thread. | 1844 | * Timeout and state flag for lazy initialization inode thread. |
1835 | */ | 1845 | */ |
1836 | #define EXT4_DEF_LI_WAIT_MULT 10 | 1846 | #define EXT4_DEF_LI_WAIT_MULT 10 |
1837 | #define EXT4_DEF_LI_MAX_START_DELAY 5 | 1847 | #define EXT4_DEF_LI_MAX_START_DELAY 5 |
1838 | #define EXT4_LAZYINIT_QUIT 0x0001 | 1848 | #define EXT4_LAZYINIT_QUIT 0x0001 |
1839 | #define EXT4_LAZYINIT_RUNNING 0x0002 | 1849 | #define EXT4_LAZYINIT_RUNNING 0x0002 |
1840 | 1850 | ||
1841 | /* | 1851 | /* |
1842 | * Lazy inode table initialization info | 1852 | * Lazy inode table initialization info |
1843 | */ | 1853 | */ |
1844 | struct ext4_lazy_init { | 1854 | struct ext4_lazy_init { |
1845 | unsigned long li_state; | 1855 | unsigned long li_state; |
1846 | struct list_head li_request_list; | 1856 | struct list_head li_request_list; |
1847 | struct mutex li_list_mtx; | 1857 | struct mutex li_list_mtx; |
1848 | }; | 1858 | }; |
1849 | 1859 | ||
1850 | struct ext4_li_request { | 1860 | struct ext4_li_request { |
1851 | struct super_block *lr_super; | 1861 | struct super_block *lr_super; |
1852 | struct ext4_sb_info *lr_sbi; | 1862 | struct ext4_sb_info *lr_sbi; |
1853 | ext4_group_t lr_next_group; | 1863 | ext4_group_t lr_next_group; |
1854 | struct list_head lr_request; | 1864 | struct list_head lr_request; |
1855 | unsigned long lr_next_sched; | 1865 | unsigned long lr_next_sched; |
1856 | unsigned long lr_timeout; | 1866 | unsigned long lr_timeout; |
1857 | }; | 1867 | }; |
1858 | 1868 | ||
1859 | struct ext4_features { | 1869 | struct ext4_features { |
1860 | struct kobject f_kobj; | 1870 | struct kobject f_kobj; |
1861 | struct completion f_kobj_unregister; | 1871 | struct completion f_kobj_unregister; |
1862 | }; | 1872 | }; |
1863 | 1873 | ||
1864 | /* | 1874 | /* |
1865 | * This structure will be used for multiple mount protection. It will be | 1875 | * This structure will be used for multiple mount protection. It will be |
1866 | * written into the block number saved in the s_mmp_block field in the | 1876 | * written into the block number saved in the s_mmp_block field in the |
1867 | * superblock. Programs that check MMP should assume that if | 1877 | * superblock. Programs that check MMP should assume that if |
1868 | * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe | 1878 | * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe |
1869 | * to use the filesystem, regardless of how old the timestamp is. | 1879 | * to use the filesystem, regardless of how old the timestamp is. |
1870 | */ | 1880 | */ |
1871 | #define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ | 1881 | #define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ |
1872 | #define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ | 1882 | #define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ |
1873 | #define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ | 1883 | #define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ |
1874 | #define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ | 1884 | #define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ |
1875 | 1885 | ||
1876 | struct mmp_struct { | 1886 | struct mmp_struct { |
1877 | __le32 mmp_magic; /* Magic number for MMP */ | 1887 | __le32 mmp_magic; /* Magic number for MMP */ |
1878 | __le32 mmp_seq; /* Sequence no. updated periodically */ | 1888 | __le32 mmp_seq; /* Sequence no. updated periodically */ |
1879 | 1889 | ||
1880 | /* | 1890 | /* |
1881 | * mmp_time, mmp_nodename & mmp_bdevname are only used for information | 1891 | * mmp_time, mmp_nodename & mmp_bdevname are only used for information |
1882 | * purposes and do not affect the correctness of the algorithm | 1892 | * purposes and do not affect the correctness of the algorithm |
1883 | */ | 1893 | */ |
1884 | __le64 mmp_time; /* Time last updated */ | 1894 | __le64 mmp_time; /* Time last updated */ |
1885 | char mmp_nodename[64]; /* Node which last updated MMP block */ | 1895 | char mmp_nodename[64]; /* Node which last updated MMP block */ |
1886 | char mmp_bdevname[32]; /* Bdev which last updated MMP block */ | 1896 | char mmp_bdevname[32]; /* Bdev which last updated MMP block */ |
1887 | 1897 | ||
1888 | /* | 1898 | /* |
1889 | * mmp_check_interval is used to verify if the MMP block has been | 1899 | * mmp_check_interval is used to verify if the MMP block has been |
1890 | * updated on the block device. The value is updated based on the | 1900 | * updated on the block device. The value is updated based on the |
1891 | * maximum time to write the MMP block during an update cycle. | 1901 | * maximum time to write the MMP block during an update cycle. |
1892 | */ | 1902 | */ |
1893 | __le16 mmp_check_interval; | 1903 | __le16 mmp_check_interval; |
1894 | 1904 | ||
1895 | __le16 mmp_pad1; | 1905 | __le16 mmp_pad1; |
1896 | __le32 mmp_pad2[226]; | 1906 | __le32 mmp_pad2[226]; |
1897 | __le32 mmp_checksum; /* crc32c(uuid+mmp_block) */ | 1907 | __le32 mmp_checksum; /* crc32c(uuid+mmp_block) */ |
1898 | }; | 1908 | }; |
1899 | 1909 | ||
1900 | /* arguments passed to the mmp thread */ | 1910 | /* arguments passed to the mmp thread */ |
1901 | struct mmpd_data { | 1911 | struct mmpd_data { |
1902 | struct buffer_head *bh; /* bh from initial read_mmp_block() */ | 1912 | struct buffer_head *bh; /* bh from initial read_mmp_block() */ |
1903 | struct super_block *sb; /* super block of the fs */ | 1913 | struct super_block *sb; /* super block of the fs */ |
1904 | }; | 1914 | }; |
1905 | 1915 | ||
1906 | /* | 1916 | /* |
1907 | * Check interval multiplier | 1917 | * Check interval multiplier |
1908 | * The MMP block is written every update interval and initially checked every | 1918 | * The MMP block is written every update interval and initially checked every |
1909 | * update interval x the multiplier (the value is then adapted based on the | 1919 | * update interval x the multiplier (the value is then adapted based on the |
1910 | * write latency). The reason is that writes can be delayed under load and we | 1920 | * write latency). The reason is that writes can be delayed under load and we |
1911 | * don't want readers to incorrectly assume that the filesystem is no longer | 1921 | * don't want readers to incorrectly assume that the filesystem is no longer |
1912 | * in use. | 1922 | * in use. |
1913 | */ | 1923 | */ |
1914 | #define EXT4_MMP_CHECK_MULT 2UL | 1924 | #define EXT4_MMP_CHECK_MULT 2UL |
1915 | 1925 | ||
1916 | /* | 1926 | /* |
1917 | * Minimum interval for MMP checking in seconds. | 1927 | * Minimum interval for MMP checking in seconds. |
1918 | */ | 1928 | */ |
1919 | #define EXT4_MMP_MIN_CHECK_INTERVAL 5UL | 1929 | #define EXT4_MMP_MIN_CHECK_INTERVAL 5UL |
1920 | 1930 | ||
1921 | /* | 1931 | /* |
1922 | * Maximum interval for MMP checking in seconds. | 1932 | * Maximum interval for MMP checking in seconds. |
1923 | */ | 1933 | */ |
1924 | #define EXT4_MMP_MAX_CHECK_INTERVAL 300UL | 1934 | #define EXT4_MMP_MAX_CHECK_INTERVAL 300UL |
1925 | 1935 | ||
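/*
 * Sketch of how the bounds and multiplier above combine (simplified;
 * kmmpd in mmp.c also adapts the interval to observed write latency):
 *
 *	check = mmp_update_interval * EXT4_MMP_CHECK_MULT;
 *	check = max(check, EXT4_MMP_MIN_CHECK_INTERVAL);
 *	check = min(check, EXT4_MMP_MAX_CHECK_INTERVAL);
 */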
1926 | /* | 1936 | /* |
1927 | * Function prototypes | 1937 | * Function prototypes |
1928 | */ | 1938 | */ |
1929 | 1939 | ||
1930 | /* | 1940 | /* |
1931 | * Ok, these declarations are also in <linux/kernel.h> but none of the | 1941 | * Ok, these declarations are also in <linux/kernel.h> but none of the |
1932 | * ext4 source programs needs to include it so they are duplicated here. | 1942 | * ext4 source programs needs to include it so they are duplicated here. |
1933 | */ | 1943 | */ |
1934 | # define NORET_TYPE /**/ | 1944 | # define NORET_TYPE /**/ |
1935 | # define ATTRIB_NORET __attribute__((noreturn)) | 1945 | # define ATTRIB_NORET __attribute__((noreturn)) |
1936 | # define NORET_AND noreturn, | 1946 | # define NORET_AND noreturn, |
1937 | 1947 | ||
1938 | /* bitmap.c */ | 1948 | /* bitmap.c */ |
1939 | extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); | 1949 | extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); |
1940 | void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, | 1950 | void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, |
1941 | struct ext4_group_desc *gdp, | 1951 | struct ext4_group_desc *gdp, |
1942 | struct buffer_head *bh, int sz); | 1952 | struct buffer_head *bh, int sz); |
1943 | int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, | 1953 | int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, |
1944 | struct ext4_group_desc *gdp, | 1954 | struct ext4_group_desc *gdp, |
1945 | struct buffer_head *bh, int sz); | 1955 | struct buffer_head *bh, int sz); |
1946 | void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group, | 1956 | void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group, |
1947 | struct ext4_group_desc *gdp, | 1957 | struct ext4_group_desc *gdp, |
1948 | struct buffer_head *bh); | 1958 | struct buffer_head *bh); |
1949 | int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, | 1959 | int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, |
1950 | struct ext4_group_desc *gdp, | 1960 | struct ext4_group_desc *gdp, |
1951 | struct buffer_head *bh); | 1961 | struct buffer_head *bh); |
1952 | 1962 | ||
1953 | /* balloc.c */ | 1963 | /* balloc.c */ |
1954 | extern void ext4_get_group_no_and_offset(struct super_block *sb, | 1964 | extern void ext4_get_group_no_and_offset(struct super_block *sb, |
1955 | ext4_fsblk_t blocknr, | 1965 | ext4_fsblk_t blocknr, |
1956 | ext4_group_t *blockgrpp, | 1966 | ext4_group_t *blockgrpp, |
1957 | ext4_grpblk_t *offsetp); | 1967 | ext4_grpblk_t *offsetp); |
1958 | extern ext4_group_t ext4_get_group_number(struct super_block *sb, | 1968 | extern ext4_group_t ext4_get_group_number(struct super_block *sb, |
1959 | ext4_fsblk_t block); | 1969 | ext4_fsblk_t block); |
1960 | 1970 | ||
1961 | extern unsigned int ext4_block_group(struct super_block *sb, | 1971 | extern unsigned int ext4_block_group(struct super_block *sb, |
1962 | ext4_fsblk_t blocknr); | 1972 | ext4_fsblk_t blocknr); |
1963 | extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, | 1973 | extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, |
1964 | ext4_fsblk_t blocknr); | 1974 | ext4_fsblk_t blocknr); |
1965 | extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); | 1975 | extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); |
1966 | extern unsigned long ext4_bg_num_gdb(struct super_block *sb, | 1976 | extern unsigned long ext4_bg_num_gdb(struct super_block *sb, |
1967 | ext4_group_t group); | 1977 | ext4_group_t group); |
1968 | extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, | 1978 | extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, |
1969 | ext4_fsblk_t goal, | 1979 | ext4_fsblk_t goal, |
1970 | unsigned int flags, | 1980 | unsigned int flags, |
1971 | unsigned long *count, | 1981 | unsigned long *count, |
1972 | int *errp); | 1982 | int *errp); |
1973 | extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi, | 1983 | extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi, |
1974 | s64 nclusters, unsigned int flags); | 1984 | s64 nclusters, unsigned int flags); |
1975 | extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *); | 1985 | extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *); |
1976 | extern void ext4_check_blocks_bitmap(struct super_block *); | 1986 | extern void ext4_check_blocks_bitmap(struct super_block *); |
1977 | extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, | 1987 | extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, |
1978 | ext4_group_t block_group, | 1988 | ext4_group_t block_group, |
1979 | struct buffer_head ** bh); | 1989 | struct buffer_head ** bh); |
1980 | extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); | 1990 | extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); |
1981 | 1991 | ||
1982 | extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb, | 1992 | extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb, |
1983 | ext4_group_t block_group); | 1993 | ext4_group_t block_group); |
1984 | extern int ext4_wait_block_bitmap(struct super_block *sb, | 1994 | extern int ext4_wait_block_bitmap(struct super_block *sb, |
1985 | ext4_group_t block_group, | 1995 | ext4_group_t block_group, |
1986 | struct buffer_head *bh); | 1996 | struct buffer_head *bh); |
1987 | extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, | 1997 | extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, |
1988 | ext4_group_t block_group); | 1998 | ext4_group_t block_group); |
1989 | extern unsigned ext4_free_clusters_after_init(struct super_block *sb, | 1999 | extern unsigned ext4_free_clusters_after_init(struct super_block *sb, |
1990 | ext4_group_t block_group, | 2000 | ext4_group_t block_group, |
1991 | struct ext4_group_desc *gdp); | 2001 | struct ext4_group_desc *gdp); |
1992 | ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); | 2002 | ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); |
1993 | 2003 | ||
1994 | /* dir.c */ | 2004 | /* dir.c */ |
1995 | extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, | 2005 | extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, |
1996 | struct file *, | 2006 | struct file *, |
1997 | struct ext4_dir_entry_2 *, | 2007 | struct ext4_dir_entry_2 *, |
1998 | struct buffer_head *, char *, int, | 2008 | struct buffer_head *, char *, int, |
1999 | unsigned int); | 2009 | unsigned int); |
2000 | #define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \ | 2010 | #define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \ |
2001 | unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ | 2011 | unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ |
2002 | (de), (bh), (buf), (size), (offset))) | 2012 | (de), (bh), (buf), (size), (offset))) |
2003 | extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, | 2013 | extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, |
2004 | __u32 minor_hash, | 2014 | __u32 minor_hash, |
2005 | struct ext4_dir_entry_2 *dirent); | 2015 | struct ext4_dir_entry_2 *dirent); |
2006 | extern void ext4_htree_free_dir_info(struct dir_private_info *p); | 2016 | extern void ext4_htree_free_dir_info(struct dir_private_info *p); |
2007 | extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, | 2017 | extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, |
2008 | struct buffer_head *bh, | 2018 | struct buffer_head *bh, |
2009 | void *buf, int buf_size, | 2019 | void *buf, int buf_size, |
2010 | const char *name, int namelen, | 2020 | const char *name, int namelen, |
2011 | struct ext4_dir_entry_2 **dest_de); | 2021 | struct ext4_dir_entry_2 **dest_de); |
2012 | void ext4_insert_dentry(struct inode *inode, | 2022 | void ext4_insert_dentry(struct inode *inode, |
2013 | struct ext4_dir_entry_2 *de, | 2023 | struct ext4_dir_entry_2 *de, |
2014 | int buf_size, | 2024 | int buf_size, |
2015 | const char *name, int namelen); | 2025 | const char *name, int namelen); |
2016 | static inline void ext4_update_dx_flag(struct inode *inode) | 2026 | static inline void ext4_update_dx_flag(struct inode *inode) |
2017 | { | 2027 | { |
2018 | if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, | 2028 | if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, |
2019 | EXT4_FEATURE_COMPAT_DIR_INDEX)) | 2029 | EXT4_FEATURE_COMPAT_DIR_INDEX)) |
2020 | ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); | 2030 | ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); |
2021 | } | 2031 | } |
2022 | static unsigned char ext4_filetype_table[] = { | 2032 | static unsigned char ext4_filetype_table[] = { |
2023 | DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK | 2033 | DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK |
2024 | }; | 2034 | }; |
2025 | 2035 | ||
2026 | static inline unsigned char get_dtype(struct super_block *sb, int filetype) | 2036 | static inline unsigned char get_dtype(struct super_block *sb, int filetype) |
2027 | { | 2037 | { |
2028 | if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) || | 2038 | if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) || |
2029 | (filetype >= EXT4_FT_MAX)) | 2039 | (filetype >= EXT4_FT_MAX)) |
2030 | return DT_UNKNOWN; | 2040 | return DT_UNKNOWN; |
2031 | 2041 | ||
2032 | return ext4_filetype_table[filetype]; | 2042 | return ext4_filetype_table[filetype]; |
2033 | } | 2043 | } |
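get_dtype() above is a bounds-checked table lookup: unless the filesystem was created with the filetype feature, and the on-disk value is in range, it reports DT_UNKNOWN rather than trusting the directory entry. A minimal userspace sketch of the same pattern, using glibc's DT_* constants and a stand-in FT_MAX (the real limit is EXT4_FT_MAX in this header):

    #define _DEFAULT_SOURCE   /* for DT_* in <dirent.h> */
    #include <dirent.h>
    #include <stdio.h>

    #define FT_MAX 8          /* stand-in for EXT4_FT_MAX */

    static const unsigned char filetype_table[FT_MAX] = {
        DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
    };

    /* has_filetype models the INCOMPAT_FILETYPE feature test. */
    static unsigned char dtype(int has_filetype, int filetype)
    {
        if (!has_filetype || filetype < 0 || filetype >= FT_MAX)
            return DT_UNKNOWN;
        return filetype_table[filetype];
    }

    int main(void)
    {
        printf("%d %d\n", dtype(1, 2), dtype(0, 2));  /* 4 (DT_DIR), then 0 */
        return 0;
    }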
2034 | extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, | 2044 | extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, |
2035 | void *buf, int buf_size); | 2045 | void *buf, int buf_size); |
2036 | 2046 | ||
2037 | /* fsync.c */ | 2047 | /* fsync.c */ |
2038 | extern int ext4_sync_file(struct file *, loff_t, loff_t, int); | 2048 | extern int ext4_sync_file(struct file *, loff_t, loff_t, int); |
2039 | 2049 | ||
2040 | /* hash.c */ | 2050 | /* hash.c */ |
2041 | extern int ext4fs_dirhash(const char *name, int len, struct | 2051 | extern int ext4fs_dirhash(const char *name, int len, struct |
2042 | dx_hash_info *hinfo); | 2052 | dx_hash_info *hinfo); |
2043 | 2053 | ||
2044 | /* ialloc.c */ | 2054 | /* ialloc.c */ |
2045 | extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t, | 2055 | extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t, |
2046 | const struct qstr *qstr, __u32 goal, | 2056 | const struct qstr *qstr, __u32 goal, |
2047 | uid_t *owner, int handle_type, | 2057 | uid_t *owner, int handle_type, |
2048 | unsigned int line_no, int nblocks); | 2058 | unsigned int line_no, int nblocks); |
2049 | 2059 | ||
2050 | #define ext4_new_inode(handle, dir, mode, qstr, goal, owner) \ | 2060 | #define ext4_new_inode(handle, dir, mode, qstr, goal, owner) \ |
2051 | __ext4_new_inode((handle), (dir), (mode), (qstr), (goal), (owner), \ | 2061 | __ext4_new_inode((handle), (dir), (mode), (qstr), (goal), (owner), \ |
2052 | 0, 0, 0) | 2062 | 0, 0, 0) |
2053 | #define ext4_new_inode_start_handle(dir, mode, qstr, goal, owner, \ | 2063 | #define ext4_new_inode_start_handle(dir, mode, qstr, goal, owner, \ |
2054 | type, nblocks) \ | 2064 | type, nblocks) \ |
2055 | __ext4_new_inode(NULL, (dir), (mode), (qstr), (goal), (owner), \ | 2065 | __ext4_new_inode(NULL, (dir), (mode), (qstr), (goal), (owner), \ |
2056 | (type), __LINE__, (nblocks)) | 2066 | (type), __LINE__, (nblocks)) |
2057 | 2067 | ||
2058 | 2068 | ||
2059 | extern void ext4_free_inode(handle_t *, struct inode *); | 2069 | extern void ext4_free_inode(handle_t *, struct inode *); |
2060 | extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); | 2070 | extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); |
2061 | extern unsigned long ext4_count_free_inodes(struct super_block *); | 2071 | extern unsigned long ext4_count_free_inodes(struct super_block *); |
2062 | extern unsigned long ext4_count_dirs(struct super_block *); | 2072 | extern unsigned long ext4_count_dirs(struct super_block *); |
2063 | extern void ext4_check_inodes_bitmap(struct super_block *); | 2073 | extern void ext4_check_inodes_bitmap(struct super_block *); |
2064 | extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); | 2074 | extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); |
2065 | extern int ext4_init_inode_table(struct super_block *sb, | 2075 | extern int ext4_init_inode_table(struct super_block *sb, |
2066 | ext4_group_t group, int barrier); | 2076 | ext4_group_t group, int barrier); |
2067 | extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); | 2077 | extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); |
2068 | 2078 | ||
2069 | /* mballoc.c */ | 2079 | /* mballoc.c */ |
2070 | extern long ext4_mb_stats; | 2080 | extern long ext4_mb_stats; |
2071 | extern long ext4_mb_max_to_scan; | 2081 | extern long ext4_mb_max_to_scan; |
2072 | extern int ext4_mb_init(struct super_block *); | 2082 | extern int ext4_mb_init(struct super_block *); |
2073 | extern int ext4_mb_release(struct super_block *); | 2083 | extern int ext4_mb_release(struct super_block *); |
2074 | extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, | 2084 | extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, |
2075 | struct ext4_allocation_request *, int *); | 2085 | struct ext4_allocation_request *, int *); |
2076 | extern int ext4_mb_reserve_blocks(struct super_block *, int); | 2086 | extern int ext4_mb_reserve_blocks(struct super_block *, int); |
2077 | extern void ext4_discard_preallocations(struct inode *); | 2087 | extern void ext4_discard_preallocations(struct inode *); |
2078 | extern int __init ext4_init_mballoc(void); | 2088 | extern int __init ext4_init_mballoc(void); |
2079 | extern void ext4_exit_mballoc(void); | 2089 | extern void ext4_exit_mballoc(void); |
2080 | extern void ext4_free_blocks(handle_t *handle, struct inode *inode, | 2090 | extern void ext4_free_blocks(handle_t *handle, struct inode *inode, |
2081 | struct buffer_head *bh, ext4_fsblk_t block, | 2091 | struct buffer_head *bh, ext4_fsblk_t block, |
2082 | unsigned long count, int flags); | 2092 | unsigned long count, int flags); |
2083 | extern int ext4_mb_alloc_groupinfo(struct super_block *sb, | 2093 | extern int ext4_mb_alloc_groupinfo(struct super_block *sb, |
2084 | ext4_group_t ngroups); | 2094 | ext4_group_t ngroups); |
2085 | extern int ext4_mb_add_groupinfo(struct super_block *sb, | 2095 | extern int ext4_mb_add_groupinfo(struct super_block *sb, |
2086 | ext4_group_t i, struct ext4_group_desc *desc); | 2096 | ext4_group_t i, struct ext4_group_desc *desc); |
2087 | extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, | 2097 | extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, |
2088 | ext4_fsblk_t block, unsigned long count); | 2098 | ext4_fsblk_t block, unsigned long count); |
2089 | extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); | 2099 | extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); |
2090 | 2100 | ||
2091 | /* inode.c */ | 2101 | /* inode.c */ |
2092 | struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); | 2102 | struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); |
2093 | struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); | 2103 | struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); |
2094 | int ext4_get_block_write(struct inode *inode, sector_t iblock, | 2104 | int ext4_get_block_write(struct inode *inode, sector_t iblock, |
2095 | struct buffer_head *bh_result, int create); | 2105 | struct buffer_head *bh_result, int create); |
2096 | int ext4_get_block(struct inode *inode, sector_t iblock, | 2106 | int ext4_get_block(struct inode *inode, sector_t iblock, |
2097 | struct buffer_head *bh_result, int create); | 2107 | struct buffer_head *bh_result, int create); |
2098 | int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, | 2108 | int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, |
2099 | struct buffer_head *bh, int create); | 2109 | struct buffer_head *bh, int create); |
2100 | int ext4_walk_page_buffers(handle_t *handle, | 2110 | int ext4_walk_page_buffers(handle_t *handle, |
2101 | struct buffer_head *head, | 2111 | struct buffer_head *head, |
2102 | unsigned from, | 2112 | unsigned from, |
2103 | unsigned to, | 2113 | unsigned to, |
2104 | int *partial, | 2114 | int *partial, |
2105 | int (*fn)(handle_t *handle, | 2115 | int (*fn)(handle_t *handle, |
2106 | struct buffer_head *bh)); | 2116 | struct buffer_head *bh)); |
2107 | int do_journal_get_write_access(handle_t *handle, | 2117 | int do_journal_get_write_access(handle_t *handle, |
2108 | struct buffer_head *bh); | 2118 | struct buffer_head *bh); |
2109 | #define FALL_BACK_TO_NONDELALLOC 1 | 2119 | #define FALL_BACK_TO_NONDELALLOC 1 |
2110 | #define CONVERT_INLINE_DATA 2 | 2120 | #define CONVERT_INLINE_DATA 2 |
2111 | 2121 | ||
2112 | extern struct inode *ext4_iget(struct super_block *, unsigned long); | 2122 | extern struct inode *ext4_iget(struct super_block *, unsigned long); |
2113 | extern struct inode *ext4_iget_normal(struct super_block *, unsigned long); | 2123 | extern struct inode *ext4_iget_normal(struct super_block *, unsigned long); |
2114 | extern int ext4_write_inode(struct inode *, struct writeback_control *); | 2124 | extern int ext4_write_inode(struct inode *, struct writeback_control *); |
2115 | extern int ext4_setattr(struct dentry *, struct iattr *); | 2125 | extern int ext4_setattr(struct dentry *, struct iattr *); |
2116 | extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, | 2126 | extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, |
2117 | struct kstat *stat); | 2127 | struct kstat *stat); |
2118 | extern void ext4_evict_inode(struct inode *); | 2128 | extern void ext4_evict_inode(struct inode *); |
2119 | extern void ext4_clear_inode(struct inode *); | 2129 | extern void ext4_clear_inode(struct inode *); |
2120 | extern int ext4_sync_inode(handle_t *, struct inode *); | 2130 | extern int ext4_sync_inode(handle_t *, struct inode *); |
2121 | extern void ext4_dirty_inode(struct inode *, int); | 2131 | extern void ext4_dirty_inode(struct inode *, int); |
2122 | extern int ext4_change_inode_journal_flag(struct inode *, int); | 2132 | extern int ext4_change_inode_journal_flag(struct inode *, int); |
2123 | extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); | 2133 | extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); |
2124 | extern int ext4_inode_attach_jinode(struct inode *inode); | 2134 | extern int ext4_inode_attach_jinode(struct inode *inode); |
2125 | extern int ext4_can_truncate(struct inode *inode); | 2135 | extern int ext4_can_truncate(struct inode *inode); |
2126 | extern void ext4_truncate(struct inode *); | 2136 | extern void ext4_truncate(struct inode *); |
2127 | extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); | 2137 | extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); |
2128 | extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); | 2138 | extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); |
2129 | extern void ext4_set_inode_flags(struct inode *); | 2139 | extern void ext4_set_inode_flags(struct inode *); |
2130 | extern void ext4_get_inode_flags(struct ext4_inode_info *); | 2140 | extern void ext4_get_inode_flags(struct ext4_inode_info *); |
2131 | extern int ext4_alloc_da_blocks(struct inode *inode); | 2141 | extern int ext4_alloc_da_blocks(struct inode *inode); |
2132 | extern void ext4_set_aops(struct inode *inode); | 2142 | extern void ext4_set_aops(struct inode *inode); |
2133 | extern int ext4_writepage_trans_blocks(struct inode *); | 2143 | extern int ext4_writepage_trans_blocks(struct inode *); |
2134 | extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); | 2144 | extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); |
2135 | extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, | 2145 | extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, |
2136 | loff_t lstart, loff_t lend); | 2146 | loff_t lstart, loff_t lend); |
2137 | extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); | 2147 | extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); |
2138 | extern qsize_t *ext4_get_reserved_space(struct inode *inode); | 2148 | extern qsize_t *ext4_get_reserved_space(struct inode *inode); |
2139 | extern void ext4_da_update_reserve_space(struct inode *inode, | 2149 | extern void ext4_da_update_reserve_space(struct inode *inode, |
2140 | int used, int quota_claim); | 2150 | int used, int quota_claim); |
2141 | 2151 | ||
2142 | /* indirect.c */ | 2152 | /* indirect.c */ |
2143 | extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, | 2153 | extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, |
2144 | struct ext4_map_blocks *map, int flags); | 2154 | struct ext4_map_blocks *map, int flags); |
2145 | extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, | 2155 | extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, |
2146 | struct iov_iter *iter, loff_t offset); | 2156 | struct iov_iter *iter, loff_t offset); |
2147 | extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); | 2157 | extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); |
2148 | extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); | 2158 | extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); |
2149 | extern void ext4_ind_truncate(handle_t *, struct inode *inode); | 2159 | extern void ext4_ind_truncate(handle_t *, struct inode *inode); |
2150 | extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode, | 2160 | extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode, |
2151 | ext4_lblk_t start, ext4_lblk_t end); | 2161 | ext4_lblk_t start, ext4_lblk_t end); |
2152 | 2162 | ||
2153 | /* ioctl.c */ | 2163 | /* ioctl.c */ |
2154 | extern long ext4_ioctl(struct file *, unsigned int, unsigned long); | 2164 | extern long ext4_ioctl(struct file *, unsigned int, unsigned long); |
2155 | extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); | 2165 | extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); |
2156 | 2166 | ||
2157 | /* migrate.c */ | 2167 | /* migrate.c */ |
2158 | extern int ext4_ext_migrate(struct inode *); | 2168 | extern int ext4_ext_migrate(struct inode *); |
2159 | extern int ext4_ind_migrate(struct inode *inode); | 2169 | extern int ext4_ind_migrate(struct inode *inode); |
2160 | 2170 | ||
2161 | /* namei.c */ | 2171 | /* namei.c */ |
2162 | extern int ext4_dirent_csum_verify(struct inode *inode, | 2172 | extern int ext4_dirent_csum_verify(struct inode *inode, |
2163 | struct ext4_dir_entry *dirent); | 2173 | struct ext4_dir_entry *dirent); |
2164 | extern int ext4_orphan_add(handle_t *, struct inode *); | 2174 | extern int ext4_orphan_add(handle_t *, struct inode *); |
2165 | extern int ext4_orphan_del(handle_t *, struct inode *); | 2175 | extern int ext4_orphan_del(handle_t *, struct inode *); |
2166 | extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, | 2176 | extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, |
2167 | __u32 start_minor_hash, __u32 *next_hash); | 2177 | __u32 start_minor_hash, __u32 *next_hash); |
2168 | extern int search_dir(struct buffer_head *bh, | 2178 | extern int search_dir(struct buffer_head *bh, |
2169 | char *search_buf, | 2179 | char *search_buf, |
2170 | int buf_size, | 2180 | int buf_size, |
2171 | struct inode *dir, | 2181 | struct inode *dir, |
2172 | const struct qstr *d_name, | 2182 | const struct qstr *d_name, |
2173 | unsigned int offset, | 2183 | unsigned int offset, |
2174 | struct ext4_dir_entry_2 **res_dir); | 2184 | struct ext4_dir_entry_2 **res_dir); |
2175 | extern int ext4_generic_delete_entry(handle_t *handle, | 2185 | extern int ext4_generic_delete_entry(handle_t *handle, |
2176 | struct inode *dir, | 2186 | struct inode *dir, |
2177 | struct ext4_dir_entry_2 *de_del, | 2187 | struct ext4_dir_entry_2 *de_del, |
2178 | struct buffer_head *bh, | 2188 | struct buffer_head *bh, |
2179 | void *entry_buf, | 2189 | void *entry_buf, |
2180 | int buf_size, | 2190 | int buf_size, |
2181 | int csum_size); | 2191 | int csum_size); |
2182 | 2192 | ||
2183 | /* resize.c */ | 2193 | /* resize.c */ |
2184 | extern int ext4_group_add(struct super_block *sb, | 2194 | extern int ext4_group_add(struct super_block *sb, |
2185 | struct ext4_new_group_data *input); | 2195 | struct ext4_new_group_data *input); |
2186 | extern int ext4_group_extend(struct super_block *sb, | 2196 | extern int ext4_group_extend(struct super_block *sb, |
2187 | struct ext4_super_block *es, | 2197 | struct ext4_super_block *es, |
2188 | ext4_fsblk_t n_blocks_count); | 2198 | ext4_fsblk_t n_blocks_count); |
2189 | extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); | 2199 | extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); |
2190 | 2200 | ||
2191 | /* super.c */ | 2201 | /* super.c */ |
2192 | extern int ext4_calculate_overhead(struct super_block *sb); | 2202 | extern int ext4_calculate_overhead(struct super_block *sb); |
2193 | extern void ext4_superblock_csum_set(struct super_block *sb); | 2203 | extern void ext4_superblock_csum_set(struct super_block *sb); |
2194 | extern void *ext4_kvmalloc(size_t size, gfp_t flags); | 2204 | extern void *ext4_kvmalloc(size_t size, gfp_t flags); |
2195 | extern void *ext4_kvzalloc(size_t size, gfp_t flags); | 2205 | extern void *ext4_kvzalloc(size_t size, gfp_t flags); |
2196 | extern int ext4_alloc_flex_bg_array(struct super_block *sb, | 2206 | extern int ext4_alloc_flex_bg_array(struct super_block *sb, |
2197 | ext4_group_t ngroup); | 2207 | ext4_group_t ngroup); |
2198 | extern const char *ext4_decode_error(struct super_block *sb, int errno, | 2208 | extern const char *ext4_decode_error(struct super_block *sb, int errno, |
2199 | char nbuf[16]); | 2209 | char nbuf[16]); |
2200 | 2210 | ||
2201 | extern __printf(4, 5) | 2211 | extern __printf(4, 5) |
2202 | void __ext4_error(struct super_block *, const char *, unsigned int, | 2212 | void __ext4_error(struct super_block *, const char *, unsigned int, |
2203 | const char *, ...); | 2213 | const char *, ...); |
2204 | extern __printf(5, 6) | 2214 | extern __printf(5, 6) |
2205 | void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, | 2215 | void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, |
2206 | const char *, ...); | 2216 | const char *, ...); |
2207 | extern __printf(5, 6) | 2217 | extern __printf(5, 6) |
2208 | void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, | 2218 | void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, |
2209 | const char *, ...); | 2219 | const char *, ...); |
2210 | extern void __ext4_std_error(struct super_block *, const char *, | 2220 | extern void __ext4_std_error(struct super_block *, const char *, |
2211 | unsigned int, int); | 2221 | unsigned int, int); |
2212 | extern __printf(4, 5) | 2222 | extern __printf(4, 5) |
2213 | void __ext4_abort(struct super_block *, const char *, unsigned int, | 2223 | void __ext4_abort(struct super_block *, const char *, unsigned int, |
2214 | const char *, ...); | 2224 | const char *, ...); |
2215 | extern __printf(4, 5) | 2225 | extern __printf(4, 5) |
2216 | void __ext4_warning(struct super_block *, const char *, unsigned int, | 2226 | void __ext4_warning(struct super_block *, const char *, unsigned int, |
2217 | const char *, ...); | 2227 | const char *, ...); |
2218 | extern __printf(3, 4) | 2228 | extern __printf(3, 4) |
2219 | void __ext4_msg(struct super_block *, const char *, const char *, ...); | 2229 | void __ext4_msg(struct super_block *, const char *, const char *, ...); |
2220 | extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, | 2230 | extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, |
2221 | const char *, unsigned int, const char *); | 2231 | const char *, unsigned int, const char *); |
2222 | extern __printf(7, 8) | 2232 | extern __printf(7, 8) |
2223 | void __ext4_grp_locked_error(const char *, unsigned int, | 2233 | void __ext4_grp_locked_error(const char *, unsigned int, |
2224 | struct super_block *, ext4_group_t, | 2234 | struct super_block *, ext4_group_t, |
2225 | unsigned long, ext4_fsblk_t, | 2235 | unsigned long, ext4_fsblk_t, |
2226 | const char *, ...); | 2236 | const char *, ...); |
2227 | 2237 | ||
2228 | #ifdef CONFIG_PRINTK | 2238 | #ifdef CONFIG_PRINTK |
2229 | 2239 | ||
2230 | #define ext4_error_inode(inode, func, line, block, fmt, ...) \ | 2240 | #define ext4_error_inode(inode, func, line, block, fmt, ...) \ |
2231 | __ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__) | 2241 | __ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__) |
2232 | #define ext4_error_file(file, func, line, block, fmt, ...) \ | 2242 | #define ext4_error_file(file, func, line, block, fmt, ...) \ |
2233 | __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__) | 2243 | __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__) |
2234 | #define ext4_error(sb, fmt, ...) \ | 2244 | #define ext4_error(sb, fmt, ...) \ |
2235 | __ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) | 2245 | __ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) |
2236 | #define ext4_abort(sb, fmt, ...) \ | 2246 | #define ext4_abort(sb, fmt, ...) \ |
2237 | __ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) | 2247 | __ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) |
2238 | #define ext4_warning(sb, fmt, ...) \ | 2248 | #define ext4_warning(sb, fmt, ...) \ |
2239 | __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) | 2249 | __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) |
2240 | #define ext4_msg(sb, level, fmt, ...) \ | 2250 | #define ext4_msg(sb, level, fmt, ...) \ |
2241 | __ext4_msg(sb, level, fmt, ##__VA_ARGS__) | 2251 | __ext4_msg(sb, level, fmt, ##__VA_ARGS__) |
2242 | #define dump_mmp_msg(sb, mmp, msg) \ | 2252 | #define dump_mmp_msg(sb, mmp, msg) \ |
2243 | __dump_mmp_msg(sb, mmp, __func__, __LINE__, msg) | 2253 | __dump_mmp_msg(sb, mmp, __func__, __LINE__, msg) |
2244 | #define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \ | 2254 | #define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \ |
2245 | __ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \ | 2255 | __ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \ |
2246 | fmt, ##__VA_ARGS__) | 2256 | fmt, ##__VA_ARGS__) |
2247 | 2257 | ||
2248 | #else | 2258 | #else |
2249 | 2259 | ||
2250 | #define ext4_error_inode(inode, func, line, block, fmt, ...) \ | 2260 | #define ext4_error_inode(inode, func, line, block, fmt, ...) \ |
2251 | do { \ | 2261 | do { \ |
2252 | no_printk(fmt, ##__VA_ARGS__); \ | 2262 | no_printk(fmt, ##__VA_ARGS__); \ |
2253 | __ext4_error_inode(inode, "", 0, block, " "); \ | 2263 | __ext4_error_inode(inode, "", 0, block, " "); \ |
2254 | } while (0) | 2264 | } while (0) |
2255 | #define ext4_error_file(file, func, line, block, fmt, ...) \ | 2265 | #define ext4_error_file(file, func, line, block, fmt, ...) \ |
2256 | do { \ | 2266 | do { \ |
2257 | no_printk(fmt, ##__VA_ARGS__); \ | 2267 | no_printk(fmt, ##__VA_ARGS__); \ |
2258 | __ext4_error_file(file, "", 0, block, " "); \ | 2268 | __ext4_error_file(file, "", 0, block, " "); \ |
2259 | } while (0) | 2269 | } while (0) |
2260 | #define ext4_error(sb, fmt, ...) \ | 2270 | #define ext4_error(sb, fmt, ...) \ |
2261 | do { \ | 2271 | do { \ |
2262 | no_printk(fmt, ##__VA_ARGS__); \ | 2272 | no_printk(fmt, ##__VA_ARGS__); \ |
2263 | __ext4_error(sb, "", 0, " "); \ | 2273 | __ext4_error(sb, "", 0, " "); \ |
2264 | } while (0) | 2274 | } while (0) |
2265 | #define ext4_abort(sb, fmt, ...) \ | 2275 | #define ext4_abort(sb, fmt, ...) \ |
2266 | do { \ | 2276 | do { \ |
2267 | no_printk(fmt, ##__VA_ARGS__); \ | 2277 | no_printk(fmt, ##__VA_ARGS__); \ |
2268 | __ext4_abort(sb, "", 0, " "); \ | 2278 | __ext4_abort(sb, "", 0, " "); \ |
2269 | } while (0) | 2279 | } while (0) |
2270 | #define ext4_warning(sb, fmt, ...) \ | 2280 | #define ext4_warning(sb, fmt, ...) \ |
2271 | do { \ | 2281 | do { \ |
2272 | no_printk(fmt, ##__VA_ARGS__); \ | 2282 | no_printk(fmt, ##__VA_ARGS__); \ |
2273 | __ext4_warning(sb, "", 0, " "); \ | 2283 | __ext4_warning(sb, "", 0, " "); \ |
2274 | } while (0) | 2284 | } while (0) |
2275 | #define ext4_msg(sb, level, fmt, ...) \ | 2285 | #define ext4_msg(sb, level, fmt, ...) \ |
2276 | do { \ | 2286 | do { \ |
2277 | no_printk(fmt, ##__VA_ARGS__); \ | 2287 | no_printk(fmt, ##__VA_ARGS__); \ |
2278 | __ext4_msg(sb, "", " "); \ | 2288 | __ext4_msg(sb, "", " "); \ |
2279 | } while (0) | 2289 | } while (0) |
2280 | #define dump_mmp_msg(sb, mmp, msg) \ | 2290 | #define dump_mmp_msg(sb, mmp, msg) \ |
2281 | __dump_mmp_msg(sb, mmp, "", 0, "") | 2291 | __dump_mmp_msg(sb, mmp, "", 0, "") |
2282 | #define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \ | 2292 | #define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \ |
2283 | do { \ | 2293 | do { \ |
2284 | no_printk(fmt, ##__VA_ARGS__); \ | 2294 | no_printk(fmt, ##__VA_ARGS__); \ |
2285 | __ext4_grp_locked_error("", 0, sb, grp, ino, block, " "); \ | 2295 | __ext4_grp_locked_error("", 0, sb, grp, ino, block, " "); \ |
2286 | } while (0) | 2296 | } while (0) |
2287 | 2297 | ||
2288 | #endif | 2298 | #endif |
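Both halves of this #ifdef keep gcc's printf-format checking alive: with CONFIG_PRINTK off, the format string and arguments are still fed to no_printk() in a dead branch, so mismatched specifiers warn identically in either configuration while the message text compiles away. A userspace sketch of that idiom (my_pr_debug is a hypothetical name, not the kernel macro):

    #include <stdio.h>

    /* The dead branch is removed by the optimizer, but the compiler still
     * type-checks fmt against the arguments; nothing is evaluated or
     * emitted at run time. */
    #define my_pr_debug(fmt, ...)           \
        do {                                \
            if (0)                          \
                printf(fmt, ##__VA_ARGS__); \
        } while (0)

    int main(void)
    {
        my_pr_debug("value=%d\n", 42);          /* emits no code or output */
        /* my_pr_debug("value=%d\n", "x");  -- would warn: %d vs char * */
        return 0;
    }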
2289 | 2299 | ||
2290 | extern void ext4_update_dynamic_rev(struct super_block *sb); | 2300 | extern void ext4_update_dynamic_rev(struct super_block *sb); |
2291 | extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, | 2301 | extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, |
2292 | __u32 compat); | 2302 | __u32 compat); |
2293 | extern int ext4_update_rocompat_feature(handle_t *handle, | 2303 | extern int ext4_update_rocompat_feature(handle_t *handle, |
2294 | struct super_block *sb, __u32 rocompat); | 2304 | struct super_block *sb, __u32 rocompat); |
2295 | extern int ext4_update_incompat_feature(handle_t *handle, | 2305 | extern int ext4_update_incompat_feature(handle_t *handle, |
2296 | struct super_block *sb, __u32 incompat); | 2306 | struct super_block *sb, __u32 incompat); |
2297 | extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, | 2307 | extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, |
2298 | struct ext4_group_desc *bg); | 2308 | struct ext4_group_desc *bg); |
2299 | extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, | 2309 | extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, |
2300 | struct ext4_group_desc *bg); | 2310 | struct ext4_group_desc *bg); |
2301 | extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, | 2311 | extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, |
2302 | struct ext4_group_desc *bg); | 2312 | struct ext4_group_desc *bg); |
2303 | extern __u32 ext4_free_group_clusters(struct super_block *sb, | 2313 | extern __u32 ext4_free_group_clusters(struct super_block *sb, |
2304 | struct ext4_group_desc *bg); | 2314 | struct ext4_group_desc *bg); |
2305 | extern __u32 ext4_free_inodes_count(struct super_block *sb, | 2315 | extern __u32 ext4_free_inodes_count(struct super_block *sb, |
2306 | struct ext4_group_desc *bg); | 2316 | struct ext4_group_desc *bg); |
2307 | extern __u32 ext4_used_dirs_count(struct super_block *sb, | 2317 | extern __u32 ext4_used_dirs_count(struct super_block *sb, |
2308 | struct ext4_group_desc *bg); | 2318 | struct ext4_group_desc *bg); |
2309 | extern __u32 ext4_itable_unused_count(struct super_block *sb, | 2319 | extern __u32 ext4_itable_unused_count(struct super_block *sb, |
2310 | struct ext4_group_desc *bg); | 2320 | struct ext4_group_desc *bg); |
2311 | extern void ext4_block_bitmap_set(struct super_block *sb, | 2321 | extern void ext4_block_bitmap_set(struct super_block *sb, |
2312 | struct ext4_group_desc *bg, ext4_fsblk_t blk); | 2322 | struct ext4_group_desc *bg, ext4_fsblk_t blk); |
2313 | extern void ext4_inode_bitmap_set(struct super_block *sb, | 2323 | extern void ext4_inode_bitmap_set(struct super_block *sb, |
2314 | struct ext4_group_desc *bg, ext4_fsblk_t blk); | 2324 | struct ext4_group_desc *bg, ext4_fsblk_t blk); |
2315 | extern void ext4_inode_table_set(struct super_block *sb, | 2325 | extern void ext4_inode_table_set(struct super_block *sb, |
2316 | struct ext4_group_desc *bg, ext4_fsblk_t blk); | 2326 | struct ext4_group_desc *bg, ext4_fsblk_t blk); |
2317 | extern void ext4_free_group_clusters_set(struct super_block *sb, | 2327 | extern void ext4_free_group_clusters_set(struct super_block *sb, |
2318 | struct ext4_group_desc *bg, | 2328 | struct ext4_group_desc *bg, |
2319 | __u32 count); | 2329 | __u32 count); |
2320 | extern void ext4_free_inodes_set(struct super_block *sb, | 2330 | extern void ext4_free_inodes_set(struct super_block *sb, |
2321 | struct ext4_group_desc *bg, __u32 count); | 2331 | struct ext4_group_desc *bg, __u32 count); |
2322 | extern void ext4_used_dirs_set(struct super_block *sb, | 2332 | extern void ext4_used_dirs_set(struct super_block *sb, |
2323 | struct ext4_group_desc *bg, __u32 count); | 2333 | struct ext4_group_desc *bg, __u32 count); |
2324 | extern void ext4_itable_unused_set(struct super_block *sb, | 2334 | extern void ext4_itable_unused_set(struct super_block *sb, |
2325 | struct ext4_group_desc *bg, __u32 count); | 2335 | struct ext4_group_desc *bg, __u32 count); |
2326 | extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group, | 2336 | extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group, |
2327 | struct ext4_group_desc *gdp); | 2337 | struct ext4_group_desc *gdp); |
2328 | extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group, | 2338 | extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group, |
2329 | struct ext4_group_desc *gdp); | 2339 | struct ext4_group_desc *gdp); |
2330 | extern int ext4_register_li_request(struct super_block *sb, | 2340 | extern int ext4_register_li_request(struct super_block *sb, |
2331 | ext4_group_t first_not_zeroed); | 2341 | ext4_group_t first_not_zeroed); |
2332 | 2342 | ||
2333 | static inline int ext4_has_group_desc_csum(struct super_block *sb) | 2343 | static inline int ext4_has_group_desc_csum(struct super_block *sb) |
2334 | { | 2344 | { |
2335 | return EXT4_HAS_RO_COMPAT_FEATURE(sb, | 2345 | return EXT4_HAS_RO_COMPAT_FEATURE(sb, |
2336 | EXT4_FEATURE_RO_COMPAT_GDT_CSUM) || | 2346 | EXT4_FEATURE_RO_COMPAT_GDT_CSUM) || |
2337 | (EXT4_SB(sb)->s_chksum_driver != NULL); | 2347 | (EXT4_SB(sb)->s_chksum_driver != NULL); |
2338 | } | 2348 | } |
2339 | 2349 | ||
2340 | static inline int ext4_has_metadata_csum(struct super_block *sb) | 2350 | static inline int ext4_has_metadata_csum(struct super_block *sb) |
2341 | { | 2351 | { |
2342 | WARN_ON_ONCE(EXT4_HAS_RO_COMPAT_FEATURE(sb, | 2352 | WARN_ON_ONCE(EXT4_HAS_RO_COMPAT_FEATURE(sb, |
2343 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && | 2353 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && |
2344 | !EXT4_SB(sb)->s_chksum_driver); | 2354 | !EXT4_SB(sb)->s_chksum_driver); |
2345 | 2355 | ||
2346 | return (EXT4_SB(sb)->s_chksum_driver != NULL); | 2356 | return (EXT4_SB(sb)->s_chksum_driver != NULL); |
2347 | } | 2357 | } |
2348 | static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) | 2358 | static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) |
2349 | { | 2359 | { |
2350 | return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) | | 2360 | return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) | |
2351 | le32_to_cpu(es->s_blocks_count_lo); | 2361 | le32_to_cpu(es->s_blocks_count_lo); |
2352 | } | 2362 | } |
2353 | 2363 | ||
2354 | static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es) | 2364 | static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es) |
2355 | { | 2365 | { |
2356 | return ((ext4_fsblk_t)le32_to_cpu(es->s_r_blocks_count_hi) << 32) | | 2366 | return ((ext4_fsblk_t)le32_to_cpu(es->s_r_blocks_count_hi) << 32) | |
2357 | le32_to_cpu(es->s_r_blocks_count_lo); | 2367 | le32_to_cpu(es->s_r_blocks_count_lo); |
2358 | } | 2368 | } |
2359 | 2369 | ||
2360 | static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es) | 2370 | static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es) |
2361 | { | 2371 | { |
2362 | return ((ext4_fsblk_t)le32_to_cpu(es->s_free_blocks_count_hi) << 32) | | 2372 | return ((ext4_fsblk_t)le32_to_cpu(es->s_free_blocks_count_hi) << 32) | |
2363 | le32_to_cpu(es->s_free_blocks_count_lo); | 2373 | le32_to_cpu(es->s_free_blocks_count_lo); |
2364 | } | 2374 | } |
2365 | 2375 | ||
2366 | static inline void ext4_blocks_count_set(struct ext4_super_block *es, | 2376 | static inline void ext4_blocks_count_set(struct ext4_super_block *es, |
2367 | ext4_fsblk_t blk) | 2377 | ext4_fsblk_t blk) |
2368 | { | 2378 | { |
2369 | es->s_blocks_count_lo = cpu_to_le32((u32)blk); | 2379 | es->s_blocks_count_lo = cpu_to_le32((u32)blk); |
2370 | es->s_blocks_count_hi = cpu_to_le32(blk >> 32); | 2380 | es->s_blocks_count_hi = cpu_to_le32(blk >> 32); |
2371 | } | 2381 | } |
2372 | 2382 | ||
2373 | static inline void ext4_free_blocks_count_set(struct ext4_super_block *es, | 2383 | static inline void ext4_free_blocks_count_set(struct ext4_super_block *es, |
2374 | ext4_fsblk_t blk) | 2384 | ext4_fsblk_t blk) |
2375 | { | 2385 | { |
2376 | es->s_free_blocks_count_lo = cpu_to_le32((u32)blk); | 2386 | es->s_free_blocks_count_lo = cpu_to_le32((u32)blk); |
2377 | es->s_free_blocks_count_hi = cpu_to_le32(blk >> 32); | 2387 | es->s_free_blocks_count_hi = cpu_to_le32(blk >> 32); |
2378 | } | 2388 | } |
2379 | 2389 | ||
2380 | static inline void ext4_r_blocks_count_set(struct ext4_super_block *es, | 2390 | static inline void ext4_r_blocks_count_set(struct ext4_super_block *es, |
2381 | ext4_fsblk_t blk) | 2391 | ext4_fsblk_t blk) |
2382 | { | 2392 | { |
2383 | es->s_r_blocks_count_lo = cpu_to_le32((u32)blk); | 2393 | es->s_r_blocks_count_lo = cpu_to_le32((u32)blk); |
2384 | es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); | 2394 | es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); |
2385 | } | 2395 | } |
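All six helpers above implement the same lo/hi split: a block count wider than 32 bits is stored as two little-endian 32-bit superblock fields and reassembled with a shift and an OR. A self-contained round trip of that arithmetic, with plain uint32_t halves standing in for the __le32 fields (the le32_to_cpu()/cpu_to_le32() byte swaps are identity operations on little-endian hosts and are omitted):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    struct counts { uint32_t lo, hi; };   /* models s_blocks_count_{lo,hi} */

    static void set_blocks(struct counts *c, uint64_t blk)
    {
        c->lo = (uint32_t)blk;            /* low 32 bits */
        c->hi = (uint32_t)(blk >> 32);    /* everything above them */
    }

    static uint64_t get_blocks(const struct counts *c)
    {
        return ((uint64_t)c->hi << 32) | c->lo;
    }

    int main(void)
    {
        struct counts c;
        uint64_t n = (1ULL << 40) + 12345;     /* needs the hi half */

        set_blocks(&c, n);
        assert(get_blocks(&c) == n);
        printf("lo=%u hi=%u -> %llu\n", (unsigned)c.lo, (unsigned)c.hi,
               (unsigned long long)get_blocks(&c));
        return 0;
    }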
2386 | 2396 | ||
2387 | static inline loff_t ext4_isize(struct ext4_inode *raw_inode) | 2397 | static inline loff_t ext4_isize(struct ext4_inode *raw_inode) |
2388 | { | 2398 | { |
2389 | if (S_ISREG(le16_to_cpu(raw_inode->i_mode))) | 2399 | if (S_ISREG(le16_to_cpu(raw_inode->i_mode))) |
2390 | return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | | 2400 | return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | |
2391 | le32_to_cpu(raw_inode->i_size_lo); | 2401 | le32_to_cpu(raw_inode->i_size_lo); |
2392 | else | 2402 | else |
2393 | return (loff_t) le32_to_cpu(raw_inode->i_size_lo); | 2403 | return (loff_t) le32_to_cpu(raw_inode->i_size_lo); |
2394 | } | 2404 | } |
2395 | 2405 | ||
2396 | static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) | 2406 | static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) |
2397 | { | 2407 | { |
2398 | raw_inode->i_size_lo = cpu_to_le32(i_size); | 2408 | raw_inode->i_size_lo = cpu_to_le32(i_size); |
2399 | raw_inode->i_size_high = cpu_to_le32(i_size >> 32); | 2409 | raw_inode->i_size_high = cpu_to_le32(i_size >> 32); |
2400 | } | 2410 | } |
2401 | 2411 | ||
2402 | static inline | 2412 | static inline |
2403 | struct ext4_group_info *ext4_get_group_info(struct super_block *sb, | 2413 | struct ext4_group_info *ext4_get_group_info(struct super_block *sb, |
2404 | ext4_group_t group) | 2414 | ext4_group_t group) |
2405 | { | 2415 | { |
2406 | struct ext4_group_info ***grp_info; | 2416 | struct ext4_group_info ***grp_info; |
2407 | long indexv, indexh; | 2417 | long indexv, indexh; |
2408 | BUG_ON(group >= EXT4_SB(sb)->s_groups_count); | 2418 | BUG_ON(group >= EXT4_SB(sb)->s_groups_count); |
2409 | grp_info = EXT4_SB(sb)->s_group_info; | 2419 | grp_info = EXT4_SB(sb)->s_group_info; |
2410 | indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); | 2420 | indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); |
2411 | indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); | 2421 | indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); |
2412 | return grp_info[indexv][indexh]; | 2422 | return grp_info[indexv][indexh]; |
2413 | } | 2423 | } |
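s_group_info is a two-level table, so ext4_get_group_info() splits the group number into a top-level index (high bits) and an offset inside one descriptor-block-sized chunk (low bits); because EXT4_DESC_PER_BLOCK(sb) is a power of two, both reduce to a shift and a mask. The arithmetic, assuming 128 entries per chunk (a plausible figure for 4K blocks with 32-byte descriptors, not a universal constant):

    #include <stdio.h>

    #define DESC_PER_BLOCK_BITS 7
    #define DESC_PER_BLOCK      (1u << DESC_PER_BLOCK_BITS)   /* 128 */

    int main(void)
    {
        unsigned group  = 1000;
        unsigned indexv = group >> DESC_PER_BLOCK_BITS;    /* chunk 7    */
        unsigned indexh = group & (DESC_PER_BLOCK - 1);    /* offset 104 */

        printf("group %u -> grp_info[%u][%u]\n", group, indexv, indexh);
        return 0;
    }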
2414 | 2424 | ||
2415 | /* | 2425 | /* |
2416 | * Reading s_groups_count requires using smp_rmb() afterwards. See | 2426 | * Reading s_groups_count requires using smp_rmb() afterwards. See |
2417 | * the locking protocol documented in the comments of ext4_group_add() | 2427 | * the locking protocol documented in the comments of ext4_group_add() |
2418 | * in resize.c | 2428 | * in resize.c |
2419 | */ | 2429 | */ |
2420 | static inline ext4_group_t ext4_get_groups_count(struct super_block *sb) | 2430 | static inline ext4_group_t ext4_get_groups_count(struct super_block *sb) |
2421 | { | 2431 | { |
2422 | ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; | 2432 | ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; |
2423 | 2433 | ||
2424 | smp_rmb(); | 2434 | smp_rmb(); |
2425 | return ngroups; | 2435 | return ngroups; |
2426 | } | 2436 | } |
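The smp_rmb() here pairs with a write barrier on the online-resize side: the grower initializes the new group's structures first, issues smp_wmb(), and only then raises s_groups_count, so any reader that sees the larger count also sees initialized data behind it. A rough userspace analogue of that publish/consume ordering using C11 fences (names and layout here are illustrative, not resize.c's):

    #include <stdatomic.h>

    #define MAX_GROUPS 1024

    static int table[MAX_GROUPS];
    static atomic_uint ngroups;

    /* Grower: make the slot visible before the count (cf. smp_wmb()). */
    static void add_group(unsigned n, int data)
    {
        table[n] = data;
        atomic_thread_fence(memory_order_release);
        atomic_store_explicit(&ngroups, n + 1, memory_order_relaxed);
    }

    /* Reader: load the count, then fence before touching the slots
     * (cf. the smp_rmb() in ext4_get_groups_count()). */
    static long sum_groups(void)
    {
        unsigned n = atomic_load_explicit(&ngroups, memory_order_relaxed);
        long s = 0;

        atomic_thread_fence(memory_order_acquire);
        for (unsigned i = 0; i < n; i++)
            s += table[i];
        return s;
    }

    int main(void)
    {
        add_group(0, 7);
        return sum_groups() == 7 ? 0 : 1;
    }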
2427 | 2437 | ||
2428 | static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, | 2438 | static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, |
2429 | ext4_group_t block_group) | 2439 | ext4_group_t block_group) |
2430 | { | 2440 | { |
2431 | return block_group >> sbi->s_log_groups_per_flex; | 2441 | return block_group >> sbi->s_log_groups_per_flex; |
2432 | } | 2442 | } |
2433 | 2443 | ||
2434 | static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) | 2444 | static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) |
2435 | { | 2445 | { |
2436 | return 1 << sbi->s_log_groups_per_flex; | 2446 | return 1 << sbi->s_log_groups_per_flex; |
2437 | } | 2447 | } |
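A flex group is simply a run of 2^s_log_groups_per_flex ordinary block groups whose bitmaps and inode tables are packed together, so mapping a block group to its flex group is one right shift. Worked through with a log of 4, mkfs.ext4's usual default (assumed here, not read from any superblock):

    #include <stdio.h>

    int main(void)
    {
        unsigned log_groups_per_flex = 4;                /* assumed default */
        unsigned flex_size = 1u << log_groups_per_flex;  /* 16 groups       */

        for (unsigned bg = 0; bg < 40; bg += 17)
            printf("block group %2u -> flex group %u (size %u)\n",
                   bg, bg >> log_groups_per_flex, flex_size);
        return 0;
    }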
2438 | 2448 | ||
2439 | #define ext4_std_error(sb, errno) \ | 2449 | #define ext4_std_error(sb, errno) \ |
2440 | do { \ | 2450 | do { \ |
2441 | if ((errno)) \ | 2451 | if ((errno)) \ |
2442 | __ext4_std_error((sb), __func__, __LINE__, (errno)); \ | 2452 | __ext4_std_error((sb), __func__, __LINE__, (errno)); \ |
2443 | } while (0) | 2453 | } while (0) |
2444 | 2454 | ||
2445 | #ifdef CONFIG_SMP | 2455 | #ifdef CONFIG_SMP |
2446 | /* Each CPU can accumulate up to percpu_counter_batch clusters in its | 2456 | /* Each CPU can accumulate up to percpu_counter_batch clusters in its |
2447 | * local counter, so the global count can be stale by as much as | 2457 | * local counter, so the global count can be stale by as much as |
2448 | * percpu_counter_batch * nr_cpu_ids; add a 4x window on top of that. | 2458 | * percpu_counter_batch * nr_cpu_ids; add a 4x window on top of that. |
2449 | */ | 2459 | */ |
2450 | #define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) | 2460 | #define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) |
2451 | #else | 2461 | #else |
2452 | #define EXT4_FREECLUSTERS_WATERMARK 0 | 2462 | #define EXT4_FREECLUSTERS_WATERMARK 0 |
2453 | #endif | 2463 | #endif |
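Concretely, if percpu_counter_batch were 32 (an assumed figure; the kernel derives it from the CPU count) on an 8-CPU box, the watermark comes to 4 * 32 * 8 = 1024 clusters; roughly speaking, once free space falls under it, the allocator stops trusting the cheap approximate counter and pays for an exact sum:

    #include <stdio.h>

    int main(void)
    {
        long percpu_counter_batch = 32;   /* assumed batch size */
        long nr_cpu_ids = 8;

        printf("watermark = %ld clusters\n",
               4 * (percpu_counter_batch * nr_cpu_ids));   /* 1024 */
        return 0;
    }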
2454 | 2464 | ||
2455 | /* Update i_disksize. Requires i_mutex to avoid races with truncate */ | 2465 | /* Update i_disksize. Requires i_mutex to avoid races with truncate */ |
2456 | static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) | 2466 | static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) |
2457 | { | 2467 | { |
2458 | WARN_ON_ONCE(S_ISREG(inode->i_mode) && | 2468 | WARN_ON_ONCE(S_ISREG(inode->i_mode) && |
2459 | !mutex_is_locked(&inode->i_mutex)); | 2469 | !mutex_is_locked(&inode->i_mutex)); |
2460 | down_write(&EXT4_I(inode)->i_data_sem); | 2470 | down_write(&EXT4_I(inode)->i_data_sem); |
2461 | if (newsize > EXT4_I(inode)->i_disksize) | 2471 | if (newsize > EXT4_I(inode)->i_disksize) |
2462 | EXT4_I(inode)->i_disksize = newsize; | 2472 | EXT4_I(inode)->i_disksize = newsize; |
2463 | up_write(&EXT4_I(inode)->i_data_sem); | 2473 | up_write(&EXT4_I(inode)->i_data_sem); |
2464 | } | 2474 | } |
2465 | 2475 | ||
2466 | /* Update i_size, i_disksize. Requires i_mutex to avoid races with truncate */ | 2476 | /* Update i_size, i_disksize. Requires i_mutex to avoid races with truncate */ |
2467 | static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize) | 2477 | static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize) |
2468 | { | 2478 | { |
2469 | int changed = 0; | 2479 | int changed = 0; |
2470 | 2480 | ||
2471 | if (newsize > inode->i_size) { | 2481 | if (newsize > inode->i_size) { |
2472 | i_size_write(inode, newsize); | 2482 | i_size_write(inode, newsize); |
2473 | changed = 1; | 2483 | changed = 1; |
2474 | } | 2484 | } |
2475 | if (newsize > EXT4_I(inode)->i_disksize) { | 2485 | if (newsize > EXT4_I(inode)->i_disksize) { |
2476 | ext4_update_i_disksize(inode, newsize); | 2486 | ext4_update_i_disksize(inode, newsize); |
2477 | changed |= 2; | 2487 | changed |= 2; |
2478 | } | 2488 | } |
2479 | return changed; | 2489 | return changed; |
2480 | } | 2490 | } |
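ext4_update_inode_size() advances two watermarks at once and reports which moved as a small bitmask (bit 0: in-core i_size grew, bit 1: on-disk i_disksize grew), letting a caller skip dirtying the inode when nothing changed. A self-contained model of that monotonic update:

    #include <assert.h>

    struct sizes { long long i_size, i_disksize; };

    static int update_sizes(struct sizes *s, long long newsize)
    {
        int changed = 0;

        if (newsize > s->i_size)     { s->i_size = newsize;     changed |= 1; }
        if (newsize > s->i_disksize) { s->i_disksize = newsize; changed |= 2; }
        return changed;
    }

    int main(void)
    {
        struct sizes s = { .i_size = 100, .i_disksize = 50 };

        assert(update_sizes(&s, 80)  == 2);  /* only the on-disk size grew  */
        assert(update_sizes(&s, 200) == 3);  /* both grew                   */
        assert(update_sizes(&s, 10)  == 0);  /* shrinking is truncate's job */
        return 0;
    }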
2481 | 2491 | ||
2482 | struct ext4_group_info { | 2492 | struct ext4_group_info { |
2483 | unsigned long bb_state; | 2493 | unsigned long bb_state; |
2484 | struct rb_root bb_free_root; | 2494 | struct rb_root bb_free_root; |
2485 | ext4_grpblk_t bb_first_free; /* first free block */ | 2495 | ext4_grpblk_t bb_first_free; /* first free block */ |
2486 | ext4_grpblk_t bb_free; /* total free blocks */ | 2496 | ext4_grpblk_t bb_free; /* total free blocks */ |
2487 | ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ | 2497 | ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ |
2488 | ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ | 2498 | ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ |
2489 | struct list_head bb_prealloc_list; | 2499 | struct list_head bb_prealloc_list; |
2490 | #ifdef DOUBLE_CHECK | 2500 | #ifdef DOUBLE_CHECK |
2491 | void *bb_bitmap; | 2501 | void *bb_bitmap; |
2492 | #endif | 2502 | #endif |
2493 | struct rw_semaphore alloc_sem; | 2503 | struct rw_semaphore alloc_sem; |
2494 | ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block | 2504 | ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block |
2495 | * regions, index is order. | 2505 | * regions, index is order. |
2496 | * bb_counters[3] = 5 means | 2506 | * bb_counters[3] = 5 means |
2497 | * 5 free 8-block regions. */ | 2507 | * 5 free 8-block regions. */ |
2498 | }; | 2508 | }; |
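bb_counters[] is the buddy summary for the group: entry k counts free extents of exactly 2^k blocks, so summing counters[k] * 2^k must reproduce bb_free. A quick check of that identity over a made-up set of counters:

    #include <stdio.h>

    int main(void)
    {
        /* Hypothetical buddy counters for one group, orders 0..4. */
        unsigned counters[] = { 3, 0, 2, 5, 1 };
        unsigned free_blocks = 0;

        for (unsigned k = 0; k < sizeof(counters) / sizeof(counters[0]); k++)
            free_blocks += counters[k] << k;     /* counters[k] * 2^k */

        printf("bb_free = %u\n", free_blocks);   /* 3 + 8 + 40 + 16 = 67 */
        return 0;
    }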
2499 | 2509 | ||
2500 | #define EXT4_GROUP_INFO_NEED_INIT_BIT 0 | 2510 | #define EXT4_GROUP_INFO_NEED_INIT_BIT 0 |
2501 | #define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 | 2511 | #define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 |
2502 | #define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT 2 | 2512 | #define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT 2 |
2503 | #define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3 | 2513 | #define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3 |
2504 | 2514 | ||
2505 | #define EXT4_MB_GRP_NEED_INIT(grp) \ | 2515 | #define EXT4_MB_GRP_NEED_INIT(grp) \ |
2506 | (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) | 2516 | (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) |
2507 | #define EXT4_MB_GRP_BBITMAP_CORRUPT(grp) \ | 2517 | #define EXT4_MB_GRP_BBITMAP_CORRUPT(grp) \ |
2508 | (test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state))) | 2518 | (test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state))) |
2509 | #define EXT4_MB_GRP_IBITMAP_CORRUPT(grp) \ | 2519 | #define EXT4_MB_GRP_IBITMAP_CORRUPT(grp) \ |
2510 | (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state))) | 2520 | (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state))) |
2511 | 2521 | ||
2512 | #define EXT4_MB_GRP_WAS_TRIMMED(grp) \ | 2522 | #define EXT4_MB_GRP_WAS_TRIMMED(grp) \ |
2513 | (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) | 2523 | (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) |
2514 | #define EXT4_MB_GRP_SET_TRIMMED(grp) \ | 2524 | #define EXT4_MB_GRP_SET_TRIMMED(grp) \ |
2515 | (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) | 2525 | (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) |
2516 | #define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \ | 2526 | #define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \ |
2517 | (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) | 2527 | (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) |
2518 | 2528 | ||
2519 | #define EXT4_MAX_CONTENTION 8 | 2529 | #define EXT4_MAX_CONTENTION 8 |
2520 | #define EXT4_CONTENTION_THRESHOLD 2 | 2530 | #define EXT4_CONTENTION_THRESHOLD 2 |
2521 | 2531 | ||
2522 | static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, | 2532 | static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, |
2523 | ext4_group_t group) | 2533 | ext4_group_t group) |
2524 | { | 2534 | { |
2525 | return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); | 2535 | return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); |
2526 | } | 2536 | } |
2527 | 2537 | ||
2528 | /* | 2538 | /* |
2529 | * Returns true if the filesystem is busy enough that attempts to | 2539 | * Returns true if the filesystem is busy enough that attempts to |
2530 | * access the block group locks have run into contention. | 2540 | * access the block group locks have run into contention. |
2531 | */ | 2541 | */ |
2532 | static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi) | 2542 | static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi) |
2533 | { | 2543 | { |
2534 | return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD); | 2544 | return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD); |
2535 | } | 2545 | } |
2536 | 2546 | ||
2537 | static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) | 2547 | static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) |
2538 | { | 2548 | { |
2539 | spinlock_t *lock = ext4_group_lock_ptr(sb, group); | 2549 | spinlock_t *lock = ext4_group_lock_ptr(sb, group); |
2540 | if (spin_trylock(lock)) | 2550 | if (spin_trylock(lock)) |
2541 | /* | 2551 | /* |
2542 | * We're able to grab the lock right away, so drop the | 2552 | * We're able to grab the lock right away, so drop the |
2543 | * lock contention counter. | 2553 | * lock contention counter. |
2544 | */ | 2554 | */ |
2545 | atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); | 2555 | atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); |
2546 | else { | 2556 | else { |
2547 | /* | 2557 | /* |
2548 | * The lock is busy, so bump the contention counter, | 2558 | * The lock is busy, so bump the contention counter, |
2549 | * and then wait on the spin lock. | 2559 | * and then wait on the spin lock. |
2550 | */ | 2560 | */ |
2551 | atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1, | 2561 | atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1, |
2552 | EXT4_MAX_CONTENTION); | 2562 | EXT4_MAX_CONTENTION); |
2553 | spin_lock(lock); | 2563 | spin_lock(lock); |
2554 | } | 2564 | } |
2555 | } | 2565 | } |
2556 | 2566 | ||
2557 | static inline void ext4_unlock_group(struct super_block *sb, | 2567 | static inline void ext4_unlock_group(struct super_block *sb, |
2558 | ext4_group_t group) | 2568 | ext4_group_t group) |
2559 | { | 2569 | { |
2560 | spin_unlock(ext4_group_lock_ptr(sb, group)); | 2570 | spin_unlock(ext4_group_lock_ptr(sb, group)); |
2561 | } | 2571 | } |
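ext4_lock_group() keeps a saturating contention score per filesystem: a trylock that succeeds decays s_lock_busy toward zero, one that fails bumps it toward EXT4_MAX_CONTENTION before blocking, and ext4_fs_is_busy() compares the score against the threshold so hot paths can back off from lock-heavy work. A userspace sketch of the same adaptive scheme, with a pthread mutex standing in for the per-group spinlock:

    #include <pthread.h>
    #include <stdatomic.h>

    #define MAX_CONTENTION       8
    #define CONTENTION_THRESHOLD 2

    static atomic_int lock_busy;   /* cf. sbi->s_lock_busy */

    static int fs_is_busy(void)
    {
        return atomic_load(&lock_busy) > CONTENTION_THRESHOLD;
    }

    static void adaptive_lock(pthread_mutex_t *lock)
    {
        int v = atomic_load(&lock_busy);

        if (pthread_mutex_trylock(lock) == 0) {
            /* Uncontended: decay the score, never below zero
             * (cf. atomic_add_unless(..., -1, 0)). */
            while (v > 0 &&
                   !atomic_compare_exchange_weak(&lock_busy, &v, v - 1))
                ;
            return;
        }
        /* Contended: bump the score, saturating at MAX_CONTENTION,
         * then wait for the lock. */
        while (v < MAX_CONTENTION &&
               !atomic_compare_exchange_weak(&lock_busy, &v, v + 1))
            ;
        pthread_mutex_lock(lock);
    }

    int main(void)
    {
        pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;

        adaptive_lock(&m);
        pthread_mutex_unlock(&m);
        return fs_is_busy();   /* 0: no contention was recorded */
    }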
2562 | 2572 | ||
2563 | /* | 2573 | /* |
2564 | * Block validity checking | 2574 | * Block validity checking |
2565 | */ | 2575 | */ |
2566 | #define ext4_check_indirect_blockref(inode, bh) \ | 2576 | #define ext4_check_indirect_blockref(inode, bh) \ |
2567 | ext4_check_blockref(__func__, __LINE__, inode, \ | 2577 | ext4_check_blockref(__func__, __LINE__, inode, \ |
2568 | (__le32 *)(bh)->b_data, \ | 2578 | (__le32 *)(bh)->b_data, \ |
2569 | EXT4_ADDR_PER_BLOCK((inode)->i_sb)) | 2579 | EXT4_ADDR_PER_BLOCK((inode)->i_sb)) |
2570 | 2580 | ||
2571 | #define ext4_ind_check_inode(inode) \ | 2581 | #define ext4_ind_check_inode(inode) \ |
2572 | ext4_check_blockref(__func__, __LINE__, inode, \ | 2582 | ext4_check_blockref(__func__, __LINE__, inode, \ |
2573 | EXT4_I(inode)->i_data, \ | 2583 | EXT4_I(inode)->i_data, \ |
2574 | EXT4_NDIR_BLOCKS) | 2584 | EXT4_NDIR_BLOCKS) |
2575 | 2585 | ||
2576 | /* | 2586 | /* |
2577 | * Inodes and files operations | 2587 | * Inodes and files operations |
2578 | */ | 2588 | */ |
2579 | 2589 | ||
2580 | /* dir.c */ | 2590 | /* dir.c */ |
2581 | extern const struct file_operations ext4_dir_operations; | 2591 | extern const struct file_operations ext4_dir_operations; |
2582 | 2592 | ||
2583 | /* file.c */ | 2593 | /* file.c */ |
2584 | extern const struct inode_operations ext4_file_inode_operations; | 2594 | extern const struct inode_operations ext4_file_inode_operations; |
2585 | extern const struct file_operations ext4_file_operations; | 2595 | extern const struct file_operations ext4_file_operations; |
2586 | extern const struct file_operations ext4_dax_file_operations; | 2596 | extern const struct file_operations ext4_dax_file_operations; |
2587 | extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); | 2597 | extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); |
2588 | 2598 | ||
2589 | /* inline.c */ | 2599 | /* inline.c */ |
2590 | extern int ext4_get_max_inline_size(struct inode *inode); | 2600 | extern int ext4_get_max_inline_size(struct inode *inode); |
2591 | extern int ext4_find_inline_data_nolock(struct inode *inode); | 2601 | extern int ext4_find_inline_data_nolock(struct inode *inode); |
2592 | extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, | 2602 | extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, |
2593 | unsigned int len); | 2603 | unsigned int len); |
2594 | extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); | 2604 | extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); |
2595 | 2605 | ||
2596 | extern int ext4_readpage_inline(struct inode *inode, struct page *page); | 2606 | extern int ext4_readpage_inline(struct inode *inode, struct page *page); |
2597 | extern int ext4_try_to_write_inline_data(struct address_space *mapping, | 2607 | extern int ext4_try_to_write_inline_data(struct address_space *mapping, |
2598 | struct inode *inode, | 2608 | struct inode *inode, |
2599 | loff_t pos, unsigned len, | 2609 | loff_t pos, unsigned len, |
2600 | unsigned flags, | 2610 | unsigned flags, |
2601 | struct page **pagep); | 2611 | struct page **pagep); |
2602 | extern int ext4_write_inline_data_end(struct inode *inode, | 2612 | extern int ext4_write_inline_data_end(struct inode *inode, |
2603 | loff_t pos, unsigned len, | 2613 | loff_t pos, unsigned len, |
2604 | unsigned copied, | 2614 | unsigned copied, |
2605 | struct page *page); | 2615 | struct page *page); |
2606 | extern struct buffer_head * | 2616 | extern struct buffer_head * |
2607 | ext4_journalled_write_inline_data(struct inode *inode, | 2617 | ext4_journalled_write_inline_data(struct inode *inode, |
2608 | unsigned len, | 2618 | unsigned len, |
2609 | struct page *page); | 2619 | struct page *page); |
2610 | extern int ext4_da_write_inline_data_begin(struct address_space *mapping, | 2620 | extern int ext4_da_write_inline_data_begin(struct address_space *mapping, |
2611 | struct inode *inode, | 2621 | struct inode *inode, |
2612 | loff_t pos, unsigned len, | 2622 | loff_t pos, unsigned len, |
2613 | unsigned flags, | 2623 | unsigned flags, |
2614 | struct page **pagep, | 2624 | struct page **pagep, |
2615 | void **fsdata); | 2625 | void **fsdata); |
2616 | extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, | 2626 | extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, |
2617 | unsigned len, unsigned copied, | 2627 | unsigned len, unsigned copied, |
2618 | struct page *page); | 2628 | struct page *page); |
2619 | extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, | 2629 | extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, |
2620 | struct inode *inode); | 2630 | struct inode *inode); |
2621 | extern int ext4_try_create_inline_dir(handle_t *handle, | 2631 | extern int ext4_try_create_inline_dir(handle_t *handle, |
2622 | struct inode *parent, | 2632 | struct inode *parent, |
2623 | struct inode *inode); | 2633 | struct inode *inode); |
2624 | extern int ext4_read_inline_dir(struct file *filp, | 2634 | extern int ext4_read_inline_dir(struct file *filp, |
2625 | struct dir_context *ctx, | 2635 | struct dir_context *ctx, |
2626 | int *has_inline_data); | 2636 | int *has_inline_data); |
2627 | extern int htree_inlinedir_to_tree(struct file *dir_file, | 2637 | extern int htree_inlinedir_to_tree(struct file *dir_file, |
2628 | struct inode *dir, ext4_lblk_t block, | 2638 | struct inode *dir, ext4_lblk_t block, |
2629 | struct dx_hash_info *hinfo, | 2639 | struct dx_hash_info *hinfo, |
2630 | __u32 start_hash, __u32 start_minor_hash, | 2640 | __u32 start_hash, __u32 start_minor_hash, |
2631 | int *has_inline_data); | 2641 | int *has_inline_data); |
2632 | extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, | 2642 | extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, |
2633 | const struct qstr *d_name, | 2643 | const struct qstr *d_name, |
2634 | struct ext4_dir_entry_2 **res_dir, | 2644 | struct ext4_dir_entry_2 **res_dir, |
2635 | int *has_inline_data); | 2645 | int *has_inline_data); |
2636 | extern int ext4_delete_inline_entry(handle_t *handle, | 2646 | extern int ext4_delete_inline_entry(handle_t *handle, |
2637 | struct inode *dir, | 2647 | struct inode *dir, |
2638 | struct ext4_dir_entry_2 *de_del, | 2648 | struct ext4_dir_entry_2 *de_del, |
2639 | struct buffer_head *bh, | 2649 | struct buffer_head *bh, |
2640 | int *has_inline_data); | 2650 | int *has_inline_data); |
2641 | extern int empty_inline_dir(struct inode *dir, int *has_inline_data); | 2651 | extern int empty_inline_dir(struct inode *dir, int *has_inline_data); |
2642 | extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, | 2652 | extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, |
2643 | struct ext4_dir_entry_2 **parent_de, | 2653 | struct ext4_dir_entry_2 **parent_de, |
2644 | int *retval); | 2654 | int *retval); |
2645 | extern int ext4_inline_data_fiemap(struct inode *inode, | 2655 | extern int ext4_inline_data_fiemap(struct inode *inode, |
2646 | struct fiemap_extent_info *fieinfo, | 2656 | struct fiemap_extent_info *fieinfo, |
2647 | int *has_inline, __u64 start, __u64 len); | 2657 | int *has_inline, __u64 start, __u64 len); |
2648 | extern int ext4_try_to_evict_inline_data(handle_t *handle, | 2658 | extern int ext4_try_to_evict_inline_data(handle_t *handle, |
2649 | struct inode *inode, | 2659 | struct inode *inode, |
2650 | int needed); | 2660 | int needed); |
2651 | extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline); | 2661 | extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline); |
2652 | 2662 | ||
2653 | extern int ext4_convert_inline_data(struct inode *inode); | 2663 | extern int ext4_convert_inline_data(struct inode *inode); |
2654 | 2664 | ||
2655 | static inline int ext4_has_inline_data(struct inode *inode) | 2665 | static inline int ext4_has_inline_data(struct inode *inode) |
2656 | { | 2666 | { |
2657 | return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) && | 2667 | return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) && |
2658 | EXT4_I(inode)->i_inline_off; | 2668 | EXT4_I(inode)->i_inline_off; |
2659 | } | 2669 | } |
2660 | 2670 | ||
2661 | /* namei.c */ | 2671 | /* namei.c */ |
2662 | extern const struct inode_operations ext4_dir_inode_operations; | 2672 | extern const struct inode_operations ext4_dir_inode_operations; |
2663 | extern const struct inode_operations ext4_special_inode_operations; | 2673 | extern const struct inode_operations ext4_special_inode_operations; |
2664 | extern struct dentry *ext4_get_parent(struct dentry *child); | 2674 | extern struct dentry *ext4_get_parent(struct dentry *child); |
2665 | extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, | 2675 | extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, |
2666 | struct ext4_dir_entry_2 *de, | 2676 | struct ext4_dir_entry_2 *de, |
2667 | int blocksize, int csum_size, | 2677 | int blocksize, int csum_size, |
2668 | unsigned int parent_ino, int dotdot_real_len); | 2678 | unsigned int parent_ino, int dotdot_real_len); |
2669 | extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t, | 2679 | extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t, |
2670 | unsigned int blocksize); | 2680 | unsigned int blocksize); |
2671 | extern int ext4_handle_dirty_dirent_node(handle_t *handle, | 2681 | extern int ext4_handle_dirty_dirent_node(handle_t *handle, |
2672 | struct inode *inode, | 2682 | struct inode *inode, |
2673 | struct buffer_head *bh); | 2683 | struct buffer_head *bh); |
2674 | #define S_SHIFT 12 | 2684 | #define S_SHIFT 12 |
2675 | static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { | 2685 | static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { |
2676 | [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, | 2686 | [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, |
2677 | [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, | 2687 | [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, |
2678 | [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, | 2688 | [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, |
2679 | [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, | 2689 | [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, |
2680 | [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, | 2690 | [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, |
2681 | [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, | 2691 | [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, |
2682 | [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, | 2692 | [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, |
2683 | }; | 2693 | }; |
2684 | 2694 | ||
2685 | static inline void ext4_set_de_type(struct super_block *sb, | 2695 | static inline void ext4_set_de_type(struct super_block *sb, |
2686 | struct ext4_dir_entry_2 *de, | 2696 | struct ext4_dir_entry_2 *de, |
2687 | umode_t mode) { | 2697 | umode_t mode) { |
2688 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE)) | 2698 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE)) |
2689 | de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; | 2699 | de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; |
2690 | } | 2700 | } |
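The table above turns the file-type bits of i_mode into the on-disk dirent type with a single array index: (mode & S_IFMT) >> S_SHIFT yields a small slot number. A minimal standalone sketch of the same mapping, with the classic ext2/ext4 on-disk type codes written out as literals and the FILETYPE feature check elided:

#include <assert.h>
#include <sys/stat.h>

/* Same idea as ext4_type_by_mode above: index by the mode-type bits. */
static const unsigned char type_by_mode[S_IFMT >> 12] = {
	[S_IFREG  >> 12] = 1,	/* EXT4_FT_REG_FILE */
	[S_IFDIR  >> 12] = 2,	/* EXT4_FT_DIR      */
	[S_IFCHR  >> 12] = 3,	/* EXT4_FT_CHRDEV   */
	[S_IFBLK  >> 12] = 4,	/* EXT4_FT_BLKDEV   */
	[S_IFIFO  >> 12] = 5,	/* EXT4_FT_FIFO     */
	[S_IFSOCK >> 12] = 6,	/* EXT4_FT_SOCK     */
	[S_IFLNK  >> 12] = 7,	/* EXT4_FT_SYMLINK  */
};

int main(void)
{
	/* S_IFDIR is 0040000, so (S_IFDIR & S_IFMT) >> 12 == 4 -> FT_DIR */
	assert(type_by_mode[(S_IFDIR & S_IFMT) >> 12] == 2);
	return 0;
}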
2691 | 2701 | ||
2692 | 2702 | ||
2693 | /* symlink.c */ | 2703 | /* symlink.c */ |
2694 | extern const struct inode_operations ext4_symlink_inode_operations; | 2704 | extern const struct inode_operations ext4_symlink_inode_operations; |
2695 | extern const struct inode_operations ext4_fast_symlink_inode_operations; | 2705 | extern const struct inode_operations ext4_fast_symlink_inode_operations; |
2696 | 2706 | ||
2697 | /* block_validity */ | 2707 | /* block_validity */ |
2698 | extern void ext4_release_system_zone(struct super_block *sb); | 2708 | extern void ext4_release_system_zone(struct super_block *sb); |
2699 | extern int ext4_setup_system_zone(struct super_block *sb); | 2709 | extern int ext4_setup_system_zone(struct super_block *sb); |
2700 | extern int __init ext4_init_system_zone(void); | 2710 | extern int __init ext4_init_system_zone(void); |
2701 | extern void ext4_exit_system_zone(void); | 2711 | extern void ext4_exit_system_zone(void); |
2702 | extern int ext4_data_block_valid(struct ext4_sb_info *sbi, | 2712 | extern int ext4_data_block_valid(struct ext4_sb_info *sbi, |
2703 | ext4_fsblk_t start_blk, | 2713 | ext4_fsblk_t start_blk, |
2704 | unsigned int count); | 2714 | unsigned int count); |
2705 | extern int ext4_check_blockref(const char *, unsigned int, | 2715 | extern int ext4_check_blockref(const char *, unsigned int, |
2706 | struct inode *, __le32 *, unsigned int); | 2716 | struct inode *, __le32 *, unsigned int); |
2707 | 2717 | ||
2708 | /* extents.c */ | 2718 | /* extents.c */ |
2709 | struct ext4_ext_path; | 2719 | struct ext4_ext_path; |
2710 | struct ext4_extent; | 2720 | struct ext4_extent; |
2711 | 2721 | ||
2712 | /* | 2722 | /* |
2713 | * Maximum number of logical blocks in a file; ext4_extent's ee_block is | 2723 | * Maximum number of logical blocks in a file; ext4_extent's ee_block is |
2714 | * __le32. | 2724 | * __le32. |
2715 | */ | 2725 | */ |
2716 | #define EXT_MAX_BLOCKS 0xffffffff | 2726 | #define EXT_MAX_BLOCKS 0xffffffff |
2717 | 2727 | ||
2718 | extern int ext4_ext_tree_init(handle_t *handle, struct inode *); | 2728 | extern int ext4_ext_tree_init(handle_t *handle, struct inode *); |
2719 | extern int ext4_ext_writepage_trans_blocks(struct inode *, int); | 2729 | extern int ext4_ext_writepage_trans_blocks(struct inode *, int); |
2720 | extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); | 2730 | extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); |
2721 | extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, | 2731 | extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, |
2722 | struct ext4_map_blocks *map, int flags); | 2732 | struct ext4_map_blocks *map, int flags); |
2723 | extern void ext4_ext_truncate(handle_t *, struct inode *); | 2733 | extern void ext4_ext_truncate(handle_t *, struct inode *); |
2724 | extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, | 2734 | extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, |
2725 | ext4_lblk_t end); | 2735 | ext4_lblk_t end); |
2726 | extern void ext4_ext_init(struct super_block *); | 2736 | extern void ext4_ext_init(struct super_block *); |
2727 | extern void ext4_ext_release(struct super_block *); | 2737 | extern void ext4_ext_release(struct super_block *); |
2728 | extern long ext4_fallocate(struct file *file, int mode, loff_t offset, | 2738 | extern long ext4_fallocate(struct file *file, int mode, loff_t offset, |
2729 | loff_t len); | 2739 | loff_t len); |
2730 | extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, | 2740 | extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, |
2731 | loff_t offset, ssize_t len); | 2741 | loff_t offset, ssize_t len); |
2732 | extern int ext4_map_blocks(handle_t *handle, struct inode *inode, | 2742 | extern int ext4_map_blocks(handle_t *handle, struct inode *inode, |
2733 | struct ext4_map_blocks *map, int flags); | 2743 | struct ext4_map_blocks *map, int flags); |
2734 | extern int ext4_ext_calc_metadata_amount(struct inode *inode, | 2744 | extern int ext4_ext_calc_metadata_amount(struct inode *inode, |
2735 | ext4_lblk_t lblocks); | 2745 | ext4_lblk_t lblocks); |
2736 | extern int ext4_extent_tree_init(handle_t *, struct inode *); | 2746 | extern int ext4_extent_tree_init(handle_t *, struct inode *); |
2737 | extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, | 2747 | extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, |
2738 | int num, | 2748 | int num, |
2739 | struct ext4_ext_path *path); | 2749 | struct ext4_ext_path *path); |
2740 | extern int ext4_can_extents_be_merged(struct inode *inode, | 2750 | extern int ext4_can_extents_be_merged(struct inode *inode, |
2741 | struct ext4_extent *ex1, | 2751 | struct ext4_extent *ex1, |
2742 | struct ext4_extent *ex2); | 2752 | struct ext4_extent *ex2); |
2743 | extern int ext4_ext_insert_extent(handle_t *, struct inode *, | 2753 | extern int ext4_ext_insert_extent(handle_t *, struct inode *, |
2744 | struct ext4_ext_path **, | 2754 | struct ext4_ext_path **, |
2745 | struct ext4_extent *, int); | 2755 | struct ext4_extent *, int); |
2746 | extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t, | 2756 | extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t, |
2747 | struct ext4_ext_path **, | 2757 | struct ext4_ext_path **, |
2748 | int flags); | 2758 | int flags); |
2749 | extern void ext4_ext_drop_refs(struct ext4_ext_path *); | 2759 | extern void ext4_ext_drop_refs(struct ext4_ext_path *); |
2750 | extern int ext4_ext_check_inode(struct inode *inode); | 2760 | extern int ext4_ext_check_inode(struct inode *inode); |
2751 | extern int ext4_find_delalloc_range(struct inode *inode, | 2761 | extern int ext4_find_delalloc_range(struct inode *inode, |
2752 | ext4_lblk_t lblk_start, | 2762 | ext4_lblk_t lblk_start, |
2753 | ext4_lblk_t lblk_end); | 2763 | ext4_lblk_t lblk_end); |
2754 | extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); | 2764 | extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); |
2755 | extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); | 2765 | extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); |
2756 | extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, | 2766 | extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, |
2757 | __u64 start, __u64 len); | 2767 | __u64 start, __u64 len); |
2758 | extern int ext4_ext_precache(struct inode *inode); | 2768 | extern int ext4_ext_precache(struct inode *inode); |
2759 | extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); | 2769 | extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); |
2760 | extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, | 2770 | extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, |
2761 | struct inode *inode2, ext4_lblk_t lblk1, | 2771 | struct inode *inode2, ext4_lblk_t lblk1, |
2762 | ext4_lblk_t lblk2, ext4_lblk_t count, | 2772 | ext4_lblk_t lblk2, ext4_lblk_t count, |
2763 | int mark_unwritten, int *err); | 2773 | int mark_unwritten, int *err); |
2764 | 2774 | ||
2765 | /* move_extent.c */ | 2775 | /* move_extent.c */ |
2766 | extern void ext4_double_down_write_data_sem(struct inode *first, | 2776 | extern void ext4_double_down_write_data_sem(struct inode *first, |
2767 | struct inode *second); | 2777 | struct inode *second); |
2768 | extern void ext4_double_up_write_data_sem(struct inode *orig_inode, | 2778 | extern void ext4_double_up_write_data_sem(struct inode *orig_inode, |
2769 | struct inode *donor_inode); | 2779 | struct inode *donor_inode); |
2770 | extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, | 2780 | extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, |
2771 | __u64 start_orig, __u64 start_donor, | 2781 | __u64 start_orig, __u64 start_donor, |
2772 | __u64 len, __u64 *moved_len); | 2782 | __u64 len, __u64 *moved_len); |
2773 | 2783 | ||
2774 | /* page-io.c */ | 2784 | /* page-io.c */ |
2775 | extern int __init ext4_init_pageio(void); | 2785 | extern int __init ext4_init_pageio(void); |
2776 | extern void ext4_exit_pageio(void); | 2786 | extern void ext4_exit_pageio(void); |
2777 | extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); | 2787 | extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); |
2778 | extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end); | 2788 | extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end); |
2779 | extern int ext4_put_io_end(ext4_io_end_t *io_end); | 2789 | extern int ext4_put_io_end(ext4_io_end_t *io_end); |
2780 | extern void ext4_put_io_end_defer(ext4_io_end_t *io_end); | 2790 | extern void ext4_put_io_end_defer(ext4_io_end_t *io_end); |
2781 | extern void ext4_io_submit_init(struct ext4_io_submit *io, | 2791 | extern void ext4_io_submit_init(struct ext4_io_submit *io, |
2782 | struct writeback_control *wbc); | 2792 | struct writeback_control *wbc); |
2783 | extern void ext4_end_io_rsv_work(struct work_struct *work); | 2793 | extern void ext4_end_io_rsv_work(struct work_struct *work); |
2784 | extern void ext4_io_submit(struct ext4_io_submit *io); | 2794 | extern void ext4_io_submit(struct ext4_io_submit *io); |
2785 | extern int ext4_bio_write_page(struct ext4_io_submit *io, | 2795 | extern int ext4_bio_write_page(struct ext4_io_submit *io, |
2786 | struct page *page, | 2796 | struct page *page, |
2787 | int len, | 2797 | int len, |
2788 | struct writeback_control *wbc, | 2798 | struct writeback_control *wbc, |
2789 | bool keep_towrite); | 2799 | bool keep_towrite); |
2790 | 2800 | ||
2791 | /* mmp.c */ | 2801 | /* mmp.c */ |
2792 | extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); | 2802 | extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); |
2793 | 2803 | ||
2794 | /* | 2804 | /* |
2795 | * Test whether block and inode bitmaps are properly initialized. With | 2805 | * Test whether block and inode bitmaps are properly initialized. With |
2796 | * uninit_bg, reading the block from disk is not enough to mark the | 2806 | * uninit_bg, reading the block from disk is not enough to mark the |
2797 | * bitmap uptodate; we also need to zero out the bitmap. | 2807 | * bitmap uptodate; we also need to zero out the bitmap. |
2798 | */ | 2808 | */ |
2799 | #define BH_BITMAP_UPTODATE BH_JBDPrivateStart | 2809 | #define BH_BITMAP_UPTODATE BH_JBDPrivateStart |
2800 | 2810 | ||
2801 | static inline int bitmap_uptodate(struct buffer_head *bh) | 2811 | static inline int bitmap_uptodate(struct buffer_head *bh) |
2802 | { | 2812 | { |
2803 | return (buffer_uptodate(bh) && | 2813 | return (buffer_uptodate(bh) && |
2804 | test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state)); | 2814 | test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state)); |
2805 | } | 2815 | } |
2806 | static inline void set_bitmap_uptodate(struct buffer_head *bh) | 2816 | static inline void set_bitmap_uptodate(struct buffer_head *bh) |
2807 | { | 2817 | { |
2808 | set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); | 2818 | set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); |
2809 | } | 2819 | } |
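A hedged sketch of the intended read-side pattern; read_group_bitmap() is an invented name, and the bare memset stands in for the real initialization, which also marks the group's own metadata blocks in use and runs under the group lock:

#include <linux/buffer_head.h>
#include <linux/string.h>

/* Sketch only: with uninit_bg the on-disk bitmap block may never have
 * been written, so buffer_uptodate() alone cannot be trusted. */
static struct buffer_head *read_group_bitmap(struct super_block *sb,
					     sector_t blk)
{
	struct buffer_head *bh = sb_bread(sb, blk);

	if (bh && !bitmap_uptodate(bh)) {
		memset(bh->b_data, 0, bh->b_size);	/* build in memory */
		set_bitmap_uptodate(bh);
	}
	return bh;
}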
2810 | 2820 | ||
2811 | /* | 2821 | /* |
2812 | * Disable the DIO read nolock optimization, so new DIO readers will be | 2822 | * Disable the DIO read nolock optimization, so new DIO readers will be |
2813 | * forced to grab i_mutex. | 2823 | * forced to grab i_mutex. |
2814 | */ | 2824 | */ |
2815 | static inline void ext4_inode_block_unlocked_dio(struct inode *inode) | 2825 | static inline void ext4_inode_block_unlocked_dio(struct inode *inode) |
2816 | { | 2826 | { |
2817 | ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK); | 2827 | ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK); |
2818 | smp_mb(); | 2828 | smp_mb(); |
2819 | } | 2829 | } |
2820 | static inline void ext4_inode_resume_unlocked_dio(struct inode *inode) | 2830 | static inline void ext4_inode_resume_unlocked_dio(struct inode *inode) |
2821 | { | 2831 | { |
2822 | smp_mb(); | 2832 | smp_mb(); |
2823 | ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK); | 2833 | ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK); |
2824 | } | 2834 | } |
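These two helpers bracket updates that must not race with lockless DIO readers; the smp_mb() orders the flag change against the readers' check of EXT4_STATE_DIOREAD_LOCK. A sketch of the typical calling pattern (the function name is invented; inode_dio_wait() is the generic VFS drain):

static void update_with_dio_drained(struct inode *inode)
{
	ext4_inode_block_unlocked_dio(inode);	/* new readers take the lock */
	inode_dio_wait(inode);			/* drain in-flight direct I/O */

	/* ... change state that lockless DIO readers must not observe ... */

	ext4_inode_resume_unlocked_dio(inode);	/* fast path allowed again */
}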
2825 | 2835 | ||
2826 | #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) | 2836 | #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) |
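in_range() tests membership in the closed interval [first, first + len - 1] (so it assumes len >= 1). A quick self-contained check of the boundary behaviour:

#include <assert.h>

#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)

int main(void)
{
	assert(in_range(12, 10, 3));	/* 3 blocks from 10 cover 10..12 */
	assert(!in_range(13, 10, 3));	/* one past the end of the range */
	return 0;
}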
2827 | 2837 | ||
2828 | /* For ioend & aio unwritten conversion wait queues */ | 2838 | /* For ioend & aio unwritten conversion wait queues */ |
2829 | #define EXT4_WQ_HASH_SZ 37 | 2839 | #define EXT4_WQ_HASH_SZ 37 |
2830 | #define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\ | 2840 | #define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\ |
2831 | EXT4_WQ_HASH_SZ]) | 2841 | EXT4_WQ_HASH_SZ]) |
2832 | #define ext4_aio_mutex(v) (&ext4__aio_mutex[((unsigned long)(v)) %\ | 2842 | #define ext4_aio_mutex(v) (&ext4__aio_mutex[((unsigned long)(v)) %\ |
2833 | EXT4_WQ_HASH_SZ]) | 2843 | EXT4_WQ_HASH_SZ]) |
2834 | extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; | 2844 | extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; |
2835 | extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; | 2845 | extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; |
2836 | 2846 | ||
2837 | #define EXT4_RESIZING 0 | 2847 | #define EXT4_RESIZING 0 |
2838 | extern int ext4_resize_begin(struct super_block *sb); | 2848 | extern int ext4_resize_begin(struct super_block *sb); |
2839 | extern void ext4_resize_end(struct super_block *sb); | 2849 | extern void ext4_resize_end(struct super_block *sb); |
2840 | 2850 | ||
2841 | #endif /* __KERNEL__ */ | 2851 | #endif /* __KERNEL__ */ |
2842 | 2852 | ||
2843 | #endif /* _EXT4_H */ | 2853 | #endif /* _EXT4_H */ |
2844 | 2854 |
fs/ext4/indirect.c
1 | /* | 1 | /* |
2 | * linux/fs/ext4/indirect.c | 2 | * linux/fs/ext4/indirect.c |
3 | * | 3 | * |
4 | * from | 4 | * from |
5 | * | 5 | * |
6 | * linux/fs/ext4/inode.c | 6 | * linux/fs/ext4/inode.c |
7 | * | 7 | * |
8 | * Copyright (C) 1992, 1993, 1994, 1995 | 8 | * Copyright (C) 1992, 1993, 1994, 1995 |
9 | * Remy Card (card@masi.ibp.fr) | 9 | * Remy Card (card@masi.ibp.fr) |
10 | * Laboratoire MASI - Institut Blaise Pascal | 10 | * Laboratoire MASI - Institut Blaise Pascal |
11 | * Universite Pierre et Marie Curie (Paris VI) | 11 | * Universite Pierre et Marie Curie (Paris VI) |
12 | * | 12 | * |
13 | * from | 13 | * from |
14 | * | 14 | * |
15 | * linux/fs/minix/inode.c | 15 | * linux/fs/minix/inode.c |
16 | * | 16 | * |
17 | * Copyright (C) 1991, 1992 Linus Torvalds | 17 | * Copyright (C) 1991, 1992 Linus Torvalds |
18 | * | 18 | * |
19 | * Goal-directed block allocation by Stephen Tweedie | 19 | * Goal-directed block allocation by Stephen Tweedie |
20 | * (sct@redhat.com), 1993, 1998 | 20 | * (sct@redhat.com), 1993, 1998 |
21 | */ | 21 | */ |
22 | 22 | ||
23 | #include <linux/aio.h> | 23 | #include <linux/aio.h> |
24 | #include "ext4_jbd2.h" | 24 | #include "ext4_jbd2.h" |
25 | #include "truncate.h" | 25 | #include "truncate.h" |
26 | 26 | ||
27 | #include <trace/events/ext4.h> | 27 | #include <trace/events/ext4.h> |
28 | 28 | ||
29 | typedef struct { | 29 | typedef struct { |
30 | __le32 *p; | 30 | __le32 *p; |
31 | __le32 key; | 31 | __le32 key; |
32 | struct buffer_head *bh; | 32 | struct buffer_head *bh; |
33 | } Indirect; | 33 | } Indirect; |
34 | 34 | ||
35 | static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) | 35 | static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) |
36 | { | 36 | { |
37 | p->key = *(p->p = v); | 37 | p->key = *(p->p = v); |
38 | p->bh = bh; | 38 | p->bh = bh; |
39 | } | 39 | } |
40 | 40 | ||
41 | /** | 41 | /** |
42 | * ext4_block_to_path - parse the block number into array of offsets | 42 | * ext4_block_to_path - parse the block number into array of offsets |
43 | * @inode: inode in question (we are only interested in its superblock) | 43 | * @inode: inode in question (we are only interested in its superblock) |
44 | * @i_block: block number to be parsed | 44 | * @i_block: block number to be parsed |
45 | * @offsets: array to store the offsets in | 45 | * @offsets: array to store the offsets in |
46 | * @boundary: set this non-zero if the referred-to block is likely to be | 46 | * @boundary: set this non-zero if the referred-to block is likely to be |
47 | * followed (on disk) by an indirect block. | 47 | * followed (on disk) by an indirect block. |
48 | * | 48 | * |
49 | * To store the locations of a file's data, ext4 uses a data structure | 49 | * To store the locations of a file's data, ext4 uses a data structure |
50 | * common to UNIX filesystems - a tree of pointers anchored in the inode, | 50 | * common to UNIX filesystems - a tree of pointers anchored in the inode, |
51 | * with data blocks at the leaves and indirect blocks in intermediate nodes. | 51 | * with data blocks at the leaves and indirect blocks in intermediate nodes. |
52 | * This function translates the block number into a path in that tree - the | 52 | * This function translates the block number into a path in that tree - the |
53 | * return value is the path length and @offsets[n] is the offset of the | 53 | * return value is the path length and @offsets[n] is the offset of the |
54 | * pointer to the (n+1)th node in the nth one. If @i_block is out of range | 54 | * pointer to the (n+1)th node in the nth one. If @i_block is out of range |
55 | * (negative or too large), a warning is printed and zero is returned. | 55 | * (negative or too large), a warning is printed and zero is returned. |
56 | * | 56 | * |
57 | * Note: function doesn't find node addresses, so no IO is needed. All | 57 | * Note: function doesn't find node addresses, so no IO is needed. All |
58 | * we need to know is the capacity of indirect blocks (taken from the | 58 | * we need to know is the capacity of indirect blocks (taken from the |
59 | * inode->i_sb). | 59 | * inode->i_sb). |
60 | */ | 60 | */ |
61 | 61 | ||
62 | /* | 62 | /* |
63 | * Portability note: the last comparison (check that we fit into triple | 63 | * Portability note: the last comparison (check that we fit into triple |
64 | * indirect block) is spelled differently, because otherwise on an | 64 | * indirect block) is spelled differently, because otherwise on an |
65 | * architecture with 32-bit longs and 8Kb pages we might get into trouble | 65 | * architecture with 32-bit longs and 8Kb pages we might get into trouble |
66 | * if our filesystem had 8Kb blocks. We might use long long, but that would | 66 | * if our filesystem had 8Kb blocks. We might use long long, but that would |
67 | * kill us on x86. Oh, well, at least the sign propagation does not matter - | 67 | * kill us on x86. Oh, well, at least the sign propagation does not matter - |
68 | * i_block would have to be negative in the very beginning, so we would not | 68 | * i_block would have to be negative in the very beginning, so we would not |
69 | * get there at all. | 69 | * get there at all. |
70 | */ | 70 | */ |
71 | 71 | ||
72 | static int ext4_block_to_path(struct inode *inode, | 72 | static int ext4_block_to_path(struct inode *inode, |
73 | ext4_lblk_t i_block, | 73 | ext4_lblk_t i_block, |
74 | ext4_lblk_t offsets[4], int *boundary) | 74 | ext4_lblk_t offsets[4], int *boundary) |
75 | { | 75 | { |
76 | int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); | 76 | int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); |
77 | int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); | 77 | int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); |
78 | const long direct_blocks = EXT4_NDIR_BLOCKS, | 78 | const long direct_blocks = EXT4_NDIR_BLOCKS, |
79 | indirect_blocks = ptrs, | 79 | indirect_blocks = ptrs, |
80 | double_blocks = (1 << (ptrs_bits * 2)); | 80 | double_blocks = (1 << (ptrs_bits * 2)); |
81 | int n = 0; | 81 | int n = 0; |
82 | int final = 0; | 82 | int final = 0; |
83 | 83 | ||
84 | if (i_block < direct_blocks) { | 84 | if (i_block < direct_blocks) { |
85 | offsets[n++] = i_block; | 85 | offsets[n++] = i_block; |
86 | final = direct_blocks; | 86 | final = direct_blocks; |
87 | } else if ((i_block -= direct_blocks) < indirect_blocks) { | 87 | } else if ((i_block -= direct_blocks) < indirect_blocks) { |
88 | offsets[n++] = EXT4_IND_BLOCK; | 88 | offsets[n++] = EXT4_IND_BLOCK; |
89 | offsets[n++] = i_block; | 89 | offsets[n++] = i_block; |
90 | final = ptrs; | 90 | final = ptrs; |
91 | } else if ((i_block -= indirect_blocks) < double_blocks) { | 91 | } else if ((i_block -= indirect_blocks) < double_blocks) { |
92 | offsets[n++] = EXT4_DIND_BLOCK; | 92 | offsets[n++] = EXT4_DIND_BLOCK; |
93 | offsets[n++] = i_block >> ptrs_bits; | 93 | offsets[n++] = i_block >> ptrs_bits; |
94 | offsets[n++] = i_block & (ptrs - 1); | 94 | offsets[n++] = i_block & (ptrs - 1); |
95 | final = ptrs; | 95 | final = ptrs; |
96 | } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { | 96 | } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { |
97 | offsets[n++] = EXT4_TIND_BLOCK; | 97 | offsets[n++] = EXT4_TIND_BLOCK; |
98 | offsets[n++] = i_block >> (ptrs_bits * 2); | 98 | offsets[n++] = i_block >> (ptrs_bits * 2); |
99 | offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); | 99 | offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); |
100 | offsets[n++] = i_block & (ptrs - 1); | 100 | offsets[n++] = i_block & (ptrs - 1); |
101 | final = ptrs; | 101 | final = ptrs; |
102 | } else { | 102 | } else { |
103 | ext4_warning(inode->i_sb, "block %lu > max in inode %lu", | 103 | ext4_warning(inode->i_sb, "block %lu > max in inode %lu", |
104 | i_block + direct_blocks + | 104 | i_block + direct_blocks + |
105 | indirect_blocks + double_blocks, inode->i_ino); | 105 | indirect_blocks + double_blocks, inode->i_ino); |
106 | } | 106 | } |
107 | if (boundary) | 107 | if (boundary) |
108 | *boundary = final - 1 - (i_block & (ptrs - 1)); | 108 | *boundary = final - 1 - (i_block & (ptrs - 1)); |
109 | return n; | 109 | return n; |
110 | } | 110 | } |
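Concretely, with 4 KiB blocks an indirect block holds 1024 pointers (ptrs_bits == 10), so on top of the 12 direct slots the single-indirect range covers logical blocks 12-1035 and the double-indirect range starts at 1036. A worked example for logical block 5000 under those assumptions:

#include <stdio.h>

/* i_block = 5000 with 12 direct slots and 1024 pointers per block;
 * slot 13 of i_data[] stands in for EXT4_DIND_BLOCK. */
int main(void)
{
	unsigned int b = 5000;

	b -= 12;	/* past the direct slots          -> 4988 */
	b -= 1024;	/* past the single-indirect range -> 3964 */
	printf("path: i_data[13] -> entry %u -> entry %u\n",
	       b >> 10, b & 1023);	/* entry 3, then entry 892 */
	return 0;
}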
111 | 111 | ||
112 | /** | 112 | /** |
113 | * ext4_get_branch - read the chain of indirect blocks leading to data | 113 | * ext4_get_branch - read the chain of indirect blocks leading to data |
114 | * @inode: inode in question | 114 | * @inode: inode in question |
115 | * @depth: depth of the chain (1 - direct pointer, etc.) | 115 | * @depth: depth of the chain (1 - direct pointer, etc.) |
116 | * @offsets: offsets of pointers in inode/indirect blocks | 116 | * @offsets: offsets of pointers in inode/indirect blocks |
117 | * @chain: place to store the result | 117 | * @chain: place to store the result |
118 | * @err: here we store the error value | 118 | * @err: here we store the error value |
119 | * | 119 | * |
120 | * Function fills the array of triples <key, p, bh> and returns %NULL | 120 | * Function fills the array of triples <key, p, bh> and returns %NULL |
121 | * if everything went OK or the pointer to the last filled triple | 121 | * if everything went OK or the pointer to the last filled triple |
122 | * (the incomplete one) otherwise. Upon return, chain[i].key contains | 122 | * (the incomplete one) otherwise. Upon return, chain[i].key contains |
123 | * the number of (i+1)-th block in the chain (as it is stored in memory, | 123 | * the number of (i+1)-th block in the chain (as it is stored in memory, |
124 | * i.e. little-endian 32-bit), chain[i].p contains the address of that | 124 | * i.e. little-endian 32-bit), chain[i].p contains the address of that |
125 | * number (it points into struct inode for i==0 and into the bh->b_data | 125 | * number (it points into struct inode for i==0 and into the bh->b_data |
126 | * for i>0) and chain[i].bh points to the buffer_head of i-th indirect | 126 | * for i>0) and chain[i].bh points to the buffer_head of i-th indirect |
127 | * block for i>0 and NULL for i==0. In other words, it holds the block | 127 | * block for i>0 and NULL for i==0. In other words, it holds the block |
128 | * numbers of the chain, addresses they were taken from (and where we can | 128 | * numbers of the chain, addresses they were taken from (and where we can |
129 | * verify that chain did not change) and buffer_heads hosting these | 129 | * verify that chain did not change) and buffer_heads hosting these |
130 | * numbers. | 130 | * numbers. |
131 | * | 131 | * |
132 | * Function stops when it stumbles upon zero pointer (absent block) | 132 | * Function stops when it stumbles upon zero pointer (absent block) |
133 | * (pointer to last triple returned, *@err == 0) | 133 | * (pointer to last triple returned, *@err == 0) |
134 | * or when it gets an IO error reading an indirect block | 134 | * or when it gets an IO error reading an indirect block |
135 | * (ditto, *@err == -EIO) | 135 | * (ditto, *@err == -EIO) |
136 | * or when it reads all @depth-1 indirect blocks successfully and finds | 136 | * or when it reads all @depth-1 indirect blocks successfully and finds |
137 | * the whole chain, all the way to the data (returns %NULL, *err == 0). | 137 | * the whole chain, all the way to the data (returns %NULL, *err == 0). |
138 | * | 138 | * |
139 | * Needs to be called with | 139 | * Needs to be called with |
140 | * down_read(&EXT4_I(inode)->i_data_sem) | 140 | * down_read(&EXT4_I(inode)->i_data_sem) |
141 | */ | 141 | */ |
142 | static Indirect *ext4_get_branch(struct inode *inode, int depth, | 142 | static Indirect *ext4_get_branch(struct inode *inode, int depth, |
143 | ext4_lblk_t *offsets, | 143 | ext4_lblk_t *offsets, |
144 | Indirect chain[4], int *err) | 144 | Indirect chain[4], int *err) |
145 | { | 145 | { |
146 | struct super_block *sb = inode->i_sb; | 146 | struct super_block *sb = inode->i_sb; |
147 | Indirect *p = chain; | 147 | Indirect *p = chain; |
148 | struct buffer_head *bh; | 148 | struct buffer_head *bh; |
149 | int ret = -EIO; | 149 | int ret = -EIO; |
150 | 150 | ||
151 | *err = 0; | 151 | *err = 0; |
152 | /* i_data is not going away, no lock needed */ | 152 | /* i_data is not going away, no lock needed */ |
153 | add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); | 153 | add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); |
154 | if (!p->key) | 154 | if (!p->key) |
155 | goto no_block; | 155 | goto no_block; |
156 | while (--depth) { | 156 | while (--depth) { |
157 | bh = sb_getblk(sb, le32_to_cpu(p->key)); | 157 | bh = sb_getblk(sb, le32_to_cpu(p->key)); |
158 | if (unlikely(!bh)) { | 158 | if (unlikely(!bh)) { |
159 | ret = -ENOMEM; | 159 | ret = -ENOMEM; |
160 | goto failure; | 160 | goto failure; |
161 | } | 161 | } |
162 | 162 | ||
163 | if (!bh_uptodate_or_lock(bh)) { | 163 | if (!bh_uptodate_or_lock(bh)) { |
164 | if (bh_submit_read(bh) < 0) { | 164 | if (bh_submit_read(bh) < 0) { |
165 | put_bh(bh); | 165 | put_bh(bh); |
166 | goto failure; | 166 | goto failure; |
167 | } | 167 | } |
168 | /* validate block references */ | 168 | /* validate block references */ |
169 | if (ext4_check_indirect_blockref(inode, bh)) { | 169 | if (ext4_check_indirect_blockref(inode, bh)) { |
170 | put_bh(bh); | 170 | put_bh(bh); |
171 | goto failure; | 171 | goto failure; |
172 | } | 172 | } |
173 | } | 173 | } |
174 | 174 | ||
175 | add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); | 175 | add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); |
176 | /* Reader: end */ | 176 | /* Reader: end */ |
177 | if (!p->key) | 177 | if (!p->key) |
178 | goto no_block; | 178 | goto no_block; |
179 | } | 179 | } |
180 | return NULL; | 180 | return NULL; |
181 | 181 | ||
182 | failure: | 182 | failure: |
183 | *err = ret; | 183 | *err = ret; |
184 | no_block: | 184 | no_block: |
185 | return p; | 185 | return p; |
186 | } | 186 | } |
187 | 187 | ||
188 | /** | 188 | /** |
189 | * ext4_find_near - find a place for allocation with sufficient locality | 189 | * ext4_find_near - find a place for allocation with sufficient locality |
190 | * @inode: owner | 190 | * @inode: owner |
191 | * @ind: descriptor of indirect block. | 191 | * @ind: descriptor of indirect block. |
192 | * | 192 | * |
193 | * This function returns the preferred place for block allocation. | 193 | * This function returns the preferred place for block allocation. |
194 | * It is used when the heuristic for sequential allocation fails. | 194 | * It is used when the heuristic for sequential allocation fails. |
195 | * Rules are: | 195 | * Rules are: |
196 | * + if there is a block to the left of our position - allocate near it. | 196 | * + if there is a block to the left of our position - allocate near it. |
197 | * + if pointer will live in indirect block - allocate near that block. | 197 | * + if pointer will live in indirect block - allocate near that block. |
198 | * + if pointer will live in inode - allocate in the same | 198 | * + if pointer will live in inode - allocate in the same |
199 | * cylinder group. | 199 | * cylinder group. |
200 | * | 200 | * |
201 | * In the latter case we colour the starting block by the caller's PID to | 201 | * In the latter case we colour the starting block by the caller's PID to |
202 | * prevent it from clashing with concurrent allocations for a different inode | 202 | * prevent it from clashing with concurrent allocations for a different inode |
203 | * in the same block group. The PID is used here so that functionally related | 203 | * in the same block group. The PID is used here so that functionally related |
204 | * files will be close-by on-disk. | 204 | * files will be close-by on-disk. |
205 | * | 205 | * |
206 | * Caller must make sure that @ind is valid and will stay that way. | 206 | * Caller must make sure that @ind is valid and will stay that way. |
207 | */ | 207 | */ |
208 | static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) | 208 | static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) |
209 | { | 209 | { |
210 | struct ext4_inode_info *ei = EXT4_I(inode); | 210 | struct ext4_inode_info *ei = EXT4_I(inode); |
211 | __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; | 211 | __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; |
212 | __le32 *p; | 212 | __le32 *p; |
213 | 213 | ||
214 | /* Try to find previous block */ | 214 | /* Try to find previous block */ |
215 | for (p = ind->p - 1; p >= start; p--) { | 215 | for (p = ind->p - 1; p >= start; p--) { |
216 | if (*p) | 216 | if (*p) |
217 | return le32_to_cpu(*p); | 217 | return le32_to_cpu(*p); |
218 | } | 218 | } |
219 | 219 | ||
220 | /* No such thing, so let's try location of indirect block */ | 220 | /* No such thing, so let's try location of indirect block */ |
221 | if (ind->bh) | 221 | if (ind->bh) |
222 | return ind->bh->b_blocknr; | 222 | return ind->bh->b_blocknr; |
223 | 223 | ||
224 | /* | 224 | /* |
225 | * It is going to be referred to from the inode itself? OK, just put it | 225 | * It is going to be referred to from the inode itself? OK, just put it |
226 | * into the same cylinder group then. | 226 | * into the same cylinder group then. |
227 | */ | 227 | */ |
228 | return ext4_inode_to_goal_block(inode); | 228 | return ext4_inode_to_goal_block(inode); |
229 | } | 229 | } |
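The first rule is just a backward scan over the pointer array to the left of the slot being filled. A standalone sketch of that scan (types simplified to host integers; the fallbacks to the indirect block's own location and to the inode goal are left to the caller):

#include <stdint.h>

/* Returns the nearest allocated block to the left of *slot, or 0 when
 * every entry from start up to slot is still unallocated. */
static uint32_t nearest_left_block(const uint32_t *start,
				   const uint32_t *slot)
{
	const uint32_t *p;

	for (p = slot - 1; p >= start; p--)
		if (*p)
			return *p;	/* allocate near this block */
	return 0;	/* caller falls back: indirect block, then goal */
}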
230 | 230 | ||
231 | /** | 231 | /** |
232 | * ext4_find_goal - find a preferred place for allocation. | 232 | * ext4_find_goal - find a preferred place for allocation. |
233 | * @inode: owner | 233 | * @inode: owner |
234 | * @block: block we want | 234 | * @block: block we want |
235 | * @partial: pointer to the last triple within a chain | 235 | * @partial: pointer to the last triple within a chain |
236 | * | 236 | * |
237 | * Normally this function finds the preferred place for block allocation | 237 | * Normally this function finds the preferred place for block allocation |
238 | * and returns it. | 238 | * and returns it. |
239 | * Because this is only used for non-extent files, we limit the block nr | 239 | * Because this is only used for non-extent files, we limit the block nr |
240 | * to 32 bits. | 240 | * to 32 bits. |
241 | */ | 241 | */ |
242 | static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, | 242 | static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, |
243 | Indirect *partial) | 243 | Indirect *partial) |
244 | { | 244 | { |
245 | ext4_fsblk_t goal; | 245 | ext4_fsblk_t goal; |
246 | 246 | ||
247 | /* | 247 | /* |
248 | * XXX need to get goal block from mballoc's data structures | 248 | * XXX need to get goal block from mballoc's data structures |
249 | */ | 249 | */ |
250 | 250 | ||
251 | goal = ext4_find_near(inode, partial); | 251 | goal = ext4_find_near(inode, partial); |
252 | goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; | 252 | goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; |
253 | return goal; | 253 | return goal; |
254 | } | 254 | } |
255 | 255 | ||
256 | /** | 256 | /** |
257 | * ext4_blks_to_allocate - Look up the block map and count the number | 257 | * ext4_blks_to_allocate - Look up the block map and count the number |
258 | * of direct blocks that need to be allocated for the given branch. | 258 | * of direct blocks that need to be allocated for the given branch. |
259 | * | 259 | * |
260 | * @branch: chain of indirect blocks | 260 | * @branch: chain of indirect blocks |
261 | * @k: number of blocks needed for indirect blocks | 261 | * @k: number of blocks needed for indirect blocks |
262 | * @blks: number of data blocks to be mapped. | 262 | * @blks: number of data blocks to be mapped. |
263 | * @blocks_to_boundary: the offset in the indirect block | 263 | * @blocks_to_boundary: the offset in the indirect block |
264 | * | 264 | * |
265 | * Return the total number of blocks to be allocated, including the | 265 | * Return the total number of blocks to be allocated, including the |
266 | * direct and indirect blocks. | 266 | * direct and indirect blocks. |
267 | */ | 267 | */ |
268 | static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, | 268 | static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, |
269 | int blocks_to_boundary) | 269 | int blocks_to_boundary) |
270 | { | 270 | { |
271 | unsigned int count = 0; | 271 | unsigned int count = 0; |
272 | 272 | ||
273 | /* | 273 | /* |
274 | * Simple case: if the [t,d]indirect block(s) have not been allocated | 274 | * Simple case: if the [t,d]indirect block(s) have not been allocated |
275 | * yet, then clearly the blocks on that path have not been allocated either. | 275 | * yet, then clearly the blocks on that path have not been allocated either. |
276 | */ | 276 | */ |
277 | if (k > 0) { | 277 | if (k > 0) { |
278 | /* right now we don't handle cross boundary allocation */ | 278 | /* right now we don't handle cross boundary allocation */ |
279 | if (blks < blocks_to_boundary + 1) | 279 | if (blks < blocks_to_boundary + 1) |
280 | count += blks; | 280 | count += blks; |
281 | else | 281 | else |
282 | count += blocks_to_boundary + 1; | 282 | count += blocks_to_boundary + 1; |
283 | return count; | 283 | return count; |
284 | } | 284 | } |
285 | 285 | ||
286 | count++; | 286 | count++; |
287 | while (count < blks && count <= blocks_to_boundary && | 287 | while (count < blks && count <= blocks_to_boundary && |
288 | le32_to_cpu(*(branch[0].p + count)) == 0) { | 288 | le32_to_cpu(*(branch[0].p + count)) == 0) { |
289 | count++; | 289 | count++; |
290 | } | 290 | } |
291 | return count; | 291 | return count; |
292 | } | 292 | } |
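The k > 0 branch clamps the request so one pass never allocates across an indirect-block boundary. A small self-check of that clamp with assumed inputs:

#include <assert.h>

/* Same clamp as the k > 0 case above: at most blocks_to_boundary + 1
 * data blocks are counted in one round. */
static unsigned int clamp_to_boundary(unsigned int blks,
				      int blocks_to_boundary)
{
	unsigned int limit = (unsigned int)blocks_to_boundary + 1;

	return blks < limit ? blks : limit;
}

int main(void)
{
	assert(clamp_to_boundary(8, 3) == 4);	/* boundary 3 slots away */
	assert(clamp_to_boundary(2, 3) == 2);	/* request fits entirely */
	return 0;
}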
293 | 293 | ||
294 | /** | 294 | /** |
295 | * ext4_alloc_branch - allocate and set up a chain of blocks. | 295 | * ext4_alloc_branch - allocate and set up a chain of blocks. |
296 | * @handle: handle for this transaction | 296 | * @handle: handle for this transaction |
297 | * @inode: owner | 297 | * @inode: owner |
298 | * @indirect_blks: number of allocated indirect blocks | 298 | * @indirect_blks: number of allocated indirect blocks |
299 | * @blks: number of allocated direct blocks | 299 | * @blks: number of allocated direct blocks |
300 | * @goal: preferred place for allocation | 300 | * @goal: preferred place for allocation |
301 | * @offsets: offsets (in the blocks) to store the pointers to next. | 301 | * @offsets: offsets (in the blocks) to store the pointers to next. |
302 | * @branch: place to store the chain in. | 302 | * @branch: place to store the chain in. |
303 | * | 303 | * |
304 | * This function allocates blocks, zeroes out all but the last one, | 304 | * This function allocates blocks, zeroes out all but the last one, |
305 | * links them into a chain and (if we are synchronous) writes them to disk. | 305 | * links them into a chain and (if we are synchronous) writes them to disk. |
306 | * In other words, it prepares a branch that can be spliced onto the | 306 | * In other words, it prepares a branch that can be spliced onto the |
307 | * inode. It stores the information about that chain in the branch[], in | 307 | * inode. It stores the information about that chain in the branch[], in |
308 | * the same format as ext4_get_branch() would do. We call it after we | 308 | * the same format as ext4_get_branch() would do. We call it after we |
309 | * have read the existing part of the chain; partial points to the last | 309 | * have read the existing part of the chain; partial points to the last |
310 | * triple of that (the one with a zero ->key). Upon exit we have the same | 310 | * triple of that (the one with a zero ->key). Upon exit we have the same |
311 | * picture as after the successful ext4_get_block(), except that in one | 311 | * picture as after the successful ext4_get_block(), except that in one |
312 | * place chain is disconnected - *branch->p is still zero (we did not | 312 | * place chain is disconnected - *branch->p is still zero (we did not |
313 | * set the last link), but branch->key contains the number that should | 313 | * set the last link), but branch->key contains the number that should |
314 | * be placed into *branch->p to fill that gap. | 314 | * be placed into *branch->p to fill that gap. |
315 | * | 315 | * |
316 | * If allocation fails we free all blocks we've allocated (and forget | 316 | * If allocation fails we free all blocks we've allocated (and forget |
317 | * their buffer_heads) and return the error value from the failed | 317 | * their buffer_heads) and return the error value from the failed |
318 | * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain | 318 | * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain |
319 | * as described above and return 0. | 319 | * as described above and return 0. |
320 | */ | 320 | */ |
321 | static int ext4_alloc_branch(handle_t *handle, | 321 | static int ext4_alloc_branch(handle_t *handle, |
322 | struct ext4_allocation_request *ar, | 322 | struct ext4_allocation_request *ar, |
323 | int indirect_blks, ext4_lblk_t *offsets, | 323 | int indirect_blks, ext4_lblk_t *offsets, |
324 | Indirect *branch) | 324 | Indirect *branch) |
325 | { | 325 | { |
326 | struct buffer_head * bh; | 326 | struct buffer_head * bh; |
327 | ext4_fsblk_t b, new_blocks[4]; | 327 | ext4_fsblk_t b, new_blocks[4]; |
328 | __le32 *p; | 328 | __le32 *p; |
329 | int i, j, err, len = 1; | 329 | int i, j, err, len = 1; |
330 | 330 | ||
331 | for (i = 0; i <= indirect_blks; i++) { | 331 | for (i = 0; i <= indirect_blks; i++) { |
332 | if (i == indirect_blks) { | 332 | if (i == indirect_blks) { |
333 | new_blocks[i] = ext4_mb_new_blocks(handle, ar, &err); | 333 | new_blocks[i] = ext4_mb_new_blocks(handle, ar, &err); |
334 | } else | 334 | } else |
335 | ar->goal = new_blocks[i] = ext4_new_meta_blocks(handle, | 335 | ar->goal = new_blocks[i] = ext4_new_meta_blocks(handle, |
336 | ar->inode, ar->goal, | 336 | ar->inode, ar->goal, |
337 | ar->flags & EXT4_MB_DELALLOC_RESERVED, | 337 | ar->flags & EXT4_MB_DELALLOC_RESERVED, |
338 | NULL, &err); | 338 | NULL, &err); |
339 | if (err) { | 339 | if (err) { |
340 | i--; | 340 | i--; |
341 | goto failed; | 341 | goto failed; |
342 | } | 342 | } |
343 | branch[i].key = cpu_to_le32(new_blocks[i]); | 343 | branch[i].key = cpu_to_le32(new_blocks[i]); |
344 | if (i == 0) | 344 | if (i == 0) |
345 | continue; | 345 | continue; |
346 | 346 | ||
347 | bh = branch[i].bh = sb_getblk(ar->inode->i_sb, new_blocks[i-1]); | 347 | bh = branch[i].bh = sb_getblk(ar->inode->i_sb, new_blocks[i-1]); |
348 | if (unlikely(!bh)) { | 348 | if (unlikely(!bh)) { |
349 | err = -ENOMEM; | 349 | err = -ENOMEM; |
350 | goto failed; | 350 | goto failed; |
351 | } | 351 | } |
352 | lock_buffer(bh); | 352 | lock_buffer(bh); |
353 | BUFFER_TRACE(bh, "call get_create_access"); | 353 | BUFFER_TRACE(bh, "call get_create_access"); |
354 | err = ext4_journal_get_create_access(handle, bh); | 354 | err = ext4_journal_get_create_access(handle, bh); |
355 | if (err) { | 355 | if (err) { |
356 | unlock_buffer(bh); | 356 | unlock_buffer(bh); |
357 | goto failed; | 357 | goto failed; |
358 | } | 358 | } |
359 | 359 | ||
360 | memset(bh->b_data, 0, bh->b_size); | 360 | memset(bh->b_data, 0, bh->b_size); |
361 | p = branch[i].p = (__le32 *) bh->b_data + offsets[i]; | 361 | p = branch[i].p = (__le32 *) bh->b_data + offsets[i]; |
362 | b = new_blocks[i]; | 362 | b = new_blocks[i]; |
363 | 363 | ||
364 | if (i == indirect_blks) | 364 | if (i == indirect_blks) |
365 | len = ar->len; | 365 | len = ar->len; |
366 | for (j = 0; j < len; j++) | 366 | for (j = 0; j < len; j++) |
367 | *p++ = cpu_to_le32(b++); | 367 | *p++ = cpu_to_le32(b++); |
368 | 368 | ||
369 | BUFFER_TRACE(bh, "marking uptodate"); | 369 | BUFFER_TRACE(bh, "marking uptodate"); |
370 | set_buffer_uptodate(bh); | 370 | set_buffer_uptodate(bh); |
371 | unlock_buffer(bh); | 371 | unlock_buffer(bh); |
372 | 372 | ||
373 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); | 373 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); |
374 | err = ext4_handle_dirty_metadata(handle, ar->inode, bh); | 374 | err = ext4_handle_dirty_metadata(handle, ar->inode, bh); |
375 | if (err) | 375 | if (err) |
376 | goto failed; | 376 | goto failed; |
377 | } | 377 | } |
378 | return 0; | 378 | return 0; |
379 | failed: | 379 | failed: |
380 | for (; i >= 0; i--) { | 380 | for (; i >= 0; i--) { |
381 | /* | 381 | /* |
382 | * We want to ext4_forget() only freshly allocated indirect | 382 | * We want to ext4_forget() only freshly allocated indirect |
383 | * blocks. Buffer for new_blocks[i-1] is at branch[i].bh and | 383 | * blocks. Buffer for new_blocks[i-1] is at branch[i].bh and |
384 | * buffer at branch[0].bh is indirect block / inode already | 384 | * buffer at branch[0].bh is indirect block / inode already |
385 | * existing before ext4_alloc_branch() was called. | 385 | * existing before ext4_alloc_branch() was called. |
386 | */ | 386 | */ |
387 | if (i > 0 && i != indirect_blks && branch[i].bh) | 387 | if (i > 0 && i != indirect_blks && branch[i].bh) |
388 | ext4_forget(handle, 1, ar->inode, branch[i].bh, | 388 | ext4_forget(handle, 1, ar->inode, branch[i].bh, |
389 | branch[i].bh->b_blocknr); | 389 | branch[i].bh->b_blocknr); |
390 | ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i], | 390 | ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i], |
391 | (i == indirect_blks) ? ar->len : 1, 0); | 391 | (i == indirect_blks) ? ar->len : 1, 0); |
392 | } | 392 | } |
393 | return err; | 393 | return err; |
394 | } | 394 | } |
395 | 395 | ||
396 | /** | 396 | /** |
397 | * ext4_splice_branch - splice the allocated branch onto inode. | 397 | * ext4_splice_branch - splice the allocated branch onto inode. |
398 | * @handle: handle for this transaction | 398 | * @handle: handle for this transaction |
399 | * @inode: owner | 399 | * @inode: owner |
400 | * @block: (logical) number of block we are adding | 400 | * @block: (logical) number of block we are adding |
401 | * @chain: chain of indirect blocks (with a missing link - see | 401 | * @chain: chain of indirect blocks (with a missing link - see |
402 | * ext4_alloc_branch) | 402 | * ext4_alloc_branch) |
403 | * @where: location of missing link | 403 | * @where: location of missing link |
404 | * @num: number of indirect blocks we are adding | 404 | * @num: number of indirect blocks we are adding |
405 | * @blks: number of direct blocks we are adding | 405 | * @blks: number of direct blocks we are adding |
406 | * | 406 | * |
407 | * This function fills the missing link and does all housekeeping needed in | 407 | * This function fills the missing link and does all housekeeping needed in |
408 | * inode (->i_blocks, etc.). In case of success we end up with the full | 408 | * inode (->i_blocks, etc.). In case of success we end up with the full |
409 | * chain to new block and return 0. | 409 | * chain to new block and return 0. |
410 | */ | 410 | */ |
411 | static int ext4_splice_branch(handle_t *handle, | 411 | static int ext4_splice_branch(handle_t *handle, |
412 | struct ext4_allocation_request *ar, | 412 | struct ext4_allocation_request *ar, |
413 | Indirect *where, int num) | 413 | Indirect *where, int num) |
414 | { | 414 | { |
415 | int i; | 415 | int i; |
416 | int err = 0; | 416 | int err = 0; |
417 | ext4_fsblk_t current_block; | 417 | ext4_fsblk_t current_block; |
418 | 418 | ||
419 | /* | 419 | /* |
420 | * If we're splicing into a [td]indirect block (as opposed to the | 420 | * If we're splicing into a [td]indirect block (as opposed to the |
421 | * inode) then we need to get write access to the [td]indirect block | 421 | * inode) then we need to get write access to the [td]indirect block |
422 | * before the splice. | 422 | * before the splice. |
423 | */ | 423 | */ |
424 | if (where->bh) { | 424 | if (where->bh) { |
425 | BUFFER_TRACE(where->bh, "get_write_access"); | 425 | BUFFER_TRACE(where->bh, "get_write_access"); |
426 | err = ext4_journal_get_write_access(handle, where->bh); | 426 | err = ext4_journal_get_write_access(handle, where->bh); |
427 | if (err) | 427 | if (err) |
428 | goto err_out; | 428 | goto err_out; |
429 | } | 429 | } |
430 | /* That's it */ | 430 | /* That's it */ |
431 | 431 | ||
432 | *where->p = where->key; | 432 | *where->p = where->key; |
433 | 433 | ||
434 | /* | 434 | /* |
435 | * Update the host buffer_head or inode to point to the just-allocated | 435 | * Update the host buffer_head or inode to point to the just-allocated |
436 | * direct blocks. | 436 | * direct blocks. |
437 | */ | 437 | */ |
438 | if (num == 0 && ar->len > 1) { | 438 | if (num == 0 && ar->len > 1) { |
439 | current_block = le32_to_cpu(where->key) + 1; | 439 | current_block = le32_to_cpu(where->key) + 1; |
440 | for (i = 1; i < ar->len; i++) | 440 | for (i = 1; i < ar->len; i++) |
441 | *(where->p + i) = cpu_to_le32(current_block++); | 441 | *(where->p + i) = cpu_to_le32(current_block++); |
442 | } | 442 | } |
443 | 443 | ||
444 | /* We are done with atomic stuff, now do the rest of housekeeping */ | 444 | /* We are done with atomic stuff, now do the rest of housekeeping */ |
445 | /* had we spliced it onto indirect block? */ | 445 | /* had we spliced it onto indirect block? */ |
446 | if (where->bh) { | 446 | if (where->bh) { |
447 | /* | 447 | /* |
448 | * If we spliced it onto an indirect block, we haven't | 448 | * If we spliced it onto an indirect block, we haven't |
449 | * altered the inode. Note however that if it is being spliced | 449 | * altered the inode. Note however that if it is being spliced |
450 | * onto an indirect block at the very end of the file (the | 450 | * onto an indirect block at the very end of the file (the |
451 | * file is growing) then we *will* alter the inode to reflect | 451 | * file is growing) then we *will* alter the inode to reflect |
452 | * the new i_size. But that is not done here - it is done in | 452 | * the new i_size. But that is not done here - it is done in |
453 | * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. | 453 | * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. |
454 | */ | 454 | */ |
455 | jbd_debug(5, "splicing indirect only\n"); | 455 | jbd_debug(5, "splicing indirect only\n"); |
456 | BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); | 456 | BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); |
457 | err = ext4_handle_dirty_metadata(handle, ar->inode, where->bh); | 457 | err = ext4_handle_dirty_metadata(handle, ar->inode, where->bh); |
458 | if (err) | 458 | if (err) |
459 | goto err_out; | 459 | goto err_out; |
460 | } else { | 460 | } else { |
461 | /* | 461 | /* |
462 | * OK, we spliced it into the inode itself on a direct block. | 462 | * OK, we spliced it into the inode itself on a direct block. |
463 | */ | 463 | */ |
464 | ext4_mark_inode_dirty(handle, ar->inode); | 464 | ext4_mark_inode_dirty(handle, ar->inode); |
465 | jbd_debug(5, "splicing direct\n"); | 465 | jbd_debug(5, "splicing direct\n"); |
466 | } | 466 | } |
467 | return err; | 467 | return err; |
468 | 468 | ||
469 | err_out: | 469 | err_out: |
470 | for (i = 1; i <= num; i++) { | 470 | for (i = 1; i <= num; i++) { |
471 | /* | 471 | /* |
472 | * branch[i].bh is newly allocated, so there is no | 472 | * branch[i].bh is newly allocated, so there is no |
473 | * need to revoke the block, which is why we don't | 473 | * need to revoke the block, which is why we don't |
474 | * need to set EXT4_FREE_BLOCKS_METADATA. | 474 | * need to set EXT4_FREE_BLOCKS_METADATA. |
475 | */ | 475 | */ |
476 | ext4_free_blocks(handle, ar->inode, where[i].bh, 0, 1, | 476 | ext4_free_blocks(handle, ar->inode, where[i].bh, 0, 1, |
477 | EXT4_FREE_BLOCKS_FORGET); | 477 | EXT4_FREE_BLOCKS_FORGET); |
478 | } | 478 | } |
479 | ext4_free_blocks(handle, ar->inode, NULL, le32_to_cpu(where[num].key), | 479 | ext4_free_blocks(handle, ar->inode, NULL, le32_to_cpu(where[num].key), |
480 | ar->len, 0); | 480 | ar->len, 0); |
481 | 481 | ||
482 | return err; | 482 | return err; |
483 | } | 483 | } |
484 | 484 | ||
485 | /* | 485 | /* |
486 | * The ext4_ind_map_blocks() function handles non-extents inodes | 486 | * The ext4_ind_map_blocks() function handles non-extents inodes |
487 | * (i.e., using the traditional indirect/double-indirect i_blocks | 487 | * (i.e., using the traditional indirect/double-indirect i_blocks |
488 | * scheme) for ext4_map_blocks(). | 488 | * scheme) for ext4_map_blocks(). |
489 | * | 489 | * |
490 | * Allocation strategy is simple: if we have to allocate something, we will | 490 | * Allocation strategy is simple: if we have to allocate something, we will |
491 | * have to go the whole way to the leaf. So let's do it before attaching anything | 491 | * have to go the whole way to the leaf. So let's do it before attaching anything |
492 | * to the tree, set linkage between the newborn blocks, write them if sync is | 492 | * to the tree, set linkage between the newborn blocks, write them if sync is |
493 | * required, recheck the path, free and repeat if check fails, otherwise | 493 | * required, recheck the path, free and repeat if check fails, otherwise |
494 | * set the last missing link (that will protect us from any truncate-generated | 494 | * set the last missing link (that will protect us from any truncate-generated |
495 | * removals - all blocks on the path are immune now) and possibly force the | 495 | * removals - all blocks on the path are immune now) and possibly force the |
496 | * write on the parent block. | 496 | * write on the parent block. |
497 | * That has a nice additional property: no special recovery from the failed | 497 | * That has a nice additional property: no special recovery from the failed |
498 | * allocations is needed - we simply release blocks and do not touch anything | 498 | * allocations is needed - we simply release blocks and do not touch anything |
499 | * reachable from inode. | 499 | * reachable from inode. |
500 | * | 500 | * |
501 | * `handle' can be NULL if create == 0. | 501 | * `handle' can be NULL if create == 0. |
502 | * | 502 | * |
503 | * return > 0, # of blocks mapped or allocated. | 503 | * return > 0, # of blocks mapped or allocated. |
504 | * return = 0, if plain lookup failed. | 504 | * return = 0, if plain lookup failed. |
505 | * return < 0, error case. | 505 | * return < 0, error case. |
506 | * | 506 | * |
507 | * The ext4_ind_map_blocks() function should be called with | 507 | * The ext4_ind_map_blocks() function should be called with |
508 | * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem | 508 | * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem |
509 | * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or | 509 | * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or |
510 | * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system | 510 | * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system |
511 | * blocks. | 511 | * blocks. |
512 | */ | 512 | */ |
513 | int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, | 513 | int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, |
514 | struct ext4_map_blocks *map, | 514 | struct ext4_map_blocks *map, |
515 | int flags) | 515 | int flags) |
516 | { | 516 | { |
517 | struct ext4_allocation_request ar; | 517 | struct ext4_allocation_request ar; |
518 | int err = -EIO; | 518 | int err = -EIO; |
519 | ext4_lblk_t offsets[4]; | 519 | ext4_lblk_t offsets[4]; |
520 | Indirect chain[4]; | 520 | Indirect chain[4]; |
521 | Indirect *partial; | 521 | Indirect *partial; |
522 | int indirect_blks; | 522 | int indirect_blks; |
523 | int blocks_to_boundary = 0; | 523 | int blocks_to_boundary = 0; |
524 | int depth; | 524 | int depth; |
525 | int count = 0; | 525 | int count = 0; |
526 | ext4_fsblk_t first_block = 0; | 526 | ext4_fsblk_t first_block = 0; |
527 | 527 | ||
528 | trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); | 528 | trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); |
529 | J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); | 529 | J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); |
530 | J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); | 530 | J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); |
531 | depth = ext4_block_to_path(inode, map->m_lblk, offsets, | 531 | depth = ext4_block_to_path(inode, map->m_lblk, offsets, |
532 | &blocks_to_boundary); | 532 | &blocks_to_boundary); |
533 | 533 | ||
534 | if (depth == 0) | 534 | if (depth == 0) |
535 | goto out; | 535 | goto out; |
536 | 536 | ||
537 | partial = ext4_get_branch(inode, depth, offsets, chain, &err); | 537 | partial = ext4_get_branch(inode, depth, offsets, chain, &err); |
538 | 538 | ||
539 | /* Simplest case - block found, no allocation needed */ | 539 | /* Simplest case - block found, no allocation needed */ |
540 | if (!partial) { | 540 | if (!partial) { |
541 | first_block = le32_to_cpu(chain[depth - 1].key); | 541 | first_block = le32_to_cpu(chain[depth - 1].key); |
542 | count++; | 542 | count++; |
543 | /*map more blocks*/ | 543 | /*map more blocks*/ |
544 | while (count < map->m_len && count <= blocks_to_boundary) { | 544 | while (count < map->m_len && count <= blocks_to_boundary) { |
545 | ext4_fsblk_t blk; | 545 | ext4_fsblk_t blk; |
546 | 546 | ||
547 | blk = le32_to_cpu(*(chain[depth-1].p + count)); | 547 | blk = le32_to_cpu(*(chain[depth-1].p + count)); |
548 | 548 | ||
549 | if (blk == first_block + count) | 549 | if (blk == first_block + count) |
550 | count++; | 550 | count++; |
551 | else | 551 | else |
552 | break; | 552 | break; |
553 | } | 553 | } |
554 | goto got_it; | 554 | goto got_it; |
555 | } | 555 | } |
556 | 556 | ||
557 | /* Next simple case - plain lookup or failed read of indirect block */ | 557 | /* Next simple case - plain lookup or failed read of indirect block */ |
558 | if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) | 558 | if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) |
559 | goto cleanup; | 559 | goto cleanup; |
560 | 560 | ||
561 | /* | 561 | /* |
562 | * Okay, we need to do block allocation. | 562 | * Okay, we need to do block allocation. |
563 | */ | 563 | */ |
564 | if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, | 564 | if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, |
565 | EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { | 565 | EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { |
566 | EXT4_ERROR_INODE(inode, "Can't allocate blocks for " | 566 | EXT4_ERROR_INODE(inode, "Can't allocate blocks for " |
567 | "non-extent mapped inodes with bigalloc"); | 567 | "non-extent mapped inodes with bigalloc"); |
568 | return -ENOSPC; | 568 | return -ENOSPC; |
569 | } | 569 | } |
570 | 570 | ||
571 | /* Set up for the direct block allocation */ | 571 | /* Set up for the direct block allocation */ |
572 | memset(&ar, 0, sizeof(ar)); | 572 | memset(&ar, 0, sizeof(ar)); |
573 | ar.inode = inode; | 573 | ar.inode = inode; |
574 | ar.logical = map->m_lblk; | 574 | ar.logical = map->m_lblk; |
575 | if (S_ISREG(inode->i_mode)) | 575 | if (S_ISREG(inode->i_mode)) |
576 | ar.flags = EXT4_MB_HINT_DATA; | 576 | ar.flags = EXT4_MB_HINT_DATA; |
577 | if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) | 577 | if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) |
578 | ar.flags |= EXT4_MB_DELALLOC_RESERVED; | 578 | ar.flags |= EXT4_MB_DELALLOC_RESERVED; |
579 | 579 | ||
580 | ar.goal = ext4_find_goal(inode, map->m_lblk, partial); | 580 | ar.goal = ext4_find_goal(inode, map->m_lblk, partial); |
581 | 581 | ||
582 | /* the number of blocks need to allocate for [d,t]indirect blocks */ | 582 | /* the number of blocks need to allocate for [d,t]indirect blocks */ |
583 | indirect_blks = (chain + depth) - partial - 1; | 583 | indirect_blks = (chain + depth) - partial - 1; |
584 | 584 | ||
585 | /* | 585 | /* |
586 | * Next look up the indirect map to count the totoal number of | 586 | * Next look up the indirect map to count the totoal number of |
587 | * direct blocks to allocate for this branch. | 587 | * direct blocks to allocate for this branch. |
588 | */ | 588 | */ |
589 | ar.len = ext4_blks_to_allocate(partial, indirect_blks, | 589 | ar.len = ext4_blks_to_allocate(partial, indirect_blks, |
590 | map->m_len, blocks_to_boundary); | 590 | map->m_len, blocks_to_boundary); |
591 | 591 | ||
592 | /* | 592 | /* |
593 | * Block out ext4_truncate while we alter the tree | 593 | * Block out ext4_truncate while we alter the tree |
594 | */ | 594 | */ |
595 | err = ext4_alloc_branch(handle, &ar, indirect_blks, | 595 | err = ext4_alloc_branch(handle, &ar, indirect_blks, |
596 | offsets + (partial - chain), partial); | 596 | offsets + (partial - chain), partial); |
597 | 597 | ||
598 | /* | 598 | /* |
599 | * The ext4_splice_branch call will free and forget any buffers | 599 | * The ext4_splice_branch call will free and forget any buffers |
600 | * on the new chain if there is a failure, but that risks using | 600 | * on the new chain if there is a failure, but that risks using |
601 | * up transaction credits, especially for bitmaps where the | 601 | * up transaction credits, especially for bitmaps where the |
602 | * credits cannot be returned. Can we handle this somehow? We | 602 | * credits cannot be returned. Can we handle this somehow? We |
603 | * may need to return -EAGAIN upwards in the worst case. --sct | 603 | * may need to return -EAGAIN upwards in the worst case. --sct |
604 | */ | 604 | */ |
605 | if (!err) | 605 | if (!err) |
606 | err = ext4_splice_branch(handle, &ar, partial, indirect_blks); | 606 | err = ext4_splice_branch(handle, &ar, partial, indirect_blks); |
607 | if (err) | 607 | if (err) |
608 | goto cleanup; | 608 | goto cleanup; |
609 | 609 | ||
610 | map->m_flags |= EXT4_MAP_NEW; | 610 | map->m_flags |= EXT4_MAP_NEW; |
611 | 611 | ||
612 | ext4_update_inode_fsync_trans(handle, inode, 1); | 612 | ext4_update_inode_fsync_trans(handle, inode, 1); |
613 | count = ar.len; | 613 | count = ar.len; |
614 | got_it: | 614 | got_it: |
615 | map->m_flags |= EXT4_MAP_MAPPED; | 615 | map->m_flags |= EXT4_MAP_MAPPED; |
616 | map->m_pblk = le32_to_cpu(chain[depth-1].key); | 616 | map->m_pblk = le32_to_cpu(chain[depth-1].key); |
617 | map->m_len = count; | 617 | map->m_len = count; |
618 | if (count > blocks_to_boundary) | 618 | if (count > blocks_to_boundary) |
619 | map->m_flags |= EXT4_MAP_BOUNDARY; | 619 | map->m_flags |= EXT4_MAP_BOUNDARY; |
620 | err = count; | 620 | err = count; |
621 | /* Clean up and exit */ | 621 | /* Clean up and exit */ |
622 | partial = chain + depth - 1; /* the whole chain */ | 622 | partial = chain + depth - 1; /* the whole chain */ |
623 | cleanup: | 623 | cleanup: |
624 | while (partial > chain) { | 624 | while (partial > chain) { |
625 | BUFFER_TRACE(partial->bh, "call brelse"); | 625 | BUFFER_TRACE(partial->bh, "call brelse"); |
626 | brelse(partial->bh); | 626 | brelse(partial->bh); |
627 | partial--; | 627 | partial--; |
628 | } | 628 | } |
629 | out: | 629 | out: |
630 | trace_ext4_ind_map_blocks_exit(inode, flags, map, err); | 630 | trace_ext4_ind_map_blocks_exit(inode, flags, map, err); |
631 | return err; | 631 | return err; |
632 | } | 632 | } |
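
/*
 * Illustrative caller sketch (not code from this file; it only restates
 * the locking rules documented above): a lookup-only mapping would look
 * roughly like
 *
 *	struct ext4_map_blocks map = { .m_lblk = lblk, .m_len = 1 };
 *
 *	down_read(&EXT4_I(inode)->i_data_sem);
 *	ret = ext4_ind_map_blocks(NULL, inode, &map, 0);
 *	up_read(&EXT4_I(inode)->i_data_sem);
 *
 * where ret > 0 means map.m_pblk holds the first mapped physical block
 * and ret == 0 means a hole.  An allocating call would instead pass a
 * journal handle, set EXT4_GET_BLOCKS_CREATE in flags, and hold
 * down_write(&EXT4_I(inode)->i_data_sem).
 */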

/*
 * O_DIRECT for ext3 (or indirect map) based files
 *
 * If the O_DIRECT write will extend the file then add this inode to the
 * orphan list. So recovery will truncate it back to the original size
 * if the machine crashes during the write.
 *
 * If the O_DIRECT write is instantiating holes inside i_size and the machine
 * crashes then stale disk data _may_ be exposed inside the file. But current
 * VFS code falls back to the buffered path in that case so we are safe.
 */
ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
			   struct iov_iter *iter, loff_t offset)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	struct ext4_inode_info *ei = EXT4_I(inode);
	handle_t *handle;
	ssize_t ret;
	int orphan = 0;
	size_t count = iov_iter_count(iter);
	int retries = 0;

	if (rw == WRITE) {
		loff_t final_size = offset + count;

		if (final_size > inode->i_size) {
			/* Credits for sb + inode write */
			handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
			if (IS_ERR(handle)) {
				ret = PTR_ERR(handle);
				goto out;
			}
			ret = ext4_orphan_add(handle, inode);
			if (ret) {
				ext4_journal_stop(handle);
				goto out;
			}
			orphan = 1;
			ei->i_disksize = inode->i_size;
			ext4_journal_stop(handle);
		}
	}

retry:
	if (rw == READ && ext4_should_dioread_nolock(inode)) {
		/*
		 * Nolock dioread optimization may be dynamically disabled
		 * via ext4_inode_block_unlocked_dio(). Check inode's state
		 * while holding an extra i_dio_count ref.
		 */
		atomic_inc(&inode->i_dio_count);
		smp_mb();
		if (unlikely(ext4_test_inode_state(inode,
						   EXT4_STATE_DIOREAD_LOCK))) {
			inode_dio_done(inode);
			goto locked;
		}
		if (IS_DAX(inode))
			ret = dax_do_io(rw, iocb, inode, iter, offset,
					ext4_get_block, NULL, 0);
		else
			ret = __blockdev_direct_IO(rw, iocb, inode,
					inode->i_sb->s_bdev, iter, offset,
					ext4_get_block, NULL, NULL, 0);
		inode_dio_done(inode);
	} else {
locked:
		if (IS_DAX(inode))
			ret = dax_do_io(rw, iocb, inode, iter, offset,
					ext4_get_block, NULL, DIO_LOCKING);
		else
			ret = blockdev_direct_IO(rw, iocb, inode, iter,
						 offset, ext4_get_block);

		if (unlikely((rw & WRITE) && ret < 0)) {
			loff_t isize = i_size_read(inode);
			loff_t end = offset + count;

			if (end > isize)
				ext4_truncate_failed_write(inode);
		}
	}
	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
		goto retry;

	if (orphan) {
		int err;

		/* Credits for sb + inode write */
		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
		if (IS_ERR(handle)) {
			/* This is really bad luck. We've written the data
			 * but cannot extend i_size. Bail out and pretend
			 * the write failed... */
			ret = PTR_ERR(handle);
			if (inode->i_nlink)
				ext4_orphan_del(NULL, inode);

			goto out;
		}
		if (inode->i_nlink)
			ext4_orphan_del(handle, inode);
		if (ret > 0) {
			loff_t end = offset + ret;
			if (end > inode->i_size) {
				ei->i_disksize = end;
				i_size_write(inode, end);
				/*
				 * We're going to return a positive `ret'
				 * here due to non-zero-length I/O, so there's
				 * no way of reporting error returns from
				 * ext4_mark_inode_dirty() to userspace.  So
				 * ignore it.
				 */
				ext4_mark_inode_dirty(handle, inode);
			}
		}
		err = ext4_journal_stop(handle);
		if (ret == 0)
			ret = err;
	}
out:
	return ret;
}

/*
 * Calculate the number of metadata blocks that need to be reserved in
 * order to allocate a new block at @lblock for a non-extent-based file.
 */
int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
	int blk_bits;

	if (lblock < EXT4_NDIR_BLOCKS)
		return 0;

	lblock -= EXT4_NDIR_BLOCKS;

	if (ei->i_da_metadata_calc_len &&
	    (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
		ei->i_da_metadata_calc_len++;
		return 0;
	}
	ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
	ei->i_da_metadata_calc_len = 1;
	blk_bits = order_base_2(lblock);
	return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
}
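
/*
 * Worked example of the computation above (illustrative numbers,
 * assuming 4KiB blocks, so EXT4_ADDR_PER_BLOCK == 1024 and
 * EXT4_ADDR_PER_BLOCK_BITS == 10): for lblock == 5012 we get
 * 5012 - EXT4_NDIR_BLOCKS == 5000, order_base_2(5000) == 13, and
 * 13 / 10 + 1 == 2, i.e. reserve two metadata blocks (an indirect
 * block and a double-indirect block may both need allocating).
 * A following allocation mapped by the same indirect block matches
 * i_da_metadata_calc_last_lblock above and reserves nothing extra.
 */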

/*
 * Calculate the number of indirect blocks touched by mapping @nrblocks
 * logically contiguous blocks.
 */
int ext4_ind_trans_blocks(struct inode *inode, int nrblocks)
{
	/*
	 * With N contiguous data blocks, we need at most
	 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
	 * 2 dindirect blocks, and 1 tindirect block
	 */
	return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
}
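
/*
 * For instance (illustrative, assuming 4KiB blocks so
 * EXT4_ADDR_PER_BLOCK == 1024): mapping 2048 contiguous blocks costs
 * at most DIV_ROUND_UP(2048, 1024) + 4 == 6 metadata blocks - up to
 * three indirect blocks (the run may straddle an indirect-block
 * boundary, hence the "+ 1" above), two double-indirect blocks and
 * one triple-indirect block.
 */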

/*
 * Truncate transactions can be complex and absolutely huge. So we need to
 * be able to restart the transaction at a convenient checkpoint to make
 * sure we don't overflow the journal.
 *
 * Try to extend this transaction for the purposes of truncation. If
 * extend fails, we need to propagate the failure up and restart the
 * transaction in the top-level truncate loop. --sct
 *
 * Returns 0 if we managed to create more room. If we can't create more
 * room and the transaction must be restarted, we return 1.
 */
static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
{
	if (!ext4_handle_valid(handle))
		return 0;
	if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
		return 0;
	if (!ext4_journal_extend(handle, ext4_blocks_for_truncate(inode)))
		return 0;
	return 1;
}
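
/*
 * Sketch of the caller pattern (see ext4_clear_blocks() below for the
 * real thing, including error handling): when this returns 1, the
 * caller flushes its dirty state, restarts the transaction and
 * re-acquires write access before touching the buffer again:
 *
 *	if (try_to_extend_transaction(handle, inode)) {
 *		ext4_handle_dirty_metadata(handle, inode, bh);
 *		ext4_mark_inode_dirty(handle, inode);
 *		ext4_truncate_restart_trans(handle, inode,
 *				ext4_blocks_for_truncate(inode));
 *		ext4_journal_get_write_access(handle, bh);
 *	}
 */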

/*
 * Probably it should be a library function... search for the first non-zero
 * word or memcmp with zero_page, whatever is better for a particular
 * architecture. Linus?
 */
static inline int all_zeroes(__le32 *p, __le32 *q)
{
	while (p < q)
		if (*p++)
			return 0;
	return 1;
}

/**
 * ext4_find_shared - find the indirect blocks for partial truncation.
 * @inode: inode in question
 * @depth: depth of the affected branch
 * @offsets: offsets of pointers in that branch (see ext4_block_to_path)
 * @chain: place to store the pointers to partial indirect blocks
 * @top: place to store the (detached) top of the branch
 *
 * This is a helper function used by ext4_truncate().
 *
 * When we do truncate() we may have to clean the ends of several
 * indirect blocks but leave the blocks themselves alive. A block is
 * partially truncated if some data below the new i_size is referenced
 * from it (and it is on the path to the first completely truncated
 * data block, indeed). We have to free the top of that path along
 * with everything to the right of the path. Since no allocation
 * past the truncation point is possible until ext4_truncate()
 * finishes, we may safely do the latter, but the top of the branch may
 * require special attention - pageout below the truncation point
 * might try to populate it.
 *
 * We atomically detach the top of the branch from the tree, store the
 * block number of its root in *@top, pointers to buffer_heads of
 * partially truncated blocks - in @chain[].bh and pointers to
 * their last elements that should not be removed - in
 * @chain[].p. The return value is the pointer to the last filled
 * element of @chain.
 *
 * The work left to the caller is the actual freeing of subtrees:
 *	a) free the subtree starting from *@top
 *	b) free the subtrees whose roots are stored in
 *	   (@chain[i].p+1 .. end of @chain[i].bh->b_data)
 *	c) free the subtrees growing from the inode past the @chain[0].
 *	   (no partially truncated stuff there).
 */
static Indirect *ext4_find_shared(struct inode *inode, int depth,
				  ext4_lblk_t offsets[4], Indirect chain[4],
				  __le32 *top)
{
	Indirect *partial, *p;
	int k, err;

	*top = 0;
	/* Make k index the deepest non-null offset + 1 */
	for (k = depth; k > 1 && !offsets[k-1]; k--)
		;
	partial = ext4_get_branch(inode, k, offsets, chain, &err);
	/* Writer: pointers */
	if (!partial)
		partial = chain + k-1;
	/*
	 * If the branch acquired continuation since we've looked at it -
	 * fine, it should all survive and (new) top doesn't belong to us.
	 */
	if (!partial->key && *partial->p)
		/* Writer: end */
		goto no_top;
	for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--)
		;
	/*
	 * OK, we've found the last block that must survive. The rest of our
	 * branch should be detached before unlocking. However, if that rest
	 * of the branch is all ours and does not grow immediately from the
	 * inode it's easier to cheat and just decrement partial->p.
	 */
	if (p == chain + k - 1 && p > chain) {
		p->p--;
	} else {
		*top = *p->p;
		/* Nope, don't do this in ext4. Must leave the tree intact */
#if 0
		*p->p = 0;
#endif
	}
	/* Writer: end */

	while (partial > p) {
		brelse(partial->bh);
		partial--;
	}
no_top:
	return partial;
}
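
/*
 * Concrete illustration (hypothetical layout): suppose truncate()
 * leaves a last block whose path is {EXT4_IND_BLOCK, 5}, i.e. the new
 * EOF falls at entry 5 of the single indirect block.  That indirect
 * block is shared: entries 0..5 still map live data and must survive,
 * while entries 6 onwards - and every subtree to the right of the
 * path - must go.  ext4_find_shared() then returns @chain filled down
 * to that indirect block, with chain[].p marking the last pointer to
 * keep at each level; had the whole branch turned out to be freeable,
 * its root would instead have been detached into *@top.
 */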

/*
 * Zero a number of block pointers in either an inode or an indirect block.
 * If we restart the transaction we must again get write access to the
 * indirect block for further modification.
 *
 * We release `count' blocks on disk, but (last - first) may be greater
 * than `count' because there can be holes in there.
 *
 * Return 0 on success, 1 on invalid block range
 * and < 0 on fatal error.
 */
static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
			     struct buffer_head *bh,
			     ext4_fsblk_t block_to_free,
			     unsigned long count, __le32 *first,
			     __le32 *last)
{
	__le32 *p;
	int flags = EXT4_FREE_BLOCKS_VALIDATED;
	int err;

	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
		flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA;
	else if (ext4_should_journal_data(inode))
		flags |= EXT4_FREE_BLOCKS_FORGET;

	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
				   count)) {
		EXT4_ERROR_INODE(inode, "attempt to clear invalid "
				 "blocks %llu len %lu",
				 (unsigned long long) block_to_free, count);
		return 1;
	}

	if (try_to_extend_transaction(handle, inode)) {
		if (bh) {
			BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
			err = ext4_handle_dirty_metadata(handle, inode, bh);
			if (unlikely(err))
				goto out_err;
		}
		err = ext4_mark_inode_dirty(handle, inode);
		if (unlikely(err))
			goto out_err;
		err = ext4_truncate_restart_trans(handle, inode,
					ext4_blocks_for_truncate(inode));
		if (unlikely(err))
			goto out_err;
		if (bh) {
			BUFFER_TRACE(bh, "retaking write access");
			err = ext4_journal_get_write_access(handle, bh);
			if (unlikely(err))
				goto out_err;
		}
	}

	for (p = first; p < last; p++)
		*p = 0;

	ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);
	return 0;
out_err:
	ext4_std_error(inode->i_sb, err);
	return err;
}

/**
 * ext4_free_data - free a list of data blocks
 * @handle: handle for this transaction
 * @inode: inode we are dealing with
 * @this_bh: indirect buffer_head which contains *@first and *@last
 * @first: array of block numbers
 * @last: points immediately past the end of array
 *
 * We are freeing all blocks referenced from that array (numbers are stored
 * as little-endian 32-bit) and updating @inode->i_blocks appropriately.
 *
 * We accumulate contiguous runs of blocks to free.  Conveniently, if these
 * blocks are contiguous then releasing them at one time will only affect one
 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
 * actually use a lot of journal space.
 *
 * @this_bh will be %NULL if @first and @last point into the inode's direct
 * block pointers.
 */
static void ext4_free_data(handle_t *handle, struct inode *inode,
			   struct buffer_head *this_bh,
			   __le32 *first, __le32 *last)
{
	ext4_fsblk_t block_to_free = 0;		/* Starting block # of a run */
	unsigned long count = 0;		/* Number of blocks in the run */
	__le32 *block_to_free_p = NULL;		/* Pointer into inode/ind
						   corresponding to
						   block_to_free */
	ext4_fsblk_t nr;			/* Current block # */
	__le32 *p;				/* Pointer into inode/ind
						   for current block */
	int err = 0;

	if (this_bh) {				/* For indirect block */
		BUFFER_TRACE(this_bh, "get_write_access");
		err = ext4_journal_get_write_access(handle, this_bh);
		/* Important: if we can't update the indirect pointers
		 * to the blocks, we can't free them. */
		if (err)
			return;
	}

	for (p = first; p < last; p++) {
		nr = le32_to_cpu(*p);
		if (nr) {
			/* accumulate blocks to free if they're contiguous */
			if (count == 0) {
				block_to_free = nr;
				block_to_free_p = p;
				count = 1;
			} else if (nr == block_to_free + count) {
				count++;
			} else {
				err = ext4_clear_blocks(handle, inode, this_bh,
							block_to_free, count,
							block_to_free_p, p);
				if (err)
					break;
				block_to_free = nr;
				block_to_free_p = p;
				count = 1;
			}
		}
	}

	if (!err && count > 0)
		err = ext4_clear_blocks(handle, inode, this_bh, block_to_free,
					count, block_to_free_p, p);
	if (err < 0)
		/* fatal error */
		return;

	if (this_bh) {
		BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");

		/*
		 * The buffer head should have an attached journal head at this
		 * point. However, if the data is corrupted and an indirect
		 * block pointed to itself, it would have been detached when
		 * the block was cleared. Check for this instead of OOPSing.
		 */
		if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
			ext4_handle_dirty_metadata(handle, inode, this_bh);
		else
			EXT4_ERROR_INODE(inode,
					 "circular indirect block detected at "
					 "block %llu",
				(unsigned long long) this_bh->b_blocknr);
	}
}
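
/*
 * Worked example (hypothetical array contents { 100, 101, 102, 200,
 * 0, 201 }): the loop above accumulates the run starting at block
 * 100; on seeing 200 it releases (start 100, count 3) and begins a
 * new run; the zero entry is a file hole and is skipped; 201 extends
 * the run because it is contiguous on disk with 200.  The final call
 * after the loop releases (start 200, count 2), where (last - first)
 * exceeds count - exactly the hole case ext4_clear_blocks()
 * documents.
 */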

/**
 * ext4_free_branches - free an array of branches
 * @handle: JBD handle for this transaction
 * @inode: inode we are dealing with
 * @parent_bh: the buffer_head which contains *@first and *@last
 * @first: array of block numbers
 * @last: pointer immediately past the end of array
 * @depth: depth of the branches to free
 *
 * We are freeing all blocks referenced from these branches (numbers are
 * stored as little-endian 32-bit) and updating @inode->i_blocks
 * appropriately.
 */
static void ext4_free_branches(handle_t *handle, struct inode *inode,
			       struct buffer_head *parent_bh,
			       __le32 *first, __le32 *last, int depth)
{
	ext4_fsblk_t nr;
	__le32 *p;

	if (ext4_handle_is_aborted(handle))
		return;

	if (depth--) {
		struct buffer_head *bh;
		int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
		p = last;
		while (--p >= first) {
			nr = le32_to_cpu(*p);
			if (!nr)
				continue;		/* A hole */

			if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
						   nr, 1)) {
				EXT4_ERROR_INODE(inode,
						 "invalid indirect mapped "
						 "block %lu (level %d)",
						 (unsigned long) nr, depth);
				break;
			}

			/* Go read the buffer for the next level down */
			bh = sb_bread(inode->i_sb, nr);

			/*
			 * A read failure? Report error and clear slot
			 * (should be rare).
			 */
			if (!bh) {
				EXT4_ERROR_INODE_BLOCK(inode, nr,
						       "Read failure");
				continue;
			}

			/* This zaps the entire block. Bottom up. */
			BUFFER_TRACE(bh, "free child branches");
			ext4_free_branches(handle, inode, bh,
					(__le32 *) bh->b_data,
					(__le32 *) bh->b_data + addr_per_block,
					depth);
			brelse(bh);

			/*
			 * Everything below this pointer has been
			 * released.  Now let this top-of-subtree go.
			 *
			 * We want the freeing of this indirect block to be
			 * atomic in the journal with the updating of the
			 * bitmap block which owns it.  So make some room in
			 * the journal.
			 *
			 * We zero the parent pointer *after* freeing its
			 * pointee in the bitmaps, so if extend_transaction()
			 * for some reason fails to put the bitmap changes and
			 * the release into the same transaction, recovery
			 * will merely complain about releasing a free block,
			 * rather than leaking blocks.
			 */
			if (ext4_handle_is_aborted(handle))
				return;
			if (try_to_extend_transaction(handle, inode)) {
				ext4_mark_inode_dirty(handle, inode);
				ext4_truncate_restart_trans(handle, inode,
					    ext4_blocks_for_truncate(inode));
			}

			/*
			 * The forget flag here is critical because if
			 * we are journaling (and not doing data
			 * journaling), we have to make sure a revoke
			 * record is written to prevent the journal
			 * replay from overwriting the (former)
			 * indirect block if it gets reallocated as a
			 * data block.  This must happen in the same
			 * transaction where the data blocks are
			 * actually freed.
			 */
			ext4_free_blocks(handle, inode, NULL, nr, 1,
					 EXT4_FREE_BLOCKS_METADATA|
					 EXT4_FREE_BLOCKS_FORGET);

			if (parent_bh) {
				/*
				 * The block which we have just freed is
				 * pointed to by an indirect block: journal it
				 */
				BUFFER_TRACE(parent_bh, "get_write_access");
				if (!ext4_journal_get_write_access(handle,
								   parent_bh)){
					*p = 0;
					BUFFER_TRACE(parent_bh,
						"call ext4_handle_dirty_metadata");
					ext4_handle_dirty_metadata(handle,
								   inode,
								   parent_bh);
				}
			}
		}
	} else {
		/* We have reached the bottom of the tree. */
		BUFFER_TRACE(parent_bh, "free data blocks");
		ext4_free_data(handle, inode, parent_bh, first, last);
	}
}
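
/*
 * Shape of the recursion (illustrative): called with depth == 2 on a
 * double-indirect block, ext4_free_branches() walks the pointers from
 * the end, recursing with depth == 1 into each child indirect block;
 * at depth == 0 the else-branch hands the direct block numbers to
 * ext4_free_data().  Indirect blocks are freed bottom-up with revoke
 * records, and the parent slot is zeroed only afterwards, so a crash
 * in between costs at most a recovery-time complaint about releasing
 * an already-free block rather than a leak.
 */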

void ext4_ind_truncate(handle_t *handle, struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	__le32 *i_data = ei->i_data;
	int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
	ext4_lblk_t offsets[4];
	Indirect chain[4];
	Indirect *partial;
	__le32 nr = 0;
	int n = 0;
	ext4_lblk_t last_block, max_block;
	unsigned blocksize = inode->i_sb->s_blocksize;

	last_block = (inode->i_size + blocksize-1)
					>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
	max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
					>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);

	if (last_block != max_block) {
		n = ext4_block_to_path(inode, last_block, offsets, NULL);
		if (n == 0)
			return;
	}

	ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block);

	/*
	 * The orphan list entry will now protect us from any crash which
	 * occurs before the truncate completes, so it is now safe to propagate
	 * the new, shorter inode size (held for now in i_size) into the
	 * on-disk inode. We do this via i_disksize, which is the value which
	 * ext4 *really* writes onto the disk inode.
	 */
	ei->i_disksize = inode->i_size;

	if (last_block == max_block) {
		/*
		 * It is unnecessary to free any data blocks if last_block is
		 * equal to the indirect block limit.
		 */
		return;
	} else if (n == 1) {		/* direct blocks */
		ext4_free_data(handle, inode, NULL, i_data+offsets[0],
			       i_data + EXT4_NDIR_BLOCKS);
		goto do_indirects;
	}

	partial = ext4_find_shared(inode, n, offsets, chain, &nr);
	/* Kill the top of shared branch (not detached) */
	if (nr) {
		if (partial == chain) {
			/* Shared branch grows from the inode */
			ext4_free_branches(handle, inode, NULL,
					   &nr, &nr+1, (chain+n-1) - partial);
			*partial->p = 0;
			/*
			 * We mark the inode dirty prior to restart,
			 * and prior to stop. No need for it here.
			 */
		} else {
			/* Shared branch grows from an indirect block */
			BUFFER_TRACE(partial->bh, "get_write_access");
			ext4_free_branches(handle, inode, partial->bh,
					   partial->p,
					   partial->p+1, (chain+n-1) - partial);
		}
	}
	/* Clear the ends of indirect blocks on the shared branch */
	while (partial > chain) {
		ext4_free_branches(handle, inode, partial->bh, partial->p + 1,
				   (__le32*)partial->bh->b_data+addr_per_block,
				   (chain+n-1) - partial);
		BUFFER_TRACE(partial->bh, "call brelse");
		brelse(partial->bh);
		partial--;
	}
do_indirects:
	/* Kill the remaining (whole) subtrees */
	switch (offsets[0]) {
	default:
		nr = i_data[EXT4_IND_BLOCK];
		if (nr) {
			ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
			i_data[EXT4_IND_BLOCK] = 0;
		}
		/* fall through */
	case EXT4_IND_BLOCK:
		nr = i_data[EXT4_DIND_BLOCK];
		if (nr) {
			ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
			i_data[EXT4_DIND_BLOCK] = 0;
		}
		/* fall through */
	case EXT4_DIND_BLOCK:
		nr = i_data[EXT4_TIND_BLOCK];
		if (nr) {
			ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
			i_data[EXT4_TIND_BLOCK] = 0;
		}
		/* fall through */
	case EXT4_TIND_BLOCK:
		;
	}
}
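
/*
 * Example of the do_indirects fall-through above (illustrative): a
 * last_block in the double-indirect range yields
 * offsets[0] == EXT4_DIND_BLOCK, so the switch enters at that case
 * and frees only the triple-indirect subtree; a last_block in the
 * direct range enters at "default" and sweeps the indirect,
 * double-indirect and triple-indirect subtrees in turn.
 */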
1303 | 1303 | ||
1304 | /** | 1304 | /** |
1305 | * ext4_ind_remove_space - remove space from the range | 1305 | * ext4_ind_remove_space - remove space from the range |
1306 | * @handle: JBD handle for this transaction | 1306 | * @handle: JBD handle for this transaction |
1307 | * @inode: inode we are dealing with | 1307 | * @inode: inode we are dealing with |
1308 | * @start: First block to remove | 1308 | * @start: First block to remove |
1309 | * @end: One block after the last block to remove (exclusive) | 1309 | * @end: One block after the last block to remove (exclusive) |
1310 | * | 1310 | * |
1311 | * Free the blocks in the defined range (end is the exclusive endpoint | 1311 | * Free the blocks in the defined range (end is the exclusive endpoint |
1312 | * of the range). This is used by ext4_punch_hole(). | 1312 | * of the range). This is used by ext4_punch_hole(). |
1313 | */ | 1313 | */ |
1314 | int ext4_ind_remove_space(handle_t *handle, struct inode *inode, | 1314 | int ext4_ind_remove_space(handle_t *handle, struct inode *inode, |
1315 | ext4_lblk_t start, ext4_lblk_t end) | 1315 | ext4_lblk_t start, ext4_lblk_t end) |
1316 | { | 1316 | { |
1317 | struct ext4_inode_info *ei = EXT4_I(inode); | 1317 | struct ext4_inode_info *ei = EXT4_I(inode); |
1318 | __le32 *i_data = ei->i_data; | 1318 | __le32 *i_data = ei->i_data; |
1319 | int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); | 1319 | int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); |
1320 | ext4_lblk_t offsets[4], offsets2[4]; | 1320 | ext4_lblk_t offsets[4], offsets2[4]; |
1321 | Indirect chain[4], chain2[4]; | 1321 | Indirect chain[4], chain2[4]; |
1322 | Indirect *partial, *partial2; | 1322 | Indirect *partial, *partial2; |
1323 | ext4_lblk_t max_block; | 1323 | ext4_lblk_t max_block; |
1324 | __le32 nr = 0, nr2 = 0; | 1324 | __le32 nr = 0, nr2 = 0; |
1325 | int n = 0, n2 = 0; | 1325 | int n = 0, n2 = 0; |
1326 | unsigned blocksize = inode->i_sb->s_blocksize; | 1326 | unsigned blocksize = inode->i_sb->s_blocksize; |
1327 | 1327 | ||
1328 | max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) | 1328 | max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) |
1329 | >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); | 1329 | >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); |
1330 | if (end >= max_block) | 1330 | if (end >= max_block) |
1331 | end = max_block; | 1331 | end = max_block; |
1332 | if ((start >= end) || (start > max_block)) | 1332 | if ((start >= end) || (start > max_block)) |
1333 | return 0; | 1333 | return 0; |
1334 | 1334 | ||
1335 | n = ext4_block_to_path(inode, start, offsets, NULL); | 1335 | n = ext4_block_to_path(inode, start, offsets, NULL); |
1336 | n2 = ext4_block_to_path(inode, end, offsets2, NULL); | 1336 | n2 = ext4_block_to_path(inode, end, offsets2, NULL); |
1337 | 1337 | ||
1338 | BUG_ON(n > n2); | 1338 | BUG_ON(n > n2); |
1339 | 1339 | ||
1340 | if ((n == 1) && (n == n2)) { | 1340 | if ((n == 1) && (n == n2)) { |
1341 | /* We're punching only within direct block range */ | 1341 | /* We're punching only within direct block range */ |
1342 | ext4_free_data(handle, inode, NULL, i_data + offsets[0], | 1342 | ext4_free_data(handle, inode, NULL, i_data + offsets[0], |
1343 | i_data + offsets2[0]); | 1343 | i_data + offsets2[0]); |
1344 | return 0; | 1344 | return 0; |
1345 | } else if (n2 > n) { | 1345 | } else if (n2 > n) { |
1346 | /* | 1346 | /* |
1347 | * Start and end are on different levels, so we're going to | 1347 | * Start and end are on different levels, so we're going to |
1348 | * free a partial block at the start and a partial block at the | 1348 | * free a partial block at the start and a partial block at the |
1349 | * end of the range. If there are some levels in between, the | 1349 | * end of the range. If there are some levels in between, the |
1350 | * do_indirects label will take care of those. | 1350 | * do_indirects label will take care of those. |
1351 | */ | 1351 | */ |
1352 | 1352 | ||
1353 | if (n == 1) { | 1353 | if (n == 1) { |
1354 | /* | 1354 | /* |
1355 | * Start is at the direct block level, free | 1355 | * Start is at the direct block level, free |
1356 | * everything to the end of the level. | 1356 | * everything to the end of the level. |
1357 | */ | 1357 | */ |
1358 | ext4_free_data(handle, inode, NULL, i_data + offsets[0], | 1358 | ext4_free_data(handle, inode, NULL, i_data + offsets[0], |
1359 | i_data + EXT4_NDIR_BLOCKS); | 1359 | i_data + EXT4_NDIR_BLOCKS); |
1360 | goto end_range; | 1360 | goto end_range; |
1361 | } | 1361 | } |
1362 | 1362 | ||
1363 | 1363 | ||
1364 | partial = ext4_find_shared(inode, n, offsets, chain, &nr); | 1364 | partial = ext4_find_shared(inode, n, offsets, chain, &nr); |
1365 | if (nr) { | 1365 | if (nr) { |
1366 | if (partial == chain) { | 1366 | if (partial == chain) { |
1367 | /* Shared branch grows from the inode */ | 1367 | /* Shared branch grows from the inode */ |
1368 | ext4_free_branches(handle, inode, NULL, | 1368 | ext4_free_branches(handle, inode, NULL, |
1369 | &nr, &nr+1, (chain+n-1) - partial); | 1369 | &nr, &nr+1, (chain+n-1) - partial); |
1370 | *partial->p = 0; | 1370 | *partial->p = 0; |
1371 | } else { | 1371 | } else { |
1372 | /* Shared branch grows from an indirect block */ | 1372 | /* Shared branch grows from an indirect block */ |
1373 | BUFFER_TRACE(partial->bh, "get_write_access"); | 1373 | BUFFER_TRACE(partial->bh, "get_write_access"); |
1374 | ext4_free_branches(handle, inode, partial->bh, | 1374 | ext4_free_branches(handle, inode, partial->bh, |
1375 | partial->p, | 1375 | partial->p, |
1376 | partial->p+1, (chain+n-1) - partial); | 1376 | partial->p+1, (chain+n-1) - partial); |
1377 | } | 1377 | } |
1378 | } | 1378 | } |
1379 | 1379 | ||
1380 | /* | 1380 | /* |
1381 | * Clear the ends of indirect blocks on the shared branch | 1381 | * Clear the ends of indirect blocks on the shared branch |
1382 | * at the start of the range | 1382 | * at the start of the range |
1383 | */ | 1383 | */ |
1384 | while (partial > chain) { | 1384 | while (partial > chain) { |
1385 | ext4_free_branches(handle, inode, partial->bh, | 1385 | ext4_free_branches(handle, inode, partial->bh, |
1386 | partial->p + 1, | 1386 | partial->p + 1, |
1387 | (__le32 *)partial->bh->b_data+addr_per_block, | 1387 | (__le32 *)partial->bh->b_data+addr_per_block, |
1388 | (chain+n-1) - partial); | 1388 | (chain+n-1) - partial); |
1389 | BUFFER_TRACE(partial->bh, "call brelse"); | 1389 | BUFFER_TRACE(partial->bh, "call brelse"); |
1390 | brelse(partial->bh); | 1390 | brelse(partial->bh); |
1391 | partial--; | 1391 | partial--; |
1392 | } | 1392 | } |
1393 | 1393 | ||
1394 | end_range: | 1394 | end_range: |
1395 | partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); | 1395 | partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); |
1396 | if (nr2) { | 1396 | if (nr2) { |
1397 | if (partial2 == chain2) { | 1397 | if (partial2 == chain2) { |
1398 | /* | 1398 | /* |
1399 | * Remember, end is exclusive, so here we're at | 1399 | * Remember, end is exclusive, so here we're at |
1400 | * the start of the next level, which we're not | 1400 | * the start of the next level, which we're not |
1401 | * going to free. Everything was covered by the | 1401 | * going to free. Everything was covered by the |
1402 | * start of the range. | 1402 | * start of the range. |
1403 | */ | 1403 | */ |
1404 | return 0; | 1404 | goto do_indirects; |
1405 | } else { | ||
1406 | /* Shared branch grows from an indirect block */ | ||
1407 | partial2--; | ||
1408 | } | 1405 | } |
1409 | } else { | 1406 | } else { |
1410 | /* | 1407 | /* |
1411 | * ext4_find_shared returns an Indirect structure that | 1408 | * ext4_find_shared returns an Indirect structure that |
1412 | * points to the last element which should not be | 1409 | * points to the last element which should not be |
1413 | * removed by truncate. But this is the end of the range | 1410 | * removed by truncate. But this is the end of the range |
1414 | * in punch_hole, so we need to point to the next element. | 1411 | * in punch_hole, so we need to point to the next element. |
1415 | */ | 1412 | */ |
1416 | partial2->p++; | 1413 | partial2->p++; |
1417 | } | 1414 | } |
1418 | 1415 | ||
1419 | /* | 1416 | /* |
1420 | * Clear the ends of indirect blocks on the shared branch | 1417 | * Clear the ends of indirect blocks on the shared branch |
1421 | * at the end of the range | 1418 | * at the end of the range |
1422 | */ | 1419 | */ |
1423 | while (partial2 > chain2) { | 1420 | while (partial2 > chain2) { |
1424 | ext4_free_branches(handle, inode, partial2->bh, | 1421 | ext4_free_branches(handle, inode, partial2->bh, |
1425 | (__le32 *)partial2->bh->b_data, | 1422 | (__le32 *)partial2->bh->b_data, |
1426 | partial2->p, | 1423 | partial2->p, |
1427 | (chain2+n2-1) - partial2); | 1424 | (chain2+n2-1) - partial2); |
1428 | BUFFER_TRACE(partial2->bh, "call brelse"); | 1425 | BUFFER_TRACE(partial2->bh, "call brelse"); |
1429 | brelse(partial2->bh); | 1426 | brelse(partial2->bh); |
1430 | partial2--; | 1427 | partial2--; |
1431 | } | 1428 | } |
1432 | goto do_indirects; | 1429 | goto do_indirects; |
1433 | } | 1430 | } |
1434 | 1431 | ||
1435 | /* Punch happened within the same level (n == n2) */ | 1432 | /* Punch happened within the same level (n == n2) */ |
1436 | partial = ext4_find_shared(inode, n, offsets, chain, &nr); | 1433 | partial = ext4_find_shared(inode, n, offsets, chain, &nr); |
1437 | partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); | 1434 | partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); |
1438 | /* | 1435 | |
1439 | * ext4_find_shared returns Indirect structure which | 1436 | /* Free top, but only if partial2 isn't its subtree. */ |
1440 | * points to the last element which should not be | 1437 | if (nr) { |
1441 | * removed by truncate. But this is end of the range | 1438 | int level = min(partial - chain, partial2 - chain2); |
1442 | * in punch_hole so we need to point to the next element | 1439 | int i; |
1443 | */ | 1440 | int subtree = 1; |
1444 | partial2->p++; | 1441 | |
1445 | while ((partial > chain) || (partial2 > chain2)) { | 1442 | for (i = 0; i <= level; i++) { |
1446 | /* We're at the same block, so we're almost finished */ | 1443 | if (offsets[i] != offsets2[i]) { |
1447 | if ((partial->bh && partial2->bh) && | 1444 | subtree = 0; |
1448 | (partial->bh->b_blocknr == partial2->bh->b_blocknr)) { | 1445 | break; |
1449 | if ((partial > chain) && (partial2 > chain2)) { | 1446 | } |
1447 | } | ||
1448 | |||
1449 | if (!subtree) { | ||
1450 | if (partial == chain) { | ||
1451 | /* Shared branch grows from the inode */ | ||
1452 | ext4_free_branches(handle, inode, NULL, | ||
1453 | &nr, &nr+1, | ||
1454 | (chain+n-1) - partial); | ||
1455 | *partial->p = 0; | ||
1456 | } else { | ||
1457 | /* Shared branch grows from an indirect block */ | ||
1458 | BUFFER_TRACE(partial->bh, "get_write_access"); | ||
1450 | ext4_free_branches(handle, inode, partial->bh, | 1459 | ext4_free_branches(handle, inode, partial->bh, |
1451 | partial->p + 1, | 1460 | partial->p, |
1452 | partial2->p, | 1461 | partial->p+1, |
1453 | (chain+n-1) - partial); | 1462 | (chain+n-1) - partial); |
1454 | BUFFER_TRACE(partial->bh, "call brelse"); | ||
1455 | brelse(partial->bh); | ||
1456 | BUFFER_TRACE(partial2->bh, "call brelse"); | ||
1457 | brelse(partial2->bh); | ||
1458 | } | 1463 | } |
1459 | return 0; | ||
1460 | } | 1464 | } |
1465 | } | ||
1466 | |||
1467 | if (!nr2) { | ||
1461 | /* | 1468 | /* |
1462 | * Clear the ends of indirect blocks on the shared branch | 1469 | * ext4_find_shared returns an Indirect structure that |
1463 | * at the start of the range | 1470 | * points to the last element which should not be |
1471 | * removed by truncate. But this is the end of the range | ||
1472 | * in punch_hole, so we need to point to the next element. | ||
1464 | */ | 1473 | */ |
1465 | if (partial > chain) { | 1474 | partial2->p++; |
1475 | } | ||
1476 | |||
1477 | while (partial > chain || partial2 > chain2) { | ||
1478 | int depth = (chain+n-1) - partial; | ||
1479 | int depth2 = (chain2+n2-1) - partial2; | ||
1480 | |||
1481 | if (partial > chain && partial2 > chain2 && | ||
1482 | partial->bh->b_blocknr == partial2->bh->b_blocknr) { | ||
1483 | /* | ||
1484 | * We've converged on the same block. Clear the range, | ||
1485 | * then we're done. | ||
1486 | */ | ||
1466 | ext4_free_branches(handle, inode, partial->bh, | 1487 | ext4_free_branches(handle, inode, partial->bh, |
1467 | partial->p + 1, | 1488 | partial->p + 1, |
1468 | (__le32 *)partial->bh->b_data+addr_per_block, | 1489 | partial2->p, |
1469 | (chain+n-1) - partial); | 1490 | (chain+n-1) - partial); |
1470 | BUFFER_TRACE(partial->bh, "call brelse"); | 1491 | BUFFER_TRACE(partial->bh, "call brelse"); |
1471 | brelse(partial->bh); | 1492 | brelse(partial->bh); |
1472 | partial--; | 1493 | BUFFER_TRACE(partial2->bh, "call brelse"); |
1494 | brelse(partial2->bh); | ||
1495 | return 0; | ||
1473 | } | 1496 | } |
1497 | |||
1474 | /* | 1498 | /* |
1475 | * Clear the ends of indirect blocks on the shared branch | 1499 | * The start and end partial branches may not be at the same |
1476 | * at the end of the range | 1500 | * level even though the punch happened within one level. So, we |
1501 | * give them a chance to arrive at the same level, then walk | ||
1502 | * them in step with each other until we converge on the same | ||
1503 | * block. | ||
1477 | */ | 1504 | */ |
1478 | if (partial2 > chain2) { | 1505 | if (partial > chain && depth <= depth2) { |
1506 | ext4_free_branches(handle, inode, partial->bh, | ||
1507 | partial->p + 1, | ||
1508 | (__le32 *)partial->bh->b_data+addr_per_block, | ||
1509 | (chain+n-1) - partial); | ||
1510 | BUFFER_TRACE(partial->bh, "call brelse"); | ||
1511 | brelse(partial->bh); | ||
1512 | partial--; | ||
1513 | } | ||
1514 | if (partial2 > chain2 && depth2 <= depth) { | ||
1479 | ext4_free_branches(handle, inode, partial2->bh, | 1515 | ext4_free_branches(handle, inode, partial2->bh, |
1480 | (__le32 *)partial2->bh->b_data, | 1516 | (__le32 *)partial2->bh->b_data, |
1481 | partial2->p, | 1517 | partial2->p, |
1482 | (chain2+n-1) - partial2); | 1518 | (chain2+n2-1) - partial2); |
1483 | BUFFER_TRACE(partial2->bh, "call brelse"); | 1519 | BUFFER_TRACE(partial2->bh, "call brelse"); |
1484 | brelse(partial2->bh); | 1520 | brelse(partial2->bh); |
1485 | partial2--; | 1521 | partial2--; |
1486 | } | 1522 | } |
1487 | } | 1523 | } |
1524 | return 0; | ||
1488 | 1525 | ||
1489 | do_indirects: | 1526 | do_indirects: |
1490 | /* Kill the remaining (whole) subtrees */ | 1527 | /* Kill the remaining (whole) subtrees */ |
1491 | switch (offsets[0]) { | 1528 | switch (offsets[0]) { |
1492 | default: | 1529 | default: |
1493 | if (++n >= n2) | 1530 | if (++n >= n2) |
1494 | return 0; | 1531 | return 0; |
1495 | nr = i_data[EXT4_IND_BLOCK]; | 1532 | nr = i_data[EXT4_IND_BLOCK]; |
1496 | if (nr) { | 1533 | if (nr) { |
1497 | ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); | 1534 | ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); |
1498 | i_data[EXT4_IND_BLOCK] = 0; | 1535 | i_data[EXT4_IND_BLOCK] = 0; |
1499 | } | 1536 | } |
1500 | case EXT4_IND_BLOCK: | 1537 | case EXT4_IND_BLOCK: |
1501 | if (++n >= n2) | 1538 | if (++n >= n2) |
1502 | return 0; | 1539 | return 0; |
1503 | nr = i_data[EXT4_DIND_BLOCK]; | 1540 | nr = i_data[EXT4_DIND_BLOCK]; |
1504 | if (nr) { | 1541 | if (nr) { |
1505 | ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); | 1542 | ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); |
1506 | i_data[EXT4_DIND_BLOCK] = 0; | 1543 | i_data[EXT4_DIND_BLOCK] = 0; |
1507 | } | 1544 | } |
1508 | case EXT4_DIND_BLOCK: | 1545 | case EXT4_DIND_BLOCK: |
1509 | if (++n >= n2) | 1546 | if (++n >= n2) |
1510 | return 0; | 1547 | return 0; |
1511 | nr = i_data[EXT4_TIND_BLOCK]; | 1548 | nr = i_data[EXT4_TIND_BLOCK]; |
1512 | if (nr) { | 1549 | if (nr) { |
1513 | ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); | 1550 | ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); |
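The heart of the punch-hole corruption fix is the new lock-step walk above: instead of unconditionally clearing from partial->p + 1 to the end of every level on the start side (which could run past the end cursor and free live blocks), each cursor computes its distance from the deepest level and only a cursor that is at least as deep as the other retreats, so both arrive at the shared level together. Below is a stand-alone, deliberately simplified model of that rule, with plain integers standing in for the Indirect chains; the real code additionally compares b_blocknr before declaring convergence and frees blocks at each step.

#include <stdio.h>

/* Hedged model of the converging walk (hypothetical: depths are plain
 * integers counted from the deepest indirect level; reaching the same
 * depth stands in for landing on the same indirect block). */
int main(void)
{
	int max_depth = 3;	/* n - 1: levels in each chain     */
	int ds = 0, de = 2;	/* start and end cursor depths     */

	while (ds < max_depth || de < max_depth) {
		int d1 = ds, d2 = de;	/* snapshots, like depth/depth2 */

		if (d1 == d2) {
			printf("converged at depth %d: clear between "
			       "the cursors and stop\n", d1);
			break;
		}
		if (ds < max_depth && d1 <= d2)
			ds++;	/* free tail of the start level, go up */
		if (de < max_depth && d2 <= d1)
			de++;	/* free head of the end level, go up   */
	}
	return 0;
}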
fs/ext4/inode.c
1 | /* | 1 | /* |
2 | * linux/fs/ext4/inode.c | 2 | * linux/fs/ext4/inode.c |
3 | * | 3 | * |
4 | * Copyright (C) 1992, 1993, 1994, 1995 | 4 | * Copyright (C) 1992, 1993, 1994, 1995 |
5 | * Remy Card (card@masi.ibp.fr) | 5 | * Remy Card (card@masi.ibp.fr) |
6 | * Laboratoire MASI - Institut Blaise Pascal | 6 | * Laboratoire MASI - Institut Blaise Pascal |
7 | * Universite Pierre et Marie Curie (Paris VI) | 7 | * Universite Pierre et Marie Curie (Paris VI) |
8 | * | 8 | * |
9 | * from | 9 | * from |
10 | * | 10 | * |
11 | * linux/fs/minix/inode.c | 11 | * linux/fs/minix/inode.c |
12 | * | 12 | * |
13 | * Copyright (C) 1991, 1992 Linus Torvalds | 13 | * Copyright (C) 1991, 1992 Linus Torvalds |
14 | * | 14 | * |
15 | * 64-bit file support on 64-bit platforms by Jakub Jelinek | 15 | * 64-bit file support on 64-bit platforms by Jakub Jelinek |
16 | * (jj@sunsite.ms.mff.cuni.cz) | 16 | * (jj@sunsite.ms.mff.cuni.cz) |
17 | * | 17 | * |
18 | * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000 | 18 | * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000 |
19 | */ | 19 | */ |
20 | 20 | ||
21 | #include <linux/fs.h> | 21 | #include <linux/fs.h> |
22 | #include <linux/time.h> | 22 | #include <linux/time.h> |
23 | #include <linux/jbd2.h> | 23 | #include <linux/jbd2.h> |
24 | #include <linux/highuid.h> | 24 | #include <linux/highuid.h> |
25 | #include <linux/pagemap.h> | 25 | #include <linux/pagemap.h> |
26 | #include <linux/quotaops.h> | 26 | #include <linux/quotaops.h> |
27 | #include <linux/string.h> | 27 | #include <linux/string.h> |
28 | #include <linux/buffer_head.h> | 28 | #include <linux/buffer_head.h> |
29 | #include <linux/writeback.h> | 29 | #include <linux/writeback.h> |
30 | #include <linux/pagevec.h> | 30 | #include <linux/pagevec.h> |
31 | #include <linux/mpage.h> | 31 | #include <linux/mpage.h> |
32 | #include <linux/namei.h> | 32 | #include <linux/namei.h> |
33 | #include <linux/uio.h> | 33 | #include <linux/uio.h> |
34 | #include <linux/bio.h> | 34 | #include <linux/bio.h> |
35 | #include <linux/workqueue.h> | 35 | #include <linux/workqueue.h> |
36 | #include <linux/kernel.h> | 36 | #include <linux/kernel.h> |
37 | #include <linux/printk.h> | 37 | #include <linux/printk.h> |
38 | #include <linux/slab.h> | 38 | #include <linux/slab.h> |
39 | #include <linux/ratelimit.h> | 39 | #include <linux/ratelimit.h> |
40 | #include <linux/aio.h> | 40 | #include <linux/aio.h> |
41 | #include <linux/bitops.h> | 41 | #include <linux/bitops.h> |
42 | 42 | ||
43 | #include "ext4_jbd2.h" | 43 | #include "ext4_jbd2.h" |
44 | #include "xattr.h" | 44 | #include "xattr.h" |
45 | #include "acl.h" | 45 | #include "acl.h" |
46 | #include "truncate.h" | 46 | #include "truncate.h" |
47 | 47 | ||
48 | #include <trace/events/ext4.h> | 48 | #include <trace/events/ext4.h> |
49 | 49 | ||
50 | #define MPAGE_DA_EXTENT_TAIL 0x01 | 50 | #define MPAGE_DA_EXTENT_TAIL 0x01 |
51 | 51 | ||
52 | static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, | 52 | static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, |
53 | struct ext4_inode_info *ei) | 53 | struct ext4_inode_info *ei) |
54 | { | 54 | { |
55 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 55 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
56 | __u16 csum_lo; | 56 | __u16 csum_lo; |
57 | __u16 csum_hi = 0; | 57 | __u16 csum_hi = 0; |
58 | __u32 csum; | 58 | __u32 csum; |
59 | 59 | ||
60 | csum_lo = le16_to_cpu(raw->i_checksum_lo); | 60 | csum_lo = le16_to_cpu(raw->i_checksum_lo); |
61 | raw->i_checksum_lo = 0; | 61 | raw->i_checksum_lo = 0; |
62 | if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && | 62 | if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && |
63 | EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { | 63 | EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { |
64 | csum_hi = le16_to_cpu(raw->i_checksum_hi); | 64 | csum_hi = le16_to_cpu(raw->i_checksum_hi); |
65 | raw->i_checksum_hi = 0; | 65 | raw->i_checksum_hi = 0; |
66 | } | 66 | } |
67 | 67 | ||
68 | csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, | 68 | csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, |
69 | EXT4_INODE_SIZE(inode->i_sb)); | 69 | EXT4_INODE_SIZE(inode->i_sb)); |
70 | 70 | ||
71 | raw->i_checksum_lo = cpu_to_le16(csum_lo); | 71 | raw->i_checksum_lo = cpu_to_le16(csum_lo); |
72 | if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && | 72 | if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && |
73 | EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) | 73 | EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) |
74 | raw->i_checksum_hi = cpu_to_le16(csum_hi); | 74 | raw->i_checksum_hi = cpu_to_le16(csum_hi); |
75 | 75 | ||
76 | return csum; | 76 | return csum; |
77 | } | 77 | } |
78 | 78 | ||
79 | static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw, | 79 | static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw, |
80 | struct ext4_inode_info *ei) | 80 | struct ext4_inode_info *ei) |
81 | { | 81 | { |
82 | __u32 provided, calculated; | 82 | __u32 provided, calculated; |
83 | 83 | ||
84 | if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != | 84 | if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != |
85 | cpu_to_le32(EXT4_OS_LINUX) || | 85 | cpu_to_le32(EXT4_OS_LINUX) || |
86 | !ext4_has_metadata_csum(inode->i_sb)) | 86 | !ext4_has_metadata_csum(inode->i_sb)) |
87 | return 1; | 87 | return 1; |
88 | 88 | ||
89 | provided = le16_to_cpu(raw->i_checksum_lo); | 89 | provided = le16_to_cpu(raw->i_checksum_lo); |
90 | calculated = ext4_inode_csum(inode, raw, ei); | 90 | calculated = ext4_inode_csum(inode, raw, ei); |
91 | if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && | 91 | if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && |
92 | EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) | 92 | EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) |
93 | provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16; | 93 | provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16; |
94 | else | 94 | else |
95 | calculated &= 0xFFFF; | 95 | calculated &= 0xFFFF; |
96 | 96 | ||
97 | return provided == calculated; | 97 | return provided == calculated; |
98 | } | 98 | } |
99 | 99 | ||
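The verify helper above shows why the checksum handling is split: the crc32c result is 32 bits, but only inodes larger than the good-old 128-byte format have room for the high half (i_checksum_hi), so small inodes compare just the low 16 bits, which is why `calculated` is masked. A hedged, stand-alone worked example of the split and recombination (the checksum value is invented for illustration):

#include <stdint.h>
#include <assert.h>

/* Worked example of how the 32-bit inode checksum is split across
 * i_checksum_lo / i_checksum_hi and recombined on verify. */
int main(void)
{
	uint32_t csum = 0x8899AABBu;	/* full crc32c result (made up) */
	uint16_t lo = csum & 0xFFFF;	/* stored in raw->i_checksum_lo */
	uint16_t hi = csum >> 16;	/* stored in raw->i_checksum_hi */

	/* Large inode: both halves stored, recombine for the compare. */
	uint32_t provided = lo | ((uint32_t)hi << 16);
	assert(provided == csum);

	/* 128-byte inode: no hi field, so mask the calculated value. */
	assert((csum & 0xFFFF) == lo);
	return 0;
}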
100 | static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, | 100 | static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, |
101 | struct ext4_inode_info *ei) | 101 | struct ext4_inode_info *ei) |
102 | { | 102 | { |
103 | __u32 csum; | 103 | __u32 csum; |
104 | 104 | ||
105 | if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != | 105 | if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != |
106 | cpu_to_le32(EXT4_OS_LINUX) || | 106 | cpu_to_le32(EXT4_OS_LINUX) || |
107 | !ext4_has_metadata_csum(inode->i_sb)) | 107 | !ext4_has_metadata_csum(inode->i_sb)) |
108 | return; | 108 | return; |
109 | 109 | ||
110 | csum = ext4_inode_csum(inode, raw, ei); | 110 | csum = ext4_inode_csum(inode, raw, ei); |
111 | raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF); | 111 | raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF); |
112 | if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && | 112 | if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && |
113 | EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) | 113 | EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) |
114 | raw->i_checksum_hi = cpu_to_le16(csum >> 16); | 114 | raw->i_checksum_hi = cpu_to_le16(csum >> 16); |
115 | } | 115 | } |
116 | 116 | ||
117 | static inline int ext4_begin_ordered_truncate(struct inode *inode, | 117 | static inline int ext4_begin_ordered_truncate(struct inode *inode, |
118 | loff_t new_size) | 118 | loff_t new_size) |
119 | { | 119 | { |
120 | trace_ext4_begin_ordered_truncate(inode, new_size); | 120 | trace_ext4_begin_ordered_truncate(inode, new_size); |
121 | /* | 121 | /* |
122 | * If jinode is zero, then we never opened the file for | 122 | * If jinode is zero, then we never opened the file for |
123 | * writing, so there's no need to call | 123 | * writing, so there's no need to call |
124 | * jbd2_journal_begin_ordered_truncate() since there's no | 124 | * jbd2_journal_begin_ordered_truncate() since there's no |
125 | * outstanding writes we need to flush. | 125 | * outstanding writes we need to flush. |
126 | */ | 126 | */ |
127 | if (!EXT4_I(inode)->jinode) | 127 | if (!EXT4_I(inode)->jinode) |
128 | return 0; | 128 | return 0; |
129 | return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode), | 129 | return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode), |
130 | EXT4_I(inode)->jinode, | 130 | EXT4_I(inode)->jinode, |
131 | new_size); | 131 | new_size); |
132 | } | 132 | } |
133 | 133 | ||
134 | static void ext4_invalidatepage(struct page *page, unsigned int offset, | 134 | static void ext4_invalidatepage(struct page *page, unsigned int offset, |
135 | unsigned int length); | 135 | unsigned int length); |
136 | static int __ext4_journalled_writepage(struct page *page, unsigned int len); | 136 | static int __ext4_journalled_writepage(struct page *page, unsigned int len); |
137 | static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); | 137 | static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); |
138 | static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, | 138 | static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, |
139 | int pextents); | 139 | int pextents); |
140 | 140 | ||
141 | /* | 141 | /* |
142 | * Test whether an inode is a fast symlink. | 142 | * Test whether an inode is a fast symlink. |
143 | */ | 143 | */ |
144 | static int ext4_inode_is_fast_symlink(struct inode *inode) | 144 | static int ext4_inode_is_fast_symlink(struct inode *inode) |
145 | { | 145 | { |
146 | int ea_blocks = EXT4_I(inode)->i_file_acl ? | 146 | int ea_blocks = EXT4_I(inode)->i_file_acl ? |
147 | EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; | 147 | EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; |
148 | 148 | ||
149 | if (ext4_has_inline_data(inode)) | 149 | if (ext4_has_inline_data(inode)) |
150 | return 0; | 150 | return 0; |
151 | 151 | ||
152 | return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); | 152 | return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); |
153 | } | 153 | } |
154 | 154 | ||
155 | /* | 155 | /* |
156 | * Restart the transaction associated with *handle. This does a commit, | 156 | * Restart the transaction associated with *handle. This does a commit, |
157 | * so before we call here everything must be consistently dirtied against | 157 | * so before we call here everything must be consistently dirtied against |
158 | * this transaction. | 158 | * this transaction. |
159 | */ | 159 | */ |
160 | int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, | 160 | int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, |
161 | int nblocks) | 161 | int nblocks) |
162 | { | 162 | { |
163 | int ret; | 163 | int ret; |
164 | 164 | ||
165 | /* | 165 | /* |
166 | * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this | 166 | * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this |
167 | * moment, get_block can be called only for blocks inside i_size since | 167 | * moment, get_block can be called only for blocks inside i_size since |
168 | * page cache has been already dropped and writes are blocked by | 168 | * page cache has been already dropped and writes are blocked by |
169 | * i_mutex. So we can safely drop the i_data_sem here. | 169 | * i_mutex. So we can safely drop the i_data_sem here. |
170 | */ | 170 | */ |
171 | BUG_ON(EXT4_JOURNAL(inode) == NULL); | 171 | BUG_ON(EXT4_JOURNAL(inode) == NULL); |
172 | jbd_debug(2, "restarting handle %p\n", handle); | 172 | jbd_debug(2, "restarting handle %p\n", handle); |
173 | up_write(&EXT4_I(inode)->i_data_sem); | 173 | up_write(&EXT4_I(inode)->i_data_sem); |
174 | ret = ext4_journal_restart(handle, nblocks); | 174 | ret = ext4_journal_restart(handle, nblocks); |
175 | down_write(&EXT4_I(inode)->i_data_sem); | 175 | down_write(&EXT4_I(inode)->i_data_sem); |
176 | ext4_discard_preallocations(inode); | 176 | ext4_discard_preallocations(inode); |
177 | 177 | ||
178 | return ret; | 178 | return ret; |
179 | } | 179 | } |
180 | 180 | ||
181 | /* | 181 | /* |
182 | * Called at the last iput() if i_nlink is zero. | 182 | * Called at the last iput() if i_nlink is zero. |
183 | */ | 183 | */ |
184 | void ext4_evict_inode(struct inode *inode) | 184 | void ext4_evict_inode(struct inode *inode) |
185 | { | 185 | { |
186 | handle_t *handle; | 186 | handle_t *handle; |
187 | int err; | 187 | int err; |
188 | 188 | ||
189 | trace_ext4_evict_inode(inode); | 189 | trace_ext4_evict_inode(inode); |
190 | 190 | ||
191 | if (inode->i_nlink) { | 191 | if (inode->i_nlink) { |
192 | /* | 192 | /* |
192 | * When journalling data, dirty buffers are tracked only in the | 192 | * When journalling data, dirty buffers are tracked only in the |
193 | * journal. So although the mm thinks everything is clean and | 193 | * journal. So although the mm thinks everything is clean and |
194 | * ready for reaping, the inode might still have some pages to | 194 | * ready for reaping, the inode might still have some pages to |
196 | * write in the running transaction or waiting to be | 196 | * write in the running transaction or waiting to be |
197 | * checkpointed. Thus calling jbd2_journal_invalidatepage() | 197 | * checkpointed. Thus calling jbd2_journal_invalidatepage() |
198 | * (via truncate_inode_pages()) to discard these buffers can | 198 | * (via truncate_inode_pages()) to discard these buffers can |
199 | * cause data loss. Also even if we did not discard these | 199 | * cause data loss. Also even if we did not discard these |
200 | * buffers, we would have no way to find them after the inode | 200 | * buffers, we would have no way to find them after the inode |
201 | * is reaped and thus user could see stale data if he tries to | 201 | * is reaped and thus user could see stale data if he tries to |
202 | * read them before the transaction is checkpointed. So be | 202 | * read them before the transaction is checkpointed. So be |
203 | * careful and force everything to disk here... We use | 203 | * careful and force everything to disk here... We use |
204 | * ei->i_datasync_tid to store the newest transaction | 204 | * ei->i_datasync_tid to store the newest transaction |
205 | * containing inode's data. | 205 | * containing inode's data. |
206 | * | 206 | * |
207 | * Note that directories do not have this problem because they | 207 | * Note that directories do not have this problem because they |
208 | * don't use page cache. | 208 | * don't use page cache. |
209 | */ | 209 | */ |
210 | if (ext4_should_journal_data(inode) && | 210 | if (ext4_should_journal_data(inode) && |
211 | (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) && | 211 | (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) && |
212 | inode->i_ino != EXT4_JOURNAL_INO) { | 212 | inode->i_ino != EXT4_JOURNAL_INO) { |
213 | journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; | 213 | journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; |
214 | tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; | 214 | tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; |
215 | 215 | ||
216 | jbd2_complete_transaction(journal, commit_tid); | 216 | jbd2_complete_transaction(journal, commit_tid); |
217 | filemap_write_and_wait(&inode->i_data); | 217 | filemap_write_and_wait(&inode->i_data); |
218 | } | 218 | } |
219 | truncate_inode_pages_final(&inode->i_data); | 219 | truncate_inode_pages_final(&inode->i_data); |
220 | 220 | ||
221 | WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); | 221 | WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); |
222 | goto no_delete; | 222 | goto no_delete; |
223 | } | 223 | } |
224 | 224 | ||
225 | if (is_bad_inode(inode)) | 225 | if (is_bad_inode(inode)) |
226 | goto no_delete; | 226 | goto no_delete; |
227 | dquot_initialize(inode); | 227 | dquot_initialize(inode); |
228 | 228 | ||
229 | if (ext4_should_order_data(inode)) | 229 | if (ext4_should_order_data(inode)) |
230 | ext4_begin_ordered_truncate(inode, 0); | 230 | ext4_begin_ordered_truncate(inode, 0); |
231 | truncate_inode_pages_final(&inode->i_data); | 231 | truncate_inode_pages_final(&inode->i_data); |
232 | 232 | ||
233 | WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); | 233 | WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); |
234 | 234 | ||
235 | /* | 235 | /* |
236 | * Protect us against freezing - iput() caller didn't have to have any | 236 | * Protect us against freezing - iput() caller didn't have to have any |
237 | * protection against it | 237 | * protection against it |
238 | */ | 238 | */ |
239 | sb_start_intwrite(inode->i_sb); | 239 | sb_start_intwrite(inode->i_sb); |
240 | handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, | 240 | handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, |
241 | ext4_blocks_for_truncate(inode)+3); | 241 | ext4_blocks_for_truncate(inode)+3); |
242 | if (IS_ERR(handle)) { | 242 | if (IS_ERR(handle)) { |
243 | ext4_std_error(inode->i_sb, PTR_ERR(handle)); | 243 | ext4_std_error(inode->i_sb, PTR_ERR(handle)); |
244 | /* | 244 | /* |
245 | * If we're going to skip the normal cleanup, we still need to | 245 | * If we're going to skip the normal cleanup, we still need to |
246 | * make sure that the in-core orphan linked list is properly | 246 | * make sure that the in-core orphan linked list is properly |
247 | * cleaned up. | 247 | * cleaned up. |
248 | */ | 248 | */ |
249 | ext4_orphan_del(NULL, inode); | 249 | ext4_orphan_del(NULL, inode); |
250 | sb_end_intwrite(inode->i_sb); | 250 | sb_end_intwrite(inode->i_sb); |
251 | goto no_delete; | 251 | goto no_delete; |
252 | } | 252 | } |
253 | 253 | ||
254 | if (IS_SYNC(inode)) | 254 | if (IS_SYNC(inode)) |
255 | ext4_handle_sync(handle); | 255 | ext4_handle_sync(handle); |
256 | inode->i_size = 0; | 256 | inode->i_size = 0; |
257 | err = ext4_mark_inode_dirty(handle, inode); | 257 | err = ext4_mark_inode_dirty(handle, inode); |
258 | if (err) { | 258 | if (err) { |
259 | ext4_warning(inode->i_sb, | 259 | ext4_warning(inode->i_sb, |
260 | "couldn't mark inode dirty (err %d)", err); | 260 | "couldn't mark inode dirty (err %d)", err); |
261 | goto stop_handle; | 261 | goto stop_handle; |
262 | } | 262 | } |
263 | if (inode->i_blocks) | 263 | if (inode->i_blocks) |
264 | ext4_truncate(inode); | 264 | ext4_truncate(inode); |
265 | 265 | ||
266 | /* | 266 | /* |
267 | * ext4_ext_truncate() doesn't reserve any slop when it | 267 | * ext4_ext_truncate() doesn't reserve any slop when it |
268 | * restarts journal transactions; therefore there may not be | 268 | * restarts journal transactions; therefore there may not be |
269 | * enough credits left in the handle to remove the inode from | 269 | * enough credits left in the handle to remove the inode from |
270 | * the orphan list and set the dtime field. | 270 | * the orphan list and set the dtime field. |
271 | */ | 271 | */ |
272 | if (!ext4_handle_has_enough_credits(handle, 3)) { | 272 | if (!ext4_handle_has_enough_credits(handle, 3)) { |
273 | err = ext4_journal_extend(handle, 3); | 273 | err = ext4_journal_extend(handle, 3); |
274 | if (err > 0) | 274 | if (err > 0) |
275 | err = ext4_journal_restart(handle, 3); | 275 | err = ext4_journal_restart(handle, 3); |
276 | if (err != 0) { | 276 | if (err != 0) { |
277 | ext4_warning(inode->i_sb, | 277 | ext4_warning(inode->i_sb, |
278 | "couldn't extend journal (err %d)", err); | 278 | "couldn't extend journal (err %d)", err); |
279 | stop_handle: | 279 | stop_handle: |
280 | ext4_journal_stop(handle); | 280 | ext4_journal_stop(handle); |
281 | ext4_orphan_del(NULL, inode); | 281 | ext4_orphan_del(NULL, inode); |
282 | sb_end_intwrite(inode->i_sb); | 282 | sb_end_intwrite(inode->i_sb); |
283 | goto no_delete; | 283 | goto no_delete; |
284 | } | 284 | } |
285 | } | 285 | } |
286 | 286 | ||
287 | /* | 287 | /* |
288 | * Kill off the orphan record which ext4_truncate created. | 288 | * Kill off the orphan record which ext4_truncate created. |
289 | * AKPM: I think this can be inside the above `if'. | 289 | * AKPM: I think this can be inside the above `if'. |
290 | * Note that ext4_orphan_del() has to be able to cope with the | 290 | * Note that ext4_orphan_del() has to be able to cope with the |
291 | * deletion of a non-existent orphan - this is because we don't | 291 | * deletion of a non-existent orphan - this is because we don't |
292 | * know if ext4_truncate() actually created an orphan record. | 292 | * know if ext4_truncate() actually created an orphan record. |
293 | * (Well, we could do this if we need to, but heck - it works) | 293 | * (Well, we could do this if we need to, but heck - it works) |
294 | */ | 294 | */ |
295 | ext4_orphan_del(handle, inode); | 295 | ext4_orphan_del(handle, inode); |
296 | EXT4_I(inode)->i_dtime = get_seconds(); | 296 | EXT4_I(inode)->i_dtime = get_seconds(); |
297 | 297 | ||
298 | /* | 298 | /* |
299 | * One subtle ordering requirement: if anything has gone wrong | 299 | * One subtle ordering requirement: if anything has gone wrong |
300 | * (transaction abort, IO errors, whatever), then we can still | 300 | * (transaction abort, IO errors, whatever), then we can still |
301 | * do these next steps (the fs will already have been marked as | 301 | * do these next steps (the fs will already have been marked as |
302 | * having errors), but we can't free the inode if the mark_dirty | 302 | * having errors), but we can't free the inode if the mark_dirty |
303 | * fails. | 303 | * fails. |
304 | */ | 304 | */ |
305 | if (ext4_mark_inode_dirty(handle, inode)) | 305 | if (ext4_mark_inode_dirty(handle, inode)) |
306 | /* If that failed, just do the required in-core inode clear. */ | 306 | /* If that failed, just do the required in-core inode clear. */ |
307 | ext4_clear_inode(inode); | 307 | ext4_clear_inode(inode); |
308 | else | 308 | else |
309 | ext4_free_inode(handle, inode); | 309 | ext4_free_inode(handle, inode); |
310 | ext4_journal_stop(handle); | 310 | ext4_journal_stop(handle); |
311 | sb_end_intwrite(inode->i_sb); | 311 | sb_end_intwrite(inode->i_sb); |
312 | return; | 312 | return; |
313 | no_delete: | 313 | no_delete: |
314 | ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ | 314 | ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ |
315 | } | 315 | } |
316 | 316 | ||
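ext4_evict_inode() above also demonstrates a common jbd2 credit idiom: before the final orphan-list and dtime updates it checks whether the handle still has enough credits, tries to extend it in place, and only restarts the transaction when the journal refuses the extension. A hedged sketch of that idiom as a hypothetical helper (the ext4_* calls are exactly the ones used above; ext4_journal_extend() returns > 0 when it cannot extend in place):

/*
 * Hypothetical helper sketching the extend-or-restart idiom used in
 * ext4_evict_inode(): make sure the handle still has `needed`
 * credits, restarting the transaction if it cannot be extended.
 */
static int ensure_handle_credits(handle_t *handle, struct inode *inode,
				 int needed)
{
	int err = 0;

	if (!ext4_handle_has_enough_credits(handle, needed)) {
		err = ext4_journal_extend(handle, needed);
		if (err > 0)	/* extend refused: commit and restart */
			err = ext4_journal_restart(handle, needed);
		if (err)
			ext4_warning(inode->i_sb,
				     "couldn't extend journal (err %d)", err);
	}
	return err;
}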
317 | #ifdef CONFIG_QUOTA | 317 | #ifdef CONFIG_QUOTA |
318 | qsize_t *ext4_get_reserved_space(struct inode *inode) | 318 | qsize_t *ext4_get_reserved_space(struct inode *inode) |
319 | { | 319 | { |
320 | return &EXT4_I(inode)->i_reserved_quota; | 320 | return &EXT4_I(inode)->i_reserved_quota; |
321 | } | 321 | } |
322 | #endif | 322 | #endif |
323 | 323 | ||
324 | /* | 324 | /* |
325 | * Called with i_data_sem down, which is important since we can call | 325 | * Called with i_data_sem down, which is important since we can call |
326 | * ext4_discard_preallocations() from here. | 326 | * ext4_discard_preallocations() from here. |
327 | */ | 327 | */ |
328 | void ext4_da_update_reserve_space(struct inode *inode, | 328 | void ext4_da_update_reserve_space(struct inode *inode, |
329 | int used, int quota_claim) | 329 | int used, int quota_claim) |
330 | { | 330 | { |
331 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 331 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
332 | struct ext4_inode_info *ei = EXT4_I(inode); | 332 | struct ext4_inode_info *ei = EXT4_I(inode); |
333 | 333 | ||
334 | spin_lock(&ei->i_block_reservation_lock); | 334 | spin_lock(&ei->i_block_reservation_lock); |
335 | trace_ext4_da_update_reserve_space(inode, used, quota_claim); | 335 | trace_ext4_da_update_reserve_space(inode, used, quota_claim); |
336 | if (unlikely(used > ei->i_reserved_data_blocks)) { | 336 | if (unlikely(used > ei->i_reserved_data_blocks)) { |
337 | ext4_warning(inode->i_sb, "%s: ino %lu, used %d " | 337 | ext4_warning(inode->i_sb, "%s: ino %lu, used %d " |
338 | "with only %d reserved data blocks", | 338 | "with only %d reserved data blocks", |
339 | __func__, inode->i_ino, used, | 339 | __func__, inode->i_ino, used, |
340 | ei->i_reserved_data_blocks); | 340 | ei->i_reserved_data_blocks); |
341 | WARN_ON(1); | 341 | WARN_ON(1); |
342 | used = ei->i_reserved_data_blocks; | 342 | used = ei->i_reserved_data_blocks; |
343 | } | 343 | } |
344 | 344 | ||
345 | /* Update per-inode reservations */ | 345 | /* Update per-inode reservations */ |
346 | ei->i_reserved_data_blocks -= used; | 346 | ei->i_reserved_data_blocks -= used; |
347 | percpu_counter_sub(&sbi->s_dirtyclusters_counter, used); | 347 | percpu_counter_sub(&sbi->s_dirtyclusters_counter, used); |
348 | 348 | ||
349 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | 349 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); |
350 | 350 | ||
351 | /* Update quota subsystem for data blocks */ | 351 | /* Update quota subsystem for data blocks */ |
352 | if (quota_claim) | 352 | if (quota_claim) |
353 | dquot_claim_block(inode, EXT4_C2B(sbi, used)); | 353 | dquot_claim_block(inode, EXT4_C2B(sbi, used)); |
354 | else { | 354 | else { |
355 | /* | 355 | /* |
356 | * We did fallocate with an offset that is already delayed | 356 | * We did fallocate with an offset that is already delayed |
357 | * allocated. So on delayed allocated writeback we should | 357 | * allocated. So on delayed allocated writeback we should |
358 | * not re-claim the quota for fallocated blocks. | 358 | * not re-claim the quota for fallocated blocks. |
359 | */ | 359 | */ |
360 | dquot_release_reservation_block(inode, EXT4_C2B(sbi, used)); | 360 | dquot_release_reservation_block(inode, EXT4_C2B(sbi, used)); |
361 | } | 361 | } |
362 | 362 | ||
363 | /* | 363 | /* |
364 | * If we have done all the pending block allocations and if | 364 | * If we have done all the pending block allocations and if |
365 | * there aren't any writers on the inode, we can discard the | 365 | * there aren't any writers on the inode, we can discard the |
366 | * inode's preallocations. | 366 | * inode's preallocations. |
367 | */ | 367 | */ |
368 | if ((ei->i_reserved_data_blocks == 0) && | 368 | if ((ei->i_reserved_data_blocks == 0) && |
369 | (atomic_read(&inode->i_writecount) == 0)) | 369 | (atomic_read(&inode->i_writecount) == 0)) |
370 | ext4_discard_preallocations(inode); | 370 | ext4_discard_preallocations(inode); |
371 | } | 371 | } |
372 | 372 | ||
373 | static int __check_block_validity(struct inode *inode, const char *func, | 373 | static int __check_block_validity(struct inode *inode, const char *func, |
374 | unsigned int line, | 374 | unsigned int line, |
375 | struct ext4_map_blocks *map) | 375 | struct ext4_map_blocks *map) |
376 | { | 376 | { |
377 | if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk, | 377 | if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk, |
378 | map->m_len)) { | 378 | map->m_len)) { |
379 | ext4_error_inode(inode, func, line, map->m_pblk, | 379 | ext4_error_inode(inode, func, line, map->m_pblk, |
380 | "lblock %lu mapped to illegal pblock " | 380 | "lblock %lu mapped to illegal pblock " |
381 | "(length %d)", (unsigned long) map->m_lblk, | 381 | "(length %d)", (unsigned long) map->m_lblk, |
382 | map->m_len); | 382 | map->m_len); |
383 | return -EIO; | 383 | return -EIO; |
384 | } | 384 | } |
385 | return 0; | 385 | return 0; |
386 | } | 386 | } |
387 | 387 | ||
388 | #define check_block_validity(inode, map) \ | 388 | #define check_block_validity(inode, map) \ |
389 | __check_block_validity((inode), __func__, __LINE__, (map)) | 389 | __check_block_validity((inode), __func__, __LINE__, (map)) |
390 | 390 | ||
391 | #ifdef ES_AGGRESSIVE_TEST | 391 | #ifdef ES_AGGRESSIVE_TEST |
392 | static void ext4_map_blocks_es_recheck(handle_t *handle, | 392 | static void ext4_map_blocks_es_recheck(handle_t *handle, |
393 | struct inode *inode, | 393 | struct inode *inode, |
394 | struct ext4_map_blocks *es_map, | 394 | struct ext4_map_blocks *es_map, |
395 | struct ext4_map_blocks *map, | 395 | struct ext4_map_blocks *map, |
396 | int flags) | 396 | int flags) |
397 | { | 397 | { |
398 | int retval; | 398 | int retval; |
399 | 399 | ||
400 | map->m_flags = 0; | 400 | map->m_flags = 0; |
401 | /* | 401 | /* |
402 | * There is a race window in which the result is not the same, | 402 | * There is a race window in which the result is not the same, |
403 | * e.g. xfstests #223 when dioread_nolock is enabled. The reason | 403 | * e.g. xfstests #223 when dioread_nolock is enabled. The reason |
404 | * is that we look up a block mapping in the extent status tree | 404 | * is that we look up a block mapping in the extent status tree |
405 | * without taking i_data_sem. So in the meantime the unwritten | 405 | * without taking i_data_sem. So in the meantime the unwritten |
406 | * extent could be converted. | 406 | * extent could be converted. |
407 | */ | 407 | */ |
408 | if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) | 408 | if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) |
409 | down_read(&EXT4_I(inode)->i_data_sem); | 409 | down_read(&EXT4_I(inode)->i_data_sem); |
410 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { | 410 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { |
411 | retval = ext4_ext_map_blocks(handle, inode, map, flags & | 411 | retval = ext4_ext_map_blocks(handle, inode, map, flags & |
412 | EXT4_GET_BLOCKS_KEEP_SIZE); | 412 | EXT4_GET_BLOCKS_KEEP_SIZE); |
413 | } else { | 413 | } else { |
414 | retval = ext4_ind_map_blocks(handle, inode, map, flags & | 414 | retval = ext4_ind_map_blocks(handle, inode, map, flags & |
415 | EXT4_GET_BLOCKS_KEEP_SIZE); | 415 | EXT4_GET_BLOCKS_KEEP_SIZE); |
416 | } | 416 | } |
417 | if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) | 417 | if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) |
418 | up_read((&EXT4_I(inode)->i_data_sem)); | 418 | up_read((&EXT4_I(inode)->i_data_sem)); |
419 | 419 | ||
420 | /* | 420 | /* |
421 | * We don't check m_len because the extent will be collapsed in the | 421 | * We don't check m_len because the extent will be collapsed in the |
422 | * status tree, so the lengths might not be equal. | 422 | * status tree, so the lengths might not be equal. |
423 | */ | 423 | */ |
424 | if (es_map->m_lblk != map->m_lblk || | 424 | if (es_map->m_lblk != map->m_lblk || |
425 | es_map->m_flags != map->m_flags || | 425 | es_map->m_flags != map->m_flags || |
426 | es_map->m_pblk != map->m_pblk) { | 426 | es_map->m_pblk != map->m_pblk) { |
427 | printk("ES cache assertion failed for inode: %lu " | 427 | printk("ES cache assertion failed for inode: %lu " |
428 | "es_cached ex [%d/%d/%llu/%x] != " | 428 | "es_cached ex [%d/%d/%llu/%x] != " |
429 | "found ex [%d/%d/%llu/%x] retval %d flags %x\n", | 429 | "found ex [%d/%d/%llu/%x] retval %d flags %x\n", |
430 | inode->i_ino, es_map->m_lblk, es_map->m_len, | 430 | inode->i_ino, es_map->m_lblk, es_map->m_len, |
431 | es_map->m_pblk, es_map->m_flags, map->m_lblk, | 431 | es_map->m_pblk, es_map->m_flags, map->m_lblk, |
432 | map->m_len, map->m_pblk, map->m_flags, | 432 | map->m_len, map->m_pblk, map->m_flags, |
433 | retval, flags); | 433 | retval, flags); |
434 | } | 434 | } |
435 | } | 435 | } |
436 | #endif /* ES_AGGRESSIVE_TEST */ | 436 | #endif /* ES_AGGRESSIVE_TEST */ |
437 | 437 | ||
438 | /* | 438 | /* |
439 | * The ext4_map_blocks() function tries to look up the requested blocks | 439 | * The ext4_map_blocks() function tries to look up the requested blocks |
440 | * and returns the existing mapping if the blocks are already mapped. | 440 | * and returns the existing mapping if the blocks are already mapped. |
441 | * | 441 | * |
442 | * Otherwise it takes the write lock of i_data_sem, allocates blocks, | 442 | * Otherwise it takes the write lock of i_data_sem, allocates blocks, |
443 | * stores the allocated blocks in the result buffer head, and marks it | 443 | * stores the allocated blocks in the result buffer head, and marks it |
444 | * mapped. | 444 | * mapped. |
445 | * | 445 | * |
446 | * If the file is extent based, it calls ext4_ext_map_blocks(); | 446 | * If the file is extent based, it calls ext4_ext_map_blocks(); |
447 | * otherwise, it calls ext4_ind_map_blocks() to handle indirect-mapped | 447 | * otherwise, it calls ext4_ind_map_blocks() to handle indirect-mapped |
448 | * files. | 448 | * files. |
449 | * | 449 | * |
450 | * On success, it returns the number of blocks mapped or allocated. | 450 | * On success, it returns the number of blocks mapped or allocated. |
451 | * If create == 0 and the blocks are pre-allocated and unwritten, | 451 | * If create == 0 and the blocks are pre-allocated and unwritten, |
452 | * the result buffer head is unmapped. If create == 1, it makes sure | 452 | * the result buffer head is unmapped. If create == 1, it makes sure |
453 | * the buffer head is mapped. | 453 | * the buffer head is mapped. |
454 | * | 454 | * |
455 | * It returns 0 if a plain lookup failed (blocks have not been | 455 | * It returns 0 if a plain lookup failed (blocks have not been |
456 | * allocated); in that case, the buffer head is unmapped. | 456 | * allocated); in that case, the buffer head is unmapped. |
457 | * | 457 | * |
458 | * It returns an error in case of allocation failure. | 458 | * It returns an error in case of allocation failure. |
459 | */ | 459 | */ |
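Before the function itself, a hedged caller-side illustration of the contract just described (hypothetical caller, not part of this patch): with no EXT4_GET_BLOCKS_CREATE flag the call is a pure lookup, a positive return is the number of blocks mapped, and zero means a hole or a delayed allocation.

/* Hypothetical lookup-only caller of ext4_map_blocks(). */
static void sample_lookup(struct inode *inode)
{
	struct ext4_map_blocks map = { .m_lblk = 0, .m_len = 8 };
	int ret = ext4_map_blocks(NULL, inode, &map, 0); /* no CREATE */

	if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED)) {
		/* blocks map.m_pblk .. map.m_pblk + ret - 1 are on disk */
	} else if (ret == 0) {
		/* hole or delayed extent: nothing allocated yet */
	} else {
		/* ret < 0: error (e.g. -EIO for an invalid mapping) */
	}
}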
460 | int ext4_map_blocks(handle_t *handle, struct inode *inode, | 460 | int ext4_map_blocks(handle_t *handle, struct inode *inode, |
461 | struct ext4_map_blocks *map, int flags) | 461 | struct ext4_map_blocks *map, int flags) |
462 | { | 462 | { |
463 | struct extent_status es; | 463 | struct extent_status es; |
464 | int retval; | 464 | int retval; |
465 | int ret = 0; | 465 | int ret = 0; |
466 | #ifdef ES_AGGRESSIVE_TEST | 466 | #ifdef ES_AGGRESSIVE_TEST |
467 | struct ext4_map_blocks orig_map; | 467 | struct ext4_map_blocks orig_map; |
468 | 468 | ||
469 | memcpy(&orig_map, map, sizeof(*map)); | 469 | memcpy(&orig_map, map, sizeof(*map)); |
470 | #endif | 470 | #endif |
471 | 471 | ||
472 | map->m_flags = 0; | 472 | map->m_flags = 0; |
473 | ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u," | 473 | ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u," |
474 | "logical block %lu\n", inode->i_ino, flags, map->m_len, | 474 | "logical block %lu\n", inode->i_ino, flags, map->m_len, |
475 | (unsigned long) map->m_lblk); | 475 | (unsigned long) map->m_lblk); |
476 | 476 | ||
477 | /* | 477 | /* |
478 | * ext4_map_blocks returns an int, and m_len is an unsigned int | 478 | * ext4_map_blocks returns an int, and m_len is an unsigned int |
479 | */ | 479 | */ |
480 | if (unlikely(map->m_len > INT_MAX)) | 480 | if (unlikely(map->m_len > INT_MAX)) |
481 | map->m_len = INT_MAX; | 481 | map->m_len = INT_MAX; |
482 | 482 | ||
483 | /* We can only handle block numbers less than EXT_MAX_BLOCKS */ | 483 | /* We can only handle block numbers less than EXT_MAX_BLOCKS */ |
484 | if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) | 484 | if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) |
485 | return -EIO; | 485 | return -EIO; |
486 | 486 | ||
487 | /* Lookup extent status tree firstly */ | 487 | /* Lookup extent status tree firstly */ |
488 | if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { | 488 | if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { |
489 | if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { | 489 | if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { |
490 | map->m_pblk = ext4_es_pblock(&es) + | 490 | map->m_pblk = ext4_es_pblock(&es) + |
491 | map->m_lblk - es.es_lblk; | 491 | map->m_lblk - es.es_lblk; |
492 | map->m_flags |= ext4_es_is_written(&es) ? | 492 | map->m_flags |= ext4_es_is_written(&es) ? |
493 | EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN; | 493 | EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN; |
494 | retval = es.es_len - (map->m_lblk - es.es_lblk); | 494 | retval = es.es_len - (map->m_lblk - es.es_lblk); |
495 | if (retval > map->m_len) | 495 | if (retval > map->m_len) |
496 | retval = map->m_len; | 496 | retval = map->m_len; |
497 | map->m_len = retval; | 497 | map->m_len = retval; |
498 | } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) { | 498 | } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) { |
499 | retval = 0; | 499 | retval = 0; |
500 | } else { | 500 | } else { |
501 | BUG_ON(1); | 501 | BUG_ON(1); |
502 | } | 502 | } |
503 | #ifdef ES_AGGRESSIVE_TEST | 503 | #ifdef ES_AGGRESSIVE_TEST |
504 | ext4_map_blocks_es_recheck(handle, inode, map, | 504 | ext4_map_blocks_es_recheck(handle, inode, map, |
505 | &orig_map, flags); | 505 | &orig_map, flags); |
506 | #endif | 506 | #endif |
507 | goto found; | 507 | goto found; |
508 | } | 508 | } |
509 | 509 | ||
510 | /* | 510 | /* |
511 | * Try to see if we can get the block without requesting a new | 511 | * Try to see if we can get the block without requesting a new |
512 | * file system block. | 512 | * file system block. |
513 | */ | 513 | */ |
514 | if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) | 514 | if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) |
515 | down_read(&EXT4_I(inode)->i_data_sem); | 515 | down_read(&EXT4_I(inode)->i_data_sem); |
516 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { | 516 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { |
517 | retval = ext4_ext_map_blocks(handle, inode, map, flags & | 517 | retval = ext4_ext_map_blocks(handle, inode, map, flags & |
518 | EXT4_GET_BLOCKS_KEEP_SIZE); | 518 | EXT4_GET_BLOCKS_KEEP_SIZE); |
519 | } else { | 519 | } else { |
520 | retval = ext4_ind_map_blocks(handle, inode, map, flags & | 520 | retval = ext4_ind_map_blocks(handle, inode, map, flags & |
521 | EXT4_GET_BLOCKS_KEEP_SIZE); | 521 | EXT4_GET_BLOCKS_KEEP_SIZE); |
522 | } | 522 | } |
523 | if (retval > 0) { | 523 | if (retval > 0) { |
524 | unsigned int status; | 524 | unsigned int status; |
525 | 525 | ||
526 | if (unlikely(retval != map->m_len)) { | 526 | if (unlikely(retval != map->m_len)) { |
527 | ext4_warning(inode->i_sb, | 527 | ext4_warning(inode->i_sb, |
528 | "ES len assertion failed for inode " | 528 | "ES len assertion failed for inode " |
529 | "%lu: retval %d != map->m_len %d", | 529 | "%lu: retval %d != map->m_len %d", |
530 | inode->i_ino, retval, map->m_len); | 530 | inode->i_ino, retval, map->m_len); |
531 | WARN_ON(1); | 531 | WARN_ON(1); |
532 | } | 532 | } |
533 | 533 | ||
534 | status = map->m_flags & EXT4_MAP_UNWRITTEN ? | 534 | status = map->m_flags & EXT4_MAP_UNWRITTEN ? |
535 | EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; | 535 | EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; |
536 | if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && | 536 | if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && |
537 | ext4_find_delalloc_range(inode, map->m_lblk, | 537 | ext4_find_delalloc_range(inode, map->m_lblk, |
538 | map->m_lblk + map->m_len - 1)) | 538 | map->m_lblk + map->m_len - 1)) |
539 | status |= EXTENT_STATUS_DELAYED; | 539 | status |= EXTENT_STATUS_DELAYED; |
540 | ret = ext4_es_insert_extent(inode, map->m_lblk, | 540 | ret = ext4_es_insert_extent(inode, map->m_lblk, |
541 | map->m_len, map->m_pblk, status); | 541 | map->m_len, map->m_pblk, status); |
542 | if (ret < 0) | 542 | if (ret < 0) |
543 | retval = ret; | 543 | retval = ret; |
544 | } | 544 | } |
545 | if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) | 545 | if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) |
546 | up_read((&EXT4_I(inode)->i_data_sem)); | 546 | up_read((&EXT4_I(inode)->i_data_sem)); |
547 | 547 | ||
548 | found: | 548 | found: |
549 | if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { | 549 | if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { |
550 | ret = check_block_validity(inode, map); | 550 | ret = check_block_validity(inode, map); |
551 | if (ret != 0) | 551 | if (ret != 0) |
552 | return ret; | 552 | return ret; |
553 | } | 553 | } |
554 | 554 | ||
555 | /* If this is only a block(s) lookup */ | 555 | /* If this is only a block(s) lookup */ |
556 | if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) | 556 | if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) |
557 | return retval; | 557 | return retval; |
558 | 558 | ||
559 | /* | 559 | /* |
560 | * Return if the blocks have already been allocated. | 560 | * Return if the blocks have already been allocated. |
561 | * | 561 | * |
562 | * Note that if blocks have been preallocated, | 562 | * Note that if blocks have been preallocated, |
563 | * ext4_ext_get_block() returns with create = 0 | 563 | * ext4_ext_get_block() returns with create = 0 |
564 | * and the buffer head unmapped. | 564 | * and the buffer head unmapped. |
565 | */ | 565 | */ |
566 | if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) | 566 | if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) |
567 | /* | 567 | /* |
568 | * If we need to convert extent to unwritten | 568 | * If we need to convert extent to unwritten |
569 | * we continue and do the actual work in | 569 | * we continue and do the actual work in |
570 | * ext4_ext_map_blocks() | 570 | * ext4_ext_map_blocks() |
571 | */ | 571 | */ |
572 | if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) | 572 | if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) |
573 | return retval; | 573 | return retval; |
574 | 574 | ||
575 | /* | 575 | /* |
576 | * Here we clear m_flags because after allocating a new extent, | 576 | * Here we clear m_flags because after allocating a new extent, |
577 | * it will be set again. | 577 | * it will be set again. |
578 | */ | 578 | */ |
579 | map->m_flags &= ~EXT4_MAP_FLAGS; | 579 | map->m_flags &= ~EXT4_MAP_FLAGS; |
580 | 580 | ||
581 | /* | 581 | /* |
582 | * New block allocation and/or writing to an unwritten extent | 582 | * New block allocation and/or writing to an unwritten extent |
583 | * will possibly result in updating i_data, so we take | 583 | * will possibly result in updating i_data, so we take |
584 | * the write lock of i_data_sem and call get_block() | 584 | * the write lock of i_data_sem and call get_block() |
585 | * with the create == 1 flag. | 585 | * with the create == 1 flag. |
586 | */ | 586 | */ |
587 | down_write(&EXT4_I(inode)->i_data_sem); | 587 | down_write(&EXT4_I(inode)->i_data_sem); |
588 | 588 | ||
589 | /* | 589 | /* |
590 | * We need to re-check the extents flag here because migrate | 590 | * We need to re-check the extents flag here because migrate |
591 | * could have changed the inode type in between | 591 | * could have changed the inode type in between |
592 | */ | 592 | */ |
593 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { | 593 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { |
594 | retval = ext4_ext_map_blocks(handle, inode, map, flags); | 594 | retval = ext4_ext_map_blocks(handle, inode, map, flags); |
595 | } else { | 595 | } else { |
596 | retval = ext4_ind_map_blocks(handle, inode, map, flags); | 596 | retval = ext4_ind_map_blocks(handle, inode, map, flags); |
597 | 597 | ||
598 | if (retval > 0 && map->m_flags & EXT4_MAP_NEW) { | 598 | if (retval > 0 && map->m_flags & EXT4_MAP_NEW) { |
599 | /* | 599 | /* |
600 | * We allocated new blocks which will result in | 600 | * We allocated new blocks which will result in |
601 | * i_data's format changing. Force the migrate | 601 | * i_data's format changing. Force the migrate |
602 | * to fail by clearing migrate flags | 602 | * to fail by clearing migrate flags |
603 | */ | 603 | */ |
604 | ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); | 604 | ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); |
605 | } | 605 | } |
606 | 606 | ||
607 | /* | 607 | /* |
608 | * Update reserved blocks/metadata blocks after successful | 608 | * Update reserved blocks/metadata blocks after successful |
609 | * block allocation which had been deferred till now. We don't | 609 | * block allocation which had been deferred till now. We don't |
610 | * support fallocate for non-extent files, so we can update | 610 | * support fallocate for non-extent files, so we can update |
611 | * the reserved space here. | 611 | * the reserved space here. |
612 | */ | 612 | */ |
613 | if ((retval > 0) && | 613 | if ((retval > 0) && |
614 | (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) | 614 | (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) |
615 | ext4_da_update_reserve_space(inode, retval, 1); | 615 | ext4_da_update_reserve_space(inode, retval, 1); |
616 | } | 616 | } |
617 | 617 | ||
618 | if (retval > 0) { | 618 | if (retval > 0) { |
619 | unsigned int status; | 619 | unsigned int status; |
620 | 620 | ||
621 | if (unlikely(retval != map->m_len)) { | 621 | if (unlikely(retval != map->m_len)) { |
622 | ext4_warning(inode->i_sb, | 622 | ext4_warning(inode->i_sb, |
623 | "ES len assertion failed for inode " | 623 | "ES len assertion failed for inode " |
624 | "%lu: retval %d != map->m_len %d", | 624 | "%lu: retval %d != map->m_len %d", |
625 | inode->i_ino, retval, map->m_len); | 625 | inode->i_ino, retval, map->m_len); |
626 | WARN_ON(1); | 626 | WARN_ON(1); |
627 | } | 627 | } |
628 | 628 | ||
629 | /* | 629 | /* |
630 | * If the extent has been zeroed out, we don't need to update | 630 | * If the extent has been zeroed out, we don't need to update |
631 | * the extent status tree. | 631 | * the extent status tree. |
632 | */ | 632 | */ |
633 | if ((flags & EXT4_GET_BLOCKS_PRE_IO) && | 633 | if ((flags & EXT4_GET_BLOCKS_PRE_IO) && |
634 | ext4_es_lookup_extent(inode, map->m_lblk, &es)) { | 634 | ext4_es_lookup_extent(inode, map->m_lblk, &es)) { |
635 | if (ext4_es_is_written(&es)) | 635 | if (ext4_es_is_written(&es)) |
636 | goto has_zeroout; | 636 | goto has_zeroout; |
637 | } | 637 | } |
638 | status = map->m_flags & EXT4_MAP_UNWRITTEN ? | 638 | status = map->m_flags & EXT4_MAP_UNWRITTEN ? |
639 | EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; | 639 | EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; |
640 | if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && | 640 | if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && |
641 | ext4_find_delalloc_range(inode, map->m_lblk, | 641 | ext4_find_delalloc_range(inode, map->m_lblk, |
642 | map->m_lblk + map->m_len - 1)) | 642 | map->m_lblk + map->m_len - 1)) |
643 | status |= EXTENT_STATUS_DELAYED; | 643 | status |= EXTENT_STATUS_DELAYED; |
644 | ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, | 644 | ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, |
645 | map->m_pblk, status); | 645 | map->m_pblk, status); |
646 | if (ret < 0) | 646 | if (ret < 0) |
647 | retval = ret; | 647 | retval = ret; |
648 | } | 648 | } |
649 | 649 | ||
650 | has_zeroout: | 650 | has_zeroout: |
651 | up_write((&EXT4_I(inode)->i_data_sem)); | 651 | up_write((&EXT4_I(inode)->i_data_sem)); |
652 | if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { | 652 | if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { |
653 | ret = check_block_validity(inode, map); | 653 | ret = check_block_validity(inode, map); |
654 | if (ret != 0) | 654 | if (ret != 0) |
655 | return ret; | 655 | return ret; |
656 | } | 656 | } |
657 | return retval; | 657 | return retval; |
658 | } | 658 | } |
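/*
 * To recap the pattern above: the lookup runs under the read side of
 * i_data_sem (unless EXT4_GET_BLOCKS_NO_LOCK is set); only when the
 * caller asked for allocation (EXT4_GET_BLOCKS_CREATE) and the lookup
 * did not produce a usable mapping do we retake i_data_sem for
 * writing and do the real allocation.
 */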
659 | 659 | ||
660 | static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate) | 660 | static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate) |
661 | { | 661 | { |
662 | struct inode *inode = bh->b_assoc_map->host; | 662 | struct inode *inode = bh->b_assoc_map->host; |
663 | /* XXX: breaks on 32-bit > 16GB. Is that even supported? */ | 663 | /* XXX: breaks on 32-bit > 16GB. Is that even supported? */ |
664 | loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits; | 664 | loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits; |
665 | int err; | 665 | int err; |
666 | if (!uptodate) | 666 | if (!uptodate) |
667 | return; | 667 | return; |
668 | WARN_ON(!buffer_unwritten(bh)); | 668 | WARN_ON(!buffer_unwritten(bh)); |
669 | err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size); | 669 | err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size); |
670 | } | 670 | } |
671 | 671 | ||
672 | /* Maximum number of blocks we map for direct IO at once. */ | 672 | /* Maximum number of blocks we map for direct IO at once. */ |
673 | #define DIO_MAX_BLOCKS 4096 | 673 | #define DIO_MAX_BLOCKS 4096 |
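/*
 * This cap is what bounds the journal handle below: the DIO write
 * path sizes its transaction with ext4_chunk_trans_blocks(inode,
 * map.m_len), so limiting m_len to DIO_MAX_BLOCKS keeps the credit
 * request for a single mapping call to a sane size.
 */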
674 | 674 | ||
675 | static int _ext4_get_block(struct inode *inode, sector_t iblock, | 675 | static int _ext4_get_block(struct inode *inode, sector_t iblock, |
676 | struct buffer_head *bh, int flags) | 676 | struct buffer_head *bh, int flags) |
677 | { | 677 | { |
678 | handle_t *handle = ext4_journal_current_handle(); | 678 | handle_t *handle = ext4_journal_current_handle(); |
679 | struct ext4_map_blocks map; | 679 | struct ext4_map_blocks map; |
680 | int ret = 0, started = 0; | 680 | int ret = 0, started = 0; |
681 | int dio_credits; | 681 | int dio_credits; |
682 | 682 | ||
683 | if (ext4_has_inline_data(inode)) | 683 | if (ext4_has_inline_data(inode)) |
684 | return -ERANGE; | 684 | return -ERANGE; |
685 | 685 | ||
686 | map.m_lblk = iblock; | 686 | map.m_lblk = iblock; |
687 | map.m_len = bh->b_size >> inode->i_blkbits; | 687 | map.m_len = bh->b_size >> inode->i_blkbits; |
688 | 688 | ||
689 | if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) { | 689 | if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) { |
690 | /* Direct IO write... */ | 690 | /* Direct IO write... */ |
691 | if (map.m_len > DIO_MAX_BLOCKS) | 691 | if (map.m_len > DIO_MAX_BLOCKS) |
692 | map.m_len = DIO_MAX_BLOCKS; | 692 | map.m_len = DIO_MAX_BLOCKS; |
693 | dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); | 693 | dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); |
694 | handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, | 694 | handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, |
695 | dio_credits); | 695 | dio_credits); |
696 | if (IS_ERR(handle)) { | 696 | if (IS_ERR(handle)) { |
697 | ret = PTR_ERR(handle); | 697 | ret = PTR_ERR(handle); |
698 | return ret; | 698 | return ret; |
699 | } | 699 | } |
700 | started = 1; | 700 | started = 1; |
701 | } | 701 | } |
702 | 702 | ||
703 | ret = ext4_map_blocks(handle, inode, &map, flags); | 703 | ret = ext4_map_blocks(handle, inode, &map, flags); |
704 | if (ret > 0) { | 704 | if (ret > 0) { |
705 | ext4_io_end_t *io_end = ext4_inode_aio(inode); | 705 | ext4_io_end_t *io_end = ext4_inode_aio(inode); |
706 | 706 | ||
707 | map_bh(bh, inode->i_sb, map.m_pblk); | 707 | map_bh(bh, inode->i_sb, map.m_pblk); |
708 | bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; | 708 | bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; |
709 | if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) { | 709 | if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) { |
710 | bh->b_assoc_map = inode->i_mapping; | 710 | bh->b_assoc_map = inode->i_mapping; |
711 | bh->b_private = (void *)(unsigned long)iblock; | 711 | bh->b_private = (void *)(unsigned long)iblock; |
712 | bh->b_end_io = ext4_end_io_unwritten; | 712 | bh->b_end_io = ext4_end_io_unwritten; |
713 | } | 713 | } |
714 | if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) | 714 | if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) |
715 | set_buffer_defer_completion(bh); | 715 | set_buffer_defer_completion(bh); |
716 | bh->b_size = inode->i_sb->s_blocksize * map.m_len; | 716 | bh->b_size = inode->i_sb->s_blocksize * map.m_len; |
717 | ret = 0; | 717 | ret = 0; |
718 | } | 718 | } |
719 | if (started) | 719 | if (started) |
720 | ext4_journal_stop(handle); | 720 | ext4_journal_stop(handle); |
721 | return ret; | 721 | return ret; |
722 | } | 722 | } |
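/*
 * A minimal sketch of how a get_block helper like the one above is
 * driven (hypothetical caller, for illustration only; the real entry
 * points are ext4_get_block() below and the direct-IO callbacks):
 *
 *	struct buffer_head bh = { .b_size = inode->i_sb->s_blocksize };
 *
 *	err = _ext4_get_block(inode, iblock, &bh, EXT4_GET_BLOCKS_CREATE);
 *	if (!err && buffer_mapped(&bh))
 *		... bh.b_blocknr now holds the physical block ...
 */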
723 | 723 | ||
724 | int ext4_get_block(struct inode *inode, sector_t iblock, | 724 | int ext4_get_block(struct inode *inode, sector_t iblock, |
725 | struct buffer_head *bh, int create) | 725 | struct buffer_head *bh, int create) |
726 | { | 726 | { |
727 | return _ext4_get_block(inode, iblock, bh, | 727 | return _ext4_get_block(inode, iblock, bh, |
728 | create ? EXT4_GET_BLOCKS_CREATE : 0); | 728 | create ? EXT4_GET_BLOCKS_CREATE : 0); |
729 | } | 729 | } |
730 | 730 | ||
731 | /* | 731 | /* |
732 | * `handle' can be NULL if create is zero | 732 | * `handle' can be NULL if create is zero |
733 | */ | 733 | */ |
734 | struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, | 734 | struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, |
735 | ext4_lblk_t block, int create) | 735 | ext4_lblk_t block, int create) |
736 | { | 736 | { |
737 | struct ext4_map_blocks map; | 737 | struct ext4_map_blocks map; |
738 | struct buffer_head *bh; | 738 | struct buffer_head *bh; |
739 | int err; | 739 | int err; |
740 | 740 | ||
741 | J_ASSERT(handle != NULL || create == 0); | 741 | J_ASSERT(handle != NULL || create == 0); |
742 | 742 | ||
743 | map.m_lblk = block; | 743 | map.m_lblk = block; |
744 | map.m_len = 1; | 744 | map.m_len = 1; |
745 | err = ext4_map_blocks(handle, inode, &map, | 745 | err = ext4_map_blocks(handle, inode, &map, |
746 | create ? EXT4_GET_BLOCKS_CREATE : 0); | 746 | create ? EXT4_GET_BLOCKS_CREATE : 0); |
747 | 747 | ||
748 | if (err == 0) | 748 | if (err == 0) |
749 | return create ? ERR_PTR(-ENOSPC) : NULL; | 749 | return create ? ERR_PTR(-ENOSPC) : NULL; |
750 | if (err < 0) | 750 | if (err < 0) |
751 | return ERR_PTR(err); | 751 | return ERR_PTR(err); |
752 | 752 | ||
753 | bh = sb_getblk(inode->i_sb, map.m_pblk); | 753 | bh = sb_getblk(inode->i_sb, map.m_pblk); |
754 | if (unlikely(!bh)) | 754 | if (unlikely(!bh)) |
755 | return ERR_PTR(-ENOMEM); | 755 | return ERR_PTR(-ENOMEM); |
756 | if (map.m_flags & EXT4_MAP_NEW) { | 756 | if (map.m_flags & EXT4_MAP_NEW) { |
757 | J_ASSERT(create != 0); | 757 | J_ASSERT(create != 0); |
758 | J_ASSERT(handle != NULL); | 758 | J_ASSERT(handle != NULL); |
759 | 759 | ||
760 | /* | 760 | /* |
761 | * Now that we do not always journal data, we should | 761 | * Now that we do not always journal data, we should |
762 | * keep in mind whether this should always journal the | 762 | * keep in mind whether this should always journal the |
763 | * new buffer as metadata. For now, regular file | 763 | * new buffer as metadata. For now, regular file |
764 | * writes use ext4_get_block instead, so it's not a | 764 | * writes use ext4_get_block instead, so it's not a |
765 | * problem. | 765 | * problem. |
766 | */ | 766 | */ |
767 | lock_buffer(bh); | 767 | lock_buffer(bh); |
768 | BUFFER_TRACE(bh, "call get_create_access"); | 768 | BUFFER_TRACE(bh, "call get_create_access"); |
769 | err = ext4_journal_get_create_access(handle, bh); | 769 | err = ext4_journal_get_create_access(handle, bh); |
770 | if (unlikely(err)) { | 770 | if (unlikely(err)) { |
771 | unlock_buffer(bh); | 771 | unlock_buffer(bh); |
772 | goto errout; | 772 | goto errout; |
773 | } | 773 | } |
774 | if (!buffer_uptodate(bh)) { | 774 | if (!buffer_uptodate(bh)) { |
775 | memset(bh->b_data, 0, inode->i_sb->s_blocksize); | 775 | memset(bh->b_data, 0, inode->i_sb->s_blocksize); |
776 | set_buffer_uptodate(bh); | 776 | set_buffer_uptodate(bh); |
777 | } | 777 | } |
778 | unlock_buffer(bh); | 778 | unlock_buffer(bh); |
779 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); | 779 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); |
780 | err = ext4_handle_dirty_metadata(handle, inode, bh); | 780 | err = ext4_handle_dirty_metadata(handle, inode, bh); |
781 | if (unlikely(err)) | 781 | if (unlikely(err)) |
782 | goto errout; | 782 | goto errout; |
783 | } else | 783 | } else |
784 | BUFFER_TRACE(bh, "not a new buffer"); | 784 | BUFFER_TRACE(bh, "not a new buffer"); |
785 | return bh; | 785 | return bh; |
786 | errout: | 786 | errout: |
787 | brelse(bh); | 787 | brelse(bh); |
788 | return ERR_PTR(err); | 788 | return ERR_PTR(err); |
789 | } | 789 | } |
790 | 790 | ||
791 | struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, | 791 | struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, |
792 | ext4_lblk_t block, int create) | 792 | ext4_lblk_t block, int create) |
793 | { | 793 | { |
794 | struct buffer_head *bh; | 794 | struct buffer_head *bh; |
795 | 795 | ||
796 | bh = ext4_getblk(handle, inode, block, create); | 796 | bh = ext4_getblk(handle, inode, block, create); |
797 | if (IS_ERR(bh)) | 797 | if (IS_ERR(bh)) |
798 | return bh; | 798 | return bh; |
799 | if (!bh || buffer_uptodate(bh)) | 799 | if (!bh || buffer_uptodate(bh)) |
800 | return bh; | 800 | return bh; |
801 | ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh); | 801 | ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh); |
802 | wait_on_buffer(bh); | 802 | wait_on_buffer(bh); |
803 | if (buffer_uptodate(bh)) | 803 | if (buffer_uptodate(bh)) |
804 | return bh; | 804 | return bh; |
805 | put_bh(bh); | 805 | put_bh(bh); |
806 | return ERR_PTR(-EIO); | 806 | return ERR_PTR(-EIO); |
807 | } | 807 | } |
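/*
 * Callers must distinguish three outcomes from ext4_getblk() and
 * ext4_bread(): ERR_PTR() on error, NULL when the block is unmapped
 * and create == 0, and a valid buffer_head otherwise. A minimal
 * (hypothetical) caller:
 *
 *	bh = ext4_bread(handle, inode, block, 0);
 *	if (IS_ERR(bh))
 *		return PTR_ERR(bh);
 *	if (!bh)
 *		... treat the block as a hole ...
 */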
808 | 808 | ||
809 | int ext4_walk_page_buffers(handle_t *handle, | 809 | int ext4_walk_page_buffers(handle_t *handle, |
810 | struct buffer_head *head, | 810 | struct buffer_head *head, |
811 | unsigned from, | 811 | unsigned from, |
812 | unsigned to, | 812 | unsigned to, |
813 | int *partial, | 813 | int *partial, |
814 | int (*fn)(handle_t *handle, | 814 | int (*fn)(handle_t *handle, |
815 | struct buffer_head *bh)) | 815 | struct buffer_head *bh)) |
816 | { | 816 | { |
817 | struct buffer_head *bh; | 817 | struct buffer_head *bh; |
818 | unsigned block_start, block_end; | 818 | unsigned block_start, block_end; |
819 | unsigned blocksize = head->b_size; | 819 | unsigned blocksize = head->b_size; |
820 | int err, ret = 0; | 820 | int err, ret = 0; |
821 | struct buffer_head *next; | 821 | struct buffer_head *next; |
822 | 822 | ||
823 | for (bh = head, block_start = 0; | 823 | for (bh = head, block_start = 0; |
824 | ret == 0 && (bh != head || !block_start); | 824 | ret == 0 && (bh != head || !block_start); |
825 | block_start = block_end, bh = next) { | 825 | block_start = block_end, bh = next) { |
826 | next = bh->b_this_page; | 826 | next = bh->b_this_page; |
827 | block_end = block_start + blocksize; | 827 | block_end = block_start + blocksize; |
828 | if (block_end <= from || block_start >= to) { | 828 | if (block_end <= from || block_start >= to) { |
829 | if (partial && !buffer_uptodate(bh)) | 829 | if (partial && !buffer_uptodate(bh)) |
830 | *partial = 1; | 830 | *partial = 1; |
831 | continue; | 831 | continue; |
832 | } | 832 | } |
833 | err = (*fn)(handle, bh); | 833 | err = (*fn)(handle, bh); |
834 | if (!ret) | 834 | if (!ret) |
835 | ret = err; | 835 | ret = err; |
836 | } | 836 | } |
837 | return ret; | 837 | return ret; |
838 | } | 838 | } |
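/*
 * Only buffers overlapping [from, to) are passed to fn; for buffers
 * outside that range, *partial is set if one of them is not uptodate.
 * The journalled write path below is the typical caller, e.g.:
 *
 *	ret = ext4_walk_page_buffers(handle, page_buffers(page),
 *				     from, to, NULL,
 *				     do_journal_get_write_access);
 */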
839 | 839 | ||
840 | /* | 840 | /* |
841 | * To preserve ordering, it is essential that the hole instantiation and | 841 | * To preserve ordering, it is essential that the hole instantiation and |
842 | * the data write be encapsulated in a single transaction. We cannot | 842 | * the data write be encapsulated in a single transaction. We cannot |
843 | * close off a transaction and start a new one between the ext4_get_block() | 843 | * close off a transaction and start a new one between the ext4_get_block() |
844 | * and the commit_write(). So doing the jbd2_journal_start at the start of | 844 | * and the commit_write(). So doing the jbd2_journal_start at the start of |
845 | * prepare_write() is the right place. | 845 | * prepare_write() is the right place. |
846 | * | 846 | * |
847 | * Also, this function can nest inside ext4_writepage(). In that case, we | 847 | * Also, this function can nest inside ext4_writepage(). In that case, we |
848 | * *know* that ext4_writepage() has generated enough buffer credits to do the | 848 | * *know* that ext4_writepage() has generated enough buffer credits to do the |
849 | * whole page. So we won't block on the journal in that case, which is good, | 849 | * whole page. So we won't block on the journal in that case, which is good, |
850 | * because the caller may be PF_MEMALLOC. | 850 | * because the caller may be PF_MEMALLOC. |
851 | * | 851 | * |
852 | * By accident, ext4 can be reentered when a transaction is open via | 852 | * By accident, ext4 can be reentered when a transaction is open via |
853 | * quota file writes. If we were to commit the transaction while thus | 853 | * quota file writes. If we were to commit the transaction while thus |
854 | * reentered, there can be a deadlock - we would be holding a quota | 854 | * reentered, there can be a deadlock - we would be holding a quota |
855 | * lock, and the commit would never complete if another thread had a | 855 | * lock, and the commit would never complete if another thread had a |
856 | * transaction open and was blocking on the quota lock - a ranking | 856 | * transaction open and was blocking on the quota lock - a ranking |
857 | * violation. | 857 | * violation. |
858 | * | 858 | * |
859 | * So what we do is to rely on the fact that jbd2_journal_stop/journal_start | 859 | * So what we do is to rely on the fact that jbd2_journal_stop/journal_start |
860 | * will _not_ run commit under these circumstances because handle->h_ref | 860 | * will _not_ run commit under these circumstances because handle->h_ref |
861 | * is elevated. We'll still have enough credits for the tiny quotafile | 861 | * is elevated. We'll still have enough credits for the tiny quotafile |
862 | * write. | 862 | * write. |
863 | */ | 863 | */ |
864 | int do_journal_get_write_access(handle_t *handle, | 864 | int do_journal_get_write_access(handle_t *handle, |
865 | struct buffer_head *bh) | 865 | struct buffer_head *bh) |
866 | { | 866 | { |
867 | int dirty = buffer_dirty(bh); | 867 | int dirty = buffer_dirty(bh); |
868 | int ret; | 868 | int ret; |
869 | 869 | ||
870 | if (!buffer_mapped(bh) || buffer_freed(bh)) | 870 | if (!buffer_mapped(bh) || buffer_freed(bh)) |
871 | return 0; | 871 | return 0; |
872 | /* | 872 | /* |
873 | * __block_write_begin() could have dirtied some buffers. Clean | 873 | * __block_write_begin() could have dirtied some buffers. Clean |
874 | * the dirty bit as jbd2_journal_get_write_access() could complain | 874 | * the dirty bit as jbd2_journal_get_write_access() could complain |
875 | * otherwise about fs integrity issues. Setting of the dirty bit | 875 | * otherwise about fs integrity issues. Setting of the dirty bit |
876 | * by __block_write_begin() isn't a real problem here as we clear | 876 | * by __block_write_begin() isn't a real problem here as we clear |
877 | * the bit before releasing a page lock and thus writeback cannot | 877 | * the bit before releasing a page lock and thus writeback cannot |
878 | * ever write the buffer. | 878 | * ever write the buffer. |
879 | */ | 879 | */ |
880 | if (dirty) | 880 | if (dirty) |
881 | clear_buffer_dirty(bh); | 881 | clear_buffer_dirty(bh); |
882 | BUFFER_TRACE(bh, "get write access"); | 882 | BUFFER_TRACE(bh, "get write access"); |
883 | ret = ext4_journal_get_write_access(handle, bh); | 883 | ret = ext4_journal_get_write_access(handle, bh); |
884 | if (!ret && dirty) | 884 | if (!ret && dirty) |
885 | ret = ext4_handle_dirty_metadata(handle, NULL, bh); | 885 | ret = ext4_handle_dirty_metadata(handle, NULL, bh); |
886 | return ret; | 886 | return ret; |
887 | } | 887 | } |
888 | 888 | ||
889 | static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, | 889 | static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, |
890 | struct buffer_head *bh_result, int create); | 890 | struct buffer_head *bh_result, int create); |
891 | static int ext4_write_begin(struct file *file, struct address_space *mapping, | 891 | static int ext4_write_begin(struct file *file, struct address_space *mapping, |
892 | loff_t pos, unsigned len, unsigned flags, | 892 | loff_t pos, unsigned len, unsigned flags, |
893 | struct page **pagep, void **fsdata) | 893 | struct page **pagep, void **fsdata) |
894 | { | 894 | { |
895 | struct inode *inode = mapping->host; | 895 | struct inode *inode = mapping->host; |
896 | int ret, needed_blocks; | 896 | int ret, needed_blocks; |
897 | handle_t *handle; | 897 | handle_t *handle; |
898 | int retries = 0; | 898 | int retries = 0; |
899 | struct page *page; | 899 | struct page *page; |
900 | pgoff_t index; | 900 | pgoff_t index; |
901 | unsigned from, to; | 901 | unsigned from, to; |
902 | 902 | ||
903 | trace_ext4_write_begin(inode, pos, len, flags); | 903 | trace_ext4_write_begin(inode, pos, len, flags); |
904 | /* | 904 | /* |
905 | * Reserve one more block for addition to the orphan list in case | 905 | * Reserve one more block for addition to the orphan list in case |
906 | * we allocate blocks but the write fails for some reason | 906 | * we allocate blocks but the write fails for some reason |
907 | */ | 907 | */ |
908 | needed_blocks = ext4_writepage_trans_blocks(inode) + 1; | 908 | needed_blocks = ext4_writepage_trans_blocks(inode) + 1; |
909 | index = pos >> PAGE_CACHE_SHIFT; | 909 | index = pos >> PAGE_CACHE_SHIFT; |
910 | from = pos & (PAGE_CACHE_SIZE - 1); | 910 | from = pos & (PAGE_CACHE_SIZE - 1); |
911 | to = from + len; | 911 | to = from + len; |
912 | 912 | ||
913 | if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { | 913 | if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { |
914 | ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, | 914 | ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, |
915 | flags, pagep); | 915 | flags, pagep); |
916 | if (ret < 0) | 916 | if (ret < 0) |
917 | return ret; | 917 | return ret; |
918 | if (ret == 1) | 918 | if (ret == 1) |
919 | return 0; | 919 | return 0; |
920 | } | 920 | } |
921 | 921 | ||
922 | /* | 922 | /* |
923 | * grab_cache_page_write_begin() can take a long time if the | 923 | * grab_cache_page_write_begin() can take a long time if the |
924 | * system is thrashing due to memory pressure, or if the page | 924 | * system is thrashing due to memory pressure, or if the page |
925 | * is being written back. So grab it first before we start | 925 | * is being written back. So grab it first before we start |
926 | * the transaction handle. This also allows us to allocate | 926 | * the transaction handle. This also allows us to allocate |
927 | * the page (if needed) without using GFP_NOFS. | 927 | * the page (if needed) without using GFP_NOFS. |
928 | */ | 928 | */ |
929 | retry_grab: | 929 | retry_grab: |
930 | page = grab_cache_page_write_begin(mapping, index, flags); | 930 | page = grab_cache_page_write_begin(mapping, index, flags); |
931 | if (!page) | 931 | if (!page) |
932 | return -ENOMEM; | 932 | return -ENOMEM; |
933 | unlock_page(page); | 933 | unlock_page(page); |
934 | 934 | ||
935 | retry_journal: | 935 | retry_journal: |
936 | handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); | 936 | handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); |
937 | if (IS_ERR(handle)) { | 937 | if (IS_ERR(handle)) { |
938 | page_cache_release(page); | 938 | page_cache_release(page); |
939 | return PTR_ERR(handle); | 939 | return PTR_ERR(handle); |
940 | } | 940 | } |
941 | 941 | ||
942 | lock_page(page); | 942 | lock_page(page); |
943 | if (page->mapping != mapping) { | 943 | if (page->mapping != mapping) { |
944 | /* The page got truncated from under us */ | 944 | /* The page got truncated from under us */ |
945 | unlock_page(page); | 945 | unlock_page(page); |
946 | page_cache_release(page); | 946 | page_cache_release(page); |
947 | ext4_journal_stop(handle); | 947 | ext4_journal_stop(handle); |
948 | goto retry_grab; | 948 | goto retry_grab; |
949 | } | 949 | } |
950 | /* In case writeback began while the page was unlocked */ | 950 | /* In case writeback began while the page was unlocked */ |
951 | wait_for_stable_page(page); | 951 | wait_for_stable_page(page); |
952 | 952 | ||
953 | if (ext4_should_dioread_nolock(inode)) | 953 | if (ext4_should_dioread_nolock(inode)) |
954 | ret = __block_write_begin(page, pos, len, ext4_get_block_write); | 954 | ret = __block_write_begin(page, pos, len, ext4_get_block_write); |
955 | else | 955 | else |
956 | ret = __block_write_begin(page, pos, len, ext4_get_block); | 956 | ret = __block_write_begin(page, pos, len, ext4_get_block); |
957 | 957 | ||
958 | if (!ret && ext4_should_journal_data(inode)) { | 958 | if (!ret && ext4_should_journal_data(inode)) { |
959 | ret = ext4_walk_page_buffers(handle, page_buffers(page), | 959 | ret = ext4_walk_page_buffers(handle, page_buffers(page), |
960 | from, to, NULL, | 960 | from, to, NULL, |
961 | do_journal_get_write_access); | 961 | do_journal_get_write_access); |
962 | } | 962 | } |
963 | 963 | ||
964 | if (ret) { | 964 | if (ret) { |
965 | unlock_page(page); | 965 | unlock_page(page); |
966 | /* | 966 | /* |
967 | * __block_write_begin may have instantiated a few blocks | 967 | * __block_write_begin may have instantiated a few blocks |
968 | * outside i_size. Trim these off again. We don't need | 968 | * outside i_size. Trim these off again. We don't need |
969 | * i_size_read because we hold i_mutex. | 969 | * i_size_read because we hold i_mutex. |
970 | * | 970 | * |
971 | * Add inode to orphan list in case we crash before | 971 | * Add inode to orphan list in case we crash before |
972 | * truncate finishes | 972 | * truncate finishes |
973 | */ | 973 | */ |
974 | if (pos + len > inode->i_size && ext4_can_truncate(inode)) | 974 | if (pos + len > inode->i_size && ext4_can_truncate(inode)) |
975 | ext4_orphan_add(handle, inode); | 975 | ext4_orphan_add(handle, inode); |
976 | 976 | ||
977 | ext4_journal_stop(handle); | 977 | ext4_journal_stop(handle); |
978 | if (pos + len > inode->i_size) { | 978 | if (pos + len > inode->i_size) { |
979 | ext4_truncate_failed_write(inode); | 979 | ext4_truncate_failed_write(inode); |
980 | /* | 980 | /* |
981 | * If truncate failed early the inode might | 981 | * If truncate failed early the inode might |
982 | * still be on the orphan list; we need to | 982 | * still be on the orphan list; we need to |
983 | * make sure the inode is removed from the | 983 | * make sure the inode is removed from the |
984 | * orphan list in that case. | 984 | * orphan list in that case. |
985 | */ | 985 | */ |
986 | if (inode->i_nlink) | 986 | if (inode->i_nlink) |
987 | ext4_orphan_del(NULL, inode); | 987 | ext4_orphan_del(NULL, inode); |
988 | } | 988 | } |
989 | 989 | ||
990 | if (ret == -ENOSPC && | 990 | if (ret == -ENOSPC && |
991 | ext4_should_retry_alloc(inode->i_sb, &retries)) | 991 | ext4_should_retry_alloc(inode->i_sb, &retries)) |
992 | goto retry_journal; | 992 | goto retry_journal; |
993 | page_cache_release(page); | 993 | page_cache_release(page); |
994 | return ret; | 994 | return ret; |
995 | } | 995 | } |
996 | *pagep = page; | 996 | *pagep = page; |
997 | return ret; | 997 | return ret; |
998 | } | 998 | } |
999 | 999 | ||
1000 | /* For write_end() in data=journal mode */ | 1000 | /* For write_end() in data=journal mode */ |
1001 | static int write_end_fn(handle_t *handle, struct buffer_head *bh) | 1001 | static int write_end_fn(handle_t *handle, struct buffer_head *bh) |
1002 | { | 1002 | { |
1003 | int ret; | 1003 | int ret; |
1004 | if (!buffer_mapped(bh) || buffer_freed(bh)) | 1004 | if (!buffer_mapped(bh) || buffer_freed(bh)) |
1005 | return 0; | 1005 | return 0; |
1006 | set_buffer_uptodate(bh); | 1006 | set_buffer_uptodate(bh); |
1007 | ret = ext4_handle_dirty_metadata(handle, NULL, bh); | 1007 | ret = ext4_handle_dirty_metadata(handle, NULL, bh); |
1008 | clear_buffer_meta(bh); | 1008 | clear_buffer_meta(bh); |
1009 | clear_buffer_prio(bh); | 1009 | clear_buffer_prio(bh); |
1010 | return ret; | 1010 | return ret; |
1011 | } | 1011 | } |
1012 | 1012 | ||
1013 | /* | 1013 | /* |
1014 | * We need to pick up the new inode size which generic_commit_write gave us. | 1014 | * We need to pick up the new inode size which generic_commit_write gave us. |
1015 | * `file' can be NULL - eg, when called from page_symlink(). | 1015 | * `file' can be NULL - eg, when called from page_symlink(). |
1016 | * | 1016 | * |
1017 | * ext4 never places buffers on inode->i_mapping->private_list. Metadata | 1017 | * ext4 never places buffers on inode->i_mapping->private_list. Metadata |
1018 | * buffers are managed internally. | 1018 | * buffers are managed internally. |
1019 | */ | 1019 | */ |
1020 | static int ext4_write_end(struct file *file, | 1020 | static int ext4_write_end(struct file *file, |
1021 | struct address_space *mapping, | 1021 | struct address_space *mapping, |
1022 | loff_t pos, unsigned len, unsigned copied, | 1022 | loff_t pos, unsigned len, unsigned copied, |
1023 | struct page *page, void *fsdata) | 1023 | struct page *page, void *fsdata) |
1024 | { | 1024 | { |
1025 | handle_t *handle = ext4_journal_current_handle(); | 1025 | handle_t *handle = ext4_journal_current_handle(); |
1026 | struct inode *inode = mapping->host; | 1026 | struct inode *inode = mapping->host; |
 | | 1027 | loff_t old_size = inode->i_size; |
1027 | int ret = 0, ret2; | 1028 | int ret = 0, ret2; |
1028 | int i_size_changed = 0; | 1029 | int i_size_changed = 0; |
1029 | 1030 | ||
1030 | trace_ext4_write_end(inode, pos, len, copied); | 1031 | trace_ext4_write_end(inode, pos, len, copied); |
1031 | if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) { | 1032 | if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) { |
1032 | ret = ext4_jbd2_file_inode(handle, inode); | 1033 | ret = ext4_jbd2_file_inode(handle, inode); |
1033 | if (ret) { | 1034 | if (ret) { |
1034 | unlock_page(page); | 1035 | unlock_page(page); |
1035 | page_cache_release(page); | 1036 | page_cache_release(page); |
1036 | goto errout; | 1037 | goto errout; |
1037 | } | 1038 | } |
1038 | } | 1039 | } |
1039 | 1040 | ||
1040 | if (ext4_has_inline_data(inode)) { | 1041 | if (ext4_has_inline_data(inode)) { |
1041 | ret = ext4_write_inline_data_end(inode, pos, len, | 1042 | ret = ext4_write_inline_data_end(inode, pos, len, |
1042 | copied, page); | 1043 | copied, page); |
1043 | if (ret < 0) | 1044 | if (ret < 0) |
1044 | goto errout; | 1045 | goto errout; |
1045 | copied = ret; | 1046 | copied = ret; |
1046 | } else | 1047 | } else |
1047 | copied = block_write_end(file, mapping, pos, | 1048 | copied = block_write_end(file, mapping, pos, |
1048 | len, copied, page, fsdata); | 1049 | len, copied, page, fsdata); |
1049 | /* | 1050 | /* |
1050 | * it's important to update i_size while still holding the page lock: | 1051 | * it's important to update i_size while still holding the page lock: |
1051 | * page writeout could otherwise come in and zero beyond i_size. | 1052 | * page writeout could otherwise come in and zero beyond i_size. |
1052 | */ | 1053 | */ |
1053 | i_size_changed = ext4_update_inode_size(inode, pos + copied); | 1054 | i_size_changed = ext4_update_inode_size(inode, pos + copied); |
1054 | unlock_page(page); | 1055 | unlock_page(page); |
1055 | page_cache_release(page); | 1056 | page_cache_release(page); |
1056 | 1057 | ||
 | | 1058 | if (old_size < pos) |
 | | 1059 | pagecache_isize_extended(inode, old_size, pos); |
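/*
 * On the two lines added above: with blocksize < pagesize, the page
 * straddling the old i_size may already be mapped writable, so a
 * later mmap store into the newly exposed tail would not fault and
 * would bypass ->page_mkwrite. pagecache_isize_extended() cleans that
 * page's mappings so the next write faults properly; this is the
 * nodelalloc mmap data corruption fix this commit is about.
 */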
1057 | /* | 1060 | /* |
1058 | * Don't mark the inode dirty under page lock. First, it unnecessarily | 1061 | * Don't mark the inode dirty under page lock. First, it unnecessarily |
1059 | * makes the holding time of page lock longer. Second, it forces lock | 1062 | * makes the holding time of page lock longer. Second, it forces lock |
1060 | * ordering of page lock and transaction start for journaling | 1063 | * ordering of page lock and transaction start for journaling |
1061 | * filesystems. | 1064 | * filesystems. |
1062 | */ | 1065 | */ |
1063 | if (i_size_changed) | 1066 | if (i_size_changed) |
1064 | ext4_mark_inode_dirty(handle, inode); | 1067 | ext4_mark_inode_dirty(handle, inode); |
1065 | 1068 | ||
1066 | if (pos + len > inode->i_size && ext4_can_truncate(inode)) | 1069 | if (pos + len > inode->i_size && ext4_can_truncate(inode)) |
1067 | /* If we have allocated more blocks than we have | 1070 | /* If we have allocated more blocks than we have |
1068 | * copied in, we will have blocks allocated outside | 1071 | * copied in, we will have blocks allocated outside |
1069 | * inode->i_size, so truncate them. | 1072 | * inode->i_size, so truncate them. |
1070 | */ | 1073 | */ |
1071 | ext4_orphan_add(handle, inode); | 1074 | ext4_orphan_add(handle, inode); |
1072 | errout: | 1075 | errout: |
1073 | ret2 = ext4_journal_stop(handle); | 1076 | ret2 = ext4_journal_stop(handle); |
1074 | if (!ret) | 1077 | if (!ret) |
1075 | ret = ret2; | 1078 | ret = ret2; |
1076 | 1079 | ||
1077 | if (pos + len > inode->i_size) { | 1080 | if (pos + len > inode->i_size) { |
1078 | ext4_truncate_failed_write(inode); | 1081 | ext4_truncate_failed_write(inode); |
1079 | /* | 1082 | /* |
1080 | * If truncate failed early the inode might still be | 1083 | * If truncate failed early the inode might still be |
1081 | * on the orphan list; we need to make sure the inode | 1084 | * on the orphan list; we need to make sure the inode |
1082 | * is removed from the orphan list in that case. | 1085 | * is removed from the orphan list in that case. |
1083 | */ | 1086 | */ |
1084 | if (inode->i_nlink) | 1087 | if (inode->i_nlink) |
1085 | ext4_orphan_del(NULL, inode); | 1088 | ext4_orphan_del(NULL, inode); |
1086 | } | 1089 | } |
1087 | 1090 | ||
1088 | return ret ? ret : copied; | 1091 | return ret ? ret : copied; |
1089 | } | 1092 | } |
1090 | 1093 | ||
1091 | static int ext4_journalled_write_end(struct file *file, | 1094 | static int ext4_journalled_write_end(struct file *file, |
1092 | struct address_space *mapping, | 1095 | struct address_space *mapping, |
1093 | loff_t pos, unsigned len, unsigned copied, | 1096 | loff_t pos, unsigned len, unsigned copied, |
1094 | struct page *page, void *fsdata) | 1097 | struct page *page, void *fsdata) |
1095 | { | 1098 | { |
1096 | handle_t *handle = ext4_journal_current_handle(); | 1099 | handle_t *handle = ext4_journal_current_handle(); |
1097 | struct inode *inode = mapping->host; | 1100 | struct inode *inode = mapping->host; |
 | | 1101 | loff_t old_size = inode->i_size; |
1098 | int ret = 0, ret2; | 1102 | int ret = 0, ret2; |
1099 | int partial = 0; | 1103 | int partial = 0; |
1100 | unsigned from, to; | 1104 | unsigned from, to; |
1101 | int size_changed = 0; | 1105 | int size_changed = 0; |
1102 | 1106 | ||
1103 | trace_ext4_journalled_write_end(inode, pos, len, copied); | 1107 | trace_ext4_journalled_write_end(inode, pos, len, copied); |
1104 | from = pos & (PAGE_CACHE_SIZE - 1); | 1108 | from = pos & (PAGE_CACHE_SIZE - 1); |
1105 | to = from + len; | 1109 | to = from + len; |
1106 | 1110 | ||
1107 | BUG_ON(!ext4_handle_valid(handle)); | 1111 | BUG_ON(!ext4_handle_valid(handle)); |
1108 | 1112 | ||
1109 | if (ext4_has_inline_data(inode)) | 1113 | if (ext4_has_inline_data(inode)) |
1110 | copied = ext4_write_inline_data_end(inode, pos, len, | 1114 | copied = ext4_write_inline_data_end(inode, pos, len, |
1111 | copied, page); | 1115 | copied, page); |
1112 | else { | 1116 | else { |
1113 | if (copied < len) { | 1117 | if (copied < len) { |
1114 | if (!PageUptodate(page)) | 1118 | if (!PageUptodate(page)) |
1115 | copied = 0; | 1119 | copied = 0; |
1116 | page_zero_new_buffers(page, from+copied, to); | 1120 | page_zero_new_buffers(page, from+copied, to); |
1117 | } | 1121 | } |
1118 | 1122 | ||
1119 | ret = ext4_walk_page_buffers(handle, page_buffers(page), from, | 1123 | ret = ext4_walk_page_buffers(handle, page_buffers(page), from, |
1120 | to, &partial, write_end_fn); | 1124 | to, &partial, write_end_fn); |
1121 | if (!partial) | 1125 | if (!partial) |
1122 | SetPageUptodate(page); | 1126 | SetPageUptodate(page); |
1123 | } | 1127 | } |
1124 | size_changed = ext4_update_inode_size(inode, pos + copied); | 1128 | size_changed = ext4_update_inode_size(inode, pos + copied); |
1125 | ext4_set_inode_state(inode, EXT4_STATE_JDATA); | 1129 | ext4_set_inode_state(inode, EXT4_STATE_JDATA); |
1126 | EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; | 1130 | EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; |
1127 | unlock_page(page); | 1131 | unlock_page(page); |
1128 | page_cache_release(page); | 1132 | page_cache_release(page); |
 | | 1133 | |
 | | 1134 | if (old_size < pos) |
 | | 1135 | pagecache_isize_extended(inode, old_size, pos); |
1129 | 1136 | ||
1130 | if (size_changed) { | 1137 | if (size_changed) { |
1131 | ret2 = ext4_mark_inode_dirty(handle, inode); | 1138 | ret2 = ext4_mark_inode_dirty(handle, inode); |
1132 | if (!ret) | 1139 | if (!ret) |
1133 | ret = ret2; | 1140 | ret = ret2; |
1134 | } | 1141 | } |
1135 | 1142 | ||
1136 | if (pos + len > inode->i_size && ext4_can_truncate(inode)) | 1143 | if (pos + len > inode->i_size && ext4_can_truncate(inode)) |
1137 | /* If we have allocated more blocks than we have | 1144 | /* If we have allocated more blocks than we have |
1138 | * copied in, we will have blocks allocated outside | 1145 | * copied in, we will have blocks allocated outside |
1139 | * inode->i_size, so truncate them. | 1146 | * inode->i_size, so truncate them. |
1140 | */ | 1147 | */ |
1141 | ext4_orphan_add(handle, inode); | 1148 | ext4_orphan_add(handle, inode); |
1142 | 1149 | ||
1143 | ret2 = ext4_journal_stop(handle); | 1150 | ret2 = ext4_journal_stop(handle); |
1144 | if (!ret) | 1151 | if (!ret) |
1145 | ret = ret2; | 1152 | ret = ret2; |
1146 | if (pos + len > inode->i_size) { | 1153 | if (pos + len > inode->i_size) { |
1147 | ext4_truncate_failed_write(inode); | 1154 | ext4_truncate_failed_write(inode); |
1148 | /* | 1155 | /* |
1149 | * If truncate failed early the inode might still be | 1156 | * If truncate failed early the inode might still be |
1150 | * on the orphan list; we need to make sure the inode | 1157 | * on the orphan list; we need to make sure the inode |
1151 | * is removed from the orphan list in that case. | 1158 | * is removed from the orphan list in that case. |
1152 | */ | 1159 | */ |
1153 | if (inode->i_nlink) | 1160 | if (inode->i_nlink) |
1154 | ext4_orphan_del(NULL, inode); | 1161 | ext4_orphan_del(NULL, inode); |
1155 | } | 1162 | } |
1156 | 1163 | ||
1157 | return ret ? ret : copied; | 1164 | return ret ? ret : copied; |
1158 | } | 1165 | } |
1159 | 1166 | ||
1160 | /* | 1167 | /* |
1161 | * Reserve a single cluster located at lblock | 1168 | * Reserve a single cluster located at lblock |
1162 | */ | 1169 | */ |
1163 | static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) | 1170 | static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) |
1164 | { | 1171 | { |
1165 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 1172 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
1166 | struct ext4_inode_info *ei = EXT4_I(inode); | 1173 | struct ext4_inode_info *ei = EXT4_I(inode); |
1167 | unsigned int md_needed; | 1174 | unsigned int md_needed; |
1168 | int ret; | 1175 | int ret; |
1169 | 1176 | ||
1170 | /* | 1177 | /* |
1171 | * We will charge metadata quota at writeout time; this saves | 1178 | * We will charge metadata quota at writeout time; this saves |
1172 | * us from metadata over-estimation, though we may go over by | 1179 | * us from metadata over-estimation, though we may go over by |
1173 | * a small amount in the end. Here we just reserve for data. | 1180 | * a small amount in the end. Here we just reserve for data. |
1174 | */ | 1181 | */ |
1175 | ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1)); | 1182 | ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1)); |
1176 | if (ret) | 1183 | if (ret) |
1177 | return ret; | 1184 | return ret; |
1178 | 1185 | ||
1179 | /* | 1186 | /* |
1180 | * Recalculate the number of metadata blocks to reserve | 1187 | * Recalculate the number of metadata blocks to reserve |
1181 | * in order to allocate nrblocks; the | 1188 | * in order to allocate nrblocks; the |
1182 | * worst case is one extent per block. | 1189 | * worst case is one extent per block. |
1183 | */ | 1190 | */ |
1184 | spin_lock(&ei->i_block_reservation_lock); | 1191 | spin_lock(&ei->i_block_reservation_lock); |
1185 | /* | 1192 | /* |
1186 | * ext4_calc_metadata_amount() has side effects, which we have | 1193 | * ext4_calc_metadata_amount() has side effects, which we have |
1188 | * to be prepared to undo if we fail to claim space. | 1195 | * to be prepared to undo if we fail to claim space. |
1188 | */ | 1195 | */ |
1189 | md_needed = 0; | 1196 | md_needed = 0; |
1190 | trace_ext4_da_reserve_space(inode, 0); | 1197 | trace_ext4_da_reserve_space(inode, 0); |
1191 | 1198 | ||
1192 | if (ext4_claim_free_clusters(sbi, 1, 0)) { | 1199 | if (ext4_claim_free_clusters(sbi, 1, 0)) { |
1193 | spin_unlock(&ei->i_block_reservation_lock); | 1200 | spin_unlock(&ei->i_block_reservation_lock); |
1194 | dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); | 1201 | dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); |
1195 | return -ENOSPC; | 1202 | return -ENOSPC; |
1196 | } | 1203 | } |
1197 | ei->i_reserved_data_blocks++; | 1204 | ei->i_reserved_data_blocks++; |
1198 | spin_unlock(&ei->i_block_reservation_lock); | 1205 | spin_unlock(&ei->i_block_reservation_lock); |
1199 | 1206 | ||
1200 | return 0; /* success */ | 1207 | return 0; /* success */ |
1201 | } | 1208 | } |
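/*
 * Note the units above: the reservation is taken in clusters (a
 * single cluster here), while quota is charged in blocks, which is
 * why dquot_reserve_block() and the release path go through the
 * EXT4_C2B() conversion.
 */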
1202 | 1209 | ||
1203 | static void ext4_da_release_space(struct inode *inode, int to_free) | 1210 | static void ext4_da_release_space(struct inode *inode, int to_free) |
1204 | { | 1211 | { |
1205 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 1212 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
1206 | struct ext4_inode_info *ei = EXT4_I(inode); | 1213 | struct ext4_inode_info *ei = EXT4_I(inode); |
1207 | 1214 | ||
1208 | if (!to_free) | 1215 | if (!to_free) |
1209 | return; /* Nothing to release, exit */ | 1216 | return; /* Nothing to release, exit */ |
1210 | 1217 | ||
1211 | spin_lock(&EXT4_I(inode)->i_block_reservation_lock); | 1218 | spin_lock(&EXT4_I(inode)->i_block_reservation_lock); |
1212 | 1219 | ||
1213 | trace_ext4_da_release_space(inode, to_free); | 1220 | trace_ext4_da_release_space(inode, to_free); |
1214 | if (unlikely(to_free > ei->i_reserved_data_blocks)) { | 1221 | if (unlikely(to_free > ei->i_reserved_data_blocks)) { |
1215 | /* | 1222 | /* |
1216 | * if there aren't enough reserved blocks, then the | 1223 | * if there aren't enough reserved blocks, then the |
1217 | * counter is messed up somewhere. Since this | 1224 | * counter is messed up somewhere. Since this |
1218 | * function is called from invalidatepage, it's | 1225 | * function is called from invalidatepage, it's |
1219 | * harmless to return without any action. | 1226 | * harmless to return without any action. |
1220 | */ | 1227 | */ |
1221 | ext4_warning(inode->i_sb, "ext4_da_release_space: " | 1228 | ext4_warning(inode->i_sb, "ext4_da_release_space: " |
1222 | "ino %lu, to_free %d with only %d reserved " | 1229 | "ino %lu, to_free %d with only %d reserved " |
1223 | "data blocks", inode->i_ino, to_free, | 1230 | "data blocks", inode->i_ino, to_free, |
1224 | ei->i_reserved_data_blocks); | 1231 | ei->i_reserved_data_blocks); |
1225 | WARN_ON(1); | 1232 | WARN_ON(1); |
1226 | to_free = ei->i_reserved_data_blocks; | 1233 | to_free = ei->i_reserved_data_blocks; |
1227 | } | 1234 | } |
1228 | ei->i_reserved_data_blocks -= to_free; | 1235 | ei->i_reserved_data_blocks -= to_free; |
1229 | 1236 | ||
1230 | /* update fs dirty data blocks counter */ | 1237 | /* update fs dirty data blocks counter */ |
1231 | percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free); | 1238 | percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free); |
1232 | 1239 | ||
1233 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | 1240 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); |
1234 | 1241 | ||
1235 | dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free)); | 1242 | dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free)); |
1236 | } | 1243 | } |
1237 | 1244 | ||
1238 | static void ext4_da_page_release_reservation(struct page *page, | 1245 | static void ext4_da_page_release_reservation(struct page *page, |
1239 | unsigned int offset, | 1246 | unsigned int offset, |
1240 | unsigned int length) | 1247 | unsigned int length) |
1241 | { | 1248 | { |
1242 | int to_release = 0; | 1249 | int to_release = 0; |
1243 | struct buffer_head *head, *bh; | 1250 | struct buffer_head *head, *bh; |
1244 | unsigned int curr_off = 0; | 1251 | unsigned int curr_off = 0; |
1245 | struct inode *inode = page->mapping->host; | 1252 | struct inode *inode = page->mapping->host; |
1246 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 1253 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
1247 | unsigned int stop = offset + length; | 1254 | unsigned int stop = offset + length; |
1248 | int num_clusters; | 1255 | int num_clusters; |
1249 | ext4_fsblk_t lblk; | 1256 | ext4_fsblk_t lblk; |
1250 | 1257 | ||
1251 | BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); | 1258 | BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); |
1252 | 1259 | ||
1253 | head = page_buffers(page); | 1260 | head = page_buffers(page); |
1254 | bh = head; | 1261 | bh = head; |
1255 | do { | 1262 | do { |
1256 | unsigned int next_off = curr_off + bh->b_size; | 1263 | unsigned int next_off = curr_off + bh->b_size; |
1257 | 1264 | ||
1258 | if (next_off > stop) | 1265 | if (next_off > stop) |
1259 | break; | 1266 | break; |
1260 | 1267 | ||
1261 | if ((offset <= curr_off) && (buffer_delay(bh))) { | 1268 | if ((offset <= curr_off) && (buffer_delay(bh))) { |
1262 | to_release++; | 1269 | to_release++; |
1263 | clear_buffer_delay(bh); | 1270 | clear_buffer_delay(bh); |
1264 | } | 1271 | } |
1265 | curr_off = next_off; | 1272 | curr_off = next_off; |
1266 | } while ((bh = bh->b_this_page) != head); | 1273 | } while ((bh = bh->b_this_page) != head); |
1267 | 1274 | ||
1268 | if (to_release) { | 1275 | if (to_release) { |
1269 | lblk = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); | 1276 | lblk = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); |
1270 | ext4_es_remove_extent(inode, lblk, to_release); | 1277 | ext4_es_remove_extent(inode, lblk, to_release); |
1271 | } | 1278 | } |
1272 | 1279 | ||
1273 | /* If we have released all the blocks belonging to a cluster, then we | 1280 | /* If we have released all the blocks belonging to a cluster, then we |
1274 | * need to release the reserved space for that cluster. */ | 1281 | * need to release the reserved space for that cluster. */ |
1275 | num_clusters = EXT4_NUM_B2C(sbi, to_release); | 1282 | num_clusters = EXT4_NUM_B2C(sbi, to_release); |
1276 | while (num_clusters > 0) { | 1283 | while (num_clusters > 0) { |
1277 | lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) + | 1284 | lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) + |
1278 | ((num_clusters - 1) << sbi->s_cluster_bits); | 1285 | ((num_clusters - 1) << sbi->s_cluster_bits); |
1279 | if (sbi->s_cluster_ratio == 1 || | 1286 | if (sbi->s_cluster_ratio == 1 || |
1280 | !ext4_find_delalloc_cluster(inode, lblk)) | 1287 | !ext4_find_delalloc_cluster(inode, lblk)) |
1281 | ext4_da_release_space(inode, 1); | 1288 | ext4_da_release_space(inode, 1); |
1282 | 1289 | ||
1283 | num_clusters--; | 1290 | num_clusters--; |
1284 | } | 1291 | } |
1285 | } | 1292 | } |
1286 | 1293 | ||
1287 | /* | 1294 | /* |
1288 | * Delayed allocation stuff | 1295 | * Delayed allocation stuff |
1289 | */ | 1296 | */ |
1290 | 1297 | ||
1291 | struct mpage_da_data { | 1298 | struct mpage_da_data { |
1292 | struct inode *inode; | 1299 | struct inode *inode; |
1293 | struct writeback_control *wbc; | 1300 | struct writeback_control *wbc; |
1294 | 1301 | ||
1295 | pgoff_t first_page; /* The first page to write */ | 1302 | pgoff_t first_page; /* The first page to write */ |
1296 | pgoff_t next_page; /* Current page to examine */ | 1303 | pgoff_t next_page; /* Current page to examine */ |
1297 | pgoff_t last_page; /* Last page to examine */ | 1304 | pgoff_t last_page; /* Last page to examine */ |
1298 | /* | 1305 | /* |
1299 | * Extent to map - this can be after first_page because that can be | 1306 | * Extent to map - this can be after first_page because that can be |
1300 | * fully mapped. We somewhat abuse m_flags to store whether the extent | 1307 | * fully mapped. We somewhat abuse m_flags to store whether the extent |
1301 | * is delalloc or unwritten. | 1308 | * is delalloc or unwritten. |
1302 | */ | 1309 | */ |
1303 | struct ext4_map_blocks map; | 1310 | struct ext4_map_blocks map; |
1304 | struct ext4_io_submit io_submit; /* IO submission data */ | 1311 | struct ext4_io_submit io_submit; /* IO submission data */ |
1305 | }; | 1312 | }; |
1306 | 1313 | ||
1307 | static void mpage_release_unused_pages(struct mpage_da_data *mpd, | 1314 | static void mpage_release_unused_pages(struct mpage_da_data *mpd, |
1308 | bool invalidate) | 1315 | bool invalidate) |
1309 | { | 1316 | { |
1310 | int nr_pages, i; | 1317 | int nr_pages, i; |
1311 | pgoff_t index, end; | 1318 | pgoff_t index, end; |
1312 | struct pagevec pvec; | 1319 | struct pagevec pvec; |
1313 | struct inode *inode = mpd->inode; | 1320 | struct inode *inode = mpd->inode; |
1314 | struct address_space *mapping = inode->i_mapping; | 1321 | struct address_space *mapping = inode->i_mapping; |
1315 | 1322 | ||
1316 | /* This is necessary when next_page == 0. */ | 1323 | /* This is necessary when next_page == 0. */ |
1317 | if (mpd->first_page >= mpd->next_page) | 1324 | if (mpd->first_page >= mpd->next_page) |
1318 | return; | 1325 | return; |
1319 | 1326 | ||
1320 | index = mpd->first_page; | 1327 | index = mpd->first_page; |
1321 | end = mpd->next_page - 1; | 1328 | end = mpd->next_page - 1; |
1322 | if (invalidate) { | 1329 | if (invalidate) { |
1323 | ext4_lblk_t start, last; | 1330 | ext4_lblk_t start, last; |
1324 | start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); | 1331 | start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); |
1325 | last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits); | 1332 | last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits); |
1326 | ext4_es_remove_extent(inode, start, last - start + 1); | 1333 | ext4_es_remove_extent(inode, start, last - start + 1); |
1327 | } | 1334 | } |
1328 | 1335 | ||
1329 | pagevec_init(&pvec, 0); | 1336 | pagevec_init(&pvec, 0); |
1330 | while (index <= end) { | 1337 | while (index <= end) { |
1331 | nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); | 1338 | nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); |
1332 | if (nr_pages == 0) | 1339 | if (nr_pages == 0) |
1333 | break; | 1340 | break; |
1334 | for (i = 0; i < nr_pages; i++) { | 1341 | for (i = 0; i < nr_pages; i++) { |
1335 | struct page *page = pvec.pages[i]; | 1342 | struct page *page = pvec.pages[i]; |
1336 | if (page->index > end) | 1343 | if (page->index > end) |
1337 | break; | 1344 | break; |
1338 | BUG_ON(!PageLocked(page)); | 1345 | BUG_ON(!PageLocked(page)); |
1339 | BUG_ON(PageWriteback(page)); | 1346 | BUG_ON(PageWriteback(page)); |
1340 | if (invalidate) { | 1347 | if (invalidate) { |
1341 | block_invalidatepage(page, 0, PAGE_CACHE_SIZE); | 1348 | block_invalidatepage(page, 0, PAGE_CACHE_SIZE); |
1342 | ClearPageUptodate(page); | 1349 | ClearPageUptodate(page); |
1343 | } | 1350 | } |
1344 | unlock_page(page); | 1351 | unlock_page(page); |
1345 | } | 1352 | } |
1346 | index = pvec.pages[nr_pages - 1]->index + 1; | 1353 | index = pvec.pages[nr_pages - 1]->index + 1; |
1347 | pagevec_release(&pvec); | 1354 | pagevec_release(&pvec); |
1348 | } | 1355 | } |
1349 | } | 1356 | } |
1350 | 1357 | ||
1351 | static void ext4_print_free_blocks(struct inode *inode) | 1358 | static void ext4_print_free_blocks(struct inode *inode) |
1352 | { | 1359 | { |
1353 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 1360 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
1354 | struct super_block *sb = inode->i_sb; | 1361 | struct super_block *sb = inode->i_sb; |
1355 | struct ext4_inode_info *ei = EXT4_I(inode); | 1362 | struct ext4_inode_info *ei = EXT4_I(inode); |
1356 | 1363 | ||
1357 | ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld", | 1364 | ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld", |
1358 | EXT4_C2B(EXT4_SB(inode->i_sb), | 1365 | EXT4_C2B(EXT4_SB(inode->i_sb), |
1359 | ext4_count_free_clusters(sb))); | 1366 | ext4_count_free_clusters(sb))); |
1360 | ext4_msg(sb, KERN_CRIT, "Free/Dirty block details"); | 1367 | ext4_msg(sb, KERN_CRIT, "Free/Dirty block details"); |
1361 | ext4_msg(sb, KERN_CRIT, "free_blocks=%lld", | 1368 | ext4_msg(sb, KERN_CRIT, "free_blocks=%lld", |
1362 | (long long) EXT4_C2B(EXT4_SB(sb), | 1369 | (long long) EXT4_C2B(EXT4_SB(sb), |
1363 | percpu_counter_sum(&sbi->s_freeclusters_counter))); | 1370 | percpu_counter_sum(&sbi->s_freeclusters_counter))); |
1364 | ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld", | 1371 | ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld", |
1365 | (long long) EXT4_C2B(EXT4_SB(sb), | 1372 | (long long) EXT4_C2B(EXT4_SB(sb), |
1366 | percpu_counter_sum(&sbi->s_dirtyclusters_counter))); | 1373 | percpu_counter_sum(&sbi->s_dirtyclusters_counter))); |
1367 | ext4_msg(sb, KERN_CRIT, "Block reservation details"); | 1374 | ext4_msg(sb, KERN_CRIT, "Block reservation details"); |
1368 | ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", | 1375 | ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", |
1369 | ei->i_reserved_data_blocks); | 1376 | ei->i_reserved_data_blocks); |
1370 | return; | 1377 | return; |
1371 | } | 1378 | } |
1372 | 1379 | ||
1373 | static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) | 1380 | static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) |
1374 | { | 1381 | { |
1375 | return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); | 1382 | return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); |
1376 | } | 1383 | } |
1377 | 1384 | ||
1378 | /* | 1385 | /* |
1379 | * This function grabs code from the very beginning of | 1386 | * This function grabs code from the very beginning of |
1380 | * ext4_map_blocks, but assumes that the caller is on the delayed write | 1387 | * ext4_map_blocks, but assumes that the caller is on the delayed write |
1381 | * path. It looks up the requested blocks and sets the | 1388 | * path. It looks up the requested blocks and sets the |
1382 | * buffer delay bit under the protection of i_data_sem. | 1389 | * buffer delay bit under the protection of i_data_sem. |
1383 | */ | 1390 | */ |
1384 | static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, | 1391 | static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, |
1385 | struct ext4_map_blocks *map, | 1392 | struct ext4_map_blocks *map, |
1386 | struct buffer_head *bh) | 1393 | struct buffer_head *bh) |
1387 | { | 1394 | { |
1388 | struct extent_status es; | 1395 | struct extent_status es; |
1389 | int retval; | 1396 | int retval; |
1390 | sector_t invalid_block = ~((sector_t) 0xffff); | 1397 | sector_t invalid_block = ~((sector_t) 0xffff); |
1391 | #ifdef ES_AGGRESSIVE_TEST | 1398 | #ifdef ES_AGGRESSIVE_TEST |
1392 | struct ext4_map_blocks orig_map; | 1399 | struct ext4_map_blocks orig_map; |
1393 | 1400 | ||
1394 | memcpy(&orig_map, map, sizeof(*map)); | 1401 | memcpy(&orig_map, map, sizeof(*map)); |
1395 | #endif | 1402 | #endif |
1396 | 1403 | ||
1397 | if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) | 1404 | if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) |
1398 | invalid_block = ~0; | 1405 | invalid_block = ~0; |
1399 | 1406 | ||
1400 | map->m_flags = 0; | 1407 | map->m_flags = 0; |
1401 | ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u," | 1408 | ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u," |
1402 | "logical block %lu\n", inode->i_ino, map->m_len, | 1409 | "logical block %lu\n", inode->i_ino, map->m_len, |
1403 | (unsigned long) map->m_lblk); | 1410 | (unsigned long) map->m_lblk); |
1404 | 1411 | ||
1405 | /* Lookup extent status tree firstly */ | 1412 | /* Lookup extent status tree firstly */ |
1406 | if (ext4_es_lookup_extent(inode, iblock, &es)) { | 1413 | if (ext4_es_lookup_extent(inode, iblock, &es)) { |
1407 | if (ext4_es_is_hole(&es)) { | 1414 | if (ext4_es_is_hole(&es)) { |
1408 | retval = 0; | 1415 | retval = 0; |
1409 | down_read(&EXT4_I(inode)->i_data_sem); | 1416 | down_read(&EXT4_I(inode)->i_data_sem); |
1410 | goto add_delayed; | 1417 | goto add_delayed; |
1411 | } | 1418 | } |
1412 | 1419 | ||
1413 | /* | 1420 | /* |
1414 | * Delayed extent could be allocated by fallocate. | 1421 | * Delayed extent could be allocated by fallocate. |
1415 | * So we need to check it. | 1422 | * So we need to check it. |
1416 | */ | 1423 | */ |
1417 | if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) { | 1424 | if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) { |
1418 | map_bh(bh, inode->i_sb, invalid_block); | 1425 | map_bh(bh, inode->i_sb, invalid_block); |
1419 | set_buffer_new(bh); | 1426 | set_buffer_new(bh); |
1420 | set_buffer_delay(bh); | 1427 | set_buffer_delay(bh); |
1421 | return 0; | 1428 | return 0; |
1422 | } | 1429 | } |
1423 | 1430 | ||
1424 | map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk; | 1431 | map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk; |
1425 | retval = es.es_len - (iblock - es.es_lblk); | 1432 | retval = es.es_len - (iblock - es.es_lblk); |
1426 | if (retval > map->m_len) | 1433 | if (retval > map->m_len) |
1427 | retval = map->m_len; | 1434 | retval = map->m_len; |
1428 | map->m_len = retval; | 1435 | map->m_len = retval; |
1429 | if (ext4_es_is_written(&es)) | 1436 | if (ext4_es_is_written(&es)) |
1430 | map->m_flags |= EXT4_MAP_MAPPED; | 1437 | map->m_flags |= EXT4_MAP_MAPPED; |
1431 | else if (ext4_es_is_unwritten(&es)) | 1438 | else if (ext4_es_is_unwritten(&es)) |
1432 | map->m_flags |= EXT4_MAP_UNWRITTEN; | 1439 | map->m_flags |= EXT4_MAP_UNWRITTEN; |
1433 | else | 1440 | else |
1434 | BUG_ON(1); | 1441 | BUG_ON(1); |
1435 | 1442 | ||
1436 | #ifdef ES_AGGRESSIVE_TEST | 1443 | #ifdef ES_AGGRESSIVE_TEST |
1437 | ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0); | 1444 | ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0); |
1438 | #endif | 1445 | #endif |
1439 | return retval; | 1446 | return retval; |
1440 | } | 1447 | } |
1441 | 1448 | ||
1442 | /* | 1449 | /* |
1443 | * Try to see if we can get the block without requesting a new | 1450 | * Try to see if we can get the block without requesting a new |
1444 | * file system block. | 1451 | * file system block. |
1445 | */ | 1452 | */ |
1446 | down_read(&EXT4_I(inode)->i_data_sem); | 1453 | down_read(&EXT4_I(inode)->i_data_sem); |
1447 | if (ext4_has_inline_data(inode)) | 1454 | if (ext4_has_inline_data(inode)) |
1448 | retval = 0; | 1455 | retval = 0; |
1449 | else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) | 1456 | else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) |
1450 | retval = ext4_ext_map_blocks(NULL, inode, map, 0); | 1457 | retval = ext4_ext_map_blocks(NULL, inode, map, 0); |
1451 | else | 1458 | else |
1452 | retval = ext4_ind_map_blocks(NULL, inode, map, 0); | 1459 | retval = ext4_ind_map_blocks(NULL, inode, map, 0); |
1453 | 1460 | ||
1454 | add_delayed: | 1461 | add_delayed: |
1455 | if (retval == 0) { | 1462 | if (retval == 0) { |
1456 | int ret; | 1463 | int ret; |
1457 | /* | 1464 | /* |
1458 | * XXX: __block_prepare_write() unmaps the passed block; | 1465 | * XXX: __block_prepare_write() unmaps the passed block; |
1459 | * is that OK? | 1466 | * is that OK? |
1460 | */ | 1467 | */ |
1461 | /* | 1468 | /* |
1462 | * If the block was allocated from a previously allocated cluster, | 1469 | * If the block was allocated from a previously allocated cluster, |
1463 | * then we don't need to reserve it again. However we still need | 1470 | * then we don't need to reserve it again. However we still need |
1464 | * to reserve metadata for every block we're going to write. | 1471 | * to reserve metadata for every block we're going to write. |
1465 | */ | 1472 | */ |
1466 | if (EXT4_SB(inode->i_sb)->s_cluster_ratio <= 1 || | 1473 | if (EXT4_SB(inode->i_sb)->s_cluster_ratio <= 1 || |
1467 | !ext4_find_delalloc_cluster(inode, map->m_lblk)) { | 1474 | !ext4_find_delalloc_cluster(inode, map->m_lblk)) { |
1468 | ret = ext4_da_reserve_space(inode, iblock); | 1475 | ret = ext4_da_reserve_space(inode, iblock); |
1469 | if (ret) { | 1476 | if (ret) { |
1470 | /* not enough space to reserve */ | 1477 | /* not enough space to reserve */ |
1471 | retval = ret; | 1478 | retval = ret; |
1472 | goto out_unlock; | 1479 | goto out_unlock; |
1473 | } | 1480 | } |
1474 | } | 1481 | } |
1475 | 1482 | ||
1476 | ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, | 1483 | ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, |
1477 | ~0, EXTENT_STATUS_DELAYED); | 1484 | ~0, EXTENT_STATUS_DELAYED); |
1478 | if (ret) { | 1485 | if (ret) { |
1479 | retval = ret; | 1486 | retval = ret; |
1480 | goto out_unlock; | 1487 | goto out_unlock; |
1481 | } | 1488 | } |
1482 | 1489 | ||
1483 | map_bh(bh, inode->i_sb, invalid_block); | 1490 | map_bh(bh, inode->i_sb, invalid_block); |
1484 | set_buffer_new(bh); | 1491 | set_buffer_new(bh); |
1485 | set_buffer_delay(bh); | 1492 | set_buffer_delay(bh); |
1486 | } else if (retval > 0) { | 1493 | } else if (retval > 0) { |
1487 | int ret; | 1494 | int ret; |
1488 | unsigned int status; | 1495 | unsigned int status; |
1489 | 1496 | ||
1490 | if (unlikely(retval != map->m_len)) { | 1497 | if (unlikely(retval != map->m_len)) { |
1491 | ext4_warning(inode->i_sb, | 1498 | ext4_warning(inode->i_sb, |
1492 | "ES len assertion failed for inode " | 1499 | "ES len assertion failed for inode " |
1493 | "%lu: retval %d != map->m_len %d", | 1500 | "%lu: retval %d != map->m_len %d", |
1494 | inode->i_ino, retval, map->m_len); | 1501 | inode->i_ino, retval, map->m_len); |
1495 | WARN_ON(1); | 1502 | WARN_ON(1); |
1496 | } | 1503 | } |
1497 | 1504 | ||
1498 | status = map->m_flags & EXT4_MAP_UNWRITTEN ? | 1505 | status = map->m_flags & EXT4_MAP_UNWRITTEN ? |
1499 | EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; | 1506 | EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; |
1500 | ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, | 1507 | ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, |
1501 | map->m_pblk, status); | 1508 | map->m_pblk, status); |
1502 | if (ret != 0) | 1509 | if (ret != 0) |
1503 | retval = ret; | 1510 | retval = ret; |
1504 | } | 1511 | } |
1505 | 1512 | ||
1506 | out_unlock: | 1513 | out_unlock: |
1507 | up_read((&EXT4_I(inode)->i_data_sem)); | 1514 | up_read((&EXT4_I(inode)->i_data_sem)); |
1508 | 1515 | ||
1509 | return retval; | 1516 | return retval; |
1510 | } | 1517 | } |
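A worked example, with illustrative numbers only, of the extent-status cache hit arithmetic near the top of this function:

	/*
	 * Suppose the cached extent is es_lblk = 100, es_len = 50,
	 * es_pblk = 5000 and the caller asks for iblock = 120 with
	 * map->m_len = 64. Then map->m_pblk = 5000 + (120 - 100) = 5020,
	 * the extent has 50 - 20 = 30 usable blocks left, and since
	 * 30 < 64 the function trims map->m_len and returns 30.
	 */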
1511 | 1518 | ||
1512 | /* | 1519 | /* |
1513 | * This is a special get_block_t callback which is used by | 1520 | * This is a special get_block_t callback which is used by |
1514 | * ext4_da_write_begin(). It will either return a mapped block or | 1521 | * ext4_da_write_begin(). It will either return a mapped block or |
1515 | * reserve space for a single block. | 1522 | * reserve space for a single block. |
1516 | * | 1523 | * |
1517 | * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set. | 1524 | * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set. |
1518 | * We also have b_blocknr = -1 and b_bdev initialized properly | 1525 | * We also have b_blocknr = -1 and b_bdev initialized properly |
1519 | * | 1526 | * |
1520 | * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set. | 1527 | * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set. |
1521 | * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev | 1528 | * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev |
1522 | * initialized properly. | 1529 | * initialized properly. |
1523 | */ | 1530 | */ |
1524 | int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, | 1531 | int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, |
1525 | struct buffer_head *bh, int create) | 1532 | struct buffer_head *bh, int create) |
1526 | { | 1533 | { |
1527 | struct ext4_map_blocks map; | 1534 | struct ext4_map_blocks map; |
1528 | int ret = 0; | 1535 | int ret = 0; |
1529 | 1536 | ||
1530 | BUG_ON(create == 0); | 1537 | BUG_ON(create == 0); |
1531 | BUG_ON(bh->b_size != inode->i_sb->s_blocksize); | 1538 | BUG_ON(bh->b_size != inode->i_sb->s_blocksize); |
1532 | 1539 | ||
1533 | map.m_lblk = iblock; | 1540 | map.m_lblk = iblock; |
1534 | map.m_len = 1; | 1541 | map.m_len = 1; |
1535 | 1542 | ||
1536 | /* | 1543 | /* |
1537 | * first, we need to know whether the block is already allocated; | 1544 | * first, we need to know whether the block is already allocated; |
1538 | * preallocated blocks are unmapped but should be treated | 1545 | * preallocated blocks are unmapped but should be treated |
1539 | * the same as allocated blocks. | 1546 | * the same as allocated blocks. |
1540 | */ | 1547 | */ |
1541 | ret = ext4_da_map_blocks(inode, iblock, &map, bh); | 1548 | ret = ext4_da_map_blocks(inode, iblock, &map, bh); |
1542 | if (ret <= 0) | 1549 | if (ret <= 0) |
1543 | return ret; | 1550 | return ret; |
1544 | 1551 | ||
1545 | map_bh(bh, inode->i_sb, map.m_pblk); | 1552 | map_bh(bh, inode->i_sb, map.m_pblk); |
1546 | bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; | 1553 | bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; |
1547 | 1554 | ||
1548 | if (buffer_unwritten(bh)) { | 1555 | if (buffer_unwritten(bh)) { |
1549 | /* A delayed write to unwritten bh should be marked | 1556 | /* A delayed write to unwritten bh should be marked |
1550 | * new and mapped. Mapped ensures that we don't do | 1557 | * new and mapped. Mapped ensures that we don't do |
1551 | * get_block multiple times when we write to the same | 1558 | * get_block multiple times when we write to the same |
1552 | * offset and new ensures that we do proper zero out | 1559 | * offset and new ensures that we do proper zero out |
1553 | * for partial write. | 1560 | * for partial write. |
1554 | */ | 1561 | */ |
1555 | set_buffer_new(bh); | 1562 | set_buffer_new(bh); |
1556 | set_buffer_mapped(bh); | 1563 | set_buffer_mapped(bh); |
1557 | } | 1564 | } |
1558 | return 0; | 1565 | return 0; |
1559 | } | 1566 | } |
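A condensed sketch of the intended caller; it is not part of the hunk shown here, but ext4_da_write_begin() elsewhere in this file hands the callback to __block_write_begin() roughly like this (page, pos, len and ret come from that function's scope):

	ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
	if (ret) {
		unlock_page(page);
		/* cleanup and ENOSPC retry logic condensed */
	}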
1560 | 1567 | ||
1561 | static int bget_one(handle_t *handle, struct buffer_head *bh) | 1568 | static int bget_one(handle_t *handle, struct buffer_head *bh) |
1562 | { | 1569 | { |
1563 | get_bh(bh); | 1570 | get_bh(bh); |
1564 | return 0; | 1571 | return 0; |
1565 | } | 1572 | } |
1566 | 1573 | ||
1567 | static int bput_one(handle_t *handle, struct buffer_head *bh) | 1574 | static int bput_one(handle_t *handle, struct buffer_head *bh) |
1568 | { | 1575 | { |
1569 | put_bh(bh); | 1576 | put_bh(bh); |
1570 | return 0; | 1577 | return 0; |
1571 | } | 1578 | } |
1572 | 1579 | ||
1573 | static int __ext4_journalled_writepage(struct page *page, | 1580 | static int __ext4_journalled_writepage(struct page *page, |
1574 | unsigned int len) | 1581 | unsigned int len) |
1575 | { | 1582 | { |
1576 | struct address_space *mapping = page->mapping; | 1583 | struct address_space *mapping = page->mapping; |
1577 | struct inode *inode = mapping->host; | 1584 | struct inode *inode = mapping->host; |
1578 | struct buffer_head *page_bufs = NULL; | 1585 | struct buffer_head *page_bufs = NULL; |
1579 | handle_t *handle = NULL; | 1586 | handle_t *handle = NULL; |
1580 | int ret = 0, err = 0; | 1587 | int ret = 0, err = 0; |
1581 | int inline_data = ext4_has_inline_data(inode); | 1588 | int inline_data = ext4_has_inline_data(inode); |
1582 | struct buffer_head *inode_bh = NULL; | 1589 | struct buffer_head *inode_bh = NULL; |
1583 | 1590 | ||
1584 | ClearPageChecked(page); | 1591 | ClearPageChecked(page); |
1585 | 1592 | ||
1586 | if (inline_data) { | 1593 | if (inline_data) { |
1587 | BUG_ON(page->index != 0); | 1594 | BUG_ON(page->index != 0); |
1588 | BUG_ON(len > ext4_get_max_inline_size(inode)); | 1595 | BUG_ON(len > ext4_get_max_inline_size(inode)); |
1589 | inode_bh = ext4_journalled_write_inline_data(inode, len, page); | 1596 | inode_bh = ext4_journalled_write_inline_data(inode, len, page); |
1590 | if (inode_bh == NULL) | 1597 | if (inode_bh == NULL) |
1591 | goto out; | 1598 | goto out; |
1592 | } else { | 1599 | } else { |
1593 | page_bufs = page_buffers(page); | 1600 | page_bufs = page_buffers(page); |
1594 | if (!page_bufs) { | 1601 | if (!page_bufs) { |
1595 | BUG(); | 1602 | BUG(); |
1596 | goto out; | 1603 | goto out; |
1597 | } | 1604 | } |
1598 | ext4_walk_page_buffers(handle, page_bufs, 0, len, | 1605 | ext4_walk_page_buffers(handle, page_bufs, 0, len, |
1599 | NULL, bget_one); | 1606 | NULL, bget_one); |
1600 | } | 1607 | } |
1601 | /* As soon as we unlock the page, it can go away, but we have | 1608 | /* As soon as we unlock the page, it can go away, but we have |
1602 | * references to buffers so we are safe */ | 1609 | * references to buffers so we are safe */ |
1603 | unlock_page(page); | 1610 | unlock_page(page); |
1604 | 1611 | ||
1605 | handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, | 1612 | handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, |
1606 | ext4_writepage_trans_blocks(inode)); | 1613 | ext4_writepage_trans_blocks(inode)); |
1607 | if (IS_ERR(handle)) { | 1614 | if (IS_ERR(handle)) { |
1608 | ret = PTR_ERR(handle); | 1615 | ret = PTR_ERR(handle); |
1609 | goto out; | 1616 | goto out; |
1610 | } | 1617 | } |
1611 | 1618 | ||
1612 | BUG_ON(!ext4_handle_valid(handle)); | 1619 | BUG_ON(!ext4_handle_valid(handle)); |
1613 | 1620 | ||
1614 | if (inline_data) { | 1621 | if (inline_data) { |
1615 | BUFFER_TRACE(inode_bh, "get write access"); | 1622 | BUFFER_TRACE(inode_bh, "get write access"); |
1616 | ret = ext4_journal_get_write_access(handle, inode_bh); | 1623 | ret = ext4_journal_get_write_access(handle, inode_bh); |
1617 | 1624 | ||
1618 | err = ext4_handle_dirty_metadata(handle, inode, inode_bh); | 1625 | err = ext4_handle_dirty_metadata(handle, inode, inode_bh); |
1619 | 1626 | ||
1620 | } else { | 1627 | } else { |
1621 | ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, | 1628 | ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, |
1622 | do_journal_get_write_access); | 1629 | do_journal_get_write_access); |
1623 | 1630 | ||
1624 | err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, | 1631 | err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, |
1625 | write_end_fn); | 1632 | write_end_fn); |
1626 | } | 1633 | } |
1627 | if (ret == 0) | 1634 | if (ret == 0) |
1628 | ret = err; | 1635 | ret = err; |
1629 | EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; | 1636 | EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; |
1630 | err = ext4_journal_stop(handle); | 1637 | err = ext4_journal_stop(handle); |
1631 | if (!ret) | 1638 | if (!ret) |
1632 | ret = err; | 1639 | ret = err; |
1633 | 1640 | ||
1634 | if (!ext4_has_inline_data(inode)) | 1641 | if (!ext4_has_inline_data(inode)) |
1635 | ext4_walk_page_buffers(NULL, page_bufs, 0, len, | 1642 | ext4_walk_page_buffers(NULL, page_bufs, 0, len, |
1636 | NULL, bput_one); | 1643 | NULL, bput_one); |
1637 | ext4_set_inode_state(inode, EXT4_STATE_JDATA); | 1644 | ext4_set_inode_state(inode, EXT4_STATE_JDATA); |
1638 | out: | 1645 | out: |
1639 | brelse(inode_bh); | 1646 | brelse(inode_bh); |
1640 | return ret; | 1647 | return ret; |
1641 | } | 1648 | } |
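A minimal sketch of the pin/unlock/journal pattern used above, assuming the non-inline-data path; without the bget_one() walk, the page's buffers could vanish as soon as the page lock is dropped:

	ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL, bget_one);
	unlock_page(page);	/* the page may now be reclaimed... */
	/* ...but the extra buffer references keep page_bufs valid here,
	 * so the handle can be started and the buffers journalled */
	ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL, bput_one);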
1642 | 1649 | ||
1643 | /* | 1650 | /* |
1644 | * Note that we don't need to start a transaction unless we're journaling data | 1651 | * Note that we don't need to start a transaction unless we're journaling data |
1645 | * because we should have holes filled from ext4_page_mkwrite(). We don't even | 1652 | * because we should have holes filled from ext4_page_mkwrite(). We don't even |
1646 | * need to add the inode to the transaction's list in ordered mode because if | 1653 | * need to add the inode to the transaction's list in ordered mode because if |
1647 | * we are writing back data added by write(), the inode is already there and if | 1654 | * we are writing back data added by write(), the inode is already there and if |
1648 | * we are writing back data modified via mmap(), no one guarantees in which | 1655 | * we are writing back data modified via mmap(), no one guarantees in which |
1649 | * transaction the data will hit the disk. In case we are journaling data, we | 1656 | * transaction the data will hit the disk. In case we are journaling data, we |
1650 | * cannot start a transaction directly because a transaction start ranks above | 1657 | * cannot start a transaction directly because a transaction start ranks above |
1651 | * the page lock, so we have to do some magic. | 1658 | * the page lock, so we have to do some magic. |
1652 | * | 1659 | * |
1653 | * This function can get called via... | 1660 | * This function can get called via... |
1654 | * - ext4_writepages after taking page lock (have journal handle) | 1661 | * - ext4_writepages after taking page lock (have journal handle) |
1655 | * - journal_submit_inode_data_buffers (no journal handle) | 1662 | * - journal_submit_inode_data_buffers (no journal handle) |
1656 | * - shrink_page_list via the kswapd/direct reclaim (no journal handle) | 1663 | * - shrink_page_list via the kswapd/direct reclaim (no journal handle) |
1657 | * - grab_page_cache when doing write_begin (have journal handle) | 1664 | * - grab_page_cache when doing write_begin (have journal handle) |
1658 | * | 1665 | * |
1659 | * We don't do any block allocation in this function. If we have a page with | 1666 | * We don't do any block allocation in this function. If we have a page with |
1660 | * multiple blocks we need to write those buffer_heads that are mapped. This | 1667 | * multiple blocks we need to write those buffer_heads that are mapped. This |
1661 | * is important for mmap-based writes. So if we do the following with blocksize 1K | 1668 | * is important for mmap-based writes. So if we do the following with blocksize 1K |
1662 | * truncate(f, 1024); | 1669 | * truncate(f, 1024); |
1663 | * a = mmap(f, 0, 4096); | 1670 | * a = mmap(f, 0, 4096); |
1664 | * a[0] = 'a'; | 1671 | * a[0] = 'a'; |
1665 | * truncate(f, 4096); | 1672 | * truncate(f, 4096); |
1666 | * we have the first buffer_head in the page mapped via the page_mkwrite callback | 1673 | * we have the first buffer_head in the page mapped via the page_mkwrite callback |
1667 | * but the other buffer_heads would be unmapped yet dirty (dirtied via | 1674 | * but the other buffer_heads would be unmapped yet dirty (dirtied via |
1668 | * do_wp_page). So writepage should write the first block. If we modify | 1675 | * do_wp_page). So writepage should write the first block. If we modify |
1669 | * the mmap area beyond 1024 we will again get a page_fault and the | 1676 | * the mmap area beyond 1024 we will again get a page_fault and the |
1670 | * page_mkwrite callback will do the block allocation and mark the | 1677 | * page_mkwrite callback will do the block allocation and mark the |
1671 | * buffer_heads mapped. | 1678 | * buffer_heads mapped. |
1672 | * | 1679 | * |
1673 | * We redirty the page if it has any buffer_heads that are either delayed | 1680 | * We redirty the page if it has any buffer_heads that are either delayed |
1674 | * or unwritten. | 1681 | * or unwritten. |
1675 | * | 1682 | * |
1676 | * We can get recursively called as shown below. | 1683 | * We can get recursively called as shown below. |
1677 | * | 1684 | * |
1678 | * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> | 1685 | * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> |
1679 | * ext4_writepage() | 1686 | * ext4_writepage() |
1680 | * | 1687 | * |
1681 | * But since we don't do any block allocation we should not deadlock. | 1688 | * But since we don't do any block allocation we should not deadlock. |
1682 | * The page also has the dirty flag cleared so we don't get a recursive page_lock. | 1689 | * The page also has the dirty flag cleared so we don't get a recursive page_lock. |
1683 | */ | 1690 | */ |
1684 | static int ext4_writepage(struct page *page, | 1691 | static int ext4_writepage(struct page *page, |
1685 | struct writeback_control *wbc) | 1692 | struct writeback_control *wbc) |
1686 | { | 1693 | { |
1687 | int ret = 0; | 1694 | int ret = 0; |
1688 | loff_t size; | 1695 | loff_t size; |
1689 | unsigned int len; | 1696 | unsigned int len; |
1690 | struct buffer_head *page_bufs = NULL; | 1697 | struct buffer_head *page_bufs = NULL; |
1691 | struct inode *inode = page->mapping->host; | 1698 | struct inode *inode = page->mapping->host; |
1692 | struct ext4_io_submit io_submit; | 1699 | struct ext4_io_submit io_submit; |
1693 | bool keep_towrite = false; | 1700 | bool keep_towrite = false; |
1694 | 1701 | ||
1695 | trace_ext4_writepage(page); | 1702 | trace_ext4_writepage(page); |
1696 | size = i_size_read(inode); | 1703 | size = i_size_read(inode); |
1697 | if (page->index == size >> PAGE_CACHE_SHIFT) | 1704 | if (page->index == size >> PAGE_CACHE_SHIFT) |
1698 | len = size & ~PAGE_CACHE_MASK; | 1705 | len = size & ~PAGE_CACHE_MASK; |
1699 | else | 1706 | else |
1700 | len = PAGE_CACHE_SIZE; | 1707 | len = PAGE_CACHE_SIZE; |
1701 | 1708 | ||
1702 | page_bufs = page_buffers(page); | 1709 | page_bufs = page_buffers(page); |
1703 | /* | 1710 | /* |
1704 | * We cannot do block allocation or other extent handling in this | 1711 | * We cannot do block allocation or other extent handling in this |
1705 | * function. If there are buffers needing that, we have to redirty | 1712 | * function. If there are buffers needing that, we have to redirty |
1706 | * the page. But we may reach here when we do a journal commit via | 1713 | * the page. But we may reach here when we do a journal commit via |
1707 | * journal_submit_inode_data_buffers() and in that case we must write | 1714 | * journal_submit_inode_data_buffers() and in that case we must write |
1708 | * allocated buffers to achieve data=ordered mode guarantees. | 1715 | * allocated buffers to achieve data=ordered mode guarantees. |
1709 | */ | 1716 | */ |
1710 | if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL, | 1717 | if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL, |
1711 | ext4_bh_delay_or_unwritten)) { | 1718 | ext4_bh_delay_or_unwritten)) { |
1712 | redirty_page_for_writepage(wbc, page); | 1719 | redirty_page_for_writepage(wbc, page); |
1713 | if (current->flags & PF_MEMALLOC) { | 1720 | if (current->flags & PF_MEMALLOC) { |
1714 | /* | 1721 | /* |
1715 | * For memory cleaning there's no point in writing only | 1722 | * For memory cleaning there's no point in writing only |
1716 | * some buffers. So just bail out. Warn if we came here | 1723 | * some buffers. So just bail out. Warn if we came here |
1717 | * from direct reclaim. | 1724 | * from direct reclaim. |
1718 | */ | 1725 | */ |
1719 | WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) | 1726 | WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) |
1720 | == PF_MEMALLOC); | 1727 | == PF_MEMALLOC); |
1721 | unlock_page(page); | 1728 | unlock_page(page); |
1722 | return 0; | 1729 | return 0; |
1723 | } | 1730 | } |
1724 | keep_towrite = true; | 1731 | keep_towrite = true; |
1725 | } | 1732 | } |
1726 | 1733 | ||
1727 | if (PageChecked(page) && ext4_should_journal_data(inode)) | 1734 | if (PageChecked(page) && ext4_should_journal_data(inode)) |
1728 | /* | 1735 | /* |
1729 | * It's mmapped pagecache. Add buffers and journal it. There | 1736 | * It's mmapped pagecache. Add buffers and journal it. There |
1730 | * doesn't seem much point in redirtying the page here. | 1737 | * doesn't seem much point in redirtying the page here. |
1731 | */ | 1738 | */ |
1732 | return __ext4_journalled_writepage(page, len); | 1739 | return __ext4_journalled_writepage(page, len); |
1733 | 1740 | ||
1734 | ext4_io_submit_init(&io_submit, wbc); | 1741 | ext4_io_submit_init(&io_submit, wbc); |
1735 | io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); | 1742 | io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); |
1736 | if (!io_submit.io_end) { | 1743 | if (!io_submit.io_end) { |
1737 | redirty_page_for_writepage(wbc, page); | 1744 | redirty_page_for_writepage(wbc, page); |
1738 | unlock_page(page); | 1745 | unlock_page(page); |
1739 | return -ENOMEM; | 1746 | return -ENOMEM; |
1740 | } | 1747 | } |
1741 | ret = ext4_bio_write_page(&io_submit, page, len, wbc, keep_towrite); | 1748 | ret = ext4_bio_write_page(&io_submit, page, len, wbc, keep_towrite); |
1742 | ext4_io_submit(&io_submit); | 1749 | ext4_io_submit(&io_submit); |
1743 | /* Drop io_end reference we got from init */ | 1750 | /* Drop io_end reference we got from init */ |
1744 | ext4_put_io_end_defer(io_submit.io_end); | 1751 | ext4_put_io_end_defer(io_submit.io_end); |
1745 | return ret; | 1752 | return ret; |
1746 | } | 1753 | } |
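The truncate/mmap scenario described in the comment above ext4_writepage(), written out as a self-contained userspace sketch; it assumes a filesystem with 1K blocksize and 4K pages, the path "f" is a placeholder, and error checks are omitted for brevity:

	#include <fcntl.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("f", O_RDWR | O_CREAT, 0644);
		char *a;

		ftruncate(fd, 1024);	/* file covers one 1K block */
		a = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		a[0] = 'a';		/* page_mkwrite maps only the first buffer_head */
		ftruncate(fd, 4096);	/* the page's other three buffers: unmapped, dirty */
		return 0;
	}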
1747 | 1754 | ||
1748 | static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) | 1755 | static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) |
1749 | { | 1756 | { |
1750 | int len; | 1757 | int len; |
1751 | loff_t size = i_size_read(mpd->inode); | 1758 | loff_t size = i_size_read(mpd->inode); |
1752 | int err; | 1759 | int err; |
1753 | 1760 | ||
1754 | BUG_ON(page->index != mpd->first_page); | 1761 | BUG_ON(page->index != mpd->first_page); |
1755 | if (page->index == size >> PAGE_CACHE_SHIFT) | 1762 | if (page->index == size >> PAGE_CACHE_SHIFT) |
1756 | len = size & ~PAGE_CACHE_MASK; | 1763 | len = size & ~PAGE_CACHE_MASK; |
1757 | else | 1764 | else |
1758 | len = PAGE_CACHE_SIZE; | 1765 | len = PAGE_CACHE_SIZE; |
1759 | clear_page_dirty_for_io(page); | 1766 | clear_page_dirty_for_io(page); |
1760 | err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false); | 1767 | err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false); |
1761 | if (!err) | 1768 | if (!err) |
1762 | mpd->wbc->nr_to_write--; | 1769 | mpd->wbc->nr_to_write--; |
1763 | mpd->first_page++; | 1770 | mpd->first_page++; |
1764 | 1771 | ||
1765 | return err; | 1772 | return err; |
1766 | } | 1773 | } |
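A worked example of the partial-page length computation above, assuming 4K pages:

	/*
	 * For i_size = 5000, the page at index 1 (bytes 4096..8191) contains
	 * EOF, so len = 5000 & ~PAGE_CACHE_MASK = 904 and only those bytes
	 * are written back; every page before it gets the full
	 * PAGE_CACHE_SIZE.
	 */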
1767 | 1774 | ||
1768 | #define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay)) | 1775 | #define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay)) |
1769 | 1776 | ||
1770 | /* | 1777 | /* |
1771 | * mballoc gives us at most this number of blocks... | 1778 | * mballoc gives us at most this number of blocks... |
1772 | * XXX: That seems to be only a limitation of ext4_mb_normalize_request(). | 1779 | * XXX: That seems to be only a limitation of ext4_mb_normalize_request(). |
1773 | * The rest of mballoc seems to handle chunks up to full group size. | 1780 | * The rest of mballoc seems to handle chunks up to full group size. |
1774 | */ | 1781 | */ |
1775 | #define MAX_WRITEPAGES_EXTENT_LEN 2048 | 1782 | #define MAX_WRITEPAGES_EXTENT_LEN 2048 |
1776 | 1783 | ||
1777 | /* | 1784 | /* |
1778 | * mpage_add_bh_to_extent - try to add bh to extent of blocks to map | 1785 | * mpage_add_bh_to_extent - try to add bh to extent of blocks to map |
1779 | * | 1786 | * |
1780 | * @mpd - extent of blocks | 1787 | * @mpd - extent of blocks |
1781 | * @lblk - logical number of the block in the file | 1788 | * @lblk - logical number of the block in the file |
1782 | * @bh - buffer head we want to add to the extent | 1789 | * @bh - buffer head we want to add to the extent |
1783 | * | 1790 | * |
1784 | * The function is used to collect contiguous blocks in the same state. If the | 1791 | * The function is used to collect contiguous blocks in the same state. If the |
1785 | * buffer doesn't require mapping for writeback and we haven't started the | 1792 | * buffer doesn't require mapping for writeback and we haven't started the |
1786 | * extent of buffers to map yet, the function returns 'true' immediately - the | 1793 | * extent of buffers to map yet, the function returns 'true' immediately - the |
1787 | * caller can write the buffer right away. Otherwise the function returns true | 1794 | * caller can write the buffer right away. Otherwise the function returns true |
1788 | * if the block has been added to the extent, false if the block couldn't be | 1795 | * if the block has been added to the extent, false if the block couldn't be |
1789 | * added. | 1796 | * added. |
1790 | */ | 1797 | */ |
1791 | static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, | 1798 | static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, |
1792 | struct buffer_head *bh) | 1799 | struct buffer_head *bh) |
1793 | { | 1800 | { |
1794 | struct ext4_map_blocks *map = &mpd->map; | 1801 | struct ext4_map_blocks *map = &mpd->map; |
1795 | 1802 | ||
1796 | /* Buffer that doesn't need mapping for writeback? */ | 1803 | /* Buffer that doesn't need mapping for writeback? */ |
1797 | if (!buffer_dirty(bh) || !buffer_mapped(bh) || | 1804 | if (!buffer_dirty(bh) || !buffer_mapped(bh) || |
1798 | (!buffer_delay(bh) && !buffer_unwritten(bh))) { | 1805 | (!buffer_delay(bh) && !buffer_unwritten(bh))) { |
1799 | /* So far no extent to map => we write the buffer right away */ | 1806 | /* So far no extent to map => we write the buffer right away */ |
1800 | if (map->m_len == 0) | 1807 | if (map->m_len == 0) |
1801 | return true; | 1808 | return true; |
1802 | return false; | 1809 | return false; |
1803 | } | 1810 | } |
1804 | 1811 | ||
1805 | /* First block in the extent? */ | 1812 | /* First block in the extent? */ |
1806 | if (map->m_len == 0) { | 1813 | if (map->m_len == 0) { |
1807 | map->m_lblk = lblk; | 1814 | map->m_lblk = lblk; |
1808 | map->m_len = 1; | 1815 | map->m_len = 1; |
1809 | map->m_flags = bh->b_state & BH_FLAGS; | 1816 | map->m_flags = bh->b_state & BH_FLAGS; |
1810 | return true; | 1817 | return true; |
1811 | } | 1818 | } |
1812 | 1819 | ||
1813 | /* Don't go larger than mballoc is willing to allocate */ | 1820 | /* Don't go larger than mballoc is willing to allocate */ |
1814 | if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN) | 1821 | if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN) |
1815 | return false; | 1822 | return false; |
1816 | 1823 | ||
1817 | /* Can we merge the block to our big extent? */ | 1824 | /* Can we merge the block to our big extent? */ |
1818 | if (lblk == map->m_lblk + map->m_len && | 1825 | if (lblk == map->m_lblk + map->m_len && |
1819 | (bh->b_state & BH_FLAGS) == map->m_flags) { | 1826 | (bh->b_state & BH_FLAGS) == map->m_flags) { |
1820 | map->m_len++; | 1827 | map->m_len++; |
1821 | return true; | 1828 | return true; |
1822 | } | 1829 | } |
1823 | return false; | 1830 | return false; |
1824 | } | 1831 | } |
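A worked example of the merging rules above, with illustrative block numbers:

	/*
	 * Three dirty, mapped buffers at lblk 10, 11, 12, where 10 and 11
	 * are delayed and 12 is unwritten: the call for 10 starts the extent
	 * (m_lblk = 10, m_len = 1, m_flags = 1 << BH_Delay), the call for 11
	 * merges it (m_len = 2), and the call for 12 returns false because
	 * its (b_state & BH_FLAGS) no longer matches m_flags, so the extent
	 * ends there.
	 */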
1825 | 1832 | ||
1826 | /* | 1833 | /* |
1827 | * mpage_process_page_bufs - submit page buffers for IO or add them to extent | 1834 | * mpage_process_page_bufs - submit page buffers for IO or add them to extent |
1828 | * | 1835 | * |
1829 | * @mpd - extent of blocks for mapping | 1836 | * @mpd - extent of blocks for mapping |
1830 | * @head - the first buffer in the page | 1837 | * @head - the first buffer in the page |
1831 | * @bh - buffer we should start processing from | 1838 | * @bh - buffer we should start processing from |
1832 | * @lblk - logical number of the block in the file corresponding to @bh | 1839 | * @lblk - logical number of the block in the file corresponding to @bh |
1833 | * | 1840 | * |
1834 | * Walk through page buffers from @bh up to @head (exclusive) and either submit | 1841 | * Walk through page buffers from @bh up to @head (exclusive) and either submit |
1835 | * the page for IO if all buffers in this page were mapped and there's no | 1842 | * the page for IO if all buffers in this page were mapped and there's no |
1836 | * accumulated extent of buffers to map or add buffers in the page to the | 1843 | * accumulated extent of buffers to map or add buffers in the page to the |
1837 | * extent of buffers to map. The function returns 1 if the caller can continue | 1844 | * extent of buffers to map. The function returns 1 if the caller can continue |
1838 | * by processing the next page, 0 if it should stop adding buffers to the | 1845 | * by processing the next page, 0 if it should stop adding buffers to the |
1839 | * extent to map because we cannot extend it anymore. It can also return a | 1846 | * extent to map because we cannot extend it anymore. It can also return a |
1840 | * value < 0 in case of an error during IO submission. | 1847 | * value < 0 in case of an error during IO submission. |
1841 | */ | 1848 | */ |
1842 | static int mpage_process_page_bufs(struct mpage_da_data *mpd, | 1849 | static int mpage_process_page_bufs(struct mpage_da_data *mpd, |
1843 | struct buffer_head *head, | 1850 | struct buffer_head *head, |
1844 | struct buffer_head *bh, | 1851 | struct buffer_head *bh, |
1845 | ext4_lblk_t lblk) | 1852 | ext4_lblk_t lblk) |
1846 | { | 1853 | { |
1847 | struct inode *inode = mpd->inode; | 1854 | struct inode *inode = mpd->inode; |
1848 | int err; | 1855 | int err; |
1849 | ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1) | 1856 | ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1) |
1850 | >> inode->i_blkbits; | 1857 | >> inode->i_blkbits; |
1851 | 1858 | ||
1852 | do { | 1859 | do { |
1853 | BUG_ON(buffer_locked(bh)); | 1860 | BUG_ON(buffer_locked(bh)); |
1854 | 1861 | ||
1855 | if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) { | 1862 | if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) { |
1856 | /* Found extent to map? */ | 1863 | /* Found extent to map? */ |
1857 | if (mpd->map.m_len) | 1864 | if (mpd->map.m_len) |
1858 | return 0; | 1865 | return 0; |
1859 | /* Everything mapped so far and we hit EOF */ | 1866 | /* Everything mapped so far and we hit EOF */ |
1860 | break; | 1867 | break; |
1861 | } | 1868 | } |
1862 | } while (lblk++, (bh = bh->b_this_page) != head); | 1869 | } while (lblk++, (bh = bh->b_this_page) != head); |
1863 | /* So far everything mapped? Submit the page for IO. */ | 1870 | /* So far everything mapped? Submit the page for IO. */ |
1864 | if (mpd->map.m_len == 0) { | 1871 | if (mpd->map.m_len == 0) { |
1865 | err = mpage_submit_page(mpd, head->b_page); | 1872 | err = mpage_submit_page(mpd, head->b_page); |
1866 | if (err < 0) | 1873 | if (err < 0) |
1867 | return err; | 1874 | return err; |
1868 | } | 1875 | } |
1869 | return lblk < blocks; | 1876 | return lblk < blocks; |
1870 | } | 1877 | } |
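A worked example of the EOF rounding at the top of this function, assuming 1K blocks:

	/*
	 * For i_size = 6000, blocks = (6000 + 1023) >> 10 = 6, so lblks 0..5
	 * are inside the file. When the walk reaches lblk 6 with no extent
	 * accumulated, it breaks out, submits the page, and returns 0
	 * because "lblk < blocks" is false.
	 */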
1871 | 1878 | ||
1872 | /* | 1879 | /* |
1873 | * mpage_map_and_submit_buffers - update buffers corresponding to changed extent and | 1880 | * mpage_map_and_submit_buffers - update buffers corresponding to changed extent and |
1874 | * submit fully mapped pages for IO | 1881 | * submit fully mapped pages for IO |
1875 | * | 1882 | * |
1876 | * @mpd - description of extent to map, on return next extent to map | 1883 | * @mpd - description of extent to map, on return next extent to map |
1877 | * | 1884 | * |
1878 | * Scan buffers corresponding to changed extent (we expect corresponding pages | 1885 | * Scan buffers corresponding to changed extent (we expect corresponding pages |
1879 | * to be already locked) and update buffer state according to new extent state. | 1886 | * to be already locked) and update buffer state according to new extent state. |
1880 | * We map delalloc buffers to their physical location, clear unwritten bits, | 1887 | * We map delalloc buffers to their physical location, clear unwritten bits, |
1881 | * and mark buffers as uninit when we perform writes to unwritten extents | 1888 | * and mark buffers as uninit when we perform writes to unwritten extents |
1882 | * and do extent conversion after IO is finished. If the last page is not fully | 1889 | * and do extent conversion after IO is finished. If the last page is not fully |
1883 | * mapped, we update @map to the next extent in the last page that needs | 1890 | * mapped, we update @map to the next extent in the last page that needs |
1884 | * mapping. Otherwise we submit the page for IO. | 1891 | * mapping. Otherwise we submit the page for IO. |
1885 | */ | 1892 | */ |
1886 | static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) | 1893 | static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) |
1887 | { | 1894 | { |
1888 | struct pagevec pvec; | 1895 | struct pagevec pvec; |
1889 | int nr_pages, i; | 1896 | int nr_pages, i; |
1890 | struct inode *inode = mpd->inode; | 1897 | struct inode *inode = mpd->inode; |
1891 | struct buffer_head *head, *bh; | 1898 | struct buffer_head *head, *bh; |
1892 | int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits; | 1899 | int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits; |
1893 | pgoff_t start, end; | 1900 | pgoff_t start, end; |
1894 | ext4_lblk_t lblk; | 1901 | ext4_lblk_t lblk; |
1895 | sector_t pblock; | 1902 | sector_t pblock; |
1896 | int err; | 1903 | int err; |
1897 | 1904 | ||
1898 | start = mpd->map.m_lblk >> bpp_bits; | 1905 | start = mpd->map.m_lblk >> bpp_bits; |
1899 | end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; | 1906 | end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; |
1900 | lblk = start << bpp_bits; | 1907 | lblk = start << bpp_bits; |
1901 | pblock = mpd->map.m_pblk; | 1908 | pblock = mpd->map.m_pblk; |
1902 | 1909 | ||
1903 | pagevec_init(&pvec, 0); | 1910 | pagevec_init(&pvec, 0); |
1904 | while (start <= end) { | 1911 | while (start <= end) { |
1905 | nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start, | 1912 | nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start, |
1906 | PAGEVEC_SIZE); | 1913 | PAGEVEC_SIZE); |
1907 | if (nr_pages == 0) | 1914 | if (nr_pages == 0) |
1908 | break; | 1915 | break; |
1909 | for (i = 0; i < nr_pages; i++) { | 1916 | for (i = 0; i < nr_pages; i++) { |
1910 | struct page *page = pvec.pages[i]; | 1917 | struct page *page = pvec.pages[i]; |
1911 | 1918 | ||
1912 | if (page->index > end) | 1919 | if (page->index > end) |
1913 | break; | 1920 | break; |
1914 | /* Up to 'end' pages must be contiguous */ | 1921 | /* Up to 'end' pages must be contiguous */ |
1915 | BUG_ON(page->index != start); | 1922 | BUG_ON(page->index != start); |
1916 | bh = head = page_buffers(page); | 1923 | bh = head = page_buffers(page); |
1917 | do { | 1924 | do { |
1918 | if (lblk < mpd->map.m_lblk) | 1925 | if (lblk < mpd->map.m_lblk) |
1919 | continue; | 1926 | continue; |
1920 | if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { | 1927 | if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { |
1921 | /* | 1928 | /* |
1922 | * Buffer after end of mapped extent. | 1929 | * Buffer after end of mapped extent. |
1923 | * Find next buffer in the page to map. | 1930 | * Find next buffer in the page to map. |
1924 | */ | 1931 | */ |
1925 | mpd->map.m_len = 0; | 1932 | mpd->map.m_len = 0; |
1926 | mpd->map.m_flags = 0; | 1933 | mpd->map.m_flags = 0; |
1927 | /* | 1934 | /* |
1928 | * FIXME: If dioread_nolock supports | 1935 | * FIXME: If dioread_nolock supports |
1929 | * blocksize < pagesize, we need to make | 1936 | * blocksize < pagesize, we need to make |
1930 | * sure we add size mapped so far to | 1937 | * sure we add size mapped so far to |
1931 | * io_end->size as the following call | 1938 | * io_end->size as the following call |
1932 | * can submit the page for IO. | 1939 | * can submit the page for IO. |
1933 | */ | 1940 | */ |
1934 | err = mpage_process_page_bufs(mpd, head, | 1941 | err = mpage_process_page_bufs(mpd, head, |
1935 | bh, lblk); | 1942 | bh, lblk); |
1936 | pagevec_release(&pvec); | 1943 | pagevec_release(&pvec); |
1937 | if (err > 0) | 1944 | if (err > 0) |
1938 | err = 0; | 1945 | err = 0; |
1939 | return err; | 1946 | return err; |
1940 | } | 1947 | } |
1941 | if (buffer_delay(bh)) { | 1948 | if (buffer_delay(bh)) { |
1942 | clear_buffer_delay(bh); | 1949 | clear_buffer_delay(bh); |
1943 | bh->b_blocknr = pblock++; | 1950 | bh->b_blocknr = pblock++; |
1944 | } | 1951 | } |
1945 | clear_buffer_unwritten(bh); | 1952 | clear_buffer_unwritten(bh); |
1946 | } while (lblk++, (bh = bh->b_this_page) != head); | 1953 | } while (lblk++, (bh = bh->b_this_page) != head); |
1947 | 1954 | ||
1948 | /* | 1955 | /* |
1949 | * FIXME: This is going to break if dioread_nolock | 1956 | * FIXME: This is going to break if dioread_nolock |
1950 | * supports blocksize < pagesize as we will try to | 1957 | * supports blocksize < pagesize as we will try to |
1951 | * convert potentially unmapped parts of inode. | 1958 | * convert potentially unmapped parts of inode. |
1952 | */ | 1959 | */ |
1953 | mpd->io_submit.io_end->size += PAGE_CACHE_SIZE; | 1960 | mpd->io_submit.io_end->size += PAGE_CACHE_SIZE; |
1954 | /* Page fully mapped - let IO run! */ | 1961 | /* Page fully mapped - let IO run! */ |
1955 | err = mpage_submit_page(mpd, page); | 1962 | err = mpage_submit_page(mpd, page); |
1956 | if (err < 0) { | 1963 | if (err < 0) { |
1957 | pagevec_release(&pvec); | 1964 | pagevec_release(&pvec); |
1958 | return err; | 1965 | return err; |
1959 | } | 1966 | } |
1960 | start++; | 1967 | start++; |
1961 | } | 1968 | } |
1962 | pagevec_release(&pvec); | 1969 | pagevec_release(&pvec); |
1963 | } | 1970 | } |
1964 | /* Extent fully mapped and matches with page boundary. We are done. */ | 1971 | /* Extent fully mapped and matches with page boundary. We are done. */ |
1965 | mpd->map.m_len = 0; | 1972 | mpd->map.m_len = 0; |
1966 | mpd->map.m_flags = 0; | 1973 | mpd->map.m_flags = 0; |
1967 | return 0; | 1974 | return 0; |
1968 | } | 1975 | } |
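A worked example of the page-index/logical-block conversion above, assuming 4K pages and 1K blocks:

	/*
	 * bpp_bits = PAGE_CACHE_SHIFT - i_blkbits = 12 - 10 = 2, so an
	 * extent starting at m_lblk = 13 begins in page index 13 >> 2 = 3,
	 * and that page covers lblks 3 << 2 = 12 through 15.
	 */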
1969 | 1976 | ||
1970 | static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) | 1977 | static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) |
1971 | { | 1978 | { |
1972 | struct inode *inode = mpd->inode; | 1979 | struct inode *inode = mpd->inode; |
1973 | struct ext4_map_blocks *map = &mpd->map; | 1980 | struct ext4_map_blocks *map = &mpd->map; |
1974 | int get_blocks_flags; | 1981 | int get_blocks_flags; |
1975 | int err, dioread_nolock; | 1982 | int err, dioread_nolock; |
1976 | 1983 | ||
1977 | trace_ext4_da_write_pages_extent(inode, map); | 1984 | trace_ext4_da_write_pages_extent(inode, map); |
1978 | /* | 1985 | /* |
1979 | * Call ext4_map_blocks() to allocate any delayed allocation blocks, or | 1986 | * Call ext4_map_blocks() to allocate any delayed allocation blocks, or |
1980 | * to convert an unwritten extent to be initialized (in the case | 1987 | * to convert an unwritten extent to be initialized (in the case |
1981 | * where we have written into one or more preallocated blocks). It is | 1988 | * where we have written into one or more preallocated blocks). It is |
1982 | * possible that we're going to need more metadata blocks than | 1989 | * possible that we're going to need more metadata blocks than |
1983 | * previously reserved. However we must not fail because we're in | 1990 | * previously reserved. However we must not fail because we're in |
1984 | * writeback and there is nothing we can do about it so it might result | 1991 | * writeback and there is nothing we can do about it so it might result |
1985 | * in data loss. So use reserved blocks to allocate metadata if | 1992 | * in data loss. So use reserved blocks to allocate metadata if |
1986 | * possible. | 1993 | * possible. |
1987 | * | 1994 | * |
1988 | * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if | 1995 | * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if |
1989 | * the blocks in question are delalloc blocks. This indicates | 1996 | * the blocks in question are delalloc blocks. This indicates |
1990 | * that the blocks and quotas have already been checked when | 1997 | * that the blocks and quotas have already been checked when |
1991 | * the data was copied into the page cache. | 1998 | * the data was copied into the page cache. |
1992 | */ | 1999 | */ |
1993 | get_blocks_flags = EXT4_GET_BLOCKS_CREATE | | 2000 | get_blocks_flags = EXT4_GET_BLOCKS_CREATE | |
1994 | EXT4_GET_BLOCKS_METADATA_NOFAIL; | 2001 | EXT4_GET_BLOCKS_METADATA_NOFAIL; |
1995 | dioread_nolock = ext4_should_dioread_nolock(inode); | 2002 | dioread_nolock = ext4_should_dioread_nolock(inode); |
1996 | if (dioread_nolock) | 2003 | if (dioread_nolock) |
1997 | get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; | 2004 | get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; |
1998 | if (map->m_flags & (1 << BH_Delay)) | 2005 | if (map->m_flags & (1 << BH_Delay)) |
1999 | get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; | 2006 | get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; |
2000 | 2007 | ||
2001 | err = ext4_map_blocks(handle, inode, map, get_blocks_flags); | 2008 | err = ext4_map_blocks(handle, inode, map, get_blocks_flags); |
2002 | if (err < 0) | 2009 | if (err < 0) |
2003 | return err; | 2010 | return err; |
2004 | if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) { | 2011 | if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) { |
2005 | if (!mpd->io_submit.io_end->handle && | 2012 | if (!mpd->io_submit.io_end->handle && |
2006 | ext4_handle_valid(handle)) { | 2013 | ext4_handle_valid(handle)) { |
2007 | mpd->io_submit.io_end->handle = handle->h_rsv_handle; | 2014 | mpd->io_submit.io_end->handle = handle->h_rsv_handle; |
2008 | handle->h_rsv_handle = NULL; | 2015 | handle->h_rsv_handle = NULL; |
2009 | } | 2016 | } |
2010 | ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end); | 2017 | ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end); |
2011 | } | 2018 | } |
2012 | 2019 | ||
2013 | BUG_ON(map->m_len == 0); | 2020 | BUG_ON(map->m_len == 0); |
2014 | if (map->m_flags & EXT4_MAP_NEW) { | 2021 | if (map->m_flags & EXT4_MAP_NEW) { |
2015 | struct block_device *bdev = inode->i_sb->s_bdev; | 2022 | struct block_device *bdev = inode->i_sb->s_bdev; |
2016 | int i; | 2023 | int i; |
2017 | 2024 | ||
2018 | for (i = 0; i < map->m_len; i++) | 2025 | for (i = 0; i < map->m_len; i++) |
2019 | unmap_underlying_metadata(bdev, map->m_pblk + i); | 2026 | unmap_underlying_metadata(bdev, map->m_pblk + i); |
2020 | } | 2027 | } |
2021 | return 0; | 2028 | return 0; |
2022 | } | 2029 | } |
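An example of how the flag selection above composes, for a dioread_nolock mount writing over delayed-allocated buffers:

	/*
	 * get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
	 *		      EXT4_GET_BLOCKS_METADATA_NOFAIL |
	 *		      EXT4_GET_BLOCKS_IO_CREATE_EXT |
	 *		      EXT4_GET_BLOCKS_DELALLOC_RESERVE;
	 * i.e. the allocation may dip into reserved metadata blocks, the
	 * extent is created unwritten (converted after IO completes), and
	 * no fresh block/quota reservation is charged for delalloc blocks.
	 */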
2023 | 2030 | ||
2024 | /* | 2031 | /* |
2025 | * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length | 2032 | * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length |
2026 | * mpd->len and submit pages underlying it for IO | 2033 | * mpd->len and submit pages underlying it for IO |
2027 | * | 2034 | * |
2028 | * @handle - handle for journal operations | 2035 | * @handle - handle for journal operations |
2029 | * @mpd - extent to map | 2036 | * @mpd - extent to map |
2030 | * @give_up_on_write - we set this to true iff there is a fatal error and there | 2037 | * @give_up_on_write - we set this to true iff there is a fatal error and there |
2031 | * is no hope of writing the data. The caller should discard | 2038 | * is no hope of writing the data. The caller should discard |
2032 | * dirty pages to avoid infinite loops. | 2039 | * dirty pages to avoid infinite loops. |
2033 | * | 2040 | * |
2034 | * The function maps extent starting at mpd->lblk of length mpd->len. If it is | 2041 | * The function maps extent starting at mpd->lblk of length mpd->len. If it is |
2035 | * delayed, blocks are allocated, if it is unwritten, we may need to convert | 2042 | * delayed, blocks are allocated, if it is unwritten, we may need to convert |
2036 | * them to initialized or split the described range from larger unwritten | 2043 | * them to initialized or split the described range from larger unwritten |
2037 | * extent. Note that we need not map all the described range since allocation | 2044 | * extent. Note that we need not map all the described range since allocation |
2038 | * can return fewer blocks or the range is covered by more unwritten extents. We | 2045 | * can return fewer blocks or the range is covered by more unwritten extents. We |
2039 | * cannot map more because we are limited by reserved transaction credits. On | 2046 | * cannot map more because we are limited by reserved transaction credits. On |
2040 | * the other hand we always make sure that the last touched page is fully | 2047 | * the other hand we always make sure that the last touched page is fully |
2041 | * mapped so that it can be written out (and thus forward progress is | 2048 | * mapped so that it can be written out (and thus forward progress is |
2042 | * guaranteed). After mapping we submit all mapped pages for IO. | 2049 | * guaranteed). After mapping we submit all mapped pages for IO. |
2043 | */ | 2050 | */ |
2044 | static int mpage_map_and_submit_extent(handle_t *handle, | 2051 | static int mpage_map_and_submit_extent(handle_t *handle, |
2045 | struct mpage_da_data *mpd, | 2052 | struct mpage_da_data *mpd, |
2046 | bool *give_up_on_write) | 2053 | bool *give_up_on_write) |
2047 | { | 2054 | { |
2048 | struct inode *inode = mpd->inode; | 2055 | struct inode *inode = mpd->inode; |
2049 | struct ext4_map_blocks *map = &mpd->map; | 2056 | struct ext4_map_blocks *map = &mpd->map; |
2050 | int err; | 2057 | int err; |
2051 | loff_t disksize; | 2058 | loff_t disksize; |
2052 | int progress = 0; | 2059 | int progress = 0; |
2053 | 2060 | ||
2054 | mpd->io_submit.io_end->offset = | 2061 | mpd->io_submit.io_end->offset = |
2055 | ((loff_t)map->m_lblk) << inode->i_blkbits; | 2062 | ((loff_t)map->m_lblk) << inode->i_blkbits; |
2056 | do { | 2063 | do { |
2057 | err = mpage_map_one_extent(handle, mpd); | 2064 | err = mpage_map_one_extent(handle, mpd); |
2058 | if (err < 0) { | 2065 | if (err < 0) { |
2059 | struct super_block *sb = inode->i_sb; | 2066 | struct super_block *sb = inode->i_sb; |
2060 | 2067 | ||
2061 | if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) | 2068 | if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) |
2062 | goto invalidate_dirty_pages; | 2069 | goto invalidate_dirty_pages; |
2063 | /* | 2070 | /* |
2064 | * Let the upper layers retry transient errors. | 2071 | * Let the upper layers retry transient errors. |
2065 | * In the case of ENOSPC, if ext4_count_free_blocks() | 2072 | * In the case of ENOSPC, if ext4_count_free_blocks() |
2066 | * is non-zero, a commit should free up blocks. | 2073 | * is non-zero, a commit should free up blocks. |
2067 | */ | 2074 | */ |
2068 | if ((err == -ENOMEM) || | 2075 | if ((err == -ENOMEM) || |
2069 | (err == -ENOSPC && ext4_count_free_clusters(sb))) { | 2076 | (err == -ENOSPC && ext4_count_free_clusters(sb))) { |
2070 | if (progress) | 2077 | if (progress) |
2071 | goto update_disksize; | 2078 | goto update_disksize; |
2072 | return err; | 2079 | return err; |
2073 | } | 2080 | } |
2074 | ext4_msg(sb, KERN_CRIT, | 2081 | ext4_msg(sb, KERN_CRIT, |
2075 | "Delayed block allocation failed for " | 2082 | "Delayed block allocation failed for " |
2076 | "inode %lu at logical offset %llu with" | 2083 | "inode %lu at logical offset %llu with" |
2077 | " max blocks %u with error %d", | 2084 | " max blocks %u with error %d", |
2078 | inode->i_ino, | 2085 | inode->i_ino, |
2079 | (unsigned long long)map->m_lblk, | 2086 | (unsigned long long)map->m_lblk, |
2080 | (unsigned)map->m_len, -err); | 2087 | (unsigned)map->m_len, -err); |
2081 | ext4_msg(sb, KERN_CRIT, | 2088 | ext4_msg(sb, KERN_CRIT, |
2082 | "This should not happen!! Data will " | 2089 | "This should not happen!! Data will " |
2083 | "be lost\n"); | 2090 | "be lost\n"); |
2084 | if (err == -ENOSPC) | 2091 | if (err == -ENOSPC) |
2085 | ext4_print_free_blocks(inode); | 2092 | ext4_print_free_blocks(inode); |
2086 | invalidate_dirty_pages: | 2093 | invalidate_dirty_pages: |
2087 | *give_up_on_write = true; | 2094 | *give_up_on_write = true; |
2088 | return err; | 2095 | return err; |
2089 | } | 2096 | } |
2090 | progress = 1; | 2097 | progress = 1; |
2091 | /* | 2098 | /* |
2092 | * Update buffer state, submit mapped pages, and get us new | 2099 | * Update buffer state, submit mapped pages, and get us new |
2093 | * extent to map | 2100 | * extent to map |
2094 | */ | 2101 | */ |
2095 | err = mpage_map_and_submit_buffers(mpd); | 2102 | err = mpage_map_and_submit_buffers(mpd); |
2096 | if (err < 0) | 2103 | if (err < 0) |
2097 | goto update_disksize; | 2104 | goto update_disksize; |
2098 | } while (map->m_len); | 2105 | } while (map->m_len); |
2099 | 2106 | ||
2100 | update_disksize: | 2107 | update_disksize: |
2101 | /* | 2108 | /* |
2102 | * Update on-disk size after IO is submitted. Races with | 2109 | * Update on-disk size after IO is submitted. Races with |
2103 | * truncate are avoided by checking i_size under i_data_sem. | 2110 | * truncate are avoided by checking i_size under i_data_sem. |
2104 | */ | 2111 | */ |
2105 | disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT; | 2112 | disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT; |
2106 | if (disksize > EXT4_I(inode)->i_disksize) { | 2113 | if (disksize > EXT4_I(inode)->i_disksize) { |
2107 | int err2; | 2114 | int err2; |
2108 | loff_t i_size; | 2115 | loff_t i_size; |
2109 | 2116 | ||
2110 | down_write(&EXT4_I(inode)->i_data_sem); | 2117 | down_write(&EXT4_I(inode)->i_data_sem); |
2111 | i_size = i_size_read(inode); | 2118 | i_size = i_size_read(inode); |
2112 | if (disksize > i_size) | 2119 | if (disksize > i_size) |
2113 | disksize = i_size; | 2120 | disksize = i_size; |
2114 | if (disksize > EXT4_I(inode)->i_disksize) | 2121 | if (disksize > EXT4_I(inode)->i_disksize) |
2115 | EXT4_I(inode)->i_disksize = disksize; | 2122 | EXT4_I(inode)->i_disksize = disksize; |
2116 | err2 = ext4_mark_inode_dirty(handle, inode); | 2123 | err2 = ext4_mark_inode_dirty(handle, inode); |
2117 | up_write(&EXT4_I(inode)->i_data_sem); | 2124 | up_write(&EXT4_I(inode)->i_data_sem); |
2118 | if (err2) | 2125 | if (err2) |
2119 | ext4_error(inode->i_sb, | 2126 | ext4_error(inode->i_sb, |
2120 | "Failed to mark inode %lu dirty", | 2127 | "Failed to mark inode %lu dirty", |
2121 | inode->i_ino); | 2128 | inode->i_ino); |
2122 | if (!err) | 2129 | if (!err) |
2123 | err = err2; | 2130 | err = err2; |
2124 | } | 2131 | } |
2125 | return err; | 2132 | return err; |
2126 | } | 2133 | } |
2127 | 2134 | ||
2128 | /* | 2135 | /* |
2129 | * Calculate the total number of credits to reserve for one writepages | 2136 | * Calculate the total number of credits to reserve for one writepages |
2130 | * iteration. This is called from ext4_writepages(). We map an extent of | 2137 | * iteration. This is called from ext4_writepages(). We map an extent of |
2131 | * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping | 2138 | * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping |
2132 | * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN + | 2139 | * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN + |
2133 | * bpp - 1 blocks in bpp different extents. | 2140 | * bpp - 1 blocks in bpp different extents. |
2134 | */ | 2141 | */ |
2135 | static int ext4_da_writepages_trans_blocks(struct inode *inode) | 2142 | static int ext4_da_writepages_trans_blocks(struct inode *inode) |
2136 | { | 2143 | { |
2137 | int bpp = ext4_journal_blocks_per_page(inode); | 2144 | int bpp = ext4_journal_blocks_per_page(inode); |
2138 | 2145 | ||
2139 | return ext4_meta_trans_blocks(inode, | 2146 | return ext4_meta_trans_blocks(inode, |
2140 | MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); | 2147 | MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); |
2141 | } | 2148 | } |
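A worked example of the credit estimate above:

	/*
	 * With 1K blocks on 4K pages, bpp = 4, so the transaction is sized
	 * for MAX_WRITEPAGES_EXTENT_LEN + 4 - 1 = 2051 blocks in up to 4
	 * separate extents; with 4K blocks, bpp = 1 and the estimate is
	 * simply 2048 blocks in a single extent.
	 */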
2142 | 2149 | ||
2143 | /* | 2150 | /* |
2144 | * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages | 2151 | * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages |
2145 | * and underlying extent to map | 2152 | * and underlying extent to map |
2146 | * | 2153 | * |
2147 | * @mpd - where to look for pages | 2154 | * @mpd - where to look for pages |
2148 | * | 2155 | * |
2149 | * Walk dirty pages in the mapping. If they are fully mapped, submit them for | 2156 | * Walk dirty pages in the mapping. If they are fully mapped, submit them for |
2150 | * IO immediately. When we find a page which isn't mapped we start accumulating | 2157 | * IO immediately. When we find a page which isn't mapped we start accumulating |
2151 | * an extent of buffers underlying these pages that need mapping (formed by | 2158 | * an extent of buffers underlying these pages that need mapping (formed by |
2152 | * either delayed or unwritten buffers). We also lock the pages containing | 2159 | * either delayed or unwritten buffers). We also lock the pages containing |
2153 | * these buffers. The extent found is returned in @mpd structure (starting at | 2160 | * these buffers. The extent found is returned in @mpd structure (starting at |
2154 | * mpd->lblk with length mpd->len blocks). | 2161 | * mpd->lblk with length mpd->len blocks). |
2155 | * | 2162 | * |
2156 | * Note that this function can attach bios to one io_end structure which are | 2163 | * Note that this function can attach bios to one io_end structure which are |
2157 | * neither logically nor physically contiguous. Although it may seem like an | 2164 | * neither logically nor physically contiguous. Although it may seem like an |
2158 | * unnecessary complication, it is actually inevitable in blocksize < pagesize | 2165 | * unnecessary complication, it is actually inevitable in blocksize < pagesize |
2159 | * case as we need to track IO to all buffers underlying a page in one io_end. | 2166 | * case as we need to track IO to all buffers underlying a page in one io_end. |
2160 | */ | 2167 | */ |
2161 | static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) | 2168 | static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) |
2162 | { | 2169 | { |
2163 | struct address_space *mapping = mpd->inode->i_mapping; | 2170 | struct address_space *mapping = mpd->inode->i_mapping; |
2164 | struct pagevec pvec; | 2171 | struct pagevec pvec; |
2165 | unsigned int nr_pages; | 2172 | unsigned int nr_pages; |
2166 | long left = mpd->wbc->nr_to_write; | 2173 | long left = mpd->wbc->nr_to_write; |
2167 | pgoff_t index = mpd->first_page; | 2174 | pgoff_t index = mpd->first_page; |
2168 | pgoff_t end = mpd->last_page; | 2175 | pgoff_t end = mpd->last_page; |
2169 | int tag; | 2176 | int tag; |
2170 | int i, err = 0; | 2177 | int i, err = 0; |
2171 | int blkbits = mpd->inode->i_blkbits; | 2178 | int blkbits = mpd->inode->i_blkbits; |
2172 | ext4_lblk_t lblk; | 2179 | ext4_lblk_t lblk; |
2173 | struct buffer_head *head; | 2180 | struct buffer_head *head; |
2174 | 2181 | ||
2175 | if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) | 2182 | if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) |
2176 | tag = PAGECACHE_TAG_TOWRITE; | 2183 | tag = PAGECACHE_TAG_TOWRITE; |
2177 | else | 2184 | else |
2178 | tag = PAGECACHE_TAG_DIRTY; | 2185 | tag = PAGECACHE_TAG_DIRTY; |
2179 | 2186 | ||
2180 | pagevec_init(&pvec, 0); | 2187 | pagevec_init(&pvec, 0); |
2181 | mpd->map.m_len = 0; | 2188 | mpd->map.m_len = 0; |
2182 | mpd->next_page = index; | 2189 | mpd->next_page = index; |
2183 | while (index <= end) { | 2190 | while (index <= end) { |
2184 | nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, | 2191 | nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, |
2185 | min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); | 2192 | min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); |
2186 | if (nr_pages == 0) | 2193 | if (nr_pages == 0) |
2187 | goto out; | 2194 | goto out; |
2188 | 2195 | ||
2189 | for (i = 0; i < nr_pages; i++) { | 2196 | for (i = 0; i < nr_pages; i++) { |
2190 | struct page *page = pvec.pages[i]; | 2197 | struct page *page = pvec.pages[i]; |
2191 | 2198 | ||
2192 | /* | 2199 | /* |
2193 | * At this point, the page may be truncated or | 2200 | * At this point, the page may be truncated or |
2194 | * invalidated (changing page->mapping to NULL), or | 2201 | * invalidated (changing page->mapping to NULL), or |
2195 | * even swizzled back from swapper_space to tmpfs file | 2202 | * even swizzled back from swapper_space to tmpfs file |
2196 | * mapping. However, page->index will not change | 2203 | * mapping. However, page->index will not change |
2197 | * because we have a reference on the page. | 2204 | * because we have a reference on the page. |
2198 | */ | 2205 | */ |
2199 | if (page->index > end) | 2206 | if (page->index > end) |
2200 | goto out; | 2207 | goto out; |
2201 | 2208 | ||
2202 | /* | 2209 | /* |
2203 | * Accumulated enough dirty pages? This doesn't apply | 2210 | * Accumulated enough dirty pages? This doesn't apply |
2204 | * to WB_SYNC_ALL mode. For integrity sync we have to | 2211 | * to WB_SYNC_ALL mode. For integrity sync we have to |
2205 | * keep going because someone may be concurrently | 2212 | * keep going because someone may be concurrently |
2206 | * dirtying pages, and we might have synced a lot of | 2213 | * dirtying pages, and we might have synced a lot of |
2207 | * newly appeared dirty pages, but have not synced all | 2214 | * newly appeared dirty pages, but have not synced all |
2208 | * of the old dirty pages. | 2215 | * of the old dirty pages. |
2209 | */ | 2216 | */ |
2210 | if (mpd->wbc->sync_mode == WB_SYNC_NONE && left <= 0) | 2217 | if (mpd->wbc->sync_mode == WB_SYNC_NONE && left <= 0) |
2211 | goto out; | 2218 | goto out; |
2212 | 2219 | ||
2213 | /* If we can't merge this page, we are done. */ | 2220 | /* If we can't merge this page, we are done. */ |
2214 | if (mpd->map.m_len > 0 && mpd->next_page != page->index) | 2221 | if (mpd->map.m_len > 0 && mpd->next_page != page->index) |
2215 | goto out; | 2222 | goto out; |
2216 | 2223 | ||
2217 | lock_page(page); | 2224 | lock_page(page); |
2218 | /* | 2225 | /* |
2219 | * If the page is no longer dirty, or its mapping no | 2226 | * If the page is no longer dirty, or its mapping no |
2220 | * longer corresponds to the inode we are writing (which | 2227 | * longer corresponds to the inode we are writing (which |
2221 | * means it has been truncated or invalidated), or the | 2228 | * means it has been truncated or invalidated), or the |
2222 | * page is already under writeback and we are not doing | 2229 | * page is already under writeback and we are not doing |
2223 | * a data integrity writeback, skip the page | 2230 | * a data integrity writeback, skip the page |
2224 | */ | 2231 | */ |
2225 | if (!PageDirty(page) || | 2232 | if (!PageDirty(page) || |
2226 | (PageWriteback(page) && | 2233 | (PageWriteback(page) && |
2227 | (mpd->wbc->sync_mode == WB_SYNC_NONE)) || | 2234 | (mpd->wbc->sync_mode == WB_SYNC_NONE)) || |
2228 | unlikely(page->mapping != mapping)) { | 2235 | unlikely(page->mapping != mapping)) { |
2229 | unlock_page(page); | 2236 | unlock_page(page); |
2230 | continue; | 2237 | continue; |
2231 | } | 2238 | } |
2232 | 2239 | ||
2233 | wait_on_page_writeback(page); | 2240 | wait_on_page_writeback(page); |
2234 | BUG_ON(PageWriteback(page)); | 2241 | BUG_ON(PageWriteback(page)); |
2235 | 2242 | ||
2236 | if (mpd->map.m_len == 0) | 2243 | if (mpd->map.m_len == 0) |
2237 | mpd->first_page = page->index; | 2244 | mpd->first_page = page->index; |
2238 | mpd->next_page = page->index + 1; | 2245 | mpd->next_page = page->index + 1; |
2239 | /* Add all dirty buffers to mpd */ | 2246 | /* Add all dirty buffers to mpd */ |
2240 | lblk = ((ext4_lblk_t)page->index) << | 2247 | lblk = ((ext4_lblk_t)page->index) << |
2241 | (PAGE_CACHE_SHIFT - blkbits); | 2248 | (PAGE_CACHE_SHIFT - blkbits); |
2242 | head = page_buffers(page); | 2249 | head = page_buffers(page); |
2243 | err = mpage_process_page_bufs(mpd, head, head, lblk); | 2250 | err = mpage_process_page_bufs(mpd, head, head, lblk); |
2244 | if (err <= 0) | 2251 | if (err <= 0) |
2245 | goto out; | 2252 | goto out; |
2246 | err = 0; | 2253 | err = 0; |
2247 | left--; | 2254 | left--; |
2248 | } | 2255 | } |
2249 | pagevec_release(&pvec); | 2256 | pagevec_release(&pvec); |
2250 | cond_resched(); | 2257 | cond_resched(); |
2251 | } | 2258 | } |
2252 | return 0; | 2259 | return 0; |
2253 | out: | 2260 | out: |
2254 | pagevec_release(&pvec); | 2261 | pagevec_release(&pvec); |
2255 | return err; | 2262 | return err; |
2256 | } | 2263 | } |
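
The accumulation rule in the loop above is the mpd->next_page != page->index test: pages join the extent only while their indices stay contiguous, and the first gap ends it. A toy model with a made-up list of dirty page indices:

        #include <stdio.h>

        int main(void)
        {
                unsigned long dirty[] = { 7, 8, 9, 12, 13 };  /* tagged page indices */
                unsigned long first_page = 0, next_page = 0;
                unsigned len = 0;

                for (unsigned i = 0; i < sizeof(dirty) / sizeof(dirty[0]); i++) {
                        if (len > 0 && dirty[i] != next_page)
                                break;                  /* can't merge: stop here */
                        if (len == 0)
                                first_page = dirty[i];
                        next_page = dirty[i] + 1;
                        len++;
                }
                printf("extent: pages %lu..%lu\n", first_page, next_page - 1); /* 7..9 */
                return 0;
        }
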
2257 | 2264 | ||
2258 | static int __writepage(struct page *page, struct writeback_control *wbc, | 2265 | static int __writepage(struct page *page, struct writeback_control *wbc, |
2259 | void *data) | 2266 | void *data) |
2260 | { | 2267 | { |
2261 | struct address_space *mapping = data; | 2268 | struct address_space *mapping = data; |
2262 | int ret = ext4_writepage(page, wbc); | 2269 | int ret = ext4_writepage(page, wbc); |
2263 | mapping_set_error(mapping, ret); | 2270 | mapping_set_error(mapping, ret); |
2264 | return ret; | 2271 | return ret; |
2265 | } | 2272 | } |
2266 | 2273 | ||
2267 | static int ext4_writepages(struct address_space *mapping, | 2274 | static int ext4_writepages(struct address_space *mapping, |
2268 | struct writeback_control *wbc) | 2275 | struct writeback_control *wbc) |
2269 | { | 2276 | { |
2270 | pgoff_t writeback_index = 0; | 2277 | pgoff_t writeback_index = 0; |
2271 | long nr_to_write = wbc->nr_to_write; | 2278 | long nr_to_write = wbc->nr_to_write; |
2272 | int range_whole = 0; | 2279 | int range_whole = 0; |
2273 | int cycled = 1; | 2280 | int cycled = 1; |
2274 | handle_t *handle = NULL; | 2281 | handle_t *handle = NULL; |
2275 | struct mpage_da_data mpd; | 2282 | struct mpage_da_data mpd; |
2276 | struct inode *inode = mapping->host; | 2283 | struct inode *inode = mapping->host; |
2277 | int needed_blocks, rsv_blocks = 0, ret = 0; | 2284 | int needed_blocks, rsv_blocks = 0, ret = 0; |
2278 | struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); | 2285 | struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); |
2279 | bool done; | 2286 | bool done; |
2280 | struct blk_plug plug; | 2287 | struct blk_plug plug; |
2281 | bool give_up_on_write = false; | 2288 | bool give_up_on_write = false; |
2282 | 2289 | ||
2283 | trace_ext4_writepages(inode, wbc); | 2290 | trace_ext4_writepages(inode, wbc); |
2284 | 2291 | ||
2285 | /* | 2292 | /* |
2286 | * No pages to write? This is mainly a kludge to avoid starting | 2293 | * No pages to write? This is mainly a kludge to avoid starting |
2287 | * a transaction for special inodes like the journal inode on last iput() | 2294 | * a transaction for special inodes like the journal inode on last iput() |
2288 | * because that could violate lock ordering on umount | 2295 | * because that could violate lock ordering on umount |
2289 | */ | 2296 | */ |
2290 | if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) | 2297 | if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) |
2291 | goto out_writepages; | 2298 | goto out_writepages; |
2292 | 2299 | ||
2293 | if (ext4_should_journal_data(inode)) { | 2300 | if (ext4_should_journal_data(inode)) { |
2294 | struct blk_plug plug; | 2301 | struct blk_plug plug; |
2295 | 2302 | ||
2296 | blk_start_plug(&plug); | 2303 | blk_start_plug(&plug); |
2297 | ret = write_cache_pages(mapping, wbc, __writepage, mapping); | 2304 | ret = write_cache_pages(mapping, wbc, __writepage, mapping); |
2298 | blk_finish_plug(&plug); | 2305 | blk_finish_plug(&plug); |
2299 | goto out_writepages; | 2306 | goto out_writepages; |
2300 | } | 2307 | } |
2301 | 2308 | ||
2302 | /* | 2309 | /* |
2303 | * If the filesystem has aborted, it is read-only, so return | 2310 | * If the filesystem has aborted, it is read-only, so return |
2304 | * right away instead of dumping stack traces later on that | 2311 | * right away instead of dumping stack traces later on that |
2305 | * will obscure the real source of the problem. We test | 2312 | * will obscure the real source of the problem. We test |
2306 | * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because | 2313 | * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because |
2307 | * the latter could be true if the filesystem is mounted | 2314 | * the latter could be true if the filesystem is mounted |
2308 | * read-only, and in that case, ext4_writepages should | 2315 | * read-only, and in that case, ext4_writepages should |
2309 | * *never* be called, so if that ever happens, we would want | 2316 | * *never* be called, so if that ever happens, we would want |
2310 | * the stack trace. | 2317 | * the stack trace. |
2311 | */ | 2318 | */ |
2312 | if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) { | 2319 | if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) { |
2313 | ret = -EROFS; | 2320 | ret = -EROFS; |
2314 | goto out_writepages; | 2321 | goto out_writepages; |
2315 | } | 2322 | } |
2316 | 2323 | ||
2317 | if (ext4_should_dioread_nolock(inode)) { | 2324 | if (ext4_should_dioread_nolock(inode)) { |
2318 | /* | 2325 | /* |
2319 | * We may need to convert up to one extent per block in | 2326 | * We may need to convert up to one extent per block in |
2320 | * the page and we may dirty the inode. | 2327 | * the page and we may dirty the inode. |
2321 | */ | 2328 | */ |
2322 | rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits); | 2329 | rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits); |
2323 | } | 2330 | } |
2324 | 2331 | ||
2325 | /* | 2332 | /* |
2326 | * If we have inline data and arrive here, it means that | 2333 | * If we have inline data and arrive here, it means that |
2327 | * we will soon create the block for the 1st page, so | 2334 | * we will soon create the block for the 1st page, so |
2328 | * we'd better clear the inline data here. | 2335 | * we'd better clear the inline data here. |
2329 | */ | 2336 | */ |
2330 | if (ext4_has_inline_data(inode)) { | 2337 | if (ext4_has_inline_data(inode)) { |
2331 | /* Just inode will be modified... */ | 2338 | /* Just inode will be modified... */ |
2332 | handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); | 2339 | handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); |
2333 | if (IS_ERR(handle)) { | 2340 | if (IS_ERR(handle)) { |
2334 | ret = PTR_ERR(handle); | 2341 | ret = PTR_ERR(handle); |
2335 | goto out_writepages; | 2342 | goto out_writepages; |
2336 | } | 2343 | } |
2337 | BUG_ON(ext4_test_inode_state(inode, | 2344 | BUG_ON(ext4_test_inode_state(inode, |
2338 | EXT4_STATE_MAY_INLINE_DATA)); | 2345 | EXT4_STATE_MAY_INLINE_DATA)); |
2339 | ext4_destroy_inline_data(handle, inode); | 2346 | ext4_destroy_inline_data(handle, inode); |
2340 | ext4_journal_stop(handle); | 2347 | ext4_journal_stop(handle); |
2341 | } | 2348 | } |
2342 | 2349 | ||
2343 | if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) | 2350 | if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) |
2344 | range_whole = 1; | 2351 | range_whole = 1; |
2345 | 2352 | ||
2346 | if (wbc->range_cyclic) { | 2353 | if (wbc->range_cyclic) { |
2347 | writeback_index = mapping->writeback_index; | 2354 | writeback_index = mapping->writeback_index; |
2348 | if (writeback_index) | 2355 | if (writeback_index) |
2349 | cycled = 0; | 2356 | cycled = 0; |
2350 | mpd.first_page = writeback_index; | 2357 | mpd.first_page = writeback_index; |
2351 | mpd.last_page = -1; | 2358 | mpd.last_page = -1; |
2352 | } else { | 2359 | } else { |
2353 | mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT; | 2360 | mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT; |
2354 | mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT; | 2361 | mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT; |
2355 | } | 2362 | } |
2356 | 2363 | ||
2357 | mpd.inode = inode; | 2364 | mpd.inode = inode; |
2358 | mpd.wbc = wbc; | 2365 | mpd.wbc = wbc; |
2359 | ext4_io_submit_init(&mpd.io_submit, wbc); | 2366 | ext4_io_submit_init(&mpd.io_submit, wbc); |
2360 | retry: | 2367 | retry: |
2361 | if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) | 2368 | if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) |
2362 | tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page); | 2369 | tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page); |
2363 | done = false; | 2370 | done = false; |
2364 | blk_start_plug(&plug); | 2371 | blk_start_plug(&plug); |
2365 | while (!done && mpd.first_page <= mpd.last_page) { | 2372 | while (!done && mpd.first_page <= mpd.last_page) { |
2366 | /* For each extent of pages we use new io_end */ | 2373 | /* For each extent of pages we use new io_end */ |
2367 | mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); | 2374 | mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); |
2368 | if (!mpd.io_submit.io_end) { | 2375 | if (!mpd.io_submit.io_end) { |
2369 | ret = -ENOMEM; | 2376 | ret = -ENOMEM; |
2370 | break; | 2377 | break; |
2371 | } | 2378 | } |
2372 | 2379 | ||
2373 | /* | 2380 | /* |
2374 | * We have two constraints: We find one extent to map and we | 2381 | * We have two constraints: We find one extent to map and we |
2375 | * must always write out the whole page (makes a difference when | 2382 | * must always write out the whole page (makes a difference when |
2376 | * blocksize < pagesize) so that we don't block on IO when we | 2383 | * blocksize < pagesize) so that we don't block on IO when we |
2377 | * try to write out the rest of the page. Journalled mode is | 2384 | * try to write out the rest of the page. Journalled mode is |
2378 | * not supported by delalloc. | 2385 | * not supported by delalloc. |
2379 | */ | 2386 | */ |
2380 | BUG_ON(ext4_should_journal_data(inode)); | 2387 | BUG_ON(ext4_should_journal_data(inode)); |
2381 | needed_blocks = ext4_da_writepages_trans_blocks(inode); | 2388 | needed_blocks = ext4_da_writepages_trans_blocks(inode); |
2382 | 2389 | ||
2383 | /* start a new transaction */ | 2390 | /* start a new transaction */ |
2384 | handle = ext4_journal_start_with_reserve(inode, | 2391 | handle = ext4_journal_start_with_reserve(inode, |
2385 | EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks); | 2392 | EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks); |
2386 | if (IS_ERR(handle)) { | 2393 | if (IS_ERR(handle)) { |
2387 | ret = PTR_ERR(handle); | 2394 | ret = PTR_ERR(handle); |
2388 | ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " | 2395 | ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " |
2389 | "%ld pages, ino %lu; err %d", __func__, | 2396 | "%ld pages, ino %lu; err %d", __func__, |
2390 | wbc->nr_to_write, inode->i_ino, ret); | 2397 | wbc->nr_to_write, inode->i_ino, ret); |
2391 | /* Release allocated io_end */ | 2398 | /* Release allocated io_end */ |
2392 | ext4_put_io_end(mpd.io_submit.io_end); | 2399 | ext4_put_io_end(mpd.io_submit.io_end); |
2393 | break; | 2400 | break; |
2394 | } | 2401 | } |
2395 | 2402 | ||
2396 | trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc); | 2403 | trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc); |
2397 | ret = mpage_prepare_extent_to_map(&mpd); | 2404 | ret = mpage_prepare_extent_to_map(&mpd); |
2398 | if (!ret) { | 2405 | if (!ret) { |
2399 | if (mpd.map.m_len) | 2406 | if (mpd.map.m_len) |
2400 | ret = mpage_map_and_submit_extent(handle, &mpd, | 2407 | ret = mpage_map_and_submit_extent(handle, &mpd, |
2401 | &give_up_on_write); | 2408 | &give_up_on_write); |
2402 | else { | 2409 | else { |
2403 | /* | 2410 | /* |
2404 | * We scanned the whole range (or exhausted | 2411 | * We scanned the whole range (or exhausted |
2405 | * nr_to_write), submitted what was mapped and | 2412 | * nr_to_write), submitted what was mapped and |
2406 | * didn't find anything needing mapping. We are | 2413 | * didn't find anything needing mapping. We are |
2407 | * done. | 2414 | * done. |
2408 | */ | 2415 | */ |
2409 | done = true; | 2416 | done = true; |
2410 | } | 2417 | } |
2411 | } | 2418 | } |
2412 | ext4_journal_stop(handle); | 2419 | ext4_journal_stop(handle); |
2413 | /* Submit prepared bio */ | 2420 | /* Submit prepared bio */ |
2414 | ext4_io_submit(&mpd.io_submit); | 2421 | ext4_io_submit(&mpd.io_submit); |
2415 | /* Unlock pages we didn't use */ | 2422 | /* Unlock pages we didn't use */ |
2416 | mpage_release_unused_pages(&mpd, give_up_on_write); | 2423 | mpage_release_unused_pages(&mpd, give_up_on_write); |
2417 | /* Drop our io_end reference we got from init */ | 2424 | /* Drop our io_end reference we got from init */ |
2418 | ext4_put_io_end(mpd.io_submit.io_end); | 2425 | ext4_put_io_end(mpd.io_submit.io_end); |
2419 | 2426 | ||
2420 | if (ret == -ENOSPC && sbi->s_journal) { | 2427 | if (ret == -ENOSPC && sbi->s_journal) { |
2421 | /* | 2428 | /* |
2422 | * Commit the transaction which would | 2429 | * Commit the transaction which would |
2423 | * free blocks released in the transaction | 2430 | * free blocks released in the transaction |
2424 | * and try again | 2431 | * and try again |
2425 | */ | 2432 | */ |
2426 | jbd2_journal_force_commit_nested(sbi->s_journal); | 2433 | jbd2_journal_force_commit_nested(sbi->s_journal); |
2427 | ret = 0; | 2434 | ret = 0; |
2428 | continue; | 2435 | continue; |
2429 | } | 2436 | } |
2430 | /* Fatal error - ENOMEM, EIO... */ | 2437 | /* Fatal error - ENOMEM, EIO... */ |
2431 | if (ret) | 2438 | if (ret) |
2432 | break; | 2439 | break; |
2433 | } | 2440 | } |
2434 | blk_finish_plug(&plug); | 2441 | blk_finish_plug(&plug); |
2435 | if (!ret && !cycled && wbc->nr_to_write > 0) { | 2442 | if (!ret && !cycled && wbc->nr_to_write > 0) { |
2436 | cycled = 1; | 2443 | cycled = 1; |
2437 | mpd.last_page = writeback_index - 1; | 2444 | mpd.last_page = writeback_index - 1; |
2438 | mpd.first_page = 0; | 2445 | mpd.first_page = 0; |
2439 | goto retry; | 2446 | goto retry; |
2440 | } | 2447 | } |
2441 | 2448 | ||
2442 | /* Update index */ | 2449 | /* Update index */ |
2443 | if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) | 2450 | if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) |
2444 | /* | 2451 | /* |
2445 | * Set the writeback_index so that range_cyclic | 2452 | * Set the writeback_index so that range_cyclic |
2446 | * mode will write it back later | 2453 | * mode will write it back later |
2447 | */ | 2454 | */ |
2448 | mapping->writeback_index = mpd.first_page; | 2455 | mapping->writeback_index = mpd.first_page; |
2449 | 2456 | ||
2450 | out_writepages: | 2457 | out_writepages: |
2451 | trace_ext4_writepages_result(inode, wbc, ret, | 2458 | trace_ext4_writepages_result(inode, wbc, ret, |
2452 | nr_to_write - wbc->nr_to_write); | 2459 | nr_to_write - wbc->nr_to_write); |
2453 | return ret; | 2460 | return ret; |
2454 | } | 2461 | } |
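
The retry label and the cycled flag above implement the usual two-pass range_cyclic scan: start at mapping->writeback_index, run to EOF, then wrap once to cover the pages before the starting point. A compact sketch of that wraparound, with scan_range() standing in for one pass of the writeback loop:

        #include <stdbool.h>
        #include <stdio.h>

        static void scan_range(unsigned long first, unsigned long last)
        {
                printf("writing back pages %lu..%lu\n", first, last);
        }

        int main(void)
        {
                unsigned long writeback_index = 100;  /* where the last call stopped */
                unsigned long last_page = ~0UL;       /* "up to EOF" */
                bool cycled = (writeback_index == 0);

                scan_range(writeback_index, last_page);
                if (!cycled) {                        /* wrap around once */
                        cycled = true;
                        scan_range(0, writeback_index - 1);
                }
                return 0;
        }
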
2455 | 2462 | ||
2456 | static int ext4_nonda_switch(struct super_block *sb) | 2463 | static int ext4_nonda_switch(struct super_block *sb) |
2457 | { | 2464 | { |
2458 | s64 free_clusters, dirty_clusters; | 2465 | s64 free_clusters, dirty_clusters; |
2459 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 2466 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
2460 | 2467 | ||
2461 | /* | 2468 | /* |
2462 | * switch to non delalloc mode if we are running low | 2469 | * switch to non delalloc mode if we are running low |
2463 | * on free blocks. The free block accounting via percpu | 2470 | * on free blocks. The free block accounting via percpu |
2464 | * counters can get slightly wrong with percpu_counter_batch getting | 2471 | * counters can get slightly wrong with percpu_counter_batch getting |
2465 | * accumulated on each CPU without updating global counters. | 2472 | * accumulated on each CPU without updating global counters. |
2466 | * Delalloc needs accurate free block accounting, so switch | 2473 | * Delalloc needs accurate free block accounting, so switch |
2467 | * to non delalloc when we are near the error range. | 2474 | * to non delalloc when we are near the error range. |
2468 | */ | 2475 | */ |
2469 | free_clusters = | 2476 | free_clusters = |
2470 | percpu_counter_read_positive(&sbi->s_freeclusters_counter); | 2477 | percpu_counter_read_positive(&sbi->s_freeclusters_counter); |
2471 | dirty_clusters = | 2478 | dirty_clusters = |
2472 | percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); | 2479 | percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); |
2473 | /* | 2480 | /* |
2474 | * Start pushing delalloc when 1/2 of free blocks are dirty. | 2481 | * Start pushing delalloc when 1/2 of free blocks are dirty. |
2475 | */ | 2482 | */ |
2476 | if (dirty_clusters && (free_clusters < 2 * dirty_clusters)) | 2483 | if (dirty_clusters && (free_clusters < 2 * dirty_clusters)) |
2477 | try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); | 2484 | try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); |
2478 | 2485 | ||
2479 | if (2 * free_clusters < 3 * dirty_clusters || | 2486 | if (2 * free_clusters < 3 * dirty_clusters || |
2480 | free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) { | 2487 | free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) { |
2481 | /* | 2488 | /* |
2482 | * free block count is less than 150% of dirty blocks | 2489 | * free block count is less than 150% of dirty blocks |
2483 | * or free blocks are less than the watermark | 2490 | * or free blocks are less than the watermark |
2484 | */ | 2491 | */ |
2485 | return 1; | 2492 | return 1; |
2486 | } | 2493 | } |
2487 | return 0; | 2494 | return 0; |
2488 | } | 2495 | } |
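
With the numbers below, 2 * free = 5600 falls under 3 * dirty = 6000, so the first condition alone (free below 150% of dirty) forces the fallback. A worked example of both cutoffs; the watermark is assumed to be 1024 clusters for the demo, the real constant being EXT4_FREECLUSTERS_WATERMARK:

        #include <stdio.h>

        int main(void)
        {
                long long free_clusters = 2800, dirty_clusters = 2000;
                long long watermark = 1024;             /* assumed value */

                /* 2 * 2800 = 5600 < 3 * 2000 = 6000: free < 150% of dirty */
                if (2 * free_clusters < 3 * dirty_clusters ||
                    free_clusters < dirty_clusters + watermark)
                        printf("switch to non-delalloc writeback\n");
                else
                        printf("stay in delalloc mode\n");
                return 0;
        }
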
2489 | 2496 | ||
2490 | /* We always reserve for an inode update; the superblock could be there too */ | 2497 | /* We always reserve for an inode update; the superblock could be there too */ |
2491 | static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len) | 2498 | static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len) |
2492 | { | 2499 | { |
2493 | if (likely(EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, | 2500 | if (likely(EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, |
2494 | EXT4_FEATURE_RO_COMPAT_LARGE_FILE))) | 2501 | EXT4_FEATURE_RO_COMPAT_LARGE_FILE))) |
2495 | return 1; | 2502 | return 1; |
2496 | 2503 | ||
2497 | if (pos + len <= 0x7fffffffULL) | 2504 | if (pos + len <= 0x7fffffffULL) |
2498 | return 1; | 2505 | return 1; |
2499 | 2506 | ||
2500 | /* We might need to update the superblock to set LARGE_FILE */ | 2507 | /* We might need to update the superblock to set LARGE_FILE */ |
2501 | return 2; | 2508 | return 2; |
2502 | } | 2509 | } |
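
The credit count above is one journal block for the inode in the common case, and two when the write may push the file past 2^31 - 1 bytes on a filesystem that has not yet set LARGE_FILE, since the superblock may need the feature bit as well. A standalone sketch of that decision:

        #include <stdbool.h>
        #include <stdio.h>

        static int da_write_credits(bool has_large_file, long long pos, unsigned len)
        {
                if (has_large_file)
                        return 1;
                if (pos + len <= 0x7fffffffLL)
                        return 1;
                return 2;       /* superblock may gain the LARGE_FILE bit */
        }

        int main(void)
        {
                printf("%d\n", da_write_credits(false, 0x7ffffff0LL, 64));  /* 2 */
                printf("%d\n", da_write_credits(true, 0x7ffffff0LL, 64));   /* 1 */
                return 0;
        }
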
2503 | 2510 | ||
2504 | static int ext4_da_write_begin(struct file *file, struct address_space *mapping, | 2511 | static int ext4_da_write_begin(struct file *file, struct address_space *mapping, |
2505 | loff_t pos, unsigned len, unsigned flags, | 2512 | loff_t pos, unsigned len, unsigned flags, |
2506 | struct page **pagep, void **fsdata) | 2513 | struct page **pagep, void **fsdata) |
2507 | { | 2514 | { |
2508 | int ret, retries = 0; | 2515 | int ret, retries = 0; |
2509 | struct page *page; | 2516 | struct page *page; |
2510 | pgoff_t index; | 2517 | pgoff_t index; |
2511 | struct inode *inode = mapping->host; | 2518 | struct inode *inode = mapping->host; |
2512 | handle_t *handle; | 2519 | handle_t *handle; |
2513 | 2520 | ||
2514 | index = pos >> PAGE_CACHE_SHIFT; | 2521 | index = pos >> PAGE_CACHE_SHIFT; |
2515 | 2522 | ||
2516 | if (ext4_nonda_switch(inode->i_sb)) { | 2523 | if (ext4_nonda_switch(inode->i_sb)) { |
2517 | *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; | 2524 | *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; |
2518 | return ext4_write_begin(file, mapping, pos, | 2525 | return ext4_write_begin(file, mapping, pos, |
2519 | len, flags, pagep, fsdata); | 2526 | len, flags, pagep, fsdata); |
2520 | } | 2527 | } |
2521 | *fsdata = (void *)0; | 2528 | *fsdata = (void *)0; |
2522 | trace_ext4_da_write_begin(inode, pos, len, flags); | 2529 | trace_ext4_da_write_begin(inode, pos, len, flags); |
2523 | 2530 | ||
2524 | if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { | 2531 | if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { |
2525 | ret = ext4_da_write_inline_data_begin(mapping, inode, | 2532 | ret = ext4_da_write_inline_data_begin(mapping, inode, |
2526 | pos, len, flags, | 2533 | pos, len, flags, |
2527 | pagep, fsdata); | 2534 | pagep, fsdata); |
2528 | if (ret < 0) | 2535 | if (ret < 0) |
2529 | return ret; | 2536 | return ret; |
2530 | if (ret == 1) | 2537 | if (ret == 1) |
2531 | return 0; | 2538 | return 0; |
2532 | } | 2539 | } |
2533 | 2540 | ||
2534 | /* | 2541 | /* |
2535 | * grab_cache_page_write_begin() can take a long time if the | 2542 | * grab_cache_page_write_begin() can take a long time if the |
2536 | * system is thrashing due to memory pressure, or if the page | 2543 | * system is thrashing due to memory pressure, or if the page |
2537 | * is being written back. So grab it first before we start | 2544 | * is being written back. So grab it first before we start |
2538 | * the transaction handle. This also allows us to allocate | 2545 | * the transaction handle. This also allows us to allocate |
2539 | * the page (if needed) without using GFP_NOFS. | 2546 | * the page (if needed) without using GFP_NOFS. |
2540 | */ | 2547 | */ |
2541 | retry_grab: | 2548 | retry_grab: |
2542 | page = grab_cache_page_write_begin(mapping, index, flags); | 2549 | page = grab_cache_page_write_begin(mapping, index, flags); |
2543 | if (!page) | 2550 | if (!page) |
2544 | return -ENOMEM; | 2551 | return -ENOMEM; |
2545 | unlock_page(page); | 2552 | unlock_page(page); |
2546 | 2553 | ||
2547 | /* | 2554 | /* |
2548 | * With delayed allocation, we don't log the i_disksize update | 2555 | * With delayed allocation, we don't log the i_disksize update |
2549 | * if there is delayed block allocation. But we still need | 2556 | * if there is delayed block allocation. But we still need |
2550 | * to journal the i_disksize update if a write to the end | 2557 | * to journal the i_disksize update if a write to the end |
2551 | * of the file hits an already mapped buffer. | 2558 | * of the file hits an already mapped buffer. |
2552 | */ | 2559 | */ |
2553 | retry_journal: | 2560 | retry_journal: |
2554 | handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, | 2561 | handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, |
2555 | ext4_da_write_credits(inode, pos, len)); | 2562 | ext4_da_write_credits(inode, pos, len)); |
2556 | if (IS_ERR(handle)) { | 2563 | if (IS_ERR(handle)) { |
2557 | page_cache_release(page); | 2564 | page_cache_release(page); |
2558 | return PTR_ERR(handle); | 2565 | return PTR_ERR(handle); |
2559 | } | 2566 | } |
2560 | 2567 | ||
2561 | lock_page(page); | 2568 | lock_page(page); |
2562 | if (page->mapping != mapping) { | 2569 | if (page->mapping != mapping) { |
2563 | /* The page got truncated from under us */ | 2570 | /* The page got truncated from under us */ |
2564 | unlock_page(page); | 2571 | unlock_page(page); |
2565 | page_cache_release(page); | 2572 | page_cache_release(page); |
2566 | ext4_journal_stop(handle); | 2573 | ext4_journal_stop(handle); |
2567 | goto retry_grab; | 2574 | goto retry_grab; |
2568 | } | 2575 | } |
2569 | /* In case writeback began while the page was unlocked */ | 2576 | /* In case writeback began while the page was unlocked */ |
2570 | wait_for_stable_page(page); | 2577 | wait_for_stable_page(page); |
2571 | 2578 | ||
2572 | ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep); | 2579 | ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep); |
2573 | if (ret < 0) { | 2580 | if (ret < 0) { |
2574 | unlock_page(page); | 2581 | unlock_page(page); |
2575 | ext4_journal_stop(handle); | 2582 | ext4_journal_stop(handle); |
2576 | /* | 2583 | /* |
2577 | * block_write_begin may have instantiated a few blocks | 2584 | * block_write_begin may have instantiated a few blocks |
2578 | * outside i_size. Trim these off again. Don't need | 2585 | * outside i_size. Trim these off again. Don't need |
2579 | * i_size_read because we hold i_mutex. | 2586 | * i_size_read because we hold i_mutex. |
2580 | */ | 2587 | */ |
2581 | if (pos + len > inode->i_size) | 2588 | if (pos + len > inode->i_size) |
2582 | ext4_truncate_failed_write(inode); | 2589 | ext4_truncate_failed_write(inode); |
2583 | 2590 | ||
2584 | if (ret == -ENOSPC && | 2591 | if (ret == -ENOSPC && |
2585 | ext4_should_retry_alloc(inode->i_sb, &retries)) | 2592 | ext4_should_retry_alloc(inode->i_sb, &retries)) |
2586 | goto retry_journal; | 2593 | goto retry_journal; |
2587 | 2594 | ||
2588 | page_cache_release(page); | 2595 | page_cache_release(page); |
2589 | return ret; | 2596 | return ret; |
2590 | } | 2597 | } |
2591 | 2598 | ||
2592 | *pagep = page; | 2599 | *pagep = page; |
2593 | return ret; | 2600 | return ret; |
2594 | } | 2601 | } |
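
The retry_grab/retry_journal structure above is a lock-ordering pattern: grab the page (which can block) before starting the handle, drop its lock across the journal start, then relock and revalidate page->mapping, retrying from scratch if truncate won the race. A toy model of that dance, with stub structs in place of real pages and handles:

        #include <stdbool.h>
        #include <stdio.h>

        struct page { const void *mapping; };

        static bool page_truncated;             /* simulated racing truncate */

        static void lock_page(struct page *p)   { (void)p; }
        static void unlock_page(struct page *p) { (void)p; }

        static int write_begin(struct page *p, const void *mapping)
        {
        retry_grab:
                lock_page(p);
                unlock_page(p);         /* don't hold it over journal start */

                /* a transaction handle would be started here (may block) */

                lock_page(p);
                if (page_truncated)
                        p->mapping = NULL;      /* inject the race once */
                if (p->mapping != mapping) {    /* truncated from under us */
                        unlock_page(p);
                        page_truncated = false;
                        p->mapping = mapping;   /* page re-created on retry */
                        goto retry_grab;
                }
                return 0;
        }

        int main(void)
        {
                static const int inode_mapping;
                struct page p = { &inode_mapping };

                page_truncated = true;
                printf("write_begin -> %d\n", write_begin(&p, &inode_mapping));
                return 0;
        }
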
2595 | 2602 | ||
2596 | /* | 2603 | /* |
2597 | * Check if we should update i_disksize | 2604 | * Check if we should update i_disksize |
2598 | * when a write to the end of the file does not require block allocation | 2605 | * when a write to the end of the file does not require block allocation |
2599 | */ | 2606 | */ |
2600 | static int ext4_da_should_update_i_disksize(struct page *page, | 2607 | static int ext4_da_should_update_i_disksize(struct page *page, |
2601 | unsigned long offset) | 2608 | unsigned long offset) |
2602 | { | 2609 | { |
2603 | struct buffer_head *bh; | 2610 | struct buffer_head *bh; |
2604 | struct inode *inode = page->mapping->host; | 2611 | struct inode *inode = page->mapping->host; |
2605 | unsigned int idx; | 2612 | unsigned int idx; |
2606 | int i; | 2613 | int i; |
2607 | 2614 | ||
2608 | bh = page_buffers(page); | 2615 | bh = page_buffers(page); |
2609 | idx = offset >> inode->i_blkbits; | 2616 | idx = offset >> inode->i_blkbits; |
2610 | 2617 | ||
2611 | for (i = 0; i < idx; i++) | 2618 | for (i = 0; i < idx; i++) |
2612 | bh = bh->b_this_page; | 2619 | bh = bh->b_this_page; |
2613 | 2620 | ||
2614 | if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh)) | 2621 | if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh)) |
2615 | return 0; | 2622 | return 0; |
2616 | return 1; | 2623 | return 1; |
2617 | } | 2624 | } |
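
The helper walks the page's buffer list to the buffer covering the given offset and answers whether it is mapped and neither delayed nor unwritten, i.e. whether i_disksize can be bumped without block allocation. A toy version with a plain linked list and 1 KiB buffers in a 4 KiB page (buffer states invented for the demo):

        #include <stdbool.h>
        #include <stdio.h>

        struct buf {
                bool mapped, delay, unwritten;
                struct buf *next;
        };

        static bool should_update_disksize(struct buf *head, unsigned offset,
                                           unsigned blkbits)
        {
                unsigned idx = offset >> blkbits;   /* buffer covering offset */
                struct buf *bh = head;

                while (idx--)
                        bh = bh->next;
                return bh->mapped && !bh->delay && !bh->unwritten;
        }

        int main(void)
        {
                /* 4 buffers of 1 KiB in a 4 KiB page; only the first is mapped */
                struct buf b3 = { false, true,  false, NULL };
                struct buf b2 = { false, true,  false, &b3 };
                struct buf b1 = { false, false, true,  &b2 };
                struct buf b0 = { true,  false, false, &b1 };

                printf("%d\n", should_update_disksize(&b0, 100,  10));  /* 1 */
                printf("%d\n", should_update_disksize(&b0, 2100, 10));  /* 0 */
                return 0;
        }
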
2618 | 2625 | ||
2619 | static int ext4_da_write_end(struct file *file, | 2626 | static int ext4_da_write_end(struct file *file, |
2620 | struct address_space *mapping, | 2627 | struct address_space *mapping, |
2621 | loff_t pos, unsigned len, unsigned copied, | 2628 | loff_t pos, unsigned len, unsigned copied, |
2622 | struct page *page, void *fsdata) | 2629 | struct page *page, void *fsdata) |
2623 | { | 2630 | { |
2624 | struct inode *inode = mapping->host; | 2631 | struct inode *inode = mapping->host; |
2625 | int ret = 0, ret2; | 2632 | int ret = 0, ret2; |
2626 | handle_t *handle = ext4_journal_current_handle(); | 2633 | handle_t *handle = ext4_journal_current_handle(); |
2627 | loff_t new_i_size; | 2634 | loff_t new_i_size; |
2628 | unsigned long start, end; | 2635 | unsigned long start, end; |
2629 | int write_mode = (int)(unsigned long)fsdata; | 2636 | int write_mode = (int)(unsigned long)fsdata; |
2630 | 2637 | ||
2631 | if (write_mode == FALL_BACK_TO_NONDELALLOC) | 2638 | if (write_mode == FALL_BACK_TO_NONDELALLOC) |
2632 | return ext4_write_end(file, mapping, pos, | 2639 | return ext4_write_end(file, mapping, pos, |
2633 | len, copied, page, fsdata); | 2640 | len, copied, page, fsdata); |
2634 | 2641 | ||
2635 | trace_ext4_da_write_end(inode, pos, len, copied); | 2642 | trace_ext4_da_write_end(inode, pos, len, copied); |
2636 | start = pos & (PAGE_CACHE_SIZE - 1); | 2643 | start = pos & (PAGE_CACHE_SIZE - 1); |
2637 | end = start + copied - 1; | 2644 | end = start + copied - 1; |
2638 | 2645 | ||
2639 | /* | 2646 | /* |
2640 | * generic_write_end() will run mark_inode_dirty() if i_size | 2647 | * generic_write_end() will run mark_inode_dirty() if i_size |
2641 | * changes. So let's piggyback the i_disksize mark_inode_dirty | 2648 | * changes. So let's piggyback the i_disksize mark_inode_dirty |
2642 | * into that. | 2649 | * into that. |
2643 | */ | 2650 | */ |
2644 | new_i_size = pos + copied; | 2651 | new_i_size = pos + copied; |
2645 | if (copied && new_i_size > EXT4_I(inode)->i_disksize) { | 2652 | if (copied && new_i_size > EXT4_I(inode)->i_disksize) { |
2646 | if (ext4_has_inline_data(inode) || | 2653 | if (ext4_has_inline_data(inode) || |
2647 | ext4_da_should_update_i_disksize(page, end)) { | 2654 | ext4_da_should_update_i_disksize(page, end)) { |
2648 | ext4_update_i_disksize(inode, new_i_size); | 2655 | ext4_update_i_disksize(inode, new_i_size); |
2649 | /* We need to mark inode dirty even if | 2656 | /* We need to mark inode dirty even if |
2650 | * new_i_size is less than inode->i_size | 2657 | * new_i_size is less than inode->i_size |
2651 | * but greater than i_disksize (hint: delalloc) | 2658 | * but greater than i_disksize (hint: delalloc) |
2652 | */ | 2659 | */ |
2653 | ext4_mark_inode_dirty(handle, inode); | 2660 | ext4_mark_inode_dirty(handle, inode); |
2654 | } | 2661 | } |
2655 | } | 2662 | } |
2656 | 2663 | ||
2657 | if (write_mode != CONVERT_INLINE_DATA && | 2664 | if (write_mode != CONVERT_INLINE_DATA && |
2658 | ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && | 2665 | ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && |
2659 | ext4_has_inline_data(inode)) | 2666 | ext4_has_inline_data(inode)) |
2660 | ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied, | 2667 | ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied, |
2661 | page); | 2668 | page); |
2662 | else | 2669 | else |
2663 | ret2 = generic_write_end(file, mapping, pos, len, copied, | 2670 | ret2 = generic_write_end(file, mapping, pos, len, copied, |
2664 | page, fsdata); | 2671 | page, fsdata); |
2665 | 2672 | ||
2666 | copied = ret2; | 2673 | copied = ret2; |
2667 | if (ret2 < 0) | 2674 | if (ret2 < 0) |
2668 | ret = ret2; | 2675 | ret = ret2; |
2669 | ret2 = ext4_journal_stop(handle); | 2676 | ret2 = ext4_journal_stop(handle); |
2670 | if (!ret) | 2677 | if (!ret) |
2671 | ret = ret2; | 2678 | ret = ret2; |
2672 | 2679 | ||
2673 | return ret ? ret : copied; | 2680 | return ret ? ret : copied; |
2674 | } | 2681 | } |
2675 | 2682 | ||
2676 | static void ext4_da_invalidatepage(struct page *page, unsigned int offset, | 2683 | static void ext4_da_invalidatepage(struct page *page, unsigned int offset, |
2677 | unsigned int length) | 2684 | unsigned int length) |
2678 | { | 2685 | { |
2679 | /* | 2686 | /* |
2680 | * Drop reserved blocks | 2687 | * Drop reserved blocks |
2681 | */ | 2688 | */ |
2682 | BUG_ON(!PageLocked(page)); | 2689 | BUG_ON(!PageLocked(page)); |
2683 | if (!page_has_buffers(page)) | 2690 | if (!page_has_buffers(page)) |
2684 | goto out; | 2691 | goto out; |
2685 | 2692 | ||
2686 | ext4_da_page_release_reservation(page, offset, length); | 2693 | ext4_da_page_release_reservation(page, offset, length); |
2687 | 2694 | ||
2688 | out: | 2695 | out: |
2689 | ext4_invalidatepage(page, offset, length); | 2696 | ext4_invalidatepage(page, offset, length); |
2690 | 2697 | ||
2691 | return; | 2698 | return; |
2692 | } | 2699 | } |
2693 | 2700 | ||
2694 | /* | 2701 | /* |
2695 | * Force all delayed allocation blocks to be allocated for a given inode. | 2702 | * Force all delayed allocation blocks to be allocated for a given inode. |
2696 | */ | 2703 | */ |
2697 | int ext4_alloc_da_blocks(struct inode *inode) | 2704 | int ext4_alloc_da_blocks(struct inode *inode) |
2698 | { | 2705 | { |
2699 | trace_ext4_alloc_da_blocks(inode); | 2706 | trace_ext4_alloc_da_blocks(inode); |
2700 | 2707 | ||
2701 | if (!EXT4_I(inode)->i_reserved_data_blocks) | 2708 | if (!EXT4_I(inode)->i_reserved_data_blocks) |
2702 | return 0; | 2709 | return 0; |
2703 | 2710 | ||
2704 | /* | 2711 | /* |
2705 | * We do something simple for now. The filemap_flush() will | 2712 | * We do something simple for now. The filemap_flush() will |
2706 | * also start triggering a write of the data blocks, which is | 2713 | * also start triggering a write of the data blocks, which is |
2707 | * not strictly speaking necessary (and for users of | 2714 | * not strictly speaking necessary (and for users of |
2708 | * laptop_mode, not even desirable). However, to do otherwise | 2715 | * laptop_mode, not even desirable). However, to do otherwise |
2709 | * would require replicating code paths in: | 2716 | * would require replicating code paths in: |
2710 | * | 2717 | * |
2711 | * ext4_writepages() -> | 2718 | * ext4_writepages() -> |
2712 | * write_cache_pages() ---> (via passed in callback function) | 2719 | * write_cache_pages() ---> (via passed in callback function) |
2713 | * __mpage_da_writepage() --> | 2720 | * __mpage_da_writepage() --> |
2714 | * mpage_add_bh_to_extent() | 2721 | * mpage_add_bh_to_extent() |
2715 | * mpage_da_map_blocks() | 2722 | * mpage_da_map_blocks() |
2716 | * | 2723 | * |
2717 | * The problem is that write_cache_pages(), located in | 2724 | * The problem is that write_cache_pages(), located in |
2718 | * mm/page-writeback.c, marks pages clean in preparation for | 2725 | * mm/page-writeback.c, marks pages clean in preparation for |
2719 | * doing I/O, which is not desirable if we're not planning on | 2726 | * doing I/O, which is not desirable if we're not planning on |
2720 | * doing I/O at all. | 2727 | * doing I/O at all. |
2721 | * | 2728 | * |
2722 | * We could call write_cache_pages(), and then redirty all of | 2729 | * We could call write_cache_pages(), and then redirty all of |
2723 | * the pages by calling redirty_page_for_writepage() but that | 2730 | * the pages by calling redirty_page_for_writepage() but that |
2724 | * would be ugly in the extreme. So instead we would need to | 2731 | * would be ugly in the extreme. So instead we would need to |
2725 | * replicate parts of the code in the above functions, | 2732 | * replicate parts of the code in the above functions, |
2726 | * simplifying them because we wouldn't actually intend to | 2733 | * simplifying them because we wouldn't actually intend to |
2727 | * write out the pages, but rather only collect contiguous | 2734 | * write out the pages, but rather only collect contiguous |
2728 | * logical block extents, call the multi-block allocator, and | 2735 | * logical block extents, call the multi-block allocator, and |
2729 | * then update the buffer heads with the block allocations. | 2736 | * then update the buffer heads with the block allocations. |
2730 | * | 2737 | * |
2731 | * For now, though, we'll cheat by calling filemap_flush(), | 2738 | * For now, though, we'll cheat by calling filemap_flush(), |
2732 | * which will map the blocks, and start the I/O, but not | 2739 | * which will map the blocks, and start the I/O, but not |
2733 | * actually wait for the I/O to complete. | 2740 | * actually wait for the I/O to complete. |
2734 | */ | 2741 | */ |
2735 | return filemap_flush(inode->i_mapping); | 2742 | return filemap_flush(inode->i_mapping); |
2736 | } | 2743 | } |
2737 | 2744 | ||
2738 | /* | 2745 | /* |
2739 | * bmap() is special. It gets used by applications such as lilo and by | 2746 | * bmap() is special. It gets used by applications such as lilo and by |
2740 | * the swapper to find the on-disk block of a specific piece of data. | 2747 | * the swapper to find the on-disk block of a specific piece of data. |
2741 | * | 2748 | * |
2742 | * Naturally, this is dangerous if the block concerned is still in the | 2749 | * Naturally, this is dangerous if the block concerned is still in the |
2743 | * journal. If somebody makes a swapfile on an ext4 data-journaling | 2750 | * journal. If somebody makes a swapfile on an ext4 data-journaling |
2744 | * filesystem and enables swap, then they may get a nasty shock when the | 2751 | * filesystem and enables swap, then they may get a nasty shock when the |
2745 | * data getting swapped to that swapfile suddenly gets overwritten by | 2752 | * data getting swapped to that swapfile suddenly gets overwritten by |
2746 | * the original zeros written out previously to the journal and | 2753 | * the original zeros written out previously to the journal and |
2747 | * awaiting writeback in the kernel's buffer cache. | 2754 | * awaiting writeback in the kernel's buffer cache. |
2748 | * | 2755 | * |
2749 | * So, if we see any bmap calls here on a modified, data-journaled file, | 2756 | * So, if we see any bmap calls here on a modified, data-journaled file, |
2750 | * take extra steps to flush any blocks which might be in the cache. | 2757 | * take extra steps to flush any blocks which might be in the cache. |
2751 | */ | 2758 | */ |
2752 | static sector_t ext4_bmap(struct address_space *mapping, sector_t block) | 2759 | static sector_t ext4_bmap(struct address_space *mapping, sector_t block) |
2753 | { | 2760 | { |
2754 | struct inode *inode = mapping->host; | 2761 | struct inode *inode = mapping->host; |
2755 | journal_t *journal; | 2762 | journal_t *journal; |
2756 | int err; | 2763 | int err; |
2757 | 2764 | ||
2758 | /* | 2765 | /* |
2759 | * We can get here for an inline file via the FIBMAP ioctl | 2766 | * We can get here for an inline file via the FIBMAP ioctl |
2760 | */ | 2767 | */ |
2761 | if (ext4_has_inline_data(inode)) | 2768 | if (ext4_has_inline_data(inode)) |
2762 | return 0; | 2769 | return 0; |
2763 | 2770 | ||
2764 | if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && | 2771 | if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && |
2765 | test_opt(inode->i_sb, DELALLOC)) { | 2772 | test_opt(inode->i_sb, DELALLOC)) { |
2766 | /* | 2773 | /* |
2767 | * With delalloc we want to sync the file | 2774 | * With delalloc we want to sync the file |
2768 | * so that we can make sure we allocate | 2775 | * so that we can make sure we allocate |
2769 | * blocks for the file | 2776 | * blocks for the file |
2770 | */ | 2777 | */ |
2771 | filemap_write_and_wait(mapping); | 2778 | filemap_write_and_wait(mapping); |
2772 | } | 2779 | } |
2773 | 2780 | ||
2774 | if (EXT4_JOURNAL(inode) && | 2781 | if (EXT4_JOURNAL(inode) && |
2775 | ext4_test_inode_state(inode, EXT4_STATE_JDATA)) { | 2782 | ext4_test_inode_state(inode, EXT4_STATE_JDATA)) { |
2776 | /* | 2783 | /* |
2777 | * This is a REALLY heavyweight approach, but the use of | 2784 | * This is a REALLY heavyweight approach, but the use of |
2778 | * bmap on dirty files is expected to be extremely rare: | 2785 | * bmap on dirty files is expected to be extremely rare: |
2779 | * only if we run lilo or swapon on a freshly made file | 2786 | * only if we run lilo or swapon on a freshly made file |
2780 | * do we expect this to happen. | 2787 | * do we expect this to happen. |
2781 | * | 2788 | * |
2782 | * (bmap requires CAP_SYS_RAWIO so this does not | 2789 | * (bmap requires CAP_SYS_RAWIO so this does not |
2783 | * represent an unprivileged user DOS attack --- we'd be | 2790 | * represent an unprivileged user DOS attack --- we'd be |
2784 | * in trouble if mortal users could trigger this path at | 2791 | * in trouble if mortal users could trigger this path at |
2785 | * will.) | 2792 | * will.) |
2786 | * | 2793 | * |
2787 | * NB. EXT4_STATE_JDATA is not set on files other than | 2794 | * NB. EXT4_STATE_JDATA is not set on files other than |
2788 | * regular files. If somebody wants to bmap a directory | 2795 | * regular files. If somebody wants to bmap a directory |
2789 | * or symlink and gets confused because the buffer | 2796 | * or symlink and gets confused because the buffer |
2790 | * hasn't yet been flushed to disk, they deserve | 2797 | * hasn't yet been flushed to disk, they deserve |
2791 | * everything they get. | 2798 | * everything they get. |
2792 | */ | 2799 | */ |
2793 | 2800 | ||
2794 | ext4_clear_inode_state(inode, EXT4_STATE_JDATA); | 2801 | ext4_clear_inode_state(inode, EXT4_STATE_JDATA); |
2795 | journal = EXT4_JOURNAL(inode); | 2802 | journal = EXT4_JOURNAL(inode); |
2796 | jbd2_journal_lock_updates(journal); | 2803 | jbd2_journal_lock_updates(journal); |
2797 | err = jbd2_journal_flush(journal); | 2804 | err = jbd2_journal_flush(journal); |
2798 | jbd2_journal_unlock_updates(journal); | 2805 | jbd2_journal_unlock_updates(journal); |
2799 | 2806 | ||
2800 | if (err) | 2807 | if (err) |
2801 | return 0; | 2808 | return 0; |
2802 | } | 2809 | } |
2803 | 2810 | ||
2804 | return generic_block_bmap(mapping, block, ext4_get_block); | 2811 | return generic_block_bmap(mapping, block, ext4_get_block); |
2805 | } | 2812 | } |
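
From userspace, this path is reached through the FIBMAP ioctl that the comment's lilo/swapon examples rely on. A hedged sketch of querying the physical block behind logical block 0 of a file; it needs CAP_SYS_RAWIO, prints 0 for unmapped or inline-data blocks, and keeps error handling minimal:

        #include <fcntl.h>
        #include <linux/fs.h>           /* FIBMAP */
        #include <stdio.h>
        #include <sys/ioctl.h>
        #include <unistd.h>

        int main(int argc, char **argv)
        {
                int fd, block = 0;      /* logical block 0; FIBMAP uses an int */

                if (argc != 2) {
                        fprintf(stderr, "usage: %s <file>\n", argv[0]);
                        return 1;
                }
                fd = open(argv[1], O_RDONLY);
                if (fd < 0 || ioctl(fd, FIBMAP, &block) < 0) {
                        perror("FIBMAP");
                        return 1;
                }
                printf("logical block 0 -> physical block %d\n", block);
                close(fd);
                return 0;
        }
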
2806 | 2813 | ||
2807 | static int ext4_readpage(struct file *file, struct page *page) | 2814 | static int ext4_readpage(struct file *file, struct page *page) |
2808 | { | 2815 | { |
2809 | int ret = -EAGAIN; | 2816 | int ret = -EAGAIN; |
2810 | struct inode *inode = page->mapping->host; | 2817 | struct inode *inode = page->mapping->host; |
2811 | 2818 | ||
2812 | trace_ext4_readpage(page); | 2819 | trace_ext4_readpage(page); |
2813 | 2820 | ||
2814 | if (ext4_has_inline_data(inode)) | 2821 | if (ext4_has_inline_data(inode)) |
2815 | ret = ext4_readpage_inline(inode, page); | 2822 | ret = ext4_readpage_inline(inode, page); |
2816 | 2823 | ||
2817 | if (ret == -EAGAIN) | 2824 | if (ret == -EAGAIN) |
2818 | return mpage_readpage(page, ext4_get_block); | 2825 | return mpage_readpage(page, ext4_get_block); |
2819 | 2826 | ||
2820 | return ret; | 2827 | return ret; |
2821 | } | 2828 | } |
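
The -EAGAIN convention here lets the inline-data handler decline a page it does not own, with the caller falling through to the generic block-mapped reader. A minimal model of that dispatch idiom, with stand-in handlers:

        #include <errno.h>
        #include <stdbool.h>
        #include <stdio.h>

        static int read_inline(bool has_inline)
        {
                return has_inline ? 0 : -EAGAIN;  /* decline if no inline data */
        }

        static int read_generic(void)
        {
                return 0;                         /* generic block-mapped read */
        }

        static int readpage(bool has_inline)
        {
                int ret = -EAGAIN;

                if (has_inline)
                        ret = read_inline(has_inline);
                if (ret == -EAGAIN)               /* handler declined: fall back */
                        ret = read_generic();
                return ret;
        }

        int main(void)
        {
                printf("%d %d\n", readpage(true), readpage(false));  /* 0 0 */
                return 0;
        }
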
2822 | 2829 | ||
2823 | static int | 2830 | static int |
2824 | ext4_readpages(struct file *file, struct address_space *mapping, | 2831 | ext4_readpages(struct file *file, struct address_space *mapping, |
2825 | struct list_head *pages, unsigned nr_pages) | 2832 | struct list_head *pages, unsigned nr_pages) |
2826 | { | 2833 | { |
2827 | struct inode *inode = mapping->host; | 2834 | struct inode *inode = mapping->host; |
2828 | 2835 | ||
2829 | /* If the file has inline data, no need to do readpages. */ | 2836 | /* If the file has inline data, no need to do readpages. */ |
2830 | if (ext4_has_inline_data(inode)) | 2837 | if (ext4_has_inline_data(inode)) |
2831 | return 0; | 2838 | return 0; |
2832 | 2839 | ||
2833 | return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); | 2840 | return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); |
2834 | } | 2841 | } |
2835 | 2842 | ||
2836 | static void ext4_invalidatepage(struct page *page, unsigned int offset, | 2843 | static void ext4_invalidatepage(struct page *page, unsigned int offset, |
2837 | unsigned int length) | 2844 | unsigned int length) |
2838 | { | 2845 | { |
2839 | trace_ext4_invalidatepage(page, offset, length); | 2846 | trace_ext4_invalidatepage(page, offset, length); |
2840 | 2847 | ||
2841 | /* No journalling happens on data buffers when this function is used */ | 2848 | /* No journalling happens on data buffers when this function is used */ |
2842 | WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page))); | 2849 | WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page))); |
2843 | 2850 | ||
2844 | block_invalidatepage(page, offset, length); | 2851 | block_invalidatepage(page, offset, length); |
2845 | } | 2852 | } |
2846 | 2853 | ||
2847 | static int __ext4_journalled_invalidatepage(struct page *page, | 2854 | static int __ext4_journalled_invalidatepage(struct page *page, |
2848 | unsigned int offset, | 2855 | unsigned int offset, |
2849 | unsigned int length) | 2856 | unsigned int length) |
2850 | { | 2857 | { |
2851 | journal_t *journal = EXT4_JOURNAL(page->mapping->host); | 2858 | journal_t *journal = EXT4_JOURNAL(page->mapping->host); |
2852 | 2859 | ||
2853 | trace_ext4_journalled_invalidatepage(page, offset, length); | 2860 | trace_ext4_journalled_invalidatepage(page, offset, length); |
2854 | 2861 | ||
2855 | /* | 2862 | /* |
2856 | * If it's a full truncate we just forget about the pending dirtying | 2863 | * If it's a full truncate we just forget about the pending dirtying |
2857 | */ | 2864 | */ |
2858 | if (offset == 0 && length == PAGE_CACHE_SIZE) | 2865 | if (offset == 0 && length == PAGE_CACHE_SIZE) |
2859 | ClearPageChecked(page); | 2866 | ClearPageChecked(page); |
2860 | 2867 | ||
2861 | return jbd2_journal_invalidatepage(journal, page, offset, length); | 2868 | return jbd2_journal_invalidatepage(journal, page, offset, length); |
2862 | } | 2869 | } |
2863 | 2870 | ||
2864 | /* Wrapper for aops... */ | 2871 | /* Wrapper for aops... */ |
2865 | static void ext4_journalled_invalidatepage(struct page *page, | 2872 | static void ext4_journalled_invalidatepage(struct page *page, |
2866 | unsigned int offset, | 2873 | unsigned int offset, |
2867 | unsigned int length) | 2874 | unsigned int length) |
2868 | { | 2875 | { |
2869 | WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0); | 2876 | WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0); |
2870 | } | 2877 | } |
2871 | 2878 | ||
2872 | static int ext4_releasepage(struct page *page, gfp_t wait) | 2879 | static int ext4_releasepage(struct page *page, gfp_t wait) |
2873 | { | 2880 | { |
2874 | journal_t *journal = EXT4_JOURNAL(page->mapping->host); | 2881 | journal_t *journal = EXT4_JOURNAL(page->mapping->host); |
2875 | 2882 | ||
2876 | trace_ext4_releasepage(page); | 2883 | trace_ext4_releasepage(page); |
2877 | 2884 | ||
2878 | /* Page has dirty journalled data -> cannot release */ | 2885 | /* Page has dirty journalled data -> cannot release */ |
2879 | if (PageChecked(page)) | 2886 | if (PageChecked(page)) |
2880 | return 0; | 2887 | return 0; |
2881 | if (journal) | 2888 | if (journal) |
2882 | return jbd2_journal_try_to_free_buffers(journal, page, wait); | 2889 | return jbd2_journal_try_to_free_buffers(journal, page, wait); |
2883 | else | 2890 | else |
2884 | return try_to_free_buffers(page); | 2891 | return try_to_free_buffers(page); |
2885 | } | 2892 | } |
2886 | 2893 | ||
2887 | /* | 2894 | /* |
2888 | * ext4_get_block used when preparing for a DIO write or buffer write. | 2895 | * ext4_get_block used when preparing for a DIO write or buffer write. |
2889 | * We allocate an uninitialized extent if blocks haven't been allocated. | 2896 | * We allocate an uninitialized extent if blocks haven't been allocated. |
2890 | * The extent will be converted to initialized after the IO is complete. | 2897 | * The extent will be converted to initialized after the IO is complete. |
2891 | */ | 2898 | */ |
2892 | int ext4_get_block_write(struct inode *inode, sector_t iblock, | 2899 | int ext4_get_block_write(struct inode *inode, sector_t iblock, |
2893 | struct buffer_head *bh_result, int create) | 2900 | struct buffer_head *bh_result, int create) |
2894 | { | 2901 | { |
2895 | ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", | 2902 | ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", |
2896 | inode->i_ino, create); | 2903 | inode->i_ino, create); |
2897 | return _ext4_get_block(inode, iblock, bh_result, | 2904 | return _ext4_get_block(inode, iblock, bh_result, |
2898 | EXT4_GET_BLOCKS_IO_CREATE_EXT); | 2905 | EXT4_GET_BLOCKS_IO_CREATE_EXT); |
2899 | } | 2906 | } |
2900 | 2907 | ||
2901 | static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, | 2908 | static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, |
2902 | struct buffer_head *bh_result, int create) | 2909 | struct buffer_head *bh_result, int create) |
2903 | { | 2910 | { |
2904 | ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n", | 2911 | ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n", |
2905 | inode->i_ino, create); | 2912 | inode->i_ino, create); |
2906 | return _ext4_get_block(inode, iblock, bh_result, | 2913 | return _ext4_get_block(inode, iblock, bh_result, |
2907 | EXT4_GET_BLOCKS_NO_LOCK); | 2914 | EXT4_GET_BLOCKS_NO_LOCK); |
2908 | } | 2915 | } |
2909 | 2916 | ||
2910 | static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, | 2917 | static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, |
2911 | ssize_t size, void *private) | 2918 | ssize_t size, void *private) |
2912 | { | 2919 | { |
2913 | ext4_io_end_t *io_end = iocb->private; | 2920 | ext4_io_end_t *io_end = iocb->private; |
2914 | 2921 | ||
2915 | /* if not async direct IO just return */ | 2922 | /* if not async direct IO just return */ |
2916 | if (!io_end) | 2923 | if (!io_end) |
2917 | return; | 2924 | return; |
2918 | 2925 | ||
2919 | ext_debug("ext4_end_io_dio(): io_end 0x%p " | 2926 | ext_debug("ext4_end_io_dio(): io_end 0x%p " |
2920 | "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", | 2927 | "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", |
2921 | iocb->private, io_end->inode->i_ino, iocb, offset, | 2928 | iocb->private, io_end->inode->i_ino, iocb, offset, |
2922 | size); | 2929 | size); |
2923 | 2930 | ||
2924 | iocb->private = NULL; | 2931 | iocb->private = NULL; |
2925 | io_end->offset = offset; | 2932 | io_end->offset = offset; |
2926 | io_end->size = size; | 2933 | io_end->size = size; |
2927 | ext4_put_io_end(io_end); | 2934 | ext4_put_io_end(io_end); |
2928 | } | 2935 | } |
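The io_end lifetime above is reference counted: before submission the iocb is handed its own reference (see ext4_get_io_end() below), ext4_end_io_dio() records the completed range and drops that reference, and whichever put runs last performs the deferred extent conversion. A minimal userspace sketch of the pattern, with illustrative names and a printf standing in for the conversion:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct io_end_stub {
	atomic_int refcount;
	long long offset;
	long long size;
};

static void io_end_put(struct io_end_stub *io)
{
	/* the last put performs the deferred work */
	if (atomic_fetch_sub(&io->refcount, 1) == 1) {
		printf("convert unwritten extents: off=%lld len=%lld\n",
		       io->offset, io->size);
		free(io);
	}
}

static void end_io_dio(struct io_end_stub *io, long long offset,
		       long long size)
{
	io->offset = offset;
	io->size = size;
	io_end_put(io);		/* drop the iocb's reference */
}

int main(void)
{
	struct io_end_stub *io = malloc(sizeof(*io));

	atomic_init(&io->refcount, 1);		/* submitter's reference */
	atomic_fetch_add(&io->refcount, 1);	/* reference for the iocb */
	end_io_dio(io, 4096, 8192);		/* completion fires */
	io_end_put(io);				/* submitter's put frees it */
	return 0;
}

Because only the last put triggers the conversion, it does not matter whether the completion or the submitter's cleanup runs first, which is exactly the property the comment after __blockdev_direct_IO() relies on.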
2929 | 2936 | ||
2930 | /* | 2937 | /* |
2931 | * For ext4 extent files, ext4 will do direct-IO writes to holes, | 2938 | * For ext4 extent files, ext4 will do direct-IO writes to holes, |
2932 | * preallocated extents, and writes that extend the file, with no need to | 2939 | * preallocated extents, and writes that extend the file, with no need to |
2933 | * fall back to buffered IO. | 2940 | * fall back to buffered IO. |
2934 | * | 2941 | * |
2935 | * For holes, we fallocate those blocks and mark them as unwritten. | 2942 | * For holes, we fallocate those blocks and mark them as unwritten. |
2936 | * If those blocks were preallocated, we make sure they are split, but | 2943 | * If those blocks were preallocated, we make sure they are split, but |
2937 | * still keep the range to write as unwritten. | 2944 | * still keep the range to write as unwritten. |
2938 | * | 2945 | * |
2939 | * The unwritten extents will be converted to written when DIO is completed. | 2946 | * The unwritten extents will be converted to written when DIO is completed. |
2940 | * For async direct IO, since the IO may still be pending when we return, we | 2947 | * For async direct IO, since the IO may still be pending when we return, we |
2941 | * set up an end_io callback function, which will do the conversion | 2948 | * set up an end_io callback function, which will do the conversion |
2942 | * when the async direct IO has completed. | 2949 | * when the async direct IO has completed. |
2943 | * | 2950 | * |
2944 | * If the O_DIRECT write will extend the file then add this inode to the | 2951 | * If the O_DIRECT write will extend the file then add this inode to the |
2945 | * orphan list, so that recovery will truncate it back to the original size | 2952 | * orphan list, so that recovery will truncate it back to the original size |
2946 | * if the machine crashes during the write. | 2953 | * if the machine crashes during the write. |
2947 | * | 2954 | * |
2948 | */ | 2955 | */ |
2949 | static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, | 2956 | static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, |
2950 | struct iov_iter *iter, loff_t offset) | 2957 | struct iov_iter *iter, loff_t offset) |
2951 | { | 2958 | { |
2952 | struct file *file = iocb->ki_filp; | 2959 | struct file *file = iocb->ki_filp; |
2953 | struct inode *inode = file->f_mapping->host; | 2960 | struct inode *inode = file->f_mapping->host; |
2954 | ssize_t ret; | 2961 | ssize_t ret; |
2955 | size_t count = iov_iter_count(iter); | 2962 | size_t count = iov_iter_count(iter); |
2956 | int overwrite = 0; | 2963 | int overwrite = 0; |
2957 | get_block_t *get_block_func = NULL; | 2964 | get_block_t *get_block_func = NULL; |
2958 | int dio_flags = 0; | 2965 | int dio_flags = 0; |
2959 | loff_t final_size = offset + count; | 2966 | loff_t final_size = offset + count; |
2960 | ext4_io_end_t *io_end = NULL; | 2967 | ext4_io_end_t *io_end = NULL; |
2961 | 2968 | ||
2962 | /* Use the old path for reads and writes beyond i_size. */ | 2969 | /* Use the old path for reads and writes beyond i_size. */ |
2963 | if (rw != WRITE || final_size > inode->i_size) | 2970 | if (rw != WRITE || final_size > inode->i_size) |
2964 | return ext4_ind_direct_IO(rw, iocb, iter, offset); | 2971 | return ext4_ind_direct_IO(rw, iocb, iter, offset); |
2965 | 2972 | ||
2966 | BUG_ON(iocb->private == NULL); | 2973 | BUG_ON(iocb->private == NULL); |
2967 | 2974 | ||
2968 | /* | 2975 | /* |
2969 | * Make all waiters for direct IO properly wait also for extent | 2976 | * Make all waiters for direct IO properly wait also for extent |
2970 | * conversion. This also disallows a race between truncate() and | 2977 | * conversion. This also disallows a race between truncate() and |
2971 | * overwrite DIO, as i_dio_count needs to be incremented under i_mutex. | 2978 | * overwrite DIO, as i_dio_count needs to be incremented under i_mutex. |
2972 | */ | 2979 | */ |
2973 | if (rw == WRITE) | 2980 | if (rw == WRITE) |
2974 | atomic_inc(&inode->i_dio_count); | 2981 | atomic_inc(&inode->i_dio_count); |
2975 | 2982 | ||
2976 | /* If we do an overwrite DIO, i_mutex locking can be released */ | 2983 | /* If we do an overwrite DIO, i_mutex locking can be released */ |
2977 | overwrite = *((int *)iocb->private); | 2984 | overwrite = *((int *)iocb->private); |
2978 | 2985 | ||
2979 | if (overwrite) { | 2986 | if (overwrite) { |
2980 | down_read(&EXT4_I(inode)->i_data_sem); | 2987 | down_read(&EXT4_I(inode)->i_data_sem); |
2981 | mutex_unlock(&inode->i_mutex); | 2988 | mutex_unlock(&inode->i_mutex); |
2982 | } | 2989 | } |
2983 | 2990 | ||
2984 | /* | 2991 | /* |
2985 | * We can do direct writes to holes and fallocated extents. | 2992 | * We can do direct writes to holes and fallocated extents. |
2986 | * | 2993 | * |
2987 | * Allocated blocks to fill the hole are marked as | 2994 | * Allocated blocks to fill the hole are marked as |
2988 | * unwritten to prevent a parallel buffered read from exposing | 2995 | * unwritten to prevent a parallel buffered read from exposing |
2989 | * stale data before the DIO completes the data IO. | 2996 | * stale data before the DIO completes the data IO. |
2990 | * | 2997 | * |
2991 | * As for previously fallocated extents, ext4's get_block will | 2998 | * As for previously fallocated extents, ext4's get_block will |
2992 | * simply mark the buffer mapped but still keep the | 2999 | * simply mark the buffer mapped but still keep the |
2993 | * extents unwritten. | 3000 | * extents unwritten. |
2994 | * | 3001 | * |
2995 | * For the non-AIO case, we will convert those unwritten extents | 3002 | * For the non-AIO case, we will convert those unwritten extents |
2996 | * to written after returning from blockdev_direct_IO. | 3003 | * to written after returning from blockdev_direct_IO. |
2997 | * | 3004 | * |
2998 | * For async DIO, the conversion needs to be deferred until the | 3005 | * For async DIO, the conversion needs to be deferred until the |
2999 | * IO is completed. The ext4 end_io callback function will be | 3006 | * IO is completed. The ext4 end_io callback function will be |
3000 | * called to take care of the conversion work. Here, for the async | 3007 | * called to take care of the conversion work. Here, for the async |
3001 | * case, we allocate an io_end structure to hook to the iocb. | 3008 | * case, we allocate an io_end structure to hook to the iocb. |
3002 | */ | 3009 | */ |
3003 | iocb->private = NULL; | 3010 | iocb->private = NULL; |
3004 | ext4_inode_aio_set(inode, NULL); | 3011 | ext4_inode_aio_set(inode, NULL); |
3005 | if (!is_sync_kiocb(iocb)) { | 3012 | if (!is_sync_kiocb(iocb)) { |
3006 | io_end = ext4_init_io_end(inode, GFP_NOFS); | 3013 | io_end = ext4_init_io_end(inode, GFP_NOFS); |
3007 | if (!io_end) { | 3014 | if (!io_end) { |
3008 | ret = -ENOMEM; | 3015 | ret = -ENOMEM; |
3009 | goto retake_lock; | 3016 | goto retake_lock; |
3010 | } | 3017 | } |
3011 | /* | 3018 | /* |
3012 | * Grab reference for DIO. Will be dropped in ext4_end_io_dio() | 3019 | * Grab reference for DIO. Will be dropped in ext4_end_io_dio() |
3013 | */ | 3020 | */ |
3014 | iocb->private = ext4_get_io_end(io_end); | 3021 | iocb->private = ext4_get_io_end(io_end); |
3015 | /* | 3022 | /* |
3016 | * we save the io structure for the current async direct | 3023 | * we save the io structure for the current async direct |
3017 | * IO, so that ext4_map_blocks() can later flag the | 3024 | * IO, so that ext4_map_blocks() can later flag the |
3018 | * io structure to indicate whether there are unwritten extents | 3025 | * io structure to indicate whether there are unwritten extents |
3019 | * that need to be converted when the IO is completed. | 3026 | * that need to be converted when the IO is completed. |
3020 | */ | 3027 | */ |
3021 | ext4_inode_aio_set(inode, io_end); | 3028 | ext4_inode_aio_set(inode, io_end); |
3022 | } | 3029 | } |
3023 | 3030 | ||
3024 | if (overwrite) { | 3031 | if (overwrite) { |
3025 | get_block_func = ext4_get_block_write_nolock; | 3032 | get_block_func = ext4_get_block_write_nolock; |
3026 | } else { | 3033 | } else { |
3027 | get_block_func = ext4_get_block_write; | 3034 | get_block_func = ext4_get_block_write; |
3028 | dio_flags = DIO_LOCKING; | 3035 | dio_flags = DIO_LOCKING; |
3029 | } | 3036 | } |
3030 | if (IS_DAX(inode)) | 3037 | if (IS_DAX(inode)) |
3031 | ret = dax_do_io(rw, iocb, inode, iter, offset, get_block_func, | 3038 | ret = dax_do_io(rw, iocb, inode, iter, offset, get_block_func, |
3032 | ext4_end_io_dio, dio_flags); | 3039 | ext4_end_io_dio, dio_flags); |
3033 | else | 3040 | else |
3034 | ret = __blockdev_direct_IO(rw, iocb, inode, | 3041 | ret = __blockdev_direct_IO(rw, iocb, inode, |
3035 | inode->i_sb->s_bdev, iter, offset, | 3042 | inode->i_sb->s_bdev, iter, offset, |
3036 | get_block_func, | 3043 | get_block_func, |
3037 | ext4_end_io_dio, NULL, dio_flags); | 3044 | ext4_end_io_dio, NULL, dio_flags); |
3038 | 3045 | ||
3039 | /* | 3046 | /* |
3040 | * Put our reference to io_end. This can free the io_end structure e.g. | 3047 | * Put our reference to io_end. This can free the io_end structure e.g. |
3041 | * in sync IO case or in case of error. It can even perform extent | 3048 | * in sync IO case or in case of error. It can even perform extent |
3042 | * conversion if all bios we submitted finished before we got here. | 3049 | * conversion if all bios we submitted finished before we got here. |
3043 | * Note that in that case iocb->private can be already set to NULL | 3050 | * Note that in that case iocb->private can be already set to NULL |
3044 | * here. | 3051 | * here. |
3045 | */ | 3052 | */ |
3046 | if (io_end) { | 3053 | if (io_end) { |
3047 | ext4_inode_aio_set(inode, NULL); | 3054 | ext4_inode_aio_set(inode, NULL); |
3048 | ext4_put_io_end(io_end); | 3055 | ext4_put_io_end(io_end); |
3049 | /* | 3056 | /* |
3050 | * When no IO was submitted ext4_end_io_dio() was not | 3057 | * When no IO was submitted ext4_end_io_dio() was not |
3051 | * called so we have to put iocb's reference. | 3058 | * called so we have to put iocb's reference. |
3052 | */ | 3059 | */ |
3053 | if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) { | 3060 | if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) { |
3054 | WARN_ON(iocb->private != io_end); | 3061 | WARN_ON(iocb->private != io_end); |
3055 | WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); | 3062 | WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); |
3056 | ext4_put_io_end(io_end); | 3063 | ext4_put_io_end(io_end); |
3057 | iocb->private = NULL; | 3064 | iocb->private = NULL; |
3058 | } | 3065 | } |
3059 | } | 3066 | } |
3060 | if (ret > 0 && !overwrite && ext4_test_inode_state(inode, | 3067 | if (ret > 0 && !overwrite && ext4_test_inode_state(inode, |
3061 | EXT4_STATE_DIO_UNWRITTEN)) { | 3068 | EXT4_STATE_DIO_UNWRITTEN)) { |
3062 | int err; | 3069 | int err; |
3063 | /* | 3070 | /* |
3064 | * for non AIO case, since the IO is already | 3071 | * for non AIO case, since the IO is already |
3065 | * completed, we could do the conversion right here | 3072 | * completed, we could do the conversion right here |
3066 | */ | 3073 | */ |
3067 | err = ext4_convert_unwritten_extents(NULL, inode, | 3074 | err = ext4_convert_unwritten_extents(NULL, inode, |
3068 | offset, ret); | 3075 | offset, ret); |
3069 | if (err < 0) | 3076 | if (err < 0) |
3070 | ret = err; | 3077 | ret = err; |
3071 | ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); | 3078 | ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); |
3072 | } | 3079 | } |
3073 | 3080 | ||
3074 | retake_lock: | 3081 | retake_lock: |
3075 | if (rw == WRITE) | 3082 | if (rw == WRITE) |
3076 | inode_dio_done(inode); | 3083 | inode_dio_done(inode); |
3077 | /* take i_mutex again if we did an overwrite DIO */ | 3084 | /* take i_mutex again if we did an overwrite DIO */ |
3078 | if (overwrite) { | 3085 | if (overwrite) { |
3079 | up_read(&EXT4_I(inode)->i_data_sem); | 3086 | up_read(&EXT4_I(inode)->i_data_sem); |
3080 | mutex_lock(&inode->i_mutex); | 3087 | mutex_lock(&inode->i_mutex); |
3081 | } | 3088 | } |
3082 | 3089 | ||
3083 | return ret; | 3090 | return ret; |
3084 | } | 3091 | } |
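The overwrite-DIO path above performs a lock handoff: it takes i_data_sem for read before dropping i_mutex, so a truncate cannot change the block mapping while other i_mutex users proceed, and it retakes i_mutex before returning because the caller expects to still hold it. A hedged pthread sketch of that ordering; the two locks are plain userspace stand-ins for i_mutex and i_data_sem, not the kernel primitives:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t i_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_rwlock_t i_data_sem;

static void overwrite_dio(void)
{
	/* called with i_mutex held */
	pthread_rwlock_rdlock(&i_data_sem);	/* pin the block mapping */
	pthread_mutex_unlock(&i_mutex);		/* let other i_mutex users in */

	puts("submit O_DIRECT write to already-allocated blocks");

	pthread_rwlock_unlock(&i_data_sem);
	pthread_mutex_lock(&i_mutex);		/* retake before returning */
}

int main(void)
{
	pthread_rwlock_init(&i_data_sem, NULL);
	pthread_mutex_lock(&i_mutex);	/* callers enter with i_mutex held */
	overwrite_dio();
	pthread_mutex_unlock(&i_mutex);
	pthread_rwlock_destroy(&i_data_sem);
	return 0;
}

Taking the read lock before releasing the mutex is the crucial ordering; it closes the window in which a truncate could slip in and change the mapping underneath the in-flight write.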
3085 | 3092 | ||
3086 | static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, | 3093 | static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, |
3087 | struct iov_iter *iter, loff_t offset) | 3094 | struct iov_iter *iter, loff_t offset) |
3088 | { | 3095 | { |
3089 | struct file *file = iocb->ki_filp; | 3096 | struct file *file = iocb->ki_filp; |
3090 | struct inode *inode = file->f_mapping->host; | 3097 | struct inode *inode = file->f_mapping->host; |
3091 | size_t count = iov_iter_count(iter); | 3098 | size_t count = iov_iter_count(iter); |
3092 | ssize_t ret; | 3099 | ssize_t ret; |
3093 | 3100 | ||
3094 | /* | 3101 | /* |
3095 | * If we are doing data journalling, we don't support O_DIRECT. | 3102 | * If we are doing data journalling, we don't support O_DIRECT. |
3096 | */ | 3103 | */ |
3097 | if (ext4_should_journal_data(inode)) | 3104 | if (ext4_should_journal_data(inode)) |
3098 | return 0; | 3105 | return 0; |
3099 | 3106 | ||
3100 | /* Let buffer I/O handle the inline data case. */ | 3107 | /* Let buffer I/O handle the inline data case. */ |
3101 | if (ext4_has_inline_data(inode)) | 3108 | if (ext4_has_inline_data(inode)) |
3102 | return 0; | 3109 | return 0; |
3103 | 3110 | ||
3104 | trace_ext4_direct_IO_enter(inode, offset, count, rw); | 3111 | trace_ext4_direct_IO_enter(inode, offset, count, rw); |
3105 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) | 3112 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) |
3106 | ret = ext4_ext_direct_IO(rw, iocb, iter, offset); | 3113 | ret = ext4_ext_direct_IO(rw, iocb, iter, offset); |
3107 | else | 3114 | else |
3108 | ret = ext4_ind_direct_IO(rw, iocb, iter, offset); | 3115 | ret = ext4_ind_direct_IO(rw, iocb, iter, offset); |
3109 | trace_ext4_direct_IO_exit(inode, offset, count, rw, ret); | 3116 | trace_ext4_direct_IO_exit(inode, offset, count, rw, ret); |
3110 | return ret; | 3117 | return ret; |
3111 | } | 3118 | } |
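A detail worth calling out in ext4_direct_IO(): returning 0 is how it declines O_DIRECT, which makes the VFS fall back to buffered IO for data-journalled and inline-data inodes. The dispatch can be summarized as a small decision function; a hedged sketch with illustrative flag and enum names:

#include <stdbool.h>
#include <stdio.h>

enum dio_path { DIO_BUFFERED_FALLBACK, DIO_EXTENT_PATH, DIO_INDIRECT_PATH };

static enum dio_path pick_dio_path(bool journals_data, bool has_inline_data,
				   bool extent_mapped)
{
	if (journals_data)	/* O_DIRECT unsupported with data journalling */
		return DIO_BUFFERED_FALLBACK;
	if (has_inline_data)	/* inline data is handled by buffered IO */
		return DIO_BUFFERED_FALLBACK;
	return extent_mapped ? DIO_EXTENT_PATH : DIO_INDIRECT_PATH;
}

int main(void)
{
	printf("%d\n", pick_dio_path(false, false, true));  /* extent path */
	printf("%d\n", pick_dio_path(false, false, false)); /* indirect path */
	printf("%d\n", pick_dio_path(true, false, true));   /* fallback */
	return 0;
}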
3112 | 3119 | ||
3113 | /* | 3120 | /* |
3114 | * Pages can be marked dirty completely asynchronously from ext4's journalling | 3121 | * Pages can be marked dirty completely asynchronously from ext4's journalling |
3115 | * activity, by filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do | 3122 | * activity, by filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do |
3116 | * much here because ->set_page_dirty is called under VFS locks. The page is | 3123 | * much here because ->set_page_dirty is called under VFS locks. The page is |
3117 | * not necessarily locked. | 3124 | * not necessarily locked. |
3118 | * | 3125 | * |
3119 | * We cannot just dirty the page and leave attached buffers clean, because the | 3126 | * We cannot just dirty the page and leave attached buffers clean, because the |
3120 | * buffers' dirty state is "definitive". We cannot just set the buffers dirty | 3127 | * buffers' dirty state is "definitive". We cannot just set the buffers dirty |
3121 | * or jbddirty because all the journalling code will explode. | 3128 | * or jbddirty because all the journalling code will explode. |
3122 | * | 3129 | * |
3123 | * So what we do is to mark the page "pending dirty" and next time writepage | 3130 | * So what we do is to mark the page "pending dirty" and next time writepage |
3124 | * is called, propagate that into the buffers appropriately. | 3131 | * is called, propagate that into the buffers appropriately. |
3125 | */ | 3132 | */ |
3126 | static int ext4_journalled_set_page_dirty(struct page *page) | 3133 | static int ext4_journalled_set_page_dirty(struct page *page) |
3127 | { | 3134 | { |
3128 | SetPageChecked(page); | 3135 | SetPageChecked(page); |
3129 | return __set_page_dirty_nobuffers(page); | 3136 | return __set_page_dirty_nobuffers(page); |
3130 | } | 3137 | } |
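The comment above describes a defer-under-restrictive-locking pattern: ->set_page_dirty may only set flags, so the page is tagged Checked ("buffers still need dirtying") and the expensive propagation happens later on the writepage path. A minimal flag-based sketch of the idea; the PG_* values below are illustrative, not the kernel's page flags:

#include <stdio.h>

#define PG_DIRTY   (1u << 0)
#define PG_CHECKED (1u << 1)	/* "buffers still need dirtying" marker */

static unsigned set_page_dirty(unsigned flags)
{
	/* cheap, lock-safe part: mark the page dirty and pending */
	return flags | PG_DIRTY | PG_CHECKED;
}

static unsigned writepage(unsigned flags)
{
	if (flags & PG_CHECKED) {
		/* deferred part: now safe to touch buffer/journal state */
		printf("propagating dirty state into buffers\n");
		flags &= ~PG_CHECKED;
	}
	return flags & ~PG_DIRTY;	/* page written back clean */
}

int main(void)
{
	unsigned flags = 0;

	flags = set_page_dirty(flags);
	flags = writepage(flags);
	return (int)flags;
}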
3131 | 3138 | ||
3132 | static const struct address_space_operations ext4_aops = { | 3139 | static const struct address_space_operations ext4_aops = { |
3133 | .readpage = ext4_readpage, | 3140 | .readpage = ext4_readpage, |
3134 | .readpages = ext4_readpages, | 3141 | .readpages = ext4_readpages, |
3135 | .writepage = ext4_writepage, | 3142 | .writepage = ext4_writepage, |
3136 | .writepages = ext4_writepages, | 3143 | .writepages = ext4_writepages, |
3137 | .write_begin = ext4_write_begin, | 3144 | .write_begin = ext4_write_begin, |
3138 | .write_end = ext4_write_end, | 3145 | .write_end = ext4_write_end, |
3139 | .bmap = ext4_bmap, | 3146 | .bmap = ext4_bmap, |
3140 | .invalidatepage = ext4_invalidatepage, | 3147 | .invalidatepage = ext4_invalidatepage, |
3141 | .releasepage = ext4_releasepage, | 3148 | .releasepage = ext4_releasepage, |
3142 | .direct_IO = ext4_direct_IO, | 3149 | .direct_IO = ext4_direct_IO, |
3143 | .migratepage = buffer_migrate_page, | 3150 | .migratepage = buffer_migrate_page, |
3144 | .is_partially_uptodate = block_is_partially_uptodate, | 3151 | .is_partially_uptodate = block_is_partially_uptodate, |
3145 | .error_remove_page = generic_error_remove_page, | 3152 | .error_remove_page = generic_error_remove_page, |
3146 | }; | 3153 | }; |
3147 | 3154 | ||
3148 | static const struct address_space_operations ext4_journalled_aops = { | 3155 | static const struct address_space_operations ext4_journalled_aops = { |
3149 | .readpage = ext4_readpage, | 3156 | .readpage = ext4_readpage, |
3150 | .readpages = ext4_readpages, | 3157 | .readpages = ext4_readpages, |
3151 | .writepage = ext4_writepage, | 3158 | .writepage = ext4_writepage, |
3152 | .writepages = ext4_writepages, | 3159 | .writepages = ext4_writepages, |
3153 | .write_begin = ext4_write_begin, | 3160 | .write_begin = ext4_write_begin, |
3154 | .write_end = ext4_journalled_write_end, | 3161 | .write_end = ext4_journalled_write_end, |
3155 | .set_page_dirty = ext4_journalled_set_page_dirty, | 3162 | .set_page_dirty = ext4_journalled_set_page_dirty, |
3156 | .bmap = ext4_bmap, | 3163 | .bmap = ext4_bmap, |
3157 | .invalidatepage = ext4_journalled_invalidatepage, | 3164 | .invalidatepage = ext4_journalled_invalidatepage, |
3158 | .releasepage = ext4_releasepage, | 3165 | .releasepage = ext4_releasepage, |
3159 | .direct_IO = ext4_direct_IO, | 3166 | .direct_IO = ext4_direct_IO, |
3160 | .is_partially_uptodate = block_is_partially_uptodate, | 3167 | .is_partially_uptodate = block_is_partially_uptodate, |
3161 | .error_remove_page = generic_error_remove_page, | 3168 | .error_remove_page = generic_error_remove_page, |
3162 | }; | 3169 | }; |
3163 | 3170 | ||
3164 | static const struct address_space_operations ext4_da_aops = { | 3171 | static const struct address_space_operations ext4_da_aops = { |
3165 | .readpage = ext4_readpage, | 3172 | .readpage = ext4_readpage, |
3166 | .readpages = ext4_readpages, | 3173 | .readpages = ext4_readpages, |
3167 | .writepage = ext4_writepage, | 3174 | .writepage = ext4_writepage, |
3168 | .writepages = ext4_writepages, | 3175 | .writepages = ext4_writepages, |
3169 | .write_begin = ext4_da_write_begin, | 3176 | .write_begin = ext4_da_write_begin, |
3170 | .write_end = ext4_da_write_end, | 3177 | .write_end = ext4_da_write_end, |
3171 | .bmap = ext4_bmap, | 3178 | .bmap = ext4_bmap, |
3172 | .invalidatepage = ext4_da_invalidatepage, | 3179 | .invalidatepage = ext4_da_invalidatepage, |
3173 | .releasepage = ext4_releasepage, | 3180 | .releasepage = ext4_releasepage, |
3174 | .direct_IO = ext4_direct_IO, | 3181 | .direct_IO = ext4_direct_IO, |
3175 | .migratepage = buffer_migrate_page, | 3182 | .migratepage = buffer_migrate_page, |
3176 | .is_partially_uptodate = block_is_partially_uptodate, | 3183 | .is_partially_uptodate = block_is_partially_uptodate, |
3177 | .error_remove_page = generic_error_remove_page, | 3184 | .error_remove_page = generic_error_remove_page, |
3178 | }; | 3185 | }; |
3179 | 3186 | ||
3180 | void ext4_set_aops(struct inode *inode) | 3187 | void ext4_set_aops(struct inode *inode) |
3181 | { | 3188 | { |
3182 | switch (ext4_inode_journal_mode(inode)) { | 3189 | switch (ext4_inode_journal_mode(inode)) { |
3183 | case EXT4_INODE_ORDERED_DATA_MODE: | 3190 | case EXT4_INODE_ORDERED_DATA_MODE: |
3184 | ext4_set_inode_state(inode, EXT4_STATE_ORDERED_MODE); | 3191 | ext4_set_inode_state(inode, EXT4_STATE_ORDERED_MODE); |
3185 | break; | 3192 | break; |
3186 | case EXT4_INODE_WRITEBACK_DATA_MODE: | 3193 | case EXT4_INODE_WRITEBACK_DATA_MODE: |
3187 | ext4_clear_inode_state(inode, EXT4_STATE_ORDERED_MODE); | 3194 | ext4_clear_inode_state(inode, EXT4_STATE_ORDERED_MODE); |
3188 | break; | 3195 | break; |
3189 | case EXT4_INODE_JOURNAL_DATA_MODE: | 3196 | case EXT4_INODE_JOURNAL_DATA_MODE: |
3190 | inode->i_mapping->a_ops = &ext4_journalled_aops; | 3197 | inode->i_mapping->a_ops = &ext4_journalled_aops; |
3191 | return; | 3198 | return; |
3192 | default: | 3199 | default: |
3193 | BUG(); | 3200 | BUG(); |
3194 | } | 3201 | } |
3195 | if (test_opt(inode->i_sb, DELALLOC)) | 3202 | if (test_opt(inode->i_sb, DELALLOC)) |
3196 | inode->i_mapping->a_ops = &ext4_da_aops; | 3203 | inode->i_mapping->a_ops = &ext4_da_aops; |
3197 | else | 3204 | else |
3198 | inode->i_mapping->a_ops = &ext4_aops; | 3205 | inode->i_mapping->a_ops = &ext4_aops; |
3199 | } | 3206 | } |
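The selection logic above is worth spelling out: data-journalled inodes always get ext4_journalled_aops, while ordered and writeback modes only toggle an inode state flag and let the delalloc mount option choose between ext4_da_aops and ext4_aops. A tiny illustrative mapping, with strings standing in for the struct pointers:

#include <stdio.h>

enum jmode { ORDERED, WRITEBACK, JOURNALLED };

static const char *pick_aops(enum jmode mode, int delalloc)
{
	if (mode == JOURNALLED)
		return "ext4_journalled_aops";
	/* ordered vs writeback only sets/clears EXT4_STATE_ORDERED_MODE;
	 * the aops table itself is picked by the delalloc mount option */
	return delalloc ? "ext4_da_aops" : "ext4_aops";
}

int main(void)
{
	printf("%s\n", pick_aops(ORDERED, 1));    /* ext4_da_aops */
	printf("%s\n", pick_aops(WRITEBACK, 0));  /* ext4_aops */
	printf("%s\n", pick_aops(JOURNALLED, 1)); /* ext4_journalled_aops */
	return 0;
}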
3200 | 3207 | ||
3201 | static int __ext4_block_zero_page_range(handle_t *handle, | 3208 | static int __ext4_block_zero_page_range(handle_t *handle, |
3202 | struct address_space *mapping, loff_t from, loff_t length) | 3209 | struct address_space *mapping, loff_t from, loff_t length) |
3203 | { | 3210 | { |
3204 | ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; | 3211 | ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; |
3205 | unsigned offset = from & (PAGE_CACHE_SIZE-1); | 3212 | unsigned offset = from & (PAGE_CACHE_SIZE-1); |
3206 | unsigned blocksize, pos; | 3213 | unsigned blocksize, pos; |
3207 | ext4_lblk_t iblock; | 3214 | ext4_lblk_t iblock; |
3208 | struct inode *inode = mapping->host; | 3215 | struct inode *inode = mapping->host; |
3209 | struct buffer_head *bh; | 3216 | struct buffer_head *bh; |
3210 | struct page *page; | 3217 | struct page *page; |
3211 | int err = 0; | 3218 | int err = 0; |
3212 | 3219 | ||
3213 | page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, | 3220 | page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, |
3214 | mapping_gfp_mask(mapping) & ~__GFP_FS); | 3221 | mapping_gfp_mask(mapping) & ~__GFP_FS); |
3215 | if (!page) | 3222 | if (!page) |
3216 | return -ENOMEM; | 3223 | return -ENOMEM; |
3217 | 3224 | ||
3218 | blocksize = inode->i_sb->s_blocksize; | 3225 | blocksize = inode->i_sb->s_blocksize; |
3219 | 3226 | ||
3220 | iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); | 3227 | iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); |
3221 | 3228 | ||
3222 | if (!page_has_buffers(page)) | 3229 | if (!page_has_buffers(page)) |
3223 | create_empty_buffers(page, blocksize, 0); | 3230 | create_empty_buffers(page, blocksize, 0); |
3224 | 3231 | ||
3225 | /* Find the buffer that contains "offset" */ | 3232 | /* Find the buffer that contains "offset" */ |
3226 | bh = page_buffers(page); | 3233 | bh = page_buffers(page); |
3227 | pos = blocksize; | 3234 | pos = blocksize; |
3228 | while (offset >= pos) { | 3235 | while (offset >= pos) { |
3229 | bh = bh->b_this_page; | 3236 | bh = bh->b_this_page; |
3230 | iblock++; | 3237 | iblock++; |
3231 | pos += blocksize; | 3238 | pos += blocksize; |
3232 | } | 3239 | } |
3233 | if (buffer_freed(bh)) { | 3240 | if (buffer_freed(bh)) { |
3234 | BUFFER_TRACE(bh, "freed: skip"); | 3241 | BUFFER_TRACE(bh, "freed: skip"); |
3235 | goto unlock; | 3242 | goto unlock; |
3236 | } | 3243 | } |
3237 | if (!buffer_mapped(bh)) { | 3244 | if (!buffer_mapped(bh)) { |
3238 | BUFFER_TRACE(bh, "unmapped"); | 3245 | BUFFER_TRACE(bh, "unmapped"); |
3239 | ext4_get_block(inode, iblock, bh, 0); | 3246 | ext4_get_block(inode, iblock, bh, 0); |
3240 | /* unmapped? It's a hole - nothing to do */ | 3247 | /* unmapped? It's a hole - nothing to do */ |
3241 | if (!buffer_mapped(bh)) { | 3248 | if (!buffer_mapped(bh)) { |
3242 | BUFFER_TRACE(bh, "still unmapped"); | 3249 | BUFFER_TRACE(bh, "still unmapped"); |
3243 | goto unlock; | 3250 | goto unlock; |
3244 | } | 3251 | } |
3245 | } | 3252 | } |
3246 | 3253 | ||
3247 | /* Ok, it's mapped. Make sure it's up-to-date */ | 3254 | /* Ok, it's mapped. Make sure it's up-to-date */ |
3248 | if (PageUptodate(page)) | 3255 | if (PageUptodate(page)) |
3249 | set_buffer_uptodate(bh); | 3256 | set_buffer_uptodate(bh); |
3250 | 3257 | ||
3251 | if (!buffer_uptodate(bh)) { | 3258 | if (!buffer_uptodate(bh)) { |
3252 | err = -EIO; | 3259 | err = -EIO; |
3253 | ll_rw_block(READ, 1, &bh); | 3260 | ll_rw_block(READ, 1, &bh); |
3254 | wait_on_buffer(bh); | 3261 | wait_on_buffer(bh); |
3255 | /* Uhhuh. Read error. Complain and punt. */ | 3262 | /* Uhhuh. Read error. Complain and punt. */ |
3256 | if (!buffer_uptodate(bh)) | 3263 | if (!buffer_uptodate(bh)) |
3257 | goto unlock; | 3264 | goto unlock; |
3258 | } | 3265 | } |
3259 | if (ext4_should_journal_data(inode)) { | 3266 | if (ext4_should_journal_data(inode)) { |
3260 | BUFFER_TRACE(bh, "get write access"); | 3267 | BUFFER_TRACE(bh, "get write access"); |
3261 | err = ext4_journal_get_write_access(handle, bh); | 3268 | err = ext4_journal_get_write_access(handle, bh); |
3262 | if (err) | 3269 | if (err) |
3263 | goto unlock; | 3270 | goto unlock; |
3264 | } | 3271 | } |
3265 | zero_user(page, offset, length); | 3272 | zero_user(page, offset, length); |
3266 | BUFFER_TRACE(bh, "zeroed end of block"); | 3273 | BUFFER_TRACE(bh, "zeroed end of block"); |
3267 | 3274 | ||
3268 | if (ext4_should_journal_data(inode)) { | 3275 | if (ext4_should_journal_data(inode)) { |
3269 | err = ext4_handle_dirty_metadata(handle, inode, bh); | 3276 | err = ext4_handle_dirty_metadata(handle, inode, bh); |
3270 | } else { | 3277 | } else { |
3271 | err = 0; | 3278 | err = 0; |
3272 | mark_buffer_dirty(bh); | 3279 | mark_buffer_dirty(bh); |
3273 | if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) | 3280 | if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) |
3274 | err = ext4_jbd2_file_inode(handle, inode); | 3281 | err = ext4_jbd2_file_inode(handle, inode); |
3275 | } | 3282 | } |
3276 | 3283 | ||
3277 | unlock: | 3284 | unlock: |
3278 | unlock_page(page); | 3285 | unlock_page(page); |
3279 | page_cache_release(page); | 3286 | page_cache_release(page); |
3280 | return err; | 3287 | return err; |
3281 | } | 3288 | } |
3282 | 3289 | ||
3283 | /* | 3290 | /* |
3284 | * ext4_block_zero_page_range() zeros out a mapping of length 'length' | 3291 | * ext4_block_zero_page_range() zeros out a mapping of length 'length' |
3285 | * starting from file offset 'from'. The range to be zeroed must | 3292 | * starting from file offset 'from'. The range to be zeroed must |
3286 | * be contained within one block. If the specified range exceeds | 3293 | * be contained within one block. If the specified range exceeds |
3287 | * the end of the block, it will be shortened to the end of the block | 3294 | * the end of the block, it will be shortened to the end of the block |
3288 | * that corresponds to 'from'. | 3295 | * that corresponds to 'from'. |
3289 | */ | 3296 | */ |
3290 | static int ext4_block_zero_page_range(handle_t *handle, | 3297 | static int ext4_block_zero_page_range(handle_t *handle, |
3291 | struct address_space *mapping, loff_t from, loff_t length) | 3298 | struct address_space *mapping, loff_t from, loff_t length) |
3292 | { | 3299 | { |
3293 | struct inode *inode = mapping->host; | 3300 | struct inode *inode = mapping->host; |
3294 | unsigned offset = from & (PAGE_CACHE_SIZE-1); | 3301 | unsigned offset = from & (PAGE_CACHE_SIZE-1); |
3295 | unsigned blocksize = inode->i_sb->s_blocksize; | 3302 | unsigned blocksize = inode->i_sb->s_blocksize; |
3296 | unsigned max = blocksize - (offset & (blocksize - 1)); | 3303 | unsigned max = blocksize - (offset & (blocksize - 1)); |
3297 | 3304 | ||
3298 | /* | 3305 | /* |
3299 | * correct length if it does not fall between | 3306 | * correct length if it does not fall between |
3300 | * 'from' and the end of the block | 3307 | * 'from' and the end of the block |
3301 | */ | 3308 | */ |
3302 | if (length > max || length < 0) | 3309 | if (length > max || length < 0) |
3303 | length = max; | 3310 | length = max; |
3304 | 3311 | ||
3305 | if (IS_DAX(inode)) | 3312 | if (IS_DAX(inode)) |
3306 | return dax_zero_page_range(inode, from, length, ext4_get_block); | 3313 | return dax_zero_page_range(inode, from, length, ext4_get_block); |
3307 | return __ext4_block_zero_page_range(handle, mapping, from, length); | 3314 | return __ext4_block_zero_page_range(handle, mapping, from, length); |
3308 | } | 3315 | } |
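A worked example of the clamping above, under assumed numbers: with a 1024-byte block size and from = 5000, the offset within the block is 904, so at most 120 bytes can be zeroed before the block boundary, and any longer (or negative) length is clamped to that. The arithmetic only, as a standalone sketch:

#include <stdio.h>

int main(void)
{
	unsigned blocksize = 1024;
	unsigned long long from = 5000;
	long long length = 4096;	/* caller asked for too much */

	unsigned offset_in_block = from & (blocksize - 1);	/* 904 */
	unsigned max = blocksize - offset_in_block;		/* 120 */

	if (length > max || length < 0)
		length = max;	/* clamp to the end of 'from's block */

	printf("zero %lld bytes starting at offset %llu\n", length, from);
	return 0;
}

This prints "zero 120 bytes starting at offset 5000", i.e. the zeroing never crosses out of the block containing 'from'.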
3309 | 3316 | ||
3310 | /* | 3317 | /* |
3311 | * ext4_block_truncate_page() zeroes out a mapping from file offset `from' | 3318 | * ext4_block_truncate_page() zeroes out a mapping from file offset `from' |
3312 | * up to the end of the block which corresponds to `from'. | 3319 | * up to the end of the block which corresponds to `from'. |
3313 | * This is required during truncate. We need to physically zero the tail end | 3320 | * This is required during truncate. We need to physically zero the tail end |
3314 | * of that block so it doesn't yield old data if the file is later grown. | 3321 | * of that block so it doesn't yield old data if the file is later grown. |
3315 | */ | 3322 | */ |
3316 | static int ext4_block_truncate_page(handle_t *handle, | 3323 | static int ext4_block_truncate_page(handle_t *handle, |
3317 | struct address_space *mapping, loff_t from) | 3324 | struct address_space *mapping, loff_t from) |
3318 | { | 3325 | { |
3319 | unsigned offset = from & (PAGE_CACHE_SIZE-1); | 3326 | unsigned offset = from & (PAGE_CACHE_SIZE-1); |
3320 | unsigned length; | 3327 | unsigned length; |
3321 | unsigned blocksize; | 3328 | unsigned blocksize; |
3322 | struct inode *inode = mapping->host; | 3329 | struct inode *inode = mapping->host; |
3323 | 3330 | ||
3324 | blocksize = inode->i_sb->s_blocksize; | 3331 | blocksize = inode->i_sb->s_blocksize; |
3325 | length = blocksize - (offset & (blocksize - 1)); | 3332 | length = blocksize - (offset & (blocksize - 1)); |
3326 | 3333 | ||
3327 | return ext4_block_zero_page_range(handle, mapping, from, length); | 3334 | return ext4_block_zero_page_range(handle, mapping, from, length); |
3328 | } | 3335 | } |
3329 | 3336 | ||
3330 | int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, | 3337 | int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, |
3331 | loff_t lstart, loff_t length) | 3338 | loff_t lstart, loff_t length) |
3332 | { | 3339 | { |
3333 | struct super_block *sb = inode->i_sb; | 3340 | struct super_block *sb = inode->i_sb; |
3334 | struct address_space *mapping = inode->i_mapping; | 3341 | struct address_space *mapping = inode->i_mapping; |
3335 | unsigned partial_start, partial_end; | 3342 | unsigned partial_start, partial_end; |
3336 | ext4_fsblk_t start, end; | 3343 | ext4_fsblk_t start, end; |
3337 | loff_t byte_end = (lstart + length - 1); | 3344 | loff_t byte_end = (lstart + length - 1); |
3338 | int err = 0; | 3345 | int err = 0; |
3339 | 3346 | ||
3340 | partial_start = lstart & (sb->s_blocksize - 1); | 3347 | partial_start = lstart & (sb->s_blocksize - 1); |
3341 | partial_end = byte_end & (sb->s_blocksize - 1); | 3348 | partial_end = byte_end & (sb->s_blocksize - 1); |
3342 | 3349 | ||
3343 | start = lstart >> sb->s_blocksize_bits; | 3350 | start = lstart >> sb->s_blocksize_bits; |
3344 | end = byte_end >> sb->s_blocksize_bits; | 3351 | end = byte_end >> sb->s_blocksize_bits; |
3345 | 3352 | ||
3346 | /* Handle partial zero within the single block */ | 3353 | /* Handle partial zero within the single block */ |
3347 | if (start == end && | 3354 | if (start == end && |
3348 | (partial_start || (partial_end != sb->s_blocksize - 1))) { | 3355 | (partial_start || (partial_end != sb->s_blocksize - 1))) { |
3349 | err = ext4_block_zero_page_range(handle, mapping, | 3356 | err = ext4_block_zero_page_range(handle, mapping, |
3350 | lstart, length); | 3357 | lstart, length); |
3351 | return err; | 3358 | return err; |
3352 | } | 3359 | } |
3353 | /* Handle partial zero out on the start of the range */ | 3360 | /* Handle partial zero out on the start of the range */ |
3354 | if (partial_start) { | 3361 | if (partial_start) { |
3355 | err = ext4_block_zero_page_range(handle, mapping, | 3362 | err = ext4_block_zero_page_range(handle, mapping, |
3356 | lstart, sb->s_blocksize); | 3363 | lstart, sb->s_blocksize); |
3357 | if (err) | 3364 | if (err) |
3358 | return err; | 3365 | return err; |
3359 | } | 3366 | } |
3360 | /* Handle partial zero out on the end of the range */ | 3367 | /* Handle partial zero out on the end of the range */ |
3361 | if (partial_end != sb->s_blocksize - 1) | 3368 | if (partial_end != sb->s_blocksize - 1) |
3362 | err = ext4_block_zero_page_range(handle, mapping, | 3369 | err = ext4_block_zero_page_range(handle, mapping, |
3363 | byte_end - partial_end, | 3370 | byte_end - partial_end, |
3364 | partial_end + 1); | 3371 | partial_end + 1); |
3365 | return err; | 3372 | return err; |
3366 | } | 3373 | } |
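The case analysis in ext4_zero_partial_blocks() splits the byte range into an unaligned head, whole blocks (left for the caller to free), and an unaligned tail. A standalone sketch with assumed values (4096-byte blocks, lstart = 1000, length = 10000), where the printed head range reflects the clamping done by ext4_block_zero_page_range():

#include <stdio.h>

int main(void)
{
	unsigned long long blocksize = 4096, bits = 12;
	unsigned long long lstart = 1000, length = 10000;
	unsigned long long byte_end = lstart + length - 1;	/* 10999 */

	unsigned partial_start = lstart & (blocksize - 1);	/* 1000 */
	unsigned partial_end = byte_end & (blocksize - 1);	/* 2807 */
	unsigned long long start = lstart >> bits;		/* block 0 */
	unsigned long long end = byte_end >> bits;		/* block 2 */

	if (start == end) {
		printf("single-block partial zero\n");
	} else {
		if (partial_start)
			printf("zero head: %llu..%llu\n",
			       lstart, (start + 1) * blocksize - 1);
		if (partial_end != blocksize - 1)
			printf("zero tail: %llu..%llu\n",
			       byte_end - partial_end, byte_end);
	}
	return 0;
}

Here the head 1000..4095 and the tail 8192..10999 are zeroed, and only block 1 (bytes 4096..8191) is left for the block-removal code that follows in the caller.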
3367 | 3374 | ||
3368 | int ext4_can_truncate(struct inode *inode) | 3375 | int ext4_can_truncate(struct inode *inode) |
3369 | { | 3376 | { |
3370 | if (S_ISREG(inode->i_mode)) | 3377 | if (S_ISREG(inode->i_mode)) |
3371 | return 1; | 3378 | return 1; |
3372 | if (S_ISDIR(inode->i_mode)) | 3379 | if (S_ISDIR(inode->i_mode)) |
3373 | return 1; | 3380 | return 1; |
3374 | if (S_ISLNK(inode->i_mode)) | 3381 | if (S_ISLNK(inode->i_mode)) |
3375 | return !ext4_inode_is_fast_symlink(inode); | 3382 | return !ext4_inode_is_fast_symlink(inode); |
3376 | return 0; | 3383 | return 0; |
3377 | } | 3384 | } |
3378 | 3385 | ||
3379 | /* | 3386 | /* |
3380 | * ext4_punch_hole: punches a hole in a file by releasing the blocks | 3387 | * ext4_punch_hole: punches a hole in a file by releasing the blocks |
3381 | * associated with the given offset and length | 3388 | * associated with the given offset and length |
3382 | * | 3389 | * |
3383 | * @inode: File inode | 3390 | * @inode: File inode |
3384 | * @offset: The offset where the hole will begin | 3391 | * @offset: The offset where the hole will begin |
3385 | * @len: The length of the hole | 3392 | * @len: The length of the hole |
3386 | * | 3393 | * |
3387 | * Returns: 0 on success or negative on failure | 3394 | * Returns: 0 on success or negative on failure |
3388 | */ | 3395 | */ |
3389 | 3396 | ||
3390 | int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) | 3397 | int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) |
3391 | { | 3398 | { |
3392 | struct super_block *sb = inode->i_sb; | 3399 | struct super_block *sb = inode->i_sb; |
3393 | ext4_lblk_t first_block, stop_block; | 3400 | ext4_lblk_t first_block, stop_block; |
3394 | struct address_space *mapping = inode->i_mapping; | 3401 | struct address_space *mapping = inode->i_mapping; |
3395 | loff_t first_block_offset, last_block_offset; | 3402 | loff_t first_block_offset, last_block_offset; |
3396 | handle_t *handle; | 3403 | handle_t *handle; |
3397 | unsigned int credits; | 3404 | unsigned int credits; |
3398 | int ret = 0; | 3405 | int ret = 0; |
3399 | 3406 | ||
3400 | if (!S_ISREG(inode->i_mode)) | 3407 | if (!S_ISREG(inode->i_mode)) |
3401 | return -EOPNOTSUPP; | 3408 | return -EOPNOTSUPP; |
3402 | 3409 | ||
3403 | trace_ext4_punch_hole(inode, offset, length, 0); | 3410 | trace_ext4_punch_hole(inode, offset, length, 0); |
3404 | 3411 | ||
3405 | /* | 3412 | /* |
3406 | * Write out all dirty pages to avoid race conditions, | 3413 | * Write out all dirty pages to avoid race conditions, |
3407 | * then release them. | 3414 | * then release them. |
3408 | */ | 3415 | */ |
3409 | if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { | 3416 | if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { |
3410 | ret = filemap_write_and_wait_range(mapping, offset, | 3417 | ret = filemap_write_and_wait_range(mapping, offset, |
3411 | offset + length - 1); | 3418 | offset + length - 1); |
3412 | if (ret) | 3419 | if (ret) |
3413 | return ret; | 3420 | return ret; |
3414 | } | 3421 | } |
3415 | 3422 | ||
3416 | mutex_lock(&inode->i_mutex); | 3423 | mutex_lock(&inode->i_mutex); |
3417 | 3424 | ||
3418 | /* No need to punch hole beyond i_size */ | 3425 | /* No need to punch hole beyond i_size */ |
3419 | if (offset >= inode->i_size) | 3426 | if (offset >= inode->i_size) |
3420 | goto out_mutex; | 3427 | goto out_mutex; |
3421 | 3428 | ||
3422 | /* | 3429 | /* |
3423 | * If the hole extends beyond i_size, set the hole | 3430 | * If the hole extends beyond i_size, set the hole |
3424 | * to end after the page that contains i_size | 3431 | * to end after the page that contains i_size |
3425 | */ | 3432 | */ |
3426 | if (offset + length > inode->i_size) { | 3433 | if (offset + length > inode->i_size) { |
3427 | length = inode->i_size + | 3434 | length = inode->i_size + |
3428 | PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - | 3435 | PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - |
3429 | offset; | 3436 | offset; |
3430 | } | 3437 | } |
3431 | 3438 | ||
3432 | if (offset & (sb->s_blocksize - 1) || | 3439 | if (offset & (sb->s_blocksize - 1) || |
3433 | (offset + length) & (sb->s_blocksize - 1)) { | 3440 | (offset + length) & (sb->s_blocksize - 1)) { |
3434 | /* | 3441 | /* |
3435 | * Attach jinode to inode for jbd2 if we do any zeroing of | 3442 | * Attach jinode to inode for jbd2 if we do any zeroing of |
3436 | * a partial block | 3443 | * a partial block |
3437 | */ | 3444 | */ |
3438 | ret = ext4_inode_attach_jinode(inode); | 3445 | ret = ext4_inode_attach_jinode(inode); |
3439 | if (ret < 0) | 3446 | if (ret < 0) |
3440 | goto out_mutex; | 3447 | goto out_mutex; |
3441 | 3448 | ||
3442 | } | 3449 | } |
3443 | 3450 | ||
3444 | first_block_offset = round_up(offset, sb->s_blocksize); | 3451 | first_block_offset = round_up(offset, sb->s_blocksize); |
3445 | last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; | 3452 | last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; |
3446 | 3453 | ||
3447 | /* Now release the pages and zero the block-aligned part of the pages */ | 3454 | /* Now release the pages and zero the block-aligned part of the pages */ |
3448 | if (last_block_offset > first_block_offset) | 3455 | if (last_block_offset > first_block_offset) |
3449 | truncate_pagecache_range(inode, first_block_offset, | 3456 | truncate_pagecache_range(inode, first_block_offset, |
3450 | last_block_offset); | 3457 | last_block_offset); |
3451 | 3458 | ||
3452 | /* Wait for all existing DIO workers; newcomers will block on i_mutex */ | 3459 | /* Wait for all existing DIO workers; newcomers will block on i_mutex */ |
3453 | ext4_inode_block_unlocked_dio(inode); | 3460 | ext4_inode_block_unlocked_dio(inode); |
3454 | inode_dio_wait(inode); | 3461 | inode_dio_wait(inode); |
3455 | 3462 | ||
3456 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) | 3463 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) |
3457 | credits = ext4_writepage_trans_blocks(inode); | 3464 | credits = ext4_writepage_trans_blocks(inode); |
3458 | else | 3465 | else |
3459 | credits = ext4_blocks_for_truncate(inode); | 3466 | credits = ext4_blocks_for_truncate(inode); |
3460 | handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); | 3467 | handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); |
3461 | if (IS_ERR(handle)) { | 3468 | if (IS_ERR(handle)) { |
3462 | ret = PTR_ERR(handle); | 3469 | ret = PTR_ERR(handle); |
3463 | ext4_std_error(sb, ret); | 3470 | ext4_std_error(sb, ret); |
3464 | goto out_dio; | 3471 | goto out_dio; |
3465 | } | 3472 | } |
3466 | 3473 | ||
3467 | ret = ext4_zero_partial_blocks(handle, inode, offset, | 3474 | ret = ext4_zero_partial_blocks(handle, inode, offset, |
3468 | length); | 3475 | length); |
3469 | if (ret) | 3476 | if (ret) |
3470 | goto out_stop; | 3477 | goto out_stop; |
3471 | 3478 | ||
3472 | first_block = (offset + sb->s_blocksize - 1) >> | 3479 | first_block = (offset + sb->s_blocksize - 1) >> |
3473 | EXT4_BLOCK_SIZE_BITS(sb); | 3480 | EXT4_BLOCK_SIZE_BITS(sb); |
3474 | stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); | 3481 | stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); |
3475 | 3482 | ||
3476 | /* If there are no blocks to remove, return now */ | 3483 | /* If there are no blocks to remove, return now */ |
3477 | if (first_block >= stop_block) | 3484 | if (first_block >= stop_block) |
3478 | goto out_stop; | 3485 | goto out_stop; |
3479 | 3486 | ||
3480 | down_write(&EXT4_I(inode)->i_data_sem); | 3487 | down_write(&EXT4_I(inode)->i_data_sem); |
3481 | ext4_discard_preallocations(inode); | 3488 | ext4_discard_preallocations(inode); |
3482 | 3489 | ||
3483 | ret = ext4_es_remove_extent(inode, first_block, | 3490 | ret = ext4_es_remove_extent(inode, first_block, |
3484 | stop_block - first_block); | 3491 | stop_block - first_block); |
3485 | if (ret) { | 3492 | if (ret) { |
3486 | up_write(&EXT4_I(inode)->i_data_sem); | 3493 | up_write(&EXT4_I(inode)->i_data_sem); |
3487 | goto out_stop; | 3494 | goto out_stop; |
3488 | } | 3495 | } |
3489 | 3496 | ||
3490 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) | 3497 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) |
3491 | ret = ext4_ext_remove_space(inode, first_block, | 3498 | ret = ext4_ext_remove_space(inode, first_block, |
3492 | stop_block - 1); | 3499 | stop_block - 1); |
3493 | else | 3500 | else |
3494 | ret = ext4_ind_remove_space(handle, inode, first_block, | 3501 | ret = ext4_ind_remove_space(handle, inode, first_block, |
3495 | stop_block); | 3502 | stop_block); |
3496 | 3503 | ||
3497 | up_write(&EXT4_I(inode)->i_data_sem); | 3504 | up_write(&EXT4_I(inode)->i_data_sem); |
3498 | if (IS_SYNC(inode)) | 3505 | if (IS_SYNC(inode)) |
3499 | ext4_handle_sync(handle); | 3506 | ext4_handle_sync(handle); |
3500 | 3507 | ||
3501 | /* Now release the pages again to reduce the race window */ | 3508 | /* Now release the pages again to reduce the race window */ |
3502 | if (last_block_offset > first_block_offset) | 3509 | if (last_block_offset > first_block_offset) |
3503 | truncate_pagecache_range(inode, first_block_offset, | 3510 | truncate_pagecache_range(inode, first_block_offset, |
3504 | last_block_offset); | 3511 | last_block_offset); |
3505 | 3512 | ||
3506 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); | 3513 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); |
3507 | ext4_mark_inode_dirty(handle, inode); | 3514 | ext4_mark_inode_dirty(handle, inode); |
3508 | out_stop: | 3515 | out_stop: |
3509 | ext4_journal_stop(handle); | 3516 | ext4_journal_stop(handle); |
3510 | out_dio: | 3517 | out_dio: |
3511 | ext4_inode_resume_unlocked_dio(inode); | 3518 | ext4_inode_resume_unlocked_dio(inode); |
3512 | out_mutex: | 3519 | out_mutex: |
3513 | mutex_unlock(&inode->i_mutex); | 3520 | mutex_unlock(&inode->i_mutex); |
3514 | return ret; | 3521 | return ret; |
3515 | } | 3522 | } |
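The hole geometry in ext4_punch_hole() is derived by rounding the requested byte range inward to block boundaries: only whole blocks inside [first_block, stop_block) are actually freed, while the unaligned edges are merely zeroed. A worked standalone example with an assumed 4096-byte block size:

#include <stdio.h>

#define BLOCKSIZE 4096ULL

static unsigned long long round_up_bs(unsigned long long x)
{
	return (x + BLOCKSIZE - 1) & ~(BLOCKSIZE - 1);
}

static unsigned long long round_down_bs(unsigned long long x)
{
	return x & ~(BLOCKSIZE - 1);
}

int main(void)
{
	unsigned long long offset = 1000, length = 10000;

	unsigned long long first_block_offset = round_up_bs(offset);		   /* 4096 */
	unsigned long long last_block_offset = round_down_bs(offset + length) - 1; /* 8191 */
	unsigned long long first_block = (offset + BLOCKSIZE - 1) / BLOCKSIZE;	   /* 1 */
	unsigned long long stop_block = (offset + length) / BLOCKSIZE;		   /* 2 */

	printf("truncate page cache: [%llu, %llu]\n",
	       first_block_offset, last_block_offset);
	printf("remove blocks: [%llu, %llu)\n", first_block, stop_block);
	return 0;
}

With these inputs only block 1 is freed; bytes 1000..4095 and 8192..10999 are handled by ext4_zero_partial_blocks() above, and if first_block >= stop_block there is nothing to remove at all.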
3516 | 3523 | ||
3517 | int ext4_inode_attach_jinode(struct inode *inode) | 3524 | int ext4_inode_attach_jinode(struct inode *inode) |
3518 | { | 3525 | { |
3519 | struct ext4_inode_info *ei = EXT4_I(inode); | 3526 | struct ext4_inode_info *ei = EXT4_I(inode); |
3520 | struct jbd2_inode *jinode; | 3527 | struct jbd2_inode *jinode; |
3521 | 3528 | ||
3522 | if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal) | 3529 | if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal) |
3523 | return 0; | 3530 | return 0; |
3524 | 3531 | ||
3525 | jinode = jbd2_alloc_inode(GFP_KERNEL); | 3532 | jinode = jbd2_alloc_inode(GFP_KERNEL); |
3526 | spin_lock(&inode->i_lock); | 3533 | spin_lock(&inode->i_lock); |
3527 | if (!ei->jinode) { | 3534 | if (!ei->jinode) { |
3528 | if (!jinode) { | 3535 | if (!jinode) { |
3529 | spin_unlock(&inode->i_lock); | 3536 | spin_unlock(&inode->i_lock); |
3530 | return -ENOMEM; | 3537 | return -ENOMEM; |
3531 | } | 3538 | } |
3532 | ei->jinode = jinode; | 3539 | ei->jinode = jinode; |
3533 | jbd2_journal_init_jbd_inode(ei->jinode, inode); | 3540 | jbd2_journal_init_jbd_inode(ei->jinode, inode); |
3534 | jinode = NULL; | 3541 | jinode = NULL; |
3535 | } | 3542 | } |
3536 | spin_unlock(&inode->i_lock); | 3543 | spin_unlock(&inode->i_lock); |
3537 | if (unlikely(jinode != NULL)) | 3544 | if (unlikely(jinode != NULL)) |
3538 | jbd2_free_inode(jinode); | 3545 | jbd2_free_inode(jinode); |
3539 | return 0; | 3546 | return 0; |
3540 | } | 3547 | } |
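ext4_inode_attach_jinode() uses the classic optimistic-init pattern: the allocation (which may sleep) happens before taking the spinlock, the pointer is installed under the lock only if it is still unset, and the allocation is freed if another thread won the race. A userspace sketch of the same shape, with a pthread mutex standing in for i_lock and illustrative names throughout:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct state {
	pthread_mutex_t lock;
	void *jinode;	/* lazily attached object */
};

static int attach_jinode(struct state *s)
{
	void *fresh;

	if (s->jinode)		/* unlocked fast path, as in the kernel */
		return 0;

	fresh = malloc(64);	/* "may sleep", so done before locking */

	pthread_mutex_lock(&s->lock);
	if (!s->jinode) {
		if (!fresh) {
			pthread_mutex_unlock(&s->lock);
			return -1;	/* -ENOMEM in the kernel */
		}
		s->jinode = fresh;	/* we won the race: install it */
		fresh = NULL;
	}
	pthread_mutex_unlock(&s->lock);

	free(fresh);	/* NULL if installed; frees a racing loser's copy */
	return 0;
}

int main(void)
{
	struct state s = { PTHREAD_MUTEX_INITIALIZER, NULL };

	attach_jinode(&s);
	attach_jinode(&s);	/* second call finds it already attached */
	printf("attached: %p\n", s.jinode);
	free(s.jinode);
	return 0;
}

Allocating before the lock keeps a sleeping allocation out of the spinlock's critical section, at the cost of one wasted allocation when two callers race.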
3541 | 3548 | ||
3542 | /* | 3549 | /* |
3543 | * ext4_truncate() | 3550 | * ext4_truncate() |
3544 | * | 3551 | * |
3545 | * We block out ext4_get_block() block instantiations across the entire | 3552 | * We block out ext4_get_block() block instantiations across the entire |
3546 | * transaction, and VFS/VM ensures that ext4_truncate() cannot run | 3553 | * transaction, and VFS/VM ensures that ext4_truncate() cannot run |
3547 | * simultaneously on behalf of the same inode. | 3554 | * simultaneously on behalf of the same inode. |
3548 | * | 3555 | * |
3549 | * As we work through the truncate and commit bits of it to the journal there | 3556 | * As we work through the truncate and commit bits of it to the journal there |
3550 | * is one core, guiding principle: the file's tree must always be consistent on | 3557 | * is one core, guiding principle: the file's tree must always be consistent on |
3551 | * disk. We must be able to restart the truncate after a crash. | 3558 | * disk. We must be able to restart the truncate after a crash. |
3552 | * | 3559 | * |
3553 | * The file's tree may be transiently inconsistent in memory (although it | 3560 | * The file's tree may be transiently inconsistent in memory (although it |
3554 | * probably isn't), but whenever we close off and commit a journal transaction, | 3561 | * probably isn't), but whenever we close off and commit a journal transaction, |
3555 | * the contents of (the filesystem + the journal) must be consistent and | 3562 | * the contents of (the filesystem + the journal) must be consistent and |
3556 | * restartable. It's pretty simple, really: bottom up, right to left (although | 3563 | * restartable. It's pretty simple, really: bottom up, right to left (although |
3557 | * left-to-right works OK too). | 3564 | * left-to-right works OK too). |
3558 | * | 3565 | * |
3559 | * Note that at recovery time, journal replay occurs *before* the restart of | 3566 | * Note that at recovery time, journal replay occurs *before* the restart of |
3560 | * truncate against the orphan inode list. | 3567 | * truncate against the orphan inode list. |
3561 | * | 3568 | * |
3562 | * The committed inode has the new, desired i_size (which is the same as | 3569 | * The committed inode has the new, desired i_size (which is the same as |
3563 | * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see | 3570 | * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see |
3564 | * that this inode's truncate did not complete and it will again call | 3571 | * that this inode's truncate did not complete and it will again call |
3565 | * ext4_truncate() to have another go. So there will be instantiated blocks | 3572 | * ext4_truncate() to have another go. So there will be instantiated blocks |
3566 | * to the right of the truncation point in a crashed ext4 filesystem. But | 3573 | * to the right of the truncation point in a crashed ext4 filesystem. But |
3567 | * that's fine - as long as they are linked from the inode, the post-crash | 3574 | * that's fine - as long as they are linked from the inode, the post-crash |
3568 | * ext4_truncate() run will find them and release them. | 3575 | * ext4_truncate() run will find them and release them. |
3569 | */ | 3576 | */ |
3570 | void ext4_truncate(struct inode *inode) | 3577 | void ext4_truncate(struct inode *inode) |
3571 | { | 3578 | { |
3572 | struct ext4_inode_info *ei = EXT4_I(inode); | 3579 | struct ext4_inode_info *ei = EXT4_I(inode); |
3573 | unsigned int credits; | 3580 | unsigned int credits; |
3574 | handle_t *handle; | 3581 | handle_t *handle; |
3575 | struct address_space *mapping = inode->i_mapping; | 3582 | struct address_space *mapping = inode->i_mapping; |
3576 | 3583 | ||
3577 | /* | 3584 | /* |
3578 | * There is a possibility that we're either freeing the inode | 3585 | * There is a possibility that we're either freeing the inode |
3579 | * or it's a completely new inode. In those cases we might not | 3586 | * or it's a completely new inode. In those cases we might not |
3580 | * have i_mutex locked because it's not necessary. | 3587 | * have i_mutex locked because it's not necessary. |
3581 | */ | 3588 | */ |
3582 | if (!(inode->i_state & (I_NEW|I_FREEING))) | 3589 | if (!(inode->i_state & (I_NEW|I_FREEING))) |
3583 | WARN_ON(!mutex_is_locked(&inode->i_mutex)); | 3590 | WARN_ON(!mutex_is_locked(&inode->i_mutex)); |
3584 | trace_ext4_truncate_enter(inode); | 3591 | trace_ext4_truncate_enter(inode); |
3585 | 3592 | ||
3586 | if (!ext4_can_truncate(inode)) | 3593 | if (!ext4_can_truncate(inode)) |
3587 | return; | 3594 | return; |
3588 | 3595 | ||
3589 | ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); | 3596 | ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); |
3590 | 3597 | ||
3591 | if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) | 3598 | if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) |
3592 | ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); | 3599 | ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); |
3593 | 3600 | ||
3594 | if (ext4_has_inline_data(inode)) { | 3601 | if (ext4_has_inline_data(inode)) { |
3595 | int has_inline = 1; | 3602 | int has_inline = 1; |
3596 | 3603 | ||
3597 | ext4_inline_data_truncate(inode, &has_inline); | 3604 | ext4_inline_data_truncate(inode, &has_inline); |
3598 | if (has_inline) | 3605 | if (has_inline) |
3599 | return; | 3606 | return; |
3600 | } | 3607 | } |
3601 | 3608 | ||
3602 | /* If we zero out the tail of the page, we have to create a jinode for jbd2 */ | 3609 | /* If we zero out the tail of the page, we have to create a jinode for jbd2 */ |
3603 | if (inode->i_size & (inode->i_sb->s_blocksize - 1)) { | 3610 | if (inode->i_size & (inode->i_sb->s_blocksize - 1)) { |
3604 | if (ext4_inode_attach_jinode(inode) < 0) | 3611 | if (ext4_inode_attach_jinode(inode) < 0) |
3605 | return; | 3612 | return; |
3606 | } | 3613 | } |
3607 | 3614 | ||
3608 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) | 3615 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) |
3609 | credits = ext4_writepage_trans_blocks(inode); | 3616 | credits = ext4_writepage_trans_blocks(inode); |
3610 | else | 3617 | else |
3611 | credits = ext4_blocks_for_truncate(inode); | 3618 | credits = ext4_blocks_for_truncate(inode); |
3612 | 3619 | ||
3613 | handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); | 3620 | handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); |
3614 | if (IS_ERR(handle)) { | 3621 | if (IS_ERR(handle)) { |
3615 | ext4_std_error(inode->i_sb, PTR_ERR(handle)); | 3622 | ext4_std_error(inode->i_sb, PTR_ERR(handle)); |
3616 | return; | 3623 | return; |
3617 | } | 3624 | } |
3618 | 3625 | ||
3619 | if (inode->i_size & (inode->i_sb->s_blocksize - 1)) | 3626 | if (inode->i_size & (inode->i_sb->s_blocksize - 1)) |
3620 | ext4_block_truncate_page(handle, mapping, inode->i_size); | 3627 | ext4_block_truncate_page(handle, mapping, inode->i_size); |
3621 | 3628 | ||
3622 | /* | 3629 | /* |
3623 | * We add the inode to the orphan list, so that if this | 3630 | * We add the inode to the orphan list, so that if this |
3624 | * truncate spans multiple transactions, and we crash, we will | 3631 | * truncate spans multiple transactions, and we crash, we will |
3625 | * resume the truncate when the filesystem recovers. It also | 3632 | * resume the truncate when the filesystem recovers. It also |
3626 | * marks the inode dirty, to catch the new size. | 3633 | * marks the inode dirty, to catch the new size. |
3627 | * | 3634 | * |
3628 | * Implication: the file must always be in a sane, consistent | 3635 | * Implication: the file must always be in a sane, consistent |
3629 | * truncatable state while each transaction commits. | 3636 | * truncatable state while each transaction commits. |
3630 | */ | 3637 | */ |
3631 | if (ext4_orphan_add(handle, inode)) | 3638 | if (ext4_orphan_add(handle, inode)) |
3632 | goto out_stop; | 3639 | goto out_stop; |
3633 | 3640 | ||
3634 | down_write(&EXT4_I(inode)->i_data_sem); | 3641 | down_write(&EXT4_I(inode)->i_data_sem); |
3635 | 3642 | ||
3636 | ext4_discard_preallocations(inode); | 3643 | ext4_discard_preallocations(inode); |
3637 | 3644 | ||
3638 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) | 3645 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) |
3639 | ext4_ext_truncate(handle, inode); | 3646 | ext4_ext_truncate(handle, inode); |
3640 | else | 3647 | else |
3641 | ext4_ind_truncate(handle, inode); | 3648 | ext4_ind_truncate(handle, inode); |
3642 | 3649 | ||
3643 | up_write(&ei->i_data_sem); | 3650 | up_write(&ei->i_data_sem); |
3644 | 3651 | ||
3645 | if (IS_SYNC(inode)) | 3652 | if (IS_SYNC(inode)) |
3646 | ext4_handle_sync(handle); | 3653 | ext4_handle_sync(handle); |
3647 | 3654 | ||
3648 | out_stop: | 3655 | out_stop: |
3649 | /* | 3656 | /* |
3650 | * If this was a simple ftruncate() and the file will remain alive, | 3657 | * If this was a simple ftruncate() and the file will remain alive, |
3651 | * then we need to clear up the orphan record which we created above. | 3658 | * then we need to clear up the orphan record which we created above. |
3652 | * However, if this was a real unlink then we were called by | 3659 | * However, if this was a real unlink then we were called by |
3653 | * ext4_evict_inode(), and we allow that function to clean up the | 3660 | * ext4_evict_inode(), and we allow that function to clean up the |
3654 | * orphan info for us. | 3661 | * orphan info for us. |
3655 | */ | 3662 | */ |
3656 | if (inode->i_nlink) | 3663 | if (inode->i_nlink) |
3657 | ext4_orphan_del(handle, inode); | 3664 | ext4_orphan_del(handle, inode); |
3658 | 3665 | ||
3659 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); | 3666 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); |
3660 | ext4_mark_inode_dirty(handle, inode); | 3667 | ext4_mark_inode_dirty(handle, inode); |
3661 | ext4_journal_stop(handle); | 3668 | ext4_journal_stop(handle); |
3662 | 3669 | ||
3663 | trace_ext4_truncate_exit(inode); | 3670 | trace_ext4_truncate_exit(inode); |
3664 | } | 3671 | } |
3665 | 3672 | ||
3666 | /* | 3673 | /* |
3667 | * ext4_get_inode_loc returns with an extra refcount against the inode's | 3674 | * ext4_get_inode_loc returns with an extra refcount against the inode's |
3668 | * underlying buffer_head on success. If 'in_mem' is true, we have all | 3675 | * underlying buffer_head on success. If 'in_mem' is true, we have all |
3669 | * data in memory that is needed to recreate the on-disk version of this | 3676 | * data in memory that is needed to recreate the on-disk version of this |
3670 | * inode. | 3677 | * inode. |
3671 | */ | 3678 | */ |
3672 | static int __ext4_get_inode_loc(struct inode *inode, | 3679 | static int __ext4_get_inode_loc(struct inode *inode, |
3673 | struct ext4_iloc *iloc, int in_mem) | 3680 | struct ext4_iloc *iloc, int in_mem) |
3674 | { | 3681 | { |
3675 | struct ext4_group_desc *gdp; | 3682 | struct ext4_group_desc *gdp; |
3676 | struct buffer_head *bh; | 3683 | struct buffer_head *bh; |
3677 | struct super_block *sb = inode->i_sb; | 3684 | struct super_block *sb = inode->i_sb; |
3678 | ext4_fsblk_t block; | 3685 | ext4_fsblk_t block; |
3679 | int inodes_per_block, inode_offset; | 3686 | int inodes_per_block, inode_offset; |
3680 | 3687 | ||
3681 | iloc->bh = NULL; | 3688 | iloc->bh = NULL; |
3682 | if (!ext4_valid_inum(sb, inode->i_ino)) | 3689 | if (!ext4_valid_inum(sb, inode->i_ino)) |
3683 | return -EIO; | 3690 | return -EIO; |
3684 | 3691 | ||
3685 | iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb); | 3692 | iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb); |
3686 | gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); | 3693 | gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); |
3687 | if (!gdp) | 3694 | if (!gdp) |
3688 | return -EIO; | 3695 | return -EIO; |
3689 | 3696 | ||
3690 | /* | 3697 | /* |
3691 | * Figure out the offset within the block group inode table | 3698 | * Figure out the offset within the block group inode table |
3692 | */ | 3699 | */ |
3693 | inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; | 3700 | inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; |
3694 | inode_offset = ((inode->i_ino - 1) % | 3701 | inode_offset = ((inode->i_ino - 1) % |
3695 | EXT4_INODES_PER_GROUP(sb)); | 3702 | EXT4_INODES_PER_GROUP(sb)); |
3696 | block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); | 3703 | block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); |
3697 | iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); | 3704 | iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); |
3698 | 3705 | ||
3699 | bh = sb_getblk(sb, block); | 3706 | bh = sb_getblk(sb, block); |
3700 | if (unlikely(!bh)) | 3707 | if (unlikely(!bh)) |
3701 | return -ENOMEM; | 3708 | return -ENOMEM; |
3702 | if (!buffer_uptodate(bh)) { | 3709 | if (!buffer_uptodate(bh)) { |
3703 | lock_buffer(bh); | 3710 | lock_buffer(bh); |
3704 | 3711 | ||
3705 | /* | 3712 | /* |
3706 | * If the buffer has the write error flag, we have failed | 3713 | * If the buffer has the write error flag, we have failed |
3707 | * to write out another inode in the same block. In this | 3714 | * to write out another inode in the same block. In this |
3708 | * case, we don't have to read the block: re-reading | 3715 | * case, we don't have to read the block: re-reading |
3709 | * it would only return the old inode data anyway. | 3716 | * it would only return the old inode data anyway. |
3710 | */ | 3717 | */ |
3711 | if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) | 3718 | if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) |
3712 | set_buffer_uptodate(bh); | 3719 | set_buffer_uptodate(bh); |
3713 | 3720 | ||
3714 | if (buffer_uptodate(bh)) { | 3721 | if (buffer_uptodate(bh)) { |
3715 | /* someone brought it uptodate while we waited */ | 3722 | /* someone brought it uptodate while we waited */ |
3716 | unlock_buffer(bh); | 3723 | unlock_buffer(bh); |
3717 | goto has_buffer; | 3724 | goto has_buffer; |
3718 | } | 3725 | } |
3719 | 3726 | ||
3720 | /* | 3727 | /* |
3721 | * If we have all information of the inode in memory and this | 3728 | * If we have all information of the inode in memory and this |
3722 | * is the only valid inode in the block, we need not read the | 3729 | * is the only valid inode in the block, we need not read the |
3723 | * block. | 3730 | * block. |
3724 | */ | 3731 | */ |
3725 | if (in_mem) { | 3732 | if (in_mem) { |
3726 | struct buffer_head *bitmap_bh; | 3733 | struct buffer_head *bitmap_bh; |
3727 | int i, start; | 3734 | int i, start; |
3728 | 3735 | ||
3729 | start = inode_offset & ~(inodes_per_block - 1); | 3736 | start = inode_offset & ~(inodes_per_block - 1); |
3730 | 3737 | ||
3731 | /* Is the inode bitmap in cache? */ | 3738 | /* Is the inode bitmap in cache? */ |
3732 | bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); | 3739 | bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); |
3733 | if (unlikely(!bitmap_bh)) | 3740 | if (unlikely(!bitmap_bh)) |
3734 | goto make_io; | 3741 | goto make_io; |
3735 | 3742 | ||
3736 | /* | 3743 | /* |
3737 | * If the inode bitmap isn't in cache then the | 3744 | * If the inode bitmap isn't in cache then the |
3738 | * optimisation may end up performing two reads instead | 3745 | * optimisation may end up performing two reads instead |
3739 | * of one, so skip it. | 3746 | * of one, so skip it. |
3740 | */ | 3747 | */ |
3741 | if (!buffer_uptodate(bitmap_bh)) { | 3748 | if (!buffer_uptodate(bitmap_bh)) { |
3742 | brelse(bitmap_bh); | 3749 | brelse(bitmap_bh); |
3743 | goto make_io; | 3750 | goto make_io; |
3744 | } | 3751 | } |
3745 | for (i = start; i < start + inodes_per_block; i++) { | 3752 | for (i = start; i < start + inodes_per_block; i++) { |
3746 | if (i == inode_offset) | 3753 | if (i == inode_offset) |
3747 | continue; | 3754 | continue; |
3748 | if (ext4_test_bit(i, bitmap_bh->b_data)) | 3755 | if (ext4_test_bit(i, bitmap_bh->b_data)) |
3749 | break; | 3756 | break; |
3750 | } | 3757 | } |
3751 | brelse(bitmap_bh); | 3758 | brelse(bitmap_bh); |
3752 | if (i == start + inodes_per_block) { | 3759 | if (i == start + inodes_per_block) { |
3753 | /* all other inodes are free, so skip I/O */ | 3760 | /* all other inodes are free, so skip I/O */ |
3754 | memset(bh->b_data, 0, bh->b_size); | 3761 | memset(bh->b_data, 0, bh->b_size); |
3755 | set_buffer_uptodate(bh); | 3762 | set_buffer_uptodate(bh); |
3756 | unlock_buffer(bh); | 3763 | unlock_buffer(bh); |
3757 | goto has_buffer; | 3764 | goto has_buffer; |
3758 | } | 3765 | } |
3759 | } | 3766 | } |
3760 | 3767 | ||
3761 | make_io: | 3768 | make_io: |
3762 | /* | 3769 | /* |
3763 | * If we need to do any I/O, try to read ahead extra | 3770 | * If we need to do any I/O, try to read ahead extra |
3764 | * blocks from the inode table. | 3771 | * blocks from the inode table. |
3765 | */ | 3772 | */ |
3766 | if (EXT4_SB(sb)->s_inode_readahead_blks) { | 3773 | if (EXT4_SB(sb)->s_inode_readahead_blks) { |
3767 | ext4_fsblk_t b, end, table; | 3774 | ext4_fsblk_t b, end, table; |
3768 | unsigned num; | 3775 | unsigned num; |
3769 | __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks; | 3776 | __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks; |
3770 | 3777 | ||
3771 | table = ext4_inode_table(sb, gdp); | 3778 | table = ext4_inode_table(sb, gdp); |
3772 | /* s_inode_readahead_blks is always a power of 2 */ | 3779 | /* s_inode_readahead_blks is always a power of 2 */ |
3773 | b = block & ~((ext4_fsblk_t) ra_blks - 1); | 3780 | b = block & ~((ext4_fsblk_t) ra_blks - 1); |
3774 | if (table > b) | 3781 | if (table > b) |
3775 | b = table; | 3782 | b = table; |
3776 | end = b + ra_blks; | 3783 | end = b + ra_blks; |
3777 | num = EXT4_INODES_PER_GROUP(sb); | 3784 | num = EXT4_INODES_PER_GROUP(sb); |
3778 | if (ext4_has_group_desc_csum(sb)) | 3785 | if (ext4_has_group_desc_csum(sb)) |
3779 | num -= ext4_itable_unused_count(sb, gdp); | 3786 | num -= ext4_itable_unused_count(sb, gdp); |
3780 | table += num / inodes_per_block; | 3787 | table += num / inodes_per_block; |
3781 | if (end > table) | 3788 | if (end > table) |
3782 | end = table; | 3789 | end = table; |
3783 | while (b <= end) | 3790 | while (b <= end) |
3784 | sb_breadahead(sb, b++); | 3791 | sb_breadahead(sb, b++); |
3785 | } | 3792 | } |
3786 | 3793 | ||
3787 | /* | 3794 | /* |
3788 | * There are other valid inodes in the buffer, this inode | 3795 | * There are other valid inodes in the buffer, this inode |
3789 | * has in-inode xattrs, or we don't have this inode in memory. | 3796 | * has in-inode xattrs, or we don't have this inode in memory. |
3790 | * Read the block from disk. | 3797 | * Read the block from disk. |
3791 | */ | 3798 | */ |
3792 | trace_ext4_load_inode(inode); | 3799 | trace_ext4_load_inode(inode); |
3793 | get_bh(bh); | 3800 | get_bh(bh); |
3794 | bh->b_end_io = end_buffer_read_sync; | 3801 | bh->b_end_io = end_buffer_read_sync; |
3795 | submit_bh(READ | REQ_META | REQ_PRIO, bh); | 3802 | submit_bh(READ | REQ_META | REQ_PRIO, bh); |
3796 | wait_on_buffer(bh); | 3803 | wait_on_buffer(bh); |
3797 | if (!buffer_uptodate(bh)) { | 3804 | if (!buffer_uptodate(bh)) { |
3798 | EXT4_ERROR_INODE_BLOCK(inode, block, | 3805 | EXT4_ERROR_INODE_BLOCK(inode, block, |
3799 | "unable to read itable block"); | 3806 | "unable to read itable block"); |
3800 | brelse(bh); | 3807 | brelse(bh); |
3801 | return -EIO; | 3808 | return -EIO; |
3802 | } | 3809 | } |
3803 | } | 3810 | } |
3804 | has_buffer: | 3811 | has_buffer: |
3805 | iloc->bh = bh; | 3812 | iloc->bh = bh; |
3806 | return 0; | 3813 | return 0; |
3807 | } | 3814 | } |
3808 | 3815 | ||
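To make the location arithmetic in __ext4_get_inode_loc() concrete, here is a minimal userspace sketch of the same calculation. All geometry values (inodes per group, inode size, block size, and the inode table's starting block) are assumed example numbers, not values read from a real superblock; the kernel obtains the per-group inode table start from the group descriptor instead.

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            /* Assumed geometry -- illustrative only */
            const uint64_t inodes_per_group = 8192;
            const uint64_t inode_size       = 256;
            const uint64_t block_size       = 4096;
            const uint64_t itable_start     = 1057;   /* hypothetical block */
            const uint64_t ino              = 12345;  /* inode numbers are 1-based */

            uint64_t inodes_per_block = block_size / inode_size;      /* 16 */
            uint64_t block_group  = (ino - 1) / inodes_per_group;     /* 1 */
            uint64_t inode_offset = (ino - 1) % inodes_per_group;     /* 4152 */
            uint64_t block  = itable_start + inode_offset / inodes_per_block;
            uint64_t offset = (inode_offset % inodes_per_block) * inode_size;

            /* Prints: group 1, itable block 1316, byte offset 2048 */
            printf("group %llu, itable block %llu, byte offset %llu\n",
                   (unsigned long long)block_group,
                   (unsigned long long)block,
                   (unsigned long long)offset);
            return 0;
    }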
3809 | int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) | 3816 | int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) |
3810 | { | 3817 | { |
3811 | /* We have all inode data except xattrs in memory here. */ | 3818 | /* We have all inode data except xattrs in memory here. */ |
3812 | return __ext4_get_inode_loc(inode, iloc, | 3819 | return __ext4_get_inode_loc(inode, iloc, |
3813 | !ext4_test_inode_state(inode, EXT4_STATE_XATTR)); | 3820 | !ext4_test_inode_state(inode, EXT4_STATE_XATTR)); |
3814 | } | 3821 | } |
3815 | 3822 | ||
3816 | void ext4_set_inode_flags(struct inode *inode) | 3823 | void ext4_set_inode_flags(struct inode *inode) |
3817 | { | 3824 | { |
3818 | unsigned int flags = EXT4_I(inode)->i_flags; | 3825 | unsigned int flags = EXT4_I(inode)->i_flags; |
3819 | unsigned int new_fl = 0; | 3826 | unsigned int new_fl = 0; |
3820 | 3827 | ||
3821 | if (flags & EXT4_SYNC_FL) | 3828 | if (flags & EXT4_SYNC_FL) |
3822 | new_fl |= S_SYNC; | 3829 | new_fl |= S_SYNC; |
3823 | if (flags & EXT4_APPEND_FL) | 3830 | if (flags & EXT4_APPEND_FL) |
3824 | new_fl |= S_APPEND; | 3831 | new_fl |= S_APPEND; |
3825 | if (flags & EXT4_IMMUTABLE_FL) | 3832 | if (flags & EXT4_IMMUTABLE_FL) |
3826 | new_fl |= S_IMMUTABLE; | 3833 | new_fl |= S_IMMUTABLE; |
3827 | if (flags & EXT4_NOATIME_FL) | 3834 | if (flags & EXT4_NOATIME_FL) |
3828 | new_fl |= S_NOATIME; | 3835 | new_fl |= S_NOATIME; |
3829 | if (flags & EXT4_DIRSYNC_FL) | 3836 | if (flags & EXT4_DIRSYNC_FL) |
3830 | new_fl |= S_DIRSYNC; | 3837 | new_fl |= S_DIRSYNC; |
3831 | if (test_opt(inode->i_sb, DAX)) | 3838 | if (test_opt(inode->i_sb, DAX)) |
3832 | new_fl |= S_DAX; | 3839 | new_fl |= S_DAX; |
3833 | inode_set_flags(inode, new_fl, | 3840 | inode_set_flags(inode, new_fl, |
3834 | S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX); | 3841 | S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX); |
3835 | } | 3842 | } |
3836 | 3843 | ||
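The if-chain in ext4_set_inode_flags() is a fixed translation from on-disk flag bits to VFS i_flags bits. The same mapping can be written table-driven; a sketch, using made-up bit values rather than the real EXT4_*_FL and S_* constants:

    #include <stdio.h>

    /* Placeholder bit values -- NOT the kernel's constants */
    #define X_SYNC_FL      0x01
    #define X_APPEND_FL    0x02
    #define X_IMMUTABLE_FL 0x04
    #define V_SYNC         0x10
    #define V_APPEND       0x20
    #define V_IMMUTABLE    0x40

    static const struct { unsigned ondisk, vfs; } fl_map[] = {
            { X_SYNC_FL,      V_SYNC      },
            { X_APPEND_FL,    V_APPEND    },
            { X_IMMUTABLE_FL, V_IMMUTABLE },
    };

    static unsigned translate_flags(unsigned ondisk_flags)
    {
            unsigned new_fl = 0;
            for (size_t i = 0; i < sizeof(fl_map) / sizeof(fl_map[0]); i++)
                    if (ondisk_flags & fl_map[i].ondisk)
                            new_fl |= fl_map[i].vfs;
            return new_fl;
    }

    int main(void)
    {
            /* SYNC + IMMUTABLE in, 0x50 out */
            printf("0x%x\n", translate_flags(X_SYNC_FL | X_IMMUTABLE_FL));
            return 0;
    }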
3837 | /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ | 3844 | /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ |
3838 | void ext4_get_inode_flags(struct ext4_inode_info *ei) | 3845 | void ext4_get_inode_flags(struct ext4_inode_info *ei) |
3839 | { | 3846 | { |
3840 | unsigned int vfs_fl; | 3847 | unsigned int vfs_fl; |
3841 | unsigned long old_fl, new_fl; | 3848 | unsigned long old_fl, new_fl; |
3842 | 3849 | ||
3843 | do { | 3850 | do { |
3844 | vfs_fl = ei->vfs_inode.i_flags; | 3851 | vfs_fl = ei->vfs_inode.i_flags; |
3845 | old_fl = ei->i_flags; | 3852 | old_fl = ei->i_flags; |
3846 | new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL| | 3853 | new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL| |
3847 | EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL| | 3854 | EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL| |
3848 | EXT4_DIRSYNC_FL); | 3855 | EXT4_DIRSYNC_FL); |
3849 | if (vfs_fl & S_SYNC) | 3856 | if (vfs_fl & S_SYNC) |
3850 | new_fl |= EXT4_SYNC_FL; | 3857 | new_fl |= EXT4_SYNC_FL; |
3851 | if (vfs_fl & S_APPEND) | 3858 | if (vfs_fl & S_APPEND) |
3852 | new_fl |= EXT4_APPEND_FL; | 3859 | new_fl |= EXT4_APPEND_FL; |
3853 | if (vfs_fl & S_IMMUTABLE) | 3860 | if (vfs_fl & S_IMMUTABLE) |
3854 | new_fl |= EXT4_IMMUTABLE_FL; | 3861 | new_fl |= EXT4_IMMUTABLE_FL; |
3855 | if (vfs_fl & S_NOATIME) | 3862 | if (vfs_fl & S_NOATIME) |
3856 | new_fl |= EXT4_NOATIME_FL; | 3863 | new_fl |= EXT4_NOATIME_FL; |
3857 | if (vfs_fl & S_DIRSYNC) | 3864 | if (vfs_fl & S_DIRSYNC) |
3858 | new_fl |= EXT4_DIRSYNC_FL; | 3865 | new_fl |= EXT4_DIRSYNC_FL; |
3859 | } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl); | 3866 | } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl); |
3860 | } | 3867 | } |
3861 | 3868 | ||
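The do/while around cmpxchg() in ext4_get_inode_flags() is the standard lock-free read-modify-write loop: snapshot the word, derive the new value from the snapshot, and publish it only if nobody raced in between, retrying otherwise. A minimal userspace analogue using C11 atomics (the flag bit is a placeholder):

    #include <stdatomic.h>
    #include <stdio.h>

    #define FL_SYNC 0x1UL                  /* placeholder flag bit */

    static _Atomic unsigned long i_flags;

    static void update_flags(unsigned long vfs_fl)
    {
            unsigned long old_fl, new_fl;

            old_fl = atomic_load(&i_flags);
            do {
                    /* Recompute from the latest snapshot on every retry */
                    new_fl = old_fl & ~FL_SYNC;
                    if (vfs_fl & FL_SYNC)
                            new_fl |= FL_SYNC;
                    /* On failure, old_fl is refreshed with the current value */
            } while (!atomic_compare_exchange_weak(&i_flags, &old_fl, new_fl));
    }

    int main(void)
    {
            atomic_store(&i_flags, 0);
            update_flags(FL_SYNC);
            printf("0x%lx\n", atomic_load(&i_flags));   /* prints 0x1 */
            return 0;
    }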
3862 | static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, | 3869 | static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, |
3863 | struct ext4_inode_info *ei) | 3870 | struct ext4_inode_info *ei) |
3864 | { | 3871 | { |
3865 | blkcnt_t i_blocks; | 3872 | blkcnt_t i_blocks; |
3866 | struct inode *inode = &(ei->vfs_inode); | 3873 | struct inode *inode = &(ei->vfs_inode); |
3867 | struct super_block *sb = inode->i_sb; | 3874 | struct super_block *sb = inode->i_sb; |
3868 | 3875 | ||
3869 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, | 3876 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, |
3870 | EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { | 3877 | EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { |
3871 | /* we are using combined 48 bit field */ | 3878 | /* we are using combined 48 bit field */ |
3872 | i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | | 3879 | i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | |
3873 | le32_to_cpu(raw_inode->i_blocks_lo); | 3880 | le32_to_cpu(raw_inode->i_blocks_lo); |
3874 | if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) { | 3881 | if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) { |
3875 | /* i_blocks is stored in units of the fs block size */ | 3882 | /* i_blocks is stored in units of the fs block size */ |
3876 | return i_blocks << (inode->i_blkbits - 9); | 3883 | return i_blocks << (inode->i_blkbits - 9); |
3877 | } else { | 3884 | } else { |
3878 | return i_blocks; | 3885 | return i_blocks; |
3879 | } | 3886 | } |
3880 | } else { | 3887 | } else { |
3881 | return le32_to_cpu(raw_inode->i_blocks_lo); | 3888 | return le32_to_cpu(raw_inode->i_blocks_lo); |
3882 | } | 3889 | } |
3883 | } | 3890 | } |
3884 | 3891 | ||
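A worked decode of what ext4_inode_blocks() computes: with the huge_file feature, i_blocks_high contributes bits 32..47 of the count, and if EXT4_INODE_HUGE_FILE is also set the count is in filesystem blocks rather than 512-byte sectors, hence the shift by (i_blkbits - 9). A sketch with assumed raw field values:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint16_t i_blocks_high = 0x0002;     /* assumed on-disk fields */
            uint32_t i_blocks_lo   = 0x00000010;
            unsigned blkbits       = 12;         /* 4K filesystem blocks */
            int huge_file_inode    = 1;          /* EXT4_INODE_HUGE_FILE set */

            uint64_t i_blocks = ((uint64_t)i_blocks_high << 32) | i_blocks_lo;

            if (huge_file_inode)
                    /* count is in fs blocks; convert to 512-byte sectors */
                    i_blocks <<= (blkbits - 9);

            /* 0x200000010 << 3 = 68719476864 sectors */
            printf("%llu sectors\n", (unsigned long long)i_blocks);
            return 0;
    }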
3885 | static inline void ext4_iget_extra_inode(struct inode *inode, | 3892 | static inline void ext4_iget_extra_inode(struct inode *inode, |
3886 | struct ext4_inode *raw_inode, | 3893 | struct ext4_inode *raw_inode, |
3887 | struct ext4_inode_info *ei) | 3894 | struct ext4_inode_info *ei) |
3888 | { | 3895 | { |
3889 | __le32 *magic = (void *)raw_inode + | 3896 | __le32 *magic = (void *)raw_inode + |
3890 | EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; | 3897 | EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; |
3891 | if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { | 3898 | if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { |
3892 | ext4_set_inode_state(inode, EXT4_STATE_XATTR); | 3899 | ext4_set_inode_state(inode, EXT4_STATE_XATTR); |
3893 | ext4_find_inline_data_nolock(inode); | 3900 | ext4_find_inline_data_nolock(inode); |
3894 | } else | 3901 | } else |
3895 | EXT4_I(inode)->i_inline_off = 0; | 3902 | EXT4_I(inode)->i_inline_off = 0; |
3896 | } | 3903 | } |
3897 | 3904 | ||
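ext4_iget_extra_inode() probes for in-inode extended attributes by looking at the four bytes that follow the classic 128-byte inode body plus i_extra_isize; if they hold the xattr magic, the inode carries in-body xattrs (and possibly inline data). A sketch of the offset check, assuming a 256-byte inode and an i_extra_isize of 32, and ignoring the little-endian conversion the kernel performs:

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    #define GOOD_OLD_INODE_SIZE 128        /* classic ext2 inode body */
    #define XATTR_MAGIC 0xEA020000u        /* in-inode xattr marker */

    int main(void)
    {
            unsigned char raw_inode[256] = {0};   /* one on-disk inode */
            uint16_t i_extra_isize = 32;          /* assumed */
            uint32_t magic = XATTR_MAGIC, probe;

            /* Plant the magic where in-inode xattrs would begin ... */
            memcpy(raw_inode + GOOD_OLD_INODE_SIZE + i_extra_isize,
                   &magic, sizeof(magic));

            /* ... and probe for it the way the kernel does */
            memcpy(&probe, raw_inode + GOOD_OLD_INODE_SIZE + i_extra_isize,
                   sizeof(probe));
            printf("has in-inode xattrs: %s\n",
                   probe == XATTR_MAGIC ? "yes" : "no");
            return 0;
    }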
3898 | struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | 3905 | struct inode *ext4_iget(struct super_block *sb, unsigned long ino) |
3899 | { | 3906 | { |
3900 | struct ext4_iloc iloc; | 3907 | struct ext4_iloc iloc; |
3901 | struct ext4_inode *raw_inode; | 3908 | struct ext4_inode *raw_inode; |
3902 | struct ext4_inode_info *ei; | 3909 | struct ext4_inode_info *ei; |
3903 | struct inode *inode; | 3910 | struct inode *inode; |
3904 | journal_t *journal = EXT4_SB(sb)->s_journal; | 3911 | journal_t *journal = EXT4_SB(sb)->s_journal; |
3905 | long ret; | 3912 | long ret; |
3906 | int block; | 3913 | int block; |
3907 | uid_t i_uid; | 3914 | uid_t i_uid; |
3908 | gid_t i_gid; | 3915 | gid_t i_gid; |
3909 | 3916 | ||
3910 | inode = iget_locked(sb, ino); | 3917 | inode = iget_locked(sb, ino); |
3911 | if (!inode) | 3918 | if (!inode) |
3912 | return ERR_PTR(-ENOMEM); | 3919 | return ERR_PTR(-ENOMEM); |
3913 | if (!(inode->i_state & I_NEW)) | 3920 | if (!(inode->i_state & I_NEW)) |
3914 | return inode; | 3921 | return inode; |
3915 | 3922 | ||
3916 | ei = EXT4_I(inode); | 3923 | ei = EXT4_I(inode); |
3917 | iloc.bh = NULL; | 3924 | iloc.bh = NULL; |
3918 | 3925 | ||
3919 | ret = __ext4_get_inode_loc(inode, &iloc, 0); | 3926 | ret = __ext4_get_inode_loc(inode, &iloc, 0); |
3920 | if (ret < 0) | 3927 | if (ret < 0) |
3921 | goto bad_inode; | 3928 | goto bad_inode; |
3922 | raw_inode = ext4_raw_inode(&iloc); | 3929 | raw_inode = ext4_raw_inode(&iloc); |
3923 | 3930 | ||
3924 | if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { | 3931 | if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { |
3925 | ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); | 3932 | ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); |
3926 | if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > | 3933 | if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > |
3927 | EXT4_INODE_SIZE(inode->i_sb)) { | 3934 | EXT4_INODE_SIZE(inode->i_sb)) { |
3928 | EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)", | 3935 | EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)", |
3929 | EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize, | 3936 | EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize, |
3930 | EXT4_INODE_SIZE(inode->i_sb)); | 3937 | EXT4_INODE_SIZE(inode->i_sb)); |
3931 | ret = -EIO; | 3938 | ret = -EIO; |
3932 | goto bad_inode; | 3939 | goto bad_inode; |
3933 | } | 3940 | } |
3934 | } else | 3941 | } else |
3935 | ei->i_extra_isize = 0; | 3942 | ei->i_extra_isize = 0; |
3936 | 3943 | ||
3937 | /* Precompute checksum seed for inode metadata */ | 3944 | /* Precompute checksum seed for inode metadata */ |
3938 | if (ext4_has_metadata_csum(sb)) { | 3945 | if (ext4_has_metadata_csum(sb)) { |
3939 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 3946 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
3940 | __u32 csum; | 3947 | __u32 csum; |
3941 | __le32 inum = cpu_to_le32(inode->i_ino); | 3948 | __le32 inum = cpu_to_le32(inode->i_ino); |
3942 | __le32 gen = raw_inode->i_generation; | 3949 | __le32 gen = raw_inode->i_generation; |
3943 | csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, | 3950 | csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, |
3944 | sizeof(inum)); | 3951 | sizeof(inum)); |
3945 | ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, | 3952 | ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, |
3946 | sizeof(gen)); | 3953 | sizeof(gen)); |
3947 | } | 3954 | } |
3948 | 3955 | ||
3949 | if (!ext4_inode_csum_verify(inode, raw_inode, ei)) { | 3956 | if (!ext4_inode_csum_verify(inode, raw_inode, ei)) { |
3950 | EXT4_ERROR_INODE(inode, "checksum invalid"); | 3957 | EXT4_ERROR_INODE(inode, "checksum invalid"); |
3951 | ret = -EIO; | 3958 | ret = -EIO; |
3952 | goto bad_inode; | 3959 | goto bad_inode; |
3953 | } | 3960 | } |
3954 | 3961 | ||
3955 | inode->i_mode = le16_to_cpu(raw_inode->i_mode); | 3962 | inode->i_mode = le16_to_cpu(raw_inode->i_mode); |
3956 | i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); | 3963 | i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); |
3957 | i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); | 3964 | i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); |
3958 | if (!(test_opt(inode->i_sb, NO_UID32))) { | 3965 | if (!(test_opt(inode->i_sb, NO_UID32))) { |
3959 | i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; | 3966 | i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; |
3960 | i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; | 3967 | i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; |
3961 | } | 3968 | } |
3962 | i_uid_write(inode, i_uid); | 3969 | i_uid_write(inode, i_uid); |
3963 | i_gid_write(inode, i_gid); | 3970 | i_gid_write(inode, i_gid); |
3964 | set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); | 3971 | set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); |
3965 | 3972 | ||
3966 | ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ | 3973 | ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ |
3967 | ei->i_inline_off = 0; | 3974 | ei->i_inline_off = 0; |
3968 | ei->i_dir_start_lookup = 0; | 3975 | ei->i_dir_start_lookup = 0; |
3969 | ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); | 3976 | ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); |
3970 | /* We now have enough fields to check if the inode was active or not. | 3977 | /* We now have enough fields to check if the inode was active or not. |
3971 | * This is needed because nfsd might try to access dead inodes; | 3978 | * This is needed because nfsd might try to access dead inodes; |
3972 | * the test is the same one that e2fsck uses. | 3979 | * the test is the same one that e2fsck uses. |
3973 | * NeilBrown 1999oct15 | 3980 | * NeilBrown 1999oct15 |
3974 | */ | 3981 | */ |
3975 | if (inode->i_nlink == 0) { | 3982 | if (inode->i_nlink == 0) { |
3976 | if ((inode->i_mode == 0 || | 3983 | if ((inode->i_mode == 0 || |
3977 | !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) && | 3984 | !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) && |
3978 | ino != EXT4_BOOT_LOADER_INO) { | 3985 | ino != EXT4_BOOT_LOADER_INO) { |
3979 | /* this inode is deleted */ | 3986 | /* this inode is deleted */ |
3980 | ret = -ESTALE; | 3987 | ret = -ESTALE; |
3981 | goto bad_inode; | 3988 | goto bad_inode; |
3982 | } | 3989 | } |
3983 | /* The only unlinked inodes we let through here have | 3990 | /* The only unlinked inodes we let through here have |
3984 | * valid i_mode and are being read by the orphan | 3991 | * valid i_mode and are being read by the orphan |
3985 | * recovery code: that's fine, we're about to complete | 3992 | * recovery code: that's fine, we're about to complete |
3986 | * the process of deleting those. | 3993 | * the process of deleting those. |
3987 | * OR it is the EXT4_BOOT_LOADER_INO which is | 3994 | * OR it is the EXT4_BOOT_LOADER_INO which is |
3988 | * not initialized on a new filesystem. */ | 3995 | * not initialized on a new filesystem. */ |
3989 | } | 3996 | } |
3990 | ei->i_flags = le32_to_cpu(raw_inode->i_flags); | 3997 | ei->i_flags = le32_to_cpu(raw_inode->i_flags); |
3991 | inode->i_blocks = ext4_inode_blocks(raw_inode, ei); | 3998 | inode->i_blocks = ext4_inode_blocks(raw_inode, ei); |
3992 | ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); | 3999 | ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); |
3993 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) | 4000 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) |
3994 | ei->i_file_acl |= | 4001 | ei->i_file_acl |= |
3995 | ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; | 4002 | ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; |
3996 | inode->i_size = ext4_isize(raw_inode); | 4003 | inode->i_size = ext4_isize(raw_inode); |
3997 | ei->i_disksize = inode->i_size; | 4004 | ei->i_disksize = inode->i_size; |
3998 | #ifdef CONFIG_QUOTA | 4005 | #ifdef CONFIG_QUOTA |
3999 | ei->i_reserved_quota = 0; | 4006 | ei->i_reserved_quota = 0; |
4000 | #endif | 4007 | #endif |
4001 | inode->i_generation = le32_to_cpu(raw_inode->i_generation); | 4008 | inode->i_generation = le32_to_cpu(raw_inode->i_generation); |
4002 | ei->i_block_group = iloc.block_group; | 4009 | ei->i_block_group = iloc.block_group; |
4003 | ei->i_last_alloc_group = ~0; | 4010 | ei->i_last_alloc_group = ~0; |
4004 | /* | 4011 | /* |
4005 | * NOTE! The in-memory inode i_data array is in little-endian order | 4012 | * NOTE! The in-memory inode i_data array is in little-endian order |
4006 | * even on big-endian machines: we do NOT byteswap the block numbers! | 4013 | * even on big-endian machines: we do NOT byteswap the block numbers! |
4007 | */ | 4014 | */ |
4008 | for (block = 0; block < EXT4_N_BLOCKS; block++) | 4015 | for (block = 0; block < EXT4_N_BLOCKS; block++) |
4009 | ei->i_data[block] = raw_inode->i_block[block]; | 4016 | ei->i_data[block] = raw_inode->i_block[block]; |
4010 | INIT_LIST_HEAD(&ei->i_orphan); | 4017 | INIT_LIST_HEAD(&ei->i_orphan); |
4011 | 4018 | ||
4012 | /* | 4019 | /* |
4013 | * Set the transaction IDs of the transactions that have to be committed | 4020 | * Set the transaction IDs of the transactions that have to be committed |
4014 | * to finish f[data]sync. We set them to the currently running transaction | 4021 | * to finish f[data]sync. We set them to the currently running transaction |
4015 | * as we cannot be sure that the inode or some of its metadata isn't | 4022 | * as we cannot be sure that the inode or some of its metadata isn't |
4016 | * part of the transaction - the inode could have been reclaimed and | 4023 | * part of the transaction - the inode could have been reclaimed and |
4017 | * now it is reread from disk. | 4024 | * now it is reread from disk. |
4018 | */ | 4025 | */ |
4019 | if (journal) { | 4026 | if (journal) { |
4020 | transaction_t *transaction; | 4027 | transaction_t *transaction; |
4021 | tid_t tid; | 4028 | tid_t tid; |
4022 | 4029 | ||
4023 | read_lock(&journal->j_state_lock); | 4030 | read_lock(&journal->j_state_lock); |
4024 | if (journal->j_running_transaction) | 4031 | if (journal->j_running_transaction) |
4025 | transaction = journal->j_running_transaction; | 4032 | transaction = journal->j_running_transaction; |
4026 | else | 4033 | else |
4027 | transaction = journal->j_committing_transaction; | 4034 | transaction = journal->j_committing_transaction; |
4028 | if (transaction) | 4035 | if (transaction) |
4029 | tid = transaction->t_tid; | 4036 | tid = transaction->t_tid; |
4030 | else | 4037 | else |
4031 | tid = journal->j_commit_sequence; | 4038 | tid = journal->j_commit_sequence; |
4032 | read_unlock(&journal->j_state_lock); | 4039 | read_unlock(&journal->j_state_lock); |
4033 | ei->i_sync_tid = tid; | 4040 | ei->i_sync_tid = tid; |
4034 | ei->i_datasync_tid = tid; | 4041 | ei->i_datasync_tid = tid; |
4035 | } | 4042 | } |
4036 | 4043 | ||
4037 | if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { | 4044 | if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { |
4038 | if (ei->i_extra_isize == 0) { | 4045 | if (ei->i_extra_isize == 0) { |
4039 | /* The extra space is currently unused. Use it. */ | 4046 | /* The extra space is currently unused. Use it. */ |
4040 | ei->i_extra_isize = sizeof(struct ext4_inode) - | 4047 | ei->i_extra_isize = sizeof(struct ext4_inode) - |
4041 | EXT4_GOOD_OLD_INODE_SIZE; | 4048 | EXT4_GOOD_OLD_INODE_SIZE; |
4042 | } else { | 4049 | } else { |
4043 | ext4_iget_extra_inode(inode, raw_inode, ei); | 4050 | ext4_iget_extra_inode(inode, raw_inode, ei); |
4044 | } | 4051 | } |
4045 | } | 4052 | } |
4046 | 4053 | ||
4047 | EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); | 4054 | EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); |
4048 | EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); | 4055 | EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); |
4049 | EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); | 4056 | EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); |
4050 | EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); | 4057 | EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); |
4051 | 4058 | ||
4052 | if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { | 4059 | if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { |
4053 | inode->i_version = le32_to_cpu(raw_inode->i_disk_version); | 4060 | inode->i_version = le32_to_cpu(raw_inode->i_disk_version); |
4054 | if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { | 4061 | if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { |
4055 | if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) | 4062 | if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) |
4056 | inode->i_version |= | 4063 | inode->i_version |= |
4057 | (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; | 4064 | (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; |
4058 | } | 4065 | } |
4059 | } | 4066 | } |
4060 | 4067 | ||
4061 | ret = 0; | 4068 | ret = 0; |
4062 | if (ei->i_file_acl && | 4069 | if (ei->i_file_acl && |
4063 | !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { | 4070 | !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { |
4064 | EXT4_ERROR_INODE(inode, "bad extended attribute block %llu", | 4071 | EXT4_ERROR_INODE(inode, "bad extended attribute block %llu", |
4065 | ei->i_file_acl); | 4072 | ei->i_file_acl); |
4066 | ret = -EIO; | 4073 | ret = -EIO; |
4067 | goto bad_inode; | 4074 | goto bad_inode; |
4068 | } else if (!ext4_has_inline_data(inode)) { | 4075 | } else if (!ext4_has_inline_data(inode)) { |
4069 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { | 4076 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { |
4070 | if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || | 4077 | if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || |
4071 | (S_ISLNK(inode->i_mode) && | 4078 | (S_ISLNK(inode->i_mode) && |
4072 | !ext4_inode_is_fast_symlink(inode)))) | 4079 | !ext4_inode_is_fast_symlink(inode)))) |
4073 | /* Validate extent which is part of inode */ | 4080 | /* Validate extent which is part of inode */ |
4074 | ret = ext4_ext_check_inode(inode); | 4081 | ret = ext4_ext_check_inode(inode); |
4075 | } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || | 4082 | } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || |
4076 | (S_ISLNK(inode->i_mode) && | 4083 | (S_ISLNK(inode->i_mode) && |
4077 | !ext4_inode_is_fast_symlink(inode))) { | 4084 | !ext4_inode_is_fast_symlink(inode))) { |
4078 | /* Validate block references which are part of inode */ | 4085 | /* Validate block references which are part of inode */ |
4079 | ret = ext4_ind_check_inode(inode); | 4086 | ret = ext4_ind_check_inode(inode); |
4080 | } | 4087 | } |
4081 | } | 4088 | } |
4082 | if (ret) | 4089 | if (ret) |
4083 | goto bad_inode; | 4090 | goto bad_inode; |
4084 | 4091 | ||
4085 | if (S_ISREG(inode->i_mode)) { | 4092 | if (S_ISREG(inode->i_mode)) { |
4086 | inode->i_op = &ext4_file_inode_operations; | 4093 | inode->i_op = &ext4_file_inode_operations; |
4087 | if (test_opt(inode->i_sb, DAX)) | 4094 | if (test_opt(inode->i_sb, DAX)) |
4088 | inode->i_fop = &ext4_dax_file_operations; | 4095 | inode->i_fop = &ext4_dax_file_operations; |
4089 | else | 4096 | else |
4090 | inode->i_fop = &ext4_file_operations; | 4097 | inode->i_fop = &ext4_file_operations; |
4091 | ext4_set_aops(inode); | 4098 | ext4_set_aops(inode); |
4092 | } else if (S_ISDIR(inode->i_mode)) { | 4099 | } else if (S_ISDIR(inode->i_mode)) { |
4093 | inode->i_op = &ext4_dir_inode_operations; | 4100 | inode->i_op = &ext4_dir_inode_operations; |
4094 | inode->i_fop = &ext4_dir_operations; | 4101 | inode->i_fop = &ext4_dir_operations; |
4095 | } else if (S_ISLNK(inode->i_mode)) { | 4102 | } else if (S_ISLNK(inode->i_mode)) { |
4096 | if (ext4_inode_is_fast_symlink(inode)) { | 4103 | if (ext4_inode_is_fast_symlink(inode)) { |
4097 | inode->i_op = &ext4_fast_symlink_inode_operations; | 4104 | inode->i_op = &ext4_fast_symlink_inode_operations; |
4098 | nd_terminate_link(ei->i_data, inode->i_size, | 4105 | nd_terminate_link(ei->i_data, inode->i_size, |
4099 | sizeof(ei->i_data) - 1); | 4106 | sizeof(ei->i_data) - 1); |
4100 | } else { | 4107 | } else { |
4101 | inode->i_op = &ext4_symlink_inode_operations; | 4108 | inode->i_op = &ext4_symlink_inode_operations; |
4102 | ext4_set_aops(inode); | 4109 | ext4_set_aops(inode); |
4103 | } | 4110 | } |
4104 | } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || | 4111 | } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || |
4105 | S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { | 4112 | S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { |
4106 | inode->i_op = &ext4_special_inode_operations; | 4113 | inode->i_op = &ext4_special_inode_operations; |
4107 | if (raw_inode->i_block[0]) | 4114 | if (raw_inode->i_block[0]) |
4108 | init_special_inode(inode, inode->i_mode, | 4115 | init_special_inode(inode, inode->i_mode, |
4109 | old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); | 4116 | old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); |
4110 | else | 4117 | else |
4111 | init_special_inode(inode, inode->i_mode, | 4118 | init_special_inode(inode, inode->i_mode, |
4112 | new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); | 4119 | new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); |
4113 | } else if (ino == EXT4_BOOT_LOADER_INO) { | 4120 | } else if (ino == EXT4_BOOT_LOADER_INO) { |
4114 | make_bad_inode(inode); | 4121 | make_bad_inode(inode); |
4115 | } else { | 4122 | } else { |
4116 | ret = -EIO; | 4123 | ret = -EIO; |
4117 | EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode); | 4124 | EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode); |
4118 | goto bad_inode; | 4125 | goto bad_inode; |
4119 | } | 4126 | } |
4120 | brelse(iloc.bh); | 4127 | brelse(iloc.bh); |
4121 | ext4_set_inode_flags(inode); | 4128 | ext4_set_inode_flags(inode); |
4122 | unlock_new_inode(inode); | 4129 | unlock_new_inode(inode); |
4123 | return inode; | 4130 | return inode; |
4124 | 4131 | ||
4125 | bad_inode: | 4132 | bad_inode: |
4126 | brelse(iloc.bh); | 4133 | brelse(iloc.bh); |
4127 | iget_failed(inode); | 4134 | iget_failed(inode); |
4128 | return ERR_PTR(ret); | 4135 | return ERR_PTR(ret); |
4129 | } | 4136 | } |
4130 | 4137 | ||
4131 | struct inode *ext4_iget_normal(struct super_block *sb, unsigned long ino) | 4138 | struct inode *ext4_iget_normal(struct super_block *sb, unsigned long ino) |
4132 | { | 4139 | { |
4133 | if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) | 4140 | if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) |
4134 | return ERR_PTR(-EIO); | 4141 | return ERR_PTR(-EIO); |
4135 | return ext4_iget(sb, ino); | 4142 | return ext4_iget(sb, ino); |
4136 | } | 4143 | } |
4137 | 4144 | ||
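ext4_iget() reports failure through the kernel's error-pointer convention: a small negative errno is encoded into the returned pointer itself, which is why ext4_iget_normal() can hand back either a valid inode or ERR_PTR(-EIO) through one return type. A userspace sketch of that convention (MAX_ERRNO mirrors the kernel's 4095 cutoff):

    #include <stdio.h>
    #include <errno.h>

    #define MAX_ERRNO 4095

    static void *ERR_PTR(long error)      { return (void *)error; }
    static long  PTR_ERR(const void *ptr) { return (long)ptr; }
    static int   IS_ERR(const void *ptr)
    {
            /* The top MAX_ERRNO addresses are reserved for errno values */
            return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

    int main(void)
    {
            void *inode = ERR_PTR(-EIO);   /* what a failed lookup returns */

            if (IS_ERR(inode))
                    printf("lookup failed: errno %ld\n", -PTR_ERR(inode));
            return 0;
    }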
4138 | static int ext4_inode_blocks_set(handle_t *handle, | 4145 | static int ext4_inode_blocks_set(handle_t *handle, |
4139 | struct ext4_inode *raw_inode, | 4146 | struct ext4_inode *raw_inode, |
4140 | struct ext4_inode_info *ei) | 4147 | struct ext4_inode_info *ei) |
4141 | { | 4148 | { |
4142 | struct inode *inode = &(ei->vfs_inode); | 4149 | struct inode *inode = &(ei->vfs_inode); |
4143 | u64 i_blocks = inode->i_blocks; | 4150 | u64 i_blocks = inode->i_blocks; |
4144 | struct super_block *sb = inode->i_sb; | 4151 | struct super_block *sb = inode->i_sb; |
4145 | 4152 | ||
4146 | if (i_blocks <= ~0U) { | 4153 | if (i_blocks <= ~0U) { |
4147 | /* | 4154 | /* |
4148 | * i_blocks can be represented in a 32 bit variable | 4155 | * i_blocks can be represented in a 32 bit variable |
4149 | * as multiple of 512 bytes | 4156 | * as multiple of 512 bytes |
4150 | */ | 4157 | */ |
4151 | raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); | 4158 | raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); |
4152 | raw_inode->i_blocks_high = 0; | 4159 | raw_inode->i_blocks_high = 0; |
4153 | ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); | 4160 | ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); |
4154 | return 0; | 4161 | return 0; |
4155 | } | 4162 | } |
4156 | if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) | 4163 | if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) |
4157 | return -EFBIG; | 4164 | return -EFBIG; |
4158 | 4165 | ||
4159 | if (i_blocks <= 0xffffffffffffULL) { | 4166 | if (i_blocks <= 0xffffffffffffULL) { |
4160 | /* | 4167 | /* |
4161 | * i_blocks can be represented in a 48 bit variable | 4168 | * i_blocks can be represented in a 48 bit variable |
4162 | * as multiple of 512 bytes | 4169 | * as multiple of 512 bytes |
4163 | */ | 4170 | */ |
4164 | raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); | 4171 | raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); |
4165 | raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); | 4172 | raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); |
4166 | ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); | 4173 | ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); |
4167 | } else { | 4174 | } else { |
4168 | ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE); | 4175 | ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE); |
4169 | /* i_blocks is stored in units of the fs block size */ | 4176 | /* i_blocks is stored in units of the fs block size */ |
4170 | i_blocks = i_blocks >> (inode->i_blkbits - 9); | 4177 | i_blocks = i_blocks >> (inode->i_blkbits - 9); |
4171 | raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); | 4178 | raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); |
4172 | raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); | 4179 | raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); |
4173 | } | 4180 | } |
4174 | return 0; | 4181 | return 0; |
4175 | } | 4182 | } |
4176 | 4183 | ||
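ext4_inode_blocks_set() picks one of three on-disk representations in order: a 32-bit sector count, a 48-bit sector count (requires the huge_file feature, else -EFBIG), or a 48-bit count of filesystem blocks with EXT4_INODE_HUGE_FILE set. A standalone sketch of that decision ladder:

    #include <stdio.h>
    #include <stdint.h>

    /* Returns 0 and fills hi/lo/huge_flag, or -1 to mimic -EFBIG. */
    static int encode_blocks(uint64_t i_blocks, unsigned blkbits,
                             int huge_file_feature,
                             uint16_t *hi, uint32_t *lo, int *huge_flag)
    {
            *huge_flag = 0;
            if (i_blocks <= 0xffffffffULL) {         /* fits in 32 bits */
                    *lo = (uint32_t)i_blocks;
                    *hi = 0;
                    return 0;
            }
            if (!huge_file_feature)
                    return -1;                       /* would be -EFBIG */
            if (i_blocks > 0xffffffffffffULL) {      /* exceeds 48 bits */
                    *huge_flag = 1;                  /* store fs blocks */
                    i_blocks >>= (blkbits - 9);
            }
            *lo = (uint32_t)i_blocks;
            *hi = (uint16_t)(i_blocks >> 32);
            return 0;
    }

    int main(void)
    {
            uint16_t hi; uint32_t lo; int huge;

            if (encode_blocks(0x123456789abULL, 12, 1, &hi, &lo, &huge) == 0)
                    /* prints: hi=0x123 lo=0x456789ab huge=0 */
                    printf("hi=%#x lo=%#x huge=%d\n",
                           (unsigned)hi, (unsigned)lo, huge);
            return 0;
    }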
4177 | struct other_inode { | 4184 | struct other_inode { |
4178 | unsigned long orig_ino; | 4185 | unsigned long orig_ino; |
4179 | struct ext4_inode *raw_inode; | 4186 | struct ext4_inode *raw_inode; |
4180 | }; | 4187 | }; |
4181 | 4188 | ||
4182 | static int other_inode_match(struct inode * inode, unsigned long ino, | 4189 | static int other_inode_match(struct inode * inode, unsigned long ino, |
4183 | void *data) | 4190 | void *data) |
4184 | { | 4191 | { |
4185 | struct other_inode *oi = (struct other_inode *) data; | 4192 | struct other_inode *oi = (struct other_inode *) data; |
4186 | 4193 | ||
4187 | if ((inode->i_ino != ino) || | 4194 | if ((inode->i_ino != ino) || |
4188 | (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW | | 4195 | (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW | |
4189 | I_DIRTY_SYNC | I_DIRTY_DATASYNC)) || | 4196 | I_DIRTY_SYNC | I_DIRTY_DATASYNC)) || |
4190 | ((inode->i_state & I_DIRTY_TIME) == 0)) | 4197 | ((inode->i_state & I_DIRTY_TIME) == 0)) |
4191 | return 0; | 4198 | return 0; |
4192 | spin_lock(&inode->i_lock); | 4199 | spin_lock(&inode->i_lock); |
4193 | if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW | | 4200 | if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW | |
4194 | I_DIRTY_SYNC | I_DIRTY_DATASYNC)) == 0) && | 4201 | I_DIRTY_SYNC | I_DIRTY_DATASYNC)) == 0) && |
4195 | (inode->i_state & I_DIRTY_TIME)) { | 4202 | (inode->i_state & I_DIRTY_TIME)) { |
4196 | struct ext4_inode_info *ei = EXT4_I(inode); | 4203 | struct ext4_inode_info *ei = EXT4_I(inode); |
4197 | 4204 | ||
4198 | inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED); | 4205 | inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED); |
4199 | spin_unlock(&inode->i_lock); | 4206 | spin_unlock(&inode->i_lock); |
4200 | 4207 | ||
4201 | spin_lock(&ei->i_raw_lock); | 4208 | spin_lock(&ei->i_raw_lock); |
4202 | EXT4_INODE_SET_XTIME(i_ctime, inode, oi->raw_inode); | 4209 | EXT4_INODE_SET_XTIME(i_ctime, inode, oi->raw_inode); |
4203 | EXT4_INODE_SET_XTIME(i_mtime, inode, oi->raw_inode); | 4210 | EXT4_INODE_SET_XTIME(i_mtime, inode, oi->raw_inode); |
4204 | EXT4_INODE_SET_XTIME(i_atime, inode, oi->raw_inode); | 4211 | EXT4_INODE_SET_XTIME(i_atime, inode, oi->raw_inode); |
4205 | ext4_inode_csum_set(inode, oi->raw_inode, ei); | 4212 | ext4_inode_csum_set(inode, oi->raw_inode, ei); |
4206 | spin_unlock(&ei->i_raw_lock); | 4213 | spin_unlock(&ei->i_raw_lock); |
4207 | trace_ext4_other_inode_update_time(inode, oi->orig_ino); | 4214 | trace_ext4_other_inode_update_time(inode, oi->orig_ino); |
4208 | return -1; | 4215 | return -1; |
4209 | } | 4216 | } |
4210 | spin_unlock(&inode->i_lock); | 4217 | spin_unlock(&inode->i_lock); |
4211 | return -1; | 4218 | return -1; |
4212 | } | 4219 | } |
4213 | 4220 | ||
4214 | /* | 4221 | /* |
4215 | * Opportunistically update the other time fields for other inodes in | 4222 | * Opportunistically update the other time fields for other inodes in |
4216 | * the same inode table block. | 4223 | * the same inode table block. |
4217 | */ | 4224 | */ |
4218 | static void ext4_update_other_inodes_time(struct super_block *sb, | 4225 | static void ext4_update_other_inodes_time(struct super_block *sb, |
4219 | unsigned long orig_ino, char *buf) | 4226 | unsigned long orig_ino, char *buf) |
4220 | { | 4227 | { |
4221 | struct other_inode oi; | 4228 | struct other_inode oi; |
4222 | unsigned long ino; | 4229 | unsigned long ino; |
4223 | int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; | 4230 | int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; |
4224 | int inode_size = EXT4_INODE_SIZE(sb); | 4231 | int inode_size = EXT4_INODE_SIZE(sb); |
4225 | 4232 | ||
4226 | oi.orig_ino = orig_ino; | 4233 | oi.orig_ino = orig_ino; |
4227 | ino = orig_ino & ~(inodes_per_block - 1); | 4234 | ino = orig_ino & ~(inodes_per_block - 1); |
4228 | for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) { | 4235 | for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) { |
4229 | if (ino == orig_ino) | 4236 | if (ino == orig_ino) |
4230 | continue; | 4237 | continue; |
4231 | oi.raw_inode = (struct ext4_inode *) buf; | 4238 | oi.raw_inode = (struct ext4_inode *) buf; |
4232 | (void) find_inode_nowait(sb, ino, other_inode_match, &oi); | 4239 | (void) find_inode_nowait(sb, ino, other_inode_match, &oi); |
4233 | } | 4240 | } |
4234 | } | 4241 | } |
4235 | 4242 | ||
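One subtlety in the walk above: inode numbers are 1-based, so masking orig_ino directly can land one slot off when identifying the first inode that shares the buffer; mainline later adjusted this to mask (orig_ino - 1) and add 1 back. A sketch of that corrected arithmetic, with assumed geometry (16 inodes per 4K block at 256 bytes each):

    #include <stdio.h>

    int main(void)
    {
            unsigned long inodes_per_block = 16;     /* assumed geometry */
            unsigned long orig_ino = 12345;

            /* First 1-based inode number stored in the same itable block */
            unsigned long first =
                    ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1;

            for (unsigned long ino = first;
                 ino < first + inodes_per_block; ino++)
                    if (ino != orig_ino)
                            printf("would check sibling inode %lu\n", ino);
            return 0;
    }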
4236 | /* | 4243 | /* |
4237 | * Post the struct inode info into an on-disk inode location in the | 4244 | * Post the struct inode info into an on-disk inode location in the |
4238 | * buffer-cache. This gobbles the caller's reference to the | 4245 | * buffer-cache. This gobbles the caller's reference to the |
4239 | * buffer_head in the inode location struct. | 4246 | * buffer_head in the inode location struct. |
4240 | * | 4247 | * |
4241 | * The caller must have write access to iloc->bh. | 4248 | * The caller must have write access to iloc->bh. |
4242 | */ | 4249 | */ |
4243 | static int ext4_do_update_inode(handle_t *handle, | 4250 | static int ext4_do_update_inode(handle_t *handle, |
4244 | struct inode *inode, | 4251 | struct inode *inode, |
4245 | struct ext4_iloc *iloc) | 4252 | struct ext4_iloc *iloc) |
4246 | { | 4253 | { |
4247 | struct ext4_inode *raw_inode = ext4_raw_inode(iloc); | 4254 | struct ext4_inode *raw_inode = ext4_raw_inode(iloc); |
4248 | struct ext4_inode_info *ei = EXT4_I(inode); | 4255 | struct ext4_inode_info *ei = EXT4_I(inode); |
4249 | struct buffer_head *bh = iloc->bh; | 4256 | struct buffer_head *bh = iloc->bh; |
4250 | struct super_block *sb = inode->i_sb; | 4257 | struct super_block *sb = inode->i_sb; |
4251 | int err = 0, rc, block; | 4258 | int err = 0, rc, block; |
4252 | int need_datasync = 0, set_large_file = 0; | 4259 | int need_datasync = 0, set_large_file = 0; |
4253 | uid_t i_uid; | 4260 | uid_t i_uid; |
4254 | gid_t i_gid; | 4261 | gid_t i_gid; |
4255 | 4262 | ||
4256 | spin_lock(&ei->i_raw_lock); | 4263 | spin_lock(&ei->i_raw_lock); |
4257 | 4264 | ||
4258 | /* For fields not tracked in the in-memory inode, | 4265 | /* For fields not tracked in the in-memory inode, |
4259 | * initialise them to zero for new inodes. */ | 4266 | * initialise them to zero for new inodes. */ |
4260 | if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) | 4267 | if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) |
4261 | memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); | 4268 | memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); |
4262 | 4269 | ||
4263 | ext4_get_inode_flags(ei); | 4270 | ext4_get_inode_flags(ei); |
4264 | raw_inode->i_mode = cpu_to_le16(inode->i_mode); | 4271 | raw_inode->i_mode = cpu_to_le16(inode->i_mode); |
4265 | i_uid = i_uid_read(inode); | 4272 | i_uid = i_uid_read(inode); |
4266 | i_gid = i_gid_read(inode); | 4273 | i_gid = i_gid_read(inode); |
4267 | if (!(test_opt(inode->i_sb, NO_UID32))) { | 4274 | if (!(test_opt(inode->i_sb, NO_UID32))) { |
4268 | raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); | 4275 | raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); |
4269 | raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); | 4276 | raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); |
4270 | /* | 4277 | /* |
4271 | * Fix up interoperability with old kernels. Otherwise, old inodes get | 4278 | * Fix up interoperability with old kernels. Otherwise, old inodes get |
4272 | * re-used with the upper 16 bits of the uid/gid intact. | 4279 | * re-used with the upper 16 bits of the uid/gid intact. |
4273 | */ | 4280 | */ |
4274 | if (!ei->i_dtime) { | 4281 | if (!ei->i_dtime) { |
4275 | raw_inode->i_uid_high = | 4282 | raw_inode->i_uid_high = |
4276 | cpu_to_le16(high_16_bits(i_uid)); | 4283 | cpu_to_le16(high_16_bits(i_uid)); |
4277 | raw_inode->i_gid_high = | 4284 | raw_inode->i_gid_high = |
4278 | cpu_to_le16(high_16_bits(i_gid)); | 4285 | cpu_to_le16(high_16_bits(i_gid)); |
4279 | } else { | 4286 | } else { |
4280 | raw_inode->i_uid_high = 0; | 4287 | raw_inode->i_uid_high = 0; |
4281 | raw_inode->i_gid_high = 0; | 4288 | raw_inode->i_gid_high = 0; |
4282 | } | 4289 | } |
4283 | } else { | 4290 | } else { |
4284 | raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid)); | 4291 | raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid)); |
4285 | raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid)); | 4292 | raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid)); |
4286 | raw_inode->i_uid_high = 0; | 4293 | raw_inode->i_uid_high = 0; |
4287 | raw_inode->i_gid_high = 0; | 4294 | raw_inode->i_gid_high = 0; |
4288 | } | 4295 | } |
4289 | raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); | 4296 | raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); |
4290 | 4297 | ||
4291 | EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); | 4298 | EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); |
4292 | EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); | 4299 | EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); |
4293 | EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); | 4300 | EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); |
4294 | EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); | 4301 | EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); |
4295 | 4302 | ||
4296 | err = ext4_inode_blocks_set(handle, raw_inode, ei); | 4303 | err = ext4_inode_blocks_set(handle, raw_inode, ei); |
4297 | if (err) { | 4304 | if (err) { |
4298 | spin_unlock(&ei->i_raw_lock); | 4305 | spin_unlock(&ei->i_raw_lock); |
4299 | goto out_brelse; | 4306 | goto out_brelse; |
4300 | } | 4307 | } |
4301 | raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); | 4308 | raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); |
4302 | raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); | 4309 | raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); |
4303 | if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) | 4310 | if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) |
4304 | raw_inode->i_file_acl_high = | 4311 | raw_inode->i_file_acl_high = |
4305 | cpu_to_le16(ei->i_file_acl >> 32); | 4312 | cpu_to_le16(ei->i_file_acl >> 32); |
4306 | raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); | 4313 | raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); |
4307 | if (ei->i_disksize != ext4_isize(raw_inode)) { | 4314 | if (ei->i_disksize != ext4_isize(raw_inode)) { |
4308 | ext4_isize_set(raw_inode, ei->i_disksize); | 4315 | ext4_isize_set(raw_inode, ei->i_disksize); |
4309 | need_datasync = 1; | 4316 | need_datasync = 1; |
4310 | } | 4317 | } |
4311 | if (ei->i_disksize > 0x7fffffffULL) { | 4318 | if (ei->i_disksize > 0x7fffffffULL) { |
4312 | if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, | 4319 | if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, |
4313 | EXT4_FEATURE_RO_COMPAT_LARGE_FILE) || | 4320 | EXT4_FEATURE_RO_COMPAT_LARGE_FILE) || |
4314 | EXT4_SB(sb)->s_es->s_rev_level == | 4321 | EXT4_SB(sb)->s_es->s_rev_level == |
4315 | cpu_to_le32(EXT4_GOOD_OLD_REV)) | 4322 | cpu_to_le32(EXT4_GOOD_OLD_REV)) |
4316 | set_large_file = 1; | 4323 | set_large_file = 1; |
4317 | } | 4324 | } |
4318 | raw_inode->i_generation = cpu_to_le32(inode->i_generation); | 4325 | raw_inode->i_generation = cpu_to_le32(inode->i_generation); |
4319 | if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { | 4326 | if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { |
4320 | if (old_valid_dev(inode->i_rdev)) { | 4327 | if (old_valid_dev(inode->i_rdev)) { |
4321 | raw_inode->i_block[0] = | 4328 | raw_inode->i_block[0] = |
4322 | cpu_to_le32(old_encode_dev(inode->i_rdev)); | 4329 | cpu_to_le32(old_encode_dev(inode->i_rdev)); |
4323 | raw_inode->i_block[1] = 0; | 4330 | raw_inode->i_block[1] = 0; |
4324 | } else { | 4331 | } else { |
4325 | raw_inode->i_block[0] = 0; | 4332 | raw_inode->i_block[0] = 0; |
4326 | raw_inode->i_block[1] = | 4333 | raw_inode->i_block[1] = |
4327 | cpu_to_le32(new_encode_dev(inode->i_rdev)); | 4334 | cpu_to_le32(new_encode_dev(inode->i_rdev)); |
4328 | raw_inode->i_block[2] = 0; | 4335 | raw_inode->i_block[2] = 0; |
4329 | } | 4336 | } |
4330 | } else if (!ext4_has_inline_data(inode)) { | 4337 | } else if (!ext4_has_inline_data(inode)) { |
4331 | for (block = 0; block < EXT4_N_BLOCKS; block++) | 4338 | for (block = 0; block < EXT4_N_BLOCKS; block++) |
4332 | raw_inode->i_block[block] = ei->i_data[block]; | 4339 | raw_inode->i_block[block] = ei->i_data[block]; |
4333 | } | 4340 | } |
4334 | 4341 | ||
4335 | if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { | 4342 | if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { |
4336 | raw_inode->i_disk_version = cpu_to_le32(inode->i_version); | 4343 | raw_inode->i_disk_version = cpu_to_le32(inode->i_version); |
4337 | if (ei->i_extra_isize) { | 4344 | if (ei->i_extra_isize) { |
4338 | if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) | 4345 | if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) |
4339 | raw_inode->i_version_hi = | 4346 | raw_inode->i_version_hi = |
4340 | cpu_to_le32(inode->i_version >> 32); | 4347 | cpu_to_le32(inode->i_version >> 32); |
4341 | raw_inode->i_extra_isize = | 4348 | raw_inode->i_extra_isize = |
4342 | cpu_to_le16(ei->i_extra_isize); | 4349 | cpu_to_le16(ei->i_extra_isize); |
4343 | } | 4350 | } |
4344 | } | 4351 | } |
4345 | ext4_inode_csum_set(inode, raw_inode, ei); | 4352 | ext4_inode_csum_set(inode, raw_inode, ei); |
4346 | spin_unlock(&ei->i_raw_lock); | 4353 | spin_unlock(&ei->i_raw_lock); |
4347 | if (inode->i_sb->s_flags & MS_LAZYTIME) | 4354 | if (inode->i_sb->s_flags & MS_LAZYTIME) |
4348 | ext4_update_other_inodes_time(inode->i_sb, inode->i_ino, | 4355 | ext4_update_other_inodes_time(inode->i_sb, inode->i_ino, |
4349 | bh->b_data); | 4356 | bh->b_data); |
4350 | 4357 | ||
4351 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); | 4358 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); |
4352 | rc = ext4_handle_dirty_metadata(handle, NULL, bh); | 4359 | rc = ext4_handle_dirty_metadata(handle, NULL, bh); |
4353 | if (!err) | 4360 | if (!err) |
4354 | err = rc; | 4361 | err = rc; |
4355 | ext4_clear_inode_state(inode, EXT4_STATE_NEW); | 4362 | ext4_clear_inode_state(inode, EXT4_STATE_NEW); |
4356 | if (set_large_file) { | 4363 | if (set_large_file) { |
4357 | BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access"); | 4364 | BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access"); |
4358 | err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); | 4365 | err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); |
4359 | if (err) | 4366 | if (err) |
4360 | goto out_brelse; | 4367 | goto out_brelse; |
4361 | ext4_update_dynamic_rev(sb); | 4368 | ext4_update_dynamic_rev(sb); |
4362 | EXT4_SET_RO_COMPAT_FEATURE(sb, | 4369 | EXT4_SET_RO_COMPAT_FEATURE(sb, |
4363 | EXT4_FEATURE_RO_COMPAT_LARGE_FILE); | 4370 | EXT4_FEATURE_RO_COMPAT_LARGE_FILE); |
4364 | ext4_handle_sync(handle); | 4371 | ext4_handle_sync(handle); |
4365 | err = ext4_handle_dirty_super(handle, sb); | 4372 | err = ext4_handle_dirty_super(handle, sb); |
4366 | } | 4373 | } |
4367 | ext4_update_inode_fsync_trans(handle, inode, need_datasync); | 4374 | ext4_update_inode_fsync_trans(handle, inode, need_datasync); |
4368 | out_brelse: | 4375 | out_brelse: |
4369 | brelse(bh); | 4376 | brelse(bh); |
4370 | ext4_std_error(inode->i_sb, err); | 4377 | ext4_std_error(inode->i_sb, err); |
4371 | return err; | 4378 | return err; |
4372 | } | 4379 | } |
4373 | 4380 | ||
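The uid/gid handling above exists because the on-disk inode keeps 32-bit ids as two 16-bit halves (i_uid_low/i_uid_high) for compatibility with the old ext2 layout; low_16_bits() and high_16_bits() are just a mask and a shift. A quick illustration with an arbitrary example uid:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint32_t uid = 100000;                   /* example uid */
            uint16_t lo  = (uint16_t)(uid & 0xFFFF); /* 34464 */
            uint16_t hi  = (uint16_t)(uid >> 16);    /* 1 */

            printf("low=%u high=%u recombined=%u\n",
                   (unsigned)lo, (unsigned)hi,
                   ((uint32_t)hi << 16) | lo);       /* 100000 again */
            return 0;
    }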
4374 | /* | 4381 | /* |
4375 | * ext4_write_inode() | 4382 | * ext4_write_inode() |
4376 | * | 4383 | * |
4377 | * We are called from a few places: | 4384 | * We are called from a few places: |
4378 | * | 4385 | * |
4379 | * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. | 4386 | * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. |
4380 | * Here, there will be no transaction running. We wait for any running | 4387 | * Here, there will be no transaction running. We wait for any running |
4381 | * transaction to commit. | 4388 | * transaction to commit. |
4382 | * | 4389 | * |
4383 | * - Within flush work (sys_sync(), kupdate and such). | 4390 | * - Within flush work (sys_sync(), kupdate and such). |
4384 | * We wait on commit, if told to. | 4391 | * We wait on commit, if told to. |
4385 | * | 4392 | * |
4386 | * - Within iput_final() -> write_inode_now() | 4393 | * - Within iput_final() -> write_inode_now() |
4387 | * We wait on commit, if told to. | 4394 | * We wait on commit, if told to. |
4388 | * | 4395 | * |
4389 | * In all cases it is actually safe for us to return without doing anything, | 4396 | * In all cases it is actually safe for us to return without doing anything, |
4390 | * because the inode has been copied into a raw inode buffer in | 4397 | * because the inode has been copied into a raw inode buffer in |
4391 | * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL | 4398 | * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL |
4392 | * writeback. | 4399 | * writeback. |
4393 | * | 4400 | * |
4394 | * Note that we are absolutely dependent upon all inode dirtiers doing the | 4401 | * Note that we are absolutely dependent upon all inode dirtiers doing the |
4395 | * right thing: they *must* call mark_inode_dirty() after dirtying info in | 4402 | * right thing: they *must* call mark_inode_dirty() after dirtying info in |
4396 | * which we are interested. | 4403 | * which we are interested. |
4397 | * | 4404 | * |
4398 | * It would be a bug for them to not do this. The code: | 4405 | * It would be a bug for them to not do this. The code: |
4399 | * | 4406 | * |
4400 | * mark_inode_dirty(inode) | 4407 | * mark_inode_dirty(inode) |
4401 | * stuff(); | 4408 | * stuff(); |
4402 | * inode->i_size = expr; | 4409 | * inode->i_size = expr; |
4403 | * | 4410 | * |
4404 | * is in error because write_inode() could occur while `stuff()' is running, | 4411 | * is in error because write_inode() could occur while `stuff()' is running, |
4405 | * and the new i_size will be lost. Plus the inode will no longer be on the | 4412 | * and the new i_size will be lost. Plus the inode will no longer be on the |
4406 | * superblock's dirty inode list. | 4413 | * superblock's dirty inode list. |
4407 | */ | 4414 | */ |
4408 | int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) | 4415 | int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) |
4409 | { | 4416 | { |
4410 | int err; | 4417 | int err; |
4411 | 4418 | ||
4412 | if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) | 4419 | if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) |
4413 | return 0; | 4420 | return 0; |
4414 | 4421 | ||
4415 | if (EXT4_SB(inode->i_sb)->s_journal) { | 4422 | if (EXT4_SB(inode->i_sb)->s_journal) { |
4416 | if (ext4_journal_current_handle()) { | 4423 | if (ext4_journal_current_handle()) { |
4417 | jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); | 4424 | jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); |
4418 | dump_stack(); | 4425 | dump_stack(); |
4419 | return -EIO; | 4426 | return -EIO; |
4420 | } | 4427 | } |
4421 | 4428 | ||
4422 | /* | 4429 | /* |
4423 | * No need to force transaction in WB_SYNC_NONE mode. Also | 4430 | * No need to force transaction in WB_SYNC_NONE mode. Also |
4424 | * ext4_sync_fs() will force the commit after everything is | 4431 | * ext4_sync_fs() will force the commit after everything is |
4425 | * written. | 4432 | * written. |
4426 | */ | 4433 | */ |
4427 | if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) | 4434 | if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) |
4428 | return 0; | 4435 | return 0; |
4429 | 4436 | ||
4430 | err = ext4_force_commit(inode->i_sb); | 4437 | err = ext4_force_commit(inode->i_sb); |
4431 | } else { | 4438 | } else { |
4432 | struct ext4_iloc iloc; | 4439 | struct ext4_iloc iloc; |
4433 | 4440 | ||
4434 | err = __ext4_get_inode_loc(inode, &iloc, 0); | 4441 | err = __ext4_get_inode_loc(inode, &iloc, 0); |
4435 | if (err) | 4442 | if (err) |
4436 | return err; | 4443 | return err; |
4437 | /* | 4444 | /* |
4438 | * sync(2) will flush the whole buffer cache. No need to do | 4445 | * sync(2) will flush the whole buffer cache. No need to do |
4439 | * it here separately for each inode. | 4446 | * it here separately for each inode. |
4440 | */ | 4447 | */ |
4441 | if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) | 4448 | if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) |
4442 | sync_dirty_buffer(iloc.bh); | 4449 | sync_dirty_buffer(iloc.bh); |
4443 | if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { | 4450 | if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { |
4444 | EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, | 4451 | EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, |
4445 | "IO error syncing inode"); | 4452 | "IO error syncing inode"); |
4446 | err = -EIO; | 4453 | err = -EIO; |
4447 | } | 4454 | } |
4448 | brelse(iloc.bh); | 4455 | brelse(iloc.bh); |
4449 | } | 4456 | } |
4450 | return err; | 4457 | return err; |
4451 | } | 4458 | } |
4452 | 4459 | ||
4453 | /* | 4460 | /* |
4454 | * In data=journal mode ext4_journalled_invalidatepage() may fail to invalidate | 4461 | * In data=journal mode ext4_journalled_invalidatepage() may fail to invalidate |
4455 | * buffers that are attached to a page straddling i_size and are undergoing | 4462 | * buffers that are attached to a page straddling i_size and are undergoing |
4456 | * commit. In that case we have to wait for the commit to finish and try again. | 4463 | * commit. In that case we have to wait for the commit to finish and try again. |
4457 | */ | 4464 | */ |
4458 | static void ext4_wait_for_tail_page_commit(struct inode *inode) | 4465 | static void ext4_wait_for_tail_page_commit(struct inode *inode) |
4459 | { | 4466 | { |
4460 | struct page *page; | 4467 | struct page *page; |
4461 | unsigned offset; | 4468 | unsigned offset; |
4462 | journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; | 4469 | journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; |
4463 | tid_t commit_tid = 0; | 4470 | tid_t commit_tid = 0; |
4464 | int ret; | 4471 | int ret; |
4465 | 4472 | ||
4466 | offset = inode->i_size & (PAGE_CACHE_SIZE - 1); | 4473 | offset = inode->i_size & (PAGE_CACHE_SIZE - 1); |
4467 | /* | 4474 | /* |
4468 | * All buffers in the last page remain valid? Then there's nothing to | 4475 | * All buffers in the last page remain valid? Then there's nothing to |
4469 | * do. We do the check mainly to optimize the common PAGE_CACHE_SIZE == | 4476 | * do. We do the check mainly to optimize the common PAGE_CACHE_SIZE == |
4470 | * blocksize case | 4477 | * blocksize case |
4471 | */ | 4478 | */ |
4472 | if (offset > PAGE_CACHE_SIZE - (1 << inode->i_blkbits)) | 4479 | if (offset > PAGE_CACHE_SIZE - (1 << inode->i_blkbits)) |
4473 | return; | 4480 | return; |
4474 | while (1) { | 4481 | while (1) { |
4475 | page = find_lock_page(inode->i_mapping, | 4482 | page = find_lock_page(inode->i_mapping, |
4476 | inode->i_size >> PAGE_CACHE_SHIFT); | 4483 | inode->i_size >> PAGE_CACHE_SHIFT); |
4477 | if (!page) | 4484 | if (!page) |
4478 | return; | 4485 | return; |
4479 | ret = __ext4_journalled_invalidatepage(page, offset, | 4486 | ret = __ext4_journalled_invalidatepage(page, offset, |
4480 | PAGE_CACHE_SIZE - offset); | 4487 | PAGE_CACHE_SIZE - offset); |
4481 | unlock_page(page); | 4488 | unlock_page(page); |
4482 | page_cache_release(page); | 4489 | page_cache_release(page); |
4483 | if (ret != -EBUSY) | 4490 | if (ret != -EBUSY) |
4484 | return; | 4491 | return; |
4485 | commit_tid = 0; | 4492 | commit_tid = 0; |
4486 | read_lock(&journal->j_state_lock); | 4493 | read_lock(&journal->j_state_lock); |
4487 | if (journal->j_committing_transaction) | 4494 | if (journal->j_committing_transaction) |
4488 | commit_tid = journal->j_committing_transaction->t_tid; | 4495 | commit_tid = journal->j_committing_transaction->t_tid; |
4489 | read_unlock(&journal->j_state_lock); | 4496 | read_unlock(&journal->j_state_lock); |
4490 | if (commit_tid) | 4497 | if (commit_tid) |
4491 | jbd2_log_wait_commit(journal, commit_tid); | 4498 | jbd2_log_wait_commit(journal, commit_tid); |
4492 | } | 4499 | } |
4493 | } | 4500 | } |
4494 | 4501 | ||
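A worked trace of the tail-page check above, with illustrative values and assuming PAGE_CACHE_SIZE == 4096:

	/*
	 * i_size = 10000, blocksize = 4096 (i_blkbits = 12):
	 *   offset = 10000 & 4095 = 1808
	 *   1808 > 4096 - 4096 = 0, so we return at once; the single
	 *   buffer in the tail page straddles i_size and stays valid.
	 *
	 * i_size = 10000, blocksize = 1024 (i_blkbits = 10):
	 *   offset = 1808, and 1808 > 4096 - 1024 = 3072 is false, so
	 *   the buffers from byte 1808 onward are invalidated, retrying
	 *   for as long as a running commit keeps them busy (-EBUSY).
	 */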
4495 | /* | 4502 | /* |
4496 | * ext4_setattr() | 4503 | * ext4_setattr() |
4497 | * | 4504 | * |
4498 | * Called from notify_change. | 4505 | * Called from notify_change. |
4499 | * | 4506 | * |
4500 | * We want to trap VFS attempts to truncate the file as soon as | 4507 | * We want to trap VFS attempts to truncate the file as soon as |
4501 | * possible. In particular, we want to make sure that when the VFS | 4508 | * possible. In particular, we want to make sure that when the VFS |
4502 | * shrinks i_size, we put the inode on the orphan list and modify | 4509 | * shrinks i_size, we put the inode on the orphan list and modify |
4503 | * i_disksize immediately, so that during the subsequent flushing of | 4510 | * i_disksize immediately, so that during the subsequent flushing of |
4504 | * dirty pages and freeing of disk blocks, we can guarantee that any | 4511 | * dirty pages and freeing of disk blocks, we can guarantee that any |
4505 | * commit will leave the blocks being flushed in an unused state on | 4512 | * commit will leave the blocks being flushed in an unused state on |
4506 | * disk. (On recovery, the inode will get truncated and the blocks will | 4513 | * disk. (On recovery, the inode will get truncated and the blocks will |
4507 | * be freed, so we have a strong guarantee that no future commit will | 4514 | * be freed, so we have a strong guarantee that no future commit will |
4508 | * leave these blocks visible to the user.) | 4515 | * leave these blocks visible to the user.) |
4509 | * | 4516 | * |
4510 | * Another thing we have to assure is that if we are in ordered mode | 4517 | * Another thing we have to assure is that if we are in ordered mode |
4511 | * and the inode is still attached to the committing transaction, we must | 4518 | * and the inode is still attached to the committing transaction, we must |
4512 | * start writeout of all the dirty pages which are being truncated. | 4519 | * start writeout of all the dirty pages which are being truncated. |
4513 | * This way we are sure that all the data written in the previous | 4520 | * This way we are sure that all the data written in the previous |
4514 | * transaction are already on disk (truncate waits for pages under | 4521 | * transaction are already on disk (truncate waits for pages under |
4515 | * writeback). | 4522 | * writeback). |
4516 | * | 4523 | * |
4517 | * Called with inode->i_mutex down. | 4524 | * Called with inode->i_mutex down. |
4518 | */ | 4525 | */ |
4519 | int ext4_setattr(struct dentry *dentry, struct iattr *attr) | 4526 | int ext4_setattr(struct dentry *dentry, struct iattr *attr) |
4520 | { | 4527 | { |
4521 | struct inode *inode = dentry->d_inode; | 4528 | struct inode *inode = dentry->d_inode; |
4522 | int error, rc = 0; | 4529 | int error, rc = 0; |
4523 | int orphan = 0; | 4530 | int orphan = 0; |
4524 | const unsigned int ia_valid = attr->ia_valid; | 4531 | const unsigned int ia_valid = attr->ia_valid; |
4525 | 4532 | ||
4526 | error = inode_change_ok(inode, attr); | 4533 | error = inode_change_ok(inode, attr); |
4527 | if (error) | 4534 | if (error) |
4528 | return error; | 4535 | return error; |
4529 | 4536 | ||
4530 | if (is_quota_modification(inode, attr)) | 4537 | if (is_quota_modification(inode, attr)) |
4531 | dquot_initialize(inode); | 4538 | dquot_initialize(inode); |
4532 | if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || | 4539 | if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || |
4533 | (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { | 4540 | (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { |
4534 | handle_t *handle; | 4541 | handle_t *handle; |
4535 | 4542 | ||
4536 | /* (user+group)*(old+new) structure, inode write (sb, | 4543 | /* (user+group)*(old+new) structure, inode write (sb, |
4537 | * inode block, ? - but truncate inode update has it) */ | 4544 | * inode block, ? - but truncate inode update has it) */ |
4538 | handle = ext4_journal_start(inode, EXT4_HT_QUOTA, | 4545 | handle = ext4_journal_start(inode, EXT4_HT_QUOTA, |
4539 | (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) + | 4546 | (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) + |
4540 | EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3); | 4547 | EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3); |
4541 | if (IS_ERR(handle)) { | 4548 | if (IS_ERR(handle)) { |
4542 | error = PTR_ERR(handle); | 4549 | error = PTR_ERR(handle); |
4543 | goto err_out; | 4550 | goto err_out; |
4544 | } | 4551 | } |
4545 | error = dquot_transfer(inode, attr); | 4552 | error = dquot_transfer(inode, attr); |
4546 | if (error) { | 4553 | if (error) { |
4547 | ext4_journal_stop(handle); | 4554 | ext4_journal_stop(handle); |
4548 | return error; | 4555 | return error; |
4549 | } | 4556 | } |
4550 | /* Update corresponding info in inode so that everything is in | 4557 | /* Update corresponding info in inode so that everything is in |
4551 | * one transaction */ | 4558 | * one transaction */ |
4552 | if (attr->ia_valid & ATTR_UID) | 4559 | if (attr->ia_valid & ATTR_UID) |
4553 | inode->i_uid = attr->ia_uid; | 4560 | inode->i_uid = attr->ia_uid; |
4554 | if (attr->ia_valid & ATTR_GID) | 4561 | if (attr->ia_valid & ATTR_GID) |
4555 | inode->i_gid = attr->ia_gid; | 4562 | inode->i_gid = attr->ia_gid; |
4556 | error = ext4_mark_inode_dirty(handle, inode); | 4563 | error = ext4_mark_inode_dirty(handle, inode); |
4557 | ext4_journal_stop(handle); | 4564 | ext4_journal_stop(handle); |
4558 | } | 4565 | } |
4559 | 4566 | ||
4560 | if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { | 4567 | if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { |
4561 | handle_t *handle; | 4568 | handle_t *handle; |
4562 | 4569 | ||
4563 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { | 4570 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { |
4564 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 4571 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
4565 | 4572 | ||
4566 | if (attr->ia_size > sbi->s_bitmap_maxbytes) | 4573 | if (attr->ia_size > sbi->s_bitmap_maxbytes) |
4567 | return -EFBIG; | 4574 | return -EFBIG; |
4568 | } | 4575 | } |
4569 | 4576 | ||
4570 | if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size) | 4577 | if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size) |
4571 | inode_inc_iversion(inode); | 4578 | inode_inc_iversion(inode); |
4572 | 4579 | ||
4573 | if (S_ISREG(inode->i_mode) && | 4580 | if (S_ISREG(inode->i_mode) && |
4574 | (attr->ia_size < inode->i_size)) { | 4581 | (attr->ia_size < inode->i_size)) { |
4575 | if (ext4_should_order_data(inode)) { | 4582 | if (ext4_should_order_data(inode)) { |
4576 | error = ext4_begin_ordered_truncate(inode, | 4583 | error = ext4_begin_ordered_truncate(inode, |
4577 | attr->ia_size); | 4584 | attr->ia_size); |
4578 | if (error) | 4585 | if (error) |
4579 | goto err_out; | 4586 | goto err_out; |
4580 | } | 4587 | } |
4581 | handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); | 4588 | handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); |
4582 | if (IS_ERR(handle)) { | 4589 | if (IS_ERR(handle)) { |
4583 | error = PTR_ERR(handle); | 4590 | error = PTR_ERR(handle); |
4584 | goto err_out; | 4591 | goto err_out; |
4585 | } | 4592 | } |
4586 | if (ext4_handle_valid(handle)) { | 4593 | if (ext4_handle_valid(handle)) { |
4587 | error = ext4_orphan_add(handle, inode); | 4594 | error = ext4_orphan_add(handle, inode); |
4588 | orphan = 1; | 4595 | orphan = 1; |
4589 | } | 4596 | } |
4590 | down_write(&EXT4_I(inode)->i_data_sem); | 4597 | down_write(&EXT4_I(inode)->i_data_sem); |
4591 | EXT4_I(inode)->i_disksize = attr->ia_size; | 4598 | EXT4_I(inode)->i_disksize = attr->ia_size; |
4592 | rc = ext4_mark_inode_dirty(handle, inode); | 4599 | rc = ext4_mark_inode_dirty(handle, inode); |
4593 | if (!error) | 4600 | if (!error) |
4594 | error = rc; | 4601 | error = rc; |
4595 | /* | 4602 | /* |
4596 | * We have to update i_size under i_data_sem together | 4603 | * We have to update i_size under i_data_sem together |
4597 | * with i_disksize to avoid races with writeback code | 4604 | * with i_disksize to avoid races with writeback code |
4598 | * running ext4_wb_update_i_disksize(). | 4605 | * running ext4_wb_update_i_disksize(). |
4599 | */ | 4606 | */ |
4600 | if (!error) | 4607 | if (!error) |
4601 | i_size_write(inode, attr->ia_size); | 4608 | i_size_write(inode, attr->ia_size); |
4602 | up_write(&EXT4_I(inode)->i_data_sem); | 4609 | up_write(&EXT4_I(inode)->i_data_sem); |
4603 | ext4_journal_stop(handle); | 4610 | ext4_journal_stop(handle); |
4604 | if (error) { | 4611 | if (error) { |
4605 | ext4_orphan_del(NULL, inode); | 4612 | ext4_orphan_del(NULL, inode); |
4606 | goto err_out; | 4613 | goto err_out; |
4607 | } | 4614 | } |
4608 | } else { | 4615 | } else { |
4609 | loff_t oldsize = inode->i_size; | 4616 | loff_t oldsize = inode->i_size; |
4610 | 4617 | ||
4611 | i_size_write(inode, attr->ia_size); | 4618 | i_size_write(inode, attr->ia_size); |
4612 | pagecache_isize_extended(inode, oldsize, inode->i_size); | 4619 | pagecache_isize_extended(inode, oldsize, inode->i_size); |
4613 | } | 4620 | } |
4614 | 4621 | ||
4615 | /* | 4622 | /* |
4616 | * Blocks are going to be removed from the inode. Wait | 4623 | * Blocks are going to be removed from the inode. Wait |
4617 | * for dio in flight. Temporarily disable | 4624 | * for dio in flight. Temporarily disable |
4618 | * dioread_nolock to prevent livelock. | 4625 | * dioread_nolock to prevent livelock. |
4619 | */ | 4626 | */ |
4620 | if (orphan) { | 4627 | if (orphan) { |
4621 | if (!ext4_should_journal_data(inode)) { | 4628 | if (!ext4_should_journal_data(inode)) { |
4622 | ext4_inode_block_unlocked_dio(inode); | 4629 | ext4_inode_block_unlocked_dio(inode); |
4623 | inode_dio_wait(inode); | 4630 | inode_dio_wait(inode); |
4624 | ext4_inode_resume_unlocked_dio(inode); | 4631 | ext4_inode_resume_unlocked_dio(inode); |
4625 | } else | 4632 | } else |
4626 | ext4_wait_for_tail_page_commit(inode); | 4633 | ext4_wait_for_tail_page_commit(inode); |
4627 | } | 4634 | } |
4628 | /* | 4635 | /* |
4629 | * Truncate pagecache after we've waited for commit | 4636 | * Truncate pagecache after we've waited for commit |
4630 | * in data=journal mode to make pages freeable. | 4637 | * in data=journal mode to make pages freeable. |
4631 | */ | 4638 | */ |
4632 | truncate_pagecache(inode, inode->i_size); | 4639 | truncate_pagecache(inode, inode->i_size); |
4633 | } | 4640 | } |
4634 | /* | 4641 | /* |
4635 | * We want to call ext4_truncate() even if attr->ia_size == | 4642 | * We want to call ext4_truncate() even if attr->ia_size == |
4636 | * inode->i_size for cases like truncation of fallocated space | 4643 | * inode->i_size for cases like truncation of fallocated space |
4637 | */ | 4644 | */ |
4638 | if (attr->ia_valid & ATTR_SIZE) | 4645 | if (attr->ia_valid & ATTR_SIZE) |
4639 | ext4_truncate(inode); | 4646 | ext4_truncate(inode); |
4640 | 4647 | ||
4641 | if (!rc) { | 4648 | if (!rc) { |
4642 | setattr_copy(inode, attr); | 4649 | setattr_copy(inode, attr); |
4643 | mark_inode_dirty(inode); | 4650 | mark_inode_dirty(inode); |
4644 | } | 4651 | } |
4645 | 4652 | ||
4646 | /* | 4653 | /* |
4647 | * If the call to ext4_truncate failed to get a transaction handle at | 4654 | * If the call to ext4_truncate failed to get a transaction handle at |
4648 | * all, we need to clean up the in-core orphan list manually. | 4655 | * all, we need to clean up the in-core orphan list manually. |
4649 | */ | 4656 | */ |
4650 | if (orphan && inode->i_nlink) | 4657 | if (orphan && inode->i_nlink) |
4651 | ext4_orphan_del(NULL, inode); | 4658 | ext4_orphan_del(NULL, inode); |
4652 | 4659 | ||
4653 | if (!rc && (ia_valid & ATTR_MODE)) | 4660 | if (!rc && (ia_valid & ATTR_MODE)) |
4654 | rc = posix_acl_chmod(inode, inode->i_mode); | 4661 | rc = posix_acl_chmod(inode, inode->i_mode); |
4655 | 4662 | ||
4656 | err_out: | 4663 | err_out: |
4657 | ext4_std_error(inode->i_sb, error); | 4664 | ext4_std_error(inode->i_sb, error); |
4658 | if (!error) | 4665 | if (!error) |
4659 | error = rc; | 4666 | error = rc; |
4660 | return error; | 4667 | return error; |
4661 | } | 4668 | } |
4662 | 4669 | ||
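To make the entry point concrete: a shrinking ftruncate(2) reaches ext4_setattr() via notify_change() with ATTR_SIZE set. A hypothetical userspace sketch (shrink() is illustrative, not part of ext4):

	#include <fcntl.h>
	#include <unistd.h>

	int shrink(const char *path, off_t newsize)
	{
		int fd = open(path, O_WRONLY);
		int ret;

		if (fd < 0)
			return -1;
		/* VFS: do_truncate() -> notify_change() -> ext4_setattr();
		 * the inode goes on the orphan list before blocks are freed. */
		ret = ftruncate(fd, newsize);
		close(fd);
		return ret;
	}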
4663 | int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, | 4670 | int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, |
4664 | struct kstat *stat) | 4671 | struct kstat *stat) |
4665 | { | 4672 | { |
4666 | struct inode *inode; | 4673 | struct inode *inode; |
4667 | unsigned long long delalloc_blocks; | 4674 | unsigned long long delalloc_blocks; |
4668 | 4675 | ||
4669 | inode = dentry->d_inode; | 4676 | inode = dentry->d_inode; |
4670 | generic_fillattr(inode, stat); | 4677 | generic_fillattr(inode, stat); |
4671 | 4678 | ||
4672 | /* | 4679 | /* |
4673 | * If there is inline data in the inode, the inode will normally not | 4680 | * If there is inline data in the inode, the inode will normally not |
4674 | * have data blocks allocated (it may have an external xattr block). | 4681 | * have data blocks allocated (it may have an external xattr block). |
4675 | * Report at least one sector for such files, so tools like tar, rsync, | 4682 | * Report at least one sector for such files, so tools like tar, rsync, |
4676 | * and others don't incorrectly think the file is completely sparse. | 4683 | * and others don't incorrectly think the file is completely sparse. |
4677 | */ | 4684 | */ |
4678 | if (unlikely(ext4_has_inline_data(inode))) | 4685 | if (unlikely(ext4_has_inline_data(inode))) |
4679 | stat->blocks += (stat->size + 511) >> 9; | 4686 | stat->blocks += (stat->size + 511) >> 9; |
4680 | 4687 | ||
4681 | /* | 4688 | /* |
4682 | * We can't update i_blocks if the block allocation is delayed | 4689 | * We can't update i_blocks if the block allocation is delayed |
4683 | * otherwise, in the case of a system crash before the real block | 4690 | * otherwise, in the case of a system crash before the real block |
4684 | * allocation is done, we will have i_blocks inconsistent with | 4691 | * allocation is done, we will have i_blocks inconsistent with |
4685 | * on-disk file blocks. | 4692 | * on-disk file blocks. |
4686 | * We always keep i_blocks updated together with real | 4693 | * We always keep i_blocks updated together with real |
4687 | * allocation. But to avoid confusing users, stat | 4694 | * allocation. But to avoid confusing users, stat |
4688 | * will return the blocks that include the delayed allocation | 4695 | * will return the blocks that include the delayed allocation |
4689 | * blocks for this file. | 4696 | * blocks for this file. |
4690 | */ | 4697 | */ |
4691 | delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb), | 4698 | delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb), |
4692 | EXT4_I(inode)->i_reserved_data_blocks); | 4699 | EXT4_I(inode)->i_reserved_data_blocks); |
4693 | stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits - 9); | 4700 | stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits - 9); |
4694 | return 0; | 4701 | return 0; |
4695 | } | 4702 | } |
4696 | 4703 | ||
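A worked example of the two adjustments above, assuming 4k blocks (s_blocksize_bits = 12):

	/*
	 * A 60-byte inline-data file reports (60 + 511) >> 9 = 1 sector
	 * even though no data blocks are allocated. A file with 3
	 * reserved delalloc blocks additionally reports
	 * 3 << (12 - 9) = 24 extra 512-byte sectors in stat->blocks.
	 */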
4697 | static int ext4_index_trans_blocks(struct inode *inode, int lblocks, | 4704 | static int ext4_index_trans_blocks(struct inode *inode, int lblocks, |
4698 | int pextents) | 4705 | int pextents) |
4699 | { | 4706 | { |
4700 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) | 4707 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) |
4701 | return ext4_ind_trans_blocks(inode, lblocks); | 4708 | return ext4_ind_trans_blocks(inode, lblocks); |
4702 | return ext4_ext_index_trans_blocks(inode, pextents); | 4709 | return ext4_ext_index_trans_blocks(inode, pextents); |
4703 | } | 4710 | } |
4704 | 4711 | ||
4705 | /* | 4712 | /* |
4706 | * Account for index blocks, block group bitmaps and block group | 4713 | * Account for index blocks, block group bitmaps and block group |
4707 | * descriptor blocks if we modify data blocks and index blocks. In the | 4714 | * descriptor blocks if we modify data blocks and index blocks. In the |
4708 | * worst case, the index blocks spread over different block groups. | 4715 | * worst case, the index blocks spread over different block groups. |
4709 | * | 4716 | * |
4710 | * If data blocks are discontiguous, they may spread over | 4717 | * If data blocks are discontiguous, they may spread over |
4711 | * different block groups too. If they are contiguous, with flexbg | 4718 | * different block groups too. If they are contiguous, with flexbg |
4712 | * they could still cross a block group boundary. | 4719 | * they could still cross a block group boundary. |
4713 | * | 4720 | * |
4714 | * Also account for superblock, inode, quota and xattr blocks | 4721 | * Also account for superblock, inode, quota and xattr blocks |
4715 | */ | 4722 | */ |
4716 | static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, | 4723 | static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, |
4717 | int pextents) | 4724 | int pextents) |
4718 | { | 4725 | { |
4719 | ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); | 4726 | ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); |
4720 | int gdpblocks; | 4727 | int gdpblocks; |
4721 | int idxblocks; | 4728 | int idxblocks; |
4722 | int ret = 0; | 4729 | int ret = 0; |
4723 | 4730 | ||
4724 | /* | 4731 | /* |
4725 | * How many index blocks do we need to touch to map @lblocks logical blocks | 4732 | * How many index blocks do we need to touch to map @lblocks logical blocks |
4726 | * to @pextents physical extents? | 4733 | * to @pextents physical extents? |
4727 | */ | 4734 | */ |
4728 | idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents); | 4735 | idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents); |
4729 | 4736 | ||
4730 | ret = idxblocks; | 4737 | ret = idxblocks; |
4731 | 4738 | ||
4732 | /* | 4739 | /* |
4733 | * Now let's see how many group bitmaps and group descriptors need | 4740 | * Now let's see how many group bitmaps and group descriptors need |
4734 | * to be accounted for | 4741 | * to be accounted for |
4735 | */ | 4742 | */ |
4736 | groups = idxblocks + pextents; | 4743 | groups = idxblocks + pextents; |
4737 | gdpblocks = groups; | 4744 | gdpblocks = groups; |
4738 | if (groups > ngroups) | 4745 | if (groups > ngroups) |
4739 | groups = ngroups; | 4746 | groups = ngroups; |
4740 | if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) | 4747 | if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) |
4741 | gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; | 4748 | gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; |
4742 | 4749 | ||
4743 | /* bitmaps and block group descriptor blocks */ | 4750 | /* bitmaps and block group descriptor blocks */ |
4744 | ret += groups + gdpblocks; | 4751 | ret += groups + gdpblocks; |
4745 | 4752 | ||
4746 | /* Blocks for super block, inode, quota and xattr blocks */ | 4753 | /* Blocks for super block, inode, quota and xattr blocks */ |
4747 | ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); | 4754 | ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); |
4748 | 4755 | ||
4749 | return ret; | 4756 | return ret; |
4750 | } | 4757 | } |
4751 | 4758 | ||
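A worked trace of the accounting above, with illustrative inputs idxblocks = 3, pextents = 1, ngroups = 16 and s_gdb_count = 1:

	/*
	 * groups    = 3 + 1 = 4      (<= ngroups, so left unchanged)
	 * gdpblocks = min(4, s_gdb_count) = 1
	 * ret       = 3 + (4 + 1) + EXT4_META_TRANS_BLOCKS(sb)
	 *
	 * i.e. the index blocks, one bitmap per touched group, the
	 * (capped) descriptor blocks, and the fixed superblock/inode/
	 * quota/xattr overhead.
	 */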
4752 | /* | 4759 | /* |
4753 | * Calculate the total number of credits to reserve to fit | 4760 | * Calculate the total number of credits to reserve to fit |
4754 | * the modification of a single page into a single transaction, | 4761 | * the modification of a single page into a single transaction, |
4755 | * which may include multiple chunks of block allocations. | 4762 | * which may include multiple chunks of block allocations. |
4756 | * | 4763 | * |
4757 | * This could be called via ext4_write_begin() | 4764 | * This could be called via ext4_write_begin() |
4758 | * | 4765 | * |
4759 | * We need to consider the worst case, when we allocate | 4766 | * We need to consider the worst case, when we allocate |
4760 | * one new block per extent. | 4767 | * one new block per extent. |
4761 | */ | 4768 | */ |
4762 | int ext4_writepage_trans_blocks(struct inode *inode) | 4769 | int ext4_writepage_trans_blocks(struct inode *inode) |
4763 | { | 4770 | { |
4764 | int bpp = ext4_journal_blocks_per_page(inode); | 4771 | int bpp = ext4_journal_blocks_per_page(inode); |
4765 | int ret; | 4772 | int ret; |
4766 | 4773 | ||
4767 | ret = ext4_meta_trans_blocks(inode, bpp, bpp); | 4774 | ret = ext4_meta_trans_blocks(inode, bpp, bpp); |
4768 | 4775 | ||
4769 | /* Account for data blocks for journalled mode */ | 4776 | /* Account for data blocks for journalled mode */ |
4770 | if (ext4_should_journal_data(inode)) | 4777 | if (ext4_should_journal_data(inode)) |
4771 | ret += bpp; | 4778 | ret += bpp; |
4772 | return ret; | 4779 | return ret; |
4773 | } | 4780 | } |
4774 | 4781 | ||
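For example (illustrative numbers): with 4k pages and 1k blocks, ext4_journal_blocks_per_page() yields bpp = 4, so the reservation covers the worst case of four separately allocated extents for one page, plus four extra data-block credits in data=journal mode.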
4775 | /* | 4782 | /* |
4776 | * Calculate the journal credits for a chunk of data modification. | 4783 | * Calculate the journal credits for a chunk of data modification. |
4777 | * | 4784 | * |
4778 | * This is called from DIO, fallocate or whoever else calls | 4785 | * This is called from DIO, fallocate or whoever else calls |
4779 | * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks. | 4786 | * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks. |
4780 | * | 4787 | * |
4781 | * journal buffers for data blocks are not included here, as DIO | 4788 | * journal buffers for data blocks are not included here, as DIO |
4782 | * and fallocate do not need to journal data buffers. | 4789 | * and fallocate do not need to journal data buffers. |
4783 | */ | 4790 | */ |
4784 | int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) | 4791 | int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) |
4785 | { | 4792 | { |
4786 | return ext4_meta_trans_blocks(inode, nrblocks, 1); | 4793 | return ext4_meta_trans_blocks(inode, nrblocks, 1); |
4787 | } | 4794 | } |
4788 | 4795 | ||
4789 | /* | 4796 | /* |
4790 | * The caller must have previously called ext4_reserve_inode_write(). | 4797 | * The caller must have previously called ext4_reserve_inode_write(). |
4791 | * Given this, we know that the caller already has write access to iloc->bh. | 4798 | * Given this, we know that the caller already has write access to iloc->bh. |
4792 | */ | 4799 | */ |
4793 | int ext4_mark_iloc_dirty(handle_t *handle, | 4800 | int ext4_mark_iloc_dirty(handle_t *handle, |
4794 | struct inode *inode, struct ext4_iloc *iloc) | 4801 | struct inode *inode, struct ext4_iloc *iloc) |
4795 | { | 4802 | { |
4796 | int err = 0; | 4803 | int err = 0; |
4797 | 4804 | ||
4798 | if (IS_I_VERSION(inode)) | 4805 | if (IS_I_VERSION(inode)) |
4799 | inode_inc_iversion(inode); | 4806 | inode_inc_iversion(inode); |
4800 | 4807 | ||
4801 | /* the do_update_inode consumes one bh->b_count */ | 4808 | /* the do_update_inode consumes one bh->b_count */ |
4802 | get_bh(iloc->bh); | 4809 | get_bh(iloc->bh); |
4803 | 4810 | ||
4804 | /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ | 4811 | /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ |
4805 | err = ext4_do_update_inode(handle, inode, iloc); | 4812 | err = ext4_do_update_inode(handle, inode, iloc); |
4806 | put_bh(iloc->bh); | 4813 | put_bh(iloc->bh); |
4807 | return err; | 4814 | return err; |
4808 | } | 4815 | } |
4809 | 4816 | ||
4810 | /* | 4817 | /* |
4811 | * On success, we end up with an outstanding reference count against | 4818 | * On success, we end up with an outstanding reference count against |
4812 | * iloc->bh. This _must_ be cleaned up later. | 4819 | * iloc->bh. This _must_ be cleaned up later. |
4813 | */ | 4820 | */ |
4814 | 4821 | ||
4815 | int | 4822 | int |
4816 | ext4_reserve_inode_write(handle_t *handle, struct inode *inode, | 4823 | ext4_reserve_inode_write(handle_t *handle, struct inode *inode, |
4817 | struct ext4_iloc *iloc) | 4824 | struct ext4_iloc *iloc) |
4818 | { | 4825 | { |
4819 | int err; | 4826 | int err; |
4820 | 4827 | ||
4821 | err = ext4_get_inode_loc(inode, iloc); | 4828 | err = ext4_get_inode_loc(inode, iloc); |
4822 | if (!err) { | 4829 | if (!err) { |
4823 | BUFFER_TRACE(iloc->bh, "get_write_access"); | 4830 | BUFFER_TRACE(iloc->bh, "get_write_access"); |
4824 | err = ext4_journal_get_write_access(handle, iloc->bh); | 4831 | err = ext4_journal_get_write_access(handle, iloc->bh); |
4825 | if (err) { | 4832 | if (err) { |
4826 | brelse(iloc->bh); | 4833 | brelse(iloc->bh); |
4827 | iloc->bh = NULL; | 4834 | iloc->bh = NULL; |
4828 | } | 4835 | } |
4829 | } | 4836 | } |
4830 | ext4_std_error(inode->i_sb, err); | 4837 | ext4_std_error(inode->i_sb, err); |
4831 | return err; | 4838 | return err; |
4832 | } | 4839 | } |
4833 | 4840 | ||
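The intended calling pattern for the pair above, sketched with error handling trimmed (example_touch_inode() is a hypothetical helper mirroring how ext4_mark_inode_dirty() uses them):

	static int example_touch_inode(handle_t *handle, struct inode *inode)
	{
		struct ext4_iloc iloc;
		int err;

		err = ext4_reserve_inode_write(handle, inode, &iloc);
		if (err)
			return err;
		/* ... modify in-core inode fields under the handle ... */
		return ext4_mark_iloc_dirty(handle, inode, &iloc); /* drops bh ref */
	}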
4834 | /* | 4841 | /* |
4835 | * Expand an inode by new_extra_isize bytes. | 4842 | * Expand an inode by new_extra_isize bytes. |
4836 | * Returns 0 on success or negative error number on failure. | 4843 | * Returns 0 on success or negative error number on failure. |
4837 | */ | 4844 | */ |
4838 | static int ext4_expand_extra_isize(struct inode *inode, | 4845 | static int ext4_expand_extra_isize(struct inode *inode, |
4839 | unsigned int new_extra_isize, | 4846 | unsigned int new_extra_isize, |
4840 | struct ext4_iloc iloc, | 4847 | struct ext4_iloc iloc, |
4841 | handle_t *handle) | 4848 | handle_t *handle) |
4842 | { | 4849 | { |
4843 | struct ext4_inode *raw_inode; | 4850 | struct ext4_inode *raw_inode; |
4844 | struct ext4_xattr_ibody_header *header; | 4851 | struct ext4_xattr_ibody_header *header; |
4845 | 4852 | ||
4846 | if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) | 4853 | if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) |
4847 | return 0; | 4854 | return 0; |
4848 | 4855 | ||
4849 | raw_inode = ext4_raw_inode(&iloc); | 4856 | raw_inode = ext4_raw_inode(&iloc); |
4850 | 4857 | ||
4851 | header = IHDR(inode, raw_inode); | 4858 | header = IHDR(inode, raw_inode); |
4852 | 4859 | ||
4853 | /* No extended attributes present */ | 4860 | /* No extended attributes present */ |
4854 | if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || | 4861 | if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || |
4855 | header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { | 4862 | header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { |
4856 | memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, | 4863 | memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, |
4857 | new_extra_isize); | 4864 | new_extra_isize); |
4858 | EXT4_I(inode)->i_extra_isize = new_extra_isize; | 4865 | EXT4_I(inode)->i_extra_isize = new_extra_isize; |
4859 | return 0; | 4866 | return 0; |
4860 | } | 4867 | } |
4861 | 4868 | ||
4862 | /* try to expand with EAs present */ | 4869 | /* try to expand with EAs present */ |
4863 | return ext4_expand_extra_isize_ea(inode, new_extra_isize, | 4870 | return ext4_expand_extra_isize_ea(inode, new_extra_isize, |
4864 | raw_inode, handle); | 4871 | raw_inode, handle); |
4865 | } | 4872 | } |
4866 | 4873 | ||
4867 | /* | 4874 | /* |
4868 | * What we do here is to mark the in-core inode as clean with respect to inode | 4875 | * What we do here is to mark the in-core inode as clean with respect to inode |
4869 | * dirtiness (it may still be data-dirty). | 4876 | * dirtiness (it may still be data-dirty). |
4870 | * This means that the in-core inode may be reaped by prune_icache | 4877 | * This means that the in-core inode may be reaped by prune_icache |
4871 | * without having to perform any I/O. This is a very good thing, | 4878 | * without having to perform any I/O. This is a very good thing, |
4872 | * because *any* task may call prune_icache - even ones which | 4879 | * because *any* task may call prune_icache - even ones which |
4873 | * have a transaction open against a different journal. | 4880 | * have a transaction open against a different journal. |
4874 | * | 4881 | * |
4875 | * Is this cheating? Not really. Sure, we haven't written the | 4882 | * Is this cheating? Not really. Sure, we haven't written the |
4876 | * inode out, but prune_icache isn't a user-visible syncing function. | 4883 | * inode out, but prune_icache isn't a user-visible syncing function. |
4877 | * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) | 4884 | * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) |
4878 | * we start and wait on commits. | 4885 | * we start and wait on commits. |
4879 | */ | 4886 | */ |
4880 | int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) | 4887 | int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) |
4881 | { | 4888 | { |
4882 | struct ext4_iloc iloc; | 4889 | struct ext4_iloc iloc; |
4883 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 4890 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
4884 | static unsigned int mnt_count; | 4891 | static unsigned int mnt_count; |
4885 | int err, ret; | 4892 | int err, ret; |
4886 | 4893 | ||
4887 | might_sleep(); | 4894 | might_sleep(); |
4888 | trace_ext4_mark_inode_dirty(inode, _RET_IP_); | 4895 | trace_ext4_mark_inode_dirty(inode, _RET_IP_); |
4889 | err = ext4_reserve_inode_write(handle, inode, &iloc); | 4896 | err = ext4_reserve_inode_write(handle, inode, &iloc); |
4890 | if (ext4_handle_valid(handle) && | 4897 | if (ext4_handle_valid(handle) && |
4891 | EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && | 4898 | EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && |
4892 | !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { | 4899 | !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { |
4893 | /* | 4900 | /* |
4894 | * We need extra buffer credits since we may write into EA block | 4901 | * We need extra buffer credits since we may write into EA block |
4895 | * with this same handle. If journal_extend fails, then it will | 4902 | * with this same handle. If journal_extend fails, then it will |
4896 | * only result in a minor loss of functionality for that inode. | 4903 | * only result in a minor loss of functionality for that inode. |
4897 | * If this is felt to be critical, then e2fsck should be run to | 4904 | * If this is felt to be critical, then e2fsck should be run to |
4898 | * force a large enough s_min_extra_isize. | 4905 | * force a large enough s_min_extra_isize. |
4899 | */ | 4906 | */ |
4900 | if ((jbd2_journal_extend(handle, | 4907 | if ((jbd2_journal_extend(handle, |
4901 | EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) { | 4908 | EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) { |
4902 | ret = ext4_expand_extra_isize(inode, | 4909 | ret = ext4_expand_extra_isize(inode, |
4903 | sbi->s_want_extra_isize, | 4910 | sbi->s_want_extra_isize, |
4904 | iloc, handle); | 4911 | iloc, handle); |
4905 | if (ret) { | 4912 | if (ret) { |
4906 | ext4_set_inode_state(inode, | 4913 | ext4_set_inode_state(inode, |
4907 | EXT4_STATE_NO_EXPAND); | 4914 | EXT4_STATE_NO_EXPAND); |
4908 | if (mnt_count != | 4915 | if (mnt_count != |
4909 | le16_to_cpu(sbi->s_es->s_mnt_count)) { | 4916 | le16_to_cpu(sbi->s_es->s_mnt_count)) { |
4910 | ext4_warning(inode->i_sb, | 4917 | ext4_warning(inode->i_sb, |
4911 | "Unable to expand inode %lu. Delete" | 4918 | "Unable to expand inode %lu. Delete" |
4912 | " some EAs or run e2fsck.", | 4919 | " some EAs or run e2fsck.", |
4913 | inode->i_ino); | 4920 | inode->i_ino); |
4914 | mnt_count = | 4921 | mnt_count = |
4915 | le16_to_cpu(sbi->s_es->s_mnt_count); | 4922 | le16_to_cpu(sbi->s_es->s_mnt_count); |
4916 | } | 4923 | } |
4917 | } | 4924 | } |
4918 | } | 4925 | } |
4919 | } | 4926 | } |
4920 | if (!err) | 4927 | if (!err) |
4921 | err = ext4_mark_iloc_dirty(handle, inode, &iloc); | 4928 | err = ext4_mark_iloc_dirty(handle, inode, &iloc); |
4922 | return err; | 4929 | return err; |
4923 | } | 4930 | } |
4924 | 4931 | ||
4925 | /* | 4932 | /* |
4926 | * ext4_dirty_inode() is called from __mark_inode_dirty() | 4933 | * ext4_dirty_inode() is called from __mark_inode_dirty() |
4927 | * | 4934 | * |
4928 | * We're really interested in the case where a file is being extended. | 4935 | * We're really interested in the case where a file is being extended. |
4929 | * i_size has been changed by generic_commit_write() and we thus need | 4936 | * i_size has been changed by generic_commit_write() and we thus need |
4930 | * to include the updated inode in the current transaction. | 4937 | * to include the updated inode in the current transaction. |
4931 | * | 4938 | * |
4932 | * Also, dquot_alloc_block() will always dirty the inode when blocks | 4939 | * Also, dquot_alloc_block() will always dirty the inode when blocks |
4933 | * are allocated to the file. | 4940 | * are allocated to the file. |
4934 | * | 4941 | * |
4935 | * If the inode is marked synchronous, we don't honour that here - doing | 4942 | * If the inode is marked synchronous, we don't honour that here - doing |
4936 | * so would cause a commit on atime updates, which we don't bother doing. | 4943 | * so would cause a commit on atime updates, which we don't bother doing. |
4937 | * We handle synchronous inodes at the highest possible level. | 4944 | * We handle synchronous inodes at the highest possible level. |
4938 | * | 4945 | * |
4939 | * If only the I_DIRTY_TIME flag is set, we can skip everything. If | 4946 | * If only the I_DIRTY_TIME flag is set, we can skip everything. If |
4940 | * I_DIRTY_TIME and I_DIRTY_SYNC are set, the only inode fields we need | 4947 | * I_DIRTY_TIME and I_DIRTY_SYNC are set, the only inode fields we need |
4941 | * to copy into the on-disk inode structure are the timestamp fields. | 4948 | * to copy into the on-disk inode structure are the timestamp fields. |
4942 | */ | 4949 | */ |
4943 | void ext4_dirty_inode(struct inode *inode, int flags) | 4950 | void ext4_dirty_inode(struct inode *inode, int flags) |
4944 | { | 4951 | { |
4945 | handle_t *handle; | 4952 | handle_t *handle; |
4946 | 4953 | ||
4947 | if (flags == I_DIRTY_TIME) | 4954 | if (flags == I_DIRTY_TIME) |
4948 | return; | 4955 | return; |
4949 | handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); | 4956 | handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); |
4950 | if (IS_ERR(handle)) | 4957 | if (IS_ERR(handle)) |
4951 | goto out; | 4958 | goto out; |
4952 | 4959 | ||
4953 | ext4_mark_inode_dirty(handle, inode); | 4960 | ext4_mark_inode_dirty(handle, inode); |
4954 | 4961 | ||
4955 | ext4_journal_stop(handle); | 4962 | ext4_journal_stop(handle); |
4956 | out: | 4963 | out: |
4957 | return; | 4964 | return; |
4958 | } | 4965 | } |
4959 | 4966 | ||
4960 | #if 0 | 4967 | #if 0 |
4961 | /* | 4968 | /* |
4962 | * Bind an inode's backing buffer_head into this transaction, to prevent | 4969 | * Bind an inode's backing buffer_head into this transaction, to prevent |
4963 | * it from being flushed to disk early. Unlike | 4970 | * it from being flushed to disk early. Unlike |
4964 | * ext4_reserve_inode_write, this leaves behind no bh reference and | 4971 | * ext4_reserve_inode_write, this leaves behind no bh reference and |
4965 | * returns no iloc structure, so the caller needs to repeat the iloc | 4972 | * returns no iloc structure, so the caller needs to repeat the iloc |
4966 | * lookup to mark the inode dirty later. | 4973 | * lookup to mark the inode dirty later. |
4967 | */ | 4974 | */ |
4968 | static int ext4_pin_inode(handle_t *handle, struct inode *inode) | 4975 | static int ext4_pin_inode(handle_t *handle, struct inode *inode) |
4969 | { | 4976 | { |
4970 | struct ext4_iloc iloc; | 4977 | struct ext4_iloc iloc; |
4971 | 4978 | ||
4972 | int err = 0; | 4979 | int err = 0; |
4973 | if (handle) { | 4980 | if (handle) { |
4974 | err = ext4_get_inode_loc(inode, &iloc); | 4981 | err = ext4_get_inode_loc(inode, &iloc); |
4975 | if (!err) { | 4982 | if (!err) { |
4976 | BUFFER_TRACE(iloc.bh, "get_write_access"); | 4983 | BUFFER_TRACE(iloc.bh, "get_write_access"); |
4977 | err = jbd2_journal_get_write_access(handle, iloc.bh); | 4984 | err = jbd2_journal_get_write_access(handle, iloc.bh); |
4978 | if (!err) | 4985 | if (!err) |
4979 | err = ext4_handle_dirty_metadata(handle, | 4986 | err = ext4_handle_dirty_metadata(handle, |
4980 | NULL, | 4987 | NULL, |
4981 | iloc.bh); | 4988 | iloc.bh); |
4982 | brelse(iloc.bh); | 4989 | brelse(iloc.bh); |
4983 | } | 4990 | } |
4984 | } | 4991 | } |
4985 | ext4_std_error(inode->i_sb, err); | 4992 | ext4_std_error(inode->i_sb, err); |
4986 | return err; | 4993 | return err; |
4987 | } | 4994 | } |
4988 | #endif | 4995 | #endif |
4989 | 4996 | ||
4990 | int ext4_change_inode_journal_flag(struct inode *inode, int val) | 4997 | int ext4_change_inode_journal_flag(struct inode *inode, int val) |
4991 | { | 4998 | { |
4992 | journal_t *journal; | 4999 | journal_t *journal; |
4993 | handle_t *handle; | 5000 | handle_t *handle; |
4994 | int err; | 5001 | int err; |
4995 | 5002 | ||
4996 | /* | 5003 | /* |
4997 | * We have to be very careful here: changing a data block's | 5004 | * We have to be very careful here: changing a data block's |
4998 | * journaling status dynamically is dangerous. If we write a | 5005 | * journaling status dynamically is dangerous. If we write a |
4999 | * data block to the journal, change the status and then delete | 5006 | * data block to the journal, change the status and then delete |
5000 | * that block, we risk forgetting to revoke the old log record | 5007 | * that block, we risk forgetting to revoke the old log record |
5001 | * from the journal and so a subsequent replay can corrupt data. | 5008 | * from the journal and so a subsequent replay can corrupt data. |
5002 | * So, first we make sure that the journal is empty and that | 5009 | * So, first we make sure that the journal is empty and that |
5003 | * nobody is changing anything. | 5010 | * nobody is changing anything. |
5004 | */ | 5011 | */ |
5005 | 5012 | ||
5006 | journal = EXT4_JOURNAL(inode); | 5013 | journal = EXT4_JOURNAL(inode); |
5007 | if (!journal) | 5014 | if (!journal) |
5008 | return 0; | 5015 | return 0; |
5009 | if (is_journal_aborted(journal)) | 5016 | if (is_journal_aborted(journal)) |
5010 | return -EROFS; | 5017 | return -EROFS; |
5011 | /* We have to allocate physical blocks for delalloc blocks | 5018 | /* We have to allocate physical blocks for delalloc blocks |
5012 | * before flushing the journal; otherwise delalloc blocks cannot | 5019 | * before flushing the journal; otherwise delalloc blocks cannot |
5013 | * be allocated any more. Worse, a truncate on delalloc blocks | 5020 | * be allocated any more. Worse, a truncate on delalloc blocks |
5014 | * could trigger a BUG by flushing delalloc blocks in the journal. | 5021 | * could trigger a BUG by flushing delalloc blocks in the journal. |
5015 | * There is no delalloc block in non-journal data mode. | 5022 | * There is no delalloc block in non-journal data mode. |
5016 | */ | 5023 | */ |
5017 | if (val && test_opt(inode->i_sb, DELALLOC)) { | 5024 | if (val && test_opt(inode->i_sb, DELALLOC)) { |
5018 | err = ext4_alloc_da_blocks(inode); | 5025 | err = ext4_alloc_da_blocks(inode); |
5019 | if (err < 0) | 5026 | if (err < 0) |
5020 | return err; | 5027 | return err; |
5021 | } | 5028 | } |
5022 | 5029 | ||
5023 | /* Wait for all existing dio workers */ | 5030 | /* Wait for all existing dio workers */ |
5024 | ext4_inode_block_unlocked_dio(inode); | 5031 | ext4_inode_block_unlocked_dio(inode); |
5025 | inode_dio_wait(inode); | 5032 | inode_dio_wait(inode); |
5026 | 5033 | ||
5027 | jbd2_journal_lock_updates(journal); | 5034 | jbd2_journal_lock_updates(journal); |
5028 | 5035 | ||
5029 | /* | 5036 | /* |
5030 | * OK, there are no updates running now, and all cached data is | 5037 | * OK, there are no updates running now, and all cached data is |
5031 | * synced to disk. We are now in a completely consistent state | 5038 | * synced to disk. We are now in a completely consistent state |
5032 | * which doesn't have anything in the journal, and we know that | 5039 | * which doesn't have anything in the journal, and we know that |
5033 | * no filesystem updates are running, so it is safe to modify | 5040 | * no filesystem updates are running, so it is safe to modify |
5034 | * the inode's in-core data-journaling state flag now. | 5041 | * the inode's in-core data-journaling state flag now. |
5035 | */ | 5042 | */ |
5036 | 5043 | ||
5037 | if (val) | 5044 | if (val) |
5038 | ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); | 5045 | ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); |
5039 | else { | 5046 | else { |
5040 | err = jbd2_journal_flush(journal); | 5047 | err = jbd2_journal_flush(journal); |
5041 | if (err < 0) { | 5048 | if (err < 0) { |
5042 | jbd2_journal_unlock_updates(journal); | 5049 | jbd2_journal_unlock_updates(journal); |
5043 | ext4_inode_resume_unlocked_dio(inode); | 5050 | ext4_inode_resume_unlocked_dio(inode); |
5044 | return err; | 5051 | return err; |
5045 | } | 5052 | } |
5046 | ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); | 5053 | ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); |
5047 | } | 5054 | } |
5048 | ext4_set_aops(inode); | 5055 | ext4_set_aops(inode); |
5049 | 5056 | ||
5050 | jbd2_journal_unlock_updates(journal); | 5057 | jbd2_journal_unlock_updates(journal); |
5051 | ext4_inode_resume_unlocked_dio(inode); | 5058 | ext4_inode_resume_unlocked_dio(inode); |
5052 | 5059 | ||
5053 | /* Finally we can mark the inode as dirty. */ | 5060 | /* Finally we can mark the inode as dirty. */ |
5054 | 5061 | ||
5055 | handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); | 5062 | handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); |
5056 | if (IS_ERR(handle)) | 5063 | if (IS_ERR(handle)) |
5057 | return PTR_ERR(handle); | 5064 | return PTR_ERR(handle); |
5058 | 5065 | ||
5059 | err = ext4_mark_inode_dirty(handle, inode); | 5066 | err = ext4_mark_inode_dirty(handle, inode); |
5060 | ext4_handle_sync(handle); | 5067 | ext4_handle_sync(handle); |
5061 | ext4_journal_stop(handle); | 5068 | ext4_journal_stop(handle); |
5062 | ext4_std_error(inode->i_sb, err); | 5069 | ext4_std_error(inode->i_sb, err); |
5063 | 5070 | ||
5064 | return err; | 5071 | return err; |
5065 | } | 5072 | } |
5066 | 5073 | ||
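ext4_change_inode_journal_flag() is normally reached from the FS_IOC_SETFLAGS ioctl when the data-journaling attribute changes, i.e. chattr +j / chattr -j. A hypothetical userspace sketch:

	#include <sys/ioctl.h>
	#include <linux/fs.h>
	#include <fcntl.h>
	#include <unistd.h>

	/* Roughly equivalent to `chattr +j FILE`; illustrative only. */
	int set_journal_data(const char *path)
	{
		int fd = open(path, O_RDONLY);
		long flags;
		int ret = -1;

		if (fd < 0)
			return -1;
		if (ioctl(fd, FS_IOC_GETFLAGS, &flags) == 0) {
			flags |= FS_JOURNAL_DATA_FL;
			ret = ioctl(fd, FS_IOC_SETFLAGS, &flags);
		}
		close(fd);
		return ret;
	}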
5067 | static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) | 5074 | static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) |
5068 | { | 5075 | { |
5069 | return !buffer_mapped(bh); | 5076 | return !buffer_mapped(bh); |
5070 | } | 5077 | } |
5071 | 5078 | ||
5072 | int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | 5079 | int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) |
5073 | { | 5080 | { |
5074 | struct page *page = vmf->page; | 5081 | struct page *page = vmf->page; |
5075 | loff_t size; | 5082 | loff_t size; |
5076 | unsigned long len; | 5083 | unsigned long len; |
5077 | int ret; | 5084 | int ret; |
5078 | struct file *file = vma->vm_file; | 5085 | struct file *file = vma->vm_file; |
5079 | struct inode *inode = file_inode(file); | 5086 | struct inode *inode = file_inode(file); |
5080 | struct address_space *mapping = inode->i_mapping; | 5087 | struct address_space *mapping = inode->i_mapping; |
5081 | handle_t *handle; | 5088 | handle_t *handle; |
5082 | get_block_t *get_block; | 5089 | get_block_t *get_block; |
5083 | int retries = 0; | 5090 | int retries = 0; |
5084 | 5091 | ||
5085 | sb_start_pagefault(inode->i_sb); | 5092 | sb_start_pagefault(inode->i_sb); |
5086 | file_update_time(vma->vm_file); | 5093 | file_update_time(vma->vm_file); |
5087 | /* Delalloc case is easy... */ | 5094 | /* Delalloc case is easy... */ |
5088 | if (test_opt(inode->i_sb, DELALLOC) && | 5095 | if (test_opt(inode->i_sb, DELALLOC) && |
5089 | !ext4_should_journal_data(inode) && | 5096 | !ext4_should_journal_data(inode) && |
5090 | !ext4_nonda_switch(inode->i_sb)) { | 5097 | !ext4_nonda_switch(inode->i_sb)) { |
5091 | do { | 5098 | do { |
5092 | ret = __block_page_mkwrite(vma, vmf, | 5099 | ret = __block_page_mkwrite(vma, vmf, |
5093 | ext4_da_get_block_prep); | 5100 | ext4_da_get_block_prep); |
5094 | } while (ret == -ENOSPC && | 5101 | } while (ret == -ENOSPC && |
5095 | ext4_should_retry_alloc(inode->i_sb, &retries)); | 5102 | ext4_should_retry_alloc(inode->i_sb, &retries)); |
5096 | goto out_ret; | 5103 | goto out_ret; |
5097 | } | 5104 | } |
5098 | 5105 | ||
5099 | lock_page(page); | 5106 | lock_page(page); |
5100 | size = i_size_read(inode); | 5107 | size = i_size_read(inode); |
5101 | /* Page got truncated from under us? */ | 5108 | /* Page got truncated from under us? */ |
5102 | if (page->mapping != mapping || page_offset(page) > size) { | 5109 | if (page->mapping != mapping || page_offset(page) > size) { |
5103 | unlock_page(page); | 5110 | unlock_page(page); |
5104 | ret = VM_FAULT_NOPAGE; | 5111 | ret = VM_FAULT_NOPAGE; |
5105 | goto out; | 5112 | goto out; |
5106 | } | 5113 | } |
5107 | 5114 | ||
5108 | if (page->index == size >> PAGE_CACHE_SHIFT) | 5115 | if (page->index == size >> PAGE_CACHE_SHIFT) |
5109 | len = size & ~PAGE_CACHE_MASK; | 5116 | len = size & ~PAGE_CACHE_MASK; |
5110 | else | 5117 | else |
5111 | len = PAGE_CACHE_SIZE; | 5118 | len = PAGE_CACHE_SIZE; |
5112 | /* | 5119 | /* |
5113 | * Return if we have all the buffers mapped. This avoids the need to do | 5120 | * Return if we have all the buffers mapped. This avoids the need to do |
5114 | * journal_start/journal_stop which can block and take a long time | 5121 | * journal_start/journal_stop which can block and take a long time |
5115 | */ | 5122 | */ |
5116 | if (page_has_buffers(page)) { | 5123 | if (page_has_buffers(page)) { |
5117 | if (!ext4_walk_page_buffers(NULL, page_buffers(page), | 5124 | if (!ext4_walk_page_buffers(NULL, page_buffers(page), |
5118 | 0, len, NULL, | 5125 | 0, len, NULL, |
5119 | ext4_bh_unmapped)) { | 5126 | ext4_bh_unmapped)) { |
5120 | /* Wait so that we don't change page under IO */ | 5127 | /* Wait so that we don't change page under IO */ |
5121 | wait_for_stable_page(page); | 5128 | wait_for_stable_page(page); |
5122 | ret = VM_FAULT_LOCKED; | 5129 | ret = VM_FAULT_LOCKED; |
5123 | goto out; | 5130 | goto out; |
5124 | } | 5131 | } |
5125 | } | 5132 | } |
5126 | unlock_page(page); | 5133 | unlock_page(page); |
5127 | /* OK, we need to fill the hole... */ | 5134 | /* OK, we need to fill the hole... */ |
5128 | if (ext4_should_dioread_nolock(inode)) | 5135 | if (ext4_should_dioread_nolock(inode)) |
5129 | get_block = ext4_get_block_write; | 5136 | get_block = ext4_get_block_write; |
5130 | else | 5137 | else |
5131 | get_block = ext4_get_block; | 5138 | get_block = ext4_get_block; |
5132 | retry_alloc: | 5139 | retry_alloc: |
5133 | handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, | 5140 | handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, |
5134 | ext4_writepage_trans_blocks(inode)); | 5141 | ext4_writepage_trans_blocks(inode)); |
5135 | if (IS_ERR(handle)) { | 5142 | if (IS_ERR(handle)) { |
5136 | ret = VM_FAULT_SIGBUS; | 5143 | ret = VM_FAULT_SIGBUS; |
5137 | goto out; | 5144 | goto out; |
5138 | } | 5145 | } |
5139 | ret = __block_page_mkwrite(vma, vmf, get_block); | 5146 | ret = __block_page_mkwrite(vma, vmf, get_block); |
5140 | if (!ret && ext4_should_journal_data(inode)) { | 5147 | if (!ret && ext4_should_journal_data(inode)) { |
5141 | if (ext4_walk_page_buffers(handle, page_buffers(page), 0, | 5148 | if (ext4_walk_page_buffers(handle, page_buffers(page), 0, |
5142 | PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) { | 5149 | PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) { |
5143 | unlock_page(page); | 5150 | unlock_page(page); |
5144 | ret = VM_FAULT_SIGBUS; | 5151 | ret = VM_FAULT_SIGBUS; |
5145 | ext4_journal_stop(handle); | 5152 | ext4_journal_stop(handle); |
5146 | goto out; | 5153 | goto out; |
5147 | } | 5154 | } |
5148 | ext4_set_inode_state(inode, EXT4_STATE_JDATA); | 5155 | ext4_set_inode_state(inode, EXT4_STATE_JDATA); |
5149 | } | 5156 | } |
5150 | ext4_journal_stop(handle); | 5157 | ext4_journal_stop(handle); |
5151 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) | 5158 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) |
5152 | goto retry_alloc; | 5159 | goto retry_alloc; |
5153 | out_ret: | 5160 | out_ret: |
5154 | ret = block_page_mkwrite_return(ret); | 5161 | ret = block_page_mkwrite_return(ret); |
5155 | out: | 5162 | out: |
5156 | sb_end_pagefault(inode->i_sb); | 5163 | sb_end_pagefault(inode->i_sb); |
5157 | return ret; | 5164 | return ret; |
5158 | } | 5165 | } |
5159 | 5166 |
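A hypothetical userspace demo of how ext4_page_mkwrite() is reached: the first store through a MAP_SHARED mapping takes a write fault into ->page_mkwrite (assumes path names a file at least one page long):

	#include <sys/mman.h>
	#include <fcntl.h>
	#include <unistd.h>

	int dirty_first_page(const char *path)
	{
		int fd = open(path, O_RDWR);
		char *p;

		if (fd < 0)
			return -1;
		p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		if (p == MAP_FAILED) {
			close(fd);
			return -1;
		}
		p[0] = 'x';	/* write fault -> ext4_page_mkwrite() */
		munmap(p, 4096);
		close(fd);
		return 0;
	}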
fs/ext4/super.c
1 | /* | 1 | /* |
2 | * linux/fs/ext4/super.c | 2 | * linux/fs/ext4/super.c |
3 | * | 3 | * |
4 | * Copyright (C) 1992, 1993, 1994, 1995 | 4 | * Copyright (C) 1992, 1993, 1994, 1995 |
5 | * Remy Card (card@masi.ibp.fr) | 5 | * Remy Card (card@masi.ibp.fr) |
6 | * Laboratoire MASI - Institut Blaise Pascal | 6 | * Laboratoire MASI - Institut Blaise Pascal |
7 | * Universite Pierre et Marie Curie (Paris VI) | 7 | * Universite Pierre et Marie Curie (Paris VI) |
8 | * | 8 | * |
9 | * from | 9 | * from |
10 | * | 10 | * |
11 | * linux/fs/minix/inode.c | 11 | * linux/fs/minix/inode.c |
12 | * | 12 | * |
13 | * Copyright (C) 1991, 1992 Linus Torvalds | 13 | * Copyright (C) 1991, 1992 Linus Torvalds |
14 | * | 14 | * |
15 | * Big-endian to little-endian byte-swapping/bitmaps by | 15 | * Big-endian to little-endian byte-swapping/bitmaps by |
16 | * David S. Miller (davem@caip.rutgers.edu), 1995 | 16 | * David S. Miller (davem@caip.rutgers.edu), 1995 |
17 | */ | 17 | */ |
18 | 18 | ||
19 | #include <linux/module.h> | 19 | #include <linux/module.h> |
20 | #include <linux/string.h> | 20 | #include <linux/string.h> |
21 | #include <linux/fs.h> | 21 | #include <linux/fs.h> |
22 | #include <linux/time.h> | 22 | #include <linux/time.h> |
23 | #include <linux/vmalloc.h> | 23 | #include <linux/vmalloc.h> |
24 | #include <linux/jbd2.h> | 24 | #include <linux/jbd2.h> |
25 | #include <linux/slab.h> | 25 | #include <linux/slab.h> |
26 | #include <linux/init.h> | 26 | #include <linux/init.h> |
27 | #include <linux/blkdev.h> | 27 | #include <linux/blkdev.h> |
28 | #include <linux/parser.h> | 28 | #include <linux/parser.h> |
29 | #include <linux/buffer_head.h> | 29 | #include <linux/buffer_head.h> |
30 | #include <linux/exportfs.h> | 30 | #include <linux/exportfs.h> |
31 | #include <linux/vfs.h> | 31 | #include <linux/vfs.h> |
32 | #include <linux/random.h> | 32 | #include <linux/random.h> |
33 | #include <linux/mount.h> | 33 | #include <linux/mount.h> |
34 | #include <linux/namei.h> | 34 | #include <linux/namei.h> |
35 | #include <linux/quotaops.h> | 35 | #include <linux/quotaops.h> |
36 | #include <linux/seq_file.h> | 36 | #include <linux/seq_file.h> |
37 | #include <linux/proc_fs.h> | 37 | #include <linux/proc_fs.h> |
38 | #include <linux/ctype.h> | 38 | #include <linux/ctype.h> |
39 | #include <linux/log2.h> | 39 | #include <linux/log2.h> |
40 | #include <linux/crc16.h> | 40 | #include <linux/crc16.h> |
41 | #include <linux/cleancache.h> | 41 | #include <linux/cleancache.h> |
42 | #include <asm/uaccess.h> | 42 | #include <asm/uaccess.h> |
43 | 43 | ||
44 | #include <linux/kthread.h> | 44 | #include <linux/kthread.h> |
45 | #include <linux/freezer.h> | 45 | #include <linux/freezer.h> |
46 | 46 | ||
47 | #include "ext4.h" | 47 | #include "ext4.h" |
48 | #include "ext4_extents.h" /* Needed for trace points definition */ | 48 | #include "ext4_extents.h" /* Needed for trace points definition */ |
49 | #include "ext4_jbd2.h" | 49 | #include "ext4_jbd2.h" |
50 | #include "xattr.h" | 50 | #include "xattr.h" |
51 | #include "acl.h" | 51 | #include "acl.h" |
52 | #include "mballoc.h" | 52 | #include "mballoc.h" |
53 | 53 | ||
54 | #define CREATE_TRACE_POINTS | 54 | #define CREATE_TRACE_POINTS |
55 | #include <trace/events/ext4.h> | 55 | #include <trace/events/ext4.h> |
56 | 56 | ||
57 | static struct proc_dir_entry *ext4_proc_root; | 57 | static struct proc_dir_entry *ext4_proc_root; |
58 | static struct kset *ext4_kset; | 58 | static struct kset *ext4_kset; |
59 | static struct ext4_lazy_init *ext4_li_info; | 59 | static struct ext4_lazy_init *ext4_li_info; |
60 | static struct mutex ext4_li_mtx; | 60 | static struct mutex ext4_li_mtx; |
61 | static struct ext4_features *ext4_feat; | 61 | static struct ext4_features *ext4_feat; |
62 | static int ext4_mballoc_ready; | 62 | static int ext4_mballoc_ready; |
63 | 63 | ||
64 | static int ext4_load_journal(struct super_block *, struct ext4_super_block *, | 64 | static int ext4_load_journal(struct super_block *, struct ext4_super_block *, |
65 | unsigned long journal_devnum); | 65 | unsigned long journal_devnum); |
66 | static int ext4_show_options(struct seq_file *seq, struct dentry *root); | 66 | static int ext4_show_options(struct seq_file *seq, struct dentry *root); |
67 | static int ext4_commit_super(struct super_block *sb, int sync); | 67 | static int ext4_commit_super(struct super_block *sb, int sync); |
68 | static void ext4_mark_recovery_complete(struct super_block *sb, | 68 | static void ext4_mark_recovery_complete(struct super_block *sb, |
69 | struct ext4_super_block *es); | 69 | struct ext4_super_block *es); |
70 | static void ext4_clear_journal_err(struct super_block *sb, | 70 | static void ext4_clear_journal_err(struct super_block *sb, |
71 | struct ext4_super_block *es); | 71 | struct ext4_super_block *es); |
72 | static int ext4_sync_fs(struct super_block *sb, int wait); | 72 | static int ext4_sync_fs(struct super_block *sb, int wait); |
73 | static int ext4_remount(struct super_block *sb, int *flags, char *data); | 73 | static int ext4_remount(struct super_block *sb, int *flags, char *data); |
74 | static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); | 74 | static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); |
75 | static int ext4_unfreeze(struct super_block *sb); | 75 | static int ext4_unfreeze(struct super_block *sb); |
76 | static int ext4_freeze(struct super_block *sb); | 76 | static int ext4_freeze(struct super_block *sb); |
77 | static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, | 77 | static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, |
78 | const char *dev_name, void *data); | 78 | const char *dev_name, void *data); |
79 | static inline int ext2_feature_set_ok(struct super_block *sb); | 79 | static inline int ext2_feature_set_ok(struct super_block *sb); |
80 | static inline int ext3_feature_set_ok(struct super_block *sb); | 80 | static inline int ext3_feature_set_ok(struct super_block *sb); |
81 | static int ext4_feature_set_ok(struct super_block *sb, int readonly); | 81 | static int ext4_feature_set_ok(struct super_block *sb, int readonly); |
82 | static void ext4_destroy_lazyinit_thread(void); | 82 | static void ext4_destroy_lazyinit_thread(void); |
83 | static void ext4_unregister_li_request(struct super_block *sb); | 83 | static void ext4_unregister_li_request(struct super_block *sb); |
84 | static void ext4_clear_request_list(void); | 84 | static void ext4_clear_request_list(void); |
85 | static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t); | 85 | static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t); |
86 | 86 | ||
87 | #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) | 87 | #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) |
88 | static struct file_system_type ext2_fs_type = { | 88 | static struct file_system_type ext2_fs_type = { |
89 | .owner = THIS_MODULE, | 89 | .owner = THIS_MODULE, |
90 | .name = "ext2", | 90 | .name = "ext2", |
91 | .mount = ext4_mount, | 91 | .mount = ext4_mount, |
92 | .kill_sb = kill_block_super, | 92 | .kill_sb = kill_block_super, |
93 | .fs_flags = FS_REQUIRES_DEV, | 93 | .fs_flags = FS_REQUIRES_DEV, |
94 | }; | 94 | }; |
95 | MODULE_ALIAS_FS("ext2"); | 95 | MODULE_ALIAS_FS("ext2"); |
96 | MODULE_ALIAS("ext2"); | 96 | MODULE_ALIAS("ext2"); |
97 | #define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type) | 97 | #define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type) |
98 | #else | 98 | #else |
99 | #define IS_EXT2_SB(sb) (0) | 99 | #define IS_EXT2_SB(sb) (0) |
100 | #endif | 100 | #endif |
101 | 101 | ||
102 | 102 | ||
103 | #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) | 103 | #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) |
104 | static struct file_system_type ext3_fs_type = { | 104 | static struct file_system_type ext3_fs_type = { |
105 | .owner = THIS_MODULE, | 105 | .owner = THIS_MODULE, |
106 | .name = "ext3", | 106 | .name = "ext3", |
107 | .mount = ext4_mount, | 107 | .mount = ext4_mount, |
108 | .kill_sb = kill_block_super, | 108 | .kill_sb = kill_block_super, |
109 | .fs_flags = FS_REQUIRES_DEV, | 109 | .fs_flags = FS_REQUIRES_DEV, |
110 | }; | 110 | }; |
111 | MODULE_ALIAS_FS("ext3"); | 111 | MODULE_ALIAS_FS("ext3"); |
112 | MODULE_ALIAS("ext3"); | 112 | MODULE_ALIAS("ext3"); |
113 | #define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type) | 113 | #define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type) |
114 | #else | 114 | #else |
115 | #define IS_EXT3_SB(sb) (0) | 115 | #define IS_EXT3_SB(sb) (0) |
116 | #endif | 116 | #endif |
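/*
 * Reading note (annotation, not part of the file): together with the
 * ext2 stanza above, this lets ext4 register itself under the "ext2"
 * and "ext3" names when CONFIG_EXT4_USE_FOR_EXT23 is set and the
 * dedicated drivers are not built, so "mount -t ext3 ..." lands in
 * ext4_mount(). IS_EXT2_SB()/IS_EXT3_SB() later recover which
 * personality claimed a block device by checking its exclusive holder.
 */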
117 | 117 | ||
118 | static int ext4_verify_csum_type(struct super_block *sb, | 118 | static int ext4_verify_csum_type(struct super_block *sb, |
119 | struct ext4_super_block *es) | 119 | struct ext4_super_block *es) |
120 | { | 120 | { |
121 | if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, | 121 | if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, |
122 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) | 122 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) |
123 | return 1; | 123 | return 1; |
124 | 124 | ||
125 | return es->s_checksum_type == EXT4_CRC32C_CHKSUM; | 125 | return es->s_checksum_type == EXT4_CRC32C_CHKSUM; |
126 | } | 126 | } |
127 | 127 | ||
128 | static __le32 ext4_superblock_csum(struct super_block *sb, | 128 | static __le32 ext4_superblock_csum(struct super_block *sb, |
129 | struct ext4_super_block *es) | 129 | struct ext4_super_block *es) |
130 | { | 130 | { |
131 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 131 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
132 | int offset = offsetof(struct ext4_super_block, s_checksum); | 132 | int offset = offsetof(struct ext4_super_block, s_checksum); |
133 | __u32 csum; | 133 | __u32 csum; |
134 | 134 | ||
135 | csum = ext4_chksum(sbi, ~0, (char *)es, offset); | 135 | csum = ext4_chksum(sbi, ~0, (char *)es, offset); |
136 | 136 | ||
137 | return cpu_to_le32(csum); | 137 | return cpu_to_le32(csum); |
138 | } | 138 | } |
139 | 139 | ||
140 | static int ext4_superblock_csum_verify(struct super_block *sb, | 140 | static int ext4_superblock_csum_verify(struct super_block *sb, |
141 | struct ext4_super_block *es) | 141 | struct ext4_super_block *es) |
142 | { | 142 | { |
143 | if (!ext4_has_metadata_csum(sb)) | 143 | if (!ext4_has_metadata_csum(sb)) |
144 | return 1; | 144 | return 1; |
145 | 145 | ||
146 | return es->s_checksum == ext4_superblock_csum(sb, es); | 146 | return es->s_checksum == ext4_superblock_csum(sb, es); |
147 | } | 147 | } |
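/*
 * Worked illustration (annotation, not part of the file): the digest
 * covers every byte that precedes the s_checksum field, so the field
 * is excluded from its own checksum. A hypothetical userspace
 * verifier, assuming a crc32c() helper seeded with ~0 like the kernel
 * path, would be:
 *
 *	size_t len = offsetof(struct ext4_super_block, s_checksum);
 *	uint32_t csum = crc32c(~0, (const char *)es, len);
 *	int ok = (le32toh(es->s_checksum) == csum);
 *
 * The functions above compute the same value via ext4_chksum() and
 * the kernel crypto API.
 */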
148 | 148 | ||
149 | void ext4_superblock_csum_set(struct super_block *sb) | 149 | void ext4_superblock_csum_set(struct super_block *sb) |
150 | { | 150 | { |
151 | struct ext4_super_block *es = EXT4_SB(sb)->s_es; | 151 | struct ext4_super_block *es = EXT4_SB(sb)->s_es; |
152 | 152 | ||
153 | if (!ext4_has_metadata_csum(sb)) | 153 | if (!ext4_has_metadata_csum(sb)) |
154 | return; | 154 | return; |
155 | 155 | ||
156 | es->s_checksum = ext4_superblock_csum(sb, es); | 156 | es->s_checksum = ext4_superblock_csum(sb, es); |
157 | } | 157 | } |
158 | 158 | ||
159 | void *ext4_kvmalloc(size_t size, gfp_t flags) | 159 | void *ext4_kvmalloc(size_t size, gfp_t flags) |
160 | { | 160 | { |
161 | void *ret; | 161 | void *ret; |
162 | 162 | ||
163 | ret = kmalloc(size, flags | __GFP_NOWARN); | 163 | ret = kmalloc(size, flags | __GFP_NOWARN); |
164 | if (!ret) | 164 | if (!ret) |
165 | ret = __vmalloc(size, flags, PAGE_KERNEL); | 165 | ret = __vmalloc(size, flags, PAGE_KERNEL); |
166 | return ret; | 166 | return ret; |
167 | } | 167 | } |
168 | 168 | ||
169 | void *ext4_kvzalloc(size_t size, gfp_t flags) | 169 | void *ext4_kvzalloc(size_t size, gfp_t flags) |
170 | { | 170 | { |
171 | void *ret; | 171 | void *ret; |
172 | 172 | ||
173 | ret = kzalloc(size, flags | __GFP_NOWARN); | 173 | ret = kzalloc(size, flags | __GFP_NOWARN); |
174 | if (!ret) | 174 | if (!ret) |
175 | ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL); | 175 | ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL); |
176 | return ret; | 176 | return ret; |
177 | } | 177 | } |
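/*
 * Usage sketch (hypothetical caller): both helpers try a physically
 * contiguous kmalloc()/kzalloc() first and fall back to vmalloc() when
 * memory is too fragmented, which is why __GFP_NOWARN is ORed in --
 * the first-attempt failure is expected and not worth a stack trace.
 * Either result must be released with kvfree(), which picks the right
 * free path:
 *
 *	ptr = ext4_kvzalloc(size, GFP_KERNEL);
 *	if (!ptr)
 *		return -ENOMEM;
 *	...
 *	kvfree(ptr);
 */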
178 | 178 | ||
179 | ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, | 179 | ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, |
180 | struct ext4_group_desc *bg) | 180 | struct ext4_group_desc *bg) |
181 | { | 181 | { |
182 | return le32_to_cpu(bg->bg_block_bitmap_lo) | | 182 | return le32_to_cpu(bg->bg_block_bitmap_lo) | |
183 | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? | 183 | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? |
184 | (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0); | 184 | (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0); |
185 | } | 185 | } |
186 | 186 | ||
187 | ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, | 187 | ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, |
188 | struct ext4_group_desc *bg) | 188 | struct ext4_group_desc *bg) |
189 | { | 189 | { |
190 | return le32_to_cpu(bg->bg_inode_bitmap_lo) | | 190 | return le32_to_cpu(bg->bg_inode_bitmap_lo) | |
191 | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? | 191 | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? |
192 | (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0); | 192 | (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0); |
193 | } | 193 | } |
194 | 194 | ||
195 | ext4_fsblk_t ext4_inode_table(struct super_block *sb, | 195 | ext4_fsblk_t ext4_inode_table(struct super_block *sb, |
196 | struct ext4_group_desc *bg) | 196 | struct ext4_group_desc *bg) |
197 | { | 197 | { |
198 | return le32_to_cpu(bg->bg_inode_table_lo) | | 198 | return le32_to_cpu(bg->bg_inode_table_lo) | |
199 | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? | 199 | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? |
200 | (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); | 200 | (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); |
201 | } | 201 | } |
202 | 202 | ||
203 | __u32 ext4_free_group_clusters(struct super_block *sb, | 203 | __u32 ext4_free_group_clusters(struct super_block *sb, |
204 | struct ext4_group_desc *bg) | 204 | struct ext4_group_desc *bg) |
205 | { | 205 | { |
206 | return le16_to_cpu(bg->bg_free_blocks_count_lo) | | 206 | return le16_to_cpu(bg->bg_free_blocks_count_lo) | |
207 | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? | 207 | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? |
208 | (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0); | 208 | (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0); |
209 | } | 209 | } |
210 | 210 | ||
211 | __u32 ext4_free_inodes_count(struct super_block *sb, | 211 | __u32 ext4_free_inodes_count(struct super_block *sb, |
212 | struct ext4_group_desc *bg) | 212 | struct ext4_group_desc *bg) |
213 | { | 213 | { |
214 | return le16_to_cpu(bg->bg_free_inodes_count_lo) | | 214 | return le16_to_cpu(bg->bg_free_inodes_count_lo) | |
215 | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? | 215 | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? |
216 | (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0); | 216 | (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0); |
217 | } | 217 | } |
218 | 218 | ||
219 | __u32 ext4_used_dirs_count(struct super_block *sb, | 219 | __u32 ext4_used_dirs_count(struct super_block *sb, |
220 | struct ext4_group_desc *bg) | 220 | struct ext4_group_desc *bg) |
221 | { | 221 | { |
222 | return le16_to_cpu(bg->bg_used_dirs_count_lo) | | 222 | return le16_to_cpu(bg->bg_used_dirs_count_lo) | |
223 | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? | 223 | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? |
224 | (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0); | 224 | (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0); |
225 | } | 225 | } |
226 | 226 | ||
227 | __u32 ext4_itable_unused_count(struct super_block *sb, | 227 | __u32 ext4_itable_unused_count(struct super_block *sb, |
228 | struct ext4_group_desc *bg) | 228 | struct ext4_group_desc *bg) |
229 | { | 229 | { |
230 | return le16_to_cpu(bg->bg_itable_unused_lo) | | 230 | return le16_to_cpu(bg->bg_itable_unused_lo) | |
231 | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? | 231 | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? |
232 | (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0); | 232 | (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0); |
233 | } | 233 | } |
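/*
 * Reading note: all of the getters above share one pattern -- a value
 * is stored as a little-endian _lo word plus a _hi word that is only
 * meaningful when the group descriptor uses the larger 64-bit layout.
 * Schematically, for the block addresses (32+32 bits) and the
 * counters (16+16 bits):
 *
 *	u64 blk   = lo32 | ((u64)hi32 << 32);
 *	u32 count = lo16 | ((u32)hi16 << 16);
 *
 * The setters below are the mirror image: the low word is written
 * unconditionally, the high word only on 64-bit descriptors.
 */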
234 | 234 | ||
235 | void ext4_block_bitmap_set(struct super_block *sb, | 235 | void ext4_block_bitmap_set(struct super_block *sb, |
236 | struct ext4_group_desc *bg, ext4_fsblk_t blk) | 236 | struct ext4_group_desc *bg, ext4_fsblk_t blk) |
237 | { | 237 | { |
238 | bg->bg_block_bitmap_lo = cpu_to_le32((u32)blk); | 238 | bg->bg_block_bitmap_lo = cpu_to_le32((u32)blk); |
239 | if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) | 239 | if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) |
240 | bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32); | 240 | bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32); |
241 | } | 241 | } |
242 | 242 | ||
243 | void ext4_inode_bitmap_set(struct super_block *sb, | 243 | void ext4_inode_bitmap_set(struct super_block *sb, |
244 | struct ext4_group_desc *bg, ext4_fsblk_t blk) | 244 | struct ext4_group_desc *bg, ext4_fsblk_t blk) |
245 | { | 245 | { |
246 | bg->bg_inode_bitmap_lo = cpu_to_le32((u32)blk); | 246 | bg->bg_inode_bitmap_lo = cpu_to_le32((u32)blk); |
247 | if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) | 247 | if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) |
248 | bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32); | 248 | bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32); |
249 | } | 249 | } |
250 | 250 | ||
251 | void ext4_inode_table_set(struct super_block *sb, | 251 | void ext4_inode_table_set(struct super_block *sb, |
252 | struct ext4_group_desc *bg, ext4_fsblk_t blk) | 252 | struct ext4_group_desc *bg, ext4_fsblk_t blk) |
253 | { | 253 | { |
254 | bg->bg_inode_table_lo = cpu_to_le32((u32)blk); | 254 | bg->bg_inode_table_lo = cpu_to_le32((u32)blk); |
255 | if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) | 255 | if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) |
256 | bg->bg_inode_table_hi = cpu_to_le32(blk >> 32); | 256 | bg->bg_inode_table_hi = cpu_to_le32(blk >> 32); |
257 | } | 257 | } |
258 | 258 | ||
259 | void ext4_free_group_clusters_set(struct super_block *sb, | 259 | void ext4_free_group_clusters_set(struct super_block *sb, |
260 | struct ext4_group_desc *bg, __u32 count) | 260 | struct ext4_group_desc *bg, __u32 count) |
261 | { | 261 | { |
262 | bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count); | 262 | bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count); |
263 | if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) | 263 | if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) |
264 | bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16); | 264 | bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16); |
265 | } | 265 | } |
266 | 266 | ||
267 | void ext4_free_inodes_set(struct super_block *sb, | 267 | void ext4_free_inodes_set(struct super_block *sb, |
268 | struct ext4_group_desc *bg, __u32 count) | 268 | struct ext4_group_desc *bg, __u32 count) |
269 | { | 269 | { |
270 | bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count); | 270 | bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count); |
271 | if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) | 271 | if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) |
272 | bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16); | 272 | bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16); |
273 | } | 273 | } |
274 | 274 | ||
275 | void ext4_used_dirs_set(struct super_block *sb, | 275 | void ext4_used_dirs_set(struct super_block *sb, |
276 | struct ext4_group_desc *bg, __u32 count) | 276 | struct ext4_group_desc *bg, __u32 count) |
277 | { | 277 | { |
278 | bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count); | 278 | bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count); |
279 | if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) | 279 | if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) |
280 | bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16); | 280 | bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16); |
281 | } | 281 | } |
282 | 282 | ||
283 | void ext4_itable_unused_set(struct super_block *sb, | 283 | void ext4_itable_unused_set(struct super_block *sb, |
284 | struct ext4_group_desc *bg, __u32 count) | 284 | struct ext4_group_desc *bg, __u32 count) |
285 | { | 285 | { |
286 | bg->bg_itable_unused_lo = cpu_to_le16((__u16)count); | 286 | bg->bg_itable_unused_lo = cpu_to_le16((__u16)count); |
287 | if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) | 287 | if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) |
288 | bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); | 288 | bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); |
289 | } | 289 | } |
290 | 290 | ||
291 | 291 | ||
292 | static void __save_error_info(struct super_block *sb, const char *func, | 292 | static void __save_error_info(struct super_block *sb, const char *func, |
293 | unsigned int line) | 293 | unsigned int line) |
294 | { | 294 | { |
295 | struct ext4_super_block *es = EXT4_SB(sb)->s_es; | 295 | struct ext4_super_block *es = EXT4_SB(sb)->s_es; |
296 | 296 | ||
297 | EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; | 297 | EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; |
298 | es->s_state |= cpu_to_le16(EXT4_ERROR_FS); | 298 | es->s_state |= cpu_to_le16(EXT4_ERROR_FS); |
299 | es->s_last_error_time = cpu_to_le32(get_seconds()); | 299 | es->s_last_error_time = cpu_to_le32(get_seconds()); |
300 | strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func)); | 300 | strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func)); |
301 | es->s_last_error_line = cpu_to_le32(line); | 301 | es->s_last_error_line = cpu_to_le32(line); |
302 | if (!es->s_first_error_time) { | 302 | if (!es->s_first_error_time) { |
303 | es->s_first_error_time = es->s_last_error_time; | 303 | es->s_first_error_time = es->s_last_error_time; |
304 | strncpy(es->s_first_error_func, func, | 304 | strncpy(es->s_first_error_func, func, |
305 | sizeof(es->s_first_error_func)); | 305 | sizeof(es->s_first_error_func)); |
306 | es->s_first_error_line = cpu_to_le32(line); | 306 | es->s_first_error_line = cpu_to_le32(line); |
307 | es->s_first_error_ino = es->s_last_error_ino; | 307 | es->s_first_error_ino = es->s_last_error_ino; |
308 | es->s_first_error_block = es->s_last_error_block; | 308 | es->s_first_error_block = es->s_last_error_block; |
309 | } | 309 | } |
310 | /* | 310 | /* |
311 | * Start the daily error reporting function if it hasn't been | 311 | * Start the daily error reporting function if it hasn't been |
312 | * started already | 312 | * started already |
313 | */ | 313 | */ |
314 | if (!es->s_error_count) | 314 | if (!es->s_error_count) |
315 | mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ); | 315 | mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ); |
316 | le32_add_cpu(&es->s_error_count, 1); | 316 | le32_add_cpu(&es->s_error_count, 1); |
317 | } | 317 | } |
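/*
 * Reading note: s_last_error_* is overwritten on every failure, while
 * the s_first_error_* block is filled in exactly once (guarded by
 * s_first_error_time == 0), so the superblock records both the oldest
 * and the newest error site. The 24*60*60*HZ above is simply one day
 * expressed in jiffies, arming the daily error-count report on the
 * first error only.
 */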
318 | 318 | ||
319 | static void save_error_info(struct super_block *sb, const char *func, | 319 | static void save_error_info(struct super_block *sb, const char *func, |
320 | unsigned int line) | 320 | unsigned int line) |
321 | { | 321 | { |
322 | __save_error_info(sb, func, line); | 322 | __save_error_info(sb, func, line); |
323 | ext4_commit_super(sb, 1); | 323 | ext4_commit_super(sb, 1); |
324 | } | 324 | } |
325 | 325 | ||
326 | /* | 326 | /* |
327 | * The del_gendisk() function uninitializes the disk-specific data | 327 | * The del_gendisk() function uninitializes the disk-specific data |
328 | * structures, including the bdi structure, without telling anyone | 328 | * structures, including the bdi structure, without telling anyone |
329 | * else. Once this happens, any attempt to call mark_buffer_dirty() | 329 | * else. Once this happens, any attempt to call mark_buffer_dirty() |
330 | * (for example, by ext4_commit_super) will cause a kernel OOPS. | 330 | * (for example, by ext4_commit_super) will cause a kernel OOPS. |
331 | * This is a kludge to prevent these oopses until we can put in a proper | 331 | * This is a kludge to prevent these oopses until we can put in a proper |
332 | * hook in del_gendisk() to inform the VFS and file system layers. | 332 | * hook in del_gendisk() to inform the VFS and file system layers. |
333 | */ | 333 | */ |
334 | static int block_device_ejected(struct super_block *sb) | 334 | static int block_device_ejected(struct super_block *sb) |
335 | { | 335 | { |
336 | struct inode *bd_inode = sb->s_bdev->bd_inode; | 336 | struct inode *bd_inode = sb->s_bdev->bd_inode; |
337 | struct backing_dev_info *bdi = inode_to_bdi(bd_inode); | 337 | struct backing_dev_info *bdi = inode_to_bdi(bd_inode); |
338 | 338 | ||
339 | return bdi->dev == NULL; | 339 | return bdi->dev == NULL; |
340 | } | 340 | } |
341 | 341 | ||
342 | static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) | 342 | static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) |
343 | { | 343 | { |
344 | struct super_block *sb = journal->j_private; | 344 | struct super_block *sb = journal->j_private; |
345 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 345 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
346 | int error = is_journal_aborted(journal); | 346 | int error = is_journal_aborted(journal); |
347 | struct ext4_journal_cb_entry *jce; | 347 | struct ext4_journal_cb_entry *jce; |
348 | 348 | ||
349 | BUG_ON(txn->t_state == T_FINISHED); | 349 | BUG_ON(txn->t_state == T_FINISHED); |
350 | spin_lock(&sbi->s_md_lock); | 350 | spin_lock(&sbi->s_md_lock); |
351 | while (!list_empty(&txn->t_private_list)) { | 351 | while (!list_empty(&txn->t_private_list)) { |
352 | jce = list_entry(txn->t_private_list.next, | 352 | jce = list_entry(txn->t_private_list.next, |
353 | struct ext4_journal_cb_entry, jce_list); | 353 | struct ext4_journal_cb_entry, jce_list); |
354 | list_del_init(&jce->jce_list); | 354 | list_del_init(&jce->jce_list); |
355 | spin_unlock(&sbi->s_md_lock); | 355 | spin_unlock(&sbi->s_md_lock); |
356 | jce->jce_func(sb, jce, error); | 356 | jce->jce_func(sb, jce, error); |
357 | spin_lock(&sbi->s_md_lock); | 357 | spin_lock(&sbi->s_md_lock); |
358 | } | 358 | } |
359 | spin_unlock(&sbi->s_md_lock); | 359 | spin_unlock(&sbi->s_md_lock); |
360 | } | 360 | } |
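/*
 * Note the lock dance above: s_md_lock is dropped around each
 * jce_func() invocation, so a callback can take other locks or block
 * without holding the spinlock; list_del_init() detaches the entry
 * first, keeping the private list consistent across the unlocked
 * window.
 */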
361 | 361 | ||
362 | /* Deal with the reporting of failure conditions on a filesystem, such | 362 | /* Deal with the reporting of failure conditions on a filesystem, such |
363 | * as detected inconsistencies or read IO failures. | 363 | * as detected inconsistencies or read IO failures. |
364 | * | 364 | * |
365 | * On ext2, we can store the error state of the filesystem in the | 365 | * On ext2, we can store the error state of the filesystem in the |
366 | * superblock. That is not possible on ext4, because we may have other | 366 | * superblock. That is not possible on ext4, because we may have other |
367 | * write ordering constraints on the superblock which prevent us from | 367 | * write ordering constraints on the superblock which prevent us from |
368 | * writing it out straight away; and given that the journal is about to | 368 | * writing it out straight away; and given that the journal is about to |
369 | * be aborted, we can't rely on the current, or future, transactions to | 369 | * be aborted, we can't rely on the current, or future, transactions to |
370 | * write out the superblock safely. | 370 | * write out the superblock safely. |
371 | * | 371 | * |
372 | * We'll just use the jbd2_journal_abort() error code to record an error in | 372 | * We'll just use the jbd2_journal_abort() error code to record an error in |
373 | * the journal instead. On recovery, the journal will complain about | 373 | * the journal instead. On recovery, the journal will complain about |
374 | * that error until we've noted it down and cleared it. | 374 | * that error until we've noted it down and cleared it. |
375 | */ | 375 | */ |
376 | 376 | ||
377 | static void ext4_handle_error(struct super_block *sb) | 377 | static void ext4_handle_error(struct super_block *sb) |
378 | { | 378 | { |
379 | if (sb->s_flags & MS_RDONLY) | 379 | if (sb->s_flags & MS_RDONLY) |
380 | return; | 380 | return; |
381 | 381 | ||
382 | if (!test_opt(sb, ERRORS_CONT)) { | 382 | if (!test_opt(sb, ERRORS_CONT)) { |
383 | journal_t *journal = EXT4_SB(sb)->s_journal; | 383 | journal_t *journal = EXT4_SB(sb)->s_journal; |
384 | 384 | ||
385 | EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; | 385 | EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; |
386 | if (journal) | 386 | if (journal) |
387 | jbd2_journal_abort(journal, -EIO); | 387 | jbd2_journal_abort(journal, -EIO); |
388 | } | 388 | } |
389 | if (test_opt(sb, ERRORS_RO)) { | 389 | if (test_opt(sb, ERRORS_RO)) { |
390 | ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); | 390 | ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); |
391 | /* | 391 | /* |
392 | * Make sure updated value of ->s_mount_flags will be visible | 392 | * Make sure updated value of ->s_mount_flags will be visible |
393 | * before ->s_flags update | 393 | * before ->s_flags update |
394 | */ | 394 | */ |
395 | smp_wmb(); | 395 | smp_wmb(); |
396 | sb->s_flags |= MS_RDONLY; | 396 | sb->s_flags |= MS_RDONLY; |
397 | } | 397 | } |
398 | if (test_opt(sb, ERRORS_PANIC)) | 398 | if (test_opt(sb, ERRORS_PANIC)) |
399 | panic("EXT4-fs (device %s): panic forced after error\n", | 399 | panic("EXT4-fs (device %s): panic forced after error\n", |
400 | sb->s_id); | 400 | sb->s_id); |
401 | } | 401 | } |
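/*
 * Behaviour sketch for the three errors= mount policies dispatched
 * above:
 *
 *	errors=continue   - record the error, keep running
 *	errors=remount-ro - abort the journal, then flip the
 *	                    superblock to MS_RDONLY
 *	errors=panic      - abort the journal, then panic()
 *
 * e.g. "mount -o errors=remount-ro /dev/sdb1 /mnt" (device name is
 * illustrative) selects the middle policy.
 */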
402 | 402 | ||
403 | #define ext4_error_ratelimit(sb) \ | 403 | #define ext4_error_ratelimit(sb) \ |
404 | ___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state), \ | 404 | ___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state), \ |
405 | "EXT4-fs error") | 405 | "EXT4-fs error") |
406 | 406 | ||
407 | void __ext4_error(struct super_block *sb, const char *function, | 407 | void __ext4_error(struct super_block *sb, const char *function, |
408 | unsigned int line, const char *fmt, ...) | 408 | unsigned int line, const char *fmt, ...) |
409 | { | 409 | { |
410 | struct va_format vaf; | 410 | struct va_format vaf; |
411 | va_list args; | 411 | va_list args; |
412 | 412 | ||
413 | if (ext4_error_ratelimit(sb)) { | 413 | if (ext4_error_ratelimit(sb)) { |
414 | va_start(args, fmt); | 414 | va_start(args, fmt); |
415 | vaf.fmt = fmt; | 415 | vaf.fmt = fmt; |
416 | vaf.va = &args; | 416 | vaf.va = &args; |
417 | printk(KERN_CRIT | 417 | printk(KERN_CRIT |
418 | "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n", | 418 | "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n", |
419 | sb->s_id, function, line, current->comm, &vaf); | 419 | sb->s_id, function, line, current->comm, &vaf); |
420 | va_end(args); | 420 | va_end(args); |
421 | } | 421 | } |
422 | save_error_info(sb, function, line); | 422 | save_error_info(sb, function, line); |
423 | ext4_handle_error(sb); | 423 | ext4_handle_error(sb); |
424 | } | 424 | } |
425 | 425 | ||
426 | void __ext4_error_inode(struct inode *inode, const char *function, | 426 | void __ext4_error_inode(struct inode *inode, const char *function, |
427 | unsigned int line, ext4_fsblk_t block, | 427 | unsigned int line, ext4_fsblk_t block, |
428 | const char *fmt, ...) | 428 | const char *fmt, ...) |
429 | { | 429 | { |
430 | va_list args; | 430 | va_list args; |
431 | struct va_format vaf; | 431 | struct va_format vaf; |
432 | struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; | 432 | struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; |
433 | 433 | ||
434 | es->s_last_error_ino = cpu_to_le32(inode->i_ino); | 434 | es->s_last_error_ino = cpu_to_le32(inode->i_ino); |
435 | es->s_last_error_block = cpu_to_le64(block); | 435 | es->s_last_error_block = cpu_to_le64(block); |
436 | if (ext4_error_ratelimit(inode->i_sb)) { | 436 | if (ext4_error_ratelimit(inode->i_sb)) { |
437 | va_start(args, fmt); | 437 | va_start(args, fmt); |
438 | vaf.fmt = fmt; | 438 | vaf.fmt = fmt; |
439 | vaf.va = &args; | 439 | vaf.va = &args; |
440 | if (block) | 440 | if (block) |
441 | printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " | 441 | printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " |
442 | "inode #%lu: block %llu: comm %s: %pV\n", | 442 | "inode #%lu: block %llu: comm %s: %pV\n", |
443 | inode->i_sb->s_id, function, line, inode->i_ino, | 443 | inode->i_sb->s_id, function, line, inode->i_ino, |
444 | block, current->comm, &vaf); | 444 | block, current->comm, &vaf); |
445 | else | 445 | else |
446 | printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " | 446 | printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " |
447 | "inode #%lu: comm %s: %pV\n", | 447 | "inode #%lu: comm %s: %pV\n", |
448 | inode->i_sb->s_id, function, line, inode->i_ino, | 448 | inode->i_sb->s_id, function, line, inode->i_ino, |
449 | current->comm, &vaf); | 449 | current->comm, &vaf); |
450 | va_end(args); | 450 | va_end(args); |
451 | } | 451 | } |
452 | save_error_info(inode->i_sb, function, line); | 452 | save_error_info(inode->i_sb, function, line); |
453 | ext4_handle_error(inode->i_sb); | 453 | ext4_handle_error(inode->i_sb); |
454 | } | 454 | } |
455 | 455 | ||
456 | void __ext4_error_file(struct file *file, const char *function, | 456 | void __ext4_error_file(struct file *file, const char *function, |
457 | unsigned int line, ext4_fsblk_t block, | 457 | unsigned int line, ext4_fsblk_t block, |
458 | const char *fmt, ...) | 458 | const char *fmt, ...) |
459 | { | 459 | { |
460 | va_list args; | 460 | va_list args; |
461 | struct va_format vaf; | 461 | struct va_format vaf; |
462 | struct ext4_super_block *es; | 462 | struct ext4_super_block *es; |
463 | struct inode *inode = file_inode(file); | 463 | struct inode *inode = file_inode(file); |
464 | char pathname[80], *path; | 464 | char pathname[80], *path; |
465 | 465 | ||
466 | es = EXT4_SB(inode->i_sb)->s_es; | 466 | es = EXT4_SB(inode->i_sb)->s_es; |
467 | es->s_last_error_ino = cpu_to_le32(inode->i_ino); | 467 | es->s_last_error_ino = cpu_to_le32(inode->i_ino); |
468 | if (ext4_error_ratelimit(inode->i_sb)) { | 468 | if (ext4_error_ratelimit(inode->i_sb)) { |
469 | path = d_path(&(file->f_path), pathname, sizeof(pathname)); | 469 | path = d_path(&(file->f_path), pathname, sizeof(pathname)); |
470 | if (IS_ERR(path)) | 470 | if (IS_ERR(path)) |
471 | path = "(unknown)"; | 471 | path = "(unknown)"; |
472 | va_start(args, fmt); | 472 | va_start(args, fmt); |
473 | vaf.fmt = fmt; | 473 | vaf.fmt = fmt; |
474 | vaf.va = &args; | 474 | vaf.va = &args; |
475 | if (block) | 475 | if (block) |
476 | printk(KERN_CRIT | 476 | printk(KERN_CRIT |
477 | "EXT4-fs error (device %s): %s:%d: inode #%lu: " | 477 | "EXT4-fs error (device %s): %s:%d: inode #%lu: " |
478 | "block %llu: comm %s: path %s: %pV\n", | 478 | "block %llu: comm %s: path %s: %pV\n", |
479 | inode->i_sb->s_id, function, line, inode->i_ino, | 479 | inode->i_sb->s_id, function, line, inode->i_ino, |
480 | block, current->comm, path, &vaf); | 480 | block, current->comm, path, &vaf); |
481 | else | 481 | else |
482 | printk(KERN_CRIT | 482 | printk(KERN_CRIT |
483 | "EXT4-fs error (device %s): %s:%d: inode #%lu: " | 483 | "EXT4-fs error (device %s): %s:%d: inode #%lu: " |
484 | "comm %s: path %s: %pV\n", | 484 | "comm %s: path %s: %pV\n", |
485 | inode->i_sb->s_id, function, line, inode->i_ino, | 485 | inode->i_sb->s_id, function, line, inode->i_ino, |
486 | current->comm, path, &vaf); | 486 | current->comm, path, &vaf); |
487 | va_end(args); | 487 | va_end(args); |
488 | } | 488 | } |
489 | save_error_info(inode->i_sb, function, line); | 489 | save_error_info(inode->i_sb, function, line); |
490 | ext4_handle_error(inode->i_sb); | 490 | ext4_handle_error(inode->i_sb); |
491 | } | 491 | } |
492 | 492 | ||
493 | const char *ext4_decode_error(struct super_block *sb, int errno, | 493 | const char *ext4_decode_error(struct super_block *sb, int errno, |
494 | char nbuf[16]) | 494 | char nbuf[16]) |
495 | { | 495 | { |
496 | char *errstr = NULL; | 496 | char *errstr = NULL; |
497 | 497 | ||
498 | switch (errno) { | 498 | switch (errno) { |
499 | case -EIO: | 499 | case -EIO: |
500 | errstr = "IO failure"; | 500 | errstr = "IO failure"; |
501 | break; | 501 | break; |
502 | case -ENOMEM: | 502 | case -ENOMEM: |
503 | errstr = "Out of memory"; | 503 | errstr = "Out of memory"; |
504 | break; | 504 | break; |
505 | case -EROFS: | 505 | case -EROFS: |
506 | if (!sb || (EXT4_SB(sb)->s_journal && | 506 | if (!sb || (EXT4_SB(sb)->s_journal && |
507 | EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)) | 507 | EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)) |
508 | errstr = "Journal has aborted"; | 508 | errstr = "Journal has aborted"; |
509 | else | 509 | else |
510 | errstr = "Readonly filesystem"; | 510 | errstr = "Readonly filesystem"; |
511 | break; | 511 | break; |
512 | default: | 512 | default: |
513 | /* If the caller passed in an extra buffer for unknown | 513 | /* If the caller passed in an extra buffer for unknown |
514 | * errors, textualise them now. Else we just return | 514 | * errors, textualise them now. Else we just return |
515 | * NULL. */ | 515 | * NULL. */ |
516 | if (nbuf) { | 516 | if (nbuf) { |
517 | /* Check for truncated error codes... */ | 517 | /* Check for truncated error codes... */ |
518 | if (snprintf(nbuf, 16, "error %d", -errno) >= 0) | 518 | if (snprintf(nbuf, 16, "error %d", -errno) >= 0) |
519 | errstr = nbuf; | 519 | errstr = nbuf; |
520 | } | 520 | } |
521 | break; | 521 | break; |
522 | } | 522 | } |
523 | 523 | ||
524 | return errstr; | 524 | return errstr; |
525 | } | 525 | } |
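/*
 * Usage sketch: callers supply a small scratch buffer so unknown
 * errno values can still be rendered, e.g.
 *
 *	char nbuf[16];
 *	const char *s = ext4_decode_error(sb, err, nbuf);
 *
 * yielding "IO failure", "Out of memory", "Readonly filesystem", or
 * "error %d"-style text for anything unrecognised. __ext4_std_error()
 * below is the canonical caller of exactly this shape.
 */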
526 | 526 | ||
527 | /* __ext4_std_error decodes expected errors from journaling functions | 527 | /* __ext4_std_error decodes expected errors from journaling functions |
528 | * automatically and invokes the appropriate error response. */ | 528 | * automatically and invokes the appropriate error response. */ |
529 | 529 | ||
530 | void __ext4_std_error(struct super_block *sb, const char *function, | 530 | void __ext4_std_error(struct super_block *sb, const char *function, |
531 | unsigned int line, int errno) | 531 | unsigned int line, int errno) |
532 | { | 532 | { |
533 | char nbuf[16]; | 533 | char nbuf[16]; |
534 | const char *errstr; | 534 | const char *errstr; |
535 | 535 | ||
536 | /* Special case: if the error is EROFS, and we're not already | 536 | /* Special case: if the error is EROFS, and we're not already |
537 | * inside a transaction, then there's really no point in logging | 537 | * inside a transaction, then there's really no point in logging |
538 | * an error. */ | 538 | * an error. */ |
539 | if (errno == -EROFS && journal_current_handle() == NULL && | 539 | if (errno == -EROFS && journal_current_handle() == NULL && |
540 | (sb->s_flags & MS_RDONLY)) | 540 | (sb->s_flags & MS_RDONLY)) |
541 | return; | 541 | return; |
542 | 542 | ||
543 | if (ext4_error_ratelimit(sb)) { | 543 | if (ext4_error_ratelimit(sb)) { |
544 | errstr = ext4_decode_error(sb, errno, nbuf); | 544 | errstr = ext4_decode_error(sb, errno, nbuf); |
545 | printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n", | 545 | printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n", |
546 | sb->s_id, function, line, errstr); | 546 | sb->s_id, function, line, errstr); |
547 | } | 547 | } |
548 | 548 | ||
549 | save_error_info(sb, function, line); | 549 | save_error_info(sb, function, line); |
550 | ext4_handle_error(sb); | 550 | ext4_handle_error(sb); |
551 | } | 551 | } |
552 | 552 | ||
553 | /* | 553 | /* |
554 | * ext4_abort is a much stronger failure handler than ext4_error. The | 554 | * ext4_abort is a much stronger failure handler than ext4_error. The |
555 | * abort function may be used to deal with unrecoverable failures such | 555 | * abort function may be used to deal with unrecoverable failures such |
556 | * as journal IO errors or ENOMEM at a critical moment in log management. | 556 | * as journal IO errors or ENOMEM at a critical moment in log management. |
557 | * | 557 | * |
558 | * We unconditionally force the filesystem into an ABORT|READONLY state, | 558 | * We unconditionally force the filesystem into an ABORT|READONLY state, |
559 | * unless the error response on the fs has been set to panic in which | 559 | * unless the error response on the fs has been set to panic in which |
560 | * case we take the easy way out and panic immediately. | 560 | * case we take the easy way out and panic immediately. |
561 | */ | 561 | */ |
562 | 562 | ||
563 | void __ext4_abort(struct super_block *sb, const char *function, | 563 | void __ext4_abort(struct super_block *sb, const char *function, |
564 | unsigned int line, const char *fmt, ...) | 564 | unsigned int line, const char *fmt, ...) |
565 | { | 565 | { |
566 | va_list args; | 566 | va_list args; |
567 | 567 | ||
568 | save_error_info(sb, function, line); | 568 | save_error_info(sb, function, line); |
569 | va_start(args, fmt); | 569 | va_start(args, fmt); |
570 | printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: ", sb->s_id, | 570 | printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: ", sb->s_id, |
571 | function, line); | 571 | function, line); |
572 | vprintk(fmt, args); | 572 | vprintk(fmt, args); |
573 | printk("\n"); | 573 | printk("\n"); |
574 | va_end(args); | 574 | va_end(args); |
575 | 575 | ||
576 | if ((sb->s_flags & MS_RDONLY) == 0) { | 576 | if ((sb->s_flags & MS_RDONLY) == 0) { |
577 | ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); | 577 | ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); |
578 | EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; | 578 | EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; |
579 | /* | 579 | /* |
580 | * Make sure updated value of ->s_mount_flags will be visible | 580 | * Make sure updated value of ->s_mount_flags will be visible |
581 | * before ->s_flags update | 581 | * before ->s_flags update |
582 | */ | 582 | */ |
583 | smp_wmb(); | 583 | smp_wmb(); |
584 | sb->s_flags |= MS_RDONLY; | 584 | sb->s_flags |= MS_RDONLY; |
585 | if (EXT4_SB(sb)->s_journal) | 585 | if (EXT4_SB(sb)->s_journal) |
586 | jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); | 586 | jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); |
587 | save_error_info(sb, function, line); | 587 | save_error_info(sb, function, line); |
588 | } | 588 | } |
589 | if (test_opt(sb, ERRORS_PANIC)) | 589 | if (test_opt(sb, ERRORS_PANIC)) |
590 | panic("EXT4-fs panic from previous error\n"); | 590 | panic("EXT4-fs panic from previous error\n"); |
591 | } | 591 | } |
592 | 592 | ||
593 | void __ext4_msg(struct super_block *sb, | 593 | void __ext4_msg(struct super_block *sb, |
594 | const char *prefix, const char *fmt, ...) | 594 | const char *prefix, const char *fmt, ...) |
595 | { | 595 | { |
596 | struct va_format vaf; | 596 | struct va_format vaf; |
597 | va_list args; | 597 | va_list args; |
598 | 598 | ||
599 | if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs")) | 599 | if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs")) |
600 | return; | 600 | return; |
601 | 601 | ||
602 | va_start(args, fmt); | 602 | va_start(args, fmt); |
603 | vaf.fmt = fmt; | 603 | vaf.fmt = fmt; |
604 | vaf.va = &args; | 604 | vaf.va = &args; |
605 | printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf); | 605 | printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf); |
606 | va_end(args); | 606 | va_end(args); |
607 | } | 607 | } |
608 | 608 | ||
609 | void __ext4_warning(struct super_block *sb, const char *function, | 609 | void __ext4_warning(struct super_block *sb, const char *function, |
610 | unsigned int line, const char *fmt, ...) | 610 | unsigned int line, const char *fmt, ...) |
611 | { | 611 | { |
612 | struct va_format vaf; | 612 | struct va_format vaf; |
613 | va_list args; | 613 | va_list args; |
614 | 614 | ||
615 | if (!___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), | 615 | if (!___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), |
616 | "EXT4-fs warning")) | 616 | "EXT4-fs warning")) |
617 | return; | 617 | return; |
618 | 618 | ||
619 | va_start(args, fmt); | 619 | va_start(args, fmt); |
620 | vaf.fmt = fmt; | 620 | vaf.fmt = fmt; |
621 | vaf.va = &args; | 621 | vaf.va = &args; |
622 | printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n", | 622 | printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n", |
623 | sb->s_id, function, line, &vaf); | 623 | sb->s_id, function, line, &vaf); |
624 | va_end(args); | 624 | va_end(args); |
625 | } | 625 | } |
626 | 626 | ||
627 | void __ext4_grp_locked_error(const char *function, unsigned int line, | 627 | void __ext4_grp_locked_error(const char *function, unsigned int line, |
628 | struct super_block *sb, ext4_group_t grp, | 628 | struct super_block *sb, ext4_group_t grp, |
629 | unsigned long ino, ext4_fsblk_t block, | 629 | unsigned long ino, ext4_fsblk_t block, |
630 | const char *fmt, ...) | 630 | const char *fmt, ...) |
631 | __releases(bitlock) | 631 | __releases(bitlock) |
632 | __acquires(bitlock) | 632 | __acquires(bitlock) |
633 | { | 633 | { |
634 | struct va_format vaf; | 634 | struct va_format vaf; |
635 | va_list args; | 635 | va_list args; |
636 | struct ext4_super_block *es = EXT4_SB(sb)->s_es; | 636 | struct ext4_super_block *es = EXT4_SB(sb)->s_es; |
637 | 637 | ||
638 | es->s_last_error_ino = cpu_to_le32(ino); | 638 | es->s_last_error_ino = cpu_to_le32(ino); |
639 | es->s_last_error_block = cpu_to_le64(block); | 639 | es->s_last_error_block = cpu_to_le64(block); |
640 | __save_error_info(sb, function, line); | 640 | __save_error_info(sb, function, line); |
641 | 641 | ||
642 | if (ext4_error_ratelimit(sb)) { | 642 | if (ext4_error_ratelimit(sb)) { |
643 | va_start(args, fmt); | 643 | va_start(args, fmt); |
644 | vaf.fmt = fmt; | 644 | vaf.fmt = fmt; |
645 | vaf.va = &args; | 645 | vaf.va = &args; |
646 | printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ", | 646 | printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ", |
647 | sb->s_id, function, line, grp); | 647 | sb->s_id, function, line, grp); |
648 | if (ino) | 648 | if (ino) |
649 | printk(KERN_CONT "inode %lu: ", ino); | 649 | printk(KERN_CONT "inode %lu: ", ino); |
650 | if (block) | 650 | if (block) |
651 | printk(KERN_CONT "block %llu:", | 651 | printk(KERN_CONT "block %llu:", |
652 | (unsigned long long) block); | 652 | (unsigned long long) block); |
653 | printk(KERN_CONT "%pV\n", &vaf); | 653 | printk(KERN_CONT "%pV\n", &vaf); |
654 | va_end(args); | 654 | va_end(args); |
655 | } | 655 | } |
656 | 656 | ||
657 | if (test_opt(sb, ERRORS_CONT)) { | 657 | if (test_opt(sb, ERRORS_CONT)) { |
658 | ext4_commit_super(sb, 0); | 658 | ext4_commit_super(sb, 0); |
659 | return; | 659 | return; |
660 | } | 660 | } |
661 | 661 | ||
662 | ext4_unlock_group(sb, grp); | 662 | ext4_unlock_group(sb, grp); |
663 | ext4_handle_error(sb); | 663 | ext4_handle_error(sb); |
664 | /* | 664 | /* |
665 | * We only get here in the ERRORS_RO case; relocking the group | 665 | * We only get here in the ERRORS_RO case; relocking the group |
666 | * may be dangerous, but nothing bad will happen since the | 666 | * may be dangerous, but nothing bad will happen since the |
667 | * filesystem will have already been marked read-only and the | 667 | * filesystem will have already been marked read-only and the |
668 | * journal has been aborted. We return 1 as a hint to callers | 668 | * journal has been aborted. We return 1 as a hint to callers |
669 | * who might want to use the return value from | 669 | * who might want to use the return value from |
670 | * ext4_grp_locked_error() to distinguish between the | 670 | * ext4_grp_locked_error() to distinguish between the |
671 | * ERRORS_CONT and ERRORS_RO case, and perhaps return more | 671 | * ERRORS_CONT and ERRORS_RO case, and perhaps return more |
672 | * aggressively from the ext4 function in question, with a | 672 | * aggressively from the ext4 function in question, with a |
673 | * more appropriate error code. | 673 | * more appropriate error code. |
674 | */ | 674 | */ |
675 | ext4_lock_group(sb, grp); | 675 | ext4_lock_group(sb, grp); |
676 | return; | 676 | return; |
677 | } | 677 | } |
678 | 678 | ||
679 | void ext4_update_dynamic_rev(struct super_block *sb) | 679 | void ext4_update_dynamic_rev(struct super_block *sb) |
680 | { | 680 | { |
681 | struct ext4_super_block *es = EXT4_SB(sb)->s_es; | 681 | struct ext4_super_block *es = EXT4_SB(sb)->s_es; |
682 | 682 | ||
683 | if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV) | 683 | if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV) |
684 | return; | 684 | return; |
685 | 685 | ||
686 | ext4_warning(sb, | 686 | ext4_warning(sb, |
687 | "updating to rev %d because of new feature flag, " | 687 | "updating to rev %d because of new feature flag, " |
688 | "running e2fsck is recommended", | 688 | "running e2fsck is recommended", |
689 | EXT4_DYNAMIC_REV); | 689 | EXT4_DYNAMIC_REV); |
690 | 690 | ||
691 | es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO); | 691 | es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO); |
692 | es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE); | 692 | es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE); |
693 | es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV); | 693 | es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV); |
694 | /* leave es->s_feature_*compat flags alone */ | 694 | /* leave es->s_feature_*compat flags alone */ |
695 | /* es->s_uuid will be set by e2fsck if empty */ | 695 | /* es->s_uuid will be set by e2fsck if empty */ |
696 | 696 | ||
697 | /* | 697 | /* |
698 | * The rest of the superblock fields should be zero, and if not it | 698 | * The rest of the superblock fields should be zero, and if not it |
699 | * means they are likely already in use, so leave them alone. We | 699 | * means they are likely already in use, so leave them alone. We |
700 | * can leave it up to e2fsck to clean up any inconsistencies there. | 700 | * can leave it up to e2fsck to clean up any inconsistencies there. |
701 | */ | 701 | */ |
702 | } | 702 | } |
703 | 703 | ||
704 | /* | 704 | /* |
705 | * Open the external journal device | 705 | * Open the external journal device |
706 | */ | 706 | */ |
707 | static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) | 707 | static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) |
708 | { | 708 | { |
709 | struct block_device *bdev; | 709 | struct block_device *bdev; |
710 | char b[BDEVNAME_SIZE]; | 710 | char b[BDEVNAME_SIZE]; |
711 | 711 | ||
712 | bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); | 712 | bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); |
713 | if (IS_ERR(bdev)) | 713 | if (IS_ERR(bdev)) |
714 | goto fail; | 714 | goto fail; |
715 | return bdev; | 715 | return bdev; |
716 | 716 | ||
717 | fail: | 717 | fail: |
718 | ext4_msg(sb, KERN_ERR, "failed to open journal device %s: %ld", | 718 | ext4_msg(sb, KERN_ERR, "failed to open journal device %s: %ld", |
719 | __bdevname(dev, b), PTR_ERR(bdev)); | 719 | __bdevname(dev, b), PTR_ERR(bdev)); |
720 | return NULL; | 720 | return NULL; |
721 | } | 721 | } |
722 | 722 | ||
723 | /* | 723 | /* |
724 | * Release the journal device | 724 | * Release the journal device |
725 | */ | 725 | */ |
726 | static void ext4_blkdev_put(struct block_device *bdev) | 726 | static void ext4_blkdev_put(struct block_device *bdev) |
727 | { | 727 | { |
728 | blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); | 728 | blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); |
729 | } | 729 | } |
730 | 730 | ||
731 | static void ext4_blkdev_remove(struct ext4_sb_info *sbi) | 731 | static void ext4_blkdev_remove(struct ext4_sb_info *sbi) |
732 | { | 732 | { |
733 | struct block_device *bdev; | 733 | struct block_device *bdev; |
734 | bdev = sbi->journal_bdev; | 734 | bdev = sbi->journal_bdev; |
735 | if (bdev) { | 735 | if (bdev) { |
736 | ext4_blkdev_put(bdev); | 736 | ext4_blkdev_put(bdev); |
737 | sbi->journal_bdev = NULL; | 737 | sbi->journal_bdev = NULL; |
738 | } | 738 | } |
739 | } | 739 | } |
740 | 740 | ||
741 | static inline struct inode *orphan_list_entry(struct list_head *l) | 741 | static inline struct inode *orphan_list_entry(struct list_head *l) |
742 | { | 742 | { |
743 | return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode; | 743 | return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode; |
744 | } | 744 | } |
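/*
 * The usual container_of() idiom: the list node handed in is the
 * i_orphan member embedded in an ext4_inode_info, so list_entry()
 * recovers the surrounding ext4_inode_info and we hand back its
 * embedded VFS inode.
 */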
745 | 745 | ||
746 | static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi) | 746 | static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi) |
747 | { | 747 | { |
748 | struct list_head *l; | 748 | struct list_head *l; |
749 | 749 | ||
750 | ext4_msg(sb, KERN_ERR, "sb orphan head is %d", | 750 | ext4_msg(sb, KERN_ERR, "sb orphan head is %d", |
751 | le32_to_cpu(sbi->s_es->s_last_orphan)); | 751 | le32_to_cpu(sbi->s_es->s_last_orphan)); |
752 | 752 | ||
753 | printk(KERN_ERR "sb_info orphan list:\n"); | 753 | printk(KERN_ERR "sb_info orphan list:\n"); |
754 | list_for_each(l, &sbi->s_orphan) { | 754 | list_for_each(l, &sbi->s_orphan) { |
755 | struct inode *inode = orphan_list_entry(l); | 755 | struct inode *inode = orphan_list_entry(l); |
756 | printk(KERN_ERR " " | 756 | printk(KERN_ERR " " |
757 | "inode %s:%lu at %p: mode %o, nlink %d, next %d\n", | 757 | "inode %s:%lu at %p: mode %o, nlink %d, next %d\n", |
758 | inode->i_sb->s_id, inode->i_ino, inode, | 758 | inode->i_sb->s_id, inode->i_ino, inode, |
759 | inode->i_mode, inode->i_nlink, | 759 | inode->i_mode, inode->i_nlink, |
760 | NEXT_ORPHAN(inode)); | 760 | NEXT_ORPHAN(inode)); |
761 | } | 761 | } |
762 | } | 762 | } |
763 | 763 | ||
764 | static void ext4_put_super(struct super_block *sb) | 764 | static void ext4_put_super(struct super_block *sb) |
765 | { | 765 | { |
766 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 766 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
767 | struct ext4_super_block *es = sbi->s_es; | 767 | struct ext4_super_block *es = sbi->s_es; |
768 | int i, err; | 768 | int i, err; |
769 | 769 | ||
770 | ext4_unregister_li_request(sb); | 770 | ext4_unregister_li_request(sb); |
771 | dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); | 771 | dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); |
772 | 772 | ||
773 | flush_workqueue(sbi->rsv_conversion_wq); | 773 | flush_workqueue(sbi->rsv_conversion_wq); |
774 | destroy_workqueue(sbi->rsv_conversion_wq); | 774 | destroy_workqueue(sbi->rsv_conversion_wq); |
775 | 775 | ||
776 | if (sbi->s_journal) { | 776 | if (sbi->s_journal) { |
777 | err = jbd2_journal_destroy(sbi->s_journal); | 777 | err = jbd2_journal_destroy(sbi->s_journal); |
778 | sbi->s_journal = NULL; | 778 | sbi->s_journal = NULL; |
779 | if (err < 0) | 779 | if (err < 0) |
780 | ext4_abort(sb, "Couldn't clean up the journal"); | 780 | ext4_abort(sb, "Couldn't clean up the journal"); |
781 | } | 781 | } |
782 | 782 | ||
783 | ext4_es_unregister_shrinker(sbi); | 783 | ext4_es_unregister_shrinker(sbi); |
784 | del_timer_sync(&sbi->s_err_report); | 784 | del_timer_sync(&sbi->s_err_report); |
785 | ext4_release_system_zone(sb); | 785 | ext4_release_system_zone(sb); |
786 | ext4_mb_release(sb); | 786 | ext4_mb_release(sb); |
787 | ext4_ext_release(sb); | 787 | ext4_ext_release(sb); |
788 | ext4_xattr_put_super(sb); | 788 | ext4_xattr_put_super(sb); |
789 | 789 | ||
790 | if (!(sb->s_flags & MS_RDONLY)) { | 790 | if (!(sb->s_flags & MS_RDONLY)) { |
791 | EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); | 791 | EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); |
792 | es->s_state = cpu_to_le16(sbi->s_mount_state); | 792 | es->s_state = cpu_to_le16(sbi->s_mount_state); |
793 | } | 793 | } |
794 | if (!(sb->s_flags & MS_RDONLY)) | 794 | if (!(sb->s_flags & MS_RDONLY)) |
795 | ext4_commit_super(sb, 1); | 795 | ext4_commit_super(sb, 1); |
796 | 796 | ||
797 | if (sbi->s_proc) { | 797 | if (sbi->s_proc) { |
798 | remove_proc_entry("options", sbi->s_proc); | 798 | remove_proc_entry("options", sbi->s_proc); |
799 | remove_proc_entry(sb->s_id, ext4_proc_root); | 799 | remove_proc_entry(sb->s_id, ext4_proc_root); |
800 | } | 800 | } |
801 | kobject_del(&sbi->s_kobj); | 801 | kobject_del(&sbi->s_kobj); |
802 | 802 | ||
803 | for (i = 0; i < sbi->s_gdb_count; i++) | 803 | for (i = 0; i < sbi->s_gdb_count; i++) |
804 | brelse(sbi->s_group_desc[i]); | 804 | brelse(sbi->s_group_desc[i]); |
805 | kvfree(sbi->s_group_desc); | 805 | kvfree(sbi->s_group_desc); |
806 | kvfree(sbi->s_flex_groups); | 806 | kvfree(sbi->s_flex_groups); |
807 | percpu_counter_destroy(&sbi->s_freeclusters_counter); | 807 | percpu_counter_destroy(&sbi->s_freeclusters_counter); |
808 | percpu_counter_destroy(&sbi->s_freeinodes_counter); | 808 | percpu_counter_destroy(&sbi->s_freeinodes_counter); |
809 | percpu_counter_destroy(&sbi->s_dirs_counter); | 809 | percpu_counter_destroy(&sbi->s_dirs_counter); |
810 | percpu_counter_destroy(&sbi->s_dirtyclusters_counter); | 810 | percpu_counter_destroy(&sbi->s_dirtyclusters_counter); |
811 | brelse(sbi->s_sbh); | 811 | brelse(sbi->s_sbh); |
812 | #ifdef CONFIG_QUOTA | 812 | #ifdef CONFIG_QUOTA |
813 | for (i = 0; i < EXT4_MAXQUOTAS; i++) | 813 | for (i = 0; i < EXT4_MAXQUOTAS; i++) |
814 | kfree(sbi->s_qf_names[i]); | 814 | kfree(sbi->s_qf_names[i]); |
815 | #endif | 815 | #endif |
816 | 816 | ||
817 | /* Debugging code just in case the in-memory inode orphan list | 817 | /* Debugging code just in case the in-memory inode orphan list |
818 | * isn't empty. The on-disk one can be non-empty if we've | 818 | * isn't empty. The on-disk one can be non-empty if we've |
819 | * detected an error and taken the fs readonly, but the | 819 | * detected an error and taken the fs readonly, but the |
820 | * in-memory list had better be clean by this point. */ | 820 | * in-memory list had better be clean by this point. */ |
821 | if (!list_empty(&sbi->s_orphan)) | 821 | if (!list_empty(&sbi->s_orphan)) |
822 | dump_orphan_list(sb, sbi); | 822 | dump_orphan_list(sb, sbi); |
823 | J_ASSERT(list_empty(&sbi->s_orphan)); | 823 | J_ASSERT(list_empty(&sbi->s_orphan)); |
824 | 824 | ||
825 | invalidate_bdev(sb->s_bdev); | 825 | invalidate_bdev(sb->s_bdev); |
826 | if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) { | 826 | if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) { |
827 | /* | 827 | /* |
828 | * Invalidate the journal device's buffers. We don't want them | 828 | * Invalidate the journal device's buffers. We don't want them |
829 | * floating about in memory - the physical journal device may | 829 | * floating about in memory - the physical journal device may |
830 | * be hotswapped, and it breaks the `ro-after' testing code. | 830 | * be hotswapped, and it breaks the `ro-after' testing code. |
831 | */ | 831 | */ |
832 | sync_blockdev(sbi->journal_bdev); | 832 | sync_blockdev(sbi->journal_bdev); |
833 | invalidate_bdev(sbi->journal_bdev); | 833 | invalidate_bdev(sbi->journal_bdev); |
834 | ext4_blkdev_remove(sbi); | 834 | ext4_blkdev_remove(sbi); |
835 | } | 835 | } |
836 | if (sbi->s_mb_cache) { | 836 | if (sbi->s_mb_cache) { |
837 | ext4_xattr_destroy_cache(sbi->s_mb_cache); | 837 | ext4_xattr_destroy_cache(sbi->s_mb_cache); |
838 | sbi->s_mb_cache = NULL; | 838 | sbi->s_mb_cache = NULL; |
839 | } | 839 | } |
840 | if (sbi->s_mmp_tsk) | 840 | if (sbi->s_mmp_tsk) |
841 | kthread_stop(sbi->s_mmp_tsk); | 841 | kthread_stop(sbi->s_mmp_tsk); |
842 | sb->s_fs_info = NULL; | 842 | sb->s_fs_info = NULL; |
843 | /* | 843 | /* |
844 | * Now that we are completely done shutting down the | 844 | * Now that we are completely done shutting down the |
845 | * superblock, we need to actually destroy the kobject. | 845 | * superblock, we need to actually destroy the kobject. |
846 | */ | 846 | */ |
847 | kobject_put(&sbi->s_kobj); | 847 | kobject_put(&sbi->s_kobj); |
848 | wait_for_completion(&sbi->s_kobj_unregister); | 848 | wait_for_completion(&sbi->s_kobj_unregister); |
849 | if (sbi->s_chksum_driver) | 849 | if (sbi->s_chksum_driver) |
850 | crypto_free_shash(sbi->s_chksum_driver); | 850 | crypto_free_shash(sbi->s_chksum_driver); |
851 | kfree(sbi->s_blockgroup_lock); | 851 | kfree(sbi->s_blockgroup_lock); |
852 | kfree(sbi); | 852 | kfree(sbi); |
853 | } | 853 | } |
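/*
 * Teardown-order note: the kobject_put()/wait_for_completion() pair
 * near the end of ext4_put_super() makes sure the sysfs release
 * callback has actually run before sbi itself is kfree()d; freeing
 * sbi first would leave a registered kobject pointing at freed
 * memory.
 */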
854 | 854 | ||
855 | static struct kmem_cache *ext4_inode_cachep; | 855 | static struct kmem_cache *ext4_inode_cachep; |
856 | 856 | ||
857 | /* | 857 | /* |
858 | * Called inside a transaction, so use GFP_NOFS | 858 | * Called inside a transaction, so use GFP_NOFS |
859 | */ | 859 | */ |
860 | static struct inode *ext4_alloc_inode(struct super_block *sb) | 860 | static struct inode *ext4_alloc_inode(struct super_block *sb) |
861 | { | 861 | { |
862 | struct ext4_inode_info *ei; | 862 | struct ext4_inode_info *ei; |
863 | 863 | ||
864 | ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS); | 864 | ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS); |
865 | if (!ei) | 865 | if (!ei) |
866 | return NULL; | 866 | return NULL; |
867 | 867 | ||
868 | ei->vfs_inode.i_version = 1; | 868 | ei->vfs_inode.i_version = 1; |
869 | spin_lock_init(&ei->i_raw_lock); | 869 | spin_lock_init(&ei->i_raw_lock); |
870 | INIT_LIST_HEAD(&ei->i_prealloc_list); | 870 | INIT_LIST_HEAD(&ei->i_prealloc_list); |
871 | spin_lock_init(&ei->i_prealloc_lock); | 871 | spin_lock_init(&ei->i_prealloc_lock); |
872 | ext4_es_init_tree(&ei->i_es_tree); | 872 | ext4_es_init_tree(&ei->i_es_tree); |
873 | rwlock_init(&ei->i_es_lock); | 873 | rwlock_init(&ei->i_es_lock); |
874 | INIT_LIST_HEAD(&ei->i_es_list); | 874 | INIT_LIST_HEAD(&ei->i_es_list); |
875 | ei->i_es_all_nr = 0; | 875 | ei->i_es_all_nr = 0; |
876 | ei->i_es_shk_nr = 0; | 876 | ei->i_es_shk_nr = 0; |
877 | ei->i_es_shrink_lblk = 0; | 877 | ei->i_es_shrink_lblk = 0; |
878 | ei->i_reserved_data_blocks = 0; | 878 | ei->i_reserved_data_blocks = 0; |
879 | ei->i_reserved_meta_blocks = 0; | 879 | ei->i_reserved_meta_blocks = 0; |
880 | ei->i_allocated_meta_blocks = 0; | 880 | ei->i_allocated_meta_blocks = 0; |
881 | ei->i_da_metadata_calc_len = 0; | 881 | ei->i_da_metadata_calc_len = 0; |
882 | ei->i_da_metadata_calc_last_lblock = 0; | 882 | ei->i_da_metadata_calc_last_lblock = 0; |
883 | spin_lock_init(&(ei->i_block_reservation_lock)); | 883 | spin_lock_init(&(ei->i_block_reservation_lock)); |
884 | #ifdef CONFIG_QUOTA | 884 | #ifdef CONFIG_QUOTA |
885 | ei->i_reserved_quota = 0; | 885 | ei->i_reserved_quota = 0; |
886 | memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); | 886 | memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); |
887 | #endif | 887 | #endif |
888 | ei->jinode = NULL; | 888 | ei->jinode = NULL; |
889 | INIT_LIST_HEAD(&ei->i_rsv_conversion_list); | 889 | INIT_LIST_HEAD(&ei->i_rsv_conversion_list); |
890 | spin_lock_init(&ei->i_completed_io_lock); | 890 | spin_lock_init(&ei->i_completed_io_lock); |
891 | ei->i_sync_tid = 0; | 891 | ei->i_sync_tid = 0; |
892 | ei->i_datasync_tid = 0; | 892 | ei->i_datasync_tid = 0; |
893 | atomic_set(&ei->i_ioend_count, 0); | 893 | atomic_set(&ei->i_ioend_count, 0); |
894 | atomic_set(&ei->i_unwritten, 0); | 894 | atomic_set(&ei->i_unwritten, 0); |
895 | INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); | 895 | INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); |
896 | 896 | ||
897 | return &ei->vfs_inode; | 897 | return &ei->vfs_inode; |
898 | } | 898 | } |
899 | 899 | ||
900 | static int ext4_drop_inode(struct inode *inode) | 900 | static int ext4_drop_inode(struct inode *inode) |
901 | { | 901 | { |
902 | int drop = generic_drop_inode(inode); | 902 | int drop = generic_drop_inode(inode); |
903 | 903 | ||
904 | trace_ext4_drop_inode(inode, drop); | 904 | trace_ext4_drop_inode(inode, drop); |
905 | return drop; | 905 | return drop; |
906 | } | 906 | } |
907 | 907 | ||
908 | static void ext4_i_callback(struct rcu_head *head) | 908 | static void ext4_i_callback(struct rcu_head *head) |
909 | { | 909 | { |
910 | struct inode *inode = container_of(head, struct inode, i_rcu); | 910 | struct inode *inode = container_of(head, struct inode, i_rcu); |
911 | kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); | 911 | kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); |
912 | } | 912 | } |
913 | 913 | ||
914 | static void ext4_destroy_inode(struct inode *inode) | 914 | static void ext4_destroy_inode(struct inode *inode) |
915 | { | 915 | { |
916 | if (!list_empty(&(EXT4_I(inode)->i_orphan))) { | 916 | if (!list_empty(&(EXT4_I(inode)->i_orphan))) { |
917 | ext4_msg(inode->i_sb, KERN_ERR, | 917 | ext4_msg(inode->i_sb, KERN_ERR, |
918 | "Inode %lu (%p): orphan list check failed!", | 918 | "Inode %lu (%p): orphan list check failed!", |
919 | inode->i_ino, EXT4_I(inode)); | 919 | inode->i_ino, EXT4_I(inode)); |
920 | print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, | 920 | print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, |
921 | EXT4_I(inode), sizeof(struct ext4_inode_info), | 921 | EXT4_I(inode), sizeof(struct ext4_inode_info), |
922 | true); | 922 | true); |
923 | dump_stack(); | 923 | dump_stack(); |
924 | } | 924 | } |
925 | call_rcu(&inode->i_rcu, ext4_i_callback); | 925 | call_rcu(&inode->i_rcu, ext4_i_callback); |
926 | } | 926 | } |
927 | 927 | ||
928 | static void init_once(void *foo) | 928 | static void init_once(void *foo) |
929 | { | 929 | { |
930 | struct ext4_inode_info *ei = (struct ext4_inode_info *) foo; | 930 | struct ext4_inode_info *ei = (struct ext4_inode_info *) foo; |
931 | 931 | ||
932 | INIT_LIST_HEAD(&ei->i_orphan); | 932 | INIT_LIST_HEAD(&ei->i_orphan); |
933 | init_rwsem(&ei->xattr_sem); | 933 | init_rwsem(&ei->xattr_sem); |
934 | init_rwsem(&ei->i_data_sem); | 934 | init_rwsem(&ei->i_data_sem); |
935 | inode_init_once(&ei->vfs_inode); | 935 | inode_init_once(&ei->vfs_inode); |
936 | } | 936 | } |
937 | 937 | ||
938 | static int __init init_inodecache(void) | 938 | static int __init init_inodecache(void) |
939 | { | 939 | { |
940 | ext4_inode_cachep = kmem_cache_create("ext4_inode_cache", | 940 | ext4_inode_cachep = kmem_cache_create("ext4_inode_cache", |
941 | sizeof(struct ext4_inode_info), | 941 | sizeof(struct ext4_inode_info), |
942 | 0, (SLAB_RECLAIM_ACCOUNT| | 942 | 0, (SLAB_RECLAIM_ACCOUNT| |
943 | SLAB_MEM_SPREAD), | 943 | SLAB_MEM_SPREAD), |
944 | init_once); | 944 | init_once); |
945 | if (ext4_inode_cachep == NULL) | 945 | if (ext4_inode_cachep == NULL) |
946 | return -ENOMEM; | 946 | return -ENOMEM; |
947 | return 0; | 947 | return 0; |
948 | } | 948 | } |
949 | 949 | ||
950 | static void destroy_inodecache(void) | 950 | static void destroy_inodecache(void) |
951 | { | 951 | { |
952 | /* | 952 | /* |
953 | * Make sure all delayed rcu free inodes are flushed before we | 953 | * Make sure all delayed rcu free inodes are flushed before we |
954 | * destroy cache. | 954 | * destroy cache. |
955 | */ | 955 | */ |
956 | rcu_barrier(); | 956 | rcu_barrier(); |
957 | kmem_cache_destroy(ext4_inode_cachep); | 957 | kmem_cache_destroy(ext4_inode_cachep); |
958 | } | 958 | } |
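The alloc/ctor split above is easy to misread: init_once() is a slab constructor, invoked a single time when a slot first enters the cache, so only state that must survive a free/re-alloc cycle (list heads, rwsems) belongs there, while ext4_alloc_inode() re-initializes everything else on every allocation. Likewise destroy_inodecache() must rcu_barrier() first, because ext4_destroy_inode() frees through call_rcu() and the cache cannot be torn down while callbacks are still queued. Below is a minimal userspace sketch of the constructor-once idea; all names are illustrative, and none of this is kernel code.

/*
 * Minimal userspace sketch (not kernel code) of the slab "constructor
 * once" pattern: ctor() runs a single time when a slot first comes
 * into existence, so only fields whose state survives an alloc/free
 * cycle belong there; everything else is re-initialized on every
 * allocation, as ext4_alloc_inode() does.
 */
#include <stdbool.h>
#include <stdio.h>

struct obj {
	int persistent;	/* set by the constructor, survives free/alloc */
	int per_use;	/* must be reset on every allocation */
};

#define POOL_SIZE 4
static struct obj pool[POOL_SIZE];
static bool constructed[POOL_SIZE], in_use[POOL_SIZE];

static void ctor(struct obj *o)		/* analogue of init_once() */
{
	o->persistent = 42;
}

static struct obj *pool_alloc(void)	/* analogue of ext4_alloc_inode() */
{
	for (int i = 0; i < POOL_SIZE; i++) {
		if (in_use[i])
			continue;
		if (!constructed[i]) {	/* ctor runs once per slot */
			ctor(&pool[i]);
			constructed[i] = true;
		}
		in_use[i] = true;
		pool[i].per_use = 0;	/* per-allocation reset */
		return &pool[i];
	}
	return NULL;
}

static void pool_free(struct obj *o)
{
	in_use[o - pool] = false;	/* ctor is NOT re-run on reuse */
}

int main(void)
{
	struct obj *o = pool_alloc();
	o->per_use = 7;
	pool_free(o);
	o = pool_alloc();	/* same slot: persistent kept, per_use reset */
	printf("%d %d\n", o->persistent, o->per_use);	/* prints: 42 0 */
	return 0;
}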
959 | 959 | ||
960 | void ext4_clear_inode(struct inode *inode) | 960 | void ext4_clear_inode(struct inode *inode) |
961 | { | 961 | { |
962 | invalidate_inode_buffers(inode); | 962 | invalidate_inode_buffers(inode); |
963 | clear_inode(inode); | 963 | clear_inode(inode); |
964 | dquot_drop(inode); | 964 | dquot_drop(inode); |
965 | ext4_discard_preallocations(inode); | 965 | ext4_discard_preallocations(inode); |
966 | ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); | 966 | ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); |
967 | if (EXT4_I(inode)->jinode) { | 967 | if (EXT4_I(inode)->jinode) { |
968 | jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), | 968 | jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), |
969 | EXT4_I(inode)->jinode); | 969 | EXT4_I(inode)->jinode); |
970 | jbd2_free_inode(EXT4_I(inode)->jinode); | 970 | jbd2_free_inode(EXT4_I(inode)->jinode); |
971 | EXT4_I(inode)->jinode = NULL; | 971 | EXT4_I(inode)->jinode = NULL; |
972 | } | 972 | } |
973 | } | 973 | } |
974 | 974 | ||
975 | static struct inode *ext4_nfs_get_inode(struct super_block *sb, | 975 | static struct inode *ext4_nfs_get_inode(struct super_block *sb, |
976 | u64 ino, u32 generation) | 976 | u64 ino, u32 generation) |
977 | { | 977 | { |
978 | struct inode *inode; | 978 | struct inode *inode; |
979 | 979 | ||
980 | if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) | 980 | if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) |
981 | return ERR_PTR(-ESTALE); | 981 | return ERR_PTR(-ESTALE); |
982 | if (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)) | 982 | if (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)) |
983 | return ERR_PTR(-ESTALE); | 983 | return ERR_PTR(-ESTALE); |
984 | 984 | ||
985 | /* iget isn't really right if the inode is currently unallocated!! | 985 | /* iget isn't really right if the inode is currently unallocated!! |
986 | * | 986 | * |
987 | * ext4_read_inode will return a bad_inode if the inode had been | 987 | * ext4_read_inode will return a bad_inode if the inode had been |
988 | * deleted, so we should be safe. | 988 | * deleted, so we should be safe. |
989 | * | 989 | * |
990 | * Currently we don't know the generation for parent directory, so | 990 | * Currently we don't know the generation for parent directory, so |
991 | * a generation of 0 means "accept any" | 991 | * a generation of 0 means "accept any" |
992 | */ | 992 | */ |
993 | inode = ext4_iget_normal(sb, ino); | 993 | inode = ext4_iget_normal(sb, ino); |
994 | if (IS_ERR(inode)) | 994 | if (IS_ERR(inode)) |
995 | return ERR_CAST(inode); | 995 | return ERR_CAST(inode); |
996 | if (generation && inode->i_generation != generation) { | 996 | if (generation && inode->i_generation != generation) { |
997 | iput(inode); | 997 | iput(inode); |
998 | return ERR_PTR(-ESTALE); | 998 | return ERR_PTR(-ESTALE); |
999 | } | 999 | } |
1000 | 1000 | ||
1001 | return inode; | 1001 | return inode; |
1002 | } | 1002 | } |
1003 | 1003 | ||
1004 | static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid, | 1004 | static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid, |
1005 | int fh_len, int fh_type) | 1005 | int fh_len, int fh_type) |
1006 | { | 1006 | { |
1007 | return generic_fh_to_dentry(sb, fid, fh_len, fh_type, | 1007 | return generic_fh_to_dentry(sb, fid, fh_len, fh_type, |
1008 | ext4_nfs_get_inode); | 1008 | ext4_nfs_get_inode); |
1009 | } | 1009 | } |
1010 | 1010 | ||
1011 | static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, | 1011 | static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, |
1012 | int fh_len, int fh_type) | 1012 | int fh_len, int fh_type) |
1013 | { | 1013 | { |
1014 | return generic_fh_to_parent(sb, fid, fh_len, fh_type, | 1014 | return generic_fh_to_parent(sb, fid, fh_len, fh_type, |
1015 | ext4_nfs_get_inode); | 1015 | ext4_nfs_get_inode); |
1016 | } | 1016 | } |
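The generation check in ext4_nfs_get_inode() is what turns a file handle held by an NFS client across inode reuse into -ESTALE, rather than silent access to whatever file now occupies that inode number. A hypothetical userspace sketch of that check follows; the types, table, and values are demo inventions, not the kernel's.

/*
 * Illustrative sketch of the staleness check in ext4_nfs_get_inode():
 * the inode must exist, and a non-zero generation in the handle must
 * match the inode's current generation (inode numbers are reused,
 * generations are not). Demo types and table only.
 */
#include <errno.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct demo_inode { uint64_t ino; uint32_t generation; };

static struct demo_inode itable[] = {
	{ 2, 1 },	/* root */
	{ 12, 7 },	/* a reused inode, now on generation 7 */
};

static int fh_to_inode(uint64_t ino, uint32_t gen, struct demo_inode **out)
{
	for (size_t i = 0; i < sizeof(itable) / sizeof(itable[0]); i++) {
		if (itable[i].ino != ino)
			continue;
		if (gen && itable[i].generation != gen)
			return -ESTALE;	/* handle predates inode reuse */
		*out = &itable[i];
		return 0;
	}
	return -ESTALE;
}

int main(void)
{
	struct demo_inode *inode;

	/* a client holding generation 6 of inode 12 gets -ESTALE */
	printf("%d\n", fh_to_inode(12, 6, &inode));
	/* generation 0 means "accept any", as the comment above notes */
	printf("%d\n", fh_to_inode(12, 0, &inode));
	return 0;
}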
1017 | 1017 | ||
1018 | /* | 1018 | /* |
1019 | * Try to release metadata pages (indirect blocks, directories) which are | 1019 | * Try to release metadata pages (indirect blocks, directories) which are |
1020 | * mapped via the block device. Since these pages could have journal heads | 1020 | * mapped via the block device. Since these pages could have journal heads |
1021 | * which would prevent try_to_free_buffers() from freeing them, we must use | 1021 | * which would prevent try_to_free_buffers() from freeing them, we must use |
1022 | * jbd2 layer's try_to_free_buffers() function to release them. | 1022 | * jbd2 layer's try_to_free_buffers() function to release them. |
1023 | */ | 1023 | */ |
1024 | static int bdev_try_to_free_page(struct super_block *sb, struct page *page, | 1024 | static int bdev_try_to_free_page(struct super_block *sb, struct page *page, |
1025 | gfp_t wait) | 1025 | gfp_t wait) |
1026 | { | 1026 | { |
1027 | journal_t *journal = EXT4_SB(sb)->s_journal; | 1027 | journal_t *journal = EXT4_SB(sb)->s_journal; |
1028 | 1028 | ||
1029 | WARN_ON(PageChecked(page)); | 1029 | WARN_ON(PageChecked(page)); |
1030 | if (!page_has_buffers(page)) | 1030 | if (!page_has_buffers(page)) |
1031 | return 0; | 1031 | return 0; |
1032 | if (journal) | 1032 | if (journal) |
1033 | return jbd2_journal_try_to_free_buffers(journal, page, | 1033 | return jbd2_journal_try_to_free_buffers(journal, page, |
1034 | wait & ~__GFP_WAIT); | 1034 | wait & ~__GFP_WAIT); |
1035 | return try_to_free_buffers(page); | 1035 | return try_to_free_buffers(page); |
1036 | } | 1036 | } |
1037 | 1037 | ||
1038 | #ifdef CONFIG_QUOTA | 1038 | #ifdef CONFIG_QUOTA |
1039 | #define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group") | 1039 | #define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group") |
1040 | #define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) | 1040 | #define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) |
1041 | 1041 | ||
1042 | static int ext4_write_dquot(struct dquot *dquot); | 1042 | static int ext4_write_dquot(struct dquot *dquot); |
1043 | static int ext4_acquire_dquot(struct dquot *dquot); | 1043 | static int ext4_acquire_dquot(struct dquot *dquot); |
1044 | static int ext4_release_dquot(struct dquot *dquot); | 1044 | static int ext4_release_dquot(struct dquot *dquot); |
1045 | static int ext4_mark_dquot_dirty(struct dquot *dquot); | 1045 | static int ext4_mark_dquot_dirty(struct dquot *dquot); |
1046 | static int ext4_write_info(struct super_block *sb, int type); | 1046 | static int ext4_write_info(struct super_block *sb, int type); |
1047 | static int ext4_quota_on(struct super_block *sb, int type, int format_id, | 1047 | static int ext4_quota_on(struct super_block *sb, int type, int format_id, |
1048 | struct path *path); | 1048 | struct path *path); |
1049 | static int ext4_quota_off(struct super_block *sb, int type); | 1049 | static int ext4_quota_off(struct super_block *sb, int type); |
1050 | static int ext4_quota_on_mount(struct super_block *sb, int type); | 1050 | static int ext4_quota_on_mount(struct super_block *sb, int type); |
1051 | static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, | 1051 | static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, |
1052 | size_t len, loff_t off); | 1052 | size_t len, loff_t off); |
1053 | static ssize_t ext4_quota_write(struct super_block *sb, int type, | 1053 | static ssize_t ext4_quota_write(struct super_block *sb, int type, |
1054 | const char *data, size_t len, loff_t off); | 1054 | const char *data, size_t len, loff_t off); |
1055 | static int ext4_quota_enable(struct super_block *sb, int type, int format_id, | 1055 | static int ext4_quota_enable(struct super_block *sb, int type, int format_id, |
1056 | unsigned int flags); | 1056 | unsigned int flags); |
1057 | static int ext4_enable_quotas(struct super_block *sb); | 1057 | static int ext4_enable_quotas(struct super_block *sb); |
1058 | 1058 | ||
1059 | static struct dquot **ext4_get_dquots(struct inode *inode) | 1059 | static struct dquot **ext4_get_dquots(struct inode *inode) |
1060 | { | 1060 | { |
1061 | return EXT4_I(inode)->i_dquot; | 1061 | return EXT4_I(inode)->i_dquot; |
1062 | } | 1062 | } |
1063 | 1063 | ||
1064 | static const struct dquot_operations ext4_quota_operations = { | 1064 | static const struct dquot_operations ext4_quota_operations = { |
1065 | .get_reserved_space = ext4_get_reserved_space, | 1065 | .get_reserved_space = ext4_get_reserved_space, |
1066 | .write_dquot = ext4_write_dquot, | 1066 | .write_dquot = ext4_write_dquot, |
1067 | .acquire_dquot = ext4_acquire_dquot, | 1067 | .acquire_dquot = ext4_acquire_dquot, |
1068 | .release_dquot = ext4_release_dquot, | 1068 | .release_dquot = ext4_release_dquot, |
1069 | .mark_dirty = ext4_mark_dquot_dirty, | 1069 | .mark_dirty = ext4_mark_dquot_dirty, |
1070 | .write_info = ext4_write_info, | 1070 | .write_info = ext4_write_info, |
1071 | .alloc_dquot = dquot_alloc, | 1071 | .alloc_dquot = dquot_alloc, |
1072 | .destroy_dquot = dquot_destroy, | 1072 | .destroy_dquot = dquot_destroy, |
1073 | }; | 1073 | }; |
1074 | 1074 | ||
1075 | static const struct quotactl_ops ext4_qctl_operations = { | 1075 | static const struct quotactl_ops ext4_qctl_operations = { |
1076 | .quota_on = ext4_quota_on, | 1076 | .quota_on = ext4_quota_on, |
1077 | .quota_off = ext4_quota_off, | 1077 | .quota_off = ext4_quota_off, |
1078 | .quota_sync = dquot_quota_sync, | 1078 | .quota_sync = dquot_quota_sync, |
1079 | .get_info = dquot_get_dqinfo, | 1079 | .get_info = dquot_get_dqinfo, |
1080 | .set_info = dquot_set_dqinfo, | 1080 | .set_info = dquot_set_dqinfo, |
1081 | .get_dqblk = dquot_get_dqblk, | 1081 | .get_dqblk = dquot_get_dqblk, |
1082 | .set_dqblk = dquot_set_dqblk | 1082 | .set_dqblk = dquot_set_dqblk |
1083 | }; | 1083 | }; |
1084 | #endif | 1084 | #endif |
1085 | 1085 | ||
1086 | static const struct super_operations ext4_sops = { | 1086 | static const struct super_operations ext4_sops = { |
1087 | .alloc_inode = ext4_alloc_inode, | 1087 | .alloc_inode = ext4_alloc_inode, |
1088 | .destroy_inode = ext4_destroy_inode, | 1088 | .destroy_inode = ext4_destroy_inode, |
1089 | .write_inode = ext4_write_inode, | 1089 | .write_inode = ext4_write_inode, |
1090 | .dirty_inode = ext4_dirty_inode, | 1090 | .dirty_inode = ext4_dirty_inode, |
1091 | .drop_inode = ext4_drop_inode, | 1091 | .drop_inode = ext4_drop_inode, |
1092 | .evict_inode = ext4_evict_inode, | 1092 | .evict_inode = ext4_evict_inode, |
1093 | .put_super = ext4_put_super, | 1093 | .put_super = ext4_put_super, |
1094 | .sync_fs = ext4_sync_fs, | 1094 | .sync_fs = ext4_sync_fs, |
1095 | .freeze_fs = ext4_freeze, | 1095 | .freeze_fs = ext4_freeze, |
1096 | .unfreeze_fs = ext4_unfreeze, | 1096 | .unfreeze_fs = ext4_unfreeze, |
1097 | .statfs = ext4_statfs, | 1097 | .statfs = ext4_statfs, |
1098 | .remount_fs = ext4_remount, | 1098 | .remount_fs = ext4_remount, |
1099 | .show_options = ext4_show_options, | 1099 | .show_options = ext4_show_options, |
1100 | #ifdef CONFIG_QUOTA | 1100 | #ifdef CONFIG_QUOTA |
1101 | .quota_read = ext4_quota_read, | 1101 | .quota_read = ext4_quota_read, |
1102 | .quota_write = ext4_quota_write, | 1102 | .quota_write = ext4_quota_write, |
1103 | .get_dquots = ext4_get_dquots, | 1103 | .get_dquots = ext4_get_dquots, |
1104 | #endif | 1104 | #endif |
1105 | .bdev_try_to_free_page = bdev_try_to_free_page, | 1105 | .bdev_try_to_free_page = bdev_try_to_free_page, |
1106 | }; | 1106 | }; |
1107 | 1107 | ||
1108 | static const struct export_operations ext4_export_ops = { | 1108 | static const struct export_operations ext4_export_ops = { |
1109 | .fh_to_dentry = ext4_fh_to_dentry, | 1109 | .fh_to_dentry = ext4_fh_to_dentry, |
1110 | .fh_to_parent = ext4_fh_to_parent, | 1110 | .fh_to_parent = ext4_fh_to_parent, |
1111 | .get_parent = ext4_get_parent, | 1111 | .get_parent = ext4_get_parent, |
1112 | }; | 1112 | }; |
1113 | 1113 | ||
1114 | enum { | 1114 | enum { |
1115 | Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, | 1115 | Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, |
1116 | Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, | 1116 | Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, |
1117 | Opt_nouid32, Opt_debug, Opt_removed, | 1117 | Opt_nouid32, Opt_debug, Opt_removed, |
1118 | Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, | 1118 | Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, |
1119 | Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, | 1119 | Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, |
1120 | Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev, | 1120 | Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev, |
1121 | Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit, | 1121 | Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit, |
1122 | Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, | 1122 | Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, |
1123 | Opt_data_err_abort, Opt_data_err_ignore, | 1123 | Opt_data_err_abort, Opt_data_err_ignore, |
1124 | Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, | 1124 | Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, |
1125 | Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, | 1125 | Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, |
1126 | Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, | 1126 | Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, |
1127 | Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_dax, | 1127 | Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_dax, |
1128 | Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, | 1128 | Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, |
1129 | Opt_lazytime, Opt_nolazytime, | 1129 | Opt_lazytime, Opt_nolazytime, |
1130 | Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, | 1130 | Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, |
1131 | Opt_inode_readahead_blks, Opt_journal_ioprio, | 1131 | Opt_inode_readahead_blks, Opt_journal_ioprio, |
1132 | Opt_dioread_nolock, Opt_dioread_lock, | 1132 | Opt_dioread_nolock, Opt_dioread_lock, |
1133 | Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, | 1133 | Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, |
1134 | Opt_max_dir_size_kb, Opt_nojournal_checksum, | 1134 | Opt_max_dir_size_kb, Opt_nojournal_checksum, |
1135 | }; | 1135 | }; |
1136 | 1136 | ||
1137 | static const match_table_t tokens = { | 1137 | static const match_table_t tokens = { |
1138 | {Opt_bsd_df, "bsddf"}, | 1138 | {Opt_bsd_df, "bsddf"}, |
1139 | {Opt_minix_df, "minixdf"}, | 1139 | {Opt_minix_df, "minixdf"}, |
1140 | {Opt_grpid, "grpid"}, | 1140 | {Opt_grpid, "grpid"}, |
1141 | {Opt_grpid, "bsdgroups"}, | 1141 | {Opt_grpid, "bsdgroups"}, |
1142 | {Opt_nogrpid, "nogrpid"}, | 1142 | {Opt_nogrpid, "nogrpid"}, |
1143 | {Opt_nogrpid, "sysvgroups"}, | 1143 | {Opt_nogrpid, "sysvgroups"}, |
1144 | {Opt_resgid, "resgid=%u"}, | 1144 | {Opt_resgid, "resgid=%u"}, |
1145 | {Opt_resuid, "resuid=%u"}, | 1145 | {Opt_resuid, "resuid=%u"}, |
1146 | {Opt_sb, "sb=%u"}, | 1146 | {Opt_sb, "sb=%u"}, |
1147 | {Opt_err_cont, "errors=continue"}, | 1147 | {Opt_err_cont, "errors=continue"}, |
1148 | {Opt_err_panic, "errors=panic"}, | 1148 | {Opt_err_panic, "errors=panic"}, |
1149 | {Opt_err_ro, "errors=remount-ro"}, | 1149 | {Opt_err_ro, "errors=remount-ro"}, |
1150 | {Opt_nouid32, "nouid32"}, | 1150 | {Opt_nouid32, "nouid32"}, |
1151 | {Opt_debug, "debug"}, | 1151 | {Opt_debug, "debug"}, |
1152 | {Opt_removed, "oldalloc"}, | 1152 | {Opt_removed, "oldalloc"}, |
1153 | {Opt_removed, "orlov"}, | 1153 | {Opt_removed, "orlov"}, |
1154 | {Opt_user_xattr, "user_xattr"}, | 1154 | {Opt_user_xattr, "user_xattr"}, |
1155 | {Opt_nouser_xattr, "nouser_xattr"}, | 1155 | {Opt_nouser_xattr, "nouser_xattr"}, |
1156 | {Opt_acl, "acl"}, | 1156 | {Opt_acl, "acl"}, |
1157 | {Opt_noacl, "noacl"}, | 1157 | {Opt_noacl, "noacl"}, |
1158 | {Opt_noload, "norecovery"}, | 1158 | {Opt_noload, "norecovery"}, |
1159 | {Opt_noload, "noload"}, | 1159 | {Opt_noload, "noload"}, |
1160 | {Opt_removed, "nobh"}, | 1160 | {Opt_removed, "nobh"}, |
1161 | {Opt_removed, "bh"}, | 1161 | {Opt_removed, "bh"}, |
1162 | {Opt_commit, "commit=%u"}, | 1162 | {Opt_commit, "commit=%u"}, |
1163 | {Opt_min_batch_time, "min_batch_time=%u"}, | 1163 | {Opt_min_batch_time, "min_batch_time=%u"}, |
1164 | {Opt_max_batch_time, "max_batch_time=%u"}, | 1164 | {Opt_max_batch_time, "max_batch_time=%u"}, |
1165 | {Opt_journal_dev, "journal_dev=%u"}, | 1165 | {Opt_journal_dev, "journal_dev=%u"}, |
1166 | {Opt_journal_path, "journal_path=%s"}, | 1166 | {Opt_journal_path, "journal_path=%s"}, |
1167 | {Opt_journal_checksum, "journal_checksum"}, | 1167 | {Opt_journal_checksum, "journal_checksum"}, |
1168 | {Opt_nojournal_checksum, "nojournal_checksum"}, | 1168 | {Opt_nojournal_checksum, "nojournal_checksum"}, |
1169 | {Opt_journal_async_commit, "journal_async_commit"}, | 1169 | {Opt_journal_async_commit, "journal_async_commit"}, |
1170 | {Opt_abort, "abort"}, | 1170 | {Opt_abort, "abort"}, |
1171 | {Opt_data_journal, "data=journal"}, | 1171 | {Opt_data_journal, "data=journal"}, |
1172 | {Opt_data_ordered, "data=ordered"}, | 1172 | {Opt_data_ordered, "data=ordered"}, |
1173 | {Opt_data_writeback, "data=writeback"}, | 1173 | {Opt_data_writeback, "data=writeback"}, |
1174 | {Opt_data_err_abort, "data_err=abort"}, | 1174 | {Opt_data_err_abort, "data_err=abort"}, |
1175 | {Opt_data_err_ignore, "data_err=ignore"}, | 1175 | {Opt_data_err_ignore, "data_err=ignore"}, |
1176 | {Opt_offusrjquota, "usrjquota="}, | 1176 | {Opt_offusrjquota, "usrjquota="}, |
1177 | {Opt_usrjquota, "usrjquota=%s"}, | 1177 | {Opt_usrjquota, "usrjquota=%s"}, |
1178 | {Opt_offgrpjquota, "grpjquota="}, | 1178 | {Opt_offgrpjquota, "grpjquota="}, |
1179 | {Opt_grpjquota, "grpjquota=%s"}, | 1179 | {Opt_grpjquota, "grpjquota=%s"}, |
1180 | {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, | 1180 | {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, |
1181 | {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, | 1181 | {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, |
1182 | {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, | 1182 | {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, |
1183 | {Opt_grpquota, "grpquota"}, | 1183 | {Opt_grpquota, "grpquota"}, |
1184 | {Opt_noquota, "noquota"}, | 1184 | {Opt_noquota, "noquota"}, |
1185 | {Opt_quota, "quota"}, | 1185 | {Opt_quota, "quota"}, |
1186 | {Opt_usrquota, "usrquota"}, | 1186 | {Opt_usrquota, "usrquota"}, |
1187 | {Opt_barrier, "barrier=%u"}, | 1187 | {Opt_barrier, "barrier=%u"}, |
1188 | {Opt_barrier, "barrier"}, | 1188 | {Opt_barrier, "barrier"}, |
1189 | {Opt_nobarrier, "nobarrier"}, | 1189 | {Opt_nobarrier, "nobarrier"}, |
1190 | {Opt_i_version, "i_version"}, | 1190 | {Opt_i_version, "i_version"}, |
1191 | {Opt_dax, "dax"}, | 1191 | {Opt_dax, "dax"}, |
1192 | {Opt_stripe, "stripe=%u"}, | 1192 | {Opt_stripe, "stripe=%u"}, |
1193 | {Opt_delalloc, "delalloc"}, | 1193 | {Opt_delalloc, "delalloc"}, |
1194 | {Opt_lazytime, "lazytime"}, | 1194 | {Opt_lazytime, "lazytime"}, |
1195 | {Opt_nolazytime, "nolazytime"}, | 1195 | {Opt_nolazytime, "nolazytime"}, |
1196 | {Opt_nodelalloc, "nodelalloc"}, | 1196 | {Opt_nodelalloc, "nodelalloc"}, |
1197 | {Opt_removed, "mblk_io_submit"}, | 1197 | {Opt_removed, "mblk_io_submit"}, |
1198 | {Opt_removed, "nomblk_io_submit"}, | 1198 | {Opt_removed, "nomblk_io_submit"}, |
1199 | {Opt_block_validity, "block_validity"}, | 1199 | {Opt_block_validity, "block_validity"}, |
1200 | {Opt_noblock_validity, "noblock_validity"}, | 1200 | {Opt_noblock_validity, "noblock_validity"}, |
1201 | {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, | 1201 | {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, |
1202 | {Opt_journal_ioprio, "journal_ioprio=%u"}, | 1202 | {Opt_journal_ioprio, "journal_ioprio=%u"}, |
1203 | {Opt_auto_da_alloc, "auto_da_alloc=%u"}, | 1203 | {Opt_auto_da_alloc, "auto_da_alloc=%u"}, |
1204 | {Opt_auto_da_alloc, "auto_da_alloc"}, | 1204 | {Opt_auto_da_alloc, "auto_da_alloc"}, |
1205 | {Opt_noauto_da_alloc, "noauto_da_alloc"}, | 1205 | {Opt_noauto_da_alloc, "noauto_da_alloc"}, |
1206 | {Opt_dioread_nolock, "dioread_nolock"}, | 1206 | {Opt_dioread_nolock, "dioread_nolock"}, |
1207 | {Opt_dioread_lock, "dioread_lock"}, | 1207 | {Opt_dioread_lock, "dioread_lock"}, |
1208 | {Opt_discard, "discard"}, | 1208 | {Opt_discard, "discard"}, |
1209 | {Opt_nodiscard, "nodiscard"}, | 1209 | {Opt_nodiscard, "nodiscard"}, |
1210 | {Opt_init_itable, "init_itable=%u"}, | 1210 | {Opt_init_itable, "init_itable=%u"}, |
1211 | {Opt_init_itable, "init_itable"}, | 1211 | {Opt_init_itable, "init_itable"}, |
1212 | {Opt_noinit_itable, "noinit_itable"}, | 1212 | {Opt_noinit_itable, "noinit_itable"}, |
1213 | {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, | 1213 | {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, |
1214 | {Opt_removed, "check=none"}, /* mount option from ext2/3 */ | 1214 | {Opt_removed, "check=none"}, /* mount option from ext2/3 */ |
1215 | {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ | 1215 | {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ |
1216 | {Opt_removed, "reservation"}, /* mount option from ext2/3 */ | 1216 | {Opt_removed, "reservation"}, /* mount option from ext2/3 */ |
1217 | {Opt_removed, "noreservation"}, /* mount option from ext2/3 */ | 1217 | {Opt_removed, "noreservation"}, /* mount option from ext2/3 */ |
1218 | {Opt_removed, "journal=%u"}, /* mount option from ext2/3 */ | 1218 | {Opt_removed, "journal=%u"}, /* mount option from ext2/3 */ |
1219 | {Opt_err, NULL}, | 1219 | {Opt_err, NULL}, |
1220 | }; | 1220 | }; |
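Each entry above pairs a token with a pattern that match_token() applies to one comma-separated fragment of the option string; %u and %s positions become captured arguments, and table order matters (for example, "barrier=%u" is listed before bare "barrier" so the variant with an argument wins). Here is a rough, self-contained approximation of that matching, simplified and not the lib/parser.c implementation:

/*
 * Rough userspace approximation of match_token() against the table
 * above: options are split on commas, each fragment is matched
 * against a pattern, and "%u" captures become arguments.
 */
#include <stdio.h>
#include <string.h>

enum { Opt_err_ro, Opt_resuid, Opt_discard, Opt_unknown };

static const struct { int token; const char *pattern; } demo_tokens[] = {
	{ Opt_err_ro, "errors=remount-ro" },
	{ Opt_resuid, "resuid=%u" },
	{ Opt_discard, "discard" },
	{ Opt_unknown, NULL },
};

static int demo_match(const char *opt, unsigned int *arg)
{
	for (int i = 0; demo_tokens[i].pattern; i++) {
		const char *pat = demo_tokens[i].pattern;
		const char *pct = strstr(pat, "%u");

		if (!pct) {
			if (!strcmp(opt, pat))	/* literal pattern */
				return demo_tokens[i].token;
		} else if (!strncmp(opt, pat, pct - pat) &&
			   sscanf(opt + (pct - pat), "%u", arg) == 1) {
			return demo_tokens[i].token;
		}
	}
	return Opt_unknown;
}

int main(void)
{
	char opts[] = "errors=remount-ro,resuid=1000,discard";
	char *p, *rest = opts;
	unsigned int arg = 0;

	while ((p = strsep(&rest, ",")) != NULL) {  /* as parse_options() does */
		int token = demo_match(p, &arg);
		printf("token=%d arg=%u for \"%s\"\n", token, arg, p);
	}
	return 0;
}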
1221 | 1221 | ||
1222 | static ext4_fsblk_t get_sb_block(void **data) | 1222 | static ext4_fsblk_t get_sb_block(void **data) |
1223 | { | 1223 | { |
1224 | ext4_fsblk_t sb_block; | 1224 | ext4_fsblk_t sb_block; |
1225 | char *options = (char *) *data; | 1225 | char *options = (char *) *data; |
1226 | 1226 | ||
1227 | if (!options || strncmp(options, "sb=", 3) != 0) | 1227 | if (!options || strncmp(options, "sb=", 3) != 0) |
1228 | return 1; /* Default location */ | 1228 | return 1; /* Default location */ |
1229 | 1229 | ||
1230 | options += 3; | 1230 | options += 3; |
1231 | /* TODO: use simple_strtoll with >32bit ext4 */ | 1231 | /* TODO: use simple_strtoll with >32bit ext4 */ |
1232 | sb_block = simple_strtoul(options, &options, 0); | 1232 | sb_block = simple_strtoul(options, &options, 0); |
1233 | if (*options && *options != ',') { | 1233 | if (*options && *options != ',') { |
1234 | printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n", | 1234 | printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n", |
1235 | (char *) *data); | 1235 | (char *) *data); |
1236 | return 1; | 1236 | return 1; |
1237 | } | 1237 | } |
1238 | if (*options == ',') | 1238 | if (*options == ',') |
1239 | options++; | 1239 | options++; |
1240 | *data = (void *) options; | 1240 | *data = (void *) options; |
1241 | 1241 | ||
1242 | return sb_block; | 1242 | return sb_block; |
1243 | } | 1243 | } |
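get_sb_block() runs before general option parsing: it consumes a leading "sb=N," from the option string and hands the remainder back through *data, which is why handle_mount_opt() can simply return 1 for Opt_sb further down. A standalone sketch of that contract, using demo names and strtoull in place of the kernel helper:

/*
 * Userspace sketch of get_sb_block()'s contract: parse a leading
 * "sb=N" override, skip a trailing comma, and hand the remaining
 * options back to the caller. Returns 1, the default superblock
 * location, when no override is given or the value is malformed.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static unsigned long long demo_get_sb_block(char **data)
{
	char *options = *data;
	unsigned long long sb_block;

	if (!options || strncmp(options, "sb=", 3) != 0)
		return 1;
	options += 3;
	sb_block = strtoull(options, &options, 0);
	if (*options && *options != ',')
		return 1;	/* malformed, fall back to the default */
	if (*options == ',')
		options++;
	*data = options;	/* hand the remaining options back */
	return sb_block;
}

int main(void)
{
	char opts[] = "sb=32768,errors=remount-ro";
	char *rest = opts;

	printf("sb block %llu, remaining \"%s\"\n",
	       demo_get_sb_block(&rest), rest);
	return 0;
}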
1244 | 1244 | ||
1245 | #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) | 1245 | #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) |
1246 | static char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n" | 1246 | static char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n" |
1247 | "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n"; | 1247 | "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n"; |
1248 | 1248 | ||
1249 | #ifdef CONFIG_QUOTA | 1249 | #ifdef CONFIG_QUOTA |
1250 | static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) | 1250 | static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) |
1251 | { | 1251 | { |
1252 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 1252 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
1253 | char *qname; | 1253 | char *qname; |
1254 | int ret = -1; | 1254 | int ret = -1; |
1255 | 1255 | ||
1256 | if (sb_any_quota_loaded(sb) && | 1256 | if (sb_any_quota_loaded(sb) && |
1257 | !sbi->s_qf_names[qtype]) { | 1257 | !sbi->s_qf_names[qtype]) { |
1258 | ext4_msg(sb, KERN_ERR, | 1258 | ext4_msg(sb, KERN_ERR, |
1259 | "Cannot change journaled " | 1259 | "Cannot change journaled " |
1260 | "quota options when quota turned on"); | 1260 | "quota options when quota turned on"); |
1261 | return -1; | 1261 | return -1; |
1262 | } | 1262 | } |
1263 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) { | 1263 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) { |
1264 | ext4_msg(sb, KERN_ERR, "Cannot set journaled quota options " | 1264 | ext4_msg(sb, KERN_ERR, "Cannot set journaled quota options " |
1265 | "when QUOTA feature is enabled"); | 1265 | "when QUOTA feature is enabled"); |
1266 | return -1; | 1266 | return -1; |
1267 | } | 1267 | } |
1268 | qname = match_strdup(args); | 1268 | qname = match_strdup(args); |
1269 | if (!qname) { | 1269 | if (!qname) { |
1270 | ext4_msg(sb, KERN_ERR, | 1270 | ext4_msg(sb, KERN_ERR, |
1271 | "Not enough memory for storing quotafile name"); | 1271 | "Not enough memory for storing quotafile name"); |
1272 | return -1; | 1272 | return -1; |
1273 | } | 1273 | } |
1274 | if (sbi->s_qf_names[qtype]) { | 1274 | if (sbi->s_qf_names[qtype]) { |
1275 | if (strcmp(sbi->s_qf_names[qtype], qname) == 0) | 1275 | if (strcmp(sbi->s_qf_names[qtype], qname) == 0) |
1276 | ret = 1; | 1276 | ret = 1; |
1277 | else | 1277 | else |
1278 | ext4_msg(sb, KERN_ERR, | 1278 | ext4_msg(sb, KERN_ERR, |
1279 | "%s quota file already specified", | 1279 | "%s quota file already specified", |
1280 | QTYPE2NAME(qtype)); | 1280 | QTYPE2NAME(qtype)); |
1281 | goto errout; | 1281 | goto errout; |
1282 | } | 1282 | } |
1283 | if (strchr(qname, '/')) { | 1283 | if (strchr(qname, '/')) { |
1284 | ext4_msg(sb, KERN_ERR, | 1284 | ext4_msg(sb, KERN_ERR, |
1285 | "quotafile must be on filesystem root"); | 1285 | "quotafile must be on filesystem root"); |
1286 | goto errout; | 1286 | goto errout; |
1287 | } | 1287 | } |
1288 | sbi->s_qf_names[qtype] = qname; | 1288 | sbi->s_qf_names[qtype] = qname; |
1289 | set_opt(sb, QUOTA); | 1289 | set_opt(sb, QUOTA); |
1290 | return 1; | 1290 | return 1; |
1291 | errout: | 1291 | errout: |
1292 | kfree(qname); | 1292 | kfree(qname); |
1293 | return ret; | 1293 | return ret; |
1294 | } | 1294 | } |
1295 | 1295 | ||
1296 | static int clear_qf_name(struct super_block *sb, int qtype) | 1296 | static int clear_qf_name(struct super_block *sb, int qtype) |
1297 | { | 1297 | { |
1298 | 1298 | ||
1299 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 1299 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
1300 | 1300 | ||
1301 | if (sb_any_quota_loaded(sb) && | 1301 | if (sb_any_quota_loaded(sb) && |
1302 | sbi->s_qf_names[qtype]) { | 1302 | sbi->s_qf_names[qtype]) { |
1303 | ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options" | 1303 | ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options" |
1304 | " when quota turned on"); | 1304 | " when quota turned on"); |
1305 | return -1; | 1305 | return -1; |
1306 | } | 1306 | } |
1307 | kfree(sbi->s_qf_names[qtype]); | 1307 | kfree(sbi->s_qf_names[qtype]); |
1308 | sbi->s_qf_names[qtype] = NULL; | 1308 | sbi->s_qf_names[qtype] = NULL; |
1309 | return 1; | 1309 | return 1; |
1310 | } | 1310 | } |
1311 | #endif | 1311 | #endif |
1312 | 1312 | ||
1313 | #define MOPT_SET 0x0001 | 1313 | #define MOPT_SET 0x0001 |
1314 | #define MOPT_CLEAR 0x0002 | 1314 | #define MOPT_CLEAR 0x0002 |
1315 | #define MOPT_NOSUPPORT 0x0004 | 1315 | #define MOPT_NOSUPPORT 0x0004 |
1316 | #define MOPT_EXPLICIT 0x0008 | 1316 | #define MOPT_EXPLICIT 0x0008 |
1317 | #define MOPT_CLEAR_ERR 0x0010 | 1317 | #define MOPT_CLEAR_ERR 0x0010 |
1318 | #define MOPT_GTE0 0x0020 | 1318 | #define MOPT_GTE0 0x0020 |
1319 | #ifdef CONFIG_QUOTA | 1319 | #ifdef CONFIG_QUOTA |
1320 | #define MOPT_Q 0 | 1320 | #define MOPT_Q 0 |
1321 | #define MOPT_QFMT 0x0040 | 1321 | #define MOPT_QFMT 0x0040 |
1322 | #else | 1322 | #else |
1323 | #define MOPT_Q MOPT_NOSUPPORT | 1323 | #define MOPT_Q MOPT_NOSUPPORT |
1324 | #define MOPT_QFMT MOPT_NOSUPPORT | 1324 | #define MOPT_QFMT MOPT_NOSUPPORT |
1325 | #endif | 1325 | #endif |
1326 | #define MOPT_DATAJ 0x0080 | 1326 | #define MOPT_DATAJ 0x0080 |
1327 | #define MOPT_NO_EXT2 0x0100 | 1327 | #define MOPT_NO_EXT2 0x0100 |
1328 | #define MOPT_NO_EXT3 0x0200 | 1328 | #define MOPT_NO_EXT3 0x0200 |
1329 | #define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3) | 1329 | #define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3) |
1330 | #define MOPT_STRING 0x0400 | 1330 | #define MOPT_STRING 0x0400 |
1331 | 1331 | ||
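These bits make the table below self-describing: each entry's flags say where an option is valid and how it is applied, and because MOPT_Q and MOPT_QFMT collapse to MOPT_NOSUPPORT when CONFIG_QUOTA is off, every quota option is rejected by the generic "not supported" branch with no per-entry #ifdefs. A small compile-time illustration with stand-in values mirroring the defines above:

/*
 * Illustration of how the MOPT_* bits compose; DEMO_* values are
 * stand-ins, not the kernel's definitions.
 */
#include <assert.h>
#include <stdio.h>

#define DEMO_SET	0x0001
#define DEMO_NOSUPPORT	0x0004
#define DEMO_NO_EXT2	0x0100
#define DEMO_NO_EXT3	0x0200
#define DEMO_EXT4_ONLY	(DEMO_NO_EXT2 | DEMO_NO_EXT3)

int main(void)
{
	int flags = DEMO_EXT4_ONLY | DEMO_SET;	/* e.g. dioread_nolock */

	assert(flags & DEMO_NO_EXT2);	/* rejected on ext2 mounts */
	assert(flags & DEMO_NO_EXT3);	/* rejected on ext3 mounts */
	assert(!(flags & DEMO_NOSUPPORT));
	printf("flags=0x%04x\n", flags);
	return 0;
}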
1332 | static const struct mount_opts { | 1332 | static const struct mount_opts { |
1333 | int token; | 1333 | int token; |
1334 | int mount_opt; | 1334 | int mount_opt; |
1335 | int flags; | 1335 | int flags; |
1336 | } ext4_mount_opts[] = { | 1336 | } ext4_mount_opts[] = { |
1337 | {Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET}, | 1337 | {Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET}, |
1338 | {Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR}, | 1338 | {Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR}, |
1339 | {Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET}, | 1339 | {Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET}, |
1340 | {Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR}, | 1340 | {Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR}, |
1341 | {Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET}, | 1341 | {Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET}, |
1342 | {Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR}, | 1342 | {Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR}, |
1343 | {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, | 1343 | {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, |
1344 | MOPT_EXT4_ONLY | MOPT_SET}, | 1344 | MOPT_EXT4_ONLY | MOPT_SET}, |
1345 | {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, | 1345 | {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, |
1346 | MOPT_EXT4_ONLY | MOPT_CLEAR}, | 1346 | MOPT_EXT4_ONLY | MOPT_CLEAR}, |
1347 | {Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET}, | 1347 | {Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET}, |
1348 | {Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR}, | 1348 | {Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR}, |
1349 | {Opt_delalloc, EXT4_MOUNT_DELALLOC, | 1349 | {Opt_delalloc, EXT4_MOUNT_DELALLOC, |
1350 | MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, | 1350 | MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, |
1351 | {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, | 1351 | {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, |
1352 | MOPT_EXT4_ONLY | MOPT_CLEAR}, | 1352 | MOPT_EXT4_ONLY | MOPT_CLEAR}, |
1353 | {Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, | 1353 | {Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, |
1354 | MOPT_EXT4_ONLY | MOPT_CLEAR}, | 1354 | MOPT_EXT4_ONLY | MOPT_CLEAR}, |
1355 | {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, | 1355 | {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, |
1356 | MOPT_EXT4_ONLY | MOPT_SET}, | 1356 | MOPT_EXT4_ONLY | MOPT_SET}, |
1357 | {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | | 1357 | {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | |
1358 | EXT4_MOUNT_JOURNAL_CHECKSUM), | 1358 | EXT4_MOUNT_JOURNAL_CHECKSUM), |
1359 | MOPT_EXT4_ONLY | MOPT_SET}, | 1359 | MOPT_EXT4_ONLY | MOPT_SET}, |
1360 | {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET}, | 1360 | {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET}, |
1361 | {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR}, | 1361 | {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR}, |
1362 | {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR}, | 1362 | {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR}, |
1363 | {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR}, | 1363 | {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR}, |
1364 | {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, | 1364 | {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, |
1365 | MOPT_NO_EXT2 | MOPT_SET}, | 1365 | MOPT_NO_EXT2 | MOPT_SET}, |
1366 | {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, | 1366 | {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, |
1367 | MOPT_NO_EXT2 | MOPT_CLEAR}, | 1367 | MOPT_NO_EXT2 | MOPT_CLEAR}, |
1368 | {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET}, | 1368 | {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET}, |
1369 | {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR}, | 1369 | {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR}, |
1370 | {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET}, | 1370 | {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET}, |
1371 | {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR}, | 1371 | {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR}, |
1372 | {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR}, | 1372 | {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR}, |
1373 | {Opt_commit, 0, MOPT_GTE0}, | 1373 | {Opt_commit, 0, MOPT_GTE0}, |
1374 | {Opt_max_batch_time, 0, MOPT_GTE0}, | 1374 | {Opt_max_batch_time, 0, MOPT_GTE0}, |
1375 | {Opt_min_batch_time, 0, MOPT_GTE0}, | 1375 | {Opt_min_batch_time, 0, MOPT_GTE0}, |
1376 | {Opt_inode_readahead_blks, 0, MOPT_GTE0}, | 1376 | {Opt_inode_readahead_blks, 0, MOPT_GTE0}, |
1377 | {Opt_init_itable, 0, MOPT_GTE0}, | 1377 | {Opt_init_itable, 0, MOPT_GTE0}, |
1378 | {Opt_dax, EXT4_MOUNT_DAX, MOPT_SET}, | 1378 | {Opt_dax, EXT4_MOUNT_DAX, MOPT_SET}, |
1379 | {Opt_stripe, 0, MOPT_GTE0}, | 1379 | {Opt_stripe, 0, MOPT_GTE0}, |
1380 | {Opt_resuid, 0, MOPT_GTE0}, | 1380 | {Opt_resuid, 0, MOPT_GTE0}, |
1381 | {Opt_resgid, 0, MOPT_GTE0}, | 1381 | {Opt_resgid, 0, MOPT_GTE0}, |
1382 | {Opt_journal_dev, 0, MOPT_GTE0}, | 1382 | {Opt_journal_dev, 0, MOPT_GTE0}, |
1383 | {Opt_journal_path, 0, MOPT_STRING}, | 1383 | {Opt_journal_path, 0, MOPT_STRING}, |
1384 | {Opt_journal_ioprio, 0, MOPT_GTE0}, | 1384 | {Opt_journal_ioprio, 0, MOPT_GTE0}, |
1385 | {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ}, | 1385 | {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ}, |
1386 | {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ}, | 1386 | {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ}, |
1387 | {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, | 1387 | {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, |
1388 | MOPT_NO_EXT2 | MOPT_DATAJ}, | 1388 | MOPT_NO_EXT2 | MOPT_DATAJ}, |
1389 | {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET}, | 1389 | {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET}, |
1390 | {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR}, | 1390 | {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR}, |
1391 | #ifdef CONFIG_EXT4_FS_POSIX_ACL | 1391 | #ifdef CONFIG_EXT4_FS_POSIX_ACL |
1392 | {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET}, | 1392 | {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET}, |
1393 | {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR}, | 1393 | {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR}, |
1394 | #else | 1394 | #else |
1395 | {Opt_acl, 0, MOPT_NOSUPPORT}, | 1395 | {Opt_acl, 0, MOPT_NOSUPPORT}, |
1396 | {Opt_noacl, 0, MOPT_NOSUPPORT}, | 1396 | {Opt_noacl, 0, MOPT_NOSUPPORT}, |
1397 | #endif | 1397 | #endif |
1398 | {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET}, | 1398 | {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET}, |
1399 | {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET}, | 1399 | {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET}, |
1400 | {Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q}, | 1400 | {Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q}, |
1401 | {Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, | 1401 | {Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, |
1402 | MOPT_SET | MOPT_Q}, | 1402 | MOPT_SET | MOPT_Q}, |
1403 | {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA, | 1403 | {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA, |
1404 | MOPT_SET | MOPT_Q}, | 1404 | MOPT_SET | MOPT_Q}, |
1405 | {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | | 1405 | {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | |
1406 | EXT4_MOUNT_GRPQUOTA), MOPT_CLEAR | MOPT_Q}, | 1406 | EXT4_MOUNT_GRPQUOTA), MOPT_CLEAR | MOPT_Q}, |
1407 | {Opt_usrjquota, 0, MOPT_Q}, | 1407 | {Opt_usrjquota, 0, MOPT_Q}, |
1408 | {Opt_grpjquota, 0, MOPT_Q}, | 1408 | {Opt_grpjquota, 0, MOPT_Q}, |
1409 | {Opt_offusrjquota, 0, MOPT_Q}, | 1409 | {Opt_offusrjquota, 0, MOPT_Q}, |
1410 | {Opt_offgrpjquota, 0, MOPT_Q}, | 1410 | {Opt_offgrpjquota, 0, MOPT_Q}, |
1411 | {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT}, | 1411 | {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT}, |
1412 | {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT}, | 1412 | {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT}, |
1413 | {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, | 1413 | {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, |
1414 | {Opt_max_dir_size_kb, 0, MOPT_GTE0}, | 1414 | {Opt_max_dir_size_kb, 0, MOPT_GTE0}, |
1415 | {Opt_err, 0, 0} | 1415 | {Opt_err, 0, 0} |
1416 | }; | 1416 | }; |
1417 | 1417 | ||
1418 | static int handle_mount_opt(struct super_block *sb, char *opt, int token, | 1418 | static int handle_mount_opt(struct super_block *sb, char *opt, int token, |
1419 | substring_t *args, unsigned long *journal_devnum, | 1419 | substring_t *args, unsigned long *journal_devnum, |
1420 | unsigned int *journal_ioprio, int is_remount) | 1420 | unsigned int *journal_ioprio, int is_remount) |
1421 | { | 1421 | { |
1422 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 1422 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
1423 | const struct mount_opts *m; | 1423 | const struct mount_opts *m; |
1424 | kuid_t uid; | 1424 | kuid_t uid; |
1425 | kgid_t gid; | 1425 | kgid_t gid; |
1426 | int arg = 0; | 1426 | int arg = 0; |
1427 | 1427 | ||
1428 | #ifdef CONFIG_QUOTA | 1428 | #ifdef CONFIG_QUOTA |
1429 | if (token == Opt_usrjquota) | 1429 | if (token == Opt_usrjquota) |
1430 | return set_qf_name(sb, USRQUOTA, &args[0]); | 1430 | return set_qf_name(sb, USRQUOTA, &args[0]); |
1431 | else if (token == Opt_grpjquota) | 1431 | else if (token == Opt_grpjquota) |
1432 | return set_qf_name(sb, GRPQUOTA, &args[0]); | 1432 | return set_qf_name(sb, GRPQUOTA, &args[0]); |
1433 | else if (token == Opt_offusrjquota) | 1433 | else if (token == Opt_offusrjquota) |
1434 | return clear_qf_name(sb, USRQUOTA); | 1434 | return clear_qf_name(sb, USRQUOTA); |
1435 | else if (token == Opt_offgrpjquota) | 1435 | else if (token == Opt_offgrpjquota) |
1436 | return clear_qf_name(sb, GRPQUOTA); | 1436 | return clear_qf_name(sb, GRPQUOTA); |
1437 | #endif | 1437 | #endif |
1438 | switch (token) { | 1438 | switch (token) { |
1439 | case Opt_noacl: | 1439 | case Opt_noacl: |
1440 | case Opt_nouser_xattr: | 1440 | case Opt_nouser_xattr: |
1441 | ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5"); | 1441 | ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5"); |
1442 | break; | 1442 | break; |
1443 | case Opt_sb: | 1443 | case Opt_sb: |
1444 | return 1; /* handled by get_sb_block() */ | 1444 | return 1; /* handled by get_sb_block() */ |
1445 | case Opt_removed: | 1445 | case Opt_removed: |
1446 | ext4_msg(sb, KERN_WARNING, "Ignoring removed %s option", opt); | 1446 | ext4_msg(sb, KERN_WARNING, "Ignoring removed %s option", opt); |
1447 | return 1; | 1447 | return 1; |
1448 | case Opt_abort: | 1448 | case Opt_abort: |
1449 | sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; | 1449 | sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; |
1450 | return 1; | 1450 | return 1; |
1451 | case Opt_i_version: | 1451 | case Opt_i_version: |
1452 | sb->s_flags |= MS_I_VERSION; | 1452 | sb->s_flags |= MS_I_VERSION; |
1453 | return 1; | 1453 | return 1; |
1454 | case Opt_lazytime: | 1454 | case Opt_lazytime: |
1455 | sb->s_flags |= MS_LAZYTIME; | 1455 | sb->s_flags |= MS_LAZYTIME; |
1456 | return 1; | 1456 | return 1; |
1457 | case Opt_nolazytime: | 1457 | case Opt_nolazytime: |
1458 | sb->s_flags &= ~MS_LAZYTIME; | 1458 | sb->s_flags &= ~MS_LAZYTIME; |
1459 | return 1; | 1459 | return 1; |
1460 | } | 1460 | } |
1461 | 1461 | ||
1462 | for (m = ext4_mount_opts; m->token != Opt_err; m++) | 1462 | for (m = ext4_mount_opts; m->token != Opt_err; m++) |
1463 | if (token == m->token) | 1463 | if (token == m->token) |
1464 | break; | 1464 | break; |
1465 | 1465 | ||
1466 | if (m->token == Opt_err) { | 1466 | if (m->token == Opt_err) { |
1467 | ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" " | 1467 | ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" " |
1468 | "or missing value", opt); | 1468 | "or missing value", opt); |
1469 | return -1; | 1469 | return -1; |
1470 | } | 1470 | } |
1471 | 1471 | ||
1472 | if ((m->flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) { | 1472 | if ((m->flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) { |
1473 | ext4_msg(sb, KERN_ERR, | 1473 | ext4_msg(sb, KERN_ERR, |
1474 | "Mount option \"%s\" incompatible with ext2", opt); | 1474 | "Mount option \"%s\" incompatible with ext2", opt); |
1475 | return -1; | 1475 | return -1; |
1476 | } | 1476 | } |
1477 | if ((m->flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) { | 1477 | if ((m->flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) { |
1478 | ext4_msg(sb, KERN_ERR, | 1478 | ext4_msg(sb, KERN_ERR, |
1479 | "Mount option \"%s\" incompatible with ext3", opt); | 1479 | "Mount option \"%s\" incompatible with ext3", opt); |
1480 | return -1; | 1480 | return -1; |
1481 | } | 1481 | } |
1482 | 1482 | ||
1483 | if (args->from && !(m->flags & MOPT_STRING) && match_int(args, &arg)) | 1483 | if (args->from && !(m->flags & MOPT_STRING) && match_int(args, &arg)) |
1484 | return -1; | 1484 | return -1; |
1485 | if (args->from && (m->flags & MOPT_GTE0) && (arg < 0)) | 1485 | if (args->from && (m->flags & MOPT_GTE0) && (arg < 0)) |
1486 | return -1; | 1486 | return -1; |
1487 | if (m->flags & MOPT_EXPLICIT) | 1487 | if (m->flags & MOPT_EXPLICIT) |
1488 | set_opt2(sb, EXPLICIT_DELALLOC); | 1488 | set_opt2(sb, EXPLICIT_DELALLOC); |
1489 | if (m->flags & MOPT_CLEAR_ERR) | 1489 | if (m->flags & MOPT_CLEAR_ERR) |
1490 | clear_opt(sb, ERRORS_MASK); | 1490 | clear_opt(sb, ERRORS_MASK); |
1491 | if (token == Opt_noquota && sb_any_quota_loaded(sb)) { | 1491 | if (token == Opt_noquota && sb_any_quota_loaded(sb)) { |
1492 | ext4_msg(sb, KERN_ERR, "Cannot change quota " | 1492 | ext4_msg(sb, KERN_ERR, "Cannot change quota " |
1493 | "options when quota turned on"); | 1493 | "options when quota turned on"); |
1494 | return -1; | 1494 | return -1; |
1495 | } | 1495 | } |
1496 | 1496 | ||
1497 | if (m->flags & MOPT_NOSUPPORT) { | 1497 | if (m->flags & MOPT_NOSUPPORT) { |
1498 | ext4_msg(sb, KERN_ERR, "%s option not supported", opt); | 1498 | ext4_msg(sb, KERN_ERR, "%s option not supported", opt); |
1499 | } else if (token == Opt_commit) { | 1499 | } else if (token == Opt_commit) { |
1500 | if (arg == 0) | 1500 | if (arg == 0) |
1501 | arg = JBD2_DEFAULT_MAX_COMMIT_AGE; | 1501 | arg = JBD2_DEFAULT_MAX_COMMIT_AGE; |
1502 | sbi->s_commit_interval = HZ * arg; | 1502 | sbi->s_commit_interval = HZ * arg; |
1503 | } else if (token == Opt_max_batch_time) { | 1503 | } else if (token == Opt_max_batch_time) { |
1504 | sbi->s_max_batch_time = arg; | 1504 | sbi->s_max_batch_time = arg; |
1505 | } else if (token == Opt_min_batch_time) { | 1505 | } else if (token == Opt_min_batch_time) { |
1506 | sbi->s_min_batch_time = arg; | 1506 | sbi->s_min_batch_time = arg; |
1507 | } else if (token == Opt_inode_readahead_blks) { | 1507 | } else if (token == Opt_inode_readahead_blks) { |
1508 | if (arg && (arg > (1 << 30) || !is_power_of_2(arg))) { | 1508 | if (arg && (arg > (1 << 30) || !is_power_of_2(arg))) { |
1509 | ext4_msg(sb, KERN_ERR, | 1509 | ext4_msg(sb, KERN_ERR, |
1510 | "EXT4-fs: inode_readahead_blks must be " | 1510 | "EXT4-fs: inode_readahead_blks must be " |
1511 | "0 or a power of 2 smaller than 2^31"); | 1511 | "0 or a power of 2 smaller than 2^31"); |
1512 | return -1; | 1512 | return -1; |
1513 | } | 1513 | } |
1514 | sbi->s_inode_readahead_blks = arg; | 1514 | sbi->s_inode_readahead_blks = arg; |
1515 | } else if (token == Opt_init_itable) { | 1515 | } else if (token == Opt_init_itable) { |
1516 | set_opt(sb, INIT_INODE_TABLE); | 1516 | set_opt(sb, INIT_INODE_TABLE); |
1517 | if (!args->from) | 1517 | if (!args->from) |
1518 | arg = EXT4_DEF_LI_WAIT_MULT; | 1518 | arg = EXT4_DEF_LI_WAIT_MULT; |
1519 | sbi->s_li_wait_mult = arg; | 1519 | sbi->s_li_wait_mult = arg; |
1520 | } else if (token == Opt_max_dir_size_kb) { | 1520 | } else if (token == Opt_max_dir_size_kb) { |
1521 | sbi->s_max_dir_size_kb = arg; | 1521 | sbi->s_max_dir_size_kb = arg; |
1522 | } else if (token == Opt_stripe) { | 1522 | } else if (token == Opt_stripe) { |
1523 | sbi->s_stripe = arg; | 1523 | sbi->s_stripe = arg; |
1524 | } else if (token == Opt_resuid) { | 1524 | } else if (token == Opt_resuid) { |
1525 | uid = make_kuid(current_user_ns(), arg); | 1525 | uid = make_kuid(current_user_ns(), arg); |
1526 | if (!uid_valid(uid)) { | 1526 | if (!uid_valid(uid)) { |
1527 | ext4_msg(sb, KERN_ERR, "Invalid uid value %d", arg); | 1527 | ext4_msg(sb, KERN_ERR, "Invalid uid value %d", arg); |
1528 | return -1; | 1528 | return -1; |
1529 | } | 1529 | } |
1530 | sbi->s_resuid = uid; | 1530 | sbi->s_resuid = uid; |
1531 | } else if (token == Opt_resgid) { | 1531 | } else if (token == Opt_resgid) { |
1532 | gid = make_kgid(current_user_ns(), arg); | 1532 | gid = make_kgid(current_user_ns(), arg); |
1533 | if (!gid_valid(gid)) { | 1533 | if (!gid_valid(gid)) { |
1534 | ext4_msg(sb, KERN_ERR, "Invalid gid value %d", arg); | 1534 | ext4_msg(sb, KERN_ERR, "Invalid gid value %d", arg); |
1535 | return -1; | 1535 | return -1; |
1536 | } | 1536 | } |
1537 | sbi->s_resgid = gid; | 1537 | sbi->s_resgid = gid; |
1538 | } else if (token == Opt_journal_dev) { | 1538 | } else if (token == Opt_journal_dev) { |
1539 | if (is_remount) { | 1539 | if (is_remount) { |
1540 | ext4_msg(sb, KERN_ERR, | 1540 | ext4_msg(sb, KERN_ERR, |
1541 | "Cannot specify journal on remount"); | 1541 | "Cannot specify journal on remount"); |
1542 | return -1; | 1542 | return -1; |
1543 | } | 1543 | } |
1544 | *journal_devnum = arg; | 1544 | *journal_devnum = arg; |
1545 | } else if (token == Opt_journal_path) { | 1545 | } else if (token == Opt_journal_path) { |
1546 | char *journal_path; | 1546 | char *journal_path; |
1547 | struct inode *journal_inode; | 1547 | struct inode *journal_inode; |
1548 | struct path path; | 1548 | struct path path; |
1549 | int error; | 1549 | int error; |
1550 | 1550 | ||
1551 | if (is_remount) { | 1551 | if (is_remount) { |
1552 | ext4_msg(sb, KERN_ERR, | 1552 | ext4_msg(sb, KERN_ERR, |
1553 | "Cannot specify journal on remount"); | 1553 | "Cannot specify journal on remount"); |
1554 | return -1; | 1554 | return -1; |
1555 | } | 1555 | } |
1556 | journal_path = match_strdup(&args[0]); | 1556 | journal_path = match_strdup(&args[0]); |
1557 | if (!journal_path) { | 1557 | if (!journal_path) { |
1558 | ext4_msg(sb, KERN_ERR, "error: could not dup " | 1558 | ext4_msg(sb, KERN_ERR, "error: could not dup " |
1559 | "journal device string"); | 1559 | "journal device string"); |
1560 | return -1; | 1560 | return -1; |
1561 | } | 1561 | } |
1562 | 1562 | ||
1563 | error = kern_path(journal_path, LOOKUP_FOLLOW, &path); | 1563 | error = kern_path(journal_path, LOOKUP_FOLLOW, &path); |
1564 | if (error) { | 1564 | if (error) { |
1565 | ext4_msg(sb, KERN_ERR, "error: could not find " | 1565 | ext4_msg(sb, KERN_ERR, "error: could not find " |
1566 | "journal device path: error %d", error); | 1566 | "journal device path: error %d", error); |
1567 | kfree(journal_path); | 1567 | kfree(journal_path); |
1568 | return -1; | 1568 | return -1; |
1569 | } | 1569 | } |
1570 | 1570 | ||
1571 | journal_inode = path.dentry->d_inode; | 1571 | journal_inode = path.dentry->d_inode; |
1572 | if (!S_ISBLK(journal_inode->i_mode)) { | 1572 | if (!S_ISBLK(journal_inode->i_mode)) { |
1573 | ext4_msg(sb, KERN_ERR, "error: journal path %s " | 1573 | ext4_msg(sb, KERN_ERR, "error: journal path %s " |
1574 | "is not a block device", journal_path); | 1574 | "is not a block device", journal_path); |
1575 | path_put(&path); | 1575 | path_put(&path); |
1576 | kfree(journal_path); | 1576 | kfree(journal_path); |
1577 | return -1; | 1577 | return -1; |
1578 | } | 1578 | } |
1579 | 1579 | ||
1580 | *journal_devnum = new_encode_dev(journal_inode->i_rdev); | 1580 | *journal_devnum = new_encode_dev(journal_inode->i_rdev); |
1581 | path_put(&path); | 1581 | path_put(&path); |
1582 | kfree(journal_path); | 1582 | kfree(journal_path); |
1583 | } else if (token == Opt_journal_ioprio) { | 1583 | } else if (token == Opt_journal_ioprio) { |
1584 | if (arg > 7) { | 1584 | if (arg > 7) { |
1585 | ext4_msg(sb, KERN_ERR, "Invalid journal IO priority" | 1585 | ext4_msg(sb, KERN_ERR, "Invalid journal IO priority" |
1586 | " (must be 0-7)"); | 1586 | " (must be 0-7)"); |
1587 | return -1; | 1587 | return -1; |
1588 | } | 1588 | } |
1589 | *journal_ioprio = | 1589 | *journal_ioprio = |
1590 | IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); | 1590 | IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); |
1591 | } else if (m->flags & MOPT_DATAJ) { | 1591 | } else if (m->flags & MOPT_DATAJ) { |
1592 | if (is_remount) { | 1592 | if (is_remount) { |
1593 | if (!sbi->s_journal) | 1593 | if (!sbi->s_journal) |
1594 | ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option"); | 1594 | ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option"); |
1595 | else if (test_opt(sb, DATA_FLAGS) != m->mount_opt) { | 1595 | else if (test_opt(sb, DATA_FLAGS) != m->mount_opt) { |
1596 | ext4_msg(sb, KERN_ERR, | 1596 | ext4_msg(sb, KERN_ERR, |
1597 | "Cannot change data mode on remount"); | 1597 | "Cannot change data mode on remount"); |
1598 | return -1; | 1598 | return -1; |
1599 | } | 1599 | } |
1600 | } else { | 1600 | } else { |
1601 | clear_opt(sb, DATA_FLAGS); | 1601 | clear_opt(sb, DATA_FLAGS); |
1602 | sbi->s_mount_opt |= m->mount_opt; | 1602 | sbi->s_mount_opt |= m->mount_opt; |
1603 | } | 1603 | } |
1604 | #ifdef CONFIG_QUOTA | 1604 | #ifdef CONFIG_QUOTA |
1605 | } else if (m->flags & MOPT_QFMT) { | 1605 | } else if (m->flags & MOPT_QFMT) { |
1606 | if (sb_any_quota_loaded(sb) && | 1606 | if (sb_any_quota_loaded(sb) && |
1607 | sbi->s_jquota_fmt != m->mount_opt) { | 1607 | sbi->s_jquota_fmt != m->mount_opt) { |
1608 | ext4_msg(sb, KERN_ERR, "Cannot change journaled " | 1608 | ext4_msg(sb, KERN_ERR, "Cannot change journaled " |
1609 | "quota options when quota turned on"); | 1609 | "quota options when quota turned on"); |
1610 | return -1; | 1610 | return -1; |
1611 | } | 1611 | } |
1612 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, | 1612 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, |
1613 | EXT4_FEATURE_RO_COMPAT_QUOTA)) { | 1613 | EXT4_FEATURE_RO_COMPAT_QUOTA)) { |
1614 | ext4_msg(sb, KERN_ERR, | 1614 | ext4_msg(sb, KERN_ERR, |
1615 | "Cannot set journaled quota options " | 1615 | "Cannot set journaled quota options " |
1616 | "when QUOTA feature is enabled"); | 1616 | "when QUOTA feature is enabled"); |
1617 | return -1; | 1617 | return -1; |
1618 | } | 1618 | } |
1619 | sbi->s_jquota_fmt = m->mount_opt; | 1619 | sbi->s_jquota_fmt = m->mount_opt; |
1620 | #endif | 1620 | #endif |
1621 | #ifndef CONFIG_FS_DAX | 1621 | #ifndef CONFIG_FS_DAX |
1622 | } else if (token == Opt_dax) { | 1622 | } else if (token == Opt_dax) { |
1623 | ext4_msg(sb, KERN_INFO, "dax option not supported"); | 1623 | ext4_msg(sb, KERN_INFO, "dax option not supported"); |
1624 | return -1; | 1624 | return -1; |
1625 | #endif | 1625 | #endif |
1626 | } else { | 1626 | } else { |
1627 | if (!args->from) | 1627 | if (!args->from) |
1628 | arg = 1; | 1628 | arg = 1; |
1629 | if (m->flags & MOPT_CLEAR) | 1629 | if (m->flags & MOPT_CLEAR) |
1630 | arg = !arg; | 1630 | arg = !arg; |
1631 | else if (unlikely(!(m->flags & MOPT_SET))) { | 1631 | else if (unlikely(!(m->flags & MOPT_SET))) { |
1632 | ext4_msg(sb, KERN_WARNING, | 1632 | ext4_msg(sb, KERN_WARNING, |
1633 | "buggy handling of option %s", opt); | 1633 | "buggy handling of option %s", opt); |
1634 | WARN_ON(1); | 1634 | WARN_ON(1); |
1635 | return -1; | 1635 | return -1; |
1636 | } | 1636 | } |
1637 | if (arg != 0) | 1637 | if (arg != 0) |
1638 | sbi->s_mount_opt |= m->mount_opt; | 1638 | sbi->s_mount_opt |= m->mount_opt; |
1639 | else | 1639 | else |
1640 | sbi->s_mount_opt &= ~m->mount_opt; | 1640 | sbi->s_mount_opt &= ~m->mount_opt; |
1641 | } | 1641 | } |
1642 | return 1; | 1642 | return 1; |
1643 | } | 1643 | } |
1644 | 1644 | ||
1645 | static int parse_options(char *options, struct super_block *sb, | 1645 | static int parse_options(char *options, struct super_block *sb, |
1646 | unsigned long *journal_devnum, | 1646 | unsigned long *journal_devnum, |
1647 | unsigned int *journal_ioprio, | 1647 | unsigned int *journal_ioprio, |
1648 | int is_remount) | 1648 | int is_remount) |
1649 | { | 1649 | { |
1650 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 1650 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
1651 | char *p; | 1651 | char *p; |
1652 | substring_t args[MAX_OPT_ARGS]; | 1652 | substring_t args[MAX_OPT_ARGS]; |
1653 | int token; | 1653 | int token; |
1654 | 1654 | ||
1655 | if (!options) | 1655 | if (!options) |
1656 | return 1; | 1656 | return 1; |
1657 | 1657 | ||
1658 | while ((p = strsep(&options, ",")) != NULL) { | 1658 | while ((p = strsep(&options, ",")) != NULL) { |
1659 | if (!*p) | 1659 | if (!*p) |
1660 | continue; | 1660 | continue; |
1661 | /* | 1661 | /* |
1662 | * Initialize args struct so we know whether arg was | 1662 | * Initialize args struct so we know whether arg was |
1663 | * found; some options take optional arguments. | 1663 | * found; some options take optional arguments. |
1664 | */ | 1664 | */ |
1665 | args[0].to = args[0].from = NULL; | 1665 | args[0].to = args[0].from = NULL; |
1666 | token = match_token(p, tokens, args); | 1666 | token = match_token(p, tokens, args); |
1667 | if (handle_mount_opt(sb, p, token, args, journal_devnum, | 1667 | if (handle_mount_opt(sb, p, token, args, journal_devnum, |
1668 | journal_ioprio, is_remount) < 0) | 1668 | journal_ioprio, is_remount) < 0) |
1669 | return 0; | 1669 | return 0; |
1670 | } | 1670 | } |
1671 | #ifdef CONFIG_QUOTA | 1671 | #ifdef CONFIG_QUOTA |
1672 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && | 1672 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && |
1673 | (test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) { | 1673 | (test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) { |
1674 | ext4_msg(sb, KERN_ERR, "Cannot set quota options when QUOTA " | 1674 | ext4_msg(sb, KERN_ERR, "Cannot set quota options when QUOTA " |
1675 | "feature is enabled"); | 1675 | "feature is enabled"); |
1676 | return 0; | 1676 | return 0; |
1677 | } | 1677 | } |
1678 | if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { | 1678 | if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { |
1679 | if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) | 1679 | if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) |
1680 | clear_opt(sb, USRQUOTA); | 1680 | clear_opt(sb, USRQUOTA); |
1681 | 1681 | ||
1682 | if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) | 1682 | if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) |
1683 | clear_opt(sb, GRPQUOTA); | 1683 | clear_opt(sb, GRPQUOTA); |
1684 | 1684 | ||
1685 | if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { | 1685 | if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { |
1686 | ext4_msg(sb, KERN_ERR, "old and new quota " | 1686 | ext4_msg(sb, KERN_ERR, "old and new quota " |
1687 | "format mixing"); | 1687 | "format mixing"); |
1688 | return 0; | 1688 | return 0; |
1689 | } | 1689 | } |
1690 | 1690 | ||
1691 | if (!sbi->s_jquota_fmt) { | 1691 | if (!sbi->s_jquota_fmt) { |
1692 | ext4_msg(sb, KERN_ERR, "journaled quota format " | 1692 | ext4_msg(sb, KERN_ERR, "journaled quota format " |
1693 | "not specified"); | 1693 | "not specified"); |
1694 | return 0; | 1694 | return 0; |
1695 | } | 1695 | } |
1696 | } | 1696 | } |
1697 | #endif | 1697 | #endif |
1698 | if (test_opt(sb, DIOREAD_NOLOCK)) { | 1698 | if (test_opt(sb, DIOREAD_NOLOCK)) { |
1699 | int blocksize = | 1699 | int blocksize = |
1700 | BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); | 1700 | BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); |
1701 | 1701 | ||
1702 | if (blocksize < PAGE_CACHE_SIZE) { | 1702 | if (blocksize < PAGE_CACHE_SIZE) { |
1703 | ext4_msg(sb, KERN_ERR, "can't mount with " | 1703 | ext4_msg(sb, KERN_ERR, "can't mount with " |
1704 | "dioread_nolock if block size != PAGE_SIZE"); | 1704 | "dioread_nolock if block size != PAGE_SIZE"); |
1705 | return 0; | 1705 | return 0; |
1706 | } | 1706 | } |
1707 | } | 1707 | } |
1708 | if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA && | 1708 | if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA && |
1709 | test_opt(sb, JOURNAL_ASYNC_COMMIT)) { | 1709 | test_opt(sb, JOURNAL_ASYNC_COMMIT)) { |
1710 | ext4_msg(sb, KERN_ERR, "can't mount with journal_async_commit " | 1710 | ext4_msg(sb, KERN_ERR, "can't mount with journal_async_commit " |
1711 | "in data=ordered mode"); | 1711 | "in data=ordered mode"); |
1712 | return 0; | 1712 | return 0; |
1713 | } | 1713 | } |
1714 | return 1; | 1714 | return 1; |
1715 | } | 1715 | } |
1716 | 1716 | ||
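For readers following the control flow: the loop above is the classic strsep()-based tokenizer, destructively splitting the comma-separated option string, skipping empty tokens, and dispatching each one to a handler. A minimal user-space sketch of the same pattern, assuming a glibc/BSD environment (handle_opt() is a hypothetical stand-in for handle_mount_opt() and the match_token() table, not the kernel code):

#define _DEFAULT_SOURCE   /* for strsep() on glibc */
#include <stdio.h>
#include <string.h>

/* Hypothetical stand-in for handle_mount_opt(); returns < 0 on error. */
static int handle_opt(const char *opt)
{
        printf("option: %s\n", opt);
        return 1;
}

int main(void)
{
        char options[] = "data=ordered,,noatime,commit=5";
        char *rest = options, *p;

        /* strsep() splits destructively; empty tokens are skipped, just
         * like the "if (!*p) continue;" in parse_options() above. */
        while ((p = strsep(&rest, ",")) != NULL) {
                if (!*p)
                        continue;
                if (handle_opt(p) < 0)
                        return 1;
        }
        return 0;
}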
1717 | static inline void ext4_show_quota_options(struct seq_file *seq, | 1717 | static inline void ext4_show_quota_options(struct seq_file *seq, |
1718 | struct super_block *sb) | 1718 | struct super_block *sb) |
1719 | { | 1719 | { |
1720 | #if defined(CONFIG_QUOTA) | 1720 | #if defined(CONFIG_QUOTA) |
1721 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 1721 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
1722 | 1722 | ||
1723 | if (sbi->s_jquota_fmt) { | 1723 | if (sbi->s_jquota_fmt) { |
1724 | char *fmtname = ""; | 1724 | char *fmtname = ""; |
1725 | 1725 | ||
1726 | switch (sbi->s_jquota_fmt) { | 1726 | switch (sbi->s_jquota_fmt) { |
1727 | case QFMT_VFS_OLD: | 1727 | case QFMT_VFS_OLD: |
1728 | fmtname = "vfsold"; | 1728 | fmtname = "vfsold"; |
1729 | break; | 1729 | break; |
1730 | case QFMT_VFS_V0: | 1730 | case QFMT_VFS_V0: |
1731 | fmtname = "vfsv0"; | 1731 | fmtname = "vfsv0"; |
1732 | break; | 1732 | break; |
1733 | case QFMT_VFS_V1: | 1733 | case QFMT_VFS_V1: |
1734 | fmtname = "vfsv1"; | 1734 | fmtname = "vfsv1"; |
1735 | break; | 1735 | break; |
1736 | } | 1736 | } |
1737 | seq_printf(seq, ",jqfmt=%s", fmtname); | 1737 | seq_printf(seq, ",jqfmt=%s", fmtname); |
1738 | } | 1738 | } |
1739 | 1739 | ||
1740 | if (sbi->s_qf_names[USRQUOTA]) | 1740 | if (sbi->s_qf_names[USRQUOTA]) |
1741 | seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); | 1741 | seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); |
1742 | 1742 | ||
1743 | if (sbi->s_qf_names[GRPQUOTA]) | 1743 | if (sbi->s_qf_names[GRPQUOTA]) |
1744 | seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); | 1744 | seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); |
1745 | #endif | 1745 | #endif |
1746 | } | 1746 | } |
1747 | 1747 | ||
1748 | static const char *token2str(int token) | 1748 | static const char *token2str(int token) |
1749 | { | 1749 | { |
1750 | const struct match_token *t; | 1750 | const struct match_token *t; |
1751 | 1751 | ||
1752 | for (t = tokens; t->token != Opt_err; t++) | 1752 | for (t = tokens; t->token != Opt_err; t++) |
1753 | if (t->token == token && !strchr(t->pattern, '=')) | 1753 | if (t->token == token && !strchr(t->pattern, '=')) |
1754 | break; | 1754 | break; |
1755 | return t->pattern; | 1755 | return t->pattern; |
1756 | } | 1756 | } |
1757 | 1757 | ||
1758 | /* | 1758 | /* |
1759 | * Show an option if | 1759 | * Show an option if |
1760 | * - it's set to a non-default value OR | 1760 | * - it's set to a non-default value OR |
1761 |  * - the per-sb default is different from the global default | 1761 |  * - the per-sb default is different from the global default |
1762 | */ | 1762 | */ |
1763 | static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, | 1763 | static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, |
1764 | int nodefs) | 1764 | int nodefs) |
1765 | { | 1765 | { |
1766 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 1766 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
1767 | struct ext4_super_block *es = sbi->s_es; | 1767 | struct ext4_super_block *es = sbi->s_es; |
1768 | int def_errors, def_mount_opt = nodefs ? 0 : sbi->s_def_mount_opt; | 1768 | int def_errors, def_mount_opt = nodefs ? 0 : sbi->s_def_mount_opt; |
1769 | const struct mount_opts *m; | 1769 | const struct mount_opts *m; |
1770 | char sep = nodefs ? '\n' : ','; | 1770 | char sep = nodefs ? '\n' : ','; |
1771 | 1771 | ||
1772 | #define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep) | 1772 | #define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep) |
1773 | #define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg) | 1773 | #define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg) |
1774 | 1774 | ||
1775 | if (sbi->s_sb_block != 1) | 1775 | if (sbi->s_sb_block != 1) |
1776 | SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block); | 1776 | SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block); |
1777 | 1777 | ||
1778 | for (m = ext4_mount_opts; m->token != Opt_err; m++) { | 1778 | for (m = ext4_mount_opts; m->token != Opt_err; m++) { |
1779 | int want_set = m->flags & MOPT_SET; | 1779 | int want_set = m->flags & MOPT_SET; |
1780 | if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) || | 1780 | if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) || |
1781 | (m->flags & MOPT_CLEAR_ERR)) | 1781 | (m->flags & MOPT_CLEAR_ERR)) |
1782 | continue; | 1782 | continue; |
1783 | if (!(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt))) | 1783 | if (!(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt))) |
1784 | continue; /* skip if same as the default */ | 1784 | continue; /* skip if same as the default */ |
1785 | if ((want_set && | 1785 | if ((want_set && |
1786 | (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) || | 1786 | (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) || |
1787 | (!want_set && (sbi->s_mount_opt & m->mount_opt))) | 1787 | (!want_set && (sbi->s_mount_opt & m->mount_opt))) |
1788 | continue; /* select Opt_noFoo vs Opt_Foo */ | 1788 | continue; /* select Opt_noFoo vs Opt_Foo */ |
1789 | SEQ_OPTS_PRINT("%s", token2str(m->token)); | 1789 | SEQ_OPTS_PRINT("%s", token2str(m->token)); |
1790 | } | 1790 | } |
1791 | 1791 | ||
1792 | if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) || | 1792 | if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) || |
1793 | le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) | 1793 | le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) |
1794 | SEQ_OPTS_PRINT("resuid=%u", | 1794 | SEQ_OPTS_PRINT("resuid=%u", |
1795 | from_kuid_munged(&init_user_ns, sbi->s_resuid)); | 1795 | from_kuid_munged(&init_user_ns, sbi->s_resuid)); |
1796 | if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) || | 1796 | if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) || |
1797 | le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) | 1797 | le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) |
1798 | SEQ_OPTS_PRINT("resgid=%u", | 1798 | SEQ_OPTS_PRINT("resgid=%u", |
1799 | from_kgid_munged(&init_user_ns, sbi->s_resgid)); | 1799 | from_kgid_munged(&init_user_ns, sbi->s_resgid)); |
1800 | def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors); | 1800 | def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors); |
1801 | if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO) | 1801 | if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO) |
1802 | SEQ_OPTS_PUTS("errors=remount-ro"); | 1802 | SEQ_OPTS_PUTS("errors=remount-ro"); |
1803 | if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE) | 1803 | if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE) |
1804 | SEQ_OPTS_PUTS("errors=continue"); | 1804 | SEQ_OPTS_PUTS("errors=continue"); |
1805 | if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC) | 1805 | if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC) |
1806 | SEQ_OPTS_PUTS("errors=panic"); | 1806 | SEQ_OPTS_PUTS("errors=panic"); |
1807 | if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) | 1807 | if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) |
1808 | SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ); | 1808 | SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ); |
1809 | if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) | 1809 | if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) |
1810 | SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time); | 1810 | SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time); |
1811 | if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) | 1811 | if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) |
1812 | SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time); | 1812 | SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time); |
1813 | if (sb->s_flags & MS_I_VERSION) | 1813 | if (sb->s_flags & MS_I_VERSION) |
1814 | SEQ_OPTS_PUTS("i_version"); | 1814 | SEQ_OPTS_PUTS("i_version"); |
1815 | if (nodefs || sbi->s_stripe) | 1815 | if (nodefs || sbi->s_stripe) |
1816 | SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe); | 1816 | SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe); |
1817 | if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ def_mount_opt)) { | 1817 | if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ def_mount_opt)) { |
1818 | if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) | 1818 | if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) |
1819 | SEQ_OPTS_PUTS("data=journal"); | 1819 | SEQ_OPTS_PUTS("data=journal"); |
1820 | else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) | 1820 | else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) |
1821 | SEQ_OPTS_PUTS("data=ordered"); | 1821 | SEQ_OPTS_PUTS("data=ordered"); |
1822 | else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) | 1822 | else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) |
1823 | SEQ_OPTS_PUTS("data=writeback"); | 1823 | SEQ_OPTS_PUTS("data=writeback"); |
1824 | } | 1824 | } |
1825 | if (nodefs || | 1825 | if (nodefs || |
1826 | sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS) | 1826 | sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS) |
1827 | SEQ_OPTS_PRINT("inode_readahead_blks=%u", | 1827 | SEQ_OPTS_PRINT("inode_readahead_blks=%u", |
1828 | sbi->s_inode_readahead_blks); | 1828 | sbi->s_inode_readahead_blks); |
1829 | 1829 | ||
1830 | if (nodefs || (test_opt(sb, INIT_INODE_TABLE) && | 1830 | if (nodefs || (test_opt(sb, INIT_INODE_TABLE) && |
1831 | (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT))) | 1831 | (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT))) |
1832 | SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); | 1832 | SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); |
1833 | if (nodefs || sbi->s_max_dir_size_kb) | 1833 | if (nodefs || sbi->s_max_dir_size_kb) |
1834 | SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb); | 1834 | SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb); |
1835 | 1835 | ||
1836 | ext4_show_quota_options(seq, sb); | 1836 | ext4_show_quota_options(seq, sb); |
1837 | return 0; | 1837 | return 0; |
1838 | } | 1838 | } |
1839 | 1839 | ||
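The filter in the mount-option loop above relies on a small bit trick: XOR-ing the current option word with the default word sets a 1 exactly where the two disagree, so m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt) is non-zero only for options worth printing. A tiny self-contained illustration (the flag names are invented):

#include <stdio.h>

#define OPT_FOO 0x01    /* invented flags, for illustration only */
#define OPT_BAR 0x02

int main(void)
{
        unsigned int cur = OPT_FOO;             /* current mount options */
        unsigned int def = OPT_FOO | OPT_BAR;   /* per-sb defaults */

        /* cur ^ def has a bit set exactly where the two words disagree,
         * so only OPT_BAR is reported here. */
        if (OPT_FOO & (cur ^ def))
                printf("foo differs from the default\n");
        if (OPT_BAR & (cur ^ def))
                printf("bar differs from the default\n");
        return 0;
}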
1840 | static int ext4_show_options(struct seq_file *seq, struct dentry *root) | 1840 | static int ext4_show_options(struct seq_file *seq, struct dentry *root) |
1841 | { | 1841 | { |
1842 | return _ext4_show_options(seq, root->d_sb, 0); | 1842 | return _ext4_show_options(seq, root->d_sb, 0); |
1843 | } | 1843 | } |
1844 | 1844 | ||
1845 | static int options_seq_show(struct seq_file *seq, void *offset) | 1845 | static int options_seq_show(struct seq_file *seq, void *offset) |
1846 | { | 1846 | { |
1847 | struct super_block *sb = seq->private; | 1847 | struct super_block *sb = seq->private; |
1848 | int rc; | 1848 | int rc; |
1849 | 1849 | ||
1850 | seq_puts(seq, (sb->s_flags & MS_RDONLY) ? "ro" : "rw"); | 1850 | seq_puts(seq, (sb->s_flags & MS_RDONLY) ? "ro" : "rw"); |
1851 | rc = _ext4_show_options(seq, sb, 1); | 1851 | rc = _ext4_show_options(seq, sb, 1); |
1852 | seq_puts(seq, "\n"); | 1852 | seq_puts(seq, "\n"); |
1853 | return rc; | 1853 | return rc; |
1854 | } | 1854 | } |
1855 | 1855 | ||
1856 | static int options_open_fs(struct inode *inode, struct file *file) | 1856 | static int options_open_fs(struct inode *inode, struct file *file) |
1857 | { | 1857 | { |
1858 | return single_open(file, options_seq_show, PDE_DATA(inode)); | 1858 | return single_open(file, options_seq_show, PDE_DATA(inode)); |
1859 | } | 1859 | } |
1860 | 1860 | ||
1861 | static const struct file_operations ext4_seq_options_fops = { | 1861 | static const struct file_operations ext4_seq_options_fops = { |
1862 | .owner = THIS_MODULE, | 1862 | .owner = THIS_MODULE, |
1863 | .open = options_open_fs, | 1863 | .open = options_open_fs, |
1864 | .read = seq_read, | 1864 | .read = seq_read, |
1865 | .llseek = seq_lseek, | 1865 | .llseek = seq_lseek, |
1866 | .release = single_release, | 1866 | .release = single_release, |
1867 | }; | 1867 | }; |
1868 | 1868 | ||
1869 | static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, | 1869 | static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, |
1870 | int read_only) | 1870 | int read_only) |
1871 | { | 1871 | { |
1872 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 1872 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
1873 | int res = 0; | 1873 | int res = 0; |
1874 | 1874 | ||
1875 | if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { | 1875 | if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { |
1876 | ext4_msg(sb, KERN_ERR, "revision level too high, " | 1876 | ext4_msg(sb, KERN_ERR, "revision level too high, " |
1877 | "forcing read-only mode"); | 1877 | "forcing read-only mode"); |
1878 | res = MS_RDONLY; | 1878 | res = MS_RDONLY; |
1879 | } | 1879 | } |
1880 | if (read_only) | 1880 | if (read_only) |
1881 | goto done; | 1881 | goto done; |
1882 | if (!(sbi->s_mount_state & EXT4_VALID_FS)) | 1882 | if (!(sbi->s_mount_state & EXT4_VALID_FS)) |
1883 | ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, " | 1883 | ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, " |
1884 | "running e2fsck is recommended"); | 1884 | "running e2fsck is recommended"); |
1885 | else if (sbi->s_mount_state & EXT4_ERROR_FS) | 1885 | else if (sbi->s_mount_state & EXT4_ERROR_FS) |
1886 | ext4_msg(sb, KERN_WARNING, | 1886 | ext4_msg(sb, KERN_WARNING, |
1887 | "warning: mounting fs with errors, " | 1887 | "warning: mounting fs with errors, " |
1888 | "running e2fsck is recommended"); | 1888 | "running e2fsck is recommended"); |
1889 | else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 && | 1889 | else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 && |
1890 | le16_to_cpu(es->s_mnt_count) >= | 1890 | le16_to_cpu(es->s_mnt_count) >= |
1891 | (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) | 1891 | (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) |
1892 | ext4_msg(sb, KERN_WARNING, | 1892 | ext4_msg(sb, KERN_WARNING, |
1893 | "warning: maximal mount count reached, " | 1893 | "warning: maximal mount count reached, " |
1894 | "running e2fsck is recommended"); | 1894 | "running e2fsck is recommended"); |
1895 | else if (le32_to_cpu(es->s_checkinterval) && | 1895 | else if (le32_to_cpu(es->s_checkinterval) && |
1896 | (le32_to_cpu(es->s_lastcheck) + | 1896 | (le32_to_cpu(es->s_lastcheck) + |
1897 | le32_to_cpu(es->s_checkinterval) <= get_seconds())) | 1897 | le32_to_cpu(es->s_checkinterval) <= get_seconds())) |
1898 | ext4_msg(sb, KERN_WARNING, | 1898 | ext4_msg(sb, KERN_WARNING, |
1899 | "warning: checktime reached, " | 1899 | "warning: checktime reached, " |
1900 | "running e2fsck is recommended"); | 1900 | "running e2fsck is recommended"); |
1901 | if (!sbi->s_journal) | 1901 | if (!sbi->s_journal) |
1902 | es->s_state &= cpu_to_le16(~EXT4_VALID_FS); | 1902 | es->s_state &= cpu_to_le16(~EXT4_VALID_FS); |
1903 | if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) | 1903 | if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) |
1904 | es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); | 1904 | es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); |
1905 | le16_add_cpu(&es->s_mnt_count, 1); | 1905 | le16_add_cpu(&es->s_mnt_count, 1); |
1906 | es->s_mtime = cpu_to_le32(get_seconds()); | 1906 | es->s_mtime = cpu_to_le32(get_seconds()); |
1907 | ext4_update_dynamic_rev(sb); | 1907 | ext4_update_dynamic_rev(sb); |
1908 | if (sbi->s_journal) | 1908 | if (sbi->s_journal) |
1909 | EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); | 1909 | EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); |
1910 | 1910 | ||
1911 | ext4_commit_super(sb, 1); | 1911 | ext4_commit_super(sb, 1); |
1912 | done: | 1912 | done: |
1913 | if (test_opt(sb, DEBUG)) | 1913 | if (test_opt(sb, DEBUG)) |
1914 | printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " | 1914 | printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " |
1915 | "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n", | 1915 | "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n", |
1916 | sb->s_blocksize, | 1916 | sb->s_blocksize, |
1917 | sbi->s_groups_count, | 1917 | sbi->s_groups_count, |
1918 | EXT4_BLOCKS_PER_GROUP(sb), | 1918 | EXT4_BLOCKS_PER_GROUP(sb), |
1919 | EXT4_INODES_PER_GROUP(sb), | 1919 | EXT4_INODES_PER_GROUP(sb), |
1920 | sbi->s_mount_opt, sbi->s_mount_opt2); | 1920 | sbi->s_mount_opt, sbi->s_mount_opt2); |
1921 | 1921 | ||
1922 | cleancache_init_fs(sb); | 1922 | cleancache_init_fs(sb); |
1923 | return res; | 1923 | return res; |
1924 | } | 1924 | } |
1925 | 1925 | ||
1926 | int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) | 1926 | int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) |
1927 | { | 1927 | { |
1928 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 1928 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
1929 | struct flex_groups *new_groups; | 1929 | struct flex_groups *new_groups; |
1930 | int size; | 1930 | int size; |
1931 | 1931 | ||
1932 | if (!sbi->s_log_groups_per_flex) | 1932 | if (!sbi->s_log_groups_per_flex) |
1933 | return 0; | 1933 | return 0; |
1934 | 1934 | ||
1935 | size = ext4_flex_group(sbi, ngroup - 1) + 1; | 1935 | size = ext4_flex_group(sbi, ngroup - 1) + 1; |
1936 | if (size <= sbi->s_flex_groups_allocated) | 1936 | if (size <= sbi->s_flex_groups_allocated) |
1937 | return 0; | 1937 | return 0; |
1938 | 1938 | ||
1939 | size = roundup_pow_of_two(size * sizeof(struct flex_groups)); | 1939 | size = roundup_pow_of_two(size * sizeof(struct flex_groups)); |
1940 | new_groups = ext4_kvzalloc(size, GFP_KERNEL); | 1940 | new_groups = ext4_kvzalloc(size, GFP_KERNEL); |
1941 | if (!new_groups) { | 1941 | if (!new_groups) { |
1942 | ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups", | 1942 | ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups", |
1943 | size / (int) sizeof(struct flex_groups)); | 1943 | size / (int) sizeof(struct flex_groups)); |
1944 | return -ENOMEM; | 1944 | return -ENOMEM; |
1945 | } | 1945 | } |
1946 | 1946 | ||
1947 | if (sbi->s_flex_groups) { | 1947 | if (sbi->s_flex_groups) { |
1948 | memcpy(new_groups, sbi->s_flex_groups, | 1948 | memcpy(new_groups, sbi->s_flex_groups, |
1949 | (sbi->s_flex_groups_allocated * | 1949 | (sbi->s_flex_groups_allocated * |
1950 | sizeof(struct flex_groups))); | 1950 | sizeof(struct flex_groups))); |
1951 | kvfree(sbi->s_flex_groups); | 1951 | kvfree(sbi->s_flex_groups); |
1952 | } | 1952 | } |
1953 | sbi->s_flex_groups = new_groups; | 1953 | sbi->s_flex_groups = new_groups; |
1954 | sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups); | 1954 | sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups); |
1955 | return 0; | 1955 | return 0; |
1956 | } | 1956 | } |
1957 | 1957 | ||
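ext4_alloc_flex_bg_array() grows its array by rounding the required byte count up to a power of two, allocating a zeroed replacement, copying the old contents, and freeing the old buffer, so repeated growth during online resize stays amortized-cheap. A user-space sketch of the same grow-by-powers-of-two pattern, with roundup_pow2() standing in for the kernel's roundup_pow_of_two():

#include <stdlib.h>
#include <string.h>

/* User-space stand-in for the kernel's roundup_pow_of_two(). */
static size_t roundup_pow2(size_t x)
{
        size_t p = 1;

        while (p < x)
                p <<= 1;
        return p;
}

/* Grow *arr (currently *allocated elements of size elem) so it can hold
 * at least want elements, zero-filling the tail -- the same steps the
 * function above applies to s_flex_groups. */
static int grow_array(void **arr, size_t *allocated, size_t elem, size_t want)
{
        size_t bytes;
        void *tmp;

        if (want <= *allocated)
                return 0;
        bytes = roundup_pow2(want * elem);
        tmp = calloc(1, bytes);
        if (!tmp)
                return -1;
        if (*arr) {
                memcpy(tmp, *arr, *allocated * elem);
                free(*arr);
        }
        *arr = tmp;
        *allocated = bytes / elem;
        return 0;
}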
1958 | static int ext4_fill_flex_info(struct super_block *sb) | 1958 | static int ext4_fill_flex_info(struct super_block *sb) |
1959 | { | 1959 | { |
1960 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 1960 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
1961 | struct ext4_group_desc *gdp = NULL; | 1961 | struct ext4_group_desc *gdp = NULL; |
1962 | ext4_group_t flex_group; | 1962 | ext4_group_t flex_group; |
1963 | int i, err; | 1963 | int i, err; |
1964 | 1964 | ||
1965 | sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; | 1965 | sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; |
1966 | if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) { | 1966 | if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) { |
1967 | sbi->s_log_groups_per_flex = 0; | 1967 | sbi->s_log_groups_per_flex = 0; |
1968 | return 1; | 1968 | return 1; |
1969 | } | 1969 | } |
1970 | 1970 | ||
1971 | err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count); | 1971 | err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count); |
1972 | if (err) | 1972 | if (err) |
1973 | goto failed; | 1973 | goto failed; |
1974 | 1974 | ||
1975 | for (i = 0; i < sbi->s_groups_count; i++) { | 1975 | for (i = 0; i < sbi->s_groups_count; i++) { |
1976 | gdp = ext4_get_group_desc(sb, i, NULL); | 1976 | gdp = ext4_get_group_desc(sb, i, NULL); |
1977 | 1977 | ||
1978 | flex_group = ext4_flex_group(sbi, i); | 1978 | flex_group = ext4_flex_group(sbi, i); |
1979 | atomic_add(ext4_free_inodes_count(sb, gdp), | 1979 | atomic_add(ext4_free_inodes_count(sb, gdp), |
1980 | &sbi->s_flex_groups[flex_group].free_inodes); | 1980 | &sbi->s_flex_groups[flex_group].free_inodes); |
1981 | atomic64_add(ext4_free_group_clusters(sb, gdp), | 1981 | atomic64_add(ext4_free_group_clusters(sb, gdp), |
1982 | &sbi->s_flex_groups[flex_group].free_clusters); | 1982 | &sbi->s_flex_groups[flex_group].free_clusters); |
1983 | atomic_add(ext4_used_dirs_count(sb, gdp), | 1983 | atomic_add(ext4_used_dirs_count(sb, gdp), |
1984 | &sbi->s_flex_groups[flex_group].used_dirs); | 1984 | &sbi->s_flex_groups[flex_group].used_dirs); |
1985 | } | 1985 | } |
1986 | 1986 | ||
1987 | return 1; | 1987 | return 1; |
1988 | failed: | 1988 | failed: |
1989 | return 0; | 1989 | return 0; |
1990 | } | 1990 | } |
1991 | 1991 | ||
1992 | static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, | 1992 | static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, |
1993 | struct ext4_group_desc *gdp) | 1993 | struct ext4_group_desc *gdp) |
1994 | { | 1994 | { |
1995 | int offset; | 1995 | int offset; |
1996 | __u16 crc = 0; | 1996 | __u16 crc = 0; |
1997 | __le32 le_group = cpu_to_le32(block_group); | 1997 | __le32 le_group = cpu_to_le32(block_group); |
1998 | 1998 | ||
1999 | if (ext4_has_metadata_csum(sbi->s_sb)) { | 1999 | if (ext4_has_metadata_csum(sbi->s_sb)) { |
2000 | /* Use new metadata_csum algorithm */ | 2000 | /* Use new metadata_csum algorithm */ |
2001 | __le16 save_csum; | 2001 | __le16 save_csum; |
2002 | __u32 csum32; | 2002 | __u32 csum32; |
2003 | 2003 | ||
2004 | save_csum = gdp->bg_checksum; | 2004 | save_csum = gdp->bg_checksum; |
2005 | gdp->bg_checksum = 0; | 2005 | gdp->bg_checksum = 0; |
2006 | csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, | 2006 | csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, |
2007 | sizeof(le_group)); | 2007 | sizeof(le_group)); |
2008 | csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, | 2008 | csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, |
2009 | sbi->s_desc_size); | 2009 | sbi->s_desc_size); |
2010 | gdp->bg_checksum = save_csum; | 2010 | gdp->bg_checksum = save_csum; |
2011 | 2011 | ||
2012 | crc = csum32 & 0xFFFF; | 2012 | crc = csum32 & 0xFFFF; |
2013 | goto out; | 2013 | goto out; |
2014 | } | 2014 | } |
2015 | 2015 | ||
2016 | /* old crc16 code */ | 2016 | /* old crc16 code */ |
2017 | if (!(sbi->s_es->s_feature_ro_compat & | 2017 | if (!(sbi->s_es->s_feature_ro_compat & |
2018 | cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM))) | 2018 | cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM))) |
2019 | return 0; | 2019 | return 0; |
2020 | 2020 | ||
2021 | offset = offsetof(struct ext4_group_desc, bg_checksum); | 2021 | offset = offsetof(struct ext4_group_desc, bg_checksum); |
2022 | 2022 | ||
2023 | crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); | 2023 | crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); |
2024 | crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group)); | 2024 | crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group)); |
2025 | crc = crc16(crc, (__u8 *)gdp, offset); | 2025 | crc = crc16(crc, (__u8 *)gdp, offset); |
2026 | offset += sizeof(gdp->bg_checksum); /* skip checksum */ | 2026 | offset += sizeof(gdp->bg_checksum); /* skip checksum */ |
2027 | 		/* for 64-bit descriptors, checksum the rest of struct ext4_group_desc */ | 2027 | 		/* for 64-bit descriptors, checksum the rest of struct ext4_group_desc */ |
2028 | if ((sbi->s_es->s_feature_incompat & | 2028 | if ((sbi->s_es->s_feature_incompat & |
2029 | cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) && | 2029 | cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) && |
2030 | offset < le16_to_cpu(sbi->s_es->s_desc_size)) | 2030 | offset < le16_to_cpu(sbi->s_es->s_desc_size)) |
2031 | crc = crc16(crc, (__u8 *)gdp + offset, | 2031 | crc = crc16(crc, (__u8 *)gdp + offset, |
2032 | le16_to_cpu(sbi->s_es->s_desc_size) - | 2032 | le16_to_cpu(sbi->s_es->s_desc_size) - |
2033 | offset); | 2033 | offset); |
2034 | 2034 | ||
2035 | out: | 2035 | out: |
2036 | return cpu_to_le16(crc); | 2036 | return cpu_to_le16(crc); |
2037 | } | 2037 | } |
2038 | 2038 | ||
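Both checksum paths above share one idiom: the descriptor's bg_checksum field must not feed into the checksum that will be stored in it, so it is saved, zeroed for the computation, and restored afterwards. A user-space sketch of that idiom, using zlib's crc32() purely as a stand-in for the kernel's ext4_chksum()/crc16() helpers:

#include <stdint.h>
#include <zlib.h>   /* link with -lz; crc32() stands in for ext4_chksum() */

struct record {
        uint32_t id;
        uint16_t checksum;      /* must not feed into its own checksum */
        uint16_t flags;
};

static uint16_t record_csum(struct record *r)
{
        uint16_t save = r->checksum;
        uint32_t crc;

        /* Save, zero, checksum the whole struct, restore -- the same
         * dance done with gdp->bg_checksum above. (A real on-disk
         * format would also pin down struct layout and padding.) */
        r->checksum = 0;
        crc = (uint32_t)crc32(0L, (const unsigned char *)r, sizeof(*r));
        r->checksum = save;
        return (uint16_t)(crc & 0xFFFF);
}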
2039 | int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group, | 2039 | int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group, |
2040 | struct ext4_group_desc *gdp) | 2040 | struct ext4_group_desc *gdp) |
2041 | { | 2041 | { |
2042 | if (ext4_has_group_desc_csum(sb) && | 2042 | if (ext4_has_group_desc_csum(sb) && |
2043 | (gdp->bg_checksum != ext4_group_desc_csum(EXT4_SB(sb), | 2043 | (gdp->bg_checksum != ext4_group_desc_csum(EXT4_SB(sb), |
2044 | block_group, gdp))) | 2044 | block_group, gdp))) |
2045 | return 0; | 2045 | return 0; |
2046 | 2046 | ||
2047 | return 1; | 2047 | return 1; |
2048 | } | 2048 | } |
2049 | 2049 | ||
2050 | void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group, | 2050 | void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group, |
2051 | struct ext4_group_desc *gdp) | 2051 | struct ext4_group_desc *gdp) |
2052 | { | 2052 | { |
2053 | if (!ext4_has_group_desc_csum(sb)) | 2053 | if (!ext4_has_group_desc_csum(sb)) |
2054 | return; | 2054 | return; |
2055 | gdp->bg_checksum = ext4_group_desc_csum(EXT4_SB(sb), block_group, gdp); | 2055 | gdp->bg_checksum = ext4_group_desc_csum(EXT4_SB(sb), block_group, gdp); |
2056 | } | 2056 | } |
2057 | 2057 | ||
2058 | /* Called at mount-time, super-block is locked */ | 2058 | /* Called at mount-time, super-block is locked */ |
2059 | static int ext4_check_descriptors(struct super_block *sb, | 2059 | static int ext4_check_descriptors(struct super_block *sb, |
2060 | ext4_group_t *first_not_zeroed) | 2060 | ext4_group_t *first_not_zeroed) |
2061 | { | 2061 | { |
2062 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 2062 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
2063 | ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block); | 2063 | ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block); |
2064 | ext4_fsblk_t last_block; | 2064 | ext4_fsblk_t last_block; |
2065 | ext4_fsblk_t block_bitmap; | 2065 | ext4_fsblk_t block_bitmap; |
2066 | ext4_fsblk_t inode_bitmap; | 2066 | ext4_fsblk_t inode_bitmap; |
2067 | ext4_fsblk_t inode_table; | 2067 | ext4_fsblk_t inode_table; |
2068 | int flexbg_flag = 0; | 2068 | int flexbg_flag = 0; |
2069 | ext4_group_t i, grp = sbi->s_groups_count; | 2069 | ext4_group_t i, grp = sbi->s_groups_count; |
2070 | 2070 | ||
2071 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) | 2071 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) |
2072 | flexbg_flag = 1; | 2072 | flexbg_flag = 1; |
2073 | 2073 | ||
2074 | ext4_debug("Checking group descriptors"); | 2074 | ext4_debug("Checking group descriptors"); |
2075 | 2075 | ||
2076 | for (i = 0; i < sbi->s_groups_count; i++) { | 2076 | for (i = 0; i < sbi->s_groups_count; i++) { |
2077 | struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); | 2077 | struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); |
2078 | 2078 | ||
2079 | if (i == sbi->s_groups_count - 1 || flexbg_flag) | 2079 | if (i == sbi->s_groups_count - 1 || flexbg_flag) |
2080 | last_block = ext4_blocks_count(sbi->s_es) - 1; | 2080 | last_block = ext4_blocks_count(sbi->s_es) - 1; |
2081 | else | 2081 | else |
2082 | last_block = first_block + | 2082 | last_block = first_block + |
2083 | (EXT4_BLOCKS_PER_GROUP(sb) - 1); | 2083 | (EXT4_BLOCKS_PER_GROUP(sb) - 1); |
2084 | 2084 | ||
2085 | if ((grp == sbi->s_groups_count) && | 2085 | if ((grp == sbi->s_groups_count) && |
2086 | !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) | 2086 | !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) |
2087 | grp = i; | 2087 | grp = i; |
2088 | 2088 | ||
2089 | block_bitmap = ext4_block_bitmap(sb, gdp); | 2089 | block_bitmap = ext4_block_bitmap(sb, gdp); |
2090 | if (block_bitmap < first_block || block_bitmap > last_block) { | 2090 | if (block_bitmap < first_block || block_bitmap > last_block) { |
2091 | ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " | 2091 | ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " |
2092 | "Block bitmap for group %u not in group " | 2092 | "Block bitmap for group %u not in group " |
2093 | "(block %llu)!", i, block_bitmap); | 2093 | "(block %llu)!", i, block_bitmap); |
2094 | return 0; | 2094 | return 0; |
2095 | } | 2095 | } |
2096 | inode_bitmap = ext4_inode_bitmap(sb, gdp); | 2096 | inode_bitmap = ext4_inode_bitmap(sb, gdp); |
2097 | if (inode_bitmap < first_block || inode_bitmap > last_block) { | 2097 | if (inode_bitmap < first_block || inode_bitmap > last_block) { |
2098 | ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " | 2098 | ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " |
2099 | "Inode bitmap for group %u not in group " | 2099 | "Inode bitmap for group %u not in group " |
2100 | "(block %llu)!", i, inode_bitmap); | 2100 | "(block %llu)!", i, inode_bitmap); |
2101 | return 0; | 2101 | return 0; |
2102 | } | 2102 | } |
2103 | inode_table = ext4_inode_table(sb, gdp); | 2103 | inode_table = ext4_inode_table(sb, gdp); |
2104 | if (inode_table < first_block || | 2104 | if (inode_table < first_block || |
2105 | inode_table + sbi->s_itb_per_group - 1 > last_block) { | 2105 | inode_table + sbi->s_itb_per_group - 1 > last_block) { |
2106 | ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " | 2106 | ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " |
2107 | "Inode table for group %u not in group " | 2107 | "Inode table for group %u not in group " |
2108 | "(block %llu)!", i, inode_table); | 2108 | "(block %llu)!", i, inode_table); |
2109 | return 0; | 2109 | return 0; |
2110 | } | 2110 | } |
2111 | ext4_lock_group(sb, i); | 2111 | ext4_lock_group(sb, i); |
2112 | if (!ext4_group_desc_csum_verify(sb, i, gdp)) { | 2112 | if (!ext4_group_desc_csum_verify(sb, i, gdp)) { |
2113 | ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " | 2113 | ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " |
2114 | "Checksum for group %u failed (%u!=%u)", | 2114 | "Checksum for group %u failed (%u!=%u)", |
2115 | i, le16_to_cpu(ext4_group_desc_csum(sbi, i, | 2115 | i, le16_to_cpu(ext4_group_desc_csum(sbi, i, |
2116 | gdp)), le16_to_cpu(gdp->bg_checksum)); | 2116 | gdp)), le16_to_cpu(gdp->bg_checksum)); |
2117 | if (!(sb->s_flags & MS_RDONLY)) { | 2117 | if (!(sb->s_flags & MS_RDONLY)) { |
2118 | ext4_unlock_group(sb, i); | 2118 | ext4_unlock_group(sb, i); |
2119 | return 0; | 2119 | return 0; |
2120 | } | 2120 | } |
2121 | } | 2121 | } |
2122 | ext4_unlock_group(sb, i); | 2122 | ext4_unlock_group(sb, i); |
2123 | if (!flexbg_flag) | 2123 | if (!flexbg_flag) |
2124 | first_block += EXT4_BLOCKS_PER_GROUP(sb); | 2124 | first_block += EXT4_BLOCKS_PER_GROUP(sb); |
2125 | } | 2125 | } |
2126 | if (NULL != first_not_zeroed) | 2126 | if (NULL != first_not_zeroed) |
2127 | *first_not_zeroed = grp; | 2127 | *first_not_zeroed = grp; |
2128 | return 1; | 2128 | return 1; |
2129 | } | 2129 | } |
2130 | 2130 | ||
2131 | /* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at | 2131 | /* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at |
2132 | * the superblock) which were deleted from all directories, but held open by | 2132 | * the superblock) which were deleted from all directories, but held open by |
2133 | * a process at the time of a crash. We walk the list and try to delete these | 2133 | * a process at the time of a crash. We walk the list and try to delete these |
2134 | * inodes at recovery time (only with a read-write filesystem). | 2134 | * inodes at recovery time (only with a read-write filesystem). |
2135 | * | 2135 | * |
2136 | * In order to keep the orphan inode chain consistent during traversal (in | 2136 | * In order to keep the orphan inode chain consistent during traversal (in |
2137 | * case of crash during recovery), we link each inode into the superblock | 2137 | * case of crash during recovery), we link each inode into the superblock |
2138 | * orphan list_head and handle it the same way as an inode deletion during | 2138 | * orphan list_head and handle it the same way as an inode deletion during |
2139 | * normal operation (which journals the operations for us). | 2139 | * normal operation (which journals the operations for us). |
2140 | * | 2140 | * |
2141 | * We only do an iget() and an iput() on each inode, which is very safe if we | 2141 | * We only do an iget() and an iput() on each inode, which is very safe if we |
2142 | * accidentally point at an in-use or already deleted inode. The worst that | 2142 | * accidentally point at an in-use or already deleted inode. The worst that |
2143 | * can happen in this case is that we get a "bit already cleared" message from | 2143 | * can happen in this case is that we get a "bit already cleared" message from |
2144 | * ext4_free_inode(). The only reason we would point at a wrong inode is if | 2144 | * ext4_free_inode(). The only reason we would point at a wrong inode is if |
2145 | * e2fsck was run on this filesystem, and it must have already done the orphan | 2145 | * e2fsck was run on this filesystem, and it must have already done the orphan |
2146 | * inode cleanup for us, so we can safely abort without any further action. | 2146 | * inode cleanup for us, so we can safely abort without any further action. |
2147 | */ | 2147 | */ |
2148 | static void ext4_orphan_cleanup(struct super_block *sb, | 2148 | static void ext4_orphan_cleanup(struct super_block *sb, |
2149 | struct ext4_super_block *es) | 2149 | struct ext4_super_block *es) |
2150 | { | 2150 | { |
2151 | unsigned int s_flags = sb->s_flags; | 2151 | unsigned int s_flags = sb->s_flags; |
2152 | int nr_orphans = 0, nr_truncates = 0; | 2152 | int nr_orphans = 0, nr_truncates = 0; |
2153 | #ifdef CONFIG_QUOTA | 2153 | #ifdef CONFIG_QUOTA |
2154 | int i; | 2154 | int i; |
2155 | #endif | 2155 | #endif |
2156 | if (!es->s_last_orphan) { | 2156 | if (!es->s_last_orphan) { |
2157 | jbd_debug(4, "no orphan inodes to clean up\n"); | 2157 | jbd_debug(4, "no orphan inodes to clean up\n"); |
2158 | return; | 2158 | return; |
2159 | } | 2159 | } |
2160 | 2160 | ||
2161 | if (bdev_read_only(sb->s_bdev)) { | 2161 | if (bdev_read_only(sb->s_bdev)) { |
2162 | ext4_msg(sb, KERN_ERR, "write access " | 2162 | ext4_msg(sb, KERN_ERR, "write access " |
2163 | "unavailable, skipping orphan cleanup"); | 2163 | "unavailable, skipping orphan cleanup"); |
2164 | return; | 2164 | return; |
2165 | } | 2165 | } |
2166 | 2166 | ||
2167 | /* Check if feature set would not allow a r/w mount */ | 2167 | /* Check if feature set would not allow a r/w mount */ |
2168 | if (!ext4_feature_set_ok(sb, 0)) { | 2168 | if (!ext4_feature_set_ok(sb, 0)) { |
2169 | ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to " | 2169 | ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to " |
2170 | "unknown ROCOMPAT features"); | 2170 | "unknown ROCOMPAT features"); |
2171 | return; | 2171 | return; |
2172 | } | 2172 | } |
2173 | 2173 | ||
2174 | if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { | 2174 | if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { |
2175 | /* don't clear list on RO mount w/ errors */ | 2175 | /* don't clear list on RO mount w/ errors */ |
2176 | if (es->s_last_orphan && !(s_flags & MS_RDONLY)) { | 2176 | if (es->s_last_orphan && !(s_flags & MS_RDONLY)) { |
2177 | ext4_msg(sb, KERN_INFO, "Errors on filesystem, " | 2177 | ext4_msg(sb, KERN_INFO, "Errors on filesystem, " |
2178 | "clearing orphan list.\n"); | 2178 | "clearing orphan list.\n"); |
2179 | es->s_last_orphan = 0; | 2179 | es->s_last_orphan = 0; |
2180 | } | 2180 | } |
2181 | jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); | 2181 | jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); |
2182 | return; | 2182 | return; |
2183 | } | 2183 | } |
2184 | 2184 | ||
2185 | if (s_flags & MS_RDONLY) { | 2185 | if (s_flags & MS_RDONLY) { |
2186 | ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); | 2186 | ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); |
2187 | sb->s_flags &= ~MS_RDONLY; | 2187 | sb->s_flags &= ~MS_RDONLY; |
2188 | } | 2188 | } |
2189 | #ifdef CONFIG_QUOTA | 2189 | #ifdef CONFIG_QUOTA |
2190 | /* Needed for iput() to work correctly and not trash data */ | 2190 | /* Needed for iput() to work correctly and not trash data */ |
2191 | sb->s_flags |= MS_ACTIVE; | 2191 | sb->s_flags |= MS_ACTIVE; |
2192 | /* Turn on quotas so that they are updated correctly */ | 2192 | /* Turn on quotas so that they are updated correctly */ |
2193 | for (i = 0; i < EXT4_MAXQUOTAS; i++) { | 2193 | for (i = 0; i < EXT4_MAXQUOTAS; i++) { |
2194 | if (EXT4_SB(sb)->s_qf_names[i]) { | 2194 | if (EXT4_SB(sb)->s_qf_names[i]) { |
2195 | int ret = ext4_quota_on_mount(sb, i); | 2195 | int ret = ext4_quota_on_mount(sb, i); |
2196 | if (ret < 0) | 2196 | if (ret < 0) |
2197 | ext4_msg(sb, KERN_ERR, | 2197 | ext4_msg(sb, KERN_ERR, |
2198 | "Cannot turn on journaled " | 2198 | "Cannot turn on journaled " |
2199 | "quota: error %d", ret); | 2199 | "quota: error %d", ret); |
2200 | } | 2200 | } |
2201 | } | 2201 | } |
2202 | #endif | 2202 | #endif |
2203 | 2203 | ||
2204 | while (es->s_last_orphan) { | 2204 | while (es->s_last_orphan) { |
2205 | struct inode *inode; | 2205 | struct inode *inode; |
2206 | 2206 | ||
2207 | inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)); | 2207 | inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)); |
2208 | if (IS_ERR(inode)) { | 2208 | if (IS_ERR(inode)) { |
2209 | es->s_last_orphan = 0; | 2209 | es->s_last_orphan = 0; |
2210 | break; | 2210 | break; |
2211 | } | 2211 | } |
2212 | 2212 | ||
2213 | list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); | 2213 | list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); |
2214 | dquot_initialize(inode); | 2214 | dquot_initialize(inode); |
2215 | if (inode->i_nlink) { | 2215 | if (inode->i_nlink) { |
2216 | if (test_opt(sb, DEBUG)) | 2216 | if (test_opt(sb, DEBUG)) |
2217 | ext4_msg(sb, KERN_DEBUG, | 2217 | ext4_msg(sb, KERN_DEBUG, |
2218 | "%s: truncating inode %lu to %lld bytes", | 2218 | "%s: truncating inode %lu to %lld bytes", |
2219 | __func__, inode->i_ino, inode->i_size); | 2219 | __func__, inode->i_ino, inode->i_size); |
2220 | jbd_debug(2, "truncating inode %lu to %lld bytes\n", | 2220 | jbd_debug(2, "truncating inode %lu to %lld bytes\n", |
2221 | inode->i_ino, inode->i_size); | 2221 | inode->i_ino, inode->i_size); |
2222 | mutex_lock(&inode->i_mutex); | 2222 | mutex_lock(&inode->i_mutex); |
2223 | truncate_inode_pages(inode->i_mapping, inode->i_size); | 2223 | truncate_inode_pages(inode->i_mapping, inode->i_size); |
2224 | ext4_truncate(inode); | 2224 | ext4_truncate(inode); |
2225 | mutex_unlock(&inode->i_mutex); | 2225 | mutex_unlock(&inode->i_mutex); |
2226 | nr_truncates++; | 2226 | nr_truncates++; |
2227 | } else { | 2227 | } else { |
2228 | if (test_opt(sb, DEBUG)) | 2228 | if (test_opt(sb, DEBUG)) |
2229 | ext4_msg(sb, KERN_DEBUG, | 2229 | ext4_msg(sb, KERN_DEBUG, |
2230 | "%s: deleting unreferenced inode %lu", | 2230 | "%s: deleting unreferenced inode %lu", |
2231 | __func__, inode->i_ino); | 2231 | __func__, inode->i_ino); |
2232 | jbd_debug(2, "deleting unreferenced inode %lu\n", | 2232 | jbd_debug(2, "deleting unreferenced inode %lu\n", |
2233 | inode->i_ino); | 2233 | inode->i_ino); |
2234 | nr_orphans++; | 2234 | nr_orphans++; |
2235 | } | 2235 | } |
2236 | iput(inode); /* The delete magic happens here! */ | 2236 | iput(inode); /* The delete magic happens here! */ |
2237 | } | 2237 | } |
2238 | 2238 | ||
2239 | #define PLURAL(x) (x), ((x) == 1) ? "" : "s" | 2239 | #define PLURAL(x) (x), ((x) == 1) ? "" : "s" |
2240 | 2240 | ||
2241 | if (nr_orphans) | 2241 | if (nr_orphans) |
2242 | ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted", | 2242 | ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted", |
2243 | PLURAL(nr_orphans)); | 2243 | PLURAL(nr_orphans)); |
2244 | if (nr_truncates) | 2244 | if (nr_truncates) |
2245 | ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up", | 2245 | ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up", |
2246 | PLURAL(nr_truncates)); | 2246 | PLURAL(nr_truncates)); |
2247 | #ifdef CONFIG_QUOTA | 2247 | #ifdef CONFIG_QUOTA |
2248 | /* Turn quotas off */ | 2248 | /* Turn quotas off */ |
2249 | for (i = 0; i < EXT4_MAXQUOTAS; i++) { | 2249 | for (i = 0; i < EXT4_MAXQUOTAS; i++) { |
2250 | if (sb_dqopt(sb)->files[i]) | 2250 | if (sb_dqopt(sb)->files[i]) |
2251 | dquot_quota_off(sb, i); | 2251 | dquot_quota_off(sb, i); |
2252 | } | 2252 | } |
2253 | #endif | 2253 | #endif |
2254 | sb->s_flags = s_flags; /* Restore MS_RDONLY status */ | 2254 | sb->s_flags = s_flags; /* Restore MS_RDONLY status */ |
2255 | } | 2255 | } |
2256 | 2256 | ||
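Note that the loop above never follows next-pointers itself: the final iput() is what deletes the inode, removes it from the on-disk chain, and so advances es->s_last_orphan, which is why a crash mid-cleanup leaves a shorter but still consistent list. The same consume-from-the-head shape as a user-space sketch:

#include <stdio.h>
#include <stdlib.h>

struct orphan {
        int ino;
        struct orphan *next;
};

/* Consume strictly from the head: processing a node is what advances
 * the head, so an interruption mid-walk leaves a shorter but still
 * valid list -- the property the kernel loop gets from iput()
 * updating es->s_last_orphan. */
static void cleanup(struct orphan **head)
{
        while (*head) {
                struct orphan *o = *head;

                printf("deleting orphan inode %d\n", o->ino);
                *head = o->next;        /* advance before freeing */
                free(o);
        }
}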
2257 | /* | 2257 | /* |
2258 | * Maximal extent format file size. | 2258 | * Maximal extent format file size. |
2259 | * Resulting logical blkno at s_maxbytes must fit in our on-disk | 2259 | * Resulting logical blkno at s_maxbytes must fit in our on-disk |
2260 | * extent format containers, within a sector_t, and within i_blocks | 2260 | * extent format containers, within a sector_t, and within i_blocks |
2261 | * in the vfs. ext4 inode has 48 bits of i_block in fsblock units, | 2261 | * in the vfs. ext4 inode has 48 bits of i_block in fsblock units, |
2262 | * so that won't be a limiting factor. | 2262 | * so that won't be a limiting factor. |
2263 | * | 2263 | * |
2264 |  * However, there is another limiting factor. We store extents as a | 2264 |  * However, there is another limiting factor. We store extents as a |
2265 |  * starting block plus a length, so the length of the extent covering | 2265 |  * starting block plus a length, so the length of the extent covering |
2266 |  * the maximum file size must also fit into the on-disk format | 2266 |  * the maximum file size must also fit into the on-disk format |
2267 |  * containers. Given that a length is always one unit bigger than the | 2267 |  * containers. Given that a length is always one unit bigger than the |
2268 |  * largest block offset (block 0 counts as well), we lower s_maxbytes by one fs block. | 2268 |  * largest block offset (block 0 counts as well), we lower s_maxbytes by one fs block. |
2269 | * | 2269 | * |
2270 | * Note, this does *not* consider any metadata overhead for vfs i_blocks. | 2270 | * Note, this does *not* consider any metadata overhead for vfs i_blocks. |
2271 | */ | 2271 | */ |
2272 | static loff_t ext4_max_size(int blkbits, int has_huge_files) | 2272 | static loff_t ext4_max_size(int blkbits, int has_huge_files) |
2273 | { | 2273 | { |
2274 | loff_t res; | 2274 | loff_t res; |
2275 | loff_t upper_limit = MAX_LFS_FILESIZE; | 2275 | loff_t upper_limit = MAX_LFS_FILESIZE; |
2276 | 2276 | ||
2277 | /* small i_blocks in vfs inode? */ | 2277 | /* small i_blocks in vfs inode? */ |
2278 | if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { | 2278 | if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { |
2279 | /* | 2279 | /* |
2280 | 		 * !has_huge_files or CONFIG_LBDAF not enabled implies that | 2280 | 		 * !has_huge_files or CONFIG_LBDAF not enabled implies that |
2281 | 		 * the inode i_blocks field counts the file in 512-byte sectors; | 2281 | 		 * the inode i_blocks field counts the file in 512-byte sectors; |
2282 | 		 * 32 == size of the vfs inode i_blocks field * 8 | 2282 | 		 * 32 == size of the vfs inode i_blocks field * 8 |
2283 | */ | 2283 | */ |
2284 | upper_limit = (1LL << 32) - 1; | 2284 | upper_limit = (1LL << 32) - 1; |
2285 | 2285 | ||
2286 | /* total blocks in file system block size */ | 2286 | /* total blocks in file system block size */ |
2287 | upper_limit >>= (blkbits - 9); | 2287 | upper_limit >>= (blkbits - 9); |
2288 | upper_limit <<= blkbits; | 2288 | upper_limit <<= blkbits; |
2289 | } | 2289 | } |
2290 | 2290 | ||
2291 | /* | 2291 | /* |
2292 | * 32-bit extent-start container, ee_block. We lower the maxbytes | 2292 | * 32-bit extent-start container, ee_block. We lower the maxbytes |
2293 | * by one fs block, so ee_len can cover the extent of maximum file | 2293 | * by one fs block, so ee_len can cover the extent of maximum file |
2294 | * size | 2294 | * size |
2295 | */ | 2295 | */ |
2296 | res = (1LL << 32) - 1; | 2296 | res = (1LL << 32) - 1; |
2297 | res <<= blkbits; | 2297 | res <<= blkbits; |
2298 | 2298 | ||
2299 | /* Sanity check against vm- & vfs- imposed limits */ | 2299 | /* Sanity check against vm- & vfs- imposed limits */ |
2300 | if (res > upper_limit) | 2300 | if (res > upper_limit) |
2301 | res = upper_limit; | 2301 | res = upper_limit; |
2302 | 2302 | ||
2303 | return res; | 2303 | return res; |
2304 | } | 2304 | } |
2305 | 2305 | ||
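As a worked example (not part of the source): with 4 KiB blocks, blkbits is 12, so res = (2^32 - 1) << 12 = 2^44 - 2^12 bytes, just under 16 TiB. With has_huge_files and a 64-bit blkcnt_t, upper_limit remains MAX_LFS_FILESIZE and does not clamp the result, so the 32-bit extent-start container is the binding limit.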
2306 | /* | 2306 | /* |
2307 | * Maximal bitmap file size. There is a direct, and {,double-,triple-}indirect | 2307 | * Maximal bitmap file size. There is a direct, and {,double-,triple-}indirect |
2308 | * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks. | 2308 | * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks. |
2309 | * We need to be 1 filesystem block less than the 2^48 sector limit. | 2309 | * We need to be 1 filesystem block less than the 2^48 sector limit. |
2310 | */ | 2310 | */ |
2311 | static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) | 2311 | static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) |
2312 | { | 2312 | { |
2313 | loff_t res = EXT4_NDIR_BLOCKS; | 2313 | loff_t res = EXT4_NDIR_BLOCKS; |
2314 | int meta_blocks; | 2314 | int meta_blocks; |
2315 | loff_t upper_limit; | 2315 | loff_t upper_limit; |
2316 | /* This is calculated to be the largest file size for a dense, block | 2316 | /* This is calculated to be the largest file size for a dense, block |
2317 | * mapped file such that the file's total number of 512-byte sectors, | 2317 | * mapped file such that the file's total number of 512-byte sectors, |
2318 | * including data and all indirect blocks, does not exceed (2^48 - 1). | 2318 | * including data and all indirect blocks, does not exceed (2^48 - 1). |
2319 | * | 2319 | * |
2320 |  * __u32 i_blocks_lo and __u16 i_blocks_high represent the total | 2320 |  * __u32 i_blocks_lo and __u16 i_blocks_high represent the total |
2321 | * number of 512-byte sectors of the file. | 2321 | * number of 512-byte sectors of the file. |
2322 | */ | 2322 | */ |
2323 | 2323 | ||
2324 | if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { | 2324 | if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { |
2325 | /* | 2325 | /* |
2326 | * !has_huge_files or CONFIG_LBDAF not enabled implies that | 2326 | * !has_huge_files or CONFIG_LBDAF not enabled implies that |
2327 | * the inode i_block field represents total file blocks in | 2327 | * the inode i_block field represents total file blocks in |
2328 | * 2^32 512-byte sectors == size of vfs inode i_blocks * 8 | 2328 | * 2^32 512-byte sectors == size of vfs inode i_blocks * 8 |
2329 | */ | 2329 | */ |
2330 | upper_limit = (1LL << 32) - 1; | 2330 | upper_limit = (1LL << 32) - 1; |
2331 | 2331 | ||
2332 | /* total blocks in file system block size */ | 2332 | /* total blocks in file system block size */ |
2333 | upper_limit >>= (bits - 9); | 2333 | upper_limit >>= (bits - 9); |
2334 | 2334 | ||
2335 | } else { | 2335 | } else { |
2336 | /* | 2336 | /* |
2337 | * We use 48 bit ext4_inode i_blocks | 2337 | * We use 48 bit ext4_inode i_blocks |
2338 | * With EXT4_HUGE_FILE_FL set the i_blocks | 2338 | * With EXT4_HUGE_FILE_FL set the i_blocks |
2339 | * represent total number of blocks in | 2339 | * represent total number of blocks in |
2340 | * file system block size | 2340 | * file system block size |
2341 | */ | 2341 | */ |
2342 | upper_limit = (1LL << 48) - 1; | 2342 | upper_limit = (1LL << 48) - 1; |
2343 | 2343 | ||
2344 | } | 2344 | } |
2345 | 2345 | ||
2346 | /* indirect blocks */ | 2346 | /* indirect blocks */ |
2347 | meta_blocks = 1; | 2347 | meta_blocks = 1; |
2348 | /* double indirect blocks */ | 2348 | /* double indirect blocks */ |
2349 | meta_blocks += 1 + (1LL << (bits-2)); | 2349 | meta_blocks += 1 + (1LL << (bits-2)); |
2350 | 	/* triple indirect blocks */ | 2350 | 	/* triple indirect blocks */ |
2351 | meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2))); | 2351 | meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2))); |
2352 | 2352 | ||
2353 | upper_limit -= meta_blocks; | 2353 | upper_limit -= meta_blocks; |
2354 | upper_limit <<= bits; | 2354 | upper_limit <<= bits; |
2355 | 2355 | ||
2356 | res += 1LL << (bits-2); | 2356 | res += 1LL << (bits-2); |
2357 | res += 1LL << (2*(bits-2)); | 2357 | res += 1LL << (2*(bits-2)); |
2358 | res += 1LL << (3*(bits-2)); | 2358 | res += 1LL << (3*(bits-2)); |
2359 | res <<= bits; | 2359 | res <<= bits; |
2360 | if (res > upper_limit) | 2360 | if (res > upper_limit) |
2361 | res = upper_limit; | 2361 | res = upper_limit; |
2362 | 2362 | ||
2363 | if (res > MAX_LFS_FILESIZE) | 2363 | if (res > MAX_LFS_FILESIZE) |
2364 | res = MAX_LFS_FILESIZE; | 2364 | res = MAX_LFS_FILESIZE; |
2365 | 2365 | ||
2366 | return res; | 2366 | return res; |
2367 | } | 2367 | } |
2368 | 2368 | ||
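A worked example (again not from the source): for bits = 12 with huge files enabled, res = 12 + 2^10 + 2^20 + 2^30 blocks, roughly 2^30 blocks, which shifted left by 12 comes to about 4 TiB; upper_limit = ((2^48 - 1) - meta_blocks) << 12 is on the order of 2^60 bytes, far larger, so for 4 KiB blocks the indirect-map geometry rather than i_blocks is what limits a bitmap-mapped file.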
2369 | static ext4_fsblk_t descriptor_loc(struct super_block *sb, | 2369 | static ext4_fsblk_t descriptor_loc(struct super_block *sb, |
2370 | ext4_fsblk_t logical_sb_block, int nr) | 2370 | ext4_fsblk_t logical_sb_block, int nr) |
2371 | { | 2371 | { |
2372 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 2372 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
2373 | ext4_group_t bg, first_meta_bg; | 2373 | ext4_group_t bg, first_meta_bg; |
2374 | int has_super = 0; | 2374 | int has_super = 0; |
2375 | 2375 | ||
2376 | first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); | 2376 | first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); |
2377 | 2377 | ||
2378 | if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) || | 2378 | if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) || |
2379 | nr < first_meta_bg) | 2379 | nr < first_meta_bg) |
2380 | return logical_sb_block + nr + 1; | 2380 | return logical_sb_block + nr + 1; |
2381 | bg = sbi->s_desc_per_block * nr; | 2381 | bg = sbi->s_desc_per_block * nr; |
2382 | if (ext4_bg_has_super(sb, bg)) | 2382 | if (ext4_bg_has_super(sb, bg)) |
2383 | has_super = 1; | 2383 | has_super = 1; |
2384 | 2384 | ||
2385 | /* | 2385 | /* |
2386 | * If we have a meta_bg fs with 1k blocks, group 0's GDT is at | 2386 | * If we have a meta_bg fs with 1k blocks, group 0's GDT is at |
2387 | * block 2, not 1. If s_first_data_block == 0 (bigalloc is enabled | 2387 | * block 2, not 1. If s_first_data_block == 0 (bigalloc is enabled |
2388 | * on modern mke2fs or blksize > 1k on older mke2fs) then we must | 2388 | * on modern mke2fs or blksize > 1k on older mke2fs) then we must |
2389 | * compensate. | 2389 | * compensate. |
2390 | */ | 2390 | */ |
2391 | if (sb->s_blocksize == 1024 && nr == 0 && | 2391 | if (sb->s_blocksize == 1024 && nr == 0 && |
2392 | le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) == 0) | 2392 | le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) == 0) |
2393 | has_super++; | 2393 | has_super++; |
2394 | 2394 | ||
2395 | return (has_super + ext4_group_first_block_no(sb, bg)); | 2395 | return (has_super + ext4_group_first_block_no(sb, bg)); |
2396 | } | 2396 | } |
2397 | 2397 | ||
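Concretely (a hedged example, not from the source): without META_BG, or for nr below s_first_meta_bg, backup descriptor block nr sits right behind the backup superblock, at logical_sb_block + nr + 1. With META_BG, 4 KiB blocks and 64-byte descriptors would give s_desc_per_block = 64, so copy nr lives in group 64 * nr, at that group's first block plus one when the group carries a superblock backup.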
2398 | /** | 2398 | /** |
2399 | * ext4_get_stripe_size: Get the stripe size. | 2399 | * ext4_get_stripe_size: Get the stripe size. |
2400 | * @sbi: In memory super block info | 2400 | * @sbi: In memory super block info |
2401 | * | 2401 | * |
2402 |  * If the stripe size was specified via a mount option, then | 2402 |  * If the stripe size was specified via a mount option, then |
2403 |  * use the mount option value. If the value specified at mount time is | 2403 |  * use the mount option value. If the value specified at mount time is |
2404 |  * greater than the blocks per group, use the super block value. | 2404 |  * greater than the blocks per group, use the super block value. |
2405 |  * If the super block value is also greater than the blocks per group, return 0. | 2405 |  * If the super block value is also greater than the blocks per group, return 0. |
2406 |  * The allocator needs it to be less than the blocks per group. | 2406 |  * The allocator needs it to be less than the blocks per group. |
2407 | * | 2407 | * |
2408 | */ | 2408 | */ |
2409 | static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) | 2409 | static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) |
2410 | { | 2410 | { |
2411 | unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride); | 2411 | unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride); |
2412 | unsigned long stripe_width = | 2412 | unsigned long stripe_width = |
2413 | le32_to_cpu(sbi->s_es->s_raid_stripe_width); | 2413 | le32_to_cpu(sbi->s_es->s_raid_stripe_width); |
2414 | int ret; | 2414 | int ret; |
2415 | 2415 | ||
2416 | if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) | 2416 | if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) |
2417 | ret = sbi->s_stripe; | 2417 | ret = sbi->s_stripe; |
2418 | else if (stripe_width <= sbi->s_blocks_per_group) | 2418 | else if (stripe_width <= sbi->s_blocks_per_group) |
2419 | ret = stripe_width; | 2419 | ret = stripe_width; |
2420 | else if (stride <= sbi->s_blocks_per_group) | 2420 | else if (stride <= sbi->s_blocks_per_group) |
2421 | ret = stride; | 2421 | ret = stride; |
2422 | else | 2422 | else |
2423 | ret = 0; | 2423 | ret = 0; |
2424 | 2424 | ||
2425 | /* | 2425 | /* |
2426 | * If the stripe width is 1, this makes no sense and | 2426 | * If the stripe width is 1, this makes no sense and |
2427 | * we set it to 0 to turn off stripe handling code. | 2427 | * we set it to 0 to turn off stripe handling code. |
2428 | */ | 2428 | */ |
2429 | if (ret <= 1) | 2429 | if (ret <= 1) |
2430 | ret = 0; | 2430 | ret = 0; |
2431 | 2431 | ||
2432 | return ret; | 2432 | return ret; |
2433 | } | 2433 | } |
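For context, the two superblock fields consulted above are normally populated at mkfs time, and the fall-through order in the code gives the mount option priority:

    /*
     * Precedence, as implemented above (field sources assumed):
     *   1. sbi->s_stripe            mount -o stripe=<n>
     *   2. s_raid_stripe_width      mke2fs -E stripe-width=<n>
     *   3. s_raid_stride            mke2fs -E stride=<n>
     *   4. 0                        striping disabled
     * A result of 1 is also collapsed to 0, since a one-block stripe
     * would make the striped-allocation heuristics pointless.
     */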
2434 | 2434 | ||
2435 | /* sysfs support */ | 2435 | /* sysfs support */ |
2436 | 2436 | ||
2437 | struct ext4_attr { | 2437 | struct ext4_attr { |
2438 | struct attribute attr; | 2438 | struct attribute attr; |
2439 | ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); | 2439 | ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); |
2440 | ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, | 2440 | ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, |
2441 | const char *, size_t); | 2441 | const char *, size_t); |
2442 | union { | 2442 | union { |
2443 | int offset; | 2443 | int offset; |
2444 | int deprecated_val; | 2444 | int deprecated_val; |
2445 | } u; | 2445 | } u; |
2446 | }; | 2446 | }; |
2447 | 2447 | ||
2448 | static int parse_strtoull(const char *buf, | 2448 | static int parse_strtoull(const char *buf, |
2449 | unsigned long long max, unsigned long long *value) | 2449 | unsigned long long max, unsigned long long *value) |
2450 | { | 2450 | { |
2451 | int ret; | 2451 | int ret; |
2452 | 2452 | ||
2453 | ret = kstrtoull(skip_spaces(buf), 0, value); | 2453 | ret = kstrtoull(skip_spaces(buf), 0, value); |
2454 | if (!ret && *value > max) | 2454 | if (!ret && *value > max) |
2455 | ret = -EINVAL; | 2455 | ret = -EINVAL; |
2456 | return ret; | 2456 | return ret; |
2457 | } | 2457 | } |
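One detail worth noting: kstrtoull() is called with base 0, so it auto-detects the radix of the written string. An illustrative comment (the reserved_clusters attribute defined below is the actual caller of parse_strtoull()):

    /*
     * All three writes store the same value (256):
     *   echo 256   > /sys/fs/ext4/<disk>/reserved_clusters
     *   echo 0x100 > /sys/fs/ext4/<disk>/reserved_clusters
     *   echo 0400  > /sys/fs/ext4/<disk>/reserved_clusters
     */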
2458 | 2458 | ||
2459 | static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a, | 2459 | static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a, |
2460 | struct ext4_sb_info *sbi, | 2460 | struct ext4_sb_info *sbi, |
2461 | char *buf) | 2461 | char *buf) |
2462 | { | 2462 | { |
2463 | return snprintf(buf, PAGE_SIZE, "%llu\n", | 2463 | return snprintf(buf, PAGE_SIZE, "%llu\n", |
2464 | (s64) EXT4_C2B(sbi, | 2464 | (s64) EXT4_C2B(sbi, |
2465 | percpu_counter_sum(&sbi->s_dirtyclusters_counter))); | 2465 | percpu_counter_sum(&sbi->s_dirtyclusters_counter))); |
2466 | } | 2466 | } |
2467 | 2467 | ||
2468 | static ssize_t session_write_kbytes_show(struct ext4_attr *a, | 2468 | static ssize_t session_write_kbytes_show(struct ext4_attr *a, |
2469 | struct ext4_sb_info *sbi, char *buf) | 2469 | struct ext4_sb_info *sbi, char *buf) |
2470 | { | 2470 | { |
2471 | struct super_block *sb = sbi->s_buddy_cache->i_sb; | 2471 | struct super_block *sb = sbi->s_buddy_cache->i_sb; |
2472 | 2472 | ||
2473 | if (!sb->s_bdev->bd_part) | 2473 | if (!sb->s_bdev->bd_part) |
2474 | return snprintf(buf, PAGE_SIZE, "0\n"); | 2474 | return snprintf(buf, PAGE_SIZE, "0\n"); |
2475 | return snprintf(buf, PAGE_SIZE, "%lu\n", | 2475 | return snprintf(buf, PAGE_SIZE, "%lu\n", |
2476 | (part_stat_read(sb->s_bdev->bd_part, sectors[1]) - | 2476 | (part_stat_read(sb->s_bdev->bd_part, sectors[1]) - |
2477 | sbi->s_sectors_written_start) >> 1); | 2477 | sbi->s_sectors_written_start) >> 1); |
2478 | } | 2478 | } |
2479 | 2479 | ||
2480 | static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a, | 2480 | static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a, |
2481 | struct ext4_sb_info *sbi, char *buf) | 2481 | struct ext4_sb_info *sbi, char *buf) |
2482 | { | 2482 | { |
2483 | struct super_block *sb = sbi->s_buddy_cache->i_sb; | 2483 | struct super_block *sb = sbi->s_buddy_cache->i_sb; |
2484 | 2484 | ||
2485 | if (!sb->s_bdev->bd_part) | 2485 | if (!sb->s_bdev->bd_part) |
2486 | return snprintf(buf, PAGE_SIZE, "0\n"); | 2486 | return snprintf(buf, PAGE_SIZE, "0\n"); |
2487 | return snprintf(buf, PAGE_SIZE, "%llu\n", | 2487 | return snprintf(buf, PAGE_SIZE, "%llu\n", |
2488 | (unsigned long long)(sbi->s_kbytes_written + | 2488 | (unsigned long long)(sbi->s_kbytes_written + |
2489 | ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - | 2489 | ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - |
2490 | EXT4_SB(sb)->s_sectors_written_start) >> 1))); | 2490 | EXT4_SB(sb)->s_sectors_written_start) >> 1))); |
2491 | } | 2491 | } |
2492 | 2492 | ||
2493 | static ssize_t inode_readahead_blks_store(struct ext4_attr *a, | 2493 | static ssize_t inode_readahead_blks_store(struct ext4_attr *a, |
2494 | struct ext4_sb_info *sbi, | 2494 | struct ext4_sb_info *sbi, |
2495 | const char *buf, size_t count) | 2495 | const char *buf, size_t count) |
2496 | { | 2496 | { |
2497 | unsigned long t; | 2497 | unsigned long t; |
2498 | int ret; | 2498 | int ret; |
2499 | 2499 | ||
2500 | ret = kstrtoul(skip_spaces(buf), 0, &t); | 2500 | ret = kstrtoul(skip_spaces(buf), 0, &t); |
2501 | if (ret) | 2501 | if (ret) |
2502 | return ret; | 2502 | return ret; |
2503 | 2503 | ||
2504 | if (t && (!is_power_of_2(t) || t > 0x40000000)) | 2504 | if (t && (!is_power_of_2(t) || t > 0x40000000)) |
2505 | return -EINVAL; | 2505 | return -EINVAL; |
2506 | 2506 | ||
2507 | sbi->s_inode_readahead_blks = t; | 2507 | sbi->s_inode_readahead_blks = t; |
2508 | return count; | 2508 | return count; |
2509 | } | 2509 | } |
2510 | 2510 | ||
2511 | static ssize_t sbi_ui_show(struct ext4_attr *a, | 2511 | static ssize_t sbi_ui_show(struct ext4_attr *a, |
2512 | struct ext4_sb_info *sbi, char *buf) | 2512 | struct ext4_sb_info *sbi, char *buf) |
2513 | { | 2513 | { |
2514 | unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset); | 2514 | unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset); |
2515 | 2515 | ||
2516 | return snprintf(buf, PAGE_SIZE, "%u\n", *ui); | 2516 | return snprintf(buf, PAGE_SIZE, "%u\n", *ui); |
2517 | } | 2517 | } |
2518 | 2518 | ||
2519 | static ssize_t sbi_ui_store(struct ext4_attr *a, | 2519 | static ssize_t sbi_ui_store(struct ext4_attr *a, |
2520 | struct ext4_sb_info *sbi, | 2520 | struct ext4_sb_info *sbi, |
2521 | const char *buf, size_t count) | 2521 | const char *buf, size_t count) |
2522 | { | 2522 | { |
2523 | unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset); | 2523 | unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset); |
2524 | unsigned long t; | 2524 | unsigned long t; |
2525 | int ret; | 2525 | int ret; |
2526 | 2526 | ||
2527 | ret = kstrtoul(skip_spaces(buf), 0, &t); | 2527 | ret = kstrtoul(skip_spaces(buf), 0, &t); |
2528 | if (ret) | 2528 | if (ret) |
2529 | return ret; | 2529 | return ret; |
2530 | *ui = t; | 2530 | *ui = t; |
2531 | return count; | 2531 | return count; |
2532 | } | 2532 | } |
2533 | 2533 | ||
2534 | static ssize_t es_ui_show(struct ext4_attr *a, | 2534 | static ssize_t es_ui_show(struct ext4_attr *a, |
2535 | struct ext4_sb_info *sbi, char *buf) | 2535 | struct ext4_sb_info *sbi, char *buf) |
2536 | { | 2536 | { |
2537 | 2537 | ||
2538 | unsigned int *ui = (unsigned int *) (((char *) sbi->s_es) + | 2538 | unsigned int *ui = (unsigned int *) (((char *) sbi->s_es) + |
2539 | a->u.offset); | 2539 | a->u.offset); |
2540 | 2540 | ||
2541 | return snprintf(buf, PAGE_SIZE, "%u\n", *ui); | 2541 | return snprintf(buf, PAGE_SIZE, "%u\n", *ui); |
2542 | } | 2542 | } |
2543 | 2543 | ||
2544 | static ssize_t reserved_clusters_show(struct ext4_attr *a, | 2544 | static ssize_t reserved_clusters_show(struct ext4_attr *a, |
2545 | struct ext4_sb_info *sbi, char *buf) | 2545 | struct ext4_sb_info *sbi, char *buf) |
2546 | { | 2546 | { |
2547 | return snprintf(buf, PAGE_SIZE, "%llu\n", | 2547 | return snprintf(buf, PAGE_SIZE, "%llu\n", |
2548 | (unsigned long long) atomic64_read(&sbi->s_resv_clusters)); | 2548 | (unsigned long long) atomic64_read(&sbi->s_resv_clusters)); |
2549 | } | 2549 | } |
2550 | 2550 | ||
2551 | static ssize_t reserved_clusters_store(struct ext4_attr *a, | 2551 | static ssize_t reserved_clusters_store(struct ext4_attr *a, |
2552 | struct ext4_sb_info *sbi, | 2552 | struct ext4_sb_info *sbi, |
2553 | const char *buf, size_t count) | 2553 | const char *buf, size_t count) |
2554 | { | 2554 | { |
2555 | unsigned long long val; | 2555 | unsigned long long val; |
2556 | int ret; | 2556 | int ret; |
2557 | 2557 | ||
2558 | if (parse_strtoull(buf, -1ULL, &val)) | 2558 | if (parse_strtoull(buf, -1ULL, &val)) |
2559 | return -EINVAL; | 2559 | return -EINVAL; |
2560 | ret = ext4_reserve_clusters(sbi, val); | 2560 | ret = ext4_reserve_clusters(sbi, val); |
2561 | 2561 | ||
2562 | return ret ? ret : count; | 2562 | return ret ? ret : count; |
2563 | } | 2563 | } |
2564 | 2564 | ||
2565 | static ssize_t trigger_test_error(struct ext4_attr *a, | 2565 | static ssize_t trigger_test_error(struct ext4_attr *a, |
2566 | struct ext4_sb_info *sbi, | 2566 | struct ext4_sb_info *sbi, |
2567 | const char *buf, size_t count) | 2567 | const char *buf, size_t count) |
2568 | { | 2568 | { |
2569 | int len = count; | 2569 | int len = count; |
2570 | 2570 | ||
2571 | if (!capable(CAP_SYS_ADMIN)) | 2571 | if (!capable(CAP_SYS_ADMIN)) |
2572 | return -EPERM; | 2572 | return -EPERM; |
2573 | 2573 | ||
2574 | if (len && buf[len-1] == '\n') | 2574 | if (len && buf[len-1] == '\n') |
2575 | len--; | 2575 | len--; |
2576 | 2576 | ||
2577 | if (len) | 2577 | if (len) |
2578 | ext4_error(sbi->s_sb, "%.*s", len, buf); | 2578 | ext4_error(sbi->s_sb, "%.*s", len, buf); |
2579 | return count; | 2579 | return count; |
2580 | } | 2580 | } |
2581 | 2581 | ||
2582 | static ssize_t sbi_deprecated_show(struct ext4_attr *a, | 2582 | static ssize_t sbi_deprecated_show(struct ext4_attr *a, |
2583 | struct ext4_sb_info *sbi, char *buf) | 2583 | struct ext4_sb_info *sbi, char *buf) |
2584 | { | 2584 | { |
2585 | return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val); | 2585 | return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val); |
2586 | } | 2586 | } |
2587 | 2587 | ||
2588 | #define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \ | 2588 | #define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \ |
2589 | static struct ext4_attr ext4_attr_##_name = { \ | 2589 | static struct ext4_attr ext4_attr_##_name = { \ |
2590 | .attr = {.name = __stringify(_name), .mode = _mode }, \ | 2590 | .attr = {.name = __stringify(_name), .mode = _mode }, \ |
2591 | .show = _show, \ | 2591 | .show = _show, \ |
2592 | .store = _store, \ | 2592 | .store = _store, \ |
2593 | .u = { \ | 2593 | .u = { \ |
2594 | .offset = offsetof(struct ext4_sb_info, _elname),\ | 2594 | .offset = offsetof(struct ext4_sb_info, _elname),\ |
2595 | }, \ | 2595 | }, \ |
2596 | } | 2596 | } |
2597 | 2597 | ||
2598 | #define EXT4_ATTR_OFFSET_ES(_name,_mode,_show,_store,_elname) \ | 2598 | #define EXT4_ATTR_OFFSET_ES(_name,_mode,_show,_store,_elname) \ |
2599 | static struct ext4_attr ext4_attr_##_name = { \ | 2599 | static struct ext4_attr ext4_attr_##_name = { \ |
2600 | .attr = {.name = __stringify(_name), .mode = _mode }, \ | 2600 | .attr = {.name = __stringify(_name), .mode = _mode }, \ |
2601 | .show = _show, \ | 2601 | .show = _show, \ |
2602 | .store = _store, \ | 2602 | .store = _store, \ |
2603 | .u = { \ | 2603 | .u = { \ |
2604 | .offset = offsetof(struct ext4_super_block, _elname), \ | 2604 | .offset = offsetof(struct ext4_super_block, _elname), \ |
2605 | }, \ | 2605 | }, \ |
2606 | } | 2606 | } |
2607 | 2607 | ||
2608 | #define EXT4_ATTR(name, mode, show, store) \ | 2608 | #define EXT4_ATTR(name, mode, show, store) \ |
2609 | static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) | 2609 | static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) |
2610 | 2610 | ||
2611 | #define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL) | 2611 | #define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL) |
2612 | #define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL) | 2612 | #define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL) |
2613 | #define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store) | 2613 | #define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store) |
2614 | 2614 | ||
2615 | #define EXT4_RO_ATTR_ES_UI(name, elname) \ | 2615 | #define EXT4_RO_ATTR_ES_UI(name, elname) \ |
2616 | EXT4_ATTR_OFFSET_ES(name, 0444, es_ui_show, NULL, elname) | 2616 | EXT4_ATTR_OFFSET_ES(name, 0444, es_ui_show, NULL, elname) |
2617 | #define EXT4_RW_ATTR_SBI_UI(name, elname) \ | 2617 | #define EXT4_RW_ATTR_SBI_UI(name, elname) \ |
2618 | EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname) | 2618 | EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname) |
2619 | 2619 | ||
2620 | #define ATTR_LIST(name) &ext4_attr_##name.attr | 2620 | #define ATTR_LIST(name) &ext4_attr_##name.attr |
2621 | #define EXT4_DEPRECATED_ATTR(_name, _val) \ | 2621 | #define EXT4_DEPRECATED_ATTR(_name, _val) \ |
2622 | static struct ext4_attr ext4_attr_##_name = { \ | 2622 | static struct ext4_attr ext4_attr_##_name = { \ |
2623 | .attr = {.name = __stringify(_name), .mode = 0444 }, \ | 2623 | .attr = {.name = __stringify(_name), .mode = 0444 }, \ |
2624 | .show = sbi_deprecated_show, \ | 2624 | .show = sbi_deprecated_show, \ |
2625 | .u = { \ | 2625 | .u = { \ |
2626 | .deprecated_val = _val, \ | 2626 | .deprecated_val = _val, \ |
2627 | }, \ | 2627 | }, \ |
2628 | } | 2628 | } |
2629 | 2629 | ||
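Since the attribute tables below are built entirely from these macros, a hand-expanded example may help; EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal) from the list that follows expands to roughly:

    static struct ext4_attr ext4_attr_inode_goal = {
            .attr  = { .name = "inode_goal", .mode = 0644 },
            .show  = sbi_ui_show,
            .store = sbi_ui_store,
            .u     = { .offset = offsetof(struct ext4_sb_info, s_inode_goal) },
    };

so sbi_ui_show/sbi_ui_store can locate the unsigned int inside struct ext4_sb_info purely from the stored offset.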
2630 | EXT4_RO_ATTR(delayed_allocation_blocks); | 2630 | EXT4_RO_ATTR(delayed_allocation_blocks); |
2631 | EXT4_RO_ATTR(session_write_kbytes); | 2631 | EXT4_RO_ATTR(session_write_kbytes); |
2632 | EXT4_RO_ATTR(lifetime_write_kbytes); | 2632 | EXT4_RO_ATTR(lifetime_write_kbytes); |
2633 | EXT4_RW_ATTR(reserved_clusters); | 2633 | EXT4_RW_ATTR(reserved_clusters); |
2634 | EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, | 2634 | EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, |
2635 | inode_readahead_blks_store, s_inode_readahead_blks); | 2635 | inode_readahead_blks_store, s_inode_readahead_blks); |
2636 | EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); | 2636 | EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); |
2637 | EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); | 2637 | EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); |
2638 | EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); | 2638 | EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); |
2639 | EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); | 2639 | EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); |
2640 | EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); | 2640 | EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); |
2641 | EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); | 2641 | EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); |
2642 | EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); | 2642 | EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); |
2643 | EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128); | 2643 | EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128); |
2644 | EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); | 2644 | EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); |
2645 | EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error); | 2645 | EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error); |
2646 | EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval); | 2646 | EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval); |
2647 | EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst); | 2647 | EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst); |
2648 | EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval); | 2648 | EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval); |
2649 | EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); | 2649 | EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); |
2650 | EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); | 2650 | EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); |
2651 | EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); | 2651 | EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); |
2652 | EXT4_RO_ATTR_ES_UI(errors_count, s_error_count); | 2652 | EXT4_RO_ATTR_ES_UI(errors_count, s_error_count); |
2653 | EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time); | 2653 | EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time); |
2654 | EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time); | 2654 | EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time); |
2655 | 2655 | ||
2656 | static struct attribute *ext4_attrs[] = { | 2656 | static struct attribute *ext4_attrs[] = { |
2657 | ATTR_LIST(delayed_allocation_blocks), | 2657 | ATTR_LIST(delayed_allocation_blocks), |
2658 | ATTR_LIST(session_write_kbytes), | 2658 | ATTR_LIST(session_write_kbytes), |
2659 | ATTR_LIST(lifetime_write_kbytes), | 2659 | ATTR_LIST(lifetime_write_kbytes), |
2660 | ATTR_LIST(reserved_clusters), | 2660 | ATTR_LIST(reserved_clusters), |
2661 | ATTR_LIST(inode_readahead_blks), | 2661 | ATTR_LIST(inode_readahead_blks), |
2662 | ATTR_LIST(inode_goal), | 2662 | ATTR_LIST(inode_goal), |
2663 | ATTR_LIST(mb_stats), | 2663 | ATTR_LIST(mb_stats), |
2664 | ATTR_LIST(mb_max_to_scan), | 2664 | ATTR_LIST(mb_max_to_scan), |
2665 | ATTR_LIST(mb_min_to_scan), | 2665 | ATTR_LIST(mb_min_to_scan), |
2666 | ATTR_LIST(mb_order2_req), | 2666 | ATTR_LIST(mb_order2_req), |
2667 | ATTR_LIST(mb_stream_req), | 2667 | ATTR_LIST(mb_stream_req), |
2668 | ATTR_LIST(mb_group_prealloc), | 2668 | ATTR_LIST(mb_group_prealloc), |
2669 | ATTR_LIST(max_writeback_mb_bump), | 2669 | ATTR_LIST(max_writeback_mb_bump), |
2670 | ATTR_LIST(extent_max_zeroout_kb), | 2670 | ATTR_LIST(extent_max_zeroout_kb), |
2671 | ATTR_LIST(trigger_fs_error), | 2671 | ATTR_LIST(trigger_fs_error), |
2672 | ATTR_LIST(err_ratelimit_interval_ms), | 2672 | ATTR_LIST(err_ratelimit_interval_ms), |
2673 | ATTR_LIST(err_ratelimit_burst), | 2673 | ATTR_LIST(err_ratelimit_burst), |
2674 | ATTR_LIST(warning_ratelimit_interval_ms), | 2674 | ATTR_LIST(warning_ratelimit_interval_ms), |
2675 | ATTR_LIST(warning_ratelimit_burst), | 2675 | ATTR_LIST(warning_ratelimit_burst), |
2676 | ATTR_LIST(msg_ratelimit_interval_ms), | 2676 | ATTR_LIST(msg_ratelimit_interval_ms), |
2677 | ATTR_LIST(msg_ratelimit_burst), | 2677 | ATTR_LIST(msg_ratelimit_burst), |
2678 | ATTR_LIST(errors_count), | 2678 | ATTR_LIST(errors_count), |
2679 | ATTR_LIST(first_error_time), | 2679 | ATTR_LIST(first_error_time), |
2680 | ATTR_LIST(last_error_time), | 2680 | ATTR_LIST(last_error_time), |
2681 | NULL, | 2681 | NULL, |
2682 | }; | 2682 | }; |
2683 | 2683 | ||
2684 | /* Features this copy of ext4 supports */ | 2684 | /* Features this copy of ext4 supports */ |
2685 | EXT4_INFO_ATTR(lazy_itable_init); | 2685 | EXT4_INFO_ATTR(lazy_itable_init); |
2686 | EXT4_INFO_ATTR(batched_discard); | 2686 | EXT4_INFO_ATTR(batched_discard); |
2687 | EXT4_INFO_ATTR(meta_bg_resize); | 2687 | EXT4_INFO_ATTR(meta_bg_resize); |
2688 | 2688 | ||
2689 | static struct attribute *ext4_feat_attrs[] = { | 2689 | static struct attribute *ext4_feat_attrs[] = { |
2690 | ATTR_LIST(lazy_itable_init), | 2690 | ATTR_LIST(lazy_itable_init), |
2691 | ATTR_LIST(batched_discard), | 2691 | ATTR_LIST(batched_discard), |
2692 | ATTR_LIST(meta_bg_resize), | 2692 | ATTR_LIST(meta_bg_resize), |
2693 | NULL, | 2693 | NULL, |
2694 | }; | 2694 | }; |
2695 | 2695 | ||
2696 | static ssize_t ext4_attr_show(struct kobject *kobj, | 2696 | static ssize_t ext4_attr_show(struct kobject *kobj, |
2697 | struct attribute *attr, char *buf) | 2697 | struct attribute *attr, char *buf) |
2698 | { | 2698 | { |
2699 | struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, | 2699 | struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, |
2700 | s_kobj); | 2700 | s_kobj); |
2701 | struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); | 2701 | struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); |
2702 | 2702 | ||
2703 | return a->show ? a->show(a, sbi, buf) : 0; | 2703 | return a->show ? a->show(a, sbi, buf) : 0; |
2704 | } | 2704 | } |
2705 | 2705 | ||
2706 | static ssize_t ext4_attr_store(struct kobject *kobj, | 2706 | static ssize_t ext4_attr_store(struct kobject *kobj, |
2707 | struct attribute *attr, | 2707 | struct attribute *attr, |
2708 | const char *buf, size_t len) | 2708 | const char *buf, size_t len) |
2709 | { | 2709 | { |
2710 | struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, | 2710 | struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, |
2711 | s_kobj); | 2711 | s_kobj); |
2712 | struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); | 2712 | struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); |
2713 | 2713 | ||
2714 | return a->store ? a->store(a, sbi, buf, len) : 0; | 2714 | return a->store ? a->store(a, sbi, buf, len) : 0; |
2715 | } | 2715 | } |
2716 | 2716 | ||
2717 | static void ext4_sb_release(struct kobject *kobj) | 2717 | static void ext4_sb_release(struct kobject *kobj) |
2718 | { | 2718 | { |
2719 | struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, | 2719 | struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, |
2720 | s_kobj); | 2720 | s_kobj); |
2721 | complete(&sbi->s_kobj_unregister); | 2721 | complete(&sbi->s_kobj_unregister); |
2722 | } | 2722 | } |
2723 | 2723 | ||
2724 | static const struct sysfs_ops ext4_attr_ops = { | 2724 | static const struct sysfs_ops ext4_attr_ops = { |
2725 | .show = ext4_attr_show, | 2725 | .show = ext4_attr_show, |
2726 | .store = ext4_attr_store, | 2726 | .store = ext4_attr_store, |
2727 | }; | 2727 | }; |
2728 | 2728 | ||
2729 | static struct kobj_type ext4_ktype = { | 2729 | static struct kobj_type ext4_ktype = { |
2730 | .default_attrs = ext4_attrs, | 2730 | .default_attrs = ext4_attrs, |
2731 | .sysfs_ops = &ext4_attr_ops, | 2731 | .sysfs_ops = &ext4_attr_ops, |
2732 | .release = ext4_sb_release, | 2732 | .release = ext4_sb_release, |
2733 | }; | 2733 | }; |
2734 | 2734 | ||
2735 | static void ext4_feat_release(struct kobject *kobj) | 2735 | static void ext4_feat_release(struct kobject *kobj) |
2736 | { | 2736 | { |
2737 | complete(&ext4_feat->f_kobj_unregister); | 2737 | complete(&ext4_feat->f_kobj_unregister); |
2738 | } | 2738 | } |
2739 | 2739 | ||
2740 | static ssize_t ext4_feat_show(struct kobject *kobj, | 2740 | static ssize_t ext4_feat_show(struct kobject *kobj, |
2741 | struct attribute *attr, char *buf) | 2741 | struct attribute *attr, char *buf) |
2742 | { | 2742 | { |
2743 | return snprintf(buf, PAGE_SIZE, "supported\n"); | 2743 | return snprintf(buf, PAGE_SIZE, "supported\n"); |
2744 | } | 2744 | } |
2745 | 2745 | ||
2746 | /* | 2746 | /* |
2747 | * We cannot use ext4_attr_show/store because they rely on the kobject | 2747 | * We cannot use ext4_attr_show/store because they rely on the kobject |
2748 | * being embedded in the ext4_sb_info structure which is definitely not | 2748 | * being embedded in the ext4_sb_info structure which is definitely not |
2749 | * true in this case. | 2749 | * true in this case. |
2750 | */ | 2750 | */ |
2751 | static const struct sysfs_ops ext4_feat_ops = { | 2751 | static const struct sysfs_ops ext4_feat_ops = { |
2752 | .show = ext4_feat_show, | 2752 | .show = ext4_feat_show, |
2753 | .store = NULL, | 2753 | .store = NULL, |
2754 | }; | 2754 | }; |
2755 | 2755 | ||
2756 | static struct kobj_type ext4_feat_ktype = { | 2756 | static struct kobj_type ext4_feat_ktype = { |
2757 | .default_attrs = ext4_feat_attrs, | 2757 | .default_attrs = ext4_feat_attrs, |
2758 | .sysfs_ops = &ext4_feat_ops, | 2758 | .sysfs_ops = &ext4_feat_ops, |
2759 | .release = ext4_feat_release, | 2759 | .release = ext4_feat_release, |
2760 | }; | 2760 | }; |
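Taken together, the two kobj_types wire these attributes into sysfs in two places. A sketch of the resulting layout (paths assumed from where ext4 registers its kobjects):

    /*
     * /sys/fs/ext4/features/lazy_itable_init    any read returns
     * /sys/fs/ext4/features/batched_discard     "supported" (ext4_feat_show)
     * /sys/fs/ext4/<disk>/inode_readahead_blks  per-mount tunables backed
     * /sys/fs/ext4/<disk>/reserved_clusters     by ext4_attr_show/store
     */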
2761 | 2761 | ||
2762 | /* | 2762 | /* |
2763 | * Check whether this filesystem can be mounted based on | 2763 | * Check whether this filesystem can be mounted based on |
2764 | * the features present and the RDONLY/RDWR mount requested. | 2764 | * the features present and the RDONLY/RDWR mount requested. |
2765 | * Returns 1 if this filesystem can be mounted as requested, | 2765 | * Returns 1 if this filesystem can be mounted as requested, |
2766 | * 0 if it cannot be. | 2766 | * 0 if it cannot be. |
2767 | */ | 2767 | */ |
2768 | static int ext4_feature_set_ok(struct super_block *sb, int readonly) | 2768 | static int ext4_feature_set_ok(struct super_block *sb, int readonly) |
2769 | { | 2769 | { |
2770 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) { | 2770 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) { |
2771 | ext4_msg(sb, KERN_ERR, | 2771 | ext4_msg(sb, KERN_ERR, |
2772 | "Couldn't mount because of " | 2772 | "Couldn't mount because of " |
2773 | "unsupported optional features (%x)", | 2773 | "unsupported optional features (%x)", |
2774 | (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & | 2774 | (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & |
2775 | ~EXT4_FEATURE_INCOMPAT_SUPP)); | 2775 | ~EXT4_FEATURE_INCOMPAT_SUPP)); |
2776 | return 0; | 2776 | return 0; |
2777 | } | 2777 | } |
2778 | 2778 | ||
2779 | if (readonly) | 2779 | if (readonly) |
2780 | return 1; | 2780 | return 1; |
2781 | 2781 | ||
2782 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_READONLY)) { | ||
2783 | ext4_msg(sb, KERN_INFO, "filesystem is read-only"); | ||
2784 | sb->s_flags |= MS_RDONLY; | ||
2785 | return 1; | ||
2786 | } | ||
2787 | |||
2782 | /* Check that feature set is OK for a read-write mount */ | 2788 | /* Check that feature set is OK for a read-write mount */ |
2783 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) { | 2789 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) { |
2784 | ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of " | 2790 | ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of " |
2785 | "unsupported optional features (%x)", | 2791 | "unsupported optional features (%x)", |
2786 | (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & | 2792 | (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & |
2787 | ~EXT4_FEATURE_RO_COMPAT_SUPP)); | 2793 | ~EXT4_FEATURE_RO_COMPAT_SUPP)); |
2788 | return 0; | 2794 | return 0; |
2789 | } | 2795 | } |
2790 | /* | 2796 | /* |
2791 | * Large file size enabled file system can only be mounted | 2797 | * Large file size enabled file system can only be mounted |
2792 | * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF | 2798 | * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF |
2793 | */ | 2799 | */ |
2794 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { | 2800 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { |
2795 | if (sizeof(blkcnt_t) < sizeof(u64)) { | 2801 | if (sizeof(blkcnt_t) < sizeof(u64)) { |
2796 | ext4_msg(sb, KERN_ERR, "Filesystem with huge files " | 2802 | ext4_msg(sb, KERN_ERR, "Filesystem with huge files " |
2797 | "cannot be mounted RDWR without " | 2803 | "cannot be mounted RDWR without " |
2798 | "CONFIG_LBDAF"); | 2804 | "CONFIG_LBDAF"); |
2799 | return 0; | 2805 | return 0; |
2800 | } | 2806 | } |
2801 | } | 2807 | } |
2802 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC) && | 2808 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC) && |
2803 | !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { | 2809 | !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { |
2804 | ext4_msg(sb, KERN_ERR, | 2810 | ext4_msg(sb, KERN_ERR, |
2805 | "Can't support bigalloc feature without " | 2811 | "Can't support bigalloc feature without " |
2806 | "extents feature\n"); | 2812 | "extents feature\n"); |
2807 | return 0; | 2813 | return 0; |
2808 | } | 2814 | } |
2809 | 2815 | ||
2810 | #ifndef CONFIG_QUOTA | 2816 | #ifndef CONFIG_QUOTA |
2811 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && | 2817 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && |
2812 | !readonly) { | 2818 | !readonly) { |
2813 | ext4_msg(sb, KERN_ERR, | 2819 | ext4_msg(sb, KERN_ERR, |
2814 | "Filesystem with quota feature cannot be mounted RDWR " | 2820 | "Filesystem with quota feature cannot be mounted RDWR " |
2815 | "without CONFIG_QUOTA"); | 2821 | "without CONFIG_QUOTA"); |
2816 | return 0; | 2822 | return 0; |
2817 | } | 2823 | } |
2818 | #endif /* CONFIG_QUOTA */ | 2824 | #endif /* CONFIG_QUOTA */ |
2819 | return 1; | 2825 | return 1; |
2820 | } | 2826 | } |
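The hunk added above is the mechanics of the read-only image feature: when the READONLY ro-compat bit is set, the mount is not refused, it is quietly downgraded. Illustrative behavior, assuming an image with that bit set:

    /*
     * mount -o rw /dev/sdb1 /mnt   succeeds, but MS_RDONLY is forced on,
     *                              so the mount behaves as if ro was given
     * mount -o ro /dev/sdb1 /mnt   unaffected; the new check sits after
     *                              the early 'if (readonly)' return
     */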
2821 | 2827 | ||
2822 | /* | 2828 | /* |
2823 | * This function is called once a day if we have errors logged | 2829 | * This function is called once a day if we have errors logged |
2824 | * on the file system | 2830 | * on the file system |
2825 | */ | 2831 | */ |
2826 | static void print_daily_error_info(unsigned long arg) | 2832 | static void print_daily_error_info(unsigned long arg) |
2827 | { | 2833 | { |
2828 | struct super_block *sb = (struct super_block *) arg; | 2834 | struct super_block *sb = (struct super_block *) arg; |
2829 | struct ext4_sb_info *sbi; | 2835 | struct ext4_sb_info *sbi; |
2830 | struct ext4_super_block *es; | 2836 | struct ext4_super_block *es; |
2831 | 2837 | ||
2832 | sbi = EXT4_SB(sb); | 2838 | sbi = EXT4_SB(sb); |
2833 | es = sbi->s_es; | 2839 | es = sbi->s_es; |
2834 | 2840 | ||
2835 | if (es->s_error_count) | 2841 | if (es->s_error_count) |
2836 | /* fsck newer than v1.41.13 is needed to clean this condition. */ | 2842 | /* fsck newer than v1.41.13 is needed to clean this condition. */ |
2837 | ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u", | 2843 | ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u", |
2838 | le32_to_cpu(es->s_error_count)); | 2844 | le32_to_cpu(es->s_error_count)); |
2839 | if (es->s_first_error_time) { | 2845 | if (es->s_first_error_time) { |
2840 | printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %u: %.*s:%d", | 2846 | printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %u: %.*s:%d", |
2841 | sb->s_id, le32_to_cpu(es->s_first_error_time), | 2847 | sb->s_id, le32_to_cpu(es->s_first_error_time), |
2842 | (int) sizeof(es->s_first_error_func), | 2848 | (int) sizeof(es->s_first_error_func), |
2843 | es->s_first_error_func, | 2849 | es->s_first_error_func, |
2844 | le32_to_cpu(es->s_first_error_line)); | 2850 | le32_to_cpu(es->s_first_error_line)); |
2845 | if (es->s_first_error_ino) | 2851 | if (es->s_first_error_ino) |
2846 | printk(": inode %u", | 2852 | printk(": inode %u", |
2847 | le32_to_cpu(es->s_first_error_ino)); | 2853 | le32_to_cpu(es->s_first_error_ino)); |
2848 | if (es->s_first_error_block) | 2854 | if (es->s_first_error_block) |
2849 | printk(": block %llu", (unsigned long long) | 2855 | printk(": block %llu", (unsigned long long) |
2850 | le64_to_cpu(es->s_first_error_block)); | 2856 | le64_to_cpu(es->s_first_error_block)); |
2851 | printk("\n"); | 2857 | printk("\n"); |
2852 | } | 2858 | } |
2853 | if (es->s_last_error_time) { | 2859 | if (es->s_last_error_time) { |
2854 | printk(KERN_NOTICE "EXT4-fs (%s): last error at time %u: %.*s:%d", | 2860 | printk(KERN_NOTICE "EXT4-fs (%s): last error at time %u: %.*s:%d", |
2855 | sb->s_id, le32_to_cpu(es->s_last_error_time), | 2861 | sb->s_id, le32_to_cpu(es->s_last_error_time), |
2856 | (int) sizeof(es->s_last_error_func), | 2862 | (int) sizeof(es->s_last_error_func), |
2857 | es->s_last_error_func, | 2863 | es->s_last_error_func, |
2858 | le32_to_cpu(es->s_last_error_line)); | 2864 | le32_to_cpu(es->s_last_error_line)); |
2859 | if (es->s_last_error_ino) | 2865 | if (es->s_last_error_ino) |
2860 | printk(": inode %u", | 2866 | printk(": inode %u", |
2861 | le32_to_cpu(es->s_last_error_ino)); | 2867 | le32_to_cpu(es->s_last_error_ino)); |
2862 | if (es->s_last_error_block) | 2868 | if (es->s_last_error_block) |
2863 | printk(": block %llu", (unsigned long long) | 2869 | printk(": block %llu", (unsigned long long) |
2864 | le64_to_cpu(es->s_last_error_block)); | 2870 | le64_to_cpu(es->s_last_error_block)); |
2865 | printk("\n"); | 2871 | printk("\n"); |
2866 | } | 2872 | } |
2867 | mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ | 2873 | mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ |
2868 | } | 2874 | } |
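print_daily_error_info() is the callback for the sbi->s_err_report timer; the arming happens in ext4_fill_super(), outside this hunk. A sketch of what that call site presumably looks like with the standard setup_timer() helper:

    /* Assumed call-site shape, not part of this hunk: */
    setup_timer(&sbi->s_err_report, print_daily_error_info,
                (unsigned long)sb);
    if (es->s_error_count)
            mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);

After the first run, the mod_timer() at the end of the function above keeps it firing once a day for as long as the filesystem stays mounted.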
2869 | 2875 | ||
2870 | /* Find next suitable group and run ext4_init_inode_table */ | 2876 | /* Find next suitable group and run ext4_init_inode_table */ |
2871 | static int ext4_run_li_request(struct ext4_li_request *elr) | 2877 | static int ext4_run_li_request(struct ext4_li_request *elr) |
2872 | { | 2878 | { |
2873 | struct ext4_group_desc *gdp = NULL; | 2879 | struct ext4_group_desc *gdp = NULL; |
2874 | ext4_group_t group, ngroups; | 2880 | ext4_group_t group, ngroups; |
2875 | struct super_block *sb; | 2881 | struct super_block *sb; |
2876 | unsigned long timeout = 0; | 2882 | unsigned long timeout = 0; |
2877 | int ret = 0; | 2883 | int ret = 0; |
2878 | 2884 | ||
2879 | sb = elr->lr_super; | 2885 | sb = elr->lr_super; |
2880 | ngroups = EXT4_SB(sb)->s_groups_count; | 2886 | ngroups = EXT4_SB(sb)->s_groups_count; |
2881 | 2887 | ||
2882 | sb_start_write(sb); | 2888 | sb_start_write(sb); |
2883 | for (group = elr->lr_next_group; group < ngroups; group++) { | 2889 | for (group = elr->lr_next_group; group < ngroups; group++) { |
2884 | gdp = ext4_get_group_desc(sb, group, NULL); | 2890 | gdp = ext4_get_group_desc(sb, group, NULL); |
2885 | if (!gdp) { | 2891 | if (!gdp) { |
2886 | ret = 1; | 2892 | ret = 1; |
2887 | break; | 2893 | break; |
2888 | } | 2894 | } |
2889 | 2895 | ||
2890 | if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) | 2896 | if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) |
2891 | break; | 2897 | break; |
2892 | } | 2898 | } |
2893 | 2899 | ||
2894 | if (group >= ngroups) | 2900 | if (group >= ngroups) |
2895 | ret = 1; | 2901 | ret = 1; |
2896 | 2902 | ||
2897 | if (!ret) { | 2903 | if (!ret) { |
2898 | timeout = jiffies; | 2904 | timeout = jiffies; |
2899 | ret = ext4_init_inode_table(sb, group, | 2905 | ret = ext4_init_inode_table(sb, group, |
2900 | elr->lr_timeout ? 0 : 1); | 2906 | elr->lr_timeout ? 0 : 1); |
2901 | if (elr->lr_timeout == 0) { | 2907 | if (elr->lr_timeout == 0) { |
2902 | timeout = (jiffies - timeout) * | 2908 | timeout = (jiffies - timeout) * |
2903 | elr->lr_sbi->s_li_wait_mult; | 2909 | elr->lr_sbi->s_li_wait_mult; |
2904 | elr->lr_timeout = timeout; | 2910 | elr->lr_timeout = timeout; |
2905 | } | 2911 | } |
2906 | elr->lr_next_sched = jiffies + elr->lr_timeout; | 2912 | elr->lr_next_sched = jiffies + elr->lr_timeout; |
2907 | elr->lr_next_group = group + 1; | 2913 | elr->lr_next_group = group + 1; |
2908 | } | 2914 | } |
2909 | sb_end_write(sb); | 2915 | sb_end_write(sb); |
2910 | 2916 | ||
2911 | return ret; | 2917 | return ret; |
2912 | } | 2918 | } |
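The pacing logic above deserves a number. On the first pass lr_timeout is 0, so the time one group took is measured and multiplied by s_li_wait_mult:

    /*
     * Example (EXT4_DEF_LI_WAIT_MULT assumed to be 10): if zeroing one
     * group's inode tables took 100 ms, lr_timeout becomes ~1 s, so the
     * lazyinit thread spends roughly a tenth of wall time doing I/O.
     */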
2913 | 2919 | ||
2914 | /* | 2920 | /* |
2915 | * Remove lr_request from the request list and free the | 2921 | * Remove lr_request from the request list and free the |
2916 | * request structure. Should be called with li_list_mtx held. | 2922 | * request structure. Should be called with li_list_mtx held. |
2917 | */ | 2923 | */ |
2918 | static void ext4_remove_li_request(struct ext4_li_request *elr) | 2924 | static void ext4_remove_li_request(struct ext4_li_request *elr) |
2919 | { | 2925 | { |
2920 | struct ext4_sb_info *sbi; | 2926 | struct ext4_sb_info *sbi; |
2921 | 2927 | ||
2922 | if (!elr) | 2928 | if (!elr) |
2923 | return; | 2929 | return; |
2924 | 2930 | ||
2925 | sbi = elr->lr_sbi; | 2931 | sbi = elr->lr_sbi; |
2926 | 2932 | ||
2927 | list_del(&elr->lr_request); | 2933 | list_del(&elr->lr_request); |
2928 | sbi->s_li_request = NULL; | 2934 | sbi->s_li_request = NULL; |
2929 | kfree(elr); | 2935 | kfree(elr); |
2930 | } | 2936 | } |
2931 | 2937 | ||
2932 | static void ext4_unregister_li_request(struct super_block *sb) | 2938 | static void ext4_unregister_li_request(struct super_block *sb) |
2933 | { | 2939 | { |
2934 | mutex_lock(&ext4_li_mtx); | 2940 | mutex_lock(&ext4_li_mtx); |
2935 | if (!ext4_li_info) { | 2941 | if (!ext4_li_info) { |
2936 | mutex_unlock(&ext4_li_mtx); | 2942 | mutex_unlock(&ext4_li_mtx); |
2937 | return; | 2943 | return; |
2938 | } | 2944 | } |
2939 | 2945 | ||
2940 | mutex_lock(&ext4_li_info->li_list_mtx); | 2946 | mutex_lock(&ext4_li_info->li_list_mtx); |
2941 | ext4_remove_li_request(EXT4_SB(sb)->s_li_request); | 2947 | ext4_remove_li_request(EXT4_SB(sb)->s_li_request); |
2942 | mutex_unlock(&ext4_li_info->li_list_mtx); | 2948 | mutex_unlock(&ext4_li_info->li_list_mtx); |
2943 | mutex_unlock(&ext4_li_mtx); | 2949 | mutex_unlock(&ext4_li_mtx); |
2944 | } | 2950 | } |
2945 | 2951 | ||
2946 | static struct task_struct *ext4_lazyinit_task; | 2952 | static struct task_struct *ext4_lazyinit_task; |
2947 | 2953 | ||
2948 | /* | 2954 | /* |
2949 | * This is the function where the ext4lazyinit thread lives. It walks | 2955 | * This is the function where the ext4lazyinit thread lives. It walks |
2950 | * through the request list searching for the next scheduled filesystem. | 2956 | * through the request list searching for the next scheduled filesystem. |
2951 | * When such a fs is found, it runs the lazy initialization request | 2957 | * When such a fs is found, it runs the lazy initialization request |
2952 | * (ext4_run_li_request) and keeps track of the time spent in this | 2958 | * (ext4_run_li_request) and keeps track of the time spent in this |
2953 | * function. Based on that time we compute the next schedule time of | 2959 | * function. Based on that time we compute the next schedule time of |
2954 | * the request. When walking through the list is complete, the thread | 2960 | * the request. When walking through the list is complete, the thread |
2955 | * computes the next wakeup time and puts itself to sleep. | 2961 | * computes the next wakeup time and puts itself to sleep. |
2956 | */ | 2962 | */ |
2957 | static int ext4_lazyinit_thread(void *arg) | 2963 | static int ext4_lazyinit_thread(void *arg) |
2958 | { | 2964 | { |
2959 | struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg; | 2965 | struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg; |
2960 | struct list_head *pos, *n; | 2966 | struct list_head *pos, *n; |
2961 | struct ext4_li_request *elr; | 2967 | struct ext4_li_request *elr; |
2962 | unsigned long next_wakeup, cur; | 2968 | unsigned long next_wakeup, cur; |
2963 | 2969 | ||
2964 | BUG_ON(NULL == eli); | 2970 | BUG_ON(NULL == eli); |
2965 | 2971 | ||
2966 | cont_thread: | 2972 | cont_thread: |
2967 | while (true) { | 2973 | while (true) { |
2968 | next_wakeup = MAX_JIFFY_OFFSET; | 2974 | next_wakeup = MAX_JIFFY_OFFSET; |
2969 | 2975 | ||
2970 | mutex_lock(&eli->li_list_mtx); | 2976 | mutex_lock(&eli->li_list_mtx); |
2971 | if (list_empty(&eli->li_request_list)) { | 2977 | if (list_empty(&eli->li_request_list)) { |
2972 | mutex_unlock(&eli->li_list_mtx); | 2978 | mutex_unlock(&eli->li_list_mtx); |
2973 | goto exit_thread; | 2979 | goto exit_thread; |
2974 | } | 2980 | } |
2975 | 2981 | ||
2976 | list_for_each_safe(pos, n, &eli->li_request_list) { | 2982 | list_for_each_safe(pos, n, &eli->li_request_list) { |
2977 | elr = list_entry(pos, struct ext4_li_request, | 2983 | elr = list_entry(pos, struct ext4_li_request, |
2978 | lr_request); | 2984 | lr_request); |
2979 | 2985 | ||
2980 | if (time_after_eq(jiffies, elr->lr_next_sched)) { | 2986 | if (time_after_eq(jiffies, elr->lr_next_sched)) { |
2981 | if (ext4_run_li_request(elr) != 0) { | 2987 | if (ext4_run_li_request(elr) != 0) { |
2982 | /* error, remove the lazy_init job */ | 2988 | /* error, remove the lazy_init job */ |
2983 | ext4_remove_li_request(elr); | 2989 | ext4_remove_li_request(elr); |
2984 | continue; | 2990 | continue; |
2985 | } | 2991 | } |
2986 | } | 2992 | } |
2987 | 2993 | ||
2988 | if (time_before(elr->lr_next_sched, next_wakeup)) | 2994 | if (time_before(elr->lr_next_sched, next_wakeup)) |
2989 | next_wakeup = elr->lr_next_sched; | 2995 | next_wakeup = elr->lr_next_sched; |
2990 | } | 2996 | } |
2991 | mutex_unlock(&eli->li_list_mtx); | 2997 | mutex_unlock(&eli->li_list_mtx); |
2992 | 2998 | ||
2993 | try_to_freeze(); | 2999 | try_to_freeze(); |
2994 | 3000 | ||
2995 | cur = jiffies; | 3001 | cur = jiffies; |
2996 | if ((time_after_eq(cur, next_wakeup)) || | 3002 | if ((time_after_eq(cur, next_wakeup)) || |
2997 | (MAX_JIFFY_OFFSET == next_wakeup)) { | 3003 | (MAX_JIFFY_OFFSET == next_wakeup)) { |
2998 | cond_resched(); | 3004 | cond_resched(); |
2999 | continue; | 3005 | continue; |
3000 | } | 3006 | } |
3001 | 3007 | ||
3002 | schedule_timeout_interruptible(next_wakeup - cur); | 3008 | schedule_timeout_interruptible(next_wakeup - cur); |
3003 | 3009 | ||
3004 | if (kthread_should_stop()) { | 3010 | if (kthread_should_stop()) { |
3005 | ext4_clear_request_list(); | 3011 | ext4_clear_request_list(); |
3006 | goto exit_thread; | 3012 | goto exit_thread; |
3007 | } | 3013 | } |
3008 | } | 3014 | } |
3009 | 3015 | ||
3010 | exit_thread: | 3016 | exit_thread: |
3011 | /* | 3017 | /* |
3012 | * It looks like the request list is empty, but we need | 3018 | * It looks like the request list is empty, but we need |
3013 | * to check it under the li_list_mtx lock, to prevent any | 3019 | * to check it under the li_list_mtx lock, to prevent any |
3014 | * additions into it, and of course we should lock ext4_li_mtx | 3020 | * additions into it, and of course we should lock ext4_li_mtx |
3015 | * to atomically free the list and ext4_li_info, because at | 3021 | * to atomically free the list and ext4_li_info, because at |
3016 | * this point another ext4 filesystem could be registering | 3022 | * this point another ext4 filesystem could be registering |
3017 | * a new one. | 3023 | * a new one. |
3018 | */ | 3024 | */ |
3019 | mutex_lock(&ext4_li_mtx); | 3025 | mutex_lock(&ext4_li_mtx); |
3020 | mutex_lock(&eli->li_list_mtx); | 3026 | mutex_lock(&eli->li_list_mtx); |
3021 | if (!list_empty(&eli->li_request_list)) { | 3027 | if (!list_empty(&eli->li_request_list)) { |
3022 | mutex_unlock(&eli->li_list_mtx); | 3028 | mutex_unlock(&eli->li_list_mtx); |
3023 | mutex_unlock(&ext4_li_mtx); | 3029 | mutex_unlock(&ext4_li_mtx); |
3024 | goto cont_thread; | 3030 | goto cont_thread; |
3025 | } | 3031 | } |
3026 | mutex_unlock(&eli->li_list_mtx); | 3032 | mutex_unlock(&eli->li_list_mtx); |
3027 | kfree(ext4_li_info); | 3033 | kfree(ext4_li_info); |
3028 | ext4_li_info = NULL; | 3034 | ext4_li_info = NULL; |
3029 | mutex_unlock(&ext4_li_mtx); | 3035 | mutex_unlock(&ext4_li_mtx); |
3030 | 3036 | ||
3031 | return 0; | 3037 | return 0; |
3032 | } | 3038 | } |
3033 | 3039 | ||
3034 | static void ext4_clear_request_list(void) | 3040 | static void ext4_clear_request_list(void) |
3035 | { | 3041 | { |
3036 | struct list_head *pos, *n; | 3042 | struct list_head *pos, *n; |
3037 | struct ext4_li_request *elr; | 3043 | struct ext4_li_request *elr; |
3038 | 3044 | ||
3039 | mutex_lock(&ext4_li_info->li_list_mtx); | 3045 | mutex_lock(&ext4_li_info->li_list_mtx); |
3040 | list_for_each_safe(pos, n, &ext4_li_info->li_request_list) { | 3046 | list_for_each_safe(pos, n, &ext4_li_info->li_request_list) { |
3041 | elr = list_entry(pos, struct ext4_li_request, | 3047 | elr = list_entry(pos, struct ext4_li_request, |
3042 | lr_request); | 3048 | lr_request); |
3043 | ext4_remove_li_request(elr); | 3049 | ext4_remove_li_request(elr); |
3044 | } | 3050 | } |
3045 | mutex_unlock(&ext4_li_info->li_list_mtx); | 3051 | mutex_unlock(&ext4_li_info->li_list_mtx); |
3046 | } | 3052 | } |
3047 | 3053 | ||
3048 | static int ext4_run_lazyinit_thread(void) | 3054 | static int ext4_run_lazyinit_thread(void) |
3049 | { | 3055 | { |
3050 | ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread, | 3056 | ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread, |
3051 | ext4_li_info, "ext4lazyinit"); | 3057 | ext4_li_info, "ext4lazyinit"); |
3052 | if (IS_ERR(ext4_lazyinit_task)) { | 3058 | if (IS_ERR(ext4_lazyinit_task)) { |
3053 | int err = PTR_ERR(ext4_lazyinit_task); | 3059 | int err = PTR_ERR(ext4_lazyinit_task); |
3054 | ext4_clear_request_list(); | 3060 | ext4_clear_request_list(); |
3055 | kfree(ext4_li_info); | 3061 | kfree(ext4_li_info); |
3056 | ext4_li_info = NULL; | 3062 | ext4_li_info = NULL; |
3057 | printk(KERN_CRIT "EXT4-fs: error %d creating inode table " | 3063 | printk(KERN_CRIT "EXT4-fs: error %d creating inode table " |
3058 | "initialization thread\n", | 3064 | "initialization thread\n", |
3059 | err); | 3065 | err); |
3060 | return err; | 3066 | return err; |
3061 | } | 3067 | } |
3062 | ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING; | 3068 | ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING; |
3063 | return 0; | 3069 | return 0; |
3064 | } | 3070 | } |
3065 | 3071 | ||
3066 | /* | 3072 | /* |
3067 | * Check whether it makes sense to run the itable init thread or not. | 3073 | * Check whether it makes sense to run the itable init thread or not. |
3068 | * If there is at least one uninitialized inode table, return the | 3074 | * If there is at least one uninitialized inode table, return the |
3069 | * corresponding group number; otherwise the loop goes through all | 3075 | * corresponding group number; otherwise the loop goes through all |
3070 | * groups and returns the total number of groups. | 3076 | * groups and returns the total number of groups. |
3071 | */ | 3077 | */ |
3072 | static ext4_group_t ext4_has_uninit_itable(struct super_block *sb) | 3078 | static ext4_group_t ext4_has_uninit_itable(struct super_block *sb) |
3073 | { | 3079 | { |
3074 | ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count; | 3080 | ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count; |
3075 | struct ext4_group_desc *gdp = NULL; | 3081 | struct ext4_group_desc *gdp = NULL; |
3076 | 3082 | ||
3077 | for (group = 0; group < ngroups; group++) { | 3083 | for (group = 0; group < ngroups; group++) { |
3078 | gdp = ext4_get_group_desc(sb, group, NULL); | 3084 | gdp = ext4_get_group_desc(sb, group, NULL); |
3079 | if (!gdp) | 3085 | if (!gdp) |
3080 | continue; | 3086 | continue; |
3081 | 3087 | ||
3082 | if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) | 3088 | if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) |
3083 | break; | 3089 | break; |
3084 | } | 3090 | } |
3085 | 3091 | ||
3086 | return group; | 3092 | return group; |
3087 | } | 3093 | } |
3088 | 3094 | ||
3089 | static int ext4_li_info_new(void) | 3095 | static int ext4_li_info_new(void) |
3090 | { | 3096 | { |
3091 | struct ext4_lazy_init *eli = NULL; | 3097 | struct ext4_lazy_init *eli = NULL; |
3092 | 3098 | ||
3093 | eli = kzalloc(sizeof(*eli), GFP_KERNEL); | 3099 | eli = kzalloc(sizeof(*eli), GFP_KERNEL); |
3094 | if (!eli) | 3100 | if (!eli) |
3095 | return -ENOMEM; | 3101 | return -ENOMEM; |
3096 | 3102 | ||
3097 | INIT_LIST_HEAD(&eli->li_request_list); | 3103 | INIT_LIST_HEAD(&eli->li_request_list); |
3098 | mutex_init(&eli->li_list_mtx); | 3104 | mutex_init(&eli->li_list_mtx); |
3099 | 3105 | ||
3100 | eli->li_state |= EXT4_LAZYINIT_QUIT; | 3106 | eli->li_state |= EXT4_LAZYINIT_QUIT; |
3101 | 3107 | ||
3102 | ext4_li_info = eli; | 3108 | ext4_li_info = eli; |
3103 | 3109 | ||
3104 | return 0; | 3110 | return 0; |
3105 | } | 3111 | } |
3106 | 3112 | ||
3107 | static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, | 3113 | static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, |
3108 | ext4_group_t start) | 3114 | ext4_group_t start) |
3109 | { | 3115 | { |
3110 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 3116 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
3111 | struct ext4_li_request *elr; | 3117 | struct ext4_li_request *elr; |
3112 | 3118 | ||
3113 | elr = kzalloc(sizeof(*elr), GFP_KERNEL); | 3119 | elr = kzalloc(sizeof(*elr), GFP_KERNEL); |
3114 | if (!elr) | 3120 | if (!elr) |
3115 | return NULL; | 3121 | return NULL; |
3116 | 3122 | ||
3117 | elr->lr_super = sb; | 3123 | elr->lr_super = sb; |
3118 | elr->lr_sbi = sbi; | 3124 | elr->lr_sbi = sbi; |
3119 | elr->lr_next_group = start; | 3125 | elr->lr_next_group = start; |
3120 | 3126 | ||
3121 | /* | 3127 | /* |
3122 | * Randomize first schedule time of the request to | 3128 | * Randomize first schedule time of the request to |
3123 | * spread the inode table initialization requests | 3129 | * spread the inode table initialization requests |
3124 | * better. | 3130 | * better. |
3125 | */ | 3131 | */ |
3126 | elr->lr_next_sched = jiffies + (prandom_u32() % | 3132 | elr->lr_next_sched = jiffies + (prandom_u32() % |
3127 | (EXT4_DEF_LI_MAX_START_DELAY * HZ)); | 3133 | (EXT4_DEF_LI_MAX_START_DELAY * HZ)); |
3128 | return elr; | 3134 | return elr; |
3129 | } | 3135 | } |
3130 | 3136 | ||
3131 | int ext4_register_li_request(struct super_block *sb, | 3137 | int ext4_register_li_request(struct super_block *sb, |
3132 | ext4_group_t first_not_zeroed) | 3138 | ext4_group_t first_not_zeroed) |
3133 | { | 3139 | { |
3134 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 3140 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
3135 | struct ext4_li_request *elr = NULL; | 3141 | struct ext4_li_request *elr = NULL; |
3136 | ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; | 3142 | ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; |
3137 | int ret = 0; | 3143 | int ret = 0; |
3138 | 3144 | ||
3139 | mutex_lock(&ext4_li_mtx); | 3145 | mutex_lock(&ext4_li_mtx); |
3140 | if (sbi->s_li_request != NULL) { | 3146 | if (sbi->s_li_request != NULL) { |
3141 | /* | 3147 | /* |
3142 | * Reset timeout so it can be computed again, because | 3148 | * Reset timeout so it can be computed again, because |
3143 | * s_li_wait_mult might have changed. | 3149 | * s_li_wait_mult might have changed. |
3144 | */ | 3150 | */ |
3145 | sbi->s_li_request->lr_timeout = 0; | 3151 | sbi->s_li_request->lr_timeout = 0; |
3146 | goto out; | 3152 | goto out; |
3147 | } | 3153 | } |
3148 | 3154 | ||
3149 | if (first_not_zeroed == ngroups || | 3155 | if (first_not_zeroed == ngroups || |
3150 | (sb->s_flags & MS_RDONLY) || | 3156 | (sb->s_flags & MS_RDONLY) || |
3151 | !test_opt(sb, INIT_INODE_TABLE)) | 3157 | !test_opt(sb, INIT_INODE_TABLE)) |
3152 | goto out; | 3158 | goto out; |
3153 | 3159 | ||
3154 | elr = ext4_li_request_new(sb, first_not_zeroed); | 3160 | elr = ext4_li_request_new(sb, first_not_zeroed); |
3155 | if (!elr) { | 3161 | if (!elr) { |
3156 | ret = -ENOMEM; | 3162 | ret = -ENOMEM; |
3157 | goto out; | 3163 | goto out; |
3158 | } | 3164 | } |
3159 | 3165 | ||
3160 | if (NULL == ext4_li_info) { | 3166 | if (NULL == ext4_li_info) { |
3161 | ret = ext4_li_info_new(); | 3167 | ret = ext4_li_info_new(); |
3162 | if (ret) | 3168 | if (ret) |
3163 | goto out; | 3169 | goto out; |
3164 | } | 3170 | } |
3165 | 3171 | ||
3166 | mutex_lock(&ext4_li_info->li_list_mtx); | 3172 | mutex_lock(&ext4_li_info->li_list_mtx); |
3167 | list_add(&elr->lr_request, &ext4_li_info->li_request_list); | 3173 | list_add(&elr->lr_request, &ext4_li_info->li_request_list); |
3168 | mutex_unlock(&ext4_li_info->li_list_mtx); | 3174 | mutex_unlock(&ext4_li_info->li_list_mtx); |
3169 | 3175 | ||
3170 | sbi->s_li_request = elr; | 3176 | sbi->s_li_request = elr; |
3171 | /* | 3177 | /* |
3172 | * set elr to NULL here since it has been inserted into | 3178 | * set elr to NULL here since it has been inserted into |
3173 | * the request_list, and its removal and freeing are | 3179 | * the request_list, and its removal and freeing are |
3174 | * handled by ext4_clear_request_list from now on. | 3180 | * handled by ext4_clear_request_list from now on. |
3175 | */ | 3181 | */ |
3176 | elr = NULL; | 3182 | elr = NULL; |
3177 | 3183 | ||
3178 | if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) { | 3184 | if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) { |
3179 | ret = ext4_run_lazyinit_thread(); | 3185 | ret = ext4_run_lazyinit_thread(); |
3180 | if (ret) | 3186 | if (ret) |
3181 | goto out; | 3187 | goto out; |
3182 | } | 3188 | } |
3183 | out: | 3189 | out: |
3184 | mutex_unlock(&ext4_li_mtx); | 3190 | mutex_unlock(&ext4_li_mtx); |
3185 | if (ret) | 3191 | if (ret) |
3186 | kfree(elr); | 3192 | kfree(elr); |
3187 | return ret; | 3193 | return ret; |
3188 | } | 3194 | } |
3189 | 3195 | ||
3190 | /* | 3196 | /* |
3191 | * We do not need to lock anything since this is called on | 3197 | * We do not need to lock anything since this is called on |
3192 | * module unload. | 3198 | * module unload. |
3193 | */ | 3199 | */ |
3194 | static void ext4_destroy_lazyinit_thread(void) | 3200 | static void ext4_destroy_lazyinit_thread(void) |
3195 | { | 3201 | { |
3196 | /* | 3202 | /* |
3197 | * If the thread exited earlier, | 3203 | * If the thread exited earlier, |
3198 | * there's nothing to be done. | 3204 | * there's nothing to be done. |
3199 | */ | 3205 | */ |
3200 | if (!ext4_li_info || !ext4_lazyinit_task) | 3206 | if (!ext4_li_info || !ext4_lazyinit_task) |
3201 | return; | 3207 | return; |
3202 | 3208 | ||
3203 | kthread_stop(ext4_lazyinit_task); | 3209 | kthread_stop(ext4_lazyinit_task); |
3204 | } | 3210 | } |
3205 | 3211 | ||
3206 | static int set_journal_csum_feature_set(struct super_block *sb) | 3212 | static int set_journal_csum_feature_set(struct super_block *sb) |
3207 | { | 3213 | { |
3208 | int ret = 1; | 3214 | int ret = 1; |
3209 | int compat, incompat; | 3215 | int compat, incompat; |
3210 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 3216 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
3211 | 3217 | ||
3212 | if (ext4_has_metadata_csum(sb)) { | 3218 | if (ext4_has_metadata_csum(sb)) { |
3213 | /* journal checksum v3 */ | 3219 | /* journal checksum v3 */ |
3214 | compat = 0; | 3220 | compat = 0; |
3215 | incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3; | 3221 | incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3; |
3216 | } else { | 3222 | } else { |
3217 | /* journal checksum v1 */ | 3223 | /* journal checksum v1 */ |
3218 | compat = JBD2_FEATURE_COMPAT_CHECKSUM; | 3224 | compat = JBD2_FEATURE_COMPAT_CHECKSUM; |
3219 | incompat = 0; | 3225 | incompat = 0; |
3220 | } | 3226 | } |
3221 | 3227 | ||
3222 | jbd2_journal_clear_features(sbi->s_journal, | 3228 | jbd2_journal_clear_features(sbi->s_journal, |
3223 | JBD2_FEATURE_COMPAT_CHECKSUM, 0, | 3229 | JBD2_FEATURE_COMPAT_CHECKSUM, 0, |
3224 | JBD2_FEATURE_INCOMPAT_CSUM_V3 | | 3230 | JBD2_FEATURE_INCOMPAT_CSUM_V3 | |
3225 | JBD2_FEATURE_INCOMPAT_CSUM_V2); | 3231 | JBD2_FEATURE_INCOMPAT_CSUM_V2); |
3226 | if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { | 3232 | if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { |
3227 | ret = jbd2_journal_set_features(sbi->s_journal, | 3233 | ret = jbd2_journal_set_features(sbi->s_journal, |
3228 | compat, 0, | 3234 | compat, 0, |
3229 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | | 3235 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | |
3230 | incompat); | 3236 | incompat); |
3231 | } else if (test_opt(sb, JOURNAL_CHECKSUM)) { | 3237 | } else if (test_opt(sb, JOURNAL_CHECKSUM)) { |
3232 | ret = jbd2_journal_set_features(sbi->s_journal, | 3238 | ret = jbd2_journal_set_features(sbi->s_journal, |
3233 | compat, 0, | 3239 | compat, 0, |
3234 | incompat); | 3240 | incompat); |
3235 | jbd2_journal_clear_features(sbi->s_journal, 0, 0, | 3241 | jbd2_journal_clear_features(sbi->s_journal, 0, 0, |
3236 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); | 3242 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); |
3237 | } else { | 3243 | } else { |
3238 | jbd2_journal_clear_features(sbi->s_journal, 0, 0, | 3244 | jbd2_journal_clear_features(sbi->s_journal, 0, 0, |
3239 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); | 3245 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); |
3240 | } | 3246 | } |
3241 | 3247 | ||
3242 | return ret; | 3248 | return ret; |
3243 | } | 3249 | } |
3244 | 3250 | ||
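A note for readers following the checksum-feature selection above: here is a minimal userspace sketch of the same decision table. The bit values and flag names are illustrative stand-ins, not the real JBD2 constants:

    #include <stdio.h>

    #define COMPAT_CHECKSUM   0x1   /* stand-in: journal checksum v1 */
    #define INCOMPAT_CSUM_V3  0x2   /* stand-in: journal checksum v3 */
    #define INCOMPAT_ASYNC    0x4   /* stand-in: async commit        */

    static void pick_features(int metadata_csum, int async_commit,
                              int journal_checksum,
                              unsigned *compat, unsigned *incompat)
    {
            /* v3 when the fs carries metadata checksums, v1 otherwise */
            *compat   = metadata_csum ? 0 : COMPAT_CHECKSUM;
            *incompat = metadata_csum ? INCOMPAT_CSUM_V3 : 0;
            if (async_commit)
                    *incompat |= INCOMPAT_ASYNC;
            else if (!journal_checksum)
                    *compat = *incompat = 0;   /* nothing requested */
    }

    int main(void)
    {
            unsigned c, i;
            pick_features(1, 0, 1, &c, &i);
            printf("compat=%#x incompat=%#x\n", c, i);  /* 0 and 0x2 */
            return 0;
    }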
3245 | /* | 3251 | /* |
3246 | * Note: calculating the overhead so we can be compatible with | 3252 | * Note: calculating the overhead so we can be compatible with |
3247 | * historical BSD practice is quite difficult in the face of | 3253 | * historical BSD practice is quite difficult in the face of |
3248 | * clusters/bigalloc. This is because multiple metadata blocks from | 3254 | * clusters/bigalloc. This is because multiple metadata blocks from |
3249 | * different block groups can end up in the same allocation cluster. | 3255 | * different block groups can end up in the same allocation cluster. |
3250 | * Calculating the exact overhead in the face of clustered allocation | 3256 | * Calculating the exact overhead in the face of clustered allocation |
3251 | * requires either O(all block bitmaps) in memory or O(number of block | 3257 | * requires either O(all block bitmaps) in memory or O(number of block |
3252 | * groups**2) in time. We will still calculate the overhead for | 3258 | * groups**2) in time. We will still calculate the overhead for |
3253 | * older file systems --- and if we come across a bigalloc file | 3259 | * older file systems --- and if we come across a bigalloc file |
3254 | * system with zero in s_overhead_clusters the estimate will be close to | 3260 | * system with zero in s_overhead_clusters the estimate will be close to |
3255 | * correct especially for very large cluster sizes --- but for newer | 3261 | * correct especially for very large cluster sizes --- but for newer |
3256 | * file systems, it's better to calculate this figure once at mkfs | 3262 | * file systems, it's better to calculate this figure once at mkfs |
3257 | * time, and store it in the superblock. If the superblock value is | 3263 | * time, and store it in the superblock. If the superblock value is |
3258 | * present (even for non-bigalloc file systems), we will use it. | 3264 | * present (even for non-bigalloc file systems), we will use it. |
3259 | */ | 3265 | */ |
3260 | static int count_overhead(struct super_block *sb, ext4_group_t grp, | 3266 | static int count_overhead(struct super_block *sb, ext4_group_t grp, |
3261 | char *buf) | 3267 | char *buf) |
3262 | { | 3268 | { |
3263 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 3269 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
3264 | struct ext4_group_desc *gdp; | 3270 | struct ext4_group_desc *gdp; |
3265 | ext4_fsblk_t first_block, last_block, b; | 3271 | ext4_fsblk_t first_block, last_block, b; |
3266 | ext4_group_t i, ngroups = ext4_get_groups_count(sb); | 3272 | ext4_group_t i, ngroups = ext4_get_groups_count(sb); |
3267 | int s, j, count = 0; | 3273 | int s, j, count = 0; |
3268 | 3274 | ||
3269 | if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC)) | 3275 | if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC)) |
3270 | return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) + | 3276 | return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) + |
3271 | sbi->s_itb_per_group + 2); | 3277 | sbi->s_itb_per_group + 2); |
3272 | 3278 | ||
3273 | first_block = le32_to_cpu(sbi->s_es->s_first_data_block) + | 3279 | first_block = le32_to_cpu(sbi->s_es->s_first_data_block) + |
3274 | (grp * EXT4_BLOCKS_PER_GROUP(sb)); | 3280 | (grp * EXT4_BLOCKS_PER_GROUP(sb)); |
3275 | last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1; | 3281 | last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1; |
3276 | for (i = 0; i < ngroups; i++) { | 3282 | for (i = 0; i < ngroups; i++) { |
3277 | gdp = ext4_get_group_desc(sb, i, NULL); | 3283 | gdp = ext4_get_group_desc(sb, i, NULL); |
3278 | b = ext4_block_bitmap(sb, gdp); | 3284 | b = ext4_block_bitmap(sb, gdp); |
3279 | if (b >= first_block && b <= last_block) { | 3285 | if (b >= first_block && b <= last_block) { |
3280 | ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf); | 3286 | ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf); |
3281 | count++; | 3287 | count++; |
3282 | } | 3288 | } |
3283 | b = ext4_inode_bitmap(sb, gdp); | 3289 | b = ext4_inode_bitmap(sb, gdp); |
3284 | if (b >= first_block && b <= last_block) { | 3290 | if (b >= first_block && b <= last_block) { |
3285 | ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf); | 3291 | ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf); |
3286 | count++; | 3292 | count++; |
3287 | } | 3293 | } |
3288 | b = ext4_inode_table(sb, gdp); | 3294 | b = ext4_inode_table(sb, gdp); |
3289 | if (b >= first_block && b + sbi->s_itb_per_group <= last_block) | 3295 | if (b >= first_block && b + sbi->s_itb_per_group <= last_block) |
3290 | for (j = 0; j < sbi->s_itb_per_group; j++, b++) { | 3296 | for (j = 0; j < sbi->s_itb_per_group; j++, b++) { |
3291 | int c = EXT4_B2C(sbi, b - first_block); | 3297 | int c = EXT4_B2C(sbi, b - first_block); |
3292 | ext4_set_bit(c, buf); | 3298 | ext4_set_bit(c, buf); |
3293 | count++; | 3299 | count++; |
3294 | } | 3300 | } |
3295 | if (i != grp) | 3301 | if (i != grp) |
3296 | continue; | 3302 | continue; |
3297 | s = 0; | 3303 | s = 0; |
3298 | if (ext4_bg_has_super(sb, grp)) { | 3304 | if (ext4_bg_has_super(sb, grp)) { |
3299 | ext4_set_bit(s++, buf); | 3305 | ext4_set_bit(s++, buf); |
3300 | count++; | 3306 | count++; |
3301 | } | 3307 | } |
3302 | for (j = ext4_bg_num_gdb(sb, grp); j > 0; j--) { | 3308 | for (j = ext4_bg_num_gdb(sb, grp); j > 0; j--) { |
3303 | ext4_set_bit(EXT4_B2C(sbi, s++), buf); | 3309 | ext4_set_bit(EXT4_B2C(sbi, s++), buf); |
3304 | count++; | 3310 | count++; |
3305 | } | 3311 | } |
3306 | } | 3312 | } |
3307 | if (!count) | 3313 | if (!count) |
3308 | return 0; | 3314 | return 0; |
3309 | return EXT4_CLUSTERS_PER_GROUP(sb) - | 3315 | return EXT4_CLUSTERS_PER_GROUP(sb) - |
3310 | ext4_count_free(buf, EXT4_CLUSTERS_PER_GROUP(sb) / 8); | 3316 | ext4_count_free(buf, EXT4_CLUSTERS_PER_GROUP(sb) / 8); |
3311 | } | 3317 | } |
3312 | 3318 | ||
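Everything in the loop above hinges on EXT4_B2C(), which converts a group-relative block number into a cluster index so that metadata blocks sharing a cluster are counted only once. A self-contained sketch, assuming an example geometry of 4KiB blocks and 64KiB clusters (cluster_bits = 4):

    #include <stdio.h>

    static const unsigned cluster_bits = 4;  /* assumed: 64KiB clusters */

    /* Mirrors EXT4_B2C(sbi, blk): shift out the block-to-cluster ratio. */
    static unsigned long b2c(unsigned long blk)
    {
            return blk >> cluster_bits;
    }

    int main(void)
    {
            /* Blocks 0..15 all map to cluster 0: a block bitmap at block 5
             * and an inode bitmap at block 9 cost one overhead cluster,
             * which is why the code sets bits in a cluster bitmap rather
             * than simply summing block counts. */
            printf("b2c(5)=%lu b2c(9)=%lu b2c(16)=%lu\n",
                   b2c(5), b2c(9), b2c(16));  /* 0, 0, 1 */
            return 0;
    }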
3313 | /* | 3319 | /* |
3314 | * Compute the overhead and stash it in sbi->s_overhead | 3320 | * Compute the overhead and stash it in sbi->s_overhead |
3315 | */ | 3321 | */ |
3316 | int ext4_calculate_overhead(struct super_block *sb) | 3322 | int ext4_calculate_overhead(struct super_block *sb) |
3317 | { | 3323 | { |
3318 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 3324 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
3319 | struct ext4_super_block *es = sbi->s_es; | 3325 | struct ext4_super_block *es = sbi->s_es; |
3320 | ext4_group_t i, ngroups = ext4_get_groups_count(sb); | 3326 | ext4_group_t i, ngroups = ext4_get_groups_count(sb); |
3321 | ext4_fsblk_t overhead = 0; | 3327 | ext4_fsblk_t overhead = 0; |
3322 | char *buf = (char *) get_zeroed_page(GFP_NOFS); | 3328 | char *buf = (char *) get_zeroed_page(GFP_NOFS); |
3323 | 3329 | ||
3324 | if (!buf) | 3330 | if (!buf) |
3325 | return -ENOMEM; | 3331 | return -ENOMEM; |
3326 | 3332 | ||
3327 | /* | 3333 | /* |
3328 | * Compute the overhead (FS structures). This is constant | 3334 | * Compute the overhead (FS structures). This is constant |
3329 | * for a given filesystem unless the number of block groups | 3335 | * for a given filesystem unless the number of block groups |
3330 | * changes, so we cache the previous value until it does. | 3336 | * changes, so we cache the previous value until it does. |
3331 | */ | 3337 | */ |
3332 | 3338 | ||
3333 | /* | 3339 | /* |
3334 | * All of the blocks before first_data_block are overhead | 3340 | * All of the blocks before first_data_block are overhead |
3335 | */ | 3341 | */ |
3336 | overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block)); | 3342 | overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block)); |
3337 | 3343 | ||
3338 | /* | 3344 | /* |
3339 | * Add the overhead found in each block group | 3345 | * Add the overhead found in each block group |
3340 | */ | 3346 | */ |
3341 | for (i = 0; i < ngroups; i++) { | 3347 | for (i = 0; i < ngroups; i++) { |
3342 | int blks; | 3348 | int blks; |
3343 | 3349 | ||
3344 | blks = count_overhead(sb, i, buf); | 3350 | blks = count_overhead(sb, i, buf); |
3345 | overhead += blks; | 3351 | overhead += blks; |
3346 | if (blks) | 3352 | if (blks) |
3347 | memset(buf, 0, PAGE_SIZE); | 3353 | memset(buf, 0, PAGE_SIZE); |
3348 | cond_resched(); | 3354 | cond_resched(); |
3349 | } | 3355 | } |
3350 | /* Add the internal journal blocks as well */ | 3356 | /* Add the internal journal blocks as well */ |
3351 | if (sbi->s_journal && !sbi->journal_bdev) | 3357 | if (sbi->s_journal && !sbi->journal_bdev) |
3352 | overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen); | 3358 | overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen); |
3353 | 3359 | ||
3354 | sbi->s_overhead = overhead; | 3360 | sbi->s_overhead = overhead; |
3355 | smp_wmb(); | 3361 | smp_wmb(); |
3356 | free_page((unsigned long) buf); | 3362 | free_page((unsigned long) buf); |
3357 | return 0; | 3363 | return 0; |
3358 | } | 3364 | } |
3359 | 3365 | ||
3360 | 3366 | ||
3361 | static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb) | 3367 | static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb) |
3362 | { | 3368 | { |
3363 | ext4_fsblk_t resv_clusters; | 3369 | ext4_fsblk_t resv_clusters; |
3364 | 3370 | ||
3365 | /* | 3371 | /* |
3366 | * There's no need to reserve anything when we aren't using extents. | 3372 | * There's no need to reserve anything when we aren't using extents. |
3367 | * The space estimates are exact, there are no unwritten extents, | 3373 | * The space estimates are exact, there are no unwritten extents, |
3368 | * hole punching doesn't need new metadata... This is needed especially | 3374 | * hole punching doesn't need new metadata... This is needed especially |
3369 | * to keep ext2/3 backward compatibility. | 3375 | * to keep ext2/3 backward compatibility. |
3370 | */ | 3376 | */ |
3371 | if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) | 3377 | if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) |
3372 | return 0; | 3378 | return 0; |
3373 | /* | 3379 | /* |
3374 | * By default we reserve 2% or 4096 clusters, whichever is smaller. | 3380 | * By default we reserve 2% or 4096 clusters, whichever is smaller. |
3375 | * This should cover the situations where we cannot afford to run | 3381 | * This should cover the situations where we cannot afford to run |
3376 | * out of space, such as punching a hole or converting | 3382 | * out of space, such as punching a hole or converting |
3377 | * unwritten extents in the delalloc path. In most cases such an | 3383 | * unwritten extents in the delalloc path. In most cases such an |
3378 | * allocation requires only one or two blocks; higher numbers are | 3384 | * allocation requires only one or two blocks; higher numbers are |
3379 | * very rare. | 3385 | * very rare. |
3380 | */ | 3386 | */ |
3381 | resv_clusters = ext4_blocks_count(EXT4_SB(sb)->s_es) >> | 3387 | resv_clusters = ext4_blocks_count(EXT4_SB(sb)->s_es) >> |
3382 | EXT4_SB(sb)->s_cluster_bits; | 3388 | EXT4_SB(sb)->s_cluster_bits; |
3383 | 3389 | ||
3384 | do_div(resv_clusters, 50); | 3390 | do_div(resv_clusters, 50); |
3385 | resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096); | 3391 | resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096); |
3386 | 3392 | ||
3387 | return resv_clusters; | 3393 | return resv_clusters; |
3388 | } | 3394 | } |
3389 | 3395 | ||
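Worked numbers for the reservation policy above, as a self-contained sketch; the cluster counts are made-up inputs:

    #include <stdio.h>
    #include <stdint.h>

    /* Mirrors ext4_calculate_resv_clusters(): 2% of all clusters,
     * capped at 4096. */
    static uint64_t resv_clusters(uint64_t clusters)
    {
            uint64_t resv = clusters / 50;     /* the do_div(..., 50)  */
            return resv < 4096 ? resv : 4096;  /* the min_t(..., 4096) */
    }

    int main(void)
    {
            printf("%llu\n", (unsigned long long)resv_clusters(100000));
            /* -> 2000: on a small fs the 2% figure wins */
            printf("%llu\n", (unsigned long long)resv_clusters(10000000));
            /* -> 4096: on a large fs the cap wins */
            return 0;
    }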
3390 | 3396 | ||
3391 | static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count) | 3397 | static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count) |
3392 | { | 3398 | { |
3393 | ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >> | 3399 | ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >> |
3394 | sbi->s_cluster_bits; | 3400 | sbi->s_cluster_bits; |
3395 | 3401 | ||
3396 | if (count >= clusters) | 3402 | if (count >= clusters) |
3397 | return -EINVAL; | 3403 | return -EINVAL; |
3398 | 3404 | ||
3399 | atomic64_set(&sbi->s_resv_clusters, count); | 3405 | atomic64_set(&sbi->s_resv_clusters, count); |
3400 | return 0; | 3406 | return 0; |
3401 | } | 3407 | } |
3402 | 3408 | ||
3403 | static int ext4_fill_super(struct super_block *sb, void *data, int silent) | 3409 | static int ext4_fill_super(struct super_block *sb, void *data, int silent) |
3404 | { | 3410 | { |
3405 | char *orig_data = kstrdup(data, GFP_KERNEL); | 3411 | char *orig_data = kstrdup(data, GFP_KERNEL); |
3406 | struct buffer_head *bh; | 3412 | struct buffer_head *bh; |
3407 | struct ext4_super_block *es = NULL; | 3413 | struct ext4_super_block *es = NULL; |
3408 | struct ext4_sb_info *sbi; | 3414 | struct ext4_sb_info *sbi; |
3409 | ext4_fsblk_t block; | 3415 | ext4_fsblk_t block; |
3410 | ext4_fsblk_t sb_block = get_sb_block(&data); | 3416 | ext4_fsblk_t sb_block = get_sb_block(&data); |
3411 | ext4_fsblk_t logical_sb_block; | 3417 | ext4_fsblk_t logical_sb_block; |
3412 | unsigned long offset = 0; | 3418 | unsigned long offset = 0; |
3413 | unsigned long journal_devnum = 0; | 3419 | unsigned long journal_devnum = 0; |
3414 | unsigned long def_mount_opts; | 3420 | unsigned long def_mount_opts; |
3415 | struct inode *root; | 3421 | struct inode *root; |
3416 | char *cp; | 3422 | char *cp; |
3417 | const char *descr; | 3423 | const char *descr; |
3418 | int ret = -ENOMEM; | 3424 | int ret = -ENOMEM; |
3419 | int blocksize, clustersize; | 3425 | int blocksize, clustersize; |
3420 | unsigned int db_count; | 3426 | unsigned int db_count; |
3421 | unsigned int i; | 3427 | unsigned int i; |
3422 | int needs_recovery, has_huge_files, has_bigalloc; | 3428 | int needs_recovery, has_huge_files, has_bigalloc; |
3423 | __u64 blocks_count; | 3429 | __u64 blocks_count; |
3424 | int err = 0; | 3430 | int err = 0; |
3425 | unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; | 3431 | unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; |
3426 | ext4_group_t first_not_zeroed; | 3432 | ext4_group_t first_not_zeroed; |
3427 | 3433 | ||
3428 | sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); | 3434 | sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); |
3429 | if (!sbi) | 3435 | if (!sbi) |
3430 | goto out_free_orig; | 3436 | goto out_free_orig; |
3431 | 3437 | ||
3432 | sbi->s_blockgroup_lock = | 3438 | sbi->s_blockgroup_lock = |
3433 | kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); | 3439 | kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); |
3434 | if (!sbi->s_blockgroup_lock) { | 3440 | if (!sbi->s_blockgroup_lock) { |
3435 | kfree(sbi); | 3441 | kfree(sbi); |
3436 | goto out_free_orig; | 3442 | goto out_free_orig; |
3437 | } | 3443 | } |
3438 | sb->s_fs_info = sbi; | 3444 | sb->s_fs_info = sbi; |
3439 | sbi->s_sb = sb; | 3445 | sbi->s_sb = sb; |
3440 | sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; | 3446 | sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; |
3441 | sbi->s_sb_block = sb_block; | 3447 | sbi->s_sb_block = sb_block; |
3442 | if (sb->s_bdev->bd_part) | 3448 | if (sb->s_bdev->bd_part) |
3443 | sbi->s_sectors_written_start = | 3449 | sbi->s_sectors_written_start = |
3444 | part_stat_read(sb->s_bdev->bd_part, sectors[1]); | 3450 | part_stat_read(sb->s_bdev->bd_part, sectors[1]); |
3445 | 3451 | ||
3446 | /* Cleanup superblock name */ | 3452 | /* Cleanup superblock name */ |
3447 | for (cp = sb->s_id; (cp = strchr(cp, '/'));) | 3453 | for (cp = sb->s_id; (cp = strchr(cp, '/'));) |
3448 | *cp = '!'; | 3454 | *cp = '!'; |
3449 | 3455 | ||
3450 | /* -EINVAL is default */ | 3456 | /* -EINVAL is default */ |
3451 | ret = -EINVAL; | 3457 | ret = -EINVAL; |
3452 | blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); | 3458 | blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); |
3453 | if (!blocksize) { | 3459 | if (!blocksize) { |
3454 | ext4_msg(sb, KERN_ERR, "unable to set blocksize"); | 3460 | ext4_msg(sb, KERN_ERR, "unable to set blocksize"); |
3455 | goto out_fail; | 3461 | goto out_fail; |
3456 | } | 3462 | } |
3457 | 3463 | ||
3458 | /* | 3464 | /* |
3459 | * The ext4 superblock will not be aligned to the buffer start for | 3465 | * The ext4 superblock will not be aligned to the buffer start for |
3460 | * block sizes other than 1kB, so calculate the offset within the buffer. | 3466 | * block sizes other than 1kB, so calculate the offset within the buffer. |
3461 | */ | 3467 | */ |
3462 | if (blocksize != EXT4_MIN_BLOCK_SIZE) { | 3468 | if (blocksize != EXT4_MIN_BLOCK_SIZE) { |
3463 | logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; | 3469 | logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; |
3464 | offset = do_div(logical_sb_block, blocksize); | 3470 | offset = do_div(logical_sb_block, blocksize); |
3465 | } else { | 3471 | } else { |
3466 | logical_sb_block = sb_block; | 3472 | logical_sb_block = sb_block; |
3467 | } | 3473 | } |
3468 | 3474 | ||
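To make the branch above concrete: with the default sb_block of 1 (expressed in 1KiB units) and an assumed 4KiB minimum block size, the superblock's 1024-byte device offset falls inside logical block 0:

    #include <stdio.h>

    int main(void)
    {
            unsigned long long sb_block = 1;    /* default, 1KiB units      */
            unsigned int blocksize = 4096;      /* assumed sb_min_blocksize */
            unsigned long long bytes = sb_block * 1024;  /* EXT4_MIN_BLOCK_SIZE */

            /* do_div() above returns the remainder and leaves the
             * quotient in place; plain / and % show the same split. */
            printf("logical_sb_block=%llu offset=%llu\n",
                   bytes / blocksize, bytes % blocksize);  /* 0 and 1024 */
            return 0;
    }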
3469 | if (!(bh = sb_bread_unmovable(sb, logical_sb_block))) { | 3475 | if (!(bh = sb_bread_unmovable(sb, logical_sb_block))) { |
3470 | ext4_msg(sb, KERN_ERR, "unable to read superblock"); | 3476 | ext4_msg(sb, KERN_ERR, "unable to read superblock"); |
3471 | goto out_fail; | 3477 | goto out_fail; |
3472 | } | 3478 | } |
3473 | /* | 3479 | /* |
3474 | * Note: s_es must be initialized as soon as possible because | 3480 | * Note: s_es must be initialized as soon as possible because |
3475 | * some ext4 macros depend on its value | 3481 | * some ext4 macros depend on its value |
3476 | */ | 3482 | */ |
3477 | es = (struct ext4_super_block *) (bh->b_data + offset); | 3483 | es = (struct ext4_super_block *) (bh->b_data + offset); |
3478 | sbi->s_es = es; | 3484 | sbi->s_es = es; |
3479 | sb->s_magic = le16_to_cpu(es->s_magic); | 3485 | sb->s_magic = le16_to_cpu(es->s_magic); |
3480 | if (sb->s_magic != EXT4_SUPER_MAGIC) | 3486 | if (sb->s_magic != EXT4_SUPER_MAGIC) |
3481 | goto cantfind_ext4; | 3487 | goto cantfind_ext4; |
3482 | sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written); | 3488 | sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written); |
3483 | 3489 | ||
3484 | /* Warn if metadata_csum and gdt_csum are both set. */ | 3490 | /* Warn if metadata_csum and gdt_csum are both set. */ |
3485 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, | 3491 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, |
3486 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && | 3492 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && |
3487 | EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) | 3493 | EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) |
3488 | ext4_warning(sb, "metadata_csum and uninit_bg are " | 3494 | ext4_warning(sb, "metadata_csum and uninit_bg are " |
3489 | "redundant flags; please run fsck."); | 3495 | "redundant flags; please run fsck."); |
3490 | 3496 | ||
3491 | /* Check for a known checksum algorithm */ | 3497 | /* Check for a known checksum algorithm */ |
3492 | if (!ext4_verify_csum_type(sb, es)) { | 3498 | if (!ext4_verify_csum_type(sb, es)) { |
3493 | ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " | 3499 | ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " |
3494 | "unknown checksum algorithm."); | 3500 | "unknown checksum algorithm."); |
3495 | silent = 1; | 3501 | silent = 1; |
3496 | goto cantfind_ext4; | 3502 | goto cantfind_ext4; |
3497 | } | 3503 | } |
3498 | 3504 | ||
3499 | /* Load the checksum driver */ | 3505 | /* Load the checksum driver */ |
3500 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, | 3506 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, |
3501 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { | 3507 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { |
3502 | sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); | 3508 | sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); |
3503 | if (IS_ERR(sbi->s_chksum_driver)) { | 3509 | if (IS_ERR(sbi->s_chksum_driver)) { |
3504 | ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver."); | 3510 | ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver."); |
3505 | ret = PTR_ERR(sbi->s_chksum_driver); | 3511 | ret = PTR_ERR(sbi->s_chksum_driver); |
3506 | sbi->s_chksum_driver = NULL; | 3512 | sbi->s_chksum_driver = NULL; |
3507 | goto failed_mount; | 3513 | goto failed_mount; |
3508 | } | 3514 | } |
3509 | } | 3515 | } |
3510 | 3516 | ||
3511 | /* Check superblock checksum */ | 3517 | /* Check superblock checksum */ |
3512 | if (!ext4_superblock_csum_verify(sb, es)) { | 3518 | if (!ext4_superblock_csum_verify(sb, es)) { |
3513 | ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " | 3519 | ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " |
3514 | "invalid superblock checksum. Run e2fsck?"); | 3520 | "invalid superblock checksum. Run e2fsck?"); |
3515 | silent = 1; | 3521 | silent = 1; |
3516 | goto cantfind_ext4; | 3522 | goto cantfind_ext4; |
3517 | } | 3523 | } |
3518 | 3524 | ||
3519 | /* Precompute checksum seed for all metadata */ | 3525 | /* Precompute checksum seed for all metadata */ |
3520 | if (ext4_has_metadata_csum(sb)) | 3526 | if (ext4_has_metadata_csum(sb)) |
3521 | sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, | 3527 | sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, |
3522 | sizeof(es->s_uuid)); | 3528 | sizeof(es->s_uuid)); |
3523 | 3529 | ||
3524 | /* Set defaults before we parse the mount options */ | 3530 | /* Set defaults before we parse the mount options */ |
3525 | def_mount_opts = le32_to_cpu(es->s_default_mount_opts); | 3531 | def_mount_opts = le32_to_cpu(es->s_default_mount_opts); |
3526 | set_opt(sb, INIT_INODE_TABLE); | 3532 | set_opt(sb, INIT_INODE_TABLE); |
3527 | if (def_mount_opts & EXT4_DEFM_DEBUG) | 3533 | if (def_mount_opts & EXT4_DEFM_DEBUG) |
3528 | set_opt(sb, DEBUG); | 3534 | set_opt(sb, DEBUG); |
3529 | if (def_mount_opts & EXT4_DEFM_BSDGROUPS) | 3535 | if (def_mount_opts & EXT4_DEFM_BSDGROUPS) |
3530 | set_opt(sb, GRPID); | 3536 | set_opt(sb, GRPID); |
3531 | if (def_mount_opts & EXT4_DEFM_UID16) | 3537 | if (def_mount_opts & EXT4_DEFM_UID16) |
3532 | set_opt(sb, NO_UID32); | 3538 | set_opt(sb, NO_UID32); |
3533 | /* xattr user namespace & acls are now defaulted on */ | 3539 | /* xattr user namespace & acls are now defaulted on */ |
3534 | set_opt(sb, XATTR_USER); | 3540 | set_opt(sb, XATTR_USER); |
3535 | #ifdef CONFIG_EXT4_FS_POSIX_ACL | 3541 | #ifdef CONFIG_EXT4_FS_POSIX_ACL |
3536 | set_opt(sb, POSIX_ACL); | 3542 | set_opt(sb, POSIX_ACL); |
3537 | #endif | 3543 | #endif |
3538 | /* don't forget to enable journal_csum when metadata_csum is enabled. */ | 3544 | /* don't forget to enable journal_csum when metadata_csum is enabled. */ |
3539 | if (ext4_has_metadata_csum(sb)) | 3545 | if (ext4_has_metadata_csum(sb)) |
3540 | set_opt(sb, JOURNAL_CHECKSUM); | 3546 | set_opt(sb, JOURNAL_CHECKSUM); |
3541 | 3547 | ||
3542 | if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) | 3548 | if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) |
3543 | set_opt(sb, JOURNAL_DATA); | 3549 | set_opt(sb, JOURNAL_DATA); |
3544 | else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) | 3550 | else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) |
3545 | set_opt(sb, ORDERED_DATA); | 3551 | set_opt(sb, ORDERED_DATA); |
3546 | else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) | 3552 | else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) |
3547 | set_opt(sb, WRITEBACK_DATA); | 3553 | set_opt(sb, WRITEBACK_DATA); |
3548 | 3554 | ||
3549 | if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) | 3555 | if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) |
3550 | set_opt(sb, ERRORS_PANIC); | 3556 | set_opt(sb, ERRORS_PANIC); |
3551 | else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE) | 3557 | else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE) |
3552 | set_opt(sb, ERRORS_CONT); | 3558 | set_opt(sb, ERRORS_CONT); |
3553 | else | 3559 | else |
3554 | set_opt(sb, ERRORS_RO); | 3560 | set_opt(sb, ERRORS_RO); |
3555 | /* block_validity enabled by default; disable with noblock_validity */ | 3561 | /* block_validity enabled by default; disable with noblock_validity */ |
3556 | set_opt(sb, BLOCK_VALIDITY); | 3562 | set_opt(sb, BLOCK_VALIDITY); |
3557 | if (def_mount_opts & EXT4_DEFM_DISCARD) | 3563 | if (def_mount_opts & EXT4_DEFM_DISCARD) |
3558 | set_opt(sb, DISCARD); | 3564 | set_opt(sb, DISCARD); |
3559 | 3565 | ||
3560 | sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid)); | 3566 | sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid)); |
3561 | sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid)); | 3567 | sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid)); |
3562 | sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; | 3568 | sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; |
3563 | sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; | 3569 | sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; |
3564 | sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; | 3570 | sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; |
3565 | 3571 | ||
3566 | if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) | 3572 | if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) |
3567 | set_opt(sb, BARRIER); | 3573 | set_opt(sb, BARRIER); |
3568 | 3574 | ||
3569 | /* | 3575 | /* |
3570 | * Enable delayed allocation by default. | 3576 | * Enable delayed allocation by default. |
3571 | * Use -o nodelalloc to turn it off. | 3577 | * Use -o nodelalloc to turn it off. |
3572 | */ | 3578 | */ |
3573 | if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) && | 3579 | if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) && |
3574 | ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) | 3580 | ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) |
3575 | set_opt(sb, DELALLOC); | 3581 | set_opt(sb, DELALLOC); |
3576 | 3582 | ||
3577 | /* | 3583 | /* |
3578 | * Set the default s_li_wait_mult for lazyinit, in case no mount | 3584 | * Set the default s_li_wait_mult for lazyinit, in case no mount |
3579 | * option is specified. | 3585 | * option is specified. |
3580 | */ | 3586 | */ |
3581 | sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; | 3587 | sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; |
3582 | 3588 | ||
3583 | if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, | 3589 | if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, |
3584 | &journal_devnum, &journal_ioprio, 0)) { | 3590 | &journal_devnum, &journal_ioprio, 0)) { |
3585 | ext4_msg(sb, KERN_WARNING, | 3591 | ext4_msg(sb, KERN_WARNING, |
3586 | "failed to parse options in superblock: %s", | 3592 | "failed to parse options in superblock: %s", |
3587 | sbi->s_es->s_mount_opts); | 3593 | sbi->s_es->s_mount_opts); |
3588 | } | 3594 | } |
3589 | sbi->s_def_mount_opt = sbi->s_mount_opt; | 3595 | sbi->s_def_mount_opt = sbi->s_mount_opt; |
3590 | if (!parse_options((char *) data, sb, &journal_devnum, | 3596 | if (!parse_options((char *) data, sb, &journal_devnum, |
3591 | &journal_ioprio, 0)) | 3597 | &journal_ioprio, 0)) |
3592 | goto failed_mount; | 3598 | goto failed_mount; |
3593 | 3599 | ||
3594 | if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { | 3600 | if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { |
3595 | printk_once(KERN_WARNING "EXT4-fs: Warning: mounting " | 3601 | printk_once(KERN_WARNING "EXT4-fs: Warning: mounting " |
3596 | "with data=journal disables delayed " | 3602 | "with data=journal disables delayed " |
3597 | "allocation and O_DIRECT support!\n"); | 3603 | "allocation and O_DIRECT support!\n"); |
3598 | if (test_opt2(sb, EXPLICIT_DELALLOC)) { | 3604 | if (test_opt2(sb, EXPLICIT_DELALLOC)) { |
3599 | ext4_msg(sb, KERN_ERR, "can't mount with " | 3605 | ext4_msg(sb, KERN_ERR, "can't mount with " |
3600 | "both data=journal and delalloc"); | 3606 | "both data=journal and delalloc"); |
3601 | goto failed_mount; | 3607 | goto failed_mount; |
3602 | } | 3608 | } |
3603 | if (test_opt(sb, DIOREAD_NOLOCK)) { | 3609 | if (test_opt(sb, DIOREAD_NOLOCK)) { |
3604 | ext4_msg(sb, KERN_ERR, "can't mount with " | 3610 | ext4_msg(sb, KERN_ERR, "can't mount with " |
3605 | "both data=journal and dioread_nolock"); | 3611 | "both data=journal and dioread_nolock"); |
3606 | goto failed_mount; | 3612 | goto failed_mount; |
3607 | } | 3613 | } |
3608 | if (test_opt(sb, DAX)) { | 3614 | if (test_opt(sb, DAX)) { |
3609 | ext4_msg(sb, KERN_ERR, "can't mount with " | 3615 | ext4_msg(sb, KERN_ERR, "can't mount with " |
3610 | "both data=journal and dax"); | 3616 | "both data=journal and dax"); |
3611 | goto failed_mount; | 3617 | goto failed_mount; |
3612 | } | 3618 | } |
3613 | if (test_opt(sb, DELALLOC)) | 3619 | if (test_opt(sb, DELALLOC)) |
3614 | clear_opt(sb, DELALLOC); | 3620 | clear_opt(sb, DELALLOC); |
3615 | } | 3621 | } |
3616 | 3622 | ||
3617 | sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | | 3623 | sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | |
3618 | (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); | 3624 | (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); |
3619 | 3625 | ||
3620 | if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && | 3626 | if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && |
3621 | (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) || | 3627 | (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) || |
3622 | EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) || | 3628 | EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) || |
3623 | EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U))) | 3629 | EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U))) |
3624 | ext4_msg(sb, KERN_WARNING, | 3630 | ext4_msg(sb, KERN_WARNING, |
3625 | "feature flags set on rev 0 fs, " | 3631 | "feature flags set on rev 0 fs, " |
3626 | "running e2fsck is recommended"); | 3632 | "running e2fsck is recommended"); |
3627 | 3633 | ||
3628 | if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) { | 3634 | if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) { |
3629 | set_opt2(sb, HURD_COMPAT); | 3635 | set_opt2(sb, HURD_COMPAT); |
3630 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, | 3636 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, |
3631 | EXT4_FEATURE_INCOMPAT_64BIT)) { | 3637 | EXT4_FEATURE_INCOMPAT_64BIT)) { |
3632 | ext4_msg(sb, KERN_ERR, | 3638 | ext4_msg(sb, KERN_ERR, |
3633 | "The Hurd can't support 64-bit file systems"); | 3639 | "The Hurd can't support 64-bit file systems"); |
3634 | goto failed_mount; | 3640 | goto failed_mount; |
3635 | } | 3641 | } |
3636 | } | 3642 | } |
3637 | 3643 | ||
3638 | if (IS_EXT2_SB(sb)) { | 3644 | if (IS_EXT2_SB(sb)) { |
3639 | if (ext2_feature_set_ok(sb)) | 3645 | if (ext2_feature_set_ok(sb)) |
3640 | ext4_msg(sb, KERN_INFO, "mounting ext2 file system " | 3646 | ext4_msg(sb, KERN_INFO, "mounting ext2 file system " |
3641 | "using the ext4 subsystem"); | 3647 | "using the ext4 subsystem"); |
3642 | else { | 3648 | else { |
3643 | ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due " | 3649 | ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due " |
3644 | "to feature incompatibilities"); | 3650 | "to feature incompatibilities"); |
3645 | goto failed_mount; | 3651 | goto failed_mount; |
3646 | } | 3652 | } |
3647 | } | 3653 | } |
3648 | 3654 | ||
3649 | if (IS_EXT3_SB(sb)) { | 3655 | if (IS_EXT3_SB(sb)) { |
3650 | if (ext3_feature_set_ok(sb)) | 3656 | if (ext3_feature_set_ok(sb)) |
3651 | ext4_msg(sb, KERN_INFO, "mounting ext3 file system " | 3657 | ext4_msg(sb, KERN_INFO, "mounting ext3 file system " |
3652 | "using the ext4 subsystem"); | 3658 | "using the ext4 subsystem"); |
3653 | else { | 3659 | else { |
3654 | ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due " | 3660 | ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due " |
3655 | "to feature incompatibilities"); | 3661 | "to feature incompatibilities"); |
3656 | goto failed_mount; | 3662 | goto failed_mount; |
3657 | } | 3663 | } |
3658 | } | 3664 | } |
3659 | 3665 | ||
3660 | /* | 3666 | /* |
3661 | * Check feature flags regardless of the revision level, since we | 3667 | * Check feature flags regardless of the revision level, since we |
3662 | * previously didn't change the revision level when setting the flags, | 3668 | * previously didn't change the revision level when setting the flags, |
3663 | * so there is a chance incompat flags are set on a rev 0 filesystem. | 3669 | * so there is a chance incompat flags are set on a rev 0 filesystem. |
3664 | */ | 3670 | */ |
3665 | if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY))) | 3671 | if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY))) |
3666 | goto failed_mount; | 3672 | goto failed_mount; |
3667 | 3673 | ||
3668 | blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); | 3674 | blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); |
3669 | if (blocksize < EXT4_MIN_BLOCK_SIZE || | 3675 | if (blocksize < EXT4_MIN_BLOCK_SIZE || |
3670 | blocksize > EXT4_MAX_BLOCK_SIZE) { | 3676 | blocksize > EXT4_MAX_BLOCK_SIZE) { |
3671 | ext4_msg(sb, KERN_ERR, | 3677 | ext4_msg(sb, KERN_ERR, |
3672 | "Unsupported filesystem blocksize %d", blocksize); | 3678 | "Unsupported filesystem blocksize %d", blocksize); |
3673 | goto failed_mount; | 3679 | goto failed_mount; |
3674 | } | 3680 | } |
3675 | 3681 | ||
3676 | if (sbi->s_mount_opt & EXT4_MOUNT_DAX) { | 3682 | if (sbi->s_mount_opt & EXT4_MOUNT_DAX) { |
3677 | if (blocksize != PAGE_SIZE) { | 3683 | if (blocksize != PAGE_SIZE) { |
3678 | ext4_msg(sb, KERN_ERR, | 3684 | ext4_msg(sb, KERN_ERR, |
3679 | "error: unsupported blocksize for dax"); | 3685 | "error: unsupported blocksize for dax"); |
3680 | goto failed_mount; | 3686 | goto failed_mount; |
3681 | } | 3687 | } |
3682 | if (!sb->s_bdev->bd_disk->fops->direct_access) { | 3688 | if (!sb->s_bdev->bd_disk->fops->direct_access) { |
3683 | ext4_msg(sb, KERN_ERR, | 3689 | ext4_msg(sb, KERN_ERR, |
3684 | "error: device does not support dax"); | 3690 | "error: device does not support dax"); |
3685 | goto failed_mount; | 3691 | goto failed_mount; |
3686 | } | 3692 | } |
3687 | } | 3693 | } |
3688 | 3694 | ||
3689 | if (sb->s_blocksize != blocksize) { | 3695 | if (sb->s_blocksize != blocksize) { |
3690 | /* Validate the filesystem blocksize */ | 3696 | /* Validate the filesystem blocksize */ |
3691 | if (!sb_set_blocksize(sb, blocksize)) { | 3697 | if (!sb_set_blocksize(sb, blocksize)) { |
3692 | ext4_msg(sb, KERN_ERR, "bad block size %d", | 3698 | ext4_msg(sb, KERN_ERR, "bad block size %d", |
3693 | blocksize); | 3699 | blocksize); |
3694 | goto failed_mount; | 3700 | goto failed_mount; |
3695 | } | 3701 | } |
3696 | 3702 | ||
3697 | brelse(bh); | 3703 | brelse(bh); |
3698 | logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; | 3704 | logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; |
3699 | offset = do_div(logical_sb_block, blocksize); | 3705 | offset = do_div(logical_sb_block, blocksize); |
3700 | bh = sb_bread_unmovable(sb, logical_sb_block); | 3706 | bh = sb_bread_unmovable(sb, logical_sb_block); |
3701 | if (!bh) { | 3707 | if (!bh) { |
3702 | ext4_msg(sb, KERN_ERR, | 3708 | ext4_msg(sb, KERN_ERR, |
3703 | "Can't read superblock on 2nd try"); | 3709 | "Can't read superblock on 2nd try"); |
3704 | goto failed_mount; | 3710 | goto failed_mount; |
3705 | } | 3711 | } |
3706 | es = (struct ext4_super_block *)(bh->b_data + offset); | 3712 | es = (struct ext4_super_block *)(bh->b_data + offset); |
3707 | sbi->s_es = es; | 3713 | sbi->s_es = es; |
3708 | if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { | 3714 | if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { |
3709 | ext4_msg(sb, KERN_ERR, | 3715 | ext4_msg(sb, KERN_ERR, |
3710 | "Magic mismatch, very weird!"); | 3716 | "Magic mismatch, very weird!"); |
3711 | goto failed_mount; | 3717 | goto failed_mount; |
3712 | } | 3718 | } |
3713 | } | 3719 | } |
3714 | 3720 | ||
3715 | has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, | 3721 | has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, |
3716 | EXT4_FEATURE_RO_COMPAT_HUGE_FILE); | 3722 | EXT4_FEATURE_RO_COMPAT_HUGE_FILE); |
3717 | sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, | 3723 | sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, |
3718 | has_huge_files); | 3724 | has_huge_files); |
3719 | sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); | 3725 | sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); |
3720 | 3726 | ||
3721 | if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { | 3727 | if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { |
3722 | sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; | 3728 | sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; |
3723 | sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO; | 3729 | sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO; |
3724 | } else { | 3730 | } else { |
3725 | sbi->s_inode_size = le16_to_cpu(es->s_inode_size); | 3731 | sbi->s_inode_size = le16_to_cpu(es->s_inode_size); |
3726 | sbi->s_first_ino = le32_to_cpu(es->s_first_ino); | 3732 | sbi->s_first_ino = le32_to_cpu(es->s_first_ino); |
3727 | if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || | 3733 | if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || |
3728 | (!is_power_of_2(sbi->s_inode_size)) || | 3734 | (!is_power_of_2(sbi->s_inode_size)) || |
3729 | (sbi->s_inode_size > blocksize)) { | 3735 | (sbi->s_inode_size > blocksize)) { |
3730 | ext4_msg(sb, KERN_ERR, | 3736 | ext4_msg(sb, KERN_ERR, |
3731 | "unsupported inode size: %d", | 3737 | "unsupported inode size: %d", |
3732 | sbi->s_inode_size); | 3738 | sbi->s_inode_size); |
3733 | goto failed_mount; | 3739 | goto failed_mount; |
3734 | } | 3740 | } |
3735 | if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) | 3741 | if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) |
3736 | sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2); | 3742 | sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2); |
3737 | } | 3743 | } |
3738 | 3744 | ||
3739 | sbi->s_desc_size = le16_to_cpu(es->s_desc_size); | 3745 | sbi->s_desc_size = le16_to_cpu(es->s_desc_size); |
3740 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) { | 3746 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) { |
3741 | if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || | 3747 | if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || |
3742 | sbi->s_desc_size > EXT4_MAX_DESC_SIZE || | 3748 | sbi->s_desc_size > EXT4_MAX_DESC_SIZE || |
3743 | !is_power_of_2(sbi->s_desc_size)) { | 3749 | !is_power_of_2(sbi->s_desc_size)) { |
3744 | ext4_msg(sb, KERN_ERR, | 3750 | ext4_msg(sb, KERN_ERR, |
3745 | "unsupported descriptor size %lu", | 3751 | "unsupported descriptor size %lu", |
3746 | sbi->s_desc_size); | 3752 | sbi->s_desc_size); |
3747 | goto failed_mount; | 3753 | goto failed_mount; |
3748 | } | 3754 | } |
3749 | } else | 3755 | } else |
3750 | sbi->s_desc_size = EXT4_MIN_DESC_SIZE; | 3756 | sbi->s_desc_size = EXT4_MIN_DESC_SIZE; |
3751 | 3757 | ||
3752 | sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); | 3758 | sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); |
3753 | sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); | 3759 | sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); |
3754 | if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0) | 3760 | if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0) |
3755 | goto cantfind_ext4; | 3761 | goto cantfind_ext4; |
3756 | 3762 | ||
3757 | sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb); | 3763 | sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb); |
3758 | if (sbi->s_inodes_per_block == 0) | 3764 | if (sbi->s_inodes_per_block == 0) |
3759 | goto cantfind_ext4; | 3765 | goto cantfind_ext4; |
3760 | sbi->s_itb_per_group = sbi->s_inodes_per_group / | 3766 | sbi->s_itb_per_group = sbi->s_inodes_per_group / |
3761 | sbi->s_inodes_per_block; | 3767 | sbi->s_inodes_per_block; |
3762 | sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb); | 3768 | sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb); |
3763 | sbi->s_sbh = bh; | 3769 | sbi->s_sbh = bh; |
3764 | sbi->s_mount_state = le16_to_cpu(es->s_state); | 3770 | sbi->s_mount_state = le16_to_cpu(es->s_state); |
3765 | sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); | 3771 | sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); |
3766 | sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); | 3772 | sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); |
3767 | 3773 | ||
3768 | for (i = 0; i < 4; i++) | 3774 | for (i = 0; i < 4; i++) |
3769 | sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); | 3775 | sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); |
3770 | sbi->s_def_hash_version = es->s_def_hash_version; | 3776 | sbi->s_def_hash_version = es->s_def_hash_version; |
3771 | if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) { | 3777 | if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) { |
3772 | i = le32_to_cpu(es->s_flags); | 3778 | i = le32_to_cpu(es->s_flags); |
3773 | if (i & EXT2_FLAGS_UNSIGNED_HASH) | 3779 | if (i & EXT2_FLAGS_UNSIGNED_HASH) |
3774 | sbi->s_hash_unsigned = 3; | 3780 | sbi->s_hash_unsigned = 3; |
3775 | else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { | 3781 | else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { |
3776 | #ifdef __CHAR_UNSIGNED__ | 3782 | #ifdef __CHAR_UNSIGNED__ |
3777 | if (!(sb->s_flags & MS_RDONLY)) | 3783 | if (!(sb->s_flags & MS_RDONLY)) |
3778 | es->s_flags |= | 3784 | es->s_flags |= |
3779 | cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); | 3785 | cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); |
3780 | sbi->s_hash_unsigned = 3; | 3786 | sbi->s_hash_unsigned = 3; |
3781 | #else | 3787 | #else |
3782 | if (!(sb->s_flags & MS_RDONLY)) | 3788 | if (!(sb->s_flags & MS_RDONLY)) |
3783 | es->s_flags |= | 3789 | es->s_flags |= |
3784 | cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); | 3790 | cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); |
3785 | #endif | 3791 | #endif |
3786 | } | 3792 | } |
3787 | } | 3793 | } |
3788 | 3794 | ||
3789 | /* Handle clustersize */ | 3795 | /* Handle clustersize */ |
3790 | clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size); | 3796 | clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size); |
3791 | has_bigalloc = EXT4_HAS_RO_COMPAT_FEATURE(sb, | 3797 | has_bigalloc = EXT4_HAS_RO_COMPAT_FEATURE(sb, |
3792 | EXT4_FEATURE_RO_COMPAT_BIGALLOC); | 3798 | EXT4_FEATURE_RO_COMPAT_BIGALLOC); |
3793 | if (has_bigalloc) { | 3799 | if (has_bigalloc) { |
3794 | if (clustersize < blocksize) { | 3800 | if (clustersize < blocksize) { |
3795 | ext4_msg(sb, KERN_ERR, | 3801 | ext4_msg(sb, KERN_ERR, |
3796 | "cluster size (%d) smaller than " | 3802 | "cluster size (%d) smaller than " |
3797 | "block size (%d)", clustersize, blocksize); | 3803 | "block size (%d)", clustersize, blocksize); |
3798 | goto failed_mount; | 3804 | goto failed_mount; |
3799 | } | 3805 | } |
3800 | sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) - | 3806 | sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) - |
3801 | le32_to_cpu(es->s_log_block_size); | 3807 | le32_to_cpu(es->s_log_block_size); |
3802 | sbi->s_clusters_per_group = | 3808 | sbi->s_clusters_per_group = |
3803 | le32_to_cpu(es->s_clusters_per_group); | 3809 | le32_to_cpu(es->s_clusters_per_group); |
3804 | if (sbi->s_clusters_per_group > blocksize * 8) { | 3810 | if (sbi->s_clusters_per_group > blocksize * 8) { |
3805 | ext4_msg(sb, KERN_ERR, | 3811 | ext4_msg(sb, KERN_ERR, |
3806 | "#clusters per group too big: %lu", | 3812 | "#clusters per group too big: %lu", |
3807 | sbi->s_clusters_per_group); | 3813 | sbi->s_clusters_per_group); |
3808 | goto failed_mount; | 3814 | goto failed_mount; |
3809 | } | 3815 | } |
3810 | if (sbi->s_blocks_per_group != | 3816 | if (sbi->s_blocks_per_group != |
3811 | (sbi->s_clusters_per_group * (clustersize / blocksize))) { | 3817 | (sbi->s_clusters_per_group * (clustersize / blocksize))) { |
3812 | ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and " | 3818 | ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and " |
3813 | "clusters per group (%lu) inconsistent", | 3819 | "clusters per group (%lu) inconsistent", |
3814 | sbi->s_blocks_per_group, | 3820 | sbi->s_blocks_per_group, |
3815 | sbi->s_clusters_per_group); | 3821 | sbi->s_clusters_per_group); |
3816 | goto failed_mount; | 3822 | goto failed_mount; |
3817 | } | 3823 | } |
3818 | } else { | 3824 | } else { |
3819 | if (clustersize != blocksize) { | 3825 | if (clustersize != blocksize) { |
3820 | ext4_warning(sb, "fragment/cluster size (%d) != " | 3826 | ext4_warning(sb, "fragment/cluster size (%d) != " |
3821 | "block size (%d)", clustersize, | 3827 | "block size (%d)", clustersize, |
3822 | blocksize); | 3828 | blocksize); |
3823 | clustersize = blocksize; | 3829 | clustersize = blocksize; |
3824 | } | 3830 | } |
3825 | if (sbi->s_blocks_per_group > blocksize * 8) { | 3831 | if (sbi->s_blocks_per_group > blocksize * 8) { |
3826 | ext4_msg(sb, KERN_ERR, | 3832 | ext4_msg(sb, KERN_ERR, |
3827 | "#blocks per group too big: %lu", | 3833 | "#blocks per group too big: %lu", |
3828 | sbi->s_blocks_per_group); | 3834 | sbi->s_blocks_per_group); |
3829 | goto failed_mount; | 3835 | goto failed_mount; |
3830 | } | 3836 | } |
3831 | sbi->s_clusters_per_group = sbi->s_blocks_per_group; | 3837 | sbi->s_clusters_per_group = sbi->s_blocks_per_group; |
3832 | sbi->s_cluster_bits = 0; | 3838 | sbi->s_cluster_bits = 0; |
3833 | } | 3839 | } |
3834 | sbi->s_cluster_ratio = clustersize / blocksize; | 3840 | sbi->s_cluster_ratio = clustersize / blocksize; |
3835 | 3841 | ||
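The bigalloc branch derives s_cluster_bits as the difference of the two on-disk log fields; a tiny sketch with assumed example values (4KiB blocks, 64KiB clusters):

    #include <stdio.h>

    int main(void)
    {
            unsigned log_block_size = 2;    /* 1024 << 2 = 4096  (4KiB)  */
            unsigned log_cluster_size = 6;  /* 1024 << 6 = 65536 (64KiB) */
            unsigned cluster_bits = log_cluster_size - log_block_size;

            /* Agrees with s_cluster_ratio = clustersize / blocksize. */
            printf("cluster_bits=%u ratio=%u\n",
                   cluster_bits, 1u << cluster_bits);  /* 4 and 16 */
            return 0;
    }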
3836 | if (sbi->s_inodes_per_group > blocksize * 8) { | 3842 | if (sbi->s_inodes_per_group > blocksize * 8) { |
3837 | ext4_msg(sb, KERN_ERR, | 3843 | ext4_msg(sb, KERN_ERR, |
3838 | "#inodes per group too big: %lu", | 3844 | "#inodes per group too big: %lu", |
3839 | sbi->s_inodes_per_group); | 3845 | sbi->s_inodes_per_group); |
3840 | goto failed_mount; | 3846 | goto failed_mount; |
3841 | } | 3847 | } |
3842 | 3848 | ||
3849 | /* Do we have the standard group size of clustersize * 8 blocks? */ | 3855 | /* Do we have the standard group size of clustersize * 8 blocks? */ |
3844 | if (sbi->s_blocks_per_group == clustersize << 3) | 3850 | if (sbi->s_blocks_per_group == clustersize << 3) |
3845 | set_opt2(sb, STD_GROUP_SIZE); | 3851 | set_opt2(sb, STD_GROUP_SIZE); |
3846 | 3852 | ||
3847 | /* | 3853 | /* |
3848 | * Test whether we have more sectors than will fit in sector_t, | 3854 | * Test whether we have more sectors than will fit in sector_t, |
3849 | * and whether the max offset is addressable by the page cache. | 3855 | * and whether the max offset is addressable by the page cache. |
3850 | */ | 3856 | */ |
3851 | err = generic_check_addressable(sb->s_blocksize_bits, | 3857 | err = generic_check_addressable(sb->s_blocksize_bits, |
3852 | ext4_blocks_count(es)); | 3858 | ext4_blocks_count(es)); |
3853 | if (err) { | 3859 | if (err) { |
3854 | ext4_msg(sb, KERN_ERR, "filesystem" | 3860 | ext4_msg(sb, KERN_ERR, "filesystem" |
3855 | " too large to mount safely on this system"); | 3861 | " too large to mount safely on this system"); |
3856 | if (sizeof(sector_t) < 8) | 3862 | if (sizeof(sector_t) < 8) |
3857 | ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); | 3863 | ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); |
3858 | goto failed_mount; | 3864 | goto failed_mount; |
3859 | } | 3865 | } |
3860 | 3866 | ||
3861 | if (EXT4_BLOCKS_PER_GROUP(sb) == 0) | 3867 | if (EXT4_BLOCKS_PER_GROUP(sb) == 0) |
3862 | goto cantfind_ext4; | 3868 | goto cantfind_ext4; |
3863 | 3869 | ||
3864 | /* check blocks count against device size */ | 3870 | /* check blocks count against device size */ |
3865 | blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; | 3871 | blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; |
3866 | if (blocks_count && ext4_blocks_count(es) > blocks_count) { | 3872 | if (blocks_count && ext4_blocks_count(es) > blocks_count) { |
3867 | ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu " | 3873 | ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu " |
3868 | "exceeds size of device (%llu blocks)", | 3874 | "exceeds size of device (%llu blocks)", |
3869 | ext4_blocks_count(es), blocks_count); | 3875 | ext4_blocks_count(es), blocks_count); |
3870 | goto failed_mount; | 3876 | goto failed_mount; |
3871 | } | 3877 | } |
3872 | 3878 | ||
3873 | /* | 3879 | /* |
3874 | * It makes no sense for the first data block to be beyond the end | 3880 | * It makes no sense for the first data block to be beyond the end |
3875 | * of the filesystem. | 3881 | * of the filesystem. |
3876 | */ | 3882 | */ |
3877 | if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { | 3883 | if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { |
3878 | ext4_msg(sb, KERN_WARNING, "bad geometry: first data " | 3884 | ext4_msg(sb, KERN_WARNING, "bad geometry: first data " |
3879 | "block %u is beyond end of filesystem (%llu)", | 3885 | "block %u is beyond end of filesystem (%llu)", |
3880 | le32_to_cpu(es->s_first_data_block), | 3886 | le32_to_cpu(es->s_first_data_block), |
3881 | ext4_blocks_count(es)); | 3887 | ext4_blocks_count(es)); |
3882 | goto failed_mount; | 3888 | goto failed_mount; |
3883 | } | 3889 | } |
3884 | blocks_count = (ext4_blocks_count(es) - | 3890 | blocks_count = (ext4_blocks_count(es) - |
3885 | le32_to_cpu(es->s_first_data_block) + | 3891 | le32_to_cpu(es->s_first_data_block) + |
3886 | EXT4_BLOCKS_PER_GROUP(sb) - 1); | 3892 | EXT4_BLOCKS_PER_GROUP(sb) - 1); |
3887 | do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); | 3893 | do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); |
3888 | if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { | 3894 | if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { |
3889 | ext4_msg(sb, KERN_WARNING, "groups count too large: %u " | 3895 | ext4_msg(sb, KERN_WARNING, "groups count too large: %u " |
3890 | "(block count %llu, first data block %u, " | 3896 | "(block count %llu, first data block %u, " |
3891 | "blocks per group %lu)", sbi->s_groups_count, | 3897 | "blocks per group %lu)", sbi->s_groups_count, |
3892 | ext4_blocks_count(es), | 3898 | ext4_blocks_count(es), |
3893 | le32_to_cpu(es->s_first_data_block), | 3899 | le32_to_cpu(es->s_first_data_block), |
3894 | EXT4_BLOCKS_PER_GROUP(sb)); | 3900 | EXT4_BLOCKS_PER_GROUP(sb)); |
3895 | goto failed_mount; | 3901 | goto failed_mount; |
3896 | } | 3902 | } |
3897 | sbi->s_groups_count = blocks_count; | 3903 | sbi->s_groups_count = blocks_count; |
3898 | sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, | 3904 | sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, |
3899 | (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); | 3905 | (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); |
3900 | db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / | 3906 | db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / |
3901 | EXT4_DESC_PER_BLOCK(sb); | 3907 | EXT4_DESC_PER_BLOCK(sb); |
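The two round-up divisions above (groups count, then descriptor-block count), worked with an assumed geometry of an 8GiB filesystem, 4KiB blocks, and 32-byte group descriptors:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t blocks = 2097152;        /* 8GiB / 4KiB, assumed        */
            uint64_t first_data_block = 0;    /* 0 whenever blocksize > 1KiB */
            uint64_t per_group = 32768;       /* 8 * 4096 blocks per group   */
            uint64_t desc_per_block = 128;    /* 4096 / 32 bytes             */

            /* Same round-up idiom as the do_div() sequence above. */
            uint64_t groups = (blocks - first_data_block + per_group - 1)
                              / per_group;
            uint64_t db_count = (groups + desc_per_block - 1) / desc_per_block;

            printf("groups=%llu db_count=%llu\n",   /* 64 and 1 */
                   (unsigned long long)groups, (unsigned long long)db_count);
            return 0;
    }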
3902 | sbi->s_group_desc = ext4_kvmalloc(db_count * | 3908 | sbi->s_group_desc = ext4_kvmalloc(db_count * |
3903 | sizeof(struct buffer_head *), | 3909 | sizeof(struct buffer_head *), |
3904 | GFP_KERNEL); | 3910 | GFP_KERNEL); |
3905 | if (sbi->s_group_desc == NULL) { | 3911 | if (sbi->s_group_desc == NULL) { |
3906 | ext4_msg(sb, KERN_ERR, "not enough memory"); | 3912 | ext4_msg(sb, KERN_ERR, "not enough memory"); |
3907 | ret = -ENOMEM; | 3913 | ret = -ENOMEM; |
3908 | goto failed_mount; | 3914 | goto failed_mount; |
3909 | } | 3915 | } |
3910 | 3916 | ||
3911 | if (ext4_proc_root) | 3917 | if (ext4_proc_root) |
3912 | sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); | 3918 | sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); |
3913 | 3919 | ||
3914 | if (sbi->s_proc) | 3920 | if (sbi->s_proc) |
3915 | proc_create_data("options", S_IRUGO, sbi->s_proc, | 3921 | proc_create_data("options", S_IRUGO, sbi->s_proc, |
3916 | &ext4_seq_options_fops, sb); | 3922 | &ext4_seq_options_fops, sb); |
3917 | 3923 | ||
3918 | bgl_lock_init(sbi->s_blockgroup_lock); | 3924 | bgl_lock_init(sbi->s_blockgroup_lock); |
3919 | 3925 | ||
3920 | for (i = 0; i < db_count; i++) { | 3926 | for (i = 0; i < db_count; i++) { |
3921 | block = descriptor_loc(sb, logical_sb_block, i); | 3927 | block = descriptor_loc(sb, logical_sb_block, i); |
3922 | sbi->s_group_desc[i] = sb_bread_unmovable(sb, block); | 3928 | sbi->s_group_desc[i] = sb_bread_unmovable(sb, block); |
3923 | if (!sbi->s_group_desc[i]) { | 3929 | if (!sbi->s_group_desc[i]) { |
3924 | ext4_msg(sb, KERN_ERR, | 3930 | ext4_msg(sb, KERN_ERR, |
3925 | "can't read group descriptor %d", i); | 3931 | "can't read group descriptor %d", i); |
3926 | db_count = i; | 3932 | db_count = i; |
3927 | goto failed_mount2; | 3933 | goto failed_mount2; |
3928 | } | 3934 | } |
3929 | } | 3935 | } |
3930 | if (!ext4_check_descriptors(sb, &first_not_zeroed)) { | 3936 | if (!ext4_check_descriptors(sb, &first_not_zeroed)) { |
3931 | ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); | 3937 | ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); |
3932 | goto failed_mount2; | 3938 | goto failed_mount2; |
3933 | } | 3939 | } |
3934 | 3940 | ||
3935 | sbi->s_gdb_count = db_count; | 3941 | sbi->s_gdb_count = db_count; |
3936 | get_random_bytes(&sbi->s_next_generation, sizeof(u32)); | 3942 | get_random_bytes(&sbi->s_next_generation, sizeof(u32)); |
3937 | spin_lock_init(&sbi->s_next_gen_lock); | 3943 | spin_lock_init(&sbi->s_next_gen_lock); |
3938 | 3944 | ||
3939 | init_timer(&sbi->s_err_report); | 3945 | setup_timer(&sbi->s_err_report, print_daily_error_info, |
3940 | sbi->s_err_report.function = print_daily_error_info; | 3946 | (unsigned long) sb); |
3941 | sbi->s_err_report.data = (unsigned long) sb; | ||
3942 | 3947 | ||
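This hunk is the setup_timer() conversion called out in the pull request: the three-step init_timer() sequence in the left column collapses into a single call in the right, with identical effect. Side by side, as a sketch rather than standalone code:

    /* before */
    init_timer(&sbi->s_err_report);
    sbi->s_err_report.function = print_daily_error_info;
    sbi->s_err_report.data = (unsigned long) sb;

    /* after: one call initializing the same three fields */
    setup_timer(&sbi->s_err_report, print_daily_error_info,
                (unsigned long) sb);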
3943 | /* Register extent status tree shrinker */ | 3948 | /* Register extent status tree shrinker */ |
3944 | if (ext4_es_register_shrinker(sbi)) | 3949 | if (ext4_es_register_shrinker(sbi)) |
3945 | goto failed_mount3; | 3950 | goto failed_mount3; |
3946 | 3951 | ||
3947 | sbi->s_stripe = ext4_get_stripe_size(sbi); | 3952 | sbi->s_stripe = ext4_get_stripe_size(sbi); |
3948 | sbi->s_extent_max_zeroout_kb = 32; | 3953 | sbi->s_extent_max_zeroout_kb = 32; |
3949 | 3954 | ||
3950 | /* | 3955 | /* |
3951 | * Set up enough so that we can read an inode. | 3957 | * Set up enough so that we can read an inode. |
3952 | */ | 3957 | */ |
3953 | sb->s_op = &ext4_sops; | 3958 | sb->s_op = &ext4_sops; |
3954 | sb->s_export_op = &ext4_export_ops; | 3959 | sb->s_export_op = &ext4_export_ops; |
3955 | sb->s_xattr = ext4_xattr_handlers; | 3960 | sb->s_xattr = ext4_xattr_handlers; |
3956 | #ifdef CONFIG_QUOTA | 3961 | #ifdef CONFIG_QUOTA |
3957 | sb->dq_op = &ext4_quota_operations; | 3962 | sb->dq_op = &ext4_quota_operations; |
3958 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) | 3963 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) |
3959 | sb->s_qcop = &dquot_quotactl_sysfile_ops; | 3964 | sb->s_qcop = &dquot_quotactl_sysfile_ops; |
3960 | else | 3965 | else |
3961 | sb->s_qcop = &ext4_qctl_operations; | 3966 | sb->s_qcop = &ext4_qctl_operations; |
3962 | sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; | 3967 | sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; |
3963 | #endif | 3968 | #endif |
3964 | memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); | 3969 | memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); |
3965 | 3970 | ||
3966 | INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ | 3971 | INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ |
3967 | mutex_init(&sbi->s_orphan_lock); | 3972 | mutex_init(&sbi->s_orphan_lock); |
3968 | 3973 | ||
3969 | sb->s_root = NULL; | 3974 | sb->s_root = NULL; |
3970 | 3975 | ||
3971 | needs_recovery = (es->s_last_orphan != 0 || | 3976 | needs_recovery = (es->s_last_orphan != 0 || |
3972 | EXT4_HAS_INCOMPAT_FEATURE(sb, | 3977 | EXT4_HAS_INCOMPAT_FEATURE(sb, |
3973 | EXT4_FEATURE_INCOMPAT_RECOVER)); | 3978 | EXT4_FEATURE_INCOMPAT_RECOVER)); |
3974 | 3979 | ||
3975 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) && | 3980 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) && |
3976 | !(sb->s_flags & MS_RDONLY)) | 3981 | !(sb->s_flags & MS_RDONLY)) |
3977 | if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) | 3982 | if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) |
3978 | goto failed_mount3a; | 3983 | goto failed_mount3a; |
3979 | 3984 | ||
3980 | /* | 3985 | /* |
3981 | * The first inode we look at is the journal inode. Don't try | 3986 | * The first inode we look at is the journal inode. Don't try |
3982 | * root first: it may be modified in the journal! | 3987 | * root first: it may be modified in the journal! |
3983 | */ | 3988 | */ |
3984 | if (!test_opt(sb, NOLOAD) && | 3989 | if (!test_opt(sb, NOLOAD) && |
3985 | EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { | 3990 | EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { |
3986 | if (ext4_load_journal(sb, es, journal_devnum)) | 3991 | if (ext4_load_journal(sb, es, journal_devnum)) |
3987 | goto failed_mount3a; | 3992 | goto failed_mount3a; |
3988 | } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && | 3993 | } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && |
3989 | EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { | 3994 | EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { |
3990 | ext4_msg(sb, KERN_ERR, "required journal recovery " | 3995 | ext4_msg(sb, KERN_ERR, "required journal recovery " |
3991 | "suppressed and not mounted read-only"); | 3996 | "suppressed and not mounted read-only"); |
3992 | goto failed_mount_wq; | 3997 | goto failed_mount_wq; |
3993 | } else { | 3998 | } else { |
3994 | clear_opt(sb, DATA_FLAGS); | 3999 | clear_opt(sb, DATA_FLAGS); |
3995 | sbi->s_journal = NULL; | 4000 | sbi->s_journal = NULL; |
3996 | needs_recovery = 0; | 4001 | needs_recovery = 0; |
3997 | goto no_journal; | 4002 | goto no_journal; |
3998 | } | 4003 | } |
3999 | 4004 | ||
4000 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT) && | 4005 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT) && |
4001 | !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, | 4006 | !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, |
4002 | JBD2_FEATURE_INCOMPAT_64BIT)) { | 4007 | JBD2_FEATURE_INCOMPAT_64BIT)) { |
4003 | ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); | 4008 | ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); |
4004 | goto failed_mount_wq; | 4009 | goto failed_mount_wq; |
4005 | } | 4010 | } |
4006 | 4011 | ||
4007 | if (!set_journal_csum_feature_set(sb)) { | 4012 | if (!set_journal_csum_feature_set(sb)) { |
4008 | ext4_msg(sb, KERN_ERR, "Failed to set journal checksum " | 4013 | ext4_msg(sb, KERN_ERR, "Failed to set journal checksum " |
4009 | "feature set"); | 4014 | "feature set"); |
4010 | goto failed_mount_wq; | 4015 | goto failed_mount_wq; |
4011 | } | 4016 | } |
4012 | 4017 | ||
4013 | /* We have now updated the journal if required, so we can | 4018 | /* We have now updated the journal if required, so we can |
4014 | * validate the data journaling mode. */ | 4019 | * validate the data journaling mode. */ |
4015 | switch (test_opt(sb, DATA_FLAGS)) { | 4020 | switch (test_opt(sb, DATA_FLAGS)) { |
4016 | case 0: | 4021 | case 0: |
4017 | /* No mode set, assume a default based on the journal | 4022 | /* No mode set, assume a default based on the journal |
4018 | * capabilities: ORDERED_DATA if the journal can | 4023 | * capabilities: ORDERED_DATA if the journal can |
4019 | * cope, else JOURNAL_DATA | 4024 | * cope, else JOURNAL_DATA |
4020 | */ | 4025 | */ |
4021 | if (jbd2_journal_check_available_features | 4026 | if (jbd2_journal_check_available_features |
4022 | (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) | 4027 | (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) |
4023 | set_opt(sb, ORDERED_DATA); | 4028 | set_opt(sb, ORDERED_DATA); |
4024 | else | 4029 | else |
4025 | set_opt(sb, JOURNAL_DATA); | 4030 | set_opt(sb, JOURNAL_DATA); |
4026 | break; | 4031 | break; |
4027 | 4032 | ||
4028 | case EXT4_MOUNT_ORDERED_DATA: | 4033 | case EXT4_MOUNT_ORDERED_DATA: |
4029 | case EXT4_MOUNT_WRITEBACK_DATA: | 4034 | case EXT4_MOUNT_WRITEBACK_DATA: |
4030 | if (!jbd2_journal_check_available_features | 4035 | if (!jbd2_journal_check_available_features |
4031 | (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { | 4036 | (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { |
4032 | ext4_msg(sb, KERN_ERR, "Journal does not support " | 4037 | ext4_msg(sb, KERN_ERR, "Journal does not support " |
4033 | "requested data journaling mode"); | 4038 | "requested data journaling mode"); |
4034 | goto failed_mount_wq; | 4039 | goto failed_mount_wq; |
4035 | } | 4040 | } |
4036 | default: | 4041 | default: |
4037 | break; | 4042 | break; |
4038 | } | 4043 | } |
4039 | set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); | 4044 | set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); |
4040 | 4045 | ||
4041 | sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; | 4046 | sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; |
4042 | 4047 | ||
4043 | no_journal: | 4048 | no_journal: |
4044 | if (ext4_mballoc_ready) { | 4049 | if (ext4_mballoc_ready) { |
4045 | sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id); | 4050 | sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id); |
4046 | if (!sbi->s_mb_cache) { | 4051 | if (!sbi->s_mb_cache) { |
4047 | ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache"); | 4052 | ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache"); |
4048 | goto failed_mount_wq; | 4053 | goto failed_mount_wq; |
4049 | } | 4054 | } |
4050 | } | 4055 | } |
4051 | 4056 | ||
4052 | /* | 4057 | /* |
4053 | * Get the # of file system overhead blocks from the | 4058 | * Get the # of file system overhead blocks from the |
4054 | * superblock if present. | 4059 | * superblock if present. |
4055 | */ | 4060 | */ |
4056 | if (es->s_overhead_clusters) | 4061 | if (es->s_overhead_clusters) |
4057 | sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); | 4062 | sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); |
4058 | else { | 4063 | else { |
4059 | err = ext4_calculate_overhead(sb); | 4064 | err = ext4_calculate_overhead(sb); |
4060 | if (err) | 4065 | if (err) |
4061 | goto failed_mount_wq; | 4066 | goto failed_mount_wq; |
4062 | } | 4067 | } |
4063 | 4068 | ||
4064 | /* | 4069 | /* |
4065 | * The maximum number of concurrent works can be high and | 4070 | * The maximum number of concurrent works can be high and |
4066 | * concurrency isn't really necessary. Limit it to 1. | 4071 | * concurrency isn't really necessary. Limit it to 1. |
4067 | */ | 4072 | */ |
4068 | EXT4_SB(sb)->rsv_conversion_wq = | 4073 | EXT4_SB(sb)->rsv_conversion_wq = |
4069 | alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); | 4074 | alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); |
4070 | if (!EXT4_SB(sb)->rsv_conversion_wq) { | 4075 | if (!EXT4_SB(sb)->rsv_conversion_wq) { |
4071 | printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); | 4076 | printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); |
4072 | ret = -ENOMEM; | 4077 | ret = -ENOMEM; |
4073 | goto failed_mount4; | 4078 | goto failed_mount4; |
4074 | } | 4079 | } |
4075 | 4080 | ||
4076 | /* | 4081 | /* |
4077 | * The jbd2_journal_load will have done any necessary log recovery, | 4082 | * The jbd2_journal_load will have done any necessary log recovery, |
4078 | * so we can safely mount the rest of the filesystem now. | 4083 | * so we can safely mount the rest of the filesystem now. |
4079 | */ | 4084 | */ |
4080 | 4085 | ||
4081 | root = ext4_iget(sb, EXT4_ROOT_INO); | 4086 | root = ext4_iget(sb, EXT4_ROOT_INO); |
4082 | if (IS_ERR(root)) { | 4087 | if (IS_ERR(root)) { |
4083 | ext4_msg(sb, KERN_ERR, "get root inode failed"); | 4088 | ext4_msg(sb, KERN_ERR, "get root inode failed"); |
4084 | ret = PTR_ERR(root); | 4089 | ret = PTR_ERR(root); |
4085 | root = NULL; | 4090 | root = NULL; |
4086 | goto failed_mount4; | 4091 | goto failed_mount4; |
4087 | } | 4092 | } |
4088 | if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { | 4093 | if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { |
4089 | ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); | 4094 | ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); |
4090 | iput(root); | 4095 | iput(root); |
4091 | goto failed_mount4; | 4096 | goto failed_mount4; |
4092 | } | 4097 | } |
4093 | sb->s_root = d_make_root(root); | 4098 | sb->s_root = d_make_root(root); |
4094 | if (!sb->s_root) { | 4099 | if (!sb->s_root) { |
4095 | ext4_msg(sb, KERN_ERR, "get root dentry failed"); | 4100 | ext4_msg(sb, KERN_ERR, "get root dentry failed"); |
4096 | ret = -ENOMEM; | 4101 | ret = -ENOMEM; |
4097 | goto failed_mount4; | 4102 | goto failed_mount4; |
4098 | } | 4103 | } |
4099 | 4104 | ||
4100 | if (ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY)) | 4105 | if (ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY)) |
4101 | sb->s_flags |= MS_RDONLY; | 4106 | sb->s_flags |= MS_RDONLY; |
4102 | 4107 | ||
4103 | /* determine the minimum size of new large inodes, if present */ | 4108 | /* determine the minimum size of new large inodes, if present */ |
4104 | if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { | 4109 | if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { |
4105 | sbi->s_want_extra_isize = sizeof(struct ext4_inode) - | 4110 | sbi->s_want_extra_isize = sizeof(struct ext4_inode) - |
4106 | EXT4_GOOD_OLD_INODE_SIZE; | 4111 | EXT4_GOOD_OLD_INODE_SIZE; |
4107 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, | 4112 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, |
4108 | EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE)) { | 4113 | EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE)) { |
4109 | if (sbi->s_want_extra_isize < | 4114 | if (sbi->s_want_extra_isize < |
4110 | le16_to_cpu(es->s_want_extra_isize)) | 4115 | le16_to_cpu(es->s_want_extra_isize)) |
4111 | sbi->s_want_extra_isize = | 4116 | sbi->s_want_extra_isize = |
4112 | le16_to_cpu(es->s_want_extra_isize); | 4117 | le16_to_cpu(es->s_want_extra_isize); |
4113 | if (sbi->s_want_extra_isize < | 4118 | if (sbi->s_want_extra_isize < |
4114 | le16_to_cpu(es->s_min_extra_isize)) | 4119 | le16_to_cpu(es->s_min_extra_isize)) |
4115 | sbi->s_want_extra_isize = | 4120 | sbi->s_want_extra_isize = |
4116 | le16_to_cpu(es->s_min_extra_isize); | 4121 | le16_to_cpu(es->s_min_extra_isize); |
4117 | } | 4122 | } |
4118 | } | 4123 | } |
4119 | /* Check if enough inode space is available */ | 4124 | /* Check if enough inode space is available */ |
4120 | if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize > | 4125 | if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize > |
4121 | sbi->s_inode_size) { | 4126 | sbi->s_inode_size) { |
4122 | sbi->s_want_extra_isize = sizeof(struct ext4_inode) - | 4127 | sbi->s_want_extra_isize = sizeof(struct ext4_inode) - |
4123 | EXT4_GOOD_OLD_INODE_SIZE; | 4128 | EXT4_GOOD_OLD_INODE_SIZE; |
4124 | ext4_msg(sb, KERN_INFO, "required extra inode space not " | 4129 | ext4_msg(sb, KERN_INFO, "required extra inode space not " |
4125 | "available"); | 4130 | "available"); |
4126 | } | 4131 | } |
4127 | 4132 | ||
4128 | err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sb)); | 4133 | err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sb)); |
4129 | if (err) { | 4134 | if (err) { |
4130 | ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for " | 4135 | ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for " |
4131 | "reserved pool", ext4_calculate_resv_clusters(sb)); | 4136 | "reserved pool", ext4_calculate_resv_clusters(sb)); |
4132 | goto failed_mount4a; | 4137 | goto failed_mount4a; |
4133 | } | 4138 | } |
4134 | 4139 | ||
4135 | err = ext4_setup_system_zone(sb); | 4140 | err = ext4_setup_system_zone(sb); |
4136 | if (err) { | 4141 | if (err) { |
4137 | ext4_msg(sb, KERN_ERR, "failed to initialize system " | 4142 | ext4_msg(sb, KERN_ERR, "failed to initialize system " |
4138 | "zone (%d)", err); | 4143 | "zone (%d)", err); |
4139 | goto failed_mount4a; | 4144 | goto failed_mount4a; |
4140 | } | 4145 | } |
4141 | 4146 | ||
4142 | ext4_ext_init(sb); | 4147 | ext4_ext_init(sb); |
4143 | err = ext4_mb_init(sb); | 4148 | err = ext4_mb_init(sb); |
4144 | if (err) { | 4149 | if (err) { |
4145 | ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", | 4150 | ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", |
4146 | err); | 4151 | err); |
4147 | goto failed_mount5; | 4152 | goto failed_mount5; |
4148 | } | 4153 | } |
4149 | 4154 | ||
4150 | block = ext4_count_free_clusters(sb); | 4155 | block = ext4_count_free_clusters(sb); |
4151 | ext4_free_blocks_count_set(sbi->s_es, | 4156 | ext4_free_blocks_count_set(sbi->s_es, |
4152 | EXT4_C2B(sbi, block)); | 4157 | EXT4_C2B(sbi, block)); |
4153 | err = percpu_counter_init(&sbi->s_freeclusters_counter, block, | 4158 | err = percpu_counter_init(&sbi->s_freeclusters_counter, block, |
4154 | GFP_KERNEL); | 4159 | GFP_KERNEL); |
4155 | if (!err) { | 4160 | if (!err) { |
4156 | unsigned long freei = ext4_count_free_inodes(sb); | 4161 | unsigned long freei = ext4_count_free_inodes(sb); |
4157 | sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); | 4162 | sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); |
4158 | err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, | 4163 | err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, |
4159 | GFP_KERNEL); | 4164 | GFP_KERNEL); |
4160 | } | 4165 | } |
4161 | if (!err) | 4166 | if (!err) |
4162 | err = percpu_counter_init(&sbi->s_dirs_counter, | 4167 | err = percpu_counter_init(&sbi->s_dirs_counter, |
4163 | ext4_count_dirs(sb), GFP_KERNEL); | 4168 | ext4_count_dirs(sb), GFP_KERNEL); |
4164 | if (!err) | 4169 | if (!err) |
4165 | err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, | 4170 | err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, |
4166 | GFP_KERNEL); | 4171 | GFP_KERNEL); |
4167 | if (err) { | 4172 | if (err) { |
4168 | ext4_msg(sb, KERN_ERR, "insufficient memory"); | 4173 | ext4_msg(sb, KERN_ERR, "insufficient memory"); |
4169 | goto failed_mount6; | 4174 | goto failed_mount6; |
4170 | } | 4175 | } |
4171 | 4176 | ||
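The percpu-counter block above initializes four counters but tests for failure only once at the end: each subsequent init runs only while err is still zero, so a single error check covers the whole chain. A self-contained sketch of that "if (!err)" idiom, assuming a hypothetical counter_init() that fails by returning -ENOMEM (it is not the kernel's percpu_counter_init()):

#include <errno.h>
#include <stdio.h>

static int counter_init(const char *name, int fail)
{
    if (fail)
        return -ENOMEM;
    printf("%s initialized\n", name);
    return 0;
}

int main(void)
{
    int err;

    err = counter_init("freeclusters", 0);
    if (!err)                                  /* each step runs only if  */
        err = counter_init("freeinodes", 0);   /* all prior ones worked   */
    if (!err)
        err = counter_init("dirs", 1);         /* simulate a failure here */
    if (!err)
        err = counter_init("dirtyclusters", 0);
    if (err) {                                 /* one check for the chain */
        fprintf(stderr, "insufficient memory (err=%d)\n", err);
        return 1;
    }
    return 0;
}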
4172 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) | 4177 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) |
4173 | if (!ext4_fill_flex_info(sb)) { | 4178 | if (!ext4_fill_flex_info(sb)) { |
4174 | ext4_msg(sb, KERN_ERR, | 4179 | ext4_msg(sb, KERN_ERR, |
4175 | "unable to initialize " | 4180 | "unable to initialize " |
4176 | "flex_bg meta info!"); | 4181 | "flex_bg meta info!"); |
4177 | goto failed_mount6; | 4182 | goto failed_mount6; |
4178 | } | 4183 | } |
4179 | 4184 | ||
4180 | err = ext4_register_li_request(sb, first_not_zeroed); | 4185 | err = ext4_register_li_request(sb, first_not_zeroed); |
4181 | if (err) | 4186 | if (err) |
4182 | goto failed_mount6; | 4187 | goto failed_mount6; |
4183 | 4188 | ||
4184 | sbi->s_kobj.kset = ext4_kset; | 4189 | sbi->s_kobj.kset = ext4_kset; |
4185 | init_completion(&sbi->s_kobj_unregister); | 4190 | init_completion(&sbi->s_kobj_unregister); |
4186 | err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL, | 4191 | err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL, |
4187 | "%s", sb->s_id); | 4192 | "%s", sb->s_id); |
4188 | if (err) | 4193 | if (err) |
4189 | goto failed_mount7; | 4194 | goto failed_mount7; |
4190 | 4195 | ||
4191 | #ifdef CONFIG_QUOTA | 4196 | #ifdef CONFIG_QUOTA |
4192 | /* Enable quota usage during mount. */ | 4197 | /* Enable quota usage during mount. */ |
4193 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && | 4198 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && |
4194 | !(sb->s_flags & MS_RDONLY)) { | 4199 | !(sb->s_flags & MS_RDONLY)) { |
4195 | err = ext4_enable_quotas(sb); | 4200 | err = ext4_enable_quotas(sb); |
4196 | if (err) | 4201 | if (err) |
4197 | goto failed_mount8; | 4202 | goto failed_mount8; |
4198 | } | 4203 | } |
4199 | #endif /* CONFIG_QUOTA */ | 4204 | #endif /* CONFIG_QUOTA */ |
4200 | 4205 | ||
4201 | EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; | 4206 | EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; |
4202 | ext4_orphan_cleanup(sb, es); | 4207 | ext4_orphan_cleanup(sb, es); |
4203 | EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; | 4208 | EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; |
4204 | if (needs_recovery) { | 4209 | if (needs_recovery) { |
4205 | ext4_msg(sb, KERN_INFO, "recovery complete"); | 4210 | ext4_msg(sb, KERN_INFO, "recovery complete"); |
4206 | ext4_mark_recovery_complete(sb, es); | 4211 | ext4_mark_recovery_complete(sb, es); |
4207 | } | 4212 | } |
4208 | if (EXT4_SB(sb)->s_journal) { | 4213 | if (EXT4_SB(sb)->s_journal) { |
4209 | if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) | 4214 | if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) |
4210 | descr = " journalled data mode"; | 4215 | descr = " journalled data mode"; |
4211 | else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) | 4216 | else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) |
4212 | descr = " ordered data mode"; | 4217 | descr = " ordered data mode"; |
4213 | else | 4218 | else |
4214 | descr = " writeback data mode"; | 4219 | descr = " writeback data mode"; |
4215 | } else | 4220 | } else |
4216 | descr = "out journal"; | 4221 | descr = "out journal"; |
4217 | 4222 | ||
4218 | if (test_opt(sb, DISCARD)) { | 4223 | if (test_opt(sb, DISCARD)) { |
4219 | struct request_queue *q = bdev_get_queue(sb->s_bdev); | 4224 | struct request_queue *q = bdev_get_queue(sb->s_bdev); |
4220 | if (!blk_queue_discard(q)) | 4225 | if (!blk_queue_discard(q)) |
4221 | ext4_msg(sb, KERN_WARNING, | 4226 | ext4_msg(sb, KERN_WARNING, |
4222 | "mounting with \"discard\" option, but " | 4227 | "mounting with \"discard\" option, but " |
4223 | "the device does not support discard"); | 4228 | "the device does not support discard"); |
4224 | } | 4229 | } |
4225 | 4230 | ||
4226 | ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " | 4231 | ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " |
4227 | "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, | 4232 | "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, |
4228 | *sbi->s_es->s_mount_opts ? "; " : "", orig_data); | 4233 | *sbi->s_es->s_mount_opts ? "; " : "", orig_data); |
4229 | 4234 | ||
4230 | if (es->s_error_count) | 4235 | if (es->s_error_count) |
4231 | mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ | 4236 | mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ |
4232 | 4237 | ||
4233 | /* Enable message ratelimiting. Default is 10 messages per 5 secs. */ | 4238 | /* Enable message ratelimiting. Default is 10 messages per 5 secs. */ |
4234 | ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10); | 4239 | ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10); |
4235 | ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10); | 4240 | ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10); |
4236 | ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10); | 4241 | ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10); |
4237 | 4242 | ||
4238 | kfree(orig_data); | 4243 | kfree(orig_data); |
4239 | return 0; | 4244 | return 0; |
4240 | 4245 | ||
4241 | cantfind_ext4: | 4246 | cantfind_ext4: |
4242 | if (!silent) | 4247 | if (!silent) |
4243 | ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); | 4248 | ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); |
4244 | goto failed_mount; | 4249 | goto failed_mount; |
4245 | 4250 | ||
4246 | #ifdef CONFIG_QUOTA | 4251 | #ifdef CONFIG_QUOTA |
4247 | failed_mount8: | 4252 | failed_mount8: |
4248 | kobject_del(&sbi->s_kobj); | 4253 | kobject_del(&sbi->s_kobj); |
4249 | #endif | 4254 | #endif |
4250 | failed_mount7: | 4255 | failed_mount7: |
4251 | ext4_unregister_li_request(sb); | 4256 | ext4_unregister_li_request(sb); |
4252 | failed_mount6: | 4257 | failed_mount6: |
4253 | ext4_mb_release(sb); | 4258 | ext4_mb_release(sb); |
4254 | if (sbi->s_flex_groups) | 4259 | if (sbi->s_flex_groups) |
4255 | kvfree(sbi->s_flex_groups); | 4260 | kvfree(sbi->s_flex_groups); |
4256 | percpu_counter_destroy(&sbi->s_freeclusters_counter); | 4261 | percpu_counter_destroy(&sbi->s_freeclusters_counter); |
4257 | percpu_counter_destroy(&sbi->s_freeinodes_counter); | 4262 | percpu_counter_destroy(&sbi->s_freeinodes_counter); |
4258 | percpu_counter_destroy(&sbi->s_dirs_counter); | 4263 | percpu_counter_destroy(&sbi->s_dirs_counter); |
4259 | percpu_counter_destroy(&sbi->s_dirtyclusters_counter); | 4264 | percpu_counter_destroy(&sbi->s_dirtyclusters_counter); |
4260 | failed_mount5: | 4265 | failed_mount5: |
4261 | ext4_ext_release(sb); | 4266 | ext4_ext_release(sb); |
4262 | ext4_release_system_zone(sb); | 4267 | ext4_release_system_zone(sb); |
4263 | failed_mount4a: | 4268 | failed_mount4a: |
4264 | dput(sb->s_root); | 4269 | dput(sb->s_root); |
4265 | sb->s_root = NULL; | 4270 | sb->s_root = NULL; |
4266 | failed_mount4: | 4271 | failed_mount4: |
4267 | ext4_msg(sb, KERN_ERR, "mount failed"); | 4272 | ext4_msg(sb, KERN_ERR, "mount failed"); |
4268 | if (EXT4_SB(sb)->rsv_conversion_wq) | 4273 | if (EXT4_SB(sb)->rsv_conversion_wq) |
4269 | destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); | 4274 | destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); |
4270 | failed_mount_wq: | 4275 | failed_mount_wq: |
4271 | if (sbi->s_journal) { | 4276 | if (sbi->s_journal) { |
4272 | jbd2_journal_destroy(sbi->s_journal); | 4277 | jbd2_journal_destroy(sbi->s_journal); |
4273 | sbi->s_journal = NULL; | 4278 | sbi->s_journal = NULL; |
4274 | } | 4279 | } |
4275 | failed_mount3a: | 4280 | failed_mount3a: |
4276 | ext4_es_unregister_shrinker(sbi); | 4281 | ext4_es_unregister_shrinker(sbi); |
4277 | failed_mount3: | 4282 | failed_mount3: |
4278 | del_timer_sync(&sbi->s_err_report); | 4283 | del_timer_sync(&sbi->s_err_report); |
4279 | if (sbi->s_mmp_tsk) | 4284 | if (sbi->s_mmp_tsk) |
4280 | kthread_stop(sbi->s_mmp_tsk); | 4285 | kthread_stop(sbi->s_mmp_tsk); |
4281 | failed_mount2: | 4286 | failed_mount2: |
4282 | for (i = 0; i < db_count; i++) | 4287 | for (i = 0; i < db_count; i++) |
4283 | brelse(sbi->s_group_desc[i]); | 4288 | brelse(sbi->s_group_desc[i]); |
4284 | kvfree(sbi->s_group_desc); | 4289 | kvfree(sbi->s_group_desc); |
4285 | failed_mount: | 4290 | failed_mount: |
4286 | if (sbi->s_chksum_driver) | 4291 | if (sbi->s_chksum_driver) |
4287 | crypto_free_shash(sbi->s_chksum_driver); | 4292 | crypto_free_shash(sbi->s_chksum_driver); |
4288 | if (sbi->s_proc) { | 4293 | if (sbi->s_proc) { |
4289 | remove_proc_entry("options", sbi->s_proc); | 4294 | remove_proc_entry("options", sbi->s_proc); |
4290 | remove_proc_entry(sb->s_id, ext4_proc_root); | 4295 | remove_proc_entry(sb->s_id, ext4_proc_root); |
4291 | } | 4296 | } |
4292 | #ifdef CONFIG_QUOTA | 4297 | #ifdef CONFIG_QUOTA |
4293 | for (i = 0; i < EXT4_MAXQUOTAS; i++) | 4298 | for (i = 0; i < EXT4_MAXQUOTAS; i++) |
4294 | kfree(sbi->s_qf_names[i]); | 4299 | kfree(sbi->s_qf_names[i]); |
4295 | #endif | 4300 | #endif |
4296 | ext4_blkdev_remove(sbi); | 4301 | ext4_blkdev_remove(sbi); |
4297 | brelse(bh); | 4302 | brelse(bh); |
4298 | out_fail: | 4303 | out_fail: |
4299 | sb->s_fs_info = NULL; | 4304 | sb->s_fs_info = NULL; |
4300 | kfree(sbi->s_blockgroup_lock); | 4305 | kfree(sbi->s_blockgroup_lock); |
4301 | kfree(sbi); | 4306 | kfree(sbi); |
4302 | out_free_orig: | 4307 | out_free_orig: |
4303 | kfree(orig_data); | 4308 | kfree(orig_data); |
4304 | return err ? err : ret; | 4309 | return err ? err : ret; |
4305 | } | 4310 | } |
4306 | 4311 | ||
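The long failed_mount* tail of ext4_fill_super() above is the kernel's standard goto-unwind pattern: each label releases exactly the resources acquired before the corresponding failure point, in reverse order of acquisition. A standalone sketch of the pattern; the resources here are plain malloc()s standing in for the journal, workqueue, and kobject state the real function tears down.

#include <stdio.h>
#include <stdlib.h>

static int mount_like(void)
{
    char *a = NULL, *b = NULL, *c = NULL;
    int ret = -1;

    a = malloc(16);
    if (!a)
        goto out;
    b = malloc(16);
    if (!b)
        goto free_a;        /* unwind only what was acquired so far */
    c = malloc(16);
    if (!c)
        goto free_b;

    printf("all resources acquired\n");
    free(c);
    free(b);
    free(a);
    return 0;

free_b:                     /* labels release in reverse acquisition order */
    free(b);
free_a:
    free(a);
out:
    return ret;
}

int main(void) { return mount_like() ? 1 : 0; }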
4307 | /* | 4312 | /* |
4308 | * Setup any per-fs journal parameters now. We'll do this both on | 4313 | * Setup any per-fs journal parameters now. We'll do this both on |
4309 | * initial mount, once the journal has been initialised but before we've | 4314 | * initial mount, once the journal has been initialised but before we've |
4310 | * done any recovery; and again on any subsequent remount. | 4315 | * done any recovery; and again on any subsequent remount. |
4311 | */ | 4316 | */ |
4312 | static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) | 4317 | static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) |
4313 | { | 4318 | { |
4314 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 4319 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
4315 | 4320 | ||
4316 | journal->j_commit_interval = sbi->s_commit_interval; | 4321 | journal->j_commit_interval = sbi->s_commit_interval; |
4317 | journal->j_min_batch_time = sbi->s_min_batch_time; | 4322 | journal->j_min_batch_time = sbi->s_min_batch_time; |
4318 | journal->j_max_batch_time = sbi->s_max_batch_time; | 4323 | journal->j_max_batch_time = sbi->s_max_batch_time; |
4319 | 4324 | ||
4320 | write_lock(&journal->j_state_lock); | 4325 | write_lock(&journal->j_state_lock); |
4321 | if (test_opt(sb, BARRIER)) | 4326 | if (test_opt(sb, BARRIER)) |
4322 | journal->j_flags |= JBD2_BARRIER; | 4327 | journal->j_flags |= JBD2_BARRIER; |
4323 | else | 4328 | else |
4324 | journal->j_flags &= ~JBD2_BARRIER; | 4329 | journal->j_flags &= ~JBD2_BARRIER; |
4325 | if (test_opt(sb, DATA_ERR_ABORT)) | 4330 | if (test_opt(sb, DATA_ERR_ABORT)) |
4326 | journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; | 4331 | journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; |
4327 | else | 4332 | else |
4328 | journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; | 4333 | journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; |
4329 | write_unlock(&journal->j_state_lock); | 4334 | write_unlock(&journal->j_state_lock); |
4330 | } | 4335 | } |
4331 | 4336 | ||
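Two details of ext4_init_journal_params() above are worth spelling out: the flag bits are toggled under the journal's state write lock so readers never observe a half-updated flags word, and the else branches actively clear each bit because the same function runs again on remount, when an option may have been dropped. A userspace sketch, assuming a flags word and a pthread rwlock as stand-ins for journal->j_flags and j_state_lock:

#include <pthread.h>
#include <stdio.h>

#define JF_BARRIER               0x1
#define JF_ABORT_ON_SYNCDATA_ERR 0x2

static unsigned long j_flags;
static pthread_rwlock_t j_state_lock = PTHREAD_RWLOCK_INITIALIZER;

static void init_params(int barrier, int data_err_abort)
{
    pthread_rwlock_wrlock(&j_state_lock);
    if (barrier)
        j_flags |= JF_BARRIER;
    else
        j_flags &= ~JF_BARRIER;   /* clear, don't just skip setting */
    if (data_err_abort)
        j_flags |= JF_ABORT_ON_SYNCDATA_ERR;
    else
        j_flags &= ~JF_ABORT_ON_SYNCDATA_ERR;
    pthread_rwlock_unlock(&j_state_lock);
}

int main(void)
{
    init_params(1, 0);
    printf("flags = %#lx\n", j_flags);  /* 0x1: barrier on, abort off */
    init_params(0, 1);                  /* "remount" with options swapped */
    printf("flags = %#lx\n", j_flags);  /* 0x2: barrier cleared */
    return 0;
}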
4332 | static journal_t *ext4_get_journal(struct super_block *sb, | 4337 | static journal_t *ext4_get_journal(struct super_block *sb, |
4333 | unsigned int journal_inum) | 4338 | unsigned int journal_inum) |
4334 | { | 4339 | { |
4335 | struct inode *journal_inode; | 4340 | struct inode *journal_inode; |
4336 | journal_t *journal; | 4341 | journal_t *journal; |
4337 | 4342 | ||
4338 | BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); | 4343 | BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); |
4339 | 4344 | ||
4340 | /* First, test for the existence of a valid inode on disk. Bad | 4345 | /* First, test for the existence of a valid inode on disk. Bad |
4341 | * things happen if we iget() an unused inode, as the subsequent | 4346 | * things happen if we iget() an unused inode, as the subsequent |
4342 | * iput() will try to delete it. */ | 4347 | * iput() will try to delete it. */ |
4343 | 4348 | ||
4344 | journal_inode = ext4_iget(sb, journal_inum); | 4349 | journal_inode = ext4_iget(sb, journal_inum); |
4345 | if (IS_ERR(journal_inode)) { | 4350 | if (IS_ERR(journal_inode)) { |
4346 | ext4_msg(sb, KERN_ERR, "no journal found"); | 4351 | ext4_msg(sb, KERN_ERR, "no journal found"); |
4347 | return NULL; | 4352 | return NULL; |
4348 | } | 4353 | } |
4349 | if (!journal_inode->i_nlink) { | 4354 | if (!journal_inode->i_nlink) { |
4350 | make_bad_inode(journal_inode); | 4355 | make_bad_inode(journal_inode); |
4351 | iput(journal_inode); | 4356 | iput(journal_inode); |
4352 | ext4_msg(sb, KERN_ERR, "journal inode is deleted"); | 4357 | ext4_msg(sb, KERN_ERR, "journal inode is deleted"); |
4353 | return NULL; | 4358 | return NULL; |
4354 | } | 4359 | } |
4355 | 4360 | ||
4356 | jbd_debug(2, "Journal inode found at %p: %lld bytes\n", | 4361 | jbd_debug(2, "Journal inode found at %p: %lld bytes\n", |
4357 | journal_inode, journal_inode->i_size); | 4362 | journal_inode, journal_inode->i_size); |
4358 | if (!S_ISREG(journal_inode->i_mode)) { | 4363 | if (!S_ISREG(journal_inode->i_mode)) { |
4359 | ext4_msg(sb, KERN_ERR, "invalid journal inode"); | 4364 | ext4_msg(sb, KERN_ERR, "invalid journal inode"); |
4360 | iput(journal_inode); | 4365 | iput(journal_inode); |
4361 | return NULL; | 4366 | return NULL; |
4362 | } | 4367 | } |
4363 | 4368 | ||
4364 | journal = jbd2_journal_init_inode(journal_inode); | 4369 | journal = jbd2_journal_init_inode(journal_inode); |
4365 | if (!journal) { | 4370 | if (!journal) { |
4366 | ext4_msg(sb, KERN_ERR, "Could not load journal inode"); | 4371 | ext4_msg(sb, KERN_ERR, "Could not load journal inode"); |
4367 | iput(journal_inode); | 4372 | iput(journal_inode); |
4368 | return NULL; | 4373 | return NULL; |
4369 | } | 4374 | } |
4370 | journal->j_private = sb; | 4375 | journal->j_private = sb; |
4371 | ext4_init_journal_params(sb, journal); | 4376 | ext4_init_journal_params(sb, journal); |
4372 | return journal; | 4377 | return journal; |
4373 | } | 4378 | } |
4374 | 4379 | ||
4375 | static journal_t *ext4_get_dev_journal(struct super_block *sb, | 4380 | static journal_t *ext4_get_dev_journal(struct super_block *sb, |
4376 | dev_t j_dev) | 4381 | dev_t j_dev) |
4377 | { | 4382 | { |
4378 | struct buffer_head *bh; | 4383 | struct buffer_head *bh; |
4379 | journal_t *journal; | 4384 | journal_t *journal; |
4380 | ext4_fsblk_t start; | 4385 | ext4_fsblk_t start; |
4381 | ext4_fsblk_t len; | 4386 | ext4_fsblk_t len; |
4382 | int hblock, blocksize; | 4387 | int hblock, blocksize; |
4383 | ext4_fsblk_t sb_block; | 4388 | ext4_fsblk_t sb_block; |
4384 | unsigned long offset; | 4389 | unsigned long offset; |
4385 | struct ext4_super_block *es; | 4390 | struct ext4_super_block *es; |
4386 | struct block_device *bdev; | 4391 | struct block_device *bdev; |
4387 | 4392 | ||
4388 | BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); | 4393 | BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); |
4389 | 4394 | ||
4390 | bdev = ext4_blkdev_get(j_dev, sb); | 4395 | bdev = ext4_blkdev_get(j_dev, sb); |
4391 | if (bdev == NULL) | 4396 | if (bdev == NULL) |
4392 | return NULL; | 4397 | return NULL; |
4393 | 4398 | ||
4394 | blocksize = sb->s_blocksize; | 4399 | blocksize = sb->s_blocksize; |
4395 | hblock = bdev_logical_block_size(bdev); | 4400 | hblock = bdev_logical_block_size(bdev); |
4396 | if (blocksize < hblock) { | 4401 | if (blocksize < hblock) { |
4397 | ext4_msg(sb, KERN_ERR, | 4402 | ext4_msg(sb, KERN_ERR, |
4398 | "blocksize too small for journal device"); | 4403 | "blocksize too small for journal device"); |
4399 | goto out_bdev; | 4404 | goto out_bdev; |
4400 | } | 4405 | } |
4401 | 4406 | ||
4402 | sb_block = EXT4_MIN_BLOCK_SIZE / blocksize; | 4407 | sb_block = EXT4_MIN_BLOCK_SIZE / blocksize; |
4403 | offset = EXT4_MIN_BLOCK_SIZE % blocksize; | 4408 | offset = EXT4_MIN_BLOCK_SIZE % blocksize; |
4404 | set_blocksize(bdev, blocksize); | 4409 | set_blocksize(bdev, blocksize); |
4405 | if (!(bh = __bread(bdev, sb_block, blocksize))) { | 4410 | if (!(bh = __bread(bdev, sb_block, blocksize))) { |
4406 | ext4_msg(sb, KERN_ERR, "couldn't read superblock of " | 4411 | ext4_msg(sb, KERN_ERR, "couldn't read superblock of " |
4407 | "external journal"); | 4412 | "external journal"); |
4408 | goto out_bdev; | 4413 | goto out_bdev; |
4409 | } | 4414 | } |
4410 | 4415 | ||
4411 | es = (struct ext4_super_block *) (bh->b_data + offset); | 4416 | es = (struct ext4_super_block *) (bh->b_data + offset); |
4412 | if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) || | 4417 | if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) || |
4413 | !(le32_to_cpu(es->s_feature_incompat) & | 4418 | !(le32_to_cpu(es->s_feature_incompat) & |
4414 | EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { | 4419 | EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { |
4415 | ext4_msg(sb, KERN_ERR, "external journal has " | 4420 | ext4_msg(sb, KERN_ERR, "external journal has " |
4416 | "bad superblock"); | 4421 | "bad superblock"); |
4417 | brelse(bh); | 4422 | brelse(bh); |
4418 | goto out_bdev; | 4423 | goto out_bdev; |
4419 | } | 4424 | } |
4420 | 4425 | ||
4421 | if ((le32_to_cpu(es->s_feature_ro_compat) & | 4426 | if ((le32_to_cpu(es->s_feature_ro_compat) & |
4422 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && | 4427 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && |
4423 | es->s_checksum != ext4_superblock_csum(sb, es)) { | 4428 | es->s_checksum != ext4_superblock_csum(sb, es)) { |
4424 | ext4_msg(sb, KERN_ERR, "external journal has " | 4429 | ext4_msg(sb, KERN_ERR, "external journal has " |
4425 | "corrupt superblock"); | 4430 | "corrupt superblock"); |
4426 | brelse(bh); | 4431 | brelse(bh); |
4427 | goto out_bdev; | 4432 | goto out_bdev; |
4428 | } | 4433 | } |
4429 | 4434 | ||
4430 | if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { | 4435 | if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { |
4431 | ext4_msg(sb, KERN_ERR, "journal UUID does not match"); | 4436 | ext4_msg(sb, KERN_ERR, "journal UUID does not match"); |
4432 | brelse(bh); | 4437 | brelse(bh); |
4433 | goto out_bdev; | 4438 | goto out_bdev; |
4434 | } | 4439 | } |
4435 | 4440 | ||
4436 | len = ext4_blocks_count(es); | 4441 | len = ext4_blocks_count(es); |
4437 | start = sb_block + 1; | 4442 | start = sb_block + 1; |
4438 | brelse(bh); /* we're done with the superblock */ | 4443 | brelse(bh); /* we're done with the superblock */ |
4439 | 4444 | ||
4440 | journal = jbd2_journal_init_dev(bdev, sb->s_bdev, | 4445 | journal = jbd2_journal_init_dev(bdev, sb->s_bdev, |
4441 | start, len, blocksize); | 4446 | start, len, blocksize); |
4442 | if (!journal) { | 4447 | if (!journal) { |
4443 | ext4_msg(sb, KERN_ERR, "failed to create device journal"); | 4448 | ext4_msg(sb, KERN_ERR, "failed to create device journal"); |
4444 | goto out_bdev; | 4449 | goto out_bdev; |
4445 | } | 4450 | } |
4446 | journal->j_private = sb; | 4451 | journal->j_private = sb; |
4447 | ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer); | 4452 | ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer); |
4448 | wait_on_buffer(journal->j_sb_buffer); | 4453 | wait_on_buffer(journal->j_sb_buffer); |
4449 | if (!buffer_uptodate(journal->j_sb_buffer)) { | 4454 | if (!buffer_uptodate(journal->j_sb_buffer)) { |
4450 | ext4_msg(sb, KERN_ERR, "I/O error on journal device"); | 4455 | ext4_msg(sb, KERN_ERR, "I/O error on journal device"); |
4451 | goto out_journal; | 4456 | goto out_journal; |
4452 | } | 4457 | } |
4453 | if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { | 4458 | if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { |
4454 | ext4_msg(sb, KERN_ERR, "External journal has more than one " | 4459 | ext4_msg(sb, KERN_ERR, "External journal has more than one " |
4455 | "user (unsupported) - %d", | 4460 | "user (unsupported) - %d", |
4456 | be32_to_cpu(journal->j_superblock->s_nr_users)); | 4461 | be32_to_cpu(journal->j_superblock->s_nr_users)); |
4457 | goto out_journal; | 4462 | goto out_journal; |
4458 | } | 4463 | } |
4459 | EXT4_SB(sb)->journal_bdev = bdev; | 4464 | EXT4_SB(sb)->journal_bdev = bdev; |
4460 | ext4_init_journal_params(sb, journal); | 4465 | ext4_init_journal_params(sb, journal); |
4461 | return journal; | 4466 | return journal; |
4462 | 4467 | ||
4463 | out_journal: | 4468 | out_journal: |
4464 | jbd2_journal_destroy(journal); | 4469 | jbd2_journal_destroy(journal); |
4465 | out_bdev: | 4470 | out_bdev: |
4466 | ext4_blkdev_put(bdev); | 4471 | ext4_blkdev_put(bdev); |
4467 | return NULL; | 4472 | return NULL; |
4468 | } | 4473 | } |
4469 | 4474 | ||
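In ext4_get_dev_journal() above, the external journal's ext4 superblock lives at byte offset 1024 (EXT4_MIN_BLOCK_SIZE), so the code splits that fixed byte offset into a block number for __bread() and an offset within the block. A worked example of the arithmetic, plain C with no kernel dependencies:

#include <stdio.h>

#define EXT4_MIN_BLOCK_SIZE 1024

int main(void)
{
    int sizes[] = { 1024, 2048, 4096 };

    for (int i = 0; i < 3; i++) {
        int bs = sizes[i];
        unsigned long sb_block = EXT4_MIN_BLOCK_SIZE / bs;
        unsigned long offset  = EXT4_MIN_BLOCK_SIZE % bs;
        /* 1024: block 1, offset 0;  2048: block 0, offset 1024;
         * 4096: block 0, offset 1024 */
        printf("blocksize %4d -> sb_block %lu, offset %lu\n",
               bs, sb_block, offset);
    }
    return 0;
}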
4470 | static int ext4_load_journal(struct super_block *sb, | 4475 | static int ext4_load_journal(struct super_block *sb, |
4471 | struct ext4_super_block *es, | 4476 | struct ext4_super_block *es, |
4472 | unsigned long journal_devnum) | 4477 | unsigned long journal_devnum) |
4473 | { | 4478 | { |
4474 | journal_t *journal; | 4479 | journal_t *journal; |
4475 | unsigned int journal_inum = le32_to_cpu(es->s_journal_inum); | 4480 | unsigned int journal_inum = le32_to_cpu(es->s_journal_inum); |
4476 | dev_t journal_dev; | 4481 | dev_t journal_dev; |
4477 | int err = 0; | 4482 | int err = 0; |
4478 | int really_read_only; | 4483 | int really_read_only; |
4479 | 4484 | ||
4480 | BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); | 4485 | BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); |
4481 | 4486 | ||
4482 | if (journal_devnum && | 4487 | if (journal_devnum && |
4483 | journal_devnum != le32_to_cpu(es->s_journal_dev)) { | 4488 | journal_devnum != le32_to_cpu(es->s_journal_dev)) { |
4484 | ext4_msg(sb, KERN_INFO, "external journal device major/minor " | 4489 | ext4_msg(sb, KERN_INFO, "external journal device major/minor " |
4485 | "numbers have changed"); | 4490 | "numbers have changed"); |
4486 | journal_dev = new_decode_dev(journal_devnum); | 4491 | journal_dev = new_decode_dev(journal_devnum); |
4487 | } else | 4492 | } else |
4488 | journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); | 4493 | journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); |
4489 | 4494 | ||
4490 | really_read_only = bdev_read_only(sb->s_bdev); | 4495 | really_read_only = bdev_read_only(sb->s_bdev); |
4491 | 4496 | ||
4492 | /* | 4497 | /* |
4493 | * Are we loading a blank journal or performing recovery after a | 4498 | * Are we loading a blank journal or performing recovery after a |
4494 | * crash? For recovery, we need to check in advance whether we | 4499 | * crash? For recovery, we need to check in advance whether we |
4495 | * can get read-write access to the device. | 4500 | * can get read-write access to the device. |
4496 | */ | 4501 | */ |
4497 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { | 4502 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { |
4498 | if (sb->s_flags & MS_RDONLY) { | 4503 | if (sb->s_flags & MS_RDONLY) { |
4499 | ext4_msg(sb, KERN_INFO, "recovery " | 4504 | ext4_msg(sb, KERN_INFO, "recovery " |
4500 | "required on readonly filesystem"); | 4505 | "required on readonly filesystem"); |
4501 | if (really_read_only) { | 4506 | if (really_read_only) { |
4502 | ext4_msg(sb, KERN_ERR, "write access " | 4507 | ext4_msg(sb, KERN_ERR, "write access " |
4503 | "unavailable, cannot proceed"); | 4508 | "unavailable, cannot proceed"); |
4504 | return -EROFS; | 4509 | return -EROFS; |
4505 | } | 4510 | } |
4506 | ext4_msg(sb, KERN_INFO, "write access will " | 4511 | ext4_msg(sb, KERN_INFO, "write access will " |
4507 | "be enabled during recovery"); | 4512 | "be enabled during recovery"); |
4508 | } | 4513 | } |
4509 | } | 4514 | } |
4510 | 4515 | ||
4511 | if (journal_inum && journal_dev) { | 4516 | if (journal_inum && journal_dev) { |
4512 | ext4_msg(sb, KERN_ERR, "filesystem has both journal " | 4517 | ext4_msg(sb, KERN_ERR, "filesystem has both journal " |
4513 | "and inode journals!"); | 4518 | "and inode journals!"); |
4514 | return -EINVAL; | 4519 | return -EINVAL; |
4515 | } | 4520 | } |
4516 | 4521 | ||
4517 | if (journal_inum) { | 4522 | if (journal_inum) { |
4518 | if (!(journal = ext4_get_journal(sb, journal_inum))) | 4523 | if (!(journal = ext4_get_journal(sb, journal_inum))) |
4519 | return -EINVAL; | 4524 | return -EINVAL; |
4520 | } else { | 4525 | } else { |
4521 | if (!(journal = ext4_get_dev_journal(sb, journal_dev))) | 4526 | if (!(journal = ext4_get_dev_journal(sb, journal_dev))) |
4522 | return -EINVAL; | 4527 | return -EINVAL; |
4523 | } | 4528 | } |
4524 | 4529 | ||
4525 | if (!(journal->j_flags & JBD2_BARRIER)) | 4530 | if (!(journal->j_flags & JBD2_BARRIER)) |
4526 | ext4_msg(sb, KERN_INFO, "barriers disabled"); | 4531 | ext4_msg(sb, KERN_INFO, "barriers disabled"); |
4527 | 4532 | ||
4528 | if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) | 4533 | if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) |
4529 | err = jbd2_journal_wipe(journal, !really_read_only); | 4534 | err = jbd2_journal_wipe(journal, !really_read_only); |
4530 | if (!err) { | 4535 | if (!err) { |
4531 | char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL); | 4536 | char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL); |
4532 | if (save) | 4537 | if (save) |
4533 | memcpy(save, ((char *) es) + | 4538 | memcpy(save, ((char *) es) + |
4534 | EXT4_S_ERR_START, EXT4_S_ERR_LEN); | 4539 | EXT4_S_ERR_START, EXT4_S_ERR_LEN); |
4535 | err = jbd2_journal_load(journal); | 4540 | err = jbd2_journal_load(journal); |
4536 | if (save) | 4541 | if (save) |
4537 | memcpy(((char *) es) + EXT4_S_ERR_START, | 4542 | memcpy(((char *) es) + EXT4_S_ERR_START, |
4538 | save, EXT4_S_ERR_LEN); | 4543 | save, EXT4_S_ERR_LEN); |
4539 | kfree(save); | 4544 | kfree(save); |
4540 | } | 4545 | } |
4541 | 4546 | ||
4542 | if (err) { | 4547 | if (err) { |
4543 | ext4_msg(sb, KERN_ERR, "error loading journal"); | 4548 | ext4_msg(sb, KERN_ERR, "error loading journal"); |
4544 | jbd2_journal_destroy(journal); | 4549 | jbd2_journal_destroy(journal); |
4545 | return err; | 4550 | return err; |
4546 | } | 4551 | } |
4547 | 4552 | ||
4548 | EXT4_SB(sb)->s_journal = journal; | 4553 | EXT4_SB(sb)->s_journal = journal; |
4549 | ext4_clear_journal_err(sb, es); | 4554 | ext4_clear_journal_err(sb, es); |
4550 | 4555 | ||
4551 | if (!really_read_only && journal_devnum && | 4556 | if (!really_read_only && journal_devnum && |
4552 | journal_devnum != le32_to_cpu(es->s_journal_dev)) { | 4557 | journal_devnum != le32_to_cpu(es->s_journal_dev)) { |
4553 | es->s_journal_dev = cpu_to_le32(journal_devnum); | 4558 | es->s_journal_dev = cpu_to_le32(journal_devnum); |
4554 | 4559 | ||
4555 | /* Make sure we flush the recovery flag to disk. */ | 4560 | /* Make sure we flush the recovery flag to disk. */ |
4556 | ext4_commit_super(sb, 1); | 4561 | ext4_commit_super(sb, 1); |
4557 | } | 4562 | } |
4558 | 4563 | ||
4559 | return 0; | 4564 | return 0; |
4560 | } | 4565 | } |
4561 | 4566 | ||
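The kmalloc/memcpy dance around jbd2_journal_load() above exists because journal replay can rewrite the on-disk superblock, including the error-tracking region between EXT4_S_ERR_START and EXT4_S_ERR_START + EXT4_S_ERR_LEN; the code saves those bytes first and copies them back afterwards so previously recorded errors survive recovery. A self-contained sketch, where replay() is a hypothetical function that clobbers the buffer, standing in for journal replay:

#include <stdio.h>
#include <string.h>

#define ERR_START 8
#define ERR_LEN   8

static void replay(char *sb, size_t len)
{
    memset(sb, 0, len);           /* replay rewrites the whole block */
}

int main(void)
{
    char sb[32];
    char save[ERR_LEN];

    memset(sb, 'x', sizeof(sb));
    memcpy(sb + ERR_START, "ERRDATA", ERR_LEN);  /* error-tracking region */

    memcpy(save, sb + ERR_START, ERR_LEN);       /* save before replay */
    replay(sb, sizeof(sb));
    memcpy(sb + ERR_START, save, ERR_LEN);       /* restore afterwards */

    printf("error region survived: %.7s\n", sb + ERR_START);
    return 0;
}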
4562 | static int ext4_commit_super(struct super_block *sb, int sync) | 4567 | static int ext4_commit_super(struct super_block *sb, int sync) |
4563 | { | 4568 | { |
4564 | struct ext4_super_block *es = EXT4_SB(sb)->s_es; | 4569 | struct ext4_super_block *es = EXT4_SB(sb)->s_es; |
4565 | struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; | 4570 | struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; |
4566 | int error = 0; | 4571 | int error = 0; |
4567 | 4572 | ||
4568 | if (!sbh || block_device_ejected(sb)) | 4573 | if (!sbh || block_device_ejected(sb)) |
4569 | return error; | 4574 | return error; |
4570 | if (buffer_write_io_error(sbh)) { | 4575 | if (buffer_write_io_error(sbh)) { |
4571 | /* | 4576 | /* |
4572 | * Oh, dear. A previous attempt to write the | 4577 | * Oh, dear. A previous attempt to write the |
4573 | * superblock failed. This could happen because the | 4578 | * superblock failed. This could happen because the |
4574 | * USB device was yanked out. Or it could happen to | 4579 | * USB device was yanked out. Or it could happen to |
4575 | * be a transient write error and maybe the block will | 4580 | * be a transient write error and maybe the block will |
4576 | * be remapped. Nothing we can do but retry the | 4581 | * be remapped. Nothing we can do but retry the |
4577 | * write and hope for the best. | 4582 | * write and hope for the best. |
4578 | */ | 4583 | */ |
4579 | ext4_msg(sb, KERN_ERR, "previous I/O error to " | 4584 | ext4_msg(sb, KERN_ERR, "previous I/O error to " |
4580 | "superblock detected"); | 4585 | "superblock detected"); |
4581 | clear_buffer_write_io_error(sbh); | 4586 | clear_buffer_write_io_error(sbh); |
4582 | set_buffer_uptodate(sbh); | 4587 | set_buffer_uptodate(sbh); |
4583 | } | 4588 | } |
4584 | /* | 4589 | /* |
4585 | * If the file system is mounted read-only, don't update the | 4590 | * If the file system is mounted read-only, don't update the |
4586 | * superblock write time. This avoids updating the superblock | 4591 | * superblock write time. This avoids updating the superblock |
4587 | * write time when we are mounting the root file system | 4592 | * write time when we are mounting the root file system |
4588 | * read/only but we need to replay the journal; at that point, | 4593 | * read/only but we need to replay the journal; at that point, |
4589 | * for people who are east of GMT and who make their clock | 4594 | * for people who are east of GMT and who make their clock |
4590 | * tick in localtime for Windows bug-for-bug compatibility, | 4595 | * tick in localtime for Windows bug-for-bug compatibility, |
4591 | * the clock is set in the future, and this will cause e2fsck | 4596 | * the clock is set in the future, and this will cause e2fsck |
4592 | * to complain and force a full file system check. | 4597 | * to complain and force a full file system check. |
4593 | */ | 4598 | */ |
4594 | if (!(sb->s_flags & MS_RDONLY)) | 4599 | if (!(sb->s_flags & MS_RDONLY)) |
4595 | es->s_wtime = cpu_to_le32(get_seconds()); | 4600 | es->s_wtime = cpu_to_le32(get_seconds()); |
4596 | if (sb->s_bdev->bd_part) | 4601 | if (sb->s_bdev->bd_part) |
4597 | es->s_kbytes_written = | 4602 | es->s_kbytes_written = |
4598 | cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + | 4603 | cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + |
4599 | ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - | 4604 | ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - |
4600 | EXT4_SB(sb)->s_sectors_written_start) >> 1)); | 4605 | EXT4_SB(sb)->s_sectors_written_start) >> 1)); |
4601 | else | 4606 | else |
4602 | es->s_kbytes_written = | 4607 | es->s_kbytes_written = |
4603 | cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); | 4608 | cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); |
4604 | if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeclusters_counter)) | 4609 | if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeclusters_counter)) |
4605 | ext4_free_blocks_count_set(es, | 4610 | ext4_free_blocks_count_set(es, |
4606 | EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive( | 4611 | EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive( |
4607 | &EXT4_SB(sb)->s_freeclusters_counter))); | 4612 | &EXT4_SB(sb)->s_freeclusters_counter))); |
4608 | if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter)) | 4613 | if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter)) |
4609 | es->s_free_inodes_count = | 4614 | es->s_free_inodes_count = |
4610 | cpu_to_le32(percpu_counter_sum_positive( | 4615 | cpu_to_le32(percpu_counter_sum_positive( |
4611 | &EXT4_SB(sb)->s_freeinodes_counter)); | 4616 | &EXT4_SB(sb)->s_freeinodes_counter)); |
4612 | BUFFER_TRACE(sbh, "marking dirty"); | 4617 | BUFFER_TRACE(sbh, "marking dirty"); |
4613 | ext4_superblock_csum_set(sb); | 4618 | ext4_superblock_csum_set(sb); |
4614 | mark_buffer_dirty(sbh); | 4619 | mark_buffer_dirty(sbh); |
4615 | if (sync) { | 4620 | if (sync) { |
4616 | error = sync_dirty_buffer(sbh); | 4621 | error = sync_dirty_buffer(sbh); |
4617 | if (error) | 4622 | if (error) |
4618 | return error; | 4623 | return error; |
4619 | 4624 | ||
4620 | error = buffer_write_io_error(sbh); | 4625 | error = buffer_write_io_error(sbh); |
4621 | if (error) { | 4626 | if (error) { |
4622 | ext4_msg(sb, KERN_ERR, "I/O error while writing " | 4627 | ext4_msg(sb, KERN_ERR, "I/O error while writing " |
4623 | "superblock"); | 4628 | "superblock"); |
4624 | clear_buffer_write_io_error(sbh); | 4629 | clear_buffer_write_io_error(sbh); |
4625 | set_buffer_uptodate(sbh); | 4630 | set_buffer_uptodate(sbh); |
4626 | } | 4631 | } |
4627 | } | 4632 | } |
4628 | return error; | 4633 | return error; |
4629 | } | 4634 | } |
4630 | 4635 | ||
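The s_kbytes_written update in ext4_commit_super() above adds the sectors written since mount, shifted right by one: a sector is 512 bytes, so two sectors make one kilobyte. A worked example with made-up values:

#include <stdio.h>

int main(void)
{
    unsigned long long kbytes_at_mount = 1000;   /* from the superblock   */
    unsigned long long sectors_start   = 50000;  /* sampled at mount time */
    unsigned long long sectors_now     = 58192;  /* current disk stat     */

    unsigned long long kbytes_written =
        kbytes_at_mount + ((sectors_now - sectors_start) >> 1);

    /* 8192 sectors * 512 B = 4096 KiB -> 1000 + 4096 = 5096 */
    printf("s_kbytes_written = %llu\n", kbytes_written);
    return 0;
}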
4631 | /* | 4636 | /* |
4632 | * Have we just finished recovery? If so, and if we are mounting (or | 4637 | * Have we just finished recovery? If so, and if we are mounting (or |
4633 | * remounting) the filesystem readonly, then we will end up with a | 4638 | * remounting) the filesystem readonly, then we will end up with a |
4634 | * consistent fs on disk. Record that fact. | 4639 | * consistent fs on disk. Record that fact. |
4635 | */ | 4640 | */ |
4636 | static void ext4_mark_recovery_complete(struct super_block *sb, | 4641 | static void ext4_mark_recovery_complete(struct super_block *sb, |
4637 | struct ext4_super_block *es) | 4642 | struct ext4_super_block *es) |
4638 | { | 4643 | { |
4639 | journal_t *journal = EXT4_SB(sb)->s_journal; | 4644 | journal_t *journal = EXT4_SB(sb)->s_journal; |
4640 | 4645 | ||
4641 | if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { | 4646 | if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { |
4642 | BUG_ON(journal != NULL); | 4647 | BUG_ON(journal != NULL); |
4643 | return; | 4648 | return; |
4644 | } | 4649 | } |
4645 | jbd2_journal_lock_updates(journal); | 4650 | jbd2_journal_lock_updates(journal); |
4646 | if (jbd2_journal_flush(journal) < 0) | 4651 | if (jbd2_journal_flush(journal) < 0) |
4647 | goto out; | 4652 | goto out; |
4648 | 4653 | ||
4649 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) && | 4654 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) && |
4650 | sb->s_flags & MS_RDONLY) { | 4655 | sb->s_flags & MS_RDONLY) { |
4651 | EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); | 4656 | EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); |
4652 | ext4_commit_super(sb, 1); | 4657 | ext4_commit_super(sb, 1); |
4653 | } | 4658 | } |
4654 | 4659 | ||
4655 | out: | 4660 | out: |
4656 | jbd2_journal_unlock_updates(journal); | 4661 | jbd2_journal_unlock_updates(journal); |
4657 | } | 4662 | } |
4658 | 4663 | ||
4659 | /* | 4664 | /* |
4660 | * If we are mounting (or read-write remounting) a filesystem whose journal | 4665 | * If we are mounting (or read-write remounting) a filesystem whose journal |
4661 | * has recorded an error from a previous lifetime, move that error to the | 4666 | * has recorded an error from a previous lifetime, move that error to the |
4662 | * main filesystem now. | 4667 | * main filesystem now. |
4663 | */ | 4668 | */ |
4664 | static void ext4_clear_journal_err(struct super_block *sb, | 4669 | static void ext4_clear_journal_err(struct super_block *sb, |
4665 | struct ext4_super_block *es) | 4670 | struct ext4_super_block *es) |
4666 | { | 4671 | { |
4667 | journal_t *journal; | 4672 | journal_t *journal; |
4668 | int j_errno; | 4673 | int j_errno; |
4669 | const char *errstr; | 4674 | const char *errstr; |
4670 | 4675 | ||
4671 | BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); | 4676 | BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); |
4672 | 4677 | ||
4673 | journal = EXT4_SB(sb)->s_journal; | 4678 | journal = EXT4_SB(sb)->s_journal; |
4674 | 4679 | ||
4675 | /* | 4680 | /* |
4676 | * Now check for any error status which may have been recorded in the | 4681 | * Now check for any error status which may have been recorded in the |
4677 | * journal by a prior ext4_error() or ext4_abort() | 4682 | * journal by a prior ext4_error() or ext4_abort() |
4678 | */ | 4683 | */ |
4679 | 4684 | ||
4680 | j_errno = jbd2_journal_errno(journal); | 4685 | j_errno = jbd2_journal_errno(journal); |
4681 | if (j_errno) { | 4686 | if (j_errno) { |
4682 | char nbuf[16]; | 4687 | char nbuf[16]; |
4683 | 4688 | ||
4684 | errstr = ext4_decode_error(sb, j_errno, nbuf); | 4689 | errstr = ext4_decode_error(sb, j_errno, nbuf); |
4685 | ext4_warning(sb, "Filesystem error recorded " | 4690 | ext4_warning(sb, "Filesystem error recorded " |
4686 | "from previous mount: %s", errstr); | 4691 | "from previous mount: %s", errstr); |
4687 | ext4_warning(sb, "Marking fs in need of filesystem check."); | 4692 | ext4_warning(sb, "Marking fs in need of filesystem check."); |
4688 | 4693 | ||
4689 | EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; | 4694 | EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; |
4690 | es->s_state |= cpu_to_le16(EXT4_ERROR_FS); | 4695 | es->s_state |= cpu_to_le16(EXT4_ERROR_FS); |
4691 | ext4_commit_super(sb, 1); | 4696 | ext4_commit_super(sb, 1); |
4692 | 4697 | ||
4693 | jbd2_journal_clear_err(journal); | 4698 | jbd2_journal_clear_err(journal); |
4694 | jbd2_journal_update_sb_errno(journal); | 4699 | jbd2_journal_update_sb_errno(journal); |
4695 | } | 4700 | } |
4696 | } | 4701 | } |
4697 | 4702 | ||
4698 | /* | 4703 | /* |
4699 | * Force the running and committing transactions to commit, | 4704 | * Force the running and committing transactions to commit, |
4700 | * and wait on the commit. | 4705 | * and wait on the commit. |
4701 | */ | 4706 | */ |
4702 | int ext4_force_commit(struct super_block *sb) | 4707 | int ext4_force_commit(struct super_block *sb) |
4703 | { | 4708 | { |
4704 | journal_t *journal; | 4709 | journal_t *journal; |
4705 | 4710 | ||
4706 | if (sb->s_flags & MS_RDONLY) | 4711 | if (sb->s_flags & MS_RDONLY) |
4707 | return 0; | 4712 | return 0; |
4708 | 4713 | ||
4709 | journal = EXT4_SB(sb)->s_journal; | 4714 | journal = EXT4_SB(sb)->s_journal; |
4710 | return ext4_journal_force_commit(journal); | 4715 | return ext4_journal_force_commit(journal); |
4711 | } | 4716 | } |
4712 | 4717 | ||
4713 | static int ext4_sync_fs(struct super_block *sb, int wait) | 4718 | static int ext4_sync_fs(struct super_block *sb, int wait) |
4714 | { | 4719 | { |
4715 | int ret = 0; | 4720 | int ret = 0; |
4716 | tid_t target; | 4721 | tid_t target; |
4717 | bool needs_barrier = false; | 4722 | bool needs_barrier = false; |
4718 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 4723 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
4719 | 4724 | ||
4720 | trace_ext4_sync_fs(sb, wait); | 4725 | trace_ext4_sync_fs(sb, wait); |
4721 | flush_workqueue(sbi->rsv_conversion_wq); | 4726 | flush_workqueue(sbi->rsv_conversion_wq); |
4722 | /* | 4727 | /* |
4723 | * Writeback quota in non-journalled quota case - journalled quota has | 4728 | * Writeback quota in non-journalled quota case - journalled quota has |
4724 | * no dirty dquots | 4729 | * no dirty dquots |
4725 | */ | 4730 | */ |
4726 | dquot_writeback_dquots(sb, -1); | 4731 | dquot_writeback_dquots(sb, -1); |
4727 | /* | 4732 | /* |
4728 | * Data writeback is possible w/o journal transaction, so barrier must | 4733 | * Data writeback is possible w/o journal transaction, so barrier must |
4729 | * be sent at the end of the function. But we can skip it if | 4734 | * be sent at the end of the function. But we can skip it if |
4730 | * transaction_commit will do it for us. | 4735 | * transaction_commit will do it for us. |
4731 | */ | 4736 | */ |
4732 | if (sbi->s_journal) { | 4737 | if (sbi->s_journal) { |
4733 | target = jbd2_get_latest_transaction(sbi->s_journal); | 4738 | target = jbd2_get_latest_transaction(sbi->s_journal); |
4734 | if (wait && sbi->s_journal->j_flags & JBD2_BARRIER && | 4739 | if (wait && sbi->s_journal->j_flags & JBD2_BARRIER && |
4735 | !jbd2_trans_will_send_data_barrier(sbi->s_journal, target)) | 4740 | !jbd2_trans_will_send_data_barrier(sbi->s_journal, target)) |
4736 | needs_barrier = true; | 4741 | needs_barrier = true; |
4737 | 4742 | ||
4738 | if (jbd2_journal_start_commit(sbi->s_journal, &target)) { | 4743 | if (jbd2_journal_start_commit(sbi->s_journal, &target)) { |
4739 | if (wait) | 4744 | if (wait) |
4740 | ret = jbd2_log_wait_commit(sbi->s_journal, | 4745 | ret = jbd2_log_wait_commit(sbi->s_journal, |
4741 | target); | 4746 | target); |
4742 | } | 4747 | } |
4743 | } else if (wait && test_opt(sb, BARRIER)) | 4748 | } else if (wait && test_opt(sb, BARRIER)) |
4744 | needs_barrier = true; | 4749 | needs_barrier = true; |
4745 | if (needs_barrier) { | 4750 | if (needs_barrier) { |
4746 | int err; | 4751 | int err; |
4747 | err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); | 4752 | err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); |
4748 | if (!ret) | 4753 | if (!ret) |
4749 | ret = err; | 4754 | ret = err; |
4750 | } | 4755 | } |
4751 | 4756 | ||
4752 | return ret; | 4757 | return ret; |
4753 | } | 4758 | } |
4754 | 4759 | ||
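ext4_sync_fs() above issues an explicit cache flush only when all three conditions line up: the caller actually waits, barriers are enabled, and the journal commit will not already send a barrier on its behalf. A sketch of that decision as pure logic, with no block-layer calls:

#include <stdio.h>
#include <stdbool.h>

static bool needs_barrier(bool wait, bool barrier_opt,
                          bool commit_sends_barrier)
{
    return wait && barrier_opt && !commit_sends_barrier;
}

int main(void)
{
    /* wait, barriers on, commit already flushes -> skip the flush */
    printf("%d\n", needs_barrier(true, true, true));    /* 0 */
    /* wait, barriers on, no commit flush -> issue the flush */
    printf("%d\n", needs_barrier(true, true, false));   /* 1 */
    /* async sync (wait == false) never flushes explicitly */
    printf("%d\n", needs_barrier(false, true, false));  /* 0 */
    return 0;
}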
4755 | /* | 4760 | /* |
4756 | * LVM calls this function before a (read-only) snapshot is created. This | 4761 | * LVM calls this function before a (read-only) snapshot is created. This |
4757 | * gives us a chance to flush the journal completely and mark the fs clean. | 4762 | * gives us a chance to flush the journal completely and mark the fs clean. |
4758 | * | 4763 | * |
4759 | * Note that this function alone cannot bring the filesystem to a clean | 4764 | * Note that this function alone cannot bring the filesystem to a clean |
4760 | * state; it relies on the upper layer to stop all data & metadata | 4765 | * state; it relies on the upper layer to stop all data & metadata |
4761 | * modifications. | 4766 | * modifications. |
4762 | */ | 4767 | */ |
4763 | static int ext4_freeze(struct super_block *sb) | 4768 | static int ext4_freeze(struct super_block *sb) |
4764 | { | 4769 | { |
4765 | int error = 0; | 4770 | int error = 0; |
4766 | journal_t *journal; | 4771 | journal_t *journal; |
4767 | 4772 | ||
4768 | if (sb->s_flags & MS_RDONLY) | 4773 | if (sb->s_flags & MS_RDONLY) |
4769 | return 0; | 4774 | return 0; |
4770 | 4775 | ||
4771 | journal = EXT4_SB(sb)->s_journal; | 4776 | journal = EXT4_SB(sb)->s_journal; |
4772 | 4777 | ||
4773 | if (journal) { | 4778 | if (journal) { |
4774 | /* Now we set up the journal barrier. */ | 4779 | /* Now we set up the journal barrier. */ |
4775 | jbd2_journal_lock_updates(journal); | 4780 | jbd2_journal_lock_updates(journal); |
4776 | 4781 | ||
4777 | /* | 4782 | /* |
4778 | * Don't clear the needs_recovery flag if we failed to | 4783 | * Don't clear the needs_recovery flag if we failed to |
4779 | * flush the journal. | 4784 | * flush the journal. |
4780 | */ | 4785 | */ |
4781 | error = jbd2_journal_flush(journal); | 4786 | error = jbd2_journal_flush(journal); |
4782 | if (error < 0) | 4787 | if (error < 0) |
4783 | goto out; | 4788 | goto out; |
4784 | } | 4789 | } |
4785 | 4790 | ||
4786 | /* Journal blocked and flushed, clear needs_recovery flag. */ | 4791 | /* Journal blocked and flushed, clear needs_recovery flag. */ |
4787 | EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); | 4792 | EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); |
4788 | error = ext4_commit_super(sb, 1); | 4793 | error = ext4_commit_super(sb, 1); |
4789 | out: | 4794 | out: |
4790 | if (journal) | 4795 | if (journal) |
4791 | /* we rely on upper layer to stop further updates */ | 4796 | /* we rely on upper layer to stop further updates */ |
4792 | jbd2_journal_unlock_updates(journal); | 4797 | jbd2_journal_unlock_updates(journal); |
4793 | return error; | 4798 | return error; |
4794 | } | 4799 | } |
4795 | 4800 | ||
4796 | /* | 4801 | /* |
4797 | * Called by LVM after the snapshot is done. We need to reset the RECOVER | 4802 | * Called by LVM after the snapshot is done. We need to reset the RECOVER |
4798 | * flag here, even though the filesystem is not technically dirty yet. | 4803 | * flag here, even though the filesystem is not technically dirty yet. |
4799 | */ | 4804 | */ |
4800 | static int ext4_unfreeze(struct super_block *sb) | 4805 | static int ext4_unfreeze(struct super_block *sb) |
4801 | { | 4806 | { |
4802 | if (sb->s_flags & MS_RDONLY) | 4807 | if (sb->s_flags & MS_RDONLY) |
4803 | return 0; | 4808 | return 0; |
4804 | 4809 | ||
4805 | /* Reset the needs_recovery flag before the fs is unlocked. */ | 4810 | /* Reset the needs_recovery flag before the fs is unlocked. */ |
4806 | EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); | 4811 | EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); |
4807 | ext4_commit_super(sb, 1); | 4812 | ext4_commit_super(sb, 1); |
4808 | return 0; | 4813 | return 0; |
4809 | } | 4814 | } |
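
A hedged userspace sketch of how ext4_freeze()/ext4_unfreeze() are reached: the FIFREEZE and FITHAW ioctls enter the VFS freeze_super()/thaw_super(), which call the filesystem's freeze_fs and unfreeze_fs hooks; LVM reaches the same hooks through the block layer when it creates a snapshot. The mount point is illustrative.

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FIFREEZE, FITHAW */

int main(void)
{
	int fd = open("/mnt", O_RDONLY);

	if (fd < 0)
		return 1;
	ioctl(fd, FIFREEZE, 0);		/* freeze_super() -> ext4_freeze() */
	/* ... take the read-only snapshot here ... */
	ioctl(fd, FITHAW, 0);		/* thaw_super() -> ext4_unfreeze() */
	close(fd);
	return 0;
}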
4810 | 4815 | ||
4811 | /* | 4816 | /* |
4812 | * Structure to save mount options for ext4_remount's benefit | 4817 | * Structure to save mount options for ext4_remount's benefit |
4813 | */ | 4818 | */ |
4814 | struct ext4_mount_options { | 4819 | struct ext4_mount_options { |
4815 | unsigned long s_mount_opt; | 4820 | unsigned long s_mount_opt; |
4816 | unsigned long s_mount_opt2; | 4821 | unsigned long s_mount_opt2; |
4817 | kuid_t s_resuid; | 4822 | kuid_t s_resuid; |
4818 | kgid_t s_resgid; | 4823 | kgid_t s_resgid; |
4819 | unsigned long s_commit_interval; | 4824 | unsigned long s_commit_interval; |
4820 | u32 s_min_batch_time, s_max_batch_time; | 4825 | u32 s_min_batch_time, s_max_batch_time; |
4821 | #ifdef CONFIG_QUOTA | 4826 | #ifdef CONFIG_QUOTA |
4822 | int s_jquota_fmt; | 4827 | int s_jquota_fmt; |
4823 | char *s_qf_names[EXT4_MAXQUOTAS]; | 4828 | char *s_qf_names[EXT4_MAXQUOTAS]; |
4824 | #endif | 4829 | #endif |
4825 | }; | 4830 | }; |
4826 | 4831 | ||
4827 | static int ext4_remount(struct super_block *sb, int *flags, char *data) | 4832 | static int ext4_remount(struct super_block *sb, int *flags, char *data) |
4828 | { | 4833 | { |
4829 | struct ext4_super_block *es; | 4834 | struct ext4_super_block *es; |
4830 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 4835 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
4831 | unsigned long old_sb_flags; | 4836 | unsigned long old_sb_flags; |
4832 | struct ext4_mount_options old_opts; | 4837 | struct ext4_mount_options old_opts; |
4833 | int enable_quota = 0; | 4838 | int enable_quota = 0; |
4834 | ext4_group_t g; | 4839 | ext4_group_t g; |
4835 | unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; | 4840 | unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; |
4836 | int err = 0; | 4841 | int err = 0; |
4837 | #ifdef CONFIG_QUOTA | 4842 | #ifdef CONFIG_QUOTA |
4838 | int i, j; | 4843 | int i, j; |
4839 | #endif | 4844 | #endif |
4840 | char *orig_data = kstrdup(data, GFP_KERNEL); | 4845 | char *orig_data = kstrdup(data, GFP_KERNEL); |
4841 | 4846 | ||
4842 | /* Store the original options */ | 4847 | /* Store the original options */ |
4843 | old_sb_flags = sb->s_flags; | 4848 | old_sb_flags = sb->s_flags; |
4844 | old_opts.s_mount_opt = sbi->s_mount_opt; | 4849 | old_opts.s_mount_opt = sbi->s_mount_opt; |
4845 | old_opts.s_mount_opt2 = sbi->s_mount_opt2; | 4850 | old_opts.s_mount_opt2 = sbi->s_mount_opt2; |
4846 | old_opts.s_resuid = sbi->s_resuid; | 4851 | old_opts.s_resuid = sbi->s_resuid; |
4847 | old_opts.s_resgid = sbi->s_resgid; | 4852 | old_opts.s_resgid = sbi->s_resgid; |
4848 | old_opts.s_commit_interval = sbi->s_commit_interval; | 4853 | old_opts.s_commit_interval = sbi->s_commit_interval; |
4849 | old_opts.s_min_batch_time = sbi->s_min_batch_time; | 4854 | old_opts.s_min_batch_time = sbi->s_min_batch_time; |
4850 | old_opts.s_max_batch_time = sbi->s_max_batch_time; | 4855 | old_opts.s_max_batch_time = sbi->s_max_batch_time; |
4851 | #ifdef CONFIG_QUOTA | 4856 | #ifdef CONFIG_QUOTA |
4852 | old_opts.s_jquota_fmt = sbi->s_jquota_fmt; | 4857 | old_opts.s_jquota_fmt = sbi->s_jquota_fmt; |
4853 | for (i = 0; i < EXT4_MAXQUOTAS; i++) | 4858 | for (i = 0; i < EXT4_MAXQUOTAS; i++) |
4854 | if (sbi->s_qf_names[i]) { | 4859 | if (sbi->s_qf_names[i]) { |
4855 | old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i], | 4860 | old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i], |
4856 | GFP_KERNEL); | 4861 | GFP_KERNEL); |
4857 | if (!old_opts.s_qf_names[i]) { | 4862 | if (!old_opts.s_qf_names[i]) { |
4858 | for (j = 0; j < i; j++) | 4863 | for (j = 0; j < i; j++) |
4859 | kfree(old_opts.s_qf_names[j]); | 4864 | kfree(old_opts.s_qf_names[j]); |
4860 | kfree(orig_data); | 4865 | kfree(orig_data); |
4861 | return -ENOMEM; | 4866 | return -ENOMEM; |
4862 | } | 4867 | } |
4863 | } else | 4868 | } else |
4864 | old_opts.s_qf_names[i] = NULL; | 4869 | old_opts.s_qf_names[i] = NULL; |
4865 | #endif | 4870 | #endif |
4866 | if (sbi->s_journal && sbi->s_journal->j_task->io_context) | 4871 | if (sbi->s_journal && sbi->s_journal->j_task->io_context) |
4867 | journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; | 4872 | journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; |
4868 | 4873 | ||
4869 | /* | ||
4870 | * Allow the "check" option to be passed as a remount option. | ||
4871 | */ | ||
4872 | if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) { | 4874 | if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) { |
4873 | err = -EINVAL; | 4875 | err = -EINVAL; |
4874 | goto restore_opts; | 4876 | goto restore_opts; |
4875 | } | 4877 | } |
4876 | 4878 | ||
4877 | if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^ | 4879 | if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^ |
4878 | test_opt(sb, JOURNAL_CHECKSUM)) { | 4880 | test_opt(sb, JOURNAL_CHECKSUM)) { |
4879 | ext4_msg(sb, KERN_ERR, "changing journal_checksum " | 4881 | ext4_msg(sb, KERN_ERR, "changing journal_checksum " |
4880 | "during remount not supported"); | 4882 | "during remount not supported; ignoring"); |
4881 | err = -EINVAL; | 4883 | sbi->s_mount_opt ^= EXT4_MOUNT_JOURNAL_CHECKSUM; |
4882 | goto restore_opts; | ||
4883 | } | 4884 | } |
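
The replacement error path above downgrades the old hard failure (-EINVAL) to a warning and keeps the remount going: parse_options() has already toggled the JOURNAL_CHECKSUM bit, and the ^= flips it straight back. A distilled, hedged sketch of the XOR-revert idiom (names hypothetical): since x ^ m ^ m == x, undoing an unsupported option toggle is a single XOR with the same mask.

/* Undo a previous 'opts ^= mask' toggle; XORing with the same
 * mask twice restores the original value. */
static unsigned long revert_toggle(unsigned long opts, unsigned long mask)
{
	return opts ^ mask;
}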
4884 | 4885 | ||
4885 | if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^ | ||
4886 | test_opt(sb, JOURNAL_CHECKSUM)) { | ||
4887 | ext4_msg(sb, KERN_ERR, "changing journal_checksum " | ||
4888 | "during remount not supported"); | ||
4889 | err = -EINVAL; | ||
4890 | goto restore_opts; | ||
4891 | } | ||
4892 | |||
4893 | if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { | 4886 | if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { |
4894 | if (test_opt2(sb, EXPLICIT_DELALLOC)) { | 4887 | if (test_opt2(sb, EXPLICIT_DELALLOC)) { |
4895 | ext4_msg(sb, KERN_ERR, "can't mount with " | 4888 | ext4_msg(sb, KERN_ERR, "can't mount with " |
4896 | "both data=journal and delalloc"); | 4889 | "both data=journal and delalloc"); |
4897 | err = -EINVAL; | 4890 | err = -EINVAL; |
4898 | goto restore_opts; | 4891 | goto restore_opts; |
4899 | } | 4892 | } |
4900 | if (test_opt(sb, DIOREAD_NOLOCK)) { | 4893 | if (test_opt(sb, DIOREAD_NOLOCK)) { |
4901 | ext4_msg(sb, KERN_ERR, "can't mount with " | 4894 | ext4_msg(sb, KERN_ERR, "can't mount with " |
4902 | "both data=journal and dioread_nolock"); | 4895 | "both data=journal and dioread_nolock"); |
4903 | err = -EINVAL; | 4896 | err = -EINVAL; |
4904 | goto restore_opts; | 4897 | goto restore_opts; |
4905 | } | 4898 | } |
4906 | if (test_opt(sb, DAX)) { | 4899 | if (test_opt(sb, DAX)) { |
4907 | ext4_msg(sb, KERN_ERR, "can't mount with " | 4900 | ext4_msg(sb, KERN_ERR, "can't mount with " |
4908 | "both data=journal and dax"); | 4901 | "both data=journal and dax"); |
4909 | err = -EINVAL; | 4902 | err = -EINVAL; |
4910 | goto restore_opts; | 4903 | goto restore_opts; |
4911 | } | 4904 | } |
4912 | } | 4905 | } |
4913 | 4906 | ||
4914 | if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) { | 4907 | if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) { |
4915 | ext4_msg(sb, KERN_WARNING, "warning: refusing change of " | 4908 | ext4_msg(sb, KERN_WARNING, "warning: refusing change of " |
4916 | "dax flag with busy inodes while remounting"); | 4909 | "dax flag with busy inodes while remounting"); |
4917 | sbi->s_mount_opt ^= EXT4_MOUNT_DAX; | 4910 | sbi->s_mount_opt ^= EXT4_MOUNT_DAX; |
4918 | } | 4911 | } |
4919 | 4912 | ||
4920 | if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) | 4913 | if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) |
4921 | ext4_abort(sb, "Abort forced by user"); | 4914 | ext4_abort(sb, "Abort forced by user"); |
4922 | 4915 | ||
4923 | sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | | 4916 | sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | |
4924 | (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); | 4917 | (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); |
4925 | 4918 | ||
4926 | es = sbi->s_es; | 4919 | es = sbi->s_es; |
4927 | 4920 | ||
4928 | if (sbi->s_journal) { | 4921 | if (sbi->s_journal) { |
4929 | ext4_init_journal_params(sb, sbi->s_journal); | 4922 | ext4_init_journal_params(sb, sbi->s_journal); |
4930 | set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); | 4923 | set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); |
4931 | } | 4924 | } |
4932 | 4925 | ||
4933 | if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { | 4926 | if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { |
4934 | if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { | 4927 | if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { |
4935 | err = -EROFS; | 4928 | err = -EROFS; |
4936 | goto restore_opts; | 4929 | goto restore_opts; |
4937 | } | 4930 | } |
4938 | 4931 | ||
4939 | if (*flags & MS_RDONLY) { | 4932 | if (*flags & MS_RDONLY) { |
4940 | err = sync_filesystem(sb); | 4933 | err = sync_filesystem(sb); |
4941 | if (err < 0) | 4934 | if (err < 0) |
4942 | goto restore_opts; | 4935 | goto restore_opts; |
4943 | err = dquot_suspend(sb, -1); | 4936 | err = dquot_suspend(sb, -1); |
4944 | if (err < 0) | 4937 | if (err < 0) |
4945 | goto restore_opts; | 4938 | goto restore_opts; |
4946 | 4939 | ||
4947 | /* | 4940 | /* |
4948 | * First of all, the unconditional stuff we have to do | 4941 | * First of all, the unconditional stuff we have to do |
4949 | * to disable replay of the journal when we next remount | 4942 | * to disable replay of the journal when we next remount |
4950 | */ | 4943 | */ |
4951 | sb->s_flags |= MS_RDONLY; | 4944 | sb->s_flags |= MS_RDONLY; |
4952 | 4945 | ||
4953 | /* | 4946 | /* |
4954 | * OK, test if we are remounting a valid rw partition | 4947 | * OK, test if we are remounting a valid rw partition |
4955 | * readonly, and if so set the rdonly flag and then | 4948 | * readonly, and if so set the rdonly flag and then |
4956 | * mark the partition as valid again. | 4949 | * mark the partition as valid again. |
4957 | */ | 4950 | */ |
4958 | if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) && | 4951 | if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) && |
4959 | (sbi->s_mount_state & EXT4_VALID_FS)) | 4952 | (sbi->s_mount_state & EXT4_VALID_FS)) |
4960 | es->s_state = cpu_to_le16(sbi->s_mount_state); | 4953 | es->s_state = cpu_to_le16(sbi->s_mount_state); |
4961 | 4954 | ||
4962 | if (sbi->s_journal) | 4955 | if (sbi->s_journal) |
4963 | ext4_mark_recovery_complete(sb, es); | 4956 | ext4_mark_recovery_complete(sb, es); |
4964 | } else { | 4957 | } else { |
4965 | /* Make sure we can mount this feature set readwrite */ | 4958 | /* Make sure we can mount this feature set readwrite */ |
4966 | if (!ext4_feature_set_ok(sb, 0)) { | 4959 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, |
4960 | EXT4_FEATURE_RO_COMPAT_READONLY) || | ||
4961 | !ext4_feature_set_ok(sb, 0)) { | ||
4967 | err = -EROFS; | 4962 | err = -EROFS; |
4968 | goto restore_opts; | 4963 | goto restore_opts; |
4969 | } | 4964 | } |
4970 | /* | 4965 | /* |
4971 | * Make sure the group descriptor checksums | 4966 | * Make sure the group descriptor checksums |
4972 | * are sane. If they aren't, refuse to remount r/w. | 4967 | * are sane. If they aren't, refuse to remount r/w. |
4973 | */ | 4968 | */ |
4974 | for (g = 0; g < sbi->s_groups_count; g++) { | 4969 | for (g = 0; g < sbi->s_groups_count; g++) { |
4975 | struct ext4_group_desc *gdp = | 4970 | struct ext4_group_desc *gdp = |
4976 | ext4_get_group_desc(sb, g, NULL); | 4971 | ext4_get_group_desc(sb, g, NULL); |
4977 | 4972 | ||
4978 | if (!ext4_group_desc_csum_verify(sb, g, gdp)) { | 4973 | if (!ext4_group_desc_csum_verify(sb, g, gdp)) { |
4979 | ext4_msg(sb, KERN_ERR, | 4974 | ext4_msg(sb, KERN_ERR, |
4980 | "ext4_remount: Checksum for group %u failed (%u!=%u)", | 4975 | "ext4_remount: Checksum for group %u failed (%u!=%u)", |
4981 | g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)), | 4976 | g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)), |
4982 | le16_to_cpu(gdp->bg_checksum)); | 4977 | le16_to_cpu(gdp->bg_checksum)); |
4983 | err = -EINVAL; | 4978 | err = -EINVAL; |
4984 | goto restore_opts; | 4979 | goto restore_opts; |
4985 | } | 4980 | } |
4986 | } | 4981 | } |
4987 | 4982 | ||
4988 | /* | 4983 | /* |
4989 | * If we have an unprocessed orphan list hanging | 4984 | * If we have an unprocessed orphan list hanging |
4990 | * around from a previously readonly bdev mount, | 4985 | * around from a previously readonly bdev mount, |
4991 | * require a full umount/remount for now. | 4986 | * require a full umount/remount for now. |
4992 | */ | 4987 | */ |
4993 | if (es->s_last_orphan) { | 4988 | if (es->s_last_orphan) { |
4994 | ext4_msg(sb, KERN_WARNING, "Couldn't " | 4989 | ext4_msg(sb, KERN_WARNING, "Couldn't " |
4995 | "remount RDWR because of unprocessed " | 4990 | "remount RDWR because of unprocessed " |
4996 | "orphan inode list. Please " | 4991 | "orphan inode list. Please " |
4997 | "umount/remount instead"); | 4992 | "umount/remount instead"); |
4998 | err = -EINVAL; | 4993 | err = -EINVAL; |
4999 | goto restore_opts; | 4994 | goto restore_opts; |
5000 | } | 4995 | } |
5001 | 4996 | ||
5002 | /* | 4997 | /* |
5003 | * Mounting a RDONLY partition read-write, so reread | 4998 | * Mounting a RDONLY partition read-write, so reread |
5004 | * and store the current valid flag. (It may have | 4999 | * and store the current valid flag. (It may have |
5005 | * been changed by e2fsck since we originally mounted | 5000 | * been changed by e2fsck since we originally mounted |
5006 | * the partition.) | 5001 | * the partition.) |
5007 | */ | 5002 | */ |
5008 | if (sbi->s_journal) | 5003 | if (sbi->s_journal) |
5009 | ext4_clear_journal_err(sb, es); | 5004 | ext4_clear_journal_err(sb, es); |
5010 | sbi->s_mount_state = le16_to_cpu(es->s_state); | 5005 | sbi->s_mount_state = le16_to_cpu(es->s_state); |
5011 | if (!ext4_setup_super(sb, es, 0)) | 5006 | if (!ext4_setup_super(sb, es, 0)) |
5012 | sb->s_flags &= ~MS_RDONLY; | 5007 | sb->s_flags &= ~MS_RDONLY; |
5013 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, | 5008 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, |
5014 | EXT4_FEATURE_INCOMPAT_MMP)) | 5009 | EXT4_FEATURE_INCOMPAT_MMP)) |
5015 | if (ext4_multi_mount_protect(sb, | 5010 | if (ext4_multi_mount_protect(sb, |
5016 | le64_to_cpu(es->s_mmp_block))) { | 5011 | le64_to_cpu(es->s_mmp_block))) { |
5017 | err = -EROFS; | 5012 | err = -EROFS; |
5018 | goto restore_opts; | 5013 | goto restore_opts; |
5019 | } | 5014 | } |
5020 | enable_quota = 1; | 5015 | enable_quota = 1; |
5021 | } | 5016 | } |
5022 | } | 5017 | } |
5023 | 5018 | ||
5024 | /* | 5019 | /* |
5025 | * Reinitialize lazy itable initialization thread based on | 5020 | * Reinitialize lazy itable initialization thread based on |
5026 | * current settings | 5021 | * current settings |
5027 | */ | 5022 | */ |
5028 | if ((sb->s_flags & MS_RDONLY) || !test_opt(sb, INIT_INODE_TABLE)) | 5023 | if ((sb->s_flags & MS_RDONLY) || !test_opt(sb, INIT_INODE_TABLE)) |
5029 | ext4_unregister_li_request(sb); | 5024 | ext4_unregister_li_request(sb); |
5030 | else { | 5025 | else { |
5031 | ext4_group_t first_not_zeroed; | 5026 | ext4_group_t first_not_zeroed; |
5032 | first_not_zeroed = ext4_has_uninit_itable(sb); | 5027 | first_not_zeroed = ext4_has_uninit_itable(sb); |
5033 | ext4_register_li_request(sb, first_not_zeroed); | 5028 | ext4_register_li_request(sb, first_not_zeroed); |
5034 | } | 5029 | } |
5035 | 5030 | ||
5036 | ext4_setup_system_zone(sb); | 5031 | ext4_setup_system_zone(sb); |
5037 | if (sbi->s_journal == NULL && !(old_sb_flags & MS_RDONLY)) | 5032 | if (sbi->s_journal == NULL && !(old_sb_flags & MS_RDONLY)) |
5038 | ext4_commit_super(sb, 1); | 5033 | ext4_commit_super(sb, 1); |
5039 | 5034 | ||
5040 | #ifdef CONFIG_QUOTA | 5035 | #ifdef CONFIG_QUOTA |
5041 | /* Release old quota file names */ | 5036 | /* Release old quota file names */ |
5042 | for (i = 0; i < EXT4_MAXQUOTAS; i++) | 5037 | for (i = 0; i < EXT4_MAXQUOTAS; i++) |
5043 | kfree(old_opts.s_qf_names[i]); | 5038 | kfree(old_opts.s_qf_names[i]); |
5044 | if (enable_quota) { | 5039 | if (enable_quota) { |
5045 | if (sb_any_quota_suspended(sb)) | 5040 | if (sb_any_quota_suspended(sb)) |
5046 | dquot_resume(sb, -1); | 5041 | dquot_resume(sb, -1); |
5047 | else if (EXT4_HAS_RO_COMPAT_FEATURE(sb, | 5042 | else if (EXT4_HAS_RO_COMPAT_FEATURE(sb, |
5048 | EXT4_FEATURE_RO_COMPAT_QUOTA)) { | 5043 | EXT4_FEATURE_RO_COMPAT_QUOTA)) { |
5049 | err = ext4_enable_quotas(sb); | 5044 | err = ext4_enable_quotas(sb); |
5050 | if (err) | 5045 | if (err) |
5051 | goto restore_opts; | 5046 | goto restore_opts; |
5052 | } | 5047 | } |
5053 | } | 5048 | } |
5054 | #endif | 5049 | #endif |
5055 | 5050 | ||
5056 | *flags = (*flags & ~MS_LAZYTIME) | (sb->s_flags & MS_LAZYTIME); | 5051 | *flags = (*flags & ~MS_LAZYTIME) | (sb->s_flags & MS_LAZYTIME); |
5057 | ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data); | 5052 | ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data); |
5058 | kfree(orig_data); | 5053 | kfree(orig_data); |
5059 | return 0; | 5054 | return 0; |
5060 | 5055 | ||
5061 | restore_opts: | 5056 | restore_opts: |
5062 | sb->s_flags = old_sb_flags; | 5057 | sb->s_flags = old_sb_flags; |
5063 | sbi->s_mount_opt = old_opts.s_mount_opt; | 5058 | sbi->s_mount_opt = old_opts.s_mount_opt; |
5064 | sbi->s_mount_opt2 = old_opts.s_mount_opt2; | 5059 | sbi->s_mount_opt2 = old_opts.s_mount_opt2; |
5065 | sbi->s_resuid = old_opts.s_resuid; | 5060 | sbi->s_resuid = old_opts.s_resuid; |
5066 | sbi->s_resgid = old_opts.s_resgid; | 5061 | sbi->s_resgid = old_opts.s_resgid; |
5067 | sbi->s_commit_interval = old_opts.s_commit_interval; | 5062 | sbi->s_commit_interval = old_opts.s_commit_interval; |
5068 | sbi->s_min_batch_time = old_opts.s_min_batch_time; | 5063 | sbi->s_min_batch_time = old_opts.s_min_batch_time; |
5069 | sbi->s_max_batch_time = old_opts.s_max_batch_time; | 5064 | sbi->s_max_batch_time = old_opts.s_max_batch_time; |
5070 | #ifdef CONFIG_QUOTA | 5065 | #ifdef CONFIG_QUOTA |
5071 | sbi->s_jquota_fmt = old_opts.s_jquota_fmt; | 5066 | sbi->s_jquota_fmt = old_opts.s_jquota_fmt; |
5072 | for (i = 0; i < EXT4_MAXQUOTAS; i++) { | 5067 | for (i = 0; i < EXT4_MAXQUOTAS; i++) { |
5073 | kfree(sbi->s_qf_names[i]); | 5068 | kfree(sbi->s_qf_names[i]); |
5074 | sbi->s_qf_names[i] = old_opts.s_qf_names[i]; | 5069 | sbi->s_qf_names[i] = old_opts.s_qf_names[i]; |
5075 | } | 5070 | } |
5076 | #endif | 5071 | #endif |
5077 | kfree(orig_data); | 5072 | kfree(orig_data); |
5078 | return err; | 5073 | return err; |
5079 | } | 5074 | } |
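
For reference, a hedged sketch of the userspace entry into this function: mount(2) with MS_REMOUNT is routed by the VFS to the filesystem's remount_fs method, i.e. ext4_remount() here. Device and mount point are illustrative.

#include <sys/mount.h>

int main(void)
{
	/* remount read-only; the kernel ends up in ext4_remount() */
	return mount("/dev/sdb1", "/mnt", "ext4",
		     MS_REMOUNT | MS_RDONLY, "");
}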
5080 | 5075 | ||
5081 | static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) | 5076 | static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) |
5082 | { | 5077 | { |
5083 | struct super_block *sb = dentry->d_sb; | 5078 | struct super_block *sb = dentry->d_sb; |
5084 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 5079 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
5085 | struct ext4_super_block *es = sbi->s_es; | 5080 | struct ext4_super_block *es = sbi->s_es; |
5086 | ext4_fsblk_t overhead = 0, resv_blocks; | 5081 | ext4_fsblk_t overhead = 0, resv_blocks; |
5087 | u64 fsid; | 5082 | u64 fsid; |
5088 | s64 bfree; | 5083 | s64 bfree; |
5089 | resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters)); | 5084 | resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters)); |
5090 | 5085 | ||
5091 | if (!test_opt(sb, MINIX_DF)) | 5086 | if (!test_opt(sb, MINIX_DF)) |
5092 | overhead = sbi->s_overhead; | 5087 | overhead = sbi->s_overhead; |
5093 | 5088 | ||
5094 | buf->f_type = EXT4_SUPER_MAGIC; | 5089 | buf->f_type = EXT4_SUPER_MAGIC; |
5095 | buf->f_bsize = sb->s_blocksize; | 5090 | buf->f_bsize = sb->s_blocksize; |
5096 | buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead); | 5091 | buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead); |
5097 | bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) - | 5092 | bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) - |
5098 | percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); | 5093 | percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); |
5099 | /* prevent underflow in case little free space is available */ | 5094 | /* prevent underflow in case little free space is available */ |
5100 | buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0)); | 5095 | buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0)); |
5101 | buf->f_bavail = buf->f_bfree - | 5096 | buf->f_bavail = buf->f_bfree - |
5102 | (ext4_r_blocks_count(es) + resv_blocks); | 5097 | (ext4_r_blocks_count(es) + resv_blocks); |
5103 | if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks)) | 5098 | if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks)) |
5104 | buf->f_bavail = 0; | 5099 | buf->f_bavail = 0; |
5105 | buf->f_files = le32_to_cpu(es->s_inodes_count); | 5100 | buf->f_files = le32_to_cpu(es->s_inodes_count); |
5106 | buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); | 5101 | buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); |
5107 | buf->f_namelen = EXT4_NAME_LEN; | 5102 | buf->f_namelen = EXT4_NAME_LEN; |
5108 | fsid = le64_to_cpup((void *)es->s_uuid) ^ | 5103 | fsid = le64_to_cpup((void *)es->s_uuid) ^ |
5109 | le64_to_cpup((void *)es->s_uuid + sizeof(u64)); | 5104 | le64_to_cpup((void *)es->s_uuid + sizeof(u64)); |
5110 | buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; | 5105 | buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; |
5111 | buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; | 5106 | buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; |
5112 | 5107 | ||
5113 | return 0; | 5108 | return 0; |
5114 | } | 5109 | } |
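
A small userspace check of what ext4_statfs() reports, assuming an ext4 filesystem mounted at /mnt: f_type comes back as EXT4_SUPER_MAGIC (0xEF53), and f_bavail is f_bfree minus both the root-reserved blocks and ext4's internal reserved clusters, clamped at zero.

#include <stdio.h>
#include <sys/vfs.h>

int main(void)
{
	struct statfs st;

	if (statfs("/mnt", &st) != 0)
		return 1;
	printf("type=0x%lx bfree=%llu bavail=%llu\n",
	       (unsigned long)st.f_type,
	       (unsigned long long)st.f_bfree,
	       (unsigned long long)st.f_bavail);
	return 0;
}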
5115 | 5110 | ||
5116 | /* Helper function for writing quotas on sync - we need to start a transaction | 5111 | /* Helper function for writing quotas on sync - we need to start a transaction |
5117 | * before the quota file is locked for write. Otherwise deadlocks are possible: | 5112 | * before the quota file is locked for write. Otherwise deadlocks are possible: |
5118 | * Process 1 Process 2 | 5113 | * Process 1 Process 2 |
5119 | * ext4_create() quota_sync() | 5114 | * ext4_create() quota_sync() |
5120 | * jbd2_journal_start() write_dquot() | 5115 | * jbd2_journal_start() write_dquot() |
5121 | * dquot_initialize() down(dqio_mutex) | 5116 | * dquot_initialize() down(dqio_mutex) |
5122 | * down(dqio_mutex) jbd2_journal_start() | 5117 | * down(dqio_mutex) jbd2_journal_start() |
5123 | * | 5118 | * |
5124 | */ | 5119 | */ |
5125 | 5120 | ||
5126 | #ifdef CONFIG_QUOTA | 5121 | #ifdef CONFIG_QUOTA |
5127 | 5122 | ||
5128 | static inline struct inode *dquot_to_inode(struct dquot *dquot) | 5123 | static inline struct inode *dquot_to_inode(struct dquot *dquot) |
5129 | { | 5124 | { |
5130 | return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type]; | 5125 | return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type]; |
5131 | } | 5126 | } |
5132 | 5127 | ||
5133 | static int ext4_write_dquot(struct dquot *dquot) | 5128 | static int ext4_write_dquot(struct dquot *dquot) |
5134 | { | 5129 | { |
5135 | int ret, err; | 5130 | int ret, err; |
5136 | handle_t *handle; | 5131 | handle_t *handle; |
5137 | struct inode *inode; | 5132 | struct inode *inode; |
5138 | 5133 | ||
5139 | inode = dquot_to_inode(dquot); | 5134 | inode = dquot_to_inode(dquot); |
5140 | handle = ext4_journal_start(inode, EXT4_HT_QUOTA, | 5135 | handle = ext4_journal_start(inode, EXT4_HT_QUOTA, |
5141 | EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); | 5136 | EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); |
5142 | if (IS_ERR(handle)) | 5137 | if (IS_ERR(handle)) |
5143 | return PTR_ERR(handle); | 5138 | return PTR_ERR(handle); |
5144 | ret = dquot_commit(dquot); | 5139 | ret = dquot_commit(dquot); |
5145 | err = ext4_journal_stop(handle); | 5140 | err = ext4_journal_stop(handle); |
5146 | if (!ret) | 5141 | if (!ret) |
5147 | ret = err; | 5142 | ret = err; |
5148 | return ret; | 5143 | return ret; |
5149 | } | 5144 | } |
5150 | 5145 | ||
5151 | static int ext4_acquire_dquot(struct dquot *dquot) | 5146 | static int ext4_acquire_dquot(struct dquot *dquot) |
5152 | { | 5147 | { |
5153 | int ret, err; | 5148 | int ret, err; |
5154 | handle_t *handle; | 5149 | handle_t *handle; |
5155 | 5150 | ||
5156 | handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, | 5151 | handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, |
5157 | EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb)); | 5152 | EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb)); |
5158 | if (IS_ERR(handle)) | 5153 | if (IS_ERR(handle)) |
5159 | return PTR_ERR(handle); | 5154 | return PTR_ERR(handle); |
5160 | ret = dquot_acquire(dquot); | 5155 | ret = dquot_acquire(dquot); |
5161 | err = ext4_journal_stop(handle); | 5156 | err = ext4_journal_stop(handle); |
5162 | if (!ret) | 5157 | if (!ret) |
5163 | ret = err; | 5158 | ret = err; |
5164 | return ret; | 5159 | return ret; |
5165 | } | 5160 | } |
5166 | 5161 | ||
5167 | static int ext4_release_dquot(struct dquot *dquot) | 5162 | static int ext4_release_dquot(struct dquot *dquot) |
5168 | { | 5163 | { |
5169 | int ret, err; | 5164 | int ret, err; |
5170 | handle_t *handle; | 5165 | handle_t *handle; |
5171 | 5166 | ||
5172 | handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, | 5167 | handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, |
5173 | EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); | 5168 | EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); |
5174 | if (IS_ERR(handle)) { | 5169 | if (IS_ERR(handle)) { |
5175 | /* Release dquot anyway to avoid endless cycle in dqput() */ | 5170 | /* Release dquot anyway to avoid endless cycle in dqput() */ |
5176 | dquot_release(dquot); | 5171 | dquot_release(dquot); |
5177 | return PTR_ERR(handle); | 5172 | return PTR_ERR(handle); |
5178 | } | 5173 | } |
5179 | ret = dquot_release(dquot); | 5174 | ret = dquot_release(dquot); |
5180 | err = ext4_journal_stop(handle); | 5175 | err = ext4_journal_stop(handle); |
5181 | if (!ret) | 5176 | if (!ret) |
5182 | ret = err; | 5177 | ret = err; |
5183 | return ret; | 5178 | return ret; |
5184 | } | 5179 | } |
5185 | 5180 | ||
5186 | static int ext4_mark_dquot_dirty(struct dquot *dquot) | 5181 | static int ext4_mark_dquot_dirty(struct dquot *dquot) |
5187 | { | 5182 | { |
5188 | struct super_block *sb = dquot->dq_sb; | 5183 | struct super_block *sb = dquot->dq_sb; |
5189 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 5184 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
5190 | 5185 | ||
5191 | /* Are we journaling quotas? */ | 5186 | /* Are we journaling quotas? */ |
5192 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) || | 5187 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) || |
5193 | sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { | 5188 | sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { |
5194 | dquot_mark_dquot_dirty(dquot); | 5189 | dquot_mark_dquot_dirty(dquot); |
5195 | return ext4_write_dquot(dquot); | 5190 | return ext4_write_dquot(dquot); |
5196 | } else { | 5191 | } else { |
5197 | return dquot_mark_dquot_dirty(dquot); | 5192 | return dquot_mark_dquot_dirty(dquot); |
5198 | } | 5193 | } |
5199 | } | 5194 | } |
5200 | 5195 | ||
5201 | static int ext4_write_info(struct super_block *sb, int type) | 5196 | static int ext4_write_info(struct super_block *sb, int type) |
5202 | { | 5197 | { |
5203 | int ret, err; | 5198 | int ret, err; |
5204 | handle_t *handle; | 5199 | handle_t *handle; |
5205 | 5200 | ||
5206 | /* Data block + inode block */ | 5201 | /* Data block + inode block */ |
5207 | handle = ext4_journal_start(sb->s_root->d_inode, EXT4_HT_QUOTA, 2); | 5202 | handle = ext4_journal_start(sb->s_root->d_inode, EXT4_HT_QUOTA, 2); |
5208 | if (IS_ERR(handle)) | 5203 | if (IS_ERR(handle)) |
5209 | return PTR_ERR(handle); | 5204 | return PTR_ERR(handle); |
5210 | ret = dquot_commit_info(sb, type); | 5205 | ret = dquot_commit_info(sb, type); |
5211 | err = ext4_journal_stop(handle); | 5206 | err = ext4_journal_stop(handle); |
5212 | if (!ret) | 5207 | if (!ret) |
5213 | ret = err; | 5208 | ret = err; |
5214 | return ret; | 5209 | return ret; |
5215 | } | 5210 | } |
5216 | 5211 | ||
5217 | /* | 5212 | /* |
5218 | * Turn on quotas during mount time - we need to find | 5213 | * Turn on quotas during mount time - we need to find |
5219 | * the quota file and such... | 5214 | * the quota file and such... |
5220 | */ | 5215 | */ |
5221 | static int ext4_quota_on_mount(struct super_block *sb, int type) | 5216 | static int ext4_quota_on_mount(struct super_block *sb, int type) |
5222 | { | 5217 | { |
5223 | return dquot_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type], | 5218 | return dquot_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type], |
5224 | EXT4_SB(sb)->s_jquota_fmt, type); | 5219 | EXT4_SB(sb)->s_jquota_fmt, type); |
5225 | } | 5220 | } |
5226 | 5221 | ||
5227 | /* | 5222 | /* |
5228 | * Standard function to be called on quota_on | 5223 | * Standard function to be called on quota_on |
5229 | */ | 5224 | */ |
5230 | static int ext4_quota_on(struct super_block *sb, int type, int format_id, | 5225 | static int ext4_quota_on(struct super_block *sb, int type, int format_id, |
5231 | struct path *path) | 5226 | struct path *path) |
5232 | { | 5227 | { |
5233 | int err; | 5228 | int err; |
5234 | 5229 | ||
5235 | if (!test_opt(sb, QUOTA)) | 5230 | if (!test_opt(sb, QUOTA)) |
5236 | return -EINVAL; | 5231 | return -EINVAL; |
5237 | 5232 | ||
5238 | /* Quotafile not on the same filesystem? */ | 5233 | /* Quotafile not on the same filesystem? */ |
5239 | if (path->dentry->d_sb != sb) | 5234 | if (path->dentry->d_sb != sb) |
5240 | return -EXDEV; | 5235 | return -EXDEV; |
5241 | /* Journaling quota? */ | 5236 | /* Journaling quota? */ |
5242 | if (EXT4_SB(sb)->s_qf_names[type]) { | 5237 | if (EXT4_SB(sb)->s_qf_names[type]) { |
5243 | /* Quotafile not in fs root? */ | 5238 | /* Quotafile not in fs root? */ |
5244 | if (path->dentry->d_parent != sb->s_root) | 5239 | if (path->dentry->d_parent != sb->s_root) |
5245 | ext4_msg(sb, KERN_WARNING, | 5240 | ext4_msg(sb, KERN_WARNING, |
5246 | "Quota file not on filesystem root. " | 5241 | "Quota file not on filesystem root. " |
5247 | "Journaled quota will not work"); | 5242 | "Journaled quota will not work"); |
5248 | } | 5243 | } |
5249 | 5244 | ||
5250 | /* | 5245 | /* |
5251 | * When we journal data on the quota file, we have to flush the journal to see | 5246 | * When we journal data on the quota file, we have to flush the journal to see |
5252 | * all updates to the file when we bypass pagecache... | 5247 | * all updates to the file when we bypass pagecache... |
5253 | */ | 5248 | */ |
5254 | if (EXT4_SB(sb)->s_journal && | 5249 | if (EXT4_SB(sb)->s_journal && |
5255 | ext4_should_journal_data(path->dentry->d_inode)) { | 5250 | ext4_should_journal_data(path->dentry->d_inode)) { |
5256 | /* | 5251 | /* |
5257 | * We don't need to lock updates but journal_flush() could | 5252 | * We don't need to lock updates but journal_flush() could |
5258 | * otherwise be livelocked... | 5253 | * otherwise be livelocked... |
5259 | */ | 5254 | */ |
5260 | jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); | 5255 | jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); |
5261 | err = jbd2_journal_flush(EXT4_SB(sb)->s_journal); | 5256 | err = jbd2_journal_flush(EXT4_SB(sb)->s_journal); |
5262 | jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); | 5257 | jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); |
5263 | if (err) | 5258 | if (err) |
5264 | return err; | 5259 | return err; |
5265 | } | 5260 | } |
5266 | 5261 | ||
5267 | return dquot_quota_on(sb, type, format_id, path); | 5262 | return dquot_quota_on(sb, type, format_id, path); |
5268 | } | 5263 | } |
5269 | 5264 | ||
5270 | static int ext4_quota_enable(struct super_block *sb, int type, int format_id, | 5265 | static int ext4_quota_enable(struct super_block *sb, int type, int format_id, |
5271 | unsigned int flags) | 5266 | unsigned int flags) |
5272 | { | 5267 | { |
5273 | int err; | 5268 | int err; |
5274 | struct inode *qf_inode; | 5269 | struct inode *qf_inode; |
5275 | unsigned long qf_inums[EXT4_MAXQUOTAS] = { | 5270 | unsigned long qf_inums[EXT4_MAXQUOTAS] = { |
5276 | le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), | 5271 | le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), |
5277 | le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) | 5272 | le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) |
5278 | }; | 5273 | }; |
5279 | 5274 | ||
5280 | BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)); | 5275 | BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)); |
5281 | 5276 | ||
5282 | if (!qf_inums[type]) | 5277 | if (!qf_inums[type]) |
5283 | return -EPERM; | 5278 | return -EPERM; |
5284 | 5279 | ||
5285 | qf_inode = ext4_iget(sb, qf_inums[type]); | 5280 | qf_inode = ext4_iget(sb, qf_inums[type]); |
5286 | if (IS_ERR(qf_inode)) { | 5281 | if (IS_ERR(qf_inode)) { |
5287 | ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]); | 5282 | ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]); |
5288 | return PTR_ERR(qf_inode); | 5283 | return PTR_ERR(qf_inode); |
5289 | } | 5284 | } |
5290 | 5285 | ||
5291 | /* Don't account quota for quota files to avoid recursion */ | 5286 | /* Don't account quota for quota files to avoid recursion */ |
5292 | qf_inode->i_flags |= S_NOQUOTA; | 5287 | qf_inode->i_flags |= S_NOQUOTA; |
5293 | err = dquot_enable(qf_inode, type, format_id, flags); | 5288 | err = dquot_enable(qf_inode, type, format_id, flags); |
5294 | iput(qf_inode); | 5289 | iput(qf_inode); |
5295 | 5290 | ||
5296 | return err; | 5291 | return err; |
5297 | } | 5292 | } |
5298 | 5293 | ||
5299 | /* Enable usage tracking for all quota types. */ | 5294 | /* Enable usage tracking for all quota types. */ |
5300 | static int ext4_enable_quotas(struct super_block *sb) | 5295 | static int ext4_enable_quotas(struct super_block *sb) |
5301 | { | 5296 | { |
5302 | int type, err = 0; | 5297 | int type, err = 0; |
5303 | unsigned long qf_inums[EXT4_MAXQUOTAS] = { | 5298 | unsigned long qf_inums[EXT4_MAXQUOTAS] = { |
5304 | le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), | 5299 | le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), |
5305 | le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) | 5300 | le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) |
5306 | }; | 5301 | }; |
5307 | 5302 | ||
5308 | sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; | 5303 | sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; |
5309 | for (type = 0; type < EXT4_MAXQUOTAS; type++) { | 5304 | for (type = 0; type < EXT4_MAXQUOTAS; type++) { |
5310 | if (qf_inums[type]) { | 5305 | if (qf_inums[type]) { |
5311 | err = ext4_quota_enable(sb, type, QFMT_VFS_V1, | 5306 | err = ext4_quota_enable(sb, type, QFMT_VFS_V1, |
5312 | DQUOT_USAGE_ENABLED); | 5307 | DQUOT_USAGE_ENABLED); |
5313 | if (err) { | 5308 | if (err) { |
5314 | ext4_warning(sb, | 5309 | ext4_warning(sb, |
5315 | "Failed to enable quota tracking " | 5310 | "Failed to enable quota tracking " |
5316 | "(type=%d, err=%d). Please run " | 5311 | "(type=%d, err=%d). Please run " |
5317 | "e2fsck to fix.", type, err); | 5312 | "e2fsck to fix.", type, err); |
5318 | return err; | 5313 | return err; |
5319 | } | 5314 | } |
5320 | } | 5315 | } |
5321 | } | 5316 | } |
5322 | return 0; | 5317 | return 0; |
5323 | } | 5318 | } |
5324 | 5319 | ||
5325 | static int ext4_quota_off(struct super_block *sb, int type) | 5320 | static int ext4_quota_off(struct super_block *sb, int type) |
5326 | { | 5321 | { |
5327 | struct inode *inode = sb_dqopt(sb)->files[type]; | 5322 | struct inode *inode = sb_dqopt(sb)->files[type]; |
5328 | handle_t *handle; | 5323 | handle_t *handle; |
5329 | 5324 | ||
5330 | /* Force all delayed allocation blocks to be allocated. | 5325 | /* Force all delayed allocation blocks to be allocated. |
5331 | * Caller already holds s_umount sem */ | 5326 | * Caller already holds s_umount sem */ |
5332 | if (test_opt(sb, DELALLOC)) | 5327 | if (test_opt(sb, DELALLOC)) |
5333 | sync_filesystem(sb); | 5328 | sync_filesystem(sb); |
5334 | 5329 | ||
5335 | if (!inode) | 5330 | if (!inode) |
5336 | goto out; | 5331 | goto out; |
5337 | 5332 | ||
5338 | /* Update modification times of quota files when userspace can | 5333 | /* Update modification times of quota files when userspace can |
5339 | * start looking at them */ | 5334 | * start looking at them */ |
5340 | handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); | 5335 | handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); |
5341 | if (IS_ERR(handle)) | 5336 | if (IS_ERR(handle)) |
5342 | goto out; | 5337 | goto out; |
5343 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 5338 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
5344 | ext4_mark_inode_dirty(handle, inode); | 5339 | ext4_mark_inode_dirty(handle, inode); |
5345 | ext4_journal_stop(handle); | 5340 | ext4_journal_stop(handle); |
5346 | 5341 | ||
5347 | out: | 5342 | out: |
5348 | return dquot_quota_off(sb, type); | 5343 | return dquot_quota_off(sb, type); |
5349 | } | 5344 | } |
5350 | 5345 | ||
5351 | /* Read data from quotafile - avoid pagecache and such because we cannot afford | 5346 | /* Read data from quotafile - avoid pagecache and such because we cannot afford |
5352 | * to acquire the locks... As quota files are never truncated and the quota code | 5347 | * to acquire the locks... As quota files are never truncated and the quota code |
5353 | * itself serializes the operations (and no one else should touch the files) | 5348 | * itself serializes the operations (and no one else should touch the files) |
5354 | * we don't have to be afraid of races */ | 5349 | * we don't have to be afraid of races */ |
5355 | static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, | 5350 | static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, |
5356 | size_t len, loff_t off) | 5351 | size_t len, loff_t off) |
5357 | { | 5352 | { |
5358 | struct inode *inode = sb_dqopt(sb)->files[type]; | 5353 | struct inode *inode = sb_dqopt(sb)->files[type]; |
5359 | ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); | 5354 | ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); |
5360 | int offset = off & (sb->s_blocksize - 1); | 5355 | int offset = off & (sb->s_blocksize - 1); |
5361 | int tocopy; | 5356 | int tocopy; |
5362 | size_t toread; | 5357 | size_t toread; |
5363 | struct buffer_head *bh; | 5358 | struct buffer_head *bh; |
5364 | loff_t i_size = i_size_read(inode); | 5359 | loff_t i_size = i_size_read(inode); |
5365 | 5360 | ||
5366 | if (off > i_size) | 5361 | if (off > i_size) |
5367 | return 0; | 5362 | return 0; |
5368 | if (off+len > i_size) | 5363 | if (off+len > i_size) |
5369 | len = i_size-off; | 5364 | len = i_size-off; |
5370 | toread = len; | 5365 | toread = len; |
5371 | while (toread > 0) { | 5366 | while (toread > 0) { |
5372 | tocopy = sb->s_blocksize - offset < toread ? | 5367 | tocopy = sb->s_blocksize - offset < toread ? |
5373 | sb->s_blocksize - offset : toread; | 5368 | sb->s_blocksize - offset : toread; |
5374 | bh = ext4_bread(NULL, inode, blk, 0); | 5369 | bh = ext4_bread(NULL, inode, blk, 0); |
5375 | if (IS_ERR(bh)) | 5370 | if (IS_ERR(bh)) |
5376 | return PTR_ERR(bh); | 5371 | return PTR_ERR(bh); |
5377 | if (!bh) /* A hole? */ | 5372 | if (!bh) /* A hole? */ |
5378 | memset(data, 0, tocopy); | 5373 | memset(data, 0, tocopy); |
5379 | else | 5374 | else |
5380 | memcpy(data, bh->b_data+offset, tocopy); | 5375 | memcpy(data, bh->b_data+offset, tocopy); |
5381 | brelse(bh); | 5376 | brelse(bh); |
5382 | offset = 0; | 5377 | offset = 0; |
5383 | toread -= tocopy; | 5378 | toread -= tocopy; |
5384 | data += tocopy; | 5379 | data += tocopy; |
5385 | blk++; | 5380 | blk++; |
5386 | } | 5381 | } |
5387 | return len; | 5382 | return len; |
5388 | } | 5383 | } |
5389 | 5384 | ||
5390 | /* Write to quotafile (we know the transaction is already started and has | 5385 | /* Write to quotafile (we know the transaction is already started and has |
5391 | * enough credits) */ | 5386 | * enough credits) */ |
5392 | static ssize_t ext4_quota_write(struct super_block *sb, int type, | 5387 | static ssize_t ext4_quota_write(struct super_block *sb, int type, |
5393 | const char *data, size_t len, loff_t off) | 5388 | const char *data, size_t len, loff_t off) |
5394 | { | 5389 | { |
5395 | struct inode *inode = sb_dqopt(sb)->files[type]; | 5390 | struct inode *inode = sb_dqopt(sb)->files[type]; |
5396 | ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); | 5391 | ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); |
5397 | int err, offset = off & (sb->s_blocksize - 1); | 5392 | int err, offset = off & (sb->s_blocksize - 1); |
5398 | struct buffer_head *bh; | 5393 | struct buffer_head *bh; |
5399 | handle_t *handle = journal_current_handle(); | 5394 | handle_t *handle = journal_current_handle(); |
5400 | 5395 | ||
5401 | if (EXT4_SB(sb)->s_journal && !handle) { | 5396 | if (EXT4_SB(sb)->s_journal && !handle) { |
5402 | ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" | 5397 | ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" |
5403 | " cancelled because transaction is not started", | 5398 | " cancelled because transaction is not started", |
5404 | (unsigned long long)off, (unsigned long long)len); | 5399 | (unsigned long long)off, (unsigned long long)len); |
5405 | return -EIO; | 5400 | return -EIO; |
5406 | } | 5401 | } |
5407 | /* | 5402 | /* |
5408 | * Since we account for only one data block in the transaction credits, | 5403 | * Since we account for only one data block in the transaction credits, |
5409 | * a write must not cross a block boundary. | 5404 | * a write must not cross a block boundary. |
5410 | */ | 5405 | */ |
5411 | if (sb->s_blocksize - offset < len) { | 5406 | if (sb->s_blocksize - offset < len) { |
5412 | ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" | 5407 | ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" |
5413 | " cancelled because not block aligned", | 5408 | " cancelled because not block aligned", |
5414 | (unsigned long long)off, (unsigned long long)len); | 5409 | (unsigned long long)off, (unsigned long long)len); |
5415 | return -EIO; | 5410 | return -EIO; |
5416 | } | 5411 | } |
5417 | 5412 | ||
5418 | bh = ext4_bread(handle, inode, blk, 1); | 5413 | bh = ext4_bread(handle, inode, blk, 1); |
5419 | if (IS_ERR(bh)) | 5414 | if (IS_ERR(bh)) |
5420 | return PTR_ERR(bh); | 5415 | return PTR_ERR(bh); |
5421 | if (!bh) | 5416 | if (!bh) |
5422 | goto out; | 5417 | goto out; |
5423 | BUFFER_TRACE(bh, "get write access"); | 5418 | BUFFER_TRACE(bh, "get write access"); |
5424 | err = ext4_journal_get_write_access(handle, bh); | 5419 | err = ext4_journal_get_write_access(handle, bh); |
5425 | if (err) { | 5420 | if (err) { |
5426 | brelse(bh); | 5421 | brelse(bh); |
5427 | return err; | 5422 | return err; |
5428 | } | 5423 | } |
5429 | lock_buffer(bh); | 5424 | lock_buffer(bh); |
5430 | memcpy(bh->b_data+offset, data, len); | 5425 | memcpy(bh->b_data+offset, data, len); |
5431 | flush_dcache_page(bh->b_page); | 5426 | flush_dcache_page(bh->b_page); |
5432 | unlock_buffer(bh); | 5427 | unlock_buffer(bh); |
5433 | err = ext4_handle_dirty_metadata(handle, NULL, bh); | 5428 | err = ext4_handle_dirty_metadata(handle, NULL, bh); |
5434 | brelse(bh); | 5429 | brelse(bh); |
5435 | out: | 5430 | out: |
5436 | if (inode->i_size < off + len) { | 5431 | if (inode->i_size < off + len) { |
5437 | i_size_write(inode, off + len); | 5432 | i_size_write(inode, off + len); |
5438 | EXT4_I(inode)->i_disksize = inode->i_size; | 5433 | EXT4_I(inode)->i_disksize = inode->i_size; |
5439 | ext4_mark_inode_dirty(handle, inode); | 5434 | ext4_mark_inode_dirty(handle, inode); |
5440 | } | 5435 | } |
5441 | return len; | 5436 | return len; |
5442 | } | 5437 | } |
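
The alignment guard above is plain mask arithmetic: off & (s_blocksize - 1) is the byte offset within the block, and a write of len bytes fits in one block only when blocksize - offset >= len. Distilled into a hedged standalone helper (name hypothetical):

#include <stddef.h>

/* True iff [off, off + len) stays inside one block of size bsz,
 * where bsz is a power of two (as filesystem block sizes are). */
static int fits_in_one_block(unsigned long long off, size_t len,
			     unsigned int bsz)
{
	unsigned int offset = off & (bsz - 1);

	return bsz - offset >= len;
}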
5443 | 5438 | ||
5444 | #endif | 5439 | #endif |
5445 | 5440 | ||
5446 | static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, | 5441 | static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, |
5447 | const char *dev_name, void *data) | 5442 | const char *dev_name, void *data) |
5448 | { | 5443 | { |
5449 | return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super); | 5444 | return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super); |
5450 | } | 5445 | } |
5451 | 5446 | ||
5452 | #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) | 5447 | #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) |
5453 | static inline void register_as_ext2(void) | 5448 | static inline void register_as_ext2(void) |
5454 | { | 5449 | { |
5455 | int err = register_filesystem(&ext2_fs_type); | 5450 | int err = register_filesystem(&ext2_fs_type); |
5456 | if (err) | 5451 | if (err) |
5457 | printk(KERN_WARNING | 5452 | printk(KERN_WARNING |
5458 | "EXT4-fs: Unable to register as ext2 (%d)\n", err); | 5453 | "EXT4-fs: Unable to register as ext2 (%d)\n", err); |
5459 | } | 5454 | } |
5460 | 5455 | ||
5461 | static inline void unregister_as_ext2(void) | 5456 | static inline void unregister_as_ext2(void) |
5462 | { | 5457 | { |
5463 | unregister_filesystem(&ext2_fs_type); | 5458 | unregister_filesystem(&ext2_fs_type); |
5464 | } | 5459 | } |
5465 | 5460 | ||
5466 | static inline int ext2_feature_set_ok(struct super_block *sb) | 5461 | static inline int ext2_feature_set_ok(struct super_block *sb) |
5467 | { | 5462 | { |
5468 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP)) | 5463 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP)) |
5469 | return 0; | 5464 | return 0; |
5470 | if (sb->s_flags & MS_RDONLY) | 5465 | if (sb->s_flags & MS_RDONLY) |
5471 | return 1; | 5466 | return 1; |
5472 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP)) | 5467 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP)) |
5473 | return 0; | 5468 | return 0; |
5474 | return 1; | 5469 | return 1; |
5475 | } | 5470 | } |
5476 | #else | 5471 | #else |
5477 | static inline void register_as_ext2(void) { } | 5472 | static inline void register_as_ext2(void) { } |
5478 | static inline void unregister_as_ext2(void) { } | 5473 | static inline void unregister_as_ext2(void) { } |
5479 | static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; } | 5474 | static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; } |
5480 | #endif | 5475 | #endif |
5481 | 5476 | ||
5482 | #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) | 5477 | #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) |
5483 | static inline void register_as_ext3(void) | 5478 | static inline void register_as_ext3(void) |
5484 | { | 5479 | { |
5485 | int err = register_filesystem(&ext3_fs_type); | 5480 | int err = register_filesystem(&ext3_fs_type); |
5486 | if (err) | 5481 | if (err) |
5487 | printk(KERN_WARNING | 5482 | printk(KERN_WARNING |
5488 | "EXT4-fs: Unable to register as ext3 (%d)\n", err); | 5483 | "EXT4-fs: Unable to register as ext3 (%d)\n", err); |
5489 | } | 5484 | } |
5490 | 5485 | ||
5491 | static inline void unregister_as_ext3(void) | 5486 | static inline void unregister_as_ext3(void) |
5492 | { | 5487 | { |
5493 | unregister_filesystem(&ext3_fs_type); | 5488 | unregister_filesystem(&ext3_fs_type); |
5494 | } | 5489 | } |
5495 | 5490 | ||
5496 | static inline int ext3_feature_set_ok(struct super_block *sb) | 5491 | static inline int ext3_feature_set_ok(struct super_block *sb) |
5497 | { | 5492 | { |
5498 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP)) | 5493 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP)) |
5499 | return 0; | 5494 | return 0; |
5500 | if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) | 5495 | if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) |
5501 | return 0; | 5496 | return 0; |
5502 | if (sb->s_flags & MS_RDONLY) | 5497 | if (sb->s_flags & MS_RDONLY) |
5503 | return 1; | 5498 | return 1; |
5504 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) | 5499 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) |
5505 | return 0; | 5500 | return 0; |
5506 | return 1; | 5501 | return 1; |
5507 | } | 5502 | } |
5508 | #else | 5503 | #else |
5509 | static inline void register_as_ext3(void) { } | 5504 | static inline void register_as_ext3(void) { } |
5510 | static inline void unregister_as_ext3(void) { } | 5505 | static inline void unregister_as_ext3(void) { } |
5511 | static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; } | 5506 | static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; } |
5512 | #endif | 5507 | #endif |
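
Both ext2_feature_set_ok() and ext3_feature_set_ok() above use the same complement-mask idiom: querying with ~SUPPORTED asks whether any feature bit is set outside the supported set. A distilled, hedged sketch (names hypothetical):

/* A feature set is acceptable iff no bit falls outside 'supported'. */
static int feature_set_ok(unsigned int features, unsigned int supported)
{
	return (features & ~supported) == 0;
}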
5513 | 5508 | ||
5514 | static struct file_system_type ext4_fs_type = { | 5509 | static struct file_system_type ext4_fs_type = { |
5515 | .owner = THIS_MODULE, | 5510 | .owner = THIS_MODULE, |
5516 | .name = "ext4", | 5511 | .name = "ext4", |
5517 | .mount = ext4_mount, | 5512 | .mount = ext4_mount, |
5518 | .kill_sb = kill_block_super, | 5513 | .kill_sb = kill_block_super, |
5519 | .fs_flags = FS_REQUIRES_DEV, | 5514 | .fs_flags = FS_REQUIRES_DEV, |
5520 | }; | 5515 | }; |
5521 | MODULE_ALIAS_FS("ext4"); | 5516 | MODULE_ALIAS_FS("ext4"); |
5522 | 5517 | ||
5523 | static int __init ext4_init_feat_adverts(void) | 5518 | static int __init ext4_init_feat_adverts(void) |
5524 | { | 5519 | { |
5525 | struct ext4_features *ef; | 5520 | struct ext4_features *ef; |
5526 | int ret = -ENOMEM; | 5521 | int ret = -ENOMEM; |
5527 | 5522 | ||
5528 | ef = kzalloc(sizeof(struct ext4_features), GFP_KERNEL); | 5523 | ef = kzalloc(sizeof(struct ext4_features), GFP_KERNEL); |
5529 | if (!ef) | 5524 | if (!ef) |
5530 | goto out; | 5525 | goto out; |
5531 | 5526 | ||
5532 | ef->f_kobj.kset = ext4_kset; | 5527 | ef->f_kobj.kset = ext4_kset; |
5533 | init_completion(&ef->f_kobj_unregister); | 5528 | init_completion(&ef->f_kobj_unregister); |
5534 | ret = kobject_init_and_add(&ef->f_kobj, &ext4_feat_ktype, NULL, | 5529 | ret = kobject_init_and_add(&ef->f_kobj, &ext4_feat_ktype, NULL, |
5535 | "features"); | 5530 | "features"); |
5536 | if (ret) { | 5531 | if (ret) { |
5537 | kfree(ef); | 5532 | kfree(ef); |
5538 | goto out; | 5533 | goto out; |
5539 | } | 5534 | } |
5540 | 5535 | ||
5541 | ext4_feat = ef; | 5536 | ext4_feat = ef; |
5542 | ret = 0; | 5537 | ret = 0; |
5543 | out: | 5538 | out: |
5544 | return ret; | 5539 | return ret; |
5545 | } | 5540 | } |
5546 | 5541 | ||
5547 | static void ext4_exit_feat_adverts(void) | 5542 | static void ext4_exit_feat_adverts(void) |
5548 | { | 5543 | { |
5549 | kobject_put(&ext4_feat->f_kobj); | 5544 | kobject_put(&ext4_feat->f_kobj); |
5550 | wait_for_completion(&ext4_feat->f_kobj_unregister); | 5545 | wait_for_completion(&ext4_feat->f_kobj_unregister); |
5551 | kfree(ext4_feat); | 5546 | kfree(ext4_feat); |
5552 | } | 5547 | } |
5553 | 5548 | ||
5554 | /* Shared across all ext4 file systems */ | 5549 | /* Shared across all ext4 file systems */ |
5555 | wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; | 5550 | wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; |
5556 | struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; | 5551 | struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; |
5557 | 5552 | ||
5558 | static int __init ext4_init_fs(void) | 5553 | static int __init ext4_init_fs(void) |
5559 | { | 5554 | { |
5560 | int i, err; | 5555 | int i, err; |
5561 | 5556 | ||
5562 | ext4_li_info = NULL; | 5557 | ext4_li_info = NULL; |
5563 | mutex_init(&ext4_li_mtx); | 5558 | mutex_init(&ext4_li_mtx); |
5564 | 5559 | ||
5565 | /* Build-time check for flags consistency */ | 5560 | /* Build-time check for flags consistency */ |
5566 | ext4_check_flag_values(); | 5561 | ext4_check_flag_values(); |
5567 | 5562 | ||
5568 | for (i = 0; i < EXT4_WQ_HASH_SZ; i++) { | 5563 | for (i = 0; i < EXT4_WQ_HASH_SZ; i++) { |
5569 | mutex_init(&ext4__aio_mutex[i]); | 5564 | mutex_init(&ext4__aio_mutex[i]); |
5570 | init_waitqueue_head(&ext4__ioend_wq[i]); | 5565 | init_waitqueue_head(&ext4__ioend_wq[i]); |
5571 | } | 5566 | } |
5572 | 5567 | ||
5573 | err = ext4_init_es(); | 5568 | err = ext4_init_es(); |
5574 | if (err) | 5569 | if (err) |
5575 | return err; | 5570 | return err; |
5576 | 5571 | ||
5577 | err = ext4_init_pageio(); | 5572 | err = ext4_init_pageio(); |
5578 | if (err) | 5573 | if (err) |
5579 | goto out7; | 5574 | goto out7; |
5580 | 5575 | ||
5581 | err = ext4_init_system_zone(); | 5576 | err = ext4_init_system_zone(); |
5582 | if (err) | 5577 | if (err) |
5583 | goto out6; | 5578 | goto out6; |
5584 | ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); | 5579 | ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); |
5585 | if (!ext4_kset) { | 5580 | if (!ext4_kset) { |
5586 | err = -ENOMEM; | 5581 | err = -ENOMEM; |
5587 | goto out5; | 5582 | goto out5; |
5588 | } | 5583 | } |
5589 | ext4_proc_root = proc_mkdir("fs/ext4", NULL); | 5584 | ext4_proc_root = proc_mkdir("fs/ext4", NULL); |
5590 | 5585 | ||
5591 | err = ext4_init_feat_adverts(); | 5586 | err = ext4_init_feat_adverts(); |
5592 | if (err) | 5587 | if (err) |
5593 | goto out4; | 5588 | goto out4; |
5594 | 5589 | ||
5595 | err = ext4_init_mballoc(); | 5590 | err = ext4_init_mballoc(); |
5596 | if (err) | 5591 | if (err) |
5597 | goto out2; | 5592 | goto out2; |
5598 | else | 5593 | else |
5599 | ext4_mballoc_ready = 1; | 5594 | ext4_mballoc_ready = 1; |
5600 | err = init_inodecache(); | 5595 | err = init_inodecache(); |
5601 | if (err) | 5596 | if (err) |
5602 | goto out1; | 5597 | goto out1; |
5603 | register_as_ext3(); | 5598 | register_as_ext3(); |
5604 | register_as_ext2(); | 5599 | register_as_ext2(); |
5605 | err = register_filesystem(&ext4_fs_type); | 5600 | err = register_filesystem(&ext4_fs_type); |
5606 | if (err) | 5601 | if (err) |
5607 | goto out; | 5602 | goto out; |
5608 | 5603 | ||
5609 | return 0; | 5604 | return 0; |
5610 | out: | 5605 | out: |
5611 | unregister_as_ext2(); | 5606 | unregister_as_ext2(); |
5612 | unregister_as_ext3(); | 5607 | unregister_as_ext3(); |
5613 | destroy_inodecache(); | 5608 | destroy_inodecache(); |
5614 | out1: | 5609 | out1: |
5615 | ext4_mballoc_ready = 0; | 5610 | ext4_mballoc_ready = 0; |
5616 | ext4_exit_mballoc(); | 5611 | ext4_exit_mballoc(); |
5617 | out2: | 5612 | out2: |
5618 | ext4_exit_feat_adverts(); | 5613 | ext4_exit_feat_adverts(); |
5619 | out4: | 5614 | out4: |
5620 | if (ext4_proc_root) | 5615 | if (ext4_proc_root) |
5621 | remove_proc_entry("fs/ext4", NULL); | 5616 | remove_proc_entry("fs/ext4", NULL); |
5622 | kset_unregister(ext4_kset); | 5617 | kset_unregister(ext4_kset); |
5623 | out5: | 5618 | out5: |
5624 | ext4_exit_system_zone(); | 5619 | ext4_exit_system_zone(); |
5625 | out6: | 5620 | out6: |
5626 | ext4_exit_pageio(); | 5621 | ext4_exit_pageio(); |
5627 | out7: | 5622 | out7: |
5628 | ext4_exit_es(); | 5623 | ext4_exit_es(); |
5629 | 5624 | ||
5630 | return err; | 5625 | return err; |
5631 | } | 5626 | } |
5632 | 5627 | ||
5633 | static void __exit ext4_exit_fs(void) | 5628 | static void __exit ext4_exit_fs(void) |
5634 | { | 5629 | { |
5635 | ext4_destroy_lazyinit_thread(); | 5630 | ext4_destroy_lazyinit_thread(); |
5636 | unregister_as_ext2(); | 5631 | unregister_as_ext2(); |
5637 | unregister_as_ext3(); | 5632 | unregister_as_ext3(); |
5638 | unregister_filesystem(&ext4_fs_type); | 5633 | unregister_filesystem(&ext4_fs_type); |
5639 | destroy_inodecache(); | 5634 | destroy_inodecache(); |
5640 | ext4_exit_mballoc(); | 5635 | ext4_exit_mballoc(); |
5641 | ext4_exit_feat_adverts(); | 5636 | ext4_exit_feat_adverts(); |
5642 | remove_proc_entry("fs/ext4", NULL); | 5637 | remove_proc_entry("fs/ext4", NULL); |
5643 | kset_unregister(ext4_kset); | 5638 | kset_unregister(ext4_kset); |
5644 | ext4_exit_system_zone(); | 5639 | ext4_exit_system_zone(); |
5645 | ext4_exit_pageio(); | 5640 | ext4_exit_pageio(); |
5646 | ext4_exit_es(); | 5641 | ext4_exit_es(); |
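A minimal standalone sketch of the goto-unwind ladder ext4_init_fs() uses, with hypothetical init_a/init_b/init_c helpers standing in for the real ext4 subsystems: each step that can fail jumps to a label that tears down exactly the steps that already succeeded, in reverse order.

#include <stdio.h>

static int init_a(void) { return 0; }
static void exit_a(void) { puts("exit_a"); }
static int init_b(void) { return 0; }
static void exit_b(void) { puts("exit_b"); }
static int init_c(void) { return -12; }	/* simulate -ENOMEM */

static int init_fs(void)
{
	int err;

	err = init_a();
	if (err)
		return err;	/* nothing to unwind yet */
	err = init_b();
	if (err)
		goto out_a;
	err = init_c();
	if (err)
		goto out_b;
	return 0;

out_b:				/* labels mirror the calls, in reverse */
	exit_b();
out_a:
	exit_a();
	return err;
}

int main(void)
{
	printf("init_fs() = %d\n", init_fs());	/* prints -12 after exit_b, exit_a */
	return 0;
}

This is why the labels in ext4_init_fs() read bottom-up as a mirror of the calls at the top: adding or removing a subsystem touches one call and one label.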
fs/jbd2/recovery.c
1 | /* | 1 | /* |
2 | * linux/fs/jbd2/recovery.c | 2 | * linux/fs/jbd2/recovery.c |
3 | * | 3 | * |
4 | * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 | 4 | * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 |
5 | * | 5 | * |
6 | * Copyright 1999-2000 Red Hat Software --- All Rights Reserved | 6 | * Copyright 1999-2000 Red Hat Software --- All Rights Reserved |
7 | * | 7 | * |
8 | * This file is part of the Linux kernel and is made available under | 8 | * This file is part of the Linux kernel and is made available under |
9 | * the terms of the GNU General Public License, version 2, or at your | 9 | * the terms of the GNU General Public License, version 2, or at your |
10 | * option, any later version, incorporated herein by reference. | 10 | * option, any later version, incorporated herein by reference. |
11 | * | 11 | * |
12 | * Journal recovery routines for the generic filesystem journaling code; | 12 | * Journal recovery routines for the generic filesystem journaling code; |
13 | * part of the ext2fs journaling system. | 13 | * part of the ext2fs journaling system. |
14 | */ | 14 | */ |
15 | 15 | ||
16 | #ifndef __KERNEL__ | 16 | #ifndef __KERNEL__ |
17 | #include "jfs_user.h" | 17 | #include "jfs_user.h" |
18 | #else | 18 | #else |
19 | #include <linux/time.h> | 19 | #include <linux/time.h> |
20 | #include <linux/fs.h> | 20 | #include <linux/fs.h> |
21 | #include <linux/jbd2.h> | 21 | #include <linux/jbd2.h> |
22 | #include <linux/errno.h> | 22 | #include <linux/errno.h> |
23 | #include <linux/crc32.h> | 23 | #include <linux/crc32.h> |
24 | #include <linux/blkdev.h> | 24 | #include <linux/blkdev.h> |
25 | #endif | 25 | #endif |
26 | 26 | ||
27 | /* | 27 | /* |
28 | * Maintain information about the progress of the recovery job, so that | 28 | * Maintain information about the progress of the recovery job, so that |
29 | * the different passes can carry information between them. | 29 | * the different passes can carry information between them. |
30 | */ | 30 | */ |
31 | struct recovery_info | 31 | struct recovery_info |
32 | { | 32 | { |
33 | tid_t start_transaction; | 33 | tid_t start_transaction; |
34 | tid_t end_transaction; | 34 | tid_t end_transaction; |
35 | 35 | ||
36 | int nr_replays; | 36 | int nr_replays; |
37 | int nr_revokes; | 37 | int nr_revokes; |
38 | int nr_revoke_hits; | 38 | int nr_revoke_hits; |
39 | }; | 39 | }; |
40 | 40 | ||
41 | enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY}; | 41 | enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY}; |
42 | static int do_one_pass(journal_t *journal, | 42 | static int do_one_pass(journal_t *journal, |
43 | struct recovery_info *info, enum passtype pass); | 43 | struct recovery_info *info, enum passtype pass); |
44 | static int scan_revoke_records(journal_t *, struct buffer_head *, | 44 | static int scan_revoke_records(journal_t *, struct buffer_head *, |
45 | tid_t, struct recovery_info *); | 45 | tid_t, struct recovery_info *); |
46 | 46 | ||
47 | #ifdef __KERNEL__ | 47 | #ifdef __KERNEL__ |
48 | 48 | ||
49 | /* Release readahead buffers after use */ | 49 | /* Release readahead buffers after use */ |
50 | static void journal_brelse_array(struct buffer_head *b[], int n) | 50 | static void journal_brelse_array(struct buffer_head *b[], int n) |
51 | { | 51 | { |
52 | while (--n >= 0) | 52 | while (--n >= 0) |
53 | brelse (b[n]); | 53 | brelse (b[n]); |
54 | } | 54 | } |
55 | 55 | ||
56 | 56 | ||
57 | /* | 57 | /* |
58 | * When reading from the journal, we are going through the block device | 58 | * When reading from the journal, we are going through the block device |
59 | * layer directly and so there is no readahead being done for us. We | 59 | * layer directly and so there is no readahead being done for us. We |
60 | * need to implement any readahead ourselves if we want it to happen at | 60 | * need to implement any readahead ourselves if we want it to happen at |
61 | * all. Recovery is basically one long sequential read, so make sure we | 61 | * all. Recovery is basically one long sequential read, so make sure we |
62 | * do the IO in reasonably large chunks. | 62 | * do the IO in reasonably large chunks. |
63 | * | 63 | * |
64 | * This is not so critical that we need to be enormously clever about | 64 | * This is not so critical that we need to be enormously clever about |
65 | * the readahead size, though. 128K is a purely arbitrary, good-enough | 65 | * the readahead size, though. 128K is a purely arbitrary, good-enough |
66 | * fixed value. | 66 | * fixed value. |
67 | */ | 67 | */ |
68 | 68 | ||
69 | #define MAXBUF 8 | 69 | #define MAXBUF 8 |
70 | static int do_readahead(journal_t *journal, unsigned int start) | 70 | static int do_readahead(journal_t *journal, unsigned int start) |
71 | { | 71 | { |
72 | int err; | 72 | int err; |
73 | unsigned int max, nbufs, next; | 73 | unsigned int max, nbufs, next; |
74 | unsigned long long blocknr; | 74 | unsigned long long blocknr; |
75 | struct buffer_head *bh; | 75 | struct buffer_head *bh; |
76 | 76 | ||
77 | struct buffer_head * bufs[MAXBUF]; | 77 | struct buffer_head * bufs[MAXBUF]; |
78 | 78 | ||
79 | /* Do up to 128K of readahead */ | 79 | /* Do up to 128K of readahead */ |
80 | max = start + (128 * 1024 / journal->j_blocksize); | 80 | max = start + (128 * 1024 / journal->j_blocksize); |
81 | if (max > journal->j_maxlen) | 81 | if (max > journal->j_maxlen) |
82 | max = journal->j_maxlen; | 82 | max = journal->j_maxlen; |
83 | 83 | ||
84 | /* Do the readahead itself. We'll submit MAXBUF buffer_heads at | 84 | /* Do the readahead itself. We'll submit MAXBUF buffer_heads at |
85 | * a time to the block device IO layer. */ | 85 | * a time to the block device IO layer. */ |
86 | 86 | ||
87 | nbufs = 0; | 87 | nbufs = 0; |
88 | 88 | ||
89 | for (next = start; next < max; next++) { | 89 | for (next = start; next < max; next++) { |
90 | err = jbd2_journal_bmap(journal, next, &blocknr); | 90 | err = jbd2_journal_bmap(journal, next, &blocknr); |
91 | 91 | ||
92 | if (err) { | 92 | if (err) { |
93 | printk(KERN_ERR "JBD2: bad block at offset %u\n", | 93 | printk(KERN_ERR "JBD2: bad block at offset %u\n", |
94 | next); | 94 | next); |
95 | goto failed; | 95 | goto failed; |
96 | } | 96 | } |
97 | 97 | ||
98 | bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); | 98 | bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); |
99 | if (!bh) { | 99 | if (!bh) { |
100 | err = -ENOMEM; | 100 | err = -ENOMEM; |
101 | goto failed; | 101 | goto failed; |
102 | } | 102 | } |
103 | 103 | ||
104 | if (!buffer_uptodate(bh) && !buffer_locked(bh)) { | 104 | if (!buffer_uptodate(bh) && !buffer_locked(bh)) { |
105 | bufs[nbufs++] = bh; | 105 | bufs[nbufs++] = bh; |
106 | if (nbufs == MAXBUF) { | 106 | if (nbufs == MAXBUF) { |
107 | ll_rw_block(READ, nbufs, bufs); | 107 | ll_rw_block(READ, nbufs, bufs); |
108 | journal_brelse_array(bufs, nbufs); | 108 | journal_brelse_array(bufs, nbufs); |
109 | nbufs = 0; | 109 | nbufs = 0; |
110 | } | 110 | } |
111 | } else | 111 | } else |
112 | brelse(bh); | 112 | brelse(bh); |
113 | } | 113 | } |
114 | 114 | ||
115 | if (nbufs) | 115 | if (nbufs) |
116 | ll_rw_block(READ, nbufs, bufs); | 116 | ll_rw_block(READ, nbufs, bufs); |
117 | err = 0; | 117 | err = 0; |
118 | 118 | ||
119 | failed: | 119 | failed: |
120 | if (nbufs) | 120 | if (nbufs) |
121 | journal_brelse_array(bufs, nbufs); | 121 | journal_brelse_array(bufs, nbufs); |
122 | return err; | 122 | return err; |
123 | } | 123 | } |
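To make the batching in do_readahead() concrete: the 128K window and MAXBUF of 8 mean a 1K-block journal reads ahead up to 128 blocks in 16 batches, while a 4K-block journal reads 32 blocks in 4 batches. A standalone check of that arithmetic (the block sizes are illustrative):

#include <stdio.h>

int main(void)
{
	unsigned int blocksizes[] = { 1024, 2048, 4096 };

	for (int i = 0; i < 3; i++) {
		unsigned int nblocks = 128 * 1024 / blocksizes[i];

		/* MAXBUF == 8 buffer_heads per ll_rw_block() batch */
		printf("blocksize %4u: %3u blocks, %2u batches\n",
		       blocksizes[i], nblocks, (nblocks + 7) / 8);
	}
	return 0;
}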
124 | 124 | ||
125 | #endif /* __KERNEL__ */ | 125 | #endif /* __KERNEL__ */ |
126 | 126 | ||
127 | 127 | ||
128 | /* | 128 | /* |
129 | * Read a block from the journal | 129 | * Read a block from the journal |
130 | */ | 130 | */ |
131 | 131 | ||
132 | static int jread(struct buffer_head **bhp, journal_t *journal, | 132 | static int jread(struct buffer_head **bhp, journal_t *journal, |
133 | unsigned int offset) | 133 | unsigned int offset) |
134 | { | 134 | { |
135 | int err; | 135 | int err; |
136 | unsigned long long blocknr; | 136 | unsigned long long blocknr; |
137 | struct buffer_head *bh; | 137 | struct buffer_head *bh; |
138 | 138 | ||
139 | *bhp = NULL; | 139 | *bhp = NULL; |
140 | 140 | ||
141 | if (offset >= journal->j_maxlen) { | 141 | if (offset >= journal->j_maxlen) { |
142 | printk(KERN_ERR "JBD2: corrupted journal superblock\n"); | 142 | printk(KERN_ERR "JBD2: corrupted journal superblock\n"); |
143 | return -EIO; | 143 | return -EIO; |
144 | } | 144 | } |
145 | 145 | ||
146 | err = jbd2_journal_bmap(journal, offset, &blocknr); | 146 | err = jbd2_journal_bmap(journal, offset, &blocknr); |
147 | 147 | ||
148 | if (err) { | 148 | if (err) { |
149 | printk(KERN_ERR "JBD2: bad block at offset %u\n", | 149 | printk(KERN_ERR "JBD2: bad block at offset %u\n", |
150 | offset); | 150 | offset); |
151 | return err; | 151 | return err; |
152 | } | 152 | } |
153 | 153 | ||
154 | bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); | 154 | bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); |
155 | if (!bh) | 155 | if (!bh) |
156 | return -ENOMEM; | 156 | return -ENOMEM; |
157 | 157 | ||
158 | if (!buffer_uptodate(bh)) { | 158 | if (!buffer_uptodate(bh)) { |
159 | /* If this is a brand new buffer, start readahead. | 159 | /* If this is a brand new buffer, start readahead. |
160 | Otherwise, we assume we are already reading it. */ | 160 | Otherwise, we assume we are already reading it. */ |
161 | if (!buffer_req(bh)) | 161 | if (!buffer_req(bh)) |
162 | do_readahead(journal, offset); | 162 | do_readahead(journal, offset); |
163 | wait_on_buffer(bh); | 163 | wait_on_buffer(bh); |
164 | } | 164 | } |
165 | 165 | ||
166 | if (!buffer_uptodate(bh)) { | 166 | if (!buffer_uptodate(bh)) { |
167 | printk(KERN_ERR "JBD2: Failed to read block at offset %u\n", | 167 | printk(KERN_ERR "JBD2: Failed to read block at offset %u\n", |
168 | offset); | 168 | offset); |
169 | brelse(bh); | 169 | brelse(bh); |
170 | return -EIO; | 170 | return -EIO; |
171 | } | 171 | } |
172 | 172 | ||
173 | *bhp = bh; | 173 | *bhp = bh; |
174 | return 0; | 174 | return 0; |
175 | } | 175 | } |
176 | 176 | ||
177 | static int jbd2_descr_block_csum_verify(journal_t *j, | 177 | static int jbd2_descr_block_csum_verify(journal_t *j, |
178 | void *buf) | 178 | void *buf) |
179 | { | 179 | { |
180 | struct jbd2_journal_block_tail *tail; | 180 | struct jbd2_journal_block_tail *tail; |
181 | __be32 provided; | 181 | __be32 provided; |
182 | __u32 calculated; | 182 | __u32 calculated; |
183 | 183 | ||
184 | if (!jbd2_journal_has_csum_v2or3(j)) | 184 | if (!jbd2_journal_has_csum_v2or3(j)) |
185 | return 1; | 185 | return 1; |
186 | 186 | ||
187 | tail = (struct jbd2_journal_block_tail *)(buf + j->j_blocksize - | 187 | tail = (struct jbd2_journal_block_tail *)(buf + j->j_blocksize - |
188 | sizeof(struct jbd2_journal_block_tail)); | 188 | sizeof(struct jbd2_journal_block_tail)); |
189 | provided = tail->t_checksum; | 189 | provided = tail->t_checksum; |
190 | tail->t_checksum = 0; | 190 | tail->t_checksum = 0; |
191 | calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); | 191 | calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); |
192 | tail->t_checksum = provided; | 192 | tail->t_checksum = provided; |
193 | 193 | ||
194 | return provided == cpu_to_be32(calculated); | 194 | return provided == cpu_to_be32(calculated); |
195 | } | 195 | } |
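A userspace model of the save/zero/recompute/restore pattern used by jbd2_descr_block_csum_verify(). zlib's crc32() stands in for jbd2_chksum() (the kernel actually uses crc32c with a per-journal seed), and a raw 4-byte big-endian tail stands in for struct jbd2_journal_block_tail:

#include <string.h>
#include <zlib.h>

/* Verify a block whose last 4 bytes hold a big-endian CRC of the whole
 * block, computed with that tail field zeroed. */
static int tail_csum_ok(unsigned char *buf, size_t blocksize, uLong seed)
{
	unsigned char *tail = buf + blocksize - 4;
	unsigned char saved[4];
	uLong calc;

	memcpy(saved, tail, 4);			/* provided = tail->t_checksum */
	memset(tail, 0, 4);			/* tail->t_checksum = 0        */
	calc = crc32(seed, buf, (uInt)blocksize);
	memcpy(tail, saved, 4);			/* tail->t_checksum = provided */

	return saved[0] == ((calc >> 24) & 0xff) &&
	       saved[1] == ((calc >> 16) & 0xff) &&
	       saved[2] == ((calc >>  8) & 0xff) &&
	       saved[3] == ( calc        & 0xff);
}

int main(void)
{
	unsigned char blk[512] = "payload";	/* tail bytes start out zero */
	uLong seed = crc32(0L, Z_NULL, 0);
	uLong c = crc32(seed, blk, sizeof(blk));

	blk[508] = c >> 24; blk[509] = c >> 16;	/* store big-endian tail */
	blk[510] = c >> 8;  blk[511] = c;

	return tail_csum_ok(blk, sizeof(blk), seed) ? 0 : 1;
}

Restoring the on-disk value after verification matters because the same buffer may be re-read or re-verified later.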
196 | 196 | ||
197 | /* | 197 | /* |
198 | * Count the number of in-use tags in a journal descriptor block. | 198 | * Count the number of in-use tags in a journal descriptor block. |
199 | */ | 199 | */ |
200 | 200 | ||
201 | static int count_tags(journal_t *journal, struct buffer_head *bh) | 201 | static int count_tags(journal_t *journal, struct buffer_head *bh) |
202 | { | 202 | { |
203 | char * tagp; | 203 | char * tagp; |
204 | journal_block_tag_t * tag; | 204 | journal_block_tag_t * tag; |
205 | int nr = 0, size = journal->j_blocksize; | 205 | int nr = 0, size = journal->j_blocksize; |
206 | int tag_bytes = journal_tag_bytes(journal); | 206 | int tag_bytes = journal_tag_bytes(journal); |
207 | 207 | ||
208 | if (jbd2_journal_has_csum_v2or3(journal)) | 208 | if (jbd2_journal_has_csum_v2or3(journal)) |
209 | size -= sizeof(struct jbd2_journal_block_tail); | 209 | size -= sizeof(struct jbd2_journal_block_tail); |
210 | 210 | ||
211 | tagp = &bh->b_data[sizeof(journal_header_t)]; | 211 | tagp = &bh->b_data[sizeof(journal_header_t)]; |
212 | 212 | ||
213 | while ((tagp - bh->b_data + tag_bytes) <= size) { | 213 | while ((tagp - bh->b_data + tag_bytes) <= size) { |
214 | tag = (journal_block_tag_t *) tagp; | 214 | tag = (journal_block_tag_t *) tagp; |
215 | 215 | ||
216 | nr++; | 216 | nr++; |
217 | tagp += tag_bytes; | 217 | tagp += tag_bytes; |
218 | if (!(tag->t_flags & cpu_to_be16(JBD2_FLAG_SAME_UUID))) | 218 | if (!(tag->t_flags & cpu_to_be16(JBD2_FLAG_SAME_UUID))) |
219 | tagp += 16; | 219 | tagp += 16; |
220 | 220 | ||
221 | if (tag->t_flags & cpu_to_be16(JBD2_FLAG_LAST_TAG)) | 221 | if (tag->t_flags & cpu_to_be16(JBD2_FLAG_LAST_TAG)) |
222 | break; | 222 | break; |
223 | } | 223 | } |
224 | 224 | ||
225 | return nr; | 225 | return nr; |
226 | } | 226 | } |
227 | 227 | ||
228 | 228 | ||
229 | /* Make sure we wrap around the log correctly! */ | 229 | /* Make sure we wrap around the log correctly! */ |
230 | #define wrap(journal, var) \ | 230 | #define wrap(journal, var) \ |
231 | do { \ | 231 | do { \ |
232 | if (var >= (journal)->j_last) \ | 232 | if (var >= (journal)->j_last) \ |
233 | var -= ((journal)->j_last - (journal)->j_first); \ | 233 | var -= ((journal)->j_last - (journal)->j_first); \ |
234 | } while (0) | 234 | } while (0) |
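A worked example of the wrap() arithmetic with an illustrative journal of j_first = 1 and j_last = 1024: stepping past the last log block lands back on the first usable one, so the log is treated as a ring.

#include <stdio.h>

#define J_FIRST	1UL
#define J_LAST	1024UL

/* Same arithmetic as the wrap() macro above */
#define wrap(var)						\
	do {							\
		if ((var) >= J_LAST)				\
			(var) -= (J_LAST - J_FIRST);		\
	} while (0)

int main(void)
{
	unsigned long next = 1023;	/* last valid log block */

	next++;			/* step past the end of the log... */
	wrap(next);		/* ...and land on j_first again     */
	printf("%lu\n", next);	/* prints 1                        */
	return 0;
}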
235 | 235 | ||
236 | /** | 236 | /** |
236 | * jbd2_journal_recover - recovers an on-disk journal | 236 | * jbd2_journal_recover - recovers an on-disk journal |
238 | * @journal: the journal to recover | 238 | * @journal: the journal to recover |
239 | * | 239 | * |
240 | * The primary function for recovering the log contents when mounting a | 240 | * The primary function for recovering the log contents when mounting a |
241 | * journaled device. | 241 | * journaled device. |
242 | * | 242 | * |
243 | * Recovery is done in three passes. In the first pass, we look for the | 243 | * Recovery is done in three passes. In the first pass, we look for the |
244 | * end of the log. In the second, we assemble the list of revoke | 244 | * end of the log. In the second, we assemble the list of revoke |
245 | * blocks. In the third and final pass, we replay any un-revoked blocks | 245 | * blocks. In the third and final pass, we replay any un-revoked blocks |
246 | * in the log. | 246 | * in the log. |
247 | */ | 247 | */ |
248 | int jbd2_journal_recover(journal_t *journal) | 248 | int jbd2_journal_recover(journal_t *journal) |
249 | { | 249 | { |
250 | int err, err2; | 250 | int err, err2; |
251 | journal_superblock_t * sb; | 251 | journal_superblock_t * sb; |
252 | 252 | ||
253 | struct recovery_info info; | 253 | struct recovery_info info; |
254 | 254 | ||
255 | memset(&info, 0, sizeof(info)); | 255 | memset(&info, 0, sizeof(info)); |
256 | sb = journal->j_superblock; | 256 | sb = journal->j_superblock; |
257 | 257 | ||
258 | /* | 258 | /* |
259 | * The journal superblock's s_start field (the current log head) | 259 | * The journal superblock's s_start field (the current log head) |
260 | * is always zero if, and only if, the journal was cleanly | 260 | * is always zero if, and only if, the journal was cleanly |
261 | * unmounted. | 261 | * unmounted. |
262 | */ | 262 | */ |
263 | 263 | ||
264 | if (!sb->s_start) { | 264 | if (!sb->s_start) { |
265 | jbd_debug(1, "No recovery required, last transaction %d\n", | 265 | jbd_debug(1, "No recovery required, last transaction %d\n", |
266 | be32_to_cpu(sb->s_sequence)); | 266 | be32_to_cpu(sb->s_sequence)); |
267 | journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1; | 267 | journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1; |
268 | return 0; | 268 | return 0; |
269 | } | 269 | } |
270 | 270 | ||
271 | err = do_one_pass(journal, &info, PASS_SCAN); | 271 | err = do_one_pass(journal, &info, PASS_SCAN); |
272 | if (!err) | 272 | if (!err) |
273 | err = do_one_pass(journal, &info, PASS_REVOKE); | 273 | err = do_one_pass(journal, &info, PASS_REVOKE); |
274 | if (!err) | 274 | if (!err) |
275 | err = do_one_pass(journal, &info, PASS_REPLAY); | 275 | err = do_one_pass(journal, &info, PASS_REPLAY); |
276 | 276 | ||
277 | jbd_debug(1, "JBD2: recovery, exit status %d, " | 277 | jbd_debug(1, "JBD2: recovery, exit status %d, " |
278 | "recovered transactions %u to %u\n", | 278 | "recovered transactions %u to %u\n", |
279 | err, info.start_transaction, info.end_transaction); | 279 | err, info.start_transaction, info.end_transaction); |
280 | jbd_debug(1, "JBD2: Replayed %d and revoked %d/%d blocks\n", | 280 | jbd_debug(1, "JBD2: Replayed %d and revoked %d/%d blocks\n", |
281 | info.nr_replays, info.nr_revoke_hits, info.nr_revokes); | 281 | info.nr_replays, info.nr_revoke_hits, info.nr_revokes); |
282 | 282 | ||
283 | /* Restart the log at the next transaction ID, thus invalidating | 283 | /* Restart the log at the next transaction ID, thus invalidating |
284 | * any existing commit records in the log. */ | 284 | * any existing commit records in the log. */ |
285 | journal->j_transaction_sequence = ++info.end_transaction; | 285 | journal->j_transaction_sequence = ++info.end_transaction; |
286 | 286 | ||
287 | jbd2_journal_clear_revoke(journal); | 287 | jbd2_journal_clear_revoke(journal); |
288 | err2 = sync_blockdev(journal->j_fs_dev); | 288 | err2 = sync_blockdev(journal->j_fs_dev); |
289 | if (!err) | 289 | if (!err) |
290 | err = err2; | 290 | err = err2; |
291 | /* Make sure all replayed data is on permanent storage */ | 291 | /* Make sure all replayed data is on permanent storage */ |
292 | if (journal->j_flags & JBD2_BARRIER) { | 292 | if (journal->j_flags & JBD2_BARRIER) { |
293 | err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); | 293 | err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); |
294 | if (!err) | 294 | if (!err) |
295 | err = err2; | 295 | err = err2; |
296 | } | 296 | } |
297 | return err; | 297 | return err; |
298 | } | 298 | } |
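A compressed standalone model of the control flow above, with a stub standing in for do_one_pass(): each pass is gated on the previous one succeeding, and only PASS_REPLAY writes anything back to the filesystem.

#include <stdio.h>

enum passtype { PASS_SCAN, PASS_REVOKE, PASS_REPLAY };

/* Stub standing in for do_one_pass(); a real pass walks the log. */
static int one_pass(enum passtype pass)
{
	static const char *name[] = { "scan", "revoke", "replay" };

	printf("running pass: %s\n", name[pass]);
	return 0;		/* 0 == success, as in the kernel code */
}

int main(void)
{
	int err;

	err = one_pass(PASS_SCAN);		/* find the end of the log  */
	if (!err)
		err = one_pass(PASS_REVOKE);	/* build the revoke table   */
	if (!err)
		err = one_pass(PASS_REPLAY);	/* replay un-revoked blocks */
	return err ? 1 : 0;
}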
299 | 299 | ||
300 | /** | 300 | /** |
301 | * jbd2_journal_skip_recovery - Start journal and wipe existing records | 301 | * jbd2_journal_skip_recovery - Start journal and wipe existing records |
302 | * @journal: journal to startup | 302 | * @journal: journal to startup |
303 | * | 303 | * |
304 | * Locate any valid recovery information from the journal and set up the | 304 | * Locate any valid recovery information from the journal and set up the |
305 | * journal structures in memory to ignore it (presumably because the | 305 | * journal structures in memory to ignore it (presumably because the |
306 | * caller has evidence that it is out of date). | 306 | * caller has evidence that it is out of date). |
307 | * This function doesn't appear to be exported. | 307 | * This function doesn't appear to be exported. |
308 | * | 308 | * |
309 | * We perform one pass over the journal to allow us to tell the user how | 309 | * We perform one pass over the journal to allow us to tell the user how |
310 | * much recovery information is being erased, and to let us initialise | 310 | * much recovery information is being erased, and to let us initialise |
311 | * the journal transaction sequence numbers to the next unused ID. | 311 | * the journal transaction sequence numbers to the next unused ID. |
312 | */ | 312 | */ |
313 | int jbd2_journal_skip_recovery(journal_t *journal) | 313 | int jbd2_journal_skip_recovery(journal_t *journal) |
314 | { | 314 | { |
315 | int err; | 315 | int err; |
316 | 316 | ||
317 | struct recovery_info info; | 317 | struct recovery_info info; |
318 | 318 | ||
319 | memset (&info, 0, sizeof(info)); | 319 | memset (&info, 0, sizeof(info)); |
320 | 320 | ||
321 | err = do_one_pass(journal, &info, PASS_SCAN); | 321 | err = do_one_pass(journal, &info, PASS_SCAN); |
322 | 322 | ||
323 | if (err) { | 323 | if (err) { |
324 | printk(KERN_ERR "JBD2: error %d scanning journal\n", err); | 324 | printk(KERN_ERR "JBD2: error %d scanning journal\n", err); |
325 | ++journal->j_transaction_sequence; | 325 | ++journal->j_transaction_sequence; |
326 | } else { | 326 | } else { |
327 | #ifdef CONFIG_JBD2_DEBUG | 327 | #ifdef CONFIG_JBD2_DEBUG |
328 | int dropped = info.end_transaction - | 328 | int dropped = info.end_transaction - |
329 | be32_to_cpu(journal->j_superblock->s_sequence); | 329 | be32_to_cpu(journal->j_superblock->s_sequence); |
330 | jbd_debug(1, | 330 | jbd_debug(1, |
331 | "JBD2: ignoring %d transaction%s from the journal.\n", | 331 | "JBD2: ignoring %d transaction%s from the journal.\n", |
332 | dropped, (dropped == 1) ? "" : "s"); | 332 | dropped, (dropped == 1) ? "" : "s"); |
333 | #endif | 333 | #endif |
334 | journal->j_transaction_sequence = ++info.end_transaction; | 334 | journal->j_transaction_sequence = ++info.end_transaction; |
335 | } | 335 | } |
336 | 336 | ||
337 | journal->j_tail = 0; | 337 | journal->j_tail = 0; |
338 | return err; | 338 | return err; |
339 | } | 339 | } |
340 | 340 | ||
341 | static inline unsigned long long read_tag_block(journal_t *journal, | 341 | static inline unsigned long long read_tag_block(journal_t *journal, |
342 | journal_block_tag_t *tag) | 342 | journal_block_tag_t *tag) |
343 | { | 343 | { |
344 | unsigned long long block = be32_to_cpu(tag->t_blocknr); | 344 | unsigned long long block = be32_to_cpu(tag->t_blocknr); |
345 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) | 345 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) |
346 | block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32; | 346 | block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32; |
347 | return block; | 347 | return block; |
348 | } | 348 | } |
349 | 349 | ||
350 | /* | 350 | /* |
351 | * calc_chksums calculates the checksums for the blocks described in the | 351 | * calc_chksums calculates the checksums for the blocks described in the |
352 | * descriptor block. | 352 | * descriptor block. |
353 | */ | 353 | */ |
354 | static int calc_chksums(journal_t *journal, struct buffer_head *bh, | 354 | static int calc_chksums(journal_t *journal, struct buffer_head *bh, |
355 | unsigned long *next_log_block, __u32 *crc32_sum) | 355 | unsigned long *next_log_block, __u32 *crc32_sum) |
356 | { | 356 | { |
357 | int i, num_blks, err; | 357 | int i, num_blks, err; |
358 | unsigned long io_block; | 358 | unsigned long io_block; |
359 | struct buffer_head *obh; | 359 | struct buffer_head *obh; |
360 | 360 | ||
361 | num_blks = count_tags(journal, bh); | 361 | num_blks = count_tags(journal, bh); |
362 | /* Calculate checksum of the descriptor block. */ | 362 | /* Calculate checksum of the descriptor block. */ |
363 | *crc32_sum = crc32_be(*crc32_sum, (void *)bh->b_data, bh->b_size); | 363 | *crc32_sum = crc32_be(*crc32_sum, (void *)bh->b_data, bh->b_size); |
364 | 364 | ||
365 | for (i = 0; i < num_blks; i++) { | 365 | for (i = 0; i < num_blks; i++) { |
366 | io_block = (*next_log_block)++; | 366 | io_block = (*next_log_block)++; |
367 | wrap(journal, *next_log_block); | 367 | wrap(journal, *next_log_block); |
368 | err = jread(&obh, journal, io_block); | 368 | err = jread(&obh, journal, io_block); |
369 | if (err) { | 369 | if (err) { |
370 | printk(KERN_ERR "JBD2: IO error %d recovering block " | 370 | printk(KERN_ERR "JBD2: IO error %d recovering block " |
371 | "%lu in log\n", err, io_block); | 371 | "%lu in log\n", err, io_block); |
372 | return 1; | 372 | return 1; |
373 | } else { | 373 | } else { |
374 | *crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data, | 374 | *crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data, |
375 | obh->b_size); | 375 | obh->b_size); |
376 | } | 376 | } |
377 | put_bh(obh); | 377 | put_bh(obh); |
378 | } | 378 | } |
379 | return 0; | 379 | return 0; |
380 | } | 380 | } |
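The property calc_chksums() relies on is that a CRC can be folded over a run of buffers one at a time and equal the CRC of their concatenation. A userspace demonstration with zlib's crc32() (the kernel code uses crc32_be with an ~0 initial value; the split point here is arbitrary):

#include <assert.h>
#include <zlib.h>

int main(void)
{
	const unsigned char log[] = "descriptor-then-data-blocks";
	uInt len = sizeof(log) - 1;

	/* One-shot CRC of the whole range... */
	uLong whole = crc32(crc32(0L, Z_NULL, 0), log, len);

	/* ...equals the block-at-a-time accumulation calc_chksums()
	 * performs across the descriptor block and its data blocks. */
	uLong acc = crc32(0L, Z_NULL, 0);
	acc = crc32(acc, log, 10);
	acc = crc32(acc, log + 10, len - 10);

	assert(whole == acc);
	return 0;
}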
381 | 381 | ||
382 | static int jbd2_commit_block_csum_verify(journal_t *j, void *buf) | 382 | static int jbd2_commit_block_csum_verify(journal_t *j, void *buf) |
383 | { | 383 | { |
384 | struct commit_header *h; | 384 | struct commit_header *h; |
385 | __be32 provided; | 385 | __be32 provided; |
386 | __u32 calculated; | 386 | __u32 calculated; |
387 | 387 | ||
388 | if (!jbd2_journal_has_csum_v2or3(j)) | 388 | if (!jbd2_journal_has_csum_v2or3(j)) |
389 | return 1; | 389 | return 1; |
390 | 390 | ||
391 | h = buf; | 391 | h = buf; |
392 | provided = h->h_chksum[0]; | 392 | provided = h->h_chksum[0]; |
393 | h->h_chksum[0] = 0; | 393 | h->h_chksum[0] = 0; |
394 | calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); | 394 | calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); |
395 | h->h_chksum[0] = provided; | 395 | h->h_chksum[0] = provided; |
396 | 396 | ||
397 | return provided == cpu_to_be32(calculated); | 397 | return provided == cpu_to_be32(calculated); |
398 | } | 398 | } |
399 | 399 | ||
400 | static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, | 400 | static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, |
401 | void *buf, __u32 sequence) | 401 | void *buf, __u32 sequence) |
402 | { | 402 | { |
403 | journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag; | 403 | journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag; |
404 | __u32 csum32; | 404 | __u32 csum32; |
405 | __be32 seq; | 405 | __be32 seq; |
406 | 406 | ||
407 | if (!jbd2_journal_has_csum_v2or3(j)) | 407 | if (!jbd2_journal_has_csum_v2or3(j)) |
408 | return 1; | 408 | return 1; |
409 | 409 | ||
410 | seq = cpu_to_be32(sequence); | 410 | seq = cpu_to_be32(sequence); |
411 | csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); | 411 | csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); |
412 | csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize); | 412 | csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize); |
413 | 413 | ||
414 | if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3)) | 414 | if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3)) |
415 | return tag3->t_checksum == cpu_to_be32(csum32); | 415 | return tag3->t_checksum == cpu_to_be32(csum32); |
416 | else | 416 | else |
417 | return tag->t_checksum == cpu_to_be16(csum32); | 417 | return tag->t_checksum == cpu_to_be16(csum32); |
418 | } | 418 | } |
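The tag checksum above folds the commit sequence number into the CRC before the block data, so identical data replayed under a different transaction ID produces a different checksum. A userspace illustration of that chaining, again with zlib's crc32() standing in for jbd2_chksum():

#include <arpa/inet.h>	/* htonl(), i.e. a cpu_to_be32() equivalent */
#include <assert.h>
#include <stdint.h>
#include <zlib.h>

static uLong tag_csum(uLong seed, uint32_t sequence,
		      const unsigned char *blk, uInt len)
{
	uint32_t seq = htonl(sequence);	/* big-endian, as on disk */
	uLong c = crc32(seed, (const unsigned char *)&seq, sizeof(seq));

	return crc32(c, blk, len);	/* then fold in the block data */
}

int main(void)
{
	const unsigned char blk[16] = "same-block-data";
	uLong seed = crc32(0L, Z_NULL, 0);

	/* Same data, different transaction => different checksum. */
	assert(tag_csum(seed, 41, blk, sizeof(blk)) !=
	       tag_csum(seed, 42, blk, sizeof(blk)));
	return 0;
}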
419 | 419 | ||
420 | static int do_one_pass(journal_t *journal, | 420 | static int do_one_pass(journal_t *journal, |
421 | struct recovery_info *info, enum passtype pass) | 421 | struct recovery_info *info, enum passtype pass) |
422 | { | 422 | { |
423 | unsigned int first_commit_ID, next_commit_ID; | 423 | unsigned int first_commit_ID, next_commit_ID; |
424 | unsigned long next_log_block; | 424 | unsigned long next_log_block; |
425 | int err, success = 0; | 425 | int err, success = 0; |
426 | journal_superblock_t * sb; | 426 | journal_superblock_t * sb; |
427 | journal_header_t * tmp; | 427 | journal_header_t * tmp; |
428 | struct buffer_head * bh; | 428 | struct buffer_head * bh; |
429 | unsigned int sequence; | 429 | unsigned int sequence; |
430 | int blocktype; | 430 | int blocktype; |
431 | int tag_bytes = journal_tag_bytes(journal); | 431 | int tag_bytes = journal_tag_bytes(journal); |
432 | __u32 crc32_sum = ~0; /* Transactional Checksums */ | 432 | __u32 crc32_sum = ~0; /* Transactional Checksums */ |
433 | int descr_csum_size = 0; | 433 | int descr_csum_size = 0; |
434 | int block_error = 0; | 434 | int block_error = 0; |
435 | 435 | ||
436 | /* | 436 | /* |
437 | * First thing is to establish what we expect to find in the log | 437 | * First thing is to establish what we expect to find in the log |
438 | * (in terms of transaction IDs), and where (in terms of log | 438 | * (in terms of transaction IDs), and where (in terms of log |
439 | * block offsets): query the superblock. | 439 | * block offsets): query the superblock. |
440 | */ | 440 | */ |
441 | 441 | ||
442 | sb = journal->j_superblock; | 442 | sb = journal->j_superblock; |
443 | next_commit_ID = be32_to_cpu(sb->s_sequence); | 443 | next_commit_ID = be32_to_cpu(sb->s_sequence); |
444 | next_log_block = be32_to_cpu(sb->s_start); | 444 | next_log_block = be32_to_cpu(sb->s_start); |
445 | 445 | ||
446 | first_commit_ID = next_commit_ID; | 446 | first_commit_ID = next_commit_ID; |
447 | if (pass == PASS_SCAN) | 447 | if (pass == PASS_SCAN) |
448 | info->start_transaction = first_commit_ID; | 448 | info->start_transaction = first_commit_ID; |
449 | 449 | ||
450 | jbd_debug(1, "Starting recovery pass %d\n", pass); | 450 | jbd_debug(1, "Starting recovery pass %d\n", pass); |
451 | 451 | ||
452 | /* | 452 | /* |
453 | * Now we walk through the log, transaction by transaction, | 453 | * Now we walk through the log, transaction by transaction, |
454 | * making sure that each transaction has a commit block in the | 454 | * making sure that each transaction has a commit block in the |
455 | * expected place. Each complete transaction gets replayed back | 455 | * expected place. Each complete transaction gets replayed back |
456 | * into the main filesystem. | 456 | * into the main filesystem. |
457 | */ | 457 | */ |
458 | 458 | ||
459 | while (1) { | 459 | while (1) { |
460 | int flags; | 460 | int flags; |
461 | char * tagp; | 461 | char * tagp; |
462 | journal_block_tag_t * tag; | 462 | journal_block_tag_t * tag; |
463 | struct buffer_head * obh; | 463 | struct buffer_head * obh; |
464 | struct buffer_head * nbh; | 464 | struct buffer_head * nbh; |
465 | 465 | ||
466 | cond_resched(); | 466 | cond_resched(); |
467 | 467 | ||
468 | /* If we already know where to stop the log traversal, | 468 | /* If we already know where to stop the log traversal, |
469 | * check right now that we haven't gone past the end of | 469 | * check right now that we haven't gone past the end of |
470 | * the log. */ | 470 | * the log. */ |
471 | 471 | ||
472 | if (pass != PASS_SCAN) | 472 | if (pass != PASS_SCAN) |
473 | if (tid_geq(next_commit_ID, info->end_transaction)) | 473 | if (tid_geq(next_commit_ID, info->end_transaction)) |
474 | break; | 474 | break; |
475 | 475 | ||
476 | jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n", | 476 | jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n", |
477 | next_commit_ID, next_log_block, journal->j_last); | 477 | next_commit_ID, next_log_block, journal->j_last); |
478 | 478 | ||
479 | /* Skip over each chunk of the transaction looking for | 479 | /* Skip over each chunk of the transaction looking for |
480 | * either the next descriptor block or the final commit | 480 | * either the next descriptor block or the final commit |
481 | * record. */ | 481 | * record. */ |
482 | 482 | ||
483 | jbd_debug(3, "JBD2: checking block %ld\n", next_log_block); | 483 | jbd_debug(3, "JBD2: checking block %ld\n", next_log_block); |
484 | err = jread(&bh, journal, next_log_block); | 484 | err = jread(&bh, journal, next_log_block); |
485 | if (err) | 485 | if (err) |
486 | goto failed; | 486 | goto failed; |
487 | 487 | ||
488 | next_log_block++; | 488 | next_log_block++; |
489 | wrap(journal, next_log_block); | 489 | wrap(journal, next_log_block); |
490 | 490 | ||
491 | /* What kind of buffer is it? | 491 | /* What kind of buffer is it? |
492 | * | 492 | * |
493 | * If it is a descriptor block, check that it has the | 493 | * If it is a descriptor block, check that it has the |
494 | * expected sequence number. Otherwise, we're all done | 494 | * expected sequence number. Otherwise, we're all done |
495 | * here. */ | 495 | * here. */ |
496 | 496 | ||
497 | tmp = (journal_header_t *)bh->b_data; | 497 | tmp = (journal_header_t *)bh->b_data; |
498 | 498 | ||
499 | if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER)) { | 499 | if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER)) { |
500 | brelse(bh); | 500 | brelse(bh); |
501 | break; | 501 | break; |
502 | } | 502 | } |
503 | 503 | ||
504 | blocktype = be32_to_cpu(tmp->h_blocktype); | 504 | blocktype = be32_to_cpu(tmp->h_blocktype); |
505 | sequence = be32_to_cpu(tmp->h_sequence); | 505 | sequence = be32_to_cpu(tmp->h_sequence); |
506 | jbd_debug(3, "Found magic %d, sequence %d\n", | 506 | jbd_debug(3, "Found magic %d, sequence %d\n", |
507 | blocktype, sequence); | 507 | blocktype, sequence); |
508 | 508 | ||
509 | if (sequence != next_commit_ID) { | 509 | if (sequence != next_commit_ID) { |
510 | brelse(bh); | 510 | brelse(bh); |
511 | break; | 511 | break; |
512 | } | 512 | } |
513 | 513 | ||
514 | /* OK, we have a valid descriptor block which matches | 514 | /* OK, we have a valid descriptor block which matches |
515 | * all of the sequence number checks. What are we going | 515 | * all of the sequence number checks. What are we going |
516 | * to do with it? That depends on the pass... */ | 516 | * to do with it? That depends on the pass... */ |
517 | 517 | ||
518 | switch(blocktype) { | 518 | switch(blocktype) { |
519 | case JBD2_DESCRIPTOR_BLOCK: | 519 | case JBD2_DESCRIPTOR_BLOCK: |
520 | /* Verify checksum first */ | 520 | /* Verify checksum first */ |
521 | if (jbd2_journal_has_csum_v2or3(journal)) | 521 | if (jbd2_journal_has_csum_v2or3(journal)) |
522 | descr_csum_size = | 522 | descr_csum_size = |
523 | sizeof(struct jbd2_journal_block_tail); | 523 | sizeof(struct jbd2_journal_block_tail); |
524 | if (descr_csum_size > 0 && | 524 | if (descr_csum_size > 0 && |
525 | !jbd2_descr_block_csum_verify(journal, | 525 | !jbd2_descr_block_csum_verify(journal, |
526 | bh->b_data)) { | 526 | bh->b_data)) { |
| | 527 | printk(KERN_ERR "JBD2: Invalid checksum " |
| | 528 | "recovering block %lu in log\n", |
| | 529 | next_log_block); |
527 | err = -EIO; | 530 | err = -EIO; |
528 | brelse(bh); | 531 | brelse(bh); |
529 | goto failed; | 532 | goto failed; |
530 | } | 533 | } |
531 | 534 | ||
532 | /* If it is a valid descriptor block, replay it | 535 | /* If it is a valid descriptor block, replay it |
533 | * in pass REPLAY; if journal_checksums enabled, then | 536 | * in pass REPLAY; if journal_checksums enabled, then |
534 | * calculate checksums in PASS_SCAN, otherwise, | 537 | * calculate checksums in PASS_SCAN, otherwise, |
535 | * just skip over the blocks it describes. */ | 538 | * just skip over the blocks it describes. */ |
536 | if (pass != PASS_REPLAY) { | 539 | if (pass != PASS_REPLAY) { |
537 | if (pass == PASS_SCAN && | 540 | if (pass == PASS_SCAN && |
538 | JBD2_HAS_COMPAT_FEATURE(journal, | 541 | JBD2_HAS_COMPAT_FEATURE(journal, |
539 | JBD2_FEATURE_COMPAT_CHECKSUM) && | 542 | JBD2_FEATURE_COMPAT_CHECKSUM) && |
540 | !info->end_transaction) { | 543 | !info->end_transaction) { |
541 | if (calc_chksums(journal, bh, | 544 | if (calc_chksums(journal, bh, |
542 | &next_log_block, | 545 | &next_log_block, |
543 | &crc32_sum)) { | 546 | &crc32_sum)) { |
544 | put_bh(bh); | 547 | put_bh(bh); |
545 | break; | 548 | break; |
546 | } | 549 | } |
547 | put_bh(bh); | 550 | put_bh(bh); |
548 | continue; | 551 | continue; |
549 | } | 552 | } |
550 | next_log_block += count_tags(journal, bh); | 553 | next_log_block += count_tags(journal, bh); |
551 | wrap(journal, next_log_block); | 554 | wrap(journal, next_log_block); |
552 | put_bh(bh); | 555 | put_bh(bh); |
553 | continue; | 556 | continue; |
554 | } | 557 | } |
555 | 558 | ||
556 | /* A descriptor block: we can now write all of | 559 | /* A descriptor block: we can now write all of |
557 | * the data blocks. Yay, useful work is finally | 560 | * the data blocks. Yay, useful work is finally |
558 | * getting done here! */ | 561 | * getting done here! */ |
559 | 562 | ||
560 | tagp = &bh->b_data[sizeof(journal_header_t)]; | 563 | tagp = &bh->b_data[sizeof(journal_header_t)]; |
561 | while ((tagp - bh->b_data + tag_bytes) | 564 | while ((tagp - bh->b_data + tag_bytes) |
562 | <= journal->j_blocksize - descr_csum_size) { | 565 | <= journal->j_blocksize - descr_csum_size) { |
563 | unsigned long io_block; | 566 | unsigned long io_block; |
564 | 567 | ||
565 | tag = (journal_block_tag_t *) tagp; | 568 | tag = (journal_block_tag_t *) tagp; |
566 | flags = be16_to_cpu(tag->t_flags); | 569 | flags = be16_to_cpu(tag->t_flags); |
567 | 570 | ||
568 | io_block = next_log_block++; | 571 | io_block = next_log_block++; |
569 | wrap(journal, next_log_block); | 572 | wrap(journal, next_log_block); |
570 | err = jread(&obh, journal, io_block); | 573 | err = jread(&obh, journal, io_block); |
571 | if (err) { | 574 | if (err) { |
572 | /* Recover what we can, but | 575 | /* Recover what we can, but |
573 | * report failure at the end. */ | 576 | * report failure at the end. */ |
574 | success = err; | 577 | success = err; |
575 | printk(KERN_ERR | 578 | printk(KERN_ERR |
576 | "JBD2: IO error %d recovering " | 579 | "JBD2: IO error %d recovering " |
577 | "block %ld in log\n", | 580 | "block %ld in log\n", |
578 | err, io_block); | 581 | err, io_block); |
579 | } else { | 582 | } else { |
580 | unsigned long long blocknr; | 583 | unsigned long long blocknr; |
581 | 584 | ||
582 | J_ASSERT(obh != NULL); | 585 | J_ASSERT(obh != NULL); |
583 | blocknr = read_tag_block(journal, | 586 | blocknr = read_tag_block(journal, |
584 | tag); | 587 | tag); |
585 | 588 | ||
586 | /* If the block has been | 589 | /* If the block has been |
587 | * revoked, then we're all done | 590 | * revoked, then we're all done |
588 | * here. */ | 591 | * here. */ |
589 | if (jbd2_journal_test_revoke | 592 | if (jbd2_journal_test_revoke |
590 | (journal, blocknr, | 593 | (journal, blocknr, |
591 | next_commit_ID)) { | 594 | next_commit_ID)) { |
592 | brelse(obh); | 595 | brelse(obh); |
593 | ++info->nr_revoke_hits; | 596 | ++info->nr_revoke_hits; |
594 | goto skip_write; | 597 | goto skip_write; |
595 | } | 598 | } |
596 | 599 | ||
597 | /* Look for block corruption */ | 600 | /* Look for block corruption */ |
598 | if (!jbd2_block_tag_csum_verify( | 601 | if (!jbd2_block_tag_csum_verify( |
599 | journal, tag, obh->b_data, | 602 | journal, tag, obh->b_data, |
600 | be32_to_cpu(tmp->h_sequence))) { | 603 | be32_to_cpu(tmp->h_sequence))) { |
601 | brelse(obh); | 604 | brelse(obh); |
602 | success = -EIO; | 605 | success = -EIO; |
603 | printk(KERN_ERR "JBD2: Invalid " | 606 | printk(KERN_ERR "JBD2: Invalid " |
604 | "checksum recovering " | 607 | "checksum recovering " |
605 | "block %llu in log\n", | 608 | "block %llu in log\n", |
606 | blocknr); | 609 | blocknr); |
607 | block_error = 1; | 610 | block_error = 1; |
608 | goto skip_write; | 611 | goto skip_write; |
609 | } | 612 | } |
610 | 613 | ||
611 | /* Find a buffer for the new | 614 | /* Find a buffer for the new |
612 | * data being restored */ | 615 | * data being restored */ |
613 | nbh = __getblk(journal->j_fs_dev, | 616 | nbh = __getblk(journal->j_fs_dev, |
614 | blocknr, | 617 | blocknr, |
615 | journal->j_blocksize); | 618 | journal->j_blocksize); |
616 | if (nbh == NULL) { | 619 | if (nbh == NULL) { |
617 | printk(KERN_ERR | 620 | printk(KERN_ERR |
618 | "JBD2: Out of memory " | 621 | "JBD2: Out of memory " |
619 | "during recovery.\n"); | 622 | "during recovery.\n"); |
620 | err = -ENOMEM; | 623 | err = -ENOMEM; |
621 | brelse(bh); | 624 | brelse(bh); |
622 | brelse(obh); | 625 | brelse(obh); |
623 | goto failed; | 626 | goto failed; |
624 | } | 627 | } |
625 | 628 | ||
626 | lock_buffer(nbh); | 629 | lock_buffer(nbh); |
627 | memcpy(nbh->b_data, obh->b_data, | 630 | memcpy(nbh->b_data, obh->b_data, |
628 | journal->j_blocksize); | 631 | journal->j_blocksize); |
629 | if (flags & JBD2_FLAG_ESCAPE) { | 632 | if (flags & JBD2_FLAG_ESCAPE) { |
630 | *((__be32 *)nbh->b_data) = | 633 | *((__be32 *)nbh->b_data) = |
631 | cpu_to_be32(JBD2_MAGIC_NUMBER); | 634 | cpu_to_be32(JBD2_MAGIC_NUMBER); |
632 | } | 635 | } |
633 | 636 | ||
634 | BUFFER_TRACE(nbh, "marking dirty"); | 637 | BUFFER_TRACE(nbh, "marking dirty"); |
635 | set_buffer_uptodate(nbh); | 638 | set_buffer_uptodate(nbh); |
636 | mark_buffer_dirty(nbh); | 639 | mark_buffer_dirty(nbh); |
637 | BUFFER_TRACE(nbh, "marking uptodate"); | 640 | BUFFER_TRACE(nbh, "marking uptodate"); |
638 | ++info->nr_replays; | 641 | ++info->nr_replays; |
639 | /* ll_rw_block(WRITE, 1, &nbh); */ | 642 | /* ll_rw_block(WRITE, 1, &nbh); */ |
640 | unlock_buffer(nbh); | 643 | unlock_buffer(nbh); |
641 | brelse(obh); | 644 | brelse(obh); |
642 | brelse(nbh); | 645 | brelse(nbh); |
643 | } | 646 | } |
644 | 647 | ||
645 | skip_write: | 648 | skip_write: |
646 | tagp += tag_bytes; | 649 | tagp += tag_bytes; |
647 | if (!(flags & JBD2_FLAG_SAME_UUID)) | 650 | if (!(flags & JBD2_FLAG_SAME_UUID)) |
648 | tagp += 16; | 651 | tagp += 16; |
649 | 652 | ||
650 | if (flags & JBD2_FLAG_LAST_TAG) | 653 | if (flags & JBD2_FLAG_LAST_TAG) |
651 | break; | 654 | break; |
652 | } | 655 | } |
653 | 656 | ||
654 | brelse(bh); | 657 | brelse(bh); |
655 | continue; | 658 | continue; |
656 | 659 | ||
657 | case JBD2_COMMIT_BLOCK: | 660 | case JBD2_COMMIT_BLOCK: |
658 | /* How to differentiate between interrupted commit | 661 | /* How to differentiate between interrupted commit |
659 | * and journal corruption? | 662 | * and journal corruption? |
660 | * | 663 | * |
661 | * {nth transaction} | 664 | * {nth transaction} |
662 | * Checksum Verification Failed | 665 | * Checksum Verification Failed |
663 | * | | 666 | * | |
664 | * ____________________ | 667 | * ____________________ |
665 | * | | | 668 | * | | |
666 | * async_commit sync_commit | 669 | * async_commit sync_commit |
667 | * | | | 670 | * | | |
668 | * | GO TO NEXT "Journal Corruption" | 671 | * | GO TO NEXT "Journal Corruption" |
669 | * | TRANSACTION | 672 | * | TRANSACTION |
670 | * | | 673 | * | |
671 | * {(n+1)th transaction} | 674 | * {(n+1)th transaction} |
672 | * | | 675 | * | |
673 | * _______|______________ | 676 | * _______|______________ |
674 | * | | | 677 | * | | |
675 | * Commit block found Commit block not found | 678 | * Commit block found Commit block not found |
676 | * | | | 679 | * | | |
677 | * "Journal Corruption" | | 680 | * "Journal Corruption" | |
678 | * _____________|_________ | 681 | * _____________|_________ |
679 | * | | | 682 | * | | |
680 | * nth trans corrupt OR nth trans | 683 | * nth trans corrupt OR nth trans |
681 | * and (n+1)th interrupted interrupted | 684 | * and (n+1)th interrupted interrupted |
682 | * before commit block | 685 | * before commit block |
683 | * could reach the disk. | 686 | * could reach the disk. |
684 | * (We cannot distinguish between the above | 687 | * (We cannot distinguish between the above |
685 | * conditions, hence assume | 688 | * conditions, hence assume |
686 | * "Interrupted Commit".) | 689 | * "Interrupted Commit".) |
687 | */ | 690 | */ |
688 | 691 | ||
689 | /* Found an expected commit block: if checksums | 692 | /* Found an expected commit block: if checksums |
690 | * are present, verify them in PASS_SCAN; else not | 693 | * are present, verify them in PASS_SCAN; else not |
691 | * much to do other than move on to the next sequence | 694 | * much to do other than move on to the next sequence |
692 | * number. */ | 695 | * number. */ |
693 | if (pass == PASS_SCAN && | 696 | if (pass == PASS_SCAN && |
694 | JBD2_HAS_COMPAT_FEATURE(journal, | 697 | JBD2_HAS_COMPAT_FEATURE(journal, |
695 | JBD2_FEATURE_COMPAT_CHECKSUM)) { | 698 | JBD2_FEATURE_COMPAT_CHECKSUM)) { |
696 | int chksum_err, chksum_seen; | 699 | int chksum_err, chksum_seen; |
697 | struct commit_header *cbh = | 700 | struct commit_header *cbh = |
698 | (struct commit_header *)bh->b_data; | 701 | (struct commit_header *)bh->b_data; |
699 | unsigned found_chksum = | 702 | unsigned found_chksum = |
700 | be32_to_cpu(cbh->h_chksum[0]); | 703 | be32_to_cpu(cbh->h_chksum[0]); |
701 | 704 | ||
702 | chksum_err = chksum_seen = 0; | 705 | chksum_err = chksum_seen = 0; |
703 | 706 | ||
704 | if (info->end_transaction) { | 707 | if (info->end_transaction) { |
705 | journal->j_failed_commit = | 708 | journal->j_failed_commit = |
706 | info->end_transaction; | 709 | info->end_transaction; |
707 | brelse(bh); | 710 | brelse(bh); |
708 | break; | 711 | break; |
709 | } | 712 | } |
710 | 713 | ||
711 | if (crc32_sum == found_chksum && | 714 | if (crc32_sum == found_chksum && |
712 | cbh->h_chksum_type == JBD2_CRC32_CHKSUM && | 715 | cbh->h_chksum_type == JBD2_CRC32_CHKSUM && |
713 | cbh->h_chksum_size == | 716 | cbh->h_chksum_size == |
714 | JBD2_CRC32_CHKSUM_SIZE) | 717 | JBD2_CRC32_CHKSUM_SIZE) |
715 | chksum_seen = 1; | 718 | chksum_seen = 1; |
716 | else if (!(cbh->h_chksum_type == 0 && | 719 | else if (!(cbh->h_chksum_type == 0 && |
717 | cbh->h_chksum_size == 0 && | 720 | cbh->h_chksum_size == 0 && |
718 | found_chksum == 0 && | 721 | found_chksum == 0 && |
719 | !chksum_seen)) | 722 | !chksum_seen)) |
720 | /* | 723 | /* |
721 | * If the fs is mounted using an old kernel and then | 724 | * If the fs is mounted using an old kernel and then |
722 | * a kernel with journal_checksum is used, then we | 725 | * a kernel with journal_checksum is used, then we |
723 | * get a situation where the journal flag has the | 726 | * get a situation where the journal flag has the |
724 | * checksum flag set but checksums are not | 727 | * checksum flag set but checksums are not |
725 | * present, i.e. chksum = 0, in the individual | 728 | * present, i.e. chksum = 0, in the individual |
726 | * commit blocks. | 729 | * commit blocks. |
727 | * Hence, to avoid checksum failures in this | 730 | * Hence, to avoid checksum failures in this |
728 | * situation, this extra check is added. | 731 | * situation, this extra check is added. |
729 | */ | 732 | */ |
730 | chksum_err = 1; | 733 | chksum_err = 1; |
731 | 734 | ||
732 | if (chksum_err) { | 735 | if (chksum_err) { |
733 | info->end_transaction = next_commit_ID; | 736 | info->end_transaction = next_commit_ID; |
734 | 737 | ||
735 | if (!JBD2_HAS_INCOMPAT_FEATURE(journal, | 738 | if (!JBD2_HAS_INCOMPAT_FEATURE(journal, |
736 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)){ | 739 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)){ |
737 | journal->j_failed_commit = | 740 | journal->j_failed_commit = |
738 | next_commit_ID; | 741 | next_commit_ID; |
739 | brelse(bh); | 742 | brelse(bh); |
740 | break; | 743 | break; |
741 | } | 744 | } |
742 | } | 745 | } |
743 | crc32_sum = ~0; | 746 | crc32_sum = ~0; |
744 | } | 747 | } |
745 | if (pass == PASS_SCAN && | 748 | if (pass == PASS_SCAN && |
746 | !jbd2_commit_block_csum_verify(journal, | 749 | !jbd2_commit_block_csum_verify(journal, |
747 | bh->b_data)) { | 750 | bh->b_data)) { |
748 | info->end_transaction = next_commit_ID; | 751 | info->end_transaction = next_commit_ID; |
749 | 752 | ||
750 | if (!JBD2_HAS_INCOMPAT_FEATURE(journal, | 753 | if (!JBD2_HAS_INCOMPAT_FEATURE(journal, |
751 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { | 754 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { |
752 | journal->j_failed_commit = | 755 | journal->j_failed_commit = |
753 | next_commit_ID; | 756 | next_commit_ID; |
754 | brelse(bh); | 757 | brelse(bh); |
755 | break; | 758 | break; |
756 | } | 759 | } |
757 | } | 760 | } |
758 | brelse(bh); | 761 | brelse(bh); |
759 | next_commit_ID++; | 762 | next_commit_ID++; |
760 | continue; | 763 | continue; |
761 | 764 | ||
762 | case JBD2_REVOKE_BLOCK: | 765 | case JBD2_REVOKE_BLOCK: |
763 | /* If we aren't in the REVOKE pass, then we can | 766 | /* If we aren't in the REVOKE pass, then we can |
764 | * just skip over this block. */ | 767 | * just skip over this block. */ |
765 | if (pass != PASS_REVOKE) { | 768 | if (pass != PASS_REVOKE) { |
766 | brelse(bh); | 769 | brelse(bh); |
767 | continue; | 770 | continue; |
768 | } | 771 | } |
769 | 772 | ||
770 | err = scan_revoke_records(journal, bh, | 773 | err = scan_revoke_records(journal, bh, |
771 | next_commit_ID, info); | 774 | next_commit_ID, info); |
772 | brelse(bh); | 775 | brelse(bh); |
773 | if (err) | 776 | if (err) |
774 | goto failed; | 777 | goto failed; |
775 | continue; | 778 | continue; |
776 | 779 | ||
777 | default: | 780 | default: |
778 | jbd_debug(3, "Unrecognised magic %d, end of scan.\n", | 781 | jbd_debug(3, "Unrecognised magic %d, end of scan.\n", |
779 | blocktype); | 782 | blocktype); |
780 | brelse(bh); | 783 | brelse(bh); |
781 | goto done; | 784 | goto done; |
782 | } | 785 | } |
783 | } | 786 | } |
784 | 787 | ||
785 | done: | 788 | done: |
786 | /* | 789 | /* |
787 | * We broke out of the log scan loop: either we came to the | 790 | * We broke out of the log scan loop: either we came to the |
788 | * known end of the log or we found an unexpected block in the | 791 | * known end of the log or we found an unexpected block in the |
789 | * log. If the latter happened, then we know that the "current" | 792 | * log. If the latter happened, then we know that the "current" |
790 | * transaction marks the end of the valid log. | 793 | * transaction marks the end of the valid log. |
791 | */ | 794 | */ |
792 | 795 | ||
793 | if (pass == PASS_SCAN) { | 796 | if (pass == PASS_SCAN) { |
794 | if (!info->end_transaction) | 797 | if (!info->end_transaction) |
795 | info->end_transaction = next_commit_ID; | 798 | info->end_transaction = next_commit_ID; |
796 | } else { | 799 | } else { |
797 | /* It's really bad news if different passes end up at | 800 | /* It's really bad news if different passes end up at |
798 | * different places (but possible due to IO errors). */ | 801 | * different places (but possible due to IO errors). */ |
799 | if (info->end_transaction != next_commit_ID) { | 802 | if (info->end_transaction != next_commit_ID) { |
800 | printk(KERN_ERR "JBD2: recovery pass %d ended at " | 803 | printk(KERN_ERR "JBD2: recovery pass %d ended at " |
801 | "transaction %u, expected %u\n", | 804 | "transaction %u, expected %u\n", |
802 | pass, next_commit_ID, info->end_transaction); | 805 | pass, next_commit_ID, info->end_transaction); |
803 | if (!success) | 806 | if (!success) |
804 | success = -EIO; | 807 | success = -EIO; |
805 | } | 808 | } |
806 | } | 809 | } |
807 | if (block_error && success == 0) | 810 | if (block_error && success == 0) |
808 | success = -EIO; | 811 | success = -EIO; |
809 | return success; | 812 | return success; |
810 | 813 | ||
811 | failed: | 814 | failed: |
812 | return err; | 815 | return err; |
813 | } | 816 | } |
814 | 817 | ||
815 | static int jbd2_revoke_block_csum_verify(journal_t *j, | 818 | static int jbd2_revoke_block_csum_verify(journal_t *j, |
816 | void *buf) | 819 | void *buf) |
817 | { | 820 | { |
818 | struct jbd2_journal_revoke_tail *tail; | 821 | struct jbd2_journal_revoke_tail *tail; |
819 | __be32 provided; | 822 | __be32 provided; |
820 | __u32 calculated; | 823 | __u32 calculated; |
821 | 824 | ||
822 | if (!jbd2_journal_has_csum_v2or3(j)) | 825 | if (!jbd2_journal_has_csum_v2or3(j)) |
823 | return 1; | 826 | return 1; |
824 | 827 | ||
825 | tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize - | 828 | tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize - |
826 | sizeof(struct jbd2_journal_revoke_tail)); | 829 | sizeof(struct jbd2_journal_revoke_tail)); |
827 | provided = tail->r_checksum; | 830 | provided = tail->r_checksum; |
828 | tail->r_checksum = 0; | 831 | tail->r_checksum = 0; |
829 | calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); | 832 | calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); |
830 | tail->r_checksum = provided; | 833 | tail->r_checksum = provided; |
831 | 834 | ||
832 | return provided == cpu_to_be32(calculated); | 835 | return provided == cpu_to_be32(calculated); |
833 | } | 836 | } |
834 | 837 | ||
835 | /* Scan a revoke record, marking all blocks mentioned as revoked. */ | 838 | /* Scan a revoke record, marking all blocks mentioned as revoked. */ |
836 | 839 | ||
837 | static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, | 840 | static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, |
838 | tid_t sequence, struct recovery_info *info) | 841 | tid_t sequence, struct recovery_info *info) |
839 | { | 842 | { |
840 | jbd2_journal_revoke_header_t *header; | 843 | jbd2_journal_revoke_header_t *header; |
841 | int offset, max; | 844 | int offset, max; |
842 | int record_len = 4; | 845 | int record_len = 4; |
843 | 846 | ||
844 | header = (jbd2_journal_revoke_header_t *) bh->b_data; | 847 | header = (jbd2_journal_revoke_header_t *) bh->b_data; |
845 | offset = sizeof(jbd2_journal_revoke_header_t); | 848 | offset = sizeof(jbd2_journal_revoke_header_t); |
846 | max = be32_to_cpu(header->r_count); | 849 | max = be32_to_cpu(header->r_count); |
847 | 850 | ||
848 | if (!jbd2_revoke_block_csum_verify(journal, header)) | 851 | if (!jbd2_revoke_block_csum_verify(journal, header)) |
849 | return -EINVAL; | 852 | return -EINVAL; |
850 | 853 | ||
851 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) | 854 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) |
852 | record_len = 8; | 855 | record_len = 8; |
853 | 856 | ||
854 | while (offset + record_len <= max) { | 857 | while (offset + record_len <= max) { |
855 | unsigned long long blocknr; | 858 | unsigned long long blocknr; |
856 | int err; | 859 | int err; |
857 | 860 | ||
858 | if (record_len == 4) | 861 | if (record_len == 4) |
859 | blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset))); | 862 | blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset))); |
860 | else | 863 | else |
861 | blocknr = be64_to_cpu(* ((__be64 *) (bh->b_data+offset))); | 864 | blocknr = be64_to_cpu(* ((__be64 *) (bh->b_data+offset))); |
862 | offset += record_len; | 865 | offset += record_len; |
863 | err = jbd2_journal_set_revoke(journal, blocknr, sequence); | 866 | err = jbd2_journal_set_revoke(journal, blocknr, sequence); |
864 | if (err) | 867 | if (err) |
865 | return err; | 868 | return err; |
866 | ++info->nr_revokes; | 869 | ++info->nr_revokes; |
867 | } | 870 | } |
868 | return 0; | 871 | return 0; |
869 | } | 872 | } |
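A toy parser for the record layout scan_revoke_records() walks: r_count gives the number of bytes used in the block (header included), followed by big-endian block numbers of record_len 4 (or 8 with JBD2_FEATURE_INCOMPAT_64BIT). The bare 4-byte count here is a simplification of jbd2_journal_revoke_header_t, and the checksum tail is omitted:

#include <stdio.h>
#include <stdint.h>

static uint32_t be32(const unsigned char *p)	/* be32_to_cpu() equivalent */
{
	return (uint32_t)p[0] << 24 | (uint32_t)p[1] << 16 |
	       (uint32_t)p[2] << 8  | (uint32_t)p[3];
}

int main(void)
{
	const unsigned char blk[] = {
		0, 0, 0, 12,	/* r_count: 12 bytes used in total */
		0, 0, 0, 7,	/* revoked block 7   */
		0, 0, 1, 0,	/* revoked block 256 */
	};
	int record_len = 4;	/* 8 with JBD2_FEATURE_INCOMPAT_64BIT */
	int offset = 4;		/* sizeof the (simplified) header     */
	int max = (int)be32(blk);

	while (offset + record_len <= max) {
		printf("revoked block %u\n", be32(blk + offset));
		offset += record_len;
	}
	return 0;
}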
870 | 873 |