Merge branch 'for-linus' of git://git.open-osd.org/linux-open-osd

* 'for-linus' of git://git.open-osd.org/linux-open-osd:
  exofs: Multi-device mirror support
  exofs: Move all operations to an io_engine
  exofs: move osd.c to ios.c
  exofs: statfs blocks is sectors not FS blocks
  exofs: Prints on mount and unmout
  exofs: refactor exofs_i_info initialization into common helper
  exofs: dbg-print less
  exofs: More sane debug print
  trivial: some small fixes in exofs documentation

Merge branch 'for-linus' of git://git.open-osd.org/linux-open-osd
* 'for-linus' of git://git.open-osd.org/linux-open-osd:
  exofs: Multi-device mirror support
  exofs: Move all operations to an io_engine
  exofs: move osd.c to ios.c
  exofs: statfs blocks is sectors not FS blocks
  exofs: Prints on mount and unmout
  exofs: refactor exofs_i_info initialization into common helper
  exofs: dbg-print less
  exofs: More sane debug print
  trivial: some small fixes in exofs documentation
Linus Torvalds
2 parents fc1495bf99 04dc1e88ad
Showing 10 changed files Side-by-side Diff
Documentation/filesystems/00-INDEX
Documentation/filesystems/exofs.txt
fs/exofs/Kbuild
fs/exofs/common.h
fs/exofs/exofs.h
fs/exofs/inode.c
fs/exofs/ios.c
fs/exofs/osd.c
fs/exofs/pnfs.h
fs/exofs/super.c
@@ -36,6 +36,8 @@
 	- info about directory notification in Linux.
 ecryptfs.txt
 	- docs on eCryptfs: stacked cryptographic filesystem for Linux.
+exofs.txt
+	- info, usage, mount options, design about EXOFS.
 ext2.txt
 	- info, mount options and specifications for the Ext2 filesystem.
 ext3.txt
@@ -60,13 +60,13 @@
  
    mkfs.exofs --pid=65536 --format /dev/osd0
  
-   The --format is optional if not specified no OSD_FORMAT will be
-   preformed and a clean file system will be created in the specified pid,
+   The --format is optional. If not specified, no OSD_FORMAT will be
+   performed and a clean file system will be created in the specified pid,
    in the available space of the target. (Use --format=size_in_meg to limit
    the total LUN space available)
  
-   If pid already exist it will be deleted and a new one will be created in it's
-   place. Be careful.
+   If pid already exists, it will be deleted and a new one will be created in
+   its place. Be careful.
  
    An exofs lives inside a single OSD partition. You can create multiple exofs
    filesystems on the same device using multiple pids.
@@ -81,7 +81,7 @@
  
 7. For reference (See do-exofs example script):
 	do-exofs start - an example of how to perform the above steps.
-	do-exofs stop -  an example of how to unmount the file system.
+	do-exofs stop - an example of how to unmount the file system.
 	do-exofs format - an example of how to format and mkfs a new exofs.
  
 8. Extra compilation flags (uncomment in fs/exofs/Kbuild):
@@ -104,8 +104,8 @@
     exofs specific options: Options are separated by commas (,)
 		pid=<integer> - The partition number to mount/create as
                                 container of the filesystem.
-                                This option is mandatory
-                to=<integer>  - Timeout in ticks for a single command
+                                This option is mandatory.
+                to=<integer>  - Timeout in ticks for a single command.
                                 default is (60 * HZ) [for debugging only]
  
 ===============================================================================
@@ -116,7 +116,7 @@
   with a special ID (defined in common.h).
   Information included in the file system control block is used to fill the
   in-memory superblock structure at mount time. This object is created before
-  the file system is used by mkexofs.c It contains information such as:
+  the file system is used by mkexofs.c. It contains information such as:
 	- The file system's magic number
 	- The next inode number to be allocated
  
@@ -134,8 +134,8 @@
   attributes. This applies to both regular files and other types (directories,
   device files, symlinks, etc.).
  
-* Credentials are generated per object (inode and superblock) when they is
-  created in memory (read off disk or created). The credential works for all
+* Credentials are generated per object (inode and superblock) when they are
+  created in memory (read from disk or created). The credential works for all
   operations and is used as long as the object remains in memory.
  
 * Async OSD operations are used whenever possible, but the target may execute
@@ -145,7 +145,8 @@
   from executing in reverse order:
 	- The following are handled with the OBJ_CREATED and OBJ_2BCREATED
 	  flags. OBJ_CREATED is set when we know the object exists on the OSD -
-	  in create's callback function, and when we successfully do a read_inode.
+	  in create's callback function, and when we successfully do a
+	  read_inode.
 	  OBJ_2BCREATED is set in the beginning of the create function, so we
 	  know that we should wait.
 		- create/delete: delete should wait until the object is created
@@ -12,6 +12,6 @@
 # Kbuild - Gets included from the Kernels Makefile and build system
 #
  
-exofs-y := osd.o inode.o file.o symlink.o namei.o dir.o super.o
+exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o
 obj-$(CONFIG_EXOFS_FS) += exofs.o
@@ -49,6 +49,7 @@
 #define EXOFS_MIN_PID   0x10000	/* Smallest partition ID */
 #define EXOFS_OBJ_OFF	0x10000	/* offset for objects */
 #define EXOFS_SUPER_ID	0x10000	/* object ID for on-disk superblock */
+#define EXOFS_DEVTABLE_ID 0x10001 /* object ID for on-disk device table */
 #define EXOFS_ROOT_ID	0x10002	/* object ID for root directory */
  
 /* exofs Application specific page/attribute */
  
  
  
  
@@ -78,18 +79,68 @@
 #define EXOFS_SUPER_MAGIC	0x5DF5
  
 /*
- * The file system control block - stored in an object's data (mainly, the one
- * with ID EXOFS_SUPER_ID).  This is where the in-memory superblock is stored
- * on disk.  Right now it just has a magic value, which is basically a sanity
- * check on our ability to communicate with the object store.
+ * The file system control block - stored in object EXOFS_SUPER_ID's data.
+ * This is where the in-memory superblock is stored on disk.
  */
+enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1};
 struct exofs_fscb {
 	__le64  s_nextid;	/* Highest object ID used */
-	__le32  s_numfiles;	/* Number of files on fs */
+	__le64  s_numfiles;	/* Number of files on fs */
+	__le32	s_version;	/* == EXOFS_FSCB_VER */
 	__le16  s_magic;	/* Magic signature */
 	__le16  s_newfs;	/* Non-zero if this is a new fs */
-};
  
+	/* From here on it's a static part, only written by mkexofs */
+	__le64	s_dev_table_oid;   /* Reserved, not used */
+	__le64	s_dev_table_count; /* == 0 means no dev_table */
+} __packed;
+
+/*
+ * Describes the raid used in the FS. It is part of the device table.
+ * This is taken from the pNFS-objects definition. In exofs we use one
+ * raid policy throughout the filesystem. (NOTE: the funny alignment
+ * at the beginning; we take care of it in exofs_device_table.)
+ */
+struct exofs_dt_data_map {
+	__le32	cb_num_comps;
+	__le64	cb_stripe_unit;
+	__le32	cb_group_width;
+	__le32	cb_group_depth;
+	__le32	cb_mirror_cnt;
+	__le32	cb_raid_algorithm;
+} __packed;
+
+/*
+ * This is an osd device information descriptor. It is a single entry in
+ * the exofs device table. It describes an osd target lun which
+ * contains data belonging to this FS. (Same partition_id on all devices)
+ */
+struct exofs_dt_device_info {
+	__le32	systemid_len;
+	u8	systemid[OSD_SYSTEMID_LEN];
+	__le64	long_name_offset;	/* If !0 then offset-in-file */
+	__le32	osdname_len;		/* presumably bytes used in osdname[] - TODO confirm */
+	u8	osdname[44];		/* Embedded, usually an ASCII uuid */
+} __packed;
+
+/*
+ * The EXOFS device table - stored in object EXOFS_DEVTABLE_ID's data.
+ * It contains the raid used for this multi-device FS and an array of
+ * participating devices.
+ */
+struct exofs_device_table {
+	__le32				dt_version;	/* == EXOFS_DT_VER */
+	struct exofs_dt_data_map	dt_data_map;	/* Raid policy to use */
+
+	/* Reserved space for future use. Total including this:
+	 * (8 * sizeof(le64))
+	 */
+	__le64				__Resurved[4];
+
+	__le64				dt_num_devices;	/* Array size */
+	struct exofs_dt_device_info	dt_dev_table[];	/* Array of devices */
+} __packed;
+
 /****************************************************************************
  * inode-related things
  ****************************************************************************/
@@ -154,24 +205,6 @@
 #define EXOFS_DIR_REC_LEN(name_len) \
 	(((name_len) + offsetof(struct exofs_dir_entry, name)  + \
 	  EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND)
-
-/*************************
- * function declarations *
- *************************/
-/* osd.c                 */
-void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
-			   const struct osd_obj_id *obj);
-
-int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid);
-static inline int exofs_check_ok(struct osd_request *or)
-{
-	return exofs_check_ok_resid(or, NULL, NULL);
-}
-int exofs_sync_op(struct osd_request *or, int timeout, u8 *cred);
-int exofs_async_op(struct osd_request *or,
-	osd_req_done_fn *async_done, void *caller_context, u8 *cred);
-
-int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr);
  
 #endif /*ifndef __EXOFS_COM_H__*/
@@ -30,13 +30,17 @@
  * along with exofs; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
+#ifndef __EXOFS_H__
+#define __EXOFS_H__
  
 #include <linux/fs.h>
 #include <linux/time.h>
 #include "common.h"
  
-#ifndef __EXOFS_H__
-#define __EXOFS_H__
+/* FIXME: Remove once pnfs hits mainline
+ * #include <linux/exportfs/pnfs_osd_xdr.h>
+ */
+#include "pnfs.h"
  
 #define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a)
  
@@ -55,7 +59,7 @@
  * our extension to the in-memory superblock
  */
 struct exofs_sb_info {
-	struct osd_dev	*s_dev;			/* returned by get_osd_dev    */
+	struct exofs_fscb s_fscb;		/* Written often, pre-allocate*/
 	osd_id		s_pid;			/* partition ID of file system*/
 	int		s_timeout;		/* timeout for OSD operations */
 	uint64_t	s_nextid;		/* highest object ID used     */
@@ -63,7 +67,11 @@
 	spinlock_t	s_next_gen_lock;	/* spinlock for gen # update  */
 	u32		s_next_generation;	/* next gen # to use          */
 	atomic_t	s_curr_pending;		/* number of pending commands */
-	uint8_t		s_cred[OSD_CAP_LEN];	/* all-powerful credential    */
+	uint8_t		s_cred[OSD_CAP_LEN];	/* credential for the fscb    */
+
+	struct pnfs_osd_data_map data_map;	/* Default raid to use        */
+	unsigned	s_numdevs;		/* Num of devices in array    */
+	struct osd_dev	*s_ods[1];		/* Variable length, minimum 1 */
 };
  
 /*
@@ -79,6 +87,50 @@
 	struct inode   vfs_inode;          /* normal in-memory inode          */
 };
  
+/* Returns the object ID of an inode: its i_ino offset by EXOFS_OBJ_OFF. */
+static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
+{
+	return oi->vfs_inode.i_ino + EXOFS_OBJ_OFF;
+}
+
+struct exofs_io_state;
+typedef void (*exofs_io_done_fn)(struct exofs_io_state *or, void *private);
+
+/*
+ * Per-operation I/O state handed to the ios.c entry points
+ * (exofs_sbi_create/remove/write/read). It is obtained with
+ * exofs_get_io_state() and released with exofs_put_io_state().
+ * The struct ends in a flexible per_dev[] array; exofs_io_state_size()
+ * gives the byte size for a given number of devices.
+ */
+struct exofs_io_state {
+	struct kref		kref;
+
+	/* Completion callback and the argument passed back to it */
+	void			*private;
+	exofs_io_done_fn	done;
+
+	struct exofs_sb_info	*sbi;
+	struct osd_obj_id	obj;
+	u8			*cred;
+
+	/* Global read/write IO */
+	loff_t			offset;
+	unsigned long		length;
+	void			*kern_buff;
+	struct bio		*bio;
+
+	/* Attributes */
+	unsigned		in_attr_len;
+	struct osd_attr 	*in_attr;
+	unsigned		out_attr_len;
+	struct osd_attr 	*out_attr;
+
+	/* Variable array of size numdevs */
+	unsigned numdevs;
+	struct exofs_per_dev_state {
+		struct osd_request *or;
+		struct bio *bio;
+	} per_dev[];
+};
+
+/* Size in bytes of an exofs_io_state followed by numdevs per_dev[] entries. */
+static inline unsigned exofs_io_state_size(unsigned numdevs)
+{
+	return sizeof(struct exofs_io_state) +
+		sizeof(struct exofs_per_dev_state) * numdevs;
+}
+
 /*
  * our inode flags
  */
@@ -130,6 +182,42 @@
 /*************************
  * function declarations *
  *************************/
+
+/* ios.c */
+void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
+			   const struct osd_obj_id *obj);
+int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
+		    u64 offset, void *p, unsigned length);
+
+int  exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** ios);
+void exofs_put_io_state(struct exofs_io_state *ios);
+
+int exofs_check_io(struct exofs_io_state *ios, u64 *resid);
+
+int exofs_sbi_create(struct exofs_io_state *ios);
+int exofs_sbi_remove(struct exofs_io_state *ios);
+int exofs_sbi_write(struct exofs_io_state *ios);
+int exofs_sbi_read(struct exofs_io_state *ios);
+
+int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr);
+
+int exofs_oi_truncate(struct exofs_i_info *oi, u64 new_len);
+/*
+ * Write on behalf of an inode: point ios at the inode's object ID and
+ * credential, then hand it to exofs_sbi_write() and return its result.
+ */
+static inline int exofs_oi_write(struct exofs_i_info *oi,
+				 struct exofs_io_state *ios)
+{
+	ios->obj.id = exofs_oi_objno(oi);
+	ios->cred = oi->i_cred;
+	return exofs_sbi_write(ios);
+}
+
+/*
+ * Read on behalf of an inode: point ios at the inode's object ID and
+ * credential, then hand it to exofs_sbi_read() and return its result.
+ */
+static inline int exofs_oi_read(struct exofs_i_info *oi,
+				struct exofs_io_state *ios)
+{
+	ios->obj.id = exofs_oi_objno(oi);
+	ios->cred = oi->i_cred;
+	return exofs_sbi_read(ios);
+}
+
 /* inode.c               */
 void exofs_truncate(struct inode *inode);
 int exofs_setattr(struct dentry *, struct iattr *);
@@ -169,6 +257,7 @@
  
 /* inode.c           */
 extern const struct address_space_operations exofs_aops;
+extern const struct osd_attr g_attr_logical_length;
  
 /* namei.c           */
 extern const struct inode_operations exofs_dir_inode_operations;
@@ -37,15 +37,18 @@
  
 #include "exofs.h"
  
-#ifdef CONFIG_EXOFS_DEBUG
-#  define EXOFS_DEBUG_OBJ_ISIZE 1
-#endif
+#define EXOFS_DBGMSG2(M...) do {} while (0)
  
+enum { BIO_MAX_PAGES_KMALLOC =
+		(PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
+};
+
 struct page_collect {
 	struct exofs_sb_info *sbi;
 	struct request_queue *req_q;
 	struct inode *inode;
 	unsigned expected_pages;
+	struct exofs_io_state *ios;
  
 	struct bio *bio;
 	unsigned nr_pages;
  
  
  
@@ -54,22 +57,23 @@
 };
  
 static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
-		struct inode *inode)
+		       struct inode *inode)
 {
 	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
  
 	pcol->sbi = sbi;
-	pcol->req_q = osd_request_queue(sbi->s_dev);
+	/* Create master bios on first Q, later on cloning, each clone will be
+	 * allocated on its destination Q
+	 */
+	pcol->req_q = osd_request_queue(sbi->s_ods[0]);
 	pcol->inode = inode;
 	pcol->expected_pages = expected_pages;
  
+	pcol->ios = NULL;
 	pcol->bio = NULL;
 	pcol->nr_pages = 0;
 	pcol->length = 0;
 	pcol->pg_first = -1;
-
-	EXOFS_DBGMSG("_pcol_init ino=0x%lx expected_pages=%u\n", inode->i_ino,
-		     expected_pages);
 }
  
 static void _pcol_reset(struct page_collect *pcol)
  
  
  
  
  
  
@@ -80,35 +84,49 @@
 	pcol->nr_pages = 0;
 	pcol->length = 0;
 	pcol->pg_first = -1;
-	EXOFS_DBGMSG("_pcol_reset ino=0x%lx expected_pages=%u\n",
-		     pcol->inode->i_ino, pcol->expected_pages);
+	pcol->ios = NULL;
  
 	/* this is probably the end of the loop but in writes
 	 * it might not end here. don't be left with nothing
 	 */
 	if (!pcol->expected_pages)
-		pcol->expected_pages = 128;
+		pcol->expected_pages = BIO_MAX_PAGES_KMALLOC;
 }
  
 static int pcol_try_alloc(struct page_collect *pcol)
 {
-	int pages = min_t(unsigned, pcol->expected_pages, BIO_MAX_PAGES);
+	int pages = min_t(unsigned, pcol->expected_pages,
+			  BIO_MAX_PAGES_KMALLOC);
  
+	if (!pcol->ios) { /* First time allocate io_state */
+		int ret = exofs_get_io_state(pcol->sbi, &pcol->ios);
+
+		if (ret)
+			return ret;
+	}
+
 	for (; pages; pages >>= 1) {
-		pcol->bio = bio_alloc(GFP_KERNEL, pages);
+		pcol->bio = bio_kmalloc(GFP_KERNEL, pages);
 		if (likely(pcol->bio))
 			return 0;
 	}
  
-	EXOFS_ERR("Failed to kcalloc expected_pages=%u\n",
+	EXOFS_ERR("Failed to bio_kmalloc expected_pages=%u\n",
 		  pcol->expected_pages);
 	return -ENOMEM;
 }
  
 static void pcol_free(struct page_collect *pcol)
 {
-	bio_put(pcol->bio);
-	pcol->bio = NULL;
+	if (pcol->bio) {
+		bio_put(pcol->bio);
+		pcol->bio = NULL;
+	}
+
+	if (pcol->ios) {
+		exofs_put_io_state(pcol->ios);
+		pcol->ios = NULL;
+	}
 }
  
 static int pcol_add_page(struct page_collect *pcol, struct page *page,
  
  
  
@@ -161,22 +179,17 @@
 /* Called at the end of reads, to optionally unlock pages and update their
  * status.
  */
-static int __readpages_done(struct osd_request *or, struct page_collect *pcol,
-			    bool do_unlock)
+static int __readpages_done(struct page_collect *pcol, bool do_unlock)
 {
 	struct bio_vec *bvec;
 	int i;
 	u64 resid;
 	u64 good_bytes;
 	u64 length = 0;
-	int ret = exofs_check_ok_resid(or, &resid, NULL);
+	int ret = exofs_check_io(pcol->ios, &resid);
  
-	osd_end_request(or);
-
 	if (likely(!ret))
 		good_bytes = pcol->length;
-	else if (!resid)
-		good_bytes = 0;
 	else
 		good_bytes = pcol->length - resid;
  
@@ -198,7 +211,7 @@
 		else
 			page_stat = ret;
  
-		EXOFS_DBGMSG("    readpages_done(0x%lx, 0x%lx) %s\n",
+		EXOFS_DBGMSG2("    readpages_done(0x%lx, 0x%lx) %s\n",
 			  inode->i_ino, page->index,
 			  page_stat ? "bad_bytes" : "good_bytes");
  
  
  
@@ -214,13 +227,13 @@
 }
  
 /* callback of async reads */
-static void readpages_done(struct osd_request *or, void *p)
+static void readpages_done(struct exofs_io_state *ios, void *p)
 {
 	struct page_collect *pcol = p;
  
-	__readpages_done(or, pcol, true);
+	__readpages_done(pcol, true);
 	atomic_dec(&pcol->sbi->s_curr_pending);
-	kfree(p);
+	kfree(pcol);
 }
  
 static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
  
  
@@ -238,17 +251,13 @@
  
 		unlock_page(page);
 	}
-	pcol_free(pcol);
 }
  
 static int read_exec(struct page_collect *pcol, bool is_sync)
 {
 	struct exofs_i_info *oi = exofs_i(pcol->inode);
-	struct osd_obj_id obj = {pcol->sbi->s_pid,
-					pcol->inode->i_ino + EXOFS_OBJ_OFF};
-	struct osd_request *or = NULL;
+	struct exofs_io_state *ios = pcol->ios;
 	struct page_collect *pcol_copy = NULL;
-	loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
 	int ret;
  
 	if (!pcol->bio)
  
  
@@ -257,17 +266,13 @@
 	/* see comment in _readpage() about sync reads */
 	WARN_ON(is_sync && (pcol->nr_pages != 1));
  
-	or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
-	if (unlikely(!or)) {
-		ret = -ENOMEM;
-		goto err;
-	}
+	ios->bio = pcol->bio;
+	ios->length = pcol->length;
+	ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT;
  
-	osd_req_read(or, &obj, i_start, pcol->bio, pcol->length);
-
 	if (is_sync) {
-		exofs_sync_op(or, pcol->sbi->s_timeout, oi->i_cred);
-		return __readpages_done(or, pcol, false);
+		exofs_oi_read(oi, pcol->ios);
+		return __readpages_done(pcol, false);
 	}
  
 	pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
  
@@ -277,14 +282,16 @@
 	}
  
 	*pcol_copy = *pcol;
-	ret = exofs_async_op(or, readpages_done, pcol_copy, oi->i_cred);
+	ios->done = readpages_done;
+	ios->private = pcol_copy;
+	ret = exofs_oi_read(oi, ios);
 	if (unlikely(ret))
 		goto err;
  
 	atomic_inc(&pcol->sbi->s_curr_pending);
  
 	EXOFS_DBGMSG("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
-		  obj.id, _LLU(i_start), pcol->length);
+		  ios->obj.id, _LLU(ios->offset), pcol->length);
  
 	/* pages ownership was passed to pcol_copy */
 	_pcol_reset(pcol);
  
  
@@ -293,12 +300,10 @@
 err:
 	if (!is_sync)
 		_unlock_pcol_pages(pcol, ret, READ);
-	else /* Pages unlocked by caller in sync mode only free bio */
-		pcol_free(pcol);
  
+	pcol_free(pcol);
+
 	kfree(pcol_copy);
-	if (or)
-		osd_end_request(or);
 	return ret;
 }
  
  
@@ -370,12 +375,12 @@
 	if (len != PAGE_CACHE_SIZE)
 		zero_user(page, len, PAGE_CACHE_SIZE - len);
  
-	EXOFS_DBGMSG("    readpage_strip(0x%lx, 0x%lx) len=0x%zx\n",
+	EXOFS_DBGMSG2("    readpage_strip(0x%lx, 0x%lx) len=0x%zx\n",
 		     inode->i_ino, page->index, len);
  
 	ret = pcol_add_page(pcol, page, len);
 	if (ret) {
-		EXOFS_DBGMSG("Failed pcol_add_page pages[i]=%p "
+		EXOFS_DBGMSG2("Failed pcol_add_page pages[i]=%p "
 			  "this_len=0x%zx nr_pages=%u length=0x%lx\n",
 			  page, len, pcol->nr_pages, pcol->length);
  
@@ -419,9 +424,8 @@
  
 	_pcol_init(&pcol, 1, page->mapping->host);
  
-	/* readpage_strip might call read_exec(,async) inside at several places
-	 * but this is safe for is_async=0 since read_exec will not do anything
-	 * when we have a single page.
+	/* readpage_strip might call read_exec(,is_sync==false) at several
+	 * places but not if we have a single page.
 	 */
 	ret = readpage_strip(&pcol, page);
 	if (ret) {
@@ -440,8 +444,8 @@
 	return _readpage(page, false);
 }
  
-/* Callback for osd_write. All writes are asynchronouse */
-static void writepages_done(struct osd_request *or, void *p)
+/* Callback for osd_write. All writes are asynchronous */
+static void writepages_done(struct exofs_io_state *ios, void *p)
 {
 	struct page_collect *pcol = p;
 	struct bio_vec *bvec;
  
  
@@ -449,16 +453,12 @@
 	u64 resid;
 	u64  good_bytes;
 	u64  length = 0;
+	int ret = exofs_check_io(ios, &resid);
  
-	int ret = exofs_check_ok_resid(or, NULL, &resid);
-
-	osd_end_request(or);
 	atomic_dec(&pcol->sbi->s_curr_pending);
  
 	if (likely(!ret))
 		good_bytes = pcol->length;
-	else if (!resid)
-		good_bytes = 0;
 	else
 		good_bytes = pcol->length - resid;
  
@@ -482,7 +482,7 @@
  
 		update_write_page(page, page_stat);
 		unlock_page(page);
-		EXOFS_DBGMSG("    writepages_done(0x%lx, 0x%lx) status=%d\n",
+		EXOFS_DBGMSG2("    writepages_done(0x%lx, 0x%lx) status=%d\n",
 			     inode->i_ino, page->index, page_stat);
  
 		length += bvec->bv_len;
  
  
@@ -496,23 +496,13 @@
 static int write_exec(struct page_collect *pcol)
 {
 	struct exofs_i_info *oi = exofs_i(pcol->inode);
-	struct osd_obj_id obj = {pcol->sbi->s_pid,
-					pcol->inode->i_ino + EXOFS_OBJ_OFF};
-	struct osd_request *or = NULL;
+	struct exofs_io_state *ios = pcol->ios;
 	struct page_collect *pcol_copy = NULL;
-	loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
 	int ret;
  
 	if (!pcol->bio)
 		return 0;
  
-	or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
-	if (unlikely(!or)) {
-		EXOFS_ERR("write_exec: Faild to osd_start_request()\n");
-		ret = -ENOMEM;
-		goto err;
-	}
-
 	pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
 	if (!pcol_copy) {
 		EXOFS_ERR("write_exec: Faild to kmalloc(pcol)\n");
  
  
@@ -523,16 +513,22 @@
 	*pcol_copy = *pcol;
  
 	pcol_copy->bio->bi_rw |= (1 << BIO_RW); /* FIXME: bio_set_dir() */
-	osd_req_write(or, &obj, i_start, pcol_copy->bio, pcol_copy->length);
-	ret = exofs_async_op(or, writepages_done, pcol_copy, oi->i_cred);
+
+	ios->bio = pcol_copy->bio;
+	ios->offset = pcol_copy->pg_first << PAGE_CACHE_SHIFT;
+	ios->length = pcol_copy->length;
+	ios->done = writepages_done;
+	ios->private = pcol_copy;
+
+	ret = exofs_oi_write(oi, ios);
 	if (unlikely(ret)) {
-		EXOFS_ERR("write_exec: exofs_async_op() Faild\n");
+		EXOFS_ERR("write_exec: exofs_oi_write() Faild\n");
 		goto err;
 	}
  
 	atomic_inc(&pcol->sbi->s_curr_pending);
 	EXOFS_DBGMSG("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
-		  pcol->inode->i_ino, pcol->pg_first, _LLU(i_start),
+		  pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset),
 		  pcol->length);
 	/* pages ownership was passed to pcol_copy */
 	_pcol_reset(pcol);
  
@@ -540,9 +536,9 @@
  
 err:
 	_unlock_pcol_pages(pcol, ret, WRITE);
+	pcol_free(pcol);
 	kfree(pcol_copy);
-	if (or)
-		osd_end_request(or);
+
 	return ret;
 }
  
@@ -586,6 +582,9 @@
 			if (PageError(page))
 				ClearPageError(page);
 			unlock_page(page);
+			EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) "
+				     "outside the limits\n",
+				     inode->i_ino, page->index);
 			return 0;
 		}
 	}
@@ -600,6 +599,9 @@
 		ret = write_exec(pcol);
 		if (unlikely(ret))
 			goto fail;
+
+		EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) Discontinuity\n",
+			     inode->i_ino, page->index);
 		goto try_again;
 	}
  
@@ -609,7 +611,7 @@
 			goto fail;
 	}
  
-	EXOFS_DBGMSG("    writepage_strip(0x%lx, 0x%lx) len=0x%zx\n",
+	EXOFS_DBGMSG2("    writepage_strip(0x%lx, 0x%lx) len=0x%zx\n",
 		     inode->i_ino, page->index, len);
  
 	ret = pcol_add_page(pcol, page, len);
@@ -634,6 +636,8 @@
 	return 0;
  
 fail:
+	EXOFS_DBGMSG("Error: writepage_strip(0x%lx, 0x%lx)=>%d\n",
+		     inode->i_ino, page->index, ret);
 	set_bit(AS_EIO, &page->mapping->flags);
 	unlock_page(page);
 	return ret;
  
  
@@ -652,14 +656,17 @@
 			wbc->range_end >> PAGE_CACHE_SHIFT;
  
 	if (start || end)
-		expected_pages = min(end - start + 1, 32L);
+		expected_pages = end - start + 1;
 	else
 		expected_pages = mapping->nrpages;
  
-	EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx"
-		     " m->nrpages=%lu start=0x%lx end=0x%lx\n",
+	if (expected_pages < 32L)
+		expected_pages = 32L;
+
+	EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx "
+		     "nrpages=%lu start=0x%lx end=0x%lx expected_pages=%ld\n",
 		     mapping->host->i_ino, wbc->range_start, wbc->range_end,
-		     mapping->nrpages, start, end);
+		     mapping->nrpages, start, end, expected_pages);
  
 	_pcol_init(&pcol, expected_pages, mapping->host);
  
  
  
@@ -771,19 +778,28 @@
 const struct osd_attr g_attr_logical_length = ATTR_DEF(
 	OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
  
+/*
+ * Does the actual truncate work for exofs_truncate(): stamps mtime/ctime,
+ * truncates the page at the inode's current i_size, then asks
+ * exofs_oi_truncate() to set the object to that size.
+ * Returns the result of exofs_oi_truncate().
+ */
+static int _do_truncate(struct inode *inode)
+{
+	struct exofs_i_info *oi = exofs_i(inode);
+	loff_t isize = i_size_read(inode);
+	int ret;
+
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+	/* NOTE(review): the return value of nobh_truncate_page() is ignored */
+	nobh_truncate_page(inode->i_mapping, isize, exofs_get_block);
+
+	ret = exofs_oi_truncate(oi, (u64)isize);
+	EXOFS_DBGMSG("(0x%lx) size=0x%llx\n", inode->i_ino, isize);
+	return ret;
+}
+
 /*
  * Truncate a file to the specified size - all we have to do is set the size
  * attribute.  We make sure the object exists first.
  */
 void exofs_truncate(struct inode *inode)
 {
-	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
 	struct exofs_i_info *oi = exofs_i(inode);
-	struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
-	struct osd_request *or;
-	struct osd_attr attr;
-	loff_t isize = i_size_read(inode);
-	__be64 newsize;
 	int ret;
  
 	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
  
  
@@ -793,31 +809,14 @@
 		return;
 	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
 		return;
-	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
  
-	nobh_truncate_page(inode->i_mapping, isize, exofs_get_block);
-
-	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
-	if (unlikely(!or)) {
-		EXOFS_ERR("ERROR: exofs_truncate: osd_start_request failed\n");
-		goto fail;
-	}
-
-	osd_req_set_attributes(or, &obj);
-
-	newsize = cpu_to_be64((u64)isize);
-	attr = g_attr_logical_length;
-	attr.val_ptr = &newsize;
-	osd_req_add_set_attr_list(or, &attr, 1);
-
 	/* if we are about to truncate an object, and it hasn't been
 	 * created yet, wait
 	 */
 	if (unlikely(wait_obj_created(oi)))
 		goto fail;
  
-	ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
-	osd_end_request(or);
+	ret = _do_truncate(inode);
 	if (ret)
 		goto fail;
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
@@ -847,65 +846,62 @@
  
 /*
  * Read an inode from the OSD, and return it as is.  We also return the size
- * attribute in the 'sanity' argument if we got compiled with debugging turned
- * on.
+ * attribute in the 'obj_size' argument.
  */
 static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
-		    struct exofs_fcb *inode, uint64_t *sanity)
+		    struct exofs_fcb *inode, uint64_t *obj_size)
 {
 	struct exofs_sb_info *sbi = sb->s_fs_info;
-	struct osd_request *or;
-	struct osd_attr attr;
-	struct osd_obj_id obj = {sbi->s_pid,
-				 oi->vfs_inode.i_ino + EXOFS_OBJ_OFF};
+	struct osd_attr attrs[2];
+	struct exofs_io_state *ios;
 	int ret;
  
-	exofs_make_credential(oi->i_cred, &obj);
-
-	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
-	if (unlikely(!or)) {
-		EXOFS_ERR("exofs_get_inode: osd_start_request failed.\n");
-		return -ENOMEM;
+	*obj_size = ~0;
+	ret = exofs_get_io_state(sbi, &ios);
+	if (unlikely(ret)) {
+		EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
+		return ret;
 	}
-	osd_req_get_attributes(or, &obj);
  
-	/* we need the inode attribute */
-	osd_req_add_get_attr_list(or, &g_attr_inode_data, 1);
+	ios->obj.id = exofs_oi_objno(oi);
+	exofs_make_credential(oi->i_cred, &ios->obj);
+	ios->cred = oi->i_cred;
  
-#ifdef EXOFS_DEBUG_OBJ_ISIZE
-	/* we get the size attributes to do a sanity check */
-	osd_req_add_get_attr_list(or, &g_attr_logical_length, 1);
-#endif
+	attrs[0] = g_attr_inode_data;
+	attrs[1] = g_attr_logical_length;
+	ios->in_attr = attrs;
+	ios->in_attr_len = ARRAY_SIZE(attrs);
  
-	ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
+	ret = exofs_sbi_read(ios);
 	if (ret)
 		goto out;
  
-	attr = g_attr_inode_data;
-	ret = extract_attr_from_req(or, &attr);
+	ret = extract_attr_from_ios(ios, &attrs[0]);
 	if (ret) {
-		EXOFS_ERR("exofs_get_inode: extract_attr_from_req failed\n");
+		EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__);
 		goto out;
 	}
+	WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE);
+	memcpy(inode, attrs[0].val_ptr, EXOFS_INO_ATTR_SIZE);
  
-	WARN_ON(attr.len != EXOFS_INO_ATTR_SIZE);
-	memcpy(inode, attr.val_ptr, EXOFS_INO_ATTR_SIZE);
-
-#ifdef EXOFS_DEBUG_OBJ_ISIZE
-	attr = g_attr_logical_length;
-	ret = extract_attr_from_req(or, &attr);
+	ret = extract_attr_from_ios(ios, &attrs[1]);
 	if (ret) {
-		EXOFS_ERR("ERROR: extract attr from or failed\n");
+		EXOFS_ERR("%s: extract_attr of logical_length failed\n",
+			  __func__);
 		goto out;
 	}
-	*sanity = get_unaligned_be64(attr.val_ptr);
-#endif
+	*obj_size = get_unaligned_be64(attrs[1].val_ptr);
  
 out:
-	osd_end_request(or);
+	exofs_put_io_state(ios);
 	return ret;
 }
  
+/* Common exofs_i_info setup: initialize the wait queue and clear i_flags. */
+static void __oi_init(struct exofs_i_info *oi)
+{
+	init_waitqueue_head(&oi->i_wq);
+	oi->i_flags = 0;
+}
 /*
  * Fill in an inode read from the OSD and set it up for use
  */
@@ -914,7 +910,7 @@
 	struct exofs_i_info *oi;
 	struct exofs_fcb fcb;
 	struct inode *inode;
-	uint64_t uninitialized_var(sanity);
+	uint64_t obj_size;
 	int ret;
  
 	inode = iget_locked(sb, ino);
  
  
@@ -923,13 +919,13 @@
 	if (!(inode->i_state & I_NEW))
 		return inode;
 	oi = exofs_i(inode);
+	__oi_init(oi);
  
 	/* read the inode from the osd */
-	ret = exofs_get_inode(sb, oi, &fcb, &sanity);
+	ret = exofs_get_inode(sb, oi, &fcb, &obj_size);
 	if (ret)
 		goto bad_inode;
  
-	init_waitqueue_head(&oi->i_wq);
 	set_obj_created(oi);
  
 	/* copy stuff from on-disk struct to in-memory struct */
  
  
@@ -947,14 +943,12 @@
 	inode->i_blkbits = EXOFS_BLKSHIFT;
 	inode->i_generation = le32_to_cpu(fcb.i_generation);
  
-#ifdef EXOFS_DEBUG_OBJ_ISIZE
-	if ((inode->i_size != sanity) &&
+	if ((inode->i_size != obj_size) &&
 		(!exofs_inode_is_fast_symlink(inode))) {
-		EXOFS_ERR("WARNING: Size of object from inode and "
-			  "attributes differ (%lld != %llu)\n",
-			  inode->i_size, _LLU(sanity));
+		EXOFS_ERR("WARNING: Size of inode=%llu != object=%llu\n",
+			  inode->i_size, _LLU(obj_size));
+		/* FIXME: call exofs_inode_recovery() */
 	}
-#endif
  
 	oi->i_dir_start_lookup = 0;
  
  
  
  
@@ -1020,24 +1014,31 @@
  * set the obj_created flag so that other methods know that the object exists on
  * the OSD.
  */
-static void create_done(struct osd_request *or, void *p)
+static void create_done(struct exofs_io_state *ios, void *p)
 {
 	struct inode *inode = p;
 	struct exofs_i_info *oi = exofs_i(inode);
 	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
 	int ret;
  
-	ret = exofs_check_ok(or);
-	osd_end_request(or);
+	ret = exofs_check_io(ios, NULL);
+	exofs_put_io_state(ios);
+
 	atomic_dec(&sbi->s_curr_pending);
  
 	if (unlikely(ret)) {
 		EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx",
-			  _LLU(sbi->s_pid), _LLU(inode->i_ino + EXOFS_OBJ_OFF));
-		make_bad_inode(inode);
-	} else
-		set_obj_created(oi);
+			  _LLU(exofs_oi_objno(oi)), _LLU(sbi->s_pid));
+		/* TODO: When the FS is corrupted, creation can fail because the
+		 * object already exists. Get rid of this asynchronous creation;
+		 * if the object exists, increment the obj counter and try the
+		 * next object until we succeed. All these dangling objects will
+		 * be made into lost files by chkfs.exofs
+		 */
+	}
  
+	set_obj_created(oi);
+
 	atomic_dec(&inode->i_count);
 	wake_up(&oi->i_wq);
 }
@@ -1051,8 +1052,7 @@
 	struct inode *inode;
 	struct exofs_i_info *oi;
 	struct exofs_sb_info *sbi;
-	struct osd_request *or;
-	struct osd_obj_id obj;
+	struct exofs_io_state *ios;
 	int ret;
  
 	sb = dir->i_sb;
  
@@ -1061,8 +1061,8 @@
 		return ERR_PTR(-ENOMEM);
  
 	oi = exofs_i(inode);
+	__oi_init(oi);
  
-	init_waitqueue_head(&oi->i_wq);
 	set_obj_2bcreated(oi);
  
 	sbi = sb->s_fs_info;
  
  
  
@@ -1089,28 +1089,28 @@
  
 	mark_inode_dirty(inode);
  
-	obj.partition = sbi->s_pid;
-	obj.id = inode->i_ino + EXOFS_OBJ_OFF;
-	exofs_make_credential(oi->i_cred, &obj);
-
-	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
-	if (unlikely(!or)) {
-		EXOFS_ERR("exofs_new_inode: osd_start_request failed\n");
-		return ERR_PTR(-ENOMEM);
+	ret = exofs_get_io_state(sbi, &ios);
+	if (unlikely(ret)) {
+		EXOFS_ERR("exofs_new_inode: exofs_get_io_state failed\n");
+		return ERR_PTR(ret);
 	}
  
-	osd_req_create_object(or, &obj);
+	ios->obj.id = exofs_oi_objno(oi);
+	exofs_make_credential(oi->i_cred, &ios->obj);
  
 	/* increment the refcount so that the inode will still be around when we
 	 * reach the callback
 	 */
 	atomic_inc(&inode->i_count);
  
-	ret = exofs_async_op(or, create_done, inode, oi->i_cred);
+	ios->done = create_done;
+	ios->private = inode;
+	ios->cred = oi->i_cred;
+	ret = exofs_sbi_create(ios);
 	if (ret) {
 		atomic_dec(&inode->i_count);
-		osd_end_request(or);
-		return ERR_PTR(-EIO);
+		exofs_put_io_state(ios);
+		return ERR_PTR(ret);
 	}
 	atomic_inc(&sbi->s_curr_pending);
  
  
@@ -1128,11 +1128,11 @@
 /*
  * Callback function from exofs_update_inode().
  */
-static void updatei_done(struct osd_request *or, void *p)
+static void updatei_done(struct exofs_io_state *ios, void *p)
 {
 	struct updatei_args *args = p;
  
-	osd_end_request(or);
+	exofs_put_io_state(ios);
  
 	atomic_dec(&args->sbi->s_curr_pending);
  
@@ -1148,8 +1148,7 @@
 	struct exofs_i_info *oi = exofs_i(inode);
 	struct super_block *sb = inode->i_sb;
 	struct exofs_sb_info *sbi = sb->s_fs_info;
-	struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
-	struct osd_request *or;
+	struct exofs_io_state *ios;
 	struct osd_attr attr;
 	struct exofs_fcb *fcb;
 	struct updatei_args *args;
  
  
@@ -1186,18 +1185,16 @@
 	} else
 		memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
  
-	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
-	if (unlikely(!or)) {
-		EXOFS_ERR("exofs_update_inode: osd_start_request failed.\n");
-		ret = -ENOMEM;
+	ret = exofs_get_io_state(sbi, &ios);
+	if (unlikely(ret)) {
+		EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
 		goto free_args;
 	}
  
-	osd_req_set_attributes(or, &obj);
-
 	attr = g_attr_inode_data;
 	attr.val_ptr = fcb;
-	osd_req_add_set_attr_list(or, &attr, 1);
+	ios->out_attr_len = 1;
+	ios->out_attr = &attr;
  
 	if (!obj_created(oi)) {
 		EXOFS_DBGMSG("!obj_created\n");
  
  
  
@@ -1206,22 +1203,19 @@
 		EXOFS_DBGMSG("wait_event done\n");
 	}
  
-	if (do_sync) {
-		ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
-		osd_end_request(or);
-		goto free_args;
-	} else {
+	if (!do_sync) {
 		args->sbi = sbi;
+		ios->done = updatei_done;
+		ios->private = args;
+	}
  
-		ret = exofs_async_op(or, updatei_done, args, oi->i_cred);
-		if (ret) {
-			osd_end_request(or);
-			goto free_args;
-		}
+	ret = exofs_oi_write(oi, ios);
+	if (!do_sync && !ret) {
 		atomic_inc(&sbi->s_curr_pending);
 		goto out; /* deallocation in updatei_done */
 	}
  
+	exofs_put_io_state(ios);
 free_args:
 	kfree(args);
 out:
  
@@ -1238,11 +1232,12 @@
  * Callback function from exofs_delete_inode() - don't have much cleaning up to
  * do.
  */
-static void delete_done(struct osd_request *or, void *p)
+static void delete_done(struct exofs_io_state *ios, void *p)
 {
-	struct exofs_sb_info *sbi;
-	osd_end_request(or);
-	sbi = p;
+	struct exofs_sb_info *sbi = p;
+
+	exofs_put_io_state(ios);
+
 	atomic_dec(&sbi->s_curr_pending);
 }
  
@@ -1256,8 +1251,7 @@
 	struct exofs_i_info *oi = exofs_i(inode);
 	struct super_block *sb = inode->i_sb;
 	struct exofs_sb_info *sbi = sb->s_fs_info;
-	struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
-	struct osd_request *or;
+	struct exofs_io_state *ios;
 	int ret;
  
 	truncate_inode_pages(&inode->i_data, 0);
  
  
  
@@ -1274,25 +1268,26 @@
  
 	clear_inode(inode);
  
-	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
-	if (unlikely(!or)) {
-		EXOFS_ERR("exofs_delete_inode: osd_start_request failed\n");
+	ret = exofs_get_io_state(sbi, &ios);
+	if (unlikely(ret)) {
+		EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__);
 		return;
 	}
  
-	osd_req_remove_object(or, &obj);
-
 	/* if we are deleting an obj that hasn't been created yet, wait */
 	if (!obj_created(oi)) {
 		BUG_ON(!obj_2bcreated(oi));
 		wait_event(oi->i_wq, obj_created(oi));
 	}
  
-	ret = exofs_async_op(or, delete_done, sbi, oi->i_cred);
+	ios->obj.id = exofs_oi_objno(oi);
+	ios->done = delete_done;
+	ios->private = sbi;
+	ios->cred = oi->i_cred;
+	ret = exofs_sbi_remove(ios);
 	if (ret) {
-		EXOFS_ERR(
-		       "ERROR: @exofs_delete_inode exofs_async_op failed\n");
-		osd_end_request(or);
+		EXOFS_ERR("%s: exofs_sbi_remove failed\n", __func__);
+		exofs_put_io_state(ios);
 		return;
 	}
 	atomic_inc(&sbi->s_curr_pending);
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com)
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include <scsi/scsi_device.h>
+
+#include "exofs.h"
+
+void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
+{	/* Fill @cred_a for @obj; the (false, true) flags are passed verbatim */
+	osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
+}
+
+/*
+ * Synchronously read @length bytes, starting at @offset of object @obj on
+ * device @od, into the kernel buffer @p. @cred is handed to
+ * osd_finalize_request().
+ *
+ * Returns 0 on success, -ENOMEM when no request could be started, otherwise
+ * the error of the osd call that failed. The request is always released
+ * before returning.
+ */
+int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
+		    u64 offset, void *p, unsigned length)
+{
+	struct osd_request *req;
+	int err;
+
+	req = osd_start_request(od, GFP_KERNEL);
+	if (unlikely(!req)) {
+		EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__);
+		return -ENOMEM;
+	}
+
+	err = osd_req_read_kern(req, obj, offset, p, length);
+	if (unlikely(err)) {
+		EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__);
+		goto end_request;
+	}
+
+	err = osd_finalize_request(req, 0, cred, NULL);
+	if (unlikely(err)) {
+		EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", err);
+		goto end_request;
+	}
+
+	err = osd_execute_request(req);
+	if (unlikely(err))
+		EXOFS_DBGMSG("osd_execute_request() => %d\n", err);
+
+end_request:
+	osd_end_request(req);
+	return err;
+}
+
+/*
+ * Allocate a zeroed io_state sized for sbi->s_numdevs devices and preset its
+ * ->sbi and ->obj.partition members.
+ *
+ * On success stores the new state in *@pios and returns 0. On allocation
+ * failure stores NULL in *@pios and returns -ENOMEM.
+ */
+int exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** pios)
+{
+	struct exofs_io_state *new_ios;
+
+	/* TODO: Maybe use a kmem_cache per sbi, with objects of size
+	 * exofs_io_state_size(sbi->s_numdevs)
+	 */
+	new_ios = kzalloc(exofs_io_state_size(sbi->s_numdevs), GFP_KERNEL);
+	*pios = new_ios;
+	if (unlikely(!new_ios))
+		return -ENOMEM;
+
+	new_ios->obj.partition = sbi->s_pid;
+	new_ios->sbi = sbi;
+	return 0;
+}
+
+/*
+ * Release everything hanging off @ios: for each of the ios->numdevs per-device
+ * slots end the osd request and drop the bio if they are set, then free @ios
+ * itself. A NULL @ios is accepted and ignored.
+ */
+void exofs_put_io_state(struct exofs_io_state *ios)
+{
+	struct exofs_per_dev_state *slot;
+	unsigned dev;
+
+	if (!ios)
+		return;
+
+	for (dev = 0; dev < ios->numdevs; dev++) {
+		slot = &ios->per_dev[dev];
+
+		if (slot->or)
+			osd_end_request(slot->or);
+		if (slot->bio)
+			bio_put(slot->bio);
+	}
+
+	kfree(ios);
+}
+
+/*
+ * ->done handler installed by exofs_io_execute() for synchronous calls:
+ * @p is the on-stack completion the submitter is waiting on.
+ */
+static void _sync_done(struct exofs_io_state *ios, void *p)
+{
+	struct completion *done = p;
+
+	complete(done);
+}
+
+/*
+ * kref release function: runs when the last reference on ios->kref is dropped
+ * and invokes the ->done callback with its ->private argument.
+ */
+static void _last_io(struct kref *kref)
+{
+	struct exofs_io_state *ios;
+
+	ios = container_of(kref, struct exofs_io_state, kref);
+	ios->done(ios, ios->private);
+}
+
+/*
+ * Per-request completion callback passed to osd_execute_request_async():
+ * drops the reference that was taken for this request on the io_state in @p.
+ */
+static void _done_io(struct osd_request *or, void *p)
+{
+	struct exofs_io_state *owner = p;
+
+	kref_put(&owner->kref, _last_io);
+}
+
+/*
+ * Finalize and submit every prepared per-device request of @ios.
+ *
+ * If ios->done is NULL the call is synchronous: an on-stack completion is
+ * installed as the done handler, the function waits for it and returns the
+ * result of exofs_check_io(). Otherwise it returns 0 right after submission
+ * and ios->done is called once the last request completes.
+ *
+ * A failing osd_finalize_request() aborts before anything is submitted and
+ * its error is returned.
+ */
+static int exofs_io_execute(struct exofs_io_state *ios)
+{
+	DECLARE_COMPLETION_ONSTACK(wait);
+	const bool sync = !ios->done;
+	struct osd_request *or;
+	int dev, ret;
+
+	if (sync) {
+		ios->done = _sync_done;
+		ios->private = &wait;
+	}
+
+	/* Pass 1: finalize all requests so that a failure submits nothing */
+	for (dev = 0; dev < ios->numdevs; dev++) {
+		or = ios->per_dev[dev].or;
+		if (unlikely(!or))
+			continue;
+
+		ret = osd_finalize_request(or, 0, ios->cred, NULL);
+		if (unlikely(ret)) {
+			EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n",
+				     ret);
+			return ret;
+		}
+	}
+
+	/* The initial reference is held by this function until all requests
+	 * are submitted, so ->done cannot run before the loop below finishes.
+	 */
+	kref_init(&ios->kref);
+
+	/* Pass 2: one extra reference per submitted request.
+	 * NOTE(review): the return value of osd_execute_request_async() is
+	 * not checked here - confirm a failed submission cannot strand a ref.
+	 */
+	for (dev = 0; dev < ios->numdevs; dev++) {
+		or = ios->per_dev[dev].or;
+		if (likely(or)) {
+			kref_get(&ios->kref);
+			osd_execute_request_async(or, _done_io, ios);
+		}
+	}
+
+	kref_put(&ios->kref, _last_io);
+
+	if (!sync)
+		return 0;
+
+	wait_for_completion(&wait);
+	return exofs_check_io(ios, NULL);
+}
+
+/*
+ * Decode the sense information of every per-device request of @ios and
+ * return the error of the request with the highest osd_err_pri (the later
+ * device wins on a tie), or 0 when all requests succeeded.
+ *
+ * If @resid is not NULL it is set to 0 on success and to ios->length on any
+ * error.
+ */
+int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
+{
+	enum osd_err_priority worst_pri = 0;
+	int worst_err = 0;
+	int dev;
+
+	for (dev = 0; dev < ios->numdevs; dev++) {
+		struct osd_sense_info osi;
+		int err;
+
+		err = osd_req_decode_sense(ios->per_dev[dev].or, &osi);
+		if (likely(!err))
+			continue;
+
+		if (unlikely(err == -EFAULT)) {
+			EXOFS_DBGMSG("%s: EFAULT Need page clear\n", __func__);
+			/*FIXME: All the pages in this device range should:
+			 *	clear_highpage(page);
+			 */
+		}
+
+		if (osi.osd_err_pri >= worst_pri) {
+			worst_pri = osi.osd_err_pri;
+			worst_err = err;
+		}
+	}
+
+	/* TODO: raid specific residual calculations */
+	if (resid)
+		*resid = likely(!worst_err) ? 0 : ios->length;
+
+	return worst_err;
+}
+
+/*
+ * Queue a create-object request for ios->obj on every device of ios->sbi and
+ * execute them through exofs_io_execute().
+ *
+ * Returns -ENOMEM if a request cannot be started (requests already attached
+ * to @ios stay attached for exofs_put_io_state()), otherwise the result of
+ * exofs_io_execute().
+ */
+int exofs_sbi_create(struct exofs_io_state *ios)
+{
+	struct osd_request *or;
+	int dev;
+
+	for (dev = 0; dev < ios->sbi->s_numdevs; dev++) {
+		or = osd_start_request(ios->sbi->s_ods[dev], GFP_KERNEL);
+		if (unlikely(!or)) {
+			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
+			return -ENOMEM;
+		}
+
+		ios->per_dev[dev].or = or;
+		ios->numdevs++;
+
+		osd_req_create_object(or, &ios->obj);
+	}
+
+	return exofs_io_execute(ios);
+}
+
+/*
+ * Queue a remove-object request for ios->obj on every device of ios->sbi and
+ * execute them through exofs_io_execute().
+ *
+ * Returns -ENOMEM if a request cannot be started (requests already attached
+ * to @ios stay attached for exofs_put_io_state()), otherwise the result of
+ * exofs_io_execute().
+ */
+int exofs_sbi_remove(struct exofs_io_state *ios)
+{
+	struct osd_request *or;
+	int dev;
+
+	for (dev = 0; dev < ios->sbi->s_numdevs; dev++) {
+		or = osd_start_request(ios->sbi->s_ods[dev], GFP_KERNEL);
+		if (unlikely(!or)) {
+			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
+			return -ENOMEM;
+		}
+
+		ios->per_dev[dev].or = or;
+		ios->numdevs++;
+
+		osd_req_remove_object(or, &ios->obj);
+	}
+
+	return exofs_io_execute(ios);
+}
+
+/*
+ * Issue the write described by @ios to every device of ios->sbi.
+ *
+ * The payload is chosen in this order: ios->bio (device 0 uses it directly,
+ * every further device gets its own clone stored in per_dev[].bio), else
+ * ios->kern_buff, else a bare set-attributes command. ios->out_attr and
+ * ios->in_attr, when set, are added to every request.
+ *
+ * Returns -ENOMEM if a request or bio clone cannot be allocated, otherwise
+ * the result of exofs_io_execute(). Partially built state is left attached
+ * to @ios for exofs_put_io_state().
+ */
+int exofs_sbi_write(struct exofs_io_state *ios)
+{
+	int dev;
+
+	for (dev = 0; dev < ios->sbi->s_numdevs; dev++) {
+		struct exofs_per_dev_state *per_dev = &ios->per_dev[dev];
+		struct osd_request *or;
+
+		or = osd_start_request(ios->sbi->s_ods[dev], GFP_KERNEL);
+		if (unlikely(!or)) {
+			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
+			return -ENOMEM;
+		}
+		per_dev->or = or;
+		ios->numdevs++;
+
+		if (ios->bio) {
+			struct bio *bio = ios->bio;
+
+			if (dev != 0) {
+				/* each extra device needs a private clone */
+				bio = bio_kmalloc(GFP_KERNEL,
+						  ios->bio->bi_max_vecs);
+				if (unlikely(!bio))
+					return -ENOMEM;
+
+				__bio_clone(bio, ios->bio);
+				bio->bi_bdev = NULL;
+				bio->bi_next = NULL;
+				per_dev->bio = bio;
+			}
+
+			osd_req_write(or, &ios->obj, ios->offset, bio,
+				      ios->length);
+		} else if (ios->kern_buff) {
+			osd_req_write_kern(or, &ios->obj, ios->offset,
+					   ios->kern_buff, ios->length);
+		} else {
+			osd_req_set_attributes(or, &ios->obj);
+		}
+
+		if (ios->out_attr)
+			osd_req_add_set_attr_list(or, ios->out_attr,
+						  ios->out_attr_len);
+
+		if (ios->in_attr)
+			osd_req_add_get_attr_list(or, ios->in_attr,
+						  ios->in_attr_len);
+	}
+
+	return exofs_io_execute(ios);
+}
+
+/*
+ * Issue the read described by @ios to a single device, selected as
+ * (unsigned)ios->obj.id modulo the number of devices. The request is stored
+ * in per_dev[0].
+ *
+ * The command is chosen in this order: ios->bio, else ios->kern_buff, else a
+ * bare get-attributes command. ios->out_attr and ios->in_attr, when set, are
+ * added to the request.
+ *
+ * Returns -ENOMEM if the request cannot be started, otherwise the result of
+ * exofs_io_execute().
+ */
+int exofs_sbi_read(struct exofs_io_state *ios)
+{
+	unsigned first_dev = (unsigned)ios->obj.id;
+	struct osd_request *or;
+
+	first_dev %= ios->sbi->s_numdevs;
+	or = osd_start_request(ios->sbi->s_ods[first_dev], GFP_KERNEL);
+	if (unlikely(!or)) {
+		EXOFS_ERR("%s: osd_start_request failed\n", __func__);
+		return -ENOMEM;
+	}
+	ios->per_dev[0].or = or;
+	ios->numdevs++;
+
+	if (ios->bio)
+		osd_req_read(or, &ios->obj, ios->offset, ios->bio,
+			     ios->length);
+	else if (ios->kern_buff)
+		osd_req_read_kern(or, &ios->obj, ios->offset,
+				  ios->kern_buff, ios->length);
+	else
+		osd_req_get_attributes(or, &ios->obj);
+
+	if (ios->out_attr)
+		osd_req_add_set_attr_list(or, ios->out_attr,
+					  ios->out_attr_len);
+
+	if (ios->in_attr)
+		osd_req_add_get_attr_list(or, ios->in_attr,
+					  ios->in_attr_len);
+
+	return exofs_io_execute(ios);
+}
+
+/*
+ * Walk the attributes returned for the first request of @ios
+ * (per_dev[0].or) and look for the one matching attr->attr_page and
+ * attr->attr_id.
+ *
+ * On a match, attr->len and attr->val_ptr are filled in and 0 is returned.
+ * Returns -EIO when the list is exhausted without a match.
+ */
+int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr)
+{
+	struct osd_attr found = {.attr_page = 0}; /* start with zeros */
+	void *cursor = NULL;
+	int count;
+
+	for (;;) {
+		count = 1;	/* decode one attribute per call */
+		osd_req_decode_get_attr_list(ios->per_dev[0].or,
+					     &found, &count, &cursor);
+
+		if ((found.attr_page == attr->attr_page) &&
+		    (found.attr_id == attr->attr_id)) {
+			attr->len = found.len;
+			attr->val_ptr = found.val_ptr;
+			return 0;
+		}
+
+		if (!cursor)
+			break;
+	}
+
+	return -EIO;
+}
+
+/*
+ * Set the logical-length attribute of the object behind @oi to @size on
+ * every device, synchronously (ios->done is left NULL).
+ *
+ * Returns -ENOMEM if the io_state or a request cannot be allocated,
+ * otherwise the result of exofs_io_execute(). The io_state is always
+ * released before returning.
+ */
+int exofs_oi_truncate(struct exofs_i_info *oi, u64 size)
+{
+	struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info;
+	struct exofs_io_state *ios;
+	struct osd_attr len_attr;
+	__be64 be_size = cpu_to_be64(size);
+	int dev, ret;
+
+	if (exofs_get_io_state(sbi, &ios))
+		return -ENOMEM;
+
+	ios->cred = oi->i_cred;
+	ios->obj.id = exofs_oi_objno(oi);
+
+	len_attr = g_attr_logical_length;
+	len_attr.val_ptr = &be_size;
+
+	for (dev = 0; dev < sbi->s_numdevs; dev++) {
+		struct osd_request *or;
+
+		or = osd_start_request(sbi->s_ods[dev], GFP_KERNEL);
+		if (unlikely(!or)) {
+			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
+			ret = -ENOMEM;
+			goto put_ios;
+		}
+		ios->per_dev[dev].or = or;
+		ios->numdevs++;
+
+		osd_req_set_attributes(or, &ios->obj);
+		osd_req_add_set_attr_list(or, &len_attr, 1);
+	}
+	ret = exofs_io_execute(ios);
+
+put_ios:
+	exofs_put_io_state(ios);
+	return ret;
+}
-/*
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <bharrosh@panasas.com>
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation.  Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-#include <scsi/scsi_device.h>
-#include <scsi/osd_sense.h>
-
-#include "exofs.h"
-
-int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid)
-{
-	struct osd_sense_info osi;
-	int ret = osd_req_decode_sense(or, &osi);
-
-	if (ret) { /* translate to Linux codes */
-		if (osi.additional_code == scsi_invalid_field_in_cdb) {
-			if (osi.cdb_field_offset == OSD_CFO_STARTING_BYTE)
-				ret = -EFAULT;
-			if (osi.cdb_field_offset == OSD_CFO_OBJECT_ID)
-				ret = -ENOENT;
-			else
-				ret = -EINVAL;
-		} else if (osi.additional_code == osd_quota_error)
-			ret = -ENOSPC;
-		else
-			ret = -EIO;
-	}
-
-	/* FIXME: should be include in osd_sense_info */
-	if (in_resid)
-		*in_resid = or->in.req ? or->in.req->resid_len : 0;
-
-	if (out_resid)
-		*out_resid = or->out.req ? or->out.req->resid_len : 0;
-
-	return ret;
-}
-
-void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
-{
-	osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
-}
-
-/*
- * Perform a synchronous OSD operation.
- */
-int exofs_sync_op(struct osd_request *or, int timeout, uint8_t *credential)
-{
-	int ret;
-
-	or->timeout = timeout;
-	ret = osd_finalize_request(or, 0, credential, NULL);
-	if (ret) {
-		EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
-		return ret;
-	}
-
-	ret = osd_execute_request(or);
-
-	if (ret)
-		EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
-	/* osd_req_decode_sense(or, ret); */
-	return ret;
-}
-
-/*
- * Perform an asynchronous OSD operation.
- */
-int exofs_async_op(struct osd_request *or, osd_req_done_fn *async_done,
-		   void *caller_context, u8 *cred)
-{
-	int ret;
-
-	ret = osd_finalize_request(or, 0, cred, NULL);
-	if (ret) {
-		EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
-		return ret;
-	}
-
-	ret = osd_execute_request_async(or, async_done, caller_context);
-
-	if (ret)
-		EXOFS_DBGMSG("osd_execute_request_async() => %d\n", ret);
-	return ret;
-}
-
-int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
-{
-	struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
-	void *iter = NULL;
-	int nelem;
-
-	do {
-		nelem = 1;
-		osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter);
-		if ((cur_attr.attr_page == attr->attr_page) &&
-		    (cur_attr.attr_id == attr->attr_id)) {
-			attr->len = cur_attr.len;
-			attr->val_ptr = cur_attr.val_ptr;
-			return 0;
-		}
-	} while (iter);
-
-	return -EIO;
-}
+/*
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU General Public License  version 2 as published by the Free
+ * Software Foundation.
+ *
+ */
+
+/* FIXME: Remove this file once pnfs hits mainline */
+
+#ifndef __EXOFS_PNFS_H__
+#define __EXOFS_PNFS_H__
+
+#if defined(CONFIG_PNFS)
+
+
+/* FIXME: move this file to: linux/exportfs/pnfs_osd_xdr.h */
+#include "../nfs/objlayout/pnfs_osd_xdr.h"
+
+#else /* defined(CONFIG_PNFS) */
+
+enum pnfs_iomode {
+	IOMODE_READ = 1,
+	IOMODE_RW = 2,
+	IOMODE_ANY = 3,
+};
+
+/* Layout Structure */
+enum pnfs_osd_raid_algorithm4 {
+	PNFS_OSD_RAID_0		= 1,
+	PNFS_OSD_RAID_4		= 2,
+	PNFS_OSD_RAID_5		= 3,
+	PNFS_OSD_RAID_PQ	= 4     /* Reed-Solomon P+Q */
+};
+
+struct pnfs_osd_data_map {
+	u32	odm_num_comps;		/* exofs: from on-disk cb_num_comps */
+	u64	odm_stripe_unit;	/* exofs: from on-disk cb_stripe_unit */
+	u32	odm_group_width;	/* exofs: from on-disk cb_group_width */
+	u32	odm_group_depth;	/* exofs: from on-disk cb_group_depth */
+	u32	odm_mirror_cnt;		/* exofs: from on-disk cb_mirror_cnt */
+	u32	odm_raid_algorithm;	/* an enum pnfs_osd_raid_algorithm4 */
+};
+
+#endif /* else defined(CONFIG_PNFS) */
+
+#endif /* __EXOFS_PNFS_H__ */
@@ -203,49 +203,45 @@
 {
 	struct exofs_sb_info *sbi;
 	struct exofs_fscb *fscb;
-	struct osd_request *or;
-	struct osd_obj_id obj;
+	struct exofs_io_state *ios;
 	int ret = -ENOMEM;
  
-	fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL);
-	if (!fscb) {
-		EXOFS_ERR("exofs_write_super: memory allocation failed.\n");
-		return -ENOMEM;
-	}
-
 	lock_super(sb);
 	sbi = sb->s_fs_info;
+	fscb = &sbi->s_fscb;
+
+	ret = exofs_get_io_state(sbi, &ios);
+	if (ret)
+		goto out;
+
+	/* Note: We only write the changing part of the fscb, i.e. up to the
+	 *       fscb->s_dev_table_oid member. There is no read-modify-write
+	 *       here.
+	 */
+	ios->length = offsetof(struct exofs_fscb, s_dev_table_oid);
+	memset(fscb, 0, ios->length);
 	fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
 	fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles);
 	fscb->s_magic = cpu_to_le16(sb->s_magic);
 	fscb->s_newfs = 0;
+	fscb->s_version = EXOFS_FSCB_VER;
  
-	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
-	if (unlikely(!or)) {
-		EXOFS_ERR("exofs_write_super: osd_start_request failed.\n");
-		goto out;
-	}
+	ios->obj.id = EXOFS_SUPER_ID;
+	ios->offset = 0;
+	ios->kern_buff = fscb;
+	ios->cred = sbi->s_cred;
  
-	obj.partition = sbi->s_pid;
-	obj.id = EXOFS_SUPER_ID;
-	ret = osd_req_write_kern(or, &obj, 0, fscb, sizeof(*fscb));
+	ret = exofs_sbi_write(ios);
 	if (unlikely(ret)) {
-		EXOFS_ERR("exofs_write_super: osd_req_write_kern failed.\n");
+		EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__);
 		goto out;
 	}
-
-	ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
-	if (unlikely(ret)) {
-		EXOFS_ERR("exofs_write_super: exofs_sync_op failed.\n");
-		goto out;
-	}
 	sb->s_dirt = 0;
  
 out:
-	if (or)
-		osd_end_request(or);
+	EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret);
+	exofs_put_io_state(ios);
 	unlock_super(sb);
-	kfree(fscb);
 	return ret;
 }
  
@@ -257,6 +253,29 @@
 		sb->s_dirt = 0;
 }
  
+/*
+ * Log one KERN_NOTICE line made of @msg, the optional @dev_path (NULL prints
+ * as an empty string), the osd name reported for @od and the partition @pid.
+ */
+static void _exofs_print_device(const char *msg, const char *dev_path,
+				struct osd_dev *od, u64 pid)
+{
+	const struct osd_dev_info *info = osduld_device_info(od);
+	const char *path = dev_path ? dev_path : "";
+
+	printk(KERN_NOTICE "exofs: %s %s osd_name-%s pid-0x%llx\n",
+		msg, path, info->osdname, _LLU(pid));
+}
+
+/*
+ * Drop the device references held in sbi->s_ods[], from the last slot down
+ * to the first (NULL slots are skipped), then free @sbi itself.
+ * sbi->s_numdevs is counted down to zero in the process.
+ */
+void exofs_free_sbi(struct exofs_sb_info *sbi)
+{
+	while (sbi->s_numdevs) {
+		struct osd_dev *od;
+		int last;
+
+		last = --sbi->s_numdevs;
+		od = sbi->s_ods[last];
+		if (!od)
+			continue;
+
+		/* clear the slot before the reference goes away */
+		sbi->s_ods[last] = NULL;
+		osduld_put_device(od);
+	}
+
+	kfree(sbi);
+}
+
 /*
  * This function is called when the vfs is freeing the superblock.  We just
  * need to free our own part.
  
@@ -279,11 +298,182 @@
 				  msecs_to_jiffies(100));
 	}
  
-	osduld_put_device(sbi->s_dev);
-	kfree(sb->s_fs_info);
+	_exofs_print_device("Unmounting", NULL, sbi->s_ods[0], sbi->s_pid);
+
+	exofs_free_sbi(sbi);
 	sb->s_fs_info = NULL;
 }
  
+/*
+ * Copy the little-endian data map of the device table @dt into
+ * sbi->data_map (CPU byte order) and check it against the only layout
+ * currently supported: PNFS_OSD_RAID_0 with a stripe unit of EXOFS_BLKSIZE,
+ * @numdevs components and @numdevs - 1 mirrors.
+ *
+ * Returns 0 when the layout matches, -EINVAL otherwise. sbi->data_map is
+ * filled in either way.
+ */
+static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
+				    struct exofs_device_table *dt)
+{
+	struct pnfs_osd_data_map *map = &sbi->data_map;
+
+	map->odm_num_comps = le32_to_cpu(dt->dt_data_map.cb_num_comps);
+	map->odm_stripe_unit = le64_to_cpu(dt->dt_data_map.cb_stripe_unit);
+	map->odm_group_width = le32_to_cpu(dt->dt_data_map.cb_group_width);
+	map->odm_group_depth = le32_to_cpu(dt->dt_data_map.cb_group_depth);
+	map->odm_mirror_cnt = le32_to_cpu(dt->dt_data_map.cb_mirror_cnt);
+	map->odm_raid_algorithm =
+				le32_to_cpu(dt->dt_data_map.cb_raid_algorithm);
+
+	/* FIXME: Hard coded mirror only for now. if not so do not mount */
+	if (map->odm_num_comps != numdevs)
+		return -EINVAL;
+	if (map->odm_stripe_unit != EXOFS_BLKSIZE)
+		return -EINVAL;
+	if (map->odm_raid_algorithm != PNFS_OSD_RAID_0)
+		return -EINVAL;
+	if (map->odm_mirror_cnt != (numdevs - 1))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Translate one on-disk device-table entry @dt_dev into @odi.
+ *
+ * @odi->osdname points into @dt_dev, so @odi is valid only as long as
+ * @dt_dev is valid. The osdname stored in @dt_dev is null-terminated in
+ * place (and truncated if it fills the whole field).
+ *
+ * Returns 0 on success, -EINVAL when the entry carries a systemid_len larger
+ * than the destination buffer or uses the unsupported long-name format, and
+ * a positive value when both lengths are zero (an all-zeros entry, which
+ * means we read past the end of the object).
+ */
+static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
+			     struct osd_dev_info *odi)
+{
+	odi->systemid_len = le32_to_cpu(dt_dev->systemid_len);
+
+	/* The length comes from disk; never let it overrun odi->systemid.
+	 * NOTE(review): assumes odi->systemid is an array member - confirm
+	 * against struct osd_dev_info.
+	 */
+	if (unlikely(odi->systemid_len > sizeof(odi->systemid)))
+		return -EINVAL;
+	memcpy(odi->systemid, dt_dev->systemid, odi->systemid_len);
+
+	odi->osdname_len = le32_to_cpu(dt_dev->osdname_len);
+	odi->osdname = dt_dev->osdname;
+
+	/* FIXME support long names. Will need a _put function */
+	if (dt_dev->long_name_offset)
+		return -EINVAL;
+
+	/* Make sure osdname is printable!
+	 * mkexofs should give us space for a null-terminator else the
+	 * device-table is invalid.
+	 */
+	if (unlikely(odi->osdname_len >= sizeof(dt_dev->osdname)))
+		odi->osdname_len = sizeof(dt_dev->osdname) - 1;
+	dt_dev->osdname[odi->osdname_len] = 0;
+
+	/* If it's all zeros something is bad we read past end-of-obj */
+	return !(odi->systemid_len || odi->osdname_len);
+}
+
+/*
+ * Read the device table through the device the user mounted with and fill
+ * sbi->s_ods[] with all participating devices, in device-table order.
+ *
+ * @psbi:        in/out; *psbi may be reallocated to make room for more than
+ *               one device, in which case *psbi is updated.
+ * @table_count: number of device entries the super block says the table has.
+ *
+ * The mount device is taken out of sbi->s_ods[0] for the duration of the
+ * lookup and put back at its device-table index once it is matched. If it is
+ * never matched - because of an error or because the table does not list it -
+ * its reference is dropped here, since the caller can no longer reach it
+ * through @sbi.
+ *
+ * Returns 0 on success or a negative errno. On failure the devices already
+ * stored in sbi->s_ods[] (counted by sbi->s_numdevs) are left for the caller
+ * to release, e.g. with exofs_free_sbi().
+ */
+static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
+				       unsigned table_count)
+{
+	struct exofs_sb_info *sbi = *psbi;
+	struct osd_dev *fscb_od;
+	struct osd_obj_id obj = {.partition = sbi->s_pid,
+				 .id = EXOFS_DEVTABLE_ID};
+	struct exofs_device_table *dt;
+	unsigned table_bytes;
+	unsigned numdevs, i;
+	int ret;
+
+	/* table_count comes from disk: refuse values that would wrap the
+	 * size computation below. sbi still owns the mount device here.
+	 */
+	if (unlikely(table_count >
+		     (UINT_MAX - sizeof(*dt)) / sizeof(dt->dt_dev_table[0]))) {
+		EXOFS_ERR("ERROR: device table count %u is too big\n",
+			  table_count);
+		return -EINVAL;
+	}
+	table_bytes = table_count * sizeof(dt->dt_dev_table[0]) + sizeof(*dt);
+
+	dt = kmalloc(table_bytes, GFP_KERNEL);
+	if (unlikely(!dt)) {
+		EXOFS_ERR("ERROR: allocating %x bytes for device table\n",
+			  table_bytes);
+		return -ENOMEM;
+	}
+
+	fscb_od = sbi->s_ods[0];
+	sbi->s_ods[0] = NULL;
+	sbi->s_numdevs = 0;
+	ret = exofs_read_kern(fscb_od, sbi->s_cred, &obj, 0, dt, table_bytes);
+	if (unlikely(ret)) {
+		EXOFS_ERR("ERROR: reading device table\n");
+		goto out;
+	}
+
+	numdevs = le64_to_cpu(dt->dt_num_devices);
+	/* Only table_count entries were read; a larger numdevs would make the
+	 * loop below walk past the end of @dt.
+	 */
+	if (unlikely(!numdevs || numdevs > table_count)) {
+		ret = -EINVAL;
+		goto out;
+	}
+	WARN_ON(table_count != numdevs);
+
+	ret = _read_and_match_data_map(sbi, numdevs, dt);
+	if (unlikely(ret))
+		goto out;
+
+	if (likely(numdevs > 1)) {
+		unsigned size = numdevs * sizeof(sbi->s_ods[0]);
+		struct exofs_sb_info *new_sbi;
+
+		/* on failure the old sbi is untouched and still in *psbi */
+		new_sbi = krealloc(sbi, sizeof(*sbi) + size, GFP_KERNEL);
+		if (unlikely(!new_sbi)) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		sbi = new_sbi;
+		memset(&sbi->s_ods[1], 0, size - sizeof(sbi->s_ods[0]));
+		*psbi = sbi;
+	}
+
+	for (i = 0; i < numdevs; i++) {
+		struct exofs_fscb fscb;
+		struct osd_dev_info odi;
+		struct osd_dev *od;
+
+		if (exofs_devs_2_odi(&dt->dt_dev_table[i], &odi)) {
+			EXOFS_ERR("ERROR: Read all-zeros device entry\n");
+			ret = -EINVAL;
+			goto out;
+		}
+
+		printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n",
+		       i, odi.osdname);
+
+		/* On all devices the device table is identical. The user can
+		 * specify any one of the participating devices on the command
+		 * line. We always keep them in device-table order.
+		 */
+		if (fscb_od && osduld_device_same(fscb_od, &odi)) {
+			sbi->s_ods[i] = fscb_od;
+			++sbi->s_numdevs;
+			fscb_od = NULL;
+			continue;
+		}
+
+		od = osduld_info_lookup(&odi);
+		if (unlikely(IS_ERR(od))) {
+			ret = PTR_ERR(od);
+			EXOFS_ERR("ERROR: device requested is not found "
+				  "osd_name-%s =>%d\n", odi.osdname, ret);
+			goto out;
+		}
+
+		sbi->s_ods[i] = od;
+		++sbi->s_numdevs;
+
+		/* Read the fscb of the other devices to make sure the FS
+		 * partition is there.
+		 */
+		ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb,
+				      sizeof(fscb));
+		if (unlikely(ret)) {
+			EXOFS_ERR("ERROR: Malformed participating device "
+				  "error reading fscb osd_name-%s\n",
+				  odi.osdname);
+			goto out;
+		}
+
+		/* TODO: verify other information is correct and FS-uuid
+		 *	 matches. Benny what did you say about device table
+		 *	 generation and old devices?
+		 */
+	}
+
+out:
+	kfree(dt);
+	if (unlikely(fscb_od)) {
+		/* The mount device was never handed back to sbi, so nobody
+		 * else will release it. This covers both the error paths
+		 * above and a table that does not list the mount device.
+		 */
+		if (!ret) {
+			EXOFS_ERR(
+		      "ERROR: Bad device-table container device not present\n");
+			ret = -EINVAL;
+		}
+		osduld_put_device(fscb_od);
+	}
+
+	return ret;
+}
+
 /*
  * Read the superblock from the OSD and fill in the fields
  */
  
  
  
  
  
@@ -292,24 +482,25 @@
 	struct inode *root;
 	struct exofs_mountopt *opts = data;
 	struct exofs_sb_info *sbi;	/*extended info                  */
+	struct osd_dev *od;		/* Master device                 */
 	struct exofs_fscb fscb;		/*on-disk superblock info        */
-	struct osd_request *or = NULL;
 	struct osd_obj_id obj;
+	unsigned table_count;
 	int ret;
  
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
 		return -ENOMEM;
-	sb->s_fs_info = sbi;
  
 	/* use mount options to fill superblock */
-	sbi->s_dev = osduld_path_lookup(opts->dev_name);
-	if (IS_ERR(sbi->s_dev)) {
-		ret = PTR_ERR(sbi->s_dev);
-		sbi->s_dev = NULL;
+	od = osduld_path_lookup(opts->dev_name);
+	if (IS_ERR(od)) {
+		ret = PTR_ERR(od);
 		goto free_sbi;
 	}
  
+	sbi->s_ods[0] = od;
+	sbi->s_numdevs = 1;
 	sbi->s_pid = opts->pid;
 	sbi->s_timeout = opts->timeout;
  
  
  
  
@@ -323,36 +514,14 @@
 	sb->s_bdev = NULL;
 	sb->s_dev = 0;
  
-	/* read data from on-disk superblock object */
 	obj.partition = sbi->s_pid;
 	obj.id = EXOFS_SUPER_ID;
 	exofs_make_credential(sbi->s_cred, &obj);
  
-	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
-	if (unlikely(!or)) {
-		if (!silent)
-			EXOFS_ERR(
-			       "exofs_fill_super: osd_start_request failed.\n");
-		ret = -ENOMEM;
+	ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb, sizeof(fscb));
+	if (unlikely(ret))
 		goto free_sbi;
-	}
-	ret = osd_req_read_kern(or, &obj, 0, &fscb, sizeof(fscb));
-	if (unlikely(ret)) {
-		if (!silent)
-			EXOFS_ERR(
-			       "exofs_fill_super: osd_req_read_kern failed.\n");
-		ret = -ENOMEM;
-		goto free_sbi;
-	}
  
-	ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
-	if (unlikely(ret)) {
-		if (!silent)
-			EXOFS_ERR("exofs_fill_super: exofs_sync_op failed.\n");
-		ret = -EIO;
-		goto free_sbi;
-	}
-
 	sb->s_magic = le16_to_cpu(fscb.s_magic);
 	sbi->s_nextid = le64_to_cpu(fscb.s_nextid);
 	sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles);
  
  
@@ -364,12 +533,26 @@
 		ret = -EINVAL;
 		goto free_sbi;
 	}
+	if (le32_to_cpu(fscb.s_version) != EXOFS_FSCB_VER) {
+		EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n",
+			  EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version));
+		ret = -EINVAL;
+		goto free_sbi;
+	}
  
 	/* start generation numbers from a random point */
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 	spin_lock_init(&sbi->s_next_gen_lock);
  
+	table_count = le64_to_cpu(fscb.s_dev_table_count);
+	if (table_count) {
+		ret = exofs_read_lookup_dev_table(&sbi, table_count);
+		if (unlikely(ret))
+			goto free_sbi;
+	}
+
 	/* set up operation vectors */
+	sb->s_fs_info = sbi;
 	sb->s_op = &exofs_sops;
 	sb->s_export_op = &exofs_export_ops;
 	root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF);
  
@@ -395,16 +578,15 @@
 		goto free_sbi;
 	}
  
-	ret = 0;
-out:
-	if (or)
-		osd_end_request(or);
-	return ret;
+	_exofs_print_device("Mounting", opts->dev_name, sbi->s_ods[0],
+			    sbi->s_pid);
+	return 0;
  
 free_sbi:
-	osduld_put_device(sbi->s_dev); /* NULL safe */
-	kfree(sbi);
-	goto out;
+	EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
+		  opts->dev_name, sbi->s_pid, ret);
+	exofs_free_sbi(sbi);
+	return ret;
 }
  
 /*
@@ -433,7 +615,7 @@
 {
 	struct super_block *sb = dentry->d_sb;
 	struct exofs_sb_info *sbi = sb->s_fs_info;
-	struct osd_obj_id obj = {sbi->s_pid, 0};
+	struct exofs_io_state *ios;
 	struct osd_attr attrs[] = {
 		ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS,
 			OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)),
  
  
  
  
  
@@ -442,32 +624,33 @@
 	};
 	uint64_t capacity = ULLONG_MAX;
 	uint64_t used = ULLONG_MAX;
-	struct osd_request *or;
 	uint8_t cred_a[OSD_CAP_LEN];
 	int ret;
  
-	/* get used/capacity attributes */
-	exofs_make_credential(cred_a, &obj);
-
-	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
-	if (unlikely(!or)) {
-		EXOFS_DBGMSG("exofs_statfs: osd_start_request failed.\n");
-		return -ENOMEM;
+	ret = exofs_get_io_state(sbi, &ios);
+	if (ret) {
+		EXOFS_DBGMSG("exofs_get_io_state failed.\n");
+		return ret;
 	}
  
-	osd_req_get_attributes(or, &obj);
-	osd_req_add_get_attr_list(or, attrs, ARRAY_SIZE(attrs));
-	ret = exofs_sync_op(or, sbi->s_timeout, cred_a);
+	exofs_make_credential(cred_a, &ios->obj);
+	ios->cred = sbi->s_cred;
+	ios->in_attr = attrs;
+	ios->in_attr_len = ARRAY_SIZE(attrs);
+
+	ret = exofs_sbi_read(ios);
 	if (unlikely(ret))
 		goto out;
  
-	ret = extract_attr_from_req(or, &attrs[0]);
-	if (likely(!ret))
+	ret = extract_attr_from_ios(ios, &attrs[0]);
+	if (likely(!ret)) {
 		capacity = get_unaligned_be64(attrs[0].val_ptr);
-	else
+		if (unlikely(!capacity))
+			capacity = ULLONG_MAX;
+	} else
 		EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n");
  
-	ret = extract_attr_from_req(or, &attrs[1]);
+	ret = extract_attr_from_ios(ios, &attrs[1]);
 	if (likely(!ret))
 		used = get_unaligned_be64(attrs[1].val_ptr);
 	else
  
@@ -476,15 +659,15 @@
 	/* fill in the stats buffer */
 	buf->f_type = EXOFS_SUPER_MAGIC;
 	buf->f_bsize = EXOFS_BLKSIZE;
-	buf->f_blocks = (capacity >> EXOFS_BLKSHIFT);
-	buf->f_bfree = ((capacity - used) >> EXOFS_BLKSHIFT);
+	buf->f_blocks = capacity >> 9;
+	buf->f_bfree = (capacity - used) >> 9;
 	buf->f_bavail = buf->f_bfree;
 	buf->f_files = sbi->s_numfiles;
 	buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles;
 	buf->f_namelen = EXOFS_NAME_LEN;
  
 out:
-	osd_end_request(or);
+	exofs_put_io_state(ios);
 	return ret;
 }
...	...	@@ -36,6 +36,8 @@
36	36	- info about directory notification in Linux.
37	37	ecryptfs.txt
38	38	- docs on eCryptfs: stacked cryptographic filesystem for Linux.
	39	+exofs.txt
	40	+ - info, usage, mount options, design about EXOFS.
39	41	ext2.txt
40	42	- info, mount options and specifications for the Ext2 filesystem.
41	43	ext3.txt
...	...	@@ -60,13 +60,13 @@
60	60
61	61	mkfs.exofs --pid=65536 --format /dev/osd0
62	62
63		- The --format is optional if not specified no OSD_FORMAT will be
64		- preformed and a clean file system will be created in the specified pid,
	63	+ The --format is optional. If not specified, no OSD_FORMAT will be
	64	+ performed and a clean file system will be created in the specified pid,
65	65	in the available space of the target. (Use --format=size_in_meg to limit
66	66	the total LUN space available)
67	67
68		- If pid already exist it will be deleted and a new one will be created in it's
69		- place. Be careful.
	68	+ If pid already exists, it will be deleted and a new one will be created in
	69	+ its place. Be careful.
70	70
71	71	An exofs lives inside a single OSD partition. You can create multiple exofs
72	72	filesystems on the same device using multiple pids.
...	...	@@ -81,7 +81,7 @@
81	81
82	82	7. For reference (See do-exofs example script):
83	83	do-exofs start - an example of how to perform the above steps.
84		- do-exofs stop - an example of how to unmount the file system.
	84	+ do-exofs stop - an example of how to unmount the file system.
85	85	do-exofs format - an example of how to format and mkfs a new exofs.
86	86
87	87	8. Extra compilation flags (uncomment in fs/exofs/Kbuild):
...	...	@@ -104,8 +104,8 @@
104	104	exofs specific options: Options are separated by commas (,)
105	105	pid=<integer> - The partition number to mount/create as
106	106	container of the filesystem.
107		- This option is mandatory
108		- to=<integer> - Timeout in ticks for a single command
	107	+ This option is mandatory.
	108	+ to=<integer> - Timeout in ticks for a single command.
109	109	default is (60 * HZ) [for debugging only]
110	110
111	111	===============================================================================
...	...	@@ -116,7 +116,7 @@
116	116	with a special ID (defined in common.h).
117	117	Information included in the file system control block is used to fill the
118	118	in-memory superblock structure at mount time. This object is created before
119		- the file system is used by mkexofs.c It contains information such as:
	119	+ the file system is used by mkexofs.c. It contains information such as:
120	120	- The file system's magic number
121	121	- The next inode number to be allocated
122	122
...	...	@@ -134,8 +134,8 @@
134	134	attributes. This applies to both regular files and other types (directories,
135	135	device files, symlinks, etc.).
136	136
137		-* Credentials are generated per object (inode and superblock) when they is
138		- created in memory (read off disk or created). The credential works for all
	137	+* Credentials are generated per object (inode and superblock) when they are
	138	+ created in memory (read from disk or created). The credential works for all
139	139	operations and is used as long as the object remains in memory.
140	140
141	141	* Async OSD operations are used whenever possible, but the target may execute
...	...	@@ -145,7 +145,8 @@
145	145	from executing in reverse order:
146	146	- The following are handled with the OBJ_CREATED and OBJ_2BCREATED
147	147	flags. OBJ_CREATED is set when we know the object exists on the OSD -
148		- in create's callback function, and when we successfully do a read_inode.
	148	+ in create's callback function, and when we successfully do a
	149	+ read_inode.
149	150	OBJ_2BCREATED is set in the beginning of the create function, so we
150	151	know that we should wait.
151	152	- create/delete: delete should wait until the object is created
...	...	@@ -12,6 +12,6 @@
12	12	# Kbuild - Gets included from the Kernels Makefile and build system
13	13	#
14	14
15		-exofs-y := osd.o inode.o file.o symlink.o namei.o dir.o super.o
	15	+exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o
16	16	obj-$(CONFIG_EXOFS_FS) += exofs.o
...	...	@@ -49,6 +49,7 @@
49	49	#define EXOFS_MIN_PID 0x10000 /* Smallest partition ID */
50	50	#define EXOFS_OBJ_OFF 0x10000 /* offset for objects */
51	51	#define EXOFS_SUPER_ID 0x10000 /* object ID for on-disk superblock */
	52	+#define EXOFS_DEVTABLE_ID 0x10001 /* object ID for on-disk device table */
52	53	#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */
53	54
54	55	/* exofs Application specific page/attribute */
55	56
56	57
57	58
58	59
...	...	@@ -78,18 +79,68 @@
78	79	#define EXOFS_SUPER_MAGIC 0x5DF5
79	80
80	81	/*
81		- * The file system control block - stored in an object's data (mainly, the one
82		- * with ID EXOFS_SUPER_ID). This is where the in-memory superblock is stored
83		- * on disk. Right now it just has a magic value, which is basically a sanity
84		- * check on our ability to communicate with the object store.
	82	+ * The file system control block - stored in object EXOFS_SUPER_ID's data.
	83	+ * This is where the in-memory superblock is stored on disk.
85	84	*/
	85	+enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1};
86	86	struct exofs_fscb {
87	87	__le64 s_nextid; /* Highest object ID used */
88		- __le32 s_numfiles; /* Number of files on fs */
	88	+ __le64 s_numfiles; /* Number of files on fs */
	89	+ __le32 s_version; /* == EXOFS_FSCB_VER */
89	90	__le16 s_magic; /* Magic signature */
90	91	__le16 s_newfs; /* Non-zero if this is a new fs */
91		-};
92	92
	93	+ /* From here on it's a static part, only written by mkexofs */
	94	+ __le64 s_dev_table_oid; /* Resurved, not used */
	95	+ __le64 s_dev_table_count; /* == 0 means no dev_table */
	96	+} __packed;
	97	+
	98	+/*
	99	+ * Describes the raid used in the FS. It is part of the device table.
	100	+ * This here is taken from the pNFS-objects definition. In exofs we
	101	+ * use one raid policy through-out the filesystem. (NOTE: the funny
	102	+ * alignment at begining. We take care of it at exofs_device_table.
	103	+ */
	104	+struct exofs_dt_data_map {
	105	+ __le32 cb_num_comps;
	106	+ __le64 cb_stripe_unit;
	107	+ __le32 cb_group_width;
	108	+ __le32 cb_group_depth;
	109	+ __le32 cb_mirror_cnt;
	110	+ __le32 cb_raid_algorithm;
	111	+} __packed;
	112	+
	113	+/*
	114	+ * This is an osd device information descriptor. It is a single entry in
	115	+ * the exofs device table. It describes an osd target lun which
	116	+ * contains data belonging to this FS. (Same partition_id on all devices)
	117	+ */
	118	+struct exofs_dt_device_info {
	119	+ __le32 systemid_len;
	120	+ u8 systemid[OSD_SYSTEMID_LEN];
	121	+ __le64 long_name_offset; /* If !0 then offset-in-file */
	122	+ __le32 osdname_len; /* */
	123	+ u8 osdname[44]; /* Embbeded, Ususally an asci uuid */
	124	+} __packed;
	125	+
	126	+/*
	127	+ * The EXOFS device table - stored in object EXOFS_DEVTABLE_ID's data.
	128	+ * It contains the raid used for this multy-device FS and an array of
	129	+ * participating devices.
	130	+ */
	131	+struct exofs_device_table {
	132	+ __le32 dt_version; /* == EXOFS_DT_VER */
	133	+ struct exofs_dt_data_map dt_data_map; /* Raid policy to use */
	134	+
	135	+ /* Resurved space For future use. Total includeing this:
	136	+ * (8 * sizeof(le64))
	137	+ */
	138	+ __le64 __Resurved[4];
	139	+
	140	+ __le64 dt_num_devices; /* Array size */
	141	+ struct exofs_dt_device_info dt_dev_table[]; /* Array of devices */
	142	+} __packed;
	143	+
93	144	/****************************************************************************
94	145	* inode-related things
95	146	****************************************************************************/
...	...	@@ -154,24 +205,6 @@
154	205	#define EXOFS_DIR_REC_LEN(name_len) \
155	206	(((name_len) + offsetof(struct exofs_dir_entry, name) + \
156	207	EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND)
157		-
158		-/*************************
159		- * function declarations *
160		- *************************/
161		-/* osd.c */
162		-void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
163		- const struct osd_obj_id *obj);
164		-
165		-int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid);
166		-static inline int exofs_check_ok(struct osd_request *or)
167		-{
168		- return exofs_check_ok_resid(or, NULL, NULL);
169		-}
170		-int exofs_sync_op(struct osd_request *or, int timeout, u8 *cred);
171		-int exofs_async_op(struct osd_request *or,
172		- osd_req_done_fn *async_done, void *caller_context, u8 *cred);
173		-
174		-int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr);
175	208
176	209	#endif /*ifndef __EXOFS_COM_H__*/
...	...	@@ -30,13 +30,17 @@
30	30	* along with exofs; if not, write to the Free Software
31	31	* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
32	32	*/
	33	+#ifndef __EXOFS_H__
	34	+#define __EXOFS_H__
33	35
34	36	#include <linux/fs.h>
35	37	#include <linux/time.h>
36	38	#include "common.h"
37	39
38		-#ifndef __EXOFS_H__
39		-#define __EXOFS_H__
	40	+/* FIXME: Remove once pnfs hits mainline
	41	+ * #include <linux/exportfs/pnfs_osd_xdr.h>
	42	+ */
	43	+#include "pnfs.h"
40	44
41	45	#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a)
42	46
...	...	@@ -55,7 +59,7 @@
55	59	* our extension to the in-memory superblock
56	60	*/
57	61	struct exofs_sb_info {
58		- struct osd_dev *s_dev; /* returned by get_osd_dev */
	62	+ struct exofs_fscb s_fscb; /* Written often, pre-allocate*/
59	63	osd_id s_pid; /* partition ID of file system*/
60	64	int s_timeout; /* timeout for OSD operations */
61	65	uint64_t s_nextid; /* highest object ID used */
...	...	@@ -63,7 +67,11 @@
63	67	spinlock_t s_next_gen_lock; /* spinlock for gen # update */
64	68	u32 s_next_generation; /* next gen # to use */
65	69	atomic_t s_curr_pending; /* number of pending commands */
66		- uint8_t s_cred[OSD_CAP_LEN]; /* all-powerful credential */
	70	+ uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */
	71	+
	72	+ struct pnfs_osd_data_map data_map; /* Default raid to use */
	73	+ unsigned s_numdevs; /* Num of devices in array */
	74	+ struct osd_dev *s_ods[1]; /* Variable length, minimum 1 */
67	75	};
68	76
69	77	/*
...	...	@@ -79,6 +87,50 @@
79	87	struct inode vfs_inode; /* normal in-memory inode */
80	88	};
81	89
	90	+static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
	91	+{
	92	+ return oi->vfs_inode.i_ino + EXOFS_OBJ_OFF;
	93	+}
	94	+
	95	+struct exofs_io_state;
	96	+typedef void (*exofs_io_done_fn)(struct exofs_io_state *or, void *private);
	97	+
	98	+struct exofs_io_state {
	99	+ struct kref kref;
	100	+
	101	+ void *private;
	102	+ exofs_io_done_fn done;
	103	+
	104	+ struct exofs_sb_info *sbi;
	105	+ struct osd_obj_id obj;
	106	+ u8 *cred;
	107	+
	108	+ /* Global read/write IO*/
	109	+ loff_t offset;
	110	+ unsigned long length;
	111	+ void *kern_buff;
	112	+ struct bio *bio;
	113	+
	114	+ /* Attributes */
	115	+ unsigned in_attr_len;
	116	+ struct osd_attr *in_attr;
	117	+ unsigned out_attr_len;
	118	+ struct osd_attr *out_attr;
	119	+
	120	+ /* Variable array of size numdevs */
	121	+ unsigned numdevs;
	122	+ struct exofs_per_dev_state {
	123	+ struct osd_request *or;
	124	+ struct bio *bio;
	125	+ } per_dev[];
	126	+};
	127	+
	128	+static inline unsigned exofs_io_state_size(unsigned numdevs)
	129	+{
	130	+ return sizeof(struct exofs_io_state) +
	131	+ sizeof(struct exofs_per_dev_state) * numdevs;
	132	+}
	133	+
82	134	/*
83	135	* our inode flags
84	136	*/
...	...	@@ -130,6 +182,42 @@
130	182	/*************************
131	183	* function declarations *
132	184	*************************/
	185	+
	186	+/* ios.c */
	187	+void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
	188	+ const struct osd_obj_id *obj);
	189	+int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
	190	+ u64 offset, void *p, unsigned length);
	191	+
	192	+int exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** ios);
	193	+void exofs_put_io_state(struct exofs_io_state *ios);
	194	+
	195	+int exofs_check_io(struct exofs_io_state *ios, u64 *resid);
	196	+
	197	+int exofs_sbi_create(struct exofs_io_state *ios);
	198	+int exofs_sbi_remove(struct exofs_io_state *ios);
	199	+int exofs_sbi_write(struct exofs_io_state *ios);
	200	+int exofs_sbi_read(struct exofs_io_state *ios);
	201	+
	202	+int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr);
	203	+
	204	+int exofs_oi_truncate(struct exofs_i_info *oi, u64 new_len);
	205	+static inline int exofs_oi_write(struct exofs_i_info *oi,
	206	+ struct exofs_io_state *ios)
	207	+{
	208	+ ios->obj.id = exofs_oi_objno(oi);
	209	+ ios->cred = oi->i_cred;
	210	+ return exofs_sbi_write(ios);
	211	+}
	212	+
	213	+static inline int exofs_oi_read(struct exofs_i_info *oi,
	214	+ struct exofs_io_state *ios)
	215	+{
	216	+ ios->obj.id = exofs_oi_objno(oi);
	217	+ ios->cred = oi->i_cred;
	218	+ return exofs_sbi_read(ios);
	219	+}
	220	+
133	221	/* inode.c */
134	222	void exofs_truncate(struct inode *inode);
135	223	int exofs_setattr(struct dentry *, struct iattr *);
...	...	@@ -169,6 +257,7 @@
169	257
170	258	/* inode.c */
171	259	extern const struct address_space_operations exofs_aops;
	260	+extern const struct osd_attr g_attr_logical_length;
172	261
173	262	/* namei.c */
174	263	extern const struct inode_operations exofs_dir_inode_operations;
	1	+/*
	2	+ * Copyright (C) 2005, 2006
	3	+ * Avishay Traeger (avishay@gmail.com)
	4	+ * Copyright (C) 2008, 2009
	5	+ * Boaz Harrosh <bharrosh@panasas.com>
	6	+ *
	7	+ * This file is part of exofs.
	8	+ *
	9	+ * exofs is free software; you can redistribute it and/or modify
	10	+ * it under the terms of the GNU General Public License as published by
	11	+ * the Free Software Foundation. Since it is based on ext2, and the only
	12	+ * valid version of GPL for the Linux kernel is version 2, the only valid
	13	+ * version of GPL for exofs is version 2.
	14	+ *
	15	+ * exofs is distributed in the hope that it will be useful,
	16	+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	+ * GNU General Public License for more details.
	19	+ *
	20	+ * You should have received a copy of the GNU General Public License
	21	+ * along with exofs; if not, write to the Free Software
	22	+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
	23	+ */
	24	+
	25	+#include <scsi/scsi_device.h>
	26	+
	27	+#include "exofs.h"
	28	+
	29	+void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
	30	+{
	31	+ osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
	32	+}
	33	+
	34	+int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
	35	+ u64 offset, void *p, unsigned length)
	36	+{
	37	+ struct osd_request *or = osd_start_request(od, GFP_KERNEL);
	38	+/* struct osd_sense_info osi = {.key = 0};*/
	39	+ int ret;
	40	+
	41	+ if (unlikely(!or)) {
	42	+ EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__);
	43	+ return -ENOMEM;
	44	+ }
	45	+ ret = osd_req_read_kern(or, obj, offset, p, length);
	46	+ if (unlikely(ret)) {
	47	+ EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__);
	48	+ goto out;
	49	+ }
	50	+
	51	+ ret = osd_finalize_request(or, 0, cred, NULL);
	52	+ if (unlikely(ret)) {
	53	+ EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
	54	+ goto out;
	55	+ }
	56	+
	57	+ ret = osd_execute_request(or);
	58	+ if (unlikely(ret))
	59	+ EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
	60	+ /* osd_req_decode_sense(or, ret); */
	61	+
	62	+out:
	63	+ osd_end_request(or);
	64	+ return ret;
	65	+}
	66	+
	67	+int exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** pios)
	68	+{
	69	+ struct exofs_io_state *ios;
	70	+
	71	+ /*TODO: Maybe use kmem_cach per sbi of size
	72	+ * exofs_io_state_size(sbi->s_numdevs)
	73	+ */
	74	+ ios = kzalloc(exofs_io_state_size(sbi->s_numdevs), GFP_KERNEL);
	75	+ if (unlikely(!ios)) {
	76	+ *pios = NULL;
	77	+ return -ENOMEM;
	78	+ }
	79	+
	80	+ ios->sbi = sbi;
	81	+ ios->obj.partition = sbi->s_pid;
	82	+ *pios = ios;
	83	+ return 0;
	84	+}
	85	+
	86	+void exofs_put_io_state(struct exofs_io_state *ios)
	87	+{
	88	+ if (ios) {
	89	+ unsigned i;
	90	+
	91	+ for (i = 0; i < ios->numdevs; i++) {
	92	+ struct exofs_per_dev_state *per_dev = &ios->per_dev[i];
	93	+
	94	+ if (per_dev->or)
	95	+ osd_end_request(per_dev->or);
	96	+ if (per_dev->bio)
	97	+ bio_put(per_dev->bio);
	98	+ }
	99	+
	100	+ kfree(ios);
	101	+ }
	102	+}
	103	+
	104	+static void _sync_done(struct exofs_io_state *ios, void *p)
	105	+{
	106	+ struct completion *waiting = p;
	107	+
	108	+ complete(waiting);
	109	+}
	110	+
	111	+static void _last_io(struct kref *kref)
	112	+{
	113	+ struct exofs_io_state *ios = container_of(
	114	+ kref, struct exofs_io_state, kref);
	115	+
	116	+ ios->done(ios, ios->private);
	117	+}
	118	+
	119	+static void _done_io(struct osd_request *or, void *p)
	120	+{
	121	+ struct exofs_io_state *ios = p;
	122	+
	123	+ kref_put(&ios->kref, _last_io);
	124	+}
	125	+
	126	+static int exofs_io_execute(struct exofs_io_state *ios)
	127	+{
	128	+ DECLARE_COMPLETION_ONSTACK(wait);
	129	+ bool sync = (ios->done == NULL);
	130	+ int i, ret;
	131	+
	132	+ if (sync) {
	133	+ ios->done = _sync_done;
	134	+ ios->private = &wait;
	135	+ }
	136	+
	137	+ for (i = 0; i < ios->numdevs; i++) {
	138	+ struct osd_request *or = ios->per_dev[i].or;
	139	+ if (unlikely(!or))
	140	+ continue;
	141	+
	142	+ ret = osd_finalize_request(or, 0, ios->cred, NULL);
	143	+ if (unlikely(ret)) {
	144	+ EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n",
	145	+ ret);
	146	+ return ret;
	147	+ }
	148	+ }
	149	+
	150	+ kref_init(&ios->kref);
	151	+
	152	+ for (i = 0; i < ios->numdevs; i++) {
	153	+ struct osd_request *or = ios->per_dev[i].or;
	154	+ if (unlikely(!or))
	155	+ continue;
	156	+
	157	+ kref_get(&ios->kref);
	158	+ osd_execute_request_async(or, _done_io, ios);
	159	+ }
	160	+
	161	+ kref_put(&ios->kref, _last_io);
	162	+ ret = 0;
	163	+
	164	+ if (sync) {
	165	+ wait_for_completion(&wait);
	166	+ ret = exofs_check_io(ios, NULL);
	167	+ }
	168	+ return ret;
	169	+}
	170	+
	171	+int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
	172	+{
	173	+ enum osd_err_priority acumulated_osd_err = 0;
	174	+ int acumulated_lin_err = 0;
	175	+ int i;
	176	+
	177	+ for (i = 0; i < ios->numdevs; i++) {
	178	+ struct osd_sense_info osi;
	179	+ int ret = osd_req_decode_sense(ios->per_dev[i].or, &osi);
	180	+
	181	+ if (likely(!ret))
	182	+ continue;
	183	+
	184	+ if (unlikely(ret == -EFAULT)) {
	185	+ EXOFS_DBGMSG("%s: EFAULT Need page clear\n", __func__);
	186	+ /*FIXME: All the pages in this device range should:
	187	+ * clear_highpage(page);
	188	+ */
	189	+ }
	190	+
	191	+ if (osi.osd_err_pri >= acumulated_osd_err) {
	192	+ acumulated_osd_err = osi.osd_err_pri;
	193	+ acumulated_lin_err = ret;
	194	+ }
	195	+ }
	196	+
	197	+ /* TODO: raid specific residual calculations */
	198	+ if (resid) {
	199	+ if (likely(!acumulated_lin_err))
	200	+ *resid = 0;
	201	+ else
	202	+ *resid = ios->length;
	203	+ }
	204	+
	205	+ return acumulated_lin_err;
	206	+}
	207	+
	208	+int exofs_sbi_create(struct exofs_io_state *ios)
	209	+{
	210	+ int i, ret;
	211	+
	212	+ for (i = 0; i < ios->sbi->s_numdevs; i++) {
	213	+ struct osd_request *or;
	214	+
	215	+ or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL);
	216	+ if (unlikely(!or)) {
	217	+ EXOFS_ERR("%s: osd_start_request failed\n", __func__);
	218	+ ret = -ENOMEM;
	219	+ goto out;
	220	+ }
	221	+ ios->per_dev[i].or = or;
	222	+ ios->numdevs++;
	223	+
	224	+ osd_req_create_object(or, &ios->obj);
	225	+ }
	226	+ ret = exofs_io_execute(ios);
	227	+
	228	+out:
	229	+ return ret;
	230	+}
	231	+
	232	+int exofs_sbi_remove(struct exofs_io_state *ios)
	233	+{
	234	+ int i, ret;
	235	+
	236	+ for (i = 0; i < ios->sbi->s_numdevs; i++) {
	237	+ struct osd_request *or;
	238	+
	239	+ or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL);
	240	+ if (unlikely(!or)) {
	241	+ EXOFS_ERR("%s: osd_start_request failed\n", __func__);
	242	+ ret = -ENOMEM;
	243	+ goto out;
	244	+ }
	245	+ ios->per_dev[i].or = or;
	246	+ ios->numdevs++;
	247	+
	248	+ osd_req_remove_object(or, &ios->obj);
	249	+ }
	250	+ ret = exofs_io_execute(ios);
	251	+
	252	+out:
	253	+ return ret;
	254	+}
	255	+
	256	+int exofs_sbi_write(struct exofs_io_state *ios)
	257	+{
	258	+ int i, ret;
	259	+
	260	+ for (i = 0; i < ios->sbi->s_numdevs; i++) {
	261	+ struct osd_request *or;
	262	+
	263	+ or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL);
	264	+ if (unlikely(!or)) {
	265	+ EXOFS_ERR("%s: osd_start_request failed\n", __func__);
	266	+ ret = -ENOMEM;
	267	+ goto out;
	268	+ }
	269	+ ios->per_dev[i].or = or;
	270	+ ios->numdevs++;
	271	+
	272	+ if (ios->bio) {
	273	+ struct bio *bio;
	274	+
	275	+ if (i != 0) {
	276	+ bio = bio_kmalloc(GFP_KERNEL,
	277	+ ios->bio->bi_max_vecs);
	278	+ if (unlikely(!bio)) {
	279	+ ret = -ENOMEM;
	280	+ goto out;
	281	+ }
	282	+
	283	+ __bio_clone(bio, ios->bio);
	284	+ bio->bi_bdev = NULL;
	285	+ bio->bi_next = NULL;
	286	+ ios->per_dev[i].bio = bio;
	287	+ } else {
	288	+ bio = ios->bio;
	289	+ }
	290	+
	291	+ osd_req_write(or, &ios->obj, ios->offset, bio,
	292	+ ios->length);
	293	+/* EXOFS_DBGMSG("write sync=%d\n", sync);*/
	294	+ } else if (ios->kern_buff) {
	295	+ osd_req_write_kern(or, &ios->obj, ios->offset,
	296	+ ios->kern_buff, ios->length);
	297	+/* EXOFS_DBGMSG("write_kern sync=%d\n", sync);*/
	298	+ } else {
	299	+ osd_req_set_attributes(or, &ios->obj);
	300	+/* EXOFS_DBGMSG("set_attributes sync=%d\n", sync);*/
	301	+ }
	302	+
	303	+ if (ios->out_attr)
	304	+ osd_req_add_set_attr_list(or, ios->out_attr,
	305	+ ios->out_attr_len);
	306	+
	307	+ if (ios->in_attr)
	308	+ osd_req_add_get_attr_list(or, ios->in_attr,
	309	+ ios->in_attr_len);
	310	+ }
	311	+ ret = exofs_io_execute(ios);
	312	+
	313	+out:
	314	+ return ret;
	315	+}
	316	+
	317	+int exofs_sbi_read(struct exofs_io_state *ios)
	318	+{
	319	+ int i, ret;
	320	+
	321	+ for (i = 0; i < 1; i++) {
	322	+ struct osd_request *or;
	323	+ unsigned first_dev = (unsigned)ios->obj.id;
	324	+
	325	+ first_dev %= ios->sbi->s_numdevs;
	326	+ or = osd_start_request(ios->sbi->s_ods[first_dev], GFP_KERNEL);
	327	+ if (unlikely(!or)) {
	328	+ EXOFS_ERR("%s: osd_start_request failed\n", __func__);
	329	+ ret = -ENOMEM;
	330	+ goto out;
	331	+ }
	332	+ ios->per_dev[i].or = or;
	333	+ ios->numdevs++;
	334	+
	335	+ if (ios->bio) {
	336	+ osd_req_read(or, &ios->obj, ios->offset, ios->bio,
	337	+ ios->length);
	338	+/* EXOFS_DBGMSG("read sync=%d\n", sync);*/
	339	+ } else if (ios->kern_buff) {
	340	+ osd_req_read_kern(or, &ios->obj, ios->offset,
	341	+ ios->kern_buff, ios->length);
	342	+/* EXOFS_DBGMSG("read_kern sync=%d\n", sync);*/
	343	+ } else {
	344	+ osd_req_get_attributes(or, &ios->obj);
	345	+/* EXOFS_DBGMSG("get_attributes sync=%d\n", sync);*/
	346	+ }
	347	+
	348	+ if (ios->out_attr)
	349	+ osd_req_add_set_attr_list(or, ios->out_attr,
	350	+ ios->out_attr_len);
	351	+
	352	+ if (ios->in_attr)
	353	+ osd_req_add_get_attr_list(or, ios->in_attr,
	354	+ ios->in_attr_len);
	355	+ }
	356	+ ret = exofs_io_execute(ios);
	357	+
	358	+out:
	359	+ return ret;
	360	+}
	361	+
	362	+int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr)
	363	+{
	364	+ struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
	365	+ void *iter = NULL;
	366	+ int nelem;
	367	+
	368	+ do {
	369	+ nelem = 1;
	370	+ osd_req_decode_get_attr_list(ios->per_dev[0].or,
	371	+ &cur_attr, &nelem, &iter);
	372	+ if ((cur_attr.attr_page == attr->attr_page) &&
	373	+ (cur_attr.attr_id == attr->attr_id)) {
	374	+ attr->len = cur_attr.len;
	375	+ attr->val_ptr = cur_attr.val_ptr;
	376	+ return 0;
	377	+ }
	378	+ } while (iter);
	379	+
	380	+ return -EIO;
	381	+}
	382	+
	383	+int exofs_oi_truncate(struct exofs_i_info *oi, u64 size)
	384	+{
	385	+ struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info;
	386	+ struct exofs_io_state *ios;
	387	+ struct osd_attr attr;
	388	+ __be64 newsize;
	389	+ int i, ret;
	390	+
	391	+ if (exofs_get_io_state(sbi, &ios))
	392	+ return -ENOMEM;
	393	+
	394	+ ios->obj.id = exofs_oi_objno(oi);
	395	+ ios->cred = oi->i_cred;
	396	+
	397	+ newsize = cpu_to_be64(size);
	398	+ attr = g_attr_logical_length;
	399	+ attr.val_ptr = &newsize;
	400	+
	401	+ for (i = 0; i < sbi->s_numdevs; i++) {
	402	+ struct osd_request *or;
	403	+
	404	+ or = osd_start_request(sbi->s_ods[i], GFP_KERNEL);
	405	+ if (unlikely(!or)) {
	406	+ EXOFS_ERR("%s: osd_start_request failed\n", __func__);
	407	+ ret = -ENOMEM;
	408	+ goto out;
	409	+ }
	410	+ ios->per_dev[i].or = or;
	411	+ ios->numdevs++;
	412	+
	413	+ osd_req_set_attributes(or, &ios->obj);
	414	+ osd_req_add_set_attr_list(or, &attr, 1);
	415	+ }
	416	+ ret = exofs_io_execute(ios);
	417	+
	418	+out:
	419	+ exofs_put_io_state(ios);
	420	+ return ret;
	421	+}
1		-/*
2		- * Copyright (C) 2005, 2006
3		- * Avishay Traeger (avishay@gmail.com)
4		- * Copyright (C) 2008, 2009
5		- * Boaz Harrosh <bharrosh@panasas.com>
6		- *
7		- * This file is part of exofs.
8		- *
9		- * exofs is free software; you can redistribute it and/or modify
10		- * it under the terms of the GNU General Public License as published by
11		- * the Free Software Foundation. Since it is based on ext2, and the only
12		- * valid version of GPL for the Linux kernel is version 2, the only valid
13		- * version of GPL for exofs is version 2.
14		- *
15		- * exofs is distributed in the hope that it will be useful,
16		- * but WITHOUT ANY WARRANTY; without even the implied warranty of
17		- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18		- * GNU General Public License for more details.
19		- *
20		- * You should have received a copy of the GNU General Public License
21		- * along with exofs; if not, write to the Free Software
22		- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23		- */
24		-
25		-#include <scsi/scsi_device.h>
26		-#include <scsi/osd_sense.h>
27		-
28		-#include "exofs.h"
29		-
30		-int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid)
31		-{
32		- struct osd_sense_info osi;
33		- int ret = osd_req_decode_sense(or, &osi);
34		-
35		- if (ret) { /* translate to Linux codes */
36		- if (osi.additional_code == scsi_invalid_field_in_cdb) {
37		- if (osi.cdb_field_offset == OSD_CFO_STARTING_BYTE)
38		- ret = -EFAULT;
39		- if (osi.cdb_field_offset == OSD_CFO_OBJECT_ID)
40		- ret = -ENOENT;
41		- else
42		- ret = -EINVAL;
43		- } else if (osi.additional_code == osd_quota_error)
44		- ret = -ENOSPC;
45		- else
46		- ret = -EIO;
47		- }
48		-
49		- /* FIXME: should be include in osd_sense_info */
50		- if (in_resid)
51		- *in_resid = or->in.req ? or->in.req->resid_len : 0;
52		-
53		- if (out_resid)
54		- *out_resid = or->out.req ? or->out.req->resid_len : 0;
55		-
56		- return ret;
57		-}
58		-
59		-void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
60		-{
61		- osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
62		-}
63		-
64		-/*
65		- * Perform a synchronous OSD operation.
66		- */
67		-int exofs_sync_op(struct osd_request *or, int timeout, uint8_t *credential)
68		-{
69		- int ret;
70		-
71		- or->timeout = timeout;
72		- ret = osd_finalize_request(or, 0, credential, NULL);
73		- if (ret) {
74		- EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
75		- return ret;
76		- }
77		-
78		- ret = osd_execute_request(or);
79		-
80		- if (ret)
81		- EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
82		- /* osd_req_decode_sense(or, ret); */
83		- return ret;
84		-}
85		-
86		-/*
87		- * Perform an asynchronous OSD operation.
88		- */
89		-int exofs_async_op(struct osd_request *or, osd_req_done_fn *async_done,
90		- void *caller_context, u8 *cred)
91		-{
92		- int ret;
93		-
94		- ret = osd_finalize_request(or, 0, cred, NULL);
95		- if (ret) {
96		- EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
97		- return ret;
98		- }
99		-
100		- ret = osd_execute_request_async(or, async_done, caller_context);
101		-
102		- if (ret)
103		- EXOFS_DBGMSG("osd_execute_request_async() => %d\n", ret);
104		- return ret;
105		-}
106		-
107		-int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
108		-{
109		- struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
110		- void *iter = NULL;
111		- int nelem;
112		-
113		- do {
114		- nelem = 1;
115		- osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter);
116		- if ((cur_attr.attr_page == attr->attr_page) &&
117		- (cur_attr.attr_id == attr->attr_id)) {
118		- attr->len = cur_attr.len;
119		- attr->val_ptr = cur_attr.val_ptr;
120		- return 0;
121		- }
122		- } while (iter);
123		-
124		- return -EIO;
125		-}
...	...	@@ -203,49 +203,45 @@
203	203	{
204	204	struct exofs_sb_info *sbi;
205	205	struct exofs_fscb *fscb;
206		- struct osd_request *or;
207		- struct osd_obj_id obj;
	206	+ struct exofs_io_state *ios;
208	207	int ret = -ENOMEM;
209	208
210		- fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL);
211		- if (!fscb) {
212		- EXOFS_ERR("exofs_write_super: memory allocation failed.\n");
213		- return -ENOMEM;
214		- }
215		-
216	209	lock_super(sb);
217	210	sbi = sb->s_fs_info;
	211	+ fscb = &sbi->s_fscb;
	212	+
	213	+ ret = exofs_get_io_state(sbi, &ios);
	214	+ if (ret)
	215	+ goto out;
	216	+
	217	+ /* Note: We only write the changing part of the fscb. .i.e upto the
	218	+ * the fscb->s_dev_table_oid member. There is no read-modify-write
	219	+ * here.
	220	+ */
	221	+ ios->length = offsetof(struct exofs_fscb, s_dev_table_oid);
	222	+ memset(fscb, 0, ios->length);
218	223	fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
219	224	fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles);
220	225	fscb->s_magic = cpu_to_le16(sb->s_magic);
221	226	fscb->s_newfs = 0;
	227	+ fscb->s_version = EXOFS_FSCB_VER;
222	228
223		- or = osd_start_request(sbi->s_dev, GFP_KERNEL);
224		- if (unlikely(!or)) {
225		- EXOFS_ERR("exofs_write_super: osd_start_request failed.\n");
226		- goto out;
227		- }
	229	+ ios->obj.id = EXOFS_SUPER_ID;
	230	+ ios->offset = 0;
	231	+ ios->kern_buff = fscb;
	232	+ ios->cred = sbi->s_cred;
228	233
229		- obj.partition = sbi->s_pid;
230		- obj.id = EXOFS_SUPER_ID;
231		- ret = osd_req_write_kern(or, &obj, 0, fscb, sizeof(*fscb));
	234	+ ret = exofs_sbi_write(ios);
232	235	if (unlikely(ret)) {
233		- EXOFS_ERR("exofs_write_super: osd_req_write_kern failed.\n");
	236	+ EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__);
234	237	goto out;
235	238	}
236		-
237		- ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
238		- if (unlikely(ret)) {
239		- EXOFS_ERR("exofs_write_super: exofs_sync_op failed.\n");
240		- goto out;
241		- }
242	239	sb->s_dirt = 0;
243	240
244	241	out:
245		- if (or)
246		- osd_end_request(or);
	242	+ EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret);
	243	+ exofs_put_io_state(ios);
247	244	unlock_super(sb);
248		- kfree(fscb);
249	245	return ret;
250	246	}
251	247
...	...	@@ -257,6 +253,29 @@
257	253	sb->s_dirt = 0;
258	254	}
259	255
	256	+static void _exofs_print_device(const char *msg, const char *dev_path,
	257	+ struct osd_dev *od, u64 pid)
	258	+{
	259	+ const struct osd_dev_info *odi = osduld_device_info(od);
	260	+
	261	+ printk(KERN_NOTICE "exofs: %s %s osd_name-%s pid-0x%llx\n",
	262	+ msg, dev_path ?: "", odi->osdname, _LLU(pid));
	263	+}
	264	+
	265	+void exofs_free_sbi(struct exofs_sb_info *sbi)
	266	+{
	267	+ while (sbi->s_numdevs) {
	268	+ int i = --sbi->s_numdevs;
	269	+ struct osd_dev *od = sbi->s_ods[i];
	270	+
	271	+ if (od) {
	272	+ sbi->s_ods[i] = NULL;
	273	+ osduld_put_device(od);
	274	+ }
	275	+ }
	276	+ kfree(sbi);
	277	+}
	278	+
260	279	/*
261	280	* This function is called when the vfs is freeing the superblock. We just
262	281	* need to free our own part.
263	282
...	...	@@ -279,11 +298,182 @@
279	298	msecs_to_jiffies(100));
280	299	}
281	300
282		- osduld_put_device(sbi->s_dev);
283		- kfree(sb->s_fs_info);
	301	+ _exofs_print_device("Unmounting", NULL, sbi->s_ods[0], sbi->s_pid);
	302	+
	303	+ exofs_free_sbi(sbi);
284	304	sb->s_fs_info = NULL;
285	305	}
286	306
	307	+static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
	308	+ struct exofs_device_table *dt)
	309	+{
	310	+ sbi->data_map.odm_num_comps =
	311	+ le32_to_cpu(dt->dt_data_map.cb_num_comps);
	312	+ sbi->data_map.odm_stripe_unit =
	313	+ le64_to_cpu(dt->dt_data_map.cb_stripe_unit);
	314	+ sbi->data_map.odm_group_width =
	315	+ le32_to_cpu(dt->dt_data_map.cb_group_width);
	316	+ sbi->data_map.odm_group_depth =
	317	+ le32_to_cpu(dt->dt_data_map.cb_group_depth);
	318	+ sbi->data_map.odm_mirror_cnt =
	319	+ le32_to_cpu(dt->dt_data_map.cb_mirror_cnt);
	320	+ sbi->data_map.odm_raid_algorithm =
	321	+ le32_to_cpu(dt->dt_data_map.cb_raid_algorithm);
	322	+
	323	+/* FIXME: Hard coded mirror only for now. if not so do not mount */
	324	+ if ((sbi->data_map.odm_num_comps != numdevs) ||
	325	+ (sbi->data_map.odm_stripe_unit != EXOFS_BLKSIZE) ||
	326	+ (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) ||
	327	+ (sbi->data_map.odm_mirror_cnt != (numdevs - 1)))
	328	+ return -EINVAL;
	329	+ else
	330	+ return 0;
	331	+}
	332	+
	333	+/* @odi is valid only as long as @fscb_dev is valid */
	334	+static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
	335	+ struct osd_dev_info *odi)
	336	+{
	337	+ odi->systemid_len = le32_to_cpu(dt_dev->systemid_len);
	338	+ memcpy(odi->systemid, dt_dev->systemid, odi->systemid_len);
	339	+
	340	+ odi->osdname_len = le32_to_cpu(dt_dev->osdname_len);
	341	+ odi->osdname = dt_dev->osdname;
	342	+
	343	+ /* FIXME support long names. Will need a _put function */
	344	+ if (dt_dev->long_name_offset)
	345	+ return -EINVAL;
	346	+
	347	+ /* Make sure osdname is printable!
	348	+ * mkexofs should give us space for a null-terminator else the
	349	+ * device-table is invalid.
	350	+ */
	351	+ if (unlikely(odi->osdname_len >= sizeof(dt_dev->osdname)))
	352	+ odi->osdname_len = sizeof(dt_dev->osdname) - 1;
	353	+ dt_dev->osdname[odi->osdname_len] = 0;
	354	+
	355	+ /* If it's all zeros something is bad we read past end-of-obj */
	356	+ return !(odi->systemid_len || odi->osdname_len);
	357	+}
	358	+
	359	+static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
	360	+ unsigned table_count)
	361	+{
	362	+ struct exofs_sb_info *sbi = *psbi;
	363	+ struct osd_dev *fscb_od;
	364	+ struct osd_obj_id obj = {.partition = sbi->s_pid,
	365	+ .id = EXOFS_DEVTABLE_ID};
	366	+ struct exofs_device_table *dt;
	367	+ unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) +
	368	+ sizeof(*dt);
	369	+ unsigned numdevs, i;
	370	+ int ret;
	371	+
	372	+ dt = kmalloc(table_bytes, GFP_KERNEL);
	373	+ if (unlikely(!dt)) {
	374	+ EXOFS_ERR("ERROR: allocating %x bytes for device table\n",
	375	+ table_bytes);
	376	+ return -ENOMEM;
	377	+ }
	378	+
	379	+ fscb_od = sbi->s_ods[0];
	380	+ sbi->s_ods[0] = NULL;
	381	+ sbi->s_numdevs = 0;
	382	+ ret = exofs_read_kern(fscb_od, sbi->s_cred, &obj, 0, dt, table_bytes);
	383	+ if (unlikely(ret)) {
	384	+ EXOFS_ERR("ERROR: reading device table\n");
	385	+ goto out;
	386	+ }
	387	+
	388	+ numdevs = le64_to_cpu(dt->dt_num_devices);
	389	+ if (unlikely(!numdevs)) {
	390	+ ret = -EINVAL;
	391	+ goto out;
	392	+ }
	393	+ WARN_ON(table_count != numdevs);
	394	+
	395	+ ret = _read_and_match_data_map(sbi, numdevs, dt);
	396	+ if (unlikely(ret))
	397	+ goto out;
	398	+
	399	+ if (likely(numdevs > 1)) {
	400	+ unsigned size = numdevs * sizeof(sbi->s_ods[0]);
	401	+
	402	+ sbi = krealloc(sbi, sizeof(*sbi) + size, GFP_KERNEL);
	403	+ if (unlikely(!sbi)) {
	404	+ ret = -ENOMEM;
	405	+ goto out;
	406	+ }
	407	+ memset(&sbi->s_ods[1], 0, size - sizeof(sbi->s_ods[0]));
	408	+ *psbi = sbi;
	409	+ }
	410	+
	411	+ for (i = 0; i < numdevs; i++) {
	412	+ struct exofs_fscb fscb;
	413	+ struct osd_dev_info odi;
	414	+ struct osd_dev *od;
	415	+
	416	+ if (exofs_devs_2_odi(&dt->dt_dev_table[i], &odi)) {
	417	+ EXOFS_ERR("ERROR: Read all-zeros device entry\n");
	418	+ ret = -EINVAL;
	419	+ goto out;
	420	+ }
	421	+
	422	+ printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n",
	423	+ i, odi.osdname);
	424	+
	425	+ /* On all devices the device table is identical. The user can
	426	+ * specify any one of the participating devices on the command
	427	+ * line. We always keep them in device-table order.
	428	+ */
	429	+ if (fscb_od && osduld_device_same(fscb_od, &odi)) {
	430	+ sbi->s_ods[i] = fscb_od;
	431	+ ++sbi->s_numdevs;
	432	+ fscb_od = NULL;
	433	+ continue;
	434	+ }
	435	+
	436	+ od = osduld_info_lookup(&odi);
	437	+ if (unlikely(IS_ERR(od))) {
	438	+ ret = PTR_ERR(od);
	439	+ EXOFS_ERR("ERROR: device requested is not found "
	440	+ "osd_name-%s =>%d\n", odi.osdname, ret);
	441	+ goto out;
	442	+ }
	443	+
	444	+ sbi->s_ods[i] = od;
	445	+ ++sbi->s_numdevs;
	446	+
	447	+ /* Read the fscb of the other devices to make sure the FS
	448	+ * partition is there.
	449	+ */
	450	+ ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb,
	451	+ sizeof(fscb));
	452	+ if (unlikely(ret)) {
	453	+ EXOFS_ERR("ERROR: Malformed participating device "
	454	+ "error reading fscb osd_name-%s\n",
	455	+ odi.osdname);
	456	+ goto out;
	457	+ }
	458	+
	459	+ /* TODO: verify other information is correct and FS-uuid
	460	+ * matches. Benny what did you say about device table
	461	+ * generation and old devices?
	462	+ */
	463	+ }
	464	+
	465	+out:
	466	+ kfree(dt);
	467	+ if (unlikely(!ret && fscb_od)) {
	468	+ EXOFS_ERR(
	469	+ "ERROR: Bad device-table container device not present\n");
	470	+ osduld_put_device(fscb_od);
	471	+ ret = -EINVAL;
	472	+ }
	473	+
	474	+ return ret;
	475	+}
	476	+
287	477	/*
288	478	* Read the superblock from the OSD and fill in the fields
289	479	*/
290	480
291	481
292	482
293	483
294	484
...	...	@@ -292,24 +482,25 @@
292	482	struct inode *root;
293	483	struct exofs_mountopt *opts = data;
294	484	struct exofs_sb_info *sbi; /*extended info */
	485	+ struct osd_dev *od; /* Master device */
295	486	struct exofs_fscb fscb; /*on-disk superblock info */
296		- struct osd_request *or = NULL;
297	487	struct osd_obj_id obj;
	488	+ unsigned table_count;
298	489	int ret;
299	490
300	491	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
301	492	if (!sbi)
302	493	return -ENOMEM;
303		- sb->s_fs_info = sbi;
304	494
305	495	/* use mount options to fill superblock */
306		- sbi->s_dev = osduld_path_lookup(opts->dev_name);
307		- if (IS_ERR(sbi->s_dev)) {
308		- ret = PTR_ERR(sbi->s_dev);
309		- sbi->s_dev = NULL;
	496	+ od = osduld_path_lookup(opts->dev_name);
	497	+ if (IS_ERR(od)) {
	498	+ ret = PTR_ERR(od);
310	499	goto free_sbi;
311	500	}
312	501
	502	+ sbi->s_ods[0] = od;
	503	+ sbi->s_numdevs = 1;
313	504	sbi->s_pid = opts->pid;
314	505	sbi->s_timeout = opts->timeout;
315	506
316	507
317	508
318	509
...	...	@@ -323,36 +514,14 @@
323	514	sb->s_bdev = NULL;
324	515	sb->s_dev = 0;
325	516
326		- /* read data from on-disk superblock object */
327	517	obj.partition = sbi->s_pid;
328	518	obj.id = EXOFS_SUPER_ID;
329	519	exofs_make_credential(sbi->s_cred, &obj);
330	520
331		- or = osd_start_request(sbi->s_dev, GFP_KERNEL);
332		- if (unlikely(!or)) {
333		- if (!silent)
334		- EXOFS_ERR(
335		- "exofs_fill_super: osd_start_request failed.\n");
336		- ret = -ENOMEM;
	521	+ ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb, sizeof(fscb));
	522	+ if (unlikely(ret))
337	523	goto free_sbi;
338		- }
339		- ret = osd_req_read_kern(or, &obj, 0, &fscb, sizeof(fscb));
340		- if (unlikely(ret)) {
341		- if (!silent)
342		- EXOFS_ERR(
343		- "exofs_fill_super: osd_req_read_kern failed.\n");
344		- ret = -ENOMEM;
345		- goto free_sbi;
346		- }
347	524
348		- ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
349		- if (unlikely(ret)) {
350		- if (!silent)
351		- EXOFS_ERR("exofs_fill_super: exofs_sync_op failed.\n");
352		- ret = -EIO;
353		- goto free_sbi;
354		- }
355		-
356	525	sb->s_magic = le16_to_cpu(fscb.s_magic);
357	526	sbi->s_nextid = le64_to_cpu(fscb.s_nextid);
358	527	sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles);
359	528
360	529
...	...	@@ -364,12 +533,26 @@
364	533	ret = -EINVAL;
365	534	goto free_sbi;
366	535	}
	536	+ if (le32_to_cpu(fscb.s_version) != EXOFS_FSCB_VER) {
	537	+ EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n",
	538	+ EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version));
	539	+ ret = -EINVAL;
	540	+ goto free_sbi;
	541	+ }
367	542
368	543	/* start generation numbers from a random point */
369	544	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
370	545	spin_lock_init(&sbi->s_next_gen_lock);
371	546
	547	+ table_count = le64_to_cpu(fscb.s_dev_table_count);
	548	+ if (table_count) {
	549	+ ret = exofs_read_lookup_dev_table(&sbi, table_count);
	550	+ if (unlikely(ret))
	551	+ goto free_sbi;
	552	+ }
	553	+
372	554	/* set up operation vectors */
	555	+ sb->s_fs_info = sbi;
373	556	sb->s_op = &exofs_sops;
374	557	sb->s_export_op = &exofs_export_ops;
375	558	root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF);
376	559
...	...	@@ -395,16 +578,15 @@
395	578	goto free_sbi;
396	579	}
397	580
398		- ret = 0;
399		-out:
400		- if (or)
401		- osd_end_request(or);
402		- return ret;
	581	+ _exofs_print_device("Mounting", opts->dev_name, sbi->s_ods[0],
	582	+ sbi->s_pid);
	583	+ return 0;
403	584
404	585	free_sbi:
405		- osduld_put_device(sbi->s_dev); /* NULL safe */
406		- kfree(sbi);
407		- goto out;
	586	+ EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
	587	+ opts->dev_name, sbi->s_pid, ret);
	588	+ exofs_free_sbi(sbi);
	589	+ return ret;
408	590	}
409	591
410	592	/*
...	...	@@ -433,7 +615,7 @@
433	615	{
434	616	struct super_block *sb = dentry->d_sb;
435	617	struct exofs_sb_info *sbi = sb->s_fs_info;
436		- struct osd_obj_id obj = {sbi->s_pid, 0};
	618	+ struct exofs_io_state *ios;
437	619	struct osd_attr attrs[] = {
438	620	ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS,
439	621	OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)),
440	622
441	623
442	624
443	625
444	626
...	...	@@ -442,32 +624,33 @@
442	624	};
443	625	uint64_t capacity = ULLONG_MAX;
444	626	uint64_t used = ULLONG_MAX;
445		- struct osd_request *or;
446	627	uint8_t cred_a[OSD_CAP_LEN];
447	628	int ret;
448	629
449		- /* get used/capacity attributes */
450		- exofs_make_credential(cred_a, &obj);
451		-
452		- or = osd_start_request(sbi->s_dev, GFP_KERNEL);
453		- if (unlikely(!or)) {
454		- EXOFS_DBGMSG("exofs_statfs: osd_start_request failed.\n");
455		- return -ENOMEM;
	630	+ ret = exofs_get_io_state(sbi, &ios);
	631	+ if (ret) {
	632	+ EXOFS_DBGMSG("exofs_get_io_state failed.\n");
	633	+ return ret;
456	634	}
457	635
458		- osd_req_get_attributes(or, &obj);
459		- osd_req_add_get_attr_list(or, attrs, ARRAY_SIZE(attrs));
460		- ret = exofs_sync_op(or, sbi->s_timeout, cred_a);
	636	+ exofs_make_credential(cred_a, &ios->obj);
	637	+ ios->cred = sbi->s_cred;
	638	+ ios->in_attr = attrs;
	639	+ ios->in_attr_len = ARRAY_SIZE(attrs);
	640	+
	641	+ ret = exofs_sbi_read(ios);
461	642	if (unlikely(ret))
462	643	goto out;
463	644
464		- ret = extract_attr_from_req(or, &attrs[0]);
465		- if (likely(!ret))
	645	+ ret = extract_attr_from_ios(ios, &attrs[0]);
	646	+ if (likely(!ret)) {
466	647	capacity = get_unaligned_be64(attrs[0].val_ptr);
467		- else
	648	+ if (unlikely(!capacity))
	649	+ capacity = ULLONG_MAX;
	650	+ } else
468	651	EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n");
469	652
470		- ret = extract_attr_from_req(or, &attrs[1]);
	653	+ ret = extract_attr_from_ios(ios, &attrs[1]);
471	654	if (likely(!ret))
472	655	used = get_unaligned_be64(attrs[1].val_ptr);
473	656	else
474	657
...	...	@@ -476,15 +659,15 @@
476	659	/* fill in the stats buffer */
477	660	buf->f_type = EXOFS_SUPER_MAGIC;
478	661	buf->f_bsize = EXOFS_BLKSIZE;
479		- buf->f_blocks = (capacity >> EXOFS_BLKSHIFT);
480		- buf->f_bfree = ((capacity - used) >> EXOFS_BLKSHIFT);
	662	+ buf->f_blocks = capacity >> 9;
	663	+ buf->f_bfree = (capacity - used) >> 9;
481	664	buf->f_bavail = buf->f_bfree;
482	665	buf->f_files = sbi->s_numfiles;
483	666	buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles;
484	667	buf->f_namelen = EXOFS_NAME_LEN;
485	668
486	669	out:
487		- osd_end_request(or);
	670	+ exofs_put_io_state(ios);
488	671	return ret;
489	672	}
490	673