Commit a5eba3f66f812cbc076a1170b3f888ad63f850b2

Authored by Linus Torvalds

Merge branch 'for-linus' of git://git.open-osd.org/linux-open-osd

* 'for-linus' of git://git.open-osd.org/linux-open-osd:
  exofs: Multi-device mirror support
  exofs: Move all operations to an io_engine
  exofs: move osd.c to ios.c
  exofs: statfs blocks is sectors not FS blocks
  exofs: Prints on mount and unmout
  exofs: refactor exofs_i_info initialization into common helper
  exofs: dbg-print less
  exofs: More sane debug print
  trivial: some small fixes in exofs documentation

Showing 10 changed files Side-by-side Diff

Documentation/filesystems/00-INDEX
... ... @@ -36,6 +36,8 @@
36 36 - info about directory notification in Linux.
37 37 ecryptfs.txt
38 38 - docs on eCryptfs: stacked cryptographic filesystem for Linux.
  39 +exofs.txt
  40 + - info, usage, mount options, design about EXOFS.
39 41 ext2.txt
40 42 - info, mount options and specifications for the Ext2 filesystem.
41 43 ext3.txt
Documentation/filesystems/exofs.txt
... ... @@ -60,13 +60,13 @@
60 60  
61 61 mkfs.exofs --pid=65536 --format /dev/osd0
62 62  
63   - The --format is optional if not specified no OSD_FORMAT will be
64   - preformed and a clean file system will be created in the specified pid,
  63 + The --format is optional. If not specified, no OSD_FORMAT will be
  64 + performed and a clean file system will be created in the specified pid,
65 65 in the available space of the target. (Use --format=size_in_meg to limit
66 66 the total LUN space available)
67 67  
68   - If pid already exist it will be deleted and a new one will be created in it's
69   - place. Be careful.
  68 + If pid already exists, it will be deleted and a new one will be created in
  69 + its place. Be careful.
70 70  
71 71 An exofs lives inside a single OSD partition. You can create multiple exofs
72 72 filesystems on the same device using multiple pids.
... ... @@ -81,7 +81,7 @@
81 81  
82 82 7. For reference (See do-exofs example script):
83 83 do-exofs start - an example of how to perform the above steps.
84   - do-exofs stop - an example of how to unmount the file system.
  84 + do-exofs stop - an example of how to unmount the file system.
85 85 do-exofs format - an example of how to format and mkfs a new exofs.
86 86  
87 87 8. Extra compilation flags (uncomment in fs/exofs/Kbuild):
... ... @@ -104,8 +104,8 @@
104 104 exofs specific options: Options are separated by commas (,)
105 105 pid=<integer> - The partition number to mount/create as
106 106 container of the filesystem.
107   - This option is mandatory
108   - to=<integer> - Timeout in ticks for a single command
  107 + This option is mandatory.
  108 + to=<integer> - Timeout in ticks for a single command.
109 109 default is (60 * HZ) [for debugging only]
110 110  
111 111 ===============================================================================
... ... @@ -116,7 +116,7 @@
116 116 with a special ID (defined in common.h).
117 117 Information included in the file system control block is used to fill the
118 118 in-memory superblock structure at mount time. This object is created before
119   - the file system is used by mkexofs.c It contains information such as:
  119 + the file system is used by mkexofs.c. It contains information such as:
120 120 - The file system's magic number
121 121 - The next inode number to be allocated
122 122  
... ... @@ -134,8 +134,8 @@
134 134 attributes. This applies to both regular files and other types (directories,
135 135 device files, symlinks, etc.).
136 136  
137   -* Credentials are generated per object (inode and superblock) when they is
138   - created in memory (read off disk or created). The credential works for all
  137 +* Credentials are generated per object (inode and superblock) when they are
  138 + created in memory (read from disk or created). The credential works for all
139 139 operations and is used as long as the object remains in memory.
140 140  
141 141 * Async OSD operations are used whenever possible, but the target may execute
... ... @@ -145,7 +145,8 @@
145 145 from executing in reverse order:
146 146 - The following are handled with the OBJ_CREATED and OBJ_2BCREATED
147 147 flags. OBJ_CREATED is set when we know the object exists on the OSD -
148   - in create's callback function, and when we successfully do a read_inode.
  148 + in create's callback function, and when we successfully do a
  149 + read_inode.
149 150 OBJ_2BCREATED is set in the beginning of the create function, so we
150 151 know that we should wait.
151 152 - create/delete: delete should wait until the object is created
... ... @@ -12,6 +12,6 @@
12 12 # Kbuild - Gets included from the Kernels Makefile and build system
13 13 #
14 14  
15   -exofs-y := osd.o inode.o file.o symlink.o namei.o dir.o super.o
  15 +exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o
16 16 obj-$(CONFIG_EXOFS_FS) += exofs.o
... ... @@ -49,6 +49,7 @@
49 49 #define EXOFS_MIN_PID 0x10000 /* Smallest partition ID */
50 50 #define EXOFS_OBJ_OFF 0x10000 /* offset for objects */
51 51 #define EXOFS_SUPER_ID 0x10000 /* object ID for on-disk superblock */
  52 +#define EXOFS_DEVTABLE_ID 0x10001 /* object ID for on-disk device table */
52 53 #define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */
53 54  
54 55 /* exofs Application specific page/attribute */
55 56  
56 57  
57 58  
58 59  
... ... @@ -78,18 +79,68 @@
78 79 #define EXOFS_SUPER_MAGIC 0x5DF5
79 80  
80 81 /*
81   - * The file system control block - stored in an object's data (mainly, the one
82   - * with ID EXOFS_SUPER_ID). This is where the in-memory superblock is stored
83   - * on disk. Right now it just has a magic value, which is basically a sanity
84   - * check on our ability to communicate with the object store.
  82 + * The file system control block - stored in object EXOFS_SUPER_ID's data.
  83 + * This is where the in-memory superblock is stored on disk.
85 84 */
  85 +enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1};
86 86 struct exofs_fscb {
87 87 __le64 s_nextid; /* Highest object ID used */
88   - __le32 s_numfiles; /* Number of files on fs */
  88 + __le64 s_numfiles; /* Number of files on fs */
  89 + __le32 s_version; /* == EXOFS_FSCB_VER */
89 90 __le16 s_magic; /* Magic signature */
90 91 __le16 s_newfs; /* Non-zero if this is a new fs */
91   -};
92 92  
  93 + /* From here on it's a static part, only written by mkexofs */
  94 + __le64 s_dev_table_oid; /* Reserved, not used */
  95 + __le64 s_dev_table_count; /* == 0 means no dev_table */
  96 +} __packed;
  97 +
  98 +/*
  99 + * Describes the raid used in the FS. It is part of the device table.
  100 + * This here is taken from the pNFS-objects definition. In exofs we
  101 + * use one raid policy throughout the filesystem. (NOTE: the funny
  102 + * alignment at the beginning. We take care of it at exofs_device_table.)
  103 + */
  104 +struct exofs_dt_data_map {
  105 + __le32 cb_num_comps;
  106 + __le64 cb_stripe_unit;
  107 + __le32 cb_group_width;
  108 + __le32 cb_group_depth;
  109 + __le32 cb_mirror_cnt;
  110 + __le32 cb_raid_algorithm;
  111 +} __packed;
  112 +
  113 +/*
  114 + * This is an osd device information descriptor. It is a single entry in
  115 + * the exofs device table. It describes an osd target lun which
  116 + * contains data belonging to this FS. (Same partition_id on all devices)
  117 + */
  118 +struct exofs_dt_device_info {
  119 + __le32 systemid_len;
  120 + u8 systemid[OSD_SYSTEMID_LEN];
  121 + __le64 long_name_offset; /* If !0 then offset-in-file */
  122 + __le32 osdname_len; /* */
  123 + u8 osdname[44]; /* Embedded, usually an ASCII uuid */
  124 +} __packed;
  125 +
  126 +/*
  127 + * The EXOFS device table - stored in object EXOFS_DEVTABLE_ID's data.
  128 + * It contains the raid used for this multi-device FS and an array of
  129 + * participating devices.
  130 + */
  131 +struct exofs_device_table {
  132 + __le32 dt_version; /* == EXOFS_DT_VER */
  133 + struct exofs_dt_data_map dt_data_map; /* Raid policy to use */
  134 +
  135 + /* Reserved space for future use. Total including this:
  136 + * (8 * sizeof(le64))
  137 + */
  138 + __le64 __Resurved[4];
  139 +
  140 + __le64 dt_num_devices; /* Array size */
  141 + struct exofs_dt_device_info dt_dev_table[]; /* Array of devices */
  142 +} __packed;
  143 +
93 144 /****************************************************************************
94 145 * inode-related things
95 146 ****************************************************************************/
... ... @@ -154,24 +205,6 @@
154 205 #define EXOFS_DIR_REC_LEN(name_len) \
155 206 (((name_len) + offsetof(struct exofs_dir_entry, name) + \
156 207 EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND)
157   -
158   -/*************************
159   - * function declarations *
160   - *************************/
161   -/* osd.c */
162   -void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
163   - const struct osd_obj_id *obj);
164   -
165   -int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid);
166   -static inline int exofs_check_ok(struct osd_request *or)
167   -{
168   - return exofs_check_ok_resid(or, NULL, NULL);
169   -}
170   -int exofs_sync_op(struct osd_request *or, int timeout, u8 *cred);
171   -int exofs_async_op(struct osd_request *or,
172   - osd_req_done_fn *async_done, void *caller_context, u8 *cred);
173   -
174   -int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr);
175 208  
176 209 #endif /*ifndef __EXOFS_COM_H__*/
... ... @@ -30,13 +30,17 @@
30 30 * along with exofs; if not, write to the Free Software
31 31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
32 32 */
  33 +#ifndef __EXOFS_H__
  34 +#define __EXOFS_H__
33 35  
34 36 #include <linux/fs.h>
35 37 #include <linux/time.h>
36 38 #include "common.h"
37 39  
38   -#ifndef __EXOFS_H__
39   -#define __EXOFS_H__
  40 +/* FIXME: Remove once pnfs hits mainline
  41 + * #include <linux/exportfs/pnfs_osd_xdr.h>
  42 + */
  43 +#include "pnfs.h"
40 44  
41 45 #define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a)
42 46  
... ... @@ -55,7 +59,7 @@
55 59 * our extension to the in-memory superblock
56 60 */
57 61 struct exofs_sb_info {
58   - struct osd_dev *s_dev; /* returned by get_osd_dev */
  62 + struct exofs_fscb s_fscb; /* Written often, pre-allocate*/
59 63 osd_id s_pid; /* partition ID of file system*/
60 64 int s_timeout; /* timeout for OSD operations */
61 65 uint64_t s_nextid; /* highest object ID used */
... ... @@ -63,7 +67,11 @@
63 67 spinlock_t s_next_gen_lock; /* spinlock for gen # update */
64 68 u32 s_next_generation; /* next gen # to use */
65 69 atomic_t s_curr_pending; /* number of pending commands */
66   - uint8_t s_cred[OSD_CAP_LEN]; /* all-powerful credential */
  70 + uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */
  71 +
  72 + struct pnfs_osd_data_map data_map; /* Default raid to use */
  73 + unsigned s_numdevs; /* Num of devices in array */
  74 + struct osd_dev *s_ods[1]; /* Variable length, minimum 1 */
67 75 };
68 76  
69 77 /*
... ... @@ -79,6 +87,50 @@
79 87 struct inode vfs_inode; /* normal in-memory inode */
80 88 };
81 89  
  90 +static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
  91 +{
  92 + return oi->vfs_inode.i_ino + EXOFS_OBJ_OFF;
  93 +}
  94 +
  95 +struct exofs_io_state;
  96 +typedef void (*exofs_io_done_fn)(struct exofs_io_state *or, void *private);
  97 +
  98 +struct exofs_io_state {
  99 + struct kref kref;
  100 +
  101 + void *private;
  102 + exofs_io_done_fn done;
  103 +
  104 + struct exofs_sb_info *sbi;
  105 + struct osd_obj_id obj;
  106 + u8 *cred;
  107 +
  108 + /* Global read/write IO*/
  109 + loff_t offset;
  110 + unsigned long length;
  111 + void *kern_buff;
  112 + struct bio *bio;
  113 +
  114 + /* Attributes */
  115 + unsigned in_attr_len;
  116 + struct osd_attr *in_attr;
  117 + unsigned out_attr_len;
  118 + struct osd_attr *out_attr;
  119 +
  120 + /* Variable array of size numdevs */
  121 + unsigned numdevs;
  122 + struct exofs_per_dev_state {
  123 + struct osd_request *or;
  124 + struct bio *bio;
  125 + } per_dev[];
  126 +};
  127 +
  128 +static inline unsigned exofs_io_state_size(unsigned numdevs)
  129 +{
  130 + return sizeof(struct exofs_io_state) +
  131 + sizeof(struct exofs_per_dev_state) * numdevs;
  132 +}
  133 +
82 134 /*
83 135 * our inode flags
84 136 */
... ... @@ -130,6 +182,42 @@
130 182 /*************************
131 183 * function declarations *
132 184 *************************/
  185 +
  186 +/* ios.c */
  187 +void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
  188 + const struct osd_obj_id *obj);
  189 +int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
  190 + u64 offset, void *p, unsigned length);
  191 +
  192 +int exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** ios);
  193 +void exofs_put_io_state(struct exofs_io_state *ios);
  194 +
  195 +int exofs_check_io(struct exofs_io_state *ios, u64 *resid);
  196 +
  197 +int exofs_sbi_create(struct exofs_io_state *ios);
  198 +int exofs_sbi_remove(struct exofs_io_state *ios);
  199 +int exofs_sbi_write(struct exofs_io_state *ios);
  200 +int exofs_sbi_read(struct exofs_io_state *ios);
  201 +
  202 +int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr);
  203 +
  204 +int exofs_oi_truncate(struct exofs_i_info *oi, u64 new_len);
  205 +static inline int exofs_oi_write(struct exofs_i_info *oi,
  206 + struct exofs_io_state *ios)
  207 +{
  208 + ios->obj.id = exofs_oi_objno(oi);
  209 + ios->cred = oi->i_cred;
  210 + return exofs_sbi_write(ios);
  211 +}
  212 +
  213 +static inline int exofs_oi_read(struct exofs_i_info *oi,
  214 + struct exofs_io_state *ios)
  215 +{
  216 + ios->obj.id = exofs_oi_objno(oi);
  217 + ios->cred = oi->i_cred;
  218 + return exofs_sbi_read(ios);
  219 +}
  220 +
133 221 /* inode.c */
134 222 void exofs_truncate(struct inode *inode);
135 223 int exofs_setattr(struct dentry *, struct iattr *);
... ... @@ -169,6 +257,7 @@
169 257  
170 258 /* inode.c */
171 259 extern const struct address_space_operations exofs_aops;
  260 +extern const struct osd_attr g_attr_logical_length;
172 261  
173 262 /* namei.c */
174 263 extern const struct inode_operations exofs_dir_inode_operations;
... ... @@ -37,15 +37,18 @@
37 37  
38 38 #include "exofs.h"
39 39  
40   -#ifdef CONFIG_EXOFS_DEBUG
41   -# define EXOFS_DEBUG_OBJ_ISIZE 1
42   -#endif
  40 +#define EXOFS_DBGMSG2(M...) do {} while (0)
43 41  
  42 +enum { BIO_MAX_PAGES_KMALLOC =
  43 + (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
  44 +};
  45 +
44 46 struct page_collect {
45 47 struct exofs_sb_info *sbi;
46 48 struct request_queue *req_q;
47 49 struct inode *inode;
48 50 unsigned expected_pages;
  51 + struct exofs_io_state *ios;
49 52  
50 53 struct bio *bio;
51 54 unsigned nr_pages;
52 55  
53 56  
54 57  
... ... @@ -54,22 +57,23 @@
54 57 };
55 58  
56 59 static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
57   - struct inode *inode)
  60 + struct inode *inode)
58 61 {
59 62 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
60 63  
61 64 pcol->sbi = sbi;
62   - pcol->req_q = osd_request_queue(sbi->s_dev);
  65 + /* Create master bios on first Q, later on cloning, each clone will be
  66 + * allocated on its destination Q
  67 + */
  68 + pcol->req_q = osd_request_queue(sbi->s_ods[0]);
63 69 pcol->inode = inode;
64 70 pcol->expected_pages = expected_pages;
65 71  
  72 + pcol->ios = NULL;
66 73 pcol->bio = NULL;
67 74 pcol->nr_pages = 0;
68 75 pcol->length = 0;
69 76 pcol->pg_first = -1;
70   -
71   - EXOFS_DBGMSG("_pcol_init ino=0x%lx expected_pages=%u\n", inode->i_ino,
72   - expected_pages);
73 77 }
74 78  
75 79 static void _pcol_reset(struct page_collect *pcol)
76 80  
77 81  
78 82  
79 83  
80 84  
81 85  
... ... @@ -80,35 +84,49 @@
80 84 pcol->nr_pages = 0;
81 85 pcol->length = 0;
82 86 pcol->pg_first = -1;
83   - EXOFS_DBGMSG("_pcol_reset ino=0x%lx expected_pages=%u\n",
84   - pcol->inode->i_ino, pcol->expected_pages);
  87 + pcol->ios = NULL;
85 88  
86 89 /* this is probably the end of the loop but in writes
87 90 * it might not end here. don't be left with nothing
88 91 */
89 92 if (!pcol->expected_pages)
90   - pcol->expected_pages = 128;
  93 + pcol->expected_pages = BIO_MAX_PAGES_KMALLOC;
91 94 }
92 95  
93 96 static int pcol_try_alloc(struct page_collect *pcol)
94 97 {
95   - int pages = min_t(unsigned, pcol->expected_pages, BIO_MAX_PAGES);
  98 + int pages = min_t(unsigned, pcol->expected_pages,
  99 + BIO_MAX_PAGES_KMALLOC);
96 100  
  101 + if (!pcol->ios) { /* First time allocate io_state */
  102 + int ret = exofs_get_io_state(pcol->sbi, &pcol->ios);
  103 +
  104 + if (ret)
  105 + return ret;
  106 + }
  107 +
97 108 for (; pages; pages >>= 1) {
98   - pcol->bio = bio_alloc(GFP_KERNEL, pages);
  109 + pcol->bio = bio_kmalloc(GFP_KERNEL, pages);
99 110 if (likely(pcol->bio))
100 111 return 0;
101 112 }
102 113  
103   - EXOFS_ERR("Failed to kcalloc expected_pages=%u\n",
  114 + EXOFS_ERR("Failed to bio_kmalloc expected_pages=%u\n",
104 115 pcol->expected_pages);
105 116 return -ENOMEM;
106 117 }
107 118  
108 119 static void pcol_free(struct page_collect *pcol)
109 120 {
110   - bio_put(pcol->bio);
111   - pcol->bio = NULL;
  121 + if (pcol->bio) {
  122 + bio_put(pcol->bio);
  123 + pcol->bio = NULL;
  124 + }
  125 +
  126 + if (pcol->ios) {
  127 + exofs_put_io_state(pcol->ios);
  128 + pcol->ios = NULL;
  129 + }
112 130 }
113 131  
114 132 static int pcol_add_page(struct page_collect *pcol, struct page *page,
115 133  
116 134  
117 135  
... ... @@ -161,22 +179,17 @@
161 179 /* Called at the end of reads, to optionally unlock pages and update their
162 180 * status.
163 181 */
164   -static int __readpages_done(struct osd_request *or, struct page_collect *pcol,
165   - bool do_unlock)
  182 +static int __readpages_done(struct page_collect *pcol, bool do_unlock)
166 183 {
167 184 struct bio_vec *bvec;
168 185 int i;
169 186 u64 resid;
170 187 u64 good_bytes;
171 188 u64 length = 0;
172   - int ret = exofs_check_ok_resid(or, &resid, NULL);
  189 + int ret = exofs_check_io(pcol->ios, &resid);
173 190  
174   - osd_end_request(or);
175   -
176 191 if (likely(!ret))
177 192 good_bytes = pcol->length;
178   - else if (!resid)
179   - good_bytes = 0;
180 193 else
181 194 good_bytes = pcol->length - resid;
182 195  
... ... @@ -198,7 +211,7 @@
198 211 else
199 212 page_stat = ret;
200 213  
201   - EXOFS_DBGMSG(" readpages_done(0x%lx, 0x%lx) %s\n",
  214 + EXOFS_DBGMSG2(" readpages_done(0x%lx, 0x%lx) %s\n",
202 215 inode->i_ino, page->index,
203 216 page_stat ? "bad_bytes" : "good_bytes");
204 217  
205 218  
206 219  
... ... @@ -214,13 +227,13 @@
214 227 }
215 228  
216 229 /* callback of async reads */
217   -static void readpages_done(struct osd_request *or, void *p)
  230 +static void readpages_done(struct exofs_io_state *ios, void *p)
218 231 {
219 232 struct page_collect *pcol = p;
220 233  
221   - __readpages_done(or, pcol, true);
  234 + __readpages_done(pcol, true);
222 235 atomic_dec(&pcol->sbi->s_curr_pending);
223   - kfree(p);
  236 + kfree(pcol);
224 237 }
225 238  
226 239 static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
227 240  
228 241  
... ... @@ -238,17 +251,13 @@
238 251  
239 252 unlock_page(page);
240 253 }
241   - pcol_free(pcol);
242 254 }
243 255  
244 256 static int read_exec(struct page_collect *pcol, bool is_sync)
245 257 {
246 258 struct exofs_i_info *oi = exofs_i(pcol->inode);
247   - struct osd_obj_id obj = {pcol->sbi->s_pid,
248   - pcol->inode->i_ino + EXOFS_OBJ_OFF};
249   - struct osd_request *or = NULL;
  259 + struct exofs_io_state *ios = pcol->ios;
250 260 struct page_collect *pcol_copy = NULL;
251   - loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
252 261 int ret;
253 262  
254 263 if (!pcol->bio)
255 264  
256 265  
... ... @@ -257,17 +266,13 @@
257 266 /* see comment in _readpage() about sync reads */
258 267 WARN_ON(is_sync && (pcol->nr_pages != 1));
259 268  
260   - or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
261   - if (unlikely(!or)) {
262   - ret = -ENOMEM;
263   - goto err;
264   - }
  269 + ios->bio = pcol->bio;
  270 + ios->length = pcol->length;
  271 + ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT;
265 272  
266   - osd_req_read(or, &obj, i_start, pcol->bio, pcol->length);
267   -
268 273 if (is_sync) {
269   - exofs_sync_op(or, pcol->sbi->s_timeout, oi->i_cred);
270   - return __readpages_done(or, pcol, false);
  274 + exofs_oi_read(oi, pcol->ios);
  275 + return __readpages_done(pcol, false);
271 276 }
272 277  
273 278 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
274 279  
... ... @@ -277,14 +282,16 @@
277 282 }
278 283  
279 284 *pcol_copy = *pcol;
280   - ret = exofs_async_op(or, readpages_done, pcol_copy, oi->i_cred);
  285 + ios->done = readpages_done;
  286 + ios->private = pcol_copy;
  287 + ret = exofs_oi_read(oi, ios);
281 288 if (unlikely(ret))
282 289 goto err;
283 290  
284 291 atomic_inc(&pcol->sbi->s_curr_pending);
285 292  
286 293 EXOFS_DBGMSG("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
287   - obj.id, _LLU(i_start), pcol->length);
  294 + ios->obj.id, _LLU(ios->offset), pcol->length);
288 295  
289 296 /* pages ownership was passed to pcol_copy */
290 297 _pcol_reset(pcol);
291 298  
292 299  
... ... @@ -293,12 +300,10 @@
293 300 err:
294 301 if (!is_sync)
295 302 _unlock_pcol_pages(pcol, ret, READ);
296   - else /* Pages unlocked by caller in sync mode only free bio */
297   - pcol_free(pcol);
298 303  
  304 + pcol_free(pcol);
  305 +
299 306 kfree(pcol_copy);
300   - if (or)
301   - osd_end_request(or);
302 307 return ret;
303 308 }
304 309  
305 310  
... ... @@ -370,12 +375,12 @@
370 375 if (len != PAGE_CACHE_SIZE)
371 376 zero_user(page, len, PAGE_CACHE_SIZE - len);
372 377  
373   - EXOFS_DBGMSG(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n",
  378 + EXOFS_DBGMSG2(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n",
374 379 inode->i_ino, page->index, len);
375 380  
376 381 ret = pcol_add_page(pcol, page, len);
377 382 if (ret) {
378   - EXOFS_DBGMSG("Failed pcol_add_page pages[i]=%p "
  383 + EXOFS_DBGMSG2("Failed pcol_add_page pages[i]=%p "
379 384 "this_len=0x%zx nr_pages=%u length=0x%lx\n",
380 385 page, len, pcol->nr_pages, pcol->length);
381 386  
... ... @@ -419,9 +424,8 @@
419 424  
420 425 _pcol_init(&pcol, 1, page->mapping->host);
421 426  
422   - /* readpage_strip might call read_exec(,async) inside at several places
423   - * but this is safe for is_async=0 since read_exec will not do anything
424   - * when we have a single page.
  427 + /* readpage_strip might call read_exec(,is_sync==false) at several
  428 + * places but not if we have a single page.
425 429 */
426 430 ret = readpage_strip(&pcol, page);
427 431 if (ret) {
... ... @@ -440,8 +444,8 @@
440 444 return _readpage(page, false);
441 445 }
442 446  
443   -/* Callback for osd_write. All writes are asynchronouse */
444   -static void writepages_done(struct osd_request *or, void *p)
  447 +/* Callback for osd_write. All writes are asynchronous */
  448 +static void writepages_done(struct exofs_io_state *ios, void *p)
445 449 {
446 450 struct page_collect *pcol = p;
447 451 struct bio_vec *bvec;
448 452  
449 453  
... ... @@ -449,16 +453,12 @@
449 453 u64 resid;
450 454 u64 good_bytes;
451 455 u64 length = 0;
  456 + int ret = exofs_check_io(ios, &resid);
452 457  
453   - int ret = exofs_check_ok_resid(or, NULL, &resid);
454   -
455   - osd_end_request(or);
456 458 atomic_dec(&pcol->sbi->s_curr_pending);
457 459  
458 460 if (likely(!ret))
459 461 good_bytes = pcol->length;
460   - else if (!resid)
461   - good_bytes = 0;
462 462 else
463 463 good_bytes = pcol->length - resid;
464 464  
... ... @@ -482,7 +482,7 @@
482 482  
483 483 update_write_page(page, page_stat);
484 484 unlock_page(page);
485   - EXOFS_DBGMSG(" writepages_done(0x%lx, 0x%lx) status=%d\n",
  485 + EXOFS_DBGMSG2(" writepages_done(0x%lx, 0x%lx) status=%d\n",
486 486 inode->i_ino, page->index, page_stat);
487 487  
488 488 length += bvec->bv_len;
489 489  
490 490  
... ... @@ -496,23 +496,13 @@
496 496 static int write_exec(struct page_collect *pcol)
497 497 {
498 498 struct exofs_i_info *oi = exofs_i(pcol->inode);
499   - struct osd_obj_id obj = {pcol->sbi->s_pid,
500   - pcol->inode->i_ino + EXOFS_OBJ_OFF};
501   - struct osd_request *or = NULL;
  499 + struct exofs_io_state *ios = pcol->ios;
502 500 struct page_collect *pcol_copy = NULL;
503   - loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
504 501 int ret;
505 502  
506 503 if (!pcol->bio)
507 504 return 0;
508 505  
509   - or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
510   - if (unlikely(!or)) {
511   - EXOFS_ERR("write_exec: Faild to osd_start_request()\n");
512   - ret = -ENOMEM;
513   - goto err;
514   - }
515   -
516 506 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
517 507 if (!pcol_copy) {
518 508 EXOFS_ERR("write_exec: Faild to kmalloc(pcol)\n");
519 509  
520 510  
... ... @@ -523,16 +513,22 @@
523 513 *pcol_copy = *pcol;
524 514  
525 515 pcol_copy->bio->bi_rw |= (1 << BIO_RW); /* FIXME: bio_set_dir() */
526   - osd_req_write(or, &obj, i_start, pcol_copy->bio, pcol_copy->length);
527   - ret = exofs_async_op(or, writepages_done, pcol_copy, oi->i_cred);
  516 +
  517 + ios->bio = pcol_copy->bio;
  518 + ios->offset = pcol_copy->pg_first << PAGE_CACHE_SHIFT;
  519 + ios->length = pcol_copy->length;
  520 + ios->done = writepages_done;
  521 + ios->private = pcol_copy;
  522 +
  523 + ret = exofs_oi_write(oi, ios);
528 524 if (unlikely(ret)) {
529   - EXOFS_ERR("write_exec: exofs_async_op() Faild\n");
  525 + EXOFS_ERR("write_exec: exofs_oi_write() Faild\n");
530 526 goto err;
531 527 }
532 528  
533 529 atomic_inc(&pcol->sbi->s_curr_pending);
534 530 EXOFS_DBGMSG("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
535   - pcol->inode->i_ino, pcol->pg_first, _LLU(i_start),
  531 + pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset),
536 532 pcol->length);
537 533 /* pages ownership was passed to pcol_copy */
538 534 _pcol_reset(pcol);
539 535  
... ... @@ -540,9 +536,9 @@
540 536  
541 537 err:
542 538 _unlock_pcol_pages(pcol, ret, WRITE);
  539 + pcol_free(pcol);
543 540 kfree(pcol_copy);
544   - if (or)
545   - osd_end_request(or);
  541 +
546 542 return ret;
547 543 }
548 544  
... ... @@ -586,6 +582,9 @@
586 582 if (PageError(page))
587 583 ClearPageError(page);
588 584 unlock_page(page);
  585 + EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) "
  586 + "outside the limits\n",
  587 + inode->i_ino, page->index);
589 588 return 0;
590 589 }
591 590 }
... ... @@ -600,6 +599,9 @@
600 599 ret = write_exec(pcol);
601 600 if (unlikely(ret))
602 601 goto fail;
  602 +
  603 + EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) Discontinuity\n",
  604 + inode->i_ino, page->index);
603 605 goto try_again;
604 606 }
605 607  
... ... @@ -609,7 +611,7 @@
609 611 goto fail;
610 612 }
611 613  
612   - EXOFS_DBGMSG(" writepage_strip(0x%lx, 0x%lx) len=0x%zx\n",
  614 + EXOFS_DBGMSG2(" writepage_strip(0x%lx, 0x%lx) len=0x%zx\n",
613 615 inode->i_ino, page->index, len);
614 616  
615 617 ret = pcol_add_page(pcol, page, len);
... ... @@ -634,6 +636,8 @@
634 636 return 0;
635 637  
636 638 fail:
  639 + EXOFS_DBGMSG("Error: writepage_strip(0x%lx, 0x%lx)=>%d\n",
  640 + inode->i_ino, page->index, ret);
637 641 set_bit(AS_EIO, &page->mapping->flags);
638 642 unlock_page(page);
639 643 return ret;
640 644  
641 645  
... ... @@ -652,14 +656,17 @@
652 656 wbc->range_end >> PAGE_CACHE_SHIFT;
653 657  
654 658 if (start || end)
655   - expected_pages = min(end - start + 1, 32L);
  659 + expected_pages = end - start + 1;
656 660 else
657 661 expected_pages = mapping->nrpages;
658 662  
659   - EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx"
660   - " m->nrpages=%lu start=0x%lx end=0x%lx\n",
  663 + if (expected_pages < 32L)
  664 + expected_pages = 32L;
  665 +
  666 + EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx "
  667 + "nrpages=%lu start=0x%lx end=0x%lx expected_pages=%ld\n",
661 668 mapping->host->i_ino, wbc->range_start, wbc->range_end,
662   - mapping->nrpages, start, end);
  669 + mapping->nrpages, start, end, expected_pages);
663 670  
664 671 _pcol_init(&pcol, expected_pages, mapping->host);
665 672  
666 673  
667 674  
... ... @@ -771,19 +778,28 @@
771 778 const struct osd_attr g_attr_logical_length = ATTR_DEF(
772 779 OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
773 780  
  781 +static int _do_truncate(struct inode *inode)
  782 +{
  783 + struct exofs_i_info *oi = exofs_i(inode);
  784 + loff_t isize = i_size_read(inode);
  785 + int ret;
  786 +
  787 + inode->i_mtime = inode->i_ctime = CURRENT_TIME;
  788 +
  789 + nobh_truncate_page(inode->i_mapping, isize, exofs_get_block);
  790 +
  791 + ret = exofs_oi_truncate(oi, (u64)isize);
  792 + EXOFS_DBGMSG("(0x%lx) size=0x%llx\n", inode->i_ino, isize);
  793 + return ret;
  794 +}
  795 +
774 796 /*
775 797 * Truncate a file to the specified size - all we have to do is set the size
776 798 * attribute. We make sure the object exists first.
777 799 */
778 800 void exofs_truncate(struct inode *inode)
779 801 {
780   - struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
781 802 struct exofs_i_info *oi = exofs_i(inode);
782   - struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
783   - struct osd_request *or;
784   - struct osd_attr attr;
785   - loff_t isize = i_size_read(inode);
786   - __be64 newsize;
787 803 int ret;
788 804  
789 805 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
790 806  
791 807  
... ... @@ -793,31 +809,14 @@
793 809 return;
794 810 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
795 811 return;
796   - inode->i_mtime = inode->i_ctime = CURRENT_TIME;
797 812  
798   - nobh_truncate_page(inode->i_mapping, isize, exofs_get_block);
799   -
800   - or = osd_start_request(sbi->s_dev, GFP_KERNEL);
801   - if (unlikely(!or)) {
802   - EXOFS_ERR("ERROR: exofs_truncate: osd_start_request failed\n");
803   - goto fail;
804   - }
805   -
806   - osd_req_set_attributes(or, &obj);
807   -
808   - newsize = cpu_to_be64((u64)isize);
809   - attr = g_attr_logical_length;
810   - attr.val_ptr = &newsize;
811   - osd_req_add_set_attr_list(or, &attr, 1);
812   -
813 813 /* if we are about to truncate an object, and it hasn't been
814 814 * created yet, wait
815 815 */
816 816 if (unlikely(wait_obj_created(oi)))
817 817 goto fail;
818 818  
819   - ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
820   - osd_end_request(or);
  819 + ret = _do_truncate(inode);
821 820 if (ret)
822 821 goto fail;
823 822  
824 823  
825 824  
826 825  
827 826  
828 827  
829 828  
830 829  
831 830  
832 831  
833 832  
834 833  
835 834  
836 835  
837 836  
838 837  
... ... @@ -847,65 +846,62 @@
847 846  
848 847 /*
849 848 * Read an inode from the OSD, and return it as is. We also return the size
850   - * attribute in the 'sanity' argument if we got compiled with debugging turned
851   - * on.
  849 + * attribute in the 'obj_size' argument.
852 850 */
853 851 static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
854   - struct exofs_fcb *inode, uint64_t *sanity)
  852 + struct exofs_fcb *inode, uint64_t *obj_size)
855 853 {
856 854 struct exofs_sb_info *sbi = sb->s_fs_info;
857   - struct osd_request *or;
858   - struct osd_attr attr;
859   - struct osd_obj_id obj = {sbi->s_pid,
860   - oi->vfs_inode.i_ino + EXOFS_OBJ_OFF};
  855 + struct osd_attr attrs[2];
  856 + struct exofs_io_state *ios;
861 857 int ret;
862 858  
863   - exofs_make_credential(oi->i_cred, &obj);
864   -
865   - or = osd_start_request(sbi->s_dev, GFP_KERNEL);
866   - if (unlikely(!or)) {
867   - EXOFS_ERR("exofs_get_inode: osd_start_request failed.\n");
868   - return -ENOMEM;
  859 + *obj_size = ~0;
  860 + ret = exofs_get_io_state(sbi, &ios);
  861 + if (unlikely(ret)) {
  862 + EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
  863 + return ret;
869 864 }
870   - osd_req_get_attributes(or, &obj);
871 865  
872   - /* we need the inode attribute */
873   - osd_req_add_get_attr_list(or, &g_attr_inode_data, 1);
  866 + ios->obj.id = exofs_oi_objno(oi);
  867 + exofs_make_credential(oi->i_cred, &ios->obj);
  868 + ios->cred = oi->i_cred;
874 869  
875   -#ifdef EXOFS_DEBUG_OBJ_ISIZE
876   - /* we get the size attributes to do a sanity check */
877   - osd_req_add_get_attr_list(or, &g_attr_logical_length, 1);
878   -#endif
  870 + attrs[0] = g_attr_inode_data;
  871 + attrs[1] = g_attr_logical_length;
  872 + ios->in_attr = attrs;
  873 + ios->in_attr_len = ARRAY_SIZE(attrs);
879 874  
880   - ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
  875 + ret = exofs_sbi_read(ios);
881 876 if (ret)
882 877 goto out;
883 878  
884   - attr = g_attr_inode_data;
885   - ret = extract_attr_from_req(or, &attr);
  879 + ret = extract_attr_from_ios(ios, &attrs[0]);
886 880 if (ret) {
887   - EXOFS_ERR("exofs_get_inode: extract_attr_from_req failed\n");
  881 + EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__);
888 882 goto out;
889 883 }
  884 + WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE);
  885 + memcpy(inode, attrs[0].val_ptr, EXOFS_INO_ATTR_SIZE);
890 886  
891   - WARN_ON(attr.len != EXOFS_INO_ATTR_SIZE);
892   - memcpy(inode, attr.val_ptr, EXOFS_INO_ATTR_SIZE);
893   -
894   -#ifdef EXOFS_DEBUG_OBJ_ISIZE
895   - attr = g_attr_logical_length;
896   - ret = extract_attr_from_req(or, &attr);
  887 + ret = extract_attr_from_ios(ios, &attrs[1]);
897 888 if (ret) {
898   - EXOFS_ERR("ERROR: extract attr from or failed\n");
  889 + EXOFS_ERR("%s: extract_attr of logical_length failed\n",
  890 + __func__);
899 891 goto out;
900 892 }
901   - *sanity = get_unaligned_be64(attr.val_ptr);
902   -#endif
  893 + *obj_size = get_unaligned_be64(attrs[1].val_ptr);
903 894  
904 895 out:
905   - osd_end_request(or);
  896 + exofs_put_io_state(ios);
906 897 return ret;
907 898 }
908 899  
  900 +static void __oi_init(struct exofs_i_info *oi)
  901 +{
  902 + init_waitqueue_head(&oi->i_wq);
  903 + oi->i_flags = 0;
  904 +}
909 905 /*
910 906 * Fill in an inode read from the OSD and set it up for use
911 907 */
... ... @@ -914,7 +910,7 @@
914 910 struct exofs_i_info *oi;
915 911 struct exofs_fcb fcb;
916 912 struct inode *inode;
917   - uint64_t uninitialized_var(sanity);
  913 + uint64_t obj_size;
918 914 int ret;
919 915  
920 916 inode = iget_locked(sb, ino);
921 917  
922 918  
... ... @@ -923,13 +919,13 @@
923 919 if (!(inode->i_state & I_NEW))
924 920 return inode;
925 921 oi = exofs_i(inode);
  922 + __oi_init(oi);
926 923  
927 924 /* read the inode from the osd */
928   - ret = exofs_get_inode(sb, oi, &fcb, &sanity);
  925 + ret = exofs_get_inode(sb, oi, &fcb, &obj_size);
929 926 if (ret)
930 927 goto bad_inode;
931 928  
932   - init_waitqueue_head(&oi->i_wq);
933 929 set_obj_created(oi);
934 930  
935 931 /* copy stuff from on-disk struct to in-memory struct */
936 932  
937 933  
... ... @@ -947,14 +943,12 @@
947 943 inode->i_blkbits = EXOFS_BLKSHIFT;
948 944 inode->i_generation = le32_to_cpu(fcb.i_generation);
949 945  
950   -#ifdef EXOFS_DEBUG_OBJ_ISIZE
951   - if ((inode->i_size != sanity) &&
  946 + if ((inode->i_size != obj_size) &&
952 947 (!exofs_inode_is_fast_symlink(inode))) {
953   - EXOFS_ERR("WARNING: Size of object from inode and "
954   - "attributes differ (%lld != %llu)\n",
955   - inode->i_size, _LLU(sanity));
  948 + EXOFS_ERR("WARNING: Size of inode=%llu != object=%llu\n",
  949 + inode->i_size, _LLU(obj_size));
  950 + /* FIXME: call exofs_inode_recovery() */
956 951 }
957   -#endif
958 952  
959 953 oi->i_dir_start_lookup = 0;
960 954  
961 955  
962 956  
963 957  
... ... @@ -1020,24 +1014,31 @@
1020 1014 * set the obj_created flag so that other methods know that the object exists on
1021 1015 * the OSD.
1022 1016 */
1023   -static void create_done(struct osd_request *or, void *p)
  1017 +static void create_done(struct exofs_io_state *ios, void *p)
1024 1018 {
1025 1019 struct inode *inode = p;
1026 1020 struct exofs_i_info *oi = exofs_i(inode);
1027 1021 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
1028 1022 int ret;
1029 1023  
1030   - ret = exofs_check_ok(or);
1031   - osd_end_request(or);
  1024 + ret = exofs_check_io(ios, NULL);
  1025 + exofs_put_io_state(ios);
  1026 +
1032 1027 atomic_dec(&sbi->s_curr_pending);
1033 1028  
1034 1029 if (unlikely(ret)) {
1035 1030 EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx",
1036   - _LLU(sbi->s_pid), _LLU(inode->i_ino + EXOFS_OBJ_OFF));
1037   - make_bad_inode(inode);
1038   - } else
1039   - set_obj_created(oi);
  1031 + _LLU(exofs_oi_objno(oi)), _LLU(sbi->s_pid));
  1032 + /* TODO: When the FS is corrupted, creation can fail because the
  1033 + * object already exists. Get rid of this asynchronous creation:
  1034 + * if the object exists, increment the obj counter and try the next
  1035 + * object until we succeed. All these dangling objects will be made
  1036 + * into lost files by chkfs.exofs
  1037 + */
  1038 + }
1040 1039  
  1040 + set_obj_created(oi);
  1041 +
1041 1042 atomic_dec(&inode->i_count);
1042 1043 wake_up(&oi->i_wq);
1043 1044 }
... ... @@ -1051,8 +1052,7 @@
1051 1052 struct inode *inode;
1052 1053 struct exofs_i_info *oi;
1053 1054 struct exofs_sb_info *sbi;
1054   - struct osd_request *or;
1055   - struct osd_obj_id obj;
  1055 + struct exofs_io_state *ios;
1056 1056 int ret;
1057 1057  
1058 1058 sb = dir->i_sb;
1059 1059  
... ... @@ -1061,8 +1061,8 @@
1061 1061 return ERR_PTR(-ENOMEM);
1062 1062  
1063 1063 oi = exofs_i(inode);
  1064 + __oi_init(oi);
1064 1065  
1065   - init_waitqueue_head(&oi->i_wq);
1066 1066 set_obj_2bcreated(oi);
1067 1067  
1068 1068 sbi = sb->s_fs_info;
1069 1069  
1070 1070  
1071 1071  
... ... @@ -1089,28 +1089,28 @@
1089 1089  
1090 1090 mark_inode_dirty(inode);
1091 1091  
1092   - obj.partition = sbi->s_pid;
1093   - obj.id = inode->i_ino + EXOFS_OBJ_OFF;
1094   - exofs_make_credential(oi->i_cred, &obj);
1095   -
1096   - or = osd_start_request(sbi->s_dev, GFP_KERNEL);
1097   - if (unlikely(!or)) {
1098   - EXOFS_ERR("exofs_new_inode: osd_start_request failed\n");
1099   - return ERR_PTR(-ENOMEM);
  1092 + ret = exofs_get_io_state(sbi, &ios);
  1093 + if (unlikely(ret)) {
  1094 + EXOFS_ERR("exofs_new_inode: exofs_get_io_state failed\n");
  1095 + return ERR_PTR(ret);
1100 1096 }
1101 1097  
1102   - osd_req_create_object(or, &obj);
  1098 + ios->obj.id = exofs_oi_objno(oi);
  1099 + exofs_make_credential(oi->i_cred, &ios->obj);
1103 1100  
1104 1101 /* increment the refcount so that the inode will still be around when we
1105 1102 * reach the callback
1106 1103 */
1107 1104 atomic_inc(&inode->i_count);
1108 1105  
1109   - ret = exofs_async_op(or, create_done, inode, oi->i_cred);
  1106 + ios->done = create_done;
  1107 + ios->private = inode;
  1108 + ios->cred = oi->i_cred;
  1109 + ret = exofs_sbi_create(ios);
1110 1110 if (ret) {
1111 1111 atomic_dec(&inode->i_count);
1112   - osd_end_request(or);
1113   - return ERR_PTR(-EIO);
  1112 + exofs_put_io_state(ios);
  1113 + return ERR_PTR(ret);
1114 1114 }
1115 1115 atomic_inc(&sbi->s_curr_pending);
1116 1116  
1117 1117  
... ... @@ -1128,11 +1128,11 @@
1128 1128 /*
1129 1129 * Callback function from exofs_update_inode().
1130 1130 */
1131   -static void updatei_done(struct osd_request *or, void *p)
  1131 +static void updatei_done(struct exofs_io_state *ios, void *p)
1132 1132 {
1133 1133 struct updatei_args *args = p;
1134 1134  
1135   - osd_end_request(or);
  1135 + exofs_put_io_state(ios);
1136 1136  
1137 1137 atomic_dec(&args->sbi->s_curr_pending);
1138 1138  
... ... @@ -1148,8 +1148,7 @@
1148 1148 struct exofs_i_info *oi = exofs_i(inode);
1149 1149 struct super_block *sb = inode->i_sb;
1150 1150 struct exofs_sb_info *sbi = sb->s_fs_info;
1151   - struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
1152   - struct osd_request *or;
  1151 + struct exofs_io_state *ios;
1153 1152 struct osd_attr attr;
1154 1153 struct exofs_fcb *fcb;
1155 1154 struct updatei_args *args;
1156 1155  
1157 1156  
... ... @@ -1186,18 +1185,16 @@
1186 1185 } else
1187 1186 memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
1188 1187  
1189   - or = osd_start_request(sbi->s_dev, GFP_KERNEL);
1190   - if (unlikely(!or)) {
1191   - EXOFS_ERR("exofs_update_inode: osd_start_request failed.\n");
1192   - ret = -ENOMEM;
  1188 + ret = exofs_get_io_state(sbi, &ios);
  1189 + if (unlikely(ret)) {
  1190 + EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
1193 1191 goto free_args;
1194 1192 }
1195 1193  
1196   - osd_req_set_attributes(or, &obj);
1197   -
1198 1194 attr = g_attr_inode_data;
1199 1195 attr.val_ptr = fcb;
1200   - osd_req_add_set_attr_list(or, &attr, 1);
  1196 + ios->out_attr_len = 1;
  1197 + ios->out_attr = &attr;
1201 1198  
1202 1199 if (!obj_created(oi)) {
1203 1200 EXOFS_DBGMSG("!obj_created\n");
1204 1201  
1205 1202  
1206 1203  
... ... @@ -1206,22 +1203,19 @@
1206 1203 EXOFS_DBGMSG("wait_event done\n");
1207 1204 }
1208 1205  
1209   - if (do_sync) {
1210   - ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
1211   - osd_end_request(or);
1212   - goto free_args;
1213   - } else {
  1206 + if (!do_sync) {
1214 1207 args->sbi = sbi;
  1208 + ios->done = updatei_done;
  1209 + ios->private = args;
  1210 + }
1215 1211  
1216   - ret = exofs_async_op(or, updatei_done, args, oi->i_cred);
1217   - if (ret) {
1218   - osd_end_request(or);
1219   - goto free_args;
1220   - }
  1212 + ret = exofs_oi_write(oi, ios);
  1213 + if (!do_sync && !ret) {
1221 1214 atomic_inc(&sbi->s_curr_pending);
1222 1215 goto out; /* deallocation in updatei_done */
1223 1216 }
1224 1217  
  1218 + exofs_put_io_state(ios);
1225 1219 free_args:
1226 1220 kfree(args);
1227 1221 out:
1228 1222  
... ... @@ -1238,11 +1232,12 @@
1238 1232 * Callback function from exofs_delete_inode() - don't have much cleaning up to
1239 1233 * do.
1240 1234 */
1241   -static void delete_done(struct osd_request *or, void *p)
  1235 +static void delete_done(struct exofs_io_state *ios, void *p)
1242 1236 {
1243   - struct exofs_sb_info *sbi;
1244   - osd_end_request(or);
1245   - sbi = p;
  1237 + struct exofs_sb_info *sbi = p;
  1238 +
  1239 + exofs_put_io_state(ios);
  1240 +
1246 1241 atomic_dec(&sbi->s_curr_pending);
1247 1242 }
1248 1243  
... ... @@ -1256,8 +1251,7 @@
1256 1251 struct exofs_i_info *oi = exofs_i(inode);
1257 1252 struct super_block *sb = inode->i_sb;
1258 1253 struct exofs_sb_info *sbi = sb->s_fs_info;
1259   - struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
1260   - struct osd_request *or;
  1254 + struct exofs_io_state *ios;
1261 1255 int ret;
1262 1256  
1263 1257 truncate_inode_pages(&inode->i_data, 0);
1264 1258  
1265 1259  
1266 1260  
... ... @@ -1274,25 +1268,26 @@
1274 1268  
1275 1269 clear_inode(inode);
1276 1270  
1277   - or = osd_start_request(sbi->s_dev, GFP_KERNEL);
1278   - if (unlikely(!or)) {
1279   - EXOFS_ERR("exofs_delete_inode: osd_start_request failed\n");
  1271 + ret = exofs_get_io_state(sbi, &ios);
  1272 + if (unlikely(ret)) {
  1273 + EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__);
1280 1274 return;
1281 1275 }
1282 1276  
1283   - osd_req_remove_object(or, &obj);
1284   -
1285 1277 /* if we are deleting an obj that hasn't been created yet, wait */
1286 1278 if (!obj_created(oi)) {
1287 1279 BUG_ON(!obj_2bcreated(oi));
1288 1280 wait_event(oi->i_wq, obj_created(oi));
1289 1281 }
1290 1282  
1291   - ret = exofs_async_op(or, delete_done, sbi, oi->i_cred);
  1283 + ios->obj.id = exofs_oi_objno(oi);
  1284 + ios->done = delete_done;
  1285 + ios->private = sbi;
  1286 + ios->cred = oi->i_cred;
  1287 + ret = exofs_sbi_remove(ios);
1292 1288 if (ret) {
1293   - EXOFS_ERR(
1294   - "ERROR: @exofs_delete_inode exofs_async_op failed\n");
1295   - osd_end_request(or);
  1289 + EXOFS_ERR("%s: exofs_sbi_remove failed\n", __func__);
  1290 + exofs_put_io_state(ios);
1296 1291 return;
1297 1292 }
1298 1293 atomic_inc(&sbi->s_curr_pending);
  1 +/*
  2 + * Copyright (C) 2005, 2006
  3 + * Avishay Traeger (avishay@gmail.com)
  4 + * Copyright (C) 2008, 2009
  5 + * Boaz Harrosh <bharrosh@panasas.com>
  6 + *
  7 + * This file is part of exofs.
  8 + *
  9 + * exofs is free software; you can redistribute it and/or modify
  10 + * it under the terms of the GNU General Public License as published by
  11 + * the Free Software Foundation. Since it is based on ext2, and the only
  12 + * valid version of GPL for the Linux kernel is version 2, the only valid
  13 + * version of GPL for exofs is version 2.
  14 + *
  15 + * exofs is distributed in the hope that it will be useful,
  16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  18 + * GNU General Public License for more details.
  19 + *
  20 + * You should have received a copy of the GNU General Public License
  21 + * along with exofs; if not, write to the Free Software
  22 + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  23 + */
  24 +
  25 +#include <scsi/scsi_device.h>
  26 +
  27 +#include "exofs.h"
  28 +
  29 +void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
  30 +{
  31 + osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
  32 +}
  33 +
  34 +int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
  35 + u64 offset, void *p, unsigned length)
  36 +{
  37 + struct osd_request *or = osd_start_request(od, GFP_KERNEL);
  38 +/* struct osd_sense_info osi = {.key = 0};*/
  39 + int ret;
  40 +
  41 + if (unlikely(!or)) {
  42 + EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__);
  43 + return -ENOMEM;
  44 + }
  45 + ret = osd_req_read_kern(or, obj, offset, p, length);
  46 + if (unlikely(ret)) {
  47 + EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__);
  48 + goto out;
  49 + }
  50 +
  51 + ret = osd_finalize_request(or, 0, cred, NULL);
  52 + if (unlikely(ret)) {
  53 + EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
  54 + goto out;
  55 + }
  56 +
  57 + ret = osd_execute_request(or);
  58 + if (unlikely(ret))
  59 + EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
  60 + /* osd_req_decode_sense(or, ret); */
  61 +
  62 +out:
  63 + osd_end_request(or);
  64 + return ret;
  65 +}
  66 +
  67 +int exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** pios)
  68 +{
  69 + struct exofs_io_state *ios;
  70 +
  71 + /*TODO: Maybe use kmem_cach per sbi of size
  72 + * exofs_io_state_size(sbi->s_numdevs)
  73 + */
  74 + ios = kzalloc(exofs_io_state_size(sbi->s_numdevs), GFP_KERNEL);
  75 + if (unlikely(!ios)) {
  76 + *pios = NULL;
  77 + return -ENOMEM;
  78 + }
  79 +
  80 + ios->sbi = sbi;
  81 + ios->obj.partition = sbi->s_pid;
  82 + *pios = ios;
  83 + return 0;
  84 +}
  85 +
  86 +void exofs_put_io_state(struct exofs_io_state *ios)
  87 +{
  88 + if (ios) {
  89 + unsigned i;
  90 +
  91 + for (i = 0; i < ios->numdevs; i++) {
  92 + struct exofs_per_dev_state *per_dev = &ios->per_dev[i];
  93 +
  94 + if (per_dev->or)
  95 + osd_end_request(per_dev->or);
  96 + if (per_dev->bio)
  97 + bio_put(per_dev->bio);
  98 + }
  99 +
  100 + kfree(ios);
  101 + }
  102 +}
  103 +
/*
 * Done-callback that signals the completion passed in @p.
 * @ios is not used.
 */
static void _sync_done(struct exofs_io_state *ios, void *p)
{
	complete((struct completion *)p);
}
  110 +
  111 +static void _last_io(struct kref *kref)
  112 +{
  113 + struct exofs_io_state *ios = container_of(
  114 + kref, struct exofs_io_state, kref);
  115 +
  116 + ios->done(ios, ios->private);
  117 +}
  118 +
  119 +static void _done_io(struct osd_request *or, void *p)
  120 +{
  121 + struct exofs_io_state *ios = p;
  122 +
  123 + kref_put(&ios->kref, _last_io);
  124 +}
  125 +
  126 +static int exofs_io_execute(struct exofs_io_state *ios)
  127 +{
  128 + DECLARE_COMPLETION_ONSTACK(wait);
  129 + bool sync = (ios->done == NULL);
  130 + int i, ret;
  131 +
  132 + if (sync) {
  133 + ios->done = _sync_done;
  134 + ios->private = &wait;
  135 + }
  136 +
  137 + for (i = 0; i < ios->numdevs; i++) {
  138 + struct osd_request *or = ios->per_dev[i].or;
  139 + if (unlikely(!or))
  140 + continue;
  141 +
  142 + ret = osd_finalize_request(or, 0, ios->cred, NULL);
  143 + if (unlikely(ret)) {
  144 + EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n",
  145 + ret);
  146 + return ret;
  147 + }
  148 + }
  149 +
  150 + kref_init(&ios->kref);
  151 +
  152 + for (i = 0; i < ios->numdevs; i++) {
  153 + struct osd_request *or = ios->per_dev[i].or;
  154 + if (unlikely(!or))
  155 + continue;
  156 +
  157 + kref_get(&ios->kref);
  158 + osd_execute_request_async(or, _done_io, ios);
  159 + }
  160 +
  161 + kref_put(&ios->kref, _last_io);
  162 + ret = 0;
  163 +
  164 + if (sync) {
  165 + wait_for_completion(&wait);
  166 + ret = exofs_check_io(ios, NULL);
  167 + }
  168 + return ret;
  169 +}
  170 +
  171 +int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
  172 +{
  173 + enum osd_err_priority acumulated_osd_err = 0;
  174 + int acumulated_lin_err = 0;
  175 + int i;
  176 +
  177 + for (i = 0; i < ios->numdevs; i++) {
  178 + struct osd_sense_info osi;
  179 + int ret = osd_req_decode_sense(ios->per_dev[i].or, &osi);
  180 +
  181 + if (likely(!ret))
  182 + continue;
  183 +
  184 + if (unlikely(ret == -EFAULT)) {
  185 + EXOFS_DBGMSG("%s: EFAULT Need page clear\n", __func__);
  186 + /*FIXME: All the pages in this device range should:
  187 + * clear_highpage(page);
  188 + */
  189 + }
  190 +
  191 + if (osi.osd_err_pri >= acumulated_osd_err) {
  192 + acumulated_osd_err = osi.osd_err_pri;
  193 + acumulated_lin_err = ret;
  194 + }
  195 + }
  196 +
  197 + /* TODO: raid specific residual calculations */
  198 + if (resid) {
  199 + if (likely(!acumulated_lin_err))
  200 + *resid = 0;
  201 + else
  202 + *resid = ios->length;
  203 + }
  204 +
  205 + return acumulated_lin_err;
  206 +}
  207 +
  208 +int exofs_sbi_create(struct exofs_io_state *ios)
  209 +{
  210 + int i, ret;
  211 +
  212 + for (i = 0; i < ios->sbi->s_numdevs; i++) {
  213 + struct osd_request *or;
  214 +
  215 + or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL);
  216 + if (unlikely(!or)) {
  217 + EXOFS_ERR("%s: osd_start_request failed\n", __func__);
  218 + ret = -ENOMEM;
  219 + goto out;
  220 + }
  221 + ios->per_dev[i].or = or;
  222 + ios->numdevs++;
  223 +
  224 + osd_req_create_object(or, &ios->obj);
  225 + }
  226 + ret = exofs_io_execute(ios);
  227 +
  228 +out:
  229 + return ret;
  230 +}
  231 +
  232 +int exofs_sbi_remove(struct exofs_io_state *ios)
  233 +{
  234 + int i, ret;
  235 +
  236 + for (i = 0; i < ios->sbi->s_numdevs; i++) {
  237 + struct osd_request *or;
  238 +
  239 + or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL);
  240 + if (unlikely(!or)) {
  241 + EXOFS_ERR("%s: osd_start_request failed\n", __func__);
  242 + ret = -ENOMEM;
  243 + goto out;
  244 + }
  245 + ios->per_dev[i].or = or;
  246 + ios->numdevs++;
  247 +
  248 + osd_req_remove_object(or, &ios->obj);
  249 + }
  250 + ret = exofs_io_execute(ios);
  251 +
  252 +out:
  253 + return ret;
  254 +}
  255 +
  256 +int exofs_sbi_write(struct exofs_io_state *ios)
  257 +{
  258 + int i, ret;
  259 +
  260 + for (i = 0; i < ios->sbi->s_numdevs; i++) {
  261 + struct osd_request *or;
  262 +
  263 + or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL);
  264 + if (unlikely(!or)) {
  265 + EXOFS_ERR("%s: osd_start_request failed\n", __func__);
  266 + ret = -ENOMEM;
  267 + goto out;
  268 + }
  269 + ios->per_dev[i].or = or;
  270 + ios->numdevs++;
  271 +
  272 + if (ios->bio) {
  273 + struct bio *bio;
  274 +
  275 + if (i != 0) {
  276 + bio = bio_kmalloc(GFP_KERNEL,
  277 + ios->bio->bi_max_vecs);
  278 + if (unlikely(!bio)) {
  279 + ret = -ENOMEM;
  280 + goto out;
  281 + }
  282 +
  283 + __bio_clone(bio, ios->bio);
  284 + bio->bi_bdev = NULL;
  285 + bio->bi_next = NULL;
  286 + ios->per_dev[i].bio = bio;
  287 + } else {
  288 + bio = ios->bio;
  289 + }
  290 +
  291 + osd_req_write(or, &ios->obj, ios->offset, bio,
  292 + ios->length);
  293 +/* EXOFS_DBGMSG("write sync=%d\n", sync);*/
  294 + } else if (ios->kern_buff) {
  295 + osd_req_write_kern(or, &ios->obj, ios->offset,
  296 + ios->kern_buff, ios->length);
  297 +/* EXOFS_DBGMSG("write_kern sync=%d\n", sync);*/
  298 + } else {
  299 + osd_req_set_attributes(or, &ios->obj);
  300 +/* EXOFS_DBGMSG("set_attributes sync=%d\n", sync);*/
  301 + }
  302 +
  303 + if (ios->out_attr)
  304 + osd_req_add_set_attr_list(or, ios->out_attr,
  305 + ios->out_attr_len);
  306 +
  307 + if (ios->in_attr)
  308 + osd_req_add_get_attr_list(or, ios->in_attr,
  309 + ios->in_attr_len);
  310 + }
  311 + ret = exofs_io_execute(ios);
  312 +
  313 +out:
  314 + return ret;
  315 +}
  316 +
  317 +int exofs_sbi_read(struct exofs_io_state *ios)
  318 +{
  319 + int i, ret;
  320 +
  321 + for (i = 0; i < 1; i++) {
  322 + struct osd_request *or;
  323 + unsigned first_dev = (unsigned)ios->obj.id;
  324 +
  325 + first_dev %= ios->sbi->s_numdevs;
  326 + or = osd_start_request(ios->sbi->s_ods[first_dev], GFP_KERNEL);
  327 + if (unlikely(!or)) {
  328 + EXOFS_ERR("%s: osd_start_request failed\n", __func__);
  329 + ret = -ENOMEM;
  330 + goto out;
  331 + }
  332 + ios->per_dev[i].or = or;
  333 + ios->numdevs++;
  334 +
  335 + if (ios->bio) {
  336 + osd_req_read(or, &ios->obj, ios->offset, ios->bio,
  337 + ios->length);
  338 +/* EXOFS_DBGMSG("read sync=%d\n", sync);*/
  339 + } else if (ios->kern_buff) {
  340 + osd_req_read_kern(or, &ios->obj, ios->offset,
  341 + ios->kern_buff, ios->length);
  342 +/* EXOFS_DBGMSG("read_kern sync=%d\n", sync);*/
  343 + } else {
  344 + osd_req_get_attributes(or, &ios->obj);
  345 +/* EXOFS_DBGMSG("get_attributes sync=%d\n", sync);*/
  346 + }
  347 +
  348 + if (ios->out_attr)
  349 + osd_req_add_set_attr_list(or, ios->out_attr,
  350 + ios->out_attr_len);
  351 +
  352 + if (ios->in_attr)
  353 + osd_req_add_get_attr_list(or, ios->in_attr,
  354 + ios->in_attr_len);
  355 + }
  356 + ret = exofs_io_execute(ios);
  357 +
  358 +out:
  359 + return ret;
  360 +}
  361 +
  362 +int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr)
  363 +{
  364 + struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
  365 + void *iter = NULL;
  366 + int nelem;
  367 +
  368 + do {
  369 + nelem = 1;
  370 + osd_req_decode_get_attr_list(ios->per_dev[0].or,
  371 + &cur_attr, &nelem, &iter);
  372 + if ((cur_attr.attr_page == attr->attr_page) &&
  373 + (cur_attr.attr_id == attr->attr_id)) {
  374 + attr->len = cur_attr.len;
  375 + attr->val_ptr = cur_attr.val_ptr;
  376 + return 0;
  377 + }
  378 + } while (iter);
  379 +
  380 + return -EIO;
  381 +}
  382 +
  383 +int exofs_oi_truncate(struct exofs_i_info *oi, u64 size)
  384 +{
  385 + struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info;
  386 + struct exofs_io_state *ios;
  387 + struct osd_attr attr;
  388 + __be64 newsize;
  389 + int i, ret;
  390 +
  391 + if (exofs_get_io_state(sbi, &ios))
  392 + return -ENOMEM;
  393 +
  394 + ios->obj.id = exofs_oi_objno(oi);
  395 + ios->cred = oi->i_cred;
  396 +
  397 + newsize = cpu_to_be64(size);
  398 + attr = g_attr_logical_length;
  399 + attr.val_ptr = &newsize;
  400 +
  401 + for (i = 0; i < sbi->s_numdevs; i++) {
  402 + struct osd_request *or;
  403 +
  404 + or = osd_start_request(sbi->s_ods[i], GFP_KERNEL);
  405 + if (unlikely(!or)) {
  406 + EXOFS_ERR("%s: osd_start_request failed\n", __func__);
  407 + ret = -ENOMEM;
  408 + goto out;
  409 + }
  410 + ios->per_dev[i].or = or;
  411 + ios->numdevs++;
  412 +
  413 + osd_req_set_attributes(or, &ios->obj);
  414 + osd_req_add_set_attr_list(or, &attr, 1);
  415 + }
  416 + ret = exofs_io_execute(ios);
  417 +
  418 +out:
  419 + exofs_put_io_state(ios);
  420 + return ret;
  421 +}
fs/exofs/osd.c
1   -/*
2   - * Copyright (C) 2005, 2006
3   - * Avishay Traeger (avishay@gmail.com)
4   - * Copyright (C) 2008, 2009
5   - * Boaz Harrosh <bharrosh@panasas.com>
6   - *
7   - * This file is part of exofs.
8   - *
9   - * exofs is free software; you can redistribute it and/or modify
10   - * it under the terms of the GNU General Public License as published by
11   - * the Free Software Foundation. Since it is based on ext2, and the only
12   - * valid version of GPL for the Linux kernel is version 2, the only valid
13   - * version of GPL for exofs is version 2.
14   - *
15   - * exofs is distributed in the hope that it will be useful,
16   - * but WITHOUT ANY WARRANTY; without even the implied warranty of
17   - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18   - * GNU General Public License for more details.
19   - *
20   - * You should have received a copy of the GNU General Public License
21   - * along with exofs; if not, write to the Free Software
22   - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23   - */
24   -
25   -#include <scsi/scsi_device.h>
26   -#include <scsi/osd_sense.h>
27   -
28   -#include "exofs.h"
29   -
30   -int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid)
31   -{
32   - struct osd_sense_info osi;
33   - int ret = osd_req_decode_sense(or, &osi);
34   -
35   - if (ret) { /* translate to Linux codes */
36   - if (osi.additional_code == scsi_invalid_field_in_cdb) {
37   - if (osi.cdb_field_offset == OSD_CFO_STARTING_BYTE)
38   - ret = -EFAULT;
39   - if (osi.cdb_field_offset == OSD_CFO_OBJECT_ID)
40   - ret = -ENOENT;
41   - else
42   - ret = -EINVAL;
43   - } else if (osi.additional_code == osd_quota_error)
44   - ret = -ENOSPC;
45   - else
46   - ret = -EIO;
47   - }
48   -
49   - /* FIXME: should be include in osd_sense_info */
50   - if (in_resid)
51   - *in_resid = or->in.req ? or->in.req->resid_len : 0;
52   -
53   - if (out_resid)
54   - *out_resid = or->out.req ? or->out.req->resid_len : 0;
55   -
56   - return ret;
57   -}
58   -
59   -void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
60   -{
61   - osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
62   -}
63   -
64   -/*
65   - * Perform a synchronous OSD operation.
66   - */
67   -int exofs_sync_op(struct osd_request *or, int timeout, uint8_t *credential)
68   -{
69   - int ret;
70   -
71   - or->timeout = timeout;
72   - ret = osd_finalize_request(or, 0, credential, NULL);
73   - if (ret) {
74   - EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
75   - return ret;
76   - }
77   -
78   - ret = osd_execute_request(or);
79   -
80   - if (ret)
81   - EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
82   - /* osd_req_decode_sense(or, ret); */
83   - return ret;
84   -}
85   -
86   -/*
87   - * Perform an asynchronous OSD operation.
88   - */
89   -int exofs_async_op(struct osd_request *or, osd_req_done_fn *async_done,
90   - void *caller_context, u8 *cred)
91   -{
92   - int ret;
93   -
94   - ret = osd_finalize_request(or, 0, cred, NULL);
95   - if (ret) {
96   - EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
97   - return ret;
98   - }
99   -
100   - ret = osd_execute_request_async(or, async_done, caller_context);
101   -
102   - if (ret)
103   - EXOFS_DBGMSG("osd_execute_request_async() => %d\n", ret);
104   - return ret;
105   -}
106   -
107   -int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
108   -{
109   - struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
110   - void *iter = NULL;
111   - int nelem;
112   -
113   - do {
114   - nelem = 1;
115   - osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter);
116   - if ((cur_attr.attr_page == attr->attr_page) &&
117   - (cur_attr.attr_id == attr->attr_id)) {
118   - attr->len = cur_attr.len;
119   - attr->val_ptr = cur_attr.val_ptr;
120   - return 0;
121   - }
122   - } while (iter);
123   -
124   - return -EIO;
125   -}
  1 +/*
  2 + * Copyright (C) 2008, 2009
  3 + * Boaz Harrosh <bharrosh@panasas.com>
  4 + *
  5 + * This file is part of exofs.
  6 + *
  7 + * exofs is free software; you can redistribute it and/or modify it under the
  8 + * terms of the GNU General Public License version 2 as published by the Free
  9 + * Software Foundation.
  10 + *
  11 + */
  12 +
  13 +/* FIXME: Remove this file once pnfs hits mainline */
  14 +
  15 +#ifndef __EXOFS_PNFS_H__
  16 +#define __EXOFS_PNFS_H__
  17 +
  18 +#if defined(CONFIG_PNFS)
  19 +
  20 +
  21 +/* FIXME: move this file to: linux/exportfs/pnfs_osd_xdr.h */
  22 +#include "../nfs/objlayout/pnfs_osd_xdr.h"
  23 +
  24 +#else /* defined(CONFIG_PNFS) */
  25 +
  26 +enum pnfs_iomode {
  27 + IOMODE_READ = 1,
  28 + IOMODE_RW = 2,
  29 + IOMODE_ANY = 3,
  30 +};
  31 +
  32 +/* Layout Structure */
  33 +enum pnfs_osd_raid_algorithm4 {
  34 + PNFS_OSD_RAID_0 = 1,
  35 + PNFS_OSD_RAID_4 = 2,
  36 + PNFS_OSD_RAID_5 = 3,
  37 + PNFS_OSD_RAID_PQ = 4 /* Reed-Solomon P+Q */
  38 +};
  39 +
  40 +struct pnfs_osd_data_map {
  41 + u32 odm_num_comps;
  42 + u64 odm_stripe_unit;
  43 + u32 odm_group_width;
  44 + u32 odm_group_depth;
  45 + u32 odm_mirror_cnt;
  46 + u32 odm_raid_algorithm;
  47 +};
  48 +
  49 +#endif /* else defined(CONFIG_PNFS) */
  50 +
  51 +#endif /* __EXOFS_PNFS_H__ */
... ... @@ -203,49 +203,45 @@
203 203 {
204 204 struct exofs_sb_info *sbi;
205 205 struct exofs_fscb *fscb;
206   - struct osd_request *or;
207   - struct osd_obj_id obj;
  206 + struct exofs_io_state *ios;
208 207 int ret = -ENOMEM;
209 208  
210   - fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL);
211   - if (!fscb) {
212   - EXOFS_ERR("exofs_write_super: memory allocation failed.\n");
213   - return -ENOMEM;
214   - }
215   -
216 209 lock_super(sb);
217 210 sbi = sb->s_fs_info;
  211 + fscb = &sbi->s_fscb;
  212 +
  213 + ret = exofs_get_io_state(sbi, &ios);
  214 + if (ret)
  215 + goto out;
  216 +
  217 + /* Note: We only write the changing part of the fscb, i.e. up to
  218 + * the fscb->s_dev_table_oid member. There is no read-modify-write
  219 + * here.
  220 + */
  221 + ios->length = offsetof(struct exofs_fscb, s_dev_table_oid);
  222 + memset(fscb, 0, ios->length);
218 223 fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
219 224 fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles);
220 225 fscb->s_magic = cpu_to_le16(sb->s_magic);
221 226 fscb->s_newfs = 0;
  227 + fscb->s_version = EXOFS_FSCB_VER;
222 228  
223   - or = osd_start_request(sbi->s_dev, GFP_KERNEL);
224   - if (unlikely(!or)) {
225   - EXOFS_ERR("exofs_write_super: osd_start_request failed.\n");
226   - goto out;
227   - }
  229 + ios->obj.id = EXOFS_SUPER_ID;
  230 + ios->offset = 0;
  231 + ios->kern_buff = fscb;
  232 + ios->cred = sbi->s_cred;
228 233  
229   - obj.partition = sbi->s_pid;
230   - obj.id = EXOFS_SUPER_ID;
231   - ret = osd_req_write_kern(or, &obj, 0, fscb, sizeof(*fscb));
  234 + ret = exofs_sbi_write(ios);
232 235 if (unlikely(ret)) {
233   - EXOFS_ERR("exofs_write_super: osd_req_write_kern failed.\n");
  236 + EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__);
234 237 goto out;
235 238 }
236   -
237   - ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
238   - if (unlikely(ret)) {
239   - EXOFS_ERR("exofs_write_super: exofs_sync_op failed.\n");
240   - goto out;
241   - }
242 239 sb->s_dirt = 0;
243 240  
244 241 out:
245   - if (or)
246   - osd_end_request(or);
  242 + EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret);
  243 + exofs_put_io_state(ios);
247 244 unlock_super(sb);
248   - kfree(fscb);
249 245 return ret;
250 246 }
251 247  
... ... @@ -257,6 +253,29 @@
257 253 sb->s_dirt = 0;
258 254 }
259 255  
  256 +static void _exofs_print_device(const char *msg, const char *dev_path,
  257 + struct osd_dev *od, u64 pid)
  258 +{
  259 + const struct osd_dev_info *odi = osduld_device_info(od);
  260 +
  261 + printk(KERN_NOTICE "exofs: %s %s osd_name-%s pid-0x%llx\n",
  262 + msg, dev_path ?: "", odi->osdname, _LLU(pid));
  263 +}
  264 +
  265 +void exofs_free_sbi(struct exofs_sb_info *sbi)
  266 +{
  267 + while (sbi->s_numdevs) {
  268 + int i = --sbi->s_numdevs;
  269 + struct osd_dev *od = sbi->s_ods[i];
  270 +
  271 + if (od) {
  272 + sbi->s_ods[i] = NULL;
  273 + osduld_put_device(od);
  274 + }
  275 + }
  276 + kfree(sbi);
  277 +}
  278 +
260 279 /*
261 280 * This function is called when the vfs is freeing the superblock. We just
262 281 * need to free our own part.
263 282  
... ... @@ -279,11 +298,182 @@
279 298 msecs_to_jiffies(100));
280 299 }
281 300  
282   - osduld_put_device(sbi->s_dev);
283   - kfree(sb->s_fs_info);
  301 + _exofs_print_device("Unmounting", NULL, sbi->s_ods[0], sbi->s_pid);
  302 +
  303 + exofs_free_sbi(sbi);
284 304 sb->s_fs_info = NULL;
285 305 }
286 306  
  307 +static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
  308 + struct exofs_device_table *dt)
  309 +{
  310 + sbi->data_map.odm_num_comps =
  311 + le32_to_cpu(dt->dt_data_map.cb_num_comps);
  312 + sbi->data_map.odm_stripe_unit =
  313 + le64_to_cpu(dt->dt_data_map.cb_stripe_unit);
  314 + sbi->data_map.odm_group_width =
  315 + le32_to_cpu(dt->dt_data_map.cb_group_width);
  316 + sbi->data_map.odm_group_depth =
  317 + le32_to_cpu(dt->dt_data_map.cb_group_depth);
  318 + sbi->data_map.odm_mirror_cnt =
  319 + le32_to_cpu(dt->dt_data_map.cb_mirror_cnt);
  320 + sbi->data_map.odm_raid_algorithm =
  321 + le32_to_cpu(dt->dt_data_map.cb_raid_algorithm);
  322 +
  323 +/* FIXME: Hard coded mirror only for now. if not so do not mount */
  324 + if ((sbi->data_map.odm_num_comps != numdevs) ||
  325 + (sbi->data_map.odm_stripe_unit != EXOFS_BLKSIZE) ||
  326 + (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) ||
  327 + (sbi->data_map.odm_mirror_cnt != (numdevs - 1)))
  328 + return -EINVAL;
  329 + else
  330 + return 0;
  331 +}
  332 +
  333 +/* @odi is valid only as long as @fscb_dev is valid */
  334 +static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
  335 + struct osd_dev_info *odi)
  336 +{
  337 + odi->systemid_len = le32_to_cpu(dt_dev->systemid_len);
  338 + memcpy(odi->systemid, dt_dev->systemid, odi->systemid_len);
  339 +
  340 + odi->osdname_len = le32_to_cpu(dt_dev->osdname_len);
  341 + odi->osdname = dt_dev->osdname;
  342 +
  343 + /* FIXME support long names. Will need a _put function */
  344 + if (dt_dev->long_name_offset)
  345 + return -EINVAL;
  346 +
  347 + /* Make sure osdname is printable!
  348 + * mkexofs should give us space for a null-terminator else the
  349 + * device-table is invalid.
  350 + */
  351 + if (unlikely(odi->osdname_len >= sizeof(dt_dev->osdname)))
  352 + odi->osdname_len = sizeof(dt_dev->osdname) - 1;
  353 + dt_dev->osdname[odi->osdname_len] = 0;
  354 +
  355 + /* If it's all zeros something is bad we read past end-of-obj */
  356 + return !(odi->systemid_len || odi->osdname_len);
  357 +}
  358 +
  359 +static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
  360 + unsigned table_count)
  361 +{
  362 + struct exofs_sb_info *sbi = *psbi;
  363 + struct osd_dev *fscb_od;
  364 + struct osd_obj_id obj = {.partition = sbi->s_pid,
  365 + .id = EXOFS_DEVTABLE_ID};
  366 + struct exofs_device_table *dt;
  367 + unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) +
  368 + sizeof(*dt);
  369 + unsigned numdevs, i;
  370 + int ret;
  371 +
  372 + dt = kmalloc(table_bytes, GFP_KERNEL);
  373 + if (unlikely(!dt)) {
  374 + EXOFS_ERR("ERROR: allocating %x bytes for device table\n",
  375 + table_bytes);
  376 + return -ENOMEM;
  377 + }
  378 +
  379 + fscb_od = sbi->s_ods[0];
  380 + sbi->s_ods[0] = NULL;
  381 + sbi->s_numdevs = 0;
  382 + ret = exofs_read_kern(fscb_od, sbi->s_cred, &obj, 0, dt, table_bytes);
  383 + if (unlikely(ret)) {
  384 + EXOFS_ERR("ERROR: reading device table\n");
  385 + goto out;
  386 + }
  387 +
  388 + numdevs = le64_to_cpu(dt->dt_num_devices);
  389 + if (unlikely(!numdevs)) {
  390 + ret = -EINVAL;
  391 + goto out;
  392 + }
  393 + WARN_ON(table_count != numdevs);
  394 +
  395 + ret = _read_and_match_data_map(sbi, numdevs, dt);
  396 + if (unlikely(ret))
  397 + goto out;
  398 +
  399 + if (likely(numdevs > 1)) {
  400 + unsigned size = numdevs * sizeof(sbi->s_ods[0]);
  401 +
  402 + sbi = krealloc(sbi, sizeof(*sbi) + size, GFP_KERNEL);
  403 + if (unlikely(!sbi)) {
  404 + ret = -ENOMEM;
  405 + goto out;
  406 + }
  407 + memset(&sbi->s_ods[1], 0, size - sizeof(sbi->s_ods[0]));
  408 + *psbi = sbi;
  409 + }
  410 +
  411 + for (i = 0; i < numdevs; i++) {
  412 + struct exofs_fscb fscb;
  413 + struct osd_dev_info odi;
  414 + struct osd_dev *od;
  415 +
  416 + if (exofs_devs_2_odi(&dt->dt_dev_table[i], &odi)) {
  417 + EXOFS_ERR("ERROR: Read all-zeros device entry\n");
  418 + ret = -EINVAL;
  419 + goto out;
  420 + }
  421 +
  422 + printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n",
  423 + i, odi.osdname);
  424 +
  425 + /* On all devices the device table is identical. The user can
  426 + * specify any one of the participating devices on the command
  427 + * line. We always keep them in device-table order.
  428 + */
  429 + if (fscb_od && osduld_device_same(fscb_od, &odi)) {
  430 + sbi->s_ods[i] = fscb_od;
  431 + ++sbi->s_numdevs;
  432 + fscb_od = NULL;
  433 + continue;
  434 + }
  435 +
  436 + od = osduld_info_lookup(&odi);
  437 + if (unlikely(IS_ERR(od))) {
  438 + ret = PTR_ERR(od);
  439 + EXOFS_ERR("ERROR: device requested is not found "
  440 + "osd_name-%s =>%d\n", odi.osdname, ret);
  441 + goto out;
  442 + }
  443 +
  444 + sbi->s_ods[i] = od;
  445 + ++sbi->s_numdevs;
  446 +
  447 + /* Read the fscb of the other devices to make sure the FS
  448 + * partition is there.
  449 + */
  450 + ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb,
  451 + sizeof(fscb));
  452 + if (unlikely(ret)) {
  453 + EXOFS_ERR("ERROR: Malformed participating device "
  454 + "error reading fscb osd_name-%s\n",
  455 + odi.osdname);
  456 + goto out;
  457 + }
  458 +
  459 + /* TODO: verify other information is correct and FS-uuid
  460 + * matches. Benny what did you say about device table
  461 + * generation and old devices?
  462 + */
  463 + }
  464 +
  465 +out:
  466 + kfree(dt);
  467 + if (unlikely(!ret && fscb_od)) {
  468 + EXOFS_ERR(
  469 + "ERROR: Bad device-table container device not present\n");
  470 + osduld_put_device(fscb_od);
  471 + ret = -EINVAL;
  472 + }
  473 +
  474 + return ret;
  475 +}
  476 +
287 477 /*
288 478 * Read the superblock from the OSD and fill in the fields
289 479 */
290 480  
291 481  
292 482  
293 483  
294 484  
... ... @@ -292,24 +482,25 @@
292 482 struct inode *root;
293 483 struct exofs_mountopt *opts = data;
294 484 struct exofs_sb_info *sbi; /*extended info */
  485 + struct osd_dev *od; /* Master device */
295 486 struct exofs_fscb fscb; /*on-disk superblock info */
296   - struct osd_request *or = NULL;
297 487 struct osd_obj_id obj;
  488 + unsigned table_count;
298 489 int ret;
299 490  
300 491 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
301 492 if (!sbi)
302 493 return -ENOMEM;
303   - sb->s_fs_info = sbi;
304 494  
305 495 /* use mount options to fill superblock */
306   - sbi->s_dev = osduld_path_lookup(opts->dev_name);
307   - if (IS_ERR(sbi->s_dev)) {
308   - ret = PTR_ERR(sbi->s_dev);
309   - sbi->s_dev = NULL;
  496 + od = osduld_path_lookup(opts->dev_name);
  497 + if (IS_ERR(od)) {
  498 + ret = PTR_ERR(od);
310 499 goto free_sbi;
311 500 }
312 501  
  502 + sbi->s_ods[0] = od;
  503 + sbi->s_numdevs = 1;
313 504 sbi->s_pid = opts->pid;
314 505 sbi->s_timeout = opts->timeout;
315 506  
316 507  
317 508  
318 509  
... ... @@ -323,36 +514,14 @@
323 514 sb->s_bdev = NULL;
324 515 sb->s_dev = 0;
325 516  
326   - /* read data from on-disk superblock object */
327 517 obj.partition = sbi->s_pid;
328 518 obj.id = EXOFS_SUPER_ID;
329 519 exofs_make_credential(sbi->s_cred, &obj);
330 520  
331   - or = osd_start_request(sbi->s_dev, GFP_KERNEL);
332   - if (unlikely(!or)) {
333   - if (!silent)
334   - EXOFS_ERR(
335   - "exofs_fill_super: osd_start_request failed.\n");
336   - ret = -ENOMEM;
  521 + ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb, sizeof(fscb));
  522 + if (unlikely(ret))
337 523 goto free_sbi;
338   - }
339   - ret = osd_req_read_kern(or, &obj, 0, &fscb, sizeof(fscb));
340   - if (unlikely(ret)) {
341   - if (!silent)
342   - EXOFS_ERR(
343   - "exofs_fill_super: osd_req_read_kern failed.\n");
344   - ret = -ENOMEM;
345   - goto free_sbi;
346   - }
347 524  
348   - ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
349   - if (unlikely(ret)) {
350   - if (!silent)
351   - EXOFS_ERR("exofs_fill_super: exofs_sync_op failed.\n");
352   - ret = -EIO;
353   - goto free_sbi;
354   - }
355   -
356 525 sb->s_magic = le16_to_cpu(fscb.s_magic);
357 526 sbi->s_nextid = le64_to_cpu(fscb.s_nextid);
358 527 sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles);
359 528  
360 529  
... ... @@ -364,12 +533,26 @@
364 533 ret = -EINVAL;
365 534 goto free_sbi;
366 535 }
  536 + if (le32_to_cpu(fscb.s_version) != EXOFS_FSCB_VER) {
  537 + EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n",
  538 + EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version));
  539 + ret = -EINVAL;
  540 + goto free_sbi;
  541 + }
367 542  
368 543 /* start generation numbers from a random point */
369 544 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
370 545 spin_lock_init(&sbi->s_next_gen_lock);
371 546  
  547 + table_count = le64_to_cpu(fscb.s_dev_table_count);
  548 + if (table_count) {
  549 + ret = exofs_read_lookup_dev_table(&sbi, table_count);
  550 + if (unlikely(ret))
  551 + goto free_sbi;
  552 + }
  553 +
372 554 /* set up operation vectors */
  555 + sb->s_fs_info = sbi;
373 556 sb->s_op = &exofs_sops;
374 557 sb->s_export_op = &exofs_export_ops;
375 558 root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF);
376 559  
... ... @@ -395,16 +578,15 @@
395 578 goto free_sbi;
396 579 }
397 580  
398   - ret = 0;
399   -out:
400   - if (or)
401   - osd_end_request(or);
402   - return ret;
  581 + _exofs_print_device("Mounting", opts->dev_name, sbi->s_ods[0],
  582 + sbi->s_pid);
  583 + return 0;
403 584  
404 585 free_sbi:
405   - osduld_put_device(sbi->s_dev); /* NULL safe */
406   - kfree(sbi);
407   - goto out;
  586 + EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
  587 + opts->dev_name, sbi->s_pid, ret);
  588 + exofs_free_sbi(sbi);
  589 + return ret;
408 590 }
409 591  
410 592 /*
... ... @@ -433,7 +615,7 @@
433 615 {
434 616 struct super_block *sb = dentry->d_sb;
435 617 struct exofs_sb_info *sbi = sb->s_fs_info;
436   - struct osd_obj_id obj = {sbi->s_pid, 0};
  618 + struct exofs_io_state *ios;
437 619 struct osd_attr attrs[] = {
438 620 ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS,
439 621 OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)),
440 622  
441 623  
442 624  
443 625  
444 626  
... ... @@ -442,32 +624,33 @@
442 624 };
443 625 uint64_t capacity = ULLONG_MAX;
444 626 uint64_t used = ULLONG_MAX;
445   - struct osd_request *or;
446 627 uint8_t cred_a[OSD_CAP_LEN];
447 628 int ret;
448 629  
449   - /* get used/capacity attributes */
450   - exofs_make_credential(cred_a, &obj);
451   -
452   - or = osd_start_request(sbi->s_dev, GFP_KERNEL);
453   - if (unlikely(!or)) {
454   - EXOFS_DBGMSG("exofs_statfs: osd_start_request failed.\n");
455   - return -ENOMEM;
  630 + ret = exofs_get_io_state(sbi, &ios);
  631 + if (ret) {
  632 + EXOFS_DBGMSG("exofs_get_io_state failed.\n");
  633 + return ret;
456 634 }
457 635  
458   - osd_req_get_attributes(or, &obj);
459   - osd_req_add_get_attr_list(or, attrs, ARRAY_SIZE(attrs));
460   - ret = exofs_sync_op(or, sbi->s_timeout, cred_a);
  636 + exofs_make_credential(cred_a, &ios->obj);
  637 + ios->cred = sbi->s_cred;
  638 + ios->in_attr = attrs;
  639 + ios->in_attr_len = ARRAY_SIZE(attrs);
  640 +
  641 + ret = exofs_sbi_read(ios);
461 642 if (unlikely(ret))
462 643 goto out;
463 644  
464   - ret = extract_attr_from_req(or, &attrs[0]);
465   - if (likely(!ret))
  645 + ret = extract_attr_from_ios(ios, &attrs[0]);
  646 + if (likely(!ret)) {
466 647 capacity = get_unaligned_be64(attrs[0].val_ptr);
467   - else
  648 + if (unlikely(!capacity))
  649 + capacity = ULLONG_MAX;
  650 + } else
468 651 EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n");
469 652  
470   - ret = extract_attr_from_req(or, &attrs[1]);
  653 + ret = extract_attr_from_ios(ios, &attrs[1]);
471 654 if (likely(!ret))
472 655 used = get_unaligned_be64(attrs[1].val_ptr);
473 656 else
474 657  
... ... @@ -476,15 +659,15 @@
476 659 /* fill in the stats buffer */
477 660 buf->f_type = EXOFS_SUPER_MAGIC;
478 661 buf->f_bsize = EXOFS_BLKSIZE;
479   - buf->f_blocks = (capacity >> EXOFS_BLKSHIFT);
480   - buf->f_bfree = ((capacity - used) >> EXOFS_BLKSHIFT);
  662 + buf->f_blocks = capacity >> 9;
  663 + buf->f_bfree = (capacity - used) >> 9;
481 664 buf->f_bavail = buf->f_bfree;
482 665 buf->f_files = sbi->s_numfiles;
483 666 buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles;
484 667 buf->f_namelen = EXOFS_NAME_LEN;
485 668  
486 669 out:
487   - osd_end_request(or);
  670 + exofs_put_io_state(ios);
488 671 return ret;
489 672 }
490 673