Merge commit 'jwb/next' into next
[pandora-kernel.git] / fs / ocfs2 / buffer_head_io.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * io.c
5  *
6  * Buffer cache handling
7  *
8  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public
12  * License as published by the Free Software Foundation; either
13  * version 2 of the License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public
21  * License along with this program; if not, write to the
22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23  * Boston, MA 021110-1307, USA.
24  */
25
26 #include <linux/fs.h>
27 #include <linux/types.h>
28 #include <linux/slab.h>
29 #include <linux/highmem.h>
30
31 #include <cluster/masklog.h>
32
33 #include "ocfs2.h"
34
35 #include "alloc.h"
36 #include "inode.h"
37 #include "journal.h"
38 #include "uptodate.h"
39
40 #include "buffer_head_io.h"
41
/*
 * Bits on bh->b_state used by ocfs2.
 *
 * These MUST be after the JBD2 bits.  Hence, we use BH_JBDPrivateStart.
 */
enum ocfs2_state_bits {
	/* Set on a buffer when ocfs2_read_blocks() submits it for read on
	 * behalf of a caller that supplied a validate callback; cleared
	 * again right before that callback is run on the buffer. */
	BH_NeedsValidate = BH_JBDPrivateStart,
};

/* Expand the magic b_state functions.  This is what provides the
 * set_buffer_needs_validate(), clear_buffer_needs_validate() and
 * buffer_needs_validate() helpers used further down in this file. */
BUFFER_FNS(NeedsValidate, needs_validate);
53
54 int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
55                       struct ocfs2_caching_info *ci)
56 {
57         int ret = 0;
58
59         mlog_entry("(bh->b_blocknr = %llu, ci=%p)\n",
60                    (unsigned long long)bh->b_blocknr, ci);
61
62         BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO);
63         BUG_ON(buffer_jbd(bh));
64
65         /* No need to check for a soft readonly file system here. non
66          * journalled writes are only ever done on system files which
67          * can get modified during recovery even if read-only. */
68         if (ocfs2_is_hard_readonly(osb)) {
69                 ret = -EROFS;
70                 goto out;
71         }
72
73         ocfs2_metadata_cache_io_lock(ci);
74
75         lock_buffer(bh);
76         set_buffer_uptodate(bh);
77
78         /* remove from dirty list before I/O. */
79         clear_buffer_dirty(bh);
80
81         get_bh(bh); /* for end_buffer_write_sync() */
82         bh->b_end_io = end_buffer_write_sync;
83         submit_bh(WRITE, bh);
84
85         wait_on_buffer(bh);
86
87         if (buffer_uptodate(bh)) {
88                 ocfs2_set_buffer_uptodate(ci, bh);
89         } else {
90                 /* We don't need to remove the clustered uptodate
91                  * information for this bh as it's not marked locally
92                  * uptodate. */
93                 ret = -EIO;
94                 put_bh(bh);
95         }
96
97         ocfs2_metadata_cache_io_unlock(ci);
98 out:
99         mlog_exit(ret);
100         return ret;
101 }
102
103 int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
104                            unsigned int nr, struct buffer_head *bhs[])
105 {
106         int status = 0;
107         unsigned int i;
108         struct buffer_head *bh;
109
110         if (!nr) {
111                 mlog(ML_BH_IO, "No buffers will be read!\n");
112                 goto bail;
113         }
114
115         for (i = 0 ; i < nr ; i++) {
116                 if (bhs[i] == NULL) {
117                         bhs[i] = sb_getblk(osb->sb, block++);
118                         if (bhs[i] == NULL) {
119                                 status = -EIO;
120                                 mlog_errno(status);
121                                 goto bail;
122                         }
123                 }
124                 bh = bhs[i];
125
126                 if (buffer_jbd(bh)) {
127                         mlog(ML_BH_IO,
128                              "trying to sync read a jbd "
129                              "managed bh (blocknr = %llu), skipping\n",
130                              (unsigned long long)bh->b_blocknr);
131                         continue;
132                 }
133
134                 if (buffer_dirty(bh)) {
135                         /* This should probably be a BUG, or
136                          * at least return an error. */
137                         mlog(ML_ERROR,
138                              "trying to sync read a dirty "
139                              "buffer! (blocknr = %llu), skipping\n",
140                              (unsigned long long)bh->b_blocknr);
141                         continue;
142                 }
143
144                 lock_buffer(bh);
145                 if (buffer_jbd(bh)) {
146                         mlog(ML_ERROR,
147                              "block %llu had the JBD bit set "
148                              "while I was in lock_buffer!",
149                              (unsigned long long)bh->b_blocknr);
150                         BUG();
151                 }
152
153                 clear_buffer_uptodate(bh);
154                 get_bh(bh); /* for end_buffer_read_sync() */
155                 bh->b_end_io = end_buffer_read_sync;
156                 submit_bh(READ, bh);
157         }
158
159         for (i = nr; i > 0; i--) {
160                 bh = bhs[i - 1];
161
162                 /* No need to wait on the buffer if it's managed by JBD. */
163                 if (!buffer_jbd(bh))
164                         wait_on_buffer(bh);
165
166                 if (!buffer_uptodate(bh)) {
167                         /* Status won't be cleared from here on out,
168                          * so we can safely record this and loop back
169                          * to cleanup the other buffers. */
170                         status = -EIO;
171                         put_bh(bh);
172                         bhs[i - 1] = NULL;
173                 }
174         }
175
176 bail:
177         return status;
178 }
179
180 int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
181                       struct buffer_head *bhs[], int flags,
182                       int (*validate)(struct super_block *sb,
183                                       struct buffer_head *bh))
184 {
185         int status = 0;
186         int i, ignore_cache = 0;
187         struct buffer_head *bh;
188         struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
189
190         mlog_entry("(ci=%p, block=(%llu), nr=(%d), flags=%d)\n",
191                    ci, (unsigned long long)block, nr, flags);
192
193         BUG_ON(!ci);
194         BUG_ON((flags & OCFS2_BH_READAHEAD) &&
195                (flags & OCFS2_BH_IGNORE_CACHE));
196
197         if (bhs == NULL) {
198                 status = -EINVAL;
199                 mlog_errno(status);
200                 goto bail;
201         }
202
203         if (nr < 0) {
204                 mlog(ML_ERROR, "asked to read %d blocks!\n", nr);
205                 status = -EINVAL;
206                 mlog_errno(status);
207                 goto bail;
208         }
209
210         if (nr == 0) {
211                 mlog(ML_BH_IO, "No buffers will be read!\n");
212                 status = 0;
213                 goto bail;
214         }
215
216         ocfs2_metadata_cache_io_lock(ci);
217         for (i = 0 ; i < nr ; i++) {
218                 if (bhs[i] == NULL) {
219                         bhs[i] = sb_getblk(sb, block++);
220                         if (bhs[i] == NULL) {
221                                 ocfs2_metadata_cache_io_unlock(ci);
222                                 status = -EIO;
223                                 mlog_errno(status);
224                                 goto bail;
225                         }
226                 }
227                 bh = bhs[i];
228                 ignore_cache = (flags & OCFS2_BH_IGNORE_CACHE);
229
230                 /* There are three read-ahead cases here which we need to
231                  * be concerned with. All three assume a buffer has
232                  * previously been submitted with OCFS2_BH_READAHEAD
233                  * and it hasn't yet completed I/O.
234                  *
235                  * 1) The current request is sync to disk. This rarely
236                  *    happens these days, and never when performance
237                  *    matters - the code can just wait on the buffer
238                  *    lock and re-submit.
239                  *
240                  * 2) The current request is cached, but not
241                  *    readahead. ocfs2_buffer_uptodate() will return
242                  *    false anyway, so we'll wind up waiting on the
243                  *    buffer lock to do I/O. We re-check the request
244                  *    with after getting the lock to avoid a re-submit.
245                  *
246                  * 3) The current request is readahead (and so must
247                  *    also be a caching one). We short circuit if the
248                  *    buffer is locked (under I/O) and if it's in the
249                  *    uptodate cache. The re-check from #2 catches the
250                  *    case that the previous read-ahead completes just
251                  *    before our is-it-in-flight check.
252                  */
253
254                 if (!ignore_cache && !ocfs2_buffer_uptodate(ci, bh)) {
255                         mlog(ML_UPTODATE,
256                              "bh (%llu), owner %llu not uptodate\n",
257                              (unsigned long long)bh->b_blocknr,
258                              (unsigned long long)ocfs2_metadata_cache_owner(ci));
259                         /* We're using ignore_cache here to say
260                          * "go to disk" */
261                         ignore_cache = 1;
262                 }
263
264                 if (buffer_jbd(bh)) {
265                         if (ignore_cache)
266                                 mlog(ML_BH_IO, "trying to sync read a jbd "
267                                                "managed bh (blocknr = %llu)\n",
268                                      (unsigned long long)bh->b_blocknr);
269                         continue;
270                 }
271
272                 if (ignore_cache) {
273                         if (buffer_dirty(bh)) {
274                                 /* This should probably be a BUG, or
275                                  * at least return an error. */
276                                 mlog(ML_BH_IO, "asking me to sync read a dirty "
277                                                "buffer! (blocknr = %llu)\n",
278                                      (unsigned long long)bh->b_blocknr);
279                                 continue;
280                         }
281
282                         /* A read-ahead request was made - if the
283                          * buffer is already under read-ahead from a
284                          * previously submitted request than we are
285                          * done here. */
286                         if ((flags & OCFS2_BH_READAHEAD)
287                             && ocfs2_buffer_read_ahead(ci, bh))
288                                 continue;
289
290                         lock_buffer(bh);
291                         if (buffer_jbd(bh)) {
292 #ifdef CATCH_BH_JBD_RACES
293                                 mlog(ML_ERROR, "block %llu had the JBD bit set "
294                                                "while I was in lock_buffer!",
295                                      (unsigned long long)bh->b_blocknr);
296                                 BUG();
297 #else
298                                 unlock_buffer(bh);
299                                 continue;
300 #endif
301                         }
302
303                         /* Re-check ocfs2_buffer_uptodate() as a
304                          * previously read-ahead buffer may have
305                          * completed I/O while we were waiting for the
306                          * buffer lock. */
307                         if (!(flags & OCFS2_BH_IGNORE_CACHE)
308                             && !(flags & OCFS2_BH_READAHEAD)
309                             && ocfs2_buffer_uptodate(ci, bh)) {
310                                 unlock_buffer(bh);
311                                 continue;
312                         }
313
314                         clear_buffer_uptodate(bh);
315                         get_bh(bh); /* for end_buffer_read_sync() */
316                         if (validate)
317                                 set_buffer_needs_validate(bh);
318                         bh->b_end_io = end_buffer_read_sync;
319                         submit_bh(READ, bh);
320                         continue;
321                 }
322         }
323
324         status = 0;
325
326         for (i = (nr - 1); i >= 0; i--) {
327                 bh = bhs[i];
328
329                 if (!(flags & OCFS2_BH_READAHEAD)) {
330                         /* We know this can't have changed as we hold the
331                          * owner sem. Avoid doing any work on the bh if the
332                          * journal has it. */
333                         if (!buffer_jbd(bh))
334                                 wait_on_buffer(bh);
335
336                         if (!buffer_uptodate(bh)) {
337                                 /* Status won't be cleared from here on out,
338                                  * so we can safely record this and loop back
339                                  * to cleanup the other buffers. Don't need to
340                                  * remove the clustered uptodate information
341                                  * for this bh as it's not marked locally
342                                  * uptodate. */
343                                 status = -EIO;
344                                 put_bh(bh);
345                                 bhs[i] = NULL;
346                                 continue;
347                         }
348
349                         if (buffer_needs_validate(bh)) {
350                                 /* We never set NeedsValidate if the
351                                  * buffer was held by the journal, so
352                                  * that better not have changed */
353                                 BUG_ON(buffer_jbd(bh));
354                                 clear_buffer_needs_validate(bh);
355                                 status = validate(sb, bh);
356                                 if (status) {
357                                         put_bh(bh);
358                                         bhs[i] = NULL;
359                                         continue;
360                                 }
361                         }
362                 }
363
364                 /* Always set the buffer in the cache, even if it was
365                  * a forced read, or read-ahead which hasn't yet
366                  * completed. */
367                 ocfs2_set_buffer_uptodate(ci, bh);
368         }
369         ocfs2_metadata_cache_io_unlock(ci);
370
371         mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n",
372              (unsigned long long)block, nr,
373              ((flags & OCFS2_BH_IGNORE_CACHE) || ignore_cache) ? "no" : "yes",
374              flags);
375
376 bail:
377
378         mlog_exit(status);
379         return status;
380 }
381
382 /* Check whether the blkno is the super block or one of the backups. */
383 static void ocfs2_check_super_or_backup(struct super_block *sb,
384                                         sector_t blkno)
385 {
386         int i;
387         u64 backup_blkno;
388
389         if (blkno == OCFS2_SUPER_BLOCK_BLKNO)
390                 return;
391
392         for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
393                 backup_blkno = ocfs2_backup_super_blkno(sb, i);
394                 if (backup_blkno == blkno)
395                         return;
396         }
397
398         BUG();
399 }
400
401 /*
402  * Write super block and backups doesn't need to collaborate with journal,
403  * so we don't need to lock ip_io_mutex and ci doesn't need to bea passed
404  * into this function.
405  */
406 int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
407                                 struct buffer_head *bh)
408 {
409         int ret = 0;
410
411         mlog_entry_void();
412
413         BUG_ON(buffer_jbd(bh));
414         ocfs2_check_super_or_backup(osb->sb, bh->b_blocknr);
415
416         if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) {
417                 ret = -EROFS;
418                 goto out;
419         }
420
421         lock_buffer(bh);
422         set_buffer_uptodate(bh);
423
424         /* remove from dirty list before I/O. */
425         clear_buffer_dirty(bh);
426
427         get_bh(bh); /* for end_buffer_write_sync() */
428         bh->b_end_io = end_buffer_write_sync;
429         submit_bh(WRITE, bh);
430
431         wait_on_buffer(bh);
432
433         if (!buffer_uptodate(bh)) {
434                 ret = -EIO;
435                 put_bh(bh);
436         }
437
438 out:
439         mlog_exit(ret);
440         return ret;
441 }