Merge branch 'upstream-fixes' of git://lost.foo-projects.org/~ahkok/git/netdev-2...
[pandora-kernel.git] / fs / ocfs2 / suballoc.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * suballoc.c
5  *
6  * metadata alloc and free
7  * Inspired by ext3 block groups.
8  *
9  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
10  *
11  * This program is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU General Public
13  * License as published by the Free Software Foundation; either
14  * version 2 of the License, or (at your option) any later version.
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public
22  * License along with this program; if not, write to the
23  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24  * Boston, MA 02111-1307, USA.
25  */
26
27 #include <linux/fs.h>
28 #include <linux/types.h>
29 #include <linux/slab.h>
30 #include <linux/highmem.h>
31
32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
33 #include <cluster/masklog.h>
34
35 #include "ocfs2.h"
36
37 #include "alloc.h"
38 #include "dlmglue.h"
39 #include "inode.h"
40 #include "journal.h"
41 #include "localalloc.h"
42 #include "suballoc.h"
43 #include "super.h"
44 #include "sysfile.h"
45 #include "uptodate.h"
46
47 #include "buffer_head_io.h"
48
49 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
50 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
51 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
52 static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle,
53                                   struct inode *alloc_inode,
54                                   struct buffer_head *bg_bh,
55                                   u64 group_blkno,
56                                   u16 my_chain,
57                                   struct ocfs2_chain_list *cl);
58 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
59                                    struct inode *alloc_inode,
60                                    struct buffer_head *bh);
61
62 static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
63                                        struct ocfs2_alloc_context *ac);
64
65 static int ocfs2_cluster_group_search(struct inode *inode,
66                                       struct buffer_head *group_bh,
67                                       u32 bits_wanted, u32 min_bits,
68                                       u16 *bit_off, u16 *bits_found);
69 static int ocfs2_block_group_search(struct inode *inode,
70                                     struct buffer_head *group_bh,
71                                     u32 bits_wanted, u32 min_bits,
72                                     u16 *bit_off, u16 *bits_found);
73 static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
74                                      struct ocfs2_alloc_context *ac,
75                                      u32 bits_wanted,
76                                      u32 min_bits,
77                                      u16 *bit_off,
78                                      unsigned int *num_bits,
79                                      u64 *bg_blkno);
80 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
81                                          int nr);
82 static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle,
83                                              struct inode *alloc_inode,
84                                              struct ocfs2_group_desc *bg,
85                                              struct buffer_head *group_bh,
86                                              unsigned int bit_off,
87                                              unsigned int num_bits);
88 static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle,
89                                                struct inode *alloc_inode,
90                                                struct ocfs2_group_desc *bg,
91                                                struct buffer_head *group_bh,
92                                                unsigned int bit_off,
93                                                unsigned int num_bits);
94
95 static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle,
96                                     struct inode *alloc_inode,
97                                     struct buffer_head *fe_bh,
98                                     struct buffer_head *bg_bh,
99                                     struct buffer_head *prev_bg_bh,
100                                     u16 chain);
101 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
102                                                      u32 wanted);
103 static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle,
104                                     struct inode *alloc_inode,
105                                     struct buffer_head *alloc_bh,
106                                     unsigned int start_bit,
107                                     u64 bg_blkno,
108                                     unsigned int count);
109 static inline u64 ocfs2_which_suballoc_group(u64 block,
110                                              unsigned int bit);
111 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
112                                                    u64 bg_blkno,
113                                                    u16 bg_bit_off);
114 static inline u64 ocfs2_which_cluster_group(struct inode *inode,
115                                             u32 cluster);
116 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
117                                                 u64 data_blkno,
118                                                 u64 *bg_blkno,
119                                                 u16 *bg_bit_off);
120
121 void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
122 {
123         if (ac->ac_inode)
124                 iput(ac->ac_inode);
125         if (ac->ac_bh)
126                 brelse(ac->ac_bh);
127         kfree(ac);
128 }
129
130 static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
131 {
132         return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
133 }
134
135 /* somewhat more expensive than our other checks, so use sparingly. */
136 static int ocfs2_check_group_descriptor(struct super_block *sb,
137                                         struct ocfs2_dinode *di,
138                                         struct ocfs2_group_desc *gd)
139 {
140         unsigned int max_bits;
141
142         if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
143                 OCFS2_RO_ON_INVALID_GROUP_DESC(sb, gd);
144                 return -EIO;
145         }
146
147         if (di->i_blkno != gd->bg_parent_dinode) {
148                 ocfs2_error(sb, "Group descriptor # %llu has bad parent "
149                             "pointer (%llu, expected %llu)",
150                             (unsigned long long)le64_to_cpu(gd->bg_blkno),
151                             (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
152                             (unsigned long long)le64_to_cpu(di->i_blkno));
153                 return -EIO;
154         }
155
156         max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
157         if (le16_to_cpu(gd->bg_bits) > max_bits) {
158                 ocfs2_error(sb, "Group descriptor # %llu has bit count of %u",
159                             (unsigned long long)le64_to_cpu(gd->bg_blkno),
160                             le16_to_cpu(gd->bg_bits));
161                 return -EIO;
162         }
163
164         if (le16_to_cpu(gd->bg_chain) >=
165             le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) {
166                 ocfs2_error(sb, "Group descriptor # %llu has bad chain %u",
167                             (unsigned long long)le64_to_cpu(gd->bg_blkno),
168                             le16_to_cpu(gd->bg_chain));
169                 return -EIO;
170         }
171
172         if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
173                 ocfs2_error(sb, "Group descriptor # %llu has bit count %u but "
174                             "claims that %u are free",
175                             (unsigned long long)le64_to_cpu(gd->bg_blkno),
176                             le16_to_cpu(gd->bg_bits),
177                             le16_to_cpu(gd->bg_free_bits_count));
178                 return -EIO;
179         }
180
181         if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
182                 ocfs2_error(sb, "Group descriptor # %llu has bit count %u but "
183                             "max bitmap bits of %u",
184                             (unsigned long long)le64_to_cpu(gd->bg_blkno),
185                             le16_to_cpu(gd->bg_bits),
186                             8 * le16_to_cpu(gd->bg_size));
187                 return -EIO;
188         }
189
190         return 0;
191 }
192
193 static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle,
194                                   struct inode *alloc_inode,
195                                   struct buffer_head *bg_bh,
196                                   u64 group_blkno,
197                                   u16 my_chain,
198                                   struct ocfs2_chain_list *cl)
199 {
200         int status = 0;
201         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
202         struct super_block * sb = alloc_inode->i_sb;
203
204         mlog_entry_void();
205
206         if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
207                 ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
208                             "b_blocknr (%llu)",
209                             (unsigned long long)group_blkno,
210                             (unsigned long long) bg_bh->b_blocknr);
211                 status = -EIO;
212                 goto bail;
213         }
214
215         status = ocfs2_journal_access(handle,
216                                       alloc_inode,
217                                       bg_bh,
218                                       OCFS2_JOURNAL_ACCESS_CREATE);
219         if (status < 0) {
220                 mlog_errno(status);
221                 goto bail;
222         }
223
224         memset(bg, 0, sb->s_blocksize);
225         strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
226         bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
227         bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb));
228         bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
229         bg->bg_chain = cpu_to_le16(my_chain);
230         bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
231         bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
232         bg->bg_blkno = cpu_to_le64(group_blkno);
233         /* set the 1st bit in the bitmap to account for the descriptor block */
234         ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
235         bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
236
237         status = ocfs2_journal_dirty(handle, bg_bh);
238         if (status < 0)
239                 mlog_errno(status);
240
241         /* There is no need to zero out or otherwise initialize the
242          * other blocks in a group - All valid FS metadata in a block
243          * group stores the superblock fs_generation value at
244          * allocation time. */
245
246 bail:
247         mlog_exit(status);
248         return status;
249 }
250
251 static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
252 {
253         u16 curr, best;
254
255         best = curr = 0;
256         while (curr < le16_to_cpu(cl->cl_count)) {
257                 if (le32_to_cpu(cl->cl_recs[best].c_total) >
258                     le32_to_cpu(cl->cl_recs[curr].c_total))
259                         best = curr;
260                 curr++;
261         }
262         return best;
263 }
264
265 /*
266  * We expect the block group allocator to already be locked.
267  */
268 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
269                                    struct inode *alloc_inode,
270                                    struct buffer_head *bh)
271 {
272         int status, credits;
273         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
274         struct ocfs2_chain_list *cl;
275         struct ocfs2_alloc_context *ac = NULL;
276         struct ocfs2_journal_handle *handle = NULL;
277         u32 bit_off, num_bits;
278         u16 alloc_rec;
279         u64 bg_blkno;
280         struct buffer_head *bg_bh = NULL;
281         struct ocfs2_group_desc *bg;
282
283         BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
284
285         mlog_entry_void();
286
287         handle = ocfs2_alloc_handle(osb);
288         if (!handle) {
289                 status = -ENOMEM;
290                 mlog_errno(status);
291                 goto bail;
292         }
293
294         cl = &fe->id2.i_chain;
295         status = ocfs2_reserve_clusters(osb,
296                                         handle,
297                                         le16_to_cpu(cl->cl_cpg),
298                                         &ac);
299         if (status < 0) {
300                 if (status != -ENOSPC)
301                         mlog_errno(status);
302                 goto bail;
303         }
304
305         credits = ocfs2_calc_group_alloc_credits(osb->sb,
306                                                  le16_to_cpu(cl->cl_cpg));
307         handle = ocfs2_start_trans(osb, handle, credits);
308         if (IS_ERR(handle)) {
309                 status = PTR_ERR(handle);
310                 handle = NULL;
311                 mlog_errno(status);
312                 goto bail;
313         }
314
315         status = ocfs2_claim_clusters(osb,
316                                       handle,
317                                       ac,
318                                       le16_to_cpu(cl->cl_cpg),
319                                       &bit_off,
320                                       &num_bits);
321         if (status < 0) {
322                 if (status != -ENOSPC)
323                         mlog_errno(status);
324                 goto bail;
325         }
326
327         alloc_rec = ocfs2_find_smallest_chain(cl);
328
329         /* setup the group */
330         bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
331         mlog(0, "new descriptor, record %u, at block %llu\n",
332              alloc_rec, (unsigned long long)bg_blkno);
333
334         bg_bh = sb_getblk(osb->sb, bg_blkno);
335         if (!bg_bh) {
336                 status = -EIO;
337                 mlog_errno(status);
338                 goto bail;
339         }
340         ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh);
341
342         status = ocfs2_block_group_fill(handle,
343                                         alloc_inode,
344                                         bg_bh,
345                                         bg_blkno,
346                                         alloc_rec,
347                                         cl);
348         if (status < 0) {
349                 mlog_errno(status);
350                 goto bail;
351         }
352
353         bg = (struct ocfs2_group_desc *) bg_bh->b_data;
354
355         status = ocfs2_journal_access(handle, alloc_inode,
356                                       bh, OCFS2_JOURNAL_ACCESS_WRITE);
357         if (status < 0) {
358                 mlog_errno(status);
359                 goto bail;
360         }
361
362         le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
363                      le16_to_cpu(bg->bg_free_bits_count));
364         le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits));
365         cl->cl_recs[alloc_rec].c_blkno  = cpu_to_le64(bg_blkno);
366         if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
367                 le16_add_cpu(&cl->cl_next_free_rec, 1);
368
369         le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
370                                         le16_to_cpu(bg->bg_free_bits_count));
371         le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
372         le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
373
374         status = ocfs2_journal_dirty(handle, bh);
375         if (status < 0) {
376                 mlog_errno(status);
377                 goto bail;
378         }
379
380         spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
381         OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
382         fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
383                                              le32_to_cpu(fe->i_clusters)));
384         spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
385         i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
386         alloc_inode->i_blocks =
387                 ocfs2_align_bytes_to_sectors(i_size_read(alloc_inode));
388
389         status = 0;
390 bail:
391         if (handle)
392                 ocfs2_commit_trans(handle);
393
394         if (ac)
395                 ocfs2_free_alloc_context(ac);
396
397         if (bg_bh)
398                 brelse(bg_bh);
399
400         mlog_exit(status);
401         return status;
402 }
403
404 static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
405                                        struct ocfs2_alloc_context *ac)
406 {
407         int status;
408         u32 bits_wanted = ac->ac_bits_wanted;
409         struct inode *alloc_inode = ac->ac_inode;
410         struct buffer_head *bh = NULL;
411         struct ocfs2_journal_handle *handle = ac->ac_handle;
412         struct ocfs2_dinode *fe;
413         u32 free_bits;
414
415         mlog_entry_void();
416
417         BUG_ON(handle->flags & OCFS2_HANDLE_STARTED);
418
419         ocfs2_handle_add_inode(handle, alloc_inode);
420         status = ocfs2_meta_lock(alloc_inode, handle, &bh, 1);
421         if (status < 0) {
422                 mlog_errno(status);
423                 goto bail;
424         }
425
426         fe = (struct ocfs2_dinode *) bh->b_data;
427         if (!OCFS2_IS_VALID_DINODE(fe)) {
428                 OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
429                 status = -EIO;
430                 goto bail;
431         }
432         if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
433                 ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
434                             (unsigned long long)le64_to_cpu(fe->i_blkno));
435                 status = -EIO;
436                 goto bail;
437         }
438
439         free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
440                 le32_to_cpu(fe->id1.bitmap1.i_used);
441
442         if (bits_wanted > free_bits) {
443                 /* cluster bitmap never grows */
444                 if (ocfs2_is_cluster_bitmap(alloc_inode)) {
445                         mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
446                              bits_wanted, free_bits);
447                         status = -ENOSPC;
448                         goto bail;
449                 }
450
451                 status = ocfs2_block_group_alloc(osb, alloc_inode, bh);
452                 if (status < 0) {
453                         if (status != -ENOSPC)
454                                 mlog_errno(status);
455                         goto bail;
456                 }
457                 atomic_inc(&osb->alloc_stats.bg_extends);
458
459                 /* You should never ask for this much metadata */
460                 BUG_ON(bits_wanted >
461                        (le32_to_cpu(fe->id1.bitmap1.i_total)
462                         - le32_to_cpu(fe->id1.bitmap1.i_used)));
463         }
464
465         get_bh(bh);
466         ac->ac_bh = bh;
467 bail:
468         if (bh)
469                 brelse(bh);
470
471         mlog_exit(status);
472         return status;
473 }
474
475 int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
476                                struct ocfs2_journal_handle *handle,
477                                struct ocfs2_dinode *fe,
478                                struct ocfs2_alloc_context **ac)
479 {
480         int status;
481         struct inode *alloc_inode = NULL;
482
483         *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
484         if (!(*ac)) {
485                 status = -ENOMEM;
486                 mlog_errno(status);
487                 goto bail;
488         }
489
490         (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe);
491         (*ac)->ac_handle = handle;
492         (*ac)->ac_which = OCFS2_AC_USE_META;
493
494 #ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
495         alloc_inode = ocfs2_get_system_file_inode(osb,
496                                                   EXTENT_ALLOC_SYSTEM_INODE,
497                                                   0);
498 #else
499         alloc_inode = ocfs2_get_system_file_inode(osb,
500                                                   EXTENT_ALLOC_SYSTEM_INODE,
501                                                   osb->slot_num);
502 #endif
503         if (!alloc_inode) {
504                 status = -ENOMEM;
505                 mlog_errno(status);
506                 goto bail;
507         }
508
509         (*ac)->ac_inode = igrab(alloc_inode);
510         (*ac)->ac_group_search = ocfs2_block_group_search;
511
512         status = ocfs2_reserve_suballoc_bits(osb, (*ac));
513         if (status < 0) {
514                 if (status != -ENOSPC)
515                         mlog_errno(status);
516                 goto bail;
517         }
518
519         status = 0;
520 bail:
521         if ((status < 0) && *ac) {
522                 ocfs2_free_alloc_context(*ac);
523                 *ac = NULL;
524         }
525
526         if (alloc_inode)
527                 iput(alloc_inode);
528
529         mlog_exit(status);
530         return status;
531 }
532
533 int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
534                             struct ocfs2_journal_handle *handle,
535                             struct ocfs2_alloc_context **ac)
536 {
537         int status;
538         struct inode *alloc_inode = NULL;
539
540         *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
541         if (!(*ac)) {
542                 status = -ENOMEM;
543                 mlog_errno(status);
544                 goto bail;
545         }
546
547         (*ac)->ac_bits_wanted = 1;
548         (*ac)->ac_handle = handle;
549         (*ac)->ac_which = OCFS2_AC_USE_INODE;
550
551         alloc_inode = ocfs2_get_system_file_inode(osb,
552                                                   INODE_ALLOC_SYSTEM_INODE,
553                                                   osb->slot_num);
554         if (!alloc_inode) {
555                 status = -ENOMEM;
556                 mlog_errno(status);
557                 goto bail;
558         }
559
560         (*ac)->ac_inode = igrab(alloc_inode);
561         (*ac)->ac_group_search = ocfs2_block_group_search;
562
563         status = ocfs2_reserve_suballoc_bits(osb, *ac);
564         if (status < 0) {
565                 if (status != -ENOSPC)
566                         mlog_errno(status);
567                 goto bail;
568         }
569
570         status = 0;
571 bail:
572         if ((status < 0) && *ac) {
573                 ocfs2_free_alloc_context(*ac);
574                 *ac = NULL;
575         }
576
577         if (alloc_inode)
578                 iput(alloc_inode);
579
580         mlog_exit(status);
581         return status;
582 }
583
584 /* local alloc code has to do the same thing, so rather than do this
585  * twice.. */
586 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
587                                       struct ocfs2_alloc_context *ac)
588 {
589         int status;
590
591         ac->ac_inode = ocfs2_get_system_file_inode(osb,
592                                                    GLOBAL_BITMAP_SYSTEM_INODE,
593                                                    OCFS2_INVALID_SLOT);
594         if (!ac->ac_inode) {
595                 status = -EINVAL;
596                 mlog(ML_ERROR, "Could not get bitmap inode!\n");
597                 goto bail;
598         }
599         ac->ac_which = OCFS2_AC_USE_MAIN;
600         ac->ac_group_search = ocfs2_cluster_group_search;
601
602         status = ocfs2_reserve_suballoc_bits(osb, ac);
603         if (status < 0 && status != -ENOSPC)
604                 mlog_errno(status);
605 bail:
606         return status;
607 }
608
609 /* Callers don't need to care which bitmap (local alloc or main) to
610  * use so we figure it out for them, but unfortunately this clutters
611  * things a bit. */
612 int ocfs2_reserve_clusters(struct ocfs2_super *osb,
613                            struct ocfs2_journal_handle *handle,
614                            u32 bits_wanted,
615                            struct ocfs2_alloc_context **ac)
616 {
617         int status;
618
619         mlog_entry_void();
620
621         BUG_ON(!handle);
622
623         *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
624         if (!(*ac)) {
625                 status = -ENOMEM;
626                 mlog_errno(status);
627                 goto bail;
628         }
629
630         (*ac)->ac_bits_wanted = bits_wanted;
631         (*ac)->ac_handle = handle;
632
633         status = -ENOSPC;
634         if (ocfs2_alloc_should_use_local(osb, bits_wanted)) {
635                 status = ocfs2_reserve_local_alloc_bits(osb,
636                                                         handle,
637                                                         bits_wanted,
638                                                         *ac);
639                 if ((status < 0) && (status != -ENOSPC)) {
640                         mlog_errno(status);
641                         goto bail;
642                 } else if (status == -ENOSPC) {
643                         /* reserve_local_bits will return enospc with
644                          * the local alloc inode still locked, so we
645                          * can change this safely here. */
646                         mlog(0, "Disabling local alloc\n");
647                         /* We set to OCFS2_LA_DISABLED so that umount
648                          * can clean up what's left of the local
649                          * allocation */
650                         osb->local_alloc_state = OCFS2_LA_DISABLED;
651                 }
652         }
653
654         if (status == -ENOSPC) {
655                 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
656                 if (status < 0) {
657                         if (status != -ENOSPC)
658                                 mlog_errno(status);
659                         goto bail;
660                 }
661         }
662
663         status = 0;
664 bail:
665         if ((status < 0) && *ac) {
666                 ocfs2_free_alloc_context(*ac);
667                 *ac = NULL;
668         }
669
670         mlog_exit(status);
671         return status;
672 }
673
674 /*
675  * More or less lifted from ext3. I'll leave their description below:
676  *
677  * "For ext3 allocations, we must not reuse any blocks which are
678  * allocated in the bitmap buffer's "last committed data" copy.  This
679  * prevents deletes from freeing up the page for reuse until we have
680  * committed the delete transaction.
681  *
682  * If we didn't do this, then deleting something and reallocating it as
683  * data would allow the old block to be overwritten before the
684  * transaction committed (because we force data to disk before commit).
685  * This would lead to corruption if we crashed between overwriting the
686  * data and committing the delete.
687  *
688  * @@@ We may want to make this allocation behaviour conditional on
689  * data-writes at some point, and disable it for metadata allocations or
690  * sync-data inodes."
691  *
692  * Note: OCFS2 already does this differently for metadata vs data
693  * allocations, as those bitmaps are seperate and undo access is never
694  * called on a metadata group descriptor.
695  */
696 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
697                                          int nr)
698 {
699         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
700
701         if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
702                 return 0;
703         if (!buffer_jbd(bg_bh) || !bh2jh(bg_bh)->b_committed_data)
704                 return 1;
705
706         bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
707         return !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
708 }
709
710 static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
711                                              struct buffer_head *bg_bh,
712                                              unsigned int bits_wanted,
713                                              unsigned int total_bits,
714                                              u16 *bit_off,
715                                              u16 *bits_found)
716 {
717         void *bitmap;
718         u16 best_offset, best_size;
719         int offset, start, found, status = 0;
720         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
721
722         if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
723                 OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg);
724                 return -EIO;
725         }
726
727         found = start = best_offset = best_size = 0;
728         bitmap = bg->bg_bitmap;
729
730         while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) {
731                 if (offset == total_bits)
732                         break;
733
734                 if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
735                         /* We found a zero, but we can't use it as it
736                          * hasn't been put to disk yet! */
737                         found = 0;
738                         start = offset + 1;
739                 } else if (offset == start) {
740                         /* we found a zero */
741                         found++;
742                         /* move start to the next bit to test */
743                         start++;
744                 } else {
745                         /* got a zero after some ones */
746                         found = 1;
747                         start = offset + 1;
748                 }
749                 if (found > best_size) {
750                         best_size = found;
751                         best_offset = start - found;
752                 }
753                 /* we got everything we needed */
754                 if (found == bits_wanted) {
755                         /* mlog(0, "Found it all!\n"); */
756                         break;
757                 }
758         }
759
760         /* XXX: I think the first clause is equivalent to the second
761          *      - jlbec */
762         if (found == bits_wanted) {
763                 *bit_off = start - found;
764                 *bits_found = found;
765         } else if (best_size) {
766                 *bit_off = best_offset;
767                 *bits_found = best_size;
768         } else {
769                 status = -ENOSPC;
770                 /* No error log here -- see the comment above
771                  * ocfs2_test_bg_bit_allocatable */
772         }
773
774         return status;
775 }
776
777 static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle,
778                                              struct inode *alloc_inode,
779                                              struct ocfs2_group_desc *bg,
780                                              struct buffer_head *group_bh,
781                                              unsigned int bit_off,
782                                              unsigned int num_bits)
783 {
784         int status;
785         void *bitmap = bg->bg_bitmap;
786         int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
787
788         mlog_entry_void();
789
790         if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
791                 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
792                 status = -EIO;
793                 goto bail;
794         }
795         BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
796
797         mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
798              num_bits);
799
800         if (ocfs2_is_cluster_bitmap(alloc_inode))
801                 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
802
803         status = ocfs2_journal_access(handle,
804                                       alloc_inode,
805                                       group_bh,
806                                       journal_type);
807         if (status < 0) {
808                 mlog_errno(status);
809                 goto bail;
810         }
811
812         le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
813
814         while(num_bits--)
815                 ocfs2_set_bit(bit_off++, bitmap);
816
817         status = ocfs2_journal_dirty(handle,
818                                      group_bh);
819         if (status < 0) {
820                 mlog_errno(status);
821                 goto bail;
822         }
823
824 bail:
825         mlog_exit(status);
826         return status;
827 }
828
829 /* find the one with the most empty bits */
830 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
831 {
832         u16 curr, best;
833
834         BUG_ON(!cl->cl_next_free_rec);
835
836         best = curr = 0;
837         while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
838                 if (le32_to_cpu(cl->cl_recs[curr].c_free) >
839                     le32_to_cpu(cl->cl_recs[best].c_free))
840                         best = curr;
841                 curr++;
842         }
843
844         BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
845         return best;
846 }
847
848 static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle,
849                                     struct inode *alloc_inode,
850                                     struct buffer_head *fe_bh,
851                                     struct buffer_head *bg_bh,
852                                     struct buffer_head *prev_bg_bh,
853                                     u16 chain)
854 {
855         int status;
856         /* there is a really tiny chance the journal calls could fail,
857          * but we wouldn't want inconsistent blocks in *any* case. */
858         u64 fe_ptr, bg_ptr, prev_bg_ptr;
859         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
860         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
861         struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
862
863         if (!OCFS2_IS_VALID_DINODE(fe)) {
864                 OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
865                 status = -EIO;
866                 goto out;
867         }
868         if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
869                 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
870                 status = -EIO;
871                 goto out;
872         }
873         if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) {
874                 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg);
875                 status = -EIO;
876                 goto out;
877         }
878
879         mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
880              (unsigned long long)fe->i_blkno, chain,
881              (unsigned long long)bg->bg_blkno,
882              (unsigned long long)prev_bg->bg_blkno);
883
884         fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
885         bg_ptr = le64_to_cpu(bg->bg_next_group);
886         prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
887
888         status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh,
889                                       OCFS2_JOURNAL_ACCESS_WRITE);
890         if (status < 0) {
891                 mlog_errno(status);
892                 goto out_rollback;
893         }
894
895         prev_bg->bg_next_group = bg->bg_next_group;
896
897         status = ocfs2_journal_dirty(handle, prev_bg_bh);
898         if (status < 0) {
899                 mlog_errno(status);
900                 goto out_rollback;
901         }
902
903         status = ocfs2_journal_access(handle, alloc_inode, bg_bh,
904                                       OCFS2_JOURNAL_ACCESS_WRITE);
905         if (status < 0) {
906                 mlog_errno(status);
907                 goto out_rollback;
908         }
909
910         bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
911
912         status = ocfs2_journal_dirty(handle, bg_bh);
913         if (status < 0) {
914                 mlog_errno(status);
915                 goto out_rollback;
916         }
917
918         status = ocfs2_journal_access(handle, alloc_inode, fe_bh,
919                                       OCFS2_JOURNAL_ACCESS_WRITE);
920         if (status < 0) {
921                 mlog_errno(status);
922                 goto out_rollback;
923         }
924
925         fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
926
927         status = ocfs2_journal_dirty(handle, fe_bh);
928         if (status < 0) {
929                 mlog_errno(status);
930                 goto out_rollback;
931         }
932
933         status = 0;
934 out_rollback:
935         if (status < 0) {
936                 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
937                 bg->bg_next_group = cpu_to_le64(bg_ptr);
938                 prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
939         }
940 out:
941         mlog_exit(status);
942         return status;
943 }
944
945 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
946                                                      u32 wanted)
947 {
948         return le16_to_cpu(bg->bg_free_bits_count) > wanted;
949 }
950
951 /* return 0 on success, -ENOSPC to keep searching and any other < 0
952  * value on error. */
953 static int ocfs2_cluster_group_search(struct inode *inode,
954                                       struct buffer_head *group_bh,
955                                       u32 bits_wanted, u32 min_bits,
956                                       u16 *bit_off, u16 *bits_found)
957 {
958         int search = -ENOSPC;
959         int ret;
960         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
961         u16 tmp_off, tmp_found;
962         unsigned int max_bits, gd_cluster_off;
963
964         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
965
966         if (gd->bg_free_bits_count) {
967                 max_bits = le16_to_cpu(gd->bg_bits);
968
969                 /* Tail groups in cluster bitmaps which aren't cpg
970                  * aligned are prone to partial extention by a failed
971                  * fs resize. If the file system resize never got to
972                  * update the dinode cluster count, then we don't want
973                  * to trust any clusters past it, regardless of what
974                  * the group descriptor says. */
975                 gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
976                                                           le64_to_cpu(gd->bg_blkno));
977                 if ((gd_cluster_off + max_bits) >
978                     OCFS2_I(inode)->ip_clusters) {
979                         max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
980                         mlog(0, "Desc %llu, bg_bits %u, clusters %u, use %u\n",
981                              (unsigned long long)le64_to_cpu(gd->bg_blkno),
982                              le16_to_cpu(gd->bg_bits),
983                              OCFS2_I(inode)->ip_clusters, max_bits);
984                 }
985
986                 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
987                                                         group_bh, bits_wanted,
988                                                         max_bits,
989                                                         &tmp_off, &tmp_found);
990                 if (ret)
991                         return ret;
992
993                 /* ocfs2_block_group_find_clear_bits() might
994                  * return success, but we still want to return
995                  * -ENOSPC unless it found the minimum number
996                  * of bits. */
997                 if (min_bits <= tmp_found) {
998                         *bit_off = tmp_off;
999                         *bits_found = tmp_found;
1000                         search = 0; /* success */
1001                 }
1002         }
1003
1004         return search;
1005 }
1006
1007 static int ocfs2_block_group_search(struct inode *inode,
1008                                     struct buffer_head *group_bh,
1009                                     u32 bits_wanted, u32 min_bits,
1010                                     u16 *bit_off, u16 *bits_found)
1011 {
1012         int ret = -ENOSPC;
1013         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
1014
1015         BUG_ON(min_bits != 1);
1016         BUG_ON(ocfs2_is_cluster_bitmap(inode));
1017
1018         if (bg->bg_free_bits_count)
1019                 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1020                                                         group_bh, bits_wanted,
1021                                                         le16_to_cpu(bg->bg_bits),
1022                                                         bit_off, bits_found);
1023
1024         return ret;
1025 }
1026
1027 static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1028                                        struct ocfs2_journal_handle *handle,
1029                                        struct buffer_head *di_bh,
1030                                        u32 num_bits,
1031                                        u16 chain)
1032 {
1033         int ret;
1034         u32 tmp_used;
1035         struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1036         struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
1037
1038         ret = ocfs2_journal_access(handle, inode, di_bh,
1039                                    OCFS2_JOURNAL_ACCESS_WRITE);
1040         if (ret < 0) {
1041                 mlog_errno(ret);
1042                 goto out;
1043         }
1044
1045         tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1046         di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
1047         le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
1048
1049         ret = ocfs2_journal_dirty(handle, di_bh);
1050         if (ret < 0)
1051                 mlog_errno(ret);
1052
1053 out:
1054         return ret;
1055 }
1056
1057 static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1058                                   u32 bits_wanted,
1059                                   u32 min_bits,
1060                                   u16 *bit_off,
1061                                   unsigned int *num_bits,
1062                                   u64 gd_blkno,
1063                                   u16 *bits_left)
1064 {
1065         int ret;
1066         u16 found;
1067         struct buffer_head *group_bh = NULL;
1068         struct ocfs2_group_desc *gd;
1069         struct inode *alloc_inode = ac->ac_inode;
1070         struct ocfs2_journal_handle *handle = ac->ac_handle;
1071
1072         ret = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), gd_blkno,
1073                                &group_bh, OCFS2_BH_CACHED, alloc_inode);
1074         if (ret < 0) {
1075                 mlog_errno(ret);
1076                 return ret;
1077         }
1078
1079         gd = (struct ocfs2_group_desc *) group_bh->b_data;
1080         if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
1081                 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, gd);
1082                 ret = -EIO;
1083                 goto out;
1084         }
1085
1086         ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1087                                   bit_off, &found);
1088         if (ret < 0) {
1089                 if (ret != -ENOSPC)
1090                         mlog_errno(ret);
1091                 goto out;
1092         }
1093
1094         *num_bits = found;
1095
1096         ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1097                                                *num_bits,
1098                                                le16_to_cpu(gd->bg_chain));
1099         if (ret < 0) {
1100                 mlog_errno(ret);
1101                 goto out;
1102         }
1103
1104         ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1105                                          *bit_off, *num_bits);
1106         if (ret < 0)
1107                 mlog_errno(ret);
1108
1109         *bits_left = le16_to_cpu(gd->bg_free_bits_count);
1110
1111 out:
1112         brelse(group_bh);
1113
1114         return ret;
1115 }
1116
1117 static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1118                               u32 bits_wanted,
1119                               u32 min_bits,
1120                               u16 *bit_off,
1121                               unsigned int *num_bits,
1122                               u64 *bg_blkno,
1123                               u16 *bits_left)
1124 {
1125         int status;
1126         u16 chain, tmp_bits;
1127         u32 tmp_used;
1128         u64 next_group;
1129         struct ocfs2_journal_handle *handle = ac->ac_handle;
1130         struct inode *alloc_inode = ac->ac_inode;
1131         struct buffer_head *group_bh = NULL;
1132         struct buffer_head *prev_group_bh = NULL;
1133         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1134         struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1135         struct ocfs2_group_desc *bg;
1136
1137         chain = ac->ac_chain;
1138         mlog(0, "trying to alloc %u bits from chain %u, inode %llu\n",
1139              bits_wanted, chain,
1140              (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
1141
1142         status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
1143                                   le64_to_cpu(cl->cl_recs[chain].c_blkno),
1144                                   &group_bh, OCFS2_BH_CACHED, alloc_inode);
1145         if (status < 0) {
1146                 mlog_errno(status);
1147                 goto bail;
1148         }
1149         bg = (struct ocfs2_group_desc *) group_bh->b_data;
1150         status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
1151         if (status) {
1152                 mlog_errno(status);
1153                 goto bail;
1154         }
1155
1156         status = -ENOSPC;
1157         /* for now, the chain search is a bit simplistic. We just use
1158          * the 1st group with any empty bits. */
1159         while ((status = ac->ac_group_search(alloc_inode, group_bh,
1160                                              bits_wanted, min_bits, bit_off,
1161                                              &tmp_bits)) == -ENOSPC) {
1162                 if (!bg->bg_next_group)
1163                         break;
1164
1165                 if (prev_group_bh) {
1166                         brelse(prev_group_bh);
1167                         prev_group_bh = NULL;
1168                 }
1169                 next_group = le64_to_cpu(bg->bg_next_group);
1170                 prev_group_bh = group_bh;
1171                 group_bh = NULL;
1172                 status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
1173                                           next_group, &group_bh,
1174                                           OCFS2_BH_CACHED, alloc_inode);
1175                 if (status < 0) {
1176                         mlog_errno(status);
1177                         goto bail;
1178                 }
1179                 bg = (struct ocfs2_group_desc *) group_bh->b_data;
1180                 status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
1181                 if (status) {
1182                         mlog_errno(status);
1183                         goto bail;
1184                 }
1185         }
1186         if (status < 0) {
1187                 if (status != -ENOSPC)
1188                         mlog_errno(status);
1189                 goto bail;
1190         }
1191
1192         mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
1193              tmp_bits, (unsigned long long)bg->bg_blkno);
1194
1195         *num_bits = tmp_bits;
1196
1197         BUG_ON(*num_bits == 0);
1198
1199         /*
1200          * Keep track of previous block descriptor read. When
1201          * we find a target, if we have read more than X
1202          * number of descriptors, and the target is reasonably
1203          * empty, relink him to top of his chain.
1204          *
1205          * We've read 0 extra blocks and only send one more to
1206          * the transaction, yet the next guy to search has a
1207          * much easier time.
1208          *
1209          * Do this *after* figuring out how many bits we're taking out
1210          * of our target group.
1211          */
1212         if (ac->ac_allow_chain_relink &&
1213             (prev_group_bh) &&
1214             (ocfs2_block_group_reasonably_empty(bg, *num_bits))) {
1215                 status = ocfs2_relink_block_group(handle, alloc_inode,
1216                                                   ac->ac_bh, group_bh,
1217                                                   prev_group_bh, chain);
1218                 if (status < 0) {
1219                         mlog_errno(status);
1220                         goto bail;
1221                 }
1222         }
1223
1224         /* Ok, claim our bits now: set the info on dinode, chainlist
1225          * and then the group */
1226         status = ocfs2_journal_access(handle,
1227                                       alloc_inode,
1228                                       ac->ac_bh,
1229                                       OCFS2_JOURNAL_ACCESS_WRITE);
1230         if (status < 0) {
1231                 mlog_errno(status);
1232                 goto bail;
1233         }
1234
1235         tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1236         fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used);
1237         le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits));
1238
1239         status = ocfs2_journal_dirty(handle,
1240                                      ac->ac_bh);
1241         if (status < 0) {
1242                 mlog_errno(status);
1243                 goto bail;
1244         }
1245
1246         status = ocfs2_block_group_set_bits(handle,
1247                                             alloc_inode,
1248                                             bg,
1249                                             group_bh,
1250                                             *bit_off,
1251                                             *num_bits);
1252         if (status < 0) {
1253                 mlog_errno(status);
1254                 goto bail;
1255         }
1256
1257         mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits,
1258              (unsigned long long)fe->i_blkno);
1259
1260         *bg_blkno = le64_to_cpu(bg->bg_blkno);
1261         *bits_left = le16_to_cpu(bg->bg_free_bits_count);
1262 bail:
1263         if (group_bh)
1264                 brelse(group_bh);
1265         if (prev_group_bh)
1266                 brelse(prev_group_bh);
1267
1268         mlog_exit(status);
1269         return status;
1270 }
1271
1272 /* will give out up to bits_wanted contiguous bits. */
1273 static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1274                                      struct ocfs2_alloc_context *ac,
1275                                      u32 bits_wanted,
1276                                      u32 min_bits,
1277                                      u16 *bit_off,
1278                                      unsigned int *num_bits,
1279                                      u64 *bg_blkno)
1280 {
1281         int status;
1282         u16 victim, i;
1283         u16 bits_left = 0;
1284         u64 hint_blkno = ac->ac_last_group;
1285         struct ocfs2_chain_list *cl;
1286         struct ocfs2_dinode *fe;
1287
1288         mlog_entry_void();
1289
1290         BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1291         BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
1292         BUG_ON(!ac->ac_bh);
1293
1294         fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1295         if (!OCFS2_IS_VALID_DINODE(fe)) {
1296                 OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe);
1297                 status = -EIO;
1298                 goto bail;
1299         }
1300         if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1301             le32_to_cpu(fe->id1.bitmap1.i_total)) {
1302                 ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
1303                             "bits but only %u total.",
1304                             (unsigned long long)le64_to_cpu(fe->i_blkno),
1305                             le32_to_cpu(fe->id1.bitmap1.i_used),
1306                             le32_to_cpu(fe->id1.bitmap1.i_total));
1307                 status = -EIO;
1308                 goto bail;
1309         }
1310
1311         if (hint_blkno) {
1312                 /* Attempt to short-circuit the usual search mechanism
1313                  * by jumping straight to the most recently used
1314                  * allocation group. This helps us mantain some
1315                  * contiguousness across allocations. */
1316                 status = ocfs2_search_one_group(ac, bits_wanted, min_bits,
1317                                                 bit_off, num_bits,
1318                                                 hint_blkno, &bits_left);
1319                 if (!status) {
1320                         /* Be careful to update *bg_blkno here as the
1321                          * caller is expecting it to be filled in, and
1322                          * ocfs2_search_one_group() won't do that for
1323                          * us. */
1324                         *bg_blkno = hint_blkno;
1325                         goto set_hint;
1326                 }
1327                 if (status < 0 && status != -ENOSPC) {
1328                         mlog_errno(status);
1329                         goto bail;
1330                 }
1331         }
1332
1333         cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1334
1335         victim = ocfs2_find_victim_chain(cl);
1336         ac->ac_chain = victim;
1337         ac->ac_allow_chain_relink = 1;
1338
1339         status = ocfs2_search_chain(ac, bits_wanted, min_bits, bit_off,
1340                                     num_bits, bg_blkno, &bits_left);
1341         if (!status)
1342                 goto set_hint;
1343         if (status < 0 && status != -ENOSPC) {
1344                 mlog_errno(status);
1345                 goto bail;
1346         }
1347
1348         mlog(0, "Search of victim chain %u came up with nothing, "
1349              "trying all chains now.\n", victim);
1350
1351         /* If we didn't pick a good victim, then just default to
1352          * searching each chain in order. Don't allow chain relinking
1353          * because we only calculate enough journal credits for one
1354          * relink per alloc. */
1355         ac->ac_allow_chain_relink = 0;
1356         for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
1357                 if (i == victim)
1358                         continue;
1359                 if (!cl->cl_recs[i].c_free)
1360                         continue;
1361
1362                 ac->ac_chain = i;
1363                 status = ocfs2_search_chain(ac, bits_wanted, min_bits,
1364                                             bit_off, num_bits, bg_blkno,
1365                                             &bits_left);
1366                 if (!status)
1367                         break;
1368                 if (status < 0 && status != -ENOSPC) {
1369                         mlog_errno(status);
1370                         goto bail;
1371                 }
1372         }
1373
1374 set_hint:
1375         if (status != -ENOSPC) {
1376                 /* If the next search of this group is not likely to
1377                  * yield a suitable extent, then we reset the last
1378                  * group hint so as to not waste a disk read */
1379                 if (bits_left < min_bits)
1380                         ac->ac_last_group = 0;
1381                 else
1382                         ac->ac_last_group = *bg_blkno;
1383         }
1384
1385 bail:
1386         mlog_exit(status);
1387         return status;
1388 }
1389
1390 int ocfs2_claim_metadata(struct ocfs2_super *osb,
1391                          struct ocfs2_journal_handle *handle,
1392                          struct ocfs2_alloc_context *ac,
1393                          u32 bits_wanted,
1394                          u16 *suballoc_bit_start,
1395                          unsigned int *num_bits,
1396                          u64 *blkno_start)
1397 {
1398         int status;
1399         u64 bg_blkno;
1400
1401         BUG_ON(!ac);
1402         BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
1403         BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
1404         BUG_ON(ac->ac_handle != handle);
1405
1406         status = ocfs2_claim_suballoc_bits(osb,
1407                                            ac,
1408                                            bits_wanted,
1409                                            1,
1410                                            suballoc_bit_start,
1411                                            num_bits,
1412                                            &bg_blkno);
1413         if (status < 0) {
1414                 mlog_errno(status);
1415                 goto bail;
1416         }
1417         atomic_inc(&osb->alloc_stats.bg_allocs);
1418
1419         *blkno_start = bg_blkno + (u64) *suballoc_bit_start;
1420         ac->ac_bits_given += (*num_bits);
1421         status = 0;
1422 bail:
1423         mlog_exit(status);
1424         return status;
1425 }
1426
1427 int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1428                           struct ocfs2_journal_handle *handle,
1429                           struct ocfs2_alloc_context *ac,
1430                           u16 *suballoc_bit,
1431                           u64 *fe_blkno)
1432 {
1433         int status;
1434         unsigned int num_bits;
1435         u64 bg_blkno;
1436
1437         mlog_entry_void();
1438
1439         BUG_ON(!ac);
1440         BUG_ON(ac->ac_bits_given != 0);
1441         BUG_ON(ac->ac_bits_wanted != 1);
1442         BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
1443         BUG_ON(ac->ac_handle != handle);
1444
1445         status = ocfs2_claim_suballoc_bits(osb,
1446                                            ac,
1447                                            1,
1448                                            1,
1449                                            suballoc_bit,
1450                                            &num_bits,
1451                                            &bg_blkno);
1452         if (status < 0) {
1453                 mlog_errno(status);
1454                 goto bail;
1455         }
1456         atomic_inc(&osb->alloc_stats.bg_allocs);
1457
1458         BUG_ON(num_bits != 1);
1459
1460         *fe_blkno = bg_blkno + (u64) (*suballoc_bit);
1461         ac->ac_bits_given++;
1462         status = 0;
1463 bail:
1464         mlog_exit(status);
1465         return status;
1466 }
1467
1468 /* translate a group desc. blkno and it's bitmap offset into
1469  * disk cluster offset. */
1470 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
1471                                                    u64 bg_blkno,
1472                                                    u16 bg_bit_off)
1473 {
1474         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1475         u32 cluster = 0;
1476
1477         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1478
1479         if (bg_blkno != osb->first_cluster_group_blkno)
1480                 cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
1481         cluster += (u32) bg_bit_off;
1482         return cluster;
1483 }
1484
1485 /* given a cluster offset, calculate which block group it belongs to
1486  * and return that block offset. */
1487 static inline u64 ocfs2_which_cluster_group(struct inode *inode,
1488                                             u32 cluster)
1489 {
1490         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1491         u32 group_no;
1492
1493         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1494
1495         group_no = cluster / osb->bitmap_cpg;
1496         if (!group_no)
1497                 return osb->first_cluster_group_blkno;
1498         return ocfs2_clusters_to_blocks(inode->i_sb,
1499                                         group_no * osb->bitmap_cpg);
1500 }
1501
1502 /* given the block number of a cluster start, calculate which cluster
1503  * group and descriptor bitmap offset that corresponds to. */
1504 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
1505                                                 u64 data_blkno,
1506                                                 u64 *bg_blkno,
1507                                                 u16 *bg_bit_off)
1508 {
1509         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1510         u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
1511
1512         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1513
1514         *bg_blkno = ocfs2_which_cluster_group(inode,
1515                                               data_cluster);
1516
1517         if (*bg_blkno == osb->first_cluster_group_blkno)
1518                 *bg_bit_off = (u16) data_cluster;
1519         else
1520                 *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
1521                                                              data_blkno - *bg_blkno);
1522 }
1523
1524 /*
1525  * min_bits - minimum contiguous chunk from this total allocation we
1526  * can handle. set to what we asked for originally for a full
1527  * contig. allocation, set to '1' to indicate we can deal with extents
1528  * of any size.
1529  */
1530 int ocfs2_claim_clusters(struct ocfs2_super *osb,
1531                          struct ocfs2_journal_handle *handle,
1532                          struct ocfs2_alloc_context *ac,
1533                          u32 min_clusters,
1534                          u32 *cluster_start,
1535                          u32 *num_clusters)
1536 {
1537         int status;
1538         unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
1539         u64 bg_blkno = 0;
1540         u16 bg_bit_off;
1541
1542         mlog_entry_void();
1543
1544         BUG_ON(!ac);
1545         BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1546
1547         BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
1548                && ac->ac_which != OCFS2_AC_USE_MAIN);
1549         BUG_ON(ac->ac_handle != handle);
1550
1551         if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
1552                 status = ocfs2_claim_local_alloc_bits(osb,
1553                                                       handle,
1554                                                       ac,
1555                                                       bits_wanted,
1556                                                       cluster_start,
1557                                                       num_clusters);
1558                 if (!status)
1559                         atomic_inc(&osb->alloc_stats.local_data);
1560         } else {
1561                 if (min_clusters > (osb->bitmap_cpg - 1)) {
1562                         /* The only paths asking for contiguousness
1563                          * should know about this already. */
1564                         mlog(ML_ERROR, "minimum allocation requested exceeds "
1565                                        "group bitmap size!");
1566                         status = -ENOSPC;
1567                         goto bail;
1568                 }
1569                 /* clamp the current request down to a realistic size. */
1570                 if (bits_wanted > (osb->bitmap_cpg - 1))
1571                         bits_wanted = osb->bitmap_cpg - 1;
1572
1573                 status = ocfs2_claim_suballoc_bits(osb,
1574                                                    ac,
1575                                                    bits_wanted,
1576                                                    min_clusters,
1577                                                    &bg_bit_off,
1578                                                    num_clusters,
1579                                                    &bg_blkno);
1580                 if (!status) {
1581                         *cluster_start =
1582                                 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
1583                                                                  bg_blkno,
1584                                                                  bg_bit_off);
1585                         atomic_inc(&osb->alloc_stats.bitmap_data);
1586                 }
1587         }
1588         if (status < 0) {
1589                 if (status != -ENOSPC)
1590                         mlog_errno(status);
1591                 goto bail;
1592         }
1593
1594         ac->ac_bits_given += *num_clusters;
1595
1596 bail:
1597         mlog_exit(status);
1598         return status;
1599 }
1600
1601 static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle,
1602                                                struct inode *alloc_inode,
1603                                                struct ocfs2_group_desc *bg,
1604                                                struct buffer_head *group_bh,
1605                                                unsigned int bit_off,
1606                                                unsigned int num_bits)
1607 {
1608         int status;
1609         unsigned int tmp;
1610         int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1611         struct ocfs2_group_desc *undo_bg = NULL;
1612
1613         mlog_entry_void();
1614
1615         if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
1616                 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
1617                 status = -EIO;
1618                 goto bail;
1619         }
1620
1621         mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
1622
1623         if (ocfs2_is_cluster_bitmap(alloc_inode))
1624                 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1625
1626         status = ocfs2_journal_access(handle, alloc_inode, group_bh,
1627                                       journal_type);
1628         if (status < 0) {
1629                 mlog_errno(status);
1630                 goto bail;
1631         }
1632
1633         if (ocfs2_is_cluster_bitmap(alloc_inode))
1634                 undo_bg = (struct ocfs2_group_desc *) bh2jh(group_bh)->b_committed_data;
1635
1636         tmp = num_bits;
1637         while(tmp--) {
1638                 ocfs2_clear_bit((bit_off + tmp),
1639                                 (unsigned long *) bg->bg_bitmap);
1640                 if (ocfs2_is_cluster_bitmap(alloc_inode))
1641                         ocfs2_set_bit(bit_off + tmp,
1642                                       (unsigned long *) undo_bg->bg_bitmap);
1643         }
1644         le16_add_cpu(&bg->bg_free_bits_count, num_bits);
1645
1646         status = ocfs2_journal_dirty(handle, group_bh);
1647         if (status < 0)
1648                 mlog_errno(status);
1649 bail:
1650         return status;
1651 }
1652
1653 /*
1654  * expects the suballoc inode to already be locked.
1655  */
1656 static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle,
1657                                     struct inode *alloc_inode,
1658                                     struct buffer_head *alloc_bh,
1659                                     unsigned int start_bit,
1660                                     u64 bg_blkno,
1661                                     unsigned int count)
1662 {
1663         int status = 0;
1664         u32 tmp_used;
1665         struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
1666         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
1667         struct ocfs2_chain_list *cl = &fe->id2.i_chain;
1668         struct buffer_head *group_bh = NULL;
1669         struct ocfs2_group_desc *group;
1670
1671         mlog_entry_void();
1672
1673         if (!OCFS2_IS_VALID_DINODE(fe)) {
1674                 OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
1675                 status = -EIO;
1676                 goto bail;
1677         }
1678         BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
1679
1680         mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
1681              (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
1682              (unsigned long long)bg_blkno, start_bit);
1683
1684         status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED,
1685                                   alloc_inode);
1686         if (status < 0) {
1687                 mlog_errno(status);
1688                 goto bail;
1689         }
1690
1691         group = (struct ocfs2_group_desc *) group_bh->b_data;
1692         status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, group);
1693         if (status) {
1694                 mlog_errno(status);
1695                 goto bail;
1696         }
1697         BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
1698
1699         status = ocfs2_block_group_clear_bits(handle, alloc_inode,
1700                                               group, group_bh,
1701                                               start_bit, count);
1702         if (status < 0) {
1703                 mlog_errno(status);
1704                 goto bail;
1705         }
1706
1707         status = ocfs2_journal_access(handle, alloc_inode, alloc_bh,
1708                                       OCFS2_JOURNAL_ACCESS_WRITE);
1709         if (status < 0) {
1710                 mlog_errno(status);
1711                 goto bail;
1712         }
1713
1714         le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
1715                      count);
1716         tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1717         fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
1718
1719         status = ocfs2_journal_dirty(handle, alloc_bh);
1720         if (status < 0) {
1721                 mlog_errno(status);
1722                 goto bail;
1723         }
1724
1725 bail:
1726         if (group_bh)
1727                 brelse(group_bh);
1728
1729         mlog_exit(status);
1730         return status;
1731 }
1732
1733 static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
1734 {
1735         u64 group = block - (u64) bit;
1736
1737         return group;
1738 }
1739
1740 int ocfs2_free_dinode(struct ocfs2_journal_handle *handle,
1741                       struct inode *inode_alloc_inode,
1742                       struct buffer_head *inode_alloc_bh,
1743                       struct ocfs2_dinode *di)
1744 {
1745         u64 blk = le64_to_cpu(di->i_blkno);
1746         u16 bit = le16_to_cpu(di->i_suballoc_bit);
1747         u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1748
1749         return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
1750                                         inode_alloc_bh, bit, bg_blkno, 1);
1751 }
1752
1753 int ocfs2_free_extent_block(struct ocfs2_journal_handle *handle,
1754                             struct inode *eb_alloc_inode,
1755                             struct buffer_head *eb_alloc_bh,
1756                             struct ocfs2_extent_block *eb)
1757 {
1758         u64 blk = le64_to_cpu(eb->h_blkno);
1759         u16 bit = le16_to_cpu(eb->h_suballoc_bit);
1760         u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1761
1762         return ocfs2_free_suballoc_bits(handle, eb_alloc_inode, eb_alloc_bh,
1763                                         bit, bg_blkno, 1);
1764 }
1765
1766 int ocfs2_free_clusters(struct ocfs2_journal_handle *handle,
1767                        struct inode *bitmap_inode,
1768                        struct buffer_head *bitmap_bh,
1769                        u64 start_blk,
1770                        unsigned int num_clusters)
1771 {
1772         int status;
1773         u16 bg_start_bit;
1774         u64 bg_blkno;
1775         struct ocfs2_dinode *fe;
1776
1777         /* You can't ever have a contiguous set of clusters
1778          * bigger than a block group bitmap so we never have to worry
1779          * about looping on them. */
1780
1781         mlog_entry_void();
1782
1783         /* This is expensive. We can safely remove once this stuff has
1784          * gotten tested really well. */
1785         BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
1786
1787         fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
1788
1789         ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
1790                                      &bg_start_bit);
1791
1792         mlog(0, "want to free %u clusters starting at block %llu\n",
1793              num_clusters, (unsigned long long)start_blk);
1794         mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
1795              (unsigned long long)bg_blkno, bg_start_bit);
1796
1797         status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
1798                                           bg_start_bit, bg_blkno,
1799                                           num_clusters);
1800         if (status < 0)
1801                 mlog_errno(status);
1802
1803         mlog_exit(status);
1804         return status;
1805 }
1806
1807 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
1808 {
1809         printk("Block Group:\n");
1810         printk("bg_signature:       %s\n", bg->bg_signature);
1811         printk("bg_size:            %u\n", bg->bg_size);
1812         printk("bg_bits:            %u\n", bg->bg_bits);
1813         printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
1814         printk("bg_chain:           %u\n", bg->bg_chain);
1815         printk("bg_generation:      %u\n", le32_to_cpu(bg->bg_generation));
1816         printk("bg_next_group:      %llu\n",
1817                (unsigned long long)bg->bg_next_group);
1818         printk("bg_parent_dinode:   %llu\n",
1819                (unsigned long long)bg->bg_parent_dinode);
1820         printk("bg_blkno:           %llu\n",
1821                (unsigned long long)bg->bg_blkno);
1822 }
1823
1824 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
1825 {
1826         int i;
1827
1828         printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno);
1829         printk("i_signature:                  %s\n", fe->i_signature);
1830         printk("i_size:                       %llu\n",
1831                (unsigned long long)fe->i_size);
1832         printk("i_clusters:                   %u\n", fe->i_clusters);
1833         printk("i_generation:                 %u\n",
1834                le32_to_cpu(fe->i_generation));
1835         printk("id1.bitmap1.i_used:           %u\n",
1836                le32_to_cpu(fe->id1.bitmap1.i_used));
1837         printk("id1.bitmap1.i_total:          %u\n",
1838                le32_to_cpu(fe->id1.bitmap1.i_total));
1839         printk("id2.i_chain.cl_cpg:           %u\n", fe->id2.i_chain.cl_cpg);
1840         printk("id2.i_chain.cl_bpc:           %u\n", fe->id2.i_chain.cl_bpc);
1841         printk("id2.i_chain.cl_count:         %u\n", fe->id2.i_chain.cl_count);
1842         printk("id2.i_chain.cl_next_free_rec: %u\n",
1843                fe->id2.i_chain.cl_next_free_rec);
1844         for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) {
1845                 printk("fe->id2.i_chain.cl_recs[%d].c_free:  %u\n", i,
1846                        fe->id2.i_chain.cl_recs[i].c_free);
1847                 printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
1848                        fe->id2.i_chain.cl_recs[i].c_total);
1849                 printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i,
1850                        (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);
1851         }
1852 }