Merge git://git.kernel.org/pub/scm/linux/kernel/git/steve/gfs2-2.6-nmw
[pandora-kernel.git] / fs / ocfs2 / suballoc.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * suballoc.c
5  *
6  * metadata alloc and free
7  * Inspired by ext3 block groups.
8  *
9  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
10  *
11  * This program is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU General Public
13  * License as published by the Free Software Foundation; either
14  * version 2 of the License, or (at your option) any later version.
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public
22  * License along with this program; if not, write to the
23  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24  * Boston, MA 02111-1307, USA.
25  */
26
27 #include <linux/fs.h>
28 #include <linux/types.h>
29 #include <linux/slab.h>
30 #include <linux/highmem.h>
31
32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
33 #include <cluster/masklog.h>
34
35 #include "ocfs2.h"
36
37 #include "alloc.h"
38 #include "blockcheck.h"
39 #include "dlmglue.h"
40 #include "inode.h"
41 #include "journal.h"
42 #include "localalloc.h"
43 #include "suballoc.h"
44 #include "super.h"
45 #include "sysfile.h"
46 #include "uptodate.h"
47
48 #include "buffer_head_io.h"
49
50 #define NOT_ALLOC_NEW_GROUP             0
51 #define ALLOC_NEW_GROUP                 0x1
52 #define ALLOC_GROUPS_FROM_GLOBAL        0x2
53
54 #define OCFS2_MAX_TO_STEAL              1024
55
56 struct ocfs2_suballoc_result {
57         u64             sr_bg_blkno;    /* The bg we allocated from.  Set
58                                            to 0 when a block group is
59                                            contiguous. */
60         u64             sr_bg_stable_blkno; /*
61                                              * Doesn't change, always
62                                              * set to target block
63                                              * group descriptor
64                                              * block.
65                                              */
66         u64             sr_blkno;       /* The first allocated block */
67         unsigned int    sr_bit_offset;  /* The bit in the bg */
68         unsigned int    sr_bits;        /* How many bits we claimed */
69 };
70
71 static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
72 {
73         if (res->sr_blkno == 0)
74                 return 0;
75
76         if (res->sr_bg_blkno)
77                 return res->sr_bg_blkno;
78
79         return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset);
80 }
81
82 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
83 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
84 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
85 static int ocfs2_block_group_fill(handle_t *handle,
86                                   struct inode *alloc_inode,
87                                   struct buffer_head *bg_bh,
88                                   u64 group_blkno,
89                                   unsigned int group_clusters,
90                                   u16 my_chain,
91                                   struct ocfs2_chain_list *cl);
92 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
93                                    struct inode *alloc_inode,
94                                    struct buffer_head *bh,
95                                    u64 max_block,
96                                    u64 *last_alloc_group,
97                                    int flags);
98
99 static int ocfs2_cluster_group_search(struct inode *inode,
100                                       struct buffer_head *group_bh,
101                                       u32 bits_wanted, u32 min_bits,
102                                       u64 max_block,
103                                       struct ocfs2_suballoc_result *res);
104 static int ocfs2_block_group_search(struct inode *inode,
105                                     struct buffer_head *group_bh,
106                                     u32 bits_wanted, u32 min_bits,
107                                     u64 max_block,
108                                     struct ocfs2_suballoc_result *res);
109 static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
110                                      handle_t *handle,
111                                      u32 bits_wanted,
112                                      u32 min_bits,
113                                      struct ocfs2_suballoc_result *res);
114 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
115                                          int nr);
116 static inline int ocfs2_block_group_set_bits(handle_t *handle,
117                                              struct inode *alloc_inode,
118                                              struct ocfs2_group_desc *bg,
119                                              struct buffer_head *group_bh,
120                                              unsigned int bit_off,
121                                              unsigned int num_bits);
122 static int ocfs2_relink_block_group(handle_t *handle,
123                                     struct inode *alloc_inode,
124                                     struct buffer_head *fe_bh,
125                                     struct buffer_head *bg_bh,
126                                     struct buffer_head *prev_bg_bh,
127                                     u16 chain);
128 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
129                                                      u32 wanted);
130 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
131                                                    u64 bg_blkno,
132                                                    u16 bg_bit_off);
133 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
134                                                 u64 data_blkno,
135                                                 u64 *bg_blkno,
136                                                 u16 *bg_bit_off);
137 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
138                                              u32 bits_wanted, u64 max_block,
139                                              int flags,
140                                              struct ocfs2_alloc_context **ac);
141
142 void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
143 {
144         struct inode *inode = ac->ac_inode;
145
146         if (inode) {
147                 if (ac->ac_which != OCFS2_AC_USE_LOCAL)
148                         ocfs2_inode_unlock(inode, 1);
149
150                 mutex_unlock(&inode->i_mutex);
151
152                 iput(inode);
153                 ac->ac_inode = NULL;
154         }
155         brelse(ac->ac_bh);
156         ac->ac_bh = NULL;
157         ac->ac_resv = NULL;
158         if (ac->ac_find_loc_priv) {
159                 kfree(ac->ac_find_loc_priv);
160                 ac->ac_find_loc_priv = NULL;
161         }
162 }
163
164 void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
165 {
166         ocfs2_free_ac_resource(ac);
167         kfree(ac);
168 }
169
170 static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
171 {
172         return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
173 }
174
175 #define do_error(fmt, ...)                                              \
176         do{                                                             \
177                 if (resize)                                     \
178                         mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__);        \
179                 else                                                    \
180                         ocfs2_error(sb, fmt, ##__VA_ARGS__);            \
181         } while (0)
182
183 static int ocfs2_validate_gd_self(struct super_block *sb,
184                                   struct buffer_head *bh,
185                                   int resize)
186 {
187         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
188
189         if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
190                 do_error("Group descriptor #%llu has bad signature %.*s",
191                          (unsigned long long)bh->b_blocknr, 7,
192                          gd->bg_signature);
193                 return -EINVAL;
194         }
195
196         if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
197                 do_error("Group descriptor #%llu has an invalid bg_blkno "
198                          "of %llu",
199                          (unsigned long long)bh->b_blocknr,
200                          (unsigned long long)le64_to_cpu(gd->bg_blkno));
201                 return -EINVAL;
202         }
203
204         if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
205                 do_error("Group descriptor #%llu has an invalid "
206                          "fs_generation of #%u",
207                          (unsigned long long)bh->b_blocknr,
208                          le32_to_cpu(gd->bg_generation));
209                 return -EINVAL;
210         }
211
212         if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
213                 do_error("Group descriptor #%llu has bit count %u but "
214                          "claims that %u are free",
215                          (unsigned long long)bh->b_blocknr,
216                          le16_to_cpu(gd->bg_bits),
217                          le16_to_cpu(gd->bg_free_bits_count));
218                 return -EINVAL;
219         }
220
221         if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
222                 do_error("Group descriptor #%llu has bit count %u but "
223                          "max bitmap bits of %u",
224                          (unsigned long long)bh->b_blocknr,
225                          le16_to_cpu(gd->bg_bits),
226                          8 * le16_to_cpu(gd->bg_size));
227                 return -EINVAL;
228         }
229
230         return 0;
231 }
232
233 static int ocfs2_validate_gd_parent(struct super_block *sb,
234                                     struct ocfs2_dinode *di,
235                                     struct buffer_head *bh,
236                                     int resize)
237 {
238         unsigned int max_bits;
239         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
240
241         if (di->i_blkno != gd->bg_parent_dinode) {
242                 do_error("Group descriptor #%llu has bad parent "
243                          "pointer (%llu, expected %llu)",
244                          (unsigned long long)bh->b_blocknr,
245                          (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
246                          (unsigned long long)le64_to_cpu(di->i_blkno));
247                 return -EINVAL;
248         }
249
250         max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
251         if (le16_to_cpu(gd->bg_bits) > max_bits) {
252                 do_error("Group descriptor #%llu has bit count of %u",
253                          (unsigned long long)bh->b_blocknr,
254                          le16_to_cpu(gd->bg_bits));
255                 return -EINVAL;
256         }
257
258         /* In resize, we may meet the case bg_chain == cl_next_free_rec. */
259         if ((le16_to_cpu(gd->bg_chain) >
260              le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
261             ((le16_to_cpu(gd->bg_chain) ==
262              le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
263                 do_error("Group descriptor #%llu has bad chain %u",
264                          (unsigned long long)bh->b_blocknr,
265                          le16_to_cpu(gd->bg_chain));
266                 return -EINVAL;
267         }
268
269         return 0;
270 }
271
272 #undef do_error
273
274 /*
275  * This version only prints errors.  It does not fail the filesystem, and
276  * exists only for resize.
277  */
278 int ocfs2_check_group_descriptor(struct super_block *sb,
279                                  struct ocfs2_dinode *di,
280                                  struct buffer_head *bh)
281 {
282         int rc;
283         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
284
285         BUG_ON(!buffer_uptodate(bh));
286
287         /*
288          * If the ecc fails, we return the error but otherwise
289          * leave the filesystem running.  We know any error is
290          * local to this block.
291          */
292         rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
293         if (rc) {
294                 mlog(ML_ERROR,
295                      "Checksum failed for group descriptor %llu\n",
296                      (unsigned long long)bh->b_blocknr);
297         } else
298                 rc = ocfs2_validate_gd_self(sb, bh, 1);
299         if (!rc)
300                 rc = ocfs2_validate_gd_parent(sb, di, bh, 1);
301
302         return rc;
303 }
304
305 static int ocfs2_validate_group_descriptor(struct super_block *sb,
306                                            struct buffer_head *bh)
307 {
308         int rc;
309         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
310
311         mlog(0, "Validating group descriptor %llu\n",
312              (unsigned long long)bh->b_blocknr);
313
314         BUG_ON(!buffer_uptodate(bh));
315
316         /*
317          * If the ecc fails, we return the error but otherwise
318          * leave the filesystem running.  We know any error is
319          * local to this block.
320          */
321         rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
322         if (rc)
323                 return rc;
324
325         /*
326          * Errors after here are fatal.
327          */
328
329         return ocfs2_validate_gd_self(sb, bh, 0);
330 }
331
332 int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
333                                 u64 gd_blkno, struct buffer_head **bh)
334 {
335         int rc;
336         struct buffer_head *tmp = *bh;
337
338         rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp,
339                               ocfs2_validate_group_descriptor);
340         if (rc)
341                 goto out;
342
343         rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
344         if (rc) {
345                 brelse(tmp);
346                 goto out;
347         }
348
349         /* If ocfs2_read_block() got us a new bh, pass it up. */
350         if (!*bh)
351                 *bh = tmp;
352
353 out:
354         return rc;
355 }
356
357 static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
358                                           struct ocfs2_group_desc *bg,
359                                           struct ocfs2_chain_list *cl,
360                                           u64 p_blkno, unsigned int clusters)
361 {
362         struct ocfs2_extent_list *el = &bg->bg_list;
363         struct ocfs2_extent_rec *rec;
364
365         BUG_ON(!ocfs2_supports_discontig_bg(osb));
366         if (!el->l_next_free_rec)
367                 el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb));
368         rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)];
369         rec->e_blkno = cpu_to_le64(p_blkno);
370         rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
371                                   le16_to_cpu(cl->cl_bpc));
372         rec->e_leaf_clusters = cpu_to_le16(clusters);
373         le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
374         le16_add_cpu(&bg->bg_free_bits_count,
375                      clusters * le16_to_cpu(cl->cl_bpc));
376         le16_add_cpu(&el->l_next_free_rec, 1);
377 }
378
379 static int ocfs2_block_group_fill(handle_t *handle,
380                                   struct inode *alloc_inode,
381                                   struct buffer_head *bg_bh,
382                                   u64 group_blkno,
383                                   unsigned int group_clusters,
384                                   u16 my_chain,
385                                   struct ocfs2_chain_list *cl)
386 {
387         int status = 0;
388         struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
389         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
390         struct super_block * sb = alloc_inode->i_sb;
391
392         mlog_entry_void();
393
394         if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
395                 ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
396                             "b_blocknr (%llu)",
397                             (unsigned long long)group_blkno,
398                             (unsigned long long) bg_bh->b_blocknr);
399                 status = -EIO;
400                 goto bail;
401         }
402
403         status = ocfs2_journal_access_gd(handle,
404                                          INODE_CACHE(alloc_inode),
405                                          bg_bh,
406                                          OCFS2_JOURNAL_ACCESS_CREATE);
407         if (status < 0) {
408                 mlog_errno(status);
409                 goto bail;
410         }
411
412         memset(bg, 0, sb->s_blocksize);
413         strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
414         bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
415         bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
416                                                 osb->s_feature_incompat));
417         bg->bg_chain = cpu_to_le16(my_chain);
418         bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
419         bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
420         bg->bg_blkno = cpu_to_le64(group_blkno);
421         if (group_clusters == le16_to_cpu(cl->cl_cpg))
422                 bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
423         else
424                 ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno,
425                                               group_clusters);
426
427         /* set the 1st bit in the bitmap to account for the descriptor block */
428         ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
429         bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
430
431         ocfs2_journal_dirty(handle, bg_bh);
432
433         /* There is no need to zero out or otherwise initialize the
434          * other blocks in a group - All valid FS metadata in a block
435          * group stores the superblock fs_generation value at
436          * allocation time. */
437
438 bail:
439         mlog_exit(status);
440         return status;
441 }
442
443 static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
444 {
445         u16 curr, best;
446
447         best = curr = 0;
448         while (curr < le16_to_cpu(cl->cl_count)) {
449                 if (le32_to_cpu(cl->cl_recs[best].c_total) >
450                     le32_to_cpu(cl->cl_recs[curr].c_total))
451                         best = curr;
452                 curr++;
453         }
454         return best;
455 }
456
457 static struct buffer_head *
458 ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
459                                struct inode *alloc_inode,
460                                struct ocfs2_alloc_context *ac,
461                                struct ocfs2_chain_list *cl)
462 {
463         int status;
464         u32 bit_off, num_bits;
465         u64 bg_blkno;
466         struct buffer_head *bg_bh;
467         unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
468
469         status = ocfs2_claim_clusters(handle, ac,
470                                       le16_to_cpu(cl->cl_cpg), &bit_off,
471                                       &num_bits);
472         if (status < 0) {
473                 if (status != -ENOSPC)
474                         mlog_errno(status);
475                 goto bail;
476         }
477
478         /* setup the group */
479         bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
480         mlog(0, "new descriptor, record %u, at block %llu\n",
481              alloc_rec, (unsigned long long)bg_blkno);
482
483         bg_bh = sb_getblk(osb->sb, bg_blkno);
484         if (!bg_bh) {
485                 status = -EIO;
486                 mlog_errno(status);
487                 goto bail;
488         }
489         ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
490
491         status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
492                                         bg_blkno, num_bits, alloc_rec, cl);
493         if (status < 0) {
494                 brelse(bg_bh);
495                 mlog_errno(status);
496         }
497
498 bail:
499         return status ? ERR_PTR(status) : bg_bh;
500 }
501
502 static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb,
503                                         handle_t *handle,
504                                         struct ocfs2_alloc_context *ac,
505                                         unsigned int min_bits,
506                                         u32 *bit_off, u32 *num_bits)
507 {
508         int status = 0;
509
510         while (min_bits) {
511                 status = ocfs2_claim_clusters(handle, ac, min_bits,
512                                               bit_off, num_bits);
513                 if (status != -ENOSPC)
514                         break;
515
516                 min_bits >>= 1;
517         }
518
519         return status;
520 }
521
522 static int ocfs2_block_group_grow_discontig(handle_t *handle,
523                                             struct inode *alloc_inode,
524                                             struct buffer_head *bg_bh,
525                                             struct ocfs2_alloc_context *ac,
526                                             struct ocfs2_chain_list *cl,
527                                             unsigned int min_bits)
528 {
529         int status;
530         struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
531         struct ocfs2_group_desc *bg =
532                 (struct ocfs2_group_desc *)bg_bh->b_data;
533         unsigned int needed = le16_to_cpu(cl->cl_cpg) -
534                          le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
535         u32 p_cpos, clusters;
536         u64 p_blkno;
537         struct ocfs2_extent_list *el = &bg->bg_list;
538
539         status = ocfs2_journal_access_gd(handle,
540                                          INODE_CACHE(alloc_inode),
541                                          bg_bh,
542                                          OCFS2_JOURNAL_ACCESS_CREATE);
543         if (status < 0) {
544                 mlog_errno(status);
545                 goto bail;
546         }
547
548         while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) <
549                                 le16_to_cpu(el->l_count))) {
550                 if (min_bits > needed)
551                         min_bits = needed;
552                 status = ocfs2_block_group_claim_bits(osb, handle, ac,
553                                                       min_bits, &p_cpos,
554                                                       &clusters);
555                 if (status < 0) {
556                         if (status != -ENOSPC)
557                                 mlog_errno(status);
558                         goto bail;
559                 }
560                 p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos);
561                 ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno,
562                                               clusters);
563
564                 min_bits = clusters;
565                 needed = le16_to_cpu(cl->cl_cpg) -
566                          le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
567         }
568
569         if (needed > 0) {
570                 /*
571                  * We have used up all the extent rec but can't fill up
572                  * the cpg. So bail out.
573                  */
574                 status = -ENOSPC;
575                 goto bail;
576         }
577
578         ocfs2_journal_dirty(handle, bg_bh);
579
580 bail:
581         return status;
582 }
583
584 static void ocfs2_bg_alloc_cleanup(handle_t *handle,
585                                    struct ocfs2_alloc_context *cluster_ac,
586                                    struct inode *alloc_inode,
587                                    struct buffer_head *bg_bh)
588 {
589         int i, ret;
590         struct ocfs2_group_desc *bg;
591         struct ocfs2_extent_list *el;
592         struct ocfs2_extent_rec *rec;
593
594         if (!bg_bh)
595                 return;
596
597         bg = (struct ocfs2_group_desc *)bg_bh->b_data;
598         el = &bg->bg_list;
599         for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
600                 rec = &el->l_recs[i];
601                 ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode,
602                                           cluster_ac->ac_bh,
603                                           le64_to_cpu(rec->e_blkno),
604                                           le32_to_cpu(rec->e_leaf_clusters));
605                 if (ret)
606                         mlog_errno(ret);
607                 /* Try all the clusters to free */
608         }
609
610         ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh);
611         brelse(bg_bh);
612 }
613
614 static struct buffer_head *
615 ocfs2_block_group_alloc_discontig(handle_t *handle,
616                                   struct inode *alloc_inode,
617                                   struct ocfs2_alloc_context *ac,
618                                   struct ocfs2_chain_list *cl)
619 {
620         int status;
621         u32 bit_off, num_bits;
622         u64 bg_blkno;
623         unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1;
624         struct buffer_head *bg_bh = NULL;
625         unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
626         struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
627
628         if (!ocfs2_supports_discontig_bg(osb)) {
629                 status = -ENOSPC;
630                 goto bail;
631         }
632
633         status = ocfs2_extend_trans(handle,
634                                     ocfs2_calc_bg_discontig_credits(osb->sb));
635         if (status) {
636                 mlog_errno(status);
637                 goto bail;
638         }
639
640         /*
641          * We're going to be grabbing from multiple cluster groups.
642          * We don't have enough credits to relink them all, and the
643          * cluster groups will be staying in cache for the duration of
644          * this operation.
645          */
646         ac->ac_allow_chain_relink = 0;
647
648         /* Claim the first region */
649         status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
650                                               &bit_off, &num_bits);
651         if (status < 0) {
652                 if (status != -ENOSPC)
653                         mlog_errno(status);
654                 goto bail;
655         }
656         min_bits = num_bits;
657
658         /* setup the group */
659         bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
660         mlog(0, "new descriptor, record %u, at block %llu\n",
661              alloc_rec, (unsigned long long)bg_blkno);
662
663         bg_bh = sb_getblk(osb->sb, bg_blkno);
664         if (!bg_bh) {
665                 status = -EIO;
666                 mlog_errno(status);
667                 goto bail;
668         }
669         ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
670
671         status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
672                                         bg_blkno, num_bits, alloc_rec, cl);
673         if (status < 0) {
674                 mlog_errno(status);
675                 goto bail;
676         }
677
678         status = ocfs2_block_group_grow_discontig(handle, alloc_inode,
679                                                   bg_bh, ac, cl, min_bits);
680         if (status)
681                 mlog_errno(status);
682
683 bail:
684         if (status)
685                 ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh);
686         return status ? ERR_PTR(status) : bg_bh;
687 }
688
689 /*
690  * We expect the block group allocator to already be locked.
691  */
692 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
693                                    struct inode *alloc_inode,
694                                    struct buffer_head *bh,
695                                    u64 max_block,
696                                    u64 *last_alloc_group,
697                                    int flags)
698 {
699         int status, credits;
700         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
701         struct ocfs2_chain_list *cl;
702         struct ocfs2_alloc_context *ac = NULL;
703         handle_t *handle = NULL;
704         u16 alloc_rec;
705         struct buffer_head *bg_bh = NULL;
706         struct ocfs2_group_desc *bg;
707
708         BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
709
710         mlog_entry_void();
711
712         cl = &fe->id2.i_chain;
713         status = ocfs2_reserve_clusters_with_limit(osb,
714                                                    le16_to_cpu(cl->cl_cpg),
715                                                    max_block, flags, &ac);
716         if (status < 0) {
717                 if (status != -ENOSPC)
718                         mlog_errno(status);
719                 goto bail;
720         }
721
722         credits = ocfs2_calc_group_alloc_credits(osb->sb,
723                                                  le16_to_cpu(cl->cl_cpg));
724         handle = ocfs2_start_trans(osb, credits);
725         if (IS_ERR(handle)) {
726                 status = PTR_ERR(handle);
727                 handle = NULL;
728                 mlog_errno(status);
729                 goto bail;
730         }
731
732         if (last_alloc_group && *last_alloc_group != 0) {
733                 mlog(0, "use old allocation group %llu for block group alloc\n",
734                      (unsigned long long)*last_alloc_group);
735                 ac->ac_last_group = *last_alloc_group;
736         }
737
738         bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
739                                                ac, cl);
740         if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC))
741                 bg_bh = ocfs2_block_group_alloc_discontig(handle,
742                                                           alloc_inode,
743                                                           ac, cl);
744         if (IS_ERR(bg_bh)) {
745                 status = PTR_ERR(bg_bh);
746                 bg_bh = NULL;
747                 if (status != -ENOSPC)
748                         mlog_errno(status);
749                 goto bail;
750         }
751         bg = (struct ocfs2_group_desc *) bg_bh->b_data;
752
753         status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
754                                          bh, OCFS2_JOURNAL_ACCESS_WRITE);
755         if (status < 0) {
756                 mlog_errno(status);
757                 goto bail;
758         }
759
760         alloc_rec = le16_to_cpu(bg->bg_chain);
761         le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
762                      le16_to_cpu(bg->bg_free_bits_count));
763         le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
764                      le16_to_cpu(bg->bg_bits));
765         cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno;
766         if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
767                 le16_add_cpu(&cl->cl_next_free_rec, 1);
768
769         le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
770                                         le16_to_cpu(bg->bg_free_bits_count));
771         le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
772         le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
773
774         ocfs2_journal_dirty(handle, bh);
775
776         spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
777         OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
778         fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
779                                              le32_to_cpu(fe->i_clusters)));
780         spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
781         i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
782         alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
783
784         status = 0;
785
786         /* save the new last alloc group so that the caller can cache it. */
787         if (last_alloc_group)
788                 *last_alloc_group = ac->ac_last_group;
789
790 bail:
791         if (handle)
792                 ocfs2_commit_trans(osb, handle);
793
794         if (ac)
795                 ocfs2_free_alloc_context(ac);
796
797         brelse(bg_bh);
798
799         mlog_exit(status);
800         return status;
801 }
802
803 static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
804                                        struct ocfs2_alloc_context *ac,
805                                        int type,
806                                        u32 slot,
807                                        u64 *last_alloc_group,
808                                        int flags)
809 {
810         int status;
811         u32 bits_wanted = ac->ac_bits_wanted;
812         struct inode *alloc_inode;
813         struct buffer_head *bh = NULL;
814         struct ocfs2_dinode *fe;
815         u32 free_bits;
816
817         mlog_entry_void();
818
819         alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
820         if (!alloc_inode) {
821                 mlog_errno(-EINVAL);
822                 return -EINVAL;
823         }
824
825         mutex_lock(&alloc_inode->i_mutex);
826
827         status = ocfs2_inode_lock(alloc_inode, &bh, 1);
828         if (status < 0) {
829                 mutex_unlock(&alloc_inode->i_mutex);
830                 iput(alloc_inode);
831
832                 mlog_errno(status);
833                 return status;
834         }
835
836         ac->ac_inode = alloc_inode;
837         ac->ac_alloc_slot = slot;
838
839         fe = (struct ocfs2_dinode *) bh->b_data;
840
841         /* The bh was validated by the inode read inside
842          * ocfs2_inode_lock().  Any corruption is a code bug. */
843         BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
844
845         if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
846                 ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
847                             (unsigned long long)le64_to_cpu(fe->i_blkno));
848                 status = -EIO;
849                 goto bail;
850         }
851
852         free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
853                 le32_to_cpu(fe->id1.bitmap1.i_used);
854
855         if (bits_wanted > free_bits) {
856                 /* cluster bitmap never grows */
857                 if (ocfs2_is_cluster_bitmap(alloc_inode)) {
858                         mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
859                              bits_wanted, free_bits);
860                         status = -ENOSPC;
861                         goto bail;
862                 }
863
864                 if (!(flags & ALLOC_NEW_GROUP)) {
865                         mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, "
866                              "and we don't alloc a new group for it.\n",
867                              slot, bits_wanted, free_bits);
868                         status = -ENOSPC;
869                         goto bail;
870                 }
871
872                 status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
873                                                  ac->ac_max_block,
874                                                  last_alloc_group, flags);
875                 if (status < 0) {
876                         if (status != -ENOSPC)
877                                 mlog_errno(status);
878                         goto bail;
879                 }
880                 atomic_inc(&osb->alloc_stats.bg_extends);
881
882                 /* You should never ask for this much metadata */
883                 BUG_ON(bits_wanted >
884                        (le32_to_cpu(fe->id1.bitmap1.i_total)
885                         - le32_to_cpu(fe->id1.bitmap1.i_used)));
886         }
887
888         get_bh(bh);
889         ac->ac_bh = bh;
890 bail:
891         brelse(bh);
892
893         mlog_exit(status);
894         return status;
895 }
896
897 static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
898 {
899         spin_lock(&osb->osb_lock);
900         osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
901         spin_unlock(&osb->osb_lock);
902         atomic_set(&osb->s_num_inodes_stolen, 0);
903 }
904
905 static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
906 {
907         spin_lock(&osb->osb_lock);
908         osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
909         spin_unlock(&osb->osb_lock);
910         atomic_set(&osb->s_num_meta_stolen, 0);
911 }
912
/*
 * Reset both steal caches of @osb: the one used for inode allocation
 * and the one used for metadata (extent allocator) allocation.
 */
void ocfs2_init_steal_slots(struct ocfs2_super *osb)
{
	ocfs2_init_inode_steal_slot(osb);
	ocfs2_init_meta_steal_slot(osb);
}
918
919 static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
920 {
921         spin_lock(&osb->osb_lock);
922         if (type == INODE_ALLOC_SYSTEM_INODE)
923                 osb->s_inode_steal_slot = slot;
924         else if (type == EXTENT_ALLOC_SYSTEM_INODE)
925                 osb->s_meta_steal_slot = slot;
926         spin_unlock(&osb->osb_lock);
927 }
928
929 static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type)
930 {
931         int slot = OCFS2_INVALID_SLOT;
932
933         spin_lock(&osb->osb_lock);
934         if (type == INODE_ALLOC_SYSTEM_INODE)
935                 slot = osb->s_inode_steal_slot;
936         else if (type == EXTENT_ALLOC_SYSTEM_INODE)
937                 slot = osb->s_meta_steal_slot;
938         spin_unlock(&osb->osb_lock);
939
940         return slot;
941 }
942
943 static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
944 {
945         return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE);
946 }
947
948 static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb)
949 {
950         return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE);
951 }
952
953 static int ocfs2_steal_resource(struct ocfs2_super *osb,
954                                 struct ocfs2_alloc_context *ac,
955                                 int type)
956 {
957         int i, status = -ENOSPC;
958         int slot = __ocfs2_get_steal_slot(osb, type);
959
960         /* Start to steal resource from the first slot after ours. */
961         if (slot == OCFS2_INVALID_SLOT)
962                 slot = osb->slot_num + 1;
963
964         for (i = 0; i < osb->max_slots; i++, slot++) {
965                 if (slot == osb->max_slots)
966                         slot = 0;
967
968                 if (slot == osb->slot_num)
969                         continue;
970
971                 status = ocfs2_reserve_suballoc_bits(osb, ac,
972                                                      type,
973                                                      (u32)slot, NULL,
974                                                      NOT_ALLOC_NEW_GROUP);
975                 if (status >= 0) {
976                         __ocfs2_set_steal_slot(osb, slot, type);
977                         break;
978                 }
979
980                 ocfs2_free_ac_resource(ac);
981         }
982
983         return status;
984 }
985
986 static int ocfs2_steal_inode(struct ocfs2_super *osb,
987                              struct ocfs2_alloc_context *ac)
988 {
989         return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE);
990 }
991
992 static int ocfs2_steal_meta(struct ocfs2_super *osb,
993                             struct ocfs2_alloc_context *ac)
994 {
995         return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE);
996 }
997
998 int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
999                                       int blocks,
1000                                       struct ocfs2_alloc_context **ac)
1001 {
1002         int status;
1003         int slot = ocfs2_get_meta_steal_slot(osb);
1004
1005         *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
1006         if (!(*ac)) {
1007                 status = -ENOMEM;
1008                 mlog_errno(status);
1009                 goto bail;
1010         }
1011
1012         (*ac)->ac_bits_wanted = blocks;
1013         (*ac)->ac_which = OCFS2_AC_USE_META;
1014         (*ac)->ac_group_search = ocfs2_block_group_search;
1015
1016         if (slot != OCFS2_INVALID_SLOT &&
1017                 atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL)
1018                 goto extent_steal;
1019
1020         atomic_set(&osb->s_num_meta_stolen, 0);
1021         status = ocfs2_reserve_suballoc_bits(osb, (*ac),
1022                                              EXTENT_ALLOC_SYSTEM_INODE,
1023                                              (u32)osb->slot_num, NULL,
1024                                              ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP);
1025
1026
1027         if (status >= 0) {
1028                 status = 0;
1029                 if (slot != OCFS2_INVALID_SLOT)
1030                         ocfs2_init_meta_steal_slot(osb);
1031                 goto bail;
1032         } else if (status < 0 && status != -ENOSPC) {
1033                 mlog_errno(status);
1034                 goto bail;
1035         }
1036
1037         ocfs2_free_ac_resource(*ac);
1038
1039 extent_steal:
1040         status = ocfs2_steal_meta(osb, *ac);
1041         atomic_inc(&osb->s_num_meta_stolen);
1042         if (status < 0) {
1043                 if (status != -ENOSPC)
1044                         mlog_errno(status);
1045                 goto bail;
1046         }
1047
1048         status = 0;
1049 bail:
1050         if ((status < 0) && *ac) {
1051                 ocfs2_free_alloc_context(*ac);
1052                 *ac = NULL;
1053         }
1054
1055         mlog_exit(status);
1056         return status;
1057 }
1058
/*
 * Reserve as many metadata blocks as ocfs2_extend_meta_needed()
 * reports for the extent list @root_el.  Return value and *@ac are
 * those of ocfs2_reserve_new_metadata_blocks().
 */
int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
			       struct ocfs2_extent_list *root_el,
			       struct ocfs2_alloc_context **ac)
{
	int blocks = ocfs2_extend_meta_needed(root_el);

	return ocfs2_reserve_new_metadata_blocks(osb, blocks, ac);
}
1067
1068 int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
1069                             struct ocfs2_alloc_context **ac)
1070 {
1071         int status;
1072         int slot = ocfs2_get_inode_steal_slot(osb);
1073         u64 alloc_group;
1074
1075         *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
1076         if (!(*ac)) {
1077                 status = -ENOMEM;
1078                 mlog_errno(status);
1079                 goto bail;
1080         }
1081
1082         (*ac)->ac_bits_wanted = 1;
1083         (*ac)->ac_which = OCFS2_AC_USE_INODE;
1084
1085         (*ac)->ac_group_search = ocfs2_block_group_search;
1086
1087         /*
1088          * stat(2) can't handle i_ino > 32bits, so we tell the
1089          * lower levels not to allocate us a block group past that
1090          * limit.  The 'inode64' mount option avoids this behavior.
1091          */
1092         if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
1093                 (*ac)->ac_max_block = (u32)~0U;
1094
1095         /*
1096          * slot is set when we successfully steal inode from other nodes.
1097          * It is reset in 3 places:
1098          * 1. when we flush the truncate log
1099          * 2. when we complete local alloc recovery.
1100          * 3. when we successfully allocate from our own slot.
1101          * After it is set, we will go on stealing inodes until we find the
1102          * need to check our slots to see whether there is some space for us.
1103          */
1104         if (slot != OCFS2_INVALID_SLOT &&
1105             atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
1106                 goto inode_steal;
1107
1108         atomic_set(&osb->s_num_inodes_stolen, 0);
1109         alloc_group = osb->osb_inode_alloc_group;
1110         status = ocfs2_reserve_suballoc_bits(osb, *ac,
1111                                              INODE_ALLOC_SYSTEM_INODE,
1112                                              (u32)osb->slot_num,
1113                                              &alloc_group,
1114                                              ALLOC_NEW_GROUP |
1115                                              ALLOC_GROUPS_FROM_GLOBAL);
1116         if (status >= 0) {
1117                 status = 0;
1118
1119                 spin_lock(&osb->osb_lock);
1120                 osb->osb_inode_alloc_group = alloc_group;
1121                 spin_unlock(&osb->osb_lock);
1122                 mlog(0, "after reservation, new allocation group is "
1123                      "%llu\n", (unsigned long long)alloc_group);
1124
1125                 /*
1126                  * Some inodes must be freed by us, so try to allocate
1127                  * from our own next time.
1128                  */
1129                 if (slot != OCFS2_INVALID_SLOT)
1130                         ocfs2_init_inode_steal_slot(osb);
1131                 goto bail;
1132         } else if (status < 0 && status != -ENOSPC) {
1133                 mlog_errno(status);
1134                 goto bail;
1135         }
1136
1137         ocfs2_free_ac_resource(*ac);
1138
1139 inode_steal:
1140         status = ocfs2_steal_inode(osb, *ac);
1141         atomic_inc(&osb->s_num_inodes_stolen);
1142         if (status < 0) {
1143                 if (status != -ENOSPC)
1144                         mlog_errno(status);
1145                 goto bail;
1146         }
1147
1148         status = 0;
1149 bail:
1150         if ((status < 0) && *ac) {
1151                 ocfs2_free_alloc_context(*ac);
1152                 *ac = NULL;
1153         }
1154
1155         mlog_exit(status);
1156         return status;
1157 }
1158
1159 /* local alloc code has to do the same thing, so rather than do this
1160  * twice.. */
1161 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
1162                                       struct ocfs2_alloc_context *ac)
1163 {
1164         int status;
1165
1166         ac->ac_which = OCFS2_AC_USE_MAIN;
1167         ac->ac_group_search = ocfs2_cluster_group_search;
1168
1169         status = ocfs2_reserve_suballoc_bits(osb, ac,
1170                                              GLOBAL_BITMAP_SYSTEM_INODE,
1171                                              OCFS2_INVALID_SLOT, NULL,
1172                                              ALLOC_NEW_GROUP);
1173         if (status < 0 && status != -ENOSPC) {
1174                 mlog_errno(status);
1175                 goto bail;
1176         }
1177
1178 bail:
1179         return status;
1180 }
1181
1182 /* Callers don't need to care which bitmap (local alloc or main) to
1183  * use so we figure it out for them, but unfortunately this clutters
1184  * things a bit. */
1185 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
1186                                              u32 bits_wanted, u64 max_block,
1187                                              int flags,
1188                                              struct ocfs2_alloc_context **ac)
1189 {
1190         int status;
1191
1192         mlog_entry_void();
1193
1194         *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
1195         if (!(*ac)) {
1196                 status = -ENOMEM;
1197                 mlog_errno(status);
1198                 goto bail;
1199         }
1200
1201         (*ac)->ac_bits_wanted = bits_wanted;
1202         (*ac)->ac_max_block = max_block;
1203
1204         status = -ENOSPC;
1205         if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
1206             ocfs2_alloc_should_use_local(osb, bits_wanted)) {
1207                 status = ocfs2_reserve_local_alloc_bits(osb,
1208                                                         bits_wanted,
1209                                                         *ac);
1210                 if ((status < 0) && (status != -ENOSPC)) {
1211                         mlog_errno(status);
1212                         goto bail;
1213                 }
1214         }
1215
1216         if (status == -ENOSPC) {
1217                 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
1218                 if (status < 0) {
1219                         if (status != -ENOSPC)
1220                                 mlog_errno(status);
1221                         goto bail;
1222                 }
1223         }
1224
1225         status = 0;
1226 bail:
1227         if ((status < 0) && *ac) {
1228                 ocfs2_free_alloc_context(*ac);
1229                 *ac = NULL;
1230         }
1231
1232         mlog_exit(status);
1233         return status;
1234 }
1235
1236 int ocfs2_reserve_clusters(struct ocfs2_super *osb,
1237                            u32 bits_wanted,
1238                            struct ocfs2_alloc_context **ac)
1239 {
1240         return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
1241                                                  ALLOC_NEW_GROUP, ac);
1242 }
1243
1244 /*
1245  * More or less lifted from ext3. I'll leave their description below:
1246  *
1247  * "For ext3 allocations, we must not reuse any blocks which are
1248  * allocated in the bitmap buffer's "last committed data" copy.  This
1249  * prevents deletes from freeing up the page for reuse until we have
1250  * committed the delete transaction.
1251  *
1252  * If we didn't do this, then deleting something and reallocating it as
1253  * data would allow the old block to be overwritten before the
1254  * transaction committed (because we force data to disk before commit).
1255  * This would lead to corruption if we crashed between overwriting the
1256  * data and committing the delete.
1257  *
1258  * @@@ We may want to make this allocation behaviour conditional on
1259  * data-writes at some point, and disable it for metadata allocations or
1260  * sync-data inodes."
1261  *
1262  * Note: OCFS2 already does this differently for metadata vs data
1263  * allocations, as those bitmaps are separate and undo access is never
1264  * called on a metadata group descriptor.
1265  */
1266 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
1267                                          int nr)
1268 {
1269         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1270         int ret;
1271
1272         if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
1273                 return 0;
1274
1275         if (!buffer_jbd(bg_bh))
1276                 return 1;
1277
1278         jbd_lock_bh_state(bg_bh);
1279         bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
1280         if (bg)
1281                 ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
1282         else
1283                 ret = 1;
1284         jbd_unlock_bh_state(bg_bh);
1285
1286         return ret;
1287 }
1288
1289 static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
1290                                              struct buffer_head *bg_bh,
1291                                              unsigned int bits_wanted,
1292                                              unsigned int total_bits,
1293                                              struct ocfs2_suballoc_result *res)
1294 {
1295         void *bitmap;
1296         u16 best_offset, best_size;
1297         int offset, start, found, status = 0;
1298         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1299
1300         /* Callers got this descriptor from
1301          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1302         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1303
1304         found = start = best_offset = best_size = 0;
1305         bitmap = bg->bg_bitmap;
1306
1307         while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) {
1308                 if (offset == total_bits)
1309                         break;
1310
1311                 if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
1312                         /* We found a zero, but we can't use it as it
1313                          * hasn't been put to disk yet! */
1314                         found = 0;
1315                         start = offset + 1;
1316                 } else if (offset == start) {
1317                         /* we found a zero */
1318                         found++;
1319                         /* move start to the next bit to test */
1320                         start++;
1321                 } else {
1322                         /* got a zero after some ones */
1323                         found = 1;
1324                         start = offset + 1;
1325                 }
1326                 if (found > best_size) {
1327                         best_size = found;
1328                         best_offset = start - found;
1329                 }
1330                 /* we got everything we needed */
1331                 if (found == bits_wanted) {
1332                         /* mlog(0, "Found it all!\n"); */
1333                         break;
1334                 }
1335         }
1336
1337         if (best_size) {
1338                 res->sr_bit_offset = best_offset;
1339                 res->sr_bits = best_size;
1340         } else {
1341                 status = -ENOSPC;
1342                 /* No error log here -- see the comment above
1343                  * ocfs2_test_bg_bit_allocatable */
1344         }
1345
1346         return status;
1347 }
1348
1349 static inline int ocfs2_block_group_set_bits(handle_t *handle,
1350                                              struct inode *alloc_inode,
1351                                              struct ocfs2_group_desc *bg,
1352                                              struct buffer_head *group_bh,
1353                                              unsigned int bit_off,
1354                                              unsigned int num_bits)
1355 {
1356         int status;
1357         void *bitmap = bg->bg_bitmap;
1358         int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1359
1360         mlog_entry_void();
1361
1362         /* All callers get the descriptor via
1363          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1364         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1365         BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
1366
1367         mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
1368              num_bits);
1369
1370         if (ocfs2_is_cluster_bitmap(alloc_inode))
1371                 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1372
1373         status = ocfs2_journal_access_gd(handle,
1374                                          INODE_CACHE(alloc_inode),
1375                                          group_bh,
1376                                          journal_type);
1377         if (status < 0) {
1378                 mlog_errno(status);
1379                 goto bail;
1380         }
1381
1382         le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
1383         while(num_bits--)
1384                 ocfs2_set_bit(bit_off++, bitmap);
1385
1386         ocfs2_journal_dirty(handle, group_bh);
1387
1388 bail:
1389         mlog_exit(status);
1390         return status;
1391 }
1392
1393 /* find the one with the most empty bits */
1394 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
1395 {
1396         u16 curr, best;
1397
1398         BUG_ON(!cl->cl_next_free_rec);
1399
1400         best = curr = 0;
1401         while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
1402                 if (le32_to_cpu(cl->cl_recs[curr].c_free) >
1403                     le32_to_cpu(cl->cl_recs[best].c_free))
1404                         best = curr;
1405                 curr++;
1406         }
1407
1408         BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
1409         return best;
1410 }
1411
1412 static int ocfs2_relink_block_group(handle_t *handle,
1413                                     struct inode *alloc_inode,
1414                                     struct buffer_head *fe_bh,
1415                                     struct buffer_head *bg_bh,
1416                                     struct buffer_head *prev_bg_bh,
1417                                     u16 chain)
1418 {
1419         int status;
1420         /* there is a really tiny chance the journal calls could fail,
1421          * but we wouldn't want inconsistent blocks in *any* case. */
1422         u64 fe_ptr, bg_ptr, prev_bg_ptr;
1423         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
1424         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1425         struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
1426
1427         /* The caller got these descriptors from
1428          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1429         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1430         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
1431
1432         mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
1433              (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
1434              (unsigned long long)le64_to_cpu(bg->bg_blkno),
1435              (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
1436
1437         fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
1438         bg_ptr = le64_to_cpu(bg->bg_next_group);
1439         prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
1440
1441         status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1442                                          prev_bg_bh,
1443                                          OCFS2_JOURNAL_ACCESS_WRITE);
1444         if (status < 0) {
1445                 mlog_errno(status);
1446                 goto out_rollback;
1447         }
1448
1449         prev_bg->bg_next_group = bg->bg_next_group;
1450         ocfs2_journal_dirty(handle, prev_bg_bh);
1451
1452         status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1453                                          bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1454         if (status < 0) {
1455                 mlog_errno(status);
1456                 goto out_rollback;
1457         }
1458
1459         bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
1460         ocfs2_journal_dirty(handle, bg_bh);
1461
1462         status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
1463                                          fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1464         if (status < 0) {
1465                 mlog_errno(status);
1466                 goto out_rollback;
1467         }
1468
1469         fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
1470         ocfs2_journal_dirty(handle, fe_bh);
1471
1472 out_rollback:
1473         if (status < 0) {
1474                 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
1475                 bg->bg_next_group = cpu_to_le64(bg_ptr);
1476                 prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
1477         }
1478
1479         mlog_exit(status);
1480         return status;
1481 }
1482
1483 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
1484                                                      u32 wanted)
1485 {
1486         return le16_to_cpu(bg->bg_free_bits_count) > wanted;
1487 }
1488
1489 /* return 0 on success, -ENOSPC to keep searching and any other < 0
1490  * value on error. */
1491 static int ocfs2_cluster_group_search(struct inode *inode,
1492                                       struct buffer_head *group_bh,
1493                                       u32 bits_wanted, u32 min_bits,
1494                                       u64 max_block,
1495                                       struct ocfs2_suballoc_result *res)
1496 {
1497         int search = -ENOSPC;
1498         int ret;
1499         u64 blkoff;
1500         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
1501         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1502         unsigned int max_bits, gd_cluster_off;
1503
1504         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1505
1506         if (gd->bg_free_bits_count) {
1507                 max_bits = le16_to_cpu(gd->bg_bits);
1508
1509                 /* Tail groups in cluster bitmaps which aren't cpg
1510                  * aligned are prone to partial extention by a failed
1511                  * fs resize. If the file system resize never got to
1512                  * update the dinode cluster count, then we don't want
1513                  * to trust any clusters past it, regardless of what
1514                  * the group descriptor says. */
1515                 gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
1516                                                           le64_to_cpu(gd->bg_blkno));
1517                 if ((gd_cluster_off + max_bits) >
1518                     OCFS2_I(inode)->ip_clusters) {
1519                         max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
1520                         mlog(0, "Desc %llu, bg_bits %u, clusters %u, use %u\n",
1521                              (unsigned long long)le64_to_cpu(gd->bg_blkno),
1522                              le16_to_cpu(gd->bg_bits),
1523                              OCFS2_I(inode)->ip_clusters, max_bits);
1524                 }
1525
1526                 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1527                                                         group_bh, bits_wanted,
1528                                                         max_bits, res);
1529                 if (ret)
1530                         return ret;
1531
1532                 if (max_block) {
1533                         blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
1534                                                           gd_cluster_off +
1535                                                           res->sr_bit_offset +
1536                                                           res->sr_bits);
1537                         mlog(0, "Checking %llu against %llu\n",
1538                              (unsigned long long)blkoff,
1539                              (unsigned long long)max_block);
1540                         if (blkoff > max_block)
1541                                 return -ENOSPC;
1542                 }
1543
1544                 /* ocfs2_block_group_find_clear_bits() might
1545                  * return success, but we still want to return
1546                  * -ENOSPC unless it found the minimum number
1547                  * of bits. */
1548                 if (min_bits <= res->sr_bits)
1549                         search = 0; /* success */
1550                 else if (res->sr_bits) {
1551                         /*
1552                          * Don't show bits which we'll be returning
1553                          * for allocation to the local alloc bitmap.
1554                          */
1555                         ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits);
1556                 }
1557         }
1558
1559         return search;
1560 }
1561
1562 static int ocfs2_block_group_search(struct inode *inode,
1563                                     struct buffer_head *group_bh,
1564                                     u32 bits_wanted, u32 min_bits,
1565                                     u64 max_block,
1566                                     struct ocfs2_suballoc_result *res)
1567 {
1568         int ret = -ENOSPC;
1569         u64 blkoff;
1570         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
1571
1572         BUG_ON(min_bits != 1);
1573         BUG_ON(ocfs2_is_cluster_bitmap(inode));
1574
1575         if (bg->bg_free_bits_count) {
1576                 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1577                                                         group_bh, bits_wanted,
1578                                                         le16_to_cpu(bg->bg_bits),
1579                                                         res);
1580                 if (!ret && max_block) {
1581                         blkoff = le64_to_cpu(bg->bg_blkno) +
1582                                 res->sr_bit_offset + res->sr_bits;
1583                         mlog(0, "Checking %llu against %llu\n",
1584                              (unsigned long long)blkoff,
1585                              (unsigned long long)max_block);
1586                         if (blkoff > max_block)
1587                                 ret = -ENOSPC;
1588                 }
1589         }
1590
1591         return ret;
1592 }
1593
1594 static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1595                                        handle_t *handle,
1596                                        struct buffer_head *di_bh,
1597                                        u32 num_bits,
1598                                        u16 chain)
1599 {
1600         int ret;
1601         u32 tmp_used;
1602         struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1603         struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
1604
1605         ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
1606                                       OCFS2_JOURNAL_ACCESS_WRITE);
1607         if (ret < 0) {
1608                 mlog_errno(ret);
1609                 goto out;
1610         }
1611
1612         tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1613         di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
1614         le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
1615         ocfs2_journal_dirty(handle, di_bh);
1616
1617 out:
1618         return ret;
1619 }
1620
1621 static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
1622                                          struct ocfs2_extent_rec *rec,
1623                                          struct ocfs2_chain_list *cl)
1624 {
1625         unsigned int bpc = le16_to_cpu(cl->cl_bpc);
1626         unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc;
1627         unsigned int bitcount = le32_to_cpu(rec->e_leaf_clusters) * bpc;
1628
1629         if (res->sr_bit_offset < bitoff)
1630                 return 0;
1631         if (res->sr_bit_offset >= (bitoff + bitcount))
1632                 return 0;
1633         res->sr_blkno = le64_to_cpu(rec->e_blkno) +
1634                 (res->sr_bit_offset - bitoff);
1635         if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount))
1636                 res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset;
1637         return 1;
1638 }
1639
1640 static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac,
1641                                           struct ocfs2_group_desc *bg,
1642                                           struct ocfs2_suballoc_result *res)
1643 {
1644         int i;
1645         u64 bg_blkno = res->sr_bg_blkno;  /* Save off */
1646         struct ocfs2_extent_rec *rec;
1647         struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1648         struct ocfs2_chain_list *cl = &di->id2.i_chain;
1649
1650         if (ocfs2_is_cluster_bitmap(ac->ac_inode)) {
1651                 res->sr_blkno = 0;
1652                 return;
1653         }
1654
1655         res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset;
1656         res->sr_bg_blkno = 0;  /* Clear it for contig block groups */
1657         if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) ||
1658             !bg->bg_list.l_next_free_rec)
1659                 return;
1660
1661         for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) {
1662                 rec = &bg->bg_list.l_recs[i];
1663                 if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) {
1664                         res->sr_bg_blkno = bg_blkno;  /* Restore */
1665                         break;
1666                 }
1667         }
1668 }
1669
1670 static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1671                                   handle_t *handle,
1672                                   u32 bits_wanted,
1673                                   u32 min_bits,
1674                                   struct ocfs2_suballoc_result *res,
1675                                   u16 *bits_left)
1676 {
1677         int ret;
1678         struct buffer_head *group_bh = NULL;
1679         struct ocfs2_group_desc *gd;
1680         struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1681         struct inode *alloc_inode = ac->ac_inode;
1682
1683         ret = ocfs2_read_group_descriptor(alloc_inode, di,
1684                                           res->sr_bg_blkno, &group_bh);
1685         if (ret < 0) {
1686                 mlog_errno(ret);
1687                 return ret;
1688         }
1689
1690         gd = (struct ocfs2_group_desc *) group_bh->b_data;
1691         ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1692                                   ac->ac_max_block, res);
1693         if (ret < 0) {
1694                 if (ret != -ENOSPC)
1695                         mlog_errno(ret);
1696                 goto out;
1697         }
1698
1699         if (!ret)
1700                 ocfs2_bg_discontig_fix_result(ac, gd, res);
1701
1702         /*
1703          * sr_bg_blkno might have been changed by
1704          * ocfs2_bg_discontig_fix_result
1705          */
1706         res->sr_bg_stable_blkno = group_bh->b_blocknr;
1707
1708         if (ac->ac_find_loc_only)
1709                 goto out_loc_only;
1710
1711         ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1712                                                res->sr_bits,
1713                                                le16_to_cpu(gd->bg_chain));
1714         if (ret < 0) {
1715                 mlog_errno(ret);
1716                 goto out;
1717         }
1718
1719         ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1720                                          res->sr_bit_offset, res->sr_bits);
1721         if (ret < 0)
1722                 mlog_errno(ret);
1723
1724 out_loc_only:
1725         *bits_left = le16_to_cpu(gd->bg_free_bits_count);
1726
1727 out:
1728         brelse(group_bh);
1729
1730         return ret;
1731 }
1732
1733 static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1734                               handle_t *handle,
1735                               u32 bits_wanted,
1736                               u32 min_bits,
1737                               struct ocfs2_suballoc_result *res,
1738                               u16 *bits_left)
1739 {
1740         int status;
1741         u16 chain;
1742         u64 next_group;
1743         struct inode *alloc_inode = ac->ac_inode;
1744         struct buffer_head *group_bh = NULL;
1745         struct buffer_head *prev_group_bh = NULL;
1746         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1747         struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1748         struct ocfs2_group_desc *bg;
1749
1750         chain = ac->ac_chain;
1751         mlog(0, "trying to alloc %u bits from chain %u, inode %llu\n",
1752              bits_wanted, chain,
1753              (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
1754
1755         status = ocfs2_read_group_descriptor(alloc_inode, fe,
1756                                              le64_to_cpu(cl->cl_recs[chain].c_blkno),
1757                                              &group_bh);
1758         if (status < 0) {
1759                 mlog_errno(status);
1760                 goto bail;
1761         }
1762         bg = (struct ocfs2_group_desc *) group_bh->b_data;
1763
1764         status = -ENOSPC;
1765         /* for now, the chain search is a bit simplistic. We just use
1766          * the 1st group with any empty bits. */
1767         while ((status = ac->ac_group_search(alloc_inode, group_bh,
1768                                              bits_wanted, min_bits,
1769                                              ac->ac_max_block,
1770                                              res)) == -ENOSPC) {
1771                 if (!bg->bg_next_group)
1772                         break;
1773
1774                 brelse(prev_group_bh);
1775                 prev_group_bh = NULL;
1776
1777                 next_group = le64_to_cpu(bg->bg_next_group);
1778                 prev_group_bh = group_bh;
1779                 group_bh = NULL;
1780                 status = ocfs2_read_group_descriptor(alloc_inode, fe,
1781                                                      next_group, &group_bh);
1782                 if (status < 0) {
1783                         mlog_errno(status);
1784                         goto bail;
1785                 }
1786                 bg = (struct ocfs2_group_desc *) group_bh->b_data;
1787         }
1788         if (status < 0) {
1789                 if (status != -ENOSPC)
1790                         mlog_errno(status);
1791                 goto bail;
1792         }
1793
1794         mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
1795              res->sr_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
1796
1797         res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);
1798
1799         BUG_ON(res->sr_bits == 0);
1800         if (!status)
1801                 ocfs2_bg_discontig_fix_result(ac, bg, res);
1802
1803         /*
1804          * sr_bg_blkno might have been changed by
1805          * ocfs2_bg_discontig_fix_result
1806          */
1807         res->sr_bg_stable_blkno = group_bh->b_blocknr;
1808
1809         /*
1810          * Keep track of previous block descriptor read. When
1811          * we find a target, if we have read more than X
1812          * number of descriptors, and the target is reasonably
1813          * empty, relink him to top of his chain.
1814          *
1815          * We've read 0 extra blocks and only send one more to
1816          * the transaction, yet the next guy to search has a
1817          * much easier time.
1818          *
1819          * Do this *after* figuring out how many bits we're taking out
1820          * of our target group.
1821          */
1822         if (ac->ac_allow_chain_relink &&
1823             (prev_group_bh) &&
1824             (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
1825                 status = ocfs2_relink_block_group(handle, alloc_inode,
1826                                                   ac->ac_bh, group_bh,
1827                                                   prev_group_bh, chain);
1828                 if (status < 0) {
1829                         mlog_errno(status);
1830                         goto bail;
1831                 }
1832         }
1833
1834         if (ac->ac_find_loc_only)
1835                 goto out_loc_only;
1836
1837         status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle,
1838                                                   ac->ac_bh, res->sr_bits,
1839                                                   chain);
1840         if (status) {
1841                 mlog_errno(status);
1842                 goto bail;
1843         }
1844
1845         status = ocfs2_block_group_set_bits(handle,
1846                                             alloc_inode,
1847                                             bg,
1848                                             group_bh,
1849                                             res->sr_bit_offset,
1850                                             res->sr_bits);
1851         if (status < 0) {
1852                 mlog_errno(status);
1853                 goto bail;
1854         }
1855
1856         mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
1857              (unsigned long long)le64_to_cpu(fe->i_blkno));
1858
1859 out_loc_only:
1860         *bits_left = le16_to_cpu(bg->bg_free_bits_count);
1861 bail:
1862         brelse(group_bh);
1863         brelse(prev_group_bh);
1864
1865         mlog_exit(status);
1866         return status;
1867 }
1868
1869 /* will give out up to bits_wanted contiguous bits. */
1870 static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1871                                      handle_t *handle,
1872                                      u32 bits_wanted,
1873                                      u32 min_bits,
1874                                      struct ocfs2_suballoc_result *res)
1875 {
1876         int status;
1877         u16 victim, i;
1878         u16 bits_left = 0;
1879         u64 hint = ac->ac_last_group;
1880         struct ocfs2_chain_list *cl;
1881         struct ocfs2_dinode *fe;
1882
1883         mlog_entry_void();
1884
1885         BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1886         BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
1887         BUG_ON(!ac->ac_bh);
1888
1889         fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1890
1891         /* The bh was validated by the inode read during
1892          * ocfs2_reserve_suballoc_bits().  Any corruption is a code bug. */
1893         BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
1894
1895         if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1896             le32_to_cpu(fe->id1.bitmap1.i_total)) {
1897                 ocfs2_error(ac->ac_inode->i_sb,
1898                             "Chain allocator dinode %llu has %u used "
1899                             "bits but only %u total.",
1900                             (unsigned long long)le64_to_cpu(fe->i_blkno),
1901                             le32_to_cpu(fe->id1.bitmap1.i_used),
1902                             le32_to_cpu(fe->id1.bitmap1.i_total));
1903                 status = -EIO;
1904                 goto bail;
1905         }
1906
1907         res->sr_bg_blkno = hint;
1908         if (res->sr_bg_blkno) {
1909                 /* Attempt to short-circuit the usual search mechanism
1910                  * by jumping straight to the most recently used
1911                  * allocation group. This helps us mantain some
1912                  * contiguousness across allocations. */
1913                 status = ocfs2_search_one_group(ac, handle, bits_wanted,
1914                                                 min_bits, res, &bits_left);
1915                 if (!status)
1916                         goto set_hint;
1917                 if (status < 0 && status != -ENOSPC) {
1918                         mlog_errno(status);
1919                         goto bail;
1920                 }
1921         }
1922
1923         cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1924
1925         victim = ocfs2_find_victim_chain(cl);
1926         ac->ac_chain = victim;
1927         ac->ac_allow_chain_relink = 1;
1928
1929         status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1930                                     res, &bits_left);
1931         if (!status) {
1932                 hint = ocfs2_group_from_res(res);
1933                 goto set_hint;
1934         }
1935         if (status < 0 && status != -ENOSPC) {
1936                 mlog_errno(status);
1937                 goto bail;
1938         }
1939
1940         mlog(0, "Search of victim chain %u came up with nothing, "
1941              "trying all chains now.\n", victim);
1942
1943         /* If we didn't pick a good victim, then just default to
1944          * searching each chain in order. Don't allow chain relinking
1945          * because we only calculate enough journal credits for one
1946          * relink per alloc. */
1947         ac->ac_allow_chain_relink = 0;
1948         for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
1949                 if (i == victim)
1950                         continue;
1951                 if (!cl->cl_recs[i].c_free)
1952                         continue;
1953
1954                 ac->ac_chain = i;
1955                 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1956                                             res, &bits_left);
1957                 if (!status) {
1958                         hint = ocfs2_group_from_res(res);
1959                         break;
1960                 }
1961                 if (status < 0 && status != -ENOSPC) {
1962                         mlog_errno(status);
1963                         goto bail;
1964                 }
1965         }
1966
1967 set_hint:
1968         if (status != -ENOSPC) {
1969                 /* If the next search of this group is not likely to
1970                  * yield a suitable extent, then we reset the last
1971                  * group hint so as to not waste a disk read */
1972                 if (bits_left < min_bits)
1973                         ac->ac_last_group = 0;
1974                 else
1975                         ac->ac_last_group = hint;
1976         }
1977
1978 bail:
1979         mlog_exit(status);
1980         return status;
1981 }
1982
1983 int ocfs2_claim_metadata(handle_t *handle,
1984                          struct ocfs2_alloc_context *ac,
1985                          u32 bits_wanted,
1986                          u64 *suballoc_loc,
1987                          u16 *suballoc_bit_start,
1988                          unsigned int *num_bits,
1989                          u64 *blkno_start)
1990 {
1991         int status;
1992         struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
1993
1994         BUG_ON(!ac);
1995         BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
1996         BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
1997
1998         status = ocfs2_claim_suballoc_bits(ac,
1999                                            handle,
2000                                            bits_wanted,
2001                                            1,
2002                                            &res);
2003         if (status < 0) {
2004                 mlog_errno(status);
2005                 goto bail;
2006         }
2007         atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2008
2009         *suballoc_loc = res.sr_bg_blkno;
2010         *suballoc_bit_start = res.sr_bit_offset;
2011         *blkno_start = res.sr_blkno;
2012         ac->ac_bits_given += res.sr_bits;
2013         *num_bits = res.sr_bits;
2014         status = 0;
2015 bail:
2016         mlog_exit(status);
2017         return status;
2018 }
2019
2020 static void ocfs2_init_inode_ac_group(struct inode *dir,
2021                                       struct buffer_head *parent_di_bh,
2022                                       struct ocfs2_alloc_context *ac)
2023 {
2024         struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data;
2025         /*
2026          * Try to allocate inodes from some specific group.
2027          *
2028          * If the parent dir has recorded the last group used in allocation,
2029          * cool, use it. Otherwise if we try to allocate new inode from the
2030          * same slot the parent dir belongs to, use the same chunk.
2031          *
2032          * We are very careful here to avoid the mistake of setting
2033          * ac_last_group to a group descriptor from a different (unlocked) slot.
2034          */
2035         if (OCFS2_I(dir)->ip_last_used_group &&
2036             OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
2037                 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
2038         else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) {
2039                 if (di->i_suballoc_loc)
2040                         ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc);
2041                 else
2042                         ac->ac_last_group = ocfs2_which_suballoc_group(
2043                                         le64_to_cpu(di->i_blkno),
2044                                         le16_to_cpu(di->i_suballoc_bit));
2045         }
2046 }
2047
2048 static inline void ocfs2_save_inode_ac_group(struct inode *dir,
2049                                              struct ocfs2_alloc_context *ac)
2050 {
2051         OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group;
2052         OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
2053 }
2054
2055 int ocfs2_find_new_inode_loc(struct inode *dir,
2056                              struct buffer_head *parent_fe_bh,
2057                              struct ocfs2_alloc_context *ac,
2058                              u64 *fe_blkno)
2059 {
2060         int ret;
2061         handle_t *handle = NULL;
2062         struct ocfs2_suballoc_result *res;
2063
2064         BUG_ON(!ac);
2065         BUG_ON(ac->ac_bits_given != 0);
2066         BUG_ON(ac->ac_bits_wanted != 1);
2067         BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
2068
2069         res = kzalloc(sizeof(*res), GFP_NOFS);
2070         if (res == NULL) {
2071                 ret = -ENOMEM;
2072                 mlog_errno(ret);
2073                 goto out;
2074         }
2075
2076         ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
2077
2078         /*
2079          * The handle started here is for chain relink. Alternatively,
2080          * we could just disable relink for these calls.
2081          */
2082         handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC);
2083         if (IS_ERR(handle)) {
2084                 ret = PTR_ERR(handle);
2085                 handle = NULL;
2086                 mlog_errno(ret);
2087                 goto out;
2088         }
2089
2090         /*
2091          * This will instruct ocfs2_claim_suballoc_bits and
2092          * ocfs2_search_one_group to search but save actual allocation
2093          * for later.
2094          */
2095         ac->ac_find_loc_only = 1;
2096
2097         ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res);
2098         if (ret < 0) {
2099                 mlog_errno(ret);
2100                 goto out;
2101         }
2102
2103         ac->ac_find_loc_priv = res;
2104         *fe_blkno = res->sr_blkno;
2105
2106 out:
2107         if (handle)
2108                 ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle);
2109
2110         if (ret)
2111                 kfree(res);
2112
2113         return ret;
2114 }
2115
2116 int ocfs2_claim_new_inode_at_loc(handle_t *handle,
2117                                  struct inode *dir,
2118                                  struct ocfs2_alloc_context *ac,
2119                                  u64 *suballoc_loc,
2120                                  u16 *suballoc_bit,
2121                                  u64 di_blkno)
2122 {
2123         int ret;
2124         u16 chain;
2125         struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv;
2126         struct buffer_head *bg_bh = NULL;
2127         struct ocfs2_group_desc *bg;
2128         struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data;
2129
2130         /*
2131          * Since di_blkno is being passed back in, we check for any
2132          * inconsistencies which may have happened between
2133          * calls. These are code bugs as di_blkno is not expected to
2134          * change once returned from ocfs2_find_new_inode_loc()
2135          */
2136         BUG_ON(res->sr_blkno != di_blkno);
2137
2138         ret = ocfs2_read_group_descriptor(ac->ac_inode, di,
2139                                           res->sr_bg_stable_blkno, &bg_bh);
2140         if (ret) {
2141                 mlog_errno(ret);
2142                 goto out;
2143         }
2144
2145         bg = (struct ocfs2_group_desc *) bg_bh->b_data;
2146         chain = le16_to_cpu(bg->bg_chain);
2147
2148         ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle,
2149                                                ac->ac_bh, res->sr_bits,
2150                                                chain);
2151         if (ret) {
2152                 mlog_errno(ret);
2153                 goto out;
2154         }
2155
2156         ret = ocfs2_block_group_set_bits(handle,
2157                                          ac->ac_inode,
2158                                          bg,
2159                                          bg_bh,
2160                                          res->sr_bit_offset,
2161                                          res->sr_bits);
2162         if (ret < 0) {
2163                 mlog_errno(ret);
2164                 goto out;
2165         }
2166
2167         mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
2168              (unsigned long long)di_blkno);
2169
2170         atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2171
2172         BUG_ON(res->sr_bits != 1);
2173
2174         *suballoc_loc = res->sr_bg_blkno;
2175         *suballoc_bit = res->sr_bit_offset;
2176         ac->ac_bits_given++;
2177         ocfs2_save_inode_ac_group(dir, ac);
2178
2179 out:
2180         brelse(bg_bh);
2181
2182         return ret;
2183 }
2184
2185 int ocfs2_claim_new_inode(handle_t *handle,
2186                           struct inode *dir,
2187                           struct buffer_head *parent_fe_bh,
2188                           struct ocfs2_alloc_context *ac,
2189                           u64 *suballoc_loc,
2190                           u16 *suballoc_bit,
2191                           u64 *fe_blkno)
2192 {
2193         int status;
2194         struct ocfs2_suballoc_result res;
2195
2196         mlog_entry_void();
2197
2198         BUG_ON(!ac);
2199         BUG_ON(ac->ac_bits_given != 0);
2200         BUG_ON(ac->ac_bits_wanted != 1);
2201         BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
2202
2203         ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
2204
2205         status = ocfs2_claim_suballoc_bits(ac,
2206                                            handle,
2207                                            1,
2208                                            1,
2209                                            &res);
2210         if (status < 0) {
2211                 mlog_errno(status);
2212                 goto bail;
2213         }
2214         atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2215
2216         BUG_ON(res.sr_bits != 1);
2217
2218         *suballoc_loc = res.sr_bg_blkno;
2219         *suballoc_bit = res.sr_bit_offset;
2220         *fe_blkno = res.sr_blkno;
2221         ac->ac_bits_given++;
2222         ocfs2_save_inode_ac_group(dir, ac);
2223         status = 0;
2224 bail:
2225         mlog_exit(status);
2226         return status;
2227 }
2228
2229 /* translate a group desc. blkno and it's bitmap offset into
2230  * disk cluster offset. */
2231 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
2232                                                    u64 bg_blkno,
2233                                                    u16 bg_bit_off)
2234 {
2235         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2236         u32 cluster = 0;
2237
2238         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2239
2240         if (bg_blkno != osb->first_cluster_group_blkno)
2241                 cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
2242         cluster += (u32) bg_bit_off;
2243         return cluster;
2244 }
2245
2246 /* given a cluster offset, calculate which block group it belongs to
2247  * and return that block offset. */
2248 u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
2249 {
2250         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2251         u32 group_no;
2252
2253         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2254
2255         group_no = cluster / osb->bitmap_cpg;
2256         if (!group_no)
2257                 return osb->first_cluster_group_blkno;
2258         return ocfs2_clusters_to_blocks(inode->i_sb,
2259                                         group_no * osb->bitmap_cpg);
2260 }
2261
2262 /* given the block number of a cluster start, calculate which cluster
2263  * group and descriptor bitmap offset that corresponds to. */
2264 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
2265                                                 u64 data_blkno,
2266                                                 u64 *bg_blkno,
2267                                                 u16 *bg_bit_off)
2268 {
2269         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2270         u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
2271
2272         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2273
2274         *bg_blkno = ocfs2_which_cluster_group(inode,
2275                                               data_cluster);
2276
2277         if (*bg_blkno == osb->first_cluster_group_blkno)
2278                 *bg_bit_off = (u16) data_cluster;
2279         else
2280                 *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
2281                                                              data_blkno - *bg_blkno);
2282 }
2283
2284 /*
2285  * min_bits - minimum contiguous chunk from this total allocation we
2286  * can handle. set to what we asked for originally for a full
2287  * contig. allocation, set to '1' to indicate we can deal with extents
2288  * of any size.
2289  */
2290 int __ocfs2_claim_clusters(handle_t *handle,
2291                            struct ocfs2_alloc_context *ac,
2292                            u32 min_clusters,
2293                            u32 max_clusters,
2294                            u32 *cluster_start,
2295                            u32 *num_clusters)
2296 {
2297         int status;
2298         unsigned int bits_wanted = max_clusters;
2299         struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
2300         struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);
2301
2302         mlog_entry_void();
2303
2304         BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
2305
2306         BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
2307                && ac->ac_which != OCFS2_AC_USE_MAIN);
2308
2309         if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
2310                 WARN_ON(min_clusters > 1);
2311
2312                 status = ocfs2_claim_local_alloc_bits(osb,
2313                                                       handle,
2314                                                       ac,
2315                                                       bits_wanted,
2316                                                       cluster_start,
2317                                                       num_clusters);
2318                 if (!status)
2319                         atomic_inc(&osb->alloc_stats.local_data);
2320         } else {
2321                 if (min_clusters > (osb->bitmap_cpg - 1)) {
2322                         /* The only paths asking for contiguousness
2323                          * should know about this already. */
2324                         mlog(ML_ERROR, "minimum allocation requested %u exceeds "
2325                              "group bitmap size %u!\n", min_clusters,
2326                              osb->bitmap_cpg);
2327                         status = -ENOSPC;
2328                         goto bail;
2329                 }
2330                 /* clamp the current request down to a realistic size. */
2331                 if (bits_wanted > (osb->bitmap_cpg - 1))
2332                         bits_wanted = osb->bitmap_cpg - 1;
2333
2334                 status = ocfs2_claim_suballoc_bits(ac,
2335                                                    handle,
2336                                                    bits_wanted,
2337                                                    min_clusters,
2338                                                    &res);
2339                 if (!status) {
2340                         BUG_ON(res.sr_blkno); /* cluster alloc can't set */
2341                         *cluster_start =
2342                                 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
2343                                                                  res.sr_bg_blkno,
2344                                                                  res.sr_bit_offset);
2345                         atomic_inc(&osb->alloc_stats.bitmap_data);
2346                         *num_clusters = res.sr_bits;
2347                 }
2348         }
2349         if (status < 0) {
2350                 if (status != -ENOSPC)
2351                         mlog_errno(status);
2352                 goto bail;
2353         }
2354
2355         ac->ac_bits_given += *num_clusters;
2356
2357 bail:
2358         mlog_exit(status);
2359         return status;
2360 }
2361
2362 int ocfs2_claim_clusters(handle_t *handle,
2363                          struct ocfs2_alloc_context *ac,
2364                          u32 min_clusters,
2365                          u32 *cluster_start,
2366                          u32 *num_clusters)
2367 {
2368         unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
2369
2370         return __ocfs2_claim_clusters(handle, ac, min_clusters,
2371                                       bits_wanted, cluster_start, num_clusters);
2372 }
2373
2374 static int ocfs2_block_group_clear_bits(handle_t *handle,
2375                                         struct inode *alloc_inode,
2376                                         struct ocfs2_group_desc *bg,
2377                                         struct buffer_head *group_bh,
2378                                         unsigned int bit_off,
2379                                         unsigned int num_bits,
2380                                         void (*undo_fn)(unsigned int bit,
2381                                                         unsigned long *bmap))
2382 {
2383         int status;
2384         unsigned int tmp;
2385         struct ocfs2_group_desc *undo_bg = NULL;
2386
2387         mlog_entry_void();
2388
2389         /* The caller got this descriptor from
2390          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
2391         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
2392
2393         mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
2394
2395         BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
2396         status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
2397                                          group_bh,
2398                                          undo_fn ?
2399                                          OCFS2_JOURNAL_ACCESS_UNDO :
2400                                          OCFS2_JOURNAL_ACCESS_WRITE);
2401         if (status < 0) {
2402                 mlog_errno(status);
2403                 goto bail;
2404         }
2405
2406         if (undo_fn) {
2407                 jbd_lock_bh_state(group_bh);
2408                 undo_bg = (struct ocfs2_group_desc *)
2409                                         bh2jh(group_bh)->b_committed_data;
2410                 BUG_ON(!undo_bg);
2411         }
2412
2413         tmp = num_bits;
2414         while(tmp--) {
2415                 ocfs2_clear_bit((bit_off + tmp),
2416                                 (unsigned long *) bg->bg_bitmap);
2417                 if (undo_fn)
2418                         undo_fn(bit_off + tmp,
2419                                 (unsigned long *) undo_bg->bg_bitmap);
2420         }
2421         le16_add_cpu(&bg->bg_free_bits_count, num_bits);
2422
2423         if (undo_fn)
2424                 jbd_unlock_bh_state(group_bh);
2425
2426         ocfs2_journal_dirty(handle, group_bh);
2427 bail:
2428         return status;
2429 }
2430
2431 /*
2432  * expects the suballoc inode to already be locked.
2433  */
2434 static int _ocfs2_free_suballoc_bits(handle_t *handle,
2435                                      struct inode *alloc_inode,
2436                                      struct buffer_head *alloc_bh,
2437                                      unsigned int start_bit,
2438                                      u64 bg_blkno,
2439                                      unsigned int count,
2440                                      void (*undo_fn)(unsigned int bit,
2441                                                      unsigned long *bitmap))
2442 {
2443         int status = 0;
2444         u32 tmp_used;
2445         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
2446         struct ocfs2_chain_list *cl = &fe->id2.i_chain;
2447         struct buffer_head *group_bh = NULL;
2448         struct ocfs2_group_desc *group;
2449
2450         mlog_entry_void();
2451
2452         /* The alloc_bh comes from ocfs2_free_dinode() or
2453          * ocfs2_free_clusters().  The callers have all locked the
2454          * allocator and gotten alloc_bh from the lock call.  This
2455          * validates the dinode buffer.  Any corruption that has happended
2456          * is a code bug. */
2457         BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
2458         BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
2459
2460         mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
2461              (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
2462              (unsigned long long)bg_blkno, start_bit);
2463
2464         status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
2465                                              &group_bh);
2466         if (status < 0) {
2467                 mlog_errno(status);
2468                 goto bail;
2469         }
2470         group = (struct ocfs2_group_desc *) group_bh->b_data;
2471
2472         BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
2473
2474         status = ocfs2_block_group_clear_bits(handle, alloc_inode,
2475                                               group, group_bh,
2476                                               start_bit, count, undo_fn);
2477         if (status < 0) {
2478                 mlog_errno(status);
2479                 goto bail;
2480         }
2481
2482         status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
2483                                          alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
2484         if (status < 0) {
2485                 mlog_errno(status);
2486                 goto bail;
2487         }
2488
2489         le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
2490                      count);
2491         tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
2492         fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
2493         ocfs2_journal_dirty(handle, alloc_bh);
2494
2495 bail:
2496         brelse(group_bh);
2497
2498         mlog_exit(status);
2499         return status;
2500 }
2501
2502 int ocfs2_free_suballoc_bits(handle_t *handle,
2503                              struct inode *alloc_inode,
2504                              struct buffer_head *alloc_bh,
2505                              unsigned int start_bit,
2506                              u64 bg_blkno,
2507                              unsigned int count)
2508 {
2509         return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
2510                                          start_bit, bg_blkno, count, NULL);
2511 }
2512
2513 int ocfs2_free_dinode(handle_t *handle,
2514                       struct inode *inode_alloc_inode,
2515                       struct buffer_head *inode_alloc_bh,
2516                       struct ocfs2_dinode *di)
2517 {
2518         u64 blk = le64_to_cpu(di->i_blkno);
2519         u16 bit = le16_to_cpu(di->i_suballoc_bit);
2520         u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
2521
2522         if (di->i_suballoc_loc)
2523                 bg_blkno = le64_to_cpu(di->i_suballoc_loc);
2524         return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
2525                                         inode_alloc_bh, bit, bg_blkno, 1);
2526 }
2527
2528 static int _ocfs2_free_clusters(handle_t *handle,
2529                                 struct inode *bitmap_inode,
2530                                 struct buffer_head *bitmap_bh,
2531                                 u64 start_blk,
2532                                 unsigned int num_clusters,
2533                                 void (*undo_fn)(unsigned int bit,
2534                                                 unsigned long *bitmap))
2535 {
2536         int status;
2537         u16 bg_start_bit;
2538         u64 bg_blkno;
2539         struct ocfs2_dinode *fe;
2540
2541         /* You can't ever have a contiguous set of clusters
2542          * bigger than a block group bitmap so we never have to worry
2543          * about looping on them. */
2544
2545         mlog_entry_void();
2546
2547         /* This is expensive. We can safely remove once this stuff has
2548          * gotten tested really well. */
2549         BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
2550
2551         fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
2552
2553         ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
2554                                      &bg_start_bit);
2555
2556         mlog(0, "want to free %u clusters starting at block %llu\n",
2557              num_clusters, (unsigned long long)start_blk);
2558         mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
2559              (unsigned long long)bg_blkno, bg_start_bit);
2560
2561         status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
2562                                            bg_start_bit, bg_blkno,
2563                                            num_clusters, undo_fn);
2564         if (status < 0) {
2565                 mlog_errno(status);
2566                 goto out;
2567         }
2568
2569         ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
2570                                          num_clusters);
2571
2572 out:
2573         mlog_exit(status);
2574         return status;
2575 }
2576
2577 int ocfs2_free_clusters(handle_t *handle,
2578                         struct inode *bitmap_inode,
2579                         struct buffer_head *bitmap_bh,
2580                         u64 start_blk,
2581                         unsigned int num_clusters)
2582 {
2583         return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2584                                     start_blk, num_clusters,
2585                                     _ocfs2_set_bit);
2586 }
2587
2588 /*
2589  * Give never-used clusters back to the global bitmap.  We don't need
2590  * to protect these bits in the undo buffer.
2591  */
2592 int ocfs2_release_clusters(handle_t *handle,
2593                            struct inode *bitmap_inode,
2594                            struct buffer_head *bitmap_bh,
2595                            u64 start_blk,
2596                            unsigned int num_clusters)
2597 {
2598         return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2599                                     start_blk, num_clusters,
2600                                     _ocfs2_clear_bit);
2601 }
2602
2603 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
2604 {
2605         printk("Block Group:\n");
2606         printk("bg_signature:       %s\n", bg->bg_signature);
2607         printk("bg_size:            %u\n", bg->bg_size);
2608         printk("bg_bits:            %u\n", bg->bg_bits);
2609         printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
2610         printk("bg_chain:           %u\n", bg->bg_chain);
2611         printk("bg_generation:      %u\n", le32_to_cpu(bg->bg_generation));
2612         printk("bg_next_group:      %llu\n",
2613                (unsigned long long)bg->bg_next_group);
2614         printk("bg_parent_dinode:   %llu\n",
2615                (unsigned long long)bg->bg_parent_dinode);
2616         printk("bg_blkno:           %llu\n",
2617                (unsigned long long)bg->bg_blkno);
2618 }
2619
2620 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
2621 {
2622         int i;
2623
2624         printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno);
2625         printk("i_signature:                  %s\n", fe->i_signature);
2626         printk("i_size:                       %llu\n",
2627                (unsigned long long)fe->i_size);
2628         printk("i_clusters:                   %u\n", fe->i_clusters);
2629         printk("i_generation:                 %u\n",
2630                le32_to_cpu(fe->i_generation));
2631         printk("id1.bitmap1.i_used:           %u\n",
2632                le32_to_cpu(fe->id1.bitmap1.i_used));
2633         printk("id1.bitmap1.i_total:          %u\n",
2634                le32_to_cpu(fe->id1.bitmap1.i_total));
2635         printk("id2.i_chain.cl_cpg:           %u\n", fe->id2.i_chain.cl_cpg);
2636         printk("id2.i_chain.cl_bpc:           %u\n", fe->id2.i_chain.cl_bpc);
2637         printk("id2.i_chain.cl_count:         %u\n", fe->id2.i_chain.cl_count);
2638         printk("id2.i_chain.cl_next_free_rec: %u\n",
2639                fe->id2.i_chain.cl_next_free_rec);
2640         for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) {
2641                 printk("fe->id2.i_chain.cl_recs[%d].c_free:  %u\n", i,
2642                        fe->id2.i_chain.cl_recs[i].c_free);
2643                 printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
2644                        fe->id2.i_chain.cl_recs[i].c_total);
2645                 printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i,
2646                        (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);
2647         }
2648 }
2649
2650 /*
2651  * For a given allocation, determine which allocators will need to be
2652  * accessed, and lock them, reserving the appropriate number of bits.
2653  *
2654  * Sparse file systems call this from ocfs2_write_begin_nolock()
2655  * and ocfs2_allocate_unwritten_extents().
2656  *
2657  * File systems which don't support holes call this from
2658  * ocfs2_extend_allocation().
2659  */
2660 int ocfs2_lock_allocators(struct inode *inode,
2661                           struct ocfs2_extent_tree *et,
2662                           u32 clusters_to_add, u32 extents_to_split,
2663                           struct ocfs2_alloc_context **data_ac,
2664                           struct ocfs2_alloc_context **meta_ac)
2665 {
2666         int ret = 0, num_free_extents;
2667         unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
2668         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2669
2670         *meta_ac = NULL;
2671         if (data_ac)
2672                 *data_ac = NULL;
2673
2674         BUG_ON(clusters_to_add != 0 && data_ac == NULL);
2675
2676         num_free_extents = ocfs2_num_free_extents(osb, et);
2677         if (num_free_extents < 0) {
2678                 ret = num_free_extents;
2679                 mlog_errno(ret);
2680                 goto out;
2681         }
2682
2683         /*
2684          * Sparse allocation file systems need to be more conservative
2685          * with reserving room for expansion - the actual allocation
2686          * happens while we've got a journal handle open so re-taking
2687          * a cluster lock (because we ran out of room for another
2688          * extent) will violate ordering rules.
2689          *
2690          * Most of the time we'll only be seeing this 1 cluster at a time
2691          * anyway.
2692          *
2693          * Always lock for any unwritten extents - we might want to
2694          * add blocks during a split.
2695          */
2696         if (!num_free_extents ||
2697             (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
2698                 ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
2699                 if (ret < 0) {
2700                         if (ret != -ENOSPC)
2701                                 mlog_errno(ret);
2702                         goto out;
2703                 }
2704         }
2705
2706         if (clusters_to_add == 0)
2707                 goto out;
2708
2709         ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
2710         if (ret < 0) {
2711                 if (ret != -ENOSPC)
2712                         mlog_errno(ret);
2713                 goto out;
2714         }
2715
2716 out:
2717         if (ret) {
2718                 if (*meta_ac) {
2719                         ocfs2_free_alloc_context(*meta_ac);
2720                         *meta_ac = NULL;
2721                 }
2722
2723                 /*
2724                  * We cannot have an error and a non null *data_ac.
2725                  */
2726         }
2727
2728         return ret;
2729 }
2730
2731 /*
2732  * Read the inode specified by blkno to get suballoc_slot and
2733  * suballoc_bit.
2734  */
2735 static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2736                                        u16 *suballoc_slot, u64 *group_blkno,
2737                                        u16 *suballoc_bit)
2738 {
2739         int status;
2740         struct buffer_head *inode_bh = NULL;
2741         struct ocfs2_dinode *inode_fe;
2742
2743         mlog_entry("blkno: %llu\n", (unsigned long long)blkno);
2744
2745         /* dirty read disk */
2746         status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
2747         if (status < 0) {
2748                 mlog(ML_ERROR, "read block %llu failed %d\n",
2749                      (unsigned long long)blkno, status);
2750                 goto bail;
2751         }
2752
2753         inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
2754         if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
2755                 mlog(ML_ERROR, "invalid inode %llu requested\n",
2756                      (unsigned long long)blkno);
2757                 status = -EINVAL;
2758                 goto bail;
2759         }
2760
2761         if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT &&
2762             (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
2763                 mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
2764                      (unsigned long long)blkno,
2765                      (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
2766                 status = -EINVAL;
2767                 goto bail;
2768         }
2769
2770         if (suballoc_slot)
2771                 *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
2772         if (suballoc_bit)
2773                 *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
2774         if (group_blkno)
2775                 *group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc);
2776
2777 bail:
2778         brelse(inode_bh);
2779
2780         mlog_exit(status);
2781         return status;
2782 }
2783
2784 /*
2785  * test whether bit is SET in allocator bitmap or not.  on success, 0
2786  * is returned and *res is 1 for SET; 0 otherwise.  when fails, errno
2787  * is returned and *res is meaningless.  Call this after you have
2788  * cluster locked against suballoc, or you may get a result based on
2789  * non-up2date contents
2790  */
2791 static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2792                                    struct inode *suballoc,
2793                                    struct buffer_head *alloc_bh,
2794                                    u64 group_blkno, u64 blkno,
2795                                    u16 bit, int *res)
2796 {
2797         struct ocfs2_dinode *alloc_di;
2798         struct ocfs2_group_desc *group;
2799         struct buffer_head *group_bh = NULL;
2800         u64 bg_blkno;
2801         int status;
2802
2803         mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno,
2804                    (unsigned int)bit);
2805
2806         alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data;
2807         if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) {
2808                 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
2809                      (unsigned int)bit,
2810                      ocfs2_bits_per_group(&alloc_di->id2.i_chain));
2811                 status = -EINVAL;
2812                 goto bail;
2813         }
2814
2815         bg_blkno = group_blkno ? group_blkno :
2816                    ocfs2_which_suballoc_group(blkno, bit);
2817         status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
2818                                              &group_bh);
2819         if (status < 0) {
2820                 mlog(ML_ERROR, "read group %llu failed %d\n",
2821                      (unsigned long long)bg_blkno, status);
2822                 goto bail;
2823         }
2824
2825         group = (struct ocfs2_group_desc *) group_bh->b_data;
2826         *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap);
2827
2828 bail:
2829         brelse(group_bh);
2830
2831         mlog_exit(status);
2832         return status;
2833 }
2834
2835 /*
2836  * Test if the bit representing this inode (blkno) is set in the
2837  * suballocator.
2838  *
2839  * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
2840  *
2841  * In the event of failure, a negative value is returned and *res is
2842  * meaningless.
2843  *
2844  * Callers must make sure to hold nfs_sync_lock to prevent
2845  * ocfs2_delete_inode() on another node from accessing the same
2846  * suballocator concurrently.
2847  */
2848 int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2849 {
2850         int status;
2851         u64 group_blkno = 0;
2852         u16 suballoc_bit = 0, suballoc_slot = 0;
2853         struct inode *inode_alloc_inode;
2854         struct buffer_head *alloc_bh = NULL;
2855
2856         mlog_entry("blkno: %llu", (unsigned long long)blkno);
2857
2858         status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
2859                                              &group_blkno, &suballoc_bit);
2860         if (status < 0) {
2861                 mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
2862                 goto bail;
2863         }
2864
2865         inode_alloc_inode =
2866                 ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
2867                                             suballoc_slot);
2868         if (!inode_alloc_inode) {
2869                 /* the error code could be inaccurate, but we are not able to
2870                  * get the correct one. */
2871                 status = -EINVAL;
2872                 mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
2873                      (u32)suballoc_slot);
2874                 goto bail;
2875         }
2876
2877         mutex_lock(&inode_alloc_inode->i_mutex);
2878         status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
2879         if (status < 0) {
2880                 mutex_unlock(&inode_alloc_inode->i_mutex);
2881                 mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
2882                      (u32)suballoc_slot, status);
2883                 goto bail;
2884         }
2885
2886         status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
2887                                          group_blkno, blkno, suballoc_bit, res);
2888         if (status < 0)
2889                 mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
2890
2891         ocfs2_inode_unlock(inode_alloc_inode, 0);
2892         mutex_unlock(&inode_alloc_inode->i_mutex);
2893
2894         iput(inode_alloc_inode);
2895         brelse(alloc_bh);
2896 bail:
2897         mlog_exit(status);
2898         return status;
2899 }