Merge branch 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jlbec...
[pandora-kernel.git] / fs / ocfs2 / suballoc.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * suballoc.c
5  *
6  * metadata alloc and free
7  * Inspired by ext3 block groups.
8  *
9  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
10  *
11  * This program is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU General Public
13  * License as published by the Free Software Foundation; either
14  * version 2 of the License, or (at your option) any later version.
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public
22  * License along with this program; if not, write to the
23  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24  * Boston, MA 02111-1307, USA.
25  */
26
27 #include <linux/fs.h>
28 #include <linux/types.h>
29 #include <linux/slab.h>
30 #include <linux/highmem.h>
31
32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
33 #include <cluster/masklog.h>
34
35 #include "ocfs2.h"
36
37 #include "alloc.h"
38 #include "blockcheck.h"
39 #include "dlmglue.h"
40 #include "inode.h"
41 #include "journal.h"
42 #include "localalloc.h"
43 #include "suballoc.h"
44 #include "super.h"
45 #include "sysfile.h"
46 #include "uptodate.h"
47
48 #include "buffer_head_io.h"
49
/*
 * Values for the "flags" argument taken by ocfs2_block_group_alloc() and
 * ocfs2_reserve_clusters_with_limit().
 * NOTE(review): the exact meaning of each flag is decided by code outside
 * this excerpt -- confirm against the users before relying on the names.
 */
#define NOT_ALLOC_NEW_GROUP		0
#define ALLOC_NEW_GROUP			0x1
#define ALLOC_GROUPS_FROM_GLOBAL	0x2

/* NOTE(review): presumably an upper bound used when stealing -- confirm. */
#define OCFS2_MAX_TO_STEAL		1024
55
56 struct ocfs2_suballoc_result {
57         u64             sr_bg_blkno;    /* The bg we allocated from.  Set
58                                            to 0 when a block group is
59                                            contiguous. */
60         u64             sr_bg_stable_blkno; /*
61                                              * Doesn't change, always
62                                              * set to target block
63                                              * group descriptor
64                                              * block.
65                                              */
66         u64             sr_blkno;       /* The first allocated block */
67         unsigned int    sr_bit_offset;  /* The bit in the bg */
68         unsigned int    sr_bits;        /* How many bits we claimed */
69 };
70
71 static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
72 {
73         if (res->sr_blkno == 0)
74                 return 0;
75
76         if (res->sr_bg_blkno)
77                 return res->sr_bg_blkno;
78
79         return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset);
80 }
81
82 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
83 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
84 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
85 static int ocfs2_block_group_fill(handle_t *handle,
86                                   struct inode *alloc_inode,
87                                   struct buffer_head *bg_bh,
88                                   u64 group_blkno,
89                                   unsigned int group_clusters,
90                                   u16 my_chain,
91                                   struct ocfs2_chain_list *cl);
92 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
93                                    struct inode *alloc_inode,
94                                    struct buffer_head *bh,
95                                    u64 max_block,
96                                    u64 *last_alloc_group,
97                                    int flags);
98
99 static int ocfs2_cluster_group_search(struct inode *inode,
100                                       struct buffer_head *group_bh,
101                                       u32 bits_wanted, u32 min_bits,
102                                       u64 max_block,
103                                       struct ocfs2_suballoc_result *res);
104 static int ocfs2_block_group_search(struct inode *inode,
105                                     struct buffer_head *group_bh,
106                                     u32 bits_wanted, u32 min_bits,
107                                     u64 max_block,
108                                     struct ocfs2_suballoc_result *res);
109 static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
110                                      handle_t *handle,
111                                      u32 bits_wanted,
112                                      u32 min_bits,
113                                      struct ocfs2_suballoc_result *res);
114 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
115                                          int nr);
116 static inline int ocfs2_block_group_set_bits(handle_t *handle,
117                                              struct inode *alloc_inode,
118                                              struct ocfs2_group_desc *bg,
119                                              struct buffer_head *group_bh,
120                                              unsigned int bit_off,
121                                              unsigned int num_bits);
122 static int ocfs2_relink_block_group(handle_t *handle,
123                                     struct inode *alloc_inode,
124                                     struct buffer_head *fe_bh,
125                                     struct buffer_head *bg_bh,
126                                     struct buffer_head *prev_bg_bh,
127                                     u16 chain);
128 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
129                                                      u32 wanted);
130 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
131                                                    u64 bg_blkno,
132                                                    u16 bg_bit_off);
133 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
134                                                 u64 data_blkno,
135                                                 u64 *bg_blkno,
136                                                 u16 *bg_bit_off);
137 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
138                                              u32 bits_wanted, u64 max_block,
139                                              int flags,
140                                              struct ocfs2_alloc_context **ac);
141
142 void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
143 {
144         struct inode *inode = ac->ac_inode;
145
146         if (inode) {
147                 if (ac->ac_which != OCFS2_AC_USE_LOCAL)
148                         ocfs2_inode_unlock(inode, 1);
149
150                 mutex_unlock(&inode->i_mutex);
151
152                 iput(inode);
153                 ac->ac_inode = NULL;
154         }
155         brelse(ac->ac_bh);
156         ac->ac_bh = NULL;
157         ac->ac_resv = NULL;
158         if (ac->ac_find_loc_priv) {
159                 kfree(ac->ac_find_loc_priv);
160                 ac->ac_find_loc_priv = NULL;
161         }
162 }
163
/*
 * Release the resources held by @ac via ocfs2_free_ac_resource() and then
 * free the context structure itself.  @ac must not be used afterwards.
 */
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
{
	/* Locks, inode and buffer references go first ... */
	ocfs2_free_ac_resource(ac);

	/* ... then the memory backing the context. */
	kfree(ac);
}
169
170 static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
171 {
172         return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
173 }
174
/*
 * Report a group descriptor validation failure.
 *
 * Relies on two names being in scope where it is expanded: "resize" and
 * "sb".  When resize is non-zero the message is only logged through
 * mlog(ML_ERROR, ...); otherwise it is handed to ocfs2_error().
 */
#define do_error(fmt, ...)						\
	do {								\
		if (!resize)						\
			ocfs2_error(sb, fmt, ##__VA_ARGS__);		\
		else							\
			mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__);	\
	} while (0)
182
183 static int ocfs2_validate_gd_self(struct super_block *sb,
184                                   struct buffer_head *bh,
185                                   int resize)
186 {
187         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
188
189         if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
190                 do_error("Group descriptor #%llu has bad signature %.*s",
191                          (unsigned long long)bh->b_blocknr, 7,
192                          gd->bg_signature);
193                 return -EINVAL;
194         }
195
196         if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
197                 do_error("Group descriptor #%llu has an invalid bg_blkno "
198                          "of %llu",
199                          (unsigned long long)bh->b_blocknr,
200                          (unsigned long long)le64_to_cpu(gd->bg_blkno));
201                 return -EINVAL;
202         }
203
204         if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
205                 do_error("Group descriptor #%llu has an invalid "
206                          "fs_generation of #%u",
207                          (unsigned long long)bh->b_blocknr,
208                          le32_to_cpu(gd->bg_generation));
209                 return -EINVAL;
210         }
211
212         if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
213                 do_error("Group descriptor #%llu has bit count %u but "
214                          "claims that %u are free",
215                          (unsigned long long)bh->b_blocknr,
216                          le16_to_cpu(gd->bg_bits),
217                          le16_to_cpu(gd->bg_free_bits_count));
218                 return -EINVAL;
219         }
220
221         if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
222                 do_error("Group descriptor #%llu has bit count %u but "
223                          "max bitmap bits of %u",
224                          (unsigned long long)bh->b_blocknr,
225                          le16_to_cpu(gd->bg_bits),
226                          8 * le16_to_cpu(gd->bg_size));
227                 return -EINVAL;
228         }
229
230         return 0;
231 }
232
233 static int ocfs2_validate_gd_parent(struct super_block *sb,
234                                     struct ocfs2_dinode *di,
235                                     struct buffer_head *bh,
236                                     int resize)
237 {
238         unsigned int max_bits;
239         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
240
241         if (di->i_blkno != gd->bg_parent_dinode) {
242                 do_error("Group descriptor #%llu has bad parent "
243                          "pointer (%llu, expected %llu)",
244                          (unsigned long long)bh->b_blocknr,
245                          (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
246                          (unsigned long long)le64_to_cpu(di->i_blkno));
247                 return -EINVAL;
248         }
249
250         max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
251         if (le16_to_cpu(gd->bg_bits) > max_bits) {
252                 do_error("Group descriptor #%llu has bit count of %u",
253                          (unsigned long long)bh->b_blocknr,
254                          le16_to_cpu(gd->bg_bits));
255                 return -EINVAL;
256         }
257
258         /* In resize, we may meet the case bg_chain == cl_next_free_rec. */
259         if ((le16_to_cpu(gd->bg_chain) >
260              le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
261             ((le16_to_cpu(gd->bg_chain) ==
262              le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
263                 do_error("Group descriptor #%llu has bad chain %u",
264                          (unsigned long long)bh->b_blocknr,
265                          le16_to_cpu(gd->bg_chain));
266                 return -EINVAL;
267         }
268
269         return 0;
270 }
271
272 #undef do_error
273
274 /*
275  * This version only prints errors.  It does not fail the filesystem, and
276  * exists only for resize.
277  */
278 int ocfs2_check_group_descriptor(struct super_block *sb,
279                                  struct ocfs2_dinode *di,
280                                  struct buffer_head *bh)
281 {
282         int rc;
283         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
284
285         BUG_ON(!buffer_uptodate(bh));
286
287         /*
288          * If the ecc fails, we return the error but otherwise
289          * leave the filesystem running.  We know any error is
290          * local to this block.
291          */
292         rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
293         if (rc) {
294                 mlog(ML_ERROR,
295                      "Checksum failed for group descriptor %llu\n",
296                      (unsigned long long)bh->b_blocknr);
297         } else
298                 rc = ocfs2_validate_gd_self(sb, bh, 1);
299         if (!rc)
300                 rc = ocfs2_validate_gd_parent(sb, di, bh, 1);
301
302         return rc;
303 }
304
305 static int ocfs2_validate_group_descriptor(struct super_block *sb,
306                                            struct buffer_head *bh)
307 {
308         int rc;
309         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
310
311         mlog(0, "Validating group descriptor %llu\n",
312              (unsigned long long)bh->b_blocknr);
313
314         BUG_ON(!buffer_uptodate(bh));
315
316         /*
317          * If the ecc fails, we return the error but otherwise
318          * leave the filesystem running.  We know any error is
319          * local to this block.
320          */
321         rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
322         if (rc)
323                 return rc;
324
325         /*
326          * Errors after here are fatal.
327          */
328
329         return ocfs2_validate_gd_self(sb, bh, 0);
330 }
331
332 int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
333                                 u64 gd_blkno, struct buffer_head **bh)
334 {
335         int rc;
336         struct buffer_head *tmp = *bh;
337
338         rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp,
339                               ocfs2_validate_group_descriptor);
340         if (rc)
341                 goto out;
342
343         rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
344         if (rc) {
345                 brelse(tmp);
346                 goto out;
347         }
348
349         /* If ocfs2_read_block() got us a new bh, pass it up. */
350         if (!*bh)
351                 *bh = tmp;
352
353 out:
354         return rc;
355 }
356
357 static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
358                                           struct ocfs2_group_desc *bg,
359                                           struct ocfs2_chain_list *cl,
360                                           u64 p_blkno, unsigned int clusters)
361 {
362         struct ocfs2_extent_list *el = &bg->bg_list;
363         struct ocfs2_extent_rec *rec;
364
365         BUG_ON(!ocfs2_supports_discontig_bg(osb));
366         if (!el->l_next_free_rec)
367                 el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb));
368         rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)];
369         rec->e_blkno = cpu_to_le64(p_blkno);
370         rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
371                                   le16_to_cpu(cl->cl_bpc));
372         rec->e_leaf_clusters = cpu_to_le16(clusters);
373         le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
374         le16_add_cpu(&bg->bg_free_bits_count,
375                      clusters * le16_to_cpu(cl->cl_bpc));
376         le16_add_cpu(&el->l_next_free_rec, 1);
377 }
378
379 static int ocfs2_block_group_fill(handle_t *handle,
380                                   struct inode *alloc_inode,
381                                   struct buffer_head *bg_bh,
382                                   u64 group_blkno,
383                                   unsigned int group_clusters,
384                                   u16 my_chain,
385                                   struct ocfs2_chain_list *cl)
386 {
387         int status = 0;
388         struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
389         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
390         struct super_block * sb = alloc_inode->i_sb;
391
392         mlog_entry_void();
393
394         if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
395                 ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
396                             "b_blocknr (%llu)",
397                             (unsigned long long)group_blkno,
398                             (unsigned long long) bg_bh->b_blocknr);
399                 status = -EIO;
400                 goto bail;
401         }
402
403         status = ocfs2_journal_access_gd(handle,
404                                          INODE_CACHE(alloc_inode),
405                                          bg_bh,
406                                          OCFS2_JOURNAL_ACCESS_CREATE);
407         if (status < 0) {
408                 mlog_errno(status);
409                 goto bail;
410         }
411
412         memset(bg, 0, sb->s_blocksize);
413         strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
414         bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
415         bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
416                                                 osb->s_feature_incompat));
417         bg->bg_chain = cpu_to_le16(my_chain);
418         bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
419         bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
420         bg->bg_blkno = cpu_to_le64(group_blkno);
421         if (group_clusters == le16_to_cpu(cl->cl_cpg))
422                 bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
423         else
424                 ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno,
425                                               group_clusters);
426
427         /* set the 1st bit in the bitmap to account for the descriptor block */
428         ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
429         bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
430
431         ocfs2_journal_dirty(handle, bg_bh);
432
433         /* There is no need to zero out or otherwise initialize the
434          * other blocks in a group - All valid FS metadata in a block
435          * group stores the superblock fs_generation value at
436          * allocation time. */
437
438 bail:
439         mlog_exit(status);
440         return status;
441 }
442
443 static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
444 {
445         u16 curr, best;
446
447         best = curr = 0;
448         while (curr < le16_to_cpu(cl->cl_count)) {
449                 if (le32_to_cpu(cl->cl_recs[best].c_total) >
450                     le32_to_cpu(cl->cl_recs[curr].c_total))
451                         best = curr;
452                 curr++;
453         }
454         return best;
455 }
456
457 static struct buffer_head *
458 ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
459                                struct inode *alloc_inode,
460                                struct ocfs2_alloc_context *ac,
461                                struct ocfs2_chain_list *cl)
462 {
463         int status;
464         u32 bit_off, num_bits;
465         u64 bg_blkno;
466         struct buffer_head *bg_bh;
467         unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
468
469         status = ocfs2_claim_clusters(handle, ac,
470                                       le16_to_cpu(cl->cl_cpg), &bit_off,
471                                       &num_bits);
472         if (status < 0) {
473                 if (status != -ENOSPC)
474                         mlog_errno(status);
475                 goto bail;
476         }
477
478         /* setup the group */
479         bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
480         mlog(0, "new descriptor, record %u, at block %llu\n",
481              alloc_rec, (unsigned long long)bg_blkno);
482
483         bg_bh = sb_getblk(osb->sb, bg_blkno);
484         if (!bg_bh) {
485                 status = -EIO;
486                 mlog_errno(status);
487                 goto bail;
488         }
489         ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
490
491         status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
492                                         bg_blkno, num_bits, alloc_rec, cl);
493         if (status < 0) {
494                 brelse(bg_bh);
495                 mlog_errno(status);
496         }
497
498 bail:
499         return status ? ERR_PTR(status) : bg_bh;
500 }
501
502 static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb,
503                                         handle_t *handle,
504                                         struct ocfs2_alloc_context *ac,
505                                         unsigned int min_bits,
506                                         u32 *bit_off, u32 *num_bits)
507 {
508         int status = 0;
509
510         while (min_bits) {
511                 status = ocfs2_claim_clusters(handle, ac, min_bits,
512                                               bit_off, num_bits);
513                 if (status != -ENOSPC)
514                         break;
515
516                 min_bits >>= 1;
517         }
518
519         return status;
520 }
521
522 static int ocfs2_block_group_grow_discontig(handle_t *handle,
523                                             struct inode *alloc_inode,
524                                             struct buffer_head *bg_bh,
525                                             struct ocfs2_alloc_context *ac,
526                                             struct ocfs2_chain_list *cl,
527                                             unsigned int min_bits)
528 {
529         int status;
530         struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
531         struct ocfs2_group_desc *bg =
532                 (struct ocfs2_group_desc *)bg_bh->b_data;
533         unsigned int needed = le16_to_cpu(cl->cl_cpg) -
534                          le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
535         u32 p_cpos, clusters;
536         u64 p_blkno;
537         struct ocfs2_extent_list *el = &bg->bg_list;
538
539         status = ocfs2_journal_access_gd(handle,
540                                          INODE_CACHE(alloc_inode),
541                                          bg_bh,
542                                          OCFS2_JOURNAL_ACCESS_CREATE);
543         if (status < 0) {
544                 mlog_errno(status);
545                 goto bail;
546         }
547
548         while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) <
549                                 le16_to_cpu(el->l_count))) {
550                 if (min_bits > needed)
551                         min_bits = needed;
552                 status = ocfs2_block_group_claim_bits(osb, handle, ac,
553                                                       min_bits, &p_cpos,
554                                                       &clusters);
555                 if (status < 0) {
556                         if (status != -ENOSPC)
557                                 mlog_errno(status);
558                         goto bail;
559                 }
560                 p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos);
561                 ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno,
562                                               clusters);
563
564                 min_bits = clusters;
565                 needed = le16_to_cpu(cl->cl_cpg) -
566                          le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
567         }
568
569         if (needed > 0) {
570                 /*
571                  * We have used up all the extent rec but can't fill up
572                  * the cpg. So bail out.
573                  */
574                 status = -ENOSPC;
575                 goto bail;
576         }
577
578         ocfs2_journal_dirty(handle, bg_bh);
579
580 bail:
581         return status;
582 }
583
584 static void ocfs2_bg_alloc_cleanup(handle_t *handle,
585                                    struct ocfs2_alloc_context *cluster_ac,
586                                    struct inode *alloc_inode,
587                                    struct buffer_head *bg_bh)
588 {
589         int i, ret;
590         struct ocfs2_group_desc *bg;
591         struct ocfs2_extent_list *el;
592         struct ocfs2_extent_rec *rec;
593
594         if (!bg_bh)
595                 return;
596
597         bg = (struct ocfs2_group_desc *)bg_bh->b_data;
598         el = &bg->bg_list;
599         for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
600                 rec = &el->l_recs[i];
601                 ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode,
602                                           cluster_ac->ac_bh,
603                                           le64_to_cpu(rec->e_blkno),
604                                           le32_to_cpu(rec->e_leaf_clusters));
605                 if (ret)
606                         mlog_errno(ret);
607                 /* Try all the clusters to free */
608         }
609
610         ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh);
611         brelse(bg_bh);
612 }
613
614 static struct buffer_head *
615 ocfs2_block_group_alloc_discontig(handle_t *handle,
616                                   struct inode *alloc_inode,
617                                   struct ocfs2_alloc_context *ac,
618                                   struct ocfs2_chain_list *cl)
619 {
620         int status;
621         u32 bit_off, num_bits;
622         u64 bg_blkno;
623         unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1;
624         struct buffer_head *bg_bh = NULL;
625         unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
626         struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
627
628         if (!ocfs2_supports_discontig_bg(osb)) {
629                 status = -ENOSPC;
630                 goto bail;
631         }
632
633         status = ocfs2_extend_trans(handle,
634                                     ocfs2_calc_bg_discontig_credits(osb->sb));
635         if (status) {
636                 mlog_errno(status);
637                 goto bail;
638         }
639
640         /*
641          * We're going to be grabbing from multiple cluster groups.
642          * We don't have enough credits to relink them all, and the
643          * cluster groups will be staying in cache for the duration of
644          * this operation.
645          */
646         ac->ac_allow_chain_relink = 0;
647
648         /* Claim the first region */
649         status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
650                                               &bit_off, &num_bits);
651         if (status < 0) {
652                 if (status != -ENOSPC)
653                         mlog_errno(status);
654                 goto bail;
655         }
656         min_bits = num_bits;
657
658         /* setup the group */
659         bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
660         mlog(0, "new descriptor, record %u, at block %llu\n",
661              alloc_rec, (unsigned long long)bg_blkno);
662
663         bg_bh = sb_getblk(osb->sb, bg_blkno);
664         if (!bg_bh) {
665                 status = -EIO;
666                 mlog_errno(status);
667                 goto bail;
668         }
669         ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
670
671         status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
672                                         bg_blkno, num_bits, alloc_rec, cl);
673         if (status < 0) {
674                 mlog_errno(status);
675                 goto bail;
676         }
677
678         status = ocfs2_block_group_grow_discontig(handle, alloc_inode,
679                                                   bg_bh, ac, cl, min_bits);
680         if (status)
681                 mlog_errno(status);
682
683 bail:
684         if (status)
685                 ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh);
686         return status ? ERR_PTR(status) : bg_bh;
687 }
688
689 /*
690  * We expect the block group allocator to already be locked.
691  */
692 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
693                                    struct inode *alloc_inode,
694                                    struct buffer_head *bh,
695                                    u64 max_block,
696                                    u64 *last_alloc_group,
697                                    int flags)
698 {
699         int status, credits;
700         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
701         struct ocfs2_chain_list *cl;
702         struct ocfs2_alloc_context *ac = NULL;
703         handle_t *handle = NULL;
704         u16 alloc_rec;
705         struct buffer_head *bg_bh = NULL;
706         struct ocfs2_group_desc *bg;
707
708         BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
709
710         mlog_entry_void();
711
712         cl = &fe->id2.i_chain;
713         status = ocfs2_reserve_clusters_with_limit(osb,
714                                                    le16_to_cpu(cl->cl_cpg),
715                                                    max_block, flags, &ac);
716         if (status < 0) {
717                 if (status != -ENOSPC)
718                         mlog_errno(status);
719                 goto bail;
720         }
721
722         credits = ocfs2_calc_group_alloc_credits(osb->sb,
723                                                  le16_to_cpu(cl->cl_cpg));
724         handle = ocfs2_start_trans(osb, credits);
725         if (IS_ERR(handle)) {
726                 status = PTR_ERR(handle);
727                 handle = NULL;
728                 mlog_errno(status);
729                 goto bail;
730         }
731
732         if (last_alloc_group && *last_alloc_group != 0) {
733                 mlog(0, "use old allocation group %llu for block group alloc\n",
734                      (unsigned long long)*last_alloc_group);
735                 ac->ac_last_group = *last_alloc_group;
736         }
737
738         bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
739                                                ac, cl);
740         if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC))
741                 bg_bh = ocfs2_block_group_alloc_discontig(handle,
742                                                           alloc_inode,
743                                                           ac, cl);
744         if (IS_ERR(bg_bh)) {
745                 status = PTR_ERR(bg_bh);
746                 bg_bh = NULL;
747                 if (status != -ENOSPC)
748                         mlog_errno(status);
749                 goto bail;
750         }
751         bg = (struct ocfs2_group_desc *) bg_bh->b_data;
752
753         status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
754                                          bh, OCFS2_JOURNAL_ACCESS_WRITE);
755         if (status < 0) {
756                 mlog_errno(status);
757                 goto bail;
758         }
759
760         alloc_rec = le16_to_cpu(bg->bg_chain);
761         le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
762                      le16_to_cpu(bg->bg_free_bits_count));
763         le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
764                      le16_to_cpu(bg->bg_bits));
765         cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno;
766         if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
767                 le16_add_cpu(&cl->cl_next_free_rec, 1);
768
769         le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
770                                         le16_to_cpu(bg->bg_free_bits_count));
771         le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
772         le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
773
774         ocfs2_journal_dirty(handle, bh);
775
776         spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
777         OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
778         fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
779                                              le32_to_cpu(fe->i_clusters)));
780         spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
781         i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
782         alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
783
784         status = 0;
785
786         /* save the new last alloc group so that the caller can cache it. */
787         if (last_alloc_group)
788                 *last_alloc_group = ac->ac_last_group;
789
790 bail:
791         if (handle)
792                 ocfs2_commit_trans(osb, handle);
793
794         if (ac)
795                 ocfs2_free_alloc_context(ac);
796
797         brelse(bg_bh);
798
799         mlog_exit(status);
800         return status;
801 }
802
803 static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
804                                        struct ocfs2_alloc_context *ac,
805                                        int type,
806                                        u32 slot,
807                                        u64 *last_alloc_group,
808                                        int flags)
809 {
810         int status;
811         u32 bits_wanted = ac->ac_bits_wanted;
812         struct inode *alloc_inode;
813         struct buffer_head *bh = NULL;
814         struct ocfs2_dinode *fe;
815         u32 free_bits;
816
817         mlog_entry_void();
818
819         alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
820         if (!alloc_inode) {
821                 mlog_errno(-EINVAL);
822                 return -EINVAL;
823         }
824
825         mutex_lock(&alloc_inode->i_mutex);
826
827         status = ocfs2_inode_lock(alloc_inode, &bh, 1);
828         if (status < 0) {
829                 mutex_unlock(&alloc_inode->i_mutex);
830                 iput(alloc_inode);
831
832                 mlog_errno(status);
833                 return status;
834         }
835
836         ac->ac_inode = alloc_inode;
837         ac->ac_alloc_slot = slot;
838
839         fe = (struct ocfs2_dinode *) bh->b_data;
840
841         /* The bh was validated by the inode read inside
842          * ocfs2_inode_lock().  Any corruption is a code bug. */
843         BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
844
845         if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
846                 ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
847                             (unsigned long long)le64_to_cpu(fe->i_blkno));
848                 status = -EIO;
849                 goto bail;
850         }
851
852         free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
853                 le32_to_cpu(fe->id1.bitmap1.i_used);
854
855         if (bits_wanted > free_bits) {
856                 /* cluster bitmap never grows */
857                 if (ocfs2_is_cluster_bitmap(alloc_inode)) {
858                         mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
859                              bits_wanted, free_bits);
860                         status = -ENOSPC;
861                         goto bail;
862                 }
863
864                 if (!(flags & ALLOC_NEW_GROUP)) {
865                         mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, "
866                              "and we don't alloc a new group for it.\n",
867                              slot, bits_wanted, free_bits);
868                         status = -ENOSPC;
869                         goto bail;
870                 }
871
872                 status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
873                                                  ac->ac_max_block,
874                                                  last_alloc_group, flags);
875                 if (status < 0) {
876                         if (status != -ENOSPC)
877                                 mlog_errno(status);
878                         goto bail;
879                 }
880                 atomic_inc(&osb->alloc_stats.bg_extends);
881
882                 /* You should never ask for this much metadata */
883                 BUG_ON(bits_wanted >
884                        (le32_to_cpu(fe->id1.bitmap1.i_total)
885                         - le32_to_cpu(fe->id1.bitmap1.i_used)));
886         }
887
888         get_bh(bh);
889         ac->ac_bh = bh;
890 bail:
891         brelse(bh);
892
893         mlog_exit(status);
894         return status;
895 }
896
897 static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
898 {
899         spin_lock(&osb->osb_lock);
900         osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
901         spin_unlock(&osb->osb_lock);
902         atomic_set(&osb->s_num_inodes_stolen, 0);
903 }
904
905 static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
906 {
907         spin_lock(&osb->osb_lock);
908         osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
909         spin_unlock(&osb->osb_lock);
910         atomic_set(&osb->s_num_meta_stolen, 0);
911 }
912
/*
 * Reset the steal state of both suballocator kinds: first the inode
 * allocator, then the metadata (extent) allocator.
 */
void ocfs2_init_steal_slots(struct ocfs2_super *osb)
{
	ocfs2_init_inode_steal_slot(osb);
	ocfs2_init_meta_steal_slot(osb);
}
918
919 static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
920 {
921         spin_lock(&osb->osb_lock);
922         if (type == INODE_ALLOC_SYSTEM_INODE)
923                 osb->s_inode_steal_slot = slot;
924         else if (type == EXTENT_ALLOC_SYSTEM_INODE)
925                 osb->s_meta_steal_slot = slot;
926         spin_unlock(&osb->osb_lock);
927 }
928
929 static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type)
930 {
931         int slot = OCFS2_INVALID_SLOT;
932
933         spin_lock(&osb->osb_lock);
934         if (type == INODE_ALLOC_SYSTEM_INODE)
935                 slot = osb->s_inode_steal_slot;
936         else if (type == EXTENT_ALLOC_SYSTEM_INODE)
937                 slot = osb->s_meta_steal_slot;
938         spin_unlock(&osb->osb_lock);
939
940         return slot;
941 }
942
943 static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
944 {
945         return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE);
946 }
947
948 static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb)
949 {
950         return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE);
951 }
952
953 static int ocfs2_steal_resource(struct ocfs2_super *osb,
954                                 struct ocfs2_alloc_context *ac,
955                                 int type)
956 {
957         int i, status = -ENOSPC;
958         int slot = __ocfs2_get_steal_slot(osb, type);
959
960         /* Start to steal resource from the first slot after ours. */
961         if (slot == OCFS2_INVALID_SLOT)
962                 slot = osb->slot_num + 1;
963
964         for (i = 0; i < osb->max_slots; i++, slot++) {
965                 if (slot == osb->max_slots)
966                         slot = 0;
967
968                 if (slot == osb->slot_num)
969                         continue;
970
971                 status = ocfs2_reserve_suballoc_bits(osb, ac,
972                                                      type,
973                                                      (u32)slot, NULL,
974                                                      NOT_ALLOC_NEW_GROUP);
975                 if (status >= 0) {
976                         __ocfs2_set_steal_slot(osb, slot, type);
977                         break;
978                 }
979
980                 ocfs2_free_ac_resource(ac);
981         }
982
983         return status;
984 }
985
986 static int ocfs2_steal_inode(struct ocfs2_super *osb,
987                              struct ocfs2_alloc_context *ac)
988 {
989         return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE);
990 }
991
992 static int ocfs2_steal_meta(struct ocfs2_super *osb,
993                             struct ocfs2_alloc_context *ac)
994 {
995         return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE);
996 }
997
998 int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
999                                       int blocks,
1000                                       struct ocfs2_alloc_context **ac)
1001 {
1002         int status;
1003         int slot = ocfs2_get_meta_steal_slot(osb);
1004
1005         *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
1006         if (!(*ac)) {
1007                 status = -ENOMEM;
1008                 mlog_errno(status);
1009                 goto bail;
1010         }
1011
1012         (*ac)->ac_bits_wanted = blocks;
1013         (*ac)->ac_which = OCFS2_AC_USE_META;
1014         (*ac)->ac_group_search = ocfs2_block_group_search;
1015
1016         if (slot != OCFS2_INVALID_SLOT &&
1017                 atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL)
1018                 goto extent_steal;
1019
1020         atomic_set(&osb->s_num_meta_stolen, 0);
1021         status = ocfs2_reserve_suballoc_bits(osb, (*ac),
1022                                              EXTENT_ALLOC_SYSTEM_INODE,
1023                                              (u32)osb->slot_num, NULL,
1024                                              ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP);
1025
1026
1027         if (status >= 0) {
1028                 status = 0;
1029                 if (slot != OCFS2_INVALID_SLOT)
1030                         ocfs2_init_meta_steal_slot(osb);
1031                 goto bail;
1032         } else if (status < 0 && status != -ENOSPC) {
1033                 mlog_errno(status);
1034                 goto bail;
1035         }
1036
1037         ocfs2_free_ac_resource(*ac);
1038
1039 extent_steal:
1040         status = ocfs2_steal_meta(osb, *ac);
1041         atomic_inc(&osb->s_num_meta_stolen);
1042         if (status < 0) {
1043                 if (status != -ENOSPC)
1044                         mlog_errno(status);
1045                 goto bail;
1046         }
1047
1048         status = 0;
1049 bail:
1050         if ((status < 0) && *ac) {
1051                 ocfs2_free_alloc_context(*ac);
1052                 *ac = NULL;
1053         }
1054
1055         mlog_exit(status);
1056         return status;
1057 }
1058
/*
 * Reserve as many metadata blocks as ocfs2_extend_meta_needed() reports
 * for the extent list root_el.  See ocfs2_reserve_new_metadata_blocks()
 * for the return value and the handling of *ac.
 */
int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
			       struct ocfs2_extent_list *root_el,
			       struct ocfs2_alloc_context **ac)
{
	int blocks = ocfs2_extend_meta_needed(root_el);

	return ocfs2_reserve_new_metadata_blocks(osb, blocks, ac);
}
1067
1068 int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
1069                             struct ocfs2_alloc_context **ac)
1070 {
1071         int status;
1072         int slot = ocfs2_get_inode_steal_slot(osb);
1073         u64 alloc_group;
1074
1075         *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
1076         if (!(*ac)) {
1077                 status = -ENOMEM;
1078                 mlog_errno(status);
1079                 goto bail;
1080         }
1081
1082         (*ac)->ac_bits_wanted = 1;
1083         (*ac)->ac_which = OCFS2_AC_USE_INODE;
1084
1085         (*ac)->ac_group_search = ocfs2_block_group_search;
1086
1087         /*
1088          * stat(2) can't handle i_ino > 32bits, so we tell the
1089          * lower levels not to allocate us a block group past that
1090          * limit.  The 'inode64' mount option avoids this behavior.
1091          */
1092         if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
1093                 (*ac)->ac_max_block = (u32)~0U;
1094
1095         /*
1096          * slot is set when we successfully steal inode from other nodes.
1097          * It is reset in 3 places:
1098          * 1. when we flush the truncate log
1099          * 2. when we complete local alloc recovery.
1100          * 3. when we successfully allocate from our own slot.
1101          * After it is set, we will go on stealing inodes until we find the
1102          * need to check our slots to see whether there is some space for us.
1103          */
1104         if (slot != OCFS2_INVALID_SLOT &&
1105             atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
1106                 goto inode_steal;
1107
1108         atomic_set(&osb->s_num_inodes_stolen, 0);
1109         alloc_group = osb->osb_inode_alloc_group;
1110         status = ocfs2_reserve_suballoc_bits(osb, *ac,
1111                                              INODE_ALLOC_SYSTEM_INODE,
1112                                              (u32)osb->slot_num,
1113                                              &alloc_group,
1114                                              ALLOC_NEW_GROUP |
1115                                              ALLOC_GROUPS_FROM_GLOBAL);
1116         if (status >= 0) {
1117                 status = 0;
1118
1119                 spin_lock(&osb->osb_lock);
1120                 osb->osb_inode_alloc_group = alloc_group;
1121                 spin_unlock(&osb->osb_lock);
1122                 mlog(0, "after reservation, new allocation group is "
1123                      "%llu\n", (unsigned long long)alloc_group);
1124
1125                 /*
1126                  * Some inodes must be freed by us, so try to allocate
1127                  * from our own next time.
1128                  */
1129                 if (slot != OCFS2_INVALID_SLOT)
1130                         ocfs2_init_inode_steal_slot(osb);
1131                 goto bail;
1132         } else if (status < 0 && status != -ENOSPC) {
1133                 mlog_errno(status);
1134                 goto bail;
1135         }
1136
1137         ocfs2_free_ac_resource(*ac);
1138
1139 inode_steal:
1140         status = ocfs2_steal_inode(osb, *ac);
1141         atomic_inc(&osb->s_num_inodes_stolen);
1142         if (status < 0) {
1143                 if (status != -ENOSPC)
1144                         mlog_errno(status);
1145                 goto bail;
1146         }
1147
1148         status = 0;
1149 bail:
1150         if ((status < 0) && *ac) {
1151                 ocfs2_free_alloc_context(*ac);
1152                 *ac = NULL;
1153         }
1154
1155         mlog_exit(status);
1156         return status;
1157 }
1158
1159 /* local alloc code has to do the same thing, so rather than do this
1160  * twice.. */
1161 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
1162                                       struct ocfs2_alloc_context *ac)
1163 {
1164         int status;
1165
1166         ac->ac_which = OCFS2_AC_USE_MAIN;
1167         ac->ac_group_search = ocfs2_cluster_group_search;
1168
1169         status = ocfs2_reserve_suballoc_bits(osb, ac,
1170                                              GLOBAL_BITMAP_SYSTEM_INODE,
1171                                              OCFS2_INVALID_SLOT, NULL,
1172                                              ALLOC_NEW_GROUP);
1173         if (status < 0 && status != -ENOSPC) {
1174                 mlog_errno(status);
1175                 goto bail;
1176         }
1177
1178 bail:
1179         return status;
1180 }
1181
1182 /* Callers don't need to care which bitmap (local alloc or main) to
1183  * use so we figure it out for them, but unfortunately this clutters
1184  * things a bit. */
1185 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
1186                                              u32 bits_wanted, u64 max_block,
1187                                              int flags,
1188                                              struct ocfs2_alloc_context **ac)
1189 {
1190         int status;
1191
1192         mlog_entry_void();
1193
1194         *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
1195         if (!(*ac)) {
1196                 status = -ENOMEM;
1197                 mlog_errno(status);
1198                 goto bail;
1199         }
1200
1201         (*ac)->ac_bits_wanted = bits_wanted;
1202         (*ac)->ac_max_block = max_block;
1203
1204         status = -ENOSPC;
1205         if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
1206             ocfs2_alloc_should_use_local(osb, bits_wanted)) {
1207                 status = ocfs2_reserve_local_alloc_bits(osb,
1208                                                         bits_wanted,
1209                                                         *ac);
1210                 if ((status < 0) && (status != -ENOSPC)) {
1211                         mlog_errno(status);
1212                         goto bail;
1213                 }
1214         }
1215
1216         if (status == -ENOSPC) {
1217                 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
1218                 if (status < 0) {
1219                         if (status != -ENOSPC)
1220                                 mlog_errno(status);
1221                         goto bail;
1222                 }
1223         }
1224
1225         status = 0;
1226 bail:
1227         if ((status < 0) && *ac) {
1228                 ocfs2_free_alloc_context(*ac);
1229                 *ac = NULL;
1230         }
1231
1232         mlog_exit(status);
1233         return status;
1234 }
1235
1236 int ocfs2_reserve_clusters(struct ocfs2_super *osb,
1237                            u32 bits_wanted,
1238                            struct ocfs2_alloc_context **ac)
1239 {
1240         return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
1241                                                  ALLOC_NEW_GROUP, ac);
1242 }
1243
1244 /*
1245  * More or less lifted from ext3. I'll leave their description below:
1246  *
1247  * "For ext3 allocations, we must not reuse any blocks which are
1248  * allocated in the bitmap buffer's "last committed data" copy.  This
1249  * prevents deletes from freeing up the page for reuse until we have
1250  * committed the delete transaction.
1251  *
1252  * If we didn't do this, then deleting something and reallocating it as
1253  * data would allow the old block to be overwritten before the
1254  * transaction committed (because we force data to disk before commit).
1255  * This would lead to corruption if we crashed between overwriting the
1256  * data and committing the delete.
1257  *
1258  * @@@ We may want to make this allocation behaviour conditional on
1259  * data-writes at some point, and disable it for metadata allocations or
1260  * sync-data inodes."
1261  *
1262  * Note: OCFS2 already does this differently for metadata vs data
1263  * allocations, as those bitmaps are separate and undo access is never
1264  * called on a metadata group descriptor.
1265  */
1266 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
1267                                          int nr)
1268 {
1269         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1270         int ret;
1271
1272         if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
1273                 return 0;
1274
1275         if (!buffer_jbd(bg_bh))
1276                 return 1;
1277
1278         jbd_lock_bh_state(bg_bh);
1279         bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
1280         if (bg)
1281                 ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
1282         else
1283                 ret = 1;
1284         jbd_unlock_bh_state(bg_bh);
1285
1286         return ret;
1287 }
1288
1289 static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
1290                                              struct buffer_head *bg_bh,
1291                                              unsigned int bits_wanted,
1292                                              unsigned int total_bits,
1293                                              struct ocfs2_suballoc_result *res)
1294 {
1295         void *bitmap;
1296         u16 best_offset, best_size;
1297         int offset, start, found, status = 0;
1298         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1299
1300         /* Callers got this descriptor from
1301          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1302         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1303
1304         found = start = best_offset = best_size = 0;
1305         bitmap = bg->bg_bitmap;
1306
1307         while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) {
1308                 if (offset == total_bits)
1309                         break;
1310
1311                 if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
1312                         /* We found a zero, but we can't use it as it
1313                          * hasn't been put to disk yet! */
1314                         found = 0;
1315                         start = offset + 1;
1316                 } else if (offset == start) {
1317                         /* we found a zero */
1318                         found++;
1319                         /* move start to the next bit to test */
1320                         start++;
1321                 } else {
1322                         /* got a zero after some ones */
1323                         found = 1;
1324                         start = offset + 1;
1325                 }
1326                 if (found > best_size) {
1327                         best_size = found;
1328                         best_offset = start - found;
1329                 }
1330                 /* we got everything we needed */
1331                 if (found == bits_wanted) {
1332                         /* mlog(0, "Found it all!\n"); */
1333                         break;
1334                 }
1335         }
1336
1337         if (best_size) {
1338                 res->sr_bit_offset = best_offset;
1339                 res->sr_bits = best_size;
1340         } else {
1341                 status = -ENOSPC;
1342                 /* No error log here -- see the comment above
1343                  * ocfs2_test_bg_bit_allocatable */
1344         }
1345
1346         return status;
1347 }
1348
1349 static inline int ocfs2_block_group_set_bits(handle_t *handle,
1350                                              struct inode *alloc_inode,
1351                                              struct ocfs2_group_desc *bg,
1352                                              struct buffer_head *group_bh,
1353                                              unsigned int bit_off,
1354                                              unsigned int num_bits)
1355 {
1356         int status;
1357         void *bitmap = bg->bg_bitmap;
1358         int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1359
1360         mlog_entry_void();
1361
1362         /* All callers get the descriptor via
1363          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1364         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1365         BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
1366
1367         mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
1368              num_bits);
1369
1370         if (ocfs2_is_cluster_bitmap(alloc_inode))
1371                 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1372
1373         status = ocfs2_journal_access_gd(handle,
1374                                          INODE_CACHE(alloc_inode),
1375                                          group_bh,
1376                                          journal_type);
1377         if (status < 0) {
1378                 mlog_errno(status);
1379                 goto bail;
1380         }
1381
1382         le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
1383         if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
1384                 ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
1385                             " count %u but claims %u are freed. num_bits %d",
1386                             (unsigned long long)le64_to_cpu(bg->bg_blkno),
1387                             le16_to_cpu(bg->bg_bits),
1388                             le16_to_cpu(bg->bg_free_bits_count), num_bits);
1389                 return -EROFS;
1390         }
1391         while(num_bits--)
1392                 ocfs2_set_bit(bit_off++, bitmap);
1393
1394         ocfs2_journal_dirty(handle, group_bh);
1395
1396 bail:
1397         mlog_exit(status);
1398         return status;
1399 }
1400
1401 /* find the one with the most empty bits */
1402 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
1403 {
1404         u16 curr, best;
1405
1406         BUG_ON(!cl->cl_next_free_rec);
1407
1408         best = curr = 0;
1409         while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
1410                 if (le32_to_cpu(cl->cl_recs[curr].c_free) >
1411                     le32_to_cpu(cl->cl_recs[best].c_free))
1412                         best = curr;
1413                 curr++;
1414         }
1415
1416         BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
1417         return best;
1418 }
1419
1420 static int ocfs2_relink_block_group(handle_t *handle,
1421                                     struct inode *alloc_inode,
1422                                     struct buffer_head *fe_bh,
1423                                     struct buffer_head *bg_bh,
1424                                     struct buffer_head *prev_bg_bh,
1425                                     u16 chain)
1426 {
1427         int status;
1428         /* there is a really tiny chance the journal calls could fail,
1429          * but we wouldn't want inconsistent blocks in *any* case. */
1430         u64 fe_ptr, bg_ptr, prev_bg_ptr;
1431         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
1432         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1433         struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
1434
1435         /* The caller got these descriptors from
1436          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1437         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1438         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
1439
1440         mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
1441              (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
1442              (unsigned long long)le64_to_cpu(bg->bg_blkno),
1443              (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
1444
1445         fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
1446         bg_ptr = le64_to_cpu(bg->bg_next_group);
1447         prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
1448
1449         status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1450                                          prev_bg_bh,
1451                                          OCFS2_JOURNAL_ACCESS_WRITE);
1452         if (status < 0) {
1453                 mlog_errno(status);
1454                 goto out_rollback;
1455         }
1456
1457         prev_bg->bg_next_group = bg->bg_next_group;
1458         ocfs2_journal_dirty(handle, prev_bg_bh);
1459
1460         status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1461                                          bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1462         if (status < 0) {
1463                 mlog_errno(status);
1464                 goto out_rollback;
1465         }
1466
1467         bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
1468         ocfs2_journal_dirty(handle, bg_bh);
1469
1470         status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
1471                                          fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1472         if (status < 0) {
1473                 mlog_errno(status);
1474                 goto out_rollback;
1475         }
1476
1477         fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
1478         ocfs2_journal_dirty(handle, fe_bh);
1479
1480 out_rollback:
1481         if (status < 0) {
1482                 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
1483                 bg->bg_next_group = cpu_to_le64(bg_ptr);
1484                 prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
1485         }
1486
1487         mlog_exit(status);
1488         return status;
1489 }
1490
1491 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
1492                                                      u32 wanted)
1493 {
1494         return le16_to_cpu(bg->bg_free_bits_count) > wanted;
1495 }
1496
1497 /* return 0 on success, -ENOSPC to keep searching and any other < 0
1498  * value on error. */
1499 static int ocfs2_cluster_group_search(struct inode *inode,
1500                                       struct buffer_head *group_bh,
1501                                       u32 bits_wanted, u32 min_bits,
1502                                       u64 max_block,
1503                                       struct ocfs2_suballoc_result *res)
1504 {
1505         int search = -ENOSPC;
1506         int ret;
1507         u64 blkoff;
1508         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
1509         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1510         unsigned int max_bits, gd_cluster_off;
1511
1512         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1513
1514         if (gd->bg_free_bits_count) {
1515                 max_bits = le16_to_cpu(gd->bg_bits);
1516
1517                 /* Tail groups in cluster bitmaps which aren't cpg
1518                  * aligned are prone to partial extention by a failed
1519                  * fs resize. If the file system resize never got to
1520                  * update the dinode cluster count, then we don't want
1521                  * to trust any clusters past it, regardless of what
1522                  * the group descriptor says. */
1523                 gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
1524                                                           le64_to_cpu(gd->bg_blkno));
1525                 if ((gd_cluster_off + max_bits) >
1526                     OCFS2_I(inode)->ip_clusters) {
1527                         max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
1528                         mlog(0, "Desc %llu, bg_bits %u, clusters %u, use %u\n",
1529                              (unsigned long long)le64_to_cpu(gd->bg_blkno),
1530                              le16_to_cpu(gd->bg_bits),
1531                              OCFS2_I(inode)->ip_clusters, max_bits);
1532                 }
1533
1534                 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1535                                                         group_bh, bits_wanted,
1536                                                         max_bits, res);
1537                 if (ret)
1538                         return ret;
1539
1540                 if (max_block) {
1541                         blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
1542                                                           gd_cluster_off +
1543                                                           res->sr_bit_offset +
1544                                                           res->sr_bits);
1545                         mlog(0, "Checking %llu against %llu\n",
1546                              (unsigned long long)blkoff,
1547                              (unsigned long long)max_block);
1548                         if (blkoff > max_block)
1549                                 return -ENOSPC;
1550                 }
1551
1552                 /* ocfs2_block_group_find_clear_bits() might
1553                  * return success, but we still want to return
1554                  * -ENOSPC unless it found the minimum number
1555                  * of bits. */
1556                 if (min_bits <= res->sr_bits)
1557                         search = 0; /* success */
1558                 else if (res->sr_bits) {
1559                         /*
1560                          * Don't show bits which we'll be returning
1561                          * for allocation to the local alloc bitmap.
1562                          */
1563                         ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits);
1564                 }
1565         }
1566
1567         return search;
1568 }
1569
1570 static int ocfs2_block_group_search(struct inode *inode,
1571                                     struct buffer_head *group_bh,
1572                                     u32 bits_wanted, u32 min_bits,
1573                                     u64 max_block,
1574                                     struct ocfs2_suballoc_result *res)
1575 {
1576         int ret = -ENOSPC;
1577         u64 blkoff;
1578         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
1579
1580         BUG_ON(min_bits != 1);
1581         BUG_ON(ocfs2_is_cluster_bitmap(inode));
1582
1583         if (bg->bg_free_bits_count) {
1584                 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1585                                                         group_bh, bits_wanted,
1586                                                         le16_to_cpu(bg->bg_bits),
1587                                                         res);
1588                 if (!ret && max_block) {
1589                         blkoff = le64_to_cpu(bg->bg_blkno) +
1590                                 res->sr_bit_offset + res->sr_bits;
1591                         mlog(0, "Checking %llu against %llu\n",
1592                              (unsigned long long)blkoff,
1593                              (unsigned long long)max_block);
1594                         if (blkoff > max_block)
1595                                 ret = -ENOSPC;
1596                 }
1597         }
1598
1599         return ret;
1600 }
1601
1602 static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1603                                        handle_t *handle,
1604                                        struct buffer_head *di_bh,
1605                                        u32 num_bits,
1606                                        u16 chain)
1607 {
1608         int ret;
1609         u32 tmp_used;
1610         struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1611         struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
1612
1613         ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
1614                                       OCFS2_JOURNAL_ACCESS_WRITE);
1615         if (ret < 0) {
1616                 mlog_errno(ret);
1617                 goto out;
1618         }
1619
1620         tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1621         di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
1622         le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
1623         ocfs2_journal_dirty(handle, di_bh);
1624
1625 out:
1626         return ret;
1627 }
1628
1629 static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
1630                                          struct ocfs2_extent_rec *rec,
1631                                          struct ocfs2_chain_list *cl)
1632 {
1633         unsigned int bpc = le16_to_cpu(cl->cl_bpc);
1634         unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc;
1635         unsigned int bitcount = le32_to_cpu(rec->e_leaf_clusters) * bpc;
1636
1637         if (res->sr_bit_offset < bitoff)
1638                 return 0;
1639         if (res->sr_bit_offset >= (bitoff + bitcount))
1640                 return 0;
1641         res->sr_blkno = le64_to_cpu(rec->e_blkno) +
1642                 (res->sr_bit_offset - bitoff);
1643         if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount))
1644                 res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset;
1645         return 1;
1646 }
1647
1648 static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac,
1649                                           struct ocfs2_group_desc *bg,
1650                                           struct ocfs2_suballoc_result *res)
1651 {
1652         int i;
1653         u64 bg_blkno = res->sr_bg_blkno;  /* Save off */
1654         struct ocfs2_extent_rec *rec;
1655         struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1656         struct ocfs2_chain_list *cl = &di->id2.i_chain;
1657
1658         if (ocfs2_is_cluster_bitmap(ac->ac_inode)) {
1659                 res->sr_blkno = 0;
1660                 return;
1661         }
1662
1663         res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset;
1664         res->sr_bg_blkno = 0;  /* Clear it for contig block groups */
1665         if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) ||
1666             !bg->bg_list.l_next_free_rec)
1667                 return;
1668
1669         for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) {
1670                 rec = &bg->bg_list.l_recs[i];
1671                 if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) {
1672                         res->sr_bg_blkno = bg_blkno;  /* Restore */
1673                         break;
1674                 }
1675         }
1676 }
1677
1678 static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1679                                   handle_t *handle,
1680                                   u32 bits_wanted,
1681                                   u32 min_bits,
1682                                   struct ocfs2_suballoc_result *res,
1683                                   u16 *bits_left)
1684 {
1685         int ret;
1686         struct buffer_head *group_bh = NULL;
1687         struct ocfs2_group_desc *gd;
1688         struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1689         struct inode *alloc_inode = ac->ac_inode;
1690
1691         ret = ocfs2_read_group_descriptor(alloc_inode, di,
1692                                           res->sr_bg_blkno, &group_bh);
1693         if (ret < 0) {
1694                 mlog_errno(ret);
1695                 return ret;
1696         }
1697
1698         gd = (struct ocfs2_group_desc *) group_bh->b_data;
1699         ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1700                                   ac->ac_max_block, res);
1701         if (ret < 0) {
1702                 if (ret != -ENOSPC)
1703                         mlog_errno(ret);
1704                 goto out;
1705         }
1706
1707         if (!ret)
1708                 ocfs2_bg_discontig_fix_result(ac, gd, res);
1709
1710         /*
1711          * sr_bg_blkno might have been changed by
1712          * ocfs2_bg_discontig_fix_result
1713          */
1714         res->sr_bg_stable_blkno = group_bh->b_blocknr;
1715
1716         if (ac->ac_find_loc_only)
1717                 goto out_loc_only;
1718
1719         ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1720                                                res->sr_bits,
1721                                                le16_to_cpu(gd->bg_chain));
1722         if (ret < 0) {
1723                 mlog_errno(ret);
1724                 goto out;
1725         }
1726
1727         ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1728                                          res->sr_bit_offset, res->sr_bits);
1729         if (ret < 0)
1730                 mlog_errno(ret);
1731
1732 out_loc_only:
1733         *bits_left = le16_to_cpu(gd->bg_free_bits_count);
1734
1735 out:
1736         brelse(group_bh);
1737
1738         return ret;
1739 }
1740
1741 static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1742                               handle_t *handle,
1743                               u32 bits_wanted,
1744                               u32 min_bits,
1745                               struct ocfs2_suballoc_result *res,
1746                               u16 *bits_left)
1747 {
1748         int status;
1749         u16 chain;
1750         u64 next_group;
1751         struct inode *alloc_inode = ac->ac_inode;
1752         struct buffer_head *group_bh = NULL;
1753         struct buffer_head *prev_group_bh = NULL;
1754         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1755         struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1756         struct ocfs2_group_desc *bg;
1757
1758         chain = ac->ac_chain;
1759         mlog(0, "trying to alloc %u bits from chain %u, inode %llu\n",
1760              bits_wanted, chain,
1761              (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
1762
1763         status = ocfs2_read_group_descriptor(alloc_inode, fe,
1764                                              le64_to_cpu(cl->cl_recs[chain].c_blkno),
1765                                              &group_bh);
1766         if (status < 0) {
1767                 mlog_errno(status);
1768                 goto bail;
1769         }
1770         bg = (struct ocfs2_group_desc *) group_bh->b_data;
1771
1772         status = -ENOSPC;
1773         /* for now, the chain search is a bit simplistic. We just use
1774          * the 1st group with any empty bits. */
1775         while ((status = ac->ac_group_search(alloc_inode, group_bh,
1776                                              bits_wanted, min_bits,
1777                                              ac->ac_max_block,
1778                                              res)) == -ENOSPC) {
1779                 if (!bg->bg_next_group)
1780                         break;
1781
1782                 brelse(prev_group_bh);
1783                 prev_group_bh = NULL;
1784
1785                 next_group = le64_to_cpu(bg->bg_next_group);
1786                 prev_group_bh = group_bh;
1787                 group_bh = NULL;
1788                 status = ocfs2_read_group_descriptor(alloc_inode, fe,
1789                                                      next_group, &group_bh);
1790                 if (status < 0) {
1791                         mlog_errno(status);
1792                         goto bail;
1793                 }
1794                 bg = (struct ocfs2_group_desc *) group_bh->b_data;
1795         }
1796         if (status < 0) {
1797                 if (status != -ENOSPC)
1798                         mlog_errno(status);
1799                 goto bail;
1800         }
1801
1802         mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
1803              res->sr_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
1804
1805         res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);
1806
1807         BUG_ON(res->sr_bits == 0);
1808         if (!status)
1809                 ocfs2_bg_discontig_fix_result(ac, bg, res);
1810
1811         /*
1812          * sr_bg_blkno might have been changed by
1813          * ocfs2_bg_discontig_fix_result
1814          */
1815         res->sr_bg_stable_blkno = group_bh->b_blocknr;
1816
1817         /*
1818          * Keep track of previous block descriptor read. When
1819          * we find a target, if we have read more than X
1820          * number of descriptors, and the target is reasonably
1821          * empty, relink him to top of his chain.
1822          *
1823          * We've read 0 extra blocks and only send one more to
1824          * the transaction, yet the next guy to search has a
1825          * much easier time.
1826          *
1827          * Do this *after* figuring out how many bits we're taking out
1828          * of our target group.
1829          */
1830         if (ac->ac_allow_chain_relink &&
1831             (prev_group_bh) &&
1832             (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
1833                 status = ocfs2_relink_block_group(handle, alloc_inode,
1834                                                   ac->ac_bh, group_bh,
1835                                                   prev_group_bh, chain);
1836                 if (status < 0) {
1837                         mlog_errno(status);
1838                         goto bail;
1839                 }
1840         }
1841
1842         if (ac->ac_find_loc_only)
1843                 goto out_loc_only;
1844
1845         status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle,
1846                                                   ac->ac_bh, res->sr_bits,
1847                                                   chain);
1848         if (status) {
1849                 mlog_errno(status);
1850                 goto bail;
1851         }
1852
1853         status = ocfs2_block_group_set_bits(handle,
1854                                             alloc_inode,
1855                                             bg,
1856                                             group_bh,
1857                                             res->sr_bit_offset,
1858                                             res->sr_bits);
1859         if (status < 0) {
1860                 mlog_errno(status);
1861                 goto bail;
1862         }
1863
1864         mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
1865              (unsigned long long)le64_to_cpu(fe->i_blkno));
1866
1867 out_loc_only:
1868         *bits_left = le16_to_cpu(bg->bg_free_bits_count);
1869 bail:
1870         brelse(group_bh);
1871         brelse(prev_group_bh);
1872
1873         mlog_exit(status);
1874         return status;
1875 }
1876
1877 /* will give out up to bits_wanted contiguous bits. */
1878 static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1879                                      handle_t *handle,
1880                                      u32 bits_wanted,
1881                                      u32 min_bits,
1882                                      struct ocfs2_suballoc_result *res)
1883 {
1884         int status;
1885         u16 victim, i;
1886         u16 bits_left = 0;
1887         u64 hint = ac->ac_last_group;
1888         struct ocfs2_chain_list *cl;
1889         struct ocfs2_dinode *fe;
1890
1891         mlog_entry_void();
1892
1893         BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1894         BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
1895         BUG_ON(!ac->ac_bh);
1896
1897         fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1898
1899         /* The bh was validated by the inode read during
1900          * ocfs2_reserve_suballoc_bits().  Any corruption is a code bug. */
1901         BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
1902
1903         if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1904             le32_to_cpu(fe->id1.bitmap1.i_total)) {
1905                 ocfs2_error(ac->ac_inode->i_sb,
1906                             "Chain allocator dinode %llu has %u used "
1907                             "bits but only %u total.",
1908                             (unsigned long long)le64_to_cpu(fe->i_blkno),
1909                             le32_to_cpu(fe->id1.bitmap1.i_used),
1910                             le32_to_cpu(fe->id1.bitmap1.i_total));
1911                 status = -EIO;
1912                 goto bail;
1913         }
1914
1915         res->sr_bg_blkno = hint;
1916         if (res->sr_bg_blkno) {
1917                 /* Attempt to short-circuit the usual search mechanism
1918                  * by jumping straight to the most recently used
1919                  * allocation group. This helps us mantain some
1920                  * contiguousness across allocations. */
1921                 status = ocfs2_search_one_group(ac, handle, bits_wanted,
1922                                                 min_bits, res, &bits_left);
1923                 if (!status)
1924                         goto set_hint;
1925                 if (status < 0 && status != -ENOSPC) {
1926                         mlog_errno(status);
1927                         goto bail;
1928                 }
1929         }
1930
1931         cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1932
1933         victim = ocfs2_find_victim_chain(cl);
1934         ac->ac_chain = victim;
1935         ac->ac_allow_chain_relink = 1;
1936
1937         status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1938                                     res, &bits_left);
1939         if (!status) {
1940                 hint = ocfs2_group_from_res(res);
1941                 goto set_hint;
1942         }
1943         if (status < 0 && status != -ENOSPC) {
1944                 mlog_errno(status);
1945                 goto bail;
1946         }
1947
1948         mlog(0, "Search of victim chain %u came up with nothing, "
1949              "trying all chains now.\n", victim);
1950
1951         /* If we didn't pick a good victim, then just default to
1952          * searching each chain in order. Don't allow chain relinking
1953          * because we only calculate enough journal credits for one
1954          * relink per alloc. */
1955         ac->ac_allow_chain_relink = 0;
1956         for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
1957                 if (i == victim)
1958                         continue;
1959                 if (!cl->cl_recs[i].c_free)
1960                         continue;
1961
1962                 ac->ac_chain = i;
1963                 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1964                                             res, &bits_left);
1965                 if (!status) {
1966                         hint = ocfs2_group_from_res(res);
1967                         break;
1968                 }
1969                 if (status < 0 && status != -ENOSPC) {
1970                         mlog_errno(status);
1971                         goto bail;
1972                 }
1973         }
1974
1975 set_hint:
1976         if (status != -ENOSPC) {
1977                 /* If the next search of this group is not likely to
1978                  * yield a suitable extent, then we reset the last
1979                  * group hint so as to not waste a disk read */
1980                 if (bits_left < min_bits)
1981                         ac->ac_last_group = 0;
1982                 else
1983                         ac->ac_last_group = hint;
1984         }
1985
1986 bail:
1987         mlog_exit(status);
1988         return status;
1989 }
1990
1991 int ocfs2_claim_metadata(handle_t *handle,
1992                          struct ocfs2_alloc_context *ac,
1993                          u32 bits_wanted,
1994                          u64 *suballoc_loc,
1995                          u16 *suballoc_bit_start,
1996                          unsigned int *num_bits,
1997                          u64 *blkno_start)
1998 {
1999         int status;
2000         struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
2001
2002         BUG_ON(!ac);
2003         BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
2004         BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
2005
2006         status = ocfs2_claim_suballoc_bits(ac,
2007                                            handle,
2008                                            bits_wanted,
2009                                            1,
2010                                            &res);
2011         if (status < 0) {
2012                 mlog_errno(status);
2013                 goto bail;
2014         }
2015         atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2016
2017         *suballoc_loc = res.sr_bg_blkno;
2018         *suballoc_bit_start = res.sr_bit_offset;
2019         *blkno_start = res.sr_blkno;
2020         ac->ac_bits_given += res.sr_bits;
2021         *num_bits = res.sr_bits;
2022         status = 0;
2023 bail:
2024         mlog_exit(status);
2025         return status;
2026 }
2027
2028 static void ocfs2_init_inode_ac_group(struct inode *dir,
2029                                       struct buffer_head *parent_di_bh,
2030                                       struct ocfs2_alloc_context *ac)
2031 {
2032         struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data;
2033         /*
2034          * Try to allocate inodes from some specific group.
2035          *
2036          * If the parent dir has recorded the last group used in allocation,
2037          * cool, use it. Otherwise if we try to allocate new inode from the
2038          * same slot the parent dir belongs to, use the same chunk.
2039          *
2040          * We are very careful here to avoid the mistake of setting
2041          * ac_last_group to a group descriptor from a different (unlocked) slot.
2042          */
2043         if (OCFS2_I(dir)->ip_last_used_group &&
2044             OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
2045                 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
2046         else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) {
2047                 if (di->i_suballoc_loc)
2048                         ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc);
2049                 else
2050                         ac->ac_last_group = ocfs2_which_suballoc_group(
2051                                         le64_to_cpu(di->i_blkno),
2052                                         le16_to_cpu(di->i_suballoc_bit));
2053         }
2054 }
2055
2056 static inline void ocfs2_save_inode_ac_group(struct inode *dir,
2057                                              struct ocfs2_alloc_context *ac)
2058 {
2059         OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group;
2060         OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
2061 }
2062
2063 int ocfs2_find_new_inode_loc(struct inode *dir,
2064                              struct buffer_head *parent_fe_bh,
2065                              struct ocfs2_alloc_context *ac,
2066                              u64 *fe_blkno)
2067 {
2068         int ret;
2069         handle_t *handle = NULL;
2070         struct ocfs2_suballoc_result *res;
2071
2072         BUG_ON(!ac);
2073         BUG_ON(ac->ac_bits_given != 0);
2074         BUG_ON(ac->ac_bits_wanted != 1);
2075         BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
2076
2077         res = kzalloc(sizeof(*res), GFP_NOFS);
2078         if (res == NULL) {
2079                 ret = -ENOMEM;
2080                 mlog_errno(ret);
2081                 goto out;
2082         }
2083
2084         ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
2085
2086         /*
2087          * The handle started here is for chain relink. Alternatively,
2088          * we could just disable relink for these calls.
2089          */
2090         handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC);
2091         if (IS_ERR(handle)) {
2092                 ret = PTR_ERR(handle);
2093                 handle = NULL;
2094                 mlog_errno(ret);
2095                 goto out;
2096         }
2097
2098         /*
2099          * This will instruct ocfs2_claim_suballoc_bits and
2100          * ocfs2_search_one_group to search but save actual allocation
2101          * for later.
2102          */
2103         ac->ac_find_loc_only = 1;
2104
2105         ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res);
2106         if (ret < 0) {
2107                 mlog_errno(ret);
2108                 goto out;
2109         }
2110
2111         ac->ac_find_loc_priv = res;
2112         *fe_blkno = res->sr_blkno;
2113
2114 out:
2115         if (handle)
2116                 ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle);
2117
2118         if (ret)
2119                 kfree(res);
2120
2121         return ret;
2122 }
2123
2124 int ocfs2_claim_new_inode_at_loc(handle_t *handle,
2125                                  struct inode *dir,
2126                                  struct ocfs2_alloc_context *ac,
2127                                  u64 *suballoc_loc,
2128                                  u16 *suballoc_bit,
2129                                  u64 di_blkno)
2130 {
2131         int ret;
2132         u16 chain;
2133         struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv;
2134         struct buffer_head *bg_bh = NULL;
2135         struct ocfs2_group_desc *bg;
2136         struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data;
2137
2138         /*
2139          * Since di_blkno is being passed back in, we check for any
2140          * inconsistencies which may have happened between
2141          * calls. These are code bugs as di_blkno is not expected to
2142          * change once returned from ocfs2_find_new_inode_loc()
2143          */
2144         BUG_ON(res->sr_blkno != di_blkno);
2145
2146         ret = ocfs2_read_group_descriptor(ac->ac_inode, di,
2147                                           res->sr_bg_stable_blkno, &bg_bh);
2148         if (ret) {
2149                 mlog_errno(ret);
2150                 goto out;
2151         }
2152
2153         bg = (struct ocfs2_group_desc *) bg_bh->b_data;
2154         chain = le16_to_cpu(bg->bg_chain);
2155
2156         ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle,
2157                                                ac->ac_bh, res->sr_bits,
2158                                                chain);
2159         if (ret) {
2160                 mlog_errno(ret);
2161                 goto out;
2162         }
2163
2164         ret = ocfs2_block_group_set_bits(handle,
2165                                          ac->ac_inode,
2166                                          bg,
2167                                          bg_bh,
2168                                          res->sr_bit_offset,
2169                                          res->sr_bits);
2170         if (ret < 0) {
2171                 mlog_errno(ret);
2172                 goto out;
2173         }
2174
2175         mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
2176              (unsigned long long)di_blkno);
2177
2178         atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2179
2180         BUG_ON(res->sr_bits != 1);
2181
2182         *suballoc_loc = res->sr_bg_blkno;
2183         *suballoc_bit = res->sr_bit_offset;
2184         ac->ac_bits_given++;
2185         ocfs2_save_inode_ac_group(dir, ac);
2186
2187 out:
2188         brelse(bg_bh);
2189
2190         return ret;
2191 }
2192
2193 int ocfs2_claim_new_inode(handle_t *handle,
2194                           struct inode *dir,
2195                           struct buffer_head *parent_fe_bh,
2196                           struct ocfs2_alloc_context *ac,
2197                           u64 *suballoc_loc,
2198                           u16 *suballoc_bit,
2199                           u64 *fe_blkno)
2200 {
2201         int status;
2202         struct ocfs2_suballoc_result res;
2203
2204         mlog_entry_void();
2205
2206         BUG_ON(!ac);
2207         BUG_ON(ac->ac_bits_given != 0);
2208         BUG_ON(ac->ac_bits_wanted != 1);
2209         BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
2210
2211         ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
2212
2213         status = ocfs2_claim_suballoc_bits(ac,
2214                                            handle,
2215                                            1,
2216                                            1,
2217                                            &res);
2218         if (status < 0) {
2219                 mlog_errno(status);
2220                 goto bail;
2221         }
2222         atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2223
2224         BUG_ON(res.sr_bits != 1);
2225
2226         *suballoc_loc = res.sr_bg_blkno;
2227         *suballoc_bit = res.sr_bit_offset;
2228         *fe_blkno = res.sr_blkno;
2229         ac->ac_bits_given++;
2230         ocfs2_save_inode_ac_group(dir, ac);
2231         status = 0;
2232 bail:
2233         mlog_exit(status);
2234         return status;
2235 }
2236
2237 /* translate a group desc. blkno and it's bitmap offset into
2238  * disk cluster offset. */
2239 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
2240                                                    u64 bg_blkno,
2241                                                    u16 bg_bit_off)
2242 {
2243         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2244         u32 cluster = 0;
2245
2246         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2247
2248         if (bg_blkno != osb->first_cluster_group_blkno)
2249                 cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
2250         cluster += (u32) bg_bit_off;
2251         return cluster;
2252 }
2253
2254 /* given a cluster offset, calculate which block group it belongs to
2255  * and return that block offset. */
2256 u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
2257 {
2258         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2259         u32 group_no;
2260
2261         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2262
2263         group_no = cluster / osb->bitmap_cpg;
2264         if (!group_no)
2265                 return osb->first_cluster_group_blkno;
2266         return ocfs2_clusters_to_blocks(inode->i_sb,
2267                                         group_no * osb->bitmap_cpg);
2268 }
2269
2270 /* given the block number of a cluster start, calculate which cluster
2271  * group and descriptor bitmap offset that corresponds to. */
2272 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
2273                                                 u64 data_blkno,
2274                                                 u64 *bg_blkno,
2275                                                 u16 *bg_bit_off)
2276 {
2277         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2278         u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
2279
2280         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2281
2282         *bg_blkno = ocfs2_which_cluster_group(inode,
2283                                               data_cluster);
2284
2285         if (*bg_blkno == osb->first_cluster_group_blkno)
2286                 *bg_bit_off = (u16) data_cluster;
2287         else
2288                 *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
2289                                                              data_blkno - *bg_blkno);
2290 }
2291
2292 /*
2293  * min_bits - minimum contiguous chunk from this total allocation we
2294  * can handle. set to what we asked for originally for a full
2295  * contig. allocation, set to '1' to indicate we can deal with extents
2296  * of any size.
2297  */
2298 int __ocfs2_claim_clusters(handle_t *handle,
2299                            struct ocfs2_alloc_context *ac,
2300                            u32 min_clusters,
2301                            u32 max_clusters,
2302                            u32 *cluster_start,
2303                            u32 *num_clusters)
2304 {
2305         int status;
2306         unsigned int bits_wanted = max_clusters;
2307         struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
2308         struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);
2309
2310         mlog_entry_void();
2311
2312         BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
2313
2314         BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
2315                && ac->ac_which != OCFS2_AC_USE_MAIN);
2316
2317         if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
2318                 WARN_ON(min_clusters > 1);
2319
2320                 status = ocfs2_claim_local_alloc_bits(osb,
2321                                                       handle,
2322                                                       ac,
2323                                                       bits_wanted,
2324                                                       cluster_start,
2325                                                       num_clusters);
2326                 if (!status)
2327                         atomic_inc(&osb->alloc_stats.local_data);
2328         } else {
2329                 if (min_clusters > (osb->bitmap_cpg - 1)) {
2330                         /* The only paths asking for contiguousness
2331                          * should know about this already. */
2332                         mlog(ML_ERROR, "minimum allocation requested %u exceeds "
2333                              "group bitmap size %u!\n", min_clusters,
2334                              osb->bitmap_cpg);
2335                         status = -ENOSPC;
2336                         goto bail;
2337                 }
2338                 /* clamp the current request down to a realistic size. */
2339                 if (bits_wanted > (osb->bitmap_cpg - 1))
2340                         bits_wanted = osb->bitmap_cpg - 1;
2341
2342                 status = ocfs2_claim_suballoc_bits(ac,
2343                                                    handle,
2344                                                    bits_wanted,
2345                                                    min_clusters,
2346                                                    &res);
2347                 if (!status) {
2348                         BUG_ON(res.sr_blkno); /* cluster alloc can't set */
2349                         *cluster_start =
2350                                 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
2351                                                                  res.sr_bg_blkno,
2352                                                                  res.sr_bit_offset);
2353                         atomic_inc(&osb->alloc_stats.bitmap_data);
2354                         *num_clusters = res.sr_bits;
2355                 }
2356         }
2357         if (status < 0) {
2358                 if (status != -ENOSPC)
2359                         mlog_errno(status);
2360                 goto bail;
2361         }
2362
2363         ac->ac_bits_given += *num_clusters;
2364
2365 bail:
2366         mlog_exit(status);
2367         return status;
2368 }
2369
2370 int ocfs2_claim_clusters(handle_t *handle,
2371                          struct ocfs2_alloc_context *ac,
2372                          u32 min_clusters,
2373                          u32 *cluster_start,
2374                          u32 *num_clusters)
2375 {
2376         unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
2377
2378         return __ocfs2_claim_clusters(handle, ac, min_clusters,
2379                                       bits_wanted, cluster_start, num_clusters);
2380 }
2381
2382 static int ocfs2_block_group_clear_bits(handle_t *handle,
2383                                         struct inode *alloc_inode,
2384                                         struct ocfs2_group_desc *bg,
2385                                         struct buffer_head *group_bh,
2386                                         unsigned int bit_off,
2387                                         unsigned int num_bits,
2388                                         void (*undo_fn)(unsigned int bit,
2389                                                         unsigned long *bmap))
2390 {
2391         int status;
2392         unsigned int tmp;
2393         struct ocfs2_group_desc *undo_bg = NULL;
2394
2395         mlog_entry_void();
2396
2397         /* The caller got this descriptor from
2398          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
2399         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
2400
2401         mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
2402
2403         BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
2404         status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
2405                                          group_bh,
2406                                          undo_fn ?
2407                                          OCFS2_JOURNAL_ACCESS_UNDO :
2408                                          OCFS2_JOURNAL_ACCESS_WRITE);
2409         if (status < 0) {
2410                 mlog_errno(status);
2411                 goto bail;
2412         }
2413
2414         if (undo_fn) {
2415                 jbd_lock_bh_state(group_bh);
2416                 undo_bg = (struct ocfs2_group_desc *)
2417                                         bh2jh(group_bh)->b_committed_data;
2418                 BUG_ON(!undo_bg);
2419         }
2420
2421         tmp = num_bits;
2422         while(tmp--) {
2423                 ocfs2_clear_bit((bit_off + tmp),
2424                                 (unsigned long *) bg->bg_bitmap);
2425                 if (undo_fn)
2426                         undo_fn(bit_off + tmp,
2427                                 (unsigned long *) undo_bg->bg_bitmap);
2428         }
2429         le16_add_cpu(&bg->bg_free_bits_count, num_bits);
2430         if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
2431                 ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
2432                             " count %u but claims %u are freed. num_bits %d",
2433                             (unsigned long long)le64_to_cpu(bg->bg_blkno),
2434                             le16_to_cpu(bg->bg_bits),
2435                             le16_to_cpu(bg->bg_free_bits_count), num_bits);
2436                 return -EROFS;
2437         }
2438
2439         if (undo_fn)
2440                 jbd_unlock_bh_state(group_bh);
2441
2442         ocfs2_journal_dirty(handle, group_bh);
2443 bail:
2444         return status;
2445 }
2446
2447 /*
2448  * expects the suballoc inode to already be locked.
2449  */
2450 static int _ocfs2_free_suballoc_bits(handle_t *handle,
2451                                      struct inode *alloc_inode,
2452                                      struct buffer_head *alloc_bh,
2453                                      unsigned int start_bit,
2454                                      u64 bg_blkno,
2455                                      unsigned int count,
2456                                      void (*undo_fn)(unsigned int bit,
2457                                                      unsigned long *bitmap))
2458 {
2459         int status = 0;
2460         u32 tmp_used;
2461         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
2462         struct ocfs2_chain_list *cl = &fe->id2.i_chain;
2463         struct buffer_head *group_bh = NULL;
2464         struct ocfs2_group_desc *group;
2465
2466         mlog_entry_void();
2467
2468         /* The alloc_bh comes from ocfs2_free_dinode() or
2469          * ocfs2_free_clusters().  The callers have all locked the
2470          * allocator and gotten alloc_bh from the lock call.  This
2471          * validates the dinode buffer.  Any corruption that has happended
2472          * is a code bug. */
2473         BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
2474         BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
2475
2476         mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
2477              (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
2478              (unsigned long long)bg_blkno, start_bit);
2479
2480         status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
2481                                              &group_bh);
2482         if (status < 0) {
2483                 mlog_errno(status);
2484                 goto bail;
2485         }
2486         group = (struct ocfs2_group_desc *) group_bh->b_data;
2487
2488         BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
2489
2490         status = ocfs2_block_group_clear_bits(handle, alloc_inode,
2491                                               group, group_bh,
2492                                               start_bit, count, undo_fn);
2493         if (status < 0) {
2494                 mlog_errno(status);
2495                 goto bail;
2496         }
2497
2498         status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
2499                                          alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
2500         if (status < 0) {
2501                 mlog_errno(status);
2502                 goto bail;
2503         }
2504
2505         le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
2506                      count);
2507         tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
2508         fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
2509         ocfs2_journal_dirty(handle, alloc_bh);
2510
2511 bail:
2512         brelse(group_bh);
2513
2514         mlog_exit(status);
2515         return status;
2516 }
2517
2518 int ocfs2_free_suballoc_bits(handle_t *handle,
2519                              struct inode *alloc_inode,
2520                              struct buffer_head *alloc_bh,
2521                              unsigned int start_bit,
2522                              u64 bg_blkno,
2523                              unsigned int count)
2524 {
2525         return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
2526                                          start_bit, bg_blkno, count, NULL);
2527 }
2528
2529 int ocfs2_free_dinode(handle_t *handle,
2530                       struct inode *inode_alloc_inode,
2531                       struct buffer_head *inode_alloc_bh,
2532                       struct ocfs2_dinode *di)
2533 {
2534         u64 blk = le64_to_cpu(di->i_blkno);
2535         u16 bit = le16_to_cpu(di->i_suballoc_bit);
2536         u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
2537
2538         if (di->i_suballoc_loc)
2539                 bg_blkno = le64_to_cpu(di->i_suballoc_loc);
2540         return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
2541                                         inode_alloc_bh, bit, bg_blkno, 1);
2542 }
2543
2544 static int _ocfs2_free_clusters(handle_t *handle,
2545                                 struct inode *bitmap_inode,
2546                                 struct buffer_head *bitmap_bh,
2547                                 u64 start_blk,
2548                                 unsigned int num_clusters,
2549                                 void (*undo_fn)(unsigned int bit,
2550                                                 unsigned long *bitmap))
2551 {
2552         int status;
2553         u16 bg_start_bit;
2554         u64 bg_blkno;
2555         struct ocfs2_dinode *fe;
2556
2557         /* You can't ever have a contiguous set of clusters
2558          * bigger than a block group bitmap so we never have to worry
2559          * about looping on them. */
2560
2561         mlog_entry_void();
2562
2563         /* This is expensive. We can safely remove once this stuff has
2564          * gotten tested really well. */
2565         BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
2566
2567         fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
2568
2569         ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
2570                                      &bg_start_bit);
2571
2572         mlog(0, "want to free %u clusters starting at block %llu\n",
2573              num_clusters, (unsigned long long)start_blk);
2574         mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
2575              (unsigned long long)bg_blkno, bg_start_bit);
2576
2577         status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
2578                                            bg_start_bit, bg_blkno,
2579                                            num_clusters, undo_fn);
2580         if (status < 0) {
2581                 mlog_errno(status);
2582                 goto out;
2583         }
2584
2585         ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
2586                                          num_clusters);
2587
2588 out:
2589         mlog_exit(status);
2590         return status;
2591 }
2592
2593 int ocfs2_free_clusters(handle_t *handle,
2594                         struct inode *bitmap_inode,
2595                         struct buffer_head *bitmap_bh,
2596                         u64 start_blk,
2597                         unsigned int num_clusters)
2598 {
2599         return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2600                                     start_blk, num_clusters,
2601                                     _ocfs2_set_bit);
2602 }
2603
2604 /*
2605  * Give never-used clusters back to the global bitmap.  We don't need
2606  * to protect these bits in the undo buffer.
2607  */
2608 int ocfs2_release_clusters(handle_t *handle,
2609                            struct inode *bitmap_inode,
2610                            struct buffer_head *bitmap_bh,
2611                            u64 start_blk,
2612                            unsigned int num_clusters)
2613 {
2614         return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2615                                     start_blk, num_clusters,
2616                                     _ocfs2_clear_bit);
2617 }
2618
2619 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
2620 {
2621         printk("Block Group:\n");
2622         printk("bg_signature:       %s\n", bg->bg_signature);
2623         printk("bg_size:            %u\n", bg->bg_size);
2624         printk("bg_bits:            %u\n", bg->bg_bits);
2625         printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
2626         printk("bg_chain:           %u\n", bg->bg_chain);
2627         printk("bg_generation:      %u\n", le32_to_cpu(bg->bg_generation));
2628         printk("bg_next_group:      %llu\n",
2629                (unsigned long long)bg->bg_next_group);
2630         printk("bg_parent_dinode:   %llu\n",
2631                (unsigned long long)bg->bg_parent_dinode);
2632         printk("bg_blkno:           %llu\n",
2633                (unsigned long long)bg->bg_blkno);
2634 }
2635
2636 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
2637 {
2638         int i;
2639
2640         printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno);
2641         printk("i_signature:                  %s\n", fe->i_signature);
2642         printk("i_size:                       %llu\n",
2643                (unsigned long long)fe->i_size);
2644         printk("i_clusters:                   %u\n", fe->i_clusters);
2645         printk("i_generation:                 %u\n",
2646                le32_to_cpu(fe->i_generation));
2647         printk("id1.bitmap1.i_used:           %u\n",
2648                le32_to_cpu(fe->id1.bitmap1.i_used));
2649         printk("id1.bitmap1.i_total:          %u\n",
2650                le32_to_cpu(fe->id1.bitmap1.i_total));
2651         printk("id2.i_chain.cl_cpg:           %u\n", fe->id2.i_chain.cl_cpg);
2652         printk("id2.i_chain.cl_bpc:           %u\n", fe->id2.i_chain.cl_bpc);
2653         printk("id2.i_chain.cl_count:         %u\n", fe->id2.i_chain.cl_count);
2654         printk("id2.i_chain.cl_next_free_rec: %u\n",
2655                fe->id2.i_chain.cl_next_free_rec);
2656         for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) {
2657                 printk("fe->id2.i_chain.cl_recs[%d].c_free:  %u\n", i,
2658                        fe->id2.i_chain.cl_recs[i].c_free);
2659                 printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
2660                        fe->id2.i_chain.cl_recs[i].c_total);
2661                 printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i,
2662                        (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);
2663         }
2664 }
2665
2666 /*
2667  * For a given allocation, determine which allocators will need to be
2668  * accessed, and lock them, reserving the appropriate number of bits.
2669  *
2670  * Sparse file systems call this from ocfs2_write_begin_nolock()
2671  * and ocfs2_allocate_unwritten_extents().
2672  *
2673  * File systems which don't support holes call this from
2674  * ocfs2_extend_allocation().
2675  */
2676 int ocfs2_lock_allocators(struct inode *inode,
2677                           struct ocfs2_extent_tree *et,
2678                           u32 clusters_to_add, u32 extents_to_split,
2679                           struct ocfs2_alloc_context **data_ac,
2680                           struct ocfs2_alloc_context **meta_ac)
2681 {
2682         int ret = 0, num_free_extents;
2683         unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
2684         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2685
2686         *meta_ac = NULL;
2687         if (data_ac)
2688                 *data_ac = NULL;
2689
2690         BUG_ON(clusters_to_add != 0 && data_ac == NULL);
2691
2692         num_free_extents = ocfs2_num_free_extents(osb, et);
2693         if (num_free_extents < 0) {
2694                 ret = num_free_extents;
2695                 mlog_errno(ret);
2696                 goto out;
2697         }
2698
2699         /*
2700          * Sparse allocation file systems need to be more conservative
2701          * with reserving room for expansion - the actual allocation
2702          * happens while we've got a journal handle open so re-taking
2703          * a cluster lock (because we ran out of room for another
2704          * extent) will violate ordering rules.
2705          *
2706          * Most of the time we'll only be seeing this 1 cluster at a time
2707          * anyway.
2708          *
2709          * Always lock for any unwritten extents - we might want to
2710          * add blocks during a split.
2711          */
2712         if (!num_free_extents ||
2713             (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
2714                 ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
2715                 if (ret < 0) {
2716                         if (ret != -ENOSPC)
2717                                 mlog_errno(ret);
2718                         goto out;
2719                 }
2720         }
2721
2722         if (clusters_to_add == 0)
2723                 goto out;
2724
2725         ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
2726         if (ret < 0) {
2727                 if (ret != -ENOSPC)
2728                         mlog_errno(ret);
2729                 goto out;
2730         }
2731
2732 out:
2733         if (ret) {
2734                 if (*meta_ac) {
2735                         ocfs2_free_alloc_context(*meta_ac);
2736                         *meta_ac = NULL;
2737                 }
2738
2739                 /*
2740                  * We cannot have an error and a non null *data_ac.
2741                  */
2742         }
2743
2744         return ret;
2745 }
2746
2747 /*
2748  * Read the inode specified by blkno to get suballoc_slot and
2749  * suballoc_bit.
2750  */
2751 static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2752                                        u16 *suballoc_slot, u64 *group_blkno,
2753                                        u16 *suballoc_bit)
2754 {
2755         int status;
2756         struct buffer_head *inode_bh = NULL;
2757         struct ocfs2_dinode *inode_fe;
2758
2759         mlog_entry("blkno: %llu\n", (unsigned long long)blkno);
2760
2761         /* dirty read disk */
2762         status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
2763         if (status < 0) {
2764                 mlog(ML_ERROR, "read block %llu failed %d\n",
2765                      (unsigned long long)blkno, status);
2766                 goto bail;
2767         }
2768
2769         inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
2770         if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
2771                 mlog(ML_ERROR, "invalid inode %llu requested\n",
2772                      (unsigned long long)blkno);
2773                 status = -EINVAL;
2774                 goto bail;
2775         }
2776
2777         if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT &&
2778             (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
2779                 mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
2780                      (unsigned long long)blkno,
2781                      (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
2782                 status = -EINVAL;
2783                 goto bail;
2784         }
2785
2786         if (suballoc_slot)
2787                 *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
2788         if (suballoc_bit)
2789                 *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
2790         if (group_blkno)
2791                 *group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc);
2792
2793 bail:
2794         brelse(inode_bh);
2795
2796         mlog_exit(status);
2797         return status;
2798 }
2799
2800 /*
2801  * test whether bit is SET in allocator bitmap or not.  on success, 0
2802  * is returned and *res is 1 for SET; 0 otherwise.  when fails, errno
2803  * is returned and *res is meaningless.  Call this after you have
2804  * cluster locked against suballoc, or you may get a result based on
2805  * non-up2date contents
2806  */
2807 static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2808                                    struct inode *suballoc,
2809                                    struct buffer_head *alloc_bh,
2810                                    u64 group_blkno, u64 blkno,
2811                                    u16 bit, int *res)
2812 {
2813         struct ocfs2_dinode *alloc_di;
2814         struct ocfs2_group_desc *group;
2815         struct buffer_head *group_bh = NULL;
2816         u64 bg_blkno;
2817         int status;
2818
2819         mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno,
2820                    (unsigned int)bit);
2821
2822         alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data;
2823         if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) {
2824                 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
2825                      (unsigned int)bit,
2826                      ocfs2_bits_per_group(&alloc_di->id2.i_chain));
2827                 status = -EINVAL;
2828                 goto bail;
2829         }
2830
2831         bg_blkno = group_blkno ? group_blkno :
2832                    ocfs2_which_suballoc_group(blkno, bit);
2833         status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
2834                                              &group_bh);
2835         if (status < 0) {
2836                 mlog(ML_ERROR, "read group %llu failed %d\n",
2837                      (unsigned long long)bg_blkno, status);
2838                 goto bail;
2839         }
2840
2841         group = (struct ocfs2_group_desc *) group_bh->b_data;
2842         *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap);
2843
2844 bail:
2845         brelse(group_bh);
2846
2847         mlog_exit(status);
2848         return status;
2849 }
2850
2851 /*
2852  * Test if the bit representing this inode (blkno) is set in the
2853  * suballocator.
2854  *
2855  * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
2856  *
2857  * In the event of failure, a negative value is returned and *res is
2858  * meaningless.
2859  *
2860  * Callers must make sure to hold nfs_sync_lock to prevent
2861  * ocfs2_delete_inode() on another node from accessing the same
2862  * suballocator concurrently.
2863  */
2864 int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2865 {
2866         int status;
2867         u64 group_blkno = 0;
2868         u16 suballoc_bit = 0, suballoc_slot = 0;
2869         struct inode *inode_alloc_inode;
2870         struct buffer_head *alloc_bh = NULL;
2871
2872         mlog_entry("blkno: %llu", (unsigned long long)blkno);
2873
2874         status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
2875                                              &group_blkno, &suballoc_bit);
2876         if (status < 0) {
2877                 mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
2878                 goto bail;
2879         }
2880
2881         inode_alloc_inode =
2882                 ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
2883                                             suballoc_slot);
2884         if (!inode_alloc_inode) {
2885                 /* the error code could be inaccurate, but we are not able to
2886                  * get the correct one. */
2887                 status = -EINVAL;
2888                 mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
2889                      (u32)suballoc_slot);
2890                 goto bail;
2891         }
2892
2893         mutex_lock(&inode_alloc_inode->i_mutex);
2894         status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
2895         if (status < 0) {
2896                 mutex_unlock(&inode_alloc_inode->i_mutex);
2897                 mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
2898                      (u32)suballoc_slot, status);
2899                 goto bail;
2900         }
2901
2902         status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
2903                                          group_blkno, blkno, suballoc_bit, res);
2904         if (status < 0)
2905                 mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
2906
2907         ocfs2_inode_unlock(inode_alloc_inode, 0);
2908         mutex_unlock(&inode_alloc_inode->i_mutex);
2909
2910         iput(inode_alloc_inode);
2911         brelse(alloc_bh);
2912 bail:
2913         mlog_exit(status);
2914         return status;
2915 }