efc509b3af1ff45cc8747a0bcd6e3078d91dcdf0
[pandora-kernel.git] / fs / ocfs2 / move_extents.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * move_extents.c
5  *
6  * Copyright (C) 2011 Oracle.  All rights reserved.
7  *
8  * This program is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU General Public
10  * License version 2 as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * General Public License for more details.
16  */
17 #include <linux/fs.h>
18 #include <linux/types.h>
19 #include <linux/mount.h>
20 #include <linux/swap.h>
21
22 #include <cluster/masklog.h>
23
24 #include "ocfs2.h"
25 #include "ocfs2_ioctl.h"
26
27 #include "alloc.h"
28 #include "aops.h"
29 #include "dlmglue.h"
30 #include "extent_map.h"
31 #include "inode.h"
32 #include "journal.h"
33 #include "suballoc.h"
34 #include "uptodate.h"
35 #include "super.h"
36 #include "dir.h"
37 #include "buffer_head_io.h"
38 #include "sysfile.h"
39 #include "suballoc.h"
40 #include "refcounttree.h"
41 #include "move_extents.h"
42
/*
 * Per-call state for one OCFS2_IOC_MOVE_EXT request, shared by the
 * move/defrag helpers below.
 */
struct ocfs2_move_extents_context {
	struct inode *inode;		/* inode whose extents are being moved */
	struct file *file;		/* open file, used for page duplication */
	int auto_defrag;		/* non-zero: defrag mode (claim new clusters
					 * near each extent) rather than moving to
					 * a user-supplied goal */
	int partial;			/* non-zero: accept a partial cluster claim */
	int credits;			/* journal credits for the current cycle */
	u32 new_phys_cpos;		/* start cluster of the new location, set by
					 * __ocfs2_move_extent() */
	u32 clusters_moved;		/* running total of moved clusters */
	u64 refcount_loc;		/* refcount tree root block (from
					 * di->i_refcount_loc), 0 if none */
	struct ocfs2_move_extents *range;	/* the userspace request */
	struct ocfs2_extent_tree et;		/* extent tree of 'inode' */
	struct ocfs2_alloc_context *meta_ac;	/* metadata block reservation */
	struct ocfs2_alloc_context *data_ac;	/* data cluster reservation */
	struct ocfs2_cached_dealloc_ctxt dealloc;	/* deferred frees */
};
58
59 static int __ocfs2_move_extent(handle_t *handle,
60                                struct ocfs2_move_extents_context *context,
61                                u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos,
62                                int ext_flags)
63 {
64         int ret = 0, index;
65         struct inode *inode = context->inode;
66         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
67         struct ocfs2_extent_rec *rec, replace_rec;
68         struct ocfs2_path *path = NULL;
69         struct ocfs2_extent_list *el;
70         u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci);
71         u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos);
72
73         ret = ocfs2_duplicate_clusters_by_page(handle, context->file, cpos,
74                                                p_cpos, new_p_cpos, len);
75         if (ret) {
76                 mlog_errno(ret);
77                 goto out;
78         }
79
80         memset(&replace_rec, 0, sizeof(replace_rec));
81         replace_rec.e_cpos = cpu_to_le32(cpos);
82         replace_rec.e_leaf_clusters = cpu_to_le16(len);
83         replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb,
84                                                                    new_p_cpos));
85
86         path = ocfs2_new_path_from_et(&context->et);
87         if (!path) {
88                 ret = -ENOMEM;
89                 mlog_errno(ret);
90                 goto out;
91         }
92
93         ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos);
94         if (ret) {
95                 mlog_errno(ret);
96                 goto out;
97         }
98
99         el = path_leaf_el(path);
100
101         index = ocfs2_search_extent_list(el, cpos);
102         if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
103                 ocfs2_error(inode->i_sb,
104                             "Inode %llu has an extent at cpos %u which can no "
105                             "longer be found.\n",
106                             (unsigned long long)ino, cpos);
107                 ret = -EROFS;
108                 goto out;
109         }
110
111         rec = &el->l_recs[index];
112
113         BUG_ON(ext_flags != rec->e_flags);
114         /*
115          * after moving/defraging to new location, the extent is not going
116          * to be refcounted anymore.
117          */
118         replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED;
119
120         ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
121                                       context->et.et_root_bh,
122                                       OCFS2_JOURNAL_ACCESS_WRITE);
123         if (ret) {
124                 mlog_errno(ret);
125                 goto out;
126         }
127
128         ret = ocfs2_split_extent(handle, &context->et, path, index,
129                                  &replace_rec, context->meta_ac,
130                                  &context->dealloc);
131         if (ret) {
132                 mlog_errno(ret);
133                 goto out;
134         }
135
136         ocfs2_journal_dirty(handle, context->et.et_root_bh);
137
138         context->new_phys_cpos = new_p_cpos;
139
140         /*
141          * need I to append truncate log for old clusters?
142          */
143         if (old_blkno) {
144                 if (ext_flags & OCFS2_EXT_REFCOUNTED)
145                         ret = ocfs2_decrease_refcount(inode, handle,
146                                         ocfs2_blocks_to_clusters(osb->sb,
147                                                                  old_blkno),
148                                         len, context->meta_ac,
149                                         &context->dealloc, 1);
150                 else
151                         ret = ocfs2_truncate_log_append(osb, handle,
152                                                         old_blkno, len);
153         }
154
155 out:
156         return ret;
157 }
158
/*
 * Lock the allocators, reserving an appropriate number of bits for
 * metadata blocks and data clusters.
 *
 * 'clusters_to_move' and 'extents_to_split' size the reservation;
 * 'extra_blocks' is added on top of whatever btree-growth metadata the
 * extent tree may need.  On success, *credits is increased by the
 * journal credits needed to extend the tree by clusters_to_move + 2
 * clusters.
 *
 * In some cases we don't need to reserve clusters: just pass data_ac
 * as NULL.  On failure, a reserved *meta_ac is released and NULLed.
 */
static int ocfs2_lock_allocators_move_extents(struct inode *inode,
					struct ocfs2_extent_tree *et,
					u32 clusters_to_move,
					u32 extents_to_split,
					struct ocfs2_alloc_context **meta_ac,
					struct ocfs2_alloc_context **data_ac,
					int extra_blocks,
					int *credits)
{
	int ret, num_free_extents;
	/* each split may add two records; each moved cluster may add one */
	unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	num_free_extents = ocfs2_num_free_extents(osb, et);
	if (num_free_extents < 0) {
		ret = num_free_extents;
		mlog_errno(ret);
		goto out;
	}

	/*
	 * Reserve room to grow the tree when the leaf is full, or when a
	 * sparse-capable volume might not fit all the new records.
	 */
	if (!num_free_extents ||
	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
		extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);

	ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	if (data_ac) {
		ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	*credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el,
					      clusters_to_move + 2);

	mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n",
	     extra_blocks, clusters_to_move, *credits);
out:
	if (ret) {
		/*
		 * NOTE(review): assumes the caller passed in *meta_ac == NULL
		 * (callers hand over context->meta_ac); a stale non-NULL
		 * value would be freed here even if we never reserved it —
		 * confirm against the context initialization.
		 */
		if (*meta_ac) {
			ocfs2_free_alloc_context(*meta_ac);
			*meta_ac = NULL;
		}
	}

	return ret;
}
219
/*
 * Defragment one extent: claim a fresh run of clusters and move the
 * extent's data there, using one journal handle to guarantee data
 * consistency in case a crash happens anywhere.
 *
 * If the allocator returns fewer clusters than asked (new_len < *len):
 * with context->partial set, *len is updated to what was actually
 * moved; otherwise OCFS2_MOVE_EXT_FL_COMPLETE is cleared in the range
 * flags and -ENOSPC is returned.
 *
 * Lock order: refcount tree -> allocators -> truncate-log i_mutex ->
 * journal transaction.
 */
static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
			       u32 cpos, u32 phys_cpos, u32 *len, int ext_flags)
{
	int ret, credits = 0, extra_blocks = 0, partial = context->partial;
	handle_t *handle;
	struct inode *inode = context->inode;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct inode *tl_inode = osb->osb_tl_inode;
	struct ocfs2_refcount_tree *ref_tree = NULL;
	u32 new_phys_cpos, new_len;
	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);

	if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) {

		BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
			 OCFS2_HAS_REFCOUNT_FL));

		BUG_ON(!context->refcount_loc);

		ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
					       &ref_tree, NULL);
		if (ret) {
			mlog_errno(ret);
			return ret;
		}

		/* size the credits/blocks needed to CoW-break the extent */
		ret = ocfs2_prepare_refcount_change_for_del(inode,
							context->refcount_loc,
							phys_blkno,
							*len,
							&credits,
							&extra_blocks);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1,
						 &context->meta_ac,
						 &context->data_ac,
						 extra_blocks, &credits);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/*
	 * should be using allocation reservation strategy there?
	 *
	 * if (context->data_ac)
	 *	context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
	 */

	mutex_lock(&tl_inode->i_mutex);

	/* make room in the truncate log before appending the old clusters */
	if (ocfs2_truncate_log_needs_flush(osb)) {
		ret = __ocfs2_flush_truncate_log(osb);
		if (ret < 0) {
			mlog_errno(ret);
			goto out_unlock_mutex;
		}
	}

	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out_unlock_mutex;
	}

	ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len,
				     &new_phys_cpos, &new_len);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	/*
	 * allowing partial extent moving is kind of 'pros and cons', it makes
	 * whole defragmentation less likely to fail, on the contrary, the bad
	 * thing is it may make the fs even more fragmented after moving, let
	 * userspace make a good decision here.
	 */
	if (new_len != *len) {
		mlog(0, "len_claimed: %u, len: %u\n", new_len, *len);
		if (!partial) {
			context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE;
			ret = -ENOSPC;
			goto out_commit;
		}
	}

	mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos,
	     phys_cpos, new_phys_cpos);

	ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos,
				  new_phys_cpos, ext_flags);
	if (ret)
		mlog_errno(ret);

	/* report back how much was actually moved on a partial claim */
	if (partial && (new_len != *len))
		*len = new_len;

	/*
	 * Here we should write the new page out first if we are
	 * in write-back mode.
	 *
	 * NOTE(review): this overwrites 'ret' — a failure from
	 * __ocfs2_move_extent() above is lost if the writeback below
	 * succeeds.  Confirm that is intended.
	 */
	ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len);
	if (ret)
		mlog_errno(ret);

out_commit:
	ocfs2_commit_trans(osb, handle);

out_unlock_mutex:
	mutex_unlock(&tl_inode->i_mutex);

	if (context->data_ac) {
		ocfs2_free_alloc_context(context->data_ac);
		context->data_ac = NULL;
	}

	if (context->meta_ac) {
		ocfs2_free_alloc_context(context->meta_ac);
		context->meta_ac = NULL;
	}

out:
	if (ref_tree)
		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);

	return ret;
}
358
359 /*
360  * find the victim alloc group, where #blkno fits.
361  */
362 static int ocfs2_find_victim_alloc_group(struct inode *inode,
363                                          u64 vict_blkno,
364                                          int type, int slot,
365                                          int *vict_bit,
366                                          struct buffer_head **ret_bh)
367 {
368         int ret, i, blocks_per_unit = 1;
369         u64 blkno;
370         char namebuf[40];
371
372         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
373         struct buffer_head *ac_bh = NULL, *gd_bh = NULL;
374         struct ocfs2_chain_list *cl;
375         struct ocfs2_chain_rec *rec;
376         struct ocfs2_dinode *ac_dinode;
377         struct ocfs2_group_desc *bg;
378
379         ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot);
380         ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
381                                          strlen(namebuf), &blkno);
382         if (ret) {
383                 ret = -ENOENT;
384                 goto out;
385         }
386
387         ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh);
388         if (ret) {
389                 mlog_errno(ret);
390                 goto out;
391         }
392
393         ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data;
394         cl = &(ac_dinode->id2.i_chain);
395         rec = &(cl->cl_recs[0]);
396
397         if (type == GLOBAL_BITMAP_SYSTEM_INODE)
398                 blocks_per_unit <<= (osb->s_clustersize_bits -
399                                                 inode->i_sb->s_blocksize_bits);
400         /*
401          * 'vict_blkno' was out of the valid range.
402          */
403         if ((vict_blkno < le64_to_cpu(rec->c_blkno)) ||
404             (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) *
405                                 blocks_per_unit))) {
406                 ret = -EINVAL;
407                 goto out;
408         }
409
410         for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
411
412                 rec = &(cl->cl_recs[i]);
413                 if (!rec)
414                         continue;
415
416                 bg = NULL;
417
418                 do {
419                         if (!bg)
420                                 blkno = le64_to_cpu(rec->c_blkno);
421                         else
422                                 blkno = le64_to_cpu(bg->bg_next_group);
423
424                         if (gd_bh) {
425                                 brelse(gd_bh);
426                                 gd_bh = NULL;
427                         }
428
429                         ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh);
430                         if (ret) {
431                                 mlog_errno(ret);
432                                 goto out;
433                         }
434
435                         bg = (struct ocfs2_group_desc *)gd_bh->b_data;
436
437                         if (vict_blkno < (le64_to_cpu(bg->bg_blkno) +
438                                                 le16_to_cpu(bg->bg_bits))) {
439
440                                 *ret_bh = gd_bh;
441                                 *vict_bit = (vict_blkno - blkno) /
442                                                         blocks_per_unit;
443                                 mlog(0, "find the victim group: #%llu, "
444                                      "total_bits: %u, vict_bit: %u\n",
445                                      blkno, le16_to_cpu(bg->bg_bits),
446                                      *vict_bit);
447                                 goto out;
448                         }
449
450                 } while (le64_to_cpu(bg->bg_next_group));
451         }
452
453         ret = -EINVAL;
454 out:
455         brelse(ac_bh);
456
457         /*
458          * caller has to release the gd_bh properly.
459          */
460         return ret;
461 }
462
/*
 * XXX: helper to validate and adjust the moving goal supplied by
 * userspace in range->me_goal (a block number inside the global
 * bitmap).  Adjusts the goal to cluster alignment and bumps it off a
 * group-descriptor block; rejects ranges that would cross a group.
 */
static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
					       struct ocfs2_move_extents *range)
{
	int ret, goal_bit = 0;

	struct buffer_head *gd_bh = NULL;
	struct ocfs2_group_desc *bg;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	/* blocks per cluster */
	int c_to_b = 1 << (osb->s_clustersize_bits -
					inode->i_sb->s_blocksize_bits);

	/*
	 * validate goal sits within global_bitmap, and return the victim
	 * group desc
	 */
	ret = ocfs2_find_victim_alloc_group(inode, range->me_goal,
					    GLOBAL_BITMAP_SYSTEM_INODE,
					    OCFS2_INVALID_SLOT,
					    &goal_bit, &gd_bh);
	if (ret)
		goto out;

	bg = (struct ocfs2_group_desc *)gd_bh->b_data;

	/*
	 * make goal become cluster aligned.
	 */
	if (range->me_goal % c_to_b)
		range->me_goal = range->me_goal / c_to_b * c_to_b;

	/*
	 * the moving goal is not allowed to start with a group desc block
	 * (#0 blk of the group); compromise to the next cluster instead.
	 */
	if (range->me_goal == le64_to_cpu(bg->bg_blkno))
		range->me_goal += c_to_b;

	/*
	 * movement is not gonna cross two groups.
	 */
	if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize <
								range->me_len) {
		ret = -EINVAL;
		goto out;
	}
	/*
	 * more exact validations/adjustments will be performed later during
	 * moving operation for each extent range.
	 */
	mlog(0, "extents get ready to be moved to #%llu block\n",
	     range->me_goal);

out:
	brelse(gd_bh);

	return ret;
}
523
/*
 * Scan the victim group's bitmap, starting at *goal_bit, for a run of
 * 'move_len' contiguous free bits.  Gives up once a used bit more than
 * 'max_hop' bits past the starting point is hit, storing 0 through
 * *phys_cpos to signal failure; otherwise stores the found bit/cluster
 * through *goal_bit and *phys_cpos.
 */
static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
				    int *goal_bit, u32 move_len, u32 max_hop,
				    u32 *phys_cpos)
{
	int i, used, last_free_bits = 0, base_bit = *goal_bit;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
	u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
						 le64_to_cpu(gd->bg_blkno));

	for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) {

		used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap);
		if (used) {
			/*
			 * we even tried searching the free chunk by jumping
			 * a 'max_hop' distance, but still failed.
			 */
			if ((i - base_bit) > max_hop) {
				*phys_cpos = 0;	/* tell the caller we failed */
				break;
			}

			/* a used bit ends the current free run */
			if (last_free_bits)
				last_free_bits = 0;

			continue;
		} else
			last_free_bits++;

		if (last_free_bits == move_len) {
			/*
			 * NOTE(review): 'i' here is the LAST bit of the free
			 * run, so *goal_bit/*phys_cpos name the run's end
			 * rather than its start — confirm this is the
			 * contract expected by ocfs2_move_extent() and
			 * ocfs2_block_group_set_bits().
			 */
			*goal_bit = i;
			*phys_cpos = base_cpos + i;
			break;
		}
	}

	/* NOTE(review): if the loop ends without a run and without exceeding
	 * max_hop, *phys_cpos is left untouched — caller must pre-set it. */
	mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos);
}
562
563 static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
564                                        handle_t *handle,
565                                        struct buffer_head *di_bh,
566                                        u32 num_bits,
567                                        u16 chain)
568 {
569         int ret;
570         u32 tmp_used;
571         struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
572         struct ocfs2_chain_list *cl =
573                                 (struct ocfs2_chain_list *) &di->id2.i_chain;
574
575         ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
576                                       OCFS2_JOURNAL_ACCESS_WRITE);
577         if (ret < 0) {
578                 mlog_errno(ret);
579                 goto out;
580         }
581
582         tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
583         di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
584         le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
585         ocfs2_journal_dirty(handle, di_bh);
586
587 out:
588         return ret;
589 }
590
/*
 * Mark 'num_bits' bits, starting at 'bit_off', as allocated in the
 * group descriptor's bitmap under the given journal handle.  Cluster
 * bitmaps are journalled with UNDO access so an aborted transaction
 * can restore them; other allocators use plain WRITE access.
 *
 * Returns 0 on success, a negative errno on journal failure, or
 * -EROFS if the descriptor's counters are found corrupt.
 */
static inline int ocfs2_block_group_set_bits(handle_t *handle,
					     struct inode *alloc_inode,
					     struct ocfs2_group_desc *bg,
					     struct buffer_head *group_bh,
					     unsigned int bit_off,
					     unsigned int num_bits)
{
	int status;
	void *bitmap = bg->bg_bitmap;
	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;

	/* All callers get the descriptor via
	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);

	mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
	     num_bits);

	/* cluster bitmaps need undo journaling for crash recovery */
	if (ocfs2_is_cluster_bitmap(alloc_inode))
		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;

	status = ocfs2_journal_access_gd(handle,
					 INODE_CACHE(alloc_inode),
					 group_bh,
					 journal_type);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
	if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
		/*
		 * NOTE(review): this returns with bg_free_bits_count already
		 * decremented and without journal_dirty; presumably moot
		 * because ocfs2_error() takes the fs read-only — confirm.
		 */
		ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
			    " count %u but claims %u are freed. num_bits %d",
			    (unsigned long long)le64_to_cpu(bg->bg_blkno),
			    le16_to_cpu(bg->bg_bits),
			    le16_to_cpu(bg->bg_free_bits_count), num_bits);
		return -EROFS;
	}
	while (num_bits--)
		ocfs2_set_bit(bit_off++, bitmap);

	ocfs2_journal_dirty(handle, group_bh);

bail:
	return status;
}
639
640 static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
641                              u32 cpos, u32 phys_cpos, u32 *new_phys_cpos,
642                              u32 len, int ext_flags)
643 {
644         int ret, credits = 0, extra_blocks = 0, goal_bit = 0;
645         handle_t *handle;
646         struct inode *inode = context->inode;
647         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
648         struct inode *tl_inode = osb->osb_tl_inode;
649         struct inode *gb_inode = NULL;
650         struct buffer_head *gb_bh = NULL;
651         struct buffer_head *gd_bh = NULL;
652         struct ocfs2_group_desc *gd;
653         struct ocfs2_refcount_tree *ref_tree = NULL;
654         u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb,
655                                                     context->range->me_threshold);
656         u64 phys_blkno, new_phys_blkno;
657
658         phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
659
660         if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) {
661
662                 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
663                          OCFS2_HAS_REFCOUNT_FL));
664
665                 BUG_ON(!context->refcount_loc);
666
667                 ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
668                                                &ref_tree, NULL);
669                 if (ret) {
670                         mlog_errno(ret);
671                         return ret;
672                 }
673
674                 ret = ocfs2_prepare_refcount_change_for_del(inode,
675                                                         context->refcount_loc,
676                                                         phys_blkno,
677                                                         len,
678                                                         &credits,
679                                                         &extra_blocks);
680                 if (ret) {
681                         mlog_errno(ret);
682                         goto out;
683                 }
684         }
685
686         ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1,
687                                                  &context->meta_ac,
688                                                  NULL, extra_blocks, &credits);
689         if (ret) {
690                 mlog_errno(ret);
691                 goto out;
692         }
693
694         /*
695          * need to count 2 extra credits for global_bitmap inode and
696          * group descriptor.
697          */
698         credits += OCFS2_INODE_UPDATE_CREDITS + 1;
699
700         /*
701          * ocfs2_move_extent() didn't reserve any clusters in lock_allocators()
702          * logic, while we still need to lock the global_bitmap.
703          */
704         gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
705                                                OCFS2_INVALID_SLOT);
706         if (!gb_inode) {
707                 mlog(ML_ERROR, "unable to get global_bitmap inode\n");
708                 ret = -EIO;
709                 goto out;
710         }
711
712         mutex_lock(&gb_inode->i_mutex);
713
714         ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1);
715         if (ret) {
716                 mlog_errno(ret);
717                 goto out_unlock_gb_mutex;
718         }
719
720         mutex_lock(&tl_inode->i_mutex);
721
722         handle = ocfs2_start_trans(osb, credits);
723         if (IS_ERR(handle)) {
724                 ret = PTR_ERR(handle);
725                 mlog_errno(ret);
726                 goto out_unlock_tl_inode;
727         }
728
729         new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos);
730         ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno,
731                                             GLOBAL_BITMAP_SYSTEM_INODE,
732                                             OCFS2_INVALID_SLOT,
733                                             &goal_bit, &gd_bh);
734         if (ret) {
735                 mlog_errno(ret);
736                 goto out_commit;
737         }
738
739         /*
740          * probe the victim cluster group to find a proper
741          * region to fit wanted movement, it even will perfrom
742          * a best-effort attempt by compromising to a threshold
743          * around the goal.
744          */
745         ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop,
746                                 new_phys_cpos);
747         if (!new_phys_cpos) {
748                 ret = -ENOSPC;
749                 goto out_commit;
750         }
751
752         ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos,
753                                   *new_phys_cpos, ext_flags);
754         if (ret) {
755                 mlog_errno(ret);
756                 goto out_commit;
757         }
758
759         gd = (struct ocfs2_group_desc *)gd_bh->b_data;
760         ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len,
761                                                le16_to_cpu(gd->bg_chain));
762         if (ret) {
763                 mlog_errno(ret);
764                 goto out_commit;
765         }
766
767         ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh,
768                                          goal_bit, len);
769         if (ret)
770                 mlog_errno(ret);
771
772         /*
773          * Here we should write the new page out first if we are
774          * in write-back mode.
775          */
776         ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len);
777         if (ret)
778                 mlog_errno(ret);
779
780 out_commit:
781         ocfs2_commit_trans(osb, handle);
782         brelse(gd_bh);
783
784 out_unlock_tl_inode:
785         mutex_unlock(&tl_inode->i_mutex);
786
787         ocfs2_inode_unlock(gb_inode, 1);
788 out_unlock_gb_mutex:
789         mutex_unlock(&gb_inode->i_mutex);
790         brelse(gb_bh);
791         iput(gb_inode);
792
793 out:
794         if (context->meta_ac) {
795                 ocfs2_free_alloc_context(context->meta_ac);
796                 context->meta_ac = NULL;
797         }
798
799         if (ref_tree)
800                 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
801
802         return ret;
803 }
804
805 /*
806  * Helper to calculate the defraging length in one run according to threshold.
807  */
/*
 * Helper to calculate the defraging length in one run according to threshold.
 *
 * Accumulates extents into a defrag run until 'threshold' clusters have
 * been gathered.  A single extent already at/above the threshold is
 * skipped; otherwise the current extent is trimmed so the run totals
 * exactly 'threshold' clusters and the run length is reset.
 */
static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged,
					 u32 threshold, int *skip)
{
	u32 total = *alloc_size + *len_defraged;

	if (total < threshold) {
		/* keep accumulating until we meet the threshold */
		*len_defraged = total;
		return;
	}

	if (*len_defraged == 0) {
		/* XXX: a lone extent already >= threshold, skip it */
		*skip = 1;
		return;
	}

	/*
	 * split this extent to coalesce with former pieces as
	 * to reach the threshold.
	 *
	 * we're done here with one cycle of defragmentation
	 * in a size of 'thresh', resetting 'len_defraged'
	 * forces a new defragmentation.
	 */
	*alloc_size = threshold - *len_defraged;
	*len_defraged = 0;
}
834
/*
 * Walk the requested byte range cluster by cluster and either defrag
 * each extent in place (auto-defrag mode) or relocate it toward the
 * caller-supplied goal (move mode).
 *
 * On every exit path the bytes actually handled and the new offset are
 * written back into context->range so userspace learns how far a
 * partially-completed run got; OCFS2_MOVE_EXT_FL_COMPLETE is only set
 * when the whole range was processed.
 *
 * Returns 0 on success (including the trivial empty-file/empty-range
 * and inline-data cases), negative errno otherwise.
 */
static int __ocfs2_move_extents_range(struct buffer_head *di_bh,
				struct ocfs2_move_extents_context *context)
{
	int ret = 0, flags, do_defrag, skip = 0;
	u32 cpos, phys_cpos, move_start, len_to_move, alloc_size;
	u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0;

	struct inode *inode = context->inode;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct ocfs2_move_extents *range = context->range;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	/* nothing to do for an empty file or an empty request */
	if ((inode->i_size == 0) || (range->me_len == 0))
		return 0;

	/* inline data lives in the inode block itself; no extents to move */
	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
		return 0;

	context->refcount_loc = le64_to_cpu(di->i_refcount_loc);

	ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh);
	ocfs2_init_dealloc_ctxt(&context->dealloc);

	/*
	 * TO-DO XXX:
	 *
	 * - xattr extents.
	 */

	do_defrag = context->auto_defrag;

	/*
	 * extents moving happens in unit of clusters, for the sake
	 * of simplicity, we may ignore two clusters where 'byte_start'
	 * and 'byte_start + len' were within.
	 */
	move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start);
	len_to_move = (range->me_start + range->me_len) >>
						osb->s_clustersize_bits;
	if (len_to_move >= move_start)
		len_to_move -= move_start;
	else
		len_to_move = 0;

	if (do_defrag)
		/* threshold arrives in bytes; convert to clusters */
		defrag_thresh = range->me_threshold >> osb->s_clustersize_bits;
	else
		/* move mode: me_goal was validated/adjusted by the caller */
		new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
							 range->me_goal);

	mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, "
	     "thresh: %u\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     (unsigned long long)range->me_start,
	     (unsigned long long)range->me_len,
	     move_start, len_to_move, defrag_thresh);

	cpos = move_start;
	while (len_to_move) {
		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size,
					 &flags);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/* clamp the last extent to the remaining request length */
		if (alloc_size > len_to_move)
			alloc_size = len_to_move;

		/*
		 * XXX: how to deal with a hole:
		 *
		 * - skip the hole of course
		 * - force a new defragmentation
		 */
		if (!phys_cpos) {
			if (do_defrag)
				len_defraged = 0;

			goto next;
		}

		if (do_defrag) {
			ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged,
						     defrag_thresh, &skip);
			/*
			 * skip large extents
			 */
			if (skip) {
				skip = 0;
				goto next;
			}

			mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, "
			     "alloc_size: %u, len_defraged: %u\n",
			     cpos, phys_cpos, alloc_size, len_defraged);

			/*
			 * NOTE(review): ocfs2_defrag_extent() may shrink
			 * alloc_size (partial defrag) — presumably why it
			 * takes it by pointer; confirm against its definition.
			 */
			ret = ocfs2_defrag_extent(context, cpos, phys_cpos,
						  &alloc_size, flags);
		} else {
			ret = ocfs2_move_extent(context, cpos, phys_cpos,
						&new_phys_cpos, alloc_size,
						flags);

			/* next extent lands right after the one just moved */
			new_phys_cpos += alloc_size;
		}

		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}

		context->clusters_moved += alloc_size;
next:
		cpos += alloc_size;
		len_to_move -= alloc_size;
	}

	/* the entire requested range was processed */
	range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE;

out:
	/* report progress to userspace even on a partial/failed run */
	range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb,
						      context->clusters_moved);
	range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb,
						       context->new_phys_cpos);

	ocfs2_schedule_truncate_log_flush(osb, 1);
	ocfs2_run_deallocs(osb, &context->dealloc);

	return ret;
}
966
967 static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
968 {
969         int status;
970         handle_t *handle;
971         struct inode *inode = context->inode;
972         struct ocfs2_dinode *di;
973         struct buffer_head *di_bh = NULL;
974         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
975
976         if (!inode)
977                 return -ENOENT;
978
979         if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
980                 return -EROFS;
981
982         mutex_lock(&inode->i_mutex);
983
984         /*
985          * This prevents concurrent writes from other nodes
986          */
987         status = ocfs2_rw_lock(inode, 1);
988         if (status) {
989                 mlog_errno(status);
990                 goto out;
991         }
992
993         status = ocfs2_inode_lock(inode, &di_bh, 1);
994         if (status) {
995                 mlog_errno(status);
996                 goto out_rw_unlock;
997         }
998
999         /*
1000          * rememer ip_xattr_sem also needs to be held if necessary
1001          */
1002         down_write(&OCFS2_I(inode)->ip_alloc_sem);
1003
1004         status = __ocfs2_move_extents_range(di_bh, context);
1005
1006         up_write(&OCFS2_I(inode)->ip_alloc_sem);
1007         if (status) {
1008                 mlog_errno(status);
1009                 goto out_inode_unlock;
1010         }
1011
1012         /*
1013          * We update ctime for these changes
1014          */
1015         handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1016         if (IS_ERR(handle)) {
1017                 status = PTR_ERR(handle);
1018                 mlog_errno(status);
1019                 goto out_inode_unlock;
1020         }
1021
1022         status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
1023                                          OCFS2_JOURNAL_ACCESS_WRITE);
1024         if (status) {
1025                 mlog_errno(status);
1026                 goto out_commit;
1027         }
1028
1029         di = (struct ocfs2_dinode *)di_bh->b_data;
1030         inode->i_ctime = CURRENT_TIME;
1031         di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
1032         di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
1033
1034         ocfs2_journal_dirty(handle, di_bh);
1035
1036 out_commit:
1037         ocfs2_commit_trans(osb, handle);
1038
1039 out_inode_unlock:
1040         brelse(di_bh);
1041         ocfs2_inode_unlock(inode, 1);
1042 out_rw_unlock:
1043         ocfs2_rw_unlock(inode, 1);
1044 out:
1045         mutex_unlock(&inode->i_mutex);
1046
1047         return status;
1048 }
1049
1050 int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
1051 {
1052         int status;
1053
1054         struct inode *inode = filp->f_path.dentry->d_inode;
1055         struct ocfs2_move_extents range;
1056         struct ocfs2_move_extents_context *context = NULL;
1057
1058         status = mnt_want_write(filp->f_path.mnt);
1059         if (status)
1060                 return status;
1061
1062         if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE))
1063                 goto out;
1064
1065         if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
1066                 status = -EPERM;
1067                 goto out;
1068         }
1069
1070         context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS);
1071         if (!context) {
1072                 status = -ENOMEM;
1073                 mlog_errno(status);
1074                 goto out;
1075         }
1076
1077         context->inode = inode;
1078         context->file = filp;
1079
1080         if (argp) {
1081                 if (copy_from_user(&range, (struct ocfs2_move_extents *)argp,
1082                                    sizeof(range))) {
1083                         status = -EFAULT;
1084                         goto out;
1085                 }
1086         } else {
1087                 status = -EINVAL;
1088                 goto out;
1089         }
1090
1091         if (range.me_start > i_size_read(inode))
1092                 goto out;
1093
1094         if (range.me_start + range.me_len > i_size_read(inode))
1095                         range.me_len = i_size_read(inode) - range.me_start;
1096
1097         context->range = &range;
1098
1099         if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) {
1100                 context->auto_defrag = 1;
1101                 if (!range.me_threshold)
1102                         /*
1103                          * ok, the default theshold for the defragmentation
1104                          * is 1M, since our maximum clustersize was 1M also.
1105                          * any thought?
1106                          */
1107                         range.me_threshold = 1024 * 1024;
1108                 if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG)
1109                         context->partial = 1;
1110         } else {
1111                 /*
1112                  * first best-effort attempt to validate and adjust the goal
1113                  * (physical address in block), while it can't guarantee later
1114                  * operation can succeed all the time since global_bitmap may
1115                  * change a bit over time.
1116                  */
1117
1118                 status = ocfs2_validate_and_adjust_move_goal(inode, &range);
1119                 if (status)
1120                         goto out;
1121         }
1122
1123         status = ocfs2_move_extents(context);
1124         if (status)
1125                 mlog_errno(status);
1126 out:
1127         /*
1128          * movement/defragmentation may end up being partially completed,
1129          * that's the reason why we need to return userspace the finished
1130          * length and new_offset even if failure happens somewhere.
1131          */
1132         if (argp) {
1133                 if (copy_to_user((struct ocfs2_move_extents *)argp, &range,
1134                                 sizeof(range)))
1135                         status = -EFAULT;
1136         }
1137
1138         kfree(context);
1139
1140         mnt_drop_write(filp->f_path.mnt);
1141
1142         return status;
1143 }