efc509b3af1ff45cc8747a0bcd6e3078d91dcdf0
[pandora-kernel.git] / fs / ocfs2 / move_extents.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * move_extents.c
5  *
6  * Copyright (C) 2011 Oracle.  All rights reserved.
7  *
8  * This program is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU General Public
10  * License version 2 as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * General Public License for more details.
16  */
17 #include <linux/fs.h>
18 #include <linux/types.h>
19 #include <linux/mount.h>
20 #include <linux/swap.h>
21
22 #include <cluster/masklog.h>
23
24 #include "ocfs2.h"
25 #include "ocfs2_ioctl.h"
26
27 #include "alloc.h"
28 #include "aops.h"
29 #include "dlmglue.h"
30 #include "extent_map.h"
31 #include "inode.h"
32 #include "journal.h"
33 #include "suballoc.h"
34 #include "uptodate.h"
35 #include "super.h"
36 #include "dir.h"
37 #include "buffer_head_io.h"
38 #include "sysfile.h"
39 #include "suballoc.h"
40 #include "refcounttree.h"
41 #include "move_extents.h"
42
/*
 * Per-call state for one OCFS2_IOC_MOVE_EXT request, shared by the
 * move/defrag helpers below.
 */
struct ocfs2_move_extents_context {
	struct inode *inode;		/* inode whose extents are being moved */
	struct file *file;		/* open file, used for page duplication */
	int auto_defrag;		/* non-zero: defrag mode (claim new clusters
					 * near each extent) rather than moving to
					 * a user-supplied goal */
	int partial;			/* non-zero: accept a partial cluster claim */
	int credits;			/* journal credits for the current cycle */
	u32 new_phys_cpos;		/* start cluster of the new location, set by
					 * __ocfs2_move_extent() */
	u32 clusters_moved;		/* running total of moved clusters */
	u64 refcount_loc;		/* refcount tree root block (from
					 * di->i_refcount_loc), 0 if none */
	struct ocfs2_move_extents *range;	/* the userspace request */
	struct ocfs2_extent_tree et;		/* extent tree of 'inode' */
	struct ocfs2_alloc_context *meta_ac;	/* metadata block reservation */
	struct ocfs2_alloc_context *data_ac;	/* data cluster reservation */
	struct ocfs2_cached_dealloc_ctxt dealloc;	/* deferred frees */
};
58
59 static int __ocfs2_move_extent(handle_t *handle,
60                                struct ocfs2_move_extents_context *context,
61                                u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos,
62                                int ext_flags)
63 {
64         int ret = 0, index;
65         struct inode *inode = context->inode;
66         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
67         struct ocfs2_extent_rec *rec, replace_rec;
68         struct ocfs2_path *path = NULL;
69         struct ocfs2_extent_list *el;
70         u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci);
71         u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos);
72
73         ret = ocfs2_duplicate_clusters_by_page(handle, context->file, cpos,
74                                                p_cpos, new_p_cpos, len);
75         if (ret) {
76                 mlog_errno(ret);
77                 goto out;
78         }
79
80         memset(&replace_rec, 0, sizeof(replace_rec));
81         replace_rec.e_cpos = cpu_to_le32(cpos);
82         replace_rec.e_leaf_clusters = cpu_to_le16(len);
83         replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb,
84                                                                    new_p_cpos));
85
86         path = ocfs2_new_path_from_et(&context->et);
87         if (!path) {
88                 ret = -ENOMEM;
89                 mlog_errno(ret);
90                 goto out;
91         }
92
93         ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos);
94         if (ret) {
95                 mlog_errno(ret);
96                 goto out;
97         }
98
99         el = path_leaf_el(path);
100
101         index = ocfs2_search_extent_list(el, cpos);
102         if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
103                 ocfs2_error(inode->i_sb,
104                             "Inode %llu has an extent at cpos %u which can no "
105                             "longer be found.\n",
106                             (unsigned long long)ino, cpos);
107                 ret = -EROFS;
108                 goto out;
109         }
110
111         rec = &el->l_recs[index];
112
113         BUG_ON(ext_flags != rec->e_flags);
114         /*
115          * after moving/defraging to new location, the extent is not going
116          * to be refcounted anymore.
117          */
118         replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED;
119
120         ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
121                                       context->et.et_root_bh,
122                                       OCFS2_JOURNAL_ACCESS_WRITE);
123         if (ret) {
124                 mlog_errno(ret);
125                 goto out;
126         }
127
128         ret = ocfs2_split_extent(handle, &context->et, path, index,
129                                  &replace_rec, context->meta_ac,
130                                  &context->dealloc);
131         if (ret) {
132                 mlog_errno(ret);
133                 goto out;
134         }
135
136         ocfs2_journal_dirty(handle, context->et.et_root_bh);
137
138         context->new_phys_cpos = new_p_cpos;
139
140         /*
141          * need I to append truncate log for old clusters?
142          */
143         if (old_blkno) {
144                 if (ext_flags & OCFS2_EXT_REFCOUNTED)
145                         ret = ocfs2_decrease_refcount(inode, handle,
146                                         ocfs2_blocks_to_clusters(osb->sb,
147                                                                  old_blkno),
148                                         len, context->meta_ac,
149                                         &context->dealloc, 1);
150                 else
151                         ret = ocfs2_truncate_log_append(osb, handle,
152                                                         old_blkno, len);
153         }
154
155 out:
156         return ret;
157 }
158
/*
 * Lock the allocators, reserving an appropriate number of bits for
 * metadata blocks and data clusters.
 *
 * 'clusters_to_move' and 'extents_to_split' size the reservation;
 * 'extra_blocks' is added on top of whatever btree-growth metadata the
 * extent tree may need.  On success, *credits is increased by the
 * journal credits needed to extend the tree by clusters_to_move + 2
 * clusters.
 *
 * In some cases we don't need to reserve clusters: just pass data_ac
 * as NULL.  On failure, a reserved *meta_ac is released and NULLed.
 */
static int ocfs2_lock_allocators_move_extents(struct inode *inode,
					struct ocfs2_extent_tree *et,
					u32 clusters_to_move,
					u32 extents_to_split,
					struct ocfs2_alloc_context **meta_ac,
					struct ocfs2_alloc_context **data_ac,
					int extra_blocks,
					int *credits)
{
	int ret, num_free_extents;
	/* each split may add two records; each moved cluster may add one */
	unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	num_free_extents = ocfs2_num_free_extents(osb, et);
	if (num_free_extents < 0) {
		ret = num_free_extents;
		mlog_errno(ret);
		goto out;
	}

	/*
	 * Reserve room to grow the tree when the leaf is full, or when a
	 * sparse-capable volume might not fit all the new records.
	 */
	if (!num_free_extents ||
	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
		extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);

	ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	if (data_ac) {
		ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	*credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el,
					      clusters_to_move + 2);

	mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n",
	     extra_blocks, clusters_to_move, *credits);
out:
	if (ret) {
		/*
		 * NOTE(review): assumes the caller passed in *meta_ac == NULL
		 * (callers hand over context->meta_ac); a stale non-NULL
		 * value would be freed here even if we never reserved it —
		 * confirm against the context initialization.
		 */
		if (*meta_ac) {
			ocfs2_free_alloc_context(*meta_ac);
			*meta_ac = NULL;
		}
	}

	return ret;
}
219
/*
 * Defragment one extent: claim a fresh run of clusters and move the
 * extent's data there, using one journal handle to guarantee data
 * consistency in case a crash happens anywhere.
 *
 * If the allocator returns fewer clusters than asked (new_len < *len):
 * with context->partial set, *len is updated to what was actually
 * moved; otherwise OCFS2_MOVE_EXT_FL_COMPLETE is cleared in the range
 * flags and -ENOSPC is returned.
 *
 * Lock order: refcount tree -> allocators -> truncate-log i_mutex ->
 * journal transaction.
 */
static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
			       u32 cpos, u32 phys_cpos, u32 *len, int ext_flags)
{
	int ret, credits = 0, extra_blocks = 0, partial = context->partial;
	handle_t *handle;
	struct inode *inode = context->inode;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct inode *tl_inode = osb->osb_tl_inode;
	struct ocfs2_refcount_tree *ref_tree = NULL;
	u32 new_phys_cpos, new_len;
	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);

	if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) {

		BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
			 OCFS2_HAS_REFCOUNT_FL));

		BUG_ON(!context->refcount_loc);

		ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
					       &ref_tree, NULL);
		if (ret) {
			mlog_errno(ret);
			return ret;
		}

		/* size the credits/blocks needed to CoW-break the extent */
		ret = ocfs2_prepare_refcount_change_for_del(inode,
							context->refcount_loc,
							phys_blkno,
							*len,
							&credits,
							&extra_blocks);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1,
						 &context->meta_ac,
						 &context->data_ac,
						 extra_blocks, &credits);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/*
	 * should be using allocation reservation strategy there?
	 *
	 * if (context->data_ac)
	 *	context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
	 */

	mutex_lock(&tl_inode->i_mutex);

	/* make room in the truncate log before appending the old clusters */
	if (ocfs2_truncate_log_needs_flush(osb)) {
		ret = __ocfs2_flush_truncate_log(osb);
		if (ret < 0) {
			mlog_errno(ret);
			goto out_unlock_mutex;
		}
	}

	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out_unlock_mutex;
	}

	ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len,
				     &new_phys_cpos, &new_len);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	/*
	 * allowing partial extent moving is kind of 'pros and cons', it makes
	 * whole defragmentation less likely to fail, on the contrary, the bad
	 * thing is it may make the fs even more fragmented after moving, let
	 * userspace make a good decision here.
	 */
	if (new_len != *len) {
		mlog(0, "len_claimed: %u, len: %u\n", new_len, *len);
		if (!partial) {
			context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE;
			ret = -ENOSPC;
			goto out_commit;
		}
	}

	mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos,
	     phys_cpos, new_phys_cpos);

	ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos,
				  new_phys_cpos, ext_flags);
	if (ret)
		mlog_errno(ret);

	/* report back how much was actually moved on a partial claim */
	if (partial && (new_len != *len))
		*len = new_len;

	/*
	 * Here we should write the new page out first if we are
	 * in write-back mode.
	 *
	 * NOTE(review): this overwrites 'ret' — a failure from
	 * __ocfs2_move_extent() above is lost if the writeback below
	 * succeeds.  Confirm that is intended.
	 */
	ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len);
	if (ret)
		mlog_errno(ret);

out_commit:
	ocfs2_commit_trans(osb, handle);

out_unlock_mutex:
	mutex_unlock(&tl_inode->i_mutex);

	if (context->data_ac) {
		ocfs2_free_alloc_context(context->data_ac);
		context->data_ac = NULL;
	}

	if (context->meta_ac) {
		ocfs2_free_alloc_context(context->meta_ac);
		context->meta_ac = NULL;
	}

out:
	if (ref_tree)
		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);

	return ret;
}
358
359 /*
360  * find the victim alloc group, where #blkno fits.
361  */
362 static int ocfs2_find_victim_alloc_group(struct inode *inode,
363                                          u64 vict_blkno,
364                                          int type, int slot,
365                                          int *vict_bit,
366                                          struct buffer_head **ret_bh)
367 {
368         int ret, i, blocks_per_unit = 1;
369         u64 blkno;
370         char namebuf[40];
371
372         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
373         struct buffer_head *ac_bh = NULL, *gd_bh = NULL;
374         struct ocfs2_chain_list *cl;
375         struct ocfs2_chain_rec *rec;
376         struct ocfs2_dinode *ac_dinode;
377         struct ocfs2_group_desc *bg;
378
379         ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot);
380         ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
381                                          strlen(namebuf), &blkno);
382         if (ret) {
383                 ret = -ENOENT;
384                 goto out;
385         }
386
387         ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh);
388         if (ret) {
389                 mlog_errno(ret);
390                 goto out;
391         }
392
393         ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data;
394         cl = &(ac_dinode->id2.i_chain);
395         rec = &(cl->cl_recs[0]);
396
397         if (type == GLOBAL_BITMAP_SYSTEM_INODE)
398                 blocks_per_unit <<= (osb->s_clustersize_bits -
399                                                 inode->i_sb->s_blocksize_bits);
400         /*
401          * 'vict_blkno' was out of the valid range.
402          */
403         if ((vict_blkno < le64_to_cpu(rec->c_blkno)) ||
404             (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) *
405                                 blocks_per_unit))) {
406                 ret = -EINVAL;
407                 goto out;
408         }
409
410         for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
411
412                 rec = &(cl->cl_recs[i]);
413                 if (!rec)
414                         continue;
415
416                 bg = NULL;
417
418                 do {
419                         if (!bg)
420                                 blkno = le64_to_cpu(rec->c_blkno);
421                         else
422                                 blkno = le64_to_cpu(bg->bg_next_group);
423
424                         if (gd_bh) {
425                                 brelse(gd_bh);
426                                 gd_bh = NULL;
427                         }
428
429                         ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh);
430                         if (ret) {
431                                 mlog_errno(ret);
432                                 goto out;
433                         }
434
435                         bg = (struct ocfs2_group_desc *)gd_bh->b_data;
436
437                         if (vict_blkno < (le64_to_cpu(bg->bg_blkno) +
438                                                 le16_to_cpu(bg->bg_bits))) {
439
440                                 *ret_bh = gd_bh;
441                                 *vict_bit = (vict_blkno - blkno) /
442                                                         blocks_per_unit;
443                                 mlog(0, "find the victim group: #%llu, "
444                                      "total_bits: %u, vict_bit: %u\n",
445                                      blkno, le16_to_cpu(bg->bg_bits),
446                                      *vict_bit);
447                                 goto out;
448                         }
449
450                 } while (le64_to_cpu(bg->bg_next_group));
451         }
452
453         ret = -EINVAL;
454 out:
455         brelse(ac_bh);
456
457         /*
458          * caller has to release the gd_bh properly.
459          */
460         return ret;
461 }
462
/*
 * XXX: helper to validate and adjust the moving goal supplied by
 * userspace in range->me_goal (a block number inside the global
 * bitmap).  Adjusts the goal to cluster alignment and bumps it off a
 * group-descriptor block; rejects ranges that would cross a group.
 */
static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
					       struct ocfs2_move_extents *range)
{
	int ret, goal_bit = 0;

	struct buffer_head *gd_bh = NULL;
	struct ocfs2_group_desc *bg;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	/* blocks per cluster */
	int c_to_b = 1 << (osb->s_clustersize_bits -
					inode->i_sb->s_blocksize_bits);

	/*
	 * validate goal sits within global_bitmap, and return the victim
	 * group desc
	 */
	ret = ocfs2_find_victim_alloc_group(inode, range->me_goal,
					    GLOBAL_BITMAP_SYSTEM_INODE,
					    OCFS2_INVALID_SLOT,
					    &goal_bit, &gd_bh);
	if (ret)
		goto out;

	bg = (struct ocfs2_group_desc *)gd_bh->b_data;

	/*
	 * make goal become cluster aligned.
	 */
	if (range->me_goal % c_to_b)
		range->me_goal = range->me_goal / c_to_b * c_to_b;

	/*
	 * the moving goal is not allowed to start with a group desc block
	 * (#0 blk of the group); compromise to the next cluster instead.
	 */
	if (range->me_goal == le64_to_cpu(bg->bg_blkno))
		range->me_goal += c_to_b;

	/*
	 * movement is not gonna cross two groups.
	 */
	if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize <
								range->me_len) {
		ret = -EINVAL;
		goto out;
	}
	/*
	 * more exact validations/adjustments will be performed later during
	 * moving operation for each extent range.
	 */
	mlog(0, "extents get ready to be moved to #%llu block\n",
	     range->me_goal);

out:
	brelse(gd_bh);

	return ret;
}
523
/*
 * Scan the victim group's bitmap, starting at *goal_bit, for a run of
 * 'move_len' contiguous free bits.  Gives up once a used bit more than
 * 'max_hop' bits past the starting point is hit, storing 0 through
 * *phys_cpos to signal failure; otherwise stores the found bit/cluster
 * through *goal_bit and *phys_cpos.
 */
static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
				    int *goal_bit, u32 move_len, u32 max_hop,
				    u32 *phys_cpos)
{
	int i, used, last_free_bits = 0, base_bit = *goal_bit;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
	u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
						 le64_to_cpu(gd->bg_blkno));

	for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) {

		used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap);
		if (used) {
			/*
			 * we even tried searching the free chunk by jumping
			 * a 'max_hop' distance, but still failed.
			 */
			if ((i - base_bit) > max_hop) {
				*phys_cpos = 0;	/* tell the caller we failed */
				break;
			}

			/* a used bit ends the current free run */
			if (last_free_bits)
				last_free_bits = 0;

			continue;
		} else
			last_free_bits++;

		if (last_free_bits == move_len) {
			/*
			 * NOTE(review): 'i' here is the LAST bit of the free
			 * run, so *goal_bit/*phys_cpos name the run's end
			 * rather than its start — confirm this is the
			 * contract expected by ocfs2_move_extent() and
			 * ocfs2_block_group_set_bits().
			 */
			*goal_bit = i;
			*phys_cpos = base_cpos + i;
			break;
		}
	}

	/* NOTE(review): if the loop ends without a run and without exceeding
	 * max_hop, *phys_cpos is left untouched — caller must pre-set it. */
	mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos);
}
562
563 static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
564                                        handle_t *handle,
565                                        struct buffer_head *di_bh,
566                                        u32 num_bits,
567                                        u16 chain)
568 {
569         int ret;
570         u32 tmp_used;
571         struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
572         struct ocfs2_chain_list *cl =
573                                 (struct ocfs2_chain_list *) &di->id2.i_chain;
574
575         ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
576                                       OCFS2_JOURNAL_ACCESS_WRITE);
577         if (ret < 0) {
578                 mlog_errno(ret);
579                 goto out;
580         }
581
582         tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
583         di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
584         le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
585         ocfs2_journal_dirty(handle, di_bh);
586
587 out:
588         return ret;
589 }
590
/*
 * Mark 'num_bits' bits, starting at 'bit_off', as allocated in the
 * group descriptor's bitmap under the given journal handle.  Cluster
 * bitmaps are journalled with UNDO access so an aborted transaction
 * can restore them; other allocators use plain WRITE access.
 *
 * Returns 0 on success, a negative errno on journal failure, or
 * -EROFS if the descriptor's counters are found corrupt.
 */
static inline int ocfs2_block_group_set_bits(handle_t *handle,
					     struct inode *alloc_inode,
					     struct ocfs2_group_desc *bg,
					     struct buffer_head *group_bh,
					     unsigned int bit_off,
					     unsigned int num_bits)
{
	int status;
	void *bitmap = bg->bg_bitmap;
	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;

	/* All callers get the descriptor via
	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);

	mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
	     num_bits);

	/* cluster bitmaps need undo journaling for crash recovery */
	if (ocfs2_is_cluster_bitmap(alloc_inode))
		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;

	status = ocfs2_journal_access_gd(handle,
					 INODE_CACHE(alloc_inode),
					 group_bh,
					 journal_type);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
	if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
		/*
		 * NOTE(review): this returns with bg_free_bits_count already
		 * decremented and without journal_dirty; presumably moot
		 * because ocfs2_error() takes the fs read-only — confirm.
		 */
		ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
			    " count %u but claims %u are freed. num_bits %d",
			    (unsigned long long)le64_to_cpu(bg->bg_blkno),
			    le16_to_cpu(bg->bg_bits),
			    le16_to_cpu(bg->bg_free_bits_count), num_bits);
		return -EROFS;
	}
	while (num_bits--)
		ocfs2_set_bit(bit_off++, bitmap);

	ocfs2_journal_dirty(handle, group_bh);

bail:
	return status;
}
639
640 static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
641                              u32 cpos, u32 phys_cpos, u32 *new_phys_cpos,
642                              u32 len, int ext_flags)
643 {
644         int ret, credits = 0, extra_blocks = 0, goal_bit = 0;
645         handle_t *handle;
646         struct inode *inode = context->inode;
647         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
648         struct inode *tl_inode = osb->osb_tl_inode;
649         struct inode *gb_inode = NULL;
650         struct buffer_head *gb_bh = NULL;
651         struct buffer_head *gd_bh = NULL;
652         struct ocfs2_group_desc *gd;
653         struct ocfs2_refcount_tree *ref_tree = NULL;
654         u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb,
655                                                     context->range->me_threshold);
656         u64 phys_blkno, new_phys_blkno;
657
658         phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
659
660         if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) {
661
662                 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
663                          OCFS2_HAS_REFCOUNT_FL));
664
665                 BUG_ON(!context->refcount_loc);
666
667                 ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
668                                                &ref_tree, NULL);
669                 if (ret) {
670                         mlog_errno(ret);
671                         return ret;
672                 }
673
674                 ret = ocfs2_prepare_refcount_change_for_del(inode,
675                                                         context->refcount_loc,
676                                                         phys_blkno,
677                                                         len,
678                                                         &credits,
679                                                         &extra_blocks);
680                 if (ret) {
681                         mlog_errno(ret);
682                         goto out;
683                 }
684         }
685
686         ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1,
687                                                  &context->meta_ac,
688                                                  NULL, extra_blocks, &credits);
689         if (ret) {
690                 mlog_errno(ret);
691                 goto out;
692         }
693
694         /*
695          * need to count 2 extra credits for global_bitmap inode and
696          * group descriptor.
697          */
698         credits += OCFS2_INODE_UPDATE_CREDITS + 1;
699
700         /*
701          * ocfs2_move_extent() didn't reserve any clusters in lock_allocators()
702          * logic, while we still need to lock the global_bitmap.
703          */
704         gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
705                                                OCFS2_INVALID_SLOT);
706         if (!gb_inode) {
707                 mlog(ML_ERROR, "unable to get global_bitmap inode\n");
708                 ret = -EIO;
709                 goto out;
710         }
711
712         mutex_lock(&gb_inode->i_mutex);
713
714         ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1);
715         if (ret) {
716                 mlog_errno(ret);
717                 goto out_unlock_gb_mutex;
718         }
719
720         mutex_lock(&tl_inode->i_mutex);
721
722         handle = ocfs2_start_trans(osb, credits);
723         if (IS_ERR(handle)) {
724                 ret = PTR_ERR(handle);
725                 mlog_errno(ret);
726                 goto out_unlock_tl_inode;
727         }
728
729         new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos);
730         ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno,
731                                             GLOBAL_BITMAP_SYSTEM_INODE,
732                                             OCFS2_INVALID_SLOT,
733                                             &goal_bit, &gd_bh);
734         if (ret) {
735                 mlog_errno(ret);
736                 goto out_commit;
737         }
738
739         /*
740          * probe the victim cluster group to find a proper
741          * region to fit wanted movement, it even will perfrom
742          * a best-effort attempt by compromising to a threshold
743          * around the goal.
744          */
745         ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop,
746                                 new_phys_cpos);
747         if (!new_phys_cpos) {
748                 ret = -ENOSPC;
749                 goto out_commit;
750         }
751
752         ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos,
753                                   *new_phys_cpos, ext_flags);
754         if (ret) {
755                 mlog_errno(ret);
756                 goto out_commit;
757         }
758
759         gd = (struct ocfs2_group_desc *)gd_bh->b_data;
760         ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len,
761                                                le16_to_cpu(gd->bg_chain));
762         if (ret) {
763                 mlog_errno(ret);
764                 goto out_commit;
765         }
766
767         ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh,
768                                          goal_bit, len);
769         if (ret)
770                 mlog_errno(ret);
771
772         /*
773          * Here we should write the new page out first if we are
774          * in write-back mode.
775          */
776         ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len);
777         if (ret)
778                 mlog_errno(ret);
779
780 out_commit:
781         ocfs2_commit_trans(osb, handle);
782         brelse(gd_bh);
783
784 out_unlock_tl_inode:
785         mutex_unlock(&tl_inode->i_mutex);
786
787         ocfs2_inode_unlock(gb_inode, 1);
788 out_unlock_gb_mutex:
789         mutex_unlock(&gb_inode->i_mutex);
790         brelse(gb_bh);
791         iput(gb_inode);
792
793 out:
794         if (context->meta_ac) {
795                 ocfs2_free_alloc_context(context->meta_ac);
796                 context->meta_ac = NULL;
797         }
798
799         if (ref_tree)
800                 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
801
802         return ret;
803 }
804
805 /*
806  * Helper to calculate the defraging length in one run according to threshold.
807  */
/*
 * Helper to calculate the defraging length in one run according to threshold.
 *
 * Accumulates extents into a defrag run until 'threshold' clusters have
 * been gathered.  A single extent already at/above the threshold is
 * skipped; otherwise the current extent is trimmed so the run totals
 * exactly 'threshold' clusters and the run length is reset.
 */
static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged,
					 u32 threshold, int *skip)
{
	u32 total = *alloc_size + *len_defraged;

	if (total < threshold) {
		/* keep accumulating until we meet the threshold */
		*len_defraged = total;
		return;
	}

	if (*len_defraged == 0) {
		/* XXX: a lone extent already >= threshold, skip it */
		*skip = 1;
		return;
	}

	/*
	 * split this extent to coalesce with former pieces as
	 * to reach the threshold.
	 *
	 * we're done here with one cycle of defragmentation
	 * in a size of 'thresh', resetting 'len_defraged'
	 * forces a new defragmentation.
	 */
	*alloc_size = threshold - *len_defraged;
	*len_defraged = 0;
}
834
/*
 * Walk the requested byte range cluster by cluster and either defrag
 * each extent in place (auto-defrag mode) or relocate it toward the
 * caller-supplied goal (move mode).
 *
 * On every exit path the bytes actually handled and the new offset are
 * written back into context->range so userspace learns how far a
 * partially-completed run got; OCFS2_MOVE_EXT_FL_COMPLETE is only set
 * when the whole range was processed.
 *
 * Returns 0 on success (including the trivial empty-file/empty-range
 * and inline-data cases), negative errno otherwise.
 */
static int __ocfs2_move_extents_range(struct buffer_head *di_bh,
				struct ocfs2_move_extents_context *context)
{
	int ret = 0, flags, do_defrag, skip = 0;
	u32 cpos, phys_cpos, move_start, len_to_move, alloc_size;
	u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0;

	struct inode *inode = context->inode;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct ocfs2_move_extents *range = context->range;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	/* nothing to do for an empty file or an empty request */
	if ((inode->i_size == 0) || (range->me_len == 0))
		return 0;

	/* inline data lives in the inode block itself; no extents to move */
	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
		return 0;

	context->refcount_loc = le64_to_cpu(di->i_refcount_loc);

	ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh);
	ocfs2_init_dealloc_ctxt(&context->dealloc);

	/*
	 * TO-DO XXX:
	 *
	 * - xattr extents.
	 */

	do_defrag = context->auto_defrag;

	/*
	 * extents moving happens in unit of clusters, for the sake
	 * of simplicity, we may ignore two clusters where 'byte_start'
	 * and 'byte_start + len' were within.
	 */
	move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start);
	len_to_move = (range->me_start + range->me_len) >>
						osb->s_clustersize_bits;
	if (len_to_move >= move_start)
		len_to_move -= move_start;
	else
		len_to_move = 0;

	if (do_defrag)
		/* threshold arrives in bytes; convert to clusters */
		defrag_thresh = range->me_threshold >> osb->s_clustersize_bits;
	else
		/* move mode: me_goal was validated/adjusted by the caller */
		new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
							 range->me_goal);

	mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, "
	     "thresh: %u\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     (unsigned long long)range->me_start,
	     (unsigned long long)range->me_len,
	     move_start, len_to_move, defrag_thresh);

	cpos = move_start;
	while (len_to_move) {
		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size,
					 &flags);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/* clamp the last extent to the remaining request length */
		if (alloc_size > len_to_move)
			alloc_size = len_to_move;

		/*
		 * XXX: how to deal with a hole:
		 *
		 * - skip the hole of course
		 * - force a new defragmentation
		 */
		if (!phys_cpos) {
			if (do_defrag)
				len_defraged = 0;

			goto next;
		}

		if (do_defrag) {
			ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged,
						     defrag_thresh, &skip);
			/*
			 * skip large extents
			 */
			if (skip) {
				skip = 0;
				goto next;
			}

			mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, "
			     "alloc_size: %u, len_defraged: %u\n",
			     cpos, phys_cpos, alloc_size, len_defraged);

			/*
			 * NOTE(review): ocfs2_defrag_extent() may shrink
			 * alloc_size (partial defrag) — presumably why it
			 * takes it by pointer; confirm against its definition.
			 */
			ret = ocfs2_defrag_extent(context, cpos, phys_cpos,
						  &alloc_size, flags);
		} else {
			ret = ocfs2_move_extent(context, cpos, phys_cpos,
						&new_phys_cpos, alloc_size,
						flags);

			/* next extent lands right after the one just moved */
			new_phys_cpos += alloc_size;
		}

		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}

		context->clusters_moved += alloc_size;
next:
		cpos += alloc_size;
		len_to_move -= alloc_size;
	}

	/* the entire requested range was processed */
	range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE;

out:
	/* report progress to userspace even on a partial/failed run */
	range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb,
						      context->clusters_moved);
	range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb,
						       context->new_phys_cpos);

	ocfs2_schedule_truncate_log_flush(osb, 1);
	ocfs2_run_deallocs(osb, &context->dealloc);

	return ret;
}
966
967 static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
968 {
969         int status;
970         handle_t *handle;
971         struct inode *inode = context->inode;
972         struct ocfs2_dinode *di;
973         struct buffer_head *di_bh = NULL;
974         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
975
976         if (!inode)
977                 return -ENOENT;
978
979         if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
980                 return -EROFS;
981
982         mutex_lock(&inode->i_mutex);
983
984         /*
985          * This prevents concurrent writes from other nodes
986          */
987         status = ocfs2_rw_lock(inode, 1);
988         if (status) {
989                 mlog_errno(status);
990                 goto out;
991         }
992
993         status = ocfs2_inode_lock(inode, &di_bh, 1);
994         if (status) {
995                 mlog_errno(status);
996                 goto out_rw_unlock;
997         }
998
999         /*
1000          * rememer ip_xattr_sem also needs to be held if necessary
1001          */
1002         down_write(&OCFS2_I(inode)->ip_alloc_sem);
1003
1004         status = __ocfs2_move_extents_range(di_bh, context);
1005
1006         up_write(&OCFS2_I(inode)->ip_alloc_sem);
1007         if (status) {
1008                 mlog_errno(status);
1009                 goto out_inode_unlock;
1010         }
1011
1012         /*
1013          * We update ctime for these changes
1014          */
1015         handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1016         if (IS_ERR(handle)) {
1017                 status = PTR_ERR(handle);
1018                 mlog_errno(status);
1019                 goto out_inode_unlock;
1020         }
1021
1022         status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
1023                                          OCFS2_JOURNAL_ACCESS_WRITE);
1024         if (status) {
1025                 mlog_errno(status);
1026                 goto out_commit;
1027         }
1028
1029         di = (struct ocfs2_dinode *)di_bh->b_data;
1030         inode->i_ctime = CURRENT_TIME;
1031         di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
1032         di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
1033
1034         ocfs2_journal_dirty(handle, di_bh);
1035
1036 out_commit:
1037         ocfs2_commit_trans(osb, handle);
1038
1039 out_inode_unlock:
1040         brelse(di_bh);
1041         ocfs2_inode_unlock(inode, 1);
1042 out_rw_unlock:
1043         ocfs2_rw_unlock(inode, 1);
1044 out:
1045         mutex_unlock(&inode->i_mutex);
1046
1047         return status;
1048 }
1049
1050 int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
1051 {
1052         int status;
1053
1054         struct inode *inode = filp->f_path.dentry->d_inode;
1055         struct ocfs2_move_extents range;
1056         struct ocfs2_move_extents_context *context = NULL;
1057
1058         status = mnt_want_write(filp->f_path.mnt);
1059         if (status)
1060                 return status;
1061
1062         if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE))
1063                 goto out;
1064
1065         if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
1066                 status = -EPERM;
1067                 goto out;
1068         }
1069
1070         context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS);
1071         if (!context) {
1072                 status = -ENOMEM;
1073                 mlog_errno(status);
1074                 goto out;
1075         }
1076
1077         context->inode = inode;
1078         context->file = filp;
1079
1080         if (argp) {
1081                 if (copy_from_user(&range, (struct ocfs2_move_extents *)argp,
1082                                    sizeof(range))) {
1083                         status = -EFAULT;
1084                         goto out;
1085                 }
1086         } else {
1087                 status = -EINVAL;
1088                 goto out;
1089         }
1090
1091         if (range.me_start > i_size_read(inode))
1092                 goto out;
1093
1094         if (range.me_start + range.me_len > i_size_read(inode))
1095                         range.me_len = i_size_read(inode) - range.me_start;
1096
1097         context->range = &range;
1098
1099         if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) {
1100                 context->auto_defrag = 1;
1101                 if (!range.me_threshold)
1102                         /*
1103                          * ok, the default theshold for the defragmentation
1104                          * is 1M, since our maximum clustersize was 1M also.
1105                          * any thought?
1106                          */
1107                         range.me_threshold = 1024 * 1024;
1108                 if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG)
1109                         context->partial = 1;
1110         } else {
1111                 /*
1112                  * first best-effort attempt to validate and adjust the goal
1113                  * (physical address in block), while it can't guarantee later
1114                  * operation can succeed all the time since global_bitmap may
1115                  * change a bit over time.
1116                  */
1117
1118                 status = ocfs2_validate_and_adjust_move_goal(inode, &range);
1119                 if (status)
1120                         goto out;
1121         }
1122
1123         status = ocfs2_move_extents(context);
1124         if (status)
1125                 mlog_errno(status);
1126 out:
1127         /*
1128          * movement/defragmentation may end up being partially completed,
1129          * that's the reason why we need to return userspace the finished
1130          * length and new_offset even if failure happens somewhere.
1131          */
1132         if (argp) {
1133                 if (copy_to_user((struct ocfs2_move_extents *)argp, &range,
1134                                 sizeof(range)))
1135                         status = -EFAULT;
1136         }
1137
1138         kfree(context);
1139
1140         mnt_drop_write(filp->f_path.mnt);
1141
1142         return status;
1143 }