fs/xfs/xfs_filestream.c

   1 /*
   2  * Copyright (c) 2006-2007 Silicon Graphics, Inc.
   3  * All Rights Reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it would be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write the Free Software Foundation,
  16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18 #include "xfs.h"
  19 #include "xfs_bmap_btree.h"
  20 #include "xfs_inum.h"
  21 #include "xfs_dinode.h"
  22 #include "xfs_inode.h"
  23 #include "xfs_ag.h"
  24 #include "xfs_log.h"
  25 #include "xfs_trans.h"
  26 #include "xfs_sb.h"
  27 #include "xfs_mount.h"
  28 #include "xfs_bmap.h"
  29 #include "xfs_alloc.h"
  30 #include "xfs_utils.h"
  31 #include "xfs_mru_cache.h"
  32 #include "xfs_filestream.h"
  33 #include "xfs_trace.h"
  34
  35 #ifdef XFS_FILESTREAMS_TRACE
  36
  37 ktrace_t *xfs_filestreams_trace_buf;
  38
  39 STATIC void
  40 xfs_filestreams_trace(
  41         xfs_mount_t     *mp,    /* mount point */
  42         int             type,   /* type of trace */
  43         const char      *func,  /* source function */
  44         int             line,   /* source line number */
  45         __psunsigned_t  arg0,
  46         __psunsigned_t  arg1,
  47         __psunsigned_t  arg2,
  48         __psunsigned_t  arg3,
  49         __psunsigned_t  arg4,
  50         __psunsigned_t  arg5)
  51 {
  52         ktrace_enter(xfs_filestreams_trace_buf,
  53                 (void *)(__psint_t)(type | (line << 16)),
  54                 (void *)func,
  55                 (void *)(__psunsigned_t)current_pid(),
  56                 (void *)mp,
  57                 (void *)(__psunsigned_t)arg0,
  58                 (void *)(__psunsigned_t)arg1,
  59                 (void *)(__psunsigned_t)arg2,
  60                 (void *)(__psunsigned_t)arg3,
  61                 (void *)(__psunsigned_t)arg4,
  62                 (void *)(__psunsigned_t)arg5,
  63                 NULL, NULL, NULL, NULL, NULL, NULL);
  64 }
  65
  66 #define TRACE0(mp,t)                    TRACE6(mp,t,0,0,0,0,0,0)
  67 #define TRACE1(mp,t,a0)                 TRACE6(mp,t,a0,0,0,0,0,0)
  68 #define TRACE2(mp,t,a0,a1)              TRACE6(mp,t,a0,a1,0,0,0,0)
  69 #define TRACE3(mp,t,a0,a1,a2)           TRACE6(mp,t,a0,a1,a2,0,0,0)
  70 #define TRACE4(mp,t,a0,a1,a2,a3)        TRACE6(mp,t,a0,a1,a2,a3,0,0)
  71 #define TRACE5(mp,t,a0,a1,a2,a3,a4)     TRACE6(mp,t,a0,a1,a2,a3,a4,0)
  72 #define TRACE6(mp,t,a0,a1,a2,a3,a4,a5) \
  73         xfs_filestreams_trace(mp, t, __func__, __LINE__, \
  74                                 (__psunsigned_t)a0, (__psunsigned_t)a1, \
  75                                 (__psunsigned_t)a2, (__psunsigned_t)a3, \
  76                                 (__psunsigned_t)a4, (__psunsigned_t)a5)
  77
  78 #define TRACE_AG_SCAN(mp, ag, ag2) \
  79                 TRACE2(mp, XFS_FSTRM_KTRACE_AGSCAN, ag, ag2);
  80 #define TRACE_AG_PICK1(mp, max_ag, maxfree) \
  81                 TRACE2(mp, XFS_FSTRM_KTRACE_AGPICK1, max_ag, maxfree);
  82 #define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag) \
  83                 TRACE6(mp, XFS_FSTRM_KTRACE_AGPICK2, ag, ag2, \
  84                          cnt, free, scan, flag)
  85 #define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2) \
  86                 TRACE5(mp, XFS_FSTRM_KTRACE_UPDATE, ip, ag, cnt, ag2, cnt2)
  87 #define TRACE_FREE(mp, ip, pip, ag, cnt) \
  88                 TRACE4(mp, XFS_FSTRM_KTRACE_FREE, ip, pip, ag, cnt)
  89 #define TRACE_LOOKUP(mp, ip, pip, ag, cnt) \
  90                 TRACE4(mp, XFS_FSTRM_KTRACE_ITEM_LOOKUP, ip, pip, ag, cnt)
  91 #define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt) \
  92                 TRACE4(mp, XFS_FSTRM_KTRACE_ASSOCIATE, ip, pip, ag, cnt)
  93 #define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt) \
  94                 TRACE6(mp, XFS_FSTRM_KTRACE_MOVEAG, ip, pip, oag, ocnt, nag, ncnt)
  95 #define TRACE_ORPHAN(mp, ip, ag) \
  96                 TRACE2(mp, XFS_FSTRM_KTRACE_ORPHAN, ip, ag);
  97
  98
  99 #else
 100 #define TRACE_AG_SCAN(mp, ag, ag2)
 101 #define TRACE_AG_PICK1(mp, max_ag, maxfree)
 102 #define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag)
 103 #define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2)
 104 #define TRACE_FREE(mp, ip, pip, ag, cnt)
 105 #define TRACE_LOOKUP(mp, ip, pip, ag, cnt)
 106 #define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt)
 107 #define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt)
 108 #define TRACE_ORPHAN(mp, ip, ag)
 109 #endif
 110
 111 static kmem_zone_t *item_zone;
 112
 113 /*
 114  * Structure for associating a file or a directory with an allocation group.
 115  * The parent directory pointer is only needed for files, but since there will
 116  * generally be vastly more files than directories in the cache, using the same
 117  * data structure simplifies the code with very little memory overhead.
 118  */
 119 typedef struct fstrm_item
 120 {
 121         xfs_agnumber_t  ag;     /* AG currently in use for the file/directory. */
 122         xfs_inode_t     *ip;    /* inode self-pointer. */
 123         xfs_inode_t     *pip;   /* Parent directory inode pointer. */
 124 } fstrm_item_t;
 125
 126 /*
 127  * Allocation group filestream associations are tracked with per-ag atomic
 128  * counters.  These counters allow _xfs_filestream_pick_ag() to tell whether a
 129  * particular AG already has active filestreams associated with it. The mount
 130  * point's m_peraglock is used to protect these counters from per-ag array
 131  * re-allocation during a growfs operation.  When xfs_growfs_data_private() is
 132  * about to reallocate the array, it calls xfs_filestream_flush() with the
 133  * m_peraglock held in write mode.
 134  *
 135  * Since xfs_mru_cache_flush() guarantees that all the free functions for all
 136  * the cache elements have finished executing before it returns, it's safe for
 137  * the free functions to use the atomic counters without m_peraglock protection.
 138  * This allows the implementation of xfs_fstrm_free_func() to be agnostic about
 139  * whether it was called with the m_peraglock held in read mode, write mode or
 140  * not held at all.  The race condition this addresses is the following:
 141  *
 142  *  - The work queue scheduler fires and pulls a filestream directory cache
 143  *    element off the LRU end of the cache for deletion, then gets pre-empted.
 144  *  - A growfs operation grabs the m_peraglock in write mode, flushes all the
 145  *    remaining items from the cache and reallocates the mount point's per-ag
 146  *    array, resetting all the counters to zero.
 147  *  - The work queue thread resumes and calls the free function for the element
 148  *    it started cleaning up earlier.  In the process it decrements the
 149  *    filestreams counter for an AG that now has no references.
 150  *
 151  * With a shrinkfs feature, the above scenario could panic the system.
 152  *
 153  * All other uses of the following macros should be protected by either the
 154  * m_peraglock held in read mode, or the cache's internal locking exposed by the
 155  * interval between a call to xfs_mru_cache_lookup() and a call to
 156  * xfs_mru_cache_done().  In addition, the m_peraglock must be held in read mode
 157  * when new elements are added to the cache.
 158  *
 159  * Combined, these locking rules ensure that no associations will ever exist in
 160  * the cache that reference per-ag array elements that have since been
 161  * reallocated.
 162  */
 163 static int
 164 xfs_filestream_peek_ag(
 165         xfs_mount_t     *mp,
 166         xfs_agnumber_t  agno)
 167 {
 168         struct xfs_perag *pag;
 169         int             ret;
 170
 171         pag = xfs_perag_get(mp, agno);
 172         ret = atomic_read(&pag->pagf_fstrms);
 173         xfs_perag_put(pag);
 174         return ret;
 175 }
 176
 177 static int
 178 xfs_filestream_get_ag(
 179         xfs_mount_t     *mp,
 180         xfs_agnumber_t  agno)
 181 {
 182         struct xfs_perag *pag;
 183         int             ret;
 184
 185         pag = xfs_perag_get(mp, agno);
 186         ret = atomic_inc_return(&pag->pagf_fstrms);
 187         xfs_perag_put(pag);
 188         return ret;
 189 }
 190
 191 static void
 192 xfs_filestream_put_ag(
 193         xfs_mount_t     *mp,
 194         xfs_agnumber_t  agno)
 195 {
 196         struct xfs_perag *pag;
 197
 198         pag = xfs_perag_get(mp, agno);
 199         atomic_dec(&pag->pagf_fstrms);
 200         xfs_perag_put(pag);
 201 }
 202
 203 /*
 204  * Scan the AGs starting at startag looking for an AG that isn't in use and has
 205  * at least minlen blocks free.
 206  */
 207 static int
 208 _xfs_filestream_pick_ag(
 209         xfs_mount_t     *mp,
 210         xfs_agnumber_t  startag,
 211         xfs_agnumber_t  *agp,
 212         int             flags,
 213         xfs_extlen_t    minlen)
 214 {
 215         int             streams, max_streams;
 216         int             err, trylock, nscan;
 217         xfs_extlen_t    longest, free, minfree, maxfree = 0;
 218         xfs_agnumber_t  ag, max_ag = NULLAGNUMBER;
 219         struct xfs_perag *pag;
 220
 221         /* 2% of an AG's blocks must be free for it to be chosen. */
 222         minfree = mp->m_sb.sb_agblocks / 50;
 223
 224         ag = startag;
 225         *agp = NULLAGNUMBER;
 226
 227         /* For the first pass, don't sleep trying to init the per-AG. */
 228         trylock = XFS_ALLOC_FLAG_TRYLOCK;
 229
 230         for (nscan = 0; 1; nscan++) {
 231                 pag = xfs_perag_get(mp, ag);
 232                 TRACE_AG_SCAN(mp, ag, atomic_read(&pag->pagf_fstrms));
 233
 234                 if (!pag->pagf_init) {
 235                         err = xfs_alloc_pagf_init(mp, NULL, ag, trylock);
 236                         if (err && !trylock) {
 237                                 xfs_perag_put(pag);
 238                                 return err;
 239                         }
 240                 }
 241
 242                 /* Might fail sometimes during the 1st pass with trylock set. */
 243                 if (!pag->pagf_init)
 244                         goto next_ag;
 245
 246                 /* Keep track of the AG with the most free blocks. */
 247                 if (pag->pagf_freeblks > maxfree) {
 248                         maxfree = pag->pagf_freeblks;
 249                         max_streams = atomic_read(&pag->pagf_fstrms);
 250                         max_ag = ag;
 251                 }
 252
 253                 /*
 254                  * The AG reference count does two things: it enforces mutual
 255                  * exclusion when examining the suitability of an AG in this
 256                  * loop, and it guards against two filestreams being established
 257                  * in the same AG as each other.
 258                  */
 259                 if (xfs_filestream_get_ag(mp, ag) > 1) {
 260                         xfs_filestream_put_ag(mp, ag);
 261                         goto next_ag;
 262                 }
 263
 264                 longest = xfs_alloc_longest_free_extent(mp, pag);
 265                 if (((minlen && longest >= minlen) ||
 266                      (!minlen && pag->pagf_freeblks >= minfree)) &&
 267                     (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) ||
 268                      (flags & XFS_PICK_LOWSPACE))) {
 269
 270                         /* Break out, retaining the reference on the AG. */
 271                         free = pag->pagf_freeblks;
 272                         streams = atomic_read(&pag->pagf_fstrms);
 273                         xfs_perag_put(pag);
 274                         *agp = ag;
 275                         break;
 276                 }
 277
 278                 /* Drop the reference on this AG, it's not usable. */
 279                 xfs_filestream_put_ag(mp, ag);
 280 next_ag:
 281                 xfs_perag_put(pag);
 282                 /* Move to the next AG, wrapping to AG 0 if necessary. */
 283                 if (++ag >= mp->m_sb.sb_agcount)
 284                         ag = 0;
 285
 286                 /* If a full pass of the AGs hasn't been done yet, continue. */
 287                 if (ag != startag)
 288                         continue;
 289
 290                 /* Allow sleeping in xfs_alloc_pagf_init() on the 2nd pass. */
 291                 if (trylock != 0) {
 292                         trylock = 0;
 293                         continue;
 294                 }
 295
 296                 /* Finally, if lowspace wasn't set, set it for the 3rd pass. */
 297                 if (!(flags & XFS_PICK_LOWSPACE)) {
 298                         flags |= XFS_PICK_LOWSPACE;
 299                         continue;
 300                 }
 301
 302                 /*
 303                  * Take the AG with the most free space, regardless of whether
 304                  * it's already in use by another filestream.
 305                  */
 306                 if (max_ag != NULLAGNUMBER) {
 307                         xfs_filestream_get_ag(mp, max_ag);
 308                         TRACE_AG_PICK1(mp, max_ag, maxfree);
 309                         streams = max_streams;
 310                         free = maxfree;
 311                         *agp = max_ag;
 312                         break;
 313                 }
 314
 315                 /* take AG 0 if none matched */
 316                 TRACE_AG_PICK1(mp, max_ag, maxfree);
 317                 *agp = 0;
 318                 return 0;
 319         }
 320
 321         TRACE_AG_PICK2(mp, startag, *agp, streams, free, nscan, flags);
 322
 323         return 0;
 324 }
 325
 326 /*
 327  * Set the allocation group number for a file or a directory, updating inode
 328  * references and per-AG references as appropriate.
 329  */
 330 static int
 331 _xfs_filestream_update_ag(
 332         xfs_inode_t     *ip,
 333         xfs_inode_t     *pip,
 334         xfs_agnumber_t  ag)
 335 {
 336         int             err = 0;
 337         xfs_mount_t     *mp;
 338         xfs_mru_cache_t *cache;
 339         fstrm_item_t    *item;
 340         xfs_agnumber_t  old_ag;
 341         xfs_inode_t     *old_pip;
 342
 343         /*
 344          * Either ip is a regular file and pip is a directory, or ip is a
 345          * directory and pip is NULL.
 346          */
 347         ASSERT(ip && ((S_ISREG(ip->i_d.di_mode) && pip &&
 348                        S_ISDIR(pip->i_d.di_mode)) ||
 349                       (S_ISDIR(ip->i_d.di_mode) && !pip)));
 350
 351         mp = ip->i_mount;
 352         cache = mp->m_filestream;
 353
 354         item = xfs_mru_cache_lookup(cache, ip->i_ino);
 355         if (item) {
 356                 ASSERT(item->ip == ip);
 357                 old_ag = item->ag;
 358                 item->ag = ag;
 359                 old_pip = item->pip;
 360                 item->pip = pip;
 361                 xfs_mru_cache_done(cache);
 362
 363                 /*
 364                  * If the AG has changed, drop the old ref and take a new one,
 365                  * effectively transferring the reference from old to new AG.
 366                  */
 367                 if (ag != old_ag) {
 368                         xfs_filestream_put_ag(mp, old_ag);
 369                         xfs_filestream_get_ag(mp, ag);
 370                 }
 371
 372                 /*
 373                  * If ip is a file and its pip has changed, drop the old ref and
 374                  * take a new one.
 375                  */
 376                 if (pip && pip != old_pip) {
 377                         IRELE(old_pip);
 378                         IHOLD(pip);
 379                 }
 380
 381                 TRACE_UPDATE(mp, ip, old_ag, xfs_filestream_peek_ag(mp, old_ag),
 382                                 ag, xfs_filestream_peek_ag(mp, ag));
 383                 return 0;
 384         }
 385
 386         item = kmem_zone_zalloc(item_zone, KM_MAYFAIL);
 387         if (!item)
 388                 return ENOMEM;
 389
 390         item->ag = ag;
 391         item->ip = ip;
 392         item->pip = pip;
 393
 394         err = xfs_mru_cache_insert(cache, ip->i_ino, item);
 395         if (err) {
 396                 kmem_zone_free(item_zone, item);
 397                 return err;
 398         }
 399
 400         /* Take a reference on the AG. */
 401         xfs_filestream_get_ag(mp, ag);
 402
 403         /*
 404          * Take a reference on the inode itself regardless of whether it's a
 405          * regular file or a directory.
 406          */
 407         IHOLD(ip);
 408
 409         /*
 410          * In the case of a regular file, take a reference on the parent inode
 411          * as well to ensure it remains in-core.
 412          */
 413         if (pip)
 414                 IHOLD(pip);
 415
 416         TRACE_UPDATE(mp, ip, ag, xfs_filestream_peek_ag(mp, ag),
 417                         ag, xfs_filestream_peek_ag(mp, ag));
 418
 419         return 0;
 420 }
 421
 422 /* xfs_fstrm_free_func(): callback for freeing cached stream items. */
 423 STATIC void
 424 xfs_fstrm_free_func(
 425         unsigned long   ino,
 426         void            *data)
 427 {
 428         fstrm_item_t    *item  = (fstrm_item_t *)data;
 429         xfs_inode_t     *ip = item->ip;
 430
 431         ASSERT(ip->i_ino == ino);
 432
 433         xfs_iflags_clear(ip, XFS_IFILESTREAM);
 434
 435         /* Drop the reference taken on the AG when the item was added. */
 436         xfs_filestream_put_ag(ip->i_mount, item->ag);
 437
 438         TRACE_FREE(ip->i_mount, ip, item->pip, item->ag,
 439                 xfs_filestream_peek_ag(ip->i_mount, item->ag));
 440
 441         /*
 442          * _xfs_filestream_update_ag() always takes a reference on the inode
 443          * itself, whether it's a file or a directory.  Release it here.
 444          * This can result in the inode being freed and so we must
 445          * not hold any inode locks when freeing filesstreams objects
 446          * otherwise we can deadlock here.
 447          */
 448         IRELE(ip);
 449
 450         /*
 451          * In the case of a regular file, _xfs_filestream_update_ag() also
 452          * takes a ref on the parent inode to keep it in-core.  Release that
 453          * too.
 454          */
 455         if (item->pip)
 456                 IRELE(item->pip);
 457
 458         /* Finally, free the memory allocated for the item. */
 459         kmem_zone_free(item_zone, item);
 460 }
 461
 462 /*
 463  * xfs_filestream_init() is called at xfs initialisation time to set up the
 464  * memory zone that will be used for filestream data structure allocation.
 465  */
 466 int
 467 xfs_filestream_init(void)
 468 {
 469         item_zone = kmem_zone_init(sizeof(fstrm_item_t), "fstrm_item");
 470         if (!item_zone)
 471                 return -ENOMEM;
 472
 473         return 0;
 474 }
 475
 476 /*
 477  * xfs_filestream_uninit() is called at xfs termination time to destroy the
 478  * memory zone that was used for filestream data structure allocation.
 479  */
 480 void
 481 xfs_filestream_uninit(void)
 482 {
 483         kmem_zone_destroy(item_zone);
 484 }
 485
 486 /*
 487  * xfs_filestream_mount() is called when a file system is mounted with the
 488  * filestream option.  It is responsible for allocating the data structures
 489  * needed to track the new file system's file streams.
 490  */
 491 int
 492 xfs_filestream_mount(
 493         xfs_mount_t     *mp)
 494 {
 495         int             err;
 496         unsigned int    lifetime, grp_count;
 497
 498         /*
 499          * The filestream timer tunable is currently fixed within the range of
 500          * one second to four minutes, with five seconds being the default.  The
 501          * group count is somewhat arbitrary, but it'd be nice to adhere to the
 502          * timer tunable to within about 10 percent.  This requires at least 10
 503          * groups.
 504          */
 505         lifetime  = xfs_fstrm_centisecs * 10;
 506         grp_count = 10;
 507
 508         err = xfs_mru_cache_create(&mp->m_filestream, lifetime, grp_count,
 509                              xfs_fstrm_free_func);
 510
 511         return err;
 512 }
 513
 514 /*
 515  * xfs_filestream_unmount() is called when a file system that was mounted with
 516  * the filestream option is unmounted.  It drains the data structures created
 517  * to track the file system's file streams and frees all the memory that was
 518  * allocated.
 519  */
 520 void
 521 xfs_filestream_unmount(
 522         xfs_mount_t     *mp)
 523 {
 524         xfs_mru_cache_destroy(mp->m_filestream);
 525 }
 526
 527 /*
 528  * Return the AG of the filestream the file or directory belongs to, or
 529  * NULLAGNUMBER otherwise.
 530  */
 531 xfs_agnumber_t
 532 xfs_filestream_lookup_ag(
 533         xfs_inode_t     *ip)
 534 {
 535         xfs_mru_cache_t *cache;
 536         fstrm_item_t    *item;
 537         xfs_agnumber_t  ag;
 538         int             ref;
 539
 540         if (!S_ISREG(ip->i_d.di_mode) && !S_ISDIR(ip->i_d.di_mode)) {
 541                 ASSERT(0);
 542                 return NULLAGNUMBER;
 543         }
 544
 545         cache = ip->i_mount->m_filestream;
 546         item = xfs_mru_cache_lookup(cache, ip->i_ino);
 547         if (!item) {
 548                 TRACE_LOOKUP(ip->i_mount, ip, NULL, NULLAGNUMBER, 0);
 549                 return NULLAGNUMBER;
 550         }
 551
 552         ASSERT(ip == item->ip);
 553         ag = item->ag;
 554         ref = xfs_filestream_peek_ag(ip->i_mount, ag);
 555         xfs_mru_cache_done(cache);
 556
 557         TRACE_LOOKUP(ip->i_mount, ip, item->pip, ag, ref);
 558         return ag;
 559 }
 560
 561 /*
 562  * xfs_filestream_associate() should only be called to associate a regular file
 563  * with its parent directory.  Calling it with a child directory isn't
 564  * appropriate because filestreams don't apply to entire directory hierarchies.
 565  * Creating a file in a child directory of an existing filestream directory
 566  * starts a new filestream with its own allocation group association.
 567  *
 568  * Returns < 0 on error, 0 if successful association occurred, > 0 if
 569  * we failed to get an association because of locking issues.
 570  */
 571 int
 572 xfs_filestream_associate(
 573         xfs_inode_t     *pip,
 574         xfs_inode_t     *ip)
 575 {
 576         xfs_mount_t     *mp;
 577         xfs_mru_cache_t *cache;
 578         fstrm_item_t    *item;
 579         xfs_agnumber_t  ag, rotorstep, startag;
 580         int             err = 0;
 581
 582         ASSERT(S_ISDIR(pip->i_d.di_mode));
 583         ASSERT(S_ISREG(ip->i_d.di_mode));
 584         if (!S_ISDIR(pip->i_d.di_mode) || !S_ISREG(ip->i_d.di_mode))
 585                 return -EINVAL;
 586
 587         mp = pip->i_mount;
 588         cache = mp->m_filestream;
 589
 590         /*
 591          * We have a problem, Houston.
 592          *
 593          * Taking the iolock here violates inode locking order - we already
 594          * hold the ilock. Hence if we block getting this lock we may never
 595          * wake. Unfortunately, that means if we can't get the lock, we're
 596          * screwed in terms of getting a stream association - we can't spin
 597          * waiting for the lock because someone else is waiting on the lock we
 598          * hold and we cannot drop that as we are in a transaction here.
 599          *
 600          * Lucky for us, this inversion is not a problem because it's a
 601          * directory inode that we are trying to lock here.
 602          *
 603          * So, if we can't get the iolock without sleeping then just give up
 604          */
 605         if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL))
 606                 return 1;
 607
 608         /* If the parent directory is already in the cache, use its AG. */
 609         item = xfs_mru_cache_lookup(cache, pip->i_ino);
 610         if (item) {
 611                 ASSERT(item->ip == pip);
 612                 ag = item->ag;
 613                 xfs_mru_cache_done(cache);
 614
 615                 TRACE_LOOKUP(mp, pip, pip, ag, xfs_filestream_peek_ag(mp, ag));
 616                 err = _xfs_filestream_update_ag(ip, pip, ag);
 617
 618                 goto exit;
 619         }
 620
 621         /*
 622          * Set the starting AG using the rotor for inode32, otherwise
 623          * use the directory inode's AG.
 624          */
 625         if (mp->m_flags & XFS_MOUNT_32BITINODES) {
 626                 rotorstep = xfs_rotorstep;
 627                 startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount;
 628                 mp->m_agfrotor = (mp->m_agfrotor + 1) %
 629                                  (mp->m_sb.sb_agcount * rotorstep);
 630         } else
 631                 startag = XFS_INO_TO_AGNO(mp, pip->i_ino);
 632
 633         /* Pick a new AG for the parent inode starting at startag. */
 634         err = _xfs_filestream_pick_ag(mp, startag, &ag, 0, 0);
 635         if (err || ag == NULLAGNUMBER)
 636                 goto exit_did_pick;
 637
 638         /* Associate the parent inode with the AG. */
 639         err = _xfs_filestream_update_ag(pip, NULL, ag);
 640         if (err)
 641                 goto exit_did_pick;
 642
 643         /* Associate the file inode with the AG. */
 644         err = _xfs_filestream_update_ag(ip, pip, ag);
 645         if (err)
 646                 goto exit_did_pick;
 647
 648         TRACE_ASSOCIATE(mp, ip, pip, ag, xfs_filestream_peek_ag(mp, ag));
 649
 650 exit_did_pick:
 651         /*
 652          * If _xfs_filestream_pick_ag() returned a valid AG, remove the
 653          * reference it took on it, since the file and directory will have taken
 654          * their own now if they were successfully cached.
 655          */
 656         if (ag != NULLAGNUMBER)
 657                 xfs_filestream_put_ag(mp, ag);
 658
 659 exit:
 660         xfs_iunlock(pip, XFS_IOLOCK_EXCL);
 661         return -err;
 662 }
 663
 664 /*
 665  * Pick a new allocation group for the current file and its file stream.  This
 666  * function is called by xfs_bmap_filestreams() with the mount point's per-ag
 667  * lock held.
 668  */
 669 int
 670 xfs_filestream_new_ag(
 671         xfs_bmalloca_t  *ap,
 672         xfs_agnumber_t  *agp)
 673 {
 674         int             flags, err;
 675         xfs_inode_t     *ip, *pip = NULL;
 676         xfs_mount_t     *mp;
 677         xfs_mru_cache_t *cache;
 678         xfs_extlen_t    minlen;
 679         fstrm_item_t    *dir, *file;
 680         xfs_agnumber_t  ag = NULLAGNUMBER;
 681
 682         ip = ap->ip;
 683         mp = ip->i_mount;
 684         cache = mp->m_filestream;
 685         minlen = ap->length;
 686         *agp = NULLAGNUMBER;
 687
 688         /*
 689          * Look for the file in the cache, removing it if it's found.  Doing
 690          * this allows it to be held across the dir lookup that follows.
 691          */
 692         file = xfs_mru_cache_remove(cache, ip->i_ino);
 693         if (file) {
 694                 ASSERT(ip == file->ip);
 695
 696                 /* Save the file's parent inode and old AG number for later. */
 697                 pip = file->pip;
 698                 ag = file->ag;
 699
 700                 /* Look for the file's directory in the cache. */
 701                 dir = xfs_mru_cache_lookup(cache, pip->i_ino);
 702                 if (dir) {
 703                         ASSERT(pip == dir->ip);
 704
 705                         /*
 706                          * If the directory has already moved on to a new AG,
 707                          * use that AG as the new AG for the file. Don't
 708                          * forget to twiddle the AG refcounts to match the
 709                          * movement.
 710                          */
 711                         if (dir->ag != file->ag) {
 712                                 xfs_filestream_put_ag(mp, file->ag);
 713                                 xfs_filestream_get_ag(mp, dir->ag);
 714                                 *agp = file->ag = dir->ag;
 715                         }
 716
 717                         xfs_mru_cache_done(cache);
 718                 }
 719
 720                 /*
 721                  * Put the file back in the cache.  If this fails, the free
 722                  * function needs to be called to tidy up in the same way as if
 723                  * the item had simply expired from the cache.
 724                  */
 725                 err = xfs_mru_cache_insert(cache, ip->i_ino, file);
 726                 if (err) {
 727                         xfs_fstrm_free_func(ip->i_ino, file);
 728                         return err;
 729                 }
 730
 731                 /*
 732                  * If the file's AG was moved to the directory's new AG, there's
 733                  * nothing more to be done.
 734                  */
 735                 if (*agp != NULLAGNUMBER) {
 736                         TRACE_MOVEAG(mp, ip, pip,
 737                                         ag, xfs_filestream_peek_ag(mp, ag),
 738                                         *agp, xfs_filestream_peek_ag(mp, *agp));
 739                         return 0;
 740                 }
 741         }
 742
 743         /*
 744          * If the file's parent directory is known, take its iolock in exclusive
 745          * mode to prevent two sibling files from racing each other to migrate
 746          * themselves and their parent to different AGs.
 747          *
 748          * Note that we lock the parent directory iolock inside the child
 749          * iolock here.  That's fine as we never hold both parent and child
 750          * iolock in any other place.  This is different from the ilock,
 751          * which requires locking of the child after the parent for namespace
 752          * operations.
 753          */
 754         if (pip)
 755                 xfs_ilock(pip, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
 756
 757         /*
 758          * A new AG needs to be found for the file.  If the file's parent
 759          * directory is also known, it will be moved to the new AG as well to
 760          * ensure that files created inside it in future use the new AG.
 761          */
 762         ag = (ag == NULLAGNUMBER) ? 0 : (ag + 1) % mp->m_sb.sb_agcount;
 763         flags = (ap->userdata ? XFS_PICK_USERDATA : 0) |
 764                 (ap->flist->xbf_low ? XFS_PICK_LOWSPACE : 0);
 765
 766         err = _xfs_filestream_pick_ag(mp, ag, agp, flags, minlen);
 767         if (err || *agp == NULLAGNUMBER)
 768                 goto exit;
 769
 770         /*
 771          * If the file wasn't found in the file cache, then its parent directory
 772          * inode isn't known.  For this to have happened, the file must either
 773          * be pre-existing, or it was created long enough ago that its cache
 774          * entry has expired.  This isn't the sort of usage that the filestreams
 775          * allocator is trying to optimise, so there's no point trying to track
 776          * its new AG somehow in the filestream data structures.
 777          */
 778         if (!pip) {
 779                 TRACE_ORPHAN(mp, ip, *agp);
 780                 goto exit;
 781         }
 782
 783         /* Associate the parent inode with the AG. */
 784         err = _xfs_filestream_update_ag(pip, NULL, *agp);
 785         if (err)
 786                 goto exit;
 787
 788         /* Associate the file inode with the AG. */
 789         err = _xfs_filestream_update_ag(ip, pip, *agp);
 790         if (err)
 791                 goto exit;
 792
 793         TRACE_MOVEAG(mp, ip, pip, NULLAGNUMBER, 0,
 794                         *agp, xfs_filestream_peek_ag(mp, *agp));
 795
 796 exit:
 797         /*
 798          * If _xfs_filestream_pick_ag() returned a valid AG, remove the
 799          * reference it took on it, since the file and directory will have taken
 800          * their own now if they were successfully cached.
 801          */
 802         if (*agp != NULLAGNUMBER)
 803                 xfs_filestream_put_ag(mp, *agp);
 804         else
 805                 *agp = 0;
 806
 807         if (pip)
 808                 xfs_iunlock(pip, XFS_IOLOCK_EXCL);
 809
 810         return err;
 811 }
 812
 813 /*
 814  * Remove an association between an inode and a filestream object.
 815  * Typically this is done on last close of an unlinked file.
 816  */
 817 void
 818 xfs_filestream_deassociate(
 819         xfs_inode_t     *ip)
 820 {
 821         xfs_mru_cache_t *cache = ip->i_mount->m_filestream;
 822
 823         xfs_mru_cache_delete(cache, ip->i_ino);
 824 }