[XFS] Allow punching holes to free space when at ENOSPC
[pandora-kernel.git] / fs / xfs / xfs_vnodeops.c
1 /*
2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3  * All Rights Reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write the Free Software Foundation,
16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17  */
18
19 #include "xfs.h"
20 #include "xfs_fs.h"
21 #include "xfs_types.h"
22 #include "xfs_bit.h"
23 #include "xfs_log.h"
24 #include "xfs_inum.h"
25 #include "xfs_trans.h"
26 #include "xfs_sb.h"
27 #include "xfs_ag.h"
28 #include "xfs_dir2.h"
29 #include "xfs_dmapi.h"
30 #include "xfs_mount.h"
31 #include "xfs_da_btree.h"
32 #include "xfs_bmap_btree.h"
33 #include "xfs_alloc_btree.h"
34 #include "xfs_ialloc_btree.h"
35 #include "xfs_dir2_sf.h"
36 #include "xfs_attr_sf.h"
37 #include "xfs_dinode.h"
38 #include "xfs_inode.h"
39 #include "xfs_inode_item.h"
40 #include "xfs_itable.h"
41 #include "xfs_btree.h"
42 #include "xfs_ialloc.h"
43 #include "xfs_alloc.h"
44 #include "xfs_bmap.h"
45 #include "xfs_attr.h"
46 #include "xfs_rw.h"
47 #include "xfs_error.h"
48 #include "xfs_quota.h"
49 #include "xfs_utils.h"
50 #include "xfs_rtalloc.h"
51 #include "xfs_refcache.h"
52 #include "xfs_trans_space.h"
53 #include "xfs_log_priv.h"
54 #include "xfs_filestream.h"
55
56 STATIC int
57 xfs_open(
58         bhv_desc_t      *bdp,
59         cred_t          *credp)
60 {
61         int             mode;
62         bhv_vnode_t     *vp = BHV_TO_VNODE(bdp);
63         xfs_inode_t     *ip = XFS_BHVTOI(bdp);
64
65         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
66                 return XFS_ERROR(EIO);
67
68         /*
69          * If it's a directory with any blocks, read-ahead block 0
70          * as we're almost certain to have the next operation be a read there.
71          */
72         if (VN_ISDIR(vp) && ip->i_d.di_nextents > 0) {
73                 mode = xfs_ilock_map_shared(ip);
74                 if (ip->i_d.di_nextents > 0)
75                         (void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
76                 xfs_iunlock(ip, mode);
77         }
78         return 0;
79 }
80
81 /*
82  * xfs_getattr
83  */
84 STATIC int
85 xfs_getattr(
86         bhv_desc_t      *bdp,
87         bhv_vattr_t     *vap,
88         int             flags,
89         cred_t          *credp)
90 {
91         xfs_inode_t     *ip;
92         xfs_mount_t     *mp;
93         bhv_vnode_t     *vp;
94
95         vp  = BHV_TO_VNODE(bdp);
96         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
97
98         ip = XFS_BHVTOI(bdp);
99         mp = ip->i_mount;
100
101         if (XFS_FORCED_SHUTDOWN(mp))
102                 return XFS_ERROR(EIO);
103
104         if (!(flags & ATTR_LAZY))
105                 xfs_ilock(ip, XFS_ILOCK_SHARED);
106
107         vap->va_size = XFS_ISIZE(ip);
108         if (vap->va_mask == XFS_AT_SIZE)
109                 goto all_done;
110
111         vap->va_nblocks =
112                 XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
113         vap->va_nodeid = ip->i_ino;
114 #if XFS_BIG_INUMS
115         vap->va_nodeid += mp->m_inoadd;
116 #endif
117         vap->va_nlink = ip->i_d.di_nlink;
118
119         /*
120          * Quick exit for non-stat callers
121          */
122         if ((vap->va_mask &
123             ~(XFS_AT_SIZE|XFS_AT_FSID|XFS_AT_NODEID|
124               XFS_AT_NLINK|XFS_AT_BLKSIZE)) == 0)
125                 goto all_done;
126
127         /*
128          * Copy from in-core inode.
129          */
130         vap->va_mode = ip->i_d.di_mode;
131         vap->va_uid = ip->i_d.di_uid;
132         vap->va_gid = ip->i_d.di_gid;
133         vap->va_projid = ip->i_d.di_projid;
134
135         /*
136          * Check vnode type block/char vs. everything else.
137          */
138         switch (ip->i_d.di_mode & S_IFMT) {
139         case S_IFBLK:
140         case S_IFCHR:
141                 vap->va_rdev = ip->i_df.if_u2.if_rdev;
142                 vap->va_blocksize = BLKDEV_IOSIZE;
143                 break;
144         default:
145                 vap->va_rdev = 0;
146
147                 if (!(ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) {
148                         vap->va_blocksize = xfs_preferred_iosize(mp);
149                 } else {
150
151                         /*
152                          * If the file blocks are being allocated from a
153                          * realtime partition, then return the inode's
154                          * realtime extent size or the realtime volume's
155                          * extent size.
156                          */
157                         vap->va_blocksize =
158                                 xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog;
159                 }
160                 break;
161         }
162
163         vn_atime_to_timespec(vp, &vap->va_atime);
164         vap->va_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
165         vap->va_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
166         vap->va_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
167         vap->va_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
168
169         /*
170          * Exit for stat callers.  See if any of the rest of the fields
171          * to be filled in are needed.
172          */
173         if ((vap->va_mask &
174              (XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
175               XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
176                 goto all_done;
177
178         /*
179          * Convert di_flags to xflags.
180          */
181         vap->va_xflags = xfs_ip2xflags(ip);
182
183         /*
184          * Exit for inode revalidate.  See if any of the rest of
185          * the fields to be filled in are needed.
186          */
187         if ((vap->va_mask &
188              (XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
189               XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
190                 goto all_done;
191
192         vap->va_extsize = ip->i_d.di_extsize << mp->m_sb.sb_blocklog;
193         vap->va_nextents =
194                 (ip->i_df.if_flags & XFS_IFEXTENTS) ?
195                         ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) :
196                         ip->i_d.di_nextents;
197         if (ip->i_afp)
198                 vap->va_anextents =
199                         (ip->i_afp->if_flags & XFS_IFEXTENTS) ?
200                                 ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) :
201                                  ip->i_d.di_anextents;
202         else
203                 vap->va_anextents = 0;
204         vap->va_gen = ip->i_d.di_gen;
205
206  all_done:
207         if (!(flags & ATTR_LAZY))
208                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
209         return 0;
210 }
211
212
213 /*
214  * xfs_setattr
215  */
216 int
217 xfs_setattr(
218         bhv_desc_t              *bdp,
219         bhv_vattr_t             *vap,
220         int                     flags,
221         cred_t                  *credp)
222 {
223         xfs_inode_t             *ip;
224         xfs_trans_t             *tp;
225         xfs_mount_t             *mp;
226         int                     mask;
227         int                     code;
228         uint                    lock_flags;
229         uint                    commit_flags=0;
230         uid_t                   uid=0, iuid=0;
231         gid_t                   gid=0, igid=0;
232         int                     timeflags = 0;
233         bhv_vnode_t             *vp;
234         xfs_prid_t              projid=0, iprojid=0;
235         int                     mandlock_before, mandlock_after;
236         struct xfs_dquot        *udqp, *gdqp, *olddquot1, *olddquot2;
237         int                     file_owner;
238         int                     need_iolock = 1;
239
240         vp = BHV_TO_VNODE(bdp);
241         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
242
243         if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
244                 return XFS_ERROR(EROFS);
245
246         /*
247          * Cannot set certain attributes.
248          */
249         mask = vap->va_mask;
250         if (mask & XFS_AT_NOSET) {
251                 return XFS_ERROR(EINVAL);
252         }
253
254         ip = XFS_BHVTOI(bdp);
255         mp = ip->i_mount;
256
257         if (XFS_FORCED_SHUTDOWN(mp))
258                 return XFS_ERROR(EIO);
259
260         /*
261          * Timestamps do not need to be logged and hence do not
262          * need to be done within a transaction.
263          */
264         if (mask & XFS_AT_UPDTIMES) {
265                 ASSERT((mask & ~XFS_AT_UPDTIMES) == 0);
266                 timeflags = ((mask & XFS_AT_UPDATIME) ? XFS_ICHGTIME_ACC : 0) |
267                             ((mask & XFS_AT_UPDCTIME) ? XFS_ICHGTIME_CHG : 0) |
268                             ((mask & XFS_AT_UPDMTIME) ? XFS_ICHGTIME_MOD : 0);
269                 xfs_ichgtime(ip, timeflags);
270                 return 0;
271         }
272
273         olddquot1 = olddquot2 = NULL;
274         udqp = gdqp = NULL;
275
276         /*
277          * If disk quotas is on, we make sure that the dquots do exist on disk,
278          * before we start any other transactions. Trying to do this later
279          * is messy. We don't care to take a readlock to look at the ids
280          * in inode here, because we can't hold it across the trans_reserve.
281          * If the IDs do change before we take the ilock, we're covered
282          * because the i_*dquot fields will get updated anyway.
283          */
284         if (XFS_IS_QUOTA_ON(mp) &&
285             (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID))) {
286                 uint    qflags = 0;
287
288                 if ((mask & XFS_AT_UID) && XFS_IS_UQUOTA_ON(mp)) {
289                         uid = vap->va_uid;
290                         qflags |= XFS_QMOPT_UQUOTA;
291                 } else {
292                         uid = ip->i_d.di_uid;
293                 }
294                 if ((mask & XFS_AT_GID) && XFS_IS_GQUOTA_ON(mp)) {
295                         gid = vap->va_gid;
296                         qflags |= XFS_QMOPT_GQUOTA;
297                 }  else {
298                         gid = ip->i_d.di_gid;
299                 }
300                 if ((mask & XFS_AT_PROJID) && XFS_IS_PQUOTA_ON(mp)) {
301                         projid = vap->va_projid;
302                         qflags |= XFS_QMOPT_PQUOTA;
303                 }  else {
304                         projid = ip->i_d.di_projid;
305                 }
306                 /*
307                  * We take a reference when we initialize udqp and gdqp,
308                  * so it is important that we never blindly double trip on
309                  * the same variable. See xfs_create() for an example.
310                  */
311                 ASSERT(udqp == NULL);
312                 ASSERT(gdqp == NULL);
313                 code = XFS_QM_DQVOPALLOC(mp, ip, uid, gid, projid, qflags,
314                                          &udqp, &gdqp);
315                 if (code)
316                         return code;
317         }
318
319         /*
320          * For the other attributes, we acquire the inode lock and
321          * first do an error checking pass.
322          */
323         tp = NULL;
324         lock_flags = XFS_ILOCK_EXCL;
325         if (flags & ATTR_NOLOCK)
326                 need_iolock = 0;
327         if (!(mask & XFS_AT_SIZE)) {
328                 if ((mask != (XFS_AT_CTIME|XFS_AT_ATIME|XFS_AT_MTIME)) ||
329                     (mp->m_flags & XFS_MOUNT_WSYNC)) {
330                         tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
331                         commit_flags = 0;
332                         if ((code = xfs_trans_reserve(tp, 0,
333                                                      XFS_ICHANGE_LOG_RES(mp), 0,
334                                                      0, 0))) {
335                                 lock_flags = 0;
336                                 goto error_return;
337                         }
338                 }
339         } else {
340                 if (DM_EVENT_ENABLED (vp->v_vfsp, ip, DM_EVENT_TRUNCATE) &&
341                     !(flags & ATTR_DMI)) {
342                         int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
343                         code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, vp,
344                                 vap->va_size, 0, dmflags, NULL);
345                         if (code) {
346                                 lock_flags = 0;
347                                 goto error_return;
348                         }
349                 }
350                 if (need_iolock)
351                         lock_flags |= XFS_IOLOCK_EXCL;
352         }
353
354         xfs_ilock(ip, lock_flags);
355
356         /* boolean: are we the file owner? */
357         file_owner = (current_fsuid(credp) == ip->i_d.di_uid);
358
359         /*
360          * Change various properties of a file.
361          * Only the owner or users with CAP_FOWNER
362          * capability may do these things.
363          */
364         if (mask &
365             (XFS_AT_MODE|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_UID|
366              XFS_AT_GID|XFS_AT_PROJID)) {
367                 /*
368                  * CAP_FOWNER overrides the following restrictions:
369                  *
370                  * The user ID of the calling process must be equal
371                  * to the file owner ID, except in cases where the
372                  * CAP_FSETID capability is applicable.
373                  */
374                 if (!file_owner && !capable(CAP_FOWNER)) {
375                         code = XFS_ERROR(EPERM);
376                         goto error_return;
377                 }
378
379                 /*
380                  * CAP_FSETID overrides the following restrictions:
381                  *
382                  * The effective user ID of the calling process shall match
383                  * the file owner when setting the set-user-ID and
384                  * set-group-ID bits on that file.
385                  *
386                  * The effective group ID or one of the supplementary group
387                  * IDs of the calling process shall match the group owner of
388                  * the file when setting the set-group-ID bit on that file
389                  */
390                 if (mask & XFS_AT_MODE) {
391                         mode_t m = 0;
392
393                         if ((vap->va_mode & S_ISUID) && !file_owner)
394                                 m |= S_ISUID;
395                         if ((vap->va_mode & S_ISGID) &&
396                             !in_group_p((gid_t)ip->i_d.di_gid))
397                                 m |= S_ISGID;
398 #if 0
399                         /* Linux allows this, Irix doesn't. */
400                         if ((vap->va_mode & S_ISVTX) && !VN_ISDIR(vp))
401                                 m |= S_ISVTX;
402 #endif
403                         if (m && !capable(CAP_FSETID))
404                                 vap->va_mode &= ~m;
405                 }
406         }
407
408         /*
409          * Change file ownership.  Must be the owner or privileged.
410          * If the system was configured with the "restricted_chown"
411          * option, the owner is not permitted to give away the file,
412          * and can change the group id only to a group of which he
413          * or she is a member.
414          */
415         if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
416                 /*
417                  * These IDs could have changed since we last looked at them.
418                  * But, we're assured that if the ownership did change
419                  * while we didn't have the inode locked, inode's dquot(s)
420                  * would have changed also.
421                  */
422                 iuid = ip->i_d.di_uid;
423                 iprojid = ip->i_d.di_projid;
424                 igid = ip->i_d.di_gid;
425                 gid = (mask & XFS_AT_GID) ? vap->va_gid : igid;
426                 uid = (mask & XFS_AT_UID) ? vap->va_uid : iuid;
427                 projid = (mask & XFS_AT_PROJID) ? (xfs_prid_t)vap->va_projid :
428                          iprojid;
429
430                 /*
431                  * CAP_CHOWN overrides the following restrictions:
432                  *
433                  * If _POSIX_CHOWN_RESTRICTED is defined, this capability
434                  * shall override the restriction that a process cannot
435                  * change the user ID of a file it owns and the restriction
436                  * that the group ID supplied to the chown() function
437                  * shall be equal to either the group ID or one of the
438                  * supplementary group IDs of the calling process.
439                  */
440                 if (restricted_chown &&
441                     (iuid != uid || (igid != gid &&
442                                      !in_group_p((gid_t)gid))) &&
443                     !capable(CAP_CHOWN)) {
444                         code = XFS_ERROR(EPERM);
445                         goto error_return;
446                 }
447                 /*
448                  * Do a quota reservation only if uid/projid/gid is actually
449                  * going to change.
450                  */
451                 if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
452                     (XFS_IS_PQUOTA_ON(mp) && iprojid != projid) ||
453                     (XFS_IS_GQUOTA_ON(mp) && igid != gid)) {
454                         ASSERT(tp);
455                         code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp,
456                                                 capable(CAP_FOWNER) ?
457                                                 XFS_QMOPT_FORCE_RES : 0);
458                         if (code)       /* out of quota */
459                                 goto error_return;
460                 }
461         }
462
463         /*
464          * Truncate file.  Must have write permission and not be a directory.
465          */
466         if (mask & XFS_AT_SIZE) {
467                 /* Short circuit the truncate case for zero length files */
468                 if ((vap->va_size == 0) &&
469                    (ip->i_size == 0) && (ip->i_d.di_nextents == 0)) {
470                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
471                         lock_flags &= ~XFS_ILOCK_EXCL;
472                         if (mask & XFS_AT_CTIME)
473                                 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
474                         code = 0;
475                         goto error_return;
476                 }
477
478                 if (VN_ISDIR(vp)) {
479                         code = XFS_ERROR(EISDIR);
480                         goto error_return;
481                 } else if (!VN_ISREG(vp)) {
482                         code = XFS_ERROR(EINVAL);
483                         goto error_return;
484                 }
485                 /*
486                  * Make sure that the dquots are attached to the inode.
487                  */
488                 if ((code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED)))
489                         goto error_return;
490         }
491
492         /*
493          * Change file access or modified times.
494          */
495         if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
496                 if (!file_owner) {
497                         if ((flags & ATTR_UTIME) &&
498                             !capable(CAP_FOWNER)) {
499                                 code = XFS_ERROR(EPERM);
500                                 goto error_return;
501                         }
502                 }
503         }
504
505         /*
506          * Change extent size or realtime flag.
507          */
508         if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
509                 /*
510                  * Can't change extent size if any extents are allocated.
511                  */
512                 if (ip->i_d.di_nextents && (mask & XFS_AT_EXTSIZE) &&
513                     ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
514                      vap->va_extsize) ) {
515                         code = XFS_ERROR(EINVAL);       /* EFBIG? */
516                         goto error_return;
517                 }
518
519                 /*
520                  * Can't change realtime flag if any extents are allocated.
521                  */
522                 if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
523                     (mask & XFS_AT_XFLAGS) &&
524                     (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) !=
525                     (vap->va_xflags & XFS_XFLAG_REALTIME)) {
526                         code = XFS_ERROR(EINVAL);       /* EFBIG? */
527                         goto error_return;
528                 }
529                 /*
530                  * Extent size must be a multiple of the appropriate block
531                  * size, if set at all.
532                  */
533                 if ((mask & XFS_AT_EXTSIZE) && vap->va_extsize != 0) {
534                         xfs_extlen_t    size;
535
536                         if ((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ||
537                             ((mask & XFS_AT_XFLAGS) &&
538                             (vap->va_xflags & XFS_XFLAG_REALTIME))) {
539                                 size = mp->m_sb.sb_rextsize <<
540                                        mp->m_sb.sb_blocklog;
541                         } else {
542                                 size = mp->m_sb.sb_blocksize;
543                         }
544                         if (vap->va_extsize % size) {
545                                 code = XFS_ERROR(EINVAL);
546                                 goto error_return;
547                         }
548                 }
549                 /*
550                  * If realtime flag is set then must have realtime data.
551                  */
552                 if ((mask & XFS_AT_XFLAGS) &&
553                     (vap->va_xflags & XFS_XFLAG_REALTIME)) {
554                         if ((mp->m_sb.sb_rblocks == 0) ||
555                             (mp->m_sb.sb_rextsize == 0) ||
556                             (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
557                                 code = XFS_ERROR(EINVAL);
558                                 goto error_return;
559                         }
560                 }
561
562                 /*
563                  * Can't modify an immutable/append-only file unless
564                  * we have appropriate permission.
565                  */
566                 if ((mask & XFS_AT_XFLAGS) &&
567                     (ip->i_d.di_flags &
568                                 (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) ||
569                      (vap->va_xflags &
570                                 (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
571                     !capable(CAP_LINUX_IMMUTABLE)) {
572                         code = XFS_ERROR(EPERM);
573                         goto error_return;
574                 }
575         }
576
577         /*
578          * Now we can make the changes.  Before we join the inode
579          * to the transaction, if XFS_AT_SIZE is set then take care of
580          * the part of the truncation that must be done without the
581          * inode lock.  This needs to be done before joining the inode
582          * to the transaction, because the inode cannot be unlocked
583          * once it is a part of the transaction.
584          */
585         if (mask & XFS_AT_SIZE) {
586                 code = 0;
587                 if ((vap->va_size > ip->i_size) &&
588                     (flags & ATTR_NOSIZETOK) == 0) {
589                         code = xfs_igrow_start(ip, vap->va_size, credp);
590                 }
591                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
592                 vn_iowait(vp); /* wait for the completion of any pending DIOs */
593                 if (!code)
594                         code = xfs_itruncate_data(ip, vap->va_size);
595                 if (code) {
596                         ASSERT(tp == NULL);
597                         lock_flags &= ~XFS_ILOCK_EXCL;
598                         ASSERT(lock_flags == XFS_IOLOCK_EXCL);
599                         goto error_return;
600                 }
601                 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
602                 if ((code = xfs_trans_reserve(tp, 0,
603                                              XFS_ITRUNCATE_LOG_RES(mp), 0,
604                                              XFS_TRANS_PERM_LOG_RES,
605                                              XFS_ITRUNCATE_LOG_COUNT))) {
606                         xfs_trans_cancel(tp, 0);
607                         if (need_iolock)
608                                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
609                         return code;
610                 }
611                 commit_flags = XFS_TRANS_RELEASE_LOG_RES;
612                 xfs_ilock(ip, XFS_ILOCK_EXCL);
613         }
614
615         if (tp) {
616                 xfs_trans_ijoin(tp, ip, lock_flags);
617                 xfs_trans_ihold(tp, ip);
618         }
619
620         /* determine whether mandatory locking mode changes */
621         mandlock_before = MANDLOCK(vp, ip->i_d.di_mode);
622
623         /*
624          * Truncate file.  Must have write permission and not be a directory.
625          */
626         if (mask & XFS_AT_SIZE) {
627                 if (vap->va_size > ip->i_size) {
628                         xfs_igrow_finish(tp, ip, vap->va_size,
629                             !(flags & ATTR_DMI));
630                 } else if ((vap->va_size <= ip->i_size) ||
631                            ((vap->va_size == 0) && ip->i_d.di_nextents)) {
632                         /*
633                          * signal a sync transaction unless
634                          * we're truncating an already unlinked
635                          * file on a wsync filesystem
636                          */
637                         code = xfs_itruncate_finish(&tp, ip,
638                                             (xfs_fsize_t)vap->va_size,
639                                             XFS_DATA_FORK,
640                                             ((ip->i_d.di_nlink != 0 ||
641                                               !(mp->m_flags & XFS_MOUNT_WSYNC))
642                                              ? 1 : 0));
643                         if (code)
644                                 goto abort_return;
645                         /*
646                          * Truncated "down", so we're removing references
647                          * to old data here - if we now delay flushing for
648                          * a long time, we expose ourselves unduly to the
649                          * notorious NULL files problem.  So, we mark this
650                          * vnode and flush it when the file is closed, and
651                          * do not wait the usual (long) time for writeout.
652                          */
653                         VTRUNCATE(vp);
654                 }
655                 /*
656                  * Have to do this even if the file's size doesn't change.
657                  */
658                 timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
659         }
660
661         /*
662          * Change file access modes.
663          */
664         if (mask & XFS_AT_MODE) {
665                 ip->i_d.di_mode &= S_IFMT;
666                 ip->i_d.di_mode |= vap->va_mode & ~S_IFMT;
667
668                 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
669                 timeflags |= XFS_ICHGTIME_CHG;
670         }
671
672         /*
673          * Change file ownership.  Must be the owner or privileged.
674          * If the system was configured with the "restricted_chown"
675          * option, the owner is not permitted to give away the file,
676          * and can change the group id only to a group of which he
677          * or she is a member.
678          */
679         if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
680                 /*
681                  * CAP_FSETID overrides the following restrictions:
682                  *
683                  * The set-user-ID and set-group-ID bits of a file will be
684                  * cleared upon successful return from chown()
685                  */
686                 if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
687                     !capable(CAP_FSETID)) {
688                         ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
689                 }
690
691                 /*
692                  * Change the ownerships and register quota modifications
693                  * in the transaction.
694                  */
695                 if (iuid != uid) {
696                         if (XFS_IS_UQUOTA_ON(mp)) {
697                                 ASSERT(mask & XFS_AT_UID);
698                                 ASSERT(udqp);
699                                 olddquot1 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
700                                                         &ip->i_udquot, udqp);
701                         }
702                         ip->i_d.di_uid = uid;
703                 }
704                 if (igid != gid) {
705                         if (XFS_IS_GQUOTA_ON(mp)) {
706                                 ASSERT(!XFS_IS_PQUOTA_ON(mp));
707                                 ASSERT(mask & XFS_AT_GID);
708                                 ASSERT(gdqp);
709                                 olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
710                                                         &ip->i_gdquot, gdqp);
711                         }
712                         ip->i_d.di_gid = gid;
713                 }
714                 if (iprojid != projid) {
715                         if (XFS_IS_PQUOTA_ON(mp)) {
716                                 ASSERT(!XFS_IS_GQUOTA_ON(mp));
717                                 ASSERT(mask & XFS_AT_PROJID);
718                                 ASSERT(gdqp);
719                                 olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
720                                                         &ip->i_gdquot, gdqp);
721                         }
722                         ip->i_d.di_projid = projid;
723                         /*
724                          * We may have to rev the inode as well as
725                          * the superblock version number since projids didn't
726                          * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
727                          */
728                         if (ip->i_d.di_version == XFS_DINODE_VERSION_1)
729                                 xfs_bump_ino_vers2(tp, ip);
730                 }
731
732                 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
733                 timeflags |= XFS_ICHGTIME_CHG;
734         }
735
736
737         /*
738          * Change file access or modified times.
739          */
740         if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
741                 if (mask & XFS_AT_ATIME) {
742                         ip->i_d.di_atime.t_sec = vap->va_atime.tv_sec;
743                         ip->i_d.di_atime.t_nsec = vap->va_atime.tv_nsec;
744                         ip->i_update_core = 1;
745                         timeflags &= ~XFS_ICHGTIME_ACC;
746                 }
747                 if (mask & XFS_AT_MTIME) {
748                         ip->i_d.di_mtime.t_sec = vap->va_mtime.tv_sec;
749                         ip->i_d.di_mtime.t_nsec = vap->va_mtime.tv_nsec;
750                         timeflags &= ~XFS_ICHGTIME_MOD;
751                         timeflags |= XFS_ICHGTIME_CHG;
752                 }
753                 if (tp && (flags & ATTR_UTIME))
754                         xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
755         }
756
757         /*
758          * Change XFS-added attributes.
759          */
760         if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
761                 if (mask & XFS_AT_EXTSIZE) {
762                         /*
763                          * Converting bytes to fs blocks.
764                          */
765                         ip->i_d.di_extsize = vap->va_extsize >>
766                                 mp->m_sb.sb_blocklog;
767                 }
768                 if (mask & XFS_AT_XFLAGS) {
769                         uint    di_flags;
770
771                         /* can't set PREALLOC this way, just preserve it */
772                         di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
773                         if (vap->va_xflags & XFS_XFLAG_IMMUTABLE)
774                                 di_flags |= XFS_DIFLAG_IMMUTABLE;
775                         if (vap->va_xflags & XFS_XFLAG_APPEND)
776                                 di_flags |= XFS_DIFLAG_APPEND;
777                         if (vap->va_xflags & XFS_XFLAG_SYNC)
778                                 di_flags |= XFS_DIFLAG_SYNC;
779                         if (vap->va_xflags & XFS_XFLAG_NOATIME)
780                                 di_flags |= XFS_DIFLAG_NOATIME;
781                         if (vap->va_xflags & XFS_XFLAG_NODUMP)
782                                 di_flags |= XFS_DIFLAG_NODUMP;
783                         if (vap->va_xflags & XFS_XFLAG_PROJINHERIT)
784                                 di_flags |= XFS_DIFLAG_PROJINHERIT;
785                         if (vap->va_xflags & XFS_XFLAG_NODEFRAG)
786                                 di_flags |= XFS_DIFLAG_NODEFRAG;
787                         if (vap->va_xflags & XFS_XFLAG_FILESTREAM)
788                                 di_flags |= XFS_DIFLAG_FILESTREAM;
789                         if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
790                                 if (vap->va_xflags & XFS_XFLAG_RTINHERIT)
791                                         di_flags |= XFS_DIFLAG_RTINHERIT;
792                                 if (vap->va_xflags & XFS_XFLAG_NOSYMLINKS)
793                                         di_flags |= XFS_DIFLAG_NOSYMLINKS;
794                                 if (vap->va_xflags & XFS_XFLAG_EXTSZINHERIT)
795                                         di_flags |= XFS_DIFLAG_EXTSZINHERIT;
796                         } else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
797                                 if (vap->va_xflags & XFS_XFLAG_REALTIME) {
798                                         di_flags |= XFS_DIFLAG_REALTIME;
799                                         ip->i_iocore.io_flags |= XFS_IOCORE_RT;
800                                 } else {
801                                         ip->i_iocore.io_flags &= ~XFS_IOCORE_RT;
802                                 }
803                                 if (vap->va_xflags & XFS_XFLAG_EXTSIZE)
804                                         di_flags |= XFS_DIFLAG_EXTSIZE;
805                         }
806                         ip->i_d.di_flags = di_flags;
807                 }
808                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
809                 timeflags |= XFS_ICHGTIME_CHG;
810         }
811
812         /*
813          * Change file inode change time only if XFS_AT_CTIME set
814          * AND we have been called by a DMI function.
815          */
816
817         if ( (flags & ATTR_DMI) && (mask & XFS_AT_CTIME) ) {
818                 ip->i_d.di_ctime.t_sec = vap->va_ctime.tv_sec;
819                 ip->i_d.di_ctime.t_nsec = vap->va_ctime.tv_nsec;
820                 ip->i_update_core = 1;
821                 timeflags &= ~XFS_ICHGTIME_CHG;
822         }
823
824         /*
825          * Send out timestamp changes that need to be set to the
826          * current time.  Not done when called by a DMI function.
827          */
828         if (timeflags && !(flags & ATTR_DMI))
829                 xfs_ichgtime(ip, timeflags);
830
831         XFS_STATS_INC(xs_ig_attrchg);
832
833         /*
834          * If this is a synchronous mount, make sure that the
835          * transaction goes to disk before returning to the user.
836          * This is slightly sub-optimal in that truncates require
837          * two sync transactions instead of one for wsync filesystems.
838          * One for the truncate and one for the timestamps since we
839          * don't want to change the timestamps unless we're sure the
840          * truncate worked.  Truncates are less than 1% of the laddis
841          * mix so this probably isn't worth the trouble to optimize.
842          */
843         code = 0;
844         if (tp) {
845                 if (mp->m_flags & XFS_MOUNT_WSYNC)
846                         xfs_trans_set_sync(tp);
847
848                 code = xfs_trans_commit(tp, commit_flags);
849         }
850
851         /*
852          * If the (regular) file's mandatory locking mode changed, then
853          * notify the vnode.  We do this under the inode lock to prevent
854          * racing calls to vop_vnode_change.
855          */
856         mandlock_after = MANDLOCK(vp, ip->i_d.di_mode);
857         if (mandlock_before != mandlock_after) {
858                 bhv_vop_vnode_change(vp, VCHANGE_FLAGS_ENF_LOCKING,
859                                  mandlock_after);
860         }
861
862         xfs_iunlock(ip, lock_flags);
863
864         /*
865          * Release any dquot(s) the inode had kept before chown.
866          */
867         XFS_QM_DQRELE(mp, olddquot1);
868         XFS_QM_DQRELE(mp, olddquot2);
869         XFS_QM_DQRELE(mp, udqp);
870         XFS_QM_DQRELE(mp, gdqp);
871
872         if (code) {
873                 return code;
874         }
875
876         if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_ATTRIBUTE) &&
877             !(flags & ATTR_DMI)) {
878                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, vp, DM_RIGHT_NULL,
879                                         NULL, DM_RIGHT_NULL, NULL, NULL,
880                                         0, 0, AT_DELAY_FLAG(flags));
881         }
882         return 0;
883
884  abort_return:
885         commit_flags |= XFS_TRANS_ABORT;
886         /* FALLTHROUGH */
887  error_return:
888         XFS_QM_DQRELE(mp, udqp);
889         XFS_QM_DQRELE(mp, gdqp);
890         if (tp) {
891                 xfs_trans_cancel(tp, commit_flags);
892         }
893         if (lock_flags != 0) {
894                 xfs_iunlock(ip, lock_flags);
895         }
896         return code;
897 }
898
899
900 /*
901  * xfs_access
902  * Null conversion from vnode mode bits to inode mode bits, as in efs.
903  */
904 STATIC int
905 xfs_access(
906         bhv_desc_t      *bdp,
907         int             mode,
908         cred_t          *credp)
909 {
910         xfs_inode_t     *ip;
911         int             error;
912
913         vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__,
914                                                (inst_t *)__return_address);
915
916         ip = XFS_BHVTOI(bdp);
917         xfs_ilock(ip, XFS_ILOCK_SHARED);
918         error = xfs_iaccess(ip, mode, credp);
919         xfs_iunlock(ip, XFS_ILOCK_SHARED);
920         return error;
921 }
922
923
924 /*
925  * The maximum pathlen is 1024 bytes. Since the minimum file system
926  * blocksize is 512 bytes, we can get a max of 2 extents back from
927  * bmapi.
928  */
929 #define SYMLINK_MAPS 2
930
931 /*
932  * xfs_readlink
933  *
934  */
935 STATIC int
936 xfs_readlink(
937         bhv_desc_t      *bdp,
938         uio_t           *uiop,
939         int             ioflags,
940         cred_t          *credp)
941 {
942         xfs_inode_t     *ip;
943         int             count;
944         xfs_off_t       offset;
945         int             pathlen;
946         bhv_vnode_t     *vp;
947         int             error = 0;
948         xfs_mount_t     *mp;
949         int             nmaps;
950         xfs_bmbt_irec_t mval[SYMLINK_MAPS];
951         xfs_daddr_t     d;
952         int             byte_cnt;
953         int             n;
954         xfs_buf_t       *bp;
955
956         vp = BHV_TO_VNODE(bdp);
957         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
958
959         ip = XFS_BHVTOI(bdp);
960         mp = ip->i_mount;
961
962         if (XFS_FORCED_SHUTDOWN(mp))
963                 return XFS_ERROR(EIO);
964
965         xfs_ilock(ip, XFS_ILOCK_SHARED);
966
967         ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFLNK);
968
969         offset = uiop->uio_offset;
970         count = uiop->uio_resid;
971
972         if (offset < 0) {
973                 error = XFS_ERROR(EINVAL);
974                 goto error_return;
975         }
976         if (count <= 0) {
977                 error = 0;
978                 goto error_return;
979         }
980
981         /*
982          * See if the symlink is stored inline.
983          */
984         pathlen = (int)ip->i_d.di_size;
985
986         if (ip->i_df.if_flags & XFS_IFINLINE) {
987                 error = xfs_uio_read(ip->i_df.if_u1.if_data, pathlen, uiop);
988         }
989         else {
990                 /*
991                  * Symlink not inline.  Call bmap to get it in.
992                  */
993                 nmaps = SYMLINK_MAPS;
994
995                 error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen),
996                                   0, NULL, 0, mval, &nmaps, NULL, NULL);
997
998                 if (error) {
999                         goto error_return;
1000                 }
1001
1002                 for (n = 0; n < nmaps; n++) {
1003                         d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
1004                         byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
1005                         bp = xfs_buf_read(mp->m_ddev_targp, d,
1006                                       BTOBB(byte_cnt), 0);
1007                         error = XFS_BUF_GETERROR(bp);
1008                         if (error) {
1009                                 xfs_ioerror_alert("xfs_readlink",
1010                                           ip->i_mount, bp, XFS_BUF_ADDR(bp));
1011                                 xfs_buf_relse(bp);
1012                                 goto error_return;
1013                         }
1014                         if (pathlen < byte_cnt)
1015                                 byte_cnt = pathlen;
1016                         pathlen -= byte_cnt;
1017
1018                         error = xfs_uio_read(XFS_BUF_PTR(bp), byte_cnt, uiop);
1019                         xfs_buf_relse (bp);
1020                 }
1021
1022         }
1023
1024 error_return:
1025         xfs_iunlock(ip, XFS_ILOCK_SHARED);
1026         return error;
1027 }
1028
1029
1030 /*
1031  * xfs_fsync
1032  *
1033  * This is called to sync the inode and its data out to disk.
1034  * We need to hold the I/O lock while flushing the data, and
1035  * the inode lock while flushing the inode.  The inode lock CANNOT
1036  * be held while flushing the data, so acquire after we're done
1037  * with that.
1038  */
1039 STATIC int
1040 xfs_fsync(
1041         bhv_desc_t      *bdp,
1042         int             flag,
1043         cred_t          *credp,
1044         xfs_off_t       start,
1045         xfs_off_t       stop)
1046 {
1047         xfs_inode_t     *ip;
1048         xfs_trans_t     *tp;
1049         int             error;
1050         int             log_flushed = 0, changed = 1;
1051
1052         vn_trace_entry(BHV_TO_VNODE(bdp),
1053                         __FUNCTION__, (inst_t *)__return_address);
1054
1055         ip = XFS_BHVTOI(bdp);
1056
1057         ASSERT(start >= 0 && stop >= -1);
1058
1059         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
1060                 return XFS_ERROR(EIO);
1061
1062         /*
1063          * We always need to make sure that the required inode state
1064          * is safe on disk.  The vnode might be clean but because
1065          * of committed transactions that haven't hit the disk yet.
1066          * Likewise, there could be unflushed non-transactional
1067          * changes to the inode core that have to go to disk.
1068          *
1069          * The following code depends on one assumption:  that
1070          * any transaction that changes an inode logs the core
1071          * because it has to change some field in the inode core
1072          * (typically nextents or nblocks).  That assumption
1073          * implies that any transactions against an inode will
1074          * catch any non-transactional updates.  If inode-altering
1075          * transactions exist that violate this assumption, the
1076          * code breaks.  Right now, it figures that if the involved
1077          * update_* field is clear and the inode is unpinned, the
1078          * inode is clean.  Either it's been flushed or it's been
1079          * committed and the commit has hit the disk unpinning the inode.
1080          * (Note that xfs_inode_item_format() called at commit clears
1081          * the update_* fields.)
1082          */
1083         xfs_ilock(ip, XFS_ILOCK_SHARED);
1084
1085         /* If we are flushing data then we care about update_size
1086          * being set, otherwise we care about update_core
1087          */
1088         if ((flag & FSYNC_DATA) ?
1089                         (ip->i_update_size == 0) :
1090                         (ip->i_update_core == 0)) {
1091                 /*
1092                  * Timestamps/size haven't changed since last inode
1093                  * flush or inode transaction commit.  That means
1094                  * either nothing got written or a transaction
1095                  * committed which caught the updates.  If the
1096                  * latter happened and the transaction hasn't
1097                  * hit the disk yet, the inode will be still
1098                  * be pinned.  If it is, force the log.
1099                  */
1100
1101                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1102
1103                 if (xfs_ipincount(ip)) {
1104                         _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
1105                                       XFS_LOG_FORCE |
1106                                       ((flag & FSYNC_WAIT)
1107                                        ? XFS_LOG_SYNC : 0),
1108                                       &log_flushed);
1109                 } else {
1110                         /*
1111                          * If the inode is not pinned and nothing
1112                          * has changed we don't need to flush the
1113                          * cache.
1114                          */
1115                         changed = 0;
1116                 }
1117                 error = 0;
1118         } else  {
1119                 /*
1120                  * Kick off a transaction to log the inode
1121                  * core to get the updates.  Make it
1122                  * sync if FSYNC_WAIT is passed in (which
1123                  * is done by everybody but specfs).  The
1124                  * sync transaction will also force the log.
1125                  */
1126                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1127                 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
1128                 if ((error = xfs_trans_reserve(tp, 0,
1129                                 XFS_FSYNC_TS_LOG_RES(ip->i_mount),
1130                                 0, 0, 0)))  {
1131                         xfs_trans_cancel(tp, 0);
1132                         return error;
1133                 }
1134                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1135
1136                 /*
1137                  * Note - it's possible that we might have pushed
1138                  * ourselves out of the way during trans_reserve
1139                  * which would flush the inode.  But there's no
1140                  * guarantee that the inode buffer has actually
1141                  * gone out yet (it's delwri).  Plus the buffer
1142                  * could be pinned anyway if it's part of an
1143                  * inode in another recent transaction.  So we
1144                  * play it safe and fire off the transaction anyway.
1145                  */
1146                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1147                 xfs_trans_ihold(tp, ip);
1148                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1149                 if (flag & FSYNC_WAIT)
1150                         xfs_trans_set_sync(tp);
1151                 error = _xfs_trans_commit(tp, 0, &log_flushed);
1152
1153                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1154         }
1155
1156         if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) {
1157                 /*
1158                  * If the log write didn't issue an ordered tag we need
1159                  * to flush the disk cache for the data device now.
1160                  */
1161                 if (!log_flushed)
1162                         xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
1163
1164                 /*
1165                  * If this inode is on the RT dev we need to flush that
1166                  * cache as well.
1167                  */
1168                 if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)
1169                         xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
1170         }
1171
1172         return error;
1173 }
1174
1175 /*
1176  * This is called by xfs_inactive to free any blocks beyond eof
1177  * when the link count isn't zero and by xfs_dm_punch_hole() when
1178  * punching a hole to EOF.
1179  */
1180 int
1181 xfs_free_eofblocks(
1182         xfs_mount_t     *mp,
1183         xfs_inode_t     *ip,
1184         int             flags)
1185 {
1186         xfs_trans_t     *tp;
1187         int             error;
1188         xfs_fileoff_t   end_fsb;
1189         xfs_fileoff_t   last_fsb;
1190         xfs_filblks_t   map_len;
1191         int             nimaps;
1192         xfs_bmbt_irec_t imap;
1193         int             use_iolock = (flags & XFS_FREE_EOF_LOCK);
1194
1195         /*
1196          * Figure out if there are any blocks beyond the end
1197          * of the file.  If not, then there is nothing to do.
1198          */
1199         end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size));
1200         last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
1201         map_len = last_fsb - end_fsb;
1202         if (map_len <= 0)
1203                 return 0;
1204
1205         nimaps = 1;
1206         xfs_ilock(ip, XFS_ILOCK_SHARED);
1207         error = XFS_BMAPI(mp, NULL, &ip->i_iocore, end_fsb, map_len, 0,
1208                           NULL, 0, &imap, &nimaps, NULL, NULL);
1209         xfs_iunlock(ip, XFS_ILOCK_SHARED);
1210
1211         if (!error && (nimaps != 0) &&
1212             (imap.br_startblock != HOLESTARTBLOCK ||
1213              ip->i_delayed_blks)) {
1214                 /*
1215                  * Attach the dquots to the inode up front.
1216                  */
1217                 if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
1218                         return error;
1219
1220                 /*
1221                  * There are blocks after the end of file.
1222                  * Free them up now by truncating the file to
1223                  * its current size.
1224                  */
1225                 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1226
1227                 /*
1228                  * Do the xfs_itruncate_start() call before
1229                  * reserving any log space because
1230                  * itruncate_start will call into the buffer
1231                  * cache and we can't
1232                  * do that within a transaction.
1233                  */
1234                 if (use_iolock)
1235                         xfs_ilock(ip, XFS_IOLOCK_EXCL);
1236                 error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
1237                                     ip->i_size);
1238                 if (error) {
1239                         xfs_trans_cancel(tp, 0);
1240                         if (use_iolock)
1241                                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1242                         return error;
1243                 }
1244
1245                 error = xfs_trans_reserve(tp, 0,
1246                                           XFS_ITRUNCATE_LOG_RES(mp),
1247                                           0, XFS_TRANS_PERM_LOG_RES,
1248                                           XFS_ITRUNCATE_LOG_COUNT);
1249                 if (error) {
1250                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1251                         xfs_trans_cancel(tp, 0);
1252                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1253                         return error;
1254                 }
1255
1256                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1257                 xfs_trans_ijoin(tp, ip,
1258                                 XFS_IOLOCK_EXCL |
1259                                 XFS_ILOCK_EXCL);
1260                 xfs_trans_ihold(tp, ip);
1261
1262                 error = xfs_itruncate_finish(&tp, ip,
1263                                              ip->i_size,
1264                                              XFS_DATA_FORK,
1265                                              0);
1266                 /*
1267                  * If we get an error at this point we
1268                  * simply don't bother truncating the file.
1269                  */
1270                 if (error) {
1271                         xfs_trans_cancel(tp,
1272                                          (XFS_TRANS_RELEASE_LOG_RES |
1273                                           XFS_TRANS_ABORT));
1274                 } else {
1275                         error = xfs_trans_commit(tp,
1276                                                 XFS_TRANS_RELEASE_LOG_RES);
1277                 }
1278                 xfs_iunlock(ip, (use_iolock ? (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)
1279                                             : XFS_ILOCK_EXCL));
1280         }
1281         return error;
1282 }
1283
1284 /*
1285  * Free a symlink that has blocks associated with it.
1286  */
1287 STATIC int
1288 xfs_inactive_symlink_rmt(
1289         xfs_inode_t     *ip,
1290         xfs_trans_t     **tpp)
1291 {
1292         xfs_buf_t       *bp;
1293         int             committed;
1294         int             done;
1295         int             error;
1296         xfs_fsblock_t   first_block;
1297         xfs_bmap_free_t free_list;
1298         int             i;
1299         xfs_mount_t     *mp;
1300         xfs_bmbt_irec_t mval[SYMLINK_MAPS];
1301         int             nmaps;
1302         xfs_trans_t     *ntp;
1303         int             size;
1304         xfs_trans_t     *tp;
1305
1306         tp = *tpp;
1307         mp = ip->i_mount;
1308         ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip));
1309         /*
1310          * We're freeing a symlink that has some
1311          * blocks allocated to it.  Free the
1312          * blocks here.  We know that we've got
1313          * either 1 or 2 extents and that we can
1314          * free them all in one bunmapi call.
1315          */
1316         ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
1317         if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1318                         XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
1319                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1320                 xfs_trans_cancel(tp, 0);
1321                 *tpp = NULL;
1322                 return error;
1323         }
1324         /*
1325          * Lock the inode, fix the size, and join it to the transaction.
1326          * Hold it so in the normal path, we still have it locked for
1327          * the second transaction.  In the error paths we need it
1328          * held so the cancel won't rele it, see below.
1329          */
1330         xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1331         size = (int)ip->i_d.di_size;
1332         ip->i_d.di_size = 0;
1333         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1334         xfs_trans_ihold(tp, ip);
1335         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1336         /*
1337          * Find the block(s) so we can inval and unmap them.
1338          */
1339         done = 0;
1340         XFS_BMAP_INIT(&free_list, &first_block);
1341         nmaps = ARRAY_SIZE(mval);
1342         if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
1343                         XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
1344                         &free_list, NULL)))
1345                 goto error0;
1346         /*
1347          * Invalidate the block(s).
1348          */
1349         for (i = 0; i < nmaps; i++) {
1350                 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
1351                         XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
1352                         XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
1353                 xfs_trans_binval(tp, bp);
1354         }
1355         /*
1356          * Unmap the dead block(s) to the free_list.
1357          */
1358         if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
1359                         &first_block, &free_list, NULL, &done)))
1360                 goto error1;
1361         ASSERT(done);
1362         /*
1363          * Commit the first transaction.  This logs the EFI and the inode.
1364          */
1365         if ((error = xfs_bmap_finish(&tp, &free_list, &committed)))
1366                 goto error1;
1367         /*
1368          * The transaction must have been committed, since there were
1369          * actually extents freed by xfs_bunmapi.  See xfs_bmap_finish.
1370          * The new tp has the extent freeing and EFDs.
1371          */
1372         ASSERT(committed);
1373         /*
1374          * The first xact was committed, so add the inode to the new one.
1375          * Mark it dirty so it will be logged and moved forward in the log as
1376          * part of every commit.
1377          */
1378         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1379         xfs_trans_ihold(tp, ip);
1380         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1381         /*
1382          * Get a new, empty transaction to return to our caller.
1383          */
1384         ntp = xfs_trans_dup(tp);
1385         /*
1386          * Commit the transaction containing extent freeing and EFDs.
1387          * If we get an error on the commit here or on the reserve below,
1388          * we need to unlock the inode since the new transaction doesn't
1389          * have the inode attached.
1390          */
1391         error = xfs_trans_commit(tp, 0);
1392         tp = ntp;
1393         if (error) {
1394                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1395                 goto error0;
1396         }
1397         /*
1398          * Remove the memory for extent descriptions (just bookkeeping).
1399          */
1400         if (ip->i_df.if_bytes)
1401                 xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
1402         ASSERT(ip->i_df.if_bytes == 0);
1403         /*
1404          * Put an itruncate log reservation in the new transaction
1405          * for our caller.
1406          */
1407         if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1408                         XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
1409                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1410                 goto error0;
1411         }
1412         /*
1413          * Return with the inode locked but not joined to the transaction.
1414          */
1415         *tpp = tp;
1416         return 0;
1417
1418  error1:
1419         xfs_bmap_cancel(&free_list);
1420  error0:
1421         /*
1422          * Have to come here with the inode locked and either
1423          * (held and in the transaction) or (not in the transaction).
1424          * If the inode isn't held then cancel would iput it, but
1425          * that's wrong since this is inactive and the vnode ref
1426          * count is 0 already.
1427          * Cancel won't do anything to the inode if held, but it still
1428          * needs to be locked until the cancel is done, if it was
1429          * joined to the transaction.
1430          */
1431         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1432         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1433         *tpp = NULL;
1434         return error;
1435
1436 }
1437
1438 STATIC int
1439 xfs_inactive_symlink_local(
1440         xfs_inode_t     *ip,
1441         xfs_trans_t     **tpp)
1442 {
1443         int             error;
1444
1445         ASSERT(ip->i_d.di_size <= XFS_IFORK_DSIZE(ip));
1446         /*
1447          * We're freeing a symlink which fit into
1448          * the inode.  Just free the memory used
1449          * to hold the old symlink.
1450          */
1451         error = xfs_trans_reserve(*tpp, 0,
1452                                   XFS_ITRUNCATE_LOG_RES(ip->i_mount),
1453                                   0, XFS_TRANS_PERM_LOG_RES,
1454                                   XFS_ITRUNCATE_LOG_COUNT);
1455
1456         if (error) {
1457                 xfs_trans_cancel(*tpp, 0);
1458                 *tpp = NULL;
1459                 return error;
1460         }
1461         xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1462
1463         /*
1464          * Zero length symlinks _can_ exist.
1465          */
1466         if (ip->i_df.if_bytes > 0) {
1467                 xfs_idata_realloc(ip,
1468                                   -(ip->i_df.if_bytes),
1469                                   XFS_DATA_FORK);
1470                 ASSERT(ip->i_df.if_bytes == 0);
1471         }
1472         return 0;
1473 }
1474
1475 STATIC int
1476 xfs_inactive_attrs(
1477         xfs_inode_t     *ip,
1478         xfs_trans_t     **tpp)
1479 {
1480         xfs_trans_t     *tp;
1481         int             error;
1482         xfs_mount_t     *mp;
1483
1484         ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE));
1485         tp = *tpp;
1486         mp = ip->i_mount;
1487         ASSERT(ip->i_d.di_forkoff != 0);
1488         xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1489         xfs_iunlock(ip, XFS_ILOCK_EXCL);
1490
1491         error = xfs_attr_inactive(ip);
1492         if (error) {
1493                 *tpp = NULL;
1494                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1495                 return error; /* goto out */
1496         }
1497
1498         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1499         error = xfs_trans_reserve(tp, 0,
1500                                   XFS_IFREE_LOG_RES(mp),
1501                                   0, XFS_TRANS_PERM_LOG_RES,
1502                                   XFS_INACTIVE_LOG_COUNT);
1503         if (error) {
1504                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1505                 xfs_trans_cancel(tp, 0);
1506                 *tpp = NULL;
1507                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1508                 return error;
1509         }
1510
1511         xfs_ilock(ip, XFS_ILOCK_EXCL);
1512         xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1513         xfs_trans_ihold(tp, ip);
1514         xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1515
1516         ASSERT(ip->i_d.di_anextents == 0);
1517
1518         *tpp = tp;
1519         return 0;
1520 }
1521
1522 STATIC int
1523 xfs_release(
1524         bhv_desc_t      *bdp)
1525 {
1526         xfs_inode_t     *ip;
1527         bhv_vnode_t     *vp;
1528         xfs_mount_t     *mp;
1529         int             error;
1530
1531         vp = BHV_TO_VNODE(bdp);
1532         ip = XFS_BHVTOI(bdp);
1533         mp = ip->i_mount;
1534
1535         if (!VN_ISREG(vp) || (ip->i_d.di_mode == 0))
1536                 return 0;
1537
1538         /* If this is a read-only mount, don't do this (would generate I/O) */
1539         if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
1540                 return 0;
1541
1542         if (!XFS_FORCED_SHUTDOWN(mp)) {
1543                 /*
1544                  * If we are using filestreams, and we have an unlinked
1545                  * file that we are processing the last close on, then nothing
1546                  * will be able to reopen and write to this file. Purge this
1547                  * inode from the filestreams cache so that it doesn't delay
1548                  * teardown of the inode.
1549                  */
1550                 if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
1551                         xfs_filestream_deassociate(ip);
1552
1553                 /*
1554                  * If we previously truncated this file and removed old data
1555                  * in the process, we want to initiate "early" writeout on
1556                  * the last close.  This is an attempt to combat the notorious
1557                  * NULL files problem which is particularly noticable from a
1558                  * truncate down, buffered (re-)write (delalloc), followed by
1559                  * a crash.  What we are effectively doing here is
1560                  * significantly reducing the time window where we'd otherwise
1561                  * be exposed to that problem.
1562                  */
1563                 if (VUNTRUNCATE(vp) && VN_DIRTY(vp) && ip->i_delayed_blks > 0)
1564                         bhv_vop_flush_pages(vp, 0, -1, XFS_B_ASYNC, FI_NONE);
1565         }
1566
1567 #ifdef HAVE_REFCACHE
1568         /* If we are in the NFS reference cache then don't do this now */
1569         if (ip->i_refcache)
1570                 return 0;
1571 #endif
1572
1573         if (ip->i_d.di_nlink != 0) {
1574                 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1575                      ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
1576                        ip->i_delayed_blks > 0)) &&
1577                      (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
1578                     (!(ip->i_d.di_flags &
1579                                 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
1580                         error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
1581                         if (error)
1582                                 return error;
1583                         /* Update linux inode block count after free above */
1584                         vn_to_inode(vp)->i_blocks = XFS_FSB_TO_BB(mp,
1585                                 ip->i_d.di_nblocks + ip->i_delayed_blks);
1586                 }
1587         }
1588
1589         return 0;
1590 }
1591
1592 /*
1593  * xfs_inactive
1594  *
1595  * This is called when the vnode reference count for the vnode
1596  * goes to zero.  If the file has been unlinked, then it must
1597  * now be truncated.  Also, we clear all of the read-ahead state
1598  * kept for the inode here since the file is now closed.
1599  */
1600 STATIC int
1601 xfs_inactive(
1602         bhv_desc_t      *bdp,
1603         cred_t          *credp)
1604 {
1605         xfs_inode_t     *ip;
1606         bhv_vnode_t     *vp;
1607         xfs_bmap_free_t free_list;
1608         xfs_fsblock_t   first_block;
1609         int             committed;
1610         xfs_trans_t     *tp;
1611         xfs_mount_t     *mp;
1612         int             error;
1613         int             truncate;
1614
1615         vp = BHV_TO_VNODE(bdp);
1616         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
1617
1618         ip = XFS_BHVTOI(bdp);
1619
1620         /*
1621          * If the inode is already free, then there can be nothing
1622          * to clean up here.
1623          */
1624         if (ip->i_d.di_mode == 0 || VN_BAD(vp)) {
1625                 ASSERT(ip->i_df.if_real_bytes == 0);
1626                 ASSERT(ip->i_df.if_broot_bytes == 0);
1627                 return VN_INACTIVE_CACHE;
1628         }
1629
1630         /*
1631          * Only do a truncate if it's a regular file with
1632          * some actual space in it.  It's OK to look at the
1633          * inode's fields without the lock because we're the
1634          * only one with a reference to the inode.
1635          */
1636         truncate = ((ip->i_d.di_nlink == 0) &&
1637             ((ip->i_d.di_size != 0) || (ip->i_size != 0) ||
1638              (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) &&
1639             ((ip->i_d.di_mode & S_IFMT) == S_IFREG));
1640
1641         mp = ip->i_mount;
1642
1643         if (ip->i_d.di_nlink == 0 &&
1644             DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_DESTROY)) {
1645                 (void) XFS_SEND_DESTROY(mp, vp, DM_RIGHT_NULL);
1646         }
1647
1648         error = 0;
1649
1650         /* If this is a read-only mount, don't do this (would generate I/O) */
1651         if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
1652                 goto out;
1653
1654         if (ip->i_d.di_nlink != 0) {
1655                 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1656                      ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
1657                        ip->i_delayed_blks > 0)) &&
1658                       (ip->i_df.if_flags & XFS_IFEXTENTS) &&
1659                      (!(ip->i_d.di_flags &
1660                                 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
1661                       (ip->i_delayed_blks != 0)))) {
1662                         error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
1663                         if (error)
1664                                 return VN_INACTIVE_CACHE;
1665                         /* Update linux inode block count after free above */
1666                         vn_to_inode(vp)->i_blocks = XFS_FSB_TO_BB(mp,
1667                                 ip->i_d.di_nblocks + ip->i_delayed_blks);
1668                 }
1669                 goto out;
1670         }
1671
1672         ASSERT(ip->i_d.di_nlink == 0);
1673
1674         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
1675                 return VN_INACTIVE_CACHE;
1676
1677         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1678         if (truncate) {
1679                 /*
1680                  * Do the xfs_itruncate_start() call before
1681                  * reserving any log space because itruncate_start
1682                  * will call into the buffer cache and we can't
1683                  * do that within a transaction.
1684                  */
1685                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
1686
1687                 error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0);
1688                 if (error) {
1689                         xfs_trans_cancel(tp, 0);
1690                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1691                         return VN_INACTIVE_CACHE;
1692                 }
1693
1694                 error = xfs_trans_reserve(tp, 0,
1695                                           XFS_ITRUNCATE_LOG_RES(mp),
1696                                           0, XFS_TRANS_PERM_LOG_RES,
1697                                           XFS_ITRUNCATE_LOG_COUNT);
1698                 if (error) {
1699                         /* Don't call itruncate_cleanup */
1700                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1701                         xfs_trans_cancel(tp, 0);
1702                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1703                         return VN_INACTIVE_CACHE;
1704                 }
1705
1706                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1707                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1708                 xfs_trans_ihold(tp, ip);
1709
1710                 /*
1711                  * normally, we have to run xfs_itruncate_finish sync.
1712                  * But if filesystem is wsync and we're in the inactive
1713                  * path, then we know that nlink == 0, and that the
1714                  * xaction that made nlink == 0 is permanently committed
1715                  * since xfs_remove runs as a synchronous transaction.
1716                  */
1717                 error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK,
1718                                 (!(mp->m_flags & XFS_MOUNT_WSYNC) ? 1 : 0));
1719
1720                 if (error) {
1721                         xfs_trans_cancel(tp,
1722                                 XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1723                         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1724                         return VN_INACTIVE_CACHE;
1725                 }
1726         } else if ((ip->i_d.di_mode & S_IFMT) == S_IFLNK) {
1727
1728                 /*
1729                  * If we get an error while cleaning up a
1730                  * symlink we bail out.
1731                  */
1732                 error = (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) ?
1733                         xfs_inactive_symlink_rmt(ip, &tp) :
1734                         xfs_inactive_symlink_local(ip, &tp);
1735
1736                 if (error) {
1737                         ASSERT(tp == NULL);
1738                         return VN_INACTIVE_CACHE;
1739                 }
1740
1741                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1742                 xfs_trans_ihold(tp, ip);
1743         } else {
1744                 error = xfs_trans_reserve(tp, 0,
1745                                           XFS_IFREE_LOG_RES(mp),
1746                                           0, XFS_TRANS_PERM_LOG_RES,
1747                                           XFS_INACTIVE_LOG_COUNT);
1748                 if (error) {
1749                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1750                         xfs_trans_cancel(tp, 0);
1751                         return VN_INACTIVE_CACHE;
1752                 }
1753
1754                 xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1755                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1756                 xfs_trans_ihold(tp, ip);
1757         }
1758
1759         /*
1760          * If there are attributes associated with the file
1761          * then blow them away now.  The code calls a routine
1762          * that recursively deconstructs the attribute fork.
1763          * We need to just commit the current transaction
1764          * because we can't use it for xfs_attr_inactive().
1765          */
1766         if (ip->i_d.di_anextents > 0) {
1767                 error = xfs_inactive_attrs(ip, &tp);
1768                 /*
1769                  * If we got an error, the transaction is already
1770                  * cancelled, and the inode is unlocked. Just get out.
1771                  */
1772                  if (error)
1773                          return VN_INACTIVE_CACHE;
1774         } else if (ip->i_afp) {
1775                 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1776         }
1777
1778         /*
1779          * Free the inode.
1780          */
1781         XFS_BMAP_INIT(&free_list, &first_block);
1782         error = xfs_ifree(tp, ip, &free_list);
1783         if (error) {
1784                 /*
1785                  * If we fail to free the inode, shut down.  The cancel
1786                  * might do that, we need to make sure.  Otherwise the
1787                  * inode might be lost for a long time or forever.
1788                  */
1789                 if (!XFS_FORCED_SHUTDOWN(mp)) {
1790                         cmn_err(CE_NOTE,
1791                 "xfs_inactive:  xfs_ifree() returned an error = %d on %s",
1792                                 error, mp->m_fsname);
1793                         xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1794                 }
1795                 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
1796         } else {
1797                 /*
1798                  * Credit the quota account(s). The inode is gone.
1799                  */
1800                 XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
1801
1802                 /*
1803                  * Just ignore errors at this point.  There is
1804                  * nothing we can do except to try to keep going.
1805                  */
1806                 (void) xfs_bmap_finish(&tp,  &free_list, &committed);
1807                 (void) xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1808         }
1809         /*
1810          * Release the dquots held by inode, if any.
1811          */
1812         XFS_QM_DQDETACH(mp, ip);
1813
1814         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1815
1816  out:
1817         return VN_INACTIVE_CACHE;
1818 }
1819
1820
1821 /*
1822  * xfs_lookup
1823  */
1824 STATIC int
1825 xfs_lookup(
1826         bhv_desc_t              *dir_bdp,
1827         bhv_vname_t             *dentry,
1828         bhv_vnode_t             **vpp,
1829         int                     flags,
1830         bhv_vnode_t             *rdir,
1831         cred_t                  *credp)
1832 {
1833         xfs_inode_t             *dp, *ip;
1834         xfs_ino_t               e_inum;
1835         int                     error;
1836         uint                    lock_mode;
1837         bhv_vnode_t             *dir_vp;
1838
1839         dir_vp = BHV_TO_VNODE(dir_bdp);
1840         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
1841
1842         dp = XFS_BHVTOI(dir_bdp);
1843
1844         if (XFS_FORCED_SHUTDOWN(dp->i_mount))
1845                 return XFS_ERROR(EIO);
1846
1847         lock_mode = xfs_ilock_map_shared(dp);
1848         error = xfs_dir_lookup_int(dir_bdp, lock_mode, dentry, &e_inum, &ip);
1849         if (!error) {
1850                 *vpp = XFS_ITOV(ip);
1851                 ITRACE(ip);
1852         }
1853         xfs_iunlock_map_shared(dp, lock_mode);
1854         return error;
1855 }
1856
1857
1858 /*
1859  * xfs_create (create a new file).
1860  */
1861 STATIC int
1862 xfs_create(
1863         bhv_desc_t              *dir_bdp,
1864         bhv_vname_t             *dentry,
1865         bhv_vattr_t             *vap,
1866         bhv_vnode_t             **vpp,
1867         cred_t                  *credp)
1868 {
1869         char                    *name = VNAME(dentry);
1870         bhv_vnode_t             *dir_vp;
1871         xfs_inode_t             *dp, *ip;
1872         bhv_vnode_t             *vp = NULL;
1873         xfs_trans_t             *tp;
1874         xfs_mount_t             *mp;
1875         xfs_dev_t               rdev;
1876         int                     error;
1877         xfs_bmap_free_t         free_list;
1878         xfs_fsblock_t           first_block;
1879         boolean_t               dp_joined_to_trans;
1880         int                     dm_event_sent = 0;
1881         uint                    cancel_flags;
1882         int                     committed;
1883         xfs_prid_t              prid;
1884         struct xfs_dquot        *udqp, *gdqp;
1885         uint                    resblks;
1886         int                     dm_di_mode;
1887         int                     namelen;
1888
1889         ASSERT(!*vpp);
1890         dir_vp = BHV_TO_VNODE(dir_bdp);
1891         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
1892
1893         dp = XFS_BHVTOI(dir_bdp);
1894         mp = dp->i_mount;
1895
1896         dm_di_mode = vap->va_mode;
1897         namelen = VNAMELEN(dentry);
1898
1899         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) {
1900                 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
1901                                 dir_vp, DM_RIGHT_NULL, NULL,
1902                                 DM_RIGHT_NULL, name, NULL,
1903                                 dm_di_mode, 0, 0);
1904
1905                 if (error)
1906                         return error;
1907                 dm_event_sent = 1;
1908         }
1909
1910         if (XFS_FORCED_SHUTDOWN(mp))
1911                 return XFS_ERROR(EIO);
1912
1913         /* Return through std_return after this point. */
1914
1915         udqp = gdqp = NULL;
1916         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
1917                 prid = dp->i_d.di_projid;
1918         else if (vap->va_mask & XFS_AT_PROJID)
1919                 prid = (xfs_prid_t)vap->va_projid;
1920         else
1921                 prid = (xfs_prid_t)dfltprid;
1922
1923         /*
1924          * Make sure that we have allocated dquot(s) on disk.
1925          */
1926         error = XFS_QM_DQVOPALLOC(mp, dp,
1927                         current_fsuid(credp), current_fsgid(credp), prid,
1928                         XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp);
1929         if (error)
1930                 goto std_return;
1931
1932         ip = NULL;
1933         dp_joined_to_trans = B_FALSE;
1934
1935         tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
1936         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1937         resblks = XFS_CREATE_SPACE_RES(mp, namelen);
1938         /*
1939          * Initially assume that the file does not exist and
1940          * reserve the resources for that case.  If that is not
1941          * the case we'll drop the one we have and get a more
1942          * appropriate transaction later.
1943          */
1944         error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
1945                         XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
1946         if (error == ENOSPC) {
1947                 resblks = 0;
1948                 error = xfs_trans_reserve(tp, 0, XFS_CREATE_LOG_RES(mp), 0,
1949                                 XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
1950         }
1951         if (error) {
1952                 cancel_flags = 0;
1953                 dp = NULL;
1954                 goto error_return;
1955         }
1956
1957         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
1958
1959         XFS_BMAP_INIT(&free_list, &first_block);
1960
1961         ASSERT(ip == NULL);
1962
1963         /*
1964          * Reserve disk quota and the inode.
1965          */
1966         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
1967         if (error)
1968                 goto error_return;
1969
1970         if (resblks == 0 && (error = xfs_dir_canenter(tp, dp, name, namelen)))
1971                 goto error_return;
1972         rdev = (vap->va_mask & XFS_AT_RDEV) ? vap->va_rdev : 0;
1973         error = xfs_dir_ialloc(&tp, dp, vap->va_mode, 1,
1974                         rdev, credp, prid, resblks > 0,
1975                         &ip, &committed);
1976         if (error) {
1977                 if (error == ENOSPC)
1978                         goto error_return;
1979                 goto abort_return;
1980         }
1981         ITRACE(ip);
1982
1983         /*
1984          * At this point, we've gotten a newly allocated inode.
1985          * It is locked (and joined to the transaction).
1986          */
1987
1988         ASSERT(ismrlocked (&ip->i_lock, MR_UPDATE));
1989
1990         /*
1991          * Now we join the directory inode to the transaction.
1992          * We do not do it earlier because xfs_dir_ialloc
1993          * might commit the previous transaction (and release
1994          * all the locks).
1995          */
1996
1997         VN_HOLD(dir_vp);
1998         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
1999         dp_joined_to_trans = B_TRUE;
2000
2001         error = xfs_dir_createname(tp, dp, name, namelen, ip->i_ino,
2002                                         &first_block, &free_list, resblks ?
2003                                         resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
2004         if (error) {
2005                 ASSERT(error != ENOSPC);
2006                 goto abort_return;
2007         }
2008         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2009         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2010
2011         /*
2012          * If this is a synchronous mount, make sure that the
2013          * create transaction goes to disk before returning to
2014          * the user.
2015          */
2016         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2017                 xfs_trans_set_sync(tp);
2018         }
2019
2020         dp->i_gen++;
2021
2022         /*
2023          * Attach the dquot(s) to the inodes and modify them incore.
2024          * These ids of the inode couldn't have changed since the new
2025          * inode has been locked ever since it was created.
2026          */
2027         XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
2028
2029         /*
2030          * xfs_trans_commit normally decrements the vnode ref count
2031          * when it unlocks the inode. Since we want to return the
2032          * vnode to the caller, we bump the vnode ref count now.
2033          */
2034         IHOLD(ip);
2035         vp = XFS_ITOV(ip);
2036
2037         error = xfs_bmap_finish(&tp, &free_list, &committed);
2038         if (error) {
2039                 xfs_bmap_cancel(&free_list);
2040                 goto abort_rele;
2041         }
2042
2043         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2044         if (error) {
2045                 IRELE(ip);
2046                 tp = NULL;
2047                 goto error_return;
2048         }
2049
2050         XFS_QM_DQRELE(mp, udqp);
2051         XFS_QM_DQRELE(mp, gdqp);
2052
2053         /*
2054          * Propagate the fact that the vnode changed after the
2055          * xfs_inode locks have been released.
2056          */
2057         bhv_vop_vnode_change(vp, VCHANGE_FLAGS_TRUNCATED, 3);
2058
2059         *vpp = vp;
2060
2061         /* Fallthrough to std_return with error = 0  */
2062
2063 std_return:
2064         if ( (*vpp || (error != 0 && dm_event_sent != 0)) &&
2065                         DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
2066                                                         DM_EVENT_POSTCREATE)) {
2067                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
2068                         dir_vp, DM_RIGHT_NULL,
2069                         *vpp ? vp:NULL,
2070                         DM_RIGHT_NULL, name, NULL,
2071                         dm_di_mode, error, 0);
2072         }
2073         return error;
2074
2075  abort_return:
2076         cancel_flags |= XFS_TRANS_ABORT;
2077         /* FALLTHROUGH */
2078
2079  error_return:
2080         if (tp != NULL)
2081                 xfs_trans_cancel(tp, cancel_flags);
2082
2083         if (!dp_joined_to_trans && (dp != NULL))
2084                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2085         XFS_QM_DQRELE(mp, udqp);
2086         XFS_QM_DQRELE(mp, gdqp);
2087
2088         goto std_return;
2089
2090  abort_rele:
2091         /*
2092          * Wait until after the current transaction is aborted to
2093          * release the inode.  This prevents recursive transactions
2094          * and deadlocks from xfs_inactive.
2095          */
2096         cancel_flags |= XFS_TRANS_ABORT;
2097         xfs_trans_cancel(tp, cancel_flags);
2098         IRELE(ip);
2099
2100         XFS_QM_DQRELE(mp, udqp);
2101         XFS_QM_DQRELE(mp, gdqp);
2102
2103         goto std_return;
2104 }
2105
#ifdef DEBUG
/*
 * Some counters to see if (and how often) we are hitting some deadlock
 * prevention code paths in xfs_lock_dir_and_entry().
 */
int	xfs_rm_locks,		/* calls to xfs_lock_dir_and_entry() */
	xfs_rm_lock_delays,	/* times we slept in delay(1) before retrying */
	xfs_rm_attempts;	/* failed xfs_ilock_nowait() attempts */
#endif
2116
2117 /*
2118  * The following routine will lock the inodes associated with the
2119  * directory and the named entry in the directory. The locks are
2120  * acquired in increasing inode number.
2121  *
2122  * If the entry is "..", then only the directory is locked. The
2123  * vnode ref count will still include that from the .. entry in
2124  * this case.
2125  *
2126  * There is a deadlock we need to worry about. If the locked directory is
2127  * in the AIL, it might be blocking up the log. The next inode we lock
2128  * could be already locked by another thread waiting for log space (e.g
2129  * a permanent log reservation with a long running transaction (see
2130  * xfs_itruncate_finish)). To solve this, we must check if the directory
2131  * is in the ail and use lock_nowait. If we can't lock, we need to
2132  * drop the inode lock on the directory and try again. xfs_iunlock will
2133  * potentially push the tail if we were holding up the log.
2134  */
2135 STATIC int
2136 xfs_lock_dir_and_entry(
2137         xfs_inode_t     *dp,
2138         xfs_inode_t     *ip)    /* inode of entry 'name' */
2139 {
2140         int             attempts;
2141         xfs_ino_t       e_inum;
2142         xfs_inode_t     *ips[2];
2143         xfs_log_item_t  *lp;
2144
2145 #ifdef DEBUG
2146         xfs_rm_locks++;
2147 #endif
2148         attempts = 0;
2149
2150 again:
2151         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
2152
2153         e_inum = ip->i_ino;
2154
2155         ITRACE(ip);
2156
2157         /*
2158          * We want to lock in increasing inum. Since we've already
2159          * acquired the lock on the directory, we may need to release
2160          * if if the inum of the entry turns out to be less.
2161          */
2162         if (e_inum > dp->i_ino) {
2163                 /*
2164                  * We are already in the right order, so just
2165                  * lock on the inode of the entry.
2166                  * We need to use nowait if dp is in the AIL.
2167                  */
2168
2169                 lp = (xfs_log_item_t *)dp->i_itemp;
2170                 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
2171                         if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2172                                 attempts++;
2173 #ifdef DEBUG
2174                                 xfs_rm_attempts++;
2175 #endif
2176
2177                                 /*
2178                                  * Unlock dp and try again.
2179                                  * xfs_iunlock will try to push the tail
2180                                  * if the inode is in the AIL.
2181                                  */
2182
2183                                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2184
2185                                 if ((attempts % 5) == 0) {
2186                                         delay(1); /* Don't just spin the CPU */
2187 #ifdef DEBUG
2188                                         xfs_rm_lock_delays++;
2189 #endif
2190                                 }
2191                                 goto again;
2192                         }
2193                 } else {
2194                         xfs_ilock(ip, XFS_ILOCK_EXCL);
2195                 }
2196         } else if (e_inum < dp->i_ino) {
2197                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2198
2199                 ips[0] = ip;
2200                 ips[1] = dp;
2201                 xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
2202         }
2203         /* else  e_inum == dp->i_ino */
2204         /*     This can happen if we're asked to lock /x/..
2205          *     the entry is "..", which is also the parent directory.
2206          */
2207
2208         return 0;
2209 }
2210
#ifdef DEBUG
/* Statistics kept by xfs_lock_inodes(). */
int	xfs_locked_n,		/* calls that needed no retry */
	xfs_small_retries,	/* calls that retried fewer than 5 times */
	xfs_middle_retries,	/* calls that retried 5 to 99 times */
	xfs_lots_retries,	/* calls that retried 100 times or more */
	xfs_lock_delays;	/* times we slept in delay(1) before retrying */
#endif
2218
2219 /*
2220  * Bump the subclass so xfs_lock_inodes() acquires each lock with
2221  * a different value
2222  */
2223 static inline int
2224 xfs_lock_inumorder(int lock_mode, int subclass)
2225 {
2226         if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
2227                 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
2228         if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
2229                 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
2230
2231         return lock_mode;
2232 }
2233
2234 /*
2235  * The following routine will lock n inodes in exclusive mode.
2236  * We assume the caller calls us with the inodes in i_ino order.
2237  *
2238  * We need to detect deadlock where an inode that we lock
2239  * is in the AIL and we start waiting for another inode that is locked
2240  * by a thread in a long running transaction (such as truncate). This can
2241  * result in deadlock since the long running trans might need to wait
2242  * for the inode we just locked in order to push the tail and free space
2243  * in the log.
2244  */
2245 void
2246 xfs_lock_inodes(
2247         xfs_inode_t     **ips,
2248         int             inodes,
2249         int             first_locked,
2250         uint            lock_mode)
2251 {
2252         int             attempts = 0, i, j, try_lock;
2253         xfs_log_item_t  *lp;
2254
2255         ASSERT(ips && (inodes >= 2)); /* we need at least two */
2256
2257         if (first_locked) {
2258                 try_lock = 1;
2259                 i = 1;
2260         } else {
2261                 try_lock = 0;
2262                 i = 0;
2263         }
2264
2265 again:
2266         for (; i < inodes; i++) {
2267                 ASSERT(ips[i]);
2268
2269                 if (i && (ips[i] == ips[i-1]))  /* Already locked */
2270                         continue;
2271
2272                 /*
2273                  * If try_lock is not set yet, make sure all locked inodes
2274                  * are not in the AIL.
2275                  * If any are, set try_lock to be used later.
2276                  */
2277
2278                 if (!try_lock) {
2279                         for (j = (i - 1); j >= 0 && !try_lock; j--) {
2280                                 lp = (xfs_log_item_t *)ips[j]->i_itemp;
2281                                 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
2282                                         try_lock++;
2283                                 }
2284                         }
2285                 }
2286
2287                 /*
2288                  * If any of the previous locks we have locked is in the AIL,
2289                  * we must TRY to get the second and subsequent locks. If
2290                  * we can't get any, we must release all we have
2291                  * and try again.
2292                  */
2293
2294                 if (try_lock) {
2295                         /* try_lock must be 0 if i is 0. */
2296                         /*
2297                          * try_lock means we have an inode locked
2298                          * that is in the AIL.
2299                          */
2300                         ASSERT(i != 0);
2301                         if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
2302                                 attempts++;
2303
2304                                 /*
2305                                  * Unlock all previous guys and try again.
2306                                  * xfs_iunlock will try to push the tail
2307                                  * if the inode is in the AIL.
2308                                  */
2309
2310                                 for(j = i - 1; j >= 0; j--) {
2311
2312                                         /*
2313                                          * Check to see if we've already
2314                                          * unlocked this one.
2315                                          * Not the first one going back,
2316                                          * and the inode ptr is the same.
2317                                          */
2318                                         if ((j != (i - 1)) && ips[j] ==
2319                                                                 ips[j+1])
2320                                                 continue;
2321
2322                                         xfs_iunlock(ips[j], lock_mode);
2323                                 }
2324
2325                                 if ((attempts % 5) == 0) {
2326                                         delay(1); /* Don't just spin the CPU */
2327 #ifdef DEBUG
2328                                         xfs_lock_delays++;
2329 #endif
2330                                 }
2331                                 i = 0;
2332                                 try_lock = 0;
2333                                 goto again;
2334                         }
2335                 } else {
2336                         xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
2337                 }
2338         }
2339
2340 #ifdef DEBUG
2341         if (attempts) {
2342                 if (attempts < 5) xfs_small_retries++;
2343                 else if (attempts < 100) xfs_middle_retries++;
2344                 else xfs_lots_retries++;
2345         } else {
2346                 xfs_locked_n++;
2347         }
2348 #endif
2349 }
2350
/*
 * REMOVE_DEBUG_TRACE(x)
 *
 * On DEBUG builds, record x in the global remove_which_error_return;
 * callers in this file pass __LINE__.  On non-DEBUG builds the macro
 * does nothing.
 *
 * Both variants expand to exactly one statement that needs a trailing
 * semicolon, so the macro can be used safely as the body of an unbraced
 * if/else.
 */
#ifdef  DEBUG
int remove_which_error_return = 0;
#define REMOVE_DEBUG_TRACE(x)   \
        do { remove_which_error_return = (x); } while (0)
#else /* ! DEBUG */
#define REMOVE_DEBUG_TRACE(x)   do { } while (0)
#endif  /* ! DEBUG */
2357
2358
2359 /*
2360  * xfs_remove
2361  *
2362  */
2363 STATIC int
2364 xfs_remove(
2365         bhv_desc_t              *dir_bdp,
2366         bhv_vname_t             *dentry,
2367         cred_t                  *credp)
2368 {
2369         bhv_vnode_t             *dir_vp;
2370         char                    *name = VNAME(dentry);
2371         xfs_inode_t             *dp, *ip;
2372         xfs_trans_t             *tp = NULL;
2373         xfs_mount_t             *mp;
2374         int                     error = 0;
2375         xfs_bmap_free_t         free_list;
2376         xfs_fsblock_t           first_block;
2377         int                     cancel_flags;
2378         int                     committed;
2379         int                     dm_di_mode = 0;
2380         int                     link_zero;
2381         uint                    resblks;
2382         int                     namelen;
2383
2384         dir_vp = BHV_TO_VNODE(dir_bdp);
2385         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
2386
2387         dp = XFS_BHVTOI(dir_bdp);
2388         mp = dp->i_mount;
2389
2390         if (XFS_FORCED_SHUTDOWN(mp))
2391                 return XFS_ERROR(EIO);
2392
2393         namelen = VNAMELEN(dentry);
2394
2395         if (!xfs_get_dir_entry(dentry, &ip)) {
2396                 dm_di_mode = ip->i_d.di_mode;
2397                 IRELE(ip);
2398         }
2399
2400         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) {
2401                 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dir_vp,
2402                                         DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
2403                                         name, NULL, dm_di_mode, 0, 0);
2404                 if (error)
2405                         return error;
2406         }
2407
2408         /* From this point on, return through std_return */
2409         ip = NULL;
2410
2411         /*
2412          * We need to get a reference to ip before we get our log
2413          * reservation. The reason for this is that we cannot call
2414          * xfs_iget for an inode for which we do not have a reference
2415          * once we've acquired a log reservation. This is because the
2416          * inode we are trying to get might be in xfs_inactive going
2417          * for a log reservation. Since we'll have to wait for the
2418          * inactive code to complete before returning from xfs_iget,
2419          * we need to make sure that we don't have log space reserved
2420          * when we call xfs_iget.  Instead we get an unlocked reference
2421          * to the inode before getting our log reservation.
2422          */
2423         error = xfs_get_dir_entry(dentry, &ip);
2424         if (error) {
2425                 REMOVE_DEBUG_TRACE(__LINE__);
2426                 goto std_return;
2427         }
2428
2429         dm_di_mode = ip->i_d.di_mode;
2430
2431         vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
2432
2433         ITRACE(ip);
2434
2435         error = XFS_QM_DQATTACH(mp, dp, 0);
2436         if (!error && dp != ip)
2437                 error = XFS_QM_DQATTACH(mp, ip, 0);
2438         if (error) {
2439                 REMOVE_DEBUG_TRACE(__LINE__);
2440                 IRELE(ip);
2441                 goto std_return;
2442         }
2443
2444         tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
2445         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2446         /*
2447          * We try to get the real space reservation first,
2448          * allowing for directory btree deletion(s) implying
2449          * possible bmap insert(s).  If we can't get the space
2450          * reservation then we use 0 instead, and avoid the bmap
2451          * btree insert(s) in the directory code by, if the bmap
2452          * insert tries to happen, instead trimming the LAST
2453          * block from the directory.
2454          */
2455         resblks = XFS_REMOVE_SPACE_RES(mp);
2456         error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
2457                         XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2458         if (error == ENOSPC) {
2459                 resblks = 0;
2460                 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
2461                                 XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2462         }
2463         if (error) {
2464                 ASSERT(error != ENOSPC);
2465                 REMOVE_DEBUG_TRACE(__LINE__);
2466                 xfs_trans_cancel(tp, 0);
2467                 IRELE(ip);
2468                 return error;
2469         }
2470
2471         error = xfs_lock_dir_and_entry(dp, ip);
2472         if (error) {
2473                 REMOVE_DEBUG_TRACE(__LINE__);
2474                 xfs_trans_cancel(tp, cancel_flags);
2475                 IRELE(ip);
2476                 goto std_return;
2477         }
2478
2479         /*
2480          * At this point, we've gotten both the directory and the entry
2481          * inodes locked.
2482          */
2483         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2484         if (dp != ip) {
2485                 /*
2486                  * Increment vnode ref count only in this case since
2487                  * there's an extra vnode reference in the case where
2488                  * dp == ip.
2489                  */
2490                 IHOLD(dp);
2491                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2492         }
2493
2494         /*
2495          * Entry must exist since we did a lookup in xfs_lock_dir_and_entry.
2496          */
2497         XFS_BMAP_INIT(&free_list, &first_block);
2498         error = xfs_dir_removename(tp, dp, name, namelen, ip->i_ino,
2499                                         &first_block, &free_list, 0);
2500         if (error) {
2501                 ASSERT(error != ENOENT);
2502                 REMOVE_DEBUG_TRACE(__LINE__);
2503                 goto error1;
2504         }
2505         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2506
2507         dp->i_gen++;
2508         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2509
2510         error = xfs_droplink(tp, ip);
2511         if (error) {
2512                 REMOVE_DEBUG_TRACE(__LINE__);
2513                 goto error1;
2514         }
2515
2516         /* Determine if this is the last link while
2517          * we are in the transaction.
2518          */
2519         link_zero = (ip)->i_d.di_nlink==0;
2520
2521         /*
2522          * Take an extra ref on the inode so that it doesn't
2523          * go to xfs_inactive() from within the commit.
2524          */
2525         IHOLD(ip);
2526
2527         /*
2528          * If this is a synchronous mount, make sure that the
2529          * remove transaction goes to disk before returning to
2530          * the user.
2531          */
2532         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2533                 xfs_trans_set_sync(tp);
2534         }
2535
2536         error = xfs_bmap_finish(&tp, &free_list, &committed);
2537         if (error) {
2538                 REMOVE_DEBUG_TRACE(__LINE__);
2539                 goto error_rele;
2540         }
2541
2542         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2543         if (error) {
2544                 IRELE(ip);
2545                 goto std_return;
2546         }
2547
2548         /*
2549          * Before we drop our extra reference to the inode, purge it
2550          * from the refcache if it is there.  By waiting until afterwards
2551          * to do the IRELE, we ensure that we won't go inactive in the
2552          * xfs_refcache_purge_ip routine (although that would be OK).
2553          */
2554         xfs_refcache_purge_ip(ip);
2555
2556         /*
2557          * If we are using filestreams, kill the stream association.
2558          * If the file is still open it may get a new one but that
2559          * will get killed on last close in xfs_close() so we don't
2560          * have to worry about that.
2561          */
2562         if (link_zero && xfs_inode_is_filestream(ip))
2563                 xfs_filestream_deassociate(ip);
2564
2565         vn_trace_exit(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
2566
2567         /*
2568          * Let interposed file systems know about removed links.
2569          */
2570         bhv_vop_link_removed(XFS_ITOV(ip), dir_vp, link_zero);
2571
2572         IRELE(ip);
2573
2574 /*      Fall through to std_return with error = 0 */
2575  std_return:
2576         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp,
2577                                                 DM_EVENT_POSTREMOVE)) {
2578                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
2579                                 dir_vp, DM_RIGHT_NULL,
2580                                 NULL, DM_RIGHT_NULL,
2581                                 name, NULL, dm_di_mode, error, 0);
2582         }
2583         return error;
2584
2585  error1:
2586         xfs_bmap_cancel(&free_list);
2587         cancel_flags |= XFS_TRANS_ABORT;
2588         xfs_trans_cancel(tp, cancel_flags);
2589         goto std_return;
2590
2591  error_rele:
2592         /*
2593          * In this case make sure to not release the inode until after
2594          * the current transaction is aborted.  Releasing it beforehand
2595          * can cause us to go to xfs_inactive and start a recursive
2596          * transaction which can easily deadlock with the current one.
2597          */
2598         xfs_bmap_cancel(&free_list);
2599         cancel_flags |= XFS_TRANS_ABORT;
2600         xfs_trans_cancel(tp, cancel_flags);
2601
2602         /*
2603          * Before we drop our extra reference to the inode, purge it
2604          * from the refcache if it is there.  By waiting until afterwards
2605          * to do the IRELE, we ensure that we won't go inactive in the
2606          * xfs_refcache_purge_ip routine (although that would be OK).
2607          */
2608         xfs_refcache_purge_ip(ip);
2609
2610         IRELE(ip);
2611
2612         goto std_return;
2613 }
2614
2615
2616 /*
2617  * xfs_link
2618  *
2619  */
2620 STATIC int
2621 xfs_link(
2622         bhv_desc_t              *target_dir_bdp,
2623         bhv_vnode_t             *src_vp,
2624         bhv_vname_t             *dentry,
2625         cred_t                  *credp)
2626 {
2627         xfs_inode_t             *tdp, *sip;
2628         xfs_trans_t             *tp;
2629         xfs_mount_t             *mp;
2630         xfs_inode_t             *ips[2];
2631         int                     error;
2632         xfs_bmap_free_t         free_list;
2633         xfs_fsblock_t           first_block;
2634         int                     cancel_flags;
2635         int                     committed;
2636         bhv_vnode_t             *target_dir_vp;
2637         int                     resblks;
2638         char                    *target_name = VNAME(dentry);
2639         int                     target_namelen;
2640
2641         target_dir_vp = BHV_TO_VNODE(target_dir_bdp);
2642         vn_trace_entry(target_dir_vp, __FUNCTION__, (inst_t *)__return_address);
2643         vn_trace_entry(src_vp, __FUNCTION__, (inst_t *)__return_address);
2644
2645         target_namelen = VNAMELEN(dentry);
2646         ASSERT(!VN_ISDIR(src_vp));
2647
2648         sip = xfs_vtoi(src_vp);
2649         tdp = XFS_BHVTOI(target_dir_bdp);
2650         mp = tdp->i_mount;
2651         if (XFS_FORCED_SHUTDOWN(mp))
2652                 return XFS_ERROR(EIO);
2653
2654         if (DM_EVENT_ENABLED(src_vp->v_vfsp, tdp, DM_EVENT_LINK)) {
2655                 error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK,
2656                                         target_dir_vp, DM_RIGHT_NULL,
2657                                         src_vp, DM_RIGHT_NULL,
2658                                         target_name, NULL, 0, 0, 0);
2659                 if (error)
2660                         return error;
2661         }
2662
2663         /* Return through std_return after this point. */
2664
2665         error = XFS_QM_DQATTACH(mp, sip, 0);
2666         if (!error && sip != tdp)
2667                 error = XFS_QM_DQATTACH(mp, tdp, 0);
2668         if (error)
2669                 goto std_return;
2670
2671         tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
2672         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2673         resblks = XFS_LINK_SPACE_RES(mp, target_namelen);
2674         error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
2675                         XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2676         if (error == ENOSPC) {
2677                 resblks = 0;
2678                 error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
2679                                 XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2680         }
2681         if (error) {
2682                 cancel_flags = 0;
2683                 goto error_return;
2684         }
2685
2686         if (sip->i_ino < tdp->i_ino) {
2687                 ips[0] = sip;
2688                 ips[1] = tdp;
2689         } else {
2690                 ips[0] = tdp;
2691                 ips[1] = sip;
2692         }
2693
2694         xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
2695
2696         /*
2697          * Increment vnode ref counts since xfs_trans_commit &
2698          * xfs_trans_cancel will both unlock the inodes and
2699          * decrement the associated ref counts.
2700          */
2701         VN_HOLD(src_vp);
2702         VN_HOLD(target_dir_vp);
2703         xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
2704         xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
2705
2706         /*
2707          * If the source has too many links, we can't make any more to it.
2708          */
2709         if (sip->i_d.di_nlink >= XFS_MAXLINK) {
2710                 error = XFS_ERROR(EMLINK);
2711                 goto error_return;
2712         }
2713
2714         /*
2715          * If we are using project inheritance, we only allow hard link
2716          * creation in our tree when the project IDs are the same; else
2717          * the tree quota mechanism could be circumvented.
2718          */
2719         if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
2720                      (tdp->i_d.di_projid != sip->i_d.di_projid))) {
2721                 error = XFS_ERROR(EXDEV);
2722                 goto error_return;
2723         }
2724
2725         if (resblks == 0 &&
2726             (error = xfs_dir_canenter(tp, tdp, target_name, target_namelen)))
2727                 goto error_return;
2728
2729         XFS_BMAP_INIT(&free_list, &first_block);
2730
2731         error = xfs_dir_createname(tp, tdp, target_name, target_namelen,
2732                                    sip->i_ino, &first_block, &free_list,
2733                                    resblks);
2734         if (error)
2735                 goto abort_return;
2736         xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2737         tdp->i_gen++;
2738         xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
2739
2740         error = xfs_bumplink(tp, sip);
2741         if (error)
2742                 goto abort_return;
2743
2744         /*
2745          * If this is a synchronous mount, make sure that the
2746          * link transaction goes to disk before returning to
2747          * the user.
2748          */
2749         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2750                 xfs_trans_set_sync(tp);
2751         }
2752
2753         error = xfs_bmap_finish (&tp, &free_list, &committed);
2754         if (error) {
2755                 xfs_bmap_cancel(&free_list);
2756                 goto abort_return;
2757         }
2758
2759         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2760         if (error)
2761                 goto std_return;
2762
2763         /* Fall through to std_return with error = 0. */
2764 std_return:
2765         if (DM_EVENT_ENABLED(src_vp->v_vfsp, sip,
2766                                                 DM_EVENT_POSTLINK)) {
2767                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK,
2768                                 target_dir_vp, DM_RIGHT_NULL,
2769                                 src_vp, DM_RIGHT_NULL,
2770                                 target_name, NULL, 0, error, 0);
2771         }
2772         return error;
2773
2774  abort_return:
2775         cancel_flags |= XFS_TRANS_ABORT;
2776         /* FALLTHROUGH */
2777
2778  error_return:
2779         xfs_trans_cancel(tp, cancel_flags);
2780         goto std_return;
2781 }
2782
2783
2784 /*
2785  * xfs_mkdir
2786  *
2787  */
2788 STATIC int
2789 xfs_mkdir(
2790         bhv_desc_t              *dir_bdp,
2791         bhv_vname_t             *dentry,
2792         bhv_vattr_t             *vap,
2793         bhv_vnode_t             **vpp,
2794         cred_t                  *credp)
2795 {
2796         char                    *dir_name = VNAME(dentry);
2797         xfs_inode_t             *dp;
2798         xfs_inode_t             *cdp;   /* inode of created dir */
2799         bhv_vnode_t             *cvp;   /* vnode of created dir */
2800         xfs_trans_t             *tp;
2801         xfs_mount_t             *mp;
2802         int                     cancel_flags;
2803         int                     error;
2804         int                     committed;
2805         xfs_bmap_free_t         free_list;
2806         xfs_fsblock_t           first_block;
2807         bhv_vnode_t             *dir_vp;
2808         boolean_t               dp_joined_to_trans;
2809         boolean_t               created = B_FALSE;
2810         int                     dm_event_sent = 0;
2811         xfs_prid_t              prid;
2812         struct xfs_dquot        *udqp, *gdqp;
2813         uint                    resblks;
2814         int                     dm_di_mode;
2815         int                     dir_namelen;
2816
2817         dir_vp = BHV_TO_VNODE(dir_bdp);
2818         dp = XFS_BHVTOI(dir_bdp);
2819         mp = dp->i_mount;
2820
2821         if (XFS_FORCED_SHUTDOWN(mp))
2822                 return XFS_ERROR(EIO);
2823
2824         dir_namelen = VNAMELEN(dentry);
2825
2826         tp = NULL;
2827         dp_joined_to_trans = B_FALSE;
2828         dm_di_mode = vap->va_mode;
2829
2830         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) {
2831                 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
2832                                         dir_vp, DM_RIGHT_NULL, NULL,
2833                                         DM_RIGHT_NULL, dir_name, NULL,
2834                                         dm_di_mode, 0, 0);
2835                 if (error)
2836                         return error;
2837                 dm_event_sent = 1;
2838         }
2839
2840         /* Return through std_return after this point. */
2841
2842         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
2843
2844         mp = dp->i_mount;
2845         udqp = gdqp = NULL;
2846         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
2847                 prid = dp->i_d.di_projid;
2848         else if (vap->va_mask & XFS_AT_PROJID)
2849                 prid = (xfs_prid_t)vap->va_projid;
2850         else
2851                 prid = (xfs_prid_t)dfltprid;
2852
2853         /*
2854          * Make sure that we have allocated dquot(s) on disk.
2855          */
2856         error = XFS_QM_DQVOPALLOC(mp, dp,
2857                         current_fsuid(credp), current_fsgid(credp), prid,
2858                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
2859         if (error)
2860                 goto std_return;
2861
2862         tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
2863         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2864         resblks = XFS_MKDIR_SPACE_RES(mp, dir_namelen);
2865         error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0,
2866                                   XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
2867         if (error == ENOSPC) {
2868                 resblks = 0;
2869                 error = xfs_trans_reserve(tp, 0, XFS_MKDIR_LOG_RES(mp), 0,
2870                                           XFS_TRANS_PERM_LOG_RES,
2871                                           XFS_MKDIR_LOG_COUNT);
2872         }
2873         if (error) {
2874                 cancel_flags = 0;
2875                 dp = NULL;
2876                 goto error_return;
2877         }
2878
2879         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
2880
2881         /*
2882          * Check for directory link count overflow.
2883          */
2884         if (dp->i_d.di_nlink >= XFS_MAXLINK) {
2885                 error = XFS_ERROR(EMLINK);
2886                 goto error_return;
2887         }
2888
2889         /*
2890          * Reserve disk quota and the inode.
2891          */
2892         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
2893         if (error)
2894                 goto error_return;
2895
2896         if (resblks == 0 &&
2897             (error = xfs_dir_canenter(tp, dp, dir_name, dir_namelen)))
2898                 goto error_return;
2899         /*
2900          * create the directory inode.
2901          */
2902         error = xfs_dir_ialloc(&tp, dp, vap->va_mode, 2,
2903                         0, credp, prid, resblks > 0,
2904                 &cdp, NULL);
2905         if (error) {
2906                 if (error == ENOSPC)
2907                         goto error_return;
2908                 goto abort_return;
2909         }
2910         ITRACE(cdp);
2911
2912         /*
2913          * Now we add the directory inode to the transaction.
2914          * We waited until now since xfs_dir_ialloc might start
2915          * a new transaction.  Had we joined the transaction
2916          * earlier, the locks might have gotten released.
2917          */
2918         VN_HOLD(dir_vp);
2919         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2920         dp_joined_to_trans = B_TRUE;
2921
2922         XFS_BMAP_INIT(&free_list, &first_block);
2923
2924         error = xfs_dir_createname(tp, dp, dir_name, dir_namelen, cdp->i_ino,
2925                                    &first_block, &free_list, resblks ?
2926                                    resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
2927         if (error) {
2928                 ASSERT(error != ENOSPC);
2929                 goto error1;
2930         }
2931         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2932
2933         /*
2934          * Bump the in memory version number of the parent directory
2935          * so that other processes accessing it will recognize that
2936          * the directory has changed.
2937          */
2938         dp->i_gen++;
2939
2940         error = xfs_dir_init(tp, cdp, dp);
2941         if (error)
2942                 goto error2;
2943
2944         cdp->i_gen = 1;
2945         error = xfs_bumplink(tp, dp);
2946         if (error)
2947                 goto error2;
2948
2949         cvp = XFS_ITOV(cdp);
2950
2951         created = B_TRUE;
2952
2953         *vpp = cvp;
2954         IHOLD(cdp);
2955
2956         /*
2957          * Attach the dquots to the new inode and modify the icount incore.
2958          */
2959         XFS_QM_DQVOPCREATE(mp, tp, cdp, udqp, gdqp);
2960
2961         /*
2962          * If this is a synchronous mount, make sure that the
2963          * mkdir transaction goes to disk before returning to
2964          * the user.
2965          */
2966         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2967                 xfs_trans_set_sync(tp);
2968         }
2969
2970         error = xfs_bmap_finish(&tp, &free_list, &committed);
2971         if (error) {
2972                 IRELE(cdp);
2973                 goto error2;
2974         }
2975
2976         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2977         XFS_QM_DQRELE(mp, udqp);
2978         XFS_QM_DQRELE(mp, gdqp);
2979         if (error) {
2980                 IRELE(cdp);
2981         }
2982
2983         /* Fall through to std_return with error = 0 or errno from
2984          * xfs_trans_commit. */
2985
2986 std_return:
2987         if ( (created || (error != 0 && dm_event_sent != 0)) &&
2988                         DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
2989                                                 DM_EVENT_POSTCREATE)) {
2990                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
2991                                         dir_vp, DM_RIGHT_NULL,
2992                                         created ? XFS_ITOV(cdp):NULL,
2993                                         DM_RIGHT_NULL,
2994                                         dir_name, NULL,
2995                                         dm_di_mode, error, 0);
2996         }
2997         return error;
2998
2999  error2:
3000  error1:
3001         xfs_bmap_cancel(&free_list);
3002  abort_return:
3003         cancel_flags |= XFS_TRANS_ABORT;
3004  error_return:
3005         xfs_trans_cancel(tp, cancel_flags);
3006         XFS_QM_DQRELE(mp, udqp);
3007         XFS_QM_DQRELE(mp, gdqp);
3008
3009         if (!dp_joined_to_trans && (dp != NULL)) {
3010                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
3011         }
3012
3013         goto std_return;
3014 }
3015
3016
3017 /*
3018  * xfs_rmdir
3019  *
3020  */
3021 STATIC int
3022 xfs_rmdir(
3023         bhv_desc_t              *dir_bdp,
3024         bhv_vname_t             *dentry,
3025         cred_t                  *credp)
3026 {
3027         char                    *name = VNAME(dentry);
3028         xfs_inode_t             *dp;
3029         xfs_inode_t             *cdp;   /* child directory */
3030         xfs_trans_t             *tp;
3031         xfs_mount_t             *mp;
3032         int                     error;
3033         xfs_bmap_free_t         free_list;
3034         xfs_fsblock_t           first_block;
3035         int                     cancel_flags;
3036         int                     committed;
3037         bhv_vnode_t             *dir_vp;
3038         int                     dm_di_mode = S_IFDIR;
3039         int                     last_cdp_link;
3040         int                     namelen;
3041         uint                    resblks;
3042
3043         dir_vp = BHV_TO_VNODE(dir_bdp);
3044         dp = XFS_BHVTOI(dir_bdp);
3045         mp = dp->i_mount;
3046
3047         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
3048
3049         if (XFS_FORCED_SHUTDOWN(XFS_BHVTOI(dir_bdp)->i_mount))
3050                 return XFS_ERROR(EIO);
3051         namelen = VNAMELEN(dentry);
3052
3053         if (!xfs_get_dir_entry(dentry, &cdp)) {
3054                 dm_di_mode = cdp->i_d.di_mode;
3055                 IRELE(cdp);
3056         }
3057
3058         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) {
3059                 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE,
3060                                         dir_vp, DM_RIGHT_NULL,
3061                                         NULL, DM_RIGHT_NULL,
3062                                         name, NULL, dm_di_mode, 0, 0);
3063                 if (error)
3064                         return XFS_ERROR(error);
3065         }
3066
3067         /* Return through std_return after this point. */
3068
3069         cdp = NULL;
3070
3071         /*
3072          * We need to get a reference to cdp before we get our log
3073          * reservation.  The reason for this is that we cannot call
3074          * xfs_iget for an inode for which we do not have a reference
3075          * once we've acquired a log reservation.  This is because the
3076          * inode we are trying to get might be in xfs_inactive going
3077          * for a log reservation.  Since we'll have to wait for the
3078          * inactive code to complete before returning from xfs_iget,
3079          * we need to make sure that we don't have log space reserved
3080          * when we call xfs_iget.  Instead we get an unlocked reference
3081          * to the inode before getting our log reservation.
3082          */
3083         error = xfs_get_dir_entry(dentry, &cdp);
3084         if (error) {
3085                 REMOVE_DEBUG_TRACE(__LINE__);
3086                 goto std_return;
3087         }
3088         mp = dp->i_mount;
3089         dm_di_mode = cdp->i_d.di_mode;
3090
3091         /*
3092          * Get the dquots for the inodes.
3093          */
3094         error = XFS_QM_DQATTACH(mp, dp, 0);
3095         if (!error && dp != cdp)
3096                 error = XFS_QM_DQATTACH(mp, cdp, 0);
3097         if (error) {
3098                 IRELE(cdp);
3099                 REMOVE_DEBUG_TRACE(__LINE__);
3100                 goto std_return;
3101         }
3102
3103         tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
3104         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
3105         /*
3106          * We try to get the real space reservation first,
3107          * allowing for directory btree deletion(s) implying
3108          * possible bmap insert(s).  If we can't get the space
3109          * reservation then we use 0 instead, and avoid the bmap
3110          * btree insert(s) in the directory code by, if the bmap
3111          * insert tries to happen, instead trimming the LAST
3112          * block from the directory.
3113          */
3114         resblks = XFS_REMOVE_SPACE_RES(mp);
3115         error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
3116                         XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
3117         if (error == ENOSPC) {
3118                 resblks = 0;
3119                 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
3120                                 XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
3121         }
3122         if (error) {
3123                 ASSERT(error != ENOSPC);
3124                 cancel_flags = 0;
3125                 IRELE(cdp);
3126                 goto error_return;
3127         }
3128         XFS_BMAP_INIT(&free_list, &first_block);
3129
3130         /*
3131          * Now lock the child directory inode and the parent directory
3132          * inode in the proper order.  This will take care of validating
3133          * that the directory entry for the child directory inode has
3134          * not changed while we were obtaining a log reservation.
3135          */
3136         error = xfs_lock_dir_and_entry(dp, cdp);
3137         if (error) {
3138                 xfs_trans_cancel(tp, cancel_flags);
3139                 IRELE(cdp);
3140                 goto std_return;
3141         }
3142
3143         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
3144         if (dp != cdp) {
3145                 /*
3146                  * Only increment the parent directory vnode count if
3147                  * we didn't bump it in looking up cdp.  The only time
3148                  * we don't bump it is when we're looking up ".".
3149                  */
3150                 VN_HOLD(dir_vp);
3151         }
3152
3153         ITRACE(cdp);
3154         xfs_trans_ijoin(tp, cdp, XFS_ILOCK_EXCL);
3155
3156         ASSERT(cdp->i_d.di_nlink >= 2);
3157         if (cdp->i_d.di_nlink != 2) {
3158                 error = XFS_ERROR(ENOTEMPTY);
3159                 goto error_return;
3160         }
3161         if (!xfs_dir_isempty(cdp)) {
3162                 error = XFS_ERROR(ENOTEMPTY);
3163                 goto error_return;
3164         }
3165
3166         error = xfs_dir_removename(tp, dp, name, namelen, cdp->i_ino,
3167                                         &first_block, &free_list, resblks);
3168         if (error)
3169                 goto error1;
3170
3171         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3172
3173         /*
3174          * Bump the in memory generation count on the parent
3175          * directory so that others can know that it has changed.
3176          */
3177         dp->i_gen++;
3178
3179         /*
3180          * Drop the link from cdp's "..".
3181          */
3182         error = xfs_droplink(tp, dp);
3183         if (error) {
3184                 goto error1;
3185         }
3186
3187         /*
3188          * Drop the link from dp to cdp.
3189          */
3190         error = xfs_droplink(tp, cdp);
3191         if (error) {
3192                 goto error1;
3193         }
3194
3195         /*
3196          * Drop the "." link from cdp to self.
3197          */
3198         error = xfs_droplink(tp, cdp);
3199         if (error) {
3200                 goto error1;
3201         }
3202
3203         /* Determine these before committing transaction */
3204         last_cdp_link = (cdp)->i_d.di_nlink==0;
3205
3206         /*
3207          * Take an extra ref on the child vnode so that it
3208          * does not go to xfs_inactive() from within the commit.
3209          */
3210         IHOLD(cdp);
3211
3212         /*
3213          * If this is a synchronous mount, make sure that the
3214          * rmdir transaction goes to disk before returning to
3215          * the user.
3216          */
3217         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
3218                 xfs_trans_set_sync(tp);
3219         }
3220
3221         error = xfs_bmap_finish (&tp, &free_list, &committed);
3222         if (error) {
3223                 xfs_bmap_cancel(&free_list);
3224                 xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
3225                                  XFS_TRANS_ABORT));
3226                 IRELE(cdp);
3227                 goto std_return;
3228         }
3229
3230         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3231         if (error) {
3232                 IRELE(cdp);
3233                 goto std_return;
3234         }
3235
3236
3237         /*
3238          * Let interposed file systems know about removed links.
3239          */
3240         bhv_vop_link_removed(XFS_ITOV(cdp), dir_vp, last_cdp_link);
3241
3242         IRELE(cdp);
3243
3244         /* Fall through to std_return with error = 0 or the errno
3245          * from xfs_trans_commit. */
3246  std_return:
3247         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_POSTREMOVE)) {
3248                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
3249                                         dir_vp, DM_RIGHT_NULL,
3250                                         NULL, DM_RIGHT_NULL,
3251                                         name, NULL, dm_di_mode,
3252                                         error, 0);
3253         }
3254         return error;
3255
3256  error1:
3257         xfs_bmap_cancel(&free_list);
3258         cancel_flags |= XFS_TRANS_ABORT;
3259         /* FALLTHROUGH */
3260
3261  error_return:
3262         xfs_trans_cancel(tp, cancel_flags);
3263         goto std_return;
3264 }
3265
3266
3267 /*
3268  * Read dp's entries starting at uiop->uio_offset and translate them into
3269  * bufsize bytes worth of struct dirents starting at bufbase.
      *
      * NOTE(review): "bufsize" and "bufbase" are not parameters of this
      * function; presumably the destination buffer is described by uiop --
      * confirm against xfs_dir_getdents().
      *
      * dir_bdp - behavior descriptor of the directory to read.
      * uiop    - passed through unchanged to xfs_dir_getdents().
      * credp   - not referenced in this function body.
      * eofp    - passed through unchanged to xfs_dir_getdents().
      *
      * Returns EIO (via XFS_ERROR) if the filesystem has been forcibly shut
      * down, otherwise whatever xfs_dir_getdents() returns.
3270  */
3271 STATIC int
3272 xfs_readdir(
3273         bhv_desc_t      *dir_bdp,
3274         uio_t           *uiop,
3275         cred_t          *credp,
3276         int             *eofp)
3277 {
3278         xfs_inode_t     *dp;
             /* No transaction is used here; tp stays NULL for the whole call. */
3279         xfs_trans_t     *tp = NULL;
3280         int             error = 0;
3281         uint            lock_mode;
3282
3283         vn_trace_entry(BHV_TO_VNODE(dir_bdp), __FUNCTION__,
3284                                                (inst_t *)__return_address);
3285         dp = XFS_BHVTOI(dir_bdp);
3286
3287         if (XFS_FORCED_SHUTDOWN(dp->i_mount))
3288                 return XFS_ERROR(EIO);
3289
             /*
              * The mode returned by xfs_ilock_map_shared() is remembered so
              * that the matching unlock call releases the same lock.
              */
3290         lock_mode = xfs_ilock_map_shared(dp);
3291         error = xfs_dir_getdents(tp, dp, uiop, eofp);
3292         xfs_iunlock_map_shared(dp, lock_mode);
3293         return error;
3294 }
3295
3296
/*
 * xfs_symlink
 *
 * Create a symbolic link named by dentry in the directory dir_bdp whose
 * contents are target_path.
 *
 * Outline, as implemented below:
 *   - *vpp is cleared on entry and only set again on success.
 *   - target_path is rejected with ENAMETOOLONG if its total length is
 *     >= MAXPATHLEN, or if any single component reaches MAXNAMELEN.
 *   - If the DM_EVENT_SYMLINK event is enabled it is sent first; a failure
 *     there returns immediately (before the "std_return" regime starts).
 *   - Dquots are allocated, a transaction is allocated and space/log
 *     reservation is made.  On ENOSPC the reservation is retried with zero
 *     blocks, but only when the link body needs no blocks (fs_blocks == 0).
 *   - The directory is locked, XFS_DIFLAG_NOSYMLINKS is checked (EPERM),
 *     quota is reserved and a new inode is allocated.
 *   - The path is stored inline in the inode data fork when it fits,
 *     otherwise blocks are mapped with xfs_bmapi() and the path is copied
 *     into the buffers for those blocks.
 *   - The directory entry is created, the transaction is committed, and the
 *     DM_EVENT_POSTSYMLINK event is sent if enabled.
 *
 * Returns 0 on success with *vpp pointing at the new vnode, or an errno
 * value on failure.
 */
3297 STATIC int
3298 xfs_symlink(
3299         bhv_desc_t              *dir_bdp,
3300         bhv_vname_t             *dentry,
3301         bhv_vattr_t             *vap,
3302         char                    *target_path,
3303         bhv_vnode_t             **vpp,
3304         cred_t                  *credp)
3305 {
3306         xfs_trans_t             *tp;
3307         xfs_mount_t             *mp;
3308         xfs_inode_t             *dp;
3309         xfs_inode_t             *ip;
3310         int                     error;
3311         int                     pathlen;
3312         xfs_bmap_free_t         free_list;
3313         xfs_fsblock_t           first_block;
3314         boolean_t               dp_joined_to_trans;
3315         bhv_vnode_t             *dir_vp;
3316         uint                    cancel_flags;
3317         int                     committed;
3318         xfs_fileoff_t           first_fsb;
3319         xfs_filblks_t           fs_blocks;
3320         int                     nmaps;
3321         xfs_bmbt_irec_t         mval[SYMLINK_MAPS];
3322         xfs_daddr_t             d;
3323         char                    *cur_chunk;
3324         int                     byte_cnt;
3325         int                     n;
3326         xfs_buf_t               *bp;
3327         xfs_prid_t              prid;
3328         struct xfs_dquot        *udqp, *gdqp;
3329         uint                    resblks;
3330         char                    *link_name = VNAME(dentry);
3331         int                     link_namelen;
3332
3333         *vpp = NULL;
3334         dir_vp = BHV_TO_VNODE(dir_bdp);
3335         dp = XFS_BHVTOI(dir_bdp);
3336         dp_joined_to_trans = B_FALSE;
3337         error = 0;
3338         ip = NULL;
3339         tp = NULL;
3340
3341         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
3342
3343         mp = dp->i_mount;
3344
3345         if (XFS_FORCED_SHUTDOWN(mp))
3346                 return XFS_ERROR(EIO);
3347
3348         link_namelen = VNAMELEN(dentry);
3349
3350         /*
3351          * Check component lengths of the target path name.
3352          */
3353         pathlen = strlen(target_path);
3354         if (pathlen >= MAXPATHLEN)      /* total string too long */
3355                 return XFS_ERROR(ENAMETOOLONG);
3356         if (pathlen >= MAXNAMELEN) {    /* is any component too long? */
3357                 int len, total;
3358                 char *path;
3359
3360                 for (total = 0, path = target_path; total < pathlen;) {
3361                         /*
3362                          * Skip any slashes.
3363                          */
3364                         while(*path == '/') {
3365                                 total++;
3366                                 path++;
3367                         }
3368
3369                         /*
3370                          * Count up to the next slash or end of path.
3371                          * Error out if the component is bigger than MAXNAMELEN.
3372                          */
3373                         for(len = 0; *path != '/' && total < pathlen;total++, path++) {
3374                                 if (++len >= MAXNAMELEN) {
                                             /*
                                              * Note: this errno is returned
                                              * directly, without the XFS_ERROR()
                                              * wrapper used by the other early
                                              * returns above.
                                              */
3375                                         error = ENAMETOOLONG;
3376                                         return error;
3377                                 }
3378                         }
3379                 }
3380         }
3381
3382         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_SYMLINK)) {
3383                 error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dir_vp,
3384                                         DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
3385                                         link_name, target_path, 0, 0, 0);
3386                 if (error)
3387                         return error;
3388         }
3389
3390         /* Return through std_return after this point. */
3391
             /*
              * Pick the project id: inherited from the directory when it has
              * XFS_DIFLAG_PROJINHERIT set, else taken from vap when
              * XFS_AT_PROJID is present in va_mask, else dfltprid.
              */
3392         udqp = gdqp = NULL;
3393         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
3394                 prid = dp->i_d.di_projid;
3395         else if (vap->va_mask & XFS_AT_PROJID)
3396                 prid = (xfs_prid_t)vap->va_projid;
3397         else
3398                 prid = (xfs_prid_t)dfltprid;
3399
3400         /*
3401          * Make sure that we have allocated dquot(s) on disk.
3402          */
3403         error = XFS_QM_DQVOPALLOC(mp, dp,
3404                         current_fsuid(credp), current_fsgid(credp), prid,
3405                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
3406         if (error)
3407                 goto std_return;
3408
3409         tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
3410         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
3411         /*
3412          * The symlink will fit into the inode data fork?
3413          * There can't be any attributes so we get the whole variable part.
3414          */
3415         if (pathlen <= XFS_LITINO(mp))
3416                 fs_blocks = 0;
3417         else
3418                 fs_blocks = XFS_B_TO_FSB(mp, pathlen);
3419         resblks = XFS_SYMLINK_SPACE_RES(mp, link_namelen, fs_blocks);
3420         error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
3421                         XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
             /*
              * Retry without a block reservation only when the link body
              * itself needs no blocks (fs_blocks == 0).
              */
3422         if (error == ENOSPC && fs_blocks == 0) {
3423                 resblks = 0;
3424                 error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
3425                                 XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
3426         }
3427         if (error) {
                     /*
                      * dp has not been locked yet; clearing it makes the
                      * error_return path skip its xfs_iunlock(dp) call.
                      */
3428                 cancel_flags = 0;
3429                 dp = NULL;
3430                 goto error_return;
3431         }
3432
3433         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
3434
3435         /*
3436          * Check whether the directory allows new symlinks or not.
3437          */
3438         if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
3439                 error = XFS_ERROR(EPERM);
3440                 goto error_return;
3441         }
3442
3443         /*
3444          * Reserve disk quota : blocks and inode.
3445          */
3446         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
3447         if (error)
3448                 goto error_return;
3449
3450         /*
3451          * Check for ability to enter directory entry, if no space reserved.
3452          */
3453         if (resblks == 0 &&
3454             (error = xfs_dir_canenter(tp, dp, link_name, link_namelen)))
3455                 goto error_return;
3456         /*
3457          * Initialize the bmap freelist prior to calling either
3458          * bmapi or the directory create code.
3459          */
3460         XFS_BMAP_INIT(&free_list, &first_block);
3461
3462         /*
3463          * Allocate an inode for the symlink.
3464          */
3465         error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (vap->va_mode&~S_IFMT),
3466                                1, 0, credp, prid, resblks > 0, &ip, NULL);
3467         if (error) {
                     /*
                      * ENOSPC cancels the transaction without XFS_TRANS_ABORT;
                      * any other failure goes through error1, which adds it.
                      */
3468                 if (error == ENOSPC)
3469                         goto error_return;
3470                 goto error1;
3471         }
3472         ITRACE(ip);
3473
3474         VN_HOLD(dir_vp);
3475         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
3476         dp_joined_to_trans = B_TRUE;
3477
3478         /*
3479          * Also attach the dquot(s) to it, if applicable.
3480          */
3481         XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
3482
             /*
              * Track how much of the block reservation is still unused; the
              * remainder is handed to xfs_bmapi() and xfs_dir_createname().
              */
3483         if (resblks)
3484                 resblks -= XFS_IALLOC_SPACE_RES(mp);
3485         /*
3486          * If the symlink will fit into the inode, write it inline.
3487          */
3488         if (pathlen <= XFS_IFORK_DSIZE(ip)) {
3489                 xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
3490                 memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
3491                 ip->i_d.di_size = pathlen;
3492
3493                 /*
3494                  * The inode was initially created in extent format.
3495                  */
3496                 ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
3497                 ip->i_df.if_flags |= XFS_IFINLINE;
3498
3499                 ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
3500                 xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
3501
3502         } else {
3503                 first_fsb = 0;
3504                 nmaps = SYMLINK_MAPS;
3505
3506                 error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
3507                                   XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
3508                                   &first_block, resblks, mval, &nmaps,
3509                                   &free_list, NULL);
3510                 if (error) {
3511                         goto error1;
3512                 }
3513
3514                 if (resblks)
3515                         resblks -= fs_blocks;
3516                 ip->i_d.di_size = pathlen;
3517                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3518
                     /*
                      * Copy the path into the mapped extents, one buffer per
                      * returned mapping.  pathlen is consumed by this loop;
                      * the inode size was already recorded above.
                      */
3519                 cur_chunk = target_path;
3520                 for (n = 0; n < nmaps; n++) {
3521                         d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
3522                         byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
3523                         bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
3524                                                BTOBB(byte_cnt), 0);
3525                         ASSERT(bp && !XFS_BUF_GETERROR(bp));
3526                         if (pathlen < byte_cnt) {
3527                                 byte_cnt = pathlen;
3528                         }
3529                         pathlen -= byte_cnt;
3530
3531                         memcpy(XFS_BUF_PTR(bp), cur_chunk, byte_cnt);
3532                         cur_chunk += byte_cnt;
3533
3534                         xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1);
3535                 }
3536         }
3537
3538         /*
3539          * Create the directory entry for the symlink.
3540          */
3541         error = xfs_dir_createname(tp, dp, link_name, link_namelen, ip->i_ino,
3542                                    &first_block, &free_list, resblks);
3543         if (error)
3544                 goto error1;
3545         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3546         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
3547
3548         /*
3549          * Bump the in memory version number of the parent directory
3550          * so that other processes accessing it will recognize that
3551          * the directory has changed.
3552          */
3553         dp->i_gen++;
3554
3555         /*
3556          * If this is a synchronous mount, make sure that the
3557          * symlink transaction goes to disk before returning to
3558          * the user.
3559          */
3560         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
3561                 xfs_trans_set_sync(tp);
3562         }
3563
3564         /*
3565          * xfs_trans_commit normally decrements the vnode ref count
3566          * when it unlocks the inode. Since we want to return the
3567          * vnode to the caller, we bump the vnode ref count now.
3568          */
3569         IHOLD(ip);
3570
3571         error = xfs_bmap_finish(&tp, &free_list, &committed);
3572         if (error) {
3573                 goto error2;
3574         }
3575         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3576         XFS_QM_DQRELE(mp, udqp);
3577         XFS_QM_DQRELE(mp, gdqp);
3578
3579         /* Fall through to std_return with error = 0 or errno from
3580          * xfs_trans_commit     */
3581 std_return:
             /*
              * The post event is sent on both success and failure; the new
              * vnode is only passed along when error is zero.
              */
3582         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
3583                              DM_EVENT_POSTSYMLINK)) {
3584                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK,
3585                                         dir_vp, DM_RIGHT_NULL,
3586                                         error ? NULL : XFS_ITOV(ip),
3587                                         DM_RIGHT_NULL, link_name, target_path,
3588                                         0, error, 0);
3589         }
3590
3591         if (!error) {
3592                 bhv_vnode_t *vp;
3593
3594                 ASSERT(ip);
3595                 vp = XFS_ITOV(ip);
3596                 *vpp = vp;
3597         }
3598         return error;
3599
             /*
              * Error unwinding, from most to least state to undo:
              *   error2       - drop the extra reference taken by IHOLD(ip).
              *   error1       - cancel the bmap free list and mark the
              *                  transaction cancel as an abort.
              *   error_return - cancel the transaction, release the dquots and
              *                  unlock dp if it was locked here but never
              *                  joined to the transaction.
              */
3600  error2:
3601         IRELE(ip);
3602  error1:
3603         xfs_bmap_cancel(&free_list);
3604         cancel_flags |= XFS_TRANS_ABORT;
3605  error_return:
3606         xfs_trans_cancel(tp, cancel_flags);
3607         XFS_QM_DQRELE(mp, udqp);
3608         XFS_QM_DQRELE(mp, gdqp);
3609
3610         if (!dp_joined_to_trans && (dp != NULL)) {
3611                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
3612         }
3613
3614         goto std_return;
3615 }
3616
3617
3618 /*
3619  * xfs_fid2
3620  *
3621  * A fid routine that takes a pointer to a previously allocated
3622  * fid structure (like xfs_fast_fid) but uses a 64 bit inode number.
      *
      * The caller's fid_t is filled in through an xfs_fid2_t view:
      *   fid_len - size of xfs_fid2_t minus the size of the fid_len field
      *   fid_pad - zero
      *   fid_ino - copy of ip->i_ino
      *   fid_gen - ip->i_d.di_gen
      *
      * Always returns 0.
3623  */
3624 STATIC int
3625 xfs_fid2(
3626         bhv_desc_t      *bdp,
3627         fid_t           *fidp)
3628 {
3629         xfs_inode_t     *ip;
3630         xfs_fid2_t      *xfid;
3631
3632         vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__,
3633                                        (inst_t *)__return_address);
             /* The cast below is only valid if the caller's fid_t is big enough. */
3634         ASSERT(sizeof(fid_t) >= sizeof(xfs_fid2_t));
3635
3636         xfid = (xfs_fid2_t *)fidp;
3637         ip = XFS_BHVTOI(bdp);
3638         xfid->fid_len = sizeof(xfs_fid2_t) - sizeof(xfid->fid_len);
3639         xfid->fid_pad = 0;
3640         /*
3641          * use memcpy because the inode is a long long and there's no
3642          * assurance that xfid->fid_ino is properly aligned.
3643          */
3644         memcpy(&xfid->fid_ino, &ip->i_ino, sizeof(xfid->fid_ino));
3645         xfid->fid_gen = ip->i_d.di_gen;
3646
3647         return 0;
3648 }
3649
3650
3651 /*
3652  * xfs_rwlock
      *
      * Take the inode's I/O lock in the mode selected by locktype:
      *   VRWLOCK_WRITE      - XFS_IOLOCK_EXCL, blocking
      *   VRWLOCK_TRY_READ   - XFS_IOLOCK_SHARED, non-blocking
      *   VRWLOCK_TRY_WRITE  - XFS_IOLOCK_EXCL, non-blocking
      *   anything else      - XFS_IOLOCK_SHARED, blocking (asserted to be
      *                        VRWLOCK_READ or VRWLOCK_WRITE_DIRECT)
      *
      * Directories are not locked at all; 1 is returned immediately.
      *
      * Returns the result of xfs_ilock_nowait() for the two TRY modes and
      * 1 in every other case.
3653  */
3654 int
3655 xfs_rwlock(
3656         bhv_desc_t      *bdp,
3657         bhv_vrwlock_t   locktype)
3658 {
3659         xfs_inode_t     *ip;
3660         bhv_vnode_t     *vp;
3661
3662         vp = BHV_TO_VNODE(bdp);
3663         if (VN_ISDIR(vp))
3664                 return 1;
3665         ip = XFS_BHVTOI(bdp);
3666         if (locktype == VRWLOCK_WRITE) {
3667                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
3668         } else if (locktype == VRWLOCK_TRY_READ) {
3669                 return xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED);
3670         } else if (locktype == VRWLOCK_TRY_WRITE) {
3671                 return xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL);
3672         } else {
3673                 ASSERT((locktype == VRWLOCK_READ) ||
3674                        (locktype == VRWLOCK_WRITE_DIRECT));
3675                 xfs_ilock(ip, XFS_IOLOCK_SHARED);
3676         }
3677
3678         return 1;
3679 }
3680
3681
3682 /*
3683  * xfs_rwunlock
      *
      * Release the inode's I/O lock for the given locktype.  Directories are
      * ignored, mirroring xfs_rwlock() which never locks them.
      *
      * VRWLOCK_WRITE releases XFS_IOLOCK_EXCL through xfs_refcache_iunlock();
      * every other type releases XFS_IOLOCK_SHARED and is asserted to be
      * VRWLOCK_READ or VRWLOCK_WRITE_DIRECT.
      *
      * NOTE(review): there is no branch for VRWLOCK_TRY_READ or
      * VRWLOCK_TRY_WRITE here; presumably callers unlock with VRWLOCK_READ /
      * VRWLOCK_WRITE after a successful try-lock -- confirm against callers.
3684  */
3685 void
3686 xfs_rwunlock(
3687         bhv_desc_t      *bdp,
3688         bhv_vrwlock_t   locktype)
3689 {
3690         xfs_inode_t     *ip;
3691         bhv_vnode_t     *vp;
3692
3693         vp = BHV_TO_VNODE(bdp);
3694         if (VN_ISDIR(vp))
3695                 return;
3696         ip = XFS_BHVTOI(bdp);
3697         if (locktype == VRWLOCK_WRITE) {
3698                 /*
3699                  * In the write case, we may have added a new entry to
3700                  * the reference cache.  This might store a pointer to
3701                  * an inode to be released in this inode.  If it is there,
3702                  * clear the pointer and release the inode after unlocking
3703                  * this one.
3704                  */
3705                 xfs_refcache_iunlock(ip, XFS_IOLOCK_EXCL);
3706         } else {
3707                 ASSERT((locktype == VRWLOCK_READ) ||
3708                        (locktype == VRWLOCK_WRITE_DIRECT));
3709                 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
3710         }
3711         return;
3712 }
3713
/*
 * xfs_inode_flush
 *
 * Flush an inode according to the bits set in flags:
 *   FLUSH_LOG   - if the inode's log item has a last LSN that is newer than
 *                 the log's last synced LSN, force the log up to that LSN
 *                 and return the result of xfs_log_force().
 *   FLUSH_INODE - write the inode out with xfs_iflush().
 *   FLUSH_SYNC  - modifies both of the above: the log force becomes
 *                 synchronous, and the inode flush blocks for its locks and
 *                 uses XFS_IFLUSH_SYNC instead of XFS_IFLUSH_ASYNC.
 *
 * Returns 0 when there is nothing to do or on success, EIO if the
 * filesystem has been forcibly shut down, EAGAIN if the inode is pinned or
 * its locks could not be taken without blocking, or the error returned by
 * xfs_log_force() / xfs_iflush().
 */
3714 STATIC int
3715 xfs_inode_flush(
3716         bhv_desc_t      *bdp,
3717         int             flags)
3718 {
3719         xfs_inode_t     *ip;
3720         xfs_mount_t     *mp;
3721         xfs_inode_log_item_t *iip;
3722         int             error = 0;
3723
3724         ip = XFS_BHVTOI(bdp);
3725         mp = ip->i_mount;
3726         iip = ip->i_itemp;
3727
3728         if (XFS_FORCED_SHUTDOWN(mp))
3729                 return XFS_ERROR(EIO);
3730
3731         /*
3732          * Bypass inodes which have already been cleaned by
3733          * the inode flush clustering code inside xfs_iflush
3734          */
3735         if ((ip->i_update_core == 0) &&
3736             ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)))
3737                 return 0;
3738
3739         if (flags & FLUSH_LOG) {
3740                 if (iip && iip->ili_last_lsn) {
3741                         xlog_t          *log = mp->m_log;
3742                         xfs_lsn_t       sync_lsn;
3743                         int             s, log_flags = XFS_LOG_FORCE;
3744
                             /* Sample the log's last synced LSN under the grant lock. */
3745                         s = GRANT_LOCK(log);
3746                         sync_lsn = log->l_last_sync_lsn;
3747                         GRANT_UNLOCK(log, s);
3748
                             /* Nothing to force if the inode's LSN is not newer. */
3749                         if ((XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) <= 0))
3750                                 return 0;
3751
3752                         if (flags & FLUSH_SYNC)
3753                                 log_flags |= XFS_LOG_SYNC;
3754                         return xfs_log_force(mp, iip->ili_last_lsn, log_flags);
3755                 }
3756         }
3757
3758         /*
3759          * We make this non-blocking if the inode is contended,
3760          * return EAGAIN to indicate to the caller that they
3761          * did not succeed. This prevents the flush path from
3762          * blocking on inodes inside another operation right
3763          * now, they get caught later by xfs_sync.
3764          */
3765         if (flags & FLUSH_INODE) {
3766                 int     flush_flags;
3767
3768                 if (xfs_ipincount(ip))
3769                         return EAGAIN;
3770
3771                 if (flags & FLUSH_SYNC) {
3772                         xfs_ilock(ip, XFS_ILOCK_SHARED);
3773                         xfs_iflock(ip);
3774                 } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
                             /* Re-check the pin count now that the inode lock is held. */
3775                         if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
3776                                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
3777                                 return EAGAIN;
3778                         }
3779                 } else {
3780                         return EAGAIN;
3781                 }
3782
3783                 if (flags & FLUSH_SYNC)
3784                         flush_flags = XFS_IFLUSH_SYNC;
3785                 else
3786                         flush_flags = XFS_IFLUSH_ASYNC;
3787
                     /*
                      * Only the inode lock is dropped here; the flush lock
                      * taken above is presumably released by xfs_iflush() or
                      * its completion handling -- TODO confirm.
                      */
3788                 error = xfs_iflush(ip, flush_flags);
3789                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
3790         }
3791
3792         return error;
3793 }
3794
/*
 * xfs_set_dmattrs
 *
 * Store a new DMAPI event mask and state in the inode, updating both the
 * on-disk inode core fields (di_dmevmask, di_dmstate) and the copies kept
 * in the inode's iocore, and log the change in a transaction.
 *
 * bdp    - behavior descriptor of the inode to update.
 * evmask - new event mask.
 * state  - new DMAPI state.
 * credp  - not referenced in this function body; the permission check
 *          uses capable(CAP_SYS_ADMIN).
 *
 * Returns EPERM without CAP_SYS_ADMIN, EIO if the filesystem has been
 * forcibly shut down, the error from xfs_trans_reserve() if the log
 * reservation fails, or the result of xfs_trans_commit().
 */
3795 int
3796 xfs_set_dmattrs (
3797         bhv_desc_t      *bdp,
3798         u_int           evmask,
3799         u_int16_t       state,
3800         cred_t          *credp)
3801 {
3802         xfs_inode_t     *ip;
3803         xfs_trans_t     *tp;
3804         xfs_mount_t     *mp;
3805         int             error;
3806
3807         if (!capable(CAP_SYS_ADMIN))
3808                 return XFS_ERROR(EPERM);
3809
3810         ip = XFS_BHVTOI(bdp);
3811         mp = ip->i_mount;
3812
3813         if (XFS_FORCED_SHUTDOWN(mp))
3814                 return XFS_ERROR(EIO);
3815
3816         tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
3817         error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0);
3818         if (error) {
3819                 xfs_trans_cancel(tp, 0);
3820                 return error;
3821         }
3822         xfs_ilock(ip, XFS_ILOCK_EXCL);
3823         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
3824
3825         ip->i_iocore.io_dmevmask = ip->i_d.di_dmevmask = evmask;
3826         ip->i_iocore.io_dmstate  = ip->i_d.di_dmstate  = state;
3827
3828         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
             /*
              * Take a reference before committing; presumably this balances
              * the reference the commit drops when it unlocks a joined inode
              * -- confirm against xfs_trans_commit().
              */
3829         IHOLD(ip);
3830         error = xfs_trans_commit(tp, 0);
3831
3832         return error;
3833 }
3834
/*
 * xfs_reclaim
 *
 * Called with the behavior descriptor of a vnode that is being torn down.
 *
 *   - A bad vnode (VN_BAD) has its xfs inode reclaimed immediately.
 *   - Otherwise outstanding I/O is waited for (vn_iowait) and the atime is
 *     synchronized into the xfs inode.
 *   - If the inode has nothing to flush (i_update_core is clear and there
 *     is no log item), the inode and flush locks are taken and
 *     xfs_finish_reclaim() completes the teardown; its return value is
 *     returned to the caller.
 *   - Otherwise the inode is flagged XFS_IRECLAIMABLE, detached from the
 *     vnode's behavior chain and queued on the mount's m_del_inodes list
 *     for later cleanup, and 0 is returned.
 */
3835 STATIC int
3836 xfs_reclaim(
3837         bhv_desc_t      *bdp)
3838 {
3839         xfs_inode_t     *ip;
3840         bhv_vnode_t     *vp;
3841
3842         vp = BHV_TO_VNODE(bdp);
3843         ip = XFS_BHVTOI(bdp);
3844
3845         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
3846
3847         ASSERT(!VN_MAPPED(vp));
3848
3849         /* bad inode, get out here ASAP */
3850         if (VN_BAD(vp)) {
3851                 xfs_ireclaim(ip);
3852                 return 0;
3853         }
3854
3855         vn_iowait(vp);
3856
3857         ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
3858
3859         /*
3860          * Make sure the atime in the XFS inode is correct before freeing the
3861          * Linux inode.
3862          */
3863         xfs_synchronize_atime(ip);
3864
3865         /*
3866          * If we have nothing to flush with this inode then complete the
3867          * teardown now, otherwise break the link between the xfs inode and the
3868          * linux inode and clean up the xfs inode later. This avoids flushing
3869          * the inode to disk during the delete operation itself.
3870          *
3871          * When breaking the link, we need to set the XFS_IRECLAIMABLE flag
3872          * first to ensure that xfs_iunpin() will never see an xfs inode
3873          * that has a linux inode being reclaimed. Synchronisation is provided
3874          * by the i_flags_lock.
3875          */
3876         if (!ip->i_update_core && (ip->i_itemp == NULL)) {
                     /* Both locks are held on entry, hence locked == 1 below. */
3877                 xfs_ilock(ip, XFS_ILOCK_EXCL);
3878                 xfs_iflock(ip);
3879                 return xfs_finish_reclaim(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
3880         } else {
3881                 xfs_mount_t     *mp = ip->i_mount;
3882
3883                 /* Protect sync and unpin from us */
3884                 XFS_MOUNT_ILOCK(mp);
3885                 spin_lock(&ip->i_flags_lock);
3886                 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
3887                 vn_bhv_remove(VN_BHV_HEAD(vp), XFS_ITOBHV(ip));
3888                 spin_unlock(&ip->i_flags_lock);
3889                 list_add_tail(&ip->i_reclaim, &mp->m_del_inodes);
3890                 XFS_MOUNT_IUNLOCK(mp);
3891         }
3892         return 0;
3893 }
3894
/*
 * xfs_finish_reclaim
 *
 * Complete the teardown of an xfs inode, flushing it first if it is still
 * dirty.
 *
 * ip        - inode to reclaim.
 * locked    - non-zero when the caller already holds XFS_ILOCK_EXCL and the
 *             flush lock on ip; this function then releases them on the
 *             paths marked below.  When zero, the locks are taken here if a
 *             flush check is needed.
 * sync_mode - flush mode handed to xfs_iflush().
 *
 * Returns 1 without reclaiming if the inode is already marked XFS_IRECLAIM,
 * or if it is not marked XFS_IRECLAIMABLE and has no vnode.  Returns 0
 * after calling xfs_ireclaim().
 *
 * NOTE(review): the VN_BAD shortcut at the top jumps straight to reclaim
 * and does not release any locks even when locked is non-zero -- confirm
 * that no caller passes locked != 0 with a bad vnode.
 */
3895 int
3896 xfs_finish_reclaim(
3897         xfs_inode_t     *ip,
3898         int             locked,
3899         int             sync_mode)
3900 {
3901         xfs_ihash_t     *ih = ip->i_hash;
3902         bhv_vnode_t     *vp = XFS_ITOV_NULL(ip);
3903         int             error;
3904
3905         if (vp && VN_BAD(vp))
3906                 goto reclaim;
3907
3908         /* The hash lock here protects a thread in xfs_iget_core from
3909          * racing with us on linking the inode back with a vnode.
3910          * Once we have the XFS_IRECLAIM flag set it will not touch
3911          * us.
3912          */
3913         write_lock(&ih->ih_lock);
3914         spin_lock(&ip->i_flags_lock);
3915         if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
3916             (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) && vp == NULL)) {
3917                 spin_unlock(&ip->i_flags_lock);
3918                 write_unlock(&ih->ih_lock);
                     /* Back out: drop the caller's locks and report "not reclaimed". */
3919                 if (locked) {
3920                         xfs_ifunlock(ip);
3921                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3922                 }
3923                 return 1;
3924         }
3925         __xfs_iflags_set(ip, XFS_IRECLAIM);
3926         spin_unlock(&ip->i_flags_lock);
3927         write_unlock(&ih->ih_lock);
3928
3929         /*
3930          * If the inode is still dirty, then flush it out.  If the inode
3931          * is not in the AIL, then it will be OK to flush it delwri as
3932          * long as xfs_iflush() does not keep any references to the inode.
3933          * We leave that decision up to xfs_iflush() since it has the
3934          * knowledge of whether it's OK to simply do a delwri flush of
3935          * the inode or whether we need to wait until the inode is
3936          * pulled from the AIL.
3937          * We get the flush lock regardless, though, just to make sure
3938          * we don't free it while it is being flushed.
3939          */
3940         if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
3941                 if (!locked) {
3942                         xfs_ilock(ip, XFS_ILOCK_EXCL);
3943                         xfs_iflock(ip);
3944                 }
3945
3946                 if (ip->i_update_core ||
3947                     ((ip->i_itemp != NULL) &&
3948                      (ip->i_itemp->ili_format.ilf_fields != 0))) {
3949                         error = xfs_iflush(ip, sync_mode);
3950                         /*
3951                          * If we hit an error, typically because of filesystem
3952                          * shutdown, we don't need to let vn_reclaim to know
3953                          * because we're gonna reclaim the inode anyway.
3954                          */
3955                         if (error) {
3956                                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3957                                 goto reclaim;
3958                         }
3959                         xfs_iflock(ip); /* synchronize with xfs_iflush_done */
3960                 }
3961
3962                 ASSERT(ip->i_update_core == 0);
3963                 ASSERT(ip->i_itemp == NULL ||
3964                        ip->i_itemp->ili_format.ilf_fields == 0);
3965                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3966         } else if (locked) {
3967                 /*
3968                  * We are not interested in doing an iflush if we're
3969                  * in the process of shutting down the filesystem forcibly.
3970                  * So, just reclaim the inode.
3971                  */
3972                 xfs_ifunlock(ip);
3973                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3974         }
3975
3976  reclaim:
3977         xfs_ireclaim(ip);
3978         return 0;
3979 }
3980
/*
 * xfs_finish_reclaim_all
 *
 * Walk the mount's m_del_inodes list and call xfs_finish_reclaim() on the
 * inodes queued there.
 *
 * Each pass of the outer loop takes the mount inode lock and scans the
 * list.  As soon as one inode is handed to xfs_finish_reclaim() the mount
 * lock is dropped and the scan restarts from the top; the loop ends after a
 * pass in which no inode was processed.
 *
 * mp      - mount whose deleted-inode list is processed.
 * noblock - when non-zero, only try-locks are used: inodes whose
 *           XFS_ILOCK_EXCL or flush lock cannot be taken immediately, or
 *           that are pinned, are skipped.  The value is also passed as the
 *           "locked" argument of xfs_finish_reclaim(), since in that case
 *           both locks are already held.
 *
 * Always returns 0.
 */
3981 int
3982 xfs_finish_reclaim_all(xfs_mount_t *mp, int noblock)
3983 {
3984         int             purged;
3985         xfs_inode_t     *ip, *n;
3986         int             done = 0;
3987
3988         while (!done) {
3989                 purged = 0;
3990                 XFS_MOUNT_ILOCK(mp);
3991                 list_for_each_entry_safe(ip, n, &mp->m_del_inodes, i_reclaim) {
3992                         if (noblock) {
3993                                 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0)
3994                                         continue;
3995                                 if (xfs_ipincount(ip) ||
3996                                     !xfs_iflock_nowait(ip)) {
3997                                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3998                                         continue;
3999                                 }
4000                         }
                             /*
                              * The mount lock is dropped before reclaiming, so
                              * the list walk cannot continue; break out and
                              * rescan under a freshly taken lock.
                              */
4001                         XFS_MOUNT_IUNLOCK(mp);
4002                         if (xfs_finish_reclaim(ip, noblock,
4003                                         XFS_IFLUSH_DELWRI_ELSE_ASYNC))
4004                                 delay(1);
4005                         purged = 1;
4006                         break;
4007                 }
4008
4009                 done = !purged;
4010         }
4011
             /*
              * The loop only exits after a pass that processed nothing, in
              * which case the mount lock taken at the top is still held.
              */
4012         XFS_MOUNT_IUNLOCK(mp);
4013         return 0;
4014 }
4015
4016 /*
4017  * xfs_alloc_file_space()
4018  *      This routine allocates disk space for the given file.
4019  *
4020  *      If alloc_type == 0, this request is for an ALLOCSP type
4021  *      request which will change the file size.  In this case, no
4022  *      DMAPI event will be generated by the call.  A TRUNCATE event
4023  *      will be generated later by xfs_setattr.
4024  *
4025  *      If alloc_type != 0, this request is for a RESVSP type
4026  *      request, and a DMAPI DM_EVENT_WRITE will be generated if the
4027  *      lower block boundary byte address is less than the file's
4028  *      length.
4029  *
4030  * RETURNS:
4031  *       0 on success
4032  *      errno on error
4033  *
4034  */
4035 STATIC int
4036 xfs_alloc_file_space(
4037         xfs_inode_t             *ip,
4038         xfs_off_t               offset,
4039         xfs_off_t               len,
4040         int                     alloc_type,
4041         int                     attr_flags)
4042 {
4043         xfs_mount_t             *mp = ip->i_mount;
4044         xfs_off_t               count;
4045         xfs_filblks_t           allocated_fsb;
4046         xfs_filblks_t           allocatesize_fsb;
4047         xfs_extlen_t            extsz, temp;
4048         xfs_fileoff_t           startoffset_fsb;
4049         xfs_fsblock_t           firstfsb;
4050         int                     nimaps;
4051         int                     bmapi_flag;
4052         int                     quota_flag;
4053         int                     rt;
4054         xfs_trans_t             *tp;
4055         xfs_bmbt_irec_t         imaps[1], *imapp;
4056         xfs_bmap_free_t         free_list;
4057         uint                    qblocks, resblks, resrtextents;
4058         int                     committed;
4059         int                     error;
4060
4061         vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
4062
4063         if (XFS_FORCED_SHUTDOWN(mp))
4064                 return XFS_ERROR(EIO);
4065
4066         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
4067                 return error;
4068
4069         if (len <= 0)
4070                 return XFS_ERROR(EINVAL);
4071
4072         rt = XFS_IS_REALTIME_INODE(ip);
4073         extsz = xfs_get_extsz_hint(ip);
4074
4075         count = len;
4076         imapp = &imaps[0];
4077         nimaps = 1;
4078         bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0);
4079         startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
4080         allocatesize_fsb = XFS_B_TO_FSB(mp, count);
4081
4082         /*      Generate a DMAPI event if needed.       */
4083         if (alloc_type != 0 && offset < ip->i_size &&
4084                         (attr_flags&ATTR_DMI) == 0  &&
4085                         DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) {
4086                 xfs_off_t           end_dmi_offset;
4087
4088                 end_dmi_offset = offset+len;
4089                 if (end_dmi_offset > ip->i_size)
4090                         end_dmi_offset = ip->i_size;
4091                 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOV(ip),
4092                         offset, end_dmi_offset - offset,
4093                         0, NULL);
4094                 if (error)
4095                         return error;
4096         }
4097
4098         /*
4099          * Allocate file space until done or until there is an error
4100          */
4101 retry:
4102         while (allocatesize_fsb && !error) {
4103                 xfs_fileoff_t   s, e;
4104
4105                 /*
4106                  * Determine space reservations for data/realtime.
4107                  */
4108                 if (unlikely(extsz)) {
4109                         s = startoffset_fsb;
4110                         do_div(s, extsz);
4111                         s *= extsz;
4112                         e = startoffset_fsb + allocatesize_fsb;
4113                         if ((temp = do_mod(startoffset_fsb, extsz)))
4114                                 e += temp;
4115                         if ((temp = do_mod(e, extsz)))
4116                                 e += extsz - temp;
4117                 } else {
4118                         s = 0;
4119                         e = allocatesize_fsb;
4120                 }
4121
4122                 if (unlikely(rt)) {
4123                         resrtextents = qblocks = (uint)(e - s);
4124                         resrtextents /= mp->m_sb.sb_rextsize;
4125                         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
4126                         quota_flag = XFS_QMOPT_RES_RTBLKS;
4127                 } else {
4128                         resrtextents = 0;
4129                         resblks = qblocks = \
4130                                 XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s));
4131                         quota_flag = XFS_QMOPT_RES_REGBLKS;
4132                 }
4133
4134                 /*
4135                  * Allocate and setup the transaction.
4136                  */
4137                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
4138                 error = xfs_trans_reserve(tp, resblks,
4139                                           XFS_WRITE_LOG_RES(mp), resrtextents,
4140                                           XFS_TRANS_PERM_LOG_RES,
4141                                           XFS_WRITE_LOG_COUNT);
4142                 /*
4143                  * Check for running out of space
4144                  */
4145                 if (error) {
4146                         /*
4147                          * Free the transaction structure.
4148                          */
4149                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
4150                         xfs_trans_cancel(tp, 0);
4151                         break;
4152                 }
4153                 xfs_ilock(ip, XFS_ILOCK_EXCL);
4154                 error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip,
4155                                                       qblocks, 0, quota_flag);
4156                 if (error)
4157                         goto error1;
4158
4159                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4160                 xfs_trans_ihold(tp, ip);
4161
4162                 /*
4163                  * Issue the xfs_bmapi() call to allocate the blocks
4164                  */
4165                 XFS_BMAP_INIT(&free_list, &firstfsb);
4166                 error = XFS_BMAPI(mp, tp, &ip->i_iocore, startoffset_fsb,
4167                                   allocatesize_fsb, bmapi_flag,
4168                                   &firstfsb, 0, imapp, &nimaps,
4169                                   &free_list, NULL);
4170                 if (error) {
4171                         goto error0;
4172                 }
4173
4174                 /*
4175                  * Complete the transaction
4176                  */
4177                 error = xfs_bmap_finish(&tp, &free_list, &committed);
4178                 if (error) {
4179                         goto error0;
4180                 }
4181
4182                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
4183                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
4184                 if (error) {
4185                         break;
4186                 }
4187
4188                 allocated_fsb = imapp->br_blockcount;
4189
4190                 if (nimaps == 0) {
4191                         error = XFS_ERROR(ENOSPC);
4192                         break;
4193                 }
4194
4195                 startoffset_fsb += allocated_fsb;
4196                 allocatesize_fsb -= allocated_fsb;
4197         }
4198 dmapi_enospc_check:
4199         if (error == ENOSPC && (attr_flags&ATTR_DMI) == 0 &&
4200             DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_NOSPACE)) {
4201
4202                 error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE,
4203                                 XFS_ITOV(ip), DM_RIGHT_NULL,
4204                                 XFS_ITOV(ip), DM_RIGHT_NULL,
4205                                 NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */
4206                 if (error == 0)
4207                         goto retry;     /* Maybe DMAPI app. has made space */
4208                 /* else fall through with error from XFS_SEND_DATA */
4209         }
4210
4211         return error;
4212
4213 error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
4214         xfs_bmap_cancel(&free_list);
4215         XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag);
4216
4217 error1: /* Just cancel transaction */
4218         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
4219         xfs_iunlock(ip, XFS_ILOCK_EXCL);
4220         goto dmapi_enospc_check;
4221 }
4222
4223 /*
4224  * Zero file bytes between startoff and endoff inclusive.
4225  * The iolock is held exclusive and no blocks are buffered.
4226  */
4227 STATIC int
4228 xfs_zero_remaining_bytes(
4229         xfs_inode_t             *ip,
4230         xfs_off_t               startoff,
4231         xfs_off_t               endoff)
4232 {
4233         xfs_bmbt_irec_t         imap;
4234         xfs_fileoff_t           offset_fsb;
4235         xfs_off_t               lastoffset;
4236         xfs_off_t               offset;
4237         xfs_buf_t               *bp;
4238         xfs_mount_t             *mp = ip->i_mount;
4239         int                     nimap;
4240         int                     error = 0;
4241
4242         bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
4243                                 ip->i_d.di_flags & XFS_DIFLAG_REALTIME ?
4244                                 mp->m_rtdev_targp : mp->m_ddev_targp);
4245
4246         for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
4247                 offset_fsb = XFS_B_TO_FSBT(mp, offset);
4248                 nimap = 1;
4249                 error = XFS_BMAPI(mp, NULL, &ip->i_iocore, offset_fsb, 1, 0,
4250                         NULL, 0, &imap, &nimap, NULL, NULL);
4251                 if (error || nimap < 1)
4252                         break;
4253                 ASSERT(imap.br_blockcount >= 1);
4254                 ASSERT(imap.br_startoff == offset_fsb);
4255                 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
4256                 if (lastoffset > endoff)
4257                         lastoffset = endoff;
4258                 if (imap.br_startblock == HOLESTARTBLOCK)
4259                         continue;
4260                 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4261                 if (imap.br_state == XFS_EXT_UNWRITTEN)
4262                         continue;
4263                 XFS_BUF_UNDONE(bp);
4264                 XFS_BUF_UNWRITE(bp);
4265                 XFS_BUF_READ(bp);
4266                 XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock));
4267                 xfsbdstrat(mp, bp);
4268                 if ((error = xfs_iowait(bp))) {
4269                         xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
4270                                           mp, bp, XFS_BUF_ADDR(bp));
4271                         break;
4272                 }
4273                 memset(XFS_BUF_PTR(bp) +
4274                         (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
4275                       0, lastoffset - offset + 1);
4276                 XFS_BUF_UNDONE(bp);
4277                 XFS_BUF_UNREAD(bp);
4278                 XFS_BUF_WRITE(bp);
4279                 xfsbdstrat(mp, bp);
4280                 if ((error = xfs_iowait(bp))) {
4281                         xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
4282                                           mp, bp, XFS_BUF_ADDR(bp));
4283                         break;
4284                 }
4285         }
4286         xfs_buf_free(bp);
4287         return error;
4288 }
4289
4290 /*
4291  * xfs_free_file_space()
4292  *      This routine frees disk space for the given file.
4293  *
4294  *      This routine is only called by xfs_change_file_space
4295  *      for an UNRESVSP type call.
4296  *
4297  * RETURNS:
4298  *       0 on success
4299  *      errno on error
4300  *
4301  */
4302 STATIC int
4303 xfs_free_file_space(
4304         xfs_inode_t             *ip,
4305         xfs_off_t               offset,
4306         xfs_off_t               len,
4307         int                     attr_flags)
4308 {
4309         bhv_vnode_t             *vp;
4310         int                     committed;
4311         int                     done;
4312         xfs_off_t               end_dmi_offset;
4313         xfs_fileoff_t           endoffset_fsb;
4314         int                     error;
4315         xfs_fsblock_t           firstfsb;
4316         xfs_bmap_free_t         free_list;
4317         xfs_bmbt_irec_t         imap;
4318         xfs_off_t               ioffset;
4319         xfs_extlen_t            mod=0;
4320         xfs_mount_t             *mp;
4321         int                     nimap;
4322         uint                    resblks;
4323         uint                    rounding;
4324         int                     rt;
4325         xfs_fileoff_t           startoffset_fsb;
4326         xfs_trans_t             *tp;
4327         int                     need_iolock = 1;
4328
4329         vp = XFS_ITOV(ip);
4330         mp = ip->i_mount;
4331
4332         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
4333
4334         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
4335                 return error;
4336
4337         error = 0;
4338         if (len <= 0)   /* if nothing being freed */
4339                 return error;
4340         rt = (ip->i_d.di_flags & XFS_DIFLAG_REALTIME);
4341         startoffset_fsb = XFS_B_TO_FSB(mp, offset);
4342         end_dmi_offset = offset + len;
4343         endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset);
4344
4345         if (offset < ip->i_size &&
4346             (attr_flags & ATTR_DMI) == 0 &&
4347             DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) {
4348                 if (end_dmi_offset > ip->i_size)
4349                         end_dmi_offset = ip->i_size;
4350                 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp,
4351                                 offset, end_dmi_offset - offset,
4352                                 AT_DELAY_FLAG(attr_flags), NULL);
4353                 if (error)
4354                         return error;
4355         }
4356
4357         if (attr_flags & ATTR_NOLOCK)
4358                 need_iolock = 0;
4359         if (need_iolock) {
4360                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
4361                 vn_iowait(vp);  /* wait for the completion of any pending DIOs */
4362         }
4363
4364         rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, NBPP);
4365         ioffset = offset & ~(rounding - 1);
4366
4367         if (VN_CACHED(vp) != 0) {
4368                 xfs_inval_cached_trace(&ip->i_iocore, ioffset, -1,
4369                                 ctooff(offtoct(ioffset)), -1);
4370                 error = bhv_vop_flushinval_pages(vp, ctooff(offtoct(ioffset)),
4371                                 -1, FI_REMAPF_LOCKED);
4372                 if (error)
4373                         goto out_unlock_iolock;
4374         }
4375
4376         /*
4377          * Need to zero the stuff we're not freeing, on disk.
4378          * If its a realtime file & can't use unwritten extents then we
4379          * actually need to zero the extent edges.  Otherwise xfs_bunmapi
4380          * will take care of it for us.
4381          */
4382         if (rt && !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
4383                 nimap = 1;
4384                 error = XFS_BMAPI(mp, NULL, &ip->i_iocore, startoffset_fsb,
4385                         1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
4386                 if (error)
4387                         goto out_unlock_iolock;
4388                 ASSERT(nimap == 0 || nimap == 1);
4389                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
4390                         xfs_daddr_t     block;
4391
4392                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4393                         block = imap.br_startblock;
4394                         mod = do_div(block, mp->m_sb.sb_rextsize);
4395                         if (mod)
4396                                 startoffset_fsb += mp->m_sb.sb_rextsize - mod;
4397                 }
4398                 nimap = 1;
4399                 error = XFS_BMAPI(mp, NULL, &ip->i_iocore, endoffset_fsb - 1,
4400                         1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
4401                 if (error)
4402                         goto out_unlock_iolock;
4403                 ASSERT(nimap == 0 || nimap == 1);
4404                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
4405                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4406                         mod++;
4407                         if (mod && (mod != mp->m_sb.sb_rextsize))
4408                                 endoffset_fsb -= mod;
4409                 }
4410         }
4411         if ((done = (endoffset_fsb <= startoffset_fsb)))
4412                 /*
4413                  * One contiguous piece to clear
4414                  */
4415                 error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
4416         else {
4417                 /*
4418                  * Some full blocks, possibly two pieces to clear
4419                  */
4420                 if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
4421                         error = xfs_zero_remaining_bytes(ip, offset,
4422                                 XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
4423                 if (!error &&
4424                     XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
4425                         error = xfs_zero_remaining_bytes(ip,
4426                                 XFS_FSB_TO_B(mp, endoffset_fsb),
4427                                 offset + len - 1);
4428         }
4429
4430         /*
4431          * free file space until done or until there is an error
4432          */
4433         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
4434         while (!error && !done) {
4435
4436                 /*
4437                  * allocate and setup the transaction. Allow this
4438                  * transaction to dip into the reserve blocks to ensure
4439                  * the freeing of the space succeeds at ENOSPC.
4440                  */
4441                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
4442                 tp->t_flags |= XFS_TRANS_RESERVE;
4443                 error = xfs_trans_reserve(tp,
4444                                           resblks,
4445                                           XFS_WRITE_LOG_RES(mp),
4446                                           0,
4447                                           XFS_TRANS_PERM_LOG_RES,
4448                                           XFS_WRITE_LOG_COUNT);
4449
4450                 /*
4451                  * check for running out of space
4452                  */
4453                 if (error) {
4454                         /*
4455                          * Free the transaction structure.
4456                          */
4457                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
4458                         xfs_trans_cancel(tp, 0);
4459                         break;
4460                 }
4461                 xfs_ilock(ip, XFS_ILOCK_EXCL);
4462                 error = XFS_TRANS_RESERVE_QUOTA(mp, tp,
4463                                 ip->i_udquot, ip->i_gdquot, resblks, 0,
4464                                 XFS_QMOPT_RES_REGBLKS);
4465                 if (error)
4466                         goto error1;
4467
4468                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4469                 xfs_trans_ihold(tp, ip);
4470
4471                 /*
4472                  * issue the bunmapi() call to free the blocks
4473                  */
4474                 XFS_BMAP_INIT(&free_list, &firstfsb);
4475                 error = XFS_BUNMAPI(mp, tp, &ip->i_iocore, startoffset_fsb,
4476                                   endoffset_fsb - startoffset_fsb,
4477                                   0, 2, &firstfsb, &free_list, NULL, &done);
4478                 if (error) {
4479                         goto error0;
4480                 }
4481
4482                 /*
4483                  * complete the transaction
4484                  */
4485                 error = xfs_bmap_finish(&tp, &free_list, &committed);
4486                 if (error) {
4487                         goto error0;
4488                 }
4489
4490                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
4491                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
4492         }
4493
4494  out_unlock_iolock:
4495         if (need_iolock)
4496                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
4497         return error;
4498
4499  error0:
4500         xfs_bmap_cancel(&free_list);
4501  error1:
4502         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
4503         xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
4504                     XFS_ILOCK_EXCL);
4505         return error;
4506 }
4507
4508 /*
4509  * xfs_change_file_space()
4510  *      This routine allocates or frees disk space for the given file.
4511  *      The user specified parameters are checked for alignment and size
4512  *      limitations.
4513  *
4514  * RETURNS:
4515  *       0 on success
4516  *      errno on error
4517  *
4518  */
4519 int
4520 xfs_change_file_space(
4521         bhv_desc_t      *bdp,
4522         int             cmd,
4523         xfs_flock64_t   *bf,
4524         xfs_off_t       offset,
4525         cred_t          *credp,
4526         int             attr_flags)
4527 {
4528         int             clrprealloc;
4529         int             error;
4530         xfs_fsize_t     fsize;
4531         xfs_inode_t     *ip;
4532         xfs_mount_t     *mp;
4533         int             setprealloc;
4534         xfs_off_t       startoffset;
4535         xfs_off_t       llen;
4536         xfs_trans_t     *tp;
4537         bhv_vattr_t     va;
4538         bhv_vnode_t     *vp;
4539
4540         vp = BHV_TO_VNODE(bdp);
4541         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
4542
4543         ip = XFS_BHVTOI(bdp);
4544         mp = ip->i_mount;
4545
4546         /*
4547          * must be a regular file and have write permission
4548          */
4549         if (!VN_ISREG(vp))
4550                 return XFS_ERROR(EINVAL);
4551
4552         xfs_ilock(ip, XFS_ILOCK_SHARED);
4553
4554         if ((error = xfs_iaccess(ip, S_IWUSR, credp))) {
4555                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
4556                 return error;
4557         }
4558
4559         xfs_iunlock(ip, XFS_ILOCK_SHARED);
4560
4561         switch (bf->l_whence) {
4562         case 0: /*SEEK_SET*/
4563                 break;
4564         case 1: /*SEEK_CUR*/
4565                 bf->l_start += offset;
4566                 break;
4567         case 2: /*SEEK_END*/
4568                 bf->l_start += ip->i_size;
4569                 break;
4570         default:
4571                 return XFS_ERROR(EINVAL);
4572         }
4573
4574         llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len;
4575
4576         if (   (bf->l_start < 0)
4577             || (bf->l_start > XFS_MAXIOFFSET(mp))
4578             || (bf->l_start + llen < 0)
4579             || (bf->l_start + llen > XFS_MAXIOFFSET(mp)))
4580                 return XFS_ERROR(EINVAL);
4581
4582         bf->l_whence = 0;
4583
4584         startoffset = bf->l_start;
4585         fsize = ip->i_size;
4586
4587         /*
4588          * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
4589          * file space.
4590          * These calls do NOT zero the data space allocated to the file,
4591          * nor do they change the file size.
4592          *
4593          * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
4594          * space.
4595          * These calls cause the new file data to be zeroed and the file
4596          * size to be changed.
4597          */
4598         setprealloc = clrprealloc = 0;
4599
4600         switch (cmd) {
4601         case XFS_IOC_RESVSP:
4602         case XFS_IOC_RESVSP64:
4603                 error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
4604                                                                 1, attr_flags);
4605                 if (error)
4606                         return error;
4607                 setprealloc = 1;
4608                 break;
4609
4610         case XFS_IOC_UNRESVSP:
4611         case XFS_IOC_UNRESVSP64:
4612                 if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
4613                                                                 attr_flags)))
4614                         return error;
4615                 break;
4616
4617         case XFS_IOC_ALLOCSP:
4618         case XFS_IOC_ALLOCSP64:
4619         case XFS_IOC_FREESP:
4620         case XFS_IOC_FREESP64:
4621                 if (startoffset > fsize) {
4622                         error = xfs_alloc_file_space(ip, fsize,
4623                                         startoffset - fsize, 0, attr_flags);
4624                         if (error)
4625                                 break;
4626                 }
4627
4628                 va.va_mask = XFS_AT_SIZE;
4629                 va.va_size = startoffset;
4630
4631                 error = xfs_setattr(bdp, &va, attr_flags, credp);
4632
4633                 if (error)
4634                         return error;
4635
4636                 clrprealloc = 1;
4637                 break;
4638
4639         default:
4640                 ASSERT(0);
4641                 return XFS_ERROR(EINVAL);
4642         }
4643
4644         /*
4645          * update the inode timestamp, mode, and prealloc flag bits
4646          */
4647         tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
4648
4649         if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
4650                                       0, 0, 0))) {
4651                 /* ASSERT(0); */
4652                 xfs_trans_cancel(tp, 0);
4653                 return error;
4654         }
4655
4656         xfs_ilock(ip, XFS_ILOCK_EXCL);
4657
4658         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4659         xfs_trans_ihold(tp, ip);
4660
4661         if ((attr_flags & ATTR_DMI) == 0) {
4662                 ip->i_d.di_mode &= ~S_ISUID;
4663
4664                 /*
4665                  * Note that we don't have to worry about mandatory
4666                  * file locking being disabled here because we only
4667                  * clear the S_ISGID bit if the Group execute bit is
4668                  * on, but if it was on then mandatory locking wouldn't
4669                  * have been enabled.
4670                  */
4671                 if (ip->i_d.di_mode & S_IXGRP)
4672                         ip->i_d.di_mode &= ~S_ISGID;
4673
4674                 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
4675         }
4676         if (setprealloc)
4677                 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
4678         else if (clrprealloc)
4679                 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
4680
4681         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
4682         xfs_trans_set_sync(tp);
4683
4684         error = xfs_trans_commit(tp, 0);
4685
4686         xfs_iunlock(ip, XFS_ILOCK_EXCL);
4687
4688         return error;
4689 }
4690
4691 bhv_vnodeops_t xfs_vnodeops = {
4692         BHV_IDENTITY_INIT(VN_BHV_XFS,VNODE_POSITION_XFS),
4693         .vop_open               = xfs_open,
4694         .vop_read               = xfs_read,
4695 #ifdef HAVE_SPLICE
4696         .vop_splice_read        = xfs_splice_read,
4697         .vop_splice_write       = xfs_splice_write,
4698 #endif
4699         .vop_write              = xfs_write,
4700         .vop_ioctl              = xfs_ioctl,
4701         .vop_getattr            = xfs_getattr,
4702         .vop_setattr            = xfs_setattr,
4703         .vop_access             = xfs_access,
4704         .vop_lookup             = xfs_lookup,
4705         .vop_create             = xfs_create,
4706         .vop_remove             = xfs_remove,
4707         .vop_link               = xfs_link,
4708         .vop_rename             = xfs_rename,
4709         .vop_mkdir              = xfs_mkdir,
4710         .vop_rmdir              = xfs_rmdir,
4711         .vop_readdir            = xfs_readdir,
4712         .vop_symlink            = xfs_symlink,
4713         .vop_readlink           = xfs_readlink,
4714         .vop_fsync              = xfs_fsync,
4715         .vop_inactive           = xfs_inactive,
4716         .vop_fid2               = xfs_fid2,
4717         .vop_rwlock             = xfs_rwlock,
4718         .vop_rwunlock           = xfs_rwunlock,
4719         .vop_bmap               = xfs_bmap,
4720         .vop_reclaim            = xfs_reclaim,
4721         .vop_attr_get           = xfs_attr_get,
4722         .vop_attr_set           = xfs_attr_set,
4723         .vop_attr_remove        = xfs_attr_remove,
4724         .vop_attr_list          = xfs_attr_list,
4725         .vop_link_removed       = (vop_link_removed_t)fs_noval,
4726         .vop_vnode_change       = (vop_vnode_change_t)fs_noval,
4727         .vop_tosspages          = fs_tosspages,
4728         .vop_flushinval_pages   = fs_flushinval_pages,
4729         .vop_flush_pages        = fs_flush_pages,
4730         .vop_release            = xfs_release,
4731         .vop_iflush             = xfs_inode_flush,
4732 };