2 FUSE: Filesystem in Userspace
3 Copyright (C) 2001-2005 Miklos Szeredi <miklos@szeredi.hu>
5 This program can be distributed under the terms of the GNU GPL.
11 #include <linux/init.h>
12 #include <linux/module.h>
13 #include <linux/poll.h>
14 #include <linux/uio.h>
15 #include <linux/miscdevice.h>
16 #include <linux/pagemap.h>
17 #include <linux/file.h>
18 #include <linux/slab.h>
/* Declare a module alias for the misc device with minor FUSE_MINOR. */
20 MODULE_ALIAS_MISCDEV(FUSE_MINOR);
/* Slab cache that struct fuse_req objects are allocated from and freed
   to; created in fuse_dev_init() and destroyed in fuse_dev_cleanup(). */
22 static kmem_cache_t *fuse_req_cachep;
/*
 * Read the connection pointer stored in file->private_data while holding
 * fuse_lock.  The visible test singles out a connection that exists but
 * is no longer mounted; what happens in that case, and the return
 * statement, are not part of this excerpt.
 */
24 static struct fuse_conn *fuse_get_conn(struct file *file)
27 spin_lock(&fuse_lock);
28 fc = file->private_data;
29 if (fc && !fc->mounted)
31 spin_unlock(&fuse_lock);
/*
 * Put a request into its initial state: zero the whole structure, then
 * set up the list head and wait queue and start the reference count at 1.
 */
35 static void fuse_request_init(struct fuse_req *req)
37 memset(req, 0, sizeof(*req));
38 INIT_LIST_HEAD(&req->list);
39 init_waitqueue_head(&req->waitq);
40 atomic_set(&req->count, 1);
/*
 * Allocate a request from fuse_req_cachep (SLAB_KERNEL) and initialise it
 * with fuse_request_init().
 * NOTE(review): no NULL check between the allocation and the init call is
 * visible in this excerpt -- confirm that one exists.
 */
43 struct fuse_req *fuse_request_alloc(void)
45 struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, SLAB_KERNEL);
47 fuse_request_init(req);
/* Hand a request back to the fuse_req_cachep slab cache. */
51 void fuse_request_free(struct fuse_req *req)
53 kmem_cache_free(fuse_req_cachep, req);
/*
 * Block signals for the current task using a mask built as the inverse
 * of SIGKILL (so SIGKILL stays deliverable).  The previous mask is saved
 * in *oldset for a later restore_sigs().
 */
56 static void block_sigs(sigset_t *oldset)
60 siginitsetinv(&mask, sigmask(SIGKILL));
61 sigprocmask(SIG_BLOCK, &mask, oldset);
/* Reinstate the signal mask previously saved in *oldset. */
64 static void restore_sigs(sigset_t *oldset)
66 sigprocmask(SIG_SETMASK, oldset, NULL);
/*
 * Re-initialise a request so it can be reused, preserving only its
 * 'preallocated' flag.  The caller must hold the sole reference: a count
 * other than 1 triggers BUG.
 */
69 void fuse_reset_request(struct fuse_req *req)
71 int preallocated = req->preallocated;
72 BUG_ON(atomic_read(&req->count) != 1);
/* fuse_request_init() zeroes the structure, so the flag is put back after */
73 fuse_request_init(req);
74 req->preallocated = preallocated;
/* Take one additional reference on the request. */
77 static void __fuse_get_request(struct fuse_req *req)
79 atomic_inc(&req->count);
82 /* Must be called with > 1 refcount */
/* Drops one reference without ever releasing the request: a count below 2
   on entry triggers BUG, so the count cannot reach zero here. */
83 static void __fuse_put_request(struct fuse_req *req)
85 BUG_ON(atomic_read(&req->count) < 2);
86 atomic_dec(&req->count);
/*
 * Take the first request off fc->unused_list (which must not be empty)
 * under fuse_lock, re-initialise it, mark it as preallocated and record
 * the calling task's fsuid, fsgid and pid in the request's input header.
 */
89 static struct fuse_req *do_get_request(struct fuse_conn *fc)
93 spin_lock(&fuse_lock);
94 BUG_ON(list_empty(&fc->unused_list));
95 req = list_entry(fc->unused_list.next, struct fuse_req, list);
96 list_del_init(&req->list);
97 spin_unlock(&fuse_lock);
/* init clears everything, so the preallocated flag must be set again */
98 fuse_request_init(req);
99 req->preallocated = 1;
100 req->in.h.uid = current->fsuid;
101 req->in.h.gid = current->fsgid;
102 req->in.h.pid = current->pid;
106 /* This can return NULL, but only in case it's interrupted by a SIGKILL */
/* Waits interruptibly on fc->outstanding_sem and, if the wait was not
   interrupted, obtains a request through do_get_request().  The
   restore_sigs() call implies the signal mask was changed beforehand;
   that part is not visible in this excerpt. */
107 struct fuse_req *fuse_get_request(struct fuse_conn *fc)
113 intr = down_interruptible(&fc->outstanding_sem);
114 restore_sigs(&oldset);
115 return intr ? NULL : do_get_request(fc);
/*
 * Dispose of a request (called from fuse_put_request() once the last
 * reference is gone).  Under fuse_lock: a preallocated request is put
 * back on fc->unused_list, while fuse_request_free() releases one that
 * is not; then either fc->outstanding_debt is decremented or
 * fc->outstanding_sem is up()ed.
 * NOTE(review): the lines joining each pair of alternatives (presumably
 * 'else') are missing from this excerpt -- confirm against the full file.
 */
118 static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req)
120 spin_lock(&fuse_lock);
121 if (req->preallocated)
122 list_add(&req->list, &fc->unused_list);
124 fuse_request_free(req);
126 /* If we are in debt decrease that first */
127 if (fc->outstanding_debt)
128 fc->outstanding_debt--;
130 up(&fc->outstanding_sem);
131 spin_unlock(&fuse_lock);
/* Drop one reference on the request; when the count reaches zero the
   request is handed to fuse_putback_request(). */
134 void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
136 if (atomic_dec_and_test(&req->count))
137 fuse_putback_request(fc, req);
/*
 * Release a background request.  The only part visible in this excerpt
 * unlinks the request from the list it is on via bg_entry, under
 * fuse_lock; statements preceding that are missing here.
 */
140 void fuse_release_background(struct fuse_req *req)
146 spin_lock(&fuse_lock);
147 list_del(&req->bg_entry);
148 spin_unlock(&fuse_lock);
/*
 * Digest the reply to a FUSE_INIT request.  A request error or a major
 * version different from FUSE_KERNEL_VERSION is detected first (the
 * action taken for that case is not visible in this excerpt).  The
 * visible remainder records the minor version and the maximum write
 * size in the connection, then releases the other request slots.
 */
151 static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
154 struct fuse_init_out *arg = &req->misc.init_out;
156 if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION)
159 fc->minor = arg->minor;
/* replies with minor < 5 get a fixed 4096; otherwise the reply's value */
160 fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
163 /* After INIT reply is received other requests can go
164 out. So do (FUSE_MAX_OUTSTANDING - 1) number of
165 up()s on outstanding_sem. The last up() is done in
166 fuse_putback_request() */
167 for (i = 1; i < FUSE_MAX_OUTSTANDING; i++)
168 up(&fc->outstanding_sem);
172 * This function is called when a request is finished. Either a reply
173 * has arrived or it was interrupted (and not yet sent) or some error
174 * occurred during communication with userspace, or the device file
175 * was closed. In case of a background request the reference to the
176 * stored objects are released. The requester thread is woken up (if
177 * still waiting), and finally the reference to the request is released.
180 * Called with fuse_lock, unlocks it
/*
 * Finish a request.  fuse_lock is dropped first.  A background request
 * is released with fuse_release_background() while fc->sbput_sem is held
 * for reading.  Any waiter on req->waitq is woken.  An INIT request has
 * its reply processed; a RELEASE request without an inode is reset and
 * reused to send a FORGET for the same nodeid.  fuse_put_request() drops
 * a reference at the end.
 * NOTE(review): several lines (including whatever sits between the
 * FORGET path and the final put) are missing from this excerpt.
 */
182 static void request_end(struct fuse_conn *fc, struct fuse_req *req)
185 spin_unlock(&fuse_lock);
186 if (req->background) {
187 down_read(&fc->sbput_sem);
189 fuse_release_background(req);
190 up_read(&fc->sbput_sem);
192 wake_up(&req->waitq);
193 if (req->in.h.opcode == FUSE_INIT)
194 process_init_reply(fc, req);
195 else if (req->in.h.opcode == FUSE_RELEASE && req->inode == NULL) {
196 /* Special case for failed iget in CREATE */
/* nodeid is saved first because fuse_reset_request() wipes the request */
197 u64 nodeid = req->in.h.nodeid;
198 fuse_reset_request(req);
199 fuse_send_forget(fc, req, nodeid, 1);
202 fuse_put_request(fc, req);
206 * Unfortunately request interruption not just solves the deadlock
207 * problem, it causes problems too. These stem from the fact, that an
208 * interrupted request is continued to be processed in userspace,
209 * while all the locks and object references (inode and file) held
210 * during the operation are released.
212 * To release the locks is exactly why there's a need to interrupt the
213 * request, so there's not a lot that can be done about this, except
214 * introduce additional locking in userspace.
216 * More important is to keep inode and file references until userspace
217 * has replied, otherwise FORGET and RELEASE could be sent while the
218 * inode/file is still used by the filesystem.
220 * For this reason the concept of "background" request is introduced.
221 * An interrupted request is backgrounded if it has been already sent
222 * to userspace. Backgrounding involves getting an extra reference to
223 * inode(s) or file used in the request, and adding the request to
224 * fc->background list. When a reply is received for a background
225 * request, the object references are released, and the request is
226 * removed from the list. If the filesystem is unmounted while there
227 * are still background requests, the list is walked and references
228 * are released as if a reply was received.
230 * There's one more use for a background request. The RELEASE message is
231 * always sent as background, since it doesn't return an error or data.
/*
 * Turn a request into a background request: link it into fc->background
 * through bg_entry and grab references on req->inode and req->inode2
 * with igrab().  Any conditions guarding the igrab() calls, and further
 * statements, are not visible in this excerpt.
 */
234 static void background_request(struct fuse_conn *fc, struct fuse_req *req)
237 list_add(&req->bg_entry, &fc->background);
239 req->inode = igrab(req->inode);
241 req->inode2 = igrab(req->inode2);
246 /* Called with fuse_lock held. Releases, and then reacquires it. */
247 static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
251 spin_unlock(&fuse_lock);
253 wait_event_interruptible(req->waitq, req->finished);
254 restore_sigs(&oldset);
255 spin_lock(&fuse_lock);
259 req->out.h.error = -EINTR;
260 req->interrupted = 1;
262 /* This is uninterruptible sleep, because data is
263 being copied to/from the buffers of req. During
264 locked state, there mustn't be any filesystem
265 operation (e.g. page fault), since that could lead
267 spin_unlock(&fuse_lock);
268 wait_event(req->waitq, !req->locked);
269 spin_lock(&fuse_lock);
271 if (!req->sent && !list_empty(&req->list)) {
272 list_del(&req->list);
273 __fuse_put_request(req);
274 } else if (!req->finished && req->sent)
275 background_request(fc, req);
/* Add up the 'size' fields of the first numargs entries of args into
   nbytes (its declaration and the return are not visible here). */
278 static unsigned len_args(unsigned numargs, struct fuse_arg *args)
283 for (i = 0; i < numargs; i++)
284 nbytes += args[i].size;
/*
 * Prepare a request for sending and append it to fc->pending: assign the
 * unique id from fc->reqctr, compute the total length (header plus all
 * input arguments) and, for a request that is not preallocated, account
 * for it in outstanding_sem or outstanding_debt.  The update of
 * fc->reqctr and anything following the list insertion are not visible
 * in this excerpt.
 */
289 static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
292 /* zero is special */
295 req->in.h.unique = fc->reqctr;
296 req->in.h.len = sizeof(struct fuse_in_header) +
297 len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
298 if (!req->preallocated) {
299 /* If request is not preallocated (either FORGET or
300 RELEASE), then still decrease outstanding_sem, so
301 user can't open infinite number of files while not
302 processing the RELEASE requests. However for
303 efficiency do it without blocking, so if down()
304 would block, just increase the debt instead */
305 if (down_trylock(&fc->outstanding_sem))
306 fc->outstanding_debt++;
308 list_add_tail(&req->list, &fc->pending);
313 * This can only be interrupted by a SIGKILL
/*
 * Send a request and wait for its answer.  Under fuse_lock the request
 * either fails immediately (-ENOTCONN, or -ECONNREFUSED when
 * fc->conn_error is set) or is queued, given an extra reference and
 * waited on via request_wait_answer().  The condition selecting
 * -ENOTCONN and the branch structure are only partly visible here.
 */
315 void request_send(struct fuse_conn *fc, struct fuse_req *req)
318 spin_lock(&fuse_lock);
320 req->out.h.error = -ENOTCONN;
321 else if (fc->conn_error)
322 req->out.h.error = -ECONNREFUSED;
324 queue_request(fc, req);
325 /* acquire extra reference, since request is still needed
326 after request_end() */
327 __fuse_get_request(req);
329 request_wait_answer(fc, req);
331 spin_unlock(&fuse_lock);
/*
 * Send a request without waiting for the answer.  Two outcomes are
 * visible: the request is queued under fuse_lock, or it is ended at once
 * with -ENOTCONN through request_end().  The condition choosing between
 * them is missing from this excerpt.
 */
334 static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
336 spin_lock(&fuse_lock);
338 queue_request(fc, req);
339 spin_unlock(&fuse_lock);
341 req->out.h.error = -ENOTCONN;
342 request_end(fc, req);
/* Send a request via request_send_nowait(); any setup done before that
   call is not visible in this excerpt. */
346 void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
349 request_send_nowait(fc, req);
/* Register the request as a background request (under fuse_lock) and
   then send it without waiting for the answer. */
352 void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
355 spin_lock(&fuse_lock);
356 background_request(fc, req);
357 spin_unlock(&fuse_lock);
358 request_send_nowait(fc, req);
/*
 * Build and send the FUSE_INIT request as a background request.  The
 * input argument carries this kernel's protocol major/minor version;
 * the reply is stored in req->misc.init_out.
 * NOTE(review): fuse_get_request() may return NULL and req is used
 * without a check; the comment below states why a request is expected
 * to be available -- confirm that a SIGKILL cannot hit this path.
 */
361 void fuse_send_init(struct fuse_conn *fc)
363 /* This is called from fuse_read_super() so there's guaranteed
364 to be exactly one request available */
365 struct fuse_req *req = fuse_get_request(fc);
366 struct fuse_init_in *arg = &req->misc.init_in;
367 arg->major = FUSE_KERNEL_VERSION;
368 arg->minor = FUSE_KERNEL_MINOR_VERSION;
369 req->in.h.opcode = FUSE_INIT;
371 req->in.args[0].size = sizeof(*arg);
372 req->in.args[0].value = arg;
373 req->out.numargs = 1;
374 /* Variable length argument used for backward compatibility
375 with interface version < 7.5. Rest of init_out is zeroed
376 by do_get_request(), so a short reply is not a problem */
378 req->out.args[0].size = sizeof(struct fuse_init_out);
379 req->out.args[0].value = &req->misc.init_out;
380 request_send_background(fc, req);
384 * Lock the request. Up to the next unlock_request() there mustn't be
385 * anything that could cause a page-fault. If the request was already
386 * interrupted bail out.
/* Under fuse_lock, checks whether the request has been interrupted.  The
   statements run for either outcome and the return value are not visible
   in this excerpt. */
388 static int lock_request(struct fuse_req *req)
392 spin_lock(&fuse_lock);
393 if (req->interrupted)
397 spin_unlock(&fuse_lock);
403 * Unlock request. If it was interrupted during being locked, the
404 * requester thread is currently waiting for it to be unlocked, so
/* Under fuse_lock, wakes up waiters on req->waitq if the request was
   interrupted.  Any change to the request's locked state is not visible
   in this excerpt. */
407 static void unlock_request(struct fuse_req *req)
410 spin_lock(&fuse_lock);
412 if (req->interrupted)
413 wake_up(&req->waitq);
414 spin_unlock(&fuse_lock);
/* State carried through one copy between a request and a userspace
   iovec.  Only some members are visible in this excerpt; other code in
   this file also uses write, addr, pg, mapaddr, buf and len. */
418 struct fuse_copy_state {
/* request being copied (fuse_dev_writev() passes NULL at init time) */
420 struct fuse_req *req;
/* remaining iovec segments of the userspace buffer, and how many */
421 const struct iovec *iov;
422 unsigned long nr_segs;
/* bytes still unconsumed in the current segment */
423 unsigned long seglen;
/* Zero the copy state and store the caller's parameters in it; only the
   nr_segs assignment is visible in this excerpt. */
431 static void fuse_copy_init(struct fuse_copy_state *cs, int write,
432 struct fuse_req *req, const struct iovec *iov,
433 unsigned long nr_segs)
435 memset(cs, 0, sizeof(*cs));
439 cs->nr_segs = nr_segs;
442 /* Unmap and put previous page of userspace buffer */
/* Undoes the kmap_atomic() mapping made in fuse_copy_fill(); the page is
   flushed and marked dirty (the conditions around these calls and the
   release of the page are not visible in this excerpt). */
443 static void fuse_copy_finish(struct fuse_copy_state *cs)
446 kunmap_atomic(cs->mapaddr, KM_USER0);
448 flush_dcache_page(cs->pg);
449 set_page_dirty_lock(cs->pg);
457 * Get another pagefull of userspace buffer, and map it to kernel
458 * address space, and lock request
/*
 * Move on to the next piece of the userspace buffer: unlock the request,
 * release the previously mapped page, pin one user page at cs->addr with
 * get_user_pages() (holding the current task's mmap_sem for reading),
 * map it with kmap_atomic() and set cs->buf / cs->len to the part of the
 * page usable for this segment.  Returns the result of lock_request().
 * NOTE(review): several lines are missing from this excerpt (segment
 * advance, error handling after get_user_pages(), the tail of that call).
 * The '&current' operands below were garbled into a currency-sign
 * character in this listing and are restored here.
 */
460 static int fuse_copy_fill(struct fuse_copy_state *cs)
462 unsigned long offset;
465 unlock_request(cs->req);
466 fuse_copy_finish(cs);
468 BUG_ON(!cs->nr_segs);
469 cs->seglen = cs->iov[0].iov_len;
470 cs->addr = (unsigned long) cs->iov[0].iov_base;
474 down_read(&current->mm->mmap_sem);
475 err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0,
477 up_read(&current->mm->mmap_sem);
/* the buffer need not start on a page boundary */
481 offset = cs->addr % PAGE_SIZE;
482 cs->mapaddr = kmap_atomic(cs->pg, KM_USER0);
483 cs->buf = cs->mapaddr + offset;
/* usable length: rest of this page, capped by what the segment has left */
484 cs->len = min(PAGE_SIZE - offset, cs->seglen);
485 cs->seglen -= cs->len;
488 return lock_request(cs->req);
491 /* Do as much copy to/from userspace buffer as we can */
/* Copies at most min(*size, cs->len) bytes between *val and cs->buf.
   Both directions appear below; the condition selecting one, the pointer
   and length updates and the return value are not visible in this
   excerpt. */
492 static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
494 unsigned ncpy = min(*size, cs->len);
497 memcpy(cs->buf, *val, ncpy);
499 memcpy(*val, cs->buf, ncpy);
509 * Copy a page in the request to/from the userspace buffer. Must be
/*
 * Copy 'count' bytes at 'offset' of a request page to or from the
 * userspace buffer.  When zeroing is requested and less than a full page
 * is to be copied, the whole page is cleared first.  The userspace
 * buffer is refilled with fuse_copy_fill() whenever cs->len is zero, and
 * fuse_copy_do() moves the data; a NULL value pointer is passed on the
 * path taken without a mapped page (presumably when page is NULL --
 * the selecting lines are missing from this excerpt).
 */
512 static int fuse_copy_page(struct fuse_copy_state *cs, struct page *page,
513 unsigned offset, unsigned count, int zeroing)
515 if (page && zeroing && count < PAGE_SIZE) {
516 void *mapaddr = kmap_atomic(page, KM_USER1);
517 memset(mapaddr, 0, PAGE_SIZE);
518 kunmap_atomic(mapaddr, KM_USER1);
522 if (!cs->len && (err = fuse_copy_fill(cs)))
525 void *mapaddr = kmap_atomic(page, KM_USER1);
526 void *buf = mapaddr + offset;
527 offset += fuse_copy_do(cs, &buf, &count);
528 kunmap_atomic(mapaddr, KM_USER1);
530 offset += fuse_copy_do(cs, NULL, &count);
/* flush the data cache for the page when cs->write is clear */
532 if (page && !cs->write)
533 flush_dcache_page(page);
537 /* Copy pages in the request to/from userspace buffer */
/* Walks req->pages while bytes remain (or zeroing is requested).  The
   first page is copied from req->page_offset, with the count capped at
   the rest of that page; the count for a following page is capped at
   PAGE_SIZE.  Error handling and the updates of nbytes/offset inside the
   loop are not visible in this excerpt. */
538 static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
542 struct fuse_req *req = cs->req;
543 unsigned offset = req->page_offset;
544 unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset);
546 for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) {
547 struct page *page = req->pages[i];
548 int err = fuse_copy_page(cs, page, offset, count, zeroing);
553 count = min(nbytes, (unsigned) PAGE_SIZE);
559 /* Copy a single argument in the request to/from userspace buffer */
/* Refills the userspace buffer with fuse_copy_fill() when cs->len is
   zero, then copies with fuse_copy_do().  The enclosing loop and the
   return statements are not visible in this excerpt. */
560 static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size)
564 if (!cs->len && (err = fuse_copy_fill(cs)))
566 fuse_copy_do(cs, &val, &size);
571 /* Copy request arguments to/from userspace buffer */
/* Iterates over the numargs entries of args and stops at the first
   error.  When argpages is set the last argument is copied via
   fuse_copy_pages(); fuse_copy_one() copies the arguments that are not
   handled that way. */
572 static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
573 unsigned argpages, struct fuse_arg *args,
579 for (i = 0; !err && i < numargs; i++) {
580 struct fuse_arg *arg = &args[i];
581 if (i == numargs - 1 && argpages)
582 err = fuse_copy_pages(cs, arg->size, zeroing);
584 err = fuse_copy_one(cs, arg->value, arg->size);
589 /* Wait until a request is available on the pending list */
/* Registers the current task as an exclusive waiter on fc->waitq and
   loops while the connection is mounted and fc->pending is empty.  Each
   pass sets TASK_INTERRUPTIBLE, checks for a pending signal and drops
   fuse_lock before re-taking it (the statement between unlock and lock,
   and the signal exit, are not visible in this excerpt).  fuse_lock is
   evidently held by the caller. */
590 static void request_wait(struct fuse_conn *fc)
592 DECLARE_WAITQUEUE(wait, current);
594 add_wait_queue_exclusive(&fc->waitq, &wait);
595 while (fc->mounted && list_empty(&fc->pending)) {
596 set_current_state(TASK_INTERRUPTIBLE);
597 if (signal_pending(current))
600 spin_unlock(&fuse_lock);
602 spin_lock(&fuse_lock);
604 set_current_state(TASK_RUNNING);
605 remove_wait_queue(&fc->waitq, &wait);
609 * Read a single request into the userspace filesystem's buffer. This
610 * function waits until a request is available, then removes it from
611 * the pending list and copies request data to userspace buffer. If
612 * no reply is needed (FORGET) or request has been interrupted or
613 * there was an error during the copying then it's finished by calling
614 * request_end(). Otherwise add it to the processing list, and set
/*
 * readv handler of the FUSE device.  Visible steps: take fuse_lock and
 * the connection from file->private_data; take the first request off
 * fc->pending; if the caller's buffer is smaller than the request, end
 * the request with -EIO (-E2BIG for SETXATTR); otherwise copy header and
 * arguments to userspace with fuse_lock dropped, then either end the
 * request or move it to fc->processing.
 * NOTE(review): many lines (waiting, validity checks, computation of
 * reqsize and 'in', gotos and returns) are missing from this excerpt.
 */
617 static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
618 unsigned long nr_segs, loff_t *off)
621 struct fuse_conn *fc;
622 struct fuse_req *req;
624 struct fuse_copy_state cs;
628 spin_lock(&fuse_lock);
629 fc = file->private_data;
638 if (list_empty(&fc->pending))
641 req = list_entry(fc->pending.next, struct fuse_req, list);
642 list_del_init(&req->list);
646 /* If request is too large, reply with an error and restart the read */
647 if (iov_length(iov, nr_segs) < reqsize) {
648 req->out.h.error = -EIO;
649 /* SETXATTR is special, since it may contain too large data */
650 if (in->h.opcode == FUSE_SETXATTR)
651 req->out.h.error = -E2BIG;
652 request_end(fc, req);
/* copying is done with fuse_lock dropped */
655 spin_unlock(&fuse_lock);
656 fuse_copy_init(&cs, 1, req, iov, nr_segs);
657 err = fuse_copy_one(&cs, &in->h, sizeof(in->h));
659 err = fuse_copy_args(&cs, in->numargs, in->argpages,
660 (struct fuse_arg *) in->args, 0);
661 fuse_copy_finish(&cs);
662 spin_lock(&fuse_lock);
664 if (!err && req->interrupted)
/* an error set by the interrupter is not overwritten with -EIO */
667 if (!req->interrupted)
668 req->out.h.error = -EIO;
669 request_end(fc, req);
673 request_end(fc, req);
676 list_add_tail(&req->list, &fc->processing);
677 spin_unlock(&fuse_lock);
682 spin_unlock(&fuse_lock);
/* read handler: wraps the single buffer in a one-element iovec and
   defers to fuse_dev_readv().  The iovec declaration and the iov_base
   assignment are not visible in this excerpt. */
686 static ssize_t fuse_dev_read(struct file *file, char __user *buf,
687 size_t nbytes, loff_t *off)
690 iov.iov_len = nbytes;
692 return fuse_dev_readv(file, &iov, 1, off);
695 /* Look up request on processing list by unique ID */
/* Walks fc->processing and compares each request's in.h.unique with the
   wanted id.  The return statements are not visible in this excerpt. */
696 static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
698 struct list_head *entry;
700 list_for_each(entry, &fc->processing) {
701 struct fuse_req *req;
702 req = list_entry(entry, struct fuse_req, list);
703 if (req->in.h.unique == unique)
/*
 * Validate the size of a reply and copy its arguments into the request.
 * One path (its condition is not visible here) accepts only a reply that
 * is exactly a bare header.  Otherwise the expected size is header plus
 * all output arguments: a longer reply is rejected, and a shorter one is
 * tolerated only when out->argvar is set, in which case the last
 * argument's size is reduced by the shortfall.  The error returns are
 * not visible in this excerpt.
 */
709 static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
712 unsigned reqsize = sizeof(struct fuse_out_header);
715 return nbytes != reqsize ? -EINVAL : 0;
717 reqsize += len_args(out->numargs, out->args);
719 if (reqsize < nbytes || (reqsize > nbytes && !out->argvar))
721 else if (reqsize > nbytes) {
722 struct fuse_arg *lastarg = &out->args[out->numargs-1];
723 unsigned diffsize = reqsize - nbytes;
724 if (diffsize > lastarg->size)
726 lastarg->size -= diffsize;
728 return fuse_copy_args(cs, out->numargs, out->argpages, out->args,
733 * Write a single reply to a request. First the header is copied from
734 * the write buffer. The request is then searched on the processing
735 * list by the unique ID found in the header. If found, then remove
736 * it from the list and copy the rest of the buffer to the request.
737 * The request is finished by calling request_end()
/*
 * writev handler of the FUSE device: accepts one reply.  Visible steps:
 * reject a write shorter than the reply header; copy the header in and
 * sanity-check it; look the request up on fc->processing by oh.unique
 * and unlink it; if the request was interrupted, end it straight away;
 * otherwise copy the reply arguments with fuse_lock dropped and end the
 * request, using -EIO when the copy failed and the request was not
 * interrupted.  Returns nbytes on success or the error.
 * NOTE(review): many lines (error gotos, storing the header in the
 * request, flag updates) are missing from this excerpt.
 */
739 static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
740 unsigned long nr_segs, loff_t *off)
743 unsigned nbytes = iov_length(iov, nr_segs);
744 struct fuse_req *req;
745 struct fuse_out_header oh;
746 struct fuse_copy_state cs;
747 struct fuse_conn *fc = fuse_get_conn(file);
/* no request is known yet, so the copy state starts with req == NULL */
751 fuse_copy_init(&cs, 0, NULL, iov, nr_segs);
752 if (nbytes < sizeof(struct fuse_out_header))
755 err = fuse_copy_one(&cs, &oh, sizeof(oh));
759 if (!oh.unique || oh.error <= -1000 || oh.error > 0 ||
763 spin_lock(&fuse_lock);
764 req = request_find(fc, oh.unique);
769 list_del_init(&req->list);
770 if (req->interrupted) {
771 spin_unlock(&fuse_lock);
772 fuse_copy_finish(&cs);
773 spin_lock(&fuse_lock);
774 request_end(fc, req);
780 spin_unlock(&fuse_lock);
782 err = copy_out_args(&cs, &req->out, nbytes);
783 fuse_copy_finish(&cs);
785 spin_lock(&fuse_lock);
788 if (req->interrupted)
790 } else if (!req->interrupted)
791 req->out.h.error = -EIO;
792 request_end(fc, req);
794 return err ? err : nbytes;
797 spin_unlock(&fuse_lock);
799 fuse_copy_finish(&cs);
/* write handler: wraps the single buffer in a one-element iovec and
   defers to fuse_dev_writev(). */
803 static ssize_t fuse_dev_write(struct file *file, const char __user *buf,
804 size_t nbytes, loff_t *off)
807 iov.iov_len = nbytes;
/* the cast drops the const qualifier to match iov_base */
808 iov.iov_base = (char __user *) buf;
809 return fuse_dev_writev(file, &iov, 1, off);
/* poll handler.  The mask starts out as writable (POLLOUT | POLLWRNORM);
   readability (POLLIN | POLLRDNORM) is added when fc->pending is not
   empty.  Handling of a missing connection and the return are not
   visible in this excerpt. */
812 static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
814 struct fuse_conn *fc = fuse_get_conn(file);
815 unsigned mask = POLLOUT | POLLWRNORM;
820 poll_wait(file, &fc->waitq, wait);
822 spin_lock(&fuse_lock);
823 if (!list_empty(&fc->pending))
824 mask |= POLLIN | POLLRDNORM;
825 spin_unlock(&fuse_lock);
830 /* Abort all requests on the given list (pending or processing) */
/* Each request is unlinked, given -ECONNABORTED and passed to
   request_end().  request_end() drops fuse_lock, so the lock is taken
   again before the list is examined for the next iteration. */
831 static void end_requests(struct fuse_conn *fc, struct list_head *head)
833 while (!list_empty(head)) {
834 struct fuse_req *req;
835 req = list_entry(head->next, struct fuse_req, list);
836 list_del_init(&req->list);
837 req->out.h.error = -ECONNABORTED;
838 request_end(fc, req);
839 spin_lock(&fuse_lock);
/* release handler of the FUSE device.  Under fuse_lock, aborts every
   request on the pending and processing lists of the connection found in
   file->private_data and then calls fuse_release_conn().  The check on
   fc and the return are not visible in this excerpt. */
843 static int fuse_dev_release(struct inode *inode, struct file *file)
845 struct fuse_conn *fc;
847 spin_lock(&fuse_lock);
848 fc = file->private_data;
851 end_requests(fc, &fc->pending);
852 end_requests(fc, &fc->processing);
853 fuse_release_conn(fc);
855 spin_unlock(&fuse_lock);
/* File operations of the FUSE device: maps read/readv, write/writev,
   poll and release to the handlers defined in this file.  At least one
   initializer line is missing from this excerpt. */
859 struct file_operations fuse_dev_operations = {
860 .owner = THIS_MODULE,
862 .read = fuse_dev_read,
863 .readv = fuse_dev_readv,
864 .write = fuse_dev_write,
865 .writev = fuse_dev_writev,
866 .poll = fuse_dev_poll,
867 .release = fuse_dev_release,
/* Misc device descriptor registered in fuse_dev_init(); it uses
   fuse_dev_operations.  Other fields are not visible in this excerpt. */
870 static struct miscdevice fuse_miscdevice = {
873 .fops = &fuse_dev_operations,
/* Module-init part for the device: creates the "fuse_request" slab cache
   sized for struct fuse_req and registers the misc device.  The
   out_cache_clean path destroys the cache again; the error values and
   returns are not visible in this excerpt. */
876 int __init fuse_dev_init(void)
879 fuse_req_cachep = kmem_cache_create("fuse_request",
880 sizeof(struct fuse_req),
882 if (!fuse_req_cachep)
885 err = misc_register(&fuse_miscdevice);
887 goto out_cache_clean;
892 kmem_cache_destroy(fuse_req_cachep);
/* Undo fuse_dev_init(): deregister the misc device, then destroy the
   request slab cache. */
897 void fuse_dev_cleanup(void)
899 misc_deregister(&fuse_miscdevice);
900 kmem_cache_destroy(fuse_req_cachep);