fs: push i_mutex and filemap_write_and_wait down into ->fsync() handlers
[linux-2.6.git] / ipc / mqueue.c
1 /*
2  * POSIX message queues filesystem for Linux.
3  *
4  * Copyright (C) 2003,2004  Krzysztof Benedyczak    (golbi@mat.uni.torun.pl)
5  *                          Michal Wronski          (michal.wronski@gmail.com)
6  *
7  * Spinlocks:               Mohamed Abbas           (abbas.mohamed@intel.com)
8  * Lockless receive & send, fd based notify:
9  *                          Manfred Spraul          (manfred@colorfullife.com)
10  *
11  * Audit:                   George Wilson           (ltcgcw@us.ibm.com)
12  *
13  * This file is released under the GPL.
14  */
15
16 #include <linux/capability.h>
17 #include <linux/init.h>
18 #include <linux/pagemap.h>
19 #include <linux/file.h>
20 #include <linux/mount.h>
21 #include <linux/namei.h>
22 #include <linux/sysctl.h>
23 #include <linux/poll.h>
24 #include <linux/mqueue.h>
25 #include <linux/msg.h>
26 #include <linux/skbuff.h>
27 #include <linux/netlink.h>
28 #include <linux/syscalls.h>
29 #include <linux/audit.h>
30 #include <linux/signal.h>
31 #include <linux/mutex.h>
32 #include <linux/nsproxy.h>
33 #include <linux/pid.h>
34 #include <linux/ipc_namespace.h>
35 #include <linux/slab.h>
36
37 #include <net/sock.h>
38 #include "util.h"
39
/* Superblock magic number and the nominal sizes reported through i_size. */
#define MQUEUE_MAGIC	0x19800202
#define DIRENT_SIZE	20
#define FILENT_SIZE	80

/* Index into mqueue_inode_info.e_wait_q[]: blocked senders / receivers. */
#define SEND		0
#define RECV		1

/* Hand-over states stored in ext_wait_queue.state. */
#define STATE_NONE	0
#define STATE_PENDING	1
#define STATE_READY	2
50
51 struct ext_wait_queue {         /* queue of sleeping tasks */
52         struct task_struct *task;
53         struct list_head list;
54         struct msg_msg *msg;    /* ptr of loaded message */
55         int state;              /* one of STATE_* values */
56 };
57
58 struct mqueue_inode_info {
59         spinlock_t lock;
60         struct inode vfs_inode;
61         wait_queue_head_t wait_q;
62
63         struct msg_msg **messages;
64         struct mq_attr attr;
65
66         struct sigevent notify;
67         struct pid* notify_owner;
68         struct user_struct *user;       /* user who created, for accounting */
69         struct sock *notify_sock;
70         struct sk_buff *notify_cookie;
71
72         /* for tasks waiting for free space and messages, respectively */
73         struct ext_wait_queue e_wait_q[2];
74
75         unsigned long qsize; /* size of queue in memory (sum of all msgs) */
76 };
77
78 static const struct inode_operations mqueue_dir_inode_operations;
79 static const struct file_operations mqueue_file_operations;
80 static const struct super_operations mqueue_super_ops;
81 static void remove_notification(struct mqueue_inode_info *info);
82
83 static struct kmem_cache *mqueue_inode_cachep;
84
85 static struct ctl_table_header * mq_sysctl_table;
86
87 static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode)
88 {
89         return container_of(inode, struct mqueue_inode_info, vfs_inode);
90 }
91
92 /*
93  * This routine should be called with the mq_lock held.
94  */
95 static inline struct ipc_namespace *__get_ns_from_inode(struct inode *inode)
96 {
97         return get_ipc_ns(inode->i_sb->s_fs_info);
98 }
99
100 static struct ipc_namespace *get_ns_from_inode(struct inode *inode)
101 {
102         struct ipc_namespace *ns;
103
104         spin_lock(&mq_lock);
105         ns = __get_ns_from_inode(inode);
106         spin_unlock(&mq_lock);
107         return ns;
108 }
109
110 static struct inode *mqueue_get_inode(struct super_block *sb,
111                 struct ipc_namespace *ipc_ns, int mode,
112                 struct mq_attr *attr)
113 {
114         struct user_struct *u = current_user();
115         struct inode *inode;
116
117         inode = new_inode(sb);
118         if (inode) {
119                 inode->i_ino = get_next_ino();
120                 inode->i_mode = mode;
121                 inode->i_uid = current_fsuid();
122                 inode->i_gid = current_fsgid();
123                 inode->i_mtime = inode->i_ctime = inode->i_atime =
124                                 CURRENT_TIME;
125
126                 if (S_ISREG(mode)) {
127                         struct mqueue_inode_info *info;
128                         struct task_struct *p = current;
129                         unsigned long mq_bytes, mq_msg_tblsz;
130
131                         inode->i_fop = &mqueue_file_operations;
132                         inode->i_size = FILENT_SIZE;
133                         /* mqueue specific info */
134                         info = MQUEUE_I(inode);
135                         spin_lock_init(&info->lock);
136                         init_waitqueue_head(&info->wait_q);
137                         INIT_LIST_HEAD(&info->e_wait_q[0].list);
138                         INIT_LIST_HEAD(&info->e_wait_q[1].list);
139                         info->notify_owner = NULL;
140                         info->qsize = 0;
141                         info->user = NULL;      /* set when all is ok */
142                         memset(&info->attr, 0, sizeof(info->attr));
143                         info->attr.mq_maxmsg = ipc_ns->mq_msg_max;
144                         info->attr.mq_msgsize = ipc_ns->mq_msgsize_max;
145                         if (attr) {
146                                 info->attr.mq_maxmsg = attr->mq_maxmsg;
147                                 info->attr.mq_msgsize = attr->mq_msgsize;
148                         }
149                         mq_msg_tblsz = info->attr.mq_maxmsg * sizeof(struct msg_msg *);
150                         info->messages = kmalloc(mq_msg_tblsz, GFP_KERNEL);
151                         if (!info->messages)
152                                 goto out_inode;
153
154                         mq_bytes = (mq_msg_tblsz +
155                                 (info->attr.mq_maxmsg * info->attr.mq_msgsize));
156
157                         spin_lock(&mq_lock);
158                         if (u->mq_bytes + mq_bytes < u->mq_bytes ||
159                             u->mq_bytes + mq_bytes >
160                             task_rlimit(p, RLIMIT_MSGQUEUE)) {
161                                 spin_unlock(&mq_lock);
162                                 /* mqueue_evict_inode() releases info->messages */
163                                 goto out_inode;
164                         }
165                         u->mq_bytes += mq_bytes;
166                         spin_unlock(&mq_lock);
167
168                         /* all is ok */
169                         info->user = get_uid(u);
170                 } else if (S_ISDIR(mode)) {
171                         inc_nlink(inode);
172                         /* Some things misbehave if size == 0 on a directory */
173                         inode->i_size = 2 * DIRENT_SIZE;
174                         inode->i_op = &mqueue_dir_inode_operations;
175                         inode->i_fop = &simple_dir_operations;
176                 }
177         }
178         return inode;
179 out_inode:
180         iput(inode);
181         return NULL;
182 }
183
184 static int mqueue_fill_super(struct super_block *sb, void *data, int silent)
185 {
186         struct inode *inode;
187         struct ipc_namespace *ns = data;
188         int error;
189
190         sb->s_blocksize = PAGE_CACHE_SIZE;
191         sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
192         sb->s_magic = MQUEUE_MAGIC;
193         sb->s_op = &mqueue_super_ops;
194
195         inode = mqueue_get_inode(sb, ns, S_IFDIR | S_ISVTX | S_IRWXUGO,
196                                 NULL);
197         if (!inode) {
198                 error = -ENOMEM;
199                 goto out;
200         }
201
202         sb->s_root = d_alloc_root(inode);
203         if (!sb->s_root) {
204                 iput(inode);
205                 error = -ENOMEM;
206                 goto out;
207         }
208         error = 0;
209
210 out:
211         return error;
212 }
213
214 static struct dentry *mqueue_mount(struct file_system_type *fs_type,
215                          int flags, const char *dev_name,
216                          void *data)
217 {
218         if (!(flags & MS_KERNMOUNT))
219                 data = current->nsproxy->ipc_ns;
220         return mount_ns(fs_type, flags, data, mqueue_fill_super);
221 }
222
223 static void init_once(void *foo)
224 {
225         struct mqueue_inode_info *p = (struct mqueue_inode_info *) foo;
226
227         inode_init_once(&p->vfs_inode);
228 }
229
230 static struct inode *mqueue_alloc_inode(struct super_block *sb)
231 {
232         struct mqueue_inode_info *ei;
233
234         ei = kmem_cache_alloc(mqueue_inode_cachep, GFP_KERNEL);
235         if (!ei)
236                 return NULL;
237         return &ei->vfs_inode;
238 }
239
240 static void mqueue_i_callback(struct rcu_head *head)
241 {
242         struct inode *inode = container_of(head, struct inode, i_rcu);
243         INIT_LIST_HEAD(&inode->i_dentry);
244         kmem_cache_free(mqueue_inode_cachep, MQUEUE_I(inode));
245 }
246
247 static void mqueue_destroy_inode(struct inode *inode)
248 {
249         call_rcu(&inode->i_rcu, mqueue_i_callback);
250 }
251
252 static void mqueue_evict_inode(struct inode *inode)
253 {
254         struct mqueue_inode_info *info;
255         struct user_struct *user;
256         unsigned long mq_bytes;
257         int i;
258         struct ipc_namespace *ipc_ns;
259
260         end_writeback(inode);
261
262         if (S_ISDIR(inode->i_mode))
263                 return;
264
265         ipc_ns = get_ns_from_inode(inode);
266         info = MQUEUE_I(inode);
267         spin_lock(&info->lock);
268         for (i = 0; i < info->attr.mq_curmsgs; i++)
269                 free_msg(info->messages[i]);
270         kfree(info->messages);
271         spin_unlock(&info->lock);
272
273         /* Total amount of bytes accounted for the mqueue */
274         mq_bytes = info->attr.mq_maxmsg * (sizeof(struct msg_msg *)
275             + info->attr.mq_msgsize);
276         user = info->user;
277         if (user) {
278                 spin_lock(&mq_lock);
279                 user->mq_bytes -= mq_bytes;
280                 /*
281                  * get_ns_from_inode() ensures that the
282                  * (ipc_ns = sb->s_fs_info) is either a valid ipc_ns
283                  * to which we now hold a reference, or it is NULL.
284                  * We can't put it here under mq_lock, though.
285                  */
286                 if (ipc_ns)
287                         ipc_ns->mq_queues_count--;
288                 spin_unlock(&mq_lock);
289                 free_uid(user);
290         }
291         if (ipc_ns)
292                 put_ipc_ns(ipc_ns);
293 }
294
295 static int mqueue_create(struct inode *dir, struct dentry *dentry,
296                                 int mode, struct nameidata *nd)
297 {
298         struct inode *inode;
299         struct mq_attr *attr = dentry->d_fsdata;
300         int error;
301         struct ipc_namespace *ipc_ns;
302
303         spin_lock(&mq_lock);
304         ipc_ns = __get_ns_from_inode(dir);
305         if (!ipc_ns) {
306                 error = -EACCES;
307                 goto out_unlock;
308         }
309         if (ipc_ns->mq_queues_count >= ipc_ns->mq_queues_max &&
310                         !capable(CAP_SYS_RESOURCE)) {
311                 error = -ENOSPC;
312                 goto out_unlock;
313         }
314         ipc_ns->mq_queues_count++;
315         spin_unlock(&mq_lock);
316
317         inode = mqueue_get_inode(dir->i_sb, ipc_ns, mode, attr);
318         if (!inode) {
319                 error = -ENOMEM;
320                 spin_lock(&mq_lock);
321                 ipc_ns->mq_queues_count--;
322                 goto out_unlock;
323         }
324
325         put_ipc_ns(ipc_ns);
326         dir->i_size += DIRENT_SIZE;
327         dir->i_ctime = dir->i_mtime = dir->i_atime = CURRENT_TIME;
328
329         d_instantiate(dentry, inode);
330         dget(dentry);
331         return 0;
332 out_unlock:
333         spin_unlock(&mq_lock);
334         if (ipc_ns)
335                 put_ipc_ns(ipc_ns);
336         return error;
337 }
338
339 static int mqueue_unlink(struct inode *dir, struct dentry *dentry)
340 {
341         struct inode *inode = dentry->d_inode;
342
343         dir->i_ctime = dir->i_mtime = dir->i_atime = CURRENT_TIME;
344         dir->i_size -= DIRENT_SIZE;
345         drop_nlink(inode);
346         dput(dentry);
347         return 0;
348 }
349
350 /*
351 *       This is routine for system read from queue file.
352 *       To avoid mess with doing here some sort of mq_receive we allow
353 *       to read only queue size & notification info (the only values
354 *       that are interesting from user point of view and aren't accessible
355 *       through std routines)
356 */
357 static ssize_t mqueue_read_file(struct file *filp, char __user *u_data,
358                                 size_t count, loff_t *off)
359 {
360         struct mqueue_inode_info *info = MQUEUE_I(filp->f_path.dentry->d_inode);
361         char buffer[FILENT_SIZE];
362         ssize_t ret;
363
364         spin_lock(&info->lock);
365         snprintf(buffer, sizeof(buffer),
366                         "QSIZE:%-10lu NOTIFY:%-5d SIGNO:%-5d NOTIFY_PID:%-6d\n",
367                         info->qsize,
368                         info->notify_owner ? info->notify.sigev_notify : 0,
369                         (info->notify_owner &&
370                          info->notify.sigev_notify == SIGEV_SIGNAL) ?
371                                 info->notify.sigev_signo : 0,
372                         pid_vnr(info->notify_owner));
373         spin_unlock(&info->lock);
374         buffer[sizeof(buffer)-1] = '\0';
375
376         ret = simple_read_from_buffer(u_data, count, off, buffer,
377                                 strlen(buffer));
378         if (ret <= 0)
379                 return ret;
380
381         filp->f_path.dentry->d_inode->i_atime = filp->f_path.dentry->d_inode->i_ctime = CURRENT_TIME;
382         return ret;
383 }
384
385 static int mqueue_flush_file(struct file *filp, fl_owner_t id)
386 {
387         struct mqueue_inode_info *info = MQUEUE_I(filp->f_path.dentry->d_inode);
388
389         spin_lock(&info->lock);
390         if (task_tgid(current) == info->notify_owner)
391                 remove_notification(info);
392
393         spin_unlock(&info->lock);
394         return 0;
395 }
396
397 static unsigned int mqueue_poll_file(struct file *filp, struct poll_table_struct *poll_tab)
398 {
399         struct mqueue_inode_info *info = MQUEUE_I(filp->f_path.dentry->d_inode);
400         int retval = 0;
401
402         poll_wait(filp, &info->wait_q, poll_tab);
403
404         spin_lock(&info->lock);
405         if (info->attr.mq_curmsgs)
406                 retval = POLLIN | POLLRDNORM;
407
408         if (info->attr.mq_curmsgs < info->attr.mq_maxmsg)
409                 retval |= POLLOUT | POLLWRNORM;
410         spin_unlock(&info->lock);
411
412         return retval;
413 }
414
415 /* Adds current to info->e_wait_q[sr] before element with smaller prio */
416 static void wq_add(struct mqueue_inode_info *info, int sr,
417                         struct ext_wait_queue *ewp)
418 {
419         struct ext_wait_queue *walk;
420
421         ewp->task = current;
422
423         list_for_each_entry(walk, &info->e_wait_q[sr].list, list) {
424                 if (walk->task->static_prio <= current->static_prio) {
425                         list_add_tail(&ewp->list, &walk->list);
426                         return;
427                 }
428         }
429         list_add_tail(&ewp->list, &info->e_wait_q[sr].list);
430 }
431
432 /*
433  * Puts current task to sleep. Caller must hold queue lock. After return
434  * lock isn't held.
435  * sr: SEND or RECV
436  */
437 static int wq_sleep(struct mqueue_inode_info *info, int sr,
438                     ktime_t *timeout, struct ext_wait_queue *ewp)
439 {
440         int retval;
441         signed long time;
442
443         wq_add(info, sr, ewp);
444
445         for (;;) {
446                 set_current_state(TASK_INTERRUPTIBLE);
447
448                 spin_unlock(&info->lock);
449                 time = schedule_hrtimeout_range_clock(timeout,
450                     HRTIMER_MODE_ABS, 0, CLOCK_REALTIME);
451
452                 while (ewp->state == STATE_PENDING)
453                         cpu_relax();
454
455                 if (ewp->state == STATE_READY) {
456                         retval = 0;
457                         goto out;
458                 }
459                 spin_lock(&info->lock);
460                 if (ewp->state == STATE_READY) {
461                         retval = 0;
462                         goto out_unlock;
463                 }
464                 if (signal_pending(current)) {
465                         retval = -ERESTARTSYS;
466                         break;
467                 }
468                 if (time == 0) {
469                         retval = -ETIMEDOUT;
470                         break;
471                 }
472         }
473         list_del(&ewp->list);
474 out_unlock:
475         spin_unlock(&info->lock);
476 out:
477         return retval;
478 }
479
480 /*
481  * Returns waiting task that should be serviced first or NULL if none exists
482  */
483 static struct ext_wait_queue *wq_get_first_waiter(
484                 struct mqueue_inode_info *info, int sr)
485 {
486         struct list_head *ptr;
487
488         ptr = info->e_wait_q[sr].list.prev;
489         if (ptr == &info->e_wait_q[sr].list)
490                 return NULL;
491         return list_entry(ptr, struct ext_wait_queue, list);
492 }
493
494 /* Auxiliary functions to manipulate messages' list */
495 static void msg_insert(struct msg_msg *ptr, struct mqueue_inode_info *info)
496 {
497         int k;
498
499         k = info->attr.mq_curmsgs - 1;
500         while (k >= 0 && info->messages[k]->m_type >= ptr->m_type) {
501                 info->messages[k + 1] = info->messages[k];
502                 k--;
503         }
504         info->attr.mq_curmsgs++;
505         info->qsize += ptr->m_ts;
506         info->messages[k + 1] = ptr;
507 }
508
509 static inline struct msg_msg *msg_get(struct mqueue_inode_info *info)
510 {
511         info->qsize -= info->messages[--info->attr.mq_curmsgs]->m_ts;
512         return info->messages[info->attr.mq_curmsgs];
513 }
514
515 static inline void set_cookie(struct sk_buff *skb, char code)
516 {
517         ((char*)skb->data)[NOTIFY_COOKIE_LEN-1] = code;
518 }
519
520 /*
521  * The next function is only to split too long sys_mq_timedsend
522  */
523 static void __do_notify(struct mqueue_inode_info *info)
524 {
525         /* notification
526          * invoked when there is registered process and there isn't process
527          * waiting synchronously for message AND state of queue changed from
528          * empty to not empty. Here we are sure that no one is waiting
529          * synchronously. */
530         if (info->notify_owner &&
531             info->attr.mq_curmsgs == 1) {
532                 struct siginfo sig_i;
533                 switch (info->notify.sigev_notify) {
534                 case SIGEV_NONE:
535                         break;
536                 case SIGEV_SIGNAL:
537                         /* sends signal */
538
539                         sig_i.si_signo = info->notify.sigev_signo;
540                         sig_i.si_errno = 0;
541                         sig_i.si_code = SI_MESGQ;
542                         sig_i.si_value = info->notify.sigev_value;
543                         sig_i.si_pid = task_tgid_nr_ns(current,
544                                                 ns_of_pid(info->notify_owner));
545                         sig_i.si_uid = current_uid();
546
547                         kill_pid_info(info->notify.sigev_signo,
548                                       &sig_i, info->notify_owner);
549                         break;
550                 case SIGEV_THREAD:
551                         set_cookie(info->notify_cookie, NOTIFY_WOKENUP);
552                         netlink_sendskb(info->notify_sock, info->notify_cookie);
553                         break;
554                 }
555                 /* after notification unregisters process */
556                 put_pid(info->notify_owner);
557                 info->notify_owner = NULL;
558         }
559         wake_up(&info->wait_q);
560 }
561
562 static int prepare_timeout(const struct timespec __user *u_abs_timeout,
563                            ktime_t *expires, struct timespec *ts)
564 {
565         if (copy_from_user(ts, u_abs_timeout, sizeof(struct timespec)))
566                 return -EFAULT;
567         if (!timespec_valid(ts))
568                 return -EINVAL;
569
570         *expires = timespec_to_ktime(*ts);
571         return 0;
572 }
573
574 static void remove_notification(struct mqueue_inode_info *info)
575 {
576         if (info->notify_owner != NULL &&
577             info->notify.sigev_notify == SIGEV_THREAD) {
578                 set_cookie(info->notify_cookie, NOTIFY_REMOVED);
579                 netlink_sendskb(info->notify_sock, info->notify_cookie);
580         }
581         put_pid(info->notify_owner);
582         info->notify_owner = NULL;
583 }
584
585 static int mq_attr_ok(struct ipc_namespace *ipc_ns, struct mq_attr *attr)
586 {
587         if (attr->mq_maxmsg <= 0 || attr->mq_msgsize <= 0)
588                 return 0;
589         if (capable(CAP_SYS_RESOURCE)) {
590                 if (attr->mq_maxmsg > HARD_MSGMAX)
591                         return 0;
592         } else {
593                 if (attr->mq_maxmsg > ipc_ns->mq_msg_max ||
594                                 attr->mq_msgsize > ipc_ns->mq_msgsize_max)
595                         return 0;
596         }
597         /* check for overflow */
598         if (attr->mq_msgsize > ULONG_MAX/attr->mq_maxmsg)
599                 return 0;
600         if ((unsigned long)(attr->mq_maxmsg * (attr->mq_msgsize
601             + sizeof (struct msg_msg *))) <
602             (unsigned long)(attr->mq_maxmsg * attr->mq_msgsize))
603                 return 0;
604         return 1;
605 }
606
607 /*
608  * Invoked when creating a new queue via sys_mq_open
609  */
610 static struct file *do_create(struct ipc_namespace *ipc_ns, struct dentry *dir,
611                         struct dentry *dentry, int oflag, mode_t mode,
612                         struct mq_attr *attr)
613 {
614         const struct cred *cred = current_cred();
615         struct file *result;
616         int ret;
617
618         if (attr) {
619                 if (!mq_attr_ok(ipc_ns, attr)) {
620                         ret = -EINVAL;
621                         goto out;
622                 }
623                 /* store for use during create */
624                 dentry->d_fsdata = attr;
625         }
626
627         mode &= ~current_umask();
628         ret = mnt_want_write(ipc_ns->mq_mnt);
629         if (ret)
630                 goto out;
631         ret = vfs_create(dir->d_inode, dentry, mode, NULL);
632         dentry->d_fsdata = NULL;
633         if (ret)
634                 goto out_drop_write;
635
636         result = dentry_open(dentry, ipc_ns->mq_mnt, oflag, cred);
637         /*
638          * dentry_open() took a persistent mnt_want_write(),
639          * so we can now drop this one.
640          */
641         mnt_drop_write(ipc_ns->mq_mnt);
642         return result;
643
644 out_drop_write:
645         mnt_drop_write(ipc_ns->mq_mnt);
646 out:
647         dput(dentry);
648         mntput(ipc_ns->mq_mnt);
649         return ERR_PTR(ret);
650 }
651
652 /* Opens existing queue */
653 static struct file *do_open(struct ipc_namespace *ipc_ns,
654                                 struct dentry *dentry, int oflag)
655 {
656         int ret;
657         const struct cred *cred = current_cred();
658
659         static const int oflag2acc[O_ACCMODE] = { MAY_READ, MAY_WRITE,
660                                                   MAY_READ | MAY_WRITE };
661
662         if ((oflag & O_ACCMODE) == (O_RDWR | O_WRONLY)) {
663                 ret = -EINVAL;
664                 goto err;
665         }
666
667         if (inode_permission(dentry->d_inode, oflag2acc[oflag & O_ACCMODE])) {
668                 ret = -EACCES;
669                 goto err;
670         }
671
672         return dentry_open(dentry, ipc_ns->mq_mnt, oflag, cred);
673
674 err:
675         dput(dentry);
676         mntput(ipc_ns->mq_mnt);
677         return ERR_PTR(ret);
678 }
679
680 SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, mode_t, mode,
681                 struct mq_attr __user *, u_attr)
682 {
683         struct dentry *dentry;
684         struct file *filp;
685         char *name;
686         struct mq_attr attr;
687         int fd, error;
688         struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
689
690         if (u_attr && copy_from_user(&attr, u_attr, sizeof(struct mq_attr)))
691                 return -EFAULT;
692
693         audit_mq_open(oflag, mode, u_attr ? &attr : NULL);
694
695         if (IS_ERR(name = getname(u_name)))
696                 return PTR_ERR(name);
697
698         fd = get_unused_fd_flags(O_CLOEXEC);
699         if (fd < 0)
700                 goto out_putname;
701
702         mutex_lock(&ipc_ns->mq_mnt->mnt_root->d_inode->i_mutex);
703         dentry = lookup_one_len(name, ipc_ns->mq_mnt->mnt_root, strlen(name));
704         if (IS_ERR(dentry)) {
705                 error = PTR_ERR(dentry);
706                 goto out_putfd;
707         }
708         mntget(ipc_ns->mq_mnt);
709
710         if (oflag & O_CREAT) {
711                 if (dentry->d_inode) {  /* entry already exists */
712                         audit_inode(name, dentry);
713                         if (oflag & O_EXCL) {
714                                 error = -EEXIST;
715                                 goto out;
716                         }
717                         filp = do_open(ipc_ns, dentry, oflag);
718                 } else {
719                         filp = do_create(ipc_ns, ipc_ns->mq_mnt->mnt_root,
720                                                 dentry, oflag, mode,
721                                                 u_attr ? &attr : NULL);
722                 }
723         } else {
724                 if (!dentry->d_inode) {
725                         error = -ENOENT;
726                         goto out;
727                 }
728                 audit_inode(name, dentry);
729                 filp = do_open(ipc_ns, dentry, oflag);
730         }
731
732         if (IS_ERR(filp)) {
733                 error = PTR_ERR(filp);
734                 goto out_putfd;
735         }
736
737         fd_install(fd, filp);
738         goto out_upsem;
739
740 out:
741         dput(dentry);
742         mntput(ipc_ns->mq_mnt);
743 out_putfd:
744         put_unused_fd(fd);
745         fd = error;
746 out_upsem:
747         mutex_unlock(&ipc_ns->mq_mnt->mnt_root->d_inode->i_mutex);
748 out_putname:
749         putname(name);
750         return fd;
751 }
752
753 SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
754 {
755         int err;
756         char *name;
757         struct dentry *dentry;
758         struct inode *inode = NULL;
759         struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
760
761         name = getname(u_name);
762         if (IS_ERR(name))
763                 return PTR_ERR(name);
764
765         mutex_lock_nested(&ipc_ns->mq_mnt->mnt_root->d_inode->i_mutex,
766                         I_MUTEX_PARENT);
767         dentry = lookup_one_len(name, ipc_ns->mq_mnt->mnt_root, strlen(name));
768         if (IS_ERR(dentry)) {
769                 err = PTR_ERR(dentry);
770                 goto out_unlock;
771         }
772
773         if (!dentry->d_inode) {
774                 err = -ENOENT;
775                 goto out_err;
776         }
777
778         inode = dentry->d_inode;
779         if (inode)
780                 ihold(inode);
781         err = mnt_want_write(ipc_ns->mq_mnt);
782         if (err)
783                 goto out_err;
784         err = vfs_unlink(dentry->d_parent->d_inode, dentry);
785         mnt_drop_write(ipc_ns->mq_mnt);
786 out_err:
787         dput(dentry);
788
789 out_unlock:
790         mutex_unlock(&ipc_ns->mq_mnt->mnt_root->d_inode->i_mutex);
791         putname(name);
792         if (inode)
793                 iput(inode);
794
795         return err;
796 }
797
798 /* Pipelined send and receive functions.
799  *
800  * If a receiver finds no waiting message, then it registers itself in the
801  * list of waiting receivers. A sender checks that list before adding the new
802  * message into the message array. If there is a waiting receiver, then it
803  * bypasses the message array and directly hands the message over to the
804  * receiver.
805  * The receiver accepts the message and returns without grabbing the queue
806  * spinlock. Therefore an intermediate STATE_PENDING state and memory barriers
807  * are necessary. The same algorithm is used for sysv semaphores, see
808  * ipc/sem.c for more details.
809  *
810  * The same algorithm is used for senders.
811  */
812
813 /* pipelined_send() - send a message directly to the task waiting in
814  * sys_mq_timedreceive() (without inserting message into a queue).
815  */
816 static inline void pipelined_send(struct mqueue_inode_info *info,
817                                   struct msg_msg *message,
818                                   struct ext_wait_queue *receiver)
819 {
820         receiver->msg = message;
821         list_del(&receiver->list);
822         receiver->state = STATE_PENDING;
823         wake_up_process(receiver->task);
824         smp_wmb();
825         receiver->state = STATE_READY;
826 }
827
828 /* pipelined_receive() - if there is task waiting in sys_mq_timedsend()
829  * gets its message and put to the queue (we have one free place for sure). */
830 static inline void pipelined_receive(struct mqueue_inode_info *info)
831 {
832         struct ext_wait_queue *sender = wq_get_first_waiter(info, SEND);
833
834         if (!sender) {
835                 /* for poll */
836                 wake_up_interruptible(&info->wait_q);
837                 return;
838         }
839         msg_insert(sender->msg, info);
840         list_del(&sender->list);
841         sender->state = STATE_PENDING;
842         wake_up_process(sender->task);
843         smp_wmb();
844         sender->state = STATE_READY;
845 }
846
847 SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
848                 size_t, msg_len, unsigned int, msg_prio,
849                 const struct timespec __user *, u_abs_timeout)
850 {
851         struct file *filp;
852         struct inode *inode;
853         struct ext_wait_queue wait;
854         struct ext_wait_queue *receiver;
855         struct msg_msg *msg_ptr;
856         struct mqueue_inode_info *info;
857         ktime_t expires, *timeout = NULL;
858         struct timespec ts;
859         int ret;
860
861         if (u_abs_timeout) {
862                 int res = prepare_timeout(u_abs_timeout, &expires, &ts);
863                 if (res)
864                         return res;
865                 timeout = &expires;
866         }
867
868         if (unlikely(msg_prio >= (unsigned long) MQ_PRIO_MAX))
869                 return -EINVAL;
870
871         audit_mq_sendrecv(mqdes, msg_len, msg_prio, timeout ? &ts : NULL);
872
873         filp = fget(mqdes);
874         if (unlikely(!filp)) {
875                 ret = -EBADF;
876                 goto out;
877         }
878
879         inode = filp->f_path.dentry->d_inode;
880         if (unlikely(filp->f_op != &mqueue_file_operations)) {
881                 ret = -EBADF;
882                 goto out_fput;
883         }
884         info = MQUEUE_I(inode);
885         audit_inode(NULL, filp->f_path.dentry);
886
887         if (unlikely(!(filp->f_mode & FMODE_WRITE))) {
888                 ret = -EBADF;
889                 goto out_fput;
890         }
891
892         if (unlikely(msg_len > info->attr.mq_msgsize)) {
893                 ret = -EMSGSIZE;
894                 goto out_fput;
895         }
896
897         /* First try to allocate memory, before doing anything with
898          * existing queues. */
899         msg_ptr = load_msg(u_msg_ptr, msg_len);
900         if (IS_ERR(msg_ptr)) {
901                 ret = PTR_ERR(msg_ptr);
902                 goto out_fput;
903         }
904         msg_ptr->m_ts = msg_len;
905         msg_ptr->m_type = msg_prio;
906
907         spin_lock(&info->lock);
908
909         if (info->attr.mq_curmsgs == info->attr.mq_maxmsg) {
910                 if (filp->f_flags & O_NONBLOCK) {
911                         spin_unlock(&info->lock);
912                         ret = -EAGAIN;
913                 } else {
914                         wait.task = current;
915                         wait.msg = (void *) msg_ptr;
916                         wait.state = STATE_NONE;
917                         ret = wq_sleep(info, SEND, timeout, &wait);
918                 }
919                 if (ret < 0)
920                         free_msg(msg_ptr);
921         } else {
922                 receiver = wq_get_first_waiter(info, RECV);
923                 if (receiver) {
924                         pipelined_send(info, msg_ptr, receiver);
925                 } else {
926                         /* adds message to the queue */
927                         msg_insert(msg_ptr, info);
928                         __do_notify(info);
929                 }
930                 inode->i_atime = inode->i_mtime = inode->i_ctime =
931                                 CURRENT_TIME;
932                 spin_unlock(&info->lock);
933                 ret = 0;
934         }
935 out_fput:
936         fput(filp);
937 out:
938         return ret;
939 }
940
941 SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,
942                 size_t, msg_len, unsigned int __user *, u_msg_prio,
943                 const struct timespec __user *, u_abs_timeout)
944 {
945         ssize_t ret;
946         struct msg_msg *msg_ptr;
947         struct file *filp;
948         struct inode *inode;
949         struct mqueue_inode_info *info;
950         struct ext_wait_queue wait;
951         ktime_t expires, *timeout = NULL;
952         struct timespec ts;
953
954         if (u_abs_timeout) {
955                 int res = prepare_timeout(u_abs_timeout, &expires, &ts);
956                 if (res)
957                         return res;
958                 timeout = &expires;
959         }
960
961         audit_mq_sendrecv(mqdes, msg_len, 0, timeout ? &ts : NULL);
962
963         filp = fget(mqdes);
964         if (unlikely(!filp)) {
965                 ret = -EBADF;
966                 goto out;
967         }
968
969         inode = filp->f_path.dentry->d_inode;
970         if (unlikely(filp->f_op != &mqueue_file_operations)) {
971                 ret = -EBADF;
972                 goto out_fput;
973         }
974         info = MQUEUE_I(inode);
975         audit_inode(NULL, filp->f_path.dentry);
976
977         if (unlikely(!(filp->f_mode & FMODE_READ))) {
978                 ret = -EBADF;
979                 goto out_fput;
980         }
981
982         /* checks if buffer is big enough */
983         if (unlikely(msg_len < info->attr.mq_msgsize)) {
984                 ret = -EMSGSIZE;
985                 goto out_fput;
986         }
987
988         spin_lock(&info->lock);
989         if (info->attr.mq_curmsgs == 0) {
990                 if (filp->f_flags & O_NONBLOCK) {
991                         spin_unlock(&info->lock);
992                         ret = -EAGAIN;
993                 } else {
994                         wait.task = current;
995                         wait.state = STATE_NONE;
996                         ret = wq_sleep(info, RECV, timeout, &wait);
997                         msg_ptr = wait.msg;
998                 }
999         } else {
1000                 msg_ptr = msg_get(info);
1001
1002                 inode->i_atime = inode->i_mtime = inode->i_ctime =
1003                                 CURRENT_TIME;
1004
1005                 /* There is now free space in queue. */
1006                 pipelined_receive(info);
1007                 spin_unlock(&info->lock);
1008                 ret = 0;
1009         }
1010         if (ret == 0) {
1011                 ret = msg_ptr->m_ts;
1012
1013                 if ((u_msg_prio && put_user(msg_ptr->m_type, u_msg_prio)) ||
1014                         store_msg(u_msg_ptr, msg_ptr, msg_ptr->m_ts)) {
1015                         ret = -EFAULT;
1016                 }
1017                 free_msg(msg_ptr);
1018         }
1019 out_fput:
1020         fput(filp);
1021 out:
1022         return ret;
1023 }
1024
1025 /*
1026  * Notes: the case when user wants us to deregister (with NULL as pointer)
1027  * and he isn't currently owner of notification, will be silently discarded.
1028  * It isn't explicitly defined in the POSIX.
1029  */
1030 SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes,
1031                 const struct sigevent __user *, u_notification)
1032 {
1033         int ret;
1034         struct file *filp;
1035         struct sock *sock;
1036         struct inode *inode;
1037         struct sigevent notification;
1038         struct mqueue_inode_info *info;
1039         struct sk_buff *nc;
1040
1041         if (u_notification) {
1042                 if (copy_from_user(&notification, u_notification,
1043                                         sizeof(struct sigevent)))
1044                         return -EFAULT;
1045         }
1046
1047         audit_mq_notify(mqdes, u_notification ? &notification : NULL);
1048
1049         nc = NULL;
1050         sock = NULL;
1051         if (u_notification != NULL) {
1052                 if (unlikely(notification.sigev_notify != SIGEV_NONE &&
1053                              notification.sigev_notify != SIGEV_SIGNAL &&
1054                              notification.sigev_notify != SIGEV_THREAD))
1055                         return -EINVAL;
1056                 if (notification.sigev_notify == SIGEV_SIGNAL &&
1057                         !valid_signal(notification.sigev_signo)) {
1058                         return -EINVAL;
1059                 }
1060                 if (notification.sigev_notify == SIGEV_THREAD) {
1061                         long timeo;
1062
1063                         /* create the notify skb */
1064                         nc = alloc_skb(NOTIFY_COOKIE_LEN, GFP_KERNEL);
1065                         if (!nc) {
1066                                 ret = -ENOMEM;
1067                                 goto out;
1068                         }
1069                         if (copy_from_user(nc->data,
1070                                         notification.sigev_value.sival_ptr,
1071                                         NOTIFY_COOKIE_LEN)) {
1072                                 ret = -EFAULT;
1073                                 goto out;
1074                         }
1075
1076                         /* TODO: add a header? */
1077                         skb_put(nc, NOTIFY_COOKIE_LEN);
1078                         /* and attach it to the socket */
1079 retry:
1080                         filp = fget(notification.sigev_signo);
1081                         if (!filp) {
1082                                 ret = -EBADF;
1083                                 goto out;
1084                         }
1085                         sock = netlink_getsockbyfilp(filp);
1086                         fput(filp);
1087                         if (IS_ERR(sock)) {
1088                                 ret = PTR_ERR(sock);
1089                                 sock = NULL;
1090                                 goto out;
1091                         }
1092
1093                         timeo = MAX_SCHEDULE_TIMEOUT;
1094                         ret = netlink_attachskb(sock, nc, &timeo, NULL);
1095                         if (ret == 1)
1096                                 goto retry;
1097                         if (ret) {
1098                                 sock = NULL;
1099                                 nc = NULL;
1100                                 goto out;
1101                         }
1102                 }
1103         }
1104
1105         filp = fget(mqdes);
1106         if (!filp) {
1107                 ret = -EBADF;
1108                 goto out;
1109         }
1110
1111         inode = filp->f_path.dentry->d_inode;
1112         if (unlikely(filp->f_op != &mqueue_file_operations)) {
1113                 ret = -EBADF;
1114                 goto out_fput;
1115         }
1116         info = MQUEUE_I(inode);
1117
1118         ret = 0;
1119         spin_lock(&info->lock);
1120         if (u_notification == NULL) {
1121                 if (info->notify_owner == task_tgid(current)) {
1122                         remove_notification(info);
1123                         inode->i_atime = inode->i_ctime = CURRENT_TIME;
1124                 }
1125         } else if (info->notify_owner != NULL) {
1126                 ret = -EBUSY;
1127         } else {
1128                 switch (notification.sigev_notify) {
1129                 case SIGEV_NONE:
1130                         info->notify.sigev_notify = SIGEV_NONE;
1131                         break;
1132                 case SIGEV_THREAD:
1133                         info->notify_sock = sock;
1134                         info->notify_cookie = nc;
1135                         sock = NULL;
1136                         nc = NULL;
1137                         info->notify.sigev_notify = SIGEV_THREAD;
1138                         break;
1139                 case SIGEV_SIGNAL:
1140                         info->notify.sigev_signo = notification.sigev_signo;
1141                         info->notify.sigev_value = notification.sigev_value;
1142                         info->notify.sigev_notify = SIGEV_SIGNAL;
1143                         break;
1144                 }
1145
1146                 info->notify_owner = get_pid(task_tgid(current));
1147                 inode->i_atime = inode->i_ctime = CURRENT_TIME;
1148         }
1149         spin_unlock(&info->lock);
1150 out_fput:
1151         fput(filp);
1152 out:
1153         if (sock) {
1154                 netlink_detachskb(sock, nc);
1155         } else if (nc) {
1156                 dev_kfree_skb(nc);
1157         }
1158         return ret;
1159 }
1160
1161 SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes,
1162                 const struct mq_attr __user *, u_mqstat,
1163                 struct mq_attr __user *, u_omqstat)
1164 {
1165         int ret;
1166         struct mq_attr mqstat, omqstat;
1167         struct file *filp;
1168         struct inode *inode;
1169         struct mqueue_inode_info *info;
1170
1171         if (u_mqstat != NULL) {
1172                 if (copy_from_user(&mqstat, u_mqstat, sizeof(struct mq_attr)))
1173                         return -EFAULT;
1174                 if (mqstat.mq_flags & (~O_NONBLOCK))
1175                         return -EINVAL;
1176         }
1177
1178         filp = fget(mqdes);
1179         if (!filp) {
1180                 ret = -EBADF;
1181                 goto out;
1182         }
1183
1184         inode = filp->f_path.dentry->d_inode;
1185         if (unlikely(filp->f_op != &mqueue_file_operations)) {
1186                 ret = -EBADF;
1187                 goto out_fput;
1188         }
1189         info = MQUEUE_I(inode);
1190
1191         spin_lock(&info->lock);
1192
1193         omqstat = info->attr;
1194         omqstat.mq_flags = filp->f_flags & O_NONBLOCK;
1195         if (u_mqstat) {
1196                 audit_mq_getsetattr(mqdes, &mqstat);
1197                 spin_lock(&filp->f_lock);
1198                 if (mqstat.mq_flags & O_NONBLOCK)
1199                         filp->f_flags |= O_NONBLOCK;
1200                 else
1201                         filp->f_flags &= ~O_NONBLOCK;
1202                 spin_unlock(&filp->f_lock);
1203
1204                 inode->i_atime = inode->i_ctime = CURRENT_TIME;
1205         }
1206
1207         spin_unlock(&info->lock);
1208
1209         ret = 0;
1210         if (u_omqstat != NULL && copy_to_user(u_omqstat, &omqstat,
1211                                                 sizeof(struct mq_attr)))
1212                 ret = -EFAULT;
1213
1214 out_fput:
1215         fput(filp);
1216 out:
1217         return ret;
1218 }
1219
1220 static const struct inode_operations mqueue_dir_inode_operations = {
1221         .lookup = simple_lookup,
1222         .create = mqueue_create,
1223         .unlink = mqueue_unlink,
1224 };
1225
1226 static const struct file_operations mqueue_file_operations = {
1227         .flush = mqueue_flush_file,
1228         .poll = mqueue_poll_file,
1229         .read = mqueue_read_file,
1230         .llseek = default_llseek,
1231 };
1232
1233 static const struct super_operations mqueue_super_ops = {
1234         .alloc_inode = mqueue_alloc_inode,
1235         .destroy_inode = mqueue_destroy_inode,
1236         .evict_inode = mqueue_evict_inode,
1237         .statfs = simple_statfs,
1238 };
1239
1240 static struct file_system_type mqueue_fs_type = {
1241         .name = "mqueue",
1242         .mount = mqueue_mount,
1243         .kill_sb = kill_litter_super,
1244 };
1245
1246 int mq_init_ns(struct ipc_namespace *ns)
1247 {
1248         ns->mq_queues_count  = 0;
1249         ns->mq_queues_max    = DFLT_QUEUESMAX;
1250         ns->mq_msg_max       = DFLT_MSGMAX;
1251         ns->mq_msgsize_max   = DFLT_MSGSIZEMAX;
1252
1253         ns->mq_mnt = kern_mount_data(&mqueue_fs_type, ns);
1254         if (IS_ERR(ns->mq_mnt)) {
1255                 int err = PTR_ERR(ns->mq_mnt);
1256                 ns->mq_mnt = NULL;
1257                 return err;
1258         }
1259         return 0;
1260 }
1261
1262 void mq_clear_sbinfo(struct ipc_namespace *ns)
1263 {
1264         ns->mq_mnt->mnt_sb->s_fs_info = NULL;
1265 }
1266
1267 void mq_put_mnt(struct ipc_namespace *ns)
1268 {
1269         mntput(ns->mq_mnt);
1270 }
1271
1272 static int __init init_mqueue_fs(void)
1273 {
1274         int error;
1275
1276         mqueue_inode_cachep = kmem_cache_create("mqueue_inode_cache",
1277                                 sizeof(struct mqueue_inode_info), 0,
1278                                 SLAB_HWCACHE_ALIGN, init_once);
1279         if (mqueue_inode_cachep == NULL)
1280                 return -ENOMEM;
1281
1282         /* ignore failures - they are not fatal */
1283         mq_sysctl_table = mq_register_sysctl_table();
1284
1285         error = register_filesystem(&mqueue_fs_type);
1286         if (error)
1287                 goto out_sysctl;
1288
1289         spin_lock_init(&mq_lock);
1290
1291         init_ipc_ns.mq_mnt = kern_mount_data(&mqueue_fs_type, &init_ipc_ns);
1292         if (IS_ERR(init_ipc_ns.mq_mnt)) {
1293                 error = PTR_ERR(init_ipc_ns.mq_mnt);
1294                 goto out_filesystem;
1295         }
1296
1297         return 0;
1298
1299 out_filesystem:
1300         unregister_filesystem(&mqueue_fs_type);
1301 out_sysctl:
1302         if (mq_sysctl_table)
1303                 unregister_sysctl_table(mq_sysctl_table);
1304         kmem_cache_destroy(mqueue_inode_cachep);
1305         return error;
1306 }
1307
1308 __initcall(init_mqueue_fs);