/*
 * This file contains the procedures for the handling of select and poll
 *
 * Created for Linux based loosely upon Mathius Lattner's minix
 * patches by Peter MacDonald. Heavily edited by Linus.
 *
 * COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS
 * flag set in its personality we do *not* modify the given timeout
 * parameter to reflect time remaining.
 *
 * Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation
 * of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian).
 *
 * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
 */

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/personality.h> /* for STICKY_TIMEOUTS */
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/fs.h>
#include <linux/rcupdate.h>
#include <linux/hrtimer.h>
#include <linux/sched/rt.h>
#include <linux/freezer.h>

#include <asm/uaccess.h>

/*
 * Estimate expected accuracy in ns from a timespec.
 *
 * After quite a bit of churning around, we've settled on
 * a simple thing of taking 0.1% of the timeout as the
 * slack, with a cap of 100 msec.
 * "nice" tasks get a 0.5% slack instead.
 *
 * Consider this comment an open invitation to come up with even
 * better solutions..
 */

#define MAX_SLACK	(100 * NSEC_PER_MSEC)
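
/*
 * Illustrative sketch, not part of this file: how the math below plays
 * out for a normal-priority task (divfactor == 1000, i.e. 0.1%).
 */
#if 0
	struct timespec ts = { .tv_sec = 2, .tv_nsec = 500 * NSEC_PER_MSEC };
	long slack = ts.tv_nsec / 1000 + ts.tv_sec * (NSEC_PER_SEC / 1000);
	/* slack == 500000 + 2000000 == 2500000 ns (2.5 ms), below MAX_SLACK */
#endif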

static long __estimate_accuracy(struct timespec *tv)
{
	long slack;
	int divfactor = 1000;

	if (tv->tv_sec < 0)
		return 0;

	if (task_nice(current) > 0)
		divfactor = divfactor / 5;

	if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divfactor))
		return MAX_SLACK;

	slack = tv->tv_nsec / divfactor;
	slack += tv->tv_sec * (NSEC_PER_SEC/divfactor);

	if (slack > MAX_SLACK)
		return MAX_SLACK;

	return slack;
}

long select_estimate_accuracy(struct timespec *tv)
{
	unsigned long ret;
	struct timespec now;

	/*
	 * Realtime tasks get a slack of 0 for obvious reasons.
	 */
	if (rt_task(current))
		return 0;

	ktime_get_ts(&now);
	now = timespec_sub(*tv, now);
	ret = __estimate_accuracy(&now);
	if (ret < current->timer_slack_ns)
		return current->timer_slack_ns;
	return ret;
}

struct poll_table_page {
	struct poll_table_page * next;
	struct poll_table_entry * entry;
	struct poll_table_entry entries[0];
};

#define POLL_TABLE_FULL(table) \
	((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table))
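
/*
 * Worked example, not part of this file (sizes are assumptions): with a
 * 4096-byte page, a 16-byte poll_table_page header and 40-byte entries,
 * one page holds (4096 - 16) / 40 = 102 entries; POLL_TABLE_FULL() turns
 * true once entry+1 would cross the end of the page, and poll_get_entry()
 * then chains a fresh page via ->next.
 */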

/*
 * Ok, Peter made a complicated, but straightforward multiple_wait() function.
 * I have rewritten this, taking some shortcuts: this code may not be easy to
 * follow, but it should be free of race-conditions, and it's practical. If you
 * understand what I'm doing here, then you understand how the linux
 * sleep/wakeup mechanism works.
 *
 * Two very simple procedures, poll_wait() and poll_freewait() do all the
 * work.  poll_wait() is an inline-function defined in <linux/poll.h>,
 * as all select/poll functions have to call it to add an entry to the
 * poll table.
 */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
		       poll_table *p);
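
/*
 * Illustrative sketch, not part of this file: the other side of the
 * contract. A driver's ->poll method calls poll_wait() (which lands in
 * __pollwait() below) and returns the currently-ready mask. The device,
 * its waitqueue and its data_ready flag are hypothetical.
 */
#if 0
static unsigned int mydev_poll(struct file *file, poll_table *wait)
{
	struct mydev *dev = file->private_data;
	unsigned int mask = 0;

	poll_wait(file, &dev->readq, wait);
	if (dev->data_ready)
		mask |= POLLIN | POLLRDNORM;
	return mask;
}
#endif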

void poll_initwait(struct poll_wqueues *pwq)
{
	init_poll_funcptr(&pwq->pt, __pollwait);
	pwq->polling_task = current;
	pwq->triggered = 0;
	pwq->error = 0;
	pwq->table = NULL;
	pwq->inline_index = 0;
}
EXPORT_SYMBOL(poll_initwait);

static void free_poll_entry(struct poll_table_entry *entry)
{
	remove_wait_queue(entry->wait_address, &entry->wait);
	fput(entry->filp);
}

void poll_freewait(struct poll_wqueues *pwq)
{
	struct poll_table_page * p = pwq->table;
	int i;
	for (i = 0; i < pwq->inline_index; i++)
		free_poll_entry(pwq->inline_entries + i);
	while (p) {
		struct poll_table_entry * entry;
		struct poll_table_page *old;

		entry = p->entry;
		do {
			entry--;
			free_poll_entry(entry);
		} while (entry > p->entries);
		old = p;
		p = p->next;
		free_page((unsigned long) old);
	}
}
EXPORT_SYMBOL(poll_freewait);

static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
{
	struct poll_table_page *table = p->table;

	if (p->inline_index < N_INLINE_POLL_ENTRIES)
		return p->inline_entries + p->inline_index++;

	if (!table || POLL_TABLE_FULL(table)) {
		struct poll_table_page *new_table;

		new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
		if (!new_table) {
			p->error = -ENOMEM;
			return NULL;
		}
		new_table->entry = new_table->entries;
		new_table->next = table;
		p->table = new_table;
		table = new_table;
	}

	return table->entry++;
}

static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	struct poll_wqueues *pwq = wait->private;
	DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);

	/*
	 * Although this function is called under waitqueue lock, LOCK
	 * doesn't imply write barrier and the users expect write
	 * barrier semantics on wakeup functions.  The following
	 * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
	 * and is paired with set_mb() in poll_schedule_timeout.
	 */
	smp_wmb();
	pwq->triggered = 1;

	/*
	 * Perform the default wake up operation using a dummy
	 * waitqueue.
	 *
	 * TODO: This is hacky but there currently is no interface to
	 * pass in @sync.  @sync is scheduled to be removed and once
	 * that happens, wake_up_process() can be used directly.
	 */
	return default_wake_function(&dummy_wait, mode, sync, key);
}

int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	struct poll_table_entry *entry;

	entry = container_of(wait, struct poll_table_entry, wait);
	if (key && !((unsigned long)key & entry->key))
		return 0;
	return __pollwake(wait, mode, sync, key);
}
EXPORT_SYMBOL(pollwake);
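
/*
 * Illustrative sketch, not part of this file: the key check above makes
 * wakeups selective. A waiter whose entry->key contains only read bits
 * is not woken by a keyed wakeup signalling write space:
 */
#if 0
	wake_up_interruptible_poll(&wq, POLLOUT);
	/* pollwake() sees ((unsigned long)key & entry->key) == 0 for a
	 * POLLIN-only waiter and returns 0, so its task keeps sleeping. */
#endif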

/* Add a new entry */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
				poll_table *p)
{
	struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
	struct poll_table_entry *entry = poll_get_entry(pwq);
	if (!entry)
		return;
	entry->filp = get_file(filp);
	entry->wait_address = wait_address;
	entry->key = p->_key;
	init_waitqueue_func_entry(&entry->wait, pollwake);
	entry->wait.private = pwq;
	add_wait_queue(wait_address, &entry->wait);
}

int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
			  ktime_t *expires, unsigned long slack)
{
	int rc = -EINTR;

	set_current_state(state);
	if (!pwq->triggered)
		rc = freezable_schedule_hrtimeout_range(expires, slack,
							HRTIMER_MODE_ABS);
	__set_current_state(TASK_RUNNING);

	/*
	 * Prepare for the next iteration.
	 *
	 * The following set_mb() serves two purposes.  First, it's
	 * the counterpart rmb of the wmb in pollwake() such that data
	 * written before wake up is always visible after wake up.
	 * Second, the full barrier guarantees that triggered clearing
	 * doesn't pass event check of the next iteration.  Note that
	 * this problem doesn't exist for the first iteration as
	 * add_wait_queue() has full barrier semantics.
	 */
	set_mb(pwq->triggered, 0);

	return rc;
}
EXPORT_SYMBOL(poll_schedule_timeout);

/**
 * poll_select_set_timeout - helper function to setup the timeout value
 * @to:		pointer to timespec variable for the final timeout
 * @sec:	seconds (from user space)
 * @nsec:	nanoseconds (from user space)
 *
 * Note, we do not use a timespec for the user space value here. That
 * way we can use the function for timeval and compat interfaces as well.
 *
 * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0.
 */
int poll_select_set_timeout(struct timespec *to, long sec, long nsec)
{
	struct timespec ts = {.tv_sec = sec, .tv_nsec = nsec};

	if (!timespec_valid(&ts))
		return -EINVAL;

	/* Optimize for the zero timeout value here */
	if (!sec && !nsec) {
		to->tv_sec = to->tv_nsec = 0;
	} else {
		ktime_get_ts(to);
		*to = timespec_add_safe(*to, ts);
	}
	return 0;
}
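
/*
 * Illustrative sketch, not part of this file: callers hand in a relative
 * timeout and get back an absolute monotonic expiry.
 */
#if 0
	struct timespec end_time;

	/* end_time = now + 5 s; fails only if sec/nsec are not normalized */
	if (poll_select_set_timeout(&end_time, 5, 0))
		return -EINVAL;
#endif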

static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
				      int timeval, int ret)
{
	struct timespec rts;
	struct timeval rtv;

	if (!p)
		return ret;

	if (current->personality & STICKY_TIMEOUTS)
		goto sticky;

	/* No update for zero timeout */
	if (!end_time->tv_sec && !end_time->tv_nsec)
		return ret;

	ktime_get_ts(&rts);
	rts = timespec_sub(*end_time, rts);
	if (rts.tv_sec < 0)
		rts.tv_sec = rts.tv_nsec = 0;

	if (timeval) {
		if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
			memset(&rtv, 0, sizeof(rtv));
		rtv.tv_sec = rts.tv_sec;
		rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;

		if (!copy_to_user(p, &rtv, sizeof(rtv)))
			return ret;

	} else if (!copy_to_user(p, &rts, sizeof(rts)))
		return ret;

	/*
	 * If an application puts its timeval in read-only memory, we
	 * don't want the Linux-specific update to the timeval to
	 * cause a fault after the select has completed
	 * successfully. However, because we're not updating the
	 * timeval, we can't restart the system call.
	 */

sticky:
	if (ret == -ERESTARTNOHAND)
		ret = -EINTR;
	return ret;
}

#define FDS_IN(fds, n)		(fds->in + n)
#define FDS_OUT(fds, n)		(fds->out + n)
#define FDS_EX(fds, n)		(fds->ex + n)

#define BITS(fds, n)	(*FDS_IN(fds, n)|*FDS_OUT(fds, n)|*FDS_EX(fds, n))

static int max_select_fd(unsigned long n, fd_set_bits *fds)
{
	unsigned long *open_fds;
	unsigned long set;
	int max;
	struct fdtable *fdt;

	/* handle last incomplete long-word first */
	set = ~(~0UL << (n & (BITS_PER_LONG-1)));
	n /= BITS_PER_LONG;
	fdt = files_fdtable(current->files);
	open_fds = fdt->open_fds + n;
	max = 0;
	if (set) {
		set &= BITS(fds, n);
		if (set) {
			if (!(set & ~*open_fds))
				goto get_max;
			return -EBADF;
		}
	}
	while (n) {
		open_fds--;
		n--;
		set = BITS(fds, n);
		if (!set)
			continue;
		if (set & ~*open_fds)
			return -EBADF;
		if (max)
			continue;
get_max:
		do {
			max++;
			set >>= 1;
		} while (set);
		max += n * BITS_PER_LONG;
	}

	return max;
}

#define POLLIN_SET (POLLRDNORM | POLLRDBAND | POLLIN | POLLHUP | POLLERR)
#define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
#define POLLEX_SET (POLLPRI)

static inline void wait_key_set(poll_table *wait, unsigned long in,
				unsigned long out, unsigned long bit)
{
	wait->_key = POLLEX_SET;
	if (in & bit)
		wait->_key |= POLLIN_SET;
	if (out & bit)
		wait->_key |= POLLOUT_SET;
}

int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
{
	ktime_t expire, *to = NULL;
	struct poll_wqueues table;
	poll_table *wait;
	int retval, i, timed_out = 0;
	unsigned long slack = 0;

	rcu_read_lock();
	retval = max_select_fd(n, fds);
	rcu_read_unlock();

	if (retval < 0)
		return retval;
	n = retval;

	poll_initwait(&table);
	wait = &table.pt;
	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
		wait->_qproc = NULL;
		timed_out = 1;
	}

	if (end_time && !timed_out)
		slack = select_estimate_accuracy(end_time);

	retval = 0;
	for (;;) {
		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;

		inp = fds->in; outp = fds->out; exp = fds->ex;
		rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

		for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
			unsigned long in, out, ex, all_bits, bit = 1, mask, j;
			unsigned long res_in = 0, res_out = 0, res_ex = 0;

			in = *inp++; out = *outp++; ex = *exp++;
			all_bits = in | out | ex;
			if (all_bits == 0) {
				i += BITS_PER_LONG;
				continue;
			}

			for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) {
				struct fd f;
				if (i >= n)
					break;
				if (!(bit & all_bits))
					continue;
				f = fdget(i);
				if (f.file) {
					const struct file_operations *f_op;
					f_op = f.file->f_op;
					mask = DEFAULT_POLLMASK;
					if (f_op && f_op->poll) {
						wait_key_set(wait, in, out, bit);
						mask = (*f_op->poll)(f.file, wait);
					}
					fdput(f);
					if ((mask & POLLIN_SET) && (in & bit)) {
						res_in |= bit;
						retval++;
						wait->_qproc = NULL;
					}
					if ((mask & POLLOUT_SET) && (out & bit)) {
						res_out |= bit;
						retval++;
						wait->_qproc = NULL;
					}
					if ((mask & POLLEX_SET) && (ex & bit)) {
						res_ex |= bit;
						retval++;
						wait->_qproc = NULL;
					}
				}
			}
			if (res_in)
				*rinp = res_in;
			if (res_out)
				*routp = res_out;
			if (res_ex)
				*rexp = res_ex;
			cond_resched();
		}
		wait->_qproc = NULL;
		if (retval || timed_out || signal_pending(current))
			break;
		if (table.error) {
			retval = table.error;
			break;
		}

		/*
		 * If this is the first loop and we have a timeout
		 * given, then we convert to ktime_t and set the to
		 * pointer to the expiry value.
		 */
		if (end_time && !to) {
			expire = timespec_to_ktime(*end_time);
			to = &expire;
		}

		if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
					   to, slack))
			timed_out = 1;
	}

	poll_freewait(&table);

	return retval;
}

/*
 * We can actually return ERESTARTSYS instead of EINTR, but I'd
 * like to be certain this leads to no problems. So I return
 * EINTR just for safety.
 *
 * Update: ERESTARTSYS breaks at least the xview clock binary, so
 * I'm trying ERESTARTNOHAND which restarts only when you want it to.
 */
int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
			   fd_set __user *exp, struct timespec *end_time)
{
	fd_set_bits fds;
	void *bits;
	int ret, max_fds;
	unsigned int size;
	struct fdtable *fdt;
	/* Allocate small arguments on the stack to save memory and be faster */
	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];

	ret = -EINVAL;
	if (n < 0)
		goto out_nofds;

	/* max_fds can increase, so grab it once to avoid race */
	rcu_read_lock();
	fdt = files_fdtable(current->files);
	max_fds = fdt->max_fds;
	rcu_read_unlock();
	if (n > max_fds)
		n = max_fds;

	/*
	 * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
	 * since we used fdset we need to allocate memory in units of
	 * long-words.
	 */
	size = FDS_BYTES(n);
	bits = stack_fds;
	if (size > sizeof(stack_fds) / 6) {
		/* Not enough space in on-stack array; must use kmalloc */
		ret = -ENOMEM;
		bits = kmalloc(6 * size, GFP_KERNEL);
		if (!bits)
			goto out_nofds;
	}
	fds.in      = bits;
	fds.out     = bits +   size;
	fds.ex      = bits + 2*size;
	fds.res_in  = bits + 3*size;
	fds.res_out = bits + 4*size;
	fds.res_ex  = bits + 5*size;

	if ((ret = get_fd_set(n, inp, fds.in)) ||
	    (ret = get_fd_set(n, outp, fds.out)) ||
	    (ret = get_fd_set(n, exp, fds.ex)))
		goto out;
	zero_fd_set(n, fds.res_in);
	zero_fd_set(n, fds.res_out);
	zero_fd_set(n, fds.res_ex);

	ret = do_select(n, &fds, end_time);

	if (ret < 0)
		goto out;
	if (!ret) {
		ret = -ERESTARTNOHAND;
		if (signal_pending(current))
			goto out;
		ret = 0;
	}

	if (set_fd_set(n, inp, fds.res_in) ||
	    set_fd_set(n, outp, fds.res_out) ||
	    set_fd_set(n, exp, fds.res_ex))
		ret = -EFAULT;

out:
	if (bits != stack_fds)
		kfree(bits);
out_nofds:
	return ret;
}

SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
		fd_set __user *, exp, struct timeval __user *, tvp)
{
	struct timespec end_time, *to = NULL;
	struct timeval tv;
	int ret;

	if (tvp) {
		if (copy_from_user(&tv, tvp, sizeof(tv)))
			return -EFAULT;

		to = &end_time;
		if (poll_select_set_timeout(to,
				tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
				(tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
			return -EINVAL;
	}

	ret = core_sys_select(n, inp, outp, exp, to);
	ret = poll_select_copy_remaining(&end_time, tvp, 1, ret);

	return ret;
}

static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
		       fd_set __user *exp, struct timespec __user *tsp,
		       const sigset_t __user *sigmask, size_t sigsetsize)
{
	sigset_t ksigmask, sigsaved;
	struct timespec ts, end_time, *to = NULL;
	int ret;

	if (tsp) {
		if (copy_from_user(&ts, tsp, sizeof(ts)))
			return -EFAULT;

		to = &end_time;
		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
			return -EINVAL;
	}

	if (sigmask) {
		/* XXX: Don't preclude handling different sized sigset_t's. */
		if (sigsetsize != sizeof(sigset_t))
			return -EINVAL;
		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
			return -EFAULT;

		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
	}

	ret = core_sys_select(n, inp, outp, exp, to);
	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);

	if (ret == -ERESTARTNOHAND) {
		/*
		 * Don't restore the signal mask yet. Let do_signal() deliver
		 * the signal on the way back to userspace, before the signal
		 * mask is restored.
		 */
		if (sigmask) {
			memcpy(&current->saved_sigmask, &sigsaved,
					sizeof(sigsaved));
			set_restore_sigmask();
		}
	} else if (sigmask)
		sigprocmask(SIG_SETMASK, &sigsaved, NULL);

	return ret;
}

/*
 * Most architectures can't handle 7-argument syscalls. So we provide a
 * 6-argument version where the sixth argument is a pointer to a structure
 * which has a pointer to the sigset_t itself followed by a size_t containing
 * the sigset size.
 */
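
/*
 * Illustrative sketch, not part of this file: the userspace layout the
 * sixth argument points at. libc wrappers normally build this; a raw
 * call would look roughly like the following (mask, ts, nfds and the
 * fd sets are hypothetical):
 */
#if 0
	struct {
		const sigset_t *ss;	/* signal mask to apply */
		size_t ss_len;		/* sizeof(sigset_t) */
	} sig = { &mask, sizeof(mask) };

	syscall(__NR_pselect6, nfds, &readfds, NULL, NULL, &ts, &sig);
#endif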

SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp,
		fd_set __user *, exp, struct timespec __user *, tsp,
		void __user *, sig)
{
	size_t sigsetsize = 0;
	sigset_t __user *up = NULL;

	if (sig) {
		if (!access_ok(VERIFY_READ, sig, sizeof(void *)+sizeof(size_t))
		    || __get_user(up, (sigset_t __user * __user *)sig)
		    || __get_user(sigsetsize,
				(size_t __user *)(sig+sizeof(void *))))
			return -EFAULT;
	}

	return do_pselect(n, inp, outp, exp, tsp, up, sigsetsize);
}

#ifdef __ARCH_WANT_SYS_OLD_SELECT
struct sel_arg_struct {
	unsigned long n;
	fd_set __user *inp, *outp, *exp;
	struct timeval __user *tvp;
};

SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg)
{
	struct sel_arg_struct a;

	if (copy_from_user(&a, arg, sizeof(a)))
		return -EFAULT;
	return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
}
#endif

struct poll_list {
	struct poll_list *next;
	int len;
	struct pollfd entries[0];
};

#define POLLFD_PER_PAGE  ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd))
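
/*
 * Worked example, not part of this file (sizes are assumptions): on a
 * 64-bit machine with 4 KiB pages, sizeof(struct poll_list) is 16
 * (8-byte next pointer plus 4-byte len, padded) and sizeof(struct pollfd)
 * is 8, so POLLFD_PER_PAGE = (4096 - 16) / 8 = 510 pollfds per chunk.
 */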

/*
 * Fish for pollable events on the pollfd->fd file descriptor. We're only
 * interested in events matching the pollfd->events mask, and the result
 * matching that mask is both recorded in pollfd->revents and returned. The
 * pwait poll_table will be used by the fd-provided poll handler for waiting,
 * if pwait->_qproc is non-NULL.
 */
static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
{
	unsigned int mask;
	int fd;

	mask = 0;
	fd = pollfd->fd;
	if (fd >= 0) {
		struct fd f = fdget(fd);
		mask = POLLNVAL;
		if (f.file) {
			mask = DEFAULT_POLLMASK;
			if (f.file->f_op && f.file->f_op->poll) {
				pwait->_key = pollfd->events|POLLERR|POLLHUP;
				mask = f.file->f_op->poll(f.file, pwait);
			}
			/* Mask out unneeded events. */
			mask &= pollfd->events | POLLERR | POLLHUP;
			fdput(f);
		}
	}
	pollfd->revents = mask;

	return mask;
}
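
/*
 * Illustrative sketch, not part of this file: the userspace contract
 * do_pollfd() implements. events is what the caller asked for; revents
 * is what comes back (sock_fd and consume_input() are hypothetical):
 */
#if 0
	struct pollfd pfd = { .fd = sock_fd, .events = POLLIN };

	if (poll(&pfd, 1, 1000) > 0 && (pfd.revents & POLLIN))
		consume_input(sock_fd);
	/* POLLERR/POLLHUP may show up in revents even though they were
	 * never requested; POLLNVAL reports a bad file descriptor. */
#endif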

static int do_poll(unsigned int nfds, struct poll_list *list,
		   struct poll_wqueues *wait, struct timespec *end_time)
{
	poll_table* pt = &wait->pt;
	ktime_t expire, *to = NULL;
	int timed_out = 0, count = 0;
	unsigned long slack = 0;

	/* Optimise the no-wait case */
	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
		pt->_qproc = NULL;
		timed_out = 1;
	}

	if (end_time && !timed_out)
		slack = select_estimate_accuracy(end_time);

	for (;;) {
		struct poll_list *walk;

		for (walk = list; walk != NULL; walk = walk->next) {
			struct pollfd * pfd, * pfd_end;

			pfd = walk->entries;
			pfd_end = pfd + walk->len;
			for (; pfd != pfd_end; pfd++) {
				/*
				 * Fish for events. If we found one, record it
				 * and kill poll_table->_qproc, so we don't
				 * needlessly register any other waiters after
				 * this. They'll get immediately deregistered
				 * when we break out and return.
				 */
				if (do_pollfd(pfd, pt)) {
					count++;
					pt->_qproc = NULL;
				}
			}
		}
		/*
		 * All waiters have already been registered, so don't provide
		 * a poll_table->_qproc to them on the next loop iteration.
		 */
		pt->_qproc = NULL;
		if (!count) {
			count = wait->error;
			if (signal_pending(current))
				count = -EINTR;
		}
		if (count || timed_out)
			break;

		/*
		 * If this is the first loop and we have a timeout
		 * given, then we convert to ktime_t and set the to
		 * pointer to the expiry value.
		 */
		if (end_time && !to) {
			expire = timespec_to_ktime(*end_time);
			to = &expire;
		}

		if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
			timed_out = 1;
	}
	return count;
}

#define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list)) / \
			sizeof(struct pollfd))
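
/*
 * Worked example, not part of this file (sizes are assumptions): with
 * POLL_STACK_ALLOC == 256 and the 64-bit sizes from the POLLFD_PER_PAGE
 * example above, N_STACK_PPS = (256 - 16) / 8 = 30 pollfds fit in the
 * on-stack chunk before do_sys_poll() falls back to kmalloc'd pages.
 */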

int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
		struct timespec *end_time)
{
	struct poll_wqueues table;
	int err = -EFAULT, fdcount, len, size;
	/* Allocate small arguments on the stack to save memory and be
	   faster - use long to make sure the buffer is aligned properly
	   on 64 bit archs to avoid unaligned access */
	long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
	struct poll_list *const head = (struct poll_list *)stack_pps;
	struct poll_list *walk = head;
	unsigned long todo = nfds;

	if (nfds > rlimit(RLIMIT_NOFILE))
		return -EINVAL;

	len = min_t(unsigned int, nfds, N_STACK_PPS);
	for (;;) {
		walk->next = NULL;
		walk->len = len;
		if (!len)
			break;

		if (copy_from_user(walk->entries, ufds + nfds-todo,
					sizeof(struct pollfd) * walk->len))
			goto out_fds;

		todo -= walk->len;
		if (!todo)
			break;

		len = min(todo, POLLFD_PER_PAGE);
		size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
		walk = walk->next = kmalloc(size, GFP_KERNEL);
		if (!walk) {
			err = -ENOMEM;
			goto out_fds;
		}
	}

	poll_initwait(&table);
	fdcount = do_poll(nfds, head, &table, end_time);
	poll_freewait(&table);

	for (walk = head; walk; walk = walk->next) {
		struct pollfd *fds = walk->entries;
		int j;

		for (j = 0; j < walk->len; j++, ufds++)
			if (__put_user(fds[j].revents, &ufds->revents))
				goto out_fds;
	}

	err = fdcount;
out_fds:
	walk = head->next;
	while (walk) {
		struct poll_list *pos = walk;
		walk = walk->next;
		kfree(pos);
	}

	return err;
}

static long do_restart_poll(struct restart_block *restart_block)
{
	struct pollfd __user *ufds = restart_block->poll.ufds;
	int nfds = restart_block->poll.nfds;
	struct timespec *to = NULL, end_time;
	int ret;

	if (restart_block->poll.has_timeout) {
		end_time.tv_sec = restart_block->poll.tv_sec;
		end_time.tv_nsec = restart_block->poll.tv_nsec;
		to = &end_time;
	}

	ret = do_sys_poll(ufds, nfds, to);

	if (ret == -EINTR) {
		restart_block->fn = do_restart_poll;
		ret = -ERESTART_RESTARTBLOCK;
	}
	return ret;
}

SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
		int, timeout_msecs)
{
	struct timespec end_time, *to = NULL;
	int ret;

	if (timeout_msecs >= 0) {
		to = &end_time;
		poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
			NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
	}

	ret = do_sys_poll(ufds, nfds, to);

	if (ret == -EINTR) {
		struct restart_block *restart_block;

		restart_block = &current_thread_info()->restart_block;
		restart_block->fn = do_restart_poll;
		restart_block->poll.ufds = ufds;
		restart_block->poll.nfds = nfds;

		if (timeout_msecs >= 0) {
			restart_block->poll.tv_sec = end_time.tv_sec;
			restart_block->poll.tv_nsec = end_time.tv_nsec;
			restart_block->poll.has_timeout = 1;
		} else
			restart_block->poll.has_timeout = 0;

		ret = -ERESTART_RESTARTBLOCK;
	}
	return ret;
}

SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
		struct timespec __user *, tsp, const sigset_t __user *, sigmask,
		size_t, sigsetsize)
{
	sigset_t ksigmask, sigsaved;
	struct timespec ts, end_time, *to = NULL;
	int ret;

	if (tsp) {
		if (copy_from_user(&ts, tsp, sizeof(ts)))
			return -EFAULT;

		to = &end_time;
		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
			return -EINVAL;
	}

	if (sigmask) {
		/* XXX: Don't preclude handling different sized sigset_t's. */
		if (sigsetsize != sizeof(sigset_t))
			return -EINVAL;
		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
			return -EFAULT;

		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
	}

	ret = do_sys_poll(ufds, nfds, to);

	/* We can restart this syscall, usually */
	if (ret == -EINTR) {
		/*
		 * Don't restore the signal mask yet. Let do_signal() deliver
		 * the signal on the way back to userspace, before the signal
		 * mask is restored.
		 */
		if (sigmask) {
			memcpy(&current->saved_sigmask, &sigsaved,
					sizeof(sigsaved));
			set_restore_sigmask();
		}
		ret = -ERESTARTNOHAND;
	} else if (sigmask)
		sigprocmask(SIG_SETMASK, &sigsaved, NULL);

	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);

	return ret;
}