Merge branch 'perfcounters-fixes-for-linus' of git://git.kernel.org/pub/scm/linux...
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index a67a1dc3cfa3858c0ca969d1feea2f5605fb331b..e7f60f8e31edc99a57a57fb4fb0c2353c9093341 100644
@@ -46,12 +46,18 @@ static atomic_t nr_task_counters __read_mostly;
 
 /*
  * perf counter paranoia level:
- *  0 - not paranoid
- *  1 - disallow cpu counters to unpriv
- *  2 - disallow kernel profiling to unpriv
+ *  -1 - not paranoid at all
+ *   0 - disallow raw tracepoint access for unpriv
+ *   1 - disallow cpu counters for unpriv
+ *   2 - disallow kernel profiling for unpriv
  */
 int sysctl_perf_counter_paranoid __read_mostly = 1;
 
+static inline bool perf_paranoid_tracepoint_raw(void)
+{
+       return sysctl_perf_counter_paranoid > -1;
+}
+
 static inline bool perf_paranoid_cpu(void)
 {
        return sysctl_perf_counter_paranoid > 0;
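The new -1 level gives the paranoia sysctl a strictly ordered scale, with each helper below testing the next threshold. As a rough illustration of how the levels map to unprivileged capabilities, a minimal userspace sketch, assuming the sysctl is exposed as /proc/sys/kernel/perf_counter_paranoid (the helper names here are made up for the example):

	/* Sketch: mirror the kernel's paranoia thresholds from userspace. */
	#include <stdio.h>

	static int read_paranoia(void)
	{
		FILE *f = fopen("/proc/sys/kernel/perf_counter_paranoid", "r");
		int level = 2;	/* assume the most restrictive level on failure */

		if (f) {
			if (fscanf(f, "%d", &level) != 1)
				level = 2;
			fclose(f);
		}
		return level;
	}

	/* Without CAP_SYS_ADMIN: raw tracepoint data needs -1, CPU-wide
	 * counters need <= 0, kernel profiling needs <= 1. */
	int may_use_raw_tracepoints(void) { return read_paranoia() < 0; }
	int may_use_cpu_counters(void)    { return read_paranoia() < 1; }
	int may_profile_kernel(void)      { return read_paranoia() < 2; }
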
@@ -469,7 +475,8 @@ static void update_counter_times(struct perf_counter *counter)
        struct perf_counter_context *ctx = counter->ctx;
        u64 run_end;
 
-       if (counter->state < PERF_COUNTER_STATE_INACTIVE)
+       if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
+           counter->group_leader->state < PERF_COUNTER_STATE_INACTIVE)
                return;
 
        counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
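The extra group_leader check here, together with the switch to update_group_times() in the disable paths below, keeps a sibling counter from accruing enabled/running time after its group leader has been scheduled out. As a rough worked example: if the leader is disabled at ctx->time = 500 while a sibling is still formally INACTIVE, the sibling's times are frozen at that point as well; without the group-wide update, a later update_counter_times() on the sibling would still compute total_time_enabled = ctx->time - tstamp_enabled and over-report everything past 500.
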
@@ -518,7 +525,7 @@ static void __perf_counter_disable(void *info)
         */
        if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
                update_context_time(ctx);
-               update_counter_times(counter);
+               update_group_times(counter);
                if (counter == counter->group_leader)
                        group_sched_out(counter, cpuctx, ctx);
                else
@@ -573,7 +580,7 @@ static void perf_counter_disable(struct perf_counter *counter)
         * in, so we can change the state safely.
         */
        if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
-               update_counter_times(counter);
+               update_group_times(counter);
                counter->state = PERF_COUNTER_STATE_OFF;
        }
 
@@ -850,6 +857,27 @@ retry:
        spin_unlock_irq(&ctx->lock);
 }
 
+/*
+ * Put a counter into inactive state and update time fields.
+ * Enabling the leader of a group effectively enables all
+ * the group members that aren't explicitly disabled, so we
+ * have to update their ->tstamp_enabled also.
+ * Note: this works for group members as well as group leaders
+ * since the non-leader members' sibling_lists will be empty.
+ */
+static void __perf_counter_mark_enabled(struct perf_counter *counter,
+                                       struct perf_counter_context *ctx)
+{
+       struct perf_counter *sub;
+
+       counter->state = PERF_COUNTER_STATE_INACTIVE;
+       counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
+       list_for_each_entry(sub, &counter->sibling_list, list_entry)
+               if (sub->state >= PERF_COUNTER_STATE_INACTIVE)
+                       sub->tstamp_enabled =
+                               ctx->time - sub->total_time_enabled;
+}
+
 /*
  * Cross CPU call to enable a performance counter
  */
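The tstamp_enabled arithmetic in the new helper preserves time already accumulated across a disable/enable cycle: with ctx->time at 1000 and total_time_enabled at 300, tstamp_enabled becomes 700, so the next update_counter_times() call, which computes total_time_enabled = ctx->time - tstamp_enabled, resumes from 300 rather than restarting at zero. The sibling loop applies the same correction to every group member that is not individually OFF, since enabling the leader implicitly re-enables them as well.
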
@@ -877,8 +905,7 @@ static void __perf_counter_enable(void *info)
 
        if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
                goto unlock;
-       counter->state = PERF_COUNTER_STATE_INACTIVE;
-       counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
+       __perf_counter_mark_enabled(counter, ctx);
 
        /*
         * If the counter is in a group and isn't the group leader,
@@ -971,11 +998,9 @@ static void perf_counter_enable(struct perf_counter *counter)
         * Since we have the lock this context can't be scheduled
         * in, so we can change the state safely.
         */
-       if (counter->state == PERF_COUNTER_STATE_OFF) {
-               counter->state = PERF_COUNTER_STATE_INACTIVE;
-               counter->tstamp_enabled =
-                       ctx->time - counter->total_time_enabled;
-       }
+       if (counter->state == PERF_COUNTER_STATE_OFF)
+               __perf_counter_mark_enabled(counter, ctx);
+
  out:
        spin_unlock_irq(&ctx->lock);
 }
@@ -1479,9 +1504,7 @@ static void perf_counter_enable_on_exec(struct task_struct *task)
                counter->attr.enable_on_exec = 0;
                if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
                        continue;
-               counter->state = PERF_COUNTER_STATE_INACTIVE;
-               counter->tstamp_enabled =
-                       ctx->time - counter->total_time_enabled;
+               __perf_counter_mark_enabled(counter, ctx);
                enabled = 1;
        }
 
@@ -1675,6 +1698,11 @@ static void free_counter(struct perf_counter *counter)
                        atomic_dec(&nr_task_counters);
        }
 
+       if (counter->output) {
+               fput(counter->output->filp);
+               counter->output = NULL;
+       }
+
        if (counter->destroy)
                counter->destroy(counter);
 
@@ -1960,6 +1988,8 @@ unlock:
        return ret;
 }
 
+int perf_counter_set_output(struct perf_counter *counter, int output_fd);
+
 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
        struct perf_counter *counter = file->private_data;
@@ -1983,6 +2013,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
        case PERF_COUNTER_IOC_PERIOD:
                return perf_counter_period(counter, (u64 __user *)arg);
 
+       case PERF_COUNTER_IOC_SET_OUTPUT:
+               return perf_counter_set_output(counter, arg);
+
        default:
                return -ENOTTY;
        }
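For context, a rough userspace sketch of the new ioctl, assuming a thin perf_counter_open() wrapper around the raw syscall and the usual definitions from <linux/perf_counter.h> (setup and error handling elided):

	/* Sketch: route counter B's samples into counter A's ring buffer. */
	int fd_a = perf_counter_open(&attr_a, pid, cpu, -1, 0);
	int fd_b = perf_counter_open(&attr_b, pid, cpu, -1, 0);

	/* Map A's buffer only; B never gets one of its own. */
	void *buf = mmap(NULL, (1 + 8) * 4096, PROT_READ | PROT_WRITE,
			 MAP_SHARED, fd_a, 0);

	/* Must happen before fd_b is ever mmap()ed, otherwise -EINVAL. */
	if (ioctl(fd_b, PERF_COUNTER_IOC_SET_OUTPUT, fd_a))
		perror("PERF_COUNTER_IOC_SET_OUTPUT");
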
@@ -2253,6 +2286,11 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 
        WARN_ON_ONCE(counter->ctx->parent_ctx);
        mutex_lock(&counter->mmap_mutex);
+       if (counter->output) {
+               ret = -EINVAL;
+               goto unlock;
+       }
+
        if (atomic_inc_not_zero(&counter->mmap_count)) {
                if (nr_pages != counter->data->nr_pages)
                        ret = -EINVAL;
@@ -2638,6 +2676,7 @@ static int perf_output_begin(struct perf_output_handle *handle,
                             struct perf_counter *counter, unsigned int size,
                             int nmi, int sample)
 {
+       struct perf_counter *output_counter;
        struct perf_mmap_data *data;
        unsigned int offset, head;
        int have_lost;
@@ -2647,13 +2686,17 @@ static int perf_output_begin(struct perf_output_handle *handle,
                u64                      lost;
        } lost_event;
 
+       rcu_read_lock();
        /*
         * For inherited counters we send all the output towards the parent.
         */
        if (counter->parent)
                counter = counter->parent;
 
-       rcu_read_lock();
+       output_counter = rcu_dereference(counter->output);
+       if (output_counter)
+               counter = output_counter;
+
        data = rcu_dereference(counter->data);
        if (!data)
                goto out;
@@ -3934,6 +3977,7 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
         * have these.
         */
        if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
+                       perf_paranoid_tracepoint_raw() &&
                        !capable(CAP_SYS_ADMIN))
                return ERR_PTR(-EPERM);
 
@@ -4203,6 +4247,57 @@ err_size:
        goto out;
 }
 
+int perf_counter_set_output(struct perf_counter *counter, int output_fd)
+{
+       struct perf_counter *output_counter = NULL;
+       struct file *output_file = NULL;
+       struct perf_counter *old_output;
+       int fput_needed = 0;
+       int ret = -EINVAL;
+
+       if (!output_fd)
+               goto set;
+
+       output_file = fget_light(output_fd, &fput_needed);
+       if (!output_file)
+               return -EBADF;
+
+       if (output_file->f_op != &perf_fops)
+               goto out;
+
+       output_counter = output_file->private_data;
+
+       /* Don't chain output fds */
+       if (output_counter->output)
+               goto out;
+
+       /* Don't set an output fd when we already have an output channel */
+       if (counter->data)
+               goto out;
+
+       atomic_long_inc(&output_file->f_count);
+
+set:
+       mutex_lock(&counter->mmap_mutex);
+       old_output = counter->output;
+       rcu_assign_pointer(counter->output, output_counter);
+       mutex_unlock(&counter->mmap_mutex);
+
+       if (old_output) {
+               /*
+                * we need to make sure no existing perf_output_*()
+                * is still referencing this counter.
+                */
+               synchronize_rcu();
+               fput(old_output->filp);
+       }
+
+       ret = 0;
+out:
+       fput_light(output_file, fput_needed);
+       return ret;
+}
+
 /**
  * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
  *
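Passing an output fd of 0 takes the early branch to the set: label with output_counter still NULL, so the same ioctl also undoes a redirection; the synchronize_rcu() guarantees that any perf_output_begin() still referencing the old target has finished before its file reference is dropped. Continuing the sketch above:

	/* Clear the redirection; fd_b may be mmap()ed for its own buffer again. */
	ioctl(fd_b, PERF_COUNTER_IOC_SET_OUTPUT, 0);
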
@@ -4222,15 +4317,15 @@ SYSCALL_DEFINE5(perf_counter_open,
        struct file *group_file = NULL;
        int fput_needed = 0;
        int fput_needed2 = 0;
-       int ret;
+       int err;
 
        /* for future expandability... */
-       if (flags)
+       if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
                return -EINVAL;
 
-       ret = perf_copy_attr(attr_uptr, &attr);
-       if (ret)
-               return ret;
+       err = perf_copy_attr(attr_uptr, &attr);
+       if (err)
+               return err;
 
        if (!attr.exclude_kernel) {
                if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
@@ -4253,8 +4348,8 @@ SYSCALL_DEFINE5(perf_counter_open,
         * Look up the group leader (we will attach this counter to it):
         */
        group_leader = NULL;
-       if (group_fd != -1) {
-               ret = -EINVAL;
+       if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
+               err = -EINVAL;
                group_file = fget_light(group_fd, &fput_needed);
                if (!group_file)
                        goto err_put_context;
@@ -4283,18 +4378,24 @@ SYSCALL_DEFINE5(perf_counter_open,
 
        counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
                                     NULL, GFP_KERNEL);
-       ret = PTR_ERR(counter);
+       err = PTR_ERR(counter);
        if (IS_ERR(counter))
                goto err_put_context;
 
-       ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
-       if (ret < 0)
+       err = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
+       if (err < 0)
                goto err_free_put_context;
 
-       counter_file = fget_light(ret, &fput_needed2);
+       counter_file = fget_light(err, &fput_needed2);
        if (!counter_file)
                goto err_free_put_context;
 
+       if (flags & PERF_FLAG_FD_OUTPUT) {
+               err = perf_counter_set_output(counter, group_fd);
+               if (err)
+                       goto err_fput_free_put_context;
+       }
+
        counter->filp = counter_file;
        WARN_ON_ONCE(ctx->parent_ctx);
        mutex_lock(&ctx->mutex);
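The same redirection can now be requested atomically at open time: with PERF_FLAG_FD_OUTPUT set, group_fd doubles as the output target, and adding PERF_FLAG_FD_NO_GROUP makes it serve only that role instead of naming a group leader. A rough sketch under the same assumptions as the earlier example:

	/* Sketch: open fd_b so its samples land in fd_a's buffer,
	 * without making fd_a its group leader. */
	int fd_b = perf_counter_open(&attr_b, pid, cpu, fd_a,
				     PERF_FLAG_FD_OUTPUT | PERF_FLAG_FD_NO_GROUP);
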
@@ -4308,20 +4409,20 @@ SYSCALL_DEFINE5(perf_counter_open,
        list_add_tail(&counter->owner_entry, &current->perf_counter_list);
        mutex_unlock(&current->perf_counter_mutex);
 
+err_fput_free_put_context:
        fput_light(counter_file, fput_needed2);
 
-out_fput:
-       fput_light(group_file, fput_needed);
-
-       return ret;
-
 err_free_put_context:
-       kfree(counter);
+       if (err < 0)
+               kfree(counter);
 
 err_put_context:
-       put_ctx(ctx);
+       if (err < 0)
+               put_ctx(ctx);
+
+       fput_light(group_file, fput_needed);
 
-       goto out_fput;
+       return err;
 }
 
 /*