perf: Fix owner-list vs exit
Peter Zijlstra [Tue, 9 Nov 2010 18:01:43 +0000 (19:01 +0100)]
Oleg noticed that a perf-fd keeping a reference on the creating task
leads to a few funny side effects.

There's two different aspects to this:

  - kernel based perf-events, these should not take out
    a reference on the creating task and appear on the task's
    event list since they're not bound to fds nor visible
    to userspace.

  - fork() and pthread_create(), these can lead to the creating
    task dying (and thus the task's event-list becomming useless)
    but keeping the list and ref alive until the event is closed.

Combined they lead to malfunction of the ptrace hw_tracepoints.

Cure this by not considering kernel based perf_events for the
owner-list and destroying the owner-list when the owner dies.

Reported-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Oleg Nesterov <oleg@redhat.com>
LKML-Reference: <1289576883.2084.286.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

kernel/perf_event.c

index f818d9d..671f6c8 100644 (file)
@@ -2235,11 +2235,6 @@ int perf_event_release_kernel(struct perf_event *event)
        raw_spin_unlock_irq(&ctx->lock);
        mutex_unlock(&ctx->mutex);
 
-       mutex_lock(&event->owner->perf_event_mutex);
-       list_del_init(&event->owner_entry);
-       mutex_unlock(&event->owner->perf_event_mutex);
-       put_task_struct(event->owner);
-
        free_event(event);
 
        return 0;
@@ -2252,9 +2247,43 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel);
 static int perf_release(struct inode *inode, struct file *file)
 {
        struct perf_event *event = file->private_data;
+       struct task_struct *owner;
 
        file->private_data = NULL;
 
+       rcu_read_lock();
+       owner = ACCESS_ONCE(event->owner);
+       /*
+        * Matches the smp_wmb() in perf_event_exit_task(). If we observe
+        * !owner it means the list deletion is complete and we can indeed
+        * free this event, otherwise we need to serialize on
+        * owner->perf_event_mutex.
+        */
+       smp_read_barrier_depends();
+       if (owner) {
+               /*
+                * Since delayed_put_task_struct() also drops the last
+                * task reference we can safely take a new reference
+                * while holding the rcu_read_lock().
+                */
+               get_task_struct(owner);
+       }
+       rcu_read_unlock();
+
+       if (owner) {
+               mutex_lock(&owner->perf_event_mutex);
+               /*
+                * We have to re-check the event->owner field, if it is cleared
+                * we raced with perf_event_exit_task(), acquiring the mutex
+                * ensured they're done, and we can proceed with freeing the
+                * event.
+                */
+               if (event->owner)
+                       list_del_init(&event->owner_entry);
+               mutex_unlock(&owner->perf_event_mutex);
+               put_task_struct(owner);
+       }
+
        return perf_event_release_kernel(event);
 }
 
@@ -5678,7 +5707,7 @@ SYSCALL_DEFINE5(perf_event_open,
        mutex_unlock(&ctx->mutex);
 
        event->owner = current;
-       get_task_struct(current);
+
        mutex_lock(&current->perf_event_mutex);
        list_add_tail(&event->owner_entry, &current->perf_event_list);
        mutex_unlock(&current->perf_event_mutex);
@@ -5746,12 +5775,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
        ++ctx->generation;
        mutex_unlock(&ctx->mutex);
 
-       event->owner = current;
-       get_task_struct(current);
-       mutex_lock(&current->perf_event_mutex);
-       list_add_tail(&event->owner_entry, &current->perf_event_list);
-       mutex_unlock(&current->perf_event_mutex);
-
        return event;
 
 err_free:
@@ -5902,8 +5925,24 @@ again:
  */
 void perf_event_exit_task(struct task_struct *child)
 {
+       struct perf_event *event, *tmp;
        int ctxn;
 
+       mutex_lock(&child->perf_event_mutex);
+       list_for_each_entry_safe(event, tmp, &child->perf_event_list,
+                                owner_entry) {
+               list_del_init(&event->owner_entry);
+
+               /*
+                * Ensure the list deletion is visible before we clear
+                * the owner, closes a race against perf_release() where
+                * we need to serialize on the owner->perf_event_mutex.
+                */
+               smp_wmb();
+               event->owner = NULL;
+       }
+       mutex_unlock(&child->perf_event_mutex);
+
        for_each_task_context_nr(ctxn)
                perf_event_exit_task_context(child, ctxn);
 }