tracepoint: add tracepoints for debugging oom_score_adj
KAMEZAWA Hiroyuki [Tue, 10 Jan 2012 23:08:09 +0000 (15:08 -0800)]
oom_score_adj is used for guarding processes from OOM-Killer.  One of
problem is that it's inherited at fork().  When a daemon set oom_score_adj
and make children, it's hard to know where the value is set.

This patch adds some tracepoints useful for debugging. This patch adds
3 trace points.
  - creating new task
  - renaming a task (exec)
  - set oom_score_adj

To debug, users need to enable some trace pointer. Maybe filtering is useful as

# EVENT=/sys/kernel/debug/tracing/events/task/
# echo "oom_score_adj != 0" > $EVENT/task_newtask/filter
# echo "oom_score_adj != 0" > $EVENT/task_rename/filter
# echo 1 > $EVENT/enable
# EVENT=/sys/kernel/debug/tracing/events/oom/
# echo 1 > $EVENT/enable

output will be like this.
# grep oom /sys/kernel/debug/tracing/trace
bash-7699  [007] d..3  5140.744510: oom_score_adj_update: pid=7699 comm=bash oom_score_adj=-1000
bash-7699  [007] ...1  5151.818022: task_newtask: pid=7729 comm=bash clone_flags=1200011 oom_score_adj=-1000
ls-7729  [003] ...2  5151.818504: task_rename: pid=7729 oldcomm=bash newcomm=ls oom_score_adj=-1000
bash-7699  [002] ...1  5175.701468: task_newtask: pid=7730 comm=bash clone_flags=1200011 oom_score_adj=-1000
grep-7730  [007] ...2  5175.701993: task_rename: pid=7730 oldcomm=bash newcomm=grep oom_score_adj=-1000

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

fs/exec.c
fs/proc/base.c
include/trace/events/oom.h [new file with mode: 0644]
include/trace/events/task.h [new file with mode: 0644]
kernel/fork.c
mm/oom_kill.c

index 3f64b9f..aeb135c 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -59,6 +59,8 @@
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
 #include <asm/tlb.h>
+
+#include <trace/events/task.h>
 #include "internal.h"
 
 int core_uses_pid;
@@ -1054,6 +1056,8 @@ void set_task_comm(struct task_struct *tsk, char *buf)
 {
        task_lock(tsk);
 
+       trace_task_rename(tsk, buf);
+
        /*
         * Threads may access current->comm without holding
         * the task lock, so write the string carefully.
index a1dddda..1aab5fe 100644 (file)
@@ -86,6 +86,7 @@
 #ifdef CONFIG_HARDWALL
 #include <asm/hardwall.h>
 #endif
+#include <trace/events/oom.h>
 #include "internal.h"
 
 /* NOTE:
@@ -1010,6 +1011,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
        else
                task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
                                                                -OOM_DISABLE;
+       trace_oom_score_adj_update(task);
 err_sighand:
        unlock_task_sighand(task, &flags);
 err_task_lock:
@@ -1097,6 +1099,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
        task->signal->oom_score_adj = oom_score_adj;
        if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
                task->signal->oom_score_adj_min = oom_score_adj;
+       trace_oom_score_adj_update(task);
        /*
         * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
         * always attainable.
diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h
new file mode 100644 (file)
index 0000000..dd4ba3b
--- /dev/null
@@ -0,0 +1,33 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM oom
+
+#if !defined(_TRACE_OOM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_OOM_H
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(oom_score_adj_update,
+
+       TP_PROTO(struct task_struct *task),
+
+       TP_ARGS(task),
+
+       TP_STRUCT__entry(
+               __field(        pid_t,  pid)
+               __array(        char,   comm,   TASK_COMM_LEN )
+               __field(         int,   oom_score_adj)
+       ),
+
+       TP_fast_assign(
+               __entry->pid = task->pid;
+               memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
+               __entry->oom_score_adj = task->signal->oom_score_adj;
+       ),
+
+       TP_printk("pid=%d comm=%s oom_score_adj=%d",
+               __entry->pid, __entry->comm, __entry->oom_score_adj)
+);
+
+#endif
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/task.h b/include/trace/events/task.h
new file mode 100644 (file)
index 0000000..b53add0
--- /dev/null
@@ -0,0 +1,61 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM task
+
+#if !defined(_TRACE_TASK_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_TASK_H
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(task_newtask,
+
+       TP_PROTO(struct task_struct *task, unsigned long clone_flags),
+
+       TP_ARGS(task, clone_flags),
+
+       TP_STRUCT__entry(
+               __field(        pid_t,  pid)
+               __array(        char,   comm, TASK_COMM_LEN)
+               __field( unsigned long, clone_flags)
+               __field(        int,    oom_score_adj)
+       ),
+
+       TP_fast_assign(
+               __entry->pid = task->pid;
+               memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
+               __entry->clone_flags = clone_flags;
+               __entry->oom_score_adj = task->signal->oom_score_adj;
+       ),
+
+       TP_printk("pid=%d comm=%s clone_flags=%lx oom_score_adj=%d",
+               __entry->pid, __entry->comm,
+               __entry->clone_flags, __entry->oom_score_adj)
+);
+
+TRACE_EVENT(task_rename,
+
+       TP_PROTO(struct task_struct *task, char *comm),
+
+       TP_ARGS(task, comm),
+
+       TP_STRUCT__entry(
+               __field(        pid_t,  pid)
+               __array(        char, oldcomm,  TASK_COMM_LEN)
+               __array(        char, newcomm,  TASK_COMM_LEN)
+               __field(        int, oom_score_adj)
+       ),
+
+       TP_fast_assign(
+               __entry->pid = task->pid;
+               memcpy(entry->oldcomm, task->comm, TASK_COMM_LEN);
+               memcpy(entry->newcomm, comm, TASK_COMM_LEN);
+               __entry->oom_score_adj = task->signal->oom_score_adj;
+       ),
+
+       TP_printk("pid=%d oldcomm=%s newcomm=%s oom_score_adj=%d",
+               __entry->pid, __entry->oldcomm,
+               __entry->newcomm, __entry->oom_score_adj)
+);
+
+#endif
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
index b00711c..5e1391b 100644 (file)
@@ -76,6 +76,9 @@
 
 #include <trace/events/sched.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/task.h>
+
 /*
  * Protected counters by write_lock_irq(&tasklist_lock)
  */
@@ -1370,6 +1373,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        if (clone_flags & CLONE_THREAD)
                threadgroup_change_end(current);
        perf_event_fork(p);
+
+       trace_task_newtask(p, clone_flags);
+
        return p;
 
 bad_fork_free_pid:
index eeb27e2..7c122fa 100644 (file)
 #include <linux/security.h>
 #include <linux/ptrace.h>
 #include <linux/freezer.h>
+#include <linux/ftrace.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/oom.h>
 
 int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
@@ -55,6 +59,7 @@ void compare_swap_oom_score_adj(int old_val, int new_val)
        spin_lock_irq(&sighand->siglock);
        if (current->signal->oom_score_adj == old_val)
                current->signal->oom_score_adj = new_val;
+       trace_oom_score_adj_update(current);
        spin_unlock_irq(&sighand->siglock);
 }
 
@@ -74,6 +79,7 @@ int test_set_oom_score_adj(int new_val)
        spin_lock_irq(&sighand->siglock);
        old_val = current->signal->oom_score_adj;
        current->signal->oom_score_adj = new_val;
+       trace_oom_score_adj_update(current);
        spin_unlock_irq(&sighand->siglock);
 
        return old_val;