gpu: nvgpu: remove temporary gpfifo allocation in submit path
Deepak Nibade [Mon, 26 Oct 2015 13:17:55 +0000 (18:17 +0530)]
In GPU job submit path gk20a_ioctl_channel_submit_gpfifo(),
we currently allocate a temporary gpfifo, copy user space
gpfifo content into this temporary buffer, and then copy
temp buffer content into channel's gpfifo.

Allocation/copy/free of temporary buffer adds additional
overhead

Rewrite this sequence such that gk20a_submit_channel_gpfifo()
can receive either a pre-filled gpfifo or pointer to
user provided args.
And then we can direclty copy the user provided gpfifo
into the channel's gpfifo

Also, if command buffer tracing is enabled, we still need
to copy user provided gpfifo into temporaty buffer for reading
But that should not cause overhead in real world use case

Bug 200141116

Change-Id: I7166c9271da2694059da9853ab8839e98457b941
Signed-off-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-on: http://git-master/r/823386
(cherry picked from commit 3e0702db006c262dd8737a567b8e06f7ff005e2c)
Reviewed-on: http://git-master/r/835816
Reviewed-on: http://git-master/r/838591
GVS: Gerrit_Virtual_Submit
Tested-by: Kiran SJ <ksj@nvidia.com>
Reviewed-by: Bharat Nihalani <bnihalani@nvidia.com>

drivers/gpu/nvgpu/gk20a/cde_gk20a.c
drivers/gpu/nvgpu/gk20a/channel_gk20a.c
drivers/gpu/nvgpu/gk20a/channel_gk20a.h

index f148c65..b3f3d66 100644 (file)
@@ -717,7 +717,7 @@ static int gk20a_cde_execute_buffer(struct gk20a_cde_ctx *cde_ctx,
                return -ENOSYS;
        }
 
-       return gk20a_submit_channel_gpfifo(cde_ctx->ch, gpfifo,
+       return gk20a_submit_channel_gpfifo(cde_ctx->ch, gpfifo, NULL,
                                           num_entries, flags, fence, fence_out);
 }
 
index 3f610de..84727b2 100644 (file)
@@ -1517,14 +1517,42 @@ static void trace_write_pushbuffer(struct channel_gk20a *c,
 
 static void trace_write_pushbuffer_range(struct channel_gk20a *c,
                                         struct nvgpu_gpfifo *g,
+                                        struct nvgpu_submit_gpfifo_args *args,
+                                        int offset,
                                         int count)
 {
-       if (gk20a_debug_trace_cmdbuf) {
-               int i;
-               struct nvgpu_gpfifo *gp = g;
-               for (i = 0; i < count; i++, gp++)
-                       trace_write_pushbuffer(c, gp);
+       u32 size;
+       int i;
+       struct nvgpu_gpfifo *gp;
+       bool gpfifo_allocated = false;
+
+       if (!gk20a_debug_trace_cmdbuf)
+               return;
+
+       if (!g && !args)
+               return;
+
+       if (!g) {
+               size = args->num_entries * sizeof(struct nvgpu_gpfifo);
+               if (size) {
+                       g = nvgpu_alloc(size, false);
+                       if (!g)
+                               return;
+
+                       if (copy_from_user(g,
+                               (void __user *)(uintptr_t)args->gpfifo, size)) {
+                               return;
+                       }
+               }
+               gpfifo_allocated = true;
        }
+
+       gp = g + offset;
+       for (i = 0; i < count; i++, gp++)
+               trace_write_pushbuffer(c, gp);
+
+       if (gpfifo_allocated)
+               nvgpu_free(g);
 }
 
 static int gk20a_channel_add_job(struct channel_gk20a *c,
@@ -1633,6 +1661,7 @@ void gk20a_channel_update(struct channel_gk20a *c, int nr_completed)
 
 int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
                                struct nvgpu_gpfifo *gpfifo,
+                               struct nvgpu_submit_gpfifo_args *args,
                                u32 num_entries,
                                u32 flags,
                                struct nvgpu_fence *fence,
@@ -1664,6 +1693,9 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
                return -ENOMEM;
        }
 
+       if (!gpfifo && !args)
+               return -EINVAL;
+
        if ((flags & (NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT |
                      NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET)) &&
            !fence)
@@ -1808,24 +1840,72 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
        start = c->gpfifo.put;
        end = start + num_entries;
 
-       if (end > c->gpfifo.entry_num) {
-               int length0 = c->gpfifo.entry_num - start;
-               int length1 = num_entries - length0;
+       if (gpfifo) {
+               if (end > c->gpfifo.entry_num) {
+                       int length0 = c->gpfifo.entry_num - start;
+                       int length1 = num_entries - length0;
 
-               memcpy((struct gpfifo *)c->gpfifo.mem.cpu_va + start, gpfifo,
-                      length0 * sizeof(*gpfifo));
+                       memcpy((struct gpfifo *)c->gpfifo.mem.cpu_va + start,
+                               gpfifo,
+                               length0 * sizeof(*gpfifo));
 
-               memcpy((struct gpfifo *)c->gpfifo.mem.cpu_va, gpfifo + length0,
-                      length1 * sizeof(*gpfifo));
+                       memcpy((struct gpfifo *)c->gpfifo.mem.cpu_va,
+                               gpfifo + length0,
+                               length1 * sizeof(*gpfifo));
+
+                       trace_write_pushbuffer_range(c, gpfifo, NULL,
+                                       0, length0);
+                       trace_write_pushbuffer_range(c, gpfifo, NULL,
+                                       length0, length1);
+               } else {
+                       memcpy((struct gpfifo *)c->gpfifo.mem.cpu_va + start,
+                               gpfifo,
+                               num_entries * sizeof(*gpfifo));
 
-               trace_write_pushbuffer_range(c, gpfifo, length0);
-               trace_write_pushbuffer_range(c, gpfifo + length0, length1);
+                       trace_write_pushbuffer_range(c, gpfifo, NULL,
+                                       0, num_entries);
+               }
        } else {
-               memcpy((struct gpfifo *)c->gpfifo.mem.cpu_va + start, gpfifo,
-                      num_entries * sizeof(*gpfifo));
+               struct nvgpu_gpfifo __user *user_gpfifo =
+                       (struct nvgpu_gpfifo __user *)(uintptr_t)args->gpfifo;
+               if (end > c->gpfifo.entry_num) {
+                       int length0 = c->gpfifo.entry_num - start;
+                       int length1 = num_entries - length0;
+
+                       err = copy_from_user((struct gpfifo *)c->gpfifo.mem.cpu_va + start,
+                               user_gpfifo,
+                               length0 * sizeof(*user_gpfifo));
+                       if (err) {
+                               mutex_unlock(&c->submit_lock);
+                               goto clean_up;
+                       }
+
+                       err = copy_from_user((struct gpfifo *)c->gpfifo.mem.cpu_va,
+                               user_gpfifo + length0,
+                               length1 * sizeof(*user_gpfifo));
+                       if (err) {
+                               mutex_unlock(&c->submit_lock);
+                               goto clean_up;
+                       }
 
-               trace_write_pushbuffer_range(c, gpfifo, num_entries);
+                       trace_write_pushbuffer_range(c, NULL, args,
+                                       0, length0);
+                       trace_write_pushbuffer_range(c, NULL, args,
+                                       length0, length1);
+               } else {
+                       err = copy_from_user((struct gpfifo *)c->gpfifo.mem.cpu_va + start,
+                               user_gpfifo,
+                               num_entries * sizeof(*user_gpfifo));
+                       if (err) {
+                               mutex_unlock(&c->submit_lock);
+                               goto clean_up;
+                       }
+
+                       trace_write_pushbuffer_range(c, NULL, args,
+                                       0, num_entries);
+               }
        }
+
        c->gpfifo.put = (c->gpfifo.put + num_entries) &
                (c->gpfifo.entry_num - 1);
 
@@ -2318,8 +2398,6 @@ static int gk20a_ioctl_channel_submit_gpfifo(
        struct nvgpu_submit_gpfifo_args *args)
 {
        struct gk20a_fence *fence_out;
-       void *gpfifo = NULL;
-       u32 size;
        int ret = 0;
 
        gk20a_dbg_fn("");
@@ -2327,23 +2405,7 @@ static int gk20a_ioctl_channel_submit_gpfifo(
        if (ch->has_timedout)
                return -ETIMEDOUT;
 
-       /* zero-sized submits are allowed, since they can be used for
-        * synchronization; we might still wait and do an increment */
-       size = args->num_entries * sizeof(struct nvgpu_gpfifo);
-       if (size) {
-               gpfifo = nvgpu_alloc(size, false);
-               if (!gpfifo)
-                       return -ENOMEM;
-
-               if (copy_from_user(gpfifo,
-                                       (void __user *)(uintptr_t)args->gpfifo,
-                                       size)) {
-                       ret = -EINVAL;
-                       goto clean_up;
-               }
-       }
-
-       ret = gk20a_submit_channel_gpfifo(ch, gpfifo, args->num_entries,
+       ret = gk20a_submit_channel_gpfifo(ch, NULL, args, args->num_entries,
                                          args->flags, &args->fence,
                                          &fence_out);
 
@@ -2366,7 +2428,6 @@ static int gk20a_ioctl_channel_submit_gpfifo(
        gk20a_fence_put(fence_out);
 
 clean_up:
-       nvgpu_free(gpfifo);
        return ret;
 }
 
index d0e79dc..4932ba7 100644 (file)
@@ -237,6 +237,7 @@ void channel_gk20a_unbind(struct channel_gk20a *ch_gk20a);
 
 int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
                                struct nvgpu_gpfifo *gpfifo,
+                               struct nvgpu_submit_gpfifo_args *args,
                                u32 num_entries,
                                u32 flags,
                                struct nvgpu_fence *fence,