video: tegra: host: Add gk20a driver for Tegra12
Mark Stadler [Thu, 2 Aug 2012 16:52:03 +0000 (09:52 -0700)]
Change-Id: I3f92ff7129a8b6bda9f4645f9360b419fd12334d
Signed-off-by: Mark Stadler <mastadler@nvidia.com>

52 files changed:
drivers/video/tegra/host/gk20a/Makefile [new file with mode: 0644]
drivers/video/tegra/host/gk20a/cdma_gk20a.c [new file with mode: 0644]
drivers/video/tegra/host/gk20a/cdma_gk20a.h [new file with mode: 0644]
drivers/video/tegra/host/gk20a/channel_gk20a.c [new file with mode: 0644]
drivers/video/tegra/host/gk20a/channel_gk20a.h [new file with mode: 0644]
drivers/video/tegra/host/gk20a/clk_gk20a.c [new file with mode: 0644]
drivers/video/tegra/host/gk20a/clk_gk20a.h [new file with mode: 0644]
drivers/video/tegra/host/gk20a/debug_gk20a.c [new file with mode: 0644]
drivers/video/tegra/host/gk20a/debug_gk20a.h [new file with mode: 0644]
drivers/video/tegra/host/gk20a/fifo_gk20a.c [new file with mode: 0644]
drivers/video/tegra/host/gk20a/fifo_gk20a.h [new file with mode: 0644]
drivers/video/tegra/host/gk20a/gk20a.c [new file with mode: 0644]
drivers/video/tegra/host/gk20a/gk20a.h [new file with mode: 0644]
drivers/video/tegra/host/gk20a/gk20a_gating_reglist.h [new file with mode: 0644]
drivers/video/tegra/host/gk20a/gr_ctx_gk20a.c [new file with mode: 0644]
drivers/video/tegra/host/gk20a/gr_ctx_gk20a.h [new file with mode: 0644]
drivers/video/tegra/host/gk20a/gr_ctx_gk20a_sim.c [new file with mode: 0644]
drivers/video/tegra/host/gk20a/gr_gk20a.c [new file with mode: 0644]
drivers/video/tegra/host/gk20a/gr_gk20a.h [new file with mode: 0644]
drivers/video/tegra/host/gk20a/hw_bus_gk20a.h [new file with mode: 0644]
drivers/video/tegra/host/gk20a/hw_ccsr_gk20a.h [new file with mode: 0644]
drivers/video/tegra/host/gk20a/hw_chiplet_pwr_gk20a.h [new file with mode: 0644]
drivers/video/tegra/host/gk20a/hw_ctxsw_prog_gk20a.h [new file with mode: 0644]
drivers/video/tegra/host/gk20a/hw_fb_gk20a.h [new file with mode: 0644]
drivers/video/tegra/host/gk20a/hw_fifo_gk20a.h [new file with mode: 0644]
drivers/video/tegra/host/gk20a/hw_flush_gk20a.h [new file with mode: 0644]
drivers/video/tegra/host/gk20a/hw_gmmu_gk20a.h [new file with mode: 0644]
drivers/video/tegra/host/gk20a/hw_gr_gk20a.h [new file with mode: 0644]
drivers/video/tegra/host/gk20a/hw_ltc_gk20a.h [new file with mode: 0644]
drivers/video/tegra/host/gk20a/hw_mc_gk20a.h [new file with mode: 0644]
drivers/video/tegra/host/gk20a/hw_pbdma_gk20a.h [new file with mode: 0644]
drivers/video/tegra/host/gk20a/hw_pri_ringmaster_gk20a.h [new file with mode: 0644]
drivers/video/tegra/host/gk20a/hw_pri_ringstation_sys_gk20a.h [new file with mode: 0644]
drivers/video/tegra/host/gk20a/hw_proj_gk20a.h [new file with mode: 0644]
drivers/video/tegra/host/gk20a/hw_pwr_gk20a.h [new file with mode: 0644]
drivers/video/tegra/host/gk20a/hw_ram_gk20a.h [new file with mode: 0644]
drivers/video/tegra/host/gk20a/hw_sim_gk20a.h [new file with mode: 0644]
drivers/video/tegra/host/gk20a/hw_therm_gk20a.h [new file with mode: 0644]
drivers/video/tegra/host/gk20a/hw_top_gk20a.h [new file with mode: 0644]
drivers/video/tegra/host/gk20a/hw_trim_gk20a.h [new file with mode: 0644]
drivers/video/tegra/host/gk20a/intr_gk20a.h [new file with mode: 0644]
drivers/video/tegra/host/gk20a/kind_gk20a.c [new file with mode: 0644]
drivers/video/tegra/host/gk20a/kind_gk20a.h [new file with mode: 0644]
drivers/video/tegra/host/gk20a/mm_gk20a.c [new file with mode: 0644]
drivers/video/tegra/host/gk20a/mm_gk20a.h [new file with mode: 0644]
drivers/video/tegra/host/gk20a/pmu_gk20a.c [new file with mode: 0644]
drivers/video/tegra/host/gk20a/pmu_gk20a.h [new file with mode: 0644]
drivers/video/tegra/host/gk20a/priv_ring_gk20a.c [new file with mode: 0644]
drivers/video/tegra/host/gk20a/priv_ring_gk20a.h [new file with mode: 0644]
drivers/video/tegra/host/gk20a/sim_gk20a.h [new file with mode: 0644]
drivers/video/tegra/host/gk20a/therm_gk20a.c [new file with mode: 0644]
drivers/video/tegra/host/gk20a/therm_gk20a.h [new file with mode: 0644]

diff --git a/drivers/video/tegra/host/gk20a/Makefile b/drivers/video/tegra/host/gk20a/Makefile
new file mode 100644 (file)
index 0000000..b28e04b
--- /dev/null
@@ -0,0 +1,25 @@
+
+GCOV_PROFILE := y
+EXTRA_CFLAGS += -Idrivers/video/tegra/host
+
+nvhost-gk20a-objs  = \
+       gk20a.o \
+       fifo_gk20a.o \
+       channel_gk20a.o \
+       cdma_gk20a.o \
+       debug_gk20a.o \
+       gr_gk20a.o \
+       kind_gk20a.o \
+       mm_gk20a.o \
+       pmu_gk20a.o \
+       priv_ring_gk20a.o \
+       clk_gk20a.o \
+       therm_gk20a.o
+
+ifneq ($(CONFIG_TEGRA_GK20A_NETLIST_FIRMWARE),y)
+nvhost-gk20a-objs += gr_ctx_gk20a_sim.o
+else
+nvhost-gk20a-objs += gr_ctx_gk20a.o
+endif
+
+obj-$(CONFIG_TEGRA_GRHOST) += nvhost-gk20a.o
diff --git a/drivers/video/tegra/host/gk20a/cdma_gk20a.c b/drivers/video/tegra/host/gk20a/cdma_gk20a.c
new file mode 100644 (file)
index 0000000..528786c
--- /dev/null
@@ -0,0 +1,106 @@
+/*
+ * drivers/video/tegra/host/gk20a/cdma_gk20a.c
+ *
+ * Tegra Graphics Host Command DMA
+ *
+ * Copyright (c) 2011, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "../nvhost_cdma.h"
+#include "../dev.h"
+
+#include "cdma_gk20a.h"
+
+/*
+ * push_buffer
+ *
+ * The push buffer is a circular array of words to be fetched by command DMA.
+ * Note that it works slightly differently to the sync queue; fence == cur
+ * means that the push buffer is full, not empty.
+ */
+
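+/*
+ * Illustrative sketch only (hypothetical names, not used by these stubs):
+ * one way to express the "fence == cur means full" convention above is to
+ * compute the free space as
+ *
+ *     space_bytes = (fence - cur) & (buffer_size - 1);
+ *
+ * for a power-of-two buffer_size, which reports 0 free bytes exactly when
+ * fence == cur, i.e. when the push buffer is full.
+ */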
+
+/**
+ * Reset to empty push buffer
+ */
+void gk20a_push_buffer_reset(struct push_buffer *pb)
+{
+}
+
+/**
+ * Init push buffer resources
+ */
+int gk20a_push_buffer_init(struct push_buffer *pb)
+{
+       return 0;
+}
+
+/**
+ * Clean up push buffer resources
+ */
+void gk20a_push_buffer_destroy(struct push_buffer *pb)
+{
+}
+
+/**
+ * Push two words to the push buffer
+ * Caller must ensure push buffer is not full
+ */
+void gk20a_push_buffer_push_to(struct push_buffer *pb,
+                              struct mem_mgr *client,
+                              struct mem_handle *handle, u32 op1, u32 op2)
+{
+}
+
+/**
+ * Pop a number of two word slots from the push buffer
+ * Caller must ensure push buffer is not empty
+ */
+void gk20a_push_buffer_pop_from(struct push_buffer *pb, unsigned int slots)
+{
+}
+
+/**
+ * Return the number of two word slots free in the push buffer
+ */
+u32 gk20a_push_buffer_space(struct push_buffer *pb)
+{
+       return 0;
+}
+
+u32 gk20a_push_buffer_putptr(struct push_buffer *pb)
+{
+       return 0;
+}
+
+
+/**
+ * Start channel DMA
+ */
+void gk20a_cdma_start(struct nvhost_cdma *cdma)
+{
+}
+
+/**
+ * Kick channel DMA into action by writing its PUT offset (if it has changed)
+ */
+void gk20a_cdma_kick(struct nvhost_cdma *cdma)
+{
+}
+
+void gk20a_cdma_stop(struct nvhost_cdma *cdma)
+{
+}
diff --git a/drivers/video/tegra/host/gk20a/cdma_gk20a.h b/drivers/video/tegra/host/gk20a/cdma_gk20a.h
new file mode 100644 (file)
index 0000000..c7e9cd6
--- /dev/null
@@ -0,0 +1,65 @@
+/*
+ * drivers/video/tegra/host/gk20a/cdma_gk20a.h
+ *
+ * Tegra Graphics Host Command DMA
+ *
+ * Copyright (c) 2011, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#ifndef __CDMA_GK20A_H__
+#define __CDMA_GK20A_H__
+
+#include "mm_gk20a.h"
+
+#define GK20A_PRIV_CMDBUF_ENTRY_NUM            1024
+#define GK20A_PRIV_CMDBUF_ENTRY_PRE_ALLOC_NUM  (GK20A_PRIV_CMDBUF_ENTRY_NUM / 8)
+
+struct priv_cmd_entry {
+       u32 *ptr;
+       u64 gva;
+       u16 get;        /* start of entry in queue */
+       u16 size;       /* in words */
+       u32 gp_get;     /* gp_get when submitting last priv cmd */
+       u32 gp_put;     /* gp_put when submitting last priv cmd */
+       u32 gp_wrap;    /* wrap when submitting last priv cmd */
+       bool pre_alloc; /* preallocated entry; returned to free list on free */
+       struct list_head list;  /* node for lists */
+};
+
+struct priv_cmd_queue {
+       struct mem_desc mem;
+       u32 *base_ptr;  /* base address */
+       u64 base_gva;   /* gpu_va base */
+       u16 size;       /* queue size in words (one u32 per entry) */
+       u16 put;        /* put for priv cmd queue */
+       u16 get;        /* get for priv cmd queue */
+       struct list_head free;  /* list of pre-allocated free entries */
+       struct list_head head;  /* list of used entries */
+};
+
+void gk20a_push_buffer_reset(struct push_buffer *pb);
+int gk20a_push_buffer_init(struct push_buffer *pb);
+void gk20a_push_buffer_destroy(struct push_buffer *pb);
+void gk20a_push_buffer_push_to(struct push_buffer *pb,
+                              struct mem_mgr *memmgr,
+                              struct mem_handle *handle, u32 op1, u32 op2);
+void gk20a_push_buffer_pop_from(struct push_buffer *pb, unsigned int slots);
+u32 gk20a_push_buffer_space(struct push_buffer *pb);
+u32 gk20a_push_buffer_putptr(struct push_buffer *pb);
+void gk20a_cdma_start(struct nvhost_cdma *cdma);
+void gk20a_cdma_kick(struct nvhost_cdma *cdma);
+void gk20a_cdma_stop(struct nvhost_cdma *cdma);
+
+#endif
diff --git a/drivers/video/tegra/host/gk20a/channel_gk20a.c b/drivers/video/tegra/host/gk20a/channel_gk20a.c
new file mode 100644 (file)
index 0000000..a4c7b40
--- /dev/null
@@ -0,0 +1,1408 @@
+/*
+ * drivers/video/tegra/host/gk20a/channel_gk20a.c
+ *
+ * GK20A Graphics channel
+ *
+ * Copyright (c) 2011, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <linux/delay.h>
+#include <linux/highmem.h> /* needed for nvmap.h */
+#include <trace/events/nvhost.h>
+
+#include "../../nvmap/nvmap.h"
+
+#include "../dev.h"
+#include "../nvhost_as.h"
+
+#include "gk20a.h"
+
+#include "hw_ram_gk20a.h"
+#include "hw_fifo_gk20a.h"
+#include "hw_pbdma_gk20a.h"
+#include "hw_ccsr_gk20a.h"
+#include "chip_support.h"
+
+static struct channel_gk20a *acquire_unused_channel(struct fifo_gk20a *f);
+static void release_used_channel(struct fifo_gk20a *f, struct channel_gk20a *c);
+
+static int alloc_priv_cmdbuf(struct channel_gk20a *c, u32 size,
+                            struct priv_cmd_entry **entry);
+static void free_priv_cmdbuf(struct priv_cmd_queue *q,
+                            struct priv_cmd_entry *e);
+static void recycle_priv_cmdbuf(struct channel_gk20a *c);
+
+static int channel_gk20a_alloc_priv_cmdbuf(struct channel_gk20a *c);
+static void channel_gk20a_free_priv_cmdbuf(struct channel_gk20a *c);
+
+static int channel_gk20a_commit_userd(struct channel_gk20a *c);
+static int channel_gk20a_setup_userd(struct channel_gk20a *c);
+static int channel_gk20a_setup_ramfc(struct channel_gk20a *c,
+                       u64 gpfifo_base, u32 gpfifo_entries);
+
+static void channel_gk20a_bind(struct channel_gk20a *ch_gk20a);
+static void channel_gk20a_unbind(struct channel_gk20a *ch_gk20a);
+
+static int channel_gk20a_alloc_inst(struct gk20a *g,
+                               struct channel_gk20a *ch);
+static void channel_gk20a_free_inst(struct gk20a *g,
+                               struct channel_gk20a *ch);
+
+static int channel_gk20a_update_runlist(struct channel_gk20a *c,
+                                       bool add);
+
+static struct channel_gk20a *acquire_unused_channel(struct fifo_gk20a *f)
+{
+       struct channel_gk20a *ch = NULL;
+       int chid;
+
+       mutex_lock(&f->ch_inuse_mutex);
+       for (chid = 0; chid < f->num_channels; chid++) {
+               if (!f->channel[chid].in_use) {
+                       f->channel[chid].in_use = true;
+                       ch = &f->channel[chid];
+                       break;
+               }
+       }
+       mutex_unlock(&f->ch_inuse_mutex);
+
+       return ch;
+}
+
+static void release_used_channel(struct fifo_gk20a *f, struct channel_gk20a *c)
+{
+       mutex_lock(&f->ch_inuse_mutex);
+       f->channel[c->hw_chid].in_use = false;
+       mutex_unlock(&f->ch_inuse_mutex);
+}
+
+int channel_gk20a_commit_va(struct channel_gk20a *c)
+{
+       u32 addr_lo;
+       u32 addr_hi;
+       void *inst_ptr;
+
+       nvhost_dbg_fn("");
+
+       inst_ptr = mem_op().mmap(c->inst_block.mem.ref);
+       if (IS_ERR(inst_ptr))
+               return -ENOMEM;
+
+       addr_lo = u64_lo32(c->vm->pdes.phys) >> 12;
+       addr_hi = u64_hi32(c->vm->pdes.phys);
+
+       nvhost_dbg_info("pde pa=0x%x addr_lo=0x%x addr_hi=0x%x",
+                  c->vm->pdes.phys, addr_lo, addr_hi);
+
+       mem_wr32(inst_ptr, ram_in_page_dir_base_lo_w(),
+               ram_in_page_dir_base_target_vid_mem_f() |
+               ram_in_page_dir_base_vol_true_f() |
+               ram_in_page_dir_base_lo_f(addr_lo));
+
+       mem_wr32(inst_ptr, ram_in_page_dir_base_hi_w(),
+               ram_in_page_dir_base_hi_f(addr_hi));
+
+       mem_wr32(inst_ptr, ram_in_adr_limit_lo_w(),
+                u64_lo32(c->vm->va_limit) | 0xFFF);
+
+       mem_wr32(inst_ptr, ram_in_adr_limit_hi_w(),
+               ram_in_adr_limit_hi_f(u64_hi32(c->vm->va_limit)));
+
+       mem_op().munmap(c->inst_block.mem.ref, inst_ptr);
+
+       return 0;
+}
+
+static int channel_gk20a_commit_userd(struct channel_gk20a *c)
+{
+       u32 addr_lo;
+       u32 addr_hi;
+       void *inst_ptr;
+
+       nvhost_dbg_fn("");
+
+       inst_ptr = mem_op().mmap(c->inst_block.mem.ref);
+       if (IS_ERR(inst_ptr))
+               return -ENOMEM;
+
+       addr_lo = u64_lo32(c->userd_cpu_pa >> ram_userd_base_shift_v());
+       addr_hi = u64_hi32(c->userd_cpu_pa);
+
+       nvhost_dbg_info("channel %d : set ramfc userd 0x%08x",
+               c->hw_chid, c->userd_cpu_pa);
+
+       mem_wr32(inst_ptr, ram_in_ramfc_w() + ram_fc_userd_w(),
+                pbdma_userd_target_vid_mem_f() |
+                pbdma_userd_addr_f(addr_lo));
+
+       mem_wr32(inst_ptr, ram_in_ramfc_w() + ram_fc_userd_hi_w(),
+                pbdma_userd_target_vid_mem_f() |
+                pbdma_userd_hi_addr_f(addr_hi));
+
+       mem_op().munmap(c->inst_block.mem.ref, inst_ptr);
+
+       return 0;
+}
+
+static int channel_gk20a_setup_ramfc(struct channel_gk20a *c,
+                               u64 gpfifo_base, u32 gpfifo_entries)
+{
+       void *inst_ptr;
+
+       nvhost_dbg_fn("");
+
+       inst_ptr = mem_op().mmap(c->inst_block.mem.ref);
+       if (IS_ERR(inst_ptr))
+               return -ENOMEM;
+
+       memset(inst_ptr, 0, ram_fc_size_val_v());
+
+       mem_wr32(inst_ptr, ram_fc_gp_base_w(),
+               pbdma_gp_base_offset_f(
+               u64_lo32(gpfifo_base >> pbdma_gp_base_rsvd_s())));
+
+       mem_wr32(inst_ptr, ram_fc_gp_base_hi_w(),
+               pbdma_gp_base_hi_offset_f(u64_hi32(gpfifo_base)) |
+               pbdma_gp_base_hi_limit2_f(ilog2(gpfifo_entries)));
+
+       mem_wr32(inst_ptr, ram_fc_signature_w(),
+                pbdma_signature_hw_valid_f() | pbdma_signature_sw_zero_f());
+
+       mem_wr32(inst_ptr, ram_fc_formats_w(),
+               pbdma_formats_gp_fermi0_f() |
+               pbdma_formats_pb_fermi1_f() |
+               pbdma_formats_mp_fermi0_f());
+
+       mem_wr32(inst_ptr, ram_fc_pb_header_w(),
+               pbdma_pb_header_priv_user_f() |
+               pbdma_pb_header_method_zero_f() |
+               pbdma_pb_header_subchannel_zero_f() |
+               pbdma_pb_header_level_main_f() |
+               pbdma_pb_header_first_true_f() |
+               pbdma_pb_header_type_inc_f());
+
+       mem_wr32(inst_ptr, ram_fc_subdevice_w(),
+               pbdma_subdevice_id_f(1) |
+               pbdma_subdevice_status_active_f() |
+               pbdma_subdevice_channel_dma_enable_f());
+
+       mem_wr32(inst_ptr, ram_fc_target_w(), pbdma_target_engine_sw_f());
+
+       mem_wr32(inst_ptr, ram_fc_acquire_w(),
+               pbdma_acquire_retry_man_2_f() |
+               pbdma_acquire_retry_exp_2_f() |
+               pbdma_acquire_timeout_exp_max_f() |
+               pbdma_acquire_timeout_man_max_f() |
+               pbdma_acquire_timeout_en_disable_f());
+
+       mem_wr32(inst_ptr, ram_fc_eng_timeslice_w(),
+               fifo_eng_timeslice_timeout_128_f() |
+               fifo_eng_timeslice_timescale_3_f() |
+               fifo_eng_timeslice_enable_true_f());
+
+       mem_wr32(inst_ptr, ram_fc_pb_timeslice_w(),
+               fifo_pb_timeslice_timeout_16_f() |
+               fifo_pb_timeslice_timescale_0_f() |
+               fifo_pb_timeslice_enable_true_f());
+
+       mem_wr32(inst_ptr, ram_fc_chid_w(), ram_fc_chid_f(c->hw_chid));
+
+       /* TBD: always priv mode? */
+       mem_wr32(inst_ptr, ram_fc_hce_ctrl_w(),
+                pbdma_hce_ctrl_hce_priv_mode_yes_f());
+
+       mem_op().munmap(c->inst_block.mem.ref, inst_ptr);
+
+       return 0;
+}
+
+static int channel_gk20a_setup_userd(struct channel_gk20a *c)
+{
+       BUG_ON(!c->userd_cpu_va);
+
+       nvhost_dbg_fn("");
+
+       mem_wr32(c->userd_cpu_va, ram_userd_put_w(), 0);
+       mem_wr32(c->userd_cpu_va, ram_userd_get_w(), 0);
+       mem_wr32(c->userd_cpu_va, ram_userd_ref_w(), 0);
+       mem_wr32(c->userd_cpu_va, ram_userd_put_hi_w(), 0);
+       mem_wr32(c->userd_cpu_va, ram_userd_ref_threshold_w(), 0);
+       mem_wr32(c->userd_cpu_va, ram_userd_gp_top_level_get_w(), 0);
+       mem_wr32(c->userd_cpu_va, ram_userd_gp_top_level_get_hi_w(), 0);
+       mem_wr32(c->userd_cpu_va, ram_userd_get_hi_w(), 0);
+       mem_wr32(c->userd_cpu_va, ram_userd_gp_get_w(), 0);
+       mem_wr32(c->userd_cpu_va, ram_userd_gp_put_w(), 0);
+
+       return 0;
+}
+
+static void channel_gk20a_bind(struct channel_gk20a *ch_gk20a)
+{
+       struct gk20a *g = get_gk20a(ch_gk20a->ch->dev);
+       struct fifo_gk20a *f = &g->fifo;
+       struct fifo_engine_info_gk20a *engine_info =
+               f->engine_info + ENGINE_GR_GK20A;
+
+       u32 inst_ptr = ch_gk20a->inst_block.cpu_pa >> ram_in_base_shift_v();
+
+       nvhost_dbg_info("bind channel %d inst ptr 0x%08x",
+               ch_gk20a->hw_chid, inst_ptr);
+
+       ch_gk20a->bound = true;
+
+       gk20a_writel(g, ccsr_channel_r(ch_gk20a->hw_chid),
+               (gk20a_readl(g, ccsr_channel_r(ch_gk20a->hw_chid)) &
+                ~ccsr_channel_runlist_f(~0)) |
+                ccsr_channel_runlist_f(engine_info->runlist_id));
+
+       gk20a_writel(g, ccsr_channel_inst_r(ch_gk20a->hw_chid),
+               ccsr_channel_inst_ptr_f(inst_ptr) |
+               ccsr_channel_inst_target_vid_mem_f() |
+               ccsr_channel_inst_bind_true_f());
+
+       gk20a_writel(g, ccsr_channel_r(ch_gk20a->hw_chid),
+               (gk20a_readl(g, ccsr_channel_r(ch_gk20a->hw_chid)) &
+                ~ccsr_channel_enable_set_f(~0)) |
+                ccsr_channel_enable_set_true_f());
+}
+
+static void channel_gk20a_unbind(struct channel_gk20a *ch_gk20a)
+{
+       struct gk20a *g = get_gk20a(ch_gk20a->ch->dev);
+
+       nvhost_dbg_fn("");
+
+       if (ch_gk20a->bound)
+               gk20a_writel(g, ccsr_channel_inst_r(ch_gk20a->hw_chid),
+                       ccsr_channel_inst_ptr_f(0) |
+                       ccsr_channel_inst_bind_false_f());
+
+       ch_gk20a->bound = false;
+}
+
+static int channel_gk20a_alloc_inst(struct gk20a *g,
+                               struct channel_gk20a *ch)
+{
+       struct mem_mgr *memmgr = mem_mgr_from_g(g);
+
+       nvhost_dbg_fn("");
+
+       ch->inst_block.mem.ref =
+               mem_op().alloc(memmgr, ram_in_alloc_size_v(),
+                           DEFAULT_NVMAP_ALLOC_ALIGNMENT,
+                           DEFAULT_NVMAP_ALLOC_FLAGS,
+                           NVMAP_HEAP_CARVEOUT_GENERIC);
+
+       if (IS_ERR(ch->inst_block.mem.ref)) {
+               ch->inst_block.mem.ref = 0;
+               goto clean_up;
+       }
+
+       ch->inst_block.cpu_pa =
+               mem_op().pin(memmgr, ch->inst_block.mem.ref);
+
+       /* IS_ERR throws a warning here (expecting void *) */
+       if (ch->inst_block.cpu_pa == -EINVAL ||
+           ch->inst_block.cpu_pa == -EINTR) {
+               ch->inst_block.cpu_pa = 0;
+               goto clean_up;
+       }
+
+       nvhost_dbg_info("channel %d inst block physical addr: 0x%08x",
+               ch->hw_chid, ch->inst_block.cpu_pa);
+
+       ch->inst_block.mem.size = ram_in_alloc_size_v();
+
+       nvhost_dbg_fn("done");
+       return 0;
+
+clean_up:
+       nvhost_dbg(dbg_fn | dbg_err, "fail");
+       channel_gk20a_free_inst(g, ch);
+       return -ENOMEM;
+}
+
+static void channel_gk20a_free_inst(struct gk20a *g,
+                               struct channel_gk20a *ch)
+{
+       struct mem_mgr *memmgr = mem_mgr_from_g(g);
+
+       mem_op().unpin(memmgr, ch->inst_block.mem.ref);
+       mem_op().put(memmgr, ch->inst_block.mem.ref);
+       memset(&ch->inst_block, 0, sizeof(struct inst_desc));
+}
+
+static int channel_gk20a_update_runlist(struct channel_gk20a *c,
+                                       bool add)
+{
+       struct gk20a *g = c->g;
+       struct fifo_gk20a *f = &g->fifo;
+       struct fifo_engine_info_gk20a *engine_info;
+       struct mem_mgr *memmgr = gk20a_channel_mem_mgr(c);
+       struct fifo_runlist_info_gk20a *runlist = NULL;
+       u32 runlist_id = ~0;
+       u32 *runlist_entry_base = NULL;
+       u32 *runlist_entry = NULL;
+       phys_addr_t runlist_pa;
+       u32 old_buf, new_buf;
+       u32 chid;
+       u32 count = 0;
+       int remain;
+       bool pending;
+       u32 ret = 0;
+
+       engine_info = f->engine_info + ENGINE_GR_GK20A;
+       runlist_id = engine_info->runlist_id;
+       runlist = &f->runlist_info[runlist_id];
+
+       mutex_lock(&runlist->mutex);
+
+       if (add) {
+               if (test_and_set_bit(c->hw_chid,
+                           runlist->active_channels) == 1)
+                       goto done;
+       } else {
+               if (test_and_clear_bit(c->hw_chid,
+                           runlist->active_channels) == 0)
+                       goto done;
+       }
+
+       old_buf = runlist->cur_buffer;
+       new_buf = !runlist->cur_buffer;
+
+       nvhost_dbg_info("runlist_id : %d, switch to new buffer %p",
+               runlist_id, runlist->mem[new_buf].ref);
+
+       runlist_pa = mem_op().pin(memmgr,
+                              runlist->mem[new_buf].ref);
+       if (!runlist_pa) {
+               ret = -ENOMEM;
+               goto clean_up;
+       }
+
+       runlist_entry_base = mem_op().mmap(runlist->mem[new_buf].ref);
+       if (IS_ERR_OR_NULL(runlist_entry_base)) {
+               ret = -ENOMEM;
+               goto clean_up;
+       }
+
+       runlist_entry = runlist_entry_base;
+       for_each_set_bit(chid,
+           runlist->active_channels, f->num_channels) {
+               nvhost_dbg_info("add channel %d to runlist", chid);
+               runlist_entry[0] = chid;
+               runlist_entry[1] = 0;
+               runlist_entry += 2;
+               count++;
+       }
+
+       gk20a_writel(g, fifo_runlist_base_r(),
+               fifo_runlist_base_ptr_f(u64_lo32(runlist_pa >> 12)) |
+               fifo_runlist_base_target_vid_mem_f());
+
+       gk20a_writel(g, fifo_runlist_r(),
+               fifo_runlist_engine_f(runlist_id) |
+               fifo_eng_runlist_length_f(count));
+
+       remain =
+               wait_event_interruptible_timeout(
+                       runlist->runlist_wq,
+                       ((pending =
+                               gk20a_readl(g, fifo_eng_runlist_r(runlist_id)) &
+                               fifo_eng_runlist_pending_true_f()) == 0),
+                       2 * HZ /* 2 sec */);
+
+       if (remain == 0 && pending != 0) {
+               nvhost_err(dev_from_gk20a(g), "runlist update timeout");
+               ret = -ETIMEDOUT;
+               goto clean_up;
+       } else if (remain < 0) {
+               nvhost_err(dev_from_gk20a(g), "runlist update interrupted");
+               ret = -EINTR;
+               goto clean_up;
+       }
+
+       runlist->cur_buffer = new_buf;
+
+clean_up:
+       if (ret != 0)
+               mem_op().unpin(memmgr, runlist->mem[new_buf].ref);
+       else
+               mem_op().unpin(memmgr, runlist->mem[old_buf].ref);
+
+       mem_op().munmap(runlist->mem[new_buf].ref,
+                    runlist_entry_base);
+done:
+       mutex_unlock(&runlist->mutex);
+       return ret;
+}
+
+void gk20a_free_channel(struct nvhost_hwctx *ctx)
+{
+       struct channel_gk20a *ch = ctx->priv;
+       struct gk20a *g = ch->g;
+       struct fifo_gk20a *f = &g->fifo;
+       struct gr_gk20a *gr = &g->gr;
+       struct mem_mgr *memmgr = gk20a_channel_mem_mgr(ch);
+       struct vm_gk20a *ch_vm = ch->vm;
+       struct fifo_engine_info_gk20a *engine_info =
+                       f->engine_info + ENGINE_GR_GK20A;
+
+       if (!ch->bound)
+               return;
+
+       if (!gk20a_channel_as_bound(ch))
+               goto unbind;
+
+       /* stop, verify stoppage */
+
+       /* disable channel */
+       gk20a_writel(g, ccsr_channel_r(ch->hw_chid),
+               gk20a_readl(g, ccsr_channel_r(ch->hw_chid)) |
+               ccsr_channel_enable_clr_true_f());
+
+       /* preempt the channel */
+       gk20a_fifo_preempt_channel(g,
+               engine_info->runlist_id, ch->hw_chid);
+
+       /* remove channel from runlist */
+       channel_gk20a_update_runlist(ch, false);
+
+       /* release channel ctx */
+       gk20a_free_channel_ctx(ch);
+
+       gk20a_gr_flush_channel_tlb(gr);
+
+       memset(&ch->ramfc, 0, sizeof(struct mem_desc_sub));
+
+       /* free gpfifo */
+       ch_vm->unmap(ch_vm, ch->gpfifo.gpu_va);
+       mem_op().munmap(ch->gpfifo.mem.ref, ch->gpfifo.cpu_va);
+       mem_op().put(memmgr, ch->gpfifo.mem.ref);
+       memset(&ch->gpfifo, 0, sizeof(struct gpfifo_desc));
+
+       ctx->priv = NULL;
+       channel_gk20a_free_priv_cmdbuf(ch);
+
+       /* release hwctx binding to the as_share */
+       nvhost_as_release_share(ch_vm->as_share, ctx);
+
+unbind:
+       channel_gk20a_unbind(ch);
+       channel_gk20a_free_inst(g, ch);
+
+       ch->vpr = false;
+
+       /* ALWAYS last */
+       release_used_channel(f, ch);
+}
+
+struct nvhost_hwctx *gk20a_open_channel(struct nvhost_channel *ch,
+                                        struct nvhost_hwctx *ctx)
+{
+       struct gk20a *g = get_gk20a(ch->dev);
+       struct fifo_gk20a *f = &g->fifo;
+       struct channel_gk20a *ch_gk20a;
+
+       ch_gk20a = acquire_unused_channel(f);
+       if (ch_gk20a == NULL) {
+               /* TBD: we want to make this virtualizable */
+               nvhost_err(dev_from_gk20a(g), "out of hw chids");
+               return 0;
+       }
+
+       ctx->priv = ch_gk20a;
+       ch_gk20a->g = g;
+       /* note the ch here is the same for *EVERY* gk20a channel */
+       ch_gk20a->ch = ch;
+       /* but there's one hwctx per gk20a channel */
+       ch_gk20a->hwctx = ctx;
+
+       if (channel_gk20a_alloc_inst(g, ch_gk20a)) {
+               ch_gk20a->in_use = false;
+               ctx->priv = 0;
+               nvhost_err(dev_from_gk20a(g),
+                          "failed to open gk20a channel, out of inst mem");
+
+               return 0;
+       }
+       channel_gk20a_bind(ch_gk20a);
+
+       /* The channel is *not* runnable at this point. It still needs an
+        * address space bound to it and a gpfifo and grctx allocated. */
+
+
+       init_waitqueue_head(&ch_gk20a->notifier_wq);
+       init_waitqueue_head(&ch_gk20a->semaphore_wq);
+
+       return ctx;
+}
+
+/* move to debug_gk20a.c ... */
+static void dump_gpfifo(struct channel_gk20a *c)
+{
+       void *inst_ptr;
+       u32 chid = c->hw_chid;
+
+       nvhost_dbg_fn("");
+
+       inst_ptr = mem_op().mmap(c->inst_block.mem.ref);
+       if (IS_ERR(inst_ptr))
+               return;
+
+       nvhost_dbg_info("ramfc for channel %d:\n"
+               "ramfc: gp_base 0x%08x, gp_base_hi 0x%08x, "
+               "gp_fetch 0x%08x, gp_get 0x%08x, gp_put 0x%08x, "
+               "pb_fetch 0x%08x, pb_fetch_hi 0x%08x, "
+               "pb_get 0x%08x, pb_get_hi 0x%08x, "
+               "pb_put 0x%08x, pb_put_hi 0x%08x\n"
+               "userd: gp_put 0x%08x, gp_get 0x%08x, "
+               "get 0x%08x, get_hi 0x%08x, "
+               "put 0x%08x, put_hi 0x%08x\n"
+               "pbdma: status 0x%08x, channel 0x%08x, userd 0x%08x, "
+               "gp_base 0x%08x, gp_base_hi 0x%08x, "
+               "gp_fetch 0x%08x, gp_get 0x%08x, gp_put 0x%08x, "
+               "pb_fetch 0x%08x, pb_fetch_hi 0x%08x, "
+               "get 0x%08x, get_hi 0x%08x, put 0x%08x, put_hi 0x%08x\n"
+               "channel: ccsr_channel 0x%08x",
+               chid,
+               mem_rd32(inst_ptr, ram_fc_gp_base_w()),
+               mem_rd32(inst_ptr, ram_fc_gp_base_hi_w()),
+               mem_rd32(inst_ptr, ram_fc_gp_fetch_w()),
+               mem_rd32(inst_ptr, ram_fc_gp_get_w()),
+               mem_rd32(inst_ptr, ram_fc_gp_put_w()),
+               mem_rd32(inst_ptr, ram_fc_pb_fetch_w()),
+               mem_rd32(inst_ptr, ram_fc_pb_fetch_hi_w()),
+               mem_rd32(inst_ptr, ram_fc_pb_get_w()),
+               mem_rd32(inst_ptr, ram_fc_pb_get_hi_w()),
+               mem_rd32(inst_ptr, ram_fc_pb_put_w()),
+               mem_rd32(inst_ptr, ram_fc_pb_put_hi_w()),
+               mem_rd32(c->userd_cpu_va, ram_userd_gp_put_w()),
+               mem_rd32(c->userd_cpu_va, ram_userd_gp_get_w()),
+               mem_rd32(c->userd_cpu_va, ram_userd_get_w()),
+               mem_rd32(c->userd_cpu_va, ram_userd_get_hi_w()),
+               mem_rd32(c->userd_cpu_va, ram_userd_put_w()),
+               mem_rd32(c->userd_cpu_va, ram_userd_put_hi_w()),
+               gk20a_readl(c->g, pbdma_status_r(0)),
+               gk20a_readl(c->g, pbdma_channel_r(0)),
+               gk20a_readl(c->g, pbdma_userd_r(0)),
+               gk20a_readl(c->g, pbdma_gp_base_r(0)),
+               gk20a_readl(c->g, pbdma_gp_base_hi_r(0)),
+               gk20a_readl(c->g, pbdma_gp_fetch_r(0)),
+               gk20a_readl(c->g, pbdma_gp_get_r(0)),
+               gk20a_readl(c->g, pbdma_gp_put_r(0)),
+               gk20a_readl(c->g, pbdma_pb_fetch_r(0)),
+               gk20a_readl(c->g, pbdma_pb_fetch_hi_r(0)),
+               gk20a_readl(c->g, pbdma_get_r(0)),
+               gk20a_readl(c->g, pbdma_get_hi_r(0)),
+               gk20a_readl(c->g, pbdma_put_r(0)),
+               gk20a_readl(c->g, pbdma_put_hi_r(0)),
+               gk20a_readl(c->g, ccsr_channel_r(chid)));
+
+       mem_op().munmap(c->inst_block.mem.ref, inst_ptr);
+}
+
+/* Allocate the private command buffer,
+   used for inserting commands before/after user-submitted buffers. */
+static int channel_gk20a_alloc_priv_cmdbuf(struct channel_gk20a *c)
+{
+       struct device *d = dev_from_gk20a(c->g);
+       struct mem_mgr *memmgr = gk20a_channel_mem_mgr(c);
+       struct vm_gk20a *ch_vm = c->vm;
+       struct priv_cmd_queue *q = &c->priv_cmd_q;
+       struct priv_cmd_entry *e;
+       u32 i = 0, size;
+
+       size = GK20A_PRIV_CMDBUF_ENTRY_NUM * sizeof(u32);
+       q->mem.ref = mem_op().alloc(memmgr,
+                       size,
+                       DEFAULT_NVMAP_ALLOC_ALIGNMENT,
+                       DEFAULT_NVMAP_ALLOC_FLAGS,
+                       NVMAP_HEAP_CARVEOUT_GENERIC);
+       if (IS_ERR_OR_NULL(q->mem.ref)) {
+               nvhost_err(d, "ch %d : failed to allocate"
+                          " priv cmd buffer (size: %d bytes)",
+                          c->hw_chid, size);
+               goto clean_up;
+       }
+       q->mem.size = size;
+
+       q->base_ptr = (u32 *)mem_op().mmap(q->mem.ref);
+       if (IS_ERR_OR_NULL(q->base_ptr)) {
+               nvhost_err(d, "ch %d : failed to map cpu va"
+                          " for priv cmd buffer", c->hw_chid);
+               goto clean_up;
+       }
+
+       memset(q->base_ptr, 0, size);
+
+       q->base_gva = ch_vm->map(ch_vm, memmgr,
+                       q->mem.ref,
+                       0, 0, 0 /*offset_align, flags, kind*/);
+       if (!q->base_gva) {
+               nvhost_err(d, "ch %d : failed to map gpu va"
+                          " for priv cmd buffer", c->hw_chid);
+               goto clean_up;
+       }
+
+       q->size = GK20A_PRIV_CMDBUF_ENTRY_NUM;
+
+       INIT_LIST_HEAD(&q->head);
+       INIT_LIST_HEAD(&q->free);
+
+       /* pre-alloc a few entries and put them on free list */
+       for (i = 0; i < GK20A_PRIV_CMDBUF_ENTRY_PRE_ALLOC_NUM; i++) {
+               e = kzalloc(sizeof(struct priv_cmd_entry), GFP_KERNEL);
+               if (!e) {
+                       nvhost_err(d, "ch %d: fail to pre-alloc cmd entry",
+                               c->hw_chid);
+                       goto clean_up;
+               }
+               e->pre_alloc = true;
+               list_add(&e->list, &q->free);
+       }
+
+       return 0;
+
+clean_up:
+       channel_gk20a_free_priv_cmdbuf(c);
+       return -ENOMEM;
+}
+
+static void channel_gk20a_free_priv_cmdbuf(struct channel_gk20a *c)
+{
+       struct mem_mgr *memmgr = gk20a_channel_mem_mgr(c);
+       struct vm_gk20a *ch_vm = c->vm;
+       struct priv_cmd_queue *q = &c->priv_cmd_q;
+       struct priv_cmd_entry *e;
+       struct list_head *pos, *tmp, *head;
+
+       if (q->size == 0)
+               return;
+
+       ch_vm->unmap(ch_vm, q->base_gva);
+       mem_op().munmap(q->mem.ref, q->base_ptr);
+       mem_op().put(memmgr, q->mem.ref);
+
+       /* free used list */
+       head = &q->head;
+       list_for_each_safe(pos, tmp, head) {
+               e = container_of(pos, struct priv_cmd_entry, list);
+               free_priv_cmdbuf(q, e);
+       }
+
+       /* free free list */
+       head = &q->free;
+       list_for_each_safe(pos, tmp, head) {
+               e = container_of(pos, struct priv_cmd_entry, list);
+               e->pre_alloc = false;
+               free_priv_cmdbuf(q, e);
+       }
+
+       memset(q, 0, sizeof(struct priv_cmd_queue));
+}
+
+/* allocate a cmd buffer with given size. size is number of u32 entries */
+static int alloc_priv_cmdbuf(struct channel_gk20a *c, u32 orig_size,
+                            struct priv_cmd_entry **entry)
+{
+       struct priv_cmd_queue *q = &c->priv_cmd_q;
+       struct priv_cmd_entry *e;
+       struct list_head *node;
+       u32 free_count;
+       u32 size = orig_size;
+       bool no_retry = false;
+
+       nvhost_dbg_fn("size %d", orig_size);
+
+       *entry = NULL;
+
+       /* if the free space at the end of the queue is less than requested,
+        * grow the request so the allocation effectively restarts from the
+        * beginning of the queue. */
+       if (q->put + size > q->size)
+               size = orig_size + (q->size - q->put);
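+       /*
+        * Worked example with hypothetical numbers: with q->size == 1024 and
+        * q->put == 1000, a request for 40 words does not fit in the 24 words
+        * left at the end, so it is grown to 40 + 24 = 64 words; the tail of
+        * the queue is skipped and put is later reset relative to the start
+        * of the queue (see below).
+        */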
+
+       nvhost_dbg_info("ch %d: priv cmd queue get:put %d:%d",
+                       c->hw_chid, q->get, q->put);
+
+TRY_AGAIN:
+       free_count = (q->size - (q->put - q->get) - 1) % q->size;
+
+       if (size > free_count) {
+               if (!no_retry) {
+                       recycle_priv_cmdbuf(c);
+                       no_retry = true;
+                       goto TRY_AGAIN;
+               } else
+                       return -EAGAIN;
+       }
+
+       if (unlikely(list_empty(&q->free))) {
+
+               nvhost_dbg_info("ch %d: run out of pre-alloc entries",
+                       c->hw_chid);
+
+               e = kzalloc(sizeof(struct priv_cmd_entry), GFP_KERNEL);
+               if (!e) {
+                       nvhost_err(dev_from_gk20a(c->g),
+                               "ch %d: fail to allocate priv cmd entry",
+                               c->hw_chid);
+                       return -ENOMEM;
+               }
+       } else  {
+               node = q->free.next;
+               list_del(node);
+               e = container_of(node, struct priv_cmd_entry, list);
+       }
+
+       e->ptr = q->base_ptr + q->put;
+       e->gva = q->base_gva + q->put * sizeof(u32);
+       e->size = orig_size;
+       e->gp_get = c->gpfifo.get;
+       e->gp_put = c->gpfifo.put;
+       e->gp_wrap = c->gpfifo.wrap;
+
+       /* if we grew the request to skip the unusable space at the end, set
+          put to the beginning of the cmd buffer (0) + orig_size */
+       if (size != orig_size)
+               q->put = orig_size;
+       else
+               q->put = (q->put + orig_size) & (q->size - 1);
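+       /* note: the "& (q->size - 1)" masking above assumes q->size
+        * (GK20A_PRIV_CMDBUF_ENTRY_NUM, i.e. 1024) is a power of two */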
+
+       /* we already handled q->put + size > q->size so BUG_ON this */
+       BUG_ON(q->put > q->size);
+
+       /* add new entry to head since we free from head */
+       list_add(&e->list, &q->head);
+
+       *entry = e;
+
+       nvhost_dbg_fn("done");
+
+       return 0;
+}
+
+/* Don't call this to free an explicit cmd entry.
+ * It doesn't update priv_cmd_queue get/put */
+static void free_priv_cmdbuf(struct priv_cmd_queue *q,
+                            struct priv_cmd_entry *e)
+{
+       if (!e)
+               return;
+
+       list_del(&e->list);
+
+       if (unlikely(!e->pre_alloc))
+               kfree(e);
+       else {
+               memset(e, 0, sizeof(struct priv_cmd_entry));
+               e->pre_alloc = true;
+               list_add(&e->list, &q->free);
+       }
+}
+
+/* free entries if they're no longer being used */
+static void recycle_priv_cmdbuf(struct channel_gk20a *c)
+{
+       struct priv_cmd_queue *q = &c->priv_cmd_q;
+       struct priv_cmd_entry *e;
+       struct list_head *pos, *tmp, *head = &q->head;
+       bool wrap_around;
+
+       nvhost_dbg_fn("");
+
+       /* Find the most recent free entry. Free it and everything before it */
+       list_for_each(pos, head) {
+
+               e = list_entry(pos, struct priv_cmd_entry, list);
+
+               nvhost_dbg_info("ch %d: cmd entry get:put:wrap %d:%d:%d "
+                       "curr get:put:wrap %d:%d:%d",
+                       c->hw_chid, e->gp_get, e->gp_put, e->gp_wrap,
+                       c->gpfifo.get, c->gpfifo.put, c->gpfifo.wrap);
+
+               wrap_around = (c->gpfifo.wrap != e->gp_wrap);
+               if (e->gp_get < e->gp_put) {
+                       if (c->gpfifo.get >= e->gp_put ||
+                           wrap_around)
+                               break;
+                       else
+                               e->gp_get = c->gpfifo.get;
+               } else if (e->gp_get > e->gp_put) {
+                       if (wrap_around &&
+                           c->gpfifo.get >= e->gp_put)
+                               break;
+                       else
+                               e->gp_get = c->gpfifo.get;
+               }
+       }
+
+       if (pos != head) {
+               q->get = (e->ptr - q->base_ptr) + e->size;
+       } else {
+               nvhost_dbg_info("no free entry recycled");
+               return;
+       }
+
+       head = pos->prev;
+       list_for_each_safe(pos, tmp, head) {
+               e = container_of(pos, struct priv_cmd_entry, list);
+               free_priv_cmdbuf(q, e);
+       }
+
+       nvhost_dbg_fn("done");
+}
+
+
+int gk20a_alloc_channel_gpfifo(struct channel_gk20a *c,
+                              struct nvhost_alloc_gpfifo_args *args)
+{
+       struct mem_mgr *memmgr = gk20a_channel_mem_mgr(c);
+       struct gk20a *g = c->g;
+       struct device *d = dev_from_gk20a(g);
+       struct vm_gk20a *ch_vm;
+       u32 gpfifo_size = roundup_pow_of_two(args->num_entries);
+       u32 ret;
+
+       /* TBD: add kernel ioctl change
+       if (args->flags & NVHOST_ALLOC_GPFIFO_FLAGS_VPR_ENABLED)
+               c->vpr = true; */
+
+       /* an address space needs to have been bound at this point.   */
+       if (!gk20a_channel_as_bound(c)) {
+               int err;
+               nvhost_warn(dev_from_gk20a(g),
+                           "not bound to an address space at time of gpfifo"
+                           " allocation.  Attempting to create and bind to"
+                           " one...");
+               /*
+                * Eventually this will be a fatal error. For now attempt to
+                * create and bind a share here.  This helps until we change
+                * clients to use the new address space API.  However doing this
+                * can mask errors in programming access to the address space
+                * through the front door...
+                */
+               err = nvhost_as_alloc_and_bind_share(c->ch, c->hwctx);
+               if (err || !gk20a_channel_as_bound(c)) {
+                       nvhost_err(dev_from_gk20a(g),
+                                  "not bound to address space at time"
+                                  " of gpfifo allocation");
+                       return err;
+               }
+       }
+       ch_vm = c->vm;
+
+       c->ramfc.offset = 0;
+       c->ramfc.size = ram_in_ramfc_s() / 8;
+
+       if (c->gpfifo.mem.ref) {
+               nvhost_err(d, "channel %d :"
+                          " gpfifo already allocated", c->hw_chid);
+               return -EEXIST;
+       }
+
+       c->gpfifo.mem.ref = mem_op().alloc(memmgr,
+                       gpfifo_size * sizeof(struct gpfifo),
+                       DEFAULT_NVMAP_ALLOC_ALIGNMENT,
+                       DEFAULT_NVMAP_ALLOC_FLAGS,
+                       NVMAP_HEAP_CARVEOUT_GENERIC);
+       if (IS_ERR_OR_NULL(c->gpfifo.mem.ref)) {
+               nvhost_err(d, "channel %d :"
+                          " failed to allocate gpfifo (%d entries)",
+                          c->hw_chid, gpfifo_size);
+               c->gpfifo.mem.ref = 0;
+               return -ENOMEM;
+       }
+       c->gpfifo.entry_num = gpfifo_size;
+
+       c->gpfifo.cpu_va = (struct gpfifo *)mem_op().mmap(c->gpfifo.mem.ref);
+       if (IS_ERR_OR_NULL(c->gpfifo.cpu_va))
+               goto clean_up;
+
+       c->gpfifo.get = c->gpfifo.put = 0;
+
+       c->gpfifo.gpu_va = ch_vm->map(ch_vm, memmgr,
+                                     c->gpfifo.mem.ref,
+                                     0, 0 /*offset_align, flags*/, 0);
+       if (!c->gpfifo.gpu_va) {
+               nvhost_err(d, "channel %d : failed to map"
+                          " gpu_va for gpfifo", c->hw_chid);
+               goto clean_up;
+       }
+
+       nvhost_dbg_info("channel %d : gpfifo_base 0x%016llx, size %d",
+               c->hw_chid, c->gpfifo.gpu_va, c->gpfifo.entry_num);
+
+       channel_gk20a_setup_ramfc(c, c->gpfifo.gpu_va, c->gpfifo.entry_num);
+
+       channel_gk20a_setup_userd(c);
+       channel_gk20a_commit_userd(c);
+
+       /* TBD: setup engine contexts */
+
+       ret = channel_gk20a_alloc_priv_cmdbuf(c);
+       if (ret)
+               goto clean_up;
+
+       ret = channel_gk20a_update_runlist(c, true);
+       if (ret)
+               goto clean_up;
+
+       nvhost_dbg_fn("done");
+       return 0;
+
+clean_up:
+       nvhost_dbg(dbg_fn | dbg_err, "fail");
+       ch_vm->unmap(ch_vm, c->gpfifo.gpu_va);
+       mem_op().munmap(c->gpfifo.mem.ref, c->gpfifo.cpu_va);
+       mem_op().put(memmgr, c->gpfifo.mem.ref);
+       memset(&c->gpfifo, 0, sizeof(struct gpfifo_desc));
+       return -ENOMEM;
+}
+
+int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
+                               struct nvhost_gpfifo *gpfifo,
+                               u32 num_entries,
+                               struct nvhost_fence *fence,
+                               u32 flags)
+{
+       struct gk20a *g = c->g;
+       struct device *d = dev_from_gk20a(g);
+       struct nvhost_syncpt *sp = syncpt_from_gk20a(g);
+       u32 new_put, new_get;
+       u32 free_count;
+       u32 extra_count = 0;
+       u32 i;
+       u32 err = 0;
+       struct priv_cmd_entry *wait_cmd = NULL;
+       struct priv_cmd_entry *get_cmd = NULL;
+
+       nvhost_dbg_info("channel %d", c->hw_chid);
+
+       /* check whether gp_put changed unexpectedly since the last update */
+       new_put = gk20a_bar1_readl(g,
+                       c->userd_gpu_va + 4 * ram_userd_gp_put_w());
+       if (c->gpfifo.put != new_put) {
+               /* BUG_ON this */
+               nvhost_err(dev_from_gk20a(g), "gp_put changed unexpectedly "
+                          "since last update");
+               c->gpfifo.put = new_put;
+       }
+
+       /* update gp_get from userd before a new submission */
+       new_get = gk20a_bar1_readl(g,
+               c->userd_gpu_va + sizeof(u32) * ram_userd_gp_get_w());
+       if (new_get < c->gpfifo.get)
+               c->gpfifo.wrap = !c->gpfifo.wrap;
+
+       c->gpfifo.get = new_get;
+
+       nvhost_dbg_info("put %d, get %d, size %d",
+               c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num);
+
+       free_count = (c->gpfifo.entry_num -
+               (c->gpfifo.put - c->gpfifo.get) - 1) %
+               c->gpfifo.entry_num;
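+       /*
+        * Worked example with hypothetical numbers: for a 256-entry gpfifo
+        * with put == 10 and get == 4, six entries are in flight, so
+        * free_count == (256 - 6 - 1) % 256 == 249; one slot is always kept
+        * unused so a full ring can be told apart from an empty one.
+        */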
+
+       if ((flags & NVHOST_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) &&
+           !nvhost_syncpt_is_expired(sp, fence->syncpt_id, fence->value)) {
+               alloc_priv_cmdbuf(c, 4, &wait_cmd);
+               if (wait_cmd == NULL) {
+                       nvhost_err(d, "not enough priv cmd buffer space");
+                       err = -EAGAIN;
+                       goto clean_up;
+               }
+               extra_count++;
+       }
+       if (flags & NVHOST_SUBMIT_GPFIFO_FLAGS_FENCE_GET) {
+               alloc_priv_cmdbuf(c, 2, &get_cmd);
+               if (get_cmd == NULL) {
+                       nvhost_err(d, "not enough priv cmd buffer space");
+                       err = -EAGAIN;
+                       goto clean_up;
+               }
+               extra_count++;
+       }
+
+       if (num_entries + extra_count > free_count) {
+               nvhost_err(d, "not enough gpfifo space");
+               err = -EAGAIN;
+               goto clean_up;
+       }
+
+       if (wait_cmd) {
+               /* syncpoint_a */
+               wait_cmd->ptr[0] = 0x2001001C;
+               /* payload */
+               wait_cmd->ptr[1] = fence->value;
+               /* syncpoint_b */
+               wait_cmd->ptr[2] = 0x2001001D;
+               /* syncpt_id, switch_en, wait */
+               wait_cmd->ptr[3] = (fence->syncpt_id << 8) | 0x10;
+
+               nvhost_dbg_info("cmds for syncpt wait : "
+                       "0x%08x, 0x%08x, 0x%08x, 0x%08x",
+                       wait_cmd->ptr[0],
+                       wait_cmd->ptr[1],
+                       wait_cmd->ptr[2],
+                       wait_cmd->ptr[3]);
+
+               nvhost_dbg_info("put %d, get %d, size %d",
+                       c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num);
+
+               c->gpfifo.cpu_va[c->gpfifo.put].entry0 =
+                       u64_lo32(wait_cmd->gva);
+               c->gpfifo.cpu_va[c->gpfifo.put].entry1 =
+                       u64_hi32(wait_cmd->gva) |
+                       (4 << 10); /* 4 words for above cmds */
+
+               c->gpfifo.put = (c->gpfifo.put + 1) &
+                       (c->gpfifo.entry_num - 1);
+
+               /* save gp_put */
+               wait_cmd->gp_put = c->gpfifo.put;
+       }
+
+       for (i = 0; i < num_entries; i++) {
+               nvhost_dbg_info("put %d, get %d, size %d",
+                       c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num);
+
+               c->gpfifo.cpu_va[c->gpfifo.put].entry0 =
+                       u64_lo32(gpfifo[i].gpu_va);
+               c->gpfifo.cpu_va[c->gpfifo.put].entry1 =
+                       u64_hi32(gpfifo[i].gpu_va) | (gpfifo[i].words << 10);
+
+               c->gpfifo.put = (c->gpfifo.put + 1) &
+                       (c->gpfifo.entry_num - 1);
+       }
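+
+       /* Each gpfifo entry above packs the low 32 bits of the buffer's GPU VA
+        * in entry0, and the high VA bits together with the length in words
+        * (shifted left by 10) in entry1, mirroring the priv cmd entries. */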
+
+       if (get_cmd) {
+               fence->syncpt_id = c->hw_chid + gk20a_device.syncpt_base;
+               fence->value     = nvhost_syncpt_incr_max(sp, fence->syncpt_id, 1);
+
+               trace_nvhost_ioctl_ctrl_syncpt_incr(fence->syncpt_id);
+
+               /* increment_sync_point */
+               get_cmd->ptr[0] = 0x200100B2;
+               /* syncpt index */
+               get_cmd->ptr[1] = fence->syncpt_id;
+
+               nvhost_dbg_info("cmds for syncpt incr : 0x%08x, 0x%08x",
+                               get_cmd->ptr[0],
+                               get_cmd->ptr[1]);
+
+               nvhost_dbg_info("put %d, get %d, size %d",
+                       c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num);
+
+               c->gpfifo.cpu_va[c->gpfifo.put].entry0 =
+                       u64_lo32(get_cmd->gva);
+               c->gpfifo.cpu_va[c->gpfifo.put].entry1 =
+                       u64_hi32(get_cmd->gva) |
+                       (2 << 10); /* 2 words for above cmds */
+
+               c->gpfifo.put = (c->gpfifo.put + 1) &
+                       (c->gpfifo.entry_num - 1);
+
+               /* save gp_put */
+               get_cmd->gp_put = c->gpfifo.put;
+       }
+
+       nvhost_dbg_info("put %d, get %d, size %d",
+               c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num);
+
+       gk20a_bar1_writel(g,
+               c->userd_gpu_va + 4 * ram_userd_gp_put_w(),
+               c->gpfifo.put);
+
+       nvhost_dbg_fn("done");
+       return 0;
+
+clean_up:
+       nvhost_dbg(dbg_fn | dbg_err, "fail");
+       return err;
+}
+
+void gk20a_remove_channel_support(struct channel_gk20a *c)
+{
+
+}
+
+int gk20a_init_channel_support(struct gk20a *g, u32 chid)
+{
+       struct channel_gk20a *c = g->fifo.channel+chid;
+       c->g = g;
+       c->in_use = false;
+       c->hw_chid = chid;
+       c->bound = false;
+       c->remove_support = gk20a_remove_channel_support;
+       return 0;
+}
+
+int gk20a_channel_init(struct nvhost_channel *ch,
+                      struct nvhost_master *host, int index)
+{
+       return 0;
+}
+
+int gk20a_channel_submit(struct nvhost_job *job)
+{
+       nvhost_dbg_fn("");
+       return 0;
+}
+
+int gk20a_channel_alloc_obj(struct nvhost_channel *channel,
+                       u32 class_num,
+                       u32 *obj_id,
+                       u32 vaspace_share)
+{
+       nvhost_dbg_fn("");
+       return 0;
+}
+
+int gk20a_channel_free_obj(struct nvhost_channel *channel, u32 obj_id)
+{
+       nvhost_dbg_fn("");
+       return 0;
+}
+
+int gk20a_channel_map_buffer(struct channel_gk20a *ch,
+                            struct nvhost_map_buffer_args *a)
+{
+       struct mem_mgr *memmgr = gk20a_channel_mem_mgr(ch);
+       u64 ret_va;
+       struct mem_handle *r;
+
+       r = mem_op().get(memmgr, a->nvmap_handle); /*id, really*/
+
+       nvhost_dbg_info("id=0x%x r=%p", a->nvmap_handle, r);
+
+       if (!r)
+               return -EINVAL;
+
+       ret_va = ch->vm->map(ch->vm, memmgr, r,
+                           a->offset_alignment.align,
+                           a->flags, a->kind);
+       if (!ret_va)
+               return -EINVAL;
+
+       a->offset_alignment.offset = ret_va;
+       return 0;
+}
+int gk20a_channel_unmap_buffer(struct channel_gk20a *ch,
+                              struct nvhost_unmap_buffer_args *a)
+{
+       nvhost_dbg_info("offset=%llx", a->offset);
+
+       ch->vm->unmap(ch->vm, a->offset);
+
+       return 0;
+}
+
+int gk20a_channel_wait(struct channel_gk20a *ch,
+                      struct nvhost_wait_args *args)
+{
+       struct device *d = dev_from_gk20a(ch->g);
+       struct mem_mgr *memmgr = gk20a_channel_mem_mgr(ch);
+       struct mem_handle *handle_ref;
+       struct notification *notif;
+       struct timespec tv;
+       u64 jiffies;
+       u32 id;
+       u32 offset;
+       u32 timeout;
+       int remain, ret = 0;
+
+       if (args->timeout == NVHOST_NO_TIMEOUT)
+               timeout = MAX_SCHEDULE_TIMEOUT;
+       else
+               timeout = (u32)msecs_to_jiffies(args->timeout);
+
+       switch (args->type) {
+       case NVHOST_WAIT_TYPE_NOTIFIER:
+               id = args->condition.notifier.nvmap_handle;
+               offset = args->condition.notifier.offset;
+
+               handle_ref = mem_op().get(memmgr, id);
+               if (!handle_ref) {
+                       nvhost_err(d, "invalid notifier nvmap handle 0x%08x",
+                                  id);
+                       return -EINVAL;
+               }
+
+               notif = mem_op().mmap(handle_ref);
+               if (IS_ERR_OR_NULL(notif)) {
+                       nvhost_err(d, "failed to map notifier memory");
+                       return -ENOMEM;
+               }
+
+               notif = (struct notification *)((u32)notif + offset);
+
+               /* user should set status pending before
+                * calling this ioctl */
+               remain = wait_event_interruptible_timeout(
+                               ch->notifier_wq,
+                               notif->status == 0,
+                               timeout);
+
+               if (remain == 0 && notif->status != 0) {
+                       ret = -ETIMEDOUT;
+                       goto notif_clean_up;
+               } else if (remain < 0) {
+                       ret = -EINTR;
+                       goto notif_clean_up;
+               }
+
+               /* TBD: fill in correct information */
+               jiffies = get_jiffies_64();
+               jiffies_to_timespec(jiffies, &tv);
+               notif->timestamp.nanoseconds[0] = tv.tv_nsec;
+               notif->timestamp.nanoseconds[1] = tv.tv_sec;
+               notif->info32 = 0xDEADBEEF; /* should be object name */
+               notif->info16 = ch->hw_chid; /* should be method offset */
+
+notif_clean_up:
+               mem_op().munmap(handle_ref, notif);
+               return ret;
+       case NVHOST_WAIT_TYPE_SEMAPHORE:
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+int gk20a_channel_zcull_get_size(struct channel_gk20a *ch,
+                           struct nvhost_zcull_get_size_args *args)
+{
+       struct gk20a *g = ch->g;
+       struct gr_gk20a *gr = &g->gr;
+
+       nvhost_dbg_fn("");
+
+       args->size = gr_gk20a_get_ctxsw_zcull_size(g, gr);
+
+       return 0;
+}
+
+int gk20a_channel_zcull_bind(struct channel_gk20a *ch,
+                           struct nvhost_zcull_bind_args *args)
+{
+       struct gk20a *g = ch->g;
+       struct gr_gk20a *gr = &g->gr;
+
+       nvhost_dbg_fn("");
+
+       return gr_gk20a_bind_ctxsw_zcull(g, gr, ch,
+                               args->gpu_va, args->mode);
+}
+
+int gk20a_channel_zcull_get_info(struct channel_gk20a *ch,
+                           struct nvhost_zcull_get_info_args *args)
+{
+       struct gk20a *g = ch->g;
+       struct gr_gk20a *gr = &g->gr;
+       struct gr_zcull_info zcull_info;
+       int err;
+
+       nvhost_dbg_fn("");
+
+       memset(&zcull_info, 0, sizeof(struct gr_zcull_info));
+
+       err = gr_gk20a_get_zcull_info(g, gr, &zcull_info);
+
+       args->width_align_pixels = zcull_info.width_align_pixels;
+       args->height_align_pixels = zcull_info.height_align_pixels;
+       args->pixel_squares_by_aliquots = zcull_info.pixel_squares_by_aliquots;
+       args->aliquot_total = zcull_info.aliquot_total;
+       args->region_byte_multiplier = zcull_info.region_byte_multiplier;
+       args->region_header_size = zcull_info.region_header_size;
+       args->subregion_header_size = zcull_info.subregion_header_size;
+       args->subregion_width_align_pixels = zcull_info.subregion_width_align_pixels;
+       args->subregion_height_align_pixels = zcull_info.subregion_height_align_pixels;
+       args->subregion_count = zcull_info.subregion_count;
+
+       return err;
+}
+
+int gk20a_channel_zbc_set_table(struct channel_gk20a *ch,
+                               struct nvhost_zbc_set_table_args *args)
+{
+       struct gk20a *g = ch->g;
+       struct gr_gk20a *gr = &g->gr;
+       struct zbc_entry zbc_val;
+       int i;
+
+       nvhost_dbg_fn("");
+
+       zbc_val.format = args->format;
+       zbc_val.type = args->type;
+
+       switch (zbc_val.type) {
+       case GK20A_ZBC_TYPE_COLOR:
+               for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
+                       zbc_val.color_ds[i] = args->color_ds[i];
+                       zbc_val.color_l2[i] = args->color_l2[i];
+               }
+               break;
+       case GK20A_ZBC_TYPE_DEPTH:
+               zbc_val.depth = args->depth;
+               break;
+       default:
+               return -EINVAL;
+       }
+
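+       /* update the ZBC table via the ELPG-protected call helper so the
+        * graphics engine stays powered while the table is written */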
+       return gr_gk20a_elpg_protected_call(g,
+               gr_gk20a_add_zbc(g, gr, &zbc_val));
+}
+
+int gk20a_channel_zbc_query_table(struct channel_gk20a *ch,
+                               struct nvhost_zbc_query_table_args *args)
+{
+       struct gk20a *g = ch->g;
+       struct gr_gk20a *gr = &g->gr;
+       struct zbc_query_params zbc_tbl;
+       int i, err;
+
+       nvhost_dbg_fn("");
+
+       zbc_tbl.type = args->type;
+       zbc_tbl.index_size = args->index_size;
+
+       err = gr_gk20a_query_zbc(g, gr, &zbc_tbl);
+
+       if (!err) {
+               switch (zbc_tbl.type) {
+               case GK20A_ZBC_TYPE_COLOR:
+                       for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
+                               args->color_ds[i] = zbc_tbl.color_ds[i];
+                               args->color_l2[i] = zbc_tbl.color_l2[i];
+                       }
+                       break;
+               case GK20A_ZBC_TYPE_DEPTH:
+                       args->depth = zbc_tbl.depth;
+                       break;
+               case GK20A_ZBC_TYPE_INVALID:
+                       args->index_size = zbc_tbl.index_size;
+                       break;
+               default:
+                       return -EINVAL;
+               }
+               args->format = zbc_tbl.format;
+               args->ref_cnt = zbc_tbl.ref_cnt;
+       }
+
+       return err;
+}
diff --git a/drivers/video/tegra/host/gk20a/channel_gk20a.h b/drivers/video/tegra/host/gk20a/channel_gk20a.h
new file mode 100644 (file)
index 0000000..9ee6a10
--- /dev/null
@@ -0,0 +1,168 @@
+/*
+ * drivers/video/tegra/host/gk20a/channel_gk20a.h
+ *
+ * GK20A graphics channel
+ *
+ * Copyright (c) 2011, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#ifndef __CHANNEL_GK20A_H__
+#define __CHANNEL_GK20A_H__
+
+#include <linux/log2.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+
+struct gk20a;
+struct gr_gk20a;
+struct mem_mgr;
+struct mem_handle;
+
+#include "nvhost_channel.h"
+#include "nvhost_hwctx.h"
+
+#include "cdma_gk20a.h"
+#include "mm_gk20a.h"
+#include "gr_gk20a.h"
+
+struct gpfifo {
+       u32 entry0;
+       u32 entry1;
+};
+
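+/* layout of the notifier memory written back by gk20a_channel_wait();
+ * userspace sets status to a non-zero (pending) value before the wait
+ * ioctl and the wait completes once status reads back as zero */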
+struct notification {
+       struct {
+               u32 nanoseconds[2];
+       } timestamp;
+       u32 info32;
+       u16 info16;
+       u16 status;
+};
+
+struct fence {
+       u32 hw_chid;
+       u32 syncpt_val;
+};
+
+/* contexts associated with a channel */
+struct channel_ctx_gk20a {
+       struct gr_ctx_desc      gr_ctx;
+       struct pm_ctx_desc      pm_ctx;
+       struct patch_desc       patch_ctx;
+       struct zcull_ctx_desc   zcull_ctx;
+       u32     global_ctx_buffer_va[NR_GLOBAL_CTX_BUF_VA];
+       bool    global_ctx_buffer_mapped;
+};
+
+/* this is the priv element of struct nvhost_channel */
+struct channel_gk20a {
+       struct gk20a *g;
+       bool in_use;
+       int hw_chid;
+       bool bound;
+       bool first_init;
+       bool vpr;
+
+       struct mem_mgr *memmgr;
+       struct nvhost_channel *ch;
+       struct nvhost_hwctx *hwctx;
+
+       struct vm_gk20a *vm;
+
+       struct gpfifo_desc gpfifo;
+
+       struct channel_ctx_gk20a ch_ctx;
+
+       struct inst_desc inst_block;
+       struct mem_desc_sub ramfc;
+
+       void *userd_cpu_va;
+       phys_addr_t userd_cpu_pa;
+       u64 userd_gpu_va;
+
+       s32 num_objects;
+
+       struct priv_cmd_queue priv_cmd_q;
+
+       wait_queue_head_t notifier_wq;
+       wait_queue_head_t semaphore_wq;
+
+       void (*remove_support)(struct channel_gk20a *);
+};
+
+static inline bool gk20a_channel_as_bound(struct channel_gk20a *ch)
+{
+       return !!ch->hwctx->as_share;
+}
+int channel_gk20a_commit_va(struct channel_gk20a *c);
+
+struct nvhost_unmap_buffer_args;
+struct nvhost_zcull_get_size_args;
+struct nvhost_zbc_query_table_args;
+struct nvhost_fence;
+struct nvhost_alloc_gpfifo_args;
+struct nvhost_map_buffer_args;
+struct nvhost_wait_args;
+struct nvhost_zcull_bind_args;
+struct nvhost_zcull_get_info_args;
+struct nvhost_gpfifo;
+struct nvhost_zbc_set_table_args;
+
+int gk20a_init_channel_support(struct gk20a *, u32 chid);
+int gk20a_channel_init(struct nvhost_channel *ch, struct nvhost_master *host,
+                      int index);
+int gk20a_channel_submit(struct nvhost_job *job);
+int gk20a_channel_alloc_obj(struct nvhost_channel *channel,
+                       u32 class_num, u32 *obj_id, u32 vaspace_share);
+int gk20a_channel_free_obj(struct nvhost_channel *channel,
+                       u32 obj_id);
+struct nvhost_hwctx *gk20a_open_channel(struct nvhost_channel *ch,
+                       struct nvhost_hwctx *ctx);
+int gk20a_alloc_channel_gpfifo(struct channel_gk20a *c,
+                       struct nvhost_alloc_gpfifo_args *args);
+int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
+                       struct nvhost_gpfifo *gpfifo, u32 num_entries,
+                       struct nvhost_fence *fence, u32 flags);
+void gk20a_free_channel(struct nvhost_hwctx *ctx);
+int gk20a_channel_map_buffer(struct channel_gk20a *ch,
+                            struct nvhost_map_buffer_args *a);
+int gk20a_channel_unmap_buffer(struct channel_gk20a *ch,
+                              struct nvhost_unmap_buffer_args *a);
+int gk20a_channel_wait(struct channel_gk20a *ch,
+                      struct nvhost_wait_args *args);
+int gk20a_channel_zcull_get_size(struct channel_gk20a *ch,
+                           struct nvhost_zcull_get_size_args *args);
+int gk20a_channel_zcull_bind(struct channel_gk20a *ch,
+                           struct nvhost_zcull_bind_args *args);
+int gk20a_channel_zcull_get_info(struct channel_gk20a *ch,
+                           struct nvhost_zcull_get_info_args *args);
+int gk20a_channel_zbc_set_table(struct channel_gk20a *ch,
+                           struct nvhost_zbc_set_table_args *args);
+int gk20a_channel_zbc_query_table(struct channel_gk20a *ch,
+                           struct nvhost_zbc_query_table_args *args);
+
+static inline
+struct mem_mgr *gk20a_channel_mem_mgr(struct channel_gk20a *ch)
+{
+       return ch->hwctx->memmgr;
+}
+
+static inline
+struct nvhost_master *host_from_gk20a_channel(struct channel_gk20a *ch)
+{
+       return nvhost_get_host(ch->ch->dev);
+}
+
+#endif /*__CHANNEL_GK20A_H__*/
diff --git a/drivers/video/tegra/host/gk20a/clk_gk20a.c b/drivers/video/tegra/host/gk20a/clk_gk20a.c
new file mode 100644 (file)
index 0000000..18d055b
--- /dev/null
@@ -0,0 +1,344 @@
+/*
+ * drivers/video/tegra/host/gk20a/clk_gk20a.c
+ *
+ * GK20A Clocks
+ *
+ * Copyright (c) 2011 - 2012, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/clk.h>
+#include <linux/delay.h>       /* for mdelay */
+
+#include <mach/clk.h>
+
+#include "../dev.h"
+
+#include "gk20a.h"
+#include "hw_trim_gk20a.h"
+
+#define KHz 1000
+#define MHz 1000000
+
+/* from vbios PLL info table */
+static struct pll_parms gpc_pll_params = {
+       810, 405,       /* freq */
+       1100, 2200,     /* vco */
+       25, 100,        /* u */
+       1, 255,         /* M */
+       8, 255,         /* N */
+       1, 63,          /* PL */
+};
+
+/* Calculate and update M/N/PL as well as pll->freq
+    ref_clk_f = clk_in_f / src_div = clk_in_f; (src_div = 1 on gk20a)
+    u_f = ref_clk_f / M;
+    PLL output = vco_f = u_f * N = ref_clk_f * N / M;
+    gpc2clk = target clock frequency = vco_f / PL;
+    gpcclk = gpc2clk / 2; */
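+/* Example with the power-on defaults programmed by gk20a_init_clk_setup_sw():
+    clk_in = 26 MHz, M = 1, N = 31, PL = 1
+    => vco_f = 26 * 31 / 1 = 806 MHz = gpc2clk, gpcclk = 403 MHz */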
+static int clk_config_pll(struct clk_gk20a *clk, struct pll *pll,
+       struct pll_parms *pll_params, u32 *target_freq, bool best_fit)
+{
+       u32 min_vco_f, max_vco_f;
+       u32 best_M, best_N;
+       u32 low_PL, high_PL, best_PL;
+       u32 pl, m, n, n2;
+       u32 target_vco_f, vco_f;
+       u32 ref_clk_f, target_clk_f, u_f;
+       u32 delta, lwv, best_delta = ~0;
+
+       BUG_ON(target_freq == NULL);
+
+       nvhost_dbg_fn("request target freq %d MHz", *target_freq);
+
+       ref_clk_f = pll->clk_in;
+       target_clk_f = *target_freq;
+       max_vco_f = pll_params->max_vco;
+       min_vco_f = pll_params->min_vco;
+       best_M = pll_params->max_M;
+       best_N = pll_params->min_N;
+       best_PL = pll_params->min_PL;
+
+       target_vco_f = target_clk_f + target_clk_f / 50;
+       if (max_vco_f < target_vco_f)
+               max_vco_f = target_vco_f;
+
+       high_PL = (max_vco_f + target_vco_f - 1) / target_vco_f;
+       high_PL = min(high_PL, pll_params->max_PL);
+       high_PL = max(high_PL, pll_params->min_PL);
+
+       low_PL = min_vco_f / target_vco_f;
+       low_PL = min(low_PL, pll_params->max_PL);
+       low_PL = max(low_PL, pll_params->min_PL);
+
+       nvhost_dbg_info("low_PL %d, high_PL %d", low_PL, high_PL);
+
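+       /* search PL from high to low: a larger PL implies a higher VCO
+        * frequency for the same output clock, so the widest post dividers
+        * are tried first */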
+       for (pl = high_PL; pl >= low_PL; pl--) {
+               target_vco_f = target_clk_f * pl;
+
+               for (m = pll_params->min_M; m <= pll_params->max_M; m++) {
+                       u_f = ref_clk_f / m;
+
+                       if (u_f < pll_params->min_u)
+                               break;
+                       if (u_f > pll_params->max_u)
+                               continue;
+
+                       n = (target_vco_f * m) / ref_clk_f;
+                       n2 = ((target_vco_f * m) + (ref_clk_f - 1)) / ref_clk_f;
+
+                       if (n > pll_params->max_N)
+                               break;
+
+                       for (; n <= n2; n++) {
+                               if (n < pll_params->min_N)
+                                       continue;
+                               if (n > pll_params->max_N)
+                                       break;
+
+                               vco_f = ref_clk_f * n / m;
+
+                               if (vco_f >= min_vco_f && vco_f <= max_vco_f) {
+                                       lwv = (vco_f + (pl / 2)) / pl;
+                                       delta = abs(lwv - target_clk_f);
+
+                                       if (delta < best_delta) {
+                                               best_delta = delta;
+                                               best_M = m;
+                                               best_N = n;
+                                               best_PL = pl;
+
+                                               if (best_delta == 0 ||
+                                                   /* 0.45% for non best fit */
+                                                   (!best_fit && (vco_f / best_delta > 218))) {
+                                                       goto found_match;
+                                               }
+
+                                               nvhost_dbg_info("delta %d @ M %d, N %d, PL %d",
+                                                       delta, m, n, pl);
+                                       }
+                               }
+                       }
+               }
+       }
+
+found_match:
+       BUG_ON(best_delta == ~0);
+
+       if (best_fit && best_delta != 0)
+               nvhost_warn(dev_from_gk20a(clk->g),
+                       "no best match for target freq @ %d on gpc_pll",
+                       target_clk_f);
+
+       pll->M = best_M;
+       pll->N = best_N;
+       pll->PL = best_PL;
+
+       /* save current frequency */
+       pll->freq = ref_clk_f * pll->N / (pll->M * pll->PL);
+
+       *target_freq = pll->freq;
+
+       nvhost_dbg_info("actual target freq %d MHz, M %d, N %d, PL %d",
+               *target_freq, pll->M, pll->N, pll->PL);
+
+       nvhost_dbg_fn("done");
+
+       return 0;
+}
+
+static int clk_program_gpc_pll(struct gk20a *g, struct clk_gk20a *clk)
+{
+       u32 data, cfg, coeff, timeout;
+
+       nvhost_dbg_fn("");
+
+       /* put PLL in bypass before programming it */
+       data = gk20a_readl(g, trim_sys_sel_vco_r());
+       data = set_field(data, trim_sys_sel_vco_gpc2clk_out_m(),
+               trim_sys_sel_vco_gpc2clk_out_bypass_f());
+       gk20a_writel(g, trim_sys_sel_vco_r(), data);
+
+       /* disable PLL before changing coefficients */
+       cfg = gk20a_readl(g, trim_sys_gpcpll_cfg_r());
+       cfg = set_field(cfg, trim_sys_gpcpll_cfg_enable_m(),
+                       trim_sys_gpcpll_cfg_enable_no_f());
+       gk20a_writel(g, trim_sys_gpcpll_cfg_r(), cfg);
+
+       /* change coefficients */
+       coeff = trim_sys_gpcpll_coeff_mdiv_f(clk->gpc_pll.M) |
+               trim_sys_gpcpll_coeff_ndiv_f(clk->gpc_pll.N) |
+               trim_sys_gpcpll_coeff_pldiv_f(clk->gpc_pll.PL);
+       gk20a_writel(g, trim_sys_gpcpll_coeff_r(), coeff);
+
+       /* enable PLL after changing coefficients */
+       cfg = gk20a_readl(g, trim_sys_gpcpll_cfg_r());
+       cfg = set_field(cfg, trim_sys_gpcpll_cfg_enable_m(),
+                       trim_sys_gpcpll_cfg_enable_yes_f());
+       gk20a_writel(g, trim_sys_gpcpll_cfg_r(), cfg);
+
+       /* lock pll */
+       cfg = gk20a_readl(g, trim_sys_gpcpll_cfg_r());
+       if (cfg & trim_sys_gpcpll_cfg_enb_lckdet_power_off_f()) {
+               cfg = set_field(cfg, trim_sys_gpcpll_cfg_enb_lckdet_m(),
+                       trim_sys_gpcpll_cfg_enb_lckdet_power_on_f());
+               gk20a_writel(g, trim_sys_gpcpll_cfg_r(), cfg);
+       }
+
+       /* wait pll lock */
+       timeout = clk->pll_delay / 20 + 1;
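+       /* poll the lock bit every 20 us for up to ~pll_delay usec */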
+       do {
+               cfg = gk20a_readl(g, trim_sys_gpcpll_cfg_r());
+               if (cfg & trim_sys_gpcpll_cfg_pll_lock_true_f())
+                       goto pll_locked;
+               udelay(20);
+       } while (--timeout > 0);
+
+       /* PLL is messed up. What can we do here? */
+       BUG();
+       return -EBUSY;
+
+pll_locked:
+       /* put PLL back on vco */
+       data = gk20a_readl(g, trim_sys_sel_vco_r());
+       data = set_field(data, trim_sys_sel_vco_gpc2clk_out_m(),
+               trim_sys_sel_vco_gpc2clk_out_vco_f());
+       gk20a_writel(g, trim_sys_sel_vco_r(), data);
+
+       return 0;
+}
+
+static int gk20a_init_clk_reset_enable_hw(struct gk20a *g)
+{
+       nvhost_dbg_fn("");
+       return 0;
+}
+
+static int gk20a_init_clk_setup_sw(struct gk20a *g, bool reinit)
+{
+       struct clk_gk20a *clk = &g->clk;
+
+       nvhost_dbg_fn("");
+
+       /* TBD: set this according to different environments */
+       clk->pll_delay = 5000; /* usec */
+
+       /* target gpc2clk = 806MHz, gpcclk = 403MHz */
+       clk->gpc_pll.id = GK20A_GPC_PLL;
+       clk->gpc_pll.clk_in = 26; /* MHz */
+       /* settings in vbios */
+       clk->gpc_pll.M = 1;
+       clk->gpc_pll.N = 31;
+       clk->gpc_pll.PL = 1;
+       clk->gpc_pll.freq = (clk->gpc_pll.clk_in * clk->gpc_pll.N) /
+               (clk->gpc_pll.M * clk->gpc_pll.PL);
+
+       clk->tegra_clk = clk_get_sys("tegra_gk20a", "PLLG_ref");
+       if (IS_ERR_OR_NULL(clk->tegra_clk)) {
+               nvhost_err(dev_from_gk20a(g),
+                       "fail to get tegra ref clk tegra_gk20a/PLLG_ref");
+               return -EINVAL;
+       }
+       return 0;
+}
+
+static int gk20a_init_clk_setup_hw(struct gk20a *g)
+{
+       struct clk_gk20a *clk = &g->clk;
+       u32 data;
+
+       nvhost_dbg_fn("");
+
+       data = gk20a_readl(g, trim_sys_gpc2clk_out_r());
+       data = set_field(data, trim_sys_gpc2clk_out_sdiv14_m(),
+                       trim_sys_gpc2clk_out_sdiv14_indiv4_mode_f());
+       gk20a_writel(g, trim_sys_gpc2clk_out_r(), data);
+
+       return clk_program_gpc_pll(g, clk);
+}
+
+int gk20a_init_clk_support(struct gk20a *g, bool reinit)
+{
+       struct clk_gk20a *clk = &g->clk;
+       int err;
+
+       nvhost_dbg_fn("");
+
+       clk->g = g;
+
+       err = gk20a_init_clk_reset_enable_hw(g);
+       if (err)
+               return err;
+
+       err = gk20a_init_clk_setup_sw(g, reinit);
+       if (err)
+               return err;
+
+       err = gk20a_init_clk_setup_hw(g);
+       if (err)
+               return err;
+
+       /* TBD: remove this below when we export it to therm/edp.
+          Added here just for coverage */
+       gk20a_clk_set_rate(g, 780 /* MHz */);
+
+       return err;
+}
+
+/* TBD: interface to change clock and dvfs in one function */
+int gk20a_clk_set_rate(struct gk20a *g, u32 rate)
+{
+       struct clk_gk20a *clk = &g->clk;
+       struct clk *tegra_clk = clk->tegra_clk;
+       /* save old freq for compare and recover */
+       u32 freq = clk->gpc_pll.freq;
+       int err = 0;
+
+       nvhost_dbg_fn("");
+
+       if (rate == freq)
+               return 0;
+
+       /* gpc_pll.freq is changed to new value here */
+       err = clk_config_pll(clk, &clk->gpc_pll, &gpc_pll_params,
+                       &rate, true);
+       if (err)
+               goto clean_up;
+
+       /* raise freq, call dvfs first to raise voltage */
+       if (rate > freq) {
+               err = tegra_dvfs_set_rate(tegra_clk, rate * MHz);
+               if (err)
+                       goto clean_up;
+       }
+
+       err = clk_program_gpc_pll(g, clk);
+       if (err)
+               goto clean_up;
+
+       /* lower freq, call dvfs after to lower voltage */
+       if (rate < freq) {
+               err = tegra_dvfs_set_rate(tegra_clk, rate * MHz);
+               if (err)
+                       goto clean_up;
+       }
+
+clean_up:
+       /* Just report the error without restoring the PLL, since dvfs may
+          already have changed the voltage even when it returns an error. */
+       if (err)
+               nvhost_err(dev_from_gk20a(g),
+                       "failed to set rate to @ %d", rate);
+       return err;
+}
diff --git a/drivers/video/tegra/host/gk20a/clk_gk20a.h b/drivers/video/tegra/host/gk20a/clk_gk20a.h
new file mode 100644 (file)
index 0000000..ec56a68
--- /dev/null
@@ -0,0 +1,58 @@
+/*
+ * drivers/video/tegra/host/gk20a/clk_gk20a.h
+ *
+ * GK20A Graphics
+ *
+ * Copyright (c) 2011 - 2012, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#ifndef _NVHOST_CLK_GK20A_H_
+#define _NVHOST_CLK_GK20A_H_
+
+enum {
+       /* only one PLL for gk20a */
+       GK20A_GPC_PLL = 0,
+};
+
+struct pll {
+       u32 id;
+       u32 clk_in;     /* MHz */
+       u32 M;
+       u32 N;
+       u32 PL;
+       u32 freq;       /* MHz */
+};
+
+struct pll_parms {
+       u32 min_freq, max_freq; /* MHz */
+       u32 min_vco, max_vco;   /* MHz */
+       u32 min_u,   max_u;     /* MHz */
+       u32 min_M,   max_M;
+       u32 min_N,   max_N;
+       u32 min_PL,  max_PL;
+};
+
+struct clk_gk20a {
+       struct gk20a *g;
+       struct clk *tegra_clk;
+       struct pll gpc_pll;
+       u32 pll_delay; /* default PLL settle time */
+};
+
+int gk20a_init_clk_support(struct gk20a *g, bool reinit);
+
+int gk20a_clk_set_rate(struct gk20a *g, u32 rate);
+
+#endif /* _NVHOST_CLK_GK20A_H_ */
diff --git a/drivers/video/tegra/host/gk20a/debug_gk20a.c b/drivers/video/tegra/host/gk20a/debug_gk20a.c
new file mode 100644 (file)
index 0000000..cd3b428
--- /dev/null
@@ -0,0 +1,37 @@
+/*
+ * drivers/video/tegra/host/t20/debug_gk20a.c
+ *
+ * Copyright (C) 2011 NVIDIA Corporation
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include <linux/io.h>
+
+#include "../dev.h"
+#include "../debug.h"
+#include "../nvhost_cdma.h"
+
+#include "gk20a.h"
+
+
+void gk20a_debug_show_channel_cdma(struct nvhost_master *m,
+       struct nvhost_channel *ch, struct output *o, int chid)
+{
+}
+
+void gk20a_debug_show_channel_fifo(struct nvhost_master *m,
+       struct nvhost_channel *ch, struct output *o, int chid)
+{
+}
diff --git a/drivers/video/tegra/host/gk20a/debug_gk20a.h b/drivers/video/tegra/host/gk20a/debug_gk20a.h
new file mode 100644 (file)
index 0000000..cd67099
--- /dev/null
@@ -0,0 +1,22 @@
+/*
+ * drivers/video/tegra/host/gk20a/debug_gk20a.h
+ *
+ * Copyright (C) 2011 NVIDIA Corporation
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+void gk20a_debug_show_channel_cdma(struct nvhost_master *m,
+                               struct nvhost_channel *ch,
+                               struct output *o, int chid);
+void gk20a_debug_show_channel_fifo(struct nvhost_master *m,
+                               struct nvhost_channel *ch,
+                               struct output *o, int chid);
diff --git a/drivers/video/tegra/host/gk20a/fifo_gk20a.c b/drivers/video/tegra/host/gk20a/fifo_gk20a.c
new file mode 100644 (file)
index 0000000..a822409
--- /dev/null
@@ -0,0 +1,663 @@
+/*
+ * drivers/video/tegra/host/gk20a/fifo_gk20a.c
+ *
+ * GK20A Graphics FIFO (gr host)
+ *
+ * Copyright (c) 2011, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/nvmap.h>
+
+#include "../dev.h"
+#include "../nvhost_as.h"
+
+#include "gk20a.h"
+#include "hw_fifo_gk20a.h"
+#include "hw_pbdma_gk20a.h"
+#include "hw_ccsr_gk20a.h"
+#include "hw_ram_gk20a.h"
+#include "hw_proj_gk20a.h"
+#include "hw_top_gk20a.h"
+#include "hw_mc_gk20a.h"
+
+static int init_engine_info_gk20a(struct fifo_gk20a *f)
+{
+       struct fifo_engine_info_gk20a *gr_info;
+       const u32 gr_sw_id = ENGINE_GR_GK20A;
+       u32 i;
+       u32 max_info_entries = top_device_info__size_1_v();
+
+       nvhost_dbg_fn("");
+
+       /* all we really care about finding is the graphics entry    */
+       /* especially early on in sim it probably thinks it has more */
+       f->num_engines = 1;
+
+       gr_info = f->engine_info + gr_sw_id;
+
+       gr_info->sw_id = gr_sw_id;
+       gr_info->name = "gr";
+       gr_info->dev_info_id = top_device_info_type_enum_graphics_v();
+       gr_info->mmu_fault_id = fifo_intr_mmu_fault_eng_id_graphics_v();
+       gr_info->runlist_id = ~0;
+       gr_info->pbdma_id   = ~0;
+       gr_info->engine_id  = ~0;
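+       /* ~0 marks ids not yet discovered from the device info table below */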
+
+       for (i = 0; i < max_info_entries; i++) {
+               u32 table_entry = gk20a_readl(f->g, top_device_info_r(i));
+               u32 entry = top_device_info_entry_v(table_entry);
+               u32 engine_enum = top_device_info_type_enum_v(table_entry);
+               u32 table_entry2 = 0;
+
+               if (entry == top_device_info_entry_not_valid_v())
+                       continue;
+
+               if (top_device_info_chain_v(table_entry) ==
+                   top_device_info_chain_enable_v()) {
+
+                       table_entry2 = gk20a_readl(f->g,
+                                                  top_device_info_r(++i));
+
+                       engine_enum = top_device_info_type_enum_v(table_entry2);
+               }
+
+               if (entry == top_device_info_entry_enum_v() &&
+                   engine_enum == gr_info->dev_info_id) {
+                       int pbdma_id;
+                       u32 runlist_bit;
+
+                       gr_info->runlist_id =
+                               top_device_info_runlist_enum_v(table_entry);
+                       nvhost_dbg_info("gr info: runlist_id %d", gr_info->runlist_id);
+
+                       gr_info->engine_id =
+                               top_device_info_engine_enum_v(table_entry);
+                       nvhost_dbg_info("gr info: engine_id %d", gr_info->engine_id);
+
+                       runlist_bit = 1 << gr_info->runlist_id;
+
+                       for (pbdma_id = 0; pbdma_id < f->num_pbdma; pbdma_id++) {
+                               nvhost_dbg_info("gr info: pbdma_map[%d]=%d",
+                                       pbdma_id, f->pbdma_map[pbdma_id]);
+                               if (f->pbdma_map[pbdma_id] & runlist_bit)
+                                       break;
+                       }
+
+                       if (pbdma_id == f->num_pbdma) {
+                               nvhost_dbg(dbg_err, "busted pbdma map");
+                               return -EINVAL;
+                       }
+                       gr_info->pbdma_id = pbdma_id;
+
+                       break;
+               }
+       }
+
+       if (gr_info->runlist_id == ~0) {
+               nvhost_dbg(dbg_err, "busted device info");
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+static void gk20a_remove_fifo_support(struct fifo_gk20a *f)
+{
+       struct mem_mgr *memmgr = mem_mgr_from_g(f->g);
+
+       nvhost_dbg_fn("");
+
+       if (f->channel) {
+               int c;
+               for (c = 0; c < f->num_channels; c++) {
+                       if (f->channel[c].remove_support)
+                               f->channel[c].remove_support(f->channel+c);
+               }
+               kfree(f->channel);
+               f->channel = 0;
+       }
+
+       mem_op().munmap(f->userd.mem.ref, f->userd.cpu_va);
+       mem_op().unpin(memmgr, f->userd.mem.ref);
+       mem_op().put(memmgr, f->userd.mem.ref);
+       memset(&f->userd, 0, sizeof(struct userd_desc));
+
+       kfree(f->pbdma_map);
+       f->pbdma_map = NULL;
+
+       kfree(f->engine_info);
+       f->engine_info = NULL;
+}
+
+static int fifo_gk20a_init_runlist(struct gk20a *g, struct fifo_gk20a *f)
+{
+       struct mem_mgr *memmgr = mem_mgr_from_g(g);
+       struct fifo_engine_info_gk20a *engine_info;
+       struct fifo_runlist_info_gk20a *runlist;
+       u32 engine_id;
+       u32 runlist_id;
+       u32 i;
+       u64 runlist_size;
+
+       nvhost_dbg_fn("");
+
+       f->max_runlists = fifo_eng_runlist_base__size_1_v();
+       f->runlist_info = kzalloc(sizeof(struct fifo_runlist_info_gk20a) *
+                                 f->max_runlists, GFP_KERNEL);
+       if (!f->runlist_info)
+               goto clean_up;
+
+       for (engine_id = 0; engine_id < ENGINE_INVAL_GK20A; engine_id++) {
+               engine_info = f->engine_info + engine_id;
+               runlist_id = engine_info->runlist_id;
+               runlist = &f->runlist_info[runlist_id];
+
+               runlist->active_channels =
+                       kzalloc((f->num_channels /
+                               (sizeof(unsigned long) * BITS_PER_BYTE)) + 1,
+                               GFP_KERNEL);
+               if (!runlist->active_channels)
+                       goto clean_up;
+
+               runlist_size  = ram_rl_entry_size_v() * f->num_channels;
+               for (i = 0; i < MAX_RUNLIST_BUFFERS; i++) {
+                       runlist->mem[i].ref =
+                               mem_op().alloc(memmgr, runlist_size,
+                                           DEFAULT_NVMAP_ALLOC_ALIGNMENT,
+                                           DEFAULT_NVMAP_ALLOC_FLAGS,
+                                           NVMAP_HEAP_CARVEOUT_GENERIC);
+                       if (!runlist->mem[i].ref)
+                               goto clean_up;
+                       runlist->mem[i].size = runlist_size;
+               }
+               mutex_init(&runlist->mutex);
+               init_waitqueue_head(&runlist->runlist_wq);
+       }
+
+       return 0;
+
+clean_up:
+       nvhost_dbg_fn("fail");
+       for (engine_id = 0; engine_id < ENGINE_INVAL_GK20A; engine_id++) {
+               engine_info = f->engine_info + engine_id;
+               runlist_id = engine_info->runlist_id;
+               runlist = &f->runlist_info[runlist_id];
+
+               for (i = 0; i < MAX_RUNLIST_BUFFERS; i++)
+                       mem_op().put(memmgr,
+                                  runlist->mem[i].ref);
+
+               kfree(runlist->active_channels);
+       }
+
+       kfree(f->runlist_info);
+       f->runlist_info = NULL;
+
+       return -ENOMEM;
+}
+
+static int gk20a_init_fifo_reset_enable_hw(struct gk20a *g)
+{
+       u32 pmc_enable;
+       u32 intr_stall;
+       u32 mask;
+       u32 timeout;
+       int i;
+
+       nvhost_dbg_fn("");
+
+       /* enable pmc pfifo */
+       pmc_enable = gk20a_readl(g, mc_enable_r());
+       pmc_enable &= ~mc_enable_pfifo_enabled_f();
+       pmc_enable &= ~mc_enable_ce2_enabled_f();
+       pmc_enable &= ~mc_enable_priv_ring_enabled_f();
+       gk20a_writel(g, mc_enable_r(), pmc_enable);
+
+       pmc_enable = gk20a_readl(g, mc_enable_r());
+       pmc_enable |= mc_enable_pfifo_enabled_f();
+       pmc_enable |= mc_enable_ce2_enabled_f();
+       pmc_enable |= mc_enable_priv_ring_enabled_f();
+       gk20a_writel(g, mc_enable_r(), pmc_enable);
+       gk20a_readl(g, mc_enable_r());
+
+       /* enable pbdma */
+       mask = 0;
+       for (i = 0; i < proj_host_num_pbdma_v(); ++i)
+               mask |= mc_enable_pb_sel_f(mc_enable_pb_0_enabled_v(), i);
+       gk20a_writel(g, mc_enable_pb_r(), mask);
+
+       /* enable pfifo interrupt */
+       gk20a_writel(g, fifo_intr_0_r(), 0xFFFFFFFF);
+       gk20a_writel(g, fifo_intr_en_0_r(), 0xFFFFFFFF); /* TBD: alternative intr tree*/
+       gk20a_writel(g, fifo_intr_en_1_r(), 0xFFFFFFFF); /* TBD: alternative intr tree*/
+
+       /* enable pbdma interrupt */
+       mask = 0;
+       for (i = 0; i < proj_host_num_pbdma_v(); i++) {
+               intr_stall = gk20a_readl(g, pbdma_intr_stall_r(i));
+               intr_stall &= ~pbdma_intr_stall_lbreq_enabled_f();
+               gk20a_writel(g, pbdma_intr_stall_r(i), intr_stall);
+               gk20a_writel(g, pbdma_intr_0_r(i), 0xFFFFFFFF);
+               gk20a_writel(g, pbdma_intr_en_0_r(i),
+                       (~0) & ~pbdma_intr_en_0_lbreq_enabled_f());
+               gk20a_writel(g, pbdma_intr_1_r(i), 0xFFFFFFFF);
+               gk20a_writel(g, pbdma_intr_en_1_r(i), 0xFFFFFFFF);
+       }
+
+       /* TBD: apply overrides */
+
+       /* TBD: BLCG prod */
+
+       /* reset runlist interrupts */
+       gk20a_writel(g, fifo_intr_runlist_r(), ~0);
+
+       /* TBD: do we need those? */
+       timeout = gk20a_readl(g, fifo_fb_timeout_r());
+       timeout = set_field(timeout, fifo_fb_timeout_period_m(),
+                       fifo_fb_timeout_period_max_f());
+       gk20a_writel(g, fifo_fb_timeout_r(), timeout);
+
+       timeout = gk20a_readl(g, fifo_pb_timeout_r());
+       timeout &= ~fifo_pb_timeout_detection_enabled_f();
+       gk20a_writel(g, fifo_pb_timeout_r(), timeout);
+
+       gk20a_reset_priv_ring(g);
+
+       nvhost_dbg_fn("done");
+
+       return 0;
+}
+
+static int gk20a_init_fifo_setup_sw(struct gk20a *g, bool reinit)
+{
+       struct mem_mgr *memmgr = mem_mgr_from_g(g);
+       struct fifo_gk20a *f = &g->fifo;
+       int chid, i, err;
+
+       nvhost_dbg_fn("");
+
+       if (reinit) {
+               nvhost_dbg_fn("skip init");
+               return 0;
+       }
+
+       f->g = g;
+
+       f->num_channels = ccsr_channel__size_1_v();
+       f->num_pbdma = proj_host_num_pbdma_v();
+       f->max_engines = ENGINE_INVAL_GK20A;
+
+       f->userd_entry_size = 1 << ram_userd_base_shift_v();
+       f->userd_total_size = f->userd_entry_size * f->num_channels;
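+       /* one fixed-size USERD slot per channel; per-channel cpu/bar1
+        * pointers into this region are derived further below */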
+
+       f->userd.mem.ref = mem_op().alloc(memmgr, f->userd_total_size,
+                                      DEFAULT_NVMAP_ALLOC_ALIGNMENT,
+                                      DEFAULT_NVMAP_ALLOC_FLAGS,
+                                      NVMAP_HEAP_CARVEOUT_GENERIC);
+       if (IS_ERR_OR_NULL(f->userd.mem.ref)) {
+               err = -ENOMEM;
+               goto clean_up;
+       }
+
+       f->userd.cpu_va = mem_op().mmap(f->userd.mem.ref);
+       /* f->userd.cpu_va = g->bar1; */
+       if (IS_ERR_OR_NULL(f->userd.cpu_va)) {
+               f->userd.cpu_va = NULL;
+               err = -ENOMEM;
+               goto clean_up;
+       }
+
+       f->userd.cpu_pa = mem_op().pin(memmgr, f->userd.mem.ref);
+       nvhost_dbg_info("userd physical address : 0x%08x",
+                  (u32)f->userd.cpu_pa);
+
+       if (f->userd.cpu_pa == -EINVAL ||
+           f->userd.cpu_pa == -EINTR) {
+               f->userd.cpu_pa = 0;
+               err = -ENOMEM;
+               goto clean_up;
+       }
+
+       /* bar1 va */
+       f->userd.gpu_va = g->mm.bar1.vm.map(&g->mm.bar1.vm,
+                                           memmgr,
+                                           f->userd.mem.ref,
+                                           4096,
+                                           NVHOST_MAP_BUFFER_FLAGS_CACHABLE_FALSE,
+                                           0);
+       nvhost_dbg_info("userd bar1 va = 0x%llx", f->userd.gpu_va);
+
+       f->userd.mem.size = f->userd_total_size;
+
+       f->channel = kzalloc(f->num_channels * sizeof(*f->channel),
+                               GFP_KERNEL);
+       f->pbdma_map = kzalloc(f->num_pbdma * sizeof(*f->pbdma_map),
+                               GFP_KERNEL);
+       f->engine_info = kzalloc(f->max_engines * sizeof(*f->engine_info),
+                               GFP_KERNEL);
+
+       if (!(f->channel && f->pbdma_map && f->engine_info)) {
+               err = -ENOMEM;
+               goto clean_up;
+       }
+
+       /* pbdma map needs to be in place before calling engine info init */
+       for (i = 0; i < f->num_pbdma; ++i)
+               f->pbdma_map[i] = gk20a_readl(g, fifo_pbdma_map_r(i));
+
+       err = init_engine_info_gk20a(f);
+       if (err)
+               goto clean_up;
+
+       err = fifo_gk20a_init_runlist(g, f);
+       if (err)
+               goto clean_up;
+
+       for (chid = 0; chid < f->num_channels; chid++) {
+               f->channel[chid].userd_cpu_va =
+                       f->userd.cpu_va + chid * f->userd_entry_size;
+               f->channel[chid].userd_cpu_pa =
+                       f->userd.cpu_pa + chid * f->userd_entry_size;
+               f->channel[chid].userd_gpu_va =
+                       f->userd.gpu_va + chid * f->userd_entry_size;
+
+               gk20a_init_channel_support(g, chid);
+       }
+       mutex_init(&f->ch_inuse_mutex);
+
+       f->remove_support = gk20a_remove_fifo_support;
+
+       nvhost_dbg_fn("done");
+       return 0;
+
+clean_up:
+       nvhost_dbg_fn("fail");
+       mem_op().munmap(f->userd.mem.ref, f->userd.cpu_va);
+       mem_op().unpin(memmgr, f->userd.mem.ref);
+       mem_op().put(memmgr, f->userd.mem.ref);
+       memset(&f->userd, 0, sizeof(struct userd_desc));
+
+       kfree(f->channel);
+       f->channel = NULL;
+       kfree(f->pbdma_map);
+       f->pbdma_map = NULL;
+       kfree(f->engine_info);
+       f->engine_info = NULL;
+
+       return err;
+}
+
+static void gk20a_fifo_handle_runlist_event(struct gk20a *g)
+{
+       struct fifo_gk20a *f = &g->fifo;
+       struct fifo_runlist_info_gk20a *runlist;
+       unsigned long runlist_event;
+       u32 runlist_id;
+
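+       /* read and acknowledge all pending runlist events, then wake any
+        * waiters blocked on the affected runlists */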
+       runlist_event = gk20a_readl(g, fifo_intr_runlist_r());
+       gk20a_writel(g, fifo_intr_runlist_r(), runlist_event);
+
+       for_each_set_bit(runlist_id, &runlist_event, f->max_runlists) {
+               runlist = &f->runlist_info[runlist_id];
+               wake_up(&runlist->runlist_wq);
+       }
+}
+
+static int gk20a_init_fifo_setup_hw(struct gk20a *g)
+{
+       struct fifo_gk20a *f = &g->fifo;
+
+       nvhost_dbg_fn("");
+
+       /* test write, read through bar1 @ userd region before
+        * turning on the snooping */
+       {
+               u32 v, v1 = 0x33, v2 = 0x55;
+
+               u32 bar1_vaddr = f->userd.gpu_va;
+               volatile u32 *cpu_vaddr = f->userd.cpu_va;
+
+               nvhost_dbg_info("test bar1 @ vaddr 0x%x",
+                          bar1_vaddr);
+
+               v = gk20a_bar1_readl(g, bar1_vaddr);
+
+               *cpu_vaddr = v1;
+               smp_mb();
+
+               if (v1 != gk20a_bar1_readl(g, bar1_vaddr)) {
+                       nvhost_err(dev_from_gk20a(g), "bar1 broken @ gk20a!");
+                       return -EINVAL;
+               }
+
+               gk20a_bar1_writel(g, bar1_vaddr, v2);
+
+               if (v2 != gk20a_bar1_readl(g, bar1_vaddr)) {
+                       nvhost_err(dev_from_gk20a(g), "bar1 broken @ gk20a!");
+                       return -EINVAL;
+               }
+
+               /* is it visible to the cpu? */
+               if (*cpu_vaddr != v2) {
+                       nvhost_err(dev_from_gk20a(g),
+                               "cpu didn't see bar1 write @ %p!",
+                               cpu_vaddr);
+                       return -EINVAL;
+               }
+
+               /* put it back */
+               gk20a_bar1_writel(g, bar1_vaddr, v);
+       }
+
+       /*XXX all manner of flushes and caching worries, etc */
+
+       /* set the base for the userd region now */
+       gk20a_writel(g, fifo_bar1_base_r(),
+                       fifo_bar1_base_ptr_f(f->userd.gpu_va >> 12) |
+                       fifo_bar1_base_valid_true_f());
+
+       nvhost_dbg_fn("done");
+
+       return 0;
+}
+
+int gk20a_init_fifo_support(struct gk20a *g, bool reinit)
+{
+       int err;
+
+       err = gk20a_init_fifo_reset_enable_hw(g);
+       if (err)
+               return err;
+
+       err = gk20a_init_fifo_setup_sw(g, reinit);
+       if (err)
+               return err;
+
+       err = gk20a_init_fifo_setup_hw(g);
+       if (err)
+               return err;
+
+       return err;
+}
+
+static void gk20a_fifo_handle_mmu_fault(struct gk20a *g)
+{
+       u32 fault_id = gk20a_readl(g, fifo_intr_mmu_fault_id_r());
+       u32 fault_info;
+       u32 engine_id;
+
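+       /* fault_id is a bitmask with one pending bit per engine; decode and
+        * log the fault info for each engine that faulted */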
+       for (engine_id = 0;
+            engine_id < fifo_intr_mmu_fault_id_field__size_1_v();
+            engine_id++) {
+               if ((fault_id & (1 << engine_id)) ==
+                   fifo_intr_mmu_fault_id_field_not_pending_v())
+                       continue;
+
+               fault_info = gk20a_readl(g,
+                       fifo_intr_mmu_fault_info_r(engine_id));
+
+               nvhost_err(dev_from_gk20a(g), "mmu fault on engine %d, "
+                       "engine_subid %d, client %d, "
+                       "addr 0x%08x:0x%08x, type %d, info 0x%08x\n",
+                       engine_id,
+                       fifo_intr_mmu_fault_info_engine_subid_v(fault_info),
+                       fifo_intr_mmu_fault_info_client_v(fault_info),
+                       fifo_intr_mmu_fault_hi_r(engine_id),
+                       fifo_intr_mmu_fault_lo_r(engine_id),
+                       fifo_intr_mmu_fault_info_type_v(fault_info),
+                       fault_info);
+
+               /* don't clear it yet */
+               /* gk20a_writel(g, fifo_intr_mmu_fault_id_r(), fault_id); */
+       }
+}
+
+void gk20a_fifo_isr(struct gk20a *g)
+{
+       u32 fifo_intr = gk20a_readl(g, fifo_intr_0_r());
+
+       /* handle runlist update */
+       if (fifo_intr & fifo_intr_0_runlist_event_pending_f()) {
+               gk20a_fifo_handle_runlist_event(g);
+               fifo_intr &= ~fifo_intr_0_runlist_event_pending_f();
+       }
+
+       /* don't clear this for now
+        * print more info for debugging */
+       if (fifo_intr & fifo_intr_0_sched_error_pending_f()) {
+               nvhost_err(dev_from_gk20a(g),
+                       "fifo sched error : 0x%08x",
+                       gk20a_readl(g, fifo_intr_sched_error_r()));
+       }
+
+       /* don't clear this for now
+        * print more info for debugging */
+       if (fifo_intr & fifo_intr_0_mmu_fault_pending_f())
+               gk20a_fifo_handle_mmu_fault(g);
+
+       if (fifo_intr)
+               nvhost_err(dev_from_gk20a(g),
+                          "unhandled fifo interrupt 0x%08x\n",
+                          fifo_intr);
+}
+
+int gk20a_fifo_preempt_channel(struct gk20a *g, u32 runlist_id, u32 hw_chid)
+{
+       struct fifo_gk20a *f = &g->fifo;
+       struct fifo_runlist_info_gk20a *runlist = &f->runlist_info[runlist_id];
+       u32 timeout = 2000; /* 2 sec */
+       u32 ret = 0;
+
+       mutex_lock(&runlist->mutex);
+
+       /* issue preempt */
+       gk20a_writel(g, fifo_preempt_r(),
+               fifo_preempt_chid_f(hw_chid) |
+               fifo_preempt_type_channel_f());
+
+       /* wait for preempt */
+       do {
+               if (!(gk20a_readl(g, fifo_preempt_r()) &
+                       fifo_preempt_pending_true_f()))
+                       break;
+
+               if (--timeout == 0) {
+                       nvhost_err(dev_from_gk20a(g),
+                                   "preempt channel %d timeout\n",
+                                   hw_chid);
+                       ret = -EBUSY;
+                       break;
+               }
+               mdelay(1);
+       } while (1);
+
+       mutex_unlock(&runlist->mutex);
+
+       return ret;
+}
+
+int gk20a_fifo_enable_engine_activity(struct gk20a *g,
+                               struct fifo_engine_info_gk20a *eng_info)
+{
+       u32 enable = gk20a_readl(g, fifo_sched_disable_r());
+       /* clear the per-runlist disable bit for this engine's runlist */
+       enable &= ~(fifo_sched_disable_true_v() << eng_info->runlist_id);
+       gk20a_writel(g, fifo_sched_disable_r(), enable);
+
+       /* no buffered-mode ? */
+
+       return 0;
+}
+
+int gk20a_fifo_disable_engine_activity(struct gk20a *g,
+                               struct fifo_engine_info_gk20a *eng_info,
+                               bool wait_for_idle)
+{
+       u32 gr_stat, pbdma_stat, chan_stat, eng_stat, ctx_stat;
+       u32 pbdma_chid = ~0, engine_chid = ~0, disable;
+       int err;
+
+       gr_stat =
+               gk20a_readl(g, fifo_engine_status_r(eng_info->engine_id));
+       if (fifo_engine_status_engine_v(gr_stat) ==
+           fifo_engine_status_engine_busy_v() && !wait_for_idle)
+               return -EBUSY;
+
+       disable = gk20a_readl(g, fifo_sched_disable_r());
+       disable = set_field(disable,
+                       fifo_sched_disable_runlist_m(eng_info->runlist_id),
+                       fifo_sched_disable_runlist_f(fifo_sched_disable_true_v(),
+                               eng_info->runlist_id));
+       gk20a_writel(g, fifo_sched_disable_r(), disable);
+
+       /* no buffered-mode ? */
+
+       /* chid from pbdma status */
+       pbdma_stat = gk20a_readl(g, fifo_pbdma_status_r(eng_info->pbdma_id));
+       chan_stat  = fifo_pbdma_status_chan_status_v(pbdma_stat);
+       if (chan_stat == fifo_pbdma_status_chan_status_valid_v() ||
+           chan_stat == fifo_pbdma_status_chan_status_chsw_save_v())
+               pbdma_chid = fifo_pbdma_status_id_v(pbdma_stat);
+       else if (chan_stat == fifo_pbdma_status_chan_status_chsw_load_v() ||
+                chan_stat == fifo_pbdma_status_chan_status_chsw_switch_v())
+               pbdma_chid = fifo_pbdma_status_next_id_v(pbdma_stat);
+
+       if (pbdma_chid != ~0) {
+               err = gk20a_fifo_preempt_channel(g,
+                               eng_info->runlist_id, pbdma_chid);
+               if (err)
+                       goto clean_up;
+       }
+
+       /* chid from engine status */
+       eng_stat = gk20a_readl(g, fifo_engine_status_r(eng_info->engine_id));
+       ctx_stat  = fifo_engine_status_ctx_status_v(eng_stat);
+       if (ctx_stat == fifo_engine_status_ctx_status_valid_v() ||
+           ctx_stat == fifo_engine_status_ctx_status_ctxsw_save_v())
+               engine_chid = fifo_engine_status_id_v(eng_stat);
+       else if (ctx_stat == fifo_engine_status_ctx_status_ctxsw_load_v() ||
+                ctx_stat == fifo_engine_status_ctx_status_ctxsw_switch_v())
+               engine_chid = fifo_engine_status_next_id_v(eng_stat);
+
+       if (engine_chid != ~0 && engine_chid != pbdma_chid) {
+               err = gk20a_fifo_preempt_channel(g,
+                               eng_info->runlist_id, engine_chid);
+               if (err)
+                       goto clean_up;
+       }
+
+       return 0;
+
+clean_up:
+       gk20a_fifo_enable_engine_activity(g, eng_info);
+       return err;
+}
diff --git a/drivers/video/tegra/host/gk20a/fifo_gk20a.h b/drivers/video/tegra/host/gk20a/fifo_gk20a.h
new file mode 100644 (file)
index 0000000..fe80441
--- /dev/null
@@ -0,0 +1,95 @@
+/*
+ * drivers/video/tegra/host/gk20a/fifo_gk20a.h
+ *
+ * GK20A graphics fifo (gr host)
+ *
+ * Copyright (c) 2011, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#ifndef __FIFO_GK20A_H__
+#define __FIFO_GK20A_H__
+
+#include "channel_gk20a.h"
+
+#define MAX_RUNLIST_BUFFERS    2
+
+struct fifo_runlist_info_gk20a {
+       unsigned long *active_channels;
+       /* Each engine has its own SW and HW runlist buffer.*/
+       struct mem_desc mem[MAX_RUNLIST_BUFFERS];
+       u32  cur_buffer;
+       u32  total_entries;
+       bool stopped;
+       bool support_tsg;
+       struct mutex mutex; /* protect channel preempt and runlist update */
+       wait_queue_head_t runlist_wq;
+};
+
+/* so far gk20a has two engines: gr and ce2(gr_copy) */
+enum {
+       ENGINE_GR_GK20A     = 0,
+       ENGINE_CE2_GK20A    = 1,
+       ENGINE_INVAL_GK20A
+};
+
+struct fifo_engine_info_gk20a {
+       u32 sw_id;
+       const char *name;
+       u32 dev_info_id;
+       u32 engine_id;
+       u32 runlist_id;
+       u32 pbdma_id;
+       u32 mmu_fault_id;
+       u32 rc_mask;
+};
+
+struct fifo_gk20a {
+       struct gk20a *g;
+       int num_channels;
+
+       int num_pbdma;
+       u32 *pbdma_map;
+
+       struct fifo_engine_info_gk20a *engine_info;
+       u32 max_engines;
+       u32 num_engines;
+
+       struct fifo_runlist_info_gk20a *runlist_info;
+       u32 max_runlists;
+
+       struct userd_desc userd;
+       u32 userd_entry_size;
+       u32 userd_total_size;
+
+       struct channel_gk20a *channel;
+       struct mutex ch_inuse_mutex; /* protect unused chid look up */
+
+       void (*remove_support)(struct fifo_gk20a *);
+};
+
+int gk20a_init_fifo_support(struct gk20a *g, bool reinit);
+
+void gk20a_fifo_isr(struct gk20a *g);
+
+int gk20a_fifo_preempt_channel(struct gk20a *g,
+                       u32 runlist_id, u32 hw_chid);
+
+int gk20a_fifo_enable_engine_activity(struct gk20a *g,
+                       struct fifo_engine_info_gk20a *eng_info);
+int gk20a_fifo_disable_engine_activity(struct gk20a *g,
+                       struct fifo_engine_info_gk20a *eng_info,
+                       bool wait_for_idle);
+
+#endif /*__FIFO_GK20A_H__*/
diff --git a/drivers/video/tegra/host/gk20a/gk20a.c b/drivers/video/tegra/host/gk20a/gk20a.c
new file mode 100644 (file)
index 0000000..97c2073
--- /dev/null
@@ -0,0 +1,780 @@
+/*
+ * drivers/video/tegra/host/gk20a/gk20a.c
+ *
+ * GK20A Graphics
+ *
+ * Copyright (c) 2011-2012, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/highmem.h>
+#include <linux/cdev.h>
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/export.h>
+
+#include "dev.h"
+#include "class_ids.h"
+#include "bus_client.h"
+#include "nvhost_as.h"
+
+#include "gk20a.h"
+#include "hw_mc_gk20a.h"
+#include "hw_sim_gk20a.h"
+
+#include "../../../../../../arch/arm/mach-tegra/iomap.h"
+
+#define APBDEV_PMC_GPU_RG_CNTRL_0      0x448
+
+static void __iomem *pmc = IO_ADDRESS(TEGRA_PMC_BASE);
+
+static void nvhost_gk20a_init(struct nvhost_device *dev);
+static void nvhost_gk20a_deinit(struct nvhost_device *dev);
+static struct nvhost_hwctx_handler *
+    nvhost_gk20a_alloc_hwctx_handler(u32 syncpt, u32 base,
+                                    struct nvhost_channel *ch);
+
+/* TBD: should be able to put in the list below. */
+static struct resource gk20a_intr = {
+       .start = TEGRA_GK20A_INTR,
+       .end   = TEGRA_GK20A_INTR_NONSTALL,
+       .flags = IORESOURCE_IRQ,
+};
+
+struct resource gk20a_resources[] = {
+#define GK20A_BAR0_IORESOURCE_MEM 0
+{
+       .start = TEGRA_GK20A_BAR0_BASE,
+       .end   = TEGRA_GK20A_BAR0_BASE + TEGRA_GK20A_BAR0_SIZE - 1,
+       .flags = IORESOURCE_MEM,
+},
+#define GK20A_BAR1_IORESOURCE_MEM 1
+{
+       .start = TEGRA_GK20A_BAR1_BASE,
+       .end   = TEGRA_GK20A_BAR1_BASE + TEGRA_GK20A_BAR1_SIZE - 1,
+       .flags = IORESOURCE_MEM,
+},
+#if CONFIG_GK20A_SIM
+#define GK20A_SIM_IORESOURCE_MEM 2
+{
+#define TEGRA_GK20A_SIM_BASE 0x538F0000 /*tbd: should come from iomap.h or its replacement */
+#define TEGRA_GK20A_SIM_SIZE 0x1000     /*tbd: this is a high-side guess */
+       .start = TEGRA_GK20A_SIM_BASE,
+       .end   = TEGRA_GK20A_SIM_BASE + TEGRA_GK20A_SIM_SIZE - 1,
+       .flags = IORESOURCE_MEM,
+},
+#endif
+};
+
+struct nvhost_device gk20a_device = {
+       .name          = "gk20a",
+       /* the following are set by the platform (e.g. t124) support
+       .syncpts       = BIT(NVSYNCPT_3D),
+       .waitbases     = BIT(NVWAITBASE_3D),
+       .modulemutexes = BIT(NVMODMUTEX_3D),
+       */
+       .class         = NV_GRAPHICS_GPU_CLASS_ID,
+       .keepalive     = true,
+       .clocks = {{"emc", UINT_MAX}, {}},
+       NVHOST_MODULE_NO_POWERGATE_IDS,
+       NVHOST_DEFAULT_CLOCKGATE_DELAY,
+       .alloc_hwctx_handler = nvhost_gk20a_alloc_hwctx_handler,
+       .moduleid      = NVHOST_MODULE_GPU,
+#if CONFIG_GK20A_SIM
+       .num_resources = 3, /* this is num ioresource_mem, not the sum */
+#else
+       .num_resources = 2, /* this is num ioresource_mem, not the sum */
+#endif
+       .resource = gk20a_resources,
+};
+
+
+
+#if CONFIG_GK20A_SIM
+static inline void sim_writel(struct gk20a *g, u32 r, u32 v)
+{
+       writel(v, g->sim.regs+r);
+}
+static inline u32 sim_readl(struct gk20a *g, u32 r)
+{
+       return readl(g->sim.regs+r);
+}
+
+static void kunmap_and_free_iopage(void **kvaddr, struct page **page)
+{
+       if (*kvaddr) {
+               /* kunmap() takes the page that was kmap()ed, not the va */
+               kunmap(*page);
+               *kvaddr = NULL;
+       }
+       if (*page) {
+               __free_page(*page);
+               *page = NULL;
+       }
+}
+
+static void gk20a_free_sim_support(struct gk20a *g)
+{
+       /* free sim mappings, bfrs */
+       kunmap_and_free_iopage(&g->sim.send_bfr.kvaddr,
+                              &g->sim.send_bfr.page);
+
+       kunmap_and_free_iopage(&g->sim.recv_bfr.kvaddr,
+                              &g->sim.recv_bfr.page);
+
+       kunmap_and_free_iopage(&g->sim.msg_bfr.kvaddr,
+                              &g->sim.msg_bfr.page);
+}
+
+static void gk20a_remove_sim_support(struct sim_gk20a *s)
+{
+       struct gk20a *g = s->g;
+       if (g->sim.regs)
+               sim_writel(g, sim_config_r(), sim_config_mode_disabled_v());
+       gk20a_free_sim_support(g);
+}
+
+static int alloc_and_kmap_iopage(struct device *d,
+                                void **kvaddr,
+                                phys_addr_t *phys,
+                                struct page **page)
+{
+       int err = 0;
+       *page = alloc_page(GFP_KERNEL);
+
+       if (!*page) {
+               err = -ENOMEM;
+               dev_err(d, "couldn't allocate io page\n");
+               goto fail;
+       }
+
+       *kvaddr = kmap(*page);
+       if (!*kvaddr) {
+               err = -ENOMEM;
+               dev_err(d, "couldn't kmap io page\n");
+               goto fail;
+       }
+       *phys = page_to_phys(*page);
+       return 0;
+
+ fail:
+       kunmap_and_free_iopage(kvaddr, page);
+       return err;
+
+}
+/* TBD: strip from released */
+static int gk20a_init_sim_support(struct nvhost_device *dev)
+{
+       int err = 0;
+       struct gk20a *g = get_gk20a(dev);
+       struct device *d = &dev->dev;
+       phys_addr_t phys;
+
+       g->sim.g = g;
+       g->sim.regs = dev->aperture[GK20A_SIM_IORESOURCE_MEM];
+       if (!g->sim.regs) {
+               dev_err(d, "failed to remap gk20a sim regs\n");
+               err = -ENXIO;
+               goto fail;
+       }
+
+       /* allocate sim event/msg buffers */
+       err = alloc_and_kmap_iopage(d, &g->sim.send_bfr.kvaddr,
+                                   &g->sim.send_bfr.phys,
+                                   &g->sim.send_bfr.page);
+
+       err = err || alloc_and_kmap_iopage(d, &g->sim.recv_bfr.kvaddr,
+                                          &g->sim.recv_bfr.phys,
+                                          &g->sim.recv_bfr.page);
+
+       err = err || alloc_and_kmap_iopage(d, &g->sim.msg_bfr.kvaddr,
+                                          &g->sim.msg_bfr.phys,
+                                          &g->sim.msg_bfr.page);
+
+       if (!(g->sim.send_bfr.kvaddr && g->sim.recv_bfr.kvaddr &&
+             g->sim.msg_bfr.kvaddr)) {
+               dev_err(d, "couldn't allocate all sim buffers\n");
+               err = -ENOMEM;
+               goto fail;
+       }
+
+       /*mark send ring invalid*/
+       sim_writel(g, sim_send_ring_r(), sim_send_ring_status_invalid_f());
+
+       /*read the get pointer and make put equal to it*/
+       g->sim.send_ring_put = sim_readl(g, sim_send_get_r());
+       sim_writel(g, sim_send_put_r(), g->sim.send_ring_put);
+
+       /*write send ring address and make it valid*/
+       /*TBD: work for >32b physmem*/
+       BUILD_BUG_ON(sizeof(phys_addr_t) != sizeof(u32));
+       phys = g->sim.send_bfr.phys;
+       sim_writel(g, sim_send_ring_hi_r(), 0);
+       sim_writel(g, sim_send_ring_r(),
+                  sim_send_ring_status_valid_f() |
+                  sim_send_ring_target_phys_pci_coherent_f() |
+                  sim_send_ring_size_4kb_f() |
+                  sim_send_ring_addr_lo_f(phys >> PAGE_SHIFT));
+
+       /*repeat for recv ring (but swap put,get as roles are opposite) */
+       sim_writel(g, sim_recv_ring_r(), sim_recv_ring_status_invalid_f());
+
+       /*read the put pointer and make get equal to it*/
+       g->sim.recv_ring_get = sim_readl(g, sim_recv_put_r());
+       sim_writel(g, sim_recv_get_r(), g->sim.recv_ring_get);
+
+       /*write recv ring address and make it valid*/
+       /*TBD: work for >32b physmem*/
+       BUILD_BUG_ON(sizeof(phys_addr_t) != sizeof(u32));
+       phys = g->sim.recv_bfr.phys;
+       sim_writel(g, sim_recv_ring_hi_r(), 0);
+       sim_writel(g, sim_recv_ring_r(),
+                  sim_recv_ring_status_valid_f() |
+                  sim_recv_ring_target_phys_pci_coherent_f() |
+                  sim_recv_ring_size_4kb_f() |
+                  sim_recv_ring_addr_lo_f(phys >> PAGE_SHIFT));
+
+       g->sim.remove_support = gk20a_remove_sim_support;
+       return 0;
+
+ fail:
+       gk20a_free_sim_support(g);
+       return err;
+}
+
+static inline u32 sim_msg_header_size(void)
+{
+       return 24; /*TBD: fix the header to get this from NV_VGPU_MSG_HEADER*/
+}
+
+static inline u32 *sim_msg_bfr(struct gk20a *g, u32 byte_offset)
+{
+       return (u32 *)(g->sim.msg_bfr.kvaddr + byte_offset);
+}
+
+static inline u32 *sim_msg_hdr(struct gk20a *g, u32 byte_offset)
+{
+       return sim_msg_bfr(g, byte_offset); /*starts at 0*/
+}
+
+static inline u32 *sim_msg_param(struct gk20a *g, u32 byte_offset)
+{
+       /*starts after msg header/cmn*/
+       return sim_msg_bfr(g, byte_offset + sim_msg_header_size());
+}
+
+static inline void sim_write_hdr(struct gk20a *g, u32 func, u32 size)
+{
+       /*memset(g->sim.msg_bfr.kvaddr,0,min(PAGE_SIZE,size));*/
+       *sim_msg_hdr(g, sim_msg_signature_r()) = sim_msg_signature_valid_v();
+       *sim_msg_hdr(g, sim_msg_result_r())    = sim_msg_result_rpc_pending_v();
+       *sim_msg_hdr(g, sim_msg_spare_r())     = sim_msg_spare__init_v();
+       *sim_msg_hdr(g, sim_msg_function_r())  = func;
+       *sim_msg_hdr(g, sim_msg_length_r())    = size + sim_msg_header_size();
+}
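+
+/*
+ * Message buffer layout, as used above: a header of sim_msg_header_size()
+ * bytes, addressed via the sim_msg_*_r() offsets, followed by the
+ * function-specific parameters accessed through sim_msg_param().  The
+ * 24-byte header size is provisional (see the TBD in
+ * sim_msg_header_size()).
+ */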
+
+static inline u32 sim_escape_read_hdr_size(void)
+{
+       return 12; /*TBD: fix NV_VGPU_SIM_ESCAPE_READ_HEADER*/
+}
+static u32 *sim_send_ring_bfr(struct gk20a *g, u32 byte_offset)
+{
+       return (u32 *)(g->sim.send_bfr.kvaddr + byte_offset);
+}
+static int rpc_send_message(struct gk20a *g)
+{
+       /* calculations done in units of u32s */
+       u32 send_base = sim_send_put_pointer_v(g->sim.send_ring_put) * 2;
+       u32 dma_offset = send_base + sim_dma_r()/sizeof(u32);
+       u32 dma_hi_offset = send_base + sim_dma_hi_r()/sizeof(u32);
+
+       *sim_send_ring_bfr(g, dma_offset*sizeof(u32)) =
+               sim_dma_target_phys_pci_coherent_f() |
+               sim_dma_status_valid_f() |
+               sim_dma_size_4kb_f() |
+               sim_dma_addr_lo_f(g->sim.msg_bfr.phys >> PAGE_SHIFT);
+
+       *sim_send_ring_bfr(g, dma_hi_offset*sizeof(u32)) = 0; /*TBD >32b phys*/
+
+       *sim_msg_hdr(g, sim_msg_sequence_r()) = g->sim.sequence_base++;
+
+       g->sim.send_ring_put = (g->sim.send_ring_put + 2 * sizeof(u32)) %
+               PAGE_SIZE;
+
+       /* Update the put pointer. This will trap into the host. */
+       sim_writel(g, sim_send_put_r(), g->sim.send_ring_put);
+
+       return 0;
+}
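+
+/*
+ * Each send ring entry written above is a two-word DMA descriptor
+ * (sim_dma_r()/sim_dma_hi_r()) pointing at the shared message page, so
+ * the put pointer advances by 2 * sizeof(u32) per message and wraps at
+ * PAGE_SIZE, matching the 4kB ring programmed in gk20a_init_sim_support().
+ */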
+
+static inline u32 *sim_recv_ring_bfr(struct gk20a *g, u32 byte_offset)
+{
+       return (u32 *)(g->sim.recv_bfr.kvaddr + byte_offset);
+}
+
+static int rpc_recv_poll(struct gk20a *g)
+{
+       phys_addr_t recv_phys_addr;
+
+       /* XXX This read is not required (?) */
+       /*pVGpu->recv_ring_get = VGPU_REG_RD32(pGpu, NV_VGPU_RECV_GET);*/
+
+       /* Poll the recv ring put pointer until it moves past our get pointer */
+       do {
+               g->sim.recv_ring_put = sim_readl(g, sim_recv_put_r());
+       } while (g->sim.recv_ring_put == g->sim.recv_ring_get);
+
+       /* process all replies */
+       while (g->sim.recv_ring_put != g->sim.recv_ring_get) {
+               /* these are in u32 offsets*/
+               u32 dma_lo_offset =
+                       sim_recv_put_pointer_v(g->sim.recv_ring_get)*2 + 0;
+               /*u32 dma_hi_offset = dma_lo_offset + 1;*/
+               u32 recv_phys_addr_lo = sim_dma_addr_lo_v(*sim_recv_ring_bfr(g, dma_lo_offset*4));
+
+               /*u32 recv_phys_addr_hi = sim_dma_hi_addr_v(
+                     (phys_addr_t)sim_recv_ring_bfr(g,dma_hi_offset*4));*/
+
+               /*TBD >32b phys addr */
+               recv_phys_addr = recv_phys_addr_lo << PAGE_SHIFT;
+
+               if (recv_phys_addr != g->sim.msg_bfr.phys) {
+                       dev_err(&g->dev->dev, "%s Error in RPC reply\n",
+                               __func__);
+                       return -1;
+               }
+
+               /* Update GET pointer */
+               g->sim.recv_ring_get = (g->sim.recv_ring_get + 2*sizeof(u32)) %
+                       PAGE_SIZE;
+
+               sim_writel(g, sim_recv_get_r(), g->sim.recv_ring_get);
+
+               g->sim.recv_ring_put = sim_readl(g, sim_recv_put_r());
+       }
+
+       return 0;
+}
+
+
+static int issue_rpc_and_wait(struct gk20a *g)
+{
+       int err;
+
+       err = rpc_send_message(g);
+       if (err) {
+               dev_err(&g->dev->dev, "%s failed rpc_send_message\n",
+                       __func__);
+               return err;
+       }
+
+       err = rpc_recv_poll(g);
+       if (err) {
+               dev_err(&g->dev->dev, "%s failed rpc_recv_poll\n", __func__);
+               return err;
+       }
+
+       /* Now check if RPC really succeeded */
+       if (*sim_msg_hdr(g, sim_msg_result_r()) != sim_msg_result_success_v()) {
+               dev_err(&g->dev->dev, "%s received failed status!\n",
+                       __func__);
+               return -(*sim_msg_hdr(g, sim_msg_result_r()));
+       }
+       return 0;
+}
+
+int gk20a_sim_esc_read(struct gk20a *g, char *path, u32 index,
+                      u32 count, u32 *data)
+{
+       int err;
+       size_t pathlen = strlen(path);
+       u32 data_offset;
+
+       sim_write_hdr(g, sim_msg_function_sim_escape_read_v(),
+                     sim_escape_read_hdr_size());
+       *sim_msg_param(g, 0) = index;
+       *sim_msg_param(g, 4) = count;
+       data_offset = roundup(0xc +  pathlen + 1, sizeof(u32));
+       *sim_msg_param(g, 8) = data_offset;
+       strcpy((char *)sim_msg_param(g, 0xc), path);
+
+       err = issue_rpc_and_wait(g);
+
+       if (!err)
+               memcpy(data, sim_msg_param(g, data_offset), count);
+       return err;
+}
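+
+/*
+ * Illustrative usage only (not part of this patch; the escape path name
+ * below is a hypothetical example):
+ *
+ *     u32 val;
+ *     int ret = gk20a_sim_esc_read(g, "GRCTX_SIZE", 0, sizeof(val), &val);
+ *     if (ret)
+ *             dev_err(dev_from_gk20a(g),
+ *                     "sim escape read failed: %d\n", ret);
+ */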
+
+
+#else /*CONFIG_GK20A_SIM*/
+static inline int gk20a_init_sim_support(struct nvhost_device *dev)
+{
+       return 0;
+}
+static inline void gk20a_remove_sim_support(struct nvhost_device *dev)
+{
+}
+#endif /*!CONFIG_GK20A_SIM*/
+
+static irqreturn_t gk20a_intr_isr(int irq, void *dev_id)
+{
+       struct gk20a *g = dev_id;
+       u32 mc_intr_0 = gk20a_readl(g, mc_intr_0_r());
+
+       /* not from gpu when sharing irq with others */
+       if (unlikely(!mc_intr_0))
+               return IRQ_NONE;
+
+       gk20a_writel(g, mc_intr_en_0_r(),
+               mc_intr_en_0_inta_disabled_f());
+
+       /* flush previous write */
+       gk20a_readl(g, mc_intr_en_0_r());
+
+       return IRQ_WAKE_THREAD;
+}
+
+static irqreturn_t gk20a_intr_thread(int irq, void *dev_id)
+{
+       struct gk20a *g = dev_id;
+       u32 mc_intr_0;
+       u32 loop = 0;
+
+       nvhost_dbg(dbg_intr, "interrupt thread launched");
+
+       mc_intr_0 = gk20a_readl(g, mc_intr_0_r());
+
+       /* loop as many as three more times in case new interrupts come up
+        * while we're processing current ones */
+       while (mc_intr_0 && loop++ < 3) {
+               if (mc_intr_0 & mc_intr_0_pgraph_pending_f())
+                       gk20a_gr_isr(g);
+               if (mc_intr_0 & mc_intr_0_pfifo_pending_f())
+                       gk20a_fifo_isr(g);
+               if (mc_intr_0 & mc_intr_0_pmu_pending_f())
+                       gk20a_pmu_isr(g);
+               if (mc_intr_0 & mc_intr_0_priv_ring_pending_f())
+                       gk20a_priv_ring_isr(g);
+               mc_intr_0 = gk20a_readl(g, mc_intr_0_r());
+       }
+
+       if (mc_intr_0)
+               nvhost_dbg_info("leaving isr with interrupt pending 0x%08x",
+                               mc_intr_0);
+
+       gk20a_writel(g, mc_intr_en_0_r(),
+               mc_intr_en_0_inta_hardware_f());
+
+       /* flush previous write */
+       gk20a_readl(g, mc_intr_en_0_r());
+
+       return IRQ_HANDLED;
+}
+
+
+
+static void gk20a_remove_support(struct nvhost_device *dev)
+{
+       struct gk20a *g = get_gk20a(dev);
+
+       if (g->fifo.remove_support)
+               g->fifo.remove_support(&g->fifo);
+
+       if (g->sim.remove_support)
+               g->sim.remove_support(&g->sim);
+
+       if (g->irq_requested) {
+               free_irq(gk20a_intr.start, g);
+               g->irq_requested = false;
+       }
+
+       /* free mappings to registers, etc*/
+
+       if (g->regs) {
+               iounmap(g->regs);
+               g->regs = NULL;
+       }
+}
+
+int nvhost_init_gk20a_support(struct nvhost_device *dev)
+{
+       int err = 0;
+
+       struct gk20a *g = get_gk20a(dev);
+
+       g->regs = dev->aperture[GK20A_BAR0_IORESOURCE_MEM];
+       if (!g->regs) {
+               dev_err(&g->dev->dev, "failed to remap gk20a registers\n");
+               err = -ENXIO;
+               goto fail;
+       }
+
+       g->bar1 = dev->aperture[GK20A_BAR1_IORESOURCE_MEM];
+       if (!g->bar1) {
+               dev_err(&g->dev->dev, "failed to remap gk20a bar1\n");
+               err = -ENXIO;
+               goto fail;
+       }
+
+       err = request_threaded_irq(gk20a_intr.start,
+                       gk20a_intr_isr, gk20a_intr_thread,
+                       0, "gk20a", g);
+       if (err) {
+               dev_err(&g->dev->dev, "failed to request stall interrupt irq @ %d\n",
+                       gk20a_intr.start);
+               goto fail;
+       }
+       g->irq_requested = true;
+
+       /* remove gk20a clamp in t124 soc register */
+       writel(0, pmc + APBDEV_PMC_GPU_RG_CNTRL_0);
+
+       gk20a_writel(g, mc_intr_en_1_r(),
+               mc_intr_en_1_inta_disabled_f());
+
+       gk20a_writel(g, mc_intr_en_0_r(),
+               mc_intr_en_0_inta_hardware_f());
+
+       err = gk20a_init_sim_support(dev);
+       if (err)
+               goto fail;
+
+       err = gk20a_init_clk_support(g, false);
+       if (err)
+               goto fail;
+
+       err = gk20a_init_mm_support(g, false);
+       if (err)
+               goto fail;
+
+       err = gk20a_init_fifo_support(g, false);
+       if (err)
+               goto fail;
+
+       err = gk20a_init_therm_support(g, false);
+       if (err)
+               goto fail;
+
+       /* init_gr & init_pmu are deferred */
+
+       g->remove_support = gk20a_remove_support;
+       return 0;
+
+ fail:
+       gk20a_remove_support(dev);
+       return err;
+}
+
+static void nvhost_gk20a_init(struct nvhost_device *dev)
+{
+       struct gk20a *g = get_gk20a(dev);
+       int err;
+
+       BUG_ON(!g);
+       nvhost_dbg_fn("");
+
+       err = gk20a_init_gr_support(g, false);
+       if (err)
+               nvhost_err(&dev->dev, "failed init gk20a gr support\n");
+
+       err = gk20a_init_pmu_support(g, false);
+       if (err)
+               nvhost_err(&dev->dev, "failed init gk20a pmu support\n");
+}
+static void nvhost_gk20a_deinit(struct nvhost_device *dev)
+{
+       struct gk20a *g = get_gk20a(dev);
+       nvhost_dbg_fn("");
+
+       if (g && g->remove_support)
+               g->remove_support(dev);
+       set_gk20a(dev, 0);
+       kfree(g);
+}
+
+
+static void gk20a_free_hwctx(struct kref *ref)
+{
+       struct nvhost_hwctx *ctx = container_of(ref, struct nvhost_hwctx, ref);
+       nvhost_dbg_fn("");
+
+       gk20a_free_channel(ctx);
+       kfree(ctx);
+}
+
+
+static struct nvhost_hwctx *gk20a_alloc_hwctx(struct nvhost_hwctx_handler *h,
+                                             struct nvhost_channel *ch)
+{
+       struct nvhost_hwctx *ctx;
+       nvhost_dbg_fn("");
+
+       /* it seems odd to be allocating a channel here but the
+        * t20/t30 notion of a channel is mapped on top of gk20a's
+        * channel.  this works because there is only one module
+        * under gk20a's host (gr).
+        */
+       ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+       if (!ctx)
+               return NULL;
+
+       kref_init(&ctx->ref);
+       ctx->h = h;
+       ctx->channel = ch;
+
+       return gk20a_open_channel(ch, ctx);
+}
+
+static void gk20a_get_hwctx(struct nvhost_hwctx *hwctx)
+{
+       nvhost_dbg_fn("");
+       kref_get(&hwctx->ref);
+}
+
+static void gk20a_put_hwctx(struct nvhost_hwctx *hwctx)
+{
+       nvhost_dbg_fn("");
+       kref_put(&hwctx->ref, gk20a_free_hwctx);
+}
+
+static void gk20a_save_push_hwctx(struct nvhost_hwctx *ctx,
+                                 struct nvhost_cdma *cdma)
+{
+       nvhost_dbg_fn("");
+}
+
+static void gk20a_save_service_hwctx(struct nvhost_hwctx *ctx)
+{
+       nvhost_dbg_fn("");
+}
+
+
+static struct nvhost_hwctx_handler *
+    nvhost_gk20a_alloc_hwctx_handler(u32 syncpt, u32 waitbase,
+                                    struct nvhost_channel *ch)
+{
+       struct nvhost_hwctx_handler *h;
+       nvhost_dbg_fn("");
+
+       h = kmalloc(sizeof(*h), GFP_KERNEL);
+       if (!h)
+               return NULL;
+
+       h->alloc = gk20a_alloc_hwctx;
+       h->get   = gk20a_get_hwctx;
+       h->put   = gk20a_put_hwctx;
+       h->save_push = gk20a_save_push_hwctx;
+       h->save_service = gk20a_save_service_hwctx;
+
+       return h;
+}
+
+static int __devinit gk20a_probe(struct nvhost_device *dev,
+               struct nvhost_device_id *id_table)
+{
+       int err;
+       struct gk20a *gk20a;
+
+       nvhost_dbg_fn("");
+
+       err = nvhost_client_device_get_resources(dev);
+       if (err)
+               return err;
+
+       err = nvhost_client_device_init(dev);
+
+       if (err) {
+               nvhost_dbg_fn("failed to init client device for %s",
+                             dev->name);
+               return err;
+       }
+
+       err = nvhost_as_init_device(dev);
+       if (err) {
+               nvhost_dbg_fn("failed to init client address space"
+                             " device for %s", dev->name);
+               return err;
+       }
+
+       nvhost_dbg_fn("allocating gk20a support");
+       gk20a = kzalloc(sizeof(struct gk20a), GFP_KERNEL);
+       if (!gk20a) {
+               dev_err(&dev->dev, "couldn't allocate gk20a support");
+               err = -ENOMEM;
+               goto fail;
+       }
+
+       set_gk20a(dev, gk20a);
+       gk20a->dev = dev;
+       gk20a->host = nvhost_get_host(dev);
+
+       err = nvhost_init_gk20a_support(dev);
+
+       if (err)
+               goto fail;
+
+       return 0;
+
+ fail:
+       dev_err(&dev->dev, "failed: %d", err);
+       return err;
+}
+
+static int __exit gk20a_remove(struct nvhost_device *dev)
+{
+       /* Add clean-up */
+       return 0;
+}
+
+#ifdef CONFIG_PM
+static int gk20a_suspend(struct nvhost_device *dev, pm_message_t state)
+{
+       nvhost_dbg_fn("");
+       return nvhost_client_device_suspend(dev);
+}
+
+static int gk20a_resume(struct nvhost_device *dev)
+{
+       nvhost_dbg_fn("");
+       return 0;
+}
+#endif
+
+static struct nvhost_driver gk20a_driver = {
+       .probe = gk20a_probe,
+       .remove = __exit_p(gk20a_remove),
+       .init = nvhost_gk20a_init,
+       .deinit = nvhost_gk20a_deinit,
+       .alloc_hwctx_handler = nvhost_gk20a_alloc_hwctx_handler,
+#ifdef CONFIG_PM
+       .suspend = gk20a_suspend,
+       .resume = gk20a_resume,
+#endif
+       .driver = {
+               .owner = THIS_MODULE,
+               .name = "gk20a",
+       }
+};
+
+static int __init gk20a_init(void)
+{
+       return nvhost_driver_register(&gk20a_driver);
+}
+
+static void __exit gk20a_exit(void)
+{
+       nvhost_driver_unregister(&gk20a_driver);
+}
+
+module_init(gk20a_init);
+module_exit(gk20a_exit);
diff --git a/drivers/video/tegra/host/gk20a/gk20a.h b/drivers/video/tegra/host/gk20a/gk20a.h
new file mode 100644 (file)
index 0000000..fd477f3
--- /dev/null
@@ -0,0 +1,162 @@
+/*
+ * drivers/video/tegra/host/gk20a/gk20a.h
+ *
+ * GK20A Graphics
+ *
+ * Copyright (c) 2011, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#ifndef _NVHOST_GK20A_H_
+#define _NVHOST_GK20A_H_
+
+
+#if defined(CONFIG_ARCH_TEGRA_12x_SOC) && defined(CONFIG_TEGRA_SIMULATION_PLATFORM)
+#define CONFIG_GK20A_SIM 1
+#else
+#define CONFIG_GK20A_SIM 0
+#endif
+
+struct gk20a;
+struct fifo_gk20a;
+struct channel_gk20a;
+struct gr_gk20a;
+struct sim_gk20a;
+
+#include "clk_gk20a.h"
+#include "fifo_gk20a.h"
+#include "gr_gk20a.h"
+#include "sim_gk20a.h"
+#include "intr_gk20a.h"
+#include "pmu_gk20a.h"
+#include "priv_ring_gk20a.h"
+#include "therm_gk20a.h"
+
+#define get_gk20a(ndev) ((struct gk20a *)(ndev)->dev.platform_data)
+#define set_gk20a(ndev, f) ((ndev)->dev.platform_data = f)
+
+extern struct nvhost_device gk20a_device;
+
+struct gk20a {
+       struct nvhost_master *host;
+       struct nvhost_device *dev;
+
+       struct resource *reg_mem;
+       void __iomem *regs;
+
+       struct resource *bar1_mem;
+       void __iomem *bar1;
+
+       bool first_init;
+       bool irq_requested;
+
+       struct clk_gk20a clk;
+       struct fifo_gk20a fifo;
+       struct gr_gk20a gr;
+       struct sim_gk20a sim;
+       struct mm_gk20a mm;
+       struct pmu_gk20a pmu;
+
+       void (*remove_support)(struct nvhost_device *);
+};
+
+extern const struct nvhost_as_moduleops gk20a_as_moduleops;
+
+/* register accessors */
+static inline void gk20a_writel(struct gk20a *g, u32 r, u32 v)
+{
+       nvhost_dbg(dbg_reg, " r=0x%x v=0x%x", r, v);
+       writel(v, g->regs + r);
+}
+static inline u32 gk20a_readl(struct gk20a *g, u32 r)
+{
+       u32 v = readl(g->regs + r);
+       nvhost_dbg(dbg_reg, " r=0x%x v=0x%x", r, v);
+       return v;
+}
+
+static inline void gk20a_bar1_writel(struct gk20a *g, u32 b, u32 v)
+{
+       nvhost_dbg(dbg_reg, " b=0x%x v=0x%x", b, v);
+       writel(v, g->bar1 + b);
+}
+
+static inline u32 gk20a_bar1_readl(struct gk20a *g, u32 b)
+{
+       u32 v = readl(g->bar1 + b);
+       nvhost_dbg(dbg_reg, " b=0x%x v=0x%x", b, v);
+       return v;
+}
+
+/* convenience */
+static inline struct device *dev_from_gk20a(struct gk20a *g)
+{
+       return &g->dev->dev;
+}
+static inline struct nvhost_syncpt *syncpt_from_gk20a(struct gk20a *g)
+{
+       return &(nvhost_get_host(g->dev)->syncpt);
+}
+static inline struct mem_mgr *mem_mgr_from_g(struct gk20a *g)
+{
+       return nvhost_get_host(g->dev)->memmgr;
+}
+
+static inline u32 u64_hi32(u64 n)
+{
+       return (u32)((n >> 32) & ~(u32)0);
+}
+
+static inline u32 u64_lo32(u64 n)
+{
+       return (u32)(n & ~(u32)0);
+}
+
+static inline u32 set_field(u32 val, u32 mask, u32 field)
+{
+       return ((val & ~mask) | field);
+}
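+
+/*
+ * Illustrative use of set_field(); the some_reg_r()/some_field_m()/
+ * some_field_f() names below are placeholders for the generated accessors
+ * in the hw_*_gk20a.h headers, not specific registers:
+ *
+ *     u32 v = gk20a_readl(g, some_reg_r());
+ *     v = set_field(v, some_field_m(), some_field_f(1));
+ *     gk20a_writel(g, some_reg_r(), v);
+ */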
+
+/* invalidate channel lookup tlb */
+static inline void gk20a_gr_flush_channel_tlb(struct gr_gk20a *gr)
+{
+       memset(gr->chid_tlb, 0,
+               sizeof(struct gr_channel_map_tlb_entry) *
+               GR_CHANNEL_MAP_TLB_SIZE);
+}
+
+/* classes that the device supports */
+/* TBD: get these from an open-sourced SDK? */
+enum {
+       KEPLER_C                  = 0xA297,
+       FERMI_TWOD_A              = 0x902D,
+       KEPLER_COMPUTE_A          = 0xA0C0,
+       KEPLER_INLINE_TO_MEMORY_A = 0xA040,
+       KEPLER_DMA_COPY_A         = 0xA0B5, /*not sure about this one*/
+};
+
+/* TBD: these should come from tegra iomap.h and/or be in the device resources */
+#define TEGRA_GK20A_BAR0_BASE  0x57000000
+#define TEGRA_GK20A_BAR0_SIZE  0x01000000
+#define TEGRA_GK20A_BAR1_BASE  0x58000000
+#define TEGRA_GK20A_BAR1_SIZE  0x01000000
+
+#if defined(CONFIG_TEGRA_GK20A_PMU)
+static inline int support_gk20a_pmu(void) { return 1; }
+#else
+static inline int support_gk20a_pmu(void) { return 0; }
+#endif
+
+
+#endif /* _NVHOST_GK20A_H_ */
diff --git a/drivers/video/tegra/host/gk20a/gk20a_gating_reglist.h b/drivers/video/tegra/host/gk20a/gk20a_gating_reglist.h
new file mode 100644 (file)
index 0000000..4032d1e
--- /dev/null
@@ -0,0 +1,269 @@
+/*
+ * drivers/video/tegra/host/gk20a/gk20a_gating_reglist.h
+ *
+ * Copyright (c) 2012, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ *  This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * This file is autogenerated.  Do not edit.
+ */
+
+#ifndef __gk20a_gating_reglist_h__
+#define __gk20a_gating_reglist_h__
+
+struct gating_desc {
+       u32 addr;
+       u32 prod;
+       u32 disable;
+};
+
+/* slcg gr */
+const struct gating_desc gk20a_slcg_gr[] = {
+       {.addr = 0x004041f4, .prod = 0x00000000, .disable = 0x03fffffe},
+       {.addr = 0x00409894, .prod = 0x00000000, .disable = 0x0003fffe},
+       {.addr = 0x004078c4, .prod = 0x00000000, .disable = 0x000001fe},
+       {.addr = 0x00406004, .prod = 0x00000000, .disable = 0x0001fffe},
+       {.addr = 0x00405864, .prod = 0x00000000, .disable = 0x000001fe},
+       {.addr = 0x00405910, .prod = 0x00000000, .disable = 0xfffffffe},
+       {.addr = 0x00408044, .prod = 0x00000000, .disable = 0x000007fe},
+       {.addr = 0x00407004, .prod = 0x00000000, .disable = 0x0000001e},
+       {.addr = 0x0041a894, .prod = 0x00000000, .disable = 0x0003fffe},
+       {.addr = 0x00418504, .prod = 0x00000000, .disable = 0x0001fffe},
+       {.addr = 0x0041860c, .prod = 0x00000000, .disable = 0x000001fe},
+       {.addr = 0x0041868c, .prod = 0x00000000, .disable = 0x0000001e},
+       {.addr = 0x0041871c, .prod = 0x00000000, .disable = 0x0000003e},
+       {.addr = 0x00418388, .prod = 0x00000000, .disable = 0x00000001},
+       {.addr = 0x0041882c, .prod = 0x00000000, .disable = 0x0001fffe},
+       {.addr = 0x00418bc0, .prod = 0x00000000, .disable = 0x000001fe},
+       {.addr = 0x00418974, .prod = 0x00000000, .disable = 0x0001fffe},
+       {.addr = 0x00418c74, .prod = 0x00000000, .disable = 0xfffffffe},
+       {.addr = 0x00418cf4, .prod = 0x00000000, .disable = 0xfffffffe},
+       {.addr = 0x00418d74, .prod = 0x00000000, .disable = 0xfffffffe},
+       {.addr = 0x00418f10, .prod = 0x00000000, .disable = 0xfffffffe},
+       {.addr = 0x00418e10, .prod = 0x00000000, .disable = 0xfffffffe},
+       {.addr = 0x00419024, .prod = 0x00000000, .disable = 0x000001fe},
+       {.addr = 0x00419a44, .prod = 0x00000000, .disable = 0x0000000e},
+       {.addr = 0x00419a4c, .prod = 0x00000000, .disable = 0x000001fe},
+       {.addr = 0x00419a54, .prod = 0x00000000, .disable = 0x0000003e},
+       {.addr = 0x00419a5c, .prod = 0x00000000, .disable = 0x0000000e},
+       {.addr = 0x00419a64, .prod = 0x00000000, .disable = 0x000001fe},
+       {.addr = 0x00419a6c, .prod = 0x00000000, .disable = 0x0000000e},
+       {.addr = 0x00419a74, .prod = 0x00000000, .disable = 0x0000000e},
+       {.addr = 0x00419a7c, .prod = 0x00000000, .disable = 0x0000003e},
+       {.addr = 0x00419a84, .prod = 0x00000000, .disable = 0x0000000e},
+       {.addr = 0x00419ad0, .prod = 0x00000000, .disable = 0x0000000e},
+       {.addr = 0x0041986c, .prod = 0x00000000, .disable = 0x00fffffe},
+       {.addr = 0x00419cd8, .prod = 0x00000000, .disable = 0x001ffffe},
+       {.addr = 0x00419ce0, .prod = 0x00000000, .disable = 0x001ffffe},
+       {.addr = 0x00419c74, .prod = 0x00000000, .disable = 0x0000001e},
+       {.addr = 0x00419fd4, .prod = 0x00000000, .disable = 0x0003fffe},
+       {.addr = 0x00419fdc, .prod = 0x00000000, .disable = 0xfffffffe},
+       {.addr = 0x00419fe4, .prod = 0x00000000, .disable = 0x0000000e},
+       {.addr = 0x00419ff4, .prod = 0x00000000, .disable = 0x00003ffe},
+       {.addr = 0x00419ffc, .prod = 0x00000000, .disable = 0x0001fffe},
+       {.addr = 0x0041be2c, .prod = 0x00000000, .disable = 0xfffffffe},
+       {.addr = 0x0041bfec, .prod = 0x00000000, .disable = 0xfffffffe},
+       {.addr = 0x0041bed4, .prod = 0x00000000, .disable = 0xfffffffe},
+       {.addr = 0x00408814, .prod = 0x00000000, .disable = 0x0001fffe},
+       {.addr = 0x0040881c, .prod = 0x00000000, .disable = 0x0001fffe},
+       {.addr = 0x00408a84, .prod = 0x00000000, .disable = 0x0001fffe},
+       {.addr = 0x00408a8c, .prod = 0x00000000, .disable = 0x0001fffe},
+       {.addr = 0x00408a94, .prod = 0x00000000, .disable = 0x0001fffe},
+       {.addr = 0x00408a9c, .prod = 0x00000000, .disable = 0x0001fffe},
+       {.addr = 0x00408aa4, .prod = 0x00000000, .disable = 0x0001fffe},
+       {.addr = 0x00408aac, .prod = 0x00000000, .disable = 0x0001fffe},
+       {.addr = 0x004089ac, .prod = 0x00000000, .disable = 0x0001fffe},
+       {.addr = 0x00408a24, .prod = 0x00000000, .disable = 0x000001ff},
+};
+
+/* slcg perf */
+const struct gating_desc gk20a_slcg_perf[] = {
+       {.addr = 0x001be018, .prod = 0x000001ff, .disable = 0x00000000},
+       {.addr = 0x001bc018, .prod = 0x000001ff, .disable = 0x00000000},
+       {.addr = 0x001b8018, .prod = 0x000001ff, .disable = 0x00000000},
+       {.addr = 0x001b4124, .prod = 0x00000001, .disable = 0x00000000},
+};
+
+/* blcg gr */
+const struct gating_desc gk20a_blcg_gr[] = {
+       {.addr = 0x004041f0, .prod = 0x00004046, .disable = 0x00000000},
+       {.addr = 0x00409890, .prod = 0x0000007f, .disable = 0x00000000},
+       {.addr = 0x004098b0, .prod = 0x0000007f, .disable = 0x00000000},
+       {.addr = 0x004078c0, .prod = 0x00000042, .disable = 0x00000000},
+       {.addr = 0x00406000, .prod = 0x00004044, .disable = 0x00000000},
+       {.addr = 0x00405860, .prod = 0x00004042, .disable = 0x00000000},
+       {.addr = 0x0040590c, .prod = 0x00004044, .disable = 0x00000000},
+       {.addr = 0x00408040, .prod = 0x00004044, .disable = 0x00000000},
+       {.addr = 0x00407000, .prod = 0x00004041, .disable = 0x00000000},
+       {.addr = 0x00405bf0, .prod = 0x00004044, .disable = 0x00000000},
+       {.addr = 0x0041a890, .prod = 0x0000007f, .disable = 0x00000000},
+       {.addr = 0x0041a8b0, .prod = 0x0000007f, .disable = 0x00000000},
+       {.addr = 0x00418500, .prod = 0x00004044, .disable = 0x00000000},
+       {.addr = 0x00418608, .prod = 0x00004042, .disable = 0x00000000},
+       {.addr = 0x00418688, .prod = 0x00004042, .disable = 0x00000000},
+       {.addr = 0x00418718, .prod = 0x00000042, .disable = 0x00000000},
+       {.addr = 0x00418828, .prod = 0x00000044, .disable = 0x00000000},
+       {.addr = 0x00418bbc, .prod = 0x00004042, .disable = 0x00000000},
+       {.addr = 0x00418970, .prod = 0x00004042, .disable = 0x00000000},
+       {.addr = 0x00418c70, .prod = 0x00004044, .disable = 0x00000000},
+       {.addr = 0x00418cf0, .prod = 0x00004044, .disable = 0x00000000},
+       {.addr = 0x00418d70, .prod = 0x00004044, .disable = 0x00000000},
+       {.addr = 0x00418f0c, .prod = 0x00004044, .disable = 0x00000000},
+       {.addr = 0x00418e0c, .prod = 0x00004044, .disable = 0x00000000},
+       {.addr = 0x00419020, .prod = 0x00004042, .disable = 0x00000000},
+       {.addr = 0x00419038, .prod = 0x00000042, .disable = 0x00000000},
+       {.addr = 0x00419a40, .prod = 0x00004042, .disable = 0x00000000},
+       {.addr = 0x00419a48, .prod = 0x00004042, .disable = 0x00000000},
+       {.addr = 0x00419a50, .prod = 0x00004042, .disable = 0x00000000},
+       {.addr = 0x00419a58, .prod = 0x00004042, .disable = 0x00000000},
+       {.addr = 0x00419a60, .prod = 0x00004042, .disable = 0x00000000},
+       {.addr = 0x00419a68, .prod = 0x00004042, .disable = 0x00000000},
+       {.addr = 0x00419a70, .prod = 0x00004042, .disable = 0x00000000},
+       {.addr = 0x00419a78, .prod = 0x00004042, .disable = 0x00000000},
+       {.addr = 0x00419a80, .prod = 0x00004042, .disable = 0x00000000},
+       {.addr = 0x00419acc, .prod = 0x00004047, .disable = 0x00000000},
+       {.addr = 0x00419868, .prod = 0x00000042, .disable = 0x00000000},
+       {.addr = 0x00419cd4, .prod = 0x00004042, .disable = 0x00000000},
+       {.addr = 0x00419cdc, .prod = 0x00004042, .disable = 0x00000000},
+       {.addr = 0x00419c70, .prod = 0x00004045, .disable = 0x00000000},
+       {.addr = 0x00419fd0, .prod = 0x00004043, .disable = 0x00000000},
+       {.addr = 0x00419fd8, .prod = 0x00004045, .disable = 0x00000000},
+       {.addr = 0x00419fe0, .prod = 0x00004042, .disable = 0x00000000},
+       {.addr = 0x00419fe8, .prod = 0x00004042, .disable = 0x00000000},
+       {.addr = 0x00419ff0, .prod = 0x00004044, .disable = 0x00000000},
+       {.addr = 0x00419ff8, .prod = 0x00004042, .disable = 0x00000000},
+       {.addr = 0x00419f90, .prod = 0x00004042, .disable = 0x00000000},
+       {.addr = 0x0041be28, .prod = 0x00000042, .disable = 0x00000000},
+       {.addr = 0x0041bfe8, .prod = 0x00004044, .disable = 0x00000000},
+       {.addr = 0x0041bed0, .prod = 0x00004044, .disable = 0x00000000},
+       {.addr = 0x00408810, .prod = 0x00004042, .disable = 0x00000000},
+       {.addr = 0x00408818, .prod = 0x00004042, .disable = 0x00000000},
+       {.addr = 0x00408a80, .prod = 0x00004042, .disable = 0x00000000},
+       {.addr = 0x00408a88, .prod = 0x00004042, .disable = 0x00000000},
+       {.addr = 0x00408a90, .prod = 0x00004042, .disable = 0x00000000},
+       {.addr = 0x00408a98, .prod = 0x00004042, .disable = 0x00000000},
+       {.addr = 0x00408aa0, .prod = 0x00004042, .disable = 0x00000000},
+       {.addr = 0x00408aa8, .prod = 0x00004042, .disable = 0x00000000},
+       {.addr = 0x004089a8, .prod = 0x00004042, .disable = 0x00000000},
+       {.addr = 0x004089b0, .prod = 0x00000042, .disable = 0x00000000},
+       {.addr = 0x004089b8, .prod = 0x00004042, .disable = 0x00000000},
+};
+
+/* pg gr */
+const struct gating_desc gk20a_pg_gr[] = {
+       {.addr = 0x004041f8, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x004041fc, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00409898, .prod = 0x10140000, .disable = 0x00000000},
+       {.addr = 0x0040989c, .prod = 0xff00000a, .disable = 0x00000000},
+       {.addr = 0x004078c8, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x004078cc, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00406008, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x0040600c, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00405868, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x0040586c, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00405914, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x00405924, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00408048, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x0040804c, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00407008, .prod = 0x10140000, .disable = 0x00000000},
+       {.addr = 0x0040700c, .prod = 0xff00000a, .disable = 0x00000000},
+       {.addr = 0x00405bf8, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x00405bfc, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x0041a898, .prod = 0x10140000, .disable = 0x00000000},
+       {.addr = 0x0041a89c, .prod = 0xff00000a, .disable = 0x00000000},
+       {.addr = 0x00418510, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x00418514, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00418610, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x00418614, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00418690, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x00418694, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00418720, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x00418724, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00418840, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x00418844, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00418bc4, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x00418bc8, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00418978, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x0041897c, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00418c78, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x00418c7c, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00418cf8, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x00418cfc, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00418d78, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x00418d7c, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00418f14, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x00418f18, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00418e14, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x00418e18, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00419030, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x00419050, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00419a88, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x00419a8c, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00419a90, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x00419a94, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00419a98, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x00419a9c, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00419aa0, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x00419aa4, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00419ad4, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x00419ad8, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00419870, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x00419874, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00419ce4, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x00419cf0, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00419c78, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x00419c7c, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00419fa0, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x00419fa4, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00419fa8, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x00419fac, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00419fb0, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x00419fb4, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00419fb8, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x00419fbc, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00419fc0, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x00419fc4, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00419fc8, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x00419fcc, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x0041be30, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x0041be34, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x0041bff0, .prod = 0x10747c00, .disable = 0x00000000},
+       {.addr = 0x0041bff4, .prod = 0xff00000a, .disable = 0x00000000},
+       {.addr = 0x0041bed8, .prod = 0x10240a00, .disable = 0x00000000},
+       {.addr = 0x0041bee0, .prod = 0xff00000a, .disable = 0x00000000},
+       {.addr = 0x00408820, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x00408824, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00408828, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x0040882c, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00408ac0, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x00408ac4, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00408ac8, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x00408acc, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00408ad0, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x00408ad4, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00408ad8, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x00408adc, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00408ae0, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x00408ae4, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x00408ae8, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x00408aec, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x004089c0, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x004089c4, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x004089c8, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x004089cc, .prod = 0xff00a725, .disable = 0x00000000},
+       {.addr = 0x004089d0, .prod = 0x10940000, .disable = 0x00000000},
+       {.addr = 0x004089d4, .prod = 0xff00a725, .disable = 0x00000000},
+};
+
+#endif /* __gk20a_gating_reglist_h__ */
diff --git a/drivers/video/tegra/host/gk20a/gr_ctx_gk20a.c b/drivers/video/tegra/host/gk20a/gr_ctx_gk20a.c
new file mode 100644 (file)
index 0000000..4bd04d8
--- /dev/null
@@ -0,0 +1,327 @@
+/*
+ * drivers/video/tegra/host/gk20a/gr_ctx_gk20a.c
+ *
+ * GK20A Graphics Context
+ *
+ * Copyright (c) 2011, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <linux/firmware.h>
+
+#include "dev.h"
+#include "bus_client.h"
+
+#include "gk20a.h"
+#include "gr_ctx_gk20a.h"
+#include "hw_gr_gk20a.h"
+
+static int gr_gk20a_alloc_load_netlist_u32(u32 *src, u32 len,
+                       struct u32_list_gk20a *u32_list)
+{
+       u32_list->count = (len + sizeof(u32) - 1) / sizeof(u32);
+       if (!alloc_u32_list_gk20a(u32_list))
+               return -ENOMEM;
+
+       memcpy(u32_list->l, src, len);
+
+       return 0;
+}
+
+static int gr_gk20a_alloc_load_netlist_av(u32 *src, u32 len,
+                       struct av_list_gk20a *av_list)
+{
+       av_list->count = len / sizeof(struct av_gk20a);
+       if (!alloc_av_list_gk20a(av_list))
+               return -ENOMEM;
+
+       memcpy(av_list->l, src, len);
+
+       return 0;
+}
+
+static int gr_gk20a_alloc_load_netlist_aiv(u32 *src, u32 len,
+                       struct aiv_list_gk20a *aiv_list)
+{
+       aiv_list->count = len / sizeof(struct aiv_gk20a);
+       if (!alloc_aiv_list_gk20a(aiv_list))
+               return -ENOMEM;
+
+       memcpy(aiv_list->l, src, len);
+
+       return 0;
+}
+
+static int gr_gk20a_get_netlist_name(int index, char *name)
+{
+       switch (index) {
+#ifdef GK20A_NETLIST_IMAGE_FW_NAME
+       case NETLIST_FINAL:
+               sprintf(name, GK20A_NETLIST_IMAGE_FW_NAME);
+               return 0;
+#endif
+#ifdef GK20A_NETLIST_IMAGE_A
+       case NETLIST_SLOT_A:
+               sprintf(name, GK20A_NETLIST_IMAGE_A);
+               return 0;
+#endif
+#ifdef GK20A_NETLIST_IMAGE_B
+       case NETLIST_SLOT_B:
+               sprintf(name, GK20A_NETLIST_IMAGE_B);
+               return 0;
+#endif
+#ifdef GK20A_NETLIST_IMAGE_C
+       case NETLIST_SLOT_C:
+               sprintf(name, GK20A_NETLIST_IMAGE_C);
+               return 0;
+#endif
+#ifdef GK20A_NETLIST_IMAGE_D
+       case NETLIST_SLOT_D:
+               sprintf(name, GK20A_NETLIST_IMAGE_D);
+               return 0;
+#endif
+       default:
+               return -1;
+       }
+
+       return -1;
+}
+
+int gr_gk20a_init_ctx_vars(struct gk20a *g, struct gr_gk20a *gr)
+{
+       struct device *d = dev_from_gk20a(g);
+       const struct firmware *netlist_fw;
+       struct netlist_image *netlist = NULL;
+       char name[MAX_NETLIST_NAME];
+       u32 i, major_v = ~0, major_v_hw, netlist_num;
+       int net, max, err = -ENOENT;
+
+       nvhost_dbg_fn("");
+
+#ifdef GK20A_NETLIST_IMAGE_FW_NAME
+       net = NETLIST_FINAL;
+       max = 0;
+       major_v_hw = ~0;
+       g->gr.ctx_vars.dynamic = false;
+#else
+       net = NETLIST_SLOT_A;
+       max = MAX_NETLIST;
+       major_v_hw = gk20a_readl(g, gr_fecs_ctx_state_store_major_rev_id_r());
+       g->gr.ctx_vars.dynamic = true;
+#endif
+
+       for (; net < max; net++) {
+
+               if (gr_gk20a_get_netlist_name(net, name) != 0) {
+                       nvhost_warn(d, "invalid netlist index %d", net);
+                       continue;
+               }
+
+               netlist_fw = nvhost_client_request_firmware(g->dev, name);
+               if (IS_ERR_OR_NULL(netlist_fw)) {
+                       nvhost_warn(d, "failed to load netlist %s", name);
+                       continue;
+               }
+
+               netlist = (struct netlist_image *)netlist_fw->data;
+
+               for (i = 0; i < netlist->header.regions; i++) {
+                       u32 *src = (u32 *)((u8 *)netlist + netlist->regions[i].data_offset);
+                       u32 size = netlist->regions[i].data_size;
+
+                       switch (netlist->regions[i].region_id) {
+                       case NETLIST_REGIONID_FECS_UCODE_DATA:
+                               nvhost_dbg_info("NETLIST_REGIONID_FECS_UCODE_DATA");
+                               err = gr_gk20a_alloc_load_netlist_u32(
+                                       src, size, &g->gr.ctx_vars.ucode.fecs.data);
+                               if (err)
+                                       goto clean_up;
+                               break;
+                       case NETLIST_REGIONID_FECS_UCODE_INST:
+                               nvhost_dbg_info("NETLIST_REGIONID_FECS_UCODE_INST");
+                               err = gr_gk20a_alloc_load_netlist_u32(
+                                       src, size, &g->gr.ctx_vars.ucode.fecs.inst);
+                               if (err)
+                                       goto clean_up;
+                               break;
+                       case NETLIST_REGIONID_GPCCS_UCODE_DATA:
+                               nvhost_dbg_info("NETLIST_REGIONID_GPCCS_UCODE_DATA");
+                               err = gr_gk20a_alloc_load_netlist_u32(
+                                       src, size, &g->gr.ctx_vars.ucode.gpccs.data);
+                               if (err)
+                                       goto clean_up;
+                               break;
+                       case NETLIST_REGIONID_GPCCS_UCODE_INST:
+                               nvhost_dbg_info("NETLIST_REGIONID_GPCCS_UCODE_INST");
+                               err = gr_gk20a_alloc_load_netlist_u32(
+                                       src, size, &g->gr.ctx_vars.ucode.gpccs.inst);
+                               if (err)
+                                       goto clean_up;
+                               break;
+                       case NETLIST_REGIONID_SW_BUNDLE_INIT:
+                               nvhost_dbg_info("NETLIST_REGIONID_SW_BUNDLE_INIT");
+                               err = gr_gk20a_alloc_load_netlist_av(
+                                       src, size, &g->gr.ctx_vars.sw_bundle_init);
+                               if (err)
+                                       goto clean_up;
+                               break;
+                       case NETLIST_REGIONID_SW_METHOD_INIT:
+                               nvhost_dbg_info("NETLIST_REGIONID_SW_METHOD_INIT");
+                               err = gr_gk20a_alloc_load_netlist_av(
+                                       src, size, &g->gr.ctx_vars.sw_method_init);
+                               if (err)
+                                       goto clean_up;
+                               break;
+                       case NETLIST_REGIONID_SW_CTX_LOAD:
+                               nvhost_dbg_info("NETLIST_REGIONID_SW_CTX_LOAD");
+                               err = gr_gk20a_alloc_load_netlist_aiv(
+                                       src, size, &g->gr.ctx_vars.sw_ctx_load);
+                               if (err)
+                                       goto clean_up;
+                               break;
+                       case NETLIST_REGIONID_SW_NON_CTX_LOAD:
+                               nvhost_dbg_info("NETLIST_REGIONID_SW_NON_CTX_LOAD");
+                               err = gr_gk20a_alloc_load_netlist_av(
+                                       src, size, &g->gr.ctx_vars.sw_non_ctx_load);
+                               if (err)
+                                       goto clean_up;
+                               break;
+                       case NETLIST_REGIONID_CTXREG_SYS:
+                               nvhost_dbg_info("NETLIST_REGIONID_CTXREG_SYS");
+                               err = gr_gk20a_alloc_load_netlist_aiv(
+                                       src, size, &g->gr.ctx_vars.ctxsw_regs.sys);
+                               if (err)
+                                       goto clean_up;
+                               break;
+                       case NETLIST_REGIONID_CTXREG_GPC:
+                               nvhost_dbg_info("NETLIST_REGIONID_CTXREG_GPC");
+                               err = gr_gk20a_alloc_load_netlist_aiv(
+                                       src, size, &g->gr.ctx_vars.ctxsw_regs.gpc);
+                               if (err)
+                                       goto clean_up;
+                               break;
+                       case NETLIST_REGIONID_CTXREG_TPC:
+                               nvhost_dbg_info("NETLIST_REGIONID_CTXREG_TPC");
+                               err = gr_gk20a_alloc_load_netlist_aiv(
+                                       src, size, &g->gr.ctx_vars.ctxsw_regs.tpc);
+                               if (err)
+                                       goto clean_up;
+                               break;
+                       case NETLIST_REGIONID_CTXREG_ZCULL_GPC:
+                               nvhost_dbg_info("NETLIST_REGIONID_CTXREG_ZCULL_GPC");
+                               err = gr_gk20a_alloc_load_netlist_aiv(
+                                       src, size, &g->gr.ctx_vars.ctxsw_regs.zcull_gpc);
+                               if (err)
+                                       goto clean_up;
+                               break;
+                       case NETLIST_REGIONID_CTXREG_PPC:
+                               nvhost_dbg_info("NETLIST_REGIONID_CTXREG_PPC");
+                               err = gr_gk20a_alloc_load_netlist_aiv(
+                                       src, size, &g->gr.ctx_vars.ctxsw_regs.ppc);
+                               if (err)
+                                       goto clean_up;
+                               break;
+                       case NETLIST_REGIONID_CTXREG_PM_SYS:
+                               nvhost_dbg_info("NETLIST_REGIONID_CTXREG_PM_SYS");
+                               err = gr_gk20a_alloc_load_netlist_aiv(
+                                       src, size, &g->gr.ctx_vars.ctxsw_regs.pm_sys);
+                               if (err)
+                                       goto clean_up;
+                               break;
+                       case NETLIST_REGIONID_CTXREG_PM_GPC:
+                               nvhost_dbg_info("NETLIST_REGIONID_CTXREG_PM_GPC");
+                               err = gr_gk20a_alloc_load_netlist_aiv(
+                                       src, size, &g->gr.ctx_vars.ctxsw_regs.pm_gpc);
+                               if (err)
+                                       goto clean_up;
+                               break;
+                       case NETLIST_REGIONID_CTXREG_PM_TPC:
+                               nvhost_dbg_info("NETLIST_REGIONID_CTXREG_PM_TPC");
+                               err = gr_gk20a_alloc_load_netlist_aiv(
+                                       src, size, &g->gr.ctx_vars.ctxsw_regs.pm_tpc);
+                               if (err)
+                                       goto clean_up;
+                               break;
+                       case NETLIST_REGIONID_BUFFER_SIZE:
+                               g->gr.ctx_vars.buffer_size = *src;
+                               nvhost_dbg_info("NETLIST_REGIONID_BUFFER_SIZE : %d",
+                                       g->gr.ctx_vars.buffer_size);
+                               break;
+                       case NETLIST_REGIONID_CTXSW_REG_BASE_INDEX:
+                               g->gr.ctx_vars.regs_base_index = *src;
+                               nvhost_dbg_info("NETLIST_REGIONID_CTXSW_REG_BASE_INDEX : %d",
+                                       g->gr.ctx_vars.regs_base_index);
+                               break;
+                       case NETLIST_REGIONID_MAJORV:
+                               major_v = *src;
+                               nvhost_dbg_info("NETLIST_REGIONID_MAJORV : %d",
+                                       major_v);
+                               break;
+                       case NETLIST_REGIONID_NETLIST_NUM:
+                               netlist_num = *src;
+                               nvhost_dbg_info("NETLIST_REGIONID_NETLIST_NUM : %d",
+                                       netlist_num);
+                               break;
+                       case NETLIST_REGIONID_CTXREG_PMPPC:
+                               nvhost_warn(d, "NETLIST_REGIONID_CTXREG_PMPPC skipped");
+                               break;
+                       default:
+                               nvhost_warn(d, "unrecognized region %d skipped", i);
+                               break;
+                       }
+               }
+
+               if (net != NETLIST_FINAL && major_v != major_v_hw) {
+                       nvhost_dbg_info("skip %s: major_v 0x%08x doesn't match hw 0x%08x",
+                               name, major_v, major_v_hw);
+                       goto clean_up;
+               }
+
+               g->gr.ctx_vars.valid = true;
+               g->gr.netlist = net;
+
+               release_firmware(netlist_fw);
+               nvhost_dbg_fn("done");
+               goto done;
+
+clean_up:
+               kfree(g->gr.ctx_vars.ucode.fecs.inst.l);
+               kfree(g->gr.ctx_vars.ucode.fecs.data.l);
+               kfree(g->gr.ctx_vars.ucode.gpccs.inst.l);
+               kfree(g->gr.ctx_vars.ucode.gpccs.data.l);
+               kfree(g->gr.ctx_vars.sw_bundle_init.l);
+               kfree(g->gr.ctx_vars.sw_method_init.l);
+               kfree(g->gr.ctx_vars.sw_ctx_load.l);
+               kfree(g->gr.ctx_vars.sw_non_ctx_load.l);
+               kfree(g->gr.ctx_vars.ctxsw_regs.sys.l);
+               kfree(g->gr.ctx_vars.ctxsw_regs.gpc.l);
+               kfree(g->gr.ctx_vars.ctxsw_regs.tpc.l);
+               kfree(g->gr.ctx_vars.ctxsw_regs.zcull_gpc.l);
+               kfree(g->gr.ctx_vars.ctxsw_regs.ppc.l);
+               kfree(g->gr.ctx_vars.ctxsw_regs.pm_sys.l);
+               kfree(g->gr.ctx_vars.ctxsw_regs.pm_gpc.l);
+               kfree(g->gr.ctx_vars.ctxsw_regs.pm_tpc.l);
+               release_firmware(netlist_fw);
+               err = -ENOENT;
+       }
+
+done:
+       if (g->gr.ctx_vars.valid) {
+               nvhost_dbg_info("netlist image %s loaded", name);
+               return 0;
+       } else {
+               nvhost_err(d, "failed to load netlist image!!");
+               return err;
+       }
+}
+
diff --git a/drivers/video/tegra/host/gk20a/gr_ctx_gk20a.h b/drivers/video/tegra/host/gk20a/gr_ctx_gk20a.h
new file mode 100644 (file)
index 0000000..7c60ed4
--- /dev/null
@@ -0,0 +1,150 @@
+/*
+ * drivers/video/tegra/host/gk20a/gr_ctx_gk20a.h
+ *
+ * GK20A Graphics Context
+ *
+ * Copyright (c) 2011, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#ifndef __GR_CTX_GK20A_H__
+#define __GR_CTX_GK20A_H__
+
+
+/* production netlist, one and only one from below */
+#undef GK20A_NETLIST_IMAGE_FW_NAME
+/* emulation netlists, match majorV with HW */
+#define GK20A_NETLIST_IMAGE_A  "NETA_img.bin"
+#define GK20A_NETLIST_IMAGE_B  "NETB_img.bin"
+#undef  GK20A_NETLIST_IMAGE_C
+#undef  GK20A_NETLIST_IMAGE_D
+
+union __max_name {
+#ifdef GK20A_NETLIST_IMAGE_A
+       char __name_a[sizeof(GK20A_NETLIST_IMAGE_A)];
+#endif
+#ifdef GK20A_NETLIST_IMAGE_B
+       char __name_b[sizeof(GK20A_NETLIST_IMAGE_B)];
+#endif
+#ifdef GK20A_NETLIST_IMAGE_C
+       char __name_c[sizeof(GK20A_NETLIST_IMAGE_C)];
+#endif
+#ifdef GK20A_NETLIST_IMAGE_D
+       char __name_d[sizeof(GK20A_NETLIST_IMAGE_D)];
+#endif
+};
+
+#define MAX_NETLIST_NAME sizeof(union __max_name)
+
+/* index for emulation netlists */
+#define NETLIST_FINAL          -1
+#define NETLIST_SLOT_A         0
+#define NETLIST_SLOT_B         1
+#define NETLIST_SLOT_C         2
+#define NETLIST_SLOT_D         3
+#define MAX_NETLIST            4
+
+/* netlist regions */
+#define NETLIST_REGIONID_FECS_UCODE_DATA       0
+#define NETLIST_REGIONID_FECS_UCODE_INST       1
+#define NETLIST_REGIONID_GPCCS_UCODE_DATA      2
+#define NETLIST_REGIONID_GPCCS_UCODE_INST      3
+#define NETLIST_REGIONID_SW_BUNDLE_INIT                4
+#define NETLIST_REGIONID_SW_CTX_LOAD           5
+#define NETLIST_REGIONID_SW_NON_CTX_LOAD       6
+#define NETLIST_REGIONID_SW_METHOD_INIT                7
+#define NETLIST_REGIONID_CTXREG_SYS            8
+#define NETLIST_REGIONID_CTXREG_GPC            9
+#define NETLIST_REGIONID_CTXREG_TPC            10
+#define NETLIST_REGIONID_CTXREG_ZCULL_GPC      11
+#define NETLIST_REGIONID_CTXREG_PM_SYS         12
+#define NETLIST_REGIONID_CTXREG_PM_GPC         13
+#define NETLIST_REGIONID_CTXREG_PM_TPC         14
+#define NETLIST_REGIONID_MAJORV                        15
+#define NETLIST_REGIONID_BUFFER_SIZE           16
+#define NETLIST_REGIONID_CTXSW_REG_BASE_INDEX  17
+#define NETLIST_REGIONID_NETLIST_NUM           18
+#define NETLIST_REGIONID_CTXREG_PPC            19
+#define NETLIST_REGIONID_CTXREG_PMPPC          20
+
+struct netlist_region {
+       u32 region_id;
+       u32 data_size;
+       u32 data_offset;
+};
+
+struct netlist_image_header {
+       u32 version;
+       u32 regions;
+};
+
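+/*
+ * A netlist firmware blob is a header followed by header.regions region
+ * descriptors (regions[1] acts as a variable-length trailer); each
+ * descriptor gives the id, size and offset of that region's data.
+ */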
+struct netlist_image {
+       struct netlist_image_header header;
+       struct netlist_region regions[1];
+};
+
+struct av_gk20a {
+       u32 addr;
+       u32 value;
+};
+struct aiv_gk20a {
+       u32 addr;
+       u32 index;
+       u32 value;
+};
+struct aiv_list_gk20a {
+       struct aiv_gk20a *l;
+       u32 count;
+};
+struct av_list_gk20a {
+       struct av_gk20a *l;
+       u32 count;
+};
+struct u32_list_gk20a {
+       u32 *l;
+       u32 count;
+};
+
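+/*
+ * List allocation helpers: the caller sets ->count first; each helper
+ * kzallocs a matching array and returns NULL on failure, so the result
+ * can be checked directly.
+ */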
+static inline
+struct av_gk20a *alloc_av_list_gk20a(struct av_list_gk20a *avl)
+{
+       avl->l = kzalloc(avl->count * sizeof(*avl->l), GFP_KERNEL);
+       return avl->l;
+}
+
+static inline
+struct aiv_gk20a *alloc_aiv_list_gk20a(struct aiv_list_gk20a *aivl)
+{
+       aivl->l = kzalloc(aivl->count * sizeof(*aivl->l), GFP_KERNEL);
+       return aivl->l;
+}
+
+static inline
+u32 *alloc_u32_list_gk20a(struct u32_list_gk20a *u32l)
+{
+       u32l->l = kzalloc(u32l->count * sizeof(*u32l->l), GFP_KERNEL);
+       return u32l->l;
+}
+
+struct gr_ucode_gk20a {
+       struct {
+               struct u32_list_gk20a inst;
+               struct u32_list_gk20a data;
+       } gpccs, fecs;
+};
+
+/* main entry for grctx loading */
+int gr_gk20a_init_ctx_vars(struct gk20a *g, struct gr_gk20a *gr);
+
+#endif /*__GR_CTX_GK20A_H__*/
diff --git a/drivers/video/tegra/host/gk20a/gr_ctx_gk20a_sim.c b/drivers/video/tegra/host/gk20a/gr_ctx_gk20a_sim.c
new file mode 100644 (file)
index 0000000..005642c
--- /dev/null
@@ -0,0 +1,258 @@
+/*
+ * drivers/video/tegra/host/gk20a/gr_ctx_gk20a_sim.c
+ *
+ * GK20A Graphics Context for Simulation
+ *
+ * Copyright (c) 2011, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "../dev.h"
+
+#include "gk20a.h"
+#include "gr_ctx_gk20a.h"
+
+int gr_gk20a_init_ctx_vars(struct gk20a *g, struct gr_gk20a *gr)
+{
+       int err = 0;
+       u32 i, temp;
+       char *size_path  = NULL;
+       char *reg_path   = NULL;
+       char *value_path = NULL;
+
+       nvhost_dbg(dbg_fn | dbg_info,
+                  "querying grctx info from chiplib");
+
+       g->gr.ctx_vars.dynamic = true;
+       g->gr.netlist = GR_NETLIST_DYNAMIC;
+
+       /* query sizes and counts */
+       gk20a_sim_esc_readl(g, "GRCTX_UCODE_INST_FECS_COUNT", 0,
+                           &g->gr.ctx_vars.ucode.fecs.inst.count);
+       gk20a_sim_esc_readl(g, "GRCTX_UCODE_DATA_FECS_COUNT", 0,
+                           &g->gr.ctx_vars.ucode.fecs.data.count);
+       gk20a_sim_esc_readl(g, "GRCTX_UCODE_INST_GPCCS_COUNT", 0,
+                           &g->gr.ctx_vars.ucode.gpccs.inst.count);
+       gk20a_sim_esc_readl(g, "GRCTX_UCODE_DATA_GPCCS_COUNT", 0,
+                           &g->gr.ctx_vars.ucode.gpccs.data.count);
+       gk20a_sim_esc_readl(g, "GRCTX_ALL_CTX_TOTAL_WORDS", 0, &temp);
+       g->gr.ctx_vars.buffer_size = temp << 2;
+       gk20a_sim_esc_readl(g, "GRCTX_SW_BUNDLE_INIT_SIZE", 0,
+                           &g->gr.ctx_vars.sw_bundle_init.count);
+       gk20a_sim_esc_readl(g, "GRCTX_SW_METHOD_INIT_SIZE", 0,
+                           &g->gr.ctx_vars.sw_method_init.count);
+       gk20a_sim_esc_readl(g, "GRCTX_SW_CTX_LOAD_SIZE", 0,
+                           &g->gr.ctx_vars.sw_ctx_load.count);
+
+       switch (0) { /*g->gr.ctx_vars.reg_init_override)*/
+#if 0
+       case NV_REG_STR_RM_GR_REG_INIT_OVERRIDE_PROD_DIFF:
+               size_path  = "GRCTX_NONCTXSW_PROD_DIFF_REG_SIZE";
+               reg_path   = "GRCTX_NONCTXSW_PROD_DIFF_REG:REG";
+               value_path = "GRCTX_NONCTXSW_PROD_DIFF_REG:VALUE";
+               break;
+#endif
+       default:
+               size_path   = "GRCTX_NONCTXSW_REG_SIZE";
+               reg_path    = "GRCTX_NONCTXSW_REG:REG";
+               value_path  = "GRCTX_NONCTXSW_REG:VALUE";
+               break;
+       }
+
+       gk20a_sim_esc_readl(g, size_path, 0,
+                           &g->gr.ctx_vars.sw_non_ctx_load.count);
+
+       gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_SYS_COUNT", 0,
+                           &g->gr.ctx_vars.ctxsw_regs.sys.count);
+       gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_GPC_COUNT", 0,
+                           &g->gr.ctx_vars.ctxsw_regs.gpc.count);
+       gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_TPC_COUNT", 0,
+                           &g->gr.ctx_vars.ctxsw_regs.tpc.count);
+#if 0
+       /* looks to be unused, actually chokes the sim */
+       gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PPC_COUNT", 0,
+                           &g->gr.ctx_vars.ctxsw_regs.ppc.count);
+#endif
+       gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_ZCULL_GPC_COUNT", 0,
+                           &g->gr.ctx_vars.ctxsw_regs.zcull_gpc.count);
+       gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PM_SYS_COUNT", 0,
+                           &g->gr.ctx_vars.ctxsw_regs.pm_sys.count);
+       gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PM_GPC_COUNT", 0,
+                           &g->gr.ctx_vars.ctxsw_regs.pm_gpc.count);
+       gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PM_TPC_COUNT", 0,
+                           &g->gr.ctx_vars.ctxsw_regs.pm_tpc.count);
+
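+       /* allocate the lists sized by the counts queried above; a failed
+        * allocation makes the corresponding !alloc_* term non-zero, so a
+        * non-zero err below means at least one allocation failed */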
+       err |= !alloc_u32_list_gk20a(&g->gr.ctx_vars.ucode.fecs.inst);
+       err |= !alloc_u32_list_gk20a(&g->gr.ctx_vars.ucode.fecs.data);
+       err |= !alloc_u32_list_gk20a(&g->gr.ctx_vars.ucode.gpccs.inst);
+       err |= !alloc_u32_list_gk20a(&g->gr.ctx_vars.ucode.gpccs.data);
+       err |= !alloc_av_list_gk20a(&g->gr.ctx_vars.sw_bundle_init);
+       err |= !alloc_av_list_gk20a(&g->gr.ctx_vars.sw_method_init);
+       err |= !alloc_aiv_list_gk20a(&g->gr.ctx_vars.sw_ctx_load);
+       err |= !alloc_av_list_gk20a(&g->gr.ctx_vars.sw_non_ctx_load);
+       err |= !alloc_aiv_list_gk20a(&g->gr.ctx_vars.ctxsw_regs.sys);
+       err |= !alloc_aiv_list_gk20a(&g->gr.ctx_vars.ctxsw_regs.gpc);
+       err |= !alloc_aiv_list_gk20a(&g->gr.ctx_vars.ctxsw_regs.tpc);
+       err |= !alloc_aiv_list_gk20a(&g->gr.ctx_vars.ctxsw_regs.zcull_gpc);
+       err |= !alloc_aiv_list_gk20a(&g->gr.ctx_vars.ctxsw_regs.ppc);
+       err |= !alloc_aiv_list_gk20a(&g->gr.ctx_vars.ctxsw_regs.pm_sys);
+       err |= !alloc_aiv_list_gk20a(&g->gr.ctx_vars.ctxsw_regs.pm_gpc);
+       err |= !alloc_aiv_list_gk20a(&g->gr.ctx_vars.ctxsw_regs.pm_tpc);
+
+       if (err)
+               goto fail;
+
+       for (i = 0; i < g->gr.ctx_vars.ucode.fecs.inst.count; i++)
+               gk20a_sim_esc_readl(g, "GRCTX_UCODE_INST_FECS",
+                                   i, &g->gr.ctx_vars.ucode.fecs.inst.l[i]);
+
+       for (i = 0; i < g->gr.ctx_vars.ucode.fecs.data.count; i++)
+               gk20a_sim_esc_readl(g, "GRCTX_UCODE_DATA_FECS",
+                                   i, &g->gr.ctx_vars.ucode.fecs.data.l[i]);
+
+       for (i = 0; i < g->gr.ctx_vars.ucode.gpccs.inst.count; i++)
+               gk20a_sim_esc_readl(g, "GRCTX_UCODE_INST_GPCCS",
+                                   i, &g->gr.ctx_vars.ucode.gpccs.inst.l[i]);
+
+       for (i = 0; i < g->gr.ctx_vars.ucode.gpccs.data.count; i++)
+               gk20a_sim_esc_readl(g, "GRCTX_UCODE_DATA_GPCCS",
+                                   i, &g->gr.ctx_vars.ucode.gpccs.data.l[i]);
+
+       for (i = 0; i < g->gr.ctx_vars.sw_bundle_init.count; i++) {
+               struct av_gk20a *l = g->gr.ctx_vars.sw_bundle_init.l;
+               gk20a_sim_esc_readl(g, "GRCTX_SW_BUNDLE_INIT:ADDR",
+                                   i, &l[i].addr);
+               gk20a_sim_esc_readl(g, "GRCTX_SW_BUNDLE_INIT:VALUE",
+                                   i, &l[i].value);
+       }
+
+       for (i = 0; i < g->gr.ctx_vars.sw_method_init.count; i++) {
+               struct av_gk20a *l = g->gr.ctx_vars.sw_method_init.l;
+               gk20a_sim_esc_readl(g, "GRCTX_SW_METHOD_INIT:ADDR",
+                                   i, &l[i].addr);
+               gk20a_sim_esc_readl(g, "GRCTX_SW_METHOD_INIT:VALUE",
+                                   i, &l[i].value);
+       }
+
+       for (i = 0; i < g->gr.ctx_vars.sw_ctx_load.count; i++) {
+               struct aiv_gk20a *l = g->gr.ctx_vars.sw_ctx_load.l;
+               gk20a_sim_esc_readl(g, "GRCTX_SW_CTX_LOAD:ADDR",
+                                   i, &l[i].addr);
+               gk20a_sim_esc_readl(g, "GRCTX_SW_CTX_LOAD:INDEX",
+                                   i, &l[i].index);
+               gk20a_sim_esc_readl(g, "GRCTX_SW_CTX_LOAD:VALUE",
+                                   i, &l[i].value);
+       }
+
+       for (i = 0; i < g->gr.ctx_vars.sw_non_ctx_load.count; i++) {
+               struct av_gk20a *l = g->gr.ctx_vars.sw_non_ctx_load.l;
+               gk20a_sim_esc_readl(g, reg_path, i, &l[i].addr);
+               gk20a_sim_esc_readl(g, value_path, i, &l[i].value);
+       }
+
+       for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.sys.count; i++) {
+               struct aiv_gk20a *l = g->gr.ctx_vars.ctxsw_regs.sys.l;
+               gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_SYS:ADDR",
+                                   i, &l[i].addr);
+               gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_SYS:INDEX",
+                                   i, &l[i].index);
+               gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_SYS:VALUE",
+                                   i, &l[i].value);
+       }
+
+       for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.gpc.count; i++) {
+               struct aiv_gk20a *l = g->gr.ctx_vars.ctxsw_regs.gpc.l;
+               gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_GPC:ADDR",
+                                   i, &l[i].addr);
+               gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_GPC:INDEX",
+                                   i, &l[i].index);
+               gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_GPC:VALUE",
+                                   i, &l[i].value);
+       }
+
+       for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.tpc.count; i++) {
+               struct aiv_gk20a *l = g->gr.ctx_vars.ctxsw_regs.tpc.l;
+               gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_TPC:ADDR",
+                                   i, &l[i].addr);
+               gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_TPC:INDEX",
+                                   i, &l[i].index);
+               gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_TPC:VALUE",
+                                   i, &l[i].value);
+       }
+
+       for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.ppc.count; i++) {
+               struct aiv_gk20a *l = g->gr.ctx_vars.ctxsw_regs.ppc.l;
+               gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PPC:ADDR",
+                                   i, &l[i].addr);
+               gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PPC:INDEX",
+                                   i, &l[i].index);
+               gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PPC:VALUE",
+                                   i, &l[i].value);
+       }
+
+       for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.zcull_gpc.count; i++) {
+               struct aiv_gk20a *l = g->gr.ctx_vars.ctxsw_regs.zcull_gpc.l;
+               gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_ZCULL_GPC:ADDR",
+                                   i, &l[i].addr);
+               gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_ZCULL_GPC:INDEX",
+                                   i, &l[i].index);
+               gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_ZCULL_GPC:VALUE",
+                                   i, &l[i].value);
+       }
+
+       for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.pm_sys.count; i++) {
+               struct aiv_gk20a *l = g->gr.ctx_vars.ctxsw_regs.pm_sys.l;
+               gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PM_SYS:ADDR",
+                                   i, &l[i].addr);
+               gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PM_SYS:INDEX",
+                                   i, &l[i].index);
+               gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PM_SYS:VALUE",
+                                   i, &l[i].value);
+       }
+
+       for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.pm_gpc.count; i++) {
+               struct aiv_gk20a *l = g->gr.ctx_vars.ctxsw_regs.pm_gpc.l;
+               gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PM_GPC:ADDR",
+                                   i, &l[i].addr);
+               gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PM_GPC:INDEX",
+                                   i, &l[i].index);
+               gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PM_GPC:VALUE",
+                                   i, &l[i].value);
+       }
+
+       for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.pm_tpc.count; i++) {
+               struct aiv_gk20a *l = g->gr.ctx_vars.ctxsw_regs.pm_tpc.l;
+               gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PM_TPC:ADDR",
+                                   i, &l[i].addr);
+               gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PM_TPC:INDEX",
+                                   i, &l[i].index);
+               gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PM_TPC:VALUE",
+                                   i, &l[i].value);
+       }
+
+       g->gr.ctx_vars.valid = true;
+
+       gk20a_sim_esc_readl(g, "GRCTX_GEN_CTX_REGS_BASE_INDEX", 0,
+                           &g->gr.ctx_vars.regs_base_index);
+
+       nvhost_dbg(dbg_info | dbg_fn, "finished querying grctx info from chiplib");
+       return 0;
+fail:
+       nvhost_dbg(dbg_info | dbg_err | dbg_fn,
+                  "failed querying grctx info from chiplib");
+       return err;
+}
+
diff --git a/drivers/video/tegra/host/gk20a/gr_gk20a.c b/drivers/video/tegra/host/gk20a/gr_gk20a.c
new file mode 100644 (file)
index 0000000..3cbe92b
--- /dev/null
@@ -0,0 +1,4172 @@
+/*
+ * drivers/video/tegra/host/gk20a/gr_gk20a.c
+ *
+ * GK20A Graphics
+ *
+ * Copyright (c) 2011, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <linux/delay.h>       /* for udelay */
+#include <linux/mm.h>          /* for totalram_pages */
+#include <linux/nvmap.h>
+
+#include "../dev.h"
+
+#include "gk20a.h"
+#include "gr_ctx_gk20a.h"
+
+#include "hw_ccsr_gk20a.h"
+#include "hw_ctxsw_prog_gk20a.h"
+#include "hw_gr_gk20a.h"
+#include "hw_mc_gk20a.h"
+#include "hw_ram_gk20a.h"
+#include "hw_pri_ringmaster_gk20a.h"
+#include "hw_proj_gk20a.h"
+#include "hw_top_gk20a.h"
+#include "hw_ltc_gk20a.h"
+#include "hw_fb_gk20a.h"
+#include "hw_therm_gk20a.h"
+#include "gk20a_gating_reglist.h"
+#include "chip_support.h"
+#include "nvhost_memmgr.h"
+
+static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va);
+static int gr_gk20a_ctx_patch_write(struct gk20a *g, struct channel_gk20a *c,
+                                   u32 addr, u32 data, u32 patch);
+
+/* global ctx buffer */
+static int  gr_gk20a_alloc_global_ctx_buffers(struct gk20a *g);
+static void gr_gk20a_free_global_ctx_buffers(struct gk20a *g);
+static int  gr_gk20a_map_global_ctx_buffers(struct gk20a *g,
+                                           struct channel_gk20a *c);
+static void gr_gk20a_unmap_global_ctx_buffers(struct channel_gk20a *c);
+
+/* channel gr ctx buffer */
+static int  gr_gk20a_alloc_channel_gr_ctx(struct gk20a *g,
+                                       struct channel_gk20a *c);
+static void gr_gk20a_free_channel_gr_ctx(struct channel_gk20a *c);
+
+/* channel patch ctx buffer */
+static int  gr_gk20a_alloc_channel_patch_ctx(struct gk20a *g,
+                                       struct channel_gk20a *c);
+static void gr_gk20a_free_channel_patch_ctx(struct channel_gk20a *c);
+
+/* golden ctx image */
+static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
+                                         struct channel_gk20a *c);
+static int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
+                                         struct channel_gk20a *c);
+
+static void gr_gk20a_load_falcon_dmem(struct gk20a *g)
+{
+       u32 i, ucode_u32_size;
+       const u32 *ucode_u32_data;
+       u32 checksum;
+
+       nvhost_dbg_fn("");
+
+       gk20a_writel(g, gr_gpccs_dmemc_r(0), (gr_gpccs_dmemc_offs_f(0) |
+                                             gr_gpccs_dmemc_blk_f(0)  |
+                                             gr_gpccs_dmemc_aincw_f(1)));
+
+       ucode_u32_size = g->gr.ctx_vars.ucode.gpccs.data.count;
+       ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.gpccs.data.l;
+
+       for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
+               gk20a_writel(g, gr_gpccs_dmemd_r(0), ucode_u32_data[i]);
+               checksum += ucode_u32_data[i];
+       }
+
+       gk20a_writel(g, gr_fecs_dmemc_r(0), (gr_fecs_dmemc_offs_f(0) |
+                                            gr_fecs_dmemc_blk_f(0)  |
+                                            gr_fecs_dmemc_aincw_f(1)));
+
+       ucode_u32_size = g->gr.ctx_vars.ucode.fecs.data.count;
+       ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.fecs.data.l;
+
+       for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
+               gk20a_writel(g, gr_fecs_dmemd_r(0), ucode_u32_data[i]);
+               checksum += ucode_u32_data[i];
+       }
+       nvhost_dbg_fn("done");
+}
+
+static void gr_gk20a_load_falcon_imem(struct gk20a *g)
+{
+       u32 cfg, fecs_imem_size, gpccs_imem_size, ucode_u32_size;
+       const u32 *ucode_u32_data;
+       u32 tag, i, pad_start, pad_end;
+       u32 checksum;
+
+       nvhost_dbg_fn("");
+
+       cfg = gk20a_readl(g, gr_fecs_cfg_r());
+       fecs_imem_size = gr_fecs_cfg_imem_sz_v(cfg);
+
+       cfg = gk20a_readl(g, gr_gpc0_cfg_r());
+       gpccs_imem_size = gr_gpc0_cfg_imem_sz_v(cfg);
+
+       /* Use the broadcast address to access all of the GPCCS units. */
+       gk20a_writel(g, gr_gpccs_imemc_r(0), (gr_gpccs_imemc_offs_f(0) |
+                                             gr_gpccs_imemc_blk_f(0) |
+                                             gr_gpccs_imemc_aincw_f(1)));
+
+       /* Setup the tags for the instruction memory. */
+       tag = 0;
+       gk20a_writel(g, gr_gpccs_imemt_r(0), gr_gpccs_imemt_tag_f(tag));
+
+       ucode_u32_size = g->gr.ctx_vars.ucode.gpccs.inst.count;
+       ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.gpccs.inst.l;
+
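+       /* stream the GPCCS ucode through the auto-incrementing IMEM port,
+        * advancing the IMEM tag every 256 bytes (64 words); the pad loop
+        * below zero-fills up to the next 256-byte boundary plus one block */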
+       for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
+               if (i && ((i % (256/sizeof(u32))) == 0)) {
+                       tag++;
+                       gk20a_writel(g, gr_gpccs_imemt_r(0),
+                                     gr_gpccs_imemt_tag_f(tag));
+               }
+               gk20a_writel(g, gr_gpccs_imemd_r(0), ucode_u32_data[i]);
+               checksum += ucode_u32_data[i];
+       }
+
+       pad_start = i*4;
+       pad_end = pad_start+(256-pad_start%256)+256;
+       for (i = pad_start;
+            (i < gpccs_imem_size * 256) && (i < pad_end);
+            i += 4) {
+               if (i && ((i % 256) == 0)) {
+                       tag++;
+                       gk20a_writel(g, gr_gpccs_imemt_r(0),
+                                     gr_gpccs_imemt_tag_f(tag));
+               }
+               gk20a_writel(g, gr_gpccs_imemd_r(0), 0);
+       }
+
+       gk20a_writel(g, gr_fecs_imemc_r(0), (gr_fecs_imemc_offs_f(0) |
+                                            gr_fecs_imemc_blk_f(0) |
+                                            gr_fecs_imemc_aincw_f(1)));
+
+       /* Setup the tags for the instruction memory. */
+       tag = 0;
+       gk20a_writel(g, gr_fecs_imemt_r(0), gr_fecs_imemt_tag_f(tag));
+
+       ucode_u32_size = g->gr.ctx_vars.ucode.fecs.inst.count;
+       ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.fecs.inst.l;
+
+       for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
+               if (i && ((i % (256/sizeof(u32))) == 0)) {
+                       tag++;
+                       gk20a_writel(g, gr_fecs_imemt_r(0),
+                                     gr_fecs_imemt_tag_f(tag));
+               }
+               gk20a_writel(g, gr_fecs_imemd_r(0), ucode_u32_data[i]);
+               checksum += ucode_u32_data[i];
+       }
+
+       pad_start = i*4;
+       pad_end = pad_start+(256-pad_start%256)+256;
+       for (i = pad_start; (i < fecs_imem_size * 256) && i < pad_end; i += 4) {
+               if (i && ((i % 256) == 0)) {
+                       tag++;
+                       gk20a_writel(g, gr_fecs_imemt_r(0),
+                                     gr_fecs_imemt_tag_f(tag));
+               }
+               gk20a_writel(g, gr_fecs_imemd_r(0), 0);
+       }
+}
+
+#define GR_IDLE_TIMEOUT_DEFAULT        10000   /* 10 milliseconds */
+
+static int gr_gk20a_wait_idle(struct gk20a *g, u32 *timeout)
+{
+#define GR_ENGINE_INDEX                0
+#define GR_IDLE_CHECK_PERIOD   10              /* 10 usec */
+
+       u32 gr_engine_status;
+       u32 gr_status;
+       bool ctxsw_active = false;
+
+       nvhost_dbg_fn("");
+
+       do {
+               u32 check = min_t(u32, GR_IDLE_CHECK_PERIOD, *timeout);
+
+               /* fmodel: host gets fifo_engine_status(gr) from gr
+                  only when gr_status is read */
+               gr_status = gk20a_readl(g, gr_status_r());
+
+               gr_engine_status = gk20a_readl(g, gr_engine_status_r());
+
+               if (!(gk20a_readl(g, mc_enable_r()) &
+                     mc_enable_pgraph_enabled_f()) ||
+                   (gr_engine_status_value_v(gr_engine_status) ==
+                    gr_engine_status_value_idle_v() &&
+                    !ctxsw_active)) {
+                       nvhost_dbg_fn("done");
+                       return 0;
+               }
+
+               udelay(GR_IDLE_CHECK_PERIOD);
+
+               /* handle interrupts */
+
+               *timeout -= check;
+
+       } while (*timeout);
+
+       nvhost_err(dev_from_gk20a(g), "timeout, status: %d",
+                  gr_engine_status);
+
+       return -1;
+}
+
+static int gr_gk20a_ctx_reset(struct gk20a *g, u32 rst_mask)
+{
+       nvhost_dbg_fn("");
+       /* FE_PWR_MODE_MODE_FORCE_ON for RTLSim and EMulation? */
+
+       if (rst_mask) {
+               gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(), rst_mask);
+       } else {
+               gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(),
+                            gr_fecs_ctxsw_reset_ctl_sys_halt_disabled_f() |
+                            gr_fecs_ctxsw_reset_ctl_gpc_halt_disabled_f() |
+                            gr_fecs_ctxsw_reset_ctl_be_halt_disabled_f()  |
+                            gr_fecs_ctxsw_reset_ctl_sys_engine_reset_disabled_f() |
+                            gr_fecs_ctxsw_reset_ctl_gpc_engine_reset_disabled_f() |
+                            gr_fecs_ctxsw_reset_ctl_be_engine_reset_disabled_f()  |
+                            gr_fecs_ctxsw_reset_ctl_sys_context_reset_enabled_f() |
+                            gr_fecs_ctxsw_reset_ctl_gpc_context_reset_enabled_f() |
+                            gr_fecs_ctxsw_reset_ctl_be_context_reset_enabled_f());
+       }
+
+       /* Delay for > 10 nvclks after writing reset. */
+       gk20a_readl(g, gr_fecs_ctxsw_reset_ctl_r());
+
+       gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(),
+                    gr_fecs_ctxsw_reset_ctl_sys_halt_disabled_f() |
+                    gr_fecs_ctxsw_reset_ctl_gpc_halt_disabled_f() |
+                    gr_fecs_ctxsw_reset_ctl_be_halt_disabled_f()  |
+                    gr_fecs_ctxsw_reset_ctl_sys_engine_reset_disabled_f() |
+                    gr_fecs_ctxsw_reset_ctl_gpc_engine_reset_disabled_f() |
+                    gr_fecs_ctxsw_reset_ctl_be_engine_reset_disabled_f()  |
+                    gr_fecs_ctxsw_reset_ctl_sys_context_reset_disabled_f() |
+                    gr_fecs_ctxsw_reset_ctl_gpc_context_reset_disabled_f() |
+                    gr_fecs_ctxsw_reset_ctl_be_context_reset_disabled_f());
+
+       /* Delay for > 10 nvclks after writing reset. */
+       gk20a_readl(g, gr_fecs_ctxsw_reset_ctl_r());
+
+       /* FE_PWR_MODE_MODE_AUTO for RTLSim and EMulation? */
+
+       return 0;
+}
+
+static int gr_gk20a_ctx_wait_ucode(struct gk20a *g, u32 mailbox_id,
+                                  u32 *mailbox_ret, u32 opc_success,
+                                  u32 mailbox_ok, u32 opc_fail,
+                                  u32 mailbox_fail)
+{
+       u32 timeout = GR_IDLE_TIMEOUT_DEFAULT;
+       u32 check = WAIT_UCODE_LOOP;
+       u32 reg;
+
+       nvhost_dbg_fn("");
+
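+       /* poll the FECS mailbox until its value satisfies the success
+        * comparison (opc_success against mailbox_ok), matches the failure
+        * comparison, or the timeout expires */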
+       while (check == WAIT_UCODE_LOOP) {
+               if (timeout == 0)
+                       check = WAIT_UCODE_TIMEOUT;
+
+               /* XXX when this register read was sped up by removing printks
+                * (in sim) we had to increase GR_IDLE_TIMEOUT_DEFAULT in order
+                * not to get spurious timeouts... that says to me udelay is
+                * not doing what we think below...? */
+               reg = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(mailbox_id));
+
+               if (mailbox_ret)
+                       *mailbox_ret = reg;
+
+               switch (opc_success) {
+               case GR_IS_UCODE_OP_EQUAL:
+                       if (reg == mailbox_ok)
+                               check = WAIT_UCODE_OK;
+                       break;
+               case GR_IS_UCODE_OP_NOT_EQUAL:
+                       if (reg != mailbox_ok)
+                               check = WAIT_UCODE_OK;
+                       break;
+               case GR_IS_UCODE_OP_AND:
+                       if (reg & mailbox_ok)
+                               check = WAIT_UCODE_OK;
+                       break;
+               case GR_IS_UCODE_OP_LESSER:
+                       if (reg < mailbox_ok)
+                               check = WAIT_UCODE_OK;
+                       break;
+               case GR_IS_UCODE_OP_LESSER_EQUAL:
+                       if (reg <= mailbox_ok)
+                               check = WAIT_UCODE_OK;
+                       break;
+               case GR_IS_UCODE_OP_SKIP:
+                       /* do no success check */
+                       break;
+               default:
+                       nvhost_err(dev_from_gk20a(g),
+                                  "invalid success opcode 0x%x", opc_success);
+
+                       check = WAIT_UCODE_ERROR;
+                       break;
+               }
+
+               switch (opc_fail) {
+               case GR_IS_UCODE_OP_EQUAL:
+                       if (reg == mailbox_fail)
+                               check = WAIT_UCODE_ERROR;
+                       break;
+               case GR_IS_UCODE_OP_NOT_EQUAL:
+                       if (reg != mailbox_fail)
+                               check = WAIT_UCODE_ERROR;
+                       break;
+               case GR_IS_UCODE_OP_AND:
+                       if (reg & mailbox_fail)
+                               check = WAIT_UCODE_ERROR;
+                       break;
+               case GR_IS_UCODE_OP_LESSER:
+                       if (reg < mailbox_fail)
+                               check = WAIT_UCODE_ERROR;
+                       break;
+               case GR_IS_UCODE_OP_LESSER_EQUAL:
+                       if (reg <= mailbox_fail)
+                               check = WAIT_UCODE_ERROR;
+                       break;
+               case GR_IS_UCODE_OP_SKIP:
+                       /* do no check on fail*/
+                       break;
+               default:
+                       nvhost_err(dev_from_gk20a(g),
+                                  "invalid fail opcode 0x%x", opc_fail);
+                       check = WAIT_UCODE_ERROR;
+                       break;
+               }
+
+               udelay(10);
+               timeout -= min_t(u32, GR_IDLE_CHECK_PERIOD, timeout);
+       }
+
+       if (check == WAIT_UCODE_TIMEOUT) {
+               nvhost_err(dev_from_gk20a(g),
+                          "timeout waiting on ucode response");
+               return -1;
+       } else if (check == WAIT_UCODE_ERROR) {
+               nvhost_err(dev_from_gk20a(g),
+                          "ucode method failed on mailbox=%d value=0x%08x",
+                          mailbox_id, reg);
+               return -1;
+       }
+
+       nvhost_dbg_fn("done");
+       return 0;
+}
+
+int gr_gk20a_submit_fecs_method(struct gk20a *g,
+                       u32 mb_id, u32 mb_data, u32 mb_clr,
+                       u32 mtd_data, u32 mtd_adr, u32 *mb_ret,
+                       u32 opc_ok, u32 mb_ok, u32 opc_fail, u32 mb_fail)
+{
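+       /* submit a method to the FECS falcon: optionally preload a mailbox
+        * with mb_data, write mb_clr to the mailbox 0 clear register, push
+        * the method data and address, then poll mailbox 0 for the result */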
+       if (mb_id != 0)
+               gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(mb_id),
+                       mb_data);
+
+       gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0),
+               gr_fecs_ctxsw_mailbox_clear_value_f(mb_clr));
+
+       gk20a_writel(g, gr_fecs_method_data_r(), mtd_data);
+       gk20a_writel(g, gr_fecs_method_push_r(),
+               gr_fecs_method_push_adr_f(mtd_adr));
+
+       return gr_gk20a_ctx_wait_ucode(g, 0, mb_ret,
+               opc_ok, mb_ok, opc_fail, mb_fail);
+}
+
+static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va)
+{
+       u32 addr_lo;
+       u32 addr_hi;
+       u32 ret = 0;
+       void *inst_ptr = NULL;
+
+       nvhost_dbg_fn("");
+
+       inst_ptr = mem_op().mmap(c->inst_block.mem.ref);
+       if (IS_ERR(inst_ptr)) {
+               ret = -ENOMEM;
+               goto clean_up;
+       }
+
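+       /* bits 11:0 of the GPU VA are dropped before it is written into the
+        * instance block, so the context VA is assumed to be 4KB aligned */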
+       addr_lo = u64_lo32(gpu_va) >> 12;
+       addr_hi = u64_hi32(gpu_va);
+
+       mem_wr32(inst_ptr, ram_in_gr_wfi_target_w(),
+                ram_in_gr_cs_wfi_f() | ram_in_gr_wfi_mode_virtual_f() |
+                ram_in_gr_wfi_ptr_lo_f(addr_lo));
+
+       mem_wr32(inst_ptr, ram_in_gr_wfi_ptr_hi_w(),
+                ram_in_gr_wfi_ptr_hi_f(addr_hi));
+
+       mem_op().munmap(c->inst_block.mem.ref, inst_ptr);
+
+       return 0;
+
+clean_up:
+       if (inst_ptr)
+               mem_op().munmap(c->inst_block.mem.ref, inst_ptr);
+
+       return ret;
+}
+
+static int gr_gk20a_ctx_patch_write(struct gk20a *g, struct channel_gk20a *c,
+                                   u32 addr, u32 data, u32 patch)
+{
+       struct channel_ctx_gk20a *ch_ctx;
+       u32 patch_slot = 0;
+       void *patch_ptr = NULL;
+
+       nvhost_dbg_fn("");
+
+       BUG_ON(patch != 0 && c == NULL);
+
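+       /* patch mode appends an (addr, data) pair to the channel's patch
+        * context buffer instead of writing the register directly */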
+       if (patch) {
+               ch_ctx = &c->ch_ctx;
+               patch_ptr = mem_op().mmap(ch_ctx->patch_ctx.mem.ref);
+               if (IS_ERR(patch_ptr))
+                       return -ENOMEM;
+
+               patch_slot = ch_ctx->patch_ctx.data_count * 2;
+
+               mem_wr32(patch_ptr, patch_slot++, addr);
+               mem_wr32(patch_ptr, patch_slot++, data);
+
+               mem_op().munmap(ch_ctx->patch_ctx.mem.ref, patch_ptr);
+               ch_ctx->patch_ctx.data_count++;
+       } else {
+               gk20a_writel(g, addr, data);
+       }
+
+       return 0;
+}
+
+static int gr_gk20a_ctx_bind_first_channel(struct gk20a *g,
+                                       struct channel_gk20a *c)
+{
+       u32 inst_base_ptr =
+               u64_lo32(c->inst_block.cpu_pa) >> ram_in_base_shift_v();
+       u32 ret;
+
+       nvhost_dbg_info("bind channel %d inst ptr 0x%08x",
+                  c->hw_chid, inst_base_ptr);
+
+       ret = gr_gk20a_submit_fecs_method(g, 0, 0, 0x30,
+                       gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
+                       gr_fecs_current_ctx_target_vid_mem_f() |
+                       gr_fecs_current_ctx_valid_f(1),
+                       gr_fecs_method_push_adr_bind_pointer_f(),
+                       0, GR_IS_UCODE_OP_AND, 0x10, GR_IS_UCODE_OP_AND, 0x20);
+       if (ret)
+               nvhost_err(dev_from_gk20a(g),
+                       "bind channel instance failed");
+
+       return ret;
+}
+
+static int gr_gk20a_ctx_zcull_setup(struct gk20a *g, struct channel_gk20a *c,
+                                   bool disable_fifo)
+{
+       struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
+       struct fifo_gk20a *f = &g->fifo;
+       struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
+       u32 va_lo, va_hi, va;
+       int ret = 0;
+       void *ctx_ptr = NULL;
+
+       nvhost_dbg_fn("");
+
+       ctx_ptr = mem_op().mmap(ch_ctx->gr_ctx.mem.ref);
+       if (IS_ERR(ctx_ptr))
+               return -ENOMEM;
+
+       if (ch_ctx->zcull_ctx.gpu_va == 0 &&
+           ch_ctx->zcull_ctx.ctx_sw_mode ==
+               ctxsw_prog_main_image_zcull_mode_separate_buffer_v()) {
+               ret = -EINVAL;
+               goto clean_up;
+       }
+
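+       /* pack bits 39:8 of the zcull buffer GPU VA into one 32-bit word;
+        * bits 7:0 are dropped, which assumes a 256-byte aligned buffer */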
+       va_lo = u64_lo32(ch_ctx->zcull_ctx.gpu_va);
+       va_hi = u64_hi32(ch_ctx->zcull_ctx.gpu_va);
+       va = ((va_lo >> 8) & 0x00FFFFFF) | ((va_hi << 24) & 0xFF000000);
+
+       if (disable_fifo) {
+               ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
+               if (ret) {
+                       nvhost_err(dev_from_gk20a(g),
+                               "failed to disable gr engine activity\n");
+                       goto clean_up;
+               }
+       }
+
+       mem_wr32(ctx_ptr + ctxsw_prog_main_image_zcull_v(), 0,
+                ch_ctx->zcull_ctx.ctx_sw_mode);
+
+       mem_wr32(ctx_ptr + ctxsw_prog_main_image_zcull_ptr_v(), 0, va);
+
+       if (disable_fifo) {
+               ret = gk20a_fifo_enable_engine_activity(g, gr_info);
+               if (ret) {
+                       nvhost_err(dev_from_gk20a(g),
+                               "failed to enable gr engine activity\n");
+                       goto clean_up;
+               }
+       }
+
+clean_up:
+       mem_op().munmap(ch_ctx->gr_ctx.mem.ref, ctx_ptr);
+
+       return ret;
+}
+
+static int gr_gk20a_ctx_pm_setup(struct gk20a *g, struct channel_gk20a *c,
+                                bool disable_fifo)
+{
+       struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
+       u32 va_lo, va_hi, va;
+       int ret = 0;
+       void *ctx_ptr = NULL;
+
+       nvhost_dbg_fn("");
+
+       ctx_ptr = mem_op().mmap(ch_ctx->gr_ctx.mem.ref);
+       if (IS_ERR(ctx_ptr))
+               return -ENOMEM;
+
+       if (ch_ctx->pm_ctx.ctx_sw_mode ==
+           ctxsw_prog_main_image_pm_mode_ctxsw_v()) {
+
+               if (ch_ctx->pm_ctx.gpu_va == 0) {
+                       ret = -ENOMEM;
+                       goto clean_up;
+               }
+
+               va_lo = u64_lo32(ch_ctx->pm_ctx.gpu_va);
+               va_hi = u64_hi32(ch_ctx->pm_ctx.gpu_va);
+               va = ((va_lo >> 8) & 0x00FFFFFF) | ((va_hi << 24) & 0xFF000000);
+       } else {
+               va_lo = va_hi = 0;
+               va = 0;
+       }
+
+       /* TBD
+       if (disable_fifo)
+               disable_engine_activity(...);
+       */
+
+       mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_v(), 0, ch_ctx->pm_ctx.ctx_sw_mode);
+       mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_ptr_v(), 0, va);
+
+       /* TBD
+       if (disable_fifo)
+               enable_engine_activity(...);
+       */
+
+       nvhost_dbg_fn("done");
+
+clean_up:
+       if (ret)
+               nvhost_dbg_fn("fail");
+       mem_op().munmap(ch_ctx->gr_ctx.mem.ref, ctx_ptr);
+
+       return ret;
+}
+
+static int gr_gk20a_commit_global_cb_manager(struct gk20a *g,
+                       struct channel_gk20a *c, u32 patch)
+{
+       struct gr_gk20a *gr = &g->gr;
+       u32 attrib_offset_in_chunk = 0;
+       u32 alpha_offset_in_chunk = 0;
+       u32 pd_ab_max_output;
+       u32 gpc_index, ppc_index;
+       u32 temp;
+       u32 cbm_cfg_size1, cbm_cfg_size2;
+
+       nvhost_dbg_fn("");
+
+       gr_gk20a_ctx_patch_write(g, c, gr_ds_tga_constraintlogic_r(),
+               gr_ds_tga_constraintlogic_beta_cbsize_f(gr->attrib_cb_default_size) |
+               gr_ds_tga_constraintlogic_alpha_cbsize_f(gr->alpha_cb_default_size),
+               patch);
+
+       pd_ab_max_output = (gr->alpha_cb_default_size *
+               gr_gpc0_ppc0_cbm_cfg_size_granularity_v()) /
+               gr_pd_ab_dist_cfg1_max_output_granularity_v();
+
+       gr_gk20a_ctx_patch_write(g, c, gr_pd_ab_dist_cfg1_r(),
+               gr_pd_ab_dist_cfg1_max_output_f(pd_ab_max_output) |
+               gr_pd_ab_dist_cfg1_max_batches_init_f(), patch);
+
+       alpha_offset_in_chunk = attrib_offset_in_chunk +
+               gr->tpc_count * gr->attrib_cb_size;
+
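+       /* per-PPC circular buffer setup: the beta/alpha sizes scale with
+        * the TPC count behind each PES, and the chunk start offsets
+        * advance accordingly for the next PPC */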
+       for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
+               temp = proj_gpc_stride_v() * gpc_index;
+               for (ppc_index = 0; ppc_index < gr->gpc_ppc_count[gpc_index];
+                    ppc_index++) {
+                       cbm_cfg_size1 = gr->attrib_cb_default_size *
+                               gr->pes_tpc_count[ppc_index][gpc_index];
+                       cbm_cfg_size2 = gr->alpha_cb_default_size *
+                               gr->pes_tpc_count[ppc_index][gpc_index];
+
+                       gr_gk20a_ctx_patch_write(g, c,
+                               gr_gpc0_ppc0_cbm_cfg_r() + temp +
+                               proj_ppc_in_gpc_stride_v() * ppc_index,
+                               gr_gpc0_ppc0_cbm_cfg_timeslice_mode_f(gr->timeslice_mode) |
+                               gr_gpc0_ppc0_cbm_cfg_start_offset_f(attrib_offset_in_chunk) |
+                               gr_gpc0_ppc0_cbm_cfg_size_f(cbm_cfg_size1), patch);
+
+                       attrib_offset_in_chunk += gr->attrib_cb_size *
+                               gr->pes_tpc_count[ppc_index][gpc_index];
+
+                       gr_gk20a_ctx_patch_write(g, c,
+                               gr_gpc0_ppc0_cbm_cfg2_r() + temp +
+                               proj_ppc_in_gpc_stride_v() * ppc_index,
+                               gr_gpc0_ppc0_cbm_cfg2_start_offset_f(alpha_offset_in_chunk) |
+                               gr_gpc0_ppc0_cbm_cfg2_size_f(cbm_cfg_size2), patch);
+
+                       alpha_offset_in_chunk += gr->alpha_cb_size *
+                               gr->pes_tpc_count[ppc_index][gpc_index];
+               }
+       }
+
+       return 0;
+}
+
+static int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g,
+                       struct channel_gk20a *c, u32 patch)
+{
+       struct gr_gk20a *gr = &g->gr;
+       struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
+       u64 addr;
+       u32 size;
+       u32 data;
+
+       nvhost_dbg_fn("");
+
+       /* global pagepool */
+       addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) >>
+               gr_scc_pagepool_base_addr_39_8_align_bits_v()) |
+               (u64_hi32(ch_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) <<
+                (32 - gr_scc_pagepool_base_addr_39_8_align_bits_v()));
+
+       size = gr->global_ctx_buffer[PAGEPOOL].size /
+               gr_scc_pagepool_total_pages_byte_granularity_v();
+
+       if (size == gr_scc_pagepool_total_pages_hwmax_value_v())
+               size = gr_scc_pagepool_total_pages_hwmax_v();
+
+       nvhost_dbg_info("pagepool addr : 0x%016llx, size : %d",
+               addr, size);
+
+       gr_gk20a_ctx_patch_write(g, c, gr_scc_pagepool_base_r(),
+               gr_scc_pagepool_base_addr_39_8_f(addr), patch);
+
+       gr_gk20a_ctx_patch_write(g, c, gr_scc_pagepool_r(),
+               gr_scc_pagepool_total_pages_f(size) |
+               gr_scc_pagepool_valid_true_f(), patch);
+
+       gr_gk20a_ctx_patch_write(g, c, gr_gpcs_gcc_pagepool_base_r(),
+               gr_gpcs_gcc_pagepool_base_addr_39_8_f(addr), patch);
+
+       gr_gk20a_ctx_patch_write(g, c, gr_gpcs_gcc_pagepool_r(),
+               gr_gpcs_gcc_pagepool_total_pages_f(size), patch);
+
+       gr_gk20a_ctx_patch_write(g, c, gr_pd_pagepool_r(),
+               gr_pd_pagepool_total_pages_f(size) |
+               gr_pd_pagepool_valid_true_f(), patch);
+
+       /* global bundle cb */
+       addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[CIRCULAR_VA]) >>
+               gr_scc_bundle_cb_base_addr_39_8_align_bits_v()) |
+               (u64_hi32(ch_ctx->global_ctx_buffer_va[CIRCULAR_VA]) <<
+                (32 - gr_scc_bundle_cb_base_addr_39_8_align_bits_v()));
+
+       size = gr->bundle_cb_default_size;
+
+       nvhost_dbg_info("global bundle cb addr : 0x%016llx, size : %d",
+               addr, size);
+
+       gr_gk20a_ctx_patch_write(g, c, gr_scc_bundle_cb_base_r(),
+               gr_scc_bundle_cb_base_addr_39_8_f(addr), patch);
+
+       gr_gk20a_ctx_patch_write(g, c, gr_scc_bundle_cb_size_r(),
+               gr_scc_bundle_cb_size_div_256b_f(size) |
+               gr_scc_bundle_cb_size_valid_true_f(), patch);
+
+       gr_gk20a_ctx_patch_write(g, c, gr_gpcs_setup_bundle_cb_base_r(),
+               gr_gpcs_setup_bundle_cb_base_addr_39_8_f(addr), patch);
+
+       gr_gk20a_ctx_patch_write(g, c, gr_gpcs_setup_bundle_cb_size_r(),
+               gr_gpcs_setup_bundle_cb_size_div_256b_f(size) |
+               gr_gpcs_setup_bundle_cb_size_valid_true_f(), patch);
+
+       /* data for state_limit */
+       data = (gr->bundle_cb_default_size *
+               gr_scc_bundle_cb_size_div_256b_byte_granularity_v()) /
+               gr_pd_ab_dist_cfg2_state_limit_scc_bundle_granularity_v();
+
+       data = min_t(u32, data, gr->min_gpm_fifo_depth);
+
+       nvhost_dbg_info("global bundle cb token limit : %d, state limit : %d",
+                  gr->bundle_cb_token_limit, data);
+
+       gr_gk20a_ctx_patch_write(g, c, gr_pd_ab_dist_cfg2_r(),
+               gr_pd_ab_dist_cfg2_token_limit_f(gr->bundle_cb_token_limit) |
+               gr_pd_ab_dist_cfg2_state_limit_f(data), patch);
+
+       /* global attrib cb */
+       addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[ATTRIBUTE_VA]) >>
+               gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v()) |
+               (u64_hi32(ch_ctx->global_ctx_buffer_va[ATTRIBUTE_VA]) <<
+                (32 - gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v()));
+
+       nvhost_dbg_info("global attrib cb addr : 0x%016llx", addr);
+
+       gr_gk20a_ctx_patch_write(g, c, gr_gpcs_setup_attrib_cb_base_r(),
+               gr_gpcs_setup_attrib_cb_base_addr_39_12_f(addr) |
+               gr_gpcs_setup_attrib_cb_base_valid_true_f(), patch);
+
+       gr_gk20a_ctx_patch_write(g, c, gr_gpcs_tpcs_pe_pin_cb_global_base_addr_r(),
+               gr_gpcs_tpcs_pe_pin_cb_global_base_addr_v_f(addr) |
+               gr_gpcs_tpcs_pe_pin_cb_global_base_addr_valid_true_f(), patch);
+
+       return 0;
+}
+
+static int gr_gk20a_commit_global_timeslice(struct gk20a *g, struct channel_gk20a *c, u32 patch)
+{
+       struct gr_gk20a *gr = &g->gr;
+       u32 gpm_pd_cfg;
+       u32 pd_ab_dist_cfg0;
+       u32 ds_debug;
+       u32 mpc_vtg_debug;
+       u32 pe_vaf;
+       u32 pe_vsc_vpc;
+
+       nvhost_dbg_fn("");
+
+       gpm_pd_cfg = gk20a_readl(g, gr_gpcs_gpm_pd_cfg_r());
+       pd_ab_dist_cfg0 = gk20a_readl(g, gr_pd_ab_dist_cfg0_r());
+       ds_debug = gk20a_readl(g, gr_ds_debug_r());
+       mpc_vtg_debug = gk20a_readl(g, gr_gpcs_tpcs_mpc_vtg_debug_r());
+
+       if (gr->timeslice_mode == gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v()) {
+               pe_vaf = gk20a_readl(g, gr_gpcs_tpcs_pe_vaf_r());
+               pe_vsc_vpc = gk20a_readl(g, gr_gpcs_tpcs_pes_vsc_vpc_r());
+
+               gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_enable_f() | gpm_pd_cfg;
+               pe_vaf = gr_gpcs_tpcs_pe_vaf_fast_mode_switch_true_f() | pe_vaf;
+               pe_vsc_vpc = gr_gpcs_tpcs_pes_vsc_vpc_fast_mode_switch_true_f() | pe_vsc_vpc;
+               pd_ab_dist_cfg0 = gr_pd_ab_dist_cfg0_timeslice_enable_en_f() | pd_ab_dist_cfg0;
+               ds_debug = gr_ds_debug_timeslice_mode_enable_f() | ds_debug;
+               mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_enabled_f() | mpc_vtg_debug;
+
+               gr_gk20a_ctx_patch_write(g, c, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, patch);
+               gr_gk20a_ctx_patch_write(g, c, gr_gpcs_tpcs_pe_vaf_r(), pe_vaf, patch);
+               gr_gk20a_ctx_patch_write(g, c, gr_gpcs_tpcs_pes_vsc_vpc_r(), pe_vsc_vpc, patch);
+               gr_gk20a_ctx_patch_write(g, c, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, patch);
+               gr_gk20a_ctx_patch_write(g, c, gr_ds_debug_r(), ds_debug, patch);
+               gr_gk20a_ctx_patch_write(g, c, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, patch);
+       } else {
+               gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_disable_f() | gpm_pd_cfg;
+               pd_ab_dist_cfg0 = gr_pd_ab_dist_cfg0_timeslice_enable_dis_f() | pd_ab_dist_cfg0;
+               ds_debug = gr_ds_debug_timeslice_mode_disable_f() | ds_debug;
+               mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_disabled_f() | mpc_vtg_debug;
+
+               gr_gk20a_ctx_patch_write(g, c, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, patch);
+               gr_gk20a_ctx_patch_write(g, c, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, patch);
+               gr_gk20a_ctx_patch_write(g, c, gr_ds_debug_r(), ds_debug, patch);
+               gr_gk20a_ctx_patch_write(g, c, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, patch);
+       }
+
+       return 0;
+}
+
+static int gr_gk20a_setup_rop_mapping(struct gk20a *g,
+                               struct gr_gk20a *gr)
+{
+       u32 norm_entries, norm_shift;
+       u32 coeff5_mod, coeff6_mod, coeff7_mod, coeff8_mod, coeff9_mod, coeff10_mod, coeff11_mod;
+       u32 map0, map1, map2, map3, map4, map5;
+
+       if (!gr->map_tiles)
+               return -1;
+
+       nvhost_dbg_fn("");
+
+       gk20a_writel(g, gr_crstr_map_table_cfg_r(),
+                    gr_crstr_map_table_cfg_row_offset_f(gr->map_row_offset) |
+                    gr_crstr_map_table_cfg_num_entries_f(gr->tpc_count));
+
+       map0 =  gr_crstr_gpc_map0_tile0_f(gr->map_tiles[0]) |
+               gr_crstr_gpc_map0_tile1_f(gr->map_tiles[1]) |
+               gr_crstr_gpc_map0_tile2_f(gr->map_tiles[2]) |
+               gr_crstr_gpc_map0_tile3_f(gr->map_tiles[3]) |
+               gr_crstr_gpc_map0_tile4_f(gr->map_tiles[4]) |
+               gr_crstr_gpc_map0_tile5_f(gr->map_tiles[5]);
+
+       map1 =  gr_crstr_gpc_map1_tile6_f(gr->map_tiles[6]) |
+               gr_crstr_gpc_map1_tile7_f(gr->map_tiles[7]) |
+               gr_crstr_gpc_map1_tile8_f(gr->map_tiles[8]) |
+               gr_crstr_gpc_map1_tile9_f(gr->map_tiles[9]) |
+               gr_crstr_gpc_map1_tile10_f(gr->map_tiles[10]) |
+               gr_crstr_gpc_map1_tile11_f(gr->map_tiles[11]);
+
+       map2 =  gr_crstr_gpc_map2_tile12_f(gr->map_tiles[12]) |
+               gr_crstr_gpc_map2_tile13_f(gr->map_tiles[13]) |
+               gr_crstr_gpc_map2_tile14_f(gr->map_tiles[14]) |
+               gr_crstr_gpc_map2_tile15_f(gr->map_tiles[15]) |
+               gr_crstr_gpc_map2_tile16_f(gr->map_tiles[16]) |
+               gr_crstr_gpc_map2_tile17_f(gr->map_tiles[17]);
+
+       map3 =  gr_crstr_gpc_map3_tile18_f(gr->map_tiles[18]) |
+               gr_crstr_gpc_map3_tile19_f(gr->map_tiles[19]) |
+               gr_crstr_gpc_map3_tile20_f(gr->map_tiles[20]) |
+               gr_crstr_gpc_map3_tile21_f(gr->map_tiles[21]) |
+               gr_crstr_gpc_map3_tile22_f(gr->map_tiles[22]) |
+               gr_crstr_gpc_map3_tile23_f(gr->map_tiles[23]);
+
+       map4 =  gr_crstr_gpc_map4_tile24_f(gr->map_tiles[24]) |
+               gr_crstr_gpc_map4_tile25_f(gr->map_tiles[25]) |
+               gr_crstr_gpc_map4_tile26_f(gr->map_tiles[26]) |
+               gr_crstr_gpc_map4_tile27_f(gr->map_tiles[27]) |
+               gr_crstr_gpc_map4_tile28_f(gr->map_tiles[28]) |
+               gr_crstr_gpc_map4_tile29_f(gr->map_tiles[29]);
+
+       map5 =  gr_crstr_gpc_map5_tile30_f(gr->map_tiles[30]) |
+               gr_crstr_gpc_map5_tile31_f(gr->map_tiles[31]) |
+               gr_crstr_gpc_map5_tile32_f(0) |
+               gr_crstr_gpc_map5_tile33_f(0) |
+               gr_crstr_gpc_map5_tile34_f(0) |
+               gr_crstr_gpc_map5_tile35_f(0);
+
+       gk20a_writel(g, gr_crstr_gpc_map0_r(), map0);
+       gk20a_writel(g, gr_crstr_gpc_map1_r(), map1);
+       gk20a_writel(g, gr_crstr_gpc_map2_r(), map2);
+       gk20a_writel(g, gr_crstr_gpc_map3_r(), map3);
+       gk20a_writel(g, gr_crstr_gpc_map4_r(), map4);
+       gk20a_writel(g, gr_crstr_gpc_map5_r(), map5);
+
+       switch (gr->tpc_count) {
+       case 1:
+               norm_shift = 4;
+               break;
+       case 2:
+       case 3:
+               norm_shift = 3;
+               break;
+       case 4:
+       case 5:
+       case 6:
+       case 7:
+               norm_shift = 2;
+               break;
+       case 8:
+       case 9:
+       case 10:
+       case 11:
+       case 12:
+       case 13:
+       case 14:
+       case 15:
+               norm_shift = 1;
+               break;
+       default:
+               norm_shift = 0;
+               break;
+       }
+
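+       /* norm_entries is tpc_count << norm_shift; the coeffN_mod values
+        * are (1 << N) % norm_entries and are programmed into the WWDX map
+        * table config registers below */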
+       norm_entries = gr->tpc_count << norm_shift;
+       coeff5_mod = (1 << 5) % norm_entries;
+       coeff6_mod = (1 << 6) % norm_entries;
+       coeff7_mod = (1 << 7) % norm_entries;
+       coeff8_mod = (1 << 8) % norm_entries;
+       coeff9_mod = (1 << 9) % norm_entries;
+       coeff10_mod = (1 << 10) % norm_entries;
+       coeff11_mod = (1 << 11) % norm_entries;
+
+       gk20a_writel(g, gr_ppcs_wwdx_map_table_cfg_r(),
+                    gr_ppcs_wwdx_map_table_cfg_row_offset_f(gr->map_row_offset) |
+                    gr_ppcs_wwdx_map_table_cfg_normalized_num_entries_f(norm_entries) |
+                    gr_ppcs_wwdx_map_table_cfg_normalized_shift_value_f(norm_shift) |
+                    gr_ppcs_wwdx_map_table_cfg_coeff5_mod_value_f(coeff5_mod) |
+                    gr_ppcs_wwdx_map_table_cfg_num_entries_f(gr->tpc_count));
+
+       gk20a_writel(g, gr_ppcs_wwdx_map_table_cfg2_r(),
+                    gr_ppcs_wwdx_map_table_cfg2_coeff6_mod_value_f(coeff6_mod) |
+                    gr_ppcs_wwdx_map_table_cfg2_coeff7_mod_value_f(coeff7_mod) |
+                    gr_ppcs_wwdx_map_table_cfg2_coeff8_mod_value_f(coeff8_mod) |
+                    gr_ppcs_wwdx_map_table_cfg2_coeff9_mod_value_f(coeff9_mod) |
+                    gr_ppcs_wwdx_map_table_cfg2_coeff10_mod_value_f(coeff10_mod) |
+                    gr_ppcs_wwdx_map_table_cfg2_coeff11_mod_value_f(coeff11_mod));
+
+       gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map0_r(), map0);
+       gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map1_r(), map1);
+       gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map2_r(), map2);
+       gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map3_r(), map3);
+       gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map4_r(), map4);
+       gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map5_r(), map5);
+
+       gk20a_writel(g, gr_rstr2d_map_table_cfg_r(),
+                    gr_rstr2d_map_table_cfg_row_offset_f(gr->map_row_offset) |
+                    gr_rstr2d_map_table_cfg_num_entries_f(gr->tpc_count));
+
+       gk20a_writel(g, gr_rstr2d_gpc_map0_r(), map0);
+       gk20a_writel(g, gr_rstr2d_gpc_map1_r(), map1);
+       gk20a_writel(g, gr_rstr2d_gpc_map2_r(), map2);
+       gk20a_writel(g, gr_rstr2d_gpc_map3_r(), map3);
+       gk20a_writel(g, gr_rstr2d_gpc_map4_r(), map4);
+       gk20a_writel(g, gr_rstr2d_gpc_map5_r(), map5);
+
+       return 0;
+}
+
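+/* bit-count helpers: n &= n - 1 clears the lowest set bit per iteration */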
+static inline u32 count_bits(u32 mask)
+{
+       u32 temp = mask;
+       u32 count;
+       for (count = 0; temp != 0; count++)
+               temp &= temp - 1;
+
+       return count;
+}
+
+static inline u32 clear_count_bits(u32 num, u32 clear_count)
+{
+       u32 count = clear_count;
+       for (; (num != 0) && (count != 0); count--)
+               num &= num - 1;
+
+       return num;
+}
+
+static int gr_gk20a_setup_alpha_beta_tables(struct gk20a *g,
+                                       struct gr_gk20a *gr)
+{
+       u32 table_index_bits = 5;
+       u32 rows = (1 << table_index_bits);
+       u32 row_stride = gr_pd_alpha_ratio_table__size_1_v() / rows;
+
+       u32 row;
+       u32 index;
+       u32 gpc_index;
+       u32 gpcs_per_reg = 4;
+       u32 pes_index;
+       u32 tpc_count_pes;
+       u32 num_pes_per_gpc = proj_scal_litter_num_pes_per_gpc_v();
+
+       u32 alpha_target, beta_target;
+       u32 alpha_bits, beta_bits;
+       u32 alpha_mask, beta_mask, partial_mask;
+       u32 reg_offset;
+       bool assign_alpha;
+
+       u32 map_alpha[gr_pd_alpha_ratio_table__size_1_v()];
+       u32 map_beta[gr_pd_alpha_ratio_table__size_1_v()];
+       u32 map_reg_used[gr_pd_alpha_ratio_table__size_1_v()];
+
+       nvhost_dbg_fn("");
+
+       memset(map_alpha, 0, gr_pd_alpha_ratio_table__size_1_v() * sizeof(u32));
+       memset(map_beta, 0, gr_pd_alpha_ratio_table__size_1_v() * sizeof(u32));
+       memset(map_reg_used, 0, gr_pd_alpha_ratio_table__size_1_v() * sizeof(u32));
+
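+       /* each table row splits every GPC's TPC mask into an "alpha" part
+        * and a complementary "beta" part; the alpha share grows with the
+        * row index (alpha_target = max(tpc_count * row / rows, 1)) */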
+       for (row = 0; row < rows; ++row) {
+               alpha_target = max_t(u32, gr->tpc_count * row / rows, 1);
+               beta_target = gr->tpc_count - alpha_target;
+
+               assign_alpha = (alpha_target < beta_target);
+
+               for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
+                       reg_offset = (row * row_stride) + (gpc_index / gpcs_per_reg);
+                       alpha_mask = beta_mask = 0;
+
+                       for (pes_index = 0; pes_index < num_pes_per_gpc; pes_index++) {
+                               tpc_count_pes = gr->pes_tpc_count[pes_index][gpc_index];
+
+                               if (assign_alpha) {
+                                       alpha_bits = (alpha_target == 0) ? 0 : tpc_count_pes;
+                                       beta_bits = tpc_count_pes - alpha_bits;
+                               } else {
+                                       beta_bits = (beta_target == 0) ? 0 : tpc_count_pes;
+                                       alpha_bits = tpc_count_pes - beta_bits;
+                               }
+
+                               partial_mask = gr->pes_tpc_mask[pes_index][gpc_index];
+                               partial_mask = clear_count_bits(partial_mask, tpc_count_pes - alpha_bits);
+                               alpha_mask |= partial_mask;
+
+                               partial_mask = gr->pes_tpc_mask[pes_index][gpc_index] ^ partial_mask;
+                               beta_mask |= partial_mask;
+
+                               alpha_target -= min(alpha_bits, alpha_target);
+                               beta_target -= min(beta_bits, beta_target);
+
+                               if ((alpha_bits > 0) || (beta_bits > 0))
+                                       assign_alpha = !assign_alpha;
+                       }
+
+                       switch (gpc_index % gpcs_per_reg) {
+                       case 0:
+                               map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n0_mask_f(alpha_mask);
+                               map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n0_mask_f(beta_mask);
+                               break;
+                       case 1:
+                               map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n1_mask_f(alpha_mask);
+                               map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n1_mask_f(beta_mask);
+                               break;
+                       case 2:
+                               map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n2_mask_f(alpha_mask);
+                               map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n2_mask_f(beta_mask);
+                               break;
+                       case 3:
+                               map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n3_mask_f(alpha_mask);
+                               map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n3_mask_f(beta_mask);
+                               break;
+                       }
+                       map_reg_used[reg_offset] = true;
+               }
+       }
+
+       for (index = 0; index < gr_pd_alpha_ratio_table__size_1_v(); index++) {
+               if (map_reg_used[index]) {
+                       gk20a_writel(g, gr_pd_alpha_ratio_table_r(index), map_alpha[index]);
+                       gk20a_writel(g, gr_pd_beta_ratio_table_r(index), map_beta[index]);
+               }
+       }
+
+       return 0;
+}
+
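+/* program the floorswept configuration: per-TPC SM ids, per-GPC active TPC
+   counts, the PD/DS TPC-per-GPC tables, ROP and alpha/beta mappings, the L2
+   max-ways-evict limit, the PD skip table and the CWD GPC/TPC counts */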
+static int gr_gk20a_ctx_state_floorsweep(struct gk20a *g)
+{
+       struct gr_gk20a *gr = &g->gr;
+       u32 tpc_index, gpc_index;
+       u32 tpc_offset, gpc_offset;
+       u32 sm_id = 0, gpc_id = 0;
+       u32 sm_id_to_gpc_id[proj_scal_max_gpcs_v() * proj_scal_max_tpc_per_gpc_v()];
+       u32 tpc_per_gpc;
+       u32 max_ways_evict = INVALID_MAX_WAYS;
+
+       nvhost_dbg_fn("");
+
+       for (tpc_index = 0; tpc_index < gr->max_tpc_per_gpc_count; tpc_index++) {
+               for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
+                       gpc_offset = proj_gpc_stride_v() * gpc_index;
+                       if (tpc_index < gr->gpc_tpc_count[gpc_index]) {
+                               tpc_offset = proj_tpc_in_gpc_stride_v() * tpc_index;
+
+                               gk20a_writel(g, gr_gpc0_tpc0_sm_cfg_r() + gpc_offset + tpc_offset,
+                                            gr_gpc0_tpc0_sm_cfg_sm_id_f(sm_id));
+                               gk20a_writel(g, gr_gpc0_tpc0_l1c_cfg_smid_r() + gpc_offset + tpc_offset,
+                                            gr_gpc0_tpc0_l1c_cfg_smid_value_f(sm_id));
+                               gk20a_writel(g, gr_gpc0_gpm_pd_sm_id_r(tpc_index) + gpc_offset,
+                                            gr_gpc0_gpm_pd_sm_id_id_f(sm_id));
+                               gk20a_writel(g, gr_gpc0_tpc0_pe_cfg_smid_r() + gpc_offset + tpc_offset,
+                                            gr_gpc0_tpc0_pe_cfg_smid_value_f(sm_id));
+
+                               sm_id_to_gpc_id[sm_id] = gpc_index;
+                               sm_id++;
+                       }
+
+                       gk20a_writel(g, gr_gpc0_gpm_pd_active_tpcs_r() + gpc_offset,
+                                    gr_gpc0_gpm_pd_active_tpcs_num_f(gr->gpc_tpc_count[gpc_index]));
+                       gk20a_writel(g, gr_gpc0_gpm_sd_active_tpcs_r() + gpc_offset,
+                                    gr_gpc0_gpm_sd_active_tpcs_num_f(gr->gpc_tpc_count[gpc_index]));
+               }
+       }
+
+       for (tpc_index = 0, gpc_id = 0;
+            tpc_index < gr_pd_num_tpc_per_gpc__size_1_v();
+            tpc_index++, gpc_id += 8) {
+
+               if (gpc_id >= gr->gpc_count)
+                       gpc_id = 0;
+
+               tpc_per_gpc =
+                       gr_pd_num_tpc_per_gpc_count0_f(gr->gpc_tpc_count[gpc_id + 0]) |
+                       gr_pd_num_tpc_per_gpc_count1_f(gr->gpc_tpc_count[gpc_id + 1]) |
+                       gr_pd_num_tpc_per_gpc_count2_f(gr->gpc_tpc_count[gpc_id + 2]) |
+                       gr_pd_num_tpc_per_gpc_count3_f(gr->gpc_tpc_count[gpc_id + 3]) |
+                       gr_pd_num_tpc_per_gpc_count4_f(gr->gpc_tpc_count[gpc_id + 4]) |
+                       gr_pd_num_tpc_per_gpc_count5_f(gr->gpc_tpc_count[gpc_id + 5]) |
+                       gr_pd_num_tpc_per_gpc_count6_f(gr->gpc_tpc_count[gpc_id + 6]) |
+                       gr_pd_num_tpc_per_gpc_count7_f(gr->gpc_tpc_count[gpc_id + 7]);
+
+               gk20a_writel(g, gr_pd_num_tpc_per_gpc_r(tpc_index), tpc_per_gpc);
+               gk20a_writel(g, gr_ds_num_tpc_per_gpc_r(tpc_index), tpc_per_gpc);
+       }
+
+       /* grSetupPDMapping stubbed for gk20a */
+       gr_gk20a_setup_rop_mapping(g, gr);
+       gr_gk20a_setup_alpha_beta_tables(g, gr);
+
+       if (gr->num_fbps == 1)
+               max_ways_evict = 9;
+
+       if (max_ways_evict != INVALID_MAX_WAYS)
+               gk20a_writel(g, ltc_ltcs_ltss_tstg_set_mgmt_r(),
+                            ((gk20a_readl(g, ltc_ltcs_ltss_tstg_set_mgmt_r()) &
+                              ~(ltc_ltcs_ltss_tstg_set_mgmt_max_ways_evict_last_f(~0))) |
+                             ltc_ltcs_ltss_tstg_set_mgmt_max_ways_evict_last_f(max_ways_evict)));
+
+       for (gpc_index = 0;
+            gpc_index < gr_pd_dist_skip_table__size_1_v() * 4;
+            gpc_index += 4) {
+
+               gk20a_writel(g, gr_pd_dist_skip_table_r(gpc_index/4),
+                            gr_pd_dist_skip_table_gpc_4n0_mask_f(gr->gpc_skip_mask[gpc_index]) |
+                            gr_pd_dist_skip_table_gpc_4n1_mask_f(gr->gpc_skip_mask[gpc_index + 1]) |
+                            gr_pd_dist_skip_table_gpc_4n2_mask_f(gr->gpc_skip_mask[gpc_index + 2]) |
+                            gr_pd_dist_skip_table_gpc_4n3_mask_f(gr->gpc_skip_mask[gpc_index + 3]));
+       }
+
+       gk20a_writel(g, gr_cwd_fs_r(),
+                    gr_cwd_fs_num_gpcs_f(gr->gpc_count) |
+                    gr_cwd_fs_num_tpcs_f(gr->tpc_count));
+
+       gk20a_writel(g, gr_bes_zrop_settings_r(),
+                    gr_bes_zrop_settings_num_active_fbps_f(gr->num_fbps));
+       gk20a_writel(g, gr_bes_crop_settings_r(),
+                    gr_bes_crop_settings_num_active_fbps_f(gr->num_fbps));
+
+       return 0;
+}
+
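+/* ask FECS to save the channel's current context image back to memory */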
+static int gr_gk20a_force_image_save(struct channel_gk20a *c, u32 save_type)
+{
+       struct gk20a *g = c->g;
+       int ret;
+
+       u32 inst_base_ptr =
+               u64_lo32(c->inst_block.cpu_pa) >> ram_in_base_shift_v();
+
+       nvhost_dbg_fn("");
+
+       ret = gr_gk20a_submit_fecs_method(g, 0, 0, 3,
+                       gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
+                       gr_fecs_current_ctx_target_vid_mem_f() |
+                       gr_fecs_current_ctx_valid_f(1), save_type, 0,
+                       GR_IS_UCODE_OP_AND, 1, GR_IS_UCODE_OP_AND, 2);
+       if (ret)
+               nvhost_err(dev_from_gk20a(g), "save context image failed");
+
+       return ret;
+}
+
+/* init global golden image from a fresh gr_ctx in channel ctx.
+   save a copy in local_golden_image in ctx_vars */
+static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
+                                         struct channel_gk20a *c)
+{
+       struct gr_gk20a *gr = &g->gr;
+       struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
+       u32 ctx_header_bytes = ctxsw_prog_fecs_header_v();
+       u32 ctx_header_words;
+       u32 i;
+       u32 data;
+       void *ctx_ptr = NULL;
+       void *gold_ptr = NULL;
+       int err = 0;
+
+       nvhost_dbg_fn("");
+
+       err = gr_gk20a_ctx_bind_first_channel(g, c);
+       if (err)
+               goto clean_up;
+
+       err = gr_gk20a_commit_global_ctx_buffers(g, c, 0);
+       if (err)
+               goto clean_up;
+
+       gold_ptr = mem_op().mmap(gr->global_ctx_buffer[GOLDEN_CTX].ref);
+       if (IS_ERR(gold_ptr)) {
+               err = PTR_ERR(gold_ptr);
+               gold_ptr = NULL;
+               goto clean_up;
+       }
+
+       ctx_ptr = mem_op().mmap(ch_ctx->gr_ctx.mem.ref);
+       if (IS_ERR(ctx_ptr)) {
+               err = PTR_ERR(ctx_ptr);
+               ctx_ptr = NULL;
+               goto clean_up;
+       }
+
+       ctx_header_words =  roundup(ctx_header_bytes, sizeof(u32));
+       ctx_header_words >>= 2;
+
+       for (i = 0; i < ctx_header_words; i++) {
+               data = mem_rd32(ctx_ptr, i);
+               mem_wr32(gold_ptr, i, data);
+       }
+
+       mem_wr32(gold_ptr + ctxsw_prog_main_image_zcull_v(), 0,
+                ctxsw_prog_main_image_zcull_mode_no_ctxsw_v());
+
+       mem_wr32(gold_ptr + ctxsw_prog_main_image_zcull_ptr_v(), 0, 0);
+
+       gr_gk20a_commit_inst(c, ch_ctx->global_ctx_buffer_va[GOLDEN_CTX_VA]);
+
+       gr_gk20a_force_image_save(c, gr_fecs_method_push_adr_wfi_golden_save_f());
+
+       if (gr->ctx_vars.local_golden_image == NULL) {
+
+               gr->ctx_vars.local_golden_image =
+                       kzalloc(gr->ctx_vars.golden_image_size, GFP_KERNEL);
+
+               if (gr->ctx_vars.local_golden_image == NULL) {
+                       err = -ENOMEM;
+                       goto clean_up;
+               }
+
+               for (i = 0; i < gr->ctx_vars.golden_image_size / 4; i++)
+                       gr->ctx_vars.local_golden_image[i] =
+                               mem_rd32(gold_ptr, i);
+       }
+
+       gr->ctx_vars.golden_image_initialized = true;
+
+clean_up:
+       if (err)
+               nvhost_dbg(dbg_fn | dbg_err, "fail");
+       else
+               nvhost_dbg_fn("done");
+
+       if (gold_ptr)
+               mem_op().munmap(gr->global_ctx_buffer[GOLDEN_CTX].ref,
+                               gold_ptr);
+       if (ctx_ptr)
+               mem_op().munmap(ch_ctx->gr_ctx.mem.ref, ctx_ptr);
+
+       return err;
+}
+
+/* load the saved copy of the golden image into the channel's gr_ctx */
+static int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
+                                       struct channel_gk20a *c)
+{
+       struct gr_gk20a *gr = &g->gr;
+       struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
+       u32 virt_addr_lo;
+       u32 virt_addr_hi;
+       u32 i;
+       int ret = 0;
+       void *ctx_ptr = NULL;
+
+       nvhost_dbg_fn("");
+
+       if (gr->ctx_vars.local_golden_image == NULL)
+               return -1;
+
+       ctx_ptr = mem_op().mmap(ch_ctx->gr_ctx.mem.ref);
+       if (IS_ERR(ctx_ptr))
+               return -ENOMEM;
+
+       for (i = 0; i < gr->ctx_vars.golden_image_size / 4; i++)
+               mem_wr32(ctx_ptr, i, gr->ctx_vars.local_golden_image[i]);
+
+       mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_save_ops_v(), 0, 0);
+       mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_restore_ops_v(), 0, 0);
+
+       virt_addr_lo = u64_lo32(ch_ctx->patch_ctx.gpu_va);
+       virt_addr_hi = u64_hi32(ch_ctx->patch_ctx.gpu_va);
+
+       mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_count_v(), 0,
+                ch_ctx->patch_ctx.data_count);
+       mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_adr_lo_v(), 0,
+                virt_addr_lo);
+       mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_adr_hi_v(), 0,
+                virt_addr_hi);
+
+       mem_op().munmap(ch_ctx->gr_ctx.mem.ref, ctx_ptr);
+
+       /* gr_gk20a_ctx_zcull_setup(g, c, false); */
+       gr_gk20a_ctx_pm_setup(g, c, false);
+
+       if (CONFIG_GK20A_SIM /*|| IS_RTLSIM()*/) {
+               u32 inst_base_ptr =
+                       u64_lo32(c->inst_block.cpu_pa) >> ram_in_base_shift_v();
+
+               ret = gr_gk20a_submit_fecs_method(g, 0, 0, ~0,
+                               gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
+                               gr_fecs_current_ctx_target_vid_mem_f() |
+                               gr_fecs_current_ctx_valid_f(1),
+                               gr_fecs_method_push_adr_restore_golden_f(), 0,
+                               GR_IS_UCODE_OP_EQUAL, gr_fecs_ctxsw_mailbox_value_pass_v(),
+                               GR_IS_UCODE_OP_SKIP, 0);
+               if (ret)
+                       nvhost_err(dev_from_gk20a(g),
+                                  "restore context image failed");
+       }
+
+       return ret;
+}
+
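+/* start the ctxsw falcons: clear mailbox 0, drop the require_ctx condition
+   and kick off the GPCCS and FECS CPUs */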
+static void gr_gk20a_start_falcon_ucode(struct gk20a *g)
+{
+       nvhost_dbg_fn("");
+
+       gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0),
+                    gr_fecs_ctxsw_mailbox_clear_value_f(~0));
+
+       gk20a_writel(g, gr_gpccs_dmactl_r(), gr_gpccs_dmactl_require_ctx_f(0));
+       gk20a_writel(g, gr_fecs_dmactl_r(), gr_fecs_dmactl_require_ctx_f(0));
+
+       gk20a_writel(g, gr_gpccs_cpuctl_r(), gr_gpccs_cpuctl_startcpu_f(1));
+       gk20a_writel(g, gr_fecs_cpuctl_r(), gr_fecs_cpuctl_startcpu_f(1));
+
+       nvhost_dbg_fn("done");
+}
+
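+/* load the FECS/GPCCS ucode into falcon DMEM/IMEM, start it, wait for the
+   init handshake and set the FECS watchdog timeout */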
+static int gr_gk20a_load_ctxsw_ucode(struct gk20a *g, struct gr_gk20a *gr)
+{
+       int ret;
+
+       nvhost_dbg_fn("");
+
+       if (CONFIG_GK20A_SIM) { /* fmodel */
+               gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(7),
+                       gr_fecs_ctxsw_mailbox_value_f(0xc0de7777));
+               gk20a_writel(g, gr_gpccs_ctxsw_mailbox_r(7),
+                       gr_gpccs_ctxsw_mailbox_value_f(0xc0de7777));
+       }
+
+       gr_gk20a_load_falcon_dmem(g);
+       gr_gk20a_load_falcon_imem(g);
+
+       gr_gk20a_start_falcon_ucode(g);
+
+       ret = gr_gk20a_ctx_wait_ucode(g, 0, 0,
+                                     GR_IS_UCODE_OP_EQUAL,
+                                     eUcodeHandshakeInitComplete,
+                                     GR_IS_UCODE_OP_SKIP, 0);
+       if (ret) {
+               nvhost_err(dev_from_gk20a(g), "falcon ucode init timeout");
+               return ret;
+       }
+
+       gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0xffffffff);
+       gk20a_writel(g, gr_fecs_method_data_r(), 0x7fffffff);
+       gk20a_writel(g, gr_fecs_method_push_r(),
+                    gr_fecs_method_push_adr_set_watchdog_timeout_f());
+
+       nvhost_dbg_fn("done");
+       return 0;
+}
+
+#define PRI_GPCCS_ADDR_WIDTH 15
+#define CTXSW_UCODE_HEADER_SIZE_IN_BYTES 256
+
+#define PRI_GPCCS_ADDR_MASK(addr)      ((addr) & ((1 << PRI_GPCCS_ADDR_WIDTH) - 1))
+#define PRI_GPC_ADDR(addr, gpc)                (proj_gpc_base_v()+((gpc)*proj_gpc_stride_v())+(addr))
+
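+/* build the ctxsw image header: a 256-byte main header followed by one
+   256-byte local header for FECS and one per GPC, recording register list
+   offsets/sizes, ramchain info and the total image size */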
+static int gr_gk20a_create_ctx_header(struct gk20a *g, u32 *header)
+{
+       u32 *header_curr;
+       u32 num_gpcs;
+       u32 num_tpcs;
+       u32 num_ppcs;
+       u32 tpc_id_mask;
+       u32 ppc_mask;
+       u32 rc_offset, rc_size;
+       u32 num_fecs_ramchains;
+       u32 num_gpc_ramchains;
+       u32 sys_priv_size;
+       u32 sys_priv_offset;
+       u32 gpc_priv_size;
+       u32 gpc_priv_offset;
+       u32 fecs_image_size;
+       u32 gpc_image_size;
+       u32 total_image_size;
+       u32 lane, gpc, ppc;
+       u32 addr, words, bytes;
+       u32 litter_num_pes_per_gpc;
+
+       if (!g->gr.ctx_vars.valid)
+               return -1;
+
+       nvhost_dbg_fn("");
+
+       if (CONFIG_GK20A_SIM) { /* fmodel */
+               num_gpcs = g->gr.gpc_count;
+       } else {
+               num_gpcs = gk20a_readl(g, gr_fecs_fs_r());
+               num_gpcs = gr_fecs_fs_num_available_gpcs_v(num_gpcs);
+       }
+
+       header_curr = header;
+
+       header_curr[ctxsw_prog_main_image_num_gpc_v() >> 2] = num_gpcs;
+       header_curr[ctxsw_prog_main_image_magic_value_v() >> 2] =
+               ctxsw_prog_main_image_magic_value_v_value_f();
+
+       fecs_image_size = g->gr.ctx_vars.ctxsw_regs.sys.count << 2;
+       fecs_image_size = ((fecs_image_size + 255) & ~255);
+
+       sys_priv_size = fecs_image_size >> 8;
+       sys_priv_offset = 2 + num_gpcs;
+
+       header_curr += (CTXSW_UCODE_HEADER_SIZE_IN_BYTES >> 2);
+       header_curr[ctxsw_prog_local_reg_ctl_v() >> 2] =
+               ctxsw_prog_local_reg_ctl_offset_f(sys_priv_offset) |
+               ctxsw_prog_local_reg_ctl_size_f(sys_priv_size);
+       header_curr[ctxsw_prog_local_magic_value_v() >> 2] =
+               ctxsw_prog_local_magic_value_v_value_f();
+
+       if (!CONFIG_GK20A_SIM) {
+               rc_offset = 0;
+               rc_size = 0;
+
+               num_fecs_ramchains = gr_fecs_rc_lanes_num_chains_v(
+                       gk20a_readl(g, gr_fecs_rc_lanes_r()));
+
+               header_curr[ctxsw_prog_local_image_ctl_v() >> 2] =
+                       ctxsw_prog_local_image_ctl_num_ramchains_f(num_fecs_ramchains);
+
+               for (lane = 0; lane < num_fecs_ramchains; lane++) {
+                       rc_offset += (rc_size >> 8);
+
+                       gk20a_writel(g, gr_fecs_falcon_addr_r(), lane);
+                       words = gr_fecs_rc_lane_size_v_v(
+                                       gk20a_readl(g, gr_fecs_rc_lane_size_r(0)));
+                       header_curr[ctxsw_prog_local_ramchain_save_v(lane) >> 2] =
+                               ctxsw_prog_local_ramchain_save_words_f(words);
+                       bytes = words << 2;
+
+                       if (bytes)
+                               header_curr[ctxsw_prog_local_ramchain_ctl_v(lane) >> 2] =
+                                       ctxsw_prog_local_ramchain_ctl_offset_f(rc_offset);
+                       else
+                               header_curr[ctxsw_prog_local_ramchain_ctl_v(lane) >> 2] =
+                                       ctxsw_prog_local_ramchain_ctl_offset_f(0);
+
+                       rc_size = (bytes + 0xFF) & ~0xFF;
+                       fecs_image_size += rc_size;
+               }
+       }
+
+       header_curr[ctxsw_prog_local_image_size_v() >> 2] = fecs_image_size;
+       total_image_size = fecs_image_size + 256 + 256 + num_gpcs * 256;
+
+       litter_num_pes_per_gpc = proj_scal_litter_num_pes_per_gpc_v();
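+       /* one 256-byte local header per GPC follows the FECS local header */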
+       for (gpc = 0; gpc < num_gpcs; gpc++) {
+
+               header_curr += (CTXSW_UCODE_HEADER_SIZE_IN_BYTES >> 2);
+
+               addr = PRI_GPC_ADDR(PRI_GPCCS_ADDR_MASK(gr_gpc0_fs_gpc_r()), gpc);
+               num_tpcs = gr_gpc0_fs_gpc_num_available_tpcs_v(
+                               gk20a_readl(g, addr));
+
+               if (litter_num_pes_per_gpc > 1) {
+                       num_ppcs = 0;
+                       ppc_mask = 0;
+                       for (ppc = 0; ppc < litter_num_pes_per_gpc; ppc++) {
+                               addr = PRI_GPC_ADDR(PRI_GPCCS_ADDR_MASK(
+                                       gr_gpc0_gpm_pd_pes_tpc_id_mask_r(ppc)), gpc);
+                               tpc_id_mask = gr_gpc0_gpm_pd_pes_tpc_id_mask_mask_v(
+                                       gk20a_readl(g, addr));
+                               if (tpc_id_mask) {
+                                       num_ppcs++;
+                                       ppc_mask |= (1 << ppc);
+                               }
+                       }
+                       header_curr[ctxsw_prog_local_image_ppc_info_v() >> 2] =
+                               ctxsw_prog_local_image_ppc_info_ppc_mask_f(ppc_mask) |
+                               ctxsw_prog_local_image_ppc_info_num_ppcs_f(num_ppcs);
+               }
+
+               gpc_priv_offset = total_image_size >> 8;
+               gpc_image_size = (g->gr.ctx_vars.ctxsw_regs.gpc.count +
+                                 g->gr.ctx_vars.ctxsw_regs.tpc.count * num_tpcs) << 2;
+               gpc_image_size = ((gpc_image_size + 0xFF) & ~0xFF);
+               gpc_priv_size = gpc_image_size >> 8;
+
+               header_curr[ctxsw_prog_local_reg_ctl_v() >> 2] =
+                       ctxsw_prog_local_reg_ctl_offset_f(gpc_priv_offset) |
+                       ctxsw_prog_local_reg_ctl_size_f(gpc_priv_size);
+
+               header_curr[ctxsw_prog_local_image_num_tpcs_v() >> 2] =
+                       num_tpcs;
+               header_curr[ctxsw_prog_local_magic_value_v() >> 2] =
+                       ctxsw_prog_local_magic_value_v_value_f();
+
+               if (!CONFIG_GK20A_SIM) {
+                       rc_offset = 0;
+                       rc_size = 0;
+
+                       addr = PRI_GPC_ADDR(PRI_GPCCS_ADDR_MASK(
+                               gr_gpccs_rc_lanes_r()), gpc);
+                       num_gpc_ramchains = gr_gpccs_rc_lanes_num_chains_v(
+                               gk20a_readl(g, addr));
+
+                       header_curr[ctxsw_prog_local_image_ctl_v() >> 2] =
+                               ctxsw_prog_local_image_ctl_num_ramchains_f(num_gpc_ramchains);
+
+                       for (lane = 0; lane < num_gpc_ramchains; lane++) {
+                               rc_offset += rc_size >> 8;
+
+                               addr = PRI_GPC_ADDR(PRI_GPCCS_ADDR_MASK(
+                                               gr_gpccs_falcon_addr_r()), gpc);
+                               gk20a_writel(g, addr, lane);
+
+                               addr = PRI_GPC_ADDR(PRI_GPCCS_ADDR_MASK(
+                                               gr_gpccs_rc_lane_size_r(0)), gpc);
+                               words = gr_gpccs_rc_lane_size_v_v(
+                                               gk20a_readl(g, addr));
+
+                               header_curr[ctxsw_prog_local_ramchain_save_v(lane) >> 2] =
+                                       ctxsw_prog_local_ramchain_save_words_f(words);
+                               bytes = words << 2;
+
+                               if (bytes)
+                                       header_curr[ctxsw_prog_local_ramchain_ctl_v(lane) >> 2] =
+                                               ctxsw_prog_local_ramchain_ctl_offset_f(rc_offset);
+                               else
+                                       header_curr[ctxsw_prog_local_ramchain_ctl_v(lane) >> 2] =
+                                               ctxsw_prog_local_ramchain_ctl_offset_f(0);
+
+                               rc_size = (bytes + 0xFF) & ~0xFF;
+                               gpc_image_size += rc_size;
+                       }
+               }
+
+               header_curr[ctxsw_prog_local_image_size_v() >> 2] = gpc_image_size;
+               total_image_size += gpc_image_size;
+       }
+
+       header[ctxsw_prog_main_image_size_v() >> 2] = total_image_size;
+
+       return 0;
+}
+
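+/* query FECS for the golden, zcull and pm context image sizes and set up a
+   temporary header used for context overrides */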
+static int gr_gk20a_init_ctx_state(struct gk20a *g, struct gr_gk20a *gr)
+{
+       u32 golden_ctx_image_size = 0;
+       u32 zcull_ctx_image_size = 0;
+       u32 pm_ctx_image_size = 0;
+       u32 ret;
+
+       nvhost_dbg_fn("");
+
+       if (g->gr.ctx_vars.golden_image_size)
+               return 0;
+
+       /* 256 bytes hdr + 256 bytes FECS + numGpc * 256 bytes GPCCS */
+       gr->ctx_vars.buffer_header_size = 256 + 256 + 256 * gr->gpc_count;
+
+       ret = gr_gk20a_submit_fecs_method(g, 0, 0, ~0, 0,
+                       gr_fecs_method_push_adr_discover_image_size_f(),
+                       &golden_ctx_image_size,
+                       GR_IS_UCODE_OP_NOT_EQUAL, 0, GR_IS_UCODE_OP_SKIP, 0);
+       if (ret) {
+               nvhost_err(dev_from_gk20a(g),
+                          "query golden image size failed");
+               return ret;
+       }
+
+       ret = gr_gk20a_submit_fecs_method(g, 0, 0, ~0, 0,
+                       gr_fecs_method_push_adr_discover_zcull_image_size_f(),
+                       &zcull_ctx_image_size,
+                       GR_IS_UCODE_OP_NOT_EQUAL, 0, GR_IS_UCODE_OP_SKIP, 0);
+       if (ret) {
+               nvhost_err(dev_from_gk20a(g),
+                          "query zcull ctx image size failed");
+               return ret;
+       }
+
+       ret = gr_gk20a_submit_fecs_method(g, 0, 0, ~0, 0,
+                       gr_fecs_method_push_adr_discover_pm_image_size_f(),
+                       &pm_ctx_image_size,
+                       GR_IS_UCODE_OP_NOT_EQUAL, 0, GR_IS_UCODE_OP_SKIP, 0);
+       if (ret) {
+               nvhost_err(dev_from_gk20a(g),
+                          "query pm ctx image size failed");
+               return ret;
+       }
+
+       g->gr.ctx_vars.golden_image_size = golden_ctx_image_size;
+       g->gr.ctx_vars.zcull_ctxsw_image_size = zcull_ctx_image_size;
+
+       /* create a temp header for ctx override */
+       if (!gr->temp_ctx_header) {
+               gr->temp_ctx_header =
+                       kzalloc(gr->ctx_vars.buffer_header_size, GFP_KERNEL);
+               if (!gr->temp_ctx_header)
+                       return -ENOMEM;
+       }
+
+       gr_gk20a_create_ctx_header(g, (u32 *)gr->temp_ctx_header);
+
+       nvhost_dbg_fn("done");
+       return 0;
+}
+
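+/* allocate the global context buffers shared by all channels: circular
+   (bundle) buffer, pagepool, attribute buffers (normal and VPR) and the
+   golden context image */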
+static int gr_gk20a_alloc_global_ctx_buffers(struct gk20a *g)
+{
+       struct gr_gk20a *gr = &g->gr;
+       struct mem_mgr *memmgr = mem_mgr_from_g(g);
+       struct mem_handle *mem;
+       u32 i, attr_buffer_size;
+
+       u32 cb_buffer_size = gr_scc_bundle_cb_size_div_256b__prod_v() *
+               gr_scc_bundle_cb_size_div_256b_byte_granularity_v();
+
+       u32 pagepool_buffer_size = gr_scc_pagepool_total_pages_hwmax_value_v() *
+               gr_scc_pagepool_total_pages_byte_granularity_v();
+
+       u32 attr_cb_default_size = gr_gpc0_ppc0_cbm_cfg_size_default_v();
+       u32 alpha_cb_default_size = gr_gpc0_ppc0_cbm_cfg2_size_default_v();
+
+       u32 attr_cb_size =
+               attr_cb_default_size + (attr_cb_default_size >> 1);
+       u32 alpha_cb_size =
+               alpha_cb_default_size + (alpha_cb_default_size >> 1);
+
+       u32 num_tpcs_per_pes = proj_scal_litter_num_tpcs_per_pes_v();
+       u32 attr_max_size_per_tpc =
+               gr_gpc0_ppc0_cbm_cfg_size_v(~0) / num_tpcs_per_pes;
+       u32 alpha_max_size_per_tpc =
+               gr_gpc0_ppc0_cbm_cfg2_size_v(~0) / num_tpcs_per_pes;
+
+
+       nvhost_dbg_fn("");
+
+       attr_cb_size =
+               (attr_cb_size > attr_max_size_per_tpc) ?
+                       attr_max_size_per_tpc : attr_cb_size;
+       attr_cb_default_size =
+               (attr_cb_default_size > attr_cb_size) ?
+                       attr_cb_size : attr_cb_default_size;
+       alpha_cb_size =
+               (alpha_cb_size > alpha_max_size_per_tpc) ?
+                       alpha_max_size_per_tpc : alpha_cb_size;
+       alpha_cb_default_size =
+               (alpha_cb_default_size > alpha_cb_size) ?
+                       alpha_cb_size : alpha_cb_default_size;
+
+       attr_buffer_size =
+               (gr_gpc0_ppc0_cbm_cfg_size_granularity_v() * attr_cb_size +
+                gr_gpc0_ppc0_cbm_cfg2_size_granularity_v() * alpha_cb_size) *
+                gr->gpc_count;
+
+       nvhost_dbg_info("cb_buffer_size : %d", cb_buffer_size);
+
+       mem = mem_op().alloc(memmgr, cb_buffer_size,
+                         DEFAULT_NVMAP_ALLOC_ALIGNMENT,
+                         DEFAULT_NVMAP_ALLOC_FLAGS,
+                         NVMAP_HEAP_CARVEOUT_GENERIC);
+       if (IS_ERR_OR_NULL(mem))
+               goto clean_up;
+
+       gr->global_ctx_buffer[CIRCULAR].ref = mem;
+       gr->global_ctx_buffer[CIRCULAR].size = cb_buffer_size;
+
+       nvhost_dbg_info("pagepool_buffer_size : %d", pagepool_buffer_size);
+
+       mem = mem_op().alloc(memmgr, pagepool_buffer_size,
+                         DEFAULT_NVMAP_ALLOC_ALIGNMENT,
+                         DEFAULT_NVMAP_ALLOC_FLAGS,
+                         NVMAP_HEAP_CARVEOUT_GENERIC);
+       if (IS_ERR_OR_NULL(mem))
+               goto clean_up;
+
+       gr->global_ctx_buffer[PAGEPOOL].ref = mem;
+       gr->global_ctx_buffer[PAGEPOOL].size = pagepool_buffer_size;
+
+       nvhost_dbg_info("attr_buffer_size : %d", attr_buffer_size);
+
+       mem = mem_op().alloc(memmgr, attr_buffer_size,
+                         DEFAULT_NVMAP_ALLOC_ALIGNMENT,
+                         DEFAULT_NVMAP_ALLOC_FLAGS,
+                         NVMAP_HEAP_CARVEOUT_GENERIC);
+       if (IS_ERR_OR_NULL(mem))
+               goto clean_up;
+
+       gr->global_ctx_buffer[ATTRIBUTE].ref = mem;
+       gr->global_ctx_buffer[ATTRIBUTE].size = attr_buffer_size;
+
+       mem = mem_op().alloc(memmgr, attr_buffer_size,
+                         DEFAULT_NVMAP_ALLOC_ALIGNMENT,
+                         DEFAULT_NVMAP_ALLOC_FLAGS,
+                         NVMAP_HEAP_CARVEOUT_GENERIC); /* TBD: use NVMAP_HEAP_CARVEOUT_VPR */
+       if (IS_ERR_OR_NULL(mem))
+               goto clean_up;
+
+       gr->global_ctx_buffer[ATTRIBUTE_VPR].ref = mem;
+       gr->global_ctx_buffer[ATTRIBUTE_VPR].size = attr_buffer_size;
+
+       nvhost_dbg_info("golden_image_size : %d",
+                  gr->ctx_vars.golden_image_size);
+
+       mem = mem_op().alloc(memmgr, gr->ctx_vars.golden_image_size,
+                         DEFAULT_NVMAP_ALLOC_ALIGNMENT,
+                         DEFAULT_NVMAP_ALLOC_FLAGS,
+                         NVMAP_HEAP_CARVEOUT_GENERIC);
+       if (IS_ERR_OR_NULL(mem))
+               goto clean_up;
+
+       gr->global_ctx_buffer[GOLDEN_CTX].ref = mem;
+       gr->global_ctx_buffer[GOLDEN_CTX].size =
+               gr->ctx_vars.golden_image_size;
+
+       nvhost_dbg_fn("done");
+       return 0;
+
+ clean_up:
+       nvhost_dbg(dbg_fn | dbg_err, "fail");
+       for (i = 0; i < NR_GLOBAL_CTX_BUF; i++) {
+               if (gr->global_ctx_buffer[i].ref) {
+                       mem_op().put(memmgr,
+                               gr->global_ctx_buffer[i].ref);
+                       memset(&gr->global_ctx_buffer[i],
+                               0, sizeof(struct mem_desc));
+               }
+       }
+       return -ENOMEM;
+}
+
+static void gr_gk20a_free_global_ctx_buffers(struct gk20a *g)
+{
+       struct gr_gk20a *gr = &g->gr;
+       struct mem_mgr *memmgr = mem_mgr_from_g(g);
+       u32 i;
+
+       for (i = 0; i < NR_GLOBAL_CTX_BUF; i++) {
+               mem_op().put(memmgr, gr->global_ctx_buffer[i].ref);
+               memset(&gr->global_ctx_buffer[i], 0, sizeof(struct mem_desc));
+       }
+}
+
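+/* map the global context buffers into the channel's GPU address space */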
+static int gr_gk20a_map_global_ctx_buffers(struct gk20a *g,
+                                       struct channel_gk20a *c)
+{
+       struct vm_gk20a *ch_vm = c->vm;
+       struct mem_mgr *memmgr = mem_mgr_from_g(g);
+       struct mem_handle *handle_ref;
+       u32 *g_bfr_va = c->ch_ctx.global_ctx_buffer_va;
+       struct gr_gk20a *gr = &g->gr;
+       u64 gpu_va;
+       u32 i;
+       nvhost_dbg_fn("");
+
+       gpu_va = ch_vm->map(ch_vm, memmgr,
+                           gr->global_ctx_buffer[CIRCULAR].ref,
+                           0, 0, 0 /*offset_align, flags, kind*/);
+       if (!gpu_va)
+               goto clean_up;
+       g_bfr_va[CIRCULAR_VA] = gpu_va;
+
+       if (!c->vpr)
+               handle_ref = gr->global_ctx_buffer[ATTRIBUTE].ref;
+       else
+               handle_ref = gr->global_ctx_buffer[ATTRIBUTE_VPR].ref;
+
+       gpu_va = ch_vm->map(ch_vm, memmgr, handle_ref,
+                           0, 0, 0 /*offset_align, flags, kind*/);
+       if (!gpu_va)
+               goto clean_up;
+       g_bfr_va[ATTRIBUTE_VA] = gpu_va;
+
+       gpu_va = ch_vm->map(ch_vm, memmgr,
+                           gr->global_ctx_buffer[PAGEPOOL].ref,
+                           0, 0, 0/*offset_align, flags, kind*/);
+       if (!gpu_va)
+               goto clean_up;
+       g_bfr_va[PAGEPOOL_VA] = gpu_va;
+
+       gpu_va = ch_vm->map(ch_vm, memmgr,
+                           gr->global_ctx_buffer[GOLDEN_CTX].ref,
+                           0, 0, 0 /*offset_align, flags, kind*/);
+       if (!gpu_va)
+               goto clean_up;
+       g_bfr_va[GOLDEN_CTX_VA] = gpu_va;
+
+       c->ch_ctx.global_ctx_buffer_mapped = true;
+       return 0;
+
+ clean_up:
+       for (i = 0; i < NR_GLOBAL_CTX_BUF_VA; i++) {
+               if (g_bfr_va[i]) {
+                       ch_vm->unmap(ch_vm, g_bfr_va[i]);
+                       g_bfr_va[i] = 0;
+               }
+       }
+       return -ENOMEM;
+}
+
+static void gr_gk20a_unmap_global_ctx_buffers(struct channel_gk20a *c)
+{
+       struct vm_gk20a *ch_vm = c->vm;
+       u32 *g_bfr_va = c->ch_ctx.global_ctx_buffer_va;
+       u32 i;
+
+       nvhost_dbg_fn("");
+
+       for (i = 0; i < NR_GLOBAL_CTX_BUF_VA; i++) {
+               if (g_bfr_va[i]) {
+                       ch_vm->unmap(ch_vm, g_bfr_va[i]);
+                       g_bfr_va[i] = 0;
+               }
+       }
+       c->ch_ctx.global_ctx_buffer_mapped = false;
+}
+
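+/* allocate the per-channel graphics context buffer and map it into the
+   channel's GPU address space */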
+static int gr_gk20a_alloc_channel_gr_ctx(struct gk20a *g,
+                               struct channel_gk20a *c)
+{
+       struct gr_gk20a *gr = &g->gr;
+       struct gr_ctx_desc *gr_ctx = &c->ch_ctx.gr_ctx;
+       struct mem_mgr *memmgr = gk20a_channel_mem_mgr(c);
+       struct vm_gk20a *ch_vm = c->vm;
+
+       nvhost_dbg_fn("");
+
+       if (gr->ctx_vars.buffer_size == 0)
+               return 0;
+
+       /* alloc channel gr ctx buffer */
+       gr->ctx_vars.buffer_size = gr->ctx_vars.golden_image_size;
+       gr->ctx_vars.buffer_total_size = gr->ctx_vars.golden_image_size;
+
+       gr_ctx->mem.ref = mem_op().alloc(memmgr,
+                               gr->ctx_vars.buffer_total_size,
+                               DEFAULT_NVMAP_ALLOC_ALIGNMENT,
+                               DEFAULT_NVMAP_ALLOC_FLAGS,
+                               NVMAP_HEAP_CARVEOUT_GENERIC);
+
+       if (IS_ERR(gr_ctx->mem.ref))
+               return -ENOMEM;
+
+       gr_ctx->gpu_va = ch_vm->map(ch_vm, memmgr,
+               gr_ctx->mem.ref, 0, 0, 0 /*offset_align, flags, kind*/);
+       if (!gr_ctx->gpu_va) {
+               mem_op().put(memmgr, gr_ctx->mem.ref);
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+
+static void gr_gk20a_free_channel_gr_ctx(struct channel_gk20a *c)
+{
+       struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
+       struct mem_mgr *ch_nvmap = gk20a_channel_mem_mgr(c);
+       struct vm_gk20a *ch_vm = c->vm;
+
+       nvhost_dbg_fn("");
+
+       ch_vm->unmap(ch_vm, ch_ctx->gr_ctx.gpu_va);
+       mem_op().put(ch_nvmap, ch_ctx->gr_ctx.mem.ref);
+}
+
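+/* allocate and map the per-channel patch context buffer (128 words) */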
+static int gr_gk20a_alloc_channel_patch_ctx(struct gk20a *g,
+                               struct channel_gk20a *c)
+{
+       struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
+       struct mem_mgr *memmgr = gk20a_channel_mem_mgr(c);
+       struct vm_gk20a *ch_vm = c->vm;
+
+       nvhost_dbg_fn("");
+
+       patch_ctx->mem.ref = mem_op().alloc(memmgr, 128 * sizeof(u32),
+                               DEFAULT_NVMAP_ALLOC_ALIGNMENT,
+                               DEFAULT_NVMAP_ALLOC_FLAGS,
+                               NVMAP_HEAP_CARVEOUT_GENERIC);
+       if (IS_ERR(patch_ctx->mem.ref))
+               return -ENOMEM;
+
+       patch_ctx->gpu_va = ch_vm->map(ch_vm, memmgr,
+                               patch_ctx->mem.ref,
+                               0, 0, 0 /*offset_align, flags, kind*/);
+       if (!patch_ctx->gpu_va)
+               goto clean_up;
+
+       nvhost_dbg_fn("done");
+       return 0;
+
+ clean_up:
+       nvhost_dbg(dbg_fn | dbg_err, "fail");
+       if (patch_ctx->mem.ref) {
+               mem_op().put(memmgr, patch_ctx->mem.ref);
+               patch_ctx->mem.ref = 0;
+       }
+
+       return -ENOMEM;
+}
+
+static void gr_gk20a_unmap_channel_patch_ctx(struct channel_gk20a *c)
+{
+       struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
+       struct vm_gk20a *ch_vm = c->vm;
+
+       nvhost_dbg_fn("");
+
+       ch_vm->unmap(ch_vm, patch_ctx->gpu_va);
+       patch_ctx->gpu_va = 0;
+       patch_ctx->data_count = 0;
+}
+
+static void gr_gk20a_free_channel_patch_ctx(struct channel_gk20a *c)
+{
+       struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
+       struct mem_mgr *memmgr = gk20a_channel_mem_mgr(c);
+
+       nvhost_dbg_fn("");
+
+       gr_gk20a_unmap_channel_patch_ctx(c);
+
+       if (patch_ctx->mem.ref) {
+               mem_op().put(memmgr, patch_ctx->mem.ref);
+               patch_ctx->mem.ref = 0;
+       }
+}
+
+void gk20a_free_channel_ctx(struct channel_gk20a *c)
+{
+       gr_gk20a_unmap_global_ctx_buffers(c);
+       gr_gk20a_free_channel_patch_ctx(c);
+       gr_gk20a_free_channel_gr_ctx(c);
+
+       /* zcull_ctx, pm_ctx */
+
+       memset(&c->ch_ctx, 0, sizeof(struct channel_ctx_gk20a));
+
+       c->num_objects = 0;
+       c->first_init = false;
+}
+
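+/* allocate an object context on a channel: set up the gr ctx, patch ctx and
+   global buffer mappings, then create the golden context image on first use
+   and load it into the channel's gr ctx */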
+int gk20a_alloc_obj_ctx(struct channel_gk20a  *c,
+                       struct nvhost_alloc_obj_ctx_args *args)
+{
+       struct gk20a *g = c->g;
+       struct gr_gk20a *gr = &g->gr;
+       struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
+       bool change_to_compute_mode = false;
+       int err = 0;
+
+       nvhost_dbg_fn("");
+
+       /* an address space needs to have been bound at this point.*/
+       if (!gk20a_channel_as_bound(c)) {
+               nvhost_err(dev_from_gk20a(g),
+                          "not bound to address space at time"
+                          " of grctx allocation");
+               return -EINVAL;
+       }
+
+       switch (args->class_num) {
+       case KEPLER_COMPUTE_A:
+               /* tbd: NV2080_CTRL_GPU_COMPUTE_MODE_RULES_EXCLUSIVE_COMPUTE */
+               /* tbd: PDB_PROP_GRAPHICS_DISTINCT_3D_AND_COMPUTE_STATE_DEF  */
+               change_to_compute_mode = true;
+               break;
+       case KEPLER_C:
+       case FERMI_TWOD_A:
+       case KEPLER_DMA_COPY_A:
+               break;
+
+       default:
+               nvhost_err(dev_from_gk20a(g),
+                          "invalid obj class 0x%x", args->class_num);
+               err = -EINVAL;
+               goto out;
+       }
+
+       /* allocate gr ctx buffer */
+       if (ch_ctx->gr_ctx.mem.ref == NULL) {
+               err = gr_gk20a_alloc_channel_gr_ctx(g, c);
+               if (err) {
+                       nvhost_err(dev_from_gk20a(g),
+                               "fail to allocate gr ctx buffer");
+                       goto out;
+               }
+       } else {
+               /* TBD: needs to be more subtle about which is being allocated,
+                * as some are allowed to be allocated on the same channel */
+               nvhost_err(dev_from_gk20a(g),
+                       "too many classes alloc'd on same channel");
+               err = -EINVAL;
+               goto out;
+       }
+
+       /* commit gr ctx buffer */
+       err = gr_gk20a_commit_inst(c, ch_ctx->gr_ctx.gpu_va);
+       if (err) {
+               nvhost_err(dev_from_gk20a(g),
+                       "fail to commit gr ctx buffer");
+               goto out;
+       }
+
+       /* set misc. might be possible to move around later */
+       ch_ctx->pm_ctx.ctx_sw_mode =
+               ctxsw_prog_main_image_pm_mode_no_ctxsw_v();
+
+       /* allocate patch buffer */
+       if (ch_ctx->patch_ctx.mem.ref == NULL) {
+               err = gr_gk20a_alloc_channel_patch_ctx(g, c);
+               if (err) {
+                       nvhost_err(dev_from_gk20a(g),
+                               "fail to allocate patch buffer");
+                       goto out;
+               }
+       }
+
+       /* map global buffer to channel gpu_va and commit */
+       if (!ch_ctx->global_ctx_buffer_mapped) {
+               err = gr_gk20a_map_global_ctx_buffers(g, c);
+               if (err) {
+                       nvhost_err(dev_from_gk20a(g),
+                               "fail to map global ctx buffer");
+                       goto out;
+               }
+               gr_gk20a_elpg_protected_call(g,
+                       gr_gk20a_commit_global_ctx_buffers(g, c, 1));
+       }
+
+       /* init golden image; ELPG is enabled after this is done */
+       if (!gr->ctx_vars.golden_image_initialized) {
+               err = gr_gk20a_init_golden_ctx_image(g, c);
+               if (err) {
+                       nvhost_err(dev_from_gk20a(g),
+                               "fail to init golden ctx image");
+                       goto out;
+               }
+       }
+
+       /* load golden image */
+       if (!c->first_init) {
+               err = gr_gk20a_elpg_protected_call(g,
+                       gr_gk20a_load_golden_ctx_image(g, c));
+               if (err) {
+                       nvhost_err(dev_from_gk20a(g),
+                               "fail to load golden ctx image");
+                       goto out;
+               }
+               c->first_init = true;
+       }
+
+       c->num_objects++;
+
+       nvhost_dbg_fn("done");
+       return 0;
+out:
+       /* 1. gr_ctx, patch_ctx and global ctx buffer mapping
+          can be reused so no need to release them.
+          2. golden image init and load is a one time thing so if
+          they pass, no need to undo. */
+       nvhost_dbg(dbg_fn | dbg_err, "fail");
+       return err;
+}
+
+int gk20a_free_obj_ctx(struct channel_gk20a  *c,
+                      struct nvhost_free_obj_ctx_args *args)
+{
+       nvhost_dbg_fn("");
+
+       if (c->num_objects == 0)
+               return 0;
+
+       c->num_objects--;
+
+       if (c->num_objects == 0) {
+               c->first_init = false;
+               gr_gk20a_unmap_channel_patch_ctx(c);
+       }
+
+       return 0;
+}
+
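+/* tear down gr software state: free the global context buffers, mmu
+   read/write buffers, compbit store and all ctx_vars/floorsweeping
+   allocations */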
+static void gk20a_remove_gr_support(struct gk20a *g, struct gr_gk20a *gr)
+{
+       struct mem_mgr *memmgr = mem_mgr_from_g(g);
+
+       nvhost_dbg_fn("");
+
+       gr_gk20a_free_global_ctx_buffers(g);
+
+       mem_op().unpin(memmgr, gr->mmu_wr_mem.mem.ref);
+       mem_op().unpin(memmgr, gr->mmu_rd_mem.mem.ref);
+       mem_op().unpin(memmgr, gr->compbit_store.mem.ref);
+       mem_op().put(memmgr, gr->mmu_wr_mem.mem.ref);
+       mem_op().put(memmgr, gr->mmu_rd_mem.mem.ref);
+       mem_op().put(memmgr, gr->compbit_store.mem.ref);
+       kfree(gr->gpc_tpc_count);
+       kfree(gr->gpc_zcb_count);
+       kfree(gr->gpc_ppc_count);
+       kfree(gr->pes_tpc_count[0]);
+       kfree(gr->pes_tpc_count[1]);
+       kfree(gr->pes_tpc_mask[0]);
+       kfree(gr->pes_tpc_mask[1]);
+       kfree(gr->gpc_skip_mask);
+       kfree(gr->temp_ctx_header);
+       kfree(gr->ctx_vars.ucode.fecs.inst.l);
+       kfree(gr->ctx_vars.ucode.fecs.data.l);
+       kfree(gr->ctx_vars.ucode.gpccs.inst.l);
+       kfree(gr->ctx_vars.ucode.gpccs.data.l);
+       kfree(gr->ctx_vars.sw_bundle_init.l);
+       kfree(gr->ctx_vars.sw_method_init.l);
+       kfree(gr->ctx_vars.sw_ctx_load.l);
+       kfree(gr->ctx_vars.sw_non_ctx_load.l);
+       kfree(gr->ctx_vars.ctxsw_regs.sys.l);
+       kfree(gr->ctx_vars.ctxsw_regs.gpc.l);
+       kfree(gr->ctx_vars.ctxsw_regs.tpc.l);
+       kfree(gr->ctx_vars.ctxsw_regs.zcull_gpc.l);
+       kfree(gr->ctx_vars.ctxsw_regs.ppc.l);
+       kfree(gr->ctx_vars.ctxsw_regs.pm_sys.l);
+       kfree(gr->ctx_vars.ctxsw_regs.pm_gpc.l);
+       kfree(gr->ctx_vars.ctxsw_regs.pm_tpc.l);
+
+       memset(&gr->mmu_wr_mem, 0, sizeof(struct mem_desc));
+       memset(&gr->mmu_rd_mem, 0, sizeof(struct mem_desc));
+       memset(&gr->compbit_store, 0, sizeof(struct compbit_store_desc));
+       gr->gpc_tpc_count = NULL;
+       gr->gpc_zcb_count = NULL;
+       gr->gpc_ppc_count = NULL;
+       gr->pes_tpc_count[0] = NULL;
+       gr->pes_tpc_count[1] = NULL;
+       gr->pes_tpc_mask[0] = NULL;
+       gr->pes_tpc_mask[1] = NULL;
+       gr->gpc_skip_mask = NULL;
+       gr->temp_ctx_header = NULL;
+
+       nvhost_allocator_destroy(&gr->comp_tags);
+
+       /*tbd*/
+}
+
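+/* read the chip configuration (FBP/GPC/TPC/PES counts and masks) and derive
+   the per-GPC TPC/ZCB/PPC counts, GPC skip masks and default CB sizes */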
+static int gr_gk20a_init_gr_config(struct gk20a *g, struct gr_gk20a *gr)
+{
+       u32 gpc_index, pes_index;
+       u32 pes_tpc_mask;
+       u32 pes_tpc_count;
+       u32 pes_heavy_index;
+       u32 gpc_new_skip_mask;
+       u32 tmp;
+
+       tmp = gk20a_readl(g, pri_ringmaster_enum_fbp_r());
+       gr->num_fbps = pri_ringmaster_enum_fbp_count_v(tmp);
+
+       tmp = gk20a_readl(g, top_num_gpcs_r());
+       gr->max_gpc_count = top_num_gpcs_value_v(tmp);
+
+       tmp = gk20a_readl(g, top_num_fbps_r());
+       gr->max_fbps_count = top_num_fbps_value_v(tmp);
+
+       tmp = gk20a_readl(g, top_tpc_per_gpc_r());
+       gr->max_tpc_per_gpc_count = top_tpc_per_gpc_value_v(tmp);
+
+       gr->max_tpc_count = gr->max_gpc_count * gr->max_tpc_per_gpc_count;
+
+       tmp = gk20a_readl(g, top_num_fbps_r());
+       gr->sys_count = top_num_fbps_value_v(tmp);
+
+       tmp = gk20a_readl(g, pri_ringmaster_enum_gpc_r());
+       gr->gpc_count = pri_ringmaster_enum_gpc_count_v(tmp);
+
+       gr->pe_count_per_gpc = proj_scal_litter_num_pes_per_gpc_v();
+       gr->max_zcull_per_gpc_count = proj_scal_litter_num_zcull_banks_v();
+
+       if (!gr->gpc_count) {
+               nvhost_err(dev_from_gk20a(g), "gpc_count==0!");
+               goto clean_up;
+       }
+
+       gr->gpc_tpc_count = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
+       gr->gpc_zcb_count = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
+       gr->gpc_ppc_count = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
+       gr->pes_tpc_count[0] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
+       gr->pes_tpc_count[1] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
+       gr->pes_tpc_mask[0] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
+       gr->pes_tpc_mask[1] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
+       gr->gpc_skip_mask =
+               kzalloc(gr_pd_dist_skip_table__size_1_v() * 4 * sizeof(u32),
+                       GFP_KERNEL);
+
+       if (!gr->gpc_tpc_count || !gr->gpc_zcb_count || !gr->gpc_ppc_count ||
+           !gr->pes_tpc_count[0] || !gr->pes_tpc_count[1] ||
+           !gr->pes_tpc_mask[0] || !gr->pes_tpc_mask[1] || !gr->gpc_skip_mask)
+               goto clean_up;
+
+       gr->ppc_count = 0;
+       for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
+               tmp = gk20a_readl(g, gr_gpc0_fs_gpc_r());
+
+               gr->gpc_tpc_count[gpc_index] =
+                       gr_gpc0_fs_gpc_num_available_tpcs_v(tmp);
+               gr->tpc_count += gr->gpc_tpc_count[gpc_index];
+
+               gr->gpc_zcb_count[gpc_index] =
+                       gr_gpc0_fs_gpc_num_available_zculls_v(tmp);
+               gr->zcb_count += gr->gpc_zcb_count[gpc_index];
+
+               gr->gpc_ppc_count[gpc_index] = gr->pe_count_per_gpc;
+               gr->ppc_count += gr->gpc_ppc_count[gpc_index];
+               for (pes_index = 0; pes_index < gr->pe_count_per_gpc; pes_index++) {
+
+                       tmp = gk20a_readl(g,
+                               gr_gpc0_gpm_pd_pes_tpc_id_mask_r(pes_index) +
+                               gpc_index * proj_gpc_stride_v());
+
+                       pes_tpc_mask = gr_gpc0_gpm_pd_pes_tpc_id_mask_mask_v(tmp);
+                       pes_tpc_count = count_bits(pes_tpc_mask);
+
+                       gr->pes_tpc_count[pes_index][gpc_index] = pes_tpc_count;
+                       gr->pes_tpc_mask[pes_index][gpc_index] = pes_tpc_mask;
+               }
+
+               gpc_new_skip_mask = 0;
+               if (gr->pes_tpc_count[0][gpc_index] +
+                   gr->pes_tpc_count[1][gpc_index] == 5) {
+                       pes_heavy_index =
+                               gr->pes_tpc_count[0][gpc_index] >
+                               gr->pes_tpc_count[1][gpc_index] ? 0 : 1;
+
+                       gpc_new_skip_mask =
+                               gr->pes_tpc_mask[pes_heavy_index][gpc_index] ^
+                                  (gr->pes_tpc_mask[pes_heavy_index][gpc_index] &
+                                  (gr->pes_tpc_mask[pes_heavy_index][gpc_index] - 1));
+
+               } else if ((gr->pes_tpc_count[0][gpc_index] +
+                           gr->pes_tpc_count[1][gpc_index] == 4) &&
+                          (gr->pes_tpc_count[0][gpc_index] !=
+                           gr->pes_tpc_count[1][gpc_index])) {
+                               pes_heavy_index =
+                                   gr->pes_tpc_count[0][gpc_index] >
+                                   gr->pes_tpc_count[1][gpc_index] ? 0 : 1;
+
+                       gpc_new_skip_mask =
+                               gr->pes_tpc_mask[pes_heavy_index][gpc_index] ^
+                                  (gr->pes_tpc_mask[pes_heavy_index][gpc_index] &
+                                  (gr->pes_tpc_mask[pes_heavy_index][gpc_index] - 1));
+               }
+               gr->gpc_skip_mask[gpc_index] = gpc_new_skip_mask;
+       }
+
+       nvhost_dbg_info("fbps: %d", gr->num_fbps);
+       nvhost_dbg_info("max_gpc_count: %d", gr->max_gpc_count);
+       nvhost_dbg_info("max_fbps_count: %d", gr->max_fbps_count);
+       nvhost_dbg_info("max_tpc_per_gpc_count: %d", gr->max_tpc_per_gpc_count);
+       nvhost_dbg_info("max_zcull_per_gpc_count: %d", gr->max_zcull_per_gpc_count);
+       nvhost_dbg_info("max_tpc_count: %d", gr->max_tpc_count);
+       nvhost_dbg_info("sys_count: %d", gr->sys_count);
+       nvhost_dbg_info("gpc_count: %d", gr->gpc_count);
+       nvhost_dbg_info("pe_count_per_gpc: %d", gr->pe_count_per_gpc);
+       nvhost_dbg_info("tpc_count: %d", gr->tpc_count);
+       nvhost_dbg_info("ppc_count: %d", gr->ppc_count);
+
+       for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
+               nvhost_dbg_info("gpc_tpc_count[%d] : %d",
+                          gpc_index, gr->gpc_tpc_count[gpc_index]);
+       for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
+               nvhost_dbg_info("gpc_zcb_count[%d] : %d",
+                          gpc_index, gr->gpc_zcb_count[gpc_index]);
+       for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
+               nvhost_dbg_info("gpc_ppc_count[%d] : %d",
+                          gpc_index, gr->gpc_ppc_count[gpc_index]);
+       for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
+               nvhost_dbg_info("gpc_skip_mask[%d] : %d",
+                          gpc_index, gr->gpc_skip_mask[gpc_index]);
+       for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
+               for (pes_index = 0;
+                    pes_index < gr->pe_count_per_gpc;
+                    pes_index++)
+                       nvhost_dbg_info("pes_tpc_count[%d][%d] : %d",
+                                  pes_index, gpc_index,
+                                  gr->pes_tpc_count[pes_index][gpc_index]);
+
+       for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
+               for (pes_index = 0;
+                    pes_index < gr->pe_count_per_gpc;
+                    pes_index++)
+                       nvhost_dbg_info("pes_tpc_mask[%d][%d] : %d",
+                                  pes_index, gpc_index,
+                                  gr->pes_tpc_mask[pes_index][gpc_index]);
+
+       gr->bundle_cb_default_size = gr_scc_bundle_cb_size_div_256b__prod_v();
+       gr->min_gpm_fifo_depth = gr_pd_ab_dist_cfg2_state_limit_min_gpm_fifo_depths_v();
+       gr->bundle_cb_token_limit = gr_pd_ab_dist_cfg2_token_limit_init_v();
+       gr->attrib_cb_default_size = gr_gpc0_ppc0_cbm_cfg_size_default_v();
+       /* gk20a has a fixed beta CB RAM, don't alloc more */
+       gr->attrib_cb_size = gr->attrib_cb_default_size;
+       gr->alpha_cb_default_size = gr_gpc0_ppc0_cbm_cfg2_size_default_v();
+       gr->alpha_cb_size = gr->alpha_cb_default_size + (gr->alpha_cb_default_size >> 1);
+       gr->timeslice_mode = gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v();
+
+       nvhost_dbg_info("bundle_cb_default_size: %d",
+                  gr->bundle_cb_default_size);
+       nvhost_dbg_info("min_gpm_fifo_depth: %d", gr->min_gpm_fifo_depth);
+       nvhost_dbg_info("bundle_cb_token_limit: %d", gr->bundle_cb_token_limit);
+       nvhost_dbg_info("attrib_cb_default_size: %d",
+                  gr->attrib_cb_default_size);
+       nvhost_dbg_info("attrib_cb_size: %d", gr->attrib_cb_size);
+       nvhost_dbg_info("alpha_cb_default_size: %d", gr->alpha_cb_default_size);
+       nvhost_dbg_info("alpha_cb_size: %d", gr->alpha_cb_size);
+       nvhost_dbg_info("timeslice_mode: %d", gr->timeslice_mode);
+
+       return 0;
+
+clean_up:
+       return -ENOMEM;
+}
+
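+/* allocate, clear and pin the 4 KB mmu_wr_mem and mmu_rd_mem buffers */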
+static int gr_gk20a_init_mmu_sw(struct gk20a *g, struct gr_gk20a *gr)
+{
+       struct mem_mgr *memmgr = mem_mgr_from_g(g);
+       void *mmu_ptr;
+
+       gr->mmu_wr_mem_size = gr->mmu_rd_mem_size = 0x1000;
+
+       gr->mmu_wr_mem.mem.ref = mem_op().alloc(memmgr, gr->mmu_wr_mem_size,
+                                            DEFAULT_NVMAP_ALLOC_ALIGNMENT,
+                                            DEFAULT_NVMAP_ALLOC_FLAGS,
+                                            NVMAP_HEAP_CARVEOUT_GENERIC);
+       if (!gr->mmu_wr_mem.mem.ref)
+               goto clean_up;
+       gr->mmu_wr_mem.mem.size = gr->mmu_wr_mem_size;
+
+       gr->mmu_rd_mem.mem.ref = mem_op().alloc(memmgr, gr->mmu_rd_mem_size,
+                                            DEFAULT_NVMAP_ALLOC_ALIGNMENT,
+                                            DEFAULT_NVMAP_ALLOC_FLAGS,
+                                            NVMAP_HEAP_CARVEOUT_GENERIC);
+       if (!gr->mmu_rd_mem.mem.ref)
+               goto clean_up;
+       gr->mmu_rd_mem.mem.size = gr->mmu_rd_mem_size;
+
+       mmu_ptr = mem_op().mmap(gr->mmu_wr_mem.mem.ref);
+       if (!mmu_ptr)
+               goto clean_up;
+       memset(mmu_ptr, 0, gr->mmu_wr_mem.mem.size);
+       mem_op().munmap(gr->mmu_wr_mem.mem.ref, mmu_ptr);
+
+       mmu_ptr = mem_op().mmap(gr->mmu_rd_mem.mem.ref);
+       if (!mmu_ptr)
+               goto clean_up;
+       memset(mmu_ptr, 0, gr->mmu_rd_mem.mem.size);
+       mem_op().munmap(gr->mmu_rd_mem.mem.ref, mmu_ptr);
+
+       gr->mmu_wr_mem.cpu_pa = mem_op().pin(memmgr, gr->mmu_wr_mem.mem.ref);
+       if (gr->mmu_wr_mem.cpu_pa == -EINVAL || gr->mmu_wr_mem.cpu_pa == -EINTR)
+               goto clean_up;
+
+       gr->mmu_rd_mem.cpu_pa = mem_op().pin(memmgr, gr->mmu_rd_mem.mem.ref);
+       if (gr->mmu_rd_mem.cpu_pa == -EINVAL || gr->mmu_rd_mem.cpu_pa == -EINTR)
+               goto clean_up;
+
+       return 0;
+
+clean_up:
+       return -ENOMEM;
+}
+
+static u32 prime_set[18] = {
+       2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61 };
+
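+/* build the screen tile -> GPC map: choose a row offset coprime with the TPC
+   count and distribute tiles across GPCs in proportion to their TPC counts */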
+static int gr_gk20a_init_map_tiles(struct gk20a *g, struct gr_gk20a *gr)
+{
+       s32 comm_denom;
+       s32 mul_factor;
+       s32 *init_frac = NULL;
+       s32 *init_err = NULL;
+       s32 *run_err = NULL;
+       s32 *sorted_num_tpcs = NULL;
+       s32 *sorted_to_unsorted_gpc_map = NULL;
+       u32 gpc_index;
+       u32 gpc_mark = 0;
+       u32 num_tpc;
+       u32 max_tpc_count = 0;
+       u32 swap;
+       u32 tile_count;
+       u32 index;
+       bool delete_map = false;
+       bool gpc_sorted;
+       int ret = 0;
+
+       init_frac = kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
+       init_err = kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
+       run_err = kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
+       sorted_num_tpcs =
+               kzalloc(proj_scal_max_gpcs_v() *
+                       proj_scal_max_tpc_per_gpc_v() * sizeof(s32),
+                       GFP_KERNEL);
+       sorted_to_unsorted_gpc_map =
+               kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
+
+       if (!(init_frac && init_err && run_err && sorted_num_tpcs &&
+             sorted_to_unsorted_gpc_map)) {
+               ret = -ENOMEM;
+               goto clean_up;
+       }
+
+       gr->map_row_offset = INVALID_SCREEN_TILE_ROW_OFFSET;
+
+       if (gr->tpc_count == 3)
+               gr->map_row_offset = 2;
+       else if (gr->tpc_count < 3)
+               gr->map_row_offset = 1;
+       else {
+               gr->map_row_offset = 3;
+
+               for (index = 1; index < 18; index++) {
+                       u32 prime = prime_set[index];
+                       if ((gr->tpc_count % prime) != 0) {
+                               gr->map_row_offset = prime;
+                               break;
+                       }
+               }
+       }
+
+       switch (gr->tpc_count) {
+       case 15:
+               gr->map_row_offset = 6;
+               break;
+       case 14:
+               gr->map_row_offset = 5;
+               break;
+       case 13:
+               gr->map_row_offset = 2;
+               break;
+       case 11:
+               gr->map_row_offset = 7;
+               break;
+       case 10:
+               gr->map_row_offset = 6;
+               break;
+       case 7:
+       case 5:
+               gr->map_row_offset = 1;
+               break;
+       default:
+               break;
+       }
+
+       if (gr->map_tiles) {
+               if (gr->map_tile_count != gr->tpc_count)
+                       delete_map = true;
+
+               for (tile_count = 0; tile_count < gr->map_tile_count; tile_count++) {
+                       if ((u32)gr->map_tiles[tile_count] >= gr->tpc_count)
+                               delete_map = true;
+               }
+
+               if (delete_map) {
+                       kfree(gr->map_tiles);
+                       gr->map_tiles = NULL;
+                       gr->map_tile_count = 0;
+               }
+       }
+
+       if (gr->map_tiles == NULL) {
+               gr->map_tile_count = proj_scal_max_gpcs_v();
+
+               gr->map_tiles = kzalloc(proj_scal_max_gpcs_v() * sizeof(u8), GFP_KERNEL);
+               if (gr->map_tiles == NULL) {
+                       ret = -ENOMEM;
+                       goto clean_up;
+               }
+
+               for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
+                       sorted_num_tpcs[gpc_index] = gr->gpc_tpc_count[gpc_index];
+                       sorted_to_unsorted_gpc_map[gpc_index] = gpc_index;
+               }
+
+               gpc_sorted = false;
+               while (!gpc_sorted) {
+                       gpc_sorted = true;
+                       for (gpc_index = 0; gpc_index < gr->gpc_count - 1; gpc_index++) {
+                               if (sorted_num_tpcs[gpc_index + 1] > sorted_num_tpcs[gpc_index]) {
+                                       gpc_sorted = false;
+                                       swap = sorted_num_tpcs[gpc_index];
+                                       sorted_num_tpcs[gpc_index] = sorted_num_tpcs[gpc_index + 1];
+                                       sorted_num_tpcs[gpc_index + 1] = swap;
+                                       swap = sorted_to_unsorted_gpc_map[gpc_index];
+                                       sorted_to_unsorted_gpc_map[gpc_index] =
+                                               sorted_to_unsorted_gpc_map[gpc_index + 1];
+                                       sorted_to_unsorted_gpc_map[gpc_index + 1] = swap;
+                               }
+                       }
+               }
+
+               for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
+                       if (gr->gpc_tpc_count[gpc_index] > max_tpc_count)
+                               max_tpc_count = gr->gpc_tpc_count[gpc_index];
+
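+               /* scale by 2 when gpc_count * max_tpc_count is odd so
+                * that comm_denom / 2 below divides evenly */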
+               mul_factor = gr->gpc_count * max_tpc_count;
+               if (mul_factor & 0x1)
+                       mul_factor = 2;
+               else
+                       mul_factor = 1;
+
+               comm_denom = gr->gpc_count * max_tpc_count * mul_factor;
+
+               for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
+                       num_tpc = sorted_num_tpcs[gpc_index];
+
+                       init_frac[gpc_index] = num_tpc * gr->gpc_count * mul_factor;
+
+                       if (num_tpc != 0)
+                               init_err[gpc_index] = gpc_index * max_tpc_count * mul_factor - comm_denom/2;
+                       else
+                               init_err[gpc_index] = 0;
+
+                       run_err[gpc_index] = init_frac[gpc_index] + init_err[gpc_index];
+               }
+
+               while (gpc_mark < gr->tpc_count) {
+                       for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
+                               if ((run_err[gpc_index] * 2) >= comm_denom) {
+                                       gr->map_tiles[gpc_mark++] = (u8)sorted_to_unsorted_gpc_map[gpc_index];
+                                       run_err[gpc_index] += init_frac[gpc_index] - comm_denom;
+                               } else
+                                       run_err[gpc_index] += init_frac[gpc_index];
+                       }
+               }
+       }
+
+clean_up:
+       kfree(init_frac);
+       kfree(init_err);
+       kfree(run_err);
+       kfree(sorted_num_tpcs);
+       kfree(sorted_to_unsorted_gpc_map);
+
+       if (ret)
+               nvhost_dbg(dbg_fn | dbg_err, "fail");
+       else
+               nvhost_dbg_fn("done");
+
+       return ret;
+}
+
+static int gr_gk20a_init_comptag(struct gk20a *g, struct gr_gk20a *gr)
+{
+       struct mem_mgr *memmgr = mem_mgr_from_g(g);
+
+       /* max memory size (MB) to cover */
+       u32 max_size = gr->max_comptag_mem;
+       /* one tag line covers 128KB */
+       u32 max_comptag_lines = max_size << 3;
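+       /* e.g. 1024 MB to cover -> 1024 * 1024 KB / 128 KB = 8192 lines */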
+
+       u32 hw_max_comptag_lines =
+               ltc_ltcs_ltss_cbc_ctrl3_clear_upper_bound_init_v();
+
+       u32 cbc_param =
+               gk20a_readl(g, ltc_ltcs_ltss_cbc_param_r());
+       u32 comptags_per_cacheline =
+               ltc_ltcs_ltss_cbc_param_comptags_per_cache_line_v(cbc_param);
+       u32 slices_per_fbp =
+               ltc_ltcs_ltss_cbc_param_slices_per_fbp_v(cbc_param);
+       u32 cacheline_size =
+               512 << ltc_ltcs_ltss_cbc_param_cache_line_size_v(cbc_param);
+
+       u32 compbit_backing_size;
+       int ret = 0;
+
+       nvhost_dbg_fn("");
+
+       if (max_comptag_lines == 0) {
+               gr->compbit_store.mem.size = 0;
+               return 0;
+       }
+
+       if (max_comptag_lines > hw_max_comptag_lines)
+               max_comptag_lines = hw_max_comptag_lines;
+
+       /* no hybrid fb */
+       compbit_backing_size =
+               DIV_ROUND_UP(max_comptag_lines, comptags_per_cacheline) *
+               cacheline_size * slices_per_fbp * gr->num_fbps;
+
+       /* aligned to 2KB * num_fbps */
+       compbit_backing_size +=
+               gr->num_fbps << ltc_ltc0_lts0_cbc_base_alignment_shift_v();
+
+       /* must be a multiple of 64KB */
+       compbit_backing_size = roundup(compbit_backing_size, 64*1024);
+
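+       /* recompute how many comptag lines the aligned, rounded-up
+        * backing store can actually cover */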
+       max_comptag_lines =
+               (compbit_backing_size * comptags_per_cacheline) /
+               (cacheline_size * slices_per_fbp * gr->num_fbps);
+
+       if (max_comptag_lines > hw_max_comptag_lines)
+               max_comptag_lines = hw_max_comptag_lines;
+
+       nvhost_dbg_info("compbit backing store size : %d",
+               compbit_backing_size);
+       nvhost_dbg_info("max comptag lines : %d",
+               max_comptag_lines);
+
+       gr->compbit_store.mem.ref =
+               mem_op().alloc(memmgr, compbit_backing_size,
+                           DEFAULT_NVMAP_ALLOC_ALIGNMENT,
+                           DEFAULT_NVMAP_ALLOC_FLAGS,
+                           NVMAP_HEAP_CARVEOUT_GENERIC);
+       if (IS_ERR_OR_NULL(gr->compbit_store.mem.ref)) {
+               nvhost_err(dev_from_gk20a(g), "failed to allocate "
+                          "backing store for compbit : size %d",
+                          compbit_backing_size);
+               return -ENOMEM;
+       }
+       gr->compbit_store.mem.size = compbit_backing_size;
+
+       gr->compbit_store.base_pa =
+               mem_op().pin(memmgr, gr->compbit_store.mem.ref);
+       if (gr->compbit_store.base_pa == -EINVAL ||
+           gr->compbit_store.base_pa == -EINTR) {
+               ret = -ENOMEM;
+               goto clean_up;
+       }
+
+       nvhost_allocator_init(&gr->comp_tags, "comptag",
+                       1, max_comptag_lines, 1);
+
+       return 0;
+
+clean_up:
+       mem_op().put(memmgr, gr->compbit_store.mem.ref);
+       return ret;
+}
+
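+/* clear comptag lines [min, max]: program the CBC clear range, start the
+ * clear on all L2 caches, then poll every slice of every FBP until its
+ * clear-active bit drops or the idle timeout expires */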
+int gk20a_gr_clear_comptags(struct gk20a *g, u32 min, u32 max)
+{
+       struct gr_gk20a *gr = &g->gr;
+       u32 fbp, slice, ctrl1, val;
+       u32 timeout = GR_IDLE_TIMEOUT_DEFAULT;
+       u32 slices_per_fbp =
+               ltc_ltcs_ltss_cbc_param_slices_per_fbp_v(
+                       gk20a_readl(g, ltc_ltcs_ltss_cbc_param_r()));
+
+       nvhost_dbg_fn("");
+
+       if (gr->compbit_store.mem.size == 0)
+               return 0;
+
+       gk20a_writel(g, ltc_ltcs_ltss_cbc_ctrl2_r(),
+                    ltc_ltcs_ltss_cbc_ctrl2_clear_lower_bound_f(min));
+       gk20a_writel(g, ltc_ltcs_ltss_cbc_ctrl3_r(),
+                    ltc_ltcs_ltss_cbc_ctrl3_clear_upper_bound_f(max));
+       gk20a_writel(g, ltc_ltcs_ltss_cbc_ctrl1_r(),
+                    gk20a_readl(g, ltc_ltcs_ltss_cbc_ctrl1_r()) |
+                    ltc_ltcs_ltss_cbc_ctrl1_clear_active_f());
+
+       for (fbp = 0; fbp < gr->num_fbps; fbp++) {
+               for (slice = 0; slice < slices_per_fbp; slice++) {
+                       ctrl1 = ltc_ltc0_lts0_cbc_ctrl1_r() +
+                               fbp * proj_ltc_pri_stride_v() +
+                               slice * proj_lts_pri_stride_v();
+
+                       do {
+                               u32 check = min_t(u32,
+                                       GR_IDLE_CHECK_PERIOD, timeout);
+
+                               val = gk20a_readl(g, ctrl1);
+                               if (ltc_ltc0_lts0_cbc_ctrl1_clear_v(val) !=
+                                   ltc_ltc0_lts0_cbc_ctrl1_clear_active_v())
+                                       break;
+
+                               udelay(GR_IDLE_CHECK_PERIOD);
+                               timeout -= check;
+
+                       } while (timeout);
+
+                       if (timeout == 0) {
+                               nvhost_err(dev_from_gk20a(g),
+                                          "comp tag clear timeout\n");
+                               return -EBUSY;
+                       }
+               }
+       }
+
+       return 0;
+}
+
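+/* derive zcull aliquot geometry from the TPC count and read back the
+ * total aliquot capacity reported by the GPC0 zcull RAM size register */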
+static int gr_gk20a_init_zcull(struct gk20a *g, struct gr_gk20a *gr)
+{
+       struct gr_zcull_gk20a *zcull = &gr->zcull;
+
+       zcull->aliquot_width = gr->tpc_count * 16;
+       zcull->aliquot_height = 16;
+
+       zcull->width_align_pixels = gr->tpc_count * 16;
+       zcull->height_align_pixels = 32;
+
+       zcull->aliquot_size =
+               zcull->aliquot_width * zcull->aliquot_height;
+
+       /* assume no floor sweeping since we only have 1 tpc in 1 gpc */
+       zcull->pixel_squares_by_aliquots =
+               gr->zcb_count * 16 * 16 * gr->tpc_count /
+               (gr->gpc_count * gr->gpc_tpc_count[0]);
+
+       zcull->total_aliquots =
+               gr_gpc0_zcull_total_ram_size_num_aliquots_f(
+                       gk20a_readl(g, gr_gpc0_zcull_total_ram_size_r()));
+
+       return 0;
+}
+
+u32 gr_gk20a_get_ctxsw_zcull_size(struct gk20a *g, struct gr_gk20a *gr)
+{
+       /* assuming gr has already been initialized */
+       return gr->ctx_vars.zcull_ctxsw_image_size;
+}
+
+int gr_gk20a_bind_ctxsw_zcull(struct gk20a *g, struct gr_gk20a *gr,
+                       struct channel_gk20a *c, u64 zcull_va, u32 mode)
+{
+       struct zcull_ctx_desc *zcull_ctx = &c->ch_ctx.zcull_ctx;
+
+       zcull_ctx->ctx_sw_mode = mode;
+       zcull_ctx->gpu_va = zcull_va;
+
+       /* TBD: don't disable channel in sw method processing */
+       return gr_gk20a_ctx_zcull_setup(g, c, true);
+}
+
+int gr_gk20a_get_zcull_info(struct gk20a *g, struct gr_gk20a *gr,
+                       struct gr_zcull_info *zcull_params)
+{
+       struct gr_zcull_gk20a *zcull = &gr->zcull;
+
+       zcull_params->width_align_pixels = zcull->width_align_pixels;
+       zcull_params->height_align_pixels = zcull->height_align_pixels;
+       zcull_params->pixel_squares_by_aliquots =
+               zcull->pixel_squares_by_aliquots;
+       zcull_params->aliquot_total = zcull->total_aliquots;
+
+       zcull_params->region_byte_multiplier =
+               gr->gpc_count * gr_zcull_bytes_per_aliquot_per_gpu_v();
+       zcull_params->region_header_size =
+               proj_scal_litter_num_gpcs_v() *
+               gr_zcull_save_restore_header_bytes_per_gpc_v();
+
+       zcull_params->subregion_header_size =
+               proj_scal_litter_num_gpcs_v() *
+               gr_zcull_save_restore_subregion_header_bytes_per_gpc_v();
+
+       zcull_params->subregion_width_align_pixels =
+               gr->tpc_count * gr_gpc0_zcull_zcsize_width_subregion__multiple_v();
+       zcull_params->subregion_height_align_pixels =
+               gr_gpc0_zcull_zcsize_height_subregion__multiple_v();
+       zcull_params->subregion_count = gr_zcull_subregion_qty_v();
+
+       return 0;
+}
+
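+/* program one ZBC color entry: idle the GR engine, write the clear value
+ * into both the L2 (dstg) table and the DS table at
+ * GK20A_STARTOF_ZBC_TABLE + index, and mirror it in gr->zbc_col_tbl */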
+static int gr_gk20a_add_zbc_color(struct gk20a *g, struct gr_gk20a *gr,
+                               struct zbc_entry *color_val, u32 index)
+{
+       struct fifo_gk20a *f = &g->fifo;
+       struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
+       u32 i;
+       u32 timeout = GR_IDLE_TIMEOUT_DEFAULT;
+       u32 ret;
+
+       ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
+       if (ret) {
+               nvhost_err(dev_from_gk20a(g),
+                       "failed to disable gr engine activity\n");
+               return ret;
+       }
+
+       ret = gr_gk20a_wait_idle(g, &timeout);
+       if (ret) {
+               nvhost_err(dev_from_gk20a(g),
+                       "failed to idle graphics\n");
+               goto clean_up;
+       }
+
+       /* update l2 table */
+       gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_index_r(),
+                       (gk20a_readl(g, ltc_ltcs_ltss_dstg_zbc_index_r()) &
+                        ~ltc_ltcs_ltss_dstg_zbc_index_address_f(~0)) |
+                               ltc_ltcs_ltss_dstg_zbc_index_address_f(index +
+                                       GK20A_STARTOF_ZBC_TABLE));
+
+       for (i = 0; i < ltc_ltcs_ltss_dstg_zbc_color_clear_value__size_1_v(); i++)
+               gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_color_clear_value_r(i),
+                       color_val->color_l2[i]);
+
+       /* update ds table */
+       gk20a_writel(g, gr_ds_zbc_color_r_r(),
+               gr_ds_zbc_color_r_val_f(color_val->color_ds[0]));
+       gk20a_writel(g, gr_ds_zbc_color_g_r(),
+               gr_ds_zbc_color_g_val_f(color_val->color_ds[1]));
+       gk20a_writel(g, gr_ds_zbc_color_b_r(),
+               gr_ds_zbc_color_b_val_f(color_val->color_ds[2]));
+       gk20a_writel(g, gr_ds_zbc_color_a_r(),
+               gr_ds_zbc_color_a_val_f(color_val->color_ds[3]));
+
+       gk20a_writel(g, gr_ds_zbc_color_fmt_r(),
+               gr_ds_zbc_color_fmt_val_f(color_val->format));
+
+       gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
+               gr_ds_zbc_tbl_index_val_f(index + GK20A_STARTOF_ZBC_TABLE));
+
+       /* trigger the write */
+       gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
+               gr_ds_zbc_tbl_ld_select_c_f() |
+               gr_ds_zbc_tbl_ld_action_write_f() |
+               gr_ds_zbc_tbl_ld_trigger_active_f());
+
+       /* update local copy */
+       for (i = 0; i < ltc_ltcs_ltss_dstg_zbc_color_clear_value__size_1_v(); i++) {
+               gr->zbc_col_tbl[index].color_l2[i] = color_val->color_l2[i];
+               gr->zbc_col_tbl[index].color_ds[i] = color_val->color_ds[i];
+       }
+       gr->zbc_col_tbl[index].format = color_val->format;
+       gr->zbc_col_tbl[index].ref_cnt++;
+
+clean_up:
+       ret = gk20a_fifo_enable_engine_activity(g, gr_info);
+       if (ret) {
+               nvhost_err(dev_from_gk20a(g),
+                       "failed to enable gr engine activity\n");
+       }
+
+       return ret;
+}
+
+static int gr_gk20a_add_zbc_depth(struct gk20a *g, struct gr_gk20a *gr,
+                               struct zbc_entry *depth_val, u32 index)
+{
+       struct fifo_gk20a *f = &g->fifo;
+       struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
+       u32 timeout = GR_IDLE_TIMEOUT_DEFAULT;
+       u32 ret;
+
+       ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
+       if (ret) {
+               nvhost_err(dev_from_gk20a(g),
+                       "failed to disable gr engine activity\n");
+               return ret;
+       }
+
+       ret = gr_gk20a_wait_idle(g, &timeout);
+       if (ret) {
+               nvhost_err(dev_from_gk20a(g),
+                       "failed to idle graphics\n");
+               goto clean_up;
+       }
+
+       /* update l2 table */
+       gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_index_r(),
+                       (gk20a_readl(g, ltc_ltcs_ltss_dstg_zbc_index_r()) &
+                        ~ltc_ltcs_ltss_dstg_zbc_index_address_f(~0)) |
+                               ltc_ltcs_ltss_dstg_zbc_index_address_f(index +
+                                       GK20A_STARTOF_ZBC_TABLE));
+
+       gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_depth_clear_value_r(),
+                       depth_val->depth);
+
+       /* update ds table */
+       gk20a_writel(g, gr_ds_zbc_z_r(),
+               gr_ds_zbc_z_val_f(depth_val->depth));
+
+       gk20a_writel(g, gr_ds_zbc_z_fmt_r(),
+               gr_ds_zbc_z_fmt_val_f(depth_val->format));
+
+       gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
+               gr_ds_zbc_tbl_index_val_f(index + GK20A_STARTOF_ZBC_TABLE));
+
+       /* trigger the write */
+       gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
+               gr_ds_zbc_tbl_ld_select_z_f() |
+               gr_ds_zbc_tbl_ld_action_write_f() |
+               gr_ds_zbc_tbl_ld_trigger_active_f());
+
+       /* update local copy */
+       gr->zbc_dep_tbl[index].depth = depth_val->depth;
+       gr->zbc_dep_tbl[index].format = depth_val->format;
+       gr->zbc_dep_tbl[index].ref_cnt++;
+
+clean_up:
+       ret = gk20a_fifo_enable_engine_activity(g, gr_info);
+       if (ret) {
+               nvhost_err(dev_from_gk20a(g),
+                       "failed to enable gr engine activity\n");
+       }
+
+       return ret;
+}
+
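+/* add a ZBC clear value: reuse and ref-count a matching existing entry,
+ * otherwise program a new entry at the next unused index (bounded by
+ * GK20A_ZBC_TABLE_SIZE) */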
+int gr_gk20a_add_zbc(struct gk20a *g, struct gr_gk20a *gr,
+                    struct zbc_entry *zbc_val)
+{
+       struct zbc_color_table *c_tbl;
+       struct zbc_depth_table *d_tbl;
+       u32 i, ret = -ENOMEM;
+       bool added = false;
+
+       /* no endian swap ? */
+
+       switch (zbc_val->type) {
+       case GK20A_ZBC_TYPE_COLOR:
+               /* search existing tables */
+               for (i = 0; i < gr->max_used_color_index; i++) {
+
+                       c_tbl = &gr->zbc_col_tbl[i];
+
+                       if (c_tbl->ref_cnt && c_tbl->format == zbc_val->format &&
+                           memcmp(c_tbl->color_ds, zbc_val->color_ds,
+                               sizeof(zbc_val->color_ds)) == 0) {
+
+                               if (memcmp(c_tbl->color_l2, zbc_val->color_l2,
+                                   sizeof(zbc_val->color_l2))) {
+                                       nvhost_err(dev_from_gk20a(g),
+                                               "zbc l2 and ds color don't match with existing entries");
+                                       return -EINVAL;
+                               }
+                               added = true;
+                               c_tbl->ref_cnt++;
+                               ret = 0;
+                               break;
+                       }
+               }
+               /* add new table */
+               if (!added &&
+                   gr->max_used_color_index < GK20A_ZBC_TABLE_SIZE) {
+
+                       c_tbl =
+                           &gr->zbc_col_tbl[gr->max_used_color_index];
+                       WARN_ON(c_tbl->ref_cnt != 0);
+
+                       ret = gr_gk20a_add_zbc_color(g, gr,
+                               zbc_val, gr->max_used_color_index);
+
+                       if (!ret)
+                               gr->max_used_color_index++;
+               }
+               break;
+       case GK20A_ZBC_TYPE_DEPTH:
+               /* search existing tables */
+               for (i = 0; i < gr->max_used_depth_index; i++) {
+
+                       d_tbl = &gr->zbc_dep_tbl[i];
+
+                       if (d_tbl->ref_cnt &&
+                           d_tbl->depth == zbc_val->depth &&
+                           d_tbl->format == zbc_val->format) {
+                               added = true;
+                               d_tbl->ref_cnt++;
+                               ret = 0;
+                               break;
+                       }
+               }
+               /* add new table */
+               if (!added &&
+                   gr->max_used_depth_index < GK20A_ZBC_TABLE_SIZE) {
+
+                       d_tbl =
+                           &gr->zbc_dep_tbl[gr->max_used_depth_index];
+                       WARN_ON(d_tbl->ref_cnt != 0);
+
+                       ret = gr_gk20a_add_zbc_depth(g, gr,
+                               zbc_val, gr->max_used_depth_index);
+
+                       if (!ret)
+                               gr->max_used_depth_index++;
+               }
+               break;
+       default:
+               nvhost_err(dev_from_gk20a(g),
+                       "invalid zbc table type %d", zbc_val->type);
+               return -EINVAL;
+       }
+
+       if (added && ret == 0) {
+               /* update zbc for elpg */
+       }
+
+       return ret;
+}
+
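+/* invalidate all ZBC color and depth entries in the DS and L2 tables and
+ * reset the software copies, with the GR engine idled around the update */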
+int gr_gk20a_clear_zbc_table(struct gk20a *g, struct gr_gk20a *gr)
+{
+       struct fifo_gk20a *f = &g->fifo;
+       struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
+       u32 i, j;
+       u32 timeout = GR_IDLE_TIMEOUT_DEFAULT;
+       u32 ret;
+
+       ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
+       if (ret) {
+               nvhost_err(dev_from_gk20a(g),
+                       "failed to disable gr engine activity\n");
+               return ret;
+       }
+
+       ret = gr_gk20a_wait_idle(g, &timeout);
+       if (ret) {
+               nvhost_err(dev_from_gk20a(g),
+                       "failed to idle graphics\n");
+               goto clean_up;
+       }
+
+       for (i = 0; i < GK20A_ZBC_TABLE_SIZE; i++) {
+               gr->zbc_col_tbl[i].format = 0;
+               gr->zbc_col_tbl[i].ref_cnt = 0;
+
+               gk20a_writel(g, gr_ds_zbc_color_fmt_r(),
+                       gr_ds_zbc_color_fmt_val_invalid_f());
+               gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
+                       gr_ds_zbc_tbl_index_val_f(i + GK20A_STARTOF_ZBC_TABLE));
+
+               /* trigger the write */
+               gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
+                       gr_ds_zbc_tbl_ld_select_c_f() |
+                       gr_ds_zbc_tbl_ld_action_write_f() |
+                       gr_ds_zbc_tbl_ld_trigger_active_f());
+
+               /* clear l2 table */
+               gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_index_r(),
+                       (gk20a_readl(g, ltc_ltcs_ltss_dstg_zbc_index_r()) &
+                        ~ltc_ltcs_ltss_dstg_zbc_index_address_f(~0)) |
+                               ltc_ltcs_ltss_dstg_zbc_index_address_f(i +
+                                       GK20A_STARTOF_ZBC_TABLE));
+
+               for (j = 0; j < ltc_ltcs_ltss_dstg_zbc_color_clear_value__size_1_v(); j++) {
+                       gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_color_clear_value_r(j), 0);
+                       gr->zbc_col_tbl[i].color_l2[j] = 0;
+                       gr->zbc_col_tbl[i].color_ds[j] = 0;
+               }
+       }
+       gr->max_used_color_index = 0;
+       gr->max_default_color_index = 0;
+
+       for (i = 0; i < GK20A_ZBC_TABLE_SIZE; i++) {
+               gr->zbc_dep_tbl[i].depth = 0;
+               gr->zbc_dep_tbl[i].format = 0;
+               gr->zbc_dep_tbl[i].ref_cnt = 0;
+
+               gk20a_writel(g, gr_ds_zbc_z_fmt_r(),
+                       gr_ds_zbc_z_fmt_val_invalid_f());
+               gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
+                       gr_ds_zbc_tbl_index_val_f(i + GK20A_STARTOF_ZBC_TABLE));
+
+               /* trigger the write */
+               gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
+                       gr_ds_zbc_tbl_ld_select_z_f() |
+                       gr_ds_zbc_tbl_ld_action_write_f() |
+                       gr_ds_zbc_tbl_ld_trigger_active_f());
+
+               /* clear l2 table */
+               gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_index_r(),
+                       (gk20a_readl(g, ltc_ltcs_ltss_dstg_zbc_index_r()) &
+                        ~ltc_ltcs_ltss_dstg_zbc_index_address_f(~0)) |
+                               ltc_ltcs_ltss_dstg_zbc_index_address_f(i +
+                                       GK20A_STARTOF_ZBC_TABLE));
+
+               gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_depth_clear_value_r(), 0);
+       }
+       gr->max_used_depth_index = 0;
+       gr->max_default_depth_index = 0;
+
+clean_up:
+       ret = gk20a_fifo_enable_engine_activity(g, gr_info);
+       if (ret) {
+               nvhost_err(dev_from_gk20a(g),
+                       "failed to enable gr engine activity\n");
+       }
+
+       /* elpg stuff */
+
+       return ret;
+}
+
+/* get a zbc table entry specified by index
+ * return table size when type is invalid */
+int gr_gk20a_query_zbc(struct gk20a *g, struct gr_gk20a *gr,
+                       struct zbc_query_params *query_params)
+{
+       u32 index = query_params->index_size;
+       u32 i;
+
+       switch (query_params->type) {
+       case GK20A_ZBC_TYPE_INVALID:
+               query_params->index_size = GK20A_ZBC_TABLE_SIZE;
+               break;
+       case GK20A_ZBC_TYPE_COLOR:
+               if (index >= GK20A_ZBC_TABLE_SIZE) {
+                       nvhost_err(dev_from_gk20a(g),
+                               "invalid zbc color table index\n");
+                       return -EINVAL;
+               }
+               for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
+                       query_params->color_l2[i] =
+