1 /*
2  * drivers/video/tegra/host/gk20a/gr_gk20a.c
3  *
4  * GK20A Graphics
5  *
6  * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
7  *
8  * This program is free software; you can redistribute it and/or modify it
9  * under the terms and conditions of the GNU General Public License,
10  * version 2, as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope it will be useful, but WITHOUT
13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
15  * more details.
16  *
17  * You should have received a copy of the GNU General Public License along with
18  * this program; if not, write to the Free Software Foundation, Inc.,
19  * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
20  */
21
22 #include <linux/delay.h>        /* for udelay */
23 #include <linux/mm.h>           /* for totalram_pages */
24 #include <linux/scatterlist.h>
25 #include <linux/nvmap.h>
26 #include <linux/tegra-soc.h>
27 #include <linux/nvhost_dbg_gpu_ioctl.h>
28 #include <linux/vmalloc.h>
29 #include <linux/dma-mapping.h>
30 #include <linux/firmware.h>
31
32 #include "../dev.h"
33 #include "bus_client.h"
34
35 #include "gk20a.h"
36 #include "gr_ctx_gk20a.h"
37
38 #include "hw_ccsr_gk20a.h"
39 #include "hw_ctxsw_prog_gk20a.h"
40 #include "hw_fifo_gk20a.h"
41 #include "hw_gr_gk20a.h"
42 #include "hw_mc_gk20a.h"
43 #include "hw_ram_gk20a.h"
44 #include "hw_pri_ringmaster_gk20a.h"
45 #include "hw_pri_ringstation_sys_gk20a.h"
46 #include "hw_pri_ringstation_gpc_gk20a.h"
47 #include "hw_pri_ringstation_fbp_gk20a.h"
48 #include "hw_proj_gk20a.h"
49 #include "hw_top_gk20a.h"
50 #include "hw_ltc_gk20a.h"
51 #include "hw_fb_gk20a.h"
52 #include "hw_therm_gk20a.h"
53 #include "hw_pbdma_gk20a.h"
54 #include "chip_support.h"
55 #include "nvhost_memmgr.h"
56 #include "gk20a_gating_reglist.h"
57 #include "gr_pri_gk20a.h"
58 #include "regops_gk20a.h"
59 #include "dbg_gpu_gk20a.h"
60
61 #define BLK_SIZE (256)
62
63 static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va);
64 static int gr_gk20a_ctx_patch_write(struct gk20a *g, struct channel_ctx_gk20a *ch_ctx,
65                                     u32 addr, u32 data, bool patch);
66
67 /* global ctx buffer */
68 static int  gr_gk20a_alloc_global_ctx_buffers(struct gk20a *g);
69 static void gr_gk20a_free_global_ctx_buffers(struct gk20a *g);
70 static int  gr_gk20a_map_global_ctx_buffers(struct gk20a *g,
71                                             struct channel_gk20a *c);
72 static void gr_gk20a_unmap_global_ctx_buffers(struct channel_gk20a *c);
73
74 /* channel gr ctx buffer */
75 static int  gr_gk20a_alloc_channel_gr_ctx(struct gk20a *g,
76                                         struct channel_gk20a *c);
77 static void gr_gk20a_free_channel_gr_ctx(struct channel_gk20a *c);
78
79 /* channel patch ctx buffer */
80 static int  gr_gk20a_alloc_channel_patch_ctx(struct gk20a *g,
81                                         struct channel_gk20a *c);
82 static void gr_gk20a_free_channel_patch_ctx(struct channel_gk20a *c);
83
84 /* golden ctx image */
85 static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
86                                           struct channel_gk20a *c);
87 static int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
88                                           struct channel_gk20a *c);
89
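/* Dump FECS falcon state to the error log for debugging: status and
 * mailbox registers, plus internal falcon registers (IMB, DMB, CSW,
 * CTX, EXCI, PC, SP) read back through the ICD interface. */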
90 void gk20a_fecs_dump_falcon_stats(struct gk20a *g)
91 {
92         int i;
93
94         nvhost_err(dev_from_gk20a(g), "gr_fecs_os_r : %d",
95                 gk20a_readl(g, gr_fecs_os_r()));
96         nvhost_err(dev_from_gk20a(g), "gr_fecs_cpuctl_r : 0x%x",
97                 gk20a_readl(g, gr_fecs_cpuctl_r()));
98         nvhost_err(dev_from_gk20a(g), "gr_fecs_idlestate_r : 0x%x",
99                 gk20a_readl(g, gr_fecs_idlestate_r()));
100         nvhost_err(dev_from_gk20a(g), "gr_fecs_mailbox0_r : 0x%x",
101                 gk20a_readl(g, gr_fecs_mailbox0_r()));
102         nvhost_err(dev_from_gk20a(g), "gr_fecs_mailbox1_r : 0x%x",
103                 gk20a_readl(g, gr_fecs_mailbox1_r()));
104         nvhost_err(dev_from_gk20a(g), "gr_fecs_irqstat_r : 0x%x",
105                 gk20a_readl(g, gr_fecs_irqstat_r()));
106         nvhost_err(dev_from_gk20a(g), "gr_fecs_irqmode_r : 0x%x",
107                 gk20a_readl(g, gr_fecs_irqmode_r()));
108         nvhost_err(dev_from_gk20a(g), "gr_fecs_irqmask_r : 0x%x",
109                 gk20a_readl(g, gr_fecs_irqmask_r()));
110         nvhost_err(dev_from_gk20a(g), "gr_fecs_irqdest_r : 0x%x",
111                 gk20a_readl(g, gr_fecs_irqdest_r()));
112         nvhost_err(dev_from_gk20a(g), "gr_fecs_debug1_r : 0x%x",
113                 gk20a_readl(g, gr_fecs_debug1_r()));
114         nvhost_err(dev_from_gk20a(g), "gr_fecs_debuginfo_r : 0x%x",
115                 gk20a_readl(g, gr_fecs_debuginfo_r()));
116
117         for (i = 0; i < gr_fecs_ctxsw_mailbox__size_1_v(); i++)
118                 nvhost_err(dev_from_gk20a(g), "gr_fecs_ctxsw_mailbox_r(%d) : 0x%x",
119                         i, gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(i)));
120
121         nvhost_err(dev_from_gk20a(g), "gr_fecs_engctl_r : 0x%x",
122                 gk20a_readl(g, gr_fecs_engctl_r()));
123         nvhost_err(dev_from_gk20a(g), "gr_fecs_curctx_r : 0x%x",
124                 gk20a_readl(g, gr_fecs_curctx_r()));
125         nvhost_err(dev_from_gk20a(g), "gr_fecs_nxtctx_r : 0x%x",
126                 gk20a_readl(g, gr_fecs_nxtctx_r()));
127
128         gk20a_writel(g, gr_fecs_icd_cmd_r(),
129                 gr_fecs_icd_cmd_opc_rreg_f() |
130                 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_IMB));
131         nvhost_err(dev_from_gk20a(g), "FECS_FALCON_REG_IMB : 0x%x",
132                 gk20a_readl(g, gr_fecs_icd_rdata_r()));
133
134         gk20a_writel(g, gr_fecs_icd_cmd_r(),
135                 gr_fecs_icd_cmd_opc_rreg_f() |
136                 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_DMB));
137         nvhost_err(dev_from_gk20a(g), "FECS_FALCON_REG_DMB : 0x%x",
138                 gk20a_readl(g, gr_fecs_icd_rdata_r()));
139
140         gk20a_writel(g, gr_fecs_icd_cmd_r(),
141                 gr_fecs_icd_cmd_opc_rreg_f() |
142                 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_CSW));
143         nvhost_err(dev_from_gk20a(g), "FECS_FALCON_REG_CSW : 0x%x",
144                 gk20a_readl(g, gr_fecs_icd_rdata_r()));
145
146         gk20a_writel(g, gr_fecs_icd_cmd_r(),
147                 gr_fecs_icd_cmd_opc_rreg_f() |
148                 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_CTX));
149         nvhost_err(dev_from_gk20a(g), "FECS_FALCON_REG_CTX : 0x%x",
150                 gk20a_readl(g, gr_fecs_icd_rdata_r()));
151
152         gk20a_writel(g, gr_fecs_icd_cmd_r(),
153                 gr_fecs_icd_cmd_opc_rreg_f() |
154                 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_EXCI));
155         nvhost_err(dev_from_gk20a(g), "FECS_FALCON_REG_EXCI : 0x%x",
156                 gk20a_readl(g, gr_fecs_icd_rdata_r()));
157
158         for (i = 0; i < 4; i++) {
159                 gk20a_writel(g, gr_fecs_icd_cmd_r(),
160                         gr_fecs_icd_cmd_opc_rreg_f() |
161                         gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_PC));
162                 nvhost_err(dev_from_gk20a(g), "FECS_FALCON_REG_PC : 0x%x",
163                         gk20a_readl(g, gr_fecs_icd_rdata_r()));
164
165                 gk20a_writel(g, gr_fecs_icd_cmd_r(),
166                         gr_fecs_icd_cmd_opc_rreg_f() |
167                         gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_SP));
168                 nvhost_err(dev_from_gk20a(g), "FECS_FALCON_REG_SP : 0x%x",
169                         gk20a_readl(g, gr_fecs_icd_rdata_r()));
170         }
171 }
172
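/* Load the GPCCS and FECS ucode data segments into falcon DMEM using
 * auto-incrementing DMEM writes. */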
173 static void gr_gk20a_load_falcon_dmem(struct gk20a *g)
174 {
175         u32 i, ucode_u32_size;
176         const u32 *ucode_u32_data;
177         u32 checksum;
178
179         nvhost_dbg_fn("");
180
181         gk20a_writel(g, gr_gpccs_dmemc_r(0), (gr_gpccs_dmemc_offs_f(0) |
182                                               gr_gpccs_dmemc_blk_f(0)  |
183                                               gr_gpccs_dmemc_aincw_f(1)));
184
185         ucode_u32_size = g->gr.ctx_vars.ucode.gpccs.data.count;
186         ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.gpccs.data.l;
187
188         for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
189                 gk20a_writel(g, gr_gpccs_dmemd_r(0), ucode_u32_data[i]);
190                 checksum += ucode_u32_data[i];
191         }
192
193         gk20a_writel(g, gr_fecs_dmemc_r(0), (gr_fecs_dmemc_offs_f(0) |
194                                              gr_fecs_dmemc_blk_f(0)  |
195                                              gr_fecs_dmemc_aincw_f(1)));
196
197         ucode_u32_size = g->gr.ctx_vars.ucode.fecs.data.count;
198         ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.fecs.data.l;
199
200         for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
201                 gk20a_writel(g, gr_fecs_dmemd_r(0), ucode_u32_data[i]);
202                 checksum += ucode_u32_data[i];
203         }
204         nvhost_dbg_fn("done");
205 }
206
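/* Load the GPCCS and FECS ucode instruction segments into falcon IMEM,
 * writing a new tag for every 256-byte block and zero-padding the
 * final block. */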
207 static void gr_gk20a_load_falcon_imem(struct gk20a *g)
208 {
209         u32 cfg, fecs_imem_size, gpccs_imem_size, ucode_u32_size;
210         const u32 *ucode_u32_data;
211         u32 tag, i, pad_start, pad_end;
212         u32 checksum;
213
214         nvhost_dbg_fn("");
215
216         cfg = gk20a_readl(g, gr_fecs_cfg_r());
217         fecs_imem_size = gr_fecs_cfg_imem_sz_v(cfg);
218
219         cfg = gk20a_readl(g, gr_gpc0_cfg_r());
220         gpccs_imem_size = gr_gpc0_cfg_imem_sz_v(cfg);
221
222         /* Use the broadcast address to access all of the GPCCS units. */
223         gk20a_writel(g, gr_gpccs_imemc_r(0), (gr_gpccs_imemc_offs_f(0) |
224                                               gr_gpccs_imemc_blk_f(0) |
225                                               gr_gpccs_imemc_aincw_f(1)));
226
227         /* Setup the tags for the instruction memory. */
228         tag = 0;
229         gk20a_writel(g, gr_gpccs_imemt_r(0), gr_gpccs_imemt_tag_f(tag));
230
231         ucode_u32_size = g->gr.ctx_vars.ucode.gpccs.inst.count;
232         ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.gpccs.inst.l;
233
234         for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
235                 if (i && ((i % (256/sizeof(u32))) == 0)) {
236                         tag++;
237                         gk20a_writel(g, gr_gpccs_imemt_r(0),
238                                       gr_gpccs_imemt_tag_f(tag));
239                 }
240                 gk20a_writel(g, gr_gpccs_imemd_r(0), ucode_u32_data[i]);
241                 checksum += ucode_u32_data[i];
242         }
243
244         pad_start = i*4;
245         pad_end = pad_start+(256-pad_start%256)+256;
246         for (i = pad_start;
247              (i < gpccs_imem_size * 256) && (i < pad_end);
248              i += 4) {
249                 if (i && ((i % 256) == 0)) {
250                         tag++;
251                         gk20a_writel(g, gr_gpccs_imemt_r(0),
252                                       gr_gpccs_imemt_tag_f(tag));
253                 }
254                 gk20a_writel(g, gr_gpccs_imemd_r(0), 0);
255         }
256
257         gk20a_writel(g, gr_fecs_imemc_r(0), (gr_fecs_imemc_offs_f(0) |
258                                              gr_fecs_imemc_blk_f(0) |
259                                              gr_fecs_imemc_aincw_f(1)));
260
261         /* Setup the tags for the instruction memory. */
262         tag = 0;
263         gk20a_writel(g, gr_fecs_imemt_r(0), gr_fecs_imemt_tag_f(tag));
264
265         ucode_u32_size = g->gr.ctx_vars.ucode.fecs.inst.count;
266         ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.fecs.inst.l;
267
268         for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
269                 if (i && ((i % (256/sizeof(u32))) == 0)) {
270                         tag++;
271                         gk20a_writel(g, gr_fecs_imemt_r(0),
272                                       gr_fecs_imemt_tag_f(tag));
273                 }
274                 gk20a_writel(g, gr_fecs_imemd_r(0), ucode_u32_data[i]);
275                 checksum += ucode_u32_data[i];
276         }
277
278         pad_start = i*4;
279         pad_end = pad_start+(256-pad_start%256)+256;
280         for (i = pad_start; (i < fecs_imem_size * 256) && i < pad_end; i += 4) {
281                 if (i && ((i % 256) == 0)) {
282                         tag++;
283                         gk20a_writel(g, gr_fecs_imemt_r(0),
284                                       gr_fecs_imemt_tag_f(tag));
285                 }
286                 gk20a_writel(g, gr_fecs_imemd_r(0), 0);
287         }
288 }
289
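/* Poll until the GR engine is idle (or disabled) and no context switch
 * is in progress, backing off exponentially up to GR_IDLE_CHECK_MAX.
 * Returns -EAGAIN on timeout. */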
290 static int gr_gk20a_wait_idle(struct gk20a *g, unsigned long end_jiffies,
291                 u32 expect_delay)
292 {
293         u32 delay = expect_delay;
294         bool gr_enabled;
295         bool ctxsw_active;
296         bool gr_busy;
297
298         nvhost_dbg_fn("");
299
300         do {
301                 /* fmodel: host gets fifo_engine_status(gr) from gr
302                    only when gr_status is read */
303                 gk20a_readl(g, gr_status_r());
304
305                 gr_enabled = gk20a_readl(g, mc_enable_r()) &
306                         mc_enable_pgraph_enabled_f();
307
308                 ctxsw_active = gk20a_readl(g,
309                         fifo_engine_status_r(ENGINE_GR_GK20A)) &
310                         fifo_engine_status_ctxsw_in_progress_f();
311
312                 gr_busy = gk20a_readl(g, gr_engine_status_r()) &
313                         gr_engine_status_value_busy_f();
314
315                 if (!gr_enabled || (!gr_busy && !ctxsw_active)) {
316                         nvhost_dbg_fn("done");
317                         return 0;
318                 }
319
320                 usleep_range(delay, delay * 2);
321                 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
322
323         } while (time_before(jiffies, end_jiffies));
324
325         nvhost_err(dev_from_gk20a(g),
326                 "timeout, ctxsw busy : %d, gr busy : %d",
327                 ctxsw_active, gr_busy);
328
329         return -EAGAIN;
330 }
331
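/* Reset the FECS context switch logic: force GR clocks on, assert the
 * requested reset mask (or a sys/gpc/be context reset by default),
 * release the reset, then return the power mode to auto. */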
332 static int gr_gk20a_ctx_reset(struct gk20a *g, u32 rst_mask)
333 {
334         u32 delay = GR_IDLE_CHECK_DEFAULT;
335         unsigned long end_jiffies = jiffies +
336                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
337         u32 reg;
338
339         nvhost_dbg_fn("");
340
341         /* Force clocks on */
342         gk20a_writel(g, gr_fe_pwr_mode_r(),
343                      gr_fe_pwr_mode_req_send_f() |
344                      gr_fe_pwr_mode_mode_force_on_f());
345
346         /* Wait for the clocks to indicate that they are on */
347         do {
348                 reg = gk20a_readl(g, gr_fe_pwr_mode_r());
349
350                 if (gr_fe_pwr_mode_req_v(reg) == gr_fe_pwr_mode_req_done_v())
351                         break;
352
353                 usleep_range(delay, delay * 2);
354                 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
355
356         } while (time_before(jiffies, end_jiffies));
357
358         if (!time_before(jiffies, end_jiffies)) {
359                 nvhost_err(dev_from_gk20a(g),
360                            "failed to force the clocks on\n");
361                 WARN_ON(1);
362         }
363
364         if (rst_mask) {
365                 gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(), rst_mask);
366         } else {
367                 gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(),
368                              gr_fecs_ctxsw_reset_ctl_sys_halt_disabled_f() |
369                              gr_fecs_ctxsw_reset_ctl_gpc_halt_disabled_f() |
370                              gr_fecs_ctxsw_reset_ctl_be_halt_disabled_f()  |
371                              gr_fecs_ctxsw_reset_ctl_sys_engine_reset_disabled_f() |
372                              gr_fecs_ctxsw_reset_ctl_gpc_engine_reset_disabled_f() |
373                              gr_fecs_ctxsw_reset_ctl_be_engine_reset_disabled_f()  |
374                              gr_fecs_ctxsw_reset_ctl_sys_context_reset_enabled_f() |
375                              gr_fecs_ctxsw_reset_ctl_gpc_context_reset_enabled_f() |
376                              gr_fecs_ctxsw_reset_ctl_be_context_reset_enabled_f());
377         }
378
379         /* we need to read the reset register *and* wait for a moment to ensure
380          * reset propagation */
381
382         gk20a_readl(g, gr_fecs_ctxsw_reset_ctl_r());
383         udelay(20);
384
385         gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(),
386                      gr_fecs_ctxsw_reset_ctl_sys_halt_disabled_f() |
387                      gr_fecs_ctxsw_reset_ctl_gpc_halt_disabled_f() |
388                      gr_fecs_ctxsw_reset_ctl_be_halt_disabled_f()  |
389                      gr_fecs_ctxsw_reset_ctl_sys_engine_reset_disabled_f() |
390                      gr_fecs_ctxsw_reset_ctl_gpc_engine_reset_disabled_f() |
391                      gr_fecs_ctxsw_reset_ctl_be_engine_reset_disabled_f()  |
392                      gr_fecs_ctxsw_reset_ctl_sys_context_reset_disabled_f() |
393                      gr_fecs_ctxsw_reset_ctl_gpc_context_reset_disabled_f() |
394                      gr_fecs_ctxsw_reset_ctl_be_context_reset_disabled_f());
395
396         /* we need to read back the reset register and then wait a small moment after that */
397         gk20a_readl(g, gr_fecs_ctxsw_reset_ctl_r());
398         udelay(20);
399
400         /* Set power mode back to auto */
401         gk20a_writel(g, gr_fe_pwr_mode_r(),
402                      gr_fe_pwr_mode_req_send_f() |
403                      gr_fe_pwr_mode_mode_auto_f());
404
405         /* Wait for the request to complete */
406         end_jiffies = jiffies + msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
407         do {
408                 reg = gk20a_readl(g, gr_fe_pwr_mode_r());
409
410                 if (gr_fe_pwr_mode_req_v(reg) == gr_fe_pwr_mode_req_done_v())
411                         break;
412
413                 usleep_range(delay, delay * 2);
414                 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
415
416         } while (time_before(jiffies, end_jiffies));
417
418         if (!time_before(jiffies, end_jiffies)) {
419                 nvhost_err(dev_from_gk20a(g),
420                            "failed to set power mode to auto\n");
421                 WARN_ON(1);
422         }
423
424         return 0;
425 }
426
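/* Poll a FECS ctxsw mailbox until its value satisfies the success or
 * failure condition (opc_success/opc_fail compared against mailbox_ok/
 * mailbox_fail), or until the timeout expires. */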
427 static int gr_gk20a_ctx_wait_ucode(struct gk20a *g, u32 mailbox_id,
428                                    u32 *mailbox_ret, u32 opc_success,
429                                    u32 mailbox_ok, u32 opc_fail,
430                                    u32 mailbox_fail)
431 {
432         unsigned long end_jiffies = jiffies +
433                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
434         u32 delay = GR_IDLE_CHECK_DEFAULT;
435         u32 check = WAIT_UCODE_LOOP;
436         u32 reg;
437
438         nvhost_dbg_fn("");
439
440         while (check == WAIT_UCODE_LOOP) {
441                 if (!time_before(jiffies, end_jiffies) &&
442                                 tegra_platform_is_silicon())
443                         check = WAIT_UCODE_TIMEOUT;
444
445                 reg = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(mailbox_id));
446
447                 if (mailbox_ret)
448                         *mailbox_ret = reg;
449
450                 switch (opc_success) {
451                 case GR_IS_UCODE_OP_EQUAL:
452                         if (reg == mailbox_ok)
453                                 check = WAIT_UCODE_OK;
454                         break;
455                 case GR_IS_UCODE_OP_NOT_EQUAL:
456                         if (reg != mailbox_ok)
457                                 check = WAIT_UCODE_OK;
458                         break;
459                 case GR_IS_UCODE_OP_AND:
460                         if (reg & mailbox_ok)
461                                 check = WAIT_UCODE_OK;
462                         break;
463                 case GR_IS_UCODE_OP_LESSER:
464                         if (reg < mailbox_ok)
465                                 check = WAIT_UCODE_OK;
466                         break;
467                 case GR_IS_UCODE_OP_LESSER_EQUAL:
468                         if (reg <= mailbox_ok)
469                                 check = WAIT_UCODE_OK;
470                         break;
471                 case GR_IS_UCODE_OP_SKIP:
472                         /* do no success check */
473                         break;
474                 default:
475                         nvhost_err(dev_from_gk20a(g),
476                                    "invalid success opcode 0x%x", opc_success);
477
478                         check = WAIT_UCODE_ERROR;
479                         break;
480                 }
481
482                 switch (opc_fail) {
483                 case GR_IS_UCODE_OP_EQUAL:
484                         if (reg == mailbox_fail)
485                                 check = WAIT_UCODE_ERROR;
486                         break;
487                 case GR_IS_UCODE_OP_NOT_EQUAL:
488                         if (reg != mailbox_fail)
489                                 check = WAIT_UCODE_ERROR;
490                         break;
491                 case GR_IS_UCODE_OP_AND:
492                         if (reg & mailbox_fail)
493                                 check = WAIT_UCODE_ERROR;
494                         break;
495                 case GR_IS_UCODE_OP_LESSER:
496                         if (reg < mailbox_fail)
497                                 check = WAIT_UCODE_ERROR;
498                         break;
499                 case GR_IS_UCODE_OP_LESSER_EQUAL:
500                         if (reg <= mailbox_fail)
501                                 check = WAIT_UCODE_ERROR;
502                         break;
503                 case GR_IS_UCODE_OP_SKIP:
504                         /* do no check on fail */
505                         break;
506                 default:
507                         nvhost_err(dev_from_gk20a(g),
508                                    "invalid fail opcode 0x%x", opc_fail);
509                         check = WAIT_UCODE_ERROR;
510                         break;
511                 }
512
513                 usleep_range(delay, delay * 2);
514                 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
515         }
516
517         if (check == WAIT_UCODE_TIMEOUT) {
518                 nvhost_err(dev_from_gk20a(g),
519                            "timeout waiting on ucode response");
520                 gk20a_fecs_dump_falcon_stats(g);
521                 return -1;
522         } else if (check == WAIT_UCODE_ERROR) {
523                 nvhost_err(dev_from_gk20a(g),
524                            "ucode method failed on mailbox=%d value=0x%08x",
525                            mailbox_id, reg);
526                 gk20a_fecs_dump_falcon_stats(g);
527                 return -1;
528         }
529
530         nvhost_dbg_fn("done");
531         return 0;
532 }
533
534 /* The following is a less brittle way to call gr_gk20a_submit_fecs_method(...).
535  * We should replace most, if not all, fecs method calls with this instead. */
536 struct fecs_method_op_gk20a {
537         struct {
538                 u32 addr;
539                 u32 data;
540         } method;
541
542         struct {
543                 u32 id;
544                 u32 data;
545                 u32 clr;
546                 u32 *ret;
547                 u32 ok;
548                 u32 fail;
549         } mailbox;
550
551         struct {
552                 u32 ok;
553                 u32 fail;
554         } cond;
555
556 };
557
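/* Submit a FECS method described by 'op' and wait for the ucode to
 * report success or failure through the selected mailbox. Serialized
 * with other FECS methods via gr->fecs_mutex. */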
558 int gr_gk20a_submit_fecs_method_op(struct gk20a *g,
559                                    struct fecs_method_op_gk20a op)
560 {
561         struct gr_gk20a *gr = &g->gr;
562         int ret;
563
564         mutex_lock(&gr->fecs_mutex);
565
566         if (op.mailbox.id != 0)
567                 gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(op.mailbox.id),
568                              op.mailbox.data);
569
570         gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0),
571                 gr_fecs_ctxsw_mailbox_clear_value_f(op.mailbox.clr));
572
573         gk20a_writel(g, gr_fecs_method_data_r(), op.method.data);
574         gk20a_writel(g, gr_fecs_method_push_r(),
575                 gr_fecs_method_push_adr_f(op.method.addr));
576
577         /* op.mailbox.id == 4 cases require waiting for completion
578          * on mailbox 0 (op.mailbox.id == 0) */
579         if (op.mailbox.id == 4)
580                 op.mailbox.id = 0;
581
582         ret = gr_gk20a_ctx_wait_ucode(g, op.mailbox.id, op.mailbox.ret,
583                                       op.cond.ok, op.mailbox.ok,
584                                       op.cond.fail, op.mailbox.fail);
585
586         mutex_unlock(&gr->fecs_mutex);
587
588         return ret;
589 }
590
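/* Ask the FECS ucode to start or stop processing context switches; the
 * mailbox 1 result is returned through *ret when non-NULL. */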
591 int gr_gk20a_ctrl_ctxsw(struct gk20a *g, u32 fecs_method, u32 *ret)
592 {
593         return gr_gk20a_submit_fecs_method_op(g,
594               (struct fecs_method_op_gk20a) {
595                       .method.addr = fecs_method,
596                       .method.data = ~0,
597                       .mailbox = { .id   = 1, /*sideband?*/
598                                    .data = ~0, .clr = ~0, .ret = ret,
599                                    .ok   = gr_fecs_ctxsw_mailbox_value_pass_v(),
600                                    .fail = gr_fecs_ctxsw_mailbox_value_fail_v(), },
601                       .cond.ok = GR_IS_UCODE_OP_EQUAL,
602                       .cond.fail = GR_IS_UCODE_OP_EQUAL });
603 }
604
605 /* Stop processing (stall) context switches at FECS */
606 int gr_gk20a_disable_ctxsw(struct gk20a *g)
607 {
608         nvhost_dbg(dbg_fn | dbg_gpu_dbg, "");
609         return gr_gk20a_ctrl_ctxsw(g, gr_fecs_method_push_adr_stop_ctxsw_v(), 0);
610 }
611
612 /* Start processing (continue) context switches at FECS */
613 int gr_gk20a_enable_ctxsw(struct gk20a *g)
614 {
615         nvhost_dbg(dbg_fn | dbg_gpu_dbg, "");
616         return gr_gk20a_ctrl_ctxsw(g, gr_fecs_method_push_adr_start_ctxsw_v(), 0);
617 }
618
619
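/* Write the graphics context GPU virtual address (as a virtual WFI
 * pointer) into the channel's instance block. */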
620 static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va)
621 {
622         u32 addr_lo;
623         u32 addr_hi;
624         void *inst_ptr = NULL;
625
626         nvhost_dbg_fn("");
627
628         /* flush gpu_va before commit */
629         gk20a_mm_fb_flush(c->g);
630         gk20a_mm_l2_flush(c->g, true);
631
632         inst_ptr = c->inst_block.cpuva;
633         if (!inst_ptr)
634                 return -ENOMEM;
635
636         addr_lo = u64_lo32(gpu_va) >> 12;
637         addr_hi = u64_hi32(gpu_va);
638
639         mem_wr32(inst_ptr, ram_in_gr_wfi_target_w(),
640                  ram_in_gr_cs_wfi_f() | ram_in_gr_wfi_mode_virtual_f() |
641                  ram_in_gr_wfi_ptr_lo_f(addr_lo));
642
643         mem_wr32(inst_ptr, ram_in_gr_wfi_ptr_hi_w(),
644                  ram_in_gr_wfi_ptr_hi_f(addr_hi));
645
646         gk20a_mm_l2_invalidate(c->g);
647
648         return 0;
649 }
650
651 /*
652  * Context state can be written directly or "patched" at times.
653  * So that the same code can be used in either situation, it is written
654  * as a series of _ctx_patch_write(..., patch) statements.
655  * However, any necessary cpu map/unmap and gpu l2 invalidates
656  * should be minimized (i.e. not done once per patch write).
657  * Bracket such a sequence with "_ctx_patch_write_begin"
658  * and "_ctx_patch_write_end".
659  */
660 static int gr_gk20a_ctx_patch_write_begin(struct gk20a *g,
661                                           struct channel_ctx_gk20a *ch_ctx)
662 {
663         /* being defensive still... */
664         if (ch_ctx->patch_ctx.cpu_va) {
665                 nvhost_err(dev_from_gk20a(g), "nested ctx patch begin?");
666                 return -EBUSY;
667         }
668
669         ch_ctx->patch_ctx.cpu_va =
670                 nvhost_memmgr_mmap(ch_ctx->patch_ctx.mem.ref);
671
672         if (!ch_ctx->patch_ctx.cpu_va)
673                 return -ENOMEM;
674
675         return 0;
676 }
677
678 static int gr_gk20a_ctx_patch_write_end(struct gk20a *g,
679                                         struct channel_ctx_gk20a *ch_ctx)
680 {
681         /* being defensive still... */
682         if (!ch_ctx->patch_ctx.cpu_va) {
683                 nvhost_err(dev_from_gk20a(g), "dangling ctx patch end?");
684                 return -EINVAL;
685         }
686
687         nvhost_memmgr_munmap(ch_ctx->patch_ctx.mem.ref,
688                              ch_ctx->patch_ctx.cpu_va);
689         ch_ctx->patch_ctx.cpu_va = NULL;
690
691         gk20a_mm_l2_invalidate(g);
692         return 0;
693 }
694
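/* Write a register value either directly (patch == false) or by
 * appending an (addr, data) pair to the channel's patch context
 * buffer (patch == true). */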
695 static int gr_gk20a_ctx_patch_write(struct gk20a *g,
696                                     struct channel_ctx_gk20a *ch_ctx,
697                                     u32 addr, u32 data, bool patch)
698 {
699         u32 patch_slot = 0;
700         void *patch_ptr = NULL;
701         bool mapped_here = false;
702
703         BUG_ON(patch != 0 && ch_ctx == NULL);
704
705         if (patch) {
706                 if (!ch_ctx)
707                         return -EINVAL;
708                 /* we added an optimization prolog, epilog
709                  * to get rid of unnecessary maps and l2 invals.
710                  * but be defensive still... */
711                 if (!ch_ctx->patch_ctx.cpu_va) {
712                         int err;
713                         nvhost_err(dev_from_gk20a(g),
714                                    "per-write ctx patch begin?");
715                         /* yes, gr_gk20a_ctx_patch_smpc causes this one */
716                         err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
717                         if (err)
718                                 return err;
719                         mapped_here = true;
720                 } else
721                         mapped_here = false;
722
723                 patch_ptr = ch_ctx->patch_ctx.cpu_va;
724                 patch_slot = ch_ctx->patch_ctx.data_count * 2;
725
726                 mem_wr32(patch_ptr, patch_slot++, addr);
727                 mem_wr32(patch_ptr, patch_slot++, data);
728
729                 ch_ctx->patch_ctx.data_count++;
730
731                 if (mapped_here)
732                         gr_gk20a_ctx_patch_write_end(g, ch_ctx);
733
734         } else
735                 gk20a_writel(g, addr, data);
736
737         return 0;
738 }
739
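/* Ask the FECS ucode to bind the channel's instance block as the
 * current graphics context. */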
740 static int gr_gk20a_fecs_ctx_bind_channel(struct gk20a *g,
741                                         struct channel_gk20a *c)
742 {
743         u32 inst_base_ptr = u64_lo32(c->inst_block.cpu_pa
744                                      >> ram_in_base_shift_v());
745         u32 ret;
746
747         nvhost_dbg_info("bind channel %d inst ptr 0x%08x",
748                    c->hw_chid, inst_base_ptr);
749
750         ret = gr_gk20a_submit_fecs_method_op(g,
751                      (struct fecs_method_op_gk20a) {
752                      .method.addr = gr_fecs_method_push_adr_bind_pointer_v(),
753                      .method.data = (gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
754                                      gr_fecs_current_ctx_target_vid_mem_f() |
755                                      gr_fecs_current_ctx_valid_f(1)),
756                      .mailbox = { .id = 0, .data = 0,
757                                   .clr = 0x30,
758                                   .ret = NULL,
759                                   .ok = 0x10,
760                                   .fail = 0x20, },
761                      .cond.ok = GR_IS_UCODE_OP_AND,
762                      .cond.fail = GR_IS_UCODE_OP_AND});
763         if (ret)
764                 nvhost_err(dev_from_gk20a(g),
765                         "bind channel instance failed");
766
767         return ret;
768 }
769
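/* Program the zcull mode and buffer pointer into the channel's gr
 * context image, optionally disabling GR engine activity around the
 * update. */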
770 static int gr_gk20a_ctx_zcull_setup(struct gk20a *g, struct channel_gk20a *c,
771                                     bool disable_fifo)
772 {
773         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
774         struct fifo_gk20a *f = &g->fifo;
775         struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
776         u32 va_lo, va_hi, va;
777         int ret = 0;
778         void *ctx_ptr = NULL;
779
780         nvhost_dbg_fn("");
781
782         ctx_ptr = vmap(ch_ctx->gr_ctx.pages,
783                         PAGE_ALIGN(ch_ctx->gr_ctx.size) >> PAGE_SHIFT,
784                         0, pgprot_dmacoherent(PAGE_KERNEL));
785         if (!ctx_ptr)
786                 return -ENOMEM;
787
788         if (ch_ctx->zcull_ctx.gpu_va == 0 &&
789             ch_ctx->zcull_ctx.ctx_sw_mode ==
790                 ctxsw_prog_main_image_zcull_mode_separate_buffer_v()) {
791                 ret = -EINVAL;
792                 goto clean_up;
793         }
794
795         va_lo = u64_lo32(ch_ctx->zcull_ctx.gpu_va);
796         va_hi = u64_hi32(ch_ctx->zcull_ctx.gpu_va);
797         va = ((va_lo >> 8) & 0x00FFFFFF) | ((va_hi << 24) & 0xFF000000);
798
799         if (disable_fifo) {
800                 ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
801                 if (ret) {
802                         nvhost_err(dev_from_gk20a(g),
803                                 "failed to disable gr engine activity\n");
804                         goto clean_up;
805                 }
806         }
807
808         /* Channel gr_ctx buffer is gpu cacheable.
809            Flush and invalidate before cpu update. */
810         gk20a_mm_fb_flush(g);
811         gk20a_mm_l2_flush(g, true);
812
813         mem_wr32(ctx_ptr + ctxsw_prog_main_image_zcull_o(), 0,
814                  ch_ctx->zcull_ctx.ctx_sw_mode);
815
816         mem_wr32(ctx_ptr + ctxsw_prog_main_image_zcull_ptr_o(), 0, va);
817
818         if (disable_fifo) {
819                 ret = gk20a_fifo_enable_engine_activity(g, gr_info);
820                 if (ret) {
821                         nvhost_err(dev_from_gk20a(g),
822                                 "failed to enable gr engine activity\n");
823                         goto clean_up;
824                 }
825         }
826         gk20a_mm_l2_invalidate(g);
827
828 clean_up:
829         vunmap(ctx_ptr);
830
831         return ret;
832 }
833
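/* Program the per-PPC attribute (beta) and alpha circular buffer
 * offsets and sizes, either directly or through the patch context. */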
834 static int gr_gk20a_commit_global_cb_manager(struct gk20a *g,
835                         struct channel_gk20a *c, bool patch)
836 {
837         struct gr_gk20a *gr = &g->gr;
838         struct channel_ctx_gk20a *ch_ctx = NULL;
839         u32 attrib_offset_in_chunk = 0;
840         u32 alpha_offset_in_chunk = 0;
841         u32 pd_ab_max_output;
842         u32 gpc_index, ppc_index;
843         u32 temp;
844         u32 cbm_cfg_size1, cbm_cfg_size2;
845
846         nvhost_dbg_fn("");
847
848         if (patch) {
849                 int err;
850                 ch_ctx = &c->ch_ctx;
851                 err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
852                 if (err)
853                         return err;
854         }
855
856         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_tga_constraintlogic_r(),
857                 gr_ds_tga_constraintlogic_beta_cbsize_f(gr->attrib_cb_default_size) |
858                 gr_ds_tga_constraintlogic_alpha_cbsize_f(gr->alpha_cb_default_size),
859                 patch);
860
861         pd_ab_max_output = (gr->alpha_cb_default_size *
862                 gr_gpc0_ppc0_cbm_cfg_size_granularity_v()) /
863                 gr_pd_ab_dist_cfg1_max_output_granularity_v();
864
865         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg1_r(),
866                 gr_pd_ab_dist_cfg1_max_output_f(pd_ab_max_output) |
867                 gr_pd_ab_dist_cfg1_max_batches_init_f(), patch);
868
869         alpha_offset_in_chunk = attrib_offset_in_chunk +
870                 gr->tpc_count * gr->attrib_cb_size;
871
872         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
873                 temp = proj_gpc_stride_v() * gpc_index;
874                 for (ppc_index = 0; ppc_index < gr->gpc_ppc_count[gpc_index];
875                      ppc_index++) {
876                         cbm_cfg_size1 = gr->attrib_cb_default_size *
877                                 gr->pes_tpc_count[ppc_index][gpc_index];
878                         cbm_cfg_size2 = gr->alpha_cb_default_size *
879                                 gr->pes_tpc_count[ppc_index][gpc_index];
880
881                         gr_gk20a_ctx_patch_write(g, ch_ctx,
882                                 gr_gpc0_ppc0_cbm_cfg_r() + temp +
883                                 proj_ppc_in_gpc_stride_v() * ppc_index,
884                                 gr_gpc0_ppc0_cbm_cfg_timeslice_mode_f(gr->timeslice_mode) |
885                                 gr_gpc0_ppc0_cbm_cfg_start_offset_f(attrib_offset_in_chunk) |
886                                 gr_gpc0_ppc0_cbm_cfg_size_f(cbm_cfg_size1), patch);
887
888                         attrib_offset_in_chunk += gr->attrib_cb_size *
889                                 gr->pes_tpc_count[ppc_index][gpc_index];
890
891                         gr_gk20a_ctx_patch_write(g, ch_ctx,
892                                 gr_gpc0_ppc0_cbm_cfg2_r() + temp +
893                                 proj_ppc_in_gpc_stride_v() * ppc_index,
894                                 gr_gpc0_ppc0_cbm_cfg2_start_offset_f(alpha_offset_in_chunk) |
895                                 gr_gpc0_ppc0_cbm_cfg2_size_f(cbm_cfg_size2), patch);
896
897                         alpha_offset_in_chunk += gr->alpha_cb_size *
898                                 gr->pes_tpc_count[ppc_index][gpc_index];
899                 }
900         }
901
902         if (patch)
903                 gr_gk20a_ctx_patch_write_end(g, ch_ctx);
904
905         return 0;
906 }
907
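/* Program the page pool, bundle circular buffer and attribute circular
 * buffer base addresses and sizes from the channel's global context
 * buffer mappings, either directly or through the patch context. */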
908 static int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g,
909                         struct channel_gk20a *c, bool patch)
910 {
911         struct gr_gk20a *gr = &g->gr;
912         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
913         u64 addr;
914         u32 size;
915         u32 data;
916
917         nvhost_dbg_fn("");
918         if (patch) {
919                 int err;
920                 err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
921                 if (err)
922                         return err;
923         }
924
925         /* global pagepool buffer */
926         addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) >>
927                 gr_scc_pagepool_base_addr_39_8_align_bits_v()) |
928                 (u64_hi32(ch_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) <<
929                  (32 - gr_scc_pagepool_base_addr_39_8_align_bits_v()));
930
931         size = gr->global_ctx_buffer[PAGEPOOL].size /
932                 gr_scc_pagepool_total_pages_byte_granularity_v();
933
934         if (size == gr_scc_pagepool_total_pages_hwmax_value_v())
935                 size = gr_scc_pagepool_total_pages_hwmax_v();
936
937         nvhost_dbg_info("pagepool buffer addr : 0x%016llx, size : %d",
938                 addr, size);
939
940         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_pagepool_base_r(),
941                 gr_scc_pagepool_base_addr_39_8_f(addr), patch);
942
943         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_pagepool_r(),
944                 gr_scc_pagepool_total_pages_f(size) |
945                 gr_scc_pagepool_valid_true_f(), patch);
946
947         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gcc_pagepool_base_r(),
948                 gr_gpcs_gcc_pagepool_base_addr_39_8_f(addr), patch);
949
950         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gcc_pagepool_r(),
951                 gr_gpcs_gcc_pagepool_total_pages_f(size), patch);
952
953         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_pagepool_r(),
954                 gr_pd_pagepool_total_pages_f(size) |
955                 gr_pd_pagepool_valid_true_f(), patch);
956
957         /* global bundle cb */
958         addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[CIRCULAR_VA]) >>
959                 gr_scc_bundle_cb_base_addr_39_8_align_bits_v()) |
960                 (u64_hi32(ch_ctx->global_ctx_buffer_va[CIRCULAR_VA]) <<
961                  (32 - gr_scc_bundle_cb_base_addr_39_8_align_bits_v()));
962
963         size = gr->bundle_cb_default_size;
964
965         nvhost_dbg_info("bundle cb addr : 0x%016llx, size : %d",
966                 addr, size);
967
968         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_bundle_cb_base_r(),
969                 gr_scc_bundle_cb_base_addr_39_8_f(addr), patch);
970
971         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_bundle_cb_size_r(),
972                 gr_scc_bundle_cb_size_div_256b_f(size) |
973                 gr_scc_bundle_cb_size_valid_true_f(), patch);
974
975         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_bundle_cb_base_r(),
976                 gr_gpcs_setup_bundle_cb_base_addr_39_8_f(addr), patch);
977
978         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_bundle_cb_size_r(),
979                 gr_gpcs_setup_bundle_cb_size_div_256b_f(size) |
980                 gr_gpcs_setup_bundle_cb_size_valid_true_f(), patch);
981
982         /* data for state_limit */
983         data = (gr->bundle_cb_default_size *
984                 gr_scc_bundle_cb_size_div_256b_byte_granularity_v()) /
985                 gr_pd_ab_dist_cfg2_state_limit_scc_bundle_granularity_v();
986
987         data = min_t(u32, data, gr->min_gpm_fifo_depth);
988
989         nvhost_dbg_info("bundle cb token limit : %d, state limit : %d",
990                    gr->bundle_cb_token_limit, data);
991
992         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg2_r(),
993                 gr_pd_ab_dist_cfg2_token_limit_f(gr->bundle_cb_token_limit) |
994                 gr_pd_ab_dist_cfg2_state_limit_f(data), patch);
995
996         /* global attrib cb */
997         addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[ATTRIBUTE_VA]) >>
998                 gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v()) |
999                 (u64_hi32(ch_ctx->global_ctx_buffer_va[ATTRIBUTE_VA]) <<
1000                  (32 - gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v()));
1001
1002         nvhost_dbg_info("attrib cb addr : 0x%016llx", addr);
1003
1004         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_attrib_cb_base_r(),
1005                 gr_gpcs_setup_attrib_cb_base_addr_39_12_f(addr) |
1006                 gr_gpcs_setup_attrib_cb_base_valid_true_f(), patch);
1007
1008         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pe_pin_cb_global_base_addr_r(),
1009                 gr_gpcs_tpcs_pe_pin_cb_global_base_addr_v_f(addr) |
1010                 gr_gpcs_tpcs_pe_pin_cb_global_base_addr_valid_true_f(), patch);
1011
1012         if (patch)
1013                 gr_gk20a_ctx_patch_write_end(g, ch_ctx);
1014
1015         return 0;
1016 }
1017
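/* Enable or disable timeslice mode in the GPM/PE/PD/DS/MPC units
 * according to gr->timeslice_mode, either directly or through the
 * patch context. */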
1018 static int gr_gk20a_commit_global_timeslice(struct gk20a *g, struct channel_gk20a *c, bool patch)
1019 {
1020         struct gr_gk20a *gr = &g->gr;
1021         struct channel_ctx_gk20a *ch_ctx = NULL;
1022         u32 gpm_pd_cfg;
1023         u32 pd_ab_dist_cfg0;
1024         u32 ds_debug;
1025         u32 mpc_vtg_debug;
1026         u32 pe_vaf;
1027         u32 pe_vsc_vpc;
1028
1029         nvhost_dbg_fn("");
1030
1031         gpm_pd_cfg = gk20a_readl(g, gr_gpcs_gpm_pd_cfg_r());
1032         pd_ab_dist_cfg0 = gk20a_readl(g, gr_pd_ab_dist_cfg0_r());
1033         ds_debug = gk20a_readl(g, gr_ds_debug_r());
1034         mpc_vtg_debug = gk20a_readl(g, gr_gpcs_tpcs_mpc_vtg_debug_r());
1035
1036         if (patch) {
1037                 int err;
1038                 ch_ctx = &c->ch_ctx;
1039                 err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
1040                 if (err)
1041                         return err;
1042         }
1043
1044         if (gr->timeslice_mode == gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v()) {
1045                 pe_vaf = gk20a_readl(g, gr_gpcs_tpcs_pe_vaf_r());
1046                 pe_vsc_vpc = gk20a_readl(g, gr_gpcs_tpcs_pes_vsc_vpc_r());
1047
1048                 gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_enable_f() | gpm_pd_cfg;
1049                 pe_vaf = gr_gpcs_tpcs_pe_vaf_fast_mode_switch_true_f() | pe_vaf;
1050                 pe_vsc_vpc = gr_gpcs_tpcs_pes_vsc_vpc_fast_mode_switch_true_f() | pe_vsc_vpc;
1051                 pd_ab_dist_cfg0 = gr_pd_ab_dist_cfg0_timeslice_enable_en_f() | pd_ab_dist_cfg0;
1052                 ds_debug = gr_ds_debug_timeslice_mode_enable_f() | ds_debug;
1053                 mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_enabled_f() | mpc_vtg_debug;
1054
1055                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, patch);
1056                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pe_vaf_r(), pe_vaf, patch);
1057                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pes_vsc_vpc_r(), pe_vsc_vpc, patch);
1058                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, patch);
1059                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_debug_r(), ds_debug, patch);
1060                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, patch);
1061         } else {
1062                 gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_disable_f() | gpm_pd_cfg;
1063                 pd_ab_dist_cfg0 = gr_pd_ab_dist_cfg0_timeslice_enable_dis_f() | pd_ab_dist_cfg0;
1064                 ds_debug = gr_ds_debug_timeslice_mode_disable_f() | ds_debug;
1065                 mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_disabled_f() | mpc_vtg_debug;
1066
1067                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, patch);
1068                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, patch);
1069                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_debug_r(), ds_debug, patch);
1070                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, patch);
1071         }
1072
1073         if (patch)
1074                 gr_gk20a_ctx_patch_write_end(g, ch_ctx);
1075
1076         return 0;
1077 }
1078
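/* Program the CRSTR/WWDX/RSTR2D GPC map tables from gr->map_tiles and
 * derive the normalization shift and modulo coefficients from the TPC
 * count. */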
1079 static int gr_gk20a_setup_rop_mapping(struct gk20a *g,
1080                                 struct gr_gk20a *gr)
1081 {
1082         u32 norm_entries, norm_shift;
1083         u32 coeff5_mod, coeff6_mod, coeff7_mod, coeff8_mod, coeff9_mod, coeff10_mod, coeff11_mod;
1084         u32 map0, map1, map2, map3, map4, map5;
1085
1086         if (!gr->map_tiles)
1087                 return -1;
1088
1089         nvhost_dbg_fn("");
1090
1091         gk20a_writel(g, gr_crstr_map_table_cfg_r(),
1092                      gr_crstr_map_table_cfg_row_offset_f(gr->map_row_offset) |
1093                      gr_crstr_map_table_cfg_num_entries_f(gr->tpc_count));
1094
1095         map0 =  gr_crstr_gpc_map0_tile0_f(gr->map_tiles[0]) |
1096                 gr_crstr_gpc_map0_tile1_f(gr->map_tiles[1]) |
1097                 gr_crstr_gpc_map0_tile2_f(gr->map_tiles[2]) |
1098                 gr_crstr_gpc_map0_tile3_f(gr->map_tiles[3]) |
1099                 gr_crstr_gpc_map0_tile4_f(gr->map_tiles[4]) |
1100                 gr_crstr_gpc_map0_tile5_f(gr->map_tiles[5]);
1101
1102         map1 =  gr_crstr_gpc_map1_tile6_f(gr->map_tiles[6]) |
1103                 gr_crstr_gpc_map1_tile7_f(gr->map_tiles[7]) |
1104                 gr_crstr_gpc_map1_tile8_f(gr->map_tiles[8]) |
1105                 gr_crstr_gpc_map1_tile9_f(gr->map_tiles[9]) |
1106                 gr_crstr_gpc_map1_tile10_f(gr->map_tiles[10]) |
1107                 gr_crstr_gpc_map1_tile11_f(gr->map_tiles[11]);
1108
1109         map2 =  gr_crstr_gpc_map2_tile12_f(gr->map_tiles[12]) |
1110                 gr_crstr_gpc_map2_tile13_f(gr->map_tiles[13]) |
1111                 gr_crstr_gpc_map2_tile14_f(gr->map_tiles[14]) |
1112                 gr_crstr_gpc_map2_tile15_f(gr->map_tiles[15]) |
1113                 gr_crstr_gpc_map2_tile16_f(gr->map_tiles[16]) |
1114                 gr_crstr_gpc_map2_tile17_f(gr->map_tiles[17]);
1115
1116         map3 =  gr_crstr_gpc_map3_tile18_f(gr->map_tiles[18]) |
1117                 gr_crstr_gpc_map3_tile19_f(gr->map_tiles[19]) |
1118                 gr_crstr_gpc_map3_tile20_f(gr->map_tiles[20]) |
1119                 gr_crstr_gpc_map3_tile21_f(gr->map_tiles[21]) |
1120                 gr_crstr_gpc_map3_tile22_f(gr->map_tiles[22]) |
1121                 gr_crstr_gpc_map3_tile23_f(gr->map_tiles[23]);
1122
1123         map4 =  gr_crstr_gpc_map4_tile24_f(gr->map_tiles[24]) |
1124                 gr_crstr_gpc_map4_tile25_f(gr->map_tiles[25]) |
1125                 gr_crstr_gpc_map4_tile26_f(gr->map_tiles[26]) |
1126                 gr_crstr_gpc_map4_tile27_f(gr->map_tiles[27]) |
1127                 gr_crstr_gpc_map4_tile28_f(gr->map_tiles[28]) |
1128                 gr_crstr_gpc_map4_tile29_f(gr->map_tiles[29]);
1129
1130         map5 =  gr_crstr_gpc_map5_tile30_f(gr->map_tiles[30]) |
1131                 gr_crstr_gpc_map5_tile31_f(gr->map_tiles[31]) |
1132                 gr_crstr_gpc_map5_tile32_f(0) |
1133                 gr_crstr_gpc_map5_tile33_f(0) |
1134                 gr_crstr_gpc_map5_tile34_f(0) |
1135                 gr_crstr_gpc_map5_tile35_f(0);
1136
1137         gk20a_writel(g, gr_crstr_gpc_map0_r(), map0);
1138         gk20a_writel(g, gr_crstr_gpc_map1_r(), map1);
1139         gk20a_writel(g, gr_crstr_gpc_map2_r(), map2);
1140         gk20a_writel(g, gr_crstr_gpc_map3_r(), map3);
1141         gk20a_writel(g, gr_crstr_gpc_map4_r(), map4);
1142         gk20a_writel(g, gr_crstr_gpc_map5_r(), map5);
1143
1144         switch (gr->tpc_count) {
1145         case 1:
1146                 norm_shift = 4;
1147                 break;
1148         case 2:
1149         case 3:
1150                 norm_shift = 3;
1151                 break;
1152         case 4:
1153         case 5:
1154         case 6:
1155         case 7:
1156                 norm_shift = 2;
1157                 break;
1158         case 8:
1159         case 9:
1160         case 10:
1161         case 11:
1162         case 12:
1163         case 13:
1164         case 14:
1165         case 15:
1166                 norm_shift = 1;
1167                 break;
1168         default:
1169                 norm_shift = 0;
1170                 break;
1171         }
1172
1173         norm_entries = gr->tpc_count << norm_shift;
1174         coeff5_mod = (1 << 5) % norm_entries;
1175         coeff6_mod = (1 << 6) % norm_entries;
1176         coeff7_mod = (1 << 7) % norm_entries;
1177         coeff8_mod = (1 << 8) % norm_entries;
1178         coeff9_mod = (1 << 9) % norm_entries;
1179         coeff10_mod = (1 << 10) % norm_entries;
1180         coeff11_mod = (1 << 11) % norm_entries;
1181
1182         gk20a_writel(g, gr_ppcs_wwdx_map_table_cfg_r(),
1183                      gr_ppcs_wwdx_map_table_cfg_row_offset_f(gr->map_row_offset) |
1184                      gr_ppcs_wwdx_map_table_cfg_normalized_num_entries_f(norm_entries) |
1185                      gr_ppcs_wwdx_map_table_cfg_normalized_shift_value_f(norm_shift) |
1186                      gr_ppcs_wwdx_map_table_cfg_coeff5_mod_value_f(coeff5_mod) |
1187                      gr_ppcs_wwdx_map_table_cfg_num_entries_f(gr->tpc_count));
1188
1189         gk20a_writel(g, gr_ppcs_wwdx_map_table_cfg2_r(),
1190                      gr_ppcs_wwdx_map_table_cfg2_coeff6_mod_value_f(coeff6_mod) |
1191                      gr_ppcs_wwdx_map_table_cfg2_coeff7_mod_value_f(coeff7_mod) |
1192                      gr_ppcs_wwdx_map_table_cfg2_coeff8_mod_value_f(coeff8_mod) |
1193                      gr_ppcs_wwdx_map_table_cfg2_coeff9_mod_value_f(coeff9_mod) |
1194                      gr_ppcs_wwdx_map_table_cfg2_coeff10_mod_value_f(coeff10_mod) |
1195                      gr_ppcs_wwdx_map_table_cfg2_coeff11_mod_value_f(coeff11_mod));
1196
1197         gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map0_r(), map0);
1198         gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map1_r(), map1);
1199         gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map2_r(), map2);
1200         gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map3_r(), map3);
1201         gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map4_r(), map4);
1202         gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map5_r(), map5);
1203
1204         gk20a_writel(g, gr_rstr2d_map_table_cfg_r(),
1205                      gr_rstr2d_map_table_cfg_row_offset_f(gr->map_row_offset) |
1206                      gr_rstr2d_map_table_cfg_num_entries_f(gr->tpc_count));
1207
1208         gk20a_writel(g, gr_rstr2d_gpc_map0_r(), map0);
1209         gk20a_writel(g, gr_rstr2d_gpc_map1_r(), map1);
1210         gk20a_writel(g, gr_rstr2d_gpc_map2_r(), map2);
1211         gk20a_writel(g, gr_rstr2d_gpc_map3_r(), map3);
1212         gk20a_writel(g, gr_rstr2d_gpc_map4_r(), map4);
1213         gk20a_writel(g, gr_rstr2d_gpc_map5_r(), map5);
1214
1215         return 0;
1216 }
1217
1218 static inline u32 count_bits(u32 mask)
1219 {
1220         u32 temp = mask;
1221         u32 count;
1222         for (count = 0; temp != 0; count++)
1223                 temp &= temp - 1;
1224
1225         return count;
1226 }
1227
1228 static inline u32 clear_count_bits(u32 num, u32 clear_count)
1229 {
1230         u32 count = clear_count;
1231         for (; (num != 0) && (count != 0); count--)
1232                 num &= num - 1;
1233
1234         return num;
1235 }
1236
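/* Build the PD alpha/beta ratio tables: for each table row, split the
 * available TPCs between alpha and beta masks per GPC and write the
 * resulting masks to the hardware tables. */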
1237 static int gr_gk20a_setup_alpha_beta_tables(struct gk20a *g,
1238                                         struct gr_gk20a *gr)
1239 {
1240         u32 table_index_bits = 5;
1241         u32 rows = (1 << table_index_bits);
1242         u32 row_stride = gr_pd_alpha_ratio_table__size_1_v() / rows;
1243
1244         u32 row;
1245         u32 index;
1246         u32 gpc_index;
1247         u32 gpcs_per_reg = 4;
1248         u32 pes_index;
1249         u32 tpc_count_pes;
1250         u32 num_pes_per_gpc = proj_scal_litter_num_pes_per_gpc_v();
1251
1252         u32 alpha_target, beta_target;
1253         u32 alpha_bits, beta_bits;
1254         u32 alpha_mask, beta_mask, partial_mask;
1255         u32 reg_offset;
1256         bool assign_alpha;
1257
1258         u32 map_alpha[gr_pd_alpha_ratio_table__size_1_v()];
1259         u32 map_beta[gr_pd_alpha_ratio_table__size_1_v()];
1260         u32 map_reg_used[gr_pd_alpha_ratio_table__size_1_v()];
1261
1262         nvhost_dbg_fn("");
1263
1264         memset(map_alpha, 0, gr_pd_alpha_ratio_table__size_1_v() * sizeof(u32));
1265         memset(map_beta, 0, gr_pd_alpha_ratio_table__size_1_v() * sizeof(u32));
1266         memset(map_reg_used, 0, gr_pd_alpha_ratio_table__size_1_v() * sizeof(u32));
1267
1268         for (row = 0; row < rows; ++row) {
1269                 alpha_target = max_t(u32, gr->tpc_count * row / rows, 1);
1270                 beta_target = gr->tpc_count - alpha_target;
1271
1272                 assign_alpha = (alpha_target < beta_target);
1273
1274                 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
1275                         reg_offset = (row * row_stride) + (gpc_index / gpcs_per_reg);
1276                         alpha_mask = beta_mask = 0;
1277
1278                         for (pes_index = 0; pes_index < num_pes_per_gpc; pes_index++) {
1279                                 tpc_count_pes = gr->pes_tpc_count[pes_index][gpc_index];
1280
1281                                 if (assign_alpha) {
1282                                         alpha_bits = (alpha_target == 0) ? 0 : tpc_count_pes;
1283                                         beta_bits = tpc_count_pes - alpha_bits;
1284                                 } else {
1285                                         beta_bits = (beta_target == 0) ? 0 : tpc_count_pes;
1286                                         alpha_bits = tpc_count_pes - beta_bits;
1287                                 }
1288
1289                                 partial_mask = gr->pes_tpc_mask[pes_index][gpc_index];
1290                                 partial_mask = clear_count_bits(partial_mask, tpc_count_pes - alpha_bits);
1291                                 alpha_mask |= partial_mask;
1292
1293                                 partial_mask = gr->pes_tpc_mask[pes_index][gpc_index] ^ partial_mask;
1294                                 beta_mask |= partial_mask;
1295
1296                                 alpha_target -= min(alpha_bits, alpha_target);
1297                                 beta_target -= min(beta_bits, beta_target);
1298
1299                                 if ((alpha_bits > 0) || (beta_bits > 0))
1300                                         assign_alpha = !assign_alpha;
1301                         }
1302
1303                         switch (gpc_index % gpcs_per_reg) {
1304                         case 0:
1305                                 map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n0_mask_f(alpha_mask);
1306                                 map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n0_mask_f(beta_mask);
1307                                 break;
1308                         case 1:
1309                                 map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n1_mask_f(alpha_mask);
1310                                 map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n1_mask_f(beta_mask);
1311                                 break;
1312                         case 2:
1313                                 map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n2_mask_f(alpha_mask);
1314                                 map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n2_mask_f(beta_mask);
1315                                 break;
1316                         case 3:
1317                                 map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n3_mask_f(alpha_mask);
1318                                 map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n3_mask_f(beta_mask);
1319                                 break;
1320                         }
1321                         map_reg_used[reg_offset] = true;
1322                 }
1323         }
1324
1325         for (index = 0; index < gr_pd_alpha_ratio_table__size_1_v(); index++) {
1326                 if (map_reg_used[index]) {
1327                         gk20a_writel(g, gr_pd_alpha_ratio_table_r(index), map_alpha[index]);
1328                         gk20a_writel(g, gr_pd_beta_ratio_table_r(index), map_beta[index]);
1329                 }
1330         }
1331
1332         return 0;
1333 }
1334
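     /*
      * Program the floorswept GPC/TPC configuration: assign a global SM id to
      * every present TPC, publish the per-GPC active TPC counts, write the
      * PD/DS TPC-per-GPC tables, set up the ROP mapping and alpha/beta tables,
      * cap the L2 max-ways-evict-last value on single-FBP chips, and program
      * the PD skip tables, CWD floorsweep and ZROP/CROP FBP counts.
      */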
1335 static int gr_gk20a_ctx_state_floorsweep(struct gk20a *g)
1336 {
1337         struct gr_gk20a *gr = &g->gr;
1338         u32 tpc_index, gpc_index;
1339         u32 tpc_offset, gpc_offset;
1340         u32 sm_id = 0, gpc_id = 0;
1341         u32 sm_id_to_gpc_id[proj_scal_max_gpcs_v() * proj_scal_max_tpc_per_gpc_v()];
1342         u32 tpc_per_gpc;
1343         u32 max_ways_evict = INVALID_MAX_WAYS;
1344
1345         nvhost_dbg_fn("");
1346
1347         for (tpc_index = 0; tpc_index < gr->max_tpc_per_gpc_count; tpc_index++) {
1348                 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
1349                         gpc_offset = proj_gpc_stride_v() * gpc_index;
1350                         if (tpc_index < gr->gpc_tpc_count[gpc_index]) {
1351                                 tpc_offset = proj_tpc_in_gpc_stride_v() * tpc_index;
1352
1353                                 gk20a_writel(g, gr_gpc0_tpc0_sm_cfg_r() + gpc_offset + tpc_offset,
1354                                              gr_gpc0_tpc0_sm_cfg_sm_id_f(sm_id));
1355                                 gk20a_writel(g, gr_gpc0_tpc0_l1c_cfg_smid_r() + gpc_offset + tpc_offset,
1356                                              gr_gpc0_tpc0_l1c_cfg_smid_value_f(sm_id));
1357                                 gk20a_writel(g, gr_gpc0_gpm_pd_sm_id_r(tpc_index) + gpc_offset,
1358                                              gr_gpc0_gpm_pd_sm_id_id_f(sm_id));
1359                                 gk20a_writel(g, gr_gpc0_tpc0_pe_cfg_smid_r() + gpc_offset + tpc_offset,
1360                                              gr_gpc0_tpc0_pe_cfg_smid_value_f(sm_id));
1361
1362                                 sm_id_to_gpc_id[sm_id] = gpc_index;
1363                                 sm_id++;
1364                         }
1365
1366                         gk20a_writel(g, gr_gpc0_gpm_pd_active_tpcs_r() + gpc_offset,
1367                                      gr_gpc0_gpm_pd_active_tpcs_num_f(gr->gpc_tpc_count[gpc_index]));
1368                         gk20a_writel(g, gr_gpc0_gpm_sd_active_tpcs_r() + gpc_offset,
1369                                      gr_gpc0_gpm_sd_active_tpcs_num_f(gr->gpc_tpc_count[gpc_index]));
1370                 }
1371         }
1372
1373         for (tpc_index = 0, gpc_id = 0;
1374              tpc_index < gr_pd_num_tpc_per_gpc__size_1_v();
1375              tpc_index++, gpc_id += 8) {
1376
1377                 if (gpc_id >= gr->gpc_count)
1378                         gpc_id = 0;
1379
1380                 tpc_per_gpc =
1381                         gr_pd_num_tpc_per_gpc_count0_f(gr->gpc_tpc_count[gpc_id + 0]) |
1382                         gr_pd_num_tpc_per_gpc_count1_f(gr->gpc_tpc_count[gpc_id + 1]) |
1383                         gr_pd_num_tpc_per_gpc_count2_f(gr->gpc_tpc_count[gpc_id + 2]) |
1384                         gr_pd_num_tpc_per_gpc_count3_f(gr->gpc_tpc_count[gpc_id + 3]) |
1385                         gr_pd_num_tpc_per_gpc_count4_f(gr->gpc_tpc_count[gpc_id + 4]) |
1386                         gr_pd_num_tpc_per_gpc_count5_f(gr->gpc_tpc_count[gpc_id + 5]) |
1387                         gr_pd_num_tpc_per_gpc_count6_f(gr->gpc_tpc_count[gpc_id + 6]) |
1388                         gr_pd_num_tpc_per_gpc_count7_f(gr->gpc_tpc_count[gpc_id + 7]);
1389
1390                 gk20a_writel(g, gr_pd_num_tpc_per_gpc_r(tpc_index), tpc_per_gpc);
1391                 gk20a_writel(g, gr_ds_num_tpc_per_gpc_r(tpc_index), tpc_per_gpc);
1392         }
1393
1394         /* gr__setup_pd_mapping stubbed for gk20a */
1395         gr_gk20a_setup_rop_mapping(g, gr);
1396         gr_gk20a_setup_alpha_beta_tables(g, gr);
1397
1398         if (gr->num_fbps == 1)
1399                 max_ways_evict = 9;
1400
1401         if (max_ways_evict != INVALID_MAX_WAYS)
1402                 gk20a_writel(g, ltc_ltcs_ltss_tstg_set_mgmt_r(),
1403                              ((gk20a_readl(g, ltc_ltcs_ltss_tstg_set_mgmt_r()) &
1404                                ~(ltc_ltcs_ltss_tstg_set_mgmt_max_ways_evict_last_f(~0))) |
1405                               ltc_ltcs_ltss_tstg_set_mgmt_max_ways_evict_last_f(max_ways_evict)));
1406
1407         for (gpc_index = 0;
1408              gpc_index < gr_pd_dist_skip_table__size_1_v() * 4;
1409              gpc_index += 4) {
1410
1411                 gk20a_writel(g, gr_pd_dist_skip_table_r(gpc_index/4),
1412                              gr_pd_dist_skip_table_gpc_4n0_mask_f(gr->gpc_skip_mask[gpc_index]) |
1413                              gr_pd_dist_skip_table_gpc_4n1_mask_f(gr->gpc_skip_mask[gpc_index + 1]) |
1414                              gr_pd_dist_skip_table_gpc_4n2_mask_f(gr->gpc_skip_mask[gpc_index + 2]) |
1415                              gr_pd_dist_skip_table_gpc_4n3_mask_f(gr->gpc_skip_mask[gpc_index + 3]));
1416         }
1417
1418         gk20a_writel(g, gr_cwd_fs_r(),
1419                      gr_cwd_fs_num_gpcs_f(gr->gpc_count) |
1420                      gr_cwd_fs_num_tpcs_f(gr->tpc_count));
1421
1422         gk20a_writel(g, gr_bes_zrop_settings_r(),
1423                      gr_bes_zrop_settings_num_active_fbps_f(gr->num_fbps));
1424         gk20a_writel(g, gr_bes_crop_settings_r(),
1425                      gr_bes_crop_settings_num_active_fbps_f(gr->num_fbps));
1426
1427         return 0;
1428 }
1429
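     /*
      * Ask the FECS ucode to save the channel's context image.  save_type
      * selects the method pushed; the method data identifies the channel by
      * its instance block pointer (vid-mem target, marked valid).
      */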
1430 static int gr_gk20a_fecs_ctx_image_save(struct channel_gk20a *c, u32 save_type)
1431 {
1432         struct gk20a *g = c->g;
1433         int ret;
1434
1435         u32 inst_base_ptr =
1436                 u64_lo32(c->inst_block.cpu_pa
1437                 >> ram_in_base_shift_v());
1438
1439
1440         nvhost_dbg_fn("");
1441
1442         ret = gr_gk20a_submit_fecs_method_op(g,
1443                 (struct fecs_method_op_gk20a) {
1444                 .method.addr = save_type,
1445                 .method.data = (gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
1446                                 gr_fecs_current_ctx_target_vid_mem_f() |
1447                                 gr_fecs_current_ctx_valid_f(1)),
1448                 .mailbox = {.id = 0, .data = 0, .clr = 3, .ret = NULL,
1449                         .ok = 1, .fail = 2,
1450                 },
1451                 .cond.ok = GR_IS_UCODE_OP_AND,
1452                 .cond.fail = GR_IS_UCODE_OP_AND,
1453                  });
1454
1455         if (ret)
1456                 nvhost_err(dev_from_gk20a(g), "save context image failed");
1457
1458         return ret;
1459 }
1460
1461 /* Init the global golden image from a fresh gr_ctx in the channel ctx.
1462    Save a copy in local_golden_image in ctx_vars. */
1463 static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
1464                                           struct channel_gk20a *c)
1465 {
1466         struct gr_gk20a *gr = &g->gr;
1467         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
1468         u32 ctx_header_bytes = ctxsw_prog_fecs_header_v();
1469         u32 ctx_header_words;
1470         u32 i;
1471         u32 data;
1472         void *ctx_ptr = NULL;
1473         void *gold_ptr = NULL;
1474         u32 err = 0;
1475
1476         nvhost_dbg_fn("");
1477
1478         /* The golden ctx is global to all channels. Although only the first
1479            channel initializes the golden image, the driver needs to prevent
1480            multiple channels from initializing the golden ctx at the same time. */
1481         mutex_lock(&gr->ctx_mutex);
1482
1483         if (gr->ctx_vars.golden_image_initialized)
1484                 goto clean_up;
1485
1486         err = gr_gk20a_fecs_ctx_bind_channel(g, c);
1487         if (err)
1488                 goto clean_up;
1489
1490         err = gr_gk20a_elpg_protected_call(g,
1491                         gr_gk20a_commit_global_ctx_buffers(g, c, false));
1492         if (err)
1493                 goto clean_up;
1494
1495         gold_ptr = nvhost_memmgr_mmap(gr->global_ctx_buffer[GOLDEN_CTX].ref);
1496         if (!gold_ptr)
1497                 goto clean_up;
1498
1499         ctx_ptr = vmap(ch_ctx->gr_ctx.pages,
1500                         PAGE_ALIGN(ch_ctx->gr_ctx.size) >> PAGE_SHIFT,
1501                         0, pgprot_dmacoherent(PAGE_KERNEL));
1502         if (!ctx_ptr)
1503                 goto clean_up;
1504
1505         ctx_header_words = roundup(ctx_header_bytes, sizeof(u32));
1506         ctx_header_words >>= 2;
1507
1508         /* Channel gr_ctx buffer is gpu cacheable.
1509            Flush before cpu read. */
1510         gk20a_mm_fb_flush(g);
1511         gk20a_mm_l2_flush(g, false);
1512
1513         for (i = 0; i < ctx_header_words; i++) {
1514                 data = mem_rd32(ctx_ptr, i);
1515                 mem_wr32(gold_ptr, i, data);
1516         }
1517
1518         mem_wr32(gold_ptr + ctxsw_prog_main_image_zcull_o(), 0,
1519                  ctxsw_prog_main_image_zcull_mode_no_ctxsw_v());
1520
1521         mem_wr32(gold_ptr + ctxsw_prog_main_image_zcull_ptr_o(), 0, 0);
1522
1523         gr_gk20a_commit_inst(c, ch_ctx->global_ctx_buffer_va[GOLDEN_CTX_VA]);
1524
1525         gr_gk20a_fecs_ctx_image_save(c, gr_fecs_method_push_adr_wfi_golden_save_v());
1526
1527         if (gr->ctx_vars.local_golden_image == NULL) {
1528
1529                 gr->ctx_vars.local_golden_image =
1530                         kzalloc(gr->ctx_vars.golden_image_size, GFP_KERNEL);
1531
1532                 if (gr->ctx_vars.local_golden_image == NULL) {
1533                         err = -ENOMEM;
1534                         goto clean_up;
1535                 }
1536
1537                 for (i = 0; i < gr->ctx_vars.golden_image_size / 4; i++)
1538                         gr->ctx_vars.local_golden_image[i] =
1539                                 mem_rd32(gold_ptr, i);
1540         }
1541
1542         gr_gk20a_commit_inst(c, ch_ctx->gr_ctx.gpu_va);
1543
1544         gr->ctx_vars.golden_image_initialized = true;
1545
1546         gk20a_mm_l2_invalidate(g);
1547
1548         gk20a_writel(g, gr_fecs_current_ctx_r(),
1549                 gr_fecs_current_ctx_valid_false_f());
1550
1551 clean_up:
1552         if (err)
1553                 nvhost_err(dev_from_gk20a(g), "fail");
1554         else
1555                 nvhost_dbg_fn("done");
1556
1557         if (gold_ptr)
1558                 nvhost_memmgr_munmap(gr->global_ctx_buffer[GOLDEN_CTX].ref,
1559                                      gold_ptr);
1560         if (ctx_ptr)
1561                 vunmap(ctx_ptr);
1562
1563         mutex_unlock(&gr->ctx_mutex);
1564         return err;
1565 }
1566
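     /*
      * Flip the SMPC mode field in the channel's context header so SMPC state
      * is (or is not) saved/restored on context switch.  The gr_ctx pages are
      * vmapped, the PM word is read-modify-written, and since the buffer is
      * GPU cacheable, L2 is flushed before and invalidated after the CPU
      * access.
      */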
1567 int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g,
1568                                     struct channel_gk20a *c,
1569                                     bool enable_smpc_ctxsw)
1570 {
1571         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
1572         void *ctx_ptr = NULL;
1573         u32 data;
1574
1575         /*XXX caller responsible for making sure the channel is quiesced? */
1576
1577         /* Channel gr_ctx buffer is gpu cacheable.
1578            Flush and invalidate before cpu update. */
1579         gk20a_mm_fb_flush(g);
1580         gk20a_mm_l2_flush(g, true);
1581
1582         ctx_ptr = vmap(ch_ctx->gr_ctx.pages,
1583                         PAGE_ALIGN(ch_ctx->gr_ctx.size) >> PAGE_SHIFT,
1584                         0, pgprot_dmacoherent(PAGE_KERNEL));
1585         if (!ctx_ptr)
1586                 return -ENOMEM;
1587
1588         data = mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0);
1589         data = data & ~ctxsw_prog_main_image_pm_smpc_mode_m();
1590         data |= enable_smpc_ctxsw ?
1591                 ctxsw_prog_main_image_pm_smpc_mode_ctxsw_f() :
1592                 ctxsw_prog_main_image_pm_smpc_mode_no_ctxsw_f();
1593         mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0,
1594                  data);
1595
1596         vunmap(ctx_ptr);
1597
1598         gk20a_mm_l2_invalidate(g);
1599
1600         return 0;
1601 }
1602
1603 /* load a saved fresh copy of the golden image into the channel gr_ctx */
1604 static int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
1605                                         struct channel_gk20a *c)
1606 {
1607         struct gr_gk20a *gr = &g->gr;
1608         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
1609         u32 virt_addr_lo;
1610         u32 virt_addr_hi;
1611         u32 i, v, data;
1612         int ret = 0;
1613         void *ctx_ptr = NULL;
1614
1615         nvhost_dbg_fn("");
1616
1617         if (gr->ctx_vars.local_golden_image == NULL)
1618                 return -1;
1619
1620         /* Channel gr_ctx buffer is gpu cacheable.
1621            Flush and invalidate before cpu update. */
1622         gk20a_mm_fb_flush(g);
1623         gk20a_mm_l2_flush(g, true);
1624
1625         ctx_ptr = vmap(ch_ctx->gr_ctx.pages,
1626                         PAGE_ALIGN(ch_ctx->gr_ctx.size) >> PAGE_SHIFT,
1627                         0, pgprot_dmacoherent(PAGE_KERNEL));
1628         if (!ctx_ptr)
1629                 return -ENOMEM;
1630
1631         for (i = 0; i < gr->ctx_vars.golden_image_size / 4; i++)
1632                 mem_wr32(ctx_ptr, i, gr->ctx_vars.local_golden_image[i]);
1633
1634         mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_save_ops_o(), 0, 0);
1635         mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_restore_ops_o(), 0, 0);
1636
1637         virt_addr_lo = u64_lo32(ch_ctx->patch_ctx.gpu_va);
1638         virt_addr_hi = u64_hi32(ch_ctx->patch_ctx.gpu_va);
1639
1640         mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_count_o(), 0,
1641                  ch_ctx->patch_ctx.data_count);
1642         mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_adr_lo_o(), 0,
1643                  virt_addr_lo);
1644         mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_adr_hi_o(), 0,
1645                  virt_addr_hi);
1646
1647         /* no user for client managed performance counter ctx */
1648         ch_ctx->pm_ctx.ctx_sw_mode =
1649                 ctxsw_prog_main_image_pm_mode_no_ctxsw_f();
1650         data = mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0);
1651         data = data & ~ctxsw_prog_main_image_pm_mode_m();
1652         data |= ch_ctx->pm_ctx.ctx_sw_mode;
1653         mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0,
1654                  data);
1655
1656         mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_ptr_o(), 0, 0);
1657
1658         /* set priv access map */
1659         virt_addr_lo =
1660                  u64_lo32(ch_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]);
1661         virt_addr_hi =
1662                  u64_hi32(ch_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]);
1663
1664         mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_config_o(), 0,
1665                  ctxsw_prog_main_image_priv_access_map_config_mode_use_map_f());
1666         mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_addr_lo_o(), 0,
1667                  virt_addr_lo);
1668         mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_addr_hi_o(), 0,
1669                  virt_addr_hi);
1670         /* disable verif features */
1671         v = mem_rd32(ctx_ptr + ctxsw_prog_main_image_misc_options_o(), 0);
1672         v = v & ~(ctxsw_prog_main_image_misc_options_verif_features_m());
1673         v = v | ctxsw_prog_main_image_misc_options_verif_features_disabled_f();
1674         mem_wr32(ctx_ptr + ctxsw_prog_main_image_misc_options_o(), 0, v);
1675
1676
1677         vunmap(ctx_ptr);
1678
1679         gk20a_mm_l2_invalidate(g);
1680
1681         if (tegra_platform_is_linsim()) {
1682                 u32 inst_base_ptr =
1683                         u64_lo32(c->inst_block.cpu_pa
1684                         >> ram_in_base_shift_v());
1685
1686                 ret = gr_gk20a_submit_fecs_method_op(g,
1687                           (struct fecs_method_op_gk20a) {
1688                                   .method.data =
1689                                           (gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
1690                                            gr_fecs_current_ctx_target_vid_mem_f() |
1691                                            gr_fecs_current_ctx_valid_f(1)),
1692                                   .method.addr =
1693                                           gr_fecs_method_push_adr_restore_golden_v(),
1694                                   .mailbox = {
1695                                           .id = 0, .data = 0,
1696                                           .clr = ~0, .ret = NULL,
1697                                           .ok = gr_fecs_ctxsw_mailbox_value_pass_v(),
1698                                           .fail = 0},
1699                                   .cond.ok = GR_IS_UCODE_OP_EQUAL,
1700                                   .cond.fail = GR_IS_UCODE_OP_SKIP});
1701
1702                 if (ret)
1703                         nvhost_err(dev_from_gk20a(g),
1704                                    "restore context image failed");
1705         }
1706
1707         return ret;
1708 }
1709
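     /*
      * Start the FECS and GPCCS falcons after their IMEM/DMEM have been
      * loaded: clear mailbox 0, drop the require-ctx constraint on both DMA
      * engines and set startcpu on both falcon CPUs.
      */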
1710 static void gr_gk20a_start_falcon_ucode(struct gk20a *g)
1711 {
1712         nvhost_dbg_fn("");
1713
1714         gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0),
1715                      gr_fecs_ctxsw_mailbox_clear_value_f(~0));
1716
1717         gk20a_writel(g, gr_gpccs_dmactl_r(), gr_gpccs_dmactl_require_ctx_f(0));
1718         gk20a_writel(g, gr_fecs_dmactl_r(), gr_fecs_dmactl_require_ctx_f(0));
1719
1720         gk20a_writel(g, gr_gpccs_cpuctl_r(), gr_gpccs_cpuctl_startcpu_f(1));
1721         gk20a_writel(g, gr_fecs_cpuctl_r(), gr_fecs_cpuctl_startcpu_f(1));
1722
1723         nvhost_dbg_fn("done");
1724 }
1725
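     /*
      * Build the instance block used while bootstrapping the ctxsw ucode:
      * allocate it, point it at the PMU VM's page directory and VA limit, and
      * map the ucode surface into that VM read-only.
      */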
1726 static int gr_gk20a_init_ctxsw_ucode_vaspace(struct gk20a *g)
1727 {
1728         struct mm_gk20a *mm = &g->mm;
1729         struct vm_gk20a *vm = &mm->pmu.vm;
1730         struct device *d = dev_from_gk20a(g);
1731         struct gk20a_ctxsw_ucode_info *p_ucode_info = &g->ctxsw_ucode_info;
1732         void *inst_ptr;
1733         u32 pde_addr_lo;
1734         u32 pde_addr_hi;
1735         u64 pde_addr;
1736
1737         /* Alloc mem of inst block */
1738         p_ucode_info->inst_blk_desc.size = ram_in_alloc_size_v();
1739         p_ucode_info->inst_blk_desc.cpuva = dma_alloc_coherent(d,
1740                                         p_ucode_info->inst_blk_desc.size,
1741                                         &p_ucode_info->inst_blk_desc.iova,
1742                                         GFP_KERNEL);
1743         if (!p_ucode_info->inst_blk_desc.cpuva) {
1744                 nvhost_err(d, "failed to allocate memory\n");
1745                 return -ENOMEM;
1746         }
1747
1748         p_ucode_info->inst_blk_desc.cpu_pa = gk20a_get_phys_from_iova(d,
1749                                         p_ucode_info->inst_blk_desc.iova);
1750
1751         inst_ptr = p_ucode_info->inst_blk_desc.cpuva;
1752
1753         /* Set inst block */
1754         mem_wr32(inst_ptr, ram_in_adr_limit_lo_w(),
1755                  u64_lo32(vm->va_limit) | 0xFFF);
1756         mem_wr32(inst_ptr, ram_in_adr_limit_hi_w(),
1757                 ram_in_adr_limit_hi_f(u64_hi32(vm->va_limit)));
1758
1759         pde_addr = gk20a_mm_iova_addr(vm->pdes.sgt->sgl);
1760         pde_addr_lo = u64_lo32(pde_addr >> 12);
1761         pde_addr_hi = u64_hi32(pde_addr);
1762         mem_wr32(inst_ptr, ram_in_page_dir_base_lo_w(),
1763                 ram_in_page_dir_base_target_vid_mem_f() |
1764                 ram_in_page_dir_base_vol_true_f() |
1765                 ram_in_page_dir_base_lo_f(pde_addr_lo));
1766         mem_wr32(inst_ptr, ram_in_page_dir_base_hi_w(),
1767                 ram_in_page_dir_base_hi_f(pde_addr_hi));
1768
1769         /* Map ucode surface to GMMU */
1770         p_ucode_info->ucode_gpuva = gk20a_gmmu_map(vm,
1771                                         &p_ucode_info->surface_desc.sgt,
1772                                         p_ucode_info->surface_desc.size,
1773                                         0, /* flags */
1774                                         mem_flag_read_only);
1775         if (!p_ucode_info->ucode_gpuva) {
1776                 nvhost_err(d, "failed to update gmmu ptes\n");
1777                 return -ENOMEM;
1778         }
1779
1780         return 0;
1781 }
1782
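     /*
      * Helpers that lay out one falcon's ucode image inside the shared
      * surface: each segment (boot, code, data) is placed at the running
      * offset, which is then advanced to the next 256-byte (BLK_SIZE) block
      * boundary.
      */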
1783 static void gr_gk20a_init_ctxsw_ucode_segment(
1784         struct gk20a_ctxsw_ucode_segment *p_seg, u32 *p_offset, u32 size)
1785 {
1786         p_seg->offset = *p_offset;
1787         p_seg->size = size;
1788         *p_offset = ALIGN(*p_offset + size, BLK_SIZE);
1789 }
1790
1791 static void gr_gk20a_init_ctxsw_ucode_inst(
1792         struct gk20a_ctxsw_ucode_inst *p_inst, u32 *p_offset,
1793         struct gk20a_ctxsw_bootloader_desc *p_bootdesc,
1794         u32 code_size, u32 data_size)
1795 {
1796         u32 boot_size = ALIGN(p_bootdesc->bootloader_size, sizeof(u32));
1797         p_inst->boot_entry = p_bootdesc->bootloader_entry_point;
1798         p_inst->boot_imem_offset = p_bootdesc->bootloader_imem_offset;
1799         gr_gk20a_init_ctxsw_ucode_segment(&p_inst->boot, p_offset, boot_size);
1800         gr_gk20a_init_ctxsw_ucode_segment(&p_inst->code, p_offset, code_size);
1801         gr_gk20a_init_ctxsw_ucode_segment(&p_inst->data, p_offset, data_size);
1802 }
1803
1804 static int gr_gk20a_copy_ctxsw_ucode_inst(
1805         u8 *p_buf,
1806         struct gk20a_ctxsw_ucode_inst *p_inst,
1807         struct gk20a_ctxsw_bootloader_desc *p_bootdesc, u32 *p_bootimage,
1808         u32 *p_code, u32 *p_data)
1809 {
1810         memcpy(p_buf + p_inst->boot.offset, p_bootimage, p_inst->boot.size);
1811         memcpy(p_buf + p_inst->code.offset, p_code, p_inst->code.size);
1812         memcpy(p_buf + p_inst->data.offset, p_data, p_inst->data.size);
1813         return 0;
1814 }
1815
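     /*
      * Load the FECS and GPCCS bootloader firmware images, lay out both
      * falcons' boot/code/data segments in a single read-only DMA surface,
      * copy the images in and map the surface into the PMU VM for the
      * bootstrap DMA.
      */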
1816 static int gr_gk20a_init_ctxsw_ucode(struct gk20a *g)
1817 {
1818         struct device *d = dev_from_gk20a(g);
1819         struct mm_gk20a *mm = &g->mm;
1820         struct vm_gk20a *vm = &mm->pmu.vm;
1821         struct gk20a_ctxsw_bootloader_desc *p_fecs_boot_desc;
1822         struct gk20a_ctxsw_bootloader_desc *p_gpcs_boot_desc;
1823         const struct firmware *fecs_fw;
1824         const struct firmware *gpccs_fw;
1825         u32 *p_fecs_boot_image;
1826         u32 *p_gpcs_boot_image;
1827         struct gk20a_ctxsw_ucode_info *p_ucode_info = &g->ctxsw_ucode_info;
1828         u8 *p_buf;
1829         u32 ucode_size;
1830         int err = 0;
1831         DEFINE_DMA_ATTRS(attrs);
1832
1833         fecs_fw = nvhost_client_request_firmware(g->dev,
1834                                         GK20A_FECS_UCODE_IMAGE);
1835         if (!fecs_fw) {
1836                 nvhost_err(d, "failed to load fecs ucode!!");
1837                 return -ENOENT;
1838         }
1839
1840         p_fecs_boot_desc = fecs_fw->data;
1841         p_fecs_boot_image = fecs_fw->data +
1842                                 sizeof(struct gk20a_ctxsw_bootloader_desc);
1843
1844         gpccs_fw = nvhost_client_request_firmware(g->dev,
1845                                         GK20A_GPCCS_UCODE_IMAGE);
1846         if (!gpccs_fw) {
1847                 release_firmware(fecs_fw);
1848                 nvhost_err(d, "failed to load gpccs ucode!!");
1849                 return -ENOENT;
1850         }
1851
1852         p_gpcs_boot_desc = gpccs_fw->data;
1853         p_gpcs_boot_image = gpccs_fw->data +
1854                                 sizeof(struct gk20a_ctxsw_bootloader_desc);
1855
1856         ucode_size = 0;
1857         gr_gk20a_init_ctxsw_ucode_inst(&p_ucode_info->fecs, &ucode_size,
1858                 p_fecs_boot_desc,
1859                 g->gr.ctx_vars.ucode.fecs.inst.count * sizeof(u32),
1860                 g->gr.ctx_vars.ucode.fecs.data.count * sizeof(u32));
1861         gr_gk20a_init_ctxsw_ucode_inst(&p_ucode_info->gpcs, &ucode_size,
1862                 p_gpcs_boot_desc,
1863                 g->gr.ctx_vars.ucode.gpccs.inst.count * sizeof(u32),
1864                 g->gr.ctx_vars.ucode.gpccs.data.count * sizeof(u32));
1865
1866         p_ucode_info->surface_desc.size = ucode_size;
1867         dma_set_attr(DMA_ATTR_READ_ONLY, &attrs);
1868         p_ucode_info->surface_desc.cpuva = dma_alloc_attrs(d,
1869                                         p_ucode_info->surface_desc.size,
1870                                         &p_ucode_info->surface_desc.iova,
1871                                         GFP_KERNEL,
1872                                         &attrs);
1873         if (!p_ucode_info->surface_desc.cpuva) {
1874                 nvhost_err(d, "memory allocation failed\n");
1875                 err = -ENOMEM;
1876                 goto clean_up;
1877         }
1878
1879         err = gk20a_get_sgtable(d, &p_ucode_info->surface_desc.sgt,
1880                                 p_ucode_info->surface_desc.cpuva,
1881                                 p_ucode_info->surface_desc.iova,
1882                                 p_ucode_info->surface_desc.size);
1883         if (err) {
1884                 nvhost_err(d, "failed to create sg table\n");
1885                 goto clean_up;
1886         }
1887
1888         p_buf = (u8 *)p_ucode_info->surface_desc.cpuva;
1889         if (!p_buf) {
1890                 release_firmware(fecs_fw);
1891                 release_firmware(gpccs_fw);
1892                 nvhost_err(d, "failed to map surface desc buffer");
1893                 return -ENOMEM;
1894         }
1895
1896         gr_gk20a_copy_ctxsw_ucode_inst(p_buf, &p_ucode_info->fecs,
1897                 p_fecs_boot_desc, p_fecs_boot_image,
1898                 g->gr.ctx_vars.ucode.fecs.inst.l,
1899                 g->gr.ctx_vars.ucode.fecs.data.l);
1900
1901         gr_gk20a_copy_ctxsw_ucode_inst(p_buf, &p_ucode_info->gpcs,
1902                 p_gpcs_boot_desc, p_gpcs_boot_image,
1903                 g->gr.ctx_vars.ucode.gpccs.inst.l,
1904                 g->gr.ctx_vars.ucode.gpccs.data.l);
1905
             release_firmware(fecs_fw);
             release_firmware(gpccs_fw);

1906         err = gr_gk20a_init_ctxsw_ucode_vaspace(g);
1907         if (err)
1908                 goto clean_up;
1909
1910         gk20a_free_sgtable(&p_ucode_info->surface_desc.sgt);
1911
1912         return 0;
1913
1914  clean_up:
1915         if (p_ucode_info->ucode_gpuva)
1916                 gk20a_gmmu_unmap(vm, p_ucode_info->ucode_gpuva,
1917                         p_ucode_info->surface_desc.size, mem_flag_none);
1918         if (p_ucode_info->surface_desc.sgt)
1919                 gk20a_free_sgtable(&p_ucode_info->surface_desc.sgt);
1920         if (p_ucode_info->surface_desc.cpuva)
1921                 dma_free_attrs(d, p_ucode_info->surface_desc.size,
1922                                 p_ucode_info->surface_desc.cpuva,
1923                                 p_ucode_info->surface_desc.iova,
1924                                 &attrs);
1925         p_ucode_info->surface_desc.cpuva = NULL;
1926         p_ucode_info->surface_desc.iova = 0;
1927
1928         return err;
1929 }
1930
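     /*
      * Bind the ucode instance block to the FECS context arbiter: wait for
      * the arbiter to go idle, program the new/arb/current context pointers
      * with the instance block address, and poll each arbiter command for
      * completion with a short bounded retry loop, logging an error on
      * timeout.
      */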
1931 static void gr_gk20a_load_falcon_bind_instblk(struct gk20a *g)
1932 {
1933         struct gk20a_ctxsw_ucode_info *p_ucode_info = &g->ctxsw_ucode_info;
1934         int retries = 20;
1935         phys_addr_t inst_ptr;
1936         u32 val;
1937
1938         while ((gk20a_readl(g, gr_fecs_ctxsw_status_1_r()) &
1939                         gr_fecs_ctxsw_status_1_arb_busy_m()) && retries) {
1940                 udelay(2);
1941                 retries--;
1942         }
1943         if (!retries)
1944                 nvhost_err(dev_from_gk20a(g), "arbiter idle timeout");
1945
1946         gk20a_writel(g, gr_fecs_arb_ctx_adr_r(), 0x0);
1947
1948         inst_ptr = p_ucode_info->inst_blk_desc.cpu_pa;
1949         gk20a_writel(g, gr_fecs_new_ctx_r(),
1950                         gr_fecs_new_ctx_ptr_f(inst_ptr >> 12) |
1951                         gr_fecs_new_ctx_target_m() |
1952                         gr_fecs_new_ctx_valid_m());
1953
1954         gk20a_writel(g, gr_fecs_arb_ctx_ptr_r(),
1955                         gr_fecs_arb_ctx_ptr_ptr_f(inst_ptr >> 12) |
1956                         gr_fecs_arb_ctx_ptr_target_m());
1957
1958         gk20a_writel(g, gr_fecs_arb_ctx_cmd_r(), 0x7);
1959
1960         /* Wait for arbiter command to complete */
1961         retries = 20;
1962         val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
1963         while (gr_fecs_arb_ctx_cmd_cmd_v(val) && retries) {
1964                 udelay(2);
1965                 retries--;
1966                 val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
1967         }
1968         if (!retries)
1969                 nvhost_err(dev_from_gk20a(g), "arbiter complete timeout");
1970
1971         gk20a_writel(g, gr_fecs_current_ctx_r(),
1972                         gr_fecs_current_ctx_ptr_f(inst_ptr >> 12) |
1973                         gr_fecs_current_ctx_target_m() |
1974                         gr_fecs_current_ctx_valid_m());
1975         /* Send command to arbiter to flush */
1976         gk20a_writel(g, gr_fecs_arb_ctx_cmd_r(), gr_fecs_arb_ctx_cmd_cmd_s());
1977
1978         retries = 20;
1979         val = (gk20a_readl(g, gr_fecs_arb_ctx_cmd_r()));
1980         while (gr_fecs_arb_ctx_cmd_cmd_v(val) && retries) {
1981                 udelay(2);
1982                 retries--;
1983                 val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
1984         }
1985         if (!retries)
1986                 nvhost_err(dev_from_gk20a(g), "arbiter complete timeout");
1987 }
1988
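     /*
      * Bootstrap one falcon from the shared ucode surface: push the ten-word
      * bootloader header (code/data FB offsets and sizes) into DMEM through
      * the auto-incrementing port, DMA the boot image into IMEM 256 bytes at
      * a time, then set the boot vector and start the falcon CPU.
      */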
1989 static int gr_gk20a_load_ctxsw_ucode_inst(struct gk20a *g, u64 addr_base,
1990         struct gk20a_ctxsw_ucode_inst *p_inst, u32 reg_offset)
1991 {
1992         u32 addr_code32;
1993         u32 addr_data32;
1994         u32 addr_load32;
1995         u32 dst = 0;
1996         u32 blocks;
1997         u32 b;
1998
1999         addr_code32 = u64_lo32((addr_base + p_inst->code.offset) >> 8);
2000         addr_data32 = u64_lo32((addr_base + p_inst->data.offset) >> 8);
2001         addr_load32 = u64_lo32((addr_base + p_inst->boot.offset) >> 8);
2002
2003         gk20a_writel(g, reg_offset + gr_fecs_dmactl_r(),
2004                         gr_fecs_dmactl_require_ctx_f(0));
2005
2006         /*
2007          * Copy falcon bootloader header into dmem at offset 0.
2008          * Configure dmem port 0 for auto-incrementing writes starting at dmem
2009          * offset 0.
2010          */
2011         gk20a_writel(g, reg_offset + gr_fecs_dmemc_r(0),
2012                         gr_fecs_dmemc_offs_f(0) |
2013                         gr_fecs_dmemc_blk_f(0) |
2014                         gr_fecs_dmemc_aincw_f(1));
2015
2016         /* Write out the actual data */
2017         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2018         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), addr_code32);
2019         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2020         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), p_inst->code.size);
2021         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2022         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), addr_data32);
2023         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), p_inst->data.size);
2024         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), addr_code32);
2025         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2026         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2027
2028         blocks = ((p_inst->boot.size + 0xFF) & ~0xFF) >> 8;
2029
2030         /*
2031          * Set the base FB address for the DMA transfer. Subtract off the 256
2032          * byte IMEM block offset such that the relative FB and IMEM offsets
2033          * match, allowing the IMEM tags to be properly created.
2034          */
2035
2036         dst = p_inst->boot_imem_offset;
2037         gk20a_writel(g, reg_offset + gr_fecs_dmatrfbase_r(),
2038                         (addr_load32 - (dst >> 8)));
2039
2040         for (b = 0; b < blocks; b++) {
2041                 /* Setup destination IMEM offset */
2042                 gk20a_writel(g, reg_offset + gr_fecs_dmatrfmoffs_r(),
2043                                 dst + (b << 8));
2044
2045                 /* Setup source offset (relative to BASE) */
2046                 gk20a_writel(g, reg_offset + gr_fecs_dmatrffboffs_r(),
2047                                 dst + (b << 8));
2048
2049                 gk20a_writel(g, reg_offset + gr_fecs_dmatrfcmd_r(),
2050                                 gr_fecs_dmatrfcmd_imem_f(0x01) |
2051                                 gr_fecs_dmatrfcmd_write_f(0x00) |
2052                                 gr_fecs_dmatrfcmd_size_f(0x06) |
2053                                 gr_fecs_dmatrfcmd_ctxdma_f(0));
2054         }
2055
2056         /* Specify the falcon boot vector */
2057         gk20a_writel(g, reg_offset + gr_fecs_bootvec_r(),
2058                         gr_fecs_bootvec_vec_f(p_inst->boot_entry));
2059
2060         /* Write to CPUCTL to start the falcon */
2061         gk20a_writel(g, reg_offset + gr_fecs_cpuctl_r(),
2062                         gr_fecs_cpuctl_startcpu_f(0x01));
2063
2064         return 0;
2065 }
2066
2067 static void gr_gk20a_load_falcon_with_bootloader(struct gk20a *g)
2068 {
2069         struct gk20a_ctxsw_ucode_info *p_ucode_info = &g->ctxsw_ucode_info;
2070         u64 addr_base = p_ucode_info->ucode_gpuva;
2071
2072         gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0x0);
2073
2074         gr_gk20a_load_falcon_bind_instblk(g);
2075
2076         gr_gk20a_load_ctxsw_ucode_inst(g, addr_base,
2077                 &g->ctxsw_ucode_info.fecs, 0);
2078
2079         gr_gk20a_load_ctxsw_ucode_inst(g, addr_base,
2080                 &g->ctxsw_ucode_info.gpcs,
2081                 gr_gpcs_gpccs_falcon_hwcfg_r() -
2082                 gr_fecs_falcon_hwcfg_r());
2083 }
2084
2085 static int gr_gk20a_load_ctxsw_ucode(struct gk20a *g, struct gr_gk20a *gr)
2086 {
2087         u32 ret;
2088
2089         nvhost_dbg_fn("");
2090
2091         if (tegra_platform_is_linsim()) {
2092                 gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(7),
2093                         gr_fecs_ctxsw_mailbox_value_f(0xc0de7777));
2094                 gk20a_writel(g, gr_gpccs_ctxsw_mailbox_r(7),
2095                         gr_gpccs_ctxsw_mailbox_value_f(0xc0de7777));
2096         }
2097
2098         /*
2099          * In case the gPMU falcon is not being used, revert to the old way of
2100          * loading gr ucode, without the faster bootstrap routine.
2101          */
2102         if (!support_gk20a_pmu()) {
2103                 gr_gk20a_load_falcon_dmem(g);
2104                 gr_gk20a_load_falcon_imem(g);
2105                 gr_gk20a_start_falcon_ucode(g);
2106         } else {
2107                 if (!gr->skip_ucode_init)
2108                         gr_gk20a_init_ctxsw_ucode(g);
2109                 gr_gk20a_load_falcon_with_bootloader(g);
2110                 gr->skip_ucode_init = true;
2111         }
2112
2113         ret = gr_gk20a_ctx_wait_ucode(g, 0, 0,
2114                                       GR_IS_UCODE_OP_EQUAL,
2115                                       eUcodeHandshakeInitComplete,
2116                                       GR_IS_UCODE_OP_SKIP, 0);
2117         if (ret) {
2118                 nvhost_err(dev_from_gk20a(g), "falcon ucode init timeout");
2119                 return ret;
2120         }
2121
2122         if (support_gk20a_pmu())
2123                 gk20a_writel(g, gr_fecs_current_ctx_r(),
2124                         gr_fecs_current_ctx_valid_false_f());
2125
2126         gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0xffffffff);
2127         gk20a_writel(g, gr_fecs_method_data_r(), 0x7fffffff);
2128         gk20a_writel(g, gr_fecs_method_push_r(),
2129                      gr_fecs_method_push_adr_set_watchdog_timeout_f());
2130
2131         nvhost_dbg_fn("done");
2132         return 0;
2133 }
2134
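     /*
      * Query the FECS ucode for the golden, zcull and PM context image sizes.
      * The golden and zcull sizes are cached in gr.ctx_vars on the first call
      * and only sanity-checked afterwards (they should not change across
      * rail-gating); the priv access map size is fixed at 512 KB.
      */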
2135 static int gr_gk20a_init_ctx_state(struct gk20a *g, struct gr_gk20a *gr)
2136 {
2137         u32 golden_ctx_image_size = 0;
2138         u32 zcull_ctx_image_size = 0;
2139         u32 pm_ctx_image_size = 0;
2140         u32 ret;
2141         struct fecs_method_op_gk20a op = {
2142                 .mailbox = { .id = 0, .data = 0,
2143                              .clr = ~0, .ok = 0, .fail = 0},
2144                 .method.data = 0,
2145                 .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL,
2146                 .cond.fail = GR_IS_UCODE_OP_SKIP,
2147                 };
2148
2149         nvhost_dbg_fn("");
2150         op.method.addr = gr_fecs_method_push_adr_discover_image_size_v();
2151         op.mailbox.ret = &golden_ctx_image_size;
2152         ret = gr_gk20a_submit_fecs_method_op(g, op);
2153         if (ret) {
2154                 nvhost_err(dev_from_gk20a(g),
2155                            "query golden image size failed");
2156                 return ret;
2157         }
2158         op.method.addr = gr_fecs_method_push_adr_discover_zcull_image_size_v();
2159         op.mailbox.ret = &zcull_ctx_image_size;
2160         ret = gr_gk20a_submit_fecs_method_op(g, op);
2161         if (ret) {
2162                 nvhost_err(dev_from_gk20a(g),
2163                            "query zcull ctx image size failed");
2164                 return ret;
2165         }
2166         op.method.addr = gr_fecs_method_push_adr_discover_pm_image_size_v();
2167         op.mailbox.ret = &pm_ctx_image_size;
2168         ret = gr_gk20a_submit_fecs_method_op(g, op);
2169         if (ret) {
2170                 nvhost_err(dev_from_gk20a(g),
2171                            "query pm ctx image size failed");
2172                 return ret;
2173         }
2174
2175         if (!g->gr.ctx_vars.golden_image_size &&
2176             !g->gr.ctx_vars.zcull_ctxsw_image_size) {
2177                 g->gr.ctx_vars.golden_image_size = golden_ctx_image_size;
2178                 g->gr.ctx_vars.zcull_ctxsw_image_size = zcull_ctx_image_size;
2179         } else {
2180                 /* hw is different after railgating? */
2181                 BUG_ON(g->gr.ctx_vars.golden_image_size != golden_ctx_image_size);
2182                 BUG_ON(g->gr.ctx_vars.zcull_ctxsw_image_size != zcull_ctx_image_size);
2183         }
2184
2185         g->gr.ctx_vars.priv_access_map_size = 512 * 1024;
2186
2187         nvhost_dbg_fn("done");
2188         return 0;
2189 }
2190
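     /*
      * Allocate the global context buffers shared by all channels: circular
      * buffer, page pool and attribute buffer (each with an optional VPR copy
      * whose allocation is allowed to fail), plus the golden context image
      * and the priv register access map.
      */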
2191 static int gr_gk20a_alloc_global_ctx_buffers(struct gk20a *g)
2192 {
2193         struct gr_gk20a *gr = &g->gr;
2194         struct mem_mgr *memmgr = mem_mgr_from_g(g);
2195         struct mem_handle *mem;
2196         u32 i, attr_buffer_size;
2197
2198         u32 cb_buffer_size = gr_scc_bundle_cb_size_div_256b__prod_v() *
2199                 gr_scc_bundle_cb_size_div_256b_byte_granularity_v();
2200
2201         u32 pagepool_buffer_size = gr_scc_pagepool_total_pages_hwmax_value_v() *
2202                 gr_scc_pagepool_total_pages_byte_granularity_v();
2203
2204         u32 attr_cb_default_size = gr_gpc0_ppc0_cbm_cfg_size_default_v();
2205         u32 alpha_cb_default_size = gr_gpc0_ppc0_cbm_cfg2_size_default_v();
2206
2207         u32 attr_cb_size =
2208                 attr_cb_default_size + (attr_cb_default_size >> 1);
2209         u32 alpha_cb_size =
2210                 alpha_cb_default_size + (alpha_cb_default_size >> 1);
2211
2212         u32 num_tpcs_per_pes = proj_scal_litter_num_tpcs_per_pes_v();
2213         u32 attr_max_size_per_tpc =
2214                 gr_gpc0_ppc0_cbm_cfg_size_v(~0) / num_tpcs_per_pes;
2215         u32 alpha_max_size_per_tpc =
2216                 gr_gpc0_ppc0_cbm_cfg2_size_v(~0) / num_tpcs_per_pes;
2217
2218
2219         nvhost_dbg_fn("");
2220
2221         attr_cb_size =
2222                 (attr_cb_size > attr_max_size_per_tpc) ?
2223                         attr_max_size_per_tpc : attr_cb_size;
2224         attr_cb_default_size =
2225                 (attr_cb_default_size > attr_cb_size) ?
2226                         attr_cb_size : attr_cb_default_size;
2227         alpha_cb_size =
2228                 (alpha_cb_size > alpha_max_size_per_tpc) ?
2229                         alpha_max_size_per_tpc : alpha_cb_size;
2230         alpha_cb_default_size =
2231                 (alpha_cb_default_size > alpha_cb_size) ?
2232                         alpha_cb_size : alpha_cb_default_size;
2233
2234         attr_buffer_size =
2235                 (gr_gpc0_ppc0_cbm_cfg_size_granularity_v() * attr_cb_size +
2236                  gr_gpc0_ppc0_cbm_cfg2_size_granularity_v() * alpha_cb_size) *
2237                  gr->gpc_count;
2238
2239         nvhost_dbg_info("cb_buffer_size : %d", cb_buffer_size);
2240
2241         mem = nvhost_memmgr_alloc(memmgr, cb_buffer_size,
2242                                   DEFAULT_ALLOC_ALIGNMENT,
2243                                   DEFAULT_ALLOC_FLAGS,
2244                                   0);
2245         if (IS_ERR(mem))
2246                 goto clean_up;
2247
2248         gr->global_ctx_buffer[CIRCULAR].ref = mem;
2249         gr->global_ctx_buffer[CIRCULAR].size = cb_buffer_size;
2250
2251         mem = nvhost_memmgr_alloc(memmgr, cb_buffer_size,
2252                                   DEFAULT_ALLOC_ALIGNMENT,
2253                                   DEFAULT_ALLOC_FLAGS,
2254                                   NVMAP_HEAP_CARVEOUT_VPR);
2255         if (!IS_ERR(mem)) {
2256                 gr->global_ctx_buffer[CIRCULAR_VPR].ref = mem;
2257                 gr->global_ctx_buffer[CIRCULAR_VPR].size = cb_buffer_size;
2258         }
2259
2260         nvhost_dbg_info("pagepool_buffer_size : %d", pagepool_buffer_size);
2261
2262         mem = nvhost_memmgr_alloc(memmgr, pagepool_buffer_size,
2263                                   DEFAULT_ALLOC_ALIGNMENT,
2264                                   DEFAULT_ALLOC_FLAGS,
2265                                   0);
2266         if (IS_ERR(mem))
2267                 goto clean_up;
2268
2269         gr->global_ctx_buffer[PAGEPOOL].ref = mem;
2270         gr->global_ctx_buffer[PAGEPOOL].size = pagepool_buffer_size;
2271
2272         mem = nvhost_memmgr_alloc(memmgr, pagepool_buffer_size,
2273                                   DEFAULT_ALLOC_ALIGNMENT,
2274                                   DEFAULT_ALLOC_FLAGS,
2275                                   NVMAP_HEAP_CARVEOUT_VPR);
2276         if (!IS_ERR(mem)) {
2277                 gr->global_ctx_buffer[PAGEPOOL_VPR].ref = mem;
2278                 gr->global_ctx_buffer[PAGEPOOL_VPR].size = pagepool_buffer_size;
2279         }
2280
2281         nvhost_dbg_info("attr_buffer_size : %d", attr_buffer_size);
2282
2283         mem = nvhost_memmgr_alloc(memmgr, attr_buffer_size,
2284                                   DEFAULT_ALLOC_ALIGNMENT,
2285                                   DEFAULT_ALLOC_FLAGS,
2286                                   0);
2287         if (IS_ERR(mem))
2288                 goto clean_up;
2289
2290         gr->global_ctx_buffer[ATTRIBUTE].ref = mem;
2291         gr->global_ctx_buffer[ATTRIBUTE].size = attr_buffer_size;
2292
2293         mem = nvhost_memmgr_alloc(memmgr, attr_buffer_size,
2294                                   DEFAULT_ALLOC_ALIGNMENT,
2295                                   DEFAULT_ALLOC_FLAGS,
2296                                   NVMAP_HEAP_CARVEOUT_VPR);
2297         if (!IS_ERR(mem)) {
2298                 gr->global_ctx_buffer[ATTRIBUTE_VPR].ref = mem;
2299                 gr->global_ctx_buffer[ATTRIBUTE_VPR].size = attr_buffer_size;
2300         }
2301
2302         nvhost_dbg_info("golden_image_size : %d",
2303                    gr->ctx_vars.golden_image_size);
2304
2305         mem = nvhost_memmgr_alloc(memmgr, gr->ctx_vars.golden_image_size,
2306                                   DEFAULT_ALLOC_ALIGNMENT,
2307                                   DEFAULT_ALLOC_FLAGS,
2308                                   0);
2309         if (IS_ERR(mem))
2310                 goto clean_up;
2311
2312         gr->global_ctx_buffer[GOLDEN_CTX].ref = mem;
2313         gr->global_ctx_buffer[GOLDEN_CTX].size =
2314                 gr->ctx_vars.golden_image_size;
2315
2316         nvhost_dbg_info("priv_access_map_size : %d",
2317                    gr->ctx_vars.priv_access_map_size);
2318
2319         mem = nvhost_memmgr_alloc(memmgr, gr->ctx_vars.priv_access_map_size,
2320                                   DEFAULT_ALLOC_ALIGNMENT,
2321                                   DEFAULT_ALLOC_FLAGS,
2322                                   0);
2323         if (IS_ERR(mem))
2324                 goto clean_up;
2325
2326         gr->global_ctx_buffer[PRIV_ACCESS_MAP].ref = mem;
2327         gr->global_ctx_buffer[PRIV_ACCESS_MAP].size =
2328                 gr->ctx_vars.priv_access_map_size;
2329
2330         nvhost_dbg_fn("done");
2331         return 0;
2332
2333  clean_up:
2334         nvhost_err(dev_from_gk20a(g), "fail");
2335         for (i = 0; i < NR_GLOBAL_CTX_BUF; i++) {
2336                 if (gr->global_ctx_buffer[i].ref) {
2337                         nvhost_memmgr_put(memmgr,
2338                                           gr->global_ctx_buffer[i].ref);
2339                         memset(&gr->global_ctx_buffer[i],
2340                                 0, sizeof(struct mem_desc));
2341                 }
2342         }
2343         return -ENOMEM;
2344 }
2345
2346 static void gr_gk20a_free_global_ctx_buffers(struct gk20a *g)
2347 {
2348         struct gr_gk20a *gr = &g->gr;
2349         struct mem_mgr *memmgr = mem_mgr_from_g(g);
2350         u32 i;
2351
2352         for (i = 0; i < NR_GLOBAL_CTX_BUF; i++) {
2353                 nvhost_memmgr_put(memmgr, gr->global_ctx_buffer[i].ref);
2354                 memset(&gr->global_ctx_buffer[i], 0, sizeof(struct mem_desc));
2355         }
2356
2357         nvhost_dbg_fn("done");
2358 }
2359
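     /*
      * Map the global context buffers into this channel's GPU VA space,
      * preferring the VPR copies for VPR channels, and stash the resulting
      * addresses in the channel ctx for later commits.
      */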
2360 static int gr_gk20a_map_global_ctx_buffers(struct gk20a *g,
2361                                         struct channel_gk20a *c)
2362 {
2363         struct vm_gk20a *ch_vm = c->vm;
2364         struct mem_mgr *memmgr = mem_mgr_from_g(g);
2365         struct mem_handle *handle_ref;
2366         u64 *g_bfr_va = c->ch_ctx.global_ctx_buffer_va;
2367         struct gr_gk20a *gr = &g->gr;
2368         u64 gpu_va;
2369         u32 i;
2370         nvhost_dbg_fn("");
2371
2372         /* Circular Buffer */
2373         if (!c->vpr || (gr->global_ctx_buffer[CIRCULAR_VPR].ref == NULL))
2374                 handle_ref = gr->global_ctx_buffer[CIRCULAR].ref;
2375         else
2376                 handle_ref = gr->global_ctx_buffer[CIRCULAR_VPR].ref;
2377
2378         gpu_va = gk20a_vm_map(ch_vm, memmgr, handle_ref,
2379                               /*offset_align, flags, kind*/
2380                               0, NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, 0,
2381                               NULL, false, mem_flag_none);
2382         if (!gpu_va)
2383                 goto clean_up;
2384         g_bfr_va[CIRCULAR_VA] = gpu_va;
2385
2386         /* Attribute Buffer */
2387         if (!c->vpr || (gr->global_ctx_buffer[ATTRIBUTE_VPR].ref == NULL))
2388                 handle_ref = gr->global_ctx_buffer[ATTRIBUTE].ref;
2389         else
2390                 handle_ref = gr->global_ctx_buffer[ATTRIBUTE_VPR].ref;
2391
2392         gpu_va = gk20a_vm_map(ch_vm, memmgr, handle_ref,
2393                               /*offset_align, flags, kind*/
2394                               0, NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, 0,
2395                               NULL, false, mem_flag_none);
2396         if (!gpu_va)
2397                 goto clean_up;
2398         g_bfr_va[ATTRIBUTE_VA] = gpu_va;
2399
2400         /* Page Pool */
2401         if (!c->vpr || (gr->global_ctx_buffer[PAGEPOOL_VPR].ref == NULL))
2402                 handle_ref = gr->global_ctx_buffer[PAGEPOOL].ref;
2403         else
2404                 handle_ref = gr->global_ctx_buffer[PAGEPOOL_VPR].ref;
2405
2406         gpu_va = gk20a_vm_map(ch_vm, memmgr, handle_ref,
2407                               /*offset_align, flags, kind*/
2408                               0, NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, 0,
2409                               NULL, false, mem_flag_none);
2410         if (!gpu_va)
2411                 goto clean_up;
2412         g_bfr_va[PAGEPOOL_VA] = gpu_va;
2413
2414         /* Golden Image */
2415         gpu_va = gk20a_vm_map(ch_vm, memmgr,
2416                               gr->global_ctx_buffer[GOLDEN_CTX].ref,
2417                               /*offset_align, flags, kind*/
2418                               0, 0, 0, NULL, false, mem_flag_none);
2419         if (!gpu_va)
2420                 goto clean_up;
2421         g_bfr_va[GOLDEN_CTX_VA] = gpu_va;
2422
2423         /* Priv register Access Map */
2424         gpu_va = gk20a_vm_map(ch_vm, memmgr,
2425                               gr->global_ctx_buffer[PRIV_ACCESS_MAP].ref,
2426                               /*offset_align, flags, kind*/
2427                               0, 0, 0, NULL, false,
2428                               mem_flag_none);
2429         if (!gpu_va)
2430                 goto clean_up;
2431         g_bfr_va[PRIV_ACCESS_MAP_VA] = gpu_va;
2432
2433         c->ch_ctx.global_ctx_buffer_mapped = true;
2434         return 0;
2435
2436  clean_up:
2437         for (i = 0; i < NR_GLOBAL_CTX_BUF_VA; i++) {
2438                 if (g_bfr_va[i]) {
2439                         gk20a_vm_unmap(ch_vm, g_bfr_va[i]);
2440                         g_bfr_va[i] = 0;
2441                 }
2442         }
2443         return -ENOMEM;
2444 }
2445
2446 static void gr_gk20a_unmap_global_ctx_buffers(struct channel_gk20a *c)
2447 {
2448         struct vm_gk20a *ch_vm = c->vm;
2449         u64 *g_bfr_va = c->ch_ctx.global_ctx_buffer_va;
2450         u32 i;
2451
2452         nvhost_dbg_fn("");
2453
2454         for (i = 0; i < NR_GLOBAL_CTX_BUF_VA; i++) {
2455                 if (g_bfr_va[i]) {
2456                         gk20a_vm_unmap(ch_vm, g_bfr_va[i]);
2457                         g_bfr_va[i] = 0;
2458                 }
2459         }
2460         c->ch_ctx.global_ctx_buffer_mapped = false;
2461 }
2462
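     /*
      * Allocate the per-channel gr_ctx backing store (sized from the golden
      * image) with no kernel mapping, and map it cacheable into the channel's
      * GPU VA space.
      */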
2463 static int gr_gk20a_alloc_channel_gr_ctx(struct gk20a *g,
2464                                 struct channel_gk20a *c)
2465 {
2466         struct gr_gk20a *gr = &g->gr;
2467         struct gr_ctx_desc *gr_ctx = &c->ch_ctx.gr_ctx;
2468         struct vm_gk20a *ch_vm = c->vm;
2469         struct device *d = dev_from_gk20a(g);
2470         struct sg_table *sgt;
2471         DEFINE_DMA_ATTRS(attrs);
2472         int err = 0;
2473
2474         nvhost_dbg_fn("");
2475
2476         if (gr->ctx_vars.buffer_size == 0)
2477                 return 0;
2478
2479         /* alloc channel gr ctx buffer */
2480         gr->ctx_vars.buffer_size = gr->ctx_vars.golden_image_size;
2481         gr->ctx_vars.buffer_total_size = gr->ctx_vars.golden_image_size;
2482
2483         gr_ctx->size = gr->ctx_vars.buffer_total_size;
2484         dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2485         gr_ctx->pages = dma_alloc_attrs(d, gr_ctx->size,
2486                                 &gr_ctx->iova, GFP_KERNEL, &attrs);
2487         if (!gr_ctx->pages)
2488                 return -ENOMEM;
2489
2490         err = gk20a_get_sgtable_from_pages(d, &sgt, gr_ctx->pages,
2491                         gr_ctx->iova, gr_ctx->size);
2492         if (err)
2493                 goto err_free;
2494
2495         gr_ctx->gpu_va = gk20a_gmmu_map(ch_vm, &sgt, gr_ctx->size,
2496                                 NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
2497                                 mem_flag_none);
2498         if (!gr_ctx->gpu_va)
2499                 goto err_free_sgt;
2500
2501         gk20a_free_sgtable(&sgt);
2502
2503         return 0;
2504
2505  err_free_sgt:
2506         gk20a_free_sgtable(&sgt);
2507  err_free:
2508         dma_free_attrs(d, gr_ctx->size,
2509                 gr_ctx->pages, gr_ctx->iova, &attrs);
2510         gr_ctx->pages = NULL;
2511         gr_ctx->iova = 0;
2512
2513         return err;
2514 }
2515
2516 static void gr_gk20a_free_channel_gr_ctx(struct channel_gk20a *c)
2517 {
2518         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
2519         struct vm_gk20a *ch_vm = c->vm;
2520         struct gk20a *g = c->g;
2521         struct device *d = dev_from_gk20a(g);
2522         DEFINE_DMA_ATTRS(attrs);
2523
2524         nvhost_dbg_fn("");
2525
2526         if (!ch_ctx->gr_ctx.gpu_va)
2527                 return;
2528
2529         gk20a_gmmu_unmap(ch_vm, ch_ctx->gr_ctx.gpu_va,
2530                         ch_ctx->gr_ctx.size, mem_flag_none);
2531         dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2532         dma_free_attrs(d, ch_ctx->gr_ctx.size,
2533                 ch_ctx->gr_ctx.pages, ch_ctx->gr_ctx.iova, &attrs);
2534         ch_ctx->gr_ctx.pages = NULL;
2535         ch_ctx->gr_ctx.iova = 0;
2536 }
2537
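     /*
      * Allocate the small (128-word) patch context buffer and map it into the
      * channel's GPU VA space; context register patches are staged there.
      */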
2538 static int gr_gk20a_alloc_channel_patch_ctx(struct gk20a *g,
2539                                 struct channel_gk20a *c)
2540 {
2541         struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
2542         struct mem_mgr *memmgr = gk20a_channel_mem_mgr(c);
2543         struct vm_gk20a *ch_vm = c->vm;
2544
2545         nvhost_dbg_fn("");
2546
2547         patch_ctx->mem.ref = nvhost_memmgr_alloc(memmgr, 128 * sizeof(u32),
2548                                                  DEFAULT_ALLOC_ALIGNMENT,
2549                                                  DEFAULT_ALLOC_FLAGS,
2550                                                  0);
2551         if (IS_ERR(patch_ctx->mem.ref))
2552                 return -ENOMEM;
2553
2554         patch_ctx->gpu_va = gk20a_vm_map(ch_vm, memmgr,
2555                                          patch_ctx->mem.ref,
2556                                          /*offset_align, flags, kind*/
2557                                          0, 0, 0, NULL, false, mem_flag_none);
2558         if (!patch_ctx->gpu_va)
2559                 goto clean_up;
2560
2561         nvhost_dbg_fn("done");
2562         return 0;
2563
2564  clean_up:
2565         nvhost_err(dev_from_gk20a(g), "fail");
2566         if (patch_ctx->mem.ref) {
2567                 nvhost_memmgr_put(memmgr, patch_ctx->mem.ref);
2568                 patch_ctx->mem.ref = 0;
2569         }
2570
2571         return -ENOMEM;
2572 }
2573
2574 static void gr_gk20a_unmap_channel_patch_ctx(struct channel_gk20a *c)
2575 {
2576         struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
2577         struct vm_gk20a *ch_vm = c->vm;
2578
2579         nvhost_dbg_fn("");
2580
2581         if (patch_ctx->gpu_va)
2582                 gk20a_vm_unmap(ch_vm, patch_ctx->gpu_va);
2583         patch_ctx->gpu_va = 0;
2584         patch_ctx->data_count = 0;
2585 }
2586
2587 static void gr_gk20a_free_channel_patch_ctx(struct channel_gk20a *c)
2588 {
2589         struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
2590         struct mem_mgr *memmgr = gk20a_channel_mem_mgr(c);
2591
2592         nvhost_dbg_fn("");
2593
2594         gr_gk20a_unmap_channel_patch_ctx(c);
2595
2596         if (patch_ctx->mem.ref) {
2597                 nvhost_memmgr_put(memmgr, patch_ctx->mem.ref);
2598                 patch_ctx->mem.ref = 0;
2599         }
2600 }
2601
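/*
 * Release everything gk20a_alloc_obj_ctx() set up for this channel: the
 * global ctx buffer mappings, the patch ctx and the gr ctx. The zcull and
 * pm ctx descriptors are only cleared by the memset below.
 */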
2602 void gk20a_free_channel_ctx(struct channel_gk20a *c)
2603 {
2604         gr_gk20a_unmap_global_ctx_buffers(c);
2605         gr_gk20a_free_channel_patch_ctx(c);
2606         gr_gk20a_free_channel_gr_ctx(c);
2607
2608         /* zcull_ctx, pm_ctx */
2609
2610         memset(&c->ch_ctx, 0, sizeof(struct channel_ctx_gk20a));
2611
2612         c->num_objects = 0;
2613         c->first_init = false;
2614 }
2615
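/*
 * Set up the graphics context when an object class is allocated on a
 * channel:
 *  - allocate the gr ctx buffer and commit it to the channel's inst block
 *  - allocate the patch ctx buffer if not already present
 *  - map and commit the global ctx buffers if not already mapped
 *  - init the golden context image, and load it into this channel's
 *    context on the channel's first object allocation
 * See the comment at the error label for why nothing is unwound on failure.
 */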
2616 int gk20a_alloc_obj_ctx(struct channel_gk20a  *c,
2617                         struct nvhost_alloc_obj_ctx_args *args)
2618 {
2619         struct gk20a *g = c->g;
2620         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
2621         bool change_to_compute_mode = false;
2622         int err = 0;
2623
2624         nvhost_dbg_fn("");
2625
2626         /* an address space needs to have been bound at this point.*/
2627         if (!gk20a_channel_as_bound(c)) {
2628                 nvhost_err(dev_from_gk20a(g),
2629                            "not bound to address space at time"
2630                            " of grctx allocation");
2631                 return -EINVAL;
2632         }
2633
2634         switch (args->class_num) {
2635         case KEPLER_COMPUTE_A:
2636                 /* tbd: NV2080_CTRL_GPU_COMPUTE_MODE_RULES_EXCLUSIVE_COMPUTE */
2637                 /* tbd: PDB_PROP_GRAPHICS_DISTINCT_3D_AND_COMPUTE_STATE_DEF  */
2638                 change_to_compute_mode = true;
2639                 break;
2640         case KEPLER_C:
2641         case FERMI_TWOD_A:
2642         case KEPLER_DMA_COPY_A:
2643                 break;
2644
2645         default:
2646                 nvhost_err(dev_from_gk20a(g),
2647                            "invalid obj class 0x%x", args->class_num);
2648                 err = -EINVAL;
2649                 goto out;
2650         }
2651
2652         /* allocate gr ctx buffer */
2653         if (ch_ctx->gr_ctx.pages == NULL) {
2654                 err = gr_gk20a_alloc_channel_gr_ctx(g, c);
2655                 if (err) {
2656                         nvhost_err(dev_from_gk20a(g),
2657                                 "fail to allocate gr ctx buffer");
2658                         goto out;
2659                 }
2660                 c->obj_class = args->class_num;
2661         } else {
2662                 /*TBD: needs to be more subtle about which is being allocated
2663                 /* TBD: needs to be more subtle about which class is being allocated,
2664                  * as some classes are allowed to be allocated on the same channel */
2665                         "too many classes alloc'd on same channel");
2666                 err = -EINVAL;
2667                 goto out;
2668         }
2669
2670         /* commit gr ctx buffer */
2671         err = gr_gk20a_commit_inst(c, ch_ctx->gr_ctx.gpu_va);
2672         if (err) {
2673                 nvhost_err(dev_from_gk20a(g),
2674                         "fail to commit gr ctx buffer");
2675                 goto out;
2676         }
2677
2678         /* allocate patch buffer */
2679         if (ch_ctx->patch_ctx.mem.ref == NULL) {
2680                 err = gr_gk20a_alloc_channel_patch_ctx(g, c);
2681                 if (err) {
2682                         nvhost_err(dev_from_gk20a(g),
2683                                 "fail to allocate patch buffer");
2684                         goto out;
2685                 }
2686         }
2687
2688         /* map global buffer to channel gpu_va and commit */
2689         if (!ch_ctx->global_ctx_buffer_mapped) {
2690                 err = gr_gk20a_map_global_ctx_buffers(g, c);
2691                 if (err) {
2692                         nvhost_err(dev_from_gk20a(g),
2693                                 "fail to map global ctx buffer");
2694                         goto out;
2695                 }
2696                 gr_gk20a_elpg_protected_call(g,
2697                         gr_gk20a_commit_global_ctx_buffers(g, c, true));
2698         }
2699
2700         /* init golden image, ELPG enabled after this is done */
2701         err = gr_gk20a_init_golden_ctx_image(g, c);
2702         if (err) {
2703                 nvhost_err(dev_from_gk20a(g),
2704                         "fail to init golden ctx image");
2705                 goto out;
2706         }
2707
2708         /* load golden image */
2709         if (!c->first_init) {
2710                 err = gr_gk20a_elpg_protected_call(g,
2711                         gr_gk20a_load_golden_ctx_image(g, c));
2712                 if (err) {
2713                         nvhost_err(dev_from_gk20a(g),
2714                                 "fail to load golden ctx image");
2715                         goto out;
2716                 }
2717                 c->first_init = true;
2718         }
2719         gk20a_mm_l2_invalidate(g);
2720
2721         c->num_objects++;
2722
2723         nvhost_dbg_fn("done");
2724         return 0;
2725 out:
2726         /* 1. gr_ctx, patch_ctx and the global ctx buffer mappings
2727            can be reused, so there is no need to release them.
2728            2. golden image init and load are one-time operations, so if
2729            they passed there is nothing to undo. */
2730         nvhost_err(dev_from_gk20a(g), "fail");
2731         return err;
2732 }
2733
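/*
 * Drop one object reference on the channel. When the last object goes away
 * the channel is disabled and its patch context is unmapped.
 */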
2734 int gk20a_free_obj_ctx(struct channel_gk20a  *c,
2735                        struct nvhost_free_obj_ctx_args *args)
2736 {
2737         unsigned long timeout = gk20a_get_gr_idle_timeout(c->g);
2738
2739         nvhost_dbg_fn("");
2740
2741         if (c->num_objects == 0)
2742                 return 0;
2743
2744         c->num_objects--;
2745
2746         if (c->num_objects == 0) {
2747                 c->first_init = false;
2748                 gk20a_disable_channel(c,
2749                         !c->hwctx->has_timedout,
2750                         timeout);
2751                 gr_gk20a_unmap_channel_patch_ctx(c);
2752         }
2753
2754         return 0;
2755 }
2756
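/*
 * Undo gr support setup: free the global ctx buffers, the mmu_wr/mmu_rd
 * DMA buffers and the compbit backing store, release the per-GPC
 * bookkeeping arrays, the cached ctxsw ucode/register lists and the local
 * golden image, and destroy the comptag allocator.
 */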
2757 static void gk20a_remove_gr_support(struct gr_gk20a *gr)
2758 {
2759         struct gk20a *g = gr->g;
2760         struct mem_mgr *memmgr = mem_mgr_from_g(g);
2761         struct device *d = dev_from_gk20a(g);
2762
2763         nvhost_dbg_fn("");
2764
2765         gr_gk20a_free_global_ctx_buffers(g);
2766
2767         dma_free_coherent(d, gr->mmu_wr_mem.size,
2768                 gr->mmu_wr_mem.cpuva, gr->mmu_wr_mem.iova);
2769         gr->mmu_wr_mem.cpuva = NULL;
2770         gr->mmu_wr_mem.iova = 0;
2771         dma_free_coherent(d, gr->mmu_rd_mem.size,
2772                 gr->mmu_rd_mem.cpuva, gr->mmu_rd_mem.iova);
2773         gr->mmu_rd_mem.cpuva = NULL;
2774         gr->mmu_rd_mem.iova = 0;
2775
2776         nvhost_memmgr_put(memmgr, gr->compbit_store.mem.ref);
2777
2778         memset(&gr->mmu_wr_mem, 0, sizeof(struct mmu_desc));
2779         memset(&gr->mmu_rd_mem, 0, sizeof(struct mmu_desc));
2780         memset(&gr->compbit_store, 0, sizeof(struct compbit_store_desc));
2781
2782         kfree(gr->gpc_tpc_count);
2783         kfree(gr->gpc_zcb_count);
2784         kfree(gr->gpc_ppc_count);
2785         kfree(gr->pes_tpc_count[0]);
2786         kfree(gr->pes_tpc_count[1]);
2787         kfree(gr->pes_tpc_mask[0]);
2788         kfree(gr->pes_tpc_mask[1]);
2789         kfree(gr->gpc_skip_mask);
2790         kfree(gr->map_tiles);
2791         gr->gpc_tpc_count = NULL;
2792         gr->gpc_zcb_count = NULL;
2793         gr->gpc_ppc_count = NULL;
2794         gr->pes_tpc_count[0] = NULL;
2795         gr->pes_tpc_count[1] = NULL;
2796         gr->pes_tpc_mask[0] = NULL;
2797         gr->pes_tpc_mask[1] = NULL;
2798         gr->gpc_skip_mask = NULL;
2799         gr->map_tiles = NULL;
2800
2801         kfree(gr->ctx_vars.ucode.fecs.inst.l);
2802         kfree(gr->ctx_vars.ucode.fecs.data.l);
2803         kfree(gr->ctx_vars.ucode.gpccs.inst.l);
2804         kfree(gr->ctx_vars.ucode.gpccs.data.l);
2805         kfree(gr->ctx_vars.sw_bundle_init.l);
2806         kfree(gr->ctx_vars.sw_method_init.l);
2807         kfree(gr->ctx_vars.sw_ctx_load.l);
2808         kfree(gr->ctx_vars.sw_non_ctx_load.l);
2809         kfree(gr->ctx_vars.ctxsw_regs.sys.l);
2810         kfree(gr->ctx_vars.ctxsw_regs.gpc.l);
2811         kfree(gr->ctx_vars.ctxsw_regs.tpc.l);
2812         kfree(gr->ctx_vars.ctxsw_regs.zcull_gpc.l);
2813         kfree(gr->ctx_vars.ctxsw_regs.ppc.l);
2814         kfree(gr->ctx_vars.ctxsw_regs.pm_sys.l);
2815         kfree(gr->ctx_vars.ctxsw_regs.pm_gpc.l);
2816         kfree(gr->ctx_vars.ctxsw_regs.pm_tpc.l);
2817
2818         kfree(gr->ctx_vars.local_golden_image);
2819         gr->ctx_vars.local_golden_image = NULL;
2820
2821         nvhost_allocator_destroy(&gr->comp_tags);
2822 }
2823
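/*
 * Read the chip's graphics configuration (FBP/GPC/TPC/PES/zcull counts)
 * from the priv ring and top registers, allocate the per-GPC bookkeeping
 * arrays, compute the per-GPC TPC skip masks and set the default circular
 * buffer sizes.
 */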
2824 static int gr_gk20a_init_gr_config(struct gk20a *g, struct gr_gk20a *gr)
2825 {
2826         u32 gpc_index, pes_index;
2827         u32 pes_tpc_mask;
2828         u32 pes_tpc_count;
2829         u32 pes_heavy_index;
2830         u32 gpc_new_skip_mask;
2831         u32 tmp;
2832
2833         tmp = gk20a_readl(g, pri_ringmaster_enum_fbp_r());
2834         gr->num_fbps = pri_ringmaster_enum_fbp_count_v(tmp);
2835
2836         tmp = gk20a_readl(g, top_num_gpcs_r());
2837         gr->max_gpc_count = top_num_gpcs_value_v(tmp);
2838
2839         tmp = gk20a_readl(g, top_num_fbps_r());
2840         gr->max_fbps_count = top_num_fbps_value_v(tmp);
2841
2842         tmp = gk20a_readl(g, top_tpc_per_gpc_r());
2843         gr->max_tpc_per_gpc_count = top_tpc_per_gpc_value_v(tmp);
2844
2845         gr->max_tpc_count = gr->max_gpc_count * gr->max_tpc_per_gpc_count;
2846
2847         tmp = gk20a_readl(g, top_num_fbps_r());
2848         gr->sys_count = top_num_fbps_value_v(tmp);
2849
2850         tmp = gk20a_readl(g, pri_ringmaster_enum_gpc_r());
2851         gr->gpc_count = pri_ringmaster_enum_gpc_count_v(tmp);
2852
2853         gr->pe_count_per_gpc = proj_scal_litter_num_pes_per_gpc_v();
2854         gr->max_zcull_per_gpc_count = proj_scal_litter_num_zcull_banks_v();
2855
2856         if (!gr->gpc_count) {
2857                 nvhost_err(dev_from_gk20a(g), "gpc_count==0!");
2858                 goto clean_up;
2859         }
2860
2861         gr->gpc_tpc_count = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2862         gr->gpc_zcb_count = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2863         gr->gpc_ppc_count = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2864         gr->pes_tpc_count[0] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2865         gr->pes_tpc_count[1] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2866         gr->pes_tpc_mask[0] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2867         gr->pes_tpc_mask[1] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2868         gr->gpc_skip_mask =
2869                 kzalloc(gr_pd_dist_skip_table__size_1_v() * 4 * sizeof(u32),
2870                         GFP_KERNEL);
2871
2872         if (!gr->gpc_tpc_count || !gr->gpc_zcb_count || !gr->gpc_ppc_count ||
2873             !gr->pes_tpc_count[0] || !gr->pes_tpc_count[1] ||
2874             !gr->pes_tpc_mask[0] || !gr->pes_tpc_mask[1] || !gr->gpc_skip_mask)
2875                 goto clean_up;
2876
2877         gr->ppc_count = 0;
2878         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
2879                 tmp = gk20a_readl(g, gr_gpc0_fs_gpc_r());
2880
2881                 gr->gpc_tpc_count[gpc_index] =
2882                         gr_gpc0_fs_gpc_num_available_tpcs_v(tmp);
2883                 gr->tpc_count += gr->gpc_tpc_count[gpc_index];
2884
2885                 gr->gpc_zcb_count[gpc_index] =
2886                         gr_gpc0_fs_gpc_num_available_zculls_v(tmp);
2887                 gr->zcb_count += gr->gpc_zcb_count[gpc_index];
2888
2889                 gr->gpc_ppc_count[gpc_index] = gr->pe_count_per_gpc;
2890                 gr->ppc_count += gr->gpc_ppc_count[gpc_index];
2891                 for (pes_index = 0; pes_index < gr->pe_count_per_gpc; pes_index++) {
2892
2893                         tmp = gk20a_readl(g,
2894                                 gr_gpc0_gpm_pd_pes_tpc_id_mask_r(pes_index) +
2895                                 gpc_index * proj_gpc_stride_v());
2896
2897                         pes_tpc_mask = gr_gpc0_gpm_pd_pes_tpc_id_mask_mask_v(tmp);
2898                         pes_tpc_count = count_bits(pes_tpc_mask);
2899
2900                         gr->pes_tpc_count[pes_index][gpc_index] = pes_tpc_count;
2901                         gr->pes_tpc_mask[pes_index][gpc_index] = pes_tpc_mask;
2902                 }
2903
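                /* Build the TPC skip mask for this GPC: for a 5-TPC GPC, or
                 * a 4-TPC GPC split unevenly across its two PESes, skip the
                 * lowest-numbered TPC of the heavier PES.
                 * mask ^ (mask & (mask - 1)) isolates that lowest set bit. */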
2904                 gpc_new_skip_mask = 0;
2905                 if (gr->pes_tpc_count[0][gpc_index] +
2906                     gr->pes_tpc_count[1][gpc_index] == 5) {
2907                         pes_heavy_index =
2908                                 gr->pes_tpc_count[0][gpc_index] >
2909                                 gr->pes_tpc_count[1][gpc_index] ? 0 : 1;
2910
2911                         gpc_new_skip_mask =
2912                                 gr->pes_tpc_mask[pes_heavy_index][gpc_index] ^
2913                                    (gr->pes_tpc_mask[pes_heavy_index][gpc_index] &
2914                                    (gr->pes_tpc_mask[pes_heavy_index][gpc_index] - 1));
2915
2916                 } else if ((gr->pes_tpc_count[0][gpc_index] +
2917                             gr->pes_tpc_count[1][gpc_index] == 4) &&
2918                            (gr->pes_tpc_count[0][gpc_index] !=
2919                             gr->pes_tpc_count[1][gpc_index])) {
2920                         pes_heavy_index =
2921                                 gr->pes_tpc_count[0][gpc_index] >
2922                                 gr->pes_tpc_count[1][gpc_index] ? 0 : 1;
2923
2924                         gpc_new_skip_mask =
2925                                 gr->pes_tpc_mask[pes_heavy_index][gpc_index] ^
2926                                    (gr->pes_tpc_mask[pes_heavy_index][gpc_index] &
2927                                    (gr->pes_tpc_mask[pes_heavy_index][gpc_index] - 1));
2928                 }
2929                 gr->gpc_skip_mask[gpc_index] = gpc_new_skip_mask;
2930         }
2931
2932         nvhost_dbg_info("fbps: %d", gr->num_fbps);
2933         nvhost_dbg_info("max_gpc_count: %d", gr->max_gpc_count);
2934         nvhost_dbg_info("max_fbps_count: %d", gr->max_fbps_count);
2935         nvhost_dbg_info("max_tpc_per_gpc_count: %d", gr->max_tpc_per_gpc_count);
2936         nvhost_dbg_info("max_zcull_per_gpc_count: %d", gr->max_zcull_per_gpc_count);
2937         nvhost_dbg_info("max_tpc_count: %d", gr->max_tpc_count);
2938         nvhost_dbg_info("sys_count: %d", gr->sys_count);
2939         nvhost_dbg_info("gpc_count: %d", gr->gpc_count);
2940         nvhost_dbg_info("pe_count_per_gpc: %d", gr->pe_count_per_gpc);
2941         nvhost_dbg_info("tpc_count: %d", gr->tpc_count);
2942         nvhost_dbg_info("ppc_count: %d", gr->ppc_count);
2943
2944         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
2945                 nvhost_dbg_info("gpc_tpc_count[%d] : %d",
2946                            gpc_index, gr->gpc_tpc_count[gpc_index]);
2947         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
2948                 nvhost_dbg_info("gpc_zcb_count[%d] : %d",
2949                            gpc_index, gr->gpc_zcb_count[gpc_index]);
2950         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
2951                 nvhost_dbg_info("gpc_ppc_count[%d] : %d",
2952                            gpc_index, gr->gpc_ppc_count[gpc_index]);
2953         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
2954                 nvhost_dbg_info("gpc_skip_mask[%d] : %d",
2955                            gpc_index, gr->gpc_skip_mask[gpc_index]);
2956         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
2957                 for (pes_index = 0;
2958                      pes_index < gr->pe_count_per_gpc;
2959                      pes_index++)
2960                         nvhost_dbg_info("pes_tpc_count[%d][%d] : %d",
2961                                    pes_index, gpc_index,
2962                                    gr->pes_tpc_count[pes_index][gpc_index]);
2963
2964         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
2965                 for (pes_index = 0;
2966                      pes_index < gr->pe_count_per_gpc;
2967                      pes_index++)
2968                         nvhost_dbg_info("pes_tpc_mask[%d][%d] : %d",
2969                                    pes_index, gpc_index,
2970                                    gr->pes_tpc_mask[pes_index][gpc_index]);
2971
2972         gr->bundle_cb_default_size = gr_scc_bundle_cb_size_div_256b__prod_v();
2973         gr->min_gpm_fifo_depth = gr_pd_ab_dist_cfg2_state_limit_min_gpm_fifo_depths_v();
2974         gr->bundle_cb_token_limit = gr_pd_ab_dist_cfg2_token_limit_init_v();
2975         gr->attrib_cb_default_size = gr_gpc0_ppc0_cbm_cfg_size_default_v();
2976         /* gk20a has a fixed beta CB RAM, don't alloc more */
2977         gr->attrib_cb_size = gr->attrib_cb_default_size;
2978         gr->alpha_cb_default_size = gr_gpc0_ppc0_cbm_cfg2_size_default_v();
2979         gr->alpha_cb_size = gr->alpha_cb_default_size + (gr->alpha_cb_default_size >> 1);
2980         gr->timeslice_mode = gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v();
2981
2982         nvhost_dbg_info("bundle_cb_default_size: %d",
2983                    gr->bundle_cb_default_size);
2984         nvhost_dbg_info("min_gpm_fifo_depth: %d", gr->min_gpm_fifo_depth);
2985         nvhost_dbg_info("bundle_cb_token_limit: %d", gr->bundle_cb_token_limit);
2986         nvhost_dbg_info("attrib_cb_default_size: %d",
2987                    gr->attrib_cb_default_size);
2988         nvhost_dbg_info("attrib_cb_size: %d", gr->attrib_cb_size);
2989         nvhost_dbg_info("alpha_cb_default_size: %d", gr->alpha_cb_default_size);
2990         nvhost_dbg_info("alpha_cb_size: %d", gr->alpha_cb_size);
2991         nvhost_dbg_info("timeslice_mode: %d", gr->timeslice_mode);
2992
2993         return 0;
2994
2995 clean_up:
2996         return -ENOMEM;
2997 }
2998
2999 static int gr_gk20a_init_mmu_sw(struct gk20a *g, struct gr_gk20a *gr)
3000 {
3001         struct device *d = dev_from_gk20a(g);
3002
3003         gr->mmu_wr_mem_size = gr->mmu_rd_mem_size = 0x1000;
3004
3005         gr->mmu_wr_mem.size = gr->mmu_wr_mem_size;
3006         gr->mmu_wr_mem.cpuva = dma_zalloc_coherent(d, gr->mmu_wr_mem_size,
3007                                         &gr->mmu_wr_mem.iova, GFP_KERNEL);
3008         if (!gr->mmu_wr_mem.cpuva)
3009                 goto err;
3010
3011         gr->mmu_rd_mem.size = gr->mmu_rd_mem_size;
3012         gr->mmu_rd_mem.cpuva = dma_zalloc_coherent(d, gr->mmu_rd_mem_size,
3013                                         &gr->mmu_rd_mem.iova, GFP_KERNEL);
3014         if (!gr->mmu_rd_mem.cpuva)
3015                 goto err_free_wr_mem;
3016         return 0;
3017
3018  err_free_wr_mem:
3019         dma_free_coherent(d, gr->mmu_wr_mem.size,
3020                 gr->mmu_wr_mem.cpuva, gr->mmu_wr_mem.iova);
3021         gr->mmu_wr_mem.cpuva = NULL;
3022         gr->mmu_wr_mem.iova = 0;
3023  err:
3024         return -ENOMEM;
3025 }
3026
3027 static u32 prime_set[18] = {
3028         2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61 };
3029
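/*
 * Build the screen-tile to GPC mapping. GPCs are sorted by descending TPC
 * count and tiles are handed out in proportion to each GPC's TPC count via
 * an error-diffusion loop. map_row_offset is picked so that rows do not
 * repeat the same GPC pattern: the first prime (from 3 upward) that does
 * not divide tpc_count, with hand-tuned overrides for some TPC counts.
 */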
3030 static int gr_gk20a_init_map_tiles(struct gk20a *g, struct gr_gk20a *gr)
3031 {
3032         s32 comm_denom;
3033         s32 mul_factor;
3034         s32 *init_frac = NULL;
3035         s32 *init_err = NULL;
3036         s32 *run_err = NULL;
3037         s32 *sorted_num_tpcs = NULL;
3038         s32 *sorted_to_unsorted_gpc_map = NULL;
3039         u32 gpc_index;
3040         u32 gpc_mark = 0;
3041         u32 num_tpc;
3042         u32 max_tpc_count = 0;
3043         u32 swap;
3044         u32 tile_count;
3045         u32 index;
3046         bool delete_map = false;
3047         bool gpc_sorted;
3048         int ret = 0;
3049
3050         init_frac = kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
3051         init_err = kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
3052         run_err = kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
3053         sorted_num_tpcs =
3054                 kzalloc(proj_scal_max_gpcs_v() *
3055                         proj_scal_max_tpc_per_gpc_v() * sizeof(s32),
3056                         GFP_KERNEL);
3057         sorted_to_unsorted_gpc_map =
3058                 kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
3059
3060         if (!(init_frac && init_err && run_err && sorted_num_tpcs &&
3061               sorted_to_unsorted_gpc_map)) {
3062                 ret = -ENOMEM;
3063                 goto clean_up;
3064         }
3065
3066         gr->map_row_offset = INVALID_SCREEN_TILE_ROW_OFFSET;
3067
3068         if (gr->tpc_count == 3)
3069                 gr->map_row_offset = 2;
3070         else if (gr->tpc_count < 3)
3071                 gr->map_row_offset = 1;
3072         else {
3073                 gr->map_row_offset = 3;
3074
3075                 for (index = 1; index < 18; index++) {
3076                         u32 prime = prime_set[index];
3077                         if ((gr->tpc_count % prime) != 0) {
3078                                 gr->map_row_offset = prime;
3079                                 break;
3080                         }
3081                 }
3082         }
3083
3084         switch (gr->tpc_count) {
3085         case 15:
3086                 gr->map_row_offset = 6;
3087                 break;
3088         case 14:
3089                 gr->map_row_offset = 5;
3090                 break;
3091         case 13:
3092                 gr->map_row_offset = 2;
3093                 break;
3094         case 11:
3095                 gr->map_row_offset = 7;
3096                 break;
3097         case 10:
3098                 gr->map_row_offset = 6;
3099                 break;
3100         case 7:
3101         case 5:
3102                 gr->map_row_offset = 1;
3103                 break;
3104         default:
3105                 break;
3106         }
3107
3108         if (gr->map_tiles) {
3109                 if (gr->map_tile_count != gr->tpc_count)
3110                         delete_map = true;
3111
3112                 for (tile_count = 0; tile_count < gr->map_tile_count; tile_count++) {
3113                         if ((u32)gr->map_tiles[tile_count] >= gr->tpc_count)
3114                                 delete_map = true;
3115                 }
3116
3117                 if (delete_map) {
3118                         kfree(gr->map_tiles);
3119                         gr->map_tiles = NULL;
3120                         gr->map_tile_count = 0;
3121                 }
3122         }
3123
3124         if (gr->map_tiles == NULL) {
3125                 gr->map_tile_count = proj_scal_max_gpcs_v();
3126
3127                 gr->map_tiles = kzalloc(proj_scal_max_gpcs_v() * sizeof(u8), GFP_KERNEL);
3128                 if (gr->map_tiles == NULL) {
3129                         ret = -ENOMEM;
3130                         goto clean_up;
3131                 }
3132
3133                 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3134                         sorted_num_tpcs[gpc_index] = gr->gpc_tpc_count[gpc_index];
3135                         sorted_to_unsorted_gpc_map[gpc_index] = gpc_index;
3136                 }
3137
3138                 gpc_sorted = false;
3139                 while (!gpc_sorted) {
3140                         gpc_sorted = true;
3141                         for (gpc_index = 0; gpc_index < gr->gpc_count - 1; gpc_index++) {
3142                                 if (sorted_num_tpcs[gpc_index + 1] > sorted_num_tpcs[gpc_index]) {
3143                                         gpc_sorted = false;
3144                                         swap = sorted_num_tpcs[gpc_index];
3145                                         sorted_num_tpcs[gpc_index] = sorted_num_tpcs[gpc_index + 1];
3146                                         sorted_num_tpcs[gpc_index + 1] = swap;
3147                                         swap = sorted_to_unsorted_gpc_map[gpc_index];
3148                                         sorted_to_unsorted_gpc_map[gpc_index] =
3149                                                 sorted_to_unsorted_gpc_map[gpc_index + 1];
3150                                         sorted_to_unsorted_gpc_map[gpc_index + 1] = swap;
3151                                 }
3152                         }
3153                 }
3154
3155                 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3156                         if (gr->gpc_tpc_count[gpc_index] > max_tpc_count)
3157                                 max_tpc_count = gr->gpc_tpc_count[gpc_index];
3158
3159                 mul_factor = gr->gpc_count * max_tpc_count;
3160                 if (mul_factor & 0x1)
3161                         mul_factor = 2;
3162                 else
3163                         mul_factor = 1;
3164
3165                 comm_denom = gr->gpc_count * max_tpc_count * mul_factor;
3166
3167                 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3168                         num_tpc = sorted_num_tpcs[gpc_index];
3169
3170                         init_frac[gpc_index] = num_tpc * gr->gpc_count * mul_factor;
3171
3172                         if (num_tpc != 0)
3173                                 init_err[gpc_index] = gpc_index * max_tpc_count * mul_factor - comm_denom/2;
3174                         else
3175                                 init_err[gpc_index] = 0;
3176
3177                         run_err[gpc_index] = init_frac[gpc_index] + init_err[gpc_index];
3178                 }
3179
3180                 while (gpc_mark < gr->tpc_count) {
3181                         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3182                                 if ((run_err[gpc_index] * 2) >= comm_denom) {
3183                                         gr->map_tiles[gpc_mark++] = (u8)sorted_to_unsorted_gpc_map[gpc_index];
3184                                         run_err[gpc_index] += init_frac[gpc_index] - comm_denom;
3185                                 } else
3186                                         run_err[gpc_index] += init_frac[gpc_index];
3187                         }
3188                 }
3189         }
3190
3191 clean_up:
3192         kfree(init_frac);
3193         kfree(init_err);
3194         kfree(run_err);
3195         kfree(sorted_num_tpcs);
3196         kfree(sorted_to_unsorted_gpc_map);
3197
3198         if (ret)
3199                 nvhost_err(dev_from_gk20a(g), "fail");
3200         else
3201                 nvhost_dbg_fn("done");
3202
3203         return ret;
3204 }
3205
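/*
 * Size and allocate the compression-bit (comptag) backing store. Each
 * comptag line covers 128KB, so max_comptag_mem (in MB) << 3 gives the
 * number of lines needed. The backing size is padded for per-FBP alignment
 * and rounded up to a 64KB multiple, and a simple allocator hands out
 * comptag lines starting at index 1 (index 0 is left unused).
 */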
3206 static int gr_gk20a_init_comptag(struct gk20a *g, struct gr_gk20a *gr)
3207 {
3208         struct mem_mgr *memmgr = mem_mgr_from_g(g);
3209
3210         /* max memory size (MB) to cover */
3211         u32 max_size = gr->max_comptag_mem;
3212         /* one tag line covers 128KB */
3213         u32 max_comptag_lines = max_size << 3;
3214
3215         u32 hw_max_comptag_lines =
3216                 ltc_ltcs_ltss_cbc_ctrl3_clear_upper_bound_init_v();
3217
3218         u32 cbc_param =
3219                 gk20a_readl(g, ltc_ltcs_ltss_cbc_param_r());
3220         u32 comptags_per_cacheline =
3221                 ltc_ltcs_ltss_cbc_param_comptags_per_cache_line_v(cbc_param);
3222         u32 slices_per_fbp =
3223                 ltc_ltcs_ltss_cbc_param_slices_per_fbp_v(cbc_param);
3224         u32 cacheline_size =
3225                 512 << ltc_ltcs_ltss_cbc_param_cache_line_size_v(cbc_param);
3226
3227         u32 compbit_backing_size;
3228         int ret = 0;
3229
3230         nvhost_dbg_fn("");
3231
3232         if (max_comptag_lines == 0) {
3233                 gr->compbit_store.mem.size = 0;
3234                 return 0;
3235         }
3236
3237         if (max_comptag_lines > hw_max_comptag_lines)
3238                 max_comptag_lines = hw_max_comptag_lines;
3239
3240         /* no hybrid fb */
3241         compbit_backing_size =
3242                 DIV_ROUND_UP(max_comptag_lines, comptags_per_cacheline) *
3243                 cacheline_size * slices_per_fbp * gr->num_fbps;
3244
3245         /* aligned to 2KB * num_fbps */
3246         compbit_backing_size +=
3247                 gr->num_fbps << ltc_ltcs_ltss_cbc_base_alignment_shift_v();
3248
3249         /* must be a multiple of 64KB */
3250         compbit_backing_size = roundup(compbit_backing_size, 64*1024);
3251
3252         max_comptag_lines =
3253                 (compbit_backing_size * comptags_per_cacheline) /
3254                 (cacheline_size * slices_per_fbp * gr->num_fbps);
3255
3256         if (max_comptag_lines > hw_max_comptag_lines)
3257                 max_comptag_lines = hw_max_comptag_lines;
3258
3259         nvhost_dbg_info("compbit backing store size : %d",
3260                 compbit_backing_size);
3261         nvhost_dbg_info("max comptag lines : %d",
3262                 max_comptag_lines);
3263
3264         gr->compbit_store.mem.ref =
3265                 nvhost_memmgr_alloc(memmgr, compbit_backing_size,
3266                                     DEFAULT_ALLOC_ALIGNMENT,
3267                                     DEFAULT_ALLOC_FLAGS,
3268                                     0);
3269         if (IS_ERR(gr->compbit_store.mem.ref)) {
3270                 nvhost_err(dev_from_gk20a(g), "failed to allocate"
3271                            " backing store for compbit: size %d",
3272                            compbit_backing_size);
3273                 return PTR_ERR(gr->compbit_store.mem.ref);
3274         }
3275         gr->compbit_store.mem.size = compbit_backing_size;
3276
3277         gr->compbit_store.mem.sgt =
3278                 nvhost_memmgr_pin(memmgr, gr->compbit_store.mem.ref,
3279                                 dev_from_gk20a(g), mem_flag_none);
3280         if (IS_ERR(gr->compbit_store.mem.sgt)) {
3281                 ret = PTR_ERR(gr->compbit_store.mem.sgt);
3282                 goto clean_up;
3283         }
3284         gr->compbit_store.base_pa =
3285                 gk20a_mm_iova_addr(gr->compbit_store.mem.sgt->sgl);
3286
3287         nvhost_allocator_init(&gr->comp_tags, "comptag",
3288                               1, /* start */
3289                               max_comptag_lines - 1, /* length*/
3290                               1); /* align */
3291
3292         return 0;
3293
3294 clean_up:
3295         if (gr->compbit_store.mem.sgt)
3296                 nvhost_memmgr_free_sg_table(memmgr, gr->compbit_store.mem.ref,
3297                                 gr->compbit_store.mem.sgt);
3298         nvhost_memmgr_put(memmgr, gr->compbit_store.mem.ref);
3299         return ret;
3300 }
3301
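/*
 * Clear comptag lines [min, max] in every L2 slice: program the clear
 * bounds, kick the clear via the broadcast ctrl1 register, then poll each
 * per-slice ctrl1 until the clear-active bit drops (with a timeout on
 * silicon).
 */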
3302 int gk20a_gr_clear_comptags(struct gk20a *g, u32 min, u32 max)
3303 {
3304         struct gr_gk20a *gr = &g->gr;
3305         u32 fbp, slice, ctrl1, val;
3306         unsigned long end_jiffies = jiffies +
3307                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
3308         u32 delay = GR_IDLE_CHECK_DEFAULT;
3309         u32 slices_per_fbp =
3310                 ltc_ltcs_ltss_cbc_param_slices_per_fbp_v(
3311                         gk20a_readl(g, ltc_ltcs_ltss_cbc_param_r()));
3312
3313         nvhost_dbg_fn("");
3314
3315         if (gr->compbit_store.mem.size == 0)
3316                 return 0;
3317
3318         gk20a_writel(g, ltc_ltcs_ltss_cbc_ctrl2_r(),
3319                      ltc_ltcs_ltss_cbc_ctrl2_clear_lower_bound_f(min));
3320         gk20a_writel(g, ltc_ltcs_ltss_cbc_ctrl3_r(),
3321                      ltc_ltcs_ltss_cbc_ctrl3_clear_upper_bound_f(max));
3322         gk20a_writel(g, ltc_ltcs_ltss_cbc_ctrl1_r(),
3323                      gk20a_readl(g, ltc_ltcs_ltss_cbc_ctrl1_r()) |
3324                      ltc_ltcs_ltss_cbc_ctrl1_clear_active_f());
3325
3326         for (fbp = 0; fbp < gr->num_fbps; fbp++) {
3327                 for (slice = 0; slice < slices_per_fbp; slice++) {
3328
3329                         delay = GR_IDLE_CHECK_DEFAULT;
3330
3331                         ctrl1 = ltc_ltc0_lts0_cbc_ctrl1_r() +
3332                                 fbp * proj_ltc_stride_v() +
3333                                 slice * proj_lts_stride_v();
3334
3335                         do {
3336                                 val = gk20a_readl(g, ctrl1);
3337                                 if (ltc_ltcs_ltss_cbc_ctrl1_clear_v(val) !=
3338                                     ltc_ltcs_ltss_cbc_ctrl1_clear_active_v())
3339                                         break;
3340
3341                                 usleep_range(delay, delay * 2);
3342                                 delay = min_t(u32, delay << 1,
3343                                         GR_IDLE_CHECK_MAX);
3344
3345                         } while (time_before(jiffies, end_jiffies) ||
3346                                         !tegra_platform_is_silicon());
3347
3348                         if (!time_before(jiffies, end_jiffies)) {
3349                                 nvhost_err(dev_from_gk20a(g),
3350                                            "comp tag clear timeout\n");
3351                                 return -EBUSY;
3352                         }
3353                 }
3354         }
3355
3356         return 0;
3357 }
3358
3359 static int gr_gk20a_init_zcull(struct gk20a *g, struct gr_gk20a *gr)
3360 {
3361         struct gr_zcull_gk20a *zcull = &gr->zcull;
3362
3363         zcull->aliquot_width = gr->tpc_count * 16;
3364         zcull->aliquot_height = 16;
3365
3366         zcull->width_align_pixels = gr->tpc_count * 16;
3367         zcull->height_align_pixels = 32;
3368
3369         zcull->aliquot_size =
3370                 zcull->aliquot_width * zcull->aliquot_height;
3371
3372         /* assume no floor sweeping since we only have 1 tpc in 1 gpc */
3373         zcull->pixel_squares_by_aliquots =
3374                 gr->zcb_count * 16 * 16 * gr->tpc_count /
3375                 (gr->gpc_count * gr->gpc_tpc_count[0]);
3376
3377         zcull->total_aliquots =
3378                 gr_gpc0_zcull_total_ram_size_num_aliquots_f(
3379                         gk20a_readl(g, gr_gpc0_zcull_total_ram_size_r()));
3380
3381         return 0;
3382 }
3383
3384 u32 gr_gk20a_get_ctxsw_zcull_size(struct gk20a *g, struct gr_gk20a *gr)
3385 {
3386         /* assuming gr has already been initialized */
3387         return gr->ctx_vars.zcull_ctxsw_image_size;
3388 }
3389
3390 int gr_gk20a_bind_ctxsw_zcull(struct gk20a *g, struct gr_gk20a *gr,
3391                         struct channel_gk20a *c, u64 zcull_va, u32 mode)
3392 {
3393         struct zcull_ctx_desc *zcull_ctx = &c->ch_ctx.zcull_ctx;
3394
3395         zcull_ctx->ctx_sw_mode = mode;
3396         zcull_ctx->gpu_va = zcull_va;
3397
3398         /* TBD: don't disable channel in sw method processing */
3399         return gr_gk20a_ctx_zcull_setup(g, c, true);
3400 }
3401
3402 int gr_gk20a_get_zcull_info(struct gk20a *g, struct gr_gk20a *gr,
3403                         struct gr_zcull_info *zcull_params)
3404 {
3405         struct gr_zcull_gk20a *zcull = &gr->zcull;
3406
3407         zcull_params->width_align_pixels = zcull->width_align_pixels;
3408         zcull_params->height_align_pixels = zcull->height_align_pixels;
3409         zcull_params->pixel_squares_by_aliquots =
3410                 zcull->pixel_squares_by_aliquots;
3411         zcull_params->aliquot_total = zcull->total_aliquots;
3412
3413         zcull_params->region_byte_multiplier =
3414                 gr->gpc_count * gr_zcull_bytes_per_aliquot_per_gpu_v();
3415         zcull_params->region_header_size =
3416                 proj_scal_litter_num_gpcs_v() *
3417                 gr_zcull_save_restore_header_bytes_per_gpc_v();
3418
3419         zcull_params->subregion_header_size =
3420                 proj_scal_litter_num_gpcs_v() *
3421                 gr_zcull_save_restore_subregion_header_bytes_per_gpc_v();
3422
3423         zcull_params->subregion_width_align_pixels =
3424                 gr->tpc_count * gr_gpc0_zcull_zcsize_width_subregion__multiple_v();
3425         zcull_params->subregion_height_align_pixels =
3426                 gr_gpc0_zcull_zcsize_height_subregion__multiple_v();
3427         zcull_params->subregion_count = gr_zcull_subregion_qty_v();
3428
3429         return 0;
3430 }
3431
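/*
 * Program one ZBC color table entry: quiesce the gr engine, write the clear
 * values into the L2 and DS ZBC tables, trigger the table load, then mirror
 * the entry into the driver's local copy.
 */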
3432 static int gr_gk20a_add_zbc_color(struct gk20a *g, struct gr_gk20a *gr,
3433                                 struct zbc_entry *color_val, u32 index)
3434 {
3435         struct fifo_gk20a *f = &g->fifo;
3436         struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
3437         u32 i;
3438         unsigned long end_jiffies = jiffies +
3439                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
3440         u32 ret;
3441
3442         ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
3443         if (ret) {
3444                 nvhost_err(dev_from_gk20a(g),
3445                         "failed to disable gr engine activity\n");
3446                 return ret;
3447         }
3448
3449         ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
3450         if (ret) {
3451                 nvhost_err(dev_from_gk20a(g),
3452                         "failed to idle graphics\n");
3453                 goto clean_up;
3454         }
3455
3456         /* update l2 table */
3457         gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_index_r(),
3458                         (gk20a_readl(g, ltc_ltcs_ltss_dstg_zbc_index_r()) &
3459                          ~ltc_ltcs_ltss_dstg_zbc_index_address_f(~0)) |
3460                                 ltc_ltcs_ltss_dstg_zbc_index_address_f(index +
3461                                         GK20A_STARTOF_ZBC_TABLE));
3462
3463         for (i = 0; i < ltc_ltcs_ltss_dstg_zbc_color_clear_value__size_1_v(); i++)
3464                 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_color_clear_value_r(i),
3465                         color_val->color_l2[i]);
3466
3467         /* update ds table */
3468         gk20a_writel(g, gr_ds_zbc_color_r_r(),
3469                 gr_ds_zbc_color_r_val_f(color_val->color_ds[0]));
3470         gk20a_writel(g, gr_ds_zbc_color_g_r(),
3471                 gr_ds_zbc_color_g_val_f(color_val->color_ds[1]));
3472         gk20a_writel(g, gr_ds_zbc_color_b_r(),
3473                 gr_ds_zbc_color_b_val_f(color_val->color_ds[2]));
3474         gk20a_writel(g, gr_ds_zbc_color_a_r(),
3475                 gr_ds_zbc_color_a_val_f(color_val->color_ds[3]));
3476
3477         gk20a_writel(g, gr_ds_zbc_color_fmt_r(),
3478                 gr_ds_zbc_color_fmt_val_f(color_val->format));
3479
3480         gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
3481                 gr_ds_zbc_tbl_index_val_f(index + GK20A_STARTOF_ZBC_TABLE));
3482
3483         /* trigger the write */
3484         gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
3485                 gr_ds_zbc_tbl_ld_select_c_f() |
3486                 gr_ds_zbc_tbl_ld_action_write_f() |
3487                 gr_ds_zbc_tbl_ld_trigger_active_f());
3488
3489         /* update local copy */
3490         for (i = 0; i < ltc_ltcs_ltss_dstg_zbc_color_clear_value__size_1_v(); i++) {
3491                 gr->zbc_col_tbl[index].color_l2[i] = color_val->color_l2[i];
3492                 gr->zbc_col_tbl[index].color_ds[i] = color_val->color_ds[i];
3493         }
3494         gr->zbc_col_tbl[index].format = color_val->format;
3495         gr->zbc_col_tbl[index].ref_cnt++;
3496
3497 clean_up:
3498         ret = gk20a_fifo_enable_engine_activity(g, gr_info);
3499         if (ret) {
3500                 nvhost_err(dev_from_gk20a(g),
3501                         "failed to enable gr engine activity\n");
3502         }
3503
3504         return ret;
3505 }
3506
3507 static int gr_gk20a_add_zbc_depth(struct gk20a *g, struct gr_gk20a *gr,
3508                                 struct zbc_entry *depth_val, u32 index)
3509 {
3510         struct fifo_gk20a *f = &g->fifo;
3511         struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
3512         unsigned long end_jiffies = jiffies +
3513                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
3514         u32 ret;
3515
3516         ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
3517         if (ret) {
3518                 nvhost_err(dev_from_gk20a(g),
3519                         "failed to disable gr engine activity\n");
3520                 return ret;
3521         }
3522
3523         ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
3524         if (ret) {
3525                 nvhost_err(dev_from_gk20a(g),
3526                         "failed to idle graphics\n");
3527                 goto clean_up;
3528         }
3529
3530         /* update l2 table */
3531         gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_index_r(),
3532                         (gk20a_readl(g, ltc_ltcs_ltss_dstg_zbc_index_r()) &
3533                          ~ltc_ltcs_ltss_dstg_zbc_index_address_f(~0)) |
3534                                 ltc_ltcs_ltss_dstg_zbc_index_address_f(index +
3535                                         GK20A_STARTOF_ZBC_TABLE));
3536
3537         gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_depth_clear_value_r(),
3538                         depth_val->depth);
3539
3540         /* update ds table */
3541         gk20a_writel(g, gr_ds_zbc_z_r(),
3542                 gr_ds_zbc_z_val_f(depth_val->depth));
3543
3544         gk20a_writel(g, gr_ds_zbc_z_fmt_r(),
3545                 gr_ds_zbc_z_fmt_val_f(depth_val->format));
3546
3547         gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
3548                 gr_ds_zbc_tbl_index_val_f(index + GK20A_STARTOF_ZBC_TABLE));
3549
3550         /* trigger the write */
3551         gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
3552                 gr_ds_zbc_tbl_ld_select_z_f() |
3553                 gr_ds_zbc_tbl_ld_action_write_f() |
3554                 gr_ds_zbc_tbl_ld_trigger_active_f());
3555
3556         /* update local copy */
3557         gr->zbc_dep_tbl[index].depth = depth_val->depth;
3558         gr->zbc_dep_tbl[index].format = depth_val->format;
3559         gr->zbc_dep_tbl[index].ref_cnt++;
3560
3561 clean_up:
3562         ret = gk20a_fifo_enable_engine_activity(g, gr_info);
3563         if (ret) {
3564                 nvhost_err(dev_from_gk20a(g),
3565                         "failed to enable gr engine activity\n");
3566         }
3567
3568         return ret;
3569 }
3570
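/*
 * Add (or re-reference) a ZBC clear value. An existing entry with matching
 * format and value just gets its refcount bumped; otherwise a new hardware
 * table slot is programmed. Newly added entries are also handed to the PMU
 * for ELPG via pmu_save_zbc().
 */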
3571 int gr_gk20a_add_zbc(struct gk20a *g, struct gr_gk20a *gr,
3572                      struct zbc_entry *zbc_val)
3573 {
3574         struct zbc_color_table *c_tbl;
3575         struct zbc_depth_table *d_tbl;
3576         u32 i, ret = -ENOMEM;
3577         bool added = false;
3578         u32 entries;
3579
3580         /* no endian swap ? */
3581
3582         switch (zbc_val->type) {
3583         case GK20A_ZBC_TYPE_COLOR:
3584                 /* search existing tables */
3585                 for (i = 0; i < gr->max_used_color_index; i++) {
3586
3587                         c_tbl = &gr->zbc_col_tbl[i];
3588
3589                         if (c_tbl->ref_cnt && c_tbl->format == zbc_val->format &&
3590                             memcmp(c_tbl->color_ds, zbc_val->color_ds,
3591                                 sizeof(zbc_val->color_ds)) == 0) {
3592
3593                                 if (memcmp(c_tbl->color_l2, zbc_val->color_l2,
3594                                     sizeof(zbc_val->color_l2))) {
3595                                         nvhost_err(dev_from_gk20a(g),
3596                                                 "zbc l2 and ds color don't match with existing entries");
3597                                         return -EINVAL;
3598                                 }
3599                                 added = true;
3600                                 c_tbl->ref_cnt++;
3601                                 ret = 0;
3602                                 break;
3603                         }
3604                 }
3605                 /* add new table */
3606                 if (!added &&
3607                     gr->max_used_color_index < GK20A_ZBC_TABLE_SIZE) {
3608
3609                         c_tbl =
3610                             &gr->zbc_col_tbl[gr->max_used_color_index];
3611                         WARN_ON(c_tbl->ref_cnt != 0);
3612
3613                         ret = gr_gk20a_add_zbc_color(g, gr,
3614                                 zbc_val, gr->max_used_color_index);
3615
3616                         if (!ret)
3617                                 gr->max_used_color_index++;
3618                 }
3619                 break;
3620         case GK20A_ZBC_TYPE_DEPTH:
3621                 /* search existing tables */
3622                 for (i = 0; i < gr->max_used_depth_index; i++) {
3623
3624                         d_tbl = &gr->zbc_dep_tbl[i];
3625
3626                         if (d_tbl->ref_cnt &&
3627                             d_tbl->depth == zbc_val->depth &&
3628                             d_tbl->format == zbc_val->format) {
3629                                 added = true;
3630                                 d_tbl->ref_cnt++;
3631                                 ret = 0;
3632                                 break;
3633                         }
3634                 }
3635                 /* add new table */
3636                 if (!added &&
3637                     gr->max_used_depth_index < GK20A_ZBC_TABLE_SIZE) {
3638
3639                         d_tbl =
3640                             &gr->zbc_dep_tbl[gr->max_used_depth_index];
3641                         WARN_ON(d_tbl->ref_cnt != 0);
3642
3643                         ret = gr_gk20a_add_zbc_depth(g, gr,
3644                                 zbc_val, gr->max_used_depth_index);
3645
3646                         if (!ret)
3647                                 gr->max_used_depth_index++;
3648                 }
3649                 break;
3650         default:
3651                 nvhost_err(dev_from_gk20a(g),
3652                         "invalid zbc table type %d", zbc_val->type);
3653                 return -EINVAL;
3654         }
3655
3656         if (!added && ret == 0) {
3657                 /* update zbc for elpg only when new entry is added */
3658                 entries = max(gr->max_used_color_index,
3659                                         gr->max_used_depth_index);
3660                 pmu_save_zbc(g, entries);
3661         }
3662
3663         return ret;
3664 }
3665
3666 int gr_gk20a_clear_zbc_table(struct gk20a *g, struct gr_gk20a *gr)
3667 {
3668         struct fifo_gk20a *f = &g->fifo;
3669         struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
3670         u32 i, j;
3671         unsigned long end_jiffies = jiffies +
3672                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
3673         u32 ret;
3674
3675         ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
3676         if (ret) {
3677                 nvhost_err(dev_from_gk20a(g),
3678                         "failed to disable gr engine activity\n");
3679                 return ret;
3680         }
3681
3682         ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
3683         if (ret) {
3684                 nvhost_err(dev_from_gk20a(g),
3685                         "failed to idle graphics\n");
3686                 goto clean_up;
3687         }
3688
3689         for (i = 0; i < GK20A_ZBC_TABLE_SIZE; i++) {
3690                 gr->zbc_col_tbl[i].format = 0;
3691                 gr->zbc_col_tbl[i].ref_cnt = 0;
3692
3693                 gk20a_writel(g, gr_ds_zbc_color_fmt_r(),
3694                         gr_ds_zbc_color_fmt_val_invalid_f());
3695                 gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
3696                         gr_ds_zbc_tbl_index_val_f(i + GK20A_STARTOF_ZBC_TABLE));
3697
3698                 /* trigger the write */
3699                 gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
3700                         gr_ds_zbc_tbl_ld_select_c_f() |
3701                         gr_ds_zbc_tbl_ld_action_write_f() |
3702                         gr_ds_zbc_tbl_ld_trigger_active_f());
3703
3704                 /* clear l2 table */
3705                 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_index_r(),
3706                         (gk20a_readl(g, ltc_ltcs_ltss_dstg_zbc_index_r()) &
3707                          ~ltc_ltcs_ltss_dstg_zbc_index_address_f(~0)) |
3708                                 ltc_ltcs_ltss_dstg_zbc_index_address_f(i +
3709                                         GK20A_STARTOF_ZBC_TABLE));
3710
3711                 for (j = 0; j < ltc_ltcs_ltss_dstg_zbc_color_clear_value__size_1_v(); j++) {
3712                         gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_color_clear_value_r(j), 0);
3713                         gr->zbc_col_tbl[i].color_l2[j] = 0;
3714                         gr->zbc_col_tbl[i].color_ds[j] = 0;
3715                 }
3716         }
3717         gr->max_used_color_index = 0;
3718         gr->max_default_color_index = 0;
3719
3720         for (i = 0; i < GK20A_ZBC_TABLE_SIZE; i++) {
3721                 gr->zbc_dep_tbl[i].depth = 0;
3722                 gr->zbc_dep_tbl[i].format = 0;
3723                 gr->zbc_dep_tbl[i].ref_cnt = 0;
3724
3725                 gk20a_writel(g, gr_ds_zbc_z_fmt_r(),
3726                         gr_ds_zbc_z_fmt_val_invalid_f());
3727                 gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
3728                         gr_ds_zbc_tbl_index_val_f(i + GK20A_STARTOF_ZBC_TABLE));
3729
3730                 /* trigger the write */
3731                 gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
3732                         gr_ds_zbc_tbl_ld_select_z_f() |
3733                         gr_ds_zbc_tbl_ld_action_write_f() |
3734                         gr_ds_zbc_tbl_ld_trigger_active_f());
3735
3736                 /* clear l2 table */
3737                 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_index_r(),
3738                         (gk20a_readl(g, ltc_ltcs_ltss_dstg_zbc_index_r()) &
3739                          ~ltc_ltcs_ltss_dstg_zbc_index_address_f(~0)) |
3740                                 ltc_ltcs_ltss_dstg_zbc_index_address_f(i +
3741                                         GK20A_STARTOF_ZBC_TABLE));
3742
3743                 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_depth_clear_value_r(), 0);
3744         }
3745         gr->max_used_depth_index = 0;
3746         gr->max_default_depth_index = 0;
3747
3748 clean_up:
3749         ret = gk20a_fifo_enable_engine_activity(g, gr_info);
3750         if (ret) {
3751                 nvhost_err(dev_from_gk20a(g),
3752                         "failed to enable gr engine activity\n");
3753         }
3754
3755         /* elpg stuff */
3756
3757         return ret;
3758 }
3759
3760 /* get a zbc table entry specified by index
3761  * return table size when type is invalid */
3762 int gr_gk20a_query_zbc(struct gk20a *g, struct gr_gk20a *gr,
3763                         struct zbc_query_params *query_params)
3764 {
3765         u32 index = query_params->index_size;
3766         u32 i;
3767
3768         switch (query_params->type) {
3769         case GK20A_ZBC_TYPE_INVALID:
3770                 query_params->index_size = GK20A_ZBC_TABLE_SIZE;
3771                 break;
3772         case GK20A_ZBC_TYPE_COLOR:
3773                 if (index >= GK20A_ZBC_TABLE_SIZE) {
3774                         nvhost_err(dev_from_gk20a(g),
3775                                 "invalid zbc color table index\n");
3776                         return -EINVAL;
3777                 }
3778                 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3779                         query_params->color_l2[i] =
3780                                 gr->zbc_col_tbl[index].color_l2[i];
3781                         query_params->color_ds[i] =
3782                                 gr->zbc_col_tbl[index].color_ds[i];
3783                 }
3784                 query_params->format = gr->zbc_col_tbl[index].format;
3785                 query_params->ref_cnt = gr->zbc_col_tbl[index].ref_cnt;
3786                 break;
3787         case GK20A_ZBC_TYPE_DEPTH:
3788                 if (index >= GK20A_ZBC_TABLE_SIZE) {
3789                         nvhost_err(dev_from_gk20a(g),
3790                                 "invalid zbc depth table index\n");
3791                         return -EINVAL;
3792                 }
3793                 query_params->depth = gr->zbc_dep_tbl[index].depth;
3794                 query_params->format = gr->zbc_dep_tbl[index].format;
3795                 query_params->ref_cnt = gr->zbc_dep_tbl[index].ref_cnt;
3796                 break;
3797         default:
3798                 nvhost_err(dev_from_gk20a(g),
3799                                 "invalid zbc table type\n");
3800                 return -EINVAL;
3801         }
3802
3803         return 0;
3804 }
3805
3806 static int gr_gk20a_load_zbc_default_table(struct gk20a *g, struct gr_gk20a *gr)
3807 {
3808         struct zbc_entry zbc_val;
3809         u32 i, err;
3810
3811         /* load default color table */
3812         zbc_val.type = GK20A_ZBC_TYPE_COLOR;
3813
3814         zbc_val.format = gr_ds_zbc_color_fmt_val_zero_v();
3815         for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3816                 zbc_val.color_ds[i] = 0;
3817                 zbc_val.color_l2[i] = 0;
3818         }
3819         err = gr_gk20a_add_zbc(g, gr, &zbc_val);
3820
3821         zbc_val.format = gr_ds_zbc_color_fmt_val_unorm_one_v();
3822         for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3823                 zbc_val.color_ds[i] = 0xffffffff;
3824                 zbc_val.color_l2[i] = 0x3f800000;
3825         }
3826         err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3827
3828         zbc_val.format = gr_ds_zbc_color_fmt_val_rf32_gf32_bf32_af32_v();
3829         for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3830                 zbc_val.color_ds[i] = 0;
3831                 zbc_val.color_l2[i] = 0;
3832         }
3833         err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3834
3835         zbc_val.format = gr_ds_zbc_color_fmt_val_rf32_gf32_bf32_af32_v();
3836         for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3837                 zbc_val.color_ds[i] = 0x3f800000;
3838                 zbc_val.color_l2[i] = 0x3f800000;
3839         }
3840         err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3841
3842         if (!err)
3843                 gr->max_default_color_index = 4;
3844         else {
3845                 nvhost_err(dev_from_gk20a(g),
3846                            "fail to load default zbc color table\n");
3847                 return err;
3848         }
3849
3850         /* load default depth table */
3851         zbc_val.type = GK20A_ZBC_TYPE_DEPTH;
3852
3853         zbc_val.format = gr_ds_zbc_z_fmt_val_fp32_v();
3854         zbc_val.depth = 0;
3855         err = gr_gk20a_add_zbc(g, gr, &zbc_val);
3856
3857         zbc_val.format = gr_ds_zbc_z_fmt_val_fp32_v();
3858         zbc_val.depth = 0x3f800000;
3859         err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3860
3861         if (!err)
3862                 gr->max_default_depth_index = 2;
3863         else {
3864                 nvhost_err(dev_from_gk20a(g),
3865                            "fail to load default zbc depth table\n");
3866                 return err;
3867         }
3868
3869         return 0;
3870 }
3871
3872 static int gr_gk20a_init_zbc(struct gk20a *g, struct gr_gk20a *gr)
3873 {
3874         u32 i, j;
3875
3876         /* reset zbc clear */
3877         for (i = 0; i < GK20A_SIZEOF_ZBC_TABLE -
3878             GK20A_STARTOF_ZBC_TABLE; i++) {
3879                 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_index_r(),
3880                         (gk20a_readl(g, ltc_ltcs_ltss_dstg_zbc_index_r()) &
3881                          ~ltc_ltcs_ltss_dstg_zbc_index_address_f(~0)) |
3882                                 ltc_ltcs_ltss_dstg_zbc_index_address_f(
3883                                         i + GK20A_STARTOF_ZBC_TABLE));
3884                 for (j = 0; j < ltc_ltcs_ltss_dstg_zbc_color_clear_value__size_1_v(); j++)
3885                         gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_color_clear_value_r(j), 0);
3886                 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_depth_clear_value_r(), 0);
3887         }
3888
3889         gr_gk20a_clear_zbc_table(g, gr);
3890
3891         gr_gk20a_load_zbc_default_table(g, gr);
3892
3893         return 0;
3894 }
3895
3896 int gk20a_gr_zbc_set_table(struct gk20a *g, struct gr_gk20a *gr,
3897                         struct zbc_entry *zbc_val)
3898 {
3899         nvhost_dbg_fn("");
3900
3901         return gr_gk20a_elpg_protected_call(g,
3902                 gr_gk20a_add_zbc(g, gr, zbc_val));
3903 }
3904
3905 void gr_gk20a_init_blcg_mode(struct gk20a *g, u32 mode, u32 engine)
3906 {
3907         u32 gate_ctrl;
3908
3909         gate_ctrl = gk20a_readl(g, therm_gate_ctrl_r(engine));
3910
3911         switch (mode) {
3912         case BLCG_RUN:
3913                 gate_ctrl = set_field(gate_ctrl,
3914                                 therm_gate_ctrl_blk_clk_m(),
3915                                 therm_gate_ctrl_blk_clk_run_f());
3916                 break;
3917         case BLCG_AUTO:
3918                 gate_ctrl = set_field(gate_ctrl,
3919                                 therm_gate_ctrl_blk_clk_m(),
3920                                 therm_gate_ctrl_blk_clk_auto_f());
3921                 break;
3922         default:
3923                 nvhost_err(dev_from_gk20a(g),
3924                         "invalid blcg mode %d", mode);
3925                 return;
3926         }
3927
3928         gk20a_writel(g, therm_gate_ctrl_r(engine), gate_ctrl);
3929 }
3930
3931 void gr_gk20a_init_elcg_mode(struct gk20a *g, u32 mode, u32 engine)
3932 {
3933         u32 gate_ctrl, idle_filter;
3934
3935         gate_ctrl = gk20a_readl(g, therm_gate_ctrl_r(engine));
3936
3937         switch (mode) {
3938         case ELCG_RUN:
3939                 gate_ctrl = set_field(gate_ctrl,
3940                                 therm_gate_ctrl_eng_clk_m(),
3941                                 therm_gate_ctrl_eng_clk_run_f());
3942                 gate_ctrl = set_field(gate_ctrl,
3943                                 therm_gate_ctrl_eng_pwr_m(),
3944                                 /* set elpg to auto to meet hw expectation */
3945                                 therm_gate_ctrl_eng_pwr_auto_f());
3946                 break;
3947         case ELCG_STOP:
3948                 gate_ctrl = set_field(gate_ctrl,
3949                                 therm_gate_ctrl_eng_clk_m(),
3950                                 therm_gate_ctrl_eng_clk_stop_f());
3951                 break;
3952         case ELCG_AUTO:
3953                 gate_ctrl = set_field(gate_ctrl,
3954                                 therm_gate_ctrl_eng_clk_m(),
3955                                 therm_gate_ctrl_eng_clk_auto_f());
3956                 break;
3957         default:
3958                 nvhost_err(dev_from_gk20a(g),
3959                         "invalid elcg mode %d", mode);
3960         }
3961
3962         if (tegra_platform_is_linsim()) {
3963                 gate_ctrl = set_field(gate_ctrl,
3964                         therm_gate_ctrl_eng_delay_after_m(),
3965                         therm_gate_ctrl_eng_delay_after_f(4));
3966         }
3967
3968         /* idle filter = mant * 2^exp = 2 * (1 << 9) = 1024 clks */
3969         gate_ctrl = set_field(gate_ctrl,
3970                 therm_gate_ctrl_eng_idle_filt_exp_m(),
3971                 therm_gate_ctrl_eng_idle_filt_exp_f(9));
3972         gate_ctrl = set_field(gate_ctrl,
3973                 therm_gate_ctrl_eng_idle_filt_mant_m(),
3974                 therm_gate_ctrl_eng_idle_filt_mant_f(2));
3975         gk20a_writel(g, therm_gate_ctrl_r(engine), gate_ctrl);
3976
3977         /* default fecs_idle_filter to 0 */
3978         idle_filter = gk20a_readl(g, therm_fecs_idle_filter_r());
3979         idle_filter &= ~therm_fecs_idle_filter_value_m();
3980         gk20a_writel(g, therm_fecs_idle_filter_r(), idle_filter);
3981         /* default hubmmu_idle_filter to 0 */
3982         idle_filter = gk20a_readl(g, therm_hubmmu_idle_filter_r());
3983         idle_filter &= ~therm_hubmmu_idle_filter_value_m();
3984         gk20a_writel(g, therm_hubmmu_idle_filter_r(), idle_filter);
3985 }
3986
3987 static int gr_gk20a_zcull_init_hw(struct gk20a *g, struct gr_gk20a *gr)
3988 {
3989         u32 gpc_index, gpc_tpc_count, gpc_zcull_count;
3990         u32 *zcull_map_tiles, *zcull_bank_counters;
3991         u32 map_counter;
3992         u32 rcp_conserv;
3993         u32 offset;
3994         bool floorsweep = false;
3995
3996         if (!gr->map_tiles)
3997                 return -EINVAL;
3998
3999         zcull_map_tiles = kzalloc(proj_scal_max_gpcs_v() *
4000                         proj_scal_max_tpc_per_gpc_v() * sizeof(u32), GFP_KERNEL);
4001         if (!zcull_map_tiles) {
4002                 nvhost_err(dev_from_gk20a(g),
4003                         "failed to allocate zcull temp buffers");
4004                 return -ENOMEM;
4005         }
4006         zcull_bank_counters = kzalloc(proj_scal_max_gpcs_v() *
4007                         proj_scal_max_tpc_per_gpc_v() * sizeof(u32), GFP_KERNEL);
4008
4009         if (!zcull_bank_counters) {
4010                 nvhost_err(dev_from_gk20a(g),
4011                         "failed to allocate zcull temp buffers");
4012                 kfree(zcull_map_tiles);
4013                 return -ENOMEM;
4014         }
4015
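             /* gr->map_tiles[] holds the owning GPC for each tile; counting how
              * many tiles of each GPC have already been placed turns
              * zcull_map_tiles[] into every tile's SM number within its GPC,
              * which is what the sm_in_gpc_number_map registers below expect. */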
4016         for (map_counter = 0; map_counter < gr->tpc_count; map_counter++) {
4017                 zcull_map_tiles[map_counter] =
4018                         zcull_bank_counters[gr->map_tiles[map_counter]];
4019                 zcull_bank_counters[gr->map_tiles[map_counter]]++;
4020         }
4021
4022         gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map0_r(),
4023                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_0_f(zcull_map_tiles[0]) |
4024                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_1_f(zcull_map_tiles[1]) |
4025                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_2_f(zcull_map_tiles[2]) |
4026                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_3_f(zcull_map_tiles[3]) |
4027                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_4_f(zcull_map_tiles[4]) |
4028                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_5_f(zcull_map_tiles[5]) |
4029                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_6_f(zcull_map_tiles[6]) |
4030                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_7_f(zcull_map_tiles[7]));
4031
4032         gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map1_r(),
4033                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_8_f(zcull_map_tiles[8]) |
4034                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_9_f(zcull_map_tiles[9]) |
4035                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_10_f(zcull_map_tiles[10]) |
4036                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_11_f(zcull_map_tiles[11]) |
4037                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_12_f(zcull_map_tiles[12]) |
4038                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_13_f(zcull_map_tiles[13]) |
4039                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_14_f(zcull_map_tiles[14]) |
4040                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_15_f(zcull_map_tiles[15]));
4041
4042         gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map2_r(),
4043                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_16_f(zcull_map_tiles[16]) |
4044                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_17_f(zcull_map_tiles[17]) |
4045                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_18_f(zcull_map_tiles[18]) |
4046                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_19_f(zcull_map_tiles[19]) |
4047                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_20_f(zcull_map_tiles[20]) |
4048                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_21_f(zcull_map_tiles[21]) |
4049                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_22_f(zcull_map_tiles[22]) |
4050                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_23_f(zcull_map_tiles[23]));
4051
4052         gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map3_r(),
4053                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_24_f(zcull_map_tiles[24]) |
4054                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_25_f(zcull_map_tiles[25]) |
4055                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_26_f(zcull_map_tiles[26]) |
4056                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_27_f(zcull_map_tiles[27]) |
4057                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_28_f(zcull_map_tiles[28]) |
4058                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_29_f(zcull_map_tiles[29]) |
4059                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_30_f(zcull_map_tiles[30]) |
4060                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_31_f(zcull_map_tiles[31]));
4061
4062         kfree(zcull_map_tiles);
4063         kfree(zcull_bank_counters);
4064
4065         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4066                 gpc_tpc_count = gr->gpc_tpc_count[gpc_index];
4067                 gpc_zcull_count = gr->gpc_zcb_count[gpc_index];
4068
4069                 if (gpc_zcull_count != gr->max_zcull_per_gpc_count &&
4070                     gpc_zcull_count < gpc_tpc_count) {
4071                         nvhost_err(dev_from_gk20a(g),
4072                                 "zcull_banks (%d) less than tpcs (%d) for gpc (%d)",
4073                                 gpc_zcull_count, gpc_tpc_count, gpc_index);
4074                         return -EINVAL;
4075                 }
4076                 if (gpc_zcull_count != gr->max_zcull_per_gpc_count &&
4077                     gpc_zcull_count != 0)
4078                         floorsweep = true;
4079         }
4080
4081         /* 1.0f / 1.0f * gr_gpc0_zcull_sm_num_rcp_conservative__max_v() */
4082         rcp_conserv = gr_gpc0_zcull_sm_num_rcp_conservative__max_v();
4083
4084         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4085                 offset = gpc_index * proj_gpc_stride_v();
4086
4087                 if (floorsweep) {
4088                         gk20a_writel(g, gr_gpc0_zcull_ram_addr_r() + offset,
4089                                 gr_gpc0_zcull_ram_addr_row_offset_f(gr->map_row_offset) |
4090                                 gr_gpc0_zcull_ram_addr_tiles_per_hypertile_row_per_gpc_f(
4091                                         gr->max_zcull_per_gpc_count));
4092                 } else {
4093                         gk20a_writel(g, gr_gpc0_zcull_ram_addr_r() + offset,
4094                                 gr_gpc0_zcull_ram_addr_row_offset_f(gr->map_row_offset) |
4095                                 gr_gpc0_zcull_ram_addr_tiles_per_hypertile_row_per_gpc_f(
4096                                         gr->gpc_tpc_count[gpc_index]));
4097                 }
4098
4099                 gk20a_writel(g, gr_gpc0_zcull_fs_r() + offset,
4100                         gr_gpc0_zcull_fs_num_active_banks_f(gr->gpc_zcb_count[gpc_index]) |
4101                         gr_gpc0_zcull_fs_num_sms_f(gr->tpc_count));
4102
4103                 gk20a_writel(g, gr_gpc0_zcull_sm_num_rcp_r() + offset,
4104                         gr_gpc0_zcull_sm_num_rcp_conservative_f(rcp_conserv));
4105         }
4106
4107         gk20a_writel(g, gr_gpcs_ppcs_wwdx_sm_num_rcp_r(),
4108                 gr_gpcs_ppcs_wwdx_sm_num_rcp_conservative_f(rcp_conserv));
4109
4110         return 0;
4111 }
4112
4113 static void gk20a_gr_enable_gpc_exceptions(struct gk20a *g)
4114 {
4115         /* enable tpc exception forwarding */
4116         gk20a_writel(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r(),
4117                 gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_enabled_f());
4118
4119         /* enable gpc exception forwarding */
4120         gk20a_writel(g, gr_gpc0_gpccs_gpc_exception_en_r(),
4121                 gr_gpc0_gpccs_gpc_exception_en_tpc_0_enabled_f());
4122 }
4123
4124 static int gk20a_init_gr_setup_hw(struct gk20a *g)
4125 {
4126         struct gr_gk20a *gr = &g->gr;
4127         struct aiv_list_gk20a *sw_ctx_load = &g->gr.ctx_vars.sw_ctx_load;
4128         struct av_list_gk20a *sw_bundle_init = &g->gr.ctx_vars.sw_bundle_init;
4129         struct av_list_gk20a *sw_method_init = &g->gr.ctx_vars.sw_method_init;
4130         u32 data;
4131         u32 addr_lo, addr_hi;
4132         u64 addr;
4133         u32 compbit_base_post_divide;
4134         u64 compbit_base_post_multiply64;
4135         unsigned long end_jiffies = jiffies +
4136                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
4137         u32 fe_go_idle_timeout_save;
4138         u32 last_bundle_data = 0;
4139         u32 last_method_data = 0;
4140         u32 i, err;
4141         u32 l1c_dbg_reg_val;
4142
4143         nvhost_dbg_fn("");
4144
4145         /* slcg prod values */
4146         gr_gk20a_slcg_gr_load_gating_prod(g, g->slcg_enabled);
4147         gr_gk20a_slcg_perf_load_gating_prod(g, g->slcg_enabled);
4148
4149         /* init mmu debug buffer */
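             /* The buffer's physical address is packed into the debug_wr/rd
              * addr field with its low alignment bits dropped: the low word is
              * shifted right by the alignment and the high word is spliced in
              * above it, i.e. the field holds (pa >> alignment) split across
              * the two 32-bit halves. */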
4150         addr = NV_MC_SMMU_VADDR_TRANSLATE(gr->mmu_wr_mem.iova);
4151         addr_lo = u64_lo32(addr);
4152         addr_hi = u64_hi32(addr);
4153         addr = (addr_lo >> fb_mmu_debug_wr_addr_alignment_v()) |
4154                 (addr_hi << (32 - fb_mmu_debug_wr_addr_alignment_v()));
4155
4156         gk20a_writel(g, fb_mmu_debug_wr_r(),
4157                      fb_mmu_debug_wr_aperture_vid_mem_f() |
4158                      fb_mmu_debug_wr_vol_false_f() |
4159                      fb_mmu_debug_wr_addr_v(addr));
4160
4161         addr = NV_MC_SMMU_VADDR_TRANSLATE(gr->mmu_rd_mem.iova);
4162         addr_lo = u64_lo32(addr);
4163         addr_hi = u64_hi32(addr);
4164         addr = (addr_lo >> fb_mmu_debug_rd_addr_alignment_v()) |
4165                 (addr_hi << (32 - fb_mmu_debug_rd_addr_alignment_v()));
4166
4167         gk20a_writel(g, fb_mmu_debug_rd_r(),
4168                      fb_mmu_debug_rd_aperture_vid_mem_f() |
4169                      fb_mmu_debug_rd_vol_false_f() |
4170                      fb_mmu_debug_rd_addr_v(addr));
4171
4172         /* load gr floorsweeping registers */
4173         data = gk20a_readl(g, gr_gpc0_ppc0_pes_vsc_strem_r());
4174         data = set_field(data, gr_gpc0_ppc0_pes_vsc_strem_master_pe_m(),
4175                         gr_gpc0_ppc0_pes_vsc_strem_master_pe_true_f());
4176         gk20a_writel(g, gr_gpc0_ppc0_pes_vsc_strem_r(), data);
4177
4178         gr_gk20a_zcull_init_hw(g, gr);
4179
4180         gr_gk20a_blcg_gr_load_gating_prod(g, g->blcg_enabled);
4181         gr_gk20a_pg_gr_load_gating_prod(g, true);
4182
4183         if (g->elcg_enabled) {
4184                 gr_gk20a_init_elcg_mode(g, ELCG_AUTO, ENGINE_GR_GK20A);
4185                 gr_gk20a_init_elcg_mode(g, ELCG_AUTO, ENGINE_CE2_GK20A);
4186         } else {
4187                 gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_GR_GK20A);
4188                 gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_CE2_GK20A);
4189         }
4190
4191         /* Bug 1340570: increase the clock timeout to avoid potential
4192          * operation failure at high gpcclk rate. Default values are 0x400.
4193          */
4194         gk20a_writel(g, pri_ringstation_sys_master_config_r(0x15), 0x800);
4195         gk20a_writel(g, pri_ringstation_gpc_master_config_r(0xa), 0x800);
4196         gk20a_writel(g, pri_ringstation_fbp_master_config_r(0x8), 0x800);
4197
4198         /* enable fifo access */
4199         gk20a_writel(g, gr_gpfifo_ctl_r(),
4200                      gr_gpfifo_ctl_access_enabled_f() |
4201                      gr_gpfifo_ctl_semaphore_access_enabled_f());
4202
4203         /* TBD: reload gr ucode when needed */
4204
4205         /* enable interrupts */
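             /* Writing all ones to gr_intr first clears any stale pending bits
              * (assuming the usual write-1-to-clear convention for this
              * register) before every source is unmasked in gr_intr_en. */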
4206         gk20a_writel(g, gr_intr_r(), 0xFFFFFFFF);
4207         gk20a_writel(g, gr_intr_en_r(), 0xFFFFFFFF);
4208
4209         /* enable fecs error interrupts */
4210         gk20a_writel(g, gr_fecs_host_int_enable_r(),
4211                      gr_fecs_host_int_enable_fault_during_ctxsw_enable_f() |
4212                      gr_fecs_host_int_enable_umimp_firmware_method_enable_f() |
4213                      gr_fecs_host_int_enable_umimp_illegal_method_enable_f() |
4214                      gr_fecs_host_int_enable_watchdog_enable_f());
4215
4216         /* enable exceptions */
4217         gk20a_writel(g, gr_fe_hww_esr_r(),
4218                      gr_fe_hww_esr_en_enable_f() |
4219                      gr_fe_hww_esr_reset_active_f());
4220         gk20a_writel(g, gr_memfmt_hww_esr_r(),
4221                      gr_memfmt_hww_esr_en_enable_f() |
4222                      gr_memfmt_hww_esr_reset_active_f());
4223         gk20a_writel(g, gr_scc_hww_esr_r(),
4224                      gr_scc_hww_esr_en_enable_f() |
4225                      gr_scc_hww_esr_reset_active_f());
4226         gk20a_writel(g, gr_mme_hww_esr_r(),
4227                      gr_mme_hww_esr_en_enable_f() |
4228                      gr_mme_hww_esr_reset_active_f());
4229         gk20a_writel(g, gr_pd_hww_esr_r(),
4230                      gr_pd_hww_esr_en_enable_f() |
4231                      gr_pd_hww_esr_reset_active_f());
4232         gk20a_writel(g, gr_sked_hww_esr_r(), /* enabled by default */
4233                      gr_sked_hww_esr_reset_active_f());
4234         gk20a_writel(g, gr_ds_hww_esr_r(),
4235                      gr_ds_hww_esr_en_enabled_f() |
4236                      gr_ds_hww_esr_reset_task_f());
4237         gk20a_writel(g, gr_ds_hww_report_mask_r(),
4238                      gr_ds_hww_report_mask_sph0_err_report_f() |
4239                      gr_ds_hww_report_mask_sph1_err_report_f() |
4240                      gr_ds_hww_report_mask_sph2_err_report_f() |
4241                      gr_ds_hww_report_mask_sph3_err_report_f() |
4242                      gr_ds_hww_report_mask_sph4_err_report_f() |
4243                      gr_ds_hww_report_mask_sph5_err_report_f() |
4244                      gr_ds_hww_report_mask_sph6_err_report_f() |
4245                      gr_ds_hww_report_mask_sph7_err_report_f() |
4246                      gr_ds_hww_report_mask_sph8_err_report_f() |
4247                      gr_ds_hww_report_mask_sph9_err_report_f() |
4248                      gr_ds_hww_report_mask_sph10_err_report_f() |
4249                      gr_ds_hww_report_mask_sph11_err_report_f() |
4250                      gr_ds_hww_report_mask_sph12_err_report_f() |
4251                      gr_ds_hww_report_mask_sph13_err_report_f() |
4252                      gr_ds_hww_report_mask_sph14_err_report_f() |
4253                      gr_ds_hww_report_mask_sph15_err_report_f() |
4254                      gr_ds_hww_report_mask_sph16_err_report_f() |
4255                      gr_ds_hww_report_mask_sph17_err_report_f() |
4256                      gr_ds_hww_report_mask_sph18_err_report_f() |
4257                      gr_ds_hww_report_mask_sph19_err_report_f() |
4258                      gr_ds_hww_report_mask_sph20_err_report_f() |
4259                      gr_ds_hww_report_mask_sph21_err_report_f() |
4260                      gr_ds_hww_report_mask_sph22_err_report_f() |
4261                      gr_ds_hww_report_mask_sph23_err_report_f());
4262
4263         /* setup sm warp esr report masks */
4264         gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(),
4265                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_stack_error_report_f() |
4266                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_api_stack_error_report_f() |
4267                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_ret_empty_stack_error_report_f() |
4268                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_wrap_report_f() |
4269                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_pc_report_f() |
4270                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_overflow_report_f() |
4271                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_immc_addr_report_f() |
4272                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_reg_report_f() |
4273                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_encoding_report_f() |
4274                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_sph_instr_combo_report_f() |
4275                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param_report_f() |
4276                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_report_f() |
4277                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_reg_report_f() |
4278                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_addr_report_f() |
4279                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_addr_report_f() |
4280                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_addr_space_report_f() |
4281                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param2_report_f() |
4282                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_ldc_report_f() |
4283                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_geometry_sm_error_report_f() |
4284                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_divergent_report_f());
4285
4286         /* setup sm global esr report mask */
4287         gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(),
4288                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_sm_to_sm_fault_report_f() |
4289                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_l1_error_report_f() |
4290                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_multiple_warp_errors_report_f() |
4291                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_physical_stack_overflow_error_report_f() |
4292                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_int_report_f() |
4293                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_pause_report_f() |
4294                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_single_step_complete_report_f());
4295
4296         /* enable per GPC exceptions */
4297         gk20a_gr_enable_gpc_exceptions(g);
4298
4299         /* TBD: ECC for L1/SM */
4300         /* TBD: enable per BE exceptions */
4301
4302         /* reset and enable all exceptions */
4303         gk20a_writel(g, gr_exception_r(), 0xFFFFFFFF);
4304         gk20a_writel(g, gr_exception_en_r(), 0xFFFFFFFF);
4305         gk20a_writel(g, gr_exception1_r(), 0xFFFFFFFF);
4306         gk20a_writel(g, gr_exception1_en_r(), 0xFFFFFFFF);
4307         gk20a_writel(g, gr_exception2_r(), 0xFFFFFFFF);
4308         gk20a_writel(g, gr_exception2_en_r(), 0xFFFFFFFF);
4309
4310         /* ignore status from some units */
4311         data = gk20a_readl(g, gr_status_mask_r());
4312         gk20a_writel(g, gr_status_mask_r(), data & gr->status_disable_mask);
4313
4314         gr_gk20a_init_zbc(g, gr);
4315
4316         {
4317                 u64 compbit_base_post_divide64 = (gr->compbit_store.base_pa >>
4318                                 ltc_ltcs_ltss_cbc_base_alignment_shift_v());
4319                 do_div(compbit_base_post_divide64, gr->num_fbps);
4320                 compbit_base_post_divide = u64_lo32(compbit_base_post_divide64);
4321         }
4322
4323         compbit_base_post_multiply64 = ((u64)compbit_base_post_divide *
4324                 gr->num_fbps) << ltc_ltcs_ltss_cbc_base_alignment_shift_v();
4325
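             /* Multiplying the divided value back out detects whether the
              * divide by num_fbps truncated; if it did, bumping the result by
              * one rounds the CBC base up so it never points below the real
              * compbit store. */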
4326         if (compbit_base_post_multiply64 < gr->compbit_store.base_pa)
4327                 compbit_base_post_divide++;
4328
4329         gk20a_writel(g, ltc_ltcs_ltss_cbc_base_r(),
4330                 compbit_base_post_divide);
4331
4332         nvhost_dbg(dbg_info | dbg_map | dbg_pte,
4333                    "compbit base.pa: 0x%x,%08x cbc_base:0x%08x\n",
4334                    (u32)(gr->compbit_store.base_pa>>32),
4335                    (u32)(gr->compbit_store.base_pa & 0xffffffff),
4336                    compbit_base_post_divide);
4337
4338         /* load ctx init */
4339         for (i = 0; i < sw_ctx_load->count; i++)
4340                 gk20a_writel(g, sw_ctx_load->l[i].addr,
4341                              sw_ctx_load->l[i].value);
4342
4343         err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4344         if (err)
4345                 goto out;
4346
4347         /* save and disable fe_go_idle */
4348         fe_go_idle_timeout_save =
4349                 gk20a_readl(g, gr_fe_go_idle_timeout_r());
4350         gk20a_writel(g, gr_fe_go_idle_timeout_r(),
4351                 (fe_go_idle_timeout_save & gr_fe_go_idle_timeout_count_f(0)) |
4352                 gr_fe_go_idle_timeout_count_disabled_f());
4353
4354         /* override a few ctx state registers */
4355         gr_gk20a_commit_global_cb_manager(g, NULL, false);
4356         gr_gk20a_commit_global_timeslice(g, NULL, false);
4357
4358         /* floorsweep anything left */
4359         gr_gk20a_ctx_state_floorsweep(g);
4360
4361         err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4362         if (err)
4363                 goto restore_fe_go_idle;
4364
4365         /* enable pipe mode override */
4366         gk20a_writel(g, gr_pipe_bundle_config_r(),
4367                 gr_pipe_bundle_config_override_pipe_mode_enabled_f());
4368
4369         /* load bundle init */
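             /* Each bundle is issued by writing its data word and then its
              * address; the data register is only rewritten when the value
              * changes, since consecutive bundles often share the same data.
              * A GO_IDLE bundle additionally waits for the pipe to drain. */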
4370         err = 0;
4371         for (i = 0; i < sw_bundle_init->count; i++) {
4372
4373                 if (i == 0 || last_bundle_data != sw_bundle_init->l[i].value) {
4374                         gk20a_writel(g, gr_pipe_bundle_data_r(),
4375                                 sw_bundle_init->l[i].value);
4376                         last_bundle_data = sw_bundle_init->l[i].value;
4377                 }
4378
4379                 gk20a_writel(g, gr_pipe_bundle_address_r(),
4380                              sw_bundle_init->l[i].addr);
4381
4382                 if (gr_pipe_bundle_address_value_v(sw_bundle_init->l[i].addr) ==
4383                     GR_GO_IDLE_BUNDLE)
4384                         err |= gr_gk20a_wait_idle(g, end_jiffies,
4385                                         GR_IDLE_CHECK_DEFAULT);
4386                 else if (0) { /* IS_SILICON */
4387                         u32 delay = GR_IDLE_CHECK_DEFAULT;
4388                         do {
4389                                 u32 gr_status = gk20a_readl(g, gr_status_r());
4390
4391                                 if (gr_status_fe_method_lower_v(gr_status) ==
4392                                     gr_status_fe_method_lower_idle_v())
4393                                         break;
4394
4395                                 usleep_range(delay, delay * 2);
4396                                 delay = min_t(u32, delay << 1,
4397                                         GR_IDLE_CHECK_MAX);
4398
4399                         } while (time_before(jiffies, end_jiffies) ||
4400                                         !tegra_platform_is_silicon());
4401                 }
4402         }
4403
4404         /* disable pipe mode override */
4405         gk20a_writel(g, gr_pipe_bundle_config_r(),
4406                      gr_pipe_bundle_config_override_pipe_mode_disabled_f());
4407
4408 restore_fe_go_idle:
4409         /* restore fe_go_idle */
4410         gk20a_writel(g, gr_fe_go_idle_timeout_r(), fe_go_idle_timeout_save);
4411
4412         if (err || gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT))
4413                 goto out;
4414
4415         /* load method init */
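             /* Methods are loaded into the MME shadow RAM by latching the value
              * into shadow_raw_data and then writing the method address plus
              * the write trigger to shadow_raw_index; as with the bundles
              * above, the data write is skipped when the value is unchanged. */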
4416         if (sw_method_init->count) {
4417                 gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
4418                              sw_method_init->l[0].value);
4419                 gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
4420                              gr_pri_mme_shadow_raw_index_write_trigger_f() |
4421                              sw_method_init->l[0].addr);
4422                 last_method_data = sw_method_init->l[0].value;
4423         }
4424         for (i = 1; i < sw_method_init->count; i++) {
4425                 if (sw_method_init->l[i].value != last_method_data) {
4426                         gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
4427                                 sw_method_init->l[i].value);
4428                         last_method_data = sw_method_init->l[i].value;
4429                 }
4430                 gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
4431                         gr_pri_mme_shadow_raw_index_write_trigger_f() |
4432                         sw_method_init->l[i].addr);
4433         }
4434
4435         gk20a_mm_l2_invalidate(g);
4436
4437         /* turn on cya15 bit for a default val that missed the cut */
4438         l1c_dbg_reg_val = gk20a_readl(g, gr_gpc0_tpc0_l1c_dbg_r());
4439         l1c_dbg_reg_val |= gr_gpc0_tpc0_l1c_dbg_cya15_en_f();
4440         gk20a_writel(g, gr_gpc0_tpc0_l1c_dbg_r(), l1c_dbg_reg_val);
4441
4442         err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4443         if (err)
4444                 goto out;
4445
4446 out:
4447         nvhost_dbg_fn("done");
4448         return err;
4449 }
4450
4451 static int gk20a_init_gr_prepare(struct gk20a *g)
4452 {
4453         u32 gpfifo_ctrl, pmc_en;
4454         u32 err = 0;
4455
4456         /* disable fifo access */
4457         pmc_en = gk20a_readl(g, mc_enable_r());
4458         if (pmc_en & mc_enable_pgraph_enabled_f()) {
4459                 gpfifo_ctrl = gk20a_readl(g, gr_gpfifo_ctl_r());
4460                 gpfifo_ctrl &= ~gr_gpfifo_ctl_access_enabled_f();
4461                 gk20a_writel(g, gr_gpfifo_ctl_r(), gpfifo_ctrl);
4462         }
4463
4464         /* reset gr engine */
4465         gk20a_reset(g, mc_enable_pgraph_enabled_f()
4466                         | mc_enable_blg_enabled_f()
4467                         | mc_enable_perfmon_enabled_f());
4468
4469         /* enable fifo access */
4470         gk20a_writel(g, gr_gpfifo_ctl_r(),
4471                 gr_gpfifo_ctl_access_enabled_f() |
4472                 gr_gpfifo_ctl_semaphore_access_enabled_f());
4473
4474         if (!g->gr.ctx_vars.valid) {
4475                 err = gr_gk20a_init_ctx_vars(g, &g->gr);
4476                 if (err)
4477                         nvhost_err(dev_from_gk20a(g),
4478                                 "fail to load gr init ctx");
4479         }
4480         return err;
4481 }
4482
4483 static int gk20a_init_gr_reset_enable_hw(struct gk20a *g)
4484 {
4485         struct gr_gk20a *gr = &g->gr;
4486         struct av_list_gk20a *sw_non_ctx_load = &g->gr.ctx_vars.sw_non_ctx_load;
4487         unsigned long end_jiffies = jiffies +
4488                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
4489         u32 i, err = 0;
4490
4491         nvhost_dbg_fn("");
4492
4493         /* enable interrupts */
4494         gk20a_writel(g, gr_intr_r(), ~0);
4495         gk20a_writel(g, gr_intr_en_r(), ~0);
4496
4497         /* reset ctx switch state */
4498         gr_gk20a_ctx_reset(g, 0);
4499
4500         /* clear scc ram */
4501         gk20a_writel(g, gr_scc_init_r(),
4502                 gr_scc_init_ram_trigger_f());
4503
4504         /* load non_ctx init */
4505         for (i = 0; i < sw_non_ctx_load->count; i++)
4506                 gk20a_writel(g, sw_non_ctx_load->l[i].addr,
4507                         sw_non_ctx_load->l[i].value);
4508
4509         err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4510         if (err)
4511                 goto out;
4512
4513         err = gr_gk20a_load_ctxsw_ucode(g, gr);
4514         if (err)
4515                 goto out;
4516
4517         /* this appears to query sw state, but fecs actually initializes
4518            the ramchain etc., so this is really hw init */
4519         err = gr_gk20a_init_ctx_state(g, gr);
4520         if (err)
4521                 goto out;
4522
4523 out:
4524         if (err)
4525                 nvhost_err(dev_from_gk20a(g), "fail");
4526         else
4527                 nvhost_dbg_fn("done");
4528
4529         return err;
4530 }
4531
4532 /*
4533  * XXX Merge this list with the debugger/profiler
4534  * session regops whitelists?
4535  */
4536 static u32 wl_addr_gk20a[] = {
4537         /* this list must be sorted (low to high) */
4538         0x404468, /* gr_pri_mme_max_instructions       */
4539         0x418800, /* gr_pri_gpcs_setup_debug           */
4540         0x419a04, /* gr_pri_gpcs_tpcs_tex_lod_dbg      */
4541         0x419a08, /* gr_pri_gpcs_tpcs_tex_samp_dbg     */
4542         0x419e10, /* gr_pri_gpcs_tpcs_sm_dbgr_control0 */
4543         0x419f78, /* gr_pri_gpcs_tpcs_sm_disp_ctrl     */
4544 };
4545
4546 static int gr_gk20a_init_access_map(struct gk20a *g)
4547 {
4548         struct gr_gk20a *gr = &g->gr;
4549         struct mem_handle *mem;
4550         void *data;
4551         u32 w, page, nr_pages =
4552                 DIV_ROUND_UP(gr->ctx_vars.priv_access_map_size,
4553                              PAGE_SIZE);
4554
4555         mem = gr->global_ctx_buffer[PRIV_ACCESS_MAP].ref;
4556
4557         for (page = 0; page < nr_pages; page++) {
4558                 data = nvhost_memmgr_kmap(mem, page);
4559                 if (!data) {
4560                         nvhost_err(dev_from_gk20a(g),
4561                                    "failed to map priv access map memory");
4562                         return -ENOMEM;
4563                 }
4564                 memset(data, 0x0, PAGE_SIZE);
4565
4566                 /* only acceptable while ARRAY_SIZE(wl_addr_gk20a) stays small */
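                     /* Each whitelisted register sets one bit in the priv access
                      * map: bit index = (address >> 2), map_byte is that bit / 8,
                      * map_page selects the page of the buffer holding it, and
                      * pb_idx/map_shift locate the byte and bit within that page. */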
4567                 for (w = 0; w < ARRAY_SIZE(wl_addr_gk20a); w++) {
4568                         u32 map_bit, map_byte, map_shift;
4569                         u32 map_page, pb_idx;
4570                         map_bit = wl_addr_gk20a[w] >> 2;
4571                         map_byte = map_bit >> 3;
4572                         map_page = map_byte >> PAGE_SHIFT;
4573                         if (map_page != page)
4574                                 continue;
4575                         map_shift = map_bit & 0x7; /* i.e. 0-7 */
4576                         pb_idx = (map_byte & ~PAGE_MASK);
4577                         nvhost_dbg_info(
4578                                 "access map addr:0x%x pg:%d pb:%d bit:%d",
4579                                 wl_addr_gk20a[w], map_page, pb_idx, map_shift);
4580                         ((u8 *)data)[pb_idx] |= (1 << map_shift);
4581                 }
4582                 /* uncached on cpu side, so no need to flush? */
4583                 nvhost_memmgr_kunmap(mem, page, data);
4584         }
4585
4586         return 0;
4587 }
4588
4589 static int gk20a_init_gr_setup_sw(struct gk20a *g)
4590 {
4591         struct gr_gk20a *gr = &g->gr;
4592         int err;
4593
4594         nvhost_dbg_fn("");
4595
4596         if (gr->sw_ready) {
4597                 nvhost_dbg_fn("skip init");
4598                 return 0;
4599         }
4600
4601         gr->g = g;
4602
4603         err = gr_gk20a_init_gr_config(g, gr);
4604         if (err)
4605                 goto clean_up;
4606
4607         err = gr_gk20a_init_mmu_sw(g, gr);
4608         if (err)
4609                 goto clean_up;
4610
4611         err = gr_gk20a_init_map_tiles(g, gr);
4612         if (err)
4613                 goto clean_up;
4614
4615         if (tegra_cpu_is_asim()) {
4616                 gr->max_comptag_mem = 1; /* MBs worth of comptag coverage */
4617         } else {
4618                 nvhost_dbg_info("total ram pages : %lu", totalram_pages);
4619                 gr->max_comptag_mem = totalram_pages
4620                                          >> (10 - (PAGE_SHIFT - 10));
4621         }
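             /* totalram_pages >> (10 - (PAGE_SHIFT - 10)) is just
              * totalram_pages >> (20 - PAGE_SHIFT), i.e. total system RAM in MB
              * (pages >> 8 with 4 KB pages), so comptags are sized to cover all
              * of it. */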
4622         err = gr_gk20a_init_comptag(g, gr);
4623         if (err)
4624                 goto clean_up;
4625
4626         err = gr_gk20a_init_zcull(g, gr);
4627         if (err)
4628                 goto clean_up;
4629
4630         err = gr_gk20a_alloc_global_ctx_buffers(g);
4631         if (err)
4632                 goto clean_up;
4633
4634         err = gr_gk20a_init_access_map(g);
4635         if (err)
4636                 goto clean_up;
4637
4638         mutex_init(&gr->ctx_mutex);
4639         spin_lock_init(&gr->ch_tlb_lock);
4640
4641         gr->remove_support = gk20a_remove_gr_support;
4642         gr->sw_ready = true;
4643
4644         nvhost_dbg_fn("done");
4645         return 0;
4646
4647 clean_up:
4648         nvhost_err(dev_from_gk20a(g), "fail");
4649         gk20a_remove_gr_support(gr);
4650         return err;
4651 }
4652
4653 int gk20a_init_gr_support(struct gk20a *g)
4654 {
4655         u32 err;
4656
4657         nvhost_dbg_fn("");
4658
4659         err = gk20a_init_gr_prepare(g);
4660         if (err)
4661                 return err;
4662
4663         /* this is required before gr_gk20a_init_ctx_state */
4664         mutex_init(&g->gr.fecs_mutex);
4665
4666         err = gk20a_init_gr_reset_enable_hw(g);
4667         if (err)
4668                 return err;
4669
4670         err = gk20a_init_gr_setup_sw(g);
4671         if (err)
4672                 return err;
4673
4674         err = gk20a_init_gr_setup_hw(g);
4675         if (err)
4676                 return err;
4677
4678         return 0;
4679 }
4680
4681 #define NVA297_SET_ALPHA_CIRCULAR_BUFFER_SIZE   0x02dc
4682 #define NVA297_SET_CIRCULAR_BUFFER_SIZE         0x1280
4683 #define NVA297_SET_SHADER_EXCEPTIONS            0x1528
4684 #define NVA0C0_SET_SHADER_EXCEPTIONS            0x1528
4685
4686 #define NVA297_SET_SHADER_EXCEPTIONS_ENABLE_FALSE 0
4687
4688 struct gr_isr_data {
4689         u32 addr;
4690         u32 data_lo;
4691         u32 data_hi;
4692         u32 curr_ctx;
4693         u32 chid;
4694         u32 offset;
4695         u32 sub_chan;
4696         u32 class_num;
4697 };
4698
4699 static void gk20a_gr_set_shader_exceptions(struct gk20a *g,
4700                                            struct gr_isr_data *isr_data)
4701 {
4702         u32 val;
4703
4704         nvhost_dbg_fn("");
4705
4706         if (isr_data->data_lo ==
4707             NVA297_SET_SHADER_EXCEPTIONS_ENABLE_FALSE)
4708                 val = 0;
4709         else
4710                 val = ~0;
4711
4712         gk20a_writel(g,
4713                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(),
4714                 val);
4715         gk20a_writel(g,
4716                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(),
4717                 val);
4718 }
4719
4720 static void gk20a_gr_set_circular_buffer_size(struct gk20a *g,
4721                         struct gr_isr_data *isr_data)
4722 {
4723         struct gr_gk20a *gr = &g->gr;
4724         u32 gpc_index, ppc_index, stride, val, offset;
4725         u32 cb_size = isr_data->data_lo * 4;
4726
4727         nvhost_dbg_fn("");
4728
4729         if (cb_size > gr->attrib_cb_size)
4730                 cb_size = gr->attrib_cb_size;
4731
4732         gk20a_writel(g, gr_ds_tga_constraintlogic_r(),
4733                 (gk20a_readl(g, gr_ds_tga_constraintlogic_r()) &
4734                  ~gr_ds_tga_constraintlogic_beta_cbsize_f(~0)) |
4735                  gr_ds_tga_constraintlogic_beta_cbsize_f(cb_size));
4736
4737         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4738                 stride = proj_gpc_stride_v() * gpc_index;
4739
4740                 for (ppc_index = 0; ppc_index < gr->gpc_ppc_count[gpc_index];
4741                         ppc_index++) {
4742
4743                         val = gk20a_readl(g, gr_gpc0_ppc0_cbm_cfg_r() +
4744                                 stride +
4745                                 proj_ppc_in_gpc_stride_v() * ppc_index);
4746
4747                         offset = gr_gpc0_ppc0_cbm_cfg_start_offset_v(val);
4748
4749                         val = set_field(val,
4750                                 gr_gpc0_ppc0_cbm_cfg_size_m(),
4751                                 gr_gpc0_ppc0_cbm_cfg_size_f(cb_size *
4752                                         gr->pes_tpc_count[ppc_index][gpc_index]));
4753                         val = set_field(val,
4754                                 gr_gpc0_ppc0_cbm_cfg_start_offset_m(),
4755                                 (offset + 1));
4756
4757                         gk20a_writel(g, gr_gpc0_ppc0_cbm_cfg_r() +
4758                                 stride +
4759                                 proj_ppc_in_gpc_stride_v() * ppc_index, val);
4760
4761                         val = set_field(val,
4762                                 gr_gpc0_ppc0_cbm_cfg_start_offset_m(),
4763                                 offset);
4764
4765                         gk20a_writel(g, gr_gpc0_ppc0_cbm_cfg_r() +
4766                                 stride +
4767                                 proj_ppc_in_gpc_stride_v() * ppc_index, val);
4768                 }
4769         }
4770 }
4771
4772 static void gk20a_gr_set_alpha_circular_buffer_size(struct gk20a *g,
4773                                                 struct gr_isr_data *isr_data)
4774 {
4775         struct gr_gk20a *gr = &g->gr;
4776         u32 gpc_index, ppc_index, stride, val;
4777         u32 pd_ab_max_output;
4778         u32 alpha_cb_size = isr_data->data_lo * 4;
4779
4780         nvhost_dbg_fn("");
4781         /* if (NO_ALPHA_BETA_TIMESLICE_SUPPORT_DEF)
4782                 return; */
4783
4784         if (alpha_cb_size > gr->alpha_cb_size)
4785                 alpha_cb_size = gr->alpha_cb_size;
4786
4787         gk20a_writel(g, gr_ds_tga_constraintlogic_r(),
4788                 (gk20a_readl(g, gr_ds_tga_constraintlogic_r()) &
4789                  ~gr_ds_tga_constraintlogic_alpha_cbsize_f(~0)) |
4790                  gr_ds_tga_constraintlogic_alpha_cbsize_f(alpha_cb_size));
4791
4792         pd_ab_max_output = alpha_cb_size *
4793                 gr_gpc0_ppc0_cbm_cfg_size_granularity_v() /
4794                 gr_pd_ab_dist_cfg1_max_output_granularity_v();
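             /* Convert the alpha CB size from cbm_cfg size granules into
              * pd_ab_dist_cfg1 max_output granules so the same amount of buffer
              * space is expressed in the units that register expects. */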
4795
4796         gk20a_writel(g, gr_pd_ab_dist_cfg1_r(),
4797                 gr_pd_ab_dist_cfg1_max_output_f(pd_ab_max_output));
4798
4799         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4800                 stride = proj_gpc_stride_v() * gpc_index;
4801
4802                 for (ppc_index = 0; ppc_index < gr->gpc_ppc_count[gpc_index];
4803                         ppc_index++) {
4804
4805                         val = gk20a_readl(g, gr_gpc0_ppc0_cbm_cfg2_r() +
4806                                 stride +
4807                                 proj_ppc_in_gpc_stride_v() * ppc_index);
4808
4809                         val = set_field(val, gr_gpc0_ppc0_cbm_cfg2_size_m(),
4810                                         gr_gpc0_ppc0_cbm_cfg2_size_f(alpha_cb_size *
4811                                                 gr->pes_tpc_count[ppc_index][gpc_index]));
4812
4813                         gk20a_writel(g, gr_gpc0_ppc0_cbm_cfg2_r() +
4814                                 stride +
4815                                 proj_ppc_in_gpc_stride_v() * ppc_index, val);
4816                 }
4817         }
4818 }
4819
4820 void gk20a_gr_reset(struct gk20a *g)
4821 {
4822         int err;
4823         err = gk20a_init_gr_prepare(g);
4824         BUG_ON(err);
4825         err = gk20a_init_gr_reset_enable_hw(g);
4826         BUG_ON(err);
4827         err = gk20a_init_gr_setup_hw(g);
4828         BUG_ON(err);
4829 }
4830
4831 static int gk20a_gr_handle_illegal_method(struct gk20a *g,
4832                                           struct gr_isr_data *isr_data)
4833 {
4834         nvhost_dbg_fn("");
4835
4836         if (isr_data->class_num == KEPLER_COMPUTE_A) {
4837                 switch (isr_data->offset << 2) {
4838                 case NVA0C0_SET_SHADER_EXCEPTIONS:
4839                         gk20a_gr_set_shader_exceptions(g, isr_data);
4840                         break;
4841                 default:
4842                         goto fail;
4843                 }
4844         }
4845
4846         if (isr_data->class_num == KEPLER_C) {
4847                 switch (isr_data->offset << 2) {
4848                 case NVA297_SET_SHADER_EXCEPTIONS:
4849                         gk20a_gr_set_shader_exceptions(g, isr_data);
4850                         break;
4851                 case NVA297_SET_CIRCULAR_BUFFER_SIZE:
4852                         gk20a_gr_set_circular_buffer_size(g, isr_data);
4853                         break;
4854                 case NVA297_SET_ALPHA_CIRCULAR_BUFFER_SIZE:
4855                         gk20a_gr_set_alpha_circular_buffer_size(g, isr_data);
4856                         break;
4857                 default:
4858                         goto fail;
4859                 }
4860         }
4861         return 0;
4862
4863 fail:
4864         nvhost_err(dev_from_gk20a(g), "invalid method class 0x%08x"
4865                 ", offset 0x%08x address 0x%08x\n",
4866                 isr_data->class_num, isr_data->offset, isr_data->addr);
4867         return -EINVAL;
4868 }
4869
4870 static int gk20a_gr_handle_semaphore_timeout_pending(struct gk20a *g,
4871                   struct gr_isr_data *isr_data)
4872 {
4873         struct fifo_gk20a *f = &g->fifo;
4874         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4875         nvhost_dbg_fn("");
4876         gk20a_set_error_notifier(ch->hwctx,
4877                                 NVHOST_CHANNEL_GR_SEMAPHORE_TIMEOUT);
4878         nvhost_err(dev_from_gk20a(g),
4879                    "gr semaphore timeout\n");
4880         return -EINVAL;
4881 }
4882
4883 static int gk20a_gr_intr_illegal_notify_pending(struct gk20a *g,
4884                   struct gr_isr_data *isr_data)
4885 {
4886         struct fifo_gk20a *f = &g->fifo;
4887         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4888         nvhost_dbg_fn("");
4889         gk20a_set_error_notifier(ch->hwctx,
4890                                 NVHOST_CHANNEL_GR_ILLEGAL_NOTIFY);
4891         /* This is an unrecoverable error, reset is needed */
4892         nvhost_err(dev_from_gk20a(g),
4893                    "gr illegal notify pending\n");
4894         return -EINVAL;
4895 }
4896
4897 static int gk20a_gr_handle_illegal_class(struct gk20a *g,
4898                                           struct gr_isr_data *isr_data)
4899 {
4900         struct fifo_gk20a *f = &g->fifo;
4901         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4902         nvhost_dbg_fn("");
4903         gk20a_set_error_notifier(ch->hwctx,
4904                                 NVHOST_CHANNEL_GR_ERROR_SW_NOTIFY);
4905         nvhost_err(dev_from_gk20a(g),
4906                    "invalid class 0x%08x, offset 0x%08x",
4907                    isr_data->class_num, isr_data->offset);
4908         return -EINVAL;
4909 }
4910
4911 static int gk20a_gr_handle_class_error(struct gk20a *g,
4912                                           struct gr_isr_data *isr_data)
4913 {
4914         struct fifo_gk20a *f = &g->fifo;
4915         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4916         nvhost_dbg_fn("");
4917
4918         gk20a_set_error_notifier(ch->hwctx,
4919                         NVHOST_CHANNEL_GR_ERROR_SW_NOTIFY);
4920         nvhost_err(dev_from_gk20a(g),
4921                    "class error 0x%08x, offset 0x%08x",
4922                    isr_data->class_num, isr_data->offset);
4923         return -EINVAL;
4924 }
4925
4926 static int gk20a_gr_handle_semaphore_pending(struct gk20a *g,
4927                                              struct gr_isr_data *isr_data)
4928 {
4929         struct fifo_gk20a *f = &g->fifo;
4930         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4931
4932         wake_up(&ch->semaphore_wq);
4933
4934         return 0;
4935 }
4936
4937 static int gk20a_gr_handle_notify_pending(struct gk20a *g,
4938                                           struct gr_isr_data *isr_data)
4939 {
4940         struct fifo_gk20a *f = &g->fifo;
4941         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4942
4943 #if defined(CONFIG_GK20A_CYCLE_STATS)
4944         void *virtual_address;
4945         u32 buffer_size;
4946         u32 offset;
4947         u32 new_offset;
4948         bool exit;
4949         struct share_buffer_head *sh_hdr;
4950         u32 raw_reg;
4951         u64 mask_orig;
4952         u64 v = 0;
4953         struct gk20a_cyclestate_buffer_elem *op_elem;
4954         /* GL will never use payload 0 for cycle state */
4955         if ((ch->cyclestate.cyclestate_buffer == NULL) || (isr_data->data_lo == 0))
4956                 return 0;
4957
4958         mutex_lock(&ch->cyclestate.cyclestate_buffer_mutex);
4959
4960         virtual_address = ch->cyclestate.cyclestate_buffer;
4961         buffer_size = ch->cyclestate.cyclestate_buffer_size;
4962         offset = isr_data->data_lo;
4963         exit = false;
4964         while (!exit) {
4965                 if (offset >= buffer_size) {
4966                         WARN_ON(1);
4967                         break;
4968                 }
4969
4970                 sh_hdr = (struct share_buffer_head *)
4971                         ((char *)virtual_address + offset);
4972
4973                 if (sh_hdr->size < sizeof(struct share_buffer_head)) {
4974                         WARN_ON(1);
4975                         break;
4976                 }
4977                 new_offset = offset + sh_hdr->size;
4978
4979                 switch (sh_hdr->operation) {
4980                 case OP_END:
4981                         exit = true;
4982                         break;
4983
4984                 case BAR0_READ32:
4985                 case BAR0_WRITE32:
4986                 {
4987                         op_elem =
4988                                 (struct gk20a_cyclestate_buffer_elem *)
4989                                         sh_hdr;
4990                         if (op_elem->offset_bar0 <
4991                                 resource_size(g->reg_mem)) {
4992                                 mask_orig =
4993                                         ((1ULL <<
4994                                         (op_elem->last_bit + 1))
4995                                         -1)&~((1ULL <<
4996                                         op_elem->first_bit)-1);
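                                     /* mask_orig spans bits first_bit..last_bit
                                      * inclusive: all ones up through last_bit
                                      * minus all ones below first_bit. */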
4997
4998                                 raw_reg =
4999                                         gk20a_readl(g,
5000                                                 op_elem->offset_bar0);
5001
5002                                 switch (sh_hdr->operation) {
5003                                 case BAR0_READ32:
5004                                         op_elem->data =
5005                                         (raw_reg & mask_orig)
5006                                                 >> op_elem->first_bit;
5007                                         break;
5008
5009                                 case BAR0_WRITE32:
5010                                         v = 0;
5011                                         if ((unsigned int)mask_orig !=
5012                                         (unsigned int)~0) {
5013                                                 v = (unsigned int)
5014                                                         (raw_reg & ~mask_orig);
5015                                         }
5016
5017                                         v |= ((op_elem->data
5018                                                 << op_elem->first_bit)
5019                                                 & mask_orig);
5020
5021                                         gk20a_writel(g,
5022                                                 op_elem->offset_bar0,
5023                                                 (unsigned int)v);
5024                                         break;
5025
5026                                 default:
5027                                         break;
5028                                 }
5029                         } else {
5030                                 sh_hdr->failed = true;
5031                                 WARN_ON(1);
5032                         }
5033                 }
5034                 break;
5035                 default:
5036                 /* no operation content case */
5037                         exit = true;
5038                         break;
5039                 }
5040                 sh_hdr->completed = true;
5041                 offset = new_offset;
5042         }
5043         mutex_unlock(&ch->cyclestate.cyclestate_buffer_mutex);
5044 #endif
5045         nvhost_dbg_fn("");
5046         wake_up(&ch->notifier_wq);
5047         return 0;
5048 }
5049
5050 /* Used by the sw interrupt thread to translate the current ctx to a chid.
5051  * For performance, we don't want to walk all 128 channels every time.
5052  * A small tlb is used here to cache translations. */
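     /* The cache is searched linearly; on a miss the channel list is walked,
      * comparing each in-use channel's instance block base against the
      * current-ctx pointer, and the result is inserted into a free slot or,
      * if none is free, into the slot at channel_tlb_flush_index, which
      * advances round-robin. */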
5053 static int gk20a_gr_get_chid_from_ctx(struct gk20a *g, u32 curr_ctx)
5054 {
5055         struct fifo_gk20a *f = &g->fifo;
5056         struct gr_gk20a *gr = &g->gr;
5057         u32 chid = -1;
5058         u32 i;
5059
5060         spin_lock(&gr->ch_tlb_lock);
5061
5062         /* check cache first */
5063         for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) {
5064                 if (gr->chid_tlb[i].curr_ctx == curr_ctx) {
5065                         chid = gr->chid_tlb[i].hw_chid;
5066                         goto unlock;
5067                 }
5068         }
5069
5070         /* slow path */
5071         for (chid = 0; chid < f->num_channels; chid++)
5072                 if (f->channel[chid].in_use) {
5073                         if ((u32)(f->channel[chid].inst_block.cpu_pa >>
5074                                 ram_in_base_shift_v()) ==
5075                                 gr_fecs_current_ctx_ptr_v(curr_ctx))
5076                                 break;
5077                 }
5078
5079         if (chid >= f->num_channels) {
5080                 chid = -1;
5081                 goto unlock;
5082         }
5083
5084         /* add to free tlb entry */
5085         for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) {
5086                 if (gr->chid_tlb[i].curr_ctx == 0) {
5087                         gr->chid_tlb[i].curr_ctx = curr_ctx;
5088                         gr->chid_tlb[i].hw_chid = chid;
5089                         goto unlock;
5090                 }
5091         }
5092
5093         /* no free entry, flush one */
5094         gr->chid_tlb[gr->channel_tlb_flush_index].curr_ctx = curr_ctx;
5095         gr->chid_tlb[gr->channel_tlb_flush_index].hw_chid = chid;
5096
5097         gr->channel_tlb_flush_index =
5098                 (gr->channel_tlb_flush_index + 1) &
5099                 (GR_CHANNEL_MAP_TLB_SIZE - 1);
5100
5101 unlock:
5102         spin_unlock(&gr->ch_tlb_lock);
5103         return chid;
5104 }
5105
5106 static int gk20a_gr_lock_down_sm(struct gk20a *g, u32 global_esr_mask)
5107 {
5108         unsigned long end_jiffies = jiffies +
5109                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
5110         u32 delay = GR_IDLE_CHECK_DEFAULT;
5111         bool mmu_debug_mode_enabled = gk20a_mm_mmu_debug_mode_enabled(g);
5112         u32 dbgr_control0;
5113
5114         nvhost_dbg(dbg_intr | dbg_gpu_dbg, "locking down SM");
5115
5116         /* assert stop trigger */
5117         dbgr_control0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());
5118         dbgr_control0 |= gr_gpc0_tpc0_sm_dbgr_control0_stop_trigger_enable_f();
5119         gk20a_writel(g, gr_gpc0_tpc0_sm_dbgr_control0_r(), dbgr_control0);
5120
5121         /* wait for the sm to lock down */
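             /* Poll dbgr_status0 until the SM reports locked_down, or stop early
              * if no error outside global_esr_mask is pending anyway.  The poll
              * interval doubles from GR_IDLE_CHECK_DEFAULT up to
              * GR_IDLE_CHECK_MAX until the gr idle timeout expires. */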
5122         do {
5123                 u32 global_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r());
5124                 u32 warp_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_warp_esr_r());
5125                 u32 dbgr_status0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_status0_r());
5126                 bool locked_down =
5127                         (gr_gpc0_tpc0_sm_dbgr_status0_locked_down_v(dbgr_status0) ==
5128                          gr_gpc0_tpc0_sm_dbgr_status0_locked_down_true_v());
5129                 bool error_pending =
5130                         (gr_gpc0_tpc0_sm_hww_warp_esr_error_v(warp_esr) !=
5131                          gr_gpc0_tpc0_sm_hww_warp_esr_error_none_v()) ||
5132                         ((global_esr & ~global_esr_mask) != 0);
5133
5134                 if (locked_down || !error_pending) {
5135                         nvhost_dbg(dbg_intr | dbg_gpu_dbg, "locked down SM");
5136
5137                         /* de-assert stop trigger */
5138                         dbgr_control0 &= ~gr_gpc0_tpc0_sm_dbgr_control0_stop_trigger_enable_f();
5139                         gk20a_writel(g, gr_gpc0_tpc0_sm_dbgr_control0_r(), dbgr_control0);
5140
5141                         return 0;
5142                 }
5143
5144                 /* if an mmu fault is pending and mmu debug mode is not
5145                  * enabled, the sm will never lock down. */
5146                 if (!mmu_debug_mode_enabled && gk20a_fifo_mmu_fault_pending(g)) {
5147                         nvhost_err(dev_from_gk20a(g), "mmu fault pending, sm will"
5148                                    " never lock down!");
5149                         return -EFAULT;
5150                 }
5151
5152                 usleep_range(delay, delay * 2);
5153                 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
5154
5155         } while (time_before(jiffies, end_jiffies));
5156
5157         nvhost_err(dev_from_gk20a(g), "timed out while trying to lock down SM");
5158
5159         return -EAGAIN;
5160 }
5161
5162 bool gk20a_gr_sm_debugger_attached(struct gk20a *g)
5163 {
5164         u32 dbgr_control0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());
5165
5166         /* check if an sm debugger is attached */
5167         if (gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_v(dbgr_control0) ==
5168                         gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_on_v())
5169                 return true;
5170
5171         return false;
5172 }
5173
5174 static void gk20a_gr_clear_sm_hww(struct gk20a *g, u32 global_esr)
5175 {
5176         gk20a_writel(g, gr_gpc0_tpc0_sm_hww_global_esr_r(), global_esr);
5177
5178         /* clear the warp hww */
5179         gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_r(),
5180                         gr_gpc0_tpc0_sm_hww_warp_esr_error_none_f());
5181 }
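
/*
 * The global ESR is cleared by writing back the bits that were observed,
 * while the warp ESR is overwritten with the "no error" encoding.  The
 * intended use is read, handle, then clear, e.g.:
 *
 *        u32 esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r());
 *        // ... act on esr ...
 *        gk20a_gr_clear_sm_hww(g, esr);
 *
 * which is the sequence gk20a_gr_isr() uses for GPC exceptions below.
 */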
5182
5183 static struct channel_gk20a *
5184 channel_from_hw_chid(struct gk20a *g, u32 hw_chid)
5185 {
5186         return g->fifo.channel + hw_chid;
5187 }
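
/*
 * This is plain pointer arithmetic into the fifo channel array with no
 * range check; callers must pass a hw_chid that was validated elsewhere
 * (the SM exception path only reaches it after gk20a_gr_get_chid_from_ctx()
 * returned a valid id in gk20a_gr_isr()).  A defensive variant, assuming
 * the channel count is available as g->fifo.num_channels, would be roughly:
 *
 *        if (hw_chid >= g->fifo.num_channels)
 *                return NULL;
 *        return g->fifo.channel + hw_chid;
 */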
5188
5189 static int gk20a_gr_handle_sm_exception(struct gk20a *g,
5190                 struct gr_isr_data *isr_data)
5191 {
5192         int ret = 0;
5193         bool do_warp_sync = false;
5194         /* these three interrupts don't require locking down the SM. They can
5195          * be handled by usermode clients as they aren't fatal. Additionally,
5196          * usermode clients may wish to allow some warps to execute while others
5197          * are at breakpoints, as opposed to fatal errors where all warps should
5198          * halt. */
5199         u32 global_mask = gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f()   |
5200                           gr_gpc0_tpc0_sm_hww_global_esr_bpt_pause_pending_f() |
5201                           gr_gpc0_tpc0_sm_hww_global_esr_single_step_complete_pending_f();
5202         u32 global_esr, warp_esr;
5203         bool sm_debugger_attached = gk20a_gr_sm_debugger_attached(g);
5204         struct channel_gk20a *fault_ch;
5205
5206         nvhost_dbg(dbg_fn | dbg_gpu_dbg, "");
5207
5208         global_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r());
5209         warp_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_warp_esr_r());
5210
5211         /* if an sm debugger is attached, disable forwarding of tpc exceptions.
5212          * the debugger will reenable exceptions after servicing them. */
5213         if (sm_debugger_attached) {
5214                 u32 tpc_exception_en = gk20a_readl(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r());
5215                 tpc_exception_en &= ~gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_enabled_f();
5216                 gk20a_writel(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r(), tpc_exception_en);
5217                 nvhost_dbg(dbg_intr | dbg_gpu_dbg, "SM debugger attached");
5218         }
5219
5220         /* if a debugger is present and an error has occurred, do a warp sync */
5221         if (sm_debugger_attached && ((warp_esr != 0) || ((global_esr & ~global_mask) != 0))) {
5222                 nvhost_dbg(dbg_intr, "warp sync needed");
5223                 do_warp_sync = true;
5224         }
5225
5226         if (do_warp_sync) {
5227                 ret = gk20a_gr_lock_down_sm(g, global_mask);
5228                 if (ret) {
5229                         nvhost_err(dev_from_gk20a(g), "sm did not lock down!\n");
5230                         return ret;
5231                 }
5232         }
5233
5234         /* finally, signal any client waiting on an event */
5235         fault_ch = channel_from_hw_chid(g, isr_data->chid);
5236         if (fault_ch)
5237                 gk20a_dbg_gpu_post_events(fault_ch);
5238
5239         return ret;
5240 }
5241
5242 static int gk20a_gr_handle_tpc_exception(struct gk20a *g,
5243                 struct gr_isr_data *isr_data)
5244 {
5245         int ret = 0;
5246         u32 tpc_exception = gk20a_readl(g, gr_gpcs_tpcs_tpccs_tpc_exception_r());
5247
5248         nvhost_dbg(dbg_intr | dbg_gpu_dbg, "");
5249
5250         /* check if an sm exception is pending */
5251         if (gr_gpcs_tpcs_tpccs_tpc_exception_sm_v(tpc_exception) ==
5252                         gr_gpcs_tpcs_tpccs_tpc_exception_sm_pending_v()) {
5253                 nvhost_dbg(dbg_intr | dbg_gpu_dbg, "SM exception pending");
5254                 ret = gk20a_gr_handle_sm_exception(g, isr_data);
5255         }
5256
5257         return ret;
5258 }
5259
5260 static int gk20a_gr_handle_gpc_exception(struct gk20a *g,
5261                 struct gr_isr_data *isr_data)
5262 {
5263         int ret = 0;
5264         u32 gpc_exception = gk20a_readl(g, gr_gpcs_gpccs_gpc_exception_r());
5265
5266         nvhost_dbg(dbg_intr | dbg_gpu_dbg, "");
5267
5268         /* check if tpc 0 has an exception */
5269         if (gr_gpcs_gpccs_gpc_exception_tpc_v(gpc_exception) ==
5270                         gr_gpcs_gpccs_gpc_exception_tpc_0_pending_v()) {
5271                 nvhost_dbg(dbg_intr | dbg_gpu_dbg, "TPC exception pending");
5272                 ret = gk20a_gr_handle_tpc_exception(g, isr_data);
5273         }
5274
5275         return ret;
5276 }
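
/*
 * The three handlers above mirror the exception hierarchy reported by the
 * hardware: the GPC-level exception register says which TPC raised an
 * exception, the TPC-level register says which unit inside the TPC did
 * (only the SM is checked here), and the SM handler finally inspects the
 * global and warp ESRs.  Only GPC 0 / TPC 0 are examined, which matches
 * GK20A's single-GPC, single-TPC configuration.
 */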
5277
5278 int gk20a_gr_isr(struct gk20a *g)
5279 {
5280         struct gr_isr_data isr_data;
5281         u32 grfifo_ctl;
5282         u32 obj_table;
5283         int need_reset = 0;
5284         u32 gr_intr = gk20a_readl(g, gr_intr_r());
5285
5286         nvhost_dbg_fn("");
5287         nvhost_dbg(dbg_intr, "pgraph intr %08x", gr_intr);
5288
5289         if (!gr_intr)
5290                 return 0;
5291
5292         grfifo_ctl = gk20a_readl(g, gr_gpfifo_ctl_r());
5293         grfifo_ctl &= ~gr_gpfifo_ctl_semaphore_access_f(1);
5294         grfifo_ctl &= ~gr_gpfifo_ctl_access_f(1);
5295
5296         gk20a_writel(g, gr_gpfifo_ctl_r(),
5297                 grfifo_ctl | gr_gpfifo_ctl_access_f(0) |
5298                 gr_gpfifo_ctl_semaphore_access_f(0));
5299
5300         isr_data.addr = gk20a_readl(g, gr_trapped_addr_r());
5301         isr_data.data_lo = gk20a_readl(g, gr_trapped_data_lo_r());
5302         isr_data.data_hi = gk20a_readl(g, gr_trapped_data_hi_r());
5303         isr_data.curr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r());
5304         isr_data.offset = gr_trapped_addr_mthd_v(isr_data.addr);
5305         isr_data.sub_chan = gr_trapped_addr_subch_v(isr_data.addr);
5306         obj_table = gk20a_readl(g,
5307                 gr_fe_object_table_r(isr_data.sub_chan));
5308         isr_data.class_num = gr_fe_object_table_nvclass_v(obj_table);
5309
5310         isr_data.chid =
5311                 gk20a_gr_get_chid_from_ctx(g, isr_data.curr_ctx);
5312         if (isr_data.chid == -1) {
5313                 nvhost_err(dev_from_gk20a(g), "invalid channel ctx 0x%08x",
5314                            isr_data.curr_ctx);
5315                 goto clean_up;
5316         }
5317
5318         nvhost_dbg(dbg_intr | dbg_gpu_dbg,
5319                 "channel %d: addr 0x%08x, "
5320                 "data 0x%08x 0x%08x,"
5321                 "ctx 0x%08x, offset 0x%08x, "
5322                 "subchannel 0x%08x, class 0x%08x",
5323                 isr_data.chid, isr_data.addr,
5324                 isr_data.data_hi, isr_data.data_lo,
5325                 isr_data.curr_ctx, isr_data.offset,
5326                 isr_data.sub_chan, isr_data.class_num);
5327
5328         if (gr_intr & gr_intr_notify_pending_f()) {
5329                 gk20a_gr_handle_notify_pending(g, &isr_data);
5330                 gk20a_writel(g, gr_intr_r(),
5331                         gr_intr_notify_reset_f());
5332                 gr_intr &= ~gr_intr_notify_pending_f();
5333         }
5334
5335         if (gr_intr & gr_intr_semaphore_pending_f()) {
5336                 gk20a_gr_handle_semaphore_pending(g, &isr_data);
5337                 gk20a_writel(g, gr_intr_r(),
5338                         gr_intr_semaphore_reset_f());
5339                 gr_intr &= ~gr_intr_semaphore_pending_f();
5340         }
5341
5342         if (gr_intr & gr_intr_semaphore_timeout_pending_f()) {
5343                 need_reset |= gk20a_gr_handle_semaphore_timeout_pending(g,
5344                         &isr_data);
5345                 gk20a_writel(g, gr_intr_r(),
5346                         gr_intr_semaphore_reset_f());
5347                 gr_intr &= ~gr_intr_semaphore_timeout_pending_f();
5348         }
5349
5350         if (gr_intr & gr_intr_illegal_notify_pending_f()) {
5351                 need_reset |= gk20a_gr_intr_illegal_notify_pending(g,
5352                         &isr_data);
5353                 gk20a_writel(g, gr_intr_r(),
5354                         gr_intr_illegal_notify_reset_f());
5355                 gr_intr &= ~gr_intr_illegal_notify_pending_f();
5356         }
5357
5358         if (gr_intr & gr_intr_illegal_method_pending_f()) {
5359                 need_reset |= gk20a_gr_handle_illegal_method(g, &isr_data);
5360                 gk20a_writel(g, gr_intr_r(),
5361                         gr_intr_illegal_method_reset_f());
5362                 gr_intr &= ~gr_intr_illegal_method_pending_f();
5363         }
5364
5365         if (gr_intr & gr_intr_illegal_class_pending_f()) {
5366                 need_reset |= gk20a_gr_handle_illegal_class(g, &isr_data);
5367                 gk20a_writel(g, gr_intr_r(),
5368                         gr_intr_illegal_class_reset_f());
5369                 gr_intr &= ~gr_intr_illegal_class_pending_f();
5370         }
5371
5372         if (gr_intr & gr_intr_class_error_pending_f()) {
5373                 need_reset |= gk20a_gr_handle_class_error(g, &isr_data);
5374                 gk20a_writel(g, gr_intr_r(),
5375                         gr_intr_class_error_reset_f());
5376                 gr_intr &= ~gr_intr_class_error_pending_f();
5377         }
5378
5379         /* this one happens if someone tries to hit a non-whitelisted
5380          * register using set_falcon[4] */
5381         if (gr_intr & gr_intr_firmware_method_pending_f()) {
5382                 need_reset |= true;
5383                 nvhost_dbg(dbg_intr | dbg_gpu_dbg, "firmware method intr pending\n");
5384                 gk20a_writel(g, gr_intr_r(),
5385                         gr_intr_firmware_method_reset_f());
5386                 gr_intr &= ~gr_intr_firmware_method_pending_f();
5387         }
5388
5389         if (gr_intr & gr_intr_exception_pending_f()) {
5390                 u32 exception = gk20a_readl(g, gr_exception_r());
5391                 struct fifo_gk20a *f = &g->fifo;
5392                 struct channel_gk20a *ch = &f->channel[isr_data.chid];
5393
5394                 nvhost_dbg(dbg_intr | dbg_gpu_dbg, "exception %08x\n", exception);
5395
5396                 if (exception & gr_exception_fe_m()) {
5397                         u32 fe = gk20a_readl(g, gr_fe_hww_esr_r());
5398                         nvhost_dbg(dbg_intr, "fe warning %08x\n", fe);
5399                         gk20a_writel(g, gr_fe_hww_esr_r(), fe);
5400                 }
5401
5402                 /* check if a gpc exception has occurred */
5403                 if ((exception & gr_exception_gpc_m()) && need_reset == 0) {
5404                         u32 exception1 = gk20a_readl(g, gr_exception1_r());
5405                         u32 global_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r());
5406
5407                         nvhost_dbg(dbg_intr | dbg_gpu_dbg, "GPC exception pending");
5408
5409                         /* if no sm debugger is present, clean up the channel */
5410                         if (!gk20a_gr_sm_debugger_attached(g)) {
5411                                 nvhost_dbg(dbg_intr | dbg_gpu_dbg,
5412                                            "SM debugger not attached, clearing interrupt");
5413                                 need_reset |= -EFAULT;
5414                         } else {
5416                                 /* check if gpc 0 has an exception */
5417                                 if (exception1 & gr_exception1_gpc_0_pending_f())
5418                                         need_reset |= gk20a_gr_handle_gpc_exception(g, &isr_data);
5419                                 /* clear the hwws, also causes tpc and gpc
5420                                  * exceptions to be cleared */
5421                                 gk20a_gr_clear_sm_hww(g, global_esr);
5422                         }
5423
5424                         if (need_reset)
5425                                 gk20a_set_error_notifier(ch,
5426                                         NVHOST_CHANNEL_GR_ERROR_SW_NOTIFY);
5427                 }
5428
5429                 gk20a_writel(g, gr_intr_r(), gr_intr_exception_reset_f());
5430                 gr_intr &= ~gr_intr_exception_pending_f();
5431         }
5432
5433         if (need_reset)
5434                 gk20a_fifo_recover(g, BIT(ENGINE_GR_GK20A), true);
5435
5436 clean_up:
5437         gk20a_writel(g, gr_gpfifo_ctl_r(),
5438                 grfifo_ctl | gr_gpfifo_ctl_access_f(1) |
5439                 gr_gpfifo_ctl_semaphore_access_f(1));
5440
5441         if (gr_intr)
5442                 nvhost_err(dev_from_gk20a(g),
5443                            "unhandled gr interrupt 0x%08x", gr_intr);
5444
5445         return 0;
5446 }
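
/*
 * Each interrupt source handled above follows the same three-step pattern,
 * which is what keeps the final "unhandled gr interrupt" message honest:
 *
 *        if (gr_intr & gr_intr_<source>_pending_f()) {
 *                // 1) service the event
 *                // 2) ack it in hardware
 *                gk20a_writel(g, gr_intr_r(), gr_intr_<source>_reset_f());
 *                // 3) drop it from the local copy so it is not reported
 *                //    as unhandled at the end of the ISR
 *                gr_intr &= ~gr_intr_<source>_pending_f();
 *        }
 *
 * The <source> placeholder is illustrative only; the real accessors come
 * from hw_gr_gk20a.h as used above.
 */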
5447
5448 int gk20a_gr_nonstall_isr(struct gk20a *g)
5449 {
5450         u32 gr_intr = gk20a_readl(g, gr_intr_nonstall_r());
5451         u32 clear_intr = 0;
5452
5453         nvhost_dbg(dbg_intr, "pgraph nonstall intr %08x", gr_intr);
5454
5455         if (gr_intr & gr_intr_nonstall_trap_pending_f()) {
5456                 gk20a_channel_semaphore_wakeup(g);
5457                 clear_intr |= gr_intr_nonstall_trap_pending_f();
5458         }
5459
5460         gk20a_writel(g, gr_intr_nonstall_r(), clear_intr);
5461
5462         return 0;
5463 }
5464
5465 int gr_gk20a_fecs_get_reglist_img_size(struct gk20a *g, u32 *size)
5466 {
5467         BUG_ON(size == NULL);
5468         return gr_gk20a_submit_fecs_method_op(g,
5469                    (struct fecs_method_op_gk20a) {
5470                            .mailbox.id = 0,
5471                            .mailbox.data = 0,
5472                            .mailbox.clr = ~0,
5473                            .method.data = 1,
5474                            .method.addr = gr_fecs_method_push_adr_discover_reglist_image_size_v(),
5475                            .mailbox.ret = size,
5476                            .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL,
5477                            .mailbox.ok = 0,
5478                            .cond.fail = GR_IS_UCODE_OP_SKIP,
5479                            .mailbox.fail = 0});
5480 }
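
/*
 * This is a blocking FECS method: gr_gk20a_submit_fecs_method_op() is
 * expected to poll mailbox 0 until the ucode reports a non-zero size (or
 * the wait gives up), and the result lands in *size.  Minimal usage sketch:
 *
 *        u32 size = 0;
 *        int err = gr_gk20a_fecs_get_reglist_img_size(g, &size);
 *        if (err)
 *                return err;
 *        // size now holds the reglist image size reported by FECS
 */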
5481
5482 int gr_gk20a_fecs_set_reglist_bind_inst(struct gk20a *g, phys_addr_t addr)
5483 {
5484         return gr_gk20a_submit_fecs_method_op(g,
5485                    (struct fecs_method_op_gk20a){
5486                            .mailbox.id = 4,
5487                            .mailbox.data = (gr_fecs_current_ctx_ptr_f(addr >> 12) |
5488                                             gr_fecs_current_ctx_valid_f(1) |
5489                                             gr_fecs_current_ctx_target_vid_mem_f()),
5490                            .mailbox.clr = ~0,
5491                            .method.data = 1,
5492                            .method.addr = gr_fecs_method_push_adr_set_reglist_bind_instance_v(),
5493                            .mailbox.ret = NULL,
5494                            .cond.ok = GR_IS_UCODE_OP_EQUAL,
5495                            .mailbox.ok = 1,
5496                            .cond.fail = GR_IS_UCODE_OP_SKIP,
5497                            .mailbox.fail = 0});
5498 }
5499
5500 int gr_gk20a_fecs_set_reglist_virual_addr(struct gk20a *g, u64 pmu_va)
5501 {
5502         return gr_gk20a_submit_fecs_method_op(g,
5503                    (struct fecs_method_op_gk20a) {
5504                            .mailbox.id = 4,
5505                            .mailbox.data = u64_lo32(pmu_va >> 8),
5506                            .mailbox.clr = ~0,
5507                            .method.data = 1,
5508                            .method.addr = gr_fecs_method_push_adr_set_reglist_virtual_address_v(),
5509                            .mailbox.ret = NULL,
5510                            .cond.ok = GR_IS_UCODE_OP_EQUAL,
5511                            .mailbox.ok = 1,
5512                            .cond.fail = GR_IS_UCODE_OP_SKIP,
5513                            .mailbox.fail = 0});
5514 }
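
/*
 * These three reglist helpers are meant to be used together when the PMU
 * save/restore list is set up: query the image size, bind the instance
 * block, then hand FECS the PMU virtual address of the buffer.  The shifts
 * encode the alignment the ucode expects: the instance block address is
 * passed in 4 KB units (addr >> 12) and the virtual address in 256-byte
 * units (pmu_va >> 8), so both are assumed to be suitably aligned.  A rough
 * sketch, with error handling omitted and inst_block_addr/pmu_va standing
 * in for values owned by the PMU code:
 *
 *        u32 size;
 *
 *        gr_gk20a_fecs_get_reglist_img_size(g, &size);
 *        // allocate a buffer of 'size' bytes, map it for the PMU
 *        gr_gk20a_fecs_set_reglist_bind_inst(g, inst_block_addr);
 *        gr_gk20a_fecs_set_reglist_virual_addr(g, pmu_va);
 */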
5515
5516 int gk20a_gr_suspend(struct gk20a *g)
5517 {
5518         unsigned long end_jiffies = jiffies +
5519                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
5520         int ret = 0;
5521
5522         nvhost_dbg_fn("");
5523
5524         ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
5525         if (ret)
5526                 return ret;
5527
5528         gk20a_writel(g, gr_gpfifo_ctl_r(),
5529                 gr_gpfifo_ctl_access_disabled_f());
5530
5531         /* disable gr intr */
5532         gk20a_writel(g, gr_intr_r(), 0);
5533         gk20a_writel(g, gr_intr_en_r(), 0);
5534
5535         /* disable all exceptions */
5536         gk20a_writel(g, gr_exception_r(), 0);
5537         gk20a_writel(g, gr_exception_en_r(), 0);
5538         gk20a_writel(g, gr_exception1_r(), 0);
5539         gk20a_writel(g, gr_exception1_en_r(), 0);
5540         gk20a_writel(g, gr_exception2_r(), 0);
5541         gk20a_writel(g, gr_exception2_en_r(), 0);
5542
5543         gk20a_gr_flush_channel_tlb(&g->gr);
5544
5545         nvhost_dbg_fn("done");
5546         return ret;
5547 }
5548
5549 static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
5550                                                u32 addr,
5551                                                bool is_quad, u32 quad,
5552                                                u32 *context_buffer,
5553                                                u32 context_buffer_size,
5554                                                u32 *priv_offset);
5555
5556 /* This function will decode a priv address and return the partition type and numbers. */
5557 int gr_gk20a_decode_priv_addr(struct gk20a *g, u32 addr,
5558                               int  *addr_type, /* enum ctxsw_addr_type */
5559                               u32 *gpc_num, u32 *tpc_num, u32 *ppc_num, u32 *be_num,
5560                               u32 *broadcast_flags)
5561 {
5562         u32 gpc_addr;
5563         u32 ppc_address;
5564         u32 ppc_broadcast_addr;
5565
5566         nvhost_dbg(dbg_fn | dbg_gpu_dbg, "addr=0x%x", addr);
5567
5568         /* setup defaults */
5569         ppc_address = 0;
5570         ppc_broadcast_addr = 0;
5571         *addr_type = CTXSW_ADDR_TYPE_SYS;
5572         *broadcast_flags = PRI_BROADCAST_FLAGS_NONE;
5573         *gpc_num = 0;
5574         *tpc_num = 0;
5575         *ppc_num = 0;
5576         *be_num  = 0;
5577
5578         if (pri_is_gpc_addr(addr)) {
5579                 *addr_type = CTXSW_ADDR_TYPE_GPC;
5580                 gpc_addr = pri_gpccs_addr_mask(addr);
5581                 if (pri_is_gpc_addr_shared(addr)) {
5582                         *addr_type = CTXSW_ADDR_TYPE_GPC;
5583                         *broadcast_flags |= PRI_BROADCAST_FLAGS_GPC;
5584                 } else
5585                         *gpc_num = pri_get_gpc_num(addr);
5586
5587                 if (pri_is_tpc_addr(gpc_addr)) {
5588                         *addr_type = CTXSW_ADDR_TYPE_TPC;
5589                         if (pri_is_tpc_addr_shared(gpc_addr)) {
5590                                 *broadcast_flags |= PRI_BROADCAST_FLAGS_TPC;
5591                                 return 0;
5592                         }
5593                         *tpc_num = pri_get_tpc_num(gpc_addr);
5594                 }
5595                 return 0;
5596         } else if (pri_is_be_addr(addr)) {
5597                 *addr_type = CTXSW_ADDR_TYPE_BE;
5598                 if (pri_is_be_addr_shared(addr)) {
5599                         *broadcast_flags |= PRI_BROADCAST_FLAGS_BE;
5600                         return 0;
5601                 }
5602                 *be_num = pri_get_be_num(addr);
5603                 return 0;
5604         } else {
5605                 *addr_type = CTXSW_ADDR_TYPE_SYS;
5606                 return 0;
5607         }
5608         /* note: PPC addresses are never decoded here, so
5609          * PRI_BROADCAST_FLAGS_PPC is not set by this function */
5610         /*NOTREACHED*/
5611         return -EINVAL;
5612 }
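
/*
 * Worked example: a broadcast register in the GPCS/TPCS range decodes to
 * addr_type CTXSW_ADDR_TYPE_TPC with both PRI_BROADCAST_FLAGS_GPC and
 * PRI_BROADCAST_FLAGS_TPC set and the unit numbers left at zero, while the
 * corresponding GPC0/TPC0 unicast register decodes to the same addr_type
 * with broadcast_flags == PRI_BROADCAST_FLAGS_NONE.  A typical call:
 *
 *        int type, err;
 *        u32 gpc, tpc, ppc, be, flags;
 *
 *        err = gr_gk20a_decode_priv_addr(g, addr, &type,
 *                                        &gpc, &tpc, &ppc, &be, &flags);
 */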
5613
5614 static int gr_gk20a_split_ppc_broadcast_addr(struct gk20a *g, u32 addr,
5615                                       u32 gpc_num,
5616                                       u32 *priv_addr_table, u32 *t)
5617 {
5618         u32 ppc_num;
5619
5620         nvhost_dbg(dbg_fn | dbg_gpu_dbg, "addr=0x%x", addr);
5621
5622         for (ppc_num = 0; ppc_num < g->gr.pe_count_per_gpc; ppc_num++)
5623                 priv_addr_table[(*t)++] = pri_ppc_addr(pri_ppccs_addr_mask(addr),
5624                                                        gpc_num, ppc_num);
5625
5626         return 0;
5627 }
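
/*
 * For a PPC broadcast address this adds one unicast entry per PE in the
 * given GPC and advances *t accordingly, e.g. with a hypothetical
 * pe_count_per_gpc of 2:
 *
 *        priv_addr_table[t+0] = pri_ppc_addr(pri_ppccs_addr_mask(addr), gpc_num, 0);
 *        priv_addr_table[t+1] = pri_ppc_addr(pri_ppccs_addr_mask(addr), gpc_num, 1);
 *
 * Since gr_gk20a_decode_priv_addr() never sets PRI_BROADCAST_FLAGS_PPC on
 * this chip, the PPC branches in gr_gk20a_create_priv_addr_table() below
 * appear to be unreachable on gk20a in practice.
 */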
5628
5629 /*
5630  * The context buffer is indexed using BE broadcast addresses and GPC/TPC
5631  * unicast addresses. This function will convert a BE unicast address to a BE
5632  * broadcast address and split a GPC/TPC broadcast address into a table of
5633  * GPC/TPC addresses.  The addresses generated by this function can be
5634  * successfully processed by gr_gk20a_find_priv_offset_in_buffer
5635  */
5636 static int gr_gk20a_create_priv_addr_table(struct gk20a *g,
5637                                            u32 addr,
5638                                            u32 *priv_addr_table,
5639                                            u32 *num_registers)
5640 {
5641         int addr_type; /*enum ctxsw_addr_type */
5642         u32 gpc_num, tpc_num, ppc_num, be_num;
5643         u32 broadcast_flags;
5644         u32 t;
5645         int err;
5646
5647         t = 0;
5648         *num_registers = 0;
5649
5650         nvhost_dbg(dbg_fn | dbg_gpu_dbg, "addr=0x%x", addr);
5651
5652         err = gr_gk20a_decode_priv_addr(g, addr, &addr_type,
5653                                         &gpc_num, &tpc_num, &ppc_num, &be_num,
5654                                         &broadcast_flags);
5655         nvhost_dbg(dbg_gpu_dbg, "addr_type = %d", addr_type);
5656         if (err)
5657                 return err;
5658
5659         if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
5660             (addr_type == CTXSW_ADDR_TYPE_BE)) {
5661                 /* The BE broadcast registers are included in the compressed PRI
5662                  * table. Convert a BE unicast address to a broadcast address
5663                  * so that we can look up the offset. */
5664                 if ((addr_type == CTXSW_ADDR_TYPE_BE) &&
5665                     !(broadcast_flags & PRI_BROADCAST_FLAGS_BE))
5666                         priv_addr_table[t++] = pri_be_shared_addr(addr);
5667                 else
5668                         priv_addr_table[t++] = addr;
5669
5670                 *num_registers = t;
5671                 return 0;
5672         }
5673
5674         /* The GPC/TPC unicast registers are included in the compressed PRI
5675          * tables. Convert a GPC/TPC broadcast address to unicast addresses so
5676          * that we can look up the offsets. */
5677         if (broadcast_flags & PRI_BROADCAST_FLAGS_GPC) {
5678                 for (gpc_num = 0; gpc_num < g->gr.gpc_count; gpc_num++) {
5679
5680                         if (broadcast_flags & PRI_BROADCAST_FLAGS_TPC)
5681                                 for (tpc_num = 0;
5682                                      tpc_num < g->gr.gpc_tpc_count[gpc_num];
5683                                      tpc_num++)
5684                                         priv_addr_table[t++] =
5685                                                 pri_tpc_addr(pri_tpccs_addr_mask(addr),
5686                                                              gpc_num, tpc_num);
5687
5688                         else if (broadcast_flags & PRI_BROADCAST_FLAGS_PPC) {
5689                                 err = gr_gk20a_split_ppc_broadcast_addr(g, addr, gpc_num,
5690                                                                priv_addr_table, &t);
5691                                 if (err)
5692                                         return err;
5693                         } else
5694                                 priv_addr_table[t++] =
5695                                         pri_gpc_addr(pri_gpccs_addr_mask(addr),
5696                                                      gpc_num);
5697                 }
5698         } else {
5699                 if (broadcast_flags & PRI_BROADCAST_FLAGS_TPC)
5700                         for (tpc_num = 0;
5701                              tpc_num < g->gr.gpc_tpc_count[gpc_num];
5702                              tpc_num++)
5703                                 priv_addr_table[t++] =
5704                                         pri_tpc_addr(pri_tpccs_addr_mask(addr),
5705                                                      gpc_num, tpc_num);
5706                 else if (broadcast_flags & PRI_BROADCAST_FLAGS_PPC)
5707                         err = gr_gk20a_split_ppc_broadcast_addr(g, addr, gpc_num,
5708                                                        priv_addr_table, &t);
5709                 else
5710                         priv_addr_table[t++] = addr;
5711         }
5712
5713         *num_registers = t;
5714         return 0;
5715 }
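
/*
 * Sizing note: in the worst case (an address with both the GPC and TPC
 * broadcast flags) the table receives one entry per TPC of every GPC, i.e.
 * the sum of gpc_tpc_count[] over all GPCs, so callers must provide at
 * least that much room.  On a single-GPC, single-TPC configuration such as
 * GK20A, a GPCS/TPCS broadcast address therefore collapses to one unicast
 * entry:
 *
 *        u32 table[1], n = 0;
 *        int err = gr_gk20a_create_priv_addr_table(g, addr, table, &n);
 *        // on success n == 1 and table[0] is the GPC0/TPC0 unicast address
 */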
5716
5717 int gr_gk20a_get_ctx_buffer_offsets(struct gk20a *g,
5718                                     u32 addr,
5719                                     u32 max_offsets,
5720                                     u32 *offsets,