video: tegra: gk20a: do not set error notifier during debugging
1 /*
2  * drivers/video/tegra/host/gk20a/gr_gk20a.c
3  *
4  * GK20A Graphics
5  *
6  * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
7  *
8  * This program is free software; you can redistribute it and/or modify it
9  * under the terms and conditions of the GNU General Public License,
10  * version 2, as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope it will be useful, but WITHOUT
13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
15  * more details.
16  *
17  * You should have received a copy of the GNU General Public License along with
18  * this program; if not, write to the Free Software Foundation, Inc.,
19  * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
20  */
21
22 #include <linux/delay.h>        /* for udelay */
23 #include <linux/mm.h>           /* for totalram_pages */
24 #include <linux/scatterlist.h>
25 #include <linux/nvmap.h>
26 #include <linux/tegra-soc.h>
27 #include <linux/nvhost_dbg_gpu_ioctl.h>
28 #include <linux/vmalloc.h>
29 #include <linux/dma-mapping.h>
30 #include <linux/firmware.h>
31
32 #include "../dev.h"
33 #include "bus_client.h"
34
35 #include "gk20a.h"
36 #include "gr_ctx_gk20a.h"
37
38 #include "hw_ccsr_gk20a.h"
39 #include "hw_ctxsw_prog_gk20a.h"
40 #include "hw_fifo_gk20a.h"
41 #include "hw_gr_gk20a.h"
42 #include "hw_mc_gk20a.h"
43 #include "hw_ram_gk20a.h"
44 #include "hw_pri_ringmaster_gk20a.h"
45 #include "hw_pri_ringstation_sys_gk20a.h"
46 #include "hw_pri_ringstation_gpc_gk20a.h"
47 #include "hw_pri_ringstation_fbp_gk20a.h"
48 #include "hw_proj_gk20a.h"
49 #include "hw_top_gk20a.h"
50 #include "hw_ltc_gk20a.h"
51 #include "hw_fb_gk20a.h"
52 #include "hw_therm_gk20a.h"
53 #include "hw_pbdma_gk20a.h"
54 #include "chip_support.h"
55 #include "nvhost_memmgr.h"
56 #include "gk20a_gating_reglist.h"
57 #include "gr_pri_gk20a.h"
58 #include "regops_gk20a.h"
59 #include "dbg_gpu_gk20a.h"
60
61 #define BLK_SIZE (256)
62
63 static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va);
64 static int gr_gk20a_ctx_patch_write(struct gk20a *g, struct channel_ctx_gk20a *ch_ctx,
65                                     u32 addr, u32 data, bool patch);
66
67 /* global ctx buffer */
68 static int  gr_gk20a_alloc_global_ctx_buffers(struct gk20a *g);
69 static void gr_gk20a_free_global_ctx_buffers(struct gk20a *g);
70 static int  gr_gk20a_map_global_ctx_buffers(struct gk20a *g,
71                                             struct channel_gk20a *c);
72 static void gr_gk20a_unmap_global_ctx_buffers(struct channel_gk20a *c);
73
74 /* channel gr ctx buffer */
75 static int  gr_gk20a_alloc_channel_gr_ctx(struct gk20a *g,
76                                         struct channel_gk20a *c);
77 static void gr_gk20a_free_channel_gr_ctx(struct channel_gk20a *c);
78
79 /* channel patch ctx buffer */
80 static int  gr_gk20a_alloc_channel_patch_ctx(struct gk20a *g,
81                                         struct channel_gk20a *c);
82 static void gr_gk20a_free_channel_patch_ctx(struct channel_gk20a *c);
83
84 /* golden ctx image */
85 static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
86                                           struct channel_gk20a *c);
87 static int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
88                                           struct channel_gk20a *c);
89
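/*
 * Dump FECS falcon state for error diagnostics: log the directly
 * readable registers (os, cpuctl, idlestate, mailboxes, irq*, debug*,
 * engctl, curctx, nxtctx), then read IMB, DMB, CSW, CTX, EXCI and a few
 * PC/SP samples through the ICD register-read command interface.
 */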
90 void gk20a_fecs_dump_falcon_stats(struct gk20a *g)
91 {
92         int i;
93
94         nvhost_err(dev_from_gk20a(g), "gr_fecs_os_r : %d",
95                 gk20a_readl(g, gr_fecs_os_r()));
96         nvhost_err(dev_from_gk20a(g), "gr_fecs_cpuctl_r : 0x%x",
97                 gk20a_readl(g, gr_fecs_cpuctl_r()));
98         nvhost_err(dev_from_gk20a(g), "gr_fecs_idlestate_r : 0x%x",
99                 gk20a_readl(g, gr_fecs_idlestate_r()));
100         nvhost_err(dev_from_gk20a(g), "gr_fecs_mailbox0_r : 0x%x",
101                 gk20a_readl(g, gr_fecs_mailbox0_r()));
102         nvhost_err(dev_from_gk20a(g), "gr_fecs_mailbox1_r : 0x%x",
103                 gk20a_readl(g, gr_fecs_mailbox1_r()));
104         nvhost_err(dev_from_gk20a(g), "gr_fecs_irqstat_r : 0x%x",
105                 gk20a_readl(g, gr_fecs_irqstat_r()));
106         nvhost_err(dev_from_gk20a(g), "gr_fecs_irqmode_r : 0x%x",
107                 gk20a_readl(g, gr_fecs_irqmode_r()));
108         nvhost_err(dev_from_gk20a(g), "gr_fecs_irqmask_r : 0x%x",
109                 gk20a_readl(g, gr_fecs_irqmask_r()));
110         nvhost_err(dev_from_gk20a(g), "gr_fecs_irqdest_r : 0x%x",
111                 gk20a_readl(g, gr_fecs_irqdest_r()));
112         nvhost_err(dev_from_gk20a(g), "gr_fecs_debug1_r : 0x%x",
113                 gk20a_readl(g, gr_fecs_debug1_r()));
114         nvhost_err(dev_from_gk20a(g), "gr_fecs_debuginfo_r : 0x%x",
115                 gk20a_readl(g, gr_fecs_debuginfo_r()));
116
117         for (i = 0; i < gr_fecs_ctxsw_mailbox__size_1_v(); i++)
118                 nvhost_err(dev_from_gk20a(g), "gr_fecs_ctxsw_mailbox_r(%d) : 0x%x",
119                         i, gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(i)));
120
121         nvhost_err(dev_from_gk20a(g), "gr_fecs_engctl_r : 0x%x",
122                 gk20a_readl(g, gr_fecs_engctl_r()));
123         nvhost_err(dev_from_gk20a(g), "gr_fecs_curctx_r : 0x%x",
124                 gk20a_readl(g, gr_fecs_curctx_r()));
125         nvhost_err(dev_from_gk20a(g), "gr_fecs_nxtctx_r : 0x%x",
126                 gk20a_readl(g, gr_fecs_nxtctx_r()));
127
128         gk20a_writel(g, gr_fecs_icd_cmd_r(),
129                 gr_fecs_icd_cmd_opc_rreg_f() |
130                 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_IMB));
131         nvhost_err(dev_from_gk20a(g), "FECS_FALCON_REG_IMB : 0x%x",
132                 gk20a_readl(g, gr_fecs_icd_rdata_r()));
133
134         gk20a_writel(g, gr_fecs_icd_cmd_r(),
135                 gr_fecs_icd_cmd_opc_rreg_f() |
136                 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_DMB));
137         nvhost_err(dev_from_gk20a(g), "FECS_FALCON_REG_DMB : 0x%x",
138                 gk20a_readl(g, gr_fecs_icd_rdata_r()));
139
140         gk20a_writel(g, gr_fecs_icd_cmd_r(),
141                 gr_fecs_icd_cmd_opc_rreg_f() |
142                 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_CSW));
143         nvhost_err(dev_from_gk20a(g), "FECS_FALCON_REG_CSW : 0x%x",
144                 gk20a_readl(g, gr_fecs_icd_rdata_r()));
145
146         gk20a_writel(g, gr_fecs_icd_cmd_r(),
147                 gr_fecs_icd_cmd_opc_rreg_f() |
148                 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_CTX));
149         nvhost_err(dev_from_gk20a(g), "FECS_FALCON_REG_CTX : 0x%x",
150                 gk20a_readl(g, gr_fecs_icd_rdata_r()));
151
152         gk20a_writel(g, gr_fecs_icd_cmd_r(),
153                 gr_fecs_icd_cmd_opc_rreg_f() |
154                 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_EXCI));
155         nvhost_err(dev_from_gk20a(g), "FECS_FALCON_REG_EXCI : 0x%x",
156                 gk20a_readl(g, gr_fecs_icd_rdata_r()));
157
158         for (i = 0; i < 4; i++) {
159                 gk20a_writel(g, gr_fecs_icd_cmd_r(),
160                         gr_fecs_icd_cmd_opc_rreg_f() |
161                         gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_PC));
162                 nvhost_err(dev_from_gk20a(g), "FECS_FALCON_REG_PC : 0x%x",
163                         gk20a_readl(g, gr_fecs_icd_rdata_r()));
164
165                 gk20a_writel(g, gr_fecs_icd_cmd_r(),
166                         gr_fecs_icd_cmd_opc_rreg_f() |
167                         gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_SP));
168                 nvhost_err(dev_from_gk20a(g), "FECS_FALCON_REG_SP : 0x%x",
169                         gk20a_readl(g, gr_fecs_icd_rdata_r()));
170         }
171 }
172
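/*
 * Load the GPCCS and FECS ucode data segments into the falcons' DMEM
 * through the auto-incrementing dmemc/dmemd ports.  A running checksum
 * is computed but not otherwise consumed here.
 */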
173 static void gr_gk20a_load_falcon_dmem(struct gk20a *g)
174 {
175         u32 i, ucode_u32_size;
176         const u32 *ucode_u32_data;
177         u32 checksum;
178
179         nvhost_dbg_fn("");
180
181         gk20a_writel(g, gr_gpccs_dmemc_r(0), (gr_gpccs_dmemc_offs_f(0) |
182                                               gr_gpccs_dmemc_blk_f(0)  |
183                                               gr_gpccs_dmemc_aincw_f(1)));
184
185         ucode_u32_size = g->gr.ctx_vars.ucode.gpccs.data.count;
186         ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.gpccs.data.l;
187
188         for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
189                 gk20a_writel(g, gr_gpccs_dmemd_r(0), ucode_u32_data[i]);
190                 checksum += ucode_u32_data[i];
191         }
192
193         gk20a_writel(g, gr_fecs_dmemc_r(0), (gr_fecs_dmemc_offs_f(0) |
194                                              gr_fecs_dmemc_blk_f(0)  |
195                                              gr_fecs_dmemc_aincw_f(1)));
196
197         ucode_u32_size = g->gr.ctx_vars.ucode.fecs.data.count;
198         ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.fecs.data.l;
199
200         for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
201                 gk20a_writel(g, gr_fecs_dmemd_r(0), ucode_u32_data[i]);
202                 checksum += ucode_u32_data[i];
203         }
204         nvhost_dbg_fn("done");
205 }
206
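/*
 * Load the GPCCS and FECS ucode instruction segments into the falcons'
 * IMEM through the auto-incrementing imemc/imemd ports, advancing the
 * block tag every 256 bytes and zero-padding the last block(s).
 */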
207 static void gr_gk20a_load_falcon_imem(struct gk20a *g)
208 {
209         u32 cfg, fecs_imem_size, gpccs_imem_size, ucode_u32_size;
210         const u32 *ucode_u32_data;
211         u32 tag, i, pad_start, pad_end;
212         u32 checksum;
213
214         nvhost_dbg_fn("");
215
216         cfg = gk20a_readl(g, gr_fecs_cfg_r());
217         fecs_imem_size = gr_fecs_cfg_imem_sz_v(cfg);
218
219         cfg = gk20a_readl(g, gr_gpc0_cfg_r());
220         gpccs_imem_size = gr_gpc0_cfg_imem_sz_v(cfg);
221
222         /* Use the broadcast address to access all of the GPCCS units. */
223         gk20a_writel(g, gr_gpccs_imemc_r(0), (gr_gpccs_imemc_offs_f(0) |
224                                               gr_gpccs_imemc_blk_f(0) |
225                                               gr_gpccs_imemc_aincw_f(1)));
226
227         /* Setup the tags for the instruction memory. */
228         tag = 0;
229         gk20a_writel(g, gr_gpccs_imemt_r(0), gr_gpccs_imemt_tag_f(tag));
230
231         ucode_u32_size = g->gr.ctx_vars.ucode.gpccs.inst.count;
232         ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.gpccs.inst.l;
233
234         for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
235                 if (i && ((i % (256/sizeof(u32))) == 0)) {
236                         tag++;
237                         gk20a_writel(g, gr_gpccs_imemt_r(0),
238                                       gr_gpccs_imemt_tag_f(tag));
239                 }
240                 gk20a_writel(g, gr_gpccs_imemd_r(0), ucode_u32_data[i]);
241                 checksum += ucode_u32_data[i];
242         }
243
244         pad_start = i*4;
245         pad_end = pad_start+(256-pad_start%256)+256;
246         for (i = pad_start;
247              (i < gpccs_imem_size * 256) && (i < pad_end);
248              i += 4) {
249                 if (i && ((i % 256) == 0)) {
250                         tag++;
251                         gk20a_writel(g, gr_gpccs_imemt_r(0),
252                                       gr_gpccs_imemt_tag_f(tag));
253                 }
254                 gk20a_writel(g, gr_gpccs_imemd_r(0), 0);
255         }
256
257         gk20a_writel(g, gr_fecs_imemc_r(0), (gr_fecs_imemc_offs_f(0) |
258                                              gr_fecs_imemc_blk_f(0) |
259                                              gr_fecs_imemc_aincw_f(1)));
260
261         /* Setup the tags for the instruction memory. */
262         tag = 0;
263         gk20a_writel(g, gr_fecs_imemt_r(0), gr_fecs_imemt_tag_f(tag));
264
265         ucode_u32_size = g->gr.ctx_vars.ucode.fecs.inst.count;
266         ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.fecs.inst.l;
267
268         for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
269                 if (i && ((i % (256/sizeof(u32))) == 0)) {
270                         tag++;
271                         gk20a_writel(g, gr_fecs_imemt_r(0),
272                                       gr_fecs_imemt_tag_f(tag));
273                 }
274                 gk20a_writel(g, gr_fecs_imemd_r(0), ucode_u32_data[i]);
275                 checksum += ucode_u32_data[i];
276         }
277
278         pad_start = i*4;
279         pad_end = pad_start+(256-pad_start%256)+256;
280         for (i = pad_start; (i < fecs_imem_size * 256) && i < pad_end; i += 4) {
281                 if (i && ((i % 256) == 0)) {
282                         tag++;
283                         gk20a_writel(g, gr_fecs_imemt_r(0),
284                                       gr_fecs_imemt_tag_f(tag));
285                 }
286                 gk20a_writel(g, gr_fecs_imemd_r(0), 0);
287         }
288 }
289
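/*
 * Poll with exponential backoff (capped at GR_IDLE_CHECK_MAX) until
 * PGRAPH is disabled or the GR engine is idle with no context switch
 * in progress; returns -EAGAIN if end_jiffies expires first.
 */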
290 static int gr_gk20a_wait_idle(struct gk20a *g, unsigned long end_jiffies,
291                 u32 expect_delay)
292 {
293         u32 delay = expect_delay;
294         bool gr_enabled;
295         bool ctxsw_active;
296         bool gr_busy;
297
298         nvhost_dbg_fn("");
299
300         do {
301                 /* fmodel: host gets fifo_engine_status(gr) from gr
302                    only when gr_status is read */
303                 gk20a_readl(g, gr_status_r());
304
305                 gr_enabled = gk20a_readl(g, mc_enable_r()) &
306                         mc_enable_pgraph_enabled_f();
307
308                 ctxsw_active = gk20a_readl(g,
309                         fifo_engine_status_r(ENGINE_GR_GK20A)) &
310                         fifo_engine_status_ctxsw_in_progress_f();
311
312                 gr_busy = gk20a_readl(g, gr_engine_status_r()) &
313                         gr_engine_status_value_busy_f();
314
315                 if (!gr_enabled || (!gr_busy && !ctxsw_active)) {
316                         nvhost_dbg_fn("done");
317                         return 0;
318                 }
319
320                 usleep_range(delay, delay * 2);
321                 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
322
323         } while (time_before(jiffies, end_jiffies));
324
325         nvhost_err(dev_from_gk20a(g),
326                 "timeout, ctxsw busy : %d, gr busy : %d",
327                 ctxsw_active, gr_busy);
328
329         return -EAGAIN;
330 }
331
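/*
 * Pulse the FECS ctxsw reset controls: force the GR clocks on, assert
 * rst_mask (or the default context-reset set) in
 * gr_fecs_ctxsw_reset_ctl, release the resets, then return the power
 * mode to auto.
 */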
332 static int gr_gk20a_ctx_reset(struct gk20a *g, u32 rst_mask)
333 {
334         u32 delay = GR_IDLE_CHECK_DEFAULT;
335         unsigned long end_jiffies = jiffies +
336                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
337         u32 reg;
338
339         nvhost_dbg_fn("");
340
341         /* Force clocks on */
342         gk20a_writel(g, gr_fe_pwr_mode_r(),
343                      gr_fe_pwr_mode_req_send_f() |
344                      gr_fe_pwr_mode_mode_force_on_f());
345
346         /* Wait for the clocks to indicate that they are on */
347         do {
348                 reg = gk20a_readl(g, gr_fe_pwr_mode_r());
349
350                 if (gr_fe_pwr_mode_req_v(reg) == gr_fe_pwr_mode_req_done_v())
351                         break;
352
353                 usleep_range(delay, delay * 2);
354                 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
355
356         } while (time_before(jiffies, end_jiffies));
357
358         if (!time_before(jiffies, end_jiffies)) {
359                 nvhost_err(dev_from_gk20a(g),
360                            "failed to force the clocks on\n");
361                 WARN_ON(1);
362         }
363
364         if (rst_mask) {
365                 gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(), rst_mask);
366         } else {
367                 gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(),
368                              gr_fecs_ctxsw_reset_ctl_sys_halt_disabled_f() |
369                              gr_fecs_ctxsw_reset_ctl_gpc_halt_disabled_f() |
370                              gr_fecs_ctxsw_reset_ctl_be_halt_disabled_f()  |
371                              gr_fecs_ctxsw_reset_ctl_sys_engine_reset_disabled_f() |
372                              gr_fecs_ctxsw_reset_ctl_gpc_engine_reset_disabled_f() |
373                              gr_fecs_ctxsw_reset_ctl_be_engine_reset_disabled_f()  |
374                              gr_fecs_ctxsw_reset_ctl_sys_context_reset_enabled_f() |
375                              gr_fecs_ctxsw_reset_ctl_gpc_context_reset_enabled_f() |
376                              gr_fecs_ctxsw_reset_ctl_be_context_reset_enabled_f());
377         }
378
379         /* we need to read the reset register *and* wait for a moment to ensure
380          * reset propagation */
381
382         gk20a_readl(g, gr_fecs_ctxsw_reset_ctl_r());
383         udelay(20);
384
385         gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(),
386                      gr_fecs_ctxsw_reset_ctl_sys_halt_disabled_f() |
387                      gr_fecs_ctxsw_reset_ctl_gpc_halt_disabled_f() |
388                      gr_fecs_ctxsw_reset_ctl_be_halt_disabled_f()  |
389                      gr_fecs_ctxsw_reset_ctl_sys_engine_reset_disabled_f() |
390                      gr_fecs_ctxsw_reset_ctl_gpc_engine_reset_disabled_f() |
391                      gr_fecs_ctxsw_reset_ctl_be_engine_reset_disabled_f()  |
392                      gr_fecs_ctxsw_reset_ctl_sys_context_reset_disabled_f() |
393                      gr_fecs_ctxsw_reset_ctl_gpc_context_reset_disabled_f() |
394                      gr_fecs_ctxsw_reset_ctl_be_context_reset_disabled_f());
395
396         /* read back the reset register, then wait a short moment */
397         gk20a_readl(g, gr_fecs_ctxsw_reset_ctl_r());
398         udelay(20);
399
400         /* Set power mode back to auto */
401         gk20a_writel(g, gr_fe_pwr_mode_r(),
402                      gr_fe_pwr_mode_req_send_f() |
403                      gr_fe_pwr_mode_mode_auto_f());
404
405         /* Wait for the request to complete */
406         end_jiffies = jiffies + msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
407         do {
408                 reg = gk20a_readl(g, gr_fe_pwr_mode_r());
409
410                 if (gr_fe_pwr_mode_req_v(reg) == gr_fe_pwr_mode_req_done_v())
411                         break;
412
413                 usleep_range(delay, delay * 2);
414                 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
415
416         } while (time_before(jiffies, end_jiffies));
417
418         if (!time_before(jiffies, end_jiffies)) {
419                 nvhost_err(dev_from_gk20a(g),
420                            "failed to set power mode to auto\n");
421                 WARN_ON(1);
422         }
423
424         return 0;
425 }
426
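/*
 * Poll a FECS ctxsw mailbox until its value satisfies the success
 * condition (opc_success vs. mailbox_ok) or the failure condition
 * (opc_fail vs. mailbox_fail); on silicon the wait is bounded by the
 * gr idle timeout.  Falcon state is dumped on error or timeout.
 */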
427 static int gr_gk20a_ctx_wait_ucode(struct gk20a *g, u32 mailbox_id,
428                                    u32 *mailbox_ret, u32 opc_success,
429                                    u32 mailbox_ok, u32 opc_fail,
430                                    u32 mailbox_fail)
431 {
432         unsigned long end_jiffies = jiffies +
433                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
434         u32 delay = GR_IDLE_CHECK_DEFAULT;
435         u32 check = WAIT_UCODE_LOOP;
436         u32 reg;
437
438         nvhost_dbg_fn("");
439
440         while (check == WAIT_UCODE_LOOP) {
441                 if (!time_before(jiffies, end_jiffies) &&
442                                 tegra_platform_is_silicon())
443                         check = WAIT_UCODE_TIMEOUT;
444
445                 reg = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(mailbox_id));
446
447                 if (mailbox_ret)
448                         *mailbox_ret = reg;
449
450                 switch (opc_success) {
451                 case GR_IS_UCODE_OP_EQUAL:
452                         if (reg == mailbox_ok)
453                                 check = WAIT_UCODE_OK;
454                         break;
455                 case GR_IS_UCODE_OP_NOT_EQUAL:
456                         if (reg != mailbox_ok)
457                                 check = WAIT_UCODE_OK;
458                         break;
459                 case GR_IS_UCODE_OP_AND:
460                         if (reg & mailbox_ok)
461                                 check = WAIT_UCODE_OK;
462                         break;
463                 case GR_IS_UCODE_OP_LESSER:
464                         if (reg < mailbox_ok)
465                                 check = WAIT_UCODE_OK;
466                         break;
467                 case GR_IS_UCODE_OP_LESSER_EQUAL:
468                         if (reg <= mailbox_ok)
469                                 check = WAIT_UCODE_OK;
470                         break;
471                 case GR_IS_UCODE_OP_SKIP:
472                         /* no success check */
473                         break;
474                 default:
475                         nvhost_err(dev_from_gk20a(g),
476                                    "invalid success opcode 0x%x", opc_success);
477
478                         check = WAIT_UCODE_ERROR;
479                         break;
480                 }
481
482                 switch (opc_fail) {
483                 case GR_IS_UCODE_OP_EQUAL:
484                         if (reg == mailbox_fail)
485                                 check = WAIT_UCODE_ERROR;
486                         break;
487                 case GR_IS_UCODE_OP_NOT_EQUAL:
488                         if (reg != mailbox_fail)
489                                 check = WAIT_UCODE_ERROR;
490                         break;
491                 case GR_IS_UCODE_OP_AND:
492                         if (reg & mailbox_fail)
493                                 check = WAIT_UCODE_ERROR;
494                         break;
495                 case GR_IS_UCODE_OP_LESSER:
496                         if (reg < mailbox_fail)
497                                 check = WAIT_UCODE_ERROR;
498                         break;
499                 case GR_IS_UCODE_OP_LESSER_EQUAL:
500                         if (reg <= mailbox_fail)
501                                 check = WAIT_UCODE_ERROR;
502                         break;
503                 case GR_IS_UCODE_OP_SKIP:
504                         /* no check on failure */
505                         break;
506                 default:
507                         nvhost_err(dev_from_gk20a(g),
508                                    "invalid fail opcode 0x%x", opc_fail);
509                         check = WAIT_UCODE_ERROR;
510                         break;
511                 }
512
513                 usleep_range(delay, delay * 2);
514                 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
515         }
516
517         if (check == WAIT_UCODE_TIMEOUT) {
518                 nvhost_err(dev_from_gk20a(g),
519                            "timeout waiting on ucode response");
520                 gk20a_fecs_dump_falcon_stats(g);
521                 return -1;
522         } else if (check == WAIT_UCODE_ERROR) {
523                 nvhost_err(dev_from_gk20a(g),
524                            "ucode method failed on mailbox=%d value=0x%08x",
525                            mailbox_id, reg);
526                 gk20a_fecs_dump_falcon_stats(g);
527                 return -1;
528         }
529
530         nvhost_dbg_fn("done");
531         return 0;
532 }
533
534 /* The following is a less brittle way to call gr_gk20a_submit_fecs_method(...).
535  * We should convert most, if not all, FECS method calls to use this instead. */
536 struct fecs_method_op_gk20a {
537         struct {
538                 u32 addr;
539                 u32 data;
540         } method;
541
542         struct {
543                 u32 id;
544                 u32 data;
545                 u32 clr;
546                 u32 *ret;
547                 u32 ok;
548                 u32 fail;
549         } mailbox;
550
551         struct {
552                 u32 ok;
553                 u32 fail;
554         } cond;
555
556 };
557
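/*
 * Submit a single FECS method under gr->fecs_mutex: write the optional
 * mailbox data, clear the requested mailbox-0 bits, push the method
 * data/address, then wait for the result via gr_gk20a_ctx_wait_ucode().
 * See gr_gk20a_ctrl_ctxsw() below for a typical invocation.
 */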
558 int gr_gk20a_submit_fecs_method_op(struct gk20a *g,
559                                    struct fecs_method_op_gk20a op)
560 {
561         struct gr_gk20a *gr = &g->gr;
562         int ret;
563
564         mutex_lock(&gr->fecs_mutex);
565
566         if (op.mailbox.id != 0)
567                 gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(op.mailbox.id),
568                              op.mailbox.data);
569
570         gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0),
571                 gr_fecs_ctxsw_mailbox_clear_value_f(op.mailbox.clr));
572
573         gk20a_writel(g, gr_fecs_method_data_r(), op.method.data);
574         gk20a_writel(g, gr_fecs_method_push_r(),
575                 gr_fecs_method_push_adr_f(op.method.addr));
576
577         /* op.mailbox.id == 4 cases still require waiting for completion
578          * on mailbox 0 */
579         if (op.mailbox.id == 4)
580                 op.mailbox.id = 0;
581
582         ret = gr_gk20a_ctx_wait_ucode(g, op.mailbox.id, op.mailbox.ret,
583                                       op.cond.ok, op.mailbox.ok,
584                                       op.cond.fail, op.mailbox.fail);
585
586         mutex_unlock(&gr->fecs_mutex);
587
588         return ret;
589 }
590
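/*
 * Issue a context-switch control method (start/stop ctxsw) to FECS,
 * using mailbox 1 for the handshake and returning its value via *ret.
 */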
591 int gr_gk20a_ctrl_ctxsw(struct gk20a *g, u32 fecs_method, u32 *ret)
592 {
593         return gr_gk20a_submit_fecs_method_op(g,
594               (struct fecs_method_op_gk20a) {
595                       .method.addr = fecs_method,
596                       .method.data = ~0,
597                       .mailbox = { .id   = 1, /*sideband?*/
598                                    .data = ~0, .clr = ~0, .ret = ret,
599                                    .ok   = gr_fecs_ctxsw_mailbox_value_pass_v(),
600                                    .fail = gr_fecs_ctxsw_mailbox_value_fail_v(), },
601                       .cond.ok = GR_IS_UCODE_OP_EQUAL,
602                       .cond.fail = GR_IS_UCODE_OP_EQUAL });
603 }
604
605 /* Stop processing (stall) context switches at FECS */
606 int gr_gk20a_disable_ctxsw(struct gk20a *g)
607 {
608         nvhost_dbg(dbg_fn | dbg_gpu_dbg, "");
609         return gr_gk20a_ctrl_ctxsw(g, gr_fecs_method_push_adr_stop_ctxsw_v(), 0);
610 }
611
612 /* Start processing (continue) context switches at FECS */
613 int gr_gk20a_enable_ctxsw(struct gk20a *g)
614 {
615         nvhost_dbg(dbg_fn | dbg_gpu_dbg, "");
616         return gr_gk20a_ctrl_ctxsw(g, gr_fecs_method_push_adr_start_ctxsw_v(), 0);
617 }
618
619
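/*
 * Commit the graphics context GPU VA into the channel's instance block
 * (4KB-shifted pointer, virtual target), flushing FB/L2 before the
 * update and invalidating L2 afterwards.
 */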
620 static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va)
621 {
622         u32 addr_lo;
623         u32 addr_hi;
624         void *inst_ptr = NULL;
625
626         nvhost_dbg_fn("");
627
628         /* flush gpu_va before commit */
629         gk20a_mm_fb_flush(c->g);
630         gk20a_mm_l2_flush(c->g, true);
631
632         inst_ptr = c->inst_block.cpuva;
633         if (!inst_ptr)
634                 return -ENOMEM;
635
636         addr_lo = u64_lo32(gpu_va) >> 12;
637         addr_hi = u64_hi32(gpu_va);
638
639         mem_wr32(inst_ptr, ram_in_gr_wfi_target_w(),
640                  ram_in_gr_cs_wfi_f() | ram_in_gr_wfi_mode_virtual_f() |
641                  ram_in_gr_wfi_ptr_lo_f(addr_lo));
642
643         mem_wr32(inst_ptr, ram_in_gr_wfi_ptr_hi_w(),
644                  ram_in_gr_wfi_ptr_hi_f(addr_hi));
645
646         gk20a_mm_l2_invalidate(c->g);
647
648         return 0;
649 }
650
651 /*
652  * Context state can be written directly or "patched" at times.
653  * So that the code can be used in either situation, it is written
654  * as a series of _ctx_patch_write(..., patch) statements.
655  * However, any necessary cpu map/unmap and gpu l2 invalidates
656  * should be minimized (to avoid doing them once per patch write).
657  * Before a sequence of these, set up with "_ctx_patch_write_begin"
658  * and close with "_ctx_patch_write_end."
659  */
660 static int gr_gk20a_ctx_patch_write_begin(struct gk20a *g,
661                                           struct channel_ctx_gk20a *ch_ctx)
662 {
663         /* being defensive still... */
664         if (ch_ctx->patch_ctx.cpu_va) {
665                 nvhost_err(dev_from_gk20a(g), "nested ctx patch begin?");
666                 return -EBUSY;
667         }
668
669         ch_ctx->patch_ctx.cpu_va =
670                 nvhost_memmgr_mmap(ch_ctx->patch_ctx.mem.ref);
671
672         if (!ch_ctx->patch_ctx.cpu_va)
673                 return -ENOMEM;
674
675         return 0;
676 }
677
678 static int gr_gk20a_ctx_patch_write_end(struct gk20a *g,
679                                         struct channel_ctx_gk20a *ch_ctx)
680 {
681         /* being defensive still... */
682         if (!ch_ctx->patch_ctx.cpu_va) {
683                 nvhost_err(dev_from_gk20a(g), "dangling ctx patch end?");
684                 return -EINVAL;
685         }
686
687         nvhost_memmgr_munmap(ch_ctx->patch_ctx.mem.ref,
688                              ch_ctx->patch_ctx.cpu_va);
689         ch_ctx->patch_ctx.cpu_va = NULL;
690
691         gk20a_mm_l2_invalidate(g);
692         return 0;
693 }
694
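/*
 * Illustrative use of the patch-write helpers (the same pattern the
 * commit_* functions below follow):
 *
 *      err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
 *      if (err)
 *              return err;
 *      gr_gk20a_ctx_patch_write(g, ch_ctx, addr, data, true);
 *      ...
 *      gr_gk20a_ctx_patch_write_end(g, ch_ctx);
 */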
695 static int gr_gk20a_ctx_patch_write(struct gk20a *g,
696                                     struct channel_ctx_gk20a *ch_ctx,
697                                     u32 addr, u32 data, bool patch)
698 {
699         u32 patch_slot = 0;
700         void *patch_ptr = NULL;
701         bool mapped_here = false;
702
703         BUG_ON(patch != 0 && ch_ctx == NULL);
704
705         if (patch) {
706                 if (!ch_ctx)
707                         return -EINVAL;
708                 /* we added an optimization prolog/epilog
709                  * to get rid of unnecessary maps and l2 invalidates,
710                  * but be defensive still... */
711                 if (!ch_ctx->patch_ctx.cpu_va) {
712                         int err;
713                         nvhost_err(dev_from_gk20a(g),
714                                    "per-write ctx patch begin?");
715                         /* yes, gr_gk20a_ctx_patch_smpc causes this one */
716                         err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
717                         if (err)
718                                 return err;
719                         mapped_here = true;
720                 } else
721                         mapped_here = false;
722
723                 patch_ptr = ch_ctx->patch_ctx.cpu_va;
724                 patch_slot = ch_ctx->patch_ctx.data_count * 2;
725
726                 mem_wr32(patch_ptr, patch_slot++, addr);
727                 mem_wr32(patch_ptr, patch_slot++, data);
728
729                 ch_ctx->patch_ctx.data_count++;
730
731                 if (mapped_here)
732                         gr_gk20a_ctx_patch_write_end(g, ch_ctx);
733
734         } else
735                 gk20a_writel(g, addr, data);
736
737         return 0;
738 }
739
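/*
 * Point FECS at this channel by submitting the bind_pointer method with
 * the instance block base pointer (vid-mem target), waiting on mailbox 0
 * for the 0x10 (ok) / 0x20 (fail) bits.
 */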
740 static int gr_gk20a_fecs_ctx_bind_channel(struct gk20a *g,
741                                         struct channel_gk20a *c)
742 {
743         u32 inst_base_ptr = u64_lo32(c->inst_block.cpu_pa
744                                      >> ram_in_base_shift_v());
745         u32 ret;
746
747         nvhost_dbg_info("bind channel %d inst ptr 0x%08x",
748                    c->hw_chid, inst_base_ptr);
749
750         ret = gr_gk20a_submit_fecs_method_op(g,
751                      (struct fecs_method_op_gk20a) {
752                      .method.addr = gr_fecs_method_push_adr_bind_pointer_v(),
753                      .method.data = (gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
754                                      gr_fecs_current_ctx_target_vid_mem_f() |
755                                      gr_fecs_current_ctx_valid_f(1)),
756                      .mailbox = { .id = 0, .data = 0,
757                                   .clr = 0x30,
758                                   .ret = NULL,
759                                   .ok = 0x10,
760                                   .fail = 0x20, },
761                      .cond.ok = GR_IS_UCODE_OP_AND,
762                      .cond.fail = GR_IS_UCODE_OP_AND});
763         if (ret)
764                 nvhost_err(dev_from_gk20a(g),
765                         "bind channel instance failed");
766
767         return ret;
768 }
769
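/*
 * Write the zcull ctxsw mode and the 256-byte-aligned zcull buffer
 * address into the channel's gr ctx image, optionally disabling GR
 * engine activity around the update and flushing/invalidating the
 * caches as needed.
 */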
770 static int gr_gk20a_ctx_zcull_setup(struct gk20a *g, struct channel_gk20a *c,
771                                     bool disable_fifo)
772 {
773         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
774         struct fifo_gk20a *f = &g->fifo;
775         struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
776         u32 va_lo, va_hi, va;
777         int ret = 0;
778         void *ctx_ptr = NULL;
779
780         nvhost_dbg_fn("");
781
782         ctx_ptr = vmap(ch_ctx->gr_ctx.pages,
783                         PAGE_ALIGN(ch_ctx->gr_ctx.size) >> PAGE_SHIFT,
784                         0, pgprot_dmacoherent(PAGE_KERNEL));
785         if (!ctx_ptr)
786                 return -ENOMEM;
787
788         if (ch_ctx->zcull_ctx.gpu_va == 0 &&
789             ch_ctx->zcull_ctx.ctx_sw_mode ==
790                 ctxsw_prog_main_image_zcull_mode_separate_buffer_v()) {
791                 ret = -EINVAL;
792                 goto clean_up;
793         }
794
795         va_lo = u64_lo32(ch_ctx->zcull_ctx.gpu_va);
796         va_hi = u64_hi32(ch_ctx->zcull_ctx.gpu_va);
797         va = ((va_lo >> 8) & 0x00FFFFFF) | ((va_hi << 24) & 0xFF000000);
798
799         if (disable_fifo) {
800                 ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
801                 if (ret) {
802                         nvhost_err(dev_from_gk20a(g),
803                                 "failed to disable gr engine activity\n");
804                         goto clean_up;
805                 }
806         }
807
808         /* Channel gr_ctx buffer is gpu cacheable.
809            Flush and invalidate before cpu update. */
810         gk20a_mm_fb_flush(g);
811         gk20a_mm_l2_flush(g, true);
812
813         mem_wr32(ctx_ptr + ctxsw_prog_main_image_zcull_o(), 0,
814                  ch_ctx->zcull_ctx.ctx_sw_mode);
815
816         mem_wr32(ctx_ptr + ctxsw_prog_main_image_zcull_ptr_o(), 0, va);
817
818         if (disable_fifo) {
819                 ret = gk20a_fifo_enable_engine_activity(g, gr_info);
820                 if (ret) {
821                         nvhost_err(dev_from_gk20a(g),
822                                 "failed to enable gr engine activity\n");
823                         goto clean_up;
824                 }
825         }
826         gk20a_mm_l2_invalidate(g);
827
828 clean_up:
829         vunmap(ctx_ptr);
830
831         return ret;
832 }
833
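/*
 * Program the per-PPC beta/alpha circular buffer sizes and start
 * offsets (CBM config) plus the related constraint-logic and PD limits
 * into the context, either directly or through the patch list.
 */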
834 static int gr_gk20a_commit_global_cb_manager(struct gk20a *g,
835                         struct channel_gk20a *c, bool patch)
836 {
837         struct gr_gk20a *gr = &g->gr;
838         struct channel_ctx_gk20a *ch_ctx = NULL;
839         u32 attrib_offset_in_chunk = 0;
840         u32 alpha_offset_in_chunk = 0;
841         u32 pd_ab_max_output;
842         u32 gpc_index, ppc_index;
843         u32 temp;
844         u32 cbm_cfg_size1, cbm_cfg_size2;
845
846         nvhost_dbg_fn("");
847
848         if (patch) {
849                 int err;
850                 ch_ctx = &c->ch_ctx;
851                 err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
852                 if (err)
853                         return err;
854         }
855
856         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_tga_constraintlogic_r(),
857                 gr_ds_tga_constraintlogic_beta_cbsize_f(gr->attrib_cb_default_size) |
858                 gr_ds_tga_constraintlogic_alpha_cbsize_f(gr->alpha_cb_default_size),
859                 patch);
860
861         pd_ab_max_output = (gr->alpha_cb_default_size *
862                 gr_gpc0_ppc0_cbm_cfg_size_granularity_v()) /
863                 gr_pd_ab_dist_cfg1_max_output_granularity_v();
864
865         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg1_r(),
866                 gr_pd_ab_dist_cfg1_max_output_f(pd_ab_max_output) |
867                 gr_pd_ab_dist_cfg1_max_batches_init_f(), patch);
868
869         alpha_offset_in_chunk = attrib_offset_in_chunk +
870                 gr->tpc_count * gr->attrib_cb_size;
871
872         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
873                 temp = proj_gpc_stride_v() * gpc_index;
874                 for (ppc_index = 0; ppc_index < gr->gpc_ppc_count[gpc_index];
875                      ppc_index++) {
876                         cbm_cfg_size1 = gr->attrib_cb_default_size *
877                                 gr->pes_tpc_count[ppc_index][gpc_index];
878                         cbm_cfg_size2 = gr->alpha_cb_default_size *
879                                 gr->pes_tpc_count[ppc_index][gpc_index];
880
881                         gr_gk20a_ctx_patch_write(g, ch_ctx,
882                                 gr_gpc0_ppc0_cbm_cfg_r() + temp +
883                                 proj_ppc_in_gpc_stride_v() * ppc_index,
884                                 gr_gpc0_ppc0_cbm_cfg_timeslice_mode_f(gr->timeslice_mode) |
885                                 gr_gpc0_ppc0_cbm_cfg_start_offset_f(attrib_offset_in_chunk) |
886                                 gr_gpc0_ppc0_cbm_cfg_size_f(cbm_cfg_size1), patch);
887
888                         attrib_offset_in_chunk += gr->attrib_cb_size *
889                                 gr->pes_tpc_count[ppc_index][gpc_index];
890
891                         gr_gk20a_ctx_patch_write(g, ch_ctx,
892                                 gr_gpc0_ppc0_cbm_cfg2_r() + temp +
893                                 proj_ppc_in_gpc_stride_v() * ppc_index,
894                                 gr_gpc0_ppc0_cbm_cfg2_start_offset_f(alpha_offset_in_chunk) |
895                                 gr_gpc0_ppc0_cbm_cfg2_size_f(cbm_cfg_size2), patch);
896
897                         alpha_offset_in_chunk += gr->alpha_cb_size *
898                                 gr->pes_tpc_count[ppc_index][gpc_index];
899                 }
900         }
901
902         if (patch)
903                 gr_gk20a_ctx_patch_write_end(g, ch_ctx);
904
905         return 0;
906 }
907
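/*
 * Program the global pagepool, bundle CB and attribute CB base
 * addresses and sizes from the channel's global context buffer
 * mappings, either directly or through the patch list.
 */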
908 static int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g,
909                         struct channel_gk20a *c, bool patch)
910 {
911         struct gr_gk20a *gr = &g->gr;
912         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
913         u64 addr;
914         u32 size;
915         u32 data;
916
917         nvhost_dbg_fn("");
918         if (patch) {
919                 int err;
920                 err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
921                 if (err)
922                         return err;
923         }
924
925         /* global pagepool buffer */
926         addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) >>
927                 gr_scc_pagepool_base_addr_39_8_align_bits_v()) |
928                 (u64_hi32(ch_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) <<
929                  (32 - gr_scc_pagepool_base_addr_39_8_align_bits_v()));
930
931         size = gr->global_ctx_buffer[PAGEPOOL].size /
932                 gr_scc_pagepool_total_pages_byte_granularity_v();
933
934         if (size == gr_scc_pagepool_total_pages_hwmax_value_v())
935                 size = gr_scc_pagepool_total_pages_hwmax_v();
936
937         nvhost_dbg_info("pagepool buffer addr : 0x%016llx, size : %d",
938                 addr, size);
939
940         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_pagepool_base_r(),
941                 gr_scc_pagepool_base_addr_39_8_f(addr), patch);
942
943         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_pagepool_r(),
944                 gr_scc_pagepool_total_pages_f(size) |
945                 gr_scc_pagepool_valid_true_f(), patch);
946
947         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gcc_pagepool_base_r(),
948                 gr_gpcs_gcc_pagepool_base_addr_39_8_f(addr), patch);
949
950         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gcc_pagepool_r(),
951                 gr_gpcs_gcc_pagepool_total_pages_f(size), patch);
952
953         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_pagepool_r(),
954                 gr_pd_pagepool_total_pages_f(size) |
955                 gr_pd_pagepool_valid_true_f(), patch);
956
957         /* global bundle cb */
958         addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[CIRCULAR_VA]) >>
959                 gr_scc_bundle_cb_base_addr_39_8_align_bits_v()) |
960                 (u64_hi32(ch_ctx->global_ctx_buffer_va[CIRCULAR_VA]) <<
961                  (32 - gr_scc_bundle_cb_base_addr_39_8_align_bits_v()));
962
963         size = gr->bundle_cb_default_size;
964
965         nvhost_dbg_info("bundle cb addr : 0x%016llx, size : %d",
966                 addr, size);
967
968         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_bundle_cb_base_r(),
969                 gr_scc_bundle_cb_base_addr_39_8_f(addr), patch);
970
971         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_bundle_cb_size_r(),
972                 gr_scc_bundle_cb_size_div_256b_f(size) |
973                 gr_scc_bundle_cb_size_valid_true_f(), patch);
974
975         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_bundle_cb_base_r(),
976                 gr_gpcs_setup_bundle_cb_base_addr_39_8_f(addr), patch);
977
978         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_bundle_cb_size_r(),
979                 gr_gpcs_setup_bundle_cb_size_div_256b_f(size) |
980                 gr_gpcs_setup_bundle_cb_size_valid_true_f(), patch);
981
982         /* data for state_limit */
983         data = (gr->bundle_cb_default_size *
984                 gr_scc_bundle_cb_size_div_256b_byte_granularity_v()) /
985                 gr_pd_ab_dist_cfg2_state_limit_scc_bundle_granularity_v();
986
987         data = min_t(u32, data, gr->min_gpm_fifo_depth);
988
989         nvhost_dbg_info("bundle cb token limit : %d, state limit : %d",
990                    gr->bundle_cb_token_limit, data);
991
992         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg2_r(),
993                 gr_pd_ab_dist_cfg2_token_limit_f(gr->bundle_cb_token_limit) |
994                 gr_pd_ab_dist_cfg2_state_limit_f(data), patch);
995
996         /* global attrib cb */
997         addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[ATTRIBUTE_VA]) >>
998                 gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v()) |
999                 (u64_hi32(ch_ctx->global_ctx_buffer_va[ATTRIBUTE_VA]) <<
1000                  (32 - gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v()));
1001
1002         nvhost_dbg_info("attrib cb addr : 0x%016llx", addr);
1003
1004         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_attrib_cb_base_r(),
1005                 gr_gpcs_setup_attrib_cb_base_addr_39_12_f(addr) |
1006                 gr_gpcs_setup_attrib_cb_base_valid_true_f(), patch);
1007
1008         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pe_pin_cb_global_base_addr_r(),
1009                 gr_gpcs_tpcs_pe_pin_cb_global_base_addr_v_f(addr) |
1010                 gr_gpcs_tpcs_pe_pin_cb_global_base_addr_valid_true_f(), patch);
1011
1012         if (patch)
1013                 gr_gk20a_ctx_patch_write_end(g, ch_ctx);
1014
1015         return 0;
1016 }
1017
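/*
 * Enable or disable the GR timeslice mode bits (GPM/PD config, PE,
 * PD A/B distribution, DS debug, MPC VTG debug) according to
 * gr->timeslice_mode, directly or through the patch list.
 */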
1018 static int gr_gk20a_commit_global_timeslice(struct gk20a *g, struct channel_gk20a *c, bool patch)
1019 {
1020         struct gr_gk20a *gr = &g->gr;
1021         struct channel_ctx_gk20a *ch_ctx = NULL;
1022         u32 gpm_pd_cfg;
1023         u32 pd_ab_dist_cfg0;
1024         u32 ds_debug;
1025         u32 mpc_vtg_debug;
1026         u32 pe_vaf;
1027         u32 pe_vsc_vpc;
1028
1029         nvhost_dbg_fn("");
1030
1031         gpm_pd_cfg = gk20a_readl(g, gr_gpcs_gpm_pd_cfg_r());
1032         pd_ab_dist_cfg0 = gk20a_readl(g, gr_pd_ab_dist_cfg0_r());
1033         ds_debug = gk20a_readl(g, gr_ds_debug_r());
1034         mpc_vtg_debug = gk20a_readl(g, gr_gpcs_tpcs_mpc_vtg_debug_r());
1035
1036         if (patch) {
1037                 int err;
1038                 ch_ctx = &c->ch_ctx;
1039                 err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
1040                 if (err)
1041                         return err;
1042         }
1043
1044         if (gr->timeslice_mode == gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v()) {
1045                 pe_vaf = gk20a_readl(g, gr_gpcs_tpcs_pe_vaf_r());
1046                 pe_vsc_vpc = gk20a_readl(g, gr_gpcs_tpcs_pes_vsc_vpc_r());
1047
1048                 gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_enable_f() | gpm_pd_cfg;
1049                 pe_vaf = gr_gpcs_tpcs_pe_vaf_fast_mode_switch_true_f() | pe_vaf;
1050                 pe_vsc_vpc = gr_gpcs_tpcs_pes_vsc_vpc_fast_mode_switch_true_f() | pe_vsc_vpc;
1051                 pd_ab_dist_cfg0 = gr_pd_ab_dist_cfg0_timeslice_enable_en_f() | pd_ab_dist_cfg0;
1052                 ds_debug = gr_ds_debug_timeslice_mode_enable_f() | ds_debug;
1053                 mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_enabled_f() | mpc_vtg_debug;
1054
1055                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, patch);
1056                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pe_vaf_r(), pe_vaf, patch);
1057                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pes_vsc_vpc_r(), pe_vsc_vpc, patch);
1058                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, patch);
1059                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_debug_r(), ds_debug, patch);
1060                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, patch);
1061         } else {
1062                 gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_disable_f() | gpm_pd_cfg;
1063                 pd_ab_dist_cfg0 = gr_pd_ab_dist_cfg0_timeslice_enable_dis_f() | pd_ab_dist_cfg0;
1064                 ds_debug = gr_ds_debug_timeslice_mode_disable_f() | ds_debug;
1065                 mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_disabled_f() | mpc_vtg_debug;
1066
1067                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, patch);
1068                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, patch);
1069                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_debug_r(), ds_debug, patch);
1070                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, patch);
1071         }
1072
1073         if (patch)
1074                 gr_gk20a_ctx_patch_write_end(g, ch_ctx);
1075
1076         return 0;
1077 }
1078
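/*
 * Program the screen-tile to GPC mapping tables (CRSTR, PPCS WWDX and
 * RSTR2D) from gr->map_tiles, deriving the normalized entry count,
 * shift and modulo coefficients from the TPC count.
 */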
1079 static int gr_gk20a_setup_rop_mapping(struct gk20a *g,
1080                                 struct gr_gk20a *gr)
1081 {
1082         u32 norm_entries, norm_shift;
1083         u32 coeff5_mod, coeff6_mod, coeff7_mod, coeff8_mod, coeff9_mod, coeff10_mod, coeff11_mod;
1084         u32 map0, map1, map2, map3, map4, map5;
1085
1086         if (!gr->map_tiles)
1087                 return -1;
1088
1089         nvhost_dbg_fn("");
1090
1091         gk20a_writel(g, gr_crstr_map_table_cfg_r(),
1092                      gr_crstr_map_table_cfg_row_offset_f(gr->map_row_offset) |
1093                      gr_crstr_map_table_cfg_num_entries_f(gr->tpc_count));
1094
1095         map0 =  gr_crstr_gpc_map0_tile0_f(gr->map_tiles[0]) |
1096                 gr_crstr_gpc_map0_tile1_f(gr->map_tiles[1]) |
1097                 gr_crstr_gpc_map0_tile2_f(gr->map_tiles[2]) |
1098                 gr_crstr_gpc_map0_tile3_f(gr->map_tiles[3]) |
1099                 gr_crstr_gpc_map0_tile4_f(gr->map_tiles[4]) |
1100                 gr_crstr_gpc_map0_tile5_f(gr->map_tiles[5]);
1101
1102         map1 =  gr_crstr_gpc_map1_tile6_f(gr->map_tiles[6]) |
1103                 gr_crstr_gpc_map1_tile7_f(gr->map_tiles[7]) |
1104                 gr_crstr_gpc_map1_tile8_f(gr->map_tiles[8]) |
1105                 gr_crstr_gpc_map1_tile9_f(gr->map_tiles[9]) |
1106                 gr_crstr_gpc_map1_tile10_f(gr->map_tiles[10]) |
1107                 gr_crstr_gpc_map1_tile11_f(gr->map_tiles[11]);
1108
1109         map2 =  gr_crstr_gpc_map2_tile12_f(gr->map_tiles[12]) |
1110                 gr_crstr_gpc_map2_tile13_f(gr->map_tiles[13]) |
1111                 gr_crstr_gpc_map2_tile14_f(gr->map_tiles[14]) |
1112                 gr_crstr_gpc_map2_tile15_f(gr->map_tiles[15]) |
1113                 gr_crstr_gpc_map2_tile16_f(gr->map_tiles[16]) |
1114                 gr_crstr_gpc_map2_tile17_f(gr->map_tiles[17]);
1115
1116         map3 =  gr_crstr_gpc_map3_tile18_f(gr->map_tiles[18]) |
1117                 gr_crstr_gpc_map3_tile19_f(gr->map_tiles[19]) |
1118                 gr_crstr_gpc_map3_tile20_f(gr->map_tiles[20]) |
1119                 gr_crstr_gpc_map3_tile21_f(gr->map_tiles[21]) |
1120                 gr_crstr_gpc_map3_tile22_f(gr->map_tiles[22]) |
1121                 gr_crstr_gpc_map3_tile23_f(gr->map_tiles[23]);
1122
1123         map4 =  gr_crstr_gpc_map4_tile24_f(gr->map_tiles[24]) |
1124                 gr_crstr_gpc_map4_tile25_f(gr->map_tiles[25]) |
1125                 gr_crstr_gpc_map4_tile26_f(gr->map_tiles[26]) |
1126                 gr_crstr_gpc_map4_tile27_f(gr->map_tiles[27]) |
1127                 gr_crstr_gpc_map4_tile28_f(gr->map_tiles[28]) |
1128                 gr_crstr_gpc_map4_tile29_f(gr->map_tiles[29]);
1129
1130         map5 =  gr_crstr_gpc_map5_tile30_f(gr->map_tiles[30]) |
1131                 gr_crstr_gpc_map5_tile31_f(gr->map_tiles[31]) |
1132                 gr_crstr_gpc_map5_tile32_f(0) |
1133                 gr_crstr_gpc_map5_tile33_f(0) |
1134                 gr_crstr_gpc_map5_tile34_f(0) |
1135                 gr_crstr_gpc_map5_tile35_f(0);
1136
1137         gk20a_writel(g, gr_crstr_gpc_map0_r(), map0);
1138         gk20a_writel(g, gr_crstr_gpc_map1_r(), map1);
1139         gk20a_writel(g, gr_crstr_gpc_map2_r(), map2);
1140         gk20a_writel(g, gr_crstr_gpc_map3_r(), map3);
1141         gk20a_writel(g, gr_crstr_gpc_map4_r(), map4);
1142         gk20a_writel(g, gr_crstr_gpc_map5_r(), map5);
1143
1144         switch (gr->tpc_count) {
1145         case 1:
1146                 norm_shift = 4;
1147                 break;
1148         case 2:
1149         case 3:
1150                 norm_shift = 3;
1151                 break;
1152         case 4:
1153         case 5:
1154         case 6:
1155         case 7:
1156                 norm_shift = 2;
1157                 break;
1158         case 8:
1159         case 9:
1160         case 10:
1161         case 11:
1162         case 12:
1163         case 13:
1164         case 14:
1165         case 15:
1166                 norm_shift = 1;
1167                 break;
1168         default:
1169                 norm_shift = 0;
1170                 break;
1171         }
1172
1173         norm_entries = gr->tpc_count << norm_shift;
1174         coeff5_mod = (1 << 5) % norm_entries;
1175         coeff6_mod = (1 << 6) % norm_entries;
1176         coeff7_mod = (1 << 7) % norm_entries;
1177         coeff8_mod = (1 << 8) % norm_entries;
1178         coeff9_mod = (1 << 9) % norm_entries;
1179         coeff10_mod = (1 << 10) % norm_entries;
1180         coeff11_mod = (1 << 11) % norm_entries;
1181
1182         gk20a_writel(g, gr_ppcs_wwdx_map_table_cfg_r(),
1183                      gr_ppcs_wwdx_map_table_cfg_row_offset_f(gr->map_row_offset) |
1184                      gr_ppcs_wwdx_map_table_cfg_normalized_num_entries_f(norm_entries) |
1185                      gr_ppcs_wwdx_map_table_cfg_normalized_shift_value_f(norm_shift) |
1186                      gr_ppcs_wwdx_map_table_cfg_coeff5_mod_value_f(coeff5_mod) |
1187                      gr_ppcs_wwdx_map_table_cfg_num_entries_f(gr->tpc_count));
1188
1189         gk20a_writel(g, gr_ppcs_wwdx_map_table_cfg2_r(),
1190                      gr_ppcs_wwdx_map_table_cfg2_coeff6_mod_value_f(coeff6_mod) |
1191                      gr_ppcs_wwdx_map_table_cfg2_coeff7_mod_value_f(coeff7_mod) |
1192                      gr_ppcs_wwdx_map_table_cfg2_coeff8_mod_value_f(coeff8_mod) |
1193                      gr_ppcs_wwdx_map_table_cfg2_coeff9_mod_value_f(coeff9_mod) |
1194                      gr_ppcs_wwdx_map_table_cfg2_coeff10_mod_value_f(coeff10_mod) |
1195                      gr_ppcs_wwdx_map_table_cfg2_coeff11_mod_value_f(coeff11_mod));
1196
1197         gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map0_r(), map0);
1198         gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map1_r(), map1);
1199         gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map2_r(), map2);
1200         gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map3_r(), map3);
1201         gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map4_r(), map4);
1202         gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map5_r(), map5);
1203
1204         gk20a_writel(g, gr_rstr2d_map_table_cfg_r(),
1205                      gr_rstr2d_map_table_cfg_row_offset_f(gr->map_row_offset) |
1206                      gr_rstr2d_map_table_cfg_num_entries_f(gr->tpc_count));
1207
1208         gk20a_writel(g, gr_rstr2d_gpc_map0_r(), map0);
1209         gk20a_writel(g, gr_rstr2d_gpc_map1_r(), map1);
1210         gk20a_writel(g, gr_rstr2d_gpc_map2_r(), map2);
1211         gk20a_writel(g, gr_rstr2d_gpc_map3_r(), map3);
1212         gk20a_writel(g, gr_rstr2d_gpc_map4_r(), map4);
1213         gk20a_writel(g, gr_rstr2d_gpc_map5_r(), map5);
1214
1215         return 0;
1216 }
1217
1218 static inline u32 count_bits(u32 mask)
1219 {
1220         u32 temp = mask;
1221         u32 count;
1222         for (count = 0; temp != 0; count++)
1223                 temp &= temp - 1;
1224
1225         return count;
1226 }
1227
1228 static inline u32 clear_count_bits(u32 num, u32 clear_count)
1229 {
1230         u32 count = clear_count;
1231         for (; (num != 0) && (count != 0); count--)
1232                 num &= num - 1;
1233
1234         return num;
1235 }
1236
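/*
 * Build the PD alpha/beta ratio tables: for each of the 32 table rows,
 * split every GPC's TPC mask into alpha and beta portions in proportion
 * to the row index, then write back only the rows actually populated.
 */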
1237 static int gr_gk20a_setup_alpha_beta_tables(struct gk20a *g,
1238                                         struct gr_gk20a *gr)
1239 {
1240         u32 table_index_bits = 5;
1241         u32 rows = (1 << table_index_bits);
1242         u32 row_stride = gr_pd_alpha_ratio_table__size_1_v() / rows;
1243
1244         u32 row;
1245         u32 index;
1246         u32 gpc_index;
1247         u32 gpcs_per_reg = 4;
1248         u32 pes_index;
1249         u32 tpc_count_pes;
1250         u32 num_pes_per_gpc = proj_scal_litter_num_pes_per_gpc_v();
1251
1252         u32 alpha_target, beta_target;
1253         u32 alpha_bits, beta_bits;
1254         u32 alpha_mask, beta_mask, partial_mask;
1255         u32 reg_offset;
1256         bool assign_alpha;
1257
1258         u32 map_alpha[gr_pd_alpha_ratio_table__size_1_v()];
1259         u32 map_beta[gr_pd_alpha_ratio_table__size_1_v()];
1260         u32 map_reg_used[gr_pd_alpha_ratio_table__size_1_v()];
1261
1262         nvhost_dbg_fn("");
1263
1264         memset(map_alpha, 0, gr_pd_alpha_ratio_table__size_1_v() * sizeof(u32));
1265         memset(map_beta, 0, gr_pd_alpha_ratio_table__size_1_v() * sizeof(u32));
1266         memset(map_reg_used, 0, gr_pd_alpha_ratio_table__size_1_v() * sizeof(u32));
1267
1268         for (row = 0; row < rows; ++row) {
1269                 alpha_target = max_t(u32, gr->tpc_count * row / rows, 1);
1270                 beta_target = gr->tpc_count - alpha_target;
1271
1272                 assign_alpha = (alpha_target < beta_target);
1273
1274                 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
1275                         reg_offset = (row * row_stride) + (gpc_index / gpcs_per_reg);
1276                         alpha_mask = beta_mask = 0;
1277
1278                         for (pes_index = 0; pes_index < num_pes_per_gpc; pes_index++) {
1279                                 tpc_count_pes = gr->pes_tpc_count[pes_index][gpc_index];
1280
1281                                 if (assign_alpha) {
1282                                         alpha_bits = (alpha_target == 0) ? 0 : tpc_count_pes;
1283                                         beta_bits = tpc_count_pes - alpha_bits;
1284                                 } else {
1285                                         beta_bits = (beta_target == 0) ? 0 : tpc_count_pes;
1286                                         alpha_bits = tpc_count_pes - beta_bits;
1287                                 }
1288
1289                                 partial_mask = gr->pes_tpc_mask[pes_index][gpc_index];
1290                                 partial_mask = clear_count_bits(partial_mask, tpc_count_pes - alpha_bits);
1291                                 alpha_mask |= partial_mask;
1292
1293                                 partial_mask = gr->pes_tpc_mask[pes_index][gpc_index] ^ partial_mask;
1294                                 beta_mask |= partial_mask;
1295
1296                                 alpha_target -= min(alpha_bits, alpha_target);
1297                                 beta_target -= min(beta_bits, beta_target);
1298
1299                                 if ((alpha_bits > 0) || (beta_bits > 0))
1300                                         assign_alpha = !assign_alpha;
1301                         }
1302
1303                         switch (gpc_index % gpcs_per_reg) {
1304                         case 0:
1305                                 map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n0_mask_f(alpha_mask);
1306                                 map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n0_mask_f(beta_mask);
1307                                 break;
1308                         case 1:
1309                                 map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n1_mask_f(alpha_mask);
1310                                 map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n1_mask_f(beta_mask);
1311                                 break;
1312                         case 2:
1313                                 map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n2_mask_f(alpha_mask);
1314                                 map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n2_mask_f(beta_mask);
1315                                 break;
1316                         case 3:
1317                                 map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n3_mask_f(alpha_mask);
1318                                 map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n3_mask_f(beta_mask);
1319                                 break;
1320                         }
1321                         map_reg_used[reg_offset] = true;
1322                 }
1323         }
1324
1325         for (index = 0; index < gr_pd_alpha_ratio_table__size_1_v(); index++) {
1326                 if (map_reg_used[index]) {
1327                         gk20a_writel(g, gr_pd_alpha_ratio_table_r(index), map_alpha[index]);
1328                         gk20a_writel(g, gr_pd_beta_ratio_table_r(index), map_beta[index]);
1329                 }
1330         }
1331
1332         return 0;
1333 }
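/*
 * Summary of the routine above: for each of the 32 table rows (one per
 * alpha:beta split, with the alpha share growing with the row index) it
 * derives per-GPC TPC masks for the alpha and beta partitions, packs four
 * GPCs worth of masks into each register via the gpc_4n0..gpc_4n3 fields,
 * and finally writes only those alpha/beta ratio table registers that were
 * actually populated (map_reg_used).  The exact meaning of the alpha and
 * beta partitions is left to the hardware documentation.
 */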
1334
1335 static int gr_gk20a_ctx_state_floorsweep(struct gk20a *g)
1336 {
1337         struct gr_gk20a *gr = &g->gr;
1338         u32 tpc_index, gpc_index;
1339         u32 tpc_offset, gpc_offset;
1340         u32 sm_id = 0, gpc_id = 0;
1341         u32 sm_id_to_gpc_id[proj_scal_max_gpcs_v() * proj_scal_max_tpc_per_gpc_v()];
1342         u32 tpc_per_gpc;
1343         u32 max_ways_evict = INVALID_MAX_WAYS;
1344
1345         nvhost_dbg_fn("");
1346
1347         for (tpc_index = 0; tpc_index < gr->max_tpc_per_gpc_count; tpc_index++) {
1348                 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
1349                         gpc_offset = proj_gpc_stride_v() * gpc_index;
1350                         if (tpc_index < gr->gpc_tpc_count[gpc_index]) {
1351                                 tpc_offset = proj_tpc_in_gpc_stride_v() * tpc_index;
1352
1353                                 gk20a_writel(g, gr_gpc0_tpc0_sm_cfg_r() + gpc_offset + tpc_offset,
1354                                              gr_gpc0_tpc0_sm_cfg_sm_id_f(sm_id));
1355                                 gk20a_writel(g, gr_gpc0_tpc0_l1c_cfg_smid_r() + gpc_offset + tpc_offset,
1356                                              gr_gpc0_tpc0_l1c_cfg_smid_value_f(sm_id));
1357                                 gk20a_writel(g, gr_gpc0_gpm_pd_sm_id_r(tpc_index) + gpc_offset,
1358                                              gr_gpc0_gpm_pd_sm_id_id_f(sm_id));
1359                                 gk20a_writel(g, gr_gpc0_tpc0_pe_cfg_smid_r() + gpc_offset + tpc_offset,
1360                                              gr_gpc0_tpc0_pe_cfg_smid_value_f(sm_id));
1361
1362                                 sm_id_to_gpc_id[sm_id] = gpc_index;
1363                                 sm_id++;
1364                         }
1365
1366                         gk20a_writel(g, gr_gpc0_gpm_pd_active_tpcs_r() + gpc_offset,
1367                                      gr_gpc0_gpm_pd_active_tpcs_num_f(gr->gpc_tpc_count[gpc_index]));
1368                         gk20a_writel(g, gr_gpc0_gpm_sd_active_tpcs_r() + gpc_offset,
1369                                      gr_gpc0_gpm_sd_active_tpcs_num_f(gr->gpc_tpc_count[gpc_index]));
1370                 }
1371         }
1372
1373         for (tpc_index = 0, gpc_id = 0;
1374              tpc_index < gr_pd_num_tpc_per_gpc__size_1_v();
1375              tpc_index++, gpc_id += 8) {
1376
1377                 if (gpc_id >= gr->gpc_count)
1378                         gpc_id = 0;
1379
1380                 tpc_per_gpc =
1381                         gr_pd_num_tpc_per_gpc_count0_f(gr->gpc_tpc_count[gpc_id + 0]) |
1382                         gr_pd_num_tpc_per_gpc_count1_f(gr->gpc_tpc_count[gpc_id + 1]) |
1383                         gr_pd_num_tpc_per_gpc_count2_f(gr->gpc_tpc_count[gpc_id + 2]) |
1384                         gr_pd_num_tpc_per_gpc_count3_f(gr->gpc_tpc_count[gpc_id + 3]) |
1385                         gr_pd_num_tpc_per_gpc_count4_f(gr->gpc_tpc_count[gpc_id + 4]) |
1386                         gr_pd_num_tpc_per_gpc_count5_f(gr->gpc_tpc_count[gpc_id + 5]) |
1387                         gr_pd_num_tpc_per_gpc_count6_f(gr->gpc_tpc_count[gpc_id + 6]) |
1388                         gr_pd_num_tpc_per_gpc_count7_f(gr->gpc_tpc_count[gpc_id + 7]);
1389
1390                 gk20a_writel(g, gr_pd_num_tpc_per_gpc_r(tpc_index), tpc_per_gpc);
1391                 gk20a_writel(g, gr_ds_num_tpc_per_gpc_r(tpc_index), tpc_per_gpc);
1392         }
1393
1394         /* gr__setup_pd_mapping stubbed for gk20a */
1395         gr_gk20a_setup_rop_mapping(g, gr);
1396         gr_gk20a_setup_alpha_beta_tables(g, gr);
1397
1398         if (gr->num_fbps == 1)
1399                 max_ways_evict = 9;
1400
1401         if (max_ways_evict != INVALID_MAX_WAYS)
1402                 gk20a_writel(g, ltc_ltcs_ltss_tstg_set_mgmt_r(),
1403                              ((gk20a_readl(g, ltc_ltcs_ltss_tstg_set_mgmt_r()) &
1404                                ~(ltc_ltcs_ltss_tstg_set_mgmt_max_ways_evict_last_f(~0))) |
1405                               ltc_ltcs_ltss_tstg_set_mgmt_max_ways_evict_last_f(max_ways_evict)));
1406
1407         for (gpc_index = 0;
1408              gpc_index < gr_pd_dist_skip_table__size_1_v() * 4;
1409              gpc_index += 4) {
1410
1411                 gk20a_writel(g, gr_pd_dist_skip_table_r(gpc_index/4),
1412                              gr_pd_dist_skip_table_gpc_4n0_mask_f(gr->gpc_skip_mask[gpc_index]) |
1413                              gr_pd_dist_skip_table_gpc_4n1_mask_f(gr->gpc_skip_mask[gpc_index + 1]) |
1414                              gr_pd_dist_skip_table_gpc_4n2_mask_f(gr->gpc_skip_mask[gpc_index + 2]) |
1415                              gr_pd_dist_skip_table_gpc_4n3_mask_f(gr->gpc_skip_mask[gpc_index + 3]));
1416         }
1417
1418         gk20a_writel(g, gr_cwd_fs_r(),
1419                      gr_cwd_fs_num_gpcs_f(gr->gpc_count) |
1420                      gr_cwd_fs_num_tpcs_f(gr->tpc_count));
1421
1422         gk20a_writel(g, gr_bes_zrop_settings_r(),
1423                      gr_bes_zrop_settings_num_active_fbps_f(gr->num_fbps));
1424         gk20a_writel(g, gr_bes_crop_settings_r(),
1425                      gr_bes_crop_settings_num_active_fbps_f(gr->num_fbps));
1426
1427         return 0;
1428 }
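/*
 * Floorsweeping overview: the loops above assign a linear sm_id to every
 * present TPC (mirrored into the SM, L1C and PE SMID config registers),
 * publish per-GPC TPC counts to the PD and DS units eight GPCs per
 * register, program the ROP and alpha/beta mappings, optionally cap the
 * LTC max-ways-evict-last value when only one FBP is present, write the
 * per-GPC skip masks into the PD dist skip table, and finally report the
 * active GPC/TPC/FBP counts to CWD and the BE ZROP/CROP units.
 */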
1429
1430 static int gr_gk20a_fecs_ctx_image_save(struct channel_gk20a *c, u32 save_type)
1431 {
1432         struct gk20a *g = c->g;
1433         int ret;
1434
1435         u32 inst_base_ptr =
1436                 u64_lo32(c->inst_block.cpu_pa
1437                 >> ram_in_base_shift_v());
1438
1439
1440         nvhost_dbg_fn("");
1441
1442         ret = gr_gk20a_submit_fecs_method_op(g,
1443                 (struct fecs_method_op_gk20a) {
1444                 .method.addr = save_type,
1445                 .method.data = (gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
1446                                 gr_fecs_current_ctx_target_vid_mem_f() |
1447                                 gr_fecs_current_ctx_valid_f(1)),
1448                 .mailbox = {.id = 0, .data = 0, .clr = 3, .ret = NULL,
1449                         .ok = 1, .fail = 2,
1450                 },
1451                 .cond.ok = GR_IS_UCODE_OP_AND,
1452                 .cond.fail = GR_IS_UCODE_OP_AND,
1453                  });
1454
1455         if (ret)
1456                 nvhost_err(dev_from_gk20a(g), "save context image failed");
1457
1458         return ret;
1459 }
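/*
 * The save above goes through the generic FECS method interface:
 * method.data points FECS at the channel's instance block (physical
 * address shifted by ram_in_base_shift_v(), vid-mem target, valid bit
 * set), method.addr selects the type of save, and the submit helper then
 * matches ctxsw mailbox 0 against ok = 1 / fail = 2 with an AND condition.
 */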
1460
1461 /* Initialize the global golden image from a fresh gr_ctx in the channel ctx
1462    and save a copy in ctx_vars.local_golden_image. */
1463 static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
1464                                           struct channel_gk20a *c)
1465 {
1466         struct gr_gk20a *gr = &g->gr;
1467         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
1468         u32 ctx_header_bytes = ctxsw_prog_fecs_header_v();
1469         u32 ctx_header_words;
1470         u32 i;
1471         u32 data;
1472         void *ctx_ptr = NULL;
1473         void *gold_ptr = NULL;
1474         u32 err = 0;
1475
1476         nvhost_dbg_fn("");
1477
1478         /* The golden ctx is global to all channels. Although only the first
1479            channel initializes the golden image, the driver needs to prevent
1480            multiple channels from initializing the golden ctx at the same time. */
1481         mutex_lock(&gr->ctx_mutex);
1482
1483         if (gr->ctx_vars.golden_image_initialized)
1484                 goto clean_up;
1485
1486         err = gr_gk20a_fecs_ctx_bind_channel(g, c);
1487         if (err)
1488                 goto clean_up;
1489
1490         err = gr_gk20a_elpg_protected_call(g,
1491                         gr_gk20a_commit_global_ctx_buffers(g, c, false));
1492         if (err)
1493                 goto clean_up;
1494
1495         gold_ptr = nvhost_memmgr_mmap(gr->global_ctx_buffer[GOLDEN_CTX].ref);
1496         if (!gold_ptr)
1497                 goto clean_up;
1498
1499         ctx_ptr = vmap(ch_ctx->gr_ctx.pages,
1500                         PAGE_ALIGN(ch_ctx->gr_ctx.size) >> PAGE_SHIFT,
1501                         0, pgprot_dmacoherent(PAGE_KERNEL));
1502         if (!ctx_ptr)
1503                 goto clean_up;
1504
1505         ctx_header_words =  roundup(ctx_header_bytes, sizeof(u32));
1506         ctx_header_words >>= 2;
1507
1508         /* Channel gr_ctx buffer is gpu cacheable.
1509            Flush before cpu read. */
1510         gk20a_mm_fb_flush(g);
1511         gk20a_mm_l2_flush(g, false);
1512
1513         for (i = 0; i < ctx_header_words; i++) {
1514                 data = mem_rd32(ctx_ptr, i);
1515                 mem_wr32(gold_ptr, i, data);
1516         }
1517
1518         mem_wr32(gold_ptr + ctxsw_prog_main_image_zcull_o(), 0,
1519                  ctxsw_prog_main_image_zcull_mode_no_ctxsw_v());
1520
1521         mem_wr32(gold_ptr + ctxsw_prog_main_image_zcull_ptr_o(), 0, 0);
1522
1523         gr_gk20a_commit_inst(c, ch_ctx->global_ctx_buffer_va[GOLDEN_CTX_VA]);
1524
1525         gr_gk20a_fecs_ctx_image_save(c, gr_fecs_method_push_adr_wfi_golden_save_v());
1526
1527         if (gr->ctx_vars.local_golden_image == NULL) {
1528
1529                 gr->ctx_vars.local_golden_image =
1530                         kzalloc(gr->ctx_vars.golden_image_size, GFP_KERNEL);
1531
1532                 if (gr->ctx_vars.local_golden_image == NULL) {
1533                         err = -ENOMEM;
1534                         goto clean_up;
1535                 }
1536
1537                 for (i = 0; i < gr->ctx_vars.golden_image_size / 4; i++)
1538                         gr->ctx_vars.local_golden_image[i] =
1539                                 mem_rd32(gold_ptr, i);
1540         }
1541
1542         gr_gk20a_commit_inst(c, ch_ctx->gr_ctx.gpu_va);
1543
1544         gr->ctx_vars.golden_image_initialized = true;
1545
1546         gk20a_mm_l2_invalidate(g);
1547
1548         gk20a_writel(g, gr_fecs_current_ctx_r(),
1549                 gr_fecs_current_ctx_valid_false_f());
1550
1551 clean_up:
1552         if (err)
1553                 nvhost_err(dev_from_gk20a(g), "fail");
1554         else
1555                 nvhost_dbg_fn("done");
1556
1557         if (gold_ptr)
1558                 nvhost_memmgr_munmap(gr->global_ctx_buffer[GOLDEN_CTX].ref,
1559                                      gold_ptr);
1560         if (ctx_ptr)
1561                 vunmap(ctx_ptr);
1562
1563         mutex_unlock(&gr->ctx_mutex);
1564         return err;
1565 }
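/*
 * Golden image creation in short: bind the first channel's instance block
 * to FECS, commit the global ctx buffers, copy the FECS context header
 * from the channel's gr_ctx into the GOLDEN_CTX buffer, force zcull to
 * no_ctxsw in that copy, point the context at the golden buffer and ask
 * FECS for a golden save, then cache the whole image in
 * local_golden_image so later channels can be seeded from the CPU copy by
 * gr_gk20a_load_golden_ctx_image().
 */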
1566
1567 int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g,
1568                                     struct channel_gk20a *c,
1569                                     bool enable_smpc_ctxsw)
1570 {
1571         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
1572         void *ctx_ptr = NULL;
1573         u32 data;
1574
1575         /*XXX caller responsible for making sure the channel is quiesced? */
1576
1577         /* Channel gr_ctx buffer is gpu cacheable.
1578            Flush and invalidate before cpu update. */
1579         gk20a_mm_fb_flush(g);
1580         gk20a_mm_l2_flush(g, true);
1581
1582         ctx_ptr = vmap(ch_ctx->gr_ctx.pages,
1583                         PAGE_ALIGN(ch_ctx->gr_ctx.size) >> PAGE_SHIFT,
1584                         0, pgprot_dmacoherent(PAGE_KERNEL));
1585         if (!ctx_ptr)
1586                 return -ENOMEM;
1587
1588         data = mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0);
1589         data = data & ~ctxsw_prog_main_image_pm_smpc_mode_m();
1590         data |= enable_smpc_ctxsw ?
1591                 ctxsw_prog_main_image_pm_smpc_mode_ctxsw_f() :
1592                 ctxsw_prog_main_image_pm_smpc_mode_no_ctxsw_f();
1593         mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0,
1594                  data);
1595
1596         vunmap(ctx_ptr);
1597
1598         gk20a_mm_l2_invalidate(g);
1599
1600         return 0;
1601 }
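/*
 * Note that only the SMPC mode field of the ctxsw PM word is rewritten
 * above; the remaining bits, including the PM mode programmed by
 * gr_gk20a_load_golden_ctx_image(), are preserved by the read-modify-write.
 */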
1602
1603 /* load the saved copy of the golden image into the channel gr_ctx */
1604 static int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
1605                                         struct channel_gk20a *c)
1606 {
1607         struct gr_gk20a *gr = &g->gr;
1608         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
1609         u32 virt_addr_lo;
1610         u32 virt_addr_hi;
1611         u32 i, v, data;
1612         int ret = 0;
1613         void *ctx_ptr = NULL;
1614
1615         nvhost_dbg_fn("");
1616
1617         if (gr->ctx_vars.local_golden_image == NULL)
1618                 return -1;
1619
1620         /* Channel gr_ctx buffer is gpu cacheable.
1621            Flush and invalidate before cpu update. */
1622         gk20a_mm_fb_flush(g);
1623         gk20a_mm_l2_flush(g, true);
1624
1625         ctx_ptr = vmap(ch_ctx->gr_ctx.pages,
1626                         PAGE_ALIGN(ch_ctx->gr_ctx.size) >> PAGE_SHIFT,
1627                         0, pgprot_dmacoherent(PAGE_KERNEL));
1628         if (!ctx_ptr)
1629                 return -ENOMEM;
1630
1631         for (i = 0; i < gr->ctx_vars.golden_image_size / 4; i++)
1632                 mem_wr32(ctx_ptr, i, gr->ctx_vars.local_golden_image[i]);
1633
1634         mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_save_ops_o(), 0, 0);
1635         mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_restore_ops_o(), 0, 0);
1636
1637         virt_addr_lo = u64_lo32(ch_ctx->patch_ctx.gpu_va);
1638         virt_addr_hi = u64_hi32(ch_ctx->patch_ctx.gpu_va);
1639
1640         mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_count_o(), 0,
1641                  ch_ctx->patch_ctx.data_count);
1642         mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_adr_lo_o(), 0,
1643                  virt_addr_lo);
1644         mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_adr_hi_o(), 0,
1645                  virt_addr_hi);
1646
1647         /* no client-managed performance counter ctx is in use, so disable PM ctxsw */
1648         ch_ctx->pm_ctx.ctx_sw_mode =
1649                 ctxsw_prog_main_image_pm_mode_no_ctxsw_f();
1650         data = mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0);
1651         data = data & ~ctxsw_prog_main_image_pm_mode_m();
1652         data |= ch_ctx->pm_ctx.ctx_sw_mode;
1653         mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0,
1654                  data);
1655
1656         mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_ptr_o(), 0, 0);
1657
1658         /* set priv access map */
1659         virt_addr_lo =
1660                  u64_lo32(ch_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]);
1661         virt_addr_hi =
1662                  u64_hi32(ch_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]);
1663
1664         mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_config_o(), 0,
1665                  ctxsw_prog_main_image_priv_access_map_config_mode_use_map_f());
1666         mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_addr_lo_o(), 0,
1667                  virt_addr_lo);
1668         mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_addr_hi_o(), 0,
1669                  virt_addr_hi);
1670         /* disable verif features */
1671         v = mem_rd32(ctx_ptr + ctxsw_prog_main_image_misc_options_o(), 0);
1672         v = v & ~(ctxsw_prog_main_image_misc_options_verif_features_m());
1673         v = v | ctxsw_prog_main_image_misc_options_verif_features_disabled_f();
1674         mem_wr32(ctx_ptr + ctxsw_prog_main_image_misc_options_o(), 0, v);
1675
1676
1677         vunmap(ctx_ptr);
1678
1679         gk20a_mm_l2_invalidate(g);
1680
1681         if (tegra_platform_is_linsim()) {
1682                 u32 inst_base_ptr =
1683                         u64_lo32(c->inst_block.cpu_pa
1684                         >> ram_in_base_shift_v());
1685
1686                 ret = gr_gk20a_submit_fecs_method_op(g,
1687                           (struct fecs_method_op_gk20a) {
1688                                   .method.data =
1689                                           (gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
1690                                            gr_fecs_current_ctx_target_vid_mem_f() |
1691                                            gr_fecs_current_ctx_valid_f(1)),
1692                                   .method.addr =
1693                                           gr_fecs_method_push_adr_restore_golden_v(),
1694                                   .mailbox = {
1695                                           .id = 0, .data = 0,
1696                                           .clr = ~0, .ret = NULL,
1697                                           .ok = gr_fecs_ctxsw_mailbox_value_pass_v(),
1698                                           .fail = 0},
1699                                   .cond.ok = GR_IS_UCODE_OP_EQUAL,
1700                                   .cond.fail = GR_IS_UCODE_OP_SKIP});
1701
1702                 if (ret)
1703                         nvhost_err(dev_from_gk20a(g),
1704                                    "restore context image failed");
1705         }
1706
1707         return ret;
1708 }
1709
1710 static void gr_gk20a_start_falcon_ucode(struct gk20a *g)
1711 {
1712         nvhost_dbg_fn("");
1713
1714         gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0),
1715                      gr_fecs_ctxsw_mailbox_clear_value_f(~0));
1716
1717         gk20a_writel(g, gr_gpccs_dmactl_r(), gr_gpccs_dmactl_require_ctx_f(0));
1718         gk20a_writel(g, gr_fecs_dmactl_r(), gr_fecs_dmactl_require_ctx_f(0));
1719
1720         gk20a_writel(g, gr_gpccs_cpuctl_r(), gr_gpccs_cpuctl_startcpu_f(1));
1721         gk20a_writel(g, gr_fecs_cpuctl_r(), gr_fecs_cpuctl_startcpu_f(1));
1722
1723         nvhost_dbg_fn("done");
1724 }
1725
1726 static int gr_gk20a_init_ctxsw_ucode_vaspace(struct gk20a *g)
1727 {
1728         struct mm_gk20a *mm = &g->mm;
1729         struct vm_gk20a *vm = &mm->pmu.vm;
1730         struct device *d = dev_from_gk20a(g);
1731         struct gk20a_ctxsw_ucode_info *p_ucode_info = &g->ctxsw_ucode_info;
1732         void *inst_ptr;
1733         u32 pde_addr_lo;
1734         u32 pde_addr_hi;
1735         u64 pde_addr;
1736
1737         /* Allocate memory for the instance block */
1738         p_ucode_info->inst_blk_desc.size = ram_in_alloc_size_v();
1739         p_ucode_info->inst_blk_desc.cpuva = dma_alloc_coherent(d,
1740                                         p_ucode_info->inst_blk_desc.size,
1741                                         &p_ucode_info->inst_blk_desc.iova,
1742                                         GFP_KERNEL);
1743         if (!p_ucode_info->inst_blk_desc.cpuva) {
1744                 nvhost_err(d, "failed to allocate memory\n");
1745                 return -ENOMEM;
1746         }
1747
1748         p_ucode_info->inst_blk_desc.cpu_pa = gk20a_get_phys_from_iova(d,
1749                                         p_ucode_info->inst_blk_desc.iova);
1750
1751         inst_ptr = p_ucode_info->inst_blk_desc.cpuva;
1752
1753         /* Set up the instance block */
1754         mem_wr32(inst_ptr, ram_in_adr_limit_lo_w(),
1755                  u64_lo32(vm->va_limit) | 0xFFF);
1756         mem_wr32(inst_ptr, ram_in_adr_limit_hi_w(),
1757                 ram_in_adr_limit_hi_f(u64_hi32(vm->va_limit)));
1758
1759         pde_addr = gk20a_mm_iova_addr(vm->pdes.sgt->sgl);
1760         pde_addr_lo = u64_lo32(pde_addr >> 12);
1761         pde_addr_hi = u64_hi32(pde_addr);
1762         mem_wr32(inst_ptr, ram_in_page_dir_base_lo_w(),
1763                 ram_in_page_dir_base_target_vid_mem_f() |
1764                 ram_in_page_dir_base_vol_true_f() |
1765                 ram_in_page_dir_base_lo_f(pde_addr_lo));
1766         mem_wr32(inst_ptr, ram_in_page_dir_base_hi_w(),
1767                 ram_in_page_dir_base_hi_f(pde_addr_hi));
1768
1769         /* Map ucode surface to GMMU */
1770         p_ucode_info->ucode_gpuva = gk20a_gmmu_map(vm,
1771                                         &p_ucode_info->surface_desc.sgt,
1772                                         p_ucode_info->surface_desc.size,
1773                                         0, /* flags */
1774                                         mem_flag_read_only);
1775         if (!p_ucode_info->ucode_gpuva) {
1776                 nvhost_err(d, "failed to update gmmu ptes\n");
1777                 return -ENOMEM;
1778         }
1779
1780         return 0;
1781 }
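/*
 * The instance block initialized above carries the PMU VM's address limit
 * and page directory base; gr_gk20a_load_falcon_bind_instblk() later binds
 * FECS to this block, so the read-only ucode surface mapped at
 * ucode_gpuva becomes visible to the falcons through that VM.
 */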
1782
1783 static void gr_gk20a_init_ctxsw_ucode_segment(
1784         struct gk20a_ctxsw_ucode_segment *p_seg, u32 *p_offset, u32 size)
1785 {
1786         p_seg->offset = *p_offset;
1787         p_seg->size = size;
1788         *p_offset = ALIGN(*p_offset + size, BLK_SIZE);
1789 }
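/*
 * Layout helper: each call records the current offset and size for one
 * segment and then advances *p_offset to the next BLK_SIZE (256 byte)
 * boundary.  As a worked example, boot/code/data sizes of 100/600/200
 * bytes starting at offset 0 land at offsets 0, 256 and 1024, leaving the
 * running offset at 1280 for the next ucode instance.
 */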
1790
1791 static void gr_gk20a_init_ctxsw_ucode_inst(
1792         struct gk20a_ctxsw_ucode_inst *p_inst, u32 *p_offset,
1793         struct gk20a_ctxsw_bootloader_desc *p_bootdesc,
1794         u32 code_size, u32 data_size)
1795 {
1796         u32 boot_size = ALIGN(p_bootdesc->bootloader_size, sizeof(u32));
1797         p_inst->boot_entry = p_bootdesc->bootloader_entry_point;
1798         p_inst->boot_imem_offset = p_bootdesc->bootloader_imem_offset;
1799         gr_gk20a_init_ctxsw_ucode_segment(&p_inst->boot, p_offset, boot_size);
1800         gr_gk20a_init_ctxsw_ucode_segment(&p_inst->code, p_offset, code_size);
1801         gr_gk20a_init_ctxsw_ucode_segment(&p_inst->data, p_offset, data_size);
1802 }
1803
1804 static int gr_gk20a_copy_ctxsw_ucode_inst(
1805         u8 *p_buf,
1806         struct gk20a_ctxsw_ucode_inst *p_inst,
1807         struct gk20a_ctxsw_bootloader_desc *p_bootdesc, u32 *p_bootimage,
1808         u32 *p_code, u32 *p_data)
1809 {
1810         memcpy(p_buf + p_inst->boot.offset, p_bootimage, p_inst->boot.size);
1811         memcpy(p_buf + p_inst->code.offset, p_code, p_inst->code.size);
1812         memcpy(p_buf + p_inst->data.offset, p_data, p_inst->data.size);
1813         return 0;
1814 }
1815
1816 static int gr_gk20a_init_ctxsw_ucode(struct gk20a *g)
1817 {
1818         struct device *d = dev_from_gk20a(g);
1819         struct mm_gk20a *mm = &g->mm;
1820         struct vm_gk20a *vm = &mm->pmu.vm;
1821         struct gk20a_ctxsw_bootloader_desc *p_fecs_boot_desc;
1822         struct gk20a_ctxsw_bootloader_desc *p_gpcs_boot_desc;
1823         const struct firmware *fecs_fw;
1824         const struct firmware *gpccs_fw;
1825         u32 *p_fecs_boot_image;
1826         u32 *p_gpcs_boot_image;
1827         struct gk20a_ctxsw_ucode_info *p_ucode_info = &g->ctxsw_ucode_info;
1828         u8 *p_buf;
1829         u32 ucode_size;
1830         int err = 0;
1831         DEFINE_DMA_ATTRS(attrs);
1832
1833         fecs_fw = nvhost_client_request_firmware(g->dev,
1834                                         GK20A_FECS_UCODE_IMAGE);
1835         if (!fecs_fw) {
1836                 nvhost_err(d, "failed to load fecs ucode!!");
1837                 return -ENOENT;
1838         }
1839
1840         p_fecs_boot_desc = (void *)fecs_fw->data;
1841         p_fecs_boot_image = (void *)(fecs_fw->data +
1842                                 sizeof(struct gk20a_ctxsw_bootloader_desc));
1843
1844         gpccs_fw = nvhost_client_request_firmware(g->dev,
1845                                         GK20A_GPCCS_UCODE_IMAGE);
1846         if (!gpccs_fw) {
1847                 release_firmware(fecs_fw);
1848                 nvhost_err(d, "failed to load gpccs ucode!!");
1849                 return -ENOENT;
1850         }
1851
1852         p_gpcs_boot_desc = (void *)gpccs_fw->data;
1853         p_gpcs_boot_image = (void *)(gpccs_fw->data +
1854                                 sizeof(struct gk20a_ctxsw_bootloader_desc));
1855
1856         ucode_size = 0;
1857         gr_gk20a_init_ctxsw_ucode_inst(&p_ucode_info->fecs, &ucode_size,
1858                 p_fecs_boot_desc,
1859                 g->gr.ctx_vars.ucode.fecs.inst.count * sizeof(u32),
1860                 g->gr.ctx_vars.ucode.fecs.data.count * sizeof(u32));
1861         gr_gk20a_init_ctxsw_ucode_inst(&p_ucode_info->gpcs, &ucode_size,
1862                 p_gpcs_boot_desc,
1863                 g->gr.ctx_vars.ucode.gpccs.inst.count * sizeof(u32),
1864                 g->gr.ctx_vars.ucode.gpccs.data.count * sizeof(u32));
1865
1866         p_ucode_info->surface_desc.size = ucode_size;
1867         dma_set_attr(DMA_ATTR_READ_ONLY, &attrs);
1868         p_ucode_info->surface_desc.cpuva = dma_alloc_attrs(d,
1869                                         p_ucode_info->surface_desc.size,
1870                                         &p_ucode_info->surface_desc.iova,
1871                                         GFP_KERNEL,
1872                                         &attrs);
1873         if (!p_ucode_info->surface_desc.cpuva) {
1874                 nvhost_err(d, "memory allocation failed\n");
1875                 err = -ENOMEM;
1876                 goto clean_up;
1877         }
1878
1879         err = gk20a_get_sgtable(d, &p_ucode_info->surface_desc.sgt,
1880                                 p_ucode_info->surface_desc.cpuva,
1881                                 p_ucode_info->surface_desc.iova,
1882                                 p_ucode_info->surface_desc.size);
1883         if (err) {
1884                 nvhost_err(d, "failed to create sg table\n");
1885                 goto clean_up;
1886         }
1887
1888         p_buf = (u8 *)p_ucode_info->surface_desc.cpuva;
1889         if (!p_buf) {
1890                 release_firmware(fecs_fw);
1891                 release_firmware(gpccs_fw);
1892                 nvhost_err(d, "failed to map surface desc buffer");
1893                 return -ENOMEM;
1894         }
1895
1896         gr_gk20a_copy_ctxsw_ucode_inst(p_buf, &p_ucode_info->fecs,
1897                 p_fecs_boot_desc, p_fecs_boot_image,
1898                 g->gr.ctx_vars.ucode.fecs.inst.l,
1899                 g->gr.ctx_vars.ucode.fecs.data.l);
1900
1901         gr_gk20a_copy_ctxsw_ucode_inst(p_buf, &p_ucode_info->gpcs,
1902                 p_gpcs_boot_desc, p_gpcs_boot_image,
1903                 g->gr.ctx_vars.ucode.gpccs.inst.l,
1904                 g->gr.ctx_vars.ucode.gpccs.data.l);
1905
1906         err = gr_gk20a_init_ctxsw_ucode_vaspace(g);
1907         if (err)
1908                 goto clean_up;
1909
1910         gk20a_free_sgtable(&p_ucode_info->surface_desc.sgt);
1911
1912         return 0;
1913
1914  clean_up:
1915         if (p_ucode_info->ucode_gpuva)
1916                 gk20a_gmmu_unmap(vm, p_ucode_info->ucode_gpuva,
1917                         p_ucode_info->surface_desc.size, mem_flag_none);
1918         if (p_ucode_info->surface_desc.sgt)
1919                 gk20a_free_sgtable(&p_ucode_info->surface_desc.sgt);
1920         if (p_ucode_info->surface_desc.cpuva)
1921                 dma_free_attrs(d, p_ucode_info->surface_desc.size,
1922                                 p_ucode_info->surface_desc.cpuva,
1923                                 p_ucode_info->surface_desc.iova,
1924                                 &attrs);
1925         p_ucode_info->surface_desc.cpuva = NULL;
1926         p_ucode_info->surface_desc.iova = 0;
1927
1928         return err;
1929 }
1930
1931 static void gr_gk20a_load_falcon_bind_instblk(struct gk20a *g)
1932 {
1933         struct gk20a_ctxsw_ucode_info *p_ucode_info = &g->ctxsw_ucode_info;
1934         int retries = 20;
1935         phys_addr_t inst_ptr;
1936         u32 val;
1937
1938         while ((gk20a_readl(g, gr_fecs_ctxsw_status_1_r()) &
1939                         gr_fecs_ctxsw_status_1_arb_busy_m()) && retries) {
1940                 udelay(2);
1941                 retries--;
1942         }
1943         if (!retries)
1944                 nvhost_err(dev_from_gk20a(g), "arbiter idle timeout");
1945
1946         gk20a_writel(g, gr_fecs_arb_ctx_adr_r(), 0x0);
1947
1948         inst_ptr = p_ucode_info->inst_blk_desc.cpu_pa;
1949         gk20a_writel(g, gr_fecs_new_ctx_r(),
1950                         gr_fecs_new_ctx_ptr_f(inst_ptr >> 12) |
1951                         gr_fecs_new_ctx_target_m() |
1952                         gr_fecs_new_ctx_valid_m());
1953
1954         gk20a_writel(g, gr_fecs_arb_ctx_ptr_r(),
1955                         gr_fecs_arb_ctx_ptr_ptr_f(inst_ptr >> 12) |
1956                         gr_fecs_arb_ctx_ptr_target_m());
1957
1958         gk20a_writel(g, gr_fecs_arb_ctx_cmd_r(), 0x7);
1959
1960         /* Wait for arbiter command to complete */
1961         retries = 20;
1962         val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
1963         while (gr_fecs_arb_ctx_cmd_cmd_v(val) && retries) {
1964                 udelay(2);
1965                 retries--;
1966                 val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
1967         }
1968         if (!retries)
1969                 nvhost_err(dev_from_gk20a(g), "arbiter complete timeout");
1970
1971         gk20a_writel(g, gr_fecs_current_ctx_r(),
1972                         gr_fecs_current_ctx_ptr_f(inst_ptr >> 12) |
1973                         gr_fecs_current_ctx_target_m() |
1974                         gr_fecs_current_ctx_valid_m());
1975         /* Send command to arbiter to flush */
1976         gk20a_writel(g, gr_fecs_arb_ctx_cmd_r(), gr_fecs_arb_ctx_cmd_cmd_s());
1977
1978         retries = 20;
1979         val = (gk20a_readl(g, gr_fecs_arb_ctx_cmd_r()));
1980         while (gr_fecs_arb_ctx_cmd_cmd_v(val) && retries) {
1981                 udelay(2);
1982                 retries--;
1983                 val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
1984         }
1985         if (!retries)
1986                 nvhost_err(dev_from_gk20a(g), "arbiter complete timeout");
1987 }
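/*
 * Binding sequence used above: wait for the FECS arbiter to go idle, point
 * both gr_fecs_new_ctx and gr_fecs_arb_ctx_ptr at the ucode instance
 * block, kick the arbiter, then make the same block the current context
 * and kick the arbiter once more, each time polling gr_fecs_arb_ctx_cmd
 * until the command field clears.  Timeouts are only logged; there is no
 * recovery path here.
 */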
1988
1989 static int gr_gk20a_load_ctxsw_ucode_inst(struct gk20a *g, u64 addr_base,
1990         struct gk20a_ctxsw_ucode_inst *p_inst, u32 reg_offset)
1991 {
1992         u32 addr_code32;
1993         u32 addr_data32;
1994         u32 addr_load32;
1995         u32 dst = 0;
1996         u32 blocks;
1997         u32 b;
1998
1999         addr_code32 = u64_lo32((addr_base + p_inst->code.offset) >> 8);
2000         addr_data32 = u64_lo32((addr_base + p_inst->data.offset) >> 8);
2001         addr_load32 = u64_lo32((addr_base + p_inst->boot.offset) >> 8);
2002
2003         gk20a_writel(g, reg_offset + gr_fecs_dmactl_r(),
2004                         gr_fecs_dmactl_require_ctx_f(0));
2005
2006         /*
2007          * Copy falcon bootloader header into dmem at offset 0.
2008          * Configure dmem port 0 for auto-incrementing writes starting at dmem
2009          * offset 0.
2010          */
2011         gk20a_writel(g, reg_offset + gr_fecs_dmemc_r(0),
2012                         gr_fecs_dmemc_offs_f(0) |
2013                         gr_fecs_dmemc_blk_f(0) |
2014                         gr_fecs_dmemc_aincw_f(1));
2015
2016         /* Write out the actual data */
2017         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2018         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), addr_code32);
2019         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2020         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), p_inst->code.size);
2021         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2022         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), addr_data32);
2023         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), p_inst->data.size);
2024         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), addr_code32);
2025         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2026         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2027
2028         blocks = ((p_inst->boot.size + 0xFF) & ~0xFF) >> 8;
2029
2030         /*
2031          * Set the base FB address for the DMA transfer. Subtract off the 256
2032          * byte IMEM block offset such that the relative FB and IMEM offsets
2033          * match, allowing the IMEM tags to be properly created.
2034          */
2035
2036         dst = p_inst->boot_imem_offset;
2037         gk20a_writel(g, reg_offset + gr_fecs_dmatrfbase_r(),
2038                         (addr_load32 - (dst >> 8)));
2039
2040         for (b = 0; b < blocks; b++) {
2041                 /* Setup destination IMEM offset */
2042                 gk20a_writel(g, reg_offset + gr_fecs_dmatrfmoffs_r(),
2043                                 dst + (b << 8));
2044
2045                 /* Setup source offset (relative to BASE) */
2046                 gk20a_writel(g, reg_offset + gr_fecs_dmatrffboffs_r(),
2047                                 dst + (b << 8));
2048
2049                 gk20a_writel(g, reg_offset + gr_fecs_dmatrfcmd_r(),
2050                                 gr_fecs_dmatrfcmd_imem_f(0x01) |
2051                                 gr_fecs_dmatrfcmd_write_f(0x00) |
2052                                 gr_fecs_dmatrfcmd_size_f(0x06) |
2053                                 gr_fecs_dmatrfcmd_ctxdma_f(0));
2054         }
2055
2056         /* Specify the falcon boot vector */
2057         gk20a_writel(g, reg_offset + gr_fecs_bootvec_r(),
2058                         gr_fecs_bootvec_vec_f(p_inst->boot_entry));
2059
2060         /* Write to CPUCTL to start the falcon */
2061         gk20a_writel(g, reg_offset + gr_fecs_cpuctl_r(),
2062                         gr_fecs_cpuctl_startcpu_f(0x01));
2063
2064         return 0;
2065 }
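/*
 * The ten DMEM writes above appear to form the bootloader header the
 * falcon expects at DMEM offset 0: code and data FB offsets (in 256-byte
 * units) and sizes, pushed through auto-incrementing DMEM port 0.  The
 * boot image itself is then DMAed into IMEM block by block, with
 * dmatrfbase chosen so that FB and IMEM offsets line up (see the comment
 * above), before the boot vector is set and the falcon CPU is started.
 */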
2066
2067 static void gr_gk20a_load_falcon_with_bootloader(struct gk20a *g)
2068 {
2069         struct gk20a_ctxsw_ucode_info *p_ucode_info = &g->ctxsw_ucode_info;
2070         u64 addr_base = p_ucode_info->ucode_gpuva;
2071
2072         gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0x0);
2073
2074         gr_gk20a_load_falcon_bind_instblk(g);
2075
2076         gr_gk20a_load_ctxsw_ucode_inst(g, addr_base,
2077                 &g->ctxsw_ucode_info.fecs, 0);
2078
2079         gr_gk20a_load_ctxsw_ucode_inst(g, addr_base,
2080                 &g->ctxsw_ucode_info.gpcs,
2081                 gr_gpcs_gpccs_falcon_hwcfg_r() -
2082                 gr_fecs_falcon_hwcfg_r());
2083 }
2084
2085 static int gr_gk20a_load_ctxsw_ucode(struct gk20a *g, struct gr_gk20a *gr)
2086 {
2087         u32 ret;
2088
2089         nvhost_dbg_fn("");
2090
2091         if (tegra_platform_is_linsim()) {
2092                 gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(7),
2093                         gr_fecs_ctxsw_mailbox_value_f(0xc0de7777));
2094                 gk20a_writel(g, gr_gpccs_ctxsw_mailbox_r(7),
2095                         gr_gpccs_ctxsw_mailbox_value_f(0xc0de7777));
2096         }
2097
2098         /*
2099          * In case the gPMU falcon is not being used, revert to the old way of
2100          * loading gr ucode, without the faster bootstrap routine.
2101          */
2102         if (!support_gk20a_pmu()) {
2103                 gr_gk20a_load_falcon_dmem(g);
2104                 gr_gk20a_load_falcon_imem(g);
2105                 gr_gk20a_start_falcon_ucode(g);
2106         } else {
2107                 if (!gr->skip_ucode_init)
2108                         gr_gk20a_init_ctxsw_ucode(g);
2109                 gr_gk20a_load_falcon_with_bootloader(g);
2110                 gr->skip_ucode_init = true;
2111         }
2112
2113         ret = gr_gk20a_ctx_wait_ucode(g, 0, 0,
2114                                       GR_IS_UCODE_OP_EQUAL,
2115                                       eUcodeHandshakeInitComplete,
2116                                       GR_IS_UCODE_OP_SKIP, 0);
2117         if (ret) {
2118                 nvhost_err(dev_from_gk20a(g), "falcon ucode init timeout");
2119                 return ret;
2120         }
2121
2122         if (support_gk20a_pmu())
2123                 gk20a_writel(g, gr_fecs_current_ctx_r(),
2124                         gr_fecs_current_ctx_valid_false_f());
2125
2126         gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0xffffffff);
2127         gk20a_writel(g, gr_fecs_method_data_r(), 0x7fffffff);
2128         gk20a_writel(g, gr_fecs_method_push_r(),
2129                      gr_fecs_method_push_adr_set_watchdog_timeout_f());
2130
2131         nvhost_dbg_fn("done");
2132         return 0;
2133 }
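/*
 * Whichever load path is taken, gr_gk20a_ctx_wait_ucode() then waits for
 * FECS to report eUcodeHandshakeInitComplete before the watchdog timeout
 * method is pushed.  The 0xc0de7777 mailbox writes at the top of the
 * function are only issued on the linsim simulation platform.
 */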
2134
2135 static int gr_gk20a_init_ctx_state(struct gk20a *g, struct gr_gk20a *gr)
2136 {
2137         u32 golden_ctx_image_size = 0;
2138         u32 zcull_ctx_image_size = 0;
2139         u32 pm_ctx_image_size = 0;
2140         u32 ret;
2141         struct fecs_method_op_gk20a op = {
2142                 .mailbox = { .id = 0, .data = 0,
2143                              .clr = ~0, .ok = 0, .fail = 0},
2144                 .method.data = 0,
2145                 .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL,
2146                 .cond.fail = GR_IS_UCODE_OP_SKIP,
2147                 };
2148
2149         nvhost_dbg_fn("");
2150         op.method.addr = gr_fecs_method_push_adr_discover_image_size_v();
2151         op.mailbox.ret = &golden_ctx_image_size;
2152         ret = gr_gk20a_submit_fecs_method_op(g, op);
2153         if (ret) {
2154                 nvhost_err(dev_from_gk20a(g),
2155                            "query golden image size failed");
2156                 return ret;
2157         }
2158         op.method.addr = gr_fecs_method_push_adr_discover_zcull_image_size_v();
2159         op.mailbox.ret = &zcull_ctx_image_size;
2160         ret = gr_gk20a_submit_fecs_method_op(g, op);
2161         if (ret) {
2162                 nvhost_err(dev_from_gk20a(g),
2163                            "query zcull ctx image size failed");
2164                 return ret;
2165         }
2166         op.method.addr = gr_fecs_method_push_adr_discover_pm_image_size_v();
2167         op.mailbox.ret = &pm_ctx_image_size;
2168         ret = gr_gk20a_submit_fecs_method_op(g, op);
2169         if (ret) {
2170                 nvhost_err(dev_from_gk20a(g),
2171                            "query pm ctx image size failed");
2172                 return ret;
2173         }
2174
2175         if (!g->gr.ctx_vars.golden_image_size &&
2176             !g->gr.ctx_vars.zcull_ctxsw_image_size) {
2177                 g->gr.ctx_vars.golden_image_size = golden_ctx_image_size;
2178                 g->gr.ctx_vars.zcull_ctxsw_image_size = zcull_ctx_image_size;
2179         } else {
2180                 /* hw is different after railgating? */
2181                 BUG_ON(g->gr.ctx_vars.golden_image_size != golden_ctx_image_size);
2182                 BUG_ON(g->gr.ctx_vars.zcull_ctxsw_image_size != zcull_ctx_image_size);
2183         }
2184
2185         g->gr.ctx_vars.priv_access_map_size = 512 * 1024;
2186
2187         nvhost_dbg_fn("done");
2188         return 0;
2189 }
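/*
 * The three image sizes above come straight from the FECS "discover size"
 * methods.  The golden and zcull sizes are cached in ctx_vars and are
 * expected to be identical across rail-gating cycles (hence the BUG_ON
 * checks); the PM image size is queried but not stored, and the priv
 * access map size is simply fixed at 512 KB.
 */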
2190
2191 static int gr_gk20a_alloc_global_ctx_buffers(struct gk20a *g)
2192 {
2193         struct gr_gk20a *gr = &g->gr;
2194         struct mem_mgr *memmgr = mem_mgr_from_g(g);
2195         struct mem_handle *mem;
2196         u32 i, attr_buffer_size;
2197
2198         u32 cb_buffer_size = gr_scc_bundle_cb_size_div_256b__prod_v() *
2199                 gr_scc_bundle_cb_size_div_256b_byte_granularity_v();
2200
2201         u32 pagepool_buffer_size = gr_scc_pagepool_total_pages_hwmax_value_v() *
2202                 gr_scc_pagepool_total_pages_byte_granularity_v();
2203
2204         u32 attr_cb_default_size = gr_gpc0_ppc0_cbm_cfg_size_default_v();
2205         u32 alpha_cb_default_size = gr_gpc0_ppc0_cbm_cfg2_size_default_v();
2206
2207         u32 attr_cb_size =
2208                 attr_cb_default_size + (attr_cb_default_size >> 1);
2209         u32 alpha_cb_size =
2210                 alpha_cb_default_size + (alpha_cb_default_size >> 1);
2211
2212         u32 num_tpcs_per_pes = proj_scal_litter_num_tpcs_per_pes_v();
2213         u32 attr_max_size_per_tpc =
2214                 gr_gpc0_ppc0_cbm_cfg_size_v(~0) / num_tpcs_per_pes;
2215         u32 alpha_max_size_per_tpc =
2216                 gr_gpc0_ppc0_cbm_cfg2_size_v(~0) / num_tpcs_per_pes;
2217
2218
2219         nvhost_dbg_fn("");
2220
2221         attr_cb_size =
2222                 (attr_cb_size > attr_max_size_per_tpc) ?
2223                         attr_max_size_per_tpc : attr_cb_size;
2224         attr_cb_default_size =
2225                 (attr_cb_default_size > attr_cb_size) ?
2226                         attr_cb_size : attr_cb_default_size;
2227         alpha_cb_size =
2228                 (alpha_cb_size > alpha_max_size_per_tpc) ?
2229                         alpha_max_size_per_tpc : alpha_cb_size;
2230         alpha_cb_default_size =
2231                 (alpha_cb_default_size > alpha_cb_size) ?
2232                         alpha_cb_size : alpha_cb_default_size;
2233
2234         attr_buffer_size =
2235                 (gr_gpc0_ppc0_cbm_cfg_size_granularity_v() * alpha_cb_size +
2236                  gr_gpc0_ppc0_cbm_cfg2_size_granularity_v() * alpha_cb_size) *
2237                  gr->gpc_count;
2238
2239         nvhost_dbg_info("cb_buffer_size : %d", cb_buffer_size);
2240
2241         mem = nvhost_memmgr_alloc(memmgr, cb_buffer_size,
2242                                   DEFAULT_ALLOC_ALIGNMENT,
2243                                   DEFAULT_ALLOC_FLAGS,
2244                                   0);
2245         if (IS_ERR(mem))
2246                 goto clean_up;
2247
2248         gr->global_ctx_buffer[CIRCULAR].ref = mem;
2249         gr->global_ctx_buffer[CIRCULAR].size = cb_buffer_size;
2250
2251         mem = nvhost_memmgr_alloc(memmgr, cb_buffer_size,
2252                                   DEFAULT_ALLOC_ALIGNMENT,
2253                                   DEFAULT_ALLOC_FLAGS,
2254                                   NVMAP_HEAP_CARVEOUT_VPR);
2255         if (!IS_ERR(mem)) {
2256                 gr->global_ctx_buffer[CIRCULAR_VPR].ref = mem;
2257                 gr->global_ctx_buffer[CIRCULAR_VPR].size = cb_buffer_size;
2258         }
2259
2260         nvhost_dbg_info("pagepool_buffer_size : %d", pagepool_buffer_size);
2261
2262         mem = nvhost_memmgr_alloc(memmgr, pagepool_buffer_size,
2263                                   DEFAULT_ALLOC_ALIGNMENT,
2264                                   DEFAULT_ALLOC_FLAGS,
2265                                   0);
2266         if (IS_ERR(mem))
2267                 goto clean_up;
2268
2269         gr->global_ctx_buffer[PAGEPOOL].ref = mem;
2270         gr->global_ctx_buffer[PAGEPOOL].size = pagepool_buffer_size;
2271
2272         mem = nvhost_memmgr_alloc(memmgr, pagepool_buffer_size,
2273                                   DEFAULT_ALLOC_ALIGNMENT,
2274                                   DEFAULT_ALLOC_FLAGS,
2275                                   NVMAP_HEAP_CARVEOUT_VPR);
2276         if (!IS_ERR(mem)) {
2277                 gr->global_ctx_buffer[PAGEPOOL_VPR].ref = mem;
2278                 gr->global_ctx_buffer[PAGEPOOL_VPR].size = pagepool_buffer_size;
2279         }
2280
2281         nvhost_dbg_info("attr_buffer_size : %d", attr_buffer_size);
2282
2283         mem = nvhost_memmgr_alloc(memmgr, attr_buffer_size,
2284                                   DEFAULT_ALLOC_ALIGNMENT,
2285                                   DEFAULT_ALLOC_FLAGS,
2286                                   0);
2287         if (IS_ERR(mem))
2288                 goto clean_up;
2289
2290         gr->global_ctx_buffer[ATTRIBUTE].ref = mem;
2291         gr->global_ctx_buffer[ATTRIBUTE].size = attr_buffer_size;
2292
2293         mem = nvhost_memmgr_alloc(memmgr, attr_buffer_size,
2294                                   DEFAULT_ALLOC_ALIGNMENT,
2295                                   DEFAULT_ALLOC_FLAGS,
2296                                   NVMAP_HEAP_CARVEOUT_VPR);
2297         if (!IS_ERR(mem)) {
2298                 gr->global_ctx_buffer[ATTRIBUTE_VPR].ref = mem;
2299                 gr->global_ctx_buffer[ATTRIBUTE_VPR].size = attr_buffer_size;
2300         }
2301
2302         nvhost_dbg_info("golden_image_size : %d",
2303                    gr->ctx_vars.golden_image_size);
2304
2305         mem = nvhost_memmgr_alloc(memmgr, gr->ctx_vars.golden_image_size,
2306                                   DEFAULT_ALLOC_ALIGNMENT,
2307                                   DEFAULT_ALLOC_FLAGS,
2308                                   0);
2309         if (IS_ERR(mem))
2310                 goto clean_up;
2311
2312         gr->global_ctx_buffer[GOLDEN_CTX].ref = mem;
2313         gr->global_ctx_buffer[GOLDEN_CTX].size =
2314                 gr->ctx_vars.golden_image_size;
2315
2316         nvhost_dbg_info("priv_access_map_size : %d",
2317                    gr->ctx_vars.priv_access_map_size);
2318
2319         mem = nvhost_memmgr_alloc(memmgr, gr->ctx_vars.priv_access_map_size,
2320                                   DEFAULT_ALLOC_ALIGNMENT,
2321                                   DEFAULT_ALLOC_FLAGS,
2322                                   0);
2323         if (IS_ERR(mem))
2324                 goto clean_up;
2325
2326         gr->global_ctx_buffer[PRIV_ACCESS_MAP].ref = mem;
2327         gr->global_ctx_buffer[PRIV_ACCESS_MAP].size =
2328                 gr->ctx_vars.priv_access_map_size;
2329
2330         nvhost_dbg_fn("done");
2331         return 0;
2332
2333  clean_up:
2334         nvhost_err(dev_from_gk20a(g), "fail");
2335         for (i = 0; i < NR_GLOBAL_CTX_BUF; i++) {
2336                 if (gr->global_ctx_buffer[i].ref) {
2337                         nvhost_memmgr_put(memmgr,
2338                                           gr->global_ctx_buffer[i].ref);
2339                         memset(&gr->global_ctx_buffer[i],
2340                                 0, sizeof(struct mem_desc));
2341                 }
2342         }
2343         return -ENOMEM;
2344 }
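/*
 * Sizing notes: the circular buffer and pagepool sizes are derived from
 * the __prod/hwmax register values times their byte granularity, and the
 * attribute buffer from the clamped per-PPC CB sizes scaled by the GPC
 * count.  The CB, pagepool and attribute buffers each also get an optional
 * VPR-carveout twin for VPR channels; failure to allocate a VPR copy is
 * deliberately not treated as fatal.
 */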
2345
2346 static void gr_gk20a_free_global_ctx_buffers(struct gk20a *g)
2347 {
2348         struct gr_gk20a *gr = &g->gr;
2349         struct mem_mgr *memmgr = mem_mgr_from_g(g);
2350         u32 i;
2351
2352         for (i = 0; i < NR_GLOBAL_CTX_BUF; i++) {
2353                 nvhost_memmgr_put(memmgr, gr->global_ctx_buffer[i].ref);
2354                 memset(&gr->global_ctx_buffer[i], 0, sizeof(struct mem_desc));
2355         }
2356
2357         nvhost_dbg_fn("done");
2358 }
2359
2360 static int gr_gk20a_map_global_ctx_buffers(struct gk20a *g,
2361                                         struct channel_gk20a *c)
2362 {
2363         struct vm_gk20a *ch_vm = c->vm;
2364         struct mem_mgr *memmgr = mem_mgr_from_g(g);
2365         struct mem_handle *handle_ref;
2366         u64 *g_bfr_va = c->ch_ctx.global_ctx_buffer_va;
2367         struct gr_gk20a *gr = &g->gr;
2368         u64 gpu_va;
2369         u32 i;
2370         nvhost_dbg_fn("");
2371
2372         /* Circular Buffer */
2373         if (!c->vpr || (gr->global_ctx_buffer[CIRCULAR_VPR].ref == NULL))
2374                 handle_ref = gr->global_ctx_buffer[CIRCULAR].ref;
2375         else
2376                 handle_ref = gr->global_ctx_buffer[CIRCULAR_VPR].ref;
2377
2378         gpu_va = gk20a_vm_map(ch_vm, memmgr, handle_ref,
2379                               /*offset_align, flags, kind*/
2380                               0, NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, 0,
2381                               NULL, false, mem_flag_none);
2382         if (!gpu_va)
2383                 goto clean_up;
2384         g_bfr_va[CIRCULAR_VA] = gpu_va;
2385
2386         /* Attribute Buffer */
2387         if (!c->vpr || (gr->global_ctx_buffer[ATTRIBUTE_VPR].ref == NULL))
2388                 handle_ref = gr->global_ctx_buffer[ATTRIBUTE].ref;
2389         else
2390                 handle_ref = gr->global_ctx_buffer[ATTRIBUTE_VPR].ref;
2391
2392         gpu_va = gk20a_vm_map(ch_vm, memmgr, handle_ref,
2393                               /*offset_align, flags, kind*/
2394                               0, NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, 0,
2395                               NULL, false, mem_flag_none);
2396         if (!gpu_va)
2397                 goto clean_up;
2398         g_bfr_va[ATTRIBUTE_VA] = gpu_va;
2399
2400         /* Page Pool */
2401         if (!c->vpr || (gr->global_ctx_buffer[PAGEPOOL_VPR].ref == NULL))
2402                 handle_ref = gr->global_ctx_buffer[PAGEPOOL].ref;
2403         else
2404                 handle_ref = gr->global_ctx_buffer[PAGEPOOL_VPR].ref;
2405
2406         gpu_va = gk20a_vm_map(ch_vm, memmgr, handle_ref,
2407                               /*offset_align, flags, kind*/
2408                               0, NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, 0,
2409                               NULL, false, mem_flag_none);
2410         if (!gpu_va)
2411                 goto clean_up;
2412         g_bfr_va[PAGEPOOL_VA] = gpu_va;
2413
2414         /* Golden Image */
2415         gpu_va = gk20a_vm_map(ch_vm, memmgr,
2416                               gr->global_ctx_buffer[GOLDEN_CTX].ref,
2417                               /*offset_align, flags, kind*/
2418                               0, 0, 0, NULL, false, mem_flag_none);
2419         if (!gpu_va)
2420                 goto clean_up;
2421         g_bfr_va[GOLDEN_CTX_VA] = gpu_va;
2422
2423         /* Priv register Access Map */
2424         gpu_va = gk20a_vm_map(ch_vm, memmgr,
2425                               gr->global_ctx_buffer[PRIV_ACCESS_MAP].ref,
2426                               /*offset_align, flags, kind*/
2427                               0, 0, 0, NULL, false,
2428                               mem_flag_none);
2429         if (!gpu_va)
2430                 goto clean_up;
2431         g_bfr_va[PRIV_ACCESS_MAP_VA] = gpu_va;
2432
2433         c->ch_ctx.global_ctx_buffer_mapped = true;
2434         return 0;
2435
2436  clean_up:
2437         for (i = 0; i < NR_GLOBAL_CTX_BUF_VA; i++) {
2438                 if (g_bfr_va[i]) {
2439                         gk20a_vm_unmap(ch_vm, g_bfr_va[i]);
2440                         g_bfr_va[i] = 0;
2441                 }
2442         }
2443         return -ENOMEM;
2444 }
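/*
 * Each global buffer (or its VPR twin, for VPR channels when one exists)
 * is mapped into the channel's VM above and the resulting GPU VAs are
 * stashed in ch_ctx->global_ctx_buffer_va[] for later commit into the
 * context image; on failure everything mapped so far is unmapped again.
 */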
2445
2446 static void gr_gk20a_unmap_global_ctx_buffers(struct channel_gk20a *c)
2447 {
2448         struct vm_gk20a *ch_vm = c->vm;
2449         u64 *g_bfr_va = c->ch_ctx.global_ctx_buffer_va;
2450         u32 i;
2451
2452         nvhost_dbg_fn("");
2453
2454         for (i = 0; i < NR_GLOBAL_CTX_BUF_VA; i++) {
2455                 if (g_bfr_va[i]) {
2456                         gk20a_vm_unmap(ch_vm, g_bfr_va[i]);
2457                         g_bfr_va[i] = 0;
2458                 }
2459         }
2460         c->ch_ctx.global_ctx_buffer_mapped = false;
2461 }
2462
2463 static int gr_gk20a_alloc_channel_gr_ctx(struct gk20a *g,
2464                                 struct channel_gk20a *c)
2465 {
2466         struct gr_gk20a *gr = &g->gr;
2467         struct gr_ctx_desc *gr_ctx = &c->ch_ctx.gr_ctx;
2468         struct vm_gk20a *ch_vm = c->vm;
2469         struct device *d = dev_from_gk20a(g);
2470         struct sg_table *sgt;
2471         DEFINE_DMA_ATTRS(attrs);
2472         int err = 0;
2473
2474         nvhost_dbg_fn("");
2475
2476         if (gr->ctx_vars.buffer_size == 0)
2477                 return 0;
2478
2479         /* alloc channel gr ctx buffer */
2480         gr->ctx_vars.buffer_size = gr->ctx_vars.golden_image_size;
2481         gr->ctx_vars.buffer_total_size = gr->ctx_vars.golden_image_size;
2482
2483         gr_ctx->size = gr->ctx_vars.buffer_total_size;
2484         dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2485         gr_ctx->pages = dma_alloc_attrs(d, gr_ctx->size,
2486                                 &gr_ctx->iova, GFP_KERNEL, &attrs);
2487         if (!gr_ctx->pages)
2488                 return -ENOMEM;
2489
2490         err = gk20a_get_sgtable_from_pages(d, &sgt, gr_ctx->pages,
2491                         gr_ctx->iova, gr_ctx->size);
2492         if (err)
2493                 goto err_free;
2494
2495         gr_ctx->gpu_va = gk20a_gmmu_map(ch_vm, &sgt, gr_ctx->size,
2496                                 NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
2497                                 mem_flag_none);
2498         if (!gr_ctx->gpu_va)
2499                 goto err_free_sgt;
2500
2501         gk20a_free_sgtable(&sgt);
2502
2503         return 0;
2504
2505  err_free_sgt:
2506         gk20a_free_sgtable(&sgt);
2507  err_free:
2508         dma_free_attrs(d, gr_ctx->size,
2509                 gr_ctx->pages, gr_ctx->iova, &attrs);
2510         gr_ctx->pages = NULL;
2511         gr_ctx->iova = 0;
2512
2513         return err;
2514 }
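/*
 * The gr_ctx backing store is allocated with DMA_ATTR_NO_KERNEL_MAPPING,
 * so the value returned by dma_alloc_attrs() is kept as a page list in
 * gr_ctx->pages rather than used as a CPU pointer; that is why callers
 * such as gr_gk20a_load_golden_ctx_image() vmap() the pages whenever CPU
 * access is needed.
 */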
2515
2516 static void gr_gk20a_free_channel_gr_ctx(struct channel_gk20a *c)
2517 {
2518         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
2519         struct vm_gk20a *ch_vm = c->vm;
2520         struct gk20a *g = c->g;
2521         struct device *d = dev_from_gk20a(g);
2522         DEFINE_DMA_ATTRS(attrs);
2523
2524         nvhost_dbg_fn("");
2525
2526         gk20a_gmmu_unmap(ch_vm, ch_ctx->gr_ctx.gpu_va,
2527                         ch_ctx->gr_ctx.size, mem_flag_none);
2528         dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2529         dma_free_attrs(d, ch_ctx->gr_ctx.size,
2530                 ch_ctx->gr_ctx.pages, ch_ctx->gr_ctx.iova, &attrs);
2531         ch_ctx->gr_ctx.pages = NULL;
2532         ch_ctx->gr_ctx.iova = 0;
2533 }
2534
2535 static int gr_gk20a_alloc_channel_patch_ctx(struct gk20a *g,
2536                                 struct channel_gk20a *c)
2537 {
2538         struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
2539         struct mem_mgr *memmgr = gk20a_channel_mem_mgr(c);
2540         struct vm_gk20a *ch_vm = c->vm;
2541
2542         nvhost_dbg_fn("");
2543
2544         patch_ctx->mem.ref = nvhost_memmgr_alloc(memmgr, 128 * sizeof(u32),
2545                                                  DEFAULT_ALLOC_ALIGNMENT,
2546                                                  DEFAULT_ALLOC_FLAGS,
2547                                                  0);
2548         if (IS_ERR(patch_ctx->mem.ref))
2549                 return -ENOMEM;
2550
2551         patch_ctx->gpu_va = gk20a_vm_map(ch_vm, memmgr,
2552                                          patch_ctx->mem.ref,
2553                                          /*offset_align, flags, kind*/
2554                                          0, 0, 0, NULL, false, mem_flag_none);
2555         if (!patch_ctx->gpu_va)
2556                 goto clean_up;
2557
2558         nvhost_dbg_fn("done");
2559         return 0;
2560
2561  clean_up:
2562         nvhost_err(dev_from_gk20a(g), "fail");
2563         if (patch_ctx->mem.ref) {
2564                 nvhost_memmgr_put(memmgr, patch_ctx->mem.ref);
2565                 patch_ctx->mem.ref = 0;
2566         }
2567
2568         return -ENOMEM;
2569 }
2570
2571 static void gr_gk20a_unmap_channel_patch_ctx(struct channel_gk20a *c)
2572 {
2573         struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
2574         struct vm_gk20a *ch_vm = c->vm;
2575
2576         nvhost_dbg_fn("");
2577
2578         if (patch_ctx->gpu_va)
2579                 gk20a_vm_unmap(ch_vm, patch_ctx->gpu_va);
2580         patch_ctx->gpu_va = 0;
2581         patch_ctx->data_count = 0;
2582 }
2583
2584 static void gr_gk20a_free_channel_patch_ctx(struct channel_gk20a *c)
2585 {
2586         struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
2587         struct mem_mgr *memmgr = gk20a_channel_mem_mgr(c);
2588
2589         nvhost_dbg_fn("");
2590
2591         gr_gk20a_unmap_channel_patch_ctx(c);
2592
2593         if (patch_ctx->mem.ref) {
2594                 nvhost_memmgr_put(memmgr, patch_ctx->mem.ref);
2595                 patch_ctx->mem.ref = 0;
2596         }
2597 }
2598
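/* Tear down all per-channel gr context state: global ctx buffer mappings,
 * patch ctx and gr ctx; the remaining ch_ctx fields (zcull/pm) are simply
 * zeroed by the memset below. */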
2599 void gk20a_free_channel_ctx(struct channel_gk20a *c)
2600 {
2601         gr_gk20a_unmap_global_ctx_buffers(c);
2602         gr_gk20a_free_channel_patch_ctx(c);
2603         gr_gk20a_free_channel_gr_ctx(c);
2604
2605         /* zcull_ctx, pm_ctx */
2606
2607         memset(&c->ch_ctx, 0, sizeof(struct channel_ctx_gk20a));
2608
2609         c->num_objects = 0;
2610         c->first_init = false;
2611 }
2612
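/*
 * Allocate the per-channel graphics context for an object class.  The channel
 * must already be bound to an address space.  Rough sketch of a call site
 * (the ioctl plumbing that normally calls this lives outside this file and is
 * assumed, not shown here):
 *
 *     struct nvhost_alloc_obj_ctx_args args = { .class_num = KEPLER_C };
 *     err = gk20a_alloc_obj_ctx(c, &args);
 *
 * The first call on a channel allocates and commits the gr ctx, allocates the
 * patch ctx, maps and commits the global ctx buffers, and initializes the
 * golden image and loads it into the new context.
 */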
2613 int gk20a_alloc_obj_ctx(struct channel_gk20a  *c,
2614                         struct nvhost_alloc_obj_ctx_args *args)
2615 {
2616         struct gk20a *g = c->g;
2617         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
2618         bool change_to_compute_mode = false;
2619         int err = 0;
2620
2621         nvhost_dbg_fn("");
2622
2623         /* an address space needs to have been bound at this point.*/
2624         if (!gk20a_channel_as_bound(c)) {
2625                 nvhost_err(dev_from_gk20a(g),
2626                            "not bound to an address space at time"
2627                            " of grctx allocation");
2628                 return -EINVAL;
2629         }
2630
2631         switch (args->class_num) {
2632         case KEPLER_COMPUTE_A:
2633                 /* tbd: NV2080_CTRL_GPU_COMPUTE_MODE_RULES_EXCLUSIVE_COMPUTE */
2634                 /* tbd: PDB_PROP_GRAPHICS_DISTINCT_3D_AND_COMPUTE_STATE_DEF  */
2635                 change_to_compute_mode = true;
2636                 break;
2637         case KEPLER_C:
2638         case FERMI_TWOD_A:
2639         case KEPLER_DMA_COPY_A:
2640                 break;
2641
2642         default:
2643                 nvhost_err(dev_from_gk20a(g),
2644                            "invalid obj class 0x%x", args->class_num);
2645                 err = -EINVAL;
2646                 goto out;
2647         }
2648
2649         /* allocate gr ctx buffer */
2650         if (ch_ctx->gr_ctx.pages == NULL) {
2651                 err = gr_gk20a_alloc_channel_gr_ctx(g, c);
2652                 if (err) {
2653                         nvhost_err(dev_from_gk20a(g),
2654                                 "fail to allocate gr ctx buffer");
2655                         goto out;
2656                 }
2657                 c->obj_class = args->class_num;
2658         } else {
2659                 /* TBD: needs to be more subtle about which class is being
2660                  * allocated, as some classes are allowed on the same channel */
2661                 nvhost_err(dev_from_gk20a(g),
2662                         "too many classes alloc'd on same channel");
2663                 err = -EINVAL;
2664                 goto out;
2665         }
2666
2667         /* commit gr ctx buffer */
2668         err = gr_gk20a_commit_inst(c, ch_ctx->gr_ctx.gpu_va);
2669         if (err) {
2670                 nvhost_err(dev_from_gk20a(g),
2671                         "fail to commit gr ctx buffer");
2672                 goto out;
2673         }
2674
2675         /* allocate patch buffer */
2676         if (ch_ctx->patch_ctx.mem.ref == NULL) {
2677                 err = gr_gk20a_alloc_channel_patch_ctx(g, c);
2678                 if (err) {
2679                         nvhost_err(dev_from_gk20a(g),
2680                                 "fail to allocate patch buffer");
2681                         goto out;
2682                 }
2683         }
2684
2685         /* map global buffer to channel gpu_va and commit */
2686         if (!ch_ctx->global_ctx_buffer_mapped) {
2687                 err = gr_gk20a_map_global_ctx_buffers(g, c);
2688                 if (err) {
2689                         nvhost_err(dev_from_gk20a(g),
2690                                 "fail to map global ctx buffer");
2691                         goto out;
2692                 }
2693                 gr_gk20a_elpg_protected_call(g,
2694                         gr_gk20a_commit_global_ctx_buffers(g, c, true));
2695         }
2696
2697         /* init golden image, ELPG enabled after this is done */
2698         err = gr_gk20a_init_golden_ctx_image(g, c);
2699         if (err) {
2700                 nvhost_err(dev_from_gk20a(g),
2701                         "fail to init golden ctx image");
2702                 goto out;
2703         }
2704
2705         /* load golden image */
2706         if (!c->first_init) {
2707                 err = gr_gk20a_elpg_protected_call(g,
2708                         gr_gk20a_load_golden_ctx_image(g, c));
2709                 if (err) {
2710                         nvhost_err(dev_from_gk20a(g),
2711                                 "fail to load golden ctx image");
2712                         goto out;
2713                 }
2714                 c->first_init = true;
2715         }
2716         gk20a_mm_l2_invalidate(g);
2717
2718         c->num_objects++;
2719
2720         nvhost_dbg_fn("done");
2721         return 0;
2722 out:
2723         /* 1. The gr_ctx, patch_ctx and global ctx buffer mappings
2724            can be reused, so there is no need to release them.
2725            2. Golden image init and load are one-time operations, so if
2726            they passed, there is nothing to undo. */
2727         nvhost_err(dev_from_gk20a(g), "fail");
2728         return err;
2729 }
2730
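/* Release one object reference on the channel.  When the last object goes
 * away, disable the channel and unmap its patch context. */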
2731 int gk20a_free_obj_ctx(struct channel_gk20a  *c,
2732                        struct nvhost_free_obj_ctx_args *args)
2733 {
2734         unsigned long timeout = gk20a_get_gr_idle_timeout(c->g);
2735
2736         nvhost_dbg_fn("");
2737
2738         if (c->num_objects == 0)
2739                 return 0;
2740
2741         c->num_objects--;
2742
2743         if (c->num_objects == 0) {
2744                 c->first_init = false;
2745                 gk20a_disable_channel(c,
2746                         !c->hwctx->has_timedout,
2747                         timeout);
2748                 gr_gk20a_unmap_channel_patch_ctx(c);
2749         }
2750
2751         return 0;
2752 }
2753
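/* Free all gr SW state: global ctx buffers, the mmu_wr/mmu_rd buffers, the
 * compbit backing store, the floorsweeping tables, the ctxsw ucode and
 * register lists, and the comptag allocator. */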
2754 static void gk20a_remove_gr_support(struct gr_gk20a *gr)
2755 {
2756         struct gk20a *g = gr->g;
2757         struct mem_mgr *memmgr = mem_mgr_from_g(g);
2758         struct device *d = dev_from_gk20a(g);
2759
2760         nvhost_dbg_fn("");
2761
2762         gr_gk20a_free_global_ctx_buffers(g);
2763
2764         dma_free_coherent(d, gr->mmu_wr_mem.size,
2765                 gr->mmu_wr_mem.cpuva, gr->mmu_wr_mem.iova);
2766         gr->mmu_wr_mem.cpuva = NULL;
2767         gr->mmu_wr_mem.iova = 0;
2768         dma_free_coherent(d, gr->mmu_rd_mem.size,
2769                 gr->mmu_rd_mem.cpuva, gr->mmu_rd_mem.iova);
2770         gr->mmu_rd_mem.cpuva = NULL;
2771         gr->mmu_rd_mem.iova = 0;
2772
2773         nvhost_memmgr_put(memmgr, gr->compbit_store.mem.ref);
2774
2775         memset(&gr->mmu_wr_mem, 0, sizeof(struct mmu_desc));
2776         memset(&gr->mmu_rd_mem, 0, sizeof(struct mmu_desc));
2777         memset(&gr->compbit_store, 0, sizeof(struct compbit_store_desc));
2778
2779         kfree(gr->gpc_tpc_count);
2780         kfree(gr->gpc_zcb_count);
2781         kfree(gr->gpc_ppc_count);
2782         kfree(gr->pes_tpc_count[0]);
2783         kfree(gr->pes_tpc_count[1]);
2784         kfree(gr->pes_tpc_mask[0]);
2785         kfree(gr->pes_tpc_mask[1]);
2786         kfree(gr->gpc_skip_mask);
2787         kfree(gr->map_tiles);
2788         gr->gpc_tpc_count = NULL;
2789         gr->gpc_zcb_count = NULL;
2790         gr->gpc_ppc_count = NULL;
2791         gr->pes_tpc_count[0] = NULL;
2792         gr->pes_tpc_count[1] = NULL;
2793         gr->pes_tpc_mask[0] = NULL;
2794         gr->pes_tpc_mask[1] = NULL;
2795         gr->gpc_skip_mask = NULL;
2796         gr->map_tiles = NULL;
2797
2798         kfree(gr->ctx_vars.ucode.fecs.inst.l);
2799         kfree(gr->ctx_vars.ucode.fecs.data.l);
2800         kfree(gr->ctx_vars.ucode.gpccs.inst.l);
2801         kfree(gr->ctx_vars.ucode.gpccs.data.l);
2802         kfree(gr->ctx_vars.sw_bundle_init.l);
2803         kfree(gr->ctx_vars.sw_method_init.l);
2804         kfree(gr->ctx_vars.sw_ctx_load.l);
2805         kfree(gr->ctx_vars.sw_non_ctx_load.l);
2806         kfree(gr->ctx_vars.ctxsw_regs.sys.l);
2807         kfree(gr->ctx_vars.ctxsw_regs.gpc.l);
2808         kfree(gr->ctx_vars.ctxsw_regs.tpc.l);
2809         kfree(gr->ctx_vars.ctxsw_regs.zcull_gpc.l);
2810         kfree(gr->ctx_vars.ctxsw_regs.ppc.l);
2811         kfree(gr->ctx_vars.ctxsw_regs.pm_sys.l);
2812         kfree(gr->ctx_vars.ctxsw_regs.pm_gpc.l);
2813         kfree(gr->ctx_vars.ctxsw_regs.pm_tpc.l);
2814
2815         kfree(gr->ctx_vars.local_golden_image);
2816         gr->ctx_vars.local_golden_image = NULL;
2817
2818         nvhost_allocator_destroy(&gr->comp_tags);
2819 }
2820
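/* Read the chip's floorsweeping configuration (FBP/GPC/TPC/PES counts and
 * masks) from the PRI ringmaster, TOP and GR registers, derive the per-GPC
 * skip masks, and set the default circular buffer sizes. */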
2821 static int gr_gk20a_init_gr_config(struct gk20a *g, struct gr_gk20a *gr)
2822 {
2823         u32 gpc_index, pes_index;
2824         u32 pes_tpc_mask;
2825         u32 pes_tpc_count;
2826         u32 pes_heavy_index;
2827         u32 gpc_new_skip_mask;
2828         u32 tmp;
2829
2830         tmp = gk20a_readl(g, pri_ringmaster_enum_fbp_r());
2831         gr->num_fbps = pri_ringmaster_enum_fbp_count_v(tmp);
2832
2833         tmp = gk20a_readl(g, top_num_gpcs_r());
2834         gr->max_gpc_count = top_num_gpcs_value_v(tmp);
2835
2836         tmp = gk20a_readl(g, top_num_fbps_r());
2837         gr->max_fbps_count = top_num_fbps_value_v(tmp);
2838
2839         tmp = gk20a_readl(g, top_tpc_per_gpc_r());
2840         gr->max_tpc_per_gpc_count = top_tpc_per_gpc_value_v(tmp);
2841
2842         gr->max_tpc_count = gr->max_gpc_count * gr->max_tpc_per_gpc_count;
2843
2844         tmp = gk20a_readl(g, top_num_fbps_r());
2845         gr->sys_count = top_num_fbps_value_v(tmp);
2846
2847         tmp = gk20a_readl(g, pri_ringmaster_enum_gpc_r());
2848         gr->gpc_count = pri_ringmaster_enum_gpc_count_v(tmp);
2849
2850         gr->pe_count_per_gpc = proj_scal_litter_num_pes_per_gpc_v();
2851         gr->max_zcull_per_gpc_count = proj_scal_litter_num_zcull_banks_v();
2852
2853         if (!gr->gpc_count) {
2854                 nvhost_err(dev_from_gk20a(g), "gpc_count==0!");
2855                 goto clean_up;
2856         }
2857
2858         gr->gpc_tpc_count = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2859         gr->gpc_zcb_count = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2860         gr->gpc_ppc_count = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2861         gr->pes_tpc_count[0] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2862         gr->pes_tpc_count[1] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2863         gr->pes_tpc_mask[0] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2864         gr->pes_tpc_mask[1] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2865         gr->gpc_skip_mask =
2866                 kzalloc(gr_pd_dist_skip_table__size_1_v() * 4 * sizeof(u32),
2867                         GFP_KERNEL);
2868
2869         if (!gr->gpc_tpc_count || !gr->gpc_zcb_count || !gr->gpc_ppc_count ||
2870             !gr->pes_tpc_count[0] || !gr->pes_tpc_count[1] ||
2871             !gr->pes_tpc_mask[0] || !gr->pes_tpc_mask[1] || !gr->gpc_skip_mask)
2872                 goto clean_up;
2873
2874         gr->ppc_count = 0;
2875         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
2876                 tmp = gk20a_readl(g, gr_gpc0_fs_gpc_r());
2877
2878                 gr->gpc_tpc_count[gpc_index] =
2879                         gr_gpc0_fs_gpc_num_available_tpcs_v(tmp);
2880                 gr->tpc_count += gr->gpc_tpc_count[gpc_index];
2881
2882                 gr->gpc_zcb_count[gpc_index] =
2883                         gr_gpc0_fs_gpc_num_available_zculls_v(tmp);
2884                 gr->zcb_count += gr->gpc_zcb_count[gpc_index];
2885
2886                 gr->gpc_ppc_count[gpc_index] = gr->pe_count_per_gpc;
2887                 gr->ppc_count += gr->gpc_ppc_count[gpc_index];
2888                 for (pes_index = 0; pes_index < gr->pe_count_per_gpc; pes_index++) {
2889
2890                         tmp = gk20a_readl(g,
2891                                 gr_gpc0_gpm_pd_pes_tpc_id_mask_r(pes_index) +
2892                                 gpc_index * proj_gpc_stride_v());
2893
2894                         pes_tpc_mask = gr_gpc0_gpm_pd_pes_tpc_id_mask_mask_v(tmp);
2895                         pes_tpc_count = count_bits(pes_tpc_mask);
2896
2897                         gr->pes_tpc_count[pes_index][gpc_index] = pes_tpc_count;
2898                         gr->pes_tpc_mask[pes_index][gpc_index] = pes_tpc_mask;
2899                 }
2900
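                /* When the two PESs hold 5 TPCs in total, or 4 split unevenly,
                 * one TPC is skipped: m ^ (m & (m - 1)) isolates the lowest set
                 * bit of the heavier PES's TPC mask, e.g. 0b0110 -> 0b0010. */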
2901                 gpc_new_skip_mask = 0;
2902                 if (gr->pes_tpc_count[0][gpc_index] +
2903                     gr->pes_tpc_count[1][gpc_index] == 5) {
2904                         pes_heavy_index =
2905                                 gr->pes_tpc_count[0][gpc_index] >
2906                                 gr->pes_tpc_count[1][gpc_index] ? 0 : 1;
2907
2908                         gpc_new_skip_mask =
2909                                 gr->pes_tpc_mask[pes_heavy_index][gpc_index] ^
2910                                    (gr->pes_tpc_mask[pes_heavy_index][gpc_index] &
2911                                    (gr->pes_tpc_mask[pes_heavy_index][gpc_index] - 1));
2912
2913                 } else if ((gr->pes_tpc_count[0][gpc_index] +
2914                             gr->pes_tpc_count[1][gpc_index] == 4) &&
2915                            (gr->pes_tpc_count[0][gpc_index] !=
2916                             gr->pes_tpc_count[1][gpc_index])) {
2917                                 pes_heavy_index =
2918                                     gr->pes_tpc_count[0][gpc_index] >
2919                                     gr->pes_tpc_count[1][gpc_index] ? 0 : 1;
2920
2921                         gpc_new_skip_mask =
2922                                 gr->pes_tpc_mask[pes_heavy_index][gpc_index] ^
2923                                    (gr->pes_tpc_mask[pes_heavy_index][gpc_index] &
2924                                    (gr->pes_tpc_mask[pes_heavy_index][gpc_index] - 1));
2925                 }
2926                 gr->gpc_skip_mask[gpc_index] = gpc_new_skip_mask;
2927         }
2928
2929         nvhost_dbg_info("fbps: %d", gr->num_fbps);
2930         nvhost_dbg_info("max_gpc_count: %d", gr->max_gpc_count);
2931         nvhost_dbg_info("max_fbps_count: %d", gr->max_fbps_count);
2932         nvhost_dbg_info("max_tpc_per_gpc_count: %d", gr->max_tpc_per_gpc_count);
2933         nvhost_dbg_info("max_zcull_per_gpc_count: %d", gr->max_zcull_per_gpc_count);
2934         nvhost_dbg_info("max_tpc_count: %d", gr->max_tpc_count);
2935         nvhost_dbg_info("sys_count: %d", gr->sys_count);
2936         nvhost_dbg_info("gpc_count: %d", gr->gpc_count);
2937         nvhost_dbg_info("pe_count_per_gpc: %d", gr->pe_count_per_gpc);
2938         nvhost_dbg_info("tpc_count: %d", gr->tpc_count);
2939         nvhost_dbg_info("ppc_count: %d", gr->ppc_count);
2940
2941         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
2942                 nvhost_dbg_info("gpc_tpc_count[%d] : %d",
2943                            gpc_index, gr->gpc_tpc_count[gpc_index]);
2944         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
2945                 nvhost_dbg_info("gpc_zcb_count[%d] : %d",
2946                            gpc_index, gr->gpc_zcb_count[gpc_index]);
2947         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
2948                 nvhost_dbg_info("gpc_ppc_count[%d] : %d",
2949                            gpc_index, gr->gpc_ppc_count[gpc_index]);
2950         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
2951                 nvhost_dbg_info("gpc_skip_mask[%d] : %d",
2952                            gpc_index, gr->gpc_skip_mask[gpc_index]);
2953         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
2954                 for (pes_index = 0;
2955                      pes_index < gr->pe_count_per_gpc;
2956                      pes_index++)
2957                         nvhost_dbg_info("pes_tpc_count[%d][%d] : %d",
2958                                    pes_index, gpc_index,
2959                                    gr->pes_tpc_count[pes_index][gpc_index]);
2960
2961         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
2962                 for (pes_index = 0;
2963                      pes_index < gr->pe_count_per_gpc;
2964                      pes_index++)
2965                         nvhost_dbg_info("pes_tpc_mask[%d][%d] : %d",
2966                                    pes_index, gpc_index,
2967                                    gr->pes_tpc_mask[pes_index][gpc_index]);
2968
2969         gr->bundle_cb_default_size = gr_scc_bundle_cb_size_div_256b__prod_v();
2970         gr->min_gpm_fifo_depth = gr_pd_ab_dist_cfg2_state_limit_min_gpm_fifo_depths_v();
2971         gr->bundle_cb_token_limit = gr_pd_ab_dist_cfg2_token_limit_init_v();
2972         gr->attrib_cb_default_size = gr_gpc0_ppc0_cbm_cfg_size_default_v();
2973         /* gk20a has a fixed beta CB RAM, don't alloc more */
2974         gr->attrib_cb_size = gr->attrib_cb_default_size;
2975         gr->alpha_cb_default_size = gr_gpc0_ppc0_cbm_cfg2_size_default_v();
2976         gr->alpha_cb_size = gr->alpha_cb_default_size + (gr->alpha_cb_default_size >> 1);
2977         gr->timeslice_mode = gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v();
2978
2979         nvhost_dbg_info("bundle_cb_default_size: %d",
2980                    gr->bundle_cb_default_size);
2981         nvhost_dbg_info("min_gpm_fifo_depth: %d", gr->min_gpm_fifo_depth);
2982         nvhost_dbg_info("bundle_cb_token_limit: %d", gr->bundle_cb_token_limit);
2983         nvhost_dbg_info("attrib_cb_default_size: %d",
2984                    gr->attrib_cb_default_size);
2985         nvhost_dbg_info("attrib_cb_size: %d", gr->attrib_cb_size);
2986         nvhost_dbg_info("alpha_cb_default_size: %d", gr->alpha_cb_default_size);
2987         nvhost_dbg_info("alpha_cb_size: %d", gr->alpha_cb_size);
2988         nvhost_dbg_info("timeslice_mode: %d", gr->timeslice_mode);
2989
2990         return 0;
2991
2992 clean_up:
2993         return -ENOMEM;
2994 }
2995
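/* Allocate one 4KB DMA-coherent buffer each for mmu_wr_mem and mmu_rd_mem. */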
2996 static int gr_gk20a_init_mmu_sw(struct gk20a *g, struct gr_gk20a *gr)
2997 {
2998         struct device *d = dev_from_gk20a(g);
2999
3000         gr->mmu_wr_mem_size = gr->mmu_rd_mem_size = 0x1000;
3001
3002         gr->mmu_wr_mem.size = gr->mmu_wr_mem_size;
3003         gr->mmu_wr_mem.cpuva = dma_zalloc_coherent(d, gr->mmu_wr_mem_size,
3004                                         &gr->mmu_wr_mem.iova, GFP_KERNEL);
3005         if (!gr->mmu_wr_mem.cpuva)
3006                 goto err;
3007
3008         gr->mmu_rd_mem.size = gr->mmu_rd_mem_size;
3009         gr->mmu_rd_mem.cpuva = dma_zalloc_coherent(d, gr->mmu_rd_mem_size,
3010                                         &gr->mmu_rd_mem.iova, GFP_KERNEL);
3011         if (!gr->mmu_rd_mem.cpuva)
3012                 goto err_free_wr_mem;
3013         return 0;
3014
3015  err_free_wr_mem:
3016         dma_free_coherent(d, gr->mmu_wr_mem.size,
3017                 gr->mmu_wr_mem.cpuva, gr->mmu_wr_mem.iova);
3018         gr->mmu_wr_mem.cpuva = NULL;
3019         gr->mmu_wr_mem.iova = 0;
3020  err:
3021         return -ENOMEM;
3022 }
3023
3024 static u32 prime_set[18] = {
3025         2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61 };
3026
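/* Build the screen-tile to GPC mapping.  map_row_offset is picked as a small
 * prime that does not divide tpc_count (with fixed overrides for certain TPC
 * counts); the map itself hands out tiles to GPCs in proportion to their TPC
 * counts using an error-diffusion scheme. */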
3027 static int gr_gk20a_init_map_tiles(struct gk20a *g, struct gr_gk20a *gr)
3028 {
3029         s32 comm_denom;
3030         s32 mul_factor;
3031         s32 *init_frac = NULL;
3032         s32 *init_err = NULL;
3033         s32 *run_err = NULL;
3034         s32 *sorted_num_tpcs = NULL;
3035         s32 *sorted_to_unsorted_gpc_map = NULL;
3036         u32 gpc_index;
3037         u32 gpc_mark = 0;
3038         u32 num_tpc;
3039         u32 max_tpc_count = 0;
3040         u32 swap;
3041         u32 tile_count;
3042         u32 index;
3043         bool delete_map = false;
3044         bool gpc_sorted;
3045         int ret = 0;
3046
3047         init_frac = kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
3048         init_err = kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
3049         run_err = kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
3050         sorted_num_tpcs =
3051                 kzalloc(proj_scal_max_gpcs_v() *
3052                         proj_scal_max_tpc_per_gpc_v() * sizeof(s32),
3053                         GFP_KERNEL);
3054         sorted_to_unsorted_gpc_map =
3055                 kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
3056
3057         if (!(init_frac && init_err && run_err && sorted_num_tpcs &&
3058               sorted_to_unsorted_gpc_map)) {
3059                 ret = -ENOMEM;
3060                 goto clean_up;
3061         }
3062
3063         gr->map_row_offset = INVALID_SCREEN_TILE_ROW_OFFSET;
3064
3065         if (gr->tpc_count == 3)
3066                 gr->map_row_offset = 2;
3067         else if (gr->tpc_count < 3)
3068                 gr->map_row_offset = 1;
3069         else {
3070                 gr->map_row_offset = 3;
3071
3072                 for (index = 1; index < 18; index++) {
3073                         u32 prime = prime_set[index];
3074                         if ((gr->tpc_count % prime) != 0) {
3075                                 gr->map_row_offset = prime;
3076                                 break;
3077                         }
3078                 }
3079         }
3080
3081         switch (gr->tpc_count) {
3082         case 15:
3083                 gr->map_row_offset = 6;
3084                 break;
3085         case 14:
3086                 gr->map_row_offset = 5;
3087                 break;
3088         case 13:
3089                 gr->map_row_offset = 2;
3090                 break;
3091         case 11:
3092                 gr->map_row_offset = 7;
3093                 break;
3094         case 10:
3095                 gr->map_row_offset = 6;
3096                 break;
3097         case 7:
3098         case 5:
3099                 gr->map_row_offset = 1;
3100                 break;
3101         default:
3102                 break;
3103         }
3104
3105         if (gr->map_tiles) {
3106                 if (gr->map_tile_count != gr->tpc_count)
3107                         delete_map = true;
3108
3109                 for (tile_count = 0; tile_count < gr->map_tile_count; tile_count++) {
3110                         if ((u32)gr->map_tiles[tile_count] >= gr->tpc_count)
3111                                 delete_map = true;
3112                 }
3113
3114                 if (delete_map) {
3115                         kfree(gr->map_tiles);
3116                         gr->map_tiles = NULL;
3117                         gr->map_tile_count = 0;
3118                 }
3119         }
3120
3121         if (gr->map_tiles == NULL) {
3122                 gr->map_tile_count = proj_scal_max_gpcs_v();
3123
3124                 gr->map_tiles = kzalloc(proj_scal_max_gpcs_v() * sizeof(u8), GFP_KERNEL);
3125                 if (gr->map_tiles == NULL) {
3126                         ret = -ENOMEM;
3127                         goto clean_up;
3128                 }
3129
3130                 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3131                         sorted_num_tpcs[gpc_index] = gr->gpc_tpc_count[gpc_index];
3132                         sorted_to_unsorted_gpc_map[gpc_index] = gpc_index;
3133                 }
3134
3135                 gpc_sorted = false;
3136                 while (!gpc_sorted) {
3137                         gpc_sorted = true;
3138                         for (gpc_index = 0; gpc_index < gr->gpc_count - 1; gpc_index++) {
3139                                 if (sorted_num_tpcs[gpc_index + 1] > sorted_num_tpcs[gpc_index]) {
3140                                         gpc_sorted = false;
3141                                         swap = sorted_num_tpcs[gpc_index];
3142                                         sorted_num_tpcs[gpc_index] = sorted_num_tpcs[gpc_index + 1];
3143                                         sorted_num_tpcs[gpc_index + 1] = swap;
3144                                         swap = sorted_to_unsorted_gpc_map[gpc_index];
3145                                         sorted_to_unsorted_gpc_map[gpc_index] =
3146                                                 sorted_to_unsorted_gpc_map[gpc_index + 1];
3147                                         sorted_to_unsorted_gpc_map[gpc_index + 1] = swap;
3148                                 }
3149                         }
3150                 }
3151
3152                 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3153                         if (gr->gpc_tpc_count[gpc_index] > max_tpc_count)
3154                                 max_tpc_count = gr->gpc_tpc_count[gpc_index];
3155
3156                 mul_factor = gr->gpc_count * max_tpc_count;
3157                 if (mul_factor & 0x1)
3158                         mul_factor = 2;
3159                 else
3160                         mul_factor = 1;
3161
3162                 comm_denom = gr->gpc_count * max_tpc_count * mul_factor;
3163
3164                 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3165                         num_tpc = sorted_num_tpcs[gpc_index];
3166
3167                         init_frac[gpc_index] = num_tpc * gr->gpc_count * mul_factor;
3168
3169                         if (num_tpc != 0)
3170                                 init_err[gpc_index] = gpc_index * max_tpc_count * mul_factor - comm_denom/2;
3171                         else
3172                                 init_err[gpc_index] = 0;
3173
3174                         run_err[gpc_index] = init_frac[gpc_index] + init_err[gpc_index];
3175                 }
3176
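                /* Error-diffusion assignment: on each pass a GPC whose
                 * accumulated error reaches comm_denom/2 receives the next
                 * tile, so tiles end up distributed in proportion to each
                 * GPC's TPC share (init_frac). */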
3177                 while (gpc_mark < gr->tpc_count) {
3178                         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3179                                 if ((run_err[gpc_index] * 2) >= comm_denom) {
3180                                         gr->map_tiles[gpc_mark++] = (u8)sorted_to_unsorted_gpc_map[gpc_index];
3181                                         run_err[gpc_index] += init_frac[gpc_index] - comm_denom;
3182                                 } else
3183                                         run_err[gpc_index] += init_frac[gpc_index];
3184                         }
3185                 }
3186         }
3187
3188 clean_up:
3189         kfree(init_frac);
3190         kfree(init_err);
3191         kfree(run_err);
3192         kfree(sorted_num_tpcs);
3193         kfree(sorted_to_unsorted_gpc_map);
3194
3195         if (ret)
3196                 nvhost_err(dev_from_gk20a(g), "fail");
3197         else
3198                 nvhost_dbg_fn("done");
3199
3200         return ret;
3201 }
3202
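/* Size, allocate and pin the compression bit (compbit) backing store based on
 * gr->max_comptag_mem and the CBC parameters, then set up the comptag
 * allocator covering lines 1..max_comptag_lines-1. */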
3203 static int gr_gk20a_init_comptag(struct gk20a *g, struct gr_gk20a *gr)
3204 {
3205         struct mem_mgr *memmgr = mem_mgr_from_g(g);
3206
3207         /* max memory size (MB) to cover */
3208         u32 max_size = gr->max_comptag_mem;
3209         /* one tag line covers 128KB */
3210         u32 max_comptag_lines = max_size << 3;
3211
3212         u32 hw_max_comptag_lines =
3213                 ltc_ltcs_ltss_cbc_ctrl3_clear_upper_bound_init_v();
3214
3215         u32 cbc_param =
3216                 gk20a_readl(g, ltc_ltcs_ltss_cbc_param_r());
3217         u32 comptags_per_cacheline =
3218                 ltc_ltcs_ltss_cbc_param_comptags_per_cache_line_v(cbc_param);
3219         u32 slices_per_fbp =
3220                 ltc_ltcs_ltss_cbc_param_slices_per_fbp_v(cbc_param);
3221         u32 cacheline_size =
3222                 512 << ltc_ltcs_ltss_cbc_param_cache_line_size_v(cbc_param);
3223
3224         u32 compbit_backing_size;
3225         int ret = 0;
3226
3227         nvhost_dbg_fn("");
3228
3229         if (max_comptag_lines == 0) {
3230                 gr->compbit_store.mem.size = 0;
3231                 return 0;
3232         }
3233
3234         if (max_comptag_lines > hw_max_comptag_lines)
3235                 max_comptag_lines = hw_max_comptag_lines;
3236
3237         /* no hybrid fb */
3238         compbit_backing_size =
3239                 DIV_ROUND_UP(max_comptag_lines, comptags_per_cacheline) *
3240                 cacheline_size * slices_per_fbp * gr->num_fbps;
3241
3242         /* aligned to 2KB * num_fbps */
3243         compbit_backing_size +=
3244                 gr->num_fbps << ltc_ltcs_ltss_cbc_base_alignment_shift_v();
3245
3246         /* must be a multiple of 64KB */
3247         compbit_backing_size = roundup(compbit_backing_size, 64*1024);
3248
3249         max_comptag_lines =
3250                 (compbit_backing_size * comptags_per_cacheline) /
3251                 (cacheline_size * slices_per_fbp * gr->num_fbps);
3252
3253         if (max_comptag_lines > hw_max_comptag_lines)
3254                 max_comptag_lines = hw_max_comptag_lines;
3255
3256         nvhost_dbg_info("compbit backing store size : %d",
3257                 compbit_backing_size);
3258         nvhost_dbg_info("max comptag lines : %d",
3259                 max_comptag_lines);
3260
3261         gr->compbit_store.mem.ref =
3262                 nvhost_memmgr_alloc(memmgr, compbit_backing_size,
3263                                     DEFAULT_ALLOC_ALIGNMENT,
3264                                     DEFAULT_ALLOC_FLAGS,
3265                                     0);
3266         if (IS_ERR(gr->compbit_store.mem.ref)) {
3267                 nvhost_err(dev_from_gk20a(g), "failed to allocate"
3268                            " backing store for compbit: size %d",
3269                            compbit_backing_size);
3270                 return PTR_ERR(gr->compbit_store.mem.ref);
3271         }
3272         gr->compbit_store.mem.size = compbit_backing_size;
3273
3274         gr->compbit_store.mem.sgt =
3275                 nvhost_memmgr_pin(memmgr, gr->compbit_store.mem.ref,
3276                                 dev_from_gk20a(g), mem_flag_none);
3277         if (IS_ERR(gr->compbit_store.mem.sgt)) {
3278                 ret = PTR_ERR(gr->compbit_store.mem.sgt);
3279                 goto clean_up;
3280         }
3281         gr->compbit_store.base_pa =
3282                 gk20a_mm_iova_addr(gr->compbit_store.mem.sgt->sgl);
3283
3284         nvhost_allocator_init(&gr->comp_tags, "comptag",
3285                               1, /* start */
3286                               max_comptag_lines - 1, /* length */
3287                               1); /* align */
3288
3289         return 0;
3290
3291 clean_up:
3292         if (gr->compbit_store.mem.sgt)
3293                 nvhost_memmgr_free_sg_table(memmgr, gr->compbit_store.mem.ref,
3294                                 gr->compbit_store.mem.sgt);
3295         nvhost_memmgr_put(memmgr, gr->compbit_store.mem.ref);
3296         return ret;
3297 }
3298
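/* Clear comptag lines [min, max]: program the clear bounds, kick the clear in
 * all L2 slices and poll each slice's cbc_ctrl1 until the clear completes. */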
3299 int gk20a_gr_clear_comptags(struct gk20a *g, u32 min, u32 max)
3300 {
3301         struct gr_gk20a *gr = &g->gr;
3302         u32 fbp, slice, ctrl1, val;
3303         unsigned long end_jiffies = jiffies +
3304                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
3305         u32 delay = GR_IDLE_CHECK_DEFAULT;
3306         u32 slices_per_fbp =
3307                 ltc_ltcs_ltss_cbc_param_slices_per_fbp_v(
3308                         gk20a_readl(g, ltc_ltcs_ltss_cbc_param_r()));
3309
3310         nvhost_dbg_fn("");
3311
3312         if (gr->compbit_store.mem.size == 0)
3313                 return 0;
3314
3315         gk20a_writel(g, ltc_ltcs_ltss_cbc_ctrl2_r(),
3316                      ltc_ltcs_ltss_cbc_ctrl2_clear_lower_bound_f(min));
3317         gk20a_writel(g, ltc_ltcs_ltss_cbc_ctrl3_r(),
3318                      ltc_ltcs_ltss_cbc_ctrl3_clear_upper_bound_f(max));
3319         gk20a_writel(g, ltc_ltcs_ltss_cbc_ctrl1_r(),
3320                      gk20a_readl(g, ltc_ltcs_ltss_cbc_ctrl1_r()) |
3321                      ltc_ltcs_ltss_cbc_ctrl1_clear_active_f());
3322
3323         for (fbp = 0; fbp < gr->num_fbps; fbp++) {
3324                 for (slice = 0; slice < slices_per_fbp; slice++) {
3325
3326                         delay = GR_IDLE_CHECK_DEFAULT;
3327
3328                         ctrl1 = ltc_ltc0_lts0_cbc_ctrl1_r() +
3329                                 fbp * proj_ltc_stride_v() +
3330                                 slice * proj_lts_stride_v();
3331
3332                         do {
3333                                 val = gk20a_readl(g, ctrl1);
3334                                 if (ltc_ltcs_ltss_cbc_ctrl1_clear_v(val) !=
3335                                     ltc_ltcs_ltss_cbc_ctrl1_clear_active_v())
3336                                         break;
3337
3338                                 usleep_range(delay, delay * 2);
3339                                 delay = min_t(u32, delay << 1,
3340                                         GR_IDLE_CHECK_MAX);
3341
3342                         } while (time_before(jiffies, end_jiffies) ||
3343                                         !tegra_platform_is_silicon());
3344
3345                         if (!time_before(jiffies, end_jiffies)) {
3346                                 nvhost_err(dev_from_gk20a(g),
3347                                            "comp tag clear timeout\n");
3348                                 return -EBUSY;
3349                         }
3350                 }
3351         }
3352
3353         return 0;
3354 }
3355
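/* Derive the zcull aliquot geometry and pixel coverage from the TPC and ZCB
 * counts and the zcull RAM size register. */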
3356 static int gr_gk20a_init_zcull(struct gk20a *g, struct gr_gk20a *gr)
3357 {
3358         struct gr_zcull_gk20a *zcull = &gr->zcull;
3359
3360         zcull->aliquot_width = gr->tpc_count * 16;
3361         zcull->aliquot_height = 16;
3362
3363         zcull->width_align_pixels = gr->tpc_count * 16;
3364         zcull->height_align_pixels = 32;
3365
3366         zcull->aliquot_size =
3367                 zcull->aliquot_width * zcull->aliquot_height;
3368
3369         /* assume no floor sweeping since we only have 1 tpc in 1 gpc */
3370         zcull->pixel_squares_by_aliquots =
3371                 gr->zcb_count * 16 * 16 * gr->tpc_count /
3372                 (gr->gpc_count * gr->gpc_tpc_count[0]);
3373
3374         zcull->total_aliquots =
3375                 gr_gpc0_zcull_total_ram_size_num_aliquots_f(
3376                         gk20a_readl(g, gr_gpc0_zcull_total_ram_size_r()));
3377
3378         return 0;
3379 }
3380
3381 u32 gr_gk20a_get_ctxsw_zcull_size(struct gk20a *g, struct gr_gk20a *gr)
3382 {
3383         /* assuming gr has already been initialized */
3384         return gr->ctx_vars.zcull_ctxsw_image_size;
3385 }
3386
3387 int gr_gk20a_bind_ctxsw_zcull(struct gk20a *g, struct gr_gk20a *gr,
3388                         struct channel_gk20a *c, u64 zcull_va, u32 mode)
3389 {
3390         struct zcull_ctx_desc *zcull_ctx = &c->ch_ctx.zcull_ctx;
3391
3392         zcull_ctx->ctx_sw_mode = mode;
3393         zcull_ctx->gpu_va = zcull_va;
3394
3395         /* TBD: don't disable channel in sw method processing */
3396         return gr_gk20a_ctx_zcull_setup(g, c, true);
3397 }
3398
3399 int gr_gk20a_get_zcull_info(struct gk20a *g, struct gr_gk20a *gr,
3400                         struct gr_zcull_info *zcull_params)
3401 {
3402         struct gr_zcull_gk20a *zcull = &gr->zcull;
3403
3404         zcull_params->width_align_pixels = zcull->width_align_pixels;
3405         zcull_params->height_align_pixels = zcull->height_align_pixels;
3406         zcull_params->pixel_squares_by_aliquots =
3407                 zcull->pixel_squares_by_aliquots;
3408         zcull_params->aliquot_total = zcull->total_aliquots;
3409
3410         zcull_params->region_byte_multiplier =
3411                 gr->gpc_count * gr_zcull_bytes_per_aliquot_per_gpu_v();
3412         zcull_params->region_header_size =
3413                 proj_scal_litter_num_gpcs_v() *
3414                 gr_zcull_save_restore_header_bytes_per_gpc_v();
3415
3416         zcull_params->subregion_header_size =
3417                 proj_scal_litter_num_gpcs_v() *
3418                 gr_zcull_save_restore_subregion_header_bytes_per_gpc_v();
3419
3420         zcull_params->subregion_width_align_pixels =
3421                 gr->tpc_count * gr_gpc0_zcull_zcsize_width_subregion__multiple_v();
3422         zcull_params->subregion_height_align_pixels =
3423                 gr_gpc0_zcull_zcsize_height_subregion__multiple_v();
3424         zcull_params->subregion_count = gr_zcull_subregion_qty_v();
3425
3426         return 0;
3427 }
3428
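/* Write one color clear value into both the L2 (dstg) and DS ZBC tables at
 * the given index, with gr engine activity disabled and the engine idled
 * around the update, and mirror it into the SW table. */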
3429 static int gr_gk20a_add_zbc_color(struct gk20a *g, struct gr_gk20a *gr,
3430                                 struct zbc_entry *color_val, u32 index)
3431 {
3432         struct fifo_gk20a *f = &g->fifo;
3433         struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
3434         u32 i;
3435         unsigned long end_jiffies = jiffies +
3436                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
3437         u32 ret;
3438
3439         ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
3440         if (ret) {
3441                 nvhost_err(dev_from_gk20a(g),
3442                         "failed to disable gr engine activity\n");
3443                 return ret;
3444         }
3445
3446         ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
3447         if (ret) {
3448                 nvhost_err(dev_from_gk20a(g),
3449                         "failed to idle graphics\n");
3450                 goto clean_up;
3451         }
3452
3453         /* update l2 table */
3454         gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_index_r(),
3455                         (gk20a_readl(g, ltc_ltcs_ltss_dstg_zbc_index_r()) &
3456                          ~ltc_ltcs_ltss_dstg_zbc_index_address_f(~0)) |
3457                                 ltc_ltcs_ltss_dstg_zbc_index_address_f(index +
3458                                         GK20A_STARTOF_ZBC_TABLE));
3459
3460         for (i = 0; i < ltc_ltcs_ltss_dstg_zbc_color_clear_value__size_1_v(); i++)
3461                 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_color_clear_value_r(i),
3462                         color_val->color_l2[i]);
3463
3464         /* update ds table */
3465         gk20a_writel(g, gr_ds_zbc_color_r_r(),
3466                 gr_ds_zbc_color_r_val_f(color_val->color_ds[0]));
3467         gk20a_writel(g, gr_ds_zbc_color_g_r(),
3468                 gr_ds_zbc_color_g_val_f(color_val->color_ds[1]));
3469         gk20a_writel(g, gr_ds_zbc_color_b_r(),
3470                 gr_ds_zbc_color_b_val_f(color_val->color_ds[2]));
3471         gk20a_writel(g, gr_ds_zbc_color_a_r(),
3472                 gr_ds_zbc_color_a_val_f(color_val->color_ds[3]));
3473
3474         gk20a_writel(g, gr_ds_zbc_color_fmt_r(),
3475                 gr_ds_zbc_color_fmt_val_f(color_val->format));
3476
3477         gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
3478                 gr_ds_zbc_tbl_index_val_f(index + GK20A_STARTOF_ZBC_TABLE));
3479
3480         /* trigger the write */
3481         gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
3482                 gr_ds_zbc_tbl_ld_select_c_f() |
3483                 gr_ds_zbc_tbl_ld_action_write_f() |
3484                 gr_ds_zbc_tbl_ld_trigger_active_f());
3485
3486         /* update local copy */
3487         for (i = 0; i < ltc_ltcs_ltss_dstg_zbc_color_clear_value__size_1_v(); i++) {
3488                 gr->zbc_col_tbl[index].color_l2[i] = color_val->color_l2[i];
3489                 gr->zbc_col_tbl[index].color_ds[i] = color_val->color_ds[i];
3490         }
3491         gr->zbc_col_tbl[index].format = color_val->format;
3492         gr->zbc_col_tbl[index].ref_cnt++;
3493
3494 clean_up:
3495         ret = gk20a_fifo_enable_engine_activity(g, gr_info);
3496         if (ret) {
3497                 nvhost_err(dev_from_gk20a(g),
3498                         "failed to enable gr engine activity\n");
3499         }
3500
3501         return ret;
3502 }
3503
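/* Same as gr_gk20a_add_zbc_color() but for a depth clear value. */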
3504 static int gr_gk20a_add_zbc_depth(struct gk20a *g, struct gr_gk20a *gr,
3505                                 struct zbc_entry *depth_val, u32 index)
3506 {
3507         struct fifo_gk20a *f = &g->fifo;
3508         struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
3509         unsigned long end_jiffies = jiffies +
3510                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
3511         u32 ret;
3512
3513         ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
3514         if (ret) {
3515                 nvhost_err(dev_from_gk20a(g),
3516                         "failed to disable gr engine activity\n");
3517                 return ret;
3518         }
3519
3520         ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
3521         if (ret) {
3522                 nvhost_err(dev_from_gk20a(g),
3523                         "failed to idle graphics\n");
3524                 goto clean_up;
3525         }
3526
3527         /* update l2 table */
3528         gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_index_r(),
3529                         (gk20a_readl(g, ltc_ltcs_ltss_dstg_zbc_index_r()) &
3530                          ~ltc_ltcs_ltss_dstg_zbc_index_address_f(~0)) |
3531                                 ltc_ltcs_ltss_dstg_zbc_index_address_f(index +
3532                                         GK20A_STARTOF_ZBC_TABLE));
3533
3534         gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_depth_clear_value_r(),
3535                         depth_val->depth);
3536
3537         /* update ds table */
3538         gk20a_writel(g, gr_ds_zbc_z_r(),
3539                 gr_ds_zbc_z_val_f(depth_val->depth));
3540
3541         gk20a_writel(g, gr_ds_zbc_z_fmt_r(),
3542                 gr_ds_zbc_z_fmt_val_f(depth_val->format));
3543
3544         gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
3545                 gr_ds_zbc_tbl_index_val_f(index + GK20A_STARTOF_ZBC_TABLE));
3546
3547         /* trigger the write */
3548         gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
3549                 gr_ds_zbc_tbl_ld_select_z_f() |
3550                 gr_ds_zbc_tbl_ld_action_write_f() |
3551                 gr_ds_zbc_tbl_ld_trigger_active_f());
3552
3553         /* update local copy */
3554         gr->zbc_dep_tbl[index].depth = depth_val->depth;
3555         gr->zbc_dep_tbl[index].format = depth_val->format;
3556         gr->zbc_dep_tbl[index].ref_cnt++;
3557
3558 clean_up:
3559         ret = gk20a_fifo_enable_engine_activity(g, gr_info);
3560         if (ret) {
3561                 nvhost_err(dev_from_gk20a(g),
3562                         "failed to enable gr engine activity\n");
3563         }
3564
3565         return ret;
3566 }
3567
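/* Add a ZBC clear value or bump the refcount of a matching existing entry;
 * when a genuinely new entry is written to HW, the table is also saved for
 * ELPG via pmu_save_zbc(). */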
3568 int gr_gk20a_add_zbc(struct gk20a *g, struct gr_gk20a *gr,
3569                      struct zbc_entry *zbc_val)
3570 {
3571         struct zbc_color_table *c_tbl;
3572         struct zbc_depth_table *d_tbl;
3573         u32 i, ret = -ENOMEM;
3574         bool added = false;
3575         u32 entries;
3576
3577         /* no endian swap ? */
3578
3579         switch (zbc_val->type) {
3580         case GK20A_ZBC_TYPE_COLOR:
3581                 /* search existing tables */
3582                 for (i = 0; i < gr->max_used_color_index; i++) {
3583
3584                         c_tbl = &gr->zbc_col_tbl[i];
3585
3586                         if (c_tbl->ref_cnt && c_tbl->format == zbc_val->format &&
3587                             memcmp(c_tbl->color_ds, zbc_val->color_ds,
3588                                 sizeof(zbc_val->color_ds)) == 0) {
3589
3590                                 if (memcmp(c_tbl->color_l2, zbc_val->color_l2,
3591                                     sizeof(zbc_val->color_l2))) {
3592                                         nvhost_err(dev_from_gk20a(g),
3593                                                 "zbc l2 and ds color don't match with existing entries");
3594                                         return -EINVAL;
3595                                 }
3596                                 added = true;
3597                                 c_tbl->ref_cnt++;
3598                                 ret = 0;
3599                                 break;
3600                         }
3601                 }
3602                 /* add new table */
3603                 if (!added &&
3604                     gr->max_used_color_index < GK20A_ZBC_TABLE_SIZE) {
3605
3606                         c_tbl =
3607                             &gr->zbc_col_tbl[gr->max_used_color_index];
3608                         WARN_ON(c_tbl->ref_cnt != 0);
3609
3610                         ret = gr_gk20a_add_zbc_color(g, gr,
3611                                 zbc_val, gr->max_used_color_index);
3612
3613                         if (!ret)
3614                                 gr->max_used_color_index++;
3615                 }
3616                 break;
3617         case GK20A_ZBC_TYPE_DEPTH:
3618                 /* search existing tables */
3619                 for (i = 0; i < gr->max_used_depth_index; i++) {
3620
3621                         d_tbl = &gr->zbc_dep_tbl[i];
3622
3623                         if (d_tbl->ref_cnt &&
3624                             d_tbl->depth == zbc_val->depth &&
3625                             d_tbl->format == zbc_val->format) {
3626                                 added = true;
3627                                 d_tbl->ref_cnt++;
3628                                 ret = 0;
3629                                 break;
3630                         }
3631                 }
3632                 /* add new table */
3633                 if (!added &&
3634                     gr->max_used_depth_index < GK20A_ZBC_TABLE_SIZE) {
3635
3636                         d_tbl =
3637                             &gr->zbc_dep_tbl[gr->max_used_depth_index];
3638                         WARN_ON(d_tbl->ref_cnt != 0);
3639
3640                         ret = gr_gk20a_add_zbc_depth(g, gr,
3641                                 zbc_val, gr->max_used_depth_index);
3642
3643                         if (!ret)
3644                                 gr->max_used_depth_index++;
3645                 }
3646                 break;
3647         default:
3648                 nvhost_err(dev_from_gk20a(g),
3649                         "invalid zbc table type %d", zbc_val->type);
3650                 return -EINVAL;
3651         }
3652
3653         if (!added && ret == 0) {
3654                 /* update zbc for elpg only when new entry is added */
3655                 entries = max(gr->max_used_color_index,
3656                                         gr->max_used_depth_index);
3657                 pmu_save_zbc(g, entries);
3658         }
3659
3660         return ret;
3661 }
3662
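/* Invalidate every color and depth entry in both the DS and L2 ZBC tables and
 * reset the SW copies, with gr engine activity disabled around the update. */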
3663 int gr_gk20a_clear_zbc_table(struct gk20a *g, struct gr_gk20a *gr)
3664 {
3665         struct fifo_gk20a *f = &g->fifo;
3666         struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
3667         u32 i, j;
3668         unsigned long end_jiffies = jiffies +
3669                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
3670         u32 ret;
3671
3672         ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
3673         if (ret) {
3674                 nvhost_err(dev_from_gk20a(g),
3675                         "failed to disable gr engine activity\n");
3676                 return ret;
3677         }
3678
3679         ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
3680         if (ret) {
3681                 nvhost_err(dev_from_gk20a(g),
3682                         "failed to idle graphics\n");
3683                 goto clean_up;
3684         }
3685
3686         for (i = 0; i < GK20A_ZBC_TABLE_SIZE; i++) {
3687                 gr->zbc_col_tbl[i].format = 0;
3688                 gr->zbc_col_tbl[i].ref_cnt = 0;
3689
3690                 gk20a_writel(g, gr_ds_zbc_color_fmt_r(),
3691                         gr_ds_zbc_color_fmt_val_invalid_f());
3692                 gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
3693                         gr_ds_zbc_tbl_index_val_f(i + GK20A_STARTOF_ZBC_TABLE));
3694
3695                 /* trigger the write */
3696                 gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
3697                         gr_ds_zbc_tbl_ld_select_c_f() |
3698                         gr_ds_zbc_tbl_ld_action_write_f() |
3699                         gr_ds_zbc_tbl_ld_trigger_active_f());
3700
3701                 /* clear l2 table */
3702                 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_index_r(),
3703                         (gk20a_readl(g, ltc_ltcs_ltss_dstg_zbc_index_r()) &
3704                          ~ltc_ltcs_ltss_dstg_zbc_index_address_f(~0)) |
3705                                 ltc_ltcs_ltss_dstg_zbc_index_address_f(i +
3706                                         GK20A_STARTOF_ZBC_TABLE));
3707
3708                 for (j = 0; j < ltc_ltcs_ltss_dstg_zbc_color_clear_value__size_1_v(); j++) {
3709                         gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_color_clear_value_r(j), 0);
3710                         gr->zbc_col_tbl[i].color_l2[j] = 0;
3711                         gr->zbc_col_tbl[i].color_ds[j] = 0;
3712                 }
3713         }
3714         gr->max_used_color_index = 0;
3715         gr->max_default_color_index = 0;
3716
3717         for (i = 0; i < GK20A_ZBC_TABLE_SIZE; i++) {
3718                 gr->zbc_dep_tbl[i].depth = 0;
3719                 gr->zbc_dep_tbl[i].format = 0;
3720                 gr->zbc_dep_tbl[i].ref_cnt = 0;
3721
3722                 gk20a_writel(g, gr_ds_zbc_z_fmt_r(),
3723                         gr_ds_zbc_z_fmt_val_invalid_f());
3724                 gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
3725                         gr_ds_zbc_tbl_index_val_f(i + GK20A_STARTOF_ZBC_TABLE));
3726
3727                 /* trigger the write */
3728                 gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
3729                         gr_ds_zbc_tbl_ld_select_z_f() |
3730                         gr_ds_zbc_tbl_ld_action_write_f() |
3731                         gr_ds_zbc_tbl_ld_trigger_active_f());
3732
3733                 /* clear l2 table */
3734                 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_index_r(),
3735                         (gk20a_readl(g, ltc_ltcs_ltss_dstg_zbc_index_r()) &
3736                          ~ltc_ltcs_ltss_dstg_zbc_index_address_f(~0)) |
3737                                 ltc_ltcs_ltss_dstg_zbc_index_address_f(i +
3738                                         GK20A_STARTOF_ZBC_TABLE));
3739
3740                 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_depth_clear_value_r(), 0);
3741         }
3742         gr->max_used_depth_index = 0;
3743         gr->max_default_depth_index = 0;
3744
3745 clean_up:
3746         ret = gk20a_fifo_enable_engine_activity(g, gr_info);
3747         if (ret) {
3748                 nvhost_err(dev_from_gk20a(g),
3749                         "failed to enable gr engine activity\n");
3750         }
3751
3752         /* elpg stuff */
3753
3754         return ret;
3755 }
3756
3757 /* get a zbc table entry specified by index
3758  * return table size when type is invalid */
3759 int gr_gk20a_query_zbc(struct gk20a *g, struct gr_gk20a *gr,
3760                         struct zbc_query_params *query_params)
3761 {
3762         u32 index = query_params->index_size;
3763         u32 i;
3764
3765         switch (query_params->type) {
3766         case GK20A_ZBC_TYPE_INVALID:
3767                 query_params->index_size = GK20A_ZBC_TABLE_SIZE;
3768                 break;
3769         case GK20A_ZBC_TYPE_COLOR:
3770                 if (index >= GK20A_ZBC_TABLE_SIZE) {
3771                         nvhost_err(dev_from_gk20a(g),
3772                                 "invalid zbc color table index\n");
3773                         return -EINVAL;
3774                 }
3775                 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3776                         query_params->color_l2[i] =
3777                                 gr->zbc_col_tbl[index].color_l2[i];
3778                         query_params->color_ds[i] =
3779                                 gr->zbc_col_tbl[index].color_ds[i];
3780                 }
3781                 query_params->format = gr->zbc_col_tbl[index].format;
3782                 query_params->ref_cnt = gr->zbc_col_tbl[index].ref_cnt;
3783                 break;
3784         case GK20A_ZBC_TYPE_DEPTH:
3785                 if (index >= GK20A_ZBC_TABLE_SIZE) {
3786                         nvhost_err(dev_from_gk20a(g),
3787                                 "invalid zbc depth table index\n");
3788                         return -EINVAL;
3789                 }
3790                 query_params->depth = gr->zbc_dep_tbl[index].depth;
3791                 query_params->format = gr->zbc_dep_tbl[index].format;
3792                 query_params->ref_cnt = gr->zbc_dep_tbl[index].ref_cnt;
3793                 break;
3794         default:
3795                 nvhost_err(dev_from_gk20a(g),
3796                                 "invalid zbc table type\n");
3797                 return -EINVAL;
3798         }
3799
3800         return 0;
3801 }
3802
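/* Load the default ZBC entries: four color entries (zero, unorm one, fp32
 * zero, fp32 one) and two fp32 depth entries (0.0 and 1.0). */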
3803 static int gr_gk20a_load_zbc_default_table(struct gk20a *g, struct gr_gk20a *gr)
3804 {
3805         struct zbc_entry zbc_val;
3806         u32 i, err;
3807
3808         /* load default color table */
3809         zbc_val.type = GK20A_ZBC_TYPE_COLOR;
3810
3811         zbc_val.format = gr_ds_zbc_color_fmt_val_zero_v();
3812         for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3813                 zbc_val.color_ds[i] = 0;
3814                 zbc_val.color_l2[i] = 0;
3815         }
3816         err = gr_gk20a_add_zbc(g, gr, &zbc_val);
3817
3818         zbc_val.format = gr_ds_zbc_color_fmt_val_unorm_one_v();
3819         for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3820                 zbc_val.color_ds[i] = 0xffffffff;
3821                 zbc_val.color_l2[i] = 0x3f800000;
3822         }
3823         err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3824
3825         zbc_val.format = gr_ds_zbc_color_fmt_val_rf32_gf32_bf32_af32_v();
3826         for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3827                 zbc_val.color_ds[i] = 0;
3828                 zbc_val.color_l2[i] = 0;
3829         }
3830         err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3831
3832         zbc_val.format = gr_ds_zbc_color_fmt_val_rf32_gf32_bf32_af32_v();
3833         for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3834                 zbc_val.color_ds[i] = 0x3f800000;
3835                 zbc_val.color_l2[i] = 0x3f800000;
3836         }
3837         err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3838
3839         if (!err)
3840                 gr->max_default_color_index = 4;
3841         else {
3842                 nvhost_err(dev_from_gk20a(g),
3843                            "fail to load default zbc color table\n");
3844                 return err;
3845         }
3846
3847         /* load default depth table */
3848         zbc_val.type = GK20A_ZBC_TYPE_DEPTH;
3849
3850         zbc_val.format = gr_ds_zbc_z_fmt_val_fp32_v();
3851         zbc_val.depth = 0;
3852         err = gr_gk20a_add_zbc(g, gr, &zbc_val);
3853
3854         zbc_val.format = gr_ds_zbc_z_fmt_val_fp32_v();
3855         zbc_val.depth = 0x3f800000;
3856         err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3857
3858         if (!err)
3859                 gr->max_default_depth_index = 2;
3860         else {
3861                 nvhost_err(dev_from_gk20a(g),
3862                            "fail to load default zbc depth table\n");
3863                 return err;
3864         }
3865
3866         return 0;
3867 }
3868
3869 static int gr_gk20a_init_zbc(struct gk20a *g, struct gr_gk20a *gr)
3870 {
3871         u32 i, j;
3872
3873         /* reset zbc clear */
3874         for (i = 0; i < GK20A_SIZEOF_ZBC_TABLE -
3875             GK20A_STARTOF_ZBC_TABLE; i++) {
3876                 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_index_r(),
3877                         (gk20a_readl(g, ltc_ltcs_ltss_dstg_zbc_index_r()) &
3878                          ~ltc_ltcs_ltss_dstg_zbc_index_address_f(~0)) |
3879                                 ltc_ltcs_ltss_dstg_zbc_index_address_f(
3880                                         i + GK20A_STARTOF_ZBC_TABLE));
3881                 for (j = 0; j < ltc_ltcs_ltss_dstg_zbc_color_clear_value__size_1_v(); j++)
3882                         gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_color_clear_value_r(j), 0);
3883                 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_depth_clear_value_r(), 0);
3884         }
3885
3886         gr_gk20a_clear_zbc_table(g, gr);
3887
3888         gr_gk20a_load_zbc_default_table(g, gr);
3889
3890         return 0;
3891 }
3892
3893 int gk20a_gr_zbc_set_table(struct gk20a *g, struct gr_gk20a *gr,
3894                         struct zbc_entry *zbc_val)
3895 {
3896         nvhost_dbg_fn("");
3897
3898         return gr_gk20a_elpg_protected_call(g,
3899                 gr_gk20a_add_zbc(g, gr, zbc_val));
3900 }
3901
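     /*
      * BLCG (block-level clock gating) and ELCG (engine-level clock gating)
      * are programmed through the per-engine therm_gate_ctrl register; the
      * two helpers below select run/auto (and, for elcg, stop) behaviour.
      */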
3902 void gr_gk20a_init_blcg_mode(struct gk20a *g, u32 mode, u32 engine)
3903 {
3904         u32 gate_ctrl;
3905
3906         gate_ctrl = gk20a_readl(g, therm_gate_ctrl_r(engine));
3907
3908         switch (mode) {
3909         case BLCG_RUN:
3910                 gate_ctrl = set_field(gate_ctrl,
3911                                 therm_gate_ctrl_blk_clk_m(),
3912                                 therm_gate_ctrl_blk_clk_run_f());
3913                 break;
3914         case BLCG_AUTO:
3915                 gate_ctrl = set_field(gate_ctrl,
3916                                 therm_gate_ctrl_blk_clk_m(),
3917                                 therm_gate_ctrl_blk_clk_auto_f());
3918                 break;
3919         default:
3920                 nvhost_err(dev_from_gk20a(g),
3921                         "invalid blcg mode %d", mode);
3922                 return;
3923         }
3924
3925         gk20a_writel(g, therm_gate_ctrl_r(engine), gate_ctrl);
3926 }
3927
3928 void gr_gk20a_init_elcg_mode(struct gk20a *g, u32 mode, u32 engine)
3929 {
3930         u32 gate_ctrl, idle_filter;
3931
3932         gate_ctrl = gk20a_readl(g, therm_gate_ctrl_r(engine));
3933
3934         switch (mode) {
3935         case ELCG_RUN:
3936                 gate_ctrl = set_field(gate_ctrl,
3937                                 therm_gate_ctrl_eng_clk_m(),
3938                                 therm_gate_ctrl_eng_clk_run_f());
3939                 gate_ctrl = set_field(gate_ctrl,
3940                                 therm_gate_ctrl_eng_pwr_m(),
3941                                 /* set elpg to auto to meet hw expectation */
3942                                 therm_gate_ctrl_eng_pwr_auto_f());
3943                 break;
3944         case ELCG_STOP:
3945                 gate_ctrl = set_field(gate_ctrl,
3946                                 therm_gate_ctrl_eng_clk_m(),
3947                                 therm_gate_ctrl_eng_clk_stop_f());
3948                 break;
3949         case ELCG_AUTO:
3950                 gate_ctrl = set_field(gate_ctrl,
3951                                 therm_gate_ctrl_eng_clk_m(),
3952                                 therm_gate_ctrl_eng_clk_auto_f());
3953                 break;
3954         default:
3955                 nvhost_err(dev_from_gk20a(g),
3956                         "invalid elcg mode %d", mode);
3957         }
3958
3959         if (tegra_platform_is_linsim()) {
3960                 gate_ctrl = set_field(gate_ctrl,
3961                         therm_gate_ctrl_eng_delay_after_m(),
3962                         therm_gate_ctrl_eng_delay_after_f(4));
3963         }
3964
3965         /* 2 * (1 << 9) = 1024 clks */
3966         gate_ctrl = set_field(gate_ctrl,
3967                 therm_gate_ctrl_eng_idle_filt_exp_m(),
3968                 therm_gate_ctrl_eng_idle_filt_exp_f(9));
3969         gate_ctrl = set_field(gate_ctrl,
3970                 therm_gate_ctrl_eng_idle_filt_mant_m(),
3971                 therm_gate_ctrl_eng_idle_filt_mant_f(2));
3972         gk20a_writel(g, therm_gate_ctrl_r(engine), gate_ctrl);
3973
3974         /* default fecs_idle_filter to 0 */
3975         idle_filter = gk20a_readl(g, therm_fecs_idle_filter_r());
3976         idle_filter &= ~therm_fecs_idle_filter_value_m();
3977         gk20a_writel(g, therm_fecs_idle_filter_r(), idle_filter);
3978         /* default hubmmu_idle_filter to 0 */
3979         idle_filter = gk20a_readl(g, therm_hubmmu_idle_filter_r());
3980         idle_filter &= ~therm_hubmmu_idle_filter_value_m();
3981         gk20a_writel(g, therm_hubmmu_idle_filter_r(), idle_filter);
3982 }
3983
3984 static int gr_gk20a_zcull_init_hw(struct gk20a *g, struct gr_gk20a *gr)
3985 {
3986         u32 gpc_index, gpc_tpc_count, gpc_zcull_count;
3987         u32 *zcull_map_tiles, *zcull_bank_counters;
3988         u32 map_counter;
3989         u32 rcp_conserv;
3990         u32 offset;
3991         bool floorsweep = false;
3992
3993         if (!gr->map_tiles)
3994                 return -1;
3995
3996         zcull_map_tiles = kzalloc(proj_scal_max_gpcs_v() *
3997                         proj_scal_max_tpc_per_gpc_v() * sizeof(u32), GFP_KERNEL);
3998         if (!zcull_map_tiles) {
3999                 nvhost_err(dev_from_gk20a(g),
4000                         "failed to allocate zcull temp buffers");
4001                 return -ENOMEM;
4002         }
4003         zcull_bank_counters = kzalloc(proj_scal_max_gpcs_v() *
4004                         proj_scal_max_tpc_per_gpc_v() * sizeof(u32), GFP_KERNEL);
4005
4006         if (!zcull_bank_counters) {
4007                 nvhost_err(dev_from_gk20a(g),
4008                         "failed to allocate zcull temp buffers");
4009                 kfree(zcull_map_tiles);
4010                 return -ENOMEM;
4011         }
4012
4013         for (map_counter = 0; map_counter < gr->tpc_count; map_counter++) {
4014                 zcull_map_tiles[map_counter] =
4015                         zcull_bank_counters[gr->map_tiles[map_counter]];
4016                 zcull_bank_counters[gr->map_tiles[map_counter]]++;
4017         }
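             /*
              * Illustrative example: with map_tiles = {0, 1, 0, 1} the loop
              * above produces zcull_map_tiles = {0, 0, 1, 1}, i.e. each TPC
              * gets the next free slot within the zcull bank it maps to.
              */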
4018
4019         gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map0_r(),
4020                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_0_f(zcull_map_tiles[0]) |
4021                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_1_f(zcull_map_tiles[1]) |
4022                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_2_f(zcull_map_tiles[2]) |
4023                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_3_f(zcull_map_tiles[3]) |
4024                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_4_f(zcull_map_tiles[4]) |
4025                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_5_f(zcull_map_tiles[5]) |
4026                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_6_f(zcull_map_tiles[6]) |
4027                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_7_f(zcull_map_tiles[7]));
4028
4029         gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map1_r(),
4030                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_8_f(zcull_map_tiles[8]) |
4031                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_9_f(zcull_map_tiles[9]) |
4032                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_10_f(zcull_map_tiles[10]) |
4033                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_11_f(zcull_map_tiles[11]) |
4034                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_12_f(zcull_map_tiles[12]) |
4035                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_13_f(zcull_map_tiles[13]) |
4036                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_14_f(zcull_map_tiles[14]) |
4037                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_15_f(zcull_map_tiles[15]));
4038
4039         gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map2_r(),
4040                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_16_f(zcull_map_tiles[16]) |
4041                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_17_f(zcull_map_tiles[17]) |
4042                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_18_f(zcull_map_tiles[18]) |
4043                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_19_f(zcull_map_tiles[19]) |
4044                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_20_f(zcull_map_tiles[20]) |
4045                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_21_f(zcull_map_tiles[21]) |
4046                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_22_f(zcull_map_tiles[22]) |
4047                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_23_f(zcull_map_tiles[23]));
4048
4049         gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map3_r(),
4050                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_24_f(zcull_map_tiles[24]) |
4051                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_25_f(zcull_map_tiles[25]) |
4052                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_26_f(zcull_map_tiles[26]) |
4053                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_27_f(zcull_map_tiles[27]) |
4054                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_28_f(zcull_map_tiles[28]) |
4055                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_29_f(zcull_map_tiles[29]) |
4056                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_30_f(zcull_map_tiles[30]) |
4057                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_31_f(zcull_map_tiles[31]));
4058
4059         kfree(zcull_map_tiles);
4060         kfree(zcull_bank_counters);
4061
4062         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4063                 gpc_tpc_count = gr->gpc_tpc_count[gpc_index];
4064                 gpc_zcull_count = gr->gpc_zcb_count[gpc_index];
4065
4066                 if (gpc_zcull_count != gr->max_zcull_per_gpc_count &&
4067                     gpc_zcull_count < gpc_tpc_count) {
4068                         nvhost_err(dev_from_gk20a(g),
4069                                 "zcull_banks (%d) less than tpcs (%d) for gpc (%d)",
4070                                 gpc_zcull_count, gpc_tpc_count, gpc_index);
4071                         return -EINVAL;
4072                 }
4073                 if (gpc_zcull_count != gr->max_zcull_per_gpc_count &&
4074                     gpc_zcull_count != 0)
4075                         floorsweep = true;
4076         }
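             /*
              * floorsweep is set when some GPC has a non-zero zcull bank count
              * that differs from the per-GPC maximum; in that case the zcull
              * ram layout below uses the maximum bank count for every GPC.
              */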
4077
4078         /* 1.0f / 1.0f * gr_gpc0_zcull_sm_num_rcp_conservative__max_v() */
4079         rcp_conserv = gr_gpc0_zcull_sm_num_rcp_conservative__max_v();
4080
4081         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4082                 offset = gpc_index * proj_gpc_stride_v();
4083
4084                 if (floorsweep) {
4085                         gk20a_writel(g, gr_gpc0_zcull_ram_addr_r() + offset,
4086                                 gr_gpc0_zcull_ram_addr_row_offset_f(gr->map_row_offset) |
4087                                 gr_gpc0_zcull_ram_addr_tiles_per_hypertile_row_per_gpc_f(
4088                                         gr->max_zcull_per_gpc_count));
4089                 } else {
4090                         gk20a_writel(g, gr_gpc0_zcull_ram_addr_r() + offset,
4091                                 gr_gpc0_zcull_ram_addr_row_offset_f(gr->map_row_offset) |
4092                                 gr_gpc0_zcull_ram_addr_tiles_per_hypertile_row_per_gpc_f(
4093                                         gr->gpc_tpc_count[gpc_index]));
4094                 }
4095
4096                 gk20a_writel(g, gr_gpc0_zcull_fs_r() + offset,
4097                         gr_gpc0_zcull_fs_num_active_banks_f(gr->gpc_zcb_count[gpc_index]) |
4098                         gr_gpc0_zcull_fs_num_sms_f(gr->tpc_count));
4099
4100                 gk20a_writel(g, gr_gpc0_zcull_sm_num_rcp_r() + offset,
4101                         gr_gpc0_zcull_sm_num_rcp_conservative_f(rcp_conserv));
4102         }
4103
4104         gk20a_writel(g, gr_gpcs_ppcs_wwdx_sm_num_rcp_r(),
4105                 gr_gpcs_ppcs_wwdx_sm_num_rcp_conservative_f(rcp_conserv));
4106
4107         return 0;
4108 }
4109
4110 static void gk20a_gr_enable_gpc_exceptions(struct gk20a *g)
4111 {
4112         /* enable tpc exception forwarding */
4113         gk20a_writel(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r(),
4114                 gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_enabled_f());
4115
4116         /* enable gpc exception forwarding */
4117         gk20a_writel(g, gr_gpc0_gpccs_gpc_exception_en_r(),
4118                 gr_gpc0_gpccs_gpc_exception_en_tpc_0_enabled_f());
4119 }
4120
4121 static int gk20a_init_gr_setup_hw(struct gk20a *g)
4122 {
4123         struct gr_gk20a *gr = &g->gr;
4124         struct aiv_list_gk20a *sw_ctx_load = &g->gr.ctx_vars.sw_ctx_load;
4125         struct av_list_gk20a *sw_bundle_init = &g->gr.ctx_vars.sw_bundle_init;
4126         struct av_list_gk20a *sw_method_init = &g->gr.ctx_vars.sw_method_init;
4127         u32 data;
4128         u32 addr_lo, addr_hi;
4129         u64 addr;
4130         u32 compbit_base_post_divide;
4131         u64 compbit_base_post_multiply64;
4132         unsigned long end_jiffies = jiffies +
4133                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
4134         u32 fe_go_idle_timeout_save;
4135         u32 last_bundle_data = 0;
4136         u32 last_method_data = 0;
4137         u32 i, err;
4138         u32 l1c_dbg_reg_val;
4139
4140         nvhost_dbg_fn("");
4141
4142         /* slcg prod values */
4143         gr_gk20a_slcg_gr_load_gating_prod(g, g->slcg_enabled);
4144         gr_gk20a_slcg_perf_load_gating_prod(g, g->slcg_enabled);
4145
4146         /* init mmu debug buffer */
4147         addr = NV_MC_SMMU_VADDR_TRANSLATE(gr->mmu_wr_mem.iova);
4148         addr_lo = u64_lo32(addr);
4149         addr_hi = u64_hi32(addr);
4150         addr = (addr_lo >> fb_mmu_debug_wr_addr_alignment_v()) |
4151                 (addr_hi << (32 - fb_mmu_debug_wr_addr_alignment_v()));
4152
4153         gk20a_writel(g, fb_mmu_debug_wr_r(),
4154                      fb_mmu_debug_wr_aperture_vid_mem_f() |
4155                      fb_mmu_debug_wr_vol_false_f() |
4156                      fb_mmu_debug_wr_addr_v(addr));
4157
4158         addr = NV_MC_SMMU_VADDR_TRANSLATE(gr->mmu_rd_mem.iova);
4159         addr_lo = u64_lo32(addr);
4160         addr_hi = u64_hi32(addr);
4161         addr = (addr_lo >> fb_mmu_debug_rd_addr_alignment_v()) |
4162                 (addr_hi << (32 - fb_mmu_debug_rd_addr_alignment_v()));
4163
4164         gk20a_writel(g, fb_mmu_debug_rd_r(),
4165                      fb_mmu_debug_rd_aperture_vid_mem_f() |
4166                      fb_mmu_debug_rd_vol_false_f() |
4167                      fb_mmu_debug_rd_addr_v(addr));
4168
4169         /* load gr floorsweeping registers */
4170         data = gk20a_readl(g, gr_gpc0_ppc0_pes_vsc_strem_r());
4171         data = set_field(data, gr_gpc0_ppc0_pes_vsc_strem_master_pe_m(),
4172                         gr_gpc0_ppc0_pes_vsc_strem_master_pe_true_f());
4173         gk20a_writel(g, gr_gpc0_ppc0_pes_vsc_strem_r(), data);
4174
4175         gr_gk20a_zcull_init_hw(g, gr);
4176
4177         gr_gk20a_blcg_gr_load_gating_prod(g, g->blcg_enabled);
4178         gr_gk20a_pg_gr_load_gating_prod(g, true);
4179
4180         if (g->elcg_enabled) {
4181                 gr_gk20a_init_elcg_mode(g, ELCG_AUTO, ENGINE_GR_GK20A);
4182                 gr_gk20a_init_elcg_mode(g, ELCG_AUTO, ENGINE_CE2_GK20A);
4183         } else {
4184                 gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_GR_GK20A);
4185                 gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_CE2_GK20A);
4186         }
4187
4188         /* Bug 1340570: increase the clock timeout to avoid potential
4189          * operation failure at high gpcclk rate. Default values are 0x400.
4190          */
4191         gk20a_writel(g, pri_ringstation_sys_master_config_r(0x15), 0x800);
4192         gk20a_writel(g, pri_ringstation_gpc_master_config_r(0xa), 0x800);
4193         gk20a_writel(g, pri_ringstation_fbp_master_config_r(0x8), 0x800);
4194
4195         /* enable fifo access */
4196         gk20a_writel(g, gr_gpfifo_ctl_r(),
4197                      gr_gpfifo_ctl_access_enabled_f() |
4198                      gr_gpfifo_ctl_semaphore_access_enabled_f());
4199
4200         /* TBD: reload gr ucode when needed */
4201
4202         /* enable interrupts */
4203         gk20a_writel(g, gr_intr_r(), 0xFFFFFFFF);
4204         gk20a_writel(g, gr_intr_en_r(), 0xFFFFFFFF);
4205
4206         /* enable fecs error interrupts */
4207         gk20a_writel(g, gr_fecs_host_int_enable_r(),
4208                      gr_fecs_host_int_enable_fault_during_ctxsw_enable_f() |
4209                      gr_fecs_host_int_enable_umimp_firmware_method_enable_f() |
4210                      gr_fecs_host_int_enable_umimp_illegal_method_enable_f() |
4211                      gr_fecs_host_int_enable_watchdog_enable_f());
4212
4213         /* enable exceptions */
4214         gk20a_writel(g, gr_fe_hww_esr_r(),
4215                      gr_fe_hww_esr_en_enable_f() |
4216                      gr_fe_hww_esr_reset_active_f());
4217         gk20a_writel(g, gr_memfmt_hww_esr_r(),
4218                      gr_memfmt_hww_esr_en_enable_f() |
4219                      gr_memfmt_hww_esr_reset_active_f());
4220         gk20a_writel(g, gr_scc_hww_esr_r(),
4221                      gr_scc_hww_esr_en_enable_f() |
4222                      gr_scc_hww_esr_reset_active_f());
4223         gk20a_writel(g, gr_mme_hww_esr_r(),
4224                      gr_mme_hww_esr_en_enable_f() |
4225                      gr_mme_hww_esr_reset_active_f());
4226         gk20a_writel(g, gr_pd_hww_esr_r(),
4227                      gr_pd_hww_esr_en_enable_f() |
4228                      gr_pd_hww_esr_reset_active_f());
4229         gk20a_writel(g, gr_sked_hww_esr_r(), /* enabled by default */
4230                      gr_sked_hww_esr_reset_active_f());
4231         gk20a_writel(g, gr_ds_hww_esr_r(),
4232                      gr_ds_hww_esr_en_enabled_f() |
4233                      gr_ds_hww_esr_reset_task_f());
4234         gk20a_writel(g, gr_ds_hww_report_mask_r(),
4235                      gr_ds_hww_report_mask_sph0_err_report_f() |
4236                      gr_ds_hww_report_mask_sph1_err_report_f() |
4237                      gr_ds_hww_report_mask_sph2_err_report_f() |
4238                      gr_ds_hww_report_mask_sph3_err_report_f() |
4239                      gr_ds_hww_report_mask_sph4_err_report_f() |
4240                      gr_ds_hww_report_mask_sph5_err_report_f() |
4241                      gr_ds_hww_report_mask_sph6_err_report_f() |
4242                      gr_ds_hww_report_mask_sph7_err_report_f() |
4243                      gr_ds_hww_report_mask_sph8_err_report_f() |
4244                      gr_ds_hww_report_mask_sph9_err_report_f() |
4245                      gr_ds_hww_report_mask_sph10_err_report_f() |
4246                      gr_ds_hww_report_mask_sph11_err_report_f() |
4247                      gr_ds_hww_report_mask_sph12_err_report_f() |
4248                      gr_ds_hww_report_mask_sph13_err_report_f() |
4249                      gr_ds_hww_report_mask_sph14_err_report_f() |
4250                      gr_ds_hww_report_mask_sph15_err_report_f() |
4251                      gr_ds_hww_report_mask_sph16_err_report_f() |
4252                      gr_ds_hww_report_mask_sph17_err_report_f() |
4253                      gr_ds_hww_report_mask_sph18_err_report_f() |
4254                      gr_ds_hww_report_mask_sph19_err_report_f() |
4255                      gr_ds_hww_report_mask_sph20_err_report_f() |
4256                      gr_ds_hww_report_mask_sph21_err_report_f() |
4257                      gr_ds_hww_report_mask_sph22_err_report_f() |
4258                      gr_ds_hww_report_mask_sph23_err_report_f());
4259
4260         /* setup sm warp esr report masks */
4261         gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(),
4262                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_stack_error_report_f() |
4263                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_api_stack_error_report_f() |
4264                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_ret_empty_stack_error_report_f() |
4265                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_wrap_report_f() |
4266                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_pc_report_f() |
4267                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_overflow_report_f() |
4268                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_immc_addr_report_f() |
4269                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_reg_report_f() |
4270                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_encoding_report_f() |
4271                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_sph_instr_combo_report_f() |
4272                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param_report_f() |
4273                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_report_f() |
4274                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_reg_report_f() |
4275                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_addr_report_f() |
4276                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_addr_report_f() |
4277                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_addr_space_report_f() |
4278                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param2_report_f() |
4279                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_ldc_report_f() |
4280                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_geometry_sm_error_report_f() |
4281                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_divergent_report_f());
4282
4283         /* setup sm global esr report mask */
4284         gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(),
4285                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_sm_to_sm_fault_report_f() |
4286                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_l1_error_report_f() |
4287                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_multiple_warp_errors_report_f() |
4288                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_physical_stack_overflow_error_report_f() |
4289                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_int_report_f() |
4290                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_pause_report_f() |
4291                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_single_step_complete_report_f());
4292
4293         /* enable per GPC exceptions */
4294         gk20a_gr_enable_gpc_exceptions(g);
4295
4296         /* TBD: ECC for L1/SM */
4297         /* TBD: enable per BE exceptions */
4298
4299         /* reset and enable all exceptions */
4300         gk20a_writel(g, gr_exception_r(), 0xFFFFFFFF);
4301         gk20a_writel(g, gr_exception_en_r(), 0xFFFFFFFF);
4302         gk20a_writel(g, gr_exception1_r(), 0xFFFFFFFF);
4303         gk20a_writel(g, gr_exception1_en_r(), 0xFFFFFFFF);
4304         gk20a_writel(g, gr_exception2_r(), 0xFFFFFFFF);
4305         gk20a_writel(g, gr_exception2_en_r(), 0xFFFFFFFF);
4306
4307         /* ignore status from some units */
4308         data = gk20a_readl(g, gr_status_mask_r());
4309         gk20a_writel(g, gr_status_mask_r(), data & gr->status_disable_mask);
4310
4311         gr_gk20a_init_zbc(g, gr);
4312
4313         {
4314                 u64 compbit_base_post_divide64 = (gr->compbit_store.base_pa >>
4315                                 ltc_ltcs_ltss_cbc_base_alignment_shift_v());
4316                 do_div(compbit_base_post_divide64, gr->num_fbps);
4317                 compbit_base_post_divide = u64_lo32(compbit_base_post_divide64);
4318         }
4319
4320         compbit_base_post_multiply64 = ((u64)compbit_base_post_divide *
4321                 gr->num_fbps) << ltc_ltcs_ltss_cbc_base_alignment_shift_v();
4322
4323         if (compbit_base_post_multiply64 < gr->compbit_store.base_pa)
4324                 compbit_base_post_divide++;
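             /*
              * The divide above rounds down; if multiplying back out lands
              * below the real compbit store base, bump the value so the
              * programmed cbc base is never below compbit_store.base_pa.
              */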
4325
4326         gk20a_writel(g, ltc_ltcs_ltss_cbc_base_r(),
4327                 compbit_base_post_divide);
4328
4329         nvhost_dbg(dbg_info | dbg_map | dbg_pte,
4330                    "compbit base.pa: 0x%x,%08x cbc_base:0x%08x\n",
4331                    (u32)(gr->compbit_store.base_pa>>32),
4332                    (u32)(gr->compbit_store.base_pa & 0xffffffff),
4333                    compbit_base_post_divide);
4334
4335         /* load ctx init */
4336         for (i = 0; i < sw_ctx_load->count; i++)
4337                 gk20a_writel(g, sw_ctx_load->l[i].addr,
4338                              sw_ctx_load->l[i].value);
4339
4340         err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4341         if (err)
4342                 goto out;
4343
4344         /* save and disable fe_go_idle */
4345         fe_go_idle_timeout_save =
4346                 gk20a_readl(g, gr_fe_go_idle_timeout_r());
4347         gk20a_writel(g, gr_fe_go_idle_timeout_r(),
4348                 (fe_go_idle_timeout_save & gr_fe_go_idle_timeout_count_f(0)) |
4349                 gr_fe_go_idle_timeout_count_disabled_f());
4350
4351         /* override a few ctx state registers */
4352         gr_gk20a_commit_global_cb_manager(g, NULL, false);
4353         gr_gk20a_commit_global_timeslice(g, NULL, false);
4354
4355         /* floorsweep anything left */
4356         gr_gk20a_ctx_state_floorsweep(g);
4357
4358         err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4359         if (err)
4360                 goto restore_fe_go_idle;
4361
4362         /* enable pipe mode override */
4363         gk20a_writel(g, gr_pipe_bundle_config_r(),
4364                 gr_pipe_bundle_config_override_pipe_mode_enabled_f());
4365
4366         /* load bundle init */
4367         err = 0;
4368         for (i = 0; i < sw_bundle_init->count; i++) {
4369
4370                 if (i == 0 || last_bundle_data != sw_bundle_init->l[i].value) {
4371                         gk20a_writel(g, gr_pipe_bundle_data_r(),
4372                                 sw_bundle_init->l[i].value);
4373                         last_bundle_data = sw_bundle_init->l[i].value;
4374                 }
4375
4376                 gk20a_writel(g, gr_pipe_bundle_address_r(),
4377                              sw_bundle_init->l[i].addr);
4378
4379                 if (gr_pipe_bundle_address_value_v(sw_bundle_init->l[i].addr) ==
4380                     GR_GO_IDLE_BUNDLE)
4381                         err |= gr_gk20a_wait_idle(g, end_jiffies,
4382                                         GR_IDLE_CHECK_DEFAULT);
4383                 else if (0) { /* IS_SILICON */
4384                         u32 delay = GR_IDLE_CHECK_DEFAULT;
4385                         do {
4386                                 u32 gr_status = gk20a_readl(g, gr_status_r());
4387
4388                                 if (gr_status_fe_method_lower_v(gr_status) ==
4389                                     gr_status_fe_method_lower_idle_v())
4390                                         break;
4391
4392                                 usleep_range(delay, delay * 2);
4393                                 delay = min_t(u32, delay << 1,
4394                                         GR_IDLE_CHECK_MAX);
4395
4396                         } while (time_before(jiffies, end_jiffies) ||
4397                                         !tegra_platform_is_silicon());
4398                 }
4399         }
4400
4401         /* disable pipe mode override */
4402         gk20a_writel(g, gr_pipe_bundle_config_r(),
4403                      gr_pipe_bundle_config_override_pipe_mode_disabled_f());
4404
4405 restore_fe_go_idle:
4406         /* restore fe_go_idle */
4407         gk20a_writel(g, gr_fe_go_idle_timeout_r(), fe_go_idle_timeout_save);
4408
4409         if (err || gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT))
4410                 goto out;
4411
4412         /* load method init */
4413         if (sw_method_init->count) {
4414                 gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
4415                              sw_method_init->l[0].value);
4416                 gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
4417                              gr_pri_mme_shadow_raw_index_write_trigger_f() |
4418                              sw_method_init->l[0].addr);
4419                 last_method_data = sw_method_init->l[0].value;
4420         }
4421         for (i = 1; i < sw_method_init->count; i++) {
4422                 if (sw_method_init->l[i].value != last_method_data) {
4423                         gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
4424                                 sw_method_init->l[i].value);
4425                         last_method_data = sw_method_init->l[i].value;
4426                 }
4427                 gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
4428                         gr_pri_mme_shadow_raw_index_write_trigger_f() |
4429                         sw_method_init->l[i].addr);
4430         }
4431
4432         gk20a_mm_l2_invalidate(g);
4433
4434         /* turn on cya15 bit for a default val that missed the cut */
4435         l1c_dbg_reg_val = gk20a_readl(g, gr_gpc0_tpc0_l1c_dbg_r());
4436         l1c_dbg_reg_val |= gr_gpc0_tpc0_l1c_dbg_cya15_en_f();
4437         gk20a_writel(g, gr_gpc0_tpc0_l1c_dbg_r(), l1c_dbg_reg_val);
4438
4439         err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4440         if (err)
4441                 goto out;
4442
4443 out:
4444         nvhost_dbg_fn("done");
4445         return err;
4446 }
4447
4448 static int gk20a_init_gr_prepare(struct gk20a *g)
4449 {
4450         u32 gpfifo_ctrl, pmc_en;
4451         u32 err = 0;
4452
4453         /* disable fifo access */
4454         pmc_en = gk20a_readl(g, mc_enable_r());
4455         if (pmc_en & mc_enable_pgraph_enabled_f()) {
4456                 gpfifo_ctrl = gk20a_readl(g, gr_gpfifo_ctl_r());
4457                 gpfifo_ctrl &= ~gr_gpfifo_ctl_access_enabled_f();
4458                 gk20a_writel(g, gr_gpfifo_ctl_r(), gpfifo_ctrl);
4459         }
4460
4461         /* reset gr engine */
4462         gk20a_reset(g, mc_enable_pgraph_enabled_f()
4463                         | mc_enable_blg_enabled_f()
4464                         | mc_enable_perfmon_enabled_f());
4465
4466         /* enable fifo access */
4467         gk20a_writel(g, gr_gpfifo_ctl_r(),
4468                 gr_gpfifo_ctl_access_enabled_f() |
4469                 gr_gpfifo_ctl_semaphore_access_enabled_f());
4470
4471         if (!g->gr.ctx_vars.valid) {
4472                 err = gr_gk20a_init_ctx_vars(g, &g->gr);
4473                 if (err)
4474                         nvhost_err(dev_from_gk20a(g),
4475                                 "failed to load gr init ctx");
4476         }
4477         return err;
4478 }
4479
4480 static int gk20a_init_gr_reset_enable_hw(struct gk20a *g)
4481 {
4482         struct gr_gk20a *gr = &g->gr;
4483         struct av_list_gk20a *sw_non_ctx_load = &g->gr.ctx_vars.sw_non_ctx_load;
4484         unsigned long end_jiffies = jiffies +
4485                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
4486         u32 i, err = 0;
4487
4488         nvhost_dbg_fn("");
4489
4490         /* enable interrupts */
4491         gk20a_writel(g, gr_intr_r(), ~0);
4492         gk20a_writel(g, gr_intr_en_r(), ~0);
4493
4494         /* reset ctx switch state */
4495         gr_gk20a_ctx_reset(g, 0);
4496
4497         /* clear scc ram */
4498         gk20a_writel(g, gr_scc_init_r(),
4499                 gr_scc_init_ram_trigger_f());
4500
4501         /* load non_ctx init */
4502         for (i = 0; i < sw_non_ctx_load->count; i++)
4503                 gk20a_writel(g, sw_non_ctx_load->l[i].addr,
4504                         sw_non_ctx_load->l[i].value);
4505
4506         err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4507         if (err)
4508                 goto out;
4509
4510         err = gr_gk20a_load_ctxsw_ucode(g, gr);
4511         if (err)
4512                 goto out;
4513
4514         /* this appears to query sw state, but fecs actually initializes
4515            the ramchain, etc., so this is hw init */
4516         err = gr_gk20a_init_ctx_state(g, gr);
4517         if (err)
4518                 goto out;
4519
4520 out:
4521         if (err)
4522                 nvhost_err(dev_from_gk20a(g), "fail");
4523         else
4524                 nvhost_dbg_fn("done");
4525
4526         return err;
4527 }
4528
4529 /*
4530  * XXX Merge this list with the debugger/profiler
4531  * session regops whitelists?
4532  */
4533 static u32 wl_addr_gk20a[] = {
4534         /* this list must be sorted (low to high) */
4535         0x404468, /* gr_pri_mme_max_instructions       */
4536         0x418800, /* gr_pri_gpcs_setup_debug           */
4537         0x419a04, /* gr_pri_gpcs_tpcs_tex_lod_dbg      */
4538         0x419a08, /* gr_pri_gpcs_tpcs_tex_samp_dbg     */
4539         0x419e10, /* gr_pri_gpcs_tpcs_sm_dbgr_control0 */
4540         0x419f78, /* gr_pri_gpcs_tpcs_sm_disp_ctrl     */
4541 };
4542
4543 static int gr_gk20a_init_access_map(struct gk20a *g)
4544 {
4545         struct gr_gk20a *gr = &g->gr;
4546         struct mem_handle *mem;
4547         void *data;
4548         u32 w, page, nr_pages =
4549                 DIV_ROUND_UP(gr->ctx_vars.priv_access_map_size,
4550                              PAGE_SIZE);
4551
4552         mem = gr->global_ctx_buffer[PRIV_ACCESS_MAP].ref;
4553
4554         for (page = 0; page < nr_pages; page++) {
4555                 data = nvhost_memmgr_kmap(mem, page);
4556                 if (!data) {
4557                         nvhost_err(dev_from_gk20a(g),
4558                                    "failed to map priv access map memory");
4559                         return -ENOMEM;
4560                 }
4561                 memset(data, 0x0, PAGE_SIZE);
4562
4563                 /* fine only while ARRAY_SIZE(wl_addr_gk20a) stays small */
4564                 for (w = 0; w < ARRAY_SIZE(wl_addr_gk20a); w++) {
4565                         u32 map_bit, map_byte, map_shift;
4566                         u32 map_page, pb_idx;
4567                         map_bit = wl_addr_gk20a[w] >> 2;
4568                         map_byte = map_bit >> 3;
4569                         map_page = map_byte >> PAGE_SHIFT;
4570                         if (map_page != page)
4571                                 continue;
4572                         map_shift = map_bit & 0x7; /* i.e. 0-7 */
4573                         pb_idx = (map_byte & ~PAGE_MASK);
4574                         nvhost_dbg_info(
4575                                 "access map addr:0x%x pg:%d pb:%d bit:%d",
4576                                 wl_addr_gk20a[w], map_page, pb_idx, map_shift);
4577                         ((u8 *)data)[pb_idx] |= (1 << map_shift);
4578                 }
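                     /*
                      * Illustrative example: for 0x418800 the word index is
                      * 0x418800 >> 2 = 0x106200 and the byte index 0x20c40, so
                      * with 4 KiB pages this sets bit 0 of byte 0xc40 in map
                      * page 0x20.
                      */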
4579                 /* uncached on cpu side, so no need to flush? */
4580                 nvhost_memmgr_kunmap(mem, page, data);
4581         }
4582
4583         return 0;
4584 }
4585
4586 static int gk20a_init_gr_setup_sw(struct gk20a *g)
4587 {
4588         struct gr_gk20a *gr = &g->gr;
4589         int err;
4590
4591         nvhost_dbg_fn("");
4592
4593         if (gr->sw_ready) {
4594                 nvhost_dbg_fn("skip init");
4595                 return 0;
4596         }
4597
4598         gr->g = g;
4599
4600         err = gr_gk20a_init_gr_config(g, gr);
4601         if (err)
4602                 goto clean_up;
4603
4604         err = gr_gk20a_init_mmu_sw(g, gr);
4605         if (err)
4606                 goto clean_up;
4607
4608         err = gr_gk20a_init_map_tiles(g, gr);
4609         if (err)
4610                 goto clean_up;
4611
4612         if (tegra_cpu_is_asim())
4613                 gr->max_comptag_mem = 1; /* MBs worth of comptag coverage */
4614         else {
4615                 nvhost_dbg_info("total ram pages : %lu", totalram_pages);
4616                 gr->max_comptag_mem = totalram_pages
4617                                          >> (10 - (PAGE_SHIFT - 10));
4618         }
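             /*
              * totalram_pages >> (10 - (PAGE_SHIFT - 10)) is simply
              * totalram_pages >> (20 - PAGE_SHIFT), i.e. total system RAM in
              * MiB (a right shift by 8 with 4 KiB pages).
              */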
4619         err = gr_gk20a_init_comptag(g, gr);
4620         if (err)
4621                 goto clean_up;
4622
4623         err = gr_gk20a_init_zcull(g, gr);
4624         if (err)
4625                 goto clean_up;
4626
4627         err = gr_gk20a_alloc_global_ctx_buffers(g);
4628         if (err)
4629                 goto clean_up;
4630
4631         err = gr_gk20a_init_access_map(g);
4632         if (err)
4633                 goto clean_up;
4634
4635         mutex_init(&gr->ctx_mutex);
4636         spin_lock_init(&gr->ch_tlb_lock);
4637
4638         gr->remove_support = gk20a_remove_gr_support;
4639         gr->sw_ready = true;
4640
4641         nvhost_dbg_fn("done");
4642         return 0;
4643
4644 clean_up:
4645         nvhost_err(dev_from_gk20a(g), "fail");
4646         gk20a_remove_gr_support(gr);
4647         return err;
4648 }
4649
4650 int gk20a_init_gr_support(struct gk20a *g)
4651 {
4652         u32 err;
4653
4654         nvhost_dbg_fn("");
4655
4656         err = gk20a_init_gr_prepare(g);
4657         if (err)
4658                 return err;
4659
4660         /* this is required before gr_gk20a_init_ctx_state */
4661         mutex_init(&g->gr.fecs_mutex);
4662
4663         err = gk20a_init_gr_reset_enable_hw(g);
4664         if (err)
4665                 return err;
4666
4667         err = gk20a_init_gr_setup_sw(g);
4668         if (err)
4669                 return err;
4670
4671         err = gk20a_init_gr_setup_hw(g);
4672         if (err)
4673                 return err;
4674
4675         return 0;
4676 }
4677
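     /*
      * Method offsets (in bytes) that are handled in sw when the hw raises an
      * illegal method interrupt; gk20a_gr_handle_illegal_method() matches them
      * against isr_data->offset << 2 for the KEPLER_C and KEPLER_COMPUTE_A
      * classes.
      */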
4678 #define NVA297_SET_ALPHA_CIRCULAR_BUFFER_SIZE   0x02dc
4679 #define NVA297_SET_CIRCULAR_BUFFER_SIZE         0x1280
4680 #define NVA297_SET_SHADER_EXCEPTIONS            0x1528
4681 #define NVA0C0_SET_SHADER_EXCEPTIONS            0x1528
4682
4683 #define NVA297_SET_SHADER_EXCEPTIONS_ENABLE_FALSE 0
4684
4685 struct gr_isr_data {
4686         u32 addr;
4687         u32 data_lo;
4688         u32 data_hi;
4689         u32 curr_ctx;
4690         u32 chid;
4691         u32 offset;
4692         u32 sub_chan;
4693         u32 class_num;
4694 };
4695
4696 static void gk20a_gr_set_shader_exceptions(struct gk20a *g,
4697                                            struct gr_isr_data *isr_data)
4698 {
4699         u32 val;
4700
4701         nvhost_dbg_fn("");
4702
4703         if (isr_data->data_lo ==
4704             NVA297_SET_SHADER_EXCEPTIONS_ENABLE_FALSE)
4705                 val = 0;
4706         else
4707                 val = ~0;
4708
4709         gk20a_writel(g,
4710                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(),
4711                 val);
4712         gk20a_writel(g,
4713                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(),
4714                 val);
4715 }
4716
4717 static void gk20a_gr_set_circular_buffer_size(struct gk20a *g,
4718                         struct gr_isr_data *isr_data)
4719 {
4720         struct gr_gk20a *gr = &g->gr;
4721         u32 gpc_index, ppc_index, stride, val, offset;
4722         u32 cb_size = isr_data->data_lo * 4;
4723
4724         nvhost_dbg_fn("");
4725
4726         if (cb_size > gr->attrib_cb_size)
4727                 cb_size = gr->attrib_cb_size;
4728
4729         gk20a_writel(g, gr_ds_tga_constraintlogic_r(),
4730                 (gk20a_readl(g, gr_ds_tga_constraintlogic_r()) &
4731                  ~gr_ds_tga_constraintlogic_beta_cbsize_f(~0)) |
4732                  gr_ds_tga_constraintlogic_beta_cbsize_f(cb_size));
4733
4734         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4735                 stride = proj_gpc_stride_v() * gpc_index;
4736
4737                 for (ppc_index = 0; ppc_index < gr->gpc_ppc_count[gpc_index];
4738                         ppc_index++) {
4739
4740                         val = gk20a_readl(g, gr_gpc0_ppc0_cbm_cfg_r() +
4741                                 stride +
4742                                 proj_ppc_in_gpc_stride_v() * ppc_index);
4743
4744                         offset = gr_gpc0_ppc0_cbm_cfg_start_offset_v(val);
4745
4746                         val = set_field(val,
4747                                 gr_gpc0_ppc0_cbm_cfg_size_m(),
4748                                 gr_gpc0_ppc0_cbm_cfg_size_f(cb_size *
4749                                         gr->pes_tpc_count[ppc_index][gpc_index]));
4750                         val = set_field(val,
4751                                 gr_gpc0_ppc0_cbm_cfg_start_offset_m(),
4752                                 (offset + 1));
4753
4754                         gk20a_writel(g, gr_gpc0_ppc0_cbm_cfg_r() +
4755                                 stride +
4756                                 proj_ppc_in_gpc_stride_v() * ppc_index, val);
4757
4758                         val = set_field(val,
4759                                 gr_gpc0_ppc0_cbm_cfg_start_offset_m(),
4760                                 offset);
4761
4762                         gk20a_writel(g, gr_gpc0_ppc0_cbm_cfg_r() +
4763                                 stride +
4764                                 proj_ppc_in_gpc_stride_v() * ppc_index, val);
4765                 }
4766         }
4767 }
4768
4769 static void gk20a_gr_set_alpha_circular_buffer_size(struct gk20a *g,
4770                                                 struct gr_isr_data *isr_data)
4771 {
4772         struct gr_gk20a *gr = &g->gr;
4773         u32 gpc_index, ppc_index, stride, val;
4774         u32 pd_ab_max_output;
4775         u32 alpha_cb_size = isr_data->data_lo * 4;
4776
4777         nvhost_dbg_fn("");
4778         /* if (NO_ALPHA_BETA_TIMESLICE_SUPPORT_DEF)
4779                 return; */
4780
4781         if (alpha_cb_size > gr->alpha_cb_size)
4782                 alpha_cb_size = gr->alpha_cb_size;
4783
4784         gk20a_writel(g, gr_ds_tga_constraintlogic_r(),
4785                 (gk20a_readl(g, gr_ds_tga_constraintlogic_r()) &
4786                  ~gr_ds_tga_constraintlogic_alpha_cbsize_f(~0)) |
4787                  gr_ds_tga_constraintlogic_alpha_cbsize_f(alpha_cb_size));
4788
4789         pd_ab_max_output = alpha_cb_size *
4790                 gr_gpc0_ppc0_cbm_cfg_size_granularity_v() /
4791                 gr_pd_ab_dist_cfg1_max_output_granularity_v();
4792
4793         gk20a_writel(g, gr_pd_ab_dist_cfg1_r(),
4794                 gr_pd_ab_dist_cfg1_max_output_f(pd_ab_max_output));
4795
4796         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4797                 stride = proj_gpc_stride_v() * gpc_index;
4798
4799                 for (ppc_index = 0; ppc_index < gr->gpc_ppc_count[gpc_index];
4800                         ppc_index++) {
4801
4802                         val = gk20a_readl(g, gr_gpc0_ppc0_cbm_cfg2_r() +
4803                                 stride +
4804                                 proj_ppc_in_gpc_stride_v() * ppc_index);
4805
4806                         val = set_field(val, gr_gpc0_ppc0_cbm_cfg2_size_m(),
4807                                         gr_gpc0_ppc0_cbm_cfg2_size_f(alpha_cb_size *
4808                                                 gr->pes_tpc_count[ppc_index][gpc_index]));
4809
4810                         gk20a_writel(g, gr_gpc0_ppc0_cbm_cfg2_r() +
4811                                 stride +
4812                                 proj_ppc_in_gpc_stride_v() * ppc_index, val);
4813                 }
4814         }
4815 }
4816
4817 void gk20a_gr_reset(struct gk20a *g)
4818 {
4819         int err;
4820         err = gk20a_init_gr_prepare(g);
4821         BUG_ON(err);
4822         err = gk20a_init_gr_reset_enable_hw(g);
4823         BUG_ON(err);
4824         err = gk20a_init_gr_setup_hw(g);
4825         BUG_ON(err);
4826 }
4827
4828 static int gk20a_gr_handle_illegal_method(struct gk20a *g,
4829                                           struct gr_isr_data *isr_data)
4830 {
4831         nvhost_dbg_fn("");
4832
4833         if (isr_data->class_num == KEPLER_COMPUTE_A) {
4834                 switch (isr_data->offset << 2) {
4835                 case NVA0C0_SET_SHADER_EXCEPTIONS:
4836                         gk20a_gr_set_shader_exceptions(g, isr_data);
4837                         break;
4838                 default:
4839                         goto fail;
4840                 }
4841         }
4842
4843         if (isr_data->class_num == KEPLER_C) {
4844                 switch (isr_data->offset << 2) {
4845                 case NVA297_SET_SHADER_EXCEPTIONS:
4846                         gk20a_gr_set_shader_exceptions(g, isr_data);
4847                         break;
4848                 case NVA297_SET_CIRCULAR_BUFFER_SIZE:
4849                         gk20a_gr_set_circular_buffer_size(g, isr_data);
4850                         break;
4851                 case NVA297_SET_ALPHA_CIRCULAR_BUFFER_SIZE:
4852                         gk20a_gr_set_alpha_circular_buffer_size(g, isr_data);
4853                         break;
4854                 default:
4855                         goto fail;
4856                 }
4857         }
4858         return 0;
4859
4860 fail:
4861         nvhost_err(dev_from_gk20a(g), "invalid method class 0x%08x"
4862                 ", offset 0x%08x address 0x%08x\n",
4863                 isr_data->class_num, isr_data->offset, isr_data->addr);
4864         return -EINVAL;
4865 }
4866
4867 static int gk20a_gr_handle_semaphore_timeout_pending(struct gk20a *g,
4868                   struct gr_isr_data *isr_data)
4869 {
4870         struct fifo_gk20a *f = &g->fifo;
4871         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4872         nvhost_dbg_fn("");
4873         gk20a_set_error_notifier(ch->hwctx,
4874                                 NVHOST_CHANNEL_GR_SEMAPHORE_TIMEOUT);
4875         nvhost_err(dev_from_gk20a(g),
4876                    "gr semaphore timeout\n");
4877         return -EINVAL;
4878 }
4879
4880 static int gk20a_gr_intr_illegal_notify_pending(struct gk20a *g,
4881                   struct gr_isr_data *isr_data)
4882 {
4883         struct fifo_gk20a *f = &g->fifo;
4884         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4885         nvhost_dbg_fn("");
4886         gk20a_set_error_notifier(ch->hwctx,
4887                                 NVHOST_CHANNEL_GR_ILLEGAL_NOTIFY);
4888         /* This is an unrecoverable error, reset is needed */
4889         nvhost_err(dev_from_gk20a(g),
4890                    "gr illegal notify pending\n");
4891         return -EINVAL;
4892 }
4893
4894 static int gk20a_gr_handle_illegal_class(struct gk20a *g,
4895                                           struct gr_isr_data *isr_data)
4896 {
4897         struct fifo_gk20a *f = &g->fifo;
4898         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4899         nvhost_dbg_fn("");
4900         gk20a_set_error_notifier(ch->hwctx,
4901                                 NVHOST_CHANNEL_GR_ERROR_SW_NOTIFY);
4902         nvhost_err(dev_from_gk20a(g),
4903                    "invalid class 0x%08x, offset 0x%08x",
4904                    isr_data->class_num, isr_data->offset);
4905         return -EINVAL;
4906 }
4907
4908 static int gk20a_gr_handle_class_error(struct gk20a *g,
4909                                           struct gr_isr_data *isr_data)
4910 {
4911         struct fifo_gk20a *f = &g->fifo;
4912         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4913         nvhost_dbg_fn("");
4914
4915         gk20a_set_error_notifier(ch->hwctx,
4916                         NVHOST_CHANNEL_GR_ERROR_SW_NOTIFY);
4917         nvhost_err(dev_from_gk20a(g),
4918                    "class error 0x%08x, offset 0x%08x",
4919                    isr_data->class_num, isr_data->offset);
4920         return -EINVAL;
4921 }
4922
4923 static int gk20a_gr_handle_semaphore_pending(struct gk20a *g,
4924                                              struct gr_isr_data *isr_data)
4925 {
4926         struct fifo_gk20a *f = &g->fifo;
4927         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4928
4929         wake_up(&ch->semaphore_wq);
4930
4931         return 0;
4932 }
4933
4934 static int gk20a_gr_handle_notify_pending(struct gk20a *g,
4935                                           struct gr_isr_data *isr_data)
4936 {
4937         struct fifo_gk20a *f = &g->fifo;
4938         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4939
4940 #if defined(CONFIG_GK20A_CYCLE_STATS)
4941         void *virtual_address;
4942         u32 buffer_size;
4943         u32 offset;
4944         u32 new_offset;
4945         bool exit;
4946         struct share_buffer_head *sh_hdr;
4947         u32 raw_reg;
4948         u64 mask_orig;
4949         u64 v = 0;
4950         struct gk20a_cyclestate_buffer_elem *op_elem;
4951         /* GL will never use payload 0 for cycle state */
4952         if ((ch->cyclestate.cyclestate_buffer == NULL) || (isr_data->data_lo == 0))
4953                 return 0;
4954
4955         mutex_lock(&ch->cyclestate.cyclestate_buffer_mutex);
4956
4957         virtual_address = ch->cyclestate.cyclestate_buffer;
4958         buffer_size = ch->cyclestate.cyclestate_buffer_size;
4959         offset = isr_data->data_lo;
4960         exit = false;
4961         while (!exit) {
4962                 if (offset >= buffer_size) {
4963                         WARN_ON(1);
4964                         break;
4965                 }
4966
4967                 sh_hdr = (struct share_buffer_head *)
4968                         ((char *)virtual_address + offset);
4969
4970                 if (sh_hdr->size < sizeof(struct share_buffer_head)) {
4971                         WARN_ON(1);
4972                         break;
4973                 }
4974                 new_offset = offset + sh_hdr->size;
4975
4976                 switch (sh_hdr->operation) {
4977                 case OP_END:
4978                         exit = true;
4979                         break;
4980
4981                 case BAR0_READ32:
4982                 case BAR0_WRITE32:
4983                 {
4984                         op_elem =
4985                                 (struct gk20a_cyclestate_buffer_elem *)
4986                                         sh_hdr;
4987                         if (op_elem->offset_bar0 <
4988                                 resource_size(g->reg_mem)) {
4989                                 mask_orig =
4990                                         ((1ULL <<
4991                                         (op_elem->last_bit + 1))
4992                                         -1)&~((1ULL <<
4993                                         op_elem->first_bit)-1);
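                                     /*
                                      * e.g. first_bit = 4 and last_bit = 7
                                      * give mask_orig = 0xf0: ones across
                                      * the bit field [first_bit, last_bit].
                                      */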
4994
4995                                 raw_reg =
4996                                         gk20a_readl(g,
4997                                                 op_elem->offset_bar0);
4998
4999                                 switch (sh_hdr->operation) {
5000                                 case BAR0_READ32:
5001                                         op_elem->data =
5002                                         (raw_reg & mask_orig)
5003                                                 >> op_elem->first_bit;
5004                                         break;
5005
5006                                 case BAR0_WRITE32:
5007                                         v = 0;
5008                                         if ((unsigned int)mask_orig !=
5009                                         (unsigned int)~0) {
5010                                                 v = (unsigned int)
5011                                                         (raw_reg & ~mask_orig);
5012                                         }
5013
5014                                         v |= ((op_elem->data
5015                                                 << op_elem->first_bit)
5016                                                 & mask_orig);
5017
5018                                         gk20a_writel(g,
5019                                                 op_elem->offset_bar0,
5020                                                 (unsigned int)v);
5021                                         break;
5022
5023                                 default:
5024                                         break;
5025                                 }
5026                         } else {
5027                                 sh_hdr->failed = true;
5028                                 WARN_ON(1);
5029                         }
5030                 }
5031                 break;
5032                 default:
5033                 /* no operation content case */
5034                         exit = true;
5035                         break;
5036                 }
5037                 sh_hdr->completed = true;
5038                 offset = new_offset;
5039         }
5040         mutex_unlock(&ch->cyclestate.cyclestate_buffer_mutex);
5041 #endif
5042         nvhost_dbg_fn("");
5043         wake_up(&ch->notifier_wq);
5044         return 0;
5045 }
5046
5047 /* Used by sw interrupt thread to translate current ctx to chid.
5048  * For performance, we don't want to go through 128 channels every time.
5049  * A small tlb is used here to cache translation */
5050 static int gk20a_gr_get_chid_from_ctx(struct gk20a *g, u32 curr_ctx)
5051 {
5052         struct fifo_gk20a *f = &g->fifo;
5053         struct gr_gk20a *gr = &g->gr;
5054         u32 chid = -1;
5055         u32 i;
5056
5057         spin_lock(&gr->ch_tlb_lock);
5058
5059         /* check cache first */
5060         for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) {
5061                 if (gr->chid_tlb[i].curr_ctx == curr_ctx) {
5062                         chid = gr->chid_tlb[i].hw_chid;
5063                         goto unlock;
5064                 }
5065         }
5066
5067         /* slow path */
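             /* gr_fecs_current_ctx_ptr holds the channel instance block
              * address in ram_in_base_shift units, so compare it against each
              * in-use channel's inst_block physical address. */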
5068         for (chid = 0; chid < f->num_channels; chid++) {
5069                 if (f->channel[chid].in_use &&
5070                     (u32)(f->channel[chid].inst_block.cpu_pa >>
5071                           ram_in_base_shift_v()) ==
5072                           gr_fecs_current_ctx_ptr_v(curr_ctx))
5073                         break;
5074         }
5075
5076         if (chid >= f->num_channels) {
5077                 chid = -1;
5078                 goto unlock;
5079         }
5080
5081         /* add to a free tlb entry */
5082         for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) {
5083                 if (gr->chid_tlb[i].curr_ctx == 0) {
5084                         gr->chid_tlb[i].curr_ctx = curr_ctx;
5085                         gr->chid_tlb[i].hw_chid = chid;
5086                         goto unlock;
5087                 }
5088         }
5089
5090         /* no free entry, flush one */
5091         gr->chid_tlb[gr->channel_tlb_flush_index].curr_ctx = curr_ctx;
5092         gr->chid_tlb[gr->channel_tlb_flush_index].hw_chid = chid;
5093
5094         gr->channel_tlb_flush_index =
5095                 (gr->channel_tlb_flush_index + 1) &
5096                 (GR_CHANNEL_MAP_TLB_SIZE - 1);
5097
5098 unlock:
5099         spin_unlock(&gr->ch_tlb_lock);
5100         return chid;
5101 }
5102
5103 static int gk20a_gr_lock_down_sm(struct gk20a *g, u32 global_esr_mask)
5104 {
5105         unsigned long end_jiffies = jiffies +
5106                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
5107         u32 delay = GR_IDLE_CHECK_DEFAULT;
5108         bool mmu_debug_mode_enabled = gk20a_mm_mmu_debug_mode_enabled(g);
5109         u32 dbgr_control0;
5110
5111         nvhost_dbg(dbg_intr | dbg_gpu_dbg, "locking down SM");
5112
5113         /* assert stop trigger */
5114         dbgr_control0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());
5115         dbgr_control0 |= gr_gpc0_tpc0_sm_dbgr_control0_stop_trigger_enable_f();
5116         gk20a_writel(g, gr_gpc0_tpc0_sm_dbgr_control0_r(), dbgr_control0);
5117
5118         /* wait for the sm to lock down */
5119         do {
5120                 u32 global_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r());
5121                 u32 warp_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_warp_esr_r());
5122                 u32 dbgr_status0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_status0_r());
5123                 bool locked_down =
5124                         (gr_gpc0_tpc0_sm_dbgr_status0_locked_down_v(dbgr_status0) ==
5125                          gr_gpc0_tpc0_sm_dbgr_status0_locked_down_true_v());
5126                 bool error_pending =
5127                         (gr_gpc0_tpc0_sm_hww_warp_esr_error_v(warp_esr) !=
5128                          gr_gpc0_tpc0_sm_hww_warp_esr_error_none_v()) ||
5129                         ((global_esr & ~global_esr_mask) != 0);
5130
5131                 if (locked_down || !error_pending) {
5132                         nvhost_dbg(dbg_intr | dbg_gpu_dbg, "locked down SM");
5133
5134                         /* de-assert stop trigger */
5135                         dbgr_control0 &= ~gr_gpc0_tpc0_sm_dbgr_control0_stop_trigger_enable_f();
5136                         gk20a_writel(g, gr_gpc0_tpc0_sm_dbgr_control0_r(), dbgr_control0);
5137
5138                         return 0;
5139                 }
5140
5141                 /* if an mmu fault is pending and mmu debug mode is not
5142                  * enabled, the sm will never lock down. */
5143                 if (!mmu_debug_mode_enabled && gk20a_fifo_mmu_fault_pending(g)) {
5144                         nvhost_err(dev_from_gk20a(g), "mmu fault pending, sm will"
5145                                    " never lock down!");
5146                         return -EFAULT;
5147                 }
5148
5149                 usleep_range(delay, delay * 2);
5150                 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
5151
5152         } while (time_before(jiffies, end_jiffies));
5153
5154         nvhost_err(dev_from_gk20a(g), "timed out while trying to lock down SM");
5155
5156         return -EAGAIN;
5157 }
5158
5159 bool gk20a_gr_sm_debugger_attached(struct gk20a *g)
5160 {
5161         u32 dbgr_control0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());
5162
5163         /* check if an sm debugger is attached */
5164         if (gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_v(dbgr_control0) ==
5165                         gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_on_v())
5166                 return true;
5167
5168         return false;
5169 }
5170
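/*
 * Clear the SM's hardware warning state: the given global esr bits are
 * written back to the global esr register and the warp esr is reset to
 * "no error".
 */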
5171 static void gk20a_gr_clear_sm_hww(struct gk20a *g, u32 global_esr)
5172 {
5173         gk20a_writel(g, gr_gpc0_tpc0_sm_hww_global_esr_r(), global_esr);
5174
5175         /* clear the warp hww */
5176         gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_r(),
5177                         gr_gpc0_tpc0_sm_hww_warp_esr_error_none_f());
5178 }
5179
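/* index straight into the fifo's channel array; callers are expected to pass
 * a valid hw_chid since no bounds checking is done here. */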
5180 static struct channel_gk20a *
5181 channel_from_hw_chid(struct gk20a *g, u32 hw_chid)
5182 {
5183         return g->fifo.channel + hw_chid;
5184 }
5185
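/*
 * Service an SM exception.  If an SM debugger is attached, forwarding of
 * further SM exceptions is masked at the TPC level and, when a warp error or
 * any global error outside the breakpoint/single-step set is pending, the SM
 * is locked down so the debugger can inspect warp state.  Clients waiting on
 * debugger events for the faulting channel are notified at the end.
 */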
5186 static int gk20a_gr_handle_sm_exception(struct gk20a *g,
5187                 struct gr_isr_data *isr_data)
5188 {
5189         int ret = 0;
5190         bool do_warp_sync = false;
5191         /* these three interrupts don't require locking down the SM. They can
5192          * be handled by usermode clients as they aren't fatal. Additionally,
5193          * usermode clients may wish to allow some warps to execute while others
5194          * are at breakpoints, as opposed to fatal errors where all warps should
5195          * halt. */
5196         u32 global_mask = gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f()   |
5197                           gr_gpc0_tpc0_sm_hww_global_esr_bpt_pause_pending_f() |
5198                           gr_gpc0_tpc0_sm_hww_global_esr_single_step_complete_pending_f();
5199         u32 global_esr, warp_esr;
5200         bool sm_debugger_attached = gk20a_gr_sm_debugger_attached(g);
5201         struct channel_gk20a *fault_ch;
5202
5203         nvhost_dbg(dbg_fn | dbg_gpu_dbg, "");
5204
5205         global_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r());
5206         warp_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_warp_esr_r());
5207
5208         /* if an sm debugger is attached, disable forwarding of tpc exceptions.
5209          * the debugger will reenable exceptions after servicing them. */
5210         if (sm_debugger_attached) {
5211                 u32 tpc_exception_en = gk20a_readl(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r());
5212                 tpc_exception_en &= ~gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_enabled_f();
5213                 gk20a_writel(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r(), tpc_exception_en);
5214                 nvhost_dbg(dbg_intr | dbg_gpu_dbg, "SM debugger attached");
5215         }
5216
5217         /* if a debugger is present and an error has occurred, do a warp sync */
5218         if (sm_debugger_attached && ((warp_esr != 0) || ((global_esr & ~global_mask) != 0))) {
5219                 nvhost_dbg(dbg_intr, "warp sync needed");
5220                 do_warp_sync = true;
5221         }
5222
5223         if (do_warp_sync) {
5224                 ret = gk20a_gr_lock_down_sm(g, global_mask);
5225                 if (ret) {
5226                         nvhost_err(dev_from_gk20a(g), "sm did not lock down!");
5227                         return ret;
5228                 }
5229         }
5230
5231         /* finally, signal any client waiting on an event */
5232         fault_ch = channel_from_hw_chid(g, isr_data->chid);
5233         if (fault_ch)
5234                 gk20a_dbg_gpu_post_events(fault_ch);
5235
5236         return ret;
5237 }
5238
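/*
 * Service a TPC exception.  Only a pending SM exception is handled here; it
 * is forwarded to gk20a_gr_handle_sm_exception().
 */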
5239 static int gk20a_gr_handle_tpc_exception(struct gk20a *g,
5240                 struct gr_isr_data *isr_data)
5241 {
5242         int ret = 0;
5243         u32 tpc_exception = gk20a_readl(g, gr_gpcs_tpcs_tpccs_tpc_exception_r());
5244
5245         nvhost_dbg(dbg_intr | dbg_gpu_dbg, "");
5246
5247         /* check if an sm exception is pending */
5248         if (gr_gpcs_tpcs_tpccs_tpc_exception_sm_v(tpc_exception) ==
5249                         gr_gpcs_tpcs_tpccs_tpc_exception_sm_pending_v()) {
5250                 nvhost_dbg(dbg_intr | dbg_gpu_dbg, "SM exception pending");
5251                 ret = gk20a_gr_handle_sm_exception(g, isr_data);
5252         }
5253
5254         return ret;
5255 }
5256
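/*
 * Service a GPC exception.  Only a pending exception on TPC0 is handled here;
 * it is forwarded to gk20a_gr_handle_tpc_exception().
 */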
5257 static int gk20a_gr_handle_gpc_exception(struct gk20a *g,
5258                 struct gr_isr_data *isr_data)
5259 {
5260         int ret = 0;
5261         u32 gpc_exception = gk20a_readl(g, gr_gpcs_gpccs_gpc_exception_r());
5262
5263         nvhost_dbg(dbg_intr | dbg_gpu_dbg, "");
5264
5265         /* check if tpc 0 has an exception */
5266         if (gr_gpcs_gpccs_gpc_exception_tpc_v(gpc_exception) ==
5267                         gr_gpcs_gpccs_gpc_exception_tpc_0_pending_v()) {
5268                 nvhost_dbg(dbg_intr | dbg_gpu_dbg, "TPC exception pending");
5269                 ret = gk20a_gr_handle_tpc_exception(g, isr_data);
5270         }
5271
5272         return ret;
5273 }
5274
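/*
 * Main (stalling) graphics interrupt handler.  GR gpfifo access is disabled
 * while interrupts are serviced.  The trapped method (address, data, current
 * context) is latched and the owning channel is looked up through the
 * ctx -> chid TLB; each pending interrupt is then handled and acknowledged in
 * gr_intr_r() individually.  If any handler flags need_reset, the GR engine
 * is recovered via gk20a_fifo_recover().  Anything still set in gr_intr at
 * the end is reported as an unhandled interrupt.
 */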
5275 int gk20a_gr_isr(struct gk20a *g)
5276 {
5277         struct gr_isr_data isr_data;
5278         u32 grfifo_ctl;
5279         u32 obj_table;
5280         int need_reset = 0;
5281         u32 gr_intr = gk20a_readl(g, gr_intr_r());
5282
5283         nvhost_dbg_fn("");
5284         nvhost_dbg(dbg_intr, "pgraph intr %08x", gr_intr);
5285
5286         if (!gr_intr)
5287                 return 0;
5288
5289         grfifo_ctl = gk20a_readl(g, gr_gpfifo_ctl_r());
5290         grfifo_ctl &= ~gr_gpfifo_ctl_semaphore_access_f(1);
5291         grfifo_ctl &= ~gr_gpfifo_ctl_access_f(1);
5292
5293         gk20a_writel(g, gr_gpfifo_ctl_r(),
5294                 grfifo_ctl | gr_gpfifo_ctl_access_f(0) |
5295                 gr_gpfifo_ctl_semaphore_access_f(0));
5296
5297         isr_data.addr = gk20a_readl(g, gr_trapped_addr_r());
5298         isr_data.data_lo = gk20a_readl(g, gr_trapped_data_lo_r());
5299         isr_data.data_hi = gk20a_readl(g, gr_trapped_data_hi_r());
5300         isr_data.curr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r());
5301         isr_data.offset = gr_trapped_addr_mthd_v(isr_data.addr);
5302         isr_data.sub_chan = gr_trapped_addr_subch_v(isr_data.addr);
5303         obj_table = gk20a_readl(g,
5304                 gr_fe_object_table_r(isr_data.sub_chan));
5305         isr_data.class_num = gr_fe_object_table_nvclass_v(obj_table);
5306
5307         isr_data.chid =
5308                 gk20a_gr_get_chid_from_ctx(g, isr_data.curr_ctx);
5309         if (isr_data.chid == -1) {
5310                 nvhost_err(dev_from_gk20a(g), "invalid channel ctx 0x%08x",
5311                            isr_data.curr_ctx);
5312                 goto clean_up;
5313         }
5314
5315         nvhost_dbg(dbg_intr | dbg_gpu_dbg,
5316                 "channel %d: addr 0x%08x, "
5317                 "data 0x%08x 0x%08x,"
5318                 "ctx 0x%08x, offset 0x%08x, "
5319                 "subchannel 0x%08x, class 0x%08x",
5320                 isr_data.chid, isr_data.addr,
5321                 isr_data.data_hi, isr_data.data_lo,
5322                 isr_data.curr_ctx, isr_data.offset,
5323                 isr_data.sub_chan, isr_data.class_num);
5324
5325         if (gr_intr & gr_intr_notify_pending_f()) {
5326                 gk20a_gr_handle_notify_pending(g, &isr_data);
5327                 gk20a_writel(g, gr_intr_r(),
5328                         gr_intr_notify_reset_f());
5329                 gr_intr &= ~gr_intr_notify_pending_f();
5330         }
5331
5332         if (gr_intr & gr_intr_semaphore_pending_f()) {
5333                 gk20a_gr_handle_semaphore_pending(g, &isr_data);
5334                 gk20a_writel(g, gr_intr_r(),
5335                         gr_intr_semaphore_reset_f());
5336                 gr_intr &= ~gr_intr_semaphore_pending_f();
5337         }
5338
5339         if (gr_intr & gr_intr_semaphore_timeout_pending_f()) {
5340                 need_reset |= gk20a_gr_handle_semaphore_timeout_pending(g,
5341                         &isr_data);
5342                 gk20a_writel(g, gr_intr_r(),
5343                         gr_intr_semaphore_reset_f());
5344                 gr_intr &= ~gr_intr_semaphore_timeout_pending_f();
5345         }
5346
5347         if (gr_intr & gr_intr_illegal_notify_pending_f()) {
5348                 need_reset |= gk20a_gr_intr_illegal_notify_pending(g,
5349                         &isr_data);
5350                 gk20a_writel(g, gr_intr_r(),
5351                         gr_intr_illegal_notify_reset_f());
5352                 gr_intr &= ~gr_intr_illegal_notify_pending_f();
5353         }
5354
5355         if (gr_intr & gr_intr_illegal_method_pending_f()) {
5356                 need_reset |= gk20a_gr_handle_illegal_method(g, &isr_data);
5357                 gk20a_writel(g, gr_intr_r(),
5358                         gr_intr_illegal_method_reset_f());
5359                 gr_intr &= ~gr_intr_illegal_method_pending_f();
5360         }
5361
5362         if (gr_intr & gr_intr_illegal_class_pending_f()) {
5363                 need_reset |= gk20a_gr_handle_illegal_class(g, &isr_data);
5364                 gk20a_writel(g, gr_intr_r(),
5365                         gr_intr_illegal_class_reset_f());
5366                 gr_intr &= ~gr_intr_illegal_class_pending_f();
5367         }
5368
5369         if (gr_intr & gr_intr_class_error_pending_f()) {
5370                 need_reset |= gk20a_gr_handle_class_error(g, &isr_data);
5371                 gk20a_writel(g, gr_intr_r(),
5372                         gr_intr_class_error_reset_f());
5373                 gr_intr &= ~gr_intr_class_error_pending_f();
5374         }
5375
5376         /* this one happens if someone tries to hit a non-whitelisted
5377          * register using set_falcon[4] */
5378         if (gr_intr & gr_intr_firmware_method_pending_f()) {
5379                 need_reset |= true;
5380                 nvhost_dbg(dbg_intr | dbg_gpu_dbg, "firmware method intr pending\n");
5381                 gk20a_writel(g, gr_intr_r(),
5382                         gr_intr_firmware_method_reset_f());
5383                 gr_intr &= ~gr_intr_firmware_method_pending_f();
5384         }
5385
5386         if (gr_intr & gr_intr_exception_pending_f()) {
5387                 u32 exception = gk20a_readl(g, gr_exception_r());
5388                 struct fifo_gk20a *f = &g->fifo;
5389                 struct channel_gk20a *ch = &f->channel[isr_data.chid];
5390
5391                 nvhost_dbg(dbg_intr | dbg_gpu_dbg, "exception %08x\n", exception);
5392
5393                 if (exception & gr_exception_fe_m()) {
5394                         u32 fe = gk20a_readl(g, gr_fe_hww_esr_r());
5395                         nvhost_dbg(dbg_intr, "fe warning %08x\n", fe);
5396                         gk20a_writel(g, gr_fe_hww_esr_r(), fe);
5397                 }
5398
5399                 /* check if a gpc exception has occurred */
5400                 if (exception & gr_exception_gpc_m() && need_reset == 0) {
5401                         u32 exception1 = gk20a_readl(g, gr_exception1_r());
5402                         u32 global_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r());
5403
5404                         nvhost_dbg(dbg_intr | dbg_gpu_dbg, "GPC exception pending");
5405
5406                         /* if no sm debugger is present, clean up the channel */
5407                         if (!gk20a_gr_sm_debugger_attached(g)) {
5408                                 nvhost_dbg(dbg_intr | dbg_gpu_dbg,
5409                                            "SM debugger not attached, clearing interrupt");
5410                                 need_reset |= true;
5411                         }
5412                         else {
5413                                 /* check if gpc 0 has an exception */
5414                                 if (exception1 & gr_exception1_gpc_0_pending_f())
5415                                         need_reset |= gk20a_gr_handle_gpc_exception(g, &isr_data);
5416                                 /* clear the hwws, also causes tpc and gpc
5417                                  * exceptions to be cleared */
5418                                 gk20a_gr_clear_sm_hww(g, global_esr);
5419                         }
5420
5421                         if (need_reset)
5422                                 gk20a_set_error_notifier(ch,
5423                                         NVHOST_CHANNEL_GR_ERROR_SW_NOTIFY);
5424                 }
5425
5426                 gk20a_writel(g, gr_intr_r(), gr_intr_exception_reset_f());
5427                 gr_intr &= ~gr_intr_exception_pending_f();
5428         }
5429
5430         if (need_reset)
5431                 gk20a_fifo_recover(g, BIT(ENGINE_GR_GK20A), true);
5432
5433 clean_up:
5434         gk20a_writel(g, gr_gpfifo_ctl_r(),
5435                 grfifo_ctl | gr_gpfifo_ctl_access_f(1) |
5436                 gr_gpfifo_ctl_semaphore_access_f(1));
5437
5438         if (gr_intr)
5439                 nvhost_err(dev_from_gk20a(g),
5440                            "unhandled gr interrupt 0x%08x", gr_intr);
5441
5442         return 0;
5443 }
5444
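/*
 * Non-stalling graphics interrupt handler.  A pending trap interrupt is
 * handled by waking up channel semaphore waiters and acknowledging the
 * interrupt.
 */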
5445 int gk20a_gr_nonstall_isr(struct gk20a *g)
5446 {
5447         u32 gr_intr = gk20a_readl(g, gr_intr_nonstall_r());
5448         u32 clear_intr = 0;
5449
5450         nvhost_dbg(dbg_intr, "pgraph nonstall intr %08x", gr_intr);
5451
5452         if (gr_intr & gr_intr_nonstall_trap_pending_f()) {
5453                 gk20a_channel_semaphore_wakeup(g);
5454                 clear_intr |= gr_intr_nonstall_trap_pending_f();
5455         }
5456
5457         gk20a_writel(g, gr_intr_nonstall_r(), clear_intr);
5458
5459         return 0;
5460 }
5461
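/*
 * Ask the FECS ucode for the size of the power-management register list
 * image.  A plausible calling sequence for the reglist methods below is
 * sketched here purely as an illustration (the real caller lives outside
 * this file; inst_block_phys_addr and reglist_pmu_va are placeholders):
 *
 *         u32 size;
 *
 *         err = gr_gk20a_fecs_get_reglist_img_size(g, &size);
 *         ... allocate a buffer of 'size' bytes and map it for the PMU ...
 *         err = gr_gk20a_fecs_set_reglist_bind_inst(g, inst_block_phys_addr);
 *         err = gr_gk20a_fecs_set_reglist_virual_addr(g, reglist_pmu_va);
 */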
5462 int gr_gk20a_fecs_get_reglist_img_size(struct gk20a *g, u32 *size)
5463 {
5464         BUG_ON(size == NULL);
5465         return gr_gk20a_submit_fecs_method_op(g,
5466                    (struct fecs_method_op_gk20a) {
5467                            .mailbox.id = 0,
5468                            .mailbox.data = 0,
5469                            .mailbox.clr = ~0,
5470                            .method.data = 1,
5471                            .method.addr = gr_fecs_method_push_adr_discover_reglist_image_size_v(),
5472                            .mailbox.ret = size,
5473                            .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL,
5474                            .mailbox.ok = 0,
5475                            .cond.fail = GR_IS_UCODE_OP_SKIP,
5476                            .mailbox.fail = 0});
5477 }
5478
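/*
 * Hand FECS the instance block to use for the register list: the 4KB-aligned
 * physical address is written into the mailbox with the "valid" and
 * "vid_mem target" fields set.
 */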
5479 int gr_gk20a_fecs_set_reglist_bind_inst(struct gk20a *g, phys_addr_t addr)
5480 {
5481         return gr_gk20a_submit_fecs_method_op(g,
5482                    (struct fecs_method_op_gk20a){
5483                            .mailbox.id = 4,
5484                            .mailbox.data = (gr_fecs_current_ctx_ptr_f(addr >> 12) |
5485                                             gr_fecs_current_ctx_valid_f(1) |
5486                                             gr_fecs_current_ctx_target_vid_mem_f()),
5487                            .mailbox.clr = ~0,
5488                            .method.data = 1,
5489                            .method.addr = gr_fecs_method_push_adr_set_reglist_bind_instance_v(),
5490                            .mailbox.ret = NULL,
5491                            .cond.ok = GR_IS_UCODE_OP_EQUAL,
5492                            .mailbox.ok = 1,
5493                            .cond.fail = GR_IS_UCODE_OP_SKIP,
5494                            .mailbox.fail = 0});
5495 }
5496
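/*
 * Hand FECS the PMU virtual address of the register list; the address is
 * passed through the mailbox in units of 256 bytes (pmu_va >> 8).
 */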
5497 int gr_gk20a_fecs_set_reglist_virual_addr(struct gk20a *g, u64 pmu_va)
5498 {
5499         return gr_gk20a_submit_fecs_method_op(g,
5500                    (struct fecs_method_op_gk20a) {
5501                            .mailbox.id = 4,
5502                            .mailbox.data = u64_lo32(pmu_va >> 8),
5503                            .mailbox.clr = ~0,
5504                            .method.data = 1,
5505                            .method.addr = gr_fecs_method_push_adr_set_reglist_virtual_address_v(),
5506                            .mailbox.ret = NULL,
5507                            .cond.ok = GR_IS_UCODE_OP_EQUAL,
5508                            .mailbox.ok = 1,
5509                            .cond.fail = GR_IS_UCODE_OP_SKIP,
5510                            .mailbox.fail = 0});
5511 }
5512
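/*
 * Quiesce the graphics engine for suspend: wait for gr to go idle, disable
 * gpfifo access, mask all gr interrupts and exceptions, and flush the
 * ctx -> chid TLB so stale translations are not reused after resume.
 */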
5513 int gk20a_gr_suspend(struct gk20a *g)
5514 {
5515         unsigned long end_jiffies = jiffies +
5516                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
5517         int ret = 0;
5518
5519         nvhost_dbg_fn("");
5520
5521         ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
5522         if (ret)
5523                 return ret;
5524
5525         gk20a_writel(g, gr_gpfifo_ctl_r(),
5526                 gr_gpfifo_ctl_access_disabled_f());
5527
5528         /* disable gr intr */
5529         gk20a_writel(g, gr_intr_r(), 0);
5530         gk20a_writel(g, gr_intr_en_r(), 0);
5531
5532         /* disable all exceptions */
5533         gk20a_writel(g, gr_exception_r(), 0);
5534         gk20a_writel(g, gr_exception_en_r(), 0);
5535         gk20a_writel(g, gr_exception1_r(), 0);
5536         gk20a_writel(g, gr_exception1_en_r(), 0);
5537         gk20a_writel(g, gr_exception2_r(), 0);
5538         gk20a_writel(g, gr_exception2_en_r(), 0);
5539
5540         gk20a_gr_flush_channel_tlb(&g->gr);
5541
5542         nvhost_dbg_fn("done");
5543         return ret;
5544 }
5545
5546 static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
5547                                                u32 addr,
5548                                                bool is_quad, u32 quad,
5549                                                u32 *context_buffer,
5550                                                u32 context_buffer_size,
5551                                                u32 *priv_offset);
5552
5553 /* This function will decode a priv address and return the partition type and numbers. */
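/* For example, a TPC register in the gpcs/tpcs broadcast aperture decodes to
 * CTXSW_ADDR_TYPE_TPC with both PRI_BROADCAST_FLAGS_GPC and
 * PRI_BROADCAST_FLAGS_TPC set (gpc_num/tpc_num stay 0), while a unicast GPC
 * register decodes to CTXSW_ADDR_TYPE_GPC with no broadcast flags and
 * gpc_num filled in. */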
5554 int gr_gk20a_decode_priv_addr(struct gk20a *g, u32 addr,
5555                               int  *addr_type, /* enum ctxsw_addr_type */
5556                               u32 *gpc_num, u32 *tpc_num, u32 *ppc_num, u32 *be_num,
5557                               u32 *broadcast_flags)
5558 {
5559         u32 gpc_addr;
5560         u32 ppc_address;
5561         u32 ppc_broadcast_addr;
5562
5563         nvhost_dbg(dbg_fn | dbg_gpu_dbg, "addr=0x%x", addr);
5564
5565         /* setup defaults */
5566         ppc_address = 0;
5567         ppc_broadcast_addr = 0;
5568         *addr_type = CTXSW_ADDR_TYPE_SYS;
5569         *broadcast_flags = PRI_BROADCAST_FLAGS_NONE;
5570         *gpc_num = 0;
5571         *tpc_num = 0;
5572         *ppc_num = 0;
5573         *be_num  = 0;
5574
5575         if (pri_is_gpc_addr(addr)) {
5576                 *addr_type = CTXSW_ADDR_TYPE_GPC;
5577                 gpc_addr = pri_gpccs_addr_mask(addr);
5578                 if (pri_is_gpc_addr_shared(addr)) {
5579                         *addr_type = CTXSW_ADDR_TYPE_GPC;
5580                         *broadcast_flags |= PRI_BROADCAST_FLAGS_GPC;
5581                 } else
5582                         *gpc_num = pri_get_gpc_num(addr);
5583
5584                 if (pri_is_tpc_addr(gpc_addr)) {
5585                         *addr_type = CTXSW_ADDR_TYPE_TPC;
5586                         if (pri_is_tpc_addr_shared(gpc_addr)) {
5587                                 *broadcast_flags |= PRI_BROADCAST_FLAGS_TPC;
5588                                 return 0;
5589                         }
5590                         *tpc_num = pri_get_tpc_num(gpc_addr);
5591                 }
5592                 return 0;
5593         } else if (pri_is_be_addr(addr)) {
5594                 *addr_type = CTXSW_ADDR_TYPE_BE;
5595                 if (pri_is_be_addr_shared(addr)) {
5596                         *broadcast_flags |= PRI_BROADCAST_FLAGS_BE;
5597                         return 0;
5598                 }
5599                 *be_num = pri_get_be_num(addr);
5600                 return 0;
5601         } else {
5602                 *addr_type = CTXSW_ADDR_TYPE_SYS;
5603                 return 0;
5604         }
5605         /* PPC addresses are not decoded here */
5606
5607         /*NOTREACHED*/
5608         return -EINVAL;
5609 }
5610
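/*
 * Expand a PPC broadcast address into one unicast entry per PPC in the given
 * GPC, appending the results to priv_addr_table at *t.
 */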
5611 static int gr_gk20a_split_ppc_broadcast_addr(struct gk20a *g, u32 addr,
5612                                       u32 gpc_num,
5613                                       u32 *priv_addr_table, u32 *t)
5614 {
5615         u32 ppc_num;
5616
5617         nvhost_dbg(dbg_fn | dbg_gpu_dbg, "addr=0x%x", addr);
5618
5619         for (ppc_num = 0; ppc_num < g->gr.pe_count_per_gpc; ppc_num++)
5620                 priv_addr_table[(*t)++] = pri_ppc_addr(pri_ppccs_addr_mask(addr),
5621                                                        gpc_num, ppc_num);
5622
5623         return 0;
5624 }
5625
5626 /*
5627  * The context buffer is indexed using BE broadcast addresses and GPC/TPC
5628  * unicast addresses. This function will convert a BE unicast address to a BE
5629  * broadcast address and split a GPC/TPC broadcast address into a table of
5630  * GPC/TPC addresses.  The addresses generated by this function can be
5631  * successfully processed by gr_gk20a_find_priv_offset_in_buffer
5632  */
5633 static int gr_gk20a_create_priv_addr_table(struct gk20a *g,
5634                                            u32 addr,
5635                                            u32 *priv_addr_table,
5636                                            u32 *num_registers)
5637 {
5638         int addr_type; /*enum ctxsw_addr_type */
5639         u32 gpc_num, tpc_num, ppc_num, be_num;
5640         u32 broadcast_flags;
5641         u32 t;
5642         int err;
5643
5644         t = 0;
5645         *num_registers = 0;
5646
5647         nvhost_dbg(dbg_fn | dbg_gpu_dbg, "addr=0x%x", addr);
5648
5649         err = gr_gk20a_decode_priv_addr(g, addr, &addr_type,
5650                                         &gpc_num, &tpc_num, &ppc_num, &be_num,
5651                                         &broadcast_flags);
5652         nvhost_dbg(dbg_gpu_dbg, "addr_type = %d", addr_type);
5653         if (err)
5654                 return err;
5655
5656         if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
5657             (addr_type == CTXSW_ADDR_TYPE_BE)) {
5658                 /* The BE broadcast registers are included in the compressed PRI
5659                  * table. Convert a BE unicast address to a broadcast address
5660                  * so that we can look up the offset. */
5661                 if ((addr_type == CTXSW_ADDR_TYPE_BE) &&
5662                     !(broadcast_flags & PRI_BROADCAST_FLAGS_BE))
5663                         priv_addr_table[t++] = pri_be_shared_addr(addr);
5664                 else
5665                         priv_addr_table[t++] = addr;
5666
5667                 *num_registers = t;
5668                 return 0;
5669         }
5670
5671         /* The GPC/TPC unicast registers are included in the compressed PRI
5672          * tables. Convert a GPC/TPC broadcast address to unicast addresses so
5673          * that we can look up the offsets. */
5674         if (broadcast_flags & PRI_BROADCAST_FLAGS_GPC) {
5675                 for (gpc_num = 0; gpc_num < g->gr.gpc_count; gpc_num++) {
5676
5677                         if (broadcast_flags & PRI_BROADCAST_FLAGS_TPC)
5678                                 for (tpc_num = 0;
5679                                      tpc_num < g->gr.gpc_tpc_count[gpc_num];
5680                                      tpc_num++)
5681                                         priv_addr_table[t++] =
5682                                                 pri_tpc_addr(pri_tpccs_addr_mask(addr),
5683                                                              gpc_num, tpc_num);
5684
5685                         else if (broadcast_flags & PRI_BROADCAST_FLAGS_PPC) {
5686                                 err = gr_gk20a_split_ppc_broadcast_addr(g, addr, gpc_num,
5687                                                                priv_addr_table, &t);
5688                                 if (err)
5689                                         return err;
5690                         } else
5691                                 priv_addr_table[t++] =
5692                                         pri_gpc_addr(pri_gpccs_addr_mask(addr),
5693                                                      gpc_num);
5694                 }
5695         } else {
5696                 if (broadcast_flags & PRI_BROADCAST_FLAGS_TPC)
5697                         for (tpc_num = 0;
5698                              tpc_num < g->gr.gpc_tpc_count[gpc_num];
5699                              tpc_num++)
5700                                 priv_addr_table[t++] =
5701                                         pri_tpc_addr(pri_tpccs_addr_mask(addr),
5702                                                      gpc_num, tpc_num);
5703                 else if (broadcast_flags & PRI_BROADCAST_FLAGS_PPC)
5704                         err = gr_gk20a_split_ppc_broadcast_addr(g, addr, gpc_num,
5705                                                        priv_addr_table, &t);
5706                 else
5707                         priv_addr_table[t++] = addr;
5708         }
5709
5710         *num_registers = t;
5711         return 0;
5712 }
5713