1 /*
2  * drivers/video/tegra/host/gk20a/gr_gk20a.c
3  *
4  * GK20A Graphics
5  *
6  * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
7  *
8  * This program is free software; you can redistribute it and/or modify it
9  * under the terms and conditions of the GNU General Public License,
10  * version 2, as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope it will be useful, but WITHOUT
13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
15  * more details.
16  *
17  * You should have received a copy of the GNU General Public License along with
18  * this program; if not, write to the Free Software Foundation, Inc.,
19  * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
20  */
21
22 #include <linux/delay.h>        /* for udelay */
23 #include <linux/mm.h>           /* for totalram_pages */
24 #include <linux/scatterlist.h>
25 #include <linux/nvmap.h>
26 #include <linux/tegra-soc.h>
27 #include <linux/nvhost_dbg_gpu_ioctl.h>
28
29 #include "../dev.h"
30
31 #include "gk20a.h"
32 #include "gr_ctx_gk20a.h"
33
34 #include "hw_ccsr_gk20a.h"
35 #include "hw_ctxsw_prog_gk20a.h"
36 #include "hw_fifo_gk20a.h"
37 #include "hw_gr_gk20a.h"
38 #include "hw_mc_gk20a.h"
39 #include "hw_ram_gk20a.h"
40 #include "hw_pri_ringmaster_gk20a.h"
41 #include "hw_pri_ringstation_sys_gk20a.h"
42 #include "hw_pri_ringstation_gpc_gk20a.h"
43 #include "hw_pri_ringstation_fbp_gk20a.h"
44 #include "hw_proj_gk20a.h"
45 #include "hw_top_gk20a.h"
46 #include "hw_ltc_gk20a.h"
47 #include "hw_fb_gk20a.h"
48 #include "hw_therm_gk20a.h"
49 #include "hw_pbdma_gk20a.h"
50 #include "chip_support.h"
51 #include "nvhost_memmgr.h"
52 #include "gk20a_gating_reglist.h"
53 #include "gr_pri_gk20a.h"
54 #include "regops_gk20a.h"
55 #include "dbg_gpu_gk20a.h"
56
57 #define BLK_SIZE (256)
58
59 struct gk20a_ctxsw_bootloader_desc g_fecs_bootloader_desc = {
60         /* .bootLoaderStartOffset  = */ 0x0,
61         /* .bootLoaderSize         = */ 0x85,
62         /* .bootLoaderImemOffset   = */ 0x4f00,
63         /* .bootLoaderEntryPoint   = */ 0x4f00,
64 };
65
66 u32 g_fecs_bootloader_image[] = {
67 /* 0x0000 */  0x001000d0, 0x0004fe00, 0x107ea4bd, 0x02f8004f, 0x00000089,
68               0x12f99dbf, 0x98089a98, 0xdf940991,
69 /* 0x0020 */  0x08de940c, 0xfd049098, 0x9b9805ef, 0x05edfd06, 0x98059c98,
70               0x9f98079d, 0x00ebfe03, 0x00000089,
71 /* 0x0040 */  0xfe019998, 0x94bd0096, 0x004f543e, 0xb80499fa, 0x00010099,
72               0x08f49fa6, 0xfe07f8f6, 0xc7fe00d6,
73 /* 0x0060 */  0x3ef4bd00, 0x8e004f76, 0xbc060000, 0xf9fa90fe, 0x00ffb805,
74               0xfba60001, 0xf8ef08f4, 0xf91bb203,
75 /* 0x0080 */  0xfba4bd05, 0x00000011, 0x00000000, 0x00000000, 0x00000000,
76               0x00000000, 0x00000000, 0x00000000,
77 /* 0x00a0 */  0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
78               0x00000000, 0x00000000, 0x00000000,
79 /* 0x00c0 */  0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
80               0x00000000, 0x00000000, 0x00000000,
81 /* 0x00e0 */  0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
82               0x00000000, 0x00000000, 0x00000000,
83 };
84
85 struct gk20a_ctxsw_bootloader_desc g_gpccs_bootloader_desc = {
86         /* .bootLoaderStartOffset  = */ 0x0,
87         /* .bootLoaderSize         = */ 0x85,
88         /* .bootLoaderImemOffset   = */ 0x2700,
89         /* .bootLoaderEntryPoint   = */ 0x2700,
90 };
91
92 u32 g_gpccs_bootloader_image[] = {
93 /* 0x0000 */  0x000800d0, 0x0004fe00, 0x107ea4bd, 0x02f80027, 0x00000089,
94               0x12f99dbf, 0x98089a98, 0xdf940991,
95 /* 0x0020 */  0x08de940c, 0xfd049098, 0x9b9805ef, 0x05edfd06, 0x98059c98,
96               0x9f98079d, 0x00ebfe03, 0x00000089,
97 /* 0x0040 */  0xfe019998, 0x94bd0096, 0x0027543e, 0xb80499fa, 0x00010099,
98               0x08f49fa6, 0xfe07f8f6, 0xc7fe00d6,
99 /* 0x0060 */  0x3ef4bd00, 0x8e002776, 0xbc060000, 0xf9fa90fe, 0x00ffb805,
100               0xfba60001, 0xf8ef08f4, 0xf91bb203,
101 /* 0x0080 */  0xfba4bd05, 0x00000011, 0x00000000, 0x00000000, 0x00000000,
102               0x00000000, 0x00000000, 0x00000000,
103 /* 0x00a0 */  0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
104               0x00000000, 0x00000000, 0x00000000,
105 /* 0x00c0 */  0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
106               0x00000000, 0x00000000, 0x00000000,
107 /* 0x00e0 */  0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
108               0x00000000, 0x00000000, 0x00000000,
109 };
110
111 static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va);
112 static int gr_gk20a_ctx_patch_write(struct gk20a *g, struct channel_ctx_gk20a *ch_ctx,
113                                     u32 addr, u32 data, bool patch);
114
115 /* global ctx buffer */
116 static int  gr_gk20a_alloc_global_ctx_buffers(struct gk20a *g);
117 static void gr_gk20a_free_global_ctx_buffers(struct gk20a *g);
118 static int  gr_gk20a_map_global_ctx_buffers(struct gk20a *g,
119                                             struct channel_gk20a *c);
120 static void gr_gk20a_unmap_global_ctx_buffers(struct channel_gk20a *c);
121
122 /* channel gr ctx buffer */
123 static int  gr_gk20a_alloc_channel_gr_ctx(struct gk20a *g,
124                                         struct channel_gk20a *c);
125 static void gr_gk20a_free_channel_gr_ctx(struct channel_gk20a *c);
126
127 /* channel patch ctx buffer */
128 static int  gr_gk20a_alloc_channel_patch_ctx(struct gk20a *g,
129                                         struct channel_gk20a *c);
130 static void gr_gk20a_free_channel_patch_ctx(struct channel_gk20a *c);
131
132 /* golden ctx image */
133 static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
134                                           struct channel_gk20a *c);
135 static int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
136                                           struct channel_gk20a *c);
137
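/*
 * Dump FECS falcon state for debugging: the common falcon registers, all
 * ctxsw mailboxes, and a few internal registers (IMB, DMB, CSW, CTX, EXCI)
 * read back through the falcon ICD interface.  PC and SP are sampled four
 * times to give a rough idea of where the ucode is executing.
 */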
138 void gk20a_fecs_dump_falcon_stats(struct gk20a *g)
139 {
140         int i;
141
142         nvhost_err(dev_from_gk20a(g), "gr_fecs_os_r : %d",
143                 gk20a_readl(g, gr_fecs_os_r()));
144         nvhost_err(dev_from_gk20a(g), "gr_fecs_cpuctl_r : 0x%x",
145                 gk20a_readl(g, gr_fecs_cpuctl_r()));
146         nvhost_err(dev_from_gk20a(g), "gr_fecs_idlestate_r : 0x%x",
147                 gk20a_readl(g, gr_fecs_idlestate_r()));
148         nvhost_err(dev_from_gk20a(g), "gr_fecs_mailbox0_r : 0x%x",
149                 gk20a_readl(g, gr_fecs_mailbox0_r()));
150         nvhost_err(dev_from_gk20a(g), "gr_fecs_mailbox1_r : 0x%x",
151                 gk20a_readl(g, gr_fecs_mailbox1_r()));
152         nvhost_err(dev_from_gk20a(g), "gr_fecs_irqstat_r : 0x%x",
153                 gk20a_readl(g, gr_fecs_irqstat_r()));
154         nvhost_err(dev_from_gk20a(g), "gr_fecs_irqmode_r : 0x%x",
155                 gk20a_readl(g, gr_fecs_irqmode_r()));
156         nvhost_err(dev_from_gk20a(g), "gr_fecs_irqmask_r : 0x%x",
157                 gk20a_readl(g, gr_fecs_irqmask_r()));
158         nvhost_err(dev_from_gk20a(g), "gr_fecs_irqdest_r : 0x%x",
159                 gk20a_readl(g, gr_fecs_irqdest_r()));
160         nvhost_err(dev_from_gk20a(g), "gr_fecs_debug1_r : 0x%x",
161                 gk20a_readl(g, gr_fecs_debug1_r()));
162         nvhost_err(dev_from_gk20a(g), "gr_fecs_debuginfo_r : 0x%x",
163                 gk20a_readl(g, gr_fecs_debuginfo_r()));
164
165         for (i = 0; i < gr_fecs_ctxsw_mailbox__size_1_v(); i++)
166                 nvhost_err(dev_from_gk20a(g), "gr_fecs_ctxsw_mailbox_r(%d) : 0x%x",
167                         i, gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(i)));
168
169         nvhost_err(dev_from_gk20a(g), "gr_fecs_engctl_r : 0x%x",
170                 gk20a_readl(g, gr_fecs_engctl_r()));
171         nvhost_err(dev_from_gk20a(g), "gr_fecs_curctx_r : 0x%x",
172                 gk20a_readl(g, gr_fecs_curctx_r()));
173         nvhost_err(dev_from_gk20a(g), "gr_fecs_nxtctx_r : 0x%x",
174                 gk20a_readl(g, gr_fecs_nxtctx_r()));
175
176         gk20a_writel(g, gr_fecs_icd_cmd_r(),
177                 gr_fecs_icd_cmd_opc_rreg_f() |
178                 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_IMB));
179         nvhost_err(dev_from_gk20a(g), "FECS_FALCON_REG_IMB : 0x%x",
180                 gk20a_readl(g, gr_fecs_icd_rdata_r()));
181
182         gk20a_writel(g, gr_fecs_icd_cmd_r(),
183                 gr_fecs_icd_cmd_opc_rreg_f() |
184                 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_DMB));
185         nvhost_err(dev_from_gk20a(g), "FECS_FALCON_REG_DMB : 0x%x",
186                 gk20a_readl(g, gr_fecs_icd_rdata_r()));
187
188         gk20a_writel(g, gr_fecs_icd_cmd_r(),
189                 gr_fecs_icd_cmd_opc_rreg_f() |
190                 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_CSW));
191         nvhost_err(dev_from_gk20a(g), "FECS_FALCON_REG_CSW : 0x%x",
192                 gk20a_readl(g, gr_fecs_icd_rdata_r()));
193
194         gk20a_writel(g, gr_fecs_icd_cmd_r(),
195                 gr_fecs_icd_cmd_opc_rreg_f() |
196                 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_CTX));
197         nvhost_err(dev_from_gk20a(g), "FECS_FALCON_REG_CTX : 0x%x",
198                 gk20a_readl(g, gr_fecs_icd_rdata_r()));
199
200         gk20a_writel(g, gr_fecs_icd_cmd_r(),
201                 gr_fecs_icd_cmd_opc_rreg_f() |
202                 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_EXCI));
203         nvhost_err(dev_from_gk20a(g), "FECS_FALCON_REG_EXCI : 0x%x",
204                 gk20a_readl(g, gr_fecs_icd_rdata_r()));
205
206         for (i = 0; i < 4; i++) {
207                 gk20a_writel(g, gr_fecs_icd_cmd_r(),
208                         gr_fecs_icd_cmd_opc_rreg_f() |
209                         gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_PC));
210                 nvhost_err(dev_from_gk20a(g), "FECS_FALCON_REG_PC : 0x%x",
211                         gk20a_readl(g, gr_fecs_icd_rdata_r()));
212
213                 gk20a_writel(g, gr_fecs_icd_cmd_r(),
214                         gr_fecs_icd_cmd_opc_rreg_f() |
215                         gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_SP));
216                 nvhost_err(dev_from_gk20a(g), "FECS_FALCON_REG_SP : 0x%x",
217                         gk20a_readl(g, gr_fecs_icd_rdata_r()));
218         }
219 }
220
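/*
 * Copy the GPCCS and FECS ucode data segments into falcon DMEM using the
 * auto-incrementing DMEM access registers.  A checksum is accumulated but
 * not checked here.
 */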
221 static void gr_gk20a_load_falcon_dmem(struct gk20a *g)
222 {
223         u32 i, ucode_u32_size;
224         const u32 *ucode_u32_data;
225         u32 checksum;
226
227         nvhost_dbg_fn("");
228
229         gk20a_writel(g, gr_gpccs_dmemc_r(0), (gr_gpccs_dmemc_offs_f(0) |
230                                               gr_gpccs_dmemc_blk_f(0)  |
231                                               gr_gpccs_dmemc_aincw_f(1)));
232
233         ucode_u32_size = g->gr.ctx_vars.ucode.gpccs.data.count;
234         ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.gpccs.data.l;
235
236         for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
237                 gk20a_writel(g, gr_gpccs_dmemd_r(0), ucode_u32_data[i]);
238                 checksum += ucode_u32_data[i];
239         }
240
241         gk20a_writel(g, gr_fecs_dmemc_r(0), (gr_fecs_dmemc_offs_f(0) |
242                                              gr_fecs_dmemc_blk_f(0)  |
243                                              gr_fecs_dmemc_aincw_f(1)));
244
245         ucode_u32_size = g->gr.ctx_vars.ucode.fecs.data.count;
246         ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.fecs.data.l;
247
248         for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
249                 gk20a_writel(g, gr_fecs_dmemd_r(0), ucode_u32_data[i]);
250                 checksum += ucode_u32_data[i];
251         }
252         nvhost_dbg_fn("done");
253 }
254
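/*
 * Copy the GPCCS and FECS ucode instruction segments into falcon IMEM.
 * A new IMEM tag is written every 256 bytes, and the remainder of the
 * final 256-byte block is zero-padded.
 */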
255 static void gr_gk20a_load_falcon_imem(struct gk20a *g)
256 {
257         u32 cfg, fecs_imem_size, gpccs_imem_size, ucode_u32_size;
258         const u32 *ucode_u32_data;
259         u32 tag, i, pad_start, pad_end;
260         u32 checksum;
261
262         nvhost_dbg_fn("");
263
264         cfg = gk20a_readl(g, gr_fecs_cfg_r());
265         fecs_imem_size = gr_fecs_cfg_imem_sz_v(cfg);
266
267         cfg = gk20a_readl(g, gr_gpc0_cfg_r());
268         gpccs_imem_size = gr_gpc0_cfg_imem_sz_v(cfg);
269
270         /* Use the broadcast address to access all of the GPCCS units. */
271         gk20a_writel(g, gr_gpccs_imemc_r(0), (gr_gpccs_imemc_offs_f(0) |
272                                               gr_gpccs_imemc_blk_f(0) |
273                                               gr_gpccs_imemc_aincw_f(1)));
274
275         /* Set up the tags for the instruction memory. */
276         tag = 0;
277         gk20a_writel(g, gr_gpccs_imemt_r(0), gr_gpccs_imemt_tag_f(tag));
278
279         ucode_u32_size = g->gr.ctx_vars.ucode.gpccs.inst.count;
280         ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.gpccs.inst.l;
281
282         for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
283                 if (i && ((i % (256/sizeof(u32))) == 0)) {
284                         tag++;
285                         gk20a_writel(g, gr_gpccs_imemt_r(0),
286                                       gr_gpccs_imemt_tag_f(tag));
287                 }
288                 gk20a_writel(g, gr_gpccs_imemd_r(0), ucode_u32_data[i]);
289                 checksum += ucode_u32_data[i];
290         }
291
292         pad_start = i*4;
293         pad_end = pad_start+(256-pad_start%256)+256;
294         for (i = pad_start;
295              (i < gpccs_imem_size * 256) && (i < pad_end);
296              i += 4) {
297                 if (i && ((i % 256) == 0)) {
298                         tag++;
299                         gk20a_writel(g, gr_gpccs_imemt_r(0),
300                                       gr_gpccs_imemt_tag_f(tag));
301                 }
302                 gk20a_writel(g, gr_gpccs_imemd_r(0), 0);
303         }
304
305         gk20a_writel(g, gr_fecs_imemc_r(0), (gr_fecs_imemc_offs_f(0) |
306                                              gr_fecs_imemc_blk_f(0) |
307                                              gr_fecs_imemc_aincw_f(1)));
308
309         /* Set up the tags for the instruction memory. */
310         tag = 0;
311         gk20a_writel(g, gr_fecs_imemt_r(0), gr_fecs_imemt_tag_f(tag));
312
313         ucode_u32_size = g->gr.ctx_vars.ucode.fecs.inst.count;
314         ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.fecs.inst.l;
315
316         for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
317                 if (i && ((i % (256/sizeof(u32))) == 0)) {
318                         tag++;
319                         gk20a_writel(g, gr_fecs_imemt_r(0),
320                                       gr_fecs_imemt_tag_f(tag));
321                 }
322                 gk20a_writel(g, gr_fecs_imemd_r(0), ucode_u32_data[i]);
323                 checksum += ucode_u32_data[i];
324         }
325
326         pad_start = i*4;
327         pad_end = pad_start+(256-pad_start%256)+256;
328         for (i = pad_start; (i < fecs_imem_size * 256) && i < pad_end; i += 4) {
329                 if (i && ((i % 256) == 0)) {
330                         tag++;
331                         gk20a_writel(g, gr_fecs_imemt_r(0),
332                                       gr_fecs_imemt_tag_f(tag));
333                 }
334                 gk20a_writel(g, gr_fecs_imemd_r(0), 0);
335         }
336 }
337
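/*
 * Poll until the GR engine reports idle and no context switch is in
 * progress, backing off exponentially up to GR_IDLE_CHECK_MAX.
 * Returns 0 once idle, -EAGAIN on timeout.
 */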
338 static int gr_gk20a_wait_idle(struct gk20a *g, unsigned long end_jiffies,
339                 u32 expect_delay)
340 {
341         u32 delay = expect_delay;
342         bool gr_enabled;
343         bool ctxsw_active;
344         bool gr_busy;
345
346         nvhost_dbg_fn("");
347
348         do {
349                 /* fmodel: host gets fifo_engine_status(gr) from gr
350                    only when gr_status is read */
351                 gk20a_readl(g, gr_status_r());
352
353                 gr_enabled = gk20a_readl(g, mc_enable_r()) &
354                         mc_enable_pgraph_enabled_f();
355
356                 ctxsw_active = gk20a_readl(g,
357                         fifo_engine_status_r(ENGINE_GR_GK20A)) &
358                         fifo_engine_status_ctxsw_in_progress_f();
359
360                 gr_busy = gk20a_readl(g, gr_engine_status_r()) &
361                         gr_engine_status_value_busy_f();
362
363                 if (!gr_enabled || (!gr_busy && !ctxsw_active)) {
364                         nvhost_dbg_fn("done");
365                         return 0;
366                 }
367
368                 usleep_range(delay, delay * 2);
369                 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
370
371         } while (time_before(jiffies, end_jiffies));
372
373         nvhost_err(dev_from_gk20a(g),
374                 "timeout, ctxsw busy : %d, gr busy : %d",
375                 ctxsw_active, gr_busy);
376
377         return -EAGAIN;
378 }
379
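/*
 * Pulse the FECS context-switch reset: force the GR clocks on, assert the
 * reset bits given in rst_mask (or the default sys/gpc/be context reset),
 * read the register back and delay so the reset can propagate, deassert
 * it, and finally return the power mode to auto.
 */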
380 static int gr_gk20a_ctx_reset(struct gk20a *g, u32 rst_mask)
381 {
382         u32 delay = GR_IDLE_CHECK_DEFAULT;
383         unsigned long end_jiffies = jiffies +
384                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
385         u32 reg;
386
387         nvhost_dbg_fn("");
388
389         /* Force clocks on */
390         gk20a_writel(g, gr_fe_pwr_mode_r(),
391                      gr_fe_pwr_mode_req_send_f() |
392                      gr_fe_pwr_mode_mode_force_on_f());
393
394         /* Wait for the clocks to indicate that they are on */
395         do {
396                 reg = gk20a_readl(g, gr_fe_pwr_mode_r());
397
398                 if (gr_fe_pwr_mode_req_v(reg) == gr_fe_pwr_mode_req_done_v())
399                         break;
400
401                 usleep_range(delay, delay * 2);
402                 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
403
404         } while (time_before(jiffies, end_jiffies));
405
406         if (!time_before(jiffies, end_jiffies)) {
407                 nvhost_err(dev_from_gk20a(g),
408                            "failed to force the clocks on\n");
409                 WARN_ON(1);
410         }
411
412         if (rst_mask) {
413                 gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(), rst_mask);
414         } else {
415                 gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(),
416                              gr_fecs_ctxsw_reset_ctl_sys_halt_disabled_f() |
417                              gr_fecs_ctxsw_reset_ctl_gpc_halt_disabled_f() |
418                              gr_fecs_ctxsw_reset_ctl_be_halt_disabled_f()  |
419                              gr_fecs_ctxsw_reset_ctl_sys_engine_reset_disabled_f() |
420                              gr_fecs_ctxsw_reset_ctl_gpc_engine_reset_disabled_f() |
421                              gr_fecs_ctxsw_reset_ctl_be_engine_reset_disabled_f()  |
422                              gr_fecs_ctxsw_reset_ctl_sys_context_reset_enabled_f() |
423                              gr_fecs_ctxsw_reset_ctl_gpc_context_reset_enabled_f() |
424                              gr_fecs_ctxsw_reset_ctl_be_context_reset_enabled_f());
425         }
426
427         /* we need to read the reset register *and* wait for a moment to ensure
428          * reset propagation */
429
430         gk20a_readl(g, gr_fecs_ctxsw_reset_ctl_r());
431         udelay(20);
432
433         gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(),
434                      gr_fecs_ctxsw_reset_ctl_sys_halt_disabled_f() |
435                      gr_fecs_ctxsw_reset_ctl_gpc_halt_disabled_f() |
436                      gr_fecs_ctxsw_reset_ctl_be_halt_disabled_f()  |
437                      gr_fecs_ctxsw_reset_ctl_sys_engine_reset_disabled_f() |
438                      gr_fecs_ctxsw_reset_ctl_gpc_engine_reset_disabled_f() |
439                      gr_fecs_ctxsw_reset_ctl_be_engine_reset_disabled_f()  |
440                      gr_fecs_ctxsw_reset_ctl_sys_context_reset_disabled_f() |
441                      gr_fecs_ctxsw_reset_ctl_gpc_context_reset_disabled_f() |
442                      gr_fecs_ctxsw_reset_ctl_be_context_reset_disabled_f());
443
444         /* read the reset register back and then wait a small moment for it to propagate */
445         gk20a_readl(g, gr_fecs_ctxsw_reset_ctl_r());
446         udelay(20);
447
448         /* Set power mode back to auto */
449         gk20a_writel(g, gr_fe_pwr_mode_r(),
450                      gr_fe_pwr_mode_req_send_f() |
451                      gr_fe_pwr_mode_mode_auto_f());
452
453         /* Wait for the request to complete */
454         end_jiffies = jiffies + msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
455         do {
456                 reg = gk20a_readl(g, gr_fe_pwr_mode_r());
457
458                 if (gr_fe_pwr_mode_req_v(reg) == gr_fe_pwr_mode_req_done_v())
459                         break;
460
461                 usleep_range(delay, delay * 2);
462                 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
463
464         } while (time_before(jiffies, end_jiffies));
465
466         if (!time_before(jiffies, end_jiffies)) {
467                 nvhost_err(dev_from_gk20a(g),
468                            "failed to set power mode to auto\n");
469                 WARN_ON(1);
470         }
471
472         return 0;
473 }
474
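/*
 * Poll a FECS ctxsw mailbox until it satisfies the success condition
 * (opc_success vs. mailbox_ok) or the failure condition (opc_fail vs.
 * mailbox_fail).  Falcon state is dumped on timeout or ucode error.
 */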
475 static int gr_gk20a_ctx_wait_ucode(struct gk20a *g, u32 mailbox_id,
476                                    u32 *mailbox_ret, u32 opc_success,
477                                    u32 mailbox_ok, u32 opc_fail,
478                                    u32 mailbox_fail)
479 {
480         unsigned long end_jiffies = jiffies +
481                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
482         u32 delay = GR_IDLE_CHECK_DEFAULT;
483         u32 check = WAIT_UCODE_LOOP;
484         u32 reg;
485
486         nvhost_dbg_fn("");
487
488         while (check == WAIT_UCODE_LOOP) {
489                 if (!time_before(jiffies, end_jiffies) &&
490                                 tegra_platform_is_silicon())
491                         check = WAIT_UCODE_TIMEOUT;
492
493                 reg = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(mailbox_id));
494
495                 if (mailbox_ret)
496                         *mailbox_ret = reg;
497
498                 switch (opc_success) {
499                 case GR_IS_UCODE_OP_EQUAL:
500                         if (reg == mailbox_ok)
501                                 check = WAIT_UCODE_OK;
502                         break;
503                 case GR_IS_UCODE_OP_NOT_EQUAL:
504                         if (reg != mailbox_ok)
505                                 check = WAIT_UCODE_OK;
506                         break;
507                 case GR_IS_UCODE_OP_AND:
508                         if (reg & mailbox_ok)
509                                 check = WAIT_UCODE_OK;
510                         break;
511                 case GR_IS_UCODE_OP_LESSER:
512                         if (reg < mailbox_ok)
513                                 check = WAIT_UCODE_OK;
514                         break;
515                 case GR_IS_UCODE_OP_LESSER_EQUAL:
516                         if (reg <= mailbox_ok)
517                                 check = WAIT_UCODE_OK;
518                         break;
519                 case GR_IS_UCODE_OP_SKIP:
520                         /* no success check to perform */
521                         break;
522                 default:
523                         nvhost_err(dev_from_gk20a(g),
524                                    "invalid success opcode 0x%x", opc_success);
525
526                         check = WAIT_UCODE_ERROR;
527                         break;
528                 }
529
530                 switch (opc_fail) {
531                 case GR_IS_UCODE_OP_EQUAL:
532                         if (reg == mailbox_fail)
533                                 check = WAIT_UCODE_ERROR;
534                         break;
535                 case GR_IS_UCODE_OP_NOT_EQUAL:
536                         if (reg != mailbox_fail)
537                                 check = WAIT_UCODE_ERROR;
538                         break;
539                 case GR_IS_UCODE_OP_AND:
540                         if (reg & mailbox_fail)
541                                 check = WAIT_UCODE_ERROR;
542                         break;
543                 case GR_IS_UCODE_OP_LESSER:
544                         if (reg < mailbox_fail)
545                                 check = WAIT_UCODE_ERROR;
546                         break;
547                 case GR_IS_UCODE_OP_LESSER_EQUAL:
548                         if (reg <= mailbox_fail)
549                                 check = WAIT_UCODE_ERROR;
550                         break;
551                 case GR_IS_UCODE_OP_SKIP:
552                         /* no failure check to perform */
553                         break;
554                 default:
555                         nvhost_err(dev_from_gk20a(g),
556                                    "invalid fail opcode 0x%x", opc_fail);
557                         check = WAIT_UCODE_ERROR;
558                         break;
559                 }
560
561                 usleep_range(delay, delay * 2);
562                 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
563         }
564
565         if (check == WAIT_UCODE_TIMEOUT) {
566                 nvhost_err(dev_from_gk20a(g),
567                            "timeout waiting on ucode response");
568                 gk20a_fecs_dump_falcon_stats(g);
569                 return -1;
570         } else if (check == WAIT_UCODE_ERROR) {
571                 nvhost_err(dev_from_gk20a(g),
572                            "ucode method failed on mailbox=%d value=0x%08x",
573                            mailbox_id, reg);
574                 gk20a_fecs_dump_falcon_stats(g);
575                 return -1;
576         }
577
578         nvhost_dbg_fn("done");
579         return 0;
580 }
581
582 /* The following is a less brittle way to call gr_gk20a_submit_fecs_method(...).
583  * We should replace most, if not all, fecs method calls with this instead. */
584 struct fecs_method_op_gk20a {
585         struct {
586                 u32 addr;
587                 u32 data;
588         } method;
589
590         struct {
591                 u32 id;
592                 u32 data;
593                 u32 clr;
594                 u32 *ret;
595                 u32 ok;
596                 u32 fail;
597         } mailbox;
598
599         struct {
600                 u32 ok;
601                 u32 fail;
602         } cond;
603
604 };
605
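/*
 * Submit one FECS method described by 'op' under fecs_mutex: prime the
 * requested mailbox, clear mailbox 0, push the method data and address,
 * then wait for the ucode to report ok/fail via gr_gk20a_ctx_wait_ucode().
 */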
606 int gr_gk20a_submit_fecs_method_op(struct gk20a *g,
607                                    struct fecs_method_op_gk20a op)
608 {
609         struct gr_gk20a *gr = &g->gr;
610         int ret;
611
612         mutex_lock(&gr->fecs_mutex);
613
614         if (op.mailbox.id != 0)
615                 gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(op.mailbox.id),
616                              op.mailbox.data);
617
618         gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0),
619                 gr_fecs_ctxsw_mailbox_clear_value_f(op.mailbox.clr));
620
621         gk20a_writel(g, gr_fecs_method_data_r(), op.method.data);
622         gk20a_writel(g, gr_fecs_method_push_r(),
623                 gr_fecs_method_push_adr_f(op.method.addr));
624
625         /* op.mailbox.id == 4 cases require waiting for completion
626          * on mailbox 0 instead */
627         if (op.mailbox.id == 4)
628                 op.mailbox.id = 0;
629
630         ret = gr_gk20a_ctx_wait_ucode(g, op.mailbox.id, op.mailbox.ret,
631                                       op.cond.ok, op.mailbox.ok,
632                                       op.cond.fail, op.mailbox.fail);
633
634         mutex_unlock(&gr->fecs_mutex);
635
636         return ret;
637 }
638
639 int gr_gk20a_ctrl_ctxsw(struct gk20a *g, u32 fecs_method, u32 *ret)
640 {
641         return gr_gk20a_submit_fecs_method_op(g,
642               (struct fecs_method_op_gk20a) {
643                       .method.addr = fecs_method,
644                       .method.data = ~0,
645                       .mailbox = { .id   = 1, /*sideband?*/
646                                    .data = ~0, .clr = ~0, .ret = ret,
647                                    .ok   = gr_fecs_ctxsw_mailbox_value_pass_v(),
648                                    .fail = gr_fecs_ctxsw_mailbox_value_fail_v(), },
649                       .cond.ok = GR_IS_UCODE_OP_EQUAL,
650                       .cond.fail = GR_IS_UCODE_OP_EQUAL });
651 }
652
653 /* Stop processing (stall) context switches at FECS */
654 int gr_gk20a_disable_ctxsw(struct gk20a *g)
655 {
656         nvhost_dbg(dbg_fn | dbg_gpu_dbg, "");
657         return gr_gk20a_ctrl_ctxsw(g, gr_fecs_method_push_adr_stop_ctxsw_v(), 0);
658 }
659
660 /* Start processing (continue) context switches at FECS */
661 int gr_gk20a_enable_ctxsw(struct gk20a *g)
662 {
663         nvhost_dbg(dbg_fn | dbg_gpu_dbg, "");
664         return gr_gk20a_ctrl_ctxsw(g, gr_fecs_method_push_adr_start_ctxsw_v(), 0);
665 }
666
667
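/*
 * Write the graphics context GPU VA into the channel's instance block
 * (the ram_in_gr_wfi_* fields, virtual mode).  FB and L2 are flushed
 * first so the committed pointer refers to a coherent context image.
 */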
668 static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va)
669 {
670         u32 addr_lo;
671         u32 addr_hi;
672         void *inst_ptr = NULL;
673
674         nvhost_dbg_fn("");
675
676         /* flush gpu_va before commit */
677         gk20a_mm_fb_flush(c->g);
678         gk20a_mm_l2_flush(c->g, true);
679
680         inst_ptr = c->inst_block.cpuva;
681         if (!inst_ptr)
682                 return -ENOMEM;
683
684         addr_lo = u64_lo32(gpu_va) >> 12;
685         addr_hi = u64_hi32(gpu_va);
686
687         mem_wr32(inst_ptr, ram_in_gr_wfi_target_w(),
688                  ram_in_gr_cs_wfi_f() | ram_in_gr_wfi_mode_virtual_f() |
689                  ram_in_gr_wfi_ptr_lo_f(addr_lo));
690
691         mem_wr32(inst_ptr, ram_in_gr_wfi_ptr_hi_w(),
692                  ram_in_gr_wfi_ptr_hi_f(addr_hi));
693
694         gk20a_mm_l2_invalidate(c->g);
695
696         return 0;
697 }
698
699 /*
700  * Context state can be written directly or "patched" at times.
701  * So that the code can be used in either situation it is written
702  * as a series of _ctx_patch_write(..., patch) statements.
703  * However, any necessary cpu map/unmap and gpu l2 invalidates
704  * should be minimized (to avoid doing them once per patch write).
705  * Before a sequence of these, set up with "_ctx_patch_write_begin"
706  * and close with "_ctx_patch_write_end."
707  */
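/*
 * Illustrative usage of the begin/write/end pattern (see
 * gr_gk20a_commit_global_cb_manager() below for a real caller):
 *
 *   gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
 *   gr_gk20a_ctx_patch_write(g, ch_ctx, addr, data, true);
 *   ...more patch writes...
 *   gr_gk20a_ctx_patch_write_end(g, ch_ctx);
 */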
708 static int gr_gk20a_ctx_patch_write_begin(struct gk20a *g,
709                                           struct channel_ctx_gk20a *ch_ctx)
710 {
711         /* being defensive still... */
712         if (ch_ctx->patch_ctx.cpu_va) {
713                 nvhost_err(dev_from_gk20a(g), "nested ctx patch begin?");
714                 return -EBUSY;
715         }
716
717         ch_ctx->patch_ctx.cpu_va =
718                 nvhost_memmgr_mmap(ch_ctx->patch_ctx.mem.ref);
719
720         if (!ch_ctx->patch_ctx.cpu_va)
721                 return -ENOMEM;
722
723         return 0;
724 }
725
726 static int gr_gk20a_ctx_patch_write_end(struct gk20a *g,
727                                         struct channel_ctx_gk20a *ch_ctx)
728 {
729         /* being defensive still... */
730         if (!ch_ctx->patch_ctx.cpu_va) {
731                 nvhost_err(dev_from_gk20a(g), "dangling ctx patch end?");
732                 return -EINVAL;
733         }
734
735         nvhost_memmgr_munmap(ch_ctx->patch_ctx.mem.ref,
736                              ch_ctx->patch_ctx.cpu_va);
737         ch_ctx->patch_ctx.cpu_va = NULL;
738
739         gk20a_mm_l2_invalidate(g);
740         return 0;
741 }
742
743 static int gr_gk20a_ctx_patch_write(struct gk20a *g,
744                                     struct channel_ctx_gk20a *ch_ctx,
745                                     u32 addr, u32 data, bool patch)
746 {
747         u32 patch_slot = 0;
748         void *patch_ptr = NULL;
749         bool mapped_here = false;
750
751         BUG_ON(patch != 0 && ch_ctx == NULL);
752
753         if (patch) {
754                 if (!ch_ctx)
755                         return -EINVAL;
756                 /* we added an optimization prolog/epilog
757                  * to get rid of unnecessary maps and l2 invals,
758                  * but be defensive still... */
759                 if (!ch_ctx->patch_ctx.cpu_va) {
760                         int err;
761                         nvhost_err(dev_from_gk20a(g),
762                                    "per-write ctx patch begin?");
763                         /* yes, gr_gk20a_ctx_patch_smpc causes this one */
764                         err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
765                         if (err)
766                                 return err;
767                         mapped_here = true;
768                 } else
769                         mapped_here = false;
770
771                 patch_ptr = ch_ctx->patch_ctx.cpu_va;
772                 patch_slot = ch_ctx->patch_ctx.data_count * 2;
773
774                 mem_wr32(patch_ptr, patch_slot++, addr);
775                 mem_wr32(patch_ptr, patch_slot++, data);
776
777                 ch_ctx->patch_ctx.data_count++;
778
779                 if (mapped_here)
780                         gr_gk20a_ctx_patch_write_end(g, ch_ctx);
781
782         } else
783                 gk20a_writel(g, addr, data);
784
785         return 0;
786 }
787
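/*
 * Point FECS at this channel's instance block: issue the BIND_POINTER
 * method with the instance block address and wait on mailbox 0 for the
 * 0x10 (ok) / 0x20 (fail) bits.
 */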
788 static int gr_gk20a_fecs_ctx_bind_channel(struct gk20a *g,
789                                         struct channel_gk20a *c)
790 {
791         u32 inst_base_ptr = u64_lo32(c->inst_block.cpu_pa
792                                      >> ram_in_base_shift_v());
793         u32 ret;
794
795         nvhost_dbg_info("bind channel %d inst ptr 0x%08x",
796                    c->hw_chid, inst_base_ptr);
797
798         ret = gr_gk20a_submit_fecs_method_op(g,
799                      (struct fecs_method_op_gk20a) {
800                      .method.addr = gr_fecs_method_push_adr_bind_pointer_v(),
801                      .method.data = (gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
802                                      gr_fecs_current_ctx_target_vid_mem_f() |
803                                      gr_fecs_current_ctx_valid_f(1)),
804                      .mailbox = { .id = 0, .data = 0,
805                                   .clr = 0x30,
806                                   .ret = NULL,
807                                   .ok = 0x10,
808                                   .fail = 0x20, },
809                      .cond.ok = GR_IS_UCODE_OP_AND,
810                      .cond.fail = GR_IS_UCODE_OP_AND});
811         if (ret)
812                 nvhost_err(dev_from_gk20a(g),
813                         "bind channel instance failed");
814
815         return ret;
816 }
817
818 static int gr_gk20a_ctx_zcull_setup(struct gk20a *g, struct channel_gk20a *c,
819                                     bool disable_fifo)
820 {
821         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
822         struct fifo_gk20a *f = &g->fifo;
823         struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
824         u32 va_lo, va_hi, va;
825         int ret = 0;
826         void *ctx_ptr = NULL;
827
828         nvhost_dbg_fn("");
829
830         ctx_ptr = nvhost_memmgr_mmap(ch_ctx->gr_ctx.mem.ref);
831         if (!ctx_ptr)
832                 return -ENOMEM;
833
834         if (ch_ctx->zcull_ctx.gpu_va == 0 &&
835             ch_ctx->zcull_ctx.ctx_sw_mode ==
836                 ctxsw_prog_main_image_zcull_mode_separate_buffer_v()) {
837                 ret = -EINVAL;
838                 goto clean_up;
839         }
840
841         va_lo = u64_lo32(ch_ctx->zcull_ctx.gpu_va);
842         va_hi = u64_hi32(ch_ctx->zcull_ctx.gpu_va);
843         va = ((va_lo >> 8) & 0x00FFFFFF) | ((va_hi << 24) & 0xFF000000);
844
845         if (disable_fifo) {
846                 ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
847                 if (ret) {
848                         nvhost_err(dev_from_gk20a(g),
849                                 "failed to disable gr engine activity\n");
850                         goto clean_up;
851                 }
852         }
853
854         /* Channel gr_ctx buffer is gpu cacheable.
855            Flush and invalidate before cpu update. */
856         gk20a_mm_fb_flush(g);
857         gk20a_mm_l2_flush(g, true);
858
859         mem_wr32(ctx_ptr + ctxsw_prog_main_image_zcull_o(), 0,
860                  ch_ctx->zcull_ctx.ctx_sw_mode);
861
862         mem_wr32(ctx_ptr + ctxsw_prog_main_image_zcull_ptr_o(), 0, va);
863
864         if (disable_fifo) {
865                 ret = gk20a_fifo_enable_engine_activity(g, gr_info);
866                 if (ret) {
867                         nvhost_err(dev_from_gk20a(g),
868                                 "failed to enable gr engine activity\n");
869                         goto clean_up;
870                 }
871         }
872         gk20a_mm_l2_invalidate(g);
873
874 clean_up:
875         nvhost_memmgr_munmap(ch_ctx->gr_ctx.mem.ref, ctx_ptr);
876
877         return ret;
878 }
879
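/*
 * Program the circular buffer manager configuration: the attribute (beta)
 * and alpha circular-buffer sizes and their offsets within the attribute
 * buffer chunk, per GPC/PPC, written directly or via the patch context.
 */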
880 static int gr_gk20a_commit_global_cb_manager(struct gk20a *g,
881                         struct channel_gk20a *c, bool patch)
882 {
883         struct gr_gk20a *gr = &g->gr;
884         struct channel_ctx_gk20a *ch_ctx = NULL;
885         u32 attrib_offset_in_chunk = 0;
886         u32 alpha_offset_in_chunk = 0;
887         u32 pd_ab_max_output;
888         u32 gpc_index, ppc_index;
889         u32 temp;
890         u32 cbm_cfg_size1, cbm_cfg_size2;
891
892         nvhost_dbg_fn("");
893
894         if (patch) {
895                 int err;
896                 ch_ctx = &c->ch_ctx;
897                 err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
898                 if (err)
899                         return err;
900         }
901
902         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_tga_constraintlogic_r(),
903                 gr_ds_tga_constraintlogic_beta_cbsize_f(gr->attrib_cb_default_size) |
904                 gr_ds_tga_constraintlogic_alpha_cbsize_f(gr->alpha_cb_default_size),
905                 patch);
906
907         pd_ab_max_output = (gr->alpha_cb_default_size *
908                 gr_gpc0_ppc0_cbm_cfg_size_granularity_v()) /
909                 gr_pd_ab_dist_cfg1_max_output_granularity_v();
910
911         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg1_r(),
912                 gr_pd_ab_dist_cfg1_max_output_f(pd_ab_max_output) |
913                 gr_pd_ab_dist_cfg1_max_batches_init_f(), patch);
914
915         alpha_offset_in_chunk = attrib_offset_in_chunk +
916                 gr->tpc_count * gr->attrib_cb_size;
917
918         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
919                 temp = proj_gpc_stride_v() * gpc_index;
920                 for (ppc_index = 0; ppc_index < gr->gpc_ppc_count[gpc_index];
921                      ppc_index++) {
922                         cbm_cfg_size1 = gr->attrib_cb_default_size *
923                                 gr->pes_tpc_count[ppc_index][gpc_index];
924                         cbm_cfg_size2 = gr->alpha_cb_default_size *
925                                 gr->pes_tpc_count[ppc_index][gpc_index];
926
927                         gr_gk20a_ctx_patch_write(g, ch_ctx,
928                                 gr_gpc0_ppc0_cbm_cfg_r() + temp +
929                                 proj_ppc_in_gpc_stride_v() * ppc_index,
930                                 gr_gpc0_ppc0_cbm_cfg_timeslice_mode_f(gr->timeslice_mode) |
931                                 gr_gpc0_ppc0_cbm_cfg_start_offset_f(attrib_offset_in_chunk) |
932                                 gr_gpc0_ppc0_cbm_cfg_size_f(cbm_cfg_size1), patch);
933
934                         attrib_offset_in_chunk += gr->attrib_cb_size *
935                                 gr->pes_tpc_count[ppc_index][gpc_index];
936
937                         gr_gk20a_ctx_patch_write(g, ch_ctx,
938                                 gr_gpc0_ppc0_cbm_cfg2_r() + temp +
939                                 proj_ppc_in_gpc_stride_v() * ppc_index,
940                                 gr_gpc0_ppc0_cbm_cfg2_start_offset_f(alpha_offset_in_chunk) |
941                                 gr_gpc0_ppc0_cbm_cfg2_size_f(cbm_cfg_size2), patch);
942
943                         alpha_offset_in_chunk += gr->alpha_cb_size *
944                                 gr->pes_tpc_count[ppc_index][gpc_index];
945                 }
946         }
947
948         if (patch)
949                 gr_gk20a_ctx_patch_write_end(g, ch_ctx);
950
951         return 0;
952 }
953
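/*
 * Commit the channel-specific GPU VAs of the global context buffers
 * (pagepool, bundle circular buffer, attribute buffer) into the SCC,
 * GPC and PD registers, directly or via the patch context.
 */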
954 static int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g,
955                         struct channel_gk20a *c, bool patch)
956 {
957         struct gr_gk20a *gr = &g->gr;
958         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
959         u64 addr;
960         u32 size;
961         u32 data;
962
963         nvhost_dbg_fn("");
964         if (patch) {
965                 int err;
966                 err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
967                 if (err)
968                         return err;
969         }
970
971         /* global pagepool buffer */
972         addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) >>
973                 gr_scc_pagepool_base_addr_39_8_align_bits_v()) |
974                 (u64_hi32(ch_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) <<
975                  (32 - gr_scc_pagepool_base_addr_39_8_align_bits_v()));
976
977         size = gr->global_ctx_buffer[PAGEPOOL].size /
978                 gr_scc_pagepool_total_pages_byte_granularity_v();
979
980         if (size == gr_scc_pagepool_total_pages_hwmax_value_v())
981                 size = gr_scc_pagepool_total_pages_hwmax_v();
982
983         nvhost_dbg_info("pagepool buffer addr : 0x%016llx, size : %d",
984                 addr, size);
985
986         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_pagepool_base_r(),
987                 gr_scc_pagepool_base_addr_39_8_f(addr), patch);
988
989         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_pagepool_r(),
990                 gr_scc_pagepool_total_pages_f(size) |
991                 gr_scc_pagepool_valid_true_f(), patch);
992
993         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gcc_pagepool_base_r(),
994                 gr_gpcs_gcc_pagepool_base_addr_39_8_f(addr), patch);
995
996         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gcc_pagepool_r(),
997                 gr_gpcs_gcc_pagepool_total_pages_f(size), patch);
998
999         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_pagepool_r(),
1000                 gr_pd_pagepool_total_pages_f(size) |
1001                 gr_pd_pagepool_valid_true_f(), patch);
1002
1003         /* global bundle cb */
1004         addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[CIRCULAR_VA]) >>
1005                 gr_scc_bundle_cb_base_addr_39_8_align_bits_v()) |
1006                 (u64_hi32(ch_ctx->global_ctx_buffer_va[CIRCULAR_VA]) <<
1007                  (32 - gr_scc_bundle_cb_base_addr_39_8_align_bits_v()));
1008
1009         size = gr->bundle_cb_default_size;
1010
1011         nvhost_dbg_info("bundle cb addr : 0x%016llx, size : %d",
1012                 addr, size);
1013
1014         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_bundle_cb_base_r(),
1015                 gr_scc_bundle_cb_base_addr_39_8_f(addr), patch);
1016
1017         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_bundle_cb_size_r(),
1018                 gr_scc_bundle_cb_size_div_256b_f(size) |
1019                 gr_scc_bundle_cb_size_valid_true_f(), patch);
1020
1021         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_bundle_cb_base_r(),
1022                 gr_gpcs_setup_bundle_cb_base_addr_39_8_f(addr), patch);
1023
1024         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_bundle_cb_size_r(),
1025                 gr_gpcs_setup_bundle_cb_size_div_256b_f(size) |
1026                 gr_gpcs_setup_bundle_cb_size_valid_true_f(), patch);
1027
1028         /* data for state_limit */
1029         data = (gr->bundle_cb_default_size *
1030                 gr_scc_bundle_cb_size_div_256b_byte_granularity_v()) /
1031                 gr_pd_ab_dist_cfg2_state_limit_scc_bundle_granularity_v();
1032
1033         data = min_t(u32, data, gr->min_gpm_fifo_depth);
1034
1035         nvhost_dbg_info("bundle cb token limit : %d, state limit : %d",
1036                    gr->bundle_cb_token_limit, data);
1037
1038         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg2_r(),
1039                 gr_pd_ab_dist_cfg2_token_limit_f(gr->bundle_cb_token_limit) |
1040                 gr_pd_ab_dist_cfg2_state_limit_f(data), patch);
1041
1042         /* global attrib cb */
1043         addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[ATTRIBUTE_VA]) >>
1044                 gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v()) |
1045                 (u64_hi32(ch_ctx->global_ctx_buffer_va[ATTRIBUTE_VA]) <<
1046                  (32 - gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v()));
1047
1048         nvhost_dbg_info("attrib cb addr : 0x%016llx", addr);
1049
1050         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_attrib_cb_base_r(),
1051                 gr_gpcs_setup_attrib_cb_base_addr_39_12_f(addr) |
1052                 gr_gpcs_setup_attrib_cb_base_valid_true_f(), patch);
1053
1054         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pe_pin_cb_global_base_addr_r(),
1055                 gr_gpcs_tpcs_pe_pin_cb_global_base_addr_v_f(addr) |
1056                 gr_gpcs_tpcs_pe_pin_cb_global_base_addr_valid_true_f(), patch);
1057
1058         if (patch)
1059                 gr_gk20a_ctx_patch_write_end(g, ch_ctx);
1060
1061         return 0;
1062 }
1063
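/*
 * Enable or disable timeslice mode in the GPM, PE, PD, DS and MPC units
 * according to gr->timeslice_mode, using read-modify-write of the current
 * register values.
 */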
1064 static int gr_gk20a_commit_global_timeslice(struct gk20a *g, struct channel_gk20a *c, bool patch)
1065 {
1066         struct gr_gk20a *gr = &g->gr;
1067         struct channel_ctx_gk20a *ch_ctx = NULL;
1068         u32 gpm_pd_cfg;
1069         u32 pd_ab_dist_cfg0;
1070         u32 ds_debug;
1071         u32 mpc_vtg_debug;
1072         u32 pe_vaf;
1073         u32 pe_vsc_vpc;
1074
1075         nvhost_dbg_fn("");
1076
1077         gpm_pd_cfg = gk20a_readl(g, gr_gpcs_gpm_pd_cfg_r());
1078         pd_ab_dist_cfg0 = gk20a_readl(g, gr_pd_ab_dist_cfg0_r());
1079         ds_debug = gk20a_readl(g, gr_ds_debug_r());
1080         mpc_vtg_debug = gk20a_readl(g, gr_gpcs_tpcs_mpc_vtg_debug_r());
1081
1082         if (patch) {
1083                 int err;
1084                 ch_ctx = &c->ch_ctx;
1085                 err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
1086                 if (err)
1087                         return err;
1088         }
1089
1090         if (gr->timeslice_mode == gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v()) {
1091                 pe_vaf = gk20a_readl(g, gr_gpcs_tpcs_pe_vaf_r());
1092                 pe_vsc_vpc = gk20a_readl(g, gr_gpcs_tpcs_pes_vsc_vpc_r());
1093
1094                 gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_enable_f() | gpm_pd_cfg;
1095                 pe_vaf = gr_gpcs_tpcs_pe_vaf_fast_mode_switch_true_f() | pe_vaf;
1096                 pe_vsc_vpc = gr_gpcs_tpcs_pes_vsc_vpc_fast_mode_switch_true_f() | pe_vsc_vpc;
1097                 pd_ab_dist_cfg0 = gr_pd_ab_dist_cfg0_timeslice_enable_en_f() | pd_ab_dist_cfg0;
1098                 ds_debug = gr_ds_debug_timeslice_mode_enable_f() | ds_debug;
1099                 mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_enabled_f() | mpc_vtg_debug;
1100
1101                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, patch);
1102                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pe_vaf_r(), pe_vaf, patch);
1103                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pes_vsc_vpc_r(), pe_vsc_vpc, patch);
1104                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, patch);
1105                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_debug_r(), ds_debug, patch);
1106                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, patch);
1107         } else {
1108                 gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_disable_f() | gpm_pd_cfg;
1109                 pd_ab_dist_cfg0 = gr_pd_ab_dist_cfg0_timeslice_enable_dis_f() | pd_ab_dist_cfg0;
1110                 ds_debug = gr_ds_debug_timeslice_mode_disable_f() | ds_debug;
1111                 mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_disabled_f() | mpc_vtg_debug;
1112
1113                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, patch);
1114                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, patch);
1115                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_debug_r(), ds_debug, patch);
1116                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, patch);
1117         }
1118
1119         if (patch)
1120                 gr_gk20a_ctx_patch_write_end(g, ch_ctx);
1121
1122         return 0;
1123 }
1124
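/*
 * Write the GPC tile map (gr->map_tiles) into the CRSTR, WWDX and RSTR2D
 * map tables.  The WWDX table also takes a normalized entry count and
 * (1 << n) % norm_entries coefficients derived from the TPC count.
 */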
1125 static int gr_gk20a_setup_rop_mapping(struct gk20a *g,
1126                                 struct gr_gk20a *gr)
1127 {
1128         u32 norm_entries, norm_shift;
1129         u32 coeff5_mod, coeff6_mod, coeff7_mod, coeff8_mod, coeff9_mod, coeff10_mod, coeff11_mod;
1130         u32 map0, map1, map2, map3, map4, map5;
1131
1132         if (!gr->map_tiles)
1133                 return -1;
1134
1135         nvhost_dbg_fn("");
1136
1137         gk20a_writel(g, gr_crstr_map_table_cfg_r(),
1138                      gr_crstr_map_table_cfg_row_offset_f(gr->map_row_offset) |
1139                      gr_crstr_map_table_cfg_num_entries_f(gr->tpc_count));
1140
1141         map0 =  gr_crstr_gpc_map0_tile0_f(gr->map_tiles[0]) |
1142                 gr_crstr_gpc_map0_tile1_f(gr->map_tiles[1]) |
1143                 gr_crstr_gpc_map0_tile2_f(gr->map_tiles[2]) |
1144                 gr_crstr_gpc_map0_tile3_f(gr->map_tiles[3]) |
1145                 gr_crstr_gpc_map0_tile4_f(gr->map_tiles[4]) |
1146                 gr_crstr_gpc_map0_tile5_f(gr->map_tiles[5]);
1147
1148         map1 =  gr_crstr_gpc_map1_tile6_f(gr->map_tiles[6]) |
1149                 gr_crstr_gpc_map1_tile7_f(gr->map_tiles[7]) |
1150                 gr_crstr_gpc_map1_tile8_f(gr->map_tiles[8]) |
1151                 gr_crstr_gpc_map1_tile9_f(gr->map_tiles[9]) |
1152                 gr_crstr_gpc_map1_tile10_f(gr->map_tiles[10]) |
1153                 gr_crstr_gpc_map1_tile11_f(gr->map_tiles[11]);
1154
1155         map2 =  gr_crstr_gpc_map2_tile12_f(gr->map_tiles[12]) |
1156                 gr_crstr_gpc_map2_tile13_f(gr->map_tiles[13]) |
1157                 gr_crstr_gpc_map2_tile14_f(gr->map_tiles[14]) |
1158                 gr_crstr_gpc_map2_tile15_f(gr->map_tiles[15]) |
1159                 gr_crstr_gpc_map2_tile16_f(gr->map_tiles[16]) |
1160                 gr_crstr_gpc_map2_tile17_f(gr->map_tiles[17]);
1161
1162         map3 =  gr_crstr_gpc_map3_tile18_f(gr->map_tiles[18]) |
1163                 gr_crstr_gpc_map3_tile19_f(gr->map_tiles[19]) |
1164                 gr_crstr_gpc_map3_tile20_f(gr->map_tiles[20]) |
1165                 gr_crstr_gpc_map3_tile21_f(gr->map_tiles[21]) |
1166                 gr_crstr_gpc_map3_tile22_f(gr->map_tiles[22]) |
1167                 gr_crstr_gpc_map3_tile23_f(gr->map_tiles[23]);
1168
1169         map4 =  gr_crstr_gpc_map4_tile24_f(gr->map_tiles[24]) |
1170                 gr_crstr_gpc_map4_tile25_f(gr->map_tiles[25]) |
1171                 gr_crstr_gpc_map4_tile26_f(gr->map_tiles[26]) |
1172                 gr_crstr_gpc_map4_tile27_f(gr->map_tiles[27]) |
1173                 gr_crstr_gpc_map4_tile28_f(gr->map_tiles[28]) |
1174                 gr_crstr_gpc_map4_tile29_f(gr->map_tiles[29]);
1175
1176         map5 =  gr_crstr_gpc_map5_tile30_f(gr->map_tiles[30]) |
1177                 gr_crstr_gpc_map5_tile31_f(gr->map_tiles[31]) |
1178                 gr_crstr_gpc_map5_tile32_f(0) |
1179                 gr_crstr_gpc_map5_tile33_f(0) |
1180                 gr_crstr_gpc_map5_tile34_f(0) |
1181                 gr_crstr_gpc_map5_tile35_f(0);
1182
1183         gk20a_writel(g, gr_crstr_gpc_map0_r(), map0);
1184         gk20a_writel(g, gr_crstr_gpc_map1_r(), map1);
1185         gk20a_writel(g, gr_crstr_gpc_map2_r(), map2);
1186         gk20a_writel(g, gr_crstr_gpc_map3_r(), map3);
1187         gk20a_writel(g, gr_crstr_gpc_map4_r(), map4);
1188         gk20a_writel(g, gr_crstr_gpc_map5_r(), map5);
1189
1190         switch (gr->tpc_count) {
1191         case 1:
1192                 norm_shift = 4;
1193                 break;
1194         case 2:
1195         case 3:
1196                 norm_shift = 3;
1197                 break;
1198         case 4:
1199         case 5:
1200         case 6:
1201         case 7:
1202                 norm_shift = 2;
1203                 break;
1204         case 8:
1205         case 9:
1206         case 10:
1207         case 11:
1208         case 12:
1209         case 13:
1210         case 14:
1211         case 15:
1212                 norm_shift = 1;
1213                 break;
1214         default:
1215                 norm_shift = 0;
1216                 break;
1217         }
1218
1219         norm_entries = gr->tpc_count << norm_shift;
1220         coeff5_mod = (1 << 5) % norm_entries;
1221         coeff6_mod = (1 << 6) % norm_entries;
1222         coeff7_mod = (1 << 7) % norm_entries;
1223         coeff8_mod = (1 << 8) % norm_entries;
1224         coeff9_mod = (1 << 9) % norm_entries;
1225         coeff10_mod = (1 << 10) % norm_entries;
1226         coeff11_mod = (1 << 11) % norm_entries;
1227
1228         gk20a_writel(g, gr_ppcs_wwdx_map_table_cfg_r(),
1229                      gr_ppcs_wwdx_map_table_cfg_row_offset_f(gr->map_row_offset) |
1230                      gr_ppcs_wwdx_map_table_cfg_normalized_num_entries_f(norm_entries) |
1231                      gr_ppcs_wwdx_map_table_cfg_normalized_shift_value_f(norm_shift) |
1232                      gr_ppcs_wwdx_map_table_cfg_coeff5_mod_value_f(coeff5_mod) |
1233                      gr_ppcs_wwdx_map_table_cfg_num_entries_f(gr->tpc_count));
1234
1235         gk20a_writel(g, gr_ppcs_wwdx_map_table_cfg2_r(),
1236                      gr_ppcs_wwdx_map_table_cfg2_coeff6_mod_value_f(coeff6_mod) |
1237                      gr_ppcs_wwdx_map_table_cfg2_coeff7_mod_value_f(coeff7_mod) |
1238                      gr_ppcs_wwdx_map_table_cfg2_coeff8_mod_value_f(coeff8_mod) |
1239                      gr_ppcs_wwdx_map_table_cfg2_coeff9_mod_value_f(coeff9_mod) |
1240                      gr_ppcs_wwdx_map_table_cfg2_coeff10_mod_value_f(coeff10_mod) |
1241                      gr_ppcs_wwdx_map_table_cfg2_coeff11_mod_value_f(coeff11_mod));
1242
1243         gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map0_r(), map0);
1244         gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map1_r(), map1);
1245         gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map2_r(), map2);
1246         gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map3_r(), map3);
1247         gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map4_r(), map4);
1248         gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map5_r(), map5);
1249
1250         gk20a_writel(g, gr_rstr2d_map_table_cfg_r(),
1251                      gr_rstr2d_map_table_cfg_row_offset_f(gr->map_row_offset) |
1252                      gr_rstr2d_map_table_cfg_num_entries_f(gr->tpc_count));
1253
1254         gk20a_writel(g, gr_rstr2d_gpc_map0_r(), map0);
1255         gk20a_writel(g, gr_rstr2d_gpc_map1_r(), map1);
1256         gk20a_writel(g, gr_rstr2d_gpc_map2_r(), map2);
1257         gk20a_writel(g, gr_rstr2d_gpc_map3_r(), map3);
1258         gk20a_writel(g, gr_rstr2d_gpc_map4_r(), map4);
1259         gk20a_writel(g, gr_rstr2d_gpc_map5_r(), map5);
1260
1261         return 0;
1262 }
1263
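/*
 * count_bits(): population count by clearing the lowest set bit per
 * iteration.  clear_count_bits(): clear the 'clear_count' lowest set
 * bits of 'num'.
 */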
1264 static inline u32 count_bits(u32 mask)
1265 {
1266         u32 temp = mask;
1267         u32 count;
1268         for (count = 0; temp != 0; count++)
1269                 temp &= temp - 1;
1270
1271         return count;
1272 }
1273
1274 static inline u32 clear_count_bits(u32 num, u32 clear_count)
1275 {
1276         u32 count = clear_count;
1277         for (; (num != 0) && (count != 0); count--)
1278                 num &= num - 1;
1279
1280         return num;
1281 }
1282
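/*
 * Build the PD alpha/beta ratio tables: for each of the 32 rows the TPC
 * count is split between an alpha target and a beta target, and per-GPC
 * TPC masks are accumulated into the packed table registers.
 */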
1283 static int gr_gk20a_setup_alpha_beta_tables(struct gk20a *g,
1284                                         struct gr_gk20a *gr)
1285 {
1286         u32 table_index_bits = 5;
1287         u32 rows = (1 << table_index_bits);
1288         u32 row_stride = gr_pd_alpha_ratio_table__size_1_v() / rows;
1289
1290         u32 row;
1291         u32 index;
1292         u32 gpc_index;
1293         u32 gpcs_per_reg = 4;
1294         u32 pes_index;
1295         u32 tpc_count_pes;
1296         u32 num_pes_per_gpc = proj_scal_litter_num_pes_per_gpc_v();
1297
1298         u32 alpha_target, beta_target;
1299         u32 alpha_bits, beta_bits;
1300         u32 alpha_mask, beta_mask, partial_mask;
1301         u32 reg_offset;
1302         bool assign_alpha;
1303
1304         u32 map_alpha[gr_pd_alpha_ratio_table__size_1_v()];
1305         u32 map_beta[gr_pd_alpha_ratio_table__size_1_v()];
1306         u32 map_reg_used[gr_pd_alpha_ratio_table__size_1_v()];
1307
1308         nvhost_dbg_fn("");
1309
1310         memset(map_alpha, 0, gr_pd_alpha_ratio_table__size_1_v() * sizeof(u32));
1311         memset(map_beta, 0, gr_pd_alpha_ratio_table__size_1_v() * sizeof(u32));
1312         memset(map_reg_used, 0, gr_pd_alpha_ratio_table__size_1_v() * sizeof(u32));
1313
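             /*
              * Each of the 32 table rows encodes one split of the tpc_count
              * TPCs between alpha and beta work: row 'row' targets roughly
              * tpc_count * row / rows alpha TPCs (at least one) and leaves the
              * rest for beta.  E.g. with tpc_count = 2, row 16 gives
              * alpha_target = beta_target = 1.  The loops below turn the
              * targets into per-GPC TPC masks, alternating which side takes
              * the next PES so the split stays balanced across PES units.
              */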
1314         for (row = 0; row < rows; ++row) {
1315                 alpha_target = max_t(u32, gr->tpc_count * row / rows, 1);
1316                 beta_target = gr->tpc_count - alpha_target;
1317
1318                 assign_alpha = (alpha_target < beta_target);
1319
1320                 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
1321                         reg_offset = (row * row_stride) + (gpc_index / gpcs_per_reg);
1322                         alpha_mask = beta_mask = 0;
1323
1324                         for (pes_index = 0; pes_index < num_pes_per_gpc; pes_index++) {
1325                                 tpc_count_pes = gr->pes_tpc_count[pes_index][gpc_index];
1326
1327                                 if (assign_alpha) {
1328                                         alpha_bits = (alpha_target == 0) ? 0 : tpc_count_pes;
1329                                         beta_bits = tpc_count_pes - alpha_bits;
1330                                 } else {
1331                                         beta_bits = (beta_target == 0) ? 0 : tpc_count_pes;
1332                                         alpha_bits = tpc_count_pes - beta_bits;
1333                                 }
1334
1335                                 partial_mask = gr->pes_tpc_mask[pes_index][gpc_index];
1336                                 partial_mask = clear_count_bits(partial_mask, tpc_count_pes - alpha_bits);
1337                                 alpha_mask |= partial_mask;
1338
1339                                 partial_mask = gr->pes_tpc_mask[pes_index][gpc_index] ^ partial_mask;
1340                                 beta_mask |= partial_mask;
1341
1342                                 alpha_target -= min(alpha_bits, alpha_target);
1343                                 beta_target -= min(beta_bits, beta_target);
1344
1345                                 if ((alpha_bits > 0) || (beta_bits > 0))
1346                                         assign_alpha = !assign_alpha;
1347                         }
1348
1349                         switch (gpc_index % gpcs_per_reg) {
1350                         case 0:
1351                                 map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n0_mask_f(alpha_mask);
1352                                 map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n0_mask_f(beta_mask);
1353                                 break;
1354                         case 1:
1355                                 map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n1_mask_f(alpha_mask);
1356                                 map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n1_mask_f(beta_mask);
1357                                 break;
1358                         case 2:
1359                                 map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n2_mask_f(alpha_mask);
1360                                 map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n2_mask_f(beta_mask);
1361                                 break;
1362                         case 3:
1363                                 map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n3_mask_f(alpha_mask);
1364                                 map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n3_mask_f(beta_mask);
1365                                 break;
1366                         }
1367                         map_reg_used[reg_offset] = true;
1368                 }
1369         }
1370
1371         for (index = 0; index < gr_pd_alpha_ratio_table__size_1_v(); index++) {
1372                 if (map_reg_used[index]) {
1373                         gk20a_writel(g, gr_pd_alpha_ratio_table_r(index), map_alpha[index]);
1374                         gk20a_writel(g, gr_pd_beta_ratio_table_r(index), map_beta[index]);
1375                 }
1376         }
1377
1378         return 0;
1379 }
1380
1381 static int gr_gk20a_ctx_state_floorsweep(struct gk20a *g)
1382 {
1383         struct gr_gk20a *gr = &g->gr;
1384         u32 tpc_index, gpc_index;
1385         u32 tpc_offset, gpc_offset;
1386         u32 sm_id = 0, gpc_id = 0;
1387         u32 sm_id_to_gpc_id[proj_scal_max_gpcs_v() * proj_scal_max_tpc_per_gpc_v()];
1388         u32 tpc_per_gpc;
1389         u32 max_ways_evict = INVALID_MAX_WAYS;
1390
1391         nvhost_dbg_fn("");
1392
1393         for (tpc_index = 0; tpc_index < gr->max_tpc_per_gpc_count; tpc_index++) {
1394                 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
1395                         gpc_offset = proj_gpc_stride_v() * gpc_index;
1396                         if (tpc_index < gr->gpc_tpc_count[gpc_index]) {
1397                                 tpc_offset = proj_tpc_in_gpc_stride_v() * tpc_index;
1398
1399                                 gk20a_writel(g, gr_gpc0_tpc0_sm_cfg_r() + gpc_offset + tpc_offset,
1400                                              gr_gpc0_tpc0_sm_cfg_sm_id_f(sm_id));
1401                                 gk20a_writel(g, gr_gpc0_tpc0_l1c_cfg_smid_r() + gpc_offset + tpc_offset,
1402                                              gr_gpc0_tpc0_l1c_cfg_smid_value_f(sm_id));
1403                                 gk20a_writel(g, gr_gpc0_gpm_pd_sm_id_r(tpc_index) + gpc_offset,
1404                                              gr_gpc0_gpm_pd_sm_id_id_f(sm_id));
1405                                 gk20a_writel(g, gr_gpc0_tpc0_pe_cfg_smid_r() + gpc_offset + tpc_offset,
1406                                              gr_gpc0_tpc0_pe_cfg_smid_value_f(sm_id));
1407
1408                                 sm_id_to_gpc_id[sm_id] = gpc_index;
1409                                 sm_id++;
1410                         }
1411
1412                         gk20a_writel(g, gr_gpc0_gpm_pd_active_tpcs_r() + gpc_offset,
1413                                      gr_gpc0_gpm_pd_active_tpcs_num_f(gr->gpc_tpc_count[gpc_index]));
1414                         gk20a_writel(g, gr_gpc0_gpm_sd_active_tpcs_r() + gpc_offset,
1415                                      gr_gpc0_gpm_sd_active_tpcs_num_f(gr->gpc_tpc_count[gpc_index]));
1416                 }
1417         }
1418
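             /*
              * Each gr_pd_num_tpc_per_gpc register packs the TPC counts of
              * eight consecutive GPCs into one word; gpc_id wraps back to 0
              * once it passes gpc_count.
              */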
1419         for (tpc_index = 0, gpc_id = 0;
1420              tpc_index < gr_pd_num_tpc_per_gpc__size_1_v();
1421              tpc_index++, gpc_id += 8) {
1422
1423                 if (gpc_id >= gr->gpc_count)
1424                         gpc_id = 0;
1425
1426                 tpc_per_gpc =
1427                         gr_pd_num_tpc_per_gpc_count0_f(gr->gpc_tpc_count[gpc_id + 0]) |
1428                         gr_pd_num_tpc_per_gpc_count1_f(gr->gpc_tpc_count[gpc_id + 1]) |
1429                         gr_pd_num_tpc_per_gpc_count2_f(gr->gpc_tpc_count[gpc_id + 2]) |
1430                         gr_pd_num_tpc_per_gpc_count3_f(gr->gpc_tpc_count[gpc_id + 3]) |
1431                         gr_pd_num_tpc_per_gpc_count4_f(gr->gpc_tpc_count[gpc_id + 4]) |
1432                         gr_pd_num_tpc_per_gpc_count5_f(gr->gpc_tpc_count[gpc_id + 5]) |
1433                         gr_pd_num_tpc_per_gpc_count6_f(gr->gpc_tpc_count[gpc_id + 6]) |
1434                         gr_pd_num_tpc_per_gpc_count7_f(gr->gpc_tpc_count[gpc_id + 7]);
1435
1436                 gk20a_writel(g, gr_pd_num_tpc_per_gpc_r(tpc_index), tpc_per_gpc);
1437                 gk20a_writel(g, gr_ds_num_tpc_per_gpc_r(tpc_index), tpc_per_gpc);
1438         }
1439
1440         /* gr_gk20a_setup_pd_mapping is stubbed for gk20a */
1441         gr_gk20a_setup_rop_mapping(g, gr);
1442         gr_gk20a_setup_alpha_beta_tables(g, gr);
1443
1444         if (gr->num_fbps == 1)
1445                 max_ways_evict = 9;
1446
1447         if (max_ways_evict != INVALID_MAX_WAYS)
1448                 gk20a_writel(g, ltc_ltcs_ltss_tstg_set_mgmt_r(),
1449                              ((gk20a_readl(g, ltc_ltcs_ltss_tstg_set_mgmt_r()) &
1450                                ~(ltc_ltcs_ltss_tstg_set_mgmt_max_ways_evict_last_f(~0))) |
1451                               ltc_ltcs_ltss_tstg_set_mgmt_max_ways_evict_last_f(max_ways_evict)));
1452
1453         for (gpc_index = 0;
1454              gpc_index < gr_pd_dist_skip_table__size_1_v() * 4;
1455              gpc_index += 4) {
1456
1457                 gk20a_writel(g, gr_pd_dist_skip_table_r(gpc_index/4),
1458                              gr_pd_dist_skip_table_gpc_4n0_mask_f(gr->gpc_skip_mask[gpc_index]) |
1459                              gr_pd_dist_skip_table_gpc_4n1_mask_f(gr->gpc_skip_mask[gpc_index + 1]) |
1460                              gr_pd_dist_skip_table_gpc_4n2_mask_f(gr->gpc_skip_mask[gpc_index + 2]) |
1461                              gr_pd_dist_skip_table_gpc_4n3_mask_f(gr->gpc_skip_mask[gpc_index + 3]));
1462         }
1463
1464         gk20a_writel(g, gr_cwd_fs_r(),
1465                      gr_cwd_fs_num_gpcs_f(gr->gpc_count) |
1466                      gr_cwd_fs_num_tpcs_f(gr->tpc_count));
1467
1468         gk20a_writel(g, gr_bes_zrop_settings_r(),
1469                      gr_bes_zrop_settings_num_active_fbps_f(gr->num_fbps));
1470         gk20a_writel(g, gr_bes_crop_settings_r(),
1471                      gr_bes_crop_settings_num_active_fbps_f(gr->num_fbps));
1472
1473         return 0;
1474 }
1475
1476 static int gr_gk20a_fecs_ctx_image_save(struct channel_gk20a *c, u32 save_type)
1477 {
1478         struct gk20a *g = c->g;
1479         int ret;
1480
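             /*
              * The instance block pointer handed to FECS is the block's
              * physical address with the low ram_in_base bits dropped
              * (presumably a 12-bit shift, i.e. a 4KB-aligned page number)
              * so it fits the current_ctx_ptr field of the method data.
              */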
1481         u32 inst_base_ptr =
1482                 u64_lo32(c->inst_block.cpu_pa
1483                 >> ram_in_base_shift_v());
1484
1485
1486         nvhost_dbg_fn("");
1487
1488         ret = gr_gk20a_submit_fecs_method_op(g,
1489                 (struct fecs_method_op_gk20a) {
1490                 .method.addr = save_type,
1491                 .method.data = (gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
1492                                 gr_fecs_current_ctx_target_vid_mem_f() |
1493                                 gr_fecs_current_ctx_valid_f(1)),
1494                 .mailbox = {.id = 0, .data = 0, .clr = 3, .ret = NULL,
1495                         .ok = 1, .fail = 2,
1496                 },
1497                 .cond.ok = GR_IS_UCODE_OP_AND,
1498                 .cond.fail = GR_IS_UCODE_OP_AND,
1499                  });
1500
1501         if (ret)
1502                 nvhost_err(dev_from_gk20a(g), "save context image failed");
1503
1504         return ret;
1505 }
1506
1507 /* init global golden image from a fresh gr_ctx in channel ctx.
1508    save a copy in local_golden_image in ctx_vars */
1509 static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
1510                                           struct channel_gk20a *c)
1511 {
1512         struct gr_gk20a *gr = &g->gr;
1513         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
1514         u32 ctx_header_bytes = ctxsw_prog_fecs_header_v();
1515         u32 ctx_header_words;
1516         u32 i;
1517         u32 data;
1518         void *ctx_ptr = NULL;
1519         void *gold_ptr = NULL;
1520         int err = 0;
1521
1522         nvhost_dbg_fn("");
1523
1524         /* The golden ctx is global to all channels. Although only the first
1525            channel initializes the golden image, the driver needs to prevent
1526            multiple channels from initializing the golden ctx at the same time */
1527         mutex_lock(&gr->ctx_mutex);
1528
1529         if (gr->ctx_vars.golden_image_initialized)
1530                 goto clean_up;
1531
1532         err = gr_gk20a_fecs_ctx_bind_channel(g, c);
1533         if (err)
1534                 goto clean_up;
1535
1536         err = gr_gk20a_elpg_protected_call(g,
1537                         gr_gk20a_commit_global_ctx_buffers(g, c, false));
1538         if (err)
1539                 goto clean_up;
1540
1541         gold_ptr = nvhost_memmgr_mmap(gr->global_ctx_buffer[GOLDEN_CTX].ref);
1542         if (!gold_ptr) {
                     err = -ENOMEM;
1543                 goto clean_up;
             }
1544
1545         ctx_ptr = nvhost_memmgr_mmap(ch_ctx->gr_ctx.mem.ref);
1546         if (!ctx_ptr) {
                     err = -ENOMEM;
1547                 goto clean_up;
             }
1548
1549         ctx_header_words = roundup(ctx_header_bytes, sizeof(u32));
1550         ctx_header_words >>= 2;
1551
1552         /* Channel gr_ctx buffer is gpu cacheable.
1553            Flush before cpu read. */
1554         gk20a_mm_fb_flush(g);
1555         gk20a_mm_l2_flush(g, false);
1556
1557         for (i = 0; i < ctx_header_words; i++) {
1558                 data = mem_rd32(ctx_ptr, i);
1559                 mem_wr32(gold_ptr, i, data);
1560         }
1561
1562         mem_wr32(gold_ptr + ctxsw_prog_main_image_zcull_o(), 0,
1563                  ctxsw_prog_main_image_zcull_mode_no_ctxsw_v());
1564
1565         mem_wr32(gold_ptr + ctxsw_prog_main_image_zcull_ptr_o(), 0, 0);
1566
1567         gr_gk20a_commit_inst(c, ch_ctx->global_ctx_buffer_va[GOLDEN_CTX_VA]);
1568
1569         gr_gk20a_fecs_ctx_image_save(c, gr_fecs_method_push_adr_wfi_golden_save_v());
1570
1571         if (gr->ctx_vars.local_golden_image == NULL) {
1572
1573                 gr->ctx_vars.local_golden_image =
1574                         kzalloc(gr->ctx_vars.golden_image_size, GFP_KERNEL);
1575
1576                 if (gr->ctx_vars.local_golden_image == NULL) {
1577                         err = -ENOMEM;
1578                         goto clean_up;
1579                 }
1580
1581                 for (i = 0; i < gr->ctx_vars.golden_image_size / 4; i++)
1582                         gr->ctx_vars.local_golden_image[i] =
1583                                 mem_rd32(gold_ptr, i);
1584         }
1585
1586         gr_gk20a_commit_inst(c, ch_ctx->gr_ctx.gpu_va);
1587
1588         gr->ctx_vars.golden_image_initialized = true;
1589
1590         gk20a_mm_l2_invalidate(g);
1591
1592         gk20a_writel(g, gr_fecs_current_ctx_r(),
1593                 gr_fecs_current_ctx_valid_false_f());
1594
1595 clean_up:
1596         if (err)
1597                 nvhost_err(dev_from_gk20a(g), "fail");
1598         else
1599                 nvhost_dbg_fn("done");
1600
1601         if (gold_ptr)
1602                 nvhost_memmgr_munmap(gr->global_ctx_buffer[GOLDEN_CTX].ref,
1603                                      gold_ptr);
1604         if (ctx_ptr)
1605                 nvhost_memmgr_munmap(ch_ctx->gr_ctx.mem.ref, ctx_ptr);
1606
1607         mutex_unlock(&gr->ctx_mutex);
1608         return err;
1609 }
1610
1611 /* load the saved copy of the golden image into the channel gr_ctx */
1612 static int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
1613                                         struct channel_gk20a *c)
1614 {
1615         struct gr_gk20a *gr = &g->gr;
1616         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
1617         u32 virt_addr_lo;
1618         u32 virt_addr_hi;
1619         u32 i;
1620         int ret = 0;
1621         void *ctx_ptr = NULL;
1622
1623         nvhost_dbg_fn("");
1624
1625         if (gr->ctx_vars.local_golden_image == NULL)
1626                 return -1;
1627
1628         /* Channel gr_ctx buffer is gpu cacheable.
1629            Flush and invalidate before cpu update. */
1630         gk20a_mm_fb_flush(g);
1631         gk20a_mm_l2_flush(g, true);
1632
1633         ctx_ptr = nvhost_memmgr_mmap(ch_ctx->gr_ctx.mem.ref);
1634         if (!ctx_ptr)
1635                 return -ENOMEM;
1636
1637         for (i = 0; i < gr->ctx_vars.golden_image_size / 4; i++)
1638                 mem_wr32(ctx_ptr, i, gr->ctx_vars.local_golden_image[i]);
1639
1640         mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_save_ops_o(), 0, 0);
1641         mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_restore_ops_o(), 0, 0);
1642
1643         virt_addr_lo = u64_lo32(ch_ctx->patch_ctx.gpu_va);
1644         virt_addr_hi = u64_hi32(ch_ctx->patch_ctx.gpu_va);
1645
1646         mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_count_o(), 0,
1647                  ch_ctx->patch_ctx.data_count);
1648         mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_adr_lo_o(), 0,
1649                  virt_addr_lo);
1650         mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_adr_hi_o(), 0,
1651                  virt_addr_hi);
1652
1653         /* no user for client managed performance counter ctx */
1654         ch_ctx->pm_ctx.ctx_sw_mode =
1655                 ctxsw_prog_main_image_pm_mode_no_ctxsw_v();
1656
1657         mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0,
1658                 ch_ctx->pm_ctx.ctx_sw_mode);
1659         mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_ptr_o(), 0, 0);
1660
1661         nvhost_memmgr_munmap(ch_ctx->gr_ctx.mem.ref, ctx_ptr);
1662
1663         gk20a_mm_l2_invalidate(g);
1664
1665         if (tegra_platform_is_linsim()) {
1666                 u32 inst_base_ptr =
1667                         u64_lo32(c->inst_block.cpu_pa
1668                         >> ram_in_base_shift_v());
1669
1670                 ret = gr_gk20a_submit_fecs_method_op(g,
1671                           (struct fecs_method_op_gk20a) {
1672                                   .method.data =
1673                                           (gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
1674                                            gr_fecs_current_ctx_target_vid_mem_f() |
1675                                            gr_fecs_current_ctx_valid_f(1)),
1676                                   .method.addr =
1677                                           gr_fecs_method_push_adr_restore_golden_v(),
1678                                   .mailbox = {
1679                                           .id = 0, .data = 0,
1680                                           .clr = ~0, .ret = NULL,
1681                                           .ok = gr_fecs_ctxsw_mailbox_value_pass_v(),
1682                                           .fail = 0},
1683                                   .cond.ok = GR_IS_UCODE_OP_EQUAL,
1684                                   .cond.fail = GR_IS_UCODE_OP_SKIP});
1685
1686                 if (ret)
1687                         nvhost_err(dev_from_gk20a(g),
1688                                    "restore context image failed");
1689         }
1690
1691         return ret;
1692 }
1693
1694 static void gr_gk20a_start_falcon_ucode(struct gk20a *g)
1695 {
1696         nvhost_dbg_fn("");
1697
1698         gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0),
1699                      gr_fecs_ctxsw_mailbox_clear_value_f(~0));
1700
1701         gk20a_writel(g, gr_gpccs_dmactl_r(), gr_gpccs_dmactl_require_ctx_f(0));
1702         gk20a_writel(g, gr_fecs_dmactl_r(), gr_fecs_dmactl_require_ctx_f(0));
1703
1704         gk20a_writel(g, gr_gpccs_cpuctl_r(), gr_gpccs_cpuctl_startcpu_f(1));
1705         gk20a_writel(g, gr_fecs_cpuctl_r(), gr_fecs_cpuctl_startcpu_f(1));
1706
1707         nvhost_dbg_fn("done");
1708 }
1709
1710 static int gr_gk20a_init_ctxsw_ucode_vaspace(struct gk20a *g)
1711 {
1712         struct mm_gk20a *mm = &g->mm;
1713         struct vm_gk20a *vm = &mm->pmu.vm;
1714         struct device *d = dev_from_gk20a(g);
1715         struct gk20a_ctxsw_ucode_info *p_ucode_info = &g->ctxsw_ucode_info;
1716         void *inst_ptr;
1717         u32 pde_addr_lo;
1718         u32 pde_addr_hi;
1719         u64 pde_addr;
1720
1721         /* Alloc mem of inst block */
1722         p_ucode_info->inst_blk_desc.size = ram_in_alloc_size_v();
1723         p_ucode_info->inst_blk_desc.cpuva = dma_alloc_coherent(d,
1724                                         p_ucode_info->inst_blk_desc.size,
1725                                         &p_ucode_info->inst_blk_desc.iova,
1726                                         GFP_KERNEL);
1727         if (!p_ucode_info->inst_blk_desc.cpuva) {
1728                 nvhost_err(d, "failed to allocate memory\n");
1729                 return -ENOMEM;
1730         }
1731
1732         p_ucode_info->inst_blk_desc.cpu_pa = gk20a_get_phys_from_iova(d,
1733                                         p_ucode_info->inst_blk_desc.iova);
1734
1735         inst_ptr = p_ucode_info->inst_blk_desc.cpuva;
1736
1737         /* Set inst block */
1738         mem_wr32(inst_ptr, ram_in_adr_limit_lo_w(),
1739                  u64_lo32(vm->va_limit) | 0xFFF);
1740         mem_wr32(inst_ptr, ram_in_adr_limit_hi_w(),
1741                 ram_in_adr_limit_hi_f(u64_hi32(vm->va_limit)));
1742
1743         pde_addr = gk20a_mm_iova_addr(vm->pdes.sgt->sgl);
1744         pde_addr_lo = u64_lo32(pde_addr >> 12);
1745         pde_addr_hi = u64_hi32(pde_addr);
1746         mem_wr32(inst_ptr, ram_in_page_dir_base_lo_w(),
1747                 ram_in_page_dir_base_target_vid_mem_f() |
1748                 ram_in_page_dir_base_vol_true_f() |
1749                 ram_in_page_dir_base_lo_f(pde_addr_lo));
1750         mem_wr32(inst_ptr, ram_in_page_dir_base_hi_w(),
1751                 ram_in_page_dir_base_hi_f(pde_addr_hi));
1752
1753         /* Map ucode surface to GMMU */
1754         p_ucode_info->ucode_gpuva = gk20a_gmmu_map(vm,
1755                                         &p_ucode_info->surface_desc.sgt,
1756                                         p_ucode_info->surface_desc.size,
1757                                         0, /* flags */
1758                                         mem_flag_read_only);
1759         if (!p_ucode_info->ucode_gpuva) {
1760                 nvhost_err(d, "failed to update gmmu ptes\n");
1761                 return -ENOMEM;
1762         }
1763
1764         return 0;
1765 }
1766
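     /*
      * Carve one segment out of the ucode surface: it starts at the running
      * *p_offset and the next segment begins at the next BLK_SIZE boundary.
      * E.g. with *p_offset = 0 and size = 0x85 the segment covers [0, 0x85)
      * and *p_offset advances to ALIGN(0x85, 256) = 0x100.
      */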
1767 static void gr_gk20a_init_ctxsw_ucode_segment(
1768         struct gk20a_ctxsw_ucode_segment *p_seg, u32 *p_offset, u32 size)
1769 {
1770         p_seg->offset = *p_offset;
1771         p_seg->size = size;
1772         *p_offset = ALIGN(*p_offset + size, BLK_SIZE);
1773 }
1774
1775 static void gr_gk20a_init_ctxsw_ucode_inst(
1776         struct gk20a_ctxsw_ucode_inst *p_inst, u32 *p_offset,
1777         struct gk20a_ctxsw_bootloader_desc *p_bootdesc,
1778         u32 code_size, u32 data_size)
1779 {
1780         u32 boot_size = ALIGN(p_bootdesc->bootloader_size, sizeof(u32));
1781         p_inst->boot_entry = p_bootdesc->bootloader_entry_point;
1782         p_inst->boot_imem_offset = p_bootdesc->bootloader_imem_offset;
1783         gr_gk20a_init_ctxsw_ucode_segment(&p_inst->boot, p_offset, boot_size);
1784         gr_gk20a_init_ctxsw_ucode_segment(&p_inst->code, p_offset, code_size);
1785         gr_gk20a_init_ctxsw_ucode_segment(&p_inst->data, p_offset, data_size);
1786 }
1787
1788 static int gr_gk20a_copy_ctxsw_ucode_inst(
1789         u8 *p_buf,
1790         struct gk20a_ctxsw_ucode_inst *p_inst,
1791         struct gk20a_ctxsw_bootloader_desc *p_bootdesc, u32 *p_bootimage,
1792         u32 *p_code, u32 *p_data)
1793 {
1794         memcpy(p_buf + p_inst->boot.offset, p_bootimage, p_inst->boot.size);
1795         memcpy(p_buf + p_inst->code.offset, p_code, p_inst->code.size);
1796         memcpy(p_buf + p_inst->data.offset, p_data, p_inst->data.size);
1797         return 0;
1798 }
1799
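     /*
      * Build the combined ctxsw ucode surface.  The resulting layout, with
      * every segment aligned to BLK_SIZE, is
      *
      *   FECS boot | FECS code | FECS data | GPCCS boot | GPCCS code | GPCCS data
      *
      * The surface is a read-only DMA allocation, filled once here and then
      * mapped into the PMU VM so the falcons can later DMA their images
      * from it.
      */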
1800 static int gr_gk20a_init_ctxsw_ucode(struct gk20a *g)
1801 {
1802         struct device *d = dev_from_gk20a(g);
1803         struct mm_gk20a *mm = &g->mm;
1804         struct vm_gk20a *vm = &mm->pmu.vm;
1805         struct gk20a_ctxsw_bootloader_desc *p_fecs_boot_desc =
1806                 &g_fecs_bootloader_desc;
1807         struct gk20a_ctxsw_bootloader_desc *p_gpcs_boot_desc =
1808                 &g_gpccs_bootloader_desc;
1809         u32 *p_fecs_boot_image = g_fecs_bootloader_image;
1810         u32 *p_gpcs_boot_image = g_gpccs_bootloader_image;
1811         struct gk20a_ctxsw_ucode_info *p_ucode_info = &g->ctxsw_ucode_info;
1812         u8 *p_buf;
1813         u32 ucode_size;
1814         int err = 0;
1815         DEFINE_DMA_ATTRS(attrs);
1816
1817         ucode_size = 0;
1818         gr_gk20a_init_ctxsw_ucode_inst(&p_ucode_info->fecs, &ucode_size,
1819                 p_fecs_boot_desc,
1820                 g->gr.ctx_vars.ucode.fecs.inst.count * sizeof(u32),
1821                 g->gr.ctx_vars.ucode.fecs.data.count * sizeof(u32));
1822         gr_gk20a_init_ctxsw_ucode_inst(&p_ucode_info->gpcs, &ucode_size,
1823                 p_gpcs_boot_desc,
1824                 g->gr.ctx_vars.ucode.gpccs.inst.count * sizeof(u32),
1825                 g->gr.ctx_vars.ucode.gpccs.data.count * sizeof(u32));
1826
1827         p_ucode_info->surface_desc.size = ucode_size;
1828         dma_set_attr(DMA_ATTR_READ_ONLY, &attrs);
1829         p_ucode_info->surface_desc.cpuva = dma_alloc_attrs(d,
1830                                         p_ucode_info->surface_desc.size,
1831                                         &p_ucode_info->surface_desc.iova,
1832                                         GFP_KERNEL,
1833                                         &attrs);
1834         if (!p_ucode_info->surface_desc.cpuva) {
1835                 nvhost_err(d, "memory allocation failed\n");
1836                 err = -ENOMEM;
1837                 goto clean_up;
1838         }
1839
1840         err = gk20a_get_sgtable(d, &p_ucode_info->surface_desc.sgt,
1841                                 p_ucode_info->surface_desc.cpuva,
1842                                 p_ucode_info->surface_desc.iova,
1843                                 p_ucode_info->surface_desc.size);
1844         if (err) {
1845                 nvhost_err(d, "failed to create sg table\n");
1846                 goto clean_up;
1847         }
1848
1849         p_buf = (u8 *)p_ucode_info->surface_desc.cpuva;
1850
1851         gr_gk20a_copy_ctxsw_ucode_inst(p_buf, &p_ucode_info->fecs,
1852                 p_fecs_boot_desc, p_fecs_boot_image,
1853                 g->gr.ctx_vars.ucode.fecs.inst.l,
1854                 g->gr.ctx_vars.ucode.fecs.data.l);
1855
1856         gr_gk20a_copy_ctxsw_ucode_inst(p_buf, &p_ucode_info->gpcs,
1857                 p_gpcs_boot_desc, p_gpcs_boot_image,
1858                 g->gr.ctx_vars.ucode.gpccs.inst.l,
1859                 g->gr.ctx_vars.ucode.gpccs.data.l);
1860
1861         err = gr_gk20a_init_ctxsw_ucode_vaspace(g);
1862         if (err)
1863                 goto clean_up;
1864
1865         gk20a_free_sgtable(&p_ucode_info->surface_desc.sgt);
1866
1867         return 0;
1868
1869  clean_up:
1870         if (p_ucode_info->ucode_gpuva)
1871                 gk20a_gmmu_unmap(vm, p_ucode_info->ucode_gpuva,
1872                         p_ucode_info->surface_desc.size, mem_flag_none);
1873         if (p_ucode_info->surface_desc.sgt)
1874                 gk20a_free_sgtable(&p_ucode_info->surface_desc.sgt);
1875         if (p_ucode_info->surface_desc.cpuva)
1876                 dma_free_attrs(d, p_ucode_info->surface_desc.size,
1877                                 p_ucode_info->surface_desc.cpuva,
1878                                 p_ucode_info->surface_desc.iova,
1879                                 &attrs);
1880         p_ucode_info->surface_desc.cpuva = NULL;
1881         p_ucode_info->surface_desc.iova = 0;
1882
1883         return err;
1884 }
1885
1886 static void gr_gk20a_load_falcon_bind_instblk(struct gk20a *g)
1887 {
1888         struct gk20a_ctxsw_ucode_info *p_ucode_info = &g->ctxsw_ucode_info;
1889         int retries = 20;
1890         phys_addr_t inst_ptr;
1891         u32 val;
1892
1893         while ((gk20a_readl(g, gr_fecs_ctxsw_status_1_r()) &
1894                         gr_fecs_ctxsw_status_1_arb_busy_m()) && retries) {
1895                 udelay(2);
1896                 retries--;
1897         }
1898         if (!retries)
1899                 nvhost_err(dev_from_gk20a(g), "arbiter idle timeout");
1900
1901         gk20a_writel(g, gr_fecs_arb_ctx_adr_r(), 0x0);
1902
1903         inst_ptr = p_ucode_info->inst_blk_desc.cpu_pa;
1904         gk20a_writel(g, gr_fecs_new_ctx_r(),
1905                         gr_fecs_new_ctx_ptr_f(inst_ptr >> 12) |
1906                         gr_fecs_new_ctx_target_m() |
1907                         gr_fecs_new_ctx_valid_m());
1908
1909         gk20a_writel(g, gr_fecs_arb_ctx_ptr_r(),
1910                         gr_fecs_arb_ctx_ptr_ptr_f(inst_ptr >> 12) |
1911                         gr_fecs_arb_ctx_ptr_target_m());
1912
1913         gk20a_writel(g, gr_fecs_arb_ctx_cmd_r(), 0x7);
1914
1915         /* Wait for arbiter command to complete */
1916         retries = 20;
1917         val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
1918         while (gr_fecs_arb_ctx_cmd_cmd_v(val) && retries) {
1919                 udelay(2);
1920                 retries--;
1921                 val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
1922         }
1923         if (!retries)
1924                 nvhost_err(dev_from_gk20a(g), "arbiter complete timeout");
1925
1926         gk20a_writel(g, gr_fecs_current_ctx_r(),
1927                         gr_fecs_current_ctx_ptr_f(inst_ptr >> 12) |
1928                         gr_fecs_current_ctx_target_m() |
1929                         gr_fecs_current_ctx_valid_m());
1930         /* Send command to arbiter to flush */
1931         gk20a_writel(g, gr_fecs_arb_ctx_cmd_r(), gr_fecs_arb_ctx_cmd_cmd_s());
1932
1933         retries = 20;
1934         val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
1935         while (gr_fecs_arb_ctx_cmd_cmd_v(val) && retries) {
1936                 udelay(2);
1937                 retries--;
1938                 val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
1939         }
1940         if (!retries)
1941                 nvhost_err(dev_from_gk20a(g), "arbiter complete timeout");
1942 }
1943
1944 static int gr_gk20a_load_ctxsw_ucode_inst(struct gk20a *g, u64 addr_base,
1945         struct gk20a_ctxsw_ucode_inst *p_inst, u32 reg_offset)
1946 {
1947         u32 addr_code32;
1948         u32 addr_data32;
1949         u32 addr_load32;
1950         u32 dst = 0;
1951         u32 blocks;
1952         u32 b;
1953
1954         addr_code32 = u64_lo32((addr_base + p_inst->code.offset) >> 8);
1955         addr_data32 = u64_lo32((addr_base + p_inst->data.offset) >> 8);
1956         addr_load32 = u64_lo32((addr_base + p_inst->boot.offset) >> 8);
1957
1958         gk20a_writel(g, reg_offset + gr_fecs_dmactl_r(),
1959                         gr_fecs_dmactl_require_ctx_f(0));
1960
1961         /*
1962          * Copy falcon bootloader header into dmem at offset 0.
1963          * Configure dmem port 0 for auto-incrementing writes starting at dmem
1964          * offset 0.
1965          */
1966         gk20a_writel(g, reg_offset + gr_fecs_dmemc_r(0),
1967                         gr_fecs_dmemc_offs_f(0) |
1968                         gr_fecs_dmemc_blk_f(0) |
1969                         gr_fecs_dmemc_aincw_f(1));
1970
1971         /* Write out the actual data */
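             /*
              * Ten words go through the auto-incrementing dmem port:
              * 0, code address (>> 8), 0, code size, 0, data address (>> 8),
              * data size, code address (>> 8), 0, 0.  These presumably form
              * the header the boot loader parses to locate its code and data
              * segments in FB memory.
              */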
1972         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
1973         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), addr_code32);
1974         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
1975         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), p_inst->code.size);
1976         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
1977         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), addr_data32);
1978         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), p_inst->data.size);
1979         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), addr_code32);
1980         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
1981         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
1982
1983         blocks = ((p_inst->boot.size + 0xFF) & ~0xFF) >> 8;
1984
1985         /*
1986          * Set the base FB address for the DMA transfer. Subtract off the 256
1987          * byte IMEM block offset such that the relative FB and IMEM offsets
1988          * match, allowing the IMEM tags to be properly created.
1989          */
1990
1991         dst = p_inst->boot_imem_offset;
1992         gk20a_writel(g, reg_offset + gr_fecs_dmatrfbase_r(),
1993                         (addr_load32 - (dst >> 8)));
1994
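             /*
              * One DMA request per 256-byte block of the boot image: imem_f(1)
              * targets IMEM and size_f(0x06) presumably selects a 256-byte
              * transfer.  E.g. a 0x85-byte boot image rounds up to one block.
              */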
1995         for (b = 0; b < blocks; b++) {
1996                 /* Setup destination IMEM offset */
1997                 gk20a_writel(g, reg_offset + gr_fecs_dmatrfmoffs_r(),
1998                                 dst + (b << 8));
1999
2000                 /* Setup source offset (relative to BASE) */
2001                 gk20a_writel(g, reg_offset + gr_fecs_dmatrffboffs_r(),
2002                                 dst + (b << 8));
2003
2004                 gk20a_writel(g, reg_offset + gr_fecs_dmatrfcmd_r(),
2005                                 gr_fecs_dmatrfcmd_imem_f(0x01) |
2006                                 gr_fecs_dmatrfcmd_write_f(0x00) |
2007                                 gr_fecs_dmatrfcmd_size_f(0x06) |
2008                                 gr_fecs_dmatrfcmd_ctxdma_f(0));
2009         }
2010
2011         /* Specify the falcon boot vector */
2012         gk20a_writel(g, reg_offset + gr_fecs_bootvec_r(),
2013                         gr_fecs_bootvec_vec_f(p_inst->boot_entry));
2014
2015         /* Write to CPUCTL to start the falcon */
2016         gk20a_writel(g, reg_offset + gr_fecs_cpuctl_r(),
2017                         gr_fecs_cpuctl_startcpu_f(0x01));
2018
2019         return 0;
2020 }
2021
2022 static void gr_gk20a_load_falcon_with_bootloader(struct gk20a *g)
2023 {
2024         struct gk20a_ctxsw_ucode_info *p_ucode_info = &g->ctxsw_ucode_info;
2025         u64 addr_base = p_ucode_info->ucode_gpuva;
2026
2027         gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0x0);
2028
2029         gr_gk20a_load_falcon_bind_instblk(g);
2030
2031         gr_gk20a_load_ctxsw_ucode_inst(g, addr_base,
2032                 &g->ctxsw_ucode_info.fecs, 0);
2033
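             /*
              * The GPCCS falcon mirrors the FECS register layout at a
              * different base, so the same loader is reused with reg_offset
              * set to the distance between the two falcons' register blocks.
              */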
2034         gr_gk20a_load_ctxsw_ucode_inst(g, addr_base,
2035                 &g->ctxsw_ucode_info.gpcs,
2036                 gr_gpcs_gpccs_falcon_hwcfg_r() -
2037                 gr_fecs_falcon_hwcfg_r());
2038 }
2039
2040 static int gr_gk20a_load_ctxsw_ucode(struct gk20a *g, struct gr_gk20a *gr)
2041 {
2042         u32 ret;
2043
2044         nvhost_dbg_fn("");
2045
2046         if (tegra_platform_is_linsim()) {
2047                 gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(7),
2048                         gr_fecs_ctxsw_mailbox_value_f(0xc0de7777));
2049                 gk20a_writel(g, gr_gpccs_ctxsw_mailbox_r(7),
2050                         gr_gpccs_ctxsw_mailbox_value_f(0xc0de7777));
2051         }
2052
2053         /*
2054          * In case the gPMU falcon is not being used, revert to the old way of
2055          * loading gr ucode, without the faster bootstrap routine.
2056          */
2057         if (!support_gk20a_pmu()) {
2058                 gr_gk20a_load_falcon_dmem(g);
2059                 gr_gk20a_load_falcon_imem(g);
2060                 gr_gk20a_start_falcon_ucode(g);
2061         } else {
2062                 if (!gr->skip_ucode_init)
2063                         gr_gk20a_init_ctxsw_ucode(g);
2064                 gr_gk20a_load_falcon_with_bootloader(g);
2065                 gr->skip_ucode_init = true;
2066         }
2067
2068         ret = gr_gk20a_ctx_wait_ucode(g, 0, 0,
2069                                       GR_IS_UCODE_OP_EQUAL,
2070                                       eUcodeHandshakeInitComplete,
2071                                       GR_IS_UCODE_OP_SKIP, 0);
2072         if (ret) {
2073                 nvhost_err(dev_from_gk20a(g), "falcon ucode init timeout");
2074                 return ret;
2075         }
2076
2077         if (support_gk20a_pmu())
2078                 gk20a_writel(g, gr_fecs_current_ctx_r(),
2079                         gr_fecs_current_ctx_valid_false_f());
2080
2081         gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0xffffffff);
2082         gk20a_writel(g, gr_fecs_method_data_r(), 0x7fffffff);
2083         gk20a_writel(g, gr_fecs_method_push_r(),
2084                      gr_fecs_method_push_adr_set_watchdog_timeout_f());
2085
2086         nvhost_dbg_fn("done");
2087         return 0;
2088 }
2089
2090 static int gr_gk20a_init_ctx_state(struct gk20a *g, struct gr_gk20a *gr)
2091 {
2092         u32 golden_ctx_image_size = 0;
2093         u32 zcull_ctx_image_size = 0;
2094         u32 pm_ctx_image_size = 0;
2095         u32 ret;
2096         struct fecs_method_op_gk20a op = {
2097                 .mailbox = { .id = 0, .data = 0,
2098                              .clr = ~0, .ok = 0, .fail = 0},
2099                 .method.data = 0,
2100                 .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL,
2101                 .cond.fail = GR_IS_UCODE_OP_SKIP,
2102                 };
2103
2104         nvhost_dbg_fn("");
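             /*
              * The op template above is reused for all three queries; only
              * method.addr and mailbox.ret change.  With mailbox.ok == 0 and
              * cond.ok == GR_IS_UCODE_OP_NOT_EQUAL, each submit appears to
              * poll until the falcon reports a non-zero size, which is then
              * returned through mailbox.ret.
              */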
2105         op.method.addr = gr_fecs_method_push_adr_discover_image_size_v();
2106         op.mailbox.ret = &golden_ctx_image_size;
2107         ret = gr_gk20a_submit_fecs_method_op(g, op);
2108         if (ret) {
2109                 nvhost_err(dev_from_gk20a(g),
2110                            "query golden image size failed");
2111                 return ret;
2112         }
2113         op.method.addr = gr_fecs_method_push_adr_discover_zcull_image_size_v();
2114         op.mailbox.ret = &zcull_ctx_image_size;
2115         ret = gr_gk20a_submit_fecs_method_op(g, op);
2116         if (ret) {
2117                 nvhost_err(dev_from_gk20a(g),
2118                            "query zcull ctx image size failed");
2119                 return ret;
2120         }
2121         op.method.addr = gr_fecs_method_push_adr_discover_pm_image_size_v();
2122         op.mailbox.ret = &pm_ctx_image_size;
2123         ret = gr_gk20a_submit_fecs_method_op(g, op);
2124         if (ret) {
2125                 nvhost_err(dev_from_gk20a(g),
2126                            "query pm ctx image size failed");
2127                 return ret;
2128         }
2129
2130         if (!g->gr.ctx_vars.golden_image_size &&
2131             !g->gr.ctx_vars.zcull_ctxsw_image_size) {
2132                 g->gr.ctx_vars.golden_image_size = golden_ctx_image_size;
2133                 g->gr.ctx_vars.zcull_ctxsw_image_size = zcull_ctx_image_size;
2134         } else {
2135                 /* hw is different after railgating? */
2136                 BUG_ON(g->gr.ctx_vars.golden_image_size != golden_ctx_image_size);
2137                 BUG_ON(g->gr.ctx_vars.zcull_ctxsw_image_size != zcull_ctx_image_size);
2138         }
2139
2140         nvhost_dbg_fn("done");
2141         return 0;
2142 }
2143
2144 static int gr_gk20a_alloc_global_ctx_buffers(struct gk20a *g)
2145 {
2146         struct gr_gk20a *gr = &g->gr;
2147         struct mem_mgr *memmgr = mem_mgr_from_g(g);
2148         struct mem_handle *mem;
2149         u32 i, attr_buffer_size;
2150
2151         u32 cb_buffer_size = gr_scc_bundle_cb_size_div_256b__prod_v() *
2152                 gr_scc_bundle_cb_size_div_256b_byte_granularity_v();
2153
2154         u32 pagepool_buffer_size = gr_scc_pagepool_total_pages_hwmax_value_v() *
2155                 gr_scc_pagepool_total_pages_byte_granularity_v();
2156
2157         u32 attr_cb_default_size = gr_gpc0_ppc0_cbm_cfg_size_default_v();
2158         u32 alpha_cb_default_size = gr_gpc0_ppc0_cbm_cfg2_size_default_v();
2159
2160         u32 attr_cb_size =
2161                 attr_cb_default_size + (attr_cb_default_size >> 1);
2162         u32 alpha_cb_size =
2163                 alpha_cb_default_size + (alpha_cb_default_size >> 1);
2164
2165         u32 num_tpcs_per_pes = proj_scal_litter_num_tpcs_per_pes_v();
2166         u32 attr_max_size_per_tpc =
2167                 gr_gpc0_ppc0_cbm_cfg_size_v(~0) / num_tpcs_per_pes;
2168         u32 alpha_max_size_per_tpc =
2169                 gr_gpc0_ppc0_cbm_cfg2_size_v(~0) / num_tpcs_per_pes;
2170
2171
2172         nvhost_dbg_fn("");
2173
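             /*
              * Both circular buffers are provisioned at 1.5x their default
              * size (default + default/2), then clamped below to a per-TPC
              * maximum: the cbm_cfg/cbm_cfg2 size field limit divided by the
              * number of TPCs per PES.
              */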
2174         attr_cb_size =
2175                 (attr_cb_size > attr_max_size_per_tpc) ?
2176                         attr_max_size_per_tpc : attr_cb_size;
2177         attr_cb_default_size =
2178                 (attr_cb_default_size > attr_cb_size) ?
2179                         attr_cb_size : attr_cb_default_size;
2180         alpha_cb_size =
2181                 (alpha_cb_size > alpha_max_size_per_tpc) ?
2182                         alpha_max_size_per_tpc : alpha_cb_size;
2183         alpha_cb_default_size =
2184                 (alpha_cb_default_size > alpha_cb_size) ?
2185                         alpha_cb_size : alpha_cb_default_size;
2186
2187         attr_buffer_size =
2188                 (gr_gpc0_ppc0_cbm_cfg_size_granularity_v() * attr_cb_size +
2189                  gr_gpc0_ppc0_cbm_cfg2_size_granularity_v() * alpha_cb_size) *
2190                  gr->gpc_count;
2191
2192         nvhost_dbg_info("cb_buffer_size : %d", cb_buffer_size);
2193
2194         mem = nvhost_memmgr_alloc(memmgr, cb_buffer_size,
2195                                   DEFAULT_ALLOC_ALIGNMENT,
2196                                   DEFAULT_ALLOC_FLAGS,
2197                                   0);
2198         if (IS_ERR(mem))
2199                 goto clean_up;
2200
2201         gr->global_ctx_buffer[CIRCULAR].ref = mem;
2202         gr->global_ctx_buffer[CIRCULAR].size = cb_buffer_size;
2203
2204         mem = nvhost_memmgr_alloc(memmgr, cb_buffer_size,
2205                                   DEFAULT_ALLOC_ALIGNMENT,
2206                                   DEFAULT_ALLOC_FLAGS,
2207                                   NVMAP_HEAP_CARVEOUT_VPR);
2208         if (!IS_ERR(mem)) {
2209                 gr->global_ctx_buffer[CIRCULAR_VPR].ref = mem;
2210                 gr->global_ctx_buffer[CIRCULAR_VPR].size = cb_buffer_size;
2211         }
2212
2213         nvhost_dbg_info("pagepool_buffer_size : %d", pagepool_buffer_size);
2214
2215         mem = nvhost_memmgr_alloc(memmgr, pagepool_buffer_size,
2216                                   DEFAULT_ALLOC_ALIGNMENT,
2217                                   DEFAULT_ALLOC_FLAGS,
2218                                   0);
2219         if (IS_ERR(mem))
2220                 goto clean_up;
2221
2222         gr->global_ctx_buffer[PAGEPOOL].ref = mem;
2223         gr->global_ctx_buffer[PAGEPOOL].size = pagepool_buffer_size;
2224
2225         mem = nvhost_memmgr_alloc(memmgr, pagepool_buffer_size,
2226                                   DEFAULT_ALLOC_ALIGNMENT,
2227                                   DEFAULT_ALLOC_FLAGS,
2228                                   NVMAP_HEAP_CARVEOUT_VPR);
2229         if (!IS_ERR(mem)) {
2230                 gr->global_ctx_buffer[PAGEPOOL_VPR].ref = mem;
2231                 gr->global_ctx_buffer[PAGEPOOL_VPR].size = pagepool_buffer_size;
2232         }
2233
2234         nvhost_dbg_info("attr_buffer_size : %d", attr_buffer_size);
2235
2236         mem = nvhost_memmgr_alloc(memmgr, attr_buffer_size,
2237                                   DEFAULT_ALLOC_ALIGNMENT,
2238                                   DEFAULT_ALLOC_FLAGS,
2239                                   0);
2240         if (IS_ERR(mem))
2241                 goto clean_up;
2242
2243         gr->global_ctx_buffer[ATTRIBUTE].ref = mem;
2244         gr->global_ctx_buffer[ATTRIBUTE].size = attr_buffer_size;
2245
2246         mem = nvhost_memmgr_alloc(memmgr, attr_buffer_size,
2247                                   DEFAULT_ALLOC_ALIGNMENT,
2248                                   DEFAULT_ALLOC_FLAGS,
2249                                   NVMAP_HEAP_CARVEOUT_VPR);
2250         if (!IS_ERR(mem)) {
2251                 gr->global_ctx_buffer[ATTRIBUTE_VPR].ref = mem;
2252                 gr->global_ctx_buffer[ATTRIBUTE_VPR].size = attr_buffer_size;
2253         }
2254
2255         nvhost_dbg_info("golden_image_size : %d",
2256                    gr->ctx_vars.golden_image_size);
2257
2258         mem = nvhost_memmgr_alloc(memmgr, gr->ctx_vars.golden_image_size,
2259                                   DEFAULT_ALLOC_ALIGNMENT,
2260                                   DEFAULT_ALLOC_FLAGS,
2261                                   0);
2262         if (IS_ERR(mem))
2263                 goto clean_up;
2264
2265         gr->global_ctx_buffer[GOLDEN_CTX].ref = mem;
2266         gr->global_ctx_buffer[GOLDEN_CTX].size =
2267                 gr->ctx_vars.golden_image_size;
2268
2269         nvhost_dbg_fn("done");
2270         return 0;
2271
2272  clean_up:
2273         nvhost_err(dev_from_gk20a(g), "fail");
2274         for (i = 0; i < NR_GLOBAL_CTX_BUF; i++) {
2275                 if (gr->global_ctx_buffer[i].ref) {
2276                         nvhost_memmgr_put(memmgr,
2277                                           gr->global_ctx_buffer[i].ref);
2278                         memset(&gr->global_ctx_buffer[i],
2279                                 0, sizeof(struct mem_desc));
2280                 }
2281         }
2282         return -ENOMEM;
2283 }
2284
2285 static void gr_gk20a_free_global_ctx_buffers(struct gk20a *g)
2286 {
2287         struct gr_gk20a *gr = &g->gr;
2288         struct mem_mgr *memmgr = mem_mgr_from_g(g);
2289         u32 i;
2290
2291         for (i = 0; i < NR_GLOBAL_CTX_BUF; i++) {
2292                 nvhost_memmgr_put(memmgr, gr->global_ctx_buffer[i].ref);
2293                 memset(&gr->global_ctx_buffer[i], 0, sizeof(struct mem_desc));
2294         }
2295
2296         nvhost_dbg_fn("done");
2297 }
2298
2299 static int gr_gk20a_map_global_ctx_buffers(struct gk20a *g,
2300                                         struct channel_gk20a *c)
2301 {
2302         struct vm_gk20a *ch_vm = c->vm;
2303         struct mem_mgr *memmgr = mem_mgr_from_g(g);
2304         struct mem_handle *handle_ref;
2305         u64 *g_bfr_va = c->ch_ctx.global_ctx_buffer_va;
2306         struct gr_gk20a *gr = &g->gr;
2307         u64 gpu_va;
2308         u32 i;
2309         nvhost_dbg_fn("");
2310
2311         /* Circular Buffer */
2312         if (!c->vpr || (gr->global_ctx_buffer[CIRCULAR_VPR].ref == NULL))
2313                 handle_ref = gr->global_ctx_buffer[CIRCULAR].ref;
2314         else
2315                 handle_ref = gr->global_ctx_buffer[CIRCULAR_VPR].ref;
2316
2317         gpu_va = gk20a_vm_map(ch_vm, memmgr, handle_ref,
2318                               /*offset_align, flags, kind*/
2319                               0, NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, 0,
2320                               NULL, false, mem_flag_none);
2321         if (!gpu_va)
2322                 goto clean_up;
2323         g_bfr_va[CIRCULAR_VA] = gpu_va;
2324
2325         /* Attribute Buffer */
2326         if (!c->vpr || (gr->global_ctx_buffer[ATTRIBUTE_VPR].ref == NULL))
2327                 handle_ref = gr->global_ctx_buffer[ATTRIBUTE].ref;
2328         else
2329                 handle_ref = gr->global_ctx_buffer[ATTRIBUTE_VPR].ref;
2330
2331         gpu_va = gk20a_vm_map(ch_vm, memmgr, handle_ref,
2332                               /*offset_align, flags, kind*/
2333                               0, NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, 0,
2334                               NULL, false, mem_flag_none);
2335         if (!gpu_va)
2336                 goto clean_up;
2337         g_bfr_va[ATTRIBUTE_VA] = gpu_va;
2338
2339         /* Page Pool */
2340         if (!c->vpr || (gr->global_ctx_buffer[PAGEPOOL_VPR].ref == NULL))
2341                 handle_ref = gr->global_ctx_buffer[PAGEPOOL].ref;
2342         else
2343                 handle_ref = gr->global_ctx_buffer[PAGEPOOL_VPR].ref;
2344
2345         gpu_va = gk20a_vm_map(ch_vm, memmgr, handle_ref,
2346                               /*offset_align, flags, kind*/
2347                               0, NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, 0,
2348                               NULL, false, mem_flag_none);
2349         if (!gpu_va)
2350                 goto clean_up;
2351         g_bfr_va[PAGEPOOL_VA] = gpu_va;
2352
2353         /* Golden Image */
2354         gpu_va = gk20a_vm_map(ch_vm, memmgr,
2355                               gr->global_ctx_buffer[GOLDEN_CTX].ref,
2356                               /*offset_align, flags, kind*/
2357                               0, 0, 0, NULL, false, mem_flag_none);
2358         if (!gpu_va)
2359                 goto clean_up;
2360         g_bfr_va[GOLDEN_CTX_VA] = gpu_va;
2361
2362         c->ch_ctx.global_ctx_buffer_mapped = true;
2363         return 0;
2364
2365  clean_up:
2366         for (i = 0; i < NR_GLOBAL_CTX_BUF_VA; i++) {
2367                 if (g_bfr_va[i]) {
2368                         gk20a_vm_unmap(ch_vm, g_bfr_va[i]);
2369                         g_bfr_va[i] = 0;
2370                 }
2371         }
2372         return -ENOMEM;
2373 }
2374
2375 static void gr_gk20a_unmap_global_ctx_buffers(struct channel_gk20a *c)
2376 {
2377         struct vm_gk20a *ch_vm = c->vm;
2378         u64 *g_bfr_va = c->ch_ctx.global_ctx_buffer_va;
2379         u32 i;
2380
2381         nvhost_dbg_fn("");
2382
2383         for (i = 0; i < NR_GLOBAL_CTX_BUF_VA; i++) {
2384                 if (g_bfr_va[i]) {
2385                         gk20a_vm_unmap(ch_vm, g_bfr_va[i]);
2386                         g_bfr_va[i] = 0;
2387                 }
2388         }
2389         c->ch_ctx.global_ctx_buffer_mapped = false;
2390 }
2391
2392 static int gr_gk20a_alloc_channel_gr_ctx(struct gk20a *g,
2393                                 struct channel_gk20a *c)
2394 {
2395         struct gr_gk20a *gr = &g->gr;
2396         struct gr_ctx_desc *gr_ctx = &c->ch_ctx.gr_ctx;
2397         struct mem_mgr *memmgr = gk20a_channel_mem_mgr(c);
2398         struct vm_gk20a *ch_vm = c->vm;
2399
2400         nvhost_dbg_fn("");
2401
2402         if (gr->ctx_vars.buffer_size == 0)
2403                 return 0;
2404
2405         /* alloc channel gr ctx buffer */
2406         gr->ctx_vars.buffer_size = gr->ctx_vars.golden_image_size;
2407         gr->ctx_vars.buffer_total_size = gr->ctx_vars.golden_image_size;
2408
2409         gr_ctx->mem.ref = nvhost_memmgr_alloc(memmgr,
2410                                               gr->ctx_vars.buffer_total_size,
2411                                               DEFAULT_ALLOC_ALIGNMENT,
2412                                               DEFAULT_ALLOC_FLAGS,
2413                                               0);
2414
2415         if (IS_ERR(gr_ctx->mem.ref))
2416                 return -ENOMEM;
2417
2418         gr_ctx->gpu_va = gk20a_vm_map(ch_vm, memmgr,
2419                 gr_ctx->mem.ref,
2420                 /*offset_align, flags, kind*/
2421                 0, NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, 0, NULL, false,
2422                 mem_flag_none);
2423         if (!gr_ctx->gpu_va) {
2424                 nvhost_memmgr_put(memmgr, gr_ctx->mem.ref);
2425                 return -ENOMEM;
2426         }
2427
2428         return 0;
2429 }
2430
2431 static void gr_gk20a_free_channel_gr_ctx(struct channel_gk20a *c)
2432 {
2433         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
2434         struct mem_mgr *ch_nvmap = gk20a_channel_mem_mgr(c);
2435         struct vm_gk20a *ch_vm = c->vm;
2436
2437         nvhost_dbg_fn("");
2438
2439         gk20a_vm_unmap(ch_vm, ch_ctx->gr_ctx.gpu_va);
2440         nvhost_memmgr_put(ch_nvmap, ch_ctx->gr_ctx.mem.ref);
2441 }
2442
2443 static int gr_gk20a_alloc_channel_patch_ctx(struct gk20a *g,
2444                                 struct channel_gk20a *c)
2445 {
2446         struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
2447         struct mem_mgr *memmgr = gk20a_channel_mem_mgr(c);
2448         struct vm_gk20a *ch_vm = c->vm;
2449
2450         nvhost_dbg_fn("");
2451
2452         patch_ctx->mem.ref = nvhost_memmgr_alloc(memmgr, 128 * sizeof(u32),
2453                                                  DEFAULT_ALLOC_ALIGNMENT,
2454                                                  DEFAULT_ALLOC_FLAGS,
2455                                                  0);
2456         if (IS_ERR(patch_ctx->mem.ref))
2457                 return -ENOMEM;
2458
2459         patch_ctx->gpu_va = gk20a_vm_map(ch_vm, memmgr,
2460                                          patch_ctx->mem.ref,
2461                                          /*offset_align, flags, kind*/
2462                                          0, 0, 0, NULL, false, mem_flag_none);
2463         if (!patch_ctx->gpu_va)
2464                 goto clean_up;
2465
2466         nvhost_dbg_fn("done");
2467         return 0;
2468
2469  clean_up:
2470         nvhost_err(dev_from_gk20a(g), "fail");
2471         if (patch_ctx->mem.ref) {
2472                 nvhost_memmgr_put(memmgr, patch_ctx->mem.ref);
2473                 patch_ctx->mem.ref = 0;
2474         }
2475
2476         return -ENOMEM;
2477 }
2478
2479 static void gr_gk20a_unmap_channel_patch_ctx(struct channel_gk20a *c)
2480 {
2481         struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
2482         struct vm_gk20a *ch_vm = c->vm;
2483
2484         nvhost_dbg_fn("");
2485
2486         if (patch_ctx->gpu_va)
2487                 gk20a_vm_unmap(ch_vm, patch_ctx->gpu_va);
2488         patch_ctx->gpu_va = 0;
2489         patch_ctx->data_count = 0;
2490 }
2491
2492 static void gr_gk20a_free_channel_patch_ctx(struct channel_gk20a *c)
2493 {
2494         struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
2495         struct mem_mgr *memmgr = gk20a_channel_mem_mgr(c);
2496
2497         nvhost_dbg_fn("");
2498
2499         gr_gk20a_unmap_channel_patch_ctx(c);
2500
2501         if (patch_ctx->mem.ref) {
2502                 nvhost_memmgr_put(memmgr, patch_ctx->mem.ref);
2503                 patch_ctx->mem.ref = 0;
2504         }
2505 }
2506
2507 void gk20a_free_channel_ctx(struct channel_gk20a *c)
2508 {
2509         gr_gk20a_unmap_global_ctx_buffers(c);
2510         gr_gk20a_free_channel_patch_ctx(c);
2511         gr_gk20a_free_channel_gr_ctx(c);
2512
2513         /* zcull_ctx, pm_ctx */
2514
2515         memset(&c->ch_ctx, 0, sizeof(struct channel_ctx_gk20a));
2516
2517         c->num_objects = 0;
2518         c->first_init = false;
2519 }
2520
2521 int gk20a_alloc_obj_ctx(struct channel_gk20a  *c,
2522                         struct nvhost_alloc_obj_ctx_args *args)
2523 {
2524         struct gk20a *g = c->g;
2525         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
2526         bool change_to_compute_mode = false;
2527         int err = 0;
2528
2529         nvhost_dbg_fn("");
2530
2531         /* an address space needs to have been bound at this point.*/
2532         if (!gk20a_channel_as_bound(c)) {
2533                 nvhost_err(dev_from_gk20a(g),
2534                            "not bound to address space at time"
2535                            " of grctx allocation");
2536                 return -EINVAL;
2537         }
2538
2539         switch (args->class_num) {
2540         case KEPLER_COMPUTE_A:
2541                 /* tbd: NV2080_CTRL_GPU_COMPUTE_MODE_RULES_EXCLUSIVE_COMPUTE */
2542                 /* tbd: PDB_PROP_GRAPHICS_DISTINCT_3D_AND_COMPUTE_STATE_DEF  */
2543                 change_to_compute_mode = true;
2544                 break;
2545         case KEPLER_C:
2546         case FERMI_TWOD_A:
2547         case KEPLER_DMA_COPY_A:
2548                 break;
2549
2550         default:
2551                 nvhost_err(dev_from_gk20a(g),
2552                            "invalid obj class 0x%x", args->class_num);
2553                 err = -EINVAL;
2554                 goto out;
2555         }
2556
2557         /* allocate gr ctx buffer */
2558         if (ch_ctx->gr_ctx.mem.ref == NULL) {
2559                 err = gr_gk20a_alloc_channel_gr_ctx(g, c);
2560                 if (err) {
2561                         nvhost_err(dev_from_gk20a(g),
2562                                 "fail to allocate gr ctx buffer");
2563                         goto out;
2564                 }
2565         } else {
2566                 /* TBD: needs to be more careful about which class is being
2567                  * allocated, as some are allowed to be allocated on the same channel */
2568                 nvhost_err(dev_from_gk20a(g),
2569                         "too many classes alloc'd on same channel");
2570                 err = -EINVAL;
2571                 goto out;
2572         }
2573
2574         /* commit gr ctx buffer */
2575         err = gr_gk20a_commit_inst(c, ch_ctx->gr_ctx.gpu_va);
2576         if (err) {
2577                 nvhost_err(dev_from_gk20a(g),
2578                         "fail to commit gr ctx buffer");
2579                 goto out;
2580         }
2581
2582         /* allocate patch buffer */
2583         if (ch_ctx->patch_ctx.mem.ref == NULL) {
2584                 err = gr_gk20a_alloc_channel_patch_ctx(g, c);
2585                 if (err) {
2586                         nvhost_err(dev_from_gk20a(g),
2587                                 "fail to allocate patch buffer");
2588                         goto out;
2589                 }
2590         }
2591
2592         /* map global buffer to channel gpu_va and commit */
2593         if (!ch_ctx->global_ctx_buffer_mapped) {
2594                 err = gr_gk20a_map_global_ctx_buffers(g, c);
2595                 if (err) {
2596                         nvhost_err(dev_from_gk20a(g),
2597                                 "failed to map global ctx buffer");
2598                         goto out;
2599                 }
2600                 gr_gk20a_elpg_protected_call(g,
2601                         gr_gk20a_commit_global_ctx_buffers(g, c, true));
2602         }
2603
2604         /* init golden image, ELPG enabled after this is done */
2605         err = gr_gk20a_init_golden_ctx_image(g, c);
2606         if (err) {
2607                 nvhost_err(dev_from_gk20a(g),
2608                         "failed to init golden ctx image");
2609                 goto out;
2610         }
2611
2612         /* load golden image */
2613         if (!c->first_init) {
2614                 err = gr_gk20a_elpg_protected_call(g,
2615                         gr_gk20a_load_golden_ctx_image(g, c));
2616                 if (err) {
2617                         nvhost_err(dev_from_gk20a(g),
2618                                 "failed to load golden ctx image");
2619                         goto out;
2620                 }
2621                 c->first_init = true;
2622         }
2623         gk20a_mm_l2_invalidate(g);
2624         c->num_objects++;
2625
2626         nvhost_dbg_fn("done");
2627         return 0;
2628 out:
2629         /* 1. The gr_ctx, patch_ctx and global ctx buffer mappings can be
2630            reused, so there is no need to release them here.
2631            2. Golden image init and load are one-time operations; if they
2632            succeeded there is nothing to undo. */
2633         nvhost_err(dev_from_gk20a(g), "fail");
2634         return err;
2635 }
2636
2637 int gk20a_free_obj_ctx(struct channel_gk20a  *c,
2638                        struct nvhost_free_obj_ctx_args *args)
2639 {
2640         unsigned long timeout = gk20a_get_gr_idle_timeout(c->g);
2641
2642         nvhost_dbg_fn("");
2643
2644         if (c->num_objects == 0)
2645                 return 0;
2646
2647         c->num_objects--;
2648
2649         if (c->num_objects == 0) {
2650                 c->first_init = false;
2651                 gk20a_disable_channel(c, true, /*wait for finish*/
2652                                       timeout);
2653                 gr_gk20a_unmap_channel_patch_ctx(c);
2654         }
2655
2656         return 0;
2657 }
2658
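/*
 * Teardown of the gr unit's software state: releases the global context
 * buffers, the MMU wr/rd buffers, the compbit backing store, the per-GPC
 * topology arrays, the ctxsw ucode and register lists, the local golden
 * image copy, and the comptag allocator.
 */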
2659 static void gk20a_remove_gr_support(struct gr_gk20a *gr)
2660 {
2661         struct gk20a *g = gr->g;
2662         struct mem_mgr *memmgr = mem_mgr_from_g(g);
2663
2664         nvhost_dbg_fn("");
2665
2666         gr_gk20a_free_global_ctx_buffers(g);
2667
2668         nvhost_memmgr_free_sg_table(memmgr, gr->mmu_wr_mem.mem.ref,
2669                         gr->mmu_wr_mem.mem.sgt);
2670         nvhost_memmgr_unpin(memmgr, gr->mmu_rd_mem.mem.ref,
2671                         dev_from_gk20a(g), gr->mmu_rd_mem.mem.sgt);
2672         nvhost_memmgr_put(memmgr, gr->mmu_wr_mem.mem.ref);
2673         nvhost_memmgr_put(memmgr, gr->mmu_rd_mem.mem.ref);
2674         nvhost_memmgr_put(memmgr, gr->compbit_store.mem.ref);
2675         memset(&gr->mmu_wr_mem, 0, sizeof(struct mem_desc));
2676         memset(&gr->mmu_rd_mem, 0, sizeof(struct mem_desc));
2677         memset(&gr->compbit_store, 0, sizeof(struct compbit_store_desc));
2678
2679         kfree(gr->gpc_tpc_count);
2680         kfree(gr->gpc_zcb_count);
2681         kfree(gr->gpc_ppc_count);
2682         kfree(gr->pes_tpc_count[0]);
2683         kfree(gr->pes_tpc_count[1]);
2684         kfree(gr->pes_tpc_mask[0]);
2685         kfree(gr->pes_tpc_mask[1]);
2686         kfree(gr->gpc_skip_mask);
2687         kfree(gr->map_tiles);
2688         gr->gpc_tpc_count = NULL;
2689         gr->gpc_zcb_count = NULL;
2690         gr->gpc_ppc_count = NULL;
2691         gr->pes_tpc_count[0] = NULL;
2692         gr->pes_tpc_count[1] = NULL;
2693         gr->pes_tpc_mask[0] = NULL;
2694         gr->pes_tpc_mask[1] = NULL;
2695         gr->gpc_skip_mask = NULL;
2696         gr->map_tiles = NULL;
2697
2698         kfree(gr->ctx_vars.ucode.fecs.inst.l);
2699         kfree(gr->ctx_vars.ucode.fecs.data.l);
2700         kfree(gr->ctx_vars.ucode.gpccs.inst.l);
2701         kfree(gr->ctx_vars.ucode.gpccs.data.l);
2702         kfree(gr->ctx_vars.sw_bundle_init.l);
2703         kfree(gr->ctx_vars.sw_method_init.l);
2704         kfree(gr->ctx_vars.sw_ctx_load.l);
2705         kfree(gr->ctx_vars.sw_non_ctx_load.l);
2706         kfree(gr->ctx_vars.ctxsw_regs.sys.l);
2707         kfree(gr->ctx_vars.ctxsw_regs.gpc.l);
2708         kfree(gr->ctx_vars.ctxsw_regs.tpc.l);
2709         kfree(gr->ctx_vars.ctxsw_regs.zcull_gpc.l);
2710         kfree(gr->ctx_vars.ctxsw_regs.ppc.l);
2711         kfree(gr->ctx_vars.ctxsw_regs.pm_sys.l);
2712         kfree(gr->ctx_vars.ctxsw_regs.pm_gpc.l);
2713         kfree(gr->ctx_vars.ctxsw_regs.pm_tpc.l);
2714
2715         kfree(gr->ctx_vars.local_golden_image);
2716         gr->ctx_vars.local_golden_image = NULL;
2717
2718         nvhost_allocator_destroy(&gr->comp_tags);
2719 }
2720
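/*
 * Reads the topology/floorsweeping registers (ringmaster enums, top_*,
 * gr_gpc0_fs_gpc, per-PES TPC id masks) to fill in the per-GPC TPC, zcull
 * bank and PPC counts plus the PD skip masks, then seeds the bundle,
 * attribute and alpha circular-buffer defaults.
 */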
2721 static int gr_gk20a_init_gr_config(struct gk20a *g, struct gr_gk20a *gr)
2722 {
2723         u32 gpc_index, pes_index;
2724         u32 pes_tpc_mask;
2725         u32 pes_tpc_count;
2726         u32 pes_heavy_index;
2727         u32 gpc_new_skip_mask;
2728         u32 tmp;
2729
2730         tmp = gk20a_readl(g, pri_ringmaster_enum_fbp_r());
2731         gr->num_fbps = pri_ringmaster_enum_fbp_count_v(tmp);
2732
2733         tmp = gk20a_readl(g, top_num_gpcs_r());
2734         gr->max_gpc_count = top_num_gpcs_value_v(tmp);
2735
2736         tmp = gk20a_readl(g, top_num_fbps_r());
2737         gr->max_fbps_count = top_num_fbps_value_v(tmp);
2738
2739         tmp = gk20a_readl(g, top_tpc_per_gpc_r());
2740         gr->max_tpc_per_gpc_count = top_tpc_per_gpc_value_v(tmp);
2741
2742         gr->max_tpc_count = gr->max_gpc_count * gr->max_tpc_per_gpc_count;
2743
2744         tmp = gk20a_readl(g, top_num_fbps_r());
2745         gr->sys_count = top_num_fbps_value_v(tmp);
2746
2747         tmp = gk20a_readl(g, pri_ringmaster_enum_gpc_r());
2748         gr->gpc_count = pri_ringmaster_enum_gpc_count_v(tmp);
2749
2750         gr->pe_count_per_gpc = proj_scal_litter_num_pes_per_gpc_v();
2751         gr->max_zcull_per_gpc_count = proj_scal_litter_num_zcull_banks_v();
2752
2753         if (!gr->gpc_count) {
2754                 nvhost_err(dev_from_gk20a(g), "gpc_count==0!");
2755                 goto clean_up;
2756         }
2757
2758         gr->gpc_tpc_count = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2759         gr->gpc_zcb_count = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2760         gr->gpc_ppc_count = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2761         gr->pes_tpc_count[0] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2762         gr->pes_tpc_count[1] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2763         gr->pes_tpc_mask[0] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2764         gr->pes_tpc_mask[1] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2765         gr->gpc_skip_mask =
2766                 kzalloc(gr_pd_dist_skip_table__size_1_v() * 4 * sizeof(u32),
2767                         GFP_KERNEL);
2768
2769         if (!gr->gpc_tpc_count || !gr->gpc_zcb_count || !gr->gpc_ppc_count ||
2770             !gr->pes_tpc_count[0] || !gr->pes_tpc_count[1] ||
2771             !gr->pes_tpc_mask[0] || !gr->pes_tpc_mask[1] || !gr->gpc_skip_mask)
2772                 goto clean_up;
2773
2774         gr->ppc_count = 0;
2775         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
2776                 tmp = gk20a_readl(g, gr_gpc0_fs_gpc_r());
2777
2778                 gr->gpc_tpc_count[gpc_index] =
2779                         gr_gpc0_fs_gpc_num_available_tpcs_v(tmp);
2780                 gr->tpc_count += gr->gpc_tpc_count[gpc_index];
2781
2782                 gr->gpc_zcb_count[gpc_index] =
2783                         gr_gpc0_fs_gpc_num_available_zculls_v(tmp);
2784                 gr->zcb_count += gr->gpc_zcb_count[gpc_index];
2785
2786                 gr->gpc_ppc_count[gpc_index] = gr->pe_count_per_gpc;
2787                 gr->ppc_count += gr->gpc_ppc_count[gpc_index];
2788                 for (pes_index = 0; pes_index < gr->pe_count_per_gpc; pes_index++) {
2789
2790                         tmp = gk20a_readl(g,
2791                                 gr_gpc0_gpm_pd_pes_tpc_id_mask_r(pes_index) +
2792                                 gpc_index * proj_gpc_stride_v());
2793
2794                         pes_tpc_mask = gr_gpc0_gpm_pd_pes_tpc_id_mask_mask_v(tmp);
2795                         pes_tpc_count = count_bits(pes_tpc_mask);
2796
2797                         gr->pes_tpc_count[pes_index][gpc_index] = pes_tpc_count;
2798                         gr->pes_tpc_mask[pes_index][gpc_index] = pes_tpc_mask;
2799                 }
2800
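                /*
                 * When the TPC split across the two PESs is unbalanced
                 * (5 TPCs total, or 4 TPCs split 3/1), isolate the lowest
                 * set bit of the heavier PES's TPC mask via
                 * m ^ (m & (m - 1)) and record it in gpc_skip_mask,
                 * presumably so the PD distribution skips one TPC of the
                 * heavier PES.
                 */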
2801                 gpc_new_skip_mask = 0;
2802                 if (gr->pes_tpc_count[0][gpc_index] +
2803                     gr->pes_tpc_count[1][gpc_index] == 5) {
2804                         pes_heavy_index =
2805                                 gr->pes_tpc_count[0][gpc_index] >
2806                                 gr->pes_tpc_count[1][gpc_index] ? 0 : 1;
2807
2808                         gpc_new_skip_mask =
2809                                 gr->pes_tpc_mask[pes_heavy_index][gpc_index] ^
2810                                    (gr->pes_tpc_mask[pes_heavy_index][gpc_index] &
2811                                    (gr->pes_tpc_mask[pes_heavy_index][gpc_index] - 1));
2812
2813                 } else if ((gr->pes_tpc_count[0][gpc_index] +
2814                             gr->pes_tpc_count[1][gpc_index] == 4) &&
2815                            (gr->pes_tpc_count[0][gpc_index] !=
2816                             gr->pes_tpc_count[1][gpc_index])) {
2817                         pes_heavy_index =
2818                                 gr->pes_tpc_count[0][gpc_index] >
2819                                 gr->pes_tpc_count[1][gpc_index] ? 0 : 1;
2820
2821                         gpc_new_skip_mask =
2822                                 gr->pes_tpc_mask[pes_heavy_index][gpc_index] ^
2823                                    (gr->pes_tpc_mask[pes_heavy_index][gpc_index] &
2824                                    (gr->pes_tpc_mask[pes_heavy_index][gpc_index] - 1));
2825                 }
2826                 gr->gpc_skip_mask[gpc_index] = gpc_new_skip_mask;
2827         }
2828
2829         nvhost_dbg_info("fbps: %d", gr->num_fbps);
2830         nvhost_dbg_info("max_gpc_count: %d", gr->max_gpc_count);
2831         nvhost_dbg_info("max_fbps_count: %d", gr->max_fbps_count);
2832         nvhost_dbg_info("max_tpc_per_gpc_count: %d", gr->max_tpc_per_gpc_count);
2833         nvhost_dbg_info("max_zcull_per_gpc_count: %d", gr->max_zcull_per_gpc_count);
2834         nvhost_dbg_info("max_tpc_count: %d", gr->max_tpc_count);
2835         nvhost_dbg_info("sys_count: %d", gr->sys_count);
2836         nvhost_dbg_info("gpc_count: %d", gr->gpc_count);
2837         nvhost_dbg_info("pe_count_per_gpc: %d", gr->pe_count_per_gpc);
2838         nvhost_dbg_info("tpc_count: %d", gr->tpc_count);
2839         nvhost_dbg_info("ppc_count: %d", gr->ppc_count);
2840
2841         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
2842                 nvhost_dbg_info("gpc_tpc_count[%d] : %d",
2843                            gpc_index, gr->gpc_tpc_count[gpc_index]);
2844         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
2845                 nvhost_dbg_info("gpc_zcb_count[%d] : %d",
2846                            gpc_index, gr->gpc_zcb_count[gpc_index]);
2847         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
2848                 nvhost_dbg_info("gpc_ppc_count[%d] : %d",
2849                            gpc_index, gr->gpc_ppc_count[gpc_index]);
2850         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
2851                 nvhost_dbg_info("gpc_skip_mask[%d] : %d",
2852                            gpc_index, gr->gpc_skip_mask[gpc_index]);
2853         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
2854                 for (pes_index = 0;
2855                      pes_index < gr->pe_count_per_gpc;
2856                      pes_index++)
2857                         nvhost_dbg_info("pes_tpc_count[%d][%d] : %d",
2858                                    pes_index, gpc_index,
2859                                    gr->pes_tpc_count[pes_index][gpc_index]);
2860
2861         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
2862                 for (pes_index = 0;
2863                      pes_index < gr->pe_count_per_gpc;
2864                      pes_index++)
2865                         nvhost_dbg_info("pes_tpc_mask[%d][%d] : %d",
2866                                    pes_index, gpc_index,
2867                                    gr->pes_tpc_mask[pes_index][gpc_index]);
2868
2869         gr->bundle_cb_default_size = gr_scc_bundle_cb_size_div_256b__prod_v();
2870         gr->min_gpm_fifo_depth = gr_pd_ab_dist_cfg2_state_limit_min_gpm_fifo_depths_v();
2871         gr->bundle_cb_token_limit = gr_pd_ab_dist_cfg2_token_limit_init_v();
2872         gr->attrib_cb_default_size = gr_gpc0_ppc0_cbm_cfg_size_default_v();
2873         /* gk20a has a fixed beta CB RAM, don't alloc more */
2874         gr->attrib_cb_size = gr->attrib_cb_default_size;
2875         gr->alpha_cb_default_size = gr_gpc0_ppc0_cbm_cfg2_size_default_v();
2876         gr->alpha_cb_size = gr->alpha_cb_default_size + (gr->alpha_cb_default_size >> 1);
2877         gr->timeslice_mode = gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v();
2878
2879         nvhost_dbg_info("bundle_cb_default_size: %d",
2880                    gr->bundle_cb_default_size);
2881         nvhost_dbg_info("min_gpm_fifo_depth: %d", gr->min_gpm_fifo_depth);
2882         nvhost_dbg_info("bundle_cb_token_limit: %d", gr->bundle_cb_token_limit);
2883         nvhost_dbg_info("attrib_cb_default_size: %d",
2884                    gr->attrib_cb_default_size);
2885         nvhost_dbg_info("attrib_cb_size: %d", gr->attrib_cb_size);
2886         nvhost_dbg_info("alpha_cb_default_size: %d", gr->alpha_cb_default_size);
2887         nvhost_dbg_info("alpha_cb_size: %d", gr->alpha_cb_size);
2888         nvhost_dbg_info("timeslice_mode: %d", gr->timeslice_mode);
2889
2890         return 0;
2891
2892 clean_up:
2893         return -ENOMEM;
2894 }
2895
2896 static int gr_gk20a_init_mmu_sw(struct gk20a *g, struct gr_gk20a *gr)
2897 {
2898         struct mem_mgr *memmgr = mem_mgr_from_g(g);
2899         void *mmu_ptr;
2900
2901         gr->mmu_wr_mem_size = gr->mmu_rd_mem_size = 0x1000;
2902
2903         gr->mmu_wr_mem.mem.ref = nvhost_memmgr_alloc(memmgr,
2904                                                      gr->mmu_wr_mem_size,
2905                                                      DEFAULT_ALLOC_ALIGNMENT,
2906                                                      DEFAULT_ALLOC_FLAGS,
2907                                                      0);
2908         if (IS_ERR(gr->mmu_wr_mem.mem.ref))
2909                 goto clean_up;
2910         gr->mmu_wr_mem.mem.size = gr->mmu_wr_mem_size;
2911
2912         gr->mmu_rd_mem.mem.ref = nvhost_memmgr_alloc(memmgr,
2913                                                      gr->mmu_rd_mem_size,
2914                                                      DEFAULT_ALLOC_ALIGNMENT,
2915                                                      DEFAULT_ALLOC_FLAGS,
2916                                                      0);
2917         if (IS_ERR(gr->mmu_rd_mem.mem.ref))
2918                 goto clean_up;
2919         gr->mmu_rd_mem.mem.size = gr->mmu_rd_mem_size;
2920
2921         mmu_ptr = nvhost_memmgr_mmap(gr->mmu_wr_mem.mem.ref);
2922         if (!mmu_ptr)
2923                 goto clean_up;
2924         memset(mmu_ptr, 0, gr->mmu_wr_mem.mem.size);
2925         nvhost_memmgr_munmap(gr->mmu_wr_mem.mem.ref, mmu_ptr);
2926
2927         mmu_ptr = nvhost_memmgr_mmap(gr->mmu_rd_mem.mem.ref);
2928         if (!mmu_ptr)
2929                 goto clean_up;
2930         memset(mmu_ptr, 0, gr->mmu_rd_mem.mem.size);
2931         nvhost_memmgr_munmap(gr->mmu_rd_mem.mem.ref, mmu_ptr);
2932
2933         gr->mmu_wr_mem.mem.sgt =
2934                 nvhost_memmgr_sg_table(memmgr, gr->mmu_wr_mem.mem.ref);
2935         if (IS_ERR(gr->mmu_wr_mem.mem.sgt))
2936                 goto clean_up;
2937
2938         gr->mmu_rd_mem.mem.sgt =
2939                 nvhost_memmgr_sg_table(memmgr, gr->mmu_rd_mem.mem.ref);
2940         if (IS_ERR(gr->mmu_rd_mem.mem.sgt))
2941                 goto clean_up;
2942         return 0;
2943
2944 clean_up:
2945         return -ENOMEM;
2946 }
2947
2948 static u32 prime_set[18] = {
2949         2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61 };
2950
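/*
 * Builds the screen-tile -> GPC map.  map_row_offset is the smallest prime
 * from prime_set[] that does not divide tpc_count (with fixed overrides
 * for a few specific counts).  map_tiles[] is then rebuilt, if needed, by
 * sorting GPCs by TPC count (descending) and handing out GPC indices with
 * a Bresenham-style error accumulator, so each GPC gets a share of tiles
 * proportional to its TPC count.
 */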
2951 static int gr_gk20a_init_map_tiles(struct gk20a *g, struct gr_gk20a *gr)
2952 {
2953         s32 comm_denom;
2954         s32 mul_factor;
2955         s32 *init_frac = NULL;
2956         s32 *init_err = NULL;
2957         s32 *run_err = NULL;
2958         s32 *sorted_num_tpcs = NULL;
2959         s32 *sorted_to_unsorted_gpc_map = NULL;
2960         u32 gpc_index;
2961         u32 gpc_mark = 0;
2962         u32 num_tpc;
2963         u32 max_tpc_count = 0;
2964         u32 swap;
2965         u32 tile_count;
2966         u32 index;
2967         bool delete_map = false;
2968         bool gpc_sorted;
2969         int ret = 0;
2970
2971         init_frac = kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
2972         init_err = kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
2973         run_err = kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
2974         sorted_num_tpcs =
2975                 kzalloc(proj_scal_max_gpcs_v() *
2976                         proj_scal_max_tpc_per_gpc_v() * sizeof(s32),
2977                         GFP_KERNEL);
2978         sorted_to_unsorted_gpc_map =
2979                 kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
2980
2981         if (!(init_frac && init_err && run_err && sorted_num_tpcs &&
2982               sorted_to_unsorted_gpc_map)) {
2983                 ret = -ENOMEM;
2984                 goto clean_up;
2985         }
2986
2987         gr->map_row_offset = INVALID_SCREEN_TILE_ROW_OFFSET;
2988
2989         if (gr->tpc_count == 3)
2990                 gr->map_row_offset = 2;
2991         else if (gr->tpc_count < 3)
2992                 gr->map_row_offset = 1;
2993         else {
2994                 gr->map_row_offset = 3;
2995
2996                 for (index = 1; index < 18; index++) {
2997                         u32 prime = prime_set[index];
2998                         if ((gr->tpc_count % prime) != 0) {
2999                                 gr->map_row_offset = prime;
3000                                 break;
3001                         }
3002                 }
3003         }
3004
3005         switch (gr->tpc_count) {
3006         case 15:
3007                 gr->map_row_offset = 6;
3008                 break;
3009         case 14:
3010                 gr->map_row_offset = 5;
3011                 break;
3012         case 13:
3013                 gr->map_row_offset = 2;
3014                 break;
3015         case 11:
3016                 gr->map_row_offset = 7;
3017                 break;
3018         case 10:
3019                 gr->map_row_offset = 6;
3020                 break;
3021         case 7:
3022         case 5:
3023                 gr->map_row_offset = 1;
3024                 break;
3025         default:
3026                 break;
3027         }
3028
3029         if (gr->map_tiles) {
3030                 if (gr->map_tile_count != gr->tpc_count)
3031                         delete_map = true;
3032
3033                 for (tile_count = 0; tile_count < gr->map_tile_count; tile_count++) {
3034                         if ((u32)gr->map_tiles[tile_count] >= gr->tpc_count)
3035                                 delete_map = true;
3036                 }
3037
3038                 if (delete_map) {
3039                         kfree(gr->map_tiles);
3040                         gr->map_tiles = NULL;
3041                         gr->map_tile_count = 0;
3042                 }
3043         }
3044
3045         if (gr->map_tiles == NULL) {
3046                 gr->map_tile_count = proj_scal_max_gpcs_v();
3047
3048                 gr->map_tiles = kzalloc(proj_scal_max_gpcs_v() * sizeof(u8), GFP_KERNEL);
3049                 if (gr->map_tiles == NULL) {
3050                         ret = -ENOMEM;
3051                         goto clean_up;
3052                 }
3053
3054                 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3055                         sorted_num_tpcs[gpc_index] = gr->gpc_tpc_count[gpc_index];
3056                         sorted_to_unsorted_gpc_map[gpc_index] = gpc_index;
3057                 }
3058
3059                 gpc_sorted = false;
3060                 while (!gpc_sorted) {
3061                         gpc_sorted = true;
3062                         for (gpc_index = 0; gpc_index < gr->gpc_count - 1; gpc_index++) {
3063                                 if (sorted_num_tpcs[gpc_index + 1] > sorted_num_tpcs[gpc_index]) {
3064                                         gpc_sorted = false;
3065                                         swap = sorted_num_tpcs[gpc_index];
3066                                         sorted_num_tpcs[gpc_index] = sorted_num_tpcs[gpc_index + 1];
3067                                         sorted_num_tpcs[gpc_index + 1] = swap;
3068                                         swap = sorted_to_unsorted_gpc_map[gpc_index];
3069                                         sorted_to_unsorted_gpc_map[gpc_index] =
3070                                                 sorted_to_unsorted_gpc_map[gpc_index + 1];
3071                                         sorted_to_unsorted_gpc_map[gpc_index + 1] = swap;
3072                                 }
3073                         }
3074                 }
3075
3076                 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3077                         if (gr->gpc_tpc_count[gpc_index] > max_tpc_count)
3078                                 max_tpc_count = gr->gpc_tpc_count[gpc_index];
3079
3080                 mul_factor = gr->gpc_count * max_tpc_count;
3081                 if (mul_factor & 0x1)
3082                         mul_factor = 2;
3083                 else
3084                         mul_factor = 1;
3085
3086                 comm_denom = gr->gpc_count * max_tpc_count * mul_factor;
3087
3088                 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3089                         num_tpc = sorted_num_tpcs[gpc_index];
3090
3091                         init_frac[gpc_index] = num_tpc * gr->gpc_count * mul_factor;
3092
3093                         if (num_tpc != 0)
3094                                 init_err[gpc_index] = gpc_index * max_tpc_count * mul_factor - comm_denom/2;
3095                         else
3096                                 init_err[gpc_index] = 0;
3097
3098                         run_err[gpc_index] = init_frac[gpc_index] + init_err[gpc_index];
3099                 }
3100
3101                 while (gpc_mark < gr->tpc_count) {
3102                         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3103                                 if ((run_err[gpc_index] * 2) >= comm_denom) {
3104                                         gr->map_tiles[gpc_mark++] = (u8)sorted_to_unsorted_gpc_map[gpc_index];
3105                                         run_err[gpc_index] += init_frac[gpc_index] - comm_denom;
3106                                 } else
3107                                         run_err[gpc_index] += init_frac[gpc_index];
3108                         }
3109                 }
3110         }
3111
3112 clean_up:
3113         kfree(init_frac);
3114         kfree(init_err);
3115         kfree(run_err);
3116         kfree(sorted_num_tpcs);
3117         kfree(sorted_to_unsorted_gpc_map);
3118
3119         if (ret)
3120                 nvhost_err(dev_from_gk20a(g), "fail");
3121         else
3122                 nvhost_dbg_fn("done");
3123
3124         return ret;
3125 }
3126
3127 static int gr_gk20a_init_comptag(struct gk20a *g, struct gr_gk20a *gr)
3128 {
3129         struct mem_mgr *memmgr = mem_mgr_from_g(g);
3130
3131         /* max memory size (MB) to cover */
3132         u32 max_size = gr->max_comptag_mem;
3133         /* one tag line covers 128KB */
3134         u32 max_comptag_lines = max_size << 3;
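        /*
         * One comptag line covers 128KB, so a size in MB converts to lines
         * as size * 1024 / 128 = size * 8, i.e. the << 3 above.  For an
         * illustrative max_comptag_mem of 512 (MB) this gives
         * 512 << 3 = 4096 comptag lines.
         */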
3135
3136         u32 hw_max_comptag_lines =
3137                 ltc_ltcs_ltss_cbc_ctrl3_clear_upper_bound_init_v();
3138
3139         u32 cbc_param =
3140                 gk20a_readl(g, ltc_ltcs_ltss_cbc_param_r());
3141         u32 comptags_per_cacheline =
3142                 ltc_ltcs_ltss_cbc_param_comptags_per_cache_line_v(cbc_param);
3143         u32 slices_per_fbp =
3144                 ltc_ltcs_ltss_cbc_param_slices_per_fbp_v(cbc_param);
3145         u32 cacheline_size =
3146                 512 << ltc_ltcs_ltss_cbc_param_cache_line_size_v(cbc_param);
3147
3148         u32 compbit_backing_size;
3149         int ret = 0;
3150
3151         nvhost_dbg_fn("");
3152
3153         if (max_comptag_lines == 0) {
3154                 gr->compbit_store.mem.size = 0;
3155                 return 0;
3156         }
3157
3158         if (max_comptag_lines > hw_max_comptag_lines)
3159                 max_comptag_lines = hw_max_comptag_lines;
3160
3161         /* no hybrid fb */
3162         compbit_backing_size =
3163                 DIV_ROUND_UP(max_comptag_lines, comptags_per_cacheline) *
3164                 cacheline_size * slices_per_fbp * gr->num_fbps;
3165
3166         /* aligned to 2KB * num_fbps */
3167         compbit_backing_size +=
3168                 gr->num_fbps << ltc_ltcs_ltss_cbc_base_alignment_shift_v();
3169
3170         /* must be a multiple of 64KB */
3171         compbit_backing_size = roundup(compbit_backing_size, 64*1024);
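        /*
         * Illustrative sizing with assumed parameter values (the real ones
         * come from ltc_ltcs_ltss_cbc_param_r above): comptags_per_cacheline
         * = 4, cacheline_size = 512, slices_per_fbp = 2, num_fbps = 1 and
         * 4096 comptag lines gives DIV_ROUND_UP(4096, 4) * 512 * 2 * 1
         * = 1MB of backing store, plus the per-FBP alignment slack,
         * rounded up to a 64KB multiple.
         */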
3172
3173         max_comptag_lines =
3174                 (compbit_backing_size * comptags_per_cacheline) /
3175                 (cacheline_size * slices_per_fbp * gr->num_fbps);
3176
3177         if (max_comptag_lines > hw_max_comptag_lines)
3178                 max_comptag_lines = hw_max_comptag_lines;
3179
3180         nvhost_dbg_info("compbit backing store size : %d",
3181                 compbit_backing_size);
3182         nvhost_dbg_info("max comptag lines : %d",
3183                 max_comptag_lines);
3184
3185         gr->compbit_store.mem.ref =
3186                 nvhost_memmgr_alloc(memmgr, compbit_backing_size,
3187                                     DEFAULT_ALLOC_ALIGNMENT,
3188                                     DEFAULT_ALLOC_FLAGS,
3189                                     0);
3190         if (IS_ERR(gr->compbit_store.mem.ref)) {
3191                 nvhost_err(dev_from_gk20a(g), "failed to allocate "
3192                            "backing store for compbit : size %d",
3193                            compbit_backing_size);
3194                 return PTR_ERR(gr->compbit_store.mem.ref);
3195         }
3196         gr->compbit_store.mem.size = compbit_backing_size;
3197
3198         gr->compbit_store.mem.sgt =
3199                 nvhost_memmgr_pin(memmgr, gr->compbit_store.mem.ref,
3200                                 dev_from_gk20a(g), mem_flag_none);
3201         if (IS_ERR(gr->compbit_store.mem.sgt)) {
3202                 ret = PTR_ERR(gr->compbit_store.mem.sgt);
3203                 goto clean_up;
3204         }
3205         gr->compbit_store.base_pa =
3206                 gk20a_mm_iova_addr(gr->compbit_store.mem.sgt->sgl);
3207
3208         nvhost_allocator_init(&gr->comp_tags, "comptag",
3209                               1, /* start */
3210                               max_comptag_lines - 1, /* length*/
3211                               1); /* align */
3212
3213         return 0;
3214
3215 clean_up:
3216         if (gr->compbit_store.mem.sgt)
3217                 nvhost_memmgr_free_sg_table(memmgr, gr->compbit_store.mem.ref,
3218                                 gr->compbit_store.mem.sgt);
3219         nvhost_memmgr_put(memmgr, gr->compbit_store.mem.ref);
3220         return ret;
3221 }
3222
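/*
 * Clears comptag lines [min, max]: program the clear bounds, set
 * clear_active in the broadcast cbc_ctrl1 register, then poll each
 * per-LTC/per-slice ctrl1 copy with exponential backoff until the clear
 * bit drops or the gr idle timeout expires.
 */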
3223 int gk20a_gr_clear_comptags(struct gk20a *g, u32 min, u32 max)
3224 {
3225         struct gr_gk20a *gr = &g->gr;
3226         u32 fbp, slice, ctrl1, val;
3227         unsigned long end_jiffies = jiffies +
3228                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
3229         u32 delay = GR_IDLE_CHECK_DEFAULT;
3230         u32 slices_per_fbp =
3231                 ltc_ltcs_ltss_cbc_param_slices_per_fbp_v(
3232                         gk20a_readl(g, ltc_ltcs_ltss_cbc_param_r()));
3233
3234         nvhost_dbg_fn("");
3235
3236         if (gr->compbit_store.mem.size == 0)
3237                 return 0;
3238
3239         gk20a_writel(g, ltc_ltcs_ltss_cbc_ctrl2_r(),
3240                      ltc_ltcs_ltss_cbc_ctrl2_clear_lower_bound_f(min));
3241         gk20a_writel(g, ltc_ltcs_ltss_cbc_ctrl3_r(),
3242                      ltc_ltcs_ltss_cbc_ctrl3_clear_upper_bound_f(max));
3243         gk20a_writel(g, ltc_ltcs_ltss_cbc_ctrl1_r(),
3244                      gk20a_readl(g, ltc_ltcs_ltss_cbc_ctrl1_r()) |
3245                      ltc_ltcs_ltss_cbc_ctrl1_clear_active_f());
3246
3247         for (fbp = 0; fbp < gr->num_fbps; fbp++) {
3248                 for (slice = 0; slice < slices_per_fbp; slice++) {
3249
3250                         delay = GR_IDLE_CHECK_DEFAULT;
3251
3252                         ctrl1 = ltc_ltc0_lts0_cbc_ctrl1_r() +
3253                                 fbp * proj_ltc_stride_v() +
3254                                 slice * proj_lts_stride_v();
3255
3256                         do {
3257                                 val = gk20a_readl(g, ctrl1);
3258                                 if (ltc_ltcs_ltss_cbc_ctrl1_clear_v(val) !=
3259                                     ltc_ltcs_ltss_cbc_ctrl1_clear_active_v())
3260                                         break;
3261
3262                                 usleep_range(delay, delay * 2);
3263                                 delay = min_t(u32, delay << 1,
3264                                         GR_IDLE_CHECK_MAX);
3265
3266                         } while (time_before(jiffies, end_jiffies) ||
3267                                         !tegra_platform_is_silicon());
3268
3269                         if (!time_before(jiffies, end_jiffies)) {
3270                                 nvhost_err(dev_from_gk20a(g),
3271                                            "comp tag clear timeout\n");
3272                                 return -EBUSY;
3273                         }
3274                 }
3275         }
3276
3277         return 0;
3278 }
3279
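/*
 * Derives the zcull geometry from the TPC count: each aliquot is 16 pixels
 * tall and tpc_count * 16 pixels wide, the alignment requirements follow
 * the same widths, and pixel_squares_by_aliquots / total_aliquots come
 * from the zcull bank count and the gpc0 zcull RAM size register.
 */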
3280 static int gr_gk20a_init_zcull(struct gk20a *g, struct gr_gk20a *gr)
3281 {
3282         struct gr_zcull_gk20a *zcull = &gr->zcull;
3283
3284         zcull->aliquot_width = gr->tpc_count * 16;
3285         zcull->aliquot_height = 16;
3286
3287         zcull->width_align_pixels = gr->tpc_count * 16;
3288         zcull->height_align_pixels = 32;
3289
3290         zcull->aliquot_size =
3291                 zcull->aliquot_width * zcull->aliquot_height;
3292
3293         /* assume no floor sweeping since we only have 1 tpc in 1 gpc */
3294         zcull->pixel_squares_by_aliquots =
3295                 gr->zcb_count * 16 * 16 * gr->tpc_count /
3296                 (gr->gpc_count * gr->gpc_tpc_count[0]);
3297
3298         zcull->total_aliquots =
3299                 gr_gpc0_zcull_total_ram_size_num_aliquots_f(
3300                         gk20a_readl(g, gr_gpc0_zcull_total_ram_size_r()));
3301
3302         return 0;
3303 }
3304
3305 u32 gr_gk20a_get_ctxsw_zcull_size(struct gk20a *g, struct gr_gk20a *gr)
3306 {
3307         /* assuming gr has already been initialized */
3308         return gr->ctx_vars.zcull_ctxsw_image_size;
3309 }
3310
3311 int gr_gk20a_bind_ctxsw_zcull(struct gk20a *g, struct gr_gk20a *gr,
3312                         struct channel_gk20a *c, u64 zcull_va, u32 mode)
3313 {
3314         struct zcull_ctx_desc *zcull_ctx = &c->ch_ctx.zcull_ctx;
3315
3316         zcull_ctx->ctx_sw_mode = mode;
3317         zcull_ctx->gpu_va = zcull_va;
3318
3319         /* TBD: don't disable channel in sw method processing */
3320         return gr_gk20a_ctx_zcull_setup(g, c, true);
3321 }
3322
3323 int gr_gk20a_get_zcull_info(struct gk20a *g, struct gr_gk20a *gr,
3324                         struct gr_zcull_info *zcull_params)
3325 {
3326         struct gr_zcull_gk20a *zcull = &gr->zcull;
3327
3328         zcull_params->width_align_pixels = zcull->width_align_pixels;
3329         zcull_params->height_align_pixels = zcull->height_align_pixels;
3330         zcull_params->pixel_squares_by_aliquots =
3331                 zcull->pixel_squares_by_aliquots;
3332         zcull_params->aliquot_total = zcull->total_aliquots;
3333
3334         zcull_params->region_byte_multiplier =
3335                 gr->gpc_count * gr_zcull_bytes_per_aliquot_per_gpu_v();
3336         zcull_params->region_header_size =
3337                 proj_scal_litter_num_gpcs_v() *
3338                 gr_zcull_save_restore_header_bytes_per_gpc_v();
3339
3340         zcull_params->subregion_header_size =
3341                 proj_scal_litter_num_gpcs_v() *
3342                 gr_zcull_save_restore_subregion_header_bytes_per_gpc_v();
3343
3344         zcull_params->subregion_width_align_pixels =
3345                 gr->tpc_count * gr_gpc0_zcull_zcsize_width_subregion__multiple_v();
3346         zcull_params->subregion_height_align_pixels =
3347                 gr_gpc0_zcull_zcsize_height_subregion__multiple_v();
3348         zcull_params->subregion_count = gr_zcull_subregion_qty_v();
3349
3350         return 0;
3351 }
3352
3353 static int gr_gk20a_add_zbc_color(struct gk20a *g, struct gr_gk20a *gr,
3354                                 struct zbc_entry *color_val, u32 index)
3355 {
3356         struct fifo_gk20a *f = &g->fifo;
3357         struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
3358         u32 i;
3359         unsigned long end_jiffies = jiffies +
3360                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
3361         u32 ret;
3362
3363         ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
3364         if (ret) {
3365                 nvhost_err(dev_from_gk20a(g),
3366                         "failed to disable gr engine activity\n");
3367                 return ret;
3368         }
3369
3370         ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
3371         if (ret) {
3372                 nvhost_err(dev_from_gk20a(g),
3373                         "failed to idle graphics\n");
3374                 goto clean_up;
3375         }
3376
3377         /* update l2 table */
3378         gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_index_r(),
3379                         (gk20a_readl(g, ltc_ltcs_ltss_dstg_zbc_index_r()) &
3380                          ~ltc_ltcs_ltss_dstg_zbc_index_address_f(~0)) |
3381                                 ltc_ltcs_ltss_dstg_zbc_index_address_f(index +
3382                                         GK20A_STARTOF_ZBC_TABLE));
3383
3384         for (i = 0; i < ltc_ltcs_ltss_dstg_zbc_color_clear_value__size_1_v(); i++)
3385                 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_color_clear_value_r(i),
3386                         color_val->color_l2[i]);
3387
3388         /* update ds table */
3389         gk20a_writel(g, gr_ds_zbc_color_r_r(),
3390                 gr_ds_zbc_color_r_val_f(color_val->color_ds[0]));
3391         gk20a_writel(g, gr_ds_zbc_color_g_r(),
3392                 gr_ds_zbc_color_g_val_f(color_val->color_ds[1]));
3393         gk20a_writel(g, gr_ds_zbc_color_b_r(),
3394                 gr_ds_zbc_color_b_val_f(color_val->color_ds[2]));
3395         gk20a_writel(g, gr_ds_zbc_color_a_r(),
3396                 gr_ds_zbc_color_a_val_f(color_val->color_ds[3]));
3397
3398         gk20a_writel(g, gr_ds_zbc_color_fmt_r(),
3399                 gr_ds_zbc_color_fmt_val_f(color_val->format));
3400
3401         gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
3402                 gr_ds_zbc_tbl_index_val_f(index + GK20A_STARTOF_ZBC_TABLE));
3403
3404         /* trigger the write */
3405         gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
3406                 gr_ds_zbc_tbl_ld_select_c_f() |
3407                 gr_ds_zbc_tbl_ld_action_write_f() |
3408                 gr_ds_zbc_tbl_ld_trigger_active_f());
3409
3410         /* update local copy */
3411         for (i = 0; i < ltc_ltcs_ltss_dstg_zbc_color_clear_value__size_1_v(); i++) {
3412                 gr->zbc_col_tbl[index].color_l2[i] = color_val->color_l2[i];
3413                 gr->zbc_col_tbl[index].color_ds[i] = color_val->color_ds[i];
3414         }
3415         gr->zbc_col_tbl[index].format = color_val->format;
3416         gr->zbc_col_tbl[index].ref_cnt++;
3417
3418 clean_up:
3419         ret = gk20a_fifo_enable_engine_activity(g, gr_info);
3420         if (ret) {
3421                 nvhost_err(dev_from_gk20a(g),
3422                         "failed to enable gr engine activity\n");
3423         }
3424
3425         return ret;
3426 }
3427
3428 static int gr_gk20a_add_zbc_depth(struct gk20a *g, struct gr_gk20a *gr,
3429                                 struct zbc_entry *depth_val, u32 index)
3430 {
3431         struct fifo_gk20a *f = &g->fifo;
3432         struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
3433         unsigned long end_jiffies = jiffies +
3434                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
3435         u32 ret;
3436
3437         ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
3438         if (ret) {
3439                 nvhost_err(dev_from_gk20a(g),
3440                         "failed to disable gr engine activity\n");
3441                 return ret;
3442         }
3443
3444         ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
3445         if (ret) {
3446                 nvhost_err(dev_from_gk20a(g),
3447                         "failed to idle graphics\n");
3448                 goto clean_up;
3449         }
3450
3451         /* update l2 table */
3452         gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_index_r(),
3453                         (gk20a_readl(g, ltc_ltcs_ltss_dstg_zbc_index_r()) &
3454                          ~ltc_ltcs_ltss_dstg_zbc_index_address_f(~0)) |
3455                                 ltc_ltcs_ltss_dstg_zbc_index_address_f(index +
3456                                         GK20A_STARTOF_ZBC_TABLE));
3457
3458         gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_depth_clear_value_r(),
3459                         depth_val->depth);
3460
3461         /* update ds table */
3462         gk20a_writel(g, gr_ds_zbc_z_r(),
3463                 gr_ds_zbc_z_val_f(depth_val->depth));
3464
3465         gk20a_writel(g, gr_ds_zbc_z_fmt_r(),
3466                 gr_ds_zbc_z_fmt_val_f(depth_val->format));
3467
3468         gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
3469                 gr_ds_zbc_tbl_index_val_f(index + GK20A_STARTOF_ZBC_TABLE));
3470
3471         /* trigger the write */
3472         gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
3473                 gr_ds_zbc_tbl_ld_select_z_f() |
3474                 gr_ds_zbc_tbl_ld_action_write_f() |
3475                 gr_ds_zbc_tbl_ld_trigger_active_f());
3476
3477         /* update local copy */
3478         gr->zbc_dep_tbl[index].depth = depth_val->depth;
3479         gr->zbc_dep_tbl[index].format = depth_val->format;
3480         gr->zbc_dep_tbl[index].ref_cnt++;
3481
3482 clean_up:
3483         ret = gk20a_fifo_enable_engine_activity(g, gr_info);
3484         if (ret) {
3485                 nvhost_err(dev_from_gk20a(g),
3486                         "failed to enable gr engine activity\n");
3487         }
3488
3489         return ret;
3490 }
3491
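/*
 * Adds (or reference-counts) a ZBC clear value.  A color request matches
 * an existing slot when format and DS value match; the L2 value must then
 * match as well, otherwise -EINVAL.  A depth request matches on format and
 * depth value.  New values are appended at max_used_*_index, written to
 * both the DS and L2 tables, and pmu_save_zbc() is called only when a
 * genuinely new entry was added.
 */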
3492 int gr_gk20a_add_zbc(struct gk20a *g, struct gr_gk20a *gr,
3493                      struct zbc_entry *zbc_val)
3494 {
3495         struct zbc_color_table *c_tbl;
3496         struct zbc_depth_table *d_tbl;
3497         u32 i, ret = -ENOMEM;
3498         bool added = false;
3499         u32 entries;
3500
3501         /* no endian swap ? */
3502
3503         switch (zbc_val->type) {
3504         case GK20A_ZBC_TYPE_COLOR:
3505                 /* search existing tables */
3506                 for (i = 0; i < gr->max_used_color_index; i++) {
3507
3508                         c_tbl = &gr->zbc_col_tbl[i];
3509
3510                         if (c_tbl->ref_cnt && c_tbl->format == zbc_val->format &&
3511                             memcmp(c_tbl->color_ds, zbc_val->color_ds,
3512                                 sizeof(zbc_val->color_ds)) == 0) {
3513
3514                                 if (memcmp(c_tbl->color_l2, zbc_val->color_l2,
3515                                     sizeof(zbc_val->color_l2))) {
3516                                         nvhost_err(dev_from_gk20a(g),
3517                                                 "zbc l2 and ds color don't match with existing entries");
3518                                         return -EINVAL;
3519                                 }
3520                                 added = true;
3521                                 c_tbl->ref_cnt++;
3522                                 ret = 0;
3523                                 break;
3524                         }
3525                 }
3526                 /* add new table */
3527                 if (!added &&
3528                     gr->max_used_color_index < GK20A_ZBC_TABLE_SIZE) {
3529
3530                         c_tbl =
3531                             &gr->zbc_col_tbl[gr->max_used_color_index];
3532                         WARN_ON(c_tbl->ref_cnt != 0);
3533
3534                         ret = gr_gk20a_add_zbc_color(g, gr,
3535                                 zbc_val, gr->max_used_color_index);
3536
3537                         if (!ret)
3538                                 gr->max_used_color_index++;
3539                 }
3540                 break;
3541         case GK20A_ZBC_TYPE_DEPTH:
3542                 /* search existing tables */
3543                 for (i = 0; i < gr->max_used_depth_index; i++) {
3544
3545                         d_tbl = &gr->zbc_dep_tbl[i];
3546
3547                         if (d_tbl->ref_cnt &&
3548                             d_tbl->depth == zbc_val->depth &&
3549                             d_tbl->format == zbc_val->format) {
3550                                 added = true;
3551                                 d_tbl->ref_cnt++;
3552                                 ret = 0;
3553                                 break;
3554                         }
3555                 }
3556                 /* add new table */
3557                 if (!added &&
3558                     gr->max_used_depth_index < GK20A_ZBC_TABLE_SIZE) {
3559
3560                         d_tbl =
3561                             &gr->zbc_dep_tbl[gr->max_used_depth_index];
3562                         WARN_ON(d_tbl->ref_cnt != 0);
3563
3564                         ret = gr_gk20a_add_zbc_depth(g, gr,
3565                                 zbc_val, gr->max_used_depth_index);
3566
3567                         if (!ret)
3568                                 gr->max_used_depth_index++;
3569                 }
3570                 break;
3571         default:
3572                 nvhost_err(dev_from_gk20a(g),
3573                         "invalid zbc table type %d", zbc_val->type);
3574                 return -EINVAL;
3575         }
3576
3577         if (!added && ret == 0) {
3578                 /* update zbc for elpg only when new entry is added */
3579                 entries = max(gr->max_used_color_index,
3580                                         gr->max_used_depth_index);
3581                 pmu_save_zbc(g, entries);
3582         }
3583
3584         return ret;
3585 }
3586
3587 int gr_gk20a_clear_zbc_table(struct gk20a *g, struct gr_gk20a *gr)
3588 {
3589         struct fifo_gk20a *f = &g->fifo;
3590         struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
3591         u32 i, j;
3592         unsigned long end_jiffies = jiffies +
3593                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
3594         u32 ret;
3595
3596         ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
3597         if (ret) {
3598                 nvhost_err(dev_from_gk20a(g),
3599                         "failed to disable gr engine activity\n");
3600                 return ret;
3601         }
3602
3603         ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
3604         if (ret) {
3605                 nvhost_err(dev_from_gk20a(g),
3606                         "failed to idle graphics\n");
3607                 goto clean_up;
3608         }
3609
3610         for (i = 0; i < GK20A_ZBC_TABLE_SIZE; i++) {
3611                 gr->zbc_col_tbl[i].format = 0;
3612                 gr->zbc_col_tbl[i].ref_cnt = 0;
3613
3614                 gk20a_writel(g, gr_ds_zbc_color_fmt_r(),
3615                         gr_ds_zbc_color_fmt_val_invalid_f());
3616                 gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
3617                         gr_ds_zbc_tbl_index_val_f(i + GK20A_STARTOF_ZBC_TABLE));
3618
3619                 /* trigger the write */
3620                 gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
3621                         gr_ds_zbc_tbl_ld_select_c_f() |
3622                         gr_ds_zbc_tbl_ld_action_write_f() |
3623                         gr_ds_zbc_tbl_ld_trigger_active_f());
3624
3625                 /* clear l2 table */
3626                 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_index_r(),
3627                         (gk20a_readl(g, ltc_ltcs_ltss_dstg_zbc_index_r()) &
3628                          ~ltc_ltcs_ltss_dstg_zbc_index_address_f(~0)) |
3629                                 ltc_ltcs_ltss_dstg_zbc_index_address_f(i +
3630                                         GK20A_STARTOF_ZBC_TABLE));
3631
3632                 for (j = 0; j < ltc_ltcs_ltss_dstg_zbc_color_clear_value__size_1_v(); j++) {
3633                         gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_color_clear_value_r(j), 0);
3634                         gr->zbc_col_tbl[i].color_l2[j] = 0;
3635                         gr->zbc_col_tbl[i].color_ds[j] = 0;
3636                 }
3637         }
3638         gr->max_used_color_index = 0;
3639         gr->max_default_color_index = 0;
3640
3641         for (i = 0; i < GK20A_ZBC_TABLE_SIZE; i++) {
3642                 gr->zbc_dep_tbl[i].depth = 0;
3643                 gr->zbc_dep_tbl[i].format = 0;
3644                 gr->zbc_dep_tbl[i].ref_cnt = 0;
3645
3646                 gk20a_writel(g, gr_ds_zbc_z_fmt_r(),
3647                         gr_ds_zbc_z_fmt_val_invalid_f());
3648                 gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
3649                         gr_ds_zbc_tbl_index_val_f(i + GK20A_STARTOF_ZBC_TABLE));
3650
3651                 /* trigger the write */
3652                 gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
3653                         gr_ds_zbc_tbl_ld_select_z_f() |
3654                         gr_ds_zbc_tbl_ld_action_write_f() |
3655                         gr_ds_zbc_tbl_ld_trigger_active_f());
3656
3657                 /* clear l2 table */
3658                 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_index_r(),
3659                         (gk20a_readl(g, ltc_ltcs_ltss_dstg_zbc_index_r()) &
3660                          ~ltc_ltcs_ltss_dstg_zbc_index_address_f(~0)) |
3661                                 ltc_ltcs_ltss_dstg_zbc_index_address_f(i +
3662                                         GK20A_STARTOF_ZBC_TABLE));
3663
3664                 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_depth_clear_value_r(), 0);
3665         }
3666         gr->max_used_depth_index = 0;
3667         gr->max_default_depth_index = 0;
3668
3669 clean_up:
3670         ret = gk20a_fifo_enable_engine_activity(g, gr_info);
3671         if (ret) {
3672                 nvhost_err(dev_from_gk20a(g),
3673                         "failed to enable gr engine activity\n");
3674         }
3675
3676         /* elpg stuff */
3677
3678         return ret;
3679 }
3680
3681 /* get a zbc table entry specified by index
3682  * return table size when type is invalid */
3683 int gr_gk20a_query_zbc(struct gk20a *g, struct gr_gk20a *gr,
3684                         struct zbc_query_params *query_params)
3685 {
3686         u32 index = query_params->index_size;
3687         u32 i;
3688
3689         switch (query_params->type) {
3690         case GK20A_ZBC_TYPE_INVALID:
3691                 query_params->index_size = GK20A_ZBC_TABLE_SIZE;
3692                 break;
3693         case GK20A_ZBC_TYPE_COLOR:
3694                 if (index >= GK20A_ZBC_TABLE_SIZE) {
3695                         nvhost_err(dev_from_gk20a(g),
3696                                 "invalid zbc color table index\n");
3697                         return -EINVAL;
3698                 }
3699                 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3700                         query_params->color_l2[i] =
3701                                 gr->zbc_col_tbl[index].color_l2[i];
3702                         query_params->color_ds[i] =
3703                                 gr->zbc_col_tbl[index].color_ds[i];
3704                 }
3705                 query_params->format = gr->zbc_col_tbl[index].format;
3706                 query_params->ref_cnt = gr->zbc_col_tbl[index].ref_cnt;
3707                 break;
3708         case GK20A_ZBC_TYPE_DEPTH:
3709                 if (index >= GK20A_ZBC_TABLE_SIZE) {
3710                         nvhost_err(dev_from_gk20a(g),
3711                                 "invalid zbc depth table index\n");
3712                         return -EINVAL;
3713                 }
3714                 query_params->depth = gr->zbc_dep_tbl[index].depth;
3715                 query_params->format = gr->zbc_dep_tbl[index].format;
3716                 query_params->ref_cnt = gr->zbc_dep_tbl[index].ref_cnt;
3717                 break;
3718         default:
3719                 nvhost_err(dev_from_gk20a(g),
3720                                 "invalid zbc table type\n");
3721                 return -EINVAL;
3722         }
3723
3724         return 0;
3725 }
3726
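/*
 * Installs the default ZBC entries: four color entries (zero, unorm one,
 * and rf32_gf32_bf32_af32 for 0.0 and 1.0) and two fp32 depth entries
 * (0.0 and 1.0).
 */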
3727 static int gr_gk20a_load_zbc_default_table(struct gk20a *g, struct gr_gk20a *gr)
3728 {
3729         struct zbc_entry zbc_val;
3730         u32 i, err;
3731
3732         /* load default color table */
3733         zbc_val.type = GK20A_ZBC_TYPE_COLOR;
3734
3735         zbc_val.format = gr_ds_zbc_color_fmt_val_zero_v();
3736         for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3737                 zbc_val.color_ds[i] = 0;
3738                 zbc_val.color_l2[i] = 0;
3739         }
3740         err = gr_gk20a_add_zbc(g, gr, &zbc_val);
3741
3742         zbc_val.format = gr_ds_zbc_color_fmt_val_unorm_one_v();
3743         for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3744                 zbc_val.color_ds[i] = 0xffffffff;
3745                 zbc_val.color_l2[i] = 0x3f800000;
3746         }
3747         err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3748
3749         zbc_val.format = gr_ds_zbc_color_fmt_val_rf32_gf32_bf32_af32_v();
3750         for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3751                 zbc_val.color_ds[i] = 0;
3752                 zbc_val.color_l2[i] = 0;
3753         }
3754         err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3755
3756         zbc_val.format = gr_ds_zbc_color_fmt_val_rf32_gf32_bf32_af32_v();
3757         for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3758                 zbc_val.color_ds[i] = 0x3f800000;
3759                 zbc_val.color_l2[i] = 0x3f800000;
3760         }
3761         err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3762
3763         if (!err)
3764                 gr->max_default_color_index = 4;
3765         else {
3766                 nvhost_err(dev_from_gk20a(g),
3767                            "failed to load default zbc color table\n");
3768                 return err;
3769         }
3770
3771         /* load default depth table */
3772         zbc_val.type = GK20A_ZBC_TYPE_DEPTH;
3773
3774         zbc_val.format = gr_ds_zbc_z_fmt_val_fp32_v();
3775         zbc_val.depth = 0;
3776         err = gr_gk20a_add_zbc(g, gr, &zbc_val);
3777
3778         zbc_val.format = gr_ds_zbc_z_fmt_val_fp32_v();
3779         zbc_val.depth = 0x3f800000;
3780         err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3781
3782         if (!err)
3783                 gr->max_default_depth_index = 2;
3784         else {
3785                 nvhost_err(dev_from_gk20a(g),
3786                            "failed to load default zbc depth table\n");
3787                 return err;
3788         }
3789
3790         return 0;
3791 }
3792
3793 static int gr_gk20a_init_zbc(struct gk20a *g, struct gr_gk20a *gr)
3794 {
3795         u32 i, j;
3796
3797         /* reset zbc clear */
3798         for (i = 0; i < GK20A_SIZEOF_ZBC_TABLE -
3799             GK20A_STARTOF_ZBC_TABLE; i++) {
3800                 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_index_r(),
3801                         (gk20a_readl(g, ltc_ltcs_ltss_dstg_zbc_index_r()) &
3802                          ~ltc_ltcs_ltss_dstg_zbc_index_address_f(~0)) |
3803                                 ltc_ltcs_ltss_dstg_zbc_index_address_f(
3804                                         i + GK20A_STARTOF_ZBC_TABLE));
3805                 for (j = 0; j < ltc_ltcs_ltss_dstg_zbc_color_clear_value__size_1_v(); j++)
3806                         gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_color_clear_value_r(j), 0);
3807                 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_depth_clear_value_r(), 0);
3808         }
3809
3810         gr_gk20a_clear_zbc_table(g, gr);
3811
3812         gr_gk20a_load_zbc_default_table(g, gr);
3813
3814         return 0;
3815 }
3816
3817 int gk20a_gr_zbc_set_table(struct gk20a *g, struct gr_gk20a *gr,
3818                         struct zbc_entry *zbc_val)
3819 {
3820         nvhost_dbg_fn("");
3821
3822         return gr_gk20a_elpg_protected_call(g,
3823                 gr_gk20a_add_zbc(g, gr, zbc_val));
3824 }
3825
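/*
 * BLCG (block-level) and ELCG (engine-level) clock gating are both
 * programmed through the per-engine therm_gate_ctrl register: RUN forces
 * the clock on, AUTO lets hardware gate it, and (ELCG only) STOP halts the
 * engine clock.  The ELCG path also programs the idle filter
 * exponent/mantissa and zeroes the FECS/HUBMMU idle filters.
 */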
3826 void gr_gk20a_init_blcg_mode(struct gk20a *g, u32 mode, u32 engine)
3827 {
3828         u32 gate_ctrl;
3829
3830         gate_ctrl = gk20a_readl(g, therm_gate_ctrl_r(engine));
3831
3832         switch (mode) {
3833         case BLCG_RUN:
3834                 gate_ctrl = set_field(gate_ctrl,
3835                                 therm_gate_ctrl_blk_clk_m(),
3836                                 therm_gate_ctrl_blk_clk_run_f());
3837                 break;
3838         case BLCG_AUTO:
3839                 gate_ctrl = set_field(gate_ctrl,
3840                                 therm_gate_ctrl_blk_clk_m(),
3841                                 therm_gate_ctrl_blk_clk_auto_f());
3842                 break;
3843         default:
3844                 nvhost_err(dev_from_gk20a(g),
3845                         "invalid blcg mode %d", mode);
3846                 return;
3847         }
3848
3849         gk20a_writel(g, therm_gate_ctrl_r(engine), gate_ctrl);
3850 }
3851
3852 void gr_gk20a_init_elcg_mode(struct gk20a *g, u32 mode, u32 engine)
3853 {
3854         u32 gate_ctrl, idle_filter;
3855
3856         gate_ctrl = gk20a_readl(g, therm_gate_ctrl_r(engine));
3857
3858         switch (mode) {
3859         case ELCG_RUN:
3860                 gate_ctrl = set_field(gate_ctrl,
3861                                 therm_gate_ctrl_eng_clk_m(),
3862                                 therm_gate_ctrl_eng_clk_run_f());
3863                 gate_ctrl = set_field(gate_ctrl,
3864                                 therm_gate_ctrl_eng_pwr_m(),
3865                                 /* set elpg to auto to meet hw expectation */
3866                                 therm_gate_ctrl_eng_pwr_auto_f());
3867                 break;
3868         case ELCG_STOP:
3869                 gate_ctrl = set_field(gate_ctrl,
3870                                 therm_gate_ctrl_eng_clk_m(),
3871                                 therm_gate_ctrl_eng_clk_stop_f());
3872                 break;
3873         case ELCG_AUTO:
3874                 gate_ctrl = set_field(gate_ctrl,
3875                                 therm_gate_ctrl_eng_clk_m(),
3876                                 therm_gate_ctrl_eng_clk_auto_f());
3877                 break;
3878         default:
3879                 nvhost_err(dev_from_gk20a(g),
3880                         "invalid elcg mode %d", mode);
3881         }
3882
3883         if (tegra_platform_is_linsim()) {
3884                 gate_ctrl = set_field(gate_ctrl,
3885                         therm_gate_ctrl_eng_delay_after_m(),
3886                         therm_gate_ctrl_eng_delay_after_f(4));
3887         }
3888
3889         /* 2 * (1 << 9) = 1024 clks */
3890         gate_ctrl = set_field(gate_ctrl,
3891                 therm_gate_ctrl_eng_idle_filt_exp_m(),
3892                 therm_gate_ctrl_eng_idle_filt_exp_f(9));
3893         gate_ctrl = set_field(gate_ctrl,
3894                 therm_gate_ctrl_eng_idle_filt_mant_m(),
3895                 therm_gate_ctrl_eng_idle_filt_mant_f(2));
3896         gk20a_writel(g, therm_gate_ctrl_r(engine), gate_ctrl);
3897
3898         /* default fecs_idle_filter to 0 */
3899         idle_filter = gk20a_readl(g, therm_fecs_idle_filter_r());
3900         idle_filter &= ~therm_fecs_idle_filter_value_m();
3901         gk20a_writel(g, therm_fecs_idle_filter_r(), idle_filter);
3902         /* default hubmmu_idle_filter to 0 */
3903         idle_filter = gk20a_readl(g, therm_hubmmu_idle_filter_r());
3904         idle_filter &= ~therm_hubmmu_idle_filter_value_m();
3905         gk20a_writel(g, therm_hubmmu_idle_filter_r(), idle_filter);
3906 }
3907
3908 static int gr_gk20a_zcull_init_hw(struct gk20a *g, struct gr_gk20a *gr)
3909 {
3910         u32 gpc_index, gpc_tpc_count, gpc_zcull_count;
3911         u32 *zcull_map_tiles, *zcull_bank_counters;
3912         u32 map_counter;
3913         u32 rcp_conserv;
3914         u32 offset;
3915         bool floorsweep = false;
3916
3917         if (!gr->map_tiles)
3918                 return -EINVAL;
3919
3920         zcull_map_tiles = kzalloc(proj_scal_max_gpcs_v() *
3921                         proj_scal_max_tpc_per_gpc_v() * sizeof(u32), GFP_KERNEL);
3922         if (!zcull_map_tiles) {
3923                 nvhost_err(dev_from_gk20a(g),
3924                         "failed to allocate zcull temp buffers");
3925                 return -ENOMEM;
3926         }
3927         zcull_bank_counters = kzalloc(proj_scal_max_gpcs_v() *
3928                         proj_scal_max_tpc_per_gpc_v() * sizeof(u32), GFP_KERNEL);
3929
3930         if (!zcull_bank_counters) {
3931                 nvhost_err(dev_from_gk20a(g),
3932                         "failed to allocate zcull temp buffers");
3933                 kfree(zcull_map_tiles);
3934                 return -ENOMEM;
3935         }
3936
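             /*
              * Assign each of the gr->tpc_count tiles a zcull bank slot in the
              * unit gr->map_tiles[] points it at (nominally its GPC):
              * zcull_bank_counters[] tracks how many tiles have already been
              * handed to each unit, so tile i simply takes the next free slot.
              */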
3937         for (map_counter = 0; map_counter < gr->tpc_count; map_counter++) {
3938                 zcull_map_tiles[map_counter] =
3939                         zcull_bank_counters[gr->map_tiles[map_counter]];
3940                 zcull_bank_counters[gr->map_tiles[map_counter]]++;
3941         }
3942
3943         gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map0_r(),
3944                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_0_f(zcull_map_tiles[0]) |
3945                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_1_f(zcull_map_tiles[1]) |
3946                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_2_f(zcull_map_tiles[2]) |
3947                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_3_f(zcull_map_tiles[3]) |
3948                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_4_f(zcull_map_tiles[4]) |
3949                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_5_f(zcull_map_tiles[5]) |
3950                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_6_f(zcull_map_tiles[6]) |
3951                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_7_f(zcull_map_tiles[7]));
3952
3953         gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map1_r(),
3954                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_8_f(zcull_map_tiles[8]) |
3955                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_9_f(zcull_map_tiles[9]) |
3956                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_10_f(zcull_map_tiles[10]) |
3957                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_11_f(zcull_map_tiles[11]) |
3958                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_12_f(zcull_map_tiles[12]) |
3959                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_13_f(zcull_map_tiles[13]) |
3960                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_14_f(zcull_map_tiles[14]) |
3961                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_15_f(zcull_map_tiles[15]));
3962
3963         gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map2_r(),
3964                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_16_f(zcull_map_tiles[16]) |
3965                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_17_f(zcull_map_tiles[17]) |
3966                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_18_f(zcull_map_tiles[18]) |
3967                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_19_f(zcull_map_tiles[19]) |
3968                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_20_f(zcull_map_tiles[20]) |
3969                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_21_f(zcull_map_tiles[21]) |
3970                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_22_f(zcull_map_tiles[22]) |
3971                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_23_f(zcull_map_tiles[23]));
3972
3973         gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map3_r(),
3974                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_24_f(zcull_map_tiles[24]) |
3975                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_25_f(zcull_map_tiles[25]) |
3976                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_26_f(zcull_map_tiles[26]) |
3977                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_27_f(zcull_map_tiles[27]) |
3978                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_28_f(zcull_map_tiles[28]) |
3979                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_29_f(zcull_map_tiles[29]) |
3980                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_30_f(zcull_map_tiles[30]) |
3981                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_31_f(zcull_map_tiles[31]));
3982
3983         kfree(zcull_map_tiles);
3984         kfree(zcull_bank_counters);
3985
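             /*
              * Each GPC needs at least as many zcull banks as TPCs unless it
              * keeps the full per-GPC bank count; a partial, non-zero bank
              * count on any GPC indicates zcull floorsweeping, which changes
              * how the hypertile rows are sized below.
              */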
3986         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3987                 gpc_tpc_count = gr->gpc_tpc_count[gpc_index];
3988                 gpc_zcull_count = gr->gpc_zcb_count[gpc_index];
3989
3990                 if (gpc_zcull_count != gr->max_zcull_per_gpc_count &&
3991                     gpc_zcull_count < gpc_tpc_count) {
3992                         nvhost_err(dev_from_gk20a(g),
3993                                 "zcull_banks (%d) less than tpcs (%d) for gpc (%d)",
3994                                 gpc_zcull_count, gpc_tpc_count, gpc_index);
3995                         return -EINVAL;
3996                 }
3997                 if (gpc_zcull_count != gr->max_zcull_per_gpc_count &&
3998                     gpc_zcull_count != 0)
3999                         floorsweep = true;
4000         }
4001
4002         /* 1.0f / 1.0f * gr_gpc0_zcull_sm_num_rcp_conservative__max_v() */
4003         rcp_conserv = gr_gpc0_zcull_sm_num_rcp_conservative__max_v();
4004
4005         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4006                 offset = gpc_index * proj_gpc_stride_v();
4007
4008                 if (floorsweep) {
4009                         gk20a_writel(g, gr_gpc0_zcull_ram_addr_r() + offset,
4010                                 gr_gpc0_zcull_ram_addr_row_offset_f(gr->map_row_offset) |
4011                                 gr_gpc0_zcull_ram_addr_tiles_per_hypertile_row_per_gpc_f(
4012                                         gr->max_zcull_per_gpc_count));
4013                 } else {
4014                         gk20a_writel(g, gr_gpc0_zcull_ram_addr_r() + offset,
4015                                 gr_gpc0_zcull_ram_addr_row_offset_f(gr->map_row_offset) |
4016                                 gr_gpc0_zcull_ram_addr_tiles_per_hypertile_row_per_gpc_f(
4017                                         gr->gpc_tpc_count[gpc_index]));
4018                 }
4019
4020                 gk20a_writel(g, gr_gpc0_zcull_fs_r() + offset,
4021                         gr_gpc0_zcull_fs_num_active_banks_f(gr->gpc_zcb_count[gpc_index]) |
4022                         gr_gpc0_zcull_fs_num_sms_f(gr->tpc_count));
4023
4024                 gk20a_writel(g, gr_gpc0_zcull_sm_num_rcp_r() + offset,
4025                         gr_gpc0_zcull_sm_num_rcp_conservative_f(rcp_conserv));
4026         }
4027
4028         gk20a_writel(g, gr_gpcs_ppcs_wwdx_sm_num_rcp_r(),
4029                 gr_gpcs_ppcs_wwdx_sm_num_rcp_conservative_f(rcp_conserv));
4030
4031         return 0;
4032 }
4033
4034 static void gk20a_gr_enable_gpc_exceptions(struct gk20a *g)
4035 {
4036         /* enable tpc exception forwarding */
4037         gk20a_writel(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r(),
4038                 gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_enabled_f());
4039
4040         /* enable gpc exception forwarding */
4041         gk20a_writel(g, gr_gpc0_gpccs_gpc_exception_en_r(),
4042                 gr_gpc0_gpccs_gpc_exception_en_tpc_0_enabled_f());
4043 }
4044
4045 static int gk20a_init_gr_setup_hw(struct gk20a *g)
4046 {
4047         struct gr_gk20a *gr = &g->gr;
4048         struct aiv_list_gk20a *sw_ctx_load = &g->gr.ctx_vars.sw_ctx_load;
4049         struct av_list_gk20a *sw_bundle_init = &g->gr.ctx_vars.sw_bundle_init;
4050         struct av_list_gk20a *sw_method_init = &g->gr.ctx_vars.sw_method_init;
4051         u32 data;
4052         u32 addr_lo, addr_hi, addr;
4053         u32 compbit_base_post_divide;
4054         u64 compbit_base_post_multiply64;
4055         unsigned long end_jiffies = jiffies +
4056                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
4057         u32 fe_go_idle_timeout_save;
4058         u32 last_bundle_data = 0;
4059         u32 last_method_data = 0;
4060         u32 i, err;
4061         u32 l1c_dbg_reg_val;
4062
4063         nvhost_dbg_fn("");
4064
4065         /* slcg prod values */
4066         gr_gk20a_slcg_gr_load_gating_prod(g, g->slcg_enabled);
4067         gr_gk20a_slcg_perf_load_gating_prod(g, g->slcg_enabled);
4068
4069         /* init mmu debug buffer */
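             /*
              * The debug buffer address is programmed as a single field: drop
              * the low alignment bits of the IOVA and fold the upper 32 bits
              * into the freed-up high bits of the register value. The same
              * packing is reused for the read buffer below.
              */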
4070         addr = gk20a_mm_iova_addr(gr->mmu_wr_mem.mem.sgt->sgl);
4071         addr_lo = u64_lo32(addr);
4072         addr_hi = u64_hi32(addr);
4073         addr = (addr_lo >> fb_mmu_debug_wr_addr_alignment_v()) |
4074                 (addr_hi << (32 - fb_mmu_debug_wr_addr_alignment_v()));
4075
4076         gk20a_writel(g, fb_mmu_debug_wr_r(),
4077                      fb_mmu_debug_wr_aperture_vid_mem_f() |
4078                      fb_mmu_debug_wr_vol_false_f() |
4079                      fb_mmu_debug_wr_addr_v(addr));
4080
4081         addr = gk20a_mm_iova_addr(gr->mmu_rd_mem.mem.sgt->sgl);
4082         addr_lo = u64_lo32(addr);
4083         addr_hi = u64_hi32(addr);
4084         addr = (addr_lo >> fb_mmu_debug_rd_addr_alignment_v()) |
4085                 (addr_hi << (32 - fb_mmu_debug_rd_addr_alignment_v()));
4086
4087         gk20a_writel(g, fb_mmu_debug_rd_r(),
4088                      fb_mmu_debug_rd_aperture_vid_mem_f() |
4089                      fb_mmu_debug_rd_vol_false_f() |
4090                      fb_mmu_debug_rd_addr_v(addr));
4091
4092         /* load gr floorsweeping registers */
4093         data = gk20a_readl(g, gr_gpc0_ppc0_pes_vsc_strem_r());
4094         data = set_field(data, gr_gpc0_ppc0_pes_vsc_strem_master_pe_m(),
4095                         gr_gpc0_ppc0_pes_vsc_strem_master_pe_true_f());
4096         gk20a_writel(g, gr_gpc0_ppc0_pes_vsc_strem_r(), data);
4097
4098         gr_gk20a_zcull_init_hw(g, gr);
4099
4100         gr_gk20a_blcg_gr_load_gating_prod(g, g->blcg_enabled);
4101         gr_gk20a_pg_gr_load_gating_prod(g, true);
4102
4103         if (g->elcg_enabled) {
4104                 gr_gk20a_init_elcg_mode(g, ELCG_AUTO, ENGINE_GR_GK20A);
4105                 gr_gk20a_init_elcg_mode(g, ELCG_AUTO, ENGINE_CE2_GK20A);
4106         } else {
4107                 gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_GR_GK20A);
4108                 gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_CE2_GK20A);
4109         }
4110
4111         /* Bug 1340570: increase the clock timeout to avoid potential
4112          * operation failure at high gpcclk rate. Default values are 0x400.
4113          */
4114         gk20a_writel(g, pri_ringstation_sys_master_config_r(0x15), 0x800);
4115         gk20a_writel(g, pri_ringstation_gpc_master_config_r(0xa), 0x800);
4116         gk20a_writel(g, pri_ringstation_fbp_master_config_r(0x8), 0x800);
4117
4118         /* enable fifo access */
4119         gk20a_writel(g, gr_gpfifo_ctl_r(),
4120                      gr_gpfifo_ctl_access_enabled_f() |
4121                      gr_gpfifo_ctl_semaphore_access_enabled_f());
4122
4123         /* TBD: reload gr ucode when needed */
4124
4125         /* enable interrupts */
4126         gk20a_writel(g, gr_intr_r(), 0xFFFFFFFF);
4127         gk20a_writel(g, gr_intr_en_r(), 0xFFFFFFFF);
4128
4129         /* enable fecs error interrupts */
4130         gk20a_writel(g, gr_fecs_host_int_enable_r(),
4131                      gr_fecs_host_int_enable_fault_during_ctxsw_enable_f() |
4132                      gr_fecs_host_int_enable_umimp_firmware_method_enable_f() |
4133                      gr_fecs_host_int_enable_umimp_illegal_method_enable_f() |
4134                      gr_fecs_host_int_enable_watchdog_enable_f());
4135
4136         /* enable exceptions */
4137         gk20a_writel(g, gr_fe_hww_esr_r(),
4138                      gr_fe_hww_esr_en_enable_f() |
4139                      gr_fe_hww_esr_reset_active_f());
4140         gk20a_writel(g, gr_memfmt_hww_esr_r(),
4141                      gr_memfmt_hww_esr_en_enable_f() |
4142                      gr_memfmt_hww_esr_reset_active_f());
4143         gk20a_writel(g, gr_scc_hww_esr_r(),
4144                      gr_scc_hww_esr_en_enable_f() |
4145                      gr_scc_hww_esr_reset_active_f());
4146         gk20a_writel(g, gr_mme_hww_esr_r(),
4147                      gr_mme_hww_esr_en_enable_f() |
4148                      gr_mme_hww_esr_reset_active_f());
4149         gk20a_writel(g, gr_pd_hww_esr_r(),
4150                      gr_pd_hww_esr_en_enable_f() |
4151                      gr_pd_hww_esr_reset_active_f());
4152         gk20a_writel(g, gr_sked_hww_esr_r(), /* enabled by default */
4153                      gr_sked_hww_esr_reset_active_f());
4154         gk20a_writel(g, gr_ds_hww_esr_r(),
4155                      gr_ds_hww_esr_en_enabled_f() |
4156                      gr_ds_hww_esr_reset_task_f());
4157         gk20a_writel(g, gr_ds_hww_report_mask_r(),
4158                      gr_ds_hww_report_mask_sph0_err_report_f() |
4159                      gr_ds_hww_report_mask_sph1_err_report_f() |
4160                      gr_ds_hww_report_mask_sph2_err_report_f() |
4161                      gr_ds_hww_report_mask_sph3_err_report_f() |
4162                      gr_ds_hww_report_mask_sph4_err_report_f() |
4163                      gr_ds_hww_report_mask_sph5_err_report_f() |
4164                      gr_ds_hww_report_mask_sph6_err_report_f() |
4165                      gr_ds_hww_report_mask_sph7_err_report_f() |
4166                      gr_ds_hww_report_mask_sph8_err_report_f() |
4167                      gr_ds_hww_report_mask_sph9_err_report_f() |
4168                      gr_ds_hww_report_mask_sph10_err_report_f() |
4169                      gr_ds_hww_report_mask_sph11_err_report_f() |
4170                      gr_ds_hww_report_mask_sph12_err_report_f() |
4171                      gr_ds_hww_report_mask_sph13_err_report_f() |
4172                      gr_ds_hww_report_mask_sph14_err_report_f() |
4173                      gr_ds_hww_report_mask_sph15_err_report_f() |
4174                      gr_ds_hww_report_mask_sph16_err_report_f() |
4175                      gr_ds_hww_report_mask_sph17_err_report_f() |
4176                      gr_ds_hww_report_mask_sph18_err_report_f() |
4177                      gr_ds_hww_report_mask_sph19_err_report_f() |
4178                      gr_ds_hww_report_mask_sph20_err_report_f() |
4179                      gr_ds_hww_report_mask_sph21_err_report_f() |
4180                      gr_ds_hww_report_mask_sph22_err_report_f() |
4181                      gr_ds_hww_report_mask_sph23_err_report_f());
4182
4183         /* setup sm warp esr report masks */
4184         gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(),
4185                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_stack_error_report_f() |
4186                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_api_stack_error_report_f() |
4187                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_ret_empty_stack_error_report_f() |
4188                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_wrap_report_f() |
4189                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_pc_report_f() |
4190                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_overflow_report_f() |
4191                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_immc_addr_report_f() |
4192                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_reg_report_f() |
4193                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_encoding_report_f() |
4194                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_sph_instr_combo_report_f() |
4195                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param_report_f() |
4196                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_report_f() |
4197                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_reg_report_f() |
4198                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_addr_report_f() |
4199                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_addr_report_f() |
4200                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_addr_space_report_f() |
4201                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param2_report_f() |
4202                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_ldc_report_f() |
4203                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_geometry_sm_error_report_f() |
4204                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_divergent_report_f());
4205
4206         /* setup sm global esr report mask */
4207         gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(),
4208                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_sm_to_sm_fault_report_f() |
4209                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_l1_error_report_f() |
4210                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_multiple_warp_errors_report_f() |
4211                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_physical_stack_overflow_error_report_f() |
4212                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_int_report_f() |
4213                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_pause_report_f() |
4214                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_single_step_complete_report_f());
4215
4216         /* enable per GPC exceptions */
4217         gk20a_gr_enable_gpc_exceptions(g);
4218
4219         /* TBD: ECC for L1/SM */
4220         /* TBD: enable per BE exceptions */
4221
4222         /* reset and enable all exceptions */
4223         gk20a_writel(g, gr_exception_r(), 0xFFFFFFFF);
4224         gk20a_writel(g, gr_exception_en_r(), 0xFFFFFFFF);
4225         gk20a_writel(g, gr_exception1_r(), 0xFFFFFFFF);
4226         gk20a_writel(g, gr_exception1_en_r(), 0xFFFFFFFF);
4227         gk20a_writel(g, gr_exception2_r(), 0xFFFFFFFF);
4228         gk20a_writel(g, gr_exception2_en_r(), 0xFFFFFFFF);
4229
4230         /* ignore status from some units */
4231         data = gk20a_readl(g, gr_status_mask_r());
4232         gk20a_writel(g, gr_status_mask_r(), data & gr->status_disable_mask);
4233
4234         gr_gk20a_init_zbc(g, gr);
4235
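             /*
              * The CBC base register takes the compbit store base divided by
              * num_fbps and right-shifted by the cbc base alignment; the check
              * below rounds the quotient up so the programmed base never falls
              * short of the actual backing store address.
              */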
4236         {
4237                 u64 compbit_base_post_divide64 = (gr->compbit_store.base_pa >>
4238                                 ltc_ltcs_ltss_cbc_base_alignment_shift_v());
4239                 do_div(compbit_base_post_divide64, gr->num_fbps);
4240                 compbit_base_post_divide = u64_lo32(compbit_base_post_divide64);
4241         }
4242
4243         compbit_base_post_multiply64 = ((u64)compbit_base_post_divide *
4244                 gr->num_fbps) << ltc_ltcs_ltss_cbc_base_alignment_shift_v();
4245
4246         if (compbit_base_post_multiply64 < gr->compbit_store.base_pa)
4247                 compbit_base_post_divide++;
4248
4249         gk20a_writel(g, ltc_ltcs_ltss_cbc_base_r(),
4250                 compbit_base_post_divide);
4251
4252         nvhost_dbg(dbg_info | dbg_map | dbg_pte,
4253                    "compbit base.pa: 0x%x,%08x cbc_base:0x%08x\n",
4254                    (u32)(gr->compbit_store.base_pa>>32),
4255                    (u32)(gr->compbit_store.base_pa & 0xffffffff),
4256                    compbit_base_post_divide);
4257
4258         /* load ctx init */
4259         for (i = 0; i < sw_ctx_load->count; i++)
4260                 gk20a_writel(g, sw_ctx_load->l[i].addr,
4261                              sw_ctx_load->l[i].value);
4262
4263         err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4264         if (err)
4265                 goto out;
4266
4267         /* save and disable fe_go_idle */
4268         fe_go_idle_timeout_save =
4269                 gk20a_readl(g, gr_fe_go_idle_timeout_r());
4270         gk20a_writel(g, gr_fe_go_idle_timeout_r(),
4271                 (fe_go_idle_timeout_save & gr_fe_go_idle_timeout_count_f(0)) |
4272                 gr_fe_go_idle_timeout_count_disabled_f());
4273
4274         /* override a few ctx state registers */
4275         gr_gk20a_commit_global_cb_manager(g, NULL, false);
4276         gr_gk20a_commit_global_timeslice(g, NULL, false);
4277
4278         /* floorsweep anything left */
4279         gr_gk20a_ctx_state_floorsweep(g);
4280
4281         err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4282         if (err)
4283                 goto restore_fe_go_idle;
4284
4285         /* enable pipe mode override */
4286         gk20a_writel(g, gr_pipe_bundle_config_r(),
4287                 gr_pipe_bundle_config_override_pipe_mode_enabled_f());
4288
4289         /* load bundle init */
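             /*
              * Bundles are issued as (data, address) register pairs; the data
              * write is skipped when it matches the previous bundle, and a
              * GO_IDLE bundle is followed by a wait for the graphics engine
              * to drain.
              */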
4290         err = 0;
4291         for (i = 0; i < sw_bundle_init->count; i++) {
4292
4293                 if (i == 0 || last_bundle_data != sw_bundle_init->l[i].value) {
4294                         gk20a_writel(g, gr_pipe_bundle_data_r(),
4295                                 sw_bundle_init->l[i].value);
4296                         last_bundle_data = sw_bundle_init->l[i].value;
4297                 }
4298
4299                 gk20a_writel(g, gr_pipe_bundle_address_r(),
4300                              sw_bundle_init->l[i].addr);
4301
4302                 if (gr_pipe_bundle_address_value_v(sw_bundle_init->l[i].addr) ==
4303                     GR_GO_IDLE_BUNDLE)
4304                         err |= gr_gk20a_wait_idle(g, end_jiffies,
4305                                         GR_IDLE_CHECK_DEFAULT);
4306                 else if (0) { /* IS_SILICON */
4307                         u32 delay = GR_IDLE_CHECK_DEFAULT;
4308                         do {
4309                                 u32 gr_status = gk20a_readl(g, gr_status_r());
4310
4311                                 if (gr_status_fe_method_lower_v(gr_status) ==
4312                                     gr_status_fe_method_lower_idle_v())
4313                                         break;
4314
4315                                 usleep_range(delay, delay * 2);
4316                                 delay = min_t(u32, delay << 1,
4317                                         GR_IDLE_CHECK_MAX);
4318
4319                        } while (time_before(jiffies, end_jiffies) ||
4320                                         !tegra_platform_is_silicon());
4321                 }
4322         }
4323
4324         /* disable pipe mode override */
4325         gk20a_writel(g, gr_pipe_bundle_config_r(),
4326                      gr_pipe_bundle_config_override_pipe_mode_disabled_f());
4327
4328 restore_fe_go_idle:
4329         /* restore fe_go_idle */
4330         gk20a_writel(g, gr_fe_go_idle_timeout_r(), fe_go_idle_timeout_save);
4331
4332         if (err || gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT))
4333                 goto out;
4334
4335         /* load method init */
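             /*
              * Method init values are pushed into the MME shadow RAM through
              * the raw data/index register pair; as with the bundles, a data
              * write is only issued when the value changes.
              */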
4336         if (sw_method_init->count) {
4337                 gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
4338                              sw_method_init->l[0].value);
4339                 gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
4340                              gr_pri_mme_shadow_raw_index_write_trigger_f() |
4341                              sw_method_init->l[0].addr);
4342                 last_method_data = sw_method_init->l[0].value;
4343         }
4344         for (i = 1; i < sw_method_init->count; i++) {
4345                 if (sw_method_init->l[i].value != last_method_data) {
4346                         gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
4347                                 sw_method_init->l[i].value);
4348                         last_method_data = sw_method_init->l[i].value;
4349                 }
4350                 gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
4351                         gr_pri_mme_shadow_raw_index_write_trigger_f() |
4352                         sw_method_init->l[i].addr);
4353         }
4354
4355         gk20a_mm_l2_invalidate(g);
4356
4357         /* turn on cya15 bit for a default val that missed the cut */
4358         l1c_dbg_reg_val = gk20a_readl(g, gr_gpc0_tpc0_l1c_dbg_r());
4359         l1c_dbg_reg_val |= gr_gpc0_tpc0_l1c_dbg_cya15_en_f();
4360         gk20a_writel(g, gr_gpc0_tpc0_l1c_dbg_r(), l1c_dbg_reg_val);
4361
4362         err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4363         if (err)
4364                 goto out;
4365
4366 out:
4367         nvhost_dbg_fn("done");
4368         return err;
4369 }
4370
4371 static int gk20a_init_gr_prepare(struct gk20a *g)
4372 {
4373         u32 gpfifo_ctrl, pmc_en;
4374         u32 err = 0;
4375
4376         /* disable fifo access */
4377         pmc_en = gk20a_readl(g, mc_enable_r());
4378         if (pmc_en & mc_enable_pgraph_enabled_f()) {
4379                 gpfifo_ctrl = gk20a_readl(g, gr_gpfifo_ctl_r());
4380                 gpfifo_ctrl &= ~gr_gpfifo_ctl_access_enabled_f();
4381                 gk20a_writel(g, gr_gpfifo_ctl_r(), gpfifo_ctrl);
4382         }
4383
4384         /* reset gr engine */
4385         gk20a_reset(g, mc_enable_pgraph_enabled_f()
4386                         | mc_enable_blg_enabled_f()
4387                         | mc_enable_perfmon_enabled_f());
4388
4389         /* enable fifo access */
4390         gk20a_writel(g, gr_gpfifo_ctl_r(),
4391                 gr_gpfifo_ctl_access_enabled_f() |
4392                 gr_gpfifo_ctl_semaphore_access_enabled_f());
4393
4394         if (!g->gr.ctx_vars.valid) {
4395                 err = gr_gk20a_init_ctx_vars(g, &g->gr);
4396                 if (err)
4397                         nvhost_err(dev_from_gk20a(g),
4398                                 "fail to load gr init ctx");
4399         }
4400         return err;
4401 }
4402
4403 static int gk20a_init_gr_reset_enable_hw(struct gk20a *g)
4404 {
4405         struct gr_gk20a *gr = &g->gr;
4406         struct av_list_gk20a *sw_non_ctx_load = &g->gr.ctx_vars.sw_non_ctx_load;
4407         unsigned long end_jiffies = jiffies +
4408                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
4409         u32 i, err = 0;
4410
4411         nvhost_dbg_fn("");
4412
4413         /* enable interrupts */
4414         gk20a_writel(g, gr_intr_r(), ~0);
4415         gk20a_writel(g, gr_intr_en_r(), ~0);
4416
4417         /* reset ctx switch state */
4418         gr_gk20a_ctx_reset(g, 0);
4419
4420         /* clear scc ram */
4421         gk20a_writel(g, gr_scc_init_r(),
4422                 gr_scc_init_ram_trigger_f());
4423
4424         /* load non_ctx init */
4425         for (i = 0; i < sw_non_ctx_load->count; i++)
4426                 gk20a_writel(g, sw_non_ctx_load->l[i].addr,
4427                         sw_non_ctx_load->l[i].value);
4428
4429         err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4430         if (err)
4431                 goto out;
4432
4433         err = gr_gk20a_load_ctxsw_ucode(g, gr);
4434         if (err)
4435                 goto out;
4436
4437         /* this appears to query sw state, but fecs actually initializes
4438            the ramchain, etc., so this is really hw init */
4439         err = gr_gk20a_init_ctx_state(g, gr);
4440         if (err)
4441                 goto out;
4442
4443 out:
4444         if (err)
4445                 nvhost_err(dev_from_gk20a(g), "fail");
4446         else
4447                 nvhost_dbg_fn("done");
4448
4449         return err;
4450 }
4451
4452 static int gk20a_init_gr_setup_sw(struct gk20a *g)
4453 {
4454         struct gr_gk20a *gr = &g->gr;
4455         int err;
4456
4457         nvhost_dbg_fn("");
4458
4459         if (gr->sw_ready) {
4460                 nvhost_dbg_fn("skip init");
4461                 return 0;
4462         }
4463
4464         gr->g = g;
4465
4466         err = gr_gk20a_init_gr_config(g, gr);
4467         if (err)
4468                 goto clean_up;
4469
4470         err = gr_gk20a_init_mmu_sw(g, gr);
4471         if (err)
4472                 goto clean_up;
4473
4474         err = gr_gk20a_init_map_tiles(g, gr);
4475         if (err)
4476                 goto clean_up;
4477
4478         if (tegra_cpu_is_asim())
4479                 gr->max_comptag_mem = 1; /* MBs worth of comptag coverage */
4480         else {
4481                 nvhost_dbg_info("total ram pages : %lu", totalram_pages);
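                     /*
                      * 10 - (PAGE_SHIFT - 10) == 20 - PAGE_SHIFT, so this turns
                      * the page count into megabytes: comptag coverage is sized
                      * to span all of system RAM.
                      */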
4482                 gr->max_comptag_mem = totalram_pages
4483                                          >> (10 - (PAGE_SHIFT - 10));
4484         }
4485         err = gr_gk20a_init_comptag(g, gr);
4486         if (err)
4487                 goto clean_up;
4488
4489         err = gr_gk20a_init_zcull(g, gr);
4490         if (err)
4491                 goto clean_up;
4492
4493         err = gr_gk20a_alloc_global_ctx_buffers(g);
4494         if (err)
4495                 goto clean_up;
4496
4497         mutex_init(&gr->ctx_mutex);
4498         spin_lock_init(&gr->ch_tlb_lock);
4499
4500         gr->remove_support = gk20a_remove_gr_support;
4501         gr->sw_ready = true;
4502
4503         nvhost_dbg_fn("done");
4504         return 0;
4505
4506 clean_up:
4507         nvhost_err(dev_from_gk20a(g), "fail");
4508         gk20a_remove_gr_support(gr);
4509         return err;
4510 }
4511
4512 int gk20a_init_gr_support(struct gk20a *g)
4513 {
4514         u32 err;
4515
4516         nvhost_dbg_fn("");
4517
4518         err = gk20a_init_gr_prepare(g);
4519         if (err)
4520                 return err;
4521
4522         /* this is required before gr_gk20a_init_ctx_state */
4523         mutex_init(&g->gr.fecs_mutex);
4524
4525         err = gk20a_init_gr_reset_enable_hw(g);
4526         if (err)
4527                 return err;
4528
4529         err = gk20a_init_gr_setup_sw(g);
4530         if (err)
4531                 return err;
4532
4533         err = gk20a_init_gr_setup_hw(g);
4534         if (err)
4535                 return err;
4536
4537         return 0;
4538 }
4539
4540 #define NVA297_SET_ALPHA_CIRCULAR_BUFFER_SIZE   0x02dc
4541 #define NVA297_SET_CIRCULAR_BUFFER_SIZE         0x1280
4542 #define NVA297_SET_SHADER_EXCEPTIONS            0x1528
4543 #define NVA0C0_SET_SHADER_EXCEPTIONS            0x1528
4544
4545 #define NVA297_SET_SHADER_EXCEPTIONS_ENABLE_FALSE 0
4546
4547 struct gr_isr_data {
4548         u32 addr;
4549         u32 data_lo;
4550         u32 data_hi;
4551         u32 curr_ctx;
4552         u32 chid;
4553         u32 offset;
4554         u32 sub_chan;
4555         u32 class_num;
4556 };
4557
4558 static void gk20a_gr_set_shader_exceptions(struct gk20a *g,
4559                                            struct gr_isr_data *isr_data)
4560 {
4561         u32 val;
4562
4563         nvhost_dbg_fn("");
4564
4565         if (isr_data->data_lo ==
4566             NVA297_SET_SHADER_EXCEPTIONS_ENABLE_FALSE)
4567                 val = 0;
4568         else
4569                 val = ~0;
4570
4571         gk20a_writel(g,
4572                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(),
4573                 val);
4574         gk20a_writel(g,
4575                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(),
4576                 val);
4577 }
4578
4579 static void gk20a_gr_set_circular_buffer_size(struct gk20a *g,
4580                         struct gr_isr_data *isr_data)
4581 {
4582         struct gr_gk20a *gr = &g->gr;
4583         u32 gpc_index, ppc_index, stride, val, offset;
4584         u32 cb_size = isr_data->data_lo * 4;
4585
4586         nvhost_dbg_fn("");
4587
4588         if (cb_size > gr->attrib_cb_size)
4589                 cb_size = gr->attrib_cb_size;
4590
4591         gk20a_writel(g, gr_ds_tga_constraintlogic_r(),
4592                 (gk20a_readl(g, gr_ds_tga_constraintlogic_r()) &
4593                  ~gr_ds_tga_constraintlogic_beta_cbsize_f(~0)) |
4594                  gr_ds_tga_constraintlogic_beta_cbsize_f(cb_size));
4595
4596         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4597                 stride = proj_gpc_stride_v() * gpc_index;
4598
4599                 for (ppc_index = 0; ppc_index < gr->gpc_ppc_count[gpc_index];
4600                         ppc_index++) {
4601
4602                         val = gk20a_readl(g, gr_gpc0_ppc0_cbm_cfg_r() +
4603                                 stride +
4604                                 proj_ppc_in_gpc_stride_v() * ppc_index);
4605
4606                         offset = gr_gpc0_ppc0_cbm_cfg_start_offset_v(val);
4607
4608                         val = set_field(val,
4609                                 gr_gpc0_ppc0_cbm_cfg_size_m(),
4610                                 gr_gpc0_ppc0_cbm_cfg_size_f(cb_size *
4611                                         gr->pes_tpc_count[ppc_index][gpc_index]));
4612                         val = set_field(val,
4613                                 gr_gpc0_ppc0_cbm_cfg_start_offset_m(),
4614                                 (offset + 1));
4615
4616                         gk20a_writel(g, gr_gpc0_ppc0_cbm_cfg_r() +
4617                                 stride +
4618                                 proj_ppc_in_gpc_stride_v() * ppc_index, val);
4619
4620                         val = set_field(val,
4621                                 gr_gpc0_ppc0_cbm_cfg_start_offset_m(),
4622                                 offset);
4623
4624                         gk20a_writel(g, gr_gpc0_ppc0_cbm_cfg_r() +
4625                                 stride +
4626                                 proj_ppc_in_gpc_stride_v() * ppc_index, val);
4627                 }
4628         }
4629 }
4630
4631 static void gk20a_gr_set_alpha_circular_buffer_size(struct gk20a *g,
4632                                                 struct gr_isr_data *isr_data)
4633 {
4634         struct gr_gk20a *gr = &g->gr;
4635         u32 gpc_index, ppc_index, stride, val;
4636         u32 pd_ab_max_output;
4637         u32 alpha_cb_size = isr_data->data_lo * 4;
4638
4639         nvhost_dbg_fn("");
4640         /* if (NO_ALPHA_BETA_TIMESLICE_SUPPORT_DEF)
4641                 return; */
4642
4643         if (alpha_cb_size > gr->alpha_cb_size)
4644                 alpha_cb_size = gr->alpha_cb_size;
4645
4646         gk20a_writel(g, gr_ds_tga_constraintlogic_r(),
4647                 (gk20a_readl(g, gr_ds_tga_constraintlogic_r()) &
4648                  ~gr_ds_tga_constraintlogic_alpha_cbsize_f(~0)) |
4649                  gr_ds_tga_constraintlogic_alpha_cbsize_f(alpha_cb_size));
4650
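             /*
              * Convert the alpha CB size from CBM-config granularity into the
              * PD ab_dist_cfg1 max_output granularity before programming it.
              */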
4651         pd_ab_max_output = alpha_cb_size *
4652                 gr_gpc0_ppc0_cbm_cfg_size_granularity_v() /
4653                 gr_pd_ab_dist_cfg1_max_output_granularity_v();
4654
4655         gk20a_writel(g, gr_pd_ab_dist_cfg1_r(),
4656                 gr_pd_ab_dist_cfg1_max_output_f(pd_ab_max_output));
4657
4658         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4659                 stride = proj_gpc_stride_v() * gpc_index;
4660
4661                 for (ppc_index = 0; ppc_index < gr->gpc_ppc_count[gpc_index];
4662                         ppc_index++) {
4663
4664                         val = gk20a_readl(g, gr_gpc0_ppc0_cbm_cfg2_r() +
4665                                 stride +
4666                                 proj_ppc_in_gpc_stride_v() * ppc_index);
4667
4668                         val = set_field(val, gr_gpc0_ppc0_cbm_cfg2_size_m(),
4669                                         gr_gpc0_ppc0_cbm_cfg2_size_f(alpha_cb_size *
4670                                                 gr->pes_tpc_count[ppc_index][gpc_index]));
4671
4672                         gk20a_writel(g, gr_gpc0_ppc0_cbm_cfg2_r() +
4673                                 stride +
4674                                 proj_ppc_in_gpc_stride_v() * ppc_index, val);
4675                 }
4676         }
4677 }
4678
4679 void gk20a_gr_reset(struct gk20a *g)
4680 {
4681         int err;
4682         err = gk20a_init_gr_prepare(g);
4683         BUG_ON(err);
4684         err = gk20a_init_gr_reset_enable_hw(g);
4685         BUG_ON(err);
4686         err = gk20a_init_gr_setup_hw(g);
4687         BUG_ON(err);
4688 }
4689
4690 static int gk20a_gr_handle_illegal_method(struct gk20a *g,
4691                                           struct gr_isr_data *isr_data)
4692 {
4693         nvhost_dbg_fn("");
4694
4695         if (isr_data->class_num == KEPLER_COMPUTE_A) {
4696                 switch (isr_data->offset << 2) {
4697                 case NVA0C0_SET_SHADER_EXCEPTIONS:
4698                         gk20a_gr_set_shader_exceptions(g, isr_data);
4699                         break;
4700                 default:
4701                         goto fail;
4702                 }
4703         }
4704
4705         if (isr_data->class_num == KEPLER_C) {
4706                 switch (isr_data->offset << 2) {
4707                 case NVA297_SET_SHADER_EXCEPTIONS:
4708                         gk20a_gr_set_shader_exceptions(g, isr_data);
4709                         break;
4710                 case NVA297_SET_CIRCULAR_BUFFER_SIZE:
4711                         gk20a_gr_set_circular_buffer_size(g, isr_data);
4712                         break;
4713                 case NVA297_SET_ALPHA_CIRCULAR_BUFFER_SIZE:
4714                         gk20a_gr_set_alpha_circular_buffer_size(g, isr_data);
4715                         break;
4716                 default:
4717                         goto fail;
4718                 }
4719         }
4720         return 0;
4721
4722 fail:
4723         nvhost_err(dev_from_gk20a(g), "invalid method class 0x%08x"
4724                 ", offset 0x%08x, address 0x%08x\n",
4725                 isr_data->class_num, isr_data->offset, isr_data->addr);
4726         return -EINVAL;
4727 }
4728
4729 static int gk20a_gr_handle_semaphore_timeout_pending(struct gk20a *g,
4730                   struct gr_isr_data *isr_data)
4731 {
4732         struct fifo_gk20a *f = &g->fifo;
4733         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4734         nvhost_dbg_fn("");
4735         gk20a_set_error_notifier(ch->hwctx,
4736                                 NVHOST_CHANNEL_GR_SEMAPHORE_TIMEOUT);
4737         nvhost_err(dev_from_gk20a(g),
4738                    "gr semaphore timeout\n");
4739         return -EINVAL;
4740 }
4741
4742 static int gk20a_gr_intr_illegal_notify_pending(struct gk20a *g,
4743                   struct gr_isr_data *isr_data)
4744 {
4745         struct fifo_gk20a *f = &g->fifo;
4746         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4747         nvhost_dbg_fn("");
4748         gk20a_set_error_notifier(ch->hwctx,
4749                                 NVHOST_CHANNEL_GR_ILLEGAL_NOTIFY);
4750         /* This is an unrecoverable error, reset is needed */
4751         nvhost_err(dev_from_gk20a(g),
4752                    "gr illegal notify pending\n");
4753         return -EINVAL;
4754 }
4755
4756 static int gk20a_gr_handle_illegal_class(struct gk20a *g,
4757                                           struct gr_isr_data *isr_data)
4758 {
4759         struct fifo_gk20a *f = &g->fifo;
4760         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4761         nvhost_dbg_fn("");
4762         gk20a_set_error_notifier(ch->hwctx,
4763                                 NVHOST_CHANNEL_GR_ERROR_SW_NOTIFY);
4764         nvhost_err(dev_from_gk20a(g),
4765                    "invalid class 0x%08x, offset 0x%08x",
4766                    isr_data->class_num, isr_data->offset);
4767         return -EINVAL;
4768 }
4769
4770 static int gk20a_gr_handle_class_error(struct gk20a *g,
4771                                           struct gr_isr_data *isr_data)
4772 {
4773         struct fifo_gk20a *f = &g->fifo;
4774         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4775         nvhost_dbg_fn("");
4776
4777         gk20a_set_error_notifier(ch->hwctx,
4778                         NVHOST_CHANNEL_GR_ERROR_SW_NOTIFY);
4779         nvhost_err(dev_from_gk20a(g),
4780                    "class error 0x%08x, offset 0x%08x",
4781                    isr_data->class_num, isr_data->offset);
4782         return -EINVAL;
4783 }
4784
4785 static int gk20a_gr_handle_semaphore_pending(struct gk20a *g,
4786                                              struct gr_isr_data *isr_data)
4787 {
4788         struct fifo_gk20a *f = &g->fifo;
4789         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4790
4791         wake_up(&ch->semaphore_wq);
4792
4793         return 0;
4794 }
4795
4796 static int gk20a_gr_handle_notify_pending(struct gk20a *g,
4797                                           struct gr_isr_data *isr_data)
4798 {
4799         struct fifo_gk20a *f = &g->fifo;
4800         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4801
4802 #if defined(CONFIG_TEGRA_GPU_CYCLE_STATS)
4803         void *virtual_address;
4804         u32 buffer_size;
4805         u32 offset;
4806         u32 new_offset;
4807         bool exit;
4808         struct share_buffer_head *sh_hdr;
4809         u32 raw_reg;
4810         u64 mask_orig;
4811         u64 v = 0;
4812         struct gk20a_cyclestate_buffer_elem *op_elem;
4813         /* GL will never use payload 0 for cycle state */
4814         if ((ch->cyclestate.cyclestate_buffer == NULL) || (isr_data->data_lo == 0))
4815                 return 0;
4816
4817         mutex_lock(&ch->cyclestate.cyclestate_buffer_mutex);
4818
4819         virtual_address = ch->cyclestate.cyclestate_buffer;
4820         buffer_size = ch->cyclestate.cyclestate_buffer_size;
4821         offset = isr_data->data_lo;
4822         exit = false;
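             /*
              * Walk the cyclestate command list: each entry begins with a
              * share_buffer_head and is processed in place (BAR0 reads fill in
              * op_elem->data, BAR0 writes are applied under a first/last bit
              * mask) until an OP_END entry, an undersized header, or the end
              * of the buffer is reached.
              */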
4823         while (!exit) {
4824                 if (offset >= buffer_size) {
4825                         WARN_ON(1);
4826                         break;
4827                 }
4828
4829                 sh_hdr = (struct share_buffer_head *)
4830                         ((char *)virtual_address + offset);
4831
4832                 if (sh_hdr->size < sizeof(struct share_buffer_head)) {
4833                         WARN_ON(1);
4834                         break;
4835                 }
4836                 new_offset = offset + sh_hdr->size;
4837
4838                 switch (sh_hdr->operation) {
4839                 case OP_END:
4840                         exit = true;
4841                         break;
4842
4843                 case BAR0_READ32:
4844                 case BAR0_WRITE32:
4845                 {
4846                         op_elem =
4847                                 (struct gk20a_cyclestate_buffer_elem *)
4848                                         sh_hdr;
4849                         if (op_elem->offset_bar0 <
4850                                 TEGRA_GK20A_BAR0_SIZE) {
4851                                 mask_orig =
4852                                         ((1ULL <<
4853                                         (op_elem->last_bit + 1))
4854                                         -1)&~((1ULL <<
4855                                         op_elem->first_bit)-1);
4856
4857                                 raw_reg =
4858                                         gk20a_readl(g,
4859                                                 op_elem->offset_bar0);
4860
4861                                 switch (sh_hdr->operation) {
4862                                 case BAR0_READ32:
4863                                         op_elem->data =
4864                                         (raw_reg & mask_orig)
4865                                                 >> op_elem->first_bit;
4866                                         break;
4867
4868                                 case BAR0_WRITE32:
4869                                         v = 0;
4870                                         if ((unsigned int)mask_orig !=
4871                                         (unsigned int)~0) {
4872                                                 v = (unsigned int)
4873                                                         (raw_reg & ~mask_orig);
4874                                         }
4875
4876                                         v |= ((op_elem->data
4877                                                 << op_elem->first_bit)
4878                                                 & mask_orig);
4879
4880                                         gk20a_writel(g,
4881                                                 op_elem->offset_bar0,
4882                                                 (unsigned int)v);
4883                                         break;
4884
4885                                 default:
4886                                         break;
4887                                 }
4888                         } else {
4889                                 sh_hdr->failed = true;
4890                                 WARN_ON(1);
4891                         }
4892                 }
4893                 break;
4894                 default:
4895                 /* no operation content case */
4896                         exit = true;
4897                         break;
4898                 }
4899                 sh_hdr->completed = true;
4900                 offset = new_offset;
4901         }
4902         mutex_unlock(&ch->cyclestate.cyclestate_buffer_mutex);
4903 #endif
4904         nvhost_dbg_fn("");
4905         wake_up(&ch->notifier_wq);
4906         return 0;
4907 }
4908
4909 /* Used by sw interrupt thread to translate current ctx to chid.
4910  * For performance, we don't want to go through 128 channels every time.
4911  * A small tlb is used here to cache translation */
4912 static int gk20a_gr_get_chid_from_ctx(struct gk20a *g, u32 curr_ctx)
4913 {
4914         struct fifo_gk20a *f = &g->fifo;
4915         struct gr_gk20a *gr = &g->gr;
4916         u32 chid = -1;
4917         u32 i;
4918
4919         spin_lock(&gr->ch_tlb_lock);
4920
4921         /* check cache first */
4922         for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) {
4923                 if (gr->chid_tlb[i].curr_ctx == curr_ctx) {
4924                         chid = gr->chid_tlb[i].hw_chid;
4925                         goto unlock;
4926                 }
4927         }
4928
4929         /* slow path */
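             /*
              * gr_fecs_current_ctx holds the context's instance block PA
              * shifted down by ram_in_base_shift; compare it against every
              * in-use channel's instance block to recover the channel id.
              */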
4930         for (chid = 0; chid < f->num_channels; chid++) {
4931                 if (f->channel[chid].in_use &&
4932                     (u32)(f->channel[chid].inst_block.cpu_pa >>
4933                         ram_in_base_shift_v()) ==
4934                         gr_fecs_current_ctx_ptr_v(curr_ctx))
4935                         break;
4936         }
4937
4938         if (chid >= f->num_channels) {
4939                 chid = -1;
4940                 goto unlock;
4941         }
4942
4943         /* add to free tlb entry */
4944         for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) {
4945                 if (gr->chid_tlb[i].curr_ctx == 0) {
4946                         gr->chid_tlb[i].curr_ctx = curr_ctx;
4947                         gr->chid_tlb[i].hw_chid = chid;
4948                         goto unlock;
4949                 }
4950         }
4951
4952         /* no free entry, flush one */
4953         gr->chid_tlb[gr->channel_tlb_flush_index].curr_ctx = curr_ctx;
4954         gr->chid_tlb[gr->channel_tlb_flush_index].hw_chid = chid;
4955
4956         gr->channel_tlb_flush_index =
4957                 (gr->channel_tlb_flush_index + 1) &
4958                 (GR_CHANNEL_MAP_TLB_SIZE - 1);
4959
4960 unlock:
4961         spin_unlock(&gr->ch_tlb_lock);
4962         return chid;
4963 }
4964
4965 static int gk20a_gr_lock_down_sm(struct gk20a *g, u32 global_esr_mask)
4966 {
4967         unsigned long end_jiffies = jiffies +
4968                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
4969         u32 delay = GR_IDLE_CHECK_DEFAULT;
4970         bool mmu_debug_mode_enabled = gk20a_mm_mmu_debug_mode_enabled(g);
4971         u32 dbgr_control0;
4972
4973         nvhost_dbg(dbg_intr | dbg_gpu_dbg, "locking down SM");
4974
4975         /* assert stop trigger */
4976         dbgr_control0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());
4977         dbgr_control0 |= gr_gpc0_tpc0_sm_dbgr_control0_stop_trigger_enable_f();
4978         gk20a_writel(g, gr_gpc0_tpc0_sm_dbgr_control0_r(), dbgr_control0);
4979
4980         /* wait for the sm to lock down */
4981         do {
4982                 u32 global_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r());
4983                 u32 warp_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_warp_esr_r());
4984                 u32 dbgr_status0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_status0_r());
4985                 bool locked_down =
4986                         (gr_gpc0_tpc0_sm_dbgr_status0_locked_down_v(dbgr_status0) ==
4987                          gr_gpc0_tpc0_sm_dbgr_status0_locked_down_true_v());
4988                 bool error_pending =
4989                         (gr_gpc0_tpc0_sm_hww_warp_esr_error_v(warp_esr) !=
4990                          gr_gpc0_tpc0_sm_hww_warp_esr_error_none_v()) ||
4991                         ((global_esr & ~global_esr_mask) != 0);
4992
4993                 if (locked_down || !error_pending) {
4994                         nvhost_dbg(dbg_intr | dbg_gpu_dbg, "locked down SM");
4995
4996                         /* de-assert stop trigger */
4997                         dbgr_control0 &= ~gr_gpc0_tpc0_sm_dbgr_control0_stop_trigger_enable_f();
4998                         gk20a_writel(g, gr_gpc0_tpc0_sm_dbgr_control0_r(), dbgr_control0);
4999
5000                         return 0;
5001                 }
5002
5003                 /* if an mmu fault is pending and mmu debug mode is not
5004                  * enabled, the sm will never lock down. */
5005                 if (!mmu_debug_mode_enabled && gk20a_fifo_mmu_fault_pending(g)) {
5006                         nvhost_err(dev_from_gk20a(g), "mmu fault pending, sm will"
5007                                    " never lock down!");
5008                         return -EFAULT;
5009                 }
5010
5011                 usleep_range(delay, delay * 2);
5012                 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
5013
5014         } while (time_before(jiffies, end_jiffies));
5015
5016         nvhost_err(dev_from_gk20a(g), "timed out while trying to lock down SM");
5017
5018         return -EAGAIN;
5019 }
5020
5021 bool gk20a_gr_sm_debugger_attached(struct gk20a *g)
5022 {
5023         u32 dbgr_control0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());
5024
5025         /* check if an sm debugger is attached */
5026         if (gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_v(dbgr_control0) ==
5027                         gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_on_v())
5028                 return true;
5029
5030         return false;
5031 }
5032
5033 static void gk20a_gr_clear_sm_hww(struct gk20a *g, u32 global_esr)
5034 {
5035         gk20a_writel(g, gr_gpc0_tpc0_sm_hww_global_esr_r(), global_esr);
5036
5037         /* clear the warp hww */
5038         gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_r(),
5039                         gr_gpc0_tpc0_sm_hww_warp_esr_error_none_f());
5040 }
5041
5042 static struct channel_gk20a *
5043 channel_from_hw_chid(struct gk20a *g, u32 hw_chid)
5044 {
5045         return g->fifo.channel+hw_chid;
5046 }
5047
5048 static int gk20a_gr_handle_sm_exception(struct gk20a *g,
5049                 struct gr_isr_data *isr_data)
5050 {
5051         int ret = 0;
5052         bool do_warp_sync = false;
5053         /* these three interrupts don't require locking down the SM. They can
5054          * be handled by usermode clients as they aren't fatal. Additionally,
5055          * usermode clients may wish to allow some warps to execute while others
5056          * are at breakpoints, as opposed to fatal errors where all warps should
5057          * halt. */
5058         u32 global_mask = gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f()   |
5059                           gr_gpc0_tpc0_sm_hww_global_esr_bpt_pause_pending_f() |
5060                           gr_gpc0_tpc0_sm_hww_global_esr_single_step_complete_pending_f();
5061         u32 global_esr, warp_esr;
5062         bool sm_debugger_attached = gk20a_gr_sm_debugger_attached(g);
5063         struct channel_gk20a *fault_ch;
5064
5065         nvhost_dbg(dbg_fn | dbg_gpu_dbg, "");
5066
5067         global_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r());
5068         warp_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_warp_esr_r());
5069
5070         /* if an sm debugger is attached, disable forwarding of tpc exceptions.
5071          * the debugger will reenable exceptions after servicing them. */
5072         if (sm_debugger_attached) {
5073                 u32 tpc_exception_en = gk20a_readl(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r());
5074                 tpc_exception_en &= ~gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_enabled_f();
5075                 gk20a_writel(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r(), tpc_exception_en);
5076                 nvhost_dbg(dbg_intr | dbg_gpu_dbg, "SM debugger attached");
5077         }
5078
5079         /* if a debugger is present and an error has occurred, do a warp sync */
5080         if (sm_debugger_attached && ((warp_esr != 0) || ((global_esr & ~global_mask) != 0))) {
5081                 nvhost_dbg(dbg_intr, "warp sync needed");
5082                 do_warp_sync = true;
5083         }
5084
5085         if (do_warp_sync) {
5086                 ret = gk20a_gr_lock_down_sm(g, global_mask);
5087                 if (ret) {
5088                         nvhost_err(dev_from_gk20a(g), "sm did not lock down!\n");
5089                         return ret;
5090                 }
5091         }
5092
5093         /* finally, signal any client waiting on an event */
5094         fault_ch = channel_from_hw_chid(g, isr_data->chid);
5095         if (fault_ch)
5096                 gk20a_dbg_gpu_post_events(fault_ch);
5097
5098         return ret;
5099 }
5100
5101 static int gk20a_gr_handle_tpc_exception(struct gk20a *g,
5102                 struct gr_isr_data *isr_data)
5103 {
5104         int ret = 0;
5105         u32 tpc_exception = gk20a_readl(g, gr_gpcs_tpcs_tpccs_tpc_exception_r());
5106
5107         nvhost_dbg(dbg_intr | dbg_gpu_dbg, "");
5108
5109         /* check if an sm exception is pending */
5110         if (gr_gpcs_tpcs_tpccs_tpc_exception_sm_v(tpc_exception) ==
5111                         gr_gpcs_tpcs_tpccs_tpc_exception_sm_pending_v()) {
5112                 nvhost_dbg(dbg_intr | dbg_gpu_dbg, "SM exception pending");
5113                 ret = gk20a_gr_handle_sm_exception(g, isr_data);
5114         }
5115
5116         return ret;
5117 }
5118
5119 static int gk20a_gr_handle_gpc_exception(struct gk20a *g,
5120                 struct gr_isr_data *isr_data)
5121 {
5122         int ret = 0;
5123         u32 gpc_exception = gk20a_readl(g, gr_gpcs_gpccs_gpc_exception_r());
5124
5125         nvhost_dbg(dbg_intr | dbg_gpu_dbg, "");
5126
5127         /* check if tpc 0 has an exception */
5128         if (gr_gpcs_gpccs_gpc_exception_tpc_v(gpc_exception) ==
5129                         gr_gpcs_gpccs_gpc_exception_tpc_0_pending_v()) {
5130                 nvhost_dbg(dbg_intr | dbg_gpu_dbg, "TPC exception pending");
5131                 ret = gk20a_gr_handle_tpc_exception(g, isr_data);
5132         }
5133
5134         return ret;
5135 }
5136
5137 int gk20a_gr_isr(struct gk20a *g)
5138 {
5139         struct gr_isr_data isr_data;
5140         u32 grfifo_ctl;
5141         u32 obj_table;
5142         int need_reset = 0;
5143         u32 gr_intr = gk20a_readl(g, gr_intr_r());
5144
5145         nvhost_dbg_fn("");
5146         nvhost_dbg(dbg_intr, "pgraph intr %08x", gr_intr);
5147
5148         if (!gr_intr)
5149                 return 0;
5150
5151         grfifo_ctl = gk20a_readl(g, gr_gpfifo_ctl_r());
5152         grfifo_ctl &= ~gr_gpfifo_ctl_semaphore_access_f(1);
5153         grfifo_ctl &= ~gr_gpfifo_ctl_access_f(1);
5154
5155         gk20a_writel(g, gr_gpfifo_ctl_r(),
5156                 grfifo_ctl | gr_gpfifo_ctl_access_f(0) |
5157                 gr_gpfifo_ctl_semaphore_access_f(0));
5158
5159         isr_data.addr = gk20a_readl(g, gr_trapped_addr_r());
5160         isr_data.data_lo = gk20a_readl(g, gr_trapped_data_lo_r());
5161         isr_data.data_hi = gk20a_readl(g, gr_trapped_data_hi_r());
5162         isr_data.curr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r());
5163         isr_data.offset = gr_trapped_addr_mthd_v(isr_data.addr);
5164         isr_data.sub_chan = gr_trapped_addr_subch_v(isr_data.addr);
5165         obj_table = gk20a_readl(g,
5166                 gr_fe_object_table_r(isr_data.sub_chan));
5167         isr_data.class_num = gr_fe_object_table_nvclass_v(obj_table);
5168
5169         isr_data.chid =
5170                 gk20a_gr_get_chid_from_ctx(g, isr_data.curr_ctx);
5171         if (isr_data.chid == -1) {
5172                 nvhost_err(dev_from_gk20a(g), "invalid channel ctx 0x%08x",
5173                            isr_data.curr_ctx);
5174                 goto clean_up;
5175         }
5176
5177         nvhost_dbg(dbg_intr | dbg_gpu_dbg,
5178                 "channel %d: addr 0x%08x, "
5179                 "data 0x%08x 0x%08x,"
5180                 "ctx 0x%08x, offset 0x%08x, "
5181                 "subchannel 0x%08x, class 0x%08x",
5182                 isr_data.chid, isr_data.addr,
5183                 isr_data.data_hi, isr_data.data_lo,
5184                 isr_data.curr_ctx, isr_data.offset,
5185                 isr_data.sub_chan, isr_data.class_num);
5186
5187         if (gr_intr & gr_intr_notify_pending_f()) {
5188                 gk20a_gr_handle_notify_pending(g, &isr_data);
5189                 gk20a_writel(g, gr_intr_r(),
5190                         gr_intr_notify_reset_f());
5191                 gr_intr &= ~gr_intr_notify_pending_f();
5192         }
5193
5194         if (gr_intr & gr_intr_semaphore_pending_f()) {
5195                 gk20a_gr_handle_semaphore_pending(g, &isr_data);
5196                 gk20a_writel(g, gr_intr_r(),
5197                         gr_intr_semaphore_reset_f());
5198                 gr_intr &= ~gr_intr_semaphore_pending_f();
5199         }
5200
5201         if (gr_intr & gr_intr_semaphore_timeout_pending_f()) {
5202                 need_reset |= gk20a_gr_handle_semaphore_timeout_pending(g,
5203                         &isr_data);
5204                 gk20a_writel(g, gr_intr_r(),
5205                         gr_intr_semaphore_reset_f());
5206                 gr_intr &= ~gr_intr_semaphore_timeout_pending_f();
5207         }
5208
5209         if (gr_intr & gr_intr_illegal_notify_pending_f()) {
5210                 need_reset |= gk20a_gr_intr_illegal_notify_pending(g,
5211                         &isr_data);
5212                 gk20a_writel(g, gr_intr_r(),
5213                         gr_intr_illegal_notify_reset_f());
5214                 gr_intr &= ~gr_intr_illegal_notify_pending_f();
5215         }
5216
5217         if (gr_intr & gr_intr_illegal_method_pending_f()) {
5218                 need_reset |= gk20a_gr_handle_illegal_method(g, &isr_data);
5219                 gk20a_writel(g, gr_intr_r(),
5220                         gr_intr_illegal_method_reset_f());
5221                 gr_intr &= ~gr_intr_illegal_method_pending_f();
5222         }
5223
5224         if (gr_intr & gr_intr_illegal_class_pending_f()) {
5225                 need_reset |= gk20a_gr_handle_illegal_class(g, &isr_data);
5226                 gk20a_writel(g, gr_intr_r(),
5227                         gr_intr_illegal_class_reset_f());
5228                 gr_intr &= ~gr_intr_illegal_class_pending_f();
5229         }
5230
5231         if (gr_intr & gr_intr_class_error_pending_f()) {
5232                 need_reset |= gk20a_gr_handle_class_error(g, &isr_data);
5233                 gk20a_writel(g, gr_intr_r(),
5234                         gr_intr_class_error_reset_f());
5235                 gr_intr &= ~gr_intr_class_error_pending_f();
5236         }
5237
5238         if (gr_intr & gr_intr_exception_pending_f()) {
5239                 u32 exception = gk20a_readl(g, gr_exception_r());
5240                 struct fifo_gk20a *f = &g->fifo;
5241                 struct channel_gk20a *ch = &f->channel[isr_data.chid];
5242
5243                 gk20a_set_error_notifier(ch->hwctx,
5244                                         NVHOST_CHANNEL_GR_ERROR_SW_NOTIFY);
5245
5246                 nvhost_dbg(dbg_intr | dbg_gpu_dbg, "exception %08x\n", exception);
5247
5248                 if (exception & gr_exception_fe_m()) {
5249                         u32 fe = gk20a_readl(g, gr_fe_hww_esr_r());
5250                         nvhost_dbg(dbg_intr, "fe warning %08x\n", fe);
5251                         gk20a_writel(g, gr_fe_hww_esr_r(), fe);
5252                 }
5253
5254                 /* check if a gpc exception has occurred */
5255                 if (exception & gr_exception_gpc_m() && need_reset == 0) {
5256                         u32 exception1 = gk20a_readl(g, gr_exception1_r());
5257                         u32 global_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r());
5258
5259                         nvhost_dbg(dbg_intr | dbg_gpu_dbg, "GPC exception pending");
5260
5261                         /* if no sm debugger is present, clean up the channel */
5262                         if (!gk20a_gr_sm_debugger_attached(g)) {
5263                                 nvhost_dbg(dbg_intr | dbg_gpu_dbg,
5264                                            "SM debugger not attached, clearing interrupt");
5265                                 need_reset |= -EFAULT;
5266                         }
5267                         else {
5268                                 /* check if gpc 0 has an exception */
5269                                 if (exception1 & gr_exception1_gpc_0_pending_f())
5270                                         need_reset |= gk20a_gr_handle_gpc_exception(g, &isr_data);
5271                                 /* clear the hwws, also causes tpc and gpc
5272                                  * exceptions to be cleared */
5273                                 gk20a_gr_clear_sm_hww(g, global_esr);
5274                         }
5275
5276                 }
5277
5278                 gk20a_writel(g, gr_intr_r(), gr_intr_exception_reset_f());
5279                 gr_intr &= ~gr_intr_exception_pending_f();
5280         }
5281
5282         if (need_reset)
5283                 gk20a_fifo_recover(g, BIT(ENGINE_GR_GK20A));
5284
5285 clean_up:
5286         gk20a_writel(g, gr_gpfifo_ctl_r(),
5287                 grfifo_ctl | gr_gpfifo_ctl_access_f(1) |
5288                 gr_gpfifo_ctl_semaphore_access_f(1));
5289
5290         if (gr_intr)
5291                 nvhost_err(dev_from_gk20a(g),
5292                            "unhandled gr interrupt 0x%08x", gr_intr);
5293
5294         return 0;
5295 }
5296
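/*
 * Nonstalling PGRAPH interrupts do not halt the engine.  The only source
 * handled here is the trap interrupt, which is serviced by waking up channel
 * semaphore waiters; only the bits actually handled are written back to the
 * ack register.
 */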
5297 int gk20a_gr_nonstall_isr(struct gk20a *g)
5298 {
5299         u32 gr_intr = gk20a_readl(g, gr_intr_nonstall_r());
5300         u32 clear_intr = 0;
5301
5302         nvhost_dbg(dbg_intr, "pgraph nonstall intr %08x", gr_intr);
5303
5304         if (gr_intr & gr_intr_nonstall_trap_pending_f()) {
5305                 gk20a_channel_semaphore_wakeup(g);
5306                 clear_intr |= gr_intr_nonstall_trap_pending_f();
5307         }
5308
5309         gk20a_writel(g, gr_intr_nonstall_r(), clear_intr);
5310
5311         return 0;
5312 }
5313
5314 int gr_gk20a_fecs_get_reglist_img_size(struct gk20a *g, u32 *size)
5315 {
5316         BUG_ON(size == NULL);
5317         return gr_gk20a_submit_fecs_method_op(g,
5318                    (struct fecs_method_op_gk20a) {
5319                            .mailbox.id = 0,
5320                            .mailbox.data = 0,
5321                            .mailbox.clr = ~0,
5322                            .method.data = 1,
5323                            .method.addr = gr_fecs_method_push_adr_discover_reglist_image_size_v(),
5324                            .mailbox.ret = size,
5325                            .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL,
5326                            .mailbox.ok = 0,
5327                            .cond.fail = GR_IS_UCODE_OP_SKIP,
5328                            .mailbox.fail = 0});
5329 }
5330
5331 int gr_gk20a_fecs_set_reglist_bind_inst(struct gk20a *g, phys_addr_t addr)
5332 {
5333         return gr_gk20a_submit_fecs_method_op(g,
5334                    (struct fecs_method_op_gk20a){
5335                            .mailbox.id = 4,
5336                            .mailbox.data = (gr_fecs_current_ctx_ptr_f(addr >> 12) |
5337                                             gr_fecs_current_ctx_valid_f(1) |
5338                                             gr_fecs_current_ctx_target_vid_mem_f()),
5339                            .mailbox.clr = ~0,
5340                            .method.data = 1,
5341                            .method.addr = gr_fecs_method_push_adr_set_reglist_bind_instance_v(),
5342                            .mailbox.ret = NULL,
5343                            .cond.ok = GR_IS_UCODE_OP_EQUAL,
5344                            .mailbox.ok = 1,
5345                            .cond.fail = GR_IS_UCODE_OP_SKIP,
5346                            .mailbox.fail = 0});
5347 }
5348
5349 int gr_gk20a_fecs_set_reglist_virual_addr(struct gk20a *g, u64 pmu_va)
5350 {
5351         return gr_gk20a_submit_fecs_method_op(g,
5352                    (struct fecs_method_op_gk20a) {
5353                            .mailbox.id = 4,
5354                            .mailbox.data = u64_lo32(pmu_va >> 8),
5355                            .mailbox.clr = ~0,
5356                            .method.data = 1,
5357                            .method.addr = gr_fecs_method_push_adr_set_reglist_virtual_address_v(),
5358                            .mailbox.ret = NULL,
5359                            .cond.ok = GR_IS_UCODE_OP_EQUAL,
5360                            .mailbox.ok = 1,
5361                            .cond.fail = GR_IS_UCODE_OP_SKIP,
5362                            .mailbox.fail = 0});
5363 }
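/*
 * The three FECS reglist helpers above are normally used together when
 * handing the context save/restore register list buffer to the PMU.  A
 * minimal usage sketch (the actual caller, buffer allocation and error
 * handling live outside this file; inst_block_pa and reglist_pmu_va are
 * placeholder names):
 *
 *      u32 size = 0;
 *
 *      gr_gk20a_fecs_get_reglist_img_size(g, &size);
 *      ... allocate and map a buffer of 'size' bytes for the PMU ...
 *      gr_gk20a_fecs_set_reglist_bind_inst(g, inst_block_pa);
 *      gr_gk20a_fecs_set_reglist_virual_addr(g, reglist_pmu_va);
 */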
5364
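/*
 * Quiesce the graphics engine for suspend: wait for idle, block gpfifo
 * access, mask the gr interrupt and all exception enables, and flush the
 * cached channel/context lookup (TLB) state.
 */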
5365 int gk20a_gr_suspend(struct gk20a *g)
5366 {
5367         unsigned long end_jiffies = jiffies +
5368                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
5369         u32 ret = 0;
5370
5371         nvhost_dbg_fn("");
5372
5373         ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
5374         if (ret)
5375                 return ret;
5376
5377         gk20a_writel(g, gr_gpfifo_ctl_r(),
5378                 gr_gpfifo_ctl_access_disabled_f());
5379
5380         /* disable gr intr */
5381         gk20a_writel(g, gr_intr_r(), 0);
5382         gk20a_writel(g, gr_intr_en_r(), 0);
5383
5384         /* disable all exceptions */
5385         gk20a_writel(g, gr_exception_r(), 0);
5386         gk20a_writel(g, gr_exception_en_r(), 0);
5387         gk20a_writel(g, gr_exception1_r(), 0);
5388         gk20a_writel(g, gr_exception1_en_r(), 0);
5389         gk20a_writel(g, gr_exception2_r(), 0);
5390         gk20a_writel(g, gr_exception2_en_r(), 0);
5391
5392         gk20a_gr_flush_channel_tlb(&g->gr);
5393
5394         nvhost_dbg_fn("done");
5395         return ret;
5396 }
5397
5398 static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
5399                                                u32 addr,
5400                                                bool is_quad, u32 quad,
5401                                                u32 *context_buffer,
5402                                                u32 context_buffer_size,
5403                                                u32 *priv_offset);
5404
5405 /* This function will decode a priv address and return the partition type and numbers. */
5406 int gr_gk20a_decode_priv_addr(struct gk20a *g, u32 addr,
5407                               int  *addr_type, /* enum ctxsw_addr_type */
5408                               u32 *gpc_num, u32 *tpc_num, u32 *ppc_num, u32 *be_num,
5409                               u32 *broadcast_flags)
5410 {
5411         u32 gpc_addr;
5412         u32 ppc_address;
5413         u32 ppc_broadcast_addr;
5414
5415         nvhost_dbg(dbg_fn | dbg_gpu_dbg, "addr=0x%x", addr);
5416
5417         /* setup defaults */
5418         ppc_address = 0;
5419         ppc_broadcast_addr = 0;
5420         *addr_type = CTXSW_ADDR_TYPE_SYS;
5421         *broadcast_flags = PRI_BROADCAST_FLAGS_NONE;
5422         *gpc_num = 0;
5423         *tpc_num = 0;
5424         *ppc_num = 0;
5425         *be_num  = 0;
5426
5427         if (pri_is_gpc_addr(addr)) {
5428                 *addr_type = CTXSW_ADDR_TYPE_GPC;
5429                 gpc_addr = pri_gpccs_addr_mask(addr);
5430                 if (pri_is_gpc_addr_shared(addr)) {
5431                         *addr_type = CTXSW_ADDR_TYPE_GPC;
5432                         *broadcast_flags |= PRI_BROADCAST_FLAGS_GPC;
5433                 } else
5434                         *gpc_num = pri_get_gpc_num(addr);
5435
5436                 if (pri_is_tpc_addr(gpc_addr)) {
5437                         *addr_type = CTXSW_ADDR_TYPE_TPC;
5438                         if (pri_is_tpc_addr_shared(gpc_addr)) {
5439                                 *broadcast_flags |= PRI_BROADCAST_FLAGS_TPC;
5440                                 return 0;
5441                         }
5442                         *tpc_num = pri_get_tpc_num(gpc_addr);
5443                 }
5444                 return 0;
5445         } else if (pri_is_be_addr(addr)) {
5446                 *addr_type = CTXSW_ADDR_TYPE_BE;
5447                 if (pri_is_be_addr_shared(addr)) {
5448                         *broadcast_flags |= PRI_BROADCAST_FLAGS_BE;
5449                         return 0;
5450                 }
5451                 *be_num = pri_get_be_num(addr);
5452                 return 0;
5453         } else {
5454                 *addr_type = CTXSW_ADDR_TYPE_SYS;
5455                 return 0;
5456         }
5457         /* TODO: PPC addresses are not decoded here. */
5458
5459         /*NOTREACHED*/
5460         return -EINVAL;
5461 }
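/*
 * The broadcast_flags returned above tell the caller whether the address
 * still refers to a shared (broadcast) register range; such addresses must
 * be expanded into unicast addresses by gr_gk20a_create_priv_addr_table()
 * before they can be located in the context image.
 */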
5462
5463 static int gr_gk20a_split_ppc_broadcast_addr(struct gk20a *g, u32 addr,
5464                                       u32 gpc_num,
5465                                       u32 *priv_addr_table, u32 *t)
5466 {
5467         u32 ppc_num;
5468
5469         nvhost_dbg(dbg_fn | dbg_gpu_dbg, "addr=0x%x", addr);
5470
5471         for (ppc_num = 0; ppc_num < g->gr.pe_count_per_gpc; ppc_num++)
5472                 priv_addr_table[(*t)++] = pri_ppc_addr(pri_ppccs_addr_mask(addr),
5473                                                        gpc_num, ppc_num);
5474
5475         return 0;
5476 }
5477
5478 /*
5479  * The context buffer is indexed using BE broadcast addresses and GPC/TPC
5480  * unicast addresses. This function will convert a BE unicast address to a BE
5481  * broadcast address and split a GPC/TPC broadcast address into a table of
5482  * GPC/TPC addresses.  The addresses generated by this function can be
5483  * successfully processed by gr_gk20a_find_priv_offset_in_buffer
5484  */
5485 static int gr_gk20a_create_priv_addr_table(struct gk20a *g,
5486                                            u32 addr,
5487                                            u32 *priv_addr_table,
5488                                            u32 *num_registers)
5489 {
5490         int addr_type; /*enum ctxsw_addr_type */
5491         u32 gpc_num, tpc_num, ppc_num, be_num;
5492         u32 broadcast_flags;
5493         u32 t;
5494         int err;
5495
5496         t = 0;
5497         *num_registers = 0;
5498
5499         nvhost_dbg(dbg_fn | dbg_gpu_dbg, "addr=0x%x", addr);
5500
5501         err = gr_gk20a_decode_priv_addr(g, addr, &addr_type,
5502                                         &gpc_num, &tpc_num, &ppc_num, &be_num,
5503                                         &broadcast_flags);
5504         nvhost_dbg(dbg_gpu_dbg, "addr_type = %d", addr_type);
5505         if (err)
5506                 return err;
5507
5508         if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
5509             (addr_type == CTXSW_ADDR_TYPE_BE)) {
5510                 /* The BE broadcast registers are included in the compressed PRI
5511                  * table. Convert a BE unicast address to a broadcast address
5512                  * so that we can look up the offset. */
5513                 if ((addr_type == CTXSW_ADDR_TYPE_BE) &&
5514                     !(broadcast_flags & PRI_BROADCAST_FLAGS_BE))
5515                         priv_addr_table[t++] = pri_be_shared_addr(addr);
5516                 else
5517                         priv_addr_table[t++] = addr;
5518
5519                 *num_registers = t;
5520                 return 0;
5521         }
5522
5523         /* The GPC/TPC unicast registers are included in the compressed PRI
5524          * tables. Convert a GPC/TPC broadcast address to unicast addresses so
5525          * that we can look up the offsets. */
5526         if (broadcast_flags & PRI_BROADCAST_FLAGS_GPC) {
5527                 for (gpc_num = 0; gpc_num < g->gr.gpc_count; gpc_num++) {
5528
5529                         if (broadcast_flags & PRI_BROADCAST_FLAGS_TPC)
5530                                 for (tpc_num = 0;
5531                                      tpc_num < g->gr.gpc_tpc_count[gpc_num];
5532                                      tpc_num++)
5533                                         priv_addr_table[t++] =
5534                                                 pri_tpc_addr(pri_tpccs_addr_mask(addr),
5535                                                              gpc_num, tpc_num);
5536
5537                         else if (broadcast_flags & PRI_BROADCAST_FLAGS_PPC) {
5538                                 err = gr_gk20a_split_ppc_broadcast_addr(g, addr, gpc_num,
5539                                                                priv_addr_table, &t);
5540                                 if (err)
5541                                         return err;
5542                         } else
5543                                 priv_addr_table[t++] =
5544                                         pri_gpc_addr(pri_gpccs_addr_mask(addr),
5545                                                      gpc_num);
5546                 }
5547         } else {
5548                 if (broadcast_flags & PRI_BROADCAST_FLAGS_TPC)
5549                         for (tpc_num = 0;
5550                              tpc_num < g->gr.gpc_tpc_count[gpc_num];
5551                              tpc_num++)
5552                                 priv_addr_table[t++] =
5553                                         pri_tpc_addr(pri_tpccs_addr_mask(addr),
5554                                                      gpc_num, tpc_num);
5555                 else if (broadcast_flags & PRI_BROADCAST_FLAGS_PPC)
5556                         err = gr_gk20a_split_ppc_broadcast_addr(g, addr, gpc_num,
5557                                                        priv_addr_table, &t);
5558                 else
5559                         priv_addr_table[t++] = addr;
5560         }
5561
5562         *num_registers = t;
5563         return 0;
5564 }
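/*
 * A caller must size priv_addr_table for the worst case, i.e. a full
 * GPC/TPC broadcast expansion.  A minimal sketch, mirroring what
 * gr_gk20a_get_ctx_buffer_offsets() below does:
 *
 *      u32 max = proj_scal_litter_num_gpcs_v() *
 *                proj_scal_litter_num_tpc_per_gpc_v();
 *      u32 *table = kzalloc(sizeof(u32) * max, GFP_KERNEL);
 *      u32 n = 0;
 *
 *      if (table) {
 *              gr_gk20a_create_priv_addr_table(g, addr, table, &n);
 *              ... look up each of the n entries in the context buffer ...
 *              kfree(table);
 *      }
 */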
5565
5566 int gr_gk20a_get_ctx_buffer_offsets(struct gk20a *g,
5567                                     u32 addr,
5568                                     u32 max_offsets,
5569                                     u32 *offsets, u32 *offset_addrs,
5570                                     u32 *num_offsets,
5571                                     bool is_quad, u32 quad)
5572 {
5573         u32 i;
5574         u32 priv_offset = 0;
5575         u32 *priv_registers;
5576         u32 num_registers = 0;
5577         int err = 0;
5578         u32 potential_offsets = proj_scal_litter_num_gpcs_v() *
5579                 proj_scal_litter_num_tpc_per_gpc_v();
5580
5581         nvhost_dbg(dbg_fn | dbg_gpu_dbg, "addr=0x%x", addr);
5582
5583         /* implementation is crossed-up if either of these happen */
5584         if (max_offsets > potential_offsets)
5585                 return -EINVAL;
5586
5587         if (!g->gr.ctx_vars.golden_image_initialized)
5588                 return -ENODEV;
5589
5590         priv_registers = kzalloc(sizeof(u32) * potential_offsets, GFP_KERNEL);
5591         if (!priv_registers) {
5592                 nvhost_dbg_fn("failed alloc for potential_offsets=%d", potential_offsets);
5593                 err = -ENOMEM;
5594                 goto cleanup;
5595         }
5596         memset(offsets,      0, sizeof(u32) * max_offsets);
5597         memset(offset_addrs, 0, sizeof(u32) * max_offsets);
5598         *num_offsets = 0;
5599
5600         gr_gk20a_create_priv_addr_table(g, addr, &priv_registers[0], &num_registers);
5601
5602         if ((max_offsets > 1) && (num_registers > max_offsets)) {
5603                 err = -EINVAL;
5604                 goto cleanup;
5605         }
5606
5607         if ((max_offsets == 1) && (num_registers > 1))
5608                 num_registers = 1;
5609
5610         if (!g->gr.ctx_vars.local_golden_image) {
5611                 nvhost_dbg_fn("no context switch header info to work with");
5612                 err = -EINVAL;
5613                 goto cleanup;
5614         }
5615
5616         for (i = 0; i < num_registers; i++) {
5617                 err = gr_gk20a_find_priv_offset_in_buffer(g,
5618                                                   priv_registers[i],
5619                                                   is_quad, quad,
5620                                                   g->gr.ctx_vars.local_golden_image,
5621                                                   g->gr.ctx_vars.golden_image_size,
5622                                                   &priv_offset);
5623                 if (err) {
5624                         nvhost_dbg_fn("Could not determine priv_offset for addr:0x%x",
5625                                       addr); /*, grPriRegStr(addr)));*/
5626                         goto cleanup;
5627                 }
5628
5629                 offsets[i] = priv_offset;
5630                 offset_addrs[i] = priv_registers[i];
5631         }
5632
5633         *num_offsets = num_registers;
5634
5635  cleanup:
5636
5637         /* kfree() is a no-op on NULL, so no guard is needed here */
5638         kfree(priv_registers);
5639
5640         return err;
5641 }
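/*
 * The offsets returned above index into a context image with the same layout
 * as the golden image, so a debugger/regops client can presumably read or
 * modify a register's saved value directly in a channel's context buffer
 * without touching the hardware.
 */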
5642
5643 /* Setup some register tables.  This looks hacky; our
5644  * register/offset functions are just that, functions.
5645  * So they can't be used as initializers... TBD: fix to
5646  * generate consts at least on an as-needed basis.
5647  */
5648 static const u32 _num_ovr_perf_regs = 17;
5649 static u32 _ovr_perf_regs[17] = { 0, };
5650 /* Following are the blocks of registers that the ucode
5651  * stores in the extended region. */
5652 /* ==  ctxsw_extended_sm_dsm_perf_counter_register_stride_v() ? */
5653 static const u32 _num_sm_dsm_perf_regs = 5;
5654 /* ==  ctxsw_extended_sm_dsm_perf_counter_control_register_stride_v() ?*/
5655 static const u32 _num_sm_dsm_perf_ctrl_regs = 4;
5656 static u32 _sm_dsm_perf_regs[5];
5657 static u32 _sm_dsm_perf_ctrl_regs[4];
5658
5659 static void init_sm_dsm_reg_info(void)
5660 {
5661         if (_ovr_perf_regs[0] != 0)
5662                 return;
5663
5664         _ovr_perf_regs[0] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel0_r();
5665         _ovr_perf_regs[1] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel1_r();
5666         _ovr_perf_regs[2] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control0_r();
5667         _ovr_perf_regs[3] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control5_r();
5668         _ovr_perf_regs[4] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_status1_r();
5669         _ovr_perf_regs[5] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter0_control_r();
5670         _ovr_perf_regs[6] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter1_control_r();
5671         _ovr_perf_regs[7] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter2_control_r();
5672         _ovr_perf_regs[8] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter3_control_r();
5673         _ovr_perf_regs[9] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_control_r();
5674         _ovr_perf_regs[10] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_control_r();
5675         _ovr_perf_regs[11] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_control_r();
5676         _ovr_perf_regs[12] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_control_r();
5677         _ovr_perf_regs[13] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_r();
5678         _ovr_perf_regs[14] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_r();
5679         _ovr_perf_regs[15] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_r();
5680         _ovr_perf_regs[16] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_r();
5681
5682
5683         _sm_dsm_perf_regs[0] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_status_r();
5684         _sm_dsm_perf_regs[1] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter0_r();
5685         _sm_dsm_perf_regs[2] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter1_r();
5686         _sm_dsm_perf_regs[3] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter2_r();
5687         _sm_dsm_perf_regs[4] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter3_r();
5688
5689         _sm_dsm_perf_ctrl_regs[0] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control1_r();
5690         _sm_dsm_perf_ctrl_regs[1] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control2_r();
5691         _sm_dsm_perf_ctrl_regs[2] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control3_r();
5692         _sm_dsm_perf_ctrl_regs[3] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control4_r();
5693
5694 }
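/*
 * The tables are filled in lazily on first use (guarded by the
 * _ovr_perf_regs[0] != 0 check above) because the *_r() accessors are
 * functions and cannot be used as static initializers.
 */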
5695
5696 /* TBD: would like to handle this elsewhere, at a higher level.
5697  * These are currently constructed in a "test-then-write" style,
5698  * which makes it impossible to know externally whether a ctx
5699  * write will actually occur.  Later we should put a lazy,
5700  * map-and-hold system in the patch write state. */
5701 int gr_gk20a_ctx_patch_smpc(struct gk20a *g,
5702                             struct channel_ctx_gk20a *ch_ctx,
5703                             u32 addr, u32 data,
5704                             u8 *context)
5705 {
5706         u32 num_gpc = g->gr.gpc_count;
5707         u32 num_tpc;
5708         u32 tpc, gpc, reg;
5709         u32 chk_addr;
5710         u32 vaddr_lo;
5711         u32 vaddr_hi;
5712         u32 tmp;
5713
5714         init_sm_dsm_reg_info();
5715
5716         nvhost_dbg(dbg_fn | dbg_gpu_dbg, "addr=0x%x", addr);
5717
5718         for (reg = 0; reg < _num_ovr_perf_regs; reg++)