gpu: nvgpu: gk20a: check ctx valid bit
drivers/gpu/nvgpu/gk20a/gr_gk20a.c (linux-3.10.git)
1 /*
2  * GK20A Graphics
3  *
4  * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program; if not, write to the Free Software Foundation, Inc.,
17  * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
18  */
19
20 #include <linux/delay.h>        /* for udelay */
21 #include <linux/mm.h>           /* for totalram_pages */
22 #include <linux/scatterlist.h>
23 #include <linux/tegra-soc.h>
24 #include <linux/nvhost_dbg_gpu_ioctl.h>
25 #include <linux/vmalloc.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/firmware.h>
28 #include <linux/nvhost.h>
29
30 #include "gk20a.h"
31 #include "kind_gk20a.h"
32 #include "gr_ctx_gk20a.h"
33
34 #include "hw_ccsr_gk20a.h"
35 #include "hw_ctxsw_prog_gk20a.h"
36 #include "hw_fifo_gk20a.h"
37 #include "hw_gr_gk20a.h"
38 #include "hw_gmmu_gk20a.h"
39 #include "hw_mc_gk20a.h"
40 #include "hw_ram_gk20a.h"
41 #include "hw_pri_ringmaster_gk20a.h"
42 #include "hw_pri_ringstation_sys_gk20a.h"
43 #include "hw_pri_ringstation_gpc_gk20a.h"
44 #include "hw_pri_ringstation_fbp_gk20a.h"
45 #include "hw_proj_gk20a.h"
46 #include "hw_top_gk20a.h"
47 #include "hw_ltc_gk20a.h"
48 #include "hw_fb_gk20a.h"
49 #include "hw_therm_gk20a.h"
50 #include "hw_pbdma_gk20a.h"
51 #include "gr_pri_gk20a.h"
52 #include "regops_gk20a.h"
53 #include "dbg_gpu_gk20a.h"
54
55 #define BLK_SIZE (256)
56
57 static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va);
58
59 /* global ctx buffer */
60 static int  gr_gk20a_alloc_global_ctx_buffers(struct gk20a *g);
61 static void gr_gk20a_free_global_ctx_buffers(struct gk20a *g);
62 static int  gr_gk20a_map_global_ctx_buffers(struct gk20a *g,
63                                             struct channel_gk20a *c);
64 static void gr_gk20a_unmap_global_ctx_buffers(struct channel_gk20a *c);
65
66 /* channel gr ctx buffer */
67 static int  gr_gk20a_alloc_channel_gr_ctx(struct gk20a *g,
68                                         struct channel_gk20a *c);
69 static void gr_gk20a_free_channel_gr_ctx(struct channel_gk20a *c);
70
71 /* channel patch ctx buffer */
72 static int  gr_gk20a_alloc_channel_patch_ctx(struct gk20a *g,
73                                         struct channel_gk20a *c);
74 static void gr_gk20a_free_channel_patch_ctx(struct channel_gk20a *c);
75
76 /* golden ctx image */
77 static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
78                                           struct channel_gk20a *c);
79 static int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
80                                           struct channel_gk20a *c);
81
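/* Dump FECS falcon state for debugging ucode timeouts/errors: the mailbox,
 * IRQ and context registers, plus selected falcon internal registers (IMB,
 * DMB, CSW, CTX, EXCI, PC, SP) read back through the ICD cmd/rdata port. */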
82 void gk20a_fecs_dump_falcon_stats(struct gk20a *g)
83 {
84         int i;
85
86         gk20a_err(dev_from_gk20a(g), "gr_fecs_os_r : %d",
87                 gk20a_readl(g, gr_fecs_os_r()));
88         gk20a_err(dev_from_gk20a(g), "gr_fecs_cpuctl_r : 0x%x",
89                 gk20a_readl(g, gr_fecs_cpuctl_r()));
90         gk20a_err(dev_from_gk20a(g), "gr_fecs_idlestate_r : 0x%x",
91                 gk20a_readl(g, gr_fecs_idlestate_r()));
92         gk20a_err(dev_from_gk20a(g), "gr_fecs_mailbox0_r : 0x%x",
93                 gk20a_readl(g, gr_fecs_mailbox0_r()));
94         gk20a_err(dev_from_gk20a(g), "gr_fecs_mailbox1_r : 0x%x",
95                 gk20a_readl(g, gr_fecs_mailbox1_r()));
96         gk20a_err(dev_from_gk20a(g), "gr_fecs_irqstat_r : 0x%x",
97                 gk20a_readl(g, gr_fecs_irqstat_r()));
98         gk20a_err(dev_from_gk20a(g), "gr_fecs_irqmode_r : 0x%x",
99                 gk20a_readl(g, gr_fecs_irqmode_r()));
100         gk20a_err(dev_from_gk20a(g), "gr_fecs_irqmask_r : 0x%x",
101                 gk20a_readl(g, gr_fecs_irqmask_r()));
102         gk20a_err(dev_from_gk20a(g), "gr_fecs_irqdest_r : 0x%x",
103                 gk20a_readl(g, gr_fecs_irqdest_r()));
104         gk20a_err(dev_from_gk20a(g), "gr_fecs_debug1_r : 0x%x",
105                 gk20a_readl(g, gr_fecs_debug1_r()));
106         gk20a_err(dev_from_gk20a(g), "gr_fecs_debuginfo_r : 0x%x",
107                 gk20a_readl(g, gr_fecs_debuginfo_r()));
108
109         for (i = 0; i < gr_fecs_ctxsw_mailbox__size_1_v(); i++)
110                 gk20a_err(dev_from_gk20a(g), "gr_fecs_ctxsw_mailbox_r(%d) : 0x%x",
111                         i, gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(i)));
112
113         gk20a_err(dev_from_gk20a(g), "gr_fecs_engctl_r : 0x%x",
114                 gk20a_readl(g, gr_fecs_engctl_r()));
115         gk20a_err(dev_from_gk20a(g), "gr_fecs_curctx_r : 0x%x",
116                 gk20a_readl(g, gr_fecs_curctx_r()));
117         gk20a_err(dev_from_gk20a(g), "gr_fecs_nxtctx_r : 0x%x",
118                 gk20a_readl(g, gr_fecs_nxtctx_r()));
119
120         gk20a_writel(g, gr_fecs_icd_cmd_r(),
121                 gr_fecs_icd_cmd_opc_rreg_f() |
122                 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_IMB));
123         gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_IMB : 0x%x",
124                 gk20a_readl(g, gr_fecs_icd_rdata_r()));
125
126         gk20a_writel(g, gr_fecs_icd_cmd_r(),
127                 gr_fecs_icd_cmd_opc_rreg_f() |
128                 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_DMB));
129         gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_DMB : 0x%x",
130                 gk20a_readl(g, gr_fecs_icd_rdata_r()));
131
132         gk20a_writel(g, gr_fecs_icd_cmd_r(),
133                 gr_fecs_icd_cmd_opc_rreg_f() |
134                 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_CSW));
135         gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_CSW : 0x%x",
136                 gk20a_readl(g, gr_fecs_icd_rdata_r()));
137
138         gk20a_writel(g, gr_fecs_icd_cmd_r(),
139                 gr_fecs_icd_cmd_opc_rreg_f() |
140                 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_CTX));
141         gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_CTX : 0x%x",
142                 gk20a_readl(g, gr_fecs_icd_rdata_r()));
143
144         gk20a_writel(g, gr_fecs_icd_cmd_r(),
145                 gr_fecs_icd_cmd_opc_rreg_f() |
146                 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_EXCI));
147         gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_EXCI : 0x%x",
148                 gk20a_readl(g, gr_fecs_icd_rdata_r()));
149
150         for (i = 0; i < 4; i++) {
151                 gk20a_writel(g, gr_fecs_icd_cmd_r(),
152                         gr_fecs_icd_cmd_opc_rreg_f() |
153                         gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_PC));
154                 gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_PC : 0x%x",
155                         gk20a_readl(g, gr_fecs_icd_rdata_r()));
156
157                 gk20a_writel(g, gr_fecs_icd_cmd_r(),
158                         gr_fecs_icd_cmd_opc_rreg_f() |
159                         gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_SP));
160                 gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_SP : 0x%x",
161                         gk20a_readl(g, gr_fecs_icd_rdata_r()));
162         }
163 }
164
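/* Copy the GPCCS and FECS ucode data segments into the falcons' DMEM
 * through the auto-incrementing dmemc/dmemd port (aincw = 1). */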
165 static void gr_gk20a_load_falcon_dmem(struct gk20a *g)
166 {
167         u32 i, ucode_u32_size;
168         const u32 *ucode_u32_data;
169         u32 checksum;
170
171         gk20a_dbg_fn("");
172
173         gk20a_writel(g, gr_gpccs_dmemc_r(0), (gr_gpccs_dmemc_offs_f(0) |
174                                               gr_gpccs_dmemc_blk_f(0)  |
175                                               gr_gpccs_dmemc_aincw_f(1)));
176
177         ucode_u32_size = g->gr.ctx_vars.ucode.gpccs.data.count;
178         ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.gpccs.data.l;
179
180         for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
181                 gk20a_writel(g, gr_gpccs_dmemd_r(0), ucode_u32_data[i]);
182                 checksum += ucode_u32_data[i];
183         }
184
185         gk20a_writel(g, gr_fecs_dmemc_r(0), (gr_fecs_dmemc_offs_f(0) |
186                                              gr_fecs_dmemc_blk_f(0)  |
187                                              gr_fecs_dmemc_aincw_f(1)));
188
189         ucode_u32_size = g->gr.ctx_vars.ucode.fecs.data.count;
190         ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.fecs.data.l;
191
192         for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
193                 gk20a_writel(g, gr_fecs_dmemd_r(0), ucode_u32_data[i]);
194                 checksum += ucode_u32_data[i];
195         }
196         gk20a_dbg_fn("done");
197 }
198
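/* Copy the GPCCS and FECS ucode instruction segments into the falcons'
 * IMEM via the auto-incrementing imemc/imemd port, programming a new
 * 256-byte block tag every 64 words and zero-padding the final block. */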
199 static void gr_gk20a_load_falcon_imem(struct gk20a *g)
200 {
201         u32 cfg, fecs_imem_size, gpccs_imem_size, ucode_u32_size;
202         const u32 *ucode_u32_data;
203         u32 tag, i, pad_start, pad_end;
204         u32 checksum;
205
206         gk20a_dbg_fn("");
207
208         cfg = gk20a_readl(g, gr_fecs_cfg_r());
209         fecs_imem_size = gr_fecs_cfg_imem_sz_v(cfg);
210
211         cfg = gk20a_readl(g, gr_gpc0_cfg_r());
212         gpccs_imem_size = gr_gpc0_cfg_imem_sz_v(cfg);
213
214         /* Use the broadcast address to access all of the GPCCS units. */
215         gk20a_writel(g, gr_gpccs_imemc_r(0), (gr_gpccs_imemc_offs_f(0) |
216                                               gr_gpccs_imemc_blk_f(0) |
217                                               gr_gpccs_imemc_aincw_f(1)));
218
219         /* Set up the tags for the instruction memory. */
220         tag = 0;
221         gk20a_writel(g, gr_gpccs_imemt_r(0), gr_gpccs_imemt_tag_f(tag));
222
223         ucode_u32_size = g->gr.ctx_vars.ucode.gpccs.inst.count;
224         ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.gpccs.inst.l;
225
226         for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
227                 if (i && ((i % (256/sizeof(u32))) == 0)) {
228                         tag++;
229                         gk20a_writel(g, gr_gpccs_imemt_r(0),
230                                       gr_gpccs_imemt_tag_f(tag));
231                 }
232                 gk20a_writel(g, gr_gpccs_imemd_r(0), ucode_u32_data[i]);
233                 checksum += ucode_u32_data[i];
234         }
235
236         pad_start = i*4;
237         pad_end = pad_start+(256-pad_start%256)+256;
238         for (i = pad_start;
239              (i < gpccs_imem_size * 256) && (i < pad_end);
240              i += 4) {
241                 if (i && ((i % 256) == 0)) {
242                         tag++;
243                         gk20a_writel(g, gr_gpccs_imemt_r(0),
244                                       gr_gpccs_imemt_tag_f(tag));
245                 }
246                 gk20a_writel(g, gr_gpccs_imemd_r(0), 0);
247         }
248
249         gk20a_writel(g, gr_fecs_imemc_r(0), (gr_fecs_imemc_offs_f(0) |
250                                              gr_fecs_imemc_blk_f(0) |
251                                              gr_fecs_imemc_aincw_f(1)));
252
253         /* Set up the tags for the instruction memory. */
254         tag = 0;
255         gk20a_writel(g, gr_fecs_imemt_r(0), gr_fecs_imemt_tag_f(tag));
256
257         ucode_u32_size = g->gr.ctx_vars.ucode.fecs.inst.count;
258         ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.fecs.inst.l;
259
260         for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
261                 if (i && ((i % (256/sizeof(u32))) == 0)) {
262                         tag++;
263                         gk20a_writel(g, gr_fecs_imemt_r(0),
264                                       gr_fecs_imemt_tag_f(tag));
265                 }
266                 gk20a_writel(g, gr_fecs_imemd_r(0), ucode_u32_data[i]);
267                 checksum += ucode_u32_data[i];
268         }
269
270         pad_start = i*4;
271         pad_end = pad_start+(256-pad_start%256)+256;
272         for (i = pad_start; (i < fecs_imem_size * 256) && i < pad_end; i += 4) {
273                 if (i && ((i % 256) == 0)) {
274                         tag++;
275                         gk20a_writel(g, gr_fecs_imemt_r(0),
276                                       gr_fecs_imemt_tag_f(tag));
277                 }
278                 gk20a_writel(g, gr_fecs_imemd_r(0), 0);
279         }
280 }
281
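/* Poll (with exponential backoff capped at GR_IDLE_CHECK_MAX) until GR is
 * either disabled or both non-busy and not context switching; returns
 * -EAGAIN on timeout. */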
282 static int gr_gk20a_wait_idle(struct gk20a *g, unsigned long end_jiffies,
283                 u32 expect_delay)
284 {
285         u32 delay = expect_delay;
286         bool gr_enabled;
287         bool ctxsw_active;
288         bool gr_busy;
289
290         gk20a_dbg_fn("");
291
292         do {
293                 /* fmodel: host gets fifo_engine_status(gr) from gr
294                    only when gr_status is read */
295                 gk20a_readl(g, gr_status_r());
296
297                 gr_enabled = gk20a_readl(g, mc_enable_r()) &
298                         mc_enable_pgraph_enabled_f();
299
300                 ctxsw_active = gk20a_readl(g,
301                         fifo_engine_status_r(ENGINE_GR_GK20A)) &
302                         fifo_engine_status_ctxsw_in_progress_f();
303
304                 gr_busy = gk20a_readl(g, gr_engine_status_r()) &
305                         gr_engine_status_value_busy_f();
306
307                 if (!gr_enabled || (!gr_busy && !ctxsw_active)) {
308                         gk20a_dbg_fn("done");
309                         return 0;
310                 }
311
312                 usleep_range(delay, delay * 2);
313                 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
314
315         } while (time_before(jiffies, end_jiffies)
316                         || !tegra_platform_is_silicon());
317
318         gk20a_err(dev_from_gk20a(g),
319                 "timeout, ctxsw busy : %d, gr busy : %d",
320                 ctxsw_active, gr_busy);
321
322         return -EAGAIN;
323 }
324
325 static int gr_gk20a_wait_fe_idle(struct gk20a *g, unsigned long end_jiffies,
326                 u32 expect_delay)
327 {
328         u32 val;
329         u32 delay = expect_delay;
330
331         gk20a_dbg_fn("");
332
333         do {
334                 val = gk20a_readl(g, gr_status_r());
335
336                 if (!gr_status_fe_method_upper_v(val) &&
337                         !gr_status_fe_method_lower_v(val) &&
338                         !gr_status_fe_method_fe_gi_v(val)) {
339                         gk20a_dbg_fn("done");
340                         return 0;
341                 }
342
343                 usleep_range(delay, delay * 2);
344                 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
345         } while (time_before(jiffies, end_jiffies)
346                         || !tegra_platform_is_silicon());
347
348         gk20a_err(dev_from_gk20a(g),
349                 "timeout, fe busy : %x", val);
350
351         return -EAGAIN;
352 }
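
/* Pulse the FECS ctxsw reset controls: force the clocks on (except on
 * linsim), assert the context reset bits (or the caller-supplied rst_mask),
 * wait for propagation, deassert, and restore the power mode to auto. */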
353 static int gr_gk20a_ctx_reset(struct gk20a *g, u32 rst_mask)
354 {
355         u32 delay = GR_IDLE_CHECK_DEFAULT;
356         unsigned long end_jiffies = jiffies +
357                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
358         u32 reg;
359
360         gk20a_dbg_fn("");
361
362         if (!tegra_platform_is_linsim()) {
363                 /* Force clocks on */
364                 gk20a_writel(g, gr_fe_pwr_mode_r(),
365                              gr_fe_pwr_mode_req_send_f() |
366                              gr_fe_pwr_mode_mode_force_on_f());
367
368                 /* Wait for the clocks to indicate that they are on */
369                 do {
370                         reg = gk20a_readl(g, gr_fe_pwr_mode_r());
371
372                         if (gr_fe_pwr_mode_req_v(reg) ==
373                                         gr_fe_pwr_mode_req_done_v())
374                                 break;
375
376                         usleep_range(delay, delay * 2);
377                         delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
378
379                 } while (time_before(jiffies, end_jiffies));
380
381                 if (!time_before(jiffies, end_jiffies)) {
382                         gk20a_err(dev_from_gk20a(g),
383                                    "failed to force the clocks on\n");
384                         WARN_ON(1);
385                 }
386         }
387         if (rst_mask) {
388                 gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(), rst_mask);
389         } else {
390                 gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(),
391                              gr_fecs_ctxsw_reset_ctl_sys_halt_disabled_f() |
392                              gr_fecs_ctxsw_reset_ctl_gpc_halt_disabled_f() |
393                              gr_fecs_ctxsw_reset_ctl_be_halt_disabled_f()  |
394                              gr_fecs_ctxsw_reset_ctl_sys_engine_reset_disabled_f() |
395                              gr_fecs_ctxsw_reset_ctl_gpc_engine_reset_disabled_f() |
396                              gr_fecs_ctxsw_reset_ctl_be_engine_reset_disabled_f()  |
397                              gr_fecs_ctxsw_reset_ctl_sys_context_reset_enabled_f() |
398                              gr_fecs_ctxsw_reset_ctl_gpc_context_reset_enabled_f() |
399                              gr_fecs_ctxsw_reset_ctl_be_context_reset_enabled_f());
400         }
401
402         /* we need to read the reset register *and* wait for a moment to ensure
403          * reset propagation */
404
405         gk20a_readl(g, gr_fecs_ctxsw_reset_ctl_r());
406         udelay(20);
407
408         gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(),
409                      gr_fecs_ctxsw_reset_ctl_sys_halt_disabled_f() |
410                      gr_fecs_ctxsw_reset_ctl_gpc_halt_disabled_f() |
411                      gr_fecs_ctxsw_reset_ctl_be_halt_disabled_f()  |
412                      gr_fecs_ctxsw_reset_ctl_sys_engine_reset_disabled_f() |
413                      gr_fecs_ctxsw_reset_ctl_gpc_engine_reset_disabled_f() |
414                      gr_fecs_ctxsw_reset_ctl_be_engine_reset_disabled_f()  |
415                      gr_fecs_ctxsw_reset_ctl_sys_context_reset_disabled_f() |
416                      gr_fecs_ctxsw_reset_ctl_gpc_context_reset_disabled_f() |
417                      gr_fecs_ctxsw_reset_ctl_be_context_reset_disabled_f());
418
419         /* read back the reset register and then wait a short moment after that */
420         gk20a_readl(g, gr_fecs_ctxsw_reset_ctl_r());
421         udelay(20);
422
423         if (!tegra_platform_is_linsim()) {
424                 /* Set power mode back to auto */
425                 gk20a_writel(g, gr_fe_pwr_mode_r(),
426                              gr_fe_pwr_mode_req_send_f() |
427                              gr_fe_pwr_mode_mode_auto_f());
428
429                 /* Wait for the request to complete */
430                 end_jiffies = jiffies +
431                         msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
432                 do {
433                         reg = gk20a_readl(g, gr_fe_pwr_mode_r());
434
435                         if (gr_fe_pwr_mode_req_v(reg) ==
436                                         gr_fe_pwr_mode_req_done_v())
437                                 break;
438
439                         usleep_range(delay, delay * 2);
440                         delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
441
442                 } while (time_before(jiffies, end_jiffies));
443
444                 if (!time_before(jiffies, end_jiffies))
445                         gk20a_warn(dev_from_gk20a(g),
446                                    "failed to set power mode to auto\n");
447         }
448
449         return 0;
450 }
451
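/* Poll a FECS ctxsw mailbox until its value satisfies the success or
 * failure condition described by the GR_IS_UCODE_OP_* opcodes, dumping
 * falcon state on timeout or error. */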
452 static int gr_gk20a_ctx_wait_ucode(struct gk20a *g, u32 mailbox_id,
453                                    u32 *mailbox_ret, u32 opc_success,
454                                    u32 mailbox_ok, u32 opc_fail,
455                                    u32 mailbox_fail)
456 {
457         unsigned long end_jiffies = jiffies +
458                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
459         u32 delay = GR_IDLE_CHECK_DEFAULT;
460         u32 check = WAIT_UCODE_LOOP;
461         u32 reg;
462
463         gk20a_dbg_fn("");
464
465         while (check == WAIT_UCODE_LOOP) {
466                 if (!time_before(jiffies, end_jiffies) &&
467                                 tegra_platform_is_silicon())
468                         check = WAIT_UCODE_TIMEOUT;
469
470                 reg = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(mailbox_id));
471
472                 if (mailbox_ret)
473                         *mailbox_ret = reg;
474
475                 switch (opc_success) {
476                 case GR_IS_UCODE_OP_EQUAL:
477                         if (reg == mailbox_ok)
478                                 check = WAIT_UCODE_OK;
479                         break;
480                 case GR_IS_UCODE_OP_NOT_EQUAL:
481                         if (reg != mailbox_ok)
482                                 check = WAIT_UCODE_OK;
483                         break;
484                 case GR_IS_UCODE_OP_AND:
485                         if (reg & mailbox_ok)
486                                 check = WAIT_UCODE_OK;
487                         break;
488                 case GR_IS_UCODE_OP_LESSER:
489                         if (reg < mailbox_ok)
490                                 check = WAIT_UCODE_OK;
491                         break;
492                 case GR_IS_UCODE_OP_LESSER_EQUAL:
493                         if (reg <= mailbox_ok)
494                                 check = WAIT_UCODE_OK;
495                         break;
496                 case GR_IS_UCODE_OP_SKIP:
497                         /* no success check */
498                         break;
499                 default:
500                         gk20a_err(dev_from_gk20a(g),
501                                    "invalid success opcode 0x%x", opc_success);
502
503                         check = WAIT_UCODE_ERROR;
504                         break;
505                 }
506
507                 switch (opc_fail) {
508                 case GR_IS_UCODE_OP_EQUAL:
509                         if (reg == mailbox_fail)
510                                 check = WAIT_UCODE_ERROR;
511                         break;
512                 case GR_IS_UCODE_OP_NOT_EQUAL:
513                         if (reg != mailbox_fail)
514                                 check = WAIT_UCODE_ERROR;
515                         break;
516                 case GR_IS_UCODE_OP_AND:
517                         if (reg & mailbox_fail)
518                                 check = WAIT_UCODE_ERROR;
519                         break;
520                 case GR_IS_UCODE_OP_LESSER:
521                         if (reg < mailbox_fail)
522                                 check = WAIT_UCODE_ERROR;
523                         break;
524                 case GR_IS_UCODE_OP_LESSER_EQUAL:
525                         if (reg <= mailbox_fail)
526                                 check = WAIT_UCODE_ERROR;
527                         break;
528                 case GR_IS_UCODE_OP_SKIP:
529                         /* no check on fail */
530                         break;
531                 default:
532                         gk20a_err(dev_from_gk20a(g),
533                                    "invalid fail opcode 0x%x", opc_fail);
534                         check = WAIT_UCODE_ERROR;
535                         break;
536                 }
537
538                 usleep_range(delay, delay * 2);
539                 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
540         }
541
542         if (check == WAIT_UCODE_TIMEOUT) {
543                 gk20a_err(dev_from_gk20a(g),
544                            "timeout waiting on ucode response");
545                 gk20a_fecs_dump_falcon_stats(g);
546                 return -1;
547         } else if (check == WAIT_UCODE_ERROR) {
548                 gk20a_err(dev_from_gk20a(g),
549                            "ucode method failed on mailbox=%d value=0x%08x",
550                            mailbox_id, reg);
551                 gk20a_fecs_dump_falcon_stats(g);
552                 return -1;
553         }
554
555         gk20a_dbg_fn("done");
556         return 0;
557 }
558
559 /* The following is a less brittle way to drive gr_gk20a_submit_fecs_method_op().
560  * We should convert most, if not all, FECS method calls to use it instead. */
561 struct fecs_method_op_gk20a {
562         struct {
563                 u32 addr;
564                 u32 data;
565         } method;
566
567         struct {
568                 u32 id;
569                 u32 data;
570                 u32 clr;
571                 u32 *ret;
572                 u32 ok;
573                 u32 fail;
574         } mailbox;
575
576         struct {
577                 u32 ok;
578                 u32 fail;
579         } cond;
580
581 };
582
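/*
 * Illustrative usage sketch only -- the method address, mailbox id and
 * 'fecs_method'/'result' names below are placeholders; see
 * gr_gk20a_ctrl_ctxsw() further down for a real caller:
 *
 *	u32 result;
 *	int err = gr_gk20a_submit_fecs_method_op(g,
 *		(struct fecs_method_op_gk20a) {
 *			.method.addr = fecs_method,
 *			.method.data = ~0,
 *			.mailbox = { .id = 1, .data = ~0, .clr = ~0,
 *				     .ret = &result,
 *				     .ok = gr_fecs_ctxsw_mailbox_value_pass_v(),
 *				     .fail = gr_fecs_ctxsw_mailbox_value_fail_v(), },
 *			.cond.ok = GR_IS_UCODE_OP_EQUAL,
 *			.cond.fail = GR_IS_UCODE_OP_EQUAL });
 */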
583 int gr_gk20a_submit_fecs_method_op(struct gk20a *g,
584                                    struct fecs_method_op_gk20a op)
585 {
586         struct gr_gk20a *gr = &g->gr;
587         int ret;
588
589         mutex_lock(&gr->fecs_mutex);
590
591         if (op.mailbox.id != 0)
592                 gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(op.mailbox.id),
593                              op.mailbox.data);
594
595         gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0),
596                 gr_fecs_ctxsw_mailbox_clear_value_f(op.mailbox.clr));
597
598         gk20a_writel(g, gr_fecs_method_data_r(), op.method.data);
599         gk20a_writel(g, gr_fecs_method_push_r(),
600                 gr_fecs_method_push_adr_f(op.method.addr));
601
602         /* op.mailbox.id == 4 cases require waiting for completion on
603          * mailbox 0; remap the id so the wait below polls mailbox 0 */
604         if (op.mailbox.id == 4)
605                 op.mailbox.id = 0;
606
607         ret = gr_gk20a_ctx_wait_ucode(g, op.mailbox.id, op.mailbox.ret,
608                                       op.cond.ok, op.mailbox.ok,
609                                       op.cond.fail, op.mailbox.fail);
610
611         mutex_unlock(&gr->fecs_mutex);
612
613         return ret;
614 }
615
616 int gr_gk20a_ctrl_ctxsw(struct gk20a *g, u32 fecs_method, u32 *ret)
617 {
618         return gr_gk20a_submit_fecs_method_op(g,
619               (struct fecs_method_op_gk20a) {
620                       .method.addr = fecs_method,
621                       .method.data = ~0,
622                       .mailbox = { .id   = 1, /*sideband?*/
623                                    .data = ~0, .clr = ~0, .ret = ret,
624                                    .ok   = gr_fecs_ctxsw_mailbox_value_pass_v(),
625                                    .fail = gr_fecs_ctxsw_mailbox_value_fail_v(), },
626                       .cond.ok = GR_IS_UCODE_OP_EQUAL,
627                       .cond.fail = GR_IS_UCODE_OP_EQUAL });
628 }
629
630 /* Stop processing (stall) context switches at FECS.
631  * The caller must hold the dbg_sessions_lock, else if multiple stop methods
632  * are sent to the ucode in sequence, it can get into an undefined state. */
633 int gr_gk20a_disable_ctxsw(struct gk20a *g)
634 {
635         gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
636         return gr_gk20a_ctrl_ctxsw(g, gr_fecs_method_push_adr_stop_ctxsw_v(), 0);
637 }
638
639 /* Start processing (continue) context switches at FECS */
640 int gr_gk20a_enable_ctxsw(struct gk20a *g)
641 {
642         gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
643         return gr_gk20a_ctrl_ctxsw(g, gr_fecs_method_push_adr_start_ctxsw_v(), 0);
644 }
645
646
647 static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va)
648 {
649         u32 addr_lo;
650         u32 addr_hi;
651         void *inst_ptr = NULL;
652
653         gk20a_dbg_fn("");
654
655         inst_ptr = c->inst_block.cpuva;
656         if (!inst_ptr)
657                 return -ENOMEM;
658
659         addr_lo = u64_lo32(gpu_va) >> 12;
660         addr_hi = u64_hi32(gpu_va);
661
662         gk20a_mem_wr32(inst_ptr, ram_in_gr_wfi_target_w(),
663                  ram_in_gr_cs_wfi_f() | ram_in_gr_wfi_mode_virtual_f() |
664                  ram_in_gr_wfi_ptr_lo_f(addr_lo));
665
666         gk20a_mem_wr32(inst_ptr, ram_in_gr_wfi_ptr_hi_w(),
667                  ram_in_gr_wfi_ptr_hi_f(addr_hi));
668
669         return 0;
670 }
671
672 /*
673  * Context state can be written directly or "patched" at times.
674  * So that the same code can be used in either situation, it is written
675  * as a series of _ctx_patch_write(..., patch) statements.
676  * However, any necessary CPU map/unmap and GPU L2 invalidates
677  * should be minimized (i.e. not done once per patch write).
678  * Before a sequence of these, set up with "_ctx_patch_write_begin"
679  * and close with "_ctx_patch_write_end."
680  */
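/*
 * Illustrative sketch of the typical sequence (addr0/data0 etc. are
 * placeholders and error handling is trimmed):
 *
 *	err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
 *	if (err)
 *		return err;
 *	gr_gk20a_ctx_patch_write(g, ch_ctx, addr0, data0, true);
 *	gr_gk20a_ctx_patch_write(g, ch_ctx, addr1, data1, true);
 *	gr_gk20a_ctx_patch_write_end(g, ch_ctx);
 *
 * With patch == false the same write helper pokes the register directly
 * via gk20a_writel() instead of appending to the patch buffer.
 */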
681 int gr_gk20a_ctx_patch_write_begin(struct gk20a *g,
682                                           struct channel_ctx_gk20a *ch_ctx)
683 {
684         /* being defensive still... */
685         if (ch_ctx->patch_ctx.cpu_va) {
686                 gk20a_err(dev_from_gk20a(g), "nested ctx patch begin?");
687                 return -EBUSY;
688         }
689
690         ch_ctx->patch_ctx.cpu_va = vmap(ch_ctx->patch_ctx.pages,
691                         PAGE_ALIGN(ch_ctx->patch_ctx.size) >> PAGE_SHIFT,
692                         0, pgprot_dmacoherent(PAGE_KERNEL));
693
694         if (!ch_ctx->patch_ctx.cpu_va)
695                 return -ENOMEM;
696
697         return 0;
698 }
699
700 int gr_gk20a_ctx_patch_write_end(struct gk20a *g,
701                                         struct channel_ctx_gk20a *ch_ctx)
702 {
703         /* being defensive still... */
704         if (!ch_ctx->patch_ctx.cpu_va) {
705                 gk20a_err(dev_from_gk20a(g), "dangling ctx patch end?");
706                 return -EINVAL;
707         }
708
709         vunmap(ch_ctx->patch_ctx.cpu_va);
710         ch_ctx->patch_ctx.cpu_va = NULL;
711         return 0;
712 }
713
714 int gr_gk20a_ctx_patch_write(struct gk20a *g,
715                                     struct channel_ctx_gk20a *ch_ctx,
716                                     u32 addr, u32 data, bool patch)
717 {
718         u32 patch_slot = 0;
719         void *patch_ptr = NULL;
720         bool mapped_here = false;
721
722         BUG_ON(patch != 0 && ch_ctx == NULL);
723
724         if (patch) {
725                 if (!ch_ctx)
726                         return -EINVAL;
727                 /* we added an optimization prologue/epilogue
728                  * to get rid of unnecessary maps and l2 invals,
729                  * but be defensive still... */
730                 if (!ch_ctx->patch_ctx.cpu_va) {
731                         int err;
732                         gk20a_err(dev_from_gk20a(g),
733                                    "per-write ctx patch begin?");
734                         /* yes, gr_gk20a_ctx_patch_smpc causes this one */
735                         err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
736                         if (err)
737                                 return err;
738                         mapped_here = true;
739                 } else
740                         mapped_here = false;
741
742                 patch_ptr = ch_ctx->patch_ctx.cpu_va;
743                 patch_slot = ch_ctx->patch_ctx.data_count * 2;
744
745                 gk20a_mem_wr32(patch_ptr, patch_slot++, addr);
746                 gk20a_mem_wr32(patch_ptr, patch_slot++, data);
747
748                 ch_ctx->patch_ctx.data_count++;
749
750                 if (mapped_here)
751                         gr_gk20a_ctx_patch_write_end(g, ch_ctx);
752
753         } else
754                 gk20a_writel(g, addr, data);
755
756         return 0;
757 }
758
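/* Hand the channel's instance block pointer to FECS via the BIND_POINTER
 * method, with the vidmem target and the current-ctx valid bit set, and
 * wait for the ucode's ack in mailbox 0. */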
759 static int gr_gk20a_fecs_ctx_bind_channel(struct gk20a *g,
760                                         struct channel_gk20a *c)
761 {
762         u32 inst_base_ptr = u64_lo32(c->inst_block.cpu_pa
763                                      >> ram_in_base_shift_v());
764         u32 ret;
765
766         gk20a_dbg_info("bind channel %d inst ptr 0x%08x",
767                    c->hw_chid, inst_base_ptr);
768
769         ret = gr_gk20a_submit_fecs_method_op(g,
770                      (struct fecs_method_op_gk20a) {
771                      .method.addr = gr_fecs_method_push_adr_bind_pointer_v(),
772                      .method.data = (gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
773                                      gr_fecs_current_ctx_target_vid_mem_f() |
774                                      gr_fecs_current_ctx_valid_f(1)),
775                      .mailbox = { .id = 0, .data = 0,
776                                   .clr = 0x30,
777                                   .ret = NULL,
778                                   .ok = 0x10,
779                                   .fail = 0x20, },
780                      .cond.ok = GR_IS_UCODE_OP_AND,
781                      .cond.fail = GR_IS_UCODE_OP_AND});
782         if (ret)
783                 gk20a_err(dev_from_gk20a(g),
784                         "bind channel instance failed");
785
786         return ret;
787 }
788
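/* Write the zcull mode and zcull buffer pointer into the channel's ctxsw
 * image, flushing the FB first and optionally idling the GR engine around
 * the update. */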
789 static int gr_gk20a_ctx_zcull_setup(struct gk20a *g, struct channel_gk20a *c,
790                                     bool disable_fifo)
791 {
792         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
793         struct fifo_gk20a *f = &g->fifo;
794         struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
795         u32 va_lo, va_hi, va;
796         int ret = 0;
797         void *ctx_ptr = NULL;
798
799         gk20a_dbg_fn("");
800
801         ctx_ptr = vmap(ch_ctx->gr_ctx.pages,
802                         PAGE_ALIGN(ch_ctx->gr_ctx.size) >> PAGE_SHIFT,
803                         0, pgprot_dmacoherent(PAGE_KERNEL));
804         if (!ctx_ptr)
805                 return -ENOMEM;
806
807         if (ch_ctx->zcull_ctx.gpu_va == 0 &&
808             ch_ctx->zcull_ctx.ctx_sw_mode ==
809                 ctxsw_prog_main_image_zcull_mode_separate_buffer_v()) {
810                 ret = -EINVAL;
811                 goto clean_up;
812         }
813
814         va_lo = u64_lo32(ch_ctx->zcull_ctx.gpu_va);
815         va_hi = u64_hi32(ch_ctx->zcull_ctx.gpu_va);
816         va = ((va_lo >> 8) & 0x00FFFFFF) | ((va_hi << 24) & 0xFF000000);
817
818         if (disable_fifo) {
819                 ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
820                 if (ret) {
821                         gk20a_err(dev_from_gk20a(g),
822                                 "failed to disable gr engine activity\n");
823                         goto clean_up;
824                 }
825         }
826
827         gk20a_mm_fb_flush(g);
828
829         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_zcull_o(), 0,
830                  ch_ctx->zcull_ctx.ctx_sw_mode);
831
832         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_zcull_ptr_o(), 0, va);
833
834         if (disable_fifo) {
835                 ret = gk20a_fifo_enable_engine_activity(g, gr_info);
836                 if (ret) {
837                         gk20a_err(dev_from_gk20a(g),
838                                 "failed to enable gr engine activity\n");
839                         goto clean_up;
840                 }
841         }
842
843 clean_up:
844         vunmap(ctx_ptr);
845
846         return ret;
847 }
848
849 static int gr_gk20a_commit_global_cb_manager(struct gk20a *g,
850                         struct channel_gk20a *c, bool patch)
851 {
852         struct gr_gk20a *gr = &g->gr;
853         struct channel_ctx_gk20a *ch_ctx = NULL;
854         u32 attrib_offset_in_chunk = 0;
855         u32 alpha_offset_in_chunk = 0;
856         u32 pd_ab_max_output;
857         u32 gpc_index, ppc_index;
858         u32 temp;
859         u32 cbm_cfg_size1, cbm_cfg_size2;
860
861         gk20a_dbg_fn("");
862
863         if (patch) {
864                 int err;
865                 ch_ctx = &c->ch_ctx;
866                 err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
867                 if (err)
868                         return err;
869         }
870
871         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_tga_constraintlogic_r(),
872                 gr_ds_tga_constraintlogic_beta_cbsize_f(gr->attrib_cb_default_size) |
873                 gr_ds_tga_constraintlogic_alpha_cbsize_f(gr->alpha_cb_default_size),
874                 patch);
875
876         pd_ab_max_output = (gr->alpha_cb_default_size *
877                 gr_gpc0_ppc0_cbm_cfg_size_granularity_v()) /
878                 gr_pd_ab_dist_cfg1_max_output_granularity_v();
879
880         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg1_r(),
881                 gr_pd_ab_dist_cfg1_max_output_f(pd_ab_max_output) |
882                 gr_pd_ab_dist_cfg1_max_batches_init_f(), patch);
883
884         alpha_offset_in_chunk = attrib_offset_in_chunk +
885                 gr->tpc_count * gr->attrib_cb_size;
886
887         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
888                 temp = proj_gpc_stride_v() * gpc_index;
889                 for (ppc_index = 0; ppc_index < gr->gpc_ppc_count[gpc_index];
890                      ppc_index++) {
891                         cbm_cfg_size1 = gr->attrib_cb_default_size *
892                                 gr->pes_tpc_count[ppc_index][gpc_index];
893                         cbm_cfg_size2 = gr->alpha_cb_default_size *
894                                 gr->pes_tpc_count[ppc_index][gpc_index];
895
896                         gr_gk20a_ctx_patch_write(g, ch_ctx,
897                                 gr_gpc0_ppc0_cbm_cfg_r() + temp +
898                                 proj_ppc_in_gpc_stride_v() * ppc_index,
899                                 gr_gpc0_ppc0_cbm_cfg_timeslice_mode_f(gr->timeslice_mode) |
900                                 gr_gpc0_ppc0_cbm_cfg_start_offset_f(attrib_offset_in_chunk) |
901                                 gr_gpc0_ppc0_cbm_cfg_size_f(cbm_cfg_size1), patch);
902
903                         attrib_offset_in_chunk += gr->attrib_cb_size *
904                                 gr->pes_tpc_count[ppc_index][gpc_index];
905
906                         gr_gk20a_ctx_patch_write(g, ch_ctx,
907                                 gr_gpc0_ppc0_cbm_cfg2_r() + temp +
908                                 proj_ppc_in_gpc_stride_v() * ppc_index,
909                                 gr_gpc0_ppc0_cbm_cfg2_start_offset_f(alpha_offset_in_chunk) |
910                                 gr_gpc0_ppc0_cbm_cfg2_size_f(cbm_cfg_size2), patch);
911
912                         alpha_offset_in_chunk += gr->alpha_cb_size *
913                                 gr->pes_tpc_count[ppc_index][gpc_index];
914                 }
915         }
916
917         if (patch)
918                 gr_gk20a_ctx_patch_write_end(g, ch_ctx);
919
920         return 0;
921 }
922
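/* Program the GPU virtual addresses and sizes of the global context
 * buffers (pagepool, bundle CB, attribute CB) into the channel context,
 * either directly or through the patch buffer depending on 'patch'. */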
923 static int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g,
924                         struct channel_gk20a *c, bool patch)
925 {
926         struct gr_gk20a *gr = &g->gr;
927         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
928         u64 addr;
929         u32 size;
930
931         gk20a_dbg_fn("");
932         if (patch) {
933                 int err;
934                 err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
935                 if (err)
936                         return err;
937         }
938
939         /* global pagepool buffer */
940         addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) >>
941                 gr_scc_pagepool_base_addr_39_8_align_bits_v()) |
942                 (u64_hi32(ch_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) <<
943                  (32 - gr_scc_pagepool_base_addr_39_8_align_bits_v()));
944
945         size = gr->global_ctx_buffer[PAGEPOOL].size /
946                 gr_scc_pagepool_total_pages_byte_granularity_v();
947
948         if (size == gr_scc_pagepool_total_pages_hwmax_value_v())
949                 size = gr_scc_pagepool_total_pages_hwmax_v();
950
951         gk20a_dbg_info("pagepool buffer addr : 0x%016llx, size : %d",
952                 addr, size);
953
954         g->ops.gr.commit_global_pagepool(g, ch_ctx, addr, size, patch);
955
956         /* global bundle cb */
957         addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[CIRCULAR_VA]) >>
958                 gr_scc_bundle_cb_base_addr_39_8_align_bits_v()) |
959                 (u64_hi32(ch_ctx->global_ctx_buffer_va[CIRCULAR_VA]) <<
960                  (32 - gr_scc_bundle_cb_base_addr_39_8_align_bits_v()));
961
962         size = gr->bundle_cb_default_size;
963
964         gk20a_dbg_info("bundle cb addr : 0x%016llx, size : %d",
965                 addr, size);
966
967         g->ops.gr.commit_global_bundle_cb(g, ch_ctx, addr, size, patch);
968
969         /* global attrib cb */
970         addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[ATTRIBUTE_VA]) >>
971                 gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v()) |
972                 (u64_hi32(ch_ctx->global_ctx_buffer_va[ATTRIBUTE_VA]) <<
973                  (32 - gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v()));
974
975         gk20a_dbg_info("attrib cb addr : 0x%016llx", addr);
976         g->ops.gr.commit_global_attrib_cb(g, ch_ctx, addr, patch);
977
978         if (patch)
979                 gr_gk20a_ctx_patch_write_end(g, ch_ctx);
980
981         return 0;
982 }
983
984 static void gr_gk20a_commit_global_attrib_cb(struct gk20a *g,
985                                             struct channel_ctx_gk20a *ch_ctx,
986                                             u64 addr, bool patch)
987 {
988         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_attrib_cb_base_r(),
989                 gr_gpcs_setup_attrib_cb_base_addr_39_12_f(addr) |
990                 gr_gpcs_setup_attrib_cb_base_valid_true_f(), patch);
991
992         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pe_pin_cb_global_base_addr_r(),
993                 gr_gpcs_tpcs_pe_pin_cb_global_base_addr_v_f(addr) |
994                 gr_gpcs_tpcs_pe_pin_cb_global_base_addr_valid_true_f(), patch);
995 }
996
997 static void gr_gk20a_commit_global_bundle_cb(struct gk20a *g,
998                                             struct channel_ctx_gk20a *ch_ctx,
999                                             u64 addr, u64 size, bool patch)
1000 {
1001         u32 data;
1002
1003         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_bundle_cb_base_r(),
1004                 gr_scc_bundle_cb_base_addr_39_8_f(addr), patch);
1005
1006         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_bundle_cb_size_r(),
1007                 gr_scc_bundle_cb_size_div_256b_f(size) |
1008                 gr_scc_bundle_cb_size_valid_true_f(), patch);
1009
1010         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_bundle_cb_base_r(),
1011                 gr_gpcs_setup_bundle_cb_base_addr_39_8_f(addr), patch);
1012
1013         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_bundle_cb_size_r(),
1014                 gr_gpcs_setup_bundle_cb_size_div_256b_f(size) |
1015                 gr_gpcs_setup_bundle_cb_size_valid_true_f(), patch);
1016
1017         /* data for state_limit */
1018         data = (g->gr.bundle_cb_default_size *
1019                 gr_scc_bundle_cb_size_div_256b_byte_granularity_v()) /
1020                 gr_pd_ab_dist_cfg2_state_limit_scc_bundle_granularity_v();
1021
1022         data = min_t(u32, data, g->gr.min_gpm_fifo_depth);
1023
1024         gk20a_dbg_info("bundle cb token limit : %d, state limit : %d",
1025                    g->gr.bundle_cb_token_limit, data);
1026
1027         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg2_r(),
1028                 gr_pd_ab_dist_cfg2_token_limit_f(g->gr.bundle_cb_token_limit) |
1029                 gr_pd_ab_dist_cfg2_state_limit_f(data), patch);
1030
1031 }
1032
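/* Enable or disable timeslice mode in the GPM/PD/DS/MPC (and, when
 * enabling, PE) config registers according to gr->timeslice_mode, again
 * written directly or via the patch buffer. */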
1033 static int gr_gk20a_commit_global_timeslice(struct gk20a *g, struct channel_gk20a *c, bool patch)
1034 {
1035         struct gr_gk20a *gr = &g->gr;
1036         struct channel_ctx_gk20a *ch_ctx = NULL;
1037         u32 gpm_pd_cfg;
1038         u32 pd_ab_dist_cfg0;
1039         u32 ds_debug;
1040         u32 mpc_vtg_debug;
1041         u32 pe_vaf;
1042         u32 pe_vsc_vpc;
1043
1044         gk20a_dbg_fn("");
1045
1046         gpm_pd_cfg = gk20a_readl(g, gr_gpcs_gpm_pd_cfg_r());
1047         pd_ab_dist_cfg0 = gk20a_readl(g, gr_pd_ab_dist_cfg0_r());
1048         ds_debug = gk20a_readl(g, gr_ds_debug_r());
1049         mpc_vtg_debug = gk20a_readl(g, gr_gpcs_tpcs_mpc_vtg_debug_r());
1050
1051         if (patch) {
1052                 int err;
1053                 ch_ctx = &c->ch_ctx;
1054                 err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
1055                 if (err)
1056                         return err;
1057         }
1058
1059         if (gr->timeslice_mode == gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v()) {
1060                 pe_vaf = gk20a_readl(g, gr_gpcs_tpcs_pe_vaf_r());
1061                 pe_vsc_vpc = gk20a_readl(g, gr_gpcs_tpcs_pes_vsc_vpc_r());
1062
1063                 gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_enable_f() | gpm_pd_cfg;
1064                 pe_vaf = gr_gpcs_tpcs_pe_vaf_fast_mode_switch_true_f() | pe_vaf;
1065                 pe_vsc_vpc = gr_gpcs_tpcs_pes_vsc_vpc_fast_mode_switch_true_f() | pe_vsc_vpc;
1066                 pd_ab_dist_cfg0 = gr_pd_ab_dist_cfg0_timeslice_enable_en_f() | pd_ab_dist_cfg0;
1067                 ds_debug = gr_ds_debug_timeslice_mode_enable_f() | ds_debug;
1068                 mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_enabled_f() | mpc_vtg_debug;
1069
1070                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, patch);
1071                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pe_vaf_r(), pe_vaf, patch);
1072                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pes_vsc_vpc_r(), pe_vsc_vpc, patch);
1073                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, patch);
1074                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_debug_r(), ds_debug, patch);
1075                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, patch);
1076         } else {
1077                 gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_disable_f() | gpm_pd_cfg;
1078                 pd_ab_dist_cfg0 = gr_pd_ab_dist_cfg0_timeslice_enable_dis_f() | pd_ab_dist_cfg0;
1079                 ds_debug = gr_ds_debug_timeslice_mode_disable_f() | ds_debug;
1080                 mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_disabled_f() | mpc_vtg_debug;
1081
1082                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, patch);
1083                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, patch);
1084                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_debug_r(), ds_debug, patch);
1085                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, patch);
1086         }
1087
1088         if (patch)
1089                 gr_gk20a_ctx_patch_write_end(g, ch_ctx);
1090
1091         return 0;
1092 }
1093
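/* Program the CRSTR/WWDX/RSTR2D GPC map tables from gr->map_tiles so that
 * screen tiles are distributed across the available TPCs; the WWDX
 * coefficients are (1 << n) % (tpc_count << norm_shift) for n = 5..11. */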
1094 int gr_gk20a_setup_rop_mapping(struct gk20a *g, struct gr_gk20a *gr)
1095 {
1096         u32 norm_entries, norm_shift;
1097         u32 coeff5_mod, coeff6_mod, coeff7_mod, coeff8_mod, coeff9_mod, coeff10_mod, coeff11_mod;
1098         u32 map0, map1, map2, map3, map4, map5;
1099
1100         if (!gr->map_tiles)
1101                 return -1;
1102
1103         gk20a_dbg_fn("");
1104
1105         gk20a_writel(g, gr_crstr_map_table_cfg_r(),
1106                      gr_crstr_map_table_cfg_row_offset_f(gr->map_row_offset) |
1107                      gr_crstr_map_table_cfg_num_entries_f(gr->tpc_count));
1108
1109         map0 =  gr_crstr_gpc_map0_tile0_f(gr->map_tiles[0]) |
1110                 gr_crstr_gpc_map0_tile1_f(gr->map_tiles[1]) |
1111                 gr_crstr_gpc_map0_tile2_f(gr->map_tiles[2]) |
1112                 gr_crstr_gpc_map0_tile3_f(gr->map_tiles[3]) |
1113                 gr_crstr_gpc_map0_tile4_f(gr->map_tiles[4]) |
1114                 gr_crstr_gpc_map0_tile5_f(gr->map_tiles[5]);
1115
1116         map1 =  gr_crstr_gpc_map1_tile6_f(gr->map_tiles[6]) |
1117                 gr_crstr_gpc_map1_tile7_f(gr->map_tiles[7]) |
1118                 gr_crstr_gpc_map1_tile8_f(gr->map_tiles[8]) |
1119                 gr_crstr_gpc_map1_tile9_f(gr->map_tiles[9]) |
1120                 gr_crstr_gpc_map1_tile10_f(gr->map_tiles[10]) |
1121                 gr_crstr_gpc_map1_tile11_f(gr->map_tiles[11]);
1122
1123         map2 =  gr_crstr_gpc_map2_tile12_f(gr->map_tiles[12]) |
1124                 gr_crstr_gpc_map2_tile13_f(gr->map_tiles[13]) |
1125                 gr_crstr_gpc_map2_tile14_f(gr->map_tiles[14]) |
1126                 gr_crstr_gpc_map2_tile15_f(gr->map_tiles[15]) |
1127                 gr_crstr_gpc_map2_tile16_f(gr->map_tiles[16]) |
1128                 gr_crstr_gpc_map2_tile17_f(gr->map_tiles[17]);
1129
1130         map3 =  gr_crstr_gpc_map3_tile18_f(gr->map_tiles[18]) |
1131                 gr_crstr_gpc_map3_tile19_f(gr->map_tiles[19]) |
1132                 gr_crstr_gpc_map3_tile20_f(gr->map_tiles[20]) |
1133                 gr_crstr_gpc_map3_tile21_f(gr->map_tiles[21]) |
1134                 gr_crstr_gpc_map3_tile22_f(gr->map_tiles[22]) |
1135                 gr_crstr_gpc_map3_tile23_f(gr->map_tiles[23]);
1136
1137         map4 =  gr_crstr_gpc_map4_tile24_f(gr->map_tiles[24]) |
1138                 gr_crstr_gpc_map4_tile25_f(gr->map_tiles[25]) |
1139                 gr_crstr_gpc_map4_tile26_f(gr->map_tiles[26]) |
1140                 gr_crstr_gpc_map4_tile27_f(gr->map_tiles[27]) |
1141                 gr_crstr_gpc_map4_tile28_f(gr->map_tiles[28]) |
1142                 gr_crstr_gpc_map4_tile29_f(gr->map_tiles[29]);
1143
1144         map5 =  gr_crstr_gpc_map5_tile30_f(gr->map_tiles[30]) |
1145                 gr_crstr_gpc_map5_tile31_f(gr->map_tiles[31]) |
1146                 gr_crstr_gpc_map5_tile32_f(0) |
1147                 gr_crstr_gpc_map5_tile33_f(0) |
1148                 gr_crstr_gpc_map5_tile34_f(0) |
1149                 gr_crstr_gpc_map5_tile35_f(0);
1150
1151         gk20a_writel(g, gr_crstr_gpc_map0_r(), map0);
1152         gk20a_writel(g, gr_crstr_gpc_map1_r(), map1);
1153         gk20a_writel(g, gr_crstr_gpc_map2_r(), map2);
1154         gk20a_writel(g, gr_crstr_gpc_map3_r(), map3);
1155         gk20a_writel(g, gr_crstr_gpc_map4_r(), map4);
1156         gk20a_writel(g, gr_crstr_gpc_map5_r(), map5);
1157
1158         switch (gr->tpc_count) {
1159         case 1:
1160                 norm_shift = 4;
1161                 break;
1162         case 2:
1163         case 3:
1164                 norm_shift = 3;
1165                 break;
1166         case 4:
1167         case 5:
1168         case 6:
1169         case 7:
1170                 norm_shift = 2;
1171                 break;
1172         case 8:
1173         case 9:
1174         case 10:
1175         case 11:
1176         case 12:
1177         case 13:
1178         case 14:
1179         case 15:
1180                 norm_shift = 1;
1181                 break;
1182         default:
1183                 norm_shift = 0;
1184                 break;
1185         }
1186
1187         norm_entries = gr->tpc_count << norm_shift;
1188         coeff5_mod = (1 << 5) % norm_entries;
1189         coeff6_mod = (1 << 6) % norm_entries;
1190         coeff7_mod = (1 << 7) % norm_entries;
1191         coeff8_mod = (1 << 8) % norm_entries;
1192         coeff9_mod = (1 << 9) % norm_entries;
1193         coeff10_mod = (1 << 10) % norm_entries;
1194         coeff11_mod = (1 << 11) % norm_entries;
1195
1196         gk20a_writel(g, gr_ppcs_wwdx_map_table_cfg_r(),
1197                      gr_ppcs_wwdx_map_table_cfg_row_offset_f(gr->map_row_offset) |
1198                      gr_ppcs_wwdx_map_table_cfg_normalized_num_entries_f(norm_entries) |
1199                      gr_ppcs_wwdx_map_table_cfg_normalized_shift_value_f(norm_shift) |
1200                      gr_ppcs_wwdx_map_table_cfg_coeff5_mod_value_f(coeff5_mod) |
1201                      gr_ppcs_wwdx_map_table_cfg_num_entries_f(gr->tpc_count));
1202
1203         gk20a_writel(g, gr_ppcs_wwdx_map_table_cfg2_r(),
1204                      gr_ppcs_wwdx_map_table_cfg2_coeff6_mod_value_f(coeff6_mod) |
1205                      gr_ppcs_wwdx_map_table_cfg2_coeff7_mod_value_f(coeff7_mod) |
1206                      gr_ppcs_wwdx_map_table_cfg2_coeff8_mod_value_f(coeff8_mod) |
1207                      gr_ppcs_wwdx_map_table_cfg2_coeff9_mod_value_f(coeff9_mod) |
1208                      gr_ppcs_wwdx_map_table_cfg2_coeff10_mod_value_f(coeff10_mod) |
1209                      gr_ppcs_wwdx_map_table_cfg2_coeff11_mod_value_f(coeff11_mod));
1210
1211         gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map0_r(), map0);
1212         gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map1_r(), map1);
1213         gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map2_r(), map2);
1214         gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map3_r(), map3);
1215         gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map4_r(), map4);
1216         gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map5_r(), map5);
1217
1218         gk20a_writel(g, gr_rstr2d_map_table_cfg_r(),
1219                      gr_rstr2d_map_table_cfg_row_offset_f(gr->map_row_offset) |
1220                      gr_rstr2d_map_table_cfg_num_entries_f(gr->tpc_count));
1221
1222         gk20a_writel(g, gr_rstr2d_gpc_map0_r(), map0);
1223         gk20a_writel(g, gr_rstr2d_gpc_map1_r(), map1);
1224         gk20a_writel(g, gr_rstr2d_gpc_map2_r(), map2);
1225         gk20a_writel(g, gr_rstr2d_gpc_map3_r(), map3);
1226         gk20a_writel(g, gr_rstr2d_gpc_map4_r(), map4);
1227         gk20a_writel(g, gr_rstr2d_gpc_map5_r(), map5);
1228
1229         return 0;
1230 }
1231
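/* Population count using the n &= n - 1 trick (each iteration clears the
 * lowest set bit). */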
1232 static inline u32 count_bits(u32 mask)
1233 {
1234         u32 temp = mask;
1235         u32 count;
1236         for (count = 0; temp != 0; count++)
1237                 temp &= temp - 1;
1238
1239         return count;
1240 }
1241
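/* Clear up to 'clear_count' of the lowest set bits in 'num'. */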
1242 static inline u32 clear_count_bits(u32 num, u32 clear_count)
1243 {
1244         u32 count = clear_count;
1245         for (; (num != 0) && (count != 0); count--)
1246                 num &= num - 1;
1247
1248         return num;
1249 }
1250
1251 static int gr_gk20a_setup_alpha_beta_tables(struct gk20a *g,
1252                                         struct gr_gk20a *gr)
1253 {
1254         u32 table_index_bits = 5;
1255         u32 rows = (1 << table_index_bits);
1256         u32 row_stride = gr_pd_alpha_ratio_table__size_1_v() / rows;
1257
1258         u32 row;
1259         u32 index;
1260         u32 gpc_index;
1261         u32 gpcs_per_reg = 4;
1262         u32 pes_index;
1263         u32 tpc_count_pes;
1264         u32 num_pes_per_gpc = proj_scal_litter_num_pes_per_gpc_v();
1265
1266         u32 alpha_target, beta_target;
1267         u32 alpha_bits, beta_bits;
1268         u32 alpha_mask, beta_mask, partial_mask;
1269         u32 reg_offset;
1270         bool assign_alpha;
1271
1272         u32 map_alpha[gr_pd_alpha_ratio_table__size_1_v()];
1273         u32 map_beta[gr_pd_alpha_ratio_table__size_1_v()];
1274         u32 map_reg_used[gr_pd_alpha_ratio_table__size_1_v()];
1275
1276         gk20a_dbg_fn("");
1277
1278         memset(map_alpha, 0, gr_pd_alpha_ratio_table__size_1_v() * sizeof(u32));
1279         memset(map_beta, 0, gr_pd_alpha_ratio_table__size_1_v() * sizeof(u32));
1280         memset(map_reg_used, 0, gr_pd_alpha_ratio_table__size_1_v() * sizeof(u32));
1281
1282         for (row = 0; row < rows; ++row) {
1283                 alpha_target = max_t(u32, gr->tpc_count * row / rows, 1);
1284                 beta_target = gr->tpc_count - alpha_target;
1285
1286                 assign_alpha = (alpha_target < beta_target);
1287
1288                 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
1289                         reg_offset = (row * row_stride) + (gpc_index / gpcs_per_reg);
1290                         alpha_mask = beta_mask = 0;
1291
1292                         for (pes_index = 0; pes_index < num_pes_per_gpc; pes_index++) {
1293                                 tpc_count_pes = gr->pes_tpc_count[pes_index][gpc_index];
1294
1295                                 if (assign_alpha) {
1296                                         alpha_bits = (alpha_target == 0) ? 0 : tpc_count_pes;
1297                                         beta_bits = tpc_count_pes - alpha_bits;
1298                                 } else {
1299                                         beta_bits = (beta_target == 0) ? 0 : tpc_count_pes;
1300                                         alpha_bits = tpc_count_pes - beta_bits;
1301                                 }
1302
1303                                 partial_mask = gr->pes_tpc_mask[pes_index][gpc_index];
1304                                 partial_mask = clear_count_bits(partial_mask, tpc_count_pes - alpha_bits);
1305                                 alpha_mask |= partial_mask;
1306
1307                                 partial_mask = gr->pes_tpc_mask[pes_index][gpc_index] ^ partial_mask;
1308                                 beta_mask |= partial_mask;
1309
1310                                 alpha_target -= min(alpha_bits, alpha_target);
1311                                 beta_target -= min(beta_bits, beta_target);
1312
1313                                 if ((alpha_bits > 0) || (beta_bits > 0))
1314                                         assign_alpha = !assign_alpha;
1315                         }
1316
1317                         switch (gpc_index % gpcs_per_reg) {
1318                         case 0:
1319                                 map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n0_mask_f(alpha_mask);
1320                                 map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n0_mask_f(beta_mask);
1321                                 break;
1322                         case 1:
1323                                 map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n1_mask_f(alpha_mask);
1324                                 map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n1_mask_f(beta_mask);
1325                                 break;
1326                         case 2:
1327                                 map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n2_mask_f(alpha_mask);
1328                                 map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n2_mask_f(beta_mask);
1329                                 break;
1330                         case 3:
1331                                 map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n3_mask_f(alpha_mask);
1332                                 map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n3_mask_f(beta_mask);
1333                                 break;
1334                         }
1335                         map_reg_used[reg_offset] = true;
1336                 }
1337         }
1338
1339         for (index = 0; index < gr_pd_alpha_ratio_table__size_1_v(); index++) {
1340                 if (map_reg_used[index]) {
1341                         gk20a_writel(g, gr_pd_alpha_ratio_table_r(index), map_alpha[index]);
1342                         gk20a_writel(g, gr_pd_beta_ratio_table_r(index), map_beta[index]);
1343                 }
1344         }
1345
1346         return 0;
1347 }
1348
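/*
 * Program the floorswept GPC/TPC configuration into the graphics engine:
 * assign SM IDs to the live TPCs, publish per-GPC TPC counts to PD/DS,
 * set up the ROP and (when available) alpha/beta mapping tables, the PD
 * distribution skip table, the CWD floorsweep config, the BES ZROP/CROP
 * settings and, for single-FBP parts, the L2 max-ways-evict-last limit.
 */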
1349 static int gr_gk20a_ctx_state_floorsweep(struct gk20a *g)
1350 {
1351         struct gr_gk20a *gr = &g->gr;
1352         u32 tpc_index, gpc_index;
1353         u32 tpc_offset, gpc_offset;
1354         u32 sm_id = 0, gpc_id = 0;
1355         u32 sm_id_to_gpc_id[proj_scal_max_gpcs_v() * proj_scal_max_tpc_per_gpc_v()];
1356         u32 tpc_per_gpc;
1357         u32 max_ways_evict = INVALID_MAX_WAYS;
1358         u32 l1c_dbg_reg_val;
1359
1360         gk20a_dbg_fn("");
1361
1362         for (tpc_index = 0; tpc_index < gr->max_tpc_per_gpc_count; tpc_index++) {
1363                 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
1364                         gpc_offset = proj_gpc_stride_v() * gpc_index;
1365                         if (tpc_index < gr->gpc_tpc_count[gpc_index]) {
1366                                 tpc_offset = proj_tpc_in_gpc_stride_v() * tpc_index;
1367
1368                                 gk20a_writel(g, gr_gpc0_tpc0_sm_cfg_r() + gpc_offset + tpc_offset,
1369                                              gr_gpc0_tpc0_sm_cfg_sm_id_f(sm_id));
1370                                 gk20a_writel(g, gr_gpc0_tpc0_l1c_cfg_smid_r() + gpc_offset + tpc_offset,
1371                                              gr_gpc0_tpc0_l1c_cfg_smid_value_f(sm_id));
1372                                 gk20a_writel(g, gr_gpc0_gpm_pd_sm_id_r(tpc_index) + gpc_offset,
1373                                              gr_gpc0_gpm_pd_sm_id_id_f(sm_id));
1374                                 gk20a_writel(g, gr_gpc0_tpc0_pe_cfg_smid_r() + gpc_offset + tpc_offset,
1375                                              gr_gpc0_tpc0_pe_cfg_smid_value_f(sm_id));
1376
1377                                 sm_id_to_gpc_id[sm_id] = gpc_index;
1378                                 sm_id++;
1379                         }
1380
1381                         gk20a_writel(g, gr_gpc0_gpm_pd_active_tpcs_r() + gpc_offset,
1382                                      gr_gpc0_gpm_pd_active_tpcs_num_f(gr->gpc_tpc_count[gpc_index]));
1383                         gk20a_writel(g, gr_gpc0_gpm_sd_active_tpcs_r() + gpc_offset,
1384                                      gr_gpc0_gpm_sd_active_tpcs_num_f(gr->gpc_tpc_count[gpc_index]));
1385                 }
1386         }
1387
1388         for (tpc_index = 0, gpc_id = 0;
1389              tpc_index < gr_pd_num_tpc_per_gpc__size_1_v();
1390              tpc_index++, gpc_id += 8) {
1391
1392                 if (gpc_id >= gr->gpc_count)
1393                         gpc_id = 0;
1394
1395                 tpc_per_gpc =
1396                         gr_pd_num_tpc_per_gpc_count0_f(gr->gpc_tpc_count[gpc_id + 0]) |
1397                         gr_pd_num_tpc_per_gpc_count1_f(gr->gpc_tpc_count[gpc_id + 1]) |
1398                         gr_pd_num_tpc_per_gpc_count2_f(gr->gpc_tpc_count[gpc_id + 2]) |
1399                         gr_pd_num_tpc_per_gpc_count3_f(gr->gpc_tpc_count[gpc_id + 3]) |
1400                         gr_pd_num_tpc_per_gpc_count4_f(gr->gpc_tpc_count[gpc_id + 4]) |
1401                         gr_pd_num_tpc_per_gpc_count5_f(gr->gpc_tpc_count[gpc_id + 5]) |
1402                         gr_pd_num_tpc_per_gpc_count6_f(gr->gpc_tpc_count[gpc_id + 6]) |
1403                         gr_pd_num_tpc_per_gpc_count7_f(gr->gpc_tpc_count[gpc_id + 7]);
1404
1405                 gk20a_writel(g, gr_pd_num_tpc_per_gpc_r(tpc_index), tpc_per_gpc);
1406                 gk20a_writel(g, gr_ds_num_tpc_per_gpc_r(tpc_index), tpc_per_gpc);
1407         }
1408
1409         /* gr__setup_pd_mapping stubbed for gk20a */
1410         gr_gk20a_setup_rop_mapping(g, gr);
1411         if (g->ops.gr.setup_alpha_beta_tables)
1412                 g->ops.gr.setup_alpha_beta_tables(g, gr);
1413
1414         if (gr->num_fbps == 1)
1415                 max_ways_evict = 9;
1416
1417         if (max_ways_evict != INVALID_MAX_WAYS)
1418                 g->ops.ltc.set_max_ways_evict_last(g, max_ways_evict);
1419
1420         for (gpc_index = 0;
1421              gpc_index < gr_pd_dist_skip_table__size_1_v() * 4;
1422              gpc_index += 4) {
1423
1424                 gk20a_writel(g, gr_pd_dist_skip_table_r(gpc_index/4),
1425                              gr_pd_dist_skip_table_gpc_4n0_mask_f(gr->gpc_skip_mask[gpc_index]) |
1426                              gr_pd_dist_skip_table_gpc_4n1_mask_f(gr->gpc_skip_mask[gpc_index + 1]) |
1427                              gr_pd_dist_skip_table_gpc_4n2_mask_f(gr->gpc_skip_mask[gpc_index + 2]) |
1428                              gr_pd_dist_skip_table_gpc_4n3_mask_f(gr->gpc_skip_mask[gpc_index + 3]));
1429         }
1430
1431         gk20a_writel(g, gr_cwd_fs_r(),
1432                      gr_cwd_fs_num_gpcs_f(gr->gpc_count) |
1433                      gr_cwd_fs_num_tpcs_f(gr->tpc_count));
1434
1435         gk20a_writel(g, gr_bes_zrop_settings_r(),
1436                      gr_bes_zrop_settings_num_active_fbps_f(gr->num_fbps));
1437         gk20a_writel(g, gr_bes_crop_settings_r(),
1438                      gr_bes_crop_settings_num_active_fbps_f(gr->num_fbps));
1439
1440         /* turn on cya15 bit for a default val that missed the cut */
1441         l1c_dbg_reg_val = gk20a_readl(g, gr_gpc0_tpc0_l1c_dbg_r());
1442         l1c_dbg_reg_val |= gr_gpc0_tpc0_l1c_dbg_cya15_en_f();
1443         gk20a_writel(g, gr_gpc0_tpc0_l1c_dbg_r(), l1c_dbg_reg_val);
1444
1445         return 0;
1446 }
1447
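/*
 * Ask the FECS ucode to save the context image of channel @c. @save_type
 * selects the FECS method (e.g. the golden-image WFI save used below);
 * the method data points the ucode at the channel's instance block.
 */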
1448 static int gr_gk20a_fecs_ctx_image_save(struct channel_gk20a *c, u32 save_type)
1449 {
1450         struct gk20a *g = c->g;
1451         int ret;
1452
1453         u32 inst_base_ptr =
1454                 u64_lo32(c->inst_block.cpu_pa
1455                 >> ram_in_base_shift_v());
1456
1457
1458         gk20a_dbg_fn("");
1459
1460         ret = gr_gk20a_submit_fecs_method_op(g,
1461                 (struct fecs_method_op_gk20a) {
1462                 .method.addr = save_type,
1463                 .method.data = (gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
1464                                 gr_fecs_current_ctx_target_vid_mem_f() |
1465                                 gr_fecs_current_ctx_valid_f(1)),
1466                 .mailbox = {.id = 0, .data = 0, .clr = 3, .ret = NULL,
1467                         .ok = 1, .fail = 2,
1468                 },
1469                 .cond.ok = GR_IS_UCODE_OP_AND,
1470                 .cond.fail = GR_IS_UCODE_OP_AND,
1471                  });
1472
1473         if (ret)
1474                 gk20a_err(dev_from_gk20a(g), "save context image failed");
1475
1476         return ret;
1477 }
1478
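/*
 * Load the SW bundle init list: with pipe-mode override enabled, stream
 * each (address, value) pair through the pipe bundle data/address
 * registers, waiting for FE idle before each write and for full GR idle
 * after a GO_IDLE bundle. The data write is skipped when the value
 * matches the previous bundle.
 */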
1479 static u32 gk20a_init_sw_bundle(struct gk20a *g)
1480 {
1481         struct av_list_gk20a *sw_bundle_init = &g->gr.ctx_vars.sw_bundle_init;
1482         u32 last_bundle_data = 0;
1483         u32 err = 0;
1484         int i;
1485         unsigned long end_jiffies = jiffies +
1486                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
1487
1488         /* enable pipe mode override */
1489         gk20a_writel(g, gr_pipe_bundle_config_r(),
1490                 gr_pipe_bundle_config_override_pipe_mode_enabled_f());
1491
1492         /* load bundle init */
1493         for (i = 0; i < sw_bundle_init->count; i++) {
1494                 err |= gr_gk20a_wait_fe_idle(g, end_jiffies,
1495                                         GR_IDLE_CHECK_DEFAULT);
1496                 if (i == 0 || last_bundle_data != sw_bundle_init->l[i].value) {
1497                         gk20a_writel(g, gr_pipe_bundle_data_r(),
1498                                 sw_bundle_init->l[i].value);
1499                         last_bundle_data = sw_bundle_init->l[i].value;
1500                 }
1501
1502                 gk20a_writel(g, gr_pipe_bundle_address_r(),
1503                              sw_bundle_init->l[i].addr);
1504
1505                 if (gr_pipe_bundle_address_value_v(sw_bundle_init->l[i].addr) ==
1506                     GR_GO_IDLE_BUNDLE)
1507                         err |= gr_gk20a_wait_idle(g, end_jiffies,
1508                                         GR_IDLE_CHECK_DEFAULT);
1509         }
1510
1511         /* disable pipe mode override */
1512         gk20a_writel(g, gr_pipe_bundle_config_r(),
1513                      gr_pipe_bundle_config_override_pipe_mode_disabled_f());
1514
1515         return err;
1516 }
1517
1518 /* Initialize the global golden image from a fresh gr_ctx in the channel ctx
1519    and save a copy in ctx_vars->local_golden_image. */
1520 static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
1521                                           struct channel_gk20a *c)
1522 {
1523         struct gr_gk20a *gr = &g->gr;
1524         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
1525         u32 ctx_header_bytes = ctxsw_prog_fecs_header_v();
1526         u32 ctx_header_words;
1527         u32 i;
1528         u32 data;
1529         void *ctx_ptr = NULL;
1530         void *gold_ptr = NULL;
1531         u32 err = 0;
1532
1533         gk20a_dbg_fn("");
1534
1535         /* The golden ctx is global to all channels. Although only the first
1536            channel initializes the golden image, the driver must prevent
1537            multiple channels from initializing the golden ctx at the same time */
1538         mutex_lock(&gr->ctx_mutex);
1539
1540         if (gr->ctx_vars.golden_image_initialized)
1541                 goto clean_up;
1542
1543         err = gr_gk20a_fecs_ctx_bind_channel(g, c);
1544         if (err)
1545                 goto clean_up;
1546
1547         err = gk20a_init_sw_bundle(g);
1548         if (err)
1549                 goto clean_up;
1550
1551         err = gr_gk20a_elpg_protected_call(g,
1552                         gr_gk20a_commit_global_ctx_buffers(g, c, false));
1553         if (err)
1554                 goto clean_up;
1555
1556         gold_ptr = vmap(gr->global_ctx_buffer[GOLDEN_CTX].pages,
1557                         PAGE_ALIGN(gr->global_ctx_buffer[GOLDEN_CTX].size) >>
1558                         PAGE_SHIFT, 0, pgprot_dmacoherent(PAGE_KERNEL));
1559         if (!gold_ptr)
1560                 goto clean_up;
1561
1562         ctx_ptr = vmap(ch_ctx->gr_ctx.pages,
1563                         PAGE_ALIGN(ch_ctx->gr_ctx.size) >> PAGE_SHIFT,
1564                         0, pgprot_dmacoherent(PAGE_KERNEL));
1565         if (!ctx_ptr)
1566                 goto clean_up;
1567
1568         ctx_header_words = roundup(ctx_header_bytes, sizeof(u32));
1569         ctx_header_words >>= 2;
1570
1571         gk20a_mm_l2_flush(g, true);
1572
1573         for (i = 0; i < ctx_header_words; i++) {
1574                 data = gk20a_mem_rd32(ctx_ptr, i);
1575                 gk20a_mem_wr32(gold_ptr, i, data);
1576         }
1577
1578         gk20a_mem_wr32(gold_ptr + ctxsw_prog_main_image_zcull_o(), 0,
1579                  ctxsw_prog_main_image_zcull_mode_no_ctxsw_v());
1580
1581         gk20a_mem_wr32(gold_ptr + ctxsw_prog_main_image_zcull_ptr_o(), 0, 0);
1582
1583         gr_gk20a_commit_inst(c, ch_ctx->global_ctx_buffer_va[GOLDEN_CTX_VA]);
1584
1585         gr_gk20a_fecs_ctx_image_save(c, gr_fecs_method_push_adr_wfi_golden_save_v());
1586
1587         if (gr->ctx_vars.local_golden_image == NULL) {
1588
1589                 gr->ctx_vars.local_golden_image =
1590                         kzalloc(gr->ctx_vars.golden_image_size, GFP_KERNEL);
1591
1592                 if (gr->ctx_vars.local_golden_image == NULL) {
1593                         err = -ENOMEM;
1594                         goto clean_up;
1595                 }
1596
1597                 for (i = 0; i < gr->ctx_vars.golden_image_size / 4; i++)
1598                         gr->ctx_vars.local_golden_image[i] =
1599                                 gk20a_mem_rd32(gold_ptr, i);
1600         }
1601
1602         gr_gk20a_commit_inst(c, ch_ctx->gr_ctx.gpu_va);
1603
1604         gr->ctx_vars.golden_image_initialized = true;
1605
1606         gk20a_writel(g, gr_fecs_current_ctx_r(),
1607                 gr_fecs_current_ctx_valid_false_f());
1608
1609 clean_up:
1610         if (err)
1611                 gk20a_err(dev_from_gk20a(g), "fail");
1612         else
1613                 gk20a_dbg_fn("done");
1614
1615         if (gold_ptr)
1616                 vunmap(gold_ptr);
1617         if (ctx_ptr)
1618                 vunmap(ctx_ptr);
1619
1620         mutex_unlock(&gr->ctx_mutex);
1621         return err;
1622 }
1623
1624 int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g,
1625                                     struct channel_gk20a *c,
1626                                     bool enable_smpc_ctxsw)
1627 {
1628         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
1629         void *ctx_ptr = NULL;
1630         u32 data;
1631
1632         /* The channel gr_ctx buffer is GPU cacheable;
1633            flush and invalidate it before the CPU update. */
1634         gk20a_mm_l2_flush(g, true);
1635
1636         ctx_ptr = vmap(ch_ctx->gr_ctx.pages,
1637                         PAGE_ALIGN(ch_ctx->gr_ctx.size) >> PAGE_SHIFT,
1638                         0, pgprot_dmacoherent(PAGE_KERNEL));
1639         if (!ctx_ptr)
1640                 return -ENOMEM;
1641
1642         data = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0);
1643         data = data & ~ctxsw_prog_main_image_pm_smpc_mode_m();
1644         data |= enable_smpc_ctxsw ?
1645                 ctxsw_prog_main_image_pm_smpc_mode_ctxsw_f() :
1646                 ctxsw_prog_main_image_pm_smpc_mode_no_ctxsw_f();
1647         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0,
1648                  data);
1649
1650         vunmap(ctx_ptr);
1651
1652         return 0;
1653 }
1654
1655 /* load the saved copy of the golden image into the channel gr_ctx */
1656 static int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
1657                                         struct channel_gk20a *c)
1658 {
1659         struct gr_gk20a *gr = &g->gr;
1660         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
1661         u32 virt_addr_lo;
1662         u32 virt_addr_hi;
1663         u32 i, v, data;
1664         int ret = 0;
1665         void *ctx_ptr = NULL;
1666
1667         gk20a_dbg_fn("");
1668
1669         if (gr->ctx_vars.local_golden_image == NULL)
1670                 return -1;
1671
1672         /* The channel gr_ctx buffer is GPU cacheable;
1673            flush and invalidate it before the CPU update. */
1674         gk20a_mm_l2_flush(g, true);
1675
1676         ctx_ptr = vmap(ch_ctx->gr_ctx.pages,
1677                         PAGE_ALIGN(ch_ctx->gr_ctx.size) >> PAGE_SHIFT,
1678                         0, pgprot_dmacoherent(PAGE_KERNEL));
1679         if (!ctx_ptr)
1680                 return -ENOMEM;
1681
1682         for (i = 0; i < gr->ctx_vars.golden_image_size / 4; i++)
1683                 gk20a_mem_wr32(ctx_ptr, i, gr->ctx_vars.local_golden_image[i]);
1684
1685         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_save_ops_o(), 0, 0);
1686         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_restore_ops_o(), 0, 0);
1687
1688         virt_addr_lo = u64_lo32(ch_ctx->patch_ctx.gpu_va);
1689         virt_addr_hi = u64_hi32(ch_ctx->patch_ctx.gpu_va);
1690
1691         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_count_o(), 0,
1692                  ch_ctx->patch_ctx.data_count);
1693         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_adr_lo_o(), 0,
1694                  virt_addr_lo);
1695         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_adr_hi_o(), 0,
1696                  virt_addr_hi);
1697
1698         /* no user yet for the client-managed performance counter ctx */
1699         ch_ctx->pm_ctx.ctx_sw_mode =
1700                 ctxsw_prog_main_image_pm_mode_no_ctxsw_f();
1701         data = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0);
1702         data = data & ~ctxsw_prog_main_image_pm_mode_m();
1703         data |= ch_ctx->pm_ctx.ctx_sw_mode;
1704         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0,
1705                  data);
1706
1707         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_ptr_o(), 0, 0);
1708
1709         /* set priv access map */
1710         virt_addr_lo =
1711                  u64_lo32(ch_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]);
1712         virt_addr_hi =
1713                  u64_hi32(ch_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]);
1714
1715         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_config_o(), 0,
1716                  ctxsw_prog_main_image_priv_access_map_config_mode_use_map_f());
1717         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_addr_lo_o(), 0,
1718                  virt_addr_lo);
1719         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_addr_hi_o(), 0,
1720                  virt_addr_hi);
1721         /* disable verif features */
1722         v = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_misc_options_o(), 0);
1723         v = v & ~(ctxsw_prog_main_image_misc_options_verif_features_m());
1724         v = v | ctxsw_prog_main_image_misc_options_verif_features_disabled_f();
1725         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_misc_options_o(), 0, v);
1726
1727
1728         vunmap(ctx_ptr);
1729
1730         if (tegra_platform_is_linsim()) {
1731                 u32 inst_base_ptr =
1732                         u64_lo32(c->inst_block.cpu_pa
1733                         >> ram_in_base_shift_v());
1734
1735                 ret = gr_gk20a_submit_fecs_method_op(g,
1736                           (struct fecs_method_op_gk20a) {
1737                                   .method.data =
1738                                           (gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
1739                                            gr_fecs_current_ctx_target_vid_mem_f() |
1740                                            gr_fecs_current_ctx_valid_f(1)),
1741                                   .method.addr =
1742                                           gr_fecs_method_push_adr_restore_golden_v(),
1743                                   .mailbox = {
1744                                           .id = 0, .data = 0,
1745                                           .clr = ~0, .ret = NULL,
1746                                           .ok = gr_fecs_ctxsw_mailbox_value_pass_v(),
1747                                           .fail = 0},
1748                                   .cond.ok = GR_IS_UCODE_OP_EQUAL,
1749                                   .cond.fail = GR_IS_UCODE_OP_SKIP});
1750
1751                 if (ret)
1752                         gk20a_err(dev_from_gk20a(g),
1753                                    "restore context image failed");
1754         }
1755
1756         return ret;
1757 }
1758
1759 static void gr_gk20a_start_falcon_ucode(struct gk20a *g)
1760 {
1761         gk20a_dbg_fn("");
1762
1763         gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0),
1764                      gr_fecs_ctxsw_mailbox_clear_value_f(~0));
1765
1766         gk20a_writel(g, gr_gpccs_dmactl_r(), gr_gpccs_dmactl_require_ctx_f(0));
1767         gk20a_writel(g, gr_fecs_dmactl_r(), gr_fecs_dmactl_require_ctx_f(0));
1768
1769         gk20a_writel(g, gr_gpccs_cpuctl_r(), gr_gpccs_cpuctl_startcpu_f(1));
1770         gk20a_writel(g, gr_fecs_cpuctl_r(), gr_fecs_cpuctl_startcpu_f(1));
1771
1772         gk20a_dbg_fn("done");
1773 }
1774
1775 static int gr_gk20a_init_ctxsw_ucode_vaspace(struct gk20a *g)
1776 {
1777         struct mm_gk20a *mm = &g->mm;
1778         struct vm_gk20a *vm = &mm->pmu.vm;
1779         struct device *d = dev_from_gk20a(g);
1780         struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
1781         void *inst_ptr;
1782         u32 pde_addr_lo;
1783         u32 pde_addr_hi;
1784         u64 pde_addr;
1785         dma_addr_t iova;
1786
1787         /* Alloc mem of inst block */
1788         ucode_info->inst_blk_desc.size = ram_in_alloc_size_v();
1789         ucode_info->inst_blk_desc.cpuva = dma_alloc_coherent(d,
1790                                         ucode_info->inst_blk_desc.size,
1791                                         &iova,
1792                                         GFP_KERNEL);
1793         if (!ucode_info->inst_blk_desc.cpuva) {
1794                 gk20a_err(d, "failed to allocate memory\n");
1795                 return -ENOMEM;
1796         }
1797
1798         ucode_info->inst_blk_desc.iova = iova;
1799         ucode_info->inst_blk_desc.cpu_pa = gk20a_get_phys_from_iova(d,
1800                                         ucode_info->inst_blk_desc.iova);
1801
1802         inst_ptr = ucode_info->inst_blk_desc.cpuva;
1803
1804         /* Set inst block */
1805         gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_lo_w(),
1806                  u64_lo32(vm->va_limit) | 0xFFF);
1807         gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_hi_w(),
1808                 ram_in_adr_limit_hi_f(u64_hi32(vm->va_limit)));
1809
1810         pde_addr = gk20a_mm_iova_addr(vm->pdes.sgt->sgl);
1811         pde_addr_lo = u64_lo32(pde_addr >> 12);
1812         pde_addr_hi = u64_hi32(pde_addr);
1813         gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_lo_w(),
1814                 ram_in_page_dir_base_target_vid_mem_f() |
1815                 ram_in_page_dir_base_vol_true_f() |
1816                 ram_in_page_dir_base_lo_f(pde_addr_lo));
1817         gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_hi_w(),
1818                 ram_in_page_dir_base_hi_f(pde_addr_hi));
1819
1820         /* Map ucode surface to GMMU */
1821         ucode_info->ucode_gpuva = gk20a_gmmu_map(vm,
1822                                         &ucode_info->surface_desc.sgt,
1823                                         ucode_info->surface_desc.size,
1824                                         0, /* flags */
1825                                         gk20a_mem_flag_read_only);
1826         if (!ucode_info->ucode_gpuva) {
1827                 gk20a_err(d, "failed to update gmmu ptes\n");
1828                 return -ENOMEM;
1829         }
1830
1831         return 0;
1832 }
1833
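/*
 * Lay out one ucode segment at the current surface offset and advance
 * the running offset to the next BLK_SIZE (256 B) boundary. E.g. with
 * consecutive calls for a 0x140-byte boot segment and a 0x1234-byte code
 * segment, boot lands at 0x000, code at 0x200 and the next free offset
 * is 0x1500.
 */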
1834 static void gr_gk20a_init_ctxsw_ucode_segment(
1835         struct gk20a_ctxsw_ucode_segment *p_seg, u32 *offset, u32 size)
1836 {
1837         p_seg->offset = *offset;
1838         p_seg->size = size;
1839         *offset = ALIGN(*offset + size, BLK_SIZE);
1840 }
1841
1842 static void gr_gk20a_init_ctxsw_ucode_segments(
1843         struct gk20a_ctxsw_ucode_segments *segments, u32 *offset,
1844         struct gk20a_ctxsw_bootloader_desc *bootdesc,
1845         u32 code_size, u32 data_size)
1846 {
1847         u32 boot_size = ALIGN(bootdesc->size, sizeof(u32));
1848         segments->boot_entry = bootdesc->entry_point;
1849         segments->boot_imem_offset = bootdesc->imem_offset;
1850         gr_gk20a_init_ctxsw_ucode_segment(&segments->boot, offset, boot_size);
1851         gr_gk20a_init_ctxsw_ucode_segment(&segments->code, offset, code_size);
1852         gr_gk20a_init_ctxsw_ucode_segment(&segments->data, offset, data_size);
1853 }
1854
1855 static int gr_gk20a_copy_ctxsw_ucode_segments(
1856         u8 *buf,
1857         struct gk20a_ctxsw_ucode_segments *segments,
1858         u32 *bootimage,
1859         u32 *code, u32 *data)
1860 {
1861         memcpy(buf + segments->boot.offset, bootimage, segments->boot.size);
1862         memcpy(buf + segments->code.offset, code,      segments->code.size);
1863         memcpy(buf + segments->data.offset, data,      segments->data.size);
1864         return 0;
1865 }
1866
1867 static int gr_gk20a_init_ctxsw_ucode(struct gk20a *g)
1868 {
1869         struct device *d = dev_from_gk20a(g);
1870         struct mm_gk20a *mm = &g->mm;
1871         struct vm_gk20a *vm = &mm->pmu.vm;
1872         struct gk20a_ctxsw_bootloader_desc *fecs_boot_desc;
1873         struct gk20a_ctxsw_bootloader_desc *gpccs_boot_desc;
1874         const struct firmware *fecs_fw;
1875         const struct firmware *gpccs_fw;
1876         u32 *fecs_boot_image;
1877         u32 *gpccs_boot_image;
1878         struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
1879         u8 *buf;
1880         u32 ucode_size;
1881         int err = 0;
1882         dma_addr_t iova;
1883         DEFINE_DMA_ATTRS(attrs);
1884
1885         fecs_fw = gk20a_request_firmware(g, GK20A_FECS_UCODE_IMAGE);
1886         if (!fecs_fw) {
1887                 gk20a_err(d, "failed to load fecs ucode!!");
1888                 return -ENOENT;
1889         }
1890
1891         fecs_boot_desc = (void *)fecs_fw->data;
1892         fecs_boot_image = (void *)(fecs_fw->data +
1893                                 sizeof(struct gk20a_ctxsw_bootloader_desc));
1894
1895         gpccs_fw = gk20a_request_firmware(g, GK20A_GPCCS_UCODE_IMAGE);
1896         if (!gpccs_fw) {
1897                 release_firmware(fecs_fw);
1898                 gk20a_err(d, "failed to load gpccs ucode!!");
1899                 return -ENOENT;
1900         }
1901
1902         gpccs_boot_desc = (void *)gpccs_fw->data;
1903         gpccs_boot_image = (void *)(gpccs_fw->data +
1904                                 sizeof(struct gk20a_ctxsw_bootloader_desc));
1905
1906         ucode_size = 0;
1907         gr_gk20a_init_ctxsw_ucode_segments(&ucode_info->fecs, &ucode_size,
1908                 fecs_boot_desc,
1909                 g->gr.ctx_vars.ucode.fecs.inst.count * sizeof(u32),
1910                 g->gr.ctx_vars.ucode.fecs.data.count * sizeof(u32));
1911         gr_gk20a_init_ctxsw_ucode_segments(&ucode_info->gpccs, &ucode_size,
1912                 gpccs_boot_desc,
1913                 g->gr.ctx_vars.ucode.gpccs.inst.count * sizeof(u32),
1914                 g->gr.ctx_vars.ucode.gpccs.data.count * sizeof(u32));
1915
1916         ucode_info->surface_desc.size = ucode_size;
1917         dma_set_attr(DMA_ATTR_READ_ONLY, &attrs);
1918         ucode_info->surface_desc.cpuva = dma_alloc_attrs(d,
1919                                         ucode_info->surface_desc.size,
1920                                         &iova,
1921                                         GFP_KERNEL,
1922                                         &attrs);
1923         if (!ucode_info->surface_desc.cpuva) {
1924                 gk20a_err(d, "memory allocation failed\n");
1925                 err = -ENOMEM;
1926                 goto clean_up;
1927         }
1928
1929         ucode_info->surface_desc.iova = iova;
1930         err = gk20a_get_sgtable(d, &ucode_info->surface_desc.sgt,
1931                                 ucode_info->surface_desc.cpuva,
1932                                 ucode_info->surface_desc.iova,
1933                                 ucode_info->surface_desc.size);
1934         if (err) {
1935                 gk20a_err(d, "failed to create sg table\n");
1936                 goto clean_up;
1937         }
1938
1939         buf = (u8 *)ucode_info->surface_desc.cpuva;
1940         if (!buf) {
1941                 gk20a_err(d, "failed to map surface desc buffer");
1942                 err = -ENOMEM;
1943                 goto clean_up;
1944         }
1945
1946         gr_gk20a_copy_ctxsw_ucode_segments(buf, &ucode_info->fecs,
1947                 fecs_boot_image,
1948                 g->gr.ctx_vars.ucode.fecs.inst.l,
1949                 g->gr.ctx_vars.ucode.fecs.data.l);
1950
1951         release_firmware(fecs_fw);
1952         fecs_fw = NULL;
1953
1954         gr_gk20a_copy_ctxsw_ucode_segments(buf, &ucode_info->gpccs,
1955                 gpccs_boot_image,
1956                 g->gr.ctx_vars.ucode.gpccs.inst.l,
1957                 g->gr.ctx_vars.ucode.gpccs.data.l);
1958
1959         release_firmware(gpccs_fw);
1960         gpccs_fw = NULL;
1961
1962         err = gr_gk20a_init_ctxsw_ucode_vaspace(g);
1963         if (err)
1964                 goto clean_up;
1965
1966         gk20a_free_sgtable(&ucode_info->surface_desc.sgt);
1967
1968         return 0;
1969
1970  clean_up:
1971         if (ucode_info->ucode_gpuva)
1972                 gk20a_gmmu_unmap(vm, ucode_info->ucode_gpuva,
1973                         ucode_info->surface_desc.size, gk20a_mem_flag_none);
1974         if (ucode_info->surface_desc.sgt)
1975                 gk20a_free_sgtable(&ucode_info->surface_desc.sgt);
1976         if (ucode_info->surface_desc.cpuva)
1977                 dma_free_attrs(d, ucode_info->surface_desc.size,
1978                                 ucode_info->surface_desc.cpuva,
1979                                 ucode_info->surface_desc.iova,
1980                                 &attrs);
1981         ucode_info->surface_desc.cpuva = NULL;
1982         ucode_info->surface_desc.iova = 0;
1983
1984         release_firmware(gpccs_fw);
1985         gpccs_fw = NULL;
1986         release_firmware(fecs_fw);
1987         fecs_fw = NULL;
1988
1989         return err;
1990 }
1991
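/*
 * Bind the ctxsw ucode instance block to the FECS context arbiter: wait
 * for the arbiter to go idle, program the new/arb/current context
 * pointers with the inst block physical address, and poll each arbiter
 * command for completion (20 x 2 us before giving up with an error).
 */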
1992 static void gr_gk20a_load_falcon_bind_instblk(struct gk20a *g)
1993 {
1994         struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
1995         int retries = 20;
1996         phys_addr_t inst_ptr;
1997         u32 val;
1998
1999         while ((gk20a_readl(g, gr_fecs_ctxsw_status_1_r()) &
2000                         gr_fecs_ctxsw_status_1_arb_busy_m()) && retries) {
2001                 udelay(2);
2002                 retries--;
2003         }
2004         if (!retries)
2005                 gk20a_err(dev_from_gk20a(g), "arbiter idle timeout");
2006
2007         gk20a_writel(g, gr_fecs_arb_ctx_adr_r(), 0x0);
2008
2009         inst_ptr = ucode_info->inst_blk_desc.cpu_pa;
2010         gk20a_writel(g, gr_fecs_new_ctx_r(),
2011                         gr_fecs_new_ctx_ptr_f(inst_ptr >> 12) |
2012                         gr_fecs_new_ctx_target_m() |
2013                         gr_fecs_new_ctx_valid_m());
2014
2015         gk20a_writel(g, gr_fecs_arb_ctx_ptr_r(),
2016                         gr_fecs_arb_ctx_ptr_ptr_f(inst_ptr >> 12) |
2017                         gr_fecs_arb_ctx_ptr_target_m());
2018
2019         gk20a_writel(g, gr_fecs_arb_ctx_cmd_r(), 0x7);
2020
2021         /* Wait for arbiter command to complete */
2022         retries = 20;
2023         val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
2024         while (gr_fecs_arb_ctx_cmd_cmd_v(val) && retries) {
2025                 udelay(2);
2026                 retries--;
2027                 val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
2028         }
2029         if (!retries)
2030                 gk20a_err(dev_from_gk20a(g), "arbiter complete timeout");
2031
2032         gk20a_writel(g, gr_fecs_current_ctx_r(),
2033                         gr_fecs_current_ctx_ptr_f(inst_ptr >> 12) |
2034                         gr_fecs_current_ctx_target_m() |
2035                         gr_fecs_current_ctx_valid_m());
2036         /* Send command to arbiter to flush */
2037         gk20a_writel(g, gr_fecs_arb_ctx_cmd_r(), gr_fecs_arb_ctx_cmd_cmd_s());
2038
2039         retries = 20;
2040         val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
2041         while (gr_fecs_arb_ctx_cmd_cmd_v(val) && retries) {
2042                 udelay(2);
2043                 retries--;
2044                 val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
2045         }
2046         if (!retries)
2047                 gk20a_err(dev_from_gk20a(g), "arbiter complete timeout");
2048 }
2049
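/*
 * Bootstrap one ctxsw falcon (FECS at reg_offset 0, GPCCS at the GPCCS
 * register offset): write the bootloader header into DMEM through the
 * auto-incrementing port 0, DMA the boot image from FB into IMEM in
 * 256-byte blocks, set the boot vector and start the falcon CPU.
 */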
2050 static int gr_gk20a_load_ctxsw_ucode_segments(struct gk20a *g, u64 addr_base,
2051         struct gk20a_ctxsw_ucode_segments *segments, u32 reg_offset)
2052 {
2053         u32 addr_code32;
2054         u32 addr_data32;
2055         u32 addr_load32;
2056         u32 dst = 0;
2057         u32 blocks;
2058         u32 b;
2059
2060         addr_code32 = u64_lo32((addr_base + segments->code.offset) >> 8);
2061         addr_data32 = u64_lo32((addr_base + segments->data.offset) >> 8);
2062         addr_load32 = u64_lo32((addr_base + segments->boot.offset) >> 8);
2063
2064         gk20a_writel(g, reg_offset + gr_fecs_dmactl_r(),
2065                         gr_fecs_dmactl_require_ctx_f(0));
2066
2067         /*
2068          * Copy falcon bootloader header into dmem at offset 0.
2069          * Configure dmem port 0 for auto-incrementing writes starting at dmem
2070          * offset 0.
2071          */
2072         gk20a_writel(g, reg_offset + gr_fecs_dmemc_r(0),
2073                         gr_fecs_dmemc_offs_f(0) |
2074                         gr_fecs_dmemc_blk_f(0) |
2075                         gr_fecs_dmemc_aincw_f(1));
2076
2077         /* Write out the actual data */
2078         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2079         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), addr_code32);
2080         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2081         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), segments->code.size);
2082         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2083         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), addr_data32);
2084         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), segments->data.size);
2085         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), addr_code32);
2086         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2087         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2088
2089         blocks = ((segments->boot.size + 0xFF) & ~0xFF) >> 8;
2090
2091         /*
2092          * Set the base FB address for the DMA transfer. Subtract off the 256
2093          * byte IMEM block offset such that the relative FB and IMEM offsets
2094          * match, allowing the IMEM tags to be properly created.
2095          */
2096
2097         dst = segments->boot_imem_offset;
2098         gk20a_writel(g, reg_offset + gr_fecs_dmatrfbase_r(),
2099                         (addr_load32 - (dst >> 8)));
2100
2101         for (b = 0; b < blocks; b++) {
2102                 /* Setup destination IMEM offset */
2103                 gk20a_writel(g, reg_offset + gr_fecs_dmatrfmoffs_r(),
2104                                 dst + (b << 8));
2105
2106                 /* Setup source offset (relative to BASE) */
2107                 gk20a_writel(g, reg_offset + gr_fecs_dmatrffboffs_r(),
2108                                 dst + (b << 8));
2109
2110                 gk20a_writel(g, reg_offset + gr_fecs_dmatrfcmd_r(),
2111                                 gr_fecs_dmatrfcmd_imem_f(0x01) |
2112                                 gr_fecs_dmatrfcmd_write_f(0x00) |
2113                                 gr_fecs_dmatrfcmd_size_f(0x06) |
2114                                 gr_fecs_dmatrfcmd_ctxdma_f(0));
2115         }
2116
2117         /* Specify the falcon boot vector */
2118         gk20a_writel(g, reg_offset + gr_fecs_bootvec_r(),
2119                         gr_fecs_bootvec_vec_f(segments->boot_entry));
2120
2121         /* Write to CPUCTL to start the falcon */
2122         gk20a_writel(g, reg_offset + gr_fecs_cpuctl_r(),
2123                         gr_fecs_cpuctl_startcpu_f(0x01));
2124
2125         return 0;
2126 }
2127
2128 static void gr_gk20a_load_falcon_with_bootloader(struct gk20a *g)
2129 {
2130         struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
2131         u64 addr_base = ucode_info->ucode_gpuva;
2132
2133         gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0x0);
2134
2135         gr_gk20a_load_falcon_bind_instblk(g);
2136
2137         gr_gk20a_load_ctxsw_ucode_segments(g, addr_base,
2138                 &g->ctxsw_ucode_info.fecs, 0);
2139
2140         gr_gk20a_load_ctxsw_ucode_segments(g, addr_base,
2141                 &g->ctxsw_ucode_info.gpccs,
2142                 gr_gpcs_gpccs_falcon_hwcfg_r() -
2143                 gr_fecs_falcon_hwcfg_r());
2144 }
2145
2146 static int gr_gk20a_load_ctxsw_ucode(struct gk20a *g, struct gr_gk20a *gr)
2147 {
2148         u32 ret;
2149
2150         gk20a_dbg_fn("");
2151
2152         if (tegra_platform_is_linsim()) {
2153                 gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(7),
2154                         gr_fecs_ctxsw_mailbox_value_f(0xc0de7777));
2155                 gk20a_writel(g, gr_gpccs_ctxsw_mailbox_r(7),
2156                         gr_gpccs_ctxsw_mailbox_value_f(0xc0de7777));
2157         }
2158
2159         /*
2160          * In case the gPMU falcon is not being used, revert to the old way of
2161          * loading gr ucode, without the faster bootstrap routine.
2162          */
2163         if (!support_gk20a_pmu()) {
2164                 gr_gk20a_load_falcon_dmem(g);
2165                 gr_gk20a_load_falcon_imem(g);
2166                 gr_gk20a_start_falcon_ucode(g);
2167         } else {
2168                 if (!gr->skip_ucode_init)
2169                         gr_gk20a_init_ctxsw_ucode(g);
2170                 gr_gk20a_load_falcon_with_bootloader(g);
2171                 gr->skip_ucode_init = true;
2172         }
2173
2174         ret = gr_gk20a_ctx_wait_ucode(g, 0, 0,
2175                                       GR_IS_UCODE_OP_EQUAL,
2176                                       eUcodeHandshakeInitComplete,
2177                                       GR_IS_UCODE_OP_SKIP, 0);
2178         if (ret) {
2179                 gk20a_err(dev_from_gk20a(g), "falcon ucode init timeout");
2180                 return ret;
2181         }
2182
2183         if (support_gk20a_pmu())
2184                 gk20a_writel(g, gr_fecs_current_ctx_r(),
2185                         gr_fecs_current_ctx_valid_false_f());
2186
2187         gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0xffffffff);
2188         gk20a_writel(g, gr_fecs_method_data_r(), 0x7fffffff);
2189         gk20a_writel(g, gr_fecs_method_push_r(),
2190                      gr_fecs_method_push_adr_set_watchdog_timeout_f());
2191
2192         gk20a_dbg_fn("done");
2193         return 0;
2194 }
2195
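/*
 * Query the FECS ucode for the golden, zcull and PM context image sizes
 * and cache them in gr.ctx_vars. When the sizes are already cached (e.g.
 * after railgating) the newly reported values are expected to match.
 */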
2196 static int gr_gk20a_init_ctx_state(struct gk20a *g, struct gr_gk20a *gr)
2197 {
2198         u32 golden_ctx_image_size = 0;
2199         u32 zcull_ctx_image_size = 0;
2200         u32 pm_ctx_image_size = 0;
2201         u32 ret;
2202         struct fecs_method_op_gk20a op = {
2203                 .mailbox = { .id = 0, .data = 0,
2204                              .clr = ~0, .ok = 0, .fail = 0},
2205                 .method.data = 0,
2206                 .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL,
2207                 .cond.fail = GR_IS_UCODE_OP_SKIP,
2208                 };
2209
2210         gk20a_dbg_fn("");
2211         op.method.addr = gr_fecs_method_push_adr_discover_image_size_v();
2212         op.mailbox.ret = &golden_ctx_image_size;
2213         ret = gr_gk20a_submit_fecs_method_op(g, op);
2214         if (ret) {
2215                 gk20a_err(dev_from_gk20a(g),
2216                            "query golden image size failed");
2217                 return ret;
2218         }
2219         op.method.addr = gr_fecs_method_push_adr_discover_zcull_image_size_v();
2220         op.mailbox.ret = &zcull_ctx_image_size;
2221         ret = gr_gk20a_submit_fecs_method_op(g, op);
2222         if (ret) {
2223                 gk20a_err(dev_from_gk20a(g),
2224                            "query zcull ctx image size failed");
2225                 return ret;
2226         }
2227         op.method.addr = gr_fecs_method_push_adr_discover_pm_image_size_v();
2228         op.mailbox.ret = &pm_ctx_image_size;
2229         ret = gr_gk20a_submit_fecs_method_op(g, op);
2230         if (ret) {
2231                 gk20a_err(dev_from_gk20a(g),
2232                            "query pm ctx image size failed");
2233                 return ret;
2234         }
2235
2236         if (!g->gr.ctx_vars.golden_image_size &&
2237             !g->gr.ctx_vars.zcull_ctxsw_image_size) {
2238                 g->gr.ctx_vars.golden_image_size = golden_ctx_image_size;
2239                 g->gr.ctx_vars.zcull_ctxsw_image_size = zcull_ctx_image_size;
2240         } else {
2241                 /* hw is different after railgating? */
2242                 BUG_ON(g->gr.ctx_vars.golden_image_size != golden_ctx_image_size);
2243                 BUG_ON(g->gr.ctx_vars.zcull_ctxsw_image_size != zcull_ctx_image_size);
2244         }
2245
2246         g->gr.ctx_vars.priv_access_map_size = 512 * 1024;
2247
2248         gk20a_dbg_fn("done");
2249         return 0;
2250 }
2251
2252 static void gk20a_gr_destroy_ctx_buffer(struct platform_device *pdev,
2253                                         struct gr_ctx_buffer_desc *desc)
2254 {
2255         struct device *dev = &pdev->dev;
2256         gk20a_free_sgtable(&desc->sgt);
2257         dma_free_attrs(dev, desc->size, desc->pages,
2258                        desc->iova, &desc->attrs);
2259 }
2260
2261 static int gk20a_gr_alloc_ctx_buffer(struct platform_device *pdev,
2262                                      struct gr_ctx_buffer_desc *desc,
2263                                      size_t size)
2264 {
2265         struct device *dev = &pdev->dev;
2266         DEFINE_DMA_ATTRS(attrs);
2267         dma_addr_t iova;
2268         int err = 0;
2269
2270         dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2271
2272         desc->pages = dma_alloc_attrs(&pdev->dev, size, &iova,
2273                                       GFP_KERNEL, &attrs);
2274         if (!desc->pages)
2275                 return -ENOMEM;
2276
2277         desc->iova = iova;
2278         desc->size = size;
2279         desc->attrs = attrs;
2280         desc->destroy = gk20a_gr_destroy_ctx_buffer;
2281         err = gk20a_get_sgtable_from_pages(&pdev->dev, &desc->sgt, desc->pages,
2282                                            desc->iova, desc->size);
2283         if (err) {
2284                 dma_free_attrs(dev, desc->size, desc->pages,
2285                                desc->iova, &desc->attrs);
2286                 memset(desc, 0, sizeof(*desc));
2287         }
2288
2289         return err;
2290 }
2291
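/*
 * Allocate the global context buffers shared by all channels: circular
 * buffer, page pool and attribute buffer (plus VPR variants when a
 * secure allocator is provided), the golden context image and the priv
 * register access map. Sizes come from HW defaults, the golden image
 * size discovered from FECS and the fixed 512 KB access map.
 */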
2292 static int gr_gk20a_alloc_global_ctx_buffers(struct gk20a *g)
2293 {
2294         struct gk20a_platform *platform = platform_get_drvdata(g->dev);
2295         struct gr_gk20a *gr = &g->gr;
2296         int i, attr_buffer_size, err;
2297         struct platform_device *pdev = g->dev;
2298
2299         u32 cb_buffer_size = gr->bundle_cb_default_size *
2300                 gr_scc_bundle_cb_size_div_256b_byte_granularity_v();
2301
2302         u32 pagepool_buffer_size = gr_scc_pagepool_total_pages_hwmax_value_v() *
2303                 gr_scc_pagepool_total_pages_byte_granularity_v();
2304
2305         gk20a_dbg_fn("");
2306
2307         attr_buffer_size = g->ops.gr.calc_global_ctx_buffer_size(g);
2308
2309         gk20a_dbg_info("cb_buffer_size : %d", cb_buffer_size);
2310
2311         err = gk20a_gr_alloc_ctx_buffer(pdev, &gr->global_ctx_buffer[CIRCULAR],
2312                                         cb_buffer_size);
2313         if (err)
2314                 goto clean_up;
2315
2316         if (platform->secure_alloc)
2317                 platform->secure_alloc(pdev,
2318                                        &gr->global_ctx_buffer[CIRCULAR_VPR],
2319                                        cb_buffer_size);
2320
2321         gk20a_dbg_info("pagepool_buffer_size : %d", pagepool_buffer_size);
2322
2323         err = gk20a_gr_alloc_ctx_buffer(pdev, &gr->global_ctx_buffer[PAGEPOOL],
2324                                         pagepool_buffer_size);
2325         if (err)
2326                 goto clean_up;
2327
2328         if (platform->secure_alloc)
2329                 platform->secure_alloc(pdev,
2330                                        &gr->global_ctx_buffer[PAGEPOOL_VPR],
2331                                        pagepool_buffer_size);
2332
2333         gk20a_dbg_info("attr_buffer_size : %d", attr_buffer_size);
2334
2335         err = gk20a_gr_alloc_ctx_buffer(pdev, &gr->global_ctx_buffer[ATTRIBUTE],
2336                                         attr_buffer_size);
2337         if (err)
2338                 goto clean_up;
2339
2340         if (platform->secure_alloc)
2341                 platform->secure_alloc(pdev,
2342                                        &gr->global_ctx_buffer[ATTRIBUTE_VPR],
2343                                        attr_buffer_size);
2344
2345         if (platform->secure_buffer.destroy)
2346                 platform->secure_buffer.destroy(pdev, &platform->secure_buffer);
2347
2348         gk20a_dbg_info("golden_image_size : %d",
2349                    gr->ctx_vars.golden_image_size);
2350
2351         err = gk20a_gr_alloc_ctx_buffer(pdev,
2352                                         &gr->global_ctx_buffer[GOLDEN_CTX],
2353                                         gr->ctx_vars.golden_image_size);
2354         if (err)
2355                 goto clean_up;
2356
2357         gk20a_dbg_info("priv_access_map_size : %d",
2358                    gr->ctx_vars.priv_access_map_size);
2359
2360         err = gk20a_gr_alloc_ctx_buffer(pdev,
2361                                         &gr->global_ctx_buffer[PRIV_ACCESS_MAP],
2362                                         gr->ctx_vars.priv_access_map_size);
2363
2364         if (err)
2365                 goto clean_up;
2366
2367         gk20a_dbg_fn("done");
2368         return 0;
2369
2370  clean_up:
2371         gk20a_err(dev_from_gk20a(g), "fail");
2372         for (i = 0; i < NR_GLOBAL_CTX_BUF; i++) {
2373                 if (gr->global_ctx_buffer[i].destroy) {
2374                         gr->global_ctx_buffer[i].destroy(pdev,
2375                                         &gr->global_ctx_buffer[i]);
2376                 }
2377         }
2378         return -ENOMEM;
2379 }
2380
2381 static void gr_gk20a_free_global_ctx_buffers(struct gk20a *g)
2382 {
2383         struct platform_device *pdev = g->dev;
2384         struct gr_gk20a *gr = &g->gr;
2385         DEFINE_DMA_ATTRS(attrs);
2386         u32 i;
2387
2388         dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2389
2390         for (i = 0; i < NR_GLOBAL_CTX_BUF; i++) {
2391                 gr->global_ctx_buffer[i].destroy(pdev,
2392                                 &gr->global_ctx_buffer[i]);
2393         }
2394
2395         gk20a_dbg_fn("done");
2396 }
2397
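/*
 * Map the global context buffers into the channel's GPU VM and record
 * the resulting GPU VAs and sizes in the channel ctx. VPR variants are
 * preferred for VPR channels when they were successfully allocated; the
 * circular, attribute and page pool buffers are mapped cacheable.
 */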
2398 static int gr_gk20a_map_global_ctx_buffers(struct gk20a *g,
2399                                         struct channel_gk20a *c)
2400 {
2401         struct vm_gk20a *ch_vm = c->vm;
2402         u64 *g_bfr_va = c->ch_ctx.global_ctx_buffer_va;
2403         u64 *g_bfr_size = c->ch_ctx.global_ctx_buffer_size;
2404         struct gr_gk20a *gr = &g->gr;
2405         struct sg_table *sgt;
2406         u64 size;
2407         u64 gpu_va;
2408         u32 i;
2409         gk20a_dbg_fn("");
2410
2411         /* Circular Buffer */
2412         if (!c->vpr || (gr->global_ctx_buffer[CIRCULAR_VPR].sgt == NULL)) {
2413                 sgt = gr->global_ctx_buffer[CIRCULAR].sgt;
2414                 size = gr->global_ctx_buffer[CIRCULAR].size;
2415         } else {
2416                 sgt = gr->global_ctx_buffer[CIRCULAR_VPR].sgt;
2417                 size = gr->global_ctx_buffer[CIRCULAR_VPR].size;
2418         }
2419
2420         gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size,
2421                                 NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
2422                                 gk20a_mem_flag_none);
2423         if (!gpu_va)
2424                 goto clean_up;
2425         g_bfr_va[CIRCULAR_VA] = gpu_va;
2426         g_bfr_size[CIRCULAR_VA] = size;
2427
2428         /* Attribute Buffer */
2429         if (!c->vpr || (gr->global_ctx_buffer[ATTRIBUTE_VPR].sgt == NULL)) {
2430                 sgt = gr->global_ctx_buffer[ATTRIBUTE].sgt;
2431                 size = gr->global_ctx_buffer[ATTRIBUTE].size;
2432         } else {
2433                 sgt = gr->global_ctx_buffer[ATTRIBUTE_VPR].sgt;
2434                 size = gr->global_ctx_buffer[ATTRIBUTE_VPR].size;
2435         }
2436
2437         gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size,
2438                                 NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
2439                                 gk20a_mem_flag_none);
2440         if (!gpu_va)
2441                 goto clean_up;
2442         g_bfr_va[ATTRIBUTE_VA] = gpu_va;
2443         g_bfr_size[ATTRIBUTE_VA] = size;
2444
2445         /* Page Pool */
2446         if (!c->vpr || (gr->global_ctx_buffer[PAGEPOOL_VPR].sgt == NULL)) {
2447                 sgt = gr->global_ctx_buffer[PAGEPOOL].sgt;
2448                 size = gr->global_ctx_buffer[PAGEPOOL].size;
2449         } else {
2450                 sgt = gr->global_ctx_buffer[PAGEPOOL_VPR].sgt;
2451                 size = gr->global_ctx_buffer[PAGEPOOL_VPR].size;
2452         }
2453
2454         gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size,
2455                                 NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
2456                                 gk20a_mem_flag_none);
2457         if (!gpu_va)
2458                 goto clean_up;
2459         g_bfr_va[PAGEPOOL_VA] = gpu_va;
2460         g_bfr_size[PAGEPOOL_VA] = size;
2461
2462         /* Golden Image */
2463         sgt = gr->global_ctx_buffer[GOLDEN_CTX].sgt;
2464         size = gr->global_ctx_buffer[GOLDEN_CTX].size;
2465         gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size, 0,
2466                                 gk20a_mem_flag_none);
2467         if (!gpu_va)
2468                 goto clean_up;
2469         g_bfr_va[GOLDEN_CTX_VA] = gpu_va;
2470         g_bfr_size[GOLDEN_CTX_VA] = size;
2471
2472         /* Priv register Access Map */
2473         sgt = gr->global_ctx_buffer[PRIV_ACCESS_MAP].sgt;
2474         size = gr->global_ctx_buffer[PRIV_ACCESS_MAP].size;
2475         gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size, 0,
2476                                 gk20a_mem_flag_none);
2477         if (!gpu_va)
2478                 goto clean_up;
2479         g_bfr_va[PRIV_ACCESS_MAP_VA] = gpu_va;
2480         g_bfr_size[PRIV_ACCESS_MAP_VA] = size;
2481
2482         c->ch_ctx.global_ctx_buffer_mapped = true;
2483         return 0;
2484
2485  clean_up:
2486         for (i = 0; i < NR_GLOBAL_CTX_BUF_VA; i++) {
2487                 if (g_bfr_va[i]) {
2488                         gk20a_gmmu_unmap(ch_vm, g_bfr_va[i],
2489                                          gr->global_ctx_buffer[i].size,
2490                                          gk20a_mem_flag_none);
2491                         g_bfr_va[i] = 0;
2492                 }
2493         }
2494         return -ENOMEM;
2495 }
2496
2497 static void gr_gk20a_unmap_global_ctx_buffers(struct channel_gk20a *c)
2498 {
2499         struct vm_gk20a *ch_vm = c->vm;
2500         u64 *g_bfr_va = c->ch_ctx.global_ctx_buffer_va;
2501         u64 *g_bfr_size = c->ch_ctx.global_ctx_buffer_size;
2502         u32 i;
2503
2504         gk20a_dbg_fn("");
2505
2506         for (i = 0; i < NR_GLOBAL_CTX_BUF_VA; i++) {
2507                 if (g_bfr_va[i]) {
2508                         gk20a_gmmu_unmap(ch_vm, g_bfr_va[i],
2509                                          g_bfr_size[i],
2510                                          gk20a_mem_flag_none);
2511                         g_bfr_va[i] = 0;
2512                         g_bfr_size[i] = 0;
2513                 }
2514         }
2515         c->ch_ctx.global_ctx_buffer_mapped = false;
2516 }
2517
2518 static int gr_gk20a_alloc_channel_gr_ctx(struct gk20a *g,
2519                                 struct channel_gk20a *c)
2520 {
2521         struct gr_gk20a *gr = &g->gr;
2522         struct gr_ctx_desc *gr_ctx = &c->ch_ctx.gr_ctx;
2523         struct vm_gk20a *ch_vm = c->vm;
2524         struct device *d = dev_from_gk20a(g);
2525         struct sg_table *sgt;
2526         DEFINE_DMA_ATTRS(attrs);
2527         int err = 0;
2528         dma_addr_t iova;
2529
2530         gk20a_dbg_fn("");
2531
2532         if (gr->ctx_vars.buffer_size == 0)
2533                 return 0;
2534
2535         /* alloc channel gr ctx buffer */
2536         gr->ctx_vars.buffer_size = gr->ctx_vars.golden_image_size;
2537         gr->ctx_vars.buffer_total_size = gr->ctx_vars.golden_image_size;
2538
2539         gr_ctx->size = gr->ctx_vars.buffer_total_size;
2540         dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2541         gr_ctx->pages = dma_alloc_attrs(d, gr_ctx->size,
2542                                 &iova, GFP_KERNEL, &attrs);
2543         if (!gr_ctx->pages)
2544                 return -ENOMEM;
2545
2546         gr_ctx->iova = iova;
2547         err = gk20a_get_sgtable_from_pages(d, &sgt, gr_ctx->pages,
2548                         gr_ctx->iova, gr_ctx->size);
2549         if (err)
2550                 goto err_free;
2551
2552         gr_ctx->gpu_va = gk20a_gmmu_map(ch_vm, &sgt, gr_ctx->size,
2553                                 NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
2554                                 gk20a_mem_flag_none);
2555         if (!gr_ctx->gpu_va)
2556                 goto err_free_sgt;
2557
2558         gk20a_free_sgtable(&sgt);
2559
2560         return 0;
2561
2562  err_free_sgt:
2563         gk20a_free_sgtable(&sgt);
2564  err_free:
2565         dma_free_attrs(d, gr_ctx->size,
2566                 gr_ctx->pages, gr_ctx->iova, &attrs);
2567         gr_ctx->pages = NULL;
2568         gr_ctx->iova = 0;
2569
2570         return err;
2571 }
2572
2573 static void gr_gk20a_free_channel_gr_ctx(struct channel_gk20a *c)
2574 {
2575         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
2576         struct vm_gk20a *ch_vm = c->vm;
2577         struct gk20a *g = c->g;
2578         struct device *d = dev_from_gk20a(g);
2579         DEFINE_DMA_ATTRS(attrs);
2580
2581         gk20a_dbg_fn("");
2582
2583         if (!ch_ctx->gr_ctx.gpu_va)
2584                 return;
2585
2586         gk20a_gmmu_unmap(ch_vm, ch_ctx->gr_ctx.gpu_va,
2587                         ch_ctx->gr_ctx.size, gk20a_mem_flag_none);
2588         dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2589         dma_free_attrs(d, ch_ctx->gr_ctx.size,
2590                 ch_ctx->gr_ctx.pages, ch_ctx->gr_ctx.iova, &attrs);
2591         ch_ctx->gr_ctx.pages = NULL;
2592         ch_ctx->gr_ctx.iova = 0;
2593 }
2594
2595 static int gr_gk20a_alloc_channel_patch_ctx(struct gk20a *g,
2596                                 struct channel_gk20a *c)
2597 {
2598         struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
2599         struct device *d = dev_from_gk20a(g);
2600         struct vm_gk20a *ch_vm = c->vm;
2601         DEFINE_DMA_ATTRS(attrs);
2602         struct sg_table *sgt;
2603         int err = 0;
2604         dma_addr_t iova;
2605
2606         gk20a_dbg_fn("");
2607
2608         patch_ctx->size = 128 * sizeof(u32);
2609         dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2610         patch_ctx->pages = dma_alloc_attrs(d, patch_ctx->size,
2611                                 &iova, GFP_KERNEL,
2612                                 &attrs);
2613         if (!patch_ctx->pages)
2614                 return -ENOMEM;
2615
2616         patch_ctx->iova = iova;
2617         err = gk20a_get_sgtable_from_pages(d, &sgt, patch_ctx->pages,
2618                         patch_ctx->iova, patch_ctx->size);
2619         if (err)
2620                 goto err_free;
2621
2622         patch_ctx->gpu_va = gk20a_gmmu_map(ch_vm, &sgt, patch_ctx->size,
2623                                         0, gk20a_mem_flag_none);
2624         if (!patch_ctx->gpu_va) {
2625                 err = -ENOMEM;
                     goto err_free_sgtable;
             }
2626
2627         gk20a_free_sgtable(&sgt);
2628
2629         gk20a_dbg_fn("done");
2630         return 0;
2631
2632  err_free_sgtable:
2633         gk20a_free_sgtable(&sgt);
2634  err_free:
2635         dma_free_attrs(d, patch_ctx->size,
2636                 patch_ctx->pages, patch_ctx->iova, &attrs);
2637         patch_ctx->pages = NULL;
2638         patch_ctx->iova = 0;
2639         gk20a_err(dev_from_gk20a(g), "fail to allocate patch context");
2640         return err;
2641 }
2642
2643 static void gr_gk20a_unmap_channel_patch_ctx(struct channel_gk20a *c)
2644 {
2645         struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
2646         struct vm_gk20a *ch_vm = c->vm;
2647
2648         gk20a_dbg_fn("");
2649
2650         if (patch_ctx->gpu_va)
2651                 gk20a_gmmu_unmap(ch_vm, patch_ctx->gpu_va,
2652                         patch_ctx->size, gk20a_mem_flag_none);
2653         patch_ctx->gpu_va = 0;
2654         patch_ctx->data_count = 0;
2655 }
2656
2657 static void gr_gk20a_free_channel_patch_ctx(struct channel_gk20a *c)
2658 {
2659         struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
2660         struct gk20a *g = c->g;
2661         struct device *d = dev_from_gk20a(g);
2662         DEFINE_DMA_ATTRS(attrs);
2663
2664         gk20a_dbg_fn("");
2665
2666         gr_gk20a_unmap_channel_patch_ctx(c);
2667
2668         if (patch_ctx->pages) {
2669                 dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2670                 dma_free_attrs(d, patch_ctx->size,
2671                         patch_ctx->pages, patch_ctx->iova, &attrs);
2672                 patch_ctx->pages = NULL;
2673                 patch_ctx->iova = 0;
2674         }
2675 }
2676
2677 void gk20a_free_channel_ctx(struct channel_gk20a *c)
2678 {
2679         gr_gk20a_unmap_global_ctx_buffers(c);
2680         gr_gk20a_free_channel_patch_ctx(c);
2681         gr_gk20a_free_channel_gr_ctx(c);
2682
2683         /* zcull_ctx, pm_ctx */
2684
2685         memset(&c->ch_ctx, 0, sizeof(struct channel_ctx_gk20a));
2686
2687         c->num_objects = 0;
2688         c->first_init = false;
2689 }
2690
2691 static bool gr_gk20a_is_valid_class(struct gk20a *g, u32 class_num)
2692 {
2693         bool valid = false;
2694
2695         switch (class_num) {
2696         case KEPLER_COMPUTE_A:
2697         case KEPLER_C:
2698         case FERMI_TWOD_A:
2699         case KEPLER_DMA_COPY_A:
2700                 valid = true;
2701                 break;
2702
2703         default:
2704                 break;
2705         }
2706
2707         return valid;
2708 }
2709
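/*
 * Called on object allocation for a channel: allocates and commits the gr
 * context buffer, allocates the patch context, maps and commits the global
 * context buffers, then initializes the golden context image and loads it
 * into this channel's context on first use. Only one object class per
 * channel is supported here (see the TBD below).
 */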
2710 int gk20a_alloc_obj_ctx(struct channel_gk20a  *c,
2711                         struct nvhost_alloc_obj_ctx_args *args)
2712 {
2713         struct gk20a *g = c->g;
2714         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
2715         int err = 0;
2716
2717         gk20a_dbg_fn("");
2718
2719         /* an address space needs to have been bound at this point. */
2720         if (!gk20a_channel_as_bound(c)) {
2721                 gk20a_err(dev_from_gk20a(g),
2722                            "not bound to address space at time"
2723                            " of grctx allocation");
2724                 return -EINVAL;
2725         }
2726
2727         if (!g->ops.gr.is_valid_class(g, args->class_num)) {
2728                 gk20a_err(dev_from_gk20a(g),
2729                            "invalid obj class 0x%x", args->class_num);
2730                 err = -EINVAL;
2731                 goto out;
2732         }
2733
2734         /* allocate gr ctx buffer */
2735         if (ch_ctx->gr_ctx.pages == NULL) {
2736                 err = gr_gk20a_alloc_channel_gr_ctx(g, c);
2737                 if (err) {
2738                         gk20a_err(dev_from_gk20a(g),
2739                                 "fail to allocate gr ctx buffer");
2740                         goto out;
2741                 }
2742                 c->obj_class = args->class_num;
2743         } else {
2744                 /* TBD: be more specific about which class is being allocated,
2745                  * as some classes are allowed to be allocated on the same channel. */
2746                 gk20a_err(dev_from_gk20a(g),
2747                         "too many classes alloc'd on same channel");
2748                 err = -EINVAL;
2749                 goto out;
2750         }
2751
2752         /* commit gr ctx buffer */
2753         err = gr_gk20a_commit_inst(c, ch_ctx->gr_ctx.gpu_va);
2754         if (err) {
2755                 gk20a_err(dev_from_gk20a(g),
2756                         "fail to commit gr ctx buffer");
2757                 goto out;
2758         }
2759
2760         /* allocate patch buffer */
2761         if (ch_ctx->patch_ctx.pages == NULL) {
2762                 err = gr_gk20a_alloc_channel_patch_ctx(g, c);
2763                 if (err) {
2764                         gk20a_err(dev_from_gk20a(g),
2765                                 "fail to allocate patch buffer");
2766                         goto out;
2767                 }
2768         }
2769
2770         /* map global buffer to channel gpu_va and commit */
2771         if (!ch_ctx->global_ctx_buffer_mapped) {
2772                 err = gr_gk20a_map_global_ctx_buffers(g, c);
2773                 if (err) {
2774                         gk20a_err(dev_from_gk20a(g),
2775                                 "fail to map global ctx buffer");
2776                         goto out;
2777                 }
2778                 gr_gk20a_elpg_protected_call(g,
2779                         gr_gk20a_commit_global_ctx_buffers(g, c, true));
2780         }
2781
2782         /* tweak any perf parameters per-context here */
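             /*
              * For the compute class, disable the SM scheduler texlock
              * hash/timeout features via a context patch write to
              * gr_gpcs_tpcs_sm_sch_texlock, so the setting is carried in
              * this channel's context.
              */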
2783         if (args->class_num == KEPLER_COMPUTE_A) {
2784                 int begin_err;
2785                 u32 tex_lock_disable_mask =
2786                         gr_gpcs_tpcs_sm_sch_texlock_tex_hash_m()         |
2787                         gr_gpcs_tpcs_sm_sch_texlock_tex_hash_tile_m()    |
2788                         gr_gpcs_tpcs_sm_sch_texlock_tex_hash_phase_m()   |
2789                         gr_gpcs_tpcs_sm_sch_texlock_tex_hash_tex_m()     |
2790                         gr_gpcs_tpcs_sm_sch_texlock_tex_hash_timeout_m() |
2791                         gr_gpcs_tpcs_sm_sch_texlock_dot_t_unlock_m();
2792
2793                 u32 texlock = gk20a_readl(g, gr_gpcs_tpcs_sm_sch_texlock_r());
2794
2795                 texlock = (texlock & ~tex_lock_disable_mask) |
2796                 (gr_gpcs_tpcs_sm_sch_texlock_tex_hash_disable_f()         |
2797                  gr_gpcs_tpcs_sm_sch_texlock_tex_hash_tile_disable_f()    |
2798                  gr_gpcs_tpcs_sm_sch_texlock_tex_hash_phase_disable_f()   |
2799                  gr_gpcs_tpcs_sm_sch_texlock_tex_hash_tex_disable_f()     |
2800                  gr_gpcs_tpcs_sm_sch_texlock_tex_hash_timeout_disable_f() |
2801                  gr_gpcs_tpcs_sm_sch_texlock_dot_t_unlock_disable_f());
2802
2803                 begin_err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
2804
2805                 if (!begin_err) {
2806                         err = gr_gk20a_ctx_patch_write(g, ch_ctx,
2807                                 gr_gpcs_tpcs_sm_sch_texlock_r(),
2808                                 texlock, true);
2809                 }
2810                 if (begin_err || err) {
2811                         gk20a_err(dev_from_gk20a(g),
2812                                    "failed to set texlock for compute class");
2813                 }
2814                 if (!begin_err)
2815                         gr_gk20a_ctx_patch_write_end(g, ch_ctx);
2816         }
2817
2818         /* init golden image, ELPG enabled after this is done */
2819         err = gr_gk20a_init_golden_ctx_image(g, c);
2820         if (err) {
2821                 gk20a_err(dev_from_gk20a(g),
2822                         "fail to init golden ctx image");
2823                 goto out;
2824         }
2825
2826         /* load golden image */
2827         if (!c->first_init) {
2828                 err = gr_gk20a_elpg_protected_call(g,
2829                         gr_gk20a_load_golden_ctx_image(g, c));
2830                 if (err) {
2831                         gk20a_err(dev_from_gk20a(g),
2832                                 "fail to load golden ctx image");
2833                         goto out;
2834                 }
2835                 c->first_init = true;
2836         }
2837
2838         c->num_objects++;
2839
2840         gk20a_dbg_fn("done");
2841         return 0;
2842 out:
2843         /* 1. The gr_ctx, patch_ctx and global ctx buffer mappings can be
2844          *    reused, so there is no need to release them here.
2845          * 2. Golden image init and load are one-time operations, so if
2846          *    they passed there is nothing to undo. */
2847         gk20a_err(dev_from_gk20a(g), "fail to allocate obj ctx");
2848         return err;
2849 }
2850
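/*
 * Drop one object reference on the channel. When the last object is freed,
 * c->first_init is cleared (so the golden image is reloaded on the next
 * allocation), the channel is disabled, and its patch context is unmapped.
 */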
2851 int gk20a_free_obj_ctx(struct channel_gk20a  *c,
2852                        struct nvhost_free_obj_ctx_args *args)
2853 {
2854         unsigned long timeout = gk20a_get_gr_idle_timeout(c->g);
2855
2856         gk20a_dbg_fn("");
2857
2858         if (c->num_objects == 0)
2859                 return 0;
2860
2861         c->num_objects--;
2862
2863         if (c->num_objects == 0) {
2864                 c->first_init = false;
2865                 gk20a_disable_channel(c,
2866                         !c->has_timedout,
2867                         timeout);
2868                 gr_gk20a_unmap_channel_patch_ctx(c);
2869         }
2870
2871         return 0;
2872 }
2873
2874 static void gk20a_remove_gr_support(struct gr_gk20a *gr)
2875 {
2876         struct gk20a *g = gr->g;
2877         struct device *d = dev_from_gk20a(g);
2878         DEFINE_DMA_ATTRS(attrs);
2879
2880         gk20a_dbg_fn("");
2881
2882         gr_gk20a_free_global_ctx_buffers(g);
2883
2884         dma_free_coherent(d, gr->mmu_wr_mem.size,
2885                 gr->mmu_wr_mem.cpuva, gr->mmu_wr_mem.iova);
2886         gr->mmu_wr_mem.cpuva = NULL;
2887         gr->mmu_wr_mem.iova = 0;
2888         dma_free_coherent(d, gr->mmu_rd_mem.size,
2889                 gr->mmu_rd_mem.cpuva, gr->mmu_rd_mem.iova);
2890         gr->mmu_rd_mem.cpuva = NULL;
2891         gr->mmu_rd_mem.iova = 0;
2892
2893         dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2894         dma_free_attrs(d, gr->compbit_store.size, gr->compbit_store.pages,
2895                         gr->compbit_store.base_iova, &attrs);
2896
2897         memset(&gr->mmu_wr_mem, 0, sizeof(struct mmu_desc));
2898         memset(&gr->mmu_rd_mem, 0, sizeof(struct mmu_desc));
2899         memset(&gr->compbit_store, 0, sizeof(struct compbit_store_desc));
2900
2901         kfree(gr->gpc_tpc_count);
2902         kfree(gr->gpc_zcb_count);
2903         kfree(gr->gpc_ppc_count);
2904         kfree(gr->pes_tpc_count[0]);
2905         kfree(gr->pes_tpc_count[1]);
2906         kfree(gr->pes_tpc_mask[0]);
2907         kfree(gr->pes_tpc_mask[1]);
2908         kfree(gr->gpc_skip_mask);
2909         kfree(gr->map_tiles);
2910         gr->gpc_tpc_count = NULL;
2911         gr->gpc_zcb_count = NULL;
2912         gr->gpc_ppc_count = NULL;
2913         gr->pes_tpc_count[0] = NULL;
2914         gr->pes_tpc_count[1] = NULL;
2915         gr->pes_tpc_mask[0] = NULL;
2916         gr->pes_tpc_mask[1] = NULL;
2917         gr->gpc_skip_mask = NULL;
2918         gr->map_tiles = NULL;
2919
2920         kfree(gr->ctx_vars.ucode.fecs.inst.l);
2921         kfree(gr->ctx_vars.ucode.fecs.data.l);
2922         kfree(gr->ctx_vars.ucode.gpccs.inst.l);
2923         kfree(gr->ctx_vars.ucode.gpccs.data.l);
2924         kfree(gr->ctx_vars.sw_bundle_init.l);
2925         kfree(gr->ctx_vars.sw_method_init.l);
2926         kfree(gr->ctx_vars.sw_ctx_load.l);
2927         kfree(gr->ctx_vars.sw_non_ctx_load.l);
2928         kfree(gr->ctx_vars.ctxsw_regs.sys.l);
2929         kfree(gr->ctx_vars.ctxsw_regs.gpc.l);
2930         kfree(gr->ctx_vars.ctxsw_regs.tpc.l);
2931         kfree(gr->ctx_vars.ctxsw_regs.zcull_gpc.l);
2932         kfree(gr->ctx_vars.ctxsw_regs.ppc.l);
2933         kfree(gr->ctx_vars.ctxsw_regs.pm_sys.l);
2934         kfree(gr->ctx_vars.ctxsw_regs.pm_gpc.l);
2935         kfree(gr->ctx_vars.ctxsw_regs.pm_tpc.l);
2936
2937         kfree(gr->ctx_vars.local_golden_image);
2938         gr->ctx_vars.local_golden_image = NULL;
2939
2940         gk20a_allocator_destroy(&gr->comp_tags);
2941 }
2942
2943 static void gr_gk20a_bundle_cb_defaults(struct gk20a *g)
2944 {
2945         struct gr_gk20a *gr = &g->gr;
2946
2947         gr->bundle_cb_default_size =
2948                 gr_scc_bundle_cb_size_div_256b__prod_v();
2949         gr->min_gpm_fifo_depth =
2950                 gr_pd_ab_dist_cfg2_state_limit_min_gpm_fifo_depths_v();
2951         gr->bundle_cb_token_limit =
2952                 gr_pd_ab_dist_cfg2_token_limit_init_v();
2953 }
2954
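/*
 * Read the chip topology (FBP/GPC/TPC/PPC/zcull bank counts) from the priv
 * ringmaster enumeration and top/gr configuration registers, allocate the
 * per-GPC bookkeeping arrays, and compute a per-GPC TPC skip mask for
 * unbalanced PES configurations.
 */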
2955 static int gr_gk20a_init_gr_config(struct gk20a *g, struct gr_gk20a *gr)
2956 {
2957         u32 gpc_index, pes_index;
2958         u32 pes_tpc_mask;
2959         u32 pes_tpc_count;
2960         u32 pes_heavy_index;
2961         u32 gpc_new_skip_mask;
2962         u32 tmp;
2963
2964         tmp = gk20a_readl(g, pri_ringmaster_enum_fbp_r());
2965         gr->num_fbps = pri_ringmaster_enum_fbp_count_v(tmp);
2966
2967         tmp = gk20a_readl(g, top_num_gpcs_r());
2968         gr->max_gpc_count = top_num_gpcs_value_v(tmp);
2969
2970         tmp = gk20a_readl(g, top_num_fbps_r());
2971         gr->max_fbps_count = top_num_fbps_value_v(tmp);
2972
2973         tmp = gk20a_readl(g, top_tpc_per_gpc_r());
2974         gr->max_tpc_per_gpc_count = top_tpc_per_gpc_value_v(tmp);
2975
2976         gr->max_tpc_count = gr->max_gpc_count * gr->max_tpc_per_gpc_count;
2977
2978         tmp = gk20a_readl(g, top_num_fbps_r());
2979         gr->sys_count = top_num_fbps_value_v(tmp);
2980
2981         tmp = gk20a_readl(g, pri_ringmaster_enum_gpc_r());
2982         gr->gpc_count = pri_ringmaster_enum_gpc_count_v(tmp);
2983
2984         gr->pe_count_per_gpc = proj_scal_litter_num_pes_per_gpc_v();
2985         gr->max_zcull_per_gpc_count = proj_scal_litter_num_zcull_banks_v();
2986
2987         if (!gr->gpc_count) {
2988                 gk20a_err(dev_from_gk20a(g), "gpc_count==0!");
2989                 goto clean_up;
2990         }
2991
2992         gr->gpc_tpc_count = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2993         gr->gpc_zcb_count = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2994         gr->gpc_ppc_count = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2995         gr->pes_tpc_count[0] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2996         gr->pes_tpc_count[1] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2997         gr->pes_tpc_mask[0] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2998         gr->pes_tpc_mask[1] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2999         gr->gpc_skip_mask =
3000                 kzalloc(gr_pd_dist_skip_table__size_1_v() * 4 * sizeof(u32),
3001                         GFP_KERNEL);
3002
3003         if (!gr->gpc_tpc_count || !gr->gpc_zcb_count || !gr->gpc_ppc_count ||
3004             !gr->pes_tpc_count[0] || !gr->pes_tpc_count[1] ||
3005             !gr->pes_tpc_mask[0] || !gr->pes_tpc_mask[1] || !gr->gpc_skip_mask)
3006                 goto clean_up;
3007
3008         gr->ppc_count = 0;
3009         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3010                 tmp = gk20a_readl(g, gr_gpc0_fs_gpc_r());
3011
3012                 gr->gpc_tpc_count[gpc_index] =
3013                         gr_gpc0_fs_gpc_num_available_tpcs_v(tmp);
3014                 gr->tpc_count += gr->gpc_tpc_count[gpc_index];
3015
3016                 gr->gpc_zcb_count[gpc_index] =
3017                         gr_gpc0_fs_gpc_num_available_zculls_v(tmp);
3018                 gr->zcb_count += gr->gpc_zcb_count[gpc_index];
3019
3020                 gr->gpc_ppc_count[gpc_index] = gr->pe_count_per_gpc;
3021                 gr->ppc_count += gr->gpc_ppc_count[gpc_index];
3022                 for (pes_index = 0; pes_index < gr->pe_count_per_gpc; pes_index++) {
3023
3024                         tmp = gk20a_readl(g,
3025                                 gr_gpc0_gpm_pd_pes_tpc_id_mask_r(pes_index) +
3026                                 gpc_index * proj_gpc_stride_v());
3027
3028                         pes_tpc_mask = gr_gpc0_gpm_pd_pes_tpc_id_mask_mask_v(tmp);
3029                         pes_tpc_count = count_bits(pes_tpc_mask);
3030
3031                         gr->pes_tpc_count[pes_index][gpc_index] = pes_tpc_count;
3032                         gr->pes_tpc_mask[pes_index][gpc_index] = pes_tpc_mask;
3033                 }
3034
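                     /*
                      * mask & (mask - 1) clears the lowest set bit, so the
                      * XOR below isolates the lowest set TPC bit of the
                      * heavier PES; that single TPC is recorded in the GPC
                      * skip mask when the two PES units are unbalanced
                      * (5 TPCs total, or 4 TPCs split unevenly).
                      */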
3035                 gpc_new_skip_mask = 0;
3036                 if (gr->pes_tpc_count[0][gpc_index] +
3037                     gr->pes_tpc_count[1][gpc_index] == 5) {
3038                         pes_heavy_index =
3039                                 gr->pes_tpc_count[0][gpc_index] >
3040                                 gr->pes_tpc_count[1][gpc_index] ? 0 : 1;
3041
3042                         gpc_new_skip_mask =
3043                                 gr->pes_tpc_mask[pes_heavy_index][gpc_index] ^
3044                                    (gr->pes_tpc_mask[pes_heavy_index][gpc_index] &
3045                                    (gr->pes_tpc_mask[pes_heavy_index][gpc_index] - 1));
3046
3047                 } else if ((gr->pes_tpc_count[0][gpc_index] +
3048                             gr->pes_tpc_count[1][gpc_index] == 4) &&
3049                            (gr->pes_tpc_count[0][gpc_index] !=
3050                             gr->pes_tpc_count[1][gpc_index])) {
3051                         pes_heavy_index =
3052                                 gr->pes_tpc_count[0][gpc_index] >
3053                                 gr->pes_tpc_count[1][gpc_index] ? 0 : 1;
3054
3055                         gpc_new_skip_mask =
3056                                 gr->pes_tpc_mask[pes_heavy_index][gpc_index] ^
3057                                    (gr->pes_tpc_mask[pes_heavy_index][gpc_index] &
3058                                    (gr->pes_tpc_mask[pes_heavy_index][gpc_index] - 1));
3059                 }
3060                 gr->gpc_skip_mask[gpc_index] = gpc_new_skip_mask;
3061         }
3062
3063         gk20a_dbg_info("fbps: %d", gr->num_fbps);
3064         gk20a_dbg_info("max_gpc_count: %d", gr->max_gpc_count);
3065         gk20a_dbg_info("max_fbps_count: %d", gr->max_fbps_count);
3066         gk20a_dbg_info("max_tpc_per_gpc_count: %d", gr->max_tpc_per_gpc_count);
3067         gk20a_dbg_info("max_zcull_per_gpc_count: %d", gr->max_zcull_per_gpc_count);
3068         gk20a_dbg_info("max_tpc_count: %d", gr->max_tpc_count);
3069         gk20a_dbg_info("sys_count: %d", gr->sys_count);
3070         gk20a_dbg_info("gpc_count: %d", gr->gpc_count);
3071         gk20a_dbg_info("pe_count_per_gpc: %d", gr->pe_count_per_gpc);
3072         gk20a_dbg_info("tpc_count: %d", gr->tpc_count);
3073         gk20a_dbg_info("ppc_count: %d", gr->ppc_count);
3074
3075         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3076                 gk20a_dbg_info("gpc_tpc_count[%d] : %d",
3077                            gpc_index, gr->gpc_tpc_count[gpc_index]);
3078         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3079                 gk20a_dbg_info("gpc_zcb_count[%d] : %d",
3080                            gpc_index, gr->gpc_zcb_count[gpc_index]);
3081         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3082                 gk20a_dbg_info("gpc_ppc_count[%d] : %d",
3083                            gpc_index, gr->gpc_ppc_count[gpc_index]);
3084         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3085                 gk20a_dbg_info("gpc_skip_mask[%d] : %d",
3086                            gpc_index, gr->gpc_skip_mask[gpc_index]);
3087         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3088                 for (pes_index = 0;
3089                      pes_index < gr->pe_count_per_gpc;
3090                      pes_index++)
3091                         gk20a_dbg_info("pes_tpc_count[%d][%d] : %d",
3092                                    pes_index, gpc_index,
3093                                    gr->pes_tpc_count[pes_index][gpc_index]);
3094
3095         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3096                 for (pes_index = 0;
3097                      pes_index < gr->pe_count_per_gpc;
3098                      pes_index++)
3099                         gk20a_dbg_info("pes_tpc_mask[%d][%d] : %d",
3100                                    pes_index, gpc_index,
3101                                    gr->pes_tpc_mask[pes_index][gpc_index]);
3102
3103         g->ops.gr.bundle_cb_defaults(g);
3104         g->ops.gr.cb_size_default(g);
3105         g->ops.gr.calc_global_ctx_buffer_size(g);
3106         gr->timeslice_mode = gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v();
3107
3108         gk20a_dbg_info("bundle_cb_default_size: %d",
3109                    gr->bundle_cb_default_size);
3110         gk20a_dbg_info("min_gpm_fifo_depth: %d", gr->min_gpm_fifo_depth);
3111         gk20a_dbg_info("bundle_cb_token_limit: %d", gr->bundle_cb_token_limit);
3112         gk20a_dbg_info("attrib_cb_default_size: %d",
3113                    gr->attrib_cb_default_size);
3114         gk20a_dbg_info("attrib_cb_size: %d", gr->attrib_cb_size);
3115         gk20a_dbg_info("alpha_cb_default_size: %d", gr->alpha_cb_default_size);
3116         gk20a_dbg_info("alpha_cb_size: %d", gr->alpha_cb_size);
3117         gk20a_dbg_info("timeslice_mode: %d", gr->timeslice_mode);
3118
3119         return 0;
3120
3121 clean_up:
3122         return -ENOMEM;
3123 }
3124
3125 static int gr_gk20a_init_mmu_sw(struct gk20a *g, struct gr_gk20a *gr)
3126 {
3127         struct device *d = dev_from_gk20a(g);
3128         dma_addr_t iova;
3129
3130         gr->mmu_wr_mem_size = gr->mmu_rd_mem_size = 0x1000;
3131
3132         gr->mmu_wr_mem.size = gr->mmu_wr_mem_size;
3133         gr->mmu_wr_mem.cpuva = dma_zalloc_coherent(d, gr->mmu_wr_mem_size,
3134                                         &iova, GFP_KERNEL);
3135         if (!gr->mmu_wr_mem.cpuva)
3136                 goto err;
3137
3138         gr->mmu_wr_mem.iova = iova;
3139
3140         gr->mmu_rd_mem.size = gr->mmu_rd_mem_size;
3141         gr->mmu_rd_mem.cpuva = dma_zalloc_coherent(d, gr->mmu_rd_mem_size,
3142                                         &iova, GFP_KERNEL);
3143         if (!gr->mmu_rd_mem.cpuva)
3144                 goto err_free_wr_mem;
3145
3146         gr->mmu_rd_mem.iova = iova;
3147         return 0;
3148
3149  err_free_wr_mem:
3150         dma_free_coherent(d, gr->mmu_wr_mem.size,
3151                 gr->mmu_wr_mem.cpuva, gr->mmu_wr_mem.iova);
3152         gr->mmu_wr_mem.cpuva = NULL;
3153         gr->mmu_wr_mem.iova = 0;
3154  err:
3155         return -ENOMEM;
3156 }
3157
3158 static u32 prime_set[18] = {
3159         2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61 };
3160
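/*
 * Build gr->map_tiles[], the screen-tile to GPC mapping. map_row_offset is
 * set to the first prime that does not divide tpc_count (with hand-tuned
 * overrides for several TPC counts). Tiles are then handed out to GPCs in
 * proportion to each GPC's TPC count, with GPCs sorted by descending TPC
 * count, using what is effectively a Bresenham-style error accumulator.
 */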
3161 static int gr_gk20a_init_map_tiles(struct gk20a *g, struct gr_gk20a *gr)
3162 {
3163         s32 comm_denom;
3164         s32 mul_factor;
3165         s32 *init_frac = NULL;
3166         s32 *init_err = NULL;
3167         s32 *run_err = NULL;
3168         s32 *sorted_num_tpcs = NULL;
3169         s32 *sorted_to_unsorted_gpc_map = NULL;
3170         u32 gpc_index;
3171         u32 gpc_mark = 0;
3172         u32 num_tpc;
3173         u32 max_tpc_count = 0;
3174         u32 swap;
3175         u32 tile_count;
3176         u32 index;
3177         bool delete_map = false;
3178         bool gpc_sorted;
3179         int ret = 0;
3180
3181         init_frac = kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
3182         init_err = kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
3183         run_err = kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
3184         sorted_num_tpcs =
3185                 kzalloc(proj_scal_max_gpcs_v() *
3186                         proj_scal_max_tpc_per_gpc_v() * sizeof(s32),
3187                         GFP_KERNEL);
3188         sorted_to_unsorted_gpc_map =
3189                 kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
3190
3191         if (!(init_frac && init_err && run_err && sorted_num_tpcs &&
3192               sorted_to_unsorted_gpc_map)) {
3193                 ret = -ENOMEM;
3194                 goto clean_up;
3195         }
3196
3197         gr->map_row_offset = INVALID_SCREEN_TILE_ROW_OFFSET;
3198
3199         if (gr->tpc_count == 3)
3200                 gr->map_row_offset = 2;
3201         else if (gr->tpc_count < 3)
3202                 gr->map_row_offset = 1;
3203         else {
3204                 gr->map_row_offset = 3;
3205
3206                 for (index = 1; index < 18; index++) {
3207                         u32 prime = prime_set[index];
3208                         if ((gr->tpc_count % prime) != 0) {
3209                                 gr->map_row_offset = prime;
3210                                 break;
3211                         }
3212                 }
3213         }
3214
3215         switch (gr->tpc_count) {
3216         case 15:
3217                 gr->map_row_offset = 6;
3218                 break;
3219         case 14:
3220                 gr->map_row_offset = 5;
3221                 break;
3222         case 13:
3223                 gr->map_row_offset = 2;
3224                 break;
3225         case 11:
3226                 gr->map_row_offset = 7;
3227                 break;
3228         case 10:
3229                 gr->map_row_offset = 6;
3230                 break;
3231         case 7:
3232         case 5:
3233                 gr->map_row_offset = 1;
3234                 break;
3235         default:
3236                 break;
3237         }
3238
3239         if (gr->map_tiles) {
3240                 if (gr->map_tile_count != gr->tpc_count)
3241                         delete_map = true;
3242
3243                 for (tile_count = 0; tile_count < gr->map_tile_count; tile_count++) {
3244                         if ((u32)gr->map_tiles[tile_count] >= gr->tpc_count)
3245                                 delete_map = true;
3246                 }
3247
3248                 if (delete_map) {
3249                         kfree(gr->map_tiles);
3250                         gr->map_tiles = NULL;
3251                         gr->map_tile_count = 0;
3252                 }
3253         }
3254
3255         if (gr->map_tiles == NULL) {
3256                 gr->map_tile_count = proj_scal_max_gpcs_v();
3257
3258                 gr->map_tiles = kzalloc(proj_scal_max_gpcs_v() * sizeof(u8), GFP_KERNEL);
3259                 if (gr->map_tiles == NULL) {
3260                         ret = -ENOMEM;
3261                         goto clean_up;
3262                 }
3263
3264                 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3265                         sorted_num_tpcs[gpc_index] = gr->gpc_tpc_count[gpc_index];
3266                         sorted_to_unsorted_gpc_map[gpc_index] = gpc_index;
3267                 }
3268
3269                 gpc_sorted = false;
3270                 while (!gpc_sorted) {
3271                         gpc_sorted = true;
3272                         for (gpc_index = 0; gpc_index < gr->gpc_count - 1; gpc_index++) {
3273                                 if (sorted_num_tpcs[gpc_index + 1] > sorted_num_tpcs[gpc_index]) {
3274                                         gpc_sorted = false;
3275                                         swap = sorted_num_tpcs[gpc_index];
3276                                         sorted_num_tpcs[gpc_index] = sorted_num_tpcs[gpc_index + 1];
3277                                         sorted_num_tpcs[gpc_index + 1] = swap;
3278                                         swap = sorted_to_unsorted_gpc_map[gpc_index];
3279                                         sorted_to_unsorted_gpc_map[gpc_index] =
3280                                                 sorted_to_unsorted_gpc_map[gpc_index + 1];
3281                                         sorted_to_unsorted_gpc_map[gpc_index + 1] = swap;
3282                                 }
3283                         }
3284                 }
3285
3286                 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3287                         if (gr->gpc_tpc_count[gpc_index] > max_tpc_count)
3288                                 max_tpc_count = gr->gpc_tpc_count[gpc_index];
3289
3290                 mul_factor = gr->gpc_count * max_tpc_count;
3291                 if (mul_factor & 0x1)
3292                         mul_factor = 2;
3293                 else
3294                         mul_factor = 1;
3295
3296                 comm_denom = gr->gpc_count * max_tpc_count * mul_factor;
3297
3298                 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3299                         num_tpc = sorted_num_tpcs[gpc_index];
3300
3301                         init_frac[gpc_index] = num_tpc * gr->gpc_count * mul_factor;
3302
3303                         if (num_tpc != 0)
3304                                 init_err[gpc_index] = gpc_index * max_tpc_count * mul_factor - comm_denom/2;
3305                         else
3306                                 init_err[gpc_index] = 0;
3307
3308                         run_err[gpc_index] = init_frac[gpc_index] + init_err[gpc_index];
3309                 }
3310
3311                 while (gpc_mark < gr->tpc_count) {
3312                         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3313                                 if ((run_err[gpc_index] * 2) >= comm_denom) {
3314                                         gr->map_tiles[gpc_mark++] = (u8)sorted_to_unsorted_gpc_map[gpc_index];
3315                                         run_err[gpc_index] += init_frac[gpc_index] - comm_denom;
3316                                 } else
3317                                         run_err[gpc_index] += init_frac[gpc_index];
3318                         }
3319                 }
3320         }
3321
3322 clean_up:
3323         kfree(init_frac);
3324         kfree(init_err);
3325         kfree(run_err);
3326         kfree(sorted_num_tpcs);
3327         kfree(sorted_to_unsorted_gpc_map);
3328
3329         if (ret)
3330                 gk20a_err(dev_from_gk20a(g), "fail");
3331         else
3332                 gk20a_dbg_fn("done");
3333
3334         return ret;
3335 }
3336
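/*
 * Derive the zcull aliquot geometry and coverage from the TPC and zcull
 * bank counts, and read the total aliquot count back from hardware.
 */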
3337 static int gr_gk20a_init_zcull(struct gk20a *g, struct gr_gk20a *gr)
3338 {
3339         struct gr_zcull_gk20a *zcull = &gr->zcull;
3340
3341         zcull->aliquot_width = gr->tpc_count * 16;
3342         zcull->aliquot_height = 16;
3343
3344         zcull->width_align_pixels = gr->tpc_count * 16;
3345         zcull->height_align_pixels = 32;
3346
3347         zcull->aliquot_size =
3348                 zcull->aliquot_width * zcull->aliquot_height;
3349
3350         /* assume no floor sweeping since we only have 1 tpc in 1 gpc */
3351         zcull->pixel_squares_by_aliquots =
3352                 gr->zcb_count * 16 * 16 * gr->tpc_count /
3353                 (gr->gpc_count * gr->gpc_tpc_count[0]);
3354
3355         zcull->total_aliquots =
3356                 gr_gpc0_zcull_total_ram_size_num_aliquots_f(
3357                         gk20a_readl(g, gr_gpc0_zcull_total_ram_size_r()));
3358
3359         return 0;
3360 }
3361
3362 u32 gr_gk20a_get_ctxsw_zcull_size(struct gk20a *g, struct gr_gk20a *gr)
3363 {
3364         /* assuming gr has already been initialized */
3365         return gr->ctx_vars.zcull_ctxsw_image_size;
3366 }
3367
3368 int gr_gk20a_bind_ctxsw_zcull(struct gk20a *g, struct gr_gk20a *gr,
3369                         struct channel_gk20a *c, u64 zcull_va, u32 mode)
3370 {
3371         struct zcull_ctx_desc *zcull_ctx = &c->ch_ctx.zcull_ctx;
3372
3373         zcull_ctx->ctx_sw_mode = mode;
3374         zcull_ctx->gpu_va = zcull_va;
3375
3376         /* TBD: don't disable channel in sw method processing */
3377         return gr_gk20a_ctx_zcull_setup(g, c, true);
3378 }
3379
3380 int gr_gk20a_get_zcull_info(struct gk20a *g, struct gr_gk20a *gr,
3381                         struct gr_zcull_info *zcull_params)
3382 {
3383         struct gr_zcull_gk20a *zcull = &gr->zcull;
3384
3385         zcull_params->width_align_pixels = zcull->width_align_pixels;
3386         zcull_params->height_align_pixels = zcull->height_align_pixels;
3387         zcull_params->pixel_squares_by_aliquots =
3388                 zcull->pixel_squares_by_aliquots;
3389         zcull_params->aliquot_total = zcull->total_aliquots;
3390
3391         zcull_params->region_byte_multiplier =
3392                 gr->gpc_count * gr_zcull_bytes_per_aliquot_per_gpu_v();
3393         zcull_params->region_header_size =
3394                 proj_scal_litter_num_gpcs_v() *
3395                 gr_zcull_save_restore_header_bytes_per_gpc_v();
3396
3397         zcull_params->subregion_header_size =
3398                 proj_scal_litter_num_gpcs_v() *
3399                 gr_zcull_save_restore_subregion_header_bytes_per_gpc_v();
3400
3401         zcull_params->subregion_width_align_pixels =
3402                 gr->tpc_count * gr_gpc0_zcull_zcsize_width_subregion__multiple_v();
3403         zcull_params->subregion_height_align_pixels =
3404                 gr_gpc0_zcull_zcsize_height_subregion__multiple_v();
3405         zcull_params->subregion_count = gr_zcull_subregion_qty_v();
3406
3407         return 0;
3408 }
3409
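/*
 * Program one ZBC color entry: gr engine activity is disabled and the
 * engine idled, the clear value is written into the L2 ZBC table and into
 * the DS color table at (index + GK20A_STARTOF_ZBC_TABLE), the write is
 * triggered, and the driver's shadow copy is updated before re-enabling
 * the engine.
 */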
3410 static int gr_gk20a_add_zbc_color(struct gk20a *g, struct gr_gk20a *gr,
3411                                   struct zbc_entry *color_val, u32 index)
3412 {
3413         struct fifo_gk20a *f = &g->fifo;
3414         struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
3415         u32 i;
3416         unsigned long end_jiffies = jiffies +
3417                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
3418         u32 ret;
3419
3420         ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
3421         if (ret) {
3422                 gk20a_err(dev_from_gk20a(g),
3423                         "failed to disable gr engine activity\n");
3424                 return ret;
3425         }
3426
3427         ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
3428         if (ret) {
3429                 gk20a_err(dev_from_gk20a(g),
3430                         "failed to idle graphics\n");
3431                 goto clean_up;
3432         }
3433
3434         /* update l2 table */
3435         g->ops.ltc.set_zbc_color_entry(g, color_val, index);
3436
3437         /* update ds table */
3438         gk20a_writel(g, gr_ds_zbc_color_r_r(),
3439                 gr_ds_zbc_color_r_val_f(color_val->color_ds[0]));
3440         gk20a_writel(g, gr_ds_zbc_color_g_r(),
3441                 gr_ds_zbc_color_g_val_f(color_val->color_ds[1]));
3442         gk20a_writel(g, gr_ds_zbc_color_b_r(),
3443                 gr_ds_zbc_color_b_val_f(color_val->color_ds[2]));
3444         gk20a_writel(g, gr_ds_zbc_color_a_r(),
3445                 gr_ds_zbc_color_a_val_f(color_val->color_ds[3]));
3446
3447         gk20a_writel(g, gr_ds_zbc_color_fmt_r(),
3448                 gr_ds_zbc_color_fmt_val_f(color_val->format));
3449
3450         gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
3451                 gr_ds_zbc_tbl_index_val_f(index + GK20A_STARTOF_ZBC_TABLE));
3452
3453         /* trigger the write */
3454         gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
3455                 gr_ds_zbc_tbl_ld_select_c_f() |
3456                 gr_ds_zbc_tbl_ld_action_write_f() |
3457                 gr_ds_zbc_tbl_ld_trigger_active_f());
3458
3459         /* update local copy */
3460         for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3461                 gr->zbc_col_tbl[index].color_l2[i] = color_val->color_l2[i];
3462                 gr->zbc_col_tbl[index].color_ds[i] = color_val->color_ds[i];
3463         }
3464         gr->zbc_col_tbl[index].format = color_val->format;
3465         gr->zbc_col_tbl[index].ref_cnt++;
3466
3467 clean_up:
3468         ret = gk20a_fifo_enable_engine_activity(g, gr_info);
3469         if (ret) {
3470                 gk20a_err(dev_from_gk20a(g),
3471                         "failed to enable gr engine activity\n");
3472         }
3473
3474         return ret;
3475 }
3476
3477 static int gr_gk20a_add_zbc_depth(struct gk20a *g, struct gr_gk20a *gr,
3478                                 struct zbc_entry *depth_val, u32 index)
3479 {
3480         struct fifo_gk20a *f = &g->fifo;
3481         struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
3482         unsigned long end_jiffies = jiffies +
3483                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
3484         u32 ret;
3485
3486         ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
3487         if (ret) {
3488                 gk20a_err(dev_from_gk20a(g),
3489                         "failed to disable gr engine activity\n");
3490                 return ret;
3491         }
3492
3493         ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
3494         if (ret) {
3495                 gk20a_err(dev_from_gk20a(g),
3496                         "failed to idle graphics\n");
3497                 goto clean_up;
3498         }
3499
3500         /* update l2 table */
3501         g->ops.ltc.set_zbc_depth_entry(g, depth_val, index);
3502
3503         /* update ds table */
3504         gk20a_writel(g, gr_ds_zbc_z_r(),
3505                 gr_ds_zbc_z_val_f(depth_val->depth));
3506
3507         gk20a_writel(g, gr_ds_zbc_z_fmt_r(),
3508                 gr_ds_zbc_z_fmt_val_f(depth_val->format));
3509
3510         gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
3511                 gr_ds_zbc_tbl_index_val_f(index + GK20A_STARTOF_ZBC_TABLE));
3512
3513         /* trigger the write */
3514         gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
3515                 gr_ds_zbc_tbl_ld_select_z_f() |
3516                 gr_ds_zbc_tbl_ld_action_write_f() |
3517                 gr_ds_zbc_tbl_ld_trigger_active_f());
3518
3519         /* update local copy */
3520         gr->zbc_dep_tbl[index].depth = depth_val->depth;
3521         gr->zbc_dep_tbl[index].format = depth_val->format;
3522         gr->zbc_dep_tbl[index].ref_cnt++;
3523
3524 clean_up:
3525         ret = gk20a_fifo_enable_engine_activity(g, gr_info);
3526         if (ret) {
3527                 gk20a_err(dev_from_gk20a(g),
3528                         "failed to enable gr engine activity\n");
3529         }
3530
3531         return ret;
3532 }
3533
3534 void gr_gk20a_pmu_save_zbc(struct gk20a *g, u32 entries)
3535 {
3536         struct fifo_gk20a *f = &g->fifo;
3537         struct fifo_engine_info_gk20a *gr_info =
3538                 f->engine_info + ENGINE_GR_GK20A;
3539         unsigned long end_jiffies = jiffies +
3540                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
3541         u32 ret;
3542
3543         ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
3544         if (ret) {
3545                 gk20a_err(dev_from_gk20a(g),
3546                         "failed to disable gr engine activity\n");
3547                 return;
3548         }
3549
3550         ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
3551         if (ret) {
3552                 gk20a_err(dev_from_gk20a(g),
3553                         "failed to idle graphics\n");
3554                 goto clean_up;
3555         }
3556
3557         /* update zbc */
3558         gk20a_pmu_save_zbc(g, entries);
3559
3560 clean_up:
3561         ret = gk20a_fifo_enable_engine_activity(g, gr_info);
3562         if (ret) {
3563                 gk20a_err(dev_from_gk20a(g),
3564                         "failed to enable gr engine activity\n");
3565         }
3566
3567         return;
3568 }
3569
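/*
 * Add a ZBC (zero-bandwidth clear) color or depth entry. Existing entries
 * are reference-counted and matched by format and value; a mismatch between
 * the L2 and DS color values of an otherwise matching entry is rejected.
 * When a genuinely new entry is added, the updated table is also saved to
 * the PMU so it survives ELPG (engine-level power gating) cycles.
 */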
3570 int gr_gk20a_add_zbc(struct gk20a *g, struct gr_gk20a *gr,
3571                      struct zbc_entry *zbc_val)
3572 {
3573         struct zbc_color_table *c_tbl;
3574         struct zbc_depth_table *d_tbl;
3575         u32 i, ret = -ENOMEM;
3576         bool added = false;
3577         u32 entries;
3578
3579         /* no endian swap ? */
3580
3581         mutex_lock(&gr->zbc_lock);
3582         switch (zbc_val->type) {
3583         case GK20A_ZBC_TYPE_COLOR:
3584                 /* search existing tables */
3585                 for (i = 0; i < gr->max_used_color_index; i++) {
3586
3587                         c_tbl = &gr->zbc_col_tbl[i];
3588
3589                         if (c_tbl->ref_cnt && c_tbl->format == zbc_val->format &&
3590                             memcmp(c_tbl->color_ds, zbc_val->color_ds,
3591                                 sizeof(zbc_val->color_ds)) == 0) {
3592
3593                                 if (memcmp(c_tbl->color_l2, zbc_val->color_l2,
3594                                     sizeof(zbc_val->color_l2))) {
3595                                         gk20a_err(dev_from_gk20a(g),
3596                                                 "zbc l2 and ds color don't match with existing entries");
3597                                         ret = -EINVAL;
3598                                         goto err_mutex;
3599                                 }
3600                                 added = true;
3601                                 c_tbl->ref_cnt++;
3602                                 ret = 0;
3603                                 break;
3604                         }
3605                 }
3606                 /* add new table */
3607                 if (!added &&
3608                     gr->max_used_color_index < GK20A_ZBC_TABLE_SIZE) {
3609
3610                         c_tbl =
3611                             &gr->zbc_col_tbl[gr->max_used_color_index];
3612                         WARN_ON(c_tbl->ref_cnt != 0);
3613
3614                         ret = gr_gk20a_add_zbc_color(g, gr,
3615                                 zbc_val, gr->max_used_color_index);
3616
3617                         if (!ret)
3618                                 gr->max_used_color_index++;
3619                 }
3620                 break;
3621         case GK20A_ZBC_TYPE_DEPTH:
3622                 /* search existing tables */
3623                 for (i = 0; i < gr->max_used_depth_index; i++) {
3624
3625                         d_tbl = &gr->zbc_dep_tbl[i];
3626
3627                         if (d_tbl->ref_cnt &&
3628                             d_tbl->depth == zbc_val->depth &&
3629                             d_tbl->format == zbc_val->format) {
3630                                 added = true;
3631                                 d_tbl->ref_cnt++;
3632                                 ret = 0;
3633                                 break;
3634                         }
3635                 }
3636                 /* add new table */
3637                 if (!added &&
3638                     gr->max_used_depth_index < GK20A_ZBC_TABLE_SIZE) {
3639
3640                         d_tbl =
3641                             &gr->zbc_dep_tbl[gr->max_used_depth_index];
3642                         WARN_ON(d_tbl->ref_cnt != 0);
3643
3644                         ret = gr_gk20a_add_zbc_depth(g, gr,
3645                                 zbc_val, gr->max_used_depth_index);
3646
3647                         if (!ret)
3648                                 gr->max_used_depth_index++;
3649                 }
3650                 break;
3651         default:
3652                 gk20a_err(dev_from_gk20a(g),
3653                         "invalid zbc table type %d", zbc_val->type);
3654                 ret = -EINVAL;
3655                 goto err_mutex;
3656         }
3657
3658         if (!added && ret == 0) {
3659                 /* update zbc for elpg only when new entry is added */
3660                 entries = max(gr->max_used_color_index,
3661                                         gr->max_used_depth_index);
3662                 gr_gk20a_pmu_save_zbc(g, entries);
3663         }
3664
3665 err_mutex:
3666         mutex_unlock(&gr->zbc_lock);
3667         return ret;
3668 }
3669
3670 /* get a zbc table entry specified by index
3671  * return table size when type is invalid */
3672 int gr_gk20a_query_zbc(struct gk20a *g, struct gr_gk20a *gr,
3673                         struct zbc_query_params *query_params)
3674 {
3675         u32 index = query_params->index_size;
3676         u32 i;
3677
3678         switch (query_params->type) {
3679         case GK20A_ZBC_TYPE_INVALID:
3680                 query_params->index_size = GK20A_ZBC_TABLE_SIZE;
3681                 break;
3682         case GK20A_ZBC_TYPE_COLOR:
3683                 if (index >= GK20A_ZBC_TABLE_SIZE) {
3684                         gk20a_err(dev_from_gk20a(g),
3685                                 "invalid zbc color table index\n");
3686                         return -EINVAL;
3687                 }
3688                 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3689                         query_params->color_l2[i] =
3690                                 gr->zbc_col_tbl[index].color_l2[i];
3691                         query_params->color_ds[i] =
3692                                 gr->zbc_col_tbl[index].color_ds[i];
3693                 }
3694                 query_params->format = gr->zbc_col_tbl[index].format;
3695                 query_params->ref_cnt = gr->zbc_col_tbl[index].ref_cnt;
3696                 break;
3697         case GK20A_ZBC_TYPE_DEPTH:
3698                 if (index >= GK20A_ZBC_TABLE_SIZE) {
3699                         gk20a_err(dev_from_gk20a(g),
3700                                 "invalid zbc depth table index\n");
3701                         return -EINVAL;
3702                 }
3703                 query_params->depth = gr->zbc_dep_tbl[index].depth;
3704                 query_params->format = gr->zbc_dep_tbl[index].format;
3705                 query_params->ref_cnt = gr->zbc_dep_tbl[index].ref_cnt;
3706                 break;
3707         default:
3708                 gk20a_err(dev_from_gk20a(g),
3709                                 "invalid zbc table type\n");
3710                 return -EINVAL;
3711         }
3712
3713         return 0;
3714 }
3715
3716 int gr_gk20a_load_zbc_table(struct gk20a *g, struct gr_gk20a *gr)
3717 {
3718         int i, ret;
3719
3720         mutex_init(&gr->zbc_lock);
3721         for (i = 0; i < gr->max_used_color_index; i++) {
3722                 struct zbc_color_table *c_tbl = &gr->zbc_col_tbl[i];
3723                 struct zbc_entry zbc_val;
3724
3725                 zbc_val.type = GK20A_ZBC_TYPE_COLOR;
3726                 memcpy(zbc_val.color_ds,
3727                        c_tbl->color_ds, sizeof(zbc_val.color_ds));
3728                 memcpy(zbc_val.color_l2,
3729                        c_tbl->color_l2, sizeof(zbc_val.color_l2));
3730                 zbc_val.format = c_tbl->format;
3731
3732                 ret = gr_gk20a_add_zbc_color(g, gr, &zbc_val, i);
3733
3734                 if (ret)
3735                         return ret;
3736         }
3737         for (i = 0; i < gr->max_used_depth_index; i++) {
3738                 struct zbc_depth_table *d_tbl = &gr->zbc_dep_tbl[i];
3739                 struct zbc_entry zbc_val;
3740
3741                 zbc_val.type = GK20A_ZBC_TYPE_DEPTH;
3742                 zbc_val.depth = d_tbl->depth;
3743                 zbc_val.format = d_tbl->format;
3744
3745                 ret = gr_gk20a_add_zbc_depth(g, gr, &zbc_val, i);
3746                 if (ret)
3747                         return ret;
3748         }
3749         return 0;
3750 }
3751
3752 int gr_gk20a_load_zbc_default_table(struct gk20a *g, struct gr_gk20a *gr)
3753 {
3754         struct zbc_entry zbc_val;
3755         u32 i, err;
3756
3757         /* load default color table */
3758         zbc_val.type = GK20A_ZBC_TYPE_COLOR;
3759
3760         zbc_val.format = gr_ds_zbc_color_fmt_val_zero_v();
3761         for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3762                 zbc_val.color_ds[i] = 0;
3763                 zbc_val.color_l2[i] = 0;
3764         }
3765         err = gr_gk20a_add_zbc(g, gr, &zbc_val);
3766
3767         zbc_val.format = gr_ds_zbc_color_fmt_val_unorm_one_v();
3768         for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3769                 zbc_val.color_ds[i] = 0xffffffff;
3770                 zbc_val.color_l2[i] = 0x3f800000;
3771         }
3772         err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3773
3774         zbc_val.format = gr_ds_zbc_color_fmt_val_rf32_gf32_bf32_af32_v();
3775         for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3776                 zbc_val.color_ds[i] = 0;
3777                 zbc_val.color_l2[i] = 0;
3778         }
3779         err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3780
3781         zbc_val.format = gr_ds_zbc_color_fmt_val_rf32_gf32_bf32_af32_v();
3782         for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3783                 zbc_val.color_ds[i] = 0x3f800000;
3784                 zbc_val.color_l2[i] = 0x3f800000;
3785         }
3786         err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3787
3788         if (!err)
3789                 gr->max_default_color_index = 4;
3790         else {
3791                 gk20a_err(dev_from_gk20a(g),
3792                            "fail to load default zbc color table\n");
3793                 return err;
3794         }
3795
3796         /* load default depth table */
3797         zbc_val.type = GK20A_ZBC_TYPE_DEPTH;
3798
3799         zbc_val.format = gr_ds_zbc_z_fmt_val_fp32_v();
3800         zbc_val.depth = 0;
3801         err = gr_gk20a_add_zbc(g, gr, &zbc_val);
3802
3803         zbc_val.format = gr_ds_zbc_z_fmt_val_fp32_v();
3804         zbc_val.depth = 0x3f800000;
3805         err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3806
3807         if (!err)
3808                 gr->max_default_depth_index = 2;
3809         else {
3810                 gk20a_err(dev_from_gk20a(g),
3811                            "fail to load default zbc depth table\n");
3812                 return err;
3813         }
3814
3815         return 0;
3816 }
3817
3818 int gk20a_gr_zbc_set_table(struct gk20a *g, struct gr_gk20a *gr,
3819                         struct zbc_entry *zbc_val)
3820 {
3821         gk20a_dbg_fn("");
3822
3823         return gr_gk20a_elpg_protected_call(g,
3824                 gr_gk20a_add_zbc(g, gr, zbc_val));
3825 }
3826
3827 void gr_gk20a_init_blcg_mode(struct gk20a *g, u32 mode, u32 engine)
3828 {
3829         u32 gate_ctrl;
3830
3831         gate_ctrl = gk20a_readl(g, therm_gate_ctrl_r(engine));
3832
3833         switch (mode) {
3834         case BLCG_RUN:
3835                 gate_ctrl = set_field(gate_ctrl,
3836                                 therm_gate_ctrl_blk_clk_m(),
3837                                 therm_gate_ctrl_blk_clk_run_f());
3838                 break;
3839         case BLCG_AUTO:
3840                 gate_ctrl = set_field(gate_ctrl,
3841                                 therm_gate_ctrl_blk_clk_m(),
3842                                 therm_gate_ctrl_blk_clk_auto_f());
3843                 break;
3844         default:
3845                 gk20a_err(dev_from_gk20a(g),
3846                         "invalid blcg mode %d", mode);
3847                 return;
3848         }
3849
3850         gk20a_writel(g, therm_gate_ctrl_r(engine), gate_ctrl);
3851 }
3852
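/*
 * Program engine-level clock gating (ELCG) for an engine: select the
 * run/stop/auto mode in therm_gate_ctrl, set an idle filter of
 * 2 * 2^9 = 1024 clocks, and zero the FECS and hubmmu idle filters.
 */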
3853 void gr_gk20a_init_elcg_mode(struct gk20a *g, u32 mode, u32 engine)
3854 {
3855         u32 gate_ctrl, idle_filter;
3856
3857         gate_ctrl = gk20a_readl(g, therm_gate_ctrl_r(engine));
3858
3859         switch (mode) {
3860         case ELCG_RUN:
3861                 gate_ctrl = set_field(gate_ctrl,
3862                                 therm_gate_ctrl_eng_clk_m(),
3863                                 therm_gate_ctrl_eng_clk_run_f());
3864                 gate_ctrl = set_field(gate_ctrl,
3865                                 therm_gate_ctrl_eng_pwr_m(),
3866                                 /* set elpg to auto to meet hw expectation */
3867                                 therm_gate_ctrl_eng_pwr_auto_f());
3868                 break;
3869         case ELCG_STOP:
3870                 gate_ctrl = set_field(gate_ctrl,
3871                                 therm_gate_ctrl_eng_clk_m(),
3872                                 therm_gate_ctrl_eng_clk_stop_f());
3873                 break;
3874         case ELCG_AUTO:
3875                 gate_ctrl = set_field(gate_ctrl,
3876                                 therm_gate_ctrl_eng_clk_m(),
3877                                 therm_gate_ctrl_eng_clk_auto_f());
3878                 break;
3879         default:
3880                 gk20a_err(dev_from_gk20a(g),
3881                         "invalid elcg mode %d", mode);
3882         }
3883
3884         if (tegra_platform_is_linsim()) {
3885                 gate_ctrl = set_field(gate_ctrl,
3886                         therm_gate_ctrl_eng_delay_after_m(),
3887                         therm_gate_ctrl_eng_delay_after_f(4));
3888         }
3889
3890         /* 2 * (1 << 9) = 1024 clks */
3891         gate_ctrl = set_field(gate_ctrl,
3892                 therm_gate_ctrl_eng_idle_filt_exp_m(),
3893                 therm_gate_ctrl_eng_idle_filt_exp_f(9));
3894         gate_ctrl = set_field(gate_ctrl,
3895                 therm_gate_ctrl_eng_idle_filt_mant_m(),
3896                 therm_gate_ctrl_eng_idle_filt_mant_f(2));
3897         gk20a_writel(g, therm_gate_ctrl_r(engine), gate_ctrl);
3898
3899         /* default fecs_idle_filter to 0 */
3900         idle_filter = gk20a_readl(g, therm_fecs_idle_filter_r());
3901         idle_filter &= ~therm_fecs_idle_filter_value_m();
3902         gk20a_writel(g, therm_fecs_idle_filter_r(), idle_filter);
3903         /* default hubmmu_idle_filter to 0 */
3904         idle_filter = gk20a_readl(g, therm_hubmmu_idle_filter_r());
3905         idle_filter &= ~therm_hubmmu_idle_filter_value_m();
3906         gk20a_writel(g, therm_hubmmu_idle_filter_r(), idle_filter);
3907 }
3908
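/*
 * Program the gr_gpcs_zcull_sm_in_gpc_number_map registers from
 * gr->map_tiles (each tile gets its ordinal within its GPC), then sanity
 * check the per-GPC zcull bank counts: a GPC with fewer than the maximum
 * number of banks must still have at least as many banks as TPCs, and any
 * such partially populated GPC marks the configuration as floorswept.
 */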
3909 static int gr_gk20a_zcull_init_hw(struct gk20a *g, struct gr_gk20a *gr)
3910 {
3911         u32 gpc_index, gpc_tpc_count, gpc_zcull_count;
3912         u32 *zcull_map_tiles, *zcull_bank_counters;
3913         u32 map_counter;
3914         u32 rcp_conserv;
3915         u32 offset;
3916         bool floorsweep = false;
3917
3918         if (!gr->map_tiles)
3919                 return -EINVAL;
3920
3921         zcull_map_tiles = kzalloc(proj_scal_max_gpcs_v() *
3922                         proj_scal_max_tpc_per_gpc_v() * sizeof(u32), GFP_KERNEL);
3923         if (!zcull_map_tiles) {
3924                 gk20a_err(dev_from_gk20a(g),
3925                         "failed to allocate zcull temp buffers");
3926                 return -ENOMEM;
3927         }
3928         zcull_bank_counters = kzalloc(proj_scal_max_gpcs_v() *
3929                         proj_scal_max_tpc_per_gpc_v() * sizeof(u32), GFP_KERNEL);
3930
3931         if (!zcull_bank_counters) {
3932                 gk20a_err(dev_from_gk20a(g),
3933                         "failed to allocate zcull temp buffers");
3934                 kfree(zcull_map_tiles);
3935                 return -ENOMEM;
3936         }
3937
3938         for (map_counter = 0; map_counter < gr->tpc_count; map_counter++) {
3939                 zcull_map_tiles[map_counter] =
3940                         zcull_bank_counters[gr->map_tiles[map_counter]];
3941                 zcull_bank_counters[gr->map_tiles[map_counter]]++;
3942         }
3943
3944         gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map0_r(),
3945                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_0_f(zcull_map_tiles[0]) |
3946                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_1_f(zcull_map_tiles[1]) |
3947                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_2_f(zcull_map_tiles[2]) |
3948                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_3_f(zcull_map_tiles[3]) |
3949                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_4_f(zcull_map_tiles[4]) |
3950                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_5_f(zcull_map_tiles[5]) |
3951                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_6_f(zcull_map_tiles[6]) |
3952                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_7_f(zcull_map_tiles[7]));
3953
3954         gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map1_r(),
3955                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_8_f(zcull_map_tiles[8]) |
3956                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_9_f(zcull_map_tiles[9]) |
3957                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_10_f(zcull_map_tiles[10]) |
3958                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_11_f(zcull_map_tiles[11]) |
3959                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_12_f(zcull_map_tiles[12]) |
3960                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_13_f(zcull_map_tiles[13]) |
3961                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_14_f(zcull_map_tiles[14]) |
3962                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_15_f(zcull_map_tiles[15]));
3963
3964         gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map2_r(),
3965                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_16_f(zcull_map_tiles[16]) |
3966                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_17_f(zcull_map_tiles[17]) |
3967                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_18_f(zcull_map_tiles[18]) |
3968                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_19_f(zcull_map_tiles[19]) |
3969                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_20_f(zcull_map_tiles[20]) |
3970                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_21_f(zcull_map_tiles[21]) |
3971                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_22_f(zcull_map_tiles[22]) |
3972                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_23_f(zcull_map_tiles[23]));
3973
3974         gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map3_r(),
3975                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_24_f(zcull_map_tiles[24]) |
3976                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_25_f(zcull_map_tiles[25]) |
3977                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_26_f(zcull_map_tiles[26]) |
3978                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_27_f(zcull_map_tiles[27]) |
3979                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_28_f(zcull_map_tiles[28]) |
3980                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_29_f(zcull_map_tiles[29]) |
3981                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_30_f(zcull_map_tiles[30]) |
3982                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_31_f(zcull_map_tiles[31]));
3983
3984         kfree(zcull_map_tiles);
3985         kfree(zcull_bank_counters);
3986
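        /*
         * Validate the zcull configuration and note whether any GPC is
         * floorswept: a non-zero zcull bank count below the per-GPC maximum
         * selects the floorsweep programming path for the zcull RAM
         * addressing further down (per-GPC maximum instead of TPC count).
         */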
3987         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3988                 gpc_tpc_count = gr->gpc_tpc_count[gpc_index];
3989                 gpc_zcull_count = gr->gpc_zcb_count[gpc_index];
3990
3991                 if (gpc_zcull_count != gr->max_zcull_per_gpc_count &&
3992                     gpc_zcull_count < gpc_tpc_count) {
3993                         gk20a_err(dev_from_gk20a(g),
3994                                 "zcull_banks (%d) less than tpcs (%d) for gpc (%d)",
3995                                 gpc_zcull_count, gpc_tpc_count, gpc_index);
3996                         return -EINVAL;
3997                 }
3998                 if (gpc_zcull_count != gr->max_zcull_per_gpc_count &&
3999                     gpc_zcull_count != 0)
4000                         floorsweep = true;
4001         }
4002
4003         /* 1.0f / 1.0f * gr_gpc0_zcull_sm_num_rcp_conservative__max_v() */
4004         rcp_conserv = gr_gpc0_zcull_sm_num_rcp_conservative__max_v();
4005
4006         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4007                 offset = gpc_index * proj_gpc_stride_v();
4008
4009                 if (floorsweep) {
4010                         gk20a_writel(g, gr_gpc0_zcull_ram_addr_r() + offset,
4011                                 gr_gpc0_zcull_ram_addr_row_offset_f(gr->map_row_offset) |
4012                                 gr_gpc0_zcull_ram_addr_tiles_per_hypertile_row_per_gpc_f(
4013                                         gr->max_zcull_per_gpc_count));
4014                 } else {
4015                         gk20a_writel(g, gr_gpc0_zcull_ram_addr_r() + offset,
4016                                 gr_gpc0_zcull_ram_addr_row_offset_f(gr->map_row_offset) |
4017                                 gr_gpc0_zcull_ram_addr_tiles_per_hypertile_row_per_gpc_f(
4018                                         gr->gpc_tpc_count[gpc_index]));
4019                 }
4020
4021                 gk20a_writel(g, gr_gpc0_zcull_fs_r() + offset,
4022                         gr_gpc0_zcull_fs_num_active_banks_f(gr->gpc_zcb_count[gpc_index]) |
4023                         gr_gpc0_zcull_fs_num_sms_f(gr->tpc_count));
4024
4025                 gk20a_writel(g, gr_gpc0_zcull_sm_num_rcp_r() + offset,
4026                         gr_gpc0_zcull_sm_num_rcp_conservative_f(rcp_conserv));
4027         }
4028
4029         gk20a_writel(g, gr_gpcs_ppcs_wwdx_sm_num_rcp_r(),
4030                 gr_gpcs_ppcs_wwdx_sm_num_rcp_conservative_f(rcp_conserv));
4031
4032         return 0;
4033 }
4034
4035 static void gk20a_gr_enable_gpc_exceptions(struct gk20a *g)
4036 {
4037         /* enable tpc exception forwarding */
4038         gk20a_writel(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r(),
4039                 gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_enabled_f());
4040
4041         /* enable gpc exception forwarding */
4042         gk20a_writel(g, gr_gpc0_gpccs_gpc_exception_en_r(),
4043                 gr_gpc0_gpccs_gpc_exception_en_tpc_0_enabled_f());
4044 }
4045
4046
4047 void gr_gk20a_enable_hww_exceptions(struct gk20a *g)
4048 {
4049         /* enable exceptions */
4050         gk20a_writel(g, gr_fe_hww_esr_r(),
4051                      gr_fe_hww_esr_en_enable_f() |
4052                      gr_fe_hww_esr_reset_active_f());
4053         gk20a_writel(g, gr_memfmt_hww_esr_r(),
4054                      gr_memfmt_hww_esr_en_enable_f() |
4055                      gr_memfmt_hww_esr_reset_active_f());
4056         gk20a_writel(g, gr_scc_hww_esr_r(),
4057                      gr_scc_hww_esr_en_enable_f() |
4058                      gr_scc_hww_esr_reset_active_f());
4059         gk20a_writel(g, gr_mme_hww_esr_r(),
4060                      gr_mme_hww_esr_en_enable_f() |
4061                      gr_mme_hww_esr_reset_active_f());
4062         gk20a_writel(g, gr_pd_hww_esr_r(),
4063                      gr_pd_hww_esr_en_enable_f() |
4064                      gr_pd_hww_esr_reset_active_f());
4065         gk20a_writel(g, gr_sked_hww_esr_r(), /* enabled by default */
4066                      gr_sked_hww_esr_reset_active_f());
4067         gk20a_writel(g, gr_ds_hww_esr_r(),
4068                      gr_ds_hww_esr_en_enabled_f() |
4069                      gr_ds_hww_esr_reset_task_f());
4070         gk20a_writel(g, gr_ds_hww_report_mask_r(),
4071                      gr_ds_hww_report_mask_sph0_err_report_f() |
4072                      gr_ds_hww_report_mask_sph1_err_report_f() |
4073                      gr_ds_hww_report_mask_sph2_err_report_f() |
4074                      gr_ds_hww_report_mask_sph3_err_report_f() |
4075                      gr_ds_hww_report_mask_sph4_err_report_f() |
4076                      gr_ds_hww_report_mask_sph5_err_report_f() |
4077                      gr_ds_hww_report_mask_sph6_err_report_f() |
4078                      gr_ds_hww_report_mask_sph7_err_report_f() |
4079                      gr_ds_hww_report_mask_sph8_err_report_f() |
4080                      gr_ds_hww_report_mask_sph9_err_report_f() |
4081                      gr_ds_hww_report_mask_sph10_err_report_f() |
4082                      gr_ds_hww_report_mask_sph11_err_report_f() |
4083                      gr_ds_hww_report_mask_sph12_err_report_f() |
4084                      gr_ds_hww_report_mask_sph13_err_report_f() |
4085                      gr_ds_hww_report_mask_sph14_err_report_f() |
4086                      gr_ds_hww_report_mask_sph15_err_report_f() |
4087                      gr_ds_hww_report_mask_sph16_err_report_f() |
4088                      gr_ds_hww_report_mask_sph17_err_report_f() |
4089                      gr_ds_hww_report_mask_sph18_err_report_f() |
4090                      gr_ds_hww_report_mask_sph19_err_report_f() |
4091                      gr_ds_hww_report_mask_sph20_err_report_f() |
4092                      gr_ds_hww_report_mask_sph21_err_report_f() |
4093                      gr_ds_hww_report_mask_sph22_err_report_f() |
4094                      gr_ds_hww_report_mask_sph23_err_report_f());
4095 }
4096
4097 static void gr_gk20a_set_hww_esr_report_mask(struct gk20a *g)
4098 {
4099         /* setup sm warp esr report masks */
4100         gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(),
4101                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_stack_error_report_f() |
4102                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_api_stack_error_report_f() |
4103                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_ret_empty_stack_error_report_f() |
4104                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_wrap_report_f() |
4105                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_pc_report_f() |
4106                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_overflow_report_f() |
4107                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_immc_addr_report_f() |
4108                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_reg_report_f() |
4109                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_encoding_report_f() |
4110                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_sph_instr_combo_report_f() |
4111                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param_report_f() |
4112                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_report_f() |
4113                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_reg_report_f() |
4114                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_addr_report_f() |
4115                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_addr_report_f() |
4116                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_addr_space_report_f() |
4117                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param2_report_f() |
4118                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_ldc_report_f() |
4119                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_geometry_sm_error_report_f() |
4120                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_divergent_report_f());
4121
4122         /* setup sm global esr report mask */
4123         gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(),
4124                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_sm_to_sm_fault_report_f() |
4125                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_l1_error_report_f() |
4126                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_multiple_warp_errors_report_f() |
4127                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_physical_stack_overflow_error_report_f() |
4128                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_int_report_f() |
4129                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_pause_report_f() |
4130                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_single_step_complete_report_f());
4131 }
4132
4133 static int gk20a_init_gr_setup_hw(struct gk20a *g)
4134 {
4135         struct gr_gk20a *gr = &g->gr;
4136         struct aiv_list_gk20a *sw_ctx_load = &g->gr.ctx_vars.sw_ctx_load;
4137         struct av_list_gk20a *sw_method_init = &g->gr.ctx_vars.sw_method_init;
4138         u32 data;
4139         u32 addr_lo, addr_hi;
4140         u64 addr;
4141         unsigned long end_jiffies = jiffies +
4142                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
4143         u32 fe_go_idle_timeout_save;
4144         u32 last_method_data = 0;
4145         u32 i, err;
4146
4147         gk20a_dbg_fn("");
4148
4149         /* slcg prod values */
4150         g->ops.clock_gating.slcg_gr_load_gating_prod(g, g->slcg_enabled);
4151         g->ops.clock_gating.slcg_perf_load_gating_prod(g, g->slcg_enabled);
4152
4153         /* init mmu debug buffer */
4154         addr = NV_MC_SMMU_VADDR_TRANSLATE(gr->mmu_wr_mem.iova);
4155         addr_lo = u64_lo32(addr);
4156         addr_hi = u64_hi32(addr);
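        /* pack the aligned address into the register field: drop the
         * alignment bits from the low word and splice the high word in
         * above them */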
4157         addr = (addr_lo >> fb_mmu_debug_wr_addr_alignment_v()) |
4158                 (addr_hi << (32 - fb_mmu_debug_wr_addr_alignment_v()));
4159
4160         gk20a_writel(g, fb_mmu_debug_wr_r(),
4161                      fb_mmu_debug_wr_aperture_vid_mem_f() |
4162                      fb_mmu_debug_wr_vol_false_f() |
4163                      fb_mmu_debug_wr_addr_v(addr));
4164
4165         addr = NV_MC_SMMU_VADDR_TRANSLATE(gr->mmu_rd_mem.iova);
4166         addr_lo = u64_lo32(addr);
4167         addr_hi = u64_hi32(addr);
4168         addr = (addr_lo >> fb_mmu_debug_rd_addr_alignment_v()) |
4169                 (addr_hi << (32 - fb_mmu_debug_rd_addr_alignment_v()));
4170
4171         gk20a_writel(g, fb_mmu_debug_rd_r(),
4172                      fb_mmu_debug_rd_aperture_vid_mem_f() |
4173                      fb_mmu_debug_rd_vol_false_f() |
4174                      fb_mmu_debug_rd_addr_v(addr));
4175
4176         /* load gr floorsweeping registers */
4177         data = gk20a_readl(g, gr_gpc0_ppc0_pes_vsc_strem_r());
4178         data = set_field(data, gr_gpc0_ppc0_pes_vsc_strem_master_pe_m(),
4179                         gr_gpc0_ppc0_pes_vsc_strem_master_pe_true_f());
4180         gk20a_writel(g, gr_gpc0_ppc0_pes_vsc_strem_r(), data);
4181
4182         gr_gk20a_zcull_init_hw(g, gr);
4183
4184         g->ops.clock_gating.blcg_gr_load_gating_prod(g, g->blcg_enabled);
4185         g->ops.clock_gating.pg_gr_load_gating_prod(g, true);
4186
4187         if (g->elcg_enabled) {
4188                 gr_gk20a_init_elcg_mode(g, ELCG_AUTO, ENGINE_GR_GK20A);
4189                 gr_gk20a_init_elcg_mode(g, ELCG_AUTO, ENGINE_CE2_GK20A);
4190         } else {
4191                 gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_GR_GK20A);
4192                 gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_CE2_GK20A);
4193         }
4194
4195         /* Bug 1340570: increase the clock timeout to avoid potential
4196          * operation failure at high gpcclk rate. Default values are 0x400.
4197          */
4198         gk20a_writel(g, pri_ringstation_sys_master_config_r(0x15), 0x800);
4199         gk20a_writel(g, pri_ringstation_gpc_master_config_r(0xa), 0x800);
4200         gk20a_writel(g, pri_ringstation_fbp_master_config_r(0x8), 0x800);
4201
4202         /* enable fifo access */
4203         gk20a_writel(g, gr_gpfifo_ctl_r(),
4204                      gr_gpfifo_ctl_access_enabled_f() |
4205                      gr_gpfifo_ctl_semaphore_access_enabled_f());
4206
4207         /* TBD: reload gr ucode when needed */
4208
4209         /* enable interrupts */
4210         gk20a_writel(g, gr_intr_r(), 0xFFFFFFFF);
4211         gk20a_writel(g, gr_intr_en_r(), 0xFFFFFFFF);
4212
4213         /* enable fecs error interrupts */
4214         gk20a_writel(g, gr_fecs_host_int_enable_r(),
4215                      gr_fecs_host_int_enable_fault_during_ctxsw_enable_f() |
4216                      gr_fecs_host_int_enable_umimp_firmware_method_enable_f() |
4217                      gr_fecs_host_int_enable_umimp_illegal_method_enable_f() |
4218                      gr_fecs_host_int_enable_watchdog_enable_f());
4219
4220         g->ops.gr.enable_hww_exceptions(g);
4221         g->ops.gr.set_hww_esr_report_mask(g);
4222
4223         /* enable per GPC exceptions */
4224         gk20a_gr_enable_gpc_exceptions(g);
4225
4226         /* TBD: ECC for L1/SM */
4227         /* TBD: enable per BE exceptions */
4228
4229         /* reset and enable all exceptions */
4230         gk20a_writel(g, gr_exception_r(), 0xFFFFFFFF);
4231         gk20a_writel(g, gr_exception_en_r(), 0xFFFFFFFF);
4232         gk20a_writel(g, gr_exception1_r(), 0xFFFFFFFF);
4233         gk20a_writel(g, gr_exception1_en_r(), 0xFFFFFFFF);
4234         gk20a_writel(g, gr_exception2_r(), 0xFFFFFFFF);
4235         gk20a_writel(g, gr_exception2_en_r(), 0xFFFFFFFF);
4236
4237         /* ignore status from some units */
4238         data = gk20a_readl(g, gr_status_mask_r());
4239         gk20a_writel(g, gr_status_mask_r(), data & gr->status_disable_mask);
4240
4241         if (gr->sw_ready)
4242                 gr_gk20a_load_zbc_table(g, gr);
4243         else
4244                 gr_gk20a_load_zbc_default_table(g, gr);
4245
4246         g->ops.ltc.init_cbc(g, gr);
4247
4248         /* load ctx init */
4249         for (i = 0; i < sw_ctx_load->count; i++)
4250                 gk20a_writel(g, sw_ctx_load->l[i].addr,
4251                              sw_ctx_load->l[i].value);
4252
4253         err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4254         if (err)
4255                 goto out;
4256
4257         /* save and disable fe_go_idle */
4258         fe_go_idle_timeout_save =
4259                 gk20a_readl(g, gr_fe_go_idle_timeout_r());
4260         gk20a_writel(g, gr_fe_go_idle_timeout_r(),
4261                 (fe_go_idle_timeout_save & gr_fe_go_idle_timeout_count_f(0)) |
4262                 gr_fe_go_idle_timeout_count_disabled_f());
4263
4264         /* override a few ctx state registers */
4265         g->ops.gr.commit_global_cb_manager(g, NULL, false);
4266         gr_gk20a_commit_global_timeslice(g, NULL, false);
4267
4268         /* floorsweep anything left */
4269         g->ops.gr.init_fs_state(g);
4270
4271         err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4272         if (err)
4273                 goto restore_fe_go_idle;
4274
4275 restore_fe_go_idle:
4276         /* restore fe_go_idle */
4277         gk20a_writel(g, gr_fe_go_idle_timeout_r(), fe_go_idle_timeout_save);
4278
4279         if (err || gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT))
4280                 goto out;
4281
4282         /* load method init */
4283         if (sw_method_init->count) {
4284                 gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
4285                              sw_method_init->l[0].value);
4286                 gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
4287                              gr_pri_mme_shadow_raw_index_write_trigger_f() |
4288                              sw_method_init->l[0].addr);
4289                 last_method_data = sw_method_init->l[0].value;
4290         }
4291         for (i = 1; i < sw_method_init->count; i++) {
4292                 if (sw_method_init->l[i].value != last_method_data) {
4293                         gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
4294                                 sw_method_init->l[i].value);
4295                         last_method_data = sw_method_init->l[i].value;
4296                 }
4297                 gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
4298                         gr_pri_mme_shadow_raw_index_write_trigger_f() |
4299                         sw_method_init->l[i].addr);
4300         }
4301
4302         err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4303         if (err)
4304                 goto out;
4305
4306 out:
4307         gk20a_dbg_fn("done");
4308         return err;
4309 }
4310
4311 static int gk20a_init_gr_prepare(struct gk20a *g)
4312 {
4313         u32 gpfifo_ctrl, pmc_en;
4314         int err = 0;
4315
4316         /* disable fifo access */
4317         pmc_en = gk20a_readl(g, mc_enable_r());
4318         if (pmc_en & mc_enable_pgraph_enabled_f()) {
4319                 gpfifo_ctrl = gk20a_readl(g, gr_gpfifo_ctl_r());
4320                 gpfifo_ctrl &= ~gr_gpfifo_ctl_access_enabled_f();
4321                 gk20a_writel(g, gr_gpfifo_ctl_r(), gpfifo_ctrl);
4322         }
4323
4324         /* reset gr engine */
4325         gk20a_reset(g, mc_enable_pgraph_enabled_f()
4326                         | mc_enable_blg_enabled_f()
4327                         | mc_enable_perfmon_enabled_f());
4328
4329         /* enable fifo access */
4330         gk20a_writel(g, gr_gpfifo_ctl_r(),
4331                 gr_gpfifo_ctl_access_enabled_f() |
4332                 gr_gpfifo_ctl_semaphore_access_enabled_f());
4333
4334         if (!g->gr.ctx_vars.valid) {
4335                 err = gr_gk20a_init_ctx_vars(g, &g->gr);
4336                 if (err)
4337                         gk20a_err(dev_from_gk20a(g),
4338                                 "fail to load gr init ctx");
4339         }
4340         return err;
4341 }
4342
4343 static int gr_gk20a_wait_mem_scrubbing(struct gk20a *g)
4344 {
4345         int retries = GR_IDLE_CHECK_MAX / GR_IDLE_CHECK_DEFAULT;
4346         bool fecs_scrubbing;
4347         bool gpccs_scrubbing;
4348
4349         gk20a_dbg_fn("");
4350
4351         do {
4352                 fecs_scrubbing = gk20a_readl(g, gr_fecs_dmactl_r()) &
4353                         (gr_fecs_dmactl_imem_scrubbing_m() |
4354                          gr_fecs_dmactl_dmem_scrubbing_m());
4355
4356                 gpccs_scrubbing = gk20a_readl(g, gr_gpccs_dmactl_r()) &
4357                         (gr_gpccs_dmactl_imem_scrubbing_m() |
4358                          gr_gpccs_dmactl_dmem_scrubbing_m());
4359
4360                 if (!fecs_scrubbing && !gpccs_scrubbing) {
4361                         gk20a_dbg_fn("done");
4362                         return 0;
4363                 }
4364
4365                 udelay(GR_IDLE_CHECK_DEFAULT);
4366         } while (--retries || !tegra_platform_is_silicon());
4367
4368         gk20a_err(dev_from_gk20a(g), "Falcon mem scrubbing timeout");
4369         return -ETIMEDOUT;
4370 }
4371
4372 static int gk20a_init_gr_reset_enable_hw(struct gk20a *g)
4373 {
4374         struct gr_gk20a *gr = &g->gr;
4375         struct av_list_gk20a *sw_non_ctx_load = &g->gr.ctx_vars.sw_non_ctx_load;
4376         unsigned long end_jiffies = jiffies +
4377                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
4378         u32 i, err = 0;
4379
4380         gk20a_dbg_fn("");
4381
4382         /* enable interrupts */
4383         gk20a_writel(g, gr_intr_r(), ~0);
4384         gk20a_writel(g, gr_intr_en_r(), ~0);
4385
4386         /* reset ctx switch state */
4387         gr_gk20a_ctx_reset(g, 0);
4388
4389         /* clear scc ram */
4390         gk20a_writel(g, gr_scc_init_r(),
4391                 gr_scc_init_ram_trigger_f());
4392
4393         /* load non_ctx init */
4394         for (i = 0; i < sw_non_ctx_load->count; i++)
4395                 gk20a_writel(g, sw_non_ctx_load->l[i].addr,
4396                         sw_non_ctx_load->l[i].value);
4397
4398         err = gr_gk20a_wait_mem_scrubbing(g);
4399         if (err)
4400                 goto out;
4401
4402         err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4403         if (err)
4404                 goto out;
4405
4406         err = gr_gk20a_load_ctxsw_ucode(g, gr);
4407         if (err)
4408                 goto out;
4409
4410         /* this appears to query for sw states but fecs actually inits
4411            the ramchain, etc., so this is really hw init */
4412         err = gr_gk20a_init_ctx_state(g, gr);
4413         if (err)
4414                 goto out;
4415
4416 out:
4417         if (err)
4418                 gk20a_err(dev_from_gk20a(g), "gr reset/enable hw failed");
4419         else
4420                 gk20a_dbg_fn("done");
4421
4422         return err;
4423 }
4424
4425 /*
4426  * XXX Merge this list with the debugger/profiler
4427  * session regops whitelists?
4428  */
4429 static u32 wl_addr_gk20a[] = {
4430         /* this list must be sorted (low to high) */
4431         0x404468, /* gr_pri_mme_max_instructions       */
4432         0x418800, /* gr_pri_gpcs_setup_debug           */
4433         0x419a04, /* gr_pri_gpcs_tpcs_tex_lod_dbg      */
4434         0x419a08, /* gr_pri_gpcs_tpcs_tex_samp_dbg     */
4435         0x419e10, /* gr_pri_gpcs_tpcs_sm_dbgr_control0 */
4436         0x419f78, /* gr_pri_gpcs_tpcs_sm_disp_ctrl     */
4437 };
4438
4439 static int gr_gk20a_init_access_map(struct gk20a *g)
4440 {
4441         struct gr_gk20a *gr = &g->gr;
4442         void *data;
4443         int err = 0;
4444         u32 w, nr_pages =
4445                 DIV_ROUND_UP(gr->ctx_vars.priv_access_map_size,
4446                              PAGE_SIZE);
4447
4448         data = vmap(gr->global_ctx_buffer[PRIV_ACCESS_MAP].pages,
4449                     PAGE_ALIGN(gr->global_ctx_buffer[PRIV_ACCESS_MAP].size) >>
4450                     PAGE_SHIFT, 0, pgprot_dmacoherent(PAGE_KERNEL));
4451         if (!data) {
4452                 gk20a_err(dev_from_gk20a(g),
4453                           "failed to map priv access map memory");
4454                 err = -ENOMEM;
4455                 goto clean_up;
4456         }
4457
4458         memset(data, 0x0, PAGE_SIZE * nr_pages);
4459
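        /*
         * The priv access map is a bitmap with one bit per 32-bit register:
         * bit index = (register address >> 2), packed 8 bits per byte.
         * Set the bit for every whitelisted address so those registers are
         * marked as accessible in the map.
         */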
4460         for (w = 0; w < ARRAY_SIZE(wl_addr_gk20a); w++) {
4461                 u32 map_bit, map_byte, map_shift;
4462                 map_bit = wl_addr_gk20a[w] >> 2;
4463                 map_byte = map_bit >> 3;
4464                 map_shift = map_bit & 0x7; /* i.e. 0-7 */
4465                 gk20a_dbg_info("access map addr:0x%x byte:0x%x bit:%d",
4466                   wl_addr_gk20a[w], map_byte, map_shift);
4467                 ((u8 *)data)[map_byte] |= 1 << map_shift;
4468         }
4469
4470 clean_up:
4471         if (data)
4472                 vunmap(data);
4473         return err;
4474 }
4475
4476 static int gk20a_init_gr_setup_sw(struct gk20a *g)
4477 {
4478         struct gr_gk20a *gr = &g->gr;
4479         int err;
4480
4481         gk20a_dbg_fn("");
4482
4483         if (gr->sw_ready) {
4484                 gk20a_dbg_fn("skip init");
4485                 return 0;
4486         }
4487
4488         gr->g = g;
4489
4490         err = gr_gk20a_init_gr_config(g, gr);
4491         if (err)
4492                 goto clean_up;
4493
4494         err = gr_gk20a_init_mmu_sw(g, gr);
4495         if (err)
4496                 goto clean_up;
4497
4498         err = gr_gk20a_init_map_tiles(g, gr);
4499         if (err)
4500                 goto clean_up;
4501
4502         if (tegra_cpu_is_asim())
4503                 gr->max_comptag_mem = 1; /* MBs worth of comptag coverage */
4504         else {
4505                 gk20a_dbg_info("total ram pages : %lu", totalram_pages);
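                /* cover all of system memory: shifting the page count right
                 * by (20 - PAGE_SHIFT) converts totalram_pages to MBs */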
4506                 gr->max_comptag_mem = totalram_pages
4507                                          >> (10 - (PAGE_SHIFT - 10));
4508         }
4509         err = g->ops.ltc.init_comptags(g, gr);
4510         if (err)
4511                 goto clean_up;
4512
4513         err = gr_gk20a_init_zcull(g, gr);
4514         if (err)
4515                 goto clean_up;
4516
4517         err = gr_gk20a_alloc_global_ctx_buffers(g);
4518         if (err)
4519                 goto clean_up;
4520
4521         err = gr_gk20a_init_access_map(g);
4522         if (err)
4523                 goto clean_up;
4524
4525         mutex_init(&gr->ctx_mutex);
4526         spin_lock_init(&gr->ch_tlb_lock);
4527
4528         gr->remove_support = gk20a_remove_gr_support;
4529         gr->sw_ready = true;
4530
4531         gk20a_dbg_fn("done");
4532         return 0;
4533
4534 clean_up:
4535         gk20a_err(dev_from_gk20a(g), "gr sw setup failed");
4536         gk20a_remove_gr_support(gr);
4537         return err;
4538 }
4539
4540 int gk20a_init_gr_support(struct gk20a *g)
4541 {
4542         int err;
4543
4544         gk20a_dbg_fn("");
4545
4546         err = gk20a_init_gr_prepare(g);
4547         if (err)
4548                 return err;
4549
4550         /* this is required before gr_gk20a_init_ctx_state */
4551         mutex_init(&g->gr.fecs_mutex);
4552
4553         err = gk20a_init_gr_reset_enable_hw(g);
4554         if (err)
4555                 return err;
4556
4557         err = gk20a_init_gr_setup_sw(g);
4558         if (err)
4559                 return err;
4560
4561         err = gk20a_init_gr_setup_hw(g);
4562         if (err)
4563                 return err;
4564
4565         /* GR is initialized, signal possible waiters */
4566         g->gr.initialized = true;
4567         wake_up(&g->gr.init_wq);
4568
4569         return 0;
4570 }
4571
4572 /* Wait until GR is initialized */
4573 void gk20a_gr_wait_initialized(struct gk20a *g)
4574 {
4575         wait_event(g->gr.init_wq, g->gr.initialized);
4576 }
4577
4578 #define NVA297_SET_ALPHA_CIRCULAR_BUFFER_SIZE   0x02dc
4579 #define NVA297_SET_CIRCULAR_BUFFER_SIZE         0x1280
4580 #define NVA297_SET_SHADER_EXCEPTIONS            0x1528
4581 #define NVA0C0_SET_SHADER_EXCEPTIONS            0x1528
4582
4583 #define NVA297_SET_SHADER_EXCEPTIONS_ENABLE_FALSE 0
4584
4585 struct gr_isr_data {
4586         u32 addr;
4587         u32 data_lo;
4588         u32 data_hi;
4589         u32 curr_ctx;
4590         u32 chid;
4591         u32 offset;
4592         u32 sub_chan;
4593         u32 class_num;
4594 };
4595
4596 void gk20a_gr_set_shader_exceptions(struct gk20a *g, u32 data)
4597 {
4598         gk20a_dbg_fn("");
4599
4600         if (data == NVA297_SET_SHADER_EXCEPTIONS_ENABLE_FALSE) {
4601                 gk20a_writel(g,
4602                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(), 0);
4603                 gk20a_writel(g,
4604                         gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(), 0);
4605         } else {
4606                 /* setup sm warp esr report masks */
4607                 gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(),
4608                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_stack_error_report_f() |
4609                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_api_stack_error_report_f() |
4610                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_ret_empty_stack_error_report_f() |
4611                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_wrap_report_f() |
4612                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_pc_report_f() |
4613                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_overflow_report_f() |
4614                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_immc_addr_report_f() |
4615                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_reg_report_f() |
4616                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_encoding_report_f() |
4617                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_sph_instr_combo_report_f() |
4618                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param_report_f() |
4619                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_report_f() |
4620                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_reg_report_f() |
4621                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_addr_report_f() |
4622                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_addr_report_f() |
4623                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_addr_space_report_f() |
4624                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param2_report_f() |
4625                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_ldc_report_f() |
4626                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_geometry_sm_error_report_f() |
4627                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_divergent_report_f());
4628
4629                 /* setup sm global esr report mask */
4630                 gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(),
4631                         gr_gpcs_tpcs_sm_hww_global_esr_report_mask_sm_to_sm_fault_report_f() |
4632                         gr_gpcs_tpcs_sm_hww_global_esr_report_mask_l1_error_report_f() |
4633                         gr_gpcs_tpcs_sm_hww_global_esr_report_mask_multiple_warp_errors_report_f() |
4634                         gr_gpcs_tpcs_sm_hww_global_esr_report_mask_physical_stack_overflow_error_report_f() |
4635                         gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_int_report_f() |
4636                         gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_pause_report_f() |
4637                         gr_gpcs_tpcs_sm_hww_global_esr_report_mask_single_step_complete_report_f());
4638         }
4639 }
4640
4641 static void gk20a_gr_set_circular_buffer_size(struct gk20a *g, u32 data)
4642 {
4643         struct gr_gk20a *gr = &g->gr;
4644         u32 gpc_index, ppc_index, stride, val, offset;
4645         u32 cb_size = data * 4;
4646
4647         gk20a_dbg_fn("");
4648
4649         if (cb_size > gr->attrib_cb_size)
4650                 cb_size = gr->attrib_cb_size;
4651
4652         gk20a_writel(g, gr_ds_tga_constraintlogic_r(),
4653                 (gk20a_readl(g, gr_ds_tga_constraintlogic_r()) &
4654                  ~gr_ds_tga_constraintlogic_beta_cbsize_f(~0)) |
4655                  gr_ds_tga_constraintlogic_beta_cbsize_f(cb_size));
4656
4657         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4658                 stride = proj_gpc_stride_v() * gpc_index;
4659
4660                 for (ppc_index = 0; ppc_index < gr->gpc_ppc_count[gpc_index];
4661                         ppc_index++) {
4662
4663                         val = gk20a_readl(g, gr_gpc0_ppc0_cbm_cfg_r() +
4664                                 stride +
4665                                 proj_ppc_in_gpc_stride_v() * ppc_index);
4666
4667                         offset = gr_gpc0_ppc0_cbm_cfg_start_offset_v(val);
4668
4669                         val = set_field(val,
4670                                 gr_gpc0_ppc0_cbm_cfg_size_m(),
4671                                 gr_gpc0_ppc0_cbm_cfg_size_f(cb_size *
4672                                         gr->pes_tpc_count[ppc_index][gpc_index]));
4673                         val = set_field(val,
4674                                 gr_gpc0_ppc0_cbm_cfg_start_offset_m(),
4675                                 (offset + 1));
4676
4677                         gk20a_writel(g, gr_gpc0_ppc0_cbm_cfg_r() +
4678                                 stride +
4679                                 proj_ppc_in_gpc_stride_v() * ppc_index, val);
4680
4681                         val = set_field(val,
4682                                 gr_gpc0_ppc0_cbm_cfg_start_offset_m(),
4683                                 offset);
4684
4685                         gk20a_writel(g, gr_gpc0_ppc0_cbm_cfg_r() +
4686                                 stride +
4687                                 proj_ppc_in_gpc_stride_v() * ppc_index, val);
4688                 }
4689         }
4690 }
4691
4692 static void gk20a_gr_set_alpha_circular_buffer_size(struct gk20a *g, u32 data)
4693 {
4694         struct gr_gk20a *gr = &g->gr;
4695         u32 gpc_index, ppc_index, stride, val;
4696         u32 pd_ab_max_output;
4697         u32 alpha_cb_size = data * 4;
4698
4699         gk20a_dbg_fn("");
4700         /* if (NO_ALPHA_BETA_TIMESLICE_SUPPORT_DEF)
4701                 return; */
4702
4703         if (alpha_cb_size > gr->alpha_cb_size)
4704                 alpha_cb_size = gr->alpha_cb_size;
4705
4706         gk20a_writel(g, gr_ds_tga_constraintlogic_r(),
4707                 (gk20a_readl(g, gr_ds_tga_constraintlogic_r()) &
4708                  ~gr_ds_tga_constraintlogic_alpha_cbsize_f(~0)) |
4709                  gr_ds_tga_constraintlogic_alpha_cbsize_f(alpha_cb_size));
4710
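        /* convert the alpha CB size from CBM cfg size-granularity units into
         * the PD alpha/beta distributor's max_output granularity */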
4711         pd_ab_max_output = alpha_cb_size *
4712                 gr_gpc0_ppc0_cbm_cfg_size_granularity_v() /
4713                 gr_pd_ab_dist_cfg1_max_output_granularity_v();
4714
4715         gk20a_writel(g, gr_pd_ab_dist_cfg1_r(),
4716                 gr_pd_ab_dist_cfg1_max_output_f(pd_ab_max_output));
4717
4718         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4719                 stride = proj_gpc_stride_v() * gpc_index;
4720
4721                 for (ppc_index = 0; ppc_index < gr->gpc_ppc_count[gpc_index];
4722                         ppc_index++) {
4723
4724                         val = gk20a_readl(g, gr_gpc0_ppc0_cbm_cfg2_r() +
4725                                 stride +
4726                                 proj_ppc_in_gpc_stride_v() * ppc_index);
4727
4728                         val = set_field(val, gr_gpc0_ppc0_cbm_cfg2_size_m(),
4729                                         gr_gpc0_ppc0_cbm_cfg2_size_f(alpha_cb_size *
4730                                                 gr->pes_tpc_count[ppc_index][gpc_index]));
4731
4732                         gk20a_writel(g, gr_gpc0_ppc0_cbm_cfg2_r() +
4733                                 stride +
4734                                 proj_ppc_in_gpc_stride_v() * ppc_index, val);
4735                 }
4736         }
4737 }
4738
4739 int gk20a_gr_reset(struct gk20a *g)
4740 {
4741         int err;
4742         u32 size;
4743
4744         err = gk20a_init_gr_prepare(g);
4745         if (err)
4746                 return err;
4747
4748         err = gk20a_init_gr_reset_enable_hw(g);
4749         if (err)
4750                 return err;
4751
4752         err = gk20a_init_gr_setup_hw(g);
4753         if (err)
4754                 return err;
4755
4756         size = 0;
4757         err = gr_gk20a_fecs_get_reglist_img_size(g, &size);
4758         if (err) {
4759                 gk20a_err(dev_from_gk20a(g),
4760                         "fail to query fecs pg buffer size");
4761                 return err;
4762         }
4763
4764         err = gr_gk20a_fecs_set_reglist_bind_inst(g,
4765                         g->mm.pmu.inst_block.cpu_pa);
4766         if (err) {
4767                 gk20a_err(dev_from_gk20a(g),
4768                         "fail to bind pmu inst to gr");
4769                 return err;
4770         }
4771
4772         err = gr_gk20a_fecs_set_reglist_virtual_addr(g, g->pmu.pg_buf.pmu_va);
4773         if (err) {
4774                 gk20a_err(dev_from_gk20a(g),
4775                         "fail to set pg buffer pmu va");
4776                 return err;
4777         }
4778
4779         return 0;
4780 }
4781
4782 static int gr_gk20a_handle_sw_method(struct gk20a *g, u32 addr,
4783                                           u32 class_num, u32 offset, u32 data)
4784 {
4785         gk20a_dbg_fn("");
4786
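        /* trapped method offsets are in 32-bit words; shift by 2 to get the
         * byte offset used by the NVA297/NVA0C0 method defines above */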
4787         if (class_num == KEPLER_COMPUTE_A) {
4788                 switch (offset << 2) {
4789                 case NVA0C0_SET_SHADER_EXCEPTIONS:
4790                         gk20a_gr_set_shader_exceptions(g, data);
4791                         break;
4792                 default:
4793                         goto fail;
4794                 }
4795         }
4796
4797         if (class_num == KEPLER_C) {
4798                 switch (offset << 2) {
4799                 case NVA297_SET_SHADER_EXCEPTIONS:
4800                         gk20a_gr_set_shader_exceptions(g, data);
4801                         break;
4802                 case NVA297_SET_CIRCULAR_BUFFER_SIZE:
4803                         g->ops.gr.set_circular_buffer_size(g, data);
4804                         break;
4805                 case NVA297_SET_ALPHA_CIRCULAR_BUFFER_SIZE:
4806                         g->ops.gr.set_alpha_circular_buffer_size(g, data);
4807                         break;
4808                 default:
4809                         goto fail;
4810                 }
4811         }
4812         return 0;
4813
4814 fail:
4815         return -EINVAL;
4816 }
4817
4818 static int gk20a_gr_handle_semaphore_timeout_pending(struct gk20a *g,
4819                   struct gr_isr_data *isr_data)
4820 {
4821         struct fifo_gk20a *f = &g->fifo;
4822         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4823         gk20a_dbg_fn("");
4824         gk20a_set_error_notifier(ch,
4825                                 NVHOST_CHANNEL_GR_SEMAPHORE_TIMEOUT);
4826         gk20a_err(dev_from_gk20a(g),
4827                    "gr semaphore timeout\n");
4828         return -EINVAL;
4829 }
4830
4831 static int gk20a_gr_intr_illegal_notify_pending(struct gk20a *g,
4832                   struct gr_isr_data *isr_data)
4833 {
4834         struct fifo_gk20a *f = &g->fifo;
4835         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4836         gk20a_dbg_fn("");
4837         gk20a_set_error_notifier(ch,
4838                                 NVHOST_CHANNEL_GR_ILLEGAL_NOTIFY);
4839         /* This is an unrecoverable error, reset is needed */
4840         gk20a_err(dev_from_gk20a(g),
4841                    "gr illegal notify pending\n");
4842         return -EINVAL;
4843 }
4844
4845 static int gk20a_gr_handle_illegal_method(struct gk20a *g,
4846                                           struct gr_isr_data *isr_data)
4847 {
4848         int ret = g->ops.gr.handle_sw_method(g, isr_data->addr,
4849                         isr_data->class_num, isr_data->offset,
4850                         isr_data->data_lo);
4851         if (ret)
4852                 gk20a_err(dev_from_gk20a(g), "invalid method class 0x%08x"
4853                         ", offset 0x%08x, address 0x%08x\n",
4854                         isr_data->class_num, isr_data->offset, isr_data->addr);
4855
4856         return ret;
4857 }
4858
4859 static int gk20a_gr_handle_illegal_class(struct gk20a *g,
4860                                           struct gr_isr_data *isr_data)
4861 {
4862         struct fifo_gk20a *f = &g->fifo;
4863         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4864         gk20a_dbg_fn("");
4865         gk20a_set_error_notifier(ch,
4866                                 NVHOST_CHANNEL_GR_ERROR_SW_NOTIFY);
4867         gk20a_err(dev_from_gk20a(g),
4868                    "invalid class 0x%08x, offset 0x%08x",
4869                    isr_data->class_num, isr_data->offset);
4870         return -EINVAL;
4871 }
4872
4873 static int gk20a_gr_handle_fecs_error(struct gk20a *g,
4874                                           struct gr_isr_data *isr_data)
4875 {
4876         struct fifo_gk20a *f = &g->fifo;
4877         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4878         u32 gr_fecs_intr = gk20a_readl(g, gr_fecs_intr_r());
4879         gk20a_dbg_fn("");
4880
4881         gk20a_err(dev_from_gk20a(g),
4882                    "unhandled fecs error interrupt 0x%08x for channel %u",
4883                    gr_fecs_intr, ch->hw_chid);
4884
4885         gk20a_writel(g, gr_fecs_intr_r(), gr_fecs_intr);
4886         return -EINVAL;
4887 }
4888
4889 static int gk20a_gr_handle_class_error(struct gk20a *g,
4890                                           struct gr_isr_data *isr_data)
4891 {
4892         struct fifo_gk20a *f = &g->fifo;
4893         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4894         u32 gr_class_error =
4895                 gr_class_error_code_v(gk20a_readl(g, gr_class_error_r()));
4896         gk20a_dbg_fn("");
4897
4898         gk20a_set_error_notifier(ch,
4899                         NVHOST_CHANNEL_GR_ERROR_SW_NOTIFY);
4900         gk20a_err(dev_from_gk20a(g),
4901                    "class error 0x%08x, offset 0x%08x, unhandled intr 0x%08x for channel %u\n",
4902                    isr_data->class_num, isr_data->offset,
4903                    gr_class_error, ch->hw_chid);
4904         return -EINVAL;
4905 }
4906
4907 static int gk20a_gr_handle_semaphore_pending(struct gk20a *g,
4908                                              struct gr_isr_data *isr_data)
4909 {
4910         struct fifo_gk20a *f = &g->fifo;
4911         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4912
4913         wake_up(&ch->semaphore_wq);
4914
4915         return 0;
4916 }
4917
4918 #if defined(CONFIG_GK20A_CYCLE_STATS)
4919 static inline bool is_valid_cyclestats_bar0_offset_gk20a(struct gk20a *g,
4920                                                          u32 offset)
4921 {
4922         /* support only 24-bit 4-byte aligned offsets */
4923         bool valid = !(offset & 0xFF000003);
4924         /* whitelist check */
4925         valid = valid &&
4926                 is_bar0_global_offset_whitelisted_gk20a(offset);
4927         /* resource size check in case there was a problem
4928          * with allocating the assumed size of bar0 */
4929         valid = valid &&
4930                 offset < resource_size(g->reg_mem);
4931         return valid;
4932 }
4933 #endif
4934
4935 static int gk20a_gr_handle_notify_pending(struct gk20a *g,
4936                                           struct gr_isr_data *isr_data)
4937 {
4938         struct fifo_gk20a *f = &g->fifo;
4939         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4940
4941 #if defined(CONFIG_GK20A_CYCLE_STATS)
4942         void *virtual_address;
4943         u32 buffer_size;
4944         u32 offset;
4945         u32 new_offset;
4946         bool exit;
4947         struct share_buffer_head *sh_hdr;
4948         u32 raw_reg;
4949         u64 mask_orig;
4950         u64 v = 0;
4951         struct gk20a_cyclestate_buffer_elem *op_elem;
4952         /* GL will never use payload 0 for cycle state */
4953         if ((ch->cyclestate.cyclestate_buffer == NULL) || (isr_data->data_lo == 0))
4954                 return 0;
4955
4956         mutex_lock(&ch->cyclestate.cyclestate_buffer_mutex);
4957
4958         virtual_address = ch->cyclestate.cyclestate_buffer;
4959         buffer_size = ch->cyclestate.cyclestate_buffer_size;
4960         offset = isr_data->data_lo;
4961         exit = false;
4962         while (!exit) {
4963                 if (offset >= buffer_size) {
4964                         WARN_ON(1);
4965                         break;
4966                 }
4967
4968                 sh_hdr = (struct share_buffer_head *)
4969                         ((char *)virtual_address + offset);
4970
4971                 if (sh_hdr->size < sizeof(struct share_buffer_head)) {
4972                         WARN_ON(1);
4973                         break;
4974                 }
4975                 new_offset = offset + sh_hdr->size;
4976
4977                 switch (sh_hdr->operation) {
4978                 case OP_END:
4979                         exit = true;
4980                         break;
4981
4982                 case BAR0_READ32:
4983                 case BAR0_WRITE32:
4984                 {
4985                         bool valid;
4986                         op_elem =
4987                                 (struct gk20a_cyclestate_buffer_elem *)
4988                                         sh_hdr;
4989                         valid = is_valid_cyclestats_bar0_offset_gk20a(g,
4990                                                         op_elem->offset_bar0);
4991                         if (!valid) {
4992                                 gk20a_err(dev_from_gk20a(g),
4993                                            "invalid cyclestats op offset: 0x%x\n",
4994                                            op_elem->offset_bar0);
4995
4996                                 sh_hdr->failed = exit = true;
4997                                 break;
4998                         }
4999
5000
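                        /*
                         * Build a mask covering bits first_bit..last_bit
                         * inclusive of the target BAR0 register: reads
                         * extract just that field, writes read-modify-write
                         * only those bits.
                         */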
5001                         mask_orig =
5002                                 ((1ULL <<
5003                                   (op_elem->last_bit + 1))
5004                                  -1)&~((1ULL <<
5005                                         op_elem->first_bit)-1);
5006
5007                         raw_reg =
5008                                 gk20a_readl(g,
5009                                             op_elem->offset_bar0);
5010
5011                         switch (sh_hdr->operation) {
5012                         case BAR0_READ32:
5013                                 op_elem->data =
5014                                         (raw_reg & mask_orig)
5015                                         >> op_elem->first_bit;
5016                                 break;
5017
5018                         case BAR0_WRITE32:
5019                                 v = 0;
5020                                 if ((unsigned int)mask_orig !=
5021                                     (unsigned int)~0) {
5022                                         v = (unsigned int)
5023                                                 (raw_reg & ~mask_orig);
5024                                 }
5025
5026                                 v |= ((op_elem->data
5027                                        << op_elem->first_bit)
5028                                       & mask_orig);
5029
5030                                 gk20a_writel(g,
5031                                              op_elem->offset_bar0,
5032                                              (unsigned int)v);
5033                                 break;
5034                         default:
5035                                 /* nop ok?*/
5036                                 break;
5037                         }
5038                 }
5039                 break;
5040
5041                 default:
5042                         /* no operation content case */
5043                         exit = true;
5044                         break;
5045                 }
5046                 sh_hdr->completed = true;
5047                 offset = new_offset;
5048         }
5049         mutex_unlock(&ch->cyclestate.cyclestate_buffer_mutex);
5050 #endif
5051         gk20a_dbg_fn("");
5052         wake_up(&ch->notifier_wq);
5053         return 0;
5054 }
5055
5056 /* Used by sw interrupt thread to translate current ctx to chid.
5057  * For performance, we don't want to go through 128 channels every time.
5058  * curr_ctx should be the value read from gr_fecs_current_ctx_r().
5059  * A small tlb is used here to cache translation */
5060 static int gk20a_gr_get_chid_from_ctx(struct gk20a *g, u32 curr_ctx)
5061 {
5062         struct fifo_gk20a *f = &g->fifo;
5063         struct gr_gk20a *gr = &g->gr;
5064         u32 chid = -1;
5065         u32 i;
5066
5067         /* when contexts are unloaded from GR, the valid bit is reset
5068          * but the instance pointer information remains intact. So the
5069          * valid bit must be checked to be absolutely certain that a
5070          * valid context is currently resident. */
5071         if (!gr_fecs_current_ctx_valid_v(curr_ctx))
5072                 return -1;
5073
5074         spin_lock(&gr->ch_tlb_lock);
5075
5076         /* check cache first */
5077         for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) {
5078                 if (gr->chid_tlb[i].curr_ctx == curr_ctx) {
5079                         chid = gr->chid_tlb[i].hw_chid;
5080                         goto unlock;
5081                 }
5082         }
5083
5084         /* slow path */
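        /* compare each in-use channel's instance block base, shifted down by
         * ram_in_base_shift, against the pointer field of curr_ctx */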
5085         for (chid = 0; chid < f->num_channels; chid++)
5086                 if (f->channel[chid].in_use) {
5087                         if ((u32)(f->channel[chid].inst_block.cpu_pa >>
5088                                 ram_in_base_shift_v()) ==
5089                                 gr_fecs_current_ctx_ptr_v(curr_ctx))
5090                                 break;
5091                 }
5092
5093         if (chid >= f->num_channels) {
5094                 chid = -1;
5095                 goto unlock;
5096         }
5097
5098         /* add to free tlb entry */
5099         for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) {
5100                 if (gr->chid_tlb[i].curr_ctx == 0) {
5101                         gr->chid_tlb[i].curr_ctx = curr_ctx;
5102                         gr->chid_tlb[i].hw_chid = chid;
5103                         goto unlock;
5104                 }
5105         }
5106
5107         /* no free entry, flush one */
5108         gr->chid_tlb[gr->channel_tlb_flush_index].curr_ctx = curr_ctx;
5109         gr->chid_tlb[gr->channel_tlb_flush_index].hw_chid = chid;
5110
5111         gr->channel_tlb_flush_index =
5112                 (gr->channel_tlb_flush_index + 1) &
5113                 (GR_CHANNEL_MAP_TLB_SIZE - 1);
5114
5115 unlock:
5116         spin_unlock(&gr->ch_tlb_lock);
5117         return chid;
5118 }
5119
5120 static int gk20a_gr_lock_down_sm(struct gk20a *g, u32 global_esr_mask)
5121 {
5122         unsigned long end_jiffies = jiffies +
5123                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
5124         u32 delay = GR_IDLE_CHECK_DEFAULT;
5125         bool mmu_debug_mode_enabled = gk20a_mm_mmu_debug_mode_enabled(g);
5126         u32 dbgr_control0;
5127
5128         gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "locking down SM");
5129
5130         /* assert stop trigger */
5131         dbgr_control0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());
5132         dbgr_control0 |= gr_gpc0_tpc0_sm_dbgr_control0_stop_trigger_enable_f();
5133         gk20a_writel(g, gr_gpc0_tpc0_sm_dbgr_control0_r(), dbgr_control0);
5134
5135         /* wait for the sm to lock down */
5136         do {
5137                 u32 global_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r());
5138                 u32 warp_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_warp_esr_r());
5139                 u32 dbgr_status0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_status0_r());
5140                 bool locked_down =
5141                         (gr_gpc0_tpc0_sm_dbgr_status0_locked_down_v(dbgr_status0) ==
5142                          gr_gpc0_tpc0_sm_dbgr_status0_locked_down_true_v());
5143                 bool error_pending =
5144                         (gr_gpc0_tpc0_sm_hww_warp_esr_error_v(warp_esr) !=
5145                          gr_gpc0_tpc0_sm_hww_warp_esr_error_none_v()) ||
5146                         ((global_esr & ~global_esr_mask) != 0);
5147
5148                 if (locked_down || !error_pending) {
5149                         gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "locked down SM");
5150
5151                         /* de-assert stop trigger */
5152                         dbgr_control0 &= ~gr_gpc0_tpc0_sm_dbgr_control0_stop_trigger_enable_f();
5153                         gk20a_writel(g, gr_gpc0_tpc0_sm_dbgr_control0_r(), dbgr_control0);
5154
5155                         return 0;
5156                 }
5157
5158                 /* if an mmu fault is pending and mmu debug mode is not
5159                  * enabled, the sm will never lock down. */
5160                 if (!mmu_debug_mode_enabled && gk20a_fifo_mmu_fault_pending(g)) {
5161                         gk20a_err(dev_from_gk20a(g), "mmu fault pending, sm will"
5162                                    " never lock down!");
5163                         return -EFAULT;
5164                 }
5165
5166                 usleep_range(delay, delay * 2);
5167                 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
5168
5169         } while (time_before(jiffies, end_jiffies)
5170                         || !tegra_platform_is_silicon());
5171
5172         gk20a_err(dev_from_gk20a(g), "timed out while trying to lock down SM");
5173
5174         return -EAGAIN;
5175 }
5176
5177 bool gk20a_gr_sm_debugger_attached(struct gk20a *g)
5178 {
5179         u32 dbgr_control0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());
5180
5181         /* check if an sm debugger is attached */
5182         if (gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_v(dbgr_control0) ==
5183                         gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_on_v())
5184                 return true;
5185
5186         return false;
5187 }
5188
5189 static void gk20a_gr_clear_sm_hww(struct gk20a *g, u32 global_esr)
5190 {
5191         gk20a_writel(g, gr_gpc0_tpc0_sm_hww_global_esr_r(), global_esr);
5192
5193         /* clear the warp hww */
5194         gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_r(),
5195                         gr_gpc0_tpc0_sm_hww_warp_esr_error_none_f());
5196 }
5197
5198 static struct channel_gk20a *
5199 channel_from_hw_chid(struct gk20a *g, u32 hw_chid)
5200 {
5201         return g->fifo.channel+hw_chid;
5202 }
5203
5204 static int gk20a_gr_handle_sm_exception(struct gk20a *g,
5205                 struct gr_isr_data *isr_data)
5206 {
5207         int ret = 0;
5208         bool do_warp_sync = false;
5209         /* these three interrupts don't require locking down the SM. They can
5210          * be handled by usermode clients as they aren't fatal. Additionally,
5211          * usermode clients may wish to allow some warps to execute while others
5212          * are at breakpoints, as opposed to fatal errors where all warps should
5213          * halt. */
5214         u32 global_mask = gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f()   |
5215                           gr_gpc0_tpc0_sm_hww_global_esr_bpt_pause_pending_f() |
5216                           gr_gpc0_tpc0_sm_hww_global_esr_single_step_complete_pending_f();
5217         u32 global_esr, warp_esr;
5218         bool sm_debugger_attached = gk20a_gr_sm_debugger_attached(g);
5219         struct channel_gk20a *fault_ch;
5220
5221         gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
5222
5223         global_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r());
5224         warp_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_warp_esr_r());
5225
5226         /* if an sm debugger is attached, disable forwarding of tpc exceptions.
5227          * the debugger will reenable exceptions after servicing them. */
5228         if (sm_debugger_attached) {
5229                 u32 tpc_exception_en = gk20a_readl(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r());
5230                 tpc_exception_en &= ~gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_enabled_f();
5231                 gk20a_writel(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r(), tpc_exception_en);
5232                 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "SM debugger attached");
5233         }
5234
5235         /* if a debugger is present and an error has occurred, do a warp sync */
5236         if (sm_debugger_attached && ((warp_esr != 0) || ((global_esr & ~global_mask) != 0))) {
5237                 gk20a_dbg(gpu_dbg_intr, "warp sync needed");
5238                 do_warp_sync = true;
5239         }
5240
5241         if (do_warp_sync) {
5242                 ret = gk20a_gr_lock_down_sm(g, global_mask);
5243                 if (ret) {
5244                         gk20a_err(dev_from_gk20a(g), "sm did not lock down!\n");
5245                         return ret;
5246                 }
5247         }
5248
5249         /* finally, signal any client waiting on an event */
5250         fault_ch = channel_from_hw_chid(g, isr_data->chid);
5251         if (fault_ch)
5252                 gk20a_dbg_gpu_post_events(fault_ch);
5253
5254         return ret;
5255 }
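/*
 * Note on global_mask above: these are the same non-fatal ESR bits that
 * the lock-down wait earlier in this file excludes when it checks for a
 * pending error, i.e.:
 *
 *        bool error_pending =
 *                (gr_gpc0_tpc0_sm_hww_warp_esr_error_v(warp_esr) !=
 *                 gr_gpc0_tpc0_sm_hww_warp_esr_error_none_v()) ||
 *                ((global_esr & ~global_esr_mask) != 0);
 *
 * so pending breakpoint/single-step interrupts alone never prevent the
 * SM from being reported as locked down.
 */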
5256
5257 static int gk20a_gr_handle_tpc_exception(struct gk20a *g,
5258                 struct gr_isr_data *isr_data)
5259 {
5260         int ret = 0;
5261         u32 tpc_exception = gk20a_readl(g, gr_gpcs_tpcs_tpccs_tpc_exception_r());
5262
5263         gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "");
5264
5265         /* check if an sm exception is pending */
5266         if (gr_gpcs_tpcs_tpccs_tpc_exception_sm_v(tpc_exception) ==
5267                         gr_gpcs_tpcs_tpccs_tpc_exception_sm_pending_v()) {
5268                 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "SM exception pending");
5269                 ret = gk20a_gr_handle_sm_exception(g, isr_data);
5270         }
5271
5272         return ret;
5273 }
5274
5275 static int gk20a_gr_handle_gpc_exception(struct gk20a *g,
5276                 struct gr_isr_data *isr_data)
5277 {
5278         int ret = 0;
5279         u32 gpc_exception = gk20a_readl(g, gr_gpcs_gpccs_gpc_exception_r());
5280
5281         gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "");
5282
5283         /* check if tpc 0 has an exception */
5284         if (gr_gpcs_gpccs_gpc_exception_tpc_v(gpc_exception) ==
5285                         gr_gpcs_gpccs_gpc_exception_tpc_0_pending_v()) {
5286                 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "TPC exception pending");
5287                 ret = gk20a_gr_handle_tpc_exception(g, isr_data);
5288         }
5289
5290         return ret;
5291 }
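/*
 * Exception dispatch chain, for reference (only the index-0 GPC/TPC is
 * checked in these handlers):
 *
 *        gk20a_gr_isr()                          gr_exception1: gpc 0 pending
 *          -> gk20a_gr_handle_gpc_exception()    gpc_exception: tpc 0 pending
 *            -> gk20a_gr_handle_tpc_exception()  tpc_exception: sm pending
 *              -> gk20a_gr_handle_sm_exception() lock down the SM, post events
 */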
5292
5293 int gk20a_gr_isr(struct gk20a *g)
5294 {
5295         struct gr_isr_data isr_data;
5296         u32 grfifo_ctl;
5297         u32 obj_table;
5298         int need_reset = 0;
5299         u32 gr_intr = gk20a_readl(g, gr_intr_r());
5300
5301         gk20a_dbg_fn("");
5302         gk20a_dbg(gpu_dbg_intr, "pgraph intr %08x", gr_intr);
5303
5304         if (!gr_intr)
5305                 return 0;
5306
5307         grfifo_ctl = gk20a_readl(g, gr_gpfifo_ctl_r());
5308         grfifo_ctl &= ~gr_gpfifo_ctl_semaphore_access_f(1);
5309         grfifo_ctl &= ~gr_gpfifo_ctl_access_f(1);
5310
5311         gk20a_writel(g, gr_gpfifo_ctl_r(),
5312                 grfifo_ctl | gr_gpfifo_ctl_access_f(0) |
5313                 gr_gpfifo_ctl_semaphore_access_f(0));
5314
5315         isr_data.addr = gk20a_readl(g, gr_trapped_addr_r());
5316         isr_data.data_lo = gk20a_readl(g, gr_trapped_data_lo_r());
5317         isr_data.data_hi = gk20a_readl(g, gr_trapped_data_hi_r());
5318         isr_data.curr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r());
5319         isr_data.offset = gr_trapped_addr_mthd_v(isr_data.addr);
5320         isr_data.sub_chan = gr_trapped_addr_subch_v(isr_data.addr);
5321         obj_table = gk20a_readl(g,
5322                 gr_fe_object_table_r(isr_data.sub_chan));
5323         isr_data.class_num = gr_fe_object_table_nvclass_v(obj_table);
5324
5325         isr_data.chid =
5326                 gk20a_gr_get_chid_from_ctx(g, isr_data.curr_ctx);
5327         if (isr_data.chid == -1) {
5328                 gk20a_err(dev_from_gk20a(g), "invalid channel ctx 0x%08x",
5329                            isr_data.curr_ctx);
5330                 goto clean_up;
5331         }
5332
5333         gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
5334                 "channel %d: addr 0x%08x, "
5335                 "data 0x%08x 0x%08x, "
5336                 "ctx 0x%08x, offset 0x%08x, "
5337                 "subchannel 0x%08x, class 0x%08x",
5338                 isr_data.chid, isr_data.addr,
5339                 isr_data.data_hi, isr_data.data_lo,
5340                 isr_data.curr_ctx, isr_data.offset,
5341                 isr_data.sub_chan, isr_data.class_num);
5342
5343         if (gr_intr & gr_intr_notify_pending_f()) {
5344                 gk20a_gr_handle_notify_pending(g, &isr_data);
5345                 gk20a_writel(g, gr_intr_r(),
5346                         gr_intr_notify_reset_f());
5347                 gr_intr &= ~gr_intr_notify_pending_f();
5348         }
5349
5350         if (gr_intr & gr_intr_semaphore_pending_f()) {
5351                 gk20a_gr_handle_semaphore_pending(g, &isr_data);
5352                 gk20a_writel(g, gr_intr_r(),
5353                         gr_intr_semaphore_reset_f());
5354                 gr_intr &= ~gr_intr_semaphore_pending_f();
5355         }
5356
5357         if (gr_intr & gr_intr_semaphore_timeout_pending_f()) {
5358                 need_reset |= gk20a_gr_handle_semaphore_timeout_pending(g,
5359                         &isr_data);
5360                 gk20a_writel(g, gr_intr_r(),
5361                         gr_intr_semaphore_reset_f());
5362                 gr_intr &= ~gr_intr_semaphore_timeout_pending_f();
5363         }
5364
5365         if (gr_intr & gr_intr_illegal_notify_pending_f()) {
5366                 need_reset |= gk20a_gr_intr_illegal_notify_pending(g,
5367                         &isr_data);
5368                 gk20a_writel(g, gr_intr_r(),
5369                         gr_intr_illegal_notify_reset_f());
5370                 gr_intr &= ~gr_intr_illegal_notify_pending_f();
5371         }
5372
5373         if (gr_intr & gr_intr_illegal_method_pending_f()) {
5374                 need_reset |= gk20a_gr_handle_illegal_method(g, &isr_data);
5375                 gk20a_writel(g, gr_intr_r(),
5376                         gr_intr_illegal_method_reset_f());
5377                 gr_intr &= ~gr_intr_illegal_method_pending_f();
5378         }
5379
5380         if (gr_intr & gr_intr_illegal_class_pending_f()) {
5381                 need_reset |= gk20a_gr_handle_illegal_class(g, &isr_data);
5382                 gk20a_writel(g, gr_intr_r(),
5383                         gr_intr_illegal_class_reset_f());
5384                 gr_intr &= ~gr_intr_illegal_class_pending_f();
5385         }
5386
5387         if (gr_intr & gr_intr_fecs_error_pending_f()) {
5388                 need_reset |= gk20a_gr_handle_fecs_error(g, &isr_data);
5389                 gk20a_writel(g, gr_intr_r(),
5390                         gr_intr_fecs_error_reset_f());
5391                 gr_intr &= ~gr_intr_fecs_error_pending_f();
5392         }
5393
5394         if (gr_intr & gr_intr_class_error_pending_f()) {
5395                 need_reset |= gk20a_gr_handle_class_error(g, &isr_data);
5396                 gk20a_writel(g, gr_intr_r(),
5397                         gr_intr_class_error_reset_f());
5398                 gr_intr &= ~gr_intr_class_error_pending_f();
5399         }
5400
5401         /* this one happens if someone tries to hit a non-whitelisted
5402          * register using set_falcon[4] */
5403         if (gr_intr & gr_intr_firmware_method_pending_f()) {
5404                 need_reset |= true;
5405                 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "firmware method intr pending\n");
5406                 gk20a_writel(g, gr_intr_r(),
5407                         gr_intr_firmware_method_reset_f());
5408                 gr_intr &= ~gr_intr_firmware_method_pending_f();
5409         }
5410
5411         if (gr_intr & gr_intr_exception_pending_f()) {
5412                 u32 exception = gk20a_readl(g, gr_exception_r());
5413                 struct fifo_gk20a *f = &g->fifo;
5414                 struct channel_gk20a *ch = &f->channel[isr_data.chid];
5415
5416                 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "exception %08x\n", exception);
5417
5418                 if (exception & gr_exception_fe_m()) {
5419                         u32 fe = gk20a_readl(g, gr_fe_hww_esr_r());
5420                         gk20a_dbg(gpu_dbg_intr, "fe warning %08x\n", fe);
5421                         gk20a_writel(g, gr_fe_hww_esr_r(), fe);
5422                         need_reset |= -EFAULT;
5423                 }
5424
5425                 /* check if a gpc exception has occurred */
5426                 if (exception & gr_exception_gpc_m() && need_reset == 0) {
5427                         u32 exception1 = gk20a_readl(g, gr_exception1_r());
5428                         u32 global_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r());
5429
5430                         gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "GPC exception pending");
5431
5432                         /* if no sm debugger is present, clean up the channel */
5433                         if (!gk20a_gr_sm_debugger_attached(g)) {
5434                                 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
5435                                            "SM debugger not attached, clearing interrupt");
5436                                 need_reset |= -EFAULT;
5437                         } else {
5438                                 /* check if gpc 0 has an exception */
5439                                 if (exception1 & gr_exception1_gpc_0_pending_f())
5440                                         need_reset |= gk20a_gr_handle_gpc_exception(g, &isr_data);
5441                                 /* clearing the hww also causes the tpc and
5442                                  * gpc exceptions to be cleared */
5443                                 gk20a_gr_clear_sm_hww(g, global_esr);
5444                         }
5445
5446                         if (need_reset)
5447                                 gk20a_set_error_notifier(ch,
5448                                         NVHOST_CHANNEL_GR_ERROR_SW_NOTIFY);
5449                 }
5450
5451                 gk20a_writel(g, gr_intr_r(), gr_intr_exception_reset_f());
5452                 gr_intr &= ~gr_intr_exception_pending_f();
5453         }
5454
5455         if (need_reset)
5456                 gk20a_fifo_recover(g, BIT(ENGINE_GR_GK20A), true);
5457
5458 clean_up:
5459         gk20a_writel(g, gr_gpfifo_ctl_r(),
5460                 grfifo_ctl | gr_gpfifo_ctl_access_f(1) |
5461                 gr_gpfifo_ctl_semaphore_access_f(1));
5462
5463         if (gr_intr)
5464                 gk20a_err(dev_from_gk20a(g),
5465                            "unhandled gr interrupt 0x%08x", gr_intr);
5466
5467         return 0;
5468 }
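/*
 * Notes on gk20a_gr_isr() above:
 *
 * - gr_gpfifo_ctl access and semaphore access are disabled on entry and
 *   re-enabled at clean_up, bracketing the whole handler.
 *
 * - every stalling interrupt source follows the same acknowledge pattern:
 *   run the handler, write the matching *_reset_f() value to gr_intr_r()
 *   to clear the hardware bit, then clear the bit in the local gr_intr
 *   copy so the final check only reports bits that had no handler, e.g.:
 *
 *        if (gr_intr & gr_intr_notify_pending_f()) {
 *                gk20a_gr_handle_notify_pending(g, &isr_data);
 *                gk20a_writel(g, gr_intr_r(), gr_intr_notify_reset_f());
 *                gr_intr &= ~gr_intr_notify_pending_f();
 *        }
 */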
5469
5470 int gk20a_gr_nonstall_isr(struct gk20a *g)
5471 {
5472         u32 gr_intr = gk20a_readl(g, gr_intr_nonstall_r());
5473         u32 clear_intr = 0;
5474
5475         gk20a_dbg(gpu_dbg_intr, "pgraph nonstall intr %08x", gr_intr);
5476
5477         if (gr_intr & gr_intr_nonstall_trap_pending_f()) {
5478                 gk20a_channel_semaphore_wakeup(g);
5479                 clear_intr |= gr_intr_nonstall_trap_pending_f();
5480         }
5481
5482         gk20a_writel(g, gr_intr_nonstall_r(), clear_intr);
5483
5484         return 0;
5485 }
5486
5487 int gr_gk20a_fecs_get_reglist_img_size(struct gk20a *g, u32 *size)
5488 {
5489         BUG_ON(size == NULL);
5490         return gr_gk20a_submit_fecs_method_op(g,
5491                    (struct fecs_method_op_gk20a) {
5492                            .mailbox.id = 0,
5493                            .mailbox.data = 0,
5494                            .mailbox.clr = ~0,
5495                            .method.data = 1,
5496                            .method.addr = gr_fecs_method_push_adr_discover_reglist_image_size_v(),
5497                            .mailbox.ret = size,
5498                            .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL,
5499                            .mailbox.ok = 0,
5500                            .cond.fail = GR_IS_UCODE_OP_SKIP,
5501                            .mailbox.fail = 0});
5502 }
5503
5504 int gr_gk20a_fecs_set_reglist_bind_inst(struct gk20a *g, phys_addr_t addr)
5505 {
5506         return gr_gk20a_submit_fecs_method_op(g,
5507                    (struct fecs_method_op_gk20a){
5508                            .mailbox.id = 4,
5509                            .mailbox.data = (gr_fecs_current_ctx_ptr_f(addr >> 12) |
5510                                             gr_fecs_current_ctx_valid_f(1) |
5511                                             gr_fecs_current_ctx_target_vid_mem_f()),
5512                            .mailbox.clr = ~0,
5513                            .method.data = 1,
5514                            .method.addr = gr_fecs_method_push_adr_set_reglist_bind_instance_v(),
5515                            .mailbox.ret = NULL,
5516                            .cond.ok = GR_IS_UCODE_OP_EQUAL,
5517                            .mailbox.ok = 1,
5518                            .cond.fail = GR_IS_UCODE_OP_SKIP,
5519                            .mailbox.fail = 0});
5520 }
5521
5522 int gr_gk20a_fecs_set_reglist_virtual_addr(struct gk20a *g, u64 pmu_va)
5523 {
5524         return gr_gk20a_submit_fecs_method_op(g,
5525                    (struct fecs_method_op_gk20a) {
5526                            .mailbox.id = 4,
5527                            .mailbox.data = u64_lo32(pmu_va >> 8),
5528                            .mailbox.clr = ~0,
5529                            .method.data = 1,
5530                            .method.addr = gr_fecs_method_push_adr_set_reglist_virtual_address_v(),
5531                            .mailbox.ret = NULL,
5532                            .cond.ok = GR_IS_UCODE_OP_EQUAL,
5533                            .mailbox.ok = 1,
5534                            .cond.fail = GR_IS_UCODE_OP_SKIP,
5535                            .mailbox.fail = 0});
5536 }
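/*
 * The three reglist helpers above form a natural sequence: query the
 * reglist image size, bind the instance block that holds the save
 * buffer, then hand the buffer's PMU virtual address to FECS. A minimal
 * sketch of that sequence, where inst_block_pa and pmu_va stand for the
 * caller's own instance-block physical address and mapped VA (both are
 * placeholders, not names from this driver):
 *
 *        u32 size = 0;
 *        int err = gr_gk20a_fecs_get_reglist_img_size(g, &size);
 *        if (err)
 *                return err;
 *
 *        ... allocate and map a buffer of 'size' bytes for the PMU ...
 *
 *        err = gr_gk20a_fecs_set_reglist_bind_inst(g, inst_block_pa);
 *        if (!err)
 *                err = gr_gk20a_fecs_set_reglist_virtual_addr(g, pmu_va);
 */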
5537
5538 int gk20a_gr_suspend(struct gk20a *g)
5539 {
5540         unsigned long end_jiffies = jiffies +
5541                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
5542         u32 ret = 0;
5543
5544         gk20a_dbg_fn("");
5545
5546         ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
5547         if (ret)
5548                 return ret;
5549
5550         gk20a_writel(g, gr_gpfifo_ctl_r(),
5551                 gr_gpfifo_ctl_access_disabled_f());
5552
5553         /* disable gr intr */
5554         gk20a_writel(g, gr_intr_r(), 0);
5555         gk20a_writel(g, gr_intr_en_r(), 0);
5556
5557         /* disable all exceptions */
5558         gk20a_writel(g, gr_exception_r(), 0);
5559         gk20a_writel(g, gr_exception_en_r(), 0);
5560         gk20a_writel(g, gr_exception1_r(), 0);
5561         gk20a_writel(g, gr_exception1_en_r(), 0);
5562         gk20a_writel(g, gr_exception2_r(), 0);
5563         gk20a_writel(g, gr_exception2_en_r(), 0);
5564
5565         gk20a_gr_flush_channel_tlb(&g->gr);
5566
5567         g->gr.initialized = false;
5568
5569         gk20a_dbg_fn("done");
5570         return ret;
5571 }
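/*
 * Summary of the suspend path above: wait for GR to go idle, disable
 * gpfifo access, mask the stalling interrupt and all exception enables,
 * then flush the channel TLB so stale curr_ctx-to-channel mappings are
 * not reused when GR is brought back up.
 */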
5572
5573 static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
5574                                                u32 addr,
5575                                                bool is_quad, u32 quad,
5576                                                u32 *context_buffer,
5577                                                u32 context_buffer_size,
5578                                                u32 *priv_offset);
5579
5580 /* This function will decode a priv address and return the partition type and numbers. */
5581 int gr_gk20a_decode_priv_addr(struct gk20a *g, u32 addr,
5582                               int  *addr_type, /* enum ctxsw_addr_type */
5583                               u32 *gpc_num, u32 *tpc_num, u32 *ppc_num, u32 *be_num,
5584                               u32 *broadcast_flags)
5585 {
5586         u32 gpc_addr;
5587         u32 ppc_address;
5588         u32 ppc_broadcast_addr;
5589
5590         gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
5591
5592         /* setup defaults */
5593         ppc_address = 0;
5594         ppc_broadcast_addr = 0;
5595         *addr_type = CTXSW_ADDR_TYPE_SYS;
5596         *broadcast_flags = PRI_BROADCAST_FLAGS_NONE;
5597         *gpc_num = 0;
5598         *tpc_num = 0;
5599         *ppc_num = 0;
5600         *be_num  = 0;
5601
5602         if (pri_is_gpc_addr(addr)) {
5603                 *addr_type = CTXSW_ADDR_TYPE_GPC;
5604                 gpc_addr = pri_gpccs_addr_mask(addr);
5605                 if (pri_is_gpc_addr_shared(addr)) {
5606                         *addr_type = CTXSW_ADDR_TYPE_GPC;
5607                         *broadcast_flags |= PRI_BROADCAST_FLAGS_GPC;
5608                 } else
5609                         *gpc_num = pri_get_gpc_num(addr);
5610
5611                 if (pri_is_tpc_addr(gpc_addr)) {
5612                         *addr_type = CTXSW_ADDR_TYPE_TPC;
5613                         if (pri_is_tpc_addr_shared(gpc_addr)) {
5614                                 *broadcast_flags |= PRI_BROADCAST_FLAGS_TPC;
5615                                 return 0;
5616                         }
5617                         *tpc_num = pri_get_tpc_num(gpc_addr);
5618                 }
5619                 return 0;
5620         } else if (pri_is_be_addr(addr)) {
5621                 *addr_type = CTXSW_ADDR_TYPE_BE;
5622                 if (pri_is_be_addr_shared(addr)) {
5623                         *broadcast_flags |= PRI_BROADCAST_FLAGS_BE;
5624                         return 0;
5625                 }
5626                 *be_num = pri_get_be_num(addr);
5627                 return 0;
5628         } else {
5629                 *addr_type = CTXSW_ADDR_TYPE_SYS;
5630                 return 0;
5631         }
5632         /* note: PPC addresses are not decoded by this function */
5633
5634         /*NOTREACHED*/
5635         return -EINVAL;
5636 }
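/*
 * Illustrative use of the decoder above: a gr_gpcs_tpcs_* (GPC- and
 * TPC-broadcast) register offset comes back as CTXSW_ADDR_TYPE_TPC with
 * both PRI_BROADCAST_FLAGS_GPC and PRI_BROADCAST_FLAGS_TPC set, and the
 * gpc/tpc numbers left at their zero defaults:
 *
 *        int addr_type;
 *        u32 gpc_num, tpc_num, ppc_num, be_num, flags;
 *        int err = gr_gk20a_decode_priv_addr(g, addr, &addr_type,
 *                                            &gpc_num, &tpc_num, &ppc_num,
 *                                            &be_num, &flags);
 */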
5637
5638 static int gr_gk20a_split_ppc_broadcast_addr(struct gk20a *g, u32 addr,
5639                                       u32 gpc_num,
5640                                       u32 *priv_addr_table, u32 *t)
5641 {
5642         u32 ppc_num;
5643
5644         gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
5645
5646         for (ppc_num = 0; ppc_num < g->gr.pe_count_per_gpc; ppc_num++)
5647                 priv_addr_table[(*t)++] = pri_ppc_addr(pri_ppccs_addr_mask(addr),
5648                                                        gpc_num, ppc_num);
5649
5650         return 0;
5651 }
5652
5653 /*
5654  * The context buffer is indexed using BE broadcast addresses and GPC/TPC
5655  * unicast addresses. This function will convert a BE unicast address to a BE
5656  * broadcast address and split a GPC/TPC broadcast address into a table of
5657  * GPC/TPC addresses.  The addresses generated by this function can be
5658  * successfully processed by gr_gk20a_find_priv_offset_in_buffer
5659  */
5660 static int gr_gk20a_create_priv_addr_table(struct gk20a *g,
5661                                            u32 addr,
5662                                            u32 *priv_addr_table,
5663                                            u32 *num_registers)
5664 {
5665         int addr_type; /*enum ctxsw_addr_type */
5666         u32 gpc_num, tpc_num, ppc_num, be_num;
5667         u32 broadcast_flags;
5668         u32 t;
5669         int err;
5670
5671         t = 0;
5672         *num_registers = 0;
5673
5674         gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
5675
5676         err = gr_gk20a_decode_priv_addr(g, addr, &addr_type,
5677                                         &gpc_num, &tpc_num, &ppc_num, &be_num,
5678                                         &broadcast_flags);
5679         gk20a_dbg(gpu_dbg_gpu_dbg, "addr_type = %d", addr_type);
5680         if (err)
5681                 return err;
5682
5683         if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
5684             (addr_type == CTXSW_ADDR_TYPE_BE)) {
5685                 /* The BE broadcast registers are included in the compressed PRI
5686                  * table. Convert a BE unicast address to a broadcast address
5687                  * so that we can look up the offset. */
5688                 if ((addr_type == CTXSW_ADDR_TYPE_BE) &&
5689                     !(broadcast_flags & PRI_BROADCAST_FLAGS_BE))
5690                         priv_addr_table[t++] = pri_be_shared_addr(addr);
5691                 else
5692                         priv_addr_table[t++] = addr;
5693
5694                 *num_registers = t;
5695                 return 0;
5696         }
5697
5698         /* The GPC/TPC unicast registers are included in the compressed PRI
5699          * tables. Convert a GPC/TPC broadcast address to unicast addresses so
5700          * that we can look up the offsets. */
5701         if (broadcast_flags & PRI_BROADCAST_FLAGS_GPC) {
5702                 for (gpc_num = 0; gpc_num < g->gr.gpc_count; gpc_num++) {
5703
5704                         if (broadcast_flags & PRI_BROADCAST_FLAGS_TPC)
5705                                 for (tpc_num = 0;
5706                                      tpc_num < g->gr.gpc_tpc_count[gpc_num];
5707                                      tpc_num++)
5708                                         priv_addr_table[t++] =
5709                                                 pri_tpc_addr(pri_tpccs_addr_mask(addr),
5710                                                              gpc_num, tpc_num);
5711
5712                         else if (broadcast_flags & PRI_BROADCAST_FLAGS_PPC) {
5713                                 err = gr_gk20a_split_ppc_broadcast_addr(g, addr, gpc_num,
5714                                                                priv_addr_table, &t);
5715                                 if (err)
5716                                         return err;
5717                         } else
5718                                 priv_addr_table[t++] =
5719                                         pri_gpc_addr(pri_gpccs_addr_mask(addr),
5720                                                      gpc_num);
5721                 }
5722         } else {
5723                 if (broadcast_flags & PRI_BROADCAST_FLAGS_TPC)
5724                         for (tpc_num = 0;
5725                              tpc_num < g->gr.gpc_tpc_count[gpc_num];
5726                              tpc_num++)
5727                                 priv_addr_table[t++] =
5728                                         pri_tpc_addr(pri_tpccs_addr_mask(addr),
5729                                                      gpc_num, tpc_num);
5730                 else if (broadcast_flags & PRI_BROADCAST_FLAGS_PPC)
5731                         err = gr_gk20a_split_ppc_broadcast_addr(g, addr, gpc_num,
5732                                                        priv_addr_table, &t);
5733                 else
5734                         priv_addr_table[t++] = addr;
5735         }
5736
5737         *num_registers = t;
5738         return 0;
5739 }
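/*
 * Example of the expansion performed above: with both the GPC and TPC
 * broadcast flags set, the table receives one unicast entry per
 * (gpc, tpc) pair, i.e. gpc_count * gpc_tpc_count[gpc] entries of the
 * form pri_tpc_addr(pri_tpccs_addr_mask(addr), gpc_num, tpc_num), so
 * callers must size priv_addr_table for that worst case.
 */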
5740
5741 int gr_gk20a_get_ctx_buffer_offsets(struct gk20a *g,
5742                                     u32 addr,
5743                                     u32 max_offsets,
5744                                     u32 *offsets, u32 *offset_addrs,
5745                                     u32 *num_offsets,
5746                                     bool is_quad, u32 quad)
5747 {
5748         u32 i;
5749         u32 priv_offset = 0;
5750         u32 *priv_registers;
5751         u32 num_registers = 0;
5752         int err = 0;
5753         u32 potential_offsets = proj_scal_litter_num_gpcs_v() *
5754                 proj_scal_litter_num_tpc_per_gpc_v();
5755