gk20a: Moved bind fecs to init_gr_support
[linux-3.10.git] drivers/gpu/nvgpu/gk20a/gr_gk20a.c
1 /*
2  * GK20A Graphics
3  *
4  * Copyright (c) 2011-2015, NVIDIA CORPORATION.  All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program; if not, write to the Free Software Foundation, Inc.,
17  * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
18  */
19
20 #include <linux/delay.h>        /* for udelay */
21 #include <linux/mm.h>           /* for totalram_pages */
22 #include <linux/scatterlist.h>
23 #include <linux/tegra-soc.h>
24 #include <linux/nvhost_dbg_gpu_ioctl.h>
25 #include <linux/vmalloc.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/firmware.h>
28 #include <linux/nvhost.h>
29
30 #include "gk20a.h"
31 #include "kind_gk20a.h"
32 #include "gr_ctx_gk20a.h"
33
34 #include "hw_ccsr_gk20a.h"
35 #include "hw_ctxsw_prog_gk20a.h"
36 #include "hw_fifo_gk20a.h"
37 #include "hw_gr_gk20a.h"
38 #include "hw_gmmu_gk20a.h"
39 #include "hw_mc_gk20a.h"
40 #include "hw_ram_gk20a.h"
41 #include "hw_pri_ringmaster_gk20a.h"
42 #include "hw_pri_ringstation_sys_gk20a.h"
43 #include "hw_pri_ringstation_gpc_gk20a.h"
44 #include "hw_pri_ringstation_fbp_gk20a.h"
45 #include "hw_proj_gk20a.h"
46 #include "hw_top_gk20a.h"
47 #include "hw_ltc_gk20a.h"
48 #include "hw_fb_gk20a.h"
49 #include "hw_therm_gk20a.h"
50 #include "hw_pbdma_gk20a.h"
51 #include "gr_pri_gk20a.h"
52 #include "regops_gk20a.h"
53 #include "dbg_gpu_gk20a.h"
54
55 #define BLK_SIZE (256)
56
57 static int gk20a_init_gr_bind_fecs_elpg(struct gk20a *g);
58 static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va);
59
60 /* global ctx buffer */
61 static int  gr_gk20a_alloc_global_ctx_buffers(struct gk20a *g);
62 static void gr_gk20a_free_global_ctx_buffers(struct gk20a *g);
63 static int  gr_gk20a_map_global_ctx_buffers(struct gk20a *g,
64                                             struct channel_gk20a *c);
65 static void gr_gk20a_unmap_global_ctx_buffers(struct channel_gk20a *c);
66
67 /* channel gr ctx buffer */
68 static int  gr_gk20a_alloc_channel_gr_ctx(struct gk20a *g,
69                                         struct channel_gk20a *c);
70 static void gr_gk20a_free_channel_gr_ctx(struct channel_gk20a *c);
71
72 /* channel patch ctx buffer */
73 static int  gr_gk20a_alloc_channel_patch_ctx(struct gk20a *g,
74                                         struct channel_gk20a *c);
75 static void gr_gk20a_free_channel_patch_ctx(struct channel_gk20a *c);
76
77 /* golden ctx image */
78 static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
79                                           struct channel_gk20a *c);
80 static int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
81                                           struct channel_gk20a *c);
82
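/*
 * Debug dump of FECS falcon state: core status/control registers, every
 * ctxsw mailbox, and a set of internal falcon registers read back through
 * the ICD interface.
 */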
83 void gk20a_fecs_dump_falcon_stats(struct gk20a *g)
84 {
85         int i;
86
87         gk20a_err(dev_from_gk20a(g), "gr_fecs_os_r : %d",
88                 gk20a_readl(g, gr_fecs_os_r()));
89         gk20a_err(dev_from_gk20a(g), "gr_fecs_cpuctl_r : 0x%x",
90                 gk20a_readl(g, gr_fecs_cpuctl_r()));
91         gk20a_err(dev_from_gk20a(g), "gr_fecs_idlestate_r : 0x%x",
92                 gk20a_readl(g, gr_fecs_idlestate_r()));
93         gk20a_err(dev_from_gk20a(g), "gr_fecs_mailbox0_r : 0x%x",
94                 gk20a_readl(g, gr_fecs_mailbox0_r()));
95         gk20a_err(dev_from_gk20a(g), "gr_fecs_mailbox1_r : 0x%x",
96                 gk20a_readl(g, gr_fecs_mailbox1_r()));
97         gk20a_err(dev_from_gk20a(g), "gr_fecs_irqstat_r : 0x%x",
98                 gk20a_readl(g, gr_fecs_irqstat_r()));
99         gk20a_err(dev_from_gk20a(g), "gr_fecs_irqmode_r : 0x%x",
100                 gk20a_readl(g, gr_fecs_irqmode_r()));
101         gk20a_err(dev_from_gk20a(g), "gr_fecs_irqmask_r : 0x%x",
102                 gk20a_readl(g, gr_fecs_irqmask_r()));
103         gk20a_err(dev_from_gk20a(g), "gr_fecs_irqdest_r : 0x%x",
104                 gk20a_readl(g, gr_fecs_irqdest_r()));
105         gk20a_err(dev_from_gk20a(g), "gr_fecs_debug1_r : 0x%x",
106                 gk20a_readl(g, gr_fecs_debug1_r()));
107         gk20a_err(dev_from_gk20a(g), "gr_fecs_debuginfo_r : 0x%x",
108                 gk20a_readl(g, gr_fecs_debuginfo_r()));
109
110         for (i = 0; i < gr_fecs_ctxsw_mailbox__size_1_v(); i++)
111                 gk20a_err(dev_from_gk20a(g), "gr_fecs_ctxsw_mailbox_r(%d) : 0x%x",
112                         i, gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(i)));
113
114         gk20a_err(dev_from_gk20a(g), "gr_fecs_engctl_r : 0x%x",
115                 gk20a_readl(g, gr_fecs_engctl_r()));
116         gk20a_err(dev_from_gk20a(g), "gr_fecs_curctx_r : 0x%x",
117                 gk20a_readl(g, gr_fecs_curctx_r()));
118         gk20a_err(dev_from_gk20a(g), "gr_fecs_nxtctx_r : 0x%x",
119                 gk20a_readl(g, gr_fecs_nxtctx_r()));
120
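        /*
         * Indirect falcon register reads: write an rreg opcode and the
         * register index to ICD_CMD, then read the value from ICD_RDATA.
         */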
121         gk20a_writel(g, gr_fecs_icd_cmd_r(),
122                 gr_fecs_icd_cmd_opc_rreg_f() |
123                 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_IMB));
124         gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_IMB : 0x%x",
125                 gk20a_readl(g, gr_fecs_icd_rdata_r()));
126
127         gk20a_writel(g, gr_fecs_icd_cmd_r(),
128                 gr_fecs_icd_cmd_opc_rreg_f() |
129                 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_DMB));
130         gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_DMB : 0x%x",
131                 gk20a_readl(g, gr_fecs_icd_rdata_r()));
132
133         gk20a_writel(g, gr_fecs_icd_cmd_r(),
134                 gr_fecs_icd_cmd_opc_rreg_f() |
135                 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_CSW));
136         gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_CSW : 0x%x",
137                 gk20a_readl(g, gr_fecs_icd_rdata_r()));
138
139         gk20a_writel(g, gr_fecs_icd_cmd_r(),
140                 gr_fecs_icd_cmd_opc_rreg_f() |
141                 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_CTX));
142         gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_CTX : 0x%x",
143                 gk20a_readl(g, gr_fecs_icd_rdata_r()));
144
145         gk20a_writel(g, gr_fecs_icd_cmd_r(),
146                 gr_fecs_icd_cmd_opc_rreg_f() |
147                 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_EXCI));
148         gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_EXCI : 0x%x",
149                 gk20a_readl(g, gr_fecs_icd_rdata_r()));
150
151         for (i = 0; i < 4; i++) {
152                 gk20a_writel(g, gr_fecs_icd_cmd_r(),
153                         gr_fecs_icd_cmd_opc_rreg_f() |
154                         gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_PC));
155                 gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_PC : 0x%x",
156                         gk20a_readl(g, gr_fecs_icd_rdata_r()));
157
158                 gk20a_writel(g, gr_fecs_icd_cmd_r(),
159                         gr_fecs_icd_cmd_opc_rreg_f() |
160                         gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_SP));
161                 gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_SP : 0x%x",
162                         gk20a_readl(g, gr_fecs_icd_rdata_r()));
163         }
164 }
165
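/*
 * Load the FECS and GPCCS ucode data segments into falcon DMEM through the
 * auto-incrementing DMEMC/DMEMD port. A running checksum is computed here
 * but not verified.
 */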
166 static void gr_gk20a_load_falcon_dmem(struct gk20a *g)
167 {
168         u32 i, ucode_u32_size;
169         const u32 *ucode_u32_data;
170         u32 checksum;
171
172         gk20a_dbg_fn("");
173
174         gk20a_writel(g, gr_gpccs_dmemc_r(0), (gr_gpccs_dmemc_offs_f(0) |
175                                               gr_gpccs_dmemc_blk_f(0)  |
176                                               gr_gpccs_dmemc_aincw_f(1)));
177
178         ucode_u32_size = g->gr.ctx_vars.ucode.gpccs.data.count;
179         ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.gpccs.data.l;
180
181         for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
182                 gk20a_writel(g, gr_gpccs_dmemd_r(0), ucode_u32_data[i]);
183                 checksum += ucode_u32_data[i];
184         }
185
186         gk20a_writel(g, gr_fecs_dmemc_r(0), (gr_fecs_dmemc_offs_f(0) |
187                                              gr_fecs_dmemc_blk_f(0)  |
188                                              gr_fecs_dmemc_aincw_f(1)));
189
190         ucode_u32_size = g->gr.ctx_vars.ucode.fecs.data.count;
191         ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.fecs.data.l;
192
193         for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
194                 gk20a_writel(g, gr_fecs_dmemd_r(0), ucode_u32_data[i]);
195                 checksum += ucode_u32_data[i];
196         }
197         gk20a_dbg_fn("done");
198 }
199
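/*
 * Load the FECS and GPCCS ucode instruction segments into falcon IMEM.
 * A new IMEM tag is written every 256 bytes, and the final block is
 * zero-padded out past the next 256-byte boundary.
 */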
200 static void gr_gk20a_load_falcon_imem(struct gk20a *g)
201 {
202         u32 cfg, fecs_imem_size, gpccs_imem_size, ucode_u32_size;
203         const u32 *ucode_u32_data;
204         u32 tag, i, pad_start, pad_end;
205         u32 checksum;
206
207         gk20a_dbg_fn("");
208
209         cfg = gk20a_readl(g, gr_fecs_cfg_r());
210         fecs_imem_size = gr_fecs_cfg_imem_sz_v(cfg);
211
212         cfg = gk20a_readl(g, gr_gpc0_cfg_r());
213         gpccs_imem_size = gr_gpc0_cfg_imem_sz_v(cfg);
214
215         /* Use the broadcast address to access all of the GPCCS units. */
216         gk20a_writel(g, gr_gpccs_imemc_r(0), (gr_gpccs_imemc_offs_f(0) |
217                                               gr_gpccs_imemc_blk_f(0) |
218                                               gr_gpccs_imemc_aincw_f(1)));
219
220         /* Setup the tags for the instruction memory. */
221         tag = 0;
222         gk20a_writel(g, gr_gpccs_imemt_r(0), gr_gpccs_imemt_tag_f(tag));
223
224         ucode_u32_size = g->gr.ctx_vars.ucode.gpccs.inst.count;
225         ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.gpccs.inst.l;
226
227         for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
228                 if (i && ((i % (256/sizeof(u32))) == 0)) {
229                         tag++;
230                         gk20a_writel(g, gr_gpccs_imemt_r(0),
231                                       gr_gpccs_imemt_tag_f(tag));
232                 }
233                 gk20a_writel(g, gr_gpccs_imemd_r(0), ucode_u32_data[i]);
234                 checksum += ucode_u32_data[i];
235         }
236
237         pad_start = i*4;
238         pad_end = pad_start+(256-pad_start%256)+256;
239         for (i = pad_start;
240              (i < gpccs_imem_size * 256) && (i < pad_end);
241              i += 4) {
242                 if (i && ((i % 256) == 0)) {
243                         tag++;
244                         gk20a_writel(g, gr_gpccs_imemt_r(0),
245                                       gr_gpccs_imemt_tag_f(tag));
246                 }
247                 gk20a_writel(g, gr_gpccs_imemd_r(0), 0);
248         }
249
250         gk20a_writel(g, gr_fecs_imemc_r(0), (gr_fecs_imemc_offs_f(0) |
251                                              gr_fecs_imemc_blk_f(0) |
252                                              gr_fecs_imemc_aincw_f(1)));
253
254         /* Setup the tags for the instruction memory. */
255         tag = 0;
256         gk20a_writel(g, gr_fecs_imemt_r(0), gr_fecs_imemt_tag_f(tag));
257
258         ucode_u32_size = g->gr.ctx_vars.ucode.fecs.inst.count;
259         ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.fecs.inst.l;
260
261         for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
262                 if (i && ((i % (256/sizeof(u32))) == 0)) {
263                         tag++;
264                         gk20a_writel(g, gr_fecs_imemt_r(0),
265                                       gr_fecs_imemt_tag_f(tag));
266                 }
267                 gk20a_writel(g, gr_fecs_imemd_r(0), ucode_u32_data[i]);
268                 checksum += ucode_u32_data[i];
269         }
270
271         pad_start = i*4;
272         pad_end = pad_start+(256-pad_start%256)+256;
273         for (i = pad_start; (i < fecs_imem_size * 256) && i < pad_end; i += 4) {
274                 if (i && ((i % 256) == 0)) {
275                         tag++;
276                         gk20a_writel(g, gr_fecs_imemt_r(0),
277                                       gr_fecs_imemt_tag_f(tag));
278                 }
279                 gk20a_writel(g, gr_fecs_imemd_r(0), 0);
280         }
281 }
282
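/*
 * Poll until the GR engine is idle: PGRAPH disabled, or no context switch
 * in progress and the engine status not busy. The poll interval backs off
 * exponentially up to GR_IDLE_CHECK_MAX; -EAGAIN is returned on timeout.
 */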
283 static int gr_gk20a_wait_idle(struct gk20a *g, unsigned long end_jiffies,
284                 u32 expect_delay)
285 {
286         u32 delay = expect_delay;
287         bool gr_enabled;
288         bool ctxsw_active;
289         bool gr_busy;
290
291         gk20a_dbg_fn("");
292
293         do {
294                 /* fmodel: host gets fifo_engine_status(gr) from gr
295                    only when gr_status is read */
296                 gk20a_readl(g, gr_status_r());
297
298                 gr_enabled = gk20a_readl(g, mc_enable_r()) &
299                         mc_enable_pgraph_enabled_f();
300
301                 ctxsw_active = gk20a_readl(g,
302                         fifo_engine_status_r(ENGINE_GR_GK20A)) &
303                         fifo_engine_status_ctxsw_in_progress_f();
304
305                 gr_busy = gk20a_readl(g, gr_engine_status_r()) &
306                         gr_engine_status_value_busy_f();
307
308                 if (!gr_enabled || (!gr_busy && !ctxsw_active)) {
309                         gk20a_dbg_fn("done");
310                         return 0;
311                 }
312
313                 usleep_range(delay, delay * 2);
314                 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
315
316         } while (time_before(jiffies, end_jiffies)
317                         || !tegra_platform_is_silicon());
318
319         gk20a_err(dev_from_gk20a(g),
320                 "timeout, ctxsw busy : %d, gr busy : %d",
321                 ctxsw_active, gr_busy);
322
323         return -EAGAIN;
324 }
325
326 static int gr_gk20a_wait_fe_idle(struct gk20a *g, unsigned long end_jiffies,
327                 u32 expect_delay)
328 {
329         u32 val;
330         u32 delay = expect_delay;
331
332         gk20a_dbg_fn("");
333
334         do {
335                 val = gk20a_readl(g, gr_status_r());
336
337                 if (!gr_status_fe_method_upper_v(val) &&
338                         !gr_status_fe_method_lower_v(val) &&
339                         !gr_status_fe_method_fe_gi_v(val)) {
340                         gk20a_dbg_fn("done");
341                         return 0;
342                 }
343
344                 usleep_range(delay, delay * 2);
345                 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
346         } while (time_before(jiffies, end_jiffies)
347                         || !tegra_platform_is_silicon());
348
349         gk20a_err(dev_from_gk20a(g),
350                 "timeout, fe busy : %x", val);
351
352         return -EAGAIN;
353 }
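
/*
 * Pulse the FECS ctxsw reset: force the clocks on (except on linsim), assert
 * either the caller's reset mask or the default sys/gpc/be context resets,
 * release the resets, then return the power mode to auto.
 */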
354 static int gr_gk20a_ctx_reset(struct gk20a *g, u32 rst_mask)
355 {
356         u32 delay = GR_IDLE_CHECK_DEFAULT;
357         unsigned long end_jiffies = jiffies +
358                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
359         u32 reg;
360
361         gk20a_dbg_fn("");
362
363         if (!tegra_platform_is_linsim()) {
364                 /* Force clocks on */
365                 gk20a_writel(g, gr_fe_pwr_mode_r(),
366                              gr_fe_pwr_mode_req_send_f() |
367                              gr_fe_pwr_mode_mode_force_on_f());
368
369                 /* Wait for the clocks to indicate that they are on */
370                 do {
371                         reg = gk20a_readl(g, gr_fe_pwr_mode_r());
372
373                         if (gr_fe_pwr_mode_req_v(reg) ==
374                                         gr_fe_pwr_mode_req_done_v())
375                                 break;
376
377                         usleep_range(delay, delay * 2);
378                         delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
379
380                 } while (time_before(jiffies, end_jiffies));
381
382                 if (!time_before(jiffies, end_jiffies)) {
383                         gk20a_err(dev_from_gk20a(g),
384                                    "failed to force the clocks on\n");
385                         WARN_ON(1);
386                 }
387         }
388         if (rst_mask) {
389                 gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(), rst_mask);
390         } else {
391                 gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(),
392                              gr_fecs_ctxsw_reset_ctl_sys_halt_disabled_f() |
393                              gr_fecs_ctxsw_reset_ctl_gpc_halt_disabled_f() |
394                              gr_fecs_ctxsw_reset_ctl_be_halt_disabled_f()  |
395                              gr_fecs_ctxsw_reset_ctl_sys_engine_reset_disabled_f() |
396                              gr_fecs_ctxsw_reset_ctl_gpc_engine_reset_disabled_f() |
397                              gr_fecs_ctxsw_reset_ctl_be_engine_reset_disabled_f()  |
398                              gr_fecs_ctxsw_reset_ctl_sys_context_reset_enabled_f() |
399                              gr_fecs_ctxsw_reset_ctl_gpc_context_reset_enabled_f() |
400                              gr_fecs_ctxsw_reset_ctl_be_context_reset_enabled_f());
401         }
402
403         /* we need to read the reset register *and* wait for a moment to ensure
404          * reset propagation */
405
406         gk20a_readl(g, gr_fecs_ctxsw_reset_ctl_r());
407         udelay(20);
408
409         gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(),
410                      gr_fecs_ctxsw_reset_ctl_sys_halt_disabled_f() |
411                      gr_fecs_ctxsw_reset_ctl_gpc_halt_disabled_f() |
412                      gr_fecs_ctxsw_reset_ctl_be_halt_disabled_f()  |
413                      gr_fecs_ctxsw_reset_ctl_sys_engine_reset_disabled_f() |
414                      gr_fecs_ctxsw_reset_ctl_gpc_engine_reset_disabled_f() |
415                      gr_fecs_ctxsw_reset_ctl_be_engine_reset_disabled_f()  |
416                      gr_fecs_ctxsw_reset_ctl_sys_context_reset_disabled_f() |
417                      gr_fecs_ctxsw_reset_ctl_gpc_context_reset_disabled_f() |
418                      gr_fecs_ctxsw_reset_ctl_be_context_reset_disabled_f());
419
420         /* again, read back the reset register and wait a moment for propagation */
421         gk20a_readl(g, gr_fecs_ctxsw_reset_ctl_r());
422         udelay(20);
423
424         if (!tegra_platform_is_linsim()) {
425                 /* Set power mode back to auto */
426                 gk20a_writel(g, gr_fe_pwr_mode_r(),
427                              gr_fe_pwr_mode_req_send_f() |
428                              gr_fe_pwr_mode_mode_auto_f());
429
430                 /* Wait for the request to complete */
431                 end_jiffies = jiffies +
432                         msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
433                 do {
434                         reg = gk20a_readl(g, gr_fe_pwr_mode_r());
435
436                         if (gr_fe_pwr_mode_req_v(reg) ==
437                                         gr_fe_pwr_mode_req_done_v())
438                                 break;
439
440                         usleep_range(delay, delay * 2);
441                         delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
442
443                 } while (time_before(jiffies, end_jiffies));
444
445                 if (!time_before(jiffies, end_jiffies))
446                         gk20a_warn(dev_from_gk20a(g),
447                                    "failed to set power mode to auto\n");
448         }
449
450         return 0;
451 }
452
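/*
 * Wait for a FECS ucode method to complete by polling the given ctxsw
 * mailbox. The mailbox value is checked against mailbox_ok and mailbox_fail
 * using the opc_success/opc_fail comparison opcodes (EQUAL, NOT_EQUAL, AND,
 * LESSER, LESSER_EQUAL or SKIP); falcon state is dumped on timeout or error.
 */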
453 static int gr_gk20a_ctx_wait_ucode(struct gk20a *g, u32 mailbox_id,
454                                    u32 *mailbox_ret, u32 opc_success,
455                                    u32 mailbox_ok, u32 opc_fail,
456                                    u32 mailbox_fail)
457 {
458         unsigned long end_jiffies = jiffies +
459                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
460         u32 delay = GR_IDLE_CHECK_DEFAULT;
461         u32 check = WAIT_UCODE_LOOP;
462         u32 reg;
463
464         gk20a_dbg_fn("");
465
466         while (check == WAIT_UCODE_LOOP) {
467                 if (!time_before(jiffies, end_jiffies) &&
468                                 tegra_platform_is_silicon())
469                         check = WAIT_UCODE_TIMEOUT;
470
471                 reg = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(mailbox_id));
472
473                 if (mailbox_ret)
474                         *mailbox_ret = reg;
475
476                 switch (opc_success) {
477                 case GR_IS_UCODE_OP_EQUAL:
478                         if (reg == mailbox_ok)
479                                 check = WAIT_UCODE_OK;
480                         break;
481                 case GR_IS_UCODE_OP_NOT_EQUAL:
482                         if (reg != mailbox_ok)
483                                 check = WAIT_UCODE_OK;
484                         break;
485                 case GR_IS_UCODE_OP_AND:
486                         if (reg & mailbox_ok)
487                                 check = WAIT_UCODE_OK;
488                         break;
489                 case GR_IS_UCODE_OP_LESSER:
490                         if (reg < mailbox_ok)
491                                 check = WAIT_UCODE_OK;
492                         break;
493                 case GR_IS_UCODE_OP_LESSER_EQUAL:
494                         if (reg <= mailbox_ok)
495                                 check = WAIT_UCODE_OK;
496                         break;
497                 case GR_IS_UCODE_OP_SKIP:
498                         /* do no success check */
499                         break;
500                 default:
501                         gk20a_err(dev_from_gk20a(g),
502                                    "invalid success opcode 0x%x", opc_success);
503
504                         check = WAIT_UCODE_ERROR;
505                         break;
506                 }
507
508                 switch (opc_fail) {
509                 case GR_IS_UCODE_OP_EQUAL:
510                         if (reg == mailbox_fail)
511                                 check = WAIT_UCODE_ERROR;
512                         break;
513                 case GR_IS_UCODE_OP_NOT_EQUAL:
514                         if (reg != mailbox_fail)
515                                 check = WAIT_UCODE_ERROR;
516                         break;
517                 case GR_IS_UCODE_OP_AND:
518                         if (reg & mailbox_fail)
519                                 check = WAIT_UCODE_ERROR;
520                         break;
521                 case GR_IS_UCODE_OP_LESSER:
522                         if (reg < mailbox_fail)
523                                 check = WAIT_UCODE_ERROR;
524                         break;
525                 case GR_IS_UCODE_OP_LESSER_EQUAL:
526                         if (reg <= mailbox_fail)
527                                 check = WAIT_UCODE_ERROR;
528                         break;
529                 case GR_IS_UCODE_OP_SKIP:
530                         /* do no check on fail */
531                         break;
532                 default:
533                         gk20a_err(dev_from_gk20a(g),
534                                    "invalid fail opcode 0x%x", opc_fail);
535                         check = WAIT_UCODE_ERROR;
536                         break;
537                 }
538
539                 usleep_range(delay, delay * 2);
540                 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
541         }
542
543         if (check == WAIT_UCODE_TIMEOUT) {
544                 gk20a_err(dev_from_gk20a(g),
545                            "timeout waiting on ucode response");
546                 gk20a_fecs_dump_falcon_stats(g);
547                 return -1;
548         } else if (check == WAIT_UCODE_ERROR) {
549                 gk20a_err(dev_from_gk20a(g),
550                            "ucode method failed on mailbox=%d value=0x%08x",
551                            mailbox_id, reg);
552                 gk20a_fecs_dump_falcon_stats(g);
553                 return -1;
554         }
555
556         gk20a_dbg_fn("done");
557         return 0;
558 }
559
560 /* The following is a less brittle way to call gr_gk20a_submit_fecs_method(...).
561  * We should replace most, if not all, fecs method calls with this instead. */
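/*
 * A fecs_method_op_gk20a bundles the method address/data written to the
 * FECS method registers with the mailbox to poll and the pass/fail
 * conditions handed to gr_gk20a_ctx_wait_ucode().
 */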
562 struct fecs_method_op_gk20a {
563         struct {
564                 u32 addr;
565                 u32 data;
566         } method;
567
568         struct {
569                 u32 id;
570                 u32 data;
571                 u32 clr;
572                 u32 *ret;
573                 u32 ok;
574                 u32 fail;
575         } mailbox;
576
577         struct {
578                 u32 ok;
579                 u32 fail;
580         } cond;
581
582 };
583
584 int gr_gk20a_submit_fecs_method_op(struct gk20a *g,
585                                    struct fecs_method_op_gk20a op)
586 {
587         struct gr_gk20a *gr = &g->gr;
588         int ret;
589
590         mutex_lock(&gr->fecs_mutex);
591
592         if (op.mailbox.id != 0)
593                 gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(op.mailbox.id),
594                              op.mailbox.data);
595
596         gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0),
597                 gr_fecs_ctxsw_mailbox_clear_value_f(op.mailbox.clr));
598
599         gk20a_writel(g, gr_fecs_method_data_r(), op.method.data);
600         gk20a_writel(g, gr_fecs_method_push_r(),
601                 gr_fecs_method_push_adr_f(op.method.addr));
602
603         /* op.mailbox.id == 4 cases require waiting for completion
604          * on mailbox 0 instead */
605         if (op.mailbox.id == 4)
606                 op.mailbox.id = 0;
607
608         ret = gr_gk20a_ctx_wait_ucode(g, op.mailbox.id, op.mailbox.ret,
609                                       op.cond.ok, op.mailbox.ok,
610                                       op.cond.fail, op.mailbox.fail);
611
612         mutex_unlock(&gr->fecs_mutex);
613
614         return ret;
615 }
616
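/*
 * Submit a simple FECS control method (e.g. start/stop ctxsw) and wait on
 * mailbox 1 for the ucode's pass/fail result.
 */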
617 int gr_gk20a_ctrl_ctxsw(struct gk20a *g, u32 fecs_method, u32 *ret)
618 {
619         return gr_gk20a_submit_fecs_method_op(g,
620               (struct fecs_method_op_gk20a) {
621                       .method.addr = fecs_method,
622                       .method.data = ~0,
623                       .mailbox = { .id   = 1, /*sideband?*/
624                                    .data = ~0, .clr = ~0, .ret = ret,
625                                    .ok   = gr_fecs_ctxsw_mailbox_value_pass_v(),
626                                    .fail = gr_fecs_ctxsw_mailbox_value_fail_v(), },
627                       .cond.ok = GR_IS_UCODE_OP_EQUAL,
628                       .cond.fail = GR_IS_UCODE_OP_EQUAL });
629 }
630
631 /* Stop processing (stall) context switches at FECS.
632  * The caller must hold the dbg_sessions_lock, else if multiple stop methods
633  * are sent to the ucode in sequence, it can get into an undefined state. */
634 int gr_gk20a_disable_ctxsw(struct gk20a *g)
635 {
636         gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
637         return gr_gk20a_ctrl_ctxsw(g, gr_fecs_method_push_adr_stop_ctxsw_v(), 0);
638 }
639
640 /* Start processing (continue) context switches at FECS */
641 int gr_gk20a_enable_ctxsw(struct gk20a *g)
642 {
643         gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
644         return gr_gk20a_ctrl_ctxsw(g, gr_fecs_method_push_adr_start_ctxsw_v(), 0);
645 }
646
647
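/*
 * Write the graphics context pointer (4KB-aligned, virtual target) into the
 * ram_in_gr_wfi_* words of the channel's instance block.
 */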
648 static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va)
649 {
650         u32 addr_lo;
651         u32 addr_hi;
652         void *inst_ptr = NULL;
653
654         gk20a_dbg_fn("");
655
656         inst_ptr = c->inst_block.cpuva;
657         if (!inst_ptr)
658                 return -ENOMEM;
659
660         addr_lo = u64_lo32(gpu_va) >> 12;
661         addr_hi = u64_hi32(gpu_va);
662
663         gk20a_mem_wr32(inst_ptr, ram_in_gr_wfi_target_w(),
664                  ram_in_gr_cs_wfi_f() | ram_in_gr_wfi_mode_virtual_f() |
665                  ram_in_gr_wfi_ptr_lo_f(addr_lo));
666
667         gk20a_mem_wr32(inst_ptr, ram_in_gr_wfi_ptr_hi_w(),
668                  ram_in_gr_wfi_ptr_hi_f(addr_hi));
669
670         return 0;
671 }
672
673 /*
674  * Context state can be written directly or "patched" at times.
675  * So that the code can be used in either situation it is written
676  * as a series of _ctx_patch_write(..., patch) statements.
677  * However, any necessary cpu map/unmap and gpu l2 invalidates
678  * should be minimized (to avoid doing them once per patch write).
679  * Before such a sequence, set up with "_ctx_patch_write_begin"
680  * and close with "_ctx_patch_write_end."
681  */
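/*
 * A minimal usage sketch (hypothetical addresses/values, for illustration
 * only):
 *
 *     err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
 *     if (err)
 *             return err;
 *     gr_gk20a_ctx_patch_write(g, ch_ctx, addr0, data0, true);
 *     gr_gk20a_ctx_patch_write(g, ch_ctx, addr1, data1, true);
 *     gr_gk20a_ctx_patch_write_end(g, ch_ctx);
 *
 * With patch == false the same call simply writes the register directly
 * via gk20a_writel() and no begin/end bracketing is needed.
 */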
682 int gr_gk20a_ctx_patch_write_begin(struct gk20a *g,
683                                           struct channel_ctx_gk20a *ch_ctx)
684 {
685         /* being defensive still... */
686         if (ch_ctx->patch_ctx.cpu_va) {
687                 gk20a_err(dev_from_gk20a(g), "nested ctx patch begin?");
688                 return -EBUSY;
689         }
690
691         ch_ctx->patch_ctx.cpu_va = vmap(ch_ctx->patch_ctx.pages,
692                         PAGE_ALIGN(ch_ctx->patch_ctx.size) >> PAGE_SHIFT,
693                         0, pgprot_dmacoherent(PAGE_KERNEL));
694
695         if (!ch_ctx->patch_ctx.cpu_va)
696                 return -ENOMEM;
697
698         return 0;
699 }
700
701 int gr_gk20a_ctx_patch_write_end(struct gk20a *g,
702                                         struct channel_ctx_gk20a *ch_ctx)
703 {
704         /* being defensive still... */
705         if (!ch_ctx->patch_ctx.cpu_va) {
706                 gk20a_err(dev_from_gk20a(g), "dangling ctx patch end?");
707                 return -EINVAL;
708         }
709
710         vunmap(ch_ctx->patch_ctx.cpu_va);
711         ch_ctx->patch_ctx.cpu_va = NULL;
712         return 0;
713 }
714
715 int gr_gk20a_ctx_patch_write(struct gk20a *g,
716                                     struct channel_ctx_gk20a *ch_ctx,
717                                     u32 addr, u32 data, bool patch)
718 {
719         u32 patch_slot = 0;
720         void *patch_ptr = NULL;
721         bool mapped_here = false;
722
723         BUG_ON(patch != 0 && ch_ctx == NULL);
724
725         if (patch) {
726                 if (!ch_ctx)
727                         return -EINVAL;
728                 /* we added an optimization prolog/epilog
729                  * to get rid of unnecessary maps and l2 invals,
730                  * but be defensive still... */
731                 if (!ch_ctx->patch_ctx.cpu_va) {
732                         int err;
733                         gk20a_err(dev_from_gk20a(g),
734                                    "per-write ctx patch begin?");
735                         /* yes, gr_gk20a_ctx_patch_smpc causes this one */
736                         err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
737                         if (err)
738                                 return err;
739                         mapped_here = true;
740                 } else
741                         mapped_here = false;
742
743                 patch_ptr = ch_ctx->patch_ctx.cpu_va;
744                 patch_slot = ch_ctx->patch_ctx.data_count * 2;
745
746                 gk20a_mem_wr32(patch_ptr, patch_slot++, addr);
747                 gk20a_mem_wr32(patch_ptr, patch_slot++, data);
748
749                 ch_ctx->patch_ctx.data_count++;
750
751                 if (mapped_here)
752                         gr_gk20a_ctx_patch_write_end(g, ch_ctx);
753
754         } else
755                 gk20a_writel(g, addr, data);
756
757         return 0;
758 }
759
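/*
 * Point FECS at this channel: submit the BIND_POINTER method with the
 * instance block base (vidmem target, valid bit set) and wait for the
 * ucode to ack in mailbox 0.
 */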
760 static int gr_gk20a_fecs_ctx_bind_channel(struct gk20a *g,
761                                         struct channel_gk20a *c)
762 {
763         u32 inst_base_ptr = u64_lo32(c->inst_block.cpu_pa
764                                      >> ram_in_base_shift_v());
765         u32 ret;
766
767         gk20a_dbg_info("bind channel %d inst ptr 0x%08x",
768                    c->hw_chid, inst_base_ptr);
769
770         ret = gr_gk20a_submit_fecs_method_op(g,
771                      (struct fecs_method_op_gk20a) {
772                      .method.addr = gr_fecs_method_push_adr_bind_pointer_v(),
773                      .method.data = (gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
774                                      gr_fecs_current_ctx_target_vid_mem_f() |
775                                      gr_fecs_current_ctx_valid_f(1)),
776                      .mailbox = { .id = 0, .data = 0,
777                                   .clr = 0x30,
778                                   .ret = NULL,
779                                   .ok = 0x10,
780                                   .fail = 0x20, },
781                      .cond.ok = GR_IS_UCODE_OP_AND,
782                      .cond.fail = GR_IS_UCODE_OP_AND});
783         if (ret)
784                 gk20a_err(dev_from_gk20a(g),
785                         "bind channel instance failed");
786
787         return ret;
788 }
789
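/*
 * Program the zcull mode and buffer pointer into the channel's context
 * image, optionally disabling GR engine activity while the image is
 * being modified.
 */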
790 static int gr_gk20a_ctx_zcull_setup(struct gk20a *g, struct channel_gk20a *c,
791                                     bool disable_fifo)
792 {
793         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
794         struct fifo_gk20a *f = &g->fifo;
795         struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
796         u32 va_lo, va_hi, va;
797         int ret = 0;
798         void *ctx_ptr = NULL;
799
800         gk20a_dbg_fn("");
801
802         ctx_ptr = vmap(ch_ctx->gr_ctx.pages,
803                         PAGE_ALIGN(ch_ctx->gr_ctx.size) >> PAGE_SHIFT,
804                         0, pgprot_dmacoherent(PAGE_KERNEL));
805         if (!ctx_ptr)
806                 return -ENOMEM;
807
808         if (ch_ctx->zcull_ctx.gpu_va == 0 &&
809             ch_ctx->zcull_ctx.ctx_sw_mode ==
810                 ctxsw_prog_main_image_zcull_mode_separate_buffer_v()) {
811                 ret = -EINVAL;
812                 goto clean_up;
813         }
814
815         va_lo = u64_lo32(ch_ctx->zcull_ctx.gpu_va);
816         va_hi = u64_hi32(ch_ctx->zcull_ctx.gpu_va);
817         va = ((va_lo >> 8) & 0x00FFFFFF) | ((va_hi << 24) & 0xFF000000);
818
819         if (disable_fifo) {
820                 ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
821                 if (ret) {
822                         gk20a_err(dev_from_gk20a(g),
823                                 "failed to disable gr engine activity\n");
824                         goto clean_up;
825                 }
826         }
827
828         gk20a_mm_fb_flush(g);
829
830         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_zcull_o(), 0,
831                  ch_ctx->zcull_ctx.ctx_sw_mode);
832
833         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_zcull_ptr_o(), 0, va);
834
835         if (disable_fifo) {
836                 ret = gk20a_fifo_enable_engine_activity(g, gr_info);
837                 if (ret) {
838                         gk20a_err(dev_from_gk20a(g),
839                                 "failed to enable gr engine activity\n");
840                         goto clean_up;
841                 }
842         }
843
844 clean_up:
845         vunmap(ctx_ptr);
846
847         return ret;
848 }
849
850 static int gr_gk20a_commit_global_cb_manager(struct gk20a *g,
851                         struct channel_gk20a *c, bool patch)
852 {
853         struct gr_gk20a *gr = &g->gr;
854         struct channel_ctx_gk20a *ch_ctx = NULL;
855         u32 attrib_offset_in_chunk = 0;
856         u32 alpha_offset_in_chunk = 0;
857         u32 pd_ab_max_output;
858         u32 gpc_index, ppc_index;
859         u32 temp;
860         u32 cbm_cfg_size1, cbm_cfg_size2;
861
862         gk20a_dbg_fn("");
863
864         if (patch) {
865                 int err;
866                 ch_ctx = &c->ch_ctx;
867                 err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
868                 if (err)
869                         return err;
870         }
871
872         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_tga_constraintlogic_r(),
873                 gr_ds_tga_constraintlogic_beta_cbsize_f(gr->attrib_cb_default_size) |
874                 gr_ds_tga_constraintlogic_alpha_cbsize_f(gr->alpha_cb_default_size),
875                 patch);
876
877         pd_ab_max_output = (gr->alpha_cb_default_size *
878                 gr_gpc0_ppc0_cbm_cfg_size_granularity_v()) /
879                 gr_pd_ab_dist_cfg1_max_output_granularity_v();
880
881         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg1_r(),
882                 gr_pd_ab_dist_cfg1_max_output_f(pd_ab_max_output) |
883                 gr_pd_ab_dist_cfg1_max_batches_init_f(), patch);
884
885         alpha_offset_in_chunk = attrib_offset_in_chunk +
886                 gr->tpc_count * gr->attrib_cb_size;
887
888         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
889                 temp = proj_gpc_stride_v() * gpc_index;
890                 for (ppc_index = 0; ppc_index < gr->gpc_ppc_count[gpc_index];
891                      ppc_index++) {
892                         cbm_cfg_size1 = gr->attrib_cb_default_size *
893                                 gr->pes_tpc_count[ppc_index][gpc_index];
894                         cbm_cfg_size2 = gr->alpha_cb_default_size *
895                                 gr->pes_tpc_count[ppc_index][gpc_index];
896
897                         gr_gk20a_ctx_patch_write(g, ch_ctx,
898                                 gr_gpc0_ppc0_cbm_cfg_r() + temp +
899                                 proj_ppc_in_gpc_stride_v() * ppc_index,
900                                 gr_gpc0_ppc0_cbm_cfg_timeslice_mode_f(gr->timeslice_mode) |
901                                 gr_gpc0_ppc0_cbm_cfg_start_offset_f(attrib_offset_in_chunk) |
902                                 gr_gpc0_ppc0_cbm_cfg_size_f(cbm_cfg_size1), patch);
903
904                         attrib_offset_in_chunk += gr->attrib_cb_size *
905                                 gr->pes_tpc_count[ppc_index][gpc_index];
906
907                         gr_gk20a_ctx_patch_write(g, ch_ctx,
908                                 gr_gpc0_ppc0_cbm_cfg2_r() + temp +
909                                 proj_ppc_in_gpc_stride_v() * ppc_index,
910                                 gr_gpc0_ppc0_cbm_cfg2_start_offset_f(alpha_offset_in_chunk) |
911                                 gr_gpc0_ppc0_cbm_cfg2_size_f(cbm_cfg_size2), patch);
912
913                         alpha_offset_in_chunk += gr->alpha_cb_size *
914                                 gr->pes_tpc_count[ppc_index][gpc_index];
915                 }
916         }
917
918         if (patch)
919                 gr_gk20a_ctx_patch_write_end(g, ch_ctx);
920
921         return 0;
922 }
923
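/*
 * Commit the per-channel GPU VAs of the global context buffers (pagepool,
 * bundle cb, attribute cb) into the context, as patch writes when patch
 * is set.
 */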
924 static int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g,
925                         struct channel_gk20a *c, bool patch)
926 {
927         struct gr_gk20a *gr = &g->gr;
928         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
929         u64 addr;
930         u32 size;
931
932         gk20a_dbg_fn("");
933         if (patch) {
934                 int err;
935                 err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
936                 if (err)
937                         return err;
938         }
939
940         /* global pagepool buffer */
941         addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) >>
942                 gr_scc_pagepool_base_addr_39_8_align_bits_v()) |
943                 (u64_hi32(ch_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) <<
944                  (32 - gr_scc_pagepool_base_addr_39_8_align_bits_v()));
945
946         size = gr->global_ctx_buffer[PAGEPOOL].size /
947                 gr_scc_pagepool_total_pages_byte_granularity_v();
948
949         if (size == gr_scc_pagepool_total_pages_hwmax_value_v())
950                 size = gr_scc_pagepool_total_pages_hwmax_v();
951
952         gk20a_dbg_info("pagepool buffer addr : 0x%016llx, size : %d",
953                 addr, size);
954
955         g->ops.gr.commit_global_pagepool(g, ch_ctx, addr, size, patch);
956
957         /* global bundle cb */
958         addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[CIRCULAR_VA]) >>
959                 gr_scc_bundle_cb_base_addr_39_8_align_bits_v()) |
960                 (u64_hi32(ch_ctx->global_ctx_buffer_va[CIRCULAR_VA]) <<
961                  (32 - gr_scc_bundle_cb_base_addr_39_8_align_bits_v()));
962
963         size = gr->bundle_cb_default_size;
964
965         gk20a_dbg_info("bundle cb addr : 0x%016llx, size : %d",
966                 addr, size);
967
968         g->ops.gr.commit_global_bundle_cb(g, ch_ctx, addr, size, patch);
969
970         /* global attrib cb */
971         addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[ATTRIBUTE_VA]) >>
972                 gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v()) |
973                 (u64_hi32(ch_ctx->global_ctx_buffer_va[ATTRIBUTE_VA]) <<
974                  (32 - gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v()));
975
976         gk20a_dbg_info("attrib cb addr : 0x%016llx", addr);
977         g->ops.gr.commit_global_attrib_cb(g, ch_ctx, addr, patch);
978
979         if (patch)
980                 gr_gk20a_ctx_patch_write_end(g, ch_ctx);
981
982         return 0;
983 }
984
985 static void gr_gk20a_commit_global_attrib_cb(struct gk20a *g,
986                                             struct channel_ctx_gk20a *ch_ctx,
987                                             u64 addr, bool patch)
988 {
989         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_attrib_cb_base_r(),
990                 gr_gpcs_setup_attrib_cb_base_addr_39_12_f(addr) |
991                 gr_gpcs_setup_attrib_cb_base_valid_true_f(), patch);
992
993         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pe_pin_cb_global_base_addr_r(),
994                 gr_gpcs_tpcs_pe_pin_cb_global_base_addr_v_f(addr) |
995                 gr_gpcs_tpcs_pe_pin_cb_global_base_addr_valid_true_f(), patch);
996 }
997
998 static void gr_gk20a_commit_global_bundle_cb(struct gk20a *g,
999                                             struct channel_ctx_gk20a *ch_ctx,
1000                                             u64 addr, u64 size, bool patch)
1001 {
1002         u32 data;
1003
1004         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_bundle_cb_base_r(),
1005                 gr_scc_bundle_cb_base_addr_39_8_f(addr), patch);
1006
1007         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_bundle_cb_size_r(),
1008                 gr_scc_bundle_cb_size_div_256b_f(size) |
1009                 gr_scc_bundle_cb_size_valid_true_f(), patch);
1010
1011         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_bundle_cb_base_r(),
1012                 gr_gpcs_setup_bundle_cb_base_addr_39_8_f(addr), patch);
1013
1014         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_bundle_cb_size_r(),
1015                 gr_gpcs_setup_bundle_cb_size_div_256b_f(size) |
1016                 gr_gpcs_setup_bundle_cb_size_valid_true_f(), patch);
1017
1018         /* data for state_limit */
1019         data = (g->gr.bundle_cb_default_size *
1020                 gr_scc_bundle_cb_size_div_256b_byte_granularity_v()) /
1021                 gr_pd_ab_dist_cfg2_state_limit_scc_bundle_granularity_v();
1022
1023         data = min_t(u32, data, g->gr.min_gpm_fifo_depth);
1024
1025         gk20a_dbg_info("bundle cb token limit : %d, state limit : %d",
1026                    g->gr.bundle_cb_token_limit, data);
1027
1028         gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg2_r(),
1029                 gr_pd_ab_dist_cfg2_token_limit_f(g->gr.bundle_cb_token_limit) |
1030                 gr_pd_ab_dist_cfg2_state_limit_f(data), patch);
1031
1032 }
1033
1034 static int gr_gk20a_commit_global_timeslice(struct gk20a *g, struct channel_gk20a *c, bool patch)
1035 {
1036         struct gr_gk20a *gr = &g->gr;
1037         struct channel_ctx_gk20a *ch_ctx = NULL;
1038         u32 gpm_pd_cfg;
1039         u32 pd_ab_dist_cfg0;
1040         u32 ds_debug;
1041         u32 mpc_vtg_debug;
1042         u32 pe_vaf;
1043         u32 pe_vsc_vpc;
1044
1045         gk20a_dbg_fn("");
1046
1047         gpm_pd_cfg = gk20a_readl(g, gr_gpcs_gpm_pd_cfg_r());
1048         pd_ab_dist_cfg0 = gk20a_readl(g, gr_pd_ab_dist_cfg0_r());
1049         ds_debug = gk20a_readl(g, gr_ds_debug_r());
1050         mpc_vtg_debug = gk20a_readl(g, gr_gpcs_tpcs_mpc_vtg_debug_r());
1051
1052         if (patch) {
1053                 int err;
1054                 ch_ctx = &c->ch_ctx;
1055                 err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
1056                 if (err)
1057                         return err;
1058         }
1059
1060         if (gr->timeslice_mode == gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v()) {
1061                 pe_vaf = gk20a_readl(g, gr_gpcs_tpcs_pe_vaf_r());
1062                 pe_vsc_vpc = gk20a_readl(g, gr_gpcs_tpcs_pes_vsc_vpc_r());
1063
1064                 gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_enable_f() | gpm_pd_cfg;
1065                 pe_vaf = gr_gpcs_tpcs_pe_vaf_fast_mode_switch_true_f() | pe_vaf;
1066                 pe_vsc_vpc = gr_gpcs_tpcs_pes_vsc_vpc_fast_mode_switch_true_f() | pe_vsc_vpc;
1067                 pd_ab_dist_cfg0 = gr_pd_ab_dist_cfg0_timeslice_enable_en_f() | pd_ab_dist_cfg0;
1068                 ds_debug = gr_ds_debug_timeslice_mode_enable_f() | ds_debug;
1069                 mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_enabled_f() | mpc_vtg_debug;
1070
1071                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, patch);
1072                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pe_vaf_r(), pe_vaf, patch);
1073                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pes_vsc_vpc_r(), pe_vsc_vpc, patch);
1074                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, patch);
1075                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_debug_r(), ds_debug, patch);
1076                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, patch);
1077         } else {
1078                 gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_disable_f() | gpm_pd_cfg;
1079                 pd_ab_dist_cfg0 = gr_pd_ab_dist_cfg0_timeslice_enable_dis_f() | pd_ab_dist_cfg0;
1080                 ds_debug = gr_ds_debug_timeslice_mode_disable_f() | ds_debug;
1081                 mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_disabled_f() | mpc_vtg_debug;
1082
1083                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, patch);
1084                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, patch);
1085                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_debug_r(), ds_debug, patch);
1086                 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, patch);
1087         }
1088
1089         if (patch)
1090                 gr_gk20a_ctx_patch_write_end(g, ch_ctx);
1091
1092         return 0;
1093 }
1094
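/*
 * Program the CRSTR/WWDX/RSTR2D screen-tile to GPC mapping tables from
 * gr->map_tiles, along with the normalization coefficients derived from
 * the TPC count.
 */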
1095 int gr_gk20a_setup_rop_mapping(struct gk20a *g, struct gr_gk20a *gr)
1096 {
1097         u32 norm_entries, norm_shift;
1098         u32 coeff5_mod, coeff6_mod, coeff7_mod, coeff8_mod, coeff9_mod, coeff10_mod, coeff11_mod;
1099         u32 map0, map1, map2, map3, map4, map5;
1100
1101         if (!gr->map_tiles)
1102                 return -1;
1103
1104         gk20a_dbg_fn("");
1105
1106         gk20a_writel(g, gr_crstr_map_table_cfg_r(),
1107                      gr_crstr_map_table_cfg_row_offset_f(gr->map_row_offset) |
1108                      gr_crstr_map_table_cfg_num_entries_f(gr->tpc_count));
1109
1110         map0 =  gr_crstr_gpc_map0_tile0_f(gr->map_tiles[0]) |
1111                 gr_crstr_gpc_map0_tile1_f(gr->map_tiles[1]) |
1112                 gr_crstr_gpc_map0_tile2_f(gr->map_tiles[2]) |
1113                 gr_crstr_gpc_map0_tile3_f(gr->map_tiles[3]) |
1114                 gr_crstr_gpc_map0_tile4_f(gr->map_tiles[4]) |
1115                 gr_crstr_gpc_map0_tile5_f(gr->map_tiles[5]);
1116
1117         map1 =  gr_crstr_gpc_map1_tile6_f(gr->map_tiles[6]) |
1118                 gr_crstr_gpc_map1_tile7_f(gr->map_tiles[7]) |
1119                 gr_crstr_gpc_map1_tile8_f(gr->map_tiles[8]) |
1120                 gr_crstr_gpc_map1_tile9_f(gr->map_tiles[9]) |
1121                 gr_crstr_gpc_map1_tile10_f(gr->map_tiles[10]) |
1122                 gr_crstr_gpc_map1_tile11_f(gr->map_tiles[11]);
1123
1124         map2 =  gr_crstr_gpc_map2_tile12_f(gr->map_tiles[12]) |
1125                 gr_crstr_gpc_map2_tile13_f(gr->map_tiles[13]) |
1126                 gr_crstr_gpc_map2_tile14_f(gr->map_tiles[14]) |
1127                 gr_crstr_gpc_map2_tile15_f(gr->map_tiles[15]) |
1128                 gr_crstr_gpc_map2_tile16_f(gr->map_tiles[16]) |
1129                 gr_crstr_gpc_map2_tile17_f(gr->map_tiles[17]);
1130
1131         map3 =  gr_crstr_gpc_map3_tile18_f(gr->map_tiles[18]) |
1132                 gr_crstr_gpc_map3_tile19_f(gr->map_tiles[19]) |
1133                 gr_crstr_gpc_map3_tile20_f(gr->map_tiles[20]) |
1134                 gr_crstr_gpc_map3_tile21_f(gr->map_tiles[21]) |
1135                 gr_crstr_gpc_map3_tile22_f(gr->map_tiles[22]) |
1136                 gr_crstr_gpc_map3_tile23_f(gr->map_tiles[23]);
1137
1138         map4 =  gr_crstr_gpc_map4_tile24_f(gr->map_tiles[24]) |
1139                 gr_crstr_gpc_map4_tile25_f(gr->map_tiles[25]) |
1140                 gr_crstr_gpc_map4_tile26_f(gr->map_tiles[26]) |
1141                 gr_crstr_gpc_map4_tile27_f(gr->map_tiles[27]) |
1142                 gr_crstr_gpc_map4_tile28_f(gr->map_tiles[28]) |
1143                 gr_crstr_gpc_map4_tile29_f(gr->map_tiles[29]);
1144
1145         map5 =  gr_crstr_gpc_map5_tile30_f(gr->map_tiles[30]) |
1146                 gr_crstr_gpc_map5_tile31_f(gr->map_tiles[31]) |
1147                 gr_crstr_gpc_map5_tile32_f(0) |
1148                 gr_crstr_gpc_map5_tile33_f(0) |
1149                 gr_crstr_gpc_map5_tile34_f(0) |
1150                 gr_crstr_gpc_map5_tile35_f(0);
1151
1152         gk20a_writel(g, gr_crstr_gpc_map0_r(), map0);
1153         gk20a_writel(g, gr_crstr_gpc_map1_r(), map1);
1154         gk20a_writel(g, gr_crstr_gpc_map2_r(), map2);
1155         gk20a_writel(g, gr_crstr_gpc_map3_r(), map3);
1156         gk20a_writel(g, gr_crstr_gpc_map4_r(), map4);
1157         gk20a_writel(g, gr_crstr_gpc_map5_r(), map5);
1158
1159         switch (gr->tpc_count) {
1160         case 1:
1161                 norm_shift = 4;
1162                 break;
1163         case 2:
1164         case 3:
1165                 norm_shift = 3;
1166                 break;
1167         case 4:
1168         case 5:
1169         case 6:
1170         case 7:
1171                 norm_shift = 2;
1172                 break;
1173         case 8:
1174         case 9:
1175         case 10:
1176         case 11:
1177         case 12:
1178         case 13:
1179         case 14:
1180         case 15:
1181                 norm_shift = 1;
1182                 break;
1183         default:
1184                 norm_shift = 0;
1185                 break;
1186         }
1187
1188         norm_entries = gr->tpc_count << norm_shift;
1189         coeff5_mod = (1 << 5) % norm_entries;
1190         coeff6_mod = (1 << 6) % norm_entries;
1191         coeff7_mod = (1 << 7) % norm_entries;
1192         coeff8_mod = (1 << 8) % norm_entries;
1193         coeff9_mod = (1 << 9) % norm_entries;
1194         coeff10_mod = (1 << 10) % norm_entries;
1195         coeff11_mod = (1 << 11) % norm_entries;
1196
1197         gk20a_writel(g, gr_ppcs_wwdx_map_table_cfg_r(),
1198                      gr_ppcs_wwdx_map_table_cfg_row_offset_f(gr->map_row_offset) |
1199                      gr_ppcs_wwdx_map_table_cfg_normalized_num_entries_f(norm_entries) |
1200                      gr_ppcs_wwdx_map_table_cfg_normalized_shift_value_f(norm_shift) |
1201                      gr_ppcs_wwdx_map_table_cfg_coeff5_mod_value_f(coeff5_mod) |
1202                      gr_ppcs_wwdx_map_table_cfg_num_entries_f(gr->tpc_count));
1203
1204         gk20a_writel(g, gr_ppcs_wwdx_map_table_cfg2_r(),
1205                      gr_ppcs_wwdx_map_table_cfg2_coeff6_mod_value_f(coeff6_mod) |
1206                      gr_ppcs_wwdx_map_table_cfg2_coeff7_mod_value_f(coeff7_mod) |
1207                      gr_ppcs_wwdx_map_table_cfg2_coeff8_mod_value_f(coeff8_mod) |
1208                      gr_ppcs_wwdx_map_table_cfg2_coeff9_mod_value_f(coeff9_mod) |
1209                      gr_ppcs_wwdx_map_table_cfg2_coeff10_mod_value_f(coeff10_mod) |
1210                      gr_ppcs_wwdx_map_table_cfg2_coeff11_mod_value_f(coeff11_mod));
1211
1212         gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map0_r(), map0);
1213         gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map1_r(), map1);
1214         gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map2_r(), map2);
1215         gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map3_r(), map3);
1216         gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map4_r(), map4);
1217         gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map5_r(), map5);
1218
1219         gk20a_writel(g, gr_rstr2d_map_table_cfg_r(),
1220                      gr_rstr2d_map_table_cfg_row_offset_f(gr->map_row_offset) |
1221                      gr_rstr2d_map_table_cfg_num_entries_f(gr->tpc_count));
1222
1223         gk20a_writel(g, gr_rstr2d_gpc_map0_r(), map0);
1224         gk20a_writel(g, gr_rstr2d_gpc_map1_r(), map1);
1225         gk20a_writel(g, gr_rstr2d_gpc_map2_r(), map2);
1226         gk20a_writel(g, gr_rstr2d_gpc_map3_r(), map3);
1227         gk20a_writel(g, gr_rstr2d_gpc_map4_r(), map4);
1228         gk20a_writel(g, gr_rstr2d_gpc_map5_r(), map5);
1229
1230         return 0;
1231 }
1232
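/* Population count: temp &= temp - 1 clears the lowest set bit each pass. */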
1233 static inline u32 count_bits(u32 mask)
1234 {
1235         u32 temp = mask;
1236         u32 count;
1237         for (count = 0; temp != 0; count++)
1238                 temp &= temp - 1;
1239
1240         return count;
1241 }
1242
1243 static inline u32 clear_count_bits(u32 num, u32 clear_count)
1244 {
1245         u32 count = clear_count;
1246         for (; (num != 0) && (count != 0); count--)
1247                 num &= num - 1;
1248
1249         return num;
1250 }
1251
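/*
 * Build the PD alpha/beta ratio tables: each of the 32 rows picks a target
 * split of TPCs between alpha and beta and derives per-GPC TPC masks from
 * the PES TPC masks, packed four GPCs per register.
 */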
1252 static int gr_gk20a_setup_alpha_beta_tables(struct gk20a *g,
1253                                         struct gr_gk20a *gr)
1254 {
1255         u32 table_index_bits = 5;
1256         u32 rows = (1 << table_index_bits);
1257         u32 row_stride = gr_pd_alpha_ratio_table__size_1_v() / rows;
1258
1259         u32 row;
1260         u32 index;
1261         u32 gpc_index;
1262         u32 gpcs_per_reg = 4;
1263         u32 pes_index;
1264         u32 tpc_count_pes;
1265         u32 num_pes_per_gpc = proj_scal_litter_num_pes_per_gpc_v();
1266
1267         u32 alpha_target, beta_target;
1268         u32 alpha_bits, beta_bits;
1269         u32 alpha_mask, beta_mask, partial_mask;
1270         u32 reg_offset;
1271         bool assign_alpha;
1272
1273         u32 map_alpha[gr_pd_alpha_ratio_table__size_1_v()];
1274         u32 map_beta[gr_pd_alpha_ratio_table__size_1_v()];
1275         u32 map_reg_used[gr_pd_alpha_ratio_table__size_1_v()];
1276
1277         gk20a_dbg_fn("");
1278
1279         memset(map_alpha, 0, gr_pd_alpha_ratio_table__size_1_v() * sizeof(u32));
1280         memset(map_beta, 0, gr_pd_alpha_ratio_table__size_1_v() * sizeof(u32));
1281         memset(map_reg_used, 0, gr_pd_alpha_ratio_table__size_1_v() * sizeof(u32));
1282
1283         for (row = 0; row < rows; ++row) {
1284                 alpha_target = max_t(u32, gr->tpc_count * row / rows, 1);
1285                 beta_target = gr->tpc_count - alpha_target;
1286
1287                 assign_alpha = (alpha_target < beta_target);
1288
1289                 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
1290                         reg_offset = (row * row_stride) + (gpc_index / gpcs_per_reg);
1291                         alpha_mask = beta_mask = 0;
1292
1293                         for (pes_index = 0; pes_index < num_pes_per_gpc; pes_index++) {
1294                                 tpc_count_pes = gr->pes_tpc_count[pes_index][gpc_index];
1295
1296                                 if (assign_alpha) {
1297                                         alpha_bits = (alpha_target == 0) ? 0 : tpc_count_pes;
1298                                         beta_bits = tpc_count_pes - alpha_bits;
1299                                 } else {
1300                                         beta_bits = (beta_target == 0) ? 0 : tpc_count_pes;
1301                                         alpha_bits = tpc_count_pes - beta_bits;
1302                                 }
1303
1304                                 partial_mask = gr->pes_tpc_mask[pes_index][gpc_index];
1305                                 partial_mask = clear_count_bits(partial_mask, tpc_count_pes - alpha_bits);
1306                                 alpha_mask |= partial_mask;
1307
1308                                 partial_mask = gr->pes_tpc_mask[pes_index][gpc_index] ^ partial_mask;
1309                                 beta_mask |= partial_mask;
1310
1311                                 alpha_target -= min(alpha_bits, alpha_target);
1312                                 beta_target -= min(beta_bits, beta_target);
1313
1314                                 if ((alpha_bits > 0) || (beta_bits > 0))
1315                                         assign_alpha = !assign_alpha;
1316                         }
1317
1318                         switch (gpc_index % gpcs_per_reg) {
1319                         case 0:
1320                                 map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n0_mask_f(alpha_mask);
1321                                 map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n0_mask_f(beta_mask);
1322                                 break;
1323                         case 1:
1324                                 map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n1_mask_f(alpha_mask);
1325                                 map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n1_mask_f(beta_mask);
1326                                 break;
1327                         case 2:
1328                                 map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n2_mask_f(alpha_mask);
1329                                 map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n2_mask_f(beta_mask);
1330                                 break;
1331                         case 3:
1332                                 map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n3_mask_f(alpha_mask);
1333                                 map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n3_mask_f(beta_mask);
1334                                 break;
1335                         }
1336                         map_reg_used[reg_offset] = true;
1337                 }
1338         }
1339
1340         for (index = 0; index < gr_pd_alpha_ratio_table__size_1_v(); index++) {
1341                 if (map_reg_used[index]) {
1342                         gk20a_writel(g, gr_pd_alpha_ratio_table_r(index), map_alpha[index]);
1343                         gk20a_writel(g, gr_pd_beta_ratio_table_r(index), map_beta[index]);
1344                 }
1345         }
1346
1347         return 0;
1348 }
1349
1350 static int gr_gk20a_ctx_state_floorsweep(struct gk20a *g)
1351 {
1352         struct gr_gk20a *gr = &g->gr;
1353         u32 tpc_index, gpc_index;
1354         u32 tpc_offset, gpc_offset;
1355         u32 sm_id = 0, gpc_id = 0;
1356         u32 sm_id_to_gpc_id[proj_scal_max_gpcs_v() * proj_scal_max_tpc_per_gpc_v()];
1357         u32 tpc_per_gpc;
1358         u32 max_ways_evict = INVALID_MAX_WAYS;
1359         u32 l1c_dbg_reg_val;
1360
1361         gk20a_dbg_fn("");
1362
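        /*
         * Assign SM IDs round-robin across GPCs: the outer loop walks TPC
         * indices and the inner loop walks GPCs, so consecutive sm_id
         * values land on different GPCs whenever possible.
         */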
1363         for (tpc_index = 0; tpc_index < gr->max_tpc_per_gpc_count; tpc_index++) {
1364                 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
1365                         gpc_offset = proj_gpc_stride_v() * gpc_index;
1366                         if (tpc_index < gr->gpc_tpc_count[gpc_index]) {
1367                                 tpc_offset = proj_tpc_in_gpc_stride_v() * tpc_index;
1368
1369                                 gk20a_writel(g, gr_gpc0_tpc0_sm_cfg_r() + gpc_offset + tpc_offset,
1370                                              gr_gpc0_tpc0_sm_cfg_sm_id_f(sm_id));
1371                                 gk20a_writel(g, gr_gpc0_tpc0_l1c_cfg_smid_r() + gpc_offset + tpc_offset,
1372                                              gr_gpc0_tpc0_l1c_cfg_smid_value_f(sm_id));
1373                                 gk20a_writel(g, gr_gpc0_gpm_pd_sm_id_r(tpc_index) + gpc_offset,
1374                                              gr_gpc0_gpm_pd_sm_id_id_f(sm_id));
1375                                 gk20a_writel(g, gr_gpc0_tpc0_pe_cfg_smid_r() + gpc_offset + tpc_offset,
1376                                              gr_gpc0_tpc0_pe_cfg_smid_value_f(sm_id));
1377
1378                                 sm_id_to_gpc_id[sm_id] = gpc_index;
1379                                 sm_id++;
1380                         }
1381
1382                         gk20a_writel(g, gr_gpc0_gpm_pd_active_tpcs_r() + gpc_offset,
1383                                      gr_gpc0_gpm_pd_active_tpcs_num_f(gr->gpc_tpc_count[gpc_index]));
1384                         gk20a_writel(g, gr_gpc0_gpm_sd_active_tpcs_r() + gpc_offset,
1385                                      gr_gpc0_gpm_sd_active_tpcs_num_f(gr->gpc_tpc_count[gpc_index]));
1386                 }
1387         }
1388
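        /*
         * Each gr_pd_num_tpc_per_gpc register packs the TPC counts of
         * eight consecutive GPCs (count0..count7), so gpc_id advances by
         * eight per register and wraps to zero once it runs past
         * gpc_count.
         */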
1389         for (tpc_index = 0, gpc_id = 0;
1390              tpc_index < gr_pd_num_tpc_per_gpc__size_1_v();
1391              tpc_index++, gpc_id += 8) {
1392
1393                 if (gpc_id >= gr->gpc_count)
1394                         gpc_id = 0;
1395
1396                 tpc_per_gpc =
1397                         gr_pd_num_tpc_per_gpc_count0_f(gr->gpc_tpc_count[gpc_id + 0]) |
1398                         gr_pd_num_tpc_per_gpc_count1_f(gr->gpc_tpc_count[gpc_id + 1]) |
1399                         gr_pd_num_tpc_per_gpc_count2_f(gr->gpc_tpc_count[gpc_id + 2]) |
1400                         gr_pd_num_tpc_per_gpc_count3_f(gr->gpc_tpc_count[gpc_id + 3]) |
1401                         gr_pd_num_tpc_per_gpc_count4_f(gr->gpc_tpc_count[gpc_id + 4]) |
1402                         gr_pd_num_tpc_per_gpc_count5_f(gr->gpc_tpc_count[gpc_id + 5]) |
1403                         gr_pd_num_tpc_per_gpc_count6_f(gr->gpc_tpc_count[gpc_id + 6]) |
1404                         gr_pd_num_tpc_per_gpc_count7_f(gr->gpc_tpc_count[gpc_id + 7]);
1405
1406                 gk20a_writel(g, gr_pd_num_tpc_per_gpc_r(tpc_index), tpc_per_gpc);
1407                 gk20a_writel(g, gr_ds_num_tpc_per_gpc_r(tpc_index), tpc_per_gpc);
1408         }
1409
1410         /* gr__setup_pd_mapping stubbed for gk20a */
1411         gr_gk20a_setup_rop_mapping(g, gr);
1412         if (g->ops.gr.setup_alpha_beta_tables)
1413                 g->ops.gr.setup_alpha_beta_tables(g, gr);
1414
1415         if (gr->num_fbps == 1)
1416                 max_ways_evict = 9;
1417
1418         if (max_ways_evict != INVALID_MAX_WAYS)
1419                 g->ops.ltc.set_max_ways_evict_last(g, max_ways_evict);
1420
1421         for (gpc_index = 0;
1422              gpc_index < gr_pd_dist_skip_table__size_1_v() * 4;
1423              gpc_index += 4) {
1424
1425                 gk20a_writel(g, gr_pd_dist_skip_table_r(gpc_index/4),
1426                              gr_pd_dist_skip_table_gpc_4n0_mask_f(gr->gpc_skip_mask[gpc_index]) |
1427                              gr_pd_dist_skip_table_gpc_4n1_mask_f(gr->gpc_skip_mask[gpc_index + 1]) |
1428                              gr_pd_dist_skip_table_gpc_4n2_mask_f(gr->gpc_skip_mask[gpc_index + 2]) |
1429                              gr_pd_dist_skip_table_gpc_4n3_mask_f(gr->gpc_skip_mask[gpc_index + 3]));
1430         }
1431
1432         gk20a_writel(g, gr_cwd_fs_r(),
1433                      gr_cwd_fs_num_gpcs_f(gr->gpc_count) |
1434                      gr_cwd_fs_num_tpcs_f(gr->tpc_count));
1435
1436         gk20a_writel(g, gr_bes_zrop_settings_r(),
1437                      gr_bes_zrop_settings_num_active_fbps_f(gr->num_fbps));
1438         gk20a_writel(g, gr_bes_crop_settings_r(),
1439                      gr_bes_crop_settings_num_active_fbps_f(gr->num_fbps));
1440
1441         /* turn on the cya15 bit for a default value that missed the cut */
1442         l1c_dbg_reg_val = gk20a_readl(g, gr_gpc0_tpc0_l1c_dbg_r());
1443         l1c_dbg_reg_val |= gr_gpc0_tpc0_l1c_dbg_cya15_en_f();
1444         gk20a_writel(g, gr_gpc0_tpc0_l1c_dbg_r(), l1c_dbg_reg_val);
1445
1446         return 0;
1447 }
1448
1449 static int gr_gk20a_fecs_ctx_image_save(struct channel_gk20a *c, u32 save_type)
1450 {
1451         struct gk20a *g = c->g;
1452         int ret;
1453
1454         u32 inst_base_ptr =
1455                 u64_lo32(c->inst_block.cpu_pa
1456                 >> ram_in_base_shift_v());
1457
1458
1459         gk20a_dbg_fn("");
1460
1461         ret = gr_gk20a_submit_fecs_method_op(g,
1462                 (struct fecs_method_op_gk20a) {
1463                 .method.addr = save_type,
1464                 .method.data = (gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
1465                                 gr_fecs_current_ctx_target_vid_mem_f() |
1466                                 gr_fecs_current_ctx_valid_f(1)),
1467                 .mailbox = {.id = 0, .data = 0, .clr = 3, .ret = NULL,
1468                         .ok = 1, .fail = 2,
1469                 },
1470                 .cond.ok = GR_IS_UCODE_OP_AND,
1471                 .cond.fail = GR_IS_UCODE_OP_AND,
1472                  });
1473
1474         if (ret)
1475                 gk20a_err(dev_from_gk20a(g), "save context image failed");
1476
1477         return ret;
1478 }
1479
1480 static u32 gk20a_init_sw_bundle(struct gk20a *g)
1481 {
1482         struct av_list_gk20a *sw_bundle_init = &g->gr.ctx_vars.sw_bundle_init;
1483         u32 last_bundle_data = 0;
1484         u32 err = 0;
1485         int i;
1486         unsigned long end_jiffies = jiffies +
1487                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
1488
1489         /* enable pipe mode override */
1490         gk20a_writel(g, gr_pipe_bundle_config_r(),
1491                 gr_pipe_bundle_config_override_pipe_mode_enabled_f());
1492
1493         /* load bundle init */
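        /*
         * The bundle data register is only rewritten when the value
         * actually changes; consecutive entries with identical data just
         * get a new bundle address.  A GO_IDLE bundle additionally waits
         * for graphics to go idle before the next entry is issued.
         */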
1494         for (i = 0; i < sw_bundle_init->count; i++) {
1495                 err |= gr_gk20a_wait_fe_idle(g, end_jiffies,
1496                                         GR_IDLE_CHECK_DEFAULT);
1497                 if (i == 0 || last_bundle_data != sw_bundle_init->l[i].value) {
1498                         gk20a_writel(g, gr_pipe_bundle_data_r(),
1499                                 sw_bundle_init->l[i].value);
1500                         last_bundle_data = sw_bundle_init->l[i].value;
1501                 }
1502
1503                 gk20a_writel(g, gr_pipe_bundle_address_r(),
1504                              sw_bundle_init->l[i].addr);
1505
1506                 if (gr_pipe_bundle_address_value_v(sw_bundle_init->l[i].addr) ==
1507                     GR_GO_IDLE_BUNDLE)
1508                         err |= gr_gk20a_wait_idle(g, end_jiffies,
1509                                         GR_IDLE_CHECK_DEFAULT);
1510         }
1511
1512         /* disable pipe mode override */
1513         gk20a_writel(g, gr_pipe_bundle_config_r(),
1514                      gr_pipe_bundle_config_override_pipe_mode_disabled_f());
1515
1516         return err;
1517 }
1518
1519 /* Initialize the global golden image from a fresh gr_ctx in the channel
1520    ctx and save a copy in ctx_vars.local_golden_image. */
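/*
 * Rough sequence: bind the channel's inst block to FECS, run the SW
 * bundle init list, commit the global ctx buffers, copy the ctxsw header
 * into the golden buffer, have FECS save a golden image, and finally
 * cache that image in local_golden_image for later restores.
 */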
1521 static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
1522                                           struct channel_gk20a *c)
1523 {
1524         struct gr_gk20a *gr = &g->gr;
1525         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
1526         u32 ctx_header_bytes = ctxsw_prog_fecs_header_v();
1527         u32 ctx_header_words;
1528         u32 i;
1529         u32 data;
1530         void *ctx_ptr = NULL;
1531         void *gold_ptr = NULL;
1532         int err = 0;
1533
1534         gk20a_dbg_fn("");
1535
1536         /* The golden ctx is global to all channels. Although only the first
1537            channel initializes the golden image, the driver needs to prevent
1538            multiple channels from initializing the golden ctx at the same time. */
1539         mutex_lock(&gr->ctx_mutex);
1540
1541         if (gr->ctx_vars.golden_image_initialized)
1542                 goto clean_up;
1543
1544         err = gr_gk20a_fecs_ctx_bind_channel(g, c);
1545         if (err)
1546                 goto clean_up;
1547
1548         err = gk20a_init_sw_bundle(g);
1549         if (err)
1550                 goto clean_up;
1551
1552         err = gr_gk20a_elpg_protected_call(g,
1553                         gr_gk20a_commit_global_ctx_buffers(g, c, false));
1554         if (err)
1555                 goto clean_up;
1556
1557         gold_ptr = vmap(gr->global_ctx_buffer[GOLDEN_CTX].pages,
1558                         PAGE_ALIGN(gr->global_ctx_buffer[GOLDEN_CTX].size) >>
1559                         PAGE_SHIFT, 0, pgprot_dmacoherent(PAGE_KERNEL));
1560         if (!gold_ptr) {
1561                 err = -ENOMEM;
                     goto clean_up;
             }
1562
1563         ctx_ptr = vmap(ch_ctx->gr_ctx.pages,
1564                         PAGE_ALIGN(ch_ctx->gr_ctx.size) >> PAGE_SHIFT,
1565                         0, pgprot_dmacoherent(PAGE_KERNEL));
1566         if (!ctx_ptr) {
1567                 err = -ENOMEM;
                     goto clean_up;
             }
1568
1569         ctx_header_words = roundup(ctx_header_bytes, sizeof(u32));
1570         ctx_header_words >>= 2;
1571
1572         gk20a_mm_l2_flush(g, true);
1573
1574         for (i = 0; i < ctx_header_words; i++) {
1575                 data = gk20a_mem_rd32(ctx_ptr, i);
1576                 gk20a_mem_wr32(gold_ptr, i, data);
1577         }
1578
1579         gk20a_mem_wr32(gold_ptr + ctxsw_prog_main_image_zcull_o(), 0,
1580                  ctxsw_prog_main_image_zcull_mode_no_ctxsw_v());
1581
1582         gk20a_mem_wr32(gold_ptr + ctxsw_prog_main_image_zcull_ptr_o(), 0, 0);
1583
1584         gr_gk20a_commit_inst(c, ch_ctx->global_ctx_buffer_va[GOLDEN_CTX_VA]);
1585
1586         gr_gk20a_fecs_ctx_image_save(c, gr_fecs_method_push_adr_wfi_golden_save_v());
1587
1588         if (gr->ctx_vars.local_golden_image == NULL) {
1589
1590                 gr->ctx_vars.local_golden_image =
1591                         kzalloc(gr->ctx_vars.golden_image_size, GFP_KERNEL);
1592
1593                 if (gr->ctx_vars.local_golden_image == NULL) {
1594                         err = -ENOMEM;
1595                         goto clean_up;
1596                 }
1597
1598                 for (i = 0; i < gr->ctx_vars.golden_image_size / 4; i++)
1599                         gr->ctx_vars.local_golden_image[i] =
1600                                 gk20a_mem_rd32(gold_ptr, i);
1601         }
1602
1603         gr_gk20a_commit_inst(c, ch_ctx->gr_ctx.gpu_va);
1604
1605         gr->ctx_vars.golden_image_initialized = true;
1606
1607         gk20a_writel(g, gr_fecs_current_ctx_r(),
1608                 gr_fecs_current_ctx_valid_false_f());
1609
1610 clean_up:
1611         if (err)
1612                 gk20a_err(dev_from_gk20a(g), "fail");
1613         else
1614                 gk20a_dbg_fn("done");
1615
1616         if (gold_ptr)
1617                 vunmap(gold_ptr);
1618         if (ctx_ptr)
1619                 vunmap(ctx_ptr);
1620
1621         mutex_unlock(&gr->ctx_mutex);
1622         return err;
1623 }
1624
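/*
 * Toggle the SMPC mode field in the channel's context image between
 * ctxsw and no_ctxsw, which presumably tells FECS whether to save and
 * restore SMPC state on subsequent context switches of this channel.
 */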
1625 int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g,
1626                                     struct channel_gk20a *c,
1627                                     bool enable_smpc_ctxsw)
1628 {
1629         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
1630         void *ctx_ptr = NULL;
1631         u32 data;
1632
1633         /* Channel gr_ctx buffer is gpu cacheable.
1634            Flush and invalidate before cpu update. */
1635         gk20a_mm_l2_flush(g, true);
1636
1637         ctx_ptr = vmap(ch_ctx->gr_ctx.pages,
1638                         PAGE_ALIGN(ch_ctx->gr_ctx.size) >> PAGE_SHIFT,
1639                         0, pgprot_dmacoherent(PAGE_KERNEL));
1640         if (!ctx_ptr)
1641                 return -ENOMEM;
1642
1643         data = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0);
1644         data = data & ~ctxsw_prog_main_image_pm_smpc_mode_m();
1645         data |= enable_smpc_ctxsw ?
1646                 ctxsw_prog_main_image_pm_smpc_mode_ctxsw_f() :
1647                 ctxsw_prog_main_image_pm_smpc_mode_no_ctxsw_f();
1648         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0,
1649                  data);
1650
1651         vunmap(ctx_ptr);
1652
1653         return 0;
1654 }
1655
1656 /* load a saved copy of the golden image into the channel gr_ctx */
1657 static int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
1658                                         struct channel_gk20a *c)
1659 {
1660         struct gr_gk20a *gr = &g->gr;
1661         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
1662         u32 virt_addr_lo;
1663         u32 virt_addr_hi;
1664         u32 i, v, data;
1665         int ret = 0;
1666         void *ctx_ptr = NULL;
1667
1668         gk20a_dbg_fn("");
1669
1670         if (gr->ctx_vars.local_golden_image == NULL)
1671                 return -1;
1672
1673         /* Channel gr_ctx buffer is gpu cacheable.
1674            Flush and invalidate before cpu update. */
1675         gk20a_mm_l2_flush(g, true);
1676
1677         ctx_ptr = vmap(ch_ctx->gr_ctx.pages,
1678                         PAGE_ALIGN(ch_ctx->gr_ctx.size) >> PAGE_SHIFT,
1679                         0, pgprot_dmacoherent(PAGE_KERNEL));
1680         if (!ctx_ptr)
1681                 return -ENOMEM;
1682
1683         for (i = 0; i < gr->ctx_vars.golden_image_size / 4; i++)
1684                 gk20a_mem_wr32(ctx_ptr, i, gr->ctx_vars.local_golden_image[i]);
1685
1686         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_save_ops_o(), 0, 0);
1687         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_restore_ops_o(), 0, 0);
1688
1689         virt_addr_lo = u64_lo32(ch_ctx->patch_ctx.gpu_va);
1690         virt_addr_hi = u64_hi32(ch_ctx->patch_ctx.gpu_va);
1691
1692         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_count_o(), 0,
1693                  ch_ctx->patch_ctx.data_count);
1694         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_adr_lo_o(), 0,
1695                  virt_addr_lo);
1696         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_adr_hi_o(), 0,
1697                  virt_addr_hi);
1698
1699         /* no user of the client-managed performance counter ctx */
1700         ch_ctx->pm_ctx.ctx_sw_mode =
1701                 ctxsw_prog_main_image_pm_mode_no_ctxsw_f();
1702         data = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0);
1703         data = data & ~ctxsw_prog_main_image_pm_mode_m();
1704         data |= ch_ctx->pm_ctx.ctx_sw_mode;
1705         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0,
1706                  data);
1707
1708         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_ptr_o(), 0, 0);
1709
1710         /* set priv access map */
1711         virt_addr_lo =
1712                  u64_lo32(ch_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]);
1713         virt_addr_hi =
1714                  u64_hi32(ch_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]);
1715
1716         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_config_o(), 0,
1717                  ctxsw_prog_main_image_priv_access_map_config_mode_use_map_f());
1718         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_addr_lo_o(), 0,
1719                  virt_addr_lo);
1720         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_addr_hi_o(), 0,
1721                  virt_addr_hi);
1722         /* disable verif features */
1723         v = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_misc_options_o(), 0);
1724         v = v & ~(ctxsw_prog_main_image_misc_options_verif_features_m());
1725         v = v | ctxsw_prog_main_image_misc_options_verif_features_disabled_f();
1726         gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_misc_options_o(), 0, v);
1727
1728
1729         vunmap(ctx_ptr);
1730
1731         if (tegra_platform_is_linsim()) {
1732                 u32 inst_base_ptr =
1733                         u64_lo32(c->inst_block.cpu_pa
1734                         >> ram_in_base_shift_v());
1735
1736                 ret = gr_gk20a_submit_fecs_method_op(g,
1737                           (struct fecs_method_op_gk20a) {
1738                                   .method.data =
1739                                           (gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
1740                                            gr_fecs_current_ctx_target_vid_mem_f() |
1741                                            gr_fecs_current_ctx_valid_f(1)),
1742                                   .method.addr =
1743                                           gr_fecs_method_push_adr_restore_golden_v(),
1744                                   .mailbox = {
1745                                           .id = 0, .data = 0,
1746                                           .clr = ~0, .ret = NULL,
1747                                           .ok = gr_fecs_ctxsw_mailbox_value_pass_v(),
1748                                           .fail = 0},
1749                                   .cond.ok = GR_IS_UCODE_OP_EQUAL,
1750                                   .cond.fail = GR_IS_UCODE_OP_SKIP});
1751
1752                 if (ret)
1753                         gk20a_err(dev_from_gk20a(g),
1754                                    "restore context image failed");
1755         }
1756
1757         return ret;
1758 }
1759
1760 static void gr_gk20a_start_falcon_ucode(struct gk20a *g)
1761 {
1762         gk20a_dbg_fn("");
1763
1764         gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0),
1765                      gr_fecs_ctxsw_mailbox_clear_value_f(~0));
1766
1767         gk20a_writel(g, gr_gpccs_dmactl_r(), gr_gpccs_dmactl_require_ctx_f(0));
1768         gk20a_writel(g, gr_fecs_dmactl_r(), gr_fecs_dmactl_require_ctx_f(0));
1769
1770         gk20a_writel(g, gr_gpccs_cpuctl_r(), gr_gpccs_cpuctl_startcpu_f(1));
1771         gk20a_writel(g, gr_fecs_cpuctl_r(), gr_fecs_cpuctl_startcpu_f(1));
1772
1773         gk20a_dbg_fn("done");
1774 }
1775
1776 static int gr_gk20a_init_ctxsw_ucode_vaspace(struct gk20a *g)
1777 {
1778         struct mm_gk20a *mm = &g->mm;
1779         struct vm_gk20a *vm = &mm->pmu.vm;
1780         struct device *d = dev_from_gk20a(g);
1781         struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
1782         void *inst_ptr;
1783         u32 pde_addr_lo;
1784         u32 pde_addr_hi;
1785         u64 pde_addr;
1786         dma_addr_t iova;
1787
1788         /* Alloc mem of inst block */
1789         ucode_info->inst_blk_desc.size = ram_in_alloc_size_v();
1790         ucode_info->inst_blk_desc.cpuva = dma_alloc_coherent(d,
1791                                         ucode_info->inst_blk_desc.size,
1792                                         &iova,
1793                                         GFP_KERNEL);
1794         if (!ucode_info->inst_blk_desc.cpuva) {
1795                 gk20a_err(d, "failed to allocate memory\n");
1796                 return -ENOMEM;
1797         }
1798
1799         ucode_info->inst_blk_desc.iova = iova;
1800         ucode_info->inst_blk_desc.cpu_pa = gk20a_get_phys_from_iova(d,
1801                                         ucode_info->inst_blk_desc.iova);
1802
1803         inst_ptr = ucode_info->inst_blk_desc.cpuva;
1804
1805         /* Set inst block */
1806         gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_lo_w(),
1807                  u64_lo32(vm->va_limit) | 0xFFF);
1808         gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_hi_w(),
1809                 ram_in_adr_limit_hi_f(u64_hi32(vm->va_limit)));
1810
1811         pde_addr = gk20a_mm_iova_addr(vm->pdes.sgt->sgl);
1812         pde_addr_lo = u64_lo32(pde_addr >> 12);
1813         pde_addr_hi = u64_hi32(pde_addr);
1814         gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_lo_w(),
1815                 ram_in_page_dir_base_target_vid_mem_f() |
1816                 ram_in_page_dir_base_vol_true_f() |
1817                 ram_in_page_dir_base_lo_f(pde_addr_lo));
1818         gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_hi_w(),
1819                 ram_in_page_dir_base_hi_f(pde_addr_hi));
1820
1821         /* Map ucode surface to GMMU */
1822         ucode_info->ucode_gpuva = gk20a_gmmu_map(vm,
1823                                         &ucode_info->surface_desc.sgt,
1824                                         ucode_info->surface_desc.size,
1825                                         0, /* flags */
1826                                         gk20a_mem_flag_read_only);
1827         if (!ucode_info->ucode_gpuva) {
1828                 gk20a_err(d, "failed to update gmmu ptes\n");
1829                 return -ENOMEM;
1830         }
1831
1832         return 0;
1833 }
1834
1835 static void gr_gk20a_init_ctxsw_ucode_segment(
1836         struct gk20a_ctxsw_ucode_segment *p_seg, u32 *offset, u32 size)
1837 {
1838         p_seg->offset = *offset;
1839         p_seg->size = size;
1840         *offset = ALIGN(*offset + size, BLK_SIZE);
1841 }
1842
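/*
 * The ucode surface is laid out as boot | code | data for FECS followed
 * by the same triple for GPCCS, with every segment aligned up to
 * BLK_SIZE (256 bytes) by gr_gk20a_init_ctxsw_ucode_segment().
 */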
1843 static void gr_gk20a_init_ctxsw_ucode_segments(
1844         struct gk20a_ctxsw_ucode_segments *segments, u32 *offset,
1845         struct gk20a_ctxsw_bootloader_desc *bootdesc,
1846         u32 code_size, u32 data_size)
1847 {
1848         u32 boot_size = ALIGN(bootdesc->size, sizeof(u32));
1849         segments->boot_entry = bootdesc->entry_point;
1850         segments->boot_imem_offset = bootdesc->imem_offset;
1851         gr_gk20a_init_ctxsw_ucode_segment(&segments->boot, offset, boot_size);
1852         gr_gk20a_init_ctxsw_ucode_segment(&segments->code, offset, code_size);
1853         gr_gk20a_init_ctxsw_ucode_segment(&segments->data, offset, data_size);
1854 }
1855
1856 static int gr_gk20a_copy_ctxsw_ucode_segments(
1857         u8 *buf,
1858         struct gk20a_ctxsw_ucode_segments *segments,
1859         u32 *bootimage,
1860         u32 *code, u32 *data)
1861 {
1862         memcpy(buf + segments->boot.offset, bootimage, segments->boot.size);
1863         memcpy(buf + segments->code.offset, code,      segments->code.size);
1864         memcpy(buf + segments->data.offset, data,      segments->data.size);
1865         return 0;
1866 }
1867
1868 static int gr_gk20a_init_ctxsw_ucode(struct gk20a *g)
1869 {
1870         struct device *d = dev_from_gk20a(g);
1871         struct mm_gk20a *mm = &g->mm;
1872         struct vm_gk20a *vm = &mm->pmu.vm;
1873         struct gk20a_ctxsw_bootloader_desc *fecs_boot_desc;
1874         struct gk20a_ctxsw_bootloader_desc *gpccs_boot_desc;
1875         const struct firmware *fecs_fw;
1876         const struct firmware *gpccs_fw;
1877         u32 *fecs_boot_image;
1878         u32 *gpccs_boot_image;
1879         struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
1880         u8 *buf;
1881         u32 ucode_size;
1882         int err = 0;
1883         dma_addr_t iova;
1884         DEFINE_DMA_ATTRS(attrs);
1885
1886         fecs_fw = gk20a_request_firmware(g, GK20A_FECS_UCODE_IMAGE);
1887         if (!fecs_fw) {
1888                 gk20a_err(d, "failed to load fecs ucode!!");
1889                 return -ENOENT;
1890         }
1891
1892         fecs_boot_desc = (void *)fecs_fw->data;
1893         fecs_boot_image = (void *)(fecs_fw->data +
1894                                 sizeof(struct gk20a_ctxsw_bootloader_desc));
1895
1896         gpccs_fw = gk20a_request_firmware(g, GK20A_GPCCS_UCODE_IMAGE);
1897         if (!gpccs_fw) {
1898                 release_firmware(fecs_fw);
1899                 gk20a_err(d, "failed to load gpccs ucode!!");
1900                 return -ENOENT;
1901         }
1902
1903         gpccs_boot_desc = (void *)gpccs_fw->data;
1904         gpccs_boot_image = (void *)(gpccs_fw->data +
1905                                 sizeof(struct gk20a_ctxsw_bootloader_desc));
1906
1907         ucode_size = 0;
1908         gr_gk20a_init_ctxsw_ucode_segments(&ucode_info->fecs, &ucode_size,
1909                 fecs_boot_desc,
1910                 g->gr.ctx_vars.ucode.fecs.inst.count * sizeof(u32),
1911                 g->gr.ctx_vars.ucode.fecs.data.count * sizeof(u32));
1912         gr_gk20a_init_ctxsw_ucode_segments(&ucode_info->gpccs, &ucode_size,
1913                 gpccs_boot_desc,
1914                 g->gr.ctx_vars.ucode.gpccs.inst.count * sizeof(u32),
1915                 g->gr.ctx_vars.ucode.gpccs.data.count * sizeof(u32));
1916
1917         ucode_info->surface_desc.size = ucode_size;
1918         dma_set_attr(DMA_ATTR_READ_ONLY, &attrs);
1919         ucode_info->surface_desc.cpuva = dma_alloc_attrs(d,
1920                                         ucode_info->surface_desc.size,
1921                                         &iova,
1922                                         GFP_KERNEL,
1923                                         &attrs);
1924         if (!ucode_info->surface_desc.cpuva) {
1925                 gk20a_err(d, "memory allocation failed\n");
1926                 err = -ENOMEM;
1927                 goto clean_up;
1928         }
1929
1930         ucode_info->surface_desc.iova = iova;
1931         err = gk20a_get_sgtable(d, &ucode_info->surface_desc.sgt,
1932                                 ucode_info->surface_desc.cpuva,
1933                                 ucode_info->surface_desc.iova,
1934                                 ucode_info->surface_desc.size);
1935         if (err) {
1936                 gk20a_err(d, "failed to create sg table\n");
1937                 goto clean_up;
1938         }
1939
1940         buf = (u8 *)ucode_info->surface_desc.cpuva;
1941         if (!buf) {
1942                 gk20a_err(d, "failed to map surface desc buffer");
1943                 err = -ENOMEM;
1944                 goto clean_up;
1945         }
1946
1947         gr_gk20a_copy_ctxsw_ucode_segments(buf, &ucode_info->fecs,
1948                 fecs_boot_image,
1949                 g->gr.ctx_vars.ucode.fecs.inst.l,
1950                 g->gr.ctx_vars.ucode.fecs.data.l);
1951
1952         release_firmware(fecs_fw);
1953         fecs_fw = NULL;
1954
1955         gr_gk20a_copy_ctxsw_ucode_segments(buf, &ucode_info->gpccs,
1956                 gpccs_boot_image,
1957                 g->gr.ctx_vars.ucode.gpccs.inst.l,
1958                 g->gr.ctx_vars.ucode.gpccs.data.l);
1959
1960         release_firmware(gpccs_fw);
1961         gpccs_fw = NULL;
1962
1963         err = gr_gk20a_init_ctxsw_ucode_vaspace(g);
1964         if (err)
1965                 goto clean_up;
1966
1967         gk20a_free_sgtable(&ucode_info->surface_desc.sgt);
1968
1969         return 0;
1970
1971  clean_up:
1972         if (ucode_info->ucode_gpuva)
1973                 gk20a_gmmu_unmap(vm, ucode_info->ucode_gpuva,
1974                         ucode_info->surface_desc.size, gk20a_mem_flag_none);
1975         if (ucode_info->surface_desc.sgt)
1976                 gk20a_free_sgtable(&ucode_info->surface_desc.sgt);
1977         if (ucode_info->surface_desc.cpuva)
1978                 dma_free_attrs(d, ucode_info->surface_desc.size,
1979                                 ucode_info->surface_desc.cpuva,
1980                                 ucode_info->surface_desc.iova,
1981                                 &attrs);
1982         ucode_info->surface_desc.cpuva = NULL;
1983         ucode_info->surface_desc.iova = 0;
1984
1985         release_firmware(gpccs_fw);
1986         gpccs_fw = NULL;
1987         release_firmware(fecs_fw);
1988         fecs_fw = NULL;
1989
1990         return err;
1991 }
1992
1993 static void gr_gk20a_load_falcon_bind_instblk(struct gk20a *g)
1994 {
1995         struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
1996         int retries = 20;
1997         phys_addr_t inst_ptr;
1998         u32 val;
1999
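        /* Poll for the FECS arbiter to go idle; 20 retries of udelay(2)
         * gives roughly a 40 us budget before we log an error and
         * carry on. */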
2000         while ((gk20a_readl(g, gr_fecs_ctxsw_status_1_r()) &
2001                         gr_fecs_ctxsw_status_1_arb_busy_m()) && retries) {
2002                 udelay(2);
2003                 retries--;
2004         }
2005         if (!retries)
2006                 gk20a_err(dev_from_gk20a(g), "arbiter idle timeout");
2007
2008         gk20a_writel(g, gr_fecs_arb_ctx_adr_r(), 0x0);
2009
2010         inst_ptr = ucode_info->inst_blk_desc.cpu_pa;
2011         gk20a_writel(g, gr_fecs_new_ctx_r(),
2012                         gr_fecs_new_ctx_ptr_f(inst_ptr >> 12) |
2013                         gr_fecs_new_ctx_target_m() |
2014                         gr_fecs_new_ctx_valid_m());
2015
2016         gk20a_writel(g, gr_fecs_arb_ctx_ptr_r(),
2017                         gr_fecs_arb_ctx_ptr_ptr_f(inst_ptr >> 12) |
2018                         gr_fecs_arb_ctx_ptr_target_m());
2019
2020         gk20a_writel(g, gr_fecs_arb_ctx_cmd_r(), 0x7);
2021
2022         /* Wait for arbiter command to complete */
2023         retries = 20;
2024         val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
2025         while (gr_fecs_arb_ctx_cmd_cmd_v(val) && retries) {
2026                 udelay(2);
2027                 retries--;
2028                 val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
2029         }
2030         if (!retries)
2031                 gk20a_err(dev_from_gk20a(g), "arbiter complete timeout");
2032
2033         gk20a_writel(g, gr_fecs_current_ctx_r(),
2034                         gr_fecs_current_ctx_ptr_f(inst_ptr >> 12) |
2035                         gr_fecs_current_ctx_target_m() |
2036                         gr_fecs_current_ctx_valid_m());
2037         /* Send command to arbiter to flush */
2038         gk20a_writel(g, gr_fecs_arb_ctx_cmd_r(), gr_fecs_arb_ctx_cmd_cmd_s());
2039
2040         retries = 20;
2041         val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
2042         while (gr_fecs_arb_ctx_cmd_cmd_v(val) && retries) {
2043                 udelay(2);
2044                 retries--;
2045                 val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
2046         }
2047         if (!retries)
2048                 gk20a_err(dev_from_gk20a(g), "arbiter complete timeout");
2049 }
2050
2051 static int gr_gk20a_load_ctxsw_ucode_segments(struct gk20a *g, u64 addr_base,
2052         struct gk20a_ctxsw_ucode_segments *segments, u32 reg_offset)
2053 {
2054         u32 addr_code32;
2055         u32 addr_data32;
2056         u32 addr_load32;
2057         u32 dst = 0;
2058         u32 blocks;
2059         u32 b;
2060
2061         addr_code32 = u64_lo32((addr_base + segments->code.offset) >> 8);
2062         addr_data32 = u64_lo32((addr_base + segments->data.offset) >> 8);
2063         addr_load32 = u64_lo32((addr_base + segments->boot.offset) >> 8);
2064
2065         gk20a_writel(g, reg_offset + gr_fecs_dmactl_r(),
2066                         gr_fecs_dmactl_require_ctx_f(0));
2067
2068         /*
2069          * Copy falcon bootloader header into dmem at offset 0.
2070          * Configure dmem port 0 for auto-incrementing writes starting at dmem
2071          * offset 0.
2072          */
2073         gk20a_writel(g, reg_offset + gr_fecs_dmemc_r(0),
2074                         gr_fecs_dmemc_offs_f(0) |
2075                         gr_fecs_dmemc_blk_f(0) |
2076                         gr_fecs_dmemc_aincw_f(1));
2077
2078         /* Write out the actual data */
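        /*
         * The ten DMEMD writes below emit the header the falcon
         * bootloader expects at dmem offset 0: zero, code address, zero,
         * code size, zero, data address, data size, code address again,
         * then two trailing zero words.  The meaning of each slot is
         * defined by the bootloader image itself, not by this driver.
         */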
2079         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2080         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), addr_code32);
2081         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2082         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), segments->code.size);
2083         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2084         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), addr_data32);
2085         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), segments->data.size);
2086         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), addr_code32);
2087         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2088         gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2089
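        /* Number of 256-byte IMEM blocks needed for the boot segment,
         * rounding the segment size up to the next 256-byte boundary. */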
2090         blocks = ((segments->boot.size + 0xFF) & ~0xFF) >> 8;
2091
2092         /*
2093          * Set the base FB address for the DMA transfer. Subtract off the 256
2094          * byte IMEM block offset such that the relative FB and IMEM offsets
2095          * match, allowing the IMEM tags to be properly created.
2096          */
2097
2098         dst = segments->boot_imem_offset;
2099         gk20a_writel(g, reg_offset + gr_fecs_dmatrfbase_r(),
2100                         (addr_load32 - (dst >> 8)));
2101
2102         for (b = 0; b < blocks; b++) {
2103                 /* Setup destination IMEM offset */
2104                 gk20a_writel(g, reg_offset + gr_fecs_dmatrfmoffs_r(),
2105                                 dst + (b << 8));
2106
2107                 /* Setup source offset (relative to BASE) */
2108                 gk20a_writel(g, reg_offset + gr_fecs_dmatrffboffs_r(),
2109                                 dst + (b << 8));
2110
2111                 gk20a_writel(g, reg_offset + gr_fecs_dmatrfcmd_r(),
2112                                 gr_fecs_dmatrfcmd_imem_f(0x01) |
2113                                 gr_fecs_dmatrfcmd_write_f(0x00) |
2114                                 gr_fecs_dmatrfcmd_size_f(0x06) |
2115                                 gr_fecs_dmatrfcmd_ctxdma_f(0));
2116         }
2117
2118         /* Specify the falcon boot vector */
2119         gk20a_writel(g, reg_offset + gr_fecs_bootvec_r(),
2120                         gr_fecs_bootvec_vec_f(segments->boot_entry));
2121
2122         /* Write to CPUCTL to start the falcon */
2123         gk20a_writel(g, reg_offset + gr_fecs_cpuctl_r(),
2124                         gr_fecs_cpuctl_startcpu_f(0x01));
2125
2126         return 0;
2127 }
2128
2129 static void gr_gk20a_load_falcon_with_bootloader(struct gk20a *g)
2130 {
2131         struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
2132         u64 addr_base = ucode_info->ucode_gpuva;
2133
2134         gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0x0);
2135
2136         gr_gk20a_load_falcon_bind_instblk(g);
2137
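        /*
         * The GPCCS falcon registers mirror the FECS ones at a fixed
         * offset, so the same segment loader is reused with reg_offset
         * selecting which falcon gets programmed.
         */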
2138         gr_gk20a_load_ctxsw_ucode_segments(g, addr_base,
2139                 &g->ctxsw_ucode_info.fecs, 0);
2140
2141         gr_gk20a_load_ctxsw_ucode_segments(g, addr_base,
2142                 &g->ctxsw_ucode_info.gpccs,
2143                 gr_gpcs_gpccs_falcon_hwcfg_r() -
2144                 gr_fecs_falcon_hwcfg_r());
2145 }
2146
2147 static int gr_gk20a_load_ctxsw_ucode(struct gk20a *g, struct gr_gk20a *gr)
2148 {
2149         u32 ret;
2150
2151         gk20a_dbg_fn("");
2152
2153         if (tegra_platform_is_linsim()) {
2154                 gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(7),
2155                         gr_fecs_ctxsw_mailbox_value_f(0xc0de7777));
2156                 gk20a_writel(g, gr_gpccs_ctxsw_mailbox_r(7),
2157                         gr_gpccs_ctxsw_mailbox_value_f(0xc0de7777));
2158         }
2159
2160         /*
2161          * In case the gPMU falcon is not being used, revert to the old way of
2162          * loading gr ucode, without the faster bootstrap routine.
2163          */
2164         if (!support_gk20a_pmu()) {
2165                 gr_gk20a_load_falcon_dmem(g);
2166                 gr_gk20a_load_falcon_imem(g);
2167                 gr_gk20a_start_falcon_ucode(g);
2168         } else {
2169                 if (!gr->skip_ucode_init)
2170                         gr_gk20a_init_ctxsw_ucode(g);
2171                 gr_gk20a_load_falcon_with_bootloader(g);
2172                 gr->skip_ucode_init = true;
2173         }
2174
2175         ret = gr_gk20a_ctx_wait_ucode(g, 0, 0,
2176                                       GR_IS_UCODE_OP_EQUAL,
2177                                       eUcodeHandshakeInitComplete,
2178                                       GR_IS_UCODE_OP_SKIP, 0);
2179         if (ret) {
2180                 gk20a_err(dev_from_gk20a(g), "falcon ucode init timeout");
2181                 return ret;
2182         }
2183
2184         if (support_gk20a_pmu())
2185                 gk20a_writel(g, gr_fecs_current_ctx_r(),
2186                         gr_fecs_current_ctx_valid_false_f());
2187
2188         gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0xffffffff);
2189         gk20a_writel(g, gr_fecs_method_data_r(), 0x7fffffff);
2190         gk20a_writel(g, gr_fecs_method_push_r(),
2191                      gr_fecs_method_push_adr_set_watchdog_timeout_f());
2192
2193         gk20a_dbg_fn("done");
2194         return 0;
2195 }
2196
2197 static int gr_gk20a_init_ctx_state(struct gk20a *g, struct gr_gk20a *gr)
2198 {
2199         u32 golden_ctx_image_size = 0;
2200         u32 zcull_ctx_image_size = 0;
2201         u32 pm_ctx_image_size = 0;
2202         u32 ret;
2203         struct fecs_method_op_gk20a op = {
2204                 .mailbox = { .id = 0, .data = 0,
2205                              .clr = ~0, .ok = 0, .fail = 0},
2206                 .method.data = 0,
2207                 .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL,
2208                 .cond.fail = GR_IS_UCODE_OP_SKIP,
2209                 };
2210
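        /*
         * The same fecs_method_op is reused three times below, swapping
         * only the method address and return mailbox, to query the
         * golden, zcull and PM context image sizes from FECS.
         */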
2211         gk20a_dbg_fn("");
2212         op.method.addr = gr_fecs_method_push_adr_discover_image_size_v();
2213         op.mailbox.ret = &golden_ctx_image_size;
2214         ret = gr_gk20a_submit_fecs_method_op(g, op);
2215         if (ret) {
2216                 gk20a_err(dev_from_gk20a(g),
2217                            "query golden image size failed");
2218                 return ret;
2219         }
2220         op.method.addr = gr_fecs_method_push_adr_discover_zcull_image_size_v();
2221         op.mailbox.ret = &zcull_ctx_image_size;
2222         ret = gr_gk20a_submit_fecs_method_op(g, op);
2223         if (ret) {
2224                 gk20a_err(dev_from_gk20a(g),
2225                            "query zcull ctx image size failed");
2226                 return ret;
2227         }
2228         op.method.addr = gr_fecs_method_push_adr_discover_pm_image_size_v();
2229         op.mailbox.ret = &pm_ctx_image_size;
2230         ret = gr_gk20a_submit_fecs_method_op(g, op);
2231         if (ret) {
2232                 gk20a_err(dev_from_gk20a(g),
2233                            "query pm ctx image size failed");
2234                 return ret;
2235         }
2236
2237         if (!g->gr.ctx_vars.golden_image_size &&
2238             !g->gr.ctx_vars.zcull_ctxsw_image_size) {
2239                 g->gr.ctx_vars.golden_image_size = golden_ctx_image_size;
2240                 g->gr.ctx_vars.zcull_ctxsw_image_size = zcull_ctx_image_size;
2241         } else {
2242                 /* hw is different after railgating? */
2243                 BUG_ON(g->gr.ctx_vars.golden_image_size != golden_ctx_image_size);
2244                 BUG_ON(g->gr.ctx_vars.zcull_ctxsw_image_size != zcull_ctx_image_size);
2245         }
2246
2247         g->gr.ctx_vars.priv_access_map_size = 512 * 1024;
2248
2249         gk20a_dbg_fn("done");
2250         return 0;
2251 }
2252
2253 static void gk20a_gr_destroy_ctx_buffer(struct platform_device *pdev,
2254                                         struct gr_ctx_buffer_desc *desc)
2255 {
2256         struct device *dev = &pdev->dev;
2257         gk20a_free_sgtable(&desc->sgt);
2258         dma_free_attrs(dev, desc->size, desc->pages,
2259                        desc->iova, &desc->attrs);
2260 }
2261
2262 static int gk20a_gr_alloc_ctx_buffer(struct platform_device *pdev,
2263                                      struct gr_ctx_buffer_desc *desc,
2264                                      size_t size)
2265 {
2266         struct device *dev = &pdev->dev;
2267         DEFINE_DMA_ATTRS(attrs);
2268         dma_addr_t iova;
2269         int err = 0;
2270
2271         dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2272
2273         desc->pages = dma_alloc_attrs(&pdev->dev, size, &iova,
2274                                       GFP_KERNEL, &attrs);
2275         if (!desc->pages)
2276                 return -ENOMEM;
2277
2278         desc->iova = iova;
2279         desc->size = size;
2280         desc->attrs = attrs;
2281         desc->destroy = gk20a_gr_destroy_ctx_buffer;
2282         err = gk20a_get_sgtable_from_pages(&pdev->dev, &desc->sgt, desc->pages,
2283                                            desc->iova, desc->size);
2284         if (err) {
2285                 dma_free_attrs(dev, desc->size, desc->pages,
2286                                desc->iova, &desc->attrs);
2287                 memset(desc, 0, sizeof(*desc));
2288         }
2289
2290         return err;
2291 }
2292
2293 static int gr_gk20a_alloc_global_ctx_buffers(struct gk20a *g)
2294 {
2295         struct gk20a_platform *platform = platform_get_drvdata(g->dev);
2296         struct gr_gk20a *gr = &g->gr;
2297         int i, attr_buffer_size, err;
2298         struct platform_device *pdev = g->dev;
2299
2300         u32 cb_buffer_size = gr->bundle_cb_default_size *
2301                 gr_scc_bundle_cb_size_div_256b_byte_granularity_v();
2302
2303         u32 pagepool_buffer_size = gr_scc_pagepool_total_pages_hwmax_value_v() *
2304                 gr_scc_pagepool_total_pages_byte_granularity_v();
2305
2306         gk20a_dbg_fn("");
2307
2308         attr_buffer_size = g->ops.gr.calc_global_ctx_buffer_size(g);
2309
2310         gk20a_dbg_info("cb_buffer_size : %d", cb_buffer_size);
2311
2312         err = gk20a_gr_alloc_ctx_buffer(pdev, &gr->global_ctx_buffer[CIRCULAR],
2313                                         cb_buffer_size);
2314         if (err)
2315                 goto clean_up;
2316
2317         if (platform->secure_alloc)
2318                 platform->secure_alloc(pdev,
2319                                        &gr->global_ctx_buffer[CIRCULAR_VPR],
2320                                        cb_buffer_size);
2321
2322         gk20a_dbg_info("pagepool_buffer_size : %d", pagepool_buffer_size);
2323
2324         err = gk20a_gr_alloc_ctx_buffer(pdev, &gr->global_ctx_buffer[PAGEPOOL],
2325                                         pagepool_buffer_size);
2326         if (err)
2327                 goto clean_up;
2328
2329         if (platform->secure_alloc)
2330                 platform->secure_alloc(pdev,
2331                                        &gr->global_ctx_buffer[PAGEPOOL_VPR],
2332                                        pagepool_buffer_size);
2333
2334         gk20a_dbg_info("attr_buffer_size : %d", attr_buffer_size);
2335
2336         err = gk20a_gr_alloc_ctx_buffer(pdev, &gr->global_ctx_buffer[ATTRIBUTE],
2337                                         attr_buffer_size);
2338         if (err)
2339                 goto clean_up;
2340
2341         if (platform->secure_alloc)
2342                 platform->secure_alloc(pdev,
2343                                        &gr->global_ctx_buffer[ATTRIBUTE_VPR],
2344                                        attr_buffer_size);
2345
2346         if (platform->secure_buffer.destroy)
2347                 platform->secure_buffer.destroy(pdev, &platform->secure_buffer);
2348
2349         gk20a_dbg_info("golden_image_size : %d",
2350                    gr->ctx_vars.golden_image_size);
2351
2352         err = gk20a_gr_alloc_ctx_buffer(pdev,
2353                                         &gr->global_ctx_buffer[GOLDEN_CTX],
2354                                         gr->ctx_vars.golden_image_size);
2355         if (err)
2356                 goto clean_up;
2357
2358         gk20a_dbg_info("priv_access_map_size : %d",
2359                    gr->ctx_vars.priv_access_map_size);
2360
2361         err = gk20a_gr_alloc_ctx_buffer(pdev,
2362                                         &gr->global_ctx_buffer[PRIV_ACCESS_MAP],
2363                                         gr->ctx_vars.priv_access_map_size);
2364
2365         if (err)
2366                 goto clean_up;
2367
2368         gk20a_dbg_fn("done");
2369         return 0;
2370
2371  clean_up:
2372         gk20a_err(dev_from_gk20a(g), "fail");
2373         for (i = 0; i < NR_GLOBAL_CTX_BUF; i++) {
2374                 if (gr->global_ctx_buffer[i].destroy) {
2375                         gr->global_ctx_buffer[i].destroy(pdev,
2376                                         &gr->global_ctx_buffer[i]);
2377                 }
2378         }
2379         return -ENOMEM;
2380 }
2381
2382 static void gr_gk20a_free_global_ctx_buffers(struct gk20a *g)
2383 {
2384         struct platform_device *pdev = g->dev;
2385         struct gr_gk20a *gr = &g->gr;
2386         DEFINE_DMA_ATTRS(attrs);
2387         u32 i;
2388
2389         dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2390
2391         for (i = 0; i < NR_GLOBAL_CTX_BUF; i++) {
2392                 gr->global_ctx_buffer[i].destroy(pdev,
2393                                 &gr->global_ctx_buffer[i]);
2394         }
2395
2396         gk20a_dbg_fn("done");
2397 }
2398
2399 static int gr_gk20a_map_global_ctx_buffers(struct gk20a *g,
2400                                         struct channel_gk20a *c)
2401 {
2402         struct vm_gk20a *ch_vm = c->vm;
2403         u64 *g_bfr_va = c->ch_ctx.global_ctx_buffer_va;
2404         u64 *g_bfr_size = c->ch_ctx.global_ctx_buffer_size;
2405         struct gr_gk20a *gr = &g->gr;
2406         struct sg_table *sgt;
2407         u64 size;
2408         u64 gpu_va;
2409         u32 i;
2410         gk20a_dbg_fn("");
2411
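        /*
         * For the circular, attribute and pagepool buffers, the VPR
         * variant is mapped when the channel is VPR and the platform
         * actually provided a secure allocation; otherwise we quietly
         * fall back to the normal buffer.
         */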
2412         /* Circular Buffer */
2413         if (!c->vpr || (gr->global_ctx_buffer[CIRCULAR_VPR].sgt == NULL)) {
2414                 sgt = gr->global_ctx_buffer[CIRCULAR].sgt;
2415                 size = gr->global_ctx_buffer[CIRCULAR].size;
2416         } else {
2417                 sgt = gr->global_ctx_buffer[CIRCULAR_VPR].sgt;
2418                 size = gr->global_ctx_buffer[CIRCULAR_VPR].size;
2419         }
2420
2421         gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size,
2422                                 NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
2423                                 gk20a_mem_flag_none);
2424         if (!gpu_va)
2425                 goto clean_up;
2426         g_bfr_va[CIRCULAR_VA] = gpu_va;
2427         g_bfr_size[CIRCULAR_VA] = size;
2428
2429         /* Attribute Buffer */
2430         if (!c->vpr || (gr->global_ctx_buffer[ATTRIBUTE_VPR].sgt == NULL)) {
2431                 sgt = gr->global_ctx_buffer[ATTRIBUTE].sgt;
2432                 size = gr->global_ctx_buffer[ATTRIBUTE].size;
2433         } else {
2434                 sgt = gr->global_ctx_buffer[ATTRIBUTE_VPR].sgt;
2435                 size = gr->global_ctx_buffer[ATTRIBUTE_VPR].size;
2436         }
2437
2438         gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size,
2439                                 NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
2440                                 gk20a_mem_flag_none);
2441         if (!gpu_va)
2442                 goto clean_up;
2443         g_bfr_va[ATTRIBUTE_VA] = gpu_va;
2444         g_bfr_size[ATTRIBUTE_VA] = size;
2445
2446         /* Page Pool */
2447         if (!c->vpr || (gr->global_ctx_buffer[PAGEPOOL_VPR].sgt == NULL)) {
2448                 sgt = gr->global_ctx_buffer[PAGEPOOL].sgt;
2449                 size = gr->global_ctx_buffer[PAGEPOOL].size;
2450         } else {
2451                 sgt = gr->global_ctx_buffer[PAGEPOOL_VPR].sgt;
2452                 size = gr->global_ctx_buffer[PAGEPOOL_VPR].size;
2453         }
2454
2455         gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size,
2456                                 NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
2457                                 gk20a_mem_flag_none);
2458         if (!gpu_va)
2459                 goto clean_up;
2460         g_bfr_va[PAGEPOOL_VA] = gpu_va;
2461         g_bfr_size[PAGEPOOL_VA] = size;
2462
2463         /* Golden Image */
2464         sgt = gr->global_ctx_buffer[GOLDEN_CTX].sgt;
2465         size = gr->global_ctx_buffer[GOLDEN_CTX].size;
2466         gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size, 0,
2467                                 gk20a_mem_flag_none);
2468         if (!gpu_va)
2469                 goto clean_up;
2470         g_bfr_va[GOLDEN_CTX_VA] = gpu_va;
2471         g_bfr_size[GOLDEN_CTX_VA] = size;
2472
2473         /* Priv register Access Map */
2474         sgt = gr->global_ctx_buffer[PRIV_ACCESS_MAP].sgt;
2475         size = gr->global_ctx_buffer[PRIV_ACCESS_MAP].size;
2476         gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size, 0,
2477                                 gk20a_mem_flag_none);
2478         if (!gpu_va)
2479                 goto clean_up;
2480         g_bfr_va[PRIV_ACCESS_MAP_VA] = gpu_va;
2481         g_bfr_size[PRIV_ACCESS_MAP_VA] = size;
2482
2483         c->ch_ctx.global_ctx_buffer_mapped = true;
2484         return 0;
2485
2486  clean_up:
2487         for (i = 0; i < NR_GLOBAL_CTX_BUF_VA; i++) {
2488                 if (g_bfr_va[i]) {
2489                         gk20a_gmmu_unmap(ch_vm, g_bfr_va[i],
2490                                          gr->global_ctx_buffer[i].size,
2491                                          gk20a_mem_flag_none);
2492                         g_bfr_va[i] = 0;
2493                 }
2494         }
2495         return -ENOMEM;
2496 }
2497
2498 static void gr_gk20a_unmap_global_ctx_buffers(struct channel_gk20a *c)
2499 {
2500         struct vm_gk20a *ch_vm = c->vm;
2501         u64 *g_bfr_va = c->ch_ctx.global_ctx_buffer_va;
2502         u64 *g_bfr_size = c->ch_ctx.global_ctx_buffer_size;
2503         u32 i;
2504
2505         gk20a_dbg_fn("");
2506
2507         for (i = 0; i < NR_GLOBAL_CTX_BUF_VA; i++) {
2508                 if (g_bfr_va[i]) {
2509                         gk20a_gmmu_unmap(ch_vm, g_bfr_va[i],
2510                                          g_bfr_size[i],
2511                                          gk20a_mem_flag_none);
2512                         g_bfr_va[i] = 0;
2513                         g_bfr_size[i] = 0;
2514                 }
2515         }
2516         c->ch_ctx.global_ctx_buffer_mapped = false;
2517 }
2518
2519 static int gr_gk20a_alloc_channel_gr_ctx(struct gk20a *g,
2520                                 struct channel_gk20a *c)
2521 {
2522         struct gr_gk20a *gr = &g->gr;
2523         struct gr_ctx_desc *gr_ctx = &c->ch_ctx.gr_ctx;
2524         struct vm_gk20a *ch_vm = c->vm;
2525         struct device *d = dev_from_gk20a(g);
2526         struct sg_table *sgt;
2527         DEFINE_DMA_ATTRS(attrs);
2528         int err = 0;
2529         dma_addr_t iova;
2530
2531         gk20a_dbg_fn("");
2532
2533         if (gr->ctx_vars.buffer_size == 0)
2534                 return 0;
2535
2536         /* alloc channel gr ctx buffer */
2537         gr->ctx_vars.buffer_size = gr->ctx_vars.golden_image_size;
2538         gr->ctx_vars.buffer_total_size = gr->ctx_vars.golden_image_size;
2539
2540         gr_ctx->size = gr->ctx_vars.buffer_total_size;
2541         dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2542         gr_ctx->pages = dma_alloc_attrs(d, gr_ctx->size,
2543                                 &iova, GFP_KERNEL, &attrs);
2544         if (!gr_ctx->pages)
2545                 return -ENOMEM;
2546
2547         gr_ctx->iova = iova;
2548         err = gk20a_get_sgtable_from_pages(d, &sgt, gr_ctx->pages,
2549                         gr_ctx->iova, gr_ctx->size);
2550         if (err)
2551                 goto err_free;
2552
2553         gr_ctx->gpu_va = gk20a_gmmu_map(ch_vm, &sgt, gr_ctx->size,
2554                                 NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
2555                                 gk20a_mem_flag_none);
2556         if (!gr_ctx->gpu_va)
2557                 goto err_free_sgt;
2558
2559         gk20a_free_sgtable(&sgt);
2560
2561         return 0;
2562
2563  err_free_sgt:
2564         gk20a_free_sgtable(&sgt);
2565  err_free:
2566         dma_free_attrs(d, gr_ctx->size,
2567                 gr_ctx->pages, gr_ctx->iova, &attrs);
2568         gr_ctx->pages = NULL;
2569         gr_ctx->iova = 0;
2570
2571         return err;
2572 }
2573
2574 static void gr_gk20a_free_channel_gr_ctx(struct channel_gk20a *c)
2575 {
2576         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
2577         struct vm_gk20a *ch_vm = c->vm;
2578         struct gk20a *g = c->g;
2579         struct device *d = dev_from_gk20a(g);
2580         DEFINE_DMA_ATTRS(attrs);
2581
2582         gk20a_dbg_fn("");
2583
2584         if (!ch_ctx->gr_ctx.gpu_va)
2585                 return;
2586
2587         gk20a_gmmu_unmap(ch_vm, ch_ctx->gr_ctx.gpu_va,
2588                         ch_ctx->gr_ctx.size, gk20a_mem_flag_none);
2589         dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2590         dma_free_attrs(d, ch_ctx->gr_ctx.size,
2591                 ch_ctx->gr_ctx.pages, ch_ctx->gr_ctx.iova, &attrs);
2592         ch_ctx->gr_ctx.pages = NULL;
2593         ch_ctx->gr_ctx.iova = 0;
2594 }
2595
2596 static int gr_gk20a_alloc_channel_patch_ctx(struct gk20a *g,
2597                                 struct channel_gk20a *c)
2598 {
2599         struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
2600         struct device *d = dev_from_gk20a(g);
2601         struct vm_gk20a *ch_vm = c->vm;
2602         DEFINE_DMA_ATTRS(attrs);
2603         struct sg_table *sgt;
2604         int err = 0;
2605         dma_addr_t iova;
2606
2607         gk20a_dbg_fn("");
2608
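             /*
              * The patch context stores (register address, value) pairs
              * written via gr_gk20a_ctx_patch_write(); each entry appears to
              * take two words, so 128 words bounds the buffer at 64 such
              * per-context overrides.
              */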
2609         patch_ctx->size = 128 * sizeof(u32);
2610         dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2611         patch_ctx->pages = dma_alloc_attrs(d, patch_ctx->size,
2612                                 &iova, GFP_KERNEL,
2613                                 &attrs);
2614         if (!patch_ctx->pages)
2615                 return -ENOMEM;
2616
2617         patch_ctx->iova = iova;
2618         err = gk20a_get_sgtable_from_pages(d, &sgt, patch_ctx->pages,
2619                         patch_ctx->iova, patch_ctx->size);
2620         if (err)
2621                 goto err_free;
2622
2623         patch_ctx->gpu_va = gk20a_gmmu_map(ch_vm, &sgt, patch_ctx->size,
2624                                         0, gk20a_mem_flag_none);
2625         if (!patch_ctx->gpu_va) {
2626                 err = -ENOMEM;
                     goto err_free_sgtable;
             }
2627
2628         gk20a_free_sgtable(&sgt);
2629
2630         gk20a_dbg_fn("done");
2631         return 0;
2632
2633  err_free_sgtable:
2634         gk20a_free_sgtable(&sgt);
2635  err_free:
2636         dma_free_attrs(d, patch_ctx->size,
2637                 patch_ctx->pages, patch_ctx->iova, &attrs);
2638         patch_ctx->pages = NULL;
2639         patch_ctx->iova = 0;
2640         gk20a_err(dev_from_gk20a(g), "fail");
2641         return err;
2642 }
2643
2644 static void gr_gk20a_unmap_channel_patch_ctx(struct channel_gk20a *c)
2645 {
2646         struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
2647         struct vm_gk20a *ch_vm = c->vm;
2648
2649         gk20a_dbg_fn("");
2650
2651         if (patch_ctx->gpu_va)
2652                 gk20a_gmmu_unmap(ch_vm, patch_ctx->gpu_va,
2653                         patch_ctx->size, gk20a_mem_flag_none);
2654         patch_ctx->gpu_va = 0;
2655         patch_ctx->data_count = 0;
2656 }
2657
2658 static void gr_gk20a_free_channel_patch_ctx(struct channel_gk20a *c)
2659 {
2660         struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
2661         struct gk20a *g = c->g;
2662         struct device *d = dev_from_gk20a(g);
2663         DEFINE_DMA_ATTRS(attrs);
2664
2665         gk20a_dbg_fn("");
2666
2667         gr_gk20a_unmap_channel_patch_ctx(c);
2668
2669         if (patch_ctx->pages) {
2670                 dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2671                 dma_free_attrs(d, patch_ctx->size,
2672                         patch_ctx->pages, patch_ctx->iova, &attrs);
2673                 patch_ctx->pages = NULL;
2674                 patch_ctx->iova = 0;
2675         }
2676 }
2677
2678 void gk20a_free_channel_ctx(struct channel_gk20a *c)
2679 {
2680         gr_gk20a_unmap_global_ctx_buffers(c);
2681         gr_gk20a_free_channel_patch_ctx(c);
2682         gr_gk20a_free_channel_gr_ctx(c);
2683
2684         /* zcull_ctx, pm_ctx */
2685
2686         memset(&c->ch_ctx, 0, sizeof(struct channel_ctx_gk20a));
2687
2688         c->num_objects = 0;
2689         c->first_init = false;
2690 }
2691
2692 static bool gr_gk20a_is_valid_class(struct gk20a *g, u32 class_num)
2693 {
2694         bool valid = false;
2695
2696         switch (class_num) {
2697         case KEPLER_COMPUTE_A:
2698         case KEPLER_C:
2699         case FERMI_TWOD_A:
2700         case KEPLER_DMA_COPY_A:
2701                 valid = true;
2702                 break;
2703
2704         default:
2705                 break;
2706         }
2707
2708         return valid;
2709 }
2710
2711 int gk20a_alloc_obj_ctx(struct channel_gk20a  *c,
2712                         struct nvhost_alloc_obj_ctx_args *args)
2713 {
2714         struct gk20a *g = c->g;
2715         struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
2716         int err = 0;
2717
2718         gk20a_dbg_fn("");
2719
2720         /* an address space needs to have been bound at this point.*/
2721         if (!gk20a_channel_as_bound(c)) {
2722                 gk20a_err(dev_from_gk20a(g),
2723                            "not bound to address space at time"
2724                            " of grctx allocation");
2725                 return -EINVAL;
2726         }
2727
2728         if (!g->ops.gr.is_valid_class(g, args->class_num)) {
2729                 gk20a_err(dev_from_gk20a(g),
2730                            "invalid obj class 0x%x", args->class_num);
2731                 err = -EINVAL;
2732                 goto out;
2733         }
2734
2735         /* allocate gr ctx buffer */
2736         if (ch_ctx->gr_ctx.pages == NULL) {
2737                 err = gr_gk20a_alloc_channel_gr_ctx(g, c);
2738                 if (err) {
2739                         gk20a_err(dev_from_gk20a(g),
2740                                 "fail to allocate gr ctx buffer");
2741                         goto out;
2742                 }
2743                 c->obj_class = args->class_num;
2744         } else {
2745                 /* TBD: needs to be more subtle about which class is being allocated,
2746                  * as some are allowed to be allocated on the same channel */
2747                 gk20a_err(dev_from_gk20a(g),
2748                         "too many classes alloc'd on same channel");
2749                 err = -EINVAL;
2750                 goto out;
2751         }
2752
2753         /* commit gr ctx buffer */
2754         err = gr_gk20a_commit_inst(c, ch_ctx->gr_ctx.gpu_va);
2755         if (err) {
2756                 gk20a_err(dev_from_gk20a(g),
2757                         "fail to commit gr ctx buffer");
2758                 goto out;
2759         }
2760
2761         /* allocate patch buffer */
2762         if (ch_ctx->patch_ctx.pages == NULL) {
2763                 err = gr_gk20a_alloc_channel_patch_ctx(g, c);
2764                 if (err) {
2765                         gk20a_err(dev_from_gk20a(g),
2766                                 "fail to allocate patch buffer");
2767                         goto out;
2768                 }
2769         }
2770
2771         /* map global buffer to channel gpu_va and commit */
2772         if (!ch_ctx->global_ctx_buffer_mapped) {
2773                 err = gr_gk20a_map_global_ctx_buffers(g, c);
2774                 if (err) {
2775                         gk20a_err(dev_from_gk20a(g),
2776                                 "fail to map global ctx buffer");
2777                         goto out;
2778                 }
2779                 gr_gk20a_elpg_protected_call(g,
2780                         gr_gk20a_commit_global_ctx_buffers(g, c, true));
2781         }
2782
2783         /* tweak any perf parameters per-context here */
2784         if (args->class_num == KEPLER_COMPUTE_A) {
2785                 int begin_err;
2786                 u32 tex_lock_disable_mask =
2787                         gr_gpcs_tpcs_sm_sch_texlock_tex_hash_m()         |
2788                         gr_gpcs_tpcs_sm_sch_texlock_tex_hash_tile_m()    |
2789                         gr_gpcs_tpcs_sm_sch_texlock_tex_hash_phase_m()   |
2790                         gr_gpcs_tpcs_sm_sch_texlock_tex_hash_tex_m()     |
2791                         gr_gpcs_tpcs_sm_sch_texlock_tex_hash_timeout_m() |
2792                         gr_gpcs_tpcs_sm_sch_texlock_dot_t_unlock_m();
2793
2794                 u32 texlock = gk20a_readl(g, gr_gpcs_tpcs_sm_sch_texlock_r());
2795
2796                 texlock = (texlock & ~tex_lock_disable_mask) |
2797                 (gr_gpcs_tpcs_sm_sch_texlock_tex_hash_disable_f()         |
2798                  gr_gpcs_tpcs_sm_sch_texlock_tex_hash_tile_disable_f()    |
2799                  gr_gpcs_tpcs_sm_sch_texlock_tex_hash_phase_disable_f()   |
2800                  gr_gpcs_tpcs_sm_sch_texlock_tex_hash_tex_disable_f()     |
2801                  gr_gpcs_tpcs_sm_sch_texlock_tex_hash_timeout_disable_f() |
2802                  gr_gpcs_tpcs_sm_sch_texlock_dot_t_unlock_disable_f());
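                     /*
                      * Read-modify-write: the tex hash fields are cleared from
                      * the current texlock value and replaced with the
                      * *_disable_f() settings.  Writing the result through the
                      * patch context below (rather than straight to the
                      * register) keeps the override tied to this channel's
                      * context rather than being a one-shot register write.
                      */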
2803
2804                 begin_err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
2805
2806                 if (!begin_err) {
2807                         err = gr_gk20a_ctx_patch_write(g, ch_ctx,
2808                                 gr_gpcs_tpcs_sm_sch_texlock_r(),
2809                                 texlock, true);
2810                 }
2811                 if (begin_err || err) {
2812                         gk20a_err(dev_from_gk20a(g),
2813                                    "failed to set texlock for compute class");
2814                 }
2815                 if (!begin_err)
2816                         gr_gk20a_ctx_patch_write_end(g, ch_ctx);
2817         }
2818
2819         /* init golden image, ELPG enabled after this is done */
2820         err = gr_gk20a_init_golden_ctx_image(g, c);
2821         if (err) {
2822                 gk20a_err(dev_from_gk20a(g),
2823                         "fail to init golden ctx image");
2824                 goto out;
2825         }
2826
2827         /* load golden image */
2828         if (!c->first_init) {
2829                 err = gr_gk20a_elpg_protected_call(g,
2830                         gr_gk20a_load_golden_ctx_image(g, c));
2831                 if (err) {
2832                         gk20a_err(dev_from_gk20a(g),
2833                                 "fail to load golden ctx image");
2834                         goto out;
2835                 }
2836                 c->first_init = true;
2837         }
2838
2839         c->num_objects++;
2840
2841         gk20a_dbg_fn("done");
2842         return 0;
2843 out:
2844         /* 1. gr_ctx, patch_ctx and the global ctx buffer mappings
2845            can be reused, so there is no need to release them.
2846            2. golden image init and load are one-time operations, so
2847            if they passed there is nothing to undo. */
2848         gk20a_err(dev_from_gk20a(g), "fail");
2849         return err;
2850 }
2851
2852 int gk20a_free_obj_ctx(struct channel_gk20a  *c,
2853                        struct nvhost_free_obj_ctx_args *args)
2854 {
2855         unsigned long timeout = gk20a_get_gr_idle_timeout(c->g);
2856
2857         gk20a_dbg_fn("");
2858
2859         if (c->num_objects == 0)
2860                 return 0;
2861
2862         c->num_objects--;
2863
2864         if (c->num_objects == 0) {
2865                 c->first_init = false;
2866                 gk20a_disable_channel(c,
2867                         !c->has_timedout,
2868                         timeout);
2869                 gr_gk20a_unmap_channel_patch_ctx(c);
2870         }
2871
2872         return 0;
2873 }
2874
2875 static void gk20a_remove_gr_support(struct gr_gk20a *gr)
2876 {
2877         struct gk20a *g = gr->g;
2878         struct device *d = dev_from_gk20a(g);
2879         DEFINE_DMA_ATTRS(attrs);
2880
2881         gk20a_dbg_fn("");
2882
2883         gr_gk20a_free_global_ctx_buffers(g);
2884
2885         dma_free_coherent(d, gr->mmu_wr_mem.size,
2886                 gr->mmu_wr_mem.cpuva, gr->mmu_wr_mem.iova);
2887         gr->mmu_wr_mem.cpuva = NULL;
2888         gr->mmu_wr_mem.iova = 0;
2889         dma_free_coherent(d, gr->mmu_rd_mem.size,
2890                 gr->mmu_rd_mem.cpuva, gr->mmu_rd_mem.iova);
2891         gr->mmu_rd_mem.cpuva = NULL;
2892         gr->mmu_rd_mem.iova = 0;
2893
2894         dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2895         dma_free_attrs(d, gr->compbit_store.size, gr->compbit_store.pages,
2896                         gr->compbit_store.base_iova, &attrs);
2897
2898         memset(&gr->mmu_wr_mem, 0, sizeof(struct mmu_desc));
2899         memset(&gr->mmu_rd_mem, 0, sizeof(struct mmu_desc));
2900         memset(&gr->compbit_store, 0, sizeof(struct compbit_store_desc));
2901
2902         kfree(gr->gpc_tpc_count);
2903         kfree(gr->gpc_zcb_count);
2904         kfree(gr->gpc_ppc_count);
2905         kfree(gr->pes_tpc_count[0]);
2906         kfree(gr->pes_tpc_count[1]);
2907         kfree(gr->pes_tpc_mask[0]);
2908         kfree(gr->pes_tpc_mask[1]);
2909         kfree(gr->gpc_skip_mask);
2910         kfree(gr->map_tiles);
2911         gr->gpc_tpc_count = NULL;
2912         gr->gpc_zcb_count = NULL;
2913         gr->gpc_ppc_count = NULL;
2914         gr->pes_tpc_count[0] = NULL;
2915         gr->pes_tpc_count[1] = NULL;
2916         gr->pes_tpc_mask[0] = NULL;
2917         gr->pes_tpc_mask[1] = NULL;
2918         gr->gpc_skip_mask = NULL;
2919         gr->map_tiles = NULL;
2920
2921         kfree(gr->ctx_vars.ucode.fecs.inst.l);
2922         kfree(gr->ctx_vars.ucode.fecs.data.l);
2923         kfree(gr->ctx_vars.ucode.gpccs.inst.l);
2924         kfree(gr->ctx_vars.ucode.gpccs.data.l);
2925         kfree(gr->ctx_vars.sw_bundle_init.l);
2926         kfree(gr->ctx_vars.sw_method_init.l);
2927         kfree(gr->ctx_vars.sw_ctx_load.l);
2928         kfree(gr->ctx_vars.sw_non_ctx_load.l);
2929         kfree(gr->ctx_vars.ctxsw_regs.sys.l);
2930         kfree(gr->ctx_vars.ctxsw_regs.gpc.l);
2931         kfree(gr->ctx_vars.ctxsw_regs.tpc.l);
2932         kfree(gr->ctx_vars.ctxsw_regs.zcull_gpc.l);
2933         kfree(gr->ctx_vars.ctxsw_regs.ppc.l);
2934         kfree(gr->ctx_vars.ctxsw_regs.pm_sys.l);
2935         kfree(gr->ctx_vars.ctxsw_regs.pm_gpc.l);
2936         kfree(gr->ctx_vars.ctxsw_regs.pm_tpc.l);
2937
2938         kfree(gr->ctx_vars.local_golden_image);
2939         gr->ctx_vars.local_golden_image = NULL;
2940
2941         gk20a_allocator_destroy(&gr->comp_tags);
2942 }
2943
2944 static void gr_gk20a_bundle_cb_defaults(struct gk20a *g)
2945 {
2946         struct gr_gk20a *gr = &g->gr;
2947
2948         gr->bundle_cb_default_size =
2949                 gr_scc_bundle_cb_size_div_256b__prod_v();
2950         gr->min_gpm_fifo_depth =
2951                 gr_pd_ab_dist_cfg2_state_limit_min_gpm_fifo_depths_v();
2952         gr->bundle_cb_token_limit =
2953                 gr_pd_ab_dist_cfg2_token_limit_init_v();
2954 }
2955
2956 static int gr_gk20a_init_gr_config(struct gk20a *g, struct gr_gk20a *gr)
2957 {
2958         u32 gpc_index, pes_index;
2959         u32 pes_tpc_mask;
2960         u32 pes_tpc_count;
2961         u32 pes_heavy_index;
2962         u32 gpc_new_skip_mask;
2963         u32 tmp;
2964
2965         tmp = gk20a_readl(g, pri_ringmaster_enum_fbp_r());
2966         gr->num_fbps = pri_ringmaster_enum_fbp_count_v(tmp);
2967
2968         tmp = gk20a_readl(g, top_num_gpcs_r());
2969         gr->max_gpc_count = top_num_gpcs_value_v(tmp);
2970
2971         tmp = gk20a_readl(g, top_num_fbps_r());
2972         gr->max_fbps_count = top_num_fbps_value_v(tmp);
2973
2974         tmp = gk20a_readl(g, top_tpc_per_gpc_r());
2975         gr->max_tpc_per_gpc_count = top_tpc_per_gpc_value_v(tmp);
2976
2977         gr->max_tpc_count = gr->max_gpc_count * gr->max_tpc_per_gpc_count;
2978
2979         tmp = gk20a_readl(g, top_num_fbps_r());
2980         gr->sys_count = top_num_fbps_value_v(tmp);
2981
2982         tmp = gk20a_readl(g, pri_ringmaster_enum_gpc_r());
2983         gr->gpc_count = pri_ringmaster_enum_gpc_count_v(tmp);
2984
2985         gr->pe_count_per_gpc = proj_scal_litter_num_pes_per_gpc_v();
2986         gr->max_zcull_per_gpc_count = proj_scal_litter_num_zcull_banks_v();
2987
2988         if (!gr->gpc_count) {
2989                 gk20a_err(dev_from_gk20a(g), "gpc_count==0!");
2990                 goto clean_up;
2991         }
2992
2993         gr->gpc_tpc_count = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2994         gr->gpc_zcb_count = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2995         gr->gpc_ppc_count = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2996         gr->pes_tpc_count[0] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2997         gr->pes_tpc_count[1] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2998         gr->pes_tpc_mask[0] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2999         gr->pes_tpc_mask[1] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
3000         gr->gpc_skip_mask =
3001                 kzalloc(gr_pd_dist_skip_table__size_1_v() * 4 * sizeof(u32),
3002                         GFP_KERNEL);
3003
3004         if (!gr->gpc_tpc_count || !gr->gpc_zcb_count || !gr->gpc_ppc_count ||
3005             !gr->pes_tpc_count[0] || !gr->pes_tpc_count[1] ||
3006             !gr->pes_tpc_mask[0] || !gr->pes_tpc_mask[1] || !gr->gpc_skip_mask)
3007                 goto clean_up;
3008
3009         gr->ppc_count = 0;
3010         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3011                 tmp = gk20a_readl(g, gr_gpc0_fs_gpc_r());
3012
3013                 gr->gpc_tpc_count[gpc_index] =
3014                         gr_gpc0_fs_gpc_num_available_tpcs_v(tmp);
3015                 gr->tpc_count += gr->gpc_tpc_count[gpc_index];
3016
3017                 gr->gpc_zcb_count[gpc_index] =
3018                         gr_gpc0_fs_gpc_num_available_zculls_v(tmp);
3019                 gr->zcb_count += gr->gpc_zcb_count[gpc_index];
3020
3021                 gr->gpc_ppc_count[gpc_index] = gr->pe_count_per_gpc;
3022                 gr->ppc_count += gr->gpc_ppc_count[gpc_index];
3023                 for (pes_index = 0; pes_index < gr->pe_count_per_gpc; pes_index++) {
3024
3025                         tmp = gk20a_readl(g,
3026                                 gr_gpc0_gpm_pd_pes_tpc_id_mask_r(pes_index) +
3027                                 gpc_index * proj_gpc_stride_v());
3028
3029                         pes_tpc_mask = gr_gpc0_gpm_pd_pes_tpc_id_mask_mask_v(tmp);
3030                         pes_tpc_count = count_bits(pes_tpc_mask);
3031
3032                         gr->pes_tpc_count[pes_index][gpc_index] = pes_tpc_count;
3033                         gr->pes_tpc_mask[pes_index][gpc_index] = pes_tpc_mask;
3034                 }
3035
3036                 gpc_new_skip_mask = 0;
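                     /*
                      * Note: x ^ (x & (x - 1)) below evaluates to the lowest
                      * set bit of x.  Marking one TPC of the heavier PES in
                      * the skip mask presumably evens out the TPC load
                      * between the two PES units of this GPC.
                      */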
3037                 if (gr->pes_tpc_count[0][gpc_index] +
3038                     gr->pes_tpc_count[1][gpc_index] == 5) {
3039                         pes_heavy_index =
3040                                 gr->pes_tpc_count[0][gpc_index] >
3041                                 gr->pes_tpc_count[1][gpc_index] ? 0 : 1;
3042
3043                         gpc_new_skip_mask =
3044                                 gr->pes_tpc_mask[pes_heavy_index][gpc_index] ^
3045                                    (gr->pes_tpc_mask[pes_heavy_index][gpc_index] &
3046                                    (gr->pes_tpc_mask[pes_heavy_index][gpc_index] - 1));
3047
3048                 } else if ((gr->pes_tpc_count[0][gpc_index] +
3049                             gr->pes_tpc_count[1][gpc_index] == 4) &&
3050                            (gr->pes_tpc_count[0][gpc_index] !=
3051                             gr->pes_tpc_count[1][gpc_index])) {
3052                         pes_heavy_index =
3053                                 gr->pes_tpc_count[0][gpc_index] >
3054                                 gr->pes_tpc_count[1][gpc_index] ? 0 : 1;
3055
3056                         gpc_new_skip_mask =
3057                                 gr->pes_tpc_mask[pes_heavy_index][gpc_index] ^
3058                                    (gr->pes_tpc_mask[pes_heavy_index][gpc_index] &
3059                                    (gr->pes_tpc_mask[pes_heavy_index][gpc_index] - 1));
3060                 }
3061                 gr->gpc_skip_mask[gpc_index] = gpc_new_skip_mask;
3062         }
3063
3064         gk20a_dbg_info("fbps: %d", gr->num_fbps);
3065         gk20a_dbg_info("max_gpc_count: %d", gr->max_gpc_count);
3066         gk20a_dbg_info("max_fbps_count: %d", gr->max_fbps_count);
3067         gk20a_dbg_info("max_tpc_per_gpc_count: %d", gr->max_tpc_per_gpc_count);
3068         gk20a_dbg_info("max_zcull_per_gpc_count: %d", gr->max_zcull_per_gpc_count);
3069         gk20a_dbg_info("max_tpc_count: %d", gr->max_tpc_count);
3070         gk20a_dbg_info("sys_count: %d", gr->sys_count);
3071         gk20a_dbg_info("gpc_count: %d", gr->gpc_count);
3072         gk20a_dbg_info("pe_count_per_gpc: %d", gr->pe_count_per_gpc);
3073         gk20a_dbg_info("tpc_count: %d", gr->tpc_count);
3074         gk20a_dbg_info("ppc_count: %d", gr->ppc_count);
3075
3076         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3077                 gk20a_dbg_info("gpc_tpc_count[%d] : %d",
3078                            gpc_index, gr->gpc_tpc_count[gpc_index]);
3079         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3080                 gk20a_dbg_info("gpc_zcb_count[%d] : %d",
3081                            gpc_index, gr->gpc_zcb_count[gpc_index]);
3082         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3083                 gk20a_dbg_info("gpc_ppc_count[%d] : %d",
3084                            gpc_index, gr->gpc_ppc_count[gpc_index]);
3085         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3086                 gk20a_dbg_info("gpc_skip_mask[%d] : %d",
3087                            gpc_index, gr->gpc_skip_mask[gpc_index]);
3088         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3089                 for (pes_index = 0;
3090                      pes_index < gr->pe_count_per_gpc;
3091                      pes_index++)
3092                         gk20a_dbg_info("pes_tpc_count[%d][%d] : %d",
3093                                    pes_index, gpc_index,
3094                                    gr->pes_tpc_count[pes_index][gpc_index]);
3095
3096         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3097                 for (pes_index = 0;
3098                      pes_index < gr->pe_count_per_gpc;
3099                      pes_index++)
3100                         gk20a_dbg_info("pes_tpc_mask[%d][%d] : %d",
3101                                    pes_index, gpc_index,
3102                                    gr->pes_tpc_mask[pes_index][gpc_index]);
3103
3104         g->ops.gr.bundle_cb_defaults(g);
3105         g->ops.gr.cb_size_default(g);
3106         g->ops.gr.calc_global_ctx_buffer_size(g);
3107         gr->timeslice_mode = gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v();
3108
3109         gk20a_dbg_info("bundle_cb_default_size: %d",
3110                    gr->bundle_cb_default_size);
3111         gk20a_dbg_info("min_gpm_fifo_depth: %d", gr->min_gpm_fifo_depth);
3112         gk20a_dbg_info("bundle_cb_token_limit: %d", gr->bundle_cb_token_limit);
3113         gk20a_dbg_info("attrib_cb_default_size: %d",
3114                    gr->attrib_cb_default_size);
3115         gk20a_dbg_info("attrib_cb_size: %d", gr->attrib_cb_size);
3116         gk20a_dbg_info("alpha_cb_default_size: %d", gr->alpha_cb_default_size);
3117         gk20a_dbg_info("alpha_cb_size: %d", gr->alpha_cb_size);
3118         gk20a_dbg_info("timeslice_mode: %d", gr->timeslice_mode);
3119
3120         return 0;
3121
3122 clean_up:
3123         return -ENOMEM;
3124 }
3125
3126 static int gr_gk20a_init_mmu_sw(struct gk20a *g, struct gr_gk20a *gr)
3127 {
3128         struct device *d = dev_from_gk20a(g);
3129         dma_addr_t iova;
3130
3131         gr->mmu_wr_mem_size = gr->mmu_rd_mem_size = 0x1000;
3132
3133         gr->mmu_wr_mem.size = gr->mmu_wr_mem_size;
3134         gr->mmu_wr_mem.cpuva = dma_zalloc_coherent(d, gr->mmu_wr_mem_size,
3135                                         &iova, GFP_KERNEL);
3136         if (!gr->mmu_wr_mem.cpuva)
3137                 goto err;
3138
3139         gr->mmu_wr_mem.iova = iova;
3140
3141         gr->mmu_rd_mem.size = gr->mmu_rd_mem_size;
3142         gr->mmu_rd_mem.cpuva = dma_zalloc_coherent(d, gr->mmu_rd_mem_size,
3143                                         &iova, GFP_KERNEL);
3144         if (!gr->mmu_rd_mem.cpuva)
3145                 goto err_free_wr_mem;
3146
3147         gr->mmu_rd_mem.iova = iova;
3148         return 0;
3149
3150  err_free_wr_mem:
3151         dma_free_coherent(d, gr->mmu_wr_mem.size,
3152                 gr->mmu_wr_mem.cpuva, gr->mmu_wr_mem.iova);
3153         gr->mmu_wr_mem.cpuva = NULL;
3154         gr->mmu_wr_mem.iova = 0;
3155  err:
3156         return -ENOMEM;
3157 }
3158
3159 static u32 prime_set[18] = {
3160         2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61 };
3161
3162 static int gr_gk20a_init_map_tiles(struct gk20a *g, struct gr_gk20a *gr)
3163 {
3164         s32 comm_denom;
3165         s32 mul_factor;
3166         s32 *init_frac = NULL;
3167         s32 *init_err = NULL;
3168         s32 *run_err = NULL;
3169         s32 *sorted_num_tpcs = NULL;
3170         s32 *sorted_to_unsorted_gpc_map = NULL;
3171         u32 gpc_index;
3172         u32 gpc_mark = 0;
3173         u32 num_tpc;
3174         u32 max_tpc_count = 0;
3175         u32 swap;
3176         u32 tile_count;
3177         u32 index;
3178         bool delete_map = false;
3179         bool gpc_sorted;
3180         int ret = 0;
3181
3182         init_frac = kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
3183         init_err = kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
3184         run_err = kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
3185         sorted_num_tpcs =
3186                 kzalloc(proj_scal_max_gpcs_v() *
3187                         proj_scal_max_tpc_per_gpc_v() * sizeof(s32),
3188                         GFP_KERNEL);
3189         sorted_to_unsorted_gpc_map =
3190                 kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
3191
3192         if (!(init_frac && init_err && run_err && sorted_num_tpcs &&
3193               sorted_to_unsorted_gpc_map)) {
3194                 ret = -ENOMEM;
3195                 goto clean_up;
3196         }
3197
3198         gr->map_row_offset = INVALID_SCREEN_TILE_ROW_OFFSET;
3199
3200         if (gr->tpc_count == 3)
3201                 gr->map_row_offset = 2;
3202         else if (gr->tpc_count < 3)
3203                 gr->map_row_offset = 1;
3204         else {
3205                 gr->map_row_offset = 3;
3206
3207                 for (index = 1; index < 18; index++) {
3208                         u32 prime = prime_set[index];
3209                         if ((gr->tpc_count % prime) != 0) {
3210                                 gr->map_row_offset = prime;
3211                                 break;
3212                         }
3213                 }
3214         }
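             /*
              * For more than three TPCs, map_row_offset is now the first prime
              * (from 3 upward) that does not evenly divide tpc_count,
              * presumably so that successive tile rows do not line up with the
              * TPC interleave pattern.  The switch below then overrides the
              * value for a few specific TPC counts.
              */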
3215
3216         switch (gr->tpc_count) {
3217         case 15:
3218                 gr->map_row_offset = 6;
3219                 break;
3220         case 14:
3221                 gr->map_row_offset = 5;
3222                 break;
3223         case 13:
3224                 gr->map_row_offset = 2;
3225                 break;
3226         case 11:
3227                 gr->map_row_offset = 7;
3228                 break;
3229         case 10:
3230                 gr->map_row_offset = 6;
3231                 break;
3232         case 7:
3233         case 5:
3234                 gr->map_row_offset = 1;
3235                 break;
3236         default:
3237                 break;
3238         }
3239
3240         if (gr->map_tiles) {
3241                 if (gr->map_tile_count != gr->tpc_count)
3242                         delete_map = true;
3243
3244                 for (tile_count = 0; tile_count < gr->map_tile_count; tile_count++) {
3245                         if ((u32)gr->map_tiles[tile_count] >= gr->tpc_count)
3246                                 delete_map = true;
3247                 }
3248
3249                 if (delete_map) {
3250                         kfree(gr->map_tiles);
3251                         gr->map_tiles = NULL;
3252                         gr->map_tile_count = 0;
3253                 }
3254         }
3255
3256         if (gr->map_tiles == NULL) {
3257                 gr->map_tile_count = proj_scal_max_gpcs_v();
3258
3259                 gr->map_tiles = kzalloc(proj_scal_max_gpcs_v() * sizeof(u8), GFP_KERNEL);
3260                 if (gr->map_tiles == NULL) {
3261                         ret = -ENOMEM;
3262                         goto clean_up;
3263                 }
3264
3265                 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3266                         sorted_num_tpcs[gpc_index] = gr->gpc_tpc_count[gpc_index];
3267                         sorted_to_unsorted_gpc_map[gpc_index] = gpc_index;
3268                 }
3269
3270                 gpc_sorted = false;
3271                 while (!gpc_sorted) {
3272                         gpc_sorted = true;
3273                         for (gpc_index = 0; gpc_index < gr->gpc_count - 1; gpc_index++) {
3274                                 if (sorted_num_tpcs[gpc_index + 1] > sorted_num_tpcs[gpc_index]) {
3275                                         gpc_sorted = false;
3276                                         swap = sorted_num_tpcs[gpc_index];
3277                                         sorted_num_tpcs[gpc_index] = sorted_num_tpcs[gpc_index + 1];
3278                                         sorted_num_tpcs[gpc_index + 1] = swap;
3279                                         swap = sorted_to_unsorted_gpc_map[gpc_index];
3280                                         sorted_to_unsorted_gpc_map[gpc_index] =
3281                                                 sorted_to_unsorted_gpc_map[gpc_index + 1];
3282                                         sorted_to_unsorted_gpc_map[gpc_index + 1] = swap;
3283                                 }
3284                         }
3285                 }
3286
3287                 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3288                         if (gr->gpc_tpc_count[gpc_index] > max_tpc_count)
3289                                 max_tpc_count = gr->gpc_tpc_count[gpc_index];
3290
3291                 mul_factor = gr->gpc_count * max_tpc_count;
3292                 if (mul_factor & 0x1)
3293                         mul_factor = 2;
3294                 else
3295                         mul_factor = 1;
3296
3297                 comm_denom = gr->gpc_count * max_tpc_count * mul_factor;
3298
3299                 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3300                         num_tpc = sorted_num_tpcs[gpc_index];
3301
3302                         init_frac[gpc_index] = num_tpc * gr->gpc_count * mul_factor;
3303
3304                         if (num_tpc != 0)
3305                                 init_err[gpc_index] = gpc_index * max_tpc_count * mul_factor - comm_denom/2;
3306                         else
3307                                 init_err[gpc_index] = 0;
3308
3309                         run_err[gpc_index] = init_frac[gpc_index] + init_err[gpc_index];
3310                 }
3311
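                     /*
                      * Roughly a Bresenham-style error accumulation: each GPC
                      * adds its TPC fraction (init_frac) to run_err every pass
                      * and emits one map tile whenever the accumulated value
                      * reaches half of comm_denom, so GPCs with more TPCs show
                      * up proportionally more often in the interleaved map.
                      */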
3312                 while (gpc_mark < gr->tpc_count) {
3313                         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3314                                 if ((run_err[gpc_index] * 2) >= comm_denom) {
3315                                         gr->map_tiles[gpc_mark++] = (u8)sorted_to_unsorted_gpc_map[gpc_index];
3316                                         run_err[gpc_index] += init_frac[gpc_index] - comm_denom;
3317                                 } else
3318                                         run_err[gpc_index] += init_frac[gpc_index];
3319                         }
3320                 }
3321         }
3322
3323 clean_up:
3324         kfree(init_frac);
3325         kfree(init_err);
3326         kfree(run_err);
3327         kfree(sorted_num_tpcs);
3328         kfree(sorted_to_unsorted_gpc_map);
3329
3330         if (ret)
3331                 gk20a_err(dev_from_gk20a(g), "fail");
3332         else
3333                 gk20a_dbg_fn("done");
3334
3335         return ret;
3336 }
3337
3338 static int gr_gk20a_init_zcull(struct gk20a *g, struct gr_gk20a *gr)
3339 {
3340         struct gr_zcull_gk20a *zcull = &gr->zcull;
3341
3342         zcull->aliquot_width = gr->tpc_count * 16;
3343         zcull->aliquot_height = 16;
3344
3345         zcull->width_align_pixels = gr->tpc_count * 16;
3346         zcull->height_align_pixels = 32;
3347
3348         zcull->aliquot_size =
3349                 zcull->aliquot_width * zcull->aliquot_height;
3350
3351         /* assume no floor sweeping since we only have 1 tpc in 1 gpc */
3352         zcull->pixel_squares_by_aliquots =
3353                 gr->zcb_count * 16 * 16 * gr->tpc_count /
3354                 (gr->gpc_count * gr->gpc_tpc_count[0]);
3355
3356         zcull->total_aliquots =
3357                 gr_gpc0_zcull_total_ram_size_num_aliquots_f(
3358                         gk20a_readl(g, gr_gpc0_zcull_total_ram_size_r()));
3359
3360         return 0;
3361 }
3362
3363 u32 gr_gk20a_get_ctxsw_zcull_size(struct gk20a *g, struct gr_gk20a *gr)
3364 {
3365         /* assuming gr has already been initialized */
3366         return gr->ctx_vars.zcull_ctxsw_image_size;
3367 }
3368
3369 int gr_gk20a_bind_ctxsw_zcull(struct gk20a *g, struct gr_gk20a *gr,
3370                         struct channel_gk20a *c, u64 zcull_va, u32 mode)
3371 {
3372         struct zcull_ctx_desc *zcull_ctx = &c->ch_ctx.zcull_ctx;
3373
3374         zcull_ctx->ctx_sw_mode = mode;
3375         zcull_ctx->gpu_va = zcull_va;
3376
3377         /* TBD: don't disable channel in sw method processing */
3378         return gr_gk20a_ctx_zcull_setup(g, c, true);
3379 }
3380
3381 int gr_gk20a_get_zcull_info(struct gk20a *g, struct gr_gk20a *gr,
3382                         struct gr_zcull_info *zcull_params)
3383 {
3384         struct gr_zcull_gk20a *zcull = &gr->zcull;
3385
3386         zcull_params->width_align_pixels = zcull->width_align_pixels;
3387         zcull_params->height_align_pixels = zcull->height_align_pixels;
3388         zcull_params->pixel_squares_by_aliquots =
3389                 zcull->pixel_squares_by_aliquots;
3390         zcull_params->aliquot_total = zcull->total_aliquots;
3391
3392         zcull_params->region_byte_multiplier =
3393                 gr->gpc_count * gr_zcull_bytes_per_aliquot_per_gpu_v();
3394         zcull_params->region_header_size =
3395                 proj_scal_litter_num_gpcs_v() *
3396                 gr_zcull_save_restore_header_bytes_per_gpc_v();
3397
3398         zcull_params->subregion_header_size =
3399                 proj_scal_litter_num_gpcs_v() *
3400                 gr_zcull_save_restore_subregion_header_bytes_per_gpc_v();
3401
3402         zcull_params->subregion_width_align_pixels =
3403                 gr->tpc_count * gr_gpc0_zcull_zcsize_width_subregion__multiple_v();
3404         zcull_params->subregion_height_align_pixels =
3405                 gr_gpc0_zcull_zcsize_height_subregion__multiple_v();
3406         zcull_params->subregion_count = gr_zcull_subregion_qty_v();
3407
3408         return 0;
3409 }
3410
3411 static int gr_gk20a_add_zbc_color(struct gk20a *g, struct gr_gk20a *gr,
3412                                   struct zbc_entry *color_val, u32 index)
3413 {
3414         struct fifo_gk20a *f = &g->fifo;
3415         struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
3416         u32 i;
3417         unsigned long end_jiffies = jiffies +
3418                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
3419         u32 ret;
3420
3421         ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
3422         if (ret) {
3423                 gk20a_err(dev_from_gk20a(g),
3424                         "failed to disable gr engine activity\n");
3425                 return ret;
3426         }
3427
3428         ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
3429         if (ret) {
3430                 gk20a_err(dev_from_gk20a(g),
3431                         "failed to idle graphics\n");
3432                 goto clean_up;
3433         }
3434
3435         /* update l2 table */
3436         g->ops.ltc.set_zbc_color_entry(g, color_val, index);
3437
3438         /* update ds table */
3439         gk20a_writel(g, gr_ds_zbc_color_r_r(),
3440                 gr_ds_zbc_color_r_val_f(color_val->color_ds[0]));
3441         gk20a_writel(g, gr_ds_zbc_color_g_r(),
3442                 gr_ds_zbc_color_g_val_f(color_val->color_ds[1]));
3443         gk20a_writel(g, gr_ds_zbc_color_b_r(),
3444                 gr_ds_zbc_color_b_val_f(color_val->color_ds[2]));
3445         gk20a_writel(g, gr_ds_zbc_color_a_r(),
3446                 gr_ds_zbc_color_a_val_f(color_val->color_ds[3]));
3447
3448         gk20a_writel(g, gr_ds_zbc_color_fmt_r(),
3449                 gr_ds_zbc_color_fmt_val_f(color_val->format));
3450
3451         gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
3452                 gr_ds_zbc_tbl_index_val_f(index + GK20A_STARTOF_ZBC_TABLE));
3453
3454         /* trigger the write */
3455         gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
3456                 gr_ds_zbc_tbl_ld_select_c_f() |
3457                 gr_ds_zbc_tbl_ld_action_write_f() |
3458                 gr_ds_zbc_tbl_ld_trigger_active_f());
3459
3460         /* update local copy */
3461         for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3462                 gr->zbc_col_tbl[index].color_l2[i] = color_val->color_l2[i];
3463                 gr->zbc_col_tbl[index].color_ds[i] = color_val->color_ds[i];
3464         }
3465         gr->zbc_col_tbl[index].format = color_val->format;
3466         gr->zbc_col_tbl[index].ref_cnt++;
3467
3468 clean_up:
3469         ret = gk20a_fifo_enable_engine_activity(g, gr_info);
3470         if (ret) {
3471                 gk20a_err(dev_from_gk20a(g),
3472                         "failed to enable gr engine activity\n");
3473         }
3474
3475         return ret;
3476 }
3477
3478 static int gr_gk20a_add_zbc_depth(struct gk20a *g, struct gr_gk20a *gr,
3479                                 struct zbc_entry *depth_val, u32 index)
3480 {
3481         struct fifo_gk20a *f = &g->fifo;
3482         struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
3483         unsigned long end_jiffies = jiffies +
3484                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
3485         u32 ret;
3486
3487         ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
3488         if (ret) {
3489                 gk20a_err(dev_from_gk20a(g),
3490                         "failed to disable gr engine activity\n");
3491                 return ret;
3492         }
3493
3494         ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
3495         if (ret) {
3496                 gk20a_err(dev_from_gk20a(g),
3497                         "failed to idle graphics\n");
3498                 goto clean_up;
3499         }
3500
3501         /* update l2 table */
3502         g->ops.ltc.set_zbc_depth_entry(g, depth_val, index);
3503
3504         /* update ds table */
3505         gk20a_writel(g, gr_ds_zbc_z_r(),
3506                 gr_ds_zbc_z_val_f(depth_val->depth));
3507
3508         gk20a_writel(g, gr_ds_zbc_z_fmt_r(),
3509                 gr_ds_zbc_z_fmt_val_f(depth_val->format));
3510
3511         gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
3512                 gr_ds_zbc_tbl_index_val_f(index + GK20A_STARTOF_ZBC_TABLE));
3513
3514         /* trigger the write */
3515         gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
3516                 gr_ds_zbc_tbl_ld_select_z_f() |
3517                 gr_ds_zbc_tbl_ld_action_write_f() |
3518                 gr_ds_zbc_tbl_ld_trigger_active_f());
3519
3520         /* update local copy */
3521         gr->zbc_dep_tbl[index].depth = depth_val->depth;
3522         gr->zbc_dep_tbl[index].format = depth_val->format;
3523         gr->zbc_dep_tbl[index].ref_cnt++;
3524
3525 clean_up:
3526         ret = gk20a_fifo_enable_engine_activity(g, gr_info);
3527         if (ret) {
3528                 gk20a_err(dev_from_gk20a(g),
3529                         "failed to enable gr engine activity\n");
3530         }
3531
3532         return ret;
3533 }
3534
3535 void gr_gk20a_pmu_save_zbc(struct gk20a *g, u32 entries)
3536 {
3537         struct fifo_gk20a *f = &g->fifo;
3538         struct fifo_engine_info_gk20a *gr_info =
3539                 f->engine_info + ENGINE_GR_GK20A;
3540         unsigned long end_jiffies = jiffies +
3541                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
3542         u32 ret;
3543
3544         ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
3545         if (ret) {
3546                 gk20a_err(dev_from_gk20a(g),
3547                         "failed to disable gr engine activity\n");
3548                 return;
3549         }
3550
3551         ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
3552         if (ret) {
3553                 gk20a_err(dev_from_gk20a(g),
3554                         "failed to idle graphics\n");
3555                 goto clean_up;
3556         }
3557
3558         /* update zbc */
3559         gk20a_pmu_save_zbc(g, entries);
3560
3561 clean_up:
3562         ret = gk20a_fifo_enable_engine_activity(g, gr_info);
3563         if (ret) {
3564                 gk20a_err(dev_from_gk20a(g),
3565                         "failed to enable gr engine activity\n");
3566         }
3567
3568         return;
3569 }
3570
3571 int gr_gk20a_add_zbc(struct gk20a *g, struct gr_gk20a *gr,
3572                      struct zbc_entry *zbc_val)
3573 {
3574         struct zbc_color_table *c_tbl;
3575         struct zbc_depth_table *d_tbl;
3576         u32 i, ret = -ENOMEM;
3577         bool added = false;
3578         u32 entries;
3579
3580         /* no endian swap ? */
3581
3582         mutex_lock(&gr->zbc_lock);
3583         switch (zbc_val->type) {
3584         case GK20A_ZBC_TYPE_COLOR:
3585                 /* search existing tables */
3586                 for (i = 0; i < gr->max_used_color_index; i++) {
3587
3588                         c_tbl = &gr->zbc_col_tbl[i];
3589
3590                         if (c_tbl->ref_cnt && c_tbl->format == zbc_val->format &&
3591                             memcmp(c_tbl->color_ds, zbc_val->color_ds,
3592                                 sizeof(zbc_val->color_ds)) == 0) {
3593
3594                                 if (memcmp(c_tbl->color_l2, zbc_val->color_l2,
3595                                     sizeof(zbc_val->color_l2))) {
3596                                         gk20a_err(dev_from_gk20a(g),
3597                                                 "zbc l2 and ds color don't match with existing entries");
3598                                         ret = -EINVAL;
3599                                         goto err_mutex;
3600                                 }
3601                                 added = true;
3602                                 c_tbl->ref_cnt++;
3603                                 ret = 0;
3604                                 break;
3605                         }
3606                 }
3607                 /* add new table */
3608                 if (!added &&
3609                     gr->max_used_color_index < GK20A_ZBC_TABLE_SIZE) {
3610
3611                         c_tbl =
3612                             &gr->zbc_col_tbl[gr->max_used_color_index];
3613                         WARN_ON(c_tbl->ref_cnt != 0);
3614
3615                         ret = gr_gk20a_add_zbc_color(g, gr,
3616                                 zbc_val, gr->max_used_color_index);
3617
3618                         if (!ret)
3619                                 gr->max_used_color_index++;
3620                 }
3621                 break;
3622         case GK20A_ZBC_TYPE_DEPTH:
3623                 /* search existing tables */
3624                 for (i = 0; i < gr->max_used_depth_index; i++) {
3625
3626                         d_tbl = &gr->zbc_dep_tbl[i];
3627
3628                         if (d_tbl->ref_cnt &&
3629                             d_tbl->depth == zbc_val->depth &&
3630                             d_tbl->format == zbc_val->format) {
3631                                 added = true;
3632                                 d_tbl->ref_cnt++;
3633                                 ret = 0;
3634                                 break;
3635                         }
3636                 }
3637                 /* add new table */
3638                 if (!added &&
3639                     gr->max_used_depth_index < GK20A_ZBC_TABLE_SIZE) {
3640
3641                         d_tbl =
3642                             &gr->zbc_dep_tbl[gr->max_used_depth_index];
3643                         WARN_ON(d_tbl->ref_cnt != 0);
3644
3645                         ret = gr_gk20a_add_zbc_depth(g, gr,
3646                                 zbc_val, gr->max_used_depth_index);
3647
3648                         if (!ret)
3649                                 gr->max_used_depth_index++;
3650                 }
3651                 break;
3652         default:
3653                 gk20a_err(dev_from_gk20a(g),
3654                         "invalid zbc table type %d", zbc_val->type);
3655                 ret = -EINVAL;
3656                 goto err_mutex;
3657         }
3658
3659         if (!added && ret == 0) {
3660                 /* update zbc for elpg only when new entry is added */
3661                 entries = max(gr->max_used_color_index,
3662                                         gr->max_used_depth_index);
3663                 gr_gk20a_pmu_save_zbc(g, entries);
3664         }
3665
3666 err_mutex:
3667         mutex_unlock(&gr->zbc_lock);
3668         return ret;
3669 }
3670
3671 /* get a zbc table entry specified by index
3672  * return table size when type is invalid */
3673 int gr_gk20a_query_zbc(struct gk20a *g, struct gr_gk20a *gr,
3674                         struct zbc_query_params *query_params)
3675 {
3676         u32 index = query_params->index_size;
3677         u32 i;
3678
3679         switch (query_params->type) {
3680         case GK20A_ZBC_TYPE_INVALID:
3681                 query_params->index_size = GK20A_ZBC_TABLE_SIZE;
3682                 break;
3683         case GK20A_ZBC_TYPE_COLOR:
3684                 if (index >= GK20A_ZBC_TABLE_SIZE) {
3685                         gk20a_err(dev_from_gk20a(g),
3686                                 "invalid zbc color table index\n");
3687                         return -EINVAL;
3688                 }
3689                 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3690                         query_params->color_l2[i] =
3691                                 gr->zbc_col_tbl[index].color_l2[i];
3692                         query_params->color_ds[i] =
3693                                 gr->zbc_col_tbl[index].color_ds[i];
3694                 }
3695                 query_params->format = gr->zbc_col_tbl[index].format;
3696                 query_params->ref_cnt = gr->zbc_col_tbl[index].ref_cnt;
3697                 break;
3698         case GK20A_ZBC_TYPE_DEPTH:
3699                 if (index >= GK20A_ZBC_TABLE_SIZE) {
3700                         gk20a_err(dev_from_gk20a(g),
3701                                 "invalid zbc depth table index\n");
3702                         return -EINVAL;
3703                 }
3704                 query_params->depth = gr->zbc_dep_tbl[index].depth;
3705                 query_params->format = gr->zbc_dep_tbl[index].format;
3706                 query_params->ref_cnt = gr->zbc_dep_tbl[index].ref_cnt;
3707                 break;
3708         default:
3709                 gk20a_err(dev_from_gk20a(g),
3710                                 "invalid zbc table type\n");
3711                 return -EINVAL;
3712         }
3713
3714         return 0;
3715 }
3716
3717 int gr_gk20a_load_zbc_table(struct gk20a *g, struct gr_gk20a *gr)
3718 {
3719         int i, ret;
3720
3721         mutex_init(&gr->zbc_lock);
3722         for (i = 0; i < gr->max_used_color_index; i++) {
3723                 struct zbc_color_table *c_tbl = &gr->zbc_col_tbl[i];
3724                 struct zbc_entry zbc_val;
3725
3726                 zbc_val.type = GK20A_ZBC_TYPE_COLOR;
3727                 memcpy(zbc_val.color_ds,
3728                        c_tbl->color_ds, sizeof(zbc_val.color_ds));
3729                 memcpy(zbc_val.color_l2,
3730                        c_tbl->color_l2, sizeof(zbc_val.color_l2));
3731                 zbc_val.format = c_tbl->format;
3732
3733                 ret = gr_gk20a_add_zbc_color(g, gr, &zbc_val, i);
3734
3735                 if (ret)
3736                         return ret;
3737         }
3738         for (i = 0; i < gr->max_used_depth_index; i++) {
3739                 struct zbc_depth_table *d_tbl = &gr->zbc_dep_tbl[i];
3740                 struct zbc_entry zbc_val;
3741
3742                 zbc_val.type = GK20A_ZBC_TYPE_DEPTH;
3743                 zbc_val.depth = d_tbl->depth;
3744                 zbc_val.format = d_tbl->format;
3745
3746                 ret = gr_gk20a_add_zbc_depth(g, gr, &zbc_val, i);
3747                 if (ret)
3748                         return ret;
3749         }
3750         return 0;
3751 }
3752
3753 int gr_gk20a_load_zbc_default_table(struct gk20a *g, struct gr_gk20a *gr)
3754 {
3755         struct zbc_entry zbc_val;
3756         u32 i, err;
3757
3758         /* load default color table */
3759         zbc_val.type = GK20A_ZBC_TYPE_COLOR;
3760
3761         zbc_val.format = gr_ds_zbc_color_fmt_val_zero_v();
3762         for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3763                 zbc_val.color_ds[i] = 0;
3764                 zbc_val.color_l2[i] = 0;
3765         }
3766         err = gr_gk20a_add_zbc(g, gr, &zbc_val);
3767
3768         zbc_val.format = gr_ds_zbc_color_fmt_val_unorm_one_v();
3769         for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3770                 zbc_val.color_ds[i] = 0xffffffff;
3771                 zbc_val.color_l2[i] = 0x3f800000;
3772         }
3773         err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3774
3775         zbc_val.format = gr_ds_zbc_color_fmt_val_rf32_gf32_bf32_af32_v();
3776         for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3777                 zbc_val.color_ds[i] = 0;
3778                 zbc_val.color_l2[i] = 0;
3779         }
3780         err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3781
3782         zbc_val.format = gr_ds_zbc_color_fmt_val_rf32_gf32_bf32_af32_v();
3783         for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3784                 zbc_val.color_ds[i] = 0x3f800000;
3785                 zbc_val.color_l2[i] = 0x3f800000;
3786         }
3787         err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3788
3789         if (!err) {
3790                 gr->max_default_color_index = 4;
3791         } else {
3792                 gk20a_err(dev_from_gk20a(g),
3793                            "fail to load default zbc color table\n");
3794                 return err;
3795         }
3796
3797         /* load default depth table */
3798         zbc_val.type = GK20A_ZBC_TYPE_DEPTH;
3799
3800         zbc_val.format = gr_ds_zbc_z_fmt_val_fp32_v();
3801         zbc_val.depth = 0;
3802         err = gr_gk20a_add_zbc(g, gr, &zbc_val);
3803
3804         zbc_val.format = gr_ds_zbc_z_fmt_val_fp32_v();
3805         zbc_val.depth = 0x3f800000;
3806         err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3807
3808         if (!err) {
3809                 gr->max_default_depth_index = 2;
3810         } else {
3811                 gk20a_err(dev_from_gk20a(g),
3812                            "fail to load default zbc depth table\n");
3813                 return err;
3814         }
3815
3816         return 0;
3817 }
3818
3819 int gk20a_gr_zbc_set_table(struct gk20a *g, struct gr_gk20a *gr,
3820                         struct zbc_entry *zbc_val)
3821 {
3822         gk20a_dbg_fn("");
3823
3824         return gr_gk20a_elpg_protected_call(g,
3825                 gr_gk20a_add_zbc(g, gr, zbc_val));
3826 }
3827
3828 void gr_gk20a_init_blcg_mode(struct gk20a *g, u32 mode, u32 engine)
3829 {
3830         u32 gate_ctrl;
3831
3832         gate_ctrl = gk20a_readl(g, therm_gate_ctrl_r(engine));
3833
3834         switch (mode) {
3835         case BLCG_RUN:
3836                 gate_ctrl = set_field(gate_ctrl,
3837                                 therm_gate_ctrl_blk_clk_m(),
3838                                 therm_gate_ctrl_blk_clk_run_f());
3839                 break;
3840         case BLCG_AUTO:
3841                 gate_ctrl = set_field(gate_ctrl,
3842                                 therm_gate_ctrl_blk_clk_m(),
3843                                 therm_gate_ctrl_blk_clk_auto_f());
3844                 break;
3845         default:
3846                 gk20a_err(dev_from_gk20a(g),
3847                         "invalid blcg mode %d", mode);
3848                 return;
3849         }
3850
3851         gk20a_writel(g, therm_gate_ctrl_r(engine), gate_ctrl);
3852 }
3853
3854 void gr_gk20a_init_elcg_mode(struct gk20a *g, u32 mode, u32 engine)
3855 {
3856         u32 gate_ctrl, idle_filter;
3857
3858         gate_ctrl = gk20a_readl(g, therm_gate_ctrl_r(engine));
3859
3860         switch (mode) {
3861         case ELCG_RUN:
3862                 gate_ctrl = set_field(gate_ctrl,
3863                                 therm_gate_ctrl_eng_clk_m(),
3864                                 therm_gate_ctrl_eng_clk_run_f());
3865                 gate_ctrl = set_field(gate_ctrl,
3866                                 therm_gate_ctrl_eng_pwr_m(),
3867                                 /* set elpg to auto to meet hw expectation */
3868                                 therm_gate_ctrl_eng_pwr_auto_f());
3869                 break;
3870         case ELCG_STOP:
3871                 gate_ctrl = set_field(gate_ctrl,
3872                                 therm_gate_ctrl_eng_clk_m(),
3873                                 therm_gate_ctrl_eng_clk_stop_f());
3874                 break;
3875         case ELCG_AUTO:
3876                 gate_ctrl = set_field(gate_ctrl,
3877                                 therm_gate_ctrl_eng_clk_m(),
3878                                 therm_gate_ctrl_eng_clk_auto_f());
3879                 break;
3880         default:
3881                 gk20a_err(dev_from_gk20a(g),
3882                         "invalid elcg mode %d", mode);
3883         }
3884
3885         if (tegra_platform_is_linsim()) {
3886                 gate_ctrl = set_field(gate_ctrl,
3887                         therm_gate_ctrl_eng_delay_after_m(),
3888                         therm_gate_ctrl_eng_delay_after_f(4));
3889         }
3890
3891         /* 2 * (1 << 9) = 1024 clks */
3892         gate_ctrl = set_field(gate_ctrl,
3893                 therm_gate_ctrl_eng_idle_filt_exp_m(),
3894                 therm_gate_ctrl_eng_idle_filt_exp_f(9));
3895         gate_ctrl = set_field(gate_ctrl,
3896                 therm_gate_ctrl_eng_idle_filt_mant_m(),
3897                 therm_gate_ctrl_eng_idle_filt_mant_f(2));
3898         gk20a_writel(g, therm_gate_ctrl_r(engine), gate_ctrl);
3899
3900         /* default fecs_idle_filter to 0 */
3901         idle_filter = gk20a_readl(g, therm_fecs_idle_filter_r());
3902         idle_filter &= ~therm_fecs_idle_filter_value_m();
3903         gk20a_writel(g, therm_fecs_idle_filter_r(), idle_filter);
3904         /* default hubmmu_idle_filter to 0 */
3905         idle_filter = gk20a_readl(g, therm_hubmmu_idle_filter_r());
3906         idle_filter &= ~therm_hubmmu_idle_filter_value_m();
3907         gk20a_writel(g, therm_hubmmu_idle_filter_r(), idle_filter);
3908 }
3909
3910 static int gr_gk20a_zcull_init_hw(struct gk20a *g, struct gr_gk20a *gr)
3911 {
3912         u32 gpc_index, gpc_tpc_count, gpc_zcull_count;
3913         u32 *zcull_map_tiles, *zcull_bank_counters;
3914         u32 map_counter;
3915         u32 rcp_conserv;
3916         u32 offset;
3917         bool floorsweep = false;
3918
3919         if (!gr->map_tiles)
3920                 return -1;
3921
3922         zcull_map_tiles = kzalloc(proj_scal_max_gpcs_v() *
3923                         proj_scal_max_tpc_per_gpc_v() * sizeof(u32), GFP_KERNEL);
3924         if (!zcull_map_tiles) {
3925                 gk20a_err(dev_from_gk20a(g),
3926                         "failed to allocate zcull temp buffers");
3927                 return -ENOMEM;
3928         }
3929         zcull_bank_counters = kzalloc(proj_scal_max_gpcs_v() *
3930                         proj_scal_max_tpc_per_gpc_v() * sizeof(u32), GFP_KERNEL);
3931
3932         if (!zcull_bank_counters) {
3933                 gk20a_err(dev_from_gk20a(g),
3934                         "failed to allocate zcull temp buffers");
3935                 kfree(zcull_map_tiles);
3936                 return -ENOMEM;
3937         }
3938
3939         for (map_counter = 0; map_counter < gr->tpc_count; map_counter++) {
3940                 zcull_map_tiles[map_counter] =
3941                         zcull_bank_counters[gr->map_tiles[map_counter]];
3942                 zcull_bank_counters[gr->map_tiles[map_counter]]++;
3943         }
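             /*
              * zcull_map_tiles[i] now holds how many earlier screen tiles were
              * assigned to the same GPC, i.e. the tile's local index within
              * that GPC; the map0..map3 registers below pack eight such
              * entries each.
              */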
3944
3945         gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map0_r(),
3946                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_0_f(zcull_map_tiles[0]) |
3947                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_1_f(zcull_map_tiles[1]) |
3948                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_2_f(zcull_map_tiles[2]) |
3949                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_3_f(zcull_map_tiles[3]) |
3950                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_4_f(zcull_map_tiles[4]) |
3951                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_5_f(zcull_map_tiles[5]) |
3952                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_6_f(zcull_map_tiles[6]) |
3953                 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_7_f(zcull_map_tiles[7]));
3954
3955         gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map1_r(),
3956                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_8_f(zcull_map_tiles[8]) |
3957                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_9_f(zcull_map_tiles[9]) |
3958                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_10_f(zcull_map_tiles[10]) |
3959                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_11_f(zcull_map_tiles[11]) |
3960                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_12_f(zcull_map_tiles[12]) |
3961                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_13_f(zcull_map_tiles[13]) |
3962                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_14_f(zcull_map_tiles[14]) |
3963                 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_15_f(zcull_map_tiles[15]));
3964
3965         gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map2_r(),
3966                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_16_f(zcull_map_tiles[16]) |
3967                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_17_f(zcull_map_tiles[17]) |
3968                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_18_f(zcull_map_tiles[18]) |
3969                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_19_f(zcull_map_tiles[19]) |
3970                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_20_f(zcull_map_tiles[20]) |
3971                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_21_f(zcull_map_tiles[21]) |
3972                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_22_f(zcull_map_tiles[22]) |
3973                 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_23_f(zcull_map_tiles[23]));
3974
3975         gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map3_r(),
3976                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_24_f(zcull_map_tiles[24]) |
3977                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_25_f(zcull_map_tiles[25]) |
3978                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_26_f(zcull_map_tiles[26]) |
3979                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_27_f(zcull_map_tiles[27]) |
3980                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_28_f(zcull_map_tiles[28]) |
3981                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_29_f(zcull_map_tiles[29]) |
3982                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_30_f(zcull_map_tiles[30]) |
3983                 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_31_f(zcull_map_tiles[31]));
3984
3985         kfree(zcull_map_tiles);
3986         kfree(zcull_bank_counters);
3987
3988         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3989                 gpc_tpc_count = gr->gpc_tpc_count[gpc_index];
3990                 gpc_zcull_count = gr->gpc_zcb_count[gpc_index];
3991
3992                 if (gpc_zcull_count != gr->max_zcull_per_gpc_count &&
3993                     gpc_zcull_count < gpc_tpc_count) {
3994                         gk20a_err(dev_from_gk20a(g),
3995                                 "zcull_banks (%d) less than tpcs (%d) for gpc (%d)",
3996                                 gpc_zcull_count, gpc_tpc_count, gpc_index);
3997                         return -EINVAL;
3998                 }
3999                 if (gpc_zcull_count != gr->max_zcull_per_gpc_count &&
4000                     gpc_zcull_count != 0)
4001                         floorsweep = true;
4002         }
4003
4004         /* 1.0f / 1.0f * gr_gpc0_zcull_sm_num_rcp_conservative__max_v() */
4005         rcp_conserv = gr_gpc0_zcull_sm_num_rcp_conservative__max_v();
4006
4007         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4008                 offset = gpc_index * proj_gpc_stride_v();
4009
4010                 if (floorsweep) {
4011                         gk20a_writel(g, gr_gpc0_zcull_ram_addr_r() + offset,
4012                                 gr_gpc0_zcull_ram_addr_row_offset_f(gr->map_row_offset) |
4013                                 gr_gpc0_zcull_ram_addr_tiles_per_hypertile_row_per_gpc_f(
4014                                         gr->max_zcull_per_gpc_count));
4015                 } else {
4016                         gk20a_writel(g, gr_gpc0_zcull_ram_addr_r() + offset,
4017                                 gr_gpc0_zcull_ram_addr_row_offset_f(gr->map_row_offset) |
4018                                 gr_gpc0_zcull_ram_addr_tiles_per_hypertile_row_per_gpc_f(
4019                                         gr->gpc_tpc_count[gpc_index]));
4020                 }
4021
4022                 gk20a_writel(g, gr_gpc0_zcull_fs_r() + offset,
4023                         gr_gpc0_zcull_fs_num_active_banks_f(gr->gpc_zcb_count[gpc_index]) |
4024                         gr_gpc0_zcull_fs_num_sms_f(gr->tpc_count));
4025
4026                 gk20a_writel(g, gr_gpc0_zcull_sm_num_rcp_r() + offset,
4027                         gr_gpc0_zcull_sm_num_rcp_conservative_f(rcp_conserv));
4028         }
4029
4030         gk20a_writel(g, gr_gpcs_ppcs_wwdx_sm_num_rcp_r(),
4031                 gr_gpcs_ppcs_wwdx_sm_num_rcp_conservative_f(rcp_conserv));
4032
4033         return 0;
4034 }
4035
4036 static void gk20a_gr_enable_gpc_exceptions(struct gk20a *g)
4037 {
4038         /* enable tpc exception forwarding */
4039         gk20a_writel(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r(),
4040                 gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_enabled_f());
4041
4042         /* enable gpc exception forwarding */
4043         gk20a_writel(g, gr_gpc0_gpccs_gpc_exception_en_r(),
4044                 gr_gpc0_gpccs_gpc_exception_en_tpc_0_enabled_f());
4045 }
4046
4047
4048 void gr_gk20a_enable_hww_exceptions(struct gk20a *g)
4049 {
4050         /* enable exceptions */
4051         gk20a_writel(g, gr_fe_hww_esr_r(),
4052                      gr_fe_hww_esr_en_enable_f() |
4053                      gr_fe_hww_esr_reset_active_f());
4054         gk20a_writel(g, gr_memfmt_hww_esr_r(),
4055                      gr_memfmt_hww_esr_en_enable_f() |
4056                      gr_memfmt_hww_esr_reset_active_f());
4057         gk20a_writel(g, gr_scc_hww_esr_r(),
4058                      gr_scc_hww_esr_en_enable_f() |
4059                      gr_scc_hww_esr_reset_active_f());
4060         gk20a_writel(g, gr_mme_hww_esr_r(),
4061                      gr_mme_hww_esr_en_enable_f() |
4062                      gr_mme_hww_esr_reset_active_f());
4063         gk20a_writel(g, gr_pd_hww_esr_r(),
4064                      gr_pd_hww_esr_en_enable_f() |
4065                      gr_pd_hww_esr_reset_active_f());
4066         gk20a_writel(g, gr_sked_hww_esr_r(), /* enabled by default */
4067                      gr_sked_hww_esr_reset_active_f());
4068         gk20a_writel(g, gr_ds_hww_esr_r(),
4069                      gr_ds_hww_esr_en_enabled_f() |
4070                      gr_ds_hww_esr_reset_task_f());
4071         gk20a_writel(g, gr_ds_hww_report_mask_r(),
4072                      gr_ds_hww_report_mask_sph0_err_report_f() |
4073                      gr_ds_hww_report_mask_sph1_err_report_f() |
4074                      gr_ds_hww_report_mask_sph2_err_report_f() |
4075                      gr_ds_hww_report_mask_sph3_err_report_f() |
4076                      gr_ds_hww_report_mask_sph4_err_report_f() |
4077                      gr_ds_hww_report_mask_sph5_err_report_f() |
4078                      gr_ds_hww_report_mask_sph6_err_report_f() |
4079                      gr_ds_hww_report_mask_sph7_err_report_f() |
4080                      gr_ds_hww_report_mask_sph8_err_report_f() |
4081                      gr_ds_hww_report_mask_sph9_err_report_f() |
4082                      gr_ds_hww_report_mask_sph10_err_report_f() |
4083                      gr_ds_hww_report_mask_sph11_err_report_f() |
4084                      gr_ds_hww_report_mask_sph12_err_report_f() |
4085                      gr_ds_hww_report_mask_sph13_err_report_f() |
4086                      gr_ds_hww_report_mask_sph14_err_report_f() |
4087                      gr_ds_hww_report_mask_sph15_err_report_f() |
4088                      gr_ds_hww_report_mask_sph16_err_report_f() |
4089                      gr_ds_hww_report_mask_sph17_err_report_f() |
4090                      gr_ds_hww_report_mask_sph18_err_report_f() |
4091                      gr_ds_hww_report_mask_sph19_err_report_f() |
4092                      gr_ds_hww_report_mask_sph20_err_report_f() |
4093                      gr_ds_hww_report_mask_sph21_err_report_f() |
4094                      gr_ds_hww_report_mask_sph22_err_report_f() |
4095                      gr_ds_hww_report_mask_sph23_err_report_f());
4096 }
4097
4098 static void gr_gk20a_set_hww_esr_report_mask(struct gk20a *g)
4099 {
4100         /* setup sm warp esr report masks */
4101         gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(),
4102                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_stack_error_report_f() |
4103                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_api_stack_error_report_f() |
4104                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_ret_empty_stack_error_report_f() |
4105                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_wrap_report_f() |
4106                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_pc_report_f() |
4107                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_overflow_report_f() |
4108                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_immc_addr_report_f() |
4109                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_reg_report_f() |
4110                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_encoding_report_f() |
4111                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_sph_instr_combo_report_f() |
4112                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param_report_f() |
4113                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_report_f() |
4114                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_reg_report_f() |
4115                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_addr_report_f() |
4116                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_addr_report_f() |
4117                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_addr_space_report_f() |
4118                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param2_report_f() |
4119                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_ldc_report_f() |
4120                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_geometry_sm_error_report_f() |
4121                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_divergent_report_f());
4122
4123         /* setup sm global esr report mask */
4124         gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(),
4125                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_sm_to_sm_fault_report_f() |
4126                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_l1_error_report_f() |
4127                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_multiple_warp_errors_report_f() |
4128                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_physical_stack_overflow_error_report_f() |
4129                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_int_report_f() |
4130                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_pause_report_f() |
4131                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_single_step_complete_report_f());
4132 }
4133
4134 static int gk20a_init_gr_setup_hw(struct gk20a *g)
4135 {
4136         struct gr_gk20a *gr = &g->gr;
4137         struct aiv_list_gk20a *sw_ctx_load = &g->gr.ctx_vars.sw_ctx_load;
4138         struct av_list_gk20a *sw_method_init = &g->gr.ctx_vars.sw_method_init;
4139         u32 data;
4140         u32 addr_lo, addr_hi;
4141         u64 addr;
4142         unsigned long end_jiffies = jiffies +
4143                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
4144         u32 fe_go_idle_timeout_save;
4145         u32 last_method_data = 0;
4146         u32 i, err;
4147
4148         gk20a_dbg_fn("");
4149
4150         /* slcg prod values */
4151         g->ops.clock_gating.slcg_gr_load_gating_prod(g, g->slcg_enabled);
4152         g->ops.clock_gating.slcg_perf_load_gating_prod(g, g->slcg_enabled);
4153
4154         /* init mmu debug buffer */
4155         addr = NV_MC_SMMU_VADDR_TRANSLATE(gr->mmu_wr_mem.iova);
4156         addr_lo = u64_lo32(addr);
4157         addr_hi = u64_hi32(addr);
4158         addr = (addr_lo >> fb_mmu_debug_wr_addr_alignment_v()) |
4159                 (addr_hi << (32 - fb_mmu_debug_wr_addr_alignment_v()));
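        /*
         * The debug buffer address is programmed right-shifted by the
         * register's alignment value: the low word is shifted down and the
         * bits shifted out are replaced by the low bits of the high word,
         * packing the address into the register's addr field.
         */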
4160
4161         gk20a_writel(g, fb_mmu_debug_wr_r(),
4162                      fb_mmu_debug_wr_aperture_vid_mem_f() |
4163                      fb_mmu_debug_wr_vol_false_f() |
4164                      fb_mmu_debug_wr_addr_v(addr));
4165
4166         addr = NV_MC_SMMU_VADDR_TRANSLATE(gr->mmu_rd_mem.iova);
4167         addr_lo = u64_lo32(addr);
4168         addr_hi = u64_hi32(addr);
4169         addr = (addr_lo >> fb_mmu_debug_rd_addr_alignment_v()) |
4170                 (addr_hi << (32 - fb_mmu_debug_rd_addr_alignment_v()));
4171
4172         gk20a_writel(g, fb_mmu_debug_rd_r(),
4173                      fb_mmu_debug_rd_aperture_vid_mem_f() |
4174                      fb_mmu_debug_rd_vol_false_f() |
4175                      fb_mmu_debug_rd_addr_v(addr));
4176
4177         /* load gr floorsweeping registers */
4178         data = gk20a_readl(g, gr_gpc0_ppc0_pes_vsc_strem_r());
4179         data = set_field(data, gr_gpc0_ppc0_pes_vsc_strem_master_pe_m(),
4180                         gr_gpc0_ppc0_pes_vsc_strem_master_pe_true_f());
4181         gk20a_writel(g, gr_gpc0_ppc0_pes_vsc_strem_r(), data);
4182
4183         gr_gk20a_zcull_init_hw(g, gr);
4184
4185         g->ops.clock_gating.blcg_gr_load_gating_prod(g, g->blcg_enabled);
4186         g->ops.clock_gating.pg_gr_load_gating_prod(g, true);
4187
4188         if (g->elcg_enabled) {
4189                 gr_gk20a_init_elcg_mode(g, ELCG_AUTO, ENGINE_GR_GK20A);
4190                 gr_gk20a_init_elcg_mode(g, ELCG_AUTO, ENGINE_CE2_GK20A);
4191         } else {
4192                 gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_GR_GK20A);
4193                 gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_CE2_GK20A);
4194         }
4195
4196         /* Bug 1340570: increase the clock timeout to avoid potential
4197          * operation failure at high gpcclk rate. Default values are 0x400.
4198          */
4199         gk20a_writel(g, pri_ringstation_sys_master_config_r(0x15), 0x800);
4200         gk20a_writel(g, pri_ringstation_gpc_master_config_r(0xa), 0x800);
4201         gk20a_writel(g, pri_ringstation_fbp_master_config_r(0x8), 0x800);
4202
4203         /* enable fifo access */
4204         gk20a_writel(g, gr_gpfifo_ctl_r(),
4205                      gr_gpfifo_ctl_access_enabled_f() |
4206                      gr_gpfifo_ctl_semaphore_access_enabled_f());
4207
4208         /* TBD: reload gr ucode when needed */
4209
4210         /* enable interrupts */
4211         gk20a_writel(g, gr_intr_r(), 0xFFFFFFFF);
4212         gk20a_writel(g, gr_intr_en_r(), 0xFFFFFFFF);
4213
4214         /* enable fecs error interrupts */
4215         gk20a_writel(g, gr_fecs_host_int_enable_r(),
4216                      gr_fecs_host_int_enable_fault_during_ctxsw_enable_f() |
4217                      gr_fecs_host_int_enable_umimp_firmware_method_enable_f() |
4218                      gr_fecs_host_int_enable_umimp_illegal_method_enable_f() |
4219                      gr_fecs_host_int_enable_watchdog_enable_f());
4220
4221         g->ops.gr.enable_hww_exceptions(g);
4222         g->ops.gr.set_hww_esr_report_mask(g);
4223
4224         /* enable per GPC exceptions */
4225         gk20a_gr_enable_gpc_exceptions(g);
4226
4227         /* TBD: ECC for L1/SM */
4228         /* TBD: enable per BE exceptions */
4229
4230         /* reset and enable all exceptions */
4231         gk20a_writel(g, gr_exception_r(), 0xFFFFFFFF);
4232         gk20a_writel(g, gr_exception_en_r(), 0xFFFFFFFF);
4233         gk20a_writel(g, gr_exception1_r(), 0xFFFFFFFF);
4234         gk20a_writel(g, gr_exception1_en_r(), 0xFFFFFFFF);
4235         gk20a_writel(g, gr_exception2_r(), 0xFFFFFFFF);
4236         gk20a_writel(g, gr_exception2_en_r(), 0xFFFFFFFF);
4237
4238         /* ignore status from some units */
4239         data = gk20a_readl(g, gr_status_mask_r());
4240         gk20a_writel(g, gr_status_mask_r(), data & gr->status_disable_mask);
4241
4242         if (gr->sw_ready)
4243                 gr_gk20a_load_zbc_table(g, gr);
4244         else
4245                 gr_gk20a_load_zbc_default_table(g, gr);
4246
4247         g->ops.ltc.init_cbc(g, gr);
4248
4249         /* load ctx init */
4250         for (i = 0; i < sw_ctx_load->count; i++)
4251                 gk20a_writel(g, sw_ctx_load->l[i].addr,
4252                              sw_ctx_load->l[i].value);
4253
4254         err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4255         if (err)
4256                 goto out;
4257
4258         /* save and disable fe_go_idle */
4259         fe_go_idle_timeout_save =
4260                 gk20a_readl(g, gr_fe_go_idle_timeout_r());
4261         gk20a_writel(g, gr_fe_go_idle_timeout_r(),
4262                 (fe_go_idle_timeout_save & gr_fe_go_idle_timeout_count_f(0)) |
4263                 gr_fe_go_idle_timeout_count_disabled_f());
4264
4265         /* override a few ctx state registers */
4266         g->ops.gr.commit_global_cb_manager(g, NULL, false);
4267         gr_gk20a_commit_global_timeslice(g, NULL, false);
4268
4269         /* floorsweep anything left */
4270         g->ops.gr.init_fs_state(g);
4271
4272         err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4273         if (err)
4274                 goto restore_fe_go_idle;
4275
4276 restore_fe_go_idle:
4277         /* restore fe_go_idle */
4278         gk20a_writel(g, gr_fe_go_idle_timeout_r(), fe_go_idle_timeout_save);
4279
        if (!err)
                err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
        if (err)
                goto out;
4282
4283         /* load method init */
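        /*
         * sw_method_init is loaded through the MME shadow raw data/index
         * pair: a value is written to raw_data and each raw_index write
         * (with the write trigger set) commits it for that method address.
         * raw_data is only rewritten when the value changes, tracked via
         * last_method_data below.
         */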
4284         if (sw_method_init->count) {
4285                 gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
4286                              sw_method_init->l[0].value);
4287                 gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
4288                              gr_pri_mme_shadow_raw_index_write_trigger_f() |
4289                              sw_method_init->l[0].addr);
4290                 last_method_data = sw_method_init->l[0].value;
4291         }
4292         for (i = 1; i < sw_method_init->count; i++) {
4293                 if (sw_method_init->l[i].value != last_method_data) {
4294                         gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
4295                                 sw_method_init->l[i].value);
4296                         last_method_data = sw_method_init->l[i].value;
4297                 }
4298                 gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
4299                         gr_pri_mme_shadow_raw_index_write_trigger_f() |
4300                         sw_method_init->l[i].addr);
4301         }
4302
4303         err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4304         if (err)
4305                 goto out;
4306
out:
        gk20a_dbg_fn("done");
        return err;
4310 }
4311
4312 static int gk20a_init_gr_prepare(struct gk20a *g)
4313 {
4314         u32 gpfifo_ctrl, pmc_en;
4315         u32 err = 0;
4316
4317         /* disable fifo access */
4318         pmc_en = gk20a_readl(g, mc_enable_r());
4319         if (pmc_en & mc_enable_pgraph_enabled_f()) {
4320                 gpfifo_ctrl = gk20a_readl(g, gr_gpfifo_ctl_r());
4321                 gpfifo_ctrl &= ~gr_gpfifo_ctl_access_enabled_f();
4322                 gk20a_writel(g, gr_gpfifo_ctl_r(), gpfifo_ctrl);
4323         }
4324
4325         /* reset gr engine */
4326         gk20a_reset(g, mc_enable_pgraph_enabled_f()
4327                         | mc_enable_blg_enabled_f()
4328                         | mc_enable_perfmon_enabled_f());
4329
4330         /* enable fifo access */
4331         gk20a_writel(g, gr_gpfifo_ctl_r(),
4332                 gr_gpfifo_ctl_access_enabled_f() |
4333                 gr_gpfifo_ctl_semaphore_access_enabled_f());
4334
4335         if (!g->gr.ctx_vars.valid) {
4336                 err = gr_gk20a_init_ctx_vars(g, &g->gr);
4337                 if (err)
4338                         gk20a_err(dev_from_gk20a(g),
4339                                 "fail to load gr init ctx");
4340         }
4341         return err;
4342 }
4343
4344 static int gr_gk20a_wait_mem_scrubbing(struct gk20a *g)
4345 {
4346         int retries = GR_IDLE_CHECK_MAX / GR_IDLE_CHECK_DEFAULT;
4347         bool fecs_scrubbing;
4348         bool gpccs_scrubbing;
4349
4350         gk20a_dbg_fn("");
4351
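        /*
         * Poll both falcons until their IMEM/DMEM scrub bits clear: check
         * every GR_IDLE_CHECK_DEFAULT us, up to GR_IDLE_CHECK_MAX total on
         * silicon; simulation/emulation platforms keep polling indefinitely.
         */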
4352         do {
4353                 fecs_scrubbing = gk20a_readl(g, gr_fecs_dmactl_r()) &
4354                         (gr_fecs_dmactl_imem_scrubbing_m() |
4355                          gr_fecs_dmactl_dmem_scrubbing_m());
4356
                gpccs_scrubbing = gk20a_readl(g, gr_gpccs_dmactl_r()) &
                        (gr_gpccs_dmactl_imem_scrubbing_m() |
                         gr_gpccs_dmactl_dmem_scrubbing_m());
4360
4361                 if (!fecs_scrubbing && !gpccs_scrubbing) {
4362                         gk20a_dbg_fn("done");
4363                         return 0;
4364                 }
4365
4366                 udelay(GR_IDLE_CHECK_DEFAULT);
4367         } while (--retries || !tegra_platform_is_silicon());
4368
4369         gk20a_err(dev_from_gk20a(g), "Falcon mem scrubbing timeout");
4370         return -ETIMEDOUT;
4371 }
4372
4373 static int gk20a_init_gr_reset_enable_hw(struct gk20a *g)
4374 {
4375         struct gr_gk20a *gr = &g->gr;
4376         struct av_list_gk20a *sw_non_ctx_load = &g->gr.ctx_vars.sw_non_ctx_load;
4377         unsigned long end_jiffies = jiffies +
4378                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
4379         u32 i, err = 0;
4380
4381         gk20a_dbg_fn("");
4382
4383         /* enable interrupts */
4384         gk20a_writel(g, gr_intr_r(), ~0);
4385         gk20a_writel(g, gr_intr_en_r(), ~0);
4386
4387         /* reset ctx switch state */
4388         gr_gk20a_ctx_reset(g, 0);
4389
4390         /* clear scc ram */
4391         gk20a_writel(g, gr_scc_init_r(),
4392                 gr_scc_init_ram_trigger_f());
4393
4394         /* load non_ctx init */
4395         for (i = 0; i < sw_non_ctx_load->count; i++)
4396                 gk20a_writel(g, sw_non_ctx_load->l[i].addr,
4397                         sw_non_ctx_load->l[i].value);
4398
4399         err = gr_gk20a_wait_mem_scrubbing(g);
4400         if (err)
4401                 goto out;
4402
4403         err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4404         if (err)
4405                 goto out;
4406
4407         err = gr_gk20a_load_ctxsw_ucode(g, gr);
4408         if (err)
4409                 goto out;
4410
        /* this appears to only query sw state, but fecs actually initializes
           the ramchain etc., so this is really hw init */
4413         err = gr_gk20a_init_ctx_state(g, gr);
4414         if (err)
4415                 goto out;
4416
4417 out:
4418         if (err)
4419                 gk20a_err(dev_from_gk20a(g), "fail");
4420         else
4421                 gk20a_dbg_fn("done");
4422
        return err;
4424 }
4425
4426 /*
4427  * XXX Merge this list with the debugger/profiler
4428  * session regops whitelists?
4429  */
4430 static u32 wl_addr_gk20a[] = {
4431         /* this list must be sorted (low to high) */
4432         0x404468, /* gr_pri_mme_max_instructions       */
4433         0x408944, /* gr_pri_bes_crop_hww_esr           */
4434         0x418800, /* gr_pri_gpcs_setup_debug           */
4435         0x419a04, /* gr_pri_gpcs_tpcs_tex_lod_dbg      */
4436         0x419a08, /* gr_pri_gpcs_tpcs_tex_samp_dbg     */
4437         0x419e10, /* gr_pri_gpcs_tpcs_sm_dbgr_control0 */
4438         0x419f78, /* gr_pri_gpcs_tpcs_sm_disp_ctrl     */
4439 };
4440
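/*
 * gr_gk20a_init_access_map() turns each whitelisted register offset above
 * into a single bit of the PRIV_ACCESS_MAP global ctx buffer:
 * bit index = (offset >> 2), packed 8 bits per byte.
 */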
4441 static int gr_gk20a_init_access_map(struct gk20a *g)
4442 {
4443         struct gr_gk20a *gr = &g->gr;
4444         void *data;
4445         int err = 0;
4446         u32 w, nr_pages =
4447                 DIV_ROUND_UP(gr->ctx_vars.priv_access_map_size,
4448                              PAGE_SIZE);
4449
4450         data = vmap(gr->global_ctx_buffer[PRIV_ACCESS_MAP].pages,
4451                     PAGE_ALIGN(gr->global_ctx_buffer[PRIV_ACCESS_MAP].size) >>
4452                     PAGE_SHIFT, 0, pgprot_dmacoherent(PAGE_KERNEL));
4453         if (!data) {
4454                 gk20a_err(dev_from_gk20a(g),
4455                           "failed to map priv access map memory");
4456                 err = -ENOMEM;
4457                 goto clean_up;
4458         }
4459
4460         memset(data, 0x0, PAGE_SIZE * nr_pages);
4461
4462         for (w = 0; w < ARRAY_SIZE(wl_addr_gk20a); w++) {
4463                 u32 map_bit, map_byte, map_shift;
4464                 map_bit = wl_addr_gk20a[w] >> 2;
4465                 map_byte = map_bit >> 3;
4466                 map_shift = map_bit & 0x7; /* i.e. 0-7 */
4467                 gk20a_dbg_info("access map addr:0x%x byte:0x%x bit:%d",
4468                   wl_addr_gk20a[w], map_byte, map_shift);
4469                 ((u8 *)data)[map_byte] |= 1 << map_shift;
4470         }
4471
4472 clean_up:
4473         if (data)
4474                 vunmap(data);
        return err;
4476 }
4477
4478 static int gk20a_init_gr_setup_sw(struct gk20a *g)
4479 {
4480         struct gr_gk20a *gr = &g->gr;
4481         int err;
4482
4483         gk20a_dbg_fn("");
4484
4485         if (gr->sw_ready) {
4486                 gk20a_dbg_fn("skip init");
4487                 return 0;
4488         }
4489
4490         gr->g = g;
4491
4492         err = gr_gk20a_init_gr_config(g, gr);
4493         if (err)
4494                 goto clean_up;
4495
4496         err = gr_gk20a_init_mmu_sw(g, gr);
4497         if (err)
4498                 goto clean_up;
4499
4500         err = gr_gk20a_init_map_tiles(g, gr);
4501         if (err)
4502                 goto clean_up;
4503
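        /*
         * totalram_pages >> (10 - (PAGE_SHIFT - 10)) is pages >> (20 - PAGE_SHIFT),
         * i.e. total system RAM in MB: with 4 KB pages that is pages >> 8,
         * e.g. 262144 pages (1 GB) -> 1024 MB of comptag coverage.
         */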
        if (tegra_cpu_is_asim()) {
                gr->max_comptag_mem = 1; /* MBs worth of comptag coverage */
        } else {
                gk20a_dbg_info("total ram pages : %lu", totalram_pages);
                gr->max_comptag_mem = totalram_pages
                                         >> (10 - (PAGE_SHIFT - 10));
        }
4511         err = g->ops.ltc.init_comptags(g, gr);
4512         if (err)
4513                 goto clean_up;
4514
4515         err = gr_gk20a_init_zcull(g, gr);
4516         if (err)
4517                 goto clean_up;
4518
4519         err = gr_gk20a_alloc_global_ctx_buffers(g);
4520         if (err)
4521                 goto clean_up;
4522
4523         err = gr_gk20a_init_access_map(g);
4524         if (err)
4525                 goto clean_up;
4526
4527         mutex_init(&gr->ctx_mutex);
4528         spin_lock_init(&gr->ch_tlb_lock);
4529
4530         gr->remove_support = gk20a_remove_gr_support;
4531         gr->sw_ready = true;
4532
4533         gk20a_dbg_fn("done");
4534         return 0;
4535
4536 clean_up:
4537         gk20a_err(dev_from_gk20a(g), "fail");
4538         gk20a_remove_gr_support(gr);
4539         return err;
4540 }
4541
4542 static int gk20a_init_gr_bind_fecs_elpg(struct gk20a *g)
4543 {
4544         struct pmu_gk20a *pmu = &g->pmu;
4545         struct mm_gk20a *mm = &g->mm;
4546         struct vm_gk20a *vm = &mm->pmu.vm;
4547         struct device *d = dev_from_gk20a(g);
4548         int err = 0;
4549
4550         u32 size;
4551         struct sg_table *sgt_pg_buf;
4552         dma_addr_t iova;
4553
4554         gk20a_dbg_fn("");
4555
4556         size = 0;
4557
4558         err = gr_gk20a_fecs_get_reglist_img_size(g, &size);
4559         if (err) {
4560                 gk20a_err(dev_from_gk20a(g),
4561                         "fail to query fecs pg buffer size");
4562                 return err;
4563         }
4564
4565         if (!pmu->pg_buf.cpuva) {
4566                 pmu->pg_buf.cpuva = dma_alloc_coherent(d, size,
4567                                                 &iova,
4568                                                 GFP_KERNEL);
4569                 if (!pmu->pg_buf.cpuva) {
4570                         gk20a_err(d, "failed to allocate memory\n");
4571                         return -ENOMEM;
4572                 }
4573
4574                 pmu->pg_buf.iova = iova;
4575                 pmu->pg_buf.size = size;
4576
4577                 err = gk20a_get_sgtable(d, &sgt_pg_buf,
4578                                         pmu->pg_buf.cpuva,
4579                                         pmu->pg_buf.iova,
4580                                         size);
4581                 if (err) {
4582                         gk20a_err(d, "failed to create sg table\n");
4583                         goto err_free_pg_buf;
4584                 }
4585
4586                 pmu->pg_buf.pmu_va = gk20a_gmmu_map(vm,
4587                                         &sgt_pg_buf,
4588                                         size,
4589                                         0, /* flags */
4590                                         gk20a_mem_flag_none);
4591                 if (!pmu->pg_buf.pmu_va) {
4592                         gk20a_err(d, "failed to map fecs pg buffer");
4593                         err = -ENOMEM;
4594                         goto err_free_sgtable;
4595                 }
4596
4597                 gk20a_free_sgtable(&sgt_pg_buf);
4598         }
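        /*
         * The fecs pg buffer is allocated and GMMU-mapped only once (guarded
         * by pg_buf.cpuva above); the temporary sg table is freed right after
         * mapping since only pg_buf.pmu_va is used from here on, and
         * gk20a_gr_reset() reuses the same mapping when it rebinds the
         * reglist.
         */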
4599
4600
4601         err = gr_gk20a_fecs_set_reglist_bind_inst(g, mm->pmu.inst_block.cpu_pa);
4602         if (err) {
4603                 gk20a_err(dev_from_gk20a(g),
4604                         "fail to bind pmu inst to gr");
4605                 return err;
4606         }
4607
4608         err = gr_gk20a_fecs_set_reglist_virtual_addr(g, pmu->pg_buf.pmu_va);
4609         if (err) {
4610                 gk20a_err(dev_from_gk20a(g),
4611                         "fail to set pg buffer pmu va");
4612                 return err;
4613         }
4614
4615         return err;
4616
4617 err_free_sgtable:
4618         gk20a_free_sgtable(&sgt_pg_buf);
4619 err_free_pg_buf:
4620         dma_free_coherent(d, size,
4621                 pmu->pg_buf.cpuva, pmu->pg_buf.iova);
4622         pmu->pg_buf.cpuva = NULL;
4623         pmu->pg_buf.iova = 0;
4624         return err;
4625 }
4626
4627 int gk20a_init_gr_support(struct gk20a *g)
4628 {
4629         u32 err;
4630
4631         gk20a_dbg_fn("");
4632
4633         err = gk20a_init_gr_prepare(g);
4634         if (err)
4635                 return err;
4636
4637         /* this is required before gr_gk20a_init_ctx_state */
4638         mutex_init(&g->gr.fecs_mutex);
4639
4640         err = gk20a_init_gr_reset_enable_hw(g);
4641         if (err)
4642                 return err;
4643
4644         err = gk20a_init_gr_setup_sw(g);
4645         if (err)
4646                 return err;
4647
4648         err = gk20a_init_gr_setup_hw(g);
4649         if (err)
4650                 return err;
4651
4652         err = gk20a_init_gr_bind_fecs_elpg(g);
4653         if (err)
4654                 return err;
4655
        /* GR is initialized, signal possible waiters */
4657         g->gr.initialized = true;
4658         wake_up(&g->gr.init_wq);
4659
4660         return 0;
4661 }
4662
4663 /* Wait until GR is initialized */
4664 void gk20a_gr_wait_initialized(struct gk20a *g)
4665 {
4666         wait_event(g->gr.init_wq, g->gr.initialized);
4667 }
4668
4669 #define NVA297_SET_ALPHA_CIRCULAR_BUFFER_SIZE   0x02dc
4670 #define NVA297_SET_CIRCULAR_BUFFER_SIZE         0x1280
4671 #define NVA297_SET_SHADER_EXCEPTIONS            0x1528
4672 #define NVA0C0_SET_SHADER_EXCEPTIONS            0x1528
4673
4674 #define NVA297_SET_SHADER_EXCEPTIONS_ENABLE_FALSE 0
4675
4676 struct gr_isr_data {
4677         u32 addr;
4678         u32 data_lo;
4679         u32 data_hi;
4680         u32 curr_ctx;
4681         u32 chid;
4682         u32 offset;
4683         u32 sub_chan;
4684         u32 class_num;
4685 };
4686
4687 void gk20a_gr_set_shader_exceptions(struct gk20a *g, u32 data)
4688 {
4689         gk20a_dbg_fn("");
4690
4691         if (data == NVA297_SET_SHADER_EXCEPTIONS_ENABLE_FALSE) {
4692                 gk20a_writel(g,
4693                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(), 0);
4694                 gk20a_writel(g,
4695                         gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(), 0);
4696         } else {
4697                 /* setup sm warp esr report masks */
4698                 gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(),
4699                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_stack_error_report_f() |
4700                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_api_stack_error_report_f() |
4701                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_ret_empty_stack_error_report_f() |
4702                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_wrap_report_f() |
4703                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_pc_report_f() |
4704                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_overflow_report_f() |
4705                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_immc_addr_report_f() |
4706                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_reg_report_f() |
4707                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_encoding_report_f() |
4708                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_sph_instr_combo_report_f() |
4709                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param_report_f() |
4710                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_report_f() |
4711                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_reg_report_f() |
4712                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_addr_report_f() |
4713                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_addr_report_f() |
4714                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_addr_space_report_f() |
4715                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param2_report_f() |
4716                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_ldc_report_f() |
4717                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_geometry_sm_error_report_f() |
4718                         gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_divergent_report_f());
4719
4720                 /* setup sm global esr report mask */
4721                 gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(),
4722                         gr_gpcs_tpcs_sm_hww_global_esr_report_mask_sm_to_sm_fault_report_f() |
4723                         gr_gpcs_tpcs_sm_hww_global_esr_report_mask_l1_error_report_f() |
4724                         gr_gpcs_tpcs_sm_hww_global_esr_report_mask_multiple_warp_errors_report_f() |
4725                         gr_gpcs_tpcs_sm_hww_global_esr_report_mask_physical_stack_overflow_error_report_f() |
4726                         gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_int_report_f() |
4727                         gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_pause_report_f() |
4728                         gr_gpcs_tpcs_sm_hww_global_esr_report_mask_single_step_complete_report_f());
4729         }
4730 }
4731
4732 static void gk20a_gr_set_circular_buffer_size(struct gk20a *g, u32 data)
4733 {
4734         struct gr_gk20a *gr = &g->gr;
4735         u32 gpc_index, ppc_index, stride, val, offset;
4736         u32 cb_size = data * 4;
4737
4738         gk20a_dbg_fn("");
4739
4740         if (cb_size > gr->attrib_cb_size)
4741                 cb_size = gr->attrib_cb_size;
4742
4743         gk20a_writel(g, gr_ds_tga_constraintlogic_r(),
4744                 (gk20a_readl(g, gr_ds_tga_constraintlogic_r()) &
4745                  ~gr_ds_tga_constraintlogic_beta_cbsize_f(~0)) |
4746                  gr_ds_tga_constraintlogic_beta_cbsize_f(cb_size));
4747
4748         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4749                 stride = proj_gpc_stride_v() * gpc_index;
4750
4751                 for (ppc_index = 0; ppc_index < gr->gpc_ppc_count[gpc_index];
4752                         ppc_index++) {
4753
4754                         val = gk20a_readl(g, gr_gpc0_ppc0_cbm_cfg_r() +
4755                                 stride +
4756                                 proj_ppc_in_gpc_stride_v() * ppc_index);
4757
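                        /*
                         * The cbm_cfg register is written twice below: first
                         * with start_offset bumped to offset + 1, then with
                         * the original offset restored.  This double write
                         * appears intended to make the CBM re-latch the new
                         * size; the final start offset is unchanged.
                         */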
4758                         offset = gr_gpc0_ppc0_cbm_cfg_start_offset_v(val);
4759
4760                         val = set_field(val,
4761                                 gr_gpc0_ppc0_cbm_cfg_size_m(),
4762                                 gr_gpc0_ppc0_cbm_cfg_size_f(cb_size *
4763                                         gr->pes_tpc_count[ppc_index][gpc_index]));
4764                         val = set_field(val,
4765                                 gr_gpc0_ppc0_cbm_cfg_start_offset_m(),
4766                                 (offset + 1));
4767
4768                         gk20a_writel(g, gr_gpc0_ppc0_cbm_cfg_r() +
4769                                 stride +
4770                                 proj_ppc_in_gpc_stride_v() * ppc_index, val);
4771
4772                         val = set_field(val,
4773                                 gr_gpc0_ppc0_cbm_cfg_start_offset_m(),
4774                                 offset);
4775
4776                         gk20a_writel(g, gr_gpc0_ppc0_cbm_cfg_r() +
4777                                 stride +
4778                                 proj_ppc_in_gpc_stride_v() * ppc_index, val);
4779                 }
4780         }
4781 }
4782
4783 static void gk20a_gr_set_alpha_circular_buffer_size(struct gk20a *g, u32 data)
4784 {
4785         struct gr_gk20a *gr = &g->gr;
4786         u32 gpc_index, ppc_index, stride, val;
4787         u32 pd_ab_max_output;
4788         u32 alpha_cb_size = data * 4;
4789
4790         gk20a_dbg_fn("");
4791         /* if (NO_ALPHA_BETA_TIMESLICE_SUPPORT_DEF)
4792                 return; */
4793
4794         if (alpha_cb_size > gr->alpha_cb_size)
4795                 alpha_cb_size = gr->alpha_cb_size;
4796
4797         gk20a_writel(g, gr_ds_tga_constraintlogic_r(),
4798                 (gk20a_readl(g, gr_ds_tga_constraintlogic_r()) &
4799                  ~gr_ds_tga_constraintlogic_alpha_cbsize_f(~0)) |
4800                  gr_ds_tga_constraintlogic_alpha_cbsize_f(alpha_cb_size));
4801
4802         pd_ab_max_output = alpha_cb_size *
4803                 gr_gpc0_ppc0_cbm_cfg_size_granularity_v() /
4804                 gr_pd_ab_dist_cfg1_max_output_granularity_v();
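        /*
         * Convert the alpha CB size from cbm_cfg size granularity into the
         * max_output granularity expected by gr_pd_ab_dist_cfg1.
         */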
4805
4806         gk20a_writel(g, gr_pd_ab_dist_cfg1_r(),
4807                 gr_pd_ab_dist_cfg1_max_output_f(pd_ab_max_output));
4808
4809         for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4810                 stride = proj_gpc_stride_v() * gpc_index;
4811
4812                 for (ppc_index = 0; ppc_index < gr->gpc_ppc_count[gpc_index];
4813                         ppc_index++) {
4814
4815                         val = gk20a_readl(g, gr_gpc0_ppc0_cbm_cfg2_r() +
4816                                 stride +
4817                                 proj_ppc_in_gpc_stride_v() * ppc_index);
4818
4819                         val = set_field(val, gr_gpc0_ppc0_cbm_cfg2_size_m(),
4820                                         gr_gpc0_ppc0_cbm_cfg2_size_f(alpha_cb_size *
4821                                                 gr->pes_tpc_count[ppc_index][gpc_index]));
4822
4823                         gk20a_writel(g, gr_gpc0_ppc0_cbm_cfg2_r() +
4824                                 stride +
4825                                 proj_ppc_in_gpc_stride_v() * ppc_index, val);
4826                 }
4827         }
4828 }
4829
4830 int gk20a_gr_reset(struct gk20a *g)
4831 {
4832         int err;
4833         u32 size;
4834
4835         err = gk20a_init_gr_prepare(g);
4836         if (err)
4837                 return err;
4838
4839         err = gk20a_init_gr_reset_enable_hw(g);
4840         if (err)
4841                 return err;
4842
4843         err = gk20a_init_gr_setup_hw(g);
4844         if (err)
4845                 return err;
4846
4847         size = 0;
4848         err = gr_gk20a_fecs_get_reglist_img_size(g, &size);
4849         if (err) {
4850                 gk20a_err(dev_from_gk20a(g),
4851                         "fail to query fecs pg buffer size");
4852                 return err;
4853         }
4854
4855         err = gr_gk20a_fecs_set_reglist_bind_inst(g,
4856                         g->mm.pmu.inst_block.cpu_pa);
4857         if (err) {
4858                 gk20a_err(dev_from_gk20a(g),
4859                         "fail to bind pmu inst to gr");
4860                 return err;
4861         }
4862
4863         err = gr_gk20a_fecs_set_reglist_virtual_addr(g, g->pmu.pg_buf.pmu_va);
4864         if (err) {
4865                 gk20a_err(dev_from_gk20a(g),
4866                         "fail to set pg buffer pmu va");
4867                 return err;
4868         }
4869
4870         return 0;
4871 }
4872
4873 static int gr_gk20a_handle_sw_method(struct gk20a *g, u32 addr,
4874                                           u32 class_num, u32 offset, u32 data)
4875 {
4876         gk20a_dbg_fn("");
4877
4878         if (class_num == KEPLER_COMPUTE_A) {
4879                 switch (offset << 2) {
4880                 case NVA0C0_SET_SHADER_EXCEPTIONS:
4881                         gk20a_gr_set_shader_exceptions(g, data);
4882                         break;
4883                 default:
4884                         goto fail;
4885                 }
4886         }
4887
4888         if (class_num == KEPLER_C) {
4889                 switch (offset << 2) {
4890                 case NVA297_SET_SHADER_EXCEPTIONS:
4891                         gk20a_gr_set_shader_exceptions(g, data);
4892                         break;
4893                 case NVA297_SET_CIRCULAR_BUFFER_SIZE:
4894                         g->ops.gr.set_circular_buffer_size(g, data);
4895                         break;
4896                 case NVA297_SET_ALPHA_CIRCULAR_BUFFER_SIZE:
4897                         g->ops.gr.set_alpha_circular_buffer_size(g, data);
4898                         break;
4899                 default:
4900                         goto fail;
4901                 }
4902         }
4903         return 0;
4904
4905 fail:
4906         return -EINVAL;
4907 }
4908
4909 static int gk20a_gr_handle_semaphore_timeout_pending(struct gk20a *g,
4910                   struct gr_isr_data *isr_data)
4911 {
4912         struct fifo_gk20a *f = &g->fifo;
4913         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4914         gk20a_dbg_fn("");
4915         gk20a_set_error_notifier(ch,
4916                                 NVHOST_CHANNEL_GR_SEMAPHORE_TIMEOUT);
4917         gk20a_err(dev_from_gk20a(g),
4918                    "gr semaphore timeout\n");
4919         return -EINVAL;
4920 }
4921
4922 static int gk20a_gr_intr_illegal_notify_pending(struct gk20a *g,
4923                   struct gr_isr_data *isr_data)
4924 {
4925         struct fifo_gk20a *f = &g->fifo;
4926         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4927         gk20a_dbg_fn("");
4928         gk20a_set_error_notifier(ch,
4929                                 NVHOST_CHANNEL_GR_ILLEGAL_NOTIFY);
4930         /* This is an unrecoverable error, reset is needed */
4931         gk20a_err(dev_from_gk20a(g),
4932                    "gr semaphore timeout\n");
4933         return -EINVAL;
4934 }
4935
4936 static int gk20a_gr_handle_illegal_method(struct gk20a *g,
4937                                           struct gr_isr_data *isr_data)
4938 {
4939         int ret = g->ops.gr.handle_sw_method(g, isr_data->addr,
4940                         isr_data->class_num, isr_data->offset,
4941                         isr_data->data_lo);
4942         if (ret)
4943                 gk20a_err(dev_from_gk20a(g), "invalid method class 0x%08x"
4944                         ", offset 0x%08x address 0x%08x\n",
4945                         isr_data->class_num, isr_data->offset, isr_data->addr);
4946
4947         return ret;
4948 }
4949
4950 static int gk20a_gr_handle_illegal_class(struct gk20a *g,
4951                                           struct gr_isr_data *isr_data)
4952 {
4953         struct fifo_gk20a *f = &g->fifo;
4954         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4955         gk20a_dbg_fn("");
4956         gk20a_set_error_notifier(ch,
4957                                 NVHOST_CHANNEL_GR_ERROR_SW_NOTIFY);
4958         gk20a_err(dev_from_gk20a(g),
4959                    "invalid class 0x%08x, offset 0x%08x",
4960                    isr_data->class_num, isr_data->offset);
4961         return -EINVAL;
4962 }
4963
4964 static int gk20a_gr_handle_fecs_error(struct gk20a *g,
4965                                           struct gr_isr_data *isr_data)
4966 {
4967         struct fifo_gk20a *f = &g->fifo;
4968         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4969         u32 gr_fecs_intr = gk20a_readl(g, gr_fecs_intr_r());
4970         gk20a_dbg_fn("");
4971
4972         gk20a_err(dev_from_gk20a(g),
4973                    "unhandled fecs error interrupt 0x%08x for channel %u",
4974                    gr_fecs_intr, ch->hw_chid);
4975
4976         gk20a_writel(g, gr_fecs_intr_r(), gr_fecs_intr);
4977         return -EINVAL;
4978 }
4979
4980 static int gk20a_gr_handle_class_error(struct gk20a *g,
4981                                           struct gr_isr_data *isr_data)
4982 {
4983         struct fifo_gk20a *f = &g->fifo;
4984         struct channel_gk20a *ch = &f->channel[isr_data->chid];
4985         u32 gr_class_error =
4986                 gr_class_error_code_v(gk20a_readl(g, gr_class_error_r()));
4987         gk20a_dbg_fn("");
4988
4989         gk20a_set_error_notifier(ch,
4990                         NVHOST_CHANNEL_GR_ERROR_SW_NOTIFY);
4991         gk20a_err(dev_from_gk20a(g),
4992                    "class error 0x%08x, offset 0x%08x, unhandled intr 0x%08x for channel %u\n",
4993                    isr_data->class_num, isr_data->offset,
4994                    gr_class_error, ch->hw_chid);
4995         return -EINVAL;
4996 }
4997
4998 static int gk20a_gr_handle_semaphore_pending(struct gk20a *g,
4999                                              struct gr_isr_data *isr_data)
5000 {
5001         struct fifo_gk20a *f = &g->fifo;
5002         struct channel_gk20a *ch = &f->channel[isr_data->chid];
5003
5004         wake_up(&ch->semaphore_wq);
5005
5006         return 0;
5007 }
5008
5009 #if defined(CONFIG_GK20A_CYCLE_STATS)
5010 static inline bool is_valid_cyclestats_bar0_offset_gk20a(struct gk20a *g,
5011                                                          u32 offset)
5012 {
5013         /* support only 24-bit 4-byte aligned offsets */
5014         bool valid = !(offset & 0xFF000003);
5015         /* whitelist check */
5016         valid = valid &&
5017                 is_bar0_global_offset_whitelisted_gk20a(offset);
5018         /* resource size check in case there was a problem
5019          * with allocating the assumed size of bar0 */
5020         valid = valid &&
5021                 offset < resource_size(g->reg_mem);
5022         return valid;
5023 }
5024 #endif
5025
5026 static int gk20a_gr_handle_notify_pending(struct gk20a *g,
5027                                           struct gr_isr_data *isr_data)
5028 {
5029         struct fifo_gk20a *f = &g->fifo;
5030         struct channel_gk20a *ch = &f->channel[isr_data->chid];
5031
5032 #if defined(CONFIG_GK20A_CYCLE_STATS)
5033         void *virtual_address;
5034         u32 buffer_size;
5035         u32 offset;
5036         u32 new_offset;
5037         bool exit;
5038         struct share_buffer_head *sh_hdr;
5039         u32 raw_reg;
5040         u64 mask_orig;
5041         u64 v = 0;
5042         struct gk20a_cyclestate_buffer_elem *op_elem;
5043         /* GL will never use payload 0 for cycle state */
5044         if ((ch->cyclestate.cyclestate_buffer == NULL) || (isr_data->data_lo == 0))
5045                 return 0;
5046
5047         mutex_lock(&ch->cyclestate.cyclestate_buffer_mutex);
5048
5049         virtual_address = ch->cyclestate.cyclestate_buffer;
5050         buffer_size = ch->cyclestate.cyclestate_buffer_size;
5051         offset = isr_data->data_lo;
5052         exit = false;
5053         while (!exit) {
5054                 if (offset >= buffer_size) {
5055                         WARN_ON(1);
5056                         break;
5057                 }
5058
5059                 sh_hdr = (struct share_buffer_head *)
5060                         ((char *)virtual_address + offset);
5061
5062                 if (sh_hdr->size < sizeof(struct share_buffer_head)) {
5063                         WARN_ON(1);
5064                         break;
5065                 }
5066                 new_offset = offset + sh_hdr->size;
5067
5068                 switch (sh_hdr->operation) {
5069                 case OP_END:
5070                         exit = true;
5071                         break;
5072
5073                 case BAR0_READ32:
5074                 case BAR0_WRITE32:
5075                 {
5076                         bool valid;
5077                         op_elem =
5078                                 (struct gk20a_cyclestate_buffer_elem *)
5079                                         sh_hdr;
5080                         valid = is_valid_cyclestats_bar0_offset_gk20a(g,
5081                                                         op_elem->offset_bar0);
5082                         if (!valid) {
5083                                 gk20a_err(dev_from_gk20a(g),
5084                                            "invalid cycletstats op offset: 0x%x\n",
5085                                            op_elem->offset_bar0);
5086
5087                                 sh_hdr->failed = exit = true;
5088                                 break;
5089                         }
5090
5091
5092                         mask_orig =
5093                                 ((1ULL <<
5094                                   (op_elem->last_bit + 1))
5095                                  -1)&~((1ULL <<
5096                                         op_elem->first_bit)-1);
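                        /*
                         * mask_orig selects bits first_bit..last_bit
                         * inclusive, e.g. first_bit = 4, last_bit = 7
                         * gives 0xf0.
                         */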
5097
5098                         raw_reg =
5099                                 gk20a_readl(g,
5100                                             op_elem->offset_bar0);
5101
5102                         switch (sh_hdr->operation) {
5103                         case BAR0_READ32:
5104                                 op_elem->data =
5105                                         (raw_reg & mask_orig)
5106                                         >> op_elem->first_bit;
5107                                 break;
5108
5109                         case BAR0_WRITE32:
5110                                 v = 0;
5111                                 if ((unsigned int)mask_orig !=
5112                                     (unsigned int)~0) {
5113                                         v = (unsigned int)
5114                                                 (raw_reg & ~mask_orig);
5115                                 }
5116
5117                                 v |= ((op_elem->data
5118                                        << op_elem->first_bit)
5119                                       & mask_orig);
5120
5121                                 gk20a_writel(g,
5122                                              op_elem->offset_bar0,
5123                                              (unsigned int)v);
5124                                 break;
5125                         default:
                                /* unknown operation, treat as a no-op */
5127                                 break;
5128                         }
5129                 }
5130                 break;
5131
5132                 default:
5133                         /* no operation content case */
5134                         exit = true;
5135                         break;
5136                 }
5137                 sh_hdr->completed = true;
5138                 offset = new_offset;
5139         }
5140         mutex_unlock(&ch->cyclestate.cyclestate_buffer_mutex);
5141 #endif
5142         gk20a_dbg_fn("");
5143         wake_up(&ch->notifier_wq);
5144         return 0;
5145 }
5146
5147 /* Used by sw interrupt thread to translate current ctx to chid.
5148  * For performance, we don't want to go through 128 channels every time.
5149  * curr_ctx should be the value read from gr_fecs_current_ctx_r().
5150  * A small tlb is used here to cache translation */
5151 static int gk20a_gr_get_chid_from_ctx(struct gk20a *g, u32 curr_ctx)
5152 {
5153         struct fifo_gk20a *f = &g->fifo;
5154         struct gr_gk20a *gr = &g->gr;
5155         u32 chid = -1;
5156         u32 i;
5157
5158         /* when contexts are unloaded from GR, the valid bit is reset
5159          * but the instance pointer information remains intact. So the
5160          * valid bit must be checked to be absolutely certain that a
5161          * valid context is currently resident. */
5162         if (!gr_fecs_current_ctx_valid_v(curr_ctx))
5163                 return -1;
5164
5165         spin_lock(&gr->ch_tlb_lock);
5166
5167         /* check cache first */
5168         for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) {
5169                 if (gr->chid_tlb[i].curr_ctx == curr_ctx) {
5170                         chid = gr->chid_tlb[i].hw_chid;
5171                         goto unlock;
5172                 }
5173         }
5174
        /* slow path: find the in-use channel whose instance block address
         * (shifted down by ram_in_base_shift) matches the fecs current ctx
         * pointer */
        for (chid = 0; chid < f->num_channels; chid++) {
                if (f->channel[chid].in_use) {
                        if ((u32)(f->channel[chid].inst_block.cpu_pa >>
                                ram_in_base_shift_v()) ==
                                gr_fecs_current_ctx_ptr_v(curr_ctx))
                                break;
                }
        }
5183
5184         if (chid >= f->num_channels) {
5185                 chid = -1;
5186                 goto unlock;
5187         }
5188
5189         /* add to free tlb entry */
5190         for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) {
5191                 if (gr->chid_tlb[i].curr_ctx == 0) {
5192                         gr->chid_tlb[i].curr_ctx = curr_ctx;
5193                         gr->chid_tlb[i].hw_chid = chid;
5194                         goto unlock;
5195                 }
5196         }
5197
5198         /* no free entry, flush one */
5199         gr->chid_tlb[gr->channel_tlb_flush_index].curr_ctx = curr_ctx;
5200         gr->chid_tlb[gr->channel_tlb_flush_index].hw_chid = chid;
5201
5202         gr->channel_tlb_flush_index =
5203                 (gr->channel_tlb_flush_index + 1) &
5204                 (GR_CHANNEL_MAP_TLB_SIZE - 1);
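        /*
         * Eviction is simple round-robin: channel_tlb_flush_index wraps via
         * the mask above, so GR_CHANNEL_MAP_TLB_SIZE must be a power of two.
         */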
5205
5206 unlock:
5207         spin_unlock(&gr->ch_tlb_lock);
5208         return chid;
5209 }
5210
5211 static int gk20a_gr_lock_down_sm(struct gk20a *g, u32 global_esr_mask)
5212 {
5213         unsigned long end_jiffies = jiffies +
5214                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
5215         u32 delay = GR_IDLE_CHECK_DEFAULT;
5216         bool mmu_debug_mode_enabled = gk20a_mm_mmu_debug_mode_enabled(g);
5217         u32 dbgr_control0;
5218
5219         gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "locking down SM");
5220
5221         /* assert stop trigger */
5222         dbgr_control0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());
5223         dbgr_control0 |= gr_gpc0_tpc0_sm_dbgr_control0_stop_trigger_enable_f();
5224         gk20a_writel(g, gr_gpc0_tpc0_sm_dbgr_control0_r(), dbgr_control0);
5225
5226         /* wait for the sm to lock down */
5227         do {
5228                 u32 global_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r());
5229                 u32 warp_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_warp_esr_r());
5230                 u32 dbgr_status0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_status0_r());
5231                 bool locked_down =
5232                         (gr_gpc0_tpc0_sm_dbgr_status0_locked_down_v(dbgr_status0) ==
5233                          gr_gpc0_tpc0_sm_dbgr_status0_locked_down_true_v());
5234                 bool error_pending =
5235                         (gr_gpc0_tpc0_sm_hww_warp_esr_error_v(warp_esr) !=
5236                          gr_gpc0_tpc0_sm_hww_warp_esr_error_none_v()) ||
5237                         ((global_esr & ~global_esr_mask) != 0);
5238
5239                 if (locked_down || !error_pending) {
5240                         gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "locked down SM");
5241
5242                         /* de-assert stop trigger */
5243                         dbgr_control0 &= ~gr_gpc0_tpc0_sm_dbgr_control0_stop_trigger_enable_f();
5244                         gk20a_writel(g, gr_gpc0_tpc0_sm_dbgr_control0_r(), dbgr_control0);
5245
5246                         return 0;
5247                 }
5248
5249                 /* if an mmu fault is pending and mmu debug mode is not
5250                  * enabled, the sm will never lock down. */
5251                 if (!mmu_debug_mode_enabled && gk20a_fifo_mmu_fault_pending(g)) {
5252                         gk20a_err(dev_from_gk20a(g), "mmu fault pending, sm will"
5253                                    " never lock down!");
5254                         return -EFAULT;
5255                 }
5256
5257                 usleep_range(delay, delay * 2);
5258                 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
5259
5260         } while (time_before(jiffies, end_jiffies)
5261                         || !tegra_platform_is_silicon());
5262
5263         gk20a_err(dev_from_gk20a(g), "timed out while trying to lock down SM");
5264
5265         return -EAGAIN;
5266 }
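
/*
 * Editor's note: illustrative sketch, not part of the original driver.
 * The lockdown loop above follows a common poll-with-backoff pattern:
 * start at GR_IDLE_CHECK_DEFAULT microseconds, double the delay up to
 * GR_IDLE_CHECK_MAX, and give up at end_jiffies (except on pre-silicon,
 * where the wait is unbounded).  A hypothetical generic helper built from
 * the same primitives would look like this:
 */
static int gr_gk20a_poll_until_sketch(struct gk20a *g,
                                      bool (*done)(struct gk20a *g),
                                      unsigned long end_jiffies)
{
        u32 delay = GR_IDLE_CHECK_DEFAULT;

        do {
                if (done(g))
                        return 0;

                usleep_range(delay, delay * 2);
                delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
        } while (time_before(jiffies, end_jiffies)
                        || !tegra_platform_is_silicon());

        return -EAGAIN;
}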
5267
5268 bool gk20a_gr_sm_debugger_attached(struct gk20a *g)
5269 {
5270         u32 dbgr_control0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());
5271
5272         /* check if an sm debugger is attached */
5273         if (gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_v(dbgr_control0) ==
5274                         gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_on_v())
5275                 return true;
5276
5277         return false;
5278 }
5279
5280 static void gk20a_gr_clear_sm_hww(struct gk20a *g, u32 global_esr)
5281 {
5282         gk20a_writel(g, gr_gpc0_tpc0_sm_hww_global_esr_r(), global_esr);
5283
5284         /* clear the warp hww */
5285         gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_r(),
5286                         gr_gpc0_tpc0_sm_hww_warp_esr_error_none_f());
5287 }
5288
5289 static struct channel_gk20a *
5290 channel_from_hw_chid(struct gk20a *g, u32 hw_chid)
5291 {
5292         return g->fifo.channel+hw_chid;
5293 }
5294
5295 static int gk20a_gr_handle_sm_exception(struct gk20a *g,
5296                 struct gr_isr_data *isr_data)
5297 {
5298         int ret = 0;
5299         bool do_warp_sync = false;
5300         /* these three interrupts don't require locking down the SM. They can
5301          * be handled by usermode clients as they aren't fatal. Additionally,
5302          * usermode clients may wish to allow some warps to execute while others
5303          * are at breakpoints, as opposed to fatal errors where all warps should
5304          * halt. */
5305         u32 global_mask = gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f()   |
5306                           gr_gpc0_tpc0_sm_hww_global_esr_bpt_pause_pending_f() |
5307                           gr_gpc0_tpc0_sm_hww_global_esr_single_step_complete_pending_f();
5308         u32 global_esr, warp_esr;
5309         bool sm_debugger_attached = gk20a_gr_sm_debugger_attached(g);
5310         struct channel_gk20a *fault_ch;
5311
5312         gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
5313
5314         global_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r());
5315         warp_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_warp_esr_r());
5316
5317         /* if an sm debugger is attached, disable forwarding of tpc exceptions.
5318          * the debugger will reenable exceptions after servicing them. */
5319         if (sm_debugger_attached) {
5320                 u32 tpc_exception_en = gk20a_readl(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r());
5321                 tpc_exception_en &= ~gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_enabled_f();
5322                 gk20a_writel(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r(), tpc_exception_en);
5323                 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "SM debugger attached");
5324         }
5325
5326         /* if a debugger is present and an error has occurred, do a warp sync */
5327         if (sm_debugger_attached && ((warp_esr != 0) || ((global_esr & ~global_mask) != 0))) {
5328                 gk20a_dbg(gpu_dbg_intr, "warp sync needed");
5329                 do_warp_sync = true;
5330         }
5331
5332         if (do_warp_sync) {
5333                 ret = gk20a_gr_lock_down_sm(g, global_mask);
5334                 if (ret) {
5335                         gk20a_err(dev_from_gk20a(g), "sm did not lock down!\n");
5336                         return ret;
5337                 }
5338         }
5339
5340         /* finally, signal any client waiting on an event */
5341         fault_ch = channel_from_hw_chid(g, isr_data->chid);
5342         if (fault_ch)
5343                 gk20a_dbg_gpu_post_events(fault_ch);
5344
5345         return ret;
5346 }
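
/*
 * Editor's note: illustrative sketch, not part of the original driver.
 * The decision made in gk20a_gr_handle_sm_exception() above can be read
 * as a single predicate: any warp error, or any global error outside the
 * non-fatal (debugger-recoverable) mask, means the SM must be locked
 * down before the debugger can safely inspect it.
 */
static inline bool gr_gk20a_sm_needs_lockdown_sketch(u32 global_esr,
                                                     u32 warp_esr,
                                                     u32 global_mask)
{
        return (warp_esr != 0) || ((global_esr & ~global_mask) != 0);
}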
5347
5348 static int gk20a_gr_handle_tpc_exception(struct gk20a *g,
5349                 struct gr_isr_data *isr_data)
5350 {
5351         int ret = 0;
5352         u32 tpc_exception = gk20a_readl(g, gr_gpcs_tpcs_tpccs_tpc_exception_r());
5353
5354         gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "");
5355
5356         /* check if an sm exception is pending */
5357         if (gr_gpcs_tpcs_tpccs_tpc_exception_sm_v(tpc_exception) ==
5358                         gr_gpcs_tpcs_tpccs_tpc_exception_sm_pending_v()) {
5359                 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "SM exception pending");
5360                 ret = gk20a_gr_handle_sm_exception(g, isr_data);
5361         }
5362
5363         return ret;
5364 }
5365
5366 static int gk20a_gr_handle_gpc_exception(struct gk20a *g,
5367                 struct gr_isr_data *isr_data)
5368 {
5369         int ret = 0;
5370         u32 gpc_exception = gk20a_readl(g, gr_gpcs_gpccs_gpc_exception_r());
5371
5372         gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "");
5373
5374         /* check if tpc 0 has an exception */
5375         if (gr_gpcs_gpccs_gpc_exception_tpc_v(gpc_exception) ==
5376                         gr_gpcs_gpccs_gpc_exception_tpc_0_pending_v()) {
5377                 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "TPC exception pending");
5378                 ret = gk20a_gr_handle_tpc_exception(g, isr_data);
5379         }
5380
5381         return ret;
5382 }
5383
5384 int gk20a_gr_isr(struct gk20a *g)
5385 {
5386         struct gr_isr_data isr_data;
5387         u32 grfifo_ctl;
5388         u32 obj_table;
5389         int need_reset = 0;
5390         u32 gr_intr = gk20a_readl(g, gr_intr_r());
5391
5392         gk20a_dbg_fn("");
5393         gk20a_dbg(gpu_dbg_intr, "pgraph intr %08x", gr_intr);
5394
5395         if (!gr_intr)
5396                 return 0;
5397
5398         grfifo_ctl = gk20a_readl(g, gr_gpfifo_ctl_r());
5399         grfifo_ctl &= ~gr_gpfifo_ctl_semaphore_access_f(1);
5400         grfifo_ctl &= ~gr_gpfifo_ctl_access_f(1);
5401
5402         gk20a_writel(g, gr_gpfifo_ctl_r(),
5403                 grfifo_ctl | gr_gpfifo_ctl_access_f(0) |
5404                 gr_gpfifo_ctl_semaphore_access_f(0));
5405
5406         isr_data.addr = gk20a_readl(g, gr_trapped_addr_r());
5407         isr_data.data_lo = gk20a_readl(g, gr_trapped_data_lo_r());
5408         isr_data.data_hi = gk20a_readl(g, gr_trapped_data_hi_r());
5409         isr_data.curr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r());
5410         isr_data.offset = gr_trapped_addr_mthd_v(isr_data.addr);
5411         isr_data.sub_chan = gr_trapped_addr_subch_v(isr_data.addr);
5412         obj_table = gk20a_readl(g,
5413                 gr_fe_object_table_r(isr_data.sub_chan));
5414         isr_data.class_num = gr_fe_object_table_nvclass_v(obj_table);
5415
5416         isr_data.chid =
5417                 gk20a_gr_get_chid_from_ctx(g, isr_data.curr_ctx);
5418         if (isr_data.chid == -1) {
5419                 gk20a_err(dev_from_gk20a(g), "invalid channel ctx 0x%08x",
5420                            isr_data.curr_ctx);
5421                 goto clean_up;
5422         }
5423
5424         gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
5425                 "channel %d: addr 0x%08x, "
5426                 "data 0x%08x 0x%08x,"
5427                 "ctx 0x%08x, offset 0x%08x, "
5428                 "subchannel 0x%08x, class 0x%08x",
5429                 isr_data.chid, isr_data.addr,
5430                 isr_data.data_hi, isr_data.data_lo,
5431                 isr_data.curr_ctx, isr_data.offset,
5432                 isr_data.sub_chan, isr_data.class_num);
5433
5434         if (gr_intr & gr_intr_notify_pending_f()) {
5435                 gk20a_gr_handle_notify_pending(g, &isr_data);
5436                 gk20a_writel(g, gr_intr_r(),
5437                         gr_intr_notify_reset_f());
5438                 gr_intr &= ~gr_intr_notify_pending_f();
5439         }
5440
5441         if (gr_intr & gr_intr_semaphore_pending_f()) {
5442                 gk20a_gr_handle_semaphore_pending(g, &isr_data);
5443                 gk20a_writel(g, gr_intr_r(),
5444                         gr_intr_semaphore_reset_f());
5445                 gr_intr &= ~gr_intr_semaphore_pending_f();
5446         }
5447
5448         if (gr_intr & gr_intr_semaphore_timeout_pending_f()) {
5449                 need_reset |= gk20a_gr_handle_semaphore_timeout_pending(g,
5450                         &isr_data);
5451                 gk20a_writel(g, gr_intr_r(),
5452                         gr_intr_semaphore_reset_f());
5453                 gr_intr &= ~gr_intr_semaphore_timeout_pending_f();
5454         }
5455
5456         if (gr_intr & gr_intr_illegal_notify_pending_f()) {
5457                 need_reset |= gk20a_gr_intr_illegal_notify_pending(g,
5458                         &isr_data);
5459                 gk20a_writel(g, gr_intr_r(),
5460                         gr_intr_illegal_notify_reset_f());
5461                 gr_intr &= ~gr_intr_illegal_notify_pending_f();
5462         }
5463
5464         if (gr_intr & gr_intr_illegal_method_pending_f()) {
5465                 need_reset |= gk20a_gr_handle_illegal_method(g, &isr_data);
5466                 gk20a_writel(g, gr_intr_r(),
5467                         gr_intr_illegal_method_reset_f());
5468                 gr_intr &= ~gr_intr_illegal_method_pending_f();
5469         }
5470
5471         if (gr_intr & gr_intr_illegal_class_pending_f()) {
5472                 need_reset |= gk20a_gr_handle_illegal_class(g, &isr_data);
5473                 gk20a_writel(g, gr_intr_r(),
5474                         gr_intr_illegal_class_reset_f());
5475                 gr_intr &= ~gr_intr_illegal_class_pending_f();
5476         }
5477
5478         if (gr_intr & gr_intr_fecs_error_pending_f()) {
5479                 need_reset |= gk20a_gr_handle_fecs_error(g, &isr_data);
5480                 gk20a_writel(g, gr_intr_r(),
5481                         gr_intr_fecs_error_reset_f());
5482                 gr_intr &= ~gr_intr_fecs_error_pending_f();
5483         }
5484
5485         if (gr_intr & gr_intr_class_error_pending_f()) {
5486                 need_reset |= gk20a_gr_handle_class_error(g, &isr_data);
5487                 gk20a_writel(g, gr_intr_r(),
5488                         gr_intr_class_error_reset_f());
5489                 gr_intr &= ~gr_intr_class_error_pending_f();
5490         }
5491
5492         /* this one happens if someone tries to hit a non-whitelisted
5493          * register using set_falcon[4] */
5494         if (gr_intr & gr_intr_firmware_method_pending_f()) {
5495                 need_reset |= true;
5496                 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "firmware method intr pending\n");
5497                 gk20a_writel(g, gr_intr_r(),
5498                         gr_intr_firmware_method_reset_f());
5499                 gr_intr &= ~gr_intr_firmware_method_pending_f();
5500         }
5501
5502         if (gr_intr & gr_intr_exception_pending_f()) {
5503                 u32 exception = gk20a_readl(g, gr_exception_r());
5504                 struct fifo_gk20a *f = &g->fifo;
5505                 struct channel_gk20a *ch = &f->channel[isr_data.chid];
5506
5507                 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "exception %08x\n", exception);
5508
5509                 if (exception & gr_exception_fe_m()) {
5510                         u32 fe = gk20a_readl(g, gr_fe_hww_esr_r());
5511                         gk20a_dbg(gpu_dbg_intr, "fe warning %08x\n", fe);
5512                         gk20a_writel(g, gr_fe_hww_esr_r(), fe);
5513                         need_reset |= -EFAULT;
5514                 }
5515
5516                 /* check if a gpc exception has occurred */
5517                 if ((exception & gr_exception_gpc_m()) && need_reset == 0) {
5518                         u32 exception1 = gk20a_readl(g, gr_exception1_r());
5519                         u32 global_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r());
5520
5521                         gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "GPC exception pending");
5522
5523                         /* if no sm debugger is present, clean up the channel */
5524                         if (!gk20a_gr_sm_debugger_attached(g)) {
5525                                 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
5526                                            "SM debugger not attached, clearing interrupt");
5527                                 need_reset |= -EFAULT;
5528                         } else {
5529                                 /* check if gpc 0 has an exception */
5530                                 if (exception1 & gr_exception1_gpc_0_pending_f())
5531                                         need_reset |= gk20a_gr_handle_gpc_exception(g, &isr_data);
5532                                 /* clear the hwws; this also causes the tpc
5533                                  * and gpc exceptions to be cleared */
5534                                 gk20a_gr_clear_sm_hww(g, global_esr);
5535                         }
5536
5537                         if (need_reset)
5538                                 gk20a_set_error_notifier(ch,
5539                                         NVHOST_CHANNEL_GR_ERROR_SW_NOTIFY);
5540                 }
5541
5542                 gk20a_writel(g, gr_intr_r(), gr_intr_exception_reset_f());
5543                 gr_intr &= ~gr_intr_exception_pending_f();
5544         }
5545
5546         if (need_reset)
5547                 gk20a_fifo_recover(g, BIT(ENGINE_GR_GK20A), true);
5548
5549 clean_up:
5550         gk20a_writel(g, gr_gpfifo_ctl_r(),
5551                 grfifo_ctl | gr_gpfifo_ctl_access_f(1) |
5552                 gr_gpfifo_ctl_semaphore_access_f(1));
5553
5554         if (gr_intr)
5555                 gk20a_err(dev_from_gk20a(g),
5556                            "unhandled gr interrupt 0x%08x", gr_intr);
5557
5558         return 0;
5559 }
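
/*
 * Editor's note: illustrative sketch, not part of the original driver.
 * Each error case in gk20a_gr_isr() above repeats the same acknowledge
 * pattern: call the handler, write the reset value to gr_intr_r(), and
 * clear the bit in the local copy of gr_intr.  A hypothetical table-driven
 * form of that pattern, reusing the existing handlers (the table is built
 * at run time because the _pending_f()/_reset_f() accessors are functions,
 * not compile-time constants):
 */
struct gr_intr_handler_sketch {
        u32 pending_mask;
        u32 reset_val;
        int (*handle)(struct gk20a *g, struct gr_isr_data *isr_data);
};

static int gr_gk20a_dispatch_intr_sketch(struct gk20a *g, u32 *gr_intr,
                                struct gr_isr_data *isr_data,
                                const struct gr_intr_handler_sketch *tbl,
                                u32 entries)
{
        int need_reset = 0;
        u32 i;

        for (i = 0; i < entries; i++) {
                if (!(*gr_intr & tbl[i].pending_mask))
                        continue;
                need_reset |= tbl[i].handle(g, isr_data);
                gk20a_writel(g, gr_intr_r(), tbl[i].reset_val);
                *gr_intr &= ~tbl[i].pending_mask;
        }

        return need_reset;
}

/* usage sketch covering a few of the simple error interrupts above */
static int gr_gk20a_isr_errors_sketch(struct gk20a *g, u32 *gr_intr,
                                      struct gr_isr_data *isr_data)
{
        const struct gr_intr_handler_sketch tbl[] = {
                { gr_intr_illegal_method_pending_f(),
                  gr_intr_illegal_method_reset_f(),
                  gk20a_gr_handle_illegal_method },
                { gr_intr_illegal_class_pending_f(),
                  gr_intr_illegal_class_reset_f(),
                  gk20a_gr_handle_illegal_class },
                { gr_intr_class_error_pending_f(),
                  gr_intr_class_error_reset_f(),
                  gk20a_gr_handle_class_error },
        };

        return gr_gk20a_dispatch_intr_sketch(g, gr_intr, isr_data,
                                             tbl, ARRAY_SIZE(tbl));
}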
5560
5561 int gk20a_gr_nonstall_isr(struct gk20a *g)
5562 {
5563         u32 gr_intr = gk20a_readl(g, gr_intr_nonstall_r());
5564         u32 clear_intr = 0;
5565
5566         gk20a_dbg(gpu_dbg_intr, "pgraph nonstall intr %08x", gr_intr);
5567
5568         if (gr_intr & gr_intr_nonstall_trap_pending_f()) {
5569                 gk20a_channel_semaphore_wakeup(g);
5570                 clear_intr |= gr_intr_nonstall_trap_pending_f();
5571         }
5572
5573         gk20a_writel(g, gr_intr_nonstall_r(), clear_intr);
5574
5575         return 0;
5576 }
5577
5578 int gr_gk20a_fecs_get_reglist_img_size(struct gk20a *g, u32 *size)
5579 {
5580         BUG_ON(size == NULL);
5581         return gr_gk20a_submit_fecs_method_op(g,
5582                    (struct fecs_method_op_gk20a) {
5583                            .mailbox.id = 0,
5584                            .mailbox.data = 0,
5585                            .mailbox.clr = ~0,
5586                            .method.data = 1,
5587                            .method.addr = gr_fecs_method_push_adr_discover_reglist_image_size_v(),
5588                            .mailbox.ret = size,
5589                            .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL,
5590                            .mailbox.ok = 0,
5591                            .cond.fail = GR_IS_UCODE_OP_SKIP,
5592                            .mailbox.fail = 0});
5593 }
5594
5595 int gr_gk20a_fecs_set_reglist_bind_inst(struct gk20a *g, phys_addr_t addr)
5596 {
5597         return gr_gk20a_submit_fecs_method_op(g,
5598                    (struct fecs_method_op_gk20a){
5599                            .mailbox.id = 4,
5600                            .mailbox.data = (gr_fecs_current_ctx_ptr_f(addr >> 12) |
5601                                             gr_fecs_current_ctx_valid_f(1) |
5602                                             gr_fecs_current_ctx_target_vid_mem_f()),
5603                            .mailbox.clr = ~0,
5604                            .method.data = 1,
5605                            .method.addr = gr_fecs_method_push_adr_set_reglist_bind_instance_v(),
5606                            .mailbox.ret = NULL,
5607                            .cond.ok = GR_IS_UCODE_OP_EQUAL,
5608                            .mailbox.ok = 1,
5609                            .cond.fail = GR_IS_UCODE_OP_SKIP,
5610                            .mailbox.fail = 0});
5611 }
5612
5613 int gr_gk20a_fecs_set_reglist_virtual_addr(struct gk20a *g, u64 pmu_va)
5614 {
5615         return gr_gk20a_submit_fecs_method_op(g,
5616                    (struct fecs_method_op_gk20a) {
5617                            .mailbox.id = 4,
5618                            .mailbox.data = u64_lo32(pmu_va >> 8),
5619                            .mailbox.clr = ~0,
5620                            .method.data = 1,
5621                            .method.addr = gr_fecs_method_push_adr_set_reglist_virtual_address_v(),
5622                            .mailbox.ret = NULL,
5623                            .cond.ok = GR_IS_UCODE_OP_EQUAL,
5624                            .mailbox.ok = 1,
5625                            .cond.fail = GR_IS_UCODE_OP_SKIP,
5626                            .mailbox.fail = 0});
5627 }
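
/*
 * Editor's note: illustrative sketch, not part of the original driver.
 * A caller typically combines the three reglist helpers above: query the
 * image size, bind the instance block that FECS should use, then hand the
 * PMU virtual address of the reglist buffer to FECS.  Buffer allocation is
 * elided; "reglist_inst_pa" and "reglist_pmu_va" are assumed to be
 * provided by the (hypothetical) caller.
 */
static int gr_gk20a_setup_reglist_sketch(struct gk20a *g,
                                         phys_addr_t reglist_inst_pa,
                                         u64 reglist_pmu_va)
{
        u32 size = 0;
        int err;

        err = gr_gk20a_fecs_get_reglist_img_size(g, &size);
        if (err)
                return err;

        /* a real caller would allocate a buffer of "size" bytes here */

        err = gr_gk20a_fecs_set_reglist_bind_inst(g, reglist_inst_pa);
        if (err)
                return err;

        return gr_gk20a_fecs_set_reglist_virtual_addr(g, reglist_pmu_va);
}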
5628
5629 int gk20a_gr_suspend(struct gk20a *g)
5630 {
5631         unsigned long end_jiffies = jiffies +
5632                 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
5633         u32 ret = 0;
5634
5635         gk20a_dbg_fn("");
5636
5637         ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
5638         if (ret)
5639                 return ret;
5640
5641         gk20a_writel(g, gr_gpfifo_ctl_r(),
5642                 gr_gpfifo_ctl_access_disabled_f());
5643
5644         /* disable gr intr */
5645         gk20a_writel(g, gr_intr_r(), 0);
5646         gk20a_writel(g, gr_intr_en_r(), 0);
5647
5648         /* disable all exceptions */
5649         gk20a_writel(g, gr_exception_r(), 0);
5650         gk20a_writel(g, gr_exception_en_r(), 0);
5651         gk20a_writel(g, gr_exception1_r(), 0);
5652         gk20a_writel(g, gr_exception1_en_r(), 0);
5653         gk20a_writel(g, gr_exception2_r(), 0);
5654         gk20a_writel(g, gr_exception2_en_r(), 0);
5655
5656         gk20a_gr_flush_channel_tlb(&g->gr);
5657
5658         g->gr.initialized = false;
5659
5660         gk20a_dbg_fn("done");
5661         return ret;
5662 }
5663
5664 static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
5665                                                u32 addr,
5666                                                bool is_quad, u32 quad,
5667                                                u32 *context_buffer,
5668                                                u32 context_buffer_size,
5669                                                u32 *priv_offset);
5670
5671 /* This function will decode a priv address and return the partition type and numbers. */
5672 int gr_gk20a_decode_priv_addr(struct gk20a *g, u32 addr,
5673                               int  *addr_type, /* enum ctxsw_addr_type */
5674                               u32 *gpc_num, u32 *tpc_num, u32 *ppc_num, u32 *be_num,
5675                               u32 *broadcast_flags)
5676 {
5677         u32 gpc_addr;
5678         u32 ppc_address;
5679         u32 ppc_broadcast_addr;
5680
5681         gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
5682
5683         /* setup defaults */
5684         ppc_address = 0;
5685         ppc_broadcast_addr = 0;
5686         *addr_type = CTXSW_ADDR_TYPE_SYS;
5687         *broadcast_flags = PRI_BROADCAST_FLAGS_NONE;
5688         *gpc_num = 0;
5689         *tpc_num = 0;
5690         *ppc_num = 0;
5691         *be_num  = 0;
5692
5693         if (pri_is_gpc_addr(addr)) {
5694                 *addr_type = CTXSW_ADDR_TYPE_GPC;
5695                 gpc_addr = pri_gpccs_addr_mask(addr);
5696                 if (pri_is_gpc_addr_shared(addr)) {
5697                         *addr_type = CTXSW_ADDR_TYPE_GPC;
5698                         *broadcast_flags |= PRI_BROADCAST_FLAGS_GPC;
5699                 } else
5700                         *gpc_num = pri_get_gpc_num(addr);
5701
5702                 if (pri_is_tpc_addr(gpc_addr)) {
5703                         *addr_type = CTXSW_ADDR_TYPE_TPC;
5704                         if (pri_is_tpc_addr_shared(gpc_addr)) {
5705                                 *broadcast_flags |= PRI_BROADCAST_FLAGS_TPC;
5706                                 return 0;
5707                         }
5708                         *tpc_num = pri_get_tpc_num(gpc_addr);
5709                 }
5710                 return 0;
5711         } else if (pri_is_be_addr(addr)) {
5712                 *addr_type = CTXSW_ADDR_TYPE_BE;
5713                 if (pri_is_be_addr_shared(addr)) {
5714                         *broadcast_flags |= PRI_BROADCAST_FLAGS_BE;
5715                         return 0;
5716                 }
5717                 *be_num = pri_get_be_num(addr);
5718                 return 0;
5719         } else {
5720                 *addr_type = CTXSW_ADDR_TYPE_SYS;
5721                 return 0;
5722         }
5723         /* note: PPC addresses are not decoded to a PPC type here */
5724
5725         /*NOTREACHED*/
5726         return -EINVAL;
5727 }
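
/*
 * Editor's note: illustrative sketch, not part of the original driver.
 * A minimal example of how a caller interprets the decode above: a shared
 * (broadcast) GPC/TPC/BE address sets a broadcast flag rather than a unit
 * number, and must be expanded to unicast addresses before the context
 * buffer can be indexed.
 */
static bool gr_gk20a_priv_addr_is_broadcast_sketch(struct gk20a *g, u32 addr)
{
        int addr_type;
        u32 gpc_num, tpc_num, ppc_num, be_num;
        u32 broadcast_flags;

        if (gr_gk20a_decode_priv_addr(g, addr, &addr_type,
                                      &gpc_num, &tpc_num, &ppc_num, &be_num,
                                      &broadcast_flags))
                return false;

        return broadcast_flags != PRI_BROADCAST_FLAGS_NONE;
}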
5728
5729 static int gr_gk20a_split_ppc_broadcast_addr(struct gk20a *g, u32 addr,
5730                                       u32 gpc_num,
5731                                       u32 *priv_addr_table, u32 *t)
5732 {
5733         u32 ppc_num;
5734
5735         gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
5736
5737         for (ppc_num = 0; ppc_num < g->gr.pe_count_per_gpc; ppc_num++)
5738                 priv_addr_table[(*t)++] = pri_ppc_addr(pri_ppccs_addr_mask(addr),
5739                                                        gpc_num, ppc_num);
5740
5741         return 0;
5742 }
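
/*
 * Editor's note: illustrative sketch, not part of the original driver.
 * Expanding a PPC broadcast address for GPC 0 into per-PPC unicast entries
 * with the helper above.  The fixed table bound is a hypothetical
 * simplification (it assumes pe_count_per_gpc <= 8); the real callers size
 * the table from the floorsweeping configuration in struct gr_gk20a.
 */
static void gr_gk20a_expand_ppc_sketch(struct gk20a *g, u32 broadcast_addr)
{
        u32 table[8];
        u32 count = 0;
        u32 i;

        gr_gk20a_split_ppc_broadcast_addr(g, broadcast_addr, 0 /* gpc 0 */,
                                          table, &count);

        for (i = 0; i < count; i++)
                gk20a_dbg(gpu_dbg_gpu_dbg, "ppc unicast addr 0x%x", table[i]);
}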
5743
5744 /*
5745  * The context buffer is indexed using BE broadcast addresses and GPC/TPC
5746  * unicast addresses. This function will convert a BE unicast address to a BE
5747  * broadcast address and split a GPC/TPC broadcast address into a table of
5748  * GPC/TPC addresses.  The addresses generated by this function can be
5749  * successfully processed by gr_gk20a_find_priv_offset_in_buffer
5750  */
5751 static int gr_gk20a_create_priv_addr_table(struct gk20a *g,
5752                                            u32 addr,
5753                                            u32 *priv_addr_table,
5754                                            u32 *num_registers)
5755 {
5756         int addr_type; /*enum ctxsw_addr_type */
5757         u32 gpc_num, tpc_num, ppc_num, be_num;
5758         u32 broadcast_flags;
5759         u32 t;
5760         int err;
5761
5762         t = 0;
5763         *num_registers = 0;
5764
5765         gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
5766
5767         err = gr_gk20a_decode_priv_addr(g, addr, &addr_type,
5768                                         &gpc_num, &tpc_num, &ppc_num, &be_num,
5769                                         &broadcast_flags);
5770         gk20a_dbg(gpu_dbg_gpu_dbg, "addr_type = %d", addr_type);
5771         if (err)
5772                 return err;
5773
5774         if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
5775             (addr_type == CTXSW_ADDR_TYPE_BE)) {
5776                 /* The BE broadcast registers are included in the compressed PRI
5777                  * table. Convert a BE unicast address to a broadcast address
5778