2 * drivers/video/tegra/host/gk20a/gr_gk20a.c
6 * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
17 * You should have received a copy of the GNU General Public License along with
18 * this program; if not, write to the Free Software Foundation, Inc.,
19 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
22 #include <linux/delay.h> /* for udelay */
23 #include <linux/mm.h> /* for totalram_pages */
24 #include <linux/scatterlist.h>
25 #include <linux/nvmap.h>
26 #include <linux/tegra-soc.h>
27 #include <linux/nvhost_dbg_gpu_ioctl.h>
32 #include "gr_ctx_gk20a.h"
34 #include "hw_ccsr_gk20a.h"
35 #include "hw_ctxsw_prog_gk20a.h"
36 #include "hw_fifo_gk20a.h"
37 #include "hw_gr_gk20a.h"
38 #include "hw_mc_gk20a.h"
39 #include "hw_ram_gk20a.h"
40 #include "hw_pri_ringmaster_gk20a.h"
41 #include "hw_pri_ringstation_sys_gk20a.h"
42 #include "hw_pri_ringstation_gpc_gk20a.h"
43 #include "hw_pri_ringstation_fbp_gk20a.h"
44 #include "hw_proj_gk20a.h"
45 #include "hw_top_gk20a.h"
46 #include "hw_ltc_gk20a.h"
47 #include "hw_fb_gk20a.h"
48 #include "hw_therm_gk20a.h"
49 #include "hw_pbdma_gk20a.h"
50 #include "chip_support.h"
51 #include "nvhost_memmgr.h"
52 #include "gk20a_gating_reglist.h"
53 #include "gr_pri_gk20a.h"
54 #include "regops_gk20a.h"
55 #include "dbg_gpu_gk20a.h"
57 #define BLK_SIZE (256)
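/* BLK_SIZE: 256-byte alignment granularity used below when laying out the
 * ctxsw ucode boot/code/data segments (see gr_gk20a_init_ctxsw_ucode_segment()). */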
59 struct gk20a_ctxsw_bootloader_desc g_fecs_bootloader_desc = {
60 /* .bootLoaderStartOffset = */ 0x0,
61 /* .bootLoaderSize = */ 0x85,
62 /* .bootLoaderImemOffset = */ 0x4f00,
63 /* .bootLoaderEntryPoint = */ 0x4f00,
66 u32 g_fecs_bootloader_image[] = {
67 /* 0x0000 */ 0x001000d0, 0x0004fe00, 0x107ea4bd, 0x02f8004f, 0x00000089,
68 0x12f99dbf, 0x98089a98, 0xdf940991,
69 /* 0x0020 */ 0x08de940c, 0xfd049098, 0x9b9805ef, 0x05edfd06, 0x98059c98,
70 0x9f98079d, 0x00ebfe03, 0x00000089,
71 /* 0x0040 */ 0xfe019998, 0x94bd0096, 0x004f543e, 0xb80499fa, 0x00010099,
72 0x08f49fa6, 0xfe07f8f6, 0xc7fe00d6,
73 /* 0x0060 */ 0x3ef4bd00, 0x8e004f76, 0xbc060000, 0xf9fa90fe, 0x00ffb805,
74 0xfba60001, 0xf8ef08f4, 0xf91bb203,
75 /* 0x0080 */ 0xfba4bd05, 0x00000011, 0x00000000, 0x00000000, 0x00000000,
76 0x00000000, 0x00000000, 0x00000000,
77 /* 0x00a0 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
78 0x00000000, 0x00000000, 0x00000000,
79 /* 0x00c0 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
80 0x00000000, 0x00000000, 0x00000000,
81 /* 0x00e0 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
82 0x00000000, 0x00000000, 0x00000000,
85 struct gk20a_ctxsw_bootloader_desc g_gpccs_bootloader_desc = {
86 /* .bootLoaderStartOffset = */ 0x0,
87 /* .bootLoaderSize = */ 0x85,
88 /* .bootLoaderImemOffset = */ 0x2700,
89 /* .bootLoaderEntryPoint = */ 0x2700,
92 u32 g_gpccs_bootloader_image[] = {
93 /* 0x0000 */ 0x000800d0, 0x0004fe00, 0x107ea4bd, 0x02f80027, 0x00000089,
94 0x12f99dbf, 0x98089a98, 0xdf940991,
95 /* 0x0020 */ 0x08de940c, 0xfd049098, 0x9b9805ef, 0x05edfd06, 0x98059c98,
96 0x9f98079d, 0x00ebfe03, 0x00000089,
97 /* 0x0040 */ 0xfe019998, 0x94bd0096, 0x0027543e, 0xb80499fa, 0x00010099,
98 0x08f49fa6, 0xfe07f8f6, 0xc7fe00d6,
99 /* 0x0060 */ 0x3ef4bd00, 0x8e002776, 0xbc060000, 0xf9fa90fe, 0x00ffb805,
100 0xfba60001, 0xf8ef08f4, 0xf91bb203,
101 /* 0x0080 */ 0xfba4bd05, 0x00000011, 0x00000000, 0x00000000, 0x00000000,
102 0x00000000, 0x00000000, 0x00000000,
103 /* 0x00a0 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
104 0x00000000, 0x00000000, 0x00000000,
105 /* 0x00c0 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
106 0x00000000, 0x00000000, 0x00000000,
107 /* 0x00e0 */ 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
108 0x00000000, 0x00000000, 0x00000000,
111 static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va);
112 static int gr_gk20a_ctx_patch_write(struct gk20a *g, struct channel_ctx_gk20a *ch_ctx,
113 u32 addr, u32 data, bool patch);
115 /* global ctx buffer */
116 static int gr_gk20a_alloc_global_ctx_buffers(struct gk20a *g);
117 static void gr_gk20a_free_global_ctx_buffers(struct gk20a *g);
118 static int gr_gk20a_map_global_ctx_buffers(struct gk20a *g,
119 struct channel_gk20a *c);
120 static void gr_gk20a_unmap_global_ctx_buffers(struct channel_gk20a *c);
122 /* channel gr ctx buffer */
123 static int gr_gk20a_alloc_channel_gr_ctx(struct gk20a *g,
124 struct channel_gk20a *c);
125 static void gr_gk20a_free_channel_gr_ctx(struct channel_gk20a *c);
127 /* channel patch ctx buffer */
128 static int gr_gk20a_alloc_channel_patch_ctx(struct gk20a *g,
129 struct channel_gk20a *c);
130 static void gr_gk20a_free_channel_patch_ctx(struct channel_gk20a *c);
132 /* golden ctx image */
133 static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
134 struct channel_gk20a *c);
135 static int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
136 struct channel_gk20a *c);
138 void gk20a_fecs_dump_falcon_stats(struct gk20a *g)
142 nvhost_err(dev_from_gk20a(g), "gr_fecs_os_r : %d",
143 gk20a_readl(g, gr_fecs_os_r()));
144 nvhost_err(dev_from_gk20a(g), "gr_fecs_cpuctl_r : 0x%x",
145 gk20a_readl(g, gr_fecs_cpuctl_r()));
146 nvhost_err(dev_from_gk20a(g), "gr_fecs_idlestate_r : 0x%x",
147 gk20a_readl(g, gr_fecs_idlestate_r()));
148 nvhost_err(dev_from_gk20a(g), "gr_fecs_mailbox0_r : 0x%x",
149 gk20a_readl(g, gr_fecs_mailbox0_r()));
150 nvhost_err(dev_from_gk20a(g), "gr_fecs_mailbox1_r : 0x%x",
151 gk20a_readl(g, gr_fecs_mailbox1_r()));
152 nvhost_err(dev_from_gk20a(g), "gr_fecs_irqstat_r : 0x%x",
153 gk20a_readl(g, gr_fecs_irqstat_r()));
154 nvhost_err(dev_from_gk20a(g), "gr_fecs_irqmode_r : 0x%x",
155 gk20a_readl(g, gr_fecs_irqmode_r()));
156 nvhost_err(dev_from_gk20a(g), "gr_fecs_irqmask_r : 0x%x",
157 gk20a_readl(g, gr_fecs_irqmask_r()));
158 nvhost_err(dev_from_gk20a(g), "gr_fecs_irqdest_r : 0x%x",
159 gk20a_readl(g, gr_fecs_irqdest_r()));
160 nvhost_err(dev_from_gk20a(g), "gr_fecs_debug1_r : 0x%x",
161 gk20a_readl(g, gr_fecs_debug1_r()));
162 nvhost_err(dev_from_gk20a(g), "gr_fecs_debuginfo_r : 0x%x",
163 gk20a_readl(g, gr_fecs_debuginfo_r()));
165 for (i = 0; i < gr_fecs_ctxsw_mailbox__size_1_v(); i++)
166 nvhost_err(dev_from_gk20a(g), "gr_fecs_ctxsw_mailbox_r(%d) : 0x%x",
167 i, gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(i)));
169 nvhost_err(dev_from_gk20a(g), "gr_fecs_engctl_r : 0x%x",
170 gk20a_readl(g, gr_fecs_engctl_r()));
171 nvhost_err(dev_from_gk20a(g), "gr_fecs_curctx_r : 0x%x",
172 gk20a_readl(g, gr_fecs_curctx_r()));
173 nvhost_err(dev_from_gk20a(g), "gr_fecs_nxtctx_r : 0x%x",
174 gk20a_readl(g, gr_fecs_nxtctx_r()));
176 gk20a_writel(g, gr_fecs_icd_cmd_r(),
177 gr_fecs_icd_cmd_opc_rreg_f() |
178 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_IMB));
179 nvhost_err(dev_from_gk20a(g), "FECS_FALCON_REG_IMB : 0x%x",
180 gk20a_readl(g, gr_fecs_icd_rdata_r()));
182 gk20a_writel(g, gr_fecs_icd_cmd_r(),
183 gr_fecs_icd_cmd_opc_rreg_f() |
184 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_DMB));
185 nvhost_err(dev_from_gk20a(g), "FECS_FALCON_REG_DMB : 0x%x",
186 gk20a_readl(g, gr_fecs_icd_rdata_r()));
188 gk20a_writel(g, gr_fecs_icd_cmd_r(),
189 gr_fecs_icd_cmd_opc_rreg_f() |
190 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_CSW));
191 nvhost_err(dev_from_gk20a(g), "FECS_FALCON_REG_CSW : 0x%x",
192 gk20a_readl(g, gr_fecs_icd_rdata_r()));
194 gk20a_writel(g, gr_fecs_icd_cmd_r(),
195 gr_fecs_icd_cmd_opc_rreg_f() |
196 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_CTX));
197 nvhost_err(dev_from_gk20a(g), "FECS_FALCON_REG_CTX : 0x%x",
198 gk20a_readl(g, gr_fecs_icd_rdata_r()));
200 gk20a_writel(g, gr_fecs_icd_cmd_r(),
201 gr_fecs_icd_cmd_opc_rreg_f() |
202 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_EXCI));
203 nvhost_err(dev_from_gk20a(g), "FECS_FALCON_REG_EXCI : 0x%x",
204 gk20a_readl(g, gr_fecs_icd_rdata_r()));
206 for (i = 0; i < 4; i++) {
207 gk20a_writel(g, gr_fecs_icd_cmd_r(),
208 gr_fecs_icd_cmd_opc_rreg_f() |
209 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_PC));
210 nvhost_err(dev_from_gk20a(g), "FECS_FALCON_REG_PC : 0x%x",
211 gk20a_readl(g, gr_fecs_icd_rdata_r()));
213 gk20a_writel(g, gr_fecs_icd_cmd_r(),
214 gr_fecs_icd_cmd_opc_rreg_f() |
215 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_SP));
216 nvhost_err(dev_from_gk20a(g), "FECS_FALCON_REG_SP : 0x%x",
217 gk20a_readl(g, gr_fecs_icd_rdata_r()));
221 static int gr_gk20a_wait_idle(struct gk20a *g, unsigned long end_jiffies,
224 u32 delay = expect_delay;
232 /* fmodel: host gets fifo_engine_status(gr) from gr
233 only when gr_status is read */
234 gk20a_readl(g, gr_status_r());
236 gr_enabled = gk20a_readl(g, mc_enable_r()) &
237 mc_enable_pgraph_enabled_f();
239 ctxsw_active = gk20a_readl(g,
240 fifo_engine_status_r(ENGINE_GR_GK20A)) &
241 fifo_engine_status_ctxsw_in_progress_f();
243 gr_busy = gk20a_readl(g, gr_engine_status_r()) &
244 gr_engine_status_value_busy_f();
246 if (!gr_enabled || (!gr_busy && !ctxsw_active)) {
247 nvhost_dbg_fn("done");
251 usleep_range(delay, delay * 2);
252 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
254 } while (time_before(jiffies, end_jiffies));
256 nvhost_err(dev_from_gk20a(g),
257 "timeout, ctxsw busy : %d, gr busy : %d",
258 ctxsw_active, gr_busy);
263 static int gr_gk20a_ctx_reset(struct gk20a *g, u32 rst_mask)
265 u32 delay = GR_IDLE_CHECK_DEFAULT;
266 unsigned long end_jiffies = jiffies +
267 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
272 /* Force clocks on */
273 gk20a_writel(g, gr_fe_pwr_mode_r(),
274 gr_fe_pwr_mode_req_send_f() |
275 gr_fe_pwr_mode_mode_force_on_f());
277 /* Wait for the clocks to indicate that they are on */
279 reg = gk20a_readl(g, gr_fe_pwr_mode_r());
281 if (gr_fe_pwr_mode_req_v(reg) == gr_fe_pwr_mode_req_done_v())
284 usleep_range(delay, delay * 2);
285 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
287 } while (time_before(jiffies, end_jiffies));
289 if (!time_before(jiffies, end_jiffies)) {
290 nvhost_err(dev_from_gk20a(g),
291 "failed to force the clocks on\n");
296 gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(), rst_mask);
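/* The next two writes assert and then deassert the sys/gpc/be context resets
 * while keeping the halts and engine resets deasserted. */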
298 gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(),
299 gr_fecs_ctxsw_reset_ctl_sys_halt_disabled_f() |
300 gr_fecs_ctxsw_reset_ctl_gpc_halt_disabled_f() |
301 gr_fecs_ctxsw_reset_ctl_be_halt_disabled_f() |
302 gr_fecs_ctxsw_reset_ctl_sys_engine_reset_disabled_f() |
303 gr_fecs_ctxsw_reset_ctl_gpc_engine_reset_disabled_f() |
304 gr_fecs_ctxsw_reset_ctl_be_engine_reset_disabled_f() |
305 gr_fecs_ctxsw_reset_ctl_sys_context_reset_enabled_f() |
306 gr_fecs_ctxsw_reset_ctl_gpc_context_reset_enabled_f() |
307 gr_fecs_ctxsw_reset_ctl_be_context_reset_enabled_f());
310 /* Delay for > 10 nvclks after writing reset. */
311 gk20a_readl(g, gr_fecs_ctxsw_reset_ctl_r());
313 gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(),
314 gr_fecs_ctxsw_reset_ctl_sys_halt_disabled_f() |
315 gr_fecs_ctxsw_reset_ctl_gpc_halt_disabled_f() |
316 gr_fecs_ctxsw_reset_ctl_be_halt_disabled_f() |
317 gr_fecs_ctxsw_reset_ctl_sys_engine_reset_disabled_f() |
318 gr_fecs_ctxsw_reset_ctl_gpc_engine_reset_disabled_f() |
319 gr_fecs_ctxsw_reset_ctl_be_engine_reset_disabled_f() |
320 gr_fecs_ctxsw_reset_ctl_sys_context_reset_disabled_f() |
321 gr_fecs_ctxsw_reset_ctl_gpc_context_reset_disabled_f() |
322 gr_fecs_ctxsw_reset_ctl_be_context_reset_disabled_f());
324 /* Delay for > 10 nvclks after writing reset. */
325 gk20a_readl(g, gr_fecs_ctxsw_reset_ctl_r());
327 end_jiffies = jiffies + msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
329 /* Set power mode back to auto */
330 gk20a_writel(g, gr_fe_pwr_mode_r(),
331 gr_fe_pwr_mode_req_send_f() |
332 gr_fe_pwr_mode_mode_auto_f());
334 /* Wait for the request to complete */
336 reg = gk20a_readl(g, gr_fe_pwr_mode_r());
338 if (gr_fe_pwr_mode_req_v(reg) == gr_fe_pwr_mode_req_done_v())
341 usleep_range(delay, delay * 2);
342 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
344 } while (time_before(jiffies, end_jiffies));
346 if (!time_before(jiffies, end_jiffies)) {
347 nvhost_err(dev_from_gk20a(g),
348 "failed to set power mode to auto\n");
355 static int gr_gk20a_ctx_wait_ucode(struct gk20a *g, u32 mailbox_id,
356 u32 *mailbox_ret, u32 opc_success,
357 u32 mailbox_ok, u32 opc_fail,
360 unsigned long end_jiffies = jiffies +
361 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
362 u32 delay = GR_IDLE_CHECK_DEFAULT;
363 u32 check = WAIT_UCODE_LOOP;
368 while (check == WAIT_UCODE_LOOP) {
369 if (!time_before(jiffies, end_jiffies) &&
370 tegra_platform_is_silicon())
371 check = WAIT_UCODE_TIMEOUT;
373 reg = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(mailbox_id));
378 switch (opc_success) {
379 case GR_IS_UCODE_OP_EQUAL:
380 if (reg == mailbox_ok)
381 check = WAIT_UCODE_OK;
383 case GR_IS_UCODE_OP_NOT_EQUAL:
384 if (reg != mailbox_ok)
385 check = WAIT_UCODE_OK;
387 case GR_IS_UCODE_OP_AND:
388 if (reg & mailbox_ok)
389 check = WAIT_UCODE_OK;
391 case GR_IS_UCODE_OP_LESSER:
392 if (reg < mailbox_ok)
393 check = WAIT_UCODE_OK;
395 case GR_IS_UCODE_OP_LESSER_EQUAL:
396 if (reg <= mailbox_ok)
397 check = WAIT_UCODE_OK;
399 case GR_IS_UCODE_OP_SKIP:
400 /* do no success check */
403 nvhost_err(dev_from_gk20a(g),
404 "invalid success opcode 0x%x", opc_success);
406 check = WAIT_UCODE_ERROR;
411 case GR_IS_UCODE_OP_EQUAL:
412 if (reg == mailbox_fail)
413 check = WAIT_UCODE_ERROR;
415 case GR_IS_UCODE_OP_NOT_EQUAL:
416 if (reg != mailbox_fail)
417 check = WAIT_UCODE_ERROR;
419 case GR_IS_UCODE_OP_AND:
420 if (reg & mailbox_fail)
421 check = WAIT_UCODE_ERROR;
423 case GR_IS_UCODE_OP_LESSER:
424 if (reg < mailbox_fail)
425 check = WAIT_UCODE_ERROR;
427 case GR_IS_UCODE_OP_LESSER_EQUAL:
428 if (reg <= mailbox_fail)
429 check = WAIT_UCODE_ERROR;
431 case GR_IS_UCODE_OP_SKIP:
432 /* do no check on fail */
435 nvhost_err(dev_from_gk20a(g),
436 "invalid fail opcode 0x%x", opc_fail);
437 check = WAIT_UCODE_ERROR;
441 usleep_range(delay, delay * 2);
442 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
445 if (check == WAIT_UCODE_TIMEOUT) {
446 nvhost_err(dev_from_gk20a(g),
447 "timeout waiting on ucode response");
449 } else if (check == WAIT_UCODE_ERROR) {
450 nvhost_err(dev_from_gk20a(g),
451 "ucode method failed on mailbox=%d value=0x%08x",
456 nvhost_dbg_fn("done");
460 /* The following is a less brittle way to call gr_gk20a_submit_fecs_method(...).
461 * Most, if not all, fecs method calls should be replaced with this instead. */
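/* (Field sketch, inferred from the call sites below rather than from the
 * elided definition: .method.addr/.data for the FECS method, .mailbox.id/
 * .data/.clr/.ret/.ok/.fail for the mailbox handshake, and .cond.ok/.fail
 * holding GR_IS_UCODE_OP_* compare opcodes.) */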
462 struct fecs_method_op_gk20a {
484 int gr_gk20a_submit_fecs_method_op(struct gk20a *g,
485 struct fecs_method_op_gk20a op)
487 struct gr_gk20a *gr = &g->gr;
490 mutex_lock(&gr->fecs_mutex);
492 if (op.mailbox.id != 0)
493 gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(op.mailbox.id),
496 gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0),
497 gr_fecs_ctxsw_mailbox_clear_value_f(op.mailbox.clr));
499 gk20a_writel(g, gr_fecs_method_data_r(), op.method.data);
500 gk20a_writel(g, gr_fecs_method_push_r(),
501 gr_fecs_method_push_adr_f(op.method.addr));
503 /* op.mb.id == 4 cases require waiting for completion
504 * on mb.id == 0 instead */
505 if (op.mailbox.id == 4)
508 ret = gr_gk20a_ctx_wait_ucode(g, op.mailbox.id, op.mailbox.ret,
509 op.cond.ok, op.mailbox.ok,
510 op.cond.fail, op.mailbox.fail);
512 mutex_unlock(&gr->fecs_mutex);
517 int gr_gk20a_ctrl_ctxsw(struct gk20a *g, u32 fecs_method, u32 *ret)
519 return gr_gk20a_submit_fecs_method_op(g,
520 (struct fecs_method_op_gk20a) {
521 .method.addr = fecs_method,
523 .mailbox = { .id = 1, /*sideband?*/
524 .data = ~0, .clr = ~0, .ret = ret,
525 .ok = gr_fecs_ctxsw_mailbox_value_pass_v(),
526 .fail = gr_fecs_ctxsw_mailbox_value_fail_v(), },
527 .cond.ok = GR_IS_UCODE_OP_EQUAL,
528 .cond.fail = GR_IS_UCODE_OP_EQUAL });
531 /* Stop processing (stall) context switches at FECS */
532 int gr_gk20a_disable_ctxsw(struct gk20a *g)
534 nvhost_dbg(dbg_fn | dbg_gpu_dbg, "");
535 return gr_gk20a_ctrl_ctxsw(g, gr_fecs_method_push_adr_stop_ctxsw_v(), 0);
538 /* Start processing (continue) context switches at FECS */
539 int gr_gk20a_enable_ctxsw(struct gk20a *g)
541 nvhost_dbg(dbg_fn | dbg_gpu_dbg, "");
542 return gr_gk20a_ctrl_ctxsw(g, gr_fecs_method_push_adr_start_ctxsw_v(), 0);
546 static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va)
551 void *inst_ptr = NULL;
555 /* flush gpu_va before commit */
556 gk20a_mm_fb_flush(c->g);
557 gk20a_mm_l2_flush(c->g, true);
559 inst_ptr = nvhost_memmgr_mmap(c->inst_block.mem.ref);
565 addr_lo = u64_lo32(gpu_va) >> 12;
566 addr_hi = u64_hi32(gpu_va);
568 mem_wr32(inst_ptr, ram_in_gr_wfi_target_w(),
569 ram_in_gr_cs_wfi_f() | ram_in_gr_wfi_mode_virtual_f() |
570 ram_in_gr_wfi_ptr_lo_f(addr_lo));
572 mem_wr32(inst_ptr, ram_in_gr_wfi_ptr_hi_w(),
573 ram_in_gr_wfi_ptr_hi_f(addr_hi));
575 nvhost_memmgr_munmap(c->inst_block.mem.ref, inst_ptr);
577 gk20a_mm_l2_invalidate(c->g);
583 nvhost_memmgr_munmap(c->inst_block.mem.ref, inst_ptr);
589 * Context state can be written directly, or "patched" at times.
590 * So that the code can be used in either situation it is written
591 * using a series of _ctx_patch_write(..., patch) statements.
592 * However, any necessary cpu map/unmap and gpu l2 invalidates
593 * should be minimized (to avoid doing them once per patch write).
594 * Before a sequence of these, set up with "_ctx_patch_write_begin"
595 * and close with "_ctx_patch_write_end."
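*
* Illustrative (non-authoritative) sequence, assuming a valid ch_ctx with an
* allocated patch_ctx buffer:
*
*	gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
*	gr_gk20a_ctx_patch_write(g, ch_ctx, addr, data, true);
*	... more patch writes ...
*	gr_gk20a_ctx_patch_write_end(g, ch_ctx);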
597 static int gr_gk20a_ctx_patch_write_begin(struct gk20a *g,
598 struct channel_ctx_gk20a *ch_ctx)
600 /* being defensive still... */
601 if (ch_ctx->patch_ctx.cpu_va) {
602 nvhost_err(dev_from_gk20a(g), "nested ctx patch begin?");
606 ch_ctx->patch_ctx.cpu_va =
607 nvhost_memmgr_mmap(ch_ctx->patch_ctx.mem.ref);
609 if (!ch_ctx->patch_ctx.cpu_va)
615 static int gr_gk20a_ctx_patch_write_end(struct gk20a *g,
616 struct channel_ctx_gk20a *ch_ctx)
618 /* being defensive still... */
619 if (!ch_ctx->patch_ctx.cpu_va) {
620 nvhost_err(dev_from_gk20a(g), "dangling ctx patch end?");
624 nvhost_memmgr_munmap(ch_ctx->patch_ctx.mem.ref,
625 ch_ctx->patch_ctx.cpu_va);
626 ch_ctx->patch_ctx.cpu_va = NULL;
628 gk20a_mm_l2_invalidate(g);
632 static int gr_gk20a_ctx_patch_write(struct gk20a *g,
633 struct channel_ctx_gk20a *ch_ctx,
634 u32 addr, u32 data, bool patch)
637 void *patch_ptr = NULL;
638 bool mapped_here = false;
640 BUG_ON(patch != 0 && ch_ctx == NULL);
645 /* we added an optimization prolog/epilog
646 * to get rid of unnecessary maps and l2 invals,
647 * but be defensive still... */
648 if (!ch_ctx->patch_ctx.cpu_va) {
650 nvhost_err(dev_from_gk20a(g),
651 "per-write ctx patch begin?");
652 /* yes, gr_gk20a_ctx_patch_smpc causes this one */
653 err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
660 patch_ptr = ch_ctx->patch_ctx.cpu_va;
661 patch_slot = ch_ctx->patch_ctx.data_count * 2;
663 mem_wr32(patch_ptr, patch_slot++, addr);
664 mem_wr32(patch_ptr, patch_slot++, data);
666 ch_ctx->patch_ctx.data_count++;
669 gr_gk20a_ctx_patch_write_end(g, ch_ctx);
672 gk20a_writel(g, addr, data);
677 static int gr_gk20a_fecs_ctx_bind_channel(struct gk20a *g,
678 struct channel_gk20a *c)
680 u32 inst_base_ptr = u64_lo32(sg_phys(c->inst_block.mem.sgt->sgl)
681 >> ram_in_base_shift_v());
684 nvhost_dbg_info("bind channel %d inst ptr 0x%08x",
685 c->hw_chid, inst_base_ptr);
687 ret = gr_gk20a_submit_fecs_method_op(g,
688 (struct fecs_method_op_gk20a) {
689 .method.addr = gr_fecs_method_push_adr_bind_pointer_v(),
690 .method.data = (gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
691 gr_fecs_current_ctx_target_vid_mem_f() |
692 gr_fecs_current_ctx_valid_f(1)),
693 .mailbox = { .id = 0, .data = 0,
698 .cond.ok = GR_IS_UCODE_OP_AND,
699 .cond.fail = GR_IS_UCODE_OP_AND});
701 nvhost_err(dev_from_gk20a(g),
702 "bind channel instance failed");
707 static int gr_gk20a_ctx_zcull_setup(struct gk20a *g, struct channel_gk20a *c,
710 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
711 struct fifo_gk20a *f = &g->fifo;
712 struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
713 u32 va_lo, va_hi, va;
715 void *ctx_ptr = NULL;
719 ctx_ptr = nvhost_memmgr_mmap(ch_ctx->gr_ctx.mem.ref);
723 if (ch_ctx->zcull_ctx.gpu_va == 0 &&
724 ch_ctx->zcull_ctx.ctx_sw_mode ==
725 ctxsw_prog_main_image_zcull_mode_separate_buffer_v()) {
730 va_lo = u64_lo32(ch_ctx->zcull_ctx.gpu_va);
731 va_hi = u64_hi32(ch_ctx->zcull_ctx.gpu_va);
732 va = ((va_lo >> 8) & 0x00FFFFFF) | ((va_hi << 24) & 0xFF000000);
735 ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
737 nvhost_err(dev_from_gk20a(g),
738 "failed to disable gr engine activity\n");
743 /* Channel gr_ctx buffer is gpu cacheable.
744 Flush and invalidate before cpu update. */
745 gk20a_mm_fb_flush(g);
746 gk20a_mm_l2_flush(g, true);
748 mem_wr32(ctx_ptr + ctxsw_prog_main_image_zcull_o(), 0,
749 ch_ctx->zcull_ctx.ctx_sw_mode);
751 mem_wr32(ctx_ptr + ctxsw_prog_main_image_zcull_ptr_o(), 0, va);
754 ret = gk20a_fifo_enable_engine_activity(g, gr_info);
756 nvhost_err(dev_from_gk20a(g),
757 "failed to enable gr engine activity\n");
761 gk20a_mm_l2_invalidate(g);
764 nvhost_memmgr_munmap(ch_ctx->gr_ctx.mem.ref, ctx_ptr);
769 static int gr_gk20a_commit_global_cb_manager(struct gk20a *g,
770 struct channel_gk20a *c, bool patch)
772 struct gr_gk20a *gr = &g->gr;
773 struct channel_ctx_gk20a *ch_ctx = NULL;
774 u32 attrib_offset_in_chunk = 0;
775 u32 alpha_offset_in_chunk = 0;
776 u32 pd_ab_max_output;
777 u32 gpc_index, ppc_index;
779 u32 cbm_cfg_size1, cbm_cfg_size2;
786 err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
791 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_tga_constraintlogic_r(),
792 gr_ds_tga_constraintlogic_beta_cbsize_f(gr->attrib_cb_default_size) |
793 gr_ds_tga_constraintlogic_alpha_cbsize_f(gr->alpha_cb_default_size),
796 pd_ab_max_output = (gr->alpha_cb_default_size *
797 gr_gpc0_ppc0_cbm_cfg_size_granularity_v()) /
798 gr_pd_ab_dist_cfg1_max_output_granularity_v();
800 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg1_r(),
801 gr_pd_ab_dist_cfg1_max_output_f(pd_ab_max_output) |
802 gr_pd_ab_dist_cfg1_max_batches_init_f(), patch);
804 alpha_offset_in_chunk = attrib_offset_in_chunk +
805 gr->tpc_count * gr->attrib_cb_size;
807 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
808 temp = proj_gpc_stride_v() * gpc_index;
809 for (ppc_index = 0; ppc_index < gr->gpc_ppc_count[gpc_index];
811 cbm_cfg_size1 = gr->attrib_cb_default_size *
812 gr->pes_tpc_count[ppc_index][gpc_index];
813 cbm_cfg_size2 = gr->alpha_cb_default_size *
814 gr->pes_tpc_count[ppc_index][gpc_index];
816 gr_gk20a_ctx_patch_write(g, ch_ctx,
817 gr_gpc0_ppc0_cbm_cfg_r() + temp +
818 proj_ppc_in_gpc_stride_v() * ppc_index,
819 gr_gpc0_ppc0_cbm_cfg_timeslice_mode_f(gr->timeslice_mode) |
820 gr_gpc0_ppc0_cbm_cfg_start_offset_f(attrib_offset_in_chunk) |
821 gr_gpc0_ppc0_cbm_cfg_size_f(cbm_cfg_size1), patch);
823 attrib_offset_in_chunk += gr->attrib_cb_size *
824 gr->pes_tpc_count[ppc_index][gpc_index];
826 gr_gk20a_ctx_patch_write(g, ch_ctx,
827 gr_gpc0_ppc0_cbm_cfg2_r() + temp +
828 proj_ppc_in_gpc_stride_v() * ppc_index,
829 gr_gpc0_ppc0_cbm_cfg2_start_offset_f(alpha_offset_in_chunk) |
830 gr_gpc0_ppc0_cbm_cfg2_size_f(cbm_cfg_size2), patch);
832 alpha_offset_in_chunk += gr->alpha_cb_size *
833 gr->pes_tpc_count[ppc_index][gpc_index];
838 gr_gk20a_ctx_patch_write_end(g, ch_ctx);
843 static int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g,
844 struct channel_gk20a *c, bool patch)
846 struct gr_gk20a *gr = &g->gr;
847 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
855 err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
860 /* global pagepool buffer */
861 addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) >>
862 gr_scc_pagepool_base_addr_39_8_align_bits_v()) |
863 (u64_hi32(ch_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) <<
864 (32 - gr_scc_pagepool_base_addr_39_8_align_bits_v()));
866 size = gr->global_ctx_buffer[PAGEPOOL].size /
867 gr_scc_pagepool_total_pages_byte_granularity_v();
869 if (size == gr_scc_pagepool_total_pages_hwmax_value_v())
870 size = gr_scc_pagepool_total_pages_hwmax_v();
872 nvhost_dbg_info("pagepool buffer addr : 0x%016llx, size : %d",
875 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_pagepool_base_r(),
876 gr_scc_pagepool_base_addr_39_8_f(addr), patch);
878 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_pagepool_r(),
879 gr_scc_pagepool_total_pages_f(size) |
880 gr_scc_pagepool_valid_true_f(), patch);
882 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gcc_pagepool_base_r(),
883 gr_gpcs_gcc_pagepool_base_addr_39_8_f(addr), patch);
885 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gcc_pagepool_r(),
886 gr_gpcs_gcc_pagepool_total_pages_f(size), patch);
888 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_pagepool_r(),
889 gr_pd_pagepool_total_pages_f(size) |
890 gr_pd_pagepool_valid_true_f(), patch);
892 /* global bundle cb */
893 addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[CIRCULAR_VA]) >>
894 gr_scc_bundle_cb_base_addr_39_8_align_bits_v()) |
895 (u64_hi32(ch_ctx->global_ctx_buffer_va[CIRCULAR_VA]) <<
896 (32 - gr_scc_bundle_cb_base_addr_39_8_align_bits_v()));
898 size = gr->bundle_cb_default_size;
900 nvhost_dbg_info("bundle cb addr : 0x%016llx, size : %d",
903 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_bundle_cb_base_r(),
904 gr_scc_bundle_cb_base_addr_39_8_f(addr), patch);
906 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_bundle_cb_size_r(),
907 gr_scc_bundle_cb_size_div_256b_f(size) |
908 gr_scc_bundle_cb_size_valid_true_f(), patch);
910 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_bundle_cb_base_r(),
911 gr_gpcs_setup_bundle_cb_base_addr_39_8_f(addr), patch);
913 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_bundle_cb_size_r(),
914 gr_gpcs_setup_bundle_cb_size_div_256b_f(size) |
915 gr_gpcs_setup_bundle_cb_size_valid_true_f(), patch);
917 /* data for state_limit */
918 data = (gr->bundle_cb_default_size *
919 gr_scc_bundle_cb_size_div_256b_byte_granularity_v()) /
920 gr_pd_ab_dist_cfg2_state_limit_scc_bundle_granularity_v();
922 data = min_t(u32, data, gr->min_gpm_fifo_depth);
924 nvhost_dbg_info("bundle cb token limit : %d, state limit : %d",
925 gr->bundle_cb_token_limit, data);
927 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg2_r(),
928 gr_pd_ab_dist_cfg2_token_limit_f(gr->bundle_cb_token_limit) |
929 gr_pd_ab_dist_cfg2_state_limit_f(data), patch);
931 /* global attrib cb */
932 addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[ATTRIBUTE_VA]) >>
933 gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v()) |
934 (u64_hi32(ch_ctx->global_ctx_buffer_va[ATTRIBUTE_VA]) <<
935 (32 - gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v()));
937 nvhost_dbg_info("attrib cb addr : 0x%016llx", addr);
939 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_attrib_cb_base_r(),
940 gr_gpcs_setup_attrib_cb_base_addr_39_12_f(addr) |
941 gr_gpcs_setup_attrib_cb_base_valid_true_f(), patch);
943 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pe_pin_cb_global_base_addr_r(),
944 gr_gpcs_tpcs_pe_pin_cb_global_base_addr_v_f(addr) |
945 gr_gpcs_tpcs_pe_pin_cb_global_base_addr_valid_true_f(), patch);
948 gr_gk20a_ctx_patch_write_end(g, ch_ctx);
953 static int gr_gk20a_commit_global_timeslice(struct gk20a *g, struct channel_gk20a *c, bool patch)
955 struct gr_gk20a *gr = &g->gr;
956 struct channel_ctx_gk20a *ch_ctx = NULL;
966 gpm_pd_cfg = gk20a_readl(g, gr_gpcs_gpm_pd_cfg_r());
967 pd_ab_dist_cfg0 = gk20a_readl(g, gr_pd_ab_dist_cfg0_r());
968 ds_debug = gk20a_readl(g, gr_ds_debug_r());
969 mpc_vtg_debug = gk20a_readl(g, gr_gpcs_tpcs_mpc_vtg_debug_r());
974 err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
979 if (gr->timeslice_mode == gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v()) {
980 pe_vaf = gk20a_readl(g, gr_gpcs_tpcs_pe_vaf_r());
981 pe_vsc_vpc = gk20a_readl(g, gr_gpcs_tpcs_pes_vsc_vpc_r());
983 gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_enable_f() | gpm_pd_cfg;
984 pe_vaf = gr_gpcs_tpcs_pe_vaf_fast_mode_switch_true_f() | pe_vaf;
985 pe_vsc_vpc = gr_gpcs_tpcs_pes_vsc_vpc_fast_mode_switch_true_f() | pe_vsc_vpc;
986 pd_ab_dist_cfg0 = gr_pd_ab_dist_cfg0_timeslice_enable_en_f() | pd_ab_dist_cfg0;
987 ds_debug = gr_ds_debug_timeslice_mode_enable_f() | ds_debug;
988 mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_enabled_f() | mpc_vtg_debug;
990 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, patch);
991 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pe_vaf_r(), pe_vaf, patch);
992 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pes_vsc_vpc_r(), pe_vsc_vpc, patch);
993 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, patch);
994 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_debug_r(), ds_debug, patch);
995 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, patch);
997 gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_disable_f() | gpm_pd_cfg;
998 pd_ab_dist_cfg0 = gr_pd_ab_dist_cfg0_timeslice_enable_dis_f() | pd_ab_dist_cfg0;
999 ds_debug = gr_ds_debug_timeslice_mode_disable_f() | ds_debug;
1000 mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_disabled_f() | mpc_vtg_debug;
1002 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, patch);
1003 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, patch);
1004 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_debug_r(), ds_debug, patch);
1005 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, patch);
1009 gr_gk20a_ctx_patch_write_end(g, ch_ctx);
1014 static int gr_gk20a_setup_rop_mapping(struct gk20a *g,
1015 struct gr_gk20a *gr)
1017 u32 norm_entries, norm_shift;
1018 u32 coeff5_mod, coeff6_mod, coeff7_mod, coeff8_mod, coeff9_mod, coeff10_mod, coeff11_mod;
1019 u32 map0, map1, map2, map3, map4, map5;
1026 gk20a_writel(g, gr_crstr_map_table_cfg_r(),
1027 gr_crstr_map_table_cfg_row_offset_f(gr->map_row_offset) |
1028 gr_crstr_map_table_cfg_num_entries_f(gr->tpc_count));
1030 map0 = gr_crstr_gpc_map0_tile0_f(gr->map_tiles[0]) |
1031 gr_crstr_gpc_map0_tile1_f(gr->map_tiles[1]) |
1032 gr_crstr_gpc_map0_tile2_f(gr->map_tiles[2]) |
1033 gr_crstr_gpc_map0_tile3_f(gr->map_tiles[3]) |
1034 gr_crstr_gpc_map0_tile4_f(gr->map_tiles[4]) |
1035 gr_crstr_gpc_map0_tile5_f(gr->map_tiles[5]);
1037 map1 = gr_crstr_gpc_map1_tile6_f(gr->map_tiles[6]) |
1038 gr_crstr_gpc_map1_tile7_f(gr->map_tiles[7]) |
1039 gr_crstr_gpc_map1_tile8_f(gr->map_tiles[8]) |
1040 gr_crstr_gpc_map1_tile9_f(gr->map_tiles[9]) |
1041 gr_crstr_gpc_map1_tile10_f(gr->map_tiles[10]) |
1042 gr_crstr_gpc_map1_tile11_f(gr->map_tiles[11]);
1044 map2 = gr_crstr_gpc_map2_tile12_f(gr->map_tiles[12]) |
1045 gr_crstr_gpc_map2_tile13_f(gr->map_tiles[13]) |
1046 gr_crstr_gpc_map2_tile14_f(gr->map_tiles[14]) |
1047 gr_crstr_gpc_map2_tile15_f(gr->map_tiles[15]) |
1048 gr_crstr_gpc_map2_tile16_f(gr->map_tiles[16]) |
1049 gr_crstr_gpc_map2_tile17_f(gr->map_tiles[17]);
1051 map3 = gr_crstr_gpc_map3_tile18_f(gr->map_tiles[18]) |
1052 gr_crstr_gpc_map3_tile19_f(gr->map_tiles[19]) |
1053 gr_crstr_gpc_map3_tile20_f(gr->map_tiles[20]) |
1054 gr_crstr_gpc_map3_tile21_f(gr->map_tiles[21]) |
1055 gr_crstr_gpc_map3_tile22_f(gr->map_tiles[22]) |
1056 gr_crstr_gpc_map3_tile23_f(gr->map_tiles[23]);
1058 map4 = gr_crstr_gpc_map4_tile24_f(gr->map_tiles[24]) |
1059 gr_crstr_gpc_map4_tile25_f(gr->map_tiles[25]) |
1060 gr_crstr_gpc_map4_tile26_f(gr->map_tiles[26]) |
1061 gr_crstr_gpc_map4_tile27_f(gr->map_tiles[27]) |
1062 gr_crstr_gpc_map4_tile28_f(gr->map_tiles[28]) |
1063 gr_crstr_gpc_map4_tile29_f(gr->map_tiles[29]);
1065 map5 = gr_crstr_gpc_map5_tile30_f(gr->map_tiles[30]) |
1066 gr_crstr_gpc_map5_tile31_f(gr->map_tiles[31]) |
1067 gr_crstr_gpc_map5_tile32_f(0) |
1068 gr_crstr_gpc_map5_tile33_f(0) |
1069 gr_crstr_gpc_map5_tile34_f(0) |
1070 gr_crstr_gpc_map5_tile35_f(0);
1072 gk20a_writel(g, gr_crstr_gpc_map0_r(), map0);
1073 gk20a_writel(g, gr_crstr_gpc_map1_r(), map1);
1074 gk20a_writel(g, gr_crstr_gpc_map2_r(), map2);
1075 gk20a_writel(g, gr_crstr_gpc_map3_r(), map3);
1076 gk20a_writel(g, gr_crstr_gpc_map4_r(), map4);
1077 gk20a_writel(g, gr_crstr_gpc_map5_r(), map5);
1079 switch (gr->tpc_count) {
1108 norm_entries = gr->tpc_count << norm_shift;
1109 coeff5_mod = (1 << 5) % norm_entries;
1110 coeff6_mod = (1 << 6) % norm_entries;
1111 coeff7_mod = (1 << 7) % norm_entries;
1112 coeff8_mod = (1 << 8) % norm_entries;
1113 coeff9_mod = (1 << 9) % norm_entries;
1114 coeff10_mod = (1 << 10) % norm_entries;
1115 coeff11_mod = (1 << 11) % norm_entries;
1117 gk20a_writel(g, gr_ppcs_wwdx_map_table_cfg_r(),
1118 gr_ppcs_wwdx_map_table_cfg_row_offset_f(gr->map_row_offset) |
1119 gr_ppcs_wwdx_map_table_cfg_normalized_num_entries_f(norm_entries) |
1120 gr_ppcs_wwdx_map_table_cfg_normalized_shift_value_f(norm_shift) |
1121 gr_ppcs_wwdx_map_table_cfg_coeff5_mod_value_f(coeff5_mod) |
1122 gr_ppcs_wwdx_map_table_cfg_num_entries_f(gr->tpc_count));
1124 gk20a_writel(g, gr_ppcs_wwdx_map_table_cfg2_r(),
1125 gr_ppcs_wwdx_map_table_cfg2_coeff6_mod_value_f(coeff6_mod) |
1126 gr_ppcs_wwdx_map_table_cfg2_coeff7_mod_value_f(coeff7_mod) |
1127 gr_ppcs_wwdx_map_table_cfg2_coeff8_mod_value_f(coeff8_mod) |
1128 gr_ppcs_wwdx_map_table_cfg2_coeff9_mod_value_f(coeff9_mod) |
1129 gr_ppcs_wwdx_map_table_cfg2_coeff10_mod_value_f(coeff10_mod) |
1130 gr_ppcs_wwdx_map_table_cfg2_coeff11_mod_value_f(coeff11_mod));
1132 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map0_r(), map0);
1133 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map1_r(), map1);
1134 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map2_r(), map2);
1135 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map3_r(), map3);
1136 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map4_r(), map4);
1137 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map5_r(), map5);
1139 gk20a_writel(g, gr_rstr2d_map_table_cfg_r(),
1140 gr_rstr2d_map_table_cfg_row_offset_f(gr->map_row_offset) |
1141 gr_rstr2d_map_table_cfg_num_entries_f(gr->tpc_count));
1143 gk20a_writel(g, gr_rstr2d_gpc_map0_r(), map0);
1144 gk20a_writel(g, gr_rstr2d_gpc_map1_r(), map1);
1145 gk20a_writel(g, gr_rstr2d_gpc_map2_r(), map2);
1146 gk20a_writel(g, gr_rstr2d_gpc_map3_r(), map3);
1147 gk20a_writel(g, gr_rstr2d_gpc_map4_r(), map4);
1148 gk20a_writel(g, gr_rstr2d_gpc_map5_r(), map5);
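/* Bit-twiddling helpers for the alpha/beta table setup below: count_bits()
 * returns the number of set bits in a mask, and clear_count_bits() clears
 * up to 'clear_count' of the lowest set bits of 'num'. */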
1153 static inline u32 count_bits(u32 mask)
1157 for (count = 0; temp != 0; count++)
1163 static inline u32 clear_count_bits(u32 num, u32 clear_count)
1165 u32 count = clear_count;
1166 for (; (num != 0) && (count != 0); count--)
1172 static int gr_gk20a_setup_alpha_beta_tables(struct gk20a *g,
1173 struct gr_gk20a *gr)
1175 u32 table_index_bits = 5;
1176 u32 rows = (1 << table_index_bits);
1177 u32 row_stride = gr_pd_alpha_ratio_table__size_1_v() / rows;
1182 u32 gpcs_per_reg = 4;
1185 u32 num_pes_per_gpc = proj_scal_litter_num_pes_per_gpc_v();
1187 u32 alpha_target, beta_target;
1188 u32 alpha_bits, beta_bits;
1189 u32 alpha_mask, beta_mask, partial_mask;
1193 u32 map_alpha[gr_pd_alpha_ratio_table__size_1_v()];
1194 u32 map_beta[gr_pd_alpha_ratio_table__size_1_v()];
1195 u32 map_reg_used[gr_pd_alpha_ratio_table__size_1_v()];
1199 memset(map_alpha, 0, gr_pd_alpha_ratio_table__size_1_v() * sizeof(u32));
1200 memset(map_beta, 0, gr_pd_alpha_ratio_table__size_1_v() * sizeof(u32));
1201 memset(map_reg_used, 0, gr_pd_alpha_ratio_table__size_1_v() * sizeof(u32));
1203 for (row = 0; row < rows; ++row) {
1204 alpha_target = max_t(u32, gr->tpc_count * row / rows, 1);
1205 beta_target = gr->tpc_count - alpha_target;
1207 assign_alpha = (alpha_target < beta_target);
1209 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
1210 reg_offset = (row * row_stride) + (gpc_index / gpcs_per_reg);
1211 alpha_mask = beta_mask = 0;
1213 for (pes_index = 0; pes_index < num_pes_per_gpc; pes_index++) {
1214 tpc_count_pes = gr->pes_tpc_count[pes_index][gpc_index];
1217 alpha_bits = (alpha_target == 0) ? 0 : tpc_count_pes;
1218 beta_bits = tpc_count_pes - alpha_bits;
1220 beta_bits = (beta_target == 0) ? 0 : tpc_count_pes;
1221 alpha_bits = tpc_count_pes - beta_bits;
1224 partial_mask = gr->pes_tpc_mask[pes_index][gpc_index];
1225 partial_mask = clear_count_bits(partial_mask, tpc_count_pes - alpha_bits);
1226 alpha_mask |= partial_mask;
1228 partial_mask = gr->pes_tpc_mask[pes_index][gpc_index] ^ partial_mask;
1229 beta_mask |= partial_mask;
1231 alpha_target -= min(alpha_bits, alpha_target);
1232 beta_target -= min(beta_bits, beta_target);
1234 if ((alpha_bits > 0) || (beta_bits > 0))
1235 assign_alpha = !assign_alpha;
1238 switch (gpc_index % gpcs_per_reg) {
1240 map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n0_mask_f(alpha_mask);
1241 map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n0_mask_f(beta_mask);
1244 map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n1_mask_f(alpha_mask);
1245 map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n1_mask_f(beta_mask);
1248 map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n2_mask_f(alpha_mask);
1249 map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n2_mask_f(beta_mask);
1252 map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n3_mask_f(alpha_mask);
1253 map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n3_mask_f(beta_mask);
1256 map_reg_used[reg_offset] = true;
1260 for (index = 0; index < gr_pd_alpha_ratio_table__size_1_v(); index++) {
1261 if (map_reg_used[index]) {
1262 gk20a_writel(g, gr_pd_alpha_ratio_table_r(index), map_alpha[index]);
1263 gk20a_writel(g, gr_pd_beta_ratio_table_r(index), map_beta[index]);
1270 static int gr_gk20a_ctx_state_floorsweep(struct gk20a *g)
1272 struct gr_gk20a *gr = &g->gr;
1273 u32 tpc_index, gpc_index;
1274 u32 tpc_offset, gpc_offset;
1275 u32 sm_id = 0, gpc_id = 0;
1276 u32 sm_id_to_gpc_id[proj_scal_max_gpcs_v() * proj_scal_max_tpc_per_gpc_v()];
1278 u32 max_ways_evict = INVALID_MAX_WAYS;
1282 for (tpc_index = 0; tpc_index < gr->max_tpc_per_gpc_count; tpc_index++) {
1283 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
1284 gpc_offset = proj_gpc_stride_v() * gpc_index;
1285 if (tpc_index < gr->gpc_tpc_count[gpc_index]) {
1286 tpc_offset = proj_tpc_in_gpc_stride_v() * tpc_index;
1288 gk20a_writel(g, gr_gpc0_tpc0_sm_cfg_r() + gpc_offset + tpc_offset,
1289 gr_gpc0_tpc0_sm_cfg_sm_id_f(sm_id));
1290 gk20a_writel(g, gr_gpc0_tpc0_l1c_cfg_smid_r() + gpc_offset + tpc_offset,
1291 gr_gpc0_tpc0_l1c_cfg_smid_value_f(sm_id));
1292 gk20a_writel(g, gr_gpc0_gpm_pd_sm_id_r(tpc_index) + gpc_offset,
1293 gr_gpc0_gpm_pd_sm_id_id_f(sm_id));
1294 gk20a_writel(g, gr_gpc0_tpc0_pe_cfg_smid_r() + gpc_offset + tpc_offset,
1295 gr_gpc0_tpc0_pe_cfg_smid_value_f(sm_id));
1297 sm_id_to_gpc_id[sm_id] = gpc_index;
1301 gk20a_writel(g, gr_gpc0_gpm_pd_active_tpcs_r() + gpc_offset,
1302 gr_gpc0_gpm_pd_active_tpcs_num_f(gr->gpc_tpc_count[gpc_index]));
1303 gk20a_writel(g, gr_gpc0_gpm_sd_active_tpcs_r() + gpc_offset,
1304 gr_gpc0_gpm_sd_active_tpcs_num_f(gr->gpc_tpc_count[gpc_index]));
1308 for (tpc_index = 0, gpc_id = 0;
1309 tpc_index < gr_pd_num_tpc_per_gpc__size_1_v();
1310 tpc_index++, gpc_id += 8) {
1312 if (gpc_id >= gr->gpc_count)
1316 gr_pd_num_tpc_per_gpc_count0_f(gr->gpc_tpc_count[gpc_id + 0]) |
1317 gr_pd_num_tpc_per_gpc_count1_f(gr->gpc_tpc_count[gpc_id + 1]) |
1318 gr_pd_num_tpc_per_gpc_count2_f(gr->gpc_tpc_count[gpc_id + 2]) |
1319 gr_pd_num_tpc_per_gpc_count3_f(gr->gpc_tpc_count[gpc_id + 3]) |
1320 gr_pd_num_tpc_per_gpc_count4_f(gr->gpc_tpc_count[gpc_id + 4]) |
1321 gr_pd_num_tpc_per_gpc_count5_f(gr->gpc_tpc_count[gpc_id + 5]) |
1322 gr_pd_num_tpc_per_gpc_count6_f(gr->gpc_tpc_count[gpc_id + 6]) |
1323 gr_pd_num_tpc_per_gpc_count7_f(gr->gpc_tpc_count[gpc_id + 7]);
1325 gk20a_writel(g, gr_pd_num_tpc_per_gpc_r(tpc_index), tpc_per_gpc);
1326 gk20a_writel(g, gr_ds_num_tpc_per_gpc_r(tpc_index), tpc_per_gpc);
1329 /* gr__setup_pd_mapping stubbed for gk20a */
1330 gr_gk20a_setup_rop_mapping(g, gr);
1331 gr_gk20a_setup_alpha_beta_tables(g, gr);
1333 if (gr->num_fbps == 1)
1336 if (max_ways_evict != INVALID_MAX_WAYS)
1337 gk20a_writel(g, ltc_ltcs_ltss_tstg_set_mgmt_r(),
1338 ((gk20a_readl(g, ltc_ltcs_ltss_tstg_set_mgmt_r()) &
1339 ~(ltc_ltcs_ltss_tstg_set_mgmt_max_ways_evict_last_f(~0))) |
1340 ltc_ltcs_ltss_tstg_set_mgmt_max_ways_evict_last_f(max_ways_evict)));
1343 gpc_index < gr_pd_dist_skip_table__size_1_v() * 4;
1346 gk20a_writel(g, gr_pd_dist_skip_table_r(gpc_index/4),
1347 gr_pd_dist_skip_table_gpc_4n0_mask_f(gr->gpc_skip_mask[gpc_index]) |
1348 gr_pd_dist_skip_table_gpc_4n1_mask_f(gr->gpc_skip_mask[gpc_index + 1]) |
1349 gr_pd_dist_skip_table_gpc_4n2_mask_f(gr->gpc_skip_mask[gpc_index + 2]) |
1350 gr_pd_dist_skip_table_gpc_4n3_mask_f(gr->gpc_skip_mask[gpc_index + 3]));
1353 gk20a_writel(g, gr_cwd_fs_r(),
1354 gr_cwd_fs_num_gpcs_f(gr->gpc_count) |
1355 gr_cwd_fs_num_tpcs_f(gr->tpc_count));
1357 gk20a_writel(g, gr_bes_zrop_settings_r(),
1358 gr_bes_zrop_settings_num_active_fbps_f(gr->num_fbps));
1359 gk20a_writel(g, gr_bes_crop_settings_r(),
1360 gr_bes_crop_settings_num_active_fbps_f(gr->num_fbps));
1365 static int gr_gk20a_fecs_ctx_image_save(struct channel_gk20a *c, u32 save_type)
1367 struct gk20a *g = c->g;
1371 u64_lo32(sg_phys(c->inst_block.mem.sgt->sgl)
1372 >> ram_in_base_shift_v());
1377 ret = gr_gk20a_submit_fecs_method_op(g,
1378 (struct fecs_method_op_gk20a) {
1379 .method.addr = save_type,
1380 .method.data = (gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
1381 gr_fecs_current_ctx_target_vid_mem_f() |
1382 gr_fecs_current_ctx_valid_f(1)),
1383 .mailbox = {.id = 0, .data = 0, .clr = 3, .ret = NULL,
1386 .cond.ok = GR_IS_UCODE_OP_AND,
1387 .cond.fail = GR_IS_UCODE_OP_AND,
1391 nvhost_err(dev_from_gk20a(g), "save context image failed");
1396 /* init global golden image from a fresh gr_ctx in channel ctx.
1397 save a copy in local_golden_image in ctx_vars */
1398 static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
1399 struct channel_gk20a *c)
1401 struct gr_gk20a *gr = &g->gr;
1402 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
1403 u32 ctx_header_bytes = ctxsw_prog_fecs_header_v();
1404 u32 ctx_header_words;
1407 void *ctx_ptr = NULL;
1408 void *gold_ptr = NULL;
1413 /* golden ctx is global to all channels. Although only the first
1414 channel initializes the golden image, the driver needs to prevent
1415 multiple channels from initializing the golden ctx at the same time */
1416 mutex_lock(&gr->ctx_mutex);
1418 if (gr->ctx_vars.golden_image_initialized)
1421 err = gr_gk20a_fecs_ctx_bind_channel(g, c);
1425 err = gr_gk20a_elpg_protected_call(g,
1426 gr_gk20a_commit_global_ctx_buffers(g, c, false));
1430 gold_ptr = nvhost_memmgr_mmap(gr->global_ctx_buffer[GOLDEN_CTX].ref);
1434 ctx_ptr = nvhost_memmgr_mmap(ch_ctx->gr_ctx.mem.ref);
1438 ctx_header_words = roundup(ctx_header_bytes, sizeof(u32));
1439 ctx_header_words >>= 2;
1441 /* Channel gr_ctx buffer is gpu cacheable.
1442 Flush before cpu read. */
1443 gk20a_mm_fb_flush(g);
1444 gk20a_mm_l2_flush(g, false);
1446 for (i = 0; i < ctx_header_words; i++) {
1447 data = mem_rd32(ctx_ptr, i);
1448 mem_wr32(gold_ptr, i, data);
1451 mem_wr32(gold_ptr + ctxsw_prog_main_image_zcull_o(), 0,
1452 ctxsw_prog_main_image_zcull_mode_no_ctxsw_v());
1454 mem_wr32(gold_ptr + ctxsw_prog_main_image_zcull_ptr_o(), 0, 0);
1456 gr_gk20a_commit_inst(c, ch_ctx->global_ctx_buffer_va[GOLDEN_CTX_VA]);
1458 gr_gk20a_fecs_ctx_image_save(c, gr_fecs_method_push_adr_wfi_golden_save_v());
1460 if (gr->ctx_vars.local_golden_image == NULL) {
1462 gr->ctx_vars.local_golden_image =
1463 kzalloc(gr->ctx_vars.golden_image_size, GFP_KERNEL);
1465 if (gr->ctx_vars.local_golden_image == NULL) {
1470 for (i = 0; i < gr->ctx_vars.golden_image_size / 4; i++)
1471 gr->ctx_vars.local_golden_image[i] =
1472 mem_rd32(gold_ptr, i);
1475 gr_gk20a_commit_inst(c, ch_ctx->gr_ctx.gpu_va);
1477 gr->ctx_vars.golden_image_initialized = true;
1479 gk20a_mm_l2_invalidate(g);
1481 gk20a_writel(g, gr_fecs_current_ctx_r(),
1482 gr_fecs_current_ctx_valid_false_f());
1486 nvhost_dbg(dbg_fn | dbg_err, "fail");
1488 nvhost_dbg_fn("done");
1491 nvhost_memmgr_munmap(gr->global_ctx_buffer[GOLDEN_CTX].ref,
1494 nvhost_memmgr_munmap(ch_ctx->gr_ctx.mem.ref, ctx_ptr);
1496 mutex_unlock(&gr->ctx_mutex);
1500 /* load a saved fresh copy of the golden image into the channel gr_ctx */
1501 static int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
1502 struct channel_gk20a *c)
1504 struct gr_gk20a *gr = &g->gr;
1505 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
1510 void *ctx_ptr = NULL;
1514 if (gr->ctx_vars.local_golden_image == NULL)
1517 /* Channel gr_ctx buffer is gpu cacheable.
1518 Flush and invalidate before cpu update. */
1519 gk20a_mm_fb_flush(g);
1520 gk20a_mm_l2_flush(g, true);
1522 ctx_ptr = nvhost_memmgr_mmap(ch_ctx->gr_ctx.mem.ref);
1526 for (i = 0; i < gr->ctx_vars.golden_image_size / 4; i++)
1527 mem_wr32(ctx_ptr, i, gr->ctx_vars.local_golden_image[i]);
1529 mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_save_ops_o(), 0, 0);
1530 mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_restore_ops_o(), 0, 0);
1532 virt_addr_lo = u64_lo32(ch_ctx->patch_ctx.gpu_va);
1533 virt_addr_hi = u64_hi32(ch_ctx->patch_ctx.gpu_va);
1535 mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_count_o(), 0,
1536 ch_ctx->patch_ctx.data_count);
1537 mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_adr_lo_o(), 0,
1539 mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_adr_hi_o(), 0,
1542 /* no user for client managed performance counter ctx */
1543 ch_ctx->pm_ctx.ctx_sw_mode =
1544 ctxsw_prog_main_image_pm_mode_no_ctxsw_v();
1546 mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0,
1547 ch_ctx->pm_ctx.ctx_sw_mode);
1548 mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_ptr_o(), 0, 0);
1550 nvhost_memmgr_munmap(ch_ctx->gr_ctx.mem.ref, ctx_ptr);
1552 gk20a_mm_l2_invalidate(g);
1554 if (tegra_platform_is_linsim()) {
1556 u64_lo32(sg_phys(c->inst_block.mem.sgt->sgl)
1557 >> ram_in_base_shift_v());
1559 ret = gr_gk20a_submit_fecs_method_op(g,
1560 (struct fecs_method_op_gk20a) {
1562 (gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
1563 gr_fecs_current_ctx_target_vid_mem_f() |
1564 gr_fecs_current_ctx_valid_f(1)),
1566 gr_fecs_method_push_adr_restore_golden_v(),
1569 .clr = ~0, .ret = NULL,
1570 .ok = gr_fecs_ctxsw_mailbox_value_pass_v(),
1572 .cond.ok = GR_IS_UCODE_OP_EQUAL,
1573 .cond.fail = GR_IS_UCODE_OP_SKIP});
1576 nvhost_err(dev_from_gk20a(g),
1577 "restore context image failed");
1583 static int gr_gk20a_init_ctxsw_ucode_vaspace(struct gk20a *g)
1585 struct mm_gk20a *mm = &g->mm;
1586 struct mem_mgr *memmgr = mem_mgr_from_mm(mm);
1587 struct vm_gk20a *vm = &mm->pmu.vm;
1588 struct device *d = dev_from_gk20a(g);
1589 struct gk20a_ctxsw_ucode_info *p_ucode_info = &g->ctxsw_ucode_info;
1595 /* Alloc mem of inst block */
1596 p_ucode_info->inst_blk_desc.ref = nvhost_memmgr_alloc(memmgr,
1597 ram_in_alloc_size_v(),
1598 DEFAULT_ALLOC_ALIGNMENT,
1599 DEFAULT_ALLOC_FLAGS,
1601 if (IS_ERR(p_ucode_info->inst_blk_desc.ref)) {
1602 p_ucode_info->inst_blk_desc.ref = 0;
1605 p_ucode_info->inst_blk_desc.sgt =
1606 nvhost_memmgr_sg_table(memmgr,
1607 p_ucode_info->inst_blk_desc.ref);
1608 p_ucode_info->inst_blk_desc.size = ram_in_alloc_size_v();
1610 inst_ptr = nvhost_memmgr_mmap(p_ucode_info->inst_blk_desc.ref);
1612 nvhost_err(d, "failed to map inst_blk desc buffer");
1613 nvhost_memmgr_put(memmgr, p_ucode_info->inst_blk_desc.ref);
1617 /* Set inst block */
1618 mem_wr32(inst_ptr, ram_in_adr_limit_lo_w(),
1619 u64_lo32(vm->va_limit) | 0xFFF);
1620 mem_wr32(inst_ptr, ram_in_adr_limit_hi_w(),
1621 ram_in_adr_limit_hi_f(u64_hi32(vm->va_limit)));
1623 pde_addr = gk20a_mm_iova_addr(vm->pdes.sgt->sgl);
1624 pde_addr_lo = u64_lo32(pde_addr >> 12);
1625 pde_addr_hi = u64_hi32(pde_addr);
1626 mem_wr32(inst_ptr, ram_in_page_dir_base_lo_w(),
1627 ram_in_page_dir_base_target_vid_mem_f() |
1628 ram_in_page_dir_base_vol_true_f() |
1629 ram_in_page_dir_base_lo_f(pde_addr_lo));
1630 mem_wr32(inst_ptr, ram_in_page_dir_base_hi_w(),
1631 ram_in_page_dir_base_hi_f(pde_addr_hi));
1633 nvhost_memmgr_munmap(p_ucode_info->inst_blk_desc.ref, (void *)inst_ptr);
1635 /* Map ucode surface to GMMU */
1636 p_ucode_info->ucode_va = vm->map(vm, memmgr,
1637 p_ucode_info->surface_desc.ref, 0, 0, 0, NULL, false,
1643 static void gr_gk20a_init_ctxsw_ucode_segment(
1644 struct gk20a_ctxsw_ucode_segment *p_seg, u32 *p_offset, u32 size)
1646 p_seg->offset = *p_offset;
1648 *p_offset = ALIGN(*p_offset + size, BLK_SIZE);
1651 static void gr_gk20a_init_ctxsw_ucode_inst(
1652 struct gk20a_ctxsw_ucode_inst *p_inst, u32 *p_offset,
1653 struct gk20a_ctxsw_bootloader_desc *p_bootdesc,
1654 u32 code_size, u32 data_size)
1656 u32 boot_size = ALIGN(p_bootdesc->bootloader_size, sizeof(u32));
1657 p_inst->boot_entry = p_bootdesc->bootloader_entry_point;
1658 p_inst->boot_imem_offset = p_bootdesc->bootloader_imem_offset;
1659 gr_gk20a_init_ctxsw_ucode_segment(&p_inst->boot, p_offset, boot_size);
1660 gr_gk20a_init_ctxsw_ucode_segment(&p_inst->code, p_offset, code_size);
1661 gr_gk20a_init_ctxsw_ucode_segment(&p_inst->data, p_offset, data_size);
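/* The resulting ucode surface layout is the FECS [boot][code][data] triple
 * followed by the GPCCS [boot][code][data] triple, each segment aligned to
 * BLK_SIZE (see gr_gk20a_init_ctxsw_ucode() below). */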
1664 static int gr_gk20a_copy_ctxsw_ucode_inst(
1666 struct gk20a_ctxsw_ucode_inst *p_inst,
1667 struct gk20a_ctxsw_bootloader_desc *p_bootdesc, u32 *p_bootimage,
1668 u32 *p_code, u32 *p_data)
1670 memcpy(p_buf + p_inst->boot.offset, p_bootimage, p_inst->boot.size);
1671 memcpy(p_buf + p_inst->code.offset, p_code, p_inst->code.size);
1672 memcpy(p_buf + p_inst->data.offset, p_data, p_inst->data.size);
1676 static int gr_gk20a_init_ctxsw_ucode(struct gk20a *g)
1678 struct mm_gk20a *mm = &g->mm;
1679 struct mem_mgr *memmgr = mem_mgr_from_mm(mm);
1680 struct device *d = dev_from_gk20a(g);
1681 struct gk20a_ctxsw_bootloader_desc *p_fecs_boot_desc =
1682 &g_fecs_bootloader_desc;
1683 struct gk20a_ctxsw_bootloader_desc *p_gpcs_boot_desc =
1684 &g_gpccs_bootloader_desc;
1685 u32 *p_fecs_boot_image = g_fecs_bootloader_image;
1686 u32 *p_gpcs_boot_image = g_gpccs_bootloader_image;
1687 struct gk20a_ctxsw_ucode_info *p_ucode_info = &g->ctxsw_ucode_info;
1692 gr_gk20a_init_ctxsw_ucode_inst(&p_ucode_info->fecs, &ucode_size,
1694 g->gr.ctx_vars.ucode.fecs.inst.count * sizeof(u32),
1695 g->gr.ctx_vars.ucode.fecs.data.count * sizeof(u32));
1696 gr_gk20a_init_ctxsw_ucode_inst(&p_ucode_info->gpcs, &ucode_size,
1698 g->gr.ctx_vars.ucode.gpccs.inst.count * sizeof(u32),
1699 g->gr.ctx_vars.ucode.gpccs.data.count * sizeof(u32));
1701 p_ucode_info->surface_desc.ref = nvhost_memmgr_alloc(memmgr,
1703 DEFAULT_ALLOC_ALIGNMENT,
1704 DEFAULT_ALLOC_FLAGS,
1707 p_buf = (u8 *)nvhost_memmgr_mmap(p_ucode_info->surface_desc.ref);
1709 nvhost_err(d, "failed to map surface desc buffer");
1713 gr_gk20a_copy_ctxsw_ucode_inst(p_buf, &p_ucode_info->fecs,
1714 p_fecs_boot_desc, p_fecs_boot_image,
1715 g->gr.ctx_vars.ucode.fecs.inst.l,
1716 g->gr.ctx_vars.ucode.fecs.data.l);
1718 gr_gk20a_copy_ctxsw_ucode_inst(p_buf, &p_ucode_info->gpcs,
1719 p_gpcs_boot_desc, p_gpcs_boot_image,
1720 g->gr.ctx_vars.ucode.gpccs.inst.l,
1721 g->gr.ctx_vars.ucode.gpccs.data.l);
1723 nvhost_memmgr_munmap(p_ucode_info->surface_desc.ref, p_buf);
1725 gr_gk20a_init_ctxsw_ucode_vaspace(g);
1730 static void gr_gk20a_load_falcon_bind_instblk(struct gk20a *g)
1732 struct gk20a_ctxsw_ucode_info *p_ucode_info = &g->ctxsw_ucode_info;
1734 phys_addr_t inst_ptr;
1737 while ((gk20a_readl(g, gr_fecs_ctxsw_status_1_r()) &
1738 gr_fecs_ctxsw_status_1_arb_busy_m()) && retries) {
1743 nvhost_err(dev_from_gk20a(g), "arbiter idle timeout");
1745 gk20a_writel(g, gr_fecs_arb_ctx_adr_r(), 0x0);
1747 inst_ptr = sg_phys(p_ucode_info->inst_blk_desc.sgt->sgl);
1748 gk20a_writel(g, gr_fecs_new_ctx_r(),
1749 gr_fecs_new_ctx_ptr_f(inst_ptr >> 12) |
1750 gr_fecs_new_ctx_target_m() |
1751 gr_fecs_new_ctx_valid_m());
1753 gk20a_writel(g, gr_fecs_arb_ctx_ptr_r(),
1754 gr_fecs_arb_ctx_ptr_ptr_f(inst_ptr >> 12) |
1755 gr_fecs_arb_ctx_ptr_target_m());
1757 gk20a_writel(g, gr_fecs_arb_ctx_cmd_r(), 0x7);
1759 /* Wait for arbiter command to complete */
1761 val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
1762 while (gr_fecs_arb_ctx_cmd_cmd_v(val) && retries) {
1765 val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
1768 nvhost_err(dev_from_gk20a(g), "arbiter complete timeout");
1770 gk20a_writel(g, gr_fecs_current_ctx_r(),
1771 gr_fecs_current_ctx_ptr_f(inst_ptr >> 12) |
1772 gr_fecs_current_ctx_target_m() |
1773 gr_fecs_current_ctx_valid_m());
1774 /* Send command to arbiter to flush */
1775 gk20a_writel(g, gr_fecs_arb_ctx_cmd_r(), gr_fecs_arb_ctx_cmd_cmd_s());
1778 val = (gk20a_readl(g, gr_fecs_arb_ctx_cmd_r()));
1779 while (gr_fecs_arb_ctx_cmd_cmd_v(val) && retries) {
1782 val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
1785 nvhost_err(dev_from_gk20a(g), "arbiter complete timeout");
1788 static int gr_gk20a_load_ctxsw_ucode_inst(struct gk20a *g, u64 addr_base,
1789 struct gk20a_ctxsw_ucode_inst *p_inst, u32 reg_offset)
1798 addr_code32 = u64_lo32((addr_base + p_inst->code.offset) >> 8);
1799 addr_data32 = u64_lo32((addr_base + p_inst->data.offset) >> 8);
1800 addr_load32 = u64_lo32((addr_base + p_inst->boot.offset) >> 8);
1802 gk20a_writel(g, reg_offset + gr_fecs_dmactl_r(),
1803 gr_fecs_dmactl_require_ctx_f(0));
1806 * Copy falcon bootloader header into dmem at offset 0.
1807 * Configure dmem port 0 for auto-incrementing writes starting at dmem offset 0.
1810 gk20a_writel(g, reg_offset + gr_fecs_dmemc_r(0),
1811 gr_fecs_dmemc_offs_f(0) |
1812 gr_fecs_dmemc_blk_f(0) |
1813 gr_fecs_dmemc_aincw_f(1));
1815 /* Write out the actual data */
1816 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
1817 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), addr_code32);
1818 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
1819 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), p_inst->code.size);
1820 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
1821 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), addr_data32);
1822 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), p_inst->data.size);
1823 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), addr_code32);
1824 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
1825 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
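/* (These ten header words give the bootloader the FB offsets and sizes of the
 * code and data segments; the exact field meanings are defined by the
 * bootloader image above, not by this driver.) */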
1827 blocks = ((p_inst->boot.size + 0xFF) & ~0xFF) >> 8;
1830 * Set the base FB address for the DMA transfer. Subtract off the 256
1831 * byte IMEM block offset such that the relative FB and IMEM offsets
1832 * match, allowing the IMEM tags to be properly created.
1835 dst = p_inst->boot_imem_offset;
1836 gk20a_writel(g, reg_offset + gr_fecs_dmatrfbase_r(),
1837 (addr_load32 - (dst >> 8)));
1839 for (b = 0; b < blocks; b++) {
1840 /* Setup destination IMEM offset */
1841 gk20a_writel(g, reg_offset + gr_fecs_dmatrfmoffs_r(),
1844 /* Setup source offset (relative to BASE) */
1845 gk20a_writel(g, reg_offset + gr_fecs_dmatrffboffs_r(),
1848 gk20a_writel(g, reg_offset + gr_fecs_dmatrfcmd_r(),
1849 gr_fecs_dmatrfcmd_imem_f(0x01) |
1850 gr_fecs_dmatrfcmd_write_f(0x00) |
1851 gr_fecs_dmatrfcmd_size_f(0x06) |
1852 gr_fecs_dmatrfcmd_ctxdma_f(0));
1855 /* Specify the falcon boot vector */
1856 gk20a_writel(g, reg_offset + gr_fecs_bootvec_r(),
1857 gr_fecs_bootvec_vec_f(p_inst->boot_entry));
1859 /* Write to CPUCTL to start the falcon */
1860 gk20a_writel(g, reg_offset + gr_fecs_cpuctl_r(),
1861 gr_fecs_cpuctl_startcpu_f(0x01));
1866 static void gr_gk20a_load_falcon_with_bootloader(struct gk20a *g)
1868 struct gk20a_ctxsw_ucode_info *p_ucode_info = &g->ctxsw_ucode_info;
1869 u64 addr_base = p_ucode_info->ucode_va;
1871 gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0x0);
1873 gr_gk20a_load_falcon_bind_instblk(g);
1875 gr_gk20a_load_ctxsw_ucode_inst(g, addr_base,
1876 &g->ctxsw_ucode_info.fecs, 0);
1878 gr_gk20a_load_ctxsw_ucode_inst(g, addr_base,
1879 &g->ctxsw_ucode_info.gpcs,
1880 gr_gpcs_gpccs_falcon_hwcfg_r() -
1881 gr_fecs_falcon_hwcfg_r());
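/* The GPCCS falcon registers mirror the FECS ones at a fixed stride, so the
 * same loader routine is reused with the register-offset delta computed above. */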
1884 static int gr_gk20a_load_ctxsw_ucode(struct gk20a *g, struct gr_gk20a *gr)
1890 if (tegra_platform_is_linsim()) {
1891 gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(7),
1892 gr_fecs_ctxsw_mailbox_value_f(0xc0de7777));
1893 gk20a_writel(g, gr_gpccs_ctxsw_mailbox_r(7),
1894 gr_gpccs_ctxsw_mailbox_value_f(0xc0de7777));
1897 if (!gr->skip_ucode_init)
1898 gr_gk20a_init_ctxsw_ucode(g);
1899 gr_gk20a_load_falcon_with_bootloader(g);
1900 gr->skip_ucode_init = true;
1902 ret = gr_gk20a_ctx_wait_ucode(g, 0, 0,
1903 GR_IS_UCODE_OP_EQUAL,
1904 eUcodeHandshakeInitComplete,
1905 GR_IS_UCODE_OP_SKIP, 0);
1907 nvhost_err(dev_from_gk20a(g), "falcon ucode init timeout");
1911 gk20a_writel(g, gr_fecs_current_ctx_r(),
1912 gr_fecs_current_ctx_valid_false_f());
1914 gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0xffffffff);
1915 gk20a_writel(g, gr_fecs_method_data_r(), 0x7fffffff);
1916 gk20a_writel(g, gr_fecs_method_push_r(),
1917 gr_fecs_method_push_adr_set_watchdog_timeout_f());
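/* 0x7fffffff: largest positive timeout value for the set_watchdog_timeout
 * method, presumably so the FECS watchdog effectively never fires. */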
1919 nvhost_dbg_fn("done");
1923 static int gr_gk20a_init_ctx_state(struct gk20a *g, struct gr_gk20a *gr)
1925 u32 golden_ctx_image_size = 0;
1926 u32 zcull_ctx_image_size = 0;
1927 u32 pm_ctx_image_size = 0;
1929 struct fecs_method_op_gk20a op = {
1930 .mailbox = { .id = 0, .data = 0,
1931 .clr = ~0, .ok = 0, .fail = 0},
1933 .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL,
1934 .cond.fail = GR_IS_UCODE_OP_SKIP,
1938 op.method.addr = gr_fecs_method_push_adr_discover_image_size_v();
1939 op.mailbox.ret = &golden_ctx_image_size;
1940 ret = gr_gk20a_submit_fecs_method_op(g, op);
1942 nvhost_err(dev_from_gk20a(g),
1943 "query golden image size failed");
1946 op.method.addr = gr_fecs_method_push_adr_discover_zcull_image_size_v();
1947 op.mailbox.ret = &zcull_ctx_image_size;
1948 ret = gr_gk20a_submit_fecs_method_op(g, op);
1950 nvhost_err(dev_from_gk20a(g),
1951 "query zcull ctx image size failed");
1954 op.method.addr = gr_fecs_method_push_adr_discover_pm_image_size_v();
1955 op.mailbox.ret = &pm_ctx_image_size;
1956 ret = gr_gk20a_submit_fecs_method_op(g, op);
1958 nvhost_err(dev_from_gk20a(g),
1959 "query pm ctx image size failed");
1963 if (!g->gr.ctx_vars.golden_image_size &&
1964 !g->gr.ctx_vars.zcull_ctxsw_image_size) {
1965 g->gr.ctx_vars.golden_image_size = golden_ctx_image_size;
1966 g->gr.ctx_vars.zcull_ctxsw_image_size = zcull_ctx_image_size;
1968 /* hw is different after railgating? */
1969 BUG_ON(g->gr.ctx_vars.golden_image_size != golden_ctx_image_size);
1970 BUG_ON(g->gr.ctx_vars.zcull_ctxsw_image_size != zcull_ctx_image_size);
1973 nvhost_dbg_fn("done");
1977 static int gr_gk20a_alloc_global_ctx_buffers(struct gk20a *g)
1979 struct gr_gk20a *gr = &g->gr;
1980 struct mem_mgr *memmgr = mem_mgr_from_g(g);
1981 struct mem_handle *mem;
1982 u32 i, attr_buffer_size;
1984 u32 cb_buffer_size = gr_scc_bundle_cb_size_div_256b__prod_v() *
1985 gr_scc_bundle_cb_size_div_256b_byte_granularity_v();
1987 u32 pagepool_buffer_size = gr_scc_pagepool_total_pages_hwmax_value_v() *
1988 gr_scc_pagepool_total_pages_byte_granularity_v();
1990 u32 attr_cb_default_size = gr_gpc0_ppc0_cbm_cfg_size_default_v();
1991 u32 alpha_cb_default_size = gr_gpc0_ppc0_cbm_cfg2_size_default_v();
1994 attr_cb_default_size + (attr_cb_default_size >> 1);
1996 alpha_cb_default_size + (alpha_cb_default_size >> 1);
1998 u32 num_tpcs_per_pes = proj_scal_litter_num_tpcs_per_pes_v();
1999 u32 attr_max_size_per_tpc =
2000 gr_gpc0_ppc0_cbm_cfg_size_v(~0) / num_tpcs_per_pes;
2001 u32 alpha_max_size_per_tpc =
2002 gr_gpc0_ppc0_cbm_cfg2_size_v(~0) / num_tpcs_per_pes;
2008 (attr_cb_size > attr_max_size_per_tpc) ?
2009 attr_max_size_per_tpc : attr_cb_size;
2010 attr_cb_default_size =
2011 (attr_cb_default_size > attr_cb_size) ?
2012 attr_cb_size : attr_cb_default_size;
2014 (alpha_cb_size > alpha_max_size_per_tpc) ?
2015 alpha_max_size_per_tpc : alpha_cb_size;
2016 alpha_cb_default_size =
2017 (alpha_cb_default_size > alpha_cb_size) ?
2018 alpha_cb_size : alpha_cb_default_size;
2021 (gr_gpc0_ppc0_cbm_cfg_size_granularity_v() * alpha_cb_size +
2022 gr_gpc0_ppc0_cbm_cfg2_size_granularity_v() * alpha_cb_size) *
2025 nvhost_dbg_info("cb_buffer_size : %d", cb_buffer_size);
2027 mem = nvhost_memmgr_alloc(memmgr, cb_buffer_size,
2028 DEFAULT_ALLOC_ALIGNMENT,
2029 DEFAULT_ALLOC_FLAGS,
2034 gr->global_ctx_buffer[CIRCULAR].ref = mem;
2035 gr->global_ctx_buffer[CIRCULAR].size = cb_buffer_size;
2037 mem = nvhost_memmgr_alloc(memmgr, cb_buffer_size,
2038 DEFAULT_ALLOC_ALIGNMENT,
2039 DEFAULT_ALLOC_FLAGS,
2040 NVMAP_HEAP_CARVEOUT_VPR);
2042 gr->global_ctx_buffer[CIRCULAR_VPR].ref = mem;
2043 gr->global_ctx_buffer[CIRCULAR_VPR].size = cb_buffer_size;
2046 nvhost_dbg_info("pagepool_buffer_size : %d", pagepool_buffer_size);
2048 mem = nvhost_memmgr_alloc(memmgr, pagepool_buffer_size,
2049 DEFAULT_ALLOC_ALIGNMENT,
2050 DEFAULT_ALLOC_FLAGS,
2055 gr->global_ctx_buffer[PAGEPOOL].ref = mem;
2056 gr->global_ctx_buffer[PAGEPOOL].size = pagepool_buffer_size;
2058 mem = nvhost_memmgr_alloc(memmgr, pagepool_buffer_size,
2059 DEFAULT_ALLOC_ALIGNMENT,
2060 DEFAULT_ALLOC_FLAGS,
2061 NVMAP_HEAP_CARVEOUT_VPR);
2063 gr->global_ctx_buffer[PAGEPOOL_VPR].ref = mem;
2064 gr->global_ctx_buffer[PAGEPOOL_VPR].size = pagepool_buffer_size;
2067 nvhost_dbg_info("attr_buffer_size : %d", attr_buffer_size);
2069 mem = nvhost_memmgr_alloc(memmgr, attr_buffer_size,
2070 DEFAULT_ALLOC_ALIGNMENT,
2071 DEFAULT_ALLOC_FLAGS,
2076 gr->global_ctx_buffer[ATTRIBUTE].ref = mem;
2077 gr->global_ctx_buffer[ATTRIBUTE].size = attr_buffer_size;
2079 mem = nvhost_memmgr_alloc(memmgr, attr_buffer_size,
2080 DEFAULT_ALLOC_ALIGNMENT,
2081 DEFAULT_ALLOC_FLAGS,
2082 NVMAP_HEAP_CARVEOUT_VPR);
2084 gr->global_ctx_buffer[ATTRIBUTE_VPR].ref = mem;
2085 gr->global_ctx_buffer[ATTRIBUTE_VPR].size = attr_buffer_size;
2088 nvhost_dbg_info("golden_image_size : %d",
2089 gr->ctx_vars.golden_image_size);
2091 mem = nvhost_memmgr_alloc(memmgr, gr->ctx_vars.golden_image_size,
2092 DEFAULT_ALLOC_ALIGNMENT,
2093 DEFAULT_ALLOC_FLAGS,
2098 gr->global_ctx_buffer[GOLDEN_CTX].ref = mem;
2099 gr->global_ctx_buffer[GOLDEN_CTX].size =
2100 gr->ctx_vars.golden_image_size;
2102 nvhost_dbg_fn("done");
2106 nvhost_dbg(dbg_fn | dbg_err, "fail");
2107 for (i = 0; i < NR_GLOBAL_CTX_BUF; i++) {
2108 if (gr->global_ctx_buffer[i].ref) {
2109 nvhost_memmgr_put(memmgr,
2110 gr->global_ctx_buffer[i].ref);
2111 memset(&gr->global_ctx_buffer[i],
2112 0, sizeof(struct mem_desc));
2118 static void gr_gk20a_free_global_ctx_buffers(struct gk20a *g)
2120 struct gr_gk20a *gr = &g->gr;
2121 struct mem_mgr *memmgr = mem_mgr_from_g(g);
2124 for (i = 0; i < NR_GLOBAL_CTX_BUF; i++) {
2125 nvhost_memmgr_put(memmgr, gr->global_ctx_buffer[i].ref);
2126 memset(&gr->global_ctx_buffer[i], 0, sizeof(struct mem_desc));
2129 nvhost_dbg_fn("done");
2132 static int gr_gk20a_map_global_ctx_buffers(struct gk20a *g,
2133 struct channel_gk20a *c)
2135 struct vm_gk20a *ch_vm = c->vm;
2136 struct mem_mgr *memmgr = mem_mgr_from_g(g);
2137 struct mem_handle *handle_ref;
2138 u64 *g_bfr_va = c->ch_ctx.global_ctx_buffer_va;
2139 struct gr_gk20a *gr = &g->gr;
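/* For each global context buffer, a VPR (protected) channel is given the
 * VPR copy when one was allocated; otherwise it falls back to the normal
 * copy. The resulting GPU VAs are cached in c->ch_ctx.global_ctx_buffer_va
 * for the later commit. */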
2144 /* Circular Buffer */
2145 if (!c->vpr || (gr->global_ctx_buffer[CIRCULAR_VPR].ref == NULL))
2146 handle_ref = gr->global_ctx_buffer[CIRCULAR].ref;
2148 handle_ref = gr->global_ctx_buffer[CIRCULAR_VPR].ref;
2150 gpu_va = ch_vm->map(ch_vm, memmgr, handle_ref,
2151 /*offset_align, flags, kind*/
2152 0, NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, 0,
2153 NULL, false, mem_flag_none);
2156 g_bfr_va[CIRCULAR_VA] = gpu_va;
2158 /* Attribute Buffer */
2159 if (!c->vpr || (gr->global_ctx_buffer[ATTRIBUTE_VPR].ref == NULL))
2160 handle_ref = gr->global_ctx_buffer[ATTRIBUTE].ref;
2162 handle_ref = gr->global_ctx_buffer[ATTRIBUTE_VPR].ref;
2164 gpu_va = ch_vm->map(ch_vm, memmgr, handle_ref,
2165 /*offset_align, flags, kind*/
2166 0, NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, 0,
2167 NULL, false, mem_flag_none);
2170 g_bfr_va[ATTRIBUTE_VA] = gpu_va;
2173 if (!c->vpr || (gr->global_ctx_buffer[PAGEPOOL_VPR].ref == NULL))
2174 handle_ref = gr->global_ctx_buffer[PAGEPOOL].ref;
2176 handle_ref = gr->global_ctx_buffer[PAGEPOOL_VPR].ref;
2178 gpu_va = ch_vm->map(ch_vm, memmgr, handle_ref,
2179 /*offset_align, flags, kind*/
2180 0, NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, 0,
2181 NULL, false, mem_flag_none);
2184 g_bfr_va[PAGEPOOL_VA] = gpu_va;
2187 gpu_va = ch_vm->map(ch_vm, memmgr,
2188 gr->global_ctx_buffer[GOLDEN_CTX].ref,
2189 /*offset_align, flags, kind*/
2190 0, 0, 0, NULL, false, mem_flag_none);
2193 g_bfr_va[GOLDEN_CTX_VA] = gpu_va;
2195 c->ch_ctx.global_ctx_buffer_mapped = true;
2199 for (i = 0; i < NR_GLOBAL_CTX_BUF_VA; i++) {
2201 ch_vm->unmap(ch_vm, g_bfr_va[i]);
2208 static void gr_gk20a_unmap_global_ctx_buffers(struct channel_gk20a *c)
2210 struct vm_gk20a *ch_vm = c->vm;
2211 u64 *g_bfr_va = c->ch_ctx.global_ctx_buffer_va;
2216 for (i = 0; i < NR_GLOBAL_CTX_BUF_VA; i++) {
2218 ch_vm->unmap(ch_vm, g_bfr_va[i]);
2222 c->ch_ctx.global_ctx_buffer_mapped = false;
2225 static int gr_gk20a_alloc_channel_gr_ctx(struct gk20a *g,
2226 struct channel_gk20a *c)
2228 struct gr_gk20a *gr = &g->gr;
2229 struct gr_ctx_desc *gr_ctx = &c->ch_ctx.gr_ctx;
2230 struct mem_mgr *memmgr = gk20a_channel_mem_mgr(c);
2231 struct vm_gk20a *ch_vm = c->vm;
2235 if (gr->ctx_vars.buffer_size == 0)
2238 /* alloc channel gr ctx buffer */
2239 gr->ctx_vars.buffer_size = gr->ctx_vars.golden_image_size;
2240 gr->ctx_vars.buffer_total_size = gr->ctx_vars.golden_image_size;
2242 gr_ctx->mem.ref = nvhost_memmgr_alloc(memmgr,
2243 gr->ctx_vars.buffer_total_size,
2244 DEFAULT_ALLOC_ALIGNMENT,
2245 DEFAULT_ALLOC_FLAGS,
2248 if (IS_ERR(gr_ctx->mem.ref))
2251 gr_ctx->gpu_va = ch_vm->map(ch_vm, memmgr,
2253 /*offset_align, flags, kind*/
2254 0, NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, 0, NULL, false,
2256 if (!gr_ctx->gpu_va) {
2257 nvhost_memmgr_put(memmgr, gr_ctx->mem.ref);
2264 static void gr_gk20a_free_channel_gr_ctx(struct channel_gk20a *c)
2266 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
2267 struct mem_mgr *ch_nvmap = gk20a_channel_mem_mgr(c);
2268 struct vm_gk20a *ch_vm = c->vm;
2272 ch_vm->unmap(ch_vm, ch_ctx->gr_ctx.gpu_va);
2273 nvhost_memmgr_put(ch_nvmap, ch_ctx->gr_ctx.mem.ref);
2276 static int gr_gk20a_alloc_channel_patch_ctx(struct gk20a *g,
2277 struct channel_gk20a *c)
2279 struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
2280 struct mem_mgr *memmgr = gk20a_channel_mem_mgr(c);
2281 struct vm_gk20a *ch_vm = c->vm;
2285 patch_ctx->mem.ref = nvhost_memmgr_alloc(memmgr, 128 * sizeof(u32),
2286 DEFAULT_ALLOC_ALIGNMENT,
2287 DEFAULT_ALLOC_FLAGS,
2289 if (IS_ERR(patch_ctx->mem.ref))
2292 patch_ctx->gpu_va = ch_vm->map(ch_vm, memmgr,
2294 /*offset_align, flags, kind*/
2295 0, 0, 0, NULL, false, mem_flag_none);
2296 if (!patch_ctx->gpu_va)
2299 nvhost_dbg_fn("done");
2303 nvhost_dbg(dbg_fn | dbg_err, "fail");
2304 if (patch_ctx->mem.ref) {
2305 nvhost_memmgr_put(memmgr, patch_ctx->mem.ref);
2306 patch_ctx->mem.ref = 0;
2312 static void gr_gk20a_unmap_channel_patch_ctx(struct channel_gk20a *c)
2314 struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
2315 struct vm_gk20a *ch_vm = c->vm;
2319 if (patch_ctx->gpu_va)
2320 ch_vm->unmap(ch_vm, patch_ctx->gpu_va);
2321 patch_ctx->gpu_va = 0;
2322 patch_ctx->data_count = 0;
2325 static void gr_gk20a_free_channel_patch_ctx(struct channel_gk20a *c)
2327 struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
2328 struct mem_mgr *memmgr = gk20a_channel_mem_mgr(c);
2332 gr_gk20a_unmap_channel_patch_ctx(c);
2334 if (patch_ctx->mem.ref) {
2335 nvhost_memmgr_put(memmgr, patch_ctx->mem.ref);
2336 patch_ctx->mem.ref = 0;
2340 void gk20a_free_channel_ctx(struct channel_gk20a *c)
2342 gr_gk20a_unmap_global_ctx_buffers(c);
2343 gr_gk20a_free_channel_patch_ctx(c);
2344 gr_gk20a_free_channel_gr_ctx(c);
2346 /* zcull_ctx, pm_ctx */
2348 memset(&c->ch_ctx, 0, sizeof(struct channel_ctx_gk20a));
2351 c->first_init = false;
2354 int gk20a_alloc_obj_ctx(struct channel_gk20a *c,
2355 struct nvhost_alloc_obj_ctx_args *args)
2357 struct gk20a *g = c->g;
2358 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
2359 bool change_to_compute_mode = false;
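/* Object allocation drives the per-channel graphics context setup: validate
 * the class and the bound address space, allocate the gr and patch contexts
 * on first use, map and commit the global buffers, initialize the golden
 * context image, and load it into the channel context on the channel's
 * first object allocation. */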
2364 /* an address space needs to have been bound at this point.*/
2365 if (!gk20a_channel_as_bound(c)) {
2366 nvhost_err(dev_from_gk20a(g),
2367 "not bound to address space at time"
2368 " of grctx allocation");
2372 switch (args->class_num) {
2373 case KEPLER_COMPUTE_A:
2374 /* tbd: NV2080_CTRL_GPU_COMPUTE_MODE_RULES_EXCLUSIVE_COMPUTE */
2375 /* tbd: PDB_PROP_GRAPHICS_DISTINCT_3D_AND_COMPUTE_STATE_DEF */
2376 change_to_compute_mode = true;
2380 case KEPLER_DMA_COPY_A:
2384 nvhost_err(dev_from_gk20a(g),
2385 "invalid obj class 0x%x", args->class_num);
2390 /* allocate gr ctx buffer */
2391 if (ch_ctx->gr_ctx.mem.ref == NULL) {
2392 err = gr_gk20a_alloc_channel_gr_ctx(g, c);
2394 nvhost_err(dev_from_gk20a(g),
2395 "fail to allocate gr ctx buffer");
2399 /* TBD: needs to be more subtle about which is being allocated,
2400 * as some are allowed to be allocated along the same channel */
2401 nvhost_err(dev_from_gk20a(g),
2402 "too many classes alloc'd on same channel");
2407 /* commit gr ctx buffer */
2408 err = gr_gk20a_commit_inst(c, ch_ctx->gr_ctx.gpu_va);
2410 nvhost_err(dev_from_gk20a(g),
2411 "fail to commit gr ctx buffer");
2415 /* allocate patch buffer */
2416 if (ch_ctx->patch_ctx.mem.ref == NULL) {
2417 err = gr_gk20a_alloc_channel_patch_ctx(g, c);
2419 nvhost_err(dev_from_gk20a(g),
2420 "fail to allocate patch buffer");
2425 /* map global buffer to channel gpu_va and commit */
2426 if (!ch_ctx->global_ctx_buffer_mapped) {
2427 err = gr_gk20a_map_global_ctx_buffers(g, c);
2429 nvhost_err(dev_from_gk20a(g),
2430 "fail to map global ctx buffer");
2433 gr_gk20a_elpg_protected_call(g,
2434 gr_gk20a_commit_global_ctx_buffers(g, c, true));
2437 /* init golden image, ELPG enabled after this is done */
2438 err = gr_gk20a_init_golden_ctx_image(g, c);
2440 nvhost_err(dev_from_gk20a(g),
2441 "fail to init golden ctx image");
2445 /* load golden image */
2446 if (!c->first_init) {
2447 err = gr_gk20a_elpg_protected_call(g,
2448 gr_gk20a_load_golden_ctx_image(g, c));
2450 nvhost_err(dev_from_gk20a(g),
2451 "fail to load golden ctx image");
2454 c->first_init = true;
2456 gk20a_mm_l2_invalidate(g);
2459 nvhost_dbg_fn("done");
2462 /* 1. gr_ctx, patch_ctx and global ctx buffer mapping
2463 can be reused so no need to release them.
2464 2. golden image init and load is a one time thing so if
2465 they pass, no need to undo. */
2466 nvhost_dbg(dbg_fn | dbg_err, "fail");
2470 int gk20a_free_obj_ctx(struct channel_gk20a *c,
2471 struct nvhost_free_obj_ctx_args *args)
2473 unsigned long timeout = gk20a_get_gr_idle_timeout(c->g);
2477 if (c->num_objects == 0)
2482 if (c->num_objects == 0) {
2483 c->first_init = false;
2484 gk20a_disable_channel(c, true, /*wait for finish*/
2486 gr_gk20a_unmap_channel_patch_ctx(c);
2492 static void gk20a_remove_gr_support(struct gr_gk20a *gr)
2494 struct gk20a *g = gr->g;
2495 struct mem_mgr *memmgr = mem_mgr_from_g(g);
2499 gr_gk20a_free_global_ctx_buffers(g);
2501 nvhost_memmgr_free_sg_table(memmgr, gr->mmu_wr_mem.mem.ref,
2502 gr->mmu_wr_mem.mem.sgt);
2503 nvhost_memmgr_unpin(memmgr, gr->mmu_rd_mem.mem.ref,
2504 dev_from_gk20a(g), gr->mmu_rd_mem.mem.sgt);
2505 nvhost_memmgr_put(memmgr, gr->mmu_wr_mem.mem.ref);
2506 nvhost_memmgr_put(memmgr, gr->mmu_rd_mem.mem.ref);
2507 nvhost_memmgr_put(memmgr, gr->compbit_store.mem.ref);
2508 memset(&gr->mmu_wr_mem, 0, sizeof(struct mem_desc));
2509 memset(&gr->mmu_rd_mem, 0, sizeof(struct mem_desc));
2510 memset(&gr->compbit_store, 0, sizeof(struct compbit_store_desc));
2512 kfree(gr->gpc_tpc_count);
2513 kfree(gr->gpc_zcb_count);
2514 kfree(gr->gpc_ppc_count);
2515 kfree(gr->pes_tpc_count[0]);
2516 kfree(gr->pes_tpc_count[1]);
2517 kfree(gr->pes_tpc_mask[0]);
2518 kfree(gr->pes_tpc_mask[1]);
2519 kfree(gr->gpc_skip_mask);
2520 kfree(gr->map_tiles);
2521 gr->gpc_tpc_count = NULL;
2522 gr->gpc_zcb_count = NULL;
2523 gr->gpc_ppc_count = NULL;
2524 gr->pes_tpc_count[0] = NULL;
2525 gr->pes_tpc_count[1] = NULL;
2526 gr->pes_tpc_mask[0] = NULL;
2527 gr->pes_tpc_mask[1] = NULL;
2528 gr->gpc_skip_mask = NULL;
2529 gr->map_tiles = NULL;
2531 kfree(gr->ctx_vars.ucode.fecs.inst.l);
2532 kfree(gr->ctx_vars.ucode.fecs.data.l);
2533 kfree(gr->ctx_vars.ucode.gpccs.inst.l);
2534 kfree(gr->ctx_vars.ucode.gpccs.data.l);
2535 kfree(gr->ctx_vars.sw_bundle_init.l);
2536 kfree(gr->ctx_vars.sw_method_init.l);
2537 kfree(gr->ctx_vars.sw_ctx_load.l);
2538 kfree(gr->ctx_vars.sw_non_ctx_load.l);
2539 kfree(gr->ctx_vars.ctxsw_regs.sys.l);
2540 kfree(gr->ctx_vars.ctxsw_regs.gpc.l);
2541 kfree(gr->ctx_vars.ctxsw_regs.tpc.l);
2542 kfree(gr->ctx_vars.ctxsw_regs.zcull_gpc.l);
2543 kfree(gr->ctx_vars.ctxsw_regs.ppc.l);
2544 kfree(gr->ctx_vars.ctxsw_regs.pm_sys.l);
2545 kfree(gr->ctx_vars.ctxsw_regs.pm_gpc.l);
2546 kfree(gr->ctx_vars.ctxsw_regs.pm_tpc.l);
2548 kfree(gr->ctx_vars.local_golden_image);
2549 gr->ctx_vars.local_golden_image = NULL;
2551 nvhost_allocator_destroy(&gr->comp_tags);
2554 static int gr_gk20a_init_gr_config(struct gk20a *g, struct gr_gk20a *gr)
2556 u32 gpc_index, pes_index;
2559 u32 pes_heavy_index;
2560 u32 gpc_new_skip_mask;
2563 tmp = gk20a_readl(g, pri_ringmaster_enum_fbp_r());
2564 gr->num_fbps = pri_ringmaster_enum_fbp_count_v(tmp);
2566 tmp = gk20a_readl(g, top_num_gpcs_r());
2567 gr->max_gpc_count = top_num_gpcs_value_v(tmp);
2569 tmp = gk20a_readl(g, top_num_fbps_r());
2570 gr->max_fbps_count = top_num_fbps_value_v(tmp);
2572 tmp = gk20a_readl(g, top_tpc_per_gpc_r());
2573 gr->max_tpc_per_gpc_count = top_tpc_per_gpc_value_v(tmp);
2575 gr->max_tpc_count = gr->max_gpc_count * gr->max_tpc_per_gpc_count;
2577 tmp = gk20a_readl(g, top_num_fbps_r());
2578 gr->sys_count = top_num_fbps_value_v(tmp);
2580 tmp = gk20a_readl(g, pri_ringmaster_enum_gpc_r());
2581 gr->gpc_count = pri_ringmaster_enum_gpc_count_v(tmp);
2583 gr->pe_count_per_gpc = proj_scal_litter_num_pes_per_gpc_v();
2584 gr->max_zcull_per_gpc_count = proj_scal_litter_num_zcull_banks_v();
2586 if (!gr->gpc_count) {
2587 nvhost_err(dev_from_gk20a(g), "gpc_count==0!");
2591 gr->gpc_tpc_count = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2592 gr->gpc_zcb_count = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2593 gr->gpc_ppc_count = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2594 gr->pes_tpc_count[0] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2595 gr->pes_tpc_count[1] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2596 gr->pes_tpc_mask[0] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2597 gr->pes_tpc_mask[1] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2599 kzalloc(gr_pd_dist_skip_table__size_1_v() * 4 * sizeof(u32),
2602 if (!gr->gpc_tpc_count || !gr->gpc_zcb_count || !gr->gpc_ppc_count ||
2603 !gr->pes_tpc_count[0] || !gr->pes_tpc_count[1] ||
2604 !gr->pes_tpc_mask[0] || !gr->pes_tpc_mask[1] || !gr->gpc_skip_mask)
2608 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
2609 tmp = gk20a_readl(g, gr_gpc0_fs_gpc_r());
2611 gr->gpc_tpc_count[gpc_index] =
2612 gr_gpc0_fs_gpc_num_available_tpcs_v(tmp);
2613 gr->tpc_count += gr->gpc_tpc_count[gpc_index];
2615 gr->gpc_zcb_count[gpc_index] =
2616 gr_gpc0_fs_gpc_num_available_zculls_v(tmp);
2617 gr->zcb_count += gr->gpc_zcb_count[gpc_index];
2619 gr->gpc_ppc_count[gpc_index] = gr->pe_count_per_gpc;
2620 gr->ppc_count += gr->gpc_ppc_count[gpc_index];
2621 for (pes_index = 0; pes_index < gr->pe_count_per_gpc; pes_index++) {
2623 tmp = gk20a_readl(g,
2624 gr_gpc0_gpm_pd_pes_tpc_id_mask_r(pes_index) +
2625 gpc_index * proj_gpc_stride_v());
2627 pes_tpc_mask = gr_gpc0_gpm_pd_pes_tpc_id_mask_mask_v(tmp);
2628 pes_tpc_count = count_bits(pes_tpc_mask);
2630 gr->pes_tpc_count[pes_index][gpc_index] = pes_tpc_count;
2631 gr->pes_tpc_mask[pes_index][gpc_index] = pes_tpc_mask;
2634 gpc_new_skip_mask = 0;
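/* When the two PES units carry an unbalanced TPC count (five TPCs total, or
 * four split unevenly), mark one TPC of the heavier PES in the skip mask;
 * mask ^ (mask & (mask - 1)) isolates the lowest set bit of that PES's TPC
 * mask. */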
2635 if (gr->pes_tpc_count[0][gpc_index] +
2636 gr->pes_tpc_count[1][gpc_index] == 5) {
2638 gr->pes_tpc_count[0][gpc_index] >
2639 gr->pes_tpc_count[1][gpc_index] ? 0 : 1;
2642 gr->pes_tpc_mask[pes_heavy_index][gpc_index] ^
2643 (gr->pes_tpc_mask[pes_heavy_index][gpc_index] &
2644 (gr->pes_tpc_mask[pes_heavy_index][gpc_index] - 1));
2646 } else if ((gr->pes_tpc_count[0][gpc_index] +
2647 gr->pes_tpc_count[1][gpc_index] == 4) &&
2648 (gr->pes_tpc_count[0][gpc_index] !=
2649 gr->pes_tpc_count[1][gpc_index])) {
2651 gr->pes_tpc_count[0][gpc_index] >
2652 gr->pes_tpc_count[1][gpc_index] ? 0 : 1;
2655 gr->pes_tpc_mask[pes_heavy_index][gpc_index] ^
2656 (gr->pes_tpc_mask[pes_heavy_index][gpc_index] &
2657 (gr->pes_tpc_mask[pes_heavy_index][gpc_index] - 1));
2659 gr->gpc_skip_mask[gpc_index] = gpc_new_skip_mask;
2662 nvhost_dbg_info("fbps: %d", gr->num_fbps);
2663 nvhost_dbg_info("max_gpc_count: %d", gr->max_gpc_count);
2664 nvhost_dbg_info("max_fbps_count: %d", gr->max_fbps_count);
2665 nvhost_dbg_info("max_tpc_per_gpc_count: %d", gr->max_tpc_per_gpc_count);
2666 nvhost_dbg_info("max_zcull_per_gpc_count: %d", gr->max_zcull_per_gpc_count);
2667 nvhost_dbg_info("max_tpc_count: %d", gr->max_tpc_count);
2668 nvhost_dbg_info("sys_count: %d", gr->sys_count);
2669 nvhost_dbg_info("gpc_count: %d", gr->gpc_count);
2670 nvhost_dbg_info("pe_count_per_gpc: %d", gr->pe_count_per_gpc);
2671 nvhost_dbg_info("tpc_count: %d", gr->tpc_count);
2672 nvhost_dbg_info("ppc_count: %d", gr->ppc_count);
2674 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
2675 nvhost_dbg_info("gpc_tpc_count[%d] : %d",
2676 gpc_index, gr->gpc_tpc_count[gpc_index]);
2677 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
2678 nvhost_dbg_info("gpc_zcb_count[%d] : %d",
2679 gpc_index, gr->gpc_zcb_count[gpc_index]);
2680 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
2681 nvhost_dbg_info("gpc_ppc_count[%d] : %d",
2682 gpc_index, gr->gpc_ppc_count[gpc_index]);
2683 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
2684 nvhost_dbg_info("gpc_skip_mask[%d] : %d",
2685 gpc_index, gr->gpc_skip_mask[gpc_index]);
2686 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
2688 pes_index < gr->pe_count_per_gpc;
2690 nvhost_dbg_info("pes_tpc_count[%d][%d] : %d",
2691 pes_index, gpc_index,
2692 gr->pes_tpc_count[pes_index][gpc_index]);
2694 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
2696 pes_index < gr->pe_count_per_gpc;
2698 nvhost_dbg_info("pes_tpc_mask[%d][%d] : %d",
2699 pes_index, gpc_index,
2700 gr->pes_tpc_mask[pes_index][gpc_index]);
2702 gr->bundle_cb_default_size = gr_scc_bundle_cb_size_div_256b__prod_v();
2703 gr->min_gpm_fifo_depth = gr_pd_ab_dist_cfg2_state_limit_min_gpm_fifo_depths_v();
2704 gr->bundle_cb_token_limit = gr_pd_ab_dist_cfg2_token_limit_init_v();
2705 gr->attrib_cb_default_size = gr_gpc0_ppc0_cbm_cfg_size_default_v();
2706 /* gk20a has a fixed beta CB RAM, don't alloc more */
2707 gr->attrib_cb_size = gr->attrib_cb_default_size;
2708 gr->alpha_cb_default_size = gr_gpc0_ppc0_cbm_cfg2_size_default_v();
2709 gr->alpha_cb_size = gr->alpha_cb_default_size + (gr->alpha_cb_default_size >> 1);
2710 gr->timeslice_mode = gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v();
2712 nvhost_dbg_info("bundle_cb_default_size: %d",
2713 gr->bundle_cb_default_size);
2714 nvhost_dbg_info("min_gpm_fifo_depth: %d", gr->min_gpm_fifo_depth);
2715 nvhost_dbg_info("bundle_cb_token_limit: %d", gr->bundle_cb_token_limit);
2716 nvhost_dbg_info("attrib_cb_default_size: %d",
2717 gr->attrib_cb_default_size);
2718 nvhost_dbg_info("attrib_cb_size: %d", gr->attrib_cb_size);
2719 nvhost_dbg_info("alpha_cb_default_size: %d", gr->alpha_cb_default_size);
2720 nvhost_dbg_info("alpha_cb_size: %d", gr->alpha_cb_size);
2721 nvhost_dbg_info("timeslice_mode: %d", gr->timeslice_mode);
2729 static int gr_gk20a_init_mmu_sw(struct gk20a *g, struct gr_gk20a *gr)
2731 struct mem_mgr *memmgr = mem_mgr_from_g(g);
2734 gr->mmu_wr_mem_size = gr->mmu_rd_mem_size = 0x1000;
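/* Allocate two 4KB buffers for the MMU debug write/read interfaces; they
 * are zero-filled here and their bus addresses are later programmed into
 * fb_mmu_debug_wr/rd during gr hardware setup. */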
2736 gr->mmu_wr_mem.mem.ref = nvhost_memmgr_alloc(memmgr,
2737 gr->mmu_wr_mem_size,
2738 DEFAULT_ALLOC_ALIGNMENT,
2739 DEFAULT_ALLOC_FLAGS,
2741 if (IS_ERR(gr->mmu_wr_mem.mem.ref))
2743 gr->mmu_wr_mem.mem.size = gr->mmu_wr_mem_size;
2745 gr->mmu_rd_mem.mem.ref = nvhost_memmgr_alloc(memmgr,
2746 gr->mmu_rd_mem_size,
2747 DEFAULT_ALLOC_ALIGNMENT,
2748 DEFAULT_ALLOC_FLAGS,
2750 if (IS_ERR(gr->mmu_rd_mem.mem.ref))
2752 gr->mmu_rd_mem.mem.size = gr->mmu_rd_mem_size;
2754 mmu_ptr = nvhost_memmgr_mmap(gr->mmu_wr_mem.mem.ref);
2757 memset(mmu_ptr, 0, gr->mmu_wr_mem.mem.size);
2758 nvhost_memmgr_munmap(gr->mmu_wr_mem.mem.ref, mmu_ptr);
2760 mmu_ptr = nvhost_memmgr_mmap(gr->mmu_rd_mem.mem.ref);
2763 memset(mmu_ptr, 0, gr->mmu_rd_mem.mem.size);
2764 nvhost_memmgr_munmap(gr->mmu_rd_mem.mem.ref, mmu_ptr);
2766 gr->mmu_wr_mem.mem.sgt =
2767 nvhost_memmgr_sg_table(memmgr, gr->mmu_wr_mem.mem.ref);
2768 if (IS_ERR(gr->mmu_wr_mem.mem.sgt))
2771 gr->mmu_rd_mem.mem.sgt =
2772 nvhost_memmgr_sg_table(memmgr, gr->mmu_rd_mem.mem.ref);
2773 if (IS_ERR(gr->mmu_rd_mem.mem.sgt))
2781 static u32 prime_set[18] = {
2782 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61 };
2784 static int gr_gk20a_init_map_tiles(struct gk20a *g, struct gr_gk20a *gr)
2788 s32 *init_frac = NULL;
2789 s32 *init_err = NULL;
2790 s32 *run_err = NULL;
2791 s32 *sorted_num_tpcs = NULL;
2792 s32 *sorted_to_unsorted_gpc_map = NULL;
2796 u32 max_tpc_count = 0;
2800 bool delete_map = false;
2804 init_frac = kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
2805 init_err = kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
2806 run_err = kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
2808 kzalloc(proj_scal_max_gpcs_v() *
2809 proj_scal_max_tpc_per_gpc_v() * sizeof(s32),
2811 sorted_to_unsorted_gpc_map =
2812 kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
2814 if (!(init_frac && init_err && run_err && sorted_num_tpcs &&
2815 sorted_to_unsorted_gpc_map)) {
2820 gr->map_row_offset = INVALID_SCREEN_TILE_ROW_OFFSET;
2822 if (gr->tpc_count == 3)
2823 gr->map_row_offset = 2;
2824 else if (gr->tpc_count < 3)
2825 gr->map_row_offset = 1;
2827 gr->map_row_offset = 3;
2829 for (index = 1; index < 18; index++) {
2830 u32 prime = prime_set[index];
2831 if ((gr->tpc_count % prime) != 0) {
2832 gr->map_row_offset = prime;
2838 switch (gr->tpc_count) {
2840 gr->map_row_offset = 6;
2843 gr->map_row_offset = 5;
2846 gr->map_row_offset = 2;
2849 gr->map_row_offset = 7;
2852 gr->map_row_offset = 6;
2856 gr->map_row_offset = 1;
2862 if (gr->map_tiles) {
2863 if (gr->map_tile_count != gr->tpc_count)
2866 for (tile_count = 0; tile_count < gr->map_tile_count; tile_count++) {
2867 if ((u32)gr->map_tiles[tile_count] >= gr->tpc_count)
2872 kfree(gr->map_tiles);
2873 gr->map_tiles = NULL;
2874 gr->map_tile_count = 0;
2878 if (gr->map_tiles == NULL) {
2879 gr->map_tile_count = proj_scal_max_gpcs_v();
2881 gr->map_tiles = kzalloc(proj_scal_max_gpcs_v() * sizeof(u8), GFP_KERNEL);
2882 if (gr->map_tiles == NULL) {
2887 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
2888 sorted_num_tpcs[gpc_index] = gr->gpc_tpc_count[gpc_index];
2889 sorted_to_unsorted_gpc_map[gpc_index] = gpc_index;
2893 while (!gpc_sorted) {
2895 for (gpc_index = 0; gpc_index < gr->gpc_count - 1; gpc_index++) {
2896 if (sorted_num_tpcs[gpc_index + 1] > sorted_num_tpcs[gpc_index]) {
2898 swap = sorted_num_tpcs[gpc_index];
2899 sorted_num_tpcs[gpc_index] = sorted_num_tpcs[gpc_index + 1];
2900 sorted_num_tpcs[gpc_index + 1] = swap;
2901 swap = sorted_to_unsorted_gpc_map[gpc_index];
2902 sorted_to_unsorted_gpc_map[gpc_index] =
2903 sorted_to_unsorted_gpc_map[gpc_index + 1];
2904 sorted_to_unsorted_gpc_map[gpc_index + 1] = swap;
2909 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
2910 if (gr->gpc_tpc_count[gpc_index] > max_tpc_count)
2911 max_tpc_count = gr->gpc_tpc_count[gpc_index];
2913 mul_factor = gr->gpc_count * max_tpc_count;
2914 if (mul_factor & 0x1)
2919 comm_denom = gr->gpc_count * max_tpc_count * mul_factor;
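/* Distribute screen tiles over GPCs in proportion to each GPC's TPC count
 * using fixed-point error accumulation (a Bresenham-style spread):
 * init_frac is the per-GPC increment, run_err the running error, and a tile
 * is assigned to a GPC whenever its error reaches half of comm_denom. */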
2921 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
2922 num_tpc = sorted_num_tpcs[gpc_index];
2924 init_frac[gpc_index] = num_tpc * gr->gpc_count * mul_factor;
2927 init_err[gpc_index] = gpc_index * max_tpc_count * mul_factor - comm_denom/2;
2929 init_err[gpc_index] = 0;
2931 run_err[gpc_index] = init_frac[gpc_index] + init_err[gpc_index];
2934 while (gpc_mark < gr->tpc_count) {
2935 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
2936 if ((run_err[gpc_index] * 2) >= comm_denom) {
2937 gr->map_tiles[gpc_mark++] = (u8)sorted_to_unsorted_gpc_map[gpc_index];
2938 run_err[gpc_index] += init_frac[gpc_index] - comm_denom;
2940 run_err[gpc_index] += init_frac[gpc_index];
2949 kfree(sorted_num_tpcs);
2950 kfree(sorted_to_unsorted_gpc_map);
2953 nvhost_dbg(dbg_fn | dbg_err, "fail");
2955 nvhost_dbg_fn("done");
2960 static int gr_gk20a_init_comptag(struct gk20a *g, struct gr_gk20a *gr)
2962 struct mem_mgr *memmgr = mem_mgr_from_g(g);
2964 /* max memory size (MB) to cover */
2965 u32 max_size = gr->max_comptag_mem;
2966 /* one tag line covers 128KB */
2967 u32 max_comptag_lines = max_size << 3;
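/* One comptag line covers 128KB, i.e. 8 lines per MB, hence the "<< 3"
 * above: covering 2048MB of memory, for example, requires 16384 comptag
 * lines (before clamping to the hardware maximum). */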
2969 u32 hw_max_comptag_lines =
2970 ltc_ltcs_ltss_cbc_ctrl3_clear_upper_bound_init_v();
2973 gk20a_readl(g, ltc_ltcs_ltss_cbc_param_r());
2974 u32 comptags_per_cacheline =
2975 ltc_ltcs_ltss_cbc_param_comptags_per_cache_line_v(cbc_param);
2976 u32 slices_per_fbp =
2977 ltc_ltcs_ltss_cbc_param_slices_per_fbp_v(cbc_param);
2978 u32 cacheline_size =
2979 512 << ltc_ltcs_ltss_cbc_param_cache_line_size_v(cbc_param);
2981 u32 compbit_backing_size;
2986 if (max_comptag_lines == 0) {
2987 gr->compbit_store.mem.size = 0;
2991 if (max_comptag_lines > hw_max_comptag_lines)
2992 max_comptag_lines = hw_max_comptag_lines;
2995 compbit_backing_size =
2996 DIV_ROUND_UP(max_comptag_lines, comptags_per_cacheline) *
2997 cacheline_size * slices_per_fbp * gr->num_fbps;
2999 /* aligned to 2KB * num_fbps */
3000 compbit_backing_size +=
3001 gr->num_fbps << ltc_ltcs_ltss_cbc_base_alignment_shift_v();
3003 /* must be a multiple of 64KB */
3004 compbit_backing_size = roundup(compbit_backing_size, 64*1024);
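/* Recompute how many comptag lines the aligned backing store can actually
 * hold, then re-clamp to the hardware limit. */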
3007 (compbit_backing_size * comptags_per_cacheline) /
3008 cacheline_size * slices_per_fbp * gr->num_fbps;
3010 if (max_comptag_lines > hw_max_comptag_lines)
3011 max_comptag_lines = hw_max_comptag_lines;
3013 nvhost_dbg_info("compbit backing store size : %d",
3014 compbit_backing_size);
3015 nvhost_dbg_info("max comptag lines : %d",
3018 gr->compbit_store.mem.ref =
3019 nvhost_memmgr_alloc(memmgr, compbit_backing_size,
3020 DEFAULT_ALLOC_ALIGNMENT,
3021 DEFAULT_ALLOC_FLAGS,
3023 if (IS_ERR(gr->compbit_store.mem.ref)) {
3024 nvhost_err(dev_from_gk20a(g), "failed to allocate "
3025 "backing store for compbit : size %d",
3026 compbit_backing_size);
3027 return PTR_ERR(gr->compbit_store.mem.ref);
3029 gr->compbit_store.mem.size = compbit_backing_size;
3031 gr->compbit_store.mem.sgt =
3032 nvhost_memmgr_pin(memmgr, gr->compbit_store.mem.ref,
3033 dev_from_gk20a(g), mem_flag_none);
3034 if (IS_ERR(gr->compbit_store.mem.sgt)) {
3035 ret = PTR_ERR(gr->compbit_store.mem.sgt);
3038 gr->compbit_store.base_pa =
3039 gk20a_mm_iova_addr(gr->compbit_store.mem.sgt->sgl);
3041 nvhost_allocator_init(&gr->comp_tags, "comptag",
3043 max_comptag_lines - 1, /* length*/
3049 if (gr->compbit_store.mem.sgt)
3050 nvhost_memmgr_free_sg_table(memmgr, gr->compbit_store.mem.ref,
3051 gr->compbit_store.mem.sgt);
3052 nvhost_memmgr_put(memmgr, gr->compbit_store.mem.ref);
3056 int gk20a_gr_clear_comptags(struct gk20a *g, u32 min, u32 max)
3058 struct gr_gk20a *gr = &g->gr;
3059 u32 fbp, slice, ctrl1, val;
3060 unsigned long end_jiffies = jiffies +
3061 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
3062 u32 delay = GR_IDLE_CHECK_DEFAULT;
3063 u32 slices_per_fbp =
3064 ltc_ltcs_ltss_cbc_param_slices_per_fbp_v(
3065 gk20a_readl(g, ltc_ltcs_ltss_cbc_param_r()));
3069 if (gr->compbit_store.mem.size == 0)
3072 gk20a_writel(g, ltc_ltcs_ltss_cbc_ctrl2_r(),
3073 ltc_ltcs_ltss_cbc_ctrl2_clear_lower_bound_f(min));
3074 gk20a_writel(g, ltc_ltcs_ltss_cbc_ctrl3_r(),
3075 ltc_ltcs_ltss_cbc_ctrl3_clear_upper_bound_f(max));
3076 gk20a_writel(g, ltc_ltcs_ltss_cbc_ctrl1_r(),
3077 gk20a_readl(g, ltc_ltcs_ltss_cbc_ctrl1_r()) |
3078 ltc_ltcs_ltss_cbc_ctrl1_clear_active_f());
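/* The clear was broadcast to all L2 slices above; now poll each slice's
 * CBC_CTRL1 until its clear-active bit drops, doubling the poll delay on
 * every retry. */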
3080 for (fbp = 0; fbp < gr->num_fbps; fbp++) {
3081 for (slice = 0; slice < slices_per_fbp; slice++) {
3083 delay = GR_IDLE_CHECK_DEFAULT;
3085 ctrl1 = ltc_ltc0_lts0_cbc_ctrl1_r() +
3086 fbp * proj_ltc_stride_v() +
3087 slice * proj_lts_stride_v();
3090 val = gk20a_readl(g, ctrl1);
3091 if (ltc_ltcs_ltss_cbc_ctrl1_clear_v(val) !=
3092 ltc_ltcs_ltss_cbc_ctrl1_clear_active_v())
3095 usleep_range(delay, delay * 2);
3096 delay = min_t(u32, delay << 1,
3099 } while (time_before(jiffies, end_jiffies) ||
3100 !tegra_platform_is_silicon());
3102 if (!time_before(jiffies, end_jiffies)) {
3103 nvhost_err(dev_from_gk20a(g),
3104 "comp tag clear timeout\n");
3113 static int gr_gk20a_init_zcull(struct gk20a *g, struct gr_gk20a *gr)
3115 struct gr_zcull_gk20a *zcull = &gr->zcull;
3117 zcull->aliquot_width = gr->tpc_count * 16;
3118 zcull->aliquot_height = 16;
3120 zcull->width_align_pixels = gr->tpc_count * 16;
3121 zcull->height_align_pixels = 32;
3123 zcull->aliquot_size =
3124 zcull->aliquot_width * zcull->aliquot_height;
3126 /* assume no floor sweeping since we only have 1 tpc in 1 gpc */
3127 zcull->pixel_squares_by_aliquots =
3128 gr->zcb_count * 16 * 16 * gr->tpc_count /
3129 (gr->gpc_count * gr->gpc_tpc_count[0]);
3131 zcull->total_aliquots =
3132 gr_gpc0_zcull_total_ram_size_num_aliquots_f(
3133 gk20a_readl(g, gr_gpc0_zcull_total_ram_size_r()));
3138 u32 gr_gk20a_get_ctxsw_zcull_size(struct gk20a *g, struct gr_gk20a *gr)
3140 /* assuming gr has already been initialized */
3141 return gr->ctx_vars.zcull_ctxsw_image_size;
3144 int gr_gk20a_bind_ctxsw_zcull(struct gk20a *g, struct gr_gk20a *gr,
3145 struct channel_gk20a *c, u64 zcull_va, u32 mode)
3147 struct zcull_ctx_desc *zcull_ctx = &c->ch_ctx.zcull_ctx;
3149 zcull_ctx->ctx_sw_mode = mode;
3150 zcull_ctx->gpu_va = zcull_va;
3152 /* TBD: don't disable channel in sw method processing */
3153 return gr_gk20a_ctx_zcull_setup(g, c, true);
3156 int gr_gk20a_get_zcull_info(struct gk20a *g, struct gr_gk20a *gr,
3157 struct gr_zcull_info *zcull_params)
3159 struct gr_zcull_gk20a *zcull = &gr->zcull;
3161 zcull_params->width_align_pixels = zcull->width_align_pixels;
3162 zcull_params->height_align_pixels = zcull->height_align_pixels;
3163 zcull_params->pixel_squares_by_aliquots =
3164 zcull->pixel_squares_by_aliquots;
3165 zcull_params->aliquot_total = zcull->total_aliquots;
3167 zcull_params->region_byte_multiplier =
3168 gr->gpc_count * gr_zcull_bytes_per_aliquot_per_gpu_v();
3169 zcull_params->region_header_size =
3170 proj_scal_litter_num_gpcs_v() *
3171 gr_zcull_save_restore_header_bytes_per_gpc_v();
3173 zcull_params->subregion_header_size =
3174 proj_scal_litter_num_gpcs_v() *
3175 gr_zcull_save_restore_subregion_header_bytes_per_gpc_v();
3177 zcull_params->subregion_width_align_pixels =
3178 gr->tpc_count * gr_gpc0_zcull_zcsize_width_subregion__multiple_v();
3179 zcull_params->subregion_height_align_pixels =
3180 gr_gpc0_zcull_zcsize_height_subregion__multiple_v();
3181 zcull_params->subregion_count = gr_zcull_subregion_qty_v();
3186 static int gr_gk20a_add_zbc_color(struct gk20a *g, struct gr_gk20a *gr,
3187 struct zbc_entry *color_val, u32 index)
3189 struct fifo_gk20a *f = &g->fifo;
3190 struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
3192 unsigned long end_jiffies = jiffies +
3193 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
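/* ZBC (zero-bandwidth clear) color entries are programmed with the GR
 * engine quiesced: disable engine activity and wait for idle, write the L2
 * and DS copies of the entry, trigger the table load, update the software
 * shadow in gr->zbc_col_tbl, then re-enable the engine. */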
3196 ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
3198 nvhost_err(dev_from_gk20a(g),
3199 "failed to disable gr engine activity\n");
3203 ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
3205 nvhost_err(dev_from_gk20a(g),
3206 "failed to idle graphics\n");
3210 /* update l2 table */
3211 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_index_r(),
3212 (gk20a_readl(g, ltc_ltcs_ltss_dstg_zbc_index_r()) &
3213 ~ltc_ltcs_ltss_dstg_zbc_index_address_f(~0)) |
3214 ltc_ltcs_ltss_dstg_zbc_index_address_f(index +
3215 GK20A_STARTOF_ZBC_TABLE));
3217 for (i = 0; i < ltc_ltcs_ltss_dstg_zbc_color_clear_value__size_1_v(); i++)
3218 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_color_clear_value_r(i),
3219 color_val->color_l2[i]);
3221 /* update ds table */
3222 gk20a_writel(g, gr_ds_zbc_color_r_r(),
3223 gr_ds_zbc_color_r_val_f(color_val->color_ds[0]));
3224 gk20a_writel(g, gr_ds_zbc_color_g_r(),
3225 gr_ds_zbc_color_g_val_f(color_val->color_ds[1]));
3226 gk20a_writel(g, gr_ds_zbc_color_b_r(),
3227 gr_ds_zbc_color_b_val_f(color_val->color_ds[2]));
3228 gk20a_writel(g, gr_ds_zbc_color_a_r(),
3229 gr_ds_zbc_color_a_val_f(color_val->color_ds[3]));
3231 gk20a_writel(g, gr_ds_zbc_color_fmt_r(),
3232 gr_ds_zbc_color_fmt_val_f(color_val->format));
3234 gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
3235 gr_ds_zbc_tbl_index_val_f(index + GK20A_STARTOF_ZBC_TABLE));
3237 /* trigger the write */
3238 gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
3239 gr_ds_zbc_tbl_ld_select_c_f() |
3240 gr_ds_zbc_tbl_ld_action_write_f() |
3241 gr_ds_zbc_tbl_ld_trigger_active_f());
3243 /* update local copy */
3244 for (i = 0; i < ltc_ltcs_ltss_dstg_zbc_color_clear_value__size_1_v(); i++) {
3245 gr->zbc_col_tbl[index].color_l2[i] = color_val->color_l2[i];
3246 gr->zbc_col_tbl[index].color_ds[i] = color_val->color_ds[i];
3248 gr->zbc_col_tbl[index].format = color_val->format;
3249 gr->zbc_col_tbl[index].ref_cnt++;
3252 ret = gk20a_fifo_enable_engine_activity(g, gr_info);
3254 nvhost_err(dev_from_gk20a(g),
3255 "failed to enable gr engine activity\n");
3261 static int gr_gk20a_add_zbc_depth(struct gk20a *g, struct gr_gk20a *gr,
3262 struct zbc_entry *depth_val, u32 index)
3264 struct fifo_gk20a *f = &g->fifo;
3265 struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
3266 unsigned long end_jiffies = jiffies +
3267 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
3270 ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
3272 nvhost_err(dev_from_gk20a(g),
3273 "failed to disable gr engine activity\n");
3277 ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
3279 nvhost_err(dev_from_gk20a(g),
3280 "failed to idle graphics\n");
3284 /* update l2 table */
3285 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_index_r(),
3286 (gk20a_readl(g, ltc_ltcs_ltss_dstg_zbc_index_r()) &
3287 ~ltc_ltcs_ltss_dstg_zbc_index_address_f(~0)) |
3288 ltc_ltcs_ltss_dstg_zbc_index_address_f(index +
3289 GK20A_STARTOF_ZBC_TABLE));
3291 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_depth_clear_value_r(),
3294 /* update ds table */
3295 gk20a_writel(g, gr_ds_zbc_z_r(),
3296 gr_ds_zbc_z_val_f(depth_val->depth));
3298 gk20a_writel(g, gr_ds_zbc_z_fmt_r(),
3299 gr_ds_zbc_z_fmt_val_f(depth_val->format));
3301 gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
3302 gr_ds_zbc_tbl_index_val_f(index + GK20A_STARTOF_ZBC_TABLE));
3304 /* trigger the write */
3305 gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
3306 gr_ds_zbc_tbl_ld_select_z_f() |
3307 gr_ds_zbc_tbl_ld_action_write_f() |
3308 gr_ds_zbc_tbl_ld_trigger_active_f());
3310 /* update local copy */
3311 gr->zbc_dep_tbl[index].depth = depth_val->depth;
3312 gr->zbc_dep_tbl[index].format = depth_val->format;
3313 gr->zbc_dep_tbl[index].ref_cnt++;
3316 ret = gk20a_fifo_enable_engine_activity(g, gr_info);
3318 nvhost_err(dev_from_gk20a(g),
3319 "failed to enable gr engine activity\n");
3325 int gr_gk20a_add_zbc(struct gk20a *g, struct gr_gk20a *gr,
3326 struct zbc_entry *zbc_val)
3328 struct zbc_color_table *c_tbl;
3329 struct zbc_depth_table *d_tbl;
3330 u32 i, ret = -ENOMEM;
3334 /* no endian swap ? */
3336 switch (zbc_val->type) {
3337 case GK20A_ZBC_TYPE_COLOR:
3338 /* search existing tables */
3339 for (i = 0; i < gr->max_used_color_index; i++) {
3341 c_tbl = &gr->zbc_col_tbl[i];
3343 if (c_tbl->ref_cnt && c_tbl->format == zbc_val->format &&
3344 memcmp(c_tbl->color_ds, zbc_val->color_ds,
3345 sizeof(zbc_val->color_ds)) == 0) {
3347 if (memcmp(c_tbl->color_l2, zbc_val->color_l2,
3348 sizeof(zbc_val->color_l2))) {
3349 nvhost_err(dev_from_gk20a(g),
3350 "zbc l2 and ds color don't match with existing entries");
3361 gr->max_used_color_index < GK20A_ZBC_TABLE_SIZE) {
3364 &gr->zbc_col_tbl[gr->max_used_color_index];
3365 WARN_ON(c_tbl->ref_cnt != 0);
3367 ret = gr_gk20a_add_zbc_color(g, gr,
3368 zbc_val, gr->max_used_color_index);
3371 gr->max_used_color_index++;
3374 case GK20A_ZBC_TYPE_DEPTH:
3375 /* search existing tables */
3376 for (i = 0; i < gr->max_used_depth_index; i++) {
3378 d_tbl = &gr->zbc_dep_tbl[i];
3380 if (d_tbl->ref_cnt &&
3381 d_tbl->depth == zbc_val->depth &&
3382 d_tbl->format == zbc_val->format) {
3391 gr->max_used_depth_index < GK20A_ZBC_TABLE_SIZE) {
3394 &gr->zbc_dep_tbl[gr->max_used_depth_index];
3395 WARN_ON(d_tbl->ref_cnt != 0);
3397 ret = gr_gk20a_add_zbc_depth(g, gr,
3398 zbc_val, gr->max_used_depth_index);
3401 gr->max_used_depth_index++;
3405 nvhost_err(dev_from_gk20a(g),
3406 "invalid zbc table type %d", zbc_val->type);
3410 if (!added && ret == 0) {
3411 /* update zbc for elpg only when new entry is added */
3412 entries = max(gr->max_used_color_index,
3413 gr->max_used_depth_index);
3414 pmu_save_zbc(g, entries);
3420 int gr_gk20a_clear_zbc_table(struct gk20a *g, struct gr_gk20a *gr)
3422 struct fifo_gk20a *f = &g->fifo;
3423 struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
3425 unsigned long end_jiffies = jiffies +
3426 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
3429 ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
3431 nvhost_err(dev_from_gk20a(g),
3432 "failed to disable gr engine activity\n");
3436 ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
3438 nvhost_err(dev_from_gk20a(g),
3439 "failed to idle graphics\n");
3443 for (i = 0; i < GK20A_ZBC_TABLE_SIZE; i++) {
3444 gr->zbc_col_tbl[i].format = 0;
3445 gr->zbc_col_tbl[i].ref_cnt = 0;
3447 gk20a_writel(g, gr_ds_zbc_color_fmt_r(),
3448 gr_ds_zbc_color_fmt_val_invalid_f());
3449 gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
3450 gr_ds_zbc_tbl_index_val_f(i + GK20A_STARTOF_ZBC_TABLE));
3452 /* trigger the write */
3453 gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
3454 gr_ds_zbc_tbl_ld_select_c_f() |
3455 gr_ds_zbc_tbl_ld_action_write_f() |
3456 gr_ds_zbc_tbl_ld_trigger_active_f());
3458 /* clear l2 table */
3459 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_index_r(),
3460 (gk20a_readl(g, ltc_ltcs_ltss_dstg_zbc_index_r()) &
3461 ~ltc_ltcs_ltss_dstg_zbc_index_address_f(~0)) |
3462 ltc_ltcs_ltss_dstg_zbc_index_address_f(i +
3463 GK20A_STARTOF_ZBC_TABLE));
3465 for (j = 0; j < ltc_ltcs_ltss_dstg_zbc_color_clear_value__size_1_v(); j++) {
3466 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_color_clear_value_r(j), 0);
3467 gr->zbc_col_tbl[i].color_l2[j] = 0;
3468 gr->zbc_col_tbl[i].color_ds[j] = 0;
3471 gr->max_used_color_index = 0;
3472 gr->max_default_color_index = 0;
3474 for (i = 0; i < GK20A_ZBC_TABLE_SIZE; i++) {
3475 gr->zbc_dep_tbl[i].depth = 0;
3476 gr->zbc_dep_tbl[i].format = 0;
3477 gr->zbc_dep_tbl[i].ref_cnt = 0;
3479 gk20a_writel(g, gr_ds_zbc_z_fmt_r(),
3480 gr_ds_zbc_z_fmt_val_invalid_f());
3481 gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
3482 gr_ds_zbc_tbl_index_val_f(i + GK20A_STARTOF_ZBC_TABLE));
3484 /* trigger the write */
3485 gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
3486 gr_ds_zbc_tbl_ld_select_z_f() |
3487 gr_ds_zbc_tbl_ld_action_write_f() |
3488 gr_ds_zbc_tbl_ld_trigger_active_f());
3490 /* clear l2 table */
3491 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_index_r(),
3492 (gk20a_readl(g, ltc_ltcs_ltss_dstg_zbc_index_r()) &
3493 ~ltc_ltcs_ltss_dstg_zbc_index_address_f(~0)) |
3494 ltc_ltcs_ltss_dstg_zbc_index_address_f(i +
3495 GK20A_STARTOF_ZBC_TABLE));
3497 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_depth_clear_value_r(), 0);
3499 gr->max_used_depth_index = 0;
3500 gr->max_default_depth_index = 0;
3503 ret = gk20a_fifo_enable_engine_activity(g, gr_info);
3505 nvhost_err(dev_from_gk20a(g),
3506 "failed to enable gr engine activity\n");
3514 /* get a zbc table entry specified by index
3515 * return table size when type is invalid */
3516 int gr_gk20a_query_zbc(struct gk20a *g, struct gr_gk20a *gr,
3517 struct zbc_query_params *query_params)
3519 u32 index = query_params->index_size;
3522 switch (query_params->type) {
3523 case GK20A_ZBC_TYPE_INVALID:
3524 query_params->index_size = GK20A_ZBC_TABLE_SIZE;
3526 case GK20A_ZBC_TYPE_COLOR:
3527 if (index >= GK20A_ZBC_TABLE_SIZE) {
3528 nvhost_err(dev_from_gk20a(g),
3529 "invalid zbc color table index\n");
3532 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3533 query_params->color_l2[i] =
3534 gr->zbc_col_tbl[index].color_l2[i];
3535 query_params->color_ds[i] =
3536 gr->zbc_col_tbl[index].color_ds[i];
3538 query_params->format = gr->zbc_col_tbl[index].format;
3539 query_params->ref_cnt = gr->zbc_col_tbl[index].ref_cnt;
3541 case GK20A_ZBC_TYPE_DEPTH:
3542 if (index >= GK20A_ZBC_TABLE_SIZE) {
3543 nvhost_err(dev_from_gk20a(g),
3544 "invalid zbc depth table index\n");
3547 query_params->depth = gr->zbc_dep_tbl[index].depth;
3548 query_params->format = gr->zbc_dep_tbl[index].format;
3549 query_params->ref_cnt = gr->zbc_dep_tbl[index].ref_cnt;
3552 nvhost_err(dev_from_gk20a(g),
3553 "invalid zbc table type\n");
3560 static int gr_gk20a_load_zbc_default_table(struct gk20a *g, struct gr_gk20a *gr)
3562 struct zbc_entry zbc_val;
3565 /* load default color table */
3566 zbc_val.type = GK20A_ZBC_TYPE_COLOR;
3568 zbc_val.format = gr_ds_zbc_color_fmt_val_zero_v();
3569 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3570 zbc_val.color_ds[i] = 0;
3571 zbc_val.color_l2[i] = 0;
3573 err = gr_gk20a_add_zbc(g, gr, &zbc_val);
3575 zbc_val.format = gr_ds_zbc_color_fmt_val_unorm_one_v();
3576 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3577 zbc_val.color_ds[i] = 0xffffffff;
3578 zbc_val.color_l2[i] = 0x3f800000;
3580 err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3582 zbc_val.format = gr_ds_zbc_color_fmt_val_rf32_gf32_bf32_af32_v();
3583 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3584 zbc_val.color_ds[i] = 0;
3585 zbc_val.color_l2[i] = 0;
3587 err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3589 zbc_val.format = gr_ds_zbc_color_fmt_val_rf32_gf32_bf32_af32_v();
3590 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3591 zbc_val.color_ds[i] = 0x3f800000;
3592 zbc_val.color_l2[i] = 0x3f800000;
3594 err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3597 gr->max_default_color_index = 4;
3599 nvhost_err(dev_from_gk20a(g),
3600 "fail to load default zbc color table\n");
3604 /* load default depth table */
3605 zbc_val.type = GK20A_ZBC_TYPE_DEPTH;
3607 zbc_val.format = gr_ds_zbc_z_fmt_val_fp32_v();
3609 err = gr_gk20a_add_zbc(g, gr, &zbc_val);
3611 zbc_val.format = gr_ds_zbc_z_fmt_val_fp32_v();
3612 zbc_val.depth = 0x3f800000;
3613 err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3616 gr->max_default_depth_index = 2;
3618 nvhost_err(dev_from_gk20a(g),
3619 "fail to load default zbc depth table\n");
3626 static int gr_gk20a_init_zbc(struct gk20a *g, struct gr_gk20a *gr)
3630 /* reset zbc clear */
3631 for (i = 0; i < GK20A_SIZEOF_ZBC_TABLE -
3632 GK20A_STARTOF_ZBC_TABLE; i++) {
3633 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_index_r(),
3634 (gk20a_readl(g, ltc_ltcs_ltss_dstg_zbc_index_r()) &
3635 ~ltc_ltcs_ltss_dstg_zbc_index_address_f(~0)) |
3636 ltc_ltcs_ltss_dstg_zbc_index_address_f(
3637 i + GK20A_STARTOF_ZBC_TABLE));
3638 for (j = 0; j < ltc_ltcs_ltss_dstg_zbc_color_clear_value__size_1_v(); j++)
3639 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_color_clear_value_r(j), 0);
3640 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_depth_clear_value_r(), 0);
3643 gr_gk20a_clear_zbc_table(g, gr);
3645 gr_gk20a_load_zbc_default_table(g, gr);
3650 int gk20a_gr_zbc_set_table(struct gk20a *g, struct gr_gk20a *gr,
3651 struct zbc_entry *zbc_val)
3655 return gr_gk20a_elpg_protected_call(g,
3656 gr_gk20a_add_zbc(g, gr, zbc_val));
3659 void gr_gk20a_init_elcg_mode(struct gk20a *g, u32 mode, u32 engine)
3661 u32 gate_ctrl, idle_filter;
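/* Engine-level clock gating: RUN forces the engine clock on, STOP gates it
 * off, and AUTO lets hardware stop the clock when the engine idles. The
 * idle filter programmed below appears to delay gating until the engine has
 * been idle for roughly mant * 2^exp clocks. */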
3663 gate_ctrl = gk20a_readl(g, therm_gate_ctrl_r(engine));
3667 gate_ctrl = set_field(gate_ctrl,
3668 therm_gate_ctrl_eng_clk_m(),
3669 therm_gate_ctrl_eng_clk_run_f());
3670 gate_ctrl = set_field(gate_ctrl,
3671 therm_gate_ctrl_eng_pwr_m(),
3672 /* set elpg to auto to meet hw expectation */
3673 therm_gate_ctrl_eng_pwr_auto_f());
3676 gate_ctrl = set_field(gate_ctrl,
3677 therm_gate_ctrl_eng_clk_m(),
3678 therm_gate_ctrl_eng_clk_stop_f());
3681 gate_ctrl = set_field(gate_ctrl,
3682 therm_gate_ctrl_eng_clk_m(),
3683 therm_gate_ctrl_eng_clk_auto_f());
3686 nvhost_err(dev_from_gk20a(g),
3687 "invalid elcg mode %d", mode);
3690 if (tegra_platform_is_linsim()) {
3691 gate_ctrl = set_field(gate_ctrl,
3692 therm_gate_ctrl_eng_delay_after_m(),
3693 therm_gate_ctrl_eng_delay_after_f(4));
3696 /* 2 * (1 << 9) = 1024 clks */
3697 gate_ctrl = set_field(gate_ctrl,
3698 therm_gate_ctrl_eng_idle_filt_exp_m(),
3699 therm_gate_ctrl_eng_idle_filt_exp_f(9));
3700 gate_ctrl = set_field(gate_ctrl,
3701 therm_gate_ctrl_eng_idle_filt_mant_m(),
3702 therm_gate_ctrl_eng_idle_filt_mant_f(2));
3703 gk20a_writel(g, therm_gate_ctrl_r(engine), gate_ctrl);
3705 /* default fecs_idle_filter to 0 */
3706 idle_filter = gk20a_readl(g, therm_fecs_idle_filter_r());
3707 idle_filter &= ~therm_fecs_idle_filter_value_m();
3708 gk20a_writel(g, therm_fecs_idle_filter_r(), idle_filter);
3709 /* default hubmmu_idle_filter to 0 */
3710 idle_filter = gk20a_readl(g, therm_hubmmu_idle_filter_r());
3711 idle_filter &= ~therm_hubmmu_idle_filter_value_m();
3712 gk20a_writel(g, therm_hubmmu_idle_filter_r(), idle_filter);
3715 static int gr_gk20a_zcull_init_hw(struct gk20a *g, struct gr_gk20a *gr)
3717 u32 gpc_index, gpc_tpc_count, gpc_zcull_count;
3718 u32 *zcull_map_tiles, *zcull_bank_counters;
3722 bool floorsweep = false;
3727 zcull_map_tiles = kzalloc(proj_scal_max_gpcs_v() *
3728 proj_scal_max_tpc_per_gpc_v() * sizeof(u32), GFP_KERNEL);
3729 zcull_bank_counters = kzalloc(proj_scal_max_gpcs_v() *
3730 proj_scal_max_tpc_per_gpc_v() * sizeof(u32), GFP_KERNEL);
3732 if (!zcull_map_tiles || !zcull_bank_counters) {
3733 nvhost_err(dev_from_gk20a(g),
3734 "failed to allocate zcull temp buffers");
3738 for (map_counter = 0; map_counter < gr->tpc_count; map_counter++) {
3739 zcull_map_tiles[map_counter] =
3740 zcull_bank_counters[gr->map_tiles[map_counter]];
3741 zcull_bank_counters[gr->map_tiles[map_counter]]++;
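/* zcull_map_tiles[i] now holds, for screen tile i, that tile's ordinal
 * within its GPC (handed out in order via zcull_bank_counters); the values
 * are packed eight to a register into the SM_IN_GPC_NUMBER_MAP registers
 * below. */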
3744 gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map0_r(),
3745 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_0_f(zcull_map_tiles[0]) |
3746 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_1_f(zcull_map_tiles[1]) |
3747 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_2_f(zcull_map_tiles[2]) |
3748 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_3_f(zcull_map_tiles[3]) |
3749 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_4_f(zcull_map_tiles[4]) |
3750 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_5_f(zcull_map_tiles[5]) |
3751 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_6_f(zcull_map_tiles[6]) |
3752 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_7_f(zcull_map_tiles[7]));
3754 gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map1_r(),
3755 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_8_f(zcull_map_tiles[8]) |
3756 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_9_f(zcull_map_tiles[9]) |
3757 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_10_f(zcull_map_tiles[10]) |
3758 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_11_f(zcull_map_tiles[11]) |
3759 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_12_f(zcull_map_tiles[12]) |
3760 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_13_f(zcull_map_tiles[13]) |
3761 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_14_f(zcull_map_tiles[14]) |
3762 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_15_f(zcull_map_tiles[15]));
3764 gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map2_r(),
3765 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_16_f(zcull_map_tiles[16]) |
3766 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_17_f(zcull_map_tiles[17]) |
3767 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_18_f(zcull_map_tiles[18]) |
3768 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_19_f(zcull_map_tiles[19]) |
3769 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_20_f(zcull_map_tiles[20]) |
3770 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_21_f(zcull_map_tiles[21]) |
3771 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_22_f(zcull_map_tiles[22]) |
3772 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_23_f(zcull_map_tiles[23]));
3774 gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map3_r(),
3775 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_24_f(zcull_map_tiles[24]) |
3776 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_25_f(zcull_map_tiles[25]) |
3777 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_26_f(zcull_map_tiles[26]) |
3778 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_27_f(zcull_map_tiles[27]) |
3779 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_28_f(zcull_map_tiles[28]) |
3780 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_29_f(zcull_map_tiles[29]) |
3781 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_30_f(zcull_map_tiles[30]) |
3782 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_31_f(zcull_map_tiles[31]));
3784 kfree(zcull_map_tiles);
3785 kfree(zcull_bank_counters);
3787 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3788 gpc_tpc_count = gr->gpc_tpc_count[gpc_index];
3789 gpc_zcull_count = gr->gpc_zcb_count[gpc_index];
3791 if (gpc_zcull_count != gr->max_zcull_per_gpc_count &&
3792 gpc_zcull_count < gpc_tpc_count) {
3793 nvhost_err(dev_from_gk20a(g),
3794 "zcull_banks (%d) less than tpcs (%d) for gpc (%d)",
3795 gpc_zcull_count, gpc_tpc_count, gpc_index);
3798 if (gpc_zcull_count != gr->max_zcull_per_gpc_count &&
3799 gpc_zcull_count != 0)
3803 /* 1.0f / 1.0f * gr_gpc0_zcull_sm_num_rcp_conservative__max_v() */
3804 rcp_conserv = gr_gpc0_zcull_sm_num_rcp_conservative__max_v();
3806 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3807 offset = gpc_index * proj_gpc_stride_v();
3810 gk20a_writel(g, gr_gpc0_zcull_ram_addr_r() + offset,
3811 gr_gpc0_zcull_ram_addr_row_offset_f(gr->map_row_offset) |
3812 gr_gpc0_zcull_ram_addr_tiles_per_hypertile_row_per_gpc_f(
3813 gr->max_zcull_per_gpc_count));
3815 gk20a_writel(g, gr_gpc0_zcull_ram_addr_r() + offset,
3816 gr_gpc0_zcull_ram_addr_row_offset_f(gr->map_row_offset) |
3817 gr_gpc0_zcull_ram_addr_tiles_per_hypertile_row_per_gpc_f(
3818 gr->gpc_tpc_count[gpc_index]));
3821 gk20a_writel(g, gr_gpc0_zcull_fs_r() + offset,
3822 gr_gpc0_zcull_fs_num_active_banks_f(gr->gpc_zcb_count[gpc_index]) |
3823 gr_gpc0_zcull_fs_num_sms_f(gr->tpc_count));
3825 gk20a_writel(g, gr_gpc0_zcull_sm_num_rcp_r() + offset,
3826 gr_gpc0_zcull_sm_num_rcp_conservative_f(rcp_conserv));
3829 gk20a_writel(g, gr_gpcs_ppcs_wwdx_sm_num_rcp_r(),
3830 gr_gpcs_ppcs_wwdx_sm_num_rcp_conservative_f(rcp_conserv));
3835 static void gk20a_gr_enable_gpc_exceptions(struct gk20a *g)
3837 /* enable tpc exception forwarding */
3838 gk20a_writel(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r(),
3839 gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_enabled_f());
3841 /* enable gpc exception forwarding */
3842 gk20a_writel(g, gr_gpc0_gpccs_gpc_exception_en_r(),
3843 gr_gpc0_gpccs_gpc_exception_en_tpc_0_enabled_f());
3846 static int gk20a_init_gr_setup_hw(struct gk20a *g)
3848 struct gr_gk20a *gr = &g->gr;
3849 struct aiv_list_gk20a *sw_ctx_load = &g->gr.ctx_vars.sw_ctx_load;
3850 struct av_list_gk20a *sw_bundle_init = &g->gr.ctx_vars.sw_bundle_init;
3851 struct av_list_gk20a *sw_method_init = &g->gr.ctx_vars.sw_method_init;
3853 u32 addr_lo, addr_hi, addr;
3854 u32 compbit_base_post_divide;
3855 u64 compbit_base_post_multiply64;
3856 unsigned long end_jiffies = jiffies +
3857 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
3858 u32 fe_go_idle_timeout_save;
3859 u32 last_bundle_data = 0;
3860 u32 last_method_data = 0;
3862 u32 l1c_dbg_reg_val;
3866 /* slcg prod values */
3867 gr_gk20a_slcg_gr_load_gating_prod(g, g->slcg_enabled);
3868 gr_gk20a_slcg_perf_load_gating_prod(g, g->slcg_enabled);
3870 /* init mmu debug buffer */
3871 addr = gk20a_mm_iova_addr(gr->mmu_wr_mem.mem.sgt->sgl);
3872 addr_lo = u64_lo32(addr);
3873 addr_hi = u64_hi32(addr);
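/* Pack the 64-bit IOVA into the debug-write register's address field: drop
 * the low alignment bits from the low word and splice the high word in
 * above them. */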
3874 addr = (addr_lo >> fb_mmu_debug_wr_addr_alignment_v()) |
3875 (addr_hi << (32 - fb_mmu_debug_wr_addr_alignment_v()));
3877 gk20a_writel(g, fb_mmu_debug_wr_r(),
3878 fb_mmu_debug_wr_aperture_vid_mem_f() |
3879 fb_mmu_debug_wr_vol_false_f() |
3880 fb_mmu_debug_wr_addr_v(addr));
3882 addr = gk20a_mm_iova_addr(gr->mmu_rd_mem.mem.sgt->sgl);
3883 addr_lo = u64_lo32(addr);
3884 addr_hi = u64_hi32(addr);
3885 addr = (addr_lo >> fb_mmu_debug_rd_addr_alignment_v()) |
3886 (addr_hi << (32 - fb_mmu_debug_rd_addr_alignment_v()));
3888 gk20a_writel(g, fb_mmu_debug_rd_r(),
3889 fb_mmu_debug_rd_aperture_vid_mem_f() |
3890 fb_mmu_debug_rd_vol_false_f() |
3891 fb_mmu_debug_rd_addr_v(addr));
3893 /* load gr floorsweeping registers */
3894 data = gk20a_readl(g, gr_gpc0_ppc0_pes_vsc_strem_r());
3895 data = set_field(data, gr_gpc0_ppc0_pes_vsc_strem_master_pe_m(),
3896 gr_gpc0_ppc0_pes_vsc_strem_master_pe_true_f());
3897 gk20a_writel(g, gr_gpc0_ppc0_pes_vsc_strem_r(), data);
3899 gr_gk20a_zcull_init_hw(g, gr);
3901 gr_gk20a_blcg_gr_load_gating_prod(g, g->blcg_enabled);
3902 gr_gk20a_pg_gr_load_gating_prod(g, true);
3904 if (g->elcg_enabled) {
3905 gr_gk20a_init_elcg_mode(g, ELCG_AUTO, ENGINE_GR_GK20A);
3906 gr_gk20a_init_elcg_mode(g, ELCG_AUTO, ENGINE_CE2_GK20A);
3908 gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_GR_GK20A);
3909 gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_CE2_GK20A);
3912 /* Bug 1340570: increase the clock timeout to avoid potential
3913 * operation failure at high gpcclk rate. Default values are 0x400. */
3915 gk20a_writel(g, pri_ringstation_sys_master_config_r(0x15), 0x800);
3916 gk20a_writel(g, pri_ringstation_gpc_master_config_r(0xa), 0x800);
3917 gk20a_writel(g, pri_ringstation_fbp_master_config_r(0x8), 0x800);
3919 /* enable fifo access */
3920 gk20a_writel(g, gr_gpfifo_ctl_r(),
3921 gr_gpfifo_ctl_access_enabled_f() |
3922 gr_gpfifo_ctl_semaphore_access_enabled_f());
3924 /* TBD: reload gr ucode when needed */
3926 /* enable interrupts */
3927 gk20a_writel(g, gr_intr_r(), 0xFFFFFFFF);
3928 gk20a_writel(g, gr_intr_en_r(), 0xFFFFFFFF);
3930 /* enable fecs error interrupts */
3931 gk20a_writel(g, gr_fecs_host_int_enable_r(),
3932 gr_fecs_host_int_enable_fault_during_ctxsw_enable_f() |
3933 gr_fecs_host_int_enable_umimp_firmware_method_enable_f() |
3934 gr_fecs_host_int_enable_umimp_illegal_method_enable_f() |
3935 gr_fecs_host_int_enable_watchdog_enable_f());
3937 /* enable exceptions */
3938 gk20a_writel(g, gr_fe_hww_esr_r(),
3939 gr_fe_hww_esr_en_enable_f() |
3940 gr_fe_hww_esr_reset_active_f());
3941 gk20a_writel(g, gr_memfmt_hww_esr_r(),
3942 gr_memfmt_hww_esr_en_enable_f() |
3943 gr_memfmt_hww_esr_reset_active_f());
3944 gk20a_writel(g, gr_scc_hww_esr_r(),
3945 gr_scc_hww_esr_en_enable_f() |
3946 gr_scc_hww_esr_reset_active_f());
3947 gk20a_writel(g, gr_mme_hww_esr_r(),
3948 gr_mme_hww_esr_en_enable_f() |
3949 gr_mme_hww_esr_reset_active_f());
3950 gk20a_writel(g, gr_pd_hww_esr_r(),
3951 gr_pd_hww_esr_en_enable_f() |
3952 gr_pd_hww_esr_reset_active_f());
3953 gk20a_writel(g, gr_sked_hww_esr_r(), /* enabled by default */
3954 gr_sked_hww_esr_reset_active_f());
3955 gk20a_writel(g, gr_ds_hww_esr_r(),
3956 gr_ds_hww_esr_en_enabled_f() |
3957 gr_ds_hww_esr_reset_task_f());
3958 gk20a_writel(g, gr_ds_hww_report_mask_r(),
3959 gr_ds_hww_report_mask_sph0_err_report_f() |
3960 gr_ds_hww_report_mask_sph1_err_report_f() |
3961 gr_ds_hww_report_mask_sph2_err_report_f() |
3962 gr_ds_hww_report_mask_sph3_err_report_f() |
3963 gr_ds_hww_report_mask_sph4_err_report_f() |
3964 gr_ds_hww_report_mask_sph5_err_report_f() |
3965 gr_ds_hww_report_mask_sph6_err_report_f() |
3966 gr_ds_hww_report_mask_sph7_err_report_f() |
3967 gr_ds_hww_report_mask_sph8_err_report_f() |
3968 gr_ds_hww_report_mask_sph9_err_report_f() |
3969 gr_ds_hww_report_mask_sph10_err_report_f() |
3970 gr_ds_hww_report_mask_sph11_err_report_f() |
3971 gr_ds_hww_report_mask_sph12_err_report_f() |
3972 gr_ds_hww_report_mask_sph13_err_report_f() |
3973 gr_ds_hww_report_mask_sph14_err_report_f() |
3974 gr_ds_hww_report_mask_sph15_err_report_f() |
3975 gr_ds_hww_report_mask_sph16_err_report_f() |
3976 gr_ds_hww_report_mask_sph17_err_report_f() |
3977 gr_ds_hww_report_mask_sph18_err_report_f() |
3978 gr_ds_hww_report_mask_sph19_err_report_f() |
3979 gr_ds_hww_report_mask_sph20_err_report_f() |
3980 gr_ds_hww_report_mask_sph21_err_report_f() |
3981 gr_ds_hww_report_mask_sph22_err_report_f() |
3982 gr_ds_hww_report_mask_sph23_err_report_f());
3984 /* setup sm warp esr report masks */
3985 gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(),
3986 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_stack_error_report_f() |
3987 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_api_stack_error_report_f() |
3988 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_ret_empty_stack_error_report_f() |
3989 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_wrap_report_f() |
3990 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_pc_report_f() |
3991 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_overflow_report_f() |
3992 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_immc_addr_report_f() |
3993 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_reg_report_f() |
3994 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_encoding_report_f() |
3995 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_sph_instr_combo_report_f() |
3996 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param_report_f() |
3997 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_report_f() |
3998 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_reg_report_f() |
3999 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_addr_report_f() |
4000 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_addr_report_f() |
4001 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_addr_space_report_f() |
4002 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param2_report_f() |
4003 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_ldc_report_f() |
4004 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_geometry_sm_error_report_f() |
4005 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_divergent_report_f());
4007 /* setup sm global esr report mask */
4008 gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(),
4009 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_sm_to_sm_fault_report_f() |
4010 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_l1_error_report_f() |
4011 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_multiple_warp_errors_report_f() |
4012 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_physical_stack_overflow_error_report_f() |
4013 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_int_report_f() |
4014 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_pause_report_f() |
4015 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_single_step_complete_report_f());
4017 /* enable per GPC exceptions */
4018 gk20a_gr_enable_gpc_exceptions(g);
4020 /* TBD: ECC for L1/SM */
4021 /* TBD: enable per BE exceptions */
4023 /* reset and enable all exceptions */
4024 gk20a_writel(g, gr_exception_r(), 0xFFFFFFFF);
4025 gk20a_writel(g, gr_exception_en_r(), 0xFFFFFFFF);
4026 gk20a_writel(g, gr_exception1_r(), 0xFFFFFFFF);
4027 gk20a_writel(g, gr_exception1_en_r(), 0xFFFFFFFF);
4028 gk20a_writel(g, gr_exception2_r(), 0xFFFFFFFF);
4029 gk20a_writel(g, gr_exception2_en_r(), 0xFFFFFFFF);
4031 /* ignore status from some units */
4032 data = gk20a_readl(g, gr_status_mask_r());
4033 gk20a_writel(g, gr_status_mask_r(), data & gr->status_disable_mask);
4035 gr_gk20a_init_zbc(g, gr);
4038 u64 compbit_base_post_divide64 = (gr->compbit_store.base_pa >>
4039 ltc_ltcs_ltss_cbc_base_alignment_shift_v());
4040 do_div(compbit_base_post_divide64, gr->num_fbps);
4041 compbit_base_post_divide = u64_lo32(compbit_base_post_divide64);
4044 compbit_base_post_multiply64 = ((u64)compbit_base_post_divide *
4045 gr->num_fbps) << ltc_ltcs_ltss_cbc_base_alignment_shift_v();
4047 if (compbit_base_post_multiply64 < gr->compbit_store.base_pa)
4048 compbit_base_post_divide++;
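/* The divide/multiply/compare sequence above is a round-up: if base_pa (in
 * alignment units) is not an exact multiple of num_fbps, the truncated
 * quotient is bumped by one so the programmed base never falls below the
 * real compbit store. Roughly:
 *   q = (base_pa >> shift) / num_fbps;
 *   if (((u64)q * num_fbps) << shift < base_pa) q++;
 */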
4050 gk20a_writel(g, ltc_ltcs_ltss_cbc_base_r(),
4051 compbit_base_post_divide);
4053 nvhost_dbg(dbg_info | dbg_map | dbg_pte,
4054 "compbit base.pa: 0x%x,%08x cbc_base:0x%08x\n",
4055 (u32)(gr->compbit_store.base_pa>>32),
4056 (u32)(gr->compbit_store.base_pa & 0xffffffff),
4057 compbit_base_post_divide);
4060 for (i = 0; i < sw_ctx_load->count; i++)
4061 gk20a_writel(g, sw_ctx_load->l[i].addr,
4062 sw_ctx_load->l[i].value);
4064 err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4068 /* save and disable fe_go_idle */
4069 fe_go_idle_timeout_save =
4070 gk20a_readl(g, gr_fe_go_idle_timeout_r());
4071 gk20a_writel(g, gr_fe_go_idle_timeout_r(),
4072 (fe_go_idle_timeout_save & gr_fe_go_idle_timeout_count_f(0)) |
4073 gr_fe_go_idle_timeout_count_disabled_f());
4075 /* override a few ctx state registers */
4076 gr_gk20a_commit_global_cb_manager(g, NULL, false);
4077 gr_gk20a_commit_global_timeslice(g, NULL, false);
4079 /* floorsweep anything left */
4080 gr_gk20a_ctx_state_floorsweep(g);
4082 err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4084 goto restore_fe_go_idle;
4086 /* enable pipe mode override */
4087 gk20a_writel(g, gr_pipe_bundle_config_r(),
4088 gr_pipe_bundle_config_override_pipe_mode_enabled_f());
4090 /* load bundle init */
4092 for (i = 0; i < sw_bundle_init->count; i++) {
4094 if (i == 0 || last_bundle_data != sw_bundle_init->l[i].value) {
4095 gk20a_writel(g, gr_pipe_bundle_data_r(),
4096 sw_bundle_init->l[i].value);
4097 last_bundle_data = sw_bundle_init->l[i].value;
4100 gk20a_writel(g, gr_pipe_bundle_address_r(),
4101 sw_bundle_init->l[i].addr);
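/* Each bundle entry is pushed as a (data, address) pair; the data register
 * is only rewritten when the value differs from the previous entry, since
 * consecutive entries in the bundle init list often share the same data. */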
4103 if (gr_pipe_bundle_address_value_v(sw_bundle_init->l[i].addr) ==
4105 err |= gr_gk20a_wait_idle(g, end_jiffies,
4106 GR_IDLE_CHECK_DEFAULT);
4107 else if (0) { /* IS_SILICON */
4108 u32 delay = GR_IDLE_CHECK_DEFAULT;
4110 u32 gr_status = gk20a_readl(g, gr_status_r());
4112 if (gr_status_fe_method_lower_v(gr_status) ==
4113 gr_status_fe_method_lower_idle_v())
4116 usleep_range(delay, delay * 2);
4117 delay = min_t(u32, delay << 1,
4120 } while (time_before(jiffies, end_jiffies) ||
4121 !tegra_platform_is_silicon());
4125 /* disable pipe mode override */
4126 gk20a_writel(g, gr_pipe_bundle_config_r(),
4127 gr_pipe_bundle_config_override_pipe_mode_disabled_f());
4130 /* restore fe_go_idle */
4131 gk20a_writel(g, gr_fe_go_idle_timeout_r(), fe_go_idle_timeout_save);
4133 if (err || gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT))
4136 /* load method init */
4137 if (sw_method_init->count) {
4138 gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
4139 sw_method_init->l[0].value);
4140 gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
4141 gr_pri_mme_shadow_raw_index_write_trigger_f() |
4142 sw_method_init->l[0].addr);
4143 last_method_data = sw_method_init->l[0].value;
4145 for (i = 1; i < sw_method_init->count; i++) {
4146 if (sw_method_init->l[i].value != last_method_data) {
4147 gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
4148 sw_method_init->l[i].value);
4149 last_method_data = sw_method_init->l[i].value;
4151 gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
4152 gr_pri_mme_shadow_raw_index_write_trigger_f() |
4153 sw_method_init->l[i].addr);
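/* Same write-skipping pattern as the bundle init above: the MME shadow RAM
 * data register is reloaded only when the value changes, and the index
 * register write (with the write trigger field set) commits each entry. */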
4156 gk20a_mm_l2_invalidate(g);
4158 /* turn on cya15 bit for a default val that missed the cut */
4159 l1c_dbg_reg_val = gk20a_readl(g, gr_gpc0_tpc0_l1c_dbg_r());
4160 l1c_dbg_reg_val |= gr_gpc0_tpc0_l1c_dbg_cya15_en_f();
4161 gk20a_writel(g, gr_gpc0_tpc0_l1c_dbg_r(), l1c_dbg_reg_val);
4163 err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4168 nvhost_dbg_fn("done");
4172 static int gk20a_init_gr_prepare(struct gk20a *g)
4174 u32 gpfifo_ctrl, pmc_en;
4177 /* disable fifo access */
4178 pmc_en = gk20a_readl(g, mc_enable_r());
4179 if (pmc_en & mc_enable_pgraph_enabled_f()) {
4180 gpfifo_ctrl = gk20a_readl(g, gr_gpfifo_ctl_r());
4181 gpfifo_ctrl &= ~gr_gpfifo_ctl_access_enabled_f();
4182 gk20a_writel(g, gr_gpfifo_ctl_r(), gpfifo_ctrl);
4185 /* reset gr engine */
4186 pmc_en &= ~mc_enable_pgraph_enabled_f();
4187 pmc_en &= ~mc_enable_blg_enabled_f();
4188 pmc_en &= ~mc_enable_perfmon_enabled_f();
4189 gk20a_writel(g, mc_enable_r(), pmc_en);
4191 usleep_range(1000, 2000);
4193 pmc_en |= mc_enable_pgraph_enabled_f();
4194 pmc_en |= mc_enable_blg_enabled_f();
4195 pmc_en |= mc_enable_perfmon_enabled_f();
4196 gk20a_writel(g, mc_enable_r(), pmc_en);
4197 pmc_en = gk20a_readl(g, mc_enable_r());
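/* Toggling the PGRAPH/BLG/PERFMON bits in mc_enable resets the gr engine;
 * the short sleep lets the units settle between disable and re-enable, and
 * the read-back of mc_enable_r() is presumably there to make sure the
 * enable write has landed before fifo access is restored below. */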
4199 /* enable fifo access */
4200 gk20a_writel(g, gr_gpfifo_ctl_r(),
4201 gr_gpfifo_ctl_access_enabled_f() |
4202 gr_gpfifo_ctl_semaphore_access_enabled_f());
4204 if (!g->gr.ctx_vars.valid) {
4205 err = gr_gk20a_init_ctx_vars(g, &g->gr);
4207 nvhost_err(dev_from_gk20a(g),
4208 "fail to load gr init ctx");
4213 static int gk20a_init_gr_reset_enable_hw(struct gk20a *g)
4215 struct gr_gk20a *gr = &g->gr;
4216 struct av_list_gk20a *sw_non_ctx_load = &g->gr.ctx_vars.sw_non_ctx_load;
4217 unsigned long end_jiffies = jiffies +
4218 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
4223 /* enable interrupts */
4224 gk20a_writel(g, gr_intr_r(), ~0);
4225 gk20a_writel(g, gr_intr_en_r(), ~0);
4227 /* reset ctx switch state */
4228 gr_gk20a_ctx_reset(g, 0);
4231 gk20a_writel(g, gr_scc_init_r(),
4232 gr_scc_init_ram_trigger_f());
4234 /* load non_ctx init */
4235 for (i = 0; i < sw_non_ctx_load->count; i++)
4236 gk20a_writel(g, sw_non_ctx_load->l[i].addr,
4237 sw_non_ctx_load->l[i].value);
4239 err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4243 err = gr_gk20a_load_ctxsw_ucode(g, gr);
4247 /* this appears to query sw state, but the fecs ucode actually
4248 initializes the ramchain etc., so this is hw init */
4249 err = gr_gk20a_init_ctx_state(g, gr);
4255 nvhost_dbg(dbg_fn | dbg_err, "fail");
4257 nvhost_dbg_fn("done");
4262 static int gk20a_init_gr_setup_sw(struct gk20a *g)
4264 struct gr_gk20a *gr = &g->gr;
4270 nvhost_dbg_fn("skip init");
4276 err = gr_gk20a_init_gr_config(g, gr);
4280 err = gr_gk20a_init_mmu_sw(g, gr);
4284 err = gr_gk20a_init_map_tiles(g, gr);
4288 if (tegra_cpu_is_asim())
4289 gr->max_comptag_mem = 1; /* MBs worth of comptag coverage */
4291 nvhost_dbg_info("total ram pages : %lu", totalram_pages);
4292 gr->max_comptag_mem = totalram_pages
4293 >> (10 - (PAGE_SHIFT - 10));
4295 err = gr_gk20a_init_comptag(g, gr);
4299 err = gr_gk20a_init_zcull(g, gr);
4303 err = gr_gk20a_alloc_global_ctx_buffers(g);
4307 mutex_init(&gr->ctx_mutex);
4308 spin_lock_init(&gr->ch_tlb_lock);
4310 gr->remove_support = gk20a_remove_gr_support;
4311 gr->sw_ready = true;
4313 nvhost_dbg_fn("done");
4317 nvhost_dbg(dbg_fn | dbg_err, "fail");
4318 gk20a_remove_gr_support(gr);
4322 int gk20a_init_gr_support(struct gk20a *g)
4328 err = gk20a_init_gr_prepare(g);
4332 /* this is required before gr_gk20a_init_ctx_state */
4333 mutex_init(&g->gr.fecs_mutex);
4335 err = gk20a_init_gr_reset_enable_hw(g);
4339 err = gk20a_init_gr_setup_sw(g);
4343 err = gk20a_init_gr_setup_hw(g);
4350 #define NVA297_SET_ALPHA_CIRCULAR_BUFFER_SIZE 0x02dc
4351 #define NVA297_SET_CIRCULAR_BUFFER_SIZE 0x1280
4352 #define NVA297_SET_SHADER_EXCEPTIONS 0x1528
4353 #define NVA0C0_SET_SHADER_EXCEPTIONS 0x1528
4355 #define NVA297_SET_SHADER_EXCEPTIONS_ENABLE_FALSE 0
4357 struct gr_isr_data {
4368 static void gk20a_gr_set_shader_exceptions(struct gk20a *g,
4369 struct gr_isr_data *isr_data)
4375 if (isr_data->data_lo ==
4376 NVA297_SET_SHADER_EXCEPTIONS_ENABLE_FALSE)
4382 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(),
4385 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(),
4389 static void gk20a_gr_set_circular_buffer_size(struct gk20a *g,
4390 struct gr_isr_data *isr_data)
4392 struct gr_gk20a *gr = &g->gr;
4393 u32 gpc_index, ppc_index, stride, val, offset;
4394 u32 cb_size = isr_data->data_lo * 4;
4398 if (cb_size > gr->attrib_cb_size)
4399 cb_size = gr->attrib_cb_size;
4401 gk20a_writel(g, gr_ds_tga_constraintlogic_r(),
4402 (gk20a_readl(g, gr_ds_tga_constraintlogic_r()) &
4403 ~gr_ds_tga_constraintlogic_beta_cbsize_f(~0)) |
4404 gr_ds_tga_constraintlogic_beta_cbsize_f(cb_size));
4406 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4407 stride = proj_gpc_stride_v() * gpc_index;
4409 for (ppc_index = 0; ppc_index < gr->gpc_ppc_count[gpc_index];
4412 val = gk20a_readl(g, gr_gpc0_ppc0_cbm_cfg_r() +
4414 proj_ppc_in_gpc_stride_v() * ppc_index);
4416 offset = gr_gpc0_ppc0_cbm_cfg_start_offset_v(val);
4418 val = set_field(val,
4419 gr_gpc0_ppc0_cbm_cfg_size_m(),
4420 gr_gpc0_ppc0_cbm_cfg_size_f(cb_size *
4421 gr->pes_tpc_count[ppc_index][gpc_index]));
4422 val = set_field(val,
4423 gr_gpc0_ppc0_cbm_cfg_start_offset_m(),
4426 gk20a_writel(g, gr_gpc0_ppc0_cbm_cfg_r() +
4428 proj_ppc_in_gpc_stride_v() * ppc_index, val);
4430 val = set_field(val,
4431 gr_gpc0_ppc0_cbm_cfg_start_offset_m(),
4434 gk20a_writel(g, gr_gpc0_ppc0_cbm_cfg_r() +
4436 proj_ppc_in_gpc_stride_v() * ppc_index, val);
4441 static void gk20a_gr_set_alpha_circular_buffer_size(struct gk20a *g,
4442 struct gr_isr_data *isr_data)
4444 struct gr_gk20a *gr = &g->gr;
4445 u32 gpc_index, ppc_index, stride, val;
4446 u32 pd_ab_max_output;
4447 u32 alpha_cb_size = isr_data->data_lo * 4;
4450 /* if (NO_ALPHA_BETA_TIMESLICE_SUPPORT_DEF) return; */
4453 if (alpha_cb_size > gr->alpha_cb_size)
4454 alpha_cb_size = gr->alpha_cb_size;
4456 gk20a_writel(g, gr_ds_tga_constraintlogic_r(),
4457 (gk20a_readl(g, gr_ds_tga_constraintlogic_r()) &
4458 ~gr_ds_tga_constraintlogic_alpha_cbsize_f(~0)) |
4459 gr_ds_tga_constraintlogic_alpha_cbsize_f(alpha_cb_size));
4461 pd_ab_max_output = alpha_cb_size *
4462 gr_gpc0_ppc0_cbm_cfg_size_granularity_v() /
4463 gr_pd_ab_dist_cfg1_max_output_granularity_v();
4465 gk20a_writel(g, gr_pd_ab_dist_cfg1_r(),
4466 gr_pd_ab_dist_cfg1_max_output_f(pd_ab_max_output));
4468 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4469 stride = proj_gpc_stride_v() * gpc_index;
4471 for (ppc_index = 0; ppc_index < gr->gpc_ppc_count[gpc_index];
4474 val = gk20a_readl(g, gr_gpc0_ppc0_cbm_cfg2_r() +
4476 proj_ppc_in_gpc_stride_v() * ppc_index);
4478 val = set_field(val, gr_gpc0_ppc0_cbm_cfg2_size_m(),
4479 gr_gpc0_ppc0_cbm_cfg2_size_f(alpha_cb_size *
4480 gr->pes_tpc_count[ppc_index][gpc_index]));
4482 gk20a_writel(g, gr_gpc0_ppc0_cbm_cfg2_r() +
4484 proj_ppc_in_gpc_stride_v() * ppc_index, val);
4489 void gk20a_gr_reset(struct gk20a *g)
4492 err = gk20a_init_gr_prepare(g);
4494 err = gk20a_init_gr_reset_enable_hw(g);
4496 err = gk20a_init_gr_setup_hw(g);
4500 static void gk20a_gr_nop_method(struct gk20a *g)
4502 /* Reset method in PBDMA 0 */
4503 gk20a_writel(g, pbdma_method0_r(0),
4504 pbdma_udma_nop_r());
4505 gk20a_writel(g, pbdma_data0_r(0), 0);
4508 static int gk20a_gr_handle_illegal_method(struct gk20a *g,
4509 struct gr_isr_data *isr_data)
4513 if (isr_data->class_num == KEPLER_COMPUTE_A) {
4514 switch (isr_data->offset << 2) {
4515 case NVA0C0_SET_SHADER_EXCEPTIONS:
4516 gk20a_gr_set_shader_exceptions(g, isr_data);
4523 if (isr_data->class_num == KEPLER_C) {
4524 switch (isr_data->offset << 2) {
4525 case NVA297_SET_SHADER_EXCEPTIONS:
4526 gk20a_gr_set_shader_exceptions(g, isr_data);
4528 case NVA297_SET_CIRCULAR_BUFFER_SIZE:
4529 gk20a_gr_set_circular_buffer_size(g, isr_data);
4531 case NVA297_SET_ALPHA_CIRCULAR_BUFFER_SIZE:
4532 gk20a_gr_set_alpha_circular_buffer_size(g, isr_data);
4542 gk20a_gr_nop_method(g);
4543 nvhost_err(dev_from_gk20a(g), "invalid method class 0x%08x"
4544 ", offset 0x%08x address 0x%08x\n",
4545 isr_data->class_num, isr_data->offset, isr_data->addr);
4549 static int gk20a_gr_handle_illegal_class(struct gk20a *g,
4550 struct gr_isr_data *isr_data)
4556 gk20a_gr_nop_method(g);
4557 nvhost_err(dev_from_gk20a(g),
4558 "invalid class 0x%08x, offset 0x%08x",
4559 isr_data->class_num, isr_data->offset);
4563 static int gk20a_gr_handle_class_error(struct gk20a *g,
4564 struct gr_isr_data *isr_data)
4570 gk20a_gr_nop_method(g);
4571 nvhost_err(dev_from_gk20a(g),
4572 "class error 0x%08x, offset 0x%08x",
4573 isr_data->class_num, isr_data->offset);
4577 static int gk20a_gr_handle_notify_pending(struct gk20a *g,
4578 struct gr_isr_data *isr_data)
4580 struct fifo_gk20a *f = &g->fifo;
4581 struct channel_gk20a *ch = &f->channel[isr_data->chid];
4583 #if defined(CONFIG_TEGRA_GPU_CYCLE_STATS)
4584 void *virtual_address;
4589 struct share_buffer_head *sh_hdr;
4593 struct gk20a_cyclestate_buffer_elem *op_elem;
4594 /* GL will never use payload 0 for cycle state */
4595 if ((ch->cyclestate.cyclestate_buffer == NULL) || (isr_data->data_lo == 0))
4598 mutex_lock(&ch->cyclestate.cyclestate_buffer_mutex);
4600 virtual_address = ch->cyclestate.cyclestate_buffer;
4601 buffer_size = ch->cyclestate.cyclestate_buffer_size;
4602 offset = isr_data->data_lo;
4605 if (offset >= buffer_size) {
4610 sh_hdr = (struct share_buffer_head *)
4611 ((char *)virtual_address + offset);
4613 if (sh_hdr->size < sizeof(struct share_buffer_head)) {
4617 new_offset = offset + sh_hdr->size;
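/* The cyclestate buffer is a sequence of variable-size records, each led
 * by a share_buffer_head giving its size and requested operation; the loop
 * walks record to record until the offset runs past the buffer end,
 * bounds-checking each BAR0 access along the way. */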
4619 switch (sh_hdr->operation) {
4628 (struct gk20a_cyclestate_buffer_elem *)
4630 if (op_elem->offset_bar0 <
4631 TEGRA_GK20A_BAR0_SIZE) {
4634 (op_elem->last_bit + 1))
4636 op_elem->first_bit)-1);
4640 op_elem->offset_bar0);
4642 switch (sh_hdr->operation) {
4645 (raw_reg & mask_orig)
4646 >> op_elem->first_bit;
4651 if ((unsigned int)mask_orig !=
4654 (raw_reg & ~mask_orig);
4657 v |= ((op_elem->data
4658 << op_elem->first_bit)
4662 op_elem->offset_bar0,
4670 sh_hdr->failed = true;
4676 /* no operation content case */
4680 sh_hdr->completed = true;
4681 offset = new_offset;
4683 mutex_unlock(&ch->cyclestate.cyclestate_buffer_mutex);
4686 wake_up(&ch->notifier_wq);
4690 /* Used by sw interrupt thread to translate current ctx to chid.
4691 * For performance, we don't want to go through 128 channels every time.
4692 * A small tlb is used here to cache translation */
4693 static int gk20a_gr_get_chid_from_ctx(struct gk20a *g, u32 curr_ctx)
4695 struct fifo_gk20a *f = &g->fifo;
4696 struct gr_gk20a *gr = &g->gr;
4699 struct scatterlist *ctx_sg;
4701 spin_lock(&gr->ch_tlb_lock);
4703 /* check cache first */
4704 for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) {
4705 if (gr->chid_tlb[i].curr_ctx == curr_ctx) {
4706 chid = gr->chid_tlb[i].hw_chid;
4712 for (chid = 0; chid < f->num_channels; chid++)
4713 if (f->channel[chid].in_use) {
4714 ctx_sg = f->channel[chid].inst_block.mem.sgt->sgl;
4715 if ((u32)(sg_phys(ctx_sg) >> ram_in_base_shift_v()) ==
4716 gr_fecs_current_ctx_ptr_v(curr_ctx))
4720 if (chid >= f->num_channels) {
4725 /* add to free tlb entry */
4726 for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) {
4727 if (gr->chid_tlb[i].curr_ctx == 0) {
4728 gr->chid_tlb[i].curr_ctx = curr_ctx;
4729 gr->chid_tlb[i].hw_chid = chid;
4734 /* no free entry, flush one */
4735 gr->chid_tlb[gr->channel_tlb_flush_index].curr_ctx = curr_ctx;
4736 gr->chid_tlb[gr->channel_tlb_flush_index].hw_chid = chid;
4738 gr->channel_tlb_flush_index =
4739 (gr->channel_tlb_flush_index + 1) &
4740 (GR_CHANNEL_MAP_TLB_SIZE - 1);
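/* Round-robin eviction: the (index + 1) & (SIZE - 1) wrap only works when
 * GR_CHANNEL_MAP_TLB_SIZE is a power of two, so keep that in mind if the
 * table is ever resized. */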
4743 spin_unlock(&gr->ch_tlb_lock);
4747 static int gk20a_gr_lock_down_sm(struct gk20a *g, u32 global_esr_mask)
4749 unsigned long end_jiffies = jiffies +
4750 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
4751 u32 delay = GR_IDLE_CHECK_DEFAULT;
4752 bool mmu_debug_mode_enabled = gk20a_mm_mmu_debug_mode_enabled(g);
4755 nvhost_dbg(dbg_intr | dbg_gpu_dbg, "locking down SM");
4757 /* assert stop trigger */
4758 dbgr_control0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());
4759 dbgr_control0 |= gr_gpc0_tpc0_sm_dbgr_control0_stop_trigger_enable_f();
4760 gk20a_writel(g, gr_gpc0_tpc0_sm_dbgr_control0_r(), dbgr_control0);
4762 /* wait for the sm to lock down */
4764 u32 global_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r());
4765 u32 warp_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_warp_esr_r());
4766 u32 dbgr_status0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_status0_r());
4768 (gr_gpc0_tpc0_sm_dbgr_status0_locked_down_v(dbgr_status0) ==
4769 gr_gpc0_tpc0_sm_dbgr_status0_locked_down_true_v());
4770 bool error_pending =
4771 (gr_gpc0_tpc0_sm_hww_warp_esr_error_v(warp_esr) !=
4772 gr_gpc0_tpc0_sm_hww_warp_esr_error_none_v()) ||
4773 ((global_esr & ~global_esr_mask) != 0);
4775 if (locked_down || !error_pending) {
4776 nvhost_dbg(dbg_intr | dbg_gpu_dbg, "locked down SM");
4778 /* de-assert stop trigger */
4779 dbgr_control0 &= ~gr_gpc0_tpc0_sm_dbgr_control0_stop_trigger_enable_f();
4780 gk20a_writel(g, gr_gpc0_tpc0_sm_dbgr_control0_r(), dbgr_control0);
4785 /* if an mmu fault is pending and mmu debug mode is not
4786 * enabled, the sm will never lock down. */
4787 if (!mmu_debug_mode_enabled && gk20a_fifo_mmu_fault_pending(g)) {
4788 nvhost_err(dev_from_gk20a(g), "mmu fault pending, sm will"
4789 " never lock down!");
4793 usleep_range(delay, delay * 2);
4794 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
4796 } while (time_before(jiffies, end_jiffies));
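/* The loop above polls with exponential back-off: the delay doubles each
 * iteration but is capped at GR_IDLE_CHECK_MAX, so the poll stays
 * responsive right up to the timeout. */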
4798 nvhost_err(dev_from_gk20a(g), "timed out while trying to lock down SM");
4803 static bool gk20a_gr_sm_debugger_attached(struct gk20a *g)
4805 u32 dbgr_control0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());
4807 /* check if an sm debugger is attached */
4808 if (gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_v(dbgr_control0) ==
4809 gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_on_v())
4815 static void gk20a_gr_clear_sm_hww(struct gk20a *g, u32 global_esr)
4817 gk20a_writel(g, gr_gpc0_tpc0_sm_hww_global_esr_r(), global_esr);
4819 /* clear the warp hww */
4820 gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_r(),
4821 gr_gpc0_tpc0_sm_hww_warp_esr_error_none_f());
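/* trivial helper below: equivalent to &g->fifo.channel[hw_chid], with no
 * range checking of hw_chid */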
4824 static struct channel_gk20a *
4825 channel_from_hw_chid(struct gk20a *g, u32 hw_chid)
4827 return g->fifo.channel+hw_chid;
4830 static int gk20a_gr_handle_sm_exception(struct gk20a *g,
4831 struct gr_isr_data *isr_data)
4834 bool do_warp_sync = false;
4835 /* these three interrupts don't require locking down the SM. They can
4836 * be handled by usermode clients as they aren't fatal. Additionally,
4837 * usermode clients may wish to allow some warps to execute while others
4838 * are at breakpoints, as opposed to fatal errors where all warps should halt. */
4840 u32 global_mask = gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f() |
4841 gr_gpc0_tpc0_sm_hww_global_esr_bpt_pause_pending_f() |
4842 gr_gpc0_tpc0_sm_hww_global_esr_single_step_complete_pending_f();
4843 u32 global_esr, warp_esr;
4844 bool sm_debugger_attached = gk20a_gr_sm_debugger_attached(g);
4845 struct channel_gk20a *fault_ch;
4847 nvhost_dbg(dbg_fn | dbg_gpu_dbg, "");
4849 global_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r());
4850 warp_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_warp_esr_r());
4852 /* if an sm debugger is attached, disable forwarding of tpc exceptions.
4853 * the debugger will reenable exceptions after servicing them. */
4854 if (sm_debugger_attached) {
4855 u32 tpc_exception_en = gk20a_readl(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r());
4856 tpc_exception_en &= ~gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_enabled_f();
4857 gk20a_writel(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r(), tpc_exception_en);
4858 nvhost_dbg(dbg_intr | dbg_gpu_dbg, "SM debugger attached");
4861 /* if a debugger is present and an error has occurred, do a warp sync */
4862 if (sm_debugger_attached && ((warp_esr != 0) || ((global_esr & ~global_mask) != 0))) {
4863 nvhost_dbg(dbg_intr, "warp sync needed");
4864 do_warp_sync = true;
4868 ret = gk20a_gr_lock_down_sm(g, global_mask);
4870 nvhost_err(dev_from_gk20a(g), "sm did not lock down!\n");
4875 /* finally, signal any client waiting on an event */
4876 fault_ch = channel_from_hw_chid(g, isr_data->chid);
4878 gk20a_dbg_gpu_post_events(fault_ch);
4883 static int gk20a_gr_handle_tpc_exception(struct gk20a *g,
4884 struct gr_isr_data *isr_data)
4887 u32 tpc_exception = gk20a_readl(g, gr_gpcs_tpcs_tpccs_tpc_exception_r());
4889 nvhost_dbg(dbg_intr | dbg_gpu_dbg, "");
4891 /* check if an sm exception is pending */
4892 if (gr_gpcs_tpcs_tpccs_tpc_exception_sm_v(tpc_exception) ==
4893 gr_gpcs_tpcs_tpccs_tpc_exception_sm_pending_v()) {
4894 nvhost_dbg(dbg_intr | dbg_gpu_dbg, "SM exception pending");
4895 ret = gk20a_gr_handle_sm_exception(g, isr_data);
4901 static int gk20a_gr_handle_gpc_exception(struct gk20a *g,
4902 struct gr_isr_data *isr_data)
4905 u32 gpc_exception = gk20a_readl(g, gr_gpcs_gpccs_gpc_exception_r());
4907 nvhost_dbg(dbg_intr | dbg_gpu_dbg, "");
4909 /* check if tpc 0 has an exception */
4910 if (gr_gpcs_gpccs_gpc_exception_tpc_v(gpc_exception) ==
4911 gr_gpcs_gpccs_gpc_exception_tpc_0_pending_v()) {
4912 nvhost_dbg(dbg_intr | dbg_gpu_dbg, "TPC exception pending");
4913 ret = gk20a_gr_handle_tpc_exception(g, isr_data);
4919 int gk20a_gr_isr(struct gk20a *g)
4921 struct gr_isr_data isr_data;
4925 u32 gr_intr = gk20a_readl(g, gr_intr_r());
4932 grfifo_ctl = gk20a_readl(g, gr_gpfifo_ctl_r());
4933 grfifo_ctl &= ~gr_gpfifo_ctl_semaphore_access_f(1);
4934 grfifo_ctl &= ~gr_gpfifo_ctl_access_f(1);
4936 gk20a_writel(g, gr_gpfifo_ctl_r(),
4937 grfifo_ctl | gr_gpfifo_ctl_access_f(0) |
4938 gr_gpfifo_ctl_semaphore_access_f(0));
4940 isr_data.addr = gk20a_readl(g, gr_trapped_addr_r());
4941 isr_data.data_lo = gk20a_readl(g, gr_trapped_data_lo_r());
4942 isr_data.data_hi = gk20a_readl(g, gr_trapped_data_hi_r());
4943 isr_data.curr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r());
4944 isr_data.offset = gr_trapped_addr_mthd_v(isr_data.addr);
4945 isr_data.sub_chan = gr_trapped_addr_subch_v(isr_data.addr);
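/* gr_trapped_addr packs both the method offset and the subchannel of the
 * trapped method. The offset is a method (word) index, which is why the
 * illegal-method handlers compare (offset << 2) against byte-addressed
 * method defines such as NVA297_SET_SHADER_EXCEPTIONS. */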
4946 obj_table = gk20a_readl(g,
4947 gr_fe_object_table_r(isr_data.sub_chan));
4948 isr_data.class_num = gr_fe_object_table_nvclass_v(obj_table);
4951 gk20a_gr_get_chid_from_ctx(g, isr_data.curr_ctx);
4952 if (isr_data.chid == -1) {
4953 nvhost_err(dev_from_gk20a(g), "invalid channel ctx 0x%08x",
4958 nvhost_dbg(dbg_intr | dbg_gpu_dbg,
4959 "channel %d: addr 0x%08x, "
4960 "data 0x%08x 0x%08x,"
4961 "ctx 0x%08x, offset 0x%08x, "
4962 "subchannel 0x%08x, class 0x%08x",
4963 isr_data.chid, isr_data.addr,
4964 isr_data.data_hi, isr_data.data_lo,
4965 isr_data.curr_ctx, isr_data.offset,
4966 isr_data.sub_chan, isr_data.class_num);
4968 if (gr_intr & gr_intr_notify_pending_f()) {
4969 gk20a_gr_handle_notify_pending(g, &isr_data);
4970 gk20a_writel(g, gr_intr_r(),
4971 gr_intr_notify_reset_f());
4972 gr_intr &= ~gr_intr_notify_pending_f();
4975 if (gr_intr & gr_intr_illegal_method_pending_f()) {
4976 ret = gk20a_gr_handle_illegal_method(g, &isr_data);
4977 gk20a_writel(g, gr_intr_r(),
4978 gr_intr_illegal_method_reset_f());
4979 gr_intr &= ~gr_intr_illegal_method_pending_f();
4982 if (gr_intr & gr_intr_illegal_class_pending_f()) {
4983 ret = gk20a_gr_handle_illegal_class(g, &isr_data);
4984 gk20a_writel(g, gr_intr_r(),
4985 gr_intr_illegal_class_reset_f());
4986 gr_intr &= ~gr_intr_illegal_class_pending_f();
4989 if (gr_intr & gr_intr_class_error_pending_f()) {
4990 ret = gk20a_gr_handle_class_error(g, &isr_data);
4991 gk20a_writel(g, gr_intr_r(),
4992 gr_intr_class_error_reset_f());
4993 gr_intr &= ~gr_intr_class_error_pending_f();
4996 if (gr_intr & gr_intr_exception_pending_f()) {
4997 u32 exception = gk20a_readl(g, gr_exception_r());
4999 nvhost_dbg(dbg_intr | dbg_gpu_dbg, "exception %08x\n", exception);
5001 if (exception & gr_exception_fe_m()) {
5002 u32 fe = gk20a_readl(g, gr_fe_hww_esr_r());
5003 nvhost_dbg(dbg_intr, "fe warning %08x\n", fe);
5004 gk20a_writel(g, gr_fe_hww_esr_r(), fe);
5007 /* check if a gpc exception has occurred */
5008 if (exception & gr_exception_gpc_m() && ret == 0) {
5009 u32 exception1 = gk20a_readl(g, gr_exception1_r());
5010 u32 global_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r());
5012 nvhost_dbg(dbg_intr | dbg_gpu_dbg, "GPC exception pending");
5014 /* if no sm debugger is present, clean up the channel */
5015 if (!gk20a_gr_sm_debugger_attached(g)) {
5016 nvhost_dbg(dbg_intr | dbg_gpu_dbg,
5017 "SM debugger not attached, clearing interrupt");
5021 /* check if gpc 0 has an exception */
5022 if (exception1 & gr_exception1_gpc_0_pending_f())
5023 ret = gk20a_gr_handle_gpc_exception(g, &isr_data);
5024 /* clear the hwws, also causes tpc and gpc
5025 * exceptions to be cleared */
5026 gk20a_gr_clear_sm_hww(g, global_esr);
5031 gk20a_writel(g, gr_intr_r(), gr_intr_exception_reset_f());
5032 gr_intr &= ~gr_intr_exception_pending_f();
5036 gk20a_fifo_recover(g, BIT(ENGINE_GR_GK20A));
5039 gk20a_writel(g, gr_gpfifo_ctl_r(),
5040 grfifo_ctl | gr_gpfifo_ctl_access_f(1) |
5041 gr_gpfifo_ctl_semaphore_access_f(1));
5044 nvhost_err(dev_from_gk20a(g),
5045 "unhandled gr interrupt 0x%08x", gr_intr);
5050 int gr_gk20a_fecs_get_reglist_img_size(struct gk20a *g, u32 *size)
5052 BUG_ON(size == NULL);
5053 return gr_gk20a_submit_fecs_method_op(g,
5054 (struct fecs_method_op_gk20a) {
5059 .method.addr = gr_fecs_method_push_adr_discover_reglist_image_size_v(),
5060 .mailbox.ret = size,
5061 .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL,
5063 .cond.fail = GR_IS_UCODE_OP_SKIP,
5064 .mailbox.fail = 0});
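/* Note on the fecs_method_op_gk20a pattern used here and in the next two
 * functions, as read from these call sites (not a spec of the ucode
 * interface): mailbox.data carries the argument when one is needed,
 * method.addr selects the FECS method, mailbox.ret receives the result,
 * and cond.ok / cond.fail tell gr_gk20a_submit_fecs_method_op() how to
 * interpret the completion mailbox. */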
5067 int gr_gk20a_fecs_set_reglist_bind_inst(struct gk20a *g, phys_addr_t addr)
5069 return gr_gk20a_submit_fecs_method_op(g,
5070 (struct fecs_method_op_gk20a){
5072 .mailbox.data = (gr_fecs_current_ctx_ptr_f(addr >> 12) |
5073 gr_fecs_current_ctx_valid_f(1) |
5074 gr_fecs_current_ctx_target_vid_mem_f()),
5077 .method.addr = gr_fecs_method_push_adr_set_reglist_bind_instance_v(),
5078 .mailbox.ret = NULL,
5079 .cond.ok = GR_IS_UCODE_OP_EQUAL,
5081 .cond.fail = GR_IS_UCODE_OP_SKIP,
5082 .mailbox.fail = 0});
5085 int gr_gk20a_fecs_set_reglist_virual_addr(struct gk20a *g, u64 pmu_va)
5087 return gr_gk20a_submit_fecs_method_op(g,
5088 (struct fecs_method_op_gk20a) {
5090 .mailbox.data = u64_lo32(pmu_va >> 8),
5093 .method.addr = gr_fecs_method_push_adr_set_reglist_virtual_address_v(),
5094 .mailbox.ret = NULL,
5095 .cond.ok = GR_IS_UCODE_OP_EQUAL,
5097 .cond.fail = GR_IS_UCODE_OP_SKIP,
5098 .mailbox.fail = 0});
5101 int gk20a_gr_suspend(struct gk20a *g)
5103 unsigned long end_jiffies = jiffies +
5104 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
5109 ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
5113 gk20a_writel(g, gr_gpfifo_ctl_r(),
5114 gr_gpfifo_ctl_access_disabled_f());
5116 /* disable gr intr */
5117 gk20a_writel(g, gr_intr_r(), 0);
5118 gk20a_writel(g, gr_intr_en_r(), 0);
5120 /* disable all exceptions */
5121 gk20a_writel(g, gr_exception_r(), 0);
5122 gk20a_writel(g, gr_exception_en_r(), 0);
5123 gk20a_writel(g, gr_exception1_r(), 0);
5124 gk20a_writel(g, gr_exception1_en_r(), 0);
5125 gk20a_writel(g, gr_exception2_r(), 0);
5126 gk20a_writel(g, gr_exception2_en_r(), 0);
5128 gk20a_gr_flush_channel_tlb(&g->gr);
5130 nvhost_dbg_fn("done");
5134 static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
5136 bool is_quad, u32 quad,
5137 u32 *context_buffer,
5138 u32 context_buffer_size,
5141 /* This function will decode a priv address and return the partition type and numbers. */
5142 int gr_gk20a_decode_priv_addr(struct gk20a *g, u32 addr,
5143 int *addr_type, /* enum ctxsw_addr_type */
5144 u32 *gpc_num, u32 *tpc_num, u32 *ppc_num, u32 *be_num,
5145 u32 *broadcast_flags)
5149 u32 ppc_broadcast_addr;
5151 nvhost_dbg(dbg_fn | dbg_gpu_dbg, "addr=0x%x", addr);
5153 /* setup defaults */
5155 ppc_broadcast_addr = 0;
5156 *addr_type = CTXSW_ADDR_TYPE_SYS;
5157 *broadcast_flags = PRI_BROADCAST_FLAGS_NONE;
5163 if (pri_is_gpc_addr(addr)) {
5164 *addr_type = CTXSW_ADDR_TYPE_GPC;
5165 gpc_addr = pri_gpccs_addr_mask(addr);
5166 if (pri_is_gpc_addr_shared(addr)) {
5167 *addr_type = CTXSW_ADDR_TYPE_GPC;
5168 *broadcast_flags |= PRI_BROADCAST_FLAGS_GPC;
5170 *gpc_num = pri_get_gpc_num(addr);
5172 if (pri_is_tpc_addr(gpc_addr)) {
5173 *addr_type = CTXSW_ADDR_TYPE_TPC;
5174 if (pri_is_tpc_addr_shared(gpc_addr)) {
5175 *broadcast_flags |= PRI_BROADCAST_FLAGS_TPC;
5178 *tpc_num = pri_get_tpc_num(gpc_addr);
5181 } else if (pri_is_be_addr(addr)) {
5182 *addr_type = CTXSW_ADDR_TYPE_BE;
5183 if (pri_is_be_addr_shared(addr)) {
5184 *broadcast_flags |= PRI_BROADCAST_FLAGS_BE;
5187 *be_num = pri_get_be_num(addr);
5190 *addr_type = CTXSW_ADDR_TYPE_SYS;
5199 static int gr_gk20a_split_ppc_broadcast_addr(struct gk20a *g, u32 addr,
5201 u32 *priv_addr_table, u32 *t)
5205 nvhost_dbg(dbg_fn | dbg_gpu_dbg, "addr=0x%x", addr);
5207 for (ppc_num = 0; ppc_num < g->gr.pe_count_per_gpc; ppc_num++)
5208 priv_addr_table[(*t)++] = pri_ppc_addr(pri_ppccs_addr_mask(addr),
5215 /* The context buffer is indexed using BE broadcast addresses and GPC/TPC
5216 * unicast addresses. This function will convert a BE unicast address to a BE
5217 * broadcast address and split a GPC/TPC broadcast address into a table of
5218 * GPC/TPC addresses. The addresses generated by this function can be
5219 * successfully processed by gr_gk20a_find_priv_offset_in_buffer. */
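/* For example, a broadcast TPC register inside a broadcast GPC range
 * expands to one unicast entry per (gpc, tpc) pair, i.e. up to
 * gpc_count * gpc_tpc_count[] entries in priv_addr_table. */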
5221 static int gr_gk20a_create_priv_addr_table(struct gk20a *g,
5223 u32 *priv_addr_table,
5226 int addr_type; /*enum ctxsw_addr_type */
5227 u32 gpc_num, tpc_num, ppc_num, be_num;
5228 u32 broadcast_flags;
5235 nvhost_dbg(dbg_fn | dbg_gpu_dbg, "addr=0x%x", addr);
5237 err = gr_gk20a_decode_priv_addr(g, addr, &addr_type,
5238 &gpc_num, &tpc_num, &ppc_num, &be_num,
5240 nvhost_dbg(dbg_gpu_dbg, "addr_type = %d", addr_type);
5244 if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
5245 (addr_type == CTXSW_ADDR_TYPE_BE)) {
5246 /* The BE broadcast registers are included in the compressed PRI
5247 * table. Convert a BE unicast address to a broadcast address
5248 * so that we can look up the offset. */
5249 if ((addr_type == CTXSW_ADDR_TYPE_BE) &&
5250 !(broadcast_flags & PRI_BROADCAST_FLAGS_BE))
5251 priv_addr_table[t++] = pri_be_shared_addr(addr);
5253 priv_addr_table[t++] = addr;
5259 /* The GPC/TPC unicast registers are included in the compressed PRI
5260 * tables. Convert a GPC/TPC broadcast address to unicast addresses so
5261 * that we can look up the offsets. */
5262 if (broadcast_flags & PRI_BROADCAST_FLAGS_GPC) {
5263 for (gpc_num = 0; gpc_num < g->gr.gpc_count; gpc_num++) {
5265 if (broadcast_flags & PRI_BROADCAST_FLAGS_TPC)
5267 tpc_num < g->gr.gpc_tpc_count[gpc_num];
5269 priv_addr_table[t++] =
5270 pri_tpc_addr(pri_tpccs_addr_mask(addr),
5273 else if (broadcast_flags & PRI_BROADCAST_FLAGS_PPC) {
5274 err = gr_gk20a_split_ppc_broadcast_addr(g, addr, gpc_num,
5275 priv_addr_table, &t);
5279 priv_addr_table[t++] =
5280 pri_gpc_addr(pri_gpccs_addr_mask(addr),
5284 if (broadcast_flags & PRI_BROADCAST_FLAGS_TPC)
5286 tpc_num < g->gr.gpc_tpc_count[gpc_num];
5288 priv_addr_table[t++] =
5289 pri_tpc_addr(pri_tpccs_addr_mask(addr),
5291 else if (broadcast_flags & PRI_BROADCAST_FLAGS_PPC)
5292 err = gr_gk20a_split_ppc_broadcast_addr(g, addr, gpc_num,
5293 priv_addr_table, &t);
5295 priv_addr_table[t++] = addr;
5302 int gr_gk20a_get_ctx_buffer_offsets(struct gk20a *g,
5305 u32 *offsets, u32 *offset_addrs,
5307 bool is_quad, u32 quad)
5310 u32 priv_offset = 0;
5311 u32 *priv_registers;
5312 u32 num_registers = 0;
5314 u32 potential_offsets = proj_scal_litter_num_gpcs_v() *
5315 proj_scal_litter_num_tpc_per_gpc_v();
5317 nvhost_dbg(dbg_fn | dbg_gpu_dbg, "addr=0x%x", addr);
5319 /* implementation is crossed-up if either of these happen */
5320 if (max_offsets > potential_offsets)
5323 if (!g->gr.ctx_vars.golden_image_initialized)
5326 priv_registers = kzalloc(sizeof(u32) * potential_offsets, GFP_KERNEL);
5327 if (!priv_registers) { /* kzalloc returns NULL on failure, not an ERR_PTR */
5328 nvhost_dbg_fn("failed alloc for potential_offsets=%d", potential_offsets);
5329 err = -ENOMEM;
5332 memset(offsets, 0, sizeof(u32) * max_offsets);
5333 memset(offset_addrs, 0, sizeof(u32) * max_offsets);
5336 gr_gk20a_create_priv_addr_table(g, addr, &priv_registers[0], &num_registers);
5338 if ((max_offsets > 1) && (num_registers > max_offsets)) {
5343 if ((max_offsets == 1) && (num_registers > 1))
5346 if (!g->gr.ctx_vars.local_golden_image) {
5347 nvhost_dbg_fn("no context switch header info to work with");
5352 for (i = 0; i < num_registers; i++) {
5353 err = gr_gk20a_find_priv_offset_in_buffer(g,
5356 g->gr.ctx_vars.local_golden_image,
5357 g->gr.ctx_vars.golden_image_size,
5360 nvhost_dbg_fn("Could not determine priv_offset for addr:0x%x",
5361 addr); /*, grPriRegStr(addr)));*/
5365 offsets[i] = priv_offset;
5366 offset_addrs[i] = priv_registers[i];
5369 *num_offsets = num_registers;
5373 kfree(priv_registers); /* kfree(NULL) is a no-op */
5379 /* Setup some register tables. This looks hacky; our
5380 * register/offset functions are just that, functions.
5381 * So they can't be used as initializers... TBD: fix to
5382 * generate consts at least on an as-needed basis. */
5384 static const u32 _num_ovr_perf_regs = 17;
5385 static u32 _ovr_perf_regs[17] = { 0, };
5386 /* Following are the blocks of registers that the ucode
5387 stores in the extended region.*/
5388 /* == ctxsw_extended_sm_dsm_perf_counter_register_stride_v() ? */
5389 static const u32 _num_sm_dsm_perf_regs = 5;
5390 /* == ctxsw_extended_sm_dsm_perf_counter_control_register_stride_v() ?*/
5391 static const u32 _num_sm_dsm_perf_ctrl_regs = 4;
5392 static u32 _sm_dsm_perf_regs[5];
5393 static u32 _sm_dsm_perf_ctrl_regs[4];
5395 static void init_sm_dsm_reg_info(void)
5397 if (_ovr_perf_regs[0] != 0)
5400 _ovr_perf_regs[0] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel0_r();
5401 _ovr_perf_regs[1] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel1_r();
5402 _ovr_perf_regs[2] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control0_r();
5403 _ovr_perf_regs[3] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control5_r();
5404 _ovr_perf_regs[4] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_status1_r();
5405 _ovr_perf_regs[5] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter0_control_r();
5406 _ovr_perf_regs[6] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter1_control_r();
5407 _ovr_perf_regs[7] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter2_control_r();
5408 _ovr_perf_regs[8] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter3_control_r();
5409 _ovr_perf_regs[9] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_control_r();
5410 _ovr_perf_regs[10] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_control_r();
5411 _ovr_perf_regs[11] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_control_r();
5412 _ovr_perf_regs[12] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_control_r();
5413 _ovr_perf_regs[13] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_r();
5414 _ovr_perf_regs[14] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_r();
5415 _ovr_perf_regs[15] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_r();
5416 _ovr_perf_regs[16] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_r();
5419 _sm_dsm_perf_regs[0] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_status_r();
5420 _sm_dsm_perf_regs[1] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter0_r();
5421 _sm_dsm_perf_regs[2] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter1_r();
5422 _sm_dsm_perf_regs[3] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter2_r();
5423 _sm_dsm_perf_regs[4] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter3_r();
5425 _sm_dsm_perf_ctrl_regs[0] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control1_r();
5426 _sm_dsm_perf_ctrl_regs[1] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control2_r();
5427 _sm_dsm_perf_ctrl_regs[2] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control3_r();
5428 _sm_dsm_perf_ctrl_regs[3] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control4_r();
5432 /* TBD: would like to handle this elsewhere, at a higher level.
5433 * these are currently constructed in a "test-then-write" style
5434 * which makes it impossible to know externally whether a ctx
5435 * write will actually occur. so later we should put a lazy,
5436 * map-and-hold system in the patch write state */
5437 int gr_gk20a_ctx_patch_smpc(struct gk20a *g,
5438 struct channel_ctx_gk20a *ch_ctx,
5442 u32 num_gpc = g->gr.gpc_count;
5450 init_sm_dsm_reg_info();
5452 nvhost_dbg(dbg_fn | dbg_gpu_dbg, "addr=0x%x", addr);
5454 for (reg = 0; reg < _num_ovr_perf_regs; reg++) {
5455 for (gpc = 0; gpc < num_gpc; gpc++) {
5456 num_tpc = g->gr.gpc_tpc_count[gpc];
5457 for (tpc = 0; tpc < num_tpc; tpc++) {
5458 chk_addr = ((proj_gpc_stride_v() * gpc) +
5459 (proj_tpc_in_gpc_stride_v() * tpc) +
5460 _ovr_perf_regs[reg]);
5461 if (chk_addr != addr)
5463 /* reset the patch count from previous
5464 runs, if ucode has already processed it */
5466 tmp = mem_rd32(context +
5467 ctxsw_prog_main_image_patch_count_o(), 0);
5470 ch_ctx->patch_ctx.data_count = 0;
5472 gr_gk20a_ctx_patch_write(g, ch_ctx,
5475 vaddr_lo = u64_lo32(ch_ctx->patch_ctx.gpu_va);
5476 vaddr_hi = u64_hi32(ch_ctx->patch_ctx.gpu_va);
5479 ctxsw_prog_main_image_patch_count_o(),
5480 0, ch_ctx->patch_ctx.data_count);
5482 ctxsw_prog_main_image_patch_adr_lo_o(),
5485 ctxsw_prog_main_image_patch_adr_hi_o(),
5488 /* we're not caching these on cpu side,
5489 but later watch for it */
5491 /* the l2 invalidate in the patch_write
5492 * would be too early for this? */
5493 gk20a_mm_l2_invalidate(g);
5503 void gr_gk20a_access_smpc_reg(struct gk20a *g, u32 quad, u32 offset)
5512 nvhost_dbg(dbg_fn | dbg_gpu_dbg, "offset=0x%x", offset);
5514 gpc = pri_get_gpc_num(offset);
5515 gpc_tpc_addr = pri_gpccs_addr_mask(offset);
5516 tpc = pri_get_tpc_num(gpc_tpc_addr);
5518 quad_ctrl = quad & 0x1; /* first bit tells us quad */
5519 half_ctrl = (quad >> 1) & 0x1; /* second bit tells us half */
5521 gpc_tpc_stride = gpc * proj_gpc_stride_v() +
5522 tpc * proj_tpc_in_gpc_stride_v();
5523 gpc_tpc_addr = gr_gpc0_tpc0_sm_halfctl_ctrl_r() + gpc_tpc_stride;
5525 reg = gk20a_readl(g, gpc_tpc_addr);
5526 reg = set_field(reg,
5527 gr_gpcs_tpcs_sm_halfctl_ctrl_sctl_read_quad_ctl_m(),
5530 gk20a_writel(g, gpc_tpc_addr, reg);
5532 gpc_tpc_addr = gr_gpc0_tpc0_sm_debug_sfe_control_r() + gpc_tpc_stride;
5533 reg = gk20a_readl(g, gpc_tpc_addr);
5534 reg = set_field(reg,
5535 gr_gpcs_tpcs_sm_debug_sfe_control_read_half_ctl_m(),
5537 gk20a_writel(g, gpc_tpc_addr, reg);
5540 #define ILLEGAL_ID (~0)
5542 static inline bool check_main_image_header_magic(void *context)
5544 u32 magic = mem_rd32(context +
5545 ctxsw_prog_main_image_magic_value_o(), 0);
5546 nvhost_dbg(dbg_gpu_dbg, "main image magic=0x%x", magic);
5547 return magic == ctxsw_prog_main_image_magic_value_v_value_v();
5549 static inline bool check_local_header_magic(void *context)
5551 u32 magic = mem_rd32(context +
5552 ctxsw_prog_local_magic_value_o(), 0);
5553 nvhost_dbg(dbg_gpu_dbg, "local magic=0x%x", magic);
5554 return magic == ctxsw_prog_local_magic_value_v_value_v();
5558 /* most likely dupe of ctxsw_gpccs_header__size_1_v() */
5559 static inline int ctxsw_prog_ucode_header_size_in_bytes(void)
5564 static int gr_gk20a_find_priv_offset_in_ext_buffer(struct gk20a *g,
5566 bool is_quad, u32 quad,
5567 u32 *context_buffer,
5568 u32 context_buffer_size,
5572 u32 gpc_num, tpc_num;
5573 u32 num_gpcs, num_tpcs;
5575 u32 ext_priv_offset, ext_priv_size;
5577 u32 offset_to_segment, offset_to_segment_end;
5578 u32 sm_dsm_perf_reg_id = ILLEGAL_ID;
5579 u32 sm_dsm_perf_ctrl_reg_id = ILLEGAL_ID;
5580 u32 num_ext_gpccs_ext_buffer_segments;
5581 u32 inter_seg_offset;
5582 u32 tpc_gpc_mask = (proj_tpc_in_gpc_stride_v() - 1);
5584 u32 *sm_dsm_perf_ctrl_regs = NULL;
5585 u32 num_sm_dsm_perf_ctrl_regs = 0;
5586 u32 *sm_dsm_perf_regs = NULL;
5587 u32 num_sm_dsm_perf_regs = 0;
5588 u32 buffer_segments_size = 0;
5589 u32 marker_size = 0;
5590 u32 control_register_stride = 0;
5591 u32 perf_register_stride = 0;
5593 /* Only have TPC registers in extended region, so if not a TPC reg,
5594 then return error so caller can look elsewhere. */
5595 if (pri_is_gpc_addr(addr)) {
5597 gpc_num = pri_get_gpc_num(addr);
5598 gpc_addr = pri_gpccs_addr_mask(addr);
5599 if (pri_is_tpc_addr(gpc_addr))
5600 tpc_num = pri_get_tpc_num(gpc_addr);
5604 nvhost_dbg_info(" gpc = %d tpc = %d",
5609 buffer_segments_size = ctxsw_prog_extended_buffer_segments_size_in_bytes_v();
5610 /* note below is in words/num_registers */
5611 marker_size = ctxsw_prog_extended_marker_size_in_bytes_v() >> 2;
5613 context = context_buffer;
5614 /* sanity check main header */
5615 if (!check_main_image_header_magic(context)) {
5616 nvhost_err(dev_from_gk20a(g),
5617 "Invalid main header: magic value");
5620 num_gpcs = mem_rd32(context + ctxsw_prog_main_image_num_gpcs_o(), 0);
5621 if (gpc_num >= num_gpcs) {
5622 nvhost_err(dev_from_gk20a(g),
5623 "GPC 0x%08x is greater than total count 0x%08x!\n",
5628 data32 = mem_rd32(context + ctxsw_prog_main_extended_buffer_ctl_o(), 0);
5629 ext_priv_size = ctxsw_prog_main_extended_buffer_ctl_size_v(data32);
5630 if (0 == ext_priv_size) {
5631 nvhost_dbg_info(" No extended memory in context buffer");
5634 ext_priv_offset = ctxsw_prog_main_extended_buffer_ctl_offset_v(data32);
5636 offset_to_segment = ext_priv_offset * ctxsw_prog_ucode_header_size_in_bytes();
5637 offset_to_segment_end = offset_to_segment +
5638 (ext_priv_size * buffer_segments_size);
5640 /* check local header magic */
5641 context += ctxsw_prog_ucode_header_size_in_bytes();
5642 if (!check_local_header_magic(context)) {
5643 nvhost_err(dev_from_gk20a(g),
5644 "Invalid local header: magic value\n");
5649 /* See if the incoming register address is in the first table of
5650 * registers. We check this by decoding only the TPC addr portion.
5651 * If we get a hit on the TPC bit, we then double check the address
5652 * by computing it from the base gpc/tpc strides. Then make sure
5653 * it is a real match. */
5655 num_sm_dsm_perf_regs = _num_sm_dsm_perf_regs;
5656 sm_dsm_perf_regs = _sm_dsm_perf_regs;
5657 perf_register_stride = ctxsw_prog_extended_sm_dsm_perf_counter_register_stride_v();
5659 init_sm_dsm_reg_info();
5661 for (i = 0; i < num_sm_dsm_perf_regs; i++) {
5662 if ((addr & tpc_gpc_mask) == (sm_dsm_perf_regs[i] & tpc_gpc_mask)) {
5663 sm_dsm_perf_reg_id = i;
5665 nvhost_dbg_info("register match: 0x%08x",
5666 sm_dsm_perf_regs[i]);
5668 chk_addr = (proj_gpc_base_v() +
5669 (proj_gpc_stride_v() * gpc_num) +
5670 proj_tpc_in_gpc_base_v() +
5671 (proj_tpc_in_gpc_stride_v() * tpc_num) +
5672 (sm_dsm_perf_regs[sm_dsm_perf_reg_id] & tpc_gpc_mask));
5674 if (chk_addr != addr) {
5675 nvhost_err(dev_from_gk20a(g),
5676 "Oops addr miss-match! : 0x%08x != 0x%08x\n",
5684 /* Didn't find reg in supported group 1.
5685 * so try the second group now */
5686 num_sm_dsm_perf_ctrl_regs = _num_sm_dsm_perf_ctrl_regs;
5687 sm_dsm_perf_ctrl_regs = _sm_dsm_perf_ctrl_regs;
5688 control_register_stride = ctxsw_prog_extended_sm_dsm_perf_counter_control_register_stride_v();
5690 if (ILLEGAL_ID == sm_dsm_perf_reg_id) {
5691 for (i = 0; i < num_sm_dsm_perf_ctrl_regs; i++) {
5692 if ((addr & tpc_gpc_mask) ==
5693 (sm_dsm_perf_ctrl_regs[i] & tpc_gpc_mask)) {
5694 sm_dsm_perf_ctrl_reg_id = i;
5696 nvhost_dbg_info("register match: 0x%08x",
5697 sm_dsm_perf_ctrl_regs[i]);
5699 chk_addr = (proj_gpc_base_v() +
5700 (proj_gpc_stride_v() * gpc_num) +
5701 proj_tpc_in_gpc_base_v() +
5702 (proj_tpc_in_gpc_stride_v() * tpc_num) +
5703 (sm_dsm_perf_ctrl_regs[sm_dsm_perf_ctrl_reg_id] &
5706 if (chk_addr != addr) {
5707 nvhost_err(dev_from_gk20a(g),
5708 "Oops addr miss-match! : 0x%08x != 0x%08x\n",
5719 if ((ILLEGAL_ID == sm_dsm_perf_ctrl_reg_id) &&
5720 (ILLEGAL_ID == sm_dsm_perf_reg_id))
5723 /* Skip the FECS extended header, nothing there for us now. */
5724 offset_to_segment += buffer_segments_size;
5726 /* skip through the GPCCS extended headers until we get to the data for
5727 * our GPC. The size of each gpc extended segment is enough to hold the
5728 * max tpc count for the gpcs, in 256b chunks. */
5731 max_tpc_count = proj_scal_litter_num_tpc_per_gpc_v();
5733 num_ext_gpccs_ext_buffer_segments = (u32)((max_tpc_count + 1) / 2);
5735 offset_to_segment += (num_ext_gpccs_ext_buffer_segments *
5736 buffer_segments_size * gpc_num);
5738 num_tpcs = g->gr.gpc_tpc_count[gpc_num];
5740 /* skip the head marker to start with */
5741 inter_seg_offset = marker_size;
5743 if (ILLEGAL_ID != sm_dsm_perf_ctrl_reg_id) {
5744 /* skip over control regs of TPC's before the one we want.
5745 * then skip to the register in this tpc */
5746 inter_seg_offset = inter_seg_offset +
5747 (tpc_num * control_register_stride) +
5748 sm_dsm_perf_ctrl_reg_id;
5750 /* skip all the control registers */
5751 inter_seg_offset = inter_seg_offset +
5752 (num_tpcs * control_register_stride);
5754 /* skip the marker between control and counter segments */
5755 inter_seg_offset += marker_size;
5757 /* skip over counter regs of TPCs before the one we want */
5758 inter_seg_offset = inter_seg_offset +
5759 (tpc_num * perf_register_stride) *
5760 ctxsw_prog_extended_num_smpc_quadrants_v();
5762 /* skip over the register for the quadrants we do not want.
5763 * then skip to the register in this tpc */
5764 inter_seg_offset = inter_seg_offset +
5765 (perf_register_stride * quad) +
5769 /* set the offset to the segment offset plus the inter segment offset to reach the register we want */
5771 offset_to_segment += (inter_seg_offset * 4);
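/* inter_seg_offset has been accumulated in 32-bit words (the marker and
 * stride values above are word counts), hence the * 4 to convert it into a
 * byte offset within the extended segment. */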
5773 /* last sanity check: did we somehow compute an offset outside the
5774 * extended buffer? */
5775 if (offset_to_segment > offset_to_segment_end) {
5776 nvhost_err(dev_from_gk20a(g),
5777 "Overflow ctxsw buffer! 0x%08x > 0x%08x\n",
5778 offset_to_segment, offset_to_segment_end);
5782 *priv_offset = offset_to_segment;
5789 gr_gk20a_process_context_buffer_priv_segment(struct gk20a *g,
5790 int addr_type,/* enum ctxsw_addr_type */
5792 u32 gpc_num, u32 num_tpcs,
5793 u32 num_ppcs, u32 ppc_mask,
5797 u32 address, base_address;
5798 u32 sys_offset, gpc_offset, tpc_offset, ppc_offset;
5799 u32 ppc_num, tpc_num, tpc_addr, gpc_addr, ppc_addr;
5800 struct aiv_gk20a *reg;
5802 nvhost_dbg(dbg_fn | dbg_gpu_dbg, "pri_addr=0x%x", pri_addr);
5804 if (!g->gr.ctx_vars.valid)
5807 /* Process the SYS/BE segment. */
5808 if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
5809 (addr_type == CTXSW_ADDR_TYPE_BE)) {
5810 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.sys.count; i++) {
5811 reg = &g->gr.ctx_vars.ctxsw_regs.sys.l[i];
5812 address = reg->addr;
5813 sys_offset = reg->index;
5815 if (pri_addr == address) {
5816 *priv_offset = sys_offset;
5822 /* Process the TPC segment. */
5823 if (addr_type == CTXSW_ADDR_TYPE_TPC) {
5824 for (tpc_num = 0; tpc_num < num_tpcs; tpc_num++) {
5825 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.tpc.count; i++) {
5826 reg = &g->gr.ctx_vars.ctxsw_regs.tpc.l[i];
5827 address = reg->addr;
5828 tpc_addr = pri_tpccs_addr_mask(address);
5829 base_address = proj_gpc_base_v() +
5830 (gpc_num * proj_gpc_stride_v()) +
5831 proj_tpc_in_gpc_base_v() +
5832 (tpc_num * proj_tpc_in_gpc_stride_v());
5833 address = base_address + tpc_addr;
5835 /* The data for the TPCs is interleaved in the context buffer.
5836 * Example with num_tpcs = 2
5837 * 0 1 2 3 4 5 6 7 8 9 10 11 ...
5838 * 0-0 1-0 0-1 1-1 0-2 1-2 0-3 1-3 0-4 1-4 0-5 1-5 ... */
5840 tpc_offset = (reg->index * num_tpcs) + (tpc_num * 4);
5842 if (pri_addr == address) {
5843 *priv_offset = tpc_offset;
5850 /* Process the PPC segment. */
5851 if (addr_type == CTXSW_ADDR_TYPE_PPC) {
5852 for (ppc_num = 0; ppc_num < num_ppcs; ppc_num++) {
5853 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.ppc.count; i++) {
5854 reg = &g->gr.ctx_vars.ctxsw_regs.ppc.l[i];
5855 address = reg->addr;
5856 ppc_addr = pri_ppccs_addr_mask(address);
5857 base_address = proj_gpc_base_v() +
5858 (gpc_num * proj_gpc_stride_v()) +
5859 proj_ppc_in_gpc_base_v() +
5860 (ppc_num * proj_ppc_in_gpc_stride_v());
5861 address = base_address + ppc_addr;
5863 /* The data for the PPCs is interleaved in the context buffer.
5864 * Example with numPpcs = 2
5865 * 0 1 2 3 4 5 6 7 8 9 10 11 ...
5866 * 0-0 1-0 0-1 1-1 0-2 1-2 0-3 1-3 0-4 1-4 0-5 1-5 ... */
5868 ppc_offset = (reg->index * num_ppcs) + (ppc_num * 4);
5870 if (pri_addr == address) {
5871 *priv_offset = ppc_offset;
5879 /* Process the GPC segment. */
5880 if (addr_type == CTXSW_ADDR_TYPE_GPC) {
5881 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.gpc.count; i++) {
5882 reg = &g->gr.ctx_vars.ctxsw_regs.gpc.l[i];
5884 address = reg->addr;
5885 gpc_addr = pri_gpccs_addr_mask(address);
5886 gpc_offset = reg->index;
5888 base_address = proj_gpc_base_v() +
5889 (gpc_num * proj_gpc_stride_v());
5890 address = base_address + gpc_addr;
5892 if (pri_addr == address) {
5893 *priv_offset = gpc_offset;
5902 static int gr_gk20a_determine_ppc_configuration(struct gk20a *g,
5904 u32 *num_ppcs, u32 *ppc_mask,
5908 u32 litter_num_pes_per_gpc = proj_scal_litter_num_pes_per_gpc_v();
5911 /* if there is only 1 PES_PER_GPC, then we put the PES registers
5912 * in the GPC reglist, so we can't error out if ppc.count == 0 */
5914 if ((!g->gr.ctx_vars.valid) ||
5915 ((g->gr.ctx_vars.ctxsw_regs.ppc.count == 0) &&
5916 (litter_num_pes_per_gpc > 1)))
5919 data32 = mem_rd32(context + ctxsw_prog_local_image_ppc_info_o(), 0);
5921 *num_ppcs = ctxsw_prog_local_image_ppc_info_num_ppcs_v(data32);
5922 *ppc_mask = ctxsw_prog_local_image_ppc_info_ppc_mask_v(data32);
5924 *reg_ppc_count = g->gr.ctx_vars.ctxsw_regs.ppc.count;
5932 /* This function will return the 32-bit offset for a priv register if it is
5933 * present in the context buffer. */
5935 static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
5937 bool is_quad, u32 quad,
5938 u32 *context_buffer,
5939 u32 context_buffer_size,
5942 struct gr_gk20a *gr = &g->gr;
5945 int addr_type; /*enum ctxsw_addr_type */
5946 u32 broadcast_flags;
5947 u32 gpc_num, tpc_num, ppc_num, be_num;
5948 u32 num_gpcs, num_tpcs, num_ppcs;
5950 u32 sys_priv_offset, gpc_priv_offset;
5951 u32 ppc_mask, reg_list_ppc_count;
5953 u32 offset_to_segment;
5955 nvhost_dbg(dbg_fn | dbg_gpu_dbg, "addr=0x%x", addr);
5957 err = gr_gk20a_decode_priv_addr(g, addr, &addr_type,
5958 &gpc_num, &tpc_num, &ppc_num, &be_num,
5963 context = context_buffer;
5964 if (!check_main_image_header_magic(context)) {
5965 nvhost_err(dev_from_gk20a(g),
5966 "Invalid main header: magic value");
5969 num_gpcs = mem_rd32(context + ctxsw_prog_main_image_num_gpcs_o(), 0);
5971 /* Parse the FECS local header. */
5972 context += ctxsw_prog_ucode_header_size_in_bytes();
5973 if (!check_local_header_magic(context)) {
5974 nvhost_err(dev_from_gk20a(g),
5975 "Invalid FECS local header: magic value\n");
5978 data32 = mem_rd32(context + ctxsw_prog_local_priv_register_ctl_o(), 0);
5979 sys_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32);
5981 /* If found in Ext buffer, ok.
5982 * If it failed and we expected to find it there (quad offset)
5983 * then return the error. Otherwise continue on. */
5985 err = gr_gk20a_find_priv_offset_in_ext_buffer(g,
5986 addr, is_quad, quad, context_buffer,
5987 context_buffer_size, priv_offset);
5988 if (!err || (err && is_quad))
5991 if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
5992 (addr_type == CTXSW_ADDR_TYPE_BE)) {
5993 /* Find the offset in the FECS segment. */
5994 offset_to_segment = sys_priv_offset *
5995 ctxsw_prog_ucode_header_size_in_bytes();
5997 err = gr_gk20a_process_context_buffer_priv_segment(g,
6004 *priv_offset = (offset_to_segment + offset);
6008 if ((gpc_num + 1) > num_gpcs) {
6009 nvhost_err(dev_from_gk20a(g),
6010 "GPC %d not in this context buffer.\n",
6015 /* Parse the GPCCS local header(s).*/
6016 for (i = 0; i < num_gpcs; i++) {
6017 context += ctxsw_prog_ucode_header_size_in_bytes();
6018 if (!check_local_header_magic(context)) {
6019 nvhost_err(dev_from_gk20a(g),
6020 "Invalid GPCCS local header: magic value\n");
6024 data32 = mem_rd32(context + ctxsw_prog_local_priv_register_ctl_o(), 0);
6025 gpc_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32);
6027 err = gr_gk20a_determine_ppc_configuration(g, context,
6028 &num_ppcs, &ppc_mask,
6029 ®_list_ppc_count);
6033 num_tpcs = mem_rd32(context + ctxsw_prog_local_image_num_tpcs_o(), 0);
6035 if ((i == gpc_num) && ((tpc_num + 1) > num_tpcs)) {
6036 nvhost_err(dev_from_gk20a(g),
6037 "GPC %d TPC %d not in this context buffer.\n",
6042 /* Find the offset in the GPCCS segment.*/
6044 offset_to_segment = gpc_priv_offset *
6045 ctxsw_prog_ucode_header_size_in_bytes();
6047 if (addr_type == CTXSW_ADDR_TYPE_TPC) {
6048 /*reg = gr->ctx_vars.ctxsw_regs.tpc.l;*/
6049 } else if (addr_type == CTXSW_ADDR_TYPE_PPC) {
6050 /* The ucode stores TPC data before PPC data.
6051 * Advance offset past TPC data to PPC data. */
6052 offset_to_segment +=
6053 ((gr->ctx_vars.ctxsw_regs.tpc.count *
6055 } else if (addr_type == CTXSW_ADDR_TYPE_GPC) {
6056 /* The ucode stores TPC/PPC data before GPC data.
6057 * Advance offset past TPC/PPC data to GPC data. */
6058 /* note 1 PES_PER_GPC case */
6059 u32 litter_num_pes_per_gpc =
6060 proj_scal_litter_num_pes_per_gpc_v();
6061 if (litter_num_pes_per_gpc > 1) {
6062 offset_to_segment +=
6063 (((gr->ctx_vars.ctxsw_regs.tpc.count *
6065 ((reg_list_ppc_count * num_ppcs) << 2));
6067 offset_to_segment +=
6068 ((gr->ctx_vars.ctxsw_regs.tpc.count *
6072 nvhost_err(dev_from_gk20a(g),
6073 " Unknown address type.\n");
6076 err = gr_gk20a_process_context_buffer_priv_segment(g,
6084 *priv_offset = offset_to_segment + offset;
6093 int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch,
6094 struct nvhost_dbg_gpu_reg_op *ctx_ops, u32 num_ops,
6095 u32 num_ctx_wr_ops, u32 num_ctx_rd_ops)
6097 struct gk20a *g = ch->g;
6098 struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx;
6099 void *ctx_ptr = NULL;
6100 int curr_gr_chid, curr_gr_ctx;
6101 bool ch_is_curr_ctx, restart_gr_ctxsw = false;
6102 u32 i, j, offset, v;
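/* worst case: one returned offset per TPC in every GPC */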
6103 u32 max_offsets = proj_scal_litter_num_gpcs_v() *
6104 proj_scal_litter_num_tpc_per_gpc_v();
6105 u32 *offsets = NULL;
6106 u32 *offset_addrs = NULL;
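/* index 0 counts the write ops, index 1 the read ops */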
6107 u32 ctx_op_nr, num_ctx_ops[2] = {num_ctx_wr_ops, num_ctx_rd_ops};
6110 nvhost_dbg(dbg_fn | dbg_gpu_dbg, "wr_ops=%d rd_ops=%d",
6111 num_ctx_wr_ops, num_ctx_rd_ops);
6113 /* disable channel switching; once ctxsw is stopped the hardware state
6114 * can be inspected to determine if the context we're interested in
6115 * is current */
6117 err = gr_gk20a_disable_ctxsw(g);
6119 nvhost_err(dev_from_gk20a(g), "unable to stop gr ctxsw");
6120 /* this should probably be ctx-fatal... */
6124 restart_gr_ctxsw = true;
6126 curr_gr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r());
6127 curr_gr_chid = gk20a_gr_get_chid_from_ctx(g, curr_gr_ctx);
6128 ch_is_curr_ctx = (curr_gr_chid != -1) && (ch->hw_chid == curr_gr_chid);
6130 nvhost_dbg(dbg_fn | dbg_gpu_dbg, "is curr ctx=%d", ch_is_curr_ctx);
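/* if the channel's context is resident on the graphics engine, the
* register ops are applied directly to the hardware registers;
* otherwise they are patched into the context image in memory below */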
6131 if (ch_is_curr_ctx) {
6132 for (pass = 0; pass < 2; pass++) {
6134 for (i = 0; (ctx_op_nr < num_ctx_ops[pass]) && (i < num_ops); ++i) {
6135 /* only do ctx ops and only on the right pass */
6136 if ((ctx_ops[i].type == REGOP(TYPE_GLOBAL)) ||
6137 (((pass == 0) && reg_op_is_read(ctx_ops[i].op)) ||
6138 ((pass == 1) && !reg_op_is_read(ctx_ops[i].op))))
6141 /* if this is a quad access, set up for special access */
6142 if (ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD))
6143 gr_gk20a_access_smpc_reg(g, ctx_ops[i].quad,
6145 offset = ctx_ops[i].offset;
6147 if (pass == 0) { /* write pass */
6148 v = gk20a_readl(g, offset);
6149 v &= ~ctx_ops[i].and_n_mask_lo;
6150 v |= ctx_ops[i].value_lo;
6151 gk20a_writel(g, offset, v);
6153 nvhost_dbg(dbg_gpu_dbg,
6154 "direct wr: offset=0x%x v=0x%x",
6157 if (ctx_ops[i].op == REGOP(WRITE_64)) {
6158 v = gk20a_readl(g, offset + 4);
6159 v &= ~ctx_ops[i].and_n_mask_hi;
6160 v |= ctx_ops[i].value_hi;
6161 gk20a_writel(g, offset + 4, v);
6163 nvhost_dbg(dbg_gpu_dbg,
6164 "direct wr: offset=0x%x v=0x%x",
6168 } else { /* read pass */
6169 ctx_ops[i].value_lo =
6170 gk20a_readl(g, offset);
6172 nvhost_dbg(dbg_gpu_dbg,
6173 "direct rd: offset=0x%x v=0x%x",
6174 offset, ctx_ops[i].value_lo);
6176 if (ctx_ops[i].op == REGOP(READ_64)) {
6177 ctx_ops[i].value_hi =
6178 gk20a_readl(g, offset + 4);
6180 nvhost_dbg(dbg_gpu_dbg,
6181 "direct rd: offset=0x%x v=0x%x",
6182 offset + 4, ctx_ops[i].value_hi);
6184 ctx_ops[i].value_hi = 0;
6192 /* they're the same size, so just use one alloc for both */
6193 offsets = kzalloc(2 * sizeof(u32) * max_offsets, GFP_KERNEL);
6198 offset_addrs = offsets + max_offsets;
6200 /* would have been a variant of gr_gk20a_apply_instmem_overrides,
6201 * recoded in-place instead */
6202 ctx_ptr = nvhost_memmgr_mmap(ch_ctx->gr_ctx.mem.ref);
6209 /* Channel gr_ctx buffer is gpu cacheable; so flush and invalidate.
6210 * There should be no on-going/in-flight references by the gpu now. */
6211 gk20a_mm_fb_flush(g);
6212 gk20a_mm_l2_flush(g, true);
6214 /* write to appropriate place in context image,
6215 * first have to figure out where that really is */
6217 /* first pass is writes, second reads */
6218 for (pass = 0; pass < 2; pass++) {
6220 for (i = 0; (ctx_op_nr < num_ctx_ops[pass]) && (i < num_ops); ++i) {
6223 /* only do ctx ops and only on the right pass */
6224 if ((ctx_ops[i].type == REGOP(TYPE_GLOBAL)) ||
6225 (((pass == 0) && reg_op_is_read(ctx_ops[i].op)) ||
6226 ((pass == 1) && !reg_op_is_read(ctx_ops[i].op))))
6229 err = gr_gk20a_get_ctx_buffer_offsets(g,
6232 offsets, offset_addrs,
6234 ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD),
6237 nvhost_dbg(dbg_gpu_dbg,
6238 "ctx op invalid offset: offset=0x%x",
6241 NVHOST_DBG_GPU_REG_OP_STATUS_INVALID_OFFSET;
6245 /* if this is a quad access, set up for special access */
6246 if (ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD))
6247 gr_gk20a_access_smpc_reg(g, ctx_ops[i].quad,
6250 for (j = 0; j < num_offsets; j++) {
6251 /* worst-case sanity check: don't write outside the golden context image */
6252 if (offsets[j] >= g->gr.ctx_vars.golden_image_size)
6254 if (pass == 0) { /* write pass */
6255 v = mem_rd32(ctx_ptr + offsets[j], 0);
6256 v &= ~ctx_ops[i].and_n_mask_lo;
6257 v |= ctx_ops[i].value_lo;
6258 mem_wr32(ctx_ptr + offsets[j], 0, v);
6260 nvhost_dbg(dbg_gpu_dbg,
6261 "context wr: offset=0x%x v=0x%x",
6264 if (ctx_ops[i].op == REGOP(WRITE_64)) {
6265 v = mem_rd32(ctx_ptr + offsets[j] + 4, 0);
6266 v &= ~ctx_ops[i].and_n_mask_hi;
6267 v |= ctx_ops[i].value_hi;
6268 mem_wr32(ctx_ptr + offsets[j] + 4, 0, v);
6270 nvhost_dbg(dbg_gpu_dbg,
6271 "context wr: offset=0x%x v=0x%x",
6275 /* check to see if we need to add a special WAR
6276 * for some of the SMPC perf regs */
6277 gr_gk20a_ctx_patch_smpc(g, ch_ctx, offset_addrs[j],
6280 } else { /* read pass */
6281 ctx_ops[i].value_lo =
6282 mem_rd32(ctx_ptr + offsets[0], 0);
6284 nvhost_dbg(dbg_gpu_dbg, "context rd: offset=0x%x v=0x%x",
6285 offsets[0], ctx_ops[i].value_lo);
6287 if (ctx_ops[i].op == REGOP(READ_64)) {
6288 ctx_ops[i].value_hi =
6289 mem_rd32(ctx_ptr + offsets[0] + 4, 0);
6291 nvhost_dbg(dbg_gpu_dbg,
6292 "context rd: offset=0x%x v=0x%x",
6293 offsets[0] + 4, ctx_ops[i].value_hi);
6295 ctx_ops[i].value_hi = 0;
6302 /* flush cpu caches for the ctx buffer? only if cpu cached, of course.
6303 * they aren't, yet */
6305 FLUSH_CPU_DCACHE(ctx_ptr,
6306 sg_phys(ch_ctx->gr_ctx.mem.ref), size);
6315 nvhost_memmgr_munmap(ch_ctx->gr_ctx.mem.ref, ctx_ptr);
6317 if (restart_gr_ctxsw) {
6318 int tmp_err = gr_gk20a_enable_ctxsw(g);
6320 nvhost_err(dev_from_gk20a(g), "unable to restart ctxsw!\n");