gpu: nvgpu: gk20a: always map ptes for 64 bit arch
[linux-3.10.git] drivers/gpu/nvgpu/gk20a/mm_gk20a.c
1 /*
2  * drivers/video/tegra/host/gk20a/mm_gk20a.c
3  *
4  * GK20A memory management
5  *
6  * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
7  *
8  * This program is free software; you can redistribute it and/or modify it
9  * under the terms and conditions of the GNU General Public License,
10  * version 2, as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope it will be useful, but WITHOUT
13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
15  * more details.
16  *
17  * You should have received a copy of the GNU General Public License along with
18  * this program; if not, write to the Free Software Foundation, Inc.,
19  * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
20  */
21
22 #include <linux/delay.h>
23 #include <linux/highmem.h>
24 #include <linux/log2.h>
25 #include <linux/nvhost.h>
26 #include <linux/pm_runtime.h>
27 #include <linux/scatterlist.h>
28 #include <linux/nvmap.h>
29 #include <linux/tegra-soc.h>
30 #include <linux/vmalloc.h>
31 #include <linux/dma-buf.h>
32 #include <asm/cacheflush.h>
33
34 #include "gk20a.h"
35 #include "mm_gk20a.h"
36 #include "hw_gmmu_gk20a.h"
37 #include "hw_fb_gk20a.h"
38 #include "hw_bus_gk20a.h"
39 #include "hw_ram_gk20a.h"
40 #include "hw_mc_gk20a.h"
41 #include "hw_flush_gk20a.h"
42 #include "hw_ltc_gk20a.h"
43
44 #include "kind_gk20a.h"
45
46 #ifdef CONFIG_ARM64
47 #define outer_flush_range(a, b)
48 #define __cpuc_flush_dcache_area __flush_dcache_area
49 #endif
50
51 /*
52  * GPU mapping life cycle
53  * ======================
54  *
55  * Kernel mappings
56  * ---------------
57  *
58  * Kernel mappings are created through vm.map(..., false):
59  *
60  *  - Mappings to the same allocations are reused and refcounted.
61  *  - This path does not support deferred unmapping (i.e. kernel must wait for
62  *    all hw operations on the buffer to complete before unmapping).
63  *  - References to dmabuf are owned and managed by the (kernel) clients of
64  *    the gk20a_vm layer.
65  *
66  *
67  * User space mappings
68  * -------------------
69  *
70  * User space mappings are created through as.map_buffer -> vm.map(..., true):
71  *
72  *  - Mappings to the same allocations are reused and refcounted.
73  *  - This path supports deferred unmapping (i.e. we delay the actual unmapping
74  *    until all hw operations have completed).
75  *  - References to dmabuf are owned and managed by the vm_gk20a
76  *    layer itself. vm.map acquires these refs, and sets
77  *    mapped_buffer->own_mem_ref to record that we must release the refs when we
78  *    actually unmap.
79  *
80  */
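/*
 * A minimal usage sketch of the kernel path (illustrative only, kept under
 * #if 0; assumes the caller already holds a dmabuf reference and a valid
 * vm_gk20a, and uses 0 for kind/flags/rw_flag purely as placeholders):
 */
#if 0
static u64 example_kernel_map(struct vm_gk20a *vm, struct dma_buf *dmabuf)
{
	return gk20a_vm_map(vm, dmabuf,
			    0,     /* offset_align: let the vm choose the va */
			    0,     /* flags */
			    0,     /* kind */
			    NULL,  /* sgt out-param not needed */
			    false, /* user_mapped: this is the kernel path */
			    0      /* rw_flag */);
}
#endif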
81
82 static inline int vm_aspace_id(struct vm_gk20a *vm)
83 {
84         /* -1 is bar1 or pmu, etc. */
85         return vm->as_share ? vm->as_share->id : -1;
86 }
87 static inline u32 hi32(u64 f)
88 {
89         return (u32)(f >> 32);
90 }
91 static inline u32 lo32(u64 f)
92 {
93         return (u32)(f & 0xffffffff);
94 }
95
96 #define FLUSH_CPU_DCACHE(va, pa, size)  \
97         do {    \
98                 __cpuc_flush_dcache_area((void *)(va), (size_t)(size)); \
99                 outer_flush_range(pa, pa + (size_t)(size));             \
100         } while (0)
101
102 static void gk20a_vm_unmap_locked(struct mapped_buffer_node *mapped_buffer);
103 static struct mapped_buffer_node *find_mapped_buffer_locked(
104                                         struct rb_root *root, u64 addr);
105 static struct mapped_buffer_node *find_mapped_buffer_reverse_locked(
106                                 struct rb_root *root, struct dma_buf *dmabuf,
107                                 u32 kind);
108 static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
109                                    enum gmmu_pgsz_gk20a pgsz_idx,
110                                    struct sg_table *sgt,
111                                    u64 first_vaddr, u64 last_vaddr,
112                                    u8 kind_v, u32 ctag_offset, bool cacheable,
113                                    int rw_flag);
114 static void update_gmmu_pde_locked(struct vm_gk20a *vm, u32 i);
115 static void gk20a_vm_remove_support(struct vm_gk20a *vm);
116
117
118 /* note: keep the page sizes sorted lowest to highest here */
119 static const u32 gmmu_page_sizes[gmmu_nr_page_sizes] = { SZ_4K, SZ_128K };
120 static const u32 gmmu_page_shifts[gmmu_nr_page_sizes] = { 12, 17 };
121 static const u64 gmmu_page_offset_masks[gmmu_nr_page_sizes] = { 0xfffLL,
122                                                                 0x1ffffLL };
123 static const u64 gmmu_page_masks[gmmu_nr_page_sizes] = { ~0xfffLL, ~0x1ffffLL };
124
125 struct gk20a_comptags {
126         u32 offset;
127         u32 lines;
128 };
129
130 struct gk20a_dmabuf_priv {
131         struct mutex lock;
132
133         struct gk20a_allocator *comptag_allocator;
134         struct gk20a_comptags comptags;
135
136         struct dma_buf_attachment *attach;
137         struct sg_table *sgt;
138
139         int pin_count;
140 };
141
142 static void gk20a_mm_delete_priv(void *_priv)
143 {
144         struct gk20a_dmabuf_priv *priv = _priv;
145         if (!priv)
146                 return;
147
148         if (priv->comptags.lines) {
149                 BUG_ON(!priv->comptag_allocator);
150                 priv->comptag_allocator->free(priv->comptag_allocator,
151                                               priv->comptags.offset,
152                                               priv->comptags.lines);
153         }
154
155         kfree(priv);
156 }
157
158 struct sg_table *gk20a_mm_pin(struct device *dev, struct dma_buf *dmabuf)
159 {
160         struct gk20a_dmabuf_priv *priv;
161
162         priv = dma_buf_get_drvdata(dmabuf, dev);
163         if (WARN_ON(!priv))
164                 return ERR_PTR(-EINVAL);
165
166         mutex_lock(&priv->lock);
167
168         if (priv->pin_count == 0) {
169                 priv->attach = dma_buf_attach(dmabuf, dev);
170                 if (IS_ERR(priv->attach)) {
171                         mutex_unlock(&priv->lock);
172                         return ERR_CAST(priv->attach);
173                 }
174
175                 priv->sgt = dma_buf_map_attachment(priv->attach,
176                                                    DMA_BIDIRECTIONAL);
177                 if (IS_ERR(priv->sgt)) {
178                         dma_buf_detach(dmabuf, priv->attach);
179                         mutex_unlock(&priv->lock);
180                         return priv->sgt;
181                 }
182         }
183
184         priv->pin_count++;
185         mutex_unlock(&priv->lock);
186         return priv->sgt;
187 }
188
189 void gk20a_mm_unpin(struct device *dev, struct dma_buf *dmabuf,
190                     struct sg_table *sgt)
191 {
192         struct gk20a_dmabuf_priv *priv = dma_buf_get_drvdata(dmabuf, dev);
193         dma_addr_t dma_addr;
194
195         if (IS_ERR(priv) || !priv)
196                 return;
197
198         mutex_lock(&priv->lock);
199         WARN_ON(priv->sgt != sgt);
200         priv->pin_count--;
201         WARN_ON(priv->pin_count < 0);
202         dma_addr = sg_dma_address(priv->sgt->sgl);
203         if (priv->pin_count == 0) {
204                 dma_buf_unmap_attachment(priv->attach, priv->sgt,
205                                          DMA_BIDIRECTIONAL);
206                 dma_buf_detach(dmabuf, priv->attach);
207         }
208         mutex_unlock(&priv->lock);
209 }
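/*
 * Pin/unpin are refcounted per dmabuf: the first pin attaches and maps the
 * attachment, the last unpin unmaps and detaches. A sketch of the expected
 * pairing (illustrative only; 'dev' and 'dmabuf' are placeholders):
 *
 *	struct sg_table *sgt = gk20a_mm_pin(dev, dmabuf);
 *	if (IS_ERR(sgt))
 *		return PTR_ERR(sgt);
 *	... program ptes from sgt ...
 *	gk20a_mm_unpin(dev, dmabuf, sgt);
 */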
210
211
212 static void gk20a_get_comptags(struct device *dev,
213                                struct dma_buf *dmabuf,
214                                struct gk20a_comptags *comptags)
215 {
216         struct gk20a_dmabuf_priv *priv = dma_buf_get_drvdata(dmabuf, dev);
217
218         if (!comptags)
219                 return;
220
221         if (!priv) {
222                 comptags->lines = 0;
223                 comptags->offset = 0;
224                 return;
225         }
226
227         *comptags = priv->comptags;
228 }
229
230 static int gk20a_alloc_comptags(struct device *dev,
231                                 struct dma_buf *dmabuf,
232                                 struct gk20a_allocator *allocator,
233                                 int lines)
234 {
235         struct gk20a_dmabuf_priv *priv = dma_buf_get_drvdata(dmabuf, dev);
236         u32 offset = 0;
237         int err;
238
239         if (!priv)
240                 return -ENOSYS;
241
242         if (!lines)
243                 return -EINVAL;
244
245         /* store the allocator so we can use it when we free the ctags */
246         priv->comptag_allocator = allocator;
247         err = allocator->alloc(allocator, &offset, lines);
248         if (!err) {
249                 priv->comptags.lines = lines;
250                 priv->comptags.offset = offset;
251         }
252         return err;
253 }
254
255
256
257
258 static int gk20a_init_mm_reset_enable_hw(struct gk20a *g)
259 {
260         gk20a_dbg_fn("");
261         if (g->ops.fb.reset)
262                 g->ops.fb.reset(g);
263
264         if (g->ops.fb.init_fs_state)
265                 g->ops.fb.init_fs_state(g);
266
267         return 0;
268 }
269
270 void gk20a_remove_mm_support(struct mm_gk20a *mm)
271 {
272         struct gk20a *g = mm->g;
273         struct device *d = dev_from_gk20a(g);
274         struct vm_gk20a *vm = &mm->bar1.vm;
275         struct inst_desc *inst_block = &mm->bar1.inst_block;
276
277         gk20a_dbg_fn("");
278
279         if (inst_block->cpuva)
280                 dma_free_coherent(d, inst_block->size,
281                         inst_block->cpuva, inst_block->iova);
282         inst_block->cpuva = NULL;
283         inst_block->iova = 0;
284
285         gk20a_vm_remove_support(vm);
286 }
287
288 int gk20a_init_mm_setup_sw(struct gk20a *g)
289 {
290         struct mm_gk20a *mm = &g->mm;
291         int i;
292
293         gk20a_dbg_fn("");
294
295         if (mm->sw_ready) {
296                 gk20a_dbg_fn("skip init");
297                 return 0;
298         }
299
300         mm->g = g;
301         mutex_init(&mm->tlb_lock);
302         mutex_init(&mm->l2_op_lock);
303         mm->big_page_size = gmmu_page_sizes[gmmu_page_size_big];
304         mm->compression_page_size = gmmu_page_sizes[gmmu_page_size_big];
305         mm->pde_stride    = mm->big_page_size << 10;
306         mm->pde_stride_shift = ilog2(mm->pde_stride);
307         BUG_ON(mm->pde_stride_shift > 31); /* we have assumptions about this */
308
309         for (i = 0; i < ARRAY_SIZE(gmmu_page_sizes); i++) {
310
311                 u32 num_ptes, pte_space, num_pages;
312
313                 /* assuming "full" page tables */
314                 num_ptes = mm->pde_stride / gmmu_page_sizes[i];
315
316                 pte_space = num_ptes * gmmu_pte__size_v();
317                 /* allocate whole pages */
318                 pte_space = roundup(pte_space, PAGE_SIZE);
319
320                 num_pages = pte_space / PAGE_SIZE;
321                 /* make sure "order" is viable */
322                 BUG_ON(!is_power_of_2(num_pages));
323
324                 mm->page_table_sizing[i].num_ptes = num_ptes;
325                 mm->page_table_sizing[i].order = ilog2(num_pages);
326         }
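        /*
         * For example, with the defaults above (128K big pages, so
         * pde_stride = 128K << 10 = 128MB) and 8B ptes:
         *   small (4K)  : 128MB / 4K   = 32768 ptes -> 256KB -> 64 pages, order 6
         *   big   (128K): 128MB / 128K =  1024 ptes ->   8KB ->  2 pages, order 1
         */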
327
328         /*TBD: make channel vm size configurable */
329         mm->channel.size = 1ULL << NV_GMMU_VA_RANGE;
330
331         gk20a_dbg_info("channel vm size: %dMB", (int)(mm->channel.size >> 20));
332
333         gk20a_dbg_info("small page-size (%dKB) pte array: %dKB",
334                         gmmu_page_sizes[gmmu_page_size_small] >> 10,
335                         (mm->page_table_sizing[gmmu_page_size_small].num_ptes *
336                          gmmu_pte__size_v()) >> 10);
337
338         gk20a_dbg_info("big page-size (%dKB) pte array: %dKB",
339                         gmmu_page_sizes[gmmu_page_size_big] >> 10,
340                         (mm->page_table_sizing[gmmu_page_size_big].num_ptes *
341                          gmmu_pte__size_v()) >> 10);
342
343
344         gk20a_init_bar1_vm(mm);
345
346         mm->remove_support = gk20a_remove_mm_support;
347         mm->sw_ready = true;
348
349         gk20a_dbg_fn("done");
350         return 0;
351 }
352
353 /* make sure gk20a_init_mm_support is called before */
354 static int gk20a_init_mm_setup_hw(struct gk20a *g)
355 {
356         struct mm_gk20a *mm = &g->mm;
357         struct inst_desc *inst_block = &mm->bar1.inst_block;
358         phys_addr_t inst_pa = inst_block->cpu_pa;
359
360         gk20a_dbg_fn("");
361
362         /* set large page size in fb
363          * note this is very early on, can we defer it ? */
364         {
365                 u32 fb_mmu_ctrl = gk20a_readl(g, fb_mmu_ctrl_r());
366
367                 if (gmmu_page_sizes[gmmu_page_size_big] == SZ_128K)
368                         fb_mmu_ctrl = (fb_mmu_ctrl &
369                                        ~fb_mmu_ctrl_vm_pg_size_f(~0x0)) |
370                                 fb_mmu_ctrl_vm_pg_size_128kb_f();
371                 else
372                         BUG_ON(1); /* no support/testing for larger ones yet */
373
374                 gk20a_writel(g, fb_mmu_ctrl_r(), fb_mmu_ctrl);
375         }
376
377         inst_pa = (u32)(inst_pa >> bar1_instance_block_shift_gk20a());
378         gk20a_dbg_info("bar1 inst block ptr: 0x%08x",  (u32)inst_pa);
379
380         gk20a_writel(g, bus_bar1_block_r(),
381                      bus_bar1_block_target_vid_mem_f() |
382                      bus_bar1_block_mode_virtual_f() |
383                      bus_bar1_block_ptr_f(inst_pa));
384         if (gk20a_mm_fb_flush(g) || gk20a_mm_fb_flush(g))
385                 return -EBUSY;
386
387         gk20a_dbg_fn("done");
388         return 0;
389 }
390
391 int gk20a_init_mm_support(struct gk20a *g)
392 {
393         u32 err;
394
395         err = gk20a_init_mm_reset_enable_hw(g);
396         if (err)
397                 return err;
398
399         err = gk20a_init_mm_setup_sw(g);
400         if (err)
401                 return err;
402
403         err = gk20a_init_mm_setup_hw(g);
404         if (err)
405                 return err;
406
407         return err;
408 }
409
410 #ifdef CONFIG_GK20A_PHYS_PAGE_TABLES
411 static int alloc_gmmu_pages(struct vm_gk20a *vm, u32 order,
412                             void **handle,
413                             struct sg_table **sgt,
414                             size_t *size)
415 {
416         u32 num_pages = 1 << order;
417         u32 len = num_pages * PAGE_SIZE;
418         int err;
419         struct page *pages;
420
421         gk20a_dbg_fn("");
422
423         pages = alloc_pages(GFP_KERNEL, order);
424         if (!pages) {
425                 gk20a_dbg(gpu_dbg_pte, "alloc_pages failed\n");
426                 goto err_out;
427         }
428         *sgt = kzalloc(sizeof(**sgt), GFP_KERNEL);
429         if (!(*sgt)) {
430                 gk20a_dbg(gpu_dbg_pte, "cannot allocate sg table");
431                 goto err_alloced;
432         }
433         err = sg_alloc_table(*sgt, 1, GFP_KERNEL);
434         if (err) {
435                 gk20a_dbg(gpu_dbg_pte, "sg_alloc_table failed\n");
436                 goto err_sg_table;
437         }
438         sg_set_page((*sgt)->sgl, pages, len, 0);
439         *handle = page_address(pages);
440         memset(*handle, 0, len);
441         *size = len;
442         FLUSH_CPU_DCACHE(*handle, sg_phys((*sgt)->sgl), len);
443
444         return 0;
445
446 err_sg_table:
447         kfree(*sgt);
448 err_alloced:
449         __free_pages(pages, order);
450 err_out:
451         return -ENOMEM;
452 }
453
454 static void free_gmmu_pages(struct vm_gk20a *vm, void *handle,
455                             struct sg_table *sgt, u32 order,
456                             size_t size)
457 {
458         gk20a_dbg_fn("");
459         BUG_ON(sgt == NULL);
460         free_pages((unsigned long)handle, order);
461         sg_free_table(sgt);
462         kfree(sgt);
463 }
464
465 static int map_gmmu_pages(void *handle, struct sg_table *sgt,
466                           void **va, size_t size)
467 {
468         FLUSH_CPU_DCACHE(handle, sg_phys(sgt->sgl), sgt->sgl->length);
469         *va = handle;
470         return 0;
471 }
472
473 static void unmap_gmmu_pages(void *handle, struct sg_table *sgt, void *va)
474 {
475         FLUSH_CPU_DCACHE(handle, sg_phys(sgt->sgl), sgt->sgl->length);
476 }
477 #else
478 /* APIs for 64 bit arch */
479 static int __alloc_gmmu_pages(struct vm_gk20a *vm, u32 order,
480                             void **handle,
481                             struct sg_table **sgt,
482                             size_t *size);
483 static void __free_gmmu_pages(struct vm_gk20a *vm, void *handle,
484                             struct sg_table *sgt, u32 order,
485                             size_t size);
486 static int __map_gmmu_pages(void *handle, struct sg_table *sgt,
487                           void **kva, size_t size);
488 static void __unmap_gmmu_pages(void *handle, struct sg_table *sgt, void *va);
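/*
 * On 64-bit builds (CONFIG_ARM64) the __ variants below are used
 * unconditionally: page tables are allocated with dma_zalloc_coherent(), so
 * a kernel mapping always exists and mapping/unmapping the tables is trivial.
 * The 32-bit path instead allocates with DMA_ATTR_NO_KERNEL_MAPPING and
 * vmap()/vunmap()s the pages only while they are being updated.
 */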
489
490 static int alloc_gmmu_pages(struct vm_gk20a *vm, u32 order,
491                             void **handle,
492                             struct sg_table **sgt,
493                             size_t *size)
494 {
495         struct device *d = dev_from_vm(vm);
496         u32 num_pages = 1 << order;
497         u32 len = num_pages * PAGE_SIZE;
498         dma_addr_t iova;
499         DEFINE_DMA_ATTRS(attrs);
500         struct page **pages;
501         int err = 0;
502
503         gk20a_dbg_fn("");
504
505         if (IS_ENABLED(CONFIG_ARM64))
506                 return __alloc_gmmu_pages(vm, order, handle, sgt, size);
507
508         *size = len;
509         dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
510         pages = dma_alloc_attrs(d, len, &iova, GFP_KERNEL, &attrs);
511         if (!pages) {
512                 gk20a_err(d, "memory allocation failed\n");
513                 goto err_out;
514         }
515
516         err = gk20a_get_sgtable_from_pages(d, sgt, pages,
517                                 iova, len);
518         if (err) {
519                 gk20a_err(d, "sgt allocation failed\n");
520                 goto err_free;
521         }
522
523         *handle = (void *)pages;
524
525         return 0;
526
527 err_free:
528         dma_free_attrs(d, len, pages, iova, &attrs);
529         pages = NULL;
530         iova = 0;
531 err_out:
532         return -ENOMEM;
533 }
534
535 static void free_gmmu_pages(struct vm_gk20a *vm, void *handle,
536                             struct sg_table *sgt, u32 order,
537                             size_t size)
538 {
539         struct device *d = dev_from_vm(vm);
540         u64 iova;
541         DEFINE_DMA_ATTRS(attrs);
542         struct page **pages = (struct page **)handle;
543
544         gk20a_dbg_fn("");
545         BUG_ON(sgt == NULL);
546
547         if (IS_ENABLED(CONFIG_ARM64)) {
548                 __free_gmmu_pages(vm, handle, sgt, order, size);
549                 return;
550         }
551
552         iova = sg_dma_address(sgt->sgl);
553
554         gk20a_free_sgtable(&sgt);
555
556         dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
557         dma_free_attrs(d, size, pages, iova, &attrs);
558         pages = NULL;
559         iova = 0;
560 }
561
562 static int map_gmmu_pages(void *handle, struct sg_table *sgt,
563                           void **kva, size_t size)
564 {
565         int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
566         struct page **pages = (struct page **)handle;
567         gk20a_dbg_fn("");
568
569         if (IS_ENABLED(CONFIG_ARM64))
570                 return __map_gmmu_pages(handle, sgt, kva, size);
571
572         *kva = vmap(pages, count, 0, pgprot_dmacoherent(PAGE_KERNEL));
573         if (!(*kva))
574                 return -ENOMEM;
575
576         return 0;
577 }
578
579 static void unmap_gmmu_pages(void *handle, struct sg_table *sgt, void *va)
580 {
581         gk20a_dbg_fn("");
582
583         if (IS_ENABLED(CONFIG_ARM64)) {
584                 __unmap_gmmu_pages(handle, sgt, va);
585                 return;
586         }
587
588         vunmap(va);
589 }
590
591 static int __alloc_gmmu_pages(struct vm_gk20a *vm, u32 order,
592                             void **handle,
593                             struct sg_table **sgt,
594                             size_t *size)
595 {
596         struct device *d = dev_from_vm(vm);
597         u32 num_pages = 1 << order;
598         u32 len = num_pages * PAGE_SIZE;
599         dma_addr_t iova;
600         void *cpuva;
601         int err = 0;
602
603         *size = len;
604         cpuva = dma_zalloc_coherent(d, len, &iova, GFP_KERNEL);
605         if (!cpuva) {
606                 gk20a_err(d, "memory allocation failed\n");
607                 goto err_out;
608         }
609
610         err = gk20a_get_sgtable(d, sgt, cpuva, iova, len);
611         if (err) {
612                 gk20a_err(d, "sgt allocation failed\n");
613                 goto err_free;
614         }
615
616         *handle = cpuva;
617
618         return 0;
619
620 err_free:
621         dma_free_coherent(d, len, cpuva, iova);
622         cpuva = NULL;
623         iova = 0;
624 err_out:
625         return -ENOMEM;
626 }
627
628 static void __free_gmmu_pages(struct vm_gk20a *vm, void *handle,
629                             struct sg_table *sgt, u32 order,
630                             size_t size)
631 {
632         struct device *d = dev_from_vm(vm);
633         u64 iova;
634
635         iova = sg_dma_address(sgt->sgl);
636
637         gk20a_free_sgtable(&sgt);
638
639         dma_free_coherent(d, size, handle, iova);
640         handle = NULL;
641         iova = 0;
642 }
643
644 static int __map_gmmu_pages(void *handle, struct sg_table *sgt,
645                           void **kva, size_t size)
646 {
647         *kva = handle;
648         return 0;
649 }
650
651 static void __unmap_gmmu_pages(void *handle, struct sg_table *sgt, void *va)
652 {
653         gk20a_dbg_fn("");
654 }
655 #endif
656
657 /* allocate a phys contig region big enough for a full
658  * sized gmmu page table for the given gmmu_page_size.
659  * the whole range is zeroed so it's "invalid"/will fault
660  */
661
662 static int zalloc_gmmu_page_table_gk20a(struct vm_gk20a *vm,
663                                         enum gmmu_pgsz_gk20a gmmu_pgsz_idx,
664                                         struct page_table_gk20a *pte)
665 {
666         int err;
667         u32 pte_order;
668         void *handle = NULL;
669         struct sg_table *sgt;
670         size_t size;
671
672         gk20a_dbg_fn("");
673
674         /* allocate enough pages for the table */
675         pte_order = vm->mm->page_table_sizing[gmmu_pgsz_idx].order;
676
677         err = alloc_gmmu_pages(vm, pte_order, &handle, &sgt, &size);
678         if (err)
679                 return err;
680
681         gk20a_dbg(gpu_dbg_pte, "pte = 0x%p, addr=%08llx, size %d",
682                         pte, gk20a_mm_iova_addr(sgt->sgl), pte_order);
683
684         pte->ref = handle;
685         pte->sgt = sgt;
686         pte->size = size;
687
688         return 0;
689 }
690
691 /* given address range (inclusive) determine the pdes crossed */
692 static inline void pde_range_from_vaddr_range(struct vm_gk20a *vm,
693                                               u64 addr_lo, u64 addr_hi,
694                                               u32 *pde_lo, u32 *pde_hi)
695 {
696         *pde_lo = (u32)(addr_lo >> vm->mm->pde_stride_shift);
697         *pde_hi = (u32)(addr_hi >> vm->mm->pde_stride_shift);
698         gk20a_dbg(gpu_dbg_pte, "addr_lo=0x%llx addr_hi=0x%llx pde_ss=%d",
699                    addr_lo, addr_hi, vm->mm->pde_stride_shift);
700         gk20a_dbg(gpu_dbg_pte, "pde_lo=%d pde_hi=%d",
701                    *pde_lo, *pde_hi);
702 }
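/*
 * For example, with the default 128K big page size, pde_stride_shift is 27,
 * i.e. one pde covers 128MB of gpu va:
 *   addr_lo = 0x10000000, addr_hi = 0x17ffffff -> pde_lo = pde_hi = 2
 *   addr_lo = 0x10000000, addr_hi = 0x1fffffff -> pde_lo = 2, pde_hi = 3
 */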
703
704 static inline u32 *pde_from_index(struct vm_gk20a *vm, u32 i)
705 {
706         return (u32 *) (((u8 *)vm->pdes.kv) + i*gmmu_pde__size_v());
707 }
708
709 static inline u32 pte_index_from_vaddr(struct vm_gk20a *vm,
710                                        u64 addr, enum gmmu_pgsz_gk20a pgsz_idx)
711 {
712         u32 ret;
713         /* mask off pde part */
714         addr = addr & ((((u64)1) << vm->mm->pde_stride_shift) - ((u64)1));
715         /* shift over to get pte index. note assumption that pte index
716          * doesn't leak over into the high 32b */
717         ret = (u32)(addr >> gmmu_page_shifts[pgsz_idx]);
718
719         gk20a_dbg(gpu_dbg_pte, "addr=0x%llx pte_i=0x%x", addr, ret);
720         return ret;
721 }
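/*
 * For example, with pde_stride_shift = 27 and small (4K) pages,
 * addr = 0x10012000 masks down to 0x00012000 and yields pte index 0x12.
 */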
722
723 static inline void pte_space_page_offset_from_index(u32 i, u32 *pte_page,
724                                                     u32 *pte_offset)
725 {
726         /* ptes are 8B regardless of pagesize */
727         /* pte space pages are 4KB. so 512 ptes per 4KB page*/
728         *pte_page = i >> 9;
729
730         /* this offset is a pte offset, not a byte offset */
731         *pte_offset = i & ((1<<9)-1);
732
733         gk20a_dbg(gpu_dbg_pte, "i=0x%x pte_page=0x%x pte_offset=0x%x",
734                    i, *pte_page, *pte_offset);
735 }
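/*
 * For example, pte index i = 0x205 (517) lands in pte space page 1
 * (517 >> 9) at pte offset 5 (517 & 511), i.e. byte offset 5 * 8 = 40
 * within that 4KB page.
 */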
736
737
738 /*
739  * given a pde index/page table number make sure it has
740  * backing store and if not go ahead allocate it and
741  * record it in the appropriate pde
742  */
743 static int validate_gmmu_page_table_gk20a_locked(struct vm_gk20a *vm,
744                                 u32 i, enum gmmu_pgsz_gk20a gmmu_pgsz_idx)
745 {
746         int err;
747         struct page_table_gk20a *pte =
748                 vm->pdes.ptes[gmmu_pgsz_idx] + i;
749
750         gk20a_dbg_fn("");
751
752         /* if it's already in place it's valid */
753         if (pte->ref)
754                 return 0;
755
756         gk20a_dbg(gpu_dbg_pte, "alloc %dKB ptes for pde %d",
757                    gmmu_page_sizes[gmmu_pgsz_idx]/1024, i);
758
759         err = zalloc_gmmu_page_table_gk20a(vm, gmmu_pgsz_idx, pte);
760         if (err)
761                 return err;
762
763         /* rewrite pde */
764         update_gmmu_pde_locked(vm, i);
765
766         return 0;
767 }
768
769 static struct vm_reserved_va_node *addr_to_reservation(struct vm_gk20a *vm,
770                                                        u64 addr)
771 {
772         struct vm_reserved_va_node *va_node;
773         list_for_each_entry(va_node, &vm->reserved_va_list, reserved_va_list)
774                 if (addr >= va_node->vaddr_start &&
775                     addr < (u64)va_node->vaddr_start + (u64)va_node->size)
776                         return va_node;
777
778         return NULL;
779 }
780
781 int gk20a_vm_get_buffers(struct vm_gk20a *vm,
782                          struct mapped_buffer_node ***mapped_buffers,
783                          int *num_buffers)
784 {
785         struct mapped_buffer_node *mapped_buffer;
786         struct mapped_buffer_node **buffer_list;
787         struct rb_node *node;
788         int i = 0;
789
790         mutex_lock(&vm->update_gmmu_lock);
791
792         buffer_list = kzalloc(sizeof(*buffer_list) *
793                               vm->num_user_mapped_buffers, GFP_KERNEL);
794         if (!buffer_list) {
795                 mutex_unlock(&vm->update_gmmu_lock);
796                 return -ENOMEM;
797         }
798
799         node = rb_first(&vm->mapped_buffers);
800         while (node) {
801                 mapped_buffer =
802                         container_of(node, struct mapped_buffer_node, node);
803                 if (mapped_buffer->user_mapped) {
804                         buffer_list[i] = mapped_buffer;
805                         kref_get(&mapped_buffer->ref);
806                         i++;
807                 }
808                 node = rb_next(&mapped_buffer->node);
809         }
810
811         BUG_ON(i != vm->num_user_mapped_buffers);
812
813         *num_buffers = vm->num_user_mapped_buffers;
814         *mapped_buffers = buffer_list;
815
816         mutex_unlock(&vm->update_gmmu_lock);
817
818         return 0;
819 }
820
821 static void gk20a_vm_unmap_locked_kref(struct kref *ref)
822 {
823         struct mapped_buffer_node *mapped_buffer =
824                 container_of(ref, struct mapped_buffer_node, ref);
825         gk20a_vm_unmap_locked(mapped_buffer);
826 }
827
828 void gk20a_vm_put_buffers(struct vm_gk20a *vm,
829                                  struct mapped_buffer_node **mapped_buffers,
830                                  int num_buffers)
831 {
832         int i;
833
834         mutex_lock(&vm->update_gmmu_lock);
835
836         for (i = 0; i < num_buffers; ++i)
837                 kref_put(&mapped_buffers[i]->ref,
838                          gk20a_vm_unmap_locked_kref);
839
840         mutex_unlock(&vm->update_gmmu_lock);
841
842         kfree(mapped_buffers);
843 }
844
845 static void gk20a_vm_unmap_user(struct vm_gk20a *vm, u64 offset)
846 {
847         struct device *d = dev_from_vm(vm);
848         int retries;
849         struct mapped_buffer_node *mapped_buffer;
850
851         mutex_lock(&vm->update_gmmu_lock);
852
853         mapped_buffer = find_mapped_buffer_locked(&vm->mapped_buffers, offset);
854         if (!mapped_buffer) {
855                 mutex_unlock(&vm->update_gmmu_lock);
856                 gk20a_err(d, "invalid addr to unmap 0x%llx", offset);
857                 return;
858         }
859
860         if (mapped_buffer->flags & NVHOST_AS_MAP_BUFFER_FLAGS_FIXED_OFFSET) {
861                 mutex_unlock(&vm->update_gmmu_lock);
862
863                 retries = 1000;
864                 while (retries) {
865                         if (atomic_read(&mapped_buffer->ref.refcount) == 1)
866                                 break;
867                         retries--;
868                         udelay(50);
869                 }
870                 if (!retries)
871                         gk20a_err(d, "sync-unmap failed on 0x%llx",
872                                                                 offset);
873                 mutex_lock(&vm->update_gmmu_lock);
874         }
875
876         mapped_buffer->user_mapped--;
877         if (mapped_buffer->user_mapped == 0)
878                 vm->num_user_mapped_buffers--;
879         kref_put(&mapped_buffer->ref, gk20a_vm_unmap_locked_kref);
880
881         mutex_unlock(&vm->update_gmmu_lock);
882 }
883
884 static u64 gk20a_vm_alloc_va(struct vm_gk20a *vm,
885                              u64 size,
886                              enum gmmu_pgsz_gk20a gmmu_pgsz_idx)
887
888 {
889         struct gk20a_allocator *vma = &vm->vma[gmmu_pgsz_idx];
890         int err;
891         u64 offset;
892         u32 start_page_nr = 0, num_pages;
893         u64 gmmu_page_size = gmmu_page_sizes[gmmu_pgsz_idx];
894
895         if (gmmu_pgsz_idx >= ARRAY_SIZE(gmmu_page_sizes)) {
896                 dev_warn(dev_from_vm(vm),
897                          "invalid page size requested in gk20a vm alloc");
898                 return 0;
899         }
900
901         if ((gmmu_pgsz_idx == gmmu_page_size_big) && !vm->big_pages) {
902                 dev_warn(dev_from_vm(vm),
903                          "unsupported page size requested");
904                 return 0;
905
906         }
907
908         /* be certain we round up to gmmu_page_size if needed */
909         /* TBD: DIV_ROUND_UP -> undefined reference to __aeabi_uldivmod */
910         size = (size + ((u64)gmmu_page_size - 1)) & ~((u64)gmmu_page_size - 1);
911
912         gk20a_dbg_info("size=0x%llx @ pgsz=%dKB", size,
913                         gmmu_page_sizes[gmmu_pgsz_idx]>>10);
914
915         /* The vma allocator represents page accounting. */
916         num_pages = size >> gmmu_page_shifts[gmmu_pgsz_idx];
917
918         err = vma->alloc(vma, &start_page_nr, num_pages);
919
920         if (err) {
921                 gk20a_err(dev_from_vm(vm),
922                            "%s oom: sz=0x%llx", vma->name, size);
923                 return 0;
924         }
925
926         offset = (u64)start_page_nr << gmmu_page_shifts[gmmu_pgsz_idx];
927         gk20a_dbg_fn("%s found addr: 0x%llx", vma->name, offset);
928
929         return offset;
930 }
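/*
 * For example, a request of size 0x12345 with the small (4K) page size is
 * rounded up to 0x13000 (num_pages = 0x13); the returned offset is
 * start_page_nr << 12, so it is always page-size aligned.
 */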
931
932 static int gk20a_vm_free_va(struct vm_gk20a *vm,
933                              u64 offset, u64 size,
934                              enum gmmu_pgsz_gk20a pgsz_idx)
935 {
936         struct gk20a_allocator *vma = &vm->vma[pgsz_idx];
937         u32 page_size = gmmu_page_sizes[pgsz_idx];
938         u32 page_shift = gmmu_page_shifts[pgsz_idx];
939         u32 start_page_nr, num_pages;
940         int err;
941
942         gk20a_dbg_info("%s free addr=0x%llx, size=0x%llx",
943                         vma->name, offset, size);
944
945         start_page_nr = (u32)(offset >> page_shift);
946         num_pages = (u32)((size + page_size - 1) >> page_shift);
947
948         err = vma->free(vma, start_page_nr, num_pages);
949         if (err) {
950                 gk20a_err(dev_from_vm(vm),
951                            "not found: offset=0x%llx, sz=0x%llx",
952                            offset, size);
953         }
954
955         return err;
956 }
957
958 static int insert_mapped_buffer(struct rb_root *root,
959                                 struct mapped_buffer_node *mapped_buffer)
960 {
961         struct rb_node **new_node = &(root->rb_node), *parent = NULL;
962
963         /* Figure out where to put new node */
964         while (*new_node) {
965                 struct mapped_buffer_node *cmp_with =
966                         container_of(*new_node, struct mapped_buffer_node,
967                                      node);
968
969                 parent = *new_node;
970
971                 if (cmp_with->addr > mapped_buffer->addr) /* u64 cmp */
972                         new_node = &((*new_node)->rb_left);
973                 else if (cmp_with->addr != mapped_buffer->addr) /* u64 cmp */
974                         new_node = &((*new_node)->rb_right);
975                 else
976                         return -EINVAL; /* no fair dup'ing */
977         }
978
979         /* Add new node and rebalance tree. */
980         rb_link_node(&mapped_buffer->node, parent, new_node);
981         rb_insert_color(&mapped_buffer->node, root);
982
983         return 0;
984 }
985
986 static struct mapped_buffer_node *find_mapped_buffer_reverse_locked(
987                                 struct rb_root *root, struct dma_buf *dmabuf,
988                                 u32 kind)
989 {
990         struct rb_node *node = rb_first(root);
991         while (node) {
992                 struct mapped_buffer_node *mapped_buffer =
993                         container_of(node, struct mapped_buffer_node, node);
994                 if (mapped_buffer->dmabuf == dmabuf &&
995                     kind == mapped_buffer->kind)
996                         return mapped_buffer;
997                 node = rb_next(&mapped_buffer->node);
998         }
999         return NULL;
1000 }
1001
1002 static struct mapped_buffer_node *find_mapped_buffer_locked(
1003                                         struct rb_root *root, u64 addr)
1004 {
1005
1006         struct rb_node *node = root->rb_node;
1007         while (node) {
1008                 struct mapped_buffer_node *mapped_buffer =
1009                         container_of(node, struct mapped_buffer_node, node);
1010                 if (mapped_buffer->addr > addr) /* u64 cmp */
1011                         node = node->rb_left;
1012                 else if (mapped_buffer->addr != addr) /* u64 cmp */
1013                         node = node->rb_right;
1014                 else
1015                         return mapped_buffer;
1016         }
1017         return NULL;
1018 }
1019
1020 static struct mapped_buffer_node *find_mapped_buffer_range_locked(
1021                                         struct rb_root *root, u64 addr)
1022 {
1023         struct rb_node *node = root->rb_node;
1024         while (node) {
1025                 struct mapped_buffer_node *m =
1026                         container_of(node, struct mapped_buffer_node, node);
1027                 if (m->addr <= addr && m->addr + m->size > addr)
1028                         return m;
1029                 else if (m->addr > addr) /* u64 cmp */
1030                         node = node->rb_left;
1031                 else
1032                         node = node->rb_right;
1033         }
1034         return NULL;
1035 }
1036
1037 #define BFR_ATTRS (sizeof(nvmap_bfr_param)/sizeof(nvmap_bfr_param[0]))
1038
1039 struct buffer_attrs {
1040         struct sg_table *sgt;
1041         u64 size;
1042         u64 align;
1043         u32 ctag_offset;
1044         u32 ctag_lines;
1045         int pgsz_idx;
1046         u8 kind_v;
1047         u8 uc_kind_v;
1048 };
1049
1050 static void gmmu_select_page_size(struct buffer_attrs *bfr)
1051 {
1052         int i;
1053         /*  choose the biggest first (top->bottom) */
1054         for (i = (gmmu_nr_page_sizes-1); i >= 0; i--)
1055                 if (!(gmmu_page_offset_masks[i] & bfr->align)) {
1056                         /* would like to add this too but nvmap returns the
1057                          * original requested size not the allocated size.
1058                          * (!(gmmu_page_offset_masks[i] & bfr->size)) */
1059                         bfr->pgsz_idx = i;
1060                         break;
1061                 }
1062 }
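/*
 * bfr->align is derived from the lowest set bit of the buffer's dma address,
 * so a 128K-aligned buffer (align >= 0x20000) clears the 0x1ffff mask and
 * selects the big page size, while a buffer that is only 4K-aligned
 * (align = 0x1000) falls through to the small page size.
 */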
1063
1064 static int setup_buffer_kind_and_compression(struct device *d,
1065                                              u32 flags,
1066                                              struct buffer_attrs *bfr,
1067                                              enum gmmu_pgsz_gk20a pgsz_idx)
1068 {
1069         bool kind_compressible;
1070
1071         if (unlikely(bfr->kind_v == gmmu_pte_kind_invalid_v()))
1072                 bfr->kind_v = gmmu_pte_kind_pitch_v();
1073
1074         if (unlikely(!gk20a_kind_is_supported(bfr->kind_v))) {
1075                 gk20a_err(d, "kind 0x%x not supported", bfr->kind_v);
1076                 return -EINVAL;
1077         }
1078
1079         bfr->uc_kind_v = gmmu_pte_kind_invalid_v();
1080         /* find a suitable uncompressed kind if it becomes necessary later */
1081         kind_compressible = gk20a_kind_is_compressible(bfr->kind_v);
1082         if (kind_compressible) {
1083                 bfr->uc_kind_v = gk20a_get_uncompressed_kind(bfr->kind_v);
1084                 if (unlikely(bfr->uc_kind_v == gmmu_pte_kind_invalid_v())) {
1085                         /* shouldn't happen, but it is worth cross-checking */
1086                         gk20a_err(d, "comptag kind 0x%x can't be"
1087                                    " downgraded to uncompressed kind",
1088                                    bfr->kind_v);
1089                         return -EINVAL;
1090                 }
1091         }
1092         /* comptags only supported for suitable kinds, 128KB pagesize */
1093         if (unlikely(kind_compressible &&
1094                      (gmmu_page_sizes[pgsz_idx] != 128*1024))) {
1095                 /*
1096                 gk20a_warn(d, "comptags specified"
1097                 " but pagesize being used doesn't support it");*/
1098                 /* it is safe to fall back to uncompressed as
1099                    functionality is not harmed */
1100                 bfr->kind_v = bfr->uc_kind_v;
1101                 kind_compressible = false;
1102         }
1103         if (kind_compressible)
1104                 bfr->ctag_lines = ALIGN(bfr->size, COMP_TAG_LINE_SIZE) >>
1105                         COMP_TAG_LINE_SIZE_SHIFT;
1106         else
1107                 bfr->ctag_lines = 0;
1108
1109         return 0;
1110 }
1111
1112 static int validate_fixed_buffer(struct vm_gk20a *vm,
1113                                  struct buffer_attrs *bfr,
1114                                  u64 map_offset)
1115 {
1116         struct device *dev = dev_from_vm(vm);
1117         struct vm_reserved_va_node *va_node;
1118         struct mapped_buffer_node *buffer;
1119
1120         if (map_offset & gmmu_page_offset_masks[bfr->pgsz_idx]) {
1121                 gk20a_err(dev, "map offset must be buffer page size aligned 0x%llx",
1122                            map_offset);
1123                 return -EINVAL;
1124         }
1125
1126         /* find the space reservation */
1127         va_node = addr_to_reservation(vm, map_offset);
1128         if (!va_node) {
1129                 gk20a_warn(dev, "fixed offset mapping without space allocation");
1130                 return -EINVAL;
1131         }
1132
1133         /* check that this mapping does not collide with existing
1134          * mappings by checking the overlapping area between the current
1135          * buffer and all other mapped buffers */
1136
1137         list_for_each_entry(buffer,
1138                 &va_node->va_buffers_list, va_buffers_list) {
1139                 s64 begin = max(buffer->addr, map_offset);
1140                 s64 end = min(buffer->addr +
1141                         buffer->size, map_offset + bfr->size);
1142                 if (end - begin > 0) {
1143                         gk20a_warn(dev, "overlapping buffer map requested");
1144                         return -EINVAL;
1145                 }
1146         }
1147
1148         return 0;
1149 }
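/*
 * For example, an existing buffer at [0x100000, 0x140000) and a fixed
 * mapping request at 0x120000 of size 0x40000 give begin = 0x120000 and
 * end = 0x140000, so end - begin > 0 and the request is rejected as
 * overlapping.
 */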
1150
1151 static u64 __locked_gmmu_map(struct vm_gk20a *vm,
1152                                 u64 map_offset,
1153                                 struct sg_table *sgt,
1154                                 u64 size,
1155                                 int pgsz_idx,
1156                                 u8 kind_v,
1157                                 u32 ctag_offset,
1158                                 u32 flags,
1159                                 int rw_flag)
1160 {
1161         int err = 0, i = 0;
1162         u32 pde_lo, pde_hi;
1163         struct device *d = dev_from_vm(vm);
1164
1165         /* Allocate (or validate when map_offset != 0) the virtual address. */
1166         if (!map_offset) {
1167                 map_offset = gk20a_vm_alloc_va(vm, size,
1168                                           pgsz_idx);
1169                 if (!map_offset) {
1170                         gk20a_err(d, "failed to allocate va space");
1171                         err = -ENOMEM;
1172                         goto fail;
1173                 }
1174         }
1175
1176         pde_range_from_vaddr_range(vm,
1177                                    map_offset,
1178                                    map_offset + size - 1,
1179                                    &pde_lo, &pde_hi);
1180
1181         /* mark the addr range valid (but with 0 phys addr, which will fault) */
1182         for (i = pde_lo; i <= pde_hi; i++) {
1183                 err = validate_gmmu_page_table_gk20a_locked(vm, i,
1184                                                             pgsz_idx);
1185                 if (err) {
1186                         gk20a_err(d, "failed to validate page table %d: %d",
1187                                                            i, err);
1188                         goto fail;
1189                 }
1190         }
1191
1192         err = update_gmmu_ptes_locked(vm, pgsz_idx,
1193                                       sgt,
1194                                       map_offset, map_offset + size - 1,
1195                                       kind_v,
1196                                       ctag_offset,
1197                                       flags &
1198                                       NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
1199                                       rw_flag);
1200         if (err) {
1201                 gk20a_err(d, "failed to update ptes on map");
1202                 goto fail;
1203         }
1204
1205         return map_offset;
1206  fail:
1207         gk20a_err(d, "%s: failed with err=%d\n", __func__, err);
1208         return 0;
1209 }
1210
1211 static void __locked_gmmu_unmap(struct vm_gk20a *vm,
1212                                 u64 vaddr,
1213                                 u64 size,
1214                                 int pgsz_idx,
1215                                 bool va_allocated,
1216                                 int rw_flag)
1217 {
1218         int err = 0;
1219         struct gk20a *g = gk20a_from_vm(vm);
1220
1221         if (va_allocated) {
1222                 err = gk20a_vm_free_va(vm, vaddr, size, pgsz_idx);
1223                 if (err) {
1224                         dev_err(dev_from_vm(vm),
1225                                 "failed to free va");
1226                         return;
1227                 }
1228         }
1229
1230         /* unmap here needs to know the page size we assigned at mapping */
1231         err = update_gmmu_ptes_locked(vm,
1232                                 pgsz_idx,
1233                                 0, /* n/a for unmap */
1234                                 vaddr,
1235                                 vaddr + size - 1,
1236                                 0, 0, false /* n/a for unmap */,
1237                                 rw_flag);
1238         if (err)
1239                 dev_err(dev_from_vm(vm),
1240                         "failed to update gmmu ptes on unmap");
1241
1242         /* detect which if any pdes/ptes can now be released */
1243
1244         /* flush l2 so any dirty lines are written out *now*.
1245          *  also as we could potentially be switching this buffer
1246          * from nonvolatile (l2 cacheable) to volatile (l2 non-cacheable) at
1247          * some point in the future we need to invalidate l2.  e.g. switching
1248          * from a render buffer unmap (here) to later using the same memory
1249          * for gmmu ptes.  note the positioning of this relative to any smmu
1250          * unmapping (below). */
1251
1252         gk20a_mm_l2_flush(g, true);
1253 }
1254
1255 static u64 gk20a_vm_map_duplicate_locked(struct vm_gk20a *vm,
1256                                          struct dma_buf *dmabuf,
1257                                          u64 offset_align,
1258                                          u32 flags,
1259                                          int kind,
1260                                          struct sg_table **sgt,
1261                                          bool user_mapped,
1262                                          int rw_flag)
1263 {
1264         struct mapped_buffer_node *mapped_buffer = 0;
1265
1266         mapped_buffer =
1267                 find_mapped_buffer_reverse_locked(&vm->mapped_buffers,
1268                                                   dmabuf, kind);
1269         if (!mapped_buffer)
1270                 return 0;
1271
1272         if (mapped_buffer->flags != flags)
1273                 return 0;
1274
1275         if (flags & NVHOST_AS_MAP_BUFFER_FLAGS_FIXED_OFFSET &&
1276             mapped_buffer->addr != offset_align)
1277                 return 0;
1278
1279         BUG_ON(mapped_buffer->vm != vm);
1280
1281         /* mark the buffer as used */
1282         if (user_mapped) {
1283                 if (mapped_buffer->user_mapped == 0)
1284                         vm->num_user_mapped_buffers++;
1285                 mapped_buffer->user_mapped++;
1286
1287                 /* If the mapping comes from user space, we own
1288                  * the handle ref. Since we reuse an
1289                  * existing mapping here, we need to give back those
1290                  * refs once in order not to leak.
1291                  */
1292                 if (mapped_buffer->own_mem_ref)
1293                         dma_buf_put(mapped_buffer->dmabuf);
1294                 else
1295                         mapped_buffer->own_mem_ref = true;
1296         }
1297         kref_get(&mapped_buffer->ref);
1298
1299         gk20a_dbg(gpu_dbg_map,
1300                    "reusing as=%d pgsz=%d flags=0x%x ctags=%d "
1301                    "start=%d gv=0x%x,%08x -> 0x%x,%08x -> 0x%x,%08x "
1302                    "own_mem_ref=%d user_mapped=%d",
1303                    vm_aspace_id(vm), mapped_buffer->pgsz_idx,
1304                    mapped_buffer->flags,
1305                    mapped_buffer->ctag_lines,
1306                    mapped_buffer->ctag_offset,
1307                    hi32(mapped_buffer->addr), lo32(mapped_buffer->addr),
1308                    hi32((u64)sg_dma_address(mapped_buffer->sgt->sgl)),
1309                    lo32((u64)sg_dma_address(mapped_buffer->sgt->sgl)),
1310                    hi32((u64)sg_phys(mapped_buffer->sgt->sgl)),
1311                    lo32((u64)sg_phys(mapped_buffer->sgt->sgl)),
1312                    mapped_buffer->own_mem_ref, user_mapped);
1313
1314         if (sgt)
1315                 *sgt = mapped_buffer->sgt;
1316         return mapped_buffer->addr;
1317 }
1318
1319 u64 gk20a_vm_map(struct vm_gk20a *vm,
1320                         struct dma_buf *dmabuf,
1321                         u64 offset_align,
1322                         u32 flags /*NVHOST_AS_MAP_BUFFER_FLAGS_*/,
1323                         int kind,
1324                         struct sg_table **sgt,
1325                         bool user_mapped,
1326                         int rw_flag)
1327 {
1328         struct gk20a *g = gk20a_from_vm(vm);
1329         struct gk20a_allocator *ctag_allocator = &g->gr.comp_tags;
1330         struct device *d = dev_from_vm(vm);
1331         struct mapped_buffer_node *mapped_buffer = 0;
1332         bool inserted = false, va_allocated = false;
1333         u32 gmmu_page_size = 0;
1334         u64 map_offset = 0;
1335         int err = 0;
1336         struct buffer_attrs bfr = {0};
1337         struct gk20a_comptags comptags;
1338
1339         mutex_lock(&vm->update_gmmu_lock);
1340
1341         /* check if this buffer is already mapped */
1342         map_offset = gk20a_vm_map_duplicate_locked(vm, dmabuf, offset_align,
1343                                                    flags, kind, sgt,
1344                                                    user_mapped, rw_flag);
1345         if (map_offset) {
1346                 mutex_unlock(&vm->update_gmmu_lock);
1347                 return map_offset;
1348         }
1349
1350         /* pin buffer to get phys/iovmm addr */
1351         bfr.sgt = gk20a_mm_pin(d, dmabuf);
1352         if (IS_ERR(bfr.sgt)) {
1353                 /* Falling back to physical is actually possible
1354                  * here in many cases if we use 4K phys pages in the
1355                  * gmmu.  However we have some regions which require
1356                  * contig regions to work properly (either phys-contig
1357                  * or contig through smmu io_vaspace).  Until we can
1358                  * track the difference between those two cases we have
1359                  * to fail the mapping when we run out of SMMU space.
1360                  */
1361                 gk20a_warn(d, "failed to pin buffer");
1362                 goto clean_up;
1363         }
1364
1365         if (sgt)
1366                 *sgt = bfr.sgt;
1367
1368         bfr.kind_v = kind;
1369         bfr.size = dmabuf->size;
1370         bfr.align = 1 << __ffs((u64)sg_dma_address(bfr.sgt->sgl));
1371         bfr.pgsz_idx = -1;
1372
1373         /* If FIX_OFFSET is set, pgsz is determined. Otherwise, select
1374          * page size according to memory alignment */
1375         if (flags & NVHOST_AS_MAP_BUFFER_FLAGS_FIXED_OFFSET) {
1376                 bfr.pgsz_idx = NV_GMMU_VA_IS_UPPER(offset_align) ?
1377                                 gmmu_page_size_big : gmmu_page_size_small;
1378         } else {
1379                 gmmu_select_page_size(&bfr);
1380         }
1381
1382         /* validate/adjust bfr attributes */
1383         if (unlikely(bfr.pgsz_idx == -1)) {
1384                 gk20a_err(d, "unsupported page size detected");
1385                 goto clean_up;
1386         }
1387
1388         if (unlikely(bfr.pgsz_idx < gmmu_page_size_small ||
1389                      bfr.pgsz_idx > gmmu_page_size_big)) {
1390                 BUG_ON(1);
1391                 err = -EINVAL;
1392                 goto clean_up;
1393         }
1394         gmmu_page_size = gmmu_page_sizes[bfr.pgsz_idx];
1395
1396         /* Check if we should use a fixed offset for mapping this buffer */
1397         if (flags & NVHOST_AS_MAP_BUFFER_FLAGS_FIXED_OFFSET)  {
1398                 err = validate_fixed_buffer(vm, &bfr, offset_align);
1399                 if (err)
1400                         goto clean_up;
1401
1402                 map_offset = offset_align;
1403                 va_allocated = false;
1404         } else
1405                 va_allocated = true;
1406
1407         if (sgt)
1408                 *sgt = bfr.sgt;
1409
1410         err = setup_buffer_kind_and_compression(d, flags, &bfr, bfr.pgsz_idx);
1411         if (unlikely(err)) {
1412                 gk20a_err(d, "failure setting up kind and compression");
1413                 goto clean_up;
1414         }
1415
1416         /* bar1 and pmu vm don't need ctag */
1417         if (!vm->enable_ctag)
1418                 bfr.ctag_lines = 0;
1419
1420         gk20a_get_comptags(d, dmabuf, &comptags);
1421
1422         if (bfr.ctag_lines && !comptags.lines) {
1423                 /* allocate compression resources if needed */
1424                 err = gk20a_alloc_comptags(d, dmabuf, ctag_allocator,
1425                                            bfr.ctag_lines);
1426                 if (err) {
1427                         /* ok to fall back here if we ran out */
1428                         /* TBD: we can partially alloc ctags as well... */
1429                         bfr.ctag_lines = bfr.ctag_offset = 0;
1430                         bfr.kind_v = bfr.uc_kind_v;
1431                 } else {
1432                         gk20a_get_comptags(d, dmabuf, &comptags);
1433
1434                         /* init/clear the ctag buffer */
1435                         g->ops.ltc.clear_comptags(g,
1436                                           comptags.offset,
1437                                           comptags.offset + comptags.lines - 1);
1438                 }
1439         }
1440
1441         /* store the comptag info */
1442         bfr.ctag_offset = comptags.offset;
1443
1444         /* update gmmu ptes */
1445         map_offset = __locked_gmmu_map(vm, map_offset,
1446                                         bfr.sgt,
1447                                         bfr.size,
1448                                         bfr.pgsz_idx,
1449                                         bfr.kind_v,
1450                                         bfr.ctag_offset,
1451                                         flags, rw_flag);
1452         if (!map_offset)
1453                 goto clean_up;
1454
1455         gk20a_dbg(gpu_dbg_map,
1456            "as=%d pgsz=%d "
1457            "kind=0x%x kind_uc=0x%x flags=0x%x "
1458            "ctags=%d start=%d gv=0x%x,%08x -> 0x%x,%08x -> 0x%x,%08x",
1459            vm_aspace_id(vm), gmmu_page_size,
1460            bfr.kind_v, bfr.uc_kind_v, flags,
1461            bfr.ctag_lines, bfr.ctag_offset,
1462            hi32(map_offset), lo32(map_offset),
1463            hi32((u64)sg_dma_address(bfr.sgt->sgl)),
1464            lo32((u64)sg_dma_address(bfr.sgt->sgl)),
1465            hi32((u64)sg_phys(bfr.sgt->sgl)),
1466            lo32((u64)sg_phys(bfr.sgt->sgl)));
1467
1468 #if defined(NVHOST_DEBUG)
1469         {
1470                 int i;
1471                 struct scatterlist *sg = NULL;
1472                 gk20a_dbg(gpu_dbg_pte, "for_each_sg(bfr.sgt->sgl, sg, bfr.sgt->nents, i)");
1473                 for_each_sg(bfr.sgt->sgl, sg, bfr.sgt->nents, i) {
1474                         u64 da = sg_dma_address(sg);
1475                         u64 pa = sg_phys(sg);
1476                         u64 len = sg->length;
1477                         gk20a_dbg(gpu_dbg_pte, "i=%d pa=0x%x,%08x da=0x%x,%08x len=0x%x,%08x",
1478                                    i, hi32(pa), lo32(pa), hi32(da), lo32(da),
1479                                    hi32(len), lo32(len));
1480                 }
1481         }
1482 #endif
1483
1484         /* keep track of the buffer for unmapping */
1485         /* TBD: check for multiple mapping of same buffer */
1486         mapped_buffer = kzalloc(sizeof(*mapped_buffer), GFP_KERNEL);
1487         if (!mapped_buffer) {
1488                 gk20a_warn(d, "oom allocating tracking buffer");
1489                 goto clean_up;
1490         }
1491         mapped_buffer->dmabuf      = dmabuf;
1492         mapped_buffer->sgt         = bfr.sgt;
1493         mapped_buffer->addr        = map_offset;
1494         mapped_buffer->size        = bfr.size;
1495         mapped_buffer->pgsz_idx    = bfr.pgsz_idx;
1496         mapped_buffer->ctag_offset = bfr.ctag_offset;
1497         mapped_buffer->ctag_lines  = bfr.ctag_lines;
1498         mapped_buffer->vm          = vm;
1499         mapped_buffer->flags       = flags;
1500         mapped_buffer->kind        = kind;
1501         mapped_buffer->va_allocated = va_allocated;
1502         mapped_buffer->user_mapped = user_mapped ? 1 : 0;
1503         mapped_buffer->own_mem_ref = user_mapped;
1504         INIT_LIST_HEAD(&mapped_buffer->unmap_list);
1505         INIT_LIST_HEAD(&mapped_buffer->va_buffers_list);
1506         kref_init(&mapped_buffer->ref);
1507
1508         err = insert_mapped_buffer(&vm->mapped_buffers, mapped_buffer);
1509         if (err) {
1510                 gk20a_err(d, "failed to insert into mapped buffer tree");
1511                 goto clean_up;
1512         }
1513         inserted = true;
1514         if (user_mapped)
1515                 vm->num_user_mapped_buffers++;
1516
1517         gk20a_dbg_info("allocated va @ 0x%llx", map_offset);
1518
1519         if (!va_allocated) {
1520                 struct vm_reserved_va_node *va_node;
1521
1522                 /* find the space reservation */
1523                 va_node = addr_to_reservation(vm, map_offset);
1524                 list_add_tail(&mapped_buffer->va_buffers_list,
1525                               &va_node->va_buffers_list);
1526                 mapped_buffer->va_node = va_node;
1527         }
1528
1529         mutex_unlock(&vm->update_gmmu_lock);
1530
1531         /* Invalidate kernel mappings immediately */
1532         if (vm_aspace_id(vm) == -1)
1533                 gk20a_mm_tlb_invalidate(vm);
1534
1535         return map_offset;
1536
1537 clean_up:
1538         if (inserted) {
1539                 rb_erase(&mapped_buffer->node, &vm->mapped_buffers);
1540                 if (user_mapped)
1541                         vm->num_user_mapped_buffers--;
1542         }
1543         kfree(mapped_buffer);
1544         if (va_allocated)
1545                 gk20a_vm_free_va(vm, map_offset, bfr.size, bfr.pgsz_idx);
1546         if (!IS_ERR(bfr.sgt))
1547                 gk20a_mm_unpin(d, dmabuf, bfr.sgt);
1548
1549         mutex_unlock(&vm->update_gmmu_lock);
1550         gk20a_dbg_info("err=%d\n", err);
1551         return 0;
1552 }
1553
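     /*
      * Kernel-internal mapping helper: allocates a fresh VA range and maps
      * the given sg table into it with 4K pages, kind 0 and no comptags,
      * then invalidates the TLB so the mapping is usable on return.
      * Returns the GPU VA, or 0 on failure.
      *
      * A minimal, hypothetical usage sketch (error handling elided, local
      * names illustrative only; the helpers are the ones defined in this
      * file, and flags 0 means no special map flags):
      *
      *     struct sg_table *sgt;
      *     u64 gpu_va;
      *
      *     err = gk20a_get_sgtable(d, &sgt, cpuva, iova, size);
      *     if (!err)
      *             gpu_va = gk20a_gmmu_map(vm, &sgt, size, 0,
      *                                     gk20a_mem_flag_none);
      *     ...
      *     gk20a_gmmu_unmap(vm, gpu_va, size, gk20a_mem_flag_none);
      *     gk20a_free_sgtable(&sgt);
      */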
1554 u64 gk20a_gmmu_map(struct vm_gk20a *vm,
1555                 struct sg_table **sgt,
1556                 u64 size,
1557                 u32 flags,
1558                 int rw_flag)
1559 {
1560         u64 vaddr;
1561
1562         mutex_lock(&vm->update_gmmu_lock);
1563         vaddr = __locked_gmmu_map(vm, 0, /* already mapped? - No */
1564                                 *sgt, /* sg table */
1565                                 size,
1566                                 0, /* page size index = 0 i.e. SZ_4K */
1567                                 0, /* kind */
1568                                 0, /* ctag_offset */
1569                                 flags, rw_flag);
1570         mutex_unlock(&vm->update_gmmu_lock);
1571         if (!vaddr) {
1572                 gk20a_err(dev_from_vm(vm), "failed to allocate va space");
1573                 return 0;
1574         }
1575
1576         /* Invalidate kernel mappings immediately */
1577         gk20a_mm_tlb_invalidate(vm);
1578
1579         return vaddr;
1580 }
1581
1582 void gk20a_gmmu_unmap(struct vm_gk20a *vm,
1583                 u64 vaddr,
1584                 u64 size,
1585                 int rw_flag)
1586 {
1587         mutex_lock(&vm->update_gmmu_lock);
1588         __locked_gmmu_unmap(vm,
1589                         vaddr,
1590                         size,
1591                         0, /* page size 4K */
1592                         true, /*va_allocated */
1593                         rw_flag);
1594         mutex_unlock(&vm->update_gmmu_lock);
1595 }
1596
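     /*
      * Translate a DMA address handed back by the DMA API into a CPU
      * physical address.  If the device has no IOMMU mapping, the DMA
      * address already is the physical address; otherwise the IOVA is
      * resolved through the device's IOMMU domain (see the inst block
      * setup in gk20a_init_bar1_vm()/gk20a_init_pmu_vm() for callers).
      */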
1597 phys_addr_t gk20a_get_phys_from_iova(struct device *d,
1598                                 u64 dma_addr)
1599 {
1600         phys_addr_t phys;
1601         u64 iova;
1602
1603         struct dma_iommu_mapping *mapping = to_dma_iommu_mapping(d);
1604         if (!mapping)
1605                 return dma_addr;
1606
1607         iova = dma_addr & PAGE_MASK;
1608         phys = iommu_iova_to_phys(mapping->domain, iova);
1609         return phys;
1610 }
1611
1612 /* get sg_table from already allocated buffer */
1613 int gk20a_get_sgtable(struct device *d, struct sg_table **sgt,
1614                         void *cpuva, u64 iova,
1615                         size_t size)
1616 {
1617         int err = 0;
1618         *sgt = kzalloc(sizeof(struct sg_table), GFP_KERNEL);
1619         if (!(*sgt)) {
1620                 dev_err(d, "failed to allocate memory\n");
1621                 err = -ENOMEM;
1622                 goto fail;
1623         }
1624         err = dma_get_sgtable(d, *sgt,
1625                         cpuva, iova,
1626                         size);
1627         if (err) {
1628                 dev_err(d, "failed to create sg table\n");
1629                 goto fail;
1630         }
1631         sg_dma_address((*sgt)->sgl) = iova;
1632
1633         return 0;
1634  fail:
1635         if (*sgt) {
1636                 kfree(*sgt);
1637                 *sgt = NULL;
1638         }
1639         return err;
1640 }
1641
1642 int gk20a_get_sgtable_from_pages(struct device *d, struct sg_table **sgt,
1643                         struct page **pages, u64 iova,
1644                         size_t size)
1645 {
1646         int err = 0;
1647         *sgt = kzalloc(sizeof(struct sg_table), GFP_KERNEL);
1648         if (!(*sgt)) {
1649                 dev_err(d, "failed to allocate memory\n");
1650                 err = -ENOMEM;
1651                 goto fail;
1652         }
1653         err = sg_alloc_table(*sgt, 1, GFP_KERNEL);
1654         if (err) {
1655                 dev_err(d, "failed to allocate sg_table\n");
1656                 goto fail;
1657         }
1658         sg_set_page((*sgt)->sgl, *pages, size, 0);
1659         sg_dma_address((*sgt)->sgl) = iova;
1660
1661         return 0;
1662  fail:
1663         if (*sgt) {
1664                 kfree(*sgt);
1665                 *sgt = NULL;
1666         }
1667         return err;
1668 }
1669
1670 void gk20a_free_sgtable(struct sg_table **sgt)
1671 {
1672         sg_free_table(*sgt);
1673         kfree(*sgt);
1674         *sgt = NULL;
1675 }
1676
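     /*
      * Address the GMMU should use for a scatterlist entry.  With the
      * Tegra SMMU active we use the IOVA with the SMMU translation bit
      * set so GPU accesses are routed through the SMMU; a DMA mapping
      * error yields 0.  Without an SMMU mapping we fall back to the raw
      * physical address.
      */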
1677 u64 gk20a_mm_iova_addr(struct scatterlist *sgl)
1678 {
1679         u64 result = sg_phys(sgl);
1680 #ifdef CONFIG_TEGRA_IOMMU_SMMU
1681         if (sg_dma_address(sgl) == DMA_ERROR_CODE)
1682                 result = 0;
1683         else if (sg_dma_address(sgl)) {
1684                 result = sg_dma_address(sgl) |
1685                         1ULL << NV_MC_SMMU_VADDR_TRANSLATION_BIT;
1686         }
1687 #endif
1688         return result;
1689 }
1690
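     /*
      * Write (sgt != NULL) or clear (sgt == NULL) the ptes covering
      * [first_vaddr, last_vaddr].  For each pde in the range the backing
      * page table is mapped for CPU access, its ptes are filled from the
      * scatterlist (or invalidated), and per-table reference counts are
      * updated.  A table whose refcount drops to zero is freed and its
      * pde rewritten.  Caller holds vm->update_gmmu_lock.
      */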
1691 static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
1692                                    enum gmmu_pgsz_gk20a pgsz_idx,
1693                                    struct sg_table *sgt,
1694                                    u64 first_vaddr, u64 last_vaddr,
1695                                    u8 kind_v, u32 ctag_offset,
1696                                    bool cacheable,
1697                                    int rw_flag)
1698 {
1699         int err;
1700         u32 pde_lo, pde_hi, pde_i;
1701         struct scatterlist *cur_chunk;
1702         unsigned int cur_offset;
1703         u32 pte_w[2] = {0, 0}; /* invalid pte */
1704         u32 ctag = ctag_offset;
1705         u32 ctag_incr;
1706         u32 page_size  = gmmu_page_sizes[pgsz_idx];
1707         u64 addr = 0;
1708
1709         pde_range_from_vaddr_range(vm, first_vaddr, last_vaddr,
1710                                    &pde_lo, &pde_hi);
1711
1712         gk20a_dbg(gpu_dbg_pte, "size_idx=%d, pde_lo=%d, pde_hi=%d",
1713                    pgsz_idx, pde_lo, pde_hi);
1714
1715         /* If ctag_offset != 0, add 1 per pte; else add 0.  The idea is to
1716          * avoid a per-pte branch below.  Note: this only works when the page
1717          * size (with comptags active) is 128KB; we have checks elsewhere for that. */
1718         ctag_incr = !!ctag_offset;
1719
1720         if (sgt)
1721                 cur_chunk = sgt->sgl;
1722         else
1723                 cur_chunk = NULL;
1724
1725         cur_offset = 0;
1726
1727         for (pde_i = pde_lo; pde_i <= pde_hi; pde_i++) {
1728                 u32 pte_lo, pte_hi;
1729                 u32 pte_cur;
1730                 void *pte_kv_cur;
1731
1732                 struct page_table_gk20a *pte = vm->pdes.ptes[pgsz_idx] + pde_i;
1733
1734                 if (pde_i == pde_lo)
1735                         pte_lo = pte_index_from_vaddr(vm, first_vaddr,
1736                                                       pgsz_idx);
1737                 else
1738                         pte_lo = 0;
1739
1740                 if ((pde_i != pde_hi) && (pde_hi != pde_lo))
1741                         pte_hi = vm->mm->page_table_sizing[pgsz_idx].num_ptes-1;
1742                 else
1743                         pte_hi = pte_index_from_vaddr(vm, last_vaddr,
1744                                                       pgsz_idx);
1745
1746                 /* get cpu access to the ptes */
1747                 err = map_gmmu_pages(pte->ref, pte->sgt, &pte_kv_cur,
1748                                      pte->size);
1749                 if (err) {
1750                         gk20a_err(dev_from_vm(vm),
1751                                    "couldn't map ptes for update as=%d pte_ref_cnt=%d",
1752                                    vm_aspace_id(vm), pte->ref_cnt);
1753                         goto clean_up;
1754                 }
1755
1756                 gk20a_dbg(gpu_dbg_pte, "pte_lo=%d, pte_hi=%d", pte_lo, pte_hi);
1757                 for (pte_cur = pte_lo; pte_cur <= pte_hi; pte_cur++) {
1758
1759                         if (likely(sgt)) {
1760                                 u64 new_addr = gk20a_mm_iova_addr(cur_chunk);
1761                                 if (new_addr) {
1762                                         addr = new_addr;
1763                                         addr += cur_offset;
1764                                 }
1765
1766                                 pte_w[0] = gmmu_pte_valid_true_f() |
1767                                         gmmu_pte_address_sys_f(addr
1768                                                 >> gmmu_pte_address_shift_v());
1769                                 pte_w[1] = gmmu_pte_aperture_video_memory_f() |
1770                                         gmmu_pte_kind_f(kind_v) |
1771                                         gmmu_pte_comptagline_f(ctag);
1772
1773                                 if (rw_flag == gk20a_mem_flag_read_only) {
1774                                         pte_w[0] |= gmmu_pte_read_only_true_f();
1775                                         pte_w[1] |=
1776                                                 gmmu_pte_write_disable_true_f();
1777                                 } else if (rw_flag ==
1778                                            gk20a_mem_flag_write_only) {
1779                                         pte_w[1] |=
1780                                                 gmmu_pte_read_disable_true_f();
1781                                 }
1782
1783                                 if (!cacheable)
1784                                         pte_w[1] |= gmmu_pte_vol_true_f();
1785
1786                                 pte->ref_cnt++;
1787
1788                                 gk20a_dbg(gpu_dbg_pte,
1789                                            "pte_cur=%d addr=0x%x,%08x kind=%d"
1790                                            " ctag=%d vol=%d refs=%d"
1791                                            " [0x%08x,0x%08x]",
1792                                            pte_cur, hi32(addr), lo32(addr),
1793                                            kind_v, ctag, !cacheable,
1794                                            pte->ref_cnt, pte_w[1], pte_w[0]);
1795
1796                                 ctag += ctag_incr;
1797                                 cur_offset += page_size;
1798                                 addr += page_size;
1799                                 while (cur_chunk &&
1800                                         cur_offset >= cur_chunk->length) {
1801                                         cur_offset -= cur_chunk->length;
1802                                         cur_chunk = sg_next(cur_chunk);
1803                                 }
1804
1805                         } else {
1806                                 pte->ref_cnt--;
1807                                 gk20a_dbg(gpu_dbg_pte,
1808                                            "pte_cur=%d ref=%d [0x0,0x0]",
1809                                            pte_cur, pte->ref_cnt);
1810                         }
1811
1812                         gk20a_mem_wr32(pte_kv_cur + pte_cur*8, 0, pte_w[0]);
1813                         gk20a_mem_wr32(pte_kv_cur + pte_cur*8, 1, pte_w[1]);
1814                 }
1815
1816                 unmap_gmmu_pages(pte->ref, pte->sgt, pte_kv_cur);
1817
1818                 if (pte->ref_cnt == 0) {
1819                         /* It can make sense to keep around one page table for
1820                          * each flavor (empty)... in case a new map is coming
1821                          * right back to alloc (and fill it in) again.
1822                          * But: deferring unmapping should help with pathological
1823                          * unmap/map/unmap/map cases where we'd trigger pte
1824                          * free/alloc/free/alloc.
1825                          */
1826                         free_gmmu_pages(vm, pte->ref, pte->sgt,
1827                                 vm->mm->page_table_sizing[pgsz_idx].order,
1828                                 pte->size);
1829                         pte->ref = NULL;
1830
1831                         /* rewrite pde */
1832                         update_gmmu_pde_locked(vm, pde_i);
1833                 }
1834
1835         }
1836
1837         smp_mb();
1838         vm->tlb_dirty = true;
1839         gk20a_dbg_fn("set tlb dirty");
1840
1841         return 0;
1842
1843 clean_up:
1844         /* TBD: potentially rewrite the above to pre-map everything it needs,
1845          * as that's the only way it can fail */
1846         return err;
1847
1848 }
1849
1850
1851 /* for gk20a the "video memory" apertures here are misnomers. */
1852 static inline u32 big_valid_pde0_bits(u64 pte_addr)
1853 {
1854         u32 pde0_bits =
1855                 gmmu_pde_aperture_big_video_memory_f() |
1856                 gmmu_pde_address_big_sys_f(
1857                            (u32)(pte_addr >> gmmu_pde_address_shift_v()));
1858         return  pde0_bits;
1859 }
1860 static inline u32 small_valid_pde1_bits(u64 pte_addr)
1861 {
1862         u32 pde1_bits =
1863                 gmmu_pde_aperture_small_video_memory_f() |
1864                 gmmu_pde_vol_small_true_f() | /* tbd: why? */
1865                 gmmu_pde_address_small_sys_f(
1866                            (u32)(pte_addr >> gmmu_pde_address_shift_v()));
1867         return pde1_bits;
1868 }
1869
1870 /* Given the current state of the ptes associated with a pde,
1871    determine the pde value and write it out.  There is no
1872    check here for whether a change was actually made, so
1873    superfluous updates will cause unnecessary pde
1874    invalidations.
1875 */
1876 static void update_gmmu_pde_locked(struct vm_gk20a *vm, u32 i)
1877 {
1878         bool small_valid, big_valid;
1879         u64 pte_addr[2] = {0, 0};
1880         struct page_table_gk20a *small_pte =
1881                 vm->pdes.ptes[gmmu_page_size_small] + i;
1882         struct page_table_gk20a *big_pte =
1883                 vm->pdes.ptes[gmmu_page_size_big] + i;
1884         u32 pde_v[2] = {0, 0};
1885         u32 *pde;
1886
1887         small_valid = small_pte && small_pte->ref;
1888         big_valid   = big_pte && big_pte->ref;
1889
1890         if (small_valid)
1891                 pte_addr[gmmu_page_size_small] =
1892                         gk20a_mm_iova_addr(small_pte->sgt->sgl);
1893         if (big_valid)
1894                 pte_addr[gmmu_page_size_big] =
1895                         gk20a_mm_iova_addr(big_pte->sgt->sgl);
1896
1897         pde_v[0] = gmmu_pde_size_full_f();
1898         pde_v[0] |= big_valid ?
1899                 big_valid_pde0_bits(pte_addr[gmmu_page_size_big])
1900                 :
1901                 (gmmu_pde_aperture_big_invalid_f());
1902
1903         pde_v[1] |= (small_valid ?
1904                      small_valid_pde1_bits(pte_addr[gmmu_page_size_small])
1905                      :
1906                      (gmmu_pde_aperture_small_invalid_f() |
1907                       gmmu_pde_vol_small_false_f())
1908                      )
1909                 |
1910                 (big_valid ? (gmmu_pde_vol_big_true_f()) :
1911                  gmmu_pde_vol_big_false_f());
1912
1913         pde = pde_from_index(vm, i);
1914
1915         gk20a_mem_wr32(pde, 0, pde_v[0]);
1916         gk20a_mem_wr32(pde, 1, pde_v[1]);
1917
1918         smp_mb();
1919
1920         FLUSH_CPU_DCACHE(pde,
1921                          sg_phys(vm->pdes.sgt->sgl) + (i*gmmu_pde__size_v()),
1922                          sizeof(u32)*2);
1923
1924         gk20a_mm_l2_invalidate(vm->mm->g);
1925
1926         gk20a_dbg(gpu_dbg_pte, "pde:%d = 0x%x,0x%08x\n", i, pde_v[1], pde_v[0]);
1927
1928         vm->tlb_dirty = true;
1929 }
1930
1931
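     /*
      * Back a va range with the shared "zero page": every page in
      * [vaddr, vaddr + num_pages * page_size) is pointed, at its fixed
      * offset, to one big zero-filled page that is lazily allocated the
      * first time it is needed.  Used to implement sparse address space
      * reservations (see gk20a_vm_alloc_space() and the sparse case in
      * gk20a_vm_unmap_locked()).
      */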
1932 static int gk20a_vm_put_empty(struct vm_gk20a *vm, u64 vaddr,
1933                                u32 num_pages, u32 pgsz_idx)
1934 {
1935         struct mm_gk20a *mm = vm->mm;
1936         struct gk20a *g = mm->g;
1937         u32 pgsz = gmmu_page_sizes[pgsz_idx];
1938         u32 i;
1939         dma_addr_t iova;
1940
1941         /* allocate the zero page if the vm does not already have one */
1942         if (!vm->zero_page_cpuva) {
1943                 int err = 0;
1944                 vm->zero_page_cpuva = dma_alloc_coherent(&g->dev->dev,
1945                                                          mm->big_page_size,
1946                                                          &iova,
1947                                                          GFP_KERNEL);
1948                 if (!vm->zero_page_cpuva) {
1949                         dev_err(&g->dev->dev, "failed to allocate zero page\n");
1950                         return -ENOMEM;
1951                 }
1952
1953                 vm->zero_page_iova = iova;
1954                 err = gk20a_get_sgtable(&g->dev->dev, &vm->zero_page_sgt,
1955                                         vm->zero_page_cpuva, vm->zero_page_iova,
1956                                         mm->big_page_size);
1957                 if (err) {
1958                         dma_free_coherent(&g->dev->dev, mm->big_page_size,
1959                                           vm->zero_page_cpuva,
1960                                           vm->zero_page_iova);
1961                         vm->zero_page_iova = 0;
1962                         vm->zero_page_cpuva = NULL;
1963
1964                         dev_err(&g->dev->dev, "failed to create sg table for zero page\n");
1965                         return -ENOMEM;
1966                 }
1967         }
1968
1969         for (i = 0; i < num_pages; i++) {
1970                 u64 page_vaddr = __locked_gmmu_map(vm, vaddr,
1971                         vm->zero_page_sgt, pgsz, pgsz_idx, 0, 0,
1972                         NVHOST_AS_ALLOC_SPACE_FLAGS_FIXED_OFFSET,
1973                         gk20a_mem_flag_none);
1974
1975                 if (!page_vaddr) {
1976                         gk20a_err(dev_from_vm(vm), "failed to remap clean buffers!");
1977                         goto err_unmap;
1978                 }
1979                 vaddr += pgsz;
1980         }
1981
1982         gk20a_mm_l2_flush(mm->g, true);
1983
1984         return 0;
1985
1986 err_unmap:
1987
1988         WARN_ON(1);
1989         /* something went wrong. unmap pages */
1990         while (i--) {
1991                 vaddr -= pgsz;
1992                 __locked_gmmu_unmap(vm, vaddr, pgsz, pgsz_idx, 0,
1993                                     gk20a_mem_flag_none);
1994         }
1995
1996         return -EINVAL;
1997 }
1998
1999 /* NOTE! vm->update_gmmu_lock must be held */
2000 static void gk20a_vm_unmap_locked(struct mapped_buffer_node *mapped_buffer)
2001 {
2002         struct vm_gk20a *vm = mapped_buffer->vm;
2003
2004         if (mapped_buffer->va_node &&
2005             mapped_buffer->va_node->sparse) {
2006                 u64 vaddr = mapped_buffer->addr;
2007                 u32 pgsz_idx = mapped_buffer->pgsz_idx;
2008                 u32 num_pages = mapped_buffer->size >>
2009                         gmmu_page_shifts[pgsz_idx];
2010
2011                 /* there is little we can do if this fails... */
2012                 gk20a_vm_put_empty(vm, vaddr, num_pages, pgsz_idx);
2013
2014         } else
2015                 __locked_gmmu_unmap(vm,
2016                                 mapped_buffer->addr,
2017                                 mapped_buffer->size,
2018                                 mapped_buffer->pgsz_idx,
2019                                 mapped_buffer->va_allocated,
2020                                 gk20a_mem_flag_none);
2021
2022         gk20a_dbg(gpu_dbg_map, "as=%d pgsz=%d gv=0x%x,%08x own_mem_ref=%d",
2023                    vm_aspace_id(vm), gmmu_page_sizes[mapped_buffer->pgsz_idx],
2024                    hi32(mapped_buffer->addr), lo32(mapped_buffer->addr),
2025                    mapped_buffer->own_mem_ref);
2026
2027         gk20a_mm_unpin(dev_from_vm(vm), mapped_buffer->dmabuf,
2028                        mapped_buffer->sgt);
2029
2030         /* remove from the mapped buffer tree and any va list, then free */
2031         rb_erase(&mapped_buffer->node, &vm->mapped_buffers);
2032         if (!list_empty(&mapped_buffer->va_buffers_list))
2033                 list_del(&mapped_buffer->va_buffers_list);
2034
2035         /* keep the user-mapped buffer count in sync */
2036         if (mapped_buffer->user_mapped)
2037                 vm->num_user_mapped_buffers--;
2038
2039         if (mapped_buffer->own_mem_ref)
2040                 dma_buf_put(mapped_buffer->dmabuf);
2041
2042         kfree(mapped_buffer);
2043
2044         return;
2045 }
2046
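     /*
      * User-visible unmap: look up the mapped buffer at this offset and
      * drop one reference.  The actual teardown happens in
      * gk20a_vm_unmap_locked() via the kref release callback once the
      * last reference is gone.
      */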
2047 void gk20a_vm_unmap(struct vm_gk20a *vm, u64 offset)
2048 {
2049         struct device *d = dev_from_vm(vm);
2050         struct mapped_buffer_node *mapped_buffer;
2051
2052         mutex_lock(&vm->update_gmmu_lock);
2053         mapped_buffer = find_mapped_buffer_locked(&vm->mapped_buffers, offset);
2054         if (!mapped_buffer) {
2055                 mutex_unlock(&vm->update_gmmu_lock);
2056                 gk20a_err(d, "invalid addr to unmap 0x%llx", offset);
2057                 return;
2058         }
2059         kref_put(&mapped_buffer->ref, gk20a_vm_unmap_locked_kref);
2060         mutex_unlock(&vm->update_gmmu_lock);
2061 }
2062
2063 static void gk20a_vm_remove_support(struct vm_gk20a *vm)
2064 {
2065         struct gk20a *g = vm->mm->g;
2066         struct mapped_buffer_node *mapped_buffer;
2067         struct vm_reserved_va_node *va_node, *va_node_tmp;
2068         struct rb_node *node;
2069
2070         gk20a_dbg_fn("");
2071         mutex_lock(&vm->update_gmmu_lock);
2072
2073         /* TBD: add a flag here for the unmap code to recognize teardown
2074          * and short-circuit any otherwise expensive operations. */
2075
2076         node = rb_first(&vm->mapped_buffers);
2077         while (node) {
2078                 mapped_buffer =
2079                         container_of(node, struct mapped_buffer_node, node);
2080                 gk20a_vm_unmap_locked(mapped_buffer);
2081                 node = rb_first(&vm->mapped_buffers);
2082         }
2083
2084         /* destroy remaining reserved memory areas */
2085         list_for_each_entry_safe(va_node, va_node_tmp, &vm->reserved_va_list,
2086                 reserved_va_list) {
2087                 list_del(&va_node->reserved_va_list);
2088                 kfree(va_node);
2089         }
2090
2091         /* TBD: unmapping all buffers above may not actually free
2092          * all vm ptes.  jettison them here for certain... */
2093
2094         unmap_gmmu_pages(vm->pdes.ref, vm->pdes.sgt, vm->pdes.kv);
2095         free_gmmu_pages(vm, vm->pdes.ref, vm->pdes.sgt, 0, vm->pdes.size);
2096
2097         kfree(vm->pdes.ptes[gmmu_page_size_small]);
2098         kfree(vm->pdes.ptes[gmmu_page_size_big]);
2099         gk20a_allocator_destroy(&vm->vma[gmmu_page_size_small]);
2100         gk20a_allocator_destroy(&vm->vma[gmmu_page_size_big]);
2101
2102         mutex_unlock(&vm->update_gmmu_lock);
2103
2104         /* release zero page if used */
2105         if (vm->zero_page_cpuva)
2106                 dma_free_coherent(&g->dev->dev, vm->mm->big_page_size,
2107                                   vm->zero_page_cpuva, vm->zero_page_iova);
2108
2109         /* vm is not used anymore. release it. */
2110         kfree(vm);
2111 }
2112
2113 static void gk20a_vm_remove_support_kref(struct kref *ref)
2114 {
2115         struct vm_gk20a *vm = container_of(ref, struct vm_gk20a, ref);
2116         gk20a_vm_remove_support(vm);
2117 }
2118
2119 void gk20a_vm_get(struct vm_gk20a *vm)
2120 {
2121         kref_get(&vm->ref);
2122 }
2123
2124 void gk20a_vm_put(struct vm_gk20a *vm)
2125 {
2126         kref_put(&vm->ref, gk20a_vm_remove_support_kref);
2127 }
2128
2129 /* address space interfaces for the gk20a module */
2130 int gk20a_vm_alloc_share(struct gk20a_as_share *as_share)
2131 {
2132         struct gk20a_as *as = as_share->as;
2133         struct gk20a *g = gk20a_from_as(as);
2134         struct mm_gk20a *mm = &g->mm;
2135         struct vm_gk20a *vm;
2136         u64 vma_size;
2137         u32 num_pages, low_hole_pages;
2138         char name[32];
2139         int err;
2140
2141         gk20a_dbg_fn("");
2142
2143         vm = kzalloc(sizeof(*vm), GFP_KERNEL);
2144         if (!vm)
2145                 return -ENOMEM;
2146
2147         as_share->vm = vm;
2148
2149         vm->mm = mm;
2150         vm->as_share = as_share;
2151
2152         vm->big_pages = true;
2153
2154         vm->va_start  = mm->pde_stride;   /* create a one pde hole */
2155         vm->va_limit  = mm->channel.size; /* note this means channel.size is
2156                                              really just the max */
2157         {
2158                 u32 pde_lo, pde_hi;
2159                 pde_range_from_vaddr_range(vm,
2160                                            0, vm->va_limit-1,
2161                                            &pde_lo, &pde_hi);
2162                 vm->pdes.num_pdes = pde_hi + 1;
2163         }
2164
2165         vm->pdes.ptes[gmmu_page_size_small] =
2166                 kzalloc(sizeof(struct page_table_gk20a) *
2167                         vm->pdes.num_pdes, GFP_KERNEL);
2168
2169         vm->pdes.ptes[gmmu_page_size_big] =
2170                 kzalloc(sizeof(struct page_table_gk20a) *
2171                         vm->pdes.num_pdes, GFP_KERNEL);
2172
2173         if (!(vm->pdes.ptes[gmmu_page_size_small] &&
2174               vm->pdes.ptes[gmmu_page_size_big]))
2175                 return -ENOMEM;
2176
2177         gk20a_dbg_info("init space for va_limit=0x%llx num_pdes=%d",
2178                    vm->va_limit, vm->pdes.num_pdes);
2179
2180         /* allocate the page table directory */
2181         err = alloc_gmmu_pages(vm, 0, &vm->pdes.ref,
2182                                &vm->pdes.sgt, &vm->pdes.size);
2183         if (err)
2184                 return -ENOMEM;
2185
2186         err = map_gmmu_pages(vm->pdes.ref, vm->pdes.sgt, &vm->pdes.kv,
2187                              vm->pdes.size);
2188         if (err) {
2189                 free_gmmu_pages(vm, vm->pdes.ref, vm->pdes.sgt, 0,
2190                                         vm->pdes.size);
2191                 return -ENOMEM;
2192         }
2193         gk20a_dbg(gpu_dbg_pte, "pdes.kv = 0x%p, pdes.phys = 0x%llx",
2194                         vm->pdes.kv,
2195                         gk20a_mm_iova_addr(vm->pdes.sgt->sgl));
2196         /* we could release vm->pdes.kv but it's only one page... */
2197
2198
2199         /* low-half: alloc small pages */
2200         /* high-half: alloc big pages */
2201         vma_size = mm->channel.size >> 1;
2202
2203         snprintf(name, sizeof(name), "gk20a_as_%d-%dKB", as_share->id,
2204                  gmmu_page_sizes[gmmu_page_size_small]>>10);
2205         num_pages = (u32)(vma_size >> gmmu_page_shifts[gmmu_page_size_small]);
2206
2207         /* num_pages above is without regard to the low-side hole. */
2208         low_hole_pages = (vm->va_start >>
2209                           gmmu_page_shifts[gmmu_page_size_small]);
2210
2211         gk20a_allocator_init(&vm->vma[gmmu_page_size_small], name,
2212               low_hole_pages,             /* start */
2213               num_pages - low_hole_pages, /* length */
2214               1);                         /* align */
2215
2216         snprintf(name, sizeof(name), "gk20a_as_%d-%dKB", as_share->id,
2217                  gmmu_page_sizes[gmmu_page_size_big]>>10);
2218
2219         num_pages = (u32)(vma_size >> gmmu_page_shifts[gmmu_page_size_big]);
2220         gk20a_allocator_init(&vm->vma[gmmu_page_size_big], name,
2221                               num_pages, /* start */
2222                               num_pages, /* length */
2223                               1); /* align */
2224
2225         vm->mapped_buffers = RB_ROOT;
2226
2227         mutex_init(&vm->update_gmmu_lock);
2228         kref_init(&vm->ref);
2229         INIT_LIST_HEAD(&vm->reserved_va_list);
2230
2231         vm->enable_ctag = true;
2232
2233         return 0;
2234 }
2235
2236
2237 int gk20a_vm_release_share(struct gk20a_as_share *as_share)
2238 {
2239         struct vm_gk20a *vm = as_share->vm;
2240
2241         gk20a_dbg_fn("");
2242
2243         vm->as_share = NULL;
2244
2245         /* drop the as_share's reference to the vm */
2246         gk20a_vm_put(vm);
2247
2248         as_share->vm = NULL;
2249
2250         return 0;
2251 }
2252
2253
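     /*
      * Reserve a va range for later map_buffer calls (typically at fixed
      * offsets inside the reservation).  The page size selects the
      * allocator; FIXED_OFFSET requests a specific start address, and
      * SPARSE (big pages only) pre-populates the range with zero-page
      * mappings via gk20a_vm_put_empty().
      */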
2254 int gk20a_vm_alloc_space(struct gk20a_as_share *as_share,
2255                          struct nvhost_as_alloc_space_args *args)
2256 {
2257         int err = -ENOMEM;
2258         int pgsz_idx;
2259         u32 start_page_nr;
2260         struct gk20a_allocator *vma;
2261         struct vm_gk20a *vm = as_share->vm;
2262         struct vm_reserved_va_node *va_node;
2263         u64 vaddr_start = 0;
2264
2265         gk20a_dbg_fn("flags=0x%x pgsz=0x%x nr_pages=0x%x o/a=0x%llx",
2266                         args->flags, args->page_size, args->pages,
2267                         args->o_a.offset);
2268
2269         /* determine pagesz idx */
2270         for (pgsz_idx = gmmu_page_size_small;
2271              pgsz_idx < gmmu_nr_page_sizes;
2272              pgsz_idx++) {
2273                 if (gmmu_page_sizes[pgsz_idx] == args->page_size)
2274                         break;
2275         }
2276
2277         if (pgsz_idx >= gmmu_nr_page_sizes) {
2278                 err = -EINVAL;
2279                 goto clean_up;
2280         }
2281
2282         va_node = kzalloc(sizeof(*va_node), GFP_KERNEL);
2283         if (!va_node) {
2284                 err = -ENOMEM;
2285                 goto clean_up;
2286         }
2287
2288         if (args->flags & NVHOST_AS_ALLOC_SPACE_FLAGS_SPARSE &&
2289             pgsz_idx != gmmu_page_size_big) {
2290                 err = -ENOSYS;
2291                 kfree(va_node);
2292                 goto clean_up;
2293         }
2294
2295         start_page_nr = 0;
2296         if (args->flags & NVHOST_AS_ALLOC_SPACE_FLAGS_FIXED_OFFSET)
2297                 start_page_nr = (u32)(args->o_a.offset >>
2298                                       gmmu_page_shifts[pgsz_idx]);
2299
2300         vma = &vm->vma[pgsz_idx];
2301         err = vma->alloc(vma, &start_page_nr, args->pages);
2302         if (err) {
2303                 kfree(va_node);
2304                 goto clean_up;
2305         }
2306
2307         vaddr_start = (u64)start_page_nr << gmmu_page_shifts[pgsz_idx];
2308
2309         va_node->vaddr_start = vaddr_start;
2310         va_node->size = (u64)args->page_size * (u64)args->pages;
2311         va_node->pgsz_idx = pgsz_idx;
2312         INIT_LIST_HEAD(&va_node->va_buffers_list);
2313         INIT_LIST_HEAD(&va_node->reserved_va_list);
2314
2315         mutex_lock(&vm->update_gmmu_lock);
2316
2317         /* mark that we need to use sparse mappings here */
2318         if (args->flags & NVHOST_AS_ALLOC_SPACE_FLAGS_SPARSE) {
2319                 err = gk20a_vm_put_empty(vm, vaddr_start, args->pages,
2320                                          pgsz_idx);
2321                 if (err) {
2322                         mutex_unlock(&vm->update_gmmu_lock);
2323                         vma->free(vma, start_page_nr, args->pages);
2324                         kfree(va_node);
2325                         goto clean_up;
2326                 }
2327
2328                 va_node->sparse = true;
2329         }
2330
2331         list_add_tail(&va_node->reserved_va_list, &vm->reserved_va_list);
2332
2333         mutex_unlock(&vm->update_gmmu_lock);
2334
2335         args->o_a.offset = vaddr_start;
2336
2337 clean_up:
2338         return err;
2339 }
2340
2341 int gk20a_vm_free_space(struct gk20a_as_share *as_share,
2342                         struct nvhost_as_free_space_args *args)
2343 {
2344         int err = -ENOMEM;
2345         int pgsz_idx;
2346         u32 start_page_nr;
2347         struct gk20a_allocator *vma;
2348         struct vm_gk20a *vm = as_share->vm;
2349         struct vm_reserved_va_node *va_node;
2350
2351         gk20a_dbg_fn("pgsz=0x%x nr_pages=0x%x o/a=0x%llx", args->page_size,
2352                         args->pages, args->offset);
2353
2354         /* determine pagesz idx */
2355         for (pgsz_idx = gmmu_page_size_small;
2356              pgsz_idx < gmmu_nr_page_sizes;
2357              pgsz_idx++) {
2358                 if (gmmu_page_sizes[pgsz_idx] == args->page_size)
2359                         break;
2360         }
2361
2362         if (pgsz_idx >= gmmu_nr_page_sizes) {
2363                 err = -EINVAL;
2364                 goto clean_up;
2365         }
2366
2367         start_page_nr = (u32)(args->offset >>
2368                               gmmu_page_shifts[pgsz_idx]);
2369
2370         vma = &vm->vma[pgsz_idx];
2371         err = vma->free(vma, start_page_nr, args->pages);
2372
2373         if (err)
2374                 goto clean_up;
2375
2376         mutex_lock(&vm->update_gmmu_lock);
2377         va_node = addr_to_reservation(vm, args->offset);
2378         if (va_node) {
2379                 struct mapped_buffer_node *buffer, *buffer_tmp;
2380
2381                 /* no need to unmap the buffers in this reservation; just
2382                  * detach them so they become normal buffers.  Use the _safe
2383                  * iterator since entries are unlinked while walking. */
2384                 list_for_each_entry_safe(buffer, buffer_tmp,
2385                         &va_node->va_buffers_list, va_buffers_list)
2386                         list_del_init(&buffer->va_buffers_list);
2387
2388                 list_del(&va_node->reserved_va_list);
2389
2390                 /* if this was a sparse mapping, unmap the zero-page backing */
2391                 if (va_node->sparse)
2392                         __locked_gmmu_unmap(vm,
2393                                 va_node->vaddr_start,
2394                                 va_node->size,
2395                                 va_node->pgsz_idx,
2396                                 false,
2397                                 gk20a_mem_flag_none);
2398                 kfree(va_node);
2399         }
2400         mutex_unlock(&vm->update_gmmu_lock);
2401
2402 clean_up:
2403         return err;
2404 }
2405
2406 int gk20a_vm_bind_channel(struct gk20a_as_share *as_share,
2407                           struct channel_gk20a *ch)
2408 {
2409         int err = 0;
2410         struct vm_gk20a *vm = as_share->vm;
2411
2412         gk20a_dbg_fn("");
2413
2414         ch->vm = vm;
2415         err = channel_gk20a_commit_va(ch);
2416         if (err)
2417                 ch->vm = NULL;
2418
2419         return err;
2420 }
2421
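     /*
      * Lazily attach gk20a's per-(dmabuf, device) private data.  The
      * fast path just checks for existing drvdata; allocation is
      * serialized under a local static mutex and re-checked there so
      * concurrent callers don't attach twice.
      */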
2422 int gk20a_dmabuf_alloc_drvdata(struct dma_buf *dmabuf, struct device *dev)
2423 {
2424         struct gk20a_dmabuf_priv *priv;
2425         static DEFINE_MUTEX(priv_lock);
2426
2427         priv = dma_buf_get_drvdata(dmabuf, dev);
2428         if (likely(priv))
2429                 return 0;
2430
2431         mutex_lock(&priv_lock);
2432         priv = dma_buf_get_drvdata(dmabuf, dev);
2433         if (priv)
2434                 goto priv_exist_or_err;
2435         priv = kzalloc(sizeof(*priv), GFP_KERNEL);
2436         if (!priv) {
2437                 priv = ERR_PTR(-ENOMEM);
2438                 goto priv_exist_or_err;
2439         }
2440         mutex_init(&priv->lock);
2441         dma_buf_set_drvdata(dmabuf, dev, priv, gk20a_mm_delete_priv);
2442 priv_exist_or_err:
2443         mutex_unlock(&priv_lock);
2444         if (IS_ERR(priv))
2445                 return -ENOMEM;
2446
2447         return 0;
2448 }
2449
2450
2451 static int gk20a_dmabuf_get_kind(struct dma_buf *dmabuf)
2452 {
2453         int kind = 0;
2454 #ifdef CONFIG_TEGRA_NVMAP
2455         int err;
2456         u64 nvmap_param;
2457
2458         err = nvmap_get_dmabuf_param(dmabuf, NVMAP_HANDLE_PARAM_KIND,
2459                                      &nvmap_param);
2460         kind = err ? kind : nvmap_param;
2461 #endif
2462         return kind;
2463 }
2464
2465 int gk20a_vm_map_buffer(struct gk20a_as_share *as_share,
2466                         int dmabuf_fd,
2467                         u64 *offset_align,
2468                         u32 flags, /*NVHOST_AS_MAP_BUFFER_FLAGS_*/
2469                         int kind)
2470 {
2471         int err = 0;
2472         struct vm_gk20a *vm = as_share->vm;
2473         struct dma_buf *dmabuf;
2474         u64 ret_va;
2475
2476         gk20a_dbg_fn("");
2477
2478         /* get ref to the mem handle (released on unmap_locked) */
2479         dmabuf = dma_buf_get(dmabuf_fd);
2480         if (IS_ERR(dmabuf))
2481                 return PTR_ERR(dmabuf);
2482
2483         err = gk20a_dmabuf_alloc_drvdata(dmabuf, dev_from_vm(vm));
2484         if (err) {
2485                 dma_buf_put(dmabuf);
2486                 return err;
2487         }
2488
2489         if (kind == -1)
2490                 kind = gk20a_dmabuf_get_kind(dmabuf);
2491
2492         ret_va = gk20a_vm_map(vm, dmabuf, *offset_align,
2493                         flags, kind, NULL, true,
2494                         gk20a_mem_flag_none);
2495         *offset_align = ret_va;
2496         if (!ret_va) {
2497                 dma_buf_put(dmabuf);
2498                 err = -EINVAL;
2499         }
2500
2501         return err;
2502 }
2503
2504 int gk20a_vm_unmap_buffer(struct gk20a_as_share *as_share, u64 offset)
2505 {
2506         struct vm_gk20a *vm = as_share->vm;
2507
2508         gk20a_dbg_fn("");
2509
2510         gk20a_vm_unmap_user(vm, offset);
2511         return 0;
2512 }
2513
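     /*
      * One-time setup of the bar1 vm: size the pde range for the bar1
      * aperture, allocate and map the page directory, then allocate an
      * instance block and program it with the page directory base and
      * the address limit.  Only the small-page allocator is expected to
      * be used; the big-page one is initialized defensively.
      */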
2514 int gk20a_init_bar1_vm(struct mm_gk20a *mm)
2515 {
2516         int err;
2517         phys_addr_t inst_pa;
2518         void *inst_ptr;
2519         struct vm_gk20a *vm = &mm->bar1.vm;
2520         struct gk20a *g = gk20a_from_mm(mm);
2521         struct device *d = dev_from_gk20a(g);
2522         struct inst_desc *inst_block = &mm->bar1.inst_block;
2523         u64 pde_addr;
2524         u32 pde_addr_lo;
2525         u32 pde_addr_hi;
2526         dma_addr_t iova;
2527
2528         vm->mm = mm;
2529
2530         mm->bar1.aperture_size = bar1_aperture_size_mb_gk20a() << 20;
2531
2532         gk20a_dbg_info("bar1 vm size = 0x%x", mm->bar1.aperture_size);
2533
2534         vm->va_start = mm->pde_stride * 1;
2535         vm->va_limit = mm->bar1.aperture_size;
2536
2537         {
2538                 u32 pde_lo, pde_hi;
2539                 pde_range_from_vaddr_range(vm,
2540                                            0, vm->va_limit-1,
2541                                            &pde_lo, &pde_hi);
2542                 vm->pdes.num_pdes = pde_hi + 1;
2543         }
2544
2545         /* bar1 is likely only to ever use/need small page sizes,
2546          * but just in case, for now... arrange for both. */
2547         vm->pdes.ptes[gmmu_page_size_small] =
2548                 kzalloc(sizeof(struct page_table_gk20a) *
2549                         vm->pdes.num_pdes, GFP_KERNEL);
2550
2551         vm->pdes.ptes[gmmu_page_size_big] =
2552                 kzalloc(sizeof(struct page_table_gk20a) *
2553                         vm->pdes.num_pdes, GFP_KERNEL);
2554
2555         if (!(vm->pdes.ptes[gmmu_page_size_small] &&
2556               vm->pdes.ptes[gmmu_page_size_big]))
2557                 return -ENOMEM;
2558
2559         gk20a_dbg_info("init space for bar1 va_limit=0x%llx num_pdes=%d",
2560                    vm->va_limit, vm->pdes.num_pdes);
2561
2562
2563         /* allocate the page table directory */
2564         err = alloc_gmmu_pages(vm, 0, &vm->pdes.ref,
2565                                &vm->pdes.sgt, &vm->pdes.size);
2566         if (err)
2567                 goto clean_up;
2568
2569         err = map_gmmu_pages(vm->pdes.ref, vm->pdes.sgt, &vm->pdes.kv,
2570                              vm->pdes.size);
2571         if (err) {
2572                 free_gmmu_pages(vm, vm->pdes.ref, vm->pdes.sgt, 0,
2573                                         vm->pdes.size);
2574                 goto clean_up;
2575         }
2576         gk20a_dbg(gpu_dbg_pte, "bar 1 pdes.kv = 0x%p, pdes.phys = 0x%llx",
2577                         vm->pdes.kv, gk20a_mm_iova_addr(vm->pdes.sgt->sgl));
2578         /* we could release vm->pdes.kv but it's only one page... */
2579
2580         pde_addr = gk20a_mm_iova_addr(vm->pdes.sgt->sgl);
2581         pde_addr_lo = u64_lo32(pde_addr >> 12);
2582         pde_addr_hi = u64_hi32(pde_addr);
2583
2584         gk20a_dbg_info("pde pa=0x%llx pde_addr_lo=0x%x pde_addr_hi=0x%x",
2585                 (u64)gk20a_mm_iova_addr(vm->pdes.sgt->sgl),
2586                 pde_addr_lo, pde_addr_hi);
2587
2588         /* allocate instance mem for bar1 */
2589         inst_block->size = ram_in_alloc_size_v();
2590         inst_block->cpuva = dma_alloc_coherent(d, inst_block->size,
2591                                 &iova, GFP_KERNEL);
2592         if (!inst_block->cpuva) {
2593                 gk20a_err(d, "%s: memory allocation failed\n", __func__);
2594                 err = -ENOMEM;
2595                 goto clean_up;
2596         }
2597
2598         inst_block->iova = iova;
2599         inst_block->cpu_pa = gk20a_get_phys_from_iova(d, inst_block->iova);
2600         if (!inst_block->cpu_pa) {
2601                 gk20a_err(d, "%s: failed to get phys address\n", __func__);
2602                 err = -ENOMEM;
2603                 goto clean_up;
2604         }
2605
2606         inst_pa = inst_block->cpu_pa;
2607         inst_ptr = inst_block->cpuva;
2608
2609         gk20a_dbg_info("bar1 inst block physical addr = 0x%llx, kv = 0x%p",
2610                 (u64)inst_pa, inst_ptr);
2611
2612         memset(inst_ptr, 0, ram_fc_size_val_v());
2613
2614         gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_lo_w(),
2615                 ram_in_page_dir_base_target_vid_mem_f() |
2616                 ram_in_page_dir_base_vol_true_f() |
2617                 ram_in_page_dir_base_lo_f(pde_addr_lo));
2618
2619         gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_hi_w(),
2620                 ram_in_page_dir_base_hi_f(pde_addr_hi));
2621
2622         gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_lo_w(),
2623                  u64_lo32(vm->va_limit) | 0xFFF);
2624
2625         gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_hi_w(),
2626                 ram_in_adr_limit_hi_f(u64_hi32(vm->va_limit)));
2627
2628         gk20a_dbg_info("bar1 inst block ptr: %08llx",  (u64)inst_pa);
2629         gk20a_allocator_init(&vm->vma[gmmu_page_size_small], "gk20a_bar1",
2630                               1,/*start*/
2631                               (vm->va_limit >> 12) - 1 /* length*/,
2632                               1); /* align */
2633         /* initialize just in case we try to use it anyway */
2634         gk20a_allocator_init(&vm->vma[gmmu_page_size_big], "gk20a_bar1-unused",
2635                               0x0badc0de, /* start */
2636                               1, /* length */
2637                               1); /* align */
2638
2639         vm->mapped_buffers = RB_ROOT;
2640
2641         mutex_init(&vm->update_gmmu_lock);
2642         kref_init(&vm->ref);
2643         INIT_LIST_HEAD(&vm->reserved_va_list);
2644
2645         return 0;
2646
2647 clean_up:
2648         /* free, etc */
2649         if (inst_block->cpuva)
2650                 dma_free_coherent(d, inst_block->size,
2651                         inst_block->cpuva, inst_block->iova);
2652         inst_block->cpuva = NULL;
2653         inst_block->iova = 0;
2654         return err;
2655 }
2656
2657 /* pmu vm, share channel_vm interfaces */
2658 int gk20a_init_pmu_vm(struct mm_gk20a *mm)
2659 {
2660         int err;
2661         phys_addr_t inst_pa;
2662         void *inst_ptr;
2663         struct vm_gk20a *vm = &mm->pmu.vm;
2664         struct gk20a *g = gk20a_from_mm(mm);
2665         struct device *d = dev_from_gk20a(g);
2666         struct inst_desc *inst_block = &mm->pmu.inst_block;
2667         u64 pde_addr;
2668         u32 pde_addr_lo;
2669         u32 pde_addr_hi;
2670         dma_addr_t iova;
2671
2672         vm->mm = mm;
2673
2674         mm->pmu.aperture_size = GK20A_PMU_VA_SIZE;
2675
2676         gk20a_dbg_info("pmu vm size = 0x%x", mm->pmu.aperture_size);
2677
2678         vm->va_start  = GK20A_PMU_VA_START;
2679         vm->va_limit  = vm->va_start + mm->pmu.aperture_size;
2680
2681         {
2682                 u32 pde_lo, pde_hi;
2683                 pde_range_from_vaddr_range(vm,
2684                                            0, vm->va_limit-1,
2685                                            &pde_lo, &pde_hi);
2686                 vm->pdes.num_pdes = pde_hi + 1;
2687         }
2688
2689         /* The pmu is likely only to ever use/need small page sizes,
2690          * but just in case, for now... arrange for both. */
2691         vm->pdes.ptes[gmmu_page_size_small] =
2692                 kzalloc(sizeof(struct page_table_gk20a) *
2693                         vm->pdes.num_pdes, GFP_KERNEL);
2694
2695         vm->pdes.ptes[gmmu_page_size_big] =
2696                 kzalloc(sizeof(struct page_table_gk20a) *
2697                         vm->pdes.num_pdes, GFP_KERNEL);
2698
2699         if (!(vm->pdes.ptes[gmmu_page_size_small] &&
2700               vm->pdes.ptes[gmmu_page_size_big]))
2701                 return -ENOMEM;
2702
2703         gk20a_dbg_info("init space for pmu va_limit=0x%llx num_pdes=%d",
2704                    vm->va_limit, vm->pdes.num_pdes);
2705
2706         /* allocate the page table directory */
2707         err = alloc_gmmu_pages(vm, 0, &vm->pdes.ref,
2708                                &vm->pdes.sgt, &vm->pdes.size);
2709         if (err)
2710                 goto clean_up;
2711
2712         err = map_gmmu_pages(vm->pdes.ref, vm->pdes.sgt, &vm->pdes.kv,
2713                              vm->pdes.size);
2714         if (err) {
2715                 free_gmmu_pages(vm, vm->pdes.ref, vm->pdes.sgt, 0,
2716                                         vm->pdes.size);
2717                 goto clean_up;
2718         }
2719         gk20a_dbg_info("pmu pdes phys @ 0x%llx",
2720                         (u64)gk20a_mm_iova_addr(vm->pdes.sgt->sgl));
2721         /* we could release vm->pdes.kv but it's only one page... */
2722
2723         pde_addr = gk20a_mm_iova_addr(vm->pdes.sgt->sgl);
2724         pde_addr_lo = u64_lo32(pde_addr >> 12);
2725         pde_addr_hi = u64_hi32(pde_addr);
2726
2727         gk20a_dbg_info("pde pa=0x%llx pde_addr_lo=0x%x pde_addr_hi=0x%x",
2728                         (u64)pde_addr, pde_addr_lo, pde_addr_hi);
2729
2730         /* allocate instance mem for pmu */
2731         inst_block->size = GK20A_PMU_INST_SIZE;
2732         inst_block->cpuva = dma_alloc_coherent(d, inst_block->size,
2733                                 &iova, GFP_KERNEL);
2734         if (!inst_block->cpuva) {
2735                 gk20a_err(d, "%s: memory allocation failed\n", __func__);
2736                 err = -ENOMEM;
2737                 goto clean_up;
2738         }
2739
2740         inst_block->iova = iova;
2741         inst_block->cpu_pa = gk20a_get_phys_from_iova(d, inst_block->iova);
2742         if (!inst_block->cpu_pa) {
2743                 gk20a_err(d, "%s: failed to get phys address\n", __func__);
2744                 err = -ENOMEM;
2745                 goto clean_up;
2746         }
2747
2748         inst_pa = inst_block->cpu_pa;
2749         inst_ptr = inst_block->cpuva;
2750
2751         gk20a_dbg_info("pmu inst block physical addr: 0x%llx", (u64)inst_pa);
2752
2753         memset(inst_ptr, 0, GK20A_PMU_INST_SIZE);
2754
2755         gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_lo_w(),
2756                 ram_in_page_dir_base_target_vid_mem_f() |
2757                 ram_in_page_dir_base_vol_true_f() |
2758                 ram_in_page_dir_base_lo_f(pde_addr_lo));
2759
2760         gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_hi_w(),
2761                 ram_in_page_dir_base_hi_f(pde_addr_hi));
2762
2763         gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_lo_w(),
2764                  u64_lo32(vm->va_limit) | 0xFFF);
2765
2766         gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_hi_w(),
2767                 ram_in_adr_limit_hi_f(u64_hi32(vm->va_limit)));
2768
2769         gk20a_allocator_init(&vm->vma[gmmu_page_size_small], "gk20a_pmu",
2770                               (vm->va_start >> 12), /* start */
2771                               (vm->va_limit - vm->va_start) >> 12, /*length*/
2772                               1); /* align */
2773         /* initialize just in case we try to use it anyway */
2774         gk20a_allocator_init(&vm->vma[gmmu_page_size_big], "gk20a_pmu-unused",
2775                               0x0badc0de, /* start */
2776                               1, /* length */
2777                               1); /* align */
2778
2779
2780         vm->mapped_buffers = RB_ROOT;
2781
2782         mutex_init(&vm->update_gmmu_lock);
2783         kref_init(&vm->ref);
2784         INIT_LIST_HEAD(&vm->reserved_va_list);
2785
2786         return 0;
2787
2788 clean_up:
2789         /* free, etc */
2790         if (inst_block->cpuva)
2791                 dma_free_coherent(d, inst_block->size,
2792                         inst_block->cpuva, inst_block->iova);
2793         inst_block->cpuva = NULL;
2794         inst_block->iova = 0;
2795         return err;
2796 }
2797
2798 int gk20a_mm_fb_flush(struct gk20a *g)
2799 {
2800         struct mm_gk20a *mm = &g->mm;
2801         u32 data;
2802         s32 retry = 100;
2803         int ret = 0;
2804
2805         gk20a_dbg_fn("");
2806
2807         mutex_lock(&mm->l2_op_lock);
2808
2809         g->ops.ltc.elpg_flush(g);
2810
2811         /* Make sure all previous writes are committed to the L2. There's no
2812            guarantee that they have reached DRAM. This acts as a sysmembar
2813            internal to the L2. */
2814         gk20a_writel(g, flush_fb_flush_r(),
2815                 flush_fb_flush_pending_busy_f());
2816
2817         do {
2818                 data = gk20a_readl(g, flush_fb_flush_r());
2819
2820                 if (flush_fb_flush_outstanding_v(data) ==
2821                         flush_fb_flush_outstanding_true_v() ||
2822                     flush_fb_flush_pending_v(data) ==
2823                         flush_fb_flush_pending_busy_v()) {
2824                                 gk20a_dbg_info("fb_flush 0x%x", data);
2825                                 retry--;
2826                                 usleep_range(20, 40);
2827                 } else
2828                         break;
2829         } while (retry >= 0 || !tegra_platform_is_silicon());
2830
2831         if (retry < 0) {
2832                 gk20a_warn(dev_from_gk20a(g),
2833                         "fb_flush too many retries");
2834                 ret = -EBUSY;
2835         }
2836
2837         mutex_unlock(&mm->l2_op_lock);
2838
2839         return ret;
2840 }
2841
2842 static void gk20a_mm_l2_invalidate_locked(struct gk20a *g)
2843 {
2844         u32 data;
2845         s32 retry = 200;
2846
2847         /* Invalidate any clean lines from the L2 so subsequent reads go to
2848            DRAM. Dirty lines are not affected by this operation. */
2849         gk20a_writel(g, flush_l2_system_invalidate_r(),
2850                 flush_l2_system_invalidate_pending_busy_f());
2851
2852         do {
2853                 data = gk20a_readl(g, flush_l2_system_invalidate_r());
2854
2855                 if (flush_l2_system_invalidate_outstanding_v(data) ==
2856                         flush_l2_system_invalidate_outstanding_true_v() ||
2857                     flush_l2_system_invalidate_pending_v(data) ==
2858                         flush_l2_system_invalidate_pending_busy_v()) {
2859                                 gk20a_dbg_info("l2_system_invalidate 0x%x",
2860                                                 data);
2861                                 retry--;
2862                                 usleep_range(20, 40);
2863                 } else
2864                         break;
2865         } while (retry >= 0 || !tegra_platform_is_silicon());
2866
2867         if (retry < 0)
2868                 gk20a_warn(dev_from_gk20a(g),
2869                         "l2_system_invalidate too many retries");
2870 }
2871
2872 void gk20a_mm_l2_invalidate(struct gk20a *g)
2873 {
2874         struct mm_gk20a *mm = &g->mm;
2875         mutex_lock(&mm->l2_op_lock);
2876         gk20a_mm_l2_invalidate_locked(g);
2877         mutex_unlock(&mm->l2_op_lock);
2878 }
2879
2880 void gk20a_mm_l2_flush(struct gk20a *g, bool invalidate)
2881 {
2882         struct mm_gk20a *mm = &g->mm;
2883         u32 data;
2884         s32 retry = 200;
2885
2886         gk20a_dbg_fn("");
2887
2888         mutex_lock(&mm->l2_op_lock);
2889
2890         /* Flush all dirty lines from the L2 to DRAM. Lines are left in the L2
2891            as clean, so subsequent reads might hit in the L2. */
2892         gk20a_writel(g, flush_l2_flush_dirty_r(),
2893                 flush_l2_flush_dirty_pending_busy_f());
2894
2895         do {
2896                 data = gk20a_readl(g, flush_l2_flush_dirty_r());
2897
2898                 if (flush_l2_flush_dirty_outstanding_v(data) ==
2899                         flush_l2_flush_dirty_outstanding_true_v() ||
2900                     flush_l2_flush_dirty_pending_v(data) ==
2901                         flush_l2_flush_dirty_pending_busy_v()) {
2902                                 gk20a_dbg_info("l2_flush_dirty 0x%x", data);
2903                                 retry--;
2904                                 usleep_range(20, 40);
2905                 } else
2906                         break;
2907         } while (retry >= 0 || !tegra_platform_is_silicon());
2908
2909         if (retry < 0)
2910                 gk20a_warn(dev_from_gk20a(g),
2911                         "l2_flush_dirty too many retries");
2912
2913         if (invalidate)
2914                 gk20a_mm_l2_invalidate_locked(g);
2915
2916         mutex_unlock(&mm->l2_op_lock);
2917 }
2918
2919
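     /*
      * Reverse lookup: find which mapped dmabuf (and offset within it)
      * contains a given GPU VA.  Returns -EINVAL if nothing is mapped
      * there.
      *
      * A minimal, hypothetical use (e.g. resolving a faulting VA in an
      * error path; locals are illustrative only):
      *
      *     struct dma_buf *dmabuf;
      *     u64 offset;
      *
      *     if (!gk20a_vm_find_buffer(vm, gpu_va, &dmabuf, &offset))
      *             pr_info("va 0x%llx -> dmabuf %p + 0x%llx\n",
      *                     gpu_va, dmabuf, offset);
      */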
2920 int gk20a_vm_find_buffer(struct vm_gk20a *vm, u64 gpu_va,
2921                          struct dma_buf **dmabuf,
2922                          u64 *offset)
2923 {
2924         struct mapped_buffer_node *mapped_buffer;
2925
2926         gk20a_dbg_fn("gpu_va=0x%llx", gpu_va);
2927
2928         mutex_lock(&vm->update_gmmu_lock);
2929
2930         mapped_buffer = find_mapped_buffer_range_locked(&vm->mapped_buffers,
2931                                                         gpu_va);
2932         if (!mapped_buffer) {
2933                 mutex_unlock(&vm->update_gmmu_lock);
2934                 return -EINVAL;
2935         }
2936
2937         *dmabuf = mapped_buffer->dmabuf;
2938         *offset = gpu_va - mapped_buffer->addr;
2939
2940         mutex_unlock(&vm->update_gmmu_lock);
2941
2942         return 0;
2943 }
2944
2945 void gk20a_mm_tlb_invalidate(struct vm_gk20a *vm)
2946 {
2947         struct mm_gk20a *mm = vm->mm;
2948         struct gk20a *g = gk20a_from_vm(vm);
2949         u32 addr_lo = u64_lo32(gk20a_mm_iova_addr(vm->pdes.sgt->sgl) >> 12);
2950         u32 data;
2951         s32 retry = 200;
2952
2953         gk20a_dbg_fn("");
2954
2955         /* Page tables are considered SW state and are preserved across
2956            prepare_poweroff. When gk20a deinit releases those page tables,
2957            common code in the vm unmap path calls tlb invalidate, which
2958            touches HW. Use the power_on flag to skip the tlb invalidation
2959            when gpu power is turned off. */
2960
2961         if (!g->power_on)
2962                 return;
2963
2964         /* No need to invalidate if tlb is clean */
2965         mutex_lock(&vm->update_gmmu_lock);
2966         if (!vm->tlb_dirty) {
2967                 mutex_unlock(&vm->update_gmmu_lock);
2968                 return;
2969         }
2970         vm->tlb_dirty = false;
2971         mutex_unlock(&vm->update_gmmu_lock);
2972
2973         mutex_lock(&mm->tlb_lock);
2974         do {
2975                 data = gk20a_readl(g, fb_mmu_ctrl_r());
2976                 if (fb_mmu_ctrl_pri_fifo_space_v(data) != 0)
2977                         break;
2978                 usleep_range(20, 40);
2979                 retry--;
2980         } while (retry >= 0 || !tegra_platform_is_silicon());
2981
2982         if (retry < 0)
2983                 gk20a_warn(dev_from_gk20a(g),
2984                         "wait mmu fifo space too many retries");
2985
2986         gk20a_writel(g, fb_mmu_invalidate_pdb_r(),
2987                 fb_mmu_invalidate_pdb_addr_f(addr_lo) |
2988                 fb_mmu_invalidate_pdb_aperture_vid_mem_f());
2989
2990         /* This is a sledgehammer: invalidate all PDBs and all VAs */
2991         gk20a_writel(g, fb_mmu_invalidate_r(),
2992                 fb_mmu_invalidate_all_pdb_true_f() |
2993                 fb_mmu_invalidate_all_va_true_f() |
2994                 fb_mmu_invalidate_trigger_true_f());
2995
2996         do {
2997                 data = gk20a_readl(g, fb_mmu_ctrl_r());
2998                 if (fb_mmu_ctrl_pri_fifo_empty_v(data) !=
2999                         fb_mmu_ctrl_pri_fifo_empty_false_f())
3000                         break;
3001                 retry--;
3002                 usleep_range(20, 40);
3003         } while (retry >= 0 || !tegra_platform_is_silicon());
3004
3005         if (retry < 0)
3006                 gk20a_warn(dev_from_gk20a(g),
3007                         "mmu invalidate too many retries");
3008
3009         mutex_unlock(&mm->tlb_lock);
3010 }
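
/*
 * Note: the invalidate is skipped entirely when vm->tlb_dirty is false, so
 * callers may invoke this unconditionally after updating page tables; only
 * paths that actually marked the TLB dirty pay for the register polling
 * above.
 */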
3011
3012 int gk20a_mm_suspend(struct gk20a *g)
3013 {
3014         gk20a_dbg_fn("");
3015
3016         gk20a_mm_fb_flush(g);
3017         gk20a_mm_l2_flush(g, true);
3018
3019         gk20a_dbg_fn("done");
3020         return 0;
3021 }
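
/*
 * Suspend path: flush the frame buffer, then flush and invalidate L2 so that
 * nothing the GPU has cached is lost across the power transition. The return
 * value is currently always 0.
 */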
3022
3023 void gk20a_mm_ltc_isr(struct gk20a *g)
3024 {
3025         u32 intr;
3026
3027         intr = gk20a_readl(g, ltc_ltc0_ltss_intr_r());
3028         gk20a_err(dev_from_gk20a(g), "ltc: %08x\n", intr);
3029         gk20a_writel(g, ltc_ltc0_ltss_intr_r(), intr);
3030 }
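
/*
 * LTC interrupt service: read the ltc0/ltss interrupt status, log it, and
 * write the same value back to acknowledge it. Treating this as a
 * write-one-to-clear register is an assumption based on the pattern here,
 * not on documented behaviour.
 */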
3031
3032 bool gk20a_mm_mmu_debug_mode_enabled(struct gk20a *g)
3033 {
3034         u32 debug_ctrl = gk20a_readl(g, fb_mmu_debug_ctrl_r());
3035         return fb_mmu_debug_ctrl_debug_v(debug_ctrl) ==
3036                 fb_mmu_debug_ctrl_debug_enabled_v();
3037 }
3038
3039 static int gk20a_mm_mmu_vpr_info_fetch_wait(struct gk20a *g,
3040                                             const unsigned int msec)
3041 {
3042         unsigned long timeout;
3043
3044         timeout = jiffies + msecs_to_jiffies(msec);
3045         while (1) {
3046                 u32 val;
3047
3048                 val = gk20a_readl(g, fb_mmu_vpr_info_r());
3049                 if (fb_mmu_vpr_info_fetch_v(val) ==
3050                     fb_mmu_vpr_info_fetch_false_v())
3051                         break;
3052
3053                 if (tegra_platform_is_silicon() &&
3054                                 WARN_ON(time_after(jiffies, timeout)))
3055                         return -ETIME;
3056         }
3057
3058         return 0;
3059 }
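
/*
 * Note: the timeout above is only enforced on silicon; on non-silicon
 * platforms the loop spins until the fetch bit clears, with no deadline.
 */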
3060
3061 int gk20a_mm_mmu_vpr_info_fetch(struct gk20a *g)
3062 {
3063         int ret = 0;
3064
3065         gk20a_busy_noresume(g->dev);
3066         if (!pm_runtime_active(&g->dev->dev))
3067                 goto fail;
3068
3069         if (gk20a_mm_mmu_vpr_info_fetch_wait(g, 5)) {
3070                 ret = -ETIME;
3071                 goto fail;
3072         }
3073
3074         gk20a_writel(g, fb_mmu_vpr_info_r(),
3075                      fb_mmu_vpr_info_fetch_true_v());
3076
3077         ret = gk20a_mm_mmu_vpr_info_fetch_wait(g, 5);
3078
3079  fail:
3080         gk20a_idle(g->dev);
3081         return ret;
3082 }
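
/*
 * Flow sketch for gk20a_mm_mmu_vpr_info_fetch(): take a no-resume busy
 * reference so the GPU is not powered up just for this, bail out if runtime
 * PM is not active, wait for any fetch already in flight, trigger a new
 * fetch, then wait for it to complete before dropping the busy reference.
 * What the fetch latches in HW (most likely the current VPR region settings
 * into the MMU) is an assumption; the code only shows the register
 * handshake.
 */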