/*
 * Copyright (c) 2008-2013 Travis Geiselbrecht
 * Copyright (c) 2013-2014, NVIDIA CORPORATION. All rights reserved
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files
 * (the "Software"), to deal in the Software without restriction,
 * including without limitation the rights to use, copy, modify, merge,
 * publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so,
 * subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
#include <asm.h>
#include <arch/ops.h>
#include <arch/defines.h>

.text

#if ARM_WITH_CACHE

/* low level cache routines for various cpu families */

#if ARM_CPU_ARM1136 || ARM_CPU_ARM926

/*
 * void arch_disable_cache(uint flags)
 *
 * Disable the caches selected by flags (DCACHE and/or ICACHE) on
 * ARM1136 / ARM926. Interrupts are masked for the duration and the
 * caller's cpsr is restored before returning.
 *
 * In:      r0  = flags
 * Scratch: r1  = control register value
 *          r2  = temporary cpsr value (pre-ARMv6 path only)
 *          r3  = caller's cpsr, must stay unmodified until the final msr
 *          r12 = constant zero for the cache operations
 */
FUNCTION(arch_disable_cache)
	mov		r12, #0			@ zero register
	mrs		r3, cpsr		@ save the old interrupt state
#if ARM_ISA_ARMv6
	cpsid 	iaf				@ interrupts disabled
#else
	orr		r2, r3, #(1<<7)		@ set the I bit in a scratch copy, leaving
	msr		cpsr, r2		@ the saved state in r3 intact for the restore
#endif

.Ldcache_disable:
	tst		r0, #DCACHE
	beq		.Licache_disable
	mrc		p15, 0, r1, c1, c0, 0	@ cr1
	tst		r1, #(1<<2)		@ is the dcache already disabled?
	beq		.Licache_disable

	bic		r1, #(1<<2)
	mcr		p15, 0, r1, c1, c0, 0	@ disable dcache

#if ARM_CPU_ARM1136
	mcr		p15, 0, r12, c7, c14, 0	@ clean & invalidate dcache
#elif ARM_CPU_ARM926
0:
	mrc		p15, 0, r15, c7, c14, 3	@ clean & invalidate dcache
	bne		0b			@ repeat while the operation leaves Z clear
#else
#error whut?
#endif
	mcr		p15, 0, r12, c7, c10, 4	@ data sync barrier (formerly drain write buffer)

.Licache_disable:
	tst		r0, #ICACHE
	beq		.Ldone_disable

	mrc		p15, 0, r1, c1, c0, 0	@ cr1
	bic		r1, #(1<<12)
	mcr		p15, 0, r1, c1, c0, 0	@ disable icache

	mcr		p15, 0, r12, c7, c5, 0	@ invalidate icache

.Ldone_disable:
	msr		cpsr, r3		@ restore the interrupt state saved on entry
	bx		lr

/*
 * void arch_enable_cache(uint flags)
 *
 * Enable the caches selected by flags (DCACHE and/or ICACHE) on
 * ARM1136 / ARM926. Each cache is invalidated before it is switched on.
 * Interrupts are masked for the duration and the caller's cpsr is
 * restored before returning.
 *
 * In:      r0  = flags
 * Scratch: r1  = control register value
 *          r2  = temporary cpsr value (pre-ARMv6 path only)
 *          r3  = caller's cpsr, must stay unmodified until the final msr
 *          r12 = constant zero for the cache operations
 */
FUNCTION(arch_enable_cache)
	mov		r12, #0			@ zero register
	mrs		r3, cpsr		@ save the old interrupt state
#if ARM_ISA_ARMv6
	cpsid iaf				@ interrupts disabled
#else
	orr		r2, r3, #(1<<7)		@ set the I bit in a scratch copy, leaving
	msr		cpsr, r2		@ the saved state in r3 intact for the restore
#endif

.Ldcache_enable:
	tst		r0, #DCACHE
	beq		.Licache_enable
	mrc		p15, 0, r1, c1, c0, 0		@ cr1
	tst		r1, #(1<<2)		@ is the dcache already enabled?
	bne		.Licache_enable

	mcr		p15, 0, r12, c7, c6, 0		@ invalidate dcache

	orr		r1, #(1<<2)
	mcr		p15, 0, r1, c1, c0, 0		@ enable dcache

.Licache_enable:
	tst		r0, #ICACHE
	beq		.Ldone_enable

	mcr		p15, 0, r12, c7, c5, 0		@ invalidate icache

	mrc		p15, 0, r1, c1, c0, 0		@ cr1
	orr		r1, #(1<<12)
	mcr		p15, 0, r1, c1, c0, 0		@ enable icache

.Ldone_enable:
	msr		cpsr, r3		@ restore the interrupt state saved on entry
	bx		lr

#elif ARM_CPU_CORTEX_A9 || ARM_CPU_CORTEX_A15

/*
 * void arch_disable_cache(uint flags)
 *
 * Disable the caches selected by flags (DCACHE and/or ICACHE) on
 * Cortex-A9 / Cortex-A15. Runs with cpsid iaf in effect and restores the
 * caller's cpsr before returning.
 *
 * In:  r0 = flags
 * Register use:
 *   r8  = flags. invalidate_cache_v7 uses r7 as its set index counter and
 *         returns with it modified, so the flags must not live in r7;
 *         neither cache walker writes r8.
 *   r12 = caller's cpsr (not written by either cache walker)
 *   r4-r11 are saved on entry and restored on exit.
 */
FUNCTION(arch_disable_cache)
	stmfd	sp!, {r4-r11, lr}
	mov	r8, r0				@ save flags where the cache walkers leave them alone
	mrs	r12, cpsr			@ save the old interrupt state
	cpsid	iaf				@ interrupts disabled

.Ldcache_disable:
	tst	r8, #DCACHE
	beq	.Licache_disable
	mrc	p15, 0, r0, c1, c0, 0		@ system control register
	tst	r0, #(1<<2)			@ is the dcache already disabled?
	beq	.Ldcache_already_disabled

	bic	r0, #(1<<2)
	mcr	p15, 0, r0, c1, c0, 0		@ disable dcache

#if ARM_CPU_CORTEX_A9
	@ flush and invalidate the dcache
	@ NOTE: returns with r10 cleared; r8 (flags) and r12 (cpsr) are untouched
	bl	flush_invalidate_cache_v7
#endif

	b	.Ldcache_disable_L2

.Ldcache_already_disabled:
#if ARM_CPU_CORTEX_A9
	@ make sure all of the caches are invalidated
	@ NOTE: trashes r0-r7 and r9-r11, nothing live may be held in those
	bl	invalidate_cache_v7
#endif
.Ldcache_disable_L2:

#if ARM_WITH_L2
	mrc	p15, 0, r0, c1, c0, 1		@ auxiliary control register
	bic	r0, #(1<<1)
	mcr	p15, 0, r0, c1, c0, 1		@ disable L2 dcache
#endif

.Licache_disable:
	tst	r8, #ICACHE
	beq	.Ldone_disable

	mrc	p15, 0, r0, c1, c0, 0		@ system control register
	bic	r0, #(1<<12)
	mcr	p15, 0, r0, c1, c0, 0		@ disable icache

.Ldone_disable:
#if ARM_CPU_CORTEX_A9
	@ done on every exit path, whether or not ICACHE was requested
	mov	r0, #0
	mcr	p15, 0, r0, c7, c5, 0		@ invalidate icache to PoU
#endif
	msr	cpsr, r12			@ restore the interrupt state saved on entry
	ldmfd	sp!, {r4-r11, pc}

/*
 * void arch_enable_cache(uint flags)
 *
 * Enable the caches selected by flags (DCACHE and/or ICACHE) on
 * Cortex-A9 / Cortex-A15. Runs with cpsid iaf in effect and restores the
 * caller's cpsr before returning.
 *
 * In:  r0 = flags
 * Register use:
 *   r8  = flags. invalidate_cache_v7 uses r7 as its set index counter and
 *         returns with it modified, so the flags must not live in r7;
 *         the cache walker does not write r8.
 *   r12 = caller's cpsr (not written by the cache walker)
 *   r4-r11 are saved on entry and restored on exit.
 */
FUNCTION(arch_enable_cache)
	stmfd	sp!, {r4-r11, lr}
	mov	r8, r0				@ save flags where the cache walker leaves them alone
	mrs	r12, cpsr			@ save the old interrupt state
	cpsid	iaf				@ interrupts disabled

.Ldcache_enable:
	tst	r8, #DCACHE
	beq	.Licache_enable
	mrc	p15, 0, r0, c1, c0, 0		@ system control register
	tst	r0, #(1<<2)			@ is the dcache already enabled?
	bne	.Licache_enable

	@ invalidate L1 and L2
	@ NOTE: trashes r0-r7 and r9-r11, nothing live may be held in those
	@ NOTE(review): the invalidate is only built when ARM_WITH_L2 is set;
	@ confirm that builds without L2 do not need it before enabling
#if ARM_WITH_L2
	bl	invalidate_cache_v7

	@ enable the L2, if present
	mrc	p15, 0, r0, c1, c0, 1		@ auxiliary control register
	orr	r0, #(1<<1)
	mcr	p15, 0, r0, c1, c0, 1		@ enable L2 dcache
#endif

	dmb
	mrc	p15, 0, r0, c1, c0, 0		@ system control register
	orr	r0, #(1<<2)
	dsb
	mcr	p15, 0, r0, c1, c0, 0		@ enable dcache

.Licache_enable:
	tst	r8, #ICACHE
	beq	.Ldone_enable

#if ARM_CPU_CORTEX_A9
	mov	r0, #0
	mcr	p15, 0, r0, c7, c5, 0		@ invalidate icache to PoU
#endif
	mrc	p15, 0, r0, c1, c0, 0		@ system control register
	orr	r0, #(1<<12)
	mcr	p15, 0, r0, c1, c0, 0		@ enable icache

.Ldone_enable:
	msr	cpsr, r12			@ restore the interrupt state saved on entry
	ldmfd	sp!, {r4-r11, pc}

@ flush & invalidate cache routine: clean+invalidate the data cache(s),
@ then invalidate I+BTB. Only r10 and the condition flags change.
flush_invalidate_cache_v7:
/*
 * Clean and flush the cache to maintain consistency.
 * From ARMv7 manual, B2-17
 *
 * Two paths, selected by ID_MMFR1 bits [19:16]:
 *  - field non-zero: one "clean+invalidate entire D cache" operation
 *  - field zero:     walk every cache level below the LoC read from CLIDR
 *                    and clean+invalidate each line by set/way
 *
 * On exit,
 *  r10 is 0 and the condition flags are modified.
 *  The set/way path saves r0-r7 and r9-r11 on the stack and restores
 *  them, so it needs a usable stack.
 * Never written by this routine:
 *  r8, r12, lr
 */
	mrc	p15, 0, r10, c0, c1, 5	@ read ID_MMFR1
	tst	r10, #0xf << 16		@ hierarchical cache (ARMv7)
	mov	r10, #0			@ mov leaves the flags from tst intact
	beq	hierarchical
	mcr	p15, 0, r10, c7, c14, 0	@ clean+invalidate D
	b	iflush
hierarchical:
	mcr	p15, 0, r10, c7, c10, 5	@ DMB
	stmfd	sp!, {r0-r7, r9-r11}	@ preserve everything the walk uses
	mrc	p15, 1, r0, c0, c0, 1	@ read clidr
	ands	r3, r0, #0x7000000	@ extract loc from clidr
	mov	r3, r3, lsr #23		@ r3 = loc * 2, same scale as the level in r10
	beq	finished		@ if loc is 0, then no need to clean
	mov	r10, #0			@ start clean at cache level 0
loop1:
	add	r2, r10, r10, lsr #1	@ work out 3x current cache level
	mov	r1, r0, lsr r2		@ extract cache type bits from clidr
	and	r1, r1, #7		@ mask of the bits for current cache only
	cmp	r1, #2			@ see what cache we have at this level
	blt	skip			@ skip if no cache, or just i-cache
	mcr	p15, 2, r10, c0, c0, 0	@ select current cache level in cssr
	mcr	p15, 0, r10, c7, c5, 4	@ isb to sync the new cssr&csidr
	mrc	p15, 1, r1, c0, c0, 0	@ read the new csidr
	and	r2, r1, #7		@ extract the length of the cache lines
	add	r2, r2, #4		@ add 4 (line length offset)
	ldr	r4, =0x3ff
	ands	r4, r4, r1, lsr #3	@ find maximum number on the way size
	clz	r5, r4			@ find bit position of way size increment
	ldr	r7, =0x7fff
	ands	r7, r7, r1, lsr #13	@ extract max number of the index size
loop2:
	mov	r9, r4			@ create working copy of max way size
loop3:
	orr	r11, r10, r9, lsl r5	@ factor way and cache number into r11
	orr	r11, r11, r7, lsl r2	@ factor index number into r11
	mcr	p15, 0, r11, c7, c14, 2	@ clean & invalidate by set/way
	subs	r9, r9, #1		@ decrement the way
	bge	loop3
	subs	r7, r7, #1		@ decrement the index
	bge	loop2
skip:
	add	r10, r10, #2		@ increment cache number
	cmp	r3, r10
	bgt	loop1
finished:
	ldmfd	sp!, {r0-r7, r9-r11}	@ restore the registers saved above
	mov	r10, #0			@ switch back to cache level 0
	mcr	p15, 2, r10, c0, c0, 0	@ select current cache level in cssr
iflush:
	mcr	p15, 0, r10, c7, c10, 4	@ DSB
	mcr	p15, 0, r10, c7, c5, 0	@ invalidate I+BTB
	mcr	p15, 0, r10, c7, c10, 4	@ DSB
	mcr	p15, 0, r10, c7, c5, 4	@ ISB
	mov	pc, lr

/*
 *	invalidate_cache_v7
 *
 *	Walk every cache level below the LoC read from CLIDR and issue a
 *	set/way maintenance operation on each line of each data or unified
 *	cache, then reselect cache level 0 and issue dsb + isb.
 *
 *	Does not use the stack.
 *
 *	Corrupted registers: r0-r7, r9-r11, condition flags
 *	  r7 is the set index counter and is not restored, so a caller
 *	  must not keep a live value in it across this call.
 *	Never written: r8, r12, lr
 *
 *	NOTE(review): the set/way operation used here is c7, c14, 2, the same
 *	encoding flush_invalidate_cache_v7 labels "clean & invalidate by
 *	set/way", so despite the routine name dirty lines appear to be
 *	written back rather than discarded. Confirm that this is intended.
 */
invalidate_cache_v7:
	dmb					@ ensure ordering with previous memory accesses
	mrc	p15, 1, r0, c0, c0, 1		@ read clidr
	ands	r3, r0, #0x7000000		@ extract loc from clidr
	mov	r3, r3, lsr #23			@ r3 = loc * 2, same scale as the level in r10
	beq	1005f				@ if loc is 0, then no need to clean
	mov	r10, #0				@ start clean at cache level 0
1001:
	add	r2, r10, r10, lsr #1		@ work out 3x current cache level
	mov	r1, r0, lsr r2			@ extract cache type bits from clidr
	and	r1, r1, #7			@ mask of the bits for current cache only
	cmp	r1, #2				@ see what cache we have at this level
	blt	1004f				@ skip if no cache, or just i-cache

	mcr	p15, 2, r10, c0, c0, 0		@ select current cache level in cssr
	isb					@ isb to sync the new cssr&csidr
	mrc	p15, 1, r1, c0, c0, 0		@ read the new csidr

	and	r2, r1, #7			@ extract the length of the cache lines
	add	r2, r2, #4			@ add 4 (line length offset)
	ldr	r4, =0x3ff
	ands	r4, r4, r1, lsr #3		@ find maximum number on the way size
	clz	r5, r4				@ find bit position of way size increment
	ldr	r7, =0x7fff
	ands	r7, r7, r1, lsr #13		@ extract max number of the index size
1002:
	mov	r9, r4				@ create working copy of max way size
1003:
	orr	r11, r10, r9, lsl r5		@ factor way and cache number into r11
	orr	r11, r11, r7, lsl r2		@ factor index number into r11
	mcr	p15, 0, r11, c7, c14, 2		@ clean/flush by set/way
	subs	r9, r9, #1			@ decrement the way
	bge	1003b
	subs	r7, r7, #1			@ decrement the index
	bge	1002b
1004:
	add	r10, r10, #2			@ increment cache number
	cmp	r3, r10
	bgt	1001b
1005:
	mov	r10, #0				@ switch back to cache level 0
	mcr	p15, 2, r10, c0, c0, 0		@ select current cache level in cssr
	dsb
	isb
	mov	pc, lr

#else
#error unhandled cpu
#endif

#if ARM_CPU_ARM926 || ARM_CPU_ARM1136 || ARM_CPU_CORTEX_A8 || ARM_CPU_CORTEX_A9 || ARM_CPU_CORTEX_A15
/* shared cache flush routines */

	/*
	 * void arch_clean_cache_range(addr_t start, size_t len);
	 *
	 * Clean every data cache line overlapping [start, start + len) to the
	 * point of coherency by MVA, then issue a data synchronization barrier.
	 * A zero length request returns without touching the cache.
	 * Assumes start + len does not wrap around the address space.
	 *
	 * In:       r0 = start, r1 = len
	 * Clobbers: r0, r2, condition flags
	 */
FUNCTION(arch_clean_cache_range)
#if ARM_WITH_CP15
	cmp		r1, #0						// empty range: nothing to maintain
	bxeq	lr
	add		r2, r0, r1					// calculate the end address
	bic		r0, #(CACHE_LINE-1)			// align the start with a cache line
0:
	mcr		p15, 0, r0, c7, c10, 1		// clean cache to PoC by MVA
	add		r0, r0, #CACHE_LINE
	cmp		r0, r2
	blo		0b

	/* wait for the maintenance above to complete */
#if ARM_CPU_CORTEX_A8 || ARM_CPU_CORTEX_A9 || ARM_CPU_CORTEX_A15
	dsb		sy
#else
	/* ARM1136 and ARM926 use the CP15 operation instead of dsb */
	mov		r0, #0
	mcr		p15, 0, r0, c7, c10, 4		// data sync barrier (drain write buffer)
#endif
#endif
	bx		lr

	/*
	 * void arch_clean_invalidate_cache_range(addr_t start, size_t len);
	 *
	 * Clean and invalidate every data cache line overlapping
	 * [start, start + len) to the point of coherency by MVA, then issue a
	 * data synchronization barrier.
	 * A zero length request returns without touching the cache.
	 * Assumes start + len does not wrap around the address space.
	 *
	 * In:       r0 = start, r1 = len
	 * Clobbers: r0, r2, condition flags
	 */
FUNCTION(arch_clean_invalidate_cache_range)
#if ARM_WITH_CP15
	cmp		r1, #0						// empty range: nothing to maintain
	bxeq	lr
	add		r2, r0, r1					// calculate the end address
	bic		r0, #(CACHE_LINE-1)			// align the start with a cache line
0:
	mcr		p15, 0, r0, c7, c14, 1		// clean & invalidate dcache to PoC by MVA
	add		r0, r0, #CACHE_LINE
	cmp		r0, r2
	blo		0b

	/* wait for the maintenance above to complete */
#if ARM_CPU_CORTEX_A8 || ARM_CPU_CORTEX_A9 || ARM_CPU_CORTEX_A15
	dsb		sy
#else
	/* ARM1136 and ARM926 use the CP15 operation instead of dsb */
	mov		r0, #0
	mcr		p15, 0, r0, c7, c10, 4		// data sync barrier (drain write buffer)
#endif
#endif
	bx		lr

	/*
	 * void arch_invalidate_cache_range(addr_t start, size_t len);
	 *
	 * Invalidate (discard without writing back) every data cache line
	 * overlapping [start, start + len) by MVA, then issue a data
	 * synchronization barrier.
	 *
	 * The start is rounded down to a cache line, so unrelated data sharing
	 * the first or last line is discarded as well. A zero length request
	 * returns without touching the cache, so it cannot throw away a line.
	 * Assumes start + len does not wrap around the address space.
	 *
	 * In:       r0 = start, r1 = len
	 * Clobbers: r0, r2, condition flags
	 */
FUNCTION(arch_invalidate_cache_range)
#if ARM_WITH_CP15
	cmp		r1, #0						// empty range: do not discard any line
	bxeq	lr
	add		r2, r0, r1					// calculate the end address
	bic		r0, #(CACHE_LINE-1)			// align the start with a cache line
0:
	mcr		p15, 0, r0, c7, c6, 1		// invalidate dcache to PoC by MVA
	add		r0, r0, #CACHE_LINE
	cmp		r0, r2
	blo		0b

	/* wait for the maintenance above to complete */
#if ARM_CPU_CORTEX_A8 || ARM_CPU_CORTEX_A9 || ARM_CPU_CORTEX_A15
	dsb		sy
#else
	/* ARM1136 and ARM926 use the CP15 operation instead of dsb */
	mov		r0, #0
	mcr		p15, 0, r0, c7, c10, 4		// data sync barrier (drain write buffer)
#endif
#endif
	bx		lr

	/*
	 * void arch_sync_cache_range(addr_t start, size_t len);
	 *
	 * Make instruction fetch coherent with data written to
	 * [start, start + len): clean the data cache over the range via
	 * arch_clean_cache_range, then invalidate the entire instruction cache.
	 *
	 * In:       r0 = start, r1 = len
	 * Clobbers: r0, r2, condition flags
	 */
FUNCTION(arch_sync_cache_range)
	push    { r4, lr }                  // r4 is padding: keeps sp 8-byte aligned across the call
	bl      arch_clean_cache_range

	mov     r0, #0
	mcr     p15, 0, r0, c7, c5, 0       // invalidate icache to PoU

	/* the invalidate is only guaranteed complete after a barrier */
#if ARM_CPU_CORTEX_A8 || ARM_CPU_CORTEX_A9 || ARM_CPU_CORTEX_A15
	dsb     sy                          // wait for the invalidate to finish
	isb                                 // drop instructions fetched before it
#elif ARM_CPU_ARM1136
	mcr     p15, 0, r0, c7, c10, 4      // data sync barrier
	mcr     p15, 0, r0, c7, c5, 4       // flush prefetch buffer
#endif

	pop     { r4, pc }

#endif // ARM_CPU_...

#else

/* no cache */

/*
 * Cache API stubs for builds without ARM_WITH_CACHE: every entry point
 * returns immediately. The set of symbols matches the one exported by
 * the cached build so that callers link in either configuration.
 */
FUNCTION(arch_disable_cache)
	bx		lr

FUNCTION(arch_enable_cache)
	bx		lr

FUNCTION(arch_clean_cache_range)
	bx		lr

FUNCTION(arch_clean_invalidate_cache_range)
	bx		lr

/* was missing: the cached build exports this entry point as well */
FUNCTION(arch_invalidate_cache_range)
	bx		lr

FUNCTION(arch_sync_cache_range)
	bx		lr

#endif // ARM_WITH_CACHE

