/*
 * Copyright (c) 2008 Travis Geiselbrecht
 * Copyright (c) 2013-2014, NVIDIA CORPORATION. All rights reserved
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files
 * (the "Software"), to deal in the Software without restriction,
 * including without limitation the rights to use, copy, modify, merge,
 * publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so,
 * subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
#if ARM_USE_MMU_RELOC
#include <config.h>
#endif

#if ARM_WITH_LPAE
#include <arch/arm/mmu_ldesc_macros.h>
#else
#include <arch/arm/mmu_sdesc_macros.h>
#endif

/*
 * Image entry point and exception vector table.
 *
 * Eight consecutive branch instructions at the start of the ".text.boot"
 * section.  Slot 0 enters the boot path at `reset` (below); the other
 * slots branch to handlers that are defined outside this file.  The slot
 * order follows the ARM exception vector layout and must not be changed.
 */
.section ".text.boot"
.globl _start
_start:
	b	reset			/* slot 0: reset / boot entry */
	b	arm_undefined		/* slot 1: undefined instruction */
	b	arm_syscall		/* slot 2: supervisor call */
	b	arm_prefetch_abort	/* slot 3: prefetch abort */
	b	arm_data_abort		/* slot 4: data abort */
	b	arm_reserved		/* slot 5: reserved */
	b	arm_irq			/* slot 6: irq */
	b	arm_fiq			/* slot 7: fiq */

/*
 * reset: boot entry, reached through slot 0 of the vector table.
 *
 * Stashes the register state this code was entered with into the
 * PC-relative slots defined further down in this file, before any of
 * those registers get reused:
 *
 *	r14    -> __jumpback_addr	(not with WITH_MONITOR_BIN)
 *	r4-r12 -> __save_boot_regs	(not with WITH_MONITOR_BIN)
 *	cpsr   -> __save_boot_cpsr	(not with WITH_MONITOR_BIN)
 *	r1     -> __bootarg_addr
 *
 * All slots are addressed with `adr` (PC-relative), so the stores land
 * next to wherever the image is currently executing.
 *
 * Clobbers r3, and r4 when the cpsr is saved.  r0 and r1 are left
 * untouched for the code that follows.
 */
reset:
#if !defined(WITH_MONITOR_BIN)
	/* remember the link register we were entered with */
	adr	r3, __jumpback_addr
	str	r14, [r3]

	/* save boot params passed by the bootloader */
	adr	r3, __save_boot_regs
	stmia	r3, {r4-r12}

	/* r4 has been saved above, so it can serve as scratch for the cpsr */
	adr	r3, __save_boot_cpsr
	mrs	r4, cpsr
	str	r4, [r3]
#endif

	/* record r1 as the boot argument address */
	adr	r3, __bootarg_addr
	str	r1, [r3]

#if ARM_USE_MMU_RELOC
	/*
	 * Record where the image is running and how much memory it was
	 * given, while still executing at the load address.
	 *
	 * On entry:	r0 = size value (stored to __load_phys_size)
	 * On exit:	r4 = copy of r0 (phys size)
	 *		r5 = r1 = address _start is running at (phys base)
	 *		r6 = link-time _start - phys base (phys offset)
	 * Clobbers r3.
	 */

	/* save load address/size of image */
	adr	r3, __load_phys_size
	str	r0, [r3]

	/*
	 * Derive the phys_base addr.  In ARM state a read of pc returns the
	 * address of the current instruction + 8, which is .Laddr here, so
	 * subtracting the assemble-time distance (.Laddr - _start) yields
	 * the address _start is actually executing from.  The mov/sub pair
	 * must stay immediately in front of .Laddr.
	 */
	mov	r1, pc
	sub	r1, r1, #(.Laddr - _start)
.Laddr:
	adr	r3, __load_phys_base
	str	r1, [r3]

	/* save phys offset (for v -> p conversions) */
	adr	r3, __load_phys_offset
	ldr	r4, =_start		// link-time address of _start
	sub	r6, r4, r1		// r6 = link-time - running address
	str	r6, [r3]

	/*
	 * ensure phys size doesn't exceed virt size: unsigned compare,
	 * and spin here forever (branch-to-self) if it does
	 */
	ldr	r4, =VMEMSIZE
	cmp	r0, r4
	bhi	.

	/* copies of phys size / phys base (r0/r1 get reused below) */
	mov	r4, r0
	mov	r5, r1
#endif

#if ARM_WITH_CP15
	/*
	 * Read-modify-write of the CP15 c1 control register: read it into
	 * r0, clear a set of control bits, and write it back.  Clobbers r0
	 * (the ARM_USE_MMU_RELOC path above keeps its own copy of the
	 * original r0 in r4).
	 */
	mrc	p15, 0, r0, c1, c0, 0
	/* XXX this is currently for arm926, revisit with armv6 cores */
	/* new thumb behavior, low exception vectors, i/d cache disable, mmu disabled */
	bic	r0, r0, #(1<<15 | 1<<13 | 1<<12)
	bic	r0, r0, #(1<<2 | 1<<0)
	/*
	 * NOTE(review): bits 29 and 28 are not described by the comment
	 * above; presumably the ARMv7 AFE/TRE controls -- confirm against
	 * the architecture manual for the targeted cores.
	 */
	bic	r0, r0, #(1<<29 | 1<<28)
	/* disable alignment faults */
	/* user mode app/libs may have unaligned references */
	bic     r0, r0, #(1<<1)
	mcr	p15, 0, r0, c1, c0, 0
#endif

#if ARM_USE_MMU_RELOC
	/*
	 * Create page table(s), on entry:
	 *	r4 = phys size
	 *	r5 = phys base
	 *	r6 = phys offset
	 *
	 * The MMU is still off at this point, so every table pointer is
	 * converted to the address it is currently loaded at by subtracting
	 * r6 from its link-time address.  The mmu_desc_* helpers and mov32
	 * are not defined in this file; presumably they are macros from the
	 * mmu_ldesc_macros.h / mmu_sdesc_macros.h header included at the top
	 * -- check there for the registers each one clobbers.
	 */
	ldr	r0, =tt
	sub	r0, r0, r6              // second level phys
	mmu_desc_create_null_entry r0, r1
	mmu_desc_create_ident_mapping r0, r1, r2, r3

#if defined(WITH_MONITOR_BIN)
	/*
	 * With a separate monitor binary, phys base with this VMEMBASE isn't
	 * guaranteed to be L2 block aligned (i.e. carveout size is reduced by
	 * the size of the monitor), so needs to be mapped with L3 mappings.
	 *
	 * Because the carveout size is arbitrary, we can't preallocate the
	 * number of L3 page tables needed, so it's taken dynamically from the
	 * carveout, reducing what's available for the kernel's heap.
	 */
	mov	r2, r4			// carveout in bytes
	mov32	r0, ((1 << MMU_L3_MAP_SHIFT) - 1)
	add	r2, r2, r0
	lsr	r2, r2, #MMU_L3_MAP_SHIFT	// roundup to L3 map size
	lsl	r2, r2, #MMU_L3_SIZE_SHIFT	// bytes of L3 tables

	/* update __early_heap_allocs for page tables */
	adr	r0, __early_heap_allocs
	str	r2, [r0]
	add	r3, r5, r4		// pt ptr = phys_base + phys_size
	sub	r3, r3, r2		// pt ptr -= L3 pagetable size

	/* map VMEMBASE -> phys carveout, L3 tables placed at r3 */
	ldr	r0, =tt
	mov	r2, r4			// carveout in bytes
	sub	r8, r0, r6		// second level phys
	mov32	r0, VMEMBASE
	mov	r1, r5
	mov	r11, #0
	mmu_desc_map_phys_l3 r0, r1, r2, r3, r8, r9, r10, r11
#else
	/*
	 * Should be L2 block aligned as there was no preceding memory
	 * usage, like in the case of a separate monitor binary.
	 */
	mmu_desc_phy_align r5, r4, r9

	/* map VMEMBASE -> phys carveout */
	mov	r2, r4			// carveout in bytes
	ldr	r3, =tt
	sub	r3, r3, r6		// second level phys
	mov32	r0, VMEMBASE		// virt
	mov	r1, r5			// phys
	mmu_desc_map_phys_l2 r0, r1, r2, r3, r9

	/* map alias -> phys carveout (virt == phys, i.e. a 1:1 mapping) */
	mov	r2, r4			// carveout in bytes
	ldr	r3, =tt
	sub	r3, r3, r6		// second level phys
	mov	r0, r5			// virt
	mov	r1, r5			// phys
	mmu_desc_map_phys_l2 r0, r1, r2, r3, r9
#endif

	/* load phys pointers to first/second level tables */
	ldr	r3, =tt_level1
	sub	r8, r3, r6		// first level phys
	ldr	r3, =tt
	sub	r9, r3, r6		// second level phys

#if ARM_WITH_LPAE
	/* setup L1 entries */
	mov	r0, r9
	mov	r1, r8
	mov	r2, #0
	mmu_desc_map_phys_l1 r0, r1, r2
#endif
	/*
	 * invalidate tlb
	 *
	 * NOTE(review): r2 is only explicitly zeroed on the ARM_WITH_LPAE
	 * path above; otherwise it holds whatever the preceding mapping
	 * macro left in it.  Confirm the value written by this operation
	 * is ignored on the targeted cores.
	 */
	mcr	p15, 0, r2, c8, c7, 0
	isb

	/* enable MMU */
	mov	r0, r8
	mov	r1, r9
	mmu_desc_init_mmu r0, r1, r2
	isb

	/*
	 * =go_virtual is the link-time address of the label, so this jump
	 * moves execution from the load address to the link address.
	 */
	ldr	r0, =go_virtual
	mov	pc, r0
go_virtual:
#endif

#if WITH_CPU_EARLY_INIT
	/*
	 * call platform/arch/etc specific init code
	 *
	 * Note: this `bl` overwrites r14, and it runs before this file has
	 * set up any stack pointer (that happens at .Lstack_setup below).
	 * NOTE(review): the code below still relies on r6 afterwards --
	 * confirm __cpu_early_init preserves it.
	 */
	bl __cpu_early_init
#endif

	/*
	 * see if we need to relocate
	 *
	 * r0 = value stored in __load_phys_base (where we were loaded)
	 * r1 = link-time address of _start minus r6
	 *
	 * NOTE(review): both __load_phys_base and r6 are only defined/set
	 * under ARM_USE_MMU_RELOC in this file, so this sequence appears to
	 * depend on that option.  With it enabled, r6 was computed as
	 * (_start - phys base), which makes r1 equal r0 and the branch to
	 * .Lstack_setup always taken, provided r6 is still intact.
	 */
	ldr	r0, __load_phys_base
	ldr	r1, =_start
	sub	r1, r1, r6		// compare is in phys addrs
	cmp	r0, r1
	beq	.Lstack_setup

	/*
	 * we need to relocate ourselves to the proper spot
	 *
	 * Word-by-word copy from r0 (source) to r1 (destination) until the
	 * destination pointer equals __data_end.  The loop exits only on
	 * exact equality (bne).
	 * NOTE(review): r1 has had r6 subtracted while r2 has not -- verify
	 * the end condition if this path is ever expected to run.
	 */
	ldr	r2, =__data_end

.Lrelocate_loop:
	ldr	r3, [r0], #4
	str	r3, [r1], #4
	cmp	r1, r2
	bne	.Lrelocate_loop

	/* we're relocated, jump to the right address */
	ldr	r0, =.Lstack_setup
	bx	r0

/* flush the literal pool here, where execution never falls through */
.ltorg

.Lstack_setup:
	/*
	 * Walk through the processor modes and give each one a stack
	 * pointer: irq, fiq, abort, undefined, system/user and finally
	 * supervisor, which is the mode execution continues in.
	 *
	 *	r1 = abort_stack_top, shared by every mode except irq
	 *	r3 = current cpsr with the mode field (low 5 bits) cleared
	 *	r0 = scratch: r3 with the target mode number or-ed in
	 *
	 * Only the control byte of the cpsr is written (cpsr_c), so the
	 * remaining cpsr fields stay as they were on entry.
	 */
	ldr	r1, =abort_stack_top
	mrs	r3, cpsr
	bic	r3, r3, #0x1f

	/* irq: sp is not a real stack here but a pointer to a temporary
	 * dumping spot used during irq delivery */
	orr	r0, r3, #0x12
	msr	cpsr_c, r0
	ldr	sp, =irq_save_spot

	/* fiq */
	orr	r0, r3, #0x11
	msr	cpsr_c, r0
	mov	sp, r1

	/* abort */
	orr	r0, r3, #0x17
	msr	cpsr_c, r0
	mov	sp, r1

	/* undefined */
	orr	r0, r3, #0x1b
	msr	cpsr_c, r0
	mov	sp, r1

	/* system (shares its sp with user mode) */
	orr	r0, r3, #0x1f
	msr	cpsr_c, r0
	mov	sp, r1

	/* supervisor: must come last, we keep running in this mode */
	orr	r0, r3, #0x13
	msr	cpsr_c, r0
	mov	sp, r1

	/*
	 * copy the initialized data segment out of rom if necessary
	 *
	 *	r0 = __data_start_rom	(source, load image of .data)
	 *	r1 = __data_start	(destination, post-incremented)
	 *	r2 = __data_end		(end of destination, exclusive)
	 *	r3 = scratch word
	 *
	 * Nothing is copied when source and destination are the same
	 * address.  The bounds check uses the unsigned "lo" condition:
	 * these are addresses, and a signed compare would end the loop
	 * immediately (or never) if the range crossed 0x80000000.
	 */
	ldr	r0, =__data_start_rom
	ldr	r1, =__data_start
	ldr	r2, =__data_end

	cmp	r0, r1
	beq	.L__do_bss

.L__copy_loop:
	cmp	r1, r2
	ldrlo	r3, [r0], #4
	strlo	r3, [r1], #4
	blo	.L__copy_loop

.L__do_bss:
	/*
	 * clear out the bss
	 *
	 *	r0 = __bss_start	(cursor, post-incremented)
	 *	r1 = _end		(end of the range, exclusive)
	 *	r2 = 0			(fill value)
	 *
	 * The bounds check uses the unsigned "lo" condition because the
	 * operands are addresses; a signed compare misbehaves when the
	 * range crosses 0x80000000.
	 */
	ldr	r0, =__bss_start
	ldr	r1, =_end
	mov	r2, #0
.L__bss_loop:
	cmp	r0, r1
	strlo	r2, [r0], #4
	blo	.L__bss_loop

	/* hand over to kmain; spin forever should it ever return */
	bl	kmain
	b	.

	/*
	 * Boot-time save slots.  They are emitted right behind the boot
	 * code so that the `adr` instructions in `reset` can reach them
	 * PC-relatively.
	 *
	 * __save_boot_regs: nine words, filled by `stmia r3, {r4-r12}`.
	 */
	.type	__save_boot_regs, %object
.global __save_boot_regs
__save_boot_regs:
	.fill	9, 4, 0
	.size	__save_boot_regs, . - __save_boot_regs

/* cpsr value captured in `reset` */
.global __save_boot_cpsr
__save_boot_cpsr:
	.word	0

/* literal pool for the `ldr rX, =sym` loads in the code above */
.ltorg

#if ARM_USE_MMU_RELOC
.align 2
/*
 * Where the image was loaded and how much contiguous memory it was
 * given, as recorded by the boot code:
 *	__load_phys_base	address _start was running at
 *	__load_phys_size	size value passed in r0
 *	__load_phys_offset	link-time _start minus __load_phys_base
 *	__early_heap_allocs	bytes taken for L3 page tables
 *				(only written with WITH_MONITOR_BIN)
 */
.global __load_phys_base
__load_phys_base:
	.word	0
.global __load_phys_size
__load_phys_size:
	.word	0
.global __load_phys_offset
__load_phys_offset:
	.word	0
.global __early_heap_allocs
__early_heap_allocs:
	.word	0
#endif

/* r14 on entry to `reset` (only written without WITH_MONITOR_BIN) */
.global __jumpback_addr
__jumpback_addr:
	.word	0
/* r1 on entry to `reset` */
.global __bootarg_addr
__bootarg_addr:
	.word	0

.bss
.align 3
	/* the abort stack is for unrecoverable errors.
	 * also note the initial working stack is set to here.
	 * when the threading system starts up it'll switch to a new
	 * dynamically allocated stack, so we don't need it for very long
	 *
	 * the stack is 8-byte aligned (and its size is a multiple of 8) so
	 * that abort_stack_top, which becomes the sp kmain is called with,
	 * meets the 8-byte stack alignment the ARM procedure call standard
	 * requires at public interfaces.
	 */
abort_stack:
	.skip 1024
.global abort_stack_top
abort_stack_top:

/*
 * NOTE(review): the next line ends in a colon, so the assembler takes it
 * as the definition of a label named ".rodata", not as a directive that
 * switches sections.  _heap_end is therefore emitted into whatever section
 * is current here (".bss" is the last section selected in this file), which
 * does not match the "read-only data" wording below.  If it does end up
 * between __bss_start and _end, the bss clear loop in this file would zero
 * it at boot.  Confirm how _heap_end is consumed (and whether anything
 * writes it at run time) before turning this into a real section switch.
 */
.rodata:
.align 2

/* define the heap end as read-only data containing the end defined in the
 * linker script. other archs that use dynamic memory length discovery can make
 * this read-write and update it during init.
 */
.global _heap_end
_heap_end:
	.int _end_of_ram
