_TEXT	segment para 'CODE'
_TEXT	ends
_DATA	segment para 'DATA'
_DATA	ends

	_TEXT segment

	public	init_profiler
	public	profile_r
	public profile_l
	public profile_l2
	public profile_n
	public profile_n2
	public profile_s
	public profile_s2
	public profile_t
	public	write_profile_information
	public	write_profile_stack
	public	stack_trace_depth

 ifndef LINUX
	extrn	allocate_memory:near
 endif
	extrn	__STRING__:near
	extrn	openF:near
	extrn	closeF:near
	extrn	writeFC:near
	extrn	writeFI:near
	extrn	ab_stack_size:near
	extrn	ew_print_string:near
	extrn	ew_print_char:near
	extrn	ew_print_text:near
	extrn	create_profile_file_name:near
;	extrn	print_error:near
;	extrn	profile_stack_pointer:near

next		= 0
time		= 8
n_profiler_calls = 16
n_strict_calls	= 24
n_lazy_calls	= 32
n_curried_calls	= 40
n_words_allocated = 48
name_		= 56
FunctionProfile	= 64

profile_t:
	push	rax
	push	rdx
	rdtsc

	push	rcx
	mov	rcx,qword ptr profile_stack_pointer

	sub	edx,dword ptr global_time_hi
	push	rbx
	mov	ebx,dword ptr global_time_lo
	mov	eax,eax

	shl	rdx,32
	sub	rax,rbx
	mov	rbx,qword ptr (-8)[rcx]

	add	rax,rdx
	
	sub	rcx,8
	mov	qword ptr global_last_tail_call,rbx

	mov	qword ptr profile_stack_pointer,rcx

	inc	qword ptr n_profiler_calls[rbx]
	add	qword ptr time[rbx],rax

	mov	rax,qword ptr global_n_words_free
	mov	qword ptr global_n_words_free,r15
	sub	rax,r15
	add	qword ptr n_words_allocated[rbx],rax

	pop	rbx
	pop	rcx

	rdtsc
	mov	dword ptr global_time_hi,edx
	pop	rdx
	mov	dword ptr global_time_lo,eax
	pop	rax
	ret

profile_r:
	push	rax
	push	rdx
	rdtsc

	push	rcx
	mov	rcx,qword ptr profile_stack_pointer

	sub	edx,dword ptr global_time_hi
	push	rbx
	mov	ebx,dword ptr global_time_lo
	mov	eax,eax

	shl	rdx,32
	sub	rax,rbx
	mov	rbx,qword ptr (-8)[rcx]

	add	rax,rdx

	sub	rcx,8
	mov	qword ptr global_last_tail_call,0

	mov	qword ptr profile_stack_pointer,rcx

	inc	qword ptr n_profiler_calls[rbx]
	add	qword ptr time[rbx],rax

	mov	rax,qword ptr global_n_words_free
	mov	qword ptr global_n_words_free,r15
	sub	rax,r15
	add	qword ptr n_words_allocated[rbx],rax

	pop	rbx
	pop	rcx

	rdtsc
	mov	dword ptr global_time_hi,edx
	pop	rdx
	mov	dword ptr global_time_lo,eax
	pop	rax
	ret

profile_l:
	push	rax
	push	rdx
	rdtsc

	push	rbx
	mov	rbx,qword ptr [rbp]

	test	rbx,rbx
	je	allocate_function_profile_record_l
allocate_function_profile_record_lr:
	push	rcx
	
	mov	rbp,qword ptr global_last_tail_call
	mov	rcx,qword ptr profile_stack_pointer

	test	rbp,rbp
	jne	use_tail_calling_function_l

	mov	rbp,qword ptr (-8)[rcx]
use_tail_calling_function_lr:

	mov	qword ptr [rcx],rbx
	add	rcx,8
	
	inc	qword ptr n_curried_calls[rbx]
	jmp	profile_n_

allocate_function_profile_record_l:
	call	allocate_function_profile_record
	jmp	allocate_function_profile_record_lr

use_tail_calling_function_l:
	mov	qword ptr global_last_tail_call,0
	jmp	use_tail_calling_function_lr

profile_l2:
	push	rax
	push	rdx
	rdtsc

	push	rbx
	mov	rbx,qword ptr [rbp]

	test	rbx,rbx
	je	allocate_function_profile_record_l2
allocate_function_profile_record_l2r:
	push	rcx
	
	mov	rbp,qword ptr global_last_tail_call
	mov	rcx,qword ptr profile_stack_pointer
	
	test	rbp,rbp
	jne	use_tail_calling_function_l2

	mov	rbp,qword ptr (-8)[rcx]
use_tail_calling_function_l2r:

	mov	qword ptr [rcx],rbx
	mov	qword ptr 8[rcx],rbx
	add	rcx,16

	inc	qword ptr n_curried_calls[rbx]
	jmp	profile_n_

allocate_function_profile_record_l2:
	call	allocate_function_profile_record
	jmp	allocate_function_profile_record_l2r

use_tail_calling_function_l2:
	mov	qword ptr global_last_tail_call,0
	jmp	use_tail_calling_function_l2r

profile_n:
	push	rax
	push	rdx
	rdtsc
		
	push	rbx
	mov	rbx,qword ptr [rbp]
	
	test	rbx,rbx
	je	allocate_function_profile_record_n
allocate_function_profile_record_nr:
	push	rcx
	
	mov	rbp,qword ptr global_last_tail_call
	mov	rcx,qword ptr profile_stack_pointer

	test	rbp,rbp
	jne	use_tail_calling_function_n

	mov	rbp,qword ptr (-8)[rcx]
use_tail_calling_function_nr:

	mov	qword ptr [rcx],rbx
	add	rcx,8

	inc	qword ptr n_lazy_calls[rbx]
	jmp	profile_n_

allocate_function_profile_record_n:
	call	allocate_function_profile_record
	jmp	allocate_function_profile_record_nr

use_tail_calling_function_n:
	mov	qword ptr global_last_tail_call,0
	jmp	use_tail_calling_function_nr

profile_n2:
	push	rax
	push	rdx
	rdtsc

	push	rbx
	mov	rbx,qword ptr [rbp]

	test	rbx,rbx
	je	allocate_function_profile_record_n2
allocate_function_profile_record_n2r:
	push	rcx
	
	mov	rbp,qword ptr global_last_tail_call
	mov	rcx,qword ptr profile_stack_pointer

	test	rbp,rbp
	jne	use_tail_calling_function_n2

	mov	rbp,qword ptr (-8)[rcx]
use_tail_calling_function_n2r:

	mov	qword ptr [rcx],rbx
	mov	qword ptr 8[rcx],rbx
	add	rcx,16

	inc	qword ptr n_lazy_calls[rbx]
	jmp	profile_n_

allocate_function_profile_record_n2:
	call	allocate_function_profile_record
	jmp	allocate_function_profile_record_n2r

use_tail_calling_function_n2:
	mov	qword ptr global_last_tail_call,0
	jmp	use_tail_calling_function_n2r

profile_s2:
	push	rax
	push	rdx
	rdtsc
		
	push	rbx
	mov	rbx,qword ptr [rbp]
	
	test	rbx,rbx
	je	allocate_function_profile_record_s2
allocate_function_profile_record_s2r:
	push	rcx

	mov	rbp,qword ptr global_last_tail_call
	mov	rcx,qword ptr profile_stack_pointer
	
	test	rbp,rbp
	jne	use_tail_calling_function_s2

	mov	rbp,qword ptr (-8)[rcx]
use_tail_calling_function_s2r:

	mov	qword ptr [rcx],rbx
	mov	qword ptr 8[rcx],rbx
	add	rcx,16
	jmp	profile_s_

allocate_function_profile_record_s2:
	call	allocate_function_profile_record
	jmp	allocate_function_profile_record_s2r

use_tail_calling_function_s2:
	mov	qword ptr global_last_tail_call,0
	jmp	use_tail_calling_function_s2r

profile_s:
	push	rax
	push	rdx
	rdtsc
	
	push	rbx
	mov	rbx,qword ptr [rbp]
	
	test	rbx,rbx
	je	allocate_function_profile_record_s
allocate_function_profile_record_sr:
	push	rcx

	mov	rbp,qword ptr global_last_tail_call
	mov	rcx,qword ptr profile_stack_pointer
	
	test	rbp,rbp
	jne	use_tail_calling_function_s

	mov	rbp,qword ptr (-8)[rcx]
use_tail_calling_function_sr:

	mov	qword ptr [rcx],rbx
	add	rcx,8

profile_s_:
	inc	qword ptr n_strict_calls[rbx]

profile_n_:
	mov	qword ptr profile_stack_pointer,rcx

	sub	edx,dword ptr global_time_hi
	mov	ebx,dword ptr global_time_lo
	mov	eax,eax

	shl	rdx,32
	sub	rax,rbx

	add	rax,rdx

	inc	qword ptr n_profiler_calls[rbp]
	add	qword ptr time[rbp],rax

	mov	rax,qword ptr global_n_words_free
	mov	qword ptr global_n_words_free,r15
	sub	rax,r15
	add	qword ptr n_words_allocated[rbp],rax

	pop	rcx
	pop	rbx

	rdtsc
	mov	dword ptr global_time_hi,edx
	pop	rdx
	mov	dword ptr global_time_lo,eax
	pop	rax
	ret

allocate_function_profile_record_s:
	call	allocate_function_profile_record
	jmp	allocate_function_profile_record_sr

use_tail_calling_function_s:
	mov	qword ptr global_last_tail_call,0
	jmp	use_tail_calling_function_sr


; argument: rbp: function name adress-4
; result:   rbx: function profile record adress

allocate_function_profile_record:
	push	rax
	mov	rax,qword ptr global_n_free_records_in_block
	mov	rbx,qword ptr global_last_allocated_block

	test	rax,rax
	jne	no_alloc

	push	rcx
	push	rdx
	push	rbp

 ifdef LINUX
	sub	rsp,104
	mov	qword ptr [rsp],rsi
	mov	qword ptr 8[rsp],rdi
	mov	qword ptr 16[rsp],r8
	mov	qword ptr 24[rsp],r10
	mov	qword ptr 32[rsp],r11
	movsd	qword ptr 40[rsp],xmm0
	movsd	qword ptr 48[rsp],xmm1
	movsd	qword ptr 56[rsp],xmm2
	movsd	qword ptr 64[rsp],xmm3
	movsd	qword ptr 72[rsp],xmm4
	movsd	qword ptr 80[rsp],xmm5
	movsd	qword ptr 88[rsp],xmm6
	movsd	qword ptr 96[rsp],xmm7
 else
	sub	rsp,72
	mov	qword ptr [rsp],r8
	mov	qword ptr 8[rsp],r10
	mov	qword ptr 16[rsp],r11
	movsd	qword ptr 24[rsp],xmm0
	movsd	qword ptr 32[rsp],xmm1
	movsd	qword ptr 40[rsp],xmm2
	movsd	qword ptr 48[rsp],xmm3
	movsd	qword ptr 56[rsp],xmm4
	movsd	qword ptr 64[rsp],xmm5
 endif

	mov	rbp,rsp
	sub	rsp,40
	and	rsp,-16
 ifdef LINUX
	mov	rdi,8192
			; 128*FunctionProfile
	call	malloc
 else
	mov	rcx,128*FunctionProfile
	call	allocate_memory
 endif
	mov	rsp,rbp

 ifdef LINUX
	mov	rsi,qword ptr [rsp]
	mov	rdi,qword ptr 8[rsp]
	mov	r8,qword ptr 16[rsp]
	mov	r10,qword ptr 24[rsp]
	mov	r11,qword ptr 32[rsp]
	movlpd	xmm0,qword ptr 40[rsp]
	movlpd	xmm1,qword ptr 48[rsp]
	movlpd	xmm2,qword ptr 56[rsp]
	movlpd	xmm3,qword ptr 64[rsp]
	movlpd	xmm4,qword ptr 72[rsp]
	movlpd	xmm5,qword ptr 80[rsp]
	movlpd	xmm6,qword ptr 88[rsp]
	movlpd	xmm7,qword ptr 96[rsp]
	add	rsp,104
 else
	mov	r8,qword ptr [rsp]
	mov	r10,qword ptr 8[rsp]
	mov	r11,qword ptr 16[rsp]
	movlpd	xmm0,qword ptr 24[rsp]
	movlpd	xmm1,qword ptr 32[rsp]
	movlpd	xmm2,qword ptr 40[rsp]
	movlpd	xmm3,qword ptr 48[rsp]
	movlpd	xmm4,qword ptr 56[rsp]
	movlpd	xmm5,qword ptr 64[rsp]
	add	rsp,72
 endif

	test	rax,rax

	pop	rbp
	pop	rdx
	pop	rcx

	je	no_memory

	mov	rbx,rax
	mov	rax,128
	mov	qword ptr global_last_allocated_block,rbx

no_alloc:	
	dec	rax
	mov	qword ptr global_n_free_records_in_block,rax
	lea	rax,FunctionProfile[rbx]
	mov	qword ptr global_last_allocated_block,rax

	xor	rax,rax
	mov	qword ptr time[rbx],rax
	mov	qword ptr n_profiler_calls[rbx],rax
	mov	qword ptr n_strict_calls[rbx],rax
	mov	qword ptr n_lazy_calls[rbx],rax
	mov	qword ptr n_curried_calls[rbx],rax
	mov	qword ptr n_words_allocated[rbx],rax

	mov	rax,qword ptr global_profile_records
	mov	qword ptr name_[rbx],rbp

	mov	qword ptr next[rbx],rax
	mov	qword ptr global_profile_records,rbx

	mov	qword ptr [rbp],rbx
	pop	rax
	ret

no_memory:
	lea	rbp,not_enough_memory_for_profiler
	pop	rax
	jmp	print_error

write_profile_information:
	mov	rbp,rsp
	sub	rsp,40
	and	rsp,-16
 ifdef LINUX
	mov	r13,rsi
	mov	r14,rdi
	lea	rdi,profile_file_name
 else
	lea	rcx,profile_file_name
 endif
	call	create_profile_file_name
 ifdef LINUX
	mov	rsi,r13
	mov	rdi,r14
 endif
	mov	rsp,rbp

	mov	rax,1
	lea	rcx,profile_file_name
	call	openF

	test	r10,r10
	je	cannot_open
	
	mov	rbp,qword ptr global_profile_records

write_profile_lp:	
	test	rbp,rbp
	je	end_list

	mov	rdx,qword ptr name_[rbp]

	push	rbp

	push	rdx

	mov	edx,dword ptr (-4)[rdx]
	mov	eax,dword ptr [rdx]
	add	rdx,4

write_module_name_lp:
	sub	rax,1
	jc	end_module_name

	push	rax
	push	rdx

	movzx	r10,byte ptr [rdx]
	call	writeFC

	pop	rdx
	pop	rax

	add	rdx,1
	jmp	write_module_name_lp

end_module_name:
	mov	r10,' '
	call	writeFC

	pop	rdx

	add	rdx,7
	
write_function_name_lp:
	movzx	r10,byte ptr 1[rdx]
	add	rdx,1

	test	r10,r10
	je	end_function_name

	push	rdx

	call	writeFC

	pop	rdx

	jmp	write_function_name_lp

end_function_name:
	mov	r10,' '
	call	writeFC

	mov	rbp,qword ptr [rsp]
	mov	r10,qword ptr n_strict_calls[rbp]
	call	writeFI_space

	mov	rbp,qword ptr [rsp]
	mov	r10,qword ptr n_lazy_calls[rbp]
	call	writeFI_space

	mov	rbp,qword ptr [rsp]
	mov	r10,qword ptr n_curried_calls[rbp]
	call	writeFI_space

	mov	rbp,qword ptr [rsp]
	mov	r10,qword ptr n_profiler_calls[rbp]
	call	writeFI_space

	mov	rbp,qword ptr [rsp]
	mov	r10,qword ptr n_words_allocated[rbp]
	call	writeFI_space

	mov	rbp,qword ptr [rsp]
	mov	r10,qword ptr time[rbp]
	call	writeFI

	mov	r10,10
	call	writeFC

	pop	rbp
	mov	rbp,qword ptr next[rbp]
	jmp	write_profile_lp

writeFI_space:
	call	writeFI

	mov	r10,' '
	jmp	writeFC

end_list:
	call	closeF

cannot_open:
	ret

write_profile_stack:
	mov	rax,qword ptr profile_stack_pointer

	test	rax,rax
	je	stack_not_initialised

	push	rax

	mov	rbp,rsp
	sub	rsp,40
	and	rsp,-16
 ifdef LINUX
	mov	r13,rsi
	mov	r14,rdi
	lea	rdi,stack_trace_string
 else
	lea	rcx,stack_trace_string
 endif
	call	ew_print_string
 ifdef LINUX
	mov	rsi,r13
	mov	rdi,r14
 endif
	mov	rsp,rbp

	pop	rax

;	mov	rbp,12
	mov	rbp,qword ptr stack_trace_depth
write_functions_on_stack:
	mov	rbx,qword ptr (-8)[rax]
	sub	rax,8

	test	rbx,rbx
	je	end_profile_stack

	push	rax
	mov	rcx,qword ptr name_[rbx]

	push	rbp

	mov	edx,dword ptr (-4)[rcx]
	add	rcx,8

	mov	r12d,dword ptr [rdx]
	lea	r13,4[rdx]

	mov	rbp,rsp
	sub	rsp,40
	and	rsp,-16
 ifdef LINUX
	mov	r11,rsi
	mov	r14,rdi
	mov	rdi,rcx
 endif
	call	ew_print_string

 ifdef LINUX
	lea	rdi,module_string
 else
	lea	rcx,module_string
 endif
	call	ew_print_string

 ifdef LINUX
	mov	rsi,r12
	mov	rdi,r13
 else
	mov	rdx,r12
	mov	rcx,r13
 endif
	call	ew_print_text

 ifdef LINUX
	mov	rdi,']'
 else
	mov	rcx,']'
 endif
	call	ew_print_char

 ifdef LINUX
	mov	rdi,10
 else
	mov	rcx,10
 endif
	call	ew_print_char
 
ifdef LINUX
	mov	rsi,r11
	mov	rdi,r14
 endif
	mov	rsp,rbp

	pop	rbp
	pop	rax

	sub	rbp,1
	jne	write_functions_on_stack
	
end_profile_stack:
stack_not_initialised:
	ret

init_profiler:
	mov	rbp,rsp
	sub	rsp,40
	and	rsp,-16
 ifdef LINUX
	mov	r13,rsi
	mov	r14,rdi
	mov	rdi,qword ptr ab_stack_size
	call	malloc
	mov	rsi,r13
	mov	rdi,r14
 else
	mov	rcx,qword ptr ab_stack_size
	call	allocate_memory
 endif
	mov	rsp,rbp
	
	test	rax,rax
	je	init_profiler_error

	push	rax

	lea	rbp,start_string
	call	allocate_function_profile_record

	pop	rdx

	mov	qword ptr 8[rdx],rbx
	mov	qword ptr [rdx],0
	add	rdx,16
	mov	qword ptr profile_stack_pointer,rdx
	mov	qword ptr global_last_tail_call,0

	mov	qword ptr global_n_words_free,r15

	rdtsc
	mov	dword ptr global_time_hi,edx
	mov	dword ptr global_time_lo,eax
	ret

init_profiler_error:
	mov	qword ptr profile_stack_pointer,0
	lea	rbp,not_enough_memory_for_profile_stack
	jmp	print_error

_TEXT	ends

	_DATA segment

	align (1 shl 3)

global_n_free_records_in_block:
	dq 0
; 0 n free records in block
global_last_allocated_block:
	dq 0
; 8 latest allocated block
global_profile_records:
	dq 0
; 16 profile record list
global_time_hi:
	dd 0
; 24 clock
global_time_lo:
	dd 0
global_last_tail_call:
	dq 0
; last tail calling function
global_n_words_free:
	dq 0

profile_file_name:
	dq	__STRING__+2
	dq	0
	dq	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
	dq	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
	dq	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
	dq	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
	dq	0
stack_trace_depth:
	dq	12
	align	(1 shl 3)

; m_system also defined in istartup.s
; m_system:
;	dq	6
;	db	"System"
;	db	0
;	db	0

	dd	m_system
start_string:
	dq	0
	db	"start"
	db	0
	align	(1 shl 3)
not_enough_memory_for_profile_stack:
	db	"not enough memory for profile stack"
	db	10
	db	0
not_enough_memory_for_profiler:
	db	"not enough memory for profiler"
	db	10
	db	0
stack_trace_string:
	db	"Stack trace:"
	db	10
	db	0
module_string:
	db	" [module: "
	db	0
	align	(1 shl 3)

_DATA	ends

;	end