;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Test program to determine if reloading CR3 with the same value
; causes a TLB flush.
;
; Requires DOS and 386+ CPU to run. Requires NASM to build:
;	nasm -f bin -o test.com test.asm
;
; Changes from previous version:
; - Touching all 1024 pages of the bottom 4 MB of memory after changing CR3
; - Measuring only the amount of time needed to touch pages,
;   not time needed to load CR3
; - Using timer chip for timing instead of RDTSC,
;   so code should work on 386 and 486 systems
; - Can now run code from DOS or from a bootsector
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	ORG 100h		; DOS .COM file
	BITS 16			; everything up to "pmode" is real-mode code

; Real-mode start-up:
;   1. note whether a DOS PSP is present (decides how "exit" terminates)
;   2. refuse to run on a 16-bit CPU or with CR0.PE already set
;   3. patch the GDT so the pmode code/data segments get the same base
;      address as the real-mode DS
;   4. build one page table and two page directories behind "_heap"
;   5. set CR0.PE and far-jump to the 32-bit code at "pmode"
; Register contract for "pmode": EBP = linear address of the page table;
; page directory #1 is at EBP+4096 and page directory #2 at EBP+8192.

; check for DOS PSP
	cmp word [0],20CDh	; "INT 20h"
	jne no_dos
	inc byte [_dos]		; non-zero -> "exit" uses INT 21h
no_dos:

; check for 32-bit CPU
	pushf
		pushf
		pop bx		; old FLAGS -> BX
		mov ax,bx
		xor ah,70h	; try changing b14 (NT)...
		push ax		; ... or b13:b12 (IOPL)
		popf
		pushf
		pop ax		; new FLAGS -> AX
	popf
	xor ah,bh		; 32-bit CPU if we changed EFLAGS.NT...
	and ah,70h		; ...or EFLAGS.IOPL
	mov si,_cpu_msg		; (MOV leaves ZF from the AND untouched)
	je mexit

; check for virtual 8086 mode
	smsw ax			; check MSW.PE = b0 of MSW
	test al,1		; (MSW = low word of CR0)
	je no_vm86
	mov si,_v86_msg
mexit:
	call puts16		; display message
exit:
	xor ax,ax		; if loaded from DOS...
	or al,[_dos]
	je reboot

	mov ax,4C01h		; ...DOS terminate, exit code 1
	int 21h
reboot:
	mov ah,0		; await key pressed
	int 16h

	int 19h			; re-start the boot process

no_vm86:
; clear screen by re-setting text mode 3
	mov ax,3
	int 10h

; set up same segment base addresses in protected mode as in real mode
	xor ebx,ebx
	mov bx,ds
	shl ebx,4		; EBX = linear base address of DS

	mov eax,ebx
	shr eax,16		; AL = base 23:16, AH = base 31:24
	mov [_gdt_cs + 2],bx
	mov [_gdt_cs + 4],al
	mov [_gdt_cs + 7],ah

	mov [_gdt_ds + 2],bx
	mov [_gdt_ds + 4],al
	mov [_gdt_ds + 7],ah

	lea eax,[ebx + _gdt]	; LGDT needs a linear address, not an offset
	mov [_gdt_ptr + 2],eax

; The STOSD instructions below write through ES:DI and honour the
; direction flag. Neither is set up by the code above, so do it here
; rather than rely on the state the loader happened to leave behind.
	push ds
	pop es
	cld

; get some memory for page tables. The linear address
; of this memory must be aligned on a page (4K) boundary.
	lea ebp,[ebx + _heap]	; linear address of "_heap"
	add ebp,4095		; align to 4K
	and bp,0F000h		; 16-bit AND: clears b11:b0, keeps b31:b12
	mov edi,ebp		; EBP=linear adr (relative to 0)...
	sub edi,ebx		; ...(E)DI=offset (relative to DS base)

; use first 4K of aligned memory for page table that identity-maps
; the bottom 4 MB of memory
	mov cx,1024
	mov eax,3		; b2=0=ring 0, b1=1=writable, b0=1=present
init_ptab:
	stosd
	add eax,1000h		; next 4K page frame
	loop init_ptab

; use next 4K for page directory (top-level page table) #1
; (at linear address [ebp+4096])
	mov eax,ebp		; linear adr of page table (= physical adr,
				; because paging is still off)
	or al,3			; b2=0=ring 0, b1=1=writable, b0=1=present
	stosd
	mov cx,1023		; zero the other 1023 entries
	xor eax,eax
	rep stosd

; use next 4K for page directory #2
; (at linear address [ebp+8192])
; It maps exactly the same page table as directory #1, so switching
; CR3 between the two changes nothing except the value in CR3.
	mov eax,ebp		; linear adr of the same page table
	or al,3			; b2=0=ring 0, b1=1=writable, b0=1=present
	stosd
	mov cx,1023		; zero the other 1023 entries
	xor eax,eax
	rep stosd

; switch to 32-bit pmode
	push dword 2		; interrupts off (EFLAGS = only the
	popfd			; always-set bit 1; also clears DF and NT)

	lgdt [_gdt_ptr]		; load GDTR

	mov eax,cr0		; set CR0.PE, leaving the other CR0 bits
	or al,1			; exactly as they were found
	mov cr0,eax

	jmp CSEL:pmode		; load all seg regs, starting with CS

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; name:			puts16
; action:		displays text on screen
; in:			0-terminated string at DS:SI
; out:			(nothing)
; modifies:		(nothing)
; minimum CPU:		8088
; notes:		real-mode (16-bit) code
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

puts16:
	push ax
	push bx
	push si
	pushf			; keep the caller's FLAGS (including DF)
		cld		; LODSB must step forward through the string
		mov ah,0Eh	; INT 10h: teletype output
		xor bx,bx	; video page 0
		jmp short .fetch
.emit:
		int 10h		; display character in AL
.fetch:
		lodsb		; AL = next character, SI advanced
		or al,al	; 0 terminates the string
		jne .emit
	popf
	pop si
	pop bx
	pop ax
	ret

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; 32-BIT CODE
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	BITS 32

PAGE_BYTES	equ 4096	; size of one page / one paging structure
LOW_PAGES	equ 1024	; pages in the bottom 4 MB
CR0_PG		equ 80000000h	; CR0 paging-enable bit
PIT_MASK	equ 0FFFFh	; the timer counter is only 16 bits wide

; Entry point of the 32-bit code; reached by the far jump that follows
; setting CR0.PE.
; in:	EBP = linear address of the page table; page directory #1 is at
;	EBP+PAGE_BYTES and page directory #2 at EBP+2*PAGE_BYTES
; Never returns: the results stay on screen and the CPU spins.
pmode:
	mov ax,DSEL
	mov ds,ax
	mov ss,ax
	movzx esp,sp		; SS is now a 32-bit stack segment, so the
				; CPU uses all of ESP; real mode only
				; maintained the low 16 bits
	mov fs,ax
	mov gs,ax

	mov ax,LSEL		; ES = data segment with base address 0
	mov es,ax

; display "pmode enabled"
	mov esi,_pmode_msg
	call puts32

; enable paging
	lea eax,[ebp + PAGE_BYTES]	; linear address of page dir #1
	mov cr3,eax

	mov eax,cr0			; set CR0.PG, leaving the other
	or eax,CR0_PG			; CR0 bits (PE included) as they are
	mov cr0,eax
; Jump straight after turning paging on so that anything the CPU
; prefetched while paging was still off gets thrown away.
	jmp short paging_enabled
paging_enabled:

	mov esi,_paging_msg		; display "paging enabled"
	call puts32

;;;;;;;;;;;;; load CR3 with different value ;;;;;;;;;;;;;

	lea ebx,[ebp + 2*PAGE_BYTES]	; linear address of page dir #2
	mov cr3,ebx

	call time_touch			; EAX = time needed to touch pages

	mov esi,_diff_msg		; display it...
	call puts32
	mov ebx,10			; ...in decimal
	call wrnum

;;;;;;;;;;;;; load CR3 with same value ;;;;;;;;;;;;;

	mov eax,cr3
	mov cr3,eax

	call time_touch			; EAX = time needed to touch pages

	mov esi,_same_msg		; display it...
	call puts32
	mov ebx,10			; ...in decimal
	call wrnum

;;;;;;;;;;;;; do nothing with CR3 ;;;;;;;;;;;;;

	call time_touch			; EAX = time needed to touch pages

	mov esi,_nop_msg		; display it...
	call puts32
	mov ebx,10			; ...in decimal
	call wrnum

; freeze
	jmp short $

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; name:			time_touch
; action:		reads one dword from each of the LOW_PAGES pages
;			in the bottom 4 MB and times how long that takes
; in:			ES = data segment with base address 0
; out:			EAX = elapsed timer counts (0..65535)
; modifies:		EAX
; minimum CPU:		'386
; notes:		32-bit pmode code
;			The difference of the two readings is reduced
;			modulo 65536 because the hardware counter is 16
;			bits wide; without that, a counter reload between
;			the readings shows up as a value near 4 billion.
;			NOTE(review): the timer is used as the firmware
;			left it; if channel 0 is in square-wave mode the
;			count steps by 2 per input clock - confirm before
;			converting counts to time.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

time_touch:
	push ebx
	push ecx
	push edi
		call timer		; read start time
		mov ebx,eax

		xor edi,edi		; start at linear address 0
		mov ecx,LOW_PAGES
.touch:
		mov eax,[es:edi]	; one read per page
		add edi,PAGE_BYTES
		loop .touch

		call timer		; get elapsed time...
		sub eax,ebx
		and eax,PIT_MASK	; ...modulo the 16-bit counter width
	pop edi
	pop ecx
	pop ebx
	ret

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; name:			timer
; action:		reads 8253/8254 timer chip free-running counter
; in:			(nothing)
; out:			inverted (up-counting) timer count in EAX
; modifies:		EAX
; minimum CPU:		'386
; notes:		32-bit pmode code
;			8253/8254 counter decrements once every 838.09 nsec
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

timer:
	pushf				; caller's flags (incl. IF) come back below
		cli			; interrupts off

		xor eax,eax		; AL = 0 = command byte, upper EAX = 0
		out 43h,al		; latch timer value

		in al,40h		; first read delivers the LSB...
		mov ah,al
		in al,40h		; ...second read delivers the MSB
		xchg al,ah		; AX = MSB:LSB

		not eax			; make it count UP; not down
	popf
	ret

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; name:			putc32
; action:		displays one character on screen
; in:			character in AL
; out:			(nothing)
; modifies:		(nothing)
; minimum CPU:		'386
; notes:		32-bit pmode code
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

putc32:
	push es
	push ebx
	push ecx
		cmp al,13		; carriage return starts a new line
		je .newline
		cmp al,' '		; every other control char is dropped
		jb .done

		mov bx,LSEL		; ES = segment with base address 0
		mov es,bx

		movzx ebx,byte [_csr_y]
		imul ebx,ebx,80		; 80 character cells per row
		movzx ecx,byte [_csr_x]
		add ebx,ecx		; EBX = csr_y * 80 + csr_x

		; two bytes per cell; only the character byte is written
		mov [es:ebx*2 + 0B8000h],al

		inc byte [_csr_x]	; advance cursor
		cmp byte [_csr_x],80
		jb .done
.newline:
		mov byte [_csr_x],0	; line wrap
		inc byte [_csr_y]
		cmp byte [_csr_y],25
		jb .done
		mov byte [_csr_y],0	; screen wrap (no scrolling)
.done:
	pop ecx
	pop ebx
	pop es
	ret

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; name:			puts32
; action:		displays text on screen
; in:			0-terminated string at DS:ESI
; out:			(nothing)
; modifies:		(nothing)
; minimum CPU:		'386
; notes:		32-bit pmode code
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

puts32:
	push eax
	push esi
		cld			; LODSB walks forward through the string
.next:
		lodsb			; AL = next character
		test al,al		; 0 terminates the string
		jz .done
		call putc32
		jmp short .next
.done:
	pop esi
	pop eax
	ret

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; name:			ltoa
; action:		converts 32-bit unsigned value to string
; in:			32-bit unsigned value in EAX, radix in EBX,
;			ESI -> buffer
; out:			ESI -> first character of the converted string
; modifies:		ESI
; minimum CPU:		'386
; notes:		32-bit pmode code
;			ESI must point to zero byte at END of buffer
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ltoa:
	push edx
	push eax
		; A radix below 2 cannot be converted: 0 makes DIV fault and
		; 1 never reduces the quotient, so the loop would store digits
		; downward through memory forever. Store nothing in that case
		; and leave ESI where the caller put it.
		cmp ebx,2
		jb .done
.digit:
		xor edx,edx	; EDX:EAX is dividend...
		div ebx		; ...EBX is divisor...
				; ...EAX is quotient, EDX is remainder
		add dl,'0'	; convert remainder from binary to ASCII
		cmp dl,'9'
		jbe .store
		add dl,('A'-('9'+1))	; remainders 10 and up become 'A'...
.store:
		dec esi		; store ASCII remainders in reverse order
		mov [esi],dl
		test eax,eax	; loop until quotient == 0
		jnz .digit
.done:
	pop eax
	pop edx
	ret

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; name:			wrnum
; action:		displays 32-bit value
; in:			32-bit unsigned value in EAX, radix in EBX
; out:			(nothing)
; modifies:		(nothing)
; minimum CPU:		'386
; notes:		32-bit pmode code
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

wrnum:
	push esi
		lea esi,[_num_buf]	; zero byte that ends the digit buffer
		call ltoa		; digits are stored below it; ESI
					; comes back pointing at the first one
		call puts32		; display from ESI up to the zero byte
	pop esi
	ret

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; DATA
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Messages shown through puts16 (BIOS teletype output) end in CR, LF.
_cpu_msg:
	db "32-bit CPU required (386SX or better)", 13, 10, 0
_v86_msg:
	db "CPU in Virtual-8086 mode (Windows DOS box or EMM386 loaded?)"
	db 13, 10, 0
; Messages shown through puts32 use a lone CR (13) as line break:
; putc32 starts a new line on 13 and drops all other control characters.
_pmode_msg:
	db "Now in 32-bit protected mode", 13, 0
_paging_msg:
	db "Paging enabled", 13, 0
_diff_msg:
	db "Clock count after loading CR3 with different value: ", 0
_same_msg:
	db 13, "Clock count after loading CR3 with SAME value: ", 0
_nop_msg:
	db 13, "Clock count after doing nothing with CR3: ", 0

; Global descriptor table. Each selector constant (LSEL, CSEL, DSEL) is
; the byte offset of its descriptor from "_gdt".
_gdt:
	dd 0, 0			; NULL descriptor

; Base address 0, never patched: offsets in this segment are linear
; addresses.
LSEL equ ($ - _gdt)		; LINEAR data segment selector
	dw 0FFFFh		; limit 15:0
	dw 0			; base 15:0
	db 0			; base 23:16
	db 92h			; present, ring 0, data, expand-up, writable
	db 0CFh			; page-granular limit, 32-bit
	db 0			; base 31:24

; The base fields of the next two descriptors are filled in by the
; start-up code with the linear address of the real-mode DS.
_gdt_cs:			; CODE descriptor...
CSEL equ ($ - _gdt)		; ...and selector
	dw 0FFFFh		; limit 15:0
	dw 0			; base 15:0 (set above)
	db 0			; base 23:16 (set above)
	db 9Ah			; present, ring 0, code, readable
	db 0CFh			; page-granular limit, 32-bit
	db 0			; base 31:24 (set above)

_gdt_ds:			; DATA descriptor...
DSEL equ ($ - _gdt)		; ...and selector
	dw 0FFFFh		; limit 15:0
	dw 0			; base 15:0 (set above)
	db 0			; base 23:16 (set above)
	db 92h			; present, ring 0, data, expand-up, writable
	db 0CFh			; page-granular limit, 32-bit
	db 0			; base 31:24 (set above)
_gdt_end:

; Operand for LGDT: 16-bit limit followed by 32-bit linear base address.
_gdt_ptr:
	dw _gdt_end - _gdt - 1	; GDT limit
	dd _gdt			; GDT linear adr (fixed up above)

; Digit buffer for wrnum/ltoa. ltoa stores digits downward starting just
; below "_num_buf", so the space comes BEFORE the label and the label
; itself is the terminating zero byte.
	times 256 db 0
_num_buf:
	db 0
; Text cursor kept by putc32: column (0..79) and row (0..24).
_csr_x:
	db 0
_csr_y:
	db 0
; Non-zero when the start-up code found "INT 20h" at DS:0 (a DOS PSP).
_dos:
	db 0

; End of the program image. The page table and the two page directories
; are placed in the memory after this label, starting at the next
; 4K-aligned linear address.
_heap:
