html-program

.text
; Carry-less multiply (clmul*) and AES instruction test.
; The first four clmul lines use registers that have not been loaded at this
; point and whose results are never printed. The checked runs further down
; load vector_a/vector_b and print each product next to the matching result_*
; constant from .rodata.
    alloc	96
    write	"test carry-less multiply"
    clmulll	%r34, %r21, %r22
    clmulhl	%r34, %r21, %r22
    clmulhl	%r34, %r21, %r22
    clmulhh	%r34, %r21, %r22
.rodata
align 16
vector_a:
    d8	0x7b5b546573745665
    d8	0x63746f725d53475d
vector_b:
    d8	0x4869285368617929
    d8	0x5b477565726f6e5d
result_00:
    d8	0x1d4d84c85c3440c0
    d8	0x929633d5d36f0451
result_01:
    d8	0x1bd17c8d556ab5a1
    d8	0x7fa540ac2a281315
result_10:
    d8	0x1a2bf6db3a30862f
    d8	0xbabf262df4b7d5c9
result_11:
    d8	0x1d1e1f2c592e7c45
    d8	0xd66ee03e410fd4ed
.text
; r12 = vector_a, r13 = vector_b (two d8 values each, loaded with ldqr).
    ldqr	%r12, vector_a
    ldqr	%r13, vector_b

; Each step: compute into r11, load the reference constant into r21,
; print both values side by side.
    clmulll	%r11, %r12, %r13
    ldqr	%r21, result_00
    write	"clmul: %x128(r11) %x128(r21)"
; Same mnemonic as the next step but with the source operands swapped
; (r13, r12); this one is paired with result_01.
    clmulhl	%r11, %r13, %r12
    ldqr	%r21, result_01
    write	"clmul: %x128(r11) %x128(r21)"
    clmulhl	%r11, %r12, %r13
    ldqr	%r21, result_10
    write	"clmul: %x128(r11) %x128(r21)"
    clmulhh	%r11, %r12, %r13
    ldqr	%r21, result_11
    write	"clmul: %x128(r11) %x128(r21)"

; AES instructions: each mnemonic is issued once on r12/r13; no result is
; printed between the two banner lines.
    write	"test aes"
    aesdec	%r11, %r12, %r13
    aesdeclast	%r11, %r12, %r13
    aesenc	%r11, %r12, %r13
    aesenclast	%r11, %r12, %r13
    aesimc	%r11, %r12
    aeskeygenassist %r11, %r12, 250
    write	"end aes test"
.end
.text
;*****************************************************************
; ARITHMETIC
;*****************************************************************
; Integer arithmetic test. The first part loads small constants, runs one
; operation at a time and prints the result with a labelled write. The
; unlabelled instruction runs further down issue a variety of operand and
; immediate forms without printing anything.
    alloc	96
    write	"test load constant (1234567)"
    ldi		%r1, 1234567
    write	"ldi: %i64(r1)"

    write	"test load long constant (123456789012345678)"
    ldi.l	%r1, 123456789012345678
    write	"ldi long: %i64(r1)"

    write	"test simple arithmetic"
    ldi		%r1, 1
    ldi		%r2, 2
    ldi		%r3, 3

    write	"add 1+2"
    add		%r4, %r1, %r2
    write	"add: %i64(r4)"

    write	"add immediate 1+6"
    addi	%r4, %r1, 6
    write	"addi: %i64(r4)"

    write	"sub 1-2"
    sub		%r4, %r1, %r2
    write	"sub: %i64(r4)"

    write	"sub from immediate 6-1"
    subfi	%r4, %r1, 6
    write	"subfi: %i64(r4)"

    write	"mul 3*4"
    ldi		%r1, 3
    ldi		%r2, 4
    mul		%r4, %r1, %r2
    write	"mul: %i64(r4)"

    write	"12 div 4"
    ldi		%r1, 12
    ldi		%r2, 4
    div		%r4, %r1, %r2
; NOTE(review): unlike the neighbouring tests this result line has no
; "div: " prefix -- confirm whether that is intentional.
    write	"%i64(r4)"

    write	"15 mod 4"
    ldi		%r1, 15
    ldi		%r2, 4
    mod		%r4, %r1, %r2
    write	"mod: %i64(r4)"

    write	"test int32_t add"
    ldi.l	%r1, 0xFFFFFFFF
    ldi.l	%r2, 0xFFFFFFF0
    addws	%r3, %r1, %r2
    write	"add4: %i64(r3)"
    addiws.l	%r3, %r1, 0xFFFFFFFF
    write	"addis4.l: %i64(r3)"


; Unprinted run: register/immediate forms with positive and negative
; immediates, plus %sp as a source operand. Source registers here are not
; initialised by this program.
    addi	%r45, %r45, 12
    mov		%r54, %r56
    sub		%r45, %r56, %r50
    addi	%r45, %r55, -1000
    cmpdne	%r12, %r56, %r10
    subfi	%r45, %r56, -10000
    subfi	%r45, %r56, -20000
    cmpdeq	%r13, %r56, %r50
    add		%r45, %r56, %r50
    addi	%r45, %r56, -10000
    mul		%r45, %r56, %r50
    muli	%r45, %r56, -10000
    mov		%r55, %r20
    ldi		%r55, 1200
    ldi		%r55, 987654
    ldi.l	%r56, 98765432198765432
    addi	%r12, %r13, -789
    cmpdne	%r14, %r13, %r77
    nand	%r43, %r44, %r34
    nor		%r43, %r44, %r34
    addi	%r56, %sp, 0
    ; callr	%r0, quadrat
    add		%r56, %sp, %sp

; Long-immediate (.l) forms, including the most negative 64-bit value.
    ldi.l	%r55, -9223372036854775808
    addi	%r56, %sp, -64
    subfi.l	%r55, %r56,12345678901234567
    nor		%r12, %r14, %r14
    addi	%r56, %sp, -64
    nor		%r12, %r14, %r14
    subfi.l	%r55, %r56, 12345678901234567
    addi	%r56, %sp, -64
    subfi.l	%r55, %r56, -12345678901234567
    addi	%r56, %sp, -64
    subfi.l	%r55, %r56, -12345678901234567
    addi.l	%r45, %r56, 12345678



; Overflow variants: r5 and r6 hold two large constants that differ by one.
    ldi.l	%r5, 0xaFFFFFFF12345677
    ldi.l	%r6, 0xaFFFFFFF12345678

    write	"test signed overflow: %i64(r5) %i64(r6)"

    write	"add overflow"
    addo	%r2, %r5, %r6
    write	"addo: %i64(r2)"

    write	"subtract overflow"
    subo	%r2, %r5, %r6
    write	"subo: %i64(r2)"

; Four-operand carry/borrow forms; inputs are printed as unsigned, the
; result as signed.
    write	"test unsigned add carry"
    ldi		%r7, -1
    ldi		%r5, -2
    ldi		%r6, -1
    addaddc	%r2, %r5, %r6, %r7
    write	"addaddc: %u64(r5) %u64(r6) %u64(r7) => %i64(r2)"

    write	"test unsigned subtract borrow"
    ldi		%r7, -1
    ldi		%r5, 12
    ldi		%r6, -1
    subsubb	%r2, %r5, %r6, %r7
    write	"subsub: %u64(r5) %u64(r6) %u64(r7) => %i64(r2)"

; Fused three-source forms, issued once each; results are not printed.
    muladd	%r34, %r45, %r67, %r80
    mulsub	%r34, %r45, %r67, %r80
    mulsubf	%r34, %r45, %r67, %r80
    addadd	%r34, %r45, %r67, %r80
    addsub	%r34, %r45, %r67, %r80
    subsub	%r34, %r45, %r67, %r80

.end
.text
; Atomic instruction test.
; r5 = sp - 64 is used as the memory address for every operation below;
; r10, r12 and r56 hold the small operand values. The groups walk the
; ld<op><size>, cas<size>, lda<size> and sta<size> mnemonics through their
; ordering suffixes (.relaxed/.acquire/.release/.acq_rel). Only the base
; address is printed; no operation result is.
    alloc 96
    write "test atomic fetch-op"
    addi %r5, %sp, -64
    write "atomic base: %x64(r5)"
    ldi  %r10, 5
    ldi  %r12, 10
    ldi  %r56, 5

    write "test amoadd"

; ldadd: every size suffix (b/h/w/d) with every ordering suffix.
    ldaddb.relaxed %r4, %r5, %r10
    ldaddb.acquire %r4, %r5, %r10
    ldaddb.release %r4, %r5, %r10
    ldaddb.acq_rel %r4, %r5, %r10

    ldaddh.relaxed %r4, %r5, %r10
    ldaddh.acquire %r4, %r5, %r10
    ldaddh.release %r4, %r5, %r10
    ldaddh.acq_rel %r4, %r5, %r10

    ldaddw.relaxed %r4, %r5, %r10
    ldaddw.acquire %r4, %r5, %r10
    ldaddw.release %r4, %r5, %r10
    ldaddw.acq_rel %r4, %r5, %r10

    ldaddd.relaxed %r4, %r5, %r10
    ldaddd.acquire %r4, %r5, %r10
    ldaddd.release %r4, %r5, %r10
    ldaddd.acq_rel %r4, %r5, %r10

    write "test amo-binary"

; and/or/xor: only the w size, with a subset of ordering suffixes.
    ldandw.relaxed %r4, %r5, %r10
    ldandw.acquire %r4, %r5, %r10
    ldandw.release %r4, %r5, %r10
    ldandw.acq_rel %r4, %r5, %r10

    ldorw.release %r4, %r5, %r10
    ldorw.acq_rel %r4, %r5, %r10

    ldxorw.relaxed %r4, %r5, %r10
    ldxorw.relaxed %r4, %r5, %r10

; Signed min/max: a sample of size/ordering combinations.
    write "test amomin"
    ldsminw.acquire %r4, %r5, %r10
    ldsmind.acq_rel %r4, %r5, %r10

    ldsmaxb.relaxed %r4, %r5, %r10
    ldsmaxh.acquire %r4, %r5, %r10
    ldsmaxw.release %r4, %r5, %r10
    ldsmaxd.acq_rel %r4, %r5, %r10

; Unsigned min/max: a sample of size/ordering combinations.
    write "test amominu"

    lduminb.relaxed %r4, %r5, %r10
    ldumind.acquire %r4, %r5, %r10
    ldumaxd.release %r4, %r5, %r10
    ldumaxw.release %r4, %r5, %r10

    write "test cas"

; cas: every size suffix (b/h/w/d) with every ordering suffix.
    casb.relaxed %r12, %r5, %r56
    casb.acquire %r12, %r5, %r56
    casb.release %r12, %r5, %r56
    casb.acq_rel %r12, %r5, %r56

    cash.relaxed %r12, %r5, %r56
    cash.acquire %r12, %r5, %r56
    cash.release %r12, %r5, %r56
    cash.acq_rel %r12, %r5, %r56

    casw.relaxed %r12, %r5, %r56
    casw.acquire %r12, %r5, %r56
    casw.release %r12, %r5, %r56
    casw.acq_rel %r12, %r5, %r56

    casd.relaxed %r12, %r5, %r56
    casd.acquire %r12, %r5, %r56
    casd.release %r12, %r5, %r56
    casd.acq_rel %r12, %r5, %r56

; Plain atomic loads: sizes b/h/w/d/q, relaxed then acquire.
    write "test load atomic relaxed"
    ldab.relaxed %r12, %r5
    ldah.relaxed %r12, %r5
    ldaw.relaxed %r12, %r5
    ldad.relaxed %r12, %r5
    ldaq.relaxed %r12, %r5

    write "test load atomic acquire"
    ldab.acquire %r12, %r5
    ldah.acquire %r12, %r5
    ldaw.acquire %r12, %r5
    ldad.acquire %r12, %r5
    ldaq.acquire %r12, %r5

; Plain atomic stores: sizes b/h/w/d/q, relaxed then release.
    write "test store atomic relaxed"
    stab.relaxed %r12, %r5
    stah.relaxed %r12, %r5
    staw.relaxed %r12, %r5
    stad.relaxed %r12, %r5
    staq.relaxed %r12, %r5

    write "test store atomic release"
    stab.release %r12, %r5
    stah.release %r12, %r5
    staw.release %r12, %r5
    stad.release %r12, %r5
    staq.release %r12, %r5

.end
.text
; Data-section addressing and load/store test.
; The first part prints the data_lbl address pieces (data_hi/data_lo) and
; reads back the four bytes stored at data_lbl. After "jmp skipaddr" the
; remaining load/store forms sit between the jump and the skipaddr label, so
; they appear to be assembled but not executed.
.data
data_lbl:
    d1	25
    d1	26
    d1	27
    d1	28

.text
program_start:
; Here we test references to data section.
; Absolute offset from begin of section
    write	"base addressing"
    alloc	96
    ldar	%r17, program_start
    ldi		%r12, data_lbl
    write	"data_lbl: %i64(r12)"

    ldi		%r12, data_hi(data_lbl)
    write	"data_hi(data_lbl): %i64(r12)"
    ldi		%r12, data_lo(data_lbl)
    write	"data_lo(data_lbl): %i64(r12)"
    ldafr	%r13, data_lbl
    write	"ldafr(data_lbl): %x64(r13)"
    ldafr.l	%r13, data_lbl
    write	"ldafr(data_lbl): %x64(r13)"

; Build the address in two steps: r17 + data_hi, then + data_lo.
    addi	%r13, %r17, data_hi(data_lbl)
    write	"r13     %i64(r13)"
    addi	%r14, %r13, data_lo(data_lbl)+0
    write	"r14     %i64(r14)"

; Same base again, this time with data_lo folded into each load displacement.
    addi	%r13, %r17, data_hi(data_lbl)
    write	"r13     %i64(r13)"
    ldbz	%r25, %r13, data_lo(data_lbl)+0
    ldbz	%r26, %r13, data_lo(data_lbl)+1
    ldbz	%r27, %r13, data_lo(data_lbl)+2
    ldbz	%r28, %r13, data_lo(data_lbl)+3
    write	"r25     %i64(r25)" ; must be 25
    write	"r26     %i64(r26)" ; must be 26
    write	"r27     %i64(r27)" ; must be 27
    write	"r28     %i64(r28)" ; must be 28

; test load context
    lddz	%r1, %sp, -16
    std		%r1, %sp, -16
; Both jump forms (short and .l) target skipaddr at the end of this program.
    jmp		skipaddr
    jmp.l	skipaddr

; test indexed load/store
    stbx	%r12, %r15, %r30, 4, 14
    sthx	%r12, %r15, %r30, 4, 14
    stwx	%r12, %r15, %r30, 4, 14
    stdx	%r12, %r15, %r30, 4, 14

    ldaq.relaxed %r30, %r56
    staq.relaxed %r43, %r56

    sladd	%r43, %r56, %r23, 4
    slsub	%r43, %r56, %r23, 42
    slsubf	%r43, %r56, %r23, 12

    ldwz	%r30, %r5, 66*4	; load mid
    lddzx	%r40, %tp, %r30, 0, 4	; load base

    lddsx	%r12, %r23, %r40, 3, 114
    lddsx	%r12, %r23, %r40, 3, 114
    lddzx	%r12, %r23, %r40, 3, 114
    lddzx	%r12, %r23, %r40, 3, 114
    stwx	%r12, %r23, %r40, 3, 114
    stdx	%r12, %r23, %r40, 3, 114

    ldbsx	%r12, %r23, %r40, 3, 114
    ldbsx	%r12, %r23, %r40, 3, 114
    ldbzx	%r12, %r23, %r40, 3, 114
    ldbzx	%r12, %r23, %r40, 3, 114
    stbx	%r12, %r23, %r40, 3, 114
    stbx	%r12, %r23, %r40, 3, 114

    ldhsx	%r12, %r23, %r40, 3, 114
    ldhsx	%r12, %r23, %r40, 3, 114
    ldhzx	%r12, %r23, %r40, 3, 114
    ldhzx	%r12, %r23, %r40, 3, 114
    sthx	%r12, %r23, %r40, 3, 114
    sthx	%r12, %r23, %r40, 3, 114

.text
; LOAD/STORE
; One group per access size (b, h, w, d, q). Each group lists the
; displacement, indexed (x), long-immediate (.l) and mia/mib forms.
    sladd	%r54, %r56, %r12, 5

    ldbz	%r16, %r45, 8900
    ldbs	%r15, %r46, 8900
    ldbzx	%r54, %r56, %r12, 2, 37
    ldbsx	%r53, %r65, %r12, 2, 37
    ldbzx.l	%r54, %r56, %r12, 2, 37000000
    ldbsx.l	%r53, %r65, %r12, 2, -37000000
    ldbzmia	%r52, %r75, 10
    ldbsmia	%r51, %r76, 10
    ldbzmib	%r52, %r75, 10
    ldbsmib	%r51, %r76, 10
    stbmia	%r51, %r76, 10
    stbmib	%r52, %r75, 10

    ldhz	%r12, %r45, 8900
    ldhs	%r12, %r45, 8900
    ldhzx	%r54, %r56, %r12, 3, -157
    ldhsx	%r54, %r56, %r12, 2, 237
    ldhzx.l	%r54, %r56, %r12, 2, 37000000
    ldhsx.l	%r53, %r65, %r12, 2, -37000000
    ldhzmia	%r54, %r56, 12
    ldhsmia	%r54, %r56, -60
    ldhzmib	%r54, %r56, 12
    ldhsmib	%r54, %r56, -60
    sthmia	%r51, %r76, 10
    sthmib	%r52, %r75, 10

    ldwz	%r12, %r45, 8900
    ldws	%r12, %r45, 8900
    ldwzx	%r54, %r56, %r12, 2, 7
    ldwsx	%r54, %r56, %r12, 2, 7
    ldwzx.l	%r54, %r56, %r12, 2, 37000000
    ldwsx.l	%r53, %r65, %r12, 2, -37000000
    ldwzmia	%r54, %r56, 12
    ldwsmia	%r54, %r56, 32
    ldwzmib	%r54, %r56, 12
    ldwsmib	%r54, %r56, 32
    stwmia	%r51, %r76, 10
    stwmib	%r52, %r75, 10

    lddz	%r54, %r56, 5600
    ldds	%r54, %r56, 5600
    lddz.l	%r53, %r46, 98765432
    lddz	%r52, %r45, -5600
    lddz.l	%r51, %r55, -98765432
    lddzx	%r50, %r56, %r12, 2, 37
    lddsx	%r50, %r56, %r12, 2, 37
    lddzx.l	%r54, %r56, %r12, 2, 37000000
    lddsx.l	%r53, %r65, %r12, 2, -37000000
    lddzmia	%r57, %r56, -12
    lddzmia	%r57, %r56, -12
    lddsmia	%r57, %r56, -12
    lddsmia	%r57, %r56, -12
    lddzmib	%r57, %r56, -12
    lddzmib	%r57, %r56, -12
    lddsmib	%r57, %r56, -12
    lddsmib	%r57, %r56, -12
    stdmia	%r51, %r76, 10
    stdmib	%r52, %r75, 10

    ldq		%r16, %r45, 8900
    ldq.l	%r16, %r45, 8900000
    ldq.l	%r16, %r45, -8900000
    ldqx	%r54, %r56, %r12, 2, 37
    ldqx.l	%r54, %r56, %r12, 2, 37000000
    ldqx.l	%r54, %r56, %r12, 2, -37000000
    ldqmia	%r52, %r75, 10
    ldqmia	%r52, %r75, 10
    ldqmib	%r52, %r75, 10
    ldqmib	%r52, %r75, 10
    stqmia	%r51, %r76, 10
    stqmib	%r52, %r75, 10

; Plain stores and mixed load/store pairs, including %sp-based forms and
; displacements written as expressions (890*8, 55*16).
    stb		%r12, %r45, 8900
    sth		%r12, %r45, 8900
    stw		%r12, %r45, 8900
    std		%r12, %r45, 890*8

    lddz	%r12, %r45, 8048
    std		%r12, %r45, 8064
    lddzx	%r12, %r45, %r13, 3, 7
    stdx	%r12, %r45, %r13, 3, 7

    lddz	%r60, %r55, 56
    lddz	%r60, %r56, 56
    lddz	%r46, %r55, 120
    std		%r47, %r55, 56

    lddz	%r60, %sp, 624
    std		%r60, %sp, 624
    lddzx	%r60, %sp, %r12, 3, 28
    stdx	%r60, %sp, %r12, 3, 26
    lddz	%r56, %r57, 567
    std		%r56, %r57, 567

    ldwz	%r34, %r12, 900
    lddz	%r34, %r12, 900
    stw		%r23, %r12, 900
    std		%r23, %r12, 900

    ldq		%r34, %r13, 55*16
    stq		%r35, %r13, 55*16
    ldqx	%r34, %r13, %r45, 3, 80
    stqx	%r34, %r13, %r45, 3, 80

; Target of the two jmp instructions above.
skipaddr:
    nop	0
.end
.text
; Population-statistic instruction test (cntpop / cntlz / cnttz).
; r23 holds a fixed 64-bit pattern. Only the first cntpop and cntlz results
; are printed; the remaining lines vary the immediate operand from 1 to 5
; without printing.
    alloc	25
    ldi.l	%r23, 0x1234567890abcdef
    write	"test population statistic instructions"
    cntpop	%r12, %r23, 3
    write	"cntpop: %i64(r12)"
    cntlz	%r12, %r23, 0
    write	"cntlz %i64(r12)"
    cnttz	%r12, %r23, 1
    cntlz	%r12, %r23, 2
    cnttz	%r12, %r23, 3
    cntlz	%r12, %r23, 4
    cnttz	%r12, %r23, 5
.end
.text
    ; Bit-permutation (permb) test.
    ; r55 holds a fixed 64-bit pattern. permb is applied with a 6-bit mask
    ; immediate; per the printed labels, each mask selects the granularity
    ; that is reversed (bits, bit-pairs, nibbles, bytes, 2 bytes, 4 bytes).
    write	"test bit reverse instruction (permb)"
    alloc	80
    ldi.l	%r55, 0x1234567890ABCDEF
    write	"initial value: %x64(r55)"
    ; Mask 63 is applied to r55 in place twice, printing after each pass.
    permb	%r55, %r55, 63
    permb	%r56, %r78, 63
    write	"r55 %x64(r55) %b64(r55)"
    permb	%r55, %r55, 63
    write	"r55 %x64(r55) %b64(r55)"

    permb	%r56, %r55, 0b111111 ;63
    write	"reverse bits: %x64(r56)"

    permb	%r56, %r55, 0b111110  ;32+16+8+4+2
    write	"reverse bit-pairs: %x64(r56)"

    permb	%r56, %r55, 0b111100  ;32+16+8+4
    write	"reverse nibbles (4-bits): %x64(r56)"

    permb	%r56, %r55, 0b111000 ;32+16+8
    write	"reverse 1bytes: %x64(r55) => %x64(r56)"

    permb	%r56, %r55, 0b110000  ;32+16
    write	"reverse 2bytes: %x64(r55) => %x64(r56)"

    permb	%r56, %r55, 0b100000  ;32
    write	"reverse 4bytes: %x64(r55) => %x64(r56)"
.end
.text
    ; Bitwise logical instruction test.
    ; Each mnemonic is issued once per available form: register, immediate
    ; (i) and long immediate (.l). The inputs r25/r45 are not initialised
    ; here and no result is printed.
    alloc	46
    write	"test bitwise logical"
    and		%r23, %r25, %r45
    andi	%r23, %r25, 12345
    andi.l	%r23, %r25, 1234567890
    andn	%r23, %r25, %r45
    andni	%r23, %r25, 12345
    or		%r23, %r25, %r45
    ori		%r23, %r25, 12345
    ori.l	%r23, %r25, 1234567890
    orn		%r23, %r25, %r45
    orni	%r23, %r25, 12345
    xor		%r23, %r25, %r45
    xori	%r23, %r25, 12345
    xori.l	%r23, %r25, 1234567890
    nor		%r23, %r25, %r45
    nand	%r23, %r25, %r45
    xnor	%r23, %r25, %r45
.end
.text
    ; Memory loop test.
    ; Repeatedly loads from the eight-entry test_memory table until the
    ; counter in r14 reaches the bdlti.l limit, then prints the counter.
    write	"blti, test memory"
.data
align 8
test_memory:
    d8	0
    d8	1
    d8	2
    d8	3
    d8	4
    d8	5
    d8	6
    d8	7
.text
    alloc	20
    ldafr	%r12, test_memory
    write	"test_memory: %x64(r12)"
    ; r11 = table index, r14 = iteration counter.
    ldi		%r11, 0
    ldi		%r14, 0
; NOTE(review): the meaning of "(32)" after the label is not evident from
; this file -- confirm against the assembler syntax.
memory_loop: (32)
    lddzx	%r13, %r12, %r11, 3, 0
    addi	%r11, %r11, 1
    addi	%r14, %r14, 1
    ; Wrap the index to 0..7 so it stays inside the eight d8 entries.
    andi	%r11, %r11, 7
; fast_check
    bdlti.l	%r14, 200000, memory_loop
    write	"counter: %i64(r14)"
.end
.text
    ; Compare-with-zero branch test.
    ; Every bd<cond> mnemonic is issued against %gz in short and .l form,
    ; once towards a forward label and once towards a backward label.
    ; The "bdeq %r2, %r2" below compares a register with itself, so control
    ; presumably leaves for compare_with_zero_test_exit on the first pass and
    ; the list after it is there for its encodings.
    alloc	20
    write	"test compare-with-zero-and-long-branch"
compare_with_zero_test_continue:
compare_with_zero_backward_target:
    addi	%r2, %r2, 1
    bdeq	%r2, %r2, compare_with_zero_test_exit

    bdeq	%r1, %gz, compare_with_zero_forward_target
    bdeq.l	%r1, %gz, compare_with_zero_forward_target
    bdeq	%r1, %gz, compare_with_zero_backward_target
    bdeq.l	%r1, %gz, compare_with_zero_backward_target
    bdne	%r1, %gz, compare_with_zero_forward_target
    bdne.l	%r1, %gz, compare_with_zero_forward_target
    bdne	%r1, %gz, compare_with_zero_backward_target
    bdne.l	%r1, %gz, compare_with_zero_backward_target
    bdlt	%r1, %gz, compare_with_zero_forward_target
    bdlt.l	%r1, %gz, compare_with_zero_forward_target
    bdlt	%r1, %gz, compare_with_zero_backward_target
    bdlt.l	%r1, %gz, compare_with_zero_backward_target
    bdle	%r1, %gz, compare_with_zero_forward_target
    bdle.l	%r1, %gz, compare_with_zero_forward_target
    bdle	%r1, %gz, compare_with_zero_backward_target
    bdle.l	%r1, %gz, compare_with_zero_backward_target
    bdgt	%r1, %gz, compare_with_zero_forward_target
    bdgt.l	%r1, %gz, compare_with_zero_forward_target
    bdgt	%r1, %gz, compare_with_zero_backward_target
    bdgt.l	%r1, %gz, compare_with_zero_backward_target
    bdge	%r1, %gz, compare_with_zero_forward_target
    bdge.l	%r1, %gz, compare_with_zero_forward_target
    bdge	%r1, %gz, compare_with_zero_backward_target
    bdge.l	%r1, %gz, compare_with_zero_backward_target

compare_with_zero_forward_target:
    jmp		compare_with_zero_test_continue
compare_with_zero_test_exit:
    write	"end test compare-with-zero-and-long-branch"
.end
.text
; Call test preamble: defines one label in the code section and one in
; .rodata (call_data_target is referenced by callplt later in this program),
; then jumps over the quadrat subroutine to callexample.

call_code_target:

.rodata
call_data_target:

.text
    jmp	callexample
;*****************************************************************
; Function  compute A**4 of parameter A, passed in register r33
;*****************************************************************
; Squares r33 in place twice (giving A**4) and prints r0, the rsc/psr
; special registers and a dump on the way. Ends with ret.
quadrat:
    write	"function quadrat entered: r0=%x128(r0)"
    alloc	93
    write	"rsc     %s(rsc)"
    write	"psr     %s(psr)"
    write	"rsc     %s(rsc)"
    mul	%r33, %r33, %r33	; r33 = A*A
    mul	%r33, %r33, %r33	; r33 = (A*A)*(A*A)
    write	"r0=%x128(r0) r33=%i64(r33)"
    write	"%m(dump)"
;	mtspr	%r45, psr
    write	"function quadrat exited"
    ret
end_quadrat:

;*****************************************************************
; Example of calling sequence with branch prediction
; Sets up r87..r90, calls quadrat with r86 as the link register, then prints
; ABI-related registers. The call forms after "jmp call_exit" are jumped
; over, so they appear to be assembled but not executed.
callexample:
    alloc	91
    ldi.l	%r90, 0x1234567890abcdef
    write	"arg3 %x64(r90)"
    srpi	%r89, %r90, %r90, 16
    write	"arg2 %x64(r89)"
    srpi	%r88, %r90, %r90, 16
    write	"arg1 %x64(r88)"
    ldi		%r87, 7		; setup arguments
;   write	"%m(dump)"
    write	"rsc: %s(rsc)"
    write	"function quadrat called"
    callr	%r86, quadrat
    write	"rsc: %s(rsc)"
; Rest instructions after return from subroutine
;*****************************************************************
.text	; return to code section

; Here we test registers used by ABI (application binary interface)
; Check loader.
    write	"sp=%x64(sp) tp=%x64(tp) r0=%x128(r0)"
    write	"rsc: %s(rsc)"
    write	"psr: %s(psr)"
    write	"r14: %x64(r14)"
    write	"reta: %i64(r72)"		; out return address
    write	"retv: %i64(r73)"		; out return value
    write	"rsc: %s(rsc)"
; NOTE(review): the next line is labelled "rsc" but prints psr -- confirm
; whether the label or the register is the intended one.
    write	"rsc: %s(psr)"
    ldi.l	%r11, 0x407d8bffffccccff
    write	"r11: %x64(r11)"
    addi.l	%r12, %r11, 0x400000
    write	"r12: %x64(r12)"
    xor		%r20, %r19, %r11
    addi.l	%r20, %r20, 0x400000
    ldi		%r10, 10
    ldi		%r11, 11
    cmpdlt	%r2, %r11, %r10
    write	"%i64(r11) %i64(r10)"
    jmp		call_exit

; Other call forms (callr, callri, callmi, callplt), placed after the jump.
    callr	%r42, quadrat
    callri	%r42, %r34, %gz
    callmi	%r42, %r34, 468
    callplt	%r42, call_data_target
    callri	%r42, %r34, %gz

call_exit:
    write	"end call test"

.end
.text
; Driver for the nested/recursive call test.
; Calls func with three arguments in r51..r53 (link register r50), then calls
; rekurs with r53 = 10 (link register r52), and finally jumps over both
; subroutine bodies to smallend.
    alloc	47
    write	"test recursive calls"
    ldi.l	%r46, 0x7FFFFFFFFFFFFFFF		; comment
    ldi.l	%r46, 0x8000000000000000
    addi	%r46, %r46, -1
    write	"%i64(r46)"

    mfspr	%r20, %rsc

    alloc	54		; extend frame to 54 regs
    ldi		%r48, 1		; 
    ldi		%r53, 3		; 1 arg (33+16)
    ldi		%r52, 2		; 2 arg (34+16)
    ldi		%r51, 1		; 3 arg (35+16)
    write	"rsc: %s(rsc)"
    callr	%r50, func	; call func subroutine, safe 50 regs
    write	"r51=%i64(r51) rsc=%s(rsc)"
    ldi		%r53, 10
    callr	%r52, rekurs
    write	"rsc: %s(rsc)"
    write	"rsp: %s(rsp)"
;   write	"%m(dump)"
    jmp	smallend
; func: prints r0 and the three argument registers r1..r3, overwrites r1
; with 12345 and returns. The caller prints its r51 after the call.
func:
; at entry point func subroutine has 4 regs in frame
    alloc	8   ; extend frame from 4 to 8 regs
    write	"r0      %x128(r0)"		; print packed caller frame and return address
    write	"r1=%i64(r1) r2=%i64(r2) r3=%i64(r3)" ; print args
    ldi		%r1, 12345
    ret

; rekurs: self-calling subroutine. Prints its frame info, puts r1 - 1 in r3,
; and calls itself with link register r2 unless r1 equals zero.
; Presumably r3 becomes the callee's r1 (registers renumber from the link
; register), so the depth counts down from the caller's initial value.
rekurs:
    alloc	4
    write	"r0=%x128(r0) r1=%i64(r1)"
    write	"rsc: %s(rsc)"
    write	"rsp: %s(rsp)"
    addi	%r3, %r1, -1
    ldi		%r2, 0
    bdeq	%r1, %r2, rekret	; stop recursing when r1 == 0
;	cneq	%r1, %r2, 1, 0
    callr	%r2, rekurs
rekret:
    write	"rsp: %s(rsp)"
    write	"r0: %x128(r0)"
    retf	0
; smallend: landing point of the driver's "jmp smallend"; issues two nops
; with different immediates, grows the frame and prints the closing banner.
smallend:
    nop		0
    nop		111
    alloc	96
    write	"end_call_recursive"
.end
.text
    ; Minimal call example: set up three arguments in r51..r53, call
    ; simple_func with r50 as the link register, then jump over its body.
    ; at the beginning of the program, the register stack is empty
    alloc	54   ; expand frame to 54 registers
    ehadj	simple_func_end
    ldi		%r47, 1  ; will be saved when called
    ldi		%r53, 3  ; first argument
    ldi		%r52, 2  ; second argument
    ldi		%r51, 1  ; third argument
    ; func procedure call, all registers up to 50 will be saved,
    ; return address, eip, frame size (50) are saved in r50
    callr	%r50, simple_func
    ; at this point, after returning, the frame is again 54 registers
    ; (the size set by "alloc 54" above)
    jmp		simple_func_end
; simple_func: prints the packed return info in r0 and the three argument
; registers r1..r3, then returns.
simple_func:
    ; at the starting point, the func procedure has a 4-register frame
    ; their previous numbers are 50, 51, 52, 53, new - 0, 1, 2, 3
    ; extend the frame to 10 registers (another 4,5,6,7,8,9)
    alloc	10
    write	"r0 = %x128(r0)"	; print packed return info
    write	"r1 = %i64(r1)"	; print 1st argument
    write	"r2 = %i64(r2)"	; print 2nd argument
    write	"r3 = %i64(r3)"	; print 3rd argument
    ret
simple_func_end:
    nop		123
.end
.text
; Carry/borrow example.
; Sets up two four-register operands, adds them register by register, prints
; r50..r54, then issues the addc/subb/addaddc/subsubb mnemonics without
; printing their results.
    write "example of carry/borrow testing"
    alloc	96

; 256-bit add (r30,r31,r34,r33) + (r40,r41,r42,r43) => (r50,r51,r52,r53)
; The third register of the first operand is r34 in the code below, not r32.
    ldi	%r30, -1
    ldi	%r31, -1
    ldi	%r34, -1
    ldi	%r33, -1

    ldi	%r40, 1
    ldi	%r41, 0
    ldi	%r42, 0
    ldi	%r43, 0

; throw add
; NOTE(review): the cmpdeq lines are commented as carry-out computations, but
; their results (r10, r12, r14, r8) are not consumed by the adds that follow;
; confirm the intended carry propagation.
    cmpdeq	%r10, %r30, %r40	; add carry out
    add		%r50, %r30, %r40	; add
    cmpdeqi	%r12, %r31, 1
    addi	%r51, %r31, 1

    cmpdeq	%r12, %r31, %r41	; add carry out
    add		%r51, %r31, %r41	; add
    cmpdeq	%r14, %r34, %r42	; add carry out
    add		%r52, %r34, %r42	; add
    cmpdeq	%r8, %r33, %r43	; add carry out
    add		%r53, %r33, %r43	; add
    write	"add carryis"
    addi	%r51, %r51, 1
    addi	%r52, %r52, 1
    addi	%r53, %r53, 1
; set last carry
; r54 is written twice in a row; the value left in it is 0.
    ldi		%r54, 1
    ldi		%r54, 0
    write	"multiprecision add:\nr50,r51,r52,r53,r54 = %x64(r50) %x64(r51) %x64(r52) %x64(r53) %x64(r54)"

    ldi.l	%r40, 0x7fffffffffffffff
    mulh	%r40, %r40, %r41
    write	"r40     %x64(r40)"

    ldi		%r12, 12345
    ldi.l	%r12, 12345678900

;	ldi	%r14, 0xFFFFFFFFF0
;	ld8	%r13, %r14, 0

; Carry/borrow mnemonics, issued without printing results.
    addc	%r12, %r14, %r46
    addc	%r12, %r14, %r46
    subb	%r12, %r14, %r46
    subb	%r12, %r14, %r46
    addaddc	%r12, %r14, %r46, %r23
    addaddc	%r12, %r14, %r46, %r22
    subsubb	%r12, %r14, %r46, %r13
    subsubb	%r12, %r14, %r46, %r14
    write	"end carry test"
    nop	11111
.end
.text
; Compare instruction test.
; Issues the cmpd* and cmpw* families in register, immediate (i) and long
; immediate (.l) forms. r20..r23 are loaded with small signed constants;
; no comparison result is printed.
    write	"test compare"
    alloc	96
    ldi		%r20, 4
    ldi		%r21, 3
    ldi		%r22, -4
    ldi		%r23, -12
    write	"test compare instructions"
    cmpdeq	%r12, %r20, %r21
    cmpdlt	%r12, %r20, %r21
    cmpdltu	%r12, %r20, %r21
    cmpdeqi	%r12, %r20, 123456
    cmpdlti	%r12, %r20, 123456
    cmpdltui	%r12, %r20, 123456
    cmpdne	%r12, %r20, %r21
    cmpdnei	%r12, %r20, 123456
    cmpdgti	%r12, %r20, 123456
    cmpdgtui	%r12, %r20, 123456
    cmpdle	%r12, %r20, %r21
    cmpdleu	%r12, %r20, %r21

    cmpdgei	%r12, %r20, 123456
    cmpdgeui	%r12, %r20, 123456
    cmpdlei	%r12, %r20, 123456
    cmpdleui	%r12, %r20, 123456

    cmpweq	%r12, %r20, %r21
    cmpwlt	%r12, %r20, %r21
    cmpwltu	%r12, %r20, %r21
    cmpweqi	%r12, %r20, 123456
    cmpwlti	%r12, %r20, 123456
    cmpwltui	%r12, %r20, 123456
    cmpwne	%r12, %r20, %r21
    cmpwnei	%r12, %r20, 123456
    cmpwgti	%r12, %r20, 123456
    cmpwgtui	%r12, %r20, 123456
    cmpwle	%r12, %r20, %r21
    cmpwleu	%r12, %r20, %r21

; The trailing comments in the two groups below are the original author's
; notes on what each alias stands for; they are kept as written.
    write	"compare aliases (pseudo-instructions)"
    cmpdgt	%r12, %r20, %r21	; cmplt	  r12, %r21, r20
    cmpdgtu	%r12, %r20, %r21	; cmpltu  r12, %r21, r20
    cmpdlti	%r12, %r20, 123456	; cmplti  r12, %r20, 12346
    cmpdltui	%r12, %r20, 123456	; cmpltui r12, %r20, 12346
    cmpdge	%r12, %r20, %r21	; cmpleq  r12, %r21, r20
    cmpdgeu	%r12, %r20, %r21	; cmpleu  r12, %r21, r20
    cmpdgti	%r12, %r20, 123456	; cmpgti  r12, %r20, 12346
    cmpdgtui	%r12, %r20, 123456	; cmpgtui r12, %r20, 12346


    cmpwgt	%r12, %r20, %r21	; cmplt4   r12, %r21, %r20
    cmpwgtu	%r12, %r20, %r21	; cmpltu4  r12, %r21, %r20
    cmpwlti	%r12, %r20, 123456	; cmplti4  r12, %r20, 12346
    cmpwltui	%r12, %r20, 123456	; cmpltui4 r12, %r20, 12346
    cmpwge	%r12, %r20, %r21	; cmpleq4  r12, %r21, r20
    cmpwgeu	%r12, %r20, %r21	; cmpleu4  r12, %r21, r20
    cmpwgti	%r12, %r20, 123456	; cmpgti4  r12, %r20, 12346
    cmpwgtui	%r12, %r20, 123456	; cmpgtui4 r12, %r20, 12346

; TESTS
; Short-immediate and long-immediate (.l) forms side by side.
    cmpdeq	%r14, %r12, %r45
    cmpdne	%r14, %r12, %r45

    cmpdeq	%r14, %r45, %r34
    cmpdeqi	%r14, %r45, 123
    cmpdeqi.l	%r14, %r45, 1234567890123
    cmpdlti	%r14, %r45, 123
    cmpdlti.l	%r14, %r45, 1234567890123
    cmpdlei	%r14, %r45, 123
    cmpdlei.l	%r14, %r45, 1234567890123
    cmpdlt	%r14, %r45, %r34
    cmpdgtui	%r14, %r45, 123
    cmpdgtui.l	%r14, %r45, 1234567890123
    cmpdgeui	%r14, %r45, 123
    cmpdgeui.l	%r14, %r45, 1234567890123
    cmpdgtu	%r14, %r45, %r34

    cmpdeq	%r41, %r34, %r56
    cmpdlt	%r66, %r45, %r57
    cmpdeqi	%r64, %r56, 0
.end
.text
; Conditional-branch encoding test.
; "bdeq %r2, %r2" compares a register with itself, so control presumably
; leaves for branch_test_exit on the first pass. The list after it issues
; each branch mnemonic (bd* and bw*, register and immediate forms) in short
; and .l encodings, against both a backward and a forward label.
backward_target:
    alloc	61
    addi	%r2, %r2, 1
    bdeq	%r2, %r2, branch_test_exit

    bdeq	%r23, %r34, backward_target
    bdeq.l	%r23, %r34, backward_target
    bdeq	%r23, %r34, forward_target
    bdeq.l	%r23, %r34, forward_target
    bdeqi	%r23,34, backward_target
    bdeqi.l	%r23,34, backward_target
    bdeqi	%r23,34, forward_target
    bdeqi.l	%r23,34, forward_target

    bweq	%r23, %r34, backward_target
    bweq.l	%r23, %r34, backward_target
    bweq	%r23, %r34, forward_target
    bweq.l	%r23, %r34, forward_target
    bweqi	%r23,34, backward_target
    bweqi.l	%r23,34, backward_target
    bweqi	%r23,34, forward_target
    bweqi.l	%r23,34, forward_target

    bdne	%r23, %r34, backward_target
    bdne.l	%r23, %r34, backward_target
    bdne	%r23, %r34, forward_target
    bdne.l	%r23, %r34, forward_target
    bdnei	%r23,34, backward_target
    bdnei.l	%r23,34, backward_target
    bdnei	%r23,34, forward_target
    bdnei.l	%r23,34, forward_target

    bwne	%r23, %r34, backward_target
    bwne.l	%r23, %r34, backward_target
    bwne	%r23, %r34, forward_target
    bwne.l	%r23, %r34, forward_target
    bwnei	%r23,34, backward_target
    bwnei.l	%r23,34, backward_target
    bwnei	%r23,34, forward_target
    bwnei.l	%r23,34, forward_target

    bdle	%r23, %r34, backward_target
    bdle.l	%r23, %r34, backward_target
    bdle	%r23, %r34, forward_target
    bdle.l	%r23, %r34, forward_target
    bdlei	%r23,34, backward_target
    bdlei.l	%r23,34, backward_target
    bdlei	%r23,34, forward_target
    bdlei.l	%r23,34, forward_target

    bwle	%r23, %r34, backward_target
    bwle.l	%r23, %r34, backward_target
    bwle	%r23, %r34, forward_target
    bwle.l	%r23, %r34, forward_target
    bwlei	%r23,34, backward_target
    bwlei.l	%r23,34, backward_target
    bwlei	%r23,34, forward_target
    bwlei.l	%r23,34, forward_target

    bdlt	%r23, %r34, backward_target
    bdlt.l	%r23, %r34, backward_target
    bdlt	%r23, %r34, forward_target
    bdlt.l	%r23, %r34, forward_target
    bdlti	%r23,34, backward_target
    bdlti.l	%r23,34, backward_target
    bdlti	%r23,34, forward_target
    bdlti.l	%r23,34, forward_target

    bwlt	%r23, %r34, backward_target
    bwlt.l	%r23, %r34, backward_target
    bwlt	%r23, %r34, forward_target
    bwlt.l	%r23, %r34, forward_target
    bwlti	%r23,34, backward_target
    bwlti.l	%r23,34, backward_target
    bwlti	%r23,34, forward_target
    bwlti.l	%r23,34, forward_target

    ; NOTE(review): the immediate lines of this group and the next use the
    ; unsigned mnemonics bdgeui/bwgeui, which are repeated in the bdgeu/bwgeu
    ; groups further down, while the sibling groups use the matching signed
    ; form (bdlei, bdlti, bdgti). Confirm whether bdgei/bwgei were intended.
    bdge	%r23, %r34, backward_target
    bdge.l	%r23, %r34, backward_target
    bdge	%r23, %r34, forward_target
    bdge.l	%r23, %r34, forward_target
    bdgeui	%r23,34, backward_target
    bdgeui.l	%r23,34, backward_target
    bdgeui	%r23,34, forward_target
    bdgeui.l	%r23,34, forward_target

    bwge	%r23, %r34, backward_target
    bwge.l	%r23, %r34, backward_target
    bwge	%r23, %r34, forward_target
    bwge.l	%r23, %r34, forward_target
    bwgeui	%r23,34, backward_target
    bwgeui.l	%r23,34, backward_target
    bwgeui	%r23,34, forward_target
    bwgeui.l	%r23,34, forward_target

    bdgt	%r23, %r34, backward_target
    bdgt.l	%r23, %r34, backward_target
    bdgt	%r23, %r34, forward_target
    bdgt.l	%r23, %r34, forward_target
    bdgti	%r23,34, backward_target
    bdgti.l	%r23,34, backward_target
    bdgti	%r23,34, forward_target
    bdgti.l	%r23,34, forward_target

    bwgt	%r23, %r34, backward_target
    bwgt.l	%r23, %r34, backward_target
    bwgt	%r23, %r34, forward_target
    bwgt.l	%r23, %r34, forward_target
    bwgti	%r23,34, backward_target
    bwgti.l	%r23,34, backward_target
    bwgti	%r23,34, forward_target
    bwgti.l	%r23,34, forward_target

    bdleu	%r23, %r34, backward_target
    bdleu.l	%r23, %r34, backward_target
    bdleu	%r23, %r34, forward_target
    bdleu.l	%r23, %r34, forward_target
    bdleui	%r23,34, backward_target
    bdleui.l	%r23,34, backward_target
    bdleui	%r23,34, forward_target
    bdleui.l	%r23,34, forward_target

    bwleu	%r23, %r34, backward_target
    bwleu.l	%r23, %r34, backward_target
    bwleu	%r23, %r34, forward_target
    bwleu.l	%r23, %r34, forward_target
    bwleui	%r23,34, backward_target
    bwleui.l	%r23,34, backward_target
    bwleui	%r23,34, forward_target
    bwleui.l	%r23,34, forward_target

    bdltu	%r23, %r34, backward_target
    bdltu.l	%r23, %r34, backward_target
    bdltu	%r23, %r34, forward_target
    bdltu.l	%r23, %r34, forward_target
    bdltui	%r23,34, backward_target
    bdltui.l	%r23,34, backward_target
    bdltui	%r23,34, forward_target
    bdltui.l	%r23,34, forward_target

    bwltu	%r23, %r34, backward_target
    bwltu.l	%r23, %r34, backward_target
    bwltu	%r23, %r34, forward_target
    bwltu.l	%r23, %r34, forward_target
    bwltui	%r23,34, backward_target
    bwltui.l	%r23,34, backward_target
    bwltui	%r23,34, forward_target
    bwltui.l	%r23,34, forward_target

    bdgeu	%r23, %r34, backward_target
    bdgeu.l	%r23, %r34, backward_target
    bdgeu	%r23, %r34, forward_target
    bdgeu.l	%r23, %r34, forward_target
    bdgeui	%r23,34, backward_target
    bdgeui.l	%r23,34, backward_target
    bdgeui	%r23,34, forward_target
    bdgeui.l	%r23,34, forward_target

    bwgeu	%r23, %r34, backward_target
    bwgeu.l	%r23, %r34, backward_target
    bwgeu	%r23, %r34, forward_target
    bwgeu.l	%r23, %r34, forward_target
    bwgeui	%r23,34, backward_target
    bwgeui.l	%r23,34, backward_target
    bwgeui	%r23,34, forward_target
    bwgeui.l	%r23,34, forward_target

    bdgtu	%r23, %r34, backward_target
    bdgtu.l	%r23, %r34, backward_target
    bdgtu	%r23, %r34, forward_target
    bdgtu.l	%r23, %r34, forward_target
    bdgtui	%r23, 34, backward_target
    bdgtui.l	%r23, 34, backward_target
    bdgtui	%r23, 34, forward_target
    bdgtui.l	%r23, 34, forward_target

    bwgtu	%r23, %r34, backward_target
    bwgtu.l	%r23, %r34, backward_target
    bwgtu	%r23, %r34, forward_target
    bwgtu.l	%r23, %r34, forward_target
    bwgtui	%r23, 34, backward_target
    bwgtui.l	%r23, 34, backward_target
    bwgtui	%r23, 34, forward_target
    bwgtui.l	%r23, 34, forward_target

    ; Mask branches (bm*): register, immediate, target.
    bmall	%r23, 34, backward_target
    bmall.l	%r23, 34, backward_target
    bmall	%r23, 34, forward_target
    bmall.l	%r23, 34, forward_target

    bmnotall	%r23, 34, backward_target
    bmnotall.l	%r23, 34, backward_target
    bmnotall	%r23, 34, forward_target
    bmnotall.l	%r23, 34, forward_target

    bmany	%r23, 34, backward_target
    bmany.l	%r23, 34, backward_target
    bmany	%r23, 34, forward_target
    bmany.l	%r23, 34, forward_target

    bmnone	%r23, 34, backward_target
    bmnone.l	%r23, 34, backward_target
    bmnone	%r23, 34, forward_target
    bmnone.l	%r23, 34, forward_target

forward_target:
branch_test_exit:

    ; Jump over the second batch of branch encodings to the closing banner.
    jmp		branch_exit

; Second batch of branch encodings, placed after "jmp branch_exit" so it
; appears to be assembled but not executed. Branches here target the nearby
; labels "label" and "qwe"; bbsi/bbci and jmpr are also covered.
label:
    bdeq	%r12, %r13, qwe
    srpi	%r10, %r11, %r12, 45
    depq	%r61, %r91, %r32, 10
    mbsel	%r62, %r91, %r32, %r10
    perm	%r63, %r91, %r32, %r10
qwe:
    bdne	%r15, %r46, label
    bdeq	%r25, %r45, label
    bdlt	%r25, %r44, label
    bdle	%r35, %r43, label
    bdgtu	%r35, %r42, label
    bdgeu	%r45, %r41, label
    bdgt	%r45, %r40, label
    bdltu	%r55, %r76, label
    bdnei	%r55, 140, label
    bdeqi	%r65, 141, label
    bdlti	%r65, 142, label
    bdgti	%r75, 143, label
    bdltui	%r75, 170, label
    bdgtui	%r85, 160, label

    addi.l	%r45, %r34, 1234
    bbsi	%r85, 26, label
    bbci.l	%r85, 36, label
    bbsi	%r95, 46, label
    bbci.l	%r95, 56, label

    jmpr	%r45, %r23, 1
branch_exit:
    write	"end branch test"
.end
.text
; Bit-test branch example (bbci / bbsi / bbc / bbs).
; r10 is zero, so "bbci %r10, 10" presumably skips straight to xxx_n; the
; instructions in between are only reached if that branch is not taken.
    alloc	61
    write	"Example of test bit and branch"
    ldi		%r19, 0x20
    ldi		%r20, 12+3
    write	"%i64(r20)"
    ldi		%r10, 0
    bbci	%r10, 10, xxx_n
    ldi.l	%r20, 123456789012345	; load immediate
    ldi		%r21, 321		; load immediate
    add		%r23, %r20, %r21	; add
; NOTE(review): this prints r43, while the sum above was written to r23
; (which the next line prints) -- confirm which register is intended.
    write	"%i64(r43)"
xxx_n:	write	"%i64(r23)"

; Immediate bit index (bbci/bbsi) and register bit index (bbc/bbs) forms,
; all targeting the exit label.
    ldi		%r46, 0xabcdef
    bbci	%r46, 56, branch_bit_exit
    bbsi	%r46, 56, branch_bit_exit
    ldi		%r56, 56
    bbc		%r46, %r56, branch_bit_exit
    bbs		%r46, %r56, branch_bit_exit

branch_bit_exit:
    write	"end branch_bit test"
.end
.text
    ; cpuid test: read entry 0 (printed as "cpuid len"), then print entries in a loop.
    write	"cpuid implemented number"
    alloc	96
    ldi		%r13, 0
    cpuid	%r14, %r13, 0
    write	"cpuid len %x64(r14)"
    write	"cpuid loop"
; r13 is the entry index, r14 the bound read above.
; repdlt presumably advances r13 and repeats while r13 < r14 - TODO confirm.
cpuid_loop:
    cpuid	%r15, %r13, 0
    write	"cpuid[%i64(r13)] = %x64(r15)"
    repdlt	%r13, %r14, cpuid_loop
.end
.rodata
; crc32c test: checksum of the 43-character pangram below, processed in
; 16-byte chunks. The expected value is printed first for comparison.
    align 16
crc32c_test_string:
    ascii	"The quick brown fox jumps over the lazy dog"
.text
    write	"crc32c = 0x22620404 (expected)"
    alloc	20
    ldi		%r12, -1  ; crc32c
    ldi		%r15, 43 ; length
    mov		%r14, %r15
    ldafr	%r11, crc32c_test_string
; Register roles inside the loop:
;   r11 = data pointer (ldqmia uses a step of 16 on it),
;   r13 = current 16-byte chunk, r14 = bytes remaining, r12 = running crc.
; r14 goes 43 -> 27 -> 11 -> -5, so the body runs three times, assuming
; bdgt is a signed "greater than" branch and %gz reads as zero - TODO confirm.
crc32c_loop:
    ldqmia	%r13, %r11, 16
    crc32c	%r12, %r12, %r13, %r14
    addi	%r14, %r14, -16
    bdgt	%r14, %gz, crc32c_loop
; final inversion of the accumulator (it was seeded with -1 above)
    xori	%r12, %r12, -1
    write	"crc32c = 0x%x32(r12) (computed)"
.end.text
    ; Addressing test: a few ldax/mov2 encodings, then label-relative access to .rodata bytes.
    alloc	61
; ldax operands look like: dest, base, index, scale, displacement - TODO confirm.
    ldax	%r41, %r40, %r12, 4, 112
    ldax	%r41, %r40, %r12, 3, -12
    ldax	%r41, %r40, %r12, 4, 112
    ldi.l	%r5, -1
; presumably a paired register move (here exchanging r3 and r4) - verify against the ISA.
    mov2	%r3, %r4, %r4, %r3
    mov2	%r3, %r4, %r4, %r3


.rodata	; open rodata (read-only data) section
    align	16
text_lbl:	; this is label
    d1	111		; signed byte
    d1	112
    d1	113
ddd:
    align	4		; force 4-byte alignment for next data
    d1	6
    d1	7
    d1	8+0x3D	; constant expressions are allowed in operands

.text
    write	"test addressing"

; Examples of IP-relative references.
; Print the low part, the high part and the full value of the label.
    ldi		%r45, text_lo(text_lbl)
    write	"text_lo(text_lbl)=%i64(r45)"
    ldi		%r45, text_hi(text_lbl)
    write	"text_hi(text_lbl)=%i64(r45)"
    ldi		%r45, text_lbl
    write	"%i64(r45)"

; Example of access to the data placed at 'text_lbl'.
; First get an IP-relative reference to it (+/- 64 MB from IP).
    ldar	%r45, text_lbl

; Now r45 holds the base address.
; It is NOT necessarily the true address of 'text_lbl':
; r45 holds the nearest lower address aligned on a 16-byte boundary.
; Remember to add the 'text_lo' part of the label address in each displacement.
    ldbz	%r50, %r45, text_lo(text_lbl)+0
    ldbz	%r51, %r45, text_lo(text_lbl)+1
    ldbz	%r52, %r45, text_lo(text_lbl)+2
    write	"%i64(r50)"	; must be 111
    write	"%i64(r51)"	; must be 112
    write	"%i64(r52)"	; must be 113

; Example of incorrect access (displacement without the text_lo part).
; NOTE(review): text_lbl directly follows 'align 16' above, so the base may
; coincide with the label and this may read 111 rather than 101 - confirm
; against the reference output.
    ldbz	%r50, %r45, 0
    write	"%i64(r50)" ; must be 101 - start of 16-byte portion
.end
.text
; Section-switching test: code, .data and .rodata fragments are interleaved,
; then a store/load round-trip through 'someplaceindata' and a few ALU
; results are printed, followed by a dump.
    alloc	96
    addi	%r20, %gz, 128
; reserve 32 bytes below the stack pointer and store one 8-byte value there
    addi	%sp, %sp, -32
    ldi.l	%r12, 0x07060504030201
    std		%r12, %sp,0

.data
    ascii	"data section marker"
    align	8
.rodata
    ascii	"rodata section marker"
    align	8

.data
    d2	1234
first_byte:
    d1	12
.text
    ldafr	%r22, first_byte

; test interval time mask
; (r22 loaded by ldafr above is overwritten here)
    ldi		%r22, 0xFFFFFFFFFFFFFFFF
    ldi		%r15, 11

.rodata	; open rodata (read-only data) section
    align	8
text_begin:	; this is label
    d8	1	; signed 8-bytes
    d8	-2
    d1	101	; signed byte
    d1	102
    d1	103
    align	4
    d4	10000	; signed 4byte
    d2	10000	; signed 2byte
    space	4		; insert zeroed bytes
    d2	20000
.data	; open data (read-write) section
    align	8
eexxx:	d8	12345678	; signed 8-byte
    d8	1234567890
ssxxx:	d8	123456789012
    d8	12345678901234
.rodata
    d4	4555		; signed 4-byte
    d2	4555		; signed 2-byte
    align	8
    d8	11
text2:
.text	; open code (read-execute) section

.data	; switch to data section
    d1	120
    align	2
    d2	13400
align 8
dataname:
    d4	654321890
    d4	654321890
    d8	1234545345345
    d8	6789023356977
align 8
; eight consecutive 8-byte slots holding 1..8; slot 3 is overwritten below
someplaceindata:
    d8	0x0000000000000001
    d8	0x0000000000000002
    d8	0x0000000000000003
    d8	0x0000000000000004
    d8	0x0000000000000005
    d8	0x0000000000000006
    d8	0x0000000000000007
    d8	0x0000000000000008
.text
    ldafr	%r11, someplaceindata
    ldi.l	%r15, 987777777777
    ldi		%r46, 100000
; store r46 into slot 3 (offset 8*3) and read it back into the same register
    std		%r46, %r11, 8*3
    lddz	%r46, %r11, 8*3
    write	"%i64(r46)"
    mul		%r18, %r15, %r46
; the add result in r17 is immediately replaced by the andn result
    add		%r17, %r15, %r46
    andn	%r17, %r15, %r46
    cmpdlt	%r12, %r17, %r15
    write	"%i64(r15) %i64(r46) %i64(r17)"
    addi	%r17, %r17, 22
    write	"%i64(r17) %i64(r17)"
; read the itc special register and print it
    mfspr	%r27, %itc
    write	"itc: %x64(r27)"
    write	"%m(dump)"
.end
.text
; Call test: three back-to-back callr instructions to trivial functions
; that each allocate a 10-register frame, print their name and return.
    ; at the beginning of the program, the register stack is empty
    alloc	54   ; expand frame to 54 registers
; point both eip and reip at the end label
    ldar	%r4, dense_call_test_end
    mtspr	%r4, %eip
    mtspr	%r4, %reip
    ldi		%r47, 1  ; will be saved when called
    ldi		%r53, 3  ; first argument
    ldi		%r52, 2  ; second argument
    ldi		%r51, 1  ; third argument
    ; func procedure call, all registers up to 50 will be saved,
    ; return address, eip, frame size (50) are saved in r50
    ; NOTE(review): the comment above mentions r50 / 50 registers, but the
    ; three calls below use r48, r50 and r52 as the link register - confirm
    ; which registers are preserved for each call.
check_label:
    callr	%r48, simple_func_1
    callr	%r50, simple_func_2
    callr	%r52, simple_func_3
    
    jmp	dense_call_test_end

; Callee 1: new 10-register frame, print, return.
simple_func_1:
    alloc  10
    write  "simple_func_1"
    ret

; Callee 2: same shape as simple_func_1.
simple_func_2:
    alloc  10
    write  "simple_func_2"
    ret

; Callee 3: same shape as simple_func_1.
simple_func_3:
    alloc  10
    write  "simple_func_3"
    ret

; Landing pad: six nops carrying the immediate 123.
dense_call_test_end:
    nop	123
    nop	123
    nop	123
    nop	123
    nop	123
    nop	123
.end
.text
; Bit-field deposit test: dep on two 64-bit patterns with the operands in
; both orders, then the 128-bit depq form.
    write	"test bit-field insert (deposit)"
    alloc	96
    ldi.l	%r30, 0xaaaaaaaaaaaaaaaa
    ldi.l	%r40, 0xeeeeeeeeeeeeeeee
; same immediates (48, 24), source operands swapped between the two cases
    dep		%r20, %r30, %r40, 48, 24
    write	"dep: %x64(r20)"
    dep		%r20, %r40, %r30, 48, 24
    write	"dep: %x64(r20)"

    write	"test vector deposit (dep16)"
; r4 is not written in this program before use; nor of r4 with itself
; presumably gives its bitwise complement in r3 - TODO confirm.
    nor		%r3, %r4, %r4
    depq	%r5, %r3, %r4, 100
    write	"dep16: %x128(r5)"
    write	"end deposit test"
.end

.text
; Device-control test: reads, writes and re-reads memory-mapped registers at
; DEVICE_CONFIG_VIRT_BASE (DID, CMD), reads the array address/length
; registers, then writes a second value to CMD.
    write	"test control device memory-mapped registers"
    alloc	96

    ; device_control base address
    ldi.l	%r24, DEVICE_CONFIG_VIRT_BASE

    write	"test pci"

; r21 = value stored to the registers, r20 = value read back
    ldi.l	%r21, 0x1234567890abcdef

    lddz	%r20, %r24, DEVICE_CONTROL_DID
    write	"mem[DEVICE_CONTROL_DID] %x64(r20)"
    std		%r21, %r24, DEVICE_CONTROL_DID
    lddz	%r20, %r24, DEVICE_CONTROL_DID
    write	"mem[DEVICE_CONTROL_DID] %x64(r20)"

    lddz	%r20, %r24, DEVICE_CONTROL_CMD
    write	"mem[DEVICE_CONTROL_CMD] %x64(r20)"
    std		%r21, %r24, DEVICE_CONTROL_CMD
    lddz	%r20, %r24, DEVICE_CONTROL_CMD
    write	"mem[DEVICE_CONTROL_CMD] %x64(r20)"

; NOTE(review): unlike the other messages, the string below has no format
; specifier in front of (r20), so the value is probably not printed - confirm.
    lddz	%r20, %r24, DEVICE_CONTROL_ARRAY_ADDRESS
    write	"mem[DEVICE_CONTROL_ARRAY_ADDRESS] (r20)"

    lddz	%r20, %r24, DEVICE_CONTROL_ARRAY_LEN
    write	"mem[DEVICE_CONTROL_ARRAY_LEN] %i64(r20)"

; NOTE(review): the immediate is written as backslash-n; presumably an
; escape/character constant - confirm how the assembler evaluates it.
    ldi	%r22, \n

    write	"test command"
    ldi.l	%r21, 0xabcdef1234567890
    std		%r21, %r24, DEVICE_CONTROL_CMD

    write	"end_device_control_test"
.end

.text
; Memory-map test: prints the device-config constants, stores to the core
; TIMECMP register, reads the first ROM doubleword, issues two video
; commands, reads width/height, then fills video memory byte by byte.
    write	"test core mapping DEVICE_CONFIG_VIRT_BASE"
    alloc	96
    ldi.l	%r20, DEVICE_CONFIG_VIRT_BASE
    write	"DEVICE_CONFIG_VIRT_BASE: %x64(r20)"
    ldi.l	%r20, DEVICE_CONFIG_SPACE_SIZE
    write	"DEVICE_CONFIG_SPACE_SIZE: %x64(r20)"
    ldi.l	%r20, CONFIG_OFFSET_CORE_0
    write	"CONFIG_OFFSET_CORE_0: %x64(r20)"
    ldi.l	%r20, DEVICE_CORE_TIMECMP
    write	"DEVICE_CORE_TIMECMP: %x64(r20)"

    ldi.l	%r20, DEVICE_CONFIG_VIRT_BASE + CONFIG_OFFSET_CORE_0 * DEVICE_CONFIG_SPACE_SIZE ; core config
    ldi		%r19, 0xabcdef

    write	"test interrupt vector %x64(r20)"
    std		%r19, %r20, DEVICE_CORE_TIMECMP ; use DEVICE_CORE_INTERRUPT_VECTOR in place of DEVICE_CORE_TIMECMP for real interrupt

    write	"test timecmp"
    std		%r19, %r20, DEVICE_CORE_TIMECMP

    write	"test rom mapping ROM_VIRT_BASE"
    ldi.l	%r20, ROM_VIRT_BASE
    lddz	%r19, %r20, 0
    write	"mem[ROM_VIRT_BASE] %x64(r19)"

    write	"test video commands VIDEO_COMMAND_VIRT_BASE"
    ldi.l	%r20, VIDEO_COMMAND_VIRT_BASE
    ldi		%r21, 0x1234
    stw		%r21, %r20, 0x88	; clear
    stw		%r21, %r20, 0x8c	; redraw

    write	"video width/height base: %x64(r20)"
    ldwz	%r21, %r20, 0x80 ; width
    ldwz	%r22, %r20, 0x84 ; height
    write	"width=%i64(r21) heigth=%i64(r22)"

    write	"test video memory VIDEO_VIRT_BASE"
    ldi.l	%r20, VIDEO_VIRT_BASE
    write	"r20     %x64(r20)"

    ldi.l	%r25, 0x12345678
    stw		%r25, %r20, 0

; Nested fill loop: r24 = y (bounded by r22 = height), r23 = x (bounded by
; r21 = width), r20 = running byte pointer, r25 = value stored.
; The "(64)" after loop_y is presumably an alignment request - TODO confirm.
    ldi		%r24, 0   ; y
loop_y: (64)
;	write	"%i64(r24)"
    ldi	%r23, 0   ; x
loop_x:
;	add	%r25, %r23, %r24
    stb		%r25, %r20, 0
    addi	%r20, %r20, 1
    addi	%r23, %r23, 1
    bdlt	%r23, %r21, loop_x

    addi	%r24, %r24,1
    bdlt	%r24, %r22, loop_y
    ; debug
    write	"end test video memory"
    nop		1234567
.end
.text
; Exception-handling test: installs 'catch' as the handler address in eip,
; registers two "destructor" landing pads with ehadj, throws with ehthrow,
; and expects the pads to run in reverse order (2, then 1) before 'catch'.
; The ordering is inferred from the ehnext targets below - TODO confirm.
    write	"begin exception test"
    alloc	96

    ldafr	%r2, catch
    mtspr	%r2, %eip

; constructor 1
    ldi		%r4, 1
    ehadj	call_destructor_1
    write	"eip: %s(eip)"
; constructor 2
    ldi		%r5, 2
    ehadj	call_destructor_2
    write	"eip: %s(eip)"

    ldi		%r3, 0xFFFFFFFFFFFF1230
    ehthrow	%r3, 0    ; set eca, jump to eip
    write	"normal execution (never occurs)"

; Landing pad 2: ehcatch takes a register (r6) and the label ending the
; destructor body; ehnext then chains to landing pad 1.
call_destructor_2:
    write	"call_destructor_2"
    ehcatch	%r6, end_destructor_2
    ; here dtor called
    ldi		%r4, 0
end_destructor_2:
    ehnext	%r6, call_destructor_1
    write	"normal continue after destructor_2"

; Landing pad 1: same shape, chaining to the final handler 'catch'.
call_destructor_1:
    write	"call_destructor_1"
    ehcatch	%r6, end_destructor_1
    ; here dtor called
    ldi		%r5, 0
end_destructor_1:
    ehnext	%r6, catch
    write	"normal continue after destructor_1"

call_ret:
    write	"normal exit"
    jmp		exception_exit

; Final handler: ehcatch takes r12 (printed below as the exception context).
catch:
    write	"caught exception, exit"
    ehcatch	%r12, exception_exit
    write	"caught exception context: r12=%x64(r12)"
exception_exit:
    nop		1234567
    nop		7654321
.end
.text
; floating-point extension example
; First part: float128/float64 immediate loads and writes of the values
; 3, 2, 1, 0 into the fpcr special register (the banners call this field "rm").
    alloc	96

    write	"test float128 immediate load (low/high parts)"
    fldqri	%r12, 3.1415926115461431423612436243
    write	"fldqri: %f128(r12)"

    write	"test fpcr modification (rm=3)"
    ldi		%r2, 3
    mtspr	%r2, %fpcr
    write	"fpcr: %s(fpcr)"
    write	"test fpcr modification (rm=2)"
    ldi		%r2, 2
    mtspr	%r2, %fpcr
    write	"fpcr: %s(fpcr)"
    write	"test fpcr modification (rm=1)"
    ldi		%r2, 1
    mtspr	%r2, %fpcr
    write	"fpcr: %s(fpcr)"
    write	"test fpcr modification (rm=0)"
    ldi		%r2, 0
    mtspr	%r2, %fpcr
    write	"fpcr: %s(fpcr)"

; The same decimal literal is loaded as a quad (fldqri) and as a double (flddi)
; and both are printed in hex for comparison; the first fldqri also carries a
; decimal exponent of +400.
    write	"compare fldqri (full mantissa) & long fldi (63-bit mantissa)"
    fldqri	%r30, 3.14159265358979323846123456789012e+400
    write	"fldqri: %x128(r30) %f128(r30)"
    flddi	%r31, 3.14159265358979323846123456789012
    write	"flddi: %x128(r31) %f64(r31)"
    write	"compare fldqri (full mantissa) & short fldi (21-bit mantissa)"
    fldqri	%r30, 3.14159265358979323846123456789012
    write	"r30     %x128(r30)"
    flddi	%r31, 3.14159265358979323846123456789012
    write	"r31     %x128(r31)"
    write	"before1"
    write	"r30     %f128(r30)"
    write	"before2"
    write	"r31     %vf64(r31)"
    write	"after"
; negative literal with an explicit exponent; the result is not printed
    flddi	%r30, -12.3456789e+04
.rodata
; Three float64 constants; the code below loads them, extends them to
; float128 and uses them as operands for the scalar-quad arithmetic tests.
    align	16
float64data:
    double	1.234567890123456789124141241241
    double	3.1415925678888734535345231234564561
    double	3.4566345634563456346535463463456
.text
    ldar	%r21, float64data
    lddz	%r11, %r21, 8*0
    lddz	%r12, %r21, 8*1
    lddz	%r13, %r21, 8*2
    write	"ld8(f64): %f64(r11) %f64(r12) %f64(r13)"
    fldqri	%r14, 2.7182818289201
    write	"fldqri: %f128(r14)"

; extend the three doubles in place (names suggest double -> quad - TODO confirm)
    fextsd2sq	%r11, %r11
    fextsd2sq	%r12, %r12
    fextsd2sq	%r13, %r13

; Two-operand operations on r11 and r14; each result goes to r15 and is printed.
    write	"test binary"
    fmulsq	%r15, %r11, %r14
    write	"fmulsq:  %f128(r15)"
    fnmulsq	%r15, %r11, %r14
    write	"fnmulsq: %f128(r15)"
    faddsq	%r15, %r11, %r14
    write	"faddsq:  %f128(r15)"
    fnaddsq	%r15, %r11, %r14
    write	"fnaddsq: %f128(r15)"
    fsubsq	%r15, %r14, %r11
    write	"fsubsq:  %f128(r15)"
    fdivsq	%r15, %r14, %r11
    write	"fdivsq:  %f128(r15)"

; Three-operand fused operations on r14, r11, r12.
    write	"test fused fma"
;   jmp	skipfma
    fmaddsq	%r15, %r14, %r11, %r12
    write	"fmaddsq:  %f128(r15)"
    fnmaddsq %r15, %r14, %r11, %r12
    write	"fnmaddsq: %f128(r15)"
    fmsubsq	%r15, %r14, %r11, %r12
    write	"fmsubsq:  %f128(r15)"
    fnmsubsq %r15, %r14, %r11, %r12
    write	"fnmsubsq: %f128(r15)"

; One-operand operations; r15 still holds the last fnmsubsq result.
    write	"test unary"
    mov		%r16, %r15
    write	"r16     %f128(r16)"
    fabssq	%r16, %r15
    write	"r16     %f128(r16)"
    fnegsq	%r16, %r15
    write	"r16     %f128(r16)"
    fnabssq	%r16, %r15
    write	"r16     %f128(r16)"
    fsqrtsq	%r16, %r12
    write	"r16     %f128(r16)"
    frsqrtsq	%r16, %r12
    write	"r16     %f128(r16)"

; frndsq on r12 with mode immediates 4, 3, 2, 0, then float<->integer conversions.
    write	"test rounding"
    frndsq	%r17, %r12, 4
    write	"r17     %f128(r17)"
    frndsq	%r17, %r12, 3
    write	"r17     %f128(r17)"
    frndsq	%r17, %r12, 2
    write	"r17     %f128(r17)"
    frndsq	%r17, %r12, 0
    write	"r17     %f128(r17)"
    fcvtsq2iw	%r17, %r12,0
    write	"r17     %i64(r17)"
; NOTE(review): r17 is loaded with 123456 but the conversion below reads r7,
; which is not written in this program - confirm whether %r17 was intended.
    ldi		%r17, 123456
    fcvtiw2sq	%r17, %r7,0
    write	"r17     %f128(r17)"

    write	"test fp minmax"
    fmaxsq	%r8, %r11, %r12
    write	"r8      %f128(r8)"
    fminsq	%r8, %r11, %r12
    write	"r8      %f128(r8)"
    write	"test fp abs minmax"
    famaxsq	%r8, %r11, %r12
    write	"r8      %f128(r8)"
    faminsq	%r8, %r11, %r12
    write	"r8      %f128(r8)"

; fmergesq with two different operand orders
    write	"test fmergesq"
    fmergesq	%r8, %r11, %r12, %r14
    write	"r8      %f128(r8)"
    fmergesq	%r8, %r14, %r11, %r12
    write	"r8      %f128(r8)"


.rodata
; Two more float64 constants, followed by a counted fma loop, a block of
; encoding-coverage instructions whose results are never printed, and the
; tri_repeat loop.
    align	16
xxxd:	double	1.122
    double	0.9999765432
.text
    ldar	%r21, xxxd
    ldi		%r15, 100
    lddz	%r25, %r21, 8*0
    lddz	%r26, %r21, 8*1
    fsubsq	%r22, %r25, %r16
    write	"r22     %f128(r22)"
; r15 (initially 100) is the counter tested against %gz by repdge;
; each pass applies fmaddsq and then fmsubsq to r22.
xxloop:
    fmaddsq	%r22, %r25, %r16, %r22
    fmsubsq	%r22, %r25, %r16, %r22
    repdge	%r15, %gz, xxloop
    write	"r22     %f128(r22)"

; From here to tri_repeat nothing is printed: the instructions use
; assorted register numbers that are mostly not initialised in this program.
    write	"other FPU"
    fmaddsq  %r60, %r61, %r62, %r63
    fmsubsq  %r61, %r61, %r72, %r73
    fnmaddsq %r62, %r71, %r82, %r63
    fnmsubsq %r63, %r81, %r12, %r53

    fmulsq	%r64, %r61, %r22
    fdivsq	%r65, %r11, %r27
    faddsq	%r66, %r17, %r42
    fsubsq	%r67, %r31, %r23
    fnaddsq	%r68, %r41, %r62
    fmaxsq	%r60, %r61, %r62
    fminsq	%r60, %r61, %r62
    famaxsq	%r60, %r61, %r62
    faminsq	%r60, %r61, %r62

    fcmpsqolt	%r10, %r61, %r72
    fcmpsqole	%r11, %r52, %r21
    fcmpsqole	%r12, %r43, %r12
    fcmpsqoeq	%r10, %r34, %r44
    fcmpsqueq	%r13, %r25, %r22
    fcmpsqule	%r12, %r15, %r23
    fcmpsquo	%r11, %r86, %r86

    fnegsq	%r24, %r58
    fabsdsq	%r45, %r61, %r20
    fnabsdsq	%r56, %r32, %r20
    frndsq	%r78, %r74,2
    frndsq	%r89, %r65,3
    frndsq	%r81, %r76,0
    frndsq	%r62, %r67,1
    fsqrtsq	%r63, %r78
    frsqrtsq %r64, %r69

; r45 = sp - 4800 is the base for the loads/stores below; r13 = 2 is the index
; register of the indexed forms.
    addi	%r45, %sp,-4800
    ldi		%r13, 2

    ldwz	%r12, %r45, 4*1
    stw		%r12, %r45, 4*1
    lddz	%r12, %r45, 8*3
    std		%r12, %r45, 8*3
    ldwzx	%r12, %r45, %r13, 2, 200
    stwx	%r12, %r45, %r13, 2, 200
    lddzx	%r12, %r45, %r13, 3, 200
    stdx	%r12, %r45, %r13, 3, 200

    faddsq	%r23, %r24, %r25
    fmaddsq	%r23, %r60, %r55, %r33
    fmulsq	%r23, %r60, %r55
    lddz	%r60, %r45, 8*6
    fmaddsq	%r23, %r60, %r55, %r33
    fmaddsq	%r24, %r61, %r25, %r32
    fmaddsq	%r25, %r62, %r55, %r23
    fmaddsq	%r26, %r63, %r75, %r73
    fmaddsq	%r27, %r64, %r75, %r73
    fmaddsq	%r28, %r65, %r85, %r63
    fmaddsq	%r29, %r66, %r85, %r63
    fmaddsq	%r30, %r67, %r55, %r23
    fmaddsq	%r31, %r68, %r55, %r23
    fmaddsq	%r12, %r32, %r76, %r85
    fmaddsq	%r12, %r32, %r76, %r85
    fmaddsq	%r10, %r32, %r76, %r85
    fmaddsq	%r10, %r32, %r76, %r85
    fmaddsq	%r10, %r32, %r76, %r85
    fmaddsq	%r13, %r32, %r76, %r85
    fmaddsq	%r14, %r32, %r76, %r85
    fmaddsq	%r15, %r32, %r76, %r85
    fmaddsq	%r16, %r32, %r76, %r85
    fmaddsq	%r17, %r32, %r76, %r85

    fcvtsq2iw	%r56, %r45, 0
    fcvtsq2uw	%r56, %r45, 0
    fcvtiw2sq	%r45, %r56, 0
    fcvtuw2sq	%r45, %r56, 0

; tri_repeat: r4 = 1.0 (step), r5 = running n, r6 = running product,
; r7 = r4 / r6, printed at the top of each pass.
; NOTE(review): the loop bound r24 is the integer 128 while r5 holds a
; float128 value - confirm how repdle.l compares them.
    ldi		%r5, 0
    fldqri	%r4, 1.0
    fldqri	%r5, 1.0
    fldqri	%r6, 1.0
    fldqri	%r7, 1.0
    ldi		%r24, 128
tri_repeat:
    write	"r7      %x128(r7)"
    faddsq	%r5, %r5, %r4
    fmulsq	%r6, %r6, %r5
    fdivsq	%r7, %r4, %r6
;   write "%x128(r6)"
    repdle.l %r5, %r24, tri_repeat
    ; Horner-style fma chains labelled as Taylor series for sin, cos and exp of x = 0.44567.
    ; Each section first prints a reference value computed by the assembler
    ; (sin(...), cos(...), exp(...) in the operand), then the chain result.
    ; NOTE(review): the existing comments treat r25, r23, ... as 1/25!, 1/23!, ...
    ; but no instruction in this block loads such constants into those
    ; registers, so the printed chain results need not match the reference
    ; values - confirm against the expected output.
    write	"test taylor series"
    fldqri	%r2, 0.44567	; f2 ,  x
    write	"x:   %f128(r2)"		; test value
    write	"test sin(x)"
    fldqri	%r5, sin(0.44567)
    write	"sin: %f128(r5)"		; test value
    ldi		%r3, 0		; s ,  0
    fmulsq	%r4, %r2, %r2	; f4 ,  x*x
    fmaddsq	%r3, %r3, %r4, %r25	; s ,  s * x*x + 1/25!
    fmsubsq	%r3, %r3, %r4, %r23	; s ,  s * x*x - 1/23!
    fmaddsq	%r3, %r3, %r4, %r21
    fmsubsq	%r3, %r3, %r4, %r19
    fmaddsq	%r3, %r3, %r4, %r17
    fmsubsq	%r3, %r3, %r4, %r15
    fmaddsq	%r3, %r3, %r4, %r13
    fmsubsq	%r3, %r3, %r4, %r11
    fmaddsq	%r3, %r3, %r4, %r9
    fmsubsq	%r3, %r3, %r4, %r7
    fmaddsq	%r3, %r3, %r4, %r5
    fmsubsq	%r3, %r3, %r4, %r3
    fmaddsq	%r3, %r3, %r4, %r1
    fmulsq	%r3, %r3, %r2	; s ,  s * x
    write	"sin: %f128(r3)"

; cos: same shape with the even-numbered registers, no final multiply by x
    write	"test cos(x)"
    fldqri	%r5, cos(0.44567)
    write	"cos: %f128(r5)"		; test value
    ldi		%r3, 0		; s ,  0
    fmulsq	%r4, %r2, %r2	; f4 ,  x*x
    fmsubsq	%r3, %r3, %r4, %r26
    fmaddsq	%r3, %r3, %r4, %r24
    fmsubsq	%r3, %r3, %r4, %r22
    fmaddsq	%r3, %r3, %r4, %r20
    fmsubsq	%r3, %r3, %r4, %r18
    fmaddsq	%r3, %r3, %r4, %r16
    fmsubsq	%r3, %r3, %r4, %r14
    fmaddsq	%r3, %r3, %r4, %r12
    fmsubsq	%r3, %r3, %r4, %r10
    fmaddsq	%r3, %r3, %r4, %r8
    fmsubsq	%r3, %r3, %r4, %r6
    fmaddsq	%r3, %r3, %r4, %r4
    fmsubsq	%r3, %r3, %r4, %r2
    fmaddsq	%r3, %r3, %r4, %r1
    write	"cos: %f128(r3)"

; exp: the argument is scaled by 0.125 before the chain and the result is
; squared three times afterwards (the existing comment gives (e^x)^8).
    write	"test exp(x)"
    fldqri	%r5, exp(0.44567)
    write	"exp: %f128(r5)"	; test value
    ldi		%r3, 0			; s ,  0.0
    mov		%r4, %r2		; f4 ,  x
    flddi	%r6, 0.125
;   write	"%f128(r6)"
    fmulsq	%r4, %r4, %r6	; x ,  x/8
    fmaddsq	%r3, %r3, %r4, %r15
    fmaddsq	%r3, %r3, %r4, %r14
    fmaddsq	%r3, %r3, %r4, %r13
    fmaddsq	%r3, %r3, %r4, %r12
    fmaddsq	%r3, %r3, %r4, %r11
    fmaddsq	%r3, %r3, %r4, %r10
    fmaddsq	%r3, %r3, %r4, %r9
    fmaddsq	%r3, %r3, %r4, %r8
    fmaddsq	%r3, %r3, %r4, %r7
    fmaddsq	%r3, %r3, %r4, %r6
    fmaddsq	%r3, %r3, %r4, %r5
    fmaddsq	%r3, %r3, %r4, %r4
    fmaddsq	%r3, %r3, %r4, %r3
    fmaddsq	%r3, %r3, %r4, %r2
    fmaddsq	%r3, %r3, %r4, %r1
    fmaddsq	%r3, %r3, %r4, %r1
    fmulsq	%r3, %r3, %r3	; (e^x) ^ 8
    fmulsq	%r3, %r3, %r3
    fmulsq	%r3, %r3, %r3
    write	"exp: %f128(r3)"

    faddsq	%r1, %r2, %r3
    fmaddsq	%r2, %r10, %r20, %r30
    fmaddsq	%r1, %r11, %r21, %r31

    ; classification
    ; (single, double and quad forms with the same immediate 120)
    fclss	%r4, %r5, 120
    fclsd	%r4, %r5, 120
    fclsq	%r4, %r5, 120
; jump over the branch/nullification encodings that follow, to the final banner
    jmp		skipfma

; Encoding coverage for floating-point compare-and-branch instructions.
; Each mnemonic is assembled four times: short and long (.l) form, to a
; backward and to a forward label. The preceding 'jmp skipfma' jumps over
; this region, so it is assembled but not reached by fall-through.
; Mnemonic suffixes (oeq/ueq/one/une/olt/ult/ole/ule/o/uo) presumably denote
; ordered/unordered comparison predicates - TODO confirm against the ISA.
fpu_backward_target:
; single branches
    bfssoeq	%r23, %r34, fpu_backward_target
    bfssoeq.l	%r23, %r34, fpu_backward_target
    bfssoeq	%r23, %r34, fpu_forward_target
    bfssoeq.l	%r23, %r34, fpu_forward_target

    bfssueq	%r23, %r34, fpu_backward_target
    bfssueq.l	%r23, %r34, fpu_backward_target
    bfssueq	%r23, %r34, fpu_forward_target
    bfssueq.l	%r23, %r34, fpu_forward_target

    bfssone	%r23, %r34, fpu_backward_target
    bfssone.l	%r23, %r34, fpu_backward_target
    bfssone	%r23, %r34, fpu_forward_target
    bfssone.l	%r23, %r34, fpu_forward_target

    bfssune	%r23, %r34, fpu_backward_target
    bfssune.l	%r23, %r34, fpu_backward_target
    bfssune	%r23, %r34, fpu_forward_target
    bfssune.l	%r23, %r34, fpu_forward_target

    bfssolt	%r23, %r34, fpu_backward_target
    bfssolt.l	%r23, %r34, fpu_backward_target
    bfssolt	%r23, %r34, fpu_forward_target
    bfssolt.l	%r23, %r34, fpu_forward_target

    bfssult	%r23, %r34, fpu_backward_target
    bfssult.l	%r23, %r34, fpu_backward_target
    bfssult	%r23, %r34, fpu_forward_target
    bfssult.l	%r23, %r34, fpu_forward_target

    bfssole	%r23, %r34, fpu_backward_target
    bfssole.l	%r23, %r34, fpu_backward_target
    bfssole	%r23, %r34, fpu_forward_target
    bfssole.l	%r23, %r34, fpu_forward_target

    bfssule	%r23, %r34, fpu_backward_target
    bfssule.l	%r23, %r34, fpu_backward_target
    bfssule	%r23, %r34, fpu_forward_target
    bfssule.l	%r23, %r34, fpu_forward_target

    bfsso	%r23, %r34, fpu_backward_target
    bfsso.l	%r23, %r34, fpu_backward_target
    bfsso	%r23, %r34, fpu_forward_target
    bfsso.l	%r23, %r34, fpu_forward_target

    bfssuo	%r23, %r34, fpu_backward_target
    bfssuo.l	%r23, %r34, fpu_backward_target
    bfssuo	%r23, %r34, fpu_forward_target
    bfssuo.l	%r23, %r34, fpu_forward_target

; the class form takes an immediate (34) instead of a second register
    bfssclass	%r23, 34, fpu_backward_target
    bfssclass.l	%r23, 34, fpu_backward_target
    bfssclass	%r23, 34, fpu_forward_target
    bfssclass.l	%r23, 34, fpu_forward_target

; double branches
    bfsdoeq	%r23, %r34, fpu_backward_target
    bfsdoeq.l	%r23, %r34, fpu_backward_target
    bfsdoeq	%r23, %r34, fpu_forward_target
    bfsdoeq.l	%r23, %r34, fpu_forward_target

    bfsdueq	%r23, %r34, fpu_backward_target
    bfsdueq.l	%r23, %r34, fpu_backward_target
    bfsdueq	%r23, %r34, fpu_forward_target
    bfsdueq.l	%r23, %r34, fpu_forward_target

    bfsdone	%r23, %r34, fpu_backward_target
    bfsdone.l	%r23, %r34, fpu_backward_target
    bfsdone	%r23, %r34, fpu_forward_target
    bfsdone.l	%r23, %r34, fpu_forward_target

    bfsdune	%r23, %r34, fpu_backward_target
    bfsdune.l	%r23, %r34, fpu_backward_target
    bfsdune	%r23, %r34, fpu_forward_target
    bfsdune.l	%r23, %r34, fpu_forward_target

    bfsdolt	%r23, %r34, fpu_backward_target
    bfsdolt.l	%r23, %r34, fpu_backward_target
    bfsdolt	%r23, %r34, fpu_forward_target
    bfsdolt.l	%r23, %r34, fpu_forward_target

    bfsdult	%r23, %r34, fpu_backward_target
    bfsdult.l	%r23, %r34, fpu_backward_target
    bfsdult	%r23, %r34, fpu_forward_target
    bfsdult.l	%r23, %r34, fpu_forward_target

    bfsdole	%r23, %r34, fpu_backward_target
    bfsdole.l	%r23, %r34, fpu_backward_target
    bfsdole	%r23, %r34, fpu_forward_target
    bfsdole.l	%r23, %r34, fpu_forward_target

    bfsdule	%r23, %r34, fpu_backward_target
    bfsdule.l	%r23, %r34, fpu_backward_target
    bfsdule	%r23, %r34, fpu_forward_target
    bfsdule.l	%r23, %r34, fpu_forward_target

    bfsdo	%r23, %r34, fpu_backward_target
    bfsdo.l	%r23, %r34, fpu_backward_target
    bfsdo	%r23, %r34, fpu_forward_target
    bfsdo.l	%r23, %r34, fpu_forward_target

    bfsduo	%r23, %r34, fpu_backward_target
    bfsduo.l	%r23, %r34, fpu_backward_target
    bfsduo	%r23, %r34, fpu_forward_target
    bfsduo.l	%r23, %r34, fpu_forward_target

    bfsdclass	%r23, 34, fpu_backward_target
    bfsdclass.l	%r23, 34, fpu_backward_target
    bfsdclass	%r23, 34, fpu_forward_target
    bfsdclass.l	%r23, 34, fpu_forward_target

; quadruple branches
    bfsqoeq	%r23, %r34, fpu_backward_target
    bfsqoeq.l	%r23, %r34, fpu_backward_target
    bfsqoeq	%r23, %r34, fpu_forward_target
    bfsqoeq.l	%r23, %r34, fpu_forward_target

    bfsqueq	%r23, %r34, fpu_backward_target
    bfsqueq.l	%r23, %r34, fpu_backward_target
    bfsqueq	%r23, %r34, fpu_forward_target
    bfsqueq.l	%r23, %r34, fpu_forward_target

    bfsqone	%r23, %r34, fpu_backward_target
    bfsqone.l	%r23, %r34, fpu_backward_target
    bfsqone	%r23, %r34, fpu_forward_target
    bfsqone.l	%r23, %r34, fpu_forward_target

    bfsqune	%r23, %r34, fpu_backward_target
    bfsqune.l	%r23, %r34, fpu_backward_target
    bfsqune	%r23, %r34, fpu_forward_target
    bfsqune.l	%r23, %r34, fpu_forward_target

    bfsqolt	%r23, %r34, fpu_backward_target
    bfsqolt.l	%r23, %r34, fpu_backward_target
    bfsqolt	%r23, %r34, fpu_forward_target
    bfsqolt.l	%r23, %r34, fpu_forward_target

    bfsqult	%r23, %r34, fpu_backward_target
    bfsqult.l	%r23, %r34, fpu_backward_target
    bfsqult	%r23, %r34, fpu_forward_target
    bfsqult.l	%r23, %r34, fpu_forward_target

    bfsqole	%r23, %r34, fpu_backward_target
    bfsqole.l	%r23, %r34, fpu_backward_target
    bfsqole	%r23, %r34, fpu_forward_target
    bfsqole.l	%r23, %r34, fpu_forward_target

    bfsqule	%r23, %r34, fpu_backward_target
    bfsqule.l	%r23, %r34, fpu_backward_target
    bfsqule	%r23, %r34, fpu_forward_target
    bfsqule.l	%r23, %r34, fpu_forward_target

    bfsqo	%r23, %r34, fpu_backward_target
    bfsqo.l	%r23, %r34, fpu_backward_target
    bfsqo	%r23, %r34, fpu_forward_target
    bfsqo.l	%r23, %r34, fpu_forward_target

    bfsquo	%r23, %r34, fpu_backward_target
    bfsquo.l	%r23, %r34, fpu_backward_target
    bfsquo	%r23, %r34, fpu_forward_target
    bfsquo.l	%r23, %r34, fpu_forward_target

    bfsqclass	%r23, 34, fpu_backward_target
    bfsqclass.l	%r23, 34, fpu_backward_target
    bfsqclass	%r23, 34, fpu_forward_target
    bfsqclass.l	%r23, 34, fpu_forward_target

; Forward target of the branch encodings above, followed by encoding
; coverage for the floating-point nullification instructions: each predicate
; is assembled for single (s), double (d) and quad (q), with both trailing
; immediates set to 1. This region sits between 'jmp skipfma' and its target.
fpu_forward_target:

    nulfssune	%r23, %r34, 1, 1
    nulfsdune	%r23, %r34, 1, 1
    nulfsqune	%r23, %r34, 1, 1

    nulfssone	%r23, %r34, 1, 1
    nulfsdone	%r23, %r34, 1, 1
    nulfsqone	%r23, %r34, 1, 1

    nulfssueq	%r23, %r34, 1, 1
    nulfsdueq	%r23, %r34, 1, 1
    nulfsqueq	%r23, %r34, 1, 1

    nulfssoeq	%r23, %r34, 1, 1
    nulfsdoeq	%r23, %r34, 1, 1
    nulfsqoeq	%r23, %r34, 1, 1

; the class form takes an immediate (94) instead of a second register
    nulfssclass	%r23, 94, 1, 1
    nulfsdclass	%r23, 94, 1, 1
    nulfsqclass	%r23, 94, 1, 1
; target of 'jmp skipfma'; prints the closing banner of the FPU program
skipfma:
    write	"end fpu"
.end
.text
; Post-update addressing test: every load/store width is issued once with
; r45 (= sp - 512) as the base and an immediate step. Nothing is printed
; between the two banners.
    alloc	96
    write	"test base addressing with indexed post-update"
    ldi		%r12, 1
    addi	%r45, %sp, -512

; zero-extending loads: byte, half, word, double, then quad
    ldbzmia	%r23, %r45, 2
    ldhzmia	%r23, %r45, 2
    ldwzmia	%r23, %r45, 4
    lddzmia	%r23, %r45, 8
    ldqmia	%r23, %r45, 16

; sign-extending loads (mnemonic 's' variants - presumably signed; TODO confirm)
    ldbsmia	%r23, %r45, 2
    ldhsmia	%r23, %r45, 2
    ldwsmia	%r23, %r45, 4
    lddsmia	%r23, %r45, 8

; stores: byte, half, word, double, quad
    stbmia	%r23, %r45, 2 
    sthmia	%r23, %r45, 2
    stwmia	%r23, %r45, 4
    stdmia	%r23, %r45, 8
    stqmia	%r23, %r45, 16
    write	"end_indexed_modify_test"
.end.rodata
; IP-relative data addressing test.
; One 1-, 2-, 4- and 8-byte object is defined in .rodata and again in .data,
; each aligned to its own size; the code then loads each label with the
; matching-width instruction and stores to the .data ones.
rodata1:
    d1	123
    align	2
rodata2:
    d2	12345
    align	4
rodata4:
    d4	123456789
    align	8
rodata8:
    d8	1234567890123456789

.data
data1:
    d1	123
    align	2
data2:
    d2	12345
    align	4
data4:
    d4	123456789
    align	8
data8:
    d8	1234567890123456789

.text
    alloc	96

    write "test ip-relative data addressing"
; 'z' loads from .rodata
    ldbzr	%r34, rodata1
    ldhzr	%r34, rodata2
    ldwzr	%r34, rodata4
    lddzr	%r34, rodata8

; 's' loads from .rodata
    ldbsr	%r34, rodata1
    ldhsr	%r34, rodata2
    ldwsr	%r34, rodata4
    lddsr	%r34, rodata8

; 'z' loads from .data
    ldbzr	%r34, data1
    ldhzr	%r34, data2
    ldwzr	%r34, data4
    lddzr	%r34, data8

; 's' loads from .data
    ldbsr	%r34, data1
    ldhsr	%r34, data2
    ldwsr	%r34, data4
    lddsr	%r34, data8

; stores go only to the writable .data labels
    stbr	%r34, data1
    sthr	%r34, data2
    stwr	%r34, data4
    stdr	%r34, data8

    write	"end ip-relative data test"
.end.text
    ; ldafr test: take the address of a forward-declared .data label and print it.
    alloc	96
    write	"test ldafr"
    ldafr	%r22, ldafr_data
    write	"ldafr: %x64(r22)"

    write	"end_ldafr_test"
.data
; the label is defined after its use and has no data following it in this program
ldafr_data:

.end.text
    ; mbsel test: print a reference value computed by the assembler, then the
    ; value produced by the instruction from the same three constants.
    alloc	96
    write	"check mbsel instruction"
; reference: ((a ^ b) & mask) ^ b, evaluated at assembly time
    ldi.l	%r6, ((0x3333333333333333 ^ 0x5555555555555555) & 0xff00ff00ff00ff00) ^ 0x5555555555555555
    write	"mbsel: %x64(r6)"
; r3 = a, r4 = b, r5 = mask; both printed lines are presumably expected to match
    ldi.l	%r3, 0x3333333333333333
    ldi.l	%r4, 0x5555555555555555
    ldi.l	%r5, 0xff00ff00ff00ff00
    mbsel	%r6, %r3, %r4, %r5
    write	"mbsel: %x64(r6)"

    write	"end_mbsel_test"
.end.text
    ; Special-register test in two passes over the same register list:
    ; first print each register directly with the %s(...) format of 'write',
    ; then read each one into r12 with mfspr and print r12.
    alloc	61
    write	"\ntest write: special register"
    write	"ip      %s(ip)"
    write	"eip     %s(eip)"
    write	"eca     %s(eca)"
    write	"fpcr    %s(fpcr)"
    write	"rsc     %s(rsc)"
    write	"rsp     %s(rsp)"
    write	"bsp     %s(bsp)"
    write	"peb     %s(peb)"
    write	"teb     %s(teb)"
    write	"itc     %s(itc)"
    write	"itm     %s(itm)"
    write	"psr     %s(psr)"
    write	"pta     %s(pta)"
    write	"iva     %s(iva)"
    write	"kip     %s(kip)"
    write	"ksp     %s(ksp)"
    write	"krsp    %s(krsp)"
    write	"iip     %s(iip)"
    write	"iipa    %s(iipa)"
    write	"ipsr    %s(ipsr)"
    write	"cause   %s(cause)"
    write	"ifa     %s(ifa)"
    write	"iib     %s(iib)"
    write	"tpr     %s(tpr)"
    write	"lid     %s(lid)"
    write	"irr0    %s(irr0)"
    write	"irr1    %s(irr1)"
    write	"irr2    %s(irr2)"
    write	"irr3    %s(irr3)"
    write	"isr0    %s(isr0)"
    write	"isr1    %s(isr1)"
    write	"isr2    %s(isr2)"
    write	"isr3    %s(isr3)"
    write	"tsv     %s(tsv)"
    write	"cmcv    %s(cmcv)"
    write	"pmv     %s(pmv)"

    write	"\ntest mfspr: read special register"

; Second pass. Only some of the messages below carry the register name;
; eca..ksp and isr0..pmv print the bare value.
    mfspr	%r12, %ip
    write	"ip      %x64(r12)"

    mfspr	%r12, %eip
    write	"eip     %x64(r12)"

    mfspr	%r12, %eca
    write	"%x64(r12)"

    mfspr	%r12, %fpcr
    write	"%x64(r12)"

    mfspr	%r12, %rsc
    write	"%x64(r12)"

    mfspr	%r12, %rsp
    write	"%x64(r12)"

    mfspr	%r12, %bsp
    write	"%x64(r12)"

    mfspr	%r12, %peb
    write	"%x64(r12)"

    mfspr	%r12, %teb
    write	"%x64(r12)"

    mfspr	%r12, %itc
    write	"%x64(r12)"

    mfspr	%r12, %itm
    write	"%x64(r12)"

    mfspr	%r12, %psr
    write	"%x64(r12)"

    mfspr	%r12, %pta
    write	"%x64(r12)"

    mfspr	%r12, %iva
    write	"%x64(r12)"

    mfspr	%r12, %kip
    write	"%x64(r12)"

    mfspr	%r12, %ksp
    write	"%x64(r12)"

    mfspr	%r12, %krsp
    write	"krsp    %x64(r12)"

    mfspr	%r12, %iip
    write	"iip     %x64(r12)"

    mfspr	%r12, %iipa
    write	"iipa    %x64(r12)"

    mfspr	%r12, %ipsr
    write	"ipsr    %x64(r12)"

    mfspr	%r12, %cause
    write	"cause   %x64(r12)"

; ifa is printed twice: directly, then through r12
    write	"%s(ifa)"
    mfspr	%r12, %ifa
    write	"ifa     %x64(r12)"

; iib is the only register printed here with the 128-bit format
    mfspr	%r12, %iib
    write	"iib     %x128(r12)"

    mfspr	%r12, %tpr
    write	"tpr     %x64(r12)"

    mfspr	%r12, %lid
    write	"lid     %x64(r12)"

    mfspr	%r12, %irr0
    write	"irr0    %x64(r12)"

    mfspr	%r12, %irr1
    write	"irr1    %x64(r12)"

    mfspr	%r12, %irr2
    write	"irr2    %x64(r12)"

    mfspr	%r12, %irr3
    write	"irr3    %x64(r12)"

    mfspr	%r12, %isr0
    write	"%x64(r12)"

    mfspr	%r12, %isr1
    write	"%x64(r12)"

    mfspr	%r12, %isr2
    write	"%x64(r12)"

    mfspr	%r12, %isr3
    write	"%x64(r12)"

    mfspr	%r12, %tsv
    write	"%x64(r12)"

    mfspr	%r12, %cmcv
    write	"%x64(r12)"

    mfspr	%r12, %pmv
    write	"%x64(r12)"

    write	"end test mfspr"
.end
.text
; Min/max encoding test: register-register forms followed by
; register-immediate forms (immediate 2671). Results are not printed and
; r56/r67 are not initialised in this program.
    alloc	69
    write	"test min/max"
; the s/u suffixes presumably select signed/unsigned comparison - TODO confirm
    mins	%r34, %r56, %r67
    minu	%r34, %r56, %r67
    maxs	%r34, %r56, %r67
    maxu	%r34, %r56, %r67

    minsi	%r34, %r56, 2671
    minui	%r34, %r56, 2671
    maxsi	%r34, %r56, 2671
    maxui	%r34, %r56, 2671
    write	"test minmax end"

.end

.text
; Nullification test. A nul* instruction compares its first two operands and
; carries two further operands describing the instruction groups that follow
; it. Going by the existing ';nullified' / ';else' comments on the first
; case, the two numbers are the lengths of the two groups (5 and 4 there).
; The groups can also be given as names, with a "(name)" marker placed after
; the last instruction of each group. Exact semantics: TODO confirm against
; the ISA reference.
    write	"test nullification (explicit masks)"
    alloc	96
    ldi		%r10, 0
    nuldeq	%r10, %r10, 5, 4
    write	"0" ; nullified
    write	"1" ; nullified
    write	"2" ; nullified
    write	"3" ; nullified
    write	"4" ; nullified
    write	"5" ; else
    write	"6" ; else
    write	"7" ; else
    write	"8" ; else

; Same 5 + 4 layout as above, expressed with the named markers.
    write	"test nullification (predicate names)"
    ldi		%r10, 0
    nuldeq	%r10, %r10, equal, nonequal
    write	"0"
    write	"1"
    write	"2"
    write	"3"
    write	"4" (equal)
    write	"5"
    write	"6"
    write	"7"
    write	"8" (nonequal)


; Numeric form over arithmetic instructions: groups of 4 and 3 addi.
    write	"test nullification"
    ldi		%r10, 0
    nuldeq	%r10, %r10, 4, 3
    addi	%r10, %r10, 2
    addi	%r10, %r10, 2
    addi	%r10, %r10, 2
    addi	%r10, %r10, 1
    addi	%r10, %r10, 1
    addi	%r10, %r10, 1
    addi	%r10, %r10, 1

; Named form over arithmetic instructions: the markers close groups of 5 and 2.
    write	"test nullification"
    ldi		%r10, 0
    nuldeq	%r10, %r10, true, false
    addi	%r10, %r10, 2
    addi	%r10, %r10, 2
    addi	%r10, %r10, 2
    addi	%r10, %r10, 2
    addi	%r10, %r10, 1 (true)
    addi	%r10, %r10, 1
    addi	%r10, %r10, 1 (false)

    nop	0
    nop	0
; Compares r12 with r10; r12 is not written in this program. The writes
; that follow print psr and r10/r20 under the labels "branch1"/"branch2".
    nuldeq	%r12, %r10, 4, 3
    write	"branch1: psr=%s(psr)"
    write	"branch1: %i64(r10)"
    write	"branch1: %i64(r10)"
    write	"branch1: %i64(r10)"
    write	"branch2: psr=%s(psr)"
    write	"branch2: %i64(r20)"
    write	"branch2: %i64(r20)"


; Encoding coverage: register, immediate, long-immediate (.l) and bit-test
; forms, all with the binary operands 0b1100 and 0b0101, followed by seven
; nops carrying the immediates 1..7.
    nuldeq	%r23, %r45, 0b1100, 0b0101
    nuldlt	%r23, %r45, 0b1100, 0b0101
    nuldltu	%r23, %r45, 0b1100, 0b0101

    nuldeqi	%r23, 45, 0b1100, 0b0101
    nuldlti	%r23, -45, 0b1100, 0b0101
    nuldltui	%r23, 45, 0b1100, 0b0101

    nuldeqi.l   %r23, 45000000000, 0b1100, 0b0101
    nuldlti.l   %r23, -45000000000, 0b1100, 0b0101
    nuldltui.l  %r23, 45000000000, 0b1100, 0b0101

    nulbs	%r23, %r45, 0b1100, 0b0101
    nulbsi	%r23, 45, 0b1100, 0b0101
    nop	1
    nop	2
    nop	3
    nop	4
    nop	5
    nop	6
    nop	7

; Edge cases for the named form: both markers on the same instruction.
    nuldeq	%r10, %r10, same_equal, same_nonequal
    write	"0e"
    write	"1e"
    write	"2e" (same_equal, same_nonequal)

    nuldne	%r10, %r10, same_equal2, same_nonequal2
    write	"0ne"
    write	"1ne"
    write	"2ne" (same_equal2, same_nonequal2)

; Edge case: the first marker sits on the nul instruction itself, so the
; first group contains no instructions.
    nuldeq	%r10, %r10, no_if_true, no_if_false (no_if_true)
    write	"else" (no_if_false)

    write	"end_nullification_test"
.end
.text
; Performance-monitor counter test: prints the PMC_LAST constant, then reads
; each counter with mfmr (using %gz as the register operand) into r14 and
; prints it under its name.
    alloc	21
    ldi		%r12, PMC_LAST
    write	"PMC_LAST = %i64(r12)"
; don't report runtime in unit tests: the value is non-reproducible
    mfmr	%r14, %gz, PMC_RUNTIME
;   write	"PMC_RUNTIME = %i64(r14)"
    mfmr	%r14, %gz, PMC_SHORT_INSTRUCTION
    write	"PMC_SHORT_INSTRUCTION = %i64(r14)"
    mfmr	%r14, %gz, PMC_LONG_INSTRUCTION
    write	"PMC_LONG_INSTRUCTION = %i64(r14)"
    mfmr	%r14, %gz, PMC_SHADOWED_INSTRUCTION
    write	"PMC_SHADOWED_INSTRUCTION = %i64(r14)"
    mfmr	%r14, %gz, PMC_NOP_INSTRUCTION
    write	"PMC_NOP_INSTRUCTION = %i64(r14)"
    mfmr	%r14, %gz, PMC_QUALIFIED_NOP_INSTRUCTION
    write	"PMC_QUALIFIED_NOP_INSTRUCTION = %i64(r14)"
    mfmr	%r14, %gz, PMC_REGISTER_SPILL
    write	"PMC_REGISTER_SPILL = %i64(r14)"
    mfmr	%r14, %gz, PMC_REGISTER_FILL
    write	"PMC_REGISTER_FILL = %i64(r14)"
    mfmr	%r14, %gz, PMC_ICACHE_HIT
    write	"PMC_ICACHE_HIT = %i64(r14)"
    mfmr	%r14, %gz, PMC_ICACHE_MISS
    write	"PMC_ICACHE_MISS = %i64(r14)"
    mfmr	%r14, %gz, PMC_DCACHE_HIT
    write	"PMC_DCACHE_HIT = %i64(r14)"
    mfmr	%r14, %gz, PMC_DCACHE_MISS
    write	"PMC_DCACHE_MISS = %i64(r14)"
    mfmr	%r14, %gz, PMC_INSTRUCTION_TRANSLATION_HIT
    write	"PMC_INSTRUCTION_TRANSLATION_HIT = %i64(r14)"
    mfmr	%r14, %gz, PMC_INSTRUCTION_TRANSLATION_MISS
    write	"PMC_INSTRUCTION_TRANSLATION_MISS = %i64(r14)"
    mfmr	%r14, %gz, PMC_DATA_TRANSLATION_HIT
    write	"PMC_DATA_TRANSLATION_HIT = %i64(r14)"
    mfmr	%r14, %gz, PMC_DATA_TRANSLATION_MISS
    write	"PMC_DATA_TRANSLATION_MISS = %i64(r14)"
    mfmr	%r14, %gz, PMC_BACKSTORE_TRANSLATION_HIT
    write	"PMC_BACKSTORE_TRANSLATION_HIT = %i64(r14)"
    mfmr	%r14, %gz, PMC_BACKSTORE_TRANSLATION_MISS
    write	"PMC_BACKSTORE_TRANSLATION_MISS = %i64(r14)"
; write r14 (the last value read above) to a counter with mtmr, read the
; counter back into r15 and print it
    mtmr	%r14, %gz, PMC_SHORT_INSTRUCTION
    mfmr	%r15, %gz, PMC_SHORT_INSTRUCTION
    write	"old pm reg = %i64(r15)"
.end
.text
; Simple test program
; 20! factorial compute
.text
    alloc	61
; Warm-up loop: r15 starts at -100 and repdle branches back to
; loop_stop_sard; presumably it steps r15 and repeats while r15 <= %gz
; (TODO confirm the exact repdle semantics).
    ldi		%r15, -100
loop_stop_sard:
    srdi	%r13, %r15, 5
    repdle	%r15, %gz, loop_stop_sard

; performance test - long loop
; for(i = 1000000; i>0; i--) DoSome();
; NOTE(review): the counter loaded into r20 below is 2500000, not the
; 1000000 shown in the pseudo-code above.

    ldi		%r20, 2500000
    ldi		%r15, 20 ; maximum factorial number
    ldi		%r21, 5
; Loop body is arithmetic filler; r20 is the counter tested by repdgt.
loop_stop: (64)
    addi	%r13, %r13, 5
    sub		%r14, %r14, %r55
    cmpdlt	%r24, %r14, %r14
    addi	%r13, %r13, 4
    sub		%r14, %r14, %r55
    cmpdlt	%r22, %r14, %r14
    addi	%r13, %r13, 33
    srpi	%r14, %r14, %r55, 13
    sub		%r14, %r13, %r21
    srai	%r14, %r14, 7
    repdgt	%r20, %gz, loop_stop
; print loop counter after loop (must be 0)
    write	"%i64(r20) factorials"
; Factorial loop: r13 is the running product, r14 the multiplier, and r15
; (20, loaded above) the bound that repdle compares r14 against.
    ldi		%r13, 1
    ldi		%r14, 1
start:
    mul		%r13, %r13, %r14
    write	"factorial: %u64(r13)"
    repdle	%r14, %r15, start

; final multiplier and product
    write	"%i64(r14) %i64(r13)"
.end
.text
; Counted-loop examples built on repdgt: each loop body ends with a repdgt
; on r12 that branches back to the loop label, and the counter is printed
; once the loop falls through.
; NOTE(review): the marker comment before each counter load, together with
; the number in that load's trailing comment, looks like a reduced iteration
; count for quick runs - confirm before editing either of them.
    alloc	96
    write	"Example of strided loop instructions"
; fast_check
    ldi		%r12, 10000	; load loop number (10)
stride_loop_start:
;	write	"%i64(r12)"
    cmpdeq	%r4, %r12, %r12
    add		%r14, %r14, %r46
    repdgt	%r12, %gz, stride_loop_start

    write	"counter=%i64(r12)"

; Second example of strided loop.
; Here r14 is additionally decremented by 2 on every iteration.
; fast_check
    ldi		%r12, 10000	; load loop number (10)
    ldi		%r14, 10000	; load loop number (10)
stride_loop_start2:
;   write	"%i64(r12)"
    cmpdeq	%r4, %r12, %r12
    addi	%r14, %r14, -2
    repdgt	%r12, %gz, stride_loop_start2

    write	"%i64(r12) %i64(r14)"

;*****************************************************************
; 3x inner loop example
;*****************************************************************
; r10/r11/r12 are three nested counters, each reloaded from r33 (80);
; r20 counts passes through the innermost body and is printed at the end.
; All three "jmp ccloop" back-edges are commented out, so nothing in this
; body currently branches back to ccloop and the cmpdlt results in
; r2/r4/r6 are not consumed here.
    ldi		%r3, 0
    ldi		%r20, 0
    ldi		%r33, 80
    mov		%r10, %r33
    mov		%r11, %r33
    mov		%r12, %r33
ccloop:
;   write	"%i64(r12)"
    addi	%r20, %r20, 1
    addi	%r12, %r12, -1
    cmpdlt	%r2, %r3, %r12
;   jmp	ccloop
;   write	"%i64(r11)"
    addi	%r11, %r11, -1
    cmpdlt	%r4, %r3, %r11
    mov		%r12, %r33
;   jmp		ccloop
;   write	"%i64(r10)"
    addi	%r10, %r10, -1
    cmpdlt	%r6, %r3, %r10
    mov		%r11, %r33
    mov		%r12, %r33
;   jmp		ccloop

    write	"%i64(r20)"

; for(i=0; i<100; i++)
; NOTE(review): the compare below uses 128 as the bound, not 100.

    ldi	%r8, 0
start1:
;   write	"%i64(r8)"
    addi	%r8, %r8,1
    cmpdlti	%r7, %r8,128
    bdnei	%r7,0,start1

; for(i=100; i>0; i--)
; Compares r3 (loaded with 0 earlier in this program) against r8 and
; branches back while the compare result in r2 is non-zero.
    ldi		%r8, 100
start2:
    write	"%i64(r8)"
    addi	%r8, %r8,-1		; current error
    cmpdlt	%r2, %r3, %r8
    bdnei	%r2, 0, start2

    write	"r3      %x64(r3)"
;	mtspr	%r3, %rsc


; for(i=100; i>0; i--) write "%x64((i)"
; The "jmp qqq" back-edge is commented out, so this body is not repeated.
    ldi		%r10, 100
qqq:	cmpdlt	%r2, %r3, %r10
    write	"r10     %x64(r10)"
    addi	%r10, %r10, -1
;   jmp		qqq
sss:

    andi.l	%r55, %r55,0x000FFFFF00003F0F
    mtspr	%r12, %ifa
; test some special regs
    ldi.l	%r9, 0x123456789
;   mtspr	%r9, psr
    write	"ip: %s(ip) psr: %s(psr)"
;   mtspr	%r3, psr
    ldi		%r55, 120
    mtspr	%r55, %tpr
    write	"fpcr    %s(fpcr)"
    write	"psr     %s(psr)"

    write	"test long loop"
; test simple loop
; r14 is the counter tested by repdgt; r13 is scratch for the arithmetic
; filler in the body and r15 (88) is a constant operand.
; fast_check
    ldi		%r13, 350000 ; 35
    ldi		%r14, 350000 ; 35
    ldi		%r15, 88
    write	"%i64(r14)"
repeat_loop_start: (128)
;	write	"%i64(r12)"
    addi	%r13, %r13, 3
    add		%r13, %r13, %r15
    srpi	%r13, %r13, %r15, 8

    addi	%r13, %r13, 4
    add		%r13, %r13, %r15
    srpi	%r13, %r13, %r15, 7

    addi	%r13, %r13, 5
    add		%r13, %r13, %r15
    srpi	%r13, %r13, %r15, 6

    addi	%r13, %r13, 6
    add		%r13, %r13, %r15
    srpi	%r13, %r13, %r15, 5

    sub		%r13, %r13, %r15
    sladd	%r13, %r13, %r15, 5
    sladd	%r13, %r13, %r15, 5

; r13 is overwritten from r14 and r15 here, so the value printed after the
; loop does not depend on the filler arithmetic above.
    xor		%r13, %r14, %r15
    sll		%r13, %r13, %r13
    repdgt	%r14, %gz, repeat_loop_start

    write	"%i64(r13) %i64(r14)"

    write	"end test long loop"
.end
.text
; Random-number instruction test: two calls with %gz as the second operand
; and one with r4 = 1 (labelled "random seed" in the output).
    write	"test random"
    alloc	96

    random	%r3, %gz
    write	"random: %x64(r3)"
    random	%r3, %gz
    write	"random: %x64(r3)"
    ldi		%r4, 1
    random	%r3, %r4
    write	"random seed: %x64(r3)"

    write	"end_random_test"
; NOTE(review): ".end" and ".text" share one line below, whereas other
; programs in this file put them on separate lines - confirm the assembler
; accepts the fused form.
.end.text
; test simple long loop
; r14 (copied from r13 = 1000000) is the counter tested by repdgt; the body
; is arithmetic/address-computation filler.
    alloc	61
    ldi		%r13, 1000000
    mov		%r14, %r13
    write	"loop limit: %i64(r14)"
    ldi		%r15, 88
repeat_long_loop_start: (128)
    addi	%r13, %r13, 3
    add		%r13, %r13, %r15
    srpi	%r13, %r13, %r15, 8
    addi	%r13, %r13, 4
    add		%r13, %r13, %r15
    srpi	%r13, %r13, %r15, 7
    addi	%r13, %r13, 5
    add		%r13, %r13, %r15
    srpi	%r13, %r13, %r15, 6
    addi	%r13, %r13, 6
    add		%r13, %r13, %r15
    srpi	%r13, %r13, %r15, 5
    add		%r30, %r31, %r14
    sub		%r31, %r30, %r15
    slli	%r40, %r40, 12
    ldax	%r41, %r40, %r12, 3, -12
    ldax	%r41, %r40, %r12, 4, 112
    repdgt	%r14, %gz, repeat_long_loop_start
; The jmp skips the repd* variants below: they are assembled but never run.
    jmp		repeat_exit

    repdle	%r56, %r60, repeat_long_loop_start
    repdge	%r56, %r60, repeat_long_loop_start
    repdleu	%r56, %r20, repeat_long_loop_start
    repdgeu	%r56, %r20, repeat_long_loop_start

; same four variants with the ".l" suffix
    repdle.l	%r56, %r60, repeat_long_loop_start
    repdge.l	%r56, %r60, repeat_long_loop_start
    repdleu.l	%r56, %r20, repeat_long_loop_start
    repdgeu.l	%r56, %r20, repeat_long_loop_start

repeat_exit:
    write	"end loop repeat test"
; NOTE(review): ".end" and ".text" share one line below, whereas other
; programs in this file put them on separate lines - confirm the assembler
; accepts the fused form.
.end.text
; Here we test instructions for partial rotate register by fixed bitcount.
; Every "rotate" below is an srpi with r50 as both source operands.
; NOTE(review): the "same as previous" remarks do not match the operands
; (40-1 = 39 versus 64-40-1 = 23) - confirm what they refer to.
    alloc	90
    write	"initial values"
    ldi.l	%r50, 0x1234567890ABCDEF
    write	"%x64(r50)"
    write	"rotate left"
    srpi	%r51, %r50, %r50, 40-1
    write	"%x64(r51)"
    write	"rotate right"
    srpi	%r51, %r50, %r50, 64-40-1	; same as previous
    write	"%x64(r51)"
    write	"rotate left immediate"
    srpi	%r51, %r50, %r50, 64-40-1
    write	"%x64(r51)"
    write	"rotate right immediate"
    srpi	%r51, %r50, %r50, 40-1	; same as previous "rD+1-rC"
    write	"%x64(r51)"

; Here we test instructions for shift and mask register by fixed bitcount.
    write	"shift signed|unsigned by immediate 12 bit"
    ldi.l	%r50, 0xfedcba0123456789
    write	"%x64(r50)"
    srai	%r51, %r50, 12
    write	"%x64(r51)"
    srli	%r51, %r50, 12
    write	"%x64(r51)"
; the same slli is issued and printed twice
    slli	%r51, %r50, 12
    write	"%x64(r51)"
    slli	%r51, %r50, 12
    write	"%x64(r51)"

;	jmp	ddd
    ldi		%r10, 16
    slp	%r51, %r50, %r50, %r10
    write	"%x64(r51)"

    ldi.l	%r40, 0x1234567890abcdef
    ldi.l	%r50, 0xfedcba0987654321
    slsrli	%r41, %r40, 8, 40
    write	"%x64(r41)"
    slsrai	%r41, %r40, 11, 40
    write	"%x64(r41)"

    write	"test srpi"
    ldi.l	%r40, 0x1234123412341234
    ldi.l	%r50, 0x5678567856785678
    srpi	%r41, %r40, %r50, 39
    write	"%x64(r41)"
    srpi	%r41, %r50, %r40, 23
    write	"%x64(r41)"
    srpi	%r41, %r40, %r40, 24
    write	"%x64(r41)"

; NOTE(review): the message below names "srpi16" but the instruction under
; test is srpiq - the string is program output, so it is left unchanged.
    write	"test vector shift right pair (srpi16) instruction"
    xor		%r2, %r2, %r2	; all zeroes
    nor		%r3, %r2, %r2	; all ones
    write	"r2      %x128(r2)"
    write	"r3      %x128(r3)"
    srpiq	%r4, %r2, %r3, 60
    write	"r4      %x128(r4)"
    srpiq	%r4, %r3, %r2, 60
    write	"r4      %x128(r4)"
    srpiq	%r4, %r2, %r3, 100
    write	"r4      %x128(r4)"
    srpiq	%r4, %r3, %r2, 100
    write	"r4      %x128(r4)"

; SHIFTS
; The shift and bitwise groups below print no results; they operate on
; registers that this program has not initialised, so they appear to be
; encoding coverage only (TODO confirm).
    sll		%r42, %r33, %r34
    sll		%r42, %r33, %r34
    sra		%r52, %r73, %r44
    srl		%r62, %r73, %r44
    slp		%r72, %r17, %r17, %r24
    srp		%r82, %r16, %r16, %r15
    srpi	%r72, %r15, %r24, 32
    dep		%r10, %r14, %r85, 32, 30

    slli	%r12, %r67, 13
    slli	%r13, %r57, 13
    srai	%r14, %r48, 14
    srli	%r15, %r38, 14
    srpi	%r16, %r39, %r13, 13
    srpi	%r17, %r29, %r13, 64-13


    write	"test packed bitwise logical"
    and		%r10, %r71, %r13
    andn	%r21, %r81, %r22
    or		%r32, %r71, %r32
    orn		%r43, %r61, %r43
    nand	%r54, %r51, %r54
    nor		%r65, %r41, %r64
    xnor	%r76, %r31, %r73
    xor		%r87, %r21, %r83


; character formatting: 65 is the code of 'A'
    ldi		%r20, 65
    write	"r20     %c(r20)"   ; should be 'A'

    ldi		%r3, 0
    ldi.l	%r22, 0x12345FFFFFFFFFFF
    write	"%x64(r22)"
    depc	%r23, %r22, 0, 23
    write	"%x64(r23)"

    ldi.l	%r22, 0x1234567890ABCDEF
    ldi.l	%r23, 0xFEDCBA9876543210
    srpi	%r22, %r22, %r23, 24
    write	"%x64(r22)"

; subfi with immediate 0, then not, then xor with the original value;
; each intermediate result is printed.
    ldi.l	%r24, 0x4321F00000000
    write	"%x64(r24)"
    subfi	%r25, %r24, 0
    write	"%x64(r25)"
    not		%r25, %r25
    write	"%x64(r25)"
    xor		%r25, %r25, %r24
    write	"%x64(r25)"

; Example of absd.
    ldi		%r12, -10000
    absd	%r12, %r12, %gz
    write	"r12: %i64(r12)"
.end
.text
; Floating-point SIMD program. The jmp below transfers straight to the
; endfpsimd label at the end of this program, so everything in between is
; assembled but skipped at run time.
    jmp		endfpsimd
; SSE double (SSE2)
; packed-double ("pd") arithmetic
    fmaddpd	%r16, %r71, %r69, %r13
    fmsubpd	%r15, %r78, %r58, %r23
    fnmaddpd	%r14, %r67, %r47, %r13
    fnmsubpd	%r13, %r86, %r36, %r16
    fmaddapd	%r82, %r52, %r69, %r63
    fmsubapd	%r50, %r91, %r69, %r63
    faddpd	%r12, %r86, %r25
    fnaddpd	%r11, %r82, %r19
    fsubpd	%r10, %r63, %r28
    faddcpd	%r81, %r61, %r37
    fsubcpd	%r82, %r81, %r46
    faddhpd	%r83, %r81, %r55
    fsubhpd	%r84, %r71, %r64
    fmulpd	%r81, %r71, %r11
    fmulhpd	%r60, %r11, %r22
    fdotpd	%r85, %r81, %r13
    fminpd	%r86, %r84, %r14
    fmaxpd	%r87, %r61, %r15
    faminpd	%r30, %r52, %r16
    famaxpd	%r61, %r51, %r17

; packed-double compares
    fcmppdoeq	%r80, %r81, %r63
    fcmppdone	%r11, %r81, %r32
    fcmppdolt	%r15, %r81, %r32
    fcmppdolt	%r60, %r81, %r82
    fcmppdone	%r62, %r72, %r83
    fcmppdole	%r62, %r72, %r62

; pack, sign manipulation, rounding, divide and square roots
    fpkpd	%r60, %r61, %r62
    fnegpd	%r61, %r51
    fabsdpd	%r61, %r51, %r3
    fnabsdpd	%r61, %r61, %r3
    frndpd	%r60, %r77,3
    frndpd	%r62, %r61,2
    frndpd	%r62, %r71,0
    frndpd	%r83, %r67,1
    fdivpd	%r83, %r67, %r20
    fsqrtpd	%r68, %r81
    frsqrtpd	%r68, %r81

; quadruple floating-point extension example
; Three 16-byte constants a, b, c are loaded into r3, r1, r2 (in that order)
; and run through scalar arithmetic, fused multiply-add, unary, rounding,
; conversion and min/max instructions, printing each result.
; NOTE(review): the data is declared with "quad" but the instructions below
; carry the "sd" suffix and results are printed with %vf64 - confirm the
; intended precision.
.rodata
    align	16
a:	quad	1.234567890123456789124141241241
b:	quad	3.1415925678888734535345231234564561
c:	quad	3.4566345634563456346535463463456
.text
    ldar	%r21, a
    ldq		%r3, %r21,0*16
    ldq		%r1, %r21,1*16
    ldq		%r2, %r21,2*16
    write	"%vf64(r3)"
    write	"%vf64(r1)"
    write	"%vf64(r2)"

    write	"test binary\0"
    fmulsd	%r3, %r1, %r2
    write	"%vf64(r3)"
    fnmulsd	%r3, %r1, %r2
    write	"%vf64(r3)"
    faddsd	%r4, %r1, %r2
    write	"%vf64(r4)"
    fnaddsd	%r4, %r1, %r2
    write	"%vf64(r4)"
    fsubsd	%r4, %r2, %r1
    write	"%vf64(r4)"
    fdivsd	%r4, %r2, %r1
    write	"%vf64(r4)"

    write	"test fused fma\0"
    fmaddsd	%r5, %r4, %r1, %r2
    write	"%vf64(r5)"
    fnmaddsd	%r5, %r4, %r1, %r2
    write	"%vf64(r5)"
    fmsubsd	%r5, %r4, %r1, %r2
    write	"%vf64(r5)"
    fnmsubsd	%r5, %r4, %r1, %r2
    write	"%vf64(r5)"

    write	"test unary\0"
    mov		%r6, %r5
    write	"%vf64(r6)"
    fabssd	%r6, %r5
    write	"%vf64(r6)"
    fnegsd	%r6, %r5
    write	"%vf64(r6)"
    fnabssd	%r6, %r5
    write	"%vf64(r6)"
    fsqrtsd	%r6, %r2
    write	"%vf64(r6)"
    frsqrtsd	%r6, %r2
    write	"%vf64(r6)"

; the last operand of frndsd varies (4, 2, 1, 0); presumably it selects the
; rounding mode - TODO confirm the encoding
    write	"test rounding\0"
    frndsd	%r7, %r2,4
    write	"%vf64(r7)"
    frndsd	%r7, %r2,2
    write	"%vf64(r7)"
    frndsd	%r7, %r2,1
    write	"%vf64(r7)"
    frndsd	%r7, %r2,0
    write	"%vf64(r7)"
; float -> integer, then integer 123456 -> float
    fcvtsd2iw	%r7, %r2,0
    write	"r7=%i64(r7)"
    ldi		%r7, 123456
    fcvtiw2sd	%r7, %r7,0
    write	"%vf64(r7)"

    write	"test minmax, abs minmax"
    fmaxsd	%r8, %r1, %r2
    write	"%vf64(r8)"
    fminsd	%r8, %r1, %r2
    write	"%vf64(r8)"
    famaxsd	%r8, %r1, %r2
    write	"%vf64(r8)"
    faminsd	%r8, %r1, %r2
    write	"%vf64(r8)"

    write	"test fmergesq\0"

; Fused multiply-add/subtract loop on r22.
; NOTE(review): xxxq is defined here but the code below takes the address of
; "a" instead, and r15 (set by ldi to what looks like a loop count) is
; overwritten by the following ldq before repdge uses it as its counter -
; confirm this is intended.
.rodata
    align	16
xxxq:	quad	1.122
    quad	0.9999765432
.text
    ldar	%r21, a
; fast_check
    ldi		%r15, 100000 ; 10
    ldq		%r15, %r21, 0*16
    ldq		%r16, %r21, 1*16
    fsubsd	%r22, %r15, %r16
    write	"%vf64(r22)"
yyloop:
    fmaddsd	%r22, %r15, %r16, %r22
    fmsubsd	%r22, %r15, %r16, %r22
    repdge	%r15, %gz, yyloop
    write	"%vf64(r22)"


; Floating-point constants of three widths (quad, double, float), including
; values given as constant expressions; they are loaded through the bases
; r45 (qqqq), r46 (dbl1) and r47 (float1) and printed.
.rodata
    align	16
    quad	1.189731495357231765085759326628007e+4932
qqqq:   quad	1.23456789 + 32.0
    quad	0.2345678901234567890123456789012345678 + 0.2
    quad	2*asin(1)
    quad	255
dbl1:	double	acos(sin(3.1415926)) ;-1.2345678e+200
    double	444.689679
float1:	float	0.123456789123456789e+30
    float	2.123456789122233
    float	0.0
    float	1.0
octquad:
    quad	0.25
f32:	d4	0x3fff1234
.text
    ldar	%r45, qqqq
    ldar	%r46, dbl1
    ldar	%r47, float1
    write	"r45     %x64(r45)"
; the same quad at offset 0 is loaded and printed twice
    ldq		%r63, %r45,0
    write	"%vf64(r63) %x128(r63)"
    ldq		%r63, %r45,0
    write	"%vf64(r63) %x128(r63)"
    fmulsq	%r62, %r63, %r63
    write	"%vf64(r62)"
    ldwz	%r60, %r47,0
    write	"%vf64(r60)"
    lddz	%r59, %r46,0
    ldwz	%r58, %r47,4
    ldwz	%r57, %r47,8
    write	"%vf64(r57)"
    write	"%vf64(r58)"
    write	"%vf64(r59)"
    ldq		%r53, %r45,1*16
    write	"%vf64(r53)"
    ldq		%r50, %r45,2*16
    write	"%vf64(r50)"
    ldq		%r49, %r45,3*16
    write	"%vf64(r49) %x128(r49)"
    ldwz	%r48, %r47,3*4
    write	"%vf64(r48)"
; r46 (the dbl1 base address) is reused as a destination from here on
    fnegsq	%r46, %r48
    write	"%vf64(r46)"
; NOTE(review): r52 is not loaded anywhere in this program before this use.
    fmaddsq	%r40, %r52, %r52, %r53
    write	"%m(dump)"

; Table of 16-byte values at __yyy: seven "quad" literals, four entries
; spelled as raw pairs of d8 words, one more quad literal and a final raw
; pair. Each 16-byte slot is loaded through r12 and printed both as a float
; and as 128-bit hex.
.rodata
    align	16
__yyy:
    quad	0.5
    quad	1.0
    quad	2.25
    quad	22252.22424
    quad	-22252.22424
    quad	34.125
    quad	2.0 / 72.0
    d8	0xffffffffffffffff
    d8	0x3ffe
    d8	0xffffffffffffffff
    d8	0x3ff0
    d8	0x8000000000000000
    d8	0xbff3
    d8	0x8000000000000000
    d8	0xc003
    quad	-1.234567890123456789012345e+6
    d8	0x8000000000000000
    d8	0x3fe0
.text
    ldar	%r12, __yyy
    ldq		%r23, %r12, 0
    write	"%vf64(r23) %x128(r23)"
    ldq		%r23, %r12, 1*16
    write	"%vf64(r23) %x128(r23)"
    ldq		%r23, %r12, 2*16
    write	"%vf64(r23) %x128(r23)"
    ldq		%r23, %r12, 3*16
    write	"%vf64(r23) %x128(r23)"
    ldq		%r23, %r12, 4*16
    write	"%vf64(r23) %x128(r23)"
    ldq		%r23, %r12, 5*16
    write	"%vf64(r23) %x128(r23)"
    ldq		%r23, %r12, 6*16
    write	"%vf64(r23) %x128(r23)"
; slots 7..10 are the raw d8 pairs
    ldq		%r27, %r12, 7*16
    write	"%vf64(r27) %x128(r27)"
    ldq		%r27, %r12, 8*16
    write	"%vf64(r27) %x128(r27)"
    ldq		%r27, %r12, 9*16
    write	"%vf64(r27) %x128(r27)"
    ldq		%r27, %r12, 10*16
    write	"%vf64(r27) %x128(r27)"
;   flddi	%r24, 8.5899345919999999995e+09 ;-1.234567890123456789012345e+6
;   write	"%vf64(r24) %x128(f24)"
;   flddi	%r24, 0.125 ; 4.656612873077392578125e-10 ; 4.656612873077392578125e-10
;   write	"%vf64(r24) %x128(f24)"
    ldq		%r25, %r12, 11*16
    write	"%vf64(r25) %x128(r25)"
    ldq		%r25, %r12, 12*16
    write	"%vf64(r25) %x128(r25)"
; floating-point literal given directly as an instruction operand
    fldqri	%r40, 4.345678912345678901234567890123456789012345678
    write	"%vf64(r40)"


; Run of fmaddsd with varied register operands, followed by quad stores of
; r16..r31 to consecutive 16-byte stack slots; no results are printed.
    fmaddsd	%r23, %r60, %r55, %r33
    fmaddsd	%r24, %r61, %r25, %r32
    fmaddsd	%r25, %r62, %r55, %r23
    fmaddsd	%r26, %r63, %r75, %r73
    fmaddsd	%r27, %r64, %r75, %r73
    fmaddsd	%r28, %r65, %r85, %r63
    fmaddsd	%r29, %r66, %r85, %r63
    fmaddsd	%r30, %r67, %r95, %r23
    fmaddsd	%r31, %r68, %r95, %r23
    fmaddsd	%r10, %r21, %r26, %r27
    fmaddsd	%r13, %r21, %r26, %r27
    fmaddsd	%r10, %r21, %r26, %r27
    fmaddsd	%r12, %r21, %r26, %r27
    fmaddsd	%r11, %r21, %r26, %r27
    fmaddsd	%r13, %r21, %r26, %r27
    fmaddsd	%r14, %r21, %r26, %r27
    fmaddsd	%r15, %r21, %r26, %r27
    fmaddsd	%r16, %r21, %r26, %r27
    fmaddsd	%r17, %r21, %r26, %r27

    stq	%r16, %sp,16*2
    stq	%r17, %sp,16*3
    stq	%r18, %sp,16*4
    stq	%r19, %sp,16*5
    stq	%r20, %sp,16*6
    stq	%r21, %sp,16*7
    stq	%r22, %sp,16*8
    stq	%r23, %sp,16*9
    stq	%r24, %sp,16*10
    stq	%r25, %sp,16*11
    stq	%r26, %sp,16*12
    stq	%r27, %sp,16*13
    stq	%r28, %sp,16*14
    stq	%r29, %sp,16*15
    stq	%r30, %sp,16*16
    stq	%r31, %sp,16*17


; SSE single
; packed-single ("ps") counterparts of the packed-double group; no results
; are printed
    fmaddps	%r58, %r61, %r92, %r63
    fmsubps	%r82, %r52, %r92, %r63
    fnmaddps	%r82, %r52, %r69, %r63
    fnmsubps	%r50, %r91, %r69, %r63
    fmaddaps	%r82, %r52, %r69, %r63
    fmsubaps	%r50, %r91, %r69, %r63
    faddps	%r61, %r94, %r69
    fnaddps	%r68, %r54, %r72
    fsubps	%r68, %r61, %r82
    faddcps	%r81, %r71, %r82
    fsubcps	%r82, %r71, %r82
    faddhps	%r62, %r61, %r82
    fsubhps	%r62, %r61, %r62
    fmulps	%r62, %r51, %r62
    fmulhps	%r63, %r51, %r62
    fdotps	%r83, %r51, %r62
    fminps	%r83, %r61, %r62
    fmaxps	%r63, %r71, %r62
    faminps	%r64, %r71, %r82
    famaxps	%r64, %r71, %r82

; packed-single compares
    fcmppsone	%r65, %r61, %r62
    fcmppsolt	%r74, %r61, %r62
    fcmppsole	%r83, %r61, %r62
    fcmppsule	%r72, %r61, %r62
    fcmppsule	%r11, %r61, %r62
    fcmppsuo	%r20, %r61, %r62

; pack, sign manipulation, rounding, divide and square roots
    fpkps	%r33, %r64, %r62
    fnegps	%r60, %r69
    fabsdps	%r61, %r68, %r3
    fnabsdps	%r62, %r67, %r3
    frndps	%r63, %r66,0
    frndps	%r64, %r65,2
    frndps	%r65, %r64,1
    frndps	%r66, %r63,0
    fdivps	%r67, %r62, %r20
    fsqrtps	%r68, %r61
    frsqrtps	%r69, %r60

    faddps	%r24, %r61, %r60
    fmulpd	%r47, %r60, %r46

; target of the "jmp endfpsimd" at the top of this program
endfpsimd:

.end
.text
; Integer SIMD tests on byte lanes: four 16-byte rows from mmxdata are loaded
; into r1..r4, then packed add/sub (plain, carry/borrow, overflow,
; saturating), average, min/max and merge instructions are applied to r1 and
; r2 and the operands and results are printed.
.rodata
    align	16
mmxdata:
    d8	0x123456759eabcd7f
    d8	0x123456789cabcdef

    d8	0xf87f5432afebcdf3
    d8	0xffffffffffffffff

    d8	0x1234567890abcdef
    d8	0x1234567890abcdef

    d8	0x1234567890abcdef
    d8	0x1234567890abcdef
.text
    alloc	90
    ldar	%r4, mmxdata
    ldq		%r1, %r4,0*16
    ldq		%r2, %r4,1*16
    ldq		%r3, %r4,2*16
; r4 is the base address, so its own row is loaded last
    ldq		%r4, %r4,3*16
    write	"r1      %x128(r1)"
    write	"r2      %x128(r2)"

; r1 printed at each lane width
    write	"%vu8(r1)"
    write	"%vu16(r1)"
    write	"%vu32(r1)"
    write	"%vu64(r1)"

    vaddub	%r3, %r1, %r2
    write	"test vadd/vaddc (1 byte)\0"
    vaddcb	%r4, %r1, %r2
    write	"%vu8(r1)"
    write	"%vu16(r2)"
    write	"%vu32(r3)"
    write	"%vu64(r4)"
    write	"test vadd/vaddo signed (1 byte)\0"
    vaddob	%r4, %r1, %r2
    write	"%vi8(r1)"
    write	"%vi16(r2)"
    write	"%vi32(r3)"
    write	"%vu64(r4)"

    vsubub	%r3, %r1, %r2
    write	"test vsub/vsubb (1 byte)\0"
    vsubbb	%r4, %r1, %r2
    write	"%vu8(r1)"
    write	"%vu8(r2)"
    write	"%vu8(r3)"
    write	"%vu8(r4)"
    write	"test vsub/vsubo signed (1 byte)\0"
    vsubob	%r4, %r1, %r2
    write	"%vi8(r1)"
    write	"%vi8(r2)"
    write	"%vi8(r3)"
    write	"%vu8(r4)"

; In the four saturating tests below, r3 holds the plain vaddub/vsubub
; result and r4 the saturating one, printed together.
    write	"test vaddusb"
    vaddub	%r3, %r1, %r2
    vaddusb	%r4, %r1, %r2
    write	"%vu8(r1)\n%vu8(r2)\n%vu8(r3)\n%vu8(r4)"

    write	"test vsubusb"
    vsubub	%r3, %r1, %r2
    vsubusb	%r4, %r1, %r2
    write	"%vu8(r1):\n%vu8(r2)\n%vu8(r3)\n%vu8(r4)"

    write	"test vaddssb"
    vaddub	%r3, %r1, %r2
    vaddssb	%r4, %r1, %r2
    write	"%vi8(r1)\n%vi8(r2)\n%vi8(r3)\n%vi8(r4)"

    write	"test vsubssb"
    vsubub	%r3, %r1, %r2
    vsubssb	%r4, %r1, %r2
    write	"%vi8(r1)\n%vi8(r2)\n%vi8(r3)\n%vi8(r4)"

    write	"test pavgu (1 byte)\0"
    vavgub	%r3, %r1, %r2
    write	"%vu8(r1)\n%vu8(r2)\n%vu8(r3)"

    write	"test pavgs (1 byte)\0"
    vavgsb	%r3, %r1, %r2
    write	"%vi8(r1)\n%vi8(r2)\n%vi8(r3)"

    write	"test vminu (1 byte)\0"
    vminub	%r3, %r1, %r2
    write	"%vu8(r1)\n%vu8(r2)\n%vu8(r3)"

    write	"test vmins (1 byte)\0"
    vminsb	%r3, %r1, %r2
    write	"%vi8(r1)\n%vi8(r2)\n%vi8(r3)"

    write	"test vmaxu (1 byte)\0"
    vmaxub	%r3, %r1, %r2
    write	"%vu8(r1)\n%vu8(r2)\n%vu8(r3)"

    write	"test vmaxs (1 byte)\0"
    vmaxsb	%r3, %r1, %r2
    write	"%vi8(r1)\n%vi8(r2)\n%vi8(r3)"

    write	"test merge low (1 byte)\0"
    vmrglb	%r3, %r1, %r2
    write	"%vu8(r1)\n%vu8(r2)\n%vu8(r3)"

    write	"test merge high (1 byte)\0"
    vmrghb	%r3, %r1, %r2
    write	"%vu8(r1)\n%vu8(r2)\n%vu8(r3)"

; pack instructions: executed but their results are not printed
    vpkuush	%r2, %r3, %r4
    vpksush	%r2, %r3, %r4
    vpksssh	%r2, %r3, %r4

    vpkuusw	%r2, %r3, %r4
    vpksusw	%r2, %r3, %r4
    vpksssw	%r2, %r3, %r4

    vpkuusd	%r2, %r3, %r4
    vpksusd	%r2, %r3, %r4
    vpksssd	%r2, %r3, %r4

;	jmp	endmmx
; d1 abs
; Integer SIMD instruction list, one group per operation with the b/h/w/d
; lane-size suffixes. No results are printed; with the jmp above commented
; out these instructions are executed, not just assembled.
; min / max, signed then unsigned
    vminsb	%r12, %r61, %r55
    vminsh	%r18, %r61, %r45
    vminsw	%r27, %r61, %r35
    vminsd	%r36, %r61, %r25

    vminub	%r14, %r61, %r15
    vminuh	%r15, %r62, %r75
    vminuw	%r17, %r63, %r85
    vminud	%r16, %r64, %r75

    vmaxsb	%r26, %r71, %r85
    vmaxsh	%r26, %r61, %r54
    vmaxsw	%r16, %r51, %r35
    vmaxsd	%r16, %r41, %r55

    vmaxub	%r11, %r61, %r53
    vmaxuh	%r12, %r55, %r55
    vmaxuw	%r16, %r46, %r56
    vmaxud	%r13, %r31, %r55

; rotates and shifts
    vrolb	%r56, %r61, %r15
    vrolh	%r31, %r61, %r25
    vrolw	%r53, %r61, %r30
    vrold	%r62, %r61, %r41

    vrorb	%r16, %r11, %r52
    vrorh	%r11, %r21, %r63
    vrorw	%r71, %r31, %r74
    vrord	%r81, %r41, %r85

    vsllb	%r16, %r51, %r86
    vsllh	%r24, %r61, %r55
    vsllw	%r69, %r71, %r55
    vslld	%r77, %r81, %r55

    vsrlb	%r21, %r81, %r50
    vsrlh	%r12, %r63, %r51
    vsrlw	%r13, %r62, %r52
    vsrld	%r64, %r63, %r53

    vsrab	%r85, %r64, %r54
    vsrah	%r76, %r65, %r15
    vsraw	%r67, %r66, %r25
    vsrad	%r58, %r67, %r36

; averages
    vavgsb	%r49, %r68, %r47
    vavgsh	%r30, %r69, %r58
    vavgsw	%r26, %r11, %r69
    vavgsd	%r16, %r21, %r75

    vavgub	%r14, %r31, %r85
    vavguh	%r15, %r41, %r45
    vavguw	%r56, %r51, %r25
    vavgud	%r87, %r61, %r15

; additions
    vaddssb	%r42, %r71, %r15
    vaddssh	%r83, %r81, %r45
    vaddssw	%r74, %r41, %r85
    vaddssd	%r65, %r61, %r75

    vaddub	%r56, %r61, %r75
    vadduh	%r47, %r61, %r65
    vadduw	%r38, %r61, %r55
    vaddud	%r29, %r61, %r55

    vaddusb	%r55, %r61, %r45
    vaddush	%r65, %r61, %r35
    vaddusw	%r74, %r61, %r25
    vaddusd	%r84, %r61, %r15

    vaddcb	%r53, %r61, %r55
    vaddch	%r13, %r61, %r55
    vaddcw	%r12, %r61, %r55
    vaddcd	%r12, %r61, %r55

; subtractions
    vsubssb	%r56, %r61, %r15
    vsubssh	%r67, %r61, %r12
    vsubssw	%r78, %r61, %r13
    vsubssd	%r89, %r61, %r45

    vsubub	%r70, %r61, %r85
    vsubuh	%r86, %r61, %r45
    vsubuw	%r46, %r61, %r13
    vsubud	%r46, %r61, %r75

    vsubusb	%r41, %r68, %r65
    vsubush	%r12, %r37, %r55
    vsubusw	%r23, %r26, %r45
    vsubusd	%r14, %r18, %r35

; compares
    vcmpeqb	%r86, %r61, %r25
    vcmpeqh	%r44, %r72, %r15
    vcmpeqw	%r20, %r83, %r55
    vcmpeqd	%r16, %r84, %r55

;	pcmpne	%r106, %r61, %r55
;	pcmpgt	%r106, %r61, %r55
;	pcmpge	%r106, %r61, %r55
;	pcmple	%r106, %r61, %r55

    vcmpltb	%r13, %r61, %r15
    vcmplth	%r14, %r61, %r24
    vcmpltw	%r15, %r61, %r38
    vcmpltd	%r16, %r61, %r45

    vcmpltub	%r19, %r11, %r75
    vcmpltuh	%r18, %r21, %r82
    vcmpltuw	%r16, %r31, %r73
    vcmpltud	%r14, %r71, %r54

; merges, high then low
    vmrghb	%r11, %r71, %r13
    vmrghh	%r72, %r67, %r27
    vmrghw	%r13, %r58, %r55
    vmrghd	%r14, %r69, %r15

    vmrglb	%r76, %r61, %r11
    vmrglh	%r26, %r11, %r62
    vmrglw	%r16, %r15, %r73
    vmrgld	%r16, %r11, %r85

    write	"end simd(int) test"
endmmx:

.end
.text
; System-instruction test. Only the tpa check on the stack pointer runs;
; the jmp to system_skip bypasses every instruction after it, so those are
; checked by the assembler only (as the printed messages state).
    alloc	70
    write	"test system instructions (assembler only)"

    addi	%sp, %sp, -32	; alloc stack frame
    write	"test tpa for sp: 0x%x64(sp)"
    tpa		%r4, %sp
    write	"tpa(sp): 0x%x64(r4)"
    addi	%sp, %sp, 32	; rollback stack frame
    
    jmp		system_skip

; --- not executed from here to system_skip ---
    ldi		%r45, 1012
    syscall
    nop		0
    sysret
    rfi

; cache-control instructions
    icbi	%r34, 16
    dcbt	%r34, 16
    dcbf	%r34, 16
    dcbi	%r34, 16


    mfspr	%r34, %lid
    mtspr	%r34, %lid
    mprobe	%r34, %r45, %r66
    retf	234567

    mfspr	%r32, %iv
    mfspr	%r32, %psr

; test system instructions
    ptc		%r10, %r45, %r11

    mfspr	%r12, %pta
    mfspr	%r12, %fpcr
    mtspr	%r11, %rsc

; test atomic fences
    fence.acquire
    fence.release
    fence.acq_rel
    fence.seq_cst

    mtdbr	%r44, %r66, 0
    mfdbr	%r55, %r66, 0
    mtibr	%r44, %r66, 0
    mfibr	%r55, %r66, 0
    mtitr	%r44, %r66, %r12
    mtdtr	%r44, %r66, %r12

;	bpa	b7, %r7
;	bpal	b7, b4, %r6
;	lpr	b7, %r6, label16

    undef
; execution resumes here after the jmp above
system_skip:
    write	"end test system instructions (assembler only)"
.end
.rodata
align 4
    d4	table_cases
    d4	label_0
    d4	label_1
    d4	label_2

table_cases:
    i4	label_0 - table_cases
    i4	label_1 - table_cases
    i4	label_2 - table_cases

.text
; Table-switch test: r4 holds the case index (1) and r5 the address of
; table_cases, whose i4 entries are label offsets relative to the table
; itself. jmpt presumably dispatches through entry r4, so "case 1" is the
; expected output (TODO confirm the jmpt scaling rules). Every case ends
; with a jmp to label_after_switch.
; The four d4 words placed before table_cases are not referenced by the
; code below.
    alloc	80
    write	"test table switch to case 1"
    ldi		%r4, 1
    ldafr	%r5, table_cases
    jmpt	%r5, %r4

label_0:
    write	"case 0"
    cmpqeq	%r12, %r24, %gz
    cmpqne	%r12, %r24, %gz
    deps	%r18, %r20, 13, 32
    depc	%r19, %r23, 13, 32
    ldi		%r12, -1234
    ldi		%r13, 3456
    jmp		label_after_switch

label_1:
    write	"case 1"
    andi	%r45, %r44, 12345
    sladd	%r14, %sp, %r12, 2
    sladd	%r12, %r23, %r44, 3
    mov		%r12, %r13
; writes 0 to %psr, then reads it back into r12
    ldi		%r24, 0
    mtspr	%r24, %psr
    mfspr	%r12, %psr
    nand	%r34, %r34, %r45
    sll		%r12, %r23, %r45
    slli	%r12, %r23, 45
    jmp		label_after_switch

label_2:
    write	"case 2"
    addi	%r34, %r34,-1
    mov		%r58, %r45
    sladd	%r12, %r15, %r30, 14
    sladd	%r12, %r15, %r30, 5
    sladd	%r12, %r15, %r30, 5
    srd		%r34, %r56, %r40
    srdi	%r34, %r56, 40
    depa	%r40, %r78, 40, 20
    sladd	%r54, %r45, %r22, 4
    sladd	%r54, %r45, %r22, 20
    ldax	%r3, %r45, %tp, 3, 55
    jmp		label_after_switch

; common exit for all cases
label_after_switch:
    write	"end table switch test"
.end
.rodata
    align	16
console_test_quad:
    quad	1.189731495357231765085759326628007e+4932
console_test_quad2:
    quad	1.23456789 + 32.0
console_test_quad3:
    quad	0.2345678901234567890123456789012345678 + 0.2
    quad	2*asin(1)
    quad	255
console_test_double:
    double	acos(sin(3.1415926)) ;-1.2345678e+200
    double	444.689679
console_test_float:
    float	0.123456789123456789e+30
    float	2.123456789122233
    float	0.0
    float	1.0
.text
; Console formatting test for the "write" pseudo-instruction: special
; registers via %s, general registers in signed/unsigned/hex form at 8 to 64
; bits, and floating-point constants loaded from the console_test_* data
; above. "%%" in a format string is presumably a literal percent sign.
    alloc	35
    write	"ip=%s(ip), eip=%s(eip), psr=%s(psr)"

    write	"end test write special regs"

    write	"\ntest write: general register"

; sp printed with every integer format and width
    write	"%%i8(sp)  = %i8(sp)"
    write	"%%i16(sp) = %i16(sp)"
    write	"%%i32(sp) = %i32(sp)"
    write	"%%i64(sp) = %i64(sp)"
    write	"%%u8(sp)  = %u8(sp)"
    write	"%%u16(sp) = %u16(sp)"
    write	"%%u32(sp) = %u32(sp)"
    write	"%%u64(sp) = %u64(sp)"
    write	"%%x8(sp)  = 0x%x8(sp)"
    write	"%%x16(sp) = 0x%x16(sp)"
    write	"%%x32(sp) = 0x%x32(sp)"
    write	"%%x64(sp) = 0x%x64(sp)"

; a selection of register names: r*, g*, tp, sp
    write	"%x64(r0)"
    write	"%x64(r1)"
    write	"%x64(r2)"
    write	"%x64(r22)"
    write	"%x64(r33)"
    write	"%x64(g0)"
    write	"%x64(g1)"
    write	"%x64(tp)"
    write	"%x64(sp)"

    write	"end test write general regs"

; each constant is loaded into r22 and printed as raw hex and as a float of
; the matching width (128, 64, 32 bits)
    ldqr	%r22, console_test_quad
    write	"r22 = %x128(r22) %f128(r22)"
    ldqr	%r22, console_test_quad2
    write	"r22 = %x128(r22) %f128(r22)"
    ldqr	%r22, console_test_quad3
    write	"r22 = %x128(r22) %f128(r22)"
    lddzr	%r22, console_test_double
    write	"r22 = %x64(r22) %f64(r22)"
    ldwzr	%r22, console_test_float
    write	"r22 = %x32(r22) %f32(r22)"

    write	"end test write fp regs"
.end