html-program

.text
; Section: carry-less multiply (clmul*) and AES round-primitive tests.
    alloc	96
    write	"test carry-less multiply"
; Encoding smoke tests for all half-word combinations (results unchecked).
    clmulll	%r34, %r21, %r22
    clmulhl	%r34, %r21, %r22
    clmulhl	%r34, %r21, %r22	; NOTE(review): duplicate of previous line — presumably clmullh was intended (cf. result_01/result_10 pairing below); confirm against ISA
    clmulhh	%r34, %r21, %r22
.rodata
align 16
; Reference inputs and the four expected 128-bit carry-less products.
vector_a:
    d8	0x7b5b546573745665
    d8	0x63746f725d53475d
vector_b:
    d8	0x4869285368617929
    d8	0x5b477565726f6e5d
result_00:
    d8	0x1d4d84c85c3440c0
    d8	0x929633d5d36f0451
result_01:
    d8	0x1bd17c8d556ab5a1
    d8	0x7fa540ac2a281315
result_10:
    d8	0x1a2bf6db3a30862f
    d8	0xbabf262df4b7d5c9
result_11:
    d8	0x1d1e1f2c592e7c45
    d8	0xd66ee03e410fd4ed
.text
    ldqr	%r12, vector_a
    ldqr	%r13, vector_b
; Each product is printed next to its precomputed expected value
; for visual comparison.
    clmulll	%r11, %r12, %r13
    ldqr	%r21, result_00
    write	"clmul: %x128(r11) %x128(r21)"
    clmulhl	%r11, %r13, %r12	; operands swapped — presumably emulates the "lh" combination; confirm
    ldqr	%r21, result_01
    write	"clmul: %x128(r11) %x128(r21)"
    clmulhl	%r11, %r12, %r13
    ldqr	%r21, result_10
    write	"clmul: %x128(r11) %x128(r21)"
    clmulhh	%r11, %r12, %r13
    ldqr	%r21, result_11
    write	"clmul: %x128(r11) %x128(r21)"

    write	"test aes"
; AES primitives: decrypt/encrypt rounds (normal and last), inverse
; mix-columns, and key-generation assist with round constant 250.
    aesdec	%r11, %r12, %r13
    aesdeclast	%r11, %r12, %r13
    aesenc	%r11, %r12, %r13
    aesenclast	%r11, %r12, %r13
    aesimc	%r11, %r12
    aeskeygenassist %r11, %r12, 250
    write	"end aes test"
.end
.text
;*****************************************************************
; ARITHMETIC
; Integer constants, add/sub/mul/div/mod, 32-bit ops, overflow and
; carry-chain instructions. Most results are printed for inspection.
;*****************************************************************
    alloc	96
    write	"test load constant (1234567)"
    ldi		%r1, 1234567
    write	"ldi: %i64(r1)"

    write	"test load long constant (123456789012345678)"
    ldi.l	%r1, 123456789012345678
    write	"ldi long: %i64(r1)"

    write	"test simple arithmetic"
    ldi		%r1, 1
    ldi		%r2, 2
    ldi		%r3, 3

    write	"add 1+2"
    add		%r4, %r1, %r2
    write	"add: %i64(r4)"

    write	"add immediate 1+6"
    addi	%r4, %r1, 6
    write	"addi: %i64(r4)"

    write	"sub 1-2"
    sub		%r4, %r1, %r2
    write	"sub: %i64(r4)"

    write	"sub from immediate 6-1"
    subfi	%r4, %r1, 6
    write	"subfi: %i64(r4)"

    write	"mul 3*4"
    ldi		%r1, 3
    ldi		%r2, 4
    mul		%r4, %r1, %r2
    write	"mul: %i64(r4)"

    write	"12 div 4"
    ldi		%r1, 12
    ldi		%r2, 4
    div		%r4, %r1, %r2
    write	"%i64(r4)"

    write	"15 mod 4"
    ldi		%r1, 15
    ldi		%r2, 4
    mod		%r4, %r1, %r2
    write	"mod: %i64(r4)"

    write	"test int32_t add"
    ldi.l	%r1, 0xFFFFFFFF
    ldi.l	%r2, 0xFFFFFFF0
    addws	%r3, %r1, %r2
    write	"add4: %i64(r3)"
    addiws.l	%r3, %r1, 0xFFFFFFFF
    write	"addis4.l: %i64(r3)"

; Encoding smoke tests: exercise register/immediate/long-immediate
; operand forms; results are not checked.
    addi	%r45, %r45, 12
    mov		%r54, %r56
    sub		%r45, %r56, %r50
    addi	%r45, %r55, -1000
    cmpdne	%r12, %r56, %r10
    subfi	%r45, %r56, -10000
    subfi	%r45, %r56, -20000
    cmpdeq	%r13, %r56, %r50
    add		%r45, %r56, %r50
    addi	%r45, %r56, -10000
    mul		%r45, %r56, %r50
    muli	%r45, %r56, -10000
    mov		%r55, %r20
    ldi		%r55, 1200
    ldi		%r55, 987654
    ldi.l	%r56, 98765432198765432
    addi	%r12, %r13, -789
    cmpdne	%r14, %r13, %r77
    nand	%r43, %r44, %r34
    nor		%r43, %r44, %r34
    addi	%r56, %sp, 0
    ; callr	%r0, quadrat
    add		%r56, %sp, %sp

    ldi.l	%r55, -9223372036854775808
    addi	%r56, %sp, -64
    subfi.l	%r55, %r56,12345678901234567
    nor		%r12, %r14, %r14
    addi	%r56, %sp, -64
    nor		%r12, %r14, %r14
    subfi.l	%r55, %r56, 12345678901234567
    addi	%r56, %sp, -64
    subfi.l	%r55, %r56, -12345678901234567
    addi	%r56, %sp, -64
    subfi.l	%r55, %r56, -12345678901234567
    addi.l	%r45, %r56, 12345678



    ldi.l	%r5, 0xaFFFFFFF12345677
    ldi.l	%r6, 0xaFFFFFFF12345678

    write	"test signed overflow: %i64(r5) %i64(r6)"

    write	"add overflow"
    addo	%r2, %r5, %r6
    write	"addo: %i64(r2)"

    write	"subtract overflow"
    subo	%r2, %r5, %r6
    write	"subo: %i64(r2)"

    write	"test unsigned add carry"
    ldi		%r7, -1
    ldi		%r5, -2
    ldi		%r6, -1
    addaddc	%r2, %r5, %r6, %r7
    write	"addaddc: %u64(r5) %u64(r6) %u64(r7) => %i64(r2)"

    write	"test unsigned subtract borrow"
    ldi		%r7, -1
    ldi		%r5, 12
    ldi		%r6, -1
    subsubb	%r2, %r5, %r6, %r7
    write	"subsub: %u64(r5) %u64(r6) %u64(r7) => %i64(r2)"

; Fused three-operand arithmetic forms (encoding tests only).
    muladd	%r34, %r45, %r67, %r80
    mulsub	%r34, %r45, %r67, %r80
    mulsubf	%r34, %r45, %r67, %r80
    addadd	%r34, %r45, %r67, %r80
    addsub	%r34, %r45, %r67, %r80
    subsub	%r34, %r45, %r67, %r80

.end
.text
; Section: atomic memory operations — fetch-and-op (ldadd*/ldand*/ldor*/
; ldxor*/min/max), compare-and-swap, and atomic load/store, each in the
; b/h/w/d (8/16/32/64-bit) widths and the four memory orderings
; relaxed/acquire/release/acq_rel.
    alloc 96
    write "test atomic fetch-op"
    addi %r5, %sp, -64		; r5 = scratch address on the stack used as the atomic target
    write "atomic base: %x64(r5)"
    ldi  %r10, 5		; operand value for fetch-ops
    ldi  %r12, 10		; CAS: expected/old value register
    ldi  %r56, 5		; CAS: replacement value

    write "test amoadd"

    ldaddb.relaxed %r4, %r5, %r10
    ldaddb.acquire %r4, %r5, %r10
    ldaddb.release %r4, %r5, %r10
    ldaddb.acq_rel %r4, %r5, %r10

    ldaddh.relaxed %r4, %r5, %r10
    ldaddh.acquire %r4, %r5, %r10
    ldaddh.release %r4, %r5, %r10
    ldaddh.acq_rel %r4, %r5, %r10

    ldaddw.relaxed %r4, %r5, %r10
    ldaddw.acquire %r4, %r5, %r10
    ldaddw.release %r4, %r5, %r10
    ldaddw.acq_rel %r4, %r5, %r10

    ldaddd.relaxed %r4, %r5, %r10
    ldaddd.acquire %r4, %r5, %r10
    ldaddd.release %r4, %r5, %r10
    ldaddd.acq_rel %r4, %r5, %r10

    write "test amo-binary"

    ldandw.relaxed %r4, %r5, %r10
    ldandw.acquire %r4, %r5, %r10
    ldandw.release %r4, %r5, %r10
    ldandw.acq_rel %r4, %r5, %r10

    ldorw.release %r4, %r5, %r10
    ldorw.acq_rel %r4, %r5, %r10

    ldxorw.relaxed %r4, %r5, %r10
    ldxorw.relaxed %r4, %r5, %r10

    write "test amomin"
    ldsminw.acquire %r4, %r5, %r10
    ldsmind.acq_rel %r4, %r5, %r10

    ldsmaxb.relaxed %r4, %r5, %r10
    ldsmaxh.acquire %r4, %r5, %r10
    ldsmaxw.release %r4, %r5, %r10
    ldsmaxd.acq_rel %r4, %r5, %r10

    write "test amominu"

    lduminb.relaxed %r4, %r5, %r10
    ldumind.acquire %r4, %r5, %r10
    ldumaxd.release %r4, %r5, %r10
    ldumaxw.release %r4, %r5, %r10

    write "test cas"

    casb.relaxed %r12, %r5, %r56
    casb.acquire %r12, %r5, %r56
    casb.release %r12, %r5, %r56
    casb.acq_rel %r12, %r5, %r56

    cash.relaxed %r12, %r5, %r56
    cash.acquire %r12, %r5, %r56
    cash.release %r12, %r5, %r56
    cash.acq_rel %r12, %r5, %r56

    casw.relaxed %r12, %r5, %r56
    casw.acquire %r12, %r5, %r56
    casw.release %r12, %r5, %r56
    casw.acq_rel %r12, %r5, %r56

    casd.relaxed %r12, %r5, %r56
    casd.acquire %r12, %r5, %r56
    casd.release %r12, %r5, %r56
    casd.acq_rel %r12, %r5, %r56

    write "test load atomic relaxed"
    ldab.relaxed %r12, %r5
    ldah.relaxed %r12, %r5
    ldaw.relaxed %r12, %r5
    ldad.relaxed %r12, %r5
    ldaq.relaxed %r12, %r5

    write "test load atomic acquire"
    ldab.acquire %r12, %r5
    ldah.acquire %r12, %r5
    ldaw.acquire %r12, %r5
    ldad.acquire %r12, %r5
    ldaq.acquire %r12, %r5

    write "test store atomic relaxed"
    stab.relaxed %r12, %r5
    stah.relaxed %r12, %r5
    staw.relaxed %r12, %r5
    stad.relaxed %r12, %r5
    staq.relaxed %r12, %r5

    write "test store atomic release"
    stab.release %r12, %r5
    stah.release %r12, %r5
    staw.release %r12, %r5
    stad.release %r12, %r5
    staq.release %r12, %r5

.end
.text
; Section: data-section addressing (hi/lo relocations) and the full
; load/store instruction matrix: plain, indexed (base+index<<scale+disp),
; long-displacement (.l), and pre/post-modify (mia/mib) forms.
.data
data_lbl:
    d1	25
    d1	26
    d1	27
    d1	28

.text
program_start:
; Here we test references to data section.
; Absolute offset from begin of section
    write	"base addressing"
    alloc	96
    ldar	%r17, program_start
    ldi		%r12, data_lbl
    write	"data_lbl: %i64(r12)"

; data_hi/data_lo split a data-section offset into relocatable halves.
    ldi		%r12, data_hi(data_lbl)
    write	"data_hi(data_lbl): %i64(r12)"
    ldi		%r12, data_lo(data_lbl)
    write	"data_lo(data_lbl): %i64(r12)"
    ldafr	%r13, data_lbl
    write	"ldafr(data_lbl): %x64(r13)"
    ldafr.l	%r13, data_lbl
    write	"ldafr(data_lbl): %x64(r13)"

    addi	%r13, %r17, data_hi(data_lbl)
    write	"r13     %i64(r13)"
    addi	%r14, %r13, data_lo(data_lbl)+0
    write	"r14     %i64(r14)"

; Load the four bytes defined at data_lbl and check them by eye.
    addi	%r13, %r17, data_hi(data_lbl)
    write	"r13     %i64(r13)"
    ldbz	%r25, %r13, data_lo(data_lbl)+0
    ldbz	%r26, %r13, data_lo(data_lbl)+1
    ldbz	%r27, %r13, data_lo(data_lbl)+2
    ldbz	%r28, %r13, data_lo(data_lbl)+3
    write	"r25     %i64(r25)" ; must be 25
    write	"r26     %i64(r26)" ; must be 26
    write	"r27     %i64(r27)" ; must be 27
    write	"r28     %i64(r28)" ; must be 28

; test load context
    lddz	%r1, %sp, -16
    std		%r1, %sp, -16
    jmp		skipaddr
    jmp.l	skipaddr	; NOTE(review): unreachable after previous jmp — encoding test only

; test indexed load/store (instructions below are skipped at run time)
    stbx	%r12, %r15, %r30, 4, 14
    sthx	%r12, %r15, %r30, 4, 14
    stwx	%r12, %r15, %r30, 4, 14
    stdx	%r12, %r15, %r30, 4, 14

    ldaq.relaxed %r30, %r56
    staq.relaxed %r43, %r56

; shift-left-and-add/sub forms
    sladd	%r43, %r56, %r23, 4
    slsub	%r43, %r56, %r23, 42
    slsubf	%r43, %r56, %r23, 12

    ldwz	%r30, %r5, 66*4	; load mid
    lddzx	%r40, %tp, %r30, 0, 4	; load base

    lddsx	%r12, %r23, %r40, 3, 114
    lddsx	%r12, %r23, %r40, 3, 114
    lddzx	%r12, %r23, %r40, 3, 114
    lddzx	%r12, %r23, %r40, 3, 114
    stwx	%r12, %r23, %r40, 3, 114
    stdx	%r12, %r23, %r40, 3, 114

    ldbsx	%r12, %r23, %r40, 3, 114
    ldbsx	%r12, %r23, %r40, 3, 114
    ldbzx	%r12, %r23, %r40, 3, 114
    ldbzx	%r12, %r23, %r40, 3, 114
    stbx	%r12, %r23, %r40, 3, 114
    stbx	%r12, %r23, %r40, 3, 114

    ldhsx	%r12, %r23, %r40, 3, 114
    ldhsx	%r12, %r23, %r40, 3, 114
    ldhzx	%r12, %r23, %r40, 3, 114
    ldhzx	%r12, %r23, %r40, 3, 114
    sthx	%r12, %r23, %r40, 3, 114
    sthx	%r12, %r23, %r40, 3, 114

.text
; LOAD/STORE
; Full width matrix: b=byte, h=half, w=word, d=double, q=quad;
; z=zero-extend, s=sign-extend; x=indexed; .l=long displacement;
; mia/mib=post/pre-modify addressing.
    sladd	%r54, %r56, %r12, 5

    ldbz	%r16, %r45, 8900
    ldbs	%r15, %r46, 8900
    ldbzx	%r54, %r56, %r12, 2, 37
    ldbsx	%r53, %r65, %r12, 2, 37
    ldbzx.l	%r54, %r56, %r12, 2, 37000000
    ldbsx.l	%r53, %r65, %r12, 2, -37000000
    ldbzmia	%r52, %r75, 10
    ldbsmia	%r51, %r76, 10
    ldbzmib	%r52, %r75, 10
    ldbsmib	%r51, %r76, 10
    stbmia	%r51, %r76, 10
    stbmib	%r52, %r75, 10

    ldhz	%r12, %r45, 8900
    ldhs	%r12, %r45, 8900
    ldhzx	%r54, %r56, %r12, 3, -157
    ldhsx	%r54, %r56, %r12, 2, 237
    ldhzx.l	%r54, %r56, %r12, 2, 37000000
    ldhsx.l	%r53, %r65, %r12, 2, -37000000
    ldhzmia	%r54, %r56, 12
    ldhsmia	%r54, %r56, -60
    ldhzmib	%r54, %r56, 12
    ldhsmib	%r54, %r56, -60
    sthmia	%r51, %r76, 10
    sthmib	%r52, %r75, 10

    ldwz	%r12, %r45, 8900
    ldws	%r12, %r45, 8900
    ldwzx	%r54, %r56, %r12, 2, 7
    ldwsx	%r54, %r56, %r12, 2, 7
    ldwzx.l	%r54, %r56, %r12, 2, 37000000
    ldwsx.l	%r53, %r65, %r12, 2, -37000000
    ldwzmia	%r54, %r56, 12
    ldwsmia	%r54, %r56, 32
    ldwzmib	%r54, %r56, 12
    ldwsmib	%r54, %r56, 32
    stwmia	%r51, %r76, 10
    stwmib	%r52, %r75, 10

    lddz	%r54, %r56, 5600
    ldds	%r54, %r56, 5600
    lddz.l	%r53, %r46, 98765432
    lddz	%r52, %r45, -5600
    lddz.l	%r51, %r55, -98765432
    lddzx	%r50, %r56, %r12, 2, 37
    lddsx	%r50, %r56, %r12, 2, 37
    lddzx.l	%r54, %r56, %r12, 2, 37000000
    lddsx.l	%r53, %r65, %r12, 2, -37000000
    lddzmia	%r57, %r56, -12
    lddzmia	%r57, %r56, -12
    lddsmia	%r57, %r56, -12
    lddsmia	%r57, %r56, -12
    lddzmib	%r57, %r56, -12
    lddzmib	%r57, %r56, -12
    lddsmib	%r57, %r56, -12
    lddsmib	%r57, %r56, -12
    stdmia	%r51, %r76, 10
    stdmib	%r52, %r75, 10

    ldq		%r16, %r45, 8900
    ldq.l	%r16, %r45, 8900000
    ldq.l	%r16, %r45, -8900000
    ldqx	%r54, %r56, %r12, 2, 37
    ldqx.l	%r54, %r56, %r12, 2, 37000000
    ldqx.l	%r54, %r56, %r12, 2, -37000000
    ldqmia	%r52, %r75, 10
    ldqmia	%r52, %r75, 10
    ldqmib	%r52, %r75, 10
    ldqmib	%r52, %r75, 10
    stqmia	%r51, %r76, 10
    stqmib	%r52, %r75, 10

    stb		%r12, %r45, 8900
    sth		%r12, %r45, 8900
    stw		%r12, %r45, 8900
    std		%r12, %r45, 890*8

    lddz	%r12, %r45, 8048
    std		%r12, %r45, 8064
    lddzx	%r12, %r45, %r13, 3, 7
    stdx	%r12, %r45, %r13, 3, 7

    lddz	%r60, %r55, 56
    lddz	%r60, %r56, 56
    lddz	%r46, %r55, 120
    std		%r47, %r55, 56

    lddz	%r60, %sp, 624
    std		%r60, %sp, 624
    lddzx	%r60, %sp, %r12, 3, 28
    stdx	%r60, %sp, %r12, 3, 26
    lddz	%r56, %r57, 567
    std		%r56, %r57, 567

    ldwz	%r34, %r12, 900
    lddz	%r34, %r12, 900
    stw		%r23, %r12, 900
    std		%r23, %r12, 900

    ldq		%r34, %r13, 55*16
    stq		%r35, %r13, 55*16
    ldqx	%r34, %r13, %r45, 3, 80
    stqx	%r34, %r13, %r45, 3, 80

skipaddr:
    nop	0
.end
.text
; Section: bit-counting instructions — population count (cntpop),
; count leading zeros (cntlz), count trailing zeros (cnttz).
; The trailing immediate varies to exercise the operand field.
    alloc	25
    ldi.l	%r23, 0x1234567890abcdef
    write	"test population statistic instructions"
    cntpop	%r12, %r23, 3
    write	"cntpop: %i64(r12)"
    cntlz	%r12, %r23, 0
    write	"cntlz %i64(r12)"
    cnttz	%r12, %r23, 1
    cntlz	%r12, %r23, 2
    cnttz	%r12, %r23, 3
    cntlz	%r12, %r23, 4
    cnttz	%r12, %r23, 5
.end.text
    write	"test bit reverse instruction (permb)"
; Section: permb generalized bit-permute. The 6-bit immediate selects
; which levels of the bit-index are reversed: 63 reverses all bits;
; clearing low bits of the mask reverses progressively larger groups
; (pairs, nibbles, 1/2/4-byte units), as the write texts below describe.
    alloc	80
    ldi.l	%r55, 0x1234567890ABCDEF
    write	"initial value: %x64(r55)"
    permb	%r55, %r55, 63
    permb	%r56, %r78, 63
    write	"r55 %x64(r55) %b64(r55)"
    permb	%r55, %r55, 63	; reversing twice restores the original value
    write	"r55 %x64(r55) %b64(r55)"

    permb	%r56, %r55, 0b111111 ;63
    write	"reverse bits: %x64(r56)"

    permb	%r56, %r55, 0b111110  ;32+16+8+4+2
    write	"reverse bit-pairs: %x64(r56)"

    permb	%r56, %r55, 0b111100  ;32+16+8+4
    write	"reverse nibbles (4-bits): %x64(r56)"

    permb	%r56, %r55, 0b111000 ;32+16+8
    write	"reverse 1bytes: %x64(r55) => %x64(r56)"

    permb	%r56, %r55, 0b110000  ;32+16
    write	"reverse 2bytes: %x64(r55) => %x64(r56)"

    permb	%r56, %r55, 0b100000  ;32
    write	"reverse 4bytes: %x64(r55) => %x64(r56)"
.end.text
    alloc	46
    write	"test bitwise logical"
; Section: bitwise logical ops in register, short-immediate, and
; long-immediate (.l) forms, plus negated variants (andn/orn/nor/nand/xnor).
    and		%r23, %r25, %r45
    andi	%r23, %r25, 12345
    andi.l	%r23, %r25, 1234567890
    andn	%r23, %r25, %r45
    andni	%r23, %r25, 12345
    or		%r23, %r25, %r45
    ori		%r23, %r25, 12345
    ori.l	%r23, %r25, 1234567890
    orn		%r23, %r25, %r45
    orni	%r23, %r25, 12345
    xor		%r23, %r25, %r45
    xori	%r23, %r25, 12345
    xori.l	%r23, %r25, 1234567890
    nor		%r23, %r25, %r45
    nand	%r23, %r25, %r45
    xnor	%r23, %r25, %r45
.end.text
    write	"blti, test memory"
; Section: tight load loop — repeatedly reads the 8 quadwords at
; test_memory (index wraps via andi ...,7) while r14 counts iterations;
; bdlti.l loops until the counter reaches 200000.
.data
align 8
test_memory:
    d8	0
    d8	1
    d8	2
    d8	3
    d8	4
    d8	5
    d8	6
    d8	7
.text
    alloc	20
    ldafr	%r12, test_memory
    write	"test_memory: %x64(r12)"
    ldi		%r11, 0		; r11 = wrapping index 0..7
    ldi		%r14, 0		; r14 = iteration counter
memory_loop: (32)
    lddzx	%r13, %r12, %r11, 3, 0
    addi	%r11, %r11, 1
    addi	%r14, %r14, 1
    andi	%r11, %r11, 7	; keep index inside the 8-element table
; fast_check
    bdlti.l	%r14, 200000, memory_loop
    write	"counter: %i64(r14)"
.end.text
    alloc	20
    write	"test compare-with-zero-and-long-branch"
; Section: compare-register-against-%gz (zero register) branches, each in
; short and long (.l) forms, each with a forward and a backward target.
; The unconditional bdeq %r2,%r2 below exits immediately, so the matrix
; is assembled (encoding/relocation test) but never fully executed.
compare_with_zero_test_continue:
compare_with_zero_backward_target:
    addi	%r2, %r2, 1
    bdeq	%r2, %r2, compare_with_zero_test_exit	; always taken: r2 == r2

    bdeq	%r1, %gz, compare_with_zero_forward_target
    bdeq.l	%r1, %gz, compare_with_zero_forward_target
    bdeq	%r1, %gz, compare_with_zero_backward_target
    bdeq.l	%r1, %gz, compare_with_zero_backward_target
    bdne	%r1, %gz, compare_with_zero_forward_target
    bdne.l	%r1, %gz, compare_with_zero_forward_target
    bdne	%r1, %gz, compare_with_zero_backward_target
    bdne.l	%r1, %gz, compare_with_zero_backward_target
    bdlt	%r1, %gz, compare_with_zero_forward_target
    bdlt.l	%r1, %gz, compare_with_zero_forward_target
    bdlt	%r1, %gz, compare_with_zero_backward_target
    bdlt.l	%r1, %gz, compare_with_zero_backward_target
    bdle	%r1, %gz, compare_with_zero_forward_target
    bdle.l	%r1, %gz, compare_with_zero_forward_target
    bdle	%r1, %gz, compare_with_zero_backward_target
    bdle.l	%r1, %gz, compare_with_zero_backward_target
    bdgt	%r1, %gz, compare_with_zero_forward_target
    bdgt.l	%r1, %gz, compare_with_zero_forward_target
    bdgt	%r1, %gz, compare_with_zero_backward_target
    bdgt.l	%r1, %gz, compare_with_zero_backward_target
    bdge	%r1, %gz, compare_with_zero_forward_target
    bdge.l	%r1, %gz, compare_with_zero_forward_target
    bdge	%r1, %gz, compare_with_zero_backward_target
    bdge.l	%r1, %gz, compare_with_zero_backward_target

compare_with_zero_forward_target:
    jmp		compare_with_zero_test_continue
compare_with_zero_test_exit:
    write	"end test compare-with-zero-and-long-branch"
.end
.text
; Section: subroutine call/return — defines quadrat (computes A**4 of the
; value in r33 via two squarings), a calling sequence with a register
; frame window (alloc/callr/ret), and the call-variant encodings.

call_code_target:

.rodata
call_data_target:

.text
    jmp	callexample
;*****************************************************************
; Function  compute A**4 of parameter A, passed in register r33
;*****************************************************************
quadrat:
    write	"function quadrat entered: r0=%x128(r0)"
    alloc	93
    write	"rsc     %s(rsc)"
    write	"psr     %s(psr)"
    write	"rsc     %s(rsc)"
    mul	%r33, %r33, %r33	; r33 = A**2
    mul	%r33, %r33, %r33	; r33 = A**4
    write	"r0=%x128(r0) r33=%i64(r33)"
    write	"%m(dump)"
;	mtspr	%r45, psr
    write	"function quadrat exited"
    ret
end_quadrat:

;*****************************************************************
; Example of calling sequence with branch prediction
callexample:
    alloc	91
    ldi.l	%r90, 0x1234567890abcdef
    write	"arg3 %x64(r90)"
    srpi	%r89, %r90, %r90, 16
    write	"arg2 %x64(r89)"
    srpi	%r88, %r90, %r90, 16
    write	"arg1 %x64(r88)"
    ldi		%r87, 7		; setup arguments
;   write	"%m(dump)"
    write	"rsc: %s(rsc)"
    write	"function quadrat called"
    callr	%r86, quadrat	; r86 receives the packed return state
    write	"rsc: %s(rsc)"
; Rest instructions after return from subroutine
;*****************************************************************
.text	; return to code section

; Here we test registers used by ABI (application binary interface)
; Check loader.
    write	"sp=%x64(sp) tp=%x64(tp) r0=%x128(r0)"
    write	"rsc: %s(rsc)"
    write	"psr: %s(psr)"
    write	"r14: %x64(r14)"
    write	"reta: %i64(r72)"		; out return address
    write	"retv: %i64(r73)"		; out return value
    write	"rsc: %s(rsc)"
    write	"rsc: %s(psr)"
    ldi.l	%r11, 0x407d8bffffccccff
    write	"r11: %x64(r11)"
    addi.l	%r12, %r11, 0x400000
    write	"r12: %x64(r12)"
    xor		%r20, %r19, %r11
    addi.l	%r20, %r20, 0x400000
    ldi		%r10, 10
    ldi		%r11, 11
    cmpdlt	%r2, %r11, %r10
    write	"%i64(r11) %i64(r10)"
    jmp		call_exit

; Call-variant encodings (skipped at run time by the jmp above):
; register-indirect, memory-indirect, and PLT call forms.
    callr	%r42, quadrat
    callri	%r42, %r34, %gz
    callmi	%r42, %r34, 468
    callplt	%r42, call_data_target
    callri	%r42, %r34, %gz

call_exit:
    write	"end call test"

.end
.text
; Section: register-stack behaviour across nested and recursive calls.
; 'func' demonstrates the callee seeing the caller's top registers
; renumbered from r0; 'rekurs' recurses r1 times and returns with retf.
    alloc	47
    write	"test recursive calls"
    ldi.l	%r46, 0x7FFFFFFFFFFFFFFF		; comment
    ldi.l	%r46, 0x8000000000000000
    addi	%r46, %r46, -1		; wraps to 0x7FFF...FFFF
    write	"%i64(r46)"

    mfspr	%r20, %rsc	; read register-stack control SPR

    alloc	54		; extend frame to 54 regs
    ldi		%r48, 1		; 
    ldi		%r53, 3		; 1 arg (33+16)
    ldi		%r52, 2		; 2 arg (34+16)
    ldi		%r51, 1		; 3 arg (35+16)
    write	"rsc: %s(rsc)"
    callr	%r50, func	; call func subroutine, safe 50 regs
    write	"r51=%i64(r51) rsc=%s(rsc)"
    ldi		%r53, 10	; recursion depth argument
    callr	%r52, rekurs
    write	"rsc: %s(rsc)"
    write	"rsp: %s(rsp)"
;   write	"%m(dump)"
    jmp	smallend
func:
; at entry point func subroutine has 4 regs in frame
    alloc	8   ; extend frame from 4 to 8 regs
    write	"r0      %x128(r0)"		; print packed caller frame and return address
    write	"r1=%i64(r1) r2=%i64(r2) r3=%i64(r3)" ; print args
    ldi		%r1, 12345	; return value in first arg slot
    ret

rekurs:
; Recursive subroutine: counts r1 down to zero, one frame per level.
    alloc	4
    write	"r0=%x128(r0) r1=%i64(r1)"
    write	"rsc: %s(rsc)"
    write	"rsp: %s(rsp)"
    addi	%r3, %r1, -1	; r3 = depth-1, becomes callee's r1
    ldi		%r2, 0
    bdeq	%r1, %r2, rekret	; stop recursion at zero
;	cneq	%r1, %r2, 1, 0
    callr	%r2, rekurs
rekret:
    write	"rsp: %s(rsp)"
    write	"r0: %x128(r0)"
    retf	0
smallend:
    nop		0
    nop		111
    alloc	96
    write	"end_call_recursive"
.end
.text
; Section: minimal call example with register-window renumbering,
; annotated step by step in the original comments below.
    ; at the beginning of the program, the register stack is empty
    alloc	54   ; expand frame to 54 registers
    ehadj	simple_func_end
    ldi		%r47, 1  ; will be saved when called
    ldi		%r53, 3  ; first argument
    ldi		%r52, 2  ; second argument
    ldi		%r51, 1  ; third argument
    ; func procedure call, all registers up to 50 will be saved,
    ; return address, eip, frame size (50) are saved in r50
    callr	%r50, simple_func
    ; at this point, after returning, the frame will be again 53
    jmp		simple_func_end
simple_func:
    ; at the starting point, the func procedure has a 5-register frame
    ; their previous numbers are 50, 51, 52, 53, new - 0, 1, 2, 3
    ; extend the frame to 10 registers (another 4,5,6,7,8,9)
    alloc	10
    write	"r0 = %x128(r0)"	; print packed return info
    write	"r1 = %i64(r1)"	; print 1st argument
    write	"r2 = %i64(r2)"	; print 2nd argument
    write	"r3 = %i64(r3)"	; print 3rd argument
    ret
simple_func_end:
    nop		123
.end
.text
; Section: multiprecision arithmetic via compare-generated carries and
; the dedicated carry/borrow instructions (addc/subb/addaddc/subsubb).
    write "example of carry/borrow testing"
    alloc	96

; 256-bit add (r30,r31,r32,r33) + (r40,r41,r42,r43) => (r50,r51,r52,r53)
; NOTE(review): the code below initializes r34, not r32, and later adds
; r34+r42 — r32 is never set. Presumably r32 was intended throughout;
; confirm against the original test's intent.
    ldi	%r30, -1
    ldi	%r31, -1
    ldi	%r34, -1
    ldi	%r33, -1

    ldi	%r40, 1
    ldi	%r41, 0
    ldi	%r42, 0
    ldi	%r43, 0

; throw add
    cmpdeq	%r10, %r30, %r40	; add carry out
    add		%r50, %r30, %r40	; add
    cmpdeqi	%r12, %r31, 1
    addi	%r51, %r31, 1

    cmpdeq	%r12, %r31, %r41	; add carry out
    add		%r51, %r31, %r41	; add
    cmpdeq	%r14, %r34, %r42	; add carry out
    add		%r52, %r34, %r42	; add
    cmpdeq	%r8, %r33, %r43	; add carry out
    add		%r53, %r33, %r43	; add
    write	"add carryis"
    addi	%r51, %r51, 1
    addi	%r52, %r52, 1
    addi	%r53, %r53, 1
; set last carry
    ldi		%r54, 1
    ldi		%r54, 0
    write	"multiprecision add:\nr50,r51,r52,r53,r54 = %x64(r50) %x64(r51) %x64(r52) %x64(r53) %x64(r54)"

    ldi.l	%r40, 0x7fffffffffffffff
    mulh	%r40, %r40, %r41	; high 64 bits of the 128-bit product
    write	"r40     %x64(r40)"

    ldi		%r12, 12345
    ldi.l	%r12, 12345678900

;	ldi	%r14, 0xFFFFFFFFF0
;	ld8	%r13, %r14, 0

; Dedicated carry-chain instruction encodings (results unchecked).
    addc	%r12, %r14, %r46
    addc	%r12, %r14, %r46
    subb	%r12, %r14, %r46
    subb	%r12, %r14, %r46
    addaddc	%r12, %r14, %r46, %r23
    addaddc	%r12, %r14, %r46, %r22
    subsubb	%r12, %r14, %r46, %r13
    subsubb	%r12, %r14, %r46, %r14
    write	"end carry test"
    nop	11111
.end
.text
; Section: compare instructions — d=64-bit, w=32-bit; eq/ne/lt/le/gt/ge,
; signed and unsigned (u), register and immediate (i, .l) forms, plus the
; gt/ge pseudo-instruction aliases that assemble to lt/le with swapped
; operands.
    write	"test compare"
    alloc	96
    ldi		%r20, 4
    ldi		%r21, 3
    ldi		%r22, -4
    ldi		%r23, -12
    write	"test compare instructions"
    cmpdeq	%r12, %r20, %r21
    cmpdlt	%r12, %r20, %r21
    cmpdltu	%r12, %r20, %r21
    cmpdeqi	%r12, %r20, 123456
    cmpdlti	%r12, %r20, 123456
    cmpdltui	%r12, %r20, 123456
    cmpdne	%r12, %r20, %r21
    cmpdnei	%r12, %r20, 123456
    cmpdgti	%r12, %r20, 123456
    cmpdgtui	%r12, %r20, 123456
    cmpdle	%r12, %r20, %r21
    cmpdleu	%r12, %r20, %r21

    cmpdgei	%r12, %r20, 123456
    cmpdgeui	%r12, %r20, 123456
    cmpdlei	%r12, %r20, 123456
    cmpdleui	%r12, %r20, 123456

    cmpweq	%r12, %r20, %r21
    cmpwlt	%r12, %r20, %r21
    cmpwltu	%r12, %r20, %r21
    cmpweqi	%r12, %r20, 123456
    cmpwlti	%r12, %r20, 123456
    cmpwltui	%r12, %r20, 123456
    cmpwne	%r12, %r20, %r21
    cmpwnei	%r12, %r20, 123456
    cmpwgti	%r12, %r20, 123456
    cmpwgtui	%r12, %r20, 123456
    cmpwle	%r12, %r20, %r21
    cmpwleu	%r12, %r20, %r21

    write	"compare aliases (pseudo-instructions)"
    cmpdgt	%r12, %r20, %r21	; cmplt	  r12, %r21, r20
    cmpdgtu	%r12, %r20, %r21	; cmpltu  r12, %r21, r20
    cmpdlti	%r12, %r20, 123456	; cmplti  r12, %r20, 123456
    cmpdltui	%r12, %r20, 123456	; cmpltui r12, %r20, 123456
    cmpdge	%r12, %r20, %r21	; cmpleq  r12, %r21, r20
    cmpdgeu	%r12, %r20, %r21	; cmpleu  r12, %r21, r20
    cmpdgti	%r12, %r20, 123456	; cmpgti  r12, %r20, 123456
    cmpdgtui	%r12, %r20, 123456	; cmpgtui r12, %r20, 123456


    cmpwgt	%r12, %r20, %r21	; cmplt4   r12, %r21, %r20
    cmpwgtu	%r12, %r20, %r21	; cmpltu4  r12, %r21, %r20
    cmpwlti	%r12, %r20, 123456	; cmplti4  r12, %r20, 123456
    cmpwltui	%r12, %r20, 123456	; cmpltui4 r12, %r20, 123456
    cmpwge	%r12, %r20, %r21	; cmpleq4  r12, %r21, r20
    cmpwgeu	%r12, %r20, %r21	; cmpleu4  r12, %r21, r20
    cmpwgti	%r12, %r20, 123456	; cmpgti4  r12, %r20, 123456
    cmpwgtui	%r12, %r20, 123456	; cmpgtui4 r12, %r20, 123456

; TESTS
    cmpdeq	%r14, %r12, %r45
    cmpdne	%r14, %r12, %r45

    cmpdeq	%r14, %r45, %r34
    cmpdeqi	%r14, %r45, 123
    cmpdeqi.l	%r14, %r45, 1234567890123
    cmpdlti	%r14, %r45, 123
    cmpdlti.l	%r14, %r45, 1234567890123
    cmpdlei	%r14, %r45, 123
    cmpdlei.l	%r14, %r45, 1234567890123
    cmpdlt	%r14, %r45, %r34
    cmpdgtui	%r14, %r45, 123
    cmpdgtui.l	%r14, %r45, 1234567890123
    cmpdgeui	%r14, %r45, 123
    cmpdgeui.l	%r14, %r45, 1234567890123
    cmpdgtu	%r14, %r45, %r34

    cmpdeq	%r41, %r34, %r56
    cmpdlt	%r66, %r45, %r57
    cmpdeqi	%r64, %r56, 0
.end.text
; Section: compare-and-branch matrix. Every condition (eq/ne/le/lt/ge/gt,
; signed and unsigned, d=64-bit and w=32-bit widths, register and
; immediate forms, short and long .l displacement) is assembled against
; both a backward and a forward target, plus bit-mask branches
; (bmall/bmnotall/bmany/bmnone) and bit-test branches (bbsi/bbci).
; The unconditional bdeq %r2,%r2 below exits immediately, so this is an
; assembler/relocation test; the matrix itself is not executed.
backward_target:
    alloc	61
    addi	%r2, %r2, 1
    bdeq	%r2, %r2, branch_test_exit	; always taken: r2 == r2

    bdeq	%r23, %r34, backward_target
    bdeq.l	%r23, %r34, backward_target
    bdeq	%r23, %r34, forward_target
    bdeq.l	%r23, %r34, forward_target
    bdeqi	%r23,34, backward_target
    bdeqi.l	%r23,34, backward_target
    bdeqi	%r23,34, forward_target
    bdeqi.l	%r23,34, forward_target

    bweq	%r23, %r34, backward_target
    bweq.l	%r23, %r34, backward_target
    bweq	%r23, %r34, forward_target
    bweq.l	%r23, %r34, forward_target
    bweqi	%r23,34, backward_target
    bweqi.l	%r23,34, backward_target
    bweqi	%r23,34, forward_target
    bweqi.l	%r23,34, forward_target

    bdne	%r23, %r34, backward_target
    bdne.l	%r23, %r34, backward_target
    bdne	%r23, %r34, forward_target
    bdne.l	%r23, %r34, forward_target
    bdnei	%r23,34, backward_target
    bdnei.l	%r23,34, backward_target
    bdnei	%r23,34, forward_target
    bdnei.l	%r23,34, forward_target

    bwne	%r23, %r34, backward_target
    bwne.l	%r23, %r34, backward_target
    bwne	%r23, %r34, forward_target
    bwne.l	%r23, %r34, forward_target
    bwnei	%r23,34, backward_target
    bwnei.l	%r23,34, backward_target
    bwnei	%r23,34, forward_target
    bwnei.l	%r23,34, forward_target

    bdle	%r23, %r34, backward_target
    bdle.l	%r23, %r34, backward_target
    bdle	%r23, %r34, forward_target
    bdle.l	%r23, %r34, forward_target
    bdlei	%r23,34, backward_target
    bdlei.l	%r23,34, backward_target
    bdlei	%r23,34, forward_target
    bdlei.l	%r23,34, forward_target

    bwle	%r23, %r34, backward_target
    bwle.l	%r23, %r34, backward_target
    bwle	%r23, %r34, forward_target
    bwle.l	%r23, %r34, forward_target
    bwlei	%r23,34, backward_target
    bwlei.l	%r23,34, backward_target
    bwlei	%r23,34, forward_target
    bwlei.l	%r23,34, forward_target

    bdlt	%r23, %r34, backward_target
    bdlt.l	%r23, %r34, backward_target
    bdlt	%r23, %r34, forward_target
    bdlt.l	%r23, %r34, forward_target
    bdlti	%r23,34, backward_target
    bdlti.l	%r23,34, backward_target
    bdlti	%r23,34, forward_target
    bdlti.l	%r23,34, forward_target

    bwlt	%r23, %r34, backward_target
    bwlt.l	%r23, %r34, backward_target
    bwlt	%r23, %r34, forward_target
    bwlt.l	%r23, %r34, forward_target
    bwlti	%r23,34, backward_target
    bwlti.l	%r23,34, backward_target
    bwlti	%r23,34, forward_target
    bwlti.l	%r23,34, forward_target

    bdge	%r23, %r34, backward_target
    bdge.l	%r23, %r34, backward_target
    bdge	%r23, %r34, forward_target
    bdge.l	%r23, %r34, forward_target
    bdgeui	%r23,34, backward_target
    bdgeui.l	%r23,34, backward_target
    bdgeui	%r23,34, forward_target
    bdgeui.l	%r23,34, forward_target

    bwge	%r23, %r34, backward_target
    bwge.l	%r23, %r34, backward_target
    bwge	%r23, %r34, forward_target
    bwge.l	%r23, %r34, forward_target
    bwgeui	%r23,34, backward_target
    bwgeui.l	%r23,34, backward_target
    bwgeui	%r23,34, forward_target
    bwgeui.l	%r23,34, forward_target

    bdgt	%r23, %r34, backward_target
    bdgt.l	%r23, %r34, backward_target
    bdgt	%r23, %r34, forward_target
    bdgt.l	%r23, %r34, forward_target
    bdgti	%r23,34, backward_target
    bdgti.l	%r23,34, backward_target
    bdgti	%r23,34, forward_target
    bdgti.l	%r23,34, forward_target

    bwgt	%r23, %r34, backward_target
    bwgt.l	%r23, %r34, backward_target
    bwgt	%r23, %r34, forward_target
    bwgt.l	%r23, %r34, forward_target
    bwgti	%r23,34, backward_target
    bwgti.l	%r23,34, backward_target
    bwgti	%r23,34, forward_target
    bwgti.l	%r23,34, forward_target

    bdleu	%r23, %r34, backward_target
    bdleu.l	%r23, %r34, backward_target
    bdleu	%r23, %r34, forward_target
    bdleu.l	%r23, %r34, forward_target
    bdleui	%r23,34, backward_target
    bdleui.l	%r23,34, backward_target
    bdleui	%r23,34, forward_target
    bdleui.l	%r23,34, forward_target

    bwleu	%r23, %r34, backward_target
    bwleu.l	%r23, %r34, backward_target
    bwleu	%r23, %r34, forward_target
    bwleu.l	%r23, %r34, forward_target
    bwleui	%r23,34, backward_target
    bwleui.l	%r23,34, backward_target
    bwleui	%r23,34, forward_target
    bwleui.l	%r23,34, forward_target

    bdltu	%r23, %r34, backward_target
    bdltu.l	%r23, %r34, backward_target
    bdltu	%r23, %r34, forward_target
    bdltu.l	%r23, %r34, forward_target
    bdltui	%r23,34, backward_target
    bdltui.l	%r23,34, backward_target
    bdltui	%r23,34, forward_target
    bdltui.l	%r23,34, forward_target

    bwltu	%r23, %r34, backward_target
    bwltu.l	%r23, %r34, backward_target
    bwltu	%r23, %r34, forward_target
    bwltu.l	%r23, %r34, forward_target
    bwltui	%r23,34, backward_target
    bwltui.l	%r23,34, backward_target
    bwltui	%r23,34, forward_target
    bwltui.l	%r23,34, forward_target

    bdgeu	%r23, %r34, backward_target
    bdgeu.l	%r23, %r34, backward_target
    bdgeu	%r23, %r34, forward_target
    bdgeu.l	%r23, %r34, forward_target
    bdgeui	%r23,34, backward_target
    bdgeui.l	%r23,34, backward_target
    bdgeui	%r23,34, forward_target
    bdgeui.l	%r23,34, forward_target

    bwgeu	%r23, %r34, backward_target
    bwgeu.l	%r23, %r34, backward_target
    bwgeu	%r23, %r34, forward_target
    bwgeu.l	%r23, %r34, forward_target
    bwgeui	%r23,34, backward_target
    bwgeui.l	%r23,34, backward_target
    bwgeui	%r23,34, forward_target
    bwgeui.l	%r23,34, forward_target

    bdgtu	%r23, %r34, backward_target
    bdgtu.l	%r23, %r34, backward_target
    bdgtu	%r23, %r34, forward_target
    bdgtu.l	%r23, %r34, forward_target
    bdgtui	%r23, 34, backward_target
    bdgtui.l	%r23, 34, backward_target
    bdgtui	%r23, 34, forward_target
    bdgtui.l	%r23, 34, forward_target

    bwgtu	%r23, %r34, backward_target
    bwgtu.l	%r23, %r34, backward_target
    bwgtu	%r23, %r34, forward_target
    bwgtu.l	%r23, %r34, forward_target
    bwgtui	%r23, 34, backward_target
    bwgtui.l	%r23, 34, backward_target
    bwgtui	%r23, 34, forward_target
    bwgtui.l	%r23, 34, forward_target

; Bit-mask branches: test register against an immediate mask.
    bmall	%r23, 34, backward_target
    bmall.l	%r23, 34, backward_target
    bmall	%r23, 34, forward_target
    bmall.l	%r23, 34, forward_target

    bmnotall	%r23, 34, backward_target
    bmnotall.l	%r23, 34, backward_target
    bmnotall	%r23, 34, forward_target
    bmnotall.l	%r23, 34, forward_target

    bmany	%r23, 34, backward_target
    bmany.l	%r23, 34, backward_target
    bmany	%r23, 34, forward_target
    bmany.l	%r23, 34, forward_target

    bmnone	%r23, 34, backward_target
    bmnone.l	%r23, 34, backward_target
    bmnone	%r23, 34, forward_target
    bmnone.l	%r23, 34, forward_target

forward_target:
branch_test_exit:

    jmp		branch_exit

; Branch targets interleaved with misc shift/deposit/permute encodings.
label:
    bdeq	%r12, %r13, qwe
    srpi	%r10, %r11, %r12, 45
    depq	%r61, %r91, %r32, 10
    mbsel	%r62, %r91, %r32, %r10
    perm	%r63, %r91, %r32, %r10
qwe:
    bdne	%r15, %r46, label
    bdeq	%r25, %r45, label
    bdlt	%r25, %r44, label
    bdle	%r35, %r43, label
    bdgtu	%r35, %r42, label
    bdgeu	%r45, %r41, label
    bdgt	%r45, %r40, label
    bdltu	%r55, %r76, label
    bdnei	%r55, 140, label
    bdeqi	%r65, 141, label
    bdlti	%r65, 142, label
    bdgti	%r75, 143, label
    bdltui	%r75, 170, label
    bdgtui	%r85, 160, label

    addi.l	%r45, %r34, 1234
    bbsi	%r85, 26, label
    bbci.l	%r85, 36, label
    bbsi	%r95, 46, label
    bbci.l	%r95, 56, label

    jmpr	%r45, %r23, 1
branch_exit:
    write	"end branch test"
.end
.text
; Section: test-bit-and-branch — bbci/bbsi (bit number as immediate)
; and bbc/bbs (bit number in a register).
    alloc	61
    write	"Example of test bit and branch"
    ldi		%r19, 0x20
    ldi		%r20, 12+3
    write	"%i64(r20)"
    ldi		%r10, 0
    bbci	%r10, 10, xxx_n	; bit 10 of 0 is clear -> branch taken, block below skipped
    ldi.l	%r20, 123456789012345	; load immediate
    ldi		%r21, 321		; load immediate
    add		%r23, %r20, %r21	; add
    write	"%i64(r43)"	; NOTE(review): prints r43 but the sum is in r23 — likely a typo; path is skipped by the bbci above
xxx_n:	write	"%i64(r23)"

    ldi		%r46, 0xabcdef
    bbci	%r46, 56, branch_bit_exit
    bbsi	%r46, 56, branch_bit_exit
    ldi		%r56, 56
    bbc		%r46, %r56, branch_bit_exit
    bbs		%r46, %r56, branch_bit_exit

branch_bit_exit:
    write	"end branch_bit test"
.end.text
; Test: enumerate cpuid leaves. cpuid %r14, %r13, 0 with r13=0 returns the
; number of implemented leaves; the loop then reads each leaf via repdlt
; (repeat while r13 < r14, presumably incrementing r13 — TODO confirm).
    write	"cpuid implemented number"
    alloc	96
    ldi		%r13, 0
    cpuid	%r14, %r13, 0
    write	"cpuid len %x64(r14)"
    write	"cpuid loop"
cpuid_loop:
    cpuid	%r15, %r13, 0
    write	"cpuid[%i64(r13)] = %x64(r15)"
    repdlt	%r13, %r14, cpuid_loop
.end
.rodata
    align 16
crc32c_test_string:
    ; 43-byte standard CRC test phrase; its CRC-32C is the well-known
    ; check value 0x22620404
    ascii	"The quick brown fox jumps over the lazy dog"
.text
; Test: compute CRC-32C over the string in 16-byte chunks.
; r12 = running crc (seeded all-ones), r14 = remaining length; the crc32c
; instruction takes the byte count so the final partial chunk is handled.
; Final xor with -1 produces the conventional reflected-output CRC.
    write	"crc32c = 0x22620404 (expected)"
    alloc	20
    ldi		%r12, -1  ; crc32c
    ldi		%r15, 43 ; length
    mov		%r14, %r15
    ldafr	%r11, crc32c_test_string
crc32c_loop:
    ldqmia	%r13, %r11, 16
    crc32c	%r12, %r12, %r13, %r14
    addi	%r14, %r14, -16
    bdgt	%r14, %gz, crc32c_loop
    xori	%r12, %r12, -1
    write	"crc32c = 0x%x32(r12) (computed)"
.end.text
    alloc	61
; scaled-index load encodings (base, index, scale, displacement)
; NOTE(review): the first and third ldax lines are identical — possibly an
; intentional encoding repeat, possibly a copy-paste leftover
    ldax	%r41, %r40, %r12, 4, 112
    ldax	%r41, %r40, %r12, 3, -12
    ldax	%r41, %r40, %r12, 4, 112
    ldi.l	%r5, -1
; paired move (swap r3/r4), emitted twice to restore the original values
    mov2	%r3, %r4, %r4, %r3
    mov2	%r3, %r4, %r4, %r3


.rodata	; open text (read-only data) section
    align	16
text_lbl:	; this is label
    d1	111		; signed byte
    d1	112
    d1	113
ddd:
    align	4		; force 4-byte alignment for next data
    d1	6
    d1	7
    d1	8+0x3D	; you may use formulas!!!

.text
    write	"test addressing"

; Examples of IP-relative references.
; text_lo/text_hi split a label address into low/high parts for immediates.
    ldi		%r45, text_lo(text_lbl)
    write	"text_lo(text_lbl)=%i64(r45)"
    ldi		%r45, text_hi(text_lbl)
    write	"text_hi(text_lbl)=%i64(r45)"
    ldi		%r45, text_lbl
    write	"%i64(r45)"

; Example of access to text section.
; First get IP-relative reference to text section (+/- 64 MB from IP).
    ldar	%r45, text_lbl

; Now in r45 we have base address.
; But it IS NOT true address of 'text_lbl'.
; We have in r45 nearest (to 'text_lbl') least address, aligned on 16-bytes boundary.
; Remember add 'text_lo' part of label address at each displacement calculation.
    ldbz	%r50, %r45, text_lo(text_lbl)+0
    ldbz	%r51, %r45, text_lo(text_lbl)+1
    ldbz	%r52, %r45, text_lo(text_lbl)+2
    write	"%i64(r50)"	; must be 111
    write	"%i64(r51)"	; must be 112
    write	"%i64(r52)"	; must be 113

; Example of incorrect access to text section (without bundle alignment)
    ldbz	%r50, %r45, 0
    write	"%i64(r50)" ; must be 101 - start of 16-byte portion
.end
.text
; Test: section switching (.text/.data/.rodata interleaving), data directives
; (d1/d2/d4/d8, align, space), and basic load/store/ALU round-trips.
    alloc	96
    addi	%r20, %gz, 128
    addi	%sp, %sp, -32
    ldi.l	%r12, 0x07060504030201
    std		%r12, %sp,0

.data
    ascii	"data section marker"
    align	8
.rodata
    ascii	"rodata section marker"
    align	8

.data
    d2	1234
first_byte:
    d1	12
.text
    ldafr	%r22, first_byte

; test interval time mask
    ldi		%r22, 0xFFFFFFFFFFFFFFFF
    ldi		%r15, 11

.rodata	; open rodata (read-only data) section
    align	8
text_begin:	; this is label
    d8	1	; signed 8-bytes
    d8	-2
    d1	101	; signed byte
    d1	102
    d1	103
    align	4
    d4	10000	; signed 4byte
    d2	10000	; signed 2byte
    space	4		; insert zeroed bytes
    d2	20000
.data	; open data (read-write) section
    align	8
eexxx:	d8	12345678	; signed 8-byte
    d8	1234567890
ssxxx:	d8	123456789012
    d8	12345678901234
.rodata
    d4	4555		; signed 4-byte
    d2	4555		; signed 2-byte
    align	8
    d8	11
text2:
.text	; open code (read-execute) section

.data	; switch to data section
    d1	120
    align	2
    d2	13400
align 8
dataname:
    d4	654321890
    d4	654321890
    d8	1234545345345
    d8	6789023356977
align 8
someplaceindata:
    d8	0x0000000000000001
    d8	0x0000000000000002
    d8	0x0000000000000003
    d8	0x0000000000000004
    d8	0x0000000000000005
    d8	0x0000000000000006
    d8	0x0000000000000007
    d8	0x0000000000000008
.text
; store 100000 into someplaceindata[3], read it back, then a few ALU ops
    ldafr	%r11, someplaceindata
    ldi.l	%r15, 987777777777
    ldi		%r46, 100000
    std		%r46, %r11, 8*3
    lddz	%r46, %r11, 8*3
    write	"%i64(r46)"
    mul		%r18, %r15, %r46
    add		%r17, %r15, %r46
    andn	%r17, %r15, %r46
    cmpdlt	%r12, %r17, %r15
    write	"%i64(r15) %i64(r46) %i64(r17)"
    addi	%r17, %r17, 22
    write	"%i64(r17) %i64(r17)"
    mfspr	%r27, %itc	; read interval time counter
    write	"itc: %x64(r27)"
    write	"%m(dump)"
.end
.text
; Test: densely packed calls with different link registers. Each callr saves
; the return state into its link register; callees re-alloc their own frame.
    ; at the beginning of the program, the register stack is empty
    alloc	54   ; expand frame to 54 registers
    ldar	%r4, dense_call_test_end
    mtspr	%r4, %eip
    mtspr	%r4, %reip
    ldi		%r47, 1  ; will be saved when called
    ldi		%r53, 3  ; first argument
    ldi		%r52, 2  ; second argument
    ldi		%r51, 1  ; third argument
    ; func procedure call, all registers up to 50 will be saved,
    ; return address, eip, frame size (50) are saved in r50
check_label:
    callr	%r48, simple_func_1
    callr	%r50, simple_func_2
    callr	%r52, simple_func_3
    
    jmp	dense_call_test_end

simple_func_1:
    alloc  10
    write  "simple_func_1"
    ret

simple_func_2:
    alloc  10
    write  "simple_func_2"
    ret

simple_func_3:
    alloc  10
    write  "simple_func_3"
    ret

dense_call_test_end:
    ; nop padding so the jump target starts a fresh bundle
    nop	123
    nop	123
    nop	123
    nop	123
    nop	123
    nop	123
.end
.text
; Test: bit-field deposit. dep inserts a field of %r40 into %r30 (and vice
; versa) at offset 48, width 24; depq is the 128-bit/vector variant.
    write	"test bit-field insert (deposit)"
    alloc	96
    ldi.l	%r30, 0xaaaaaaaaaaaaaaaa
    ldi.l	%r40, 0xeeeeeeeeeeeeeeee
    dep		%r20, %r30, %r40, 48, 24
    write	"dep: %x64(r20)"
    dep		%r20, %r40, %r30, 48, 24
    write	"dep: %x64(r20)"

    write	"test vector deposit (dep16)"
    nor		%r3, %r4, %r4	; r3 = ~r4
    depq	%r5, %r3, %r4, 100
    write	"dep16: %x128(r5)"
    write	"end deposit test"
.end

.text
; Test: memory-mapped device-control registers. Reads/writes the DID and CMD
; registers of the device-config aperture and dumps the array descriptors.
    write	"test control device memory-mapped registers"
    alloc	96

    ; device_control base address
    ldi.l	%r24, DEVICE_CONFIG_VIRT_BASE

    write	"test pci"

    ldi.l	%r21, 0x1234567890abcdef

    lddz	%r20, %r24, DEVICE_CONTROL_DID
    write	"mem[DEVICE_CONTROL_DID] %x64(r20)"
    std		%r21, %r24, DEVICE_CONTROL_DID
    lddz	%r20, %r24, DEVICE_CONTROL_DID
    write	"mem[DEVICE_CONTROL_DID] %x64(r20)"

    lddz	%r20, %r24, DEVICE_CONTROL_CMD
    write	"mem[DEVICE_CONTROL_CMD] %x64(r20)"
    std		%r21, %r24, DEVICE_CONTROL_CMD
    lddz	%r20, %r24, DEVICE_CONTROL_CMD
    write	"mem[DEVICE_CONTROL_CMD] %x64(r20)"

    lddz	%r20, %r24, DEVICE_CONTROL_ARRAY_ADDRESS
    write	"mem[DEVICE_CONTROL_ARRAY_ADDRESS] (r20)"

    lddz	%r20, %r24, DEVICE_CONTROL_ARRAY_LEN
    write	"mem[DEVICE_CONTROL_ARRAY_LEN] %i64(r20)"

    ; NOTE(review): '\n' as an ldi immediate looks unusual — presumably the
    ; assembler accepts character escapes as immediates; confirm
    ldi	%r22, \n

    write	"test command"
    ldi.l	%r21, 0xabcdef1234567890
    std		%r21, %r24, DEVICE_CONTROL_CMD

    write	"end_device_control_test"
.end

.text
; Test: memory-mapped platform apertures — per-core config space (timecmp /
; interrupt vector), ROM window, video command registers, and a pixel-fill
; loop over the framebuffer (r23/r24 = x/y, r21/r22 = width/height).
    write	"test core mapping DEVICE_CONFIG_VIRT_BASE"
    alloc	96
    ldi.l	%r20, DEVICE_CONFIG_VIRT_BASE
    write	"DEVICE_CONFIG_VIRT_BASE: %x64(r20)"
    ldi.l	%r20, DEVICE_CONFIG_SPACE_SIZE
    write	"DEVICE_CONFIG_SPACE_SIZE: %x64(r20)"
    ldi.l	%r20, CONFIG_OFFSET_CORE_0
    write	"CONFIG_OFFSET_CORE_0: %x64(r20)"
    ldi.l	%r20, DEVICE_CORE_TIMECMP
    write	"DEVICE_CORE_TIMECMP: %x64(r20)"

    ldi.l	%r20, DEVICE_CONFIG_VIRT_BASE + CONFIG_OFFSET_CORE_0 * DEVICE_CONFIG_SPACE_SIZE ; core config
    ldi		%r19, 0xabcdef

    write	"test interrupt vector %x64(r20)"
    std		%r19, %r20, DEVICE_CORE_TIMECMP ; use DEVICE_CORE_INTERRUPT_VECTOR in place of DEVICE_CORE_TIMECMP for real interrupt

    write	"test timecmp"
    std		%r19, %r20, DEVICE_CORE_TIMECMP

    write	"test rom mapping ROM_VIRT_BASE"
    ldi.l	%r20, ROM_VIRT_BASE
    lddz	%r19, %r20, 0
    write	"mem[ROM_VIRT_BASE] %x64(r19)"

    write	"test video commands VIDEO_COMMAND_VIRT_BASE"
    ldi.l	%r20, VIDEO_COMMAND_VIRT_BASE
    ldi		%r21, 0x1234
    stw		%r21, %r20, 0x88	; clear
    stw		%r21, %r20, 0x8c	; redraw

    write	"video width/height base: %x64(r20)"
    ldwz	%r21, %r20, 0x80 ; width
    ldwz	%r22, %r20, 0x84 ; height
    write	"width=%i64(r21) height=%i64(r22)"	; fixed typo: was "heigth"

    write	"test video memory VIDEO_VIRT_BASE"
    ldi.l	%r20, VIDEO_VIRT_BASE
    write	"r20     %x64(r20)"

    ldi.l	%r25, 0x12345678
    stw		%r25, %r20, 0

    ldi		%r24, 0   ; y
; NOTE(review): the "(64)" annotation is presumably a label attribute
; (alignment or predicate hint) — confirm against assembler syntax
loop_y: (64)
;	write	"%i64(r24)"
    ldi	%r23, 0   ; x
loop_x:
;	add	%r25, %r23, %r24
    stb		%r25, %r20, 0	; write one byte, then bump the pointer
    addi	%r20, %r20, 1
    addi	%r23, %r23, 1
    bdlt	%r23, %r21, loop_x

    addi	%r24, %r24,1
    bdlt	%r24, %r22, loop_y
    ; debug
    write	"end test video memory"
    nop		1234567
.end
.text
; Test: exception-handling chain. eip is set to 'catch'; ehadj registers a
; cleanup ("destructor") frame; ehthrow raises with an exception context in
; eca; ehcatch/ehnext walk the destructor chain until the final catch.
    write	"begin exception test"
    alloc	96

    ldafr	%r2, catch
    mtspr	%r2, %eip

; constructor 1
    ldi		%r4, 1
    ehadj	call_destructor_1
    write	"eip: %s(eip)"
; constructor 2
    ldi		%r5, 2
    ehadj	call_destructor_2
    write	"eip: %s(eip)"

    ldi		%r3, 0xFFFFFFFFFFFF1230
    ehthrow	%r3, 0    ; set eca, jump to eip
    write	"normal execution (never occurs)"

call_destructor_2:
    write	"call_destructor_2"
    ehcatch	%r6, end_destructor_2
    ; here dtor called
    ldi		%r4, 0
end_destructor_2:
    ehnext	%r6, call_destructor_1	; chain to the next (outer) destructor
    write	"normal continue after destructor_2"

call_destructor_1:
    write	"call_destructor_1"
    ehcatch	%r6, end_destructor_1
    ; here dtor called
    ldi		%r5, 0
end_destructor_1:
    ehnext	%r6, catch
    write	"normal continue after destructor_1"

call_ret:
    write	"normal exit"
    jmp		exception_exit

catch:
    write	"caught exception, exit"
    ehcatch	%r12, exception_exit
    write	"caught exception context: r12=%x64(r12)"
exception_exit:
    nop		1234567
    nop		7654321
.end
.text
; floating-point extension example
; Covers: quad immediates (fldqri), fpcr rounding modes, double loads,
; binary/fused/unary quad ops, rounding, int<->fp conversion, minmax, merge.
    alloc	96

    write	"test float128 immediate load (low/high parts)"
    fldqri	%r12, 3.1415926115461431423612436243
    write	"fldqri: %f128(r12)"

; cycle fpcr rounding mode through 3,2,1,0 and dump it each time
    write	"test fpcr modification (rm=3)"
    ldi		%r2, 3
    mtspr	%r2, %fpcr
    write	"fpcr: %s(fpcr)"
    write	"test fpcr modification (rm=2)"
    ldi		%r2, 2
    mtspr	%r2, %fpcr
    write	"fpcr: %s(fpcr)"
    write	"test fpcr modification (rm=1)"
    ldi		%r2, 1
    mtspr	%r2, %fpcr
    write	"fpcr: %s(fpcr)"
    write	"test fpcr modification (rm=0)"
    ldi		%r2, 0
    mtspr	%r2, %fpcr
    write	"fpcr: %s(fpcr)"

    write	"compare fldqri (full mantissa) & long fldi (63-bit mantissa)"
    fldqri	%r30, 3.14159265358979323846123456789012e+400
    write	"fldqri: %x128(r30) %f128(r30)"
    flddi	%r31, 3.14159265358979323846123456789012
    write	"flddi: %x128(r31) %f64(r31)"
    write	"compare fldqri (full mantissa) & short fldi (21-bit mantissa)"
    fldqri	%r30, 3.14159265358979323846123456789012
    write	"r30     %x128(r30)"
    flddi	%r31, 3.14159265358979323846123456789012
    write	"r31     %x128(r31)"
    write	"before1"
    write	"r30     %f128(r30)"
    write	"before2"
    write	"r31     %vf64(r31)"
    write	"after"
    flddi	%r30, -12.3456789e+04
.rodata
    align	16
float64data:
    double	1.234567890123456789124141241241
    double	3.1415925678888734535345231234564561
    double	3.4566345634563456346535463463456
.text
; load three doubles, widen to quad (fextsd2sq), then exercise quad arithmetic
    ldar	%r21, float64data
    lddz	%r11, %r21, 8*0
    lddz	%r12, %r21, 8*1
    lddz	%r13, %r21, 8*2
    write	"ld8(f64): %f64(r11) %f64(r12) %f64(r13)"
    fldqri	%r14, 2.7182818289201
    write	"fldqri: %f128(r14)"

    fextsd2sq	%r11, %r11
    fextsd2sq	%r12, %r12
    fextsd2sq	%r13, %r13

    write	"test binary"
    fmulsq	%r15, %r11, %r14
    write	"fmulsq:  %f128(r15)"
    fnmulsq	%r15, %r11, %r14
    write	"fnmulsq: %f128(r15)"
    faddsq	%r15, %r11, %r14
    write	"faddsq:  %f128(r15)"
    fnaddsq	%r15, %r11, %r14
    write	"fnaddsq: %f128(r15)"
    fsubsq	%r15, %r14, %r11
    write	"fsubsq:  %f128(r15)"
    fdivsq	%r15, %r14, %r11
    write	"fdivsq:  %f128(r15)"

    write	"test fused fma"
;   jmp	skipfma
    fmaddsq	%r15, %r14, %r11, %r12
    write	"fmaddsq:  %f128(r15)"
    fnmaddsq %r15, %r14, %r11, %r12
    write	"fnmaddsq: %f128(r15)"
    fmsubsq	%r15, %r14, %r11, %r12
    write	"fmsubsq:  %f128(r15)"
    fnmsubsq %r15, %r14, %r11, %r12
    write	"fnmsubsq: %f128(r15)"

    write	"test unary"
    mov		%r16, %r15
    write	"r16     %f128(r16)"
    fabssq	%r16, %r15
    write	"r16     %f128(r16)"
    fnegsq	%r16, %r15
    write	"r16     %f128(r16)"
    fnabssq	%r16, %r15
    write	"r16     %f128(r16)"
    fsqrtsq	%r16, %r12
    write	"r16     %f128(r16)"
    frsqrtsq	%r16, %r12
    write	"r16     %f128(r16)"

    write	"test rounding"
    frndsq	%r17, %r12, 4
    write	"r17     %f128(r17)"
    frndsq	%r17, %r12, 3
    write	"r17     %f128(r17)"
    frndsq	%r17, %r12, 2
    write	"r17     %f128(r17)"
    frndsq	%r17, %r12, 0
    write	"r17     %f128(r17)"
    fcvtsq2iw	%r17, %r12,0
    write	"r17     %i64(r17)"
    ldi		%r17, 123456
    ; NOTE(review): converts %r7, not the %r17 just loaded above — looks like
    ; a typo for %r17; confirm intent before changing
    fcvtiw2sq	%r17, %r7,0
    write	"r17     %f128(r17)"

    write	"test fp minmax"
    fmaxsq	%r8, %r11, %r12
    write	"r8      %f128(r8)"
    fminsq	%r8, %r11, %r12
    write	"r8      %f128(r8)"
    write	"test fp abs minmax"
    famaxsq	%r8, %r11, %r12
    write	"r8      %f128(r8)"
    faminsq	%r8, %r11, %r12
    write	"r8      %f128(r8)"

    write	"test fmergesq"
    fmergesq	%r8, %r11, %r12, %r14
    write	"r8      %f128(r8)"
    fmergesq	%r8, %r14, %r11, %r12
    write	"r8      %f128(r8)"


.rodata
    align	16
xxxd:	double	1.122
    double	0.9999765432
.text
; FMA convergence loop plus a broad encoding sweep of FPU and memory ops.
    ldar	%r21, xxxd
    ldi		%r15, 100
    lddz	%r25, %r21, 8*0
    lddz	%r26, %r21, 8*1
    ; NOTE(review): %r16 here still holds the last unary-test result from
    ; earlier in this section, while %r26 (just loaded) is unused — possibly
    ; %r16 should be %r26; confirm intent
    fsubsq	%r22, %r25, %r16
    write	"r22     %f128(r22)"
xxloop:
    fmaddsq	%r22, %r25, %r16, %r22
    fmsubsq	%r22, %r25, %r16, %r22
    repdge	%r15, %gz, xxloop
    write	"r22     %f128(r22)"

; encoding sweep: fused, binary, minmax, compare, unary, rounding
    write	"other FPU"
    fmaddsq  %r60, %r61, %r62, %r63
    fmsubsq  %r61, %r61, %r72, %r73
    fnmaddsq %r62, %r71, %r82, %r63
    fnmsubsq %r63, %r81, %r12, %r53

    fmulsq	%r64, %r61, %r22
    fdivsq	%r65, %r11, %r27
    faddsq	%r66, %r17, %r42
    fsubsq	%r67, %r31, %r23
    fnaddsq	%r68, %r41, %r62
    fmaxsq	%r60, %r61, %r62
    fminsq	%r60, %r61, %r62
    famaxsq	%r60, %r61, %r62
    faminsq	%r60, %r61, %r62

; ordered (o*) and unordered (u*) quad compares
    fcmpsqolt	%r10, %r61, %r72
    fcmpsqole	%r11, %r52, %r21
    fcmpsqole	%r12, %r43, %r12
    fcmpsqoeq	%r10, %r34, %r44
    fcmpsqueq	%r13, %r25, %r22
    fcmpsqule	%r12, %r15, %r23
    fcmpsquo	%r11, %r86, %r86

    fnegsq	%r24, %r58
    fabsdsq	%r45, %r61, %r20
    fnabsdsq	%r56, %r32, %r20
    frndsq	%r78, %r74,2
    frndsq	%r89, %r65,3
    frndsq	%r81, %r76,0
    frndsq	%r62, %r67,1
    fsqrtsq	%r63, %r78
    frsqrtsq %r64, %r69

; memory op encodings: plain and scaled-index loads/stores
    addi	%r45, %sp,-4800
    ldi		%r13, 2

    ldwz	%r12, %r45, 4*1
    stw		%r12, %r45, 4*1
    lddz	%r12, %r45, 8*3
    std		%r12, %r45, 8*3
    ldwzx	%r12, %r45, %r13, 2, 200
    stwx	%r12, %r45, %r13, 2, 200
    lddzx	%r12, %r45, %r13, 3, 200
    stdx	%r12, %r45, %r13, 3, 200

; fmaddsq scheduling filler (independent chains for bundling/latency tests)
    faddsq	%r23, %r24, %r25
    fmaddsq	%r23, %r60, %r55, %r33
    fmulsq	%r23, %r60, %r55
    lddz	%r60, %r45, 8*6
    fmaddsq	%r23, %r60, %r55, %r33
    fmaddsq	%r24, %r61, %r25, %r32
    fmaddsq	%r25, %r62, %r55, %r23
    fmaddsq	%r26, %r63, %r75, %r73
    fmaddsq	%r27, %r64, %r75, %r73
    fmaddsq	%r28, %r65, %r85, %r63
    fmaddsq	%r29, %r66, %r85, %r63
    fmaddsq	%r30, %r67, %r55, %r23
    fmaddsq	%r31, %r68, %r55, %r23
    fmaddsq	%r12, %r32, %r76, %r85
    fmaddsq	%r12, %r32, %r76, %r85
    fmaddsq	%r10, %r32, %r76, %r85
    fmaddsq	%r10, %r32, %r76, %r85
    fmaddsq	%r10, %r32, %r76, %r85
    fmaddsq	%r13, %r32, %r76, %r85
    fmaddsq	%r14, %r32, %r76, %r85
    fmaddsq	%r15, %r32, %r76, %r85
    fmaddsq	%r16, %r32, %r76, %r85
    fmaddsq	%r17, %r32, %r76, %r85

; quad <-> signed/unsigned word conversions
    fcvtsq2iw	%r56, %r45, 0
    fcvtsq2uw	%r56, %r45, 0
    fcvtiw2sq	%r45, %r56, 0
    fcvtuw2sq	%r45, %r56, 0

; 1/n! style divergence loop: r5 += 1, r6 *= r5, r7 = 1/r6, repeat to 128
    ldi		%r5, 0
    fldqri	%r4, 1.0
    fldqri	%r5, 1.0
    fldqri	%r6, 1.0
    fldqri	%r7, 1.0
    ldi		%r24, 128
tri_repeat:
    write	"r7      %x128(r7)"
    faddsq	%r5, %r5, %r4
    fmulsq	%r6, %r6, %r5
    fdivsq	%r7, %r4, %r6
;   write "%x128(r6)"
    repdle.l %r5, %r24, tri_repeat
; Taylor-series evaluation via Horner's scheme with fused multiply-add.
; NOTE(review): the coefficient registers (r25, r23, ... r1) are presumably
; expected to hold 1/n! terms, but no visible code loads them here — as
; written this mainly exercises the fmaddsq/fmsubsq encodings; confirm.
    write	"test taylor series"
    fldqri	%r2, 0.44567	; f2 ,  x
    write	"x:   %f128(r2)"		; test value
    write	"test sin(x)"
    fldqri	%r5, sin(0.44567)	; assembler evaluates sin() at build time
    write	"sin: %f128(r5)"		; test value
    ldi		%r3, 0		; s ,  0
    fmulsq	%r4, %r2, %r2	; f4 ,  x*x
    fmaddsq	%r3, %r3, %r4, %r25	; s ,  s * x*x + 1/25!
    fmsubsq	%r3, %r3, %r4, %r23	; s ,  s * x*x - 1/23!
    fmaddsq	%r3, %r3, %r4, %r21
    fmsubsq	%r3, %r3, %r4, %r19
    fmaddsq	%r3, %r3, %r4, %r17
    fmsubsq	%r3, %r3, %r4, %r15
    fmaddsq	%r3, %r3, %r4, %r13
    fmsubsq	%r3, %r3, %r4, %r11
    fmaddsq	%r3, %r3, %r4, %r9
    fmsubsq	%r3, %r3, %r4, %r7
    fmaddsq	%r3, %r3, %r4, %r5
    fmsubsq	%r3, %r3, %r4, %r3
    fmaddsq	%r3, %r3, %r4, %r1
    fmulsq	%r3, %r3, %r2	; s ,  s * x
    write	"sin: %f128(r3)"

    write	"test cos(x)"
    fldqri	%r5, cos(0.44567)
    write	"cos: %f128(r5)"		; test value
    ldi		%r3, 0		; s ,  0
    fmulsq	%r4, %r2, %r2	; f4 ,  x*x
    fmsubsq	%r3, %r3, %r4, %r26
    fmaddsq	%r3, %r3, %r4, %r24
    fmsubsq	%r3, %r3, %r4, %r22
    fmaddsq	%r3, %r3, %r4, %r20
    fmsubsq	%r3, %r3, %r4, %r18
    fmaddsq	%r3, %r3, %r4, %r16
    fmsubsq	%r3, %r3, %r4, %r14
    fmaddsq	%r3, %r3, %r4, %r12
    fmsubsq	%r3, %r3, %r4, %r10
    fmaddsq	%r3, %r3, %r4, %r8
    fmsubsq	%r3, %r3, %r4, %r6
    fmaddsq	%r3, %r3, %r4, %r4
    fmsubsq	%r3, %r3, %r4, %r2
    fmaddsq	%r3, %r3, %r4, %r1
    write	"cos: %f128(r3)"

    write	"test exp(x)"
    fldqri	%r5, exp(0.44567)
    write	"exp: %f128(r5)"	; test value
    ldi		%r3, 0			; s ,  0.0
    mov		%r4, %r2		; f4 ,  x
    flddi	%r6, 0.125
;   write	"%f128(r6)"
    fmulsq	%r4, %r4, %r6	; x ,  x/8  (range reduction; squared back 3x below)
    fmaddsq	%r3, %r3, %r4, %r15
    fmaddsq	%r3, %r3, %r4, %r14
    fmaddsq	%r3, %r3, %r4, %r13
    fmaddsq	%r3, %r3, %r4, %r12
    fmaddsq	%r3, %r3, %r4, %r11
    fmaddsq	%r3, %r3, %r4, %r10
    fmaddsq	%r3, %r3, %r4, %r9
    fmaddsq	%r3, %r3, %r4, %r8
    fmaddsq	%r3, %r3, %r4, %r7
    fmaddsq	%r3, %r3, %r4, %r6
    fmaddsq	%r3, %r3, %r4, %r5
    fmaddsq	%r3, %r3, %r4, %r4
    fmaddsq	%r3, %r3, %r4, %r3
    fmaddsq	%r3, %r3, %r4, %r2
    fmaddsq	%r3, %r3, %r4, %r1
    fmaddsq	%r3, %r3, %r4, %r1
    fmulsq	%r3, %r3, %r3	; (e^x) ^ 8
    fmulsq	%r3, %r3, %r3
    fmulsq	%r3, %r3, %r3
    write	"exp: %f128(r3)"

    faddsq	%r1, %r2, %r3
    fmaddsq	%r2, %r10, %r20, %r30
    fmaddsq	%r1, %r11, %r21, %r31

    ; classification
    fclss	%r4, %r5, 120
    fclsd	%r4, %r5, 120
    fclsq	%r4, %r5, 120
    jmp		skipfma

; FP compare-and-branch encoding matrix. For each precision (ss=single,
; sd=double, sq=quad), each ordered/unordered relation is emitted in short
; and long (.l) form, to a backward and a forward target; *class variants
; branch on fp classification masks. Then the nulfs* qualified-nullification
; forms are exercised once per precision.
fpu_backward_target:
; single branches
    bfssoeq	%r23, %r34, fpu_backward_target
    bfssoeq.l	%r23, %r34, fpu_backward_target
    bfssoeq	%r23, %r34, fpu_forward_target
    bfssoeq.l	%r23, %r34, fpu_forward_target

    bfssueq	%r23, %r34, fpu_backward_target
    bfssueq.l	%r23, %r34, fpu_backward_target
    bfssueq	%r23, %r34, fpu_forward_target
    bfssueq.l	%r23, %r34, fpu_forward_target

    bfssone	%r23, %r34, fpu_backward_target
    bfssone.l	%r23, %r34, fpu_backward_target
    bfssone	%r23, %r34, fpu_forward_target
    bfssone.l	%r23, %r34, fpu_forward_target

    bfssune	%r23, %r34, fpu_backward_target
    bfssune.l	%r23, %r34, fpu_backward_target
    bfssune	%r23, %r34, fpu_forward_target
    bfssune.l	%r23, %r34, fpu_forward_target

    bfssolt	%r23, %r34, fpu_backward_target
    bfssolt.l	%r23, %r34, fpu_backward_target
    bfssolt	%r23, %r34, fpu_forward_target
    bfssolt.l	%r23, %r34, fpu_forward_target

    bfssult	%r23, %r34, fpu_backward_target
    bfssult.l	%r23, %r34, fpu_backward_target
    bfssult	%r23, %r34, fpu_forward_target
    bfssult.l	%r23, %r34, fpu_forward_target

    bfssole	%r23, %r34, fpu_backward_target
    bfssole.l	%r23, %r34, fpu_backward_target
    bfssole	%r23, %r34, fpu_forward_target
    bfssole.l	%r23, %r34, fpu_forward_target

    bfssule	%r23, %r34, fpu_backward_target
    bfssule.l	%r23, %r34, fpu_backward_target
    bfssule	%r23, %r34, fpu_forward_target
    bfssule.l	%r23, %r34, fpu_forward_target

    bfsso	%r23, %r34, fpu_backward_target
    bfsso.l	%r23, %r34, fpu_backward_target
    bfsso	%r23, %r34, fpu_forward_target
    bfsso.l	%r23, %r34, fpu_forward_target

    bfssuo	%r23, %r34, fpu_backward_target
    bfssuo.l	%r23, %r34, fpu_backward_target
    bfssuo	%r23, %r34, fpu_forward_target
    bfssuo.l	%r23, %r34, fpu_forward_target

    bfssclass	%r23, 34, fpu_backward_target
    bfssclass.l	%r23, 34, fpu_backward_target
    bfssclass	%r23, 34, fpu_forward_target
    bfssclass.l	%r23, 34, fpu_forward_target

; double branches
    bfsdoeq	%r23, %r34, fpu_backward_target
    bfsdoeq.l	%r23, %r34, fpu_backward_target
    bfsdoeq	%r23, %r34, fpu_forward_target
    bfsdoeq.l	%r23, %r34, fpu_forward_target

    bfsdueq	%r23, %r34, fpu_backward_target
    bfsdueq.l	%r23, %r34, fpu_backward_target
    bfsdueq	%r23, %r34, fpu_forward_target
    bfsdueq.l	%r23, %r34, fpu_forward_target

    bfsdone	%r23, %r34, fpu_backward_target
    bfsdone.l	%r23, %r34, fpu_backward_target
    bfsdone	%r23, %r34, fpu_forward_target
    bfsdone.l	%r23, %r34, fpu_forward_target

    bfsdune	%r23, %r34, fpu_backward_target
    bfsdune.l	%r23, %r34, fpu_backward_target
    bfsdune	%r23, %r34, fpu_forward_target
    bfsdune.l	%r23, %r34, fpu_forward_target

    bfsdolt	%r23, %r34, fpu_backward_target
    bfsdolt.l	%r23, %r34, fpu_backward_target
    bfsdolt	%r23, %r34, fpu_forward_target
    bfsdolt.l	%r23, %r34, fpu_forward_target

    bfsdult	%r23, %r34, fpu_backward_target
    bfsdult.l	%r23, %r34, fpu_backward_target
    bfsdult	%r23, %r34, fpu_forward_target
    bfsdult.l	%r23, %r34, fpu_forward_target

    bfsdole	%r23, %r34, fpu_backward_target
    bfsdole.l	%r23, %r34, fpu_backward_target
    bfsdole	%r23, %r34, fpu_forward_target
    bfsdole.l	%r23, %r34, fpu_forward_target

    bfsdule	%r23, %r34, fpu_backward_target
    bfsdule.l	%r23, %r34, fpu_backward_target
    bfsdule	%r23, %r34, fpu_forward_target
    bfsdule.l	%r23, %r34, fpu_forward_target

    bfsdo	%r23, %r34, fpu_backward_target
    bfsdo.l	%r23, %r34, fpu_backward_target
    bfsdo	%r23, %r34, fpu_forward_target
    bfsdo.l	%r23, %r34, fpu_forward_target

    bfsduo	%r23, %r34, fpu_backward_target
    bfsduo.l	%r23, %r34, fpu_backward_target
    bfsduo	%r23, %r34, fpu_forward_target
    bfsduo.l	%r23, %r34, fpu_forward_target

    bfsdclass	%r23, 34, fpu_backward_target
    bfsdclass.l	%r23, 34, fpu_backward_target
    bfsdclass	%r23, 34, fpu_forward_target
    bfsdclass.l	%r23, 34, fpu_forward_target

; quadruple branches
    bfsqoeq	%r23, %r34, fpu_backward_target
    bfsqoeq.l	%r23, %r34, fpu_backward_target
    bfsqoeq	%r23, %r34, fpu_forward_target
    bfsqoeq.l	%r23, %r34, fpu_forward_target

    bfsqueq	%r23, %r34, fpu_backward_target
    bfsqueq.l	%r23, %r34, fpu_backward_target
    bfsqueq	%r23, %r34, fpu_forward_target
    bfsqueq.l	%r23, %r34, fpu_forward_target

    bfsqone	%r23, %r34, fpu_backward_target
    bfsqone.l	%r23, %r34, fpu_backward_target
    bfsqone	%r23, %r34, fpu_forward_target
    bfsqone.l	%r23, %r34, fpu_forward_target

    bfsqune	%r23, %r34, fpu_backward_target
    bfsqune.l	%r23, %r34, fpu_backward_target
    bfsqune	%r23, %r34, fpu_forward_target
    bfsqune.l	%r23, %r34, fpu_forward_target

    bfsqolt	%r23, %r34, fpu_backward_target
    bfsqolt.l	%r23, %r34, fpu_backward_target
    bfsqolt	%r23, %r34, fpu_forward_target
    bfsqolt.l	%r23, %r34, fpu_forward_target

    bfsqult	%r23, %r34, fpu_backward_target
    bfsqult.l	%r23, %r34, fpu_backward_target
    bfsqult	%r23, %r34, fpu_forward_target
    bfsqult.l	%r23, %r34, fpu_forward_target

    bfsqole	%r23, %r34, fpu_backward_target
    bfsqole.l	%r23, %r34, fpu_backward_target
    bfsqole	%r23, %r34, fpu_forward_target
    bfsqole.l	%r23, %r34, fpu_forward_target

    bfsqule	%r23, %r34, fpu_backward_target
    bfsqule.l	%r23, %r34, fpu_backward_target
    bfsqule	%r23, %r34, fpu_forward_target
    bfsqule.l	%r23, %r34, fpu_forward_target

    bfsqo	%r23, %r34, fpu_backward_target
    bfsqo.l	%r23, %r34, fpu_backward_target
    bfsqo	%r23, %r34, fpu_forward_target
    bfsqo.l	%r23, %r34, fpu_forward_target

    bfsquo	%r23, %r34, fpu_backward_target
    bfsquo.l	%r23, %r34, fpu_backward_target
    bfsquo	%r23, %r34, fpu_forward_target
    bfsquo.l	%r23, %r34, fpu_forward_target

    bfsqclass	%r23, 34, fpu_backward_target
    bfsqclass.l	%r23, 34, fpu_backward_target
    bfsqclass	%r23, 34, fpu_forward_target
    bfsqclass.l	%r23, 34, fpu_forward_target

fpu_forward_target:

; fp compare-and-nullify forms: last two operands are the if/else shadow sizes
    nulfssune	%r23, %r34, 1, 1
    nulfsdune	%r23, %r34, 1, 1
    nulfsqune	%r23, %r34, 1, 1

    nulfssone	%r23, %r34, 1, 1
    nulfsdone	%r23, %r34, 1, 1
    nulfsqone	%r23, %r34, 1, 1

    nulfssueq	%r23, %r34, 1, 1
    nulfsdueq	%r23, %r34, 1, 1
    nulfsqueq	%r23, %r34, 1, 1

    nulfssoeq	%r23, %r34, 1, 1
    nulfsdoeq	%r23, %r34, 1, 1
    nulfsqoeq	%r23, %r34, 1, 1

    nulfssclass	%r23, 94, 1, 1
    nulfsdclass	%r23, 94, 1, 1
    nulfsqclass	%r23, 94, 1, 1
skipfma:
    write	"end fpu"
.end
.text
; Test: loads/stores with post-update addressing ("mia" = modify-after?).
; Base register %r45 is advanced by the final immediate after each access;
; z = zero-extend, s = sign-extend; b/h/w/d/q = 1/2/4/8/16-byte widths.
    alloc	96
    write	"test base addressing with indexed post-update"
    ldi		%r12, 1
    addi	%r45, %sp, -512

    ldbzmia	%r23, %r45, 2
    ldhzmia	%r23, %r45, 2
    ldwzmia	%r23, %r45, 4
    lddzmia	%r23, %r45, 8
    ldqmia	%r23, %r45, 16

    ldbsmia	%r23, %r45, 2
    ldhsmia	%r23, %r45, 2
    ldwsmia	%r23, %r45, 4
    lddsmia	%r23, %r45, 8

    stbmia	%r23, %r45, 2 
    sthmia	%r23, %r45, 2
    stwmia	%r23, %r45, 4
    stdmia	%r23, %r45, 8
    stqmia	%r23, %r45, 16
    write	"end_indexed_modify_test"
.end.rodata
; Data for the IP-relative addressing test: one naturally aligned datum of
; each width in both .rodata and .data.
rodata1:
    d1	123
    align	2
rodata2:
    d2	12345
    align	4
rodata4:
    d4	123456789
    align	8
rodata8:
    d8	1234567890123456789

.data
data1:
    d1	123
    align	2
data2:
    d2	12345
    align	4
data4:
    d4	123456789
    align	8
data8:
    d8	1234567890123456789

.text
; Test: *r (IP-relative) load/store forms against both sections; stores go
; only to .data since .rodata is read-only.
    alloc	96

    write "test ip-relative data addressing"
    ldbzr	%r34, rodata1
    ldhzr	%r34, rodata2
    ldwzr	%r34, rodata4
    lddzr	%r34, rodata8

    ldbsr	%r34, rodata1
    ldhsr	%r34, rodata2
    ldwsr	%r34, rodata4
    lddsr	%r34, rodata8

    ldbzr	%r34, data1
    ldhzr	%r34, data2
    ldwzr	%r34, data4
    lddzr	%r34, data8

    ldbsr	%r34, data1
    ldhsr	%r34, data2
    ldwsr	%r34, data4
    lddsr	%r34, data8

    stbr	%r34, data1
    sthr	%r34, data2
    stwr	%r34, data4
    stdr	%r34, data8

    write	"end ip-relative data test"
.end.text
; Test: ldafr (load absolute address from IP-relative label).
    alloc	96
    write	"test ldafr"
    ldafr	%r22, ldafr_data
    write	"ldafr: %x64(r22)"

    write	"end_ldafr_test"
.data
; NOTE(review): ldafr_data carries no data — it labels the current end of
; .data; only its address is used above
ldafr_data:

.end.text
; Test: mbsel (masked byte/bit select). The first write prints the value the
; assembler computes from the same formula, the second prints the result of
; the instruction — the two lines must match.
    alloc	96
    write	"check mbsel instruction"
    ldi.l	%r6, ((0x3333333333333333 ^ 0x5555555555555555) & 0xff00ff00ff00ff00) ^ 0x5555555555555555
    write	"mbsel: %x64(r6)"
    ldi.l	%r3, 0x3333333333333333
    ldi.l	%r4, 0x5555555555555555
    ldi.l	%r5, 0xff00ff00ff00ff00
    mbsel	%r6, %r3, %r4, %r5
    write	"mbsel: %x64(r6)"

    write	"end_mbsel_test"
.end.text
; Test: special-purpose registers. First dump every SPR symbolically via the
; write "%s(...)" formatter, then read each through mfspr and print raw hex.
    alloc	61
    write	"\ntest write: special register"
    write	"ip      %s(ip)"
    write	"eip     %s(eip)"
    write	"eca     %s(eca)"
    write	"fpcr    %s(fpcr)"
    write	"rsc     %s(rsc)"
    write	"rsp     %s(rsp)"
    write	"bsp     %s(bsp)"
    write	"peb     %s(peb)"
    write	"teb     %s(teb)"
    write	"itc     %s(itc)"
    write	"itm     %s(itm)"
    write	"psr     %s(psr)"
    write	"pta     %s(pta)"
    write	"iva     %s(iva)"
    write	"kip     %s(kip)"
    write	"ksp     %s(ksp)"
    write	"krsp    %s(krsp)"
    write	"iip     %s(iip)"
    write	"iipa    %s(iipa)"
    write	"ipsr    %s(ipsr)"
    write	"cause   %s(cause)"
    write	"ifa     %s(ifa)"
    write	"iib     %s(iib)"
    write	"tpr     %s(tpr)"
    write	"lid     %s(lid)"
    write	"irr0    %s(irr0)"
    write	"irr1    %s(irr1)"
    write	"irr2    %s(irr2)"
    write	"irr3    %s(irr3)"
    write	"isr0    %s(isr0)"
    write	"isr1    %s(isr1)"
    write	"isr2    %s(isr2)"
    write	"isr3    %s(isr3)"
    write	"tsv     %s(tsv)"
    write	"cmcv    %s(cmcv)"
    write	"pmv     %s(pmv)"

    write	"\ntest mfspr: read special register"

    mfspr	%r12, %ip
    write	"ip      %x64(r12)"

    mfspr	%r12, %eip
    write	"eip     %x64(r12)"

    mfspr	%r12, %eca
    write	"%x64(r12)"

    mfspr	%r12, %fpcr
    write	"%x64(r12)"

    mfspr	%r12, %rsc
    write	"%x64(r12)"

    mfspr	%r12, %rsp
    write	"%x64(r12)"

    mfspr	%r12, %bsp
    write	"%x64(r12)"

    mfspr	%r12, %peb
    write	"%x64(r12)"

    mfspr	%r12, %teb
    write	"%x64(r12)"

    mfspr	%r12, %itc
    write	"%x64(r12)"

    mfspr	%r12, %itm
    write	"%x64(r12)"

    mfspr	%r12, %psr
    write	"%x64(r12)"

    mfspr	%r12, %pta
    write	"%x64(r12)"

    mfspr	%r12, %iva
    write	"%x64(r12)"

    mfspr	%r12, %kip
    write	"%x64(r12)"

    mfspr	%r12, %ksp
    write	"%x64(r12)"

    mfspr	%r12, %krsp
    write	"krsp    %x64(r12)"

    mfspr	%r12, %iip
    write	"iip     %x64(r12)"

    mfspr	%r12, %iipa
    write	"iipa    %x64(r12)"

    mfspr	%r12, %ipsr
    write	"ipsr    %x64(r12)"

    mfspr	%r12, %cause
    write	"cause   %x64(r12)"

    write	"%s(ifa)"
    mfspr	%r12, %ifa
    write	"ifa     %x64(r12)"

    ; iib is 128 bits wide — printed with %x128
    mfspr	%r12, %iib
    write	"iib     %x128(r12)"

    mfspr	%r12, %tpr
    write	"tpr     %x64(r12)"

    mfspr	%r12, %lid
    write	"lid     %x64(r12)"

    mfspr	%r12, %irr0
    write	"irr0    %x64(r12)"

    mfspr	%r12, %irr1
    write	"irr1    %x64(r12)"

    mfspr	%r12, %irr2
    write	"irr2    %x64(r12)"

    mfspr	%r12, %irr3
    write	"irr3    %x64(r12)"

    mfspr	%r12, %isr0
    write	"%x64(r12)"

    mfspr	%r12, %isr1
    write	"%x64(r12)"

    mfspr	%r12, %isr2
    write	"%x64(r12)"

    mfspr	%r12, %isr3
    write	"%x64(r12)"

    mfspr	%r12, %tsv
    write	"%x64(r12)"

    mfspr	%r12, %cmcv
    write	"%x64(r12)"

    mfspr	%r12, %pmv
    write	"%x64(r12)"

    write	"end test mfspr"
.end
.text
; Test: signed/unsigned min/max, register-register and immediate forms.
    alloc	69
    write	"test min/max"
    mins	%r34, %r56, %r67
    minu	%r34, %r56, %r67
    maxs	%r34, %r56, %r67
    maxu	%r34, %r56, %r67

    minsi	%r34, %r56, 2671
    minui	%r34, %r56, 2671
    maxsi	%r34, %r56, 2671
    maxui	%r34, %r56, 2671
    write	"test minmax end"

.end

.text
; Test: instruction nullification. nuldeq and friends compare and then
; nullify the next N "if" instructions and M "else" instructions; the two
; shadow sizes can be given as numbers, binary masks, or named predicates
; that later instructions reference with an (name) annotation.
    write	"test nullification (explicit masks)"
    alloc	96
    ldi		%r10, 0
    nuldeq	%r10, %r10, 5, 4	; r10==r10: first 5 nullified, next 4 run
    write	"0" ; nullified
    write	"1" ; nullified
    write	"2" ; nullified
    write	"3" ; nullified
    write	"4" ; nullified
    write	"5" ; else
    write	"6" ; else
    write	"7" ; else
    write	"8" ; else

    write	"test nullification (predicate names)"
    ldi		%r10, 0
    nuldeq	%r10, %r10, equal, nonequal
    write	"0"
    write	"1"
    write	"2"
    write	"3"
    write	"4" (equal)	; end of the "equal" shadow
    write	"5"
    write	"6"
    write	"7"
    write	"8" (nonequal)	; end of the "nonequal" shadow


    write	"test nullification"
    ldi		%r10, 0
    nuldeq	%r10, %r10, 4, 3
    addi	%r10, %r10, 2
    addi	%r10, %r10, 2
    addi	%r10, %r10, 2
    addi	%r10, %r10, 1
    addi	%r10, %r10, 1
    addi	%r10, %r10, 1
    addi	%r10, %r10, 1

    write	"test nullification"
    ldi		%r10, 0
    nuldeq	%r10, %r10, true, false
    addi	%r10, %r10, 2
    addi	%r10, %r10, 2
    addi	%r10, %r10, 2
    addi	%r10, %r10, 2
    addi	%r10, %r10, 1 (true)
    addi	%r10, %r10, 1
    addi	%r10, %r10, 1 (false)

    nop	0
    nop	0
    nuldeq	%r12, %r10, 4, 3
    write	"branch1: psr=%s(psr)"
    write	"branch1: %i64(r10)"
    write	"branch1: %i64(r10)"
    write	"branch1: %i64(r10)"
    write	"branch2: psr=%s(psr)"
    write	"branch2: %i64(r20)"
    write	"branch2: %i64(r20)"

; shadow masks given as binary literals; encoding sweep of compare variants
    nuldeq	%r23, %r45, 0b1100, 0b0101
    nuldlt	%r23, %r45, 0b1100, 0b0101
    nuldltu	%r23, %r45, 0b1100, 0b0101

    nuldeqi	%r23, 45, 0b1100, 0b0101
    nuldlti	%r23, -45, 0b1100, 0b0101
    nuldltui	%r23, 45, 0b1100, 0b0101

    nuldeqi.l   %r23, 45000000000, 0b1100, 0b0101
    nuldlti.l   %r23, -45000000000, 0b1100, 0b0101
    nuldltui.l  %r23, 45000000000, 0b1100, 0b0101

    nulbs	%r23, %r45, 0b1100, 0b0101	; bit-set nullify
    nulbsi	%r23, 45, 0b1100, 0b0101
    nop	1
    nop	2
    nop	3
    nop	4
    nop	5
    nop	6
    nop	7

; both shadows may end on the same instruction
    nuldeq	%r10, %r10, same_equal, same_nonequal
    write	"0e"
    write	"1e"
    write	"2e" (same_equal, same_nonequal)

    nuldne	%r10, %r10, same_equal2, same_nonequal2
    write	"0ne"
    write	"1ne"
    write	"2ne" (same_equal2, same_nonequal2)

; the "if" shadow may even end on the nullify instruction itself
    nuldeq	%r10, %r10, no_if_true, no_if_false (no_if_true)
    write	"else" (no_if_false)

    write	"end_nullification_test"
.end
.text
; Test: performance-monitor counters. mfmr reads counter PMC_*; mtmr writes
; one (used at the end to clear/modify PMC_SHORT_INSTRUCTION and read back).
    alloc	21
    ldi		%r12, PMC_LAST
    write	"PMC_LAST = %i64(r12)"
; don't report runtime in unittests, this is non-reproducible
    mfmr	%r14, %gz, PMC_RUNTIME
;   write	"PMC_RUNTIME = %i64(r14)"
    mfmr	%r14, %gz, PMC_SHORT_INSTRUCTION
    write	"PMC_SHORT_INSTRUCTION = %i64(r14)"
    mfmr	%r14, %gz, PMC_LONG_INSTRUCTION
    write	"PMC_LONG_INSTRUCTION = %i64(r14)"
    mfmr	%r14, %gz, PMC_SHADOWED_INSTRUCTION
    write	"PMC_SHADOWED_INSTRUCTION = %i64(r14)"
    mfmr	%r14, %gz, PMC_NOP_INSTRUCTION
    write	"PMC_NOP_INSTRUCTION = %i64(r14)"
    mfmr	%r14, %gz, PMC_QUALIFIED_NOP_INSTRUCTION
    write	"PMC_QUALIFIED_NOP_INSTRUCTION = %i64(r14)"
    mfmr	%r14, %gz, PMC_REGISTER_SPILL
    write	"PMC_REGISTER_SPILL = %i64(r14)"
    mfmr	%r14, %gz, PMC_REGISTER_FILL
    write	"PMC_REGISTER_FILL = %i64(r14)"
    mfmr	%r14, %gz, PMC_ICACHE_HIT
    write	"PMC_ICACHE_HIT = %i64(r14)"
    mfmr	%r14, %gz, PMC_ICACHE_MISS
    write	"PMC_ICACHE_MISS = %i64(r14)"
    mfmr	%r14, %gz, PMC_DCACHE_HIT
    write	"PMC_DCACHE_HIT = %i64(r14)"
    mfmr	%r14, %gz, PMC_DCACHE_MISS
    write	"PMC_DCACHE_MISS = %i64(r14)"
    mfmr	%r14, %gz, PMC_INSTRUCTION_TRANSLATION_HIT
    write	"PMC_INSTRUCTION_TRANSLATION_HIT = %i64(r14)"
    mfmr	%r14, %gz, PMC_INSTRUCTION_TRANSLATION_MISS
    write	"PMC_INSTRUCTION_TRANSLATION_MISS = %i64(r14)"
    mfmr	%r14, %gz, PMC_DATA_TRANSLATION_HIT
    write	"PMC_DATA_TRANSLATION_HIT = %i64(r14)"
    mfmr	%r14, %gz, PMC_DATA_TRANSLATION_MISS
    write	"PMC_DATA_TRANSLATION_MISS = %i64(r14)"
    mfmr	%r14, %gz, PMC_BACKSTORE_TRANSLATION_HIT
    write	"PMC_BACKSTORE_TRANSLATION_HIT = %i64(r14)"
    mfmr	%r14, %gz, PMC_BACKSTORE_TRANSLATION_MISS
    write	"PMC_BACKSTORE_TRANSLATION_MISS = %i64(r14)"
    mtmr	%r14, %gz, PMC_SHORT_INSTRUCTION
    mfmr	%r15, %gz, PMC_SHORT_INSTRUCTION
    write	"old pm reg = %i64(r15)"
.end
.text
; Simple test program
; 20! factorial compute
.text
    alloc	61
; Warm-up loop: count %r15 from -100 upward with repdle (repeat while <= 0)
; around an arithmetic shift; exercises repeat loops with a negative start.
    ldi		%r15, -100
loop_stop_sard:
    srdi	%r13, %r15, 5
    repdle	%r15, %gz, loop_stop_sard

; performance test - long loop
; for(i = 1000000; i>0; i--) DoSome();

    ldi		%r20, 2500000
    ldi		%r15, 20 ; maximum factorial number
    ldi		%r21, 5
; NOTE(review): "(64)" after the label is assumed to be a loop-body alignment
; annotation — confirm against the assembler manual.
loop_stop: (64)
; Loop body is mixed ALU filler; only %r20 (the repdgt counter) matters.
    addi	%r13, %r13, 5
    sub		%r14, %r14, %r55
    cmpdlt	%r24, %r14, %r14
    addi	%r13, %r13, 4
    sub		%r14, %r14, %r55
    cmpdlt	%r22, %r14, %r14
    addi	%r13, %r13, 33
    srpi	%r14, %r14, %r55, 13
    sub		%r14, %r13, %r21
    srai	%r14, %r14, 7
    repdgt	%r20, %gz, loop_stop
; print loop counter after loop (must be 0)
    write	"%i64(r20) factorials"
; Factorial: r13 = product, r14 = current term, loop while r14 <= r15 (20).
; 20! = 2432902008176640000 still fits in an unsigned 64-bit value.
    ldi		%r13, 1
    ldi		%r14, 1
start:
    mul		%r13, %r13, %r14
    write	"factorial: %u64(r13)"
    repdle	%r14, %r15, start

    write	"%i64(r14) %i64(r13)"
.end
.text
; --- Strided/counted loop tests ----------------------------------------------
    alloc	96
    write	"Example of strided loop instructions"
; fast_check
; Count %r12 down from 10000 with repdgt (repeat while > 0).
    ldi		%r12, 10000	; load loop number (10)
stride_loop_start:
;	write	"%i64(r12)"
    cmpdeq	%r4, %r12, %r12
    add		%r14, %r14, %r46
    repdgt	%r12, %gz, stride_loop_start

    write	"counter=%i64(r12)"

; Second example of strided loop.
; fast_check
; Same loop shape, with %r14 decremented by 2 each iteration in the body.
    ldi		%r12, 10000	; load loop number (10)
    ldi		%r14, 10000	; load loop number (10)
stride_loop_start2:
;   write	"%i64(r12)"
    cmpdeq	%r4, %r12, %r12
    addi	%r14, %r14, -2
    repdgt	%r12, %gz, stride_loop_start2

    write	"%i64(r12) %i64(r14)"

;*****************************************************************
; 3x inner loop example
;*****************************************************************
; Three nested 80-count loops driven by cmpdlt; the jmp back-edges are
; commented out, so this runs through once and counts in %r20.
    ldi		%r3, 0
    ldi		%r20, 0
    ldi		%r33, 80
    mov		%r10, %r33
    mov		%r11, %r33
    mov		%r12, %r33
ccloop:
;   write	"%i64(r12)"
    addi	%r20, %r20, 1
    addi	%r12, %r12, -1
    cmpdlt	%r2, %r3, %r12
;   jmp	ccloop
;   write	"%i64(r11)"
    addi	%r11, %r11, -1
    cmpdlt	%r4, %r3, %r11
    mov		%r12, %r33
;   jmp		ccloop
;   write	"%i64(r10)"
    addi	%r10, %r10, -1
    cmpdlt	%r6, %r3, %r10
    mov		%r11, %r33
    mov		%r12, %r33
;   jmp		ccloop

    write	"%i64(r20)"

; for(i=0; i<100; i++)

; Counted-up loop using compare + branch-not-equal-immediate.
    ldi	%r8, 0
start1:
;   write	"%i64(r8)"
    addi	%r8, %r8,1
    cmpdlti	%r7, %r8,128
    bdnei	%r7,0,start1

; for(i=100; i>0; i--)
    ldi		%r8, 100
start2:
    write	"%i64(r8)"
    addi	%r8, %r8,-1		; current error
    cmpdlt	%r2, %r3, %r8
    bdnei	%r2, 0, start2

    write	"r3      %x64(r3)"
;	mtspr	%r3, %rsc


; for(i=100; i>0; i--) write "%x64((i)"
; Back-edge commented out, so only one iteration is printed.
    ldi		%r10, 100
qqq:	cmpdlt	%r2, %r3, %r10
    write	"r10     %x64(r10)"
    addi	%r10, %r10, -1
;   jmp		qqq
sss:

; --- Special-register tests --------------------------------------------------
    andi.l	%r55, %r55,0x000FFFFF00003F0F
    mtspr	%r12, %ifa
; test some special regs
    ldi.l	%r9, 0x123456789
;   mtspr	%r9, psr
    write	"ip: %s(ip) psr: %s(psr)"
;   mtspr	%r3, psr
    ldi		%r55, 120
    mtspr	%r55, %tpr
    write	"fpcr    %s(fpcr)"
    write	"psr     %s(psr)"

    write	"test long loop"
; test simple loop
; fast_check
; 350000 iterations of mixed shift/funnel-shift work; %r14 is the counter.
    ldi		%r13, 350000 ; 35
    ldi		%r14, 350000 ; 35
    ldi		%r15, 88
    write	"%i64(r14)"
repeat_loop_start: (128)
;	write	"%i64(r12)"
    addi	%r13, %r13, 3
    add		%r13, %r13, %r15
    srpi	%r13, %r13, %r15, 8
 
    addi	%r13, %r13, 4
    add		%r13, %r13, %r15
    srpi	%r13, %r13, %r15, 7

    addi	%r13, %r13, 5
    add		%r13, %r13, %r15
    srpi	%r13, %r13, %r15, 6

    addi	%r13, %r13, 6
    add		%r13, %r13, %r15
    srpi	%r13, %r13, %r15, 5

    sub		%r13, %r13, %r15
    sladd	%r13, %r13, %r15, 5
    sladd	%r13, %r13, %r15, 5

    xor		%r13, %r14, %r15
    sll		%r13, %r13, %r13
    repdgt	%r14, %gz, repeat_loop_start

    write	"%i64(r13) %i64(r14)"

    write	"end test long loop"
.end
.text
; --- Random-number instruction test ------------------------------------------
; Two draws from the generator (second operand %gz means "no reseed"), then
; one draw after seeding the generator with the value 1 held in %r4.
    write	"test random"
    alloc	96

    random	%r3, %gz
    write	"random: %x64(r3)"
    random	%r3, %gz
    write	"random: %x64(r3)"
    ldi		%r4, 1
    random	%r3, %r4
    write	"random seed: %x64(r3)"

    write	"end_random_test"
; Fix: the section terminator and the next section directive were fused into
; one ".end.text" line; every other section boundary in this file keeps them
; on separate lines.
.end
.text
; test simple long loop
; --- Long repeat-loop test ---------------------------------------------------
; 1,000,000 iterations of mixed ALU/funnel-shift/indexed-load work driven by
; repdgt on %r14.  The rep* forms after the jmp are never executed; they are
; present only so the assembler encodes every repeat-instruction variant.
    alloc	61
    ldi		%r13, 1000000
    mov		%r14, %r13
    write	"loop limit: %i64(r14)"
    ldi		%r15, 88
repeat_long_loop_start: (128)
    addi	%r13, %r13, 3
    add		%r13, %r13, %r15
    srpi	%r13, %r13, %r15, 8
    addi	%r13, %r13, 4
    add		%r13, %r13, %r15
    srpi	%r13, %r13, %r15, 7
    addi	%r13, %r13, 5
    add		%r13, %r13, %r15
    srpi	%r13, %r13, %r15, 6
    addi	%r13, %r13, 6
    add		%r13, %r13, %r15
    srpi	%r13, %r13, %r15, 5
    add		%r30, %r31, %r14
    sub		%r31, %r30, %r15
    slli	%r40, %r40, 12
    ldax	%r41, %r40, %r12, 3, -12
    ldax	%r41, %r40, %r12, 4, 112
    repdgt	%r14, %gz, repeat_long_loop_start
    jmp		repeat_exit

; Dead code: assembler coverage for the remaining repeat-loop variants.
    repdle	%r56, %r60, repeat_long_loop_start
    repdge	%r56, %r60, repeat_long_loop_start
    repdleu	%r56, %r20, repeat_long_loop_start
    repdgeu	%r56, %r20, repeat_long_loop_start

    repdle.l	%r56, %r60, repeat_long_loop_start
    repdge.l	%r56, %r60, repeat_long_loop_start
    repdleu.l	%r56, %r20, repeat_long_loop_start
    repdgeu.l	%r56, %r20, repeat_long_loop_start

repeat_exit:
    write	"end loop repeat test"
; Fix: split the fused ".end.text" into separate lines, matching every other
; section boundary in this file.
.end
.text
; Here we test instructions for partial rotate register by fixed bitcount.
; srpi with both source halves equal to the same register acts as a rotate:
; the pair (rB:rC) is shifted right, so equal halves rotate the value.
; NOTE(review): the "-1" in the shift amounts matches the "rD+1-rC" remark
; below — confirm the intended off-by-one convention against the ISA manual.
    alloc	90
    write	"initial values"
    ldi.l	%r50, 0x1234567890ABCDEF
    write	"%x64(r50)"
    write	"rotate left"
    srpi	%r51, %r50, %r50, 40-1
    write	"%x64(r51)"
    write	"rotate right"
    srpi	%r51, %r50, %r50, 64-40-1	; same as previous
    write	"%x64(r51)"
    write	"rotate left immediate"
    srpi	%r51, %r50, %r50, 64-40-1
    write	"%x64(r51)"
    write	"rotate right immediate"
    srpi	%r51, %r50, %r50, 40-1	; same as previous "rD+1-rC"
    write	"%x64(r51)"

; Here we test instructions for shift and mask register by fixed bitcount.
    write	"shift signed|unsigned by immediate 12 bit"
    ldi.l	%r50, 0xfedcba0123456789
    write	"%x64(r50)"
    srai	%r51, %r50, 12
    write	"%x64(r51)"
    srli	%r51, %r50, 12
    write	"%x64(r51)"
    slli	%r51, %r50, 12
    write	"%x64(r51)"
; NOTE(review): this slli duplicates the previous one exactly — possibly a
; different shift form or amount was intended; kept as-is.
    slli	%r51, %r50, 12
    write	"%x64(r51)"

;	jmp	ddd
; Variable-amount funnel shift (shift pair left by %r10 = 16).
    ldi		%r10, 16
    slp	%r51, %r50, %r50, %r10
    write	"%x64(r51)"

; Shift-left-then-shift-right combinations (bitfield extract style).
    ldi.l	%r40, 0x1234567890abcdef
    ldi.l	%r50, 0xfedcba0987654321
    slsrli	%r41, %r40, 8, 40
    write	"%x64(r41)"
    slsrai	%r41, %r40, 11, 40
    write	"%x64(r41)"

    write	"test srpi"
    ldi.l	%r40, 0x1234123412341234
    ldi.l	%r50, 0x5678567856785678
    srpi	%r41, %r40, %r50, 39
    write	"%x64(r41)"
    srpi	%r41, %r50, %r40, 23
    write	"%x64(r41)"
    srpi	%r41, %r40, %r40, 24
    write	"%x64(r41)"

    write	"test vector shift right pair (srpi16) instruction"
; Build all-zero and all-one 128-bit patterns so the boundary between the
; concatenated halves is visible in the srpiq result.
    xor		%r2, %r2, %r2	; all zeroes
    nor		%r3, %r2, %r2	; all ones
    write	"r2      %x128(r2)"
    write	"r3      %x128(r3)"
    srpiq	%r4, %r2, %r3, 60
    write	"r4      %x128(r4)"
    srpiq	%r4, %r3, %r2, 60
    write	"r4      %x128(r4)"
    srpiq	%r4, %r2, %r3, 100
    write	"r4      %x128(r4)"
    srpiq	%r4, %r3, %r2, 100
    write	"r4      %x128(r4)"

; SHIFTS
; Assembler-coverage block: encode each shift/deposit form once.
    sll		%r42, %r33, %r34
    sll		%r42, %r33, %r34
    sra		%r52, %r73, %r44
    srl		%r62, %r73, %r44
    slp		%r72, %r17, %r17, %r24
    srp		%r82, %r16, %r16, %r15
    srpi	%r72, %r15, %r24, 32
    dep		%r10, %r14, %r85, 32, 30

    slli	%r12, %r67, 13
    slli	%r13, %r57, 13
    srai	%r14, %r48, 14
    srli	%r15, %r38, 14
    srpi	%r16, %r39, %r13, 13
    srpi	%r17, %r29, %r13, 64-13


    write	"test packed bitwise logical"
; One encoding of each bitwise-logical opcode.
    and		%r10, %r71, %r13
    andn	%r21, %r81, %r22
    or		%r32, %r71, %r32
    orn		%r43, %r61, %r43
    nand	%r54, %r51, %r54
    nor		%r65, %r41, %r64
    xnor	%r76, %r31, %r73
    xor		%r87, %r21, %r83


    ldi		%r20, 65
    write	"r20     %c(r20)"   ; should be 'A'

; Bitfield clear (depc) on a known pattern.
    ldi		%r3, 0
    ldi.l	%r22, 0x12345FFFFFFFFFFF
    write	"%x64(r22)"
    depc	%r23, %r22, 0, 23
    write	"%x64(r23)"

    ldi.l	%r22, 0x1234567890ABCDEF
    ldi.l	%r23, 0xFEDCBA9876543210
    srpi	%r22, %r22, %r23, 24
    write	"%x64(r22)"

; Negate/complement identities: -x, ~(-x) = x-1, (x-1)^x.
    ldi.l	%r24, 0x4321F00000000
    write	"%x64(r24)"
    subfi	%r25, %r24, 0
    write	"%x64(r25)"
    not		%r25, %r25
    write	"%x64(r25)"
    xor		%r25, %r25, %r24
    write	"%x64(r25)"

; Example of absd.
    ldi		%r12, -10000
    absd	%r12, %r12, %gz
    write	"r12: %i64(r12)"
.end
.text
; --- Floating-point SIMD section ---------------------------------------------
; NOTE(review): the unconditional jmp below skips everything up to the
; endfpsimd label, so this whole section is assembled but never executed
; (no labels inside are reachable from elsewhere) — apparently intentional.
    jmp		endfpsimd
; SSE double (SSE2)
; Packed-double encodings: fused multiply-add family, add/sub, horizontal
; and pairwise variants, compares, pack, rounding, divide, square root.
    fmaddpd	%r16, %r71, %r69, %r13
    fmsubpd	%r15, %r78, %r58, %r23
    fnmaddpd	%r14, %r67, %r47, %r13
    fnmsubpd	%r13, %r86, %r36, %r16
    fmaddapd	%r82, %r52, %r69, %r63
    fmsubapd	%r50, %r91, %r69, %r63
    faddpd	%r12, %r86, %r25
    fnaddpd	%r11, %r82, %r19
    fsubpd	%r10, %r63, %r28
    faddcpd	%r81, %r61, %r37
    fsubcpd	%r82, %r81, %r46
    faddhpd	%r83, %r81, %r55
    fsubhpd	%r84, %r71, %r64
    fmulpd	%r81, %r71, %r11
    fmulhpd	%r60, %r11, %r22
    fdotpd	%r85, %r81, %r13
    fminpd	%r86, %r84, %r14
    fmaxpd	%r87, %r61, %r15
    faminpd	%r30, %r52, %r16
    famaxpd	%r61, %r51, %r17

    fcmppdoeq	%r80, %r81, %r63
    fcmppdone	%r11, %r81, %r32
    fcmppdolt	%r15, %r81, %r32
    fcmppdolt	%r60, %r81, %r82
    fcmppdone	%r62, %r72, %r83
    fcmppdole	%r62, %r72, %r62

    fpkpd	%r60, %r61, %r62
    fnegpd	%r61, %r51
    fabsdpd	%r61, %r51, %r3
    fnabsdpd	%r61, %r61, %r3
    frndpd	%r60, %r77,3
    frndpd	%r62, %r61,2
    frndpd	%r62, %r71,0
    frndpd	%r83, %r67,1
    fdivpd	%r83, %r67, %r20
    fsqrtpd	%r68, %r81
    frsqrtpd	%r68, %r81


; quadruple floating-point extension example
.rodata
    align	16
a:	quad	1.234567890123456789124141241241
b:	quad	3.1415925678888734535345231234564561
c:	quad	3.4566345634563456346535463463456
.text
; Load three quad constants and print them.
    ldar	%r21, a
    ldq		%r3, %r21,0*16
    ldq		%r1, %r21,1*16
    ldq		%r2, %r21,2*16
    write	"%vf64(r3)"
    write	"%vf64(r1)"
    write	"%vf64(r2)"

; Scalar-double binary operations on the loaded constants.
    write	"test binary\0"
    fmulsd	%r3, %r1, %r2
    write	"%vf64(r3)"
    fnmulsd	%r3, %r1, %r2
    write	"%vf64(r3)"
    faddsd	%r4, %r1, %r2
    write	"%vf64(r4)"
    fnaddsd	%r4, %r1, %r2
    write	"%vf64(r4)"
    fsubsd	%r4, %r2, %r1
    write	"%vf64(r4)"
    fdivsd	%r4, %r2, %r1
    write	"%vf64(r4)"

    write	"test fused fma\0"
    fmaddsd	%r5, %r4, %r1, %r2
    write	"%vf64(r5)"
    fnmaddsd	%r5, %r4, %r1, %r2
    write	"%vf64(r5)"
    fmsubsd	%r5, %r4, %r1, %r2
    write	"%vf64(r5)"
    fnmsubsd	%r5, %r4, %r1, %r2
    write	"%vf64(r5)"

    write	"test unary\0"
    mov		%r6, %r5
    write	"%vf64(r6)"
    fabssd	%r6, %r5
    write	"%vf64(r6)"
    fnegsd	%r6, %r5
    write	"%vf64(r6)"
    fnabssd	%r6, %r5
    write	"%vf64(r6)"
    fsqrtsd	%r6, %r2
    write	"%vf64(r6)"
    frsqrtsd	%r6, %r2
    write	"%vf64(r6)"

; Rounding with each mode immediate, plus int<->double conversions.
    write	"test rounding\0"
    frndsd	%r7, %r2,4
    write	"%vf64(r7)"
    frndsd	%r7, %r2,2
    write	"%vf64(r7)"
    frndsd	%r7, %r2,1
    write	"%vf64(r7)"
    frndsd	%r7, %r2,0
    write	"%vf64(r7)"
    fcvtsd2iw	%r7, %r2,0
    write	"r7=%i64(r7)"
    ldi		%r7, 123456
    fcvtiw2sd	%r7, %r7,0
    write	"%vf64(r7)"

    write	"test minmax, abs minmax"
    fmaxsd	%r8, %r1, %r2
    write	"%vf64(r8)"
    fminsd	%r8, %r1, %r2
    write	"%vf64(r8)"
    famaxsd	%r8, %r1, %r2
    write	"%vf64(r8)"
    faminsd	%r8, %r1, %r2
    write	"%vf64(r8)"

    write	"test fmergesq\0"

.rodata
    align	16
xxxq:	quad	1.122
    quad	0.9999765432
.text
; FMA loop over constants from label "a"; %r15 is both the loaded quad and
; (before the loads) the loop count — NOTE(review): the ldq overwrites the
; counter loaded two lines earlier; confirm which is intended.
    ldar	%r21, a
; fast_check
    ldi		%r15, 100000 ; 10
    ldq		%r15, %r21, 0*16
    ldq		%r16, %r21, 1*16
    fsubsd	%r22, %r15, %r16
    write	"%vf64(r22)"
yyloop:
    fmaddsd	%r22, %r15, %r16, %r22
    fmsubsd	%r22, %r15, %r16, %r22
    repdge	%r15, %gz, yyloop
    write	"%vf64(r22)"


.rodata
    align	16
; Mixed quad constants, plus raw 128-bit patterns emitted as d8 pairs
; (low 64 bits first, then the sign/exponent half).
    quad	1.189731495357231765085759326628007e+4932
qqqq:   quad	1.23456789 + 32.0
    quad	0.2345678901234567890123456789012345678 + 0.2
    quad	2*asin(1)
    quad	255
dbl1:	double	acos(sin(3.1415926)) ;-1.2345678e+200
    double	444.689679
float1:	float	0.123456789123456789e+30
    float	2.123456789122233
    float	0.0
    float	1.0
octquad:
    quad	0.25
f32:	d4	0x3fff1234
.text
; Load and print the constants above in quad/double/float widths.
    ldar	%r45, qqqq
    ldar	%r46, dbl1
    ldar	%r47, float1
    write	"r45     %x64(r45)"
    ldq		%r63, %r45,0
    write	"%vf64(r63) %x128(r63)"
    ldq		%r63, %r45,0
    write	"%vf64(r63) %x128(r63)"
    fmulsq	%r62, %r63, %r63
    write	"%vf64(r62)"
    ldwz	%r60, %r47,0
    write	"%vf64(r60)"
    lddz	%r59, %r46,0
    ldwz	%r58, %r47,4
    ldwz	%r57, %r47,8
    write	"%vf64(r57)"
    write	"%vf64(r58)"
    write	"%vf64(r59)"
    ldq		%r53, %r45,1*16
    write	"%vf64(r53)"
    ldq		%r50, %r45,2*16
    write	"%vf64(r50)"
    ldq		%r49, %r45,3*16
    write	"%vf64(r49) %x128(r49)"
    ldwz	%r48, %r47,3*4
    write	"%vf64(r48)"
    fnegsq	%r46, %r48
    write	"%vf64(r46)"
    fmaddsq	%r40, %r52, %r52, %r53
    write	"%m(dump)"

.rodata
    align	16
__yyy:
; Table of quad values, some spelled as raw 64-bit halves (d8 pairs).
    quad	0.5
    quad	1.0
    quad	2.25
    quad	22252.22424
    quad	-22252.22424
    quad	34.125
    quad	2.0 / 72.0
    d8	0xffffffffffffffff
    d8	0x3ffe
    d8	0xffffffffffffffff
    d8	0x3ff0
    d8	0x8000000000000000
    d8	0xbff3
    d8	0x8000000000000000
    d8	0xc003
    quad	-1.234567890123456789012345e+6
    d8	0x8000000000000000
    d8	0x3fe0
.text
; Walk the __yyy table, printing each quad both as FP and raw 128-bit hex.
    ldar	%r12, __yyy
    ldq		%r23, %r12, 0
    write	"%vf64(r23) %x128(r23)"
    ldq		%r23, %r12, 1*16
    write	"%vf64(r23) %x128(r23)"
    ldq		%r23, %r12, 2*16
    write	"%vf64(r23) %x128(r23)"
    ldq		%r23, %r12, 3*16
    write	"%vf64(r23) %x128(r23)"
    ldq		%r23, %r12, 4*16
    write	"%vf64(r23) %x128(r23)"
    ldq		%r23, %r12, 5*16
    write	"%vf64(r23) %x128(r23)"
    ldq		%r23, %r12, 6*16
    write	"%vf64(r23) %x128(r23)"
    ldq		%r27, %r12, 7*16
    write	"%vf64(r27) %x128(r27)"
    ldq		%r27, %r12, 8*16
    write	"%vf64(r27) %x128(r27)"
    ldq		%r27, %r12, 9*16
    write	"%vf64(r27) %x128(r27)"
    ldq		%r27, %r12, 10*16
    write	"%vf64(r27) %x128(r27)"
;   flddi	%r24, 8.5899345919999999995e+09 ;-1.234567890123456789012345e+6
;   write	"%vf64(r24) %x128(f24)"
;   flddi	%r24, 0.125 ; 4.656612873077392578125e-10 ; 4.656612873077392578125e-10
;   write	"%vf64(r24) %x128(f24)"
    ldq		%r25, %r12, 11*16
    write	"%vf64(r25) %x128(r25)"
    ldq		%r25, %r12, 12*16
    write	"%vf64(r25) %x128(r25)"
; Load a quad constant directly from an immediate.
    fldqri	%r40, 4.345678912345678901234567890123456789012345678
    write	"%vf64(r40)"


; Scheduling filler: a chain of independent/dependent scalar FMAs.
    fmaddsd	%r23, %r60, %r55, %r33
    fmaddsd	%r24, %r61, %r25, %r32
    fmaddsd	%r25, %r62, %r55, %r23
    fmaddsd	%r26, %r63, %r75, %r73
    fmaddsd	%r27, %r64, %r75, %r73
    fmaddsd	%r28, %r65, %r85, %r63
    fmaddsd	%r29, %r66, %r85, %r63
    fmaddsd	%r30, %r67, %r95, %r23
    fmaddsd	%r31, %r68, %r95, %r23
    fmaddsd	%r10, %r21, %r26, %r27
    fmaddsd	%r13, %r21, %r26, %r27
    fmaddsd	%r10, %r21, %r26, %r27
    fmaddsd	%r12, %r21, %r26, %r27
    fmaddsd	%r11, %r21, %r26, %r27
    fmaddsd	%r13, %r21, %r26, %r27
    fmaddsd	%r14, %r21, %r26, %r27
    fmaddsd	%r15, %r21, %r26, %r27
    fmaddsd	%r16, %r21, %r26, %r27
    fmaddsd	%r17, %r21, %r26, %r27

; Store a block of quad registers to the stack (never executed — see jmp).
    stq	%r16, %sp,16*2
    stq	%r17, %sp,16*3
    stq	%r18, %sp,16*4
    stq	%r19, %sp,16*5
    stq	%r20, %sp,16*6
    stq	%r21, %sp,16*7
    stq	%r22, %sp,16*8
    stq	%r23, %sp,16*9
    stq	%r24, %sp,16*10
    stq	%r25, %sp,16*11
    stq	%r26, %sp,16*12
    stq	%r27, %sp,16*13
    stq	%r28, %sp,16*14
    stq	%r29, %sp,16*15
    stq	%r30, %sp,16*16
    stq	%r31, %sp,16*17


; SSE single
; Packed-single encodings, mirroring the packed-double block above.
    fmaddps	%r58, %r61, %r92, %r63
    fmsubps	%r82, %r52, %r92, %r63
    fnmaddps	%r82, %r52, %r69, %r63
    fnmsubps	%r50, %r91, %r69, %r63
    fmaddaps	%r82, %r52, %r69, %r63
    fmsubaps	%r50, %r91, %r69, %r63
    faddps	%r61, %r94, %r69
    fnaddps	%r68, %r54, %r72
    fsubps	%r68, %r61, %r82
    faddcps	%r81, %r71, %r82
    fsubcps	%r82, %r71, %r82
    faddhps	%r62, %r61, %r82
    fsubhps	%r62, %r61, %r62
    fmulps	%r62, %r51, %r62
    fmulhps	%r63, %r51, %r62
    fdotps	%r83, %r51, %r62
    fminps	%r83, %r61, %r62
    fmaxps	%r63, %r71, %r62
    faminps	%r64, %r71, %r82
    famaxps	%r64, %r71, %r82

    fcmppsone	%r65, %r61, %r62
    fcmppsolt	%r74, %r61, %r62
    fcmppsole	%r83, %r61, %r62
    fcmppsule	%r72, %r61, %r62
    fcmppsule	%r11, %r61, %r62
    fcmppsuo	%r20, %r61, %r62

    fpkps	%r33, %r64, %r62
    fnegps	%r60, %r69
    fabsdps	%r61, %r68, %r3
    fnabsdps	%r62, %r67, %r3
    frndps	%r63, %r66,0
    frndps	%r64, %r65,2
    frndps	%r65, %r64,1
    frndps	%r66, %r63,0
    fdivps	%r67, %r62, %r20
    fsqrtps	%r68, %r61
    frsqrtps	%r69, %r60

    faddps	%r24, %r61, %r60
    fmulpd	%r47, %r60, %r46

endfpsimd:

.end
.text
; --- Packed-integer SIMD tests -----------------------------------------------
.rodata
    align	16
; Four 128-bit test vectors, each written as two 64-bit halves.
mmxdata:
    d8	0x123456759eabcd7f
    d8	0x123456789cabcdef

    d8	0xf87f5432afebcdf3
    d8	0xffffffffffffffff

    d8	0x1234567890abcdef
    d8	0x1234567890abcdef

    d8	0x1234567890abcdef
    d8	0x1234567890abcdef
.text
    alloc	90
; Load the four vectors.  NOTE(review): %r4 is both the base pointer and the
; destination of the last ldq, so the base is gone afterwards — confirm.
    ldar	%r4, mmxdata
    ldq		%r1, %r4,0*16
    ldq		%r2, %r4,1*16
    ldq		%r3, %r4,2*16
    ldq		%r4, %r4,3*16
    write	"r1      %x128(r1)"
    write	"r2      %x128(r2)"

; Same register printed at each element width.
    write	"%vu8(r1)"
    write	"%vu16(r1)"
    write	"%vu32(r1)"
    write	"%vu64(r1)"

; Byte-wise add with carry-out / overflow-out companions.
    vaddub	%r3, %r1, %r2
    write	"test vadd/vaddc (1 byte)\0"
    vaddcb	%r4, %r1, %r2
    write	"%vu8(r1)"
    write	"%vu16(r2)"
    write	"%vu32(r3)"
    write	"%vu64(r4)"
    write	"test vadd/vaddo signed (1 byte)\0"
    vaddob	%r4, %r1, %r2
    write	"%vi8(r1)"
    write	"%vi16(r2)"
    write	"%vi32(r3)"
    write	"%vu64(r4)"

; Byte-wise subtract with borrow-out / overflow-out companions.
    vsubub	%r3, %r1, %r2
    write	"test vsub/vsubb (1 byte)\0"
    vsubbb	%r4, %r1, %r2
    write	"%vu8(r1)"
    write	"%vu8(r2)"
    write	"%vu8(r3)"
    write	"%vu8(r4)"
    write	"test vsub/vsubo signed (1 byte)\0"
    vsubob	%r4, %r1, %r2
    write	"%vi8(r1)"
    write	"%vi8(r2)"
    write	"%vi8(r3)"
    write	"%vu8(r4)"

; Saturating byte add/sub, unsigned and signed.
    write	"test vaddusb"
    vaddub	%r3, %r1, %r2
    vaddusb	%r4, %r1, %r2
    write	"%vu8(r1)\n%vu8(r2)\n%vu8(r3)\n%vu8(r4)"

    write	"test vsubusb"
    vsubub	%r3, %r1, %r2
    vsubusb	%r4, %r1, %r2
    write	"%vu8(r1):\n%vu8(r2)\n%vu8(r3)\n%vu8(r4)"

    write	"test vaddssb"
    vaddub	%r3, %r1, %r2
    vaddssb	%r4, %r1, %r2
    write	"%vi8(r1)\n%vi8(r2)\n%vi8(r3)\n%vi8(r4)"

    write	"test vsubssb"
    vsubub	%r3, %r1, %r2
    vsubssb	%r4, %r1, %r2
    write	"%vi8(r1)\n%vi8(r2)\n%vi8(r3)\n%vi8(r4)"

; Byte-wise average, min, max — unsigned and signed.
    write	"test pavgu (1 byte)\0"
    vavgub	%r3, %r1, %r2
    write	"%vu8(r1)\n%vu8(r2)\n%vu8(r3)"

    write	"test pavgs (1 byte)\0"
    vavgsb	%r3, %r1, %r2
    write	"%vi8(r1)\n%vi8(r2)\n%vi8(r3)"

    write	"test vminu (1 byte)\0"
    vminub	%r3, %r1, %r2
    write	"%vu8(r1)\n%vu8(r2)\n%vu8(r3)"

    write	"test vmins (1 byte)\0"
    vminsb	%r3, %r1, %r2
    write	"%vi8(r1)\n%vi8(r2)\n%vi8(r3)"

    write	"test vmaxu (1 byte)\0"
    vmaxub	%r3, %r1, %r2
    write	"%vu8(r1)\n%vu8(r2)\n%vu8(r3)"

    write	"test vmaxs (1 byte)\0"
    vmaxsb	%r3, %r1, %r2
    write	"%vi8(r1)\n%vi8(r2)\n%vi8(r3)"

; Byte merge low/high halves.
    write	"test merge low (1 byte)\0"
    vmrglb	%r3, %r1, %r2
    write	"%vu8(r1)\n%vu8(r2)\n%vu8(r3)"

    write	"test merge high (1 byte)\0"
    vmrghb	%r3, %r1, %r2
    write	"%vu8(r1)\n%vu8(r2)\n%vu8(r3)"

; Pack-with-saturation forms (halfword/word/doubleword sources).
    vpkuush	%r2, %r3, %r4
    vpksush	%r2, %r3, %r4
    vpksssh	%r2, %r3, %r4

    vpkuusw	%r2, %r3, %r4
    vpksusw	%r2, %r3, %r4
    vpksssw	%r2, %r3, %r4

    vpkuusd	%r2, %r3, %r4
    vpksusd	%r2, %r3, %r4
    vpksssd	%r2, %r3, %r4

;	jmp	endmmx
; d1 abs
; Assembler-coverage matrix: every vector op at every element width
; (b/h/w/d = byte/halfword/word/doubleword).
    vminsb	%r12, %r61, %r55
    vminsh	%r18, %r61, %r45
    vminsw	%r27, %r61, %r35
    vminsd	%r36, %r61, %r25

    vminub	%r14, %r61, %r15
    vminuh	%r15, %r62, %r75
    vminuw	%r17, %r63, %r85
    vminud	%r16, %r64, %r75

    vmaxsb	%r26, %r71, %r85
    vmaxsh	%r26, %r61, %r54
    vmaxsw	%r16, %r51, %r35
    vmaxsd	%r16, %r41, %r55

    vmaxub	%r11, %r61, %r53
    vmaxuh	%r12, %r55, %r55
    vmaxuw	%r16, %r46, %r56
    vmaxud	%r13, %r31, %r55

    vrolb	%r56, %r61, %r15
    vrolh	%r31, %r61, %r25
    vrolw	%r53, %r61, %r30
    vrold	%r62, %r61, %r41

    vrorb	%r16, %r11, %r52
    vrorh	%r11, %r21, %r63
    vrorw	%r71, %r31, %r74
    vrord	%r81, %r41, %r85

    vsllb	%r16, %r51, %r86
    vsllh	%r24, %r61, %r55
    vsllw	%r69, %r71, %r55
    vslld	%r77, %r81, %r55

    vsrlb	%r21, %r81, %r50
    vsrlh	%r12, %r63, %r51
    vsrlw	%r13, %r62, %r52
    vsrld	%r64, %r63, %r53

    vsrab	%r85, %r64, %r54
    vsrah	%r76, %r65, %r15
    vsraw	%r67, %r66, %r25
    vsrad	%r58, %r67, %r36

    vavgsb	%r49, %r68, %r47
    vavgsh	%r30, %r69, %r58
    vavgsw	%r26, %r11, %r69
    vavgsd	%r16, %r21, %r75

    vavgub	%r14, %r31, %r85
    vavguh	%r15, %r41, %r45
    vavguw	%r56, %r51, %r25
    vavgud	%r87, %r61, %r15

    vaddssb	%r42, %r71, %r15
    vaddssh	%r83, %r81, %r45
    vaddssw	%r74, %r41, %r85
    vaddssd	%r65, %r61, %r75

    vaddub	%r56, %r61, %r75
    vadduh	%r47, %r61, %r65
    vadduw	%r38, %r61, %r55
    vaddud	%r29, %r61, %r55

    vaddusb	%r55, %r61, %r45
    vaddush	%r65, %r61, %r35
    vaddusw	%r74, %r61, %r25
    vaddusd	%r84, %r61, %r15

    vaddcb	%r53, %r61, %r55
    vaddch	%r13, %r61, %r55
    vaddcw	%r12, %r61, %r55
    vaddcd	%r12, %r61, %r55

    vsubssb	%r56, %r61, %r15
    vsubssh	%r67, %r61, %r12
    vsubssw	%r78, %r61, %r13
    vsubssd	%r89, %r61, %r45

    vsubub	%r70, %r61, %r85
    vsubuh	%r86, %r61, %r45
    vsubuw	%r46, %r61, %r13
    vsubud	%r46, %r61, %r75

    vsubusb	%r41, %r68, %r65
    vsubush	%r12, %r37, %r55
    vsubusw	%r23, %r26, %r45
    vsubusd	%r14, %r18, %r35

    vcmpeqb	%r86, %r61, %r25
    vcmpeqh	%r44, %r72, %r15
    vcmpeqw	%r20, %r83, %r55
    vcmpeqd	%r16, %r84, %r55

;	pcmpne	%r106, %r61, %r55
;	pcmpgt	%r106, %r61, %r55
;	pcmpge	%r106, %r61, %r55
;	pcmple	%r106, %r61, %r55

    vcmpltb	%r13, %r61, %r15
    vcmplth	%r14, %r61, %r24
    vcmpltw	%r15, %r61, %r38
    vcmpltd	%r16, %r61, %r45

    vcmpltub	%r19, %r11, %r75
    vcmpltuh	%r18, %r21, %r82
    vcmpltuw	%r16, %r31, %r73
    vcmpltud	%r14, %r71, %r54

    vmrghb	%r11, %r71, %r13
    vmrghh	%r72, %r67, %r27
    vmrghw	%r13, %r58, %r55
    vmrghd	%r14, %r69, %r15

    vmrglb	%r76, %r61, %r11
    vmrglh	%r26, %r11, %r62
    vmrglw	%r16, %r15, %r73
    vmrgld	%r16, %r11, %r85

    write	"end simd(int) test"
endmmx:

.end
.text
; --- System-instruction test -------------------------------------------------
; Only the tpa (translate-physical-address) probe actually runs; the jmp
; skips the privileged/system instructions, which are assembled for
; encoding coverage only.
    alloc	70
    write	"test system instructions (assembler only)"

    addi	%sp, %sp, -32	; alloc stack frame
    write	"test tpa for sp: 0x%x64(sp)"
    tpa		%r4, %sp
    write	"tpa(sp): 0x%x64(r4)"
    addi	%sp, %sp, 32	; rollback stack frame
    
    jmp		system_skip

; Dead code from here to system_skip: encode-only coverage.
    ldi		%r45, 1012
    syscall
    nop		0
    sysret
    rfi

; Cache management: instruction/data cache invalidate, touch, flush.
    icbi	%r34, 16
    dcbt	%r34, 16
    dcbf	%r34, 16
    dcbi	%r34, 16


    mfspr	%r34, %lid
    mtspr	%r34, %lid
    mprobe	%r34, %r45, %r66
    retf	234567

    mfspr	%r32, %iv
    mfspr	%r32, %psr

; test system instructions
    ptc		%r10, %r45, %r11

    mfspr	%r12, %pta
    mfspr	%r12, %fpcr
    mtspr	%r11, %rsc

; test atomic fences
    fence.acquire
    fence.release
    fence.acq_rel
    fence.seq_cst

; Debug/instruction breakpoint and translation registers.
    mtdbr	%r44, %r66, 0
    mfdbr	%r55, %r66, 0
    mtibr	%r44, %r66, 0
    mfibr	%r55, %r66, 0
    mtitr	%r44, %r66, %r12
    mtdtr	%r44, %r66, %r12

;	bpa	b7, %r7
;	bpal	b7, b4, %r6
;	lpr	b7, %r6, label16

    undef
system_skip:
    write	"end test system instructions (assembler only)"
.end
.text
; --- Unaligned load test -----------------------------------------------------
.data
; 32 sequential bytes 0x00..0x1f so every loaded value identifies exactly
; which bytes were fetched at each misaligned offset.
; NOTE(review): "align 16" appears AFTER the label, so the label itself may
; precede the alignment padding — confirm the assembler's behavior.
data_unaligned:
align 16
    d1	0x00
    d1	0x01
    d1	0x02
    d1	0x03
    d1	0x04
    d1	0x05
    d1	0x06
    d1	0x07
    d1	0x08
    d1	0x09
    d1	0x0a
    d1	0x0b
    d1	0x0c
    d1	0x0d
    d1	0x0e
    d1	0x0f

    d1	0x10
    d1	0x11
    d1	0x12
    d1	0x13
    d1	0x14
    d1	0x15
    d1	0x16
    d1	0x17
    d1	0x18
    d1	0x19
    d1	0x1a
    d1	0x1b
    d1	0x1c
    d1	0x1d
    d1	0x1e
    d1	0x1f

.text
    write	"load/store unaligned"
    alloc	96
    ldafr	%r17, data_unaligned

; Halfword loads at offsets 0..2.
    ldhz	%r3, %r17, 0
    write	"%x16(r3)"
    ldhz	%r3, %r17, 1
    write	"%x16(r3)"
    ldhz	%r3, %r17, 2
    write	"%x16(r3)"

; Word loads at offsets 0..4 (crossing the 4-byte boundary).
    ldwz	%r3, %r17, 0
    write	"%x32(r3)"
    ldwz	%r3, %r17, 1
    write	"%x32(r3)"
    ldwz	%r3, %r17, 2
    write	"%x32(r3)"
    ldwz	%r3, %r17, 3
    write	"%x32(r3)"
    ldwz	%r3, %r17, 4
    write	"%x32(r3)"

; Doubleword loads at offsets 0..8 (crossing the 8-byte boundary).
    lddz	%r3, %r17, 0
    write	"%x64(r3)"
    lddz	%r3, %r17, 1
    write	"%x64(r3)"
    lddz	%r3, %r17, 2
    write	"%x64(r3)"
    lddz	%r3, %r17, 3
    write	"%x64(r3)"
    lddz	%r3, %r17, 4
    write	"%x64(r3)"
    lddz	%r3, %r17, 5
    write	"%x64(r3)"
    lddz	%r3, %r17, 6
    write	"%x64(r3)"
    lddz	%r3, %r17, 7
    write	"%x64(r3)"
    lddz	%r3, %r17, 8
    write	"%x64(r3)"

; Quadword loads at offsets 0..16 (crossing the 16-byte boundary).
    ldq	%r3, %r17, 0
    write	"%x128(r3)"
    ldq	%r3, %r17, 1
    write	"%x128(r3)"
    ldq	%r3, %r17, 2
    write	"%x128(r3)"
    ldq	%r3, %r17, 3
    write	"%x128(r3)"
    ldq	%r3, %r17, 4
    write	"%x128(r3)"
    ldq	%r3, %r17, 5
    write	"%x128(r3)"
    ldq	%r3, %r17, 6
    write	"%x128(r3)"
    ldq	%r3, %r17, 7
    write	"%x128(r3)"
    ldq	%r3, %r17, 8
    write	"%x128(r3)"
    ldq	%r3, %r17, 9
    write	"%x128(r3)"
    ldq	%r3, %r17, 10
    write	"%x128(r3)"
    ldq	%r3, %r17, 11
    write	"%x128(r3)"
    ldq	%r3, %r17, 12
    write	"%x128(r3)"
    ldq	%r3, %r17, 13
    write	"%x128(r3)"
    ldq	%r3, %r17, 14
    write	"%x128(r3)"
    ldq	%r3, %r17, 15
    write	"%x128(r3)"
    ldq	%r3, %r17, 16
    write	"%x128(r3)"
.end
.rodata
; --- Table-switch (computed jump) test ---------------------------------------
align 4
; NOTE(review): the four d4 entries before table_cases look like absolute
; label-address data (unused by jmpt below, which indexes table_cases) —
; confirm whether they are relocation-coverage data or leftovers.
    d4	table_cases
    d4	label_0
    d4	label_1
    d4	label_2

; Jump table proper: 32-bit offsets of each case label relative to the table.
table_cases:
    i4	label_0 - table_cases
    i4	label_1 - table_cases
    i4	label_2 - table_cases

.text
    alloc	80
    write	"test table switch to case 1"
; r4 = case index, r5 = table base; jmpt dispatches to table_cases[r4].
    ldi		%r4, 1
    ldafr	%r5, table_cases
    jmpt	%r5, %r4

; Each case body doubles as encode coverage for additional instructions.
label_0:
    write	"case 0"
    cmpqeq	%r12, %r24, %gz
    cmpqne	%r12, %r24, %gz
    deps	%r18, %r20, 13, 32
    depc	%r19, %r23, 13, 32
    ldi		%r12, -1234
    ldi		%r13, 3456
    jmp		label_after_switch

label_1:
    write	"case 1"
    andi	%r45, %r44, 12345
    sladd	%r14, %sp, %r12, 2
    sladd	%r12, %r23, %r44, 3
    mov		%r12, %r13
    ldi		%r24, 0
    mtspr	%r24, %psr
    mfspr	%r12, %psr
    nand	%r34, %r34, %r45
    sll		%r12, %r23, %r45
    slli	%r12, %r23, 45
    jmp		label_after_switch

label_2:
    write	"case 2"
    addi	%r34, %r34,-1
    mov		%r58, %r45
    sladd	%r12, %r15, %r30, 14
    sladd	%r12, %r15, %r30, 5
    sladd	%r12, %r15, %r30, 5
    srd		%r34, %r56, %r40
    srdi	%r34, %r56, 40
    depa	%r40, %r78, 40, 20
    sladd	%r54, %r45, %r22, 4
    sladd	%r54, %r45, %r22, 20
    ldax	%r3, %r45, %tp, 3, 55
    jmp		label_after_switch

label_after_switch:
    write	"end table switch test"
.end
.rodata
; --- Console write-format test -----------------------------------------------
; FP constants in quad/double/float width used to exercise %f128/%f64/%f32.
    align	16
console_test_quad:
    quad	1.189731495357231765085759326628007e+4932
console_test_quad2:
    quad	1.23456789 + 32.0
console_test_quad3:
    quad	0.2345678901234567890123456789012345678 + 0.2
    quad	2*asin(1)
    quad	255
console_test_double:
    double	acos(sin(3.1415926)) ;-1.2345678e+200
    double	444.689679
console_test_float:
    float	0.123456789123456789e+30
    float	2.123456789122233
    float	0.0
    float	1.0
.text
    alloc	35
; Special-register formatting.
    write	"ip=%s(ip), eip=%s(eip), psr=%s(psr)"

    write	"end test write special regs"

    write	"\ntest write: general register"

; Every integer width/format of the same register; "%%" prints a literal '%'.
    write	"%%i8(sp)  = %i8(sp)"
    write	"%%i16(sp) = %i16(sp)"
    write	"%%i32(sp) = %i32(sp)"
    write	"%%i64(sp) = %i64(sp)"
    write	"%%u8(sp)  = %u8(sp)"
    write	"%%u16(sp) = %u16(sp)"
    write	"%%u32(sp) = %u32(sp)"
    write	"%%u64(sp) = %u64(sp)"
    write	"%%x8(sp)  = 0x%x8(sp)"
    write	"%%x16(sp) = 0x%x16(sp)"
    write	"%%x32(sp) = 0x%x32(sp)"
    write	"%%x64(sp) = 0x%x64(sp)"

; Numbered and named general registers.
    write	"%x64(r0)"
    write	"%x64(r1)"
    write	"%x64(r2)"
    write	"%x64(r22)"
    write	"%x64(r33)"
    write	"%x64(g0)"
    write	"%x64(g1)"
    write	"%x64(tp)"
    write	"%x64(sp)"

    write	"end test write general regs"

; Load each constant and print it as raw hex plus its FP interpretation.
    ldqr	%r22, console_test_quad
    write	"r22 = %x128(r22) %f128(r22)"
    ldqr	%r22, console_test_quad2
    write	"r22 = %x128(r22) %f128(r22)"
    ldqr	%r22, console_test_quad3
    write	"r22 = %x128(r22) %f128(r22)"
    lddzr	%r22, console_test_double
    write	"r22 = %x64(r22) %f64(r22)"
    ldwzr	%r22, console_test_float
    write	"r22 = %x32(r22) %f32(r22)"

    write	"end test write fp regs"
.end