html-program

.text
; Test carry-less (polynomial) multiply instruction forms.
; clmul.ll/.hl/.hh select low/high 64-bit halves of the two sources;
; the "lh" combination is exercised by swapping the source operands of
; clmul.hl (the same trick used below for the result_01 check).
    alloc	96
    write	"test carry-less multiply"
    clmul.ll	%r34, %r21, %r22
    clmul.hl	%r34, %r21, %r22
    clmul.hl	%r34, %r22, %r21	; lh case via operand swap (was an exact duplicate of the line above)
    clmul.hh	%r34, %r21, %r22
.rodata
align 16
; 128-bit input vectors and the four expected 128-bit products.
vector_a:
    d8	0x7b5b546573745665
    d8	0x63746f725d53475d
vector_b:
    d8	0x4869285368617929
    d8	0x5b477565726f6e5d
result_00:
    d8	0x1d4d84c85c3440c0
    d8	0x929633d5d36f0451
result_01:
    d8	0x1bd17c8d556ab5a1
    d8	0x7fa540ac2a281315
result_10:
    d8	0x1a2bf6db3a30862f
    d8	0xbabf262df4b7d5c9
result_11:
    d8	0x1d1e1f2c592e7c45
    d8	0xd66ee03e410fd4ed
.text
    ld.q.r	%r12, vector_a
    ld.q.r	%r13, vector_b
; Each computed product is printed next to its precomputed expected value.
    clmul.ll	%r11, %r12, %r13
    ld.q.r	%r21, result_00
    write	"clmul: %x128(r11) %x128(r21)"
    clmul.hl	%r11, %r13, %r12	; lh case via operand swap
    ld.q.r	%r21, result_01
    write	"clmul: %x128(r11) %x128(r21)"
    clmul.hl	%r11, %r12, %r13
    ld.q.r	%r21, result_10
    write	"clmul: %x128(r11) %x128(r21)"
    clmul.hh	%r11, %r12, %r13
    ld.q.r	%r21, result_11
    write	"clmul: %x128(r11) %x128(r21)"

    write	"test aes"
; AES round-primitive encodings: decrypt/encrypt rounds, final rounds,
; inverse mix-columns, and key-schedule assist with immediate 250.
    aes.dec	%r11, %r12, %r13
    aes.dec.last	%r11, %r12, %r13
    aes.enc	%r11, %r12, %r13
    aes.enc.last	%r11, %r12, %r13
    aes.imc	%r11, %r12
    aes.keygen.assist %r11, %r12, 250
    write	"end aes test"
.end
.text
;*****************************************************************
; ARITHMETIC
;*****************************************************************
; Integer arithmetic smoke tests: short/long constants, add/sub/mul/div/mod,
; 32-bit (.ws) forms, overflow-checked forms, and fused carry/borrow ops.
    alloc	96
    write	"test load constant (1234567)"
    ldi		%r1, 1234567
    write	"ldi: %i64(r1)"

    write	"test load long constant (123456789012345678)"
    ldi.l	%r1, 123456789012345678
    write	"ldi long: %i64(r1)"

    write	"test simple arithmetic"
    ldi		%r1, 1
    ldi		%r2, 2
    ldi		%r3, 3

    write	"add 1+2"
    add		%r4, %r1, %r2
    write	"add: %i64(r4)"

    write	"add immediate 1+6"
    addi	%r4, %r1, 6
    write	"addi: %i64(r4)"

    write	"sub 1-2"
    sub		%r4, %r1, %r2
    write	"sub: %i64(r4)"

    write	"sub from immediate 6-1"
    subfi	%r4, %r1, 6		; reverse subtract: r4 = 6 - r1 (per banner above)
    write	"subfi: %i64(r4)"

    write	"mul 3*4"
    ldi		%r1, 3
    ldi		%r2, 4
    mul		%r4, %r1, %r2
    write	"mul: %i64(r4)"

    write	"12 div 4"
    ldi		%r1, 12
    ldi		%r2, 4
    div		%r4, %r1, %r2
    write	"%i64(r4)"

    write	"15 mod 4"
    ldi		%r1, 15
    ldi		%r2, 4
    mod		%r4, %r1, %r2
    write	"mod: %i64(r4)"

    write	"test int32_t add"
; .ws forms: presumably 32-bit ("word, signed") add variants — TODO confirm
; against the ISA manual; the banner says int32_t.
    ldi.l	%r1, 0xFFFFFFFF
    ldi.l	%r2, 0xFFFFFFF0
    add.ws	%r3, %r1, %r2
    write	"add4: %i64(r3)"
    addi.ws.l	%r3, %r1, 0xFFFFFFFF
    write	"addis4.l: %i64(r3)"

; Assorted encoding-coverage instructions below; results are not checked —
; they only have to assemble and execute without faulting.
    addi	%r45, %r45, 12
    mov		%r54, %r56
    sub		%r45, %r56, %r50
    addi	%r45, %r55, -1000
    cmp.ne.d	%r12, %r56, %r10
    subfi	%r45, %r56, -10000
    subfi	%r45, %r56, -20000
    cmp.eq.d	%r13, %r56, %r50
    add		%r45, %r56, %r50
    addi	%r45, %r56, -10000
    mul		%r45, %r56, %r50
    muli	%r45, %r56, -10000
    mov		%r55, %r20
    ldi		%r55, 1200
    ldi		%r55, 987654
    ldi.l	%r56, 98765432198765432
    addi	%r12, %r13, -789
    cmp.ne.d	%r14, %r13, %r77
    nand	%r43, %r44, %r34
    nor		%r43, %r44, %r34
    addi	%r56, %sp, 0
    ; callr	%r0, quadrat
    add		%r56, %sp, %sp

    ldi.l	%r55, -9223372036854775808	; INT64_MIN
    addi	%r56, %sp, -64
    subfi.l	%r55, %r56,12345678901234567
    nor		%r12, %r14, %r14
    addi	%r56, %sp, -64
    nor		%r12, %r14, %r14
    subfi.l	%r55, %r56, 12345678901234567
    addi	%r56, %sp, -64
    subfi.l	%r55, %r56, -12345678901234567
    addi	%r56, %sp, -64
    subfi.l	%r55, %r56, -12345678901234567
    addi.l	%r45, %r56, 12345678



    ldi.l	%r5, 0xaFFFFFFF12345677
    ldi.l	%r6, 0xaFFFFFFF12345678

    write	"test signed overflow: %i64(r5) %i64(r6)"

    write	"add overflow"
    addo	%r2, %r5, %r6		; overflow-checked add (per banner above)
    write	"addo: %i64(r2)"

    write	"subtract overflow"
    subo	%r2, %r5, %r6
    write	"subo: %i64(r2)"

    write	"test unsigned add carry"
    ldi		%r7, -1
    ldi		%r5, -2
    ldi		%r6, -1
    add.add.c	%r2, %r5, %r6, %r7	; fused 3-source add; banner says carry test
    write	"addaddc: %u64(r5) %u64(r6) %u64(r7) => %i64(r2)"

    write	"test unsigned subtract borrow"
    ldi		%r7, -1
    ldi		%r5, 12
    ldi		%r6, -1
    sub.sub.b	%r2, %r5, %r6, %r7	; fused 3-source subtract; banner says borrow test
    write	"subsub: %u64(r5) %u64(r6) %u64(r7) => %i64(r2)"

; Fused 3-source arithmetic encodings (results unchecked).
    mul.add	%r34, %r45, %r67, %r80
    mul.sub	%r34, %r45, %r67, %r80
    mul.subf	%r34, %r45, %r67, %r80
    add.add	%r34, %r45, %r67, %r80
    add.sub	%r34, %r45, %r67, %r80
    sub.sub	%r34, %r45, %r67, %r80

.end
.text
; Atomic-memory-operation (AMO) encoding coverage.
; Pattern: ld.<op>.<size>[.<order>] %dst, %base, %src
;   sizes:  b/h/w/d/q (byte .. quad)
;   orders: none, .a, .r, .ar — the lda/sta banners below name these
;           relaxed / acquire / release, so .ar is presumably
;           acquire+release — TODO confirm against the ISA manual.
    alloc 96
    write "test atomic fetch-op"
    addi %r5, %sp, -64
    write "atomic base: %x64(r5)"	; r5 = scratch stack memory used as the AMO target
    ldi  %r10, 5
    ldi  %r12, 10
    ldi  %r56, 5

    write "test amo-add"

    ld.add.b %r4, %r5, %r10
    ld.add.b.a %r4, %r5, %r10
    ld.add.b.r %r4, %r5, %r10
    ld.add.b.ar %r4, %r5, %r10

    ld.add.h %r4, %r5, %r10
    ld.add.h.a %r4, %r5, %r10
    ld.add.h.r %r4, %r5, %r10
    ld.add.h.ar %r4, %r5, %r10

    ld.add.w %r4, %r5, %r10
    ld.add.w.a %r4, %r5, %r10
    ld.add.w.r %r4, %r5, %r10
    ld.add.w.ar %r4, %r5, %r10

    ld.add.d %r4, %r5, %r10
    ld.add.d.a %r4, %r5, %r10
    ld.add.d.r %r4, %r5, %r10
    ld.add.d.ar %r4, %r5, %r10

    ld.add.q %r4, %r5, %r10
    ld.add.q.a %r4, %r5, %r10
    ld.add.q.r %r4, %r5, %r10
    ld.add.q.ar %r4, %r5, %r10

    write "test amo-and"

    ld.and.b %r4, %r5, %r10
    ld.and.b.a %r4, %r5, %r10
    ld.and.b.r %r4, %r5, %r10
    ld.and.b.ar %r4, %r5, %r10

    ld.and.h %r4, %r5, %r10
    ld.and.h.a %r4, %r5, %r10
    ld.and.h.r %r4, %r5, %r10
    ld.and.h.ar %r4, %r5, %r10

    ld.and.w %r4, %r5, %r10
    ld.and.w.a %r4, %r5, %r10
    ld.and.w.r %r4, %r5, %r10
    ld.and.w.ar %r4, %r5, %r10

    ld.and.d %r4, %r5, %r10
    ld.and.d.a %r4, %r5, %r10
    ld.and.d.r %r4, %r5, %r10
    ld.and.d.ar %r4, %r5, %r10

    ld.and.q %r4, %r5, %r10
    ld.and.q.a %r4, %r5, %r10
    ld.and.q.r %r4, %r5, %r10
    ld.and.q.ar %r4, %r5, %r10

    write "test amo-or"

    ld.or.b %r4, %r5, %r10
    ld.or.b.a %r4, %r5, %r10
    ld.or.b.r %r4, %r5, %r10
    ld.or.b.ar %r4, %r5, %r10

    ld.or.h %r4, %r5, %r10
    ld.or.h.a %r4, %r5, %r10
    ld.or.h.r %r4, %r5, %r10
    ld.or.h.ar %r4, %r5, %r10

    ld.or.w %r4, %r5, %r10
    ld.or.w.a %r4, %r5, %r10
    ld.or.w.r %r4, %r5, %r10
    ld.or.w.ar %r4, %r5, %r10

    ld.or.d %r4, %r5, %r10
    ld.or.d.a %r4, %r5, %r10
    ld.or.d.r %r4, %r5, %r10
    ld.or.d.ar %r4, %r5, %r10

    ld.or.q %r4, %r5, %r10
    ld.or.q.a %r4, %r5, %r10
    ld.or.q.r %r4, %r5, %r10
    ld.or.q.ar %r4, %r5, %r10

    write "test amo-xor"

    ld.xor.b %r4, %r5, %r10
    ld.xor.b.a %r4, %r5, %r10
    ld.xor.b.r %r4, %r5, %r10
    ld.xor.b.ar %r4, %r5, %r10

    ld.xor.h %r4, %r5, %r10
    ld.xor.h.a %r4, %r5, %r10
    ld.xor.h.r %r4, %r5, %r10
    ld.xor.h.ar %r4, %r5, %r10

    ld.xor.w %r4, %r5, %r10
    ld.xor.w.a %r4, %r5, %r10
    ld.xor.w.r %r4, %r5, %r10
    ld.xor.w.ar %r4, %r5, %r10

    ld.xor.d %r4, %r5, %r10
    ld.xor.d.a %r4, %r5, %r10
    ld.xor.d.r %r4, %r5, %r10
    ld.xor.d.ar %r4, %r5, %r10

    ld.xor.q %r4, %r5, %r10
    ld.xor.q.a %r4, %r5, %r10
    ld.xor.q.r %r4, %r5, %r10
    ld.xor.q.ar %r4, %r5, %r10

    write "test amo-smin"
    ld.smin.b %r4, %r5, %r10
    ld.smin.b.a %r4, %r5, %r10
    ld.smin.b.r %r4, %r5, %r10
    ld.smin.b.ar %r4, %r5, %r10

    ld.smin.h %r4, %r5, %r10
    ld.smin.h.a %r4, %r5, %r10
    ld.smin.h.r %r4, %r5, %r10
    ld.smin.h.ar %r4, %r5, %r10

    ld.smin.w %r4, %r5, %r10
    ld.smin.w.a %r4, %r5, %r10
    ld.smin.w.r %r4, %r5, %r10
    ld.smin.w.ar %r4, %r5, %r10

    ld.smin.d %r4, %r5, %r10
    ld.smin.d.a %r4, %r5, %r10
    ld.smin.d.r %r4, %r5, %r10
    ld.smin.d.ar %r4, %r5, %r10

    ld.smin.q %r4, %r5, %r10
    ld.smin.q.a %r4, %r5, %r10
    ld.smin.q.r %r4, %r5, %r10
    ld.smin.q.ar %r4, %r5, %r10

    write "test amo-smax"
    ld.smax.b %r4, %r5, %r10
    ld.smax.b.a %r4, %r5, %r10
    ld.smax.b.r %r4, %r5, %r10
    ld.smax.b.ar %r4, %r5, %r10

    ld.smax.h %r4, %r5, %r10
    ld.smax.h.a %r4, %r5, %r10
    ld.smax.h.r %r4, %r5, %r10
    ld.smax.h.ar %r4, %r5, %r10

    ld.smax.w %r4, %r5, %r10
    ld.smax.w.a %r4, %r5, %r10
    ld.smax.w.r %r4, %r5, %r10
    ld.smax.w.ar %r4, %r5, %r10

    ld.smax.d %r4, %r5, %r10
    ld.smax.d.a %r4, %r5, %r10
    ld.smax.d.r %r4, %r5, %r10
    ld.smax.d.ar %r4, %r5, %r10

    ld.smax.q %r4, %r5, %r10
    ld.smax.q.a %r4, %r5, %r10
    ld.smax.q.r %r4, %r5, %r10
    ld.smax.q.ar %r4, %r5, %r10

    write "test amo-umin"
    ld.umin.b %r4, %r5, %r10
    ld.umin.b.a %r4, %r5, %r10
    ld.umin.b.r %r4, %r5, %r10
    ld.umin.b.ar %r4, %r5, %r10

    ld.umin.h %r4, %r5, %r10
    ld.umin.h.a %r4, %r5, %r10
    ld.umin.h.r %r4, %r5, %r10
    ld.umin.h.ar %r4, %r5, %r10

    ld.umin.w %r4, %r5, %r10
    ld.umin.w.a %r4, %r5, %r10
    ld.umin.w.r %r4, %r5, %r10
    ld.umin.w.ar %r4, %r5, %r10

    ld.umin.d %r4, %r5, %r10
    ld.umin.d.a %r4, %r5, %r10
    ld.umin.d.r %r4, %r5, %r10
    ld.umin.d.ar %r4, %r5, %r10

    ld.umin.q %r4, %r5, %r10
    ld.umin.q.a %r4, %r5, %r10
    ld.umin.q.r %r4, %r5, %r10
    ld.umin.q.ar %r4, %r5, %r10

    write "test amo-umax"
    ld.umax.b %r4, %r5, %r10
    ld.umax.b.a %r4, %r5, %r10
    ld.umax.b.r %r4, %r5, %r10
    ld.umax.b.ar %r4, %r5, %r10

    ld.umax.h %r4, %r5, %r10
    ld.umax.h.a %r4, %r5, %r10
    ld.umax.h.r %r4, %r5, %r10
    ld.umax.h.ar %r4, %r5, %r10

    ld.umax.w %r4, %r5, %r10
    ld.umax.w.a %r4, %r5, %r10
    ld.umax.w.r %r4, %r5, %r10
    ld.umax.w.ar %r4, %r5, %r10

    ld.umax.d %r4, %r5, %r10
    ld.umax.d.a %r4, %r5, %r10
    ld.umax.d.r %r4, %r5, %r10
    ld.umax.d.ar %r4, %r5, %r10

    ld.umax.q %r4, %r5, %r10
    ld.umax.q.a %r4, %r5, %r10
    ld.umax.q.r %r4, %r5, %r10
    ld.umax.q.ar %r4, %r5, %r10

; Compare-and-swap in all sizes/orderings.
    write "test cas"

    cas.b %r12, %r5, %r56
    cas.b.a %r12, %r5, %r56
    cas.b.r %r12, %r5, %r56
    cas.b.ar %r12, %r5, %r56

    cas.h %r12, %r5, %r56
    cas.h.a %r12, %r5, %r56
    cas.h.r %r12, %r5, %r56
    cas.h.ar %r12, %r5, %r56

    cas.w %r12, %r5, %r56
    cas.w.a %r12, %r5, %r56
    cas.w.r %r12, %r5, %r56
    cas.w.ar %r12, %r5, %r56

    cas.d %r12, %r5, %r56
    cas.d.a %r12, %r5, %r56
    cas.d.r %r12, %r5, %r56
    cas.d.ar %r12, %r5, %r56

    cas.q %r12, %r5, %r56
    cas.q.a %r12, %r5, %r56
    cas.q.r %r12, %r5, %r56
    cas.q.ar %r12, %r5, %r56

    write "test load atomic relaxed"
    lda.b %r12, %r5
    lda.h %r12, %r5
    lda.w %r12, %r5
    lda.d %r12, %r5
    lda.q %r12, %r5

    write "test load atomic acquire"
    lda.b.a %r12, %r5
    lda.h.a %r12, %r5
    lda.w.a %r12, %r5
    lda.d.a %r12, %r5
    lda.q.a %r12, %r5

    write "test store atomic relaxed"
    sta.b %r12, %r5
    sta.h %r12, %r5
    sta.w %r12, %r5
    sta.d %r12, %r5
    sta.q %r12, %r5

    write "test store atomic release"
    sta.b.r %r12, %r5
    sta.h.r %r12, %r5
    sta.w.r %r12, %r5
    sta.d.r %r12, %r5
    sta.q.r %r12, %r5

.end
.text
; Base/relocation addressing tests plus load/store addressing-mode coverage.
.data
data_lbl:
    d1	25
    d1	26
    d1	27
    d1	28

.text
program_start:
; Here we test references to data section.
; Absolute offset from begin of section
    write	"base addressing"
    alloc	96
    ca.r	%r17, program_start
    ldi		%r12, data_lbl
    write	"data_lbl: %i64(r12)"

; data_hi()/data_lo() split a data-section offset into high/low parts
; (combined below via addi high then low).
    ldi		%r12, data_hi(data_lbl)
    write	"data_hi(data_lbl): %i64(r12)"
    ldi		%r12, data_lo(data_lbl)
    write	"data_lo(data_lbl): %i64(r12)"
    ca.rf	%r13, data_lbl
    write	"ca.rf(data_lbl): %x64(r13)"
    ca.rf.l	%r13, data_lbl
    write	"ca.rf(data_lbl): %x64(r13)"

    addi	%r13, %r17, data_hi(data_lbl)
    write	"r13     %i64(r13)"
    addi	%r14, %r13, data_lo(data_lbl)+0
    write	"r14     %i64(r14)"

    addi	%r13, %r17, data_hi(data_lbl)
    write	"r13     %i64(r13)"
; Byte loads of the four d1 values emitted at data_lbl above.
    ldz.b	%r25, %r13, data_lo(data_lbl)+0
    ldz.b	%r26, %r13, data_lo(data_lbl)+1
    ldz.b	%r27, %r13, data_lo(data_lbl)+2
    ldz.b	%r28, %r13, data_lo(data_lbl)+3
    write	"r25     %i64(r25)" ; must be 25
    write	"r26     %i64(r26)" ; must be 26
    write	"r27     %i64(r27)" ; must be 27
    write	"r28     %i64(r28)" ; must be 28

; test load context
    ldz.d	%r1, %sp, -16
    st.d	%r1, %sp, -16
; Control transfers to skipaddr here; everything from this point to the
; skipaddr label is assemble-only encoding coverage, not executed.
    jmp		skipaddr
    jmp.l	skipaddr

; test indexed load/store
; xd form: %dst/src, %base, %index, scale-shift, displacement.
    st.b.xd	%r12, %r15, %r30, 4, 14
    st.h.xd	%r12, %r15, %r30, 4, 14
    st.w.xd	%r12, %r15, %r30, 4, 14
    st.d.xd	%r12, %r15, %r30, 4, 14

    lda.q %r30, %r56
    sta.q %r43, %r56

    sl.add	%r43, %r56, %r23, 4
    sl.sub	%r43, %r56, %r23, 42
    sl.subf	%r43, %r56, %r23, 12

    ldz.w	%r30, %r5, 66*4	; load mid
    ldz.d.xd	%r40, %tp, %r30, 0, 4	; load base

    lds.d.xd	%r12, %r23, %r40, 3, 54
    lds.d.xd	%r12, %r23, %r40, 3, 54
    ldz.d.xd	%r12, %r23, %r40, 3, 54
    ldz.d.xd	%r12, %r23, %r40, 3, 54
    st.w.xd	%r12, %r23, %r40, 3, 54
    st.d.xd	%r12, %r23, %r40, 3, 54

    lds.b.xd	%r12, %r23, %r40, 3, 54
    lds.b.xd	%r12, %r23, %r40, 3, 54
    ldz.b.xd	%r12, %r23, %r40, 3, 54
    ldz.b.xd	%r12, %r23, %r40, 3, 54
    st.b.xd	%r12, %r23, %r40, 3, 54
    st.b.xd	%r12, %r23, %r40, 3, 54

    lds.h.xd	%r12, %r23, %r40, 3, 54
    lds.h.xd	%r12, %r23, %r40, 3, 54
    ldz.h.xd	%r12, %r23, %r40, 3, 54
    ldz.h.xd	%r12, %r23, %r40, 3, 54
    st.h.xd	%r12, %r23, %r40, 3, 54
    st.h.xd	%r12, %r23, %r40, 3, 54

.text
; LOAD/STORE
; ldz = zero-extending load, lds = sign-extending load;
; .mia/.mib presumably post/pre-modify addressing — TODO confirm vs manual.
    sl.add	%r54, %r56, %r12, 5

    ldz.b	%r16, %r45, 8900
    lds.b	%r15, %r46, 8900
    ldz.b.xd	%r54, %r56, %r12, 2, 37
    lds.b.xd	%r53, %r65, %r12, 2, 37
    ldz.b.xd.l	%r54, %r56, %r12, 2, 37000000
    lds.b.xd.l	%r53, %r65, %r12, 2, -37000000
    ldz.b.mia	%r52, %r75, 10
    lds.b.mia	%r51, %r76, 10
    ldz.b.mib	%r52, %r75, 10
    lds.b.mib	%r51, %r76, 10
    st.b.mia	%r51, %r76, 10
    st.b.mib	%r52, %r75, 10

    ldz.h	%r12, %r45, 8900
    lds.h	%r12, %r45, 8900
    ldz.h.xd	%r54, %r56, %r12, 3, -57
    lds.h.xd	%r54, %r56, %r12, 2, 37
    ldz.h.xd.l	%r54, %r56, %r12, 2, 37000000
    lds.h.xd.l	%r53, %r65, %r12, 2, -37000000
    ldz.h.mia	%r54, %r56, 12
    lds.h.mia	%r54, %r56, -60
    ldz.h.mib	%r54, %r56, 12
    lds.h.mib	%r54, %r56, -60
    st.h.mia	%r51, %r76, 10
    st.h.mib	%r52, %r75, 10

    ldz.w	%r12, %r45, 8900
    lds.w	%r12, %r45, 8900
    ldz.w.xd	%r54, %r56, %r12, 2, 7
    lds.w.xd	%r54, %r56, %r12, 2, 7
    ldz.w.xd.l	%r54, %r56, %r12, 2, 37000000
    lds.w.xd.l	%r53, %r65, %r12, 2, -37000000
    ldz.w.mia	%r54, %r56, 12
    lds.w.mia	%r54, %r56, 32
    ldz.w.mib	%r54, %r56, 12
    lds.w.mib	%r54, %r56, 32
    st.w.mia	%r51, %r76, 10
    st.w.mib	%r52, %r75, 10

    ldz.d	%r54, %r56, 5600
    lds.d	%r54, %r56, 5600
    ldz.d.l	%r53, %r46, 98765432
    ldz.d	%r52, %r45, -5600
    ldz.d.l	%r51, %r55, -98765432
    ldz.d.xd	%r50, %r56, %r12, 2, 37
    lds.d.xd	%r50, %r56, %r12, 2, 37
    ldz.d.xd.l	%r54, %r56, %r12, 2, 37000000
    lds.d.xd.l	%r53, %r65, %r12, 2, -37000000
    ldz.d.mia	%r57, %r56, -12
    ldz.d.mia	%r57, %r56, -12
    lds.d.mia	%r57, %r56, -12
    lds.d.mia	%r57, %r56, -12
    ldz.d.mib	%r57, %r56, -12
    ldz.d.mib	%r57, %r56, -12
    lds.d.mib	%r57, %r56, -12
    lds.d.mib	%r57, %r56, -12
    st.d.mia	%r51, %r76, 10
    st.d.mib	%r52, %r75, 10

    ld.q		%r16, %r45, 8900
    ld.q.l	%r16, %r45, 8900000
    ld.q.l	%r16, %r45, -8900000
    ld.q.xd	%r54, %r56, %r12, 2, 37
    ld.q.xd.l	%r54, %r56, %r12, 2, 37000000
    ld.q.xd.l	%r54, %r56, %r12, 2, -37000000
    ld.q.mia	%r52, %r75, 10
    ld.q.mia	%r52, %r75, 10
    ld.q.mib	%r52, %r75, 10
    ld.q.mib	%r52, %r75, 10
    st.q.mia	%r51, %r76, 10
    st.q.mib	%r52, %r75, 10

    st.b	%r12, %r45, 8900
    st.h	%r12, %r45, 8900
    st.w	%r12, %r45, 8900
    st.d	%r12, %r45, 890*8

    ldz.d	%r12, %r45, 8048
    st.d	%r12, %r45, 8064
    ldz.d.xd	%r12, %r45, %r13, 3, 7
    st.d.xd	%r12, %r45, %r13, 3, 7

    ldz.d	%r60, %r55, 56
    ldz.d	%r60, %r56, 56
    ldz.d	%r46, %r55, 120
    st.d	%r47, %r55, 56

    ldz.d	%r60, %sp, 624
    st.d	%r60, %sp, 624
    ldz.d.xd	%r60, %sp, %r12, 3, 28
    st.d.xd	%r60, %sp, %r12, 3, 26
    ldz.d	%r56, %r57, 567
    st.d	%r56, %r57, 567

    ldz.w	%r34, %r12, 900
    ldz.d	%r34, %r12, 900
    st.w	%r23, %r12, 900
    st.d	%r23, %r12, 900

    ld.q	%r34, %r13, 55*16
    st.q	%r35, %r13, 55*16
    ld.q.xd	%r34, %r13, %r45, 3, 60
    st.q.xd	%r34, %r13, %r45, 3, 60

skipaddr:
    nop	0
.end
.text
; Bit-count instruction coverage: cnt.pop (population count),
; cnt.lz (leading zeros), cnt.tz (trailing zeros).
; The trailing immediate varies 0..5 — presumably a sub-width/shift
; selector; TODO confirm its meaning against the ISA manual.
    alloc	25
    ldi.l	%r23, 0x1234567890abcdef
    write	"test population statistic instructions"
    cnt.pop	%r12, %r23, 3
    write	"cntpop: %i64(r12)"
    cnt.lz	%r12, %r23, 0
    write	"cntlz %i64(r12)"
    cnt.tz	%r12, %r23, 1
    cnt.lz	%r12, %r23, 2
    cnt.tz	%r12, %r23, 3
    cnt.lz	%r12, %r23, 4
    cnt.tz	%r12, %r23, 5
.end
.text
; permb bit-permutation test. The 6-bit immediate selects which levels of
; the butterfly reversal apply: 63 (0b111111) reverses single bits, and
; clearing low bits coarsens the granularity (pairs, nibbles, 1/2/4 bytes),
; as described by the write banners below.
    write	"test bit reverse instruction (permb)"
    alloc	80
    ldi.l	%r55, 0x1234567890ABCDEF
    write	"initial value: %x64(r55)"
    permb	%r55, %r55, 63
    permb	%r56, %r78, 63	; encoding coverage only; %r78 is not initialized here
    write	"r55 %x64(r55) %b64(r55)"
    permb	%r55, %r55, 63	; second reversal restores the original r55
    write	"r55 %x64(r55) %b64(r55)"

    permb	%r56, %r55, 0b111111 ;63
    write	"reverse bits: %x64(r56)"

    permb	%r56, %r55, 0b111110  ;32+16+8+4+2
    write	"reverse bit-pairs: %x64(r56)"

    permb	%r56, %r55, 0b111100  ;32+16+8+4
    write	"reverse nibbles (4-bits): %x64(r56)"

    permb	%r56, %r55, 0b111000 ;32+16+8
    write	"reverse 1bytes: %x64(r55) => %x64(r56)"

    permb	%r56, %r55, 0b110000  ;32+16
    write	"reverse 2bytes: %x64(r55) => %x64(r56)"

    permb	%r56, %r55, 0b100000  ;32
    write	"reverse 4bytes: %x64(r55) => %x64(r56)"
.end
.text
; Bitwise logical instruction coverage: register, short-immediate (i),
; and long-immediate (i.l) forms; plus negated variants (andn/orn/nor/nand/xnor).
    alloc	46
    write	"test bitwise logical"
    and		%r23, %r25, %r45
    andi	%r23, %r25, 12345
    andi.l	%r23, %r25, 1234567890
    andn	%r23, %r25, %r45
    andni	%r23, %r25, 12345
    or		%r23, %r25, %r45
    ori		%r23, %r25, 12345
    ori.l	%r23, %r25, 1234567890
    orn		%r23, %r25, %r45
    orni	%r23, %r25, 12345
    xor		%r23, %r25, %r45
    xori	%r23, %r25, 12345
    xori.l	%r23, %r25, 1234567890
    nor		%r23, %r25, %r45
    nand	%r23, %r25, %r45
    xnor	%r23, %r25, %r45
.end
.text
; Branch + memory loop: sweep cyclically over an 8-entry table of d8 values,
; loading each with an indexed load, for 200000 iterations.
    write	"branch-int, test memory"
.data
align 8
test_memory:
    d8	0
    d8	1
    d8	2
    d8	3
    d8	4
    d8	5
    d8	6
    d8	7
.text
    alloc	20
    ca.rf	%r12, test_memory
    write	"test_memory: %x64(r12)"
    ldi		%r11, 0		; r11 = table index, wraps 0..7
    ldi		%r14, 0		; r14 = iteration counter
memory_loop: (32)
    ldz.d.xd	%r13, %r12, %r11, 3, 0	; r13 = test_memory[r11] (scale 8)
    addi	%r11, %r11, 1
    addi	%r14, %r14, 1
    andi	%r11, %r11, 7	; wrap index to 0..7
; fast_check
    bsi.lt.d.l	%r14, 200000, memory_loop
    write	"counter: %i64(r14)"
.end
.text
; Compare-with-zero branch coverage: every signed condition against %gz
; (the zero register), in short and long (.l) displacement forms, to both
; a forward and a backward target. The first b.eq.d (same register against
; itself, always taken) exits immediately so the coverage below only has
; to assemble; reaching it at run time loops via the forward target.
    alloc	20
    write	"test compare-with-zero-and-long-branch"
compare_with_zero_test_continue:
compare_with_zero_backward_target:
    addi	%r2, %r2, 1
    b.eq.d	%r2, %r2, compare_with_zero_test_exit	; always taken: r2 == r2

    b.eq.d	%r1, %gz, compare_with_zero_forward_target
    b.eq.d.l	%r1, %gz, compare_with_zero_forward_target
    b.eq.d	%r1, %gz, compare_with_zero_backward_target
    b.eq.d.l	%r1, %gz, compare_with_zero_backward_target
    b.ne.d	%r1, %gz, compare_with_zero_forward_target
    b.ne.d.l	%r1, %gz, compare_with_zero_forward_target
    b.ne.d	%r1, %gz, compare_with_zero_backward_target
    b.ne.d.l	%r1, %gz, compare_with_zero_backward_target
    bs.lt.d	%r1, %gz, compare_with_zero_forward_target
    bs.lt.d.l	%r1, %gz, compare_with_zero_forward_target
    bs.lt.d	%r1, %gz, compare_with_zero_backward_target
    bs.lt.d.l	%r1, %gz, compare_with_zero_backward_target
    bs.le.d	%r1, %gz, compare_with_zero_forward_target
    bs.le.d.l	%r1, %gz, compare_with_zero_forward_target
    bs.le.d	%r1, %gz, compare_with_zero_backward_target
    bs.le.d.l	%r1, %gz, compare_with_zero_backward_target
    bs.gt.d	%r1, %gz, compare_with_zero_forward_target
    bs.gt.d.l	%r1, %gz, compare_with_zero_forward_target
    bs.gt.d	%r1, %gz, compare_with_zero_backward_target
    bs.gt.d.l	%r1, %gz, compare_with_zero_backward_target
    bs.ge.d	%r1, %gz, compare_with_zero_forward_target
    bs.ge.d.l	%r1, %gz, compare_with_zero_forward_target
    bs.ge.d	%r1, %gz, compare_with_zero_backward_target
    bs.ge.d.l	%r1, %gz, compare_with_zero_backward_target

compare_with_zero_forward_target:
    jmp		compare_with_zero_test_continue
compare_with_zero_test_exit:
    write	"end test compare-with-zero-and-long-branch"
.end
.text
; Call/return test: jumps over the quadrat subroutine, calls it once with
; register-stack arguments, then exercises ABI registers and the remaining
; call-instruction encodings (assemble-only, skipped by jmp call_exit).

call_code_target:

.rodata
call_data_target:

.text
    jmp	callexample
;*****************************************************************
; Function  compute A**4 of parameter A, passed in register r33
;*****************************************************************
quadrat:
    write	"function quadrat entered: r0=%x128(r0)"
    alloc	93
    write	"rsc     %s(rsc)"
    write	"psr     %s(psr)"
    write	"rsc     %s(rsc)"
    mul	%r33, %r33, %r33	; A**2
    mul	%r33, %r33, %r33	; A**4
    write	"r0=%x128(r0) r33=%i64(r33)"
    write	"%m(dump)"
;	mtspr	%r45, psr
    write	"function quadrat exited"
    ret
end_quadrat:

;*****************************************************************
; Example of calling sequence with branch prediction
callexample:
    alloc	91
    ldi.l	%r90, 0x1234567890abcdef
    write	"arg3 %x64(r90)"
    srpi	%r89, %r90, %r90, 16
    write	"arg2 %x64(r89)"
    srpi	%r88, %r90, %r90, 16
    write	"arg1 %x64(r88)"
    ldi		%r87, 7		; setup arguments
;   write	"%m(dump)"
    write	"rsc: %s(rsc)"
    write	"function quadrat called"
    call.r	%r86, quadrat
    write	"rsc: %s(rsc)"
; Rest instructions after return from subroutine
;*****************************************************************
.text	; return to code section

; Here we test registers used by ABI (application binary interface)
; Check loader.
    write	"sp=%x64(sp) tp=%x64(tp) r0=%x128(r0)"
    write	"rsc: %s(rsc)"
    write	"psr: %s(psr)"
    write	"r14: %x64(r14)"
    write	"reta: %i64(r72)"		; out return address
    write	"retv: %i64(r73)"		; out return value
    write	"rsc: %s(rsc)"
    write	"psr: %s(psr)"		; label corrected: this line prints psr (rsc is printed just above)
    ldi.l	%r11, 0x407d8bffffccccff
    write	"r11: %x64(r11)"
    addi.l	%r12, %r11, 0x400000
    write	"r12: %x64(r12)"
    xor		%r20, %r19, %r11
    addi.l	%r20, %r20, 0x400000
    ldi		%r10, 10
    ldi		%r11, 11
    cmps.lt.d	%r2, %r11, %r10
    write	"%i64(r11) %i64(r10)"
    jmp		call_exit

; Assemble-only coverage of remaining call encodings (skipped at run time).
    call.r	%r42, quadrat
    call.ri	%r42, %r34, %gz
    call.mi	%r42, %r34, 468
    call.plt	%r42, call_data_target
    call.ri	%r42, %r34, %gz

call_exit:
    write	"end call test"

.end
.text
; Register-stack call tests: a plain call (func) and a recursive countdown
; (rekurs), printing the register-stack control registers rsc/rsp around each.
    alloc	47
    write	"test recursive calls"
    ldi.l	%r46, 0x7FFFFFFFFFFFFFFF		; comment
    ldi.l	%r46, 0x8000000000000000
    addi	%r46, %r46, -1			; r46 = 0x7FFF... again (INT64_MAX)
    write	"%i64(r46)"

    mf.spr	%r20, %rsc	; read register-stack control SPR

    alloc	54		; extend frame to 54 regs
    ldi		%r48, 1		; 
    ldi		%r53, 3		; 1 arg (33+16)
    ldi		%r52, 2		; 2 arg (34+16)
    ldi		%r51, 1		; 3 arg (35+16)
    write	"rsc: %s(rsc)"
    call.r	%r50, func	; call func subroutine, safe 50 regs
    write	"r51=%i64(r51) rsc=%s(rsc)"
    ldi		%r53, 10	; recursion depth argument
    call.r	%r52, rekurs
    write	"rsc: %s(rsc)"
    write	"rsp: %s(rsp)"
;   write	"%m(dump)"
    jmp	smallend
func:
; at entry point func subroutine has 4 regs in frame
    alloc	8   ; extend frame from 4 to 8 regs
    write	"r0      %x128(r0)"		; print packed caller frame and return address
    write	"r1=%i64(r1) r2=%i64(r2) r3=%i64(r3)" ; print args
    ldi		%r1, 12345
    ret

rekurs:
; Recursive countdown: r1 is the remaining depth; recurse until it hits 0.
    alloc	4
    write	"r0=%x128(r0) r1=%i64(r1)"
    write	"rsc: %s(rsc)"
    write	"rsp: %s(rsp)"
    addi	%r3, %r1, -1	; r3 = depth-1, becomes callee's argument
    ldi		%r2, 0
    b.eq.d	%r1, %r2, rekret	; stop at depth 0
;	cneq	%r1, %r2, 1, 0
    call.r	%r2, rekurs
rekret:
    write	"rsp: %s(rsp)"
    write	"r0: %x128(r0)"
    ret.f	0
smallend:
    nop		0
    nop		111
    alloc	96
    write	"end_call_recursive"
.end
.text
; Minimal register-stack call example with an eh.adj (exception-handling
; adjustment) record pointing past the function.
    ; at the beginning of the program, the register stack is empty
    alloc	54   ; expand frame to 54 registers
    eh.adj	simple_func_end
    ldi		%r47, 1  ; will be saved when called
    ldi		%r53, 3  ; first argument
    ldi		%r52, 2  ; second argument
    ldi		%r51, 1  ; third argument
    ; func procedure call, all registers up to 50 will be saved,
    ; return address, eip, frame size (50) are saved in r50
    call.r	%r50, simple_func
    ; after returning the frame is restored to its pre-call size
    ; (alloc above was 54; original comment said 53 — verify)
    jmp		simple_func_end
simple_func:
    ; at the starting point, the func procedure has a 5-register frame
    ; their previous numbers are 50, 51, 52, 53, new - 0, 1, 2, 3
    ; extend the frame to 10 registers (another 4,5,6,7,8,9)
    alloc	10
    write	"r0 = %x128(r0)"	; print packed return info
    write	"r1 = %i64(r1)"	; print 1st argument
    write	"r2 = %i64(r2)"	; print 2nd argument
    write	"r3 = %i64(r3)"	; print 3rd argument
    ret
simple_func_end:
    nop		123
.end
.text
; Multiprecision arithmetic built from cmp (carry detect) + add pairs,
; then the dedicated addc/subb and fused add.add.c/sub.sub.b forms.
    write "example of carry/borrow testing"
    alloc	96

; 256-bit add (r30,r31,r34,r33) + (r40,r41,r42,r43) => (r50,r51,r52,r53)
; NOTE(review): the original comment named r32, but the code below loads and
; uses %r34 consistently — possibly a typo for %r32 in the ldi; verify intent.
    ldi	%r30, -1
    ldi	%r31, -1
    ldi	%r34, -1	; NOTE(review): possibly meant %r32 (see header comment)
    ldi	%r33, -1

    ldi	%r40, 1
    ldi	%r41, 0
    ldi	%r42, 0
    ldi	%r43, 0

; throw add
    cmp.eq.d	%r10, %r30, %r40	; add carry out
    add		%r50, %r30, %r40	; add
    cmpi.eq.d	%r12, %r31, 1
    addi	%r51, %r31, 1

    cmp.eq.d	%r12, %r31, %r41	; add carry out
    add		%r51, %r31, %r41	; add
    cmp.eq.d	%r14, %r34, %r42	; add carry out
    add		%r52, %r34, %r42	; add
    cmp.eq.d	%r8, %r33, %r43	; add carry out
    add		%r53, %r33, %r43	; add
    write	"add carryis"
; propagate carries into the upper limbs
    addi	%r51, %r51, 1
    addi	%r52, %r52, 1
    addi	%r53, %r53, 1
; set last carry
    ldi		%r54, 1
    ldi		%r54, 0
    write	"multiprecision add:\nr50,r51,r52,r53,r54 = %x64(r50) %x64(r51) %x64(r52) %x64(r53) %x64(r54)"

    ldi.l	%r40, 0x7fffffffffffffff
    mul.h	%r40, %r40, %r41	; high half of the 128-bit product
    write	"r40     %x64(r40)"

    ldi		%r12, 12345
    ldi.l	%r12, 12345678900

;	ldi	%r14, 0xFFFFFFFFF0
;	ld8	%r13, %r14, 0

; Dedicated carry/borrow and fused 3-source forms (encoding coverage).
    addc	%r12, %r14, %r46
    addc	%r12, %r14, %r46
    subb	%r12, %r14, %r46
    subb	%r12, %r14, %r46
    add.add.c	%r12, %r14, %r46, %r23
    add.add.c	%r12, %r14, %r46, %r22
    sub.sub.b	%r12, %r14, %r46, %r13
    sub.sub.b	%r12, %r14, %r46, %r14
    write	"end carry test"
    nop	11111
.end
.text
; Compare instruction coverage. Naming scheme (per the banners/aliases below):
;   cmp.*  = register/register, cmp*i = immediate, .l = long immediate,
;   s/u    = signed/unsigned ordering, .d/.w = 64-bit/32-bit operand width.
    write	"test compare"
    alloc	96
    ldi		%r20, 4
    ldi		%r21, 3
    ldi		%r22, -4
    ldi		%r23, -12
    write	"test compare instructions"
    cmp.eq.d	%r12, %r20, %r21
    cmps.lt.d	%r12, %r20, %r21
    cmpu.lt.d	%r12, %r20, %r21
    cmpi.eq.d	%r12, %r20, 123456
    cmpsi.lt.d	%r12, %r20, 123456
    cmpui.lt.d	%r12, %r20, 123456
    cmp.ne.d	%r12, %r20, %r21
    cmpi.ne.d	%r12, %r20, 123456
    cmpsi.gt.d	%r12, %r20, 123456
    cmpui.gt.d	%r12, %r20, 123456
    cmps.le.d	%r12, %r20, %r21
    cmpu.le.d	%r12, %r20, %r21

    cmpsi.ge.d	%r12, %r20, 123456
    cmpui.ge.d	%r12, %r20, 123456
    cmpsi.le.d	%r12, %r20, 123456
    cmpui.le.d	%r12, %r20, 123456

    cmp.eq.w	%r12, %r20, %r21
    cmps.lt.w	%r12, %r20, %r21
    cmpu.lt.w	%r12, %r20, %r21
    cmpi.eq.w	%r12, %r20, 123456
    cmpsi.lt.w	%r12, %r20, 123456
    cmpui.lt.w	%r12, %r20, 123456
    cmp.ne.w	%r12, %r20, %r21
    cmpi.ne.w	%r12, %r20, 123456
    cmpsi.gt.w	%r12, %r20, 123456
    cmpui.gt.w	%r12, %r20, 123456
    cmps.le.w	%r12, %r20, %r21
    cmpu.le.w	%r12, %r20, %r21

    write	"compare aliases (pseudo-instructions)"
    cmps.gt.d	%r12, %r20, %r21	; cmplt   r12, %r21, r20
    cmpu.gt.d	%r12, %r20, %r21	; cmpltu  r12, %r21, r20
    cmpsi.lt.d	%r12, %r20, 123456	; cmplti  r12, %r20, 12346
    cmpui.lt.d	%r12, %r20, 123456	; cmpltui r12, %r20, 12346
    cmps.ge.d	%r12, %r20, %r21	; cmpleq  r12, %r21, r20
    cmpu.ge.d	%r12, %r20, %r21	; cmpleu  r12, %r21, r20
    cmpsi.gt.d	%r12, %r20, 123456	; cmpgti  r12, %r20, 12346
    cmpui.gt.d	%r12, %r20, 123456	; cmpgtui r12, %r20, 12346


    cmps.gt.w	%r12, %r20, %r21	; cmplt4   r12, %r21, %r20
    cmpu.gt.w	%r12, %r20, %r21	; cmpltu4  r12, %r21, %r20
    cmpsi.lt.w	%r12, %r20, 123456	; cmplti4  r12, %r20, 12346
    cmpui.lt.w	%r12, %r20, 123456	; cmpltui4 r12, %r20, 12346
    cmps.ge.w	%r12, %r20, %r21	; cmpleq4  r12, %r21, r20
    cmpu.ge.w	%r12, %r20, %r21	; cmpleu4  r12, %r21, r20
    cmpsi.gt.w	%r12, %r20, 123456	; cmpgti4  r12, %r20, 12346
    cmpui.gt.w	%r12, %r20, 123456	; cmpgtui4 r12, %r20, 12346

; TESTS
    cmp.eq.d	%r14, %r12, %r45
    cmp.ne.d	%r14, %r12, %r45

    cmp.eq.d	%r14, %r45, %r34
    cmpi.eq.d	%r14, %r45, 123
    cmpi.eq.d.l	%r14, %r45, 1234567890123
    cmpsi.lt.d	%r14, %r45, 123
    cmpsi.lt.d.l	%r14, %r45, 1234567890123
    cmpsi.le.d	%r14, %r45, 123
    cmpsi.le.d.l	%r14, %r45, 1234567890123
    cmps.lt.d	%r14, %r45, %r34
    cmpui.gt.d	%r14, %r45, 123
    cmpui.gt.d.l	%r14, %r45, 1234567890123
    cmpui.ge.d	%r14, %r45, 123
    cmpui.ge.d.l	%r14, %r45, 1234567890123
    cmpu.gt.d	%r14, %r45, %r34

    cmp.eq.d	%r41, %r34, %r56
    cmps.lt.d	%r66, %r45, %r57
    cmpi.eq.d	%r64, %r56, 0
.end
.text
backward_target:
    alloc	61
    addi	%r2, %r2, 1
    b.eq.d	%r2, %r2, branch_test_exit

    b.eq.d	%r23, %r34, backward_target
    b.eq.d.l	%r23, %r34, backward_target
    b.eq.d	%r23, %r34, forward_target
    b.eq.d.l	%r23, %r34, forward_target
    bi.eq.d	%r23,34, backward_target
    bi.eq.d.l	%r23,34, backward_target
    bi.eq.d	%r23,34, forward_target
    bi.eq.d.l	%r23,34, forward_target

    b.eq.w	%r23, %r34, backward_target
    b.eq.w.l	%r23, %r34, backward_target
    b.eq.w	%r23, %r34, forward_target
    b.eq.w.l	%r23, %r34, forward_target
    bi.eq.w	%r23,34, backward_target
    bi.eq.w.l	%r23,34, backward_target
    bi.eq.w	%r23,34, forward_target
    bi.eq.w.l	%r23,34, forward_target

    b.ne.d	%r23, %r34, backward_target
    b.ne.d.l	%r23, %r34, backward_target
    b.ne.d	%r23, %r34, forward_target
    b.ne.d.l	%r23, %r34, forward_target
    bi.ne.d	%r23,34, backward_target
    bi.ne.d.l	%r23,34, backward_target
    bi.ne.d	%r23,34, forward_target
    bi.ne.d.l	%r23,34, forward_target

    b.ne.w	%r23, %r34, backward_target
    b.ne.w.l	%r23, %r34, backward_target
    b.ne.w	%r23, %r34, forward_target
    b.ne.w.l	%r23, %r34, forward_target
    bi.ne.w	%r23,34, backward_target
    bi.ne.w.l	%r23,34, backward_target
    bi.ne.w	%r23,34, forward_target
    bi.ne.w.l	%r23,34, forward_target

    bs.le.d	%r23, %r34, backward_target
    bs.le.d.l	%r23, %r34, backward_target
    bs.le.d	%r23, %r34, forward_target
    bs.le.d.l	%r23, %r34, forward_target
    bsi.le.d	%r23,34, backward_target
    bsi.le.d.l	%r23,34, backward_target
    bsi.le.d	%r23,34, forward_target
    bsi.le.d.l	%r23,34, forward_target

    bs.le.w	%r23, %r34, backward_target
    bs.le.w.l	%r23, %r34, backward_target
    bs.le.w	%r23, %r34, forward_target
    bs.le.w.l	%r23, %r34, forward_target
    bsi.le.w	%r23,34, backward_target
    bsi.le.w.l	%r23,34, backward_target
    bsi.le.w	%r23,34, forward_target
    bsi.le.w.l	%r23,34, forward_target

    bs.lt.d	%r23, %r34, backward_target
    bs.lt.d.l	%r23, %r34, backward_target
    bs.lt.d	%r23, %r34, forward_target
    bs.lt.d.l	%r23, %r34, forward_target
    bsi.lt.d	%r23,34, backward_target
    bsi.lt.d.l	%r23,34, backward_target
    bsi.lt.d	%r23,34, forward_target
    bsi.lt.d.l	%r23,34, forward_target

    bs.lt.w	%r23, %r34, backward_target
    bs.lt.w.l	%r23, %r34, backward_target
    bs.lt.w	%r23, %r34, forward_target
    bs.lt.w.l	%r23, %r34, forward_target
    bsi.lt.w	%r23,34, backward_target
    bsi.lt.w.l	%r23,34, backward_target
    bsi.lt.w	%r23,34, forward_target
    bsi.lt.w.l	%r23,34, forward_target

    bs.ge.d	%r23, %r34, backward_target
    bs.ge.d.l	%r23, %r34, backward_target
    bs.ge.d	%r23, %r34, forward_target
    bs.ge.d.l	%r23, %r34, forward_target
    bui.ge.d	%r23,34, backward_target
    bui.ge.d.l	%r23,34, backward_target
    bui.ge.d	%r23,34, forward_target
    bui.ge.d.l	%r23,34, forward_target

    bs.ge.w	%r23, %r34, backward_target
    bs.ge.w.l	%r23, %r34, backward_target
    bs.ge.w	%r23, %r34, forward_target
    bs.ge.w.l	%r23, %r34, forward_target
    bui.ge.w	%r23,34, backward_target
    bui.ge.w.l	%r23,34, backward_target
    bui.ge.w	%r23,34, forward_target
    bui.ge.w.l	%r23,34, forward_target

    bs.gt.d	%r23, %r34, backward_target
    bs.gt.d.l	%r23, %r34, backward_target
    bs.gt.d	%r23, %r34, forward_target
    bs.gt.d.l	%r23, %r34, forward_target
    bsi.gt.d	%r23,34, backward_target
    bsi.gt.d.l	%r23,34, backward_target
    bsi.gt.d	%r23,34, forward_target
    bsi.gt.d.l	%r23,34, forward_target

    bs.gt.w	%r23, %r34, backward_target
    bs.gt.w.l	%r23, %r34, backward_target
    bs.gt.w	%r23, %r34, forward_target
    bs.gt.w.l	%r23, %r34, forward_target
    bsi.gt.w	%r23,34, backward_target
    bsi.gt.w.l	%r23,34, backward_target
    bsi.gt.w	%r23,34, forward_target
    bsi.gt.w.l	%r23,34, forward_target

    bu.le.d	%r23, %r34, backward_target
    bu.le.d.l	%r23, %r34, backward_target
    bu.le.d	%r23, %r34, forward_target
    bu.le.d.l	%r23, %r34, forward_target
    bui.le.d	%r23,34, backward_target
    bui.le.d.l	%r23,34, backward_target
    bui.le.d	%r23,34, forward_target
    bui.le.d.l	%r23,34, forward_target

    bu.le.w	%r23, %r34, backward_target
    bu.le.w.l	%r23, %r34, backward_target
    bu.le.w	%r23, %r34, forward_target
    bu.le.w.l	%r23, %r34, forward_target
    bui.le.w	%r23,34, backward_target
    bui.le.w.l	%r23,34, backward_target
    bui.le.w	%r23,34, forward_target
    bui.le.w.l	%r23,34, forward_target

    bu.lt.d	%r23, %r34, backward_target
    bu.lt.d.l	%r23, %r34, backward_target
    bu.lt.d	%r23, %r34, forward_target
    bu.lt.d.l	%r23, %r34, forward_target
    bui.lt.d	%r23,34, backward_target
    bui.lt.d.l	%r23,34, backward_target
    bui.lt.d	%r23,34, forward_target
    bui.lt.d.l	%r23,34, forward_target

    bu.lt.w	%r23, %r34, backward_target
    bu.lt.w.l	%r23, %r34, backward_target
    bu.lt.w	%r23, %r34, forward_target
    bu.lt.w.l	%r23, %r34, forward_target
    bui.lt.w	%r23,34, backward_target
    bui.lt.w.l	%r23,34, backward_target
    bui.lt.w	%r23,34, forward_target
    bui.lt.w.l	%r23,34, forward_target

    bu.ge.d	%r23, %r34, backward_target
    bu.ge.d.l	%r23, %r34, backward_target
    bu.ge.d	%r23, %r34, forward_target
    bu.ge.d.l	%r23, %r34, forward_target
    bui.ge.d	%r23,34, backward_target
    bui.ge.d.l	%r23,34, backward_target
    bui.ge.d	%r23,34, forward_target
    bui.ge.d.l	%r23,34, forward_target

    bu.ge.w	%r23, %r34, backward_target
    bu.ge.w.l	%r23, %r34, backward_target
    bu.ge.w	%r23, %r34, forward_target
    bu.ge.w.l	%r23, %r34, forward_target
    bui.ge.w	%r23,34, backward_target
    bui.ge.w.l	%r23,34, backward_target
    bui.ge.w	%r23,34, forward_target
    bui.ge.w.l	%r23,34, forward_target

    bu.gt.d	%r23, %r34, backward_target
    bu.gt.d.l	%r23, %r34, backward_target
    bu.gt.d	%r23, %r34, forward_target
    bu.gt.d.l	%r23, %r34, forward_target
    bui.gt.d	%r23, 34, backward_target
    bui.gt.d.l	%r23, 34, backward_target
    bui.gt.d	%r23, 34, forward_target
    bui.gt.d.l	%r23, 34, forward_target

    bu.gt.w	%r23, %r34, backward_target
    bu.gt.w.l	%r23, %r34, backward_target
    bu.gt.w	%r23, %r34, forward_target
    bu.gt.w.l	%r23, %r34, forward_target
    bui.gt.w	%r23, 34, backward_target
    bui.gt.w.l	%r23, 34, backward_target
    bui.gt.w	%r23, 34, forward_target
    bui.gt.w.l	%r23, 34, forward_target

    bm.all	%r23, 34, backward_target
    bm.all.l	%r23, 34, backward_target
    bm.all	%r23, 34, forward_target
    bm.all.l	%r23, 34, forward_target

    bm.notall	%r23, 34, backward_target
    bm.notall.l	%r23, 34, backward_target
    bm.notall	%r23, 34, forward_target
    bm.notall.l	%r23, 34, forward_target

    bm.any	%r23, 34, backward_target
    bm.any.l	%r23, 34, backward_target
    bm.any	%r23, 34, forward_target
    bm.any.l	%r23, 34, forward_target

    bm.none	%r23, 34, backward_target
    bm.none.l	%r23, 34, backward_target
    bm.none	%r23, 34, forward_target
    bm.none.l	%r23, 34, forward_target

forward_target:
branch_test_exit:

    jmp		branch_exit

label:
    b.eq.d	%r12, %r13, qwe
    srpi	%r10, %r11, %r12, 45
    dep.q	%r61, %r91, %r32, 10
    mbsel	%r62, %r91, %r32, %r10
    perm	%r63, %r91, %r32, %r10
qwe:
    b.ne.d	%r15, %r46, label
    b.eq.d	%r25, %r45, label
    bs.lt.d	%r25, %r44, label
    bs.le.d	%r35, %r43, label
    bu.gt.d	%r35, %r42, label
    bu.ge.d	%r45, %r41, label
    bs.gt.d	%r45, %r40, label
    bu.lt.d	%r55, %r76, label
    bi.ne.d	%r55, 140, label
    bi.eq.d	%r65, 141, label
    bsi.lt.d	%r65, 142, label
    bsi.gt.d	%r75, 143, label
    bui.lt.d	%r75, 170, label
    bui.gt.d	%r85, 160, label

    addi.l	%r45, %r34, 1234
    b.bsi	%r85, 26, label
    b.bci.l	%r85, 36, label
    b.bsi	%r95, 46, label
    b.bci.l	%r95, 56, label

    jmp.r	%r45, %r23, 1
branch_exit:
    write	"end branch test"
.end
.text
; Test bit-test-and-branch: b.bci/b.bsi (bit number as immediate) and
; b.bc/b.bs (bit number in a register).
    alloc	61
    write	"Example of test bit and branch"
    ldi		%r19, 0x20
    ldi		%r20, 12+3
    write	"%i64(r20)"
    ldi		%r10, 0
    b.bci	%r10, 10, xxx_n		; bit 10 of r10 is clear, so this branch should be taken
    ldi.l	%r20, 123456789012345	; load immediate
    ldi		%r21, 321		; load immediate
    add		%r23, %r20, %r21	; add
    write	"%i64(r23)"		; FIX: was %i64(r43) — the sum is in r23; r43 is never written in this program
xxx_n:	write	"%i64(r23)"

    ldi		%r46, 0xabcdef
    b.bci	%r46, 56, branch_bit_exit
    b.bsi	%r46, 56, branch_bit_exit
    ldi		%r56, 56
    b.bc	%r46, %r56, branch_bit_exit
    b.bs	%r46, %r56, branch_bit_exit

branch_bit_exit:
    write	"end branch_bit test"
.end
.text
; Enumerate cpuid leaves: leaf 0 is read first (presumably the count of
; implemented leaves — TODO confirm), then the loop dumps each leaf.
    write	"cpuid implemented number"
    alloc	96
    ldi		%r13, 0			; r13 = leaf index, start at 0
    cpuid	%r14, %r13, 0		; r14 = cpuid[0]
    write	"cpuid len %x64(r14)"
    write	"cpuid loop"
cpuid_loop:
    cpuid	%r15, %r13, 0		; r15 = cpuid[r13]
    write	"cpuid[%i64(r13)] = %x64(r15)"
    reps.lt.d	%r13, %r14, cpuid_loop	; repeat-step: advance r13, loop while r13 < r14
.end
.rodata
    align 16
crc32c_test_string:
    ascii	"The quick brown fox jumps over the lazy dog"
.text
; Compute CRC32C (Castagnoli) of the 43-byte test string, 16 bytes per
; iteration; the well-known reference value for this string is 0x22620404.
    write	"crc32c = 0x22620404 (expected)"
    alloc	20
    ldi		%r12, -1  ; crc32c accumulator, standard all-ones init
    ldi		%r15, 43 ; length
    mov		%r14, %r15		; r14 = remaining byte count
    ca.rf	%r11, crc32c_test_string	; r11 = address of the string
crc32c_loop:
    ld.q.mia	%r13, %r11, 16		; load 16 bytes, post-increment r11 by 16
    crc32c	%r12, %r12, %r13, %r14	; fold chunk into CRC; r14 presumably clamps the final partial chunk — TODO confirm
    addi	%r14, %r14, -16
    bs.gt.d	%r14, %gz, crc32c_loop	; loop while bytes remain (signed compare vs zero reg)
    xori	%r12, %r12, -1		; final inversion (xorout = all ones)
    write	"crc32c = 0x%x32(r12) (computed)"
.end
.text
; Addressing test: indexed compute-address forms, register-pair moves,
; and IP-relative access to bytes placed in .rodata.
    alloc	61
    ca.xd	%r41, %r40, %r12, 4, 52		; compute address: base + (index << scale) + disp
    ca.xd	%r41, %r40, %r12, 3, -12
    ca.xd	%r41, %r40, %r12, 4, 52
    ldi.l	%r5, -1
    mov2	%r3, %r4, %r4, %r3		; pair move (swaps r3/r4)
    mov2	%r3, %r4, %r4, %r3		; swap back


.rodata	; open text (read-only data) section
    align	16
text_lbl:	; this is label
    d1	111		; signed byte
    d1	112
    d1	113
ddd:
    align	4		; force 4-byte alignment for next data
    d1	6
    d1	7
    d1	8+0x3D	; you may use formulas!!!

.text
    write	"test addressing"

; Examples of IP-relative references.
    ldi		%r45, text_lo(text_lbl)
    write	"text_lo(text_lbl)=%i64(r45)"
    ldi		%r45, text_hi(text_lbl)
    write	"text_hi(text_lbl)=%i64(r45)"
    ldi		%r45, text_lbl
    write	"%i64(r45)"

; Example of access to text section.
; First get IP-relative reference to text section (+/- 64 MB from IP).
    ca.r	%r45, text_lbl

; Now in r45 we have base address.
; But it IS NOT true address of 'text_lbl'.
; We have in r45 nearest (to 'text_lbl') least address, aligned on 16-bytes boundary.
; Remember add 'text_lo' part of label address at each displacement calculation.
    ldz.b	%r50, %r45, text_lo(text_lbl)+0
    ldz.b	%r51, %r45, text_lo(text_lbl)+1
    ldz.b	%r52, %r45, text_lo(text_lbl)+2
    write	"%i64(r50)"	; must be 111
    write	"%i64(r51)"	; must be 112
    write	"%i64(r52)"	; must be 113

; Example of incorrect access to text section (without bundle alignment)
    ldz.b	%r50, %r45, 0
    write	"%i64(r50)" ; must be 101 - start of 16-byte portion  ; NOTE(review): this section only defines 111/112/113 — the "101" value presumably comes from preceding rodata bytes; confirm
.end
.text
; Section/data test: interleaves .text/.data/.rodata, exercises store/load
; round-trips through a .data buffer, and basic integer arithmetic.
    alloc	96
    addi	%r20, %gz, 128
    addi	%sp, %sp, -32		; carve 32 bytes of stack
    ldi.l	%r12, 0x07060504030201
    st.d	%r12, %sp,0

.data
    ascii	"data section marker"
    align	8
.rodata
    ascii	"rodata section marker"
    align	8

.data
    d2	1234
first_byte:
    d1	12
.text
    ca.rf	%r22, first_byte

; test interval time mask
    ldi		%r22, 0xFFFFFFFFFFFFFFFF	; NOTE(review): other 64-bit constants in this file use ldi.l — confirm short ldi accepts this value
    ldi		%r15, 11

.rodata	; open rodata (read-only data) section
    align	8
text_begin:	; this is label
    d8	1	; signed 8-bytes
    d8	-2
    d1	101	; signed byte
    d1	102
    d1	103
    align	4
    d4	10000	; signed 4byte
    d2	10000	; signed 2byte
    space	4		; insert zeroed bytes
    d2	20000
.data	; open data (read-write) section
    align	8
eexxx:	d8	12345678	; signed 8-byte
    d8	1234567890
ssxxx:	d8	123456789012
    d8	12345678901234
.rodata
    d4	4555		; signed 4-byte
    d2	4555		; signed 2-byte
    align	8
    d8	11
text2:
.text	; open code (read-execute) section

.data	; switch to data section
    d1	120
    align	2
    d2	13400
align 8
dataname:
    d4	654321890
    d4	654321890
    d8	1234545345345
    d8	6789023356977
align 8
someplaceindata:
    d8	0x0000000000000001
    d8	0x0000000000000002
    d8	0x0000000000000003
    d8	0x0000000000000004
    d8	0x0000000000000005
    d8	0x0000000000000006
    d8	0x0000000000000007
    d8	0x0000000000000008
.text
; Store 100000 into someplaceindata[3], load it back, and do arithmetic on it.
    ca.rf	%r11, someplaceindata
    ldi.l	%r15, 987777777777
    ldi		%r46, 100000
    st.d		%r46, %r11, 8*3
    ldz.d	%r46, %r11, 8*3
    write	"%i64(r46)"
    mul		%r18, %r15, %r46
    add		%r17, %r15, %r46
    andn	%r17, %r15, %r46	; r17 = r15 & ~r46
    cmps.lt.d	%r12, %r17, %r15
    write	"%i64(r15) %i64(r46) %i64(r17)"
    addi	%r17, %r17, 22
    write	"%i64(r17) %i64(r17)"
    mf.spr	%r27, %itc		; read interval time counter
    write	"itc: %x64(r27)"
    write	"%m(dump)"
.end
.text
; Dense call test: three back-to-back call.r calls with different link
; registers, each callee allocating its own small frame.
    ; at the beginning of the program, the register stack is empty
    alloc	54   ; expand frame to 54 registers
    ca.r	%r4, dense_call_test_end
    mt.spr	%r4, %eip		; set exception ip to the end label
    mt.spr	%r4, %reip
    ldi		%r47, 1  ; will be saved when called
    ldi		%r53, 3  ; first argument
    ldi		%r52, 2  ; second argument
    ldi		%r51, 1  ; third argument
    ; func procedure call, all registers up to 50 will be saved,
    ; return address, eip, frame size (50) are saved in r50
    ; NOTE(review): the calls below link through r48/r50/r52 — the comment
    ; above mentioning only r50 looks stale; confirm.
check_label:
    call.r	%r48, simple_func_1
    call.r	%r50, simple_func_2
    call.r	%r52, simple_func_3
    
    jmp	dense_call_test_end

simple_func_1:
    alloc  10
    write  "simple_func_1"
    ret

simple_func_2:
    alloc  10
    write  "simple_func_2"
    ret

simple_func_3:
    alloc  10
    write  "simple_func_3"
    ret

dense_call_test_end:
    nop	123
    nop	123
    nop	123
    nop	123
    nop	123
    nop	123
.end
.text
; Bit-field deposit test: dep inserts a field of r40 into r30 (offset 48,
; width 24 here), in both operand orders; dep.q is the 128-bit variant.
    write	"test bit-field insert (deposit)"
    alloc	96
    ldi.l	%r30, 0xaaaaaaaaaaaaaaaa
    ldi.l	%r40, 0xeeeeeeeeeeeeeeee
    dep		%r20, %r30, %r40, 48, 24
    write	"dep: %x64(r20)"
    dep		%r20, %r40, %r30, 48, 24
    write	"dep: %x64(r20)"

    write	"test vector deposit (dep16)"
    nor		%r3, %r4, %r4		; r3 = ~r4
    dep.q	%r5, %r3, %r4, 100
    write	"dep16: %x128(r5)"
    write	"end deposit test"
.end

.text
; Exercise memory-mapped device-control registers: read/write the DID and
; CMD registers, read the array address/length, then issue a command.
    write	"test control device memory-mapped registers"
    alloc	96

    ; device_control base address
    ldi.l	%r24, DEVICE_CONFIG_VIRT_BASE

    write	"test pci"

    ldi.l	%r21, 0x1234567890abcdef	; test pattern

    ldz.d	%r20, %r24, DEVICE_CONTROL_DID
    write	"mem[DEVICE_CONTROL_DID] %x64(r20)"
    st.d	%r21, %r24, DEVICE_CONTROL_DID	; write pattern, then read back
    ldz.d	%r20, %r24, DEVICE_CONTROL_DID
    write	"mem[DEVICE_CONTROL_DID] %x64(r20)"

    ldz.d	%r20, %r24, DEVICE_CONTROL_CMD
    write	"mem[DEVICE_CONTROL_CMD] %x64(r20)"
    st.d	%r21, %r24, DEVICE_CONTROL_CMD
    ldz.d	%r20, %r24, DEVICE_CONTROL_CMD
    write	"mem[DEVICE_CONTROL_CMD] %x64(r20)"

    ldz.d	%r20, %r24, DEVICE_CONTROL_ARRAY_ADDRESS
    write	"mem[DEVICE_CONTROL_ARRAY_ADDRESS] (r20)"

    ldz.d	%r20, %r24, DEVICE_CONTROL_ARRAY_LEN
    write	"mem[DEVICE_CONTROL_ARRAY_LEN] %i64(r20)"

    ldi	%r22, \n	; NOTE(review): '\n' as an ldi operand is unusual — presumably the assembler maps it to the newline character code; confirm

    write	"test command"
    ldi.l	%r21, 0xabcdef1234567890
    st.d	%r21, %r24, DEVICE_CONTROL_CMD	; store triggers the device command

    write	"end_device_control_test"
.end
.text
; Core/device mapping test: dump config constants, poke per-core timecmp,
; read ROM, then fill the whole video framebuffer byte-by-byte.
    write	"test core mapping DEVICE_CONFIG_VIRT_BASE"
    alloc	96
    ldi.l	%r20, DEVICE_CONFIG_VIRT_BASE
    write	"DEVICE_CONFIG_VIRT_BASE: %x64(r20)"
    ldi.l	%r20, DEVICE_CONFIG_SPACE_SIZE
    write	"DEVICE_CONFIG_SPACE_SIZE: %x64(r20)"
    ldi.l	%r20, CONFIG_OFFSET_CORE_0
    write	"CONFIG_OFFSET_CORE_0: %x64(r20)"
    ldi.l	%r20, DEVICE_CORE_TIMECMP
    write	"DEVICE_CORE_TIMECMP: %x64(r20)"

    ldi.l	%r20, DEVICE_CONFIG_VIRT_BASE + CONFIG_OFFSET_CORE_0 * DEVICE_CONFIG_SPACE_SIZE ; core config
    ldi		%r19, 0xabcdef

    write	"test interrupt vector %x64(r20)"
    st.d	%r19, %r20, DEVICE_CORE_TIMECMP ; use DEVICE_CORE_INTERRUPT_VECTOR in place of DEVICE_CORE_TIMECMP for real interrupt

    write	"test timecmp"
    st.d	%r19, %r20, DEVICE_CORE_TIMECMP

    write	"test rom mapping ROM_VIRT_BASE"
    ldi.l	%r20, ROM_VIRT_BASE
    ldz.d	%r19, %r20, 0
    write	"mem[ROM_VIRT_BASE] %x64(r19)"

    write	"test video commands VIDEO_COMMAND_VIRT_BASE"
    ldi.l	%r20, VIDEO_COMMAND_VIRT_BASE
    ldi		%r21, 0x1234
    st.w	%r21, %r20, 0x88	; clear
    st.w	%r21, %r20, 0x8c	; redraw

    write	"video width/height base: %x64(r20)"
    ldz.w	%r21, %r20, 0x80 ; width
    ldz.w	%r22, %r20, 0x84 ; height
    write	"width=%i64(r21) heigth=%i64(r22)"

    write	"test video memory VIDEO_VIRT_BASE"
    ldi.l	%r20, VIDEO_VIRT_BASE
    write	"r20     %x64(r20)"

    ldi.l	%r25, 0x12345678
    st.w	%r25, %r20, 0

; Nested fill loop: r24 = row (y), r23 = column (x); r20 advances one byte
; per store across the whole width*height framebuffer.
    ldi		%r24, 0   ; y
loop_y: (64)	; NOTE(review): "(64)" presumably an alignment/annotation on the label — confirm assembler semantics
;	write	"%i64(r24)"
    ldi	%r23, 0   ; x
loop_x:
;	add	%r25, %r23, %r24
    st.b	%r25, %r20, 0
    addi	%r20, %r20, 1
    addi	%r23, %r23, 1
    bs.lt.d	%r23, %r21, loop_x	; inner loop while x < width

    addi	%r24, %r24,1
    bs.lt.d	%r24, %r22, loop_y	; outer loop while y < height
    ; debug
    write	"end test video memory"
    nop		1234567
.end
.text
; Exception-handling test: registers two "destructor" cleanup frames via
; eh.adj, throws, and expects the unwinder to run destructor_2, then
; destructor_1, then land in 'catch'.
    write	"begin exception test"
    alloc	96

    ca.rf	%r2, catch
    mt.spr	%r2, %eip		; initial handler = catch

; constructor 1
    ldi		%r4, 1
    eh.adj	call_destructor_1	; push cleanup: destructor 1
    write	"eip: %s(eip)"
; constructor 2
    ldi		%r5, 2
    eh.adj	call_destructor_2	; push cleanup: destructor 2 (runs first on unwind)
    write	"eip: %s(eip)"

    ldi		%r3, 0xFFFFFFFFFFFF1230
    eh.throw	%r3, 0    ; set eca, jump to eip
    write	"normal execution (never occurs)"

call_destructor_2:
    write	"call_destructor_2"
    eh.catch	%r6, end_destructor_2
    ; here dtor called
    ldi		%r4, 0
end_destructor_2:
    eh.next	%r6, call_destructor_1	; continue unwinding to destructor 1
    write	"normal continue after destructor_2"

call_destructor_1:
    write	"call_destructor_1"
    eh.catch	%r6, end_destructor_1
    ; here dtor called
    ldi		%r5, 0
end_destructor_1:
    eh.next	%r6, catch		; continue unwinding to the final catch
    write	"normal continue after destructor_1"

call_ret:
    write	"normal exit"
    jmp		exception_exit

catch:
    write	"caught exception, exit"
    eh.catch	%r12, exception_exit	; r12 receives the thrown context (eca)
    write	"caught exception context: r12=%x64(r12)"
exception_exit:
    nop		1234567
    nop		7654321
.end
.text
; floating-point extension example
; Large FPU test program: float128 immediate loads, fpcr rounding modes,
; binary/fused/unary ops, rounding/conversion, min/max, merge, FP branch
; and nullification forms. Register values are deliberately reused across
; subsections, so statement order matters throughout.
    alloc	96

    write	"test float128 immediate load (low/high parts)"
    fldri.q	%r12, 3.1415926115461431423612436243
    write	"fldqri: %f128(r12)"

; Cycle fpcr rounding mode through 3,2,1,0 and dump it each time.
    write	"test fpcr modification (rm=3)"
    ldi		%r2, 3
    mt.spr	%r2, %fpcr
    write	"fpcr: %s(fpcr)"
    write	"test fpcr modification (rm=2)"
    ldi		%r2, 2
    mt.spr	%r2, %fpcr
    write	"fpcr: %s(fpcr)"
    write	"test fpcr modification (rm=1)"
    ldi		%r2, 1
    mt.spr	%r2, %fpcr
    write	"fpcr: %s(fpcr)"
    write	"test fpcr modification (rm=0)"
    ldi		%r2, 0
    mt.spr	%r2, %fpcr
    write	"fpcr: %s(fpcr)"

    write	"compare fldqri (full mantissa) & long fldi (63-bit mantissa)"
    fldri.q	%r30, 3.14159265358979323846123456789012e+400
    write	"fldqri: %x128(r30) %f128(r30)"
    fldi.d	%r31, 3.14159265358979323846123456789012
    write	"flddi: %x128(r31) %f64(r31)"
    write	"compare fldqri (full mantissa) & short fldi (21-bit mantissa)"
    fldri.q	%r30, 3.14159265358979323846123456789012
    write	"r30     %x128(r30)"
    fldi.d	%r31, 3.14159265358979323846123456789012
    write	"r31     %x128(r31)"
    write	"before1"
    write	"r30     %f128(r30)"
    write	"before2"
    write	"r31     %vf64(r31)"
    write	"after"
    fldi.d	%r30, -12.3456789e+04
.rodata
    align	16
float64data:
    double	1.234567890123456789124141241241
    double	3.1415925678888734535345231234564561
    double	3.4566345634563456346535463463456
.text
; Load three doubles, widen them to float128 (fext.sd.sq), then run the
; binary-op suite against e ≈ 2.71828 in r14.
    ca.r	%r21, float64data
    ldz.d	%r11, %r21, 8*0
    ldz.d	%r12, %r21, 8*1
    ldz.d	%r13, %r21, 8*2
    write	"ld8(f64): %f64(r11) %f64(r12) %f64(r13)"
    fldri.q	%r14, 2.7182818289201
    write	"fldqri: %f128(r14)"

    fext.sd.sq	%r11, %r11
    fext.sd.sq	%r12, %r12
    fext.sd.sq	%r13, %r13

    write	"test binary"
    fmul.sq	%r15, %r11, %r14
    write	"fmulsq:  %f128(r15)"
    fnmul.sq	%r15, %r11, %r14
    write	"fnmulsq: %f128(r15)"
    fadd.sq	%r15, %r11, %r14
    write	"faddsq:  %f128(r15)"
    fnadd.sq	%r15, %r11, %r14
    write	"fnaddsq: %f128(r15)"
    fsub.sq	%r15, %r14, %r11
    write	"fsubsq:  %f128(r15)"
    fdiv.sq	%r15, %r14, %r11
    write	"fdivsq:  %f128(r15)"

    write	"test fused fma"
;   jmp	skipfma
    fmadd.sq	%r15, %r14, %r11, %r12
    write	"fmaddsq:  %f128(r15)"
    fnmadd.sq %r15, %r14, %r11, %r12
    write	"fnmaddsq: %f128(r15)"
    fmsub.sq	%r15, %r14, %r11, %r12
    write	"fmsubsq:  %f128(r15)"
    fnmsub.sq %r15, %r14, %r11, %r12
    write	"fnmsubsq: %f128(r15)"

    write	"test unary"
    mov		%r16, %r15
    write	"r16     %f128(r16)"
    fabs.sq	%r16, %r15
    write	"r16     %f128(r16)"
    fneg.sq	%r16, %r15
    write	"r16     %f128(r16)"
    fnabs.sq	%r16, %r15
    write	"r16     %f128(r16)"
    fsqrt.sq	%r16, %r12
    write	"r16     %f128(r16)"
    frsqrt.sq	%r16, %r12
    write	"r16     %f128(r16)"

    write	"test rounding"
; frnd.sq last operand selects the rounding mode (4,3,2,0 exercised here).
    frnd.sq	%r17, %r12, 4
    write	"r17     %f128(r17)"
    frnd.sq	%r17, %r12, 3
    write	"r17     %f128(r17)"
    frnd.sq	%r17, %r12, 2
    write	"r17     %f128(r17)"
    frnd.sq	%r17, %r12, 0
    write	"r17     %f128(r17)"
    fcvt.sq.iw	%r17, %r12,0
    write	"r17     %i64(r17)"
    ldi		%r17, 123456
    fcvt.iw.sq	%r17, %r7,0	; NOTE(review): converts r7, not the 123456 just loaded into r17 — confirm intent
    write	"r17     %f128(r17)"

    write	"test fp minmax"
    fmax.sq	%r8, %r11, %r12
    write	"r8      %f128(r8)"
    fmin.sq	%r8, %r11, %r12
    write	"r8      %f128(r8)"
    write	"test fp abs minmax"
    famax.sq	%r8, %r11, %r12
    write	"r8      %f128(r8)"
    famin.sq	%r8, %r11, %r12
    write	"r8      %f128(r8)"

    write	"test fmergesq"
    fmerge.sq	%r8, %r11, %r12, %r14
    write	"r8      %f128(r8)"
    fmerge.sq	%r8, %r14, %r11, %r12
    write	"r8      %f128(r8)"


.rodata
    align	16
xxxd:	double	1.122
    double	0.9999765432
.text
; Iterated fmadd/fmsub loop driven by reps.ge.d (r15 counts down from 100).
    ca.r	%r21, xxxd
    ldi		%r15, 100
    ldz.d	%r25, %r21, 8*0
    ldz.d	%r26, %r21, 8*1
    fsub.sq	%r22, %r25, %r16	; NOTE(review): r16 holds the leftover frsqrt result from the unary tests — confirm intentional
    write	"r22     %f128(r22)"
xxloop:
    fmadd.sq	%r22, %r25, %r16, %r22
    fmsub.sq	%r22, %r25, %r16, %r22
    reps.ge.d	%r15, %gz, xxloop	; repeat-step: decrement r15, loop while >= 0
    write	"r22     %f128(r22)"

; Encoding/throughput exercises below: operands are arbitrary, results unused.
    write	"other FPU"
    fmadd.sq  %r60, %r61, %r62, %r63
    fmsub.sq  %r61, %r61, %r72, %r73
    fnmadd.sq %r62, %r71, %r82, %r63
    fnmsub.sq %r63, %r81, %r12, %r53

    fmul.sq	%r64, %r61, %r22
    fdiv.sq	%r65, %r11, %r27
    fadd.sq	%r66, %r17, %r42
    fsub.sq	%r67, %r31, %r23
    fnadd.sq	%r68, %r41, %r62
    fmax.sq	%r60, %r61, %r62
    fmin.sq	%r60, %r61, %r62
    famax.sq	%r60, %r61, %r62
    famin.sq	%r60, %r61, %r62

    fcmpo.lt.sq	%r10, %r61, %r72
    fcmpo.le.sq	%r11, %r52, %r21
    fcmpo.le.sq	%r12, %r43, %r12
    fcmpo.eq.sq	%r10, %r34, %r44
    fcmpu.eq.sq	%r13, %r25, %r22
    fcmpu.le.sq	%r12, %r15, %r23
    fcmpu.sq	%r11, %r86, %r86

    fneg.sq	%r24, %r58
    fabsd.sq	%r45, %r61, %r20
    fnabsd.sq	%r56, %r32, %r20
    frnd.sq	%r78, %r74,2
    frnd.sq	%r89, %r65,3
    frnd.sq	%r81, %r76,0
    frnd.sq	%r62, %r67,1
    fsqrt.sq	%r63, %r78
    frsqrt.sq %r64, %r69

; Plain and scaled-index load/store forms against a stack buffer.
    addi	%r45, %sp,-4800
    ldi		%r13, 2

    ldz.w	%r12, %r45, 4*1
    st.w		%r12, %r45, 4*1
    ldz.d	%r12, %r45, 8*3
    st.d		%r12, %r45, 8*3
    ldz.w.xd	%r12, %r45, %r13, 2, 60
    st.w.xd	%r12, %r45, %r13, 2, 60
    ldz.d.xd	%r12, %r45, %r13, 3, 60
    st.d.xd	%r12, %r45, %r13, 3, 60

    fadd.sq	%r23, %r24, %r25
    fmadd.sq	%r23, %r60, %r55, %r33
    fmul.sq	%r23, %r60, %r55
    ldz.d	%r60, %r45, 8*6
    fmadd.sq	%r23, %r60, %r55, %r33
    fmadd.sq	%r24, %r61, %r25, %r32
    fmadd.sq	%r25, %r62, %r55, %r23
    fmadd.sq	%r26, %r63, %r75, %r73
    fmadd.sq	%r27, %r64, %r75, %r73
    fmadd.sq	%r28, %r65, %r85, %r63
    fmadd.sq	%r29, %r66, %r85, %r63
    fmadd.sq	%r30, %r67, %r55, %r23
    fmadd.sq	%r31, %r68, %r55, %r23
    fmadd.sq	%r12, %r32, %r76, %r85
    fmadd.sq	%r12, %r32, %r76, %r85
    fmadd.sq	%r10, %r32, %r76, %r85
    fmadd.sq	%r10, %r32, %r76, %r85
    fmadd.sq	%r10, %r32, %r76, %r85
    fmadd.sq	%r13, %r32, %r76, %r85
    fmadd.sq	%r14, %r32, %r76, %r85
    fmadd.sq	%r15, %r32, %r76, %r85
    fmadd.sq	%r16, %r32, %r76, %r85
    fmadd.sq	%r17, %r32, %r76, %r85

    fcvt.sq.iw	%r56, %r45, 0
    fcvt.sq.uw	%r56, %r45, 0
    fcvt.iw.sq	%r45, %r56, 0
    fcvt.uw.sq	%r45, %r56, 0

; r5 = running sum, r6 = running product, r7 = 1/r6; repeat while r5 <= 128.
    ldi		%r5, 0
    fldri.q	%r4, 1.0
    fldri.q	%r5, 1.0
    fldri.q	%r6, 1.0
    fldri.q	%r7, 1.0
    ldi		%r24, 128
tri_repeat:
    write	"r7      %x128(r7)"
    fadd.sq	%r5, %r5, %r4
    fmul.sq	%r6, %r6, %r5
    fdiv.sq	%r7, %r4, %r6
;   write "%x128(r6)"
    reps.le.d.l %r5, %r24, tri_repeat

; Taylor-series shaped Horner chains. The coefficient registers
; (r25, r23, ... per the 1/25! comments) are not explicitly loaded here,
; so these chains exercise the fmadd/fmsub pipeline rather than compute
; accurate sin/cos/exp values.
    write	"test taylor series"
    fldri.q	%r2, 0.44567	; f2 ,  x
    write	"x:   %f128(r2)"		; test value
    write	"test sin(x)"
    fldri.q	%r5, sin(0.44567)
    write	"sin: %f128(r5)"		; test value
    ldi		%r3, 0		; s ,  0
    fmul.sq	%r4, %r2, %r2	; f4 ,  x*x
    fmadd.sq	%r3, %r3, %r4, %r25	; s ,  s * x*x + 1/25!
    fmsub.sq	%r3, %r3, %r4, %r23	; s ,  s * x*x - 1/23!
    fmadd.sq	%r3, %r3, %r4, %r21
    fmsub.sq	%r3, %r3, %r4, %r19
    fmadd.sq	%r3, %r3, %r4, %r17
    fmsub.sq	%r3, %r3, %r4, %r15
    fmadd.sq	%r3, %r3, %r4, %r13
    fmsub.sq	%r3, %r3, %r4, %r11
    fmadd.sq	%r3, %r3, %r4, %r9
    fmsub.sq	%r3, %r3, %r4, %r7
    fmadd.sq	%r3, %r3, %r4, %r5
    fmsub.sq	%r3, %r3, %r4, %r3
    fmadd.sq	%r3, %r3, %r4, %r1
    fmul.sq	%r3, %r3, %r2	; s ,  s * x
    write	"sin: %f128(r3)"

    write	"test cos(x)"
    fldri.q	%r5, cos(0.44567)
    write	"cos: %f128(r5)"		; test value
    ldi		%r3, 0		; s ,  0
    fmul.sq	%r4, %r2, %r2	; f4 ,  x*x
    fmsub.sq	%r3, %r3, %r4, %r26
    fmadd.sq	%r3, %r3, %r4, %r24
    fmsub.sq	%r3, %r3, %r4, %r22
    fmadd.sq	%r3, %r3, %r4, %r20
    fmsub.sq	%r3, %r3, %r4, %r18
    fmadd.sq	%r3, %r3, %r4, %r16
    fmsub.sq	%r3, %r3, %r4, %r14
    fmadd.sq	%r3, %r3, %r4, %r12
    fmsub.sq	%r3, %r3, %r4, %r10
    fmadd.sq	%r3, %r3, %r4, %r8
    fmsub.sq	%r3, %r3, %r4, %r6
    fmadd.sq	%r3, %r3, %r4, %r4
    fmsub.sq	%r3, %r3, %r4, %r2
    fmadd.sq	%r3, %r3, %r4, %r1
    write	"cos: %f128(r3)"

    write	"test exp(x)"
    fldri.q	%r5, exp(0.44567)
    write	"exp: %f128(r5)"	; test value
    ldi		%r3, 0			; s ,  0.0
    mov		%r4, %r2		; f4 ,  x
    fldi.d	%r6, 0.125
;   write	"%f128(r6)"
    fmul.sq	%r4, %r4, %r6	; x ,  x/8
    fmadd.sq	%r3, %r3, %r4, %r15
    fmadd.sq	%r3, %r3, %r4, %r14
    fmadd.sq	%r3, %r3, %r4, %r13
    fmadd.sq	%r3, %r3, %r4, %r12
    fmadd.sq	%r3, %r3, %r4, %r11
    fmadd.sq	%r3, %r3, %r4, %r10
    fmadd.sq	%r3, %r3, %r4, %r9
    fmadd.sq	%r3, %r3, %r4, %r8
    fmadd.sq	%r3, %r3, %r4, %r7
    fmadd.sq	%r3, %r3, %r4, %r6
    fmadd.sq	%r3, %r3, %r4, %r5
    fmadd.sq	%r3, %r3, %r4, %r4
    fmadd.sq	%r3, %r3, %r4, %r3
    fmadd.sq	%r3, %r3, %r4, %r2
    fmadd.sq	%r3, %r3, %r4, %r1
    fmadd.sq	%r3, %r3, %r4, %r1
    fmul.sq	%r3, %r3, %r3	; (e^x) ^ 8
    fmul.sq	%r3, %r3, %r3
    fmul.sq	%r3, %r3, %r3
    write	"exp: %f128(r3)"

    fadd.sq	%r1, %r2, %r3
    fmadd.sq	%r2, %r10, %r20, %r30
    fmadd.sq	%r1, %r11, %r21, %r31

    ; classification
    fcl.ss	%r4, %r5, 120
    fcl.sd	%r4, %r5, 120
    fcl.sq	%r4, %r5, 120
    jmp		skipfma		; skip the branch-encoding block below during normal run

; FP branch encoding tests: every ordered/unordered compare-and-branch form
; in short and long (.l) variants, backward and forward targets, for
; single (.ss), double (.sd) and quad (.sq) precision.
fpu_backward_target:
; single branches
    bfo.eq.ss	%r23, %r34, fpu_backward_target
    bfo.eq.ss.l	%r23, %r34, fpu_backward_target
    bfo.eq.ss	%r23, %r34, fpu_forward_target
    bfo.eq.ss.l	%r23, %r34, fpu_forward_target

    bfu.eq.ss	%r23, %r34, fpu_backward_target
    bfu.eq.ss.l	%r23, %r34, fpu_backward_target
    bfu.eq.ss	%r23, %r34, fpu_forward_target
    bfu.eq.ss.l	%r23, %r34, fpu_forward_target

    bfo.ne.ss	%r23, %r34, fpu_backward_target
    bfo.ne.ss.l	%r23, %r34, fpu_backward_target
    bfo.ne.ss	%r23, %r34, fpu_forward_target
    bfo.ne.ss.l	%r23, %r34, fpu_forward_target

    bfu.ne.ss	%r23, %r34, fpu_backward_target
    bfu.ne.ss.l	%r23, %r34, fpu_backward_target
    bfu.ne.ss	%r23, %r34, fpu_forward_target
    bfu.ne.ss.l	%r23, %r34, fpu_forward_target

    bfo.lt.ss	%r23, %r34, fpu_backward_target
    bfo.lt.ss.l	%r23, %r34, fpu_backward_target
    bfo.lt.ss	%r23, %r34, fpu_forward_target
    bfo.lt.ss.l	%r23, %r34, fpu_forward_target

    bfu.lt.ss	%r23, %r34, fpu_backward_target
    bfu.lt.ss.l	%r23, %r34, fpu_backward_target
    bfu.lt.ss	%r23, %r34, fpu_forward_target
    bfu.lt.ss.l	%r23, %r34, fpu_forward_target

    bfo.le.ss	%r23, %r34, fpu_backward_target
    bfo.le.ss.l	%r23, %r34, fpu_backward_target
    bfo.le.ss	%r23, %r34, fpu_forward_target
    bfo.le.ss.l	%r23, %r34, fpu_forward_target

    bfu.le.ss	%r23, %r34, fpu_backward_target
    bfu.le.ss.l	%r23, %r34, fpu_backward_target
    bfu.le.ss	%r23, %r34, fpu_forward_target
    bfu.le.ss.l	%r23, %r34, fpu_forward_target

    bfo.ss	%r23, %r34, fpu_backward_target
    bfo.ss.l	%r23, %r34, fpu_backward_target
    bfo.ss	%r23, %r34, fpu_forward_target
    bfo.ss.l	%r23, %r34, fpu_forward_target

    bfu.ss	%r23, %r34, fpu_backward_target
    bfu.ss.l	%r23, %r34, fpu_backward_target
    bfu.ss	%r23, %r34, fpu_forward_target
    bfu.ss.l	%r23, %r34, fpu_forward_target

    bf.class.ss	%r23, 34, fpu_backward_target
    bf.class.ss.l	%r23, 34, fpu_backward_target
    bf.class.ss	%r23, 34, fpu_forward_target
    bf.class.ss.l	%r23, 34, fpu_forward_target

; double branches
    bfo.eq.sd	%r23, %r34, fpu_backward_target
    bfo.eq.sd.l	%r23, %r34, fpu_backward_target
    bfo.eq.sd	%r23, %r34, fpu_forward_target
    bfo.eq.sd.l	%r23, %r34, fpu_forward_target

    bfu.eq.sd	%r23, %r34, fpu_backward_target
    bfu.eq.sd.l	%r23, %r34, fpu_backward_target
    bfu.eq.sd	%r23, %r34, fpu_forward_target
    bfu.eq.sd.l	%r23, %r34, fpu_forward_target

    bfo.ne.sd	%r23, %r34, fpu_backward_target
    bfo.ne.sd.l	%r23, %r34, fpu_backward_target
    bfo.ne.sd	%r23, %r34, fpu_forward_target
    bfo.ne.sd.l	%r23, %r34, fpu_forward_target

    bfu.ne.sd	%r23, %r34, fpu_backward_target
    bfu.ne.sd.l	%r23, %r34, fpu_backward_target
    bfu.ne.sd	%r23, %r34, fpu_forward_target
    bfu.ne.sd.l	%r23, %r34, fpu_forward_target

    bfo.lt.sd	%r23, %r34, fpu_backward_target
    bfo.lt.sd.l	%r23, %r34, fpu_backward_target
    bfo.lt.sd	%r23, %r34, fpu_forward_target
    bfo.lt.sd.l	%r23, %r34, fpu_forward_target

    bfu.lt.sd	%r23, %r34, fpu_backward_target
    bfu.lt.sd.l	%r23, %r34, fpu_backward_target
    bfu.lt.sd	%r23, %r34, fpu_forward_target
    bfu.lt.sd.l	%r23, %r34, fpu_forward_target

    bfo.le.sd	%r23, %r34, fpu_backward_target
    bfo.le.sd.l	%r23, %r34, fpu_backward_target
    bfo.le.sd	%r23, %r34, fpu_forward_target
    bfo.le.sd.l	%r23, %r34, fpu_forward_target

    bfu.le.sd	%r23, %r34, fpu_backward_target
    bfu.le.sd.l	%r23, %r34, fpu_backward_target
    bfu.le.sd	%r23, %r34, fpu_forward_target
    bfu.le.sd.l	%r23, %r34, fpu_forward_target

    bfo.sd	%r23, %r34, fpu_backward_target
    bfo.sd.l	%r23, %r34, fpu_backward_target
    bfo.sd	%r23, %r34, fpu_forward_target
    bfo.sd.l	%r23, %r34, fpu_forward_target

    bfu.sd	%r23, %r34, fpu_backward_target
    bfu.sd.l	%r23, %r34, fpu_backward_target
    bfu.sd	%r23, %r34, fpu_forward_target
    bfu.sd.l	%r23, %r34, fpu_forward_target

    bf.class.sd	%r23, 34, fpu_backward_target
    bf.class.sd.l	%r23, 34, fpu_backward_target
    bf.class.sd	%r23, 34, fpu_forward_target
    bf.class.sd.l	%r23, 34, fpu_forward_target

; quadruple branches
    bfo.eq.sq	%r23, %r34, fpu_backward_target
    bfo.eq.sq.l	%r23, %r34, fpu_backward_target
    bfo.eq.sq	%r23, %r34, fpu_forward_target
    bfo.eq.sq.l	%r23, %r34, fpu_forward_target

    bfu.eq.sq	%r23, %r34, fpu_backward_target
    bfu.eq.sq.l	%r23, %r34, fpu_backward_target
    bfu.eq.sq	%r23, %r34, fpu_forward_target
    bfu.eq.sq.l	%r23, %r34, fpu_forward_target

    bfo.ne.sq	%r23, %r34, fpu_backward_target
    bfo.ne.sq.l	%r23, %r34, fpu_backward_target
    bfo.ne.sq	%r23, %r34, fpu_forward_target
    bfo.ne.sq.l	%r23, %r34, fpu_forward_target

    bfu.ne.sq	%r23, %r34, fpu_backward_target
    bfu.ne.sq.l	%r23, %r34, fpu_backward_target
    bfu.ne.sq	%r23, %r34, fpu_forward_target
    bfu.ne.sq.l	%r23, %r34, fpu_forward_target

    bfo.lt.sq	%r23, %r34, fpu_backward_target
    bfo.lt.sq.l	%r23, %r34, fpu_backward_target
    bfo.lt.sq	%r23, %r34, fpu_forward_target
    bfo.lt.sq.l	%r23, %r34, fpu_forward_target

    bfu.lt.sq	%r23, %r34, fpu_backward_target
    bfu.lt.sq.l	%r23, %r34, fpu_backward_target
    bfu.lt.sq	%r23, %r34, fpu_forward_target
    bfu.lt.sq.l	%r23, %r34, fpu_forward_target

    bfo.le.sq	%r23, %r34, fpu_backward_target
    bfo.le.sq.l	%r23, %r34, fpu_backward_target
    bfo.le.sq	%r23, %r34, fpu_forward_target
    bfo.le.sq.l	%r23, %r34, fpu_forward_target

    bfu.le.sq	%r23, %r34, fpu_backward_target
    bfu.le.sq.l	%r23, %r34, fpu_backward_target
    bfu.le.sq	%r23, %r34, fpu_forward_target
    bfu.le.sq.l	%r23, %r34, fpu_forward_target

    bfo.sq	%r23, %r34, fpu_backward_target
    bfo.sq.l	%r23, %r34, fpu_backward_target
    bfo.sq	%r23, %r34, fpu_forward_target
    bfo.sq.l	%r23, %r34, fpu_forward_target

    bfu.sq	%r23, %r34, fpu_backward_target
    bfu.sq.l	%r23, %r34, fpu_backward_target
    bfu.sq	%r23, %r34, fpu_forward_target
    bfu.sq.l	%r23, %r34, fpu_forward_target

    bf.class.sq	%r23, 34, fpu_backward_target
    bf.class.sq.l	%r23, 34, fpu_backward_target
    bf.class.sq	%r23, 34, fpu_forward_target
    bf.class.sq.l	%r23, 34, fpu_forward_target

fpu_forward_target:

; FP compare-based nullification (predication) encodings.
    nulfu.ne.ss	%r23, %r34, 1, 1
    nulfu.ne.sd	%r23, %r34, 1, 1
    nulfu.ne.sq	%r23, %r34, 1, 1

    nulfo.ne.ss	%r23, %r34, 1, 1
    nulfo.ne.sd	%r23, %r34, 1, 1
    nulfo.ne.sq	%r23, %r34, 1, 1

    nulfu.eq.ss	%r23, %r34, 1, 1
    nulfu.eq.sd	%r23, %r34, 1, 1
    nulfu.eq.sq	%r23, %r34, 1, 1

    nulfo.eq.ss	%r23, %r34, 1, 1
    nulfo.eq.sd	%r23, %r34, 1, 1
    nulfo.eq.sq	%r23, %r34, 1, 1

    nulf.class.ss	%r23, 94, 1, 1
    nulf.class.sd	%r23, 94, 1, 1
    nulf.class.sq	%r23, 94, 1, 1
skipfma:
    write	"end fpu"
.end
.text
; Post-update (.mia) addressing test: each load/store accesses memory at the
; base register, then advances the base by the given immediate step.
    alloc	96
    write	"test base addressing with indexed post-update"
    ldi		%r12, 1
    addi	%r45, %sp, -512		; r45 = scratch buffer on the stack

; zero-extending loads, step = access size
    ldz.b.mia	%r23, %r45, 2
    ldz.h.mia	%r23, %r45, 2
    ldz.w.mia	%r23, %r45, 4
    ldz.d.mia	%r23, %r45, 8
    ld.q.mia	%r23, %r45, 16

; sign-extending loads
    lds.b.mia	%r23, %r45, 2
    lds.h.mia	%r23, %r45, 2
    lds.w.mia	%r23, %r45, 4
    lds.d.mia	%r23, %r45, 8

; stores
    st.b.mia	%r23, %r45, 2 
    st.h.mia	%r23, %r45, 2
    st.w.mia	%r23, %r45, 4
    st.d.mia	%r23, %r45, 8
    st.q.mia	%r23, %r45, 16
    write	"end_indexed_modify_test"
.end
.rodata
; Aligned 1/2/4/8-byte values in .rodata and .data used as IP-relative
; load/store targets below.
rodata1:
    d1	123
    align	2
rodata2:
    d2	12345
    align	4
rodata4:
    d4	123456789
    align	8
rodata8:
    d8	1234567890123456789

.data
data1:
    d1	123
    align	2
data2:
    d2	12345
    align	4
data4:
    d4	123456789
    align	8
data8:
    d8	1234567890123456789

.text
; .r suffix = IP-relative addressing; exercise zero- and sign-extending
; loads from both sections, and stores to the writable .data labels.
    alloc	96

    write "test ip-relative data addressing"
    ldz.b.r	%r34, rodata1
    ldz.h.r	%r34, rodata2
    ldz.w.r	%r34, rodata4
    ldz.d.r	%r34, rodata8

    lds.b.r	%r34, rodata1
    lds.h.r	%r34, rodata2
    lds.w.r	%r34, rodata4
    lds.d.r	%r34, rodata8

    ldz.b.r	%r34, data1
    ldz.h.r	%r34, data2
    ldz.w.r	%r34, data4
    ldz.d.r	%r34, data8

    lds.b.r	%r34, data1
    lds.h.r	%r34, data2
    lds.w.r	%r34, data4
    lds.d.r	%r34, data8

    st.b.r	%r34, data1
    st.h.r	%r34, data2
    st.w.r	%r34, data4
    st.d.r	%r34, data8

    write	"end ip-relative data test"
.end
.text
; ca.rf computes the full (IP-relative, forward) address of a data label;
; the target label here is deliberately empty — only the address matters.
    alloc	96
    write	"test ca.rf"
    ca.rf	%r22, ca_rf_data
    write	"ca.rf: %x64(r22)"

    write	"end_ca_rf_test"
.data
ca_rf_data:

.end
.text
; mbsel = bitwise select: result = (r3 ^ r4) & r5 ^ r4, i.e. take bits of
; r3 where mask r5 is set, bits of r4 elsewhere. The first ldi.l computes
; the expected value at assembly time for visual comparison.
    alloc	96
    write	"check mbsel instruction"
    ldi.l	%r6, ((0x3333333333333333 ^ 0x5555555555555555) & 0xff00ff00ff00ff00) ^ 0x5555555555555555
    write	"mbsel: %x64(r6)"
    ldi.l	%r3, 0x3333333333333333
    ldi.l	%r4, 0x5555555555555555
    ldi.l	%r5, 0xff00ff00ff00ff00
    mbsel	%r6, %r3, %r4, %r5
    write	"mbsel: %x64(r6)"		; must match the assembly-time value above

    write	"end_mbsel_test"
.end
.text
; Special-register test, two phases:
;   1) print every special register via the write "%s(name)" formatter;
;   2) read each register into %r12 with mf.spr and print the raw value.
; Both phases walk the same register list, so the outputs can be compared.
    alloc	61
    write	"\ntest write: special register"
    write	"ip      %s(ip)"
    write	"eip     %s(eip)"
    write	"eca     %s(eca)"
    write	"fpcr    %s(fpcr)"
    write	"rsc     %s(rsc)"
    write	"rsp     %s(rsp)"
    write	"bsp     %s(bsp)"
    write	"peb     %s(peb)"
    write	"teb     %s(teb)"
    write	"itc     %s(itc)"
    write	"itm     %s(itm)"
    write	"psr     %s(psr)"
    write	"pta     %s(pta)"
    write	"iva     %s(iva)"
    write	"kip     %s(kip)"
    write	"ksp     %s(ksp)"
    write	"krsp    %s(krsp)"
    write	"iip     %s(iip)"
    write	"iipa    %s(iipa)"
    write	"ipsr    %s(ipsr)"
    write	"cause   %s(cause)"
    write	"ifa     %s(ifa)"
    write	"iib     %s(iib)"
    write	"tpr     %s(tpr)"
    write	"lid     %s(lid)"
    write	"irr0    %s(irr0)"
    write	"irr1    %s(irr1)"
    write	"irr2    %s(irr2)"
    write	"irr3    %s(irr3)"
    write	"isr0    %s(isr0)"
    write	"isr1    %s(isr1)"
    write	"isr2    %s(isr2)"
    write	"isr3    %s(isr3)"
    write	"tsv     %s(tsv)"
    write	"cmcv    %s(cmcv)"
    write	"pmv     %s(pmv)"

; Phase 2: mf.spr moves a special register into a general register.
; Note: only some of the writes below carry a name prefix in the format
; string; the unprefixed ones print the bare hex value.
    write	"\ntest mfspr: read special register"

    mf.spr	%r12, %ip
    write	"ip      %x64(r12)"

    mf.spr	%r12, %eip
    write	"eip     %x64(r12)"

    mf.spr	%r12, %eca
    write	"%x64(r12)"

    mf.spr	%r12, %fpcr
    write	"%x64(r12)"

    mf.spr	%r12, %rsc
    write	"%x64(r12)"

    mf.spr	%r12, %rsp
    write	"%x64(r12)"

    mf.spr	%r12, %bsp
    write	"%x64(r12)"

    mf.spr	%r12, %peb
    write	"%x64(r12)"

    mf.spr	%r12, %teb
    write	"%x64(r12)"

    mf.spr	%r12, %itc
    write	"%x64(r12)"

    mf.spr	%r12, %itm
    write	"%x64(r12)"

    mf.spr	%r12, %psr
    write	"%x64(r12)"

    mf.spr	%r12, %pta
    write	"%x64(r12)"

    mf.spr	%r12, %iva
    write	"%x64(r12)"

    mf.spr	%r12, %kip
    write	"%x64(r12)"

    mf.spr	%r12, %ksp
    write	"%x64(r12)"

    mf.spr	%r12, %krsp
    write	"krsp    %x64(r12)"

    mf.spr	%r12, %iip
    write	"iip     %x64(r12)"

    mf.spr	%r12, %iipa
    write	"iipa    %x64(r12)"

    mf.spr	%r12, %ipsr
    write	"ipsr    %x64(r12)"

    mf.spr	%r12, %cause
    write	"cause   %x64(r12)"

    write	"%s(ifa)"
    mf.spr	%r12, %ifa
    write	"ifa     %x64(r12)"

; iib is printed as 128-bit — the only 128-bit special register here
    mf.spr	%r12, %iib
    write	"iib     %x128(r12)"

    mf.spr	%r12, %tpr
    write	"tpr     %x64(r12)"

    mf.spr	%r12, %lid
    write	"lid     %x64(r12)"

    mf.spr	%r12, %irr0
    write	"irr0    %x64(r12)"

    mf.spr	%r12, %irr1
    write	"irr1    %x64(r12)"

    mf.spr	%r12, %irr2
    write	"irr2    %x64(r12)"

    mf.spr	%r12, %irr3
    write	"irr3    %x64(r12)"

    mf.spr	%r12, %isr0
    write	"%x64(r12)"

    mf.spr	%r12, %isr1
    write	"%x64(r12)"

    mf.spr	%r12, %isr2
    write	"%x64(r12)"

    mf.spr	%r12, %isr3
    write	"%x64(r12)"

    mf.spr	%r12, %tsv
    write	"%x64(r12)"

    mf.spr	%r12, %cmcv
    write	"%x64(r12)"

    mf.spr	%r12, %pmv
    write	"%x64(r12)"

    write	"end test mfspr"
.end
.text
; Test min/max instructions: signed/unsigned register forms, then the
; immediate forms with the constant 2671. Results are not printed;
; this unit only checks that the encodings assemble and execute.
    alloc	69
    write	"test min/max"
    mins	%r34, %r56, %r67
    minu	%r34, %r56, %r67
    maxs	%r34, %r56, %r67
    maxu	%r34, %r56, %r67

    minsi	%r34, %r56, 2671
    minui	%r34, %r56, 2671
    maxsi	%r34, %r56, 2671
    maxui	%r34, %r56, 2671
    write	"test minmax end"

.end

.text
; Nullification test. nul.* compares its operands and conditionally
; nullifies two groups of following instructions; the two trailing
; numeric (or named-predicate) arguments give the sizes of the
; "if" and "else" instruction groups.
    write	"test nullification (explicit masks)"
    alloc	96
    ldi		%r10, 0
; r10 == r10, so per the inline notes the first 5 writes are nullified
; and the 4 "else" writes execute
    nul.eq.d	%r10, %r10, 5, 4
    write	"0" ; nullified
    write	"1" ; nullified
    write	"2" ; nullified
    write	"3" ; nullified
    write	"4" ; nullified
    write	"5" ; else
    write	"6" ; else
    write	"7" ; else
    write	"8" ; else

; Same shape, but the group boundaries are marked with predicate names
; instead of instruction counts; (equal)/(nonequal) tag the group ends.
    write	"test nullification (predicate names)"
    ldi		%r10, 0
    nul.eq.d	%r10, %r10, equal, nonequal
    write	"0"
    write	"1"
    write	"2"
    write	"3"
    write	"4" (equal)
    write	"5"
    write	"6"
    write	"7"
    write	"8" (nonequal)


; Arithmetic under nullification: 4-instruction "if" group, 3-instruction
; "else" group operating on the same accumulator r10.
    write	"test nullification"
    ldi		%r10, 0
    nul.eq.d	%r10, %r10, 4, 3
    addi	%r10, %r10, 2
    addi	%r10, %r10, 2
    addi	%r10, %r10, 2
    addi	%r10, %r10, 1
    addi	%r10, %r10, 1
    addi	%r10, %r10, 1
    addi	%r10, %r10, 1

; Same test with named predicates true/false marking the group ends.
    write	"test nullification"
    ldi		%r10, 0
    nul.eq.d	%r10, %r10, true, false
    addi	%r10, %r10, 2
    addi	%r10, %r10, 2
    addi	%r10, %r10, 2
    addi	%r10, %r10, 2
    addi	%r10, %r10, 1 (true)
    addi	%r10, %r10, 1
    addi	%r10, %r10, 1 (false)

; Nullified groups containing write directives that dump psr/registers.
    nop	0
    nop	0
    nul.eq.d	%r12, %r10, 4, 3
    write	"branch1: psr=%s(psr)"
    write	"branch1: %i64(r10)"
    write	"branch1: %i64(r10)"
    write	"branch1: %i64(r10)"
    write	"branch2: psr=%s(psr)"
    write	"branch2: %i64(r20)"
    write	"branch2: %i64(r20)"


; Encoding coverage: register, immediate, and long-immediate compare
; forms, with binary group masks.
    nul.eq.d	%r23, %r45, 0b1100, 0b0101
    nuls.lt.d	%r23, %r45, 0b1100, 0b0101
    nulu.lt.d	%r23, %r45, 0b1100, 0b0101

    nuli.eq.d	%r23, 45, 0b1100, 0b0101
    nulsi.lt.d	%r23, -45, 0b1100, 0b0101
    nului.lt.d	%r23, 45, 0b1100, 0b0101

    nuli.eq.d.l   %r23, 45000000000, 0b1100, 0b0101
    nulsi.lt.d.l  %r23, -45000000000, 0b1100, 0b0101
    nului.lt.d.l  %r23, 45000000000, 0b1100, 0b0101

; bit-set variants (register and immediate bit number)
    nul.bs	%r23, %r45, 0b1100, 0b0101
    nul.bsi	%r23, 45, 0b1100, 0b0101
    nop	1
    nop	2
    nop	3
    nop	4
    nop	5
    nop	6
    nop	7

; Both predicate names may tag the same instruction (both groups end there).
    nul.eq.d	%r10, %r10, same_equal, same_nonequal
    write	"0e"
    write	"1e"
    write	"2e" (same_equal, same_nonequal)

    nul.ne.d	%r10, %r10, same_equal2, same_nonequal2
    write	"0ne"
    write	"1ne"
    write	"2ne" (same_equal2, same_nonequal2)

; A nul instruction that is itself inside a predicated region.
    nul.eq.d	%r10, %r10, no_if_true, no_if_false (no_if_true)
    write	"else" (no_if_false)

    write	"end_nullification_test"
.end
.text
; Performance-monitoring counter test: read each PMC_* counter with
; mf.mr and print it; finally write one counter with mt.mr and read it
; back to confirm the move-to path.
    alloc	21
    ldi		%r12, PMC_LAST
    write	"PMC_LAST = %i64(r12)"
; don't report runtime in unittests, this is non-reproducible
    mf.mr	%r14, %gz, PMC_RUNTIME
;   write	"PMC_RUNTIME = %i64(r14)"
    mf.mr	%r14, %gz, PMC_SHORT_INSTRUCTION
    write	"PMC_SHORT_INSTRUCTION = %i64(r14)"
    mf.mr	%r14, %gz, PMC_LONG_INSTRUCTION
    write	"PMC_LONG_INSTRUCTION = %i64(r14)"
    mf.mr	%r14, %gz, PMC_SHADOWED_INSTRUCTION
    write	"PMC_SHADOWED_INSTRUCTION = %i64(r14)"
    mf.mr	%r14, %gz, PMC_NOP_INSTRUCTION
    write	"PMC_NOP_INSTRUCTION = %i64(r14)"
    mf.mr	%r14, %gz, PMC_QUALIFIED_NOP_INSTRUCTION
    write	"PMC_QUALIFIED_NOP_INSTRUCTION = %i64(r14)"
    mf.mr	%r14, %gz, PMC_REGISTER_SPILL
    write	"PMC_REGISTER_SPILL = %i64(r14)"
    mf.mr	%r14, %gz, PMC_REGISTER_FILL
    write	"PMC_REGISTER_FILL = %i64(r14)"
    mf.mr	%r14, %gz, PMC_ICACHE_HIT
    write	"PMC_ICACHE_HIT = %i64(r14)"
    mf.mr	%r14, %gz, PMC_ICACHE_MISS
    write	"PMC_ICACHE_MISS = %i64(r14)"
    mf.mr	%r14, %gz, PMC_DCACHE_HIT
    write	"PMC_DCACHE_HIT = %i64(r14)"
    mf.mr	%r14, %gz, PMC_DCACHE_MISS
    write	"PMC_DCACHE_MISS = %i64(r14)"
    mf.mr	%r14, %gz, PMC_INSTRUCTION_TRANSLATION_HIT
    write	"PMC_INSTRUCTION_TRANSLATION_HIT = %i64(r14)"
    mf.mr	%r14, %gz, PMC_INSTRUCTION_TRANSLATION_MISS
    write	"PMC_INSTRUCTION_TRANSLATION_MISS = %i64(r14)"
    mf.mr	%r14, %gz, PMC_DATA_TRANSLATION_HIT
    write	"PMC_DATA_TRANSLATION_HIT = %i64(r14)"
    mf.mr	%r14, %gz, PMC_DATA_TRANSLATION_MISS
    write	"PMC_DATA_TRANSLATION_MISS = %i64(r14)"
    mf.mr	%r14, %gz, PMC_BACKSTORE_TRANSLATION_HIT
    write	"PMC_BACKSTORE_TRANSLATION_HIT = %i64(r14)"
    mf.mr	%r14, %gz, PMC_BACKSTORE_TRANSLATION_MISS
    write	"PMC_BACKSTORE_TRANSLATION_MISS = %i64(r14)"
; write a monitor register, then read it back
    mt.mr	%r14, %gz, PMC_SHORT_INSTRUCTION
    mf.mr	%r15, %gz, PMC_SHORT_INSTRUCTION
    write	"old pm reg = %i64(r15)"
.end
.text
; Simple test program
; 20! factorial compute
.text
    alloc	61
; warm-up loop: reps.le.d repeats while r15 (stepped from -100) <= 0
    ldi		%r15, -100
loop_stop_sard:
    srdi	%r13, %r15, 5
    reps.le.d	%r15, %gz, loop_stop_sard

; performance test - long loop
; for(i = 1000000; i>0; i--) DoSome();

    ldi		%r20, 2500000
    ldi		%r15, 20 ; maximum factorial number
    ldi		%r21, 5
; (64) after the label is a loop-alignment annotation
loop_stop: (64)
    addi	%r13, %r13, 5
    sub		%r14, %r14, %r55
    cmps.lt.d	%r24, %r14, %r14
    addi	%r13, %r13, 4
    sub		%r14, %r14, %r55
    cmps.lt.d	%r22, %r14, %r14
    addi	%r13, %r13, 33
    srpi	%r14, %r14, %r55, 13
    sub		%r14, %r13, %r21
    srai	%r14, %r14, 7
    reps.gt.d	%r20, %gz, loop_stop
; print loop counter after loop (must be 0)
    write	"%i64(r20) factorials"
; factorial loop: r13 = r13 * r14 for r14 = 1 .. r15 (20)
    ldi		%r13, 1
    ldi		%r14, 1
start:
    mul		%r13, %r13, %r14
    write	"factorial: %u64(r13)"
    reps.le.d	%r14, %r15, start

    write	"%i64(r14) %i64(r13)"
.end
.text
; Strided/counted loop examples using reps.* (repeat) and bi.* (branch
; on immediate compare) instructions, plus a few special-register pokes.
    alloc	96
    write	"Example of strided loop instructions"
; fast_check
    ldi		%r12, 10000	; load loop number (10)
stride_loop_start:
;	write	"%i64(r12)"
    cmp.eq.d	%r4, %r12, %r12
    add		%r14, %r14, %r46
    reps.gt.d	%r12, %gz, stride_loop_start

    write	"counter=%i64(r12)"

; Second example of strided loop.
; fast_check
    ldi		%r12, 10000	; load loop number (10)
    ldi		%r14, 10000	; load loop number (10)
stride_loop_start2:
;   write	"%i64(r12)"
    cmp.eq.d	%r4, %r12, %r12
    addi	%r14, %r14, -2
    reps.gt.d	%r12, %gz, stride_loop_start2

    write	"%i64(r12) %i64(r14)"

;*****************************************************************
; 3x inner loop example
;*****************************************************************
; r10/r11/r12 are three nested counters seeded from r33 (=80);
; the back-edges (jmp ccloop) are commented out, so the body runs once.
    ldi		%r3, 0
    ldi		%r20, 0
    ldi		%r33, 80
    mov		%r10, %r33
    mov		%r11, %r33
    mov		%r12, %r33
ccloop:
;   write	"%i64(r12)"
    addi	%r20, %r20, 1
    addi	%r12, %r12, -1
    cmps.lt.d	%r2, %r3, %r12
;   jmp	ccloop
;   write	"%i64(r11)"
    addi	%r11, %r11, -1
    cmps.lt.d	%r4, %r3, %r11
    mov		%r12, %r33
;   jmp		ccloop
;   write	"%i64(r10)"
    addi	%r10, %r10, -1
    cmps.lt.d	%r6, %r3, %r10
    mov		%r11, %r33
    mov		%r12, %r33
;   jmp		ccloop

    write	"%i64(r20)"

; for(i=0; i<100; i++)

    ldi	%r8, 0
start1:
;   write	"%i64(r8)"
    addi	%r8, %r8,1
    cmpsi.lt.d	%r7, %r8,128
    bi.ne.d	%r7,0,start1

; for(i=100; i>0; i--)
    ldi		%r8, 100
start2:
    write	"%i64(r8)"
    addi	%r8, %r8,-1		; current error
    cmps.lt.d	%r2, %r3, %r8
    bi.ne.d	%r2, 0, start2

    write	"r3      %x64(r3)"
;	mtspr	%r3, %rsc


; for(i=100; i>0; i--) write "%x64(i)"
    ldi		%r10, 100
qqq:	cmps.lt.d	%r2, %r3, %r10
    write	"r10     %x64(r10)"
    addi	%r10, %r10, -1
;   jmp		qqq
sss:

    andi.l	%r55, %r55,0x000FFFFF00003F0F
    mt.spr	%r12, %ifa
; test some special regs
    ldi.l	%r9, 0x123456789
;   mt.spr	%r9, psr
    write	"ip: %s(ip) psr: %s(psr)"
;   mt.spr	%r3, psr
    ldi		%r55, 120
    mt.spr	%r55, %tpr
    write	"fpcr    %s(fpcr)"
    write	"psr     %s(psr)"

    write	"test long loop"
; test simple loop
; fast_check
    ldi		%r13, 350000 ; 35
    ldi		%r14, 350000 ; 35
    ldi		%r15, 88
    write	"%i64(r14)"
; (128) after the label is a loop-alignment annotation
repeat_loop_start: (128)
;	write	"%i64(r12)"
    addi	%r13, %r13, 3
    add		%r13, %r13, %r15
    srpi	%r13, %r13, %r15, 8

    addi	%r13, %r13, 4
    add		%r13, %r13, %r15
    srpi	%r13, %r13, %r15, 7

    addi	%r13, %r13, 5
    add		%r13, %r13, %r15
    srpi	%r13, %r13, %r15, 6

    addi	%r13, %r13, 6
    add		%r13, %r13, %r15
    srpi	%r13, %r13, %r15, 5

    sub		%r13, %r13, %r15
    sl.add	%r13, %r13, %r15, 5
    sl.add	%r13, %r13, %r15, 5

    xor		%r13, %r14, %r15
    sll		%r13, %r13, %r13
    reps.gt.d	%r14, %gz, repeat_loop_start

    write	"%i64(r13) %i64(r14)"

    write	"end test long loop"
.end
.text
; Test the random instruction: two draws with %gz (no seed), then one
; draw after seeding with r4 = 1.
    write	"test random"
    alloc	96

    random	%r3, %gz
    write	"random: %x64(r3)"
    random	%r3, %gz
    write	"random: %x64(r3)"
    ldi		%r4, 1
    random	%r3, %r4
    write	"random seed: %x64(r3)"

    write	"end_random_test"
.end
.text
; test simple long loop
; 1,000,000 iterations of shift/add churn, then encoding coverage for
; the remaining reps/repu variants (skipped at runtime by the jmp).
    alloc	61
    ldi		%r13, 1000000
    mov		%r14, %r13
    write	"loop limit: %i64(r14)"
    ldi		%r15, 88
; (128) after the label is a loop-alignment annotation
repeat_long_loop_start: (128)
    addi	%r13, %r13, 3
    add		%r13, %r13, %r15
    srpi	%r13, %r13, %r15, 8
    addi	%r13, %r13, 4
    add		%r13, %r13, %r15
    srpi	%r13, %r13, %r15, 7
    addi	%r13, %r13, 5
    add		%r13, %r13, %r15
    srpi	%r13, %r13, %r15, 6
    addi	%r13, %r13, 6
    add		%r13, %r13, %r15
    srpi	%r13, %r13, %r15, 5
    add		%r30, %r31, %r14
    sub		%r31, %r30, %r15
    slli	%r40, %r40, 12
    ca.xd	%r41, %r40, %r12, 3, -12
    ca.xd	%r41, %r40, %r12, 4, 62
    reps.gt.d	%r14, %gz, repeat_long_loop_start
    jmp		repeat_exit

; assemble-only: never reached because of the jmp above
    reps.le.d	%r56, %r60, repeat_long_loop_start
    reps.ge.d	%r56, %r60, repeat_long_loop_start
    repu.le.d	%r56, %r20, repeat_long_loop_start
    repu.ge.d	%r56, %r20, repeat_long_loop_start

    reps.le.d.l	%r56, %r60, repeat_long_loop_start
    reps.ge.d.l	%r56, %r60, repeat_long_loop_start
    repu.le.d.l	%r56, %r20, repeat_long_loop_start
    repu.ge.d.l	%r56, %r20, repeat_long_loop_start

repeat_exit:
    write	"end loop repeat test"
.end
.text
; Here we test instructions for partial rotate register by fixed bitcount.
; srpi with both sources equal acts as a rotate; a left rotate by n and a
; right rotate by 64-n should print the same value.
    alloc	90
    write	"initial values"
    ldi.l	%r50, 0x1234567890ABCDEF
    write	"%x64(r50)"
    write	"rotate left"
    srpi	%r51, %r50, %r50, 40-1
    write	"%x64(r51)"
    write	"rotate right"
    srpi	%r51, %r50, %r50, 64-40-1	; same as previous
    write	"%x64(r51)"
    write	"rotate left immediate"
    srpi	%r51, %r50, %r50, 64-40-1
    write	"%x64(r51)"
    write	"rotate right immediate"
    srpi	%r51, %r50, %r50, 40-1	; same as previous "rD+1-rC"
    write	"%x64(r51)"

; Here we test instructions for shift and mask register by fixed bitcount.
    write	"shift signed|unsigned by immediate 12 bit"
    ldi.l	%r50, 0xfedcba0123456789
    write	"%x64(r50)"
    srai	%r51, %r50, 12
    write	"%x64(r51)"
    srli	%r51, %r50, 12
    write	"%x64(r51)"
    slli	%r51, %r50, 12
    write	"%x64(r51)"
    slli	%r51, %r50, 12
    write	"%x64(r51)"

;	jmp	ddd
; variable shift-pair (slp) with the count in a register
    ldi		%r10, 16
    slp	%r51, %r50, %r50, %r10
    write	"%x64(r51)"

; shift-left-then-shift-right immediates (slsrli/slsrai)
    ldi.l	%r40, 0x1234567890abcdef
    ldi.l	%r50, 0xfedcba0987654321
    slsrli	%r41, %r40, 8, 40
    write	"%x64(r41)"
    slsrai	%r41, %r40, 11, 40
    write	"%x64(r41)"

    write	"test srpi"
    ldi.l	%r40, 0x1234123412341234
    ldi.l	%r50, 0x5678567856785678
    srpi	%r41, %r40, %r50, 39
    write	"%x64(r41)"
    srpi	%r41, %r50, %r40, 23
    write	"%x64(r41)"
    srpi	%r41, %r40, %r40, 24
    write	"%x64(r41)"

; 128-bit shift-right-pair: r2 = all zeroes, r3 = all ones, so the
; boundary between the two halves is visible in the output.
    write	"test vector shift right pair (srpi16) instruction"
    xor		%r2, %r2, %r2	; all zeroes
    nor		%r3, %r2, %r2	; all ones
    write	"r2      %x128(r2)"
    write	"r3      %x128(r3)"
    srpi.q	%r4, %r2, %r3, 60
    write	"r4      %x128(r4)"
    srpi.q	%r4, %r3, %r2, 60
    write	"r4      %x128(r4)"
    srpi.q	%r4, %r2, %r3, 100
    write	"r4      %x128(r4)"
    srpi.q	%r4, %r3, %r2, 100
    write	"r4      %x128(r4)"

; SHIFTS - encoding coverage, results not printed
    sll		%r42, %r33, %r34
    sll		%r42, %r33, %r34
    sra		%r52, %r73, %r44
    srl		%r62, %r73, %r44
    slp		%r72, %r17, %r17, %r24
    srp		%r82, %r16, %r16, %r15
    srpi	%r72, %r15, %r24, 32
    dep		%r10, %r14, %r85, 32, 30

    slli	%r12, %r67, 13
    slli	%r13, %r57, 13
    srai	%r14, %r48, 14
    srli	%r15, %r38, 14
    srpi	%r16, %r39, %r13, 13
    srpi	%r17, %r29, %r13, 64-13


    write	"test packed bitwise logical"
    and		%r10, %r71, %r13
    andn	%r21, %r81, %r22
    or		%r32, %r71, %r32
    orn		%r43, %r61, %r43
    nand	%r54, %r51, %r54
    nor		%r65, %r41, %r64
    xnor	%r76, %r31, %r73
    xor		%r87, %r21, %r83


    ldi		%r20, 65
    write	"r20     %c(r20)"   ; should be 'A'

; dep.c: deposit-clear a bitfield (here bits 0..22 of r22)
    ldi		%r3, 0
    ldi.l	%r22, 0x12345FFFFFFFFFFF
    write	"%x64(r22)"
    dep.c	%r23, %r22, 0, 23
    write	"%x64(r23)"

    ldi.l	%r22, 0x1234567890ABCDEF
    ldi.l	%r23, 0xFEDCBA9876543210
    srpi	%r22, %r22, %r23, 24
    write	"%x64(r22)"

; negate via subfi 0-x, then complement, then xor back
    ldi.l	%r24, 0x4321F00000000
    write	"%x64(r24)"
    subfi	%r25, %r24, 0
    write	"%x64(r25)"
    not		%r25, %r25
    write	"%x64(r25)"
    xor		%r25, %r25, %r24
    write	"%x64(r25)"

; Example of absd.
    ldi		%r12, -10000
    absd	%r12, %r12, %gz
    write	"r12: %i64(r12)"
.end
.text
; Floating-point test unit.
; NOTE(review): the jmp below targets endfpsimd at the very end of this
; unit, so everything in between is assembled but skipped at runtime —
; confirm whether this whole section is intentionally disabled.
    jmp		endfpsimd
; SSE double (SSE2) — packed-double encoding coverage
    fmadd.pd	%r16, %r71, %r69, %r13
    fmsub.pd	%r15, %r78, %r58, %r23
    fnmadd.pd	%r14, %r67, %r47, %r13
    fnmsub.pd	%r13, %r86, %r36, %r16
    fmadda.pd	%r82, %r52, %r69, %r63
    fmsuba.pd	%r50, %r91, %r69, %r63
    fadd.pd	%r12, %r86, %r25
    fnadd.pd	%r11, %r82, %r19
    fsub.pd	%r10, %r63, %r28
    faddc.pd	%r81, %r61, %r37
    fsubc.pd	%r82, %r81, %r46
    faddh.pd	%r83, %r81, %r55
    fsubh.pd	%r84, %r71, %r64
    fmul.pd	%r81, %r71, %r11
    fmulh.pd	%r60, %r11, %r22
    fdot.pd	%r85, %r81, %r13
    fmin.pd	%r86, %r84, %r14
    fmax.pd	%r87, %r61, %r15
    famin.pd	%r30, %r52, %r16
    famax.pd	%r61, %r51, %r17

; ordered packed-double compares
    fcmpo.eq.pd	%r80, %r81, %r63
    fcmpo.ne.pd	%r11, %r81, %r32
    fcmpo.lt.pd	%r15, %r81, %r32
    fcmpo.lt.pd	%r60, %r81, %r82
    fcmpo.ne.pd	%r62, %r72, %r83
    fcmpo.le.pd	%r62, %r72, %r62

; pack, sign manipulation, rounding (immediate selects the mode),
; divide and (reciprocal) square root
    fpk.pd	%r60, %r61, %r62
    fneg.pd	%r61, %r51
    fabsd.pd	%r61, %r51, %r3
    fnabsd.pd	%r61, %r61, %r3
    frnd.pd	%r60, %r77,3
    frnd.pd	%r62, %r61,2
    frnd.pd	%r62, %r71,0
    frnd.pd	%r83, %r67,1
    fdiv.pd	%r83, %r67, %r20
    fsqrt.pd	%r68, %r81
    frsqrt.pd	%r68, %r81


; quadruple floating-point extension example
.rodata
    align	16
a:	quad	1.234567890123456789124141241241
b:	quad	3.1415925678888734535345231234564561
c:	quad	3.4566345634563456346535463463456
.text
; load the three quad constants a/b/c via a base address + offsets
    ca.r	%r21, a
    ld.q		%r3, %r21,0*16
    ld.q		%r1, %r21,1*16
    ld.q		%r2, %r21,2*16
    write	"%vf64(r3)"
    write	"%vf64(r1)"
    write	"%vf64(r2)"

    write	"test binary\0"
    fmul.sd	%r3, %r1, %r2
    write	"%vf64(r3)"
    fnmul.sd	%r3, %r1, %r2
    write	"%vf64(r3)"
    fadd.sd	%r4, %r1, %r2
    write	"%vf64(r4)"
    fnadd.sd	%r4, %r1, %r2
    write	"%vf64(r4)"
    fsub.sd	%r4, %r2, %r1
    write	"%vf64(r4)"
    fdiv.sd	%r4, %r2, %r1
    write	"%vf64(r4)"

    write	"test fused fma\0"
    fmadd.sd	%r5, %r4, %r1, %r2
    write	"%vf64(r5)"
    fnmadd.sd	%r5, %r4, %r1, %r2
    write	"%vf64(r5)"
    fmsub.sd	%r5, %r4, %r1, %r2
    write	"%vf64(r5)"
    fnmsub.sd	%r5, %r4, %r1, %r2
    write	"%vf64(r5)"

    write	"test unary\0"
    mov		%r6, %r5
    write	"%vf64(r6)"
    fabs.sd	%r6, %r5
    write	"%vf64(r6)"
    fneg.sd	%r6, %r5
    write	"%vf64(r6)"
    fnabs.sd	%r6, %r5
    write	"%vf64(r6)"
    fsqrt.sd	%r6, %r2
    write	"%vf64(r6)"
    frsqrt.sd	%r6, %r2
    write	"%vf64(r6)"

; rounding with each mode immediate, then float<->int conversions
    write	"test rounding\0"
    frnd.sd	%r7, %r2,4
    write	"%vf64(r7)"
    frnd.sd	%r7, %r2,2
    write	"%vf64(r7)"
    frnd.sd	%r7, %r2,1
    write	"%vf64(r7)"
    frnd.sd	%r7, %r2,0
    write	"%vf64(r7)"
    fcvt.sd.iw	%r7, %r2,0
    write	"r7=%i64(r7)"
    ldi		%r7, 123456
    fcvt.iw.sd	%r7, %r7,0
    write	"%vf64(r7)"

    write	"test minmax, abs minmax"
    fmax.sd	%r8, %r1, %r2
    write	"%vf64(r8)"
    fmin.sd	%r8, %r1, %r2
    write	"%vf64(r8)"
    famax.sd	%r8, %r1, %r2
    write	"%vf64(r8)"
    famin.sd	%r8, %r1, %r2
    write	"%vf64(r8)"

    write	"test fmergesq\0"

.rodata
    align	16
xxxq:	quad	1.122
    quad	0.9999765432
.text
; fma/fms loop over the constants at label a (note: r15 is loaded with
; the loop count, then immediately overwritten by the ld.q below)
    ca.r	%r21, a
; fast_check
    ldi		%r15, 100000 ; 10
    ld.q	%r15, %r21, 0*16
    ld.q	%r16, %r21, 1*16
    fsub.sd	%r22, %r15, %r16
    write	"%vf64(r22)"
yyloop:
    fmadd.sd	%r22, %r15, %r16, %r22
    fmsub.sd	%r22, %r15, %r16, %r22
    reps.ge.d	%r15, %gz, yyloop
    write	"%vf64(r22)"


; quad-precision constants: a mix of literal decimals, constant
; expressions, and raw 128-bit bit patterns given as d8 pairs
.rodata
    align	16
    quad	1.189731495357231765085759326628007e+4932
qqqq:   quad	1.23456789 + 32.0
    quad	0.2345678901234567890123456789012345678 + 0.2
    quad	2*asin(1)
    quad	255
dbl1:	double	acos(sin(3.1415926)) ;-1.2345678e+200
    double	444.689679
float1:	float	0.123456789123456789e+30
    float	2.123456789122233
    float	0.0
    float	1.0
octquad:
    quad	0.25
f32:	d4	0x3fff1234
.text
    ca.r	%r45, qqqq
    ca.r	%r46, dbl1
    ca.r	%r47, float1
    write	"r45     %x64(r45)"
    ld.q		%r63, %r45,0
    write	"%vf64(r63) %x128(r63)"
    ld.q		%r63, %r45,0
    write	"%vf64(r63) %x128(r63)"
    fmul.sq	%r62, %r63, %r63
    write	"%vf64(r62)"
    ldz.w	%r60, %r47,0
    write	"%vf64(r60)"
    ldz.d	%r59, %r46,0
    ldz.w	%r58, %r47,4
    ldz.w	%r57, %r47,8
    write	"%vf64(r57)"
    write	"%vf64(r58)"
    write	"%vf64(r59)"
    ld.q	%r53, %r45,1*16
    write	"%vf64(r53)"
    ld.q	%r50, %r45,2*16
    write	"%vf64(r50)"
    ld.q	%r49, %r45,3*16
    write	"%vf64(r49) %x128(r49)"
    ldz.w	%r48, %r47,3*4
    write	"%vf64(r48)"
    fneg.sq	%r46, %r48
    write	"%vf64(r46)"
    fmadd.sq	%r40, %r52, %r52, %r53
    write	"%m(dump)"

; quad test vectors: decimal quads plus raw (mantissa, sign/exponent)
; d8 pairs for special encodings
.rodata
    align	16
__yyy:
    quad	0.5
    quad	1.0
    quad	2.25
    quad	22252.22424
    quad	-22252.22424
    quad	34.125
    quad	2.0 / 72.0
    d8	0xffffffffffffffff
    d8	0x3ffe
    d8	0xffffffffffffffff
    d8	0x3ff0
    d8	0x8000000000000000
    d8	0xbff3
    d8	0x8000000000000000
    d8	0xc003
    quad	-1.234567890123456789012345e+6
    d8	0x8000000000000000
    d8	0x3fe0
.text
; print each 16-byte entry both as a float and as its raw bit pattern
    ca.r	%r12, __yyy
    ld.q	%r23, %r12, 0
    write	"%vf64(r23) %x128(r23)"
    ld.q	%r23, %r12, 1*16
    write	"%vf64(r23) %x128(r23)"
    ld.q	%r23, %r12, 2*16
    write	"%vf64(r23) %x128(r23)"
    ld.q	%r23, %r12, 3*16
    write	"%vf64(r23) %x128(r23)"
    ld.q	%r23, %r12, 4*16
    write	"%vf64(r23) %x128(r23)"
    ld.q	%r23, %r12, 5*16
    write	"%vf64(r23) %x128(r23)"
    ld.q	%r23, %r12, 6*16
    write	"%vf64(r23) %x128(r23)"
    ld.q	%r27, %r12, 7*16
    write	"%vf64(r27) %x128(r27)"
    ld.q	%r27, %r12, 8*16
    write	"%vf64(r27) %x128(r27)"
    ld.q	%r27, %r12, 9*16
    write	"%vf64(r27) %x128(r27)"
    ld.q	%r27, %r12, 10*16
    write	"%vf64(r27) %x128(r27)"
;   flddi	%r24, 8.5899345919999999995e+09 ;-1.234567890123456789012345e+6
;   write	"%vf64(r24) %x128(f24)"
;   flddi	%r24, 0.125 ; 4.656612873077392578125e-10 ; 4.656612873077392578125e-10
;   write	"%vf64(r24) %x128(f24)"
    ld.q	%r25, %r12, 11*16
    write	"%vf64(r25) %x128(r25)"
    ld.q	%r25, %r12, 12*16
    write	"%vf64(r25) %x128(r25)"
; load a quad literal directly from the instruction stream
    fldri.q	%r40, 4.345678912345678901234567890123456789012345678
    write	"%vf64(r40)"


; scalar-double fma encoding coverage (results unused)
    fmadd.sd	%r23, %r60, %r55, %r33
    fmadd.sd	%r24, %r61, %r25, %r32
    fmadd.sd	%r25, %r62, %r55, %r23
    fmadd.sd	%r26, %r63, %r75, %r73
    fmadd.sd	%r27, %r64, %r75, %r73
    fmadd.sd	%r28, %r65, %r85, %r63
    fmadd.sd	%r29, %r66, %r85, %r63
    fmadd.sd	%r30, %r67, %r95, %r23
    fmadd.sd	%r31, %r68, %r95, %r23
    fmadd.sd	%r10, %r21, %r26, %r27
    fmadd.sd	%r13, %r21, %r26, %r27
    fmadd.sd	%r10, %r21, %r26, %r27
    fmadd.sd	%r12, %r21, %r26, %r27
    fmadd.sd	%r11, %r21, %r26, %r27
    fmadd.sd	%r13, %r21, %r26, %r27
    fmadd.sd	%r14, %r21, %r26, %r27
    fmadd.sd	%r15, %r21, %r26, %r27
    fmadd.sd	%r16, %r21, %r26, %r27
    fmadd.sd	%r17, %r21, %r26, %r27

; 128-bit stores to consecutive stack slots
    st.q	%r16, %sp,16*2
    st.q	%r17, %sp,16*3
    st.q	%r18, %sp,16*4
    st.q	%r19, %sp,16*5
    st.q	%r20, %sp,16*6
    st.q	%r21, %sp,16*7
    st.q	%r22, %sp,16*8
    st.q	%r23, %sp,16*9
    st.q	%r24, %sp,16*10
    st.q	%r25, %sp,16*11
    st.q	%r26, %sp,16*12
    st.q	%r27, %sp,16*13
    st.q	%r28, %sp,16*14
    st.q	%r29, %sp,16*15
    st.q	%r30, %sp,16*16
    st.q	%r31, %sp,16*17


; SSE single — packed-single encoding coverage
    fmadd.ps	%r58, %r61, %r92, %r63
    fmsub.ps	%r82, %r52, %r92, %r63
    fnmadd.ps	%r82, %r52, %r69, %r63
    fnmsub.ps	%r50, %r91, %r69, %r63
    fmadda.ps	%r82, %r52, %r69, %r63
    fmsuba.ps	%r50, %r91, %r69, %r63
    fadd.ps	%r61, %r94, %r69
    fnadd.ps	%r68, %r54, %r72
    fsub.ps	%r68, %r61, %r82
    faddc.ps	%r81, %r71, %r82
    fsubc.ps	%r82, %r71, %r82
    faddh.ps	%r62, %r61, %r82
    fsubh.ps	%r62, %r61, %r62
    fmul.ps	%r62, %r51, %r62
    fmulh.ps	%r63, %r51, %r62
    fdot.ps	%r83, %r51, %r62
    fmin.ps	%r83, %r61, %r62
    fmax.ps	%r63, %r71, %r62
    famin.ps	%r64, %r71, %r82
    famax.ps	%r64, %r71, %r82

; ordered and unordered packed-single compares
    fcmpo.ne.ps	%r65, %r61, %r62
    fcmpo.lt.ps	%r74, %r61, %r62
    fcmpo.le.ps	%r83, %r61, %r62
    fcmpu.le.ps	%r72, %r61, %r62
    fcmpu.le.ps	%r11, %r61, %r62
    fcmpu.ps	%r20, %r61, %r62

    fpk.ps	%r33, %r64, %r62
    fneg.ps	%r60, %r69
    fabsd.ps	%r61, %r68, %r3
    fnabsd.ps	%r62, %r67, %r3
    frnd.ps	%r63, %r66,0
    frnd.ps	%r64, %r65,2
    frnd.ps	%r65, %r64,1
    frnd.ps	%r66, %r63,0
    fdiv.ps	%r67, %r62, %r20
    fsqrt.ps	%r68, %r61
    frsqrt.ps	%r69, %r60

    fadd.ps	%r24, %r61, %r60
    fmul.pd	%r47, %r60, %r46

endfpsimd:

.end
.text
.rodata
; four 128-bit test vectors for the packed-integer (SIMD) tests
    align	16
mmxdata:
    d8	0x123456759eabcd7f
    d8	0x123456789cabcdef

    d8	0xf87f5432afebcdf3
    d8	0xffffffffffffffff

    d8	0x1234567890abcdef
    d8	0x1234567890abcdef

    d8	0x1234567890abcdef
    d8	0x1234567890abcdef
.text
    alloc	90
; r4 serves first as the base address, then is overwritten by the last load
    ca.r	%r4, mmxdata
    ld.q	%r1, %r4,0*16
    ld.q	%r2, %r4,1*16
    ld.q	%r3, %r4,2*16
    ld.q	%r4, %r4,3*16
    write	"r1      %x128(r1)"
    write	"r2      %x128(r2)"

; the same register printed at each element width
    write	"%vu8(r1)"
    write	"%vu16(r1)"
    write	"%vu32(r1)"
    write	"%vu64(r1)"

; packed add with wraparound (vaddu) vs carry-out (vaddc) vs
; signed-overflow flags (vaddo), byte elements
    vaddu.b	%r3, %r1, %r2
    write	"test vadd/vaddc (1 byte)\0"
    vaddc.b	%r4, %r1, %r2
    write	"%vu8(r1)"
    write	"%vu16(r2)"
    write	"%vu32(r3)"
    write	"%vu64(r4)"
    write	"test vadd/vaddo signed (1 byte)\0"
    vaddo.b	%r4, %r1, %r2
    write	"%vi8(r1)"
    write	"%vi16(r2)"
    write	"%vi32(r3)"
    write	"%vu64(r4)"

; packed subtract: wraparound (vsubu), borrow-out (vsubb), overflow (vsubo)
    vsubu.b	%r3, %r1, %r2
    write	"test vsub/vsubb (1 byte)\0"
    vsubb.b	%r4, %r1, %r2
    write	"%vu8(r1)"
    write	"%vu8(r2)"
    write	"%vu8(r3)"
    write	"%vu8(r4)"
    write	"test vsub/vsubo signed (1 byte)\0"
    vsubo.b	%r4, %r1, %r2
    write	"%vi8(r1)"
    write	"%vi8(r2)"
    write	"%vi8(r3)"
    write	"%vu8(r4)"

; saturating adds/subtracts, unsigned then signed
    write	"test vaddusb"
    vaddu.b	%r3, %r1, %r2
    vaddus.b	%r4, %r1, %r2
    write	"%vu8(r1)\n%vu8(r2)\n%vu8(r3)\n%vu8(r4)"

    write	"test vsubusb"
    vsubu.b	%r3, %r1, %r2
    vsubus.b	%r4, %r1, %r2
    write	"%vu8(r1):\n%vu8(r2)\n%vu8(r3)\n%vu8(r4)"

    write	"test vaddssb"
    vaddu.b	%r3, %r1, %r2
    vaddss.b	%r4, %r1, %r2
    write	"%vi8(r1)\n%vi8(r2)\n%vi8(r3)\n%vi8(r4)"

    write	"test vsubssb"
    vsubu.b	%r3, %r1, %r2
    vsubss.b	%r4, %r1, %r2
    write	"%vi8(r1)\n%vi8(r2)\n%vi8(r3)\n%vi8(r4)"

; packed average, min/max, merge — byte elements, results printed
    write	"test pavgu (1 byte)\0"
    vavgu.b	%r3, %r1, %r2
    write	"%vu8(r1)\n%vu8(r2)\n%vu8(r3)"

    write	"test pavgs (1 byte)\0"
    vavgs.b	%r3, %r1, %r2
    write	"%vi8(r1)\n%vi8(r2)\n%vi8(r3)"

    write	"test vminu (1 byte)\0"
    vminu.b	%r3, %r1, %r2
    write	"%vu8(r1)\n%vu8(r2)\n%vu8(r3)"

    write	"test vmins (1 byte)\0"
    vmins.b	%r3, %r1, %r2
    write	"%vi8(r1)\n%vi8(r2)\n%vi8(r3)"

    write	"test vmaxu (1 byte)\0"
    vmaxu.b	%r3, %r1, %r2
    write	"%vu8(r1)\n%vu8(r2)\n%vu8(r3)"

    write	"test vmaxs (1 byte)\0"
    vmaxs.b	%r3, %r1, %r2
    write	"%vi8(r1)\n%vi8(r2)\n%vi8(r3)"

    write	"test merge low (1 byte)\0"
    vmrg.l.b	%r3, %r1, %r2
    write	"%vu8(r1)\n%vu8(r2)\n%vu8(r3)"

    write	"test merge high (1 byte)\0"
    vmrg.h.b	%r3, %r1, %r2
    write	"%vu8(r1)\n%vu8(r2)\n%vu8(r3)"

; pack-with-saturation variants for half/word/double elements
    vpkuus.h	%r2, %r3, %r4
    vpksus.h	%r2, %r3, %r4
    vpksss.h	%r2, %r3, %r4

    vpkuus.w	%r2, %r3, %r4
    vpksus.w	%r2, %r3, %r4
    vpksss.w	%r2, %r3, %r4

    vpkuus.d	%r2, %r3, %r4
    vpksus.d	%r2, %r3, %r4
    vpksss.d	%r2, %r3, %r4

;	jmp	endmmx
; d1 abs
; encoding coverage for every element width (b/h/w/d); results unused
    vmins.b	%r12, %r61, %r55
    vmins.h	%r18, %r61, %r45
    vmins.w	%r27, %r61, %r35
    vmins.d	%r36, %r61, %r25

    vminu.b	%r14, %r61, %r15
    vminu.h	%r15, %r62, %r75
    vminu.w	%r17, %r63, %r85
    vminu.d	%r16, %r64, %r75

    vmaxs.b	%r26, %r71, %r85
    vmaxs.h	%r26, %r61, %r54
    vmaxs.w	%r16, %r51, %r35
    vmaxs.d	%r16, %r41, %r55

    vmaxu.b	%r11, %r61, %r53
    vmaxu.h	%r12, %r55, %r55
    vmaxu.w	%r16, %r46, %r56
    vmaxu.d	%r13, %r31, %r55

    vrol.b	%r56, %r61, %r15
    vrol.h	%r31, %r61, %r25
    vrol.w	%r53, %r61, %r30
    vrol.d	%r62, %r61, %r41

    vror.b	%r16, %r11, %r52
    vror.h	%r11, %r21, %r63
    vror.w	%r71, %r31, %r74
    vror.d	%r81, %r41, %r85

    vsll.b	%r16, %r51, %r86
    vsll.h	%r24, %r61, %r55
    vsll.w	%r69, %r71, %r55
    vsll.d	%r77, %r81, %r55

    vsrl.b	%r21, %r81, %r50
    vsrl.h	%r12, %r63, %r51
    vsrl.w	%r13, %r62, %r52
    vsrl.d	%r64, %r63, %r53

    vsra.b	%r85, %r64, %r54
    vsra.h	%r76, %r65, %r15
    vsra.w	%r67, %r66, %r25
    vsra.d	%r58, %r67, %r36

    vavgs.b	%r49, %r68, %r47
    vavgs.h	%r30, %r69, %r58
    vavgs.w	%r26, %r11, %r69
    vavgs.d	%r16, %r21, %r75

    vavgu.b	%r14, %r31, %r85
    vavgu.h	%r15, %r41, %r45
    vavgu.w	%r56, %r51, %r25
    vavgu.d	%r87, %r61, %r15

    vaddss.b	%r42, %r71, %r15
    vaddss.h	%r83, %r81, %r45
    vaddss.w	%r74, %r41, %r85
    vaddss.d	%r65, %r61, %r75

    vaddu.b	%r56, %r61, %r75
    vaddu.h	%r47, %r61, %r65
    vaddu.w	%r38, %r61, %r55
    vaddu.d	%r29, %r61, %r55

    vaddus.b	%r55, %r61, %r45
    vaddus.h	%r65, %r61, %r35
    vaddus.w	%r74, %r61, %r25
    vaddus.d	%r84, %r61, %r15

    vaddc.b	%r53, %r61, %r55
    vaddc.h	%r13, %r61, %r55
    vaddc.w	%r12, %r61, %r55
    vaddc.d	%r12, %r61, %r55

    vsubss.b	%r56, %r61, %r15
    vsubss.h	%r67, %r61, %r12
    vsubss.w	%r78, %r61, %r13
    vsubss.d	%r89, %r61, %r45

    vsubu.b	%r70, %r61, %r85
    vsubu.h	%r86, %r61, %r45
    vsubu.w	%r46, %r61, %r13
    vsubu.d	%r46, %r61, %r75

    vsubus.b	%r41, %r68, %r65
    vsubus.h	%r12, %r37, %r55
    vsubus.w	%r23, %r26, %r45
    vsubus.d	%r14, %r18, %r35

    vcmp.eq.b	%r86, %r61, %r25
    vcmp.eq.h	%r44, %r72, %r15
    vcmp.eq.w	%r20, %r83, %r55
    vcmp.eq.d	%r16, %r84, %r55

;	pcmpne	%r106, %r61, %r55
;	pcmpgt	%r106, %r61, %r55
;	pcmpge	%r106, %r61, %r55
;	pcmple	%r106, %r61, %r55

    vcmp.lt.b	%r13, %r61, %r15
    vcmp.lt.h	%r14, %r61, %r24
    vcmp.lt.w	%r15, %r61, %r38
    vcmp.lt.d	%r16, %r61, %r45

    vcmp.ltu.b	%r19, %r11, %r75
    vcmp.ltu.h	%r18, %r21, %r82
    vcmp.ltu.w	%r16, %r31, %r73
    vcmp.ltu.d	%r14, %r71, %r54

    vmrg.h.b	%r11, %r71, %r13
    vmrg.h.h	%r72, %r67, %r27
    vmrg.h.w	%r13, %r58, %r55
    vmrg.h.d	%r14, %r69, %r15

    vmrg.l.b	%r76, %r61, %r11
    vmrg.l.h	%r26, %r11, %r62
    vmrg.l.w	%r16, %r15, %r73
    vmrg.l.d	%r16, %r11, %r85

    write	"end simd(int) test"
endmmx:

.end
.text
; System-instruction test. Only the tpa (translate-physical-address)
; part executes; the jmp below skips everything from syscall to undef,
; so the privileged/system ops are assemble-only coverage.
    alloc	70
    write	"test system instructions (assembler only)"

    addi	%sp, %sp, -32	; alloc stack frame
    write	"test tpa for sp: 0x%x64(sp)"
    tpa		%r4, %sp
    write	"tpa(sp): 0x%x64(r4)"
    addi	%sp, %sp, 32	; rollback stack frame
    
    jmp		system_skip

; --- assemble-only from here to system_skip ---
    ldi		%r45, 1012
    syscall
    nop		0
    sysret
    rfi

; cache-block ops: instruction invalidate, data touch/flush/invalidate
    icb.i	%r34, 16
    dcb.t	%r34, 16
    dcb.f	%r34, 16
    dcb.i	%r34, 16


    mf.spr	%r34, %lid
    mt.spr	%r34, %lid
    m.probe	%r34, %r45, %r66
    ret.f	234567

    mf.spr	%r32, %iv
    mf.spr	%r32, %psr

; test system instructions
    ptc		%r10, %r45, %r11

    mf.spr	%r12, %pta
    mf.spr	%r12, %fpcr
    mt.spr	%r11, %rsc

; test atomic fences
    fence.a
    fence.r
    fence.ar
    fence.sc

; debug/instruction breakpoint and translation registers
    mt.dbr	%r44, %r66, 0
    mf.dbr	%r55, %r66, 0
    mt.ibr	%r44, %r66, 0
    mf.ibr	%r55, %r66, 0
    mt.itr	%r44, %r66, %r12
    mt.dtr	%r44, %r66, %r12

;	bpa	b7, %r7
;	bpal	b7, b4, %r6
;	lpr	b7, %r6, label16

    undef
system_skip:
    write	"end test system instructions (assembler only)"
.end
.text
.data
; 32 sequential byte values 0x00..0x1f, used to make misaligned loads
; produce recognizable byte patterns in the output.
; NOTE(review): the label precedes the align directive, so data_unaligned
; may not itself be 16-aligned — confirm that is intended.
data_unaligned:
align 16
    d1	0x00
    d1	0x01
    d1	0x02
    d1	0x03
    d1	0x04
    d1	0x05
    d1	0x06
    d1	0x07
    d1	0x08
    d1	0x09
    d1	0x0a
    d1	0x0b
    d1	0x0c
    d1	0x0d
    d1	0x0e
    d1	0x0f

    d1	0x10
    d1	0x11
    d1	0x12
    d1	0x13
    d1	0x14
    d1	0x15
    d1	0x16
    d1	0x17
    d1	0x18
    d1	0x19
    d1	0x1a
    d1	0x1b
    d1	0x1c
    d1	0x1d
    d1	0x1e
    d1	0x1f

.text
; Load at every misalignment: half at offsets 0-2, word at 0-4,
; double at 0-8, quad at 0-16, printing each result.
    write	"load/store unaligned"
    alloc	96
    ca.rf	%r17, data_unaligned

    ldz.h	%r3, %r17, 0
    write	"%x16(r3)"
    ldz.h	%r3, %r17, 1
    write	"%x16(r3)"
    ldz.h	%r3, %r17, 2
    write	"%x16(r3)"

    ldz.w	%r3, %r17, 0
    write	"%x32(r3)"
    ldz.w	%r3, %r17, 1
    write	"%x32(r3)"
    ldz.w	%r3, %r17, 2
    write	"%x32(r3)"
    ldz.w	%r3, %r17, 3
    write	"%x32(r3)"
    ldz.w	%r3, %r17, 4
    write	"%x32(r3)"

    ldz.d	%r3, %r17, 0
    write	"%x64(r3)"
    ldz.d	%r3, %r17, 1
    write	"%x64(r3)"
    ldz.d	%r3, %r17, 2
    write	"%x64(r3)"
    ldz.d	%r3, %r17, 3
    write	"%x64(r3)"
    ldz.d	%r3, %r17, 4
    write	"%x64(r3)"
    ldz.d	%r3, %r17, 5
    write	"%x64(r3)"
    ldz.d	%r3, %r17, 6
    write	"%x64(r3)"
    ldz.d	%r3, %r17, 7
    write	"%x64(r3)"
    ldz.d	%r3, %r17, 8
    write	"%x64(r3)"

    ld.q	%r3, %r17, 0
    write	"%x128(r3)"
    ld.q	%r3, %r17, 1
    write	"%x128(r3)"
    ld.q	%r3, %r17, 2
    write	"%x128(r3)"
    ld.q	%r3, %r17, 3
    write	"%x128(r3)"
    ld.q	%r3, %r17, 4
    write	"%x128(r3)"
    ld.q	%r3, %r17, 5
    write	"%x128(r3)"
    ld.q	%r3, %r17, 6
    write	"%x128(r3)"
    ld.q	%r3, %r17, 7
    write	"%x128(r3)"
    ld.q	%r3, %r17, 8
    write	"%x128(r3)"
    ld.q	%r3, %r17, 9
    write	"%x128(r3)"
    ld.q	%r3, %r17, 10
    write	"%x128(r3)"
    ld.q	%r3, %r17, 11
    write	"%x128(r3)"
    ld.q	%r3, %r17, 12
    write	"%x128(r3)"
    ld.q	%r3, %r17, 13
    write	"%x128(r3)"
    ld.q	%r3, %r17, 14
    write	"%x128(r3)"
    ld.q	%r3, %r17, 15
    write	"%x128(r3)"
    ld.q	%r3, %r17, 16
    write	"%x128(r3)"
.end
.rodata
align 4
; NOTE(review): these four absolute-address words precede the actual table;
; they appear to exercise `d4 <label>` relocations rather than being part of
; the jump table itself — confirm intent.
    d4	table_cases
    d4	label_0
    d4	label_1
    d4	label_2

; Jump table proper: signed 32-bit offsets of each case label relative to the
; table base, indexed by jmp.t below.
table_cases:
    i4	label_0 - table_cases
    i4	label_1 - table_cases
    i4	label_2 - table_cases

; Table-switch test: select case 1 via the relative-offset jump table above.
; The instructions inside each case body look like encoding/coverage filler
; rather than meaningful computation — only the printed banner matters.
.text
    alloc	80
    write	"test table switch to case 1"
    ldi		%r4, 1			; case index = 1
    ca.rf	%r5, table_cases	; r5 = table base
    jmp.t	%r5, %r4		; presumably: jump to table[r4] offset added to table base

label_0:
    write	"case 0"
    ; filler instructions (compare, bit-field deposit, immediates)
    cmp.eq.q	%r12, %r24, %gz
    cmp.ne.q	%r12, %r24, %gz
    dep.s	%r18, %r20, 13, 32
    dep.c	%r19, %r23, 13, 32
    ldi		%r12, -1234
    ldi		%r13, 3456
    jmp		label_after_switch

label_1:
    ; the selected case — its banner is the expected output
    write	"case 1"
    andi	%r45, %r44, 12345
    sl.add	%r14, %sp, %r12, 2
    sl.add	%r12, %r23, %r44, 3
    mov		%r12, %r13
    ldi		%r24, 0
    mt.spr	%r24, %psr		; write/read the processor status register via special-reg moves
    mf.spr	%r12, %psr
    nand	%r34, %r34, %r45
    sll		%r12, %r23, %r45
    slli	%r12, %r23, 45
    jmp		label_after_switch

label_2:
    write	"case 2"
    addi	%r34, %r34,-1
    mov		%r58, %r45
    sl.add	%r12, %r15, %r30, 14
    sl.add	%r12, %r15, %r30, 5
    sl.add	%r12, %r15, %r30, 5
    srd		%r34, %r56, %r40
    srdi	%r34, %r56, 40
    dep.a	%r40, %r78, 40, 20
    sl.add	%r54, %r45, %r22, 4
    sl.add	%r54, %r45, %r22, 20
    ca.xd	%r3, %r45, %tp, 3, 55
    jmp		label_after_switch

label_after_switch:
    write	"end table switch test"
.end
.rodata
; FP constants for the console-formatting test below; several use compile-time
; expressions, so this also exercises the assembler's expression evaluator.
    align	16
console_test_quad:
    quad	1.189731495357231765085759326628007e+4932	; near the largest finite IEEE binary128 value — TODO confirm
console_test_quad2:
    quad	1.23456789 + 32.0				; constant-folded sum
console_test_quad3:
    quad	0.2345678901234567890123456789012345678 + 0.2
    quad	2*asin(1)					; = pi, via assembler math functions
    quad	255						; integer promoted to quad float
console_test_double:
    double	acos(sin(3.1415926)) ;-1.2345678e+200
    double	444.689679
console_test_float:
    float	0.123456789123456789e+30
    float	2.123456789122233
    float	0.0
    float	1.0
.text
; Console `write` formatting test: special registers, every integer format
; specifier against %sp, general registers, then FP values loaded from the
; constants above. `%%` emits a literal percent sign.
    alloc	35
    write	"ip=%s(ip), eip=%s(eip), psr=%s(psr)"

    write	"end test write special regs"

    write	"\ntest write: general register"

    ; each line prints the specifier itself followed by sp formatted with it
    write	"%%i8(sp)  = %i8(sp)"
    write	"%%i16(sp) = %i16(sp)"
    write	"%%i32(sp) = %i32(sp)"
    write	"%%i64(sp) = %i64(sp)"
    write	"%%u8(sp)  = %u8(sp)"
    write	"%%u16(sp) = %u16(sp)"
    write	"%%u32(sp) = %u32(sp)"
    write	"%%u64(sp) = %u64(sp)"
    write	"%%x8(sp)  = 0x%x8(sp)"
    write	"%%x16(sp) = 0x%x16(sp)"
    write	"%%x32(sp) = 0x%x32(sp)"
    write	"%%x64(sp) = 0x%x64(sp)"

    ; sample of general registers, including aliases (gN, tp, sp)
    write	"%x64(r0)"
    write	"%x64(r1)"
    write	"%x64(r2)"
    write	"%x64(r22)"
    write	"%x64(r33)"
    write	"%x64(g0)"
    write	"%x64(g1)"
    write	"%x64(tp)"
    write	"%x64(sp)"

    write	"end test write general regs"

    ; FP formatting: load each constant, print raw hex and formatted float
    ld.q.r	%r22, console_test_quad
    write	"r22 = %x128(r22) %f128(r22)"
    ld.q.r	%r22, console_test_quad2
    write	"r22 = %x128(r22) %f128(r22)"
    ld.q.r	%r22, console_test_quad3
    write	"r22 = %x128(r22) %f128(r22)"
    ldz.d.r	%r22, console_test_double
    write	"r22 = %x64(r22) %f64(r22)"
    ldz.w.r	%r22, console_test_float
    write	"r22 = %x32(r22) %f32(r22)"

    write	"end test write fp regs"
.end