
.text
    alloc 96
    write "test carry-less multiply"
    clmul_ll %r34, %r21, %r22
    clmul_hl %r34, %r21, %r22
    clmul_hl %r34, %r22, %r21 ; lh: hl with swapped operands
    clmul_hh %r34, %r21, %r22
.rodata
align 16
vector_a:
    d8 0x7b5b546573745665
    d8 0x63746f725d53475d
vector_b:
    d8 0x4869285368617929
    d8 0x5b477565726f6e5d
result_00:
    d8 0x1d4d84c85c3440c0
    d8 0x929633d5d36f0451
result_01:
    d8 0x1bd17c8d556ab5a1
    d8 0x7fa540ac2a281315
result_10:
    d8 0x1a2bf6db3a30862f
    d8 0xbabf262df4b7d5c9
result_11:
    d8 0x1d1e1f2c592e7c45
    d8 0xd66ee03e410fd4ed
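; Note: the operand bytes are ASCII ("Hi(Shay)[Gueron]" in vector_b), and the
; four expected products appear to match the sample vectors from Intel's
; PCLMULQDQ white paper; result_XY is the carry-less product of half X of
; vector_a and half Y of vector_b (0 = low, 1 = high 64 bits).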
.text
    ld_iprel_i128   %r12, vector_a
    ld_iprel_i128   %r13, vector_b

    clmul_ll   %r11, %r12, %r13
    ld_iprel_i128   %r21, result_00
    write "clmul: %x128(r11) %x128(r21)"
    clmul_hl   %r11, %r13, %r12
    ld_iprel_i128   %r21, result_01
    write "clmul: %x128(r11) %x128(r21)"
    clmul_hl   %r11, %r12, %r13
    ld_iprel_i128   %r21, result_10
    write "clmul: %x128(r11) %x128(r21)"
    clmul_hh   %r11, %r12, %r13
    ld_iprel_i128   %r21, result_11
    write "clmul: %x128(r11) %x128(r21)"

    write "test aes"
    aes_dec %r11, %r12, %r13
    aes_dec_last %r11, %r12, %r13
    aes_enc %r11, %r12, %r13
    aes_enc_last %r11, %r12, %r13
    aes_imc %r11, %r12
    aes_keygen_assist %r11, %r12, 250
    write "end aes test"
.end
.text
;*****************************************************************
; ARITHMETIC
;*****************************************************************
    alloc	96
    write	"test load constant (1234567)"
    ld_imm %r1, 1234567
    write	"ldi: %i64(r1)"

    write	"test load long constant (123456789012345678)"
    ld_imm.l	%r1, 123456789012345678
    write	"ldi long: %i64(r1)"

    write	"test simple arithmetic"
    ld_imm		%r1, 1
    ld_imm		%r2, 2
    ld_imm		%r3, 3

    write	"add 1+2"
    add_i64		%r4, %r1, %r2
    write	"add: %i64(r4)"

    write	"add immediate 1+6"
    add_imm_i64	%r4, %r1, 6
    write	"addi: %i64(r4)"

    write	"sub 1-2"
    sub_i64		%r4, %r1, %r2
    write	"sub: %i64(r4)"

    write	"sub from immediate 6-1"
    subf_imm_i64	%r4, %r1, 6
    write	"subfi: %i64(r4)"

    write	"mul 3*4"
    ld_imm		%r1, 3
    ld_imm		%r2, 4
    mul_i64 %r4, %r1, %r2
    write	"mul: %i64(r4)"

    write	"12 div 4"
    ld_imm		%r1, 12
    ld_imm		%r2, 4
    div_i64 %r4, %r1, %r2
    write	"div: %i64(r4)"

    write	"15 mod 4"
    ld_imm		%r1, 15
    ld_imm		%r2, 4
    mod_i64 %r4, %r1, %r2
    write	"mod: %i64(r4)"

    write	"test int32_t add"
    ld_imm.l	%r1, 0xFFFFFFFF
    ld_imm.l	%r2, 0xFFFFFFF0
    add_i32 %r3, %r1, %r2
    write "add4: %i64(r3)"
    add_imm_i32.l %r3, %r1, 0xFFFFFFFF
    write "addis4.l: %i64(r3)"


    add_imm_i64	%r45, %r45, 12
    mov		%r54, %r56
    sub_i64		%r45, %r56, %r50
    add_imm_i64	%r45, %r55, -1000
    cmp_ne_i64 %r12, %r56, %r10
    subf_imm_i64 %r45, %r56, -10000
    subf_imm_i64 %r45, %r56, -20000
    cmp_eq_i64 %r13, %r56, %r50
    add_i64 %r45, %r56, %r50
    add_imm_i64 %r45, %r56, -10000
    mul_i64 %r45, %r56, %r50
    mul_imm_i64 %r45, %r56, -10000
    mov %r55, %r20
    ld_imm %r55, 1200
    ld_imm %r55, 987654
    ld_imm.l %r56, 98765432198765432
    add_imm_i64 %r12, %r13, -789
    cmp_ne_i64 %r14, %r13, %r77
    nand %r43, %r44, %r34
    nor %r43, %r44, %r34
    add_imm_i64 %r56, %sp, 0
    ; callr %r0, quadrat
    add_i64 %r56, %sp, %sp

    ld_imm.l %r55, -9223372036854775808
    add_imm_i64  %r56, %sp, -64
    subf_imm_i64.l %r55, %r56, 12345678901234567
    nor %r12, %r14, %r14
    add_imm_i64	%r56, %sp, -64
    nor %r12, %r14, %r14
    subf_imm_i64.l %r55, %r56, 12345678901234567
    add_imm_i64 %r56, %sp, -64
    subf_imm_i64.l %r55, %r56, -12345678901234567
    add_imm_i64   %r56, %sp, -64
    subf_imm_i64.l %r55, %r56, -12345678901234567
    add_imm_i64.l %r45, %r56, 12345678



    ld_imm.l	%r5, 0xaFFFFFFF12345677
    ld_imm.l	%r6, 0xaFFFFFFF12345678

    write	"test signed overflow: %i64(r5) %i64(r6)"

    write "add overflow"
    addo_i64 %r2, %r5, %r6
    write "addo: %i64(r2)"

    write "subtract overflow"
    subo_i64 %r2, %r5, %r6
    write "subo: %i64(r2)"

    write "test unsigned add carry"
    ld_imm %r7, -1
    ld_imm %r5, -2
    ld_imm %r6, -1
    add_addc_u64 %r2, %r5, %r6, %r7
    write "addaddc: %u64(r5) %u64(r6) %u64(r7) => %i64(r2)"

    write "test unsigned subtract borrow"
    ld_imm %r7, -1
    ld_imm %r5, 12
    ld_imm %r6, -1
    sub_subb_u64 %r2, %r5, %r6, %r7
    write "subsub: %u64(r5) %u64(r6) %u64(r7) => %i64(r2)"

    mul_add   %r34, %r45, %r67, %r80
    mul_sub   %r34, %r45, %r67, %r80
    mul_subf  %r34, %r45, %r67, %r80

    add_add_i64   %r34, %r45, %r67, %r80
    add_sub_i64   %r34, %r45, %r67, %r80
    sub_sub_i64   %r34, %r45, %r67, %r80

.end
.text
    alloc 96
    write "test atomic fetch-op"
    add_imm_i64 %r5, %sp, -64
    write "atomic base: %x64(r5)"
    ld_imm  %r10, 5
    ld_imm  %r12, 10
    ld_imm  %r56, 5

    write "test amo-add"

    amx_ld_add_u8 %r4, %r5, %r10
    amq_ld_add_u8 %r4, %r5, %r10
    amr_ld_add_u8 %r4, %r5, %r10
    amz_ld_add_u8 %r4, %r5, %r10

    amx_ld_add_u16  %r4, %r5, %r10
    amq_ld_add_u16 %r4, %r5, %r10
    amr_ld_add_u16 %r4, %r5, %r10
    amz_ld_add_u16 %r4, %r5, %r10

    amx_ld_add_u32 %r4, %r5, %r10
    amq_ld_add_u32 %r4, %r5, %r10
    amr_ld_add_u32 %r4, %r5, %r10
    amz_ld_add_u32 %r4, %r5, %r10

    amx_ld_add_u64 %r4, %r5, %r10
    amq_ld_add_u64 %r4, %r5, %r10
    amr_ld_add_u64 %r4, %r5, %r10
    amz_ld_add_u64 %r4, %r5, %r10

    amx_ld_add_u128 %r4, %r5, %r10
    amq_ld_add_u128 %r4, %r5, %r10
    amr_ld_add_u128 %r4, %r5, %r10
    amz_ld_add_u128 %r4, %r5, %r10

    write "test amo-and"

    amx_ld_and_u8 %r4, %r5, %r10
    amq_ld_and_u8 %r4, %r5, %r10
    amr_ld_and_u8 %r4, %r5, %r10
    amz_ld_and_u8 %r4, %r5, %r10

    amx_ld_and_u16 %r4, %r5, %r10
    amq_ld_and_u16 %r4, %r5, %r10
    amr_ld_and_u16 %r4, %r5, %r10
    amz_ld_and_u16 %r4, %r5, %r10

    amx_ld_and_u32 %r4, %r5, %r10
    amq_ld_and_u32 %r4, %r5, %r10
    amr_ld_and_u32 %r4, %r5, %r10
    amz_ld_and_u32 %r4, %r5, %r10

    amx_ld_and_u64 %r4, %r5, %r10
    amq_ld_and_u64 %r4, %r5, %r10
    amr_ld_and_u64 %r4, %r5, %r10
    amz_ld_and_u64 %r4, %r5, %r10

    amx_ld_and_u128 %r4, %r5, %r10
    amq_ld_and_u128 %r4, %r5, %r10
    amr_ld_and_u128 %r4, %r5, %r10
    amz_ld_and_u128 %r4, %r5, %r10

    write "test amo-or"

    amx_ld_or_u8 %r4, %r5, %r10
    amq_ld_or_u8 %r4, %r5, %r10
    amr_ld_or_u8 %r4, %r5, %r10
    amz_ld_or_u8 %r4, %r5, %r10

    amx_ld_or_u16 %r4, %r5, %r10
    amq_ld_or_u16 %r4, %r5, %r10
    amr_ld_or_u16 %r4, %r5, %r10
    amz_ld_or_u16 %r4, %r5, %r10

    amx_ld_or_u32 %r4, %r5, %r10
    amq_ld_or_u32 %r4, %r5, %r10
    amr_ld_or_u32 %r4, %r5, %r10
    amz_ld_or_u32 %r4, %r5, %r10

    amx_ld_or_u64 %r4, %r5, %r10
    amq_ld_or_u64 %r4, %r5, %r10
    amr_ld_or_u64 %r4, %r5, %r10
    amz_ld_or_u64 %r4, %r5, %r10

    amx_ld_or_u128 %r4, %r5, %r10
    amq_ld_or_u128 %r4, %r5, %r10
    amr_ld_or_u128 %r4, %r5, %r10
    amz_ld_or_u128 %r4, %r5, %r10

    write "test amo-xor"

    amx_ld_xor_u8 %r4, %r5, %r10
    amq_ld_xor_u8 %r4, %r5, %r10
    amr_ld_xor_u8 %r4, %r5, %r10
    amz_ld_xor_u8 %r4, %r5, %r10

    amx_ld_xor_u16 %r4, %r5, %r10
    amq_ld_xor_u16 %r4, %r5, %r10
    amr_ld_xor_u16 %r4, %r5, %r10
    amz_ld_xor_u16 %r4, %r5, %r10

    amx_ld_xor_u32 %r4, %r5, %r10
    amq_ld_xor_u32 %r4, %r5, %r10
    amr_ld_xor_u32 %r4, %r5, %r10
    amz_ld_xor_u32 %r4, %r5, %r10

    amx_ld_xor_u64 %r4, %r5, %r10
    amq_ld_xor_u64 %r4, %r5, %r10
    amr_ld_xor_u64 %r4, %r5, %r10
    amz_ld_xor_u64 %r4, %r5, %r10

    amx_ld_xor_u128 %r4, %r5, %r10
    amq_ld_xor_u128 %r4, %r5, %r10
    amr_ld_xor_u128 %r4, %r5, %r10
    amz_ld_xor_u128 %r4, %r5, %r10

    write "test amo-smin"
    amx_ld_min_i8     %r4, %r5, %r10
    amq_ld_min_i8     %r4, %r5, %r10
    amr_ld_min_i8     %r4, %r5, %r10
    amz_ld_min_i8    %r4, %r5, %r10

    amx_ld_min_i16   %r4, %r5, %r10
    amq_ld_min_i16   %r4, %r5, %r10
    amr_ld_min_i16   %r4, %r5, %r10
    amz_ld_min_i16  %r4, %r5, %r10

    amx_ld_min_i32    %r4, %r5, %r10
    amq_ld_min_i32    %r4, %r5, %r10
    amr_ld_min_i32    %r4, %r5, %r10
    amz_ld_min_i32   %r4, %r5, %r10

    amx_ld_min_i64   %r4, %r5, %r10
    amq_ld_min_i64   %r4, %r5, %r10
    amr_ld_min_i64   %r4, %r5, %r10
    amz_ld_min_i64  %r4, %r5, %r10

    amx_ld_min_i128   %r4, %r5, %r10
    amq_ld_min_i128   %r4, %r5, %r10
    amr_ld_min_i128   %r4, %r5, %r10
    amz_ld_min_i128  %r4, %r5, %r10

    write "test amo-smax"
    amx_ld_max_i8   %r4, %r5, %r10
    amq_ld_max_i8   %r4, %r5, %r10
    amr_ld_max_i8   %r4, %r5, %r10
    amz_ld_max_i8  %r4, %r5, %r10

    amx_ld_max_i16   %r4, %r5, %r10
    amq_ld_max_i16   %r4, %r5, %r10
    amr_ld_max_i16   %r4, %r5, %r10
    amz_ld_max_i16  %r4, %r5, %r10

    amx_ld_max_i32   %r4, %r5, %r10
    amq_ld_max_i32   %r4, %r5, %r10
    amr_ld_max_i32   %r4, %r5, %r10
    amz_ld_max_i32  %r4, %r5, %r10

    amx_ld_max_i64   %r4, %r5, %r10
    amq_ld_max_i64   %r4, %r5, %r10
    amr_ld_max_i64   %r4, %r5, %r10
    amz_ld_max_i64  %r4, %r5, %r10

    amx_ld_max_i128   %r4, %r5, %r10
    amq_ld_max_i128   %r4, %r5, %r10
    amr_ld_max_i128   %r4, %r5, %r10
    amz_ld_max_i128  %r4, %r5, %r10

    write "test amo-umin"
    amx_ld_min_u8   %r4, %r5, %r10
    amq_ld_min_u8   %r4, %r5, %r10
    amr_ld_min_u8   %r4, %r5, %r10
    amz_ld_min_u8  %r4, %r5, %r10

    amx_ld_min_u16   %r4, %r5, %r10
    amq_ld_min_u16   %r4, %r5, %r10
    amr_ld_min_u16   %r4, %r5, %r10
    amz_ld_min_u16  %r4, %r5, %r10

    amx_ld_min_u32  %r4, %r5, %r10
    amq_ld_min_u32  %r4, %r5, %r10
    amr_ld_min_u32  %r4, %r5, %r10
    amz_ld_min_u32 %r4, %r5, %r10

    amx_ld_min_u64   %r4, %r5, %r10
    amq_ld_min_u64   %r4, %r5, %r10
    amr_ld_min_u64   %r4, %r5, %r10
    amz_ld_min_u64  %r4, %r5, %r10

    amx_ld_min_u128   %r4, %r5, %r10
    amq_ld_min_u128   %r4, %r5, %r10
    amr_ld_min_u128   %r4, %r5, %r10
    amz_ld_min_u128  %r4, %r5, %r10

    write "test amo-umax"
    amx_ld_max_u8    %r4, %r5, %r10
    amq_ld_max_u8    %r4, %r5, %r10
    amr_ld_max_u8    %r4, %r5, %r10
    amz_ld_max_u8   %r4, %r5, %r10

    amx_ld_max_u16    %r4, %r5, %r10
    amq_ld_max_u16    %r4, %r5, %r10
    amr_ld_max_u16    %r4, %r5, %r10
    amz_ld_max_u16   %r4, %r5, %r10

    amx_ld_max_u32    %r4, %r5, %r10
    amq_ld_max_u32    %r4, %r5, %r10
    amr_ld_max_u32    %r4, %r5, %r10
    amz_ld_max_u32   %r4, %r5, %r10

    amx_ld_max_u64    %r4, %r5, %r10
    amq_ld_max_u64    %r4, %r5, %r10
    amr_ld_max_u64    %r4, %r5, %r10
    amz_ld_max_u64   %r4, %r5, %r10

    amx_ld_max_u128   %r4, %r5, %r10
    amq_ld_max_u128   %r4, %r5, %r10
    amr_ld_max_u128   %r4, %r5, %r10
    amz_ld_max_u128  %r4, %r5, %r10

    write "test cas"

    amx_cas_i8    %r12, %r5, %r56
    amq_cas_i8    %r12, %r5, %r56
    amr_cas_i8    %r12, %r5, %r56
    amz_cas_i8   %r12, %r5, %r56

    amx_cas_i16   %r12, %r5, %r56
    amq_cas_i16   %r12, %r5, %r56
    amr_cas_i16   %r12, %r5, %r56
    amz_cas_i16  %r12, %r5, %r56

    amx_cas_i32   %r12, %r5, %r56
    amq_cas_i32   %r12, %r5, %r56
    amr_cas_i32   %r12, %r5, %r56
    amz_cas_i32  %r12, %r5, %r56

    amx_cas_i64   %r12, %r5, %r56
    amq_cas_i64   %r12, %r5, %r56
    amr_cas_i64   %r12, %r5, %r56
    amz_cas_i64  %r12, %r5, %r56

    amx_cas_i128  %r12, %r5, %r56
    amq_cas_i128  %r12, %r5, %r56
    amr_cas_i128  %r12, %r5, %r56
    amz_cas_i128 %r12, %r5, %r56

    write "test load atomic relaxed"
    amx_ld_i8    %r12, %r5
    amx_ld_i16   %r12, %r5
    amx_ld_i32   %r12, %r5
    amx_ld_i64   %r12, %r5
    amx_ld_i128  %r12, %r5

    write "test load atomic acquire"
    amq_ld_i8    %r12, %r5
    amq_ld_i16   %r12, %r5
    amq_ld_i32   %r12, %r5
    amq_ld_i64   %r12, %r5
    amq_ld_i128  %r12, %r5

    write "test store atomic relaxed"
    amx_st_i8    %r12, %r5
    amx_st_i16   %r12, %r5
    amx_st_i32   %r12, %r5
    amx_st_i64   %r12, %r5
    amx_st_i128  %r12, %r5

    write "test store atomic release"
    amr_st_i8    %r12, %r5
    amr_st_i16   %r12, %r5
    amr_st_i32   %r12, %r5
    amr_st_i64   %r12, %r5
    amr_st_i128  %r12, %r5

.end
.text
.data
data_lbl:
    d1 25
    d1 26
    d1 27
    d1 28

.text
program_start:
; Here we test references to the data section.
; Absolute offset from the beginning of the section.
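; The hi/lo pair works like a conventional split relocation: the label's
; offset within its section is data_hi(data_lbl) + data_lo(data_lbl), so a
; base register fixed up with the data_hi part reaches the label using the
; data_lo part as the load/store displacement (see the ld_u8 sequence below).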
    write "base addressing"
    alloc 96
    lda_r %r17, program_start
    ld_imm %r12, data_lbl
    write "data_lbl: %i64(r12)"

    ld_imm %r12, data_hi(data_lbl)
    write "data_hi(data_lbl): %i64(r12)"
    ld_imm %r12, data_lo(data_lbl)
    write "data_lo(data_lbl): %i64(r12)"
    lda_iprel %r13, data_lbl
    write "ca.rf(data_lbl): %x64(r13)"
    lda_iprel.l %r13, data_lbl
    write "ca.rf(data_lbl): %x64(r13)"

    add_imm_i64 %r13, %r17, data_hi(data_lbl)
    write "r13     %i64(r13)"
    add_imm_i64 %r14, %r13, data_lo(data_lbl)+0
    write "r14     %i64(r14)"

    add_imm_i64 %r13, %r17, data_hi(data_lbl)
    write "r13     %i64(r13)"
    ld_u8 %r25, %r13, data_lo(data_lbl)+0
    ld_u8 %r26, %r13, data_lo(data_lbl)+1
    ld_u8 %r27, %r13, data_lo(data_lbl)+2
    ld_u8 %r28, %r13, data_lo(data_lbl)+3
    write "r25     %i64(r25)" ; must be 25
    write "r26     %i64(r26)" ; must be 26
    write "r27     %i64(r27)" ; must be 27
    write "r28     %i64(r28)" ; must be 28

; test load context
    ld_u64 %r1, %sp, -16
    st_i64 %r1, %sp, -16
    jmp skipaddr
    jmp.l skipaddr

; test indexed load/store
    st_xi64_i8  %r12, %r15, %r30, 4, 14
    st_xi64_i16 %r12, %r15, %r30, 4, 14
    st_xi64_i32 %r12, %r15, %r30, 4, 14
    st_xi64_i64 %r12, %r15, %r30, 4, 14

    amx_ld_i128 %r30, %r56
    amx_st_i128 %r43, %r56

    sl_add_i64 %r43, %r56, %r23, 4
    sl_sub_i64 %r43, %r56, %r23, 42
    sl_subf_i64 %r43, %r56, %r23, 12

    ld_u32 %r30, %r5, 66*4 ; load mid
    ld_xi64_u64 %r40, %tp, %r30, 0, 4 ; load base

    ld_xi64_i64 %r12, %r23, %r40, 3, 54
    ld_xi64_i64 %r12, %r23, %r40, 3, 54
    ld_xi64_u64 %r12, %r23, %r40, 3, 54
    ld_xi64_u64 %r12, %r23, %r40, 3, 54
    st_xi64_i32 %r12, %r23, %r40, 3, 54
    st_xi64_i64 %r12, %r23, %r40, 3, 54

    ld_xi64_i8 %r12, %r23, %r40, 3, 54
    ld_xi64_i8 %r12, %r23, %r40, 3, 54
    ld_xi64_u8 %r12, %r23, %r40, 3, 54
    ld_xi64_u8 %r12, %r23, %r40, 3, 54
    st_xi64_i8 %r12, %r23, %r40, 3, 54
    st_xi64_i8 %r12, %r23, %r40, 3, 54

    ld_xi64_i16 %r12, %r23, %r40, 3, 54
    ld_xi64_i16 %r12, %r23, %r40, 3, 54
    ld_xi64_u16 %r12, %r23, %r40, 3, 54
    ld_xi64_u16 %r12, %r23, %r40, 3, 54
    st_xi64_i16 %r12, %r23, %r40, 3, 54
    st_xi64_i16 %r12, %r23, %r40, 3, 54

.text
; LOAD/STORE
    sl_add_i64 %r54, %r56, %r12, 5

    ld_u8 %r16, %r45, 8900
    ld_i8 %r15, %r46, 8900
    ld_xi64_u8 %r54, %r56, %r12, 2, 37
    ld_xi64_i8 %r53, %r65, %r12, 2, 37
    ld_xi64_u8.l %r54, %r56, %r12, 2, 37000000
    ld_xi64_i8.l %r53, %r65, %r12, 2, -37000000
    ld_mia_u8 %r52, %r75, 10
    ld_mia_i8 %r51, %r76, 10
    ld_mib_u8 %r52, %r75, 10
    ld_mib_i8 %r51, %r76, 10
    st_mia_i8 %r51, %r76, 10
    st_mib_i8 %r52, %r75, 10

    ld_u16 %r12, %r45, 8900
    ld_i16 %r12, %r45, 8900
    ld_xi64_u16 %r54, %r56, %r12, 3, -57
    ld_xi64_i16 %r54, %r56, %r12, 2, 37
    ld_xi64_u16.l %r54, %r56, %r12, 2, 37000000
    ld_xi64_i16.l %r53, %r65, %r12, 2, -37000000
    ld_mia_u16 %r54, %r56, 12
    ld_mia_i16 %r54, %r56, -60
    ld_mib_u16 %r54, %r56, 12
    ld_mib_i16 %r54, %r56, -60
    st_mia_i16 %r51, %r76, 10
    st_mib_i16 %r52, %r75, 10

    ld_u32 %r12, %r45, 8900
    ld_i32 %r12, %r45, 8900
    ld_xi64_u32 %r54, %r56, %r12, 2, 7
    ld_xi64_i32 %r54, %r56, %r12, 2, 7
    ld_xi64_u32.l %r54, %r56, %r12, 2, 37000000
    ld_xi64_i32.l %r53, %r65, %r12, 2, -37000000
    ld_mia_u32 %r54, %r56, 12
    ld_mia_i32 %r54, %r56, 32
    ld_mib_u32 %r54, %r56, 12
    ld_mib_i32 %r54, %r56, 32
    st_mia_i32 %r51, %r76, 10
    st_mib_i32 %r52, %r75, 10

    ld_u64   %r54, %r56, 5600
    ld_i64   %r54, %r56, 5600
    ld_u64.l %r53, %r46, 98765432
    ld_u64    %r52, %r45, -5600
    ld_u64.l   %r51, %r55, -98765432
    ld_xi64_u64  %r50, %r56, %r12, 2, 37
    ld_xi64_i64 %r50, %r56, %r12, 2, 37
    ld_xi64_u64.l %r54, %r56, %r12, 2, 37000000
    ld_xi64_i64.l %r53, %r65, %r12, 2, -37000000
    ld_mia_u64 %r57, %r56, -12
    ld_mia_u64 %r57, %r56, -12
    ld_mia_i64 %r57, %r56, -12
    ld_mia_i64 %r57, %r56, -12
    ld_mib_u64 %r57, %r56, -12
    ld_mib_u64 %r57, %r56, -12
    ld_mib_i64 %r57, %r56, -12
    ld_mib_i64 %r57, %r56, -12
    st_mia_i64 %r51, %r76, 10
    st_mib_i64 %r52, %r75, 10

    ld_i128  %r16, %r45, 8900
    ld_i128.l %r16, %r45, 8900000
    ld_i128.l %r16, %r45, -8900000
    ld_xi64_i128 %r54, %r56, %r12, 2, 37
    ld_xi64_i128.l %r54, %r56, %r12, 2, 37000000
    ld_xi64_i128.l %r54, %r56, %r12, 2, -37000000
    ld_mia_i128 %r52, %r75, 10
    ld_mia_i128 %r52, %r75, 10
    ld_mib_i128 %r52, %r75, 10
    ld_mib_i128 %r52, %r75, 10
    st_mia_i128 %r51, %r76, 10
    st_mib_i128 %r52, %r75, 10

    st_i8  %r12, %r45, 8900
    st_i16 %r12, %r45, 8900
    st_i32 %r12, %r45, 8900
    st_i64 %r12, %r45, 890*8

    ld_u64    %r12, %r45, 8048
    st_i64    %r12, %r45, 8064
    ld_xi64_u64 %r12, %r45, %r13, 3, 7
    st_xi64_i64 %r12, %r45, %r13, 3, 7

    ld_u64  %r60, %r55, 56
    ld_u64  %r60, %r56, 56
    ld_u64  %r46, %r55, 120
    st_i64  %r47, %r55, 56

    ld_u64     %r60, %sp, 624
    st_i64     %r60, %sp, 624
    ld_xi64_u64  %r60, %sp, %r12, 3, 28
    st_xi64_i64  %r60, %sp, %r12, 3, 26
    ld_u64     %r56, %r57, 567
    st_i64     %r56, %r57, 567

    ld_u32 %r34, %r12, 900
    ld_u64 %r34, %r12, 900
    st_i32 %r23, %r12, 900
    st_i64 %r23, %r12, 900

    ld_i128 %r34, %r13, 55*16
    st_i128 %r35, %r13, 55*16
    ld_xi64_i128 %r34, %r13, %r45, 3, 60
    st_xi64_i128 %r34, %r13, %r45, 3, 60

skipaddr:
    nop 0
.end
.text
    alloc 25
    ld_imm.l %r23, 0x1234567890abcdef
    write "test population statistic instructions"
    cnt_pop %r12, %r23, 3
    write "cntpop: %i64(r12)"
    cnt_lz %r12, %r23, 0
    write "cntlz %i64(r12)"
    cnt_tz %r12, %r23, 1
    cnt_lz %r12, %r23, 2
    cnt_tz %r12, %r23, 3
    cnt_lz %r12, %r23, 4
    cnt_tz %r12, %r23, 5
.end
.text
    write	"test bit reverse instruction (permb)"
    alloc	80
    ld_imm.l	%r55, 0x1234567890ABCDEF
    write	"initial value: %x64(r55)"
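; The permb control value appears to XOR each source bit index: 63 flips all
; six index bits (full bit reverse), while clearing low control bits leaves
; that many low index bits in place, giving the pair/nibble/byte/2-byte/4-byte
; group reversals below. Applying the same permb twice restores the value, as
; the next two instructions demonstrate.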
    permb	%r55, %r55, 63
    permb	%r56, %r78, 63
    write	"r55 %x64(r55) %b64(r55)"
    permb	%r55, %r55, 63
    write	"r55 %x64(r55) %b64(r55)"

    permb	%r56, %r55, 0b111111 ;63
    write	"reverse bits: %x64(r56)"

    permb	%r56, %r55, 0b111110  ;32+16+8+4+2
    write	"reverse bit-pairs: %x64(r56)"

    permb	%r56, %r55, 0b111100  ;32+16+8+4
    write	"reverse nibbles (4-bits): %x64(r56)"

    permb	%r56, %r55, 0b111000 ;32+16+8
    write	"reverse bytes: %x64(r55) => %x64(r56)"

    permb	%r56, %r55, 0b110000  ;32+16
    write	"reverse 2-byte groups: %x64(r55) => %x64(r56)"

    permb	%r56, %r55, 0b100000  ;32
    write	"reverse 4-byte groups: %x64(r55) => %x64(r56)"
.end
.text
    alloc 46
    write "test bitwise logical"
    and %r23,  %r25, %r45
    and_imm    %r23, %r25, 12345
    and_imm.l  %r23, %r25, 1234567890
    andn %r23, %r25, %r45
    andn_imm   %r23, %r25, 12345
    or %r23,   %r25, %r45
    or_imm     %r23, %r25, 12345
    or_imm.l   %r23, %r25, 1234567890
    orn %r23,  %r25, %r45
    orn_imm    %r23, %r25, 12345
    xor %r23,  %r25, %r45
    xor_imm    %r23, %r25, 12345
    xor_imm.l  %r23, %r25, 1234567890
    nor        %r23, %r25, %r45
    nand       %r23, %r25, %r45
    xnor       %r23, %r25, %r45
.end
.text
    write "branch-int, test memory"
.data
align 8
test_memory:
    d8 0
    d8 1
    d8 2
    d8 3
    d8 4
    d8 5
    d8 6
    d8 7
.text
    alloc 20
    lda_iprel %r12, test_memory
    write "test_memory: %x64(r12)"
    ld_imm %r11, 0
    ld_imm %r14, 0
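; Tight load loop: r11 cycles 0..7 over test_memory (and_imm masks the index)
; while r14 counts iterations; the long branch repeats this 200000 times, so
; every load stays within the same 64 bytes of data.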
memory_loop: (32)
    ld_xi64_u64 %r13, %r12, %r11, 3, 0
    add_imm_i64 %r11, %r11, 1
    add_imm_i64 %r14, %r14, 1
    and_imm %r11, %r11, 7
; fast_check
    br_lt_imm_i64.l %r14, 200000, memory_loop
    write "counter: %i64(r14)"
.end
.text
    alloc 20
    write "test compare-with-zero-and-long-branch"
compare_with_zero_test_continue:
compare_with_zero_backward_target:
    add_imm_i64    %r2, %r2, 1
    br_eq_i64 %r2, %r2, compare_with_zero_test_exit

    br_eq_i64   %r1, %gz, compare_with_zero_forward_target
    br_eq_i64.l %r1, %gz, compare_with_zero_forward_target
    br_eq_i64   %r1, %gz, compare_with_zero_backward_target
    br_eq_i64.l %r1, %gz, compare_with_zero_backward_target
    br_ne_i64   %r1, %gz, compare_with_zero_forward_target
    br_ne_i64.l %r1, %gz, compare_with_zero_forward_target
    br_ne_i64   %r1, %gz, compare_with_zero_backward_target
    br_ne_i64.l %r1, %gz, compare_with_zero_backward_target
    br_lt_i64   %r1, %gz, compare_with_zero_forward_target
    br_lt_i64.l %r1, %gz, compare_with_zero_forward_target
    br_lt_i64   %r1, %gz, compare_with_zero_backward_target
    br_lt_i64.l %r1, %gz, compare_with_zero_backward_target
    br_le_i64   %r1, %gz, compare_with_zero_forward_target
    br_le_i64.l %r1, %gz, compare_with_zero_forward_target
    br_le_i64   %r1, %gz, compare_with_zero_backward_target
    br_le_i64.l %r1, %gz, compare_with_zero_backward_target
    br_gt_i64   %r1, %gz, compare_with_zero_forward_target
    br_gt_i64.l %r1, %gz, compare_with_zero_forward_target
    br_gt_i64   %r1, %gz, compare_with_zero_backward_target
    br_gt_i64.l %r1, %gz, compare_with_zero_backward_target
    br_ge_i64   %r1, %gz, compare_with_zero_forward_target
    br_ge_i64.l %r1, %gz, compare_with_zero_forward_target
    br_ge_i64   %r1, %gz, compare_with_zero_backward_target
    br_ge_i64.l %r1, %gz, compare_with_zero_backward_target

compare_with_zero_forward_target:
    jmp compare_with_zero_test_continue
compare_with_zero_test_exit:
    write "end test compare-with-zero-and-long-branch"
.end
.text

call_code_target:

.rodata
call_data_target:

.text
    jmp	callexample
;*****************************************************************
; Function quadrat: computes A**4 of parameter A, passed in register r33
;*****************************************************************
quadrat:
    write	"function quadrat entered: r0=%x128(r0)"
    alloc	93
    write	"rsc     %s(rsc)"
    write	"psr     %s(psr)"
    write "rsc     %s(rsc)"
    mul_i64 %r33, %r33, %r33
    mul_i64 %r33, %r33, %r33
    write "r0=%x128(r0) r33=%i64(r33)"
    write "%m(dump)"
; mtspr %r45, psr
    write "function quadrat exited"
    ret
end_quadrat:

;*****************************************************************
; Example of calling sequence with branch prediction
callexample:
    alloc	91
    ld_imm.l	%r90, 0x1234567890abcdef
    write	"arg3 %x64(r90)"
    srp_imm_i64	%r89, %r90, %r90, 16
    write	"arg2 %x64(r89)"
    srp_imm_i64	%r88, %r90, %r90, 16
    write	"arg1 %x64(r88)"
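; srp_imm_i64 rd, ra, rb, n is read here as a funnel shift: shift the 128-bit
; concatenation ra:rb right by n bits and keep the low 64; with both sources
; equal, as above, that amounts to a rotate right by n (an assumption based on
; the "shift right pair" mnemonic).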
    ld_imm		%r87, 7		; setup arguments
;   write	"%m(dump)"
    write	"rsc: %s(rsc)"
    write	"function quadrat called"
    call_r	%r86, quadrat
    write	"rsc: %s(rsc)"
; Remaining instructions after returning from the subroutine
;*****************************************************************
.text	; return to code section

; Here we test registers used by ABI (application binary interface)
; Check loader.
    write	"sp=%x64(sp) tp=%x64(tp) r0=%x128(r0)"
    write	"rsc: %s(rsc)"
    write	"psr: %s(psr)"
    write	"r14: %x64(r14)"
    write	"reta: %i64(r72)"		; outgoing return address
    write	"retv: %i64(r73)"		; outgoing return value
    write	"rsc: %s(rsc)"
    write	"psr: %s(psr)"
    ld_imm.l	%r11, 0x407d8bffffccccff
    write	"r11: %x64(r11)"
    add_imm_i64.l %r12, %r11, 0x400000
    write	"r12: %x64(r12)"
    xor		%r20, %r19, %r11
    add_imm_i64.l	%r20, %r20, 0x400000
    ld_imm		%r10, 10
    ld_imm		%r11, 11
    cmp_lt_i64  %r2, %r11, %r10
    write "%i64(r11) %i64(r10)"
    jmp  call_exit

    call_r %r42, quadrat
    call_ri %r42, %r34, %gz
    call_mi %r42, %r34, 468
    call_plt %r42, call_data_target
    call_ri %r42, %r34, %gz

call_exit:
    write	"end call test"

.end
.text
    alloc 47
    write "test recursive calls"
    ld_imm.l %r46, 0x7FFFFFFFFFFFFFFF ; comment
    ld_imm.l %r46, 0x8000000000000000
    add_imm_i64 %r46, %r46, -1
    write "%i64(r46)"

    mfspr %r20, %rsc

    alloc 54 ; extend frame to 54 regs
    ld_imm %r48, 1 ; will be preserved across the call
    ld_imm %r53, 3 ; 1st arg
    ld_imm %r52, 2 ; 2nd arg
    ld_imm %r51, 1 ; 3rd arg
    write "rsc: %s(rsc)"
    call_r %r50, func ; call func subroutine, saving 50 regs
    write "r51=%i64(r51) rsc=%s(rsc)"
    ld_imm %r53, 10
    call_r %r52, rekurs
    write "rsc: %s(rsc)"
    write "rsp: %s(rsp)"
;   write "%m(dump)"
    jmp smallend
func:
; at entry point func subroutine has 4 regs in frame
    alloc 8   ; extend frame from 4 to 8 regs
    write "r0      %x128(r0)" ; print packed caller frame and return address
    write "r1=%i64(r1) r2=%i64(r2) r3=%i64(r3)" ; print args
    ld_imm %r1, 12345
    ret

rekurs:
    alloc 4
    write "r0=%x128(r0) r1=%i64(r1)"
    write "rsc: %s(rsc)"
    write "rsp: %s(rsp)"
    add_imm_i64 %r3, %r1, -1
    ld_imm %r2, 0
    br_eq_i64 %r1, %r2, rekret
; cneq %r1, %r2, 1, 0
    call_r %r2, rekurs
rekret:
    write "rsp: %s(rsp)"
    write "r0: %x128(r0)"
    retf 0
smallend:
    nop 0
    nop 111
    alloc 96
    write "end_call_recursive"
.end
.text
    ; at the beginning of the program, the register stack is empty
    alloc 54   ; expand frame to 54 registers
    eh_adj simple_func_end
    ld_imm %r47, 1  ; will be saved when called
    ld_imm %r53, 3  ; first argument
    ld_imm %r52, 2  ; second argument
    ld_imm %r51, 1  ; third argument
    ; func procedure call, all registers up to 50 will be saved,
    ; return address, eip, frame size (50) are saved in r50
    call_r %r50, simple_func
    ; at this point, after returning, the frame will again be 54 registers
    jmp simple_func_end
simple_func:
    ; at its entry point, the func procedure has a 4-register frame;
    ; their previous numbers were 50, 51, 52, 53, the new ones are 0, 1, 2, 3
    ; extend the frame to 10 registers (adding r4..r9)
    alloc 10
    write "r0 = %x128(r0)"; print packed return info
    write "r1 = %i64(r1)" ; print 1st argument
    write "r2 = %i64(r2)" ; print 2nd argument
    write "r3 = %i64(r3)" ; print 3rd argument
    ret
simple_func_end:
    nop 123
.end
.text
    write "example of carry/borrow testing"
    alloc 96

; 256-bit add (r30,r31,r32,r33) + (r40,r41,r42,r43) => (r50,r51,r52,r53), final carry in r54
    ld_imm  %r30, -1
    ld_imm  %r31, -1
    ld_imm  %r32, -1
    ld_imm  %r33, -1

    ld_imm  %r40, 1
    ld_imm  %r41, 0
    ld_imm  %r42, 0
    ld_imm  %r43, 0

; carry-propagating (ripple) add
    cmp_eq_i64     %r10, %r30, %r40 ; add carry out
    add_i64      %r50, %r30, %r40 ; add
    cmp_eq_imm_i64    %r12, %r31, 1
    add_imm_i64   %r51, %r31, 1

    cmp_eq_i64 %r12, %r31, %r41 ; add carry out
    add_i64  %r51, %r31, %r41 ; add
    cmp_eq_i64 %r14, %r32, %r42 ; add carry out
    add_i64  %r52, %r32, %r42 ; add
    cmp_eq_i64 %r8, %r33, %r43 ; add carry out
    add_i64 %r53, %r33, %r43 ; add
    write "add carryis"
    add_imm_i64 %r51, %r51, 1
    add_imm_i64 %r52, %r52, 1
    add_imm_i64 %r53, %r53, 1
; set last carry
    ld_imm  %r54, 1
    ld_imm  %r54, 0
    write "multiprecision add:\nr50,r51,r52,r53,r54 = %x64(r50) %x64(r51) %x64(r52) %x64(r53) %x64(r54)"

    ld_imm.l %r40, 0x7fffffffffffffff
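; mul_h presumably returns the upper 64 bits of the full 128-bit product
; (plain mul_i64 yields the lower 64).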
    mul_h %r40, %r40, %r41
    write "r40     %x64(r40)"

    ld_imm   %r12, 12345
    ld_imm.l %r12, 12345678900

; ld_imm %r14, 0xFFFFFFFFF0
; ld8 %r13, %r14, 0

    addc_u64 %r12, %r14, %r46
    addc_u64 %r12, %r14, %r46
    subb_u64 %r12, %r14, %r46
    subb_u64 %r12, %r14, %r46
    add_addc_u64 %r12, %r14, %r46, %r23
    add_addc_u64 %r12, %r14, %r46, %r22
    sub_subb_u64 %r12, %r14, %r46, %r13
    sub_subb_u64 %r12, %r14, %r46, %r14
    write "end carry test"
    nop 11111
.end
.text
    write "test compare"
    alloc 96
    ld_imm %r20, 4
    ld_imm %r21, 3
    ld_imm %r22, -4
    ld_imm %r23, -12
    write "test compare instructions"

    cmp_eq_i64 %r12, %r20, %r21
    cmp_lt_i64 %r12, %r20, %r21
    cmp_lt_u64 %r12, %r20, %r21
    cmp_eq_imm_i64 %r12, %r20, 123456
    cmp_lt_imm_i64 %r12, %r20, 123456
    cmp_lt_imm_u64 %r12, %r20, 123456
    cmp_ne_i64 %r12, %r20, %r21
    cmp_ne_imm_i64 %r12, %r20, 123456
    cmp_gt_imm_i64 %r12, %r20, 123456
    cmp_gt_imm_u64 %r12, %r20, 123456
    cmp_le_i64 %r12, %r20, %r21
    cmp_le_u64 %r12, %r20, %r21

    cmp_ge_imm_i64 %r12, %r20, 123456
    cmp_ge_imm_u64 %r12, %r20, 123456
    cmp_le_imm_i64 %r12, %r20, 123456
    cmp_le_imm_u64 %r12, %r20, 123456

    cmp_eq_i32 %r12, %r20, %r21
    cmp_lt_i32 %r12, %r20, %r21
    cmp_lt_u32 %r12, %r20, %r21
    cmp_eq_imm_i32 %r12, %r20, 123456
    cmp_lt_imm_i32 %r12, %r20, 123456
    cmp_lt_imm_u32 %r12, %r20, 123456
    cmp_ne_i32 %r12, %r20, %r21
    cmp_ne_imm_i32 %r12, %r20, 123456
    cmp_gt_imm_i32 %r12, %r20, 123456
    cmp_gt_imm_u32 %r12, %r20, 123456
    cmp_le_i32 %r12, %r20, %r21
    cmp_le_u32 %r12, %r20, %r21

    write	"compare aliases (pseudo-instructions)"
    cmp_gt_i64 %r12, %r20, %r21	; cmp_lt  %r12, %r21, %r20
    cmp_gt_u64 %r12, %r20, %r21	; cmp_ltu %r12, %r21, %r20
    cmp_lt_imm_i64 %r12, %r20, 123456	; cmp_lti  %r12, %r20, 123456
    cmp_lt_imm_u64 %r12, %r20, 123456	; cmp_ltui %r12, %r20, 123456
    cmp_ge_i64 %r12, %r20, %r21	; cmp_leq %r12, %r21, %r20
    cmp_ge_u64 %r12, %r20, %r21	; cmp_leu %r12, %r21, %r20
    cmp_gt_imm_i64 %r12, %r20, 123456	; cmp_gti  %r12, %r20, 123456
    cmp_gt_imm_u64 %r12, %r20, 123456	; cmp_gtui %r12, %r20, 123456


    cmp_gt_i32 %r12, %r20, %r21	; cmp_lt4   %r12, %r21, %r20
    cmp_gt_u32 %r12, %r20, %r21	; cmp_ltu4  %r12, %r21, %r20
    cmp_lt_imm_i32 %r12, %r20, 123456	; cmp_lti4  %r12, %r20, 123456
    cmp_lt_imm_u32 %r12, %r20, 123456	; cmp_ltui4 %r12, %r20, 123456
    cmp_ge_i32 %r12, %r20, %r21	; cmp_leq4  %r12, %r21, %r20
    cmp_ge_u32 %r12, %r20, %r21	; cmp_leu4  %r12, %r21, %r20
    cmp_gt_imm_i32 %r12, %r20, 123456	; cmp_gti4  %r12, %r20, 123456
    cmp_gt_imm_u32 %r12, %r20, 123456	; cmp_gtui4 %r12, %r20, 123456

; TESTS
    cmp_eq_i64 %r14, %r12, %r45
    cmp_ne_i64 %r14, %r12, %r45

    cmp_eq_i64 %r14, %r45, %r34
    cmp_eq_imm_i64 %r14, %r45, 123
    cmp_eq_imm_i64.l %r14, %r45, 1234567890123
    cmp_lt_imm_i64 %r14, %r45, 123
    cmp_lt_imm_i64.l %r14, %r45, 1234567890123
    cmp_le_imm_i64 %r14, %r45, 123
    cmp_le_imm_i64.l %r14, %r45, 1234567890123
    cmp_lt_i64 %r14, %r45, %r34
    cmp_gt_imm_u64 %r14, %r45, 123
    cmp_gt_imm_u64.l %r14, %r45, 1234567890123
    cmp_ge_imm_u64 %r14, %r45, 123
    cmp_ge_imm_u64.l %r14, %r45, 1234567890123
    cmp_gt_u64 %r14, %r45, %r34

    cmp_eq_i64 %r41, %r34, %r56
    cmp_lt_i64 %r66, %r45, %r57
    cmp_eq_imm_i64 %r64, %r56, 0
.end
.text
backward_target:
    alloc 61
    add_imm_i64 %r2, %r2, 1
    br_eq_i64 %r2, %r2, branch_test_exit

    br_eq_i64	%r23, %r34, backward_target
    br_eq_i64.l	%r23, %r34, backward_target
    br_eq_i64	%r23, %r34, forward_target
    br_eq_i64.l	%r23, %r34, forward_target
    br_eq_imm_i64	%r23, 34, backward_target
    br_eq_imm_i64.l	%r23, 34, backward_target
    br_eq_imm_i64	%r23, 34, forward_target
    br_eq_imm_i64.l	%r23, 34, forward_target

    br_eq_i32	%r23, %r34, backward_target
    br_eq_i32.l	%r23, %r34, backward_target
    br_eq_i32	%r23, %r34, forward_target
    br_eq_i32.l	%r23, %r34, forward_target
    br_eq_imm_i32	%r23, 34, backward_target
    br_eq_imm_i32.l	%r23, 34, backward_target
    br_eq_imm_i32	%r23, 34, forward_target
    br_eq_imm_i32.l	%r23, 34, forward_target

    br_ne_i64	%r23, %r34, backward_target
    br_ne_i64.l	%r23, %r34, backward_target
    br_ne_i64	%r23, %r34, forward_target
    br_ne_i64.l	%r23, %r34, forward_target
    br_ne_imm_i64	%r23, 34, backward_target
    br_ne_imm_i64.l	%r23, 34, backward_target
    br_ne_imm_i64	%r23, 34, forward_target
    br_ne_imm_i64.l	%r23, 34, forward_target

    br_ne_i32	%r23, %r34, backward_target
    br_ne_i32.l	%r23, %r34, backward_target
    br_ne_i32	%r23, %r34, forward_target
    br_ne_i32.l	%r23, %r34, forward_target
    br_ne_imm_i32	%r23, 34, backward_target
    br_ne_imm_i32.l	%r23, 34, backward_target
    br_ne_imm_i32	%r23, 34, forward_target
    br_ne_imm_i32.l	%r23, 34, forward_target

    br_le_i64	%r23, %r34, backward_target
    br_le_i64.l	%r23, %r34, backward_target
    br_le_i64	%r23, %r34, forward_target
    br_le_i64.l	%r23, %r34, forward_target
    br_le_imm_i64	%r23, 34, backward_target
    br_le_imm_i64.l	%r23, 34, backward_target
    br_le_imm_i64	%r23, 34, forward_target
    br_le_imm_i64.l	%r23, 34, forward_target

    br_le_i32	%r23, %r34, backward_target
    br_le_i32.l	%r23, %r34, backward_target
    br_le_i32	%r23, %r34, forward_target
    br_le_i32.l	%r23, %r34, forward_target
    br_le_imm_i32	%r23, 34, backward_target
    br_le_imm_i32.l	%r23, 34, backward_target
    br_le_imm_i32	%r23, 34, forward_target
    br_le_imm_i32.l	%r23, 34, forward_target

    br_lt_i64	%r23, %r34, backward_target
    br_lt_i64.l	%r23, %r34, backward_target
    br_lt_i64	%r23, %r34, forward_target
    br_lt_i64.l	%r23, %r34, forward_target
    br_lt_imm_i64	%r23, 34, backward_target
    br_lt_imm_i64.l	%r23, 34, backward_target
    br_lt_imm_i64	%r23, 34, forward_target
    br_lt_imm_i64.l	%r23, 34, forward_target

    br_lt_i32	%r23, %r34, backward_target
    br_lt_i32.l	%r23, %r34, backward_target
    br_lt_i32	%r23, %r34, forward_target
    br_lt_i32.l	%r23, %r34, forward_target
    br_lt_imm_i32	%r23, 34, backward_target
    br_lt_imm_i32.l	%r23, 34, backward_target
    br_lt_imm_i32	%r23, 34, forward_target
    br_lt_imm_i32.l	%r23, 34, forward_target

    br_ge_i64	%r23, %r34, backward_target
    br_ge_i64.l	%r23, %r34, backward_target
    br_ge_i64	%r23, %r34, forward_target
    br_ge_i64.l	%r23, %r34, forward_target
    br_ge_imm_u64	%r23, 34, backward_target
    br_ge_imm_u64.l	%r23, 34, backward_target
    br_ge_imm_u64	%r23, 34, forward_target
    br_ge_imm_u64.l	%r23, 34, forward_target

    br_ge_i32	%r23, %r34, backward_target
    br_ge_i32.l	%r23, %r34, backward_target
    br_ge_i32	%r23, %r34, forward_target
    br_ge_i32.l	%r23, %r34, forward_target
    br_ge_imm_u32	%r23, 34, backward_target
    br_ge_imm_u32.l	%r23, 34, backward_target
    br_ge_imm_u32	%r23, 34, forward_target
    br_ge_imm_u32.l	%r23, 34, forward_target

    br_gt_i64	%r23, %r34, backward_target
    br_gt_i64.l	%r23, %r34, backward_target
    br_gt_i64	%r23, %r34, forward_target
    br_gt_i64.l	%r23, %r34, forward_target
    br_gt_imm_i64	%r23, 34, backward_target
    br_gt_imm_i64.l	%r23, 34, backward_target
    br_gt_imm_i64	%r23, 34, forward_target
    br_gt_imm_i64.l	%r23, 34, forward_target

    br_gt_i32	%r23, %r34, backward_target
    br_gt_i32.l	%r23, %r34, backward_target
    br_gt_i32	%r23, %r34, forward_target
    br_gt_i32.l	%r23, %r34, forward_target
    br_gt_imm_i32	%r23, 34, backward_target
    br_gt_imm_i32.l	%r23, 34, backward_target
    br_gt_imm_i32	%r23, 34, forward_target
    br_gt_imm_i32.l	%r23, 34, forward_target

    br_le_u64	%r23, %r34, backward_target
    br_le_u64.l	%r23, %r34, backward_target
    br_le_u64	%r23, %r34, forward_target
    br_le_u64.l	%r23, %r34, forward_target
    br_le_imm_u64	%r23, 34, backward_target
    br_le_imm_u64.l	%r23, 34, backward_target
    br_le_imm_u64	%r23, 34, forward_target
    br_le_imm_u64.l	%r23, 34, forward_target

    br_le_u32	%r23, %r34, backward_target
    br_le_u32.l	%r23, %r34, backward_target
    br_le_u32	%r23, %r34, forward_target
    br_le_u32.l	%r23, %r34, forward_target
    br_le_imm_u32	%r23, 34, backward_target
    br_le_imm_u32.l	%r23, 34, backward_target
    br_le_imm_u32	%r23, 34, forward_target
    br_le_imm_u32.l	%r23, 34, forward_target

    br_lt_u64	%r23, %r34, backward_target
    br_lt_u64.l	%r23, %r34, backward_target
    br_lt_u64	%r23, %r34, forward_target
    br_lt_u64.l	%r23, %r34, forward_target
    br_lt_imm_u64	%r23, 34, backward_target
    br_lt_imm_u64.l	%r23, 34, backward_target
    br_lt_imm_u64	%r23, 34, forward_target
    br_lt_imm_u64.l	%r23, 34, forward_target

    br_lt_u32	%r23, %r34, backward_target
    br_lt_u32.l	%r23, %r34, backward_target
    br_lt_u32	%r23, %r34, forward_target
    br_lt_u32.l	%r23, %r34, forward_target
    br_lt_imm_u32	%r23, 34, backward_target
    br_lt_imm_u32.l	%r23, 34, backward_target
    br_lt_imm_u32	%r23, 34, forward_target
    br_lt_imm_u32.l	%r23, 34, forward_target

    br_ge_u64	%r23, %r34, backward_target
    br_ge_u64.l	%r23, %r34, backward_target
    br_ge_u64	%r23, %r34, forward_target
    br_ge_u64.l	%r23, %r34, forward_target
    br_ge_imm_u64	%r23,34, backward_target
    br_ge_imm_u64.l	%r23,34, backward_target
    br_ge_imm_u64	%r23,34, forward_target
    br_ge_imm_u64.l	%r23,34, forward_target

    br_ge_u32	%r23, %r34, backward_target
    br_ge_u32.l	%r23, %r34, backward_target
    br_ge_u32	%r23, %r34, forward_target
    br_ge_u32.l	%r23, %r34, forward_target
    br_ge_imm_u32	%r23,34, backward_target
    br_ge_imm_u32.l	%r23,34, backward_target
    br_ge_imm_u32	%r23,34, forward_target
    br_ge_imm_u32.l	%r23,34, forward_target

    br_gt_u64	%r23, %r34, backward_target
    br_gt_u64.l	%r23, %r34, backward_target
    br_gt_u64	%r23, %r34, forward_target
    br_gt_u64.l	%r23, %r34, forward_target
    br_gt_imm_u64	%r23, 34, backward_target
    br_gt_imm_u64.l	%r23, 34, backward_target
    br_gt_imm_u64	%r23, 34, forward_target
    br_gt_imm_u64.l	%r23, 34, forward_target

    br_gt_u32	%r23, %r34, backward_target
    br_gt_u32.l	%r23, %r34, backward_target
    br_gt_u32	%r23, %r34, forward_target
    br_gt_u32.l	%r23, %r34, forward_target
    br_gt_imm_u32	%r23, 34, backward_target
    br_gt_imm_u32.l	%r23, 34, backward_target
    br_gt_imm_u32	%r23, 34, forward_target
    br_gt_imm_u32.l	%r23, 34, forward_target

    br_mask_all	%r23, 34, backward_target
    br_mask_all.l	%r23, 34, backward_target
    br_mask_all	%r23, 34, forward_target
    br_mask_all.l	%r23, 34, forward_target

    br_mask_notall	%r23, 34, backward_target
    br_mask_notall.l	%r23, 34, backward_target
    br_mask_notall	%r23, 34, forward_target
    br_mask_notall.l	%r23, 34, forward_target

    br_mask_any   %r23, 34, backward_target
    br_mask_any.l %r23, 34, backward_target
    br_mask_any   %r23, 34, forward_target
    br_mask_any.l %r23, 34, forward_target

    br_mask_none   %r23, 34, backward_target
    br_mask_none.l %r23, 34, backward_target
    br_mask_none   %r23, 34, forward_target
    br_mask_none.l %r23, 34, forward_target

forward_target:
branch_test_exit:

    jmp branch_exit

label:
    br_eq_i64 %r12, %r13, qwe
    srp_imm_i64 %r10, %r11, %r12, 45
    dep_i128 %r61, %r91, %r32, 10
    mbsel %r62, %r91, %r32, %r10
    perm %r63, %r91, %r32, %r10
qwe:
    br_ne_i64 %r15, %r46, label
    br_eq_i64 %r25, %r45, label
    br_lt_i64 %r25, %r44, label
    br_le_i64 %r35, %r43, label
    br_gt_u64 %r35, %r42, label
    br_ge_u64 %r45, %r41, label
    br_gt_i64 %r45, %r40, label
    br_lt_u64 %r55, %r76, label
    br_ne_imm_i64 %r55, 140, label
    br_eq_imm_i64 %r65, 141, label
    br_lt_imm_i64 %r65, 142, label
    br_gt_imm_i64 %r75, 143, label
    br_lt_imm_u64 %r75, 170, label
    br_gt_imm_u64 %r85, 160, label

    add_imm_i64.l %r45, %r34, 1234
    br_bs_imm %r85, 26, label
    br_bc_imm.l %r85, 36, label
    br_bs_imm %r95, 46, label
    br_bc_imm.l %r95, 56, label

    jmp_r	%r45, %r23, 1
branch_exit:
    write	"end branch test"
.end
.text
    alloc 61
    write "Example of test bit and branch"
    ld_imm %r19, 0x20
    ld_imm %r20, 12+3
    write "%i64(r20)"
    ld_imm %r10, 0
    br_bc_imm %r10, 10, xxx_n
    ld_imm.l %r20, 123456789012345
    ld_imm %r21, 321
    add_i64 %r23, %r20, %r21
    write "%i64(r43)"
xxx_n: write "%i64(r23)"

    ld_imm %r46, 0xabcdef
    br_bc_imm %r46, 56, branch_bit_exit
    br_bs_imm %r46, 56, branch_bit_exit
    ld_imm %r56, 56
    br_bc %r46, %r56, branch_bit_exit
    br_bs %r46, %r56, branch_bit_exit

branch_bit_exit:
    write	"end branch_bit test"
.end
.text
    write "cpuid implemented number"
    alloc 96
    ld_imm %r13, 0
    cpuid %r14, %r13, 0
    write "cpuid len %x64(r14)"
    write "cpuid loop"
cpuid_loop:
    cpuid %r15, %r13, 0
    write "cpuid[%i64(r13)] = %x64(r15)"
    rep_lt_i64 %r13, %r14, cpuid_loop
.end
.rodata
    align 16
crc32c_test_string:
    ascii "The quick brown fox jumps over the lazy dog"
.text
    write "crc32c = 0x22620404 (expected)"
    alloc 20
    ld_imm %r12, -1  ; crc32c
    ld_imm %r15, 43 ; length
    mov %r14, %r15
    lda_iprel %r11, crc32c_test_string
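; Loop sketch: ld_mia_i128 post-increments r11 by 16 after each 16-byte fetch;
; crc32c is assumed to fold min(r14, 16) bytes of r13 into the running value
; in r12, with r14 holding the bytes still to process; the final xor_imm
; applies the conventional CRC output inversion.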
crc32c_loop:
    ld_mia_i128 %r13, %r11, 16
    crc32c %r12, %r12, %r13, %r14
    add_imm_i64 %r14, %r14, -16
    br_gt_i64 %r14, %gz, crc32c_loop
    xor_imm %r12, %r12, -1
    write "crc32c = 0x%x32(r12) (computed)"
.end
.text
    alloc 61
    lda_xi64 %r41, %r40, %r12, 4, 52
    lda_xi64 %r41, %r40, %r12, 3, -12
    lda_xi64 %r41, %r40, %r12, 4, 52
    ld_imm.l %r5, -1
    mov2 %r3, %r4, %r4, %r3
    mov2 %r3, %r4, %r4, %r3


.rodata	; open rodata (read-only data) section
    align 16
text_lbl: ; this is a label
    d1 111 ; signed byte
    d1 112
    d1 113
ddd:
    align 4 ; force 4-byte alignment for next data
    d1 6
    d1 7
    d1 8+0x3D ; you may use formulas!!!

.text
    write "test addressing"

; Examples of IP-relative references.
    ld_imm %r45, text_lo(text_lbl)
    write "text_lo(text_lbl)=%i64(r45)"
    ld_imm %r45, text_hi(text_lbl)
    write "text_hi(text_lbl)=%i64(r45)"
    ld_imm %r45, text_lbl
    write "%i64(r45)"

; Example of access to text section.
; First get IP-relative reference to text section (+/- 64 MB from IP).
    lda_r %r45, text_lbl

; Now r45 holds the base address.
; But it IS NOT the true address of 'text_lbl':
; r45 holds the nearest 16-byte-aligned address at or below 'text_lbl'.
; Remember to add the 'text_lo' part of the label address in each displacement calculation.
    ld_u8 %r50, %r45, text_lo(text_lbl)+0
    ld_u8 %r51, %r45, text_lo(text_lbl)+1
    ld_u8 %r52, %r45, text_lo(text_lbl)+2
    write "%i64(r50)" ; must be 111
    write "%i64(r51)" ; must be 112
    write "%i64(r52)" ; must be 113

; Example of incorrect access to the text section (omitting the text_lo displacement)
    ld_u8 %r50, %r45, 0
    write "%i64(r50)" ; start of the 16-byte portion (here text_lbl itself, so 111)
.end
.text
    alloc 96
    add_imm_i64 %r20, %gz, 128
    add_imm_i64 %sp, %sp, -32
    ld_imm.l %r12, 0x07060504030201
    st_i64 %r12, %sp, 0

.data
    ascii "data section marker"
    align 8
.rodata
    ascii "rodata section marker"
    align 8

.data
    d2 1234
first_byte:
    d1 12
.text
    lda_iprel %r22, first_byte

; test interval timer mask
    ld_imm %r22, 0xFFFFFFFFFFFFFFFF
    ld_imm %r15, 11

.rodata	; open rodata (read-only data) section
    align 8
text_begin: ; this is a label
    d8 1 ; signed 8-bytes
    d8 -2
    d1 101 ; signed byte
    d1 102
    d1 103
    align 4
    d4 10000 ; signed 4byte
    d2 10000 ; signed 2byte
    space 4 ; insert zeroed bytes
    d2 20000
.data	; open data (read-write) section
    align 8
eexxx: d8 12345678 ; signed 8-byte
    d8 1234567890
ssxxx: d8 123456789012
    d8 12345678901234
.rodata
    d4 4555 ; signed 4-byte
    d2 4555 ; signed 2-byte
    align 8
    d8 11
text2:
.text ; open code (read-execute) section

.data ; switch to data section
    d1 120
    align 2
    d2 13400
align 8
dataname:
    d4 654321890
    d4 654321890
    d8 1234545345345
    d8 6789023356977
align 8
someplaceindata:
    d8 0x0000000000000001
    d8 0x0000000000000002
    d8 0x0000000000000003
    d8 0x0000000000000004
    d8 0x0000000000000005
    d8 0x0000000000000006
    d8 0x0000000000000007
    d8 0x0000000000000008
.text
    lda_iprel %r11, someplaceindata
    ld_imm.l %r15, 987777777777
    ld_imm %r46, 100000
    st_i64 %r46, %r11, 8*3
    ld_u64 %r46, %r11, 8*3
    write "%i64(r46)"
    mul_i64 %r18, %r15, %r46
    add_i64 %r17, %r15, %r46
    andn %r17, %r15, %r46
    cmp_lt_i64 %r12, %r17, %r15
    write "%i64(r15) %i64(r46) %i64(r17)"
    add_imm_i64 %r17, %r17, 22
    write "%i64(r17) %i64(r17)"
    mfspr %r27, %itc
    write "itc: %x64(r27)"
    write "%m(dump)"
.end
.text
    ; at the beginning of the program, the register stack is empty
    alloc 54   ; expand frame to 54 registers
    lda_r %r4, dense_call_test_end
    mtspr %r4, %eip
    mtspr %r4, %reip
    ld_imm %r47, 1  ; will be saved when called
    ld_imm %r53, 3  ; first argument
    ld_imm %r52, 2  ; second argument
    ld_imm %r51, 1  ; third argument
    ; each call_r below saves all registers up to its link register;
    ; the return address, eip, and frame size are packed into that link register
check_label:
    call_r %r48, simple_func_1
    call_r %r50, simple_func_2
    call_r %r52, simple_func_3

    jmp dense_call_test_end

simple_func_1:
    alloc  10
    write  "simple_func_1"
    ret

simple_func_2:
    alloc  10
    write  "simple_func_2"
    ret

simple_func_3:
    alloc  10
    write  "simple_func_3"
    ret

dense_call_test_end:
    nop 123
    nop 123
    nop 123
    nop 123
    nop 123
    nop 123
.end
.text
    write "test bit-field insert (deposit)"
    alloc 96
    ld_imm.l %r30, 0xaaaaaaaaaaaaaaaa
    ld_imm.l %r40, 0xeeeeeeeeeeeeeeee
    dep %r20, %r30, %r40, 40, 24
    write "dep: %x64(r20)"
    dep %r20, %r40, %r30, 40, 24
    write "dep: %x64(r20)"

    write "test vector deposit (dep16)"
    nor %r3, %r4, %r4
    dep_i128 %r5, %r3, %r4, 100
    write "dep16: %x128(r5)"
    write "end deposit test"
.end

.text
    write "test control device memory-mapped registers"
    alloc 96

    ; device_control base address
    ld_imm.l %r24, DEVICE_CONFIG_VIRT_BASE

    write "test pci"

    ld_imm.l %r21, 0x1234567890abcdef

    ld_u64 %r20, %r24, DEVICE_CONTROL_DID
    write "mem[DEVICE_CONTROL_DID] %x64(r20)"
    st_i64 %r21, %r24, DEVICE_CONTROL_DID
    ld_u64 %r20, %r24, DEVICE_CONTROL_DID
    write "mem[DEVICE_CONTROL_DID] %x64(r20)"

    ld_u64 %r20, %r24, DEVICE_CONTROL_CMD
    write "mem[DEVICE_CONTROL_CMD] %x64(r20)"
    st_i64 %r21, %r24, DEVICE_CONTROL_CMD
    ld_u64 %r20, %r24, DEVICE_CONTROL_CMD
    write "mem[DEVICE_CONTROL_CMD] %x64(r20)"

    ld_u64 %r20, %r24, DEVICE_CONTROL_ARRAY_ADDRESS
    write "mem[DEVICE_CONTROL_ARRAY_ADDRESS] (r20)"

    ld_u64 %r20, %r24, DEVICE_CONTROL_ARRAY_LEN
    write "mem[DEVICE_CONTROL_ARRAY_LEN] %i64(r20)"

    ld_imm  %r22, 10 ; '\n'

    write "test command"
    ld_imm.l %r21, 0xabcdef1234567890
    st_i64 %r21, %r24, DEVICE_CONTROL_CMD

    write "end_device_control_test"
.end
.text
    write "test core mapping DEVICE_CONFIG_VIRT_BASE"
    alloc 96
    ld_imm.l %r20, DEVICE_CONFIG_VIRT_BASE
    write "DEVICE_CONFIG_VIRT_BASE: %x64(r20)"
    ld_imm.l %r20, DEVICE_CONFIG_SPACE_SIZE
    write "DEVICE_CONFIG_SPACE_SIZE: %x64(r20)"
    ld_imm.l %r20, CONFIG_OFFSET_CORE_0
    write "CONFIG_OFFSET_CORE_0: %x64(r20)"
    ld_imm.l %r20, DEVICE_CORE_TIMECMP
    write "DEVICE_CORE_TIMECMP: %x64(r20)"

    ld_imm.l %r20, DEVICE_CONFIG_VIRT_BASE + CONFIG_OFFSET_CORE_0 * DEVICE_CONFIG_SPACE_SIZE ; core config
    ld_imm %r19, 0xabcdef

    write "test interrupt vector %x64(r20)"
    st_i64 %r19, %r20, DEVICE_CORE_TIMECMP ; use DEVICE_CORE_INTERRUPT_VECTOR in place of DEVICE_CORE_TIMECMP for real interrupt

    write "test timecmp"
    st_i64 %r19, %r20, DEVICE_CORE_TIMECMP

    write "test rom mapping ROM_VIRT_BASE"
    ld_imm.l %r20, ROM_VIRT_BASE
    ld_u64 %r19, %r20, 0
    write "mem[ROM_VIRT_BASE] %x64(r19)"

    write "test video commands VIDEO_COMMAND_VIRT_BASE"
    ld_imm.l %r20, VIDEO_COMMAND_VIRT_BASE
    ld_imm %r21, 0x1234
    st_i32 %r21, %r20, 0x88 ; clear
    st_i32 %r21, %r20, 0x8c ; redraw

    write "video width/height base: %x64(r20)"
    ld_u32 %r21, %r20, 0x80 ; width
    ld_u32 %r22, %r20, 0x84 ; height
    write "width=%i64(r21) heigth=%i64(r22)"

    write "test video memory VIDEO_VIRT_BASE"
    ld_imm.l %r20, VIDEO_VIRT_BASE
    write "r20     %x64(r20)"

    ld_imm.l %r25, 0x12345678
    st_i32 %r25, %r20, 0

    ld_imm %r24, 0   ; y
loop_y: (64)
; write "%i64(r24)"
    ld_imm %r23, 0   ; x
loop_x:
; add %r25, %r23, %r24
    st_i8 %r25, %r20, 0
    add_imm_i64 %r20, %r20, 1
    add_imm_i64 %r23, %r23, 1
    br_lt_i64 %r23, %r21, loop_x

    add_imm_i64 %r24, %r24, 1
    br_lt_i64 %r24, %r22, loop_y
    ; debug
    write "end test video memory"
    nop 1234567
.end
.text
    write "begin exception test"
    alloc 96

    lda_iprel %r2, catch
    mtspr %r2, %eip
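; Unwind model exercised below, as the inline comments suggest: %eip holds the
; current handler address; eh_adj re-points it at a cleanup label; eh_throw
; transfers to %eip with %r3 as the exception context (eca); eh_catch captures
; that context in its register operand; and eh_next forwards the in-flight
; exception to the next cleanup in the chain.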

; constructor 1
    ld_imm %r4, 1
    eh_adj call_destructor_1
    write "eip: %s(eip)"
; constructor 2
    ld_imm %r5, 2
    eh_adj call_destructor_2
    write "eip: %s(eip)"

    ld_imm %r3, 0xFFFFFFFFFFFF1230
    eh_throw %r3, 0    ; set eca, jump to eip
    write "normal execution (never occurs)"

call_destructor_2:
    write "call_destructor_2"
    eh_catch %r6, end_destructor_2
    ; here dtor called
    ld_imm %r4, 0
end_destructor_2:
    eh_next %r6, call_destructor_1
    write "normal continue after destructor_2"

call_destructor_1:
    write "call_destructor_1"
    eh_catch %r6, end_destructor_1
    ; here dtor called
    ld_imm %r5, 0
end_destructor_1:
    eh_next %r6, catch
    write "normal continue after destructor_1"

call_ret:
    write "normal exit"
    jmp exception_exit

catch:
    write "caught exception, exit"
    eh_catch %r12, exception_exit
    write "caught exception context: r12=%x64(r12)"
exception_exit:
    nop 1234567
    nop 7654321
.end
.text
; floating-point extension example
    alloc	96

    write	"test float128 immediate load (low/high parts)"
    ld_iprel_f128 %r12, 3.1415926115461431423612436243
    write	"fldqri: %f128(r12)"

    write	"test fpcr modification (rm=3)"
    ld_imm		%r2, 3
    mtspr	%r2, %fpcr
    write	"fpcr: %s(fpcr)"
    write	"test fpcr modification (rm=2)"
    ld_imm		%r2, 2
    mtspr	%r2, %fpcr
    write	"fpcr: %s(fpcr)"
    write	"test fpcr modification (rm=1)"
    ld_imm		%r2, 1
    mtspr	%r2, %fpcr
    write	"fpcr: %s(fpcr)"
    write	"test fpcr modification (rm=0)"
    ld_imm		%r2, 0
    mtspr	%r2, %fpcr
    write	"fpcr: %s(fpcr)"

    write	"compare fldqri (full mantissa) & long fldi (63-bit mantissa)"
    ld_iprel_f128	%r30, 3.14159265358979323846123456789012e+400
    write	"fldqri: %x128(r30) %f128(r30)"
    ld_imm_f64	%r31, 3.14159265358979323846123456789012
    write	"flddi: %x128(r31) %f64(r31)"
    write	"compare fldqri (full mantissa) & short fldi (21-bit mantissa)"
    ld_iprel_f128	%r30, 3.14159265358979323846123456789012
    write	"r30     %x128(r30)"
    ld_imm_f64	%r31, 3.14159265358979323846123456789012
    write	"r31     %x128(r31)"
    write	"before1"
    write	"r30     %f128(r30)"
    write	"before2"
    write	"r31     %vf64(r31)"
    write	"after"
    ld_imm_f64	%r30, -12.3456789e+04
.rodata
    align 16
float64data:
    double 1.234567890123456789124141241241
    double 3.1415925678888734535345231234564561
    double 3.4566345634563456346535463463456
.text
    lda_r %r21, float64data
    ld_u64	%r11, %r21, 8*0
    ld_u64	%r12, %r21, 8*1
    ld_u64	%r13, %r21, 8*2
    write	"ld8(f64): %f64(r11) %f64(r12) %f64(r13)"
    ld_iprel_f128	%r14, 2.7182818289201
    write	"fldqri: %f128(r14)"

    extend_f64_f128 %r11, %r11
    extend_f64_f128 %r12, %r12
    extend_f64_f128 %r13, %r13

    write	"test binary"
    mul_f128	%r15, %r11, %r14
    write	"fmulsq:  %f128(r15)"
    nmul_f128	%r15, %r11, %r14
    write	"fnmulsq: %f128(r15)"
    add_f128	%r15, %r11, %r14
    write	"faddsq:  %f128(r15)"
    nadd_f128	%r15, %r11, %r14
    write	"fnaddsq: %f128(r15)"
    sub_f128	%r15, %r14, %r11
    write	"fsubsq:  %f128(r15)"
    div_f128	%r15, %r14, %r11
    write	"fdivsq:  %f128(r15)"

    write	"test fused fma"
;   jmp	skipfma
    madd_f128	%r15, %r14, %r11, %r12
    write	"fmaddsq:  %f128(r15)"
    nmadd_f128 %r15, %r14, %r11, %r12
    write	"fnmaddsq: %f128(r15)"
    msub_f128	%r15, %r14, %r11, %r12
    write	"fmsubsq:  %f128(r15)"
    nmsub_f128 %r15, %r14, %r11, %r12
    write	"fnmsubsq: %f128(r15)"

    write	"test unary"
    mov		%r16, %r15
    write	"r16     %f128(r16)"
    abs_f128	%r16, %r15
    write	"r16     %f128(r16)"
    neg_f128	%r16, %r15
    write	"r16     %f128(r16)"
    nabs_f128	%r16, %r15
    write	"r16     %f128(r16)"
    sqrt_f128	%r16, %r12
    write	"r16     %f128(r16)"
    rsqrt_f128	%r16, %r12
    write	"r16     %f128(r16)"

    write	"test rounding"
    trunk_f128 %r17, %r12
    write "r17     %f128(r17)"
    floor_f128 %r17, %r12
    write "r17     %f128(r17)"
    ceil_f128 %r17, %r12
    write "r17     %f128(r17)"
    roundeven_f128 %r17, %r12
    write "r17     %f128(r17)"
    trunk_f128_i32 %r17, %r12
    write "r17     %i64(r17)"
    ld_imm %r17, 123456
    cvt_i32_f128 %r17, %r17
    write "r17     %f128(r17)"

    write "test fp minmax"
    max_f128 %r8, %r11, %r12
    write "r8      %f128(r8)"
    min_f128 %r8, %r11, %r12
    write "r8      %f128(r8)"
    write "test fp abs minmax"
    abs_max_f128 %r8, %r11, %r12
    write "r8      %f128(r8)"
    abs_min_f128 %r8, %r11, %r12
    write "r8      %f128(r8)"

    write "test fmergesq"
    merge_f128 %r8, %r11, %r12, %r14
    write "r8      %f128(r8)"
    merge_f128 %r8, %r14, %r11, %r12
    write "r8      %f128(r8)"


.rodata
    align 16
xxxd: double 1.122
    double 0.9999765432
.text
    lda_r %r21, xxxd
    ld_imm		%r15, 100
    ld_u64	%r25, %r21, 8*0
    ld_u64	%r26, %r21, 8*1
    sub_f128	%r22, %r25, %r16
    write	"r22     %f128(r22)"
xxloop:
    madd_f128	%r22, %r25, %r16, %r22
    msub_f128	%r22, %r25, %r16, %r22
    rep_ge_i64 %r15, %gz, xxloop
    write "r22     %f128(r22)"

    write "other FPU"
    madd_f128  %r60, %r61, %r62, %r63
    msub_f128  %r61, %r61, %r72, %r73
    nmadd_f128 %r62, %r71, %r82, %r63
    nmsub_f128 %r63, %r81, %r12, %r53

    mul_f128	%r64, %r61, %r22
    div_f128	%r65, %r11, %r27
    add_f128	%r66, %r17, %r42
    sub_f128	%r67, %r31, %r23
    nadd_f128	%r68, %r41, %r62
    max_f128 %r60, %r61, %r62
    min_f128 %r60, %r61, %r62
    abs_max_f128 %r60, %r61, %r62
    abs_min_f128 %r60, %r61, %r62

    cmp_olt_f128	%r10, %r61, %r72
    cmp_ole_f128	%r11, %r52, %r21
    cmp_ole_f128	%r12, %r43, %r12
    cmp_oeq_f128	%r10, %r34, %r44
    cmp_ueq_f128	%r13, %r25, %r22
    cmp_ule_f128	%r12, %r15, %r23
    cmp_u_f128	%r11, %r86, %r86

    neg_f128 %r24, %r58
    abs_diff_f128 %r45, %r61, %r20
    nabs_diff_f128 %r56, %r32, %r20
    round_f128 %r78, %r74
    trunk_f128 %r89, %r65
    floor_f128 %r81, %r76
    ceil_f128 %r62, %r67
    sqrt_f128 %r63, %r78
    rsqrt_f128 %r64, %r69

    add_imm_i64 %r45, %sp, -4800
    ld_imm %r13, 2

    ld_u32 %r12, %r45, 4*1
    st_i32 %r12, %r45, 4*1
    ld_u64 %r12, %r45, 8*3
    st_i64 %r12, %r45, 8*3
    ld_xi64_u32 %r12, %r45, %r13, 2, 60
    st_xi64_i32 %r12, %r45, %r13, 2, 60
    ld_xi64_u64 %r12, %r45, %r13, 3, 60
    st_xi64_i64 %r12, %r45, %r13, 3, 60

    add_f128	%r23, %r24, %r25
    madd_f128	%r23, %r60, %r55, %r33
    mul_f128	%r23, %r60, %r55
    ld_u64	%r60, %r45, 8*6
    madd_f128	%r23, %r60, %r55, %r33
    madd_f128	%r24, %r61, %r25, %r32
    madd_f128	%r25, %r62, %r55, %r23
    madd_f128	%r26, %r63, %r75, %r73
    madd_f128	%r27, %r64, %r75, %r73
    madd_f128	%r28, %r65, %r85, %r63
    madd_f128	%r29, %r66, %r85, %r63
    madd_f128	%r30, %r67, %r55, %r23
    madd_f128	%r31, %r68, %r55, %r23
    madd_f128	%r12, %r32, %r76, %r85
    madd_f128	%r12, %r32, %r76, %r85
    madd_f128	%r10, %r32, %r76, %r85
    madd_f128	%r10, %r32, %r76, %r85
    madd_f128	%r10, %r32, %r76, %r85
    madd_f128	%r13, %r32, %r76, %r85
    madd_f128	%r14, %r32, %r76, %r85
    madd_f128	%r15, %r32, %r76, %r85
    madd_f128	%r16, %r32, %r76, %r85
    madd_f128	%r17, %r32, %r76, %r85

    trunk_f128_i32 %r56, %r45
    trunk_f128_u32 %r56, %r45
    cvt_i32_f128 %r45, %r56
    cvt_u32_f128 %r45, %r56

    ld_imm		%r5, 0
    ld_iprel_f128	%r4, 1.0
    ld_iprel_f128	%r5, 1.0
    ld_iprel_f128	%r6, 1.0
    ld_iprel_f128	%r7, 1.0
    ld_imm		%r24, 128
tri_repeat:
    write	"r7      %x128(r7)"
    add_f128	%r5, %r5, %r4
    mul_f128	%r6, %r6, %r5
    div_f128	%r7, %r4, %r6
;   write "%x128(r6)"
    rep_le_i64.l %r5, %r24, tri_repeat

    write	"test taylor series"
    ld_iprel_f128	%r2, 0.44567	; r2 = x
    write	"x:   %f128(r2)"		; test value
    write	"test sin(x)"
    ld_iprel_f128	%r5, sin(0.44567)
    write	"sin: %f128(r5)"		; test value
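; A minimal reading of the series code: the madd/msub chain is Horner's rule
; on x*x with sign-alternating 1/k! coefficients assumed to sit in r25, r23,
; ..., r1 (they are never loaded with those constants here, so the printed
; value is not a true sine; the block mainly exercises the fused chain), and
; the final mul_f128 multiplies the polynomial by x:
;   sin x = x*(1 - x^2/3! + x^4/5! - ...)
; The cos block below follows the same scheme without the final multiply by x.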
    ld_imm		%r3, 0		; s = 0
    mul_f128	%r4, %r2, %r2	; r4 = x*x
    madd_f128	%r3, %r3, %r4, %r25	; s = s*x*x + 1/25!
    msub_f128	%r3, %r3, %r4, %r23	; s = s*x*x - 1/23!
    madd_f128	%r3, %r3, %r4, %r21
    msub_f128	%r3, %r3, %r4, %r19
    madd_f128	%r3, %r3, %r4, %r17
    msub_f128	%r3, %r3, %r4, %r15
    madd_f128	%r3, %r3, %r4, %r13
    msub_f128	%r3, %r3, %r4, %r11
    madd_f128	%r3, %r3, %r4, %r9
    msub_f128	%r3, %r3, %r4, %r7
    madd_f128	%r3, %r3, %r4, %r5
    msub_f128	%r3, %r3, %r4, %r3
    madd_f128	%r3, %r3, %r4, %r1
    mul_f128	%r3, %r3, %r2	; s = s * x
    write	"sin: %f128(r3)"

    write	"test cos(x)"
    ld_iprel_f128	%r5, cos(0.44567)
    write	"cos: %f128(r5)"		; test value
    ld_imm		%r3, 0		; s ,  0
    mul_f128	%r4, %r2, %r2	; f4 ,  x*x
    msub_f128	%r3, %r3, %r4, %r26
    madd_f128	%r3, %r3, %r4, %r24
    msub_f128	%r3, %r3, %r4, %r22
    madd_f128	%r3, %r3, %r4, %r20
    msub_f128	%r3, %r3, %r4, %r18
    madd_f128	%r3, %r3, %r4, %r16
    msub_f128	%r3, %r3, %r4, %r14
    madd_f128	%r3, %r3, %r4, %r12
    msub_f128	%r3, %r3, %r4, %r10
    madd_f128	%r3, %r3, %r4, %r8
    msub_f128	%r3, %r3, %r4, %r6
    madd_f128	%r3, %r3, %r4, %r4
    msub_f128	%r3, %r3, %r4, %r2
    madd_f128	%r3, %r3, %r4, %r1
    write	"cos: %f128(r3)"

    write "test exp(x)"
    ld_iprel_f128 %r5, exp(0.44567)
    write "exp: %f128(r5)" ; test value
    ld_imm %r3, 0 ; s ,  0.0
    mov %r4, %r2 ; f4 ,  x
    ld_imm_f64	%r6, 0.125
;   write	"%f128(r6)"
    mul_f128	%r4, %r4, %r6 ; x ,  x/8
    madd_f128	%r3, %r3, %r4, %r15
    madd_f128	%r3, %r3, %r4, %r14
    madd_f128	%r3, %r3, %r4, %r13
    madd_f128	%r3, %r3, %r4, %r12
    madd_f128	%r3, %r3, %r4, %r11
    madd_f128	%r3, %r3, %r4, %r10
    madd_f128	%r3, %r3, %r4, %r9
    madd_f128	%r3, %r3, %r4, %r8
    madd_f128	%r3, %r3, %r4, %r7
    madd_f128	%r3, %r3, %r4, %r6
    madd_f128	%r3, %r3, %r4, %r5
    madd_f128	%r3, %r3, %r4, %r4
    madd_f128	%r3, %r3, %r4, %r3
    madd_f128	%r3, %r3, %r4, %r2
    madd_f128	%r3, %r3, %r4, %r1
    madd_f128	%r3, %r3, %r4, %r1
    mul_f128	%r3, %r3, %r3	; (e^x) ^ 8
    mul_f128	%r3, %r3, %r3
    mul_f128	%r3, %r3, %r3
    write	"exp: %f128(r3)"

    add_f128	%r1, %r2, %r3
    madd_f128	%r2, %r10, %r20, %r30
    madd_f128	%r1, %r11, %r21, %r31

    ; classification
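    ; cl_* appears to test the operand's floating-point class against the
    ; 8-bit class mask in the immediate (compare the br_class_* branches below).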
    cl_f32	%r4, %r5, 120
    cl_f64	%r4, %r5, 120
    cl_f128	%r4, %r5, 120
    jmp		skipfma

fpu_backward_target:
; single branches
    br_oeq_f32	%r23, %r34, fpu_backward_target
    br_oeq_f32.l	%r23, %r34, fpu_backward_target
    br_oeq_f32	%r23, %r34, fpu_forward_target
    br_oeq_f32.l	%r23, %r34, fpu_forward_target

    br_ueq_f32	%r23, %r34, fpu_backward_target
    br_ueq_f32.l	%r23, %r34, fpu_backward_target
    br_ueq_f32	%r23, %r34, fpu_forward_target
    br_ueq_f32.l	%r23, %r34, fpu_forward_target

    br_one_f32	%r23, %r34, fpu_backward_target
    br_one_f32.l	%r23, %r34, fpu_backward_target
    br_one_f32	%r23, %r34, fpu_forward_target
    br_one_f32.l	%r23, %r34, fpu_forward_target

    br_une_f32	%r23, %r34, fpu_backward_target
    br_une_f32.l	%r23, %r34, fpu_backward_target
    br_une_f32	%r23, %r34, fpu_forward_target
    br_une_f32.l	%r23, %r34, fpu_forward_target

    br_olt_f32	%r23, %r34, fpu_backward_target
    br_olt_f32.l	%r23, %r34, fpu_backward_target
    br_olt_f32	%r23, %r34, fpu_forward_target
    br_olt_f32.l	%r23, %r34, fpu_forward_target

    br_ult_f32	%r23, %r34, fpu_backward_target
    br_ult_f32.l	%r23, %r34, fpu_backward_target
    br_ult_f32	%r23, %r34, fpu_forward_target
    br_ult_f32.l	%r23, %r34, fpu_forward_target

    br_ole_f32	%r23, %r34, fpu_backward_target
    br_ole_f32.l	%r23, %r34, fpu_backward_target
    br_ole_f32	%r23, %r34, fpu_forward_target
    br_ole_f32.l	%r23, %r34, fpu_forward_target

    br_ule_f32	%r23, %r34, fpu_backward_target
    br_ule_f32.l	%r23, %r34, fpu_backward_target
    br_ule_f32	%r23, %r34, fpu_forward_target
    br_ule_f32.l	%r23, %r34, fpu_forward_target

    br_o_f32	%r23, %r34, fpu_backward_target
    br_o_f32.l	%r23, %r34, fpu_backward_target
    br_o_f32	%r23, %r34, fpu_forward_target
    br_o_f32.l	%r23, %r34, fpu_forward_target

    br_u_f32	%r23, %r34, fpu_backward_target
    br_u_f32.l	%r23, %r34, fpu_backward_target
    br_u_f32	%r23, %r34, fpu_forward_target
    br_u_f32.l	%r23, %r34, fpu_forward_target

    br_class_f32   %r23, 34, fpu_backward_target
    br_class_f32.l %r23, 34, fpu_backward_target
    br_class_f32   %r23, 34, fpu_forward_target
    br_class_f32.l %r23, 34, fpu_forward_target

; double branches
    br_oeq_f64	%r23, %r34, fpu_backward_target
    br_oeq_f64.l	%r23, %r34, fpu_backward_target
    br_oeq_f64	%r23, %r34, fpu_forward_target
    br_oeq_f64.l	%r23, %r34, fpu_forward_target

    br_ueq_f64	%r23, %r34, fpu_backward_target
    br_ueq_f64.l	%r23, %r34, fpu_backward_target
    br_ueq_f64	%r23, %r34, fpu_forward_target
    br_ueq_f64.l	%r23, %r34, fpu_forward_target

    br_one_f64	%r23, %r34, fpu_backward_target
    br_one_f64.l	%r23, %r34, fpu_backward_target
    br_one_f64	%r23, %r34, fpu_forward_target
    br_one_f64.l	%r23, %r34, fpu_forward_target

    br_une_f64	%r23, %r34, fpu_backward_target
    br_une_f64.l	%r23, %r34, fpu_backward_target
    br_une_f64	%r23, %r34, fpu_forward_target
    br_une_f64.l	%r23, %r34, fpu_forward_target

    br_olt_f64	%r23, %r34, fpu_backward_target
    br_olt_f64.l	%r23, %r34, fpu_backward_target
    br_olt_f64	%r23, %r34, fpu_forward_target
    br_olt_f64.l	%r23, %r34, fpu_forward_target

    br_ult_f64	%r23, %r34, fpu_backward_target
    br_ult_f64.l	%r23, %r34, fpu_backward_target
    br_ult_f64	%r23, %r34, fpu_forward_target
    br_ult_f64.l	%r23, %r34, fpu_forward_target

    br_ole_f64	%r23, %r34, fpu_backward_target
    br_ole_f64.l	%r23, %r34, fpu_backward_target
    br_ole_f64	%r23, %r34, fpu_forward_target
    br_ole_f64.l	%r23, %r34, fpu_forward_target

    br_ule_f64	%r23, %r34, fpu_backward_target
    br_ule_f64.l	%r23, %r34, fpu_backward_target
    br_ule_f64	%r23, %r34, fpu_forward_target
    br_ule_f64.l	%r23, %r34, fpu_forward_target

    br_o_f64	%r23, %r34, fpu_backward_target
    br_o_f64.l	%r23, %r34, fpu_backward_target
    br_o_f64	%r23, %r34, fpu_forward_target
    br_o_f64.l	%r23, %r34, fpu_forward_target

    br_u_f64	%r23, %r34, fpu_backward_target
    br_u_f64.l	%r23, %r34, fpu_backward_target
    br_u_f64	%r23, %r34, fpu_forward_target
    br_u_f64.l	%r23, %r34, fpu_forward_target

    br_class_f64	%r23, 34, fpu_backward_target
    br_class_f64.l	%r23, 34, fpu_backward_target
    br_class_f64	%r23, 34, fpu_forward_target
    br_class_f64.l	%r23, 34, fpu_forward_target

; quadruple branches
    br_oeq_f128	%r23, %r34, fpu_backward_target
    br_oeq_f128.l	%r23, %r34, fpu_backward_target
    br_oeq_f128	%r23, %r34, fpu_forward_target
    br_oeq_f128.l	%r23, %r34, fpu_forward_target

    br_ueq_f128	%r23, %r34, fpu_backward_target
    br_ueq_f128.l	%r23, %r34, fpu_backward_target
    br_ueq_f128	%r23, %r34, fpu_forward_target
    br_ueq_f128.l	%r23, %r34, fpu_forward_target

    br_one_f128	%r23, %r34, fpu_backward_target
    br_one_f128.l	%r23, %r34, fpu_backward_target
    br_one_f128	%r23, %r34, fpu_forward_target
    br_one_f128.l	%r23, %r34, fpu_forward_target

    br_une_f128	%r23, %r34, fpu_backward_target
    br_une_f128.l	%r23, %r34, fpu_backward_target
    br_une_f128	%r23, %r34, fpu_forward_target
    br_une_f128.l	%r23, %r34, fpu_forward_target

    br_olt_f128	%r23, %r34, fpu_backward_target
    br_olt_f128.l	%r23, %r34, fpu_backward_target
    br_olt_f128	%r23, %r34, fpu_forward_target
    br_olt_f128.l	%r23, %r34, fpu_forward_target

    br_ult_f128	%r23, %r34, fpu_backward_target
    br_ult_f128.l	%r23, %r34, fpu_backward_target
    br_ult_f128	%r23, %r34, fpu_forward_target
    br_ult_f128.l	%r23, %r34, fpu_forward_target

    br_ole_f128	%r23, %r34, fpu_backward_target
    br_ole_f128.l	%r23, %r34, fpu_backward_target
    br_ole_f128	%r23, %r34, fpu_forward_target
    br_ole_f128.l	%r23, %r34, fpu_forward_target

    br_ule_f128	%r23, %r34, fpu_backward_target
    br_ule_f128.l	%r23, %r34, fpu_backward_target
    br_ule_f128	%r23, %r34, fpu_forward_target
    br_ule_f128.l	%r23, %r34, fpu_forward_target

    br_o_f128	%r23, %r34, fpu_backward_target
    br_o_f128.l	%r23, %r34, fpu_backward_target
    br_o_f128	%r23, %r34, fpu_forward_target
    br_o_f128.l	%r23, %r34, fpu_forward_target

    br_u_f128    %r23, %r34, fpu_backward_target
    br_u_f128.l  %r23, %r34, fpu_backward_target
    br_u_f128    %r23, %r34, fpu_forward_target
    br_u_f128.l  %r23, %r34, fpu_forward_target

    br_class_f128    %r23, 34, fpu_backward_target
    br_class_f128.l  %r23, 34, fpu_backward_target
    br_class_f128    %r23, 34, fpu_forward_target
    br_class_f128.l  %r23, 34, fpu_forward_target

fpu_forward_target:

    nul_une_f32  %r23, %r34, 1, 1
    nul_une_f64  %r23, %r34, 1, 1
    nul_une_f128 %r23, %r34, 1, 1

    nul_one_f32  %r23, %r34, 1, 1
    nul_one_f64  %r23, %r34, 1, 1
    nul_one_f128 %r23, %r34, 1, 1

    nul_ueq_f32  %r23, %r34, 1, 1
    nul_ueq_f64  %r23, %r34, 1, 1
    nul_ueq_f128 %r23, %r34, 1, 1

    nul_oeq_f32  %r23, %r34, 1, 1
    nul_oeq_f64  %r23, %r34, 1, 1
    nul_oeq_f128 %r23, %r34, 1, 1

    nul_class_f32  %r23, 94, 1, 1
    nul_class_f64  %r23, 94, 1, 1
    nul_class_f128 %r23, 94, 1, 1
skipfma:
    write	"end fpu"
.end
.text
    alloc 96
    write "test base addressing with indexed post-update"
    ld_imm %r12, 1
    add_imm_i64 %r45, %sp, -512

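; the _mia forms (apparently "modify immediate after", matching the
; post-update title above) access memory at base %r45 and then advance the
; base register by the immediate step.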
    ld_mia_u8  %r23, %r45, 2
    ld_mia_u16  %r23, %r45, 2
    ld_mia_u32  %r23, %r45, 4
    ld_mia_u64  %r23, %r45, 8
    ld_mia_i128 %r23, %r45, 16

    ld_mia_i8  %r23, %r45, 2
    ld_mia_i16 %r23, %r45, 2
    ld_mia_i32 %r23, %r45, 4
    ld_mia_i64 %r23, %r45, 8

    st_mia_i8   %r23, %r45, 2
    st_mia_i16  %r23, %r45, 2
    st_mia_i32  %r23, %r45, 4
    st_mia_i64  %r23, %r45, 8
    st_mia_i128 %r23, %r45, 16
    write "end_indexed_modify_test"
.end
.rodata
rodata1:
    d1 123
    align 2
rodata2:
    d2 12345
    align 4
rodata4:
    d4 123456789
    align 8
rodata8:
    d8 1234567890123456789

.data
data1:
    d1 123
    align 2
data2:
    d2 12345
    align 4
data4:
    d4 123456789
    align 8
data8:
    d8 1234567890123456789

.text
    alloc 96

    write "test ip-relative data addressing"
    ld_iprel_u8  %r34, rodata1
    ld_iprel_u16 %r34, rodata2
    ld_iprel_u32 %r34, rodata4
    ld_iprel_u64 %r34, rodata8

    ld_iprel_i8  %r34, rodata1
    ld_iprel_i16 %r34, rodata2
    ld_iprel_i32 %r34, rodata4
    ld_iprel_i64 %r34, rodata8

    ld_iprel_u8  %r34, data1
    ld_iprel_u16 %r34, data2
    ld_iprel_u32 %r34, data4
    ld_iprel_u64 %r34, data8

    ld_iprel_i8  %r34, data1
    ld_iprel_i16 %r34, data2
    ld_iprel_i32 %r34, data4
    ld_iprel_i64 %r34, data8

    st_iprel_i8  %r34, data1
    st_iprel_i16 %r34, data2
    st_iprel_i32 %r34, data4
    st_iprel_i64 %r34, data8

    write "end ip-relative data test"
.end
.text
    alloc 96
    write "test ca.rf"
    lda_iprel %r22, lda_rf_data
    write "ca.rf: %x64(r22)"

    write "end_ca_rf_test"
.data
lda_rf_data:

.end
.text
    alloc	96
    write	"check mbsel instruction"
    ld_imm.l	%r6, ((0x3333333333333333 ^ 0x5555555555555555) & 0xff00ff00ff00ff00) ^ 0x5555555555555555
    write	"mbsel: %x64(r6)"
    ld_imm.l	%r3, 0x3333333333333333
    ld_imm.l	%r4, 0x5555555555555555
    ld_imm.l	%r5, 0xff00ff00ff00ff00
    mbsel	%r6, %r3, %r4, %r5
    write	"mbsel: %x64(r6)"

    write	"end_mbsel_test"
.end
.text
    alloc	61
    write	"\ntest write: special register"
    write	"ip      %s(ip)"
    write	"eip     %s(eip)"
    write	"eca     %s(eca)"
    write	"fpcr    %s(fpcr)"
    write	"rsc     %s(rsc)"
    write	"rsp     %s(rsp)"
    write	"bsp     %s(bsp)"
    write	"peb     %s(peb)"
    write	"teb     %s(teb)"
    write	"itc     %s(itc)"
    write	"itm     %s(itm)"
    write	"psr     %s(psr)"
    write	"pta     %s(pta)"
    write	"iva     %s(iva)"
    write	"kip     %s(kip)"
    write	"ksp     %s(ksp)"
    write	"krsp    %s(krsp)"
    write	"iip     %s(iip)"
    write	"iipa    %s(iipa)"
    write	"ipsr    %s(ipsr)"
    write	"cause   %s(cause)"
    write	"ifa     %s(ifa)"
    write	"iib     %s(iib)"
    write	"tpr     %s(tpr)"
    write	"lid     %s(lid)"
    write	"irr0    %s(irr0)"
    write	"irr1    %s(irr1)"
    write	"irr2    %s(irr2)"
    write	"irr3    %s(irr3)"
    write	"isr0    %s(isr0)"
    write	"isr1    %s(isr1)"
    write	"isr2    %s(isr2)"
    write	"isr3    %s(isr3)"
    write	"tsv     %s(tsv)"
    write	"cmcv    %s(cmcv)"
    write	"pmv     %s(pmv)"

    write	"\ntest mfspr: read special register"

    mfspr	%r12, %ip
    write	"ip      %x64(r12)"

    mfspr	%r12, %eip
    write	"eip     %x64(r12)"

    mfspr	%r12, %eca
    write	"%x64(r12)"

    mfspr	%r12, %fpcr
    write	"%x64(r12)"

    mfspr	%r12, %rsc
    write	"%x64(r12)"

    mfspr	%r12, %rsp
    write	"%x64(r12)"

    mfspr	%r12, %bsp
    write	"%x64(r12)"

    mfspr	%r12, %peb
    write	"%x64(r12)"

    mfspr	%r12, %teb
    write	"%x64(r12)"

    mfspr	%r12, %itc
    write	"%x64(r12)"

    mfspr	%r12, %itm
    write	"%x64(r12)"

    mfspr	%r12, %psr
    write	"%x64(r12)"

    mfspr	%r12, %pta
    write	"%x64(r12)"

    mfspr	%r12, %iva
    write	"%x64(r12)"

    mfspr	%r12, %kip
    write	"%x64(r12)"

    mfspr	%r12, %ksp
    write	"%x64(r12)"

    mfspr	%r12, %krsp
    write	"krsp    %x64(r12)"

    mfspr	%r12, %iip
    write	"iip     %x64(r12)"

    mfspr	%r12, %iipa
    write	"iipa    %x64(r12)"

    mfspr	%r12, %ipsr
    write	"ipsr    %x64(r12)"

    mfspr	%r12, %cause
    write	"cause   %x64(r12)"

    write	"%s(ifa)"
    mfspr	%r12, %ifa
    write	"ifa     %x64(r12)"

    mfspr	%r12, %iib
    write	"iib     %x128(r12)"

    mfspr	%r12, %tpr
    write	"tpr     %x64(r12)"

    mfspr	%r12, %lid
    write	"lid     %x64(r12)"

    mfspr	%r12, %irr0
    write	"irr0    %x64(r12)"

    mfspr	%r12, %irr1
    write	"irr1    %x64(r12)"

    mfspr	%r12, %irr2
    write	"irr2    %x64(r12)"

    mfspr	%r12, %irr3
    write	"irr3    %x64(r12)"

    mfspr	%r12, %isr0
    write	"%x64(r12)"

    mfspr	%r12, %isr1
    write	"%x64(r12)"

    mfspr	%r12, %isr2
    write	"%x64(r12)"

    mfspr	%r12, %isr3
    write	"%x64(r12)"

    mfspr	%r12, %tsv
    write	"%x64(r12)"

    mfspr	%r12, %cmcv
    write	"%x64(r12)"

    mfspr	%r12, %pmv
    write	"%x64(r12)"

    write	"end test mfspr"
.end
.text
    alloc 69
    write "test min/max"
    min_i64 %r34, %r56, %r67
    min_u64 %r34, %r56, %r67
    max_i64 %r34, %r56, %r67
    max_u64 %r34, %r56, %r67

    min_imm_i64 %r34, %r56, 2671
    min_imm_u64 %r34, %r56, 2671
    max_imm_i64 %r34, %r56, 2671
    max_imm_u64 %r34, %r56, 2671
    write "test minmax end"

.end

.text
    write "test nullification (explicit masks)"
    alloc 96
    ld_imm %r10, 0
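; nul_eq_i64 compares and then nullifies instructions in its two shadows;
; the two immediates give the shadow lengths (5, then 4). Since %r10 == %r10
; holds, the first five writes below are suppressed and the four "else"
; writes execute, as the inline comments indicate.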
    nul_eq_i64 %r10, %r10, 5, 4
    write	"0" ; nullified
    write	"1" ; nullified
    write	"2" ; nullified
    write	"3" ; nullified
    write	"4" ; nullified
    write	"5" ; else
    write	"6" ; else
    write	"7" ; else
    write	"8" ; else

    write	"test nullification (predicate names)"
    ld_imm		%r10, 0
    nul_eq_i64 %r10, %r10, equal, nonequal
    write	"0"
    write	"1"
    write	"2"
    write	"3"
    write	"4" (equal)
    write	"5"
    write	"6"
    write	"7"
    write	"8" (nonequal)


    write "test nullification"
    ld_imm %r10, 0
    nul_eq_i64 %r10, %r10, 4, 3
    add_imm_i64 %r10, %r10, 2
    add_imm_i64 %r10, %r10, 2
    add_imm_i64 %r10, %r10, 2
    add_imm_i64 %r10, %r10, 1
    add_imm_i64 %r10, %r10, 1
    add_imm_i64 %r10, %r10, 1
    add_imm_i64 %r10, %r10, 1

    write "test nullification"
    ld_imm %r10, 0
    nul_eq_i64 %r10, %r10, true, false
    add_imm_i64 %r10, %r10, 2
    add_imm_i64 %r10, %r10, 2
    add_imm_i64 %r10, %r10, 2
    add_imm_i64 %r10, %r10, 2
    add_imm_i64 %r10, %r10, 1 (true)
    add_imm_i64 %r10, %r10, 1
    add_imm_i64 %r10, %r10, 1 (false)

    nop 0
    nop 0
    nul_eq_i64 %r12, %r10, 4, 3
    write	"branch1: psr=%s(psr)"
    write	"branch1: %i64(r10)"
    write	"branch1: %i64(r10)"
    write	"branch1: %i64(r10)"
    write	"branch2: psr=%s(psr)"
    write	"branch2: %i64(r20)"
    write	"branch2: %i64(r20)"


    nul_eq_i64 %r23, %r45, 0b1100, 0b0101
    nul_lt_i64 %r23, %r45, 0b1100, 0b0101
    nul_lt_u64 %r23, %r45, 0b1100, 0b0101

    nul_eq_imm_i64 %r23, 45, 0b1100, 0b0101
    nul_lt_imm_i64 %r23, -45, 0b1100, 0b0101
    nul_lt_imm_u64 %r23, 45, 0b1100, 0b0101

    nul_eq_imm_i64.l  %r23, 45000000000, 0b1100, 0b0101
    nul_lt_imm_i64.l  %r23, -45000000000, 0b1100, 0b0101
    nul_lt_imm_u64.l  %r23, 45000000000, 0b1100, 0b0101

    nul_bs %r23, %r45, 0b1100, 0b0101
    nul_bs_imm %r23, 45, 0b1100, 0b0101
    nop 1
    nop 2
    nop 3
    nop 4
    nop 5
    nop 6
    nop 7

    nul_eq_i64 %r10, %r10, same_equal, same_nonequal
    write "0e"
    write "1e"
    write "2e" (same_equal, same_nonequal)

    nul_ne_i64 %r10, %r10, same_equal2, same_nonequal2
    write "0ne"
    write "1ne"
    write "2ne" (same_equal2, same_nonequal2)

    nul_eq_i64 %r10, %r10, no_if_true, no_if_false (no_if_true)
    write "else" (no_if_false)

    write "end_nullification_test"
.end
.text
    alloc 21
    ld_imm %r12, PMC_LAST
    write	"PMC_LAST = %i64(r12)"
; don't report runtime in unittests, it is non-reproducible
    mfmr	%r14, %gz, PMC_RUNTIME
;   write	"PMC_RUNTIME = %i64(r14)"
    mfmr	%r14, %gz, PMC_SHORT_INSTRUCTION
    write	"PMC_SHORT_INSTRUCTION = %i64(r14)"
    mfmr	%r14, %gz, PMC_LONG_INSTRUCTION
    write	"PMC_LONG_INSTRUCTION = %i64(r14)"
    mfmr	%r14, %gz, PMC_SHADOWED_INSTRUCTION
    write	"PMC_SHADOWED_INSTRUCTION = %i64(r14)"
    mfmr	%r14, %gz, PMC_NOP_INSTRUCTION
    write	"PMC_NOP_INSTRUCTION = %i64(r14)"
    mfmr	%r14, %gz, PMC_QUALIFIED_NOP_INSTRUCTION
    write	"PMC_QUALIFIED_NOP_INSTRUCTION = %i64(r14)"
    mfmr	%r14, %gz, PMC_REGISTER_SPILL
    write	"PMC_REGISTER_SPILL = %i64(r14)"
    mfmr	%r14, %gz, PMC_REGISTER_FILL
    write	"PMC_REGISTER_FILL = %i64(r14)"
    mfmr	%r14, %gz, PMC_ICACHE_HIT
    write	"PMC_ICACHE_HIT = %i64(r14)"
    mfmr	%r14, %gz, PMC_ICACHE_MISS
    write	"PMC_ICACHE_MISS = %i64(r14)"
    mfmr	%r14, %gz, PMC_DCACHE_HIT
    write	"PMC_DCACHE_HIT = %i64(r14)"
    mfmr	%r14, %gz, PMC_DCACHE_MISS
    write	"PMC_DCACHE_MISS = %i64(r14)"
    mfmr	%r14, %gz, PMC_INSTRUCTION_TRANSLATION_HIT
    write	"PMC_INSTRUCTION_TRANSLATION_HIT = %i64(r14)"
    mfmr	%r14, %gz, PMC_INSTRUCTION_TRANSLATION_MISS
    write	"PMC_INSTRUCTION_TRANSLATION_MISS = %i64(r14)"
    mfmr	%r14, %gz, PMC_DATA_TRANSLATION_HIT
    write	"PMC_DATA_TRANSLATION_HIT = %i64(r14)"
    mfmr	%r14, %gz, PMC_DATA_TRANSLATION_MISS
    write	"PMC_DATA_TRANSLATION_MISS = %i64(r14)"
    mfmr	%r14, %gz, PMC_BACKSTORE_TRANSLATION_HIT
    write	"PMC_BACKSTORE_TRANSLATION_HIT = %i64(r14)"
    mfmr	%r14, %gz, PMC_BACKSTORE_TRANSLATION_MISS
    write	"PMC_BACKSTORE_TRANSLATION_MISS = %i64(r14)"
    mtmr	%r14, %gz, PMC_SHORT_INSTRUCTION
    mfmr	%r15, %gz, PMC_SHORT_INSTRUCTION
    write	"old pm reg = %i64(r15)"
.end
.text
; Simple test program:
; compute 20! (factorial)
    alloc 61
    ld_imm %r15, -100
loop_stop_sard:
    srd_imm_i64 %r13, %r15, 5
    rep_le_i64 %r15, %gz, loop_stop_sard

; performance test - long loop
; for(i = 2500000; i>0; i--) DoSome();

    ld_imm %r20, 2500000
    ld_imm %r15, 20 ; maximum factorial number
    ld_imm %r21, 5
loop_stop: (64)
    add_imm_i64 %r13, %r13, 5
    sub_i64 %r14, %r14, %r55
    cmp_lt_i64 %r24, %r14, %r14
    add_imm_i64 %r13, %r13, 4
    sub_i64 %r14, %r14, %r55
    cmp_lt_i64 %r22, %r14, %r14
    add_imm_i64 %r13, %r13, 33
    srp_imm_i64 %r14, %r14, %r55, 13
    sub_i64 %r14, %r13, %r21
    sra_imm_i64 %r14, %r14, 7
    rep_gt_i64 %r20, %gz, loop_stop
; print loop counter after loop (must be 0)
    write "%i64(r20) factorials"
    ld_imm %r13, 1
    ld_imm %r14, 1
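; factorial loop: %r13 accumulates the product while %r14 counts 1..20;
; rep_le_i64 presumably steps %r14 and repeats while %r14 <= %r15.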
start:
    mul_i64 %r13, %r13, %r14
    write "factorial: %u64(r13)"
    rep_le_i64 %r14, %r15, start

    write "%i64(r14) %i64(r13)"
.end
.text
    alloc	96
    write	"Example of strided loop instructions"
; fast_check
    ld_imm		%r12, 10000	; load loop number (10)
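; rep_gt_i64 combines the counter update, compare and branch: it steps the
; loop counter %r12 and repeats while it stays above %gz (zero), so the
; body needs no explicit decrement.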
stride_loop_start:
;	write	"%i64(r12)"
    cmp_eq_i64 %r4, %r12, %r12
    add_i64		%r14, %r14, %r46
    rep_gt_i64 %r12, %gz, stride_loop_start

    write	"counter=%i64(r12)"

; Second example of strided loop.
; fast_check
    ld_imm		%r12, 10000	; load loop number (10)
    ld_imm		%r14, 10000	; load loop number (10)
stride_loop_start2:
;   write	"%i64(r12)"
    cmp_eq_i64 %r4, %r12, %r12
    add_imm_i64	%r14, %r14, -2
    rep_gt_i64 %r12, %gz, stride_loop_start2

    write	"%i64(r12) %i64(r14)"

;*****************************************************************
; 3x inner loop example
;*****************************************************************
    ld_imm		%r3, 0
    ld_imm		%r20, 0
    ld_imm		%r33, 80
    mov		%r10, %r33
    mov		%r11, %r33
    mov		%r12, %r33
ccloop:
;   write	"%i64(r12)"
    add_imm_i64	%r20, %r20, 1
    add_imm_i64	%r12, %r12, -1
    cmp_lt_i64	%r2, %r3, %r12
;   jmp	ccloop
;   write	"%i64(r11)"
    add_imm_i64	%r11, %r11, -1
    cmp_lt_i64	%r4, %r3, %r11
    mov		%r12, %r33
;   jmp		ccloop
;   write	"%i64(r10)"
    add_imm_i64	%r10, %r10, -1
    cmp_lt_i64	%r6, %r3, %r10
    mov		%r11, %r33
    mov		%r12, %r33
;   jmp		ccloop

    write	"%i64(r20)"

; for(i=0; i<128; i++)

    ld_imm	%r8, 0
start1:
;   write "%i64(r8)"
    add_imm_i64 %r8, %r8,1
    cmp_lt_imm_i64 %r7, %r8,128
    br_ne_imm_i64 %r7,0,start1

; for(i=100; i>0; i--)
    ld_imm %r8, 100
start2:
    write "%i64(r8)"
    add_imm_i64 %r8, %r8,-1 ; current error
    cmp_lt_i64 %r2, %r3, %r8
    br_ne_imm_i64 %r2, 0, start2

    write "r3      %x64(r3)"
; mtspr %r3, %rsc


; for(i=100; i>0; i--) write "%x64(i)"
    ld_imm %r10, 100
qqq: cmp_lt_i64 %r2, %r3, %r10
    write "r10     %x64(r10)"
    add_imm_i64 %r10, %r10, -1
;   jmp qqq
sss:

    and_imm.l %r55, %r55,0x000FFFFF00003F0F
    mtspr %r12, %ifa
; test some special regs
    ld_imm.l %r9, 0x123456789
;   mtspr %r9, psr
    write "ip: %s(ip) psr: %s(psr)"
;   mtspr %r3, psr
    ld_imm %r55, 120
    mtspr %r55, %tpr
    write "fpcr    %s(fpcr)"
    write "psr     %s(psr)"

    write "test long loop"
; test simple loop
; fast_check
    ld_imm %r13, 350000 ; 35
    ld_imm %r14, 350000 ; 35
    ld_imm %r15, 88
    write "%i64(r14)"
repeat_loop_start: (128)
; write "%i64(r12)"
    add_imm_i64 %r13, %r13, 3
    add_i64 %r13, %r13, %r15
    srp_imm_i64 %r13, %r13, %r15, 8

    add_imm_i64	%r13, %r13, 4
    add_i64 %r13, %r13, %r15
    srp_imm_i64 %r13, %r13, %r15, 7

    add_imm_i64	%r13, %r13, 5
    add_i64 %r13, %r13, %r15
    srp_imm_i64 %r13, %r13, %r15, 6

    add_imm_i64 %r13, %r13, 6
    add_i64 %r13, %r13, %r15
    srp_imm_i64 %r13, %r13, %r15, 5

    sub_i64 %r13, %r13, %r15
    sl_add_i64 %r13, %r13, %r15, 5
    sl_add_i64 %r13, %r13, %r15, 5

    xor %r13, %r14, %r15
    sll_u64 %r13, %r13, %r13
    rep_gt_i64 %r14, %gz, repeat_loop_start

    write "%i64(r13) %i64(r14)"

    write "end test long loop"
.end
.text
    write "test random"
    alloc 96

    random %r3, %gz
    write "random: %x64(r3)"
    random %r3, %gz
    write "random: %x64(r3)"
    ld_imm %r4, 1
    random %r3, %r4
    write "random seed: %x64(r3)"

    write "end_random_test"
.end
.text
; test simple long loop
    alloc 61
    ld_imm %r13, 1000000
    mov %r14, %r13
    write "loop limit: %i64(r14)"
    ld_imm %r15, 88
repeat_long_loop_start: (128)
    add_imm_i64 %r13, %r13, 3
    add_i64 %r13, %r13, %r15
    srp_imm_i64 %r13, %r13, %r15, 8
    add_imm_i64 %r13, %r13, 4
    add_i64 %r13, %r13, %r15
    srp_imm_i64 %r13, %r13, %r15, 7
    add_imm_i64 %r13, %r13, 5
    add_i64 %r13, %r13, %r15
    srp_imm_i64 %r13, %r13, %r15, 6
    add_imm_i64 %r13, %r13, 6
    add_i64 %r13, %r13, %r15
    srp_imm_i64 %r13, %r13, %r15, 5
    add_i64 %r30, %r31, %r14
    sub_i64 %r31, %r30, %r15
    sll_imm_u64 %r40, %r40, 12
    lda_xi64 %r41, %r40, %r12, 3, -12
    lda_xi64 %r41, %r40, %r12, 4, 62
    rep_gt_i64 %r14, %gz, repeat_long_loop_start
    jmp repeat_exit

    rep_le_i64 %r56, %r60, repeat_long_loop_start
    rep_ge_i64 %r56, %r60, repeat_long_loop_start
    rep_le_u64 %r56, %r20, repeat_long_loop_start
    rep_ge_u64 %r56, %r20, repeat_long_loop_start

    rep_le_i64.l %r56, %r60, repeat_long_loop_start
    rep_ge_i64.l %r56, %r60, repeat_long_loop_start
    rep_le_u64.l %r56, %r20, repeat_long_loop_start
    rep_ge_u64.l %r56, %r20, repeat_long_loop_start

repeat_exit:
    write "end loop repeat test"
.end
.text
; Here we test instructions that rotate a register by a fixed bit count.
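; srp (shift right pair) concatenates two sources and shifts the 128-bit
; pair right by the given amount; with both sources equal this degenerates
; into a 64-bit rotate, which is what the cases below rely on.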
    alloc	90
    write	"initial values"
    ld_imm.l	%r50, 0x1234567890ABCDEF
    write	"%x64(r50)"
    write	"rotate left"
    srp_imm_i64	%r51, %r50, %r50, 40-1
    write	"%x64(r51)"
    write	"rotate right"
    srp_imm_i64	%r51, %r50, %r50, 64-40-1	; same as previous
    write	"%x64(r51)"
    write	"rotate left immediate"
    srp_imm_i64	%r51, %r50, %r50, 64-40-1
    write	"%x64(r51)"
    write	"rotate right immediate"
    srp_imm_i64	%r51, %r50, %r50, 40-1	; same as previous "rD+1-rC"
    write	"%x64(r51)"

; Here we test instructions that shift and mask a register by a fixed bit count.
    write "shift signed|unsigned by immediate 12 bit"
    ld_imm.l %r50, 0xfedcba0123456789
    write "%x64(r50)"
    sra_imm_i64	%r51, %r50, 12
    write "%x64(r51)"
    srl_imm_u64 %r51, %r50, 12
    write "%x64(r51)"
    sll_imm_u64 %r51, %r50, 12
    write "%x64(r51)"
    sll_imm_u64 %r51, %r50, 12
    write "%x64(r51)"

;	jmp	ddd
    ld_imm %r10, 16
    slp_i64 %r51, %r50, %r50, %r10
    write "%x64(r51)"

    ld_imm.l %r40, 0x1234567890abcdef
    ld_imm.l %r50, 0xfedcba0987654321
    slsrl_imm_u64 %r41, %r40, 8, 40
    write "%x64(r41)"
    slsra_imm_i64 %r41, %r40, 11, 40
    write "%x64(r41)"

    write "test srpi"
    ld_imm.l %r40, 0x1234123412341234
    ld_imm.l %r50, 0x5678567856785678
    srp_imm_i64 %r41, %r40, %r50, 39
    write "%x64(r41)"
    srp_imm_i64 %r41, %r50, %r40, 23
    write "%x64(r41)"
    srp_imm_i64 %r41, %r40, %r40, 24
    write "%x64(r41)"

    write "test vector shift right pair (srpi16) instruction"
    xor %r2, %r2, %r2 ; all zeroes
    nor %r3, %r2, %r2 ; all ones
    write "r2      %x128(r2)"
    write "r3      %x128(r3)"
    srp_imm_i128 %r4, %r2, %r3, 60
    write "r4      %x128(r4)"
    srp_imm_i128 %r4, %r3, %r2, 60
    write "r4      %x128(r4)"
    srp_imm_i128 %r4, %r2, %r3, 100
    write "r4      %x128(r4)"
    srp_imm_i128 %r4, %r3, %r2, 100
    write "r4      %x128(r4)"

; SHIFTS
    sll_u64 %r42, %r33, %r34
    sll_u64 %r42, %r33, %r34
    sra_i64 %r52, %r73, %r44
    srl_u64 %r62, %r73, %r44
    slp_i64 %r72, %r17, %r17, %r24
    srp_i64 %r82, %r16, %r16, %r15
    srp_imm_i64 %r72, %r15, %r24, 32
    dep %r10, %r14, %r85, 32, 30

    sll_imm_u64 %r12, %r67, 13
    sll_imm_u64 %r13, %r57, 13
    sra_imm_i64 %r14, %r48, 14
    srl_imm_u64 %r15, %r38, 14
    srp_imm_i64 %r16, %r39, %r13, 13
    srp_imm_i64 %r17, %r29, %r13, 64-13


    write	"test packed bitwise logical"
    and		%r10, %r71, %r13
    andn	%r21, %r81, %r22
    or		%r32, %r71, %r32
    orn		%r43, %r61, %r43
    nand	%r54, %r51, %r54
    nor		%r65, %r41, %r64
    xnor	%r76, %r31, %r73
    xor		%r87, %r21, %r83


    ld_imm %r20, 65
    write "r20     %c(r20)"   ; should be 'A'

    ld_imm   %r3, 0
    ld_imm.l %r22, 0x12345FFFFFFFFFFF
    write "%x64(r22)"
    dep_c %r23, %r22, 0, 23
    write "%x64(r23)"

    ld_imm.l %r22, 0x1234567890ABCDEF
    ld_imm.l %r23, 0xFEDCBA9876543210
    srp_imm_i64 %r22, %r22, %r23, 24
    write "%x64(r22)"

    ld_imm.l %r24, 0x4321F00000000
    write "%x64(r24)"
    subf_imm_i64 %r25, %r24, 0
    write "%x64(r25)"
    not %r25, %r25
    write "%x64(r25)"
    xor  %r25, %r25, %r24
    write "%x64(r25)"

; Example of abs_diff
    ld_imm %r12, -10000
    abs_diff_i64 %r12, %r12, %gz
    write "r12: %i64(r12)"
.end
.text
    jmp		endfpsimd
; SSE double (SSE2)
    madd_vf64	%r16, %r71, %r69, %r13
    msub_vf64	%r15, %r78, %r58, %r23
    nmadd_vf64	%r14, %r67, %r47, %r13
    nmsub_vf64	%r13, %r86, %r36, %r16
    madd_alt_vf64	%r82, %r52, %r69, %r63
    msub_alt_vf64	%r50, %r91, %r69, %r63
    add_vf64	%r12, %r86, %r25
    nadd_vf64	%r11, %r82, %r19
    sub_vf64	%r10, %r63, %r28
    add_alt_vf64 %r81, %r61, %r37
    sub_alt_vf64 %r82, %r81, %r46
    add_horiz_vf64 %r83, %r81, %r55
    sub_horiz_vf64 %r84, %r71, %r64
    mul_vf64	%r81, %r71, %r11
    mul_horiz_vf64 %r60, %r11, %r22
    dot_vf64 %r85, %r81, %r13
    min_vf64 %r86, %r84, %r14
    max_vf64 %r87, %r61, %r15
    abs_min_vf64 %r30, %r52, %r16
    abs_max_vf64 %r61, %r51, %r17

    cmp_oeq_vf64 %r80, %r81, %r63
    cmp_one_vf64 %r11, %r81, %r32
    cmp_olt_vf64 %r15, %r81, %r32
    cmp_olt_vf64 %r60, %r81, %r82
    cmp_one_vf64 %r62, %r72, %r83
    cmp_ole_vf64 %r62, %r72, %r62

    pk_vf64 %r60, %r61, %r62
    neg_vf64 %r61, %r51
    abs_diff_vf64 %r61, %r51, %r3
    nabs_diff_vf64 %r61, %r61, %r3
    floor_vf64 %r60, %r77
    ceil_vf64 %r62, %r61
    roundeven_vf64 %r62, %r71
    trunk_vf64 %r83, %r67
    div_vf64 %r83, %r67, %r20
    sqrt_vf64 %r68, %r81
    rsqrt_vf64 %r68, %r81


; quadruple floating-point extension example
.rodata
    align	16
a:	quad	1.234567890123456789124141241241
b:	quad	3.1415925678888734535345231234564561
c:	quad	3.4566345634563456346535463463456
.text
    lda_r	%r21, a
    ld_i128		%r3, %r21,0*16
    ld_i128		%r1, %r21,1*16
    ld_i128		%r2, %r21,2*16
    write	"%vf64(r3)"
    write	"%vf64(r1)"
    write	"%vf64(r2)"

    write	"test binary\0"
    mul_f64	%r3, %r1, %r2
    write	"%vf64(r3)"
    nmul_f64	%r3, %r1, %r2
    write	"%vf64(r3)"
    add_f64	%r4, %r1, %r2
    write	"%vf64(r4)"
    nadd_f64	%r4, %r1, %r2
    write	"%vf64(r4)"
    sub_f64	%r4, %r2, %r1
    write	"%vf64(r4)"
    div_f64	%r4, %r2, %r1
    write	"%vf64(r4)"

    write	"test fused fma\0"
    madd_f64	%r5, %r4, %r1, %r2
    write	"%vf64(r5)"
    nmadd_f64	%r5, %r4, %r1, %r2
    write	"%vf64(r5)"
    msub_f64	%r5, %r4, %r1, %r2
    write	"%vf64(r5)"
    nmsub_f64	%r5, %r4, %r1, %r2
    write	"%vf64(r5)"

    write	"test unary\0"
    mov		%r6, %r5
    write	"%vf64(r6)"
    abs_f64	%r6, %r5
    write	"%vf64(r6)"
    neg_f64	%r6, %r5
    write	"%vf64(r6)"
    nabs_f64	%r6, %r5
    write	"%vf64(r6)"
    sqrt_f64	%r6, %r2
    write "%vf64(r6)"
    rsqrt_f64 %r6, %r2
    write "%vf64(r6)"

    write "test rounding\0"
    ceil_f64 %r7, %r2
    write "%vf64(r7)"
    trunk_f64 %r7, %r2
    write "%vf64(r7)"
    floor_f64 %r7, %r2
    write "%vf64(r7)"
    roundeven_f64 %r7, %r2
    write "%vf64(r7)"
    trunk_f64_i32 %r7, %r2
    write "r7=%i64(r7)"
    ld_imm %r7, 123456
    cvt_i32_f64 %r7, %r7
    write "%vf64(r7)"

    write "test minmax, abs minmax"
    max_f64 %r8, %r1, %r2
    write "%vf64(r8)"
    min_f64 %r8, %r1, %r2
    write "%vf64(r8)"
    abs_max_f64 %r8, %r1, %r2
    write "%vf64(r8)"
    abs_min_f64 %r8, %r1, %r2
    write "%vf64(r8)"

    write	"test fmergesq\0"

.rodata
    align	16
xxxq:	quad	1.122
    quad	0.9999765432
.text
    lda_r %r21, a
; fast_check
    ld_imm %r15, 100000 ; 10
    ld_i128 %r15, %r21, 0*16
    ld_i128 %r16, %r21, 1*16
    sub_f64 %r22, %r15, %r16
    write "%vf64(r22)"
yyloop:
    madd_f64 %r22, %r15, %r16, %r22
    msub_f64 %r22, %r15, %r16, %r22
    rep_ge_i64 %r15, %gz, yyloop
    write "%vf64(r22)"


.rodata
    align	16
    quad	1.189731495357231765085759326628007e+4932
qqqq:   quad	1.23456789 + 32.0
    quad	0.2345678901234567890123456789012345678 + 0.2
    quad	2*asin(1)
    quad	255
dbl1:	double	acos(sin(3.1415926)) ;-1.2345678e+200
    double	444.689679
float1:	float	0.123456789123456789e+30
    float	2.123456789122233
    float	0.0
    float	1.0
octquad:
    quad	0.25
f32:	d4	0x3fff1234
.text
    lda_r %r45, qqqq
    lda_r %r46, dbl1
    lda_r %r47, float1
    write	"r45     %x64(r45)"
    ld_i128		%r63, %r45,0
    write	"%vf64(r63) %x128(r63)"
    ld_i128		%r63, %r45,0
    write	"%vf64(r63) %x128(r63)"
    mul_f128	%r62, %r63, %r63
    write	"%vf64(r62)"
    ld_u32	%r60, %r47,0
    write	"%vf64(r60)"
    ld_u64	%r59, %r46,0
    ld_u32	%r58, %r47,4
    ld_u32	%r57, %r47,8
    write	"%vf64(r57)"
    write	"%vf64(r58)"
    write	"%vf64(r59)"
    ld_i128	%r53, %r45,1*16
    write	"%vf64(r53)"
    ld_i128	%r50, %r45,2*16
    write	"%vf64(r50)"
    ld_i128	%r49, %r45,3*16
    write	"%vf64(r49) %x128(r49)"
    ld_u32	%r48, %r47,3*4
    write	"%vf64(r48)"
    neg_f128	%r46, %r48
    write	"%vf64(r46)"
    madd_f128	%r40, %r52, %r52, %r53
    write	"%m(dump)"

.rodata
    align 16
__yyy:
    quad 0.5
    quad 1.0
    quad 2.25
    quad 22252.22424
    quad -22252.22424
    quad 34.125
    quad 2.0 / 72.0
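; the d8 pairs below look like hand-built extended-precision bit patterns
; (an explicit 64-bit mantissa quadword followed by a 16-bit sign/exponent
; word, stored low quadword first); they probe f128 decoding of raw
; encodings rather than assembler-generated constants.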
    d8 0xffffffffffffffff
    d8 0x3ffe
    d8 0xffffffffffffffff
    d8 0x3ff0
    d8 0x8000000000000000
    d8 0xbff3
    d8 0x8000000000000000
    d8 0xc003
    quad -1.234567890123456789012345e+6
    d8 0x8000000000000000
    d8 0x3fe0
.text
    lda_r %r12, __yyy
    ld_i128	%r23, %r12, 0
    write	"%vf64(r23) %x128(r23)"
    ld_i128	%r23, %r12, 1*16
    write	"%vf64(r23) %x128(r23)"
    ld_i128	%r23, %r12, 2*16
    write	"%vf64(r23) %x128(r23)"
    ld_i128	%r23, %r12, 3*16
    write	"%vf64(r23) %x128(r23)"
    ld_i128	%r23, %r12, 4*16
    write	"%vf64(r23) %x128(r23)"
    ld_i128	%r23, %r12, 5*16
    write	"%vf64(r23) %x128(r23)"
    ld_i128	%r23, %r12, 6*16
    write	"%vf64(r23) %x128(r23)"
    ld_i128	%r27, %r12, 7*16
    write	"%vf64(r27) %x128(r27)"
    ld_i128	%r27, %r12, 8*16
    write	"%vf64(r27) %x128(r27)"
    ld_i128	%r27, %r12, 9*16
    write	"%vf64(r27) %x128(r27)"
    ld_i128	%r27, %r12, 10*16
    write	"%vf64(r27) %x128(r27)"
;   flddi	%r24, 8.5899345919999999995e+09 ;-1.234567890123456789012345e+6
;   write	"%vf64(r24) %x128(r24)"
;   flddi	%r24, 0.125 ; 4.656612873077392578125e-10
;   write	"%vf64(r24) %x128(r24)"
    ld_i128	%r25, %r12, 11*16
    write	"%vf64(r25) %x128(r25)"
    ld_i128	%r25, %r12, 12*16
    write	"%vf64(r25) %x128(r25)"
    ld_iprel_f128 %r40, 4.345678912345678901234567890123456789012345678
    write	"%vf64(r40)"


    madd_f64	%r23, %r60, %r55, %r33
    madd_f64	%r24, %r61, %r25, %r32
    madd_f64	%r25, %r62, %r55, %r23
    madd_f64	%r26, %r63, %r75, %r73
    madd_f64	%r27, %r64, %r75, %r73
    madd_f64	%r28, %r65, %r85, %r63
    madd_f64	%r29, %r66, %r85, %r63
    madd_f64	%r30, %r67, %r95, %r23
    madd_f64	%r31, %r68, %r95, %r23
    madd_f64	%r10, %r21, %r26, %r27
    madd_f64	%r13, %r21, %r26, %r27
    madd_f64	%r10, %r21, %r26, %r27
    madd_f64	%r12, %r21, %r26, %r27
    madd_f64	%r11, %r21, %r26, %r27
    madd_f64	%r13, %r21, %r26, %r27
    madd_f64	%r14, %r21, %r26, %r27
    madd_f64	%r15, %r21, %r26, %r27
    madd_f64	%r16, %r21, %r26, %r27
    madd_f64	%r17, %r21, %r26, %r27

    st_i128	%r16, %sp,16*2
    st_i128	%r17, %sp,16*3
    st_i128	%r18, %sp,16*4
    st_i128	%r19, %sp,16*5
    st_i128	%r20, %sp,16*6
    st_i128	%r21, %sp,16*7
    st_i128	%r22, %sp,16*8
    st_i128	%r23, %sp,16*9
    st_i128	%r24, %sp,16*10
    st_i128	%r25, %sp,16*11
    st_i128	%r26, %sp,16*12
    st_i128	%r27, %sp,16*13
    st_i128	%r28, %sp,16*14
    st_i128	%r29, %sp,16*15
    st_i128	%r30, %sp,16*16
    st_i128	%r31, %sp,16*17


; SSE single
    madd_vf32 %r58, %r61, %r92, %r63
    msub_vf32 %r82, %r52, %r92, %r63
    nmadd_vf32 %r82, %r52, %r69, %r63
    nmsub_vf32 %r50, %r91, %r69, %r63
    madd_alt_vf32 %r82, %r52, %r69, %r63
    msub_alt_vf32 %r50, %r91, %r69, %r63
    add_vf32 %r61, %r94, %r69
    nadd_vf32 %r68, %r54, %r72
    sub_vf32 %r68, %r61, %r82
    add_alt_vf32 %r81, %r71, %r82
    sub_alt_vf32 %r82, %r71, %r82
    add_horiz_vf32 %r62, %r61, %r82
    sub_horiz_vf32 %r62, %r61, %r62
    mul_vf32 %r62, %r51, %r62
    mul_horiz_vf32 %r63, %r51, %r62
    dot_vf32 %r83, %r51, %r62
    min_vf32 %r83, %r61, %r62
    max_vf32 %r63, %r71, %r62
    abs_min_vf32 %r64, %r71, %r82
    abs_max_vf32 %r64, %r71, %r82

    cmp_one_vf32	%r65, %r61, %r62
    cmp_olt_vf32	%r74, %r61, %r62
    cmp_ole_vf32	%r83, %r61, %r62
    cmp_ule_vf32	%r72, %r61, %r62
    cmp_ule_vf32	%r11, %r61, %r62
    cmp_u_vf32	%r20, %r61, %r62

    pk_vf32 %r33, %r64, %r62
    neg_vf32 %r60, %r69
    abs_diff_vf32 %r61, %r68, %r3
    nabs_diff_vf32 %r62, %r67, %r3
    floor_vf32 %r63, %r66
    ceil_vf32 %r64, %r65
    roundeven_vf32 %r65, %r64
    trunk_vf32 %r66, %r63
    div_vf32 %r67, %r62, %r20
    sqrt_vf32 %r68, %r61
    rsqrt_vf32 %r69, %r60

    add_vf32 %r24, %r61, %r60
    mul_vf64 %r47, %r60, %r46

endfpsimd:

.end
.text
.rodata
    align	16
mmxdata:
    d8	0x123456759eabcd7f
    d8	0x123456789cabcdef

    d8	0xf87f5432afebcdf3
    d8	0xffffffffffffffff

    d8	0x1234567890abcdef
    d8	0x1234567890abcdef

    d8	0x1234567890abcdef
    d8	0x1234567890abcdef
.text
    alloc	90
    lda_r %r4, mmxdata
    ld_i128	%r1, %r4,0*16
    ld_i128	%r2, %r4,1*16
    ld_i128	%r3, %r4,2*16
    ld_i128	%r4, %r4,3*16
    write	"r1      %x128(r1)"
    write	"r2      %x128(r2)"

    write	"%vu8(r1)"
    write	"%vu16(r1)"
    write	"%vu32(r1)"
    write	"%vu64(r1)"

    add_vu8 %r3, %r1, %r2
    write	"test vadd/vaddc (1 byte)\0"
    addc_vu8 %r4, %r1, %r2
    write	"%vu8(r1)"
    write	"%vu16(r2)"
    write	"%vu32(r3)"
    write	"%vu64(r4)"
    write	"test vadd/vaddo signed (1 byte)\0"
    addo_vi8 %r4, %r1, %r2
    write	"%vi8(r1)"
    write	"%vi16(r2)"
    write	"%vi32(r3)"
    write	"%vu64(r4)"

    sub_vu8 %r3, %r1, %r2
    write	"test vsub/vsubb (1 byte)\0"
    subb_vu8 %r4, %r1, %r2
    write	"%vu8(r1)"
    write	"%vu8(r2)"
    write	"%vu8(r3)"
    write	"%vu8(r4)"
    write	"test vsub/vsubo signed (1 byte)\0"
    subo_vi8 %r4, %r1, %r2
    write	"%vi8(r1)"
    write	"%vi8(r2)"
    write	"%vi8(r3)"
    write	"%vu8(r4)"

    write	"test vaddusb"
    add_vu8 %r3, %r1, %r2
    add_sat_vu8 %r4, %r1, %r2
    write	"%vu8(r1)\n%vu8(r2)\n%vu8(r3)\n%vu8(r4)"

    write	"test vsubusb"
    sub_vu8 %r3, %r1, %r2
    sub_sat_vu8 %r4, %r1, %r2
    write	"%vu8(r1):\n%vu8(r2)\n%vu8(r3)\n%vu8(r4)"

    write	"test vaddssb"
    add_vu8 %r3, %r1, %r2
    add_sat_vi8 %r4, %r1, %r2
    write	"%vi8(r1)\n%vi8(r2)\n%vi8(r3)\n%vi8(r4)"

    write	"test vsubssb"
    sub_vu8 %r3, %r1, %r2
    sub_sat_vi8 %r4, %r1, %r2
    write	"%vi8(r1)\n%vi8(r2)\n%vi8(r3)\n%vi8(r4)"

    write	"test pavgu (1 byte)\0"
    avg_vu8 %r3, %r1, %r2
    write	"%vu8(r1)\n%vu8(r2)\n%vu8(r3)"

    write	"test pavgs (1 byte)\0"
    avg_vi8 %r3, %r1, %r2
    write	"%vi8(r1)\n%vi8(r2)\n%vi8(r3)"

    write	"test vminu (1 byte)\0"
    min_vu8 %r3, %r1, %r2
    write	"%vu8(r1)\n%vu8(r2)\n%vu8(r3)"

    write	"test vmins (1 byte)\0"
    min_vi8 %r3, %r1, %r2
    write	"%vi8(r1)\n%vi8(r2)\n%vi8(r3)"

    write	"test vmaxu (1 byte)\0"
    max_vu8 %r3, %r1, %r2
    write	"%vu8(r1)\n%vu8(r2)\n%vu8(r3)"

    write	"test vmaxs (1 byte)\0"
    max_vi8 %r3, %r1, %r2
    write	"%vi8(r1)\n%vi8(r2)\n%vi8(r3)"

    write	"test merge low (1 byte)\0"
    mrgl_vu8 %r3, %r1, %r2
    write	"%vu8(r1)\n%vu8(r2)\n%vu8(r3)"

    write	"test merge high (1 byte)\0"
    mrgh_vu8 %r3, %r1, %r2
    write	"%vu8(r1)\n%vu8(r2)\n%vu8(r3)"

    vpkuus_h	%r2, %r3, %r4
    vpksus_h	%r2, %r3, %r4
    vpksss_h	%r2, %r3, %r4

    vpkuus_w	%r2, %r3, %r4
    vpksus_w	%r2, %r3, %r4
    vpksss_w	%r2, %r3, %r4

    vpkuus_d	%r2, %r3, %r4
    vpksus_d	%r2, %r3, %r4
    vpksss_d	%r2, %r3, %r4

;	jmp	endmmx
; d1 abs
    min_vi8  %r12, %r61, %r55
    min_vi16 %r18, %r61, %r45
    min_vi32 %r27, %r61, %r35
    min_vi64 %r36, %r61, %r25

    min_vu8  %r14, %r61, %r15
    min_vu16 %r15, %r62, %r75
    min_vu32 %r17, %r63, %r85
    min_vu64 %r16, %r64, %r75

    max_vi8 %r26, %r71, %r85
    max_vi16 %r26, %r61, %r54
    max_vi32 %r16, %r51, %r35
    max_vi64 %r16, %r41, %r55

    max_vu8  %r11, %r61, %r53
    max_vu16 %r12, %r55, %r55
    max_vu32 %r16, %r46, %r56
    max_vu64 %r13, %r31, %r55

    rol_vu8  %r56, %r61, %r15
    rol_vu16 %r31, %r61, %r25
    rol_vu32 %r53, %r61, %r30
    rol_vu64 %r62, %r61, %r41

    ror_vu8  %r16, %r11, %r52
    ror_vu16 %r11, %r21, %r63
    ror_vu32 %r71, %r31, %r74
    ror_vu64 %r81, %r41, %r85

    sll_vu8  %r16, %r51, %r86
    sll_vu16 %r24, %r61, %r55
    sll_vu32 %r69, %r71, %r55
    sll_vu64 %r77, %r81, %r55

    srl_vu8  %r21, %r81, %r50
    srl_vu16 %r12, %r63, %r51
    srl_vu32 %r13, %r62, %r52
    srl_vu64 %r64, %r63, %r53

    sra_vi8  %r85, %r64, %r54
    sra_vi16 %r76, %r65, %r15
    sra_vi32 %r67, %r66, %r25
    sra_vi64 %r58, %r67, %r36

    avg_vi8 %r49, %r68, %r47
    avg_vi16 %r30, %r69, %r58
    avg_vi32 %r26, %r11, %r69
    avg_vi64 %r16, %r21, %r75

    avg_vu8  %r14, %r31, %r85
    avg_vu16 %r15, %r41, %r45
    avg_vu32 %r56, %r51, %r25
    avg_vu64 %r87, %r61, %r15

    add_sat_vi8  %r42, %r71, %r15
    add_sat_vi16 %r83, %r81, %r45
    add_sat_vi32 %r74, %r41, %r85
    add_sat_vi64 %r65, %r61, %r75

    add_vu8  %r56, %r61, %r75
    add_vu16 %r47, %r61, %r65
    add_vu32 %r38, %r61, %r55
    add_vu64 %r29, %r61, %r55

    add_sat_vu8  %r55, %r61, %r45
    add_sat_vu16 %r65, %r61, %r35
    add_sat_vu32 %r74, %r61, %r25
    add_sat_vu64 %r84, %r61, %r15

    addc_vu8  %r53, %r61, %r55
    addc_vu16 %r13, %r61, %r55
    addc_vu32 %r12, %r61, %r55
    addc_vu64 %r12, %r61, %r55

    sub_sat_vi8  %r56, %r61, %r15
    sub_sat_vi16 %r67, %r61, %r12
    sub_sat_vi32 %r78, %r61, %r13
    sub_sat_vi64 %r89, %r61, %r45

    sub_vu8  %r70, %r61, %r85
    sub_vu16 %r86, %r61, %r45
    sub_vu32 %r46, %r61, %r13
    sub_vu64 %r46, %r61, %r75

    sub_sat_vu8  %r41, %r68, %r65
    sub_sat_vu16 %r12, %r37, %r55
    sub_sat_vu32 %r23, %r26, %r45
    sub_sat_vu64 %r14, %r18, %r35

    cmp_eq_vi8  %r86, %r61, %r25
    cmp_eq_vi16 %r44, %r72, %r15
    cmp_eq_vi32 %r20, %r83, %r55
    cmp_eq_vi64 %r16, %r84, %r55

;	pcmp_ne	%r106, %r61, %r55
;	pcmp_gt	%r106, %r61, %r55
;	pcmp_ge	%r106, %r61, %r55
;	pcmp_le	%r106, %r61, %r55

    cmp_lt_vi8  %r13, %r61, %r15
    cmp_lt_vi16 %r14, %r61, %r24
    cmp_lt_vi32 %r15, %r61, %r38
    cmp_lt_vi64 %r16, %r61, %r45

    cmp_lt_vu8  %r19, %r11, %r75
    cmp_lt_vu16 %r18, %r21, %r82
    cmp_lt_vu32 %r16, %r31, %r73
    cmp_lt_vu64 %r14, %r71, %r54

    mrgh_vu8  %r11, %r71, %r13
    mrgh_vu16 %r72, %r67, %r27
    mrgh_vu32 %r13, %r58, %r55
    mrgh_vu64 %r14, %r69, %r15

    mrgl_vu8  %r76, %r61, %r11
    mrgl_vu16 %r26, %r11, %r62
    mrgl_vu32 %r16, %r15, %r73
    mrgl_vu64 %r16, %r11, %r85

    write	"end simd(int) test"
endmmx:

.end
.text
    alloc 70
    write "test system instructions (assembler only)"

    add_imm_i64 %sp, %sp, -32 ; alloc stack frame
    write "test tpa for sp: 0x%x64(sp)"
    tpa %r4, %sp
    write "tpa(sp): 0x%x64(r4)"
    add_imm_i64 %sp, %sp, 32 ; rollback stack frame

    jmp system_skip

    ld_imm %r45, 1012
    syscall
    nop 0
    sysret
    rfi

    icbi %r34, 16
    dcbt %r34, 16
    dcbf %r34, 16
    dcbi %r34, 16


    mfspr %r34, %lid
    mtspr %r34, %lid
    mprobe %r34, %r45, %r66
    retf 234567

    mfspr %r32, %iva
    mfspr %r32, %psr

; test system instructions
    ptc %r10, %r45, %r11

    mfspr %r12, %pta
    mfspr %r12, %fpcr
    mtspr %r11, %rsc

; test atomic fences
    fence_a
    fence_r
    fence_ar
    fence_sc

    mtdbr %r44, %r66, 0
    mfdbr %r55, %r66, 0
    mtibr %r44, %r66, 0
    mfibr %r55, %r66, 0
    mtitr %r44, %r66, %r12
    mtdtr %r44, %r66, %r12

; bpa b7, %r7
; bpal b7, b4, %r6
; lpr b7, %r6, label16

    undef
system_skip:
    write "end test system instructions (assembler only)"
.end
.text
.data
data_unaligned:
align 16
    d1	0x00
    d1	0x01
    d1	0x02
    d1	0x03
    d1	0x04
    d1	0x05
    d1	0x06
    d1	0x07
    d1	0x08
    d1	0x09
    d1	0x0a
    d1	0x0b
    d1	0x0c
    d1	0x0d
    d1	0x0e
    d1	0x0f

    d1	0x10
    d1	0x11
    d1	0x12
    d1	0x13
    d1	0x14
    d1	0x15
    d1	0x16
    d1	0x17
    d1	0x18
    d1	0x19
    d1	0x1a
    d1	0x1b
    d1	0x1c
    d1	0x1d
    d1	0x1e
    d1	0x1f

.text
    write "load/store unaligned"
    alloc 96
    lda_iprel %r17, data_unaligned

    ld_u16	%r3, %r17, 0
    write	"%x16(r3)"
    ld_u16	%r3, %r17, 1
    write	"%x16(r3)"
    ld_u16	%r3, %r17, 2
    write	"%x16(r3)"

    ld_u32	%r3, %r17, 0
    write	"%x32(r3)"
    ld_u32	%r3, %r17, 1
    write	"%x32(r3)"
    ld_u32	%r3, %r17, 2
    write	"%x32(r3)"
    ld_u32	%r3, %r17, 3
    write	"%x32(r3)"
    ld_u32	%r3, %r17, 4
    write	"%x32(r3)"

    ld_u64	%r3, %r17, 0
    write	"%x64(r3)"
    ld_u64	%r3, %r17, 1
    write	"%x64(r3)"
    ld_u64	%r3, %r17, 2
    write	"%x64(r3)"
    ld_u64	%r3, %r17, 3
    write	"%x64(r3)"
    ld_u64	%r3, %r17, 4
    write	"%x64(r3)"
    ld_u64	%r3, %r17, 5
    write	"%x64(r3)"
    ld_u64	%r3, %r17, 6
    write	"%x64(r3)"
    ld_u64	%r3, %r17, 7
    write	"%x64(r3)"
    ld_u64	%r3, %r17, 8
    write	"%x64(r3)"

    ld_i128	%r3, %r17, 0
    write	"%x128(r3)"
    ld_i128	%r3, %r17, 1
    write	"%x128(r3)"
    ld_i128	%r3, %r17, 2
    write	"%x128(r3)"
    ld_i128	%r3, %r17, 3
    write	"%x128(r3)"
    ld_i128	%r3, %r17, 4
    write	"%x128(r3)"
    ld_i128	%r3, %r17, 5
    write	"%x128(r3)"
    ld_i128	%r3, %r17, 6
    write	"%x128(r3)"
    ld_i128	%r3, %r17, 7
    write	"%x128(r3)"
    ld_i128	%r3, %r17, 8
    write	"%x128(r3)"
    ld_i128	%r3, %r17, 9
    write	"%x128(r3)"
    ld_i128	%r3, %r17, 10
    write	"%x128(r3)"
    ld_i128	%r3, %r17, 11
    write	"%x128(r3)"
    ld_i128	%r3, %r17, 12
    write	"%x128(r3)"
    ld_i128	%r3, %r17, 13
    write	"%x128(r3)"
    ld_i128	%r3, %r17, 14
    write	"%x128(r3)"
    ld_i128	%r3, %r17, 15
    write	"%x128(r3)"
    ld_i128	%r3, %r17, 16
    write	"%x128(r3)"
.end
.rodata
align 4
    d4 table_cases
    d4 label_0
    d4 label_1
    d4 label_2

table_cases:
    i4 label_0 - table_cases
    i4 label_1 - table_cases
    i4 label_2 - table_cases

.text
    alloc 80
    write "test table switch to case 1"
    ld_imm %r4, 1
    lda_iprel %r5, table_cases
    jmp_t %r5, %r4
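; jmp_t indexes the table at %r5 with %r4, loads the 4-byte entry (each one
; is "label_n - table_cases") and jumps to table base + entry, i.e. an
; ip-relative switch dispatch.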

label_0:
    write "case 0"
    cmp_eq_i128  %r12, %r24, %gz
    cmp_ne_i128  %r12, %r24, %gz
    dep_s %r18, %r20, 13, 32
    dep_c %r19, %r23, 13, 32
    ld_imm %r12, -1234
    ld_imm %r13, 3456
    jmp  label_after_switch

label_1:
    write "case 1"
    and_imm %r45, %r44, 12345
    sl_add_i64 %r14, %sp, %r12, 2
    sl_add_i64 %r12, %r23, %r44, 3
    mov %r12, %r13
    ld_imm %r24, 0
    mtspr %r24, %psr
    mfspr %r12, %psr
    nand %r34, %r34, %r45
    sll_u64 %r12, %r23, %r45
    sll_imm_u64 %r12, %r23, 45
    jmp label_after_switch

label_2:
    write   "case 2"
    add_imm_i64  %r34, %r34,-1
    mov     %r58, %r45
    sl_add_i64  %r12, %r15, %r30, 14
    sl_add_i64  %r12, %r15, %r30, 5
    sl_add_i64  %r12, %r15, %r30, 5
    srd_i64     %r34, %r56, %r40
    srd_imm_i64    %r34, %r56, 40
    dep_a   %r40, %r78, 40, 20
    sl_add_i64  %r54, %r45, %r22, 4
    sl_add_i64  %r54, %r45, %r22, 20
    lda_xi64  %r3, %r45, %tp, 3, 55
    jmp  label_after_switch

label_after_switch:
    write "end table switch test"
.end
.rodata
    align	16
console_test_quad:
    quad	1.189731495357231765085759326628007e+4932
console_test_quad2:
    quad	1.23456789 + 32.0
console_test_quad3:
    quad	0.2345678901234567890123456789012345678 + 0.2
    quad	2*asin(1)
    quad	255
console_test_double:
    double	acos(sin(3.1415926)) ;-1.2345678e+200
    double	444.689679
console_test_float:
    float	0.123456789123456789e+30
    float	2.123456789122233
    float	0.0
    float	1.0
.text
    alloc	35
    write	"ip=%s(ip), eip=%s(eip), psr=%s(psr)"

    write	"end test write special regs"

    write	"\ntest write: general register"

    write	"%%i8(sp)  = %i8(sp)"
    write	"%%i16(sp) = %i16(sp)"
    write	"%%i32(sp) = %i32(sp)"
    write	"%%i64(sp) = %i64(sp)"
    write	"%%u8(sp)  = %u8(sp)"
    write	"%%u16(sp) = %u16(sp)"
    write	"%%u32(sp) = %u32(sp)"
    write	"%%u64(sp) = %u64(sp)"
    write	"%%x8(sp)  = 0x%x8(sp)"
    write	"%%x16(sp) = 0x%x16(sp)"
    write	"%%x32(sp) = 0x%x32(sp)"
    write	"%%x64(sp) = 0x%x64(sp)"

    write	"%x64(r0)"
    write	"%x64(r1)"
    write	"%x64(r2)"
    write	"%x64(r22)"
    write	"%x64(r33)"
    write	"%x64(g0)"
    write	"%x64(g1)"
    write	"%x64(tp)"
    write	"%x64(sp)"

    write	"end test write general regs"

    ld_iprel_i128 %r22, console_test_quad
    write "r22 = %x128(r22) %f128(r22)"
    ld_iprel_i128 %r22, console_test_quad2
    write "r22 = %x128(r22) %f128(r22)"
    ld_iprel_i128 %r22, console_test_quad3
    write "r22 = %x128(r22) %f128(r22)"
    ld_iprel_u64 %r22, console_test_double
    write "r22 = %x64(r22) %f64(r22)"
    ld_iprel_u32 %r22, console_test_float
    write "r22 = %x32(r22) %f32(r22)"

    write "end test write fp regs"
.end