html-program

.text
; Carry-less multiply and AES smoke test.
;  1) assemble clmul with every selector value 0..3,
;  2) multiply two constant vectors kept in .rodata and print each product
;     beside the reference value stored next to them,
;  3) issue each AES mnemonic once.
    alloc 96
    write "test carry-less multiply"
    ; selector sweep 0..3: r21/r22 have not been written yet in this program
    ; and r34 is never printed, so only the instruction forms are exercised
    clmul %r34, %r21, %r22, 0
    clmul %r34, %r21, %r22, 1
    clmul %r34, %r21, %r22, 2
    clmul %r34, %r21, %r22, 3
.rodata
; operands and reference products; every entry is two consecutive d8 values
align 16
vector_a:
    d8 0x7b5b546573745665
    d8 0x63746f725d53475d
vector_b:
    d8 0x4869285368617929
    d8 0x5b477565726f6e5d
; NOTE(review): the _00/_01/_10/_11 suffixes presumably name which half of
; each operand enters the product - confirm against the clmul definition
result_00:
    d8 0x1d4d84c85c3440c0
    d8 0x929633d5d36f0451
result_01:
    d8 0x1bd17c8d556ab5a1
    d8 0x7fa540ac2a281315
result_10:
    d8 0x1a2bf6db3a30862f
    d8 0xbabf262df4b7d5c9
result_11:
    d8 0x1d1e1f2c592e7c45
    d8 0xd66ee03e410fd4ed
.text
    ; r12 and r13 are loaded from the two operand labels
    ldqr  %r12, vector_a
    ldqr  %r13, vector_b

    ; each step: clmul writes r11, r21 is loaded from a result_* label,
    ; and both are printed on one line for comparison
    clmul  %r11, %r12, %r13, 0
    ldqr   %r21, result_00
    write "clmul: %x128(r11) %x128(r21)"
    ; selector 1 with the operands swapped (r13, r12)
    clmul  %r11, %r13, %r12, 1
    ldqr   %r21, result_01
    write "clmul: %x128(r11) %x128(r21)"
    ; selector 1 again, original operand order (r12, r13)
    clmul  %r11, %r12, %r13, 1
    ldqr   %r21, result_10
    write "clmul: %x128(r11) %x128(r21)"
    clmul  %r11, %r12, %r13, 3
    ldqr   %r21, result_11
    write "clmul: %x128(r11) %x128(r21)"

    write "test aes"
    ; one instance of each AES mnemonic on r12/r13; r11 is not printed
    ; afterwards, so no result is displayed
    aesdec %r11, %r12, %r13
    aesdeclast %r11, %r12, %r13
    aesenc %r11, %r12, %r13
    aesenclast %r11, %r12, %r13
    aesimc %r11, %r12
    aeskeygenassist %r11, %r12, 250
    write "end aes test"
.end
.text
;*****************************************************************
; ARITHMETIC
;*****************************************************************
; Printed checks: each step announces the operation with a write, runs a
; single instruction and prints the destination register.
    alloc  96
    write  "test load constant (1234567)"
    ldi %r1, 1234567
    write  "ldi: %i64(r1)"

    write  "test load long constant (123456789012345678)"
    ldi.l  %r1, 123456789012345678
    write  "ldi long: %i64(r1)"

    write  "test simple arithmetic"
    ; r1 = 1, r2 = 2 feed the add/sub checks; r3 = 3 is only overwritten later
    ldi %r1, 1
    ldi %r2, 2
    ldi %r3, 3

    write "add 1+2"
    addd %r4, %r1, %r2
    write "add: %i64(r4)"

    write "add immediate 1+6"
    addid %r4, %r1, 6
    write "addi: %i64(r4)"

    write "sub 1-2"
    subd %r4, %r1, %r2
    write "sub: %i64(r4)"

    ; the banner says 6-1 while the operands are written (r1, 6) with r1 = 1
    write "sub reverse 6-1"
    subrid %r4, %r1, 6
    write "sub reverse: %i64(r4)"

    write  "mul 3*4"
    ldi %r1, 3
    ldi %r2, 4
    muld %r4, %r1, %r2
    write  "mul: %i64(r4)"

    write  "12 div 4"
    ldi %r1, 12
    ldi %r2, 4
    divsd %r4, %r1, %r2
    ; NOTE(review): unlike the neighbouring checks this print has no "div:" label
    write  "%i64(r4)"

    write  "15 mod 4"
    ldi %r1, 15
    ldi %r2, 4
    remsd %r4, %r1, %r2
    write  "mod: %i64(r4)"

    write  "test int32_t add"
    ; both operands have all upper 32-bit-range bits set (0xFFFFFFFF, 0xFFFFFFF0)
    ldi.l %r1, 0xFFFFFFFF
    ldi.l %r2, 0xFFFFFFF0
    addws %r3, %r1, %r2
    write "add4: %i64(r3)"
    addiws.l %r3, %r1, 0xFFFFFFFF
    write "addis4.l: %i64(r3)"


    ; Instruction-form coverage: no write follows any instruction in this
    ; run, so nothing is displayed. Several source registers (r45 and r56 on
    ; the first lines, for example) have not been assigned earlier in this
    ; program.
    addid %r45, %r45, 12
    mov %r54, %r56
    subd %r45, %r56, %r50
    addid %r45, %r55, -1000
    cmpned %r12, %r56, %r10
    subrid %r45, %r56, -10000
    subrid %r45, %r56, -20000
    cmpeqd %r13, %r56, %r50
    addd %r45, %r56, %r50
    addid  %r45, %r56, -10000
    muld %r45, %r56, %r50
    mulid %r45, %r56, -10000
    mov %r55, %r20
    ; r55 is written three times in a row; only the last value remains
    ldi %r55, 1200
    ldi %r55, 987654
    ldi.l %r56, 98765432198765432
    addid  %r12, %r13, -789
    cmpned %r14, %r13, %r77
    nand %r43, %r44, %r34
    nor %r43, %r44, %r34
    addid %r56, %sp, 0
    ; call %r0, quadrat
    addd %r56, %sp, %sp

    ; -9223372036854775808 = -2^63
    ldi.l %r55, -9223372036854775808
    ; the same addid/subrid.l pair is repeated with positive and negative
    ; long immediates
    addid  %r56, %sp, -64
    subrid.l %r55, %r56,12345678901234567
    nor %r12, %r14, %r14
    addid %r56, %sp, -64
    nor %r12, %r14, %r14
    subrid.l %r55, %r56, 12345678901234567
    addid %r56, %sp, -64
    subrid.l %r55, %r56, -12345678901234567
    addid   %r56, %sp, -64
    subrid.l %r55, %r56, -12345678901234567
    addid.l %r45, %r56, 12345678



    ; r5 and r6 differ only in the last hex digit (…677 vs …678)
    ldi.l %r5, 0xaFFFFFFF12345677
    ldi.l %r6, 0xaFFFFFFF12345678

    write "test signed overflow: %i64(r5) %i64(r6)"

    write "add overflow"
    addod %r2, %r5, %r6
    write "addo: %i64(r2)"

    write "subtract overflow"
    subod %r2, %r5, %r6
    write "subo: %i64(r2)"

    ; four-operand forms; r45, r67, r80 are not assigned in this program and
    ; r34 is not printed afterwards
    muladdd   %r34, %r45, %r67, %r80
    mulsubd   %r34, %r45, %r67, %r80
    mulsubrd  %r34, %r45, %r67, %r80

    addaddd   %r34, %r45, %r67, %r80
    addsubd   %r34, %r45, %r67, %r80
    subsubd   %r34, %r45, %r67, %r80

    ; sext*/zext* with suffixes b, h, w, d, applied in place to r34
    sextb %r34, %r34
    sexth %r34, %r34
    sextw %r34, %r34
    sextd %r34, %r34

    zextb %r34, %r34
    zexth %r34, %r34
    zextw %r34, %r34
    zextd %r34, %r34

.end
.text
; Atomic memory operation test. Every amo* mnemonic in this program is issued
; once per memory-order operand; after the setup only banner strings are
; printed, no result register.
  alloc 96
  write "test atomic fetch-op"
  ; r5 = sp - 64 is the address operand of every atomic instruction below
  addid  %r5, %sp, -64
  write "atomic base: %x64(r5)"
  ; r10 is the value operand of the fetch-op forms; r12 and r56 are used by
  ; the cas / load / store forms further down
  ldi %r10, 5
  ldi %r12, 10
  ldi %r56, 5

  write "test amo-add"

  ; amoadd with suffixes b/h/w/d/q, each under relaxed, acquire, release, acq_rel
  amoaddb %r4, %r5, %r10, relaxed
  amoaddb %r4, %r5, %r10, acquire
  amoaddb %r4, %r5, %r10, release
  amoaddb %r4, %r5, %r10, acq_rel

  amoaddh %r4, %r5, %r10, relaxed
  amoaddh %r4, %r5, %r10, acquire
  amoaddh %r4, %r5, %r10, release
  amoaddh %r4, %r5, %r10, acq_rel

  amoaddw %r4, %r5, %r10, relaxed
  amoaddw %r4, %r5, %r10, acquire
  amoaddw %r4, %r5, %r10, release
  amoaddw %r4, %r5, %r10, acq_rel

  amoaddd %r4, %r5, %r10, relaxed
  amoaddd %r4, %r5, %r10, acquire
  amoaddd %r4, %r5, %r10, release
  amoaddd %r4, %r5, %r10, acq_rel

  amoaddq %r4, %r5, %r10, relaxed
  amoaddq %r4, %r5, %r10, acquire
  amoaddq %r4, %r5, %r10, release
  amoaddq %r4, %r5, %r10, acq_rel

  ; amosub and amoand: suffixes b/h/w/d/q, each with the four memory-order
  ; operands; destination r4, address r5, value r10 throughout
  write "test amo-sub"

  amosubb %r4, %r5, %r10, relaxed
  amosubb %r4, %r5, %r10, acquire
  amosubb %r4, %r5, %r10, release
  amosubb %r4, %r5, %r10, acq_rel

  amosubh %r4, %r5, %r10, relaxed
  amosubh %r4, %r5, %r10, acquire
  amosubh %r4, %r5, %r10, release
  amosubh %r4, %r5, %r10, acq_rel

  amosubw %r4, %r5, %r10, relaxed
  amosubw %r4, %r5, %r10, acquire
  amosubw %r4, %r5, %r10, release
  amosubw %r4, %r5, %r10, acq_rel

  amosubd %r4, %r5, %r10, relaxed
  amosubd %r4, %r5, %r10, acquire
  amosubd %r4, %r5, %r10, release
  amosubd %r4, %r5, %r10, acq_rel

  amosubq %r4, %r5, %r10, relaxed
  amosubq %r4, %r5, %r10, acquire
  amosubq %r4, %r5, %r10, release
  amosubq %r4, %r5, %r10, acq_rel

  write "test amo-and"

  amoandb %r4, %r5, %r10, relaxed
  amoandb %r4, %r5, %r10, acquire
  amoandb %r4, %r5, %r10, release
  amoandb %r4, %r5, %r10, acq_rel

  amoandh %r4, %r5, %r10, relaxed
  amoandh %r4, %r5, %r10, acquire
  amoandh %r4, %r5, %r10, release
  amoandh %r4, %r5, %r10, acq_rel

  amoandw %r4, %r5, %r10, relaxed
  amoandw %r4, %r5, %r10, acquire
  amoandw %r4, %r5, %r10, release
  amoandw %r4, %r5, %r10, acq_rel

  amoandd %r4, %r5, %r10, relaxed
  amoandd %r4, %r5, %r10, acquire
  amoandd %r4, %r5, %r10, release
  amoandd %r4, %r5, %r10, acq_rel

  amoandq %r4, %r5, %r10, relaxed
  amoandq %r4, %r5, %r10, acquire
  amoandq %r4, %r5, %r10, release
  amoandq %r4, %r5, %r10, acq_rel

  ; amoor and amoxor: suffixes b/h/w/d/q, each with the four memory-order
  ; operands; destination r4, address r5, value r10 throughout
  write "test amo-or"

  amoorb %r4, %r5, %r10, relaxed
  amoorb %r4, %r5, %r10, acquire
  amoorb %r4, %r5, %r10, release
  amoorb %r4, %r5, %r10, acq_rel

  amoorh %r4, %r5, %r10, relaxed
  amoorh %r4, %r5, %r10, acquire
  amoorh %r4, %r5, %r10, release
  amoorh %r4, %r5, %r10, acq_rel

  amoorw %r4, %r5, %r10, relaxed
  amoorw %r4, %r5, %r10, acquire
  amoorw %r4, %r5, %r10, release
  amoorw %r4, %r5, %r10, acq_rel

  amoord %r4, %r5, %r10, relaxed
  amoord %r4, %r5, %r10, acquire
  amoord %r4, %r5, %r10, release
  amoord %r4, %r5, %r10, acq_rel

  amoorq %r4, %r5, %r10, relaxed
  amoorq %r4, %r5, %r10, acquire
  amoorq %r4, %r5, %r10, release
  amoorq %r4, %r5, %r10, acq_rel

  write "test amo-xor"

  amoxorb %r4, %r5, %r10, relaxed
  amoxorb %r4, %r5, %r10, acquire
  amoxorb %r4, %r5, %r10, release
  amoxorb %r4, %r5, %r10, acq_rel

  amoxorh %r4, %r5, %r10, relaxed
  amoxorh %r4, %r5, %r10, acquire
  amoxorh %r4, %r5, %r10, release
  amoxorh %r4, %r5, %r10, acq_rel

  amoxorw %r4, %r5, %r10, relaxed
  amoxorw %r4, %r5, %r10, acquire
  amoxorw %r4, %r5, %r10, release
  amoxorw %r4, %r5, %r10, acq_rel

  amoxord %r4, %r5, %r10, relaxed
  amoxord %r4, %r5, %r10, acquire
  amoxord %r4, %r5, %r10, release
  amoxord %r4, %r5, %r10, acq_rel

  amoxorq %r4, %r5, %r10, relaxed
  amoxorq %r4, %r5, %r10, acquire
  amoxorq %r4, %r5, %r10, release
  amoxorq %r4, %r5, %r10, acq_rel

  ; amomins* and amomaxs* (the banners call them smin / smax): suffixes
  ; b/h/w/d/q, each with the four memory-order operands
  write "test amo-smin"
  amominsb   %r4, %r5, %r10, relaxed
  amominsb   %r4, %r5, %r10, acquire
  amominsb   %r4, %r5, %r10, release
  amominsb   %r4, %r5, %r10, acq_rel

  amominsh  %r4, %r5, %r10, relaxed
  amominsh  %r4, %r5, %r10, acquire
  amominsh  %r4, %r5, %r10, release
  amominsh  %r4, %r5, %r10, acq_rel

  amominsw  %r4, %r5, %r10, relaxed
  amominsw  %r4, %r5, %r10, acquire
  amominsw  %r4, %r5, %r10, release
  amominsw  %r4, %r5, %r10, acq_rel

  amominsd  %r4, %r5, %r10, relaxed
  amominsd  %r4, %r5, %r10, acquire
  amominsd  %r4, %r5, %r10, release
  amominsd  %r4, %r5, %r10, acq_rel

  amominsq  %r4, %r5, %r10, relaxed
  amominsq  %r4, %r5, %r10, acquire
  amominsq  %r4, %r5, %r10, release
  amominsq  %r4, %r5, %r10, acq_rel

  write "test amo-smax"
  amomaxsb  %r4, %r5, %r10, relaxed
  amomaxsb  %r4, %r5, %r10, acquire
  amomaxsb  %r4, %r5, %r10, release
  amomaxsb  %r4, %r5, %r10, acq_rel

  amomaxsh  %r4, %r5, %r10, relaxed
  amomaxsh  %r4, %r5, %r10, acquire
  amomaxsh  %r4, %r5, %r10, release
  amomaxsh  %r4, %r5, %r10, acq_rel

  amomaxsw  %r4, %r5, %r10, relaxed
  amomaxsw  %r4, %r5, %r10, acquire
  amomaxsw  %r4, %r5, %r10, release
  amomaxsw  %r4, %r5, %r10, acq_rel

  amomaxsd  %r4, %r5, %r10, relaxed
  amomaxsd  %r4, %r5, %r10, acquire
  amomaxsd  %r4, %r5, %r10, release
  amomaxsd  %r4, %r5, %r10, acq_rel

  amomaxsq  %r4, %r5, %r10, relaxed
  amomaxsq  %r4, %r5, %r10, acquire
  amomaxsq  %r4, %r5, %r10, release
  amomaxsq  %r4, %r5, %r10, acq_rel

  ; amominu* and amomaxu* (the banners call them umin / umax): suffixes
  ; b/h/w/d/q, each with the four memory-order operands
  write "test amo-umin"
  amominub  %r4, %r5, %r10, relaxed
  amominub  %r4, %r5, %r10, acquire
  amominub  %r4, %r5, %r10, release
  amominub  %r4, %r5, %r10, acq_rel

  amominuh  %r4, %r5, %r10, relaxed
  amominuh  %r4, %r5, %r10, acquire
  amominuh  %r4, %r5, %r10, release
  amominuh  %r4, %r5, %r10, acq_rel

  amominuw  %r4, %r5, %r10, relaxed
  amominuw  %r4, %r5, %r10, acquire
  amominuw  %r4, %r5, %r10, release
  amominuw  %r4, %r5, %r10, acq_rel

  amominud  %r4, %r5, %r10, relaxed
  amominud  %r4, %r5, %r10, acquire
  amominud  %r4, %r5, %r10, release
  amominud  %r4, %r5, %r10, acq_rel

  amominuq  %r4, %r5, %r10, relaxed
  amominuq  %r4, %r5, %r10, acquire
  amominuq  %r4, %r5, %r10, release
  amominuq  %r4, %r5, %r10, acq_rel

  write "test amo-umax"
  amomaxub  %r4, %r5, %r10, relaxed
  amomaxub  %r4, %r5, %r10, acquire
  amomaxub  %r4, %r5, %r10, release
  amomaxub  %r4, %r5, %r10, acq_rel

  amomaxuh  %r4, %r5, %r10, relaxed
  amomaxuh  %r4, %r5, %r10, acquire
  amomaxuh  %r4, %r5, %r10, release
  amomaxuh  %r4, %r5, %r10, acq_rel

  amomaxuw  %r4, %r5, %r10, relaxed
  amomaxuw  %r4, %r5, %r10, acquire
  amomaxuw  %r4, %r5, %r10, release
  amomaxuw  %r4, %r5, %r10, acq_rel

  amomaxud  %r4, %r5, %r10, relaxed
  amomaxud  %r4, %r5, %r10, acquire
  amomaxud  %r4, %r5, %r10, release
  amomaxud  %r4, %r5, %r10, acq_rel

  amomaxuq %r4, %r5, %r10, relaxed
  amomaxuq %r4, %r5, %r10, acquire
  amomaxuq %r4, %r5, %r10, release
  amomaxuq %r4, %r5, %r10, acq_rel

  ; amocas with suffixes b/h/w/d/q and the four memory-order operands.
  ; Operands are always r12, r5 (the atomic base), r56, r34; r34 has not been
  ; assigned anywhere in this program.
  ; NOTE(review): which of r12/r56/r34 is the expected and which the new
  ; value is not visible here - confirm against the ISA description
  write "test cas"

  amocasb  %r12, %r5, %r56, %r34, relaxed
  amocasb  %r12, %r5, %r56, %r34, acquire
  amocasb  %r12, %r5, %r56, %r34, release
  amocasb  %r12, %r5, %r56, %r34, acq_rel

  amocash %r12, %r5, %r56, %r34, relaxed
  amocash %r12, %r5, %r56, %r34, acquire
  amocash %r12, %r5, %r56, %r34, release
  amocash %r12, %r5, %r56, %r34, acq_rel

  amocasw %r12, %r5, %r56, %r34, relaxed
  amocasw %r12, %r5, %r56, %r34, acquire
  amocasw %r12, %r5, %r56, %r34, release
  amocasw %r12, %r5, %r56, %r34, acq_rel

  amocasd %r12, %r5, %r56, %r34, relaxed
  amocasd %r12, %r5, %r56, %r34, acquire
  amocasd %r12, %r5, %r56, %r34, release
  amocasd %r12, %r5, %r56, %r34, acq_rel

  amocasq %r12, %r5, %r56, %r34, relaxed
  amocasq %r12, %r5, %r56, %r34, acquire
  amocasq %r12, %r5, %r56, %r34, release
  amocasq %r12, %r5, %r56, %r34, acq_rel

  ; Atomic loads (amold*) appear only with relaxed and acquire, atomic stores
  ; (amost*) only with relaxed and release; suffixes b/h/w/d/q, register r12,
  ; address r5.
  write "test load atomic relaxed"
  amoldb  %r12, %r5, relaxed
  amoldh  %r12, %r5, relaxed
  amoldw  %r12, %r5, relaxed
  amoldd  %r12, %r5, relaxed
  amoldq  %r12, %r5, relaxed

  write "test load atomic acquire"
  amoldb  %r12, %r5, acquire
  amoldh  %r12, %r5, acquire
  amoldw  %r12, %r5, acquire
  amoldd  %r12, %r5, acquire
  amoldq  %r12, %r5, acquire

  write "test store atomic relaxed"
  amostb  %r12, %r5, relaxed
  amosth  %r12, %r5, relaxed
  amostw  %r12, %r5, relaxed
  amostd  %r12, %r5, relaxed
  amostq  %r12, %r5, relaxed

  write "test store atomic release"
  amostb  %r12, %r5, release
  amosth  %r12, %r5, release
  amostw  %r12, %r5, release
  amostd  %r12, %r5, release
  amostq  %r12, %r5, release

.end
.text
.data
; four single-byte values 25..28; read back below with ldub and expected in
; r25..r28
data_lbl:
    d1 25
    d1 26
    d1 27
    d1 28

.text
program_start:
; Here we test references to data section.
; Absolute offset from begin of section
    write "base addressing"
    alloc 96
    ; r17 is the base that the data_hi()/data_lo() offsets are added to below
    ldarc %r17, program_start
    ldi %r12, data_lbl
    write "data_lbl: %i64(r12)"

    ; print the two halves of the label reference separately
    ldi %r12, data_hi(data_lbl)
    write "data_hi(data_lbl): %i64(r12)"
    ldi %r12, data_lo(data_lbl)
    write "data_lo(data_lbl): %i64(r12)"
    ; short and long (.l) forms of the same ldard; both prints use the same text
    ldard %r13, data_lbl
    write "ca.rf(data_lbl): %x64(r13)"
    ldard.l %r13, data_lbl
    write "ca.rf(data_lbl): %x64(r13)"

    ; two-step address: r13 = r17 + data_hi, r14 = r13 + data_lo
    addid %r13, %r17, data_hi(data_lbl)
    write "r13     %i64(r13)"
    addid %r14, %r13, data_lo(data_lbl)+0
    write "r14     %i64(r14)"

    ; same base again, with data_lo(...)+k used as the load displacement
    addid %r13, %r17, data_hi(data_lbl)
    write "r13     %i64(r13)"
    ldub %r25, %r13, data_lo(data_lbl)+0
    ldub %r26, %r13, data_lo(data_lbl)+1
    ldub %r27, %r13, data_lo(data_lbl)+2
    ldub %r28, %r13, data_lo(data_lbl)+3
    write "r25     %i64(r25)" ; must be 25
    write "r26     %i64(r26)" ; must be 26
    write "r27     %i64(r27)" ; must be 27
    write "r28     %i64(r28)" ; must be 28

; test load context
    ; load and store back the same location at sp-16
    ldud %r1, %sp, -16
    std %r1, %sp, -16
    ; short and long (.l) forms of the jump to skipaddr; presumably the
    ; first one already transfers control, leaving the rest to be assembled only
    jmp skipaddr
    jmp.l skipaddr

; test indexed load/store
; These lines follow the jmp to skipaddr and no label is defined among them,
; so they are assembled but not entered by fall-through.
; NOTE(review): the five operands of the *xsd forms look like
; (data, base, index, scale, displacement) - confirm against the ISA description
    stbxsd %r12, %r15, %r30, 4, 14
    sthxsd %r12, %r15, %r30, 4, 14
    stwxsd %r12, %r15, %r30, 4, 14
    stdxsd %r12, %r15, %r30, 4, 14

    amoldq %r30, %r56, relaxed
    amostq %r43, %r56, relaxed

    sladdd %r43, %r56, %r23, 4
    slsubd %r43, %r56, %r23, 42
    slsubrd %r43, %r56, %r23, 12

    lduw %r30, %r5, 66*4 ; load mid
    ldudxsd %r40, %tp, %r30, 0, 4 ; load base

    ; each load/store form below is listed twice with identical operands
    ldsdxsd %r12, %r23, %r40, 3, 54
    ldsdxsd %r12, %r23, %r40, 3, 54
    ldudxsd %r12, %r23, %r40, 3, 54
    ldudxsd %r12, %r23, %r40, 3, 54
    stwxsd %r12, %r23, %r40, 3, 54
    stdxsd %r12, %r23, %r40, 3, 54

    ldsbxsd %r12, %r23, %r40, 3, 54
    ldsbxsd %r12, %r23, %r40, 3, 54
    ldubxsd %r12, %r23, %r40, 3, 54
    ldubxsd %r12, %r23, %r40, 3, 54
    stbxsd %r12, %r23, %r40, 3, 54
    stbxsd %r12, %r23, %r40, 3, 54

    ldshxsd %r12, %r23, %r40, 3, 54
    ldshxsd %r12, %r23, %r40, 3, 54
    lduhxsd %r12, %r23, %r40, 3, 54
    lduhxsd %r12, %r23, %r40, 3, 54
    sthxsd %r12, %r23, %r40, 3, 54
    sthxsd %r12, %r23, %r40, 3, 54

.text
; LOAD/STORE
; One group per size suffix (b, h, w, d, q). Each group lists the plain
; base+displacement form, the indexed *xsd form, its long (.l) variant with a
; large displacement, and the *mia / *mib forms, followed by the matching
; st*mia / st*mib stores. Nothing in this run is printed.
    sladdd %r54, %r56, %r12, 5

    ldub %r16, %r45, 8900
    ldsb %r15, %r46, 8900
    ldubxsd %r54, %r56, %r12, 2, 37
    ldsbxsd %r53, %r65, %r12, 2, 37
    ldubxsd.l %r54, %r56, %r12, 2, 37000000
    ldsbxsd.l %r53, %r65, %r12, 2, -37000000
    ldubmia %r52, %r75, 10
    ldsbmia %r51, %r76, 10
    ldubmib %r52, %r75, 10
    ldsbmib %r51, %r76, 10
    stbmia %r51, %r76, 10
    stbmib %r52, %r75, 10

    lduh %r12, %r45, 8900
    ldsh %r12, %r45, 8900
    lduhxsd %r54, %r56, %r12, 3, -57
    ldshxsd %r54, %r56, %r12, 2, 37
    lduhxsd.l %r54, %r56, %r12, 2, 37000000
    ldshxsd.l %r53, %r65, %r12, 2, -37000000
    lduhmia %r54, %r56, 12
    ldshmia %r54, %r56, -60
    lduhmib %r54, %r56, 12
    ldshmib %r54, %r56, -60
    sthmia %r51, %r76, 10
    sthmib %r52, %r75, 10

    lduw %r12, %r45, 8900
    ldsw %r12, %r45, 8900
    lduwxsd %r54, %r56, %r12, 2, 7
    ldswxsd %r54, %r56, %r12, 2, 7
    lduwxsd.l %r54, %r56, %r12, 2, 37000000
    ldswxsd.l %r53, %r65, %r12, 2, -37000000
    lduwmia %r54, %r56, 12
    ldswmia %r54, %r56, 32
    lduwmib %r54, %r56, 12
    ldswmib %r54, %r56, 32
    stwmia %r51, %r76, 10
    stwmib %r52, %r75, 10

    ; the d group also covers negative displacements in short and long form
    ldud   %r54, %r56, 5600
    ldsd   %r54, %r56, 5600
    ldud.l %r53, %r46, 98765432
    ldud   %r52, %r45, -5600
    ldud.l  %r51, %r55, -98765432
    ldudxsd %r50, %r56, %r12, 2, 37
    ldsdxsd %r50, %r56, %r12, 2, 37
    ldudxsd.l %r54, %r56, %r12, 2, 37000000
    ldsdxsd.l %r53, %r65, %r12, 2, -37000000
    ldudmia %r57, %r56, -12
    ldudmia %r57, %r56, -12
    ldsdmia %r57, %r56, -12
    ldsdmia %r57, %r56, -12
    ldudmib %r57, %r56, -12
    ldudmib %r57, %r56, -12
    ldsdmib %r57, %r56, -12
    ldsdmib %r57, %r56, -12
    stdmia %r51, %r76, 10
    stdmib %r52, %r75, 10

    ; the q group has a single load mnemonic (ldq), no ldu/lds pair
    ldq  %r16, %r45, 8900
    ldq.l %r16, %r45, 8900000
    ldq.l %r16, %r45, -8900000
    ldqxsd %r54, %r56, %r12, 2, 37
    ldqxsd.l %r54, %r56, %r12, 2, 37000000
    ldqxsd.l %r54, %r56, %r12, 2, -37000000
    ldqmia %r52, %r75, 10
    ldqmia %r52, %r75, 10
    ldqmib %r52, %r75, 10
    ldqmib %r52, %r75, 10
    stqmia %r51, %r76, 10
    stqmib %r52, %r75, 10

    ; plain stores for suffixes b/h/w/d; the last displacement is written as
    ; the expression 890*8
    stb %r12, %r45, 8900
    sth %r12, %r45, 8900
    stw %r12, %r45, 8900
    std %r12, %r45, 890*8

    ; load/store pairs in base+displacement and indexed form
    ldud    %r12, %r45, 8048
    std     %r12, %r45, 8064
    ldudxsd %r12, %r45, %r13, 3, 7
    stdxsd  %r12, %r45, %r13, 3, 7

    ldud  %r60, %r55, 56
    ldud  %r60, %r56, 56
    ldud  %r46, %r55, 120
    std   %r47, %r55, 56

    ; same pairs with sp as the base register
    ldud    %r60, %sp, 624
    std     %r60, %sp, 624
    ldudxsd %r60, %sp, %r12, 3, 28
    stdxsd  %r60, %sp, %r12, 3, 26
    ldud    %r56, %r57, 567
    std     %r56, %r57, 567

    lduw %r34, %r12, 900
    ldud %r34, %r12, 900
    stw %r23, %r12, 900
    std %r23, %r12, 900

    ; q forms, displacement written as the expression 55*16
    ldq %r34, %r13, 55*16
    stq %r35, %r13, 55*16
    ldqxsd %r34, %r13, %r45, 3, 60
    stqxsd %r34, %r13, %r45, 3, 60

; target of the "jmp skipaddr" / "jmp.l skipaddr" pair earlier in this program
skipaddr:
    nop 0
.end
.text
; Population-statistic instructions on the constant in r23.
; Only the first cntpop and the first cntlz results are printed; the
; remaining lines step the third operand from 1 to 5 while alternating
; cnttz / cntlz and print nothing.
; NOTE(review): the meaning of the third operand is not visible here.
    alloc 25
    ldi.l %r23, 0x1234567890abcdef
    write "test population statistic instructions"
    cntpop %r12, %r23, 3
    write "cntpop: %i64(r12)"
    cntlz %r12, %r23, 0
    write "cntlz %i64(r12)"
    cnttz %r12, %r23, 1
    cntlz %r12, %r23, 2
    cnttz %r12, %r23, 3
    cntlz %r12, %r23, 4
    cnttz %r12, %r23, 5
.end
.text
; permb test: r55 holds a constant and permb is applied with a series of
; 6-bit masks; the write strings name the expected effect of each mask.
    write  "test bit reverse instruction (permb)"
    alloc  80
    ldi.l  %r55, 0x1234567890ABCDEF
    write  "initial value: %x64(r55)"
    ; mask 63 applied to r55 in place, printed in hex and binary
    permb  %r55, %r55, 63
    ; r78 has not been assigned in this program and r56 is overwritten
    ; before it is printed
    permb  %r56, %r78, 63
    write  "r55 %x64(r55) %b64(r55)"
    ; same mask applied a second time to r55, printed again
    permb  %r55, %r55, 63
    write  "r55 %x64(r55) %b64(r55)"

    ; from here on r55 is the fixed source and r56 the destination; each
    ; following mask drops the lowest bit that was still set in the previous one
    permb  %r56, %r55, 0b111111 ;63
    write  "reverse bits: %x64(r56)"

    permb  %r56, %r55, 0b111110  ;32+16+8+4+2
    write  "reverse bit-pairs: %x64(r56)"

    permb  %r56, %r55, 0b111100  ;32+16+8+4
    write  "reverse nibbles (4-bits): %x64(r56)"

    permb  %r56, %r55, 0b111000 ;32+16+8
    write  "reverse 1bytes: %x64(r55) => %x64(r56)"

    permb  %r56, %r55, 0b110000  ;32+16
    write  "reverse 2bytes: %x64(r55) => %x64(r56)"

    permb  %r56, %r55, 0b100000  ;32
    write  "reverse 4bytes: %x64(r55) => %x64(r56)"
.end
.text
; Bitwise logical instruction forms. Every instruction writes r23 from r25
; and either r45 or an immediate; r25 and r45 are not assigned in this program
; and r23 is never printed, so only the instruction forms are exercised.
    alloc 46
    write "test bitwise logical"
    ; and / or / xor each appear as register, short-immediate and long (.l)
    ; immediate forms; andn / orn as register and short-immediate forms
    and %r23,  %r25, %r45
    andi    %r23, %r25, 12345
    andi.l  %r23, %r25, 1234567890
    andn %r23, %r25, %r45
    andni   %r23, %r25, 12345
    or %r23,   %r25, %r45
    ori     %r23, %r25, 12345
    ori.l   %r23, %r25, 1234567890
    orn %r23,  %r25, %r45
    orni    %r23, %r25, 12345
    xor %r23,  %r25, %r45
    xori    %r23, %r25, 12345
    xori.l  %r23, %r25, 1234567890
    ; nor / nand / xnor appear in register form only
    nor        %r23, %r25, %r45
    nand       %r23, %r25, %r45
    xnor       %r23, %r25, %r45
.end
.text
; Loop test: repeatedly loads from an 8-entry table while counting
; iterations in r14, then prints the counter.
    write "branch-int, test memory"
.data
align 8
; eight d8 entries holding 0..7
test_memory:
    d8 0
    d8 1
    d8 2
    d8 3
    d8 4
    d8 5
    d8 6
    d8 7
.text
    alloc 20
    ldard %r12, test_memory
    write "test_memory: %x64(r12)"
    ldi %r11, 0     ; r11: table index
    ldi %r14, 0     ; r14: iteration counter
; NOTE(review): the "(32)" after the label is kept as found; its meaning
; (possibly an alignment request) is not visible in this file
memory_loop: (32)
    ; indexed load with base r12, index r11; the loaded r13 is not used afterwards
    ldudxsd %r13, %r12, %r11, 3, 0
    addid %r11, %r11, 1
    addid %r14, %r14, 1
    ; keep the index within 0..7, matching the eight table entries
    andi %r11, %r11, 7
; fast_check
    ; long-form compare-with-immediate branch back to memory_loop;
    ; presumably taken while r14 < 200000
    brltsid.l %r14, 200000, memory_loop
    write "counter: %i64(r14)"
.end
.text
; Compare-against-%gz branch forms: each condition (breqd, brned, brltsd,
; brgesd) is assembled in short and long (.l) form towards both a forward
; and a backward label.
    alloc 20
    write "test compare-with-zero-and-long-branch"
compare_with_zero_test_continue:
compare_with_zero_backward_target:
    addid    %r2, %r2, 1
    ; r2 is compared with itself, so an equality branch would always leave
    ; for the exit label here (presumably breqd branches on equal); the
    ; branch list below is then only assembled, not executed
    breqd %r2, %r2, compare_with_zero_test_exit

    breqd   %r1, %gz, compare_with_zero_forward_target
    breqd.l %r1, %gz, compare_with_zero_forward_target
    breqd   %r1, %gz, compare_with_zero_backward_target
    breqd.l %r1, %gz, compare_with_zero_backward_target
    brned   %r1, %gz, compare_with_zero_forward_target
    brned.l %r1, %gz, compare_with_zero_forward_target
    brned   %r1, %gz, compare_with_zero_backward_target
    brned.l %r1, %gz, compare_with_zero_backward_target

    brltsd   %r1, %gz, compare_with_zero_forward_target
    brltsd.l %r1, %gz, compare_with_zero_forward_target
    brltsd   %r1, %gz, compare_with_zero_backward_target
    brltsd.l %r1, %gz, compare_with_zero_backward_target
    brgesd   %r1, %gz, compare_with_zero_forward_target
    brgesd.l %r1, %gz, compare_with_zero_forward_target
    brgesd   %r1, %gz, compare_with_zero_backward_target
    brgesd.l %r1, %gz, compare_with_zero_backward_target

compare_with_zero_forward_target:
    ; jumps back to the top of the test, which shares its address with the
    ; backward target label
    jmp compare_with_zero_test_continue
compare_with_zero_test_exit:
    write "end test compare-with-zero-and-long-branch"
.end
.text

; code-section label with nothing between it and the section switch below
call_code_target:

.rodata
; read-only-data label; used as the callplt operand later in this program
call_data_target:

.text
    ; skip over the body of quadrat to the calling example
    jmp callexample
;*****************************************************************
; Function  compute A**4 of parameter A, passed in register r33
;*****************************************************************
; r33 is multiplied by itself twice, giving r33**4, and printed.
; NOTE(review): callexample places its values in r87..r90 before
; "call %r86"; nothing visible here copies an argument into r33 - confirm
; what r33 holds on entry.
quadrat:
    write "function quadrat entered: r0=%x128(r0)"
    alloc 93
    write "rsc     %s(rsc)"
    write "psr     %s(psr)"
    write "rsc     %s(rsc)"
    muld %r33, %r33, %r33   ; r33 = r33 * r33
    muld %r33, %r33, %r33   ; squared again: original value to the 4th power
    write "r0=%x128(r0) r33=%i64(r33)"
    write "%m(dump)"
; setspr %r45, psr
    write "function quadrat exited"
    ret
; label marking the first address after quadrat
end_quadrat:

;*****************************************************************
; Example of calling sequence with branch prediction
; Fills r87..r90, prints them under the names arg1..arg3, then calls
; quadrat with r86 as the call register.
callexample:
    alloc 91
    ldi.l %r90, 0x1234567890abcdef
    write "arg3 %x64(r90)"
    ; both srpid instructions take r90 as both sources with the same count
    ; 16, so r89 and r88 receive the same value
    srpid %r89, %r90, %r90, 16
    write "arg2 %x64(r89)"
    srpid %r88, %r90, %r90, 16
    write "arg1 %x64(r88)"
    ldi %r87, 7 ; setup arguments
;   write "%m(dump)"
    write "rsc: %s(rsc)"
    write "function quadrat called"
    call %r86, quadrat
    ; rsc is printed before and after the call for comparison
    write "rsc: %s(rsc)"
; Rest instructions after return from subroutine
;*****************************************************************
.text  ; return to code section

; Here we test registers used by ABI (application binary interface)
; Check loader.
; Prints sp, tp, r0, rsc, psr and a few general registers after the return
; from quadrat, performs some long-immediate arithmetic, then jumps over a
; list of call forms that are only assembled.
    write "sp=%x64(sp) tp=%x64(tp) r0=%x128(r0)"
    write "rsc: %s(rsc)"
    write "psr: %s(psr)"
    write "r14: %x64(r14)"
    write "reta: %i64(r72)" ; out return address
    write "retv: %i64(r73)" ; out return value
    ; second rsc/psr pair; the psr line used to carry the label "rsc:",
    ; which mislabelled the value being printed
    write "rsc: %s(rsc)"
    write "psr: %s(psr)"
    ldi.l %r11, 0x407d8bffffccccff
    write "r11: %x64(r11)"
    addid.l %r12, %r11, 0x400000
    write "r12: %x64(r12)"
    ; r19 has not been assigned in this program; r20 is not printed
    xor %r20, %r19, %r11
    addid.l %r20, %r20, 0x400000
    ldi %r10, 10
    ldi %r11, 11
    ; the compare result in r2 is not printed, only its two operands are
    cmpltsd  %r2, %r11, %r10
    write "%i64(r11) %i64(r10)"
    jmp  call_exit

    ; call forms placed between the jmp above and call_exit with no label
    ; among them: assembled, but not entered by fall-through
    call %r42, quadrat
    callri %r42, %r34, %gz
    callmi %r42, %r34, 468
    callplt %r42, call_data_target
    callri %r42, %r34, %gz

call_exit:
    write "end call test"

.end
.text
; Driver for the call tests: one plain call to func with three values in
; r51..r53, then a call to rekurs with 10 in r53.
    alloc 47
    write "test recursive calls"
    ; the first constant is overwritten immediately; the printed value is
    ; 0x8000000000000000 with 1 subtracted
    ldi.l %r46, 0x7FFFFFFFFFFFFFFF ; comment
    ldi.l %r46, 0x8000000000000000
    addid %r46, %r46, -1
    write "%i64(r46)"

    getspr %r20, %rsc

    alloc 54 ; extend frame to 54 regs
    ldi %r48, 1 ; r48 lies below the call register r50 used next
    ; func prints r1, r2, r3 as its arguments; presumably r51..r53 appear
    ; there under those numbers (r50 becoming r0)
    ldi %r53, 3 ; presumably r3 in func
    ldi %r52, 2 ; presumably r2 in func
    ldi %r51, 1 ; presumably r1 in func
    write "rsc: %s(rsc)"
    call %r50, func ; call func subroutine, save 50 regs
    ; func writes 12345 into its r1 before returning; r51 is printed here
    write "r51=%i64(r51) rsc=%s(rsc)"
    ; r53 follows the call register r52, so it is presumably r1 inside rekurs
    ldi %r53, 10
    call %r52, rekurs
    write "rsc: %s(rsc)"
    write "rsp: %s(rsp)"
;   write "%m(dump)"
    ; skip over the bodies of func and rekurs
    jmp smallend
; func: prints r0 and its three arguments r1..r3, stores 12345 in r1 and
; returns. The caller prints its r51 right after the call.
func:
; at entry point func subroutine has 4 regs in frame
    alloc 8   ; extend frame from 4 to 8 regs
    write "r0      %x128(r0)" ; print packed caller frame and return address
    write "r1=%i64(r1) r2=%i64(r2) r3=%i64(r3)" ; print args
    ldi %r1, 12345
    ret

; rekurs: prints r0/r1 and the rsc/rsp state, then calls itself with r1-1
; placed in r3 (the register after the call register r2) until r1 equals 0.
rekurs:
    alloc 4
    write "r0=%x128(r0) r1=%i64(r1)"
    write "rsc: %s(rsc)"
    write "rsp: %s(rsp)"
    addid %r3, %r1, -1      ; r3 = r1 - 1, value for the nested call
    ldi %r2, 0
    ; stop recursing when r1 == 0 (presumably breqd branches on equal)
    breqd %r1, %r2, rekret
; cneq %r1, %r2, 1, 0
    call %r2, rekurs
rekret:
    write "rsp: %s(rsp)"
    write "r0: %x128(r0)"
    ; NOTE(review): returns with retf 0 whereas func uses plain ret; the
    ; difference is not explained in this file
    retf 0
; landing point of the "jmp smallend" that skips func and rekurs
smallend:
    nop 0
    nop 111
    alloc 96
    write "end_call_recursive"
.end
.text
    ; Minimal call example: prepare three arguments, call simple_func with
    ; r50 as the call register, then jump over the function body.
    ; at the beginning of the program, the register stack is empty
    alloc 54   ; expand frame to 54 registers
    ehadj simple_func_end
    ldi %r47, 1  ; will be saved when called
    ; simple_func sees r50..r53 as r0..r3 (see the notes at its entry) and
    ; prints r1, r2, r3 as its 1st, 2nd and 3rd argument
    ldi %r53, 3  ; becomes r3: printed as the 3rd argument
    ldi %r52, 2  ; becomes r2: printed as the 2nd argument
    ldi %r51, 1  ; becomes r1: printed as the 1st argument
    ; func procedure call, all registers up to 50 will be saved,
    ; return address, eip, frame size (50) are saved in r50
    call %r50, simple_func
    ; at this point, after returning, the caller's frame is in effect again
    ; (54 registers per the alloc above; the original note said 53 - TODO confirm)
    jmp simple_func_end
; simple_func: prints the packed return info in r0 and the three arguments
; r1..r3, then returns.
simple_func:
    ; at the starting point, the func procedure has a 4-register frame
    ; their previous numbers are 50, 51, 52, 53, new - 0, 1, 2, 3
    ; extend the frame to 10 registers (another 4,5,6,7,8,9)
    alloc 10
    write "r0 = %x128(r0)"; print packed return info
    write "r1 = %i64(r1)" ; print 1st argument
    write "r2 = %i64(r2)" ; print 2nd argument
    write "r3 = %i64(r3)" ; print 3rd argument
    ret
; jump target used by the caller to skip the function body; also the operand
; of the ehadj instruction at the start of this program
simple_func_end:
    nop 123
.end
.text
    write "example of carry/borrow testing"
    alloc 96

; 256-bit add (r30,r31,r34,r33) + (r40,r41,r42,r43) => (r50,r51,r52,r53)
; Note: the third word of the first operand lives in r34, not r32.
    ldi  %r30, -1
    ldi  %r31, -1
    ldi  %r34, -1
    ldi  %r33, -1

    ldi  %r40, 1
    ldi  %r41, 0
    ldi  %r42, 0
    ldi  %r43, 0

; throw add
; NOTE(review): the "add carry out" remarks below are original; cmpeqd is
; used as an equality compare elsewhere in this file, so whether r10/r12/r14/r8
; really hold carries is unverified. None of those four registers is read again
; in this program.
    cmpeqd     %r10, %r30, %r40 ; add carry out
    addd      %r50, %r30, %r40 ; add
    cmpeqid   %r12, %r31, 1
    addid  %r51, %r31, 1

    ; r12 and r51 from the two lines above are overwritten here
    cmpeqd %r12, %r31, %r41 ; add carry out
    addd  %r51, %r31, %r41 ; add
    cmpeqd %r14, %r34, %r42 ; add carry out
    addd  %r52, %r34, %r42 ; add
    cmpeqd %r8, %r33, %r43 ; add carry out
    addd %r53, %r33, %r43 ; add
    write "add carryis"
    ; 1 is added unconditionally to each of the upper three result words
    addid %r51, %r51, 1
    addid %r52, %r52, 1
    addid %r53, %r53, 1
; set last carry
    ; r54 is set to 1 and then immediately to 0, so 0 is what gets printed
    ldi  %r54, 1
    ldi  %r54, 0
    write "multiprecision add:\nr50,r51,r52,r53,r54 = %x64(r50) %x64(r51) %x64(r52) %x64(r53) %x64(r54)"

    ; mulh of the largest positive 64-bit value with r41, which still holds
    ; the 0 loaded at the start of this program
    ldi.l %r40, 0x7fffffffffffffff
    mulh %r40, %r40, %r41
    write "r40     %x64(r40)"

    ; short and long ldi forms; r12 is overwritten and not printed
    ldi   %r12, 12345
    ldi.l %r12, 12345678900

; ldi %r14, 0xFFFFFFFFF0
; ld8 %r13, %r14, 0

    write "test unsigned add carry"
    ; operands: r5 = -2, r6 = -1, r7 = -1 (printed as unsigned by the write)
    ldi %r7, -1
    ldi %r5, -2
    ldi %r6, -1
    addc3d %r2, %r5, %r6, %r7
    write "addc3: %u64(r5) %u64(r6) %u64(r7) => %i64(r2)"

    write "test unsigned subtract borrow"
    ; operands: r5 = 12, r6 = -1, r7 = -1
    ldi %r7, -1
    ldi %r5, 12
    ldi %r6, -1
    subc3d %r2, %r5, %r6, %r7
    write "subc3: %u64(r5) %u64(r6) %u64(r7) => %i64(r2)"

    ; remaining forms are only assembled and run; r12 is not printed
    addcd %r12, %r14, %r46
    addcd %r12, %r14, %r46
    subcd %r12, %r14, %r46
    subcd %r12, %r14, %r46
    addc3d %r12, %r14, %r46, %r23
    addc3d %r12, %r14, %r46, %r22
    subc3d %r12, %r14, %r46, %r13
    subc3d %r12, %r14, %r46, %r14
    write "end carry test"
    nop 11111
.end
.text
; Compare-instruction coverage. r20 = 4 and r21 = 3 are the operands of every
; compare in this part; r22 and r23 are loaded but not referenced afterwards.
; Every compare writes r12 and nothing is printed after the banner, so only
; the instruction forms are exercised. Several forms are listed more than once.
    write "test compare"
    alloc 96
    ldi %r20, 4
    ldi %r21, 3
    ldi %r22, -4
    ldi %r23, -12
    write "test compare instructions"

    ; d-suffixed forms: register-register and register-immediate
    cmpeqd %r12, %r20, %r21
    cmpltsd %r12, %r20, %r21
    cmpltud %r12, %r20, %r21
    cmpeqid %r12, %r20, 123456
    cmpltsid %r12, %r20, 123456
    cmpltuid %r12, %r20, 123456
    cmpned %r12, %r20, %r21
    cmpneid %r12, %r20, 123456
    cmpgesid %r12, %r20, 123456
    cmpgeuid %r12, %r20, 123456
    cmpgesd %r12, %r20, %r21
    cmpgeud %r12, %r20, %r21

    cmpgesiw %r12, %r20, 123456
    cmpgeuiw %r12, %r20, 123456
    cmpgesid %r12, %r20, 123456
    cmpgeuid %r12, %r20, 123456

    ; w-suffixed forms, same set of conditions
    cmpeqw %r12, %r20, %r21
    cmpltsw %r12, %r20, %r21
    cmpltuw %r12, %r20, %r21
    cmpeqiw %r12, %r20, 123456
    cmpltsiw %r12, %r20, 123456
    cmpltuiw %r12, %r20, 123456
    cmpnew %r12, %r20, %r21
    cmpneiw %r12, %r20, 123456
    cmpgesiw %r12, %r20, 123456
    cmpgeuiw %r12, %r20, 123456
    cmpgesw %r12, %r20, %r21
    cmpgeuw %r12, %r20, %r21

    cmpltsd %r12, %r20, %r21
    cmpltud %r12, %r20, %r21
    cmpltsid %r12, %r20, 123456
    cmpltuid %r12, %r20, 123456
    cmpltsd %r12, %r20, %r21
    cmpltud %r12, %r20, %r21
    cmpgesid %r12, %r20, 123456
    cmpgeuid %r12, %r20, 123456


    cmpltsw %r12, %r20, %r21
    cmpltuw %r12, %r20, %r21
    cmpltsiw %r12, %r20, 123456
    cmpltuiw %r12, %r20, 123456
    cmpltsw %r12, %r20, %r21
    cmpltuw %r12, %r20, %r21
    cmpgesiw %r12, %r20, 123456
    cmpgeuiw %r12, %r20, 123456

; TESTS
; Further compare forms, including long (.l) immediates. The source
; registers r45, r34, r56, r57 are not assigned in this program and none of
; the destinations (r14, r41, r66, r64) is printed.
    cmpeqd %r14, %r12, %r45
    cmpned %r14, %r12, %r45

    cmpeqd %r14, %r45, %r34
    ; each immediate compare appears with a short immediate (123) and with a
    ; long one (1234567890123)
    cmpeqid %r14, %r45, 123
    cmpeqid.l %r14, %r45, 1234567890123
    cmpltsid %r14, %r45, 123
    cmpltsid.l %r14, %r45, 1234567890123
    cmpgesid %r14, %r45, 123
    cmpgesid.l %r14, %r45, 1234567890123
    cmpltsd %r14, %r45, %r34
    ; the cmpgeuid short/long pair is listed twice
    cmpgeuid %r14, %r45, 123
    cmpgeuid.l %r14, %r45, 1234567890123
    cmpgeuid %r14, %r45, 123
    cmpgeuid.l %r14, %r45, 1234567890123
    cmpltud %r14, %r45, %r34

    cmpeqd %r41, %r34, %r56
    cmpltsd %r66, %r45, %r57
    cmpeqid %r64, %r56, 0
.end
.text
backward_target:
    alloc 61
    addid %r2, %r2, 1
    breqd %r2, %r2, branch_test_exit

    breqd  %r23, %r34, backward_target
    breqd.l  %r23, %r34, backward_target
    breqd  %r23, %r34, forward_target
    breqd.l  %r23, %r34, forward_target
    breqid  %r23,34, backward_target
    breqid.l  %r23,34, backward_target
    breqid  %r23,34, forward_target
    breqid.l  %r23,34, forward_target

    breqw  %r23, %r34, backward_target
    breqw.l  %r23, %r34, backward_target
    breqw  %r23, %r34, forward_target
    breqw.l  %r23, %r34, forward_target
    breqiw  %r23,34, backward_target
    breqiw.l  %r23,34, backward_target
    breqiw  %r23,34, forward_target
    breqiw.l  %r23,34, forward_target

    brned  %r23, %r34, backward_target
    brned.l  %r23, %r34, backward_target
    brned  %r23, %r34, forward_target
    brned.l  %r23, %r34, forward_target
    brneid  %r23,34, backward_target
    brneid.l  %r23,34, backward_target
    brneid  %r23,34, forward_target
    brneid.l  %r23,34, forward_target

    brnew  %r23, %r34, backward_target
    brnew.l  %r23, %r34, backward_target
    brnew  %r23, %r34, forward_target
    brnew.l  %r23, %r34, forward_target
    brneiw  %r23,34, backward_target
    brneiw.l  %r23,34, backward_target
    brneiw  %r23,34, forward_target
    brneiw.l  %r23,34, forward_target

    brgesd  %r23, %r34, backward_target
    brgesd.l  %r23, %r34, backward_target
    brgesd  %r23, %r34, forward_target
    brgesd.l  %r23, %r34, forward_target

    brgesw  %r23, %r34, backward_target
    brgesw.l  %r23, %r34, backward_target
    brgesw  %r23, %r34, forward_target
    brgesw.l  %r23, %r34, forward_target

    brltsd  %r23, %r34, backward_target
    brltsd.l  %r23, %r34, backward_target
    brltsd  %r23, %r34, forward_target
    brltsd.l  %r23, %r34, forward_target
    brltsid  %r23,34, backward_target
    brltsid.l  %r23,34, backward_target
    brltsid  %r23,34, forward_target
    brltsid.l  %r23,34, forward_target

    brltsw  %r23, %r34, backward_target
    brltsw.l  %r23, %r34, backward_target
    brltsw  %r23, %r34, forward_target
    brltsw.l  %r23, %r34, forward_target
    brltsiw  %r23,34, backward_target
    brltsiw.l  %r23,34, backward_target
    brltsiw  %r23,34, forward_target
    brltsiw.l  %r23,34, forward_target

    brgeuid  %r23,34, backward_target
    brgeuid.l  %r23,34, backward_target
    brgeuid  %r23,34, forward_target
    brgeuid.l  %r23,34, forward_target

    brgeuiw  %r23,34, backward_target
    brgeuiw.l  %r23,34, backward_target
    brgeuiw  %r23,34, forward_target
    brgeuiw.l  %r23,34, forward_target

    brgeud%r23, %r34, backward_target
    brgeud.l  %r23, %r34, backward_target
    brgeud  %r23, %r34, forward_target
    brgeud.l  %r23, %r34, forward_target

    brgeuw  %r23, %r34, backward_target
    brgeuw.l  %r23, %r34, backward_target
    brgeuw  %r23, %r34, forward_target
    brgeuw.l  %r23, %r34, forward_target

    brltud  %r23, %r34, backward_target
    brltud.l  %r23, %r34, backward_target
    brltud  %r23, %r34, forward_target
    brltud.l  %r23, %r34, forward_target

    brltuw  %r23, %r34, backward_target
    brltuw.l  %r23, %r34, backward_target
    brltuw  %r23, %r34, forward_target
    brltuw.l  %r23, %r34, forward_target

    brgeuid  %r23,34, backward_target
    brgeuid.l  %r23,34, backward_target
    brgeuid  %r23,34, forward_target
    brgeuid.l  %r23,34, forward_target

    brgeuiw  %r23,34, backward_target
    brgeuiw.l  %r23,34, backward_target
    brgeuiw  %r23,34, forward_target
    brgeuiw.l  %r23,34, forward_target

    brmall  %r23, 34, backward_target
    brmall.l  %r23, 34, backward_target
    brmall  %r23, 34, forward_target
    brmall.l  %r23, 34, forward_target

    brmnotall  %r23, 34, backward_target
    brmnotall.l  %r23, 34, backward_target
    brmnotall  %r23, 34, forward_target
    brmnotall.l  %r23, 34, forward_target

    brmany   %r23, 34, backward_target
    brmany.l %r23, 34, backward_target
    brmany   %r23, 34, forward_target
    brmany.l %r23, 34, forward_target

    brmnone   %r23, 34, backward_target
    brmnone.l %r23, 34, backward_target
    brmnone   %r23, 34, forward_target
    brmnone.l %r23, 34, forward_target

forward_target:
branch_test_exit:

    jmp branch_exit

label:
    breqd %r12, %r13, qwe
    srpid %r10, %r11, %r12, 45
    depositv %r61, %r91, %r32, %r10
    bitslct %r62, %r91, %r32, %r10
    vpermb %r63, %r91, %r32, %r10
qwe:
    brned %r15, %r46, label
    breqd %r25, %r45, label
    brltsd %r25, %r44, label
    brgesd %r35, %r43, label
    brltud %r55, %r76, label
    brneid %r55, 140, label
    breqid %r65, 141, label
    brltsid %r65, 142, label
    brltuid %r75, 170, label
    brgeuid %r85, 160, label

    addid.l %r45, %r34, 1234
    brbsi %r85, 26, label
    brbci.l %r85, 36, label
    brbsi %r95, 46, label
    brbci.l %r95, 56, label

    jmpr %r45, %r23, 1
branch_exit:
    write  "end branch test"
.end
.text
; Test-bit-and-branch example.
; Exercises the bit-test branches with the bit number given as an immediate
; (brbci/brbsi) and in a register (brbc/brbs).
    alloc 61
    write "Example of test bit and branch"
    ldi %r19, 0x20
    ldi %r20, 12+3              ; operand is a constant expression
    write "%i64(r20)"
    ldi %r10, 0
    ; r10 is zero, so bit 10 is clear; presumably brbci ("bit clear,
    ; immediate") branches over the block below - TODO confirm semantics.
    brbci %r10, 10, xxx_n
    ldi.l %r20, 123456789012345
    ldi %r21, 321
    addd %r23, %r20, %r21
    ; Print the sum that was just computed. The previous revision printed
    ; r43, which is never written anywhere in this program.
    write "%i64(r23)"
xxx_n: write "%i64(r23)"

    ; Same test value, bit 56: immediate forms first, then register forms
    ; with the bit number held in r56. All four target the exit label.
    ldi %r46, 0xabcdef
    brbci %r46, 56, branch_bit_exit
    brbsi %r46, 56, branch_bit_exit
    ldi %r56, 56
    brbc %r46, %r56, branch_bit_exit
    brbs %r46, %r56, branch_bit_exit

branch_bit_exit:
    write  "end branch_bit test"
.end
.text
; CPUID enumeration test.
; The first cpuid query (index register r13 = 0) is printed as the table
; length and kept in r14 as the loop bound; the loop then prints one cpuid
; value per index held in r13.
    write "cpuid implemented number"
    alloc 96
    ldi %r13, 0
    cpuid %r14, %r13, 0
    write "cpuid len %x64(r14)"
    write "cpuid loop"
cpuid_loop:
    cpuid %r15, %r13, 0
    write "cpuid[%i64(r13)] = %x64(r15)"
    ; presumably: r13 += 1 and repeat while r13 < r14 - TODO confirm
    repltd %r13, %r14, 1, cpuid_loop
.end
.rodata
    align 16
; Input string for the CRC32C check below.
crc32c_test_string:
    ascii "The quick brown fox jumps over the lazy dog" ; 43 bytes
.text
; CRC32C test.
; Prints the expected checksum first, then folds the 43-byte string into the
; running value in r12 and prints the computed result for comparison.
; The string is consumed at offsets 0, 16, 32, 40 and 42, i.e. in pieces of
; 16 + 16 + 8 + 2 + 1 = 43 bytes.
    write "crc32c = 0x22620404 (expected)"
    alloc 20
    ldi %r12, -1  ; crc32c initial value
    ldard %r11, crc32c_test_string  ; r11 = base address of the string
    ldq %r13, %r11, 0
    crc32cq %r12, %r12, %r13
    ldq %r13, %r11, 16
    crc32cq %r12, %r12, %r13
    ldud %r13, %r11, 32
    crc32cd %r12, %r12, %r13
    lduh %r13, %r11, 40
    crc32ch %r12, %r12, %r13
    ldub %r13, %r11, 42
    crc32cb %r12, %r12, %r13
    xori %r12, %r12, -1  ; final inversion of the running value
    write "crc32c = 0x%x32(r12) (computed)"
.end
.text
; Addressing test.
; Starts with a few encoding-only instructions (scaled-index address
; computation, long immediate load, double move), then shows how to reach
; read-only data through an IP-relative base plus the text_lo() part of a
; label address.
    alloc 61
    ldaxsd %r41, %r40, %r12, 4, 52
    ldaxsd %r41, %r40, %r12, 3, -12
    ldaxsd %r41, %r40, %r12, 4, 52
    ldi.l %r5, -1
    ; presumably moves r4->r3 and r3->r4 at once (a swap) - TODO confirm
    mov2 %r3, %r4, %r4, %r3
    mov2 %r3, %r4, %r4, %r3


.rodata  ; open the rodata (read-only data) section
    align 16
text_lbl: ; label for the three bytes read back below
    d1 111 ; signed byte
    d1 112
    d1 113
ddd:
    align 4 ; force 4-byte alignment for next data
    d1 6
    d1 7
    d1 8+0x3D ; constant expressions are accepted as data values

.text
    write "test addressing"

; Examples of IP-relative references: low part, high part and the full
; label value, each loaded as an immediate and printed.
    ldi %r45, text_lo(text_lbl)
    write "text_lo(text_lbl)=%i64(r45)"
    ldi %r45, text_hi(text_lbl)
    write "text_hi(text_lbl)=%i64(r45)"
    ldi %r45, text_lbl
    write "%i64(r45)"

; Example of access to the read-only data.
; First get an IP-relative reference to the label (+/- 64 MB from IP).
    ldarc %r45, text_lbl

; r45 now holds a base address.
; It is NOT the true address of 'text_lbl': it is the nearest lower
; address that is aligned on a 16-byte boundary.
; Remember to add the 'text_lo' part of the label address to every
; displacement.
    ldub %r50, %r45, text_lo(text_lbl)+0
    ldub %r51, %r45, text_lo(text_lbl)+1
    ldub %r52, %r45, text_lo(text_lbl)+2
    write "%i64(r50)" ; must be 111
    write "%i64(r51)" ; must be 112
    write "%i64(r52)" ; must be 113

; Example of incorrect access (displacement without the text_lo part).
    ldub %r50, %r45, 0
    write "%i64(r50)" ; original note: must be 101 - start of 16-byte portion (NOTE(review): text_lbl follows 'align 16' and its first byte is 111 - confirm the expected value)
.end
.text
; Section switching test.
; Interleaves .text, .data and .rodata fragments to check that the assembler
; accumulates each section correctly, then does a store/load round trip
; through a .data table and a few arithmetic instructions on the result.
    alloc 96
    addid %r20, %gz, 128
    addid %sp, %sp, -32         ; reserve 32 bytes below the stack pointer
    ldi.l %r12, 0x07060504030201
    std %r12, %sp,0             ; store the byte pattern at the new stack top

.data
    ascii "data section marker"
    align 8
.rodata
    ascii "rodata section marker"
    align 8

.data
    d2 1234
first_byte:
    d1 12
.text
    ldard %r22, first_byte      ; address of a .data label from code

; test interval time mask
    ldi %r22, 0xFFFFFFFFFFFFFFFF
    ldi %r15, 11

.rodata  ; open rodata (read-only data) section
    align 8
text_begin: ; this is label
    d8 1 ; signed 8-bytes
    d8 -2
    d1 101 ; signed byte
    d1 102
    d1 103
    align 4
    d4 10000 ; signed 4byte
    d2 10000 ; signed 2byte
    space 4 ; insert zeroed bytes
    d2 20000
.data  ; open data (read-write) section
    align 8
eexxx: d8 12345678 ; signed 8-byte
    d8 1234567890
ssxxx: d8 123456789012
    d8 12345678901234
.rodata
    d4 4555 ; signed 4-byte
    d2 4555 ; signed 2-byte
    align 8
    d8 11
text2:
.text ; open code (read-execute) section

.data ; switch to data section
    d1 120
    align 2
    d2 13400
align 8
dataname:
    d4 654321890
    d4 654321890
    d8 1234545345345
    d8 6789023356977
align 8
; Eight 8-byte slots; slot 3 is overwritten and read back below.
someplaceindata:
    d8 0x0000000000000001
    d8 0x0000000000000002
    d8 0x0000000000000003
    d8 0x0000000000000004
    d8 0x0000000000000005
    d8 0x0000000000000006
    d8 0x0000000000000007
    d8 0x0000000000000008
.text
    ldard %r11, someplaceindata
    ldi.l %r15, 987777777777
    ldi %r46, 100000
    ; store r46 into slot 3 (byte offset 8*3) and load it back
    std %r46, %r11, 8*3
    ldud %r46, %r11, 8*3
    write "%i64(r46)"
    muld %r18, %r15, %r46
    addd %r17, %r15, %r46
    andn %r17, %r15, %r46       ; overwrites the sum written to r17 just above
    cmpltsd %r12, %r17, %r15
    write "%i64(r15) %i64(r46) %i64(r17)"
    addid %r17, %r17, 22
    write "%i64(r17) %i64(r17)"
    getspr %r27, %itc           ; read special register itc into r27
    write "itc: %x64(r27)"
    write "%m(dump)"
.end
.text
; Dense call test.
; Expands the register frame, points eip/reip at the end label, then calls
; three small procedures in a row, each with a different link register.
    ; at the beginning of the program, the register stack is empty
    alloc 54   ; expand frame to 54 registers
    ldarc %r4, dense_call_test_end
    setspr %r4, %eip
    setspr %r4, %reip
    ldi %r47, 1  ; will be saved when called
    ldi %r53, 3  ; first argument
    ldi %r52, 2  ; second argument
    ldi %r51, 1  ; third argument
    ; Procedure calls. The first operand names the link register of the call
    ; (r48, r50 and r52 below).
    ; Original note, written for r50: all registers up to 50 will be saved;
    ; return address, eip and frame size (50) are saved in r50.
    ; NOTE(review): presumably the same applies to r48/r52 with their own
    ; frame sizes - confirm.
check_label:
    call %r48, simple_func_1
    call %r50, simple_func_2
    call %r52, simple_func_3

    jmp dense_call_test_end

; Each callee allocates its own 10-register frame, prints its name, returns.
simple_func_1:
    alloc  10
    write  "simple_func_1"
    ret

simple_func_2:
    alloc  10
    write  "simple_func_2"
    ret

simple_func_3:
    alloc  10
    write  "simple_func_3"
    ret

; Landing pad: also the address loaded into eip/reip above.
dense_call_test_end:
    nop 123
    nop 123
    nop 123
    nop 123
    nop 123
    nop 123
.end
.text
; Bit-field deposit test.
; Runs deposit on two 64-bit patterns (0xaa.. and 0xee..) in both operand
; orders with the same two immediates (40, 24), printing each result, then
; repeats with a wider field specification and a 128-bit print.
    write "test bit-field insert (deposit)"
    alloc 96
    ldi.l %r30, 0xaaaaaaaaaaaaaaaa
    ldi.l %r40, 0xeeeeeeeeeeeeeeee
    deposit %r20, %r30, %r40, 40, 24
    write "dep: %x64(r20)"
    deposit %r20, %r40, %r30, 40, 24
    write "dep: %x64(r20)"

    write "test vector deposit (dep16)"
    ; r3 = ~(r4 | r4); r4 itself is not written in this program before this
    ; point, so r3 is the complement of whatever r4 holds.
    nor %r3, %r4, %r4
    ; NOTE(review): the immediates (100, 40) exceed a 64-bit register width;
    ; presumably this form operates on the full 128-bit register - confirm.
    deposit %r5, %r3, %r4, 100, 40
    write "dep16: %x128(r5)"
    write "end deposit test"
.end

.text
; Device-control memory-mapped register test.
; r24 holds the device configuration base address; each register is read and
; printed, and the DID and CMD registers are additionally written with a test
; pattern and read back.
    write "test control device memory-mapped registers"
    alloc 96

    ; device_control base address
    ldi.l %r24, DEVICE_CONFIG_VIRT_BASE

    write "test pci"

    ldi.l %r21, 0x1234567890abcdef  ; test pattern for the write/read-back checks

    ; DID: read, overwrite with the pattern, read back
    ldud %r20, %r24, DEVICE_CONTROL_DID
    write "mem[DEVICE_CONTROL_DID] %x64(r20)"
    std %r21, %r24, DEVICE_CONTROL_DID
    ldud %r20, %r24, DEVICE_CONTROL_DID
    write "mem[DEVICE_CONTROL_DID] %x64(r20)"

    ; CMD: read, overwrite with the pattern, read back
    ldud %r20, %r24, DEVICE_CONTROL_CMD
    write "mem[DEVICE_CONTROL_CMD] %x64(r20)"
    std %r21, %r24, DEVICE_CONTROL_CMD
    ldud %r20, %r24, DEVICE_CONTROL_CMD
    write "mem[DEVICE_CONTROL_CMD] %x64(r20)"

    ; Array address: read-only check. The format specifier was missing here,
    ; so the text "(r20)" was printed instead of the register value.
    ldud %r20, %r24, DEVICE_CONTROL_ARRAY_ADDRESS
    write "mem[DEVICE_CONTROL_ARRAY_ADDRESS] %x64(r20)"

    ldud %r20, %r24, DEVICE_CONTROL_ARRAY_LEN
    write "mem[DEVICE_CONTROL_ARRAY_LEN] %i64(r20)"

    ; NOTE(review): unquoted escape used as an immediate; presumably loads the
    ; newline character code - confirm the assembler accepts this form.
    ldi  %r22, \n

    write "test command"
    ldi.l %r21, 0xabcdef1234567890
    std %r21, %r24, DEVICE_CONTROL_CMD

    write "end_device_control_test"
.end
.text
; Memory-map test: core configuration space, ROM, video command registers
; and video memory. Prints the mapping constants, performs a few stores and
; loads through each window, then fills video memory byte by byte.
    write "test core mapping DEVICE_CONFIG_VIRT_BASE"
    alloc 96
    ; print the mapping constants used below
    ldi.l %r20, DEVICE_CONFIG_VIRT_BASE
    write "DEVICE_CONFIG_VIRT_BASE: %x64(r20)"
    ldi.l %r20, DEVICE_CONFIG_SPACE_SIZE
    write "DEVICE_CONFIG_SPACE_SIZE: %x64(r20)"
    ldi.l %r20, CONFIG_OFFSET_CORE_0
    write "CONFIG_OFFSET_CORE_0: %x64(r20)"
    ldi.l %r20, DEVICE_CORE_TIMECMP
    write "DEVICE_CORE_TIMECMP: %x64(r20)"

    ; r20 = address of the core 0 configuration space
    ldi.l %r20, DEVICE_CONFIG_VIRT_BASE + CONFIG_OFFSET_CORE_0 * DEVICE_CONFIG_SPACE_SIZE ; core config
    ldi %r19, 0xabcdef

    write "test interrupt vector %x64(r20)"
    std %r19, %r20, DEVICE_CORE_TIMECMP ; use DEVICE_CORE_INTERRUPT_VECTOR in place of DEVICE_CORE_TIMECMP for real interrupt

    write "test timecmp"
    std %r19, %r20, DEVICE_CORE_TIMECMP

    write "test rom mapping ROM_VIRT_BASE"
    ldi.l %r20, ROM_VIRT_BASE
    ldud %r19, %r20, 0          ; first 8 bytes of the ROM window
    write "mem[ROM_VIRT_BASE] %x64(r19)"

    write "test video commands VIDEO_COMMAND_VIRT_BASE"
    ldi.l %r20, VIDEO_COMMAND_VIRT_BASE
    ldi %r21, 0x1234
    stw %r21, %r20, 0x88 ; clear
    stw %r21, %r20, 0x8c ; redraw

    write "video width/height base: %x64(r20)"
    lduw %r21, %r20, 0x80 ; width
    lduw %r22, %r20, 0x84 ; height
    write "width=%i64(r21) heigth=%i64(r22)"

    write "test video memory VIDEO_VIRT_BASE"
    ldi.l %r20, VIDEO_VIRT_BASE
    write "r20     %x64(r20)"

    ldi.l %r25, 0x12345678
    stw %r25, %r20, 0

    ; Fill loop: r24 = row counter (bound r22 = height), r23 = column counter
    ; (bound r21 = width), r20 = running byte address, advanced by 1 per
    ; store. The low byte of r25 is written at every position.
    ldi %r24, 0   ; y
loop_y: (64)
; write "%i64(r24)"
    ldi %r23, 0   ; x
loop_x:
; add %r25, %r23, %r24
    stb %r25, %r20, 0
    addid %r20, %r20, 1
    addid %r23, %r23, 1
    brltsd %r23, %r21, loop_x

    addid %r24, %r24,1
    brltsd %r24, %r22, loop_y
    ; debug
    write "end test video memory"
    nop 1234567
.end
.text
; Exception handling test.
; Two "objects" are constructed (r4 = 1, r5 = 2); after each construction a
; cleanup landing pad is registered with ehadj. An exception is then thrown,
; and the landing pads are expected to run in reverse construction order
; (destructor 2, destructor 1) before control reaches the final catch block.
    write "begin exception test"
    alloc 96

    ; initial handler address: the final catch block
    ldard %r2, catch
    setspr %r2, %eip

; constructor 1 (object state lives in r4)
    ldi %r4, 1
    ehadj call_destructor_1
    write "eip: %s(eip)"
; constructor 2 (object state lives in r5)
    ldi %r5, 2
    ehadj call_destructor_2
    write "eip: %s(eip)"

    ldi %r3, 0xFFFFFFFFFFFF1230
    ehthrow %r3, 0    ; set eca, jump to eip
    write "normal execution (never occurs)"

call_destructor_2:
    write "call_destructor_2"
    ehcatch %r6, end_destructor_2
    ; here dtor called: release the object built by constructor 2.
    ; (The previous revision cleared r4 here and r5 in destructor 1, i.e. the
    ; two destructor bodies were swapped relative to their constructors.)
    ldi %r5, 0
end_destructor_2:
    ehnext %r6, call_destructor_1
    write "normal continue after destructor_2"

call_destructor_1:
    write "call_destructor_1"
    ehcatch %r6, end_destructor_1
    ; here dtor called: release the object built by constructor 1
    ldi %r4, 0
end_destructor_1:
    ehnext %r6, catch
    write "normal continue after destructor_1"

call_ret:
    write "normal exit"
    jmp exception_exit

catch:
    write "caught exception, exit"
    ehcatch %r12, exception_exit
    write "caught exception context: r12=%x64(r12)"
exception_exit:
    nop 1234567
    nop 7654321
.end
.text
; floating-point extension example
; This first part loads f128 constants through the IP-relative form (fldqr)
; and the immediate form (fldid), cycles the fpcr register through the values
; 3, 2, 1, 0 (labelled "rm" in the messages), and prints the loaded constants
; in hex and decimal for comparison.
    alloc 96

    write "test f128 ld rel"
    fldqr %r12, 3.1415926115461431423612436243
    write "f128 ld rel: %f128(r12)"

    ; write fpcr with 3, 2, 1, 0 in turn and print it after each write
    write "test fpcr modification (rm=3)"
    ldi  %r2, 3
    setspr %r2, %fpcr
    write "fpcr: %s(fpcr)"
    write "test fpcr modification (rm=2)"
    ldi  %r2, 2
    setspr %r2, %fpcr
    write "fpcr: %s(fpcr)"
    write "test fpcr modification (rm=1)"
    ldi  %r2, 1
    setspr %r2, %fpcr
    write "fpcr: %s(fpcr)"
    write "test fpcr modification (rm=0)"
    ldi  %r2, 0
    setspr %r2, %fpcr
    write "fpcr: %s(fpcr)"

    ; same mantissa digits loaded via fldqr (here with exponent e+400) and via
    ; fldid, printed side by side
    write "compare f128 ld rel (full mantissa) & long f128 ld imm (63-bit mantissa)"
    fldqr %r30, 3.14159265358979323846123456789012e+400
    write "f128 ld rel: %x128(r30) %f128(r30)"
    fldid %r31, 3.14159265358979323846123456789012
    write "f128 ld imm: %x128(r31) %f64(r31)"
    write "compare f128 ld rel (full mantissa) & short f128 ld imm (21-bit mantissa)"
    fldqr %r30, 3.14159265358979323846123456789012
    write "r30     %x128(r30)"
    fldid %r31, 3.14159265358979323846123456789012
    write "r31     %x128(r31)"
    ; the "before"/"after" markers bracket the two decimal prints
    write "before1"
    write "r30     %f128(r30)"
    write "before2"
    write "r31     %vf64(r31)"
    write "after"
    fldid %r30, -12.3456789e+04
.rodata
    align 16
; Three f64 constants used as operands for the f128 arithmetic tests.
float64data:
    double 1.234567890123456789124141241241
    double 3.1415925678888734535345231234564561
    double 3.4566345634563456346535463463456
.text
; f128 arithmetic tests.
; r11..r13 are loaded as f64 from float64data and widened to f128 with
; fdtoq; r14 is an f128 constant. Each result is printed right after the
; instruction that produces it.
    ldarc %r21, float64data
    ldud %r11, %r21, 8*0
    ldud %r12, %r21, 8*1
    ldud %r13, %r21, 8*2
    write "ld8(f64): %f64(r11) %f64(r12) %f64(r13)"
    fldqr %r14, 2.7182818289201
    write "f128 ld rel: %f128(r14)"

    ; widen the three f64 values in place
    fdtoq %r11, %r11
    fdtoq %r12, %r12
    fdtoq %r13, %r13

    write "test binary"
    fmulq %r15, %r11, %r14
    write "f128 mul:  %f128(r15)"
    fnmulq %r15, %r11, %r14
    write "f128 nmul: %f128(r15)"
    faddq %r15, %r11, %r14
    write "f128 add:  %f128(r15)"
    fnaddq %r15, %r11, %r14
    write "f128 nadd: %f128(r15)"
    fsubq %r15, %r14, %r11
    write "f128 subs:  %f128(r15)"
    fdivq %r15, %r14, %r11
    write "f128 div:  %f128(r15)"

    write "test fused fma"
;   jmp skipfma
    fmaddq %r15, %r14, %r11, %r12
    write "f128 madd:  %f128(r15)"
    fnmaddq %r15, %r14, %r11, %r12
    write "f128 nmadd: %f128(r15)"
    fmsubq %r15, %r14, %r11, %r12
    write "f128 msub:  %f128(r15)"
    fnmsubq %r15, %r14, %r11, %r12
    write "f128 nmsub: %f128(r15)"

    ; unary operations on the last fused result (r15) and on r12
    write "test unary"
    mov  %r16, %r15
    write "r16     %f128(r16)"
    fabsq %r16, %r15
    write "r16     %f128(r16)"
    fnegq %r16, %r15
    write "r16     %f128(r16)"
    fnabsq %r16, %r15
    write "r16     %f128(r16)"
    fsqrtq %r16, %r12
    write "r16     %f128(r16)"
    frsqrtq %r16, %r12
    write "r16     %f128(r16)"

    ; round r12 with each explicit rounding mode
    write "test rounding"
    frndq %r17, %r12, rtz
    write "r17     %f128(r17)"
    frndq %r17, %r12, rdn
    write "r17     %f128(r17)"
    frndq %r17, %r12, rup
    write "r17     %f128(r17)"
    frndq %r17, %r12, rne
    write "r17     %f128(r17)"
    fqtoiw %r17, %r12, rtz
    write "r17     %i64(r17)"
    ; Integer -> f128 round trip: convert the value that was just loaded.
    ; The previous revision converted r7, which is not written in this test,
    ; so the loaded 123456 was never used.
    ; NOTE(review): another fiwtoq use in this file passes a rounding-mode
    ; operand; confirm whether it is optional.
    ldi %r17, 123456
    fiwtoq %r17, %r17
    write "r17     %f128(r17)"

    write "test fp minmax"
    fmaxq %r8, %r11, %r12
    write "r8      %f128(r8)"
    fminq %r8, %r11, %r12
    write "r8      %f128(r8)"
    write "test fp abs minmax"
    famaxnmq %r8, %r11, %r12
    write "r8      %f128(r8)"
    faminnmq %r8, %r11, %r12
    write "r8      %f128(r8)"

    write "test f128 merge"
    fmergeq %r8, %r11, %r12, %r14
    write "r8      %f128(r8)"
    fmergeq %r8, %r14, %r11, %r12
    write "r8      %f128(r8)"


.rodata
    align 16
; Two f64 constants loaded into r25/r26 below.
xxxd: double 1.122
    double 0.9999765432
.text
; Repeated fused multiply-add/sub loop, followed by a long list of FPU,
; load/store and conversion instructions. Most of the list only prints
; nothing and uses arbitrary register numbers, i.e. it exercises instruction
; encoding rather than checking values.
    ldarc %r21, xxxd
    ldi  %r15, 100
    ldud %r25, %r21, 8*0
    ldud %r26, %r21, 8*1
    fsubq %r22, %r25, %r16
    write "r22     %f128(r22)"
xxloop:
    fmaddq %r22, %r25, %r16, %r22
    fmsubq %r22, %r25, %r16, %r22
    ; loop control on r15 against the zero register with step 1
    ; NOTE(review): exact counting direction/termination not visible here
    repged %r15, %gz, 1, xxloop
    write "r22     %f128(r22)"

    write "other FPU"
    ; fused forms
    fmaddq  %r60, %r61, %r62, %r63
    fmsubq  %r61, %r61, %r72, %r73
    fnmaddq %r62, %r71, %r82, %r63
    fnmsubq %r63, %r81, %r12, %r53

    ; two-operand arithmetic and min/max
    fmulq %r64, %r61, %r22
    fdivq %r65, %r11, %r27
    faddq %r66, %r17, %r42
    fsubq %r67, %r31, %r23
    fnaddq %r68, %r41, %r62
    fmaxq %r60, %r61, %r62
    fminq %r60, %r61, %r62
    famaxnmq %r60, %r61, %r62
    faminnmq %r60, %r61, %r62

    ; comparisons writing a general register
    fcmpoltq %r10, %r61, %r72
    fcmpogeq %r11, %r52, %r21
    fcmpogeq %r12, %r43, %r12
    fcmpoeqq %r10, %r34, %r44
    fcmpueqq %r13, %r25, %r22
    fcmpugeq %r12, %r15, %r23
    fcmpuq %r11, %r86, %r86

    ; unary, absolute difference, rounding, square roots
    fnegq %r24, %r58
    fabdq %r45, %r61, %r20
    fnabdq %r56, %r32, %r20
    frndq %r78, %r74, rmm
    frndq %r89, %r65, rtz
    frndq %r81, %r76, rdn
    frndq %r62, %r67, rup
    fsqrtq %r63, %r78
    frsqrtq %r64, %r69

    ; r45 = scratch area 4800 bytes below the stack pointer, r13 = index
    addid %r45, %sp,-4800
    ldi %r13, 2

    ; load/store round trips: base+displacement, then scaled-index forms
    lduw %r12, %r45, 4*1
    stw %r12, %r45, 4*1
    ldud %r12, %r45, 8*3
    std %r12, %r45, 8*3
    lduwxsd %r12, %r45, %r13, 2, 60
    stwxsd %r12, %r45, %r13, 2, 60
    ldudxsd %r12, %r45, %r13, 3, 60
    stdxsd %r12, %r45, %r13, 3, 60

    ; dense run of fused multiply-adds with varying register operands
    faddq %r23, %r24, %r25
    fmaddq %r23, %r60, %r55, %r33
    fmulq %r23, %r60, %r55
    ldud %r60, %r45, 8*6
    fmaddq %r23, %r60, %r55, %r33
    fmaddq %r24, %r61, %r25, %r32
    fmaddq %r25, %r62, %r55, %r23
    fmaddq %r26, %r63, %r75, %r73
    fmaddq %r27, %r64, %r75, %r73
    fmaddq %r28, %r65, %r85, %r63
    fmaddq %r29, %r66, %r85, %r63
    fmaddq %r30, %r67, %r55, %r23
    fmaddq %r31, %r68, %r55, %r23
    fmaddq %r12, %r32, %r76, %r85
    fmaddq %r12, %r32, %r76, %r85
    fmaddq %r10, %r32, %r76, %r85
    fmaddq %r10, %r32, %r76, %r85
    fmaddq %r10, %r32, %r76, %r85
    fmaddq %r13, %r32, %r76, %r85
    fmaddq %r14, %r32, %r76, %r85
    fmaddq %r15, %r32, %r76, %r85
    fmaddq %r16, %r32, %r76, %r85
    fmaddq %r17, %r32, %r76, %r85

    ; f128 <-> integer conversions, all with rounding mode rmm
    fqtoiw %r56, %r45, rmm
    fqtouw %r56, %r45, rmm
    fiwtoq %r45, %r56, rmm
    fuwtoq %r45, %r56, rmm

    ; Iterative sequence: with r4 = 1.0, each pass does r5 += r4,
    ; r6 *= r5, r7 = r4 / r6, and prints r7 (in hex) at the top of the pass.
    ; r3 is the loop counter and r24 = 128 the bound used by repled.l.
    ldi  %r3, 0
    fldqr %r4, 1.0
    fldqr %r5, 1.0
    fldqr %r6, 1.0
    fldqr %r7, 1.0
    ldi  %r24, 128
tri_repeat:
    write "r7      %x128(r7)"
    faddq %r5, %r5, %r4
    fmulq %r6, %r6, %r5
    fdivq %r7, %r4, %r6
;   write "%x128(r6)"
    repled.l %r3, %r24, 1, tri_repeat

    ; Series evaluation skeletons for sin, cos and exp.
    ; NOTE(review): nothing in this section loads the series coefficients;
    ; the addend registers (r25, r23, ... r1) hold whatever earlier code left
    ; in them, and some chains use r3/r4/r2 themselves as addends. The
    ; "1/25!"-style comments therefore describe intent, not register
    ; contents; the printed sums are presumably not expected to match the
    ; reference values - confirm.
    write "test taylor series"
    fldqr %r2, 0.44567 ; f2 ,  x
    write "x:   %f128(r2)"  ; test value
    write "test sin(x)"
    fldqr %r5, sin(0.44567)     ; reference value computed by the assembler
    write "sin: %f128(r5)"  ; test value
    ldi  %r3, 0  ; s ,  0
    fmulq %r4, %r2, %r2 ; f4 ,  x*x
    fmaddq %r3, %r3, %r4, %r25 ; s ,  s * x*x + 1/25!
    fmsubq %r3, %r3, %r4, %r23 ; s ,  s * x*x - 1/23!
    fmaddq %r3, %r3, %r4, %r21
    fmsubq %r3, %r3, %r4, %r19
    fmaddq %r3, %r3, %r4, %r17
    fmsubq %r3, %r3, %r4, %r15
    fmaddq %r3, %r3, %r4, %r13
    fmsubq %r3, %r3, %r4, %r11
    fmaddq %r3, %r3, %r4, %r9
    fmsubq %r3, %r3, %r4, %r7
    fmaddq %r3, %r3, %r4, %r5
    fmsubq %r3, %r3, %r4, %r3
    fmaddq %r3, %r3, %r4, %r1
    fmulq %r3, %r3, %r2 ; s ,  s * x
    write "sin: %f128(r3)"

    write "test cos(x)"
    fldqr %r5, cos(0.44567)     ; reference value computed by the assembler
    write "cos: %f128(r5)"  ; test value
    ldi  %r3, 0  ; s ,  0
    fmulq %r4, %r2, %r2 ; f4 ,  x*x
    fmsubq %r3, %r3, %r4, %r26
    fmaddq %r3, %r3, %r4, %r24
    fmsubq %r3, %r3, %r4, %r22
    fmaddq %r3, %r3, %r4, %r20
    fmsubq %r3, %r3, %r4, %r18
    fmaddq %r3, %r3, %r4, %r16
    fmsubq %r3, %r3, %r4, %r14
    fmaddq %r3, %r3, %r4, %r12
    fmsubq %r3, %r3, %r4, %r10
    fmaddq %r3, %r3, %r4, %r8
    fmsubq %r3, %r3, %r4, %r6
    fmaddq %r3, %r3, %r4, %r4
    fmsubq %r3, %r3, %r4, %r2
    fmaddq %r3, %r3, %r4, %r1
    write "cos: %f128(r3)"

    write "test exp(x)"
    fldqr %r5, exp(0.44567)     ; reference value computed by the assembler
    write "exp: %f128(r5)" ; test value
    ldi %r3, 0 ; s ,  0.0
    mov %r4, %r2 ; f4 ,  x
    fldid %r6, 0.125
;   write "%f128(r6)"
    fmulq %r4, %r4, %r6 ; x ,  x/8
    fmaddq %r3, %r3, %r4, %r15
    fmaddq %r3, %r3, %r4, %r14
    fmaddq %r3, %r3, %r4, %r13
    fmaddq %r3, %r3, %r4, %r12
    fmaddq %r3, %r3, %r4, %r11
    fmaddq %r3, %r3, %r4, %r10
    fmaddq %r3, %r3, %r4, %r9
    fmaddq %r3, %r3, %r4, %r8
    fmaddq %r3, %r3, %r4, %r7
    fmaddq %r3, %r3, %r4, %r6
    fmaddq %r3, %r3, %r4, %r5
    fmaddq %r3, %r3, %r4, %r4
    fmaddq %r3, %r3, %r4, %r3
    fmaddq %r3, %r3, %r4, %r2
    fmaddq %r3, %r3, %r4, %r1
    fmaddq %r3, %r3, %r4, %r1
    ; three squarings undo the x/8 scaling applied above
    fmulq %r3, %r3, %r3 ; (e^x) ^ 8
    fmulq %r3, %r3, %r3
    fmulq %r3, %r3, %r3
    write "exp: %f128(r3)"

    faddq %r1, %r2, %r3
    fmaddq %r2, %r10, %r20, %r30
    fmaddq %r1, %r11, %r21, %r31

    ; classification: each precision (h, s, d, q) with two class masks,
    ; 120 and 1023
    fclassh %r4, %r5, 120
    fclasss %r4, %r5, 120
    fclassd %r4, %r5, 120
    fclassq %r4, %r5, 120

    fclassh %r4, %r5, 1023
    fclasss %r4, %r5, 1023
    fclassd %r4, %r5, 1023
    fclassq %r4, %r5, 1023
    ; jump to the skipfma label; the instructions that follow this jump are
    ; reached only through their own labels
    jmp skipfma

; Floating-point compare-and-branch coverage.
; Every mnemonic appears four times: plain and ".l" form, each aimed once at
; the label above the list (fpu_backward_target) and once at the label below
; it (fpu_forward_target). The same operand pair r23/r34 is used throughout.
; Mnemonics are grouped by precision suffix: s, then d, then q.
; NOTE(review): ".l" is presumably the long-displacement encoding - confirm.
fpu_backward_target:
; single branches
    ; eq: ordered / unordered
    fbroeqs %r23, %r34, fpu_backward_target
    fbroeqs.l %r23, %r34, fpu_backward_target
    fbroeqs %r23, %r34, fpu_forward_target
    fbroeqs.l %r23, %r34, fpu_forward_target

    fbrueqs %r23, %r34, fpu_backward_target
    fbrueqs.l %r23, %r34, fpu_backward_target
    fbrueqs %r23, %r34, fpu_forward_target
    fbrueqs.l %r23, %r34, fpu_forward_target

    ; ne: ordered / unordered
    fbrones %r23, %r34, fpu_backward_target
    fbrones.l %r23, %r34, fpu_backward_target
    fbrones %r23, %r34, fpu_forward_target
    fbrones.l %r23, %r34, fpu_forward_target

    fbrunes %r23, %r34, fpu_backward_target
    fbrunes.l %r23, %r34, fpu_backward_target
    fbrunes %r23, %r34, fpu_forward_target
    fbrunes.l %r23, %r34, fpu_forward_target

    ; lt: ordered / unordered
    fbrolts %r23, %r34, fpu_backward_target
    fbrolts.l %r23, %r34, fpu_backward_target
    fbrolts %r23, %r34, fpu_forward_target
    fbrolts.l %r23, %r34, fpu_forward_target

    fbrults %r23, %r34, fpu_backward_target
    fbrults.l %r23, %r34, fpu_backward_target
    fbrults %r23, %r34, fpu_forward_target
    fbrults.l %r23, %r34, fpu_forward_target

    ; ge: ordered / unordered
    fbroges %r23, %r34, fpu_backward_target
    fbroges.l %r23, %r34, fpu_backward_target
    fbroges %r23, %r34, fpu_forward_target
    fbroges.l %r23, %r34, fpu_forward_target

    fbruges %r23, %r34, fpu_backward_target
    fbruges.l %r23, %r34, fpu_backward_target
    fbruges %r23, %r34, fpu_forward_target
    fbruges.l %r23, %r34, fpu_forward_target

    ; ordered / unordered tests
    fbros %r23, %r34, fpu_backward_target
    fbros.l %r23, %r34, fpu_backward_target
    fbros %r23, %r34, fpu_forward_target
    fbros.l %r23, %r34, fpu_forward_target

    fbrus %r23, %r34, fpu_backward_target
    fbrus.l %r23, %r34, fpu_backward_target
    fbrus %r23, %r34, fpu_forward_target
    fbrus.l %r23, %r34, fpu_forward_target

; double branches
    ; eq: ordered / unordered
    fbroeqd %r23, %r34, fpu_backward_target
    fbroeqd.l %r23, %r34, fpu_backward_target
    fbroeqd %r23, %r34, fpu_forward_target
    fbroeqd.l %r23, %r34, fpu_forward_target

    fbrueqd %r23, %r34, fpu_backward_target
    fbrueqd.l %r23, %r34, fpu_backward_target
    fbrueqd %r23, %r34, fpu_forward_target
    fbrueqd.l %r23, %r34, fpu_forward_target

    ; ne: ordered / unordered
    fbroned %r23, %r34, fpu_backward_target
    fbroned.l %r23, %r34, fpu_backward_target
    fbroned %r23, %r34, fpu_forward_target
    fbroned.l %r23, %r34, fpu_forward_target

    fbruned %r23, %r34, fpu_backward_target
    fbruned.l %r23, %r34, fpu_backward_target
    fbruned %r23, %r34, fpu_forward_target
    fbruned.l %r23, %r34, fpu_forward_target

    ; lt: ordered / unordered
    fbroltd %r23, %r34, fpu_backward_target
    fbroltd.l %r23, %r34, fpu_backward_target
    fbroltd %r23, %r34, fpu_forward_target
    fbroltd.l %r23, %r34, fpu_forward_target

    fbrultd %r23, %r34, fpu_backward_target
    fbrultd.l %r23, %r34, fpu_backward_target
    fbrultd %r23, %r34, fpu_forward_target
    fbrultd.l %r23, %r34, fpu_forward_target

    ; ge: ordered / unordered
    fbroged %r23, %r34, fpu_backward_target
    fbroged.l %r23, %r34, fpu_backward_target
    fbroged %r23, %r34, fpu_forward_target
    fbroged.l %r23, %r34, fpu_forward_target

    fbruged %r23, %r34, fpu_backward_target
    fbruged.l %r23, %r34, fpu_backward_target
    fbruged %r23, %r34, fpu_forward_target
    fbruged.l %r23, %r34, fpu_forward_target

    ; ordered / unordered tests
    fbrod %r23, %r34, fpu_backward_target
    fbrod.l %r23, %r34, fpu_backward_target
    fbrod %r23, %r34, fpu_forward_target
    fbrod.l %r23, %r34, fpu_forward_target

    fbrud %r23, %r34, fpu_backward_target
    fbrud.l %r23, %r34, fpu_backward_target
    fbrud %r23, %r34, fpu_forward_target
    fbrud.l %r23, %r34, fpu_forward_target

; quadruple branches
    ; eq: ordered / unordered
    fbroeqq %r23, %r34, fpu_backward_target
    fbroeqq.l %r23, %r34, fpu_backward_target
    fbroeqq %r23, %r34, fpu_forward_target
    fbroeqq.l %r23, %r34, fpu_forward_target

    fbrueqq %r23, %r34, fpu_backward_target
    fbrueqq.l %r23, %r34, fpu_backward_target
    fbrueqq %r23, %r34, fpu_forward_target
    fbrueqq.l %r23, %r34, fpu_forward_target

    ; ne: ordered / unordered
    fbroneq %r23, %r34, fpu_backward_target
    fbroneq.l %r23, %r34, fpu_backward_target
    fbroneq %r23, %r34, fpu_forward_target
    fbroneq.l %r23, %r34, fpu_forward_target

    fbruneq %r23, %r34, fpu_backward_target
    fbruneq.l %r23, %r34, fpu_backward_target
    fbruneq %r23, %r34, fpu_forward_target
    fbruneq.l %r23, %r34, fpu_forward_target

    ; lt: ordered / unordered
    fbroltq %r23, %r34, fpu_backward_target
    fbroltq.l %r23, %r34, fpu_backward_target
    fbroltq %r23, %r34, fpu_forward_target
    fbroltq.l %r23, %r34, fpu_forward_target

    fbrultq %r23, %r34, fpu_backward_target
    fbrultq.l %r23, %r34, fpu_backward_target
    fbrultq %r23, %r34, fpu_forward_target
    fbrultq.l %r23, %r34, fpu_forward_target

    ; ge: ordered / unordered
    fbrogeq %r23, %r34, fpu_backward_target
    fbrogeq.l %r23, %r34, fpu_backward_target
    fbrogeq %r23, %r34, fpu_forward_target
    fbrogeq.l %r23, %r34, fpu_forward_target

    fbrugeq %r23, %r34, fpu_backward_target
    fbrugeq.l %r23, %r34, fpu_backward_target
    fbrugeq %r23, %r34, fpu_forward_target
    fbrugeq.l %r23, %r34, fpu_forward_target

    ; ordered / unordered tests
    fbroq %r23, %r34, fpu_backward_target
    fbroq.l %r23, %r34, fpu_backward_target
    fbroq %r23, %r34, fpu_forward_target
    fbroq.l %r23, %r34, fpu_forward_target

    fbruq    %r23, %r34, fpu_backward_target
    fbruq.l  %r23, %r34, fpu_backward_target
    fbruq    %r23, %r34, fpu_forward_target
    fbruq.l  %r23, %r34, fpu_forward_target

; Forward target of the fp branch list, followed by encoding coverage for
; the fp nullification instructions and for the two rounding families
; (frnd*, frndx*) in every precision and rounding-mode name.
fpu_forward_target:

    ; fp nullify: ne/eq, unordered/ordered, each in s, d and q precision;
    ; both trailing immediates are 1 in every case
    fnulunes %r23, %r34, 1, 1
    fnuluned %r23, %r34, 1, 1
    fnuluneq %r23, %r34, 1, 1

    fnulones %r23, %r34, 1, 1
    fnuloned %r23, %r34, 1, 1
    fnuloneq %r23, %r34, 1, 1

    fnulueqs %r23, %r34, 1, 1
    fnulueqd %r23, %r34, 1, 1
    fnulueqq %r23, %r34, 1, 1

    fnuloeqs %r23, %r34, 1, 1
    fnuloeqd %r23, %r34, 1, 1
    fnuloeqq %r23, %r34, 1, 1

    ; frnd: precisions h, s, d, q x modes dyn, rne, rdn, rup, rtz, rmm
    frndh %r23, %r33, dyn
    frndh %r23, %r33, rne
    frndh %r23, %r33, rdn
    frndh %r23, %r33, rup
    frndh %r23, %r33, rtz
    frndh %r23, %r33, rmm

    frnds %r23, %r33, dyn
    frnds %r23, %r33, rne
    frnds %r23, %r33, rdn
    frnds %r23, %r33, rup
    frnds %r23, %r33, rtz
    frnds %r23, %r33, rmm

    frndd %r23, %r33, dyn
    frndd %r23, %r33, rne
    frndd %r23, %r33, rdn
    frndd %r23, %r33, rup
    frndd %r23, %r33, rtz
    frndd %r23, %r33, rmm

    frndq %r23, %r33, dyn
    frndq %r23, %r33, rne
    frndq %r23, %r33, rdn
    frndq %r23, %r33, rup
    frndq %r23, %r33, rtz
    frndq %r23, %r33, rmm

    ; frndx: same precision x mode matrix
    frndxh %r23, %r33, dyn
    frndxh %r23, %r33, rne
    frndxh %r23, %r33, rdn
    frndxh %r23, %r33, rup
    frndxh %r23, %r33, rtz
    frndxh %r23, %r33, rmm

    frndxs %r23, %r33, dyn
    frndxs %r23, %r33, rne
    frndxs %r23, %r33, rdn
    frndxs %r23, %r33, rup
    frndxs %r23, %r33, rtz
    frndxs %r23, %r33, rmm

    frndxd %r23, %r33, dyn
    frndxd %r23, %r33, rne
    frndxd %r23, %r33, rdn
    frndxd %r23, %r33, rup
    frndxd %r23, %r33, rtz
    frndxd %r23, %r33, rmm

    frndxq %r23, %r33, dyn
    frndxq %r23, %r33, rne
    frndxq %r23, %r33, rdn
    frndxq %r23, %r33, rup
    frndxq %r23, %r33, rtz
    frndxq %r23, %r33, rmm

; Common exit label of the FPU program.
skipfma:
    write "end fpu"
.end
.text
; Base addressing with post-update.
; r45 points 512 bytes below the stack pointer and is the base register of
; every access; the trailing immediate is the update step. One instruction
; per access width: unsigned loads, the 16-byte load, signed loads, stores.
; NOTE(review): the byte forms use step 2 rather than 1 - presumably
; intentional, confirm.
    alloc 96
    write "test base addressing with indexed post-update"
    ldi %r12, 1
    addid %r45, %sp, -512

    ; unsigned loads: byte, half, word, double
    ldubmia  %r23, %r45, 2
    lduhmia  %r23, %r45, 2
    lduwmia  %r23, %r45, 4
    ldudmia  %r23, %r45, 8

    ; quad load
    ldqmia %r23, %r45, 16

    ; signed loads: byte, half, word, double
    ldsbmia %r23, %r45, 2
    ldshmia %r23, %r45, 2
    ldswmia %r23, %r45, 4
    ldsdmia %r23, %r45, 8

    ; stores: byte, half, word, double, quad
    stbmia %r23, %r45, 2
    sthmia %r23, %r45, 2
    stwmia %r23, %r45, 4
    stdmia %r23, %r45, 8
    stqmia %r23, %r45, 16
    write "end_indexed_modify_test"
.end
; IP-relative data addressing test.
; Declares one item of each size (1, 2, 4, 8 bytes), each aligned to its own
; size, in both .rodata and .data; the code then loads every item with the
; unsigned and signed IP-relative forms and stores to the .data items only.
.rodata
rodata1:
    d1 123
    align 2
rodata2:
    d2 12345
    align 4
rodata4:
    d4 123456789
    align 8
rodata8:
    d8 1234567890123456789

.data
data1:
    d1 123
    align 2
data2:
    d2 12345
    align 4
data4:
    d4 123456789
    align 8
data8:
    d8 1234567890123456789

.text
    alloc 96

    write "test ip-relative data addressing"
    ; unsigned loads from .rodata
    ldubr %r34, rodata1
    lduhr %r34, rodata2
    lduwr %r34, rodata4
    ldudr %r34, rodata8

    ; signed loads from .rodata
    ldsbr %r34, rodata1
    ldshr %r34, rodata2
    ldswr %r34, rodata4
    ldsdr %r34, rodata8

    ; unsigned loads from .data
    ldubr %r34, data1
    lduhr %r34, data2
    lduwr %r34, data4
    ldudr %r34, data8

    ; signed loads from .data
    ldsbr %r34, data1
    ldshr %r34, data2
    ldswr %r34, data4
    ldsdr %r34, data8

    ; stores go to the writable section only
    stbr %r34, data1
    sthr %r34, data2
    stwr %r34, data4
    stdr %r34, data8

    write "end ip-relative data test"
.end
.text
; Address-of-label test: loads the address of a .data label that is declared
; after the code which references it, and prints the address.
    alloc 96
    write "test ca.rf"
    ldard %r22, ldarcf_data
    write "ca.rf: %x64(r22)"

    write "end_ca_rf_test"
.data
; Label only; no data is emitted after it in this program.
ldarcf_data:

.end
.text
; Bit selection test.
; The expected value is computed by the assembler as a constant expression,
; ((a ^ b) & m) ^ b, which takes the bits of a where the mask m is 1 and the
; bits of b elsewhere. The same a, b, m are then fed to bitslct and both
; values are printed for comparison.
    alloc 96
    write "check bit selection instruction"
    ldi.l %r6, ((0x3333333333333333 ^ 0x5555555555555555) & 0xff00ff00ff00ff00) ^ 0x5555555555555555
    write "expect: %x64(r6)"
    ldi.l %r3, 0x3333333333333333  ; a
    ldi.l %r4, 0x5555555555555555  ; b
    ldi.l %r5, 0xff00ff00ff00ff00  ; m (mask)
    bitslct %r6, %r3, %r4, %r5
    write "result: %x64(r6)"

    write "end_bitselect_test"
.end
.text
; Special register test.
; Part 1 prints every special register directly through the %s(...) write
; format. Part 2 reads each register into r12 with getspr and prints r12.
; Both parts walk the registers in the same order.
    alloc 61
    write "\ntest write: special register"
    write "ip      %s(ip)"
    write "eip     %s(eip)"
    write "eca     %s(eca)"
    write "fpcr    %s(fpcr)"
    write "rsc     %s(rsc)"
    write "rsp     %s(rsp)"
    write "bsp     %s(bsp)"
    write "peb     %s(peb)"
    write "teb     %s(teb)"
    write "itc     %s(itc)"
    write "itm     %s(itm)"
    write "psr     %s(psr)"
    write "pta     %s(pta)"
    write "iva     %s(iva)"
    write "kip     %s(kip)"
    write "ksp     %s(ksp)"
    write "krsp    %s(krsp)"
    write "iip     %s(iip)"
    write "iipa    %s(iipa)"
    write "ipsr    %s(ipsr)"
    write "cause   %s(cause)"
    write "ifa     %s(ifa)"
    write "iib     %s(iib)"
    write "tpr     %s(tpr)"
    write "lid     %s(lid)"
    write "irr0    %s(irr0)"
    write "irr1    %s(irr1)"
    write "irr2    %s(irr2)"
    write "irr3    %s(irr3)"
    write "isr0    %s(isr0)"
    write "isr1    %s(isr1)"
    write "isr2    %s(isr2)"
    write "isr3    %s(isr3)"
    write "tsv     %s(tsv)"
    write "cmcv    %s(cmcv)"
    write "pmv     %s(pmv)"

    write "\ntest read special register"

    ; Note: only some of the prints below carry the register name as a
    ; prefix (ip, eip, krsp..irr3); the others print the bare value.
    getspr %r12, %ip
    write "ip      %x64(r12)"

    getspr %r12, %eip
    write "eip     %x64(r12)"

    getspr %r12, %eca
    write "%x64(r12)"

    getspr %r12, %fpcr
    write "%x64(r12)"

    getspr %r12, %rsc
    write "%x64(r12)"

    getspr %r12, %rsp
    write "%x64(r12)"

    getspr %r12, %bsp
    write "%x64(r12)"

    getspr %r12, %peb
    write "%x64(r12)"

    getspr %r12, %teb
    write "%x64(r12)"

    getspr %r12, %itc
    write "%x64(r12)"

    getspr %r12, %itm
    write "%x64(r12)"

    getspr %r12, %psr
    write "%x64(r12)"

    getspr %r12, %pta
    write "%x64(r12)"

    getspr %r12, %iva
    write "%x64(r12)"

    getspr %r12, %kip
    write "%x64(r12)"

    getspr %r12, %ksp
    write "%x64(r12)"

    getspr %r12, %krsp
    write "krsp    %x64(r12)"

    getspr %r12, %iip
    write "iip     %x64(r12)"

    getspr %r12, %iipa
    write "iipa    %x64(r12)"

    getspr %r12, %ipsr
    write "ipsr    %x64(r12)"

    getspr %r12, %cause
    write "cause   %x64(r12)"

    ; ifa is printed both directly and through r12
    write "%s(ifa)"
    getspr %r12, %ifa
    write "ifa     %x64(r12)"

    ; iib is the only register printed with the 128-bit format
    getspr %r12, %iib
    write "iib     %x128(r12)"

    getspr %r12, %tpr
    write "tpr     %x64(r12)"

    getspr %r12, %lid
    write "lid     %x64(r12)"

    getspr %r12, %irr0
    write "irr0    %x64(r12)"

    getspr %r12, %irr1
    write "irr1    %x64(r12)"

    getspr %r12, %irr2
    write "irr2    %x64(r12)"

    getspr %r12, %irr3
    write "irr3    %x64(r12)"

    getspr %r12, %isr0
    write "%x64(r12)"

    getspr %r12, %isr1
    write "%x64(r12)"

    getspr %r12, %isr2
    write "%x64(r12)"

    getspr %r12, %isr3
    write "%x64(r12)"

    getspr %r12, %tsv
    write "%x64(r12)"

    getspr %r12, %cmcv
    write "%x64(r12)"

    getspr %r12, %pmv
    write "%x64(r12)"

    write "end test special register read/write"
.end
.text
; Min/max test: issues the register-register and register-immediate forms
; of the min/max instructions between two marker writes.
; Within this block no source register is initialized and no result is
; printed, so only the marker strings are visible in the output.
    alloc 69
    write "test min/max"
; register-register forms
    minsd %r34, %r56, %r67
    minud %r34, %r56, %r67
    maxsd %r34, %r56, %r67
    maxud %r34, %r56, %r67

; register-immediate forms (immediate 2671)
    minsid %r34, %r56, 2671
    minuid %r34, %r56, 2671
    maxsid %r34, %r56, 2671
    maxuid %r34, %r56, 2671
    write "test minmax end"

.end

.text
; Nullification test: exercises the nul* instructions with explicit numeric
; operands, with named markers, and in their register / immediate / long
; immediate encodings.
    write "test nullification (explicit masks)"
    alloc 96
    ldi %r10, 0
; r10 is compared with itself; the trailing operands 5 and 4 match the five
; writes annotated "nullified" and the four annotated "else" below.
    nuleqd %r10, %r10, 5, 4
    write  "0" ; nullified
    write  "1" ; nullified
    write  "2" ; nullified
    write  "3" ; nullified
    write  "4" ; nullified
    write  "5" ; else
    write  "6" ; else
    write  "7" ; else
    write  "8" ; else

; Same layout as above, but the two trailing operands are names; the matching
; parenthesized names are attached to the 5th and the 9th write.
; NOTE(review): presumably a parenthesized name marks the last instruction of
; the corresponding region - confirm against the assembler documentation.
    write  "test nullification (predicate names)"
    ldi    %r10, 0
    nuleqd %r10, %r10, equal, nonequal
    write  "0"
    write  "1"
    write  "2"
    write  "3"
    write  "4" (equal)
    write  "5"
    write  "6"
    write  "7"
    write  "8" (nonequal)


; Numeric operands 4 and 3, followed by seven addid instructions on r10.
; The value of r10 is not printed in this section.
    write "test nullification"
    ldi %r10, 0
    nuleqd %r10, %r10, 4, 3
    addid %r10, %r10, 2
    addid %r10, %r10, 2
    addid %r10, %r10, 2
    addid %r10, %r10, 1
    addid %r10, %r10, 1
    addid %r10, %r10, 1
    addid %r10, %r10, 1

; Named variant: the (true) marker is on the 5th addid, (false) on the 7th.
    write "test nullification"
    ldi %r10, 0
    nuleqd %r10, %r10, true, false
    addid %r10, %r10, 2
    addid %r10, %r10, 2
    addid %r10, %r10, 2
    addid %r10, %r10, 2
    addid %r10, %r10, 1 (true)
    addid %r10, %r10, 1
    addid %r10, %r10, 1 (false)

; r12 is compared with r10; operands 4 and 3 match the four "branch1" writes
; and the three "branch2" writes that follow.
    nop 0
    nop 0
    nuleqd %r12, %r10, 4, 3
    write  "branch1: psr=%s(psr)"
    write  "branch1: %i64(r10)"
    write  "branch1: %i64(r10)"
    write  "branch1: %i64(r10)"
    write  "branch2: psr=%s(psr)"
    write  "branch2: %i64(r20)"
    write  "branch2: %i64(r20)"


; All nul* forms with the same pair of binary operands (0b1100, 0b0101):
; register-register compares
    nuleqd %r23, %r45, 0b1100, 0b0101
    nulltsd %r23, %r45, 0b1100, 0b0101
    nulltud %r23, %r45, 0b1100, 0b0101

; register-immediate compares
    nuleqid %r23, 45, 0b1100, 0b0101
    nulltsid %r23, -45, 0b1100, 0b0101
    nulltuid %r23, 45, 0b1100, 0b0101

; long-immediate (.l) compares
    nuleqid.l  %r23, 45000000000, 0b1100, 0b0101
    nulltsid.l  %r23, -45000000000, 0b1100, 0b0101
    nulltuid.l  %r23, 45000000000, 0b1100, 0b0101

; bit-test forms, followed by seven distinguishable nops
    nulbs %r23, %r45, 0b1100, 0b0101
    nulbsi %r23, 45, 0b1100, 0b0101
    nop 1
    nop 2
    nop 3
    nop 4
    nop 5
    nop 6
    nop 7

; Both named markers placed on the same (third) instruction.
    nuleqd %r10, %r10, same_equal, same_nonequal
    write "0e"
    write "1e"
    write "2e" (same_equal, same_nonequal)

    nulned %r10, %r10, same_equal2, same_nonequal2
    write "0ne"
    write "1ne"
    write "2ne" (same_equal2, same_nonequal2)

; The first marker is placed on the nul instruction itself, the second on the
; single write that follows it.
    nuleqd %r10, %r10, no_if_true, no_if_false (no_if_true)
    write "else" (no_if_false)

    write "end_nullification_test"
.end
.text
; Performance-monitor counter test: loads PMC_LAST, then reads each PMC_*
; register with getmr into r14 and prints it. Finally issues setmr followed
; by getmr on PMC_SHORT_INSTRUCTION and prints the value read back into r15.
    alloc 21
    ldi %r12, PMC_LAST
    write "PMC_LAST = %i64(r12)"
; don't report runtime in unit tests: the value is non-reproducible
; (the two runtime counters are still read, only their writes are disabled)
    getmr %r14, %gz, PMC_RUNTIME
;   write "PMC_RUNTIME = %i64(r14)"
    getmr %r14, %gz, PMC_SYS_RUNTIME
;   write "PMC_SYS_RUNTIME = %i64(r14)"
    getmr %r14, %gz, PMC_SHORT_INSTRUCTION
    write "PMC_SHORT_INSTRUCTION = %i64(r14)"
    getmr %r14, %gz, PMC_LONG_INSTRUCTION
    write "PMC_LONG_INSTRUCTION = %i64(r14)"
    getmr %r14, %gz, PMC_SHADOWED_SLOT
    write "PMC_SHADOWED_SLOT = %i64(r14)"
    getmr %r14, %gz, PMC_NOP_INSTRUCTION
    write "PMC_NOP_INSTRUCTION = %i64(r14)"
    getmr %r14, %gz, PMC_QUALIFIED_NOP_INSTRUCTION
    write "PMC_QUALIFIED_NOP_INSTRUCTION = %i64(r14)"
    getmr %r14, %gz, PMC_REGISTER_SPILL
    write "PMC_REGISTER_SPILL = %i64(r14)"
    getmr %r14, %gz, PMC_REGISTER_FILL
    write "PMC_REGISTER_FILL = %i64(r14)"
    getmr %r14, %gz, PMC_ICACHE_HIT
    write "PMC_ICACHE_HIT = %i64(r14)"
    getmr %r14, %gz, PMC_ICACHE_MISS
    write "PMC_ICACHE_MISS = %i64(r14)"
    getmr %r14, %gz, PMC_DCACHE_HIT
    write "PMC_DCACHE_HIT = %i64(r14)"
    getmr %r14, %gz, PMC_DCACHE_MISS
    write "PMC_DCACHE_MISS = %i64(r14)"
    getmr %r14, %gz, PMC_INSTRUCTION_TRANSLATION_HIT
    write "PMC_INSTRUCTION_TRANSLATION_HIT = %i64(r14)"
    getmr %r14, %gz, PMC_INSTRUCTION_TRANSLATION_MISS
    write "PMC_INSTRUCTION_TRANSLATION_MISS = %i64(r14)"
    getmr %r14, %gz, PMC_DATA_TRANSLATION_HIT
    write "PMC_DATA_TRANSLATION_HIT = %i64(r14)"
    getmr %r14, %gz, PMC_DATA_TRANSLATION_MISS
    write "PMC_DATA_TRANSLATION_MISS = %i64(r14)"
    getmr %r14, %gz, PMC_BACKSTORE_TRANSLATION_HIT
    write "PMC_BACKSTORE_TRANSLATION_HIT = %i64(r14)"
    getmr %r14, %gz, PMC_BACKSTORE_TRANSLATION_MISS
    write "PMC_BACKSTORE_TRANSLATION_MISS = %i64(r14)"
; r14 still holds the PMC_BACKSTORE_TRANSLATION_MISS value read just above;
; it is used as the setmr operand, then the register is read back into r15.
    setmr %r14, %gz, PMC_SHORT_INSTRUCTION
    getmr %r15, %gz, PMC_SHORT_INSTRUCTION
    write "old pm reg = %i64(r15)"
.end
.text
; Simple test program: a short counting loop, a long performance loop,
; then factorials up to 20! printed one per iteration.
.text
    alloc 61
; short loop: r15 starts at -100 and is the rep* loop counter
    ldi %r15, -100
loop_stop_sard:
    divp2id %r13, %r15, 5
    repled %r15, %gz, 1, loop_stop_sard

; performance test - long loop
; for(i = 2500000; i>0; i--) DoSome();

    ldi %r20, 2500000
    ldi %r15, 20 ; maximum factorial number
    ldi %r21, 5
; NOTE(review): the "(64)" after the label is presumably an alignment request
; for the loop entry - confirm against the assembler documentation.
loop_stop: (64)
    addid %r13, %r13, 5
    subd %r14, %r14, %r55
    cmpltsd %r24, %r14, %r14
    addid %r13, %r13, 4
    subd %r14, %r14, %r55
    cmpltsd %r22, %r14, %r14
    addid %r13, %r13, 33
    srpid %r14, %r14, %r55, 13
    subd %r14, %r13, %r21
    sraid %r14, %r14, 7
    repgtd %r20, %gz, 1, loop_stop
; print loop counter after loop (must be 0)
    write "%i64(r20) factorials"
; r13 = running product, r14 = current multiplier; the loop is bounded by r15
    ldi %r13, 1
    ldi %r14, 1
start:
    muld %r13, %r13, %r14
    write "factorial: %u64(r13)"
    repled %r14, %r15, 1, start

; final multiplier and product
    write "%i64(r14) %i64(r13)"
.end
.text
; Loop examples: two rep*-based counted loops, a straight-line "3x inner loop"
; body whose back-jumps are commented out, two compare-and-branch loops,
; a few special-register writes, and a final long rep* loop.
    alloc  96
    write  "Example of strided loop instructions"
; fast_check
    ldi    %r12, 10000  ; load loop count (10000)
stride_loop_start:
;  write  "%i64(r12)"
    cmpeqd %r4, %r12, %r12
    addd %r14, %r14, %r46
    repgtd %r12, %gz, 1, stride_loop_start

    write  "counter=%i64(r12)"

; Second example of strided loop.
; fast_check
    ldi    %r12, 10000  ; load loop count (10000)
    ldi    %r14, 10000  ; r14 starts at the loop count and gets -2 per iteration
stride_loop_start2:
;   write  "%i64(r12)"
    cmpeqd %r4, %r12, %r12
    addid %r14, %r14, -2
    repgtd %r12, %gz, 1, stride_loop_start2

    write  "%i64(r12) %i64(r14)"

;*****************************************************************
; 3x inner loop example
; All three "jmp ccloop" back-edges are commented out, so the body
; below is executed once, top to bottom.
;*****************************************************************
    ldi    %r3, 0
    ldi    %r20, 0
    ldi    %r33, 80
    mov    %r10, %r33
    mov    %r11, %r33
    mov    %r12, %r33
ccloop:
;   write  "%i64(r12)"
    addid %r20, %r20, 1
    addid %r12, %r12, -1
    cmpltsd %r2, %r3, %r12
;   jmp  ccloop
;   write  "%i64(r11)"
    addid %r11, %r11, -1
    cmpltsd %r4, %r3, %r11
    mov %r12, %r33
;   jmp ccloop
;   write "%i64(r10)"
    addid %r10, %r10, -1
    cmpltsd %r6, %r3, %r10
    mov %r11, %r33
    mov %r12, %r33
;   jmp ccloop

    write  "%i64(r20)"

; for(i=0; i<128; i++)   (the limit is the immediate 128 in the compare below)

    ldi  %r8, 0
start1:
;   write "%i64(r8)"
    addid  %r8, %r8,1
    cmpltsid  %r7, %r8,128
    brneid %r7,0,start1

; for(i=100; i>0; i--)
    ldi %r8, 100
start2:
    write "%i64(r8)"
    addid %r8, %r8,-1 ; current error
    cmpltsd %r2, %r3, %r8
    brneid %r2, 0, start2

    write "r3      %x64(r3)"
; setspr %r3, %rsc


; for(i=100; i>0; i--) write "%x64(i)"
; The "jmp qqq" back-edge is commented out, so this body runs once.
    ldi %r10, 100
qqq: cmpltsd %r2, %r3, %r10
    write "r10     %x64(r10)"
    addid %r10, %r10, -1
;   jmp qqq
sss:

    andi.l %r55, %r55,0x000FFFFF00003F0F
    setspr %r12, %ifa
; test some special regs
    ldi.l %r9, 0x123456789
;   setspr %r9, psr
    write "ip: %s(ip) psr: %s(psr)"
;   setspr %r3, psr
    ldi %r55, 120
    setspr %r55, %tpr
    write "fpcr    %s(fpcr)"
    write "psr     %s(psr)"

    write "test long loop"
; test simple loop
; fast_check
; r14 is the rep* loop counter; r13 is scratch that the loop body overwrites
    ldi %r13, 350000 ; 35
    ldi %r14, 350000 ; 35
    ldi %r15, 88
    write "%i64(r14)"
repeat_loop_start: (128)
; write "%i64(r12)"
    addid %r13, %r13, 3
    addd %r13, %r13, %r15
    srpid %r13, %r13, %r15, 8

    addid %r13, %r13, 4
    addd %r13, %r13, %r15
    srpid %r13, %r13, %r15, 7

    addid %r13, %r13, 5
    addd %r13, %r13, %r15
    srpid %r13, %r13, %r15, 6

    addid %r13, %r13, 6
    addd %r13, %r13, %r15
    srpid %r13, %r13, %r15, 5

    subd %r13, %r13, %r15
    sladdd %r13, %r13, %r15, 5
    sladdd %r13, %r13, %r15, 5

; r13 is recomputed from r14 and r15 here, discarding the values built above
    xor %r13, %r14, %r15
    slld %r13, %r13, %r13
    repgtd %r14, %gz, 1, repeat_loop_start

    write "%i64(r13) %i64(r14)"

    write "end test long loop"
.end
.text
; Random test: issues the random instruction twice with %gz as the source
; operand and once with r4 = 1, printing r3 in hex after each call.
; NOTE(review): the third write is labelled "random seed", so the source
; operand presumably acts as a seed - confirm against the ISA reference.
    write "test random"
    alloc 96

    random %r3, %gz
    write "random: %x64(r3)"
    random %r3, %gz
    write "random: %x64(r3)"
    ldi %r4, 1
    random %r3, %r4
    write "random seed: %x64(r3)"

    write "end_random_test"
.end
.text
; test simple long loop
; r14 (copied from r13 = 1000000) is the rep* loop counter; the loop body is
; arithmetic filler. After the loop the code jumps over a list of additional
; rep* encodings, which are therefore assembled but not executed.
    alloc 61
    ldi %r13, 1000000
    mov %r14, %r13
    write "loop limit: %i64(r14)"
    ldi %r15, 88
repeat_long_loop_start: (128)
    addid %r13, %r13, 3
    addd %r13, %r13, %r15
    srpid %r13, %r13, %r15, 8
    addid %r13, %r13, 4
    addd %r13, %r13, %r15
    srpid %r13, %r13, %r15, 7
    addid %r13, %r13, 5
    addd %r13, %r13, %r15
    srpid %r13, %r13, %r15, 6
    addid %r13, %r13, 6
    addd %r13, %r13, %r15
    srpid %r13, %r13, %r15, 5
    addd %r30, %r31, %r14
    subd %r31, %r30, %r15
    sllid %r40, %r40, 12
    ldaxsd %r41, %r40, %r12, 3, -12
    ldaxsd %r41, %r40, %r12, 4, 62
    repgtd %r14, %gz, 1, repeat_long_loop_start
    jmp repeat_exit

; skipped by the jmp above: short forms of the remaining rep* variants
    repled %r56, %r60, 1, repeat_long_loop_start
    repged %r56, %r60, 1, repeat_long_loop_start
    repleud %r56, %r20, 1, repeat_long_loop_start
    repgeud %r56, %r20, 1, repeat_long_loop_start

; skipped by the jmp above: long (.l) forms of the same variants
    repled.l %r56, %r60, 1, repeat_long_loop_start
    repged.l %r56, %r60, 1, repeat_long_loop_start
    repleud.l %r56, %r20, 1, repeat_long_loop_start
    repgeud.l %r56, %r20, 1, repeat_long_loop_start

repeat_exit:
    write "end loop repeat test"
.end
.text
; Here we test instructions for partial rotate register by fixed bitcount.
; Each step writes its result in hex so it can be compared with the
; expected output. Passing the same register as both sources of srpid is
; what the strings below call a rotate.
    alloc  90
    write  "initial values"
    ldi.l  %r50, 0x1234567890ABCDEF
    write  "%x64(r50)"
    write  "rotate left"
    srpid %r51, %r50, %r50, 40-1
    write  "%x64(r51)"
    write  "rotate right"
    srpid %r51, %r50, %r50, 64-40-1  ; same as previous
    write  "%x64(r51)"
    write  "rotate left immediate"
    srpid %r51, %r50, %r50, 64-40-1
    write  "%x64(r51)"
    write  "rotate right immediate"
    srpid %r51, %r50, %r50, 40-1  ; same as previous "rD+1-rC"
    write  "%x64(r51)"

; Here we test instructions for shift and mask register by fixed bitcount.
    write "shift signed|unsigned by immediate 12 bit"
    ldi.l %r50, 0xfedcba0123456789
    write "%x64(r50)"
    sraid %r51, %r50, 12
    write "%x64(r51)"
    srlid %r51, %r50, 12
    write "%x64(r51)"
    sllid %r51, %r50, 12
    write "%x64(r51)"
; the same sllid is issued a second time, so this output repeats the previous one
    sllid %r51, %r50, 12
    write "%x64(r51)"

;  jmp  ddd
; shift-left-pair with the shift count taken from a register (r10 = 16)
    ldi %r10, 16
    slpd %r51, %r50, %r50, %r10
    write "%x64(r51)"

; combined shift forms with two immediates
    ldi.l %r40, 0x1234567890abcdef
    ldi.l %r50, 0xfedcba0987654321
    slsrlid %r41, %r40, 8, 40
    write "%x64(r41)"
    slsraid %r41, %r40, 11, 40
    write "%x64(r41)"

; srpid with two different source registers, then with the same register twice
    write "test srpi"
    ldi.l %r40, 0x1234123412341234
    ldi.l %r50, 0x5678567856785678
    srpid %r41, %r40, %r50, 39
    write "%x64(r41)"
    srpid %r41, %r50, %r40, 23
    write "%x64(r41)"
    srpid %r41, %r40, %r40, 24
    write "%x64(r41)"

; 128-bit form (srpiq): operands are printed with %x128
    write "test vector shift right pair (srpi16) instruction"
    xor %r2, %r2, %r2 ; all zeroes
    nor %r3, %r2, %r2 ; all ones
    write "r2      %x128(r2)"
    write "r3      %x128(r3)"
    srpiq %r4, %r2, %r3, 60
    write "r4      %x128(r4)"
    srpiq %r4, %r3, %r2, 60
    write "r4      %x128(r4)"
    srpiq %r4, %r2, %r3, 100
    write "r4      %x128(r4)"
    srpiq %r4, %r3, %r2, 100
    write "r4      %x128(r4)"

; SHIFTS
; The remaining shift and bitwise instructions are issued without printing
; their results.
    slld %r42, %r33, %r34
    slld %r42, %r33, %r34
    srad %r52, %r73, %r44
    srld %r62, %r73, %r44
    slpd %r72, %r17, %r17, %r24
    srpd %r82, %r16, %r16, %r15
    srpid %r72, %r15, %r24, 32
    deposit %r10, %r14, %r85, 32, 30

    sllid %r12, %r67, 13
    sllid %r13, %r57, 13
    sraid %r14, %r48, 14
    srlid %r15, %r38, 14
    srpid %r16, %r39, %r13, 13
    srpid %r17, %r29, %r13, 64-13


    write "test packed bitwise logical"
    and %r10, %r71, %r13
    andn %r21, %r81, %r22
    or %r32, %r71, %r32
    orn %r43, %r61, %r43
    nand %r54, %r51, %r54
    nor %r65, %r41, %r64
    xnor %r76, %r31, %r73
    xor %r87, %r21, %r83


; character formatting: 65 is printed through %c
    ldi %r20, 65
    write "r20     %c(r20)"   ; should be 'A'

; deposit with %gz as the second source
    ldi   %r3, 0
    ldi.l %r22, 0x12345FFFFFFFFFFF
    write "%x64(r22)"
    deposit %r23, %r22, %gz, 0, 23
    write "%x64(r23)"

    ldi.l %r22, 0x1234567890ABCDEF
    ldi.l %r23, 0xFEDCBA9876543210
    srpid %r22, %r22, %r23, 24
    write "%x64(r22)"

; subrid with immediate 0, then not, then xor with the original value
    ldi.l %r24, 0x4321F00000000
    write "%x64(r24)"
    subrid %r25, %r24, 0
    write "%x64(r25)"
    not %r25, %r25
    write "%x64(r25)"
    xor  %r25, %r25, %r24
    write "%x64(r25)"

; Example of abs_diff
    ldi %r12, -10000
    abdd %r12, %r12, %gz
    write "r12: %i64(r12)"
.end
.text
; Floating-point / SIMD floating-point block.
; The very first instruction jumps to endfpsimd, the label placed right
; before .end, so everything in between is assembled but skipped at run time
; (no other entry point into this code is visible in this block).
    jmp  endfpsimd
; SSE double (SSE2)
    vfmaddd %r16, %r71, %r69, %r13
    vfmsubd %r15, %r78, %r58, %r23
    vfnmaddd %r14, %r67, %r47, %r13
    vfnmsubd %r13, %r86, %r36, %r16
    vfmaddsubd %r82, %r52, %r69, %r63
    vfmsubaddd %r50, %r91, %r69, %r63
    vfaddd %r12, %r86, %r25
    vfnaddd %r11, %r82, %r19
    vfsubd %r10, %r63, %r28
    vfaddsubd %r81, %r61, %r37
    vfsubaddd %r82, %r81, %r46
    vfhaddd %r83, %r81, %r55
    vfhsubd %r84, %r71, %r64
    vfmuld %r81, %r71, %r11
    vfhmuld %r60, %r11, %r22
    vfdotd %r85, %r81, %r13
    vfmind %r86, %r84, %r14
    vfmaxd %r87, %r61, %r15
    vfaminnmd %r30, %r52, %r16
    vfamaxnmd %r61, %r51, %r17

; vector compares (double)
    vfcmpoeqd %r80, %r81, %r63
    vfcmponed %r11, %r81, %r32
    vfcmpoltd %r15, %r81, %r32
    vfcmpoltd %r60, %r81, %r82
    vfcmponed %r62, %r72, %r83
    vfcmpoged %r62, %r72, %r62

; pack, sign manipulation, rounding, divide and square roots (double)
    vfpackd %r60, %r61, %r62
    vfnegd %r61, %r51
    vfabdd %r61, %r51, %r3
    vfnabdd %r61, %r61, %r3
    vfrndd %r60, %r77, rdn
    vfrndd %r62, %r61, rup
    vfrndd %r62, %r71, rne
    vfrndd %r83, %r67, rtz
    vfdivd %r83, %r67, %r20
    vfsqrtd %r68, %r81
    vfrsqrtd %r68, %r81


; quadruple floating-point extension example
.rodata
    align 16
a: quad 1.234567890123456789124141241241
b: quad 3.1415925678888734535345231234564561
c: quad 3.4566345634563456346535463463456
.text
; r3, r1, r2 are loaded from the three consecutive 16-byte slots starting at
; label a (that is a, b and c)
    ldarc %r21, a
    ldq  %r3, %r21,0*16
    ldq  %r1, %r21,1*16
    ldq  %r2, %r21,2*16
    write "%vf64(r3)"
    write "%vf64(r1)"
    write "%vf64(r2)"

    write "test binary\0"
    fmuld %r3, %r1, %r2
    write "%vf64(r3)"
    fnmuld %r3, %r1, %r2
    write "%vf64(r3)"
    faddd %r4, %r1, %r2
    write "%vf64(r4)"
    fnaddd %r4, %r1, %r2
    write "%vf64(r4)"
    fsubd %r4, %r2, %r1
    write "%vf64(r4)"
    fdivd %r4, %r2, %r1
    write "%vf64(r4)"

    write "test fused fma\0"
    fmaddd %r5, %r4, %r1, %r2
    write "%vf64(r5)"
    fnmaddd %r5, %r4, %r1, %r2
    write "%vf64(r5)"
    fmsubd %r5, %r4, %r1, %r2
    write "%vf64(r5)"
    fnmsubd %r5, %r4, %r1, %r2
    write "%vf64(r5)"

    write "test unary\0"
    mov  %r6, %r5
    write "%vf64(r6)"
    fabsd %r6, %r5
    write "%vf64(r6)"
    fnegd %r6, %r5
    write "%vf64(r6)"
    fnabsd %r6, %r5
    write "%vf64(r6)"
    fsqrtd %r6, %r2
    write "%vf64(r6)"
    frsqrtd %r6, %r2
    write "%vf64(r6)"

    write "test rounding\0"
    frndd %r7, %r2, rup
    write "%vf64(r7)"
    frndd %r7, %r2, rtz
    write "%vf64(r7)"
    frndd %r7, %r2, rdn
    write "%vf64(r7)"
    frndd %r7, %r2, rne
    write "%vf64(r7)"
    fdtoiw %r7, %r2, rtz
    write "r7=%i64(r7)"
    ldi %r7, 123456
    fiwtod %r7, %r7
    write "%vf64(r7)"

    write "test minmax, abs minmax"
    fmaxd %r8, %r1, %r2
    write "%vf64(r8)"
    fmind %r8, %r1, %r2
    write "%vf64(r8)"
    famaxnmd %r8, %r1, %r2
    write "%vf64(r8)"
    faminnmd %r8, %r1, %r2
    write "%vf64(r8)"

    write "test fmergesq\0"

.rodata
    align 16
xxxq: quad 1.122
    quad 0.9999765432
.text
; NOTE(review): xxxq is defined just above, but the base loaded here is `a`,
; not xxxq - confirm which one was intended.
    ldarc %r21, a
; fast_check
; NOTE(review): the value loaded by this ldi is overwritten by the ldq into
; r15 on the next line, and r15 is then used as the repged loop operand.
    ldi %r15, 100000 ; 10
    ldq %r15, %r21, 0*16
    ldq %r16, %r21, 1*16
    fsubd %r22, %r15, %r16
    write "%vf64(r22)"
yyloop:
    fmaddd %r22, %r15, %r16, %r22
    fmsubd %r22, %r15, %r16, %r22
    repged %r15, %gz, 1, yyloop
    write "%vf64(r22)"


; constants in several floating-point widths; the first quad is unlabeled and
; sits 16 bytes before qqqq
.rodata
    align 16
    quad 1.189731495357231765085759326628007e+4932
qqqq:   quad 1.23456789 + 32.0
    quad 0.2345678901234567890123456789012345678 + 0.2
    quad 2*asin(1)
    quad 255
dbl1: double acos(sin(3.1415926)) ;-1.2345678e+200
    double 444.689679
float1: float 0.123456789123456789e+30
    float 2.123456789122233
    float 0.0
    float 1.0
octquad:
    quad 0.25
f32: d4 0x3fff1234
.text
; r45/r46/r47 = addresses of the quad, double and float tables above
    ldarc %r45, qqqq
    ldarc %r46, dbl1
    ldarc %r47, float1
    write "r45     %x64(r45)"
    ldq  %r63, %r45,0
    write "%vf64(r63) %x128(r63)"
    ldq  %r63, %r45,0
    write "%vf64(r63) %x128(r63)"
    fmulq %r62, %r63, %r63
    write "%vf64(r62)"
    lduw %r60, %r47,0
    write "%vf64(r60)"
    ldud %r59, %r46,0
    lduw %r58, %r47,4
    lduw %r57, %r47,8
    write "%vf64(r57)"
    write "%vf64(r58)"
    write "%vf64(r59)"
    ldq %r53, %r45,1*16
    write "%vf64(r53)"
    ldq %r50, %r45,2*16
    write "%vf64(r50)"
    ldq %r49, %r45,3*16
    write "%vf64(r49) %x128(r49)"
    lduw %r48, %r47,3*4
    write "%vf64(r48)"
    fnegq %r46, %r48
    write "%vf64(r46)"
    fmaddq %r40, %r52, %r52, %r53
    write "%m(dump)"

; 13 consecutive 16-byte entries: quad literals mixed with raw d8 pairs
.rodata
    align 16
__yyy:
    quad 0.5
    quad 1.0
    quad 2.25
    quad 22252.22424
    quad -22252.22424
    quad 34.125
    quad 2.0 / 72.0
    d8 0xffffffffffffffff
    d8 0x3ffe
    d8 0xffffffffffffffff
    d8 0x3ff0
    d8 0x8000000000000000
    d8 0xbff3
    d8 0x8000000000000000
    d8 0xc003
    quad -1.234567890123456789012345e+6
    d8 0x8000000000000000
    d8 0x3fe0
.text
; print every 16-byte entry of __yyy (indices 0..12) as float and as raw hex
    ldarc %r12, __yyy
    ldq %r23, %r12, 0
    write "%vf64(r23) %x128(r23)"
    ldq %r23, %r12, 1*16
    write "%vf64(r23) %x128(r23)"
    ldq %r23, %r12, 2*16
    write "%vf64(r23) %x128(r23)"
    ldq %r23, %r12, 3*16
    write "%vf64(r23) %x128(r23)"
    ldq %r23, %r12, 4*16
    write "%vf64(r23) %x128(r23)"
    ldq %r23, %r12, 5*16
    write "%vf64(r23) %x128(r23)"
    ldq %r23, %r12, 6*16
    write "%vf64(r23) %x128(r23)"
    ldq %r27, %r12, 7*16
    write "%vf64(r27) %x128(r27)"
    ldq %r27, %r12, 8*16
    write "%vf64(r27) %x128(r27)"
    ldq %r27, %r12, 9*16
    write "%vf64(r27) %x128(r27)"
    ldq %r27, %r12, 10*16
    write "%vf64(r27) %x128(r27)"
;   flddi %r24, 8.5899345919999999995e+09 ;-1.234567890123456789012345e+6
;   write "%vf64(r24) %x128(f24)"
;   flddi %r24, 0.125 ; 4.656612873077392578125e-10 ; 4.656612873077392578125e-10
;   write "%vf64(r24) %x128(f24)"
    ldq %r25, %r12, 11*16
    write "%vf64(r25) %x128(r25)"
    ldq %r25, %r12, 12*16
    write "%vf64(r25) %x128(r25)"
    fldqr %r40, 4.345678912345678901234567890123456789012345678
    write "%vf64(r40)"


; fmaddd with many different register combinations; results are not printed
    fmaddd %r23, %r60, %r55, %r33
    fmaddd %r24, %r61, %r25, %r32
    fmaddd %r25, %r62, %r55, %r23
    fmaddd %r26, %r63, %r75, %r73
    fmaddd %r27, %r64, %r75, %r73
    fmaddd %r28, %r65, %r85, %r63
    fmaddd %r29, %r66, %r85, %r63
    fmaddd %r30, %r67, %r95, %r23
    fmaddd %r31, %r68, %r95, %r23
    fmaddd %r10, %r21, %r26, %r27
    fmaddd %r13, %r21, %r26, %r27
    fmaddd %r10, %r21, %r26, %r27
    fmaddd %r12, %r21, %r26, %r27
    fmaddd %r11, %r21, %r26, %r27
    fmaddd %r13, %r21, %r26, %r27
    fmaddd %r14, %r21, %r26, %r27
    fmaddd %r15, %r21, %r26, %r27
    fmaddd %r16, %r21, %r26, %r27
    fmaddd %r17, %r21, %r26, %r27

; store r16..r31 to consecutive 16-byte slots at sp+32 .. sp+272
    stq %r16, %sp,16*2
    stq %r17, %sp,16*3
    stq %r18, %sp,16*4
    stq %r19, %sp,16*5
    stq %r20, %sp,16*6
    stq %r21, %sp,16*7
    stq %r22, %sp,16*8
    stq %r23, %sp,16*9
    stq %r24, %sp,16*10
    stq %r25, %sp,16*11
    stq %r26, %sp,16*12
    stq %r27, %sp,16*13
    stq %r28, %sp,16*14
    stq %r29, %sp,16*15
    stq %r30, %sp,16*16
    stq %r31, %sp,16*17


; SSE single
    vfmadds %r58, %r61, %r92, %r63
    vfmsubs %r82, %r52, %r92, %r63
    vfnmadds %r82, %r52, %r69, %r63
    vfnmsubs %r50, %r91, %r69, %r63
    vfmaddsubs %r82, %r52, %r69, %r63
    vfmsubadds %r50, %r91, %r69, %r63
    vfadds %r61, %r94, %r69
    vfnadds %r68, %r54, %r72
    vfsubs %r68, %r61, %r82
    vfaddsubs %r81, %r71, %r82
    vfsubadds %r82, %r71, %r82
    vfhadds %r62, %r61, %r82
    vfhsubs %r62, %r61, %r62
    vfmuls %r62, %r51, %r62
    vfhmuls %r63, %r51, %r62
    vfdots %r83, %r51, %r62
    vfmins %r83, %r61, %r62
    vfmaxs %r63, %r71, %r62
    vfaminnms %r64, %r71, %r82
    vfamaxnms %r64, %r71, %r82

; vector compares (single)
    vfcmpones %r65, %r61, %r62
    vfcmpolts %r74, %r61, %r62
    vfcmpoges %r83, %r61, %r62
    vfcmpuges %r72, %r61, %r62
    vfcmpuges %r11, %r61, %r62
    vfcmpus %r20, %r61, %r62

; pack, sign manipulation, rounding, divide and square roots (single)
    vfpacks %r33, %r64, %r62
    vfnegs %r60, %r69
    vfabds %r61, %r68, %r3
    vfnabds %r62, %r67, %r3
    vfrnds %r63, %r66, rdn
    vfrnds %r64, %r65, rup
    vfrnds %r65, %r64, rne
    vfrnds %r66, %r63, rtz
    vfdivs %r67, %r62, %r20
    vfsqrts %r68, %r61
    vfrsqrts %r69, %r60

    vfadds %r24, %r61, %r60
    vfmuld %r47, %r60, %r46

; target of the jmp at the top of this block
endfpsimd:

.end
.text
; Integer SIMD test: loads four 16-byte vectors from mmxdata, prints them in
; several lane widths, then runs byte-lane add/sub/avg/min/max/merge
; operations with printed results. The remaining instructions (pack, shifts,
; compares, unpack, other lane widths) are issued without printing results.
.rodata
    align  16
mmxdata:
    d8  0x123456759eabcd7f
    d8  0x123456789cabcdef

    d8  0xf87f5432afebcdf3
    d8  0xffffffffffffffff

    d8  0x1234567890abcdef
    d8  0x1234567890abcdef

    d8  0x1234567890abcdef
    d8  0x1234567890abcdef
.text
    alloc 90
; r4 serves as the base address and is itself overwritten by the last load
    ldarc %r4, mmxdata
    ldq %r1, %r4,0*16
    ldq %r2, %r4,1*16
    ldq %r3, %r4,2*16
    ldq %r4, %r4,3*16
    write "r1      %x128(r1)"
    write "r2      %x128(r2)"

; the same vector printed as 8-, 16-, 32- and 64-bit unsigned lanes
    write "%vu8(r1)"
    write "%vu16(r1)"
    write "%vu32(r1)"
    write "%vu64(r1)"

; NOTE(review): the add sections print r1..r4 with four different lane widths,
; while the sub sections below use 8-bit lanes throughout - confirm this is
; intentional.
    vaddb %r3, %r1, %r2
    write  "test vadd/vaddc (1 byte)\0"
    vaddcb %r4, %r1, %r2
    write  "%vu8(r1)"
    write  "%vu16(r2)"
    write  "%vu32(r3)"
    write  "%vu64(r4)"
    write  "test vadd/vaddo signed (1 byte)\0"
    vaddob %r4, %r1, %r2
    write  "%vi8(r1)"
    write  "%vi16(r2)"
    write  "%vi32(r3)"
    write  "%vu64(r4)"

    vsubb %r3, %r1, %r2
    write  "test vsub/vsubb (1 byte)\0"
    vsubcb %r4, %r1, %r2
    write  "%vu8(r1)"
    write  "%vu8(r2)"
    write  "%vu8(r3)"
    write  "%vu8(r4)"
    write  "test vsub/vsubo signed (1 byte)\0"
    vsubob %r4, %r1, %r2
    write  "%vi8(r1)"
    write  "%vi8(r2)"
    write  "%vi8(r3)"
    write  "%vu8(r4)"

; saturating add/sub: r3 = plain result, r4 = saturating result, both printed
    write  "test vaddusb"
    vaddb %r3, %r1, %r2
    vaddusatb %r4, %r1, %r2
    write  "%vu8(r1)\n%vu8(r2)\n%vu8(r3)\n%vu8(r4)"

    write  "test vsubusb"
    vsubb %r3, %r1, %r2
    vsubusatb %r4, %r1, %r2
    write  "%vu8(r1):\n%vu8(r2)\n%vu8(r3)\n%vu8(r4)"

    write  "test vaddssb"
    vaddb %r3, %r1, %r2
    vaddssatb %r4, %r1, %r2
    write  "%vi8(r1)\n%vi8(r2)\n%vi8(r3)\n%vi8(r4)"

    write  "test vsubssb"
    vsubb %r3, %r1, %r2
    vsubssatb %r4, %r1, %r2
    write  "%vi8(r1)\n%vi8(r2)\n%vi8(r3)\n%vi8(r4)"

; average, min, max and merge on byte lanes: inputs r1, r2 and result r3 printed
    write  "test pavgu (1 byte)\0"
    vavgub %r3, %r1, %r2
    write  "%vu8(r1)\n%vu8(r2)\n%vu8(r3)"

    write  "test pavgs (1 byte)\0"
    vavgsb %r3, %r1, %r2
    write  "%vi8(r1)\n%vi8(r2)\n%vi8(r3)"

    write  "test vminu (1 byte)\0"
    vminub %r3, %r1, %r2
    write  "%vu8(r1)\n%vu8(r2)\n%vu8(r3)"

    write  "test vmins (1 byte)\0"
    vminsb %r3, %r1, %r2
    write  "%vi8(r1)\n%vi8(r2)\n%vi8(r3)"

    write  "test vmaxu (1 byte)\0"
    vmaxub %r3, %r1, %r2
    write  "%vu8(r1)\n%vu8(r2)\n%vu8(r3)"

    write  "test vmaxs (1 byte)\0"
    vmaxsb %r3, %r1, %r2
    write  "%vi8(r1)\n%vi8(r2)\n%vi8(r3)"

    write  "test merge low (1 byte)\0"
    vmergelb %r3, %r1, %r2
    write  "%vu8(r1)\n%vu8(r2)\n%vu8(r3)"

    write  "test merge high (1 byte)\0"
    vmergehb %r3, %r1, %r2
    write  "%vu8(r1)\n%vu8(r2)\n%vu8(r3)"

; pack instructions for the h, w and d source widths; results are not printed
    vpkusatsh %r2, %r3, %r4
    vpkusath %r2, %r3, %r4
    vpkssath %r2, %r3, %r4

    vpkusatsw %r2, %r3, %r4
    vpkusatw %r2, %r3, %r4
    vpkssatw %r2, %r3, %r4

    vpkusatsd %r2, %r3, %r4
    vpkusatd %r2, %r3, %r4
    vpkssatd %r2, %r3, %r4

; The jump below is commented out, so the following instructions do execute.
; Their source registers are not initialized in this block and no result is
; printed.
;  jmp  endmmx
; d1 abs
    vminsb %r12, %r61, %r55
    vminsh %r18, %r61, %r45
    vminsw %r27, %r61, %r35
    vminsd %r36, %r61, %r25

    vminub %r14, %r61, %r15
    vminuh %r15, %r62, %r75
    vminuw %r17, %r63, %r85
    vminud %r16, %r64, %r75

    vmaxsb %r26, %r71, %r85
    vmaxsh %r26, %r61, %r54
    vmaxsw %r16, %r51, %r35
    vmaxsd %r16, %r41, %r55

    vmaxub %r11, %r61, %r53
    vmaxuh %r12, %r55, %r55
    vmaxuw %r16, %r46, %r56
    vmaxud %r13, %r31, %r55

    vslpb %r56, %r61, %r15, %r44
    vslph %r31, %r61, %r25, %r44
    vslpw %r53, %r61, %r30, %r44
    vslpd %r62, %r61, %r41, %r44

    vsrpb %r16, %r11, %r52, %r44
    vsrph %r11, %r21, %r63, %r44
    vsrpw %r71, %r31, %r74, %r44
    vsrpd %r81, %r41, %r85, %r44

    vsllb %r16, %r51, %r86
    vsllh %r24, %r61, %r55
    vsllw %r69, %r71, %r55
    vslld %r77, %r81, %r55

    vsrlb %r21, %r81, %r50
    vsrlh %r12, %r63, %r51
    vsrlw %r13, %r62, %r52
    vsrld %r64, %r63, %r53

    vsrab %r85, %r64, %r54
    vsrah %r76, %r65, %r15
    vsraw %r67, %r66, %r25
    vsrad %r58, %r67, %r36

    vavgsb %r49, %r68, %r47
    vavgsh %r30, %r69, %r58
    vavgsw %r26, %r11, %r69
    vavgsd %r16, %r21, %r75

    vavgub %r14, %r31, %r85
    vavguh %r15, %r41, %r45
    vavguw %r56, %r51, %r25
    vavgud %r87, %r61, %r15

    vaddssatb %r42, %r71, %r15
    vaddssath %r83, %r81, %r45
    vaddssatw %r74, %r41, %r85
    vaddssatd %r65, %r61, %r75

    vaddb %r56, %r61, %r75
    vaddh %r47, %r61, %r65
    vaddw %r38, %r61, %r55
    vaddd %r29, %r61, %r55

    vaddusatb %r55, %r61, %r45
    vaddusath %r65, %r61, %r35
    vaddusatw %r74, %r61, %r25
    vaddusatd %r84, %r61, %r15

    vaddcb %r53, %r61, %r55
    vaddch %r13, %r61, %r55
    vaddcw %r12, %r61, %r55
    vaddcd %r12, %r61, %r55

    vsubssatb %r56, %r61, %r15
    vsubssath %r67, %r61, %r12
    vsubssatw %r78, %r61, %r13
    vsubssatd %r89, %r61, %r45

    vsubb %r70, %r61, %r85
    vsubh %r86, %r61, %r45
    vsubw %r46, %r61, %r13
    vsubd %r46, %r61, %r75

    vsubusatb %r41, %r68, %r65
    vsubusath %r12, %r37, %r55
    vsubusatw %r23, %r26, %r45
    vsubusatd %r14, %r18, %r35

    vcmpeqb %r86, %r61, %r25
    vcmpeqh %r44, %r72, %r15
    vcmpeqw %r20, %r83, %r55
    vcmpeqd %r16, %r84, %r55

    vcmpltsb %r13, %r61, %r15
    vcmpltsh %r14, %r61, %r24
    vcmpltsw %r15, %r61, %r38
    vcmpltsd %r16, %r61, %r45

    vcmpltub %r19, %r11, %r75
    vcmpltuh %r18, %r21, %r82
    vcmpltuw %r16, %r31, %r73
    vcmpltud %r14, %r71, %r54

    vmergehb %r11, %r71, %r13
    vmergehh %r72, %r67, %r27
    vmergehw %r13, %r58, %r55
    vmergehd %r14, %r69, %r15

    vmergelb %r76, %r61, %r11
    vmergelh %r26, %r11, %r62
    vmergelw %r16, %r15, %r73
    vmergeld %r16, %r11, %r85

    vupklsb  %r76, %r61
    vupklsh  %r76, %r61
    vupklsw  %r76, %r61
    vupklub  %r76, %r61
    vupkluh  %r76, %r61
    vupkluw  %r76, %r61

    vupkhsb  %r76, %r61
    vupkhsh  %r76, %r61
    vupkhsw  %r76, %r61
    vupkhub  %r76, %r61
    vupkhuh  %r76, %r61
    vupkhuw  %r76, %r61

    write "end simd(int) test"
; label kept for the commented-out "jmp endmmx" above
endmmx:

.end
.text
; System instruction test. Only the tpa sequence on sp runs: the code then
; jumps to system_skip, so the privileged/system instructions in between are
; assembled but not executed (matching the "assembler only" marker strings).
    alloc 70
    write "test system instructions (assembler only)"

    addid %sp, %sp, -32 ; alloc stack frame
    write "test tpa for sp: 0x%x64(sp)"
    tpa %r4, %sp, %gz
    write "tpa(sp): 0x%x64(r4)"
    addid %sp, %sp, 32 ; rollback stack frame

    jmp system_skip

; --- everything from here to system_skip is skipped by the jmp above ---
    ldi %r45, 1012
    syscall
    nop 0
    sysret
    rfi

; cache-block instructions, all with base r34 and offset 16
    icbi %r34, 16
    dcbt %r34, 16
    dcbf %r34, 16
    dcbi %r34, 16


    getspr %r34, %lid
    setspr %r34, %lid
    mprobe %r34, %r45, %r66
    retf 234567

    getspr %r32, %iv
    getspr %r32, %psr

; test system instructions
    ptc %r10, %r45, %r11

    getspr %r12, %pta
    getspr %r12, %fpcr
    getspr %r11, %rsc

; test atomic fences
    fence acquire
    fence release
    fence acq_rel
    fence seq_cst

; debug / breakpoint / translation register accessors
    setdbr %r44, %r66, 0
    getdbr %r55, %r66, 0
    setibr %r44, %r66, 0
    getibr %r55, %r66, 0
    setitr %r44, %r66, %r12
    setdtr %r44, %r66, %r12

; bpa b7, %r7
; bpal b7, b4, %r6
; lpr b7, %r6, label16

    undef
system_skip:
    write "end test system instructions (assembler only)"
.end
.text
; Unaligned load test: defines 32 consecutive bytes 0x00..0x1f and loads
; 2-, 4-, 8- and 16-byte values from them at successive byte offsets,
; printing each loaded value in hex.
; The marker string says "load/store", but only loads are issued here.
.data
; Align first, then bind the label, so that data_unaligned names the
; 16-byte aligned address of byte 0x00 rather than any padding inserted by
; the align directive. Every non-zero offset used below is then a known
; misalignment. This matches the align-then-label order used by the other
; data labels in this file.
align 16
data_unaligned:
    d1  0x00
    d1  0x01
    d1  0x02
    d1  0x03
    d1  0x04
    d1  0x05
    d1  0x06
    d1  0x07
    d1  0x08
    d1  0x09
    d1  0x0a
    d1  0x0b
    d1  0x0c
    d1  0x0d
    d1  0x0e
    d1  0x0f

    d1  0x10
    d1  0x11
    d1  0x12
    d1  0x13
    d1  0x14
    d1  0x15
    d1  0x16
    d1  0x17
    d1  0x18
    d1  0x19
    d1  0x1a
    d1  0x1b
    d1  0x1c
    d1  0x1d
    d1  0x1e
    d1  0x1f

.text
    write "load/store unaligned"
    alloc 96
; r17 = base address of the byte table
    ldard %r17, data_unaligned

; 2-byte loads at offsets 0..2
    lduh  %r3, %r17, 0
    write  "%x16(r3)"
    lduh  %r3, %r17, 1
    write  "%x16(r3)"
    lduh  %r3, %r17, 2
    write  "%x16(r3)"

; 4-byte loads at offsets 0..4
    lduw  %r3, %r17, 0
    write  "%x32(r3)"
    lduw  %r3, %r17, 1
    write  "%x32(r3)"
    lduw  %r3, %r17, 2
    write  "%x32(r3)"
    lduw  %r3, %r17, 3
    write  "%x32(r3)"
    lduw  %r3, %r17, 4
    write  "%x32(r3)"

; 8-byte loads at offsets 0..8
    ldud  %r3, %r17, 0
    write  "%x64(r3)"
    ldud  %r3, %r17, 1
    write  "%x64(r3)"
    ldud  %r3, %r17, 2
    write  "%x64(r3)"
    ldud  %r3, %r17, 3
    write  "%x64(r3)"
    ldud  %r3, %r17, 4
    write  "%x64(r3)"
    ldud  %r3, %r17, 5
    write  "%x64(r3)"
    ldud  %r3, %r17, 6
    write  "%x64(r3)"
    ldud  %r3, %r17, 7
    write  "%x64(r3)"
    ldud  %r3, %r17, 8
    write  "%x64(r3)"

; 16-byte loads at offsets 0..16; the last one reads bytes 0x10..0x1f,
; i.e. exactly the second half of the table
    ldq  %r3, %r17, 0
    write  "%x128(r3)"
    ldq  %r3, %r17, 1
    write  "%x128(r3)"
    ldq  %r3, %r17, 2
    write  "%x128(r3)"
    ldq  %r3, %r17, 3
    write  "%x128(r3)"
    ldq  %r3, %r17, 4
    write  "%x128(r3)"
    ldq  %r3, %r17, 5
    write  "%x128(r3)"
    ldq  %r3, %r17, 6
    write  "%x128(r3)"
    ldq  %r3, %r17, 7
    write  "%x128(r3)"
    ldq  %r3, %r17, 8
    write  "%x128(r3)"
    ldq  %r3, %r17, 9
    write  "%x128(r3)"
    ldq  %r3, %r17, 10
    write  "%x128(r3)"
    ldq  %r3, %r17, 11
    write  "%x128(r3)"
    ldq  %r3, %r17, 12
    write  "%x128(r3)"
    ldq  %r3, %r17, 13
    write  "%x128(r3)"
    ldq  %r3, %r17, 14
    write  "%x128(r3)"
    ldq %r3, %r17, 15
    write  "%x128(r3)"
    ldq %r3, %r17, 16
    write  "%x128(r3)"
.end
.rodata
; Table switch test: table_cases holds three 4-byte entries, each the
; distance from table_cases to one case label. The code loads index 1 and the
; table address and dispatches with jmpt. Every case ends with a jump to
; label_after_switch.
align 4
; four 4-byte words holding label addresses; not referenced by the code below
    d4 table_cases
    d4 label_0
    d4 label_1
    d4 label_2

table_cases:
    i4 label_0 - table_cases
    i4 label_1 - table_cases
    i4 label_2 - table_cases

.text
    alloc 80
    write "test table switch to case 1"
; r4 = case index, r5 = table base
    ldi %r4, 1
    ldard %r5, table_cases
    jmpt %r5, %r4

; the case bodies are arbitrary filler instructions after the "case N" write
label_0:
    write "case 0"
    cmpeqq  %r12, %r24, %gz
    cmpneq  %r12, %r24, %gz
    depositv %r18, %r20, %r13, %r32
    deposit %r19, %r23, %r12, 13, 32
    ldi %r12, -1234
    ldi %r13, 3456
    jmp  label_after_switch

label_1:
    write "case 1"
    andi %r45, %r44, 12345
    sladdd %r14, %sp, %r12, 2
    sladdd %r12, %r23, %r44, 3
    mov %r12, %r13
    ldi %r24, 0
    setspr %r24, %psr
    getspr %r12, %psr
    nand %r34, %r34, %r45
    slld %r12, %r23, %r45
    sllid %r12, %r23, 45
    jmp label_after_switch

label_2:
    write   "case 2"
    addid  %r34, %r34,-1
    mov     %r58, %r45
    sladdd  %r12, %r15, %r30, 14
    sladdd  %r12, %r15, %r30, 5
    sladdd  %r12, %r15, %r30, 5
    divp2d     %r34, %r56, %r40
    divp2id    %r34, %r56, 40
    depositv   %r40, %r78, %r40, %r20
    sladdd  %r54, %r45, %r22, 4
    sladdd  %r54, %r45, %r22, 20
    ldaxsd  %r3, %r45, %tp, 3, 55
    jmp  label_after_switch

label_after_switch:
    write "end table switch test"
.end
.rodata
; Console write formatting test: prints special registers, the sp register
; in every integer width/signedness/hex format, a set of general registers,
; and finally quad/double/float constants loaded from the table below.
    align  16
console_test_quad:
    quad  1.189731495357231765085759326628007e+4932
console_test_quad2:
    quad  1.23456789 + 32.0
console_test_quad3:
    quad  0.2345678901234567890123456789012345678 + 0.2
    quad  2*asin(1)
    quad  255
console_test_double:
    double  acos(sin(3.1415926)) ;-1.2345678e+200
    double  444.689679
console_test_float:
    float  0.123456789123456789e+30
    float  2.123456789122233
    float  0.0
    float  1.0
.text
    alloc  35
    write  "ip=%s(ip), eip=%s(eip), psr=%s(psr)"

    write  "end test write special regs"

    write  "\ntest write: general register"

; "%%" in the format string produces a literal '%', so each line shows the
; format specifier next to the value it formats
    write  "%%i8(sp)  = %i8(sp)"
    write  "%%i16(sp) = %i16(sp)"
    write  "%%i32(sp) = %i32(sp)"
    write  "%%i64(sp) = %i64(sp)"
    write  "%%u8(sp)  = %u8(sp)"
    write  "%%u16(sp) = %u16(sp)"
    write  "%%u32(sp) = %u32(sp)"
    write  "%%u64(sp) = %u64(sp)"
    write  "%%x8(sp)  = 0x%x8(sp)"
    write  "%%x16(sp) = 0x%x16(sp)"
    write  "%%x32(sp) = 0x%x32(sp)"
    write  "%%x64(sp) = 0x%x64(sp)"

; a selection of registers referenced by name inside the format string
    write  "%x64(r0)"
    write  "%x64(r1)"
    write  "%x64(r2)"
    write  "%x64(r22)"
    write  "%x64(r33)"
    write  "%x64(g0)"
    write  "%x64(g1)"
    write  "%x64(tp)"
    write  "%x64(sp)"

    write  "end test write general regs"

; floating-point formats: each constant is printed as raw hex and as a float
    ldqr  %r22, console_test_quad
    write "r22 = %x128(r22) %f128(r22)"
    ldqr  %r22, console_test_quad2
    write "r22 = %x128(r22) %f128(r22)"
    ldqr  %r22, console_test_quad3
    write "r22 = %x128(r22) %f128(r22)"
    ldudr %r22, console_test_double
    write "r22 = %x64(r22) %f64(r22)"
    lduwr %r22, console_test_float
    write "r22 = %x32(r22) %f32(r22)"

    write "end test write fp regs"
.end