/*personal notes of renzo diomedi*/

~ 00001001 ~



.section .text
.globl _start

.equ SYS_EXIT, 1                # service number placed in %eax for int $0x80
.equ EXIT_STATUS, 0             # status value placed in %ebx for the exit call

# Load each of the x87 built-in constants, one instruction per constant.
# Every load pushes onto the FPU register stack, so the first value
# loaded (+1.0) ends up deepest (ST6) and the last one (+0.0) sits in ST0.
_start:
        nop
        fld1                    # +1.0
        fldl2t                  # log2(10)
        fldl2e                  # log2(e)
        fldpi                   # pi
        fldlg2                  # log10(2)
        fldln2                  # ln(2)
        fldz                    # +0.0

        movl    $SYS_EXIT, %eax
        movl    $EXIT_STATUS, %ebx
        int     $0x80


(gdb) info all

st0 0 (raw 0x00000000000000000000)
st1 0.6931471805599453094286904741849753 (raw 0x3ffeb17217f7d1cf79ac)
st2 0.30102999566398119522564642835948945 (raw 0x3ffd9a209a84fbcff799)
st3 3.1415926535897932385128089594061862 (raw 0x4000c90fdaa22168c235)
st4 1.4426950408889634073876517827983434 (raw 0x3fffb8aa3b295c17f0bc)
st5 3.3219280948873623478083405569094566 (raw 0x4000d49a784bcd1b8afe)
st6 1 (raw 0x3fff8000000000000000)
st7 0 (raw 0x00000000000000000000)


The SSE technology incorporates eight 128-bit XMM registers that can be used to hold packed floating-point numbers.

Floating-point calculations can be performed in parallel using the multiple data elements, producing results quicker than sequentially processing the data.



The following two new 128-bit floating-point data types are available:

❑ 128-bit packed single-precision floating-point (in SSE)

❑ 128-bit packed double-precision floating-point (in SSE2)

Because a single-precision floating-point value requires 32 bits, the 128-bit register can hold four packed single-precision floating-point values.

These new data types are not available in the FPU or MMX registers.
They can only be used in the XMM registers and only on processors that support SSE or SSE2.
Special instructions must be used to load and retrieve the data values, as well as special math instructions for performing mathematical operations on the packed floating-point data.

SSE floating-point values
There is a complete set of instructions for moving 128-bit packed single-precision floating-point values between memory and the XMM registers on the processor: MOVAPS, MOVUPS, MOVSS, MOVLPS, MOVHPS, MOVLHPS and MOVHLPS.



Each of these instructions uses the 128-bit XMM registers to move packed 32-bit single-precision floating-point values between the XMM registers and memory. Not only can you move entire groups of packed single-precision floating-point values, you can also move a subset of two packed single-precision floating-point values between XMM registers.

# ssefloat
# developed in mingw-w64 environment

.section .data
value1:                                 # first packed operand: four 32-bit floats (16 bytes)
        .float  12.34, 2345.543, -3493.2, 0.44901
value2:                                 # second packed operand: four 32-bit floats (16 bytes)
        .float  -5439.234, 32121.4, 1.0094, 0.000003
.section .bss
        .lcomm  data, 16                # 16-byte buffer, the width of one XMM register
.section .text
        nop
        movups  value1, %xmm0           # xmm0 = the four floats at value1
        movups  value2, %xmm1           # xmm1 = the four floats at value2
        movups  %xmm0, %xmm2            # register-to-register copy: xmm2 = xmm0
        movups  %xmm0, data             # store all 16 bytes of xmm0 into data
        movl    $1, %eax
        movl    $0, %ebx
        int     $0x21                   # NOTE(review): 0x21 is the DOS service vector; the gdb session in these notes never steps this far - confirm how this Windows build is meant to exit




ssefloat.s

ssefloat.exe




C:\>as -gstabs -o users\rnz\desktop\ssefloat.o users\rnz\desktop\ssefloat.s
C:\>ld -o users\rnz\desktop\ssefloat.exe users\rnz\desktop\ssefloat.o
C:\>gdb -q users\rnz\desktop\ssefloat.exe
Reading symbols from users\rnz\desktop\ssefloat.exe...done.
(gdb) break 1
Breakpoint 1 at 0x401000: file users\rnz\desktop\ssefloat.s, line 1.
(gdb) run
Starting program: C:\users\rnz\desktop\ssefloat.exe
[New Thread 13684.0x42b4]
Breakpoint 1, ?? () at users\rnz\desktop\ssefloat.s:10
10 nop
(gdb) s
11 movups value1, %xmm0
(gdb) s
12 movups value2, %xmm1
(gdb) s
13 movups %xmm0, %xmm2
(gdb) s
14 movups %xmm0, data

(gdb) print $xmm0
$1 = {v4_float = {12.3400002, 2345.54297, -3493.19995, 0.449010015}, v2_double = {5.6204289471764299e+24, 1.0439462282443856e-05}, v16_int8 = {-92, 112, 69, 65, -80, -104, 18, 69, 51, 83, 90, -59, -92, -28, -27, 62}, v8_int16 = {28836, 16709, -26448, 17682, 21299, -15014, -7004, 16101}, v4_int32 = {1095069860, 1158846640, -983936205, 1055253668}, v2_int64 = {4977208420974555300, 4532279996355072819}, uint128 = 83605809163155287727927076236493680804}

(gdb) print $xmm1
$2 = {v4_float = {-5439.23389, 32121.4004, 1.00940001, 3.00000011e-06}, v2_double = {8.7452727745837517e+33, 3.4658329842889617e-47}, v16_int8 = {-33, -7, -87, -59, -51, -14, -6, 70, 5, 52, -127, 63, -100, 83, 73, 54}, v8_int16 = {-1569, -14935, -3379, 18170, 13317, 16257, 21404, 13897}, v4_int32 = {-978716193, 1190851277, 1065432069, 910775196}, v2_int64 = {5114667292431088095, 3911749681893422085}, uint128 = 72159045262302707577450683077612927455}

(gdb) print $xmm2
$3 = {v4_float = {12.3400002, 2345.54297, -3493.19995, 0.449010015}, v2_double = {5.6204289471764299e+24, 1.0439462282443856e-05}, v16_int8 = {-92, 112, 69, 65, -80, -104, 18, 69, 51, 83, 90, -59, -92, -28, -27, 62}, v8_int16 = {28836, 16709, -26448, 17682, 21299, -15014, -7004, 16101}, v4_int32 = {1095069860, 1158846640, -983936205, 1055253668}, v2_int64 = {4977208420974555300, 4532279996355072819}, uint128 = 83605809163155287727927076236493680804}



(gdb) x/4f &data # NOTE the USE of " f " : 4f = four 4-byte single-precision floats = 16 bytes
0x403000 : 0 0 0 0 # still zero: gdb lists the NEXT line to run, so " movups %xmm0, data " (line 14) has not executed yet
(gdb) s
15 movl $1, %eax # this step executed line 14; line 15 is now the next to run
(gdb) x/4f &data
0x403000 : 12.3400002 2345.54297 -3493.19995 0.449010015
(gdb) x/4f &value1
0x402000 : 12.3400002 2345.54297 -3493.19995 0.449010015
(gdb) x/16b &value1
0x402000 : -92 112 69 65 -80 -104 18 69
0x402008 : 51 83 90 -59 -92 -28 -27 62
(gdb) x/16b &data
0x403000 : -92 112 69 65 -80 -104 18 69
0x403008 : 51 83 90 -59 -92 -28 -27 62
(gdb) x/16x &value1
0x402000 : 0xa4 0x70 0x45 0x41 0xb0 0x98 0x12 0x45
0x402008 : 0x33 0x53 0x5a 0xc5 0xa4 0xe4 0xe5 0x3e
(gdb) x/16x &data
0x403000 : 0xa4 0x70 0x45 0x41 0xb0 0x98 0x12 0x45
0x403008 : 0x33 0x53 0x5a 0xc5 0xa4 0xe4 0xe5 0x3e




The IA-32 platform also includes instructions for moving the new SSE2 packed double-precision floating-point data types: MOVAPD, MOVUPD, MOVSD, MOVHPD and MOVLPD.



Each of these instructions uses the 128-bit XMM register to move 64-bit double-precision floating-point values.
MOVAPD and MOVUPD instructions move the complete packed double-precision floating-point value into and out of the XMM registers.





# sse2float #### Gnu Linux environment
.section .data
value1:                                 # two 64-bit doubles = one 128-bit packed value
        .double 12.34, 2345.543
value2:                                 # second pair of doubles
        .double -5439.234, 32121.4
.section .bss
        .lcomm  data, 16                # 16-byte buffer, room for two doubles
.section .text
.globl _start
_start:
        nop
        movupd  value1, %xmm0           # xmm0 = the two doubles at value1
        movupd  value2, %xmm1           # xmm1 = the two doubles at value2
        movupd  %xmm0, %xmm2            # register-to-register copy: xmm2 = xmm0
        movupd  %xmm0, data             # store all 16 bytes of xmm0 into data
        movl    $1, %eax                # Linux int $0x80 convention: eax = 1 selects exit
        movl    $0, %ebx                # ebx = exit status 0
        int     $0x80




sse2double.s ; sse2double.o # windows environment


sse2double.exe




Because the data memory location contains two double-precision floating-point values, we must use the 2gf option of the x command to display both values stored at the memory location.




C:\>as -gstabs -o users\rnz\desktop\sse2double.o users\rnz\desktop\sse2double.s
C:\>ld -o users\rnz\desktop\sse2double.exe users\rnz\desktop\sse2double.o
C:\>gdb -q users\rnz\desktop\sse2double.exe
Reading symbols from users\rnz\desktop\sse2double.exe...done.
(gdb) break 1
Breakpoint 1 at 0x401000: file users\rnz\desktop\sse2double.s, line 1.
(gdb) run
Starting program: C:\users\rnz\desktop\sse2double.exe
[New Thread 1352.0x1c80]
Breakpoint 1, ?? () at users\rnz\desktop\sse2double.s:12
12 nop
(gdb) s
13 movupd value1, %xmm0
(gdb) s
14 movupd value2, %xmm1
(gdb) s
15 movupd %xmm0, %xmm2
(gdb) s
16 movupd %xmm0, data
(gdb) print $xmm0
$1 = {v4_float = {5.84860315e+35, 2.63562489, 1.79352231e-36, 5.07264233}, v2_double = {12.34, 2345.5430000000001}, v16_int8 = {-82, 71, -31, 122, 20, -82, 40, 64, 117, -109, 24, 4, 22, 83, -94, 64}, v8_int16 = {18350, 31457, -20972, 16424, -27787, 1048, 21270, 16546}, v4_int32 = {2061584302, 1076407828, 68719477, 1084379926}, v2_int64 = {4623136420479977390, 4657376318677619573}, uint128 = 85913429005601586953847513200535357358}
(gdb) print $xmm1
$2 = {v4_float = {-1.11704749e+24, -5.66396856, -1.58818684e-23, 6.98026705}, v2_double = {-5439.2340000000004, 32121.400000000001}, v16_int8 = {68, -117, 108, -25, 59, 63, -75, -64, -102, -103, -103, -103, 89, 94, -33, 64}, v8_int16 = {-29884, -6292, 16187, -16203, -26214, -26215, 24153, 16607}, v4_int32 = {-412316860, -1061863621, -1717986918, 1088380505}, v2_int64 = {-4560669521124488380, 4674558677155944858}, uint128 = 86230387575033986983375224144585853764}
(gdb) print $xmm2
$3 = {v4_float = {5.84860315e+35, 2.63562489, 1.79352231e-36, 5.07264233}, v2_double = {12.34, 2345.5430000000001}, v16_int8 = {-82, 71, -31, 122, 20, -82, 40, 64, 117, -109, 24, 4, 22, 83, -94, 64}, v8_int16 = {18350, 31457, -20972, 16424, -27787, 1048, 21270, 16546}, v4_int32 = {2061584302, 1076407828, 68719477, 1084379926}, v2_int64 = {4623136420479977390, 4657376318677619573}, uint128 = 85913429005601586953847513200535357358}
(gdb) x/2gf &data
0x403000 : 0 0
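# still zero: gdb lists the NEXT line to run, so line 16 ( movupd %xmm0, data ) has not executed yet
(gdb) s
17 movl $1, %eax # this step executed line 16; line 17 is now the next to run
(gdb) x/2gf &data # " g " = 8-byte units, " f " = floating-point format: shows both doubles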
0x403000 : 12.34 2345.5430000000001

(gdb) x/2gf &value1
0x402000 : 12.34 2345.5430000000001
(gdb) x/2gf &value2
0x402010 : -5439.2340000000004 32121.400000000001
(gdb) x/16b &data
0x403000 : -82 71 -31 122 20 -82 40 64
0x403008 : 117 -109 24 4 22 83 -94 64
(gdb) x/16b &value1
0x402000 : -82 71 -31 122 20 -82 40 64
0x402008 : 117 -109 24 4 22 83 -94 64
(gdb) x/16b &value2
0x402010 : 68 -117 108 -25 59 63 -75 -64
0x402018 : -102 -103 -103 -103 89 94 -33 64
(gdb) x/32x &data
0x403000 : 0xae 0x47 0xe1 0x7a 0x14 0xae 0x28 0x40
0x403008 : 0x75 0x93 0x18 0x04 0x16 0x53 0xa2 0x40
0x403010: 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
0x403018: 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
(gdb) x/16x &data
0x403000 : 0xae 0x47 0xe1 0x7a 0x14 0xae 0x28 0x40
0x403008 : 0x75 0x93 0x18 0x04 0x16 0x53 0xa2 0x40
(gdb) x/16x &value1
0x402000 : 0xae 0x47 0xe1 0x7a 0x14 0xae 0x28 0x40
0x402008 : 0x75 0x93 0x18 0x04 0x16 0x53 0xa2 0x40
(gdb) x/16x &value2
0x402010 : 0x44 0x8b 0x6c 0xe7 0x3b 0x3f 0xb5 0xc0
0x402018 : 0x9a 0x99 0x99 0x99 0x59 0x5e 0xdf 0x40
(gdb)







SSE3 instructions

❑ MOVSHDUP: Moves a 128-bit value from Memory or an XMM register, duplicating the second and fourth 32-bit data elements.
Thus, moving the data element consisting of 32-bit single-precision floating-point values DCBA would create the 128-bit packed single-precision floating-point value consisting of DDBB.

❑ MOVSLDUP: Moves a 128-bit value from Memory or an XMM register, duplicating the first and third 32-bit data elements.
Thus, moving the data element consisting of 32-bit single-precision floating-point values DCBA would create the 128-bit packed single-precision floating-point value consisting of CCAA.

❑ MOVDDUP: Moves a 64-bit double-precision floating-point value from Memory or an XMM register, duplicating it into a 128-bit XMM register.
Thus, moving the data element consisting of 64-bit double-precision floating-point value A would create the 128-bit packed double-precision floating-point value AA.



Home Page