// pi_x64.s - calculates Pi using the Leibniz formula.
// It prints the running approximation to 50 decimal places after every
// 100,000,000 loop iterations (each iteration adds two terms of the series).
// This is not an optimal implementation and it runs forever.
//
// x86-64 with SSE3, for Linux on Intel; GNU assembler (AT&T syntax), gcc
//
// assemble: as pi_x64.s -o pi_x64.o
// link: gcc -o pi_x64 pi_x64.o
// run: ./pi_x64
// output: 3.14159264858204423376264458056539297103881835937500
// 3.14159265108366625440794450696557760238647460937500
// 3.14159265191852199450295302085578441619873046875000
// 3.14159265233600137889879988506436347961425781250000
// .... and on forever ...
// Constants used by main. None of them is ever written, so they live in
// .rodata. main loads the packed values with aligned 128-bit moves, which
// require 16-byte alignment; each packed constant is exactly 16 bytes, so
// a single .align keeps every one of them aligned.
        .section .rodata
        .align  16
denom:  .double 1.0, 3.0                // starting pair of denominators
numer:  .double 4.0, -4.0               // numerators of one +/- term pair
add4:   .double 4.0, 4.0                // added to both denominators each iteration
zero:   .double 0.0, 0.0                // initial value of the two partial sums
msg:    .string "%1.50f\n"              // printf format: 50 decimals, one value per line
//-----------------------------------------------------------------------
// int main(void)
// ABI:   System V AMD64 (Linux), position-independent
//
// Sums the Leibniz series  4/1 - 4/3 + 4/5 - 4/7 + ...  two terms per loop
// iteration using packed doubles. After every ITERATIONS_PER_PRINT
// iterations the two partial sums are added together and printed with
// printf. The function never returns.
//
// Register roles inside the inner loop:
//   xmm2 = scratch: the two terms of the current iteration
//   xmm3 = { 4.0,  4.0}  denominator step            (constant)
//   xmm4 = { 4.0, -4.0}  numerators                  (constant)
//   xmm5 = two partial sums                          (state)
//   xmm6 = current pair of denominators              (state)
//   ecx  = iterations left before the next print
//
// Every xmm register is caller-saved, so printf may destroy all of them:
// the state is spilled to the stack around the call and the constants are
// reloaded from memory afterwards.
//
// Stack: 32 bytes for two 16-byte spill slots. rsp is 16-byte aligned at
// the call, and both slots are 16-byte aligned for movapd.
//-----------------------------------------------------------------------
        .equ    ITERATIONS_PER_PRINT, 100000000

        .section .text
        .globl  main
        .type   main, @function
        .align  64
main:
        pushq   %rbp
        movq    %rsp, %rbp
        subq    $32, %rsp               // spill slots at -16(%rbp) and -32(%rbp)

        movapd  denom(%rip), %xmm6      // denominators start at {1, 3}
        movapd  zero(%rip), %xmm5       // partial sums start at {0, 0}

.Lnext_batch:
        // Also reached after printf, so the constants are (re)loaded here.
        movapd  numer(%rip), %xmm4
        movapd  add4(%rip), %xmm3
        movl    $ITERATIONS_PER_PRINT, %ecx

.Lterm_loop:
        movapd  %xmm4, %xmm2            // {+4, -4}
        divpd   %xmm6, %xmm2            // {+4/d0, -4/d1}
        addpd   %xmm2, %xmm5            // accumulate both terms
        addpd   %xmm3, %xmm6            // d0 += 4, d1 += 4
        subl    $1, %ecx
        jnz     .Lterm_loop

        // Preserve the series state across the call.
        movapd  %xmm5, -16(%rbp)
        movapd  %xmm6, -32(%rbp)

        movapd  %xmm5, %xmm0
        haddpd  %xmm0, %xmm0            // xmm0 low = sum of both partial sums
        leaq    msg(%rip), %rdi
        movl    $1, %eax                // al = number of vector registers used (variadic call)
        call    printf@PLT

        movapd  -16(%rbp), %xmm5
        movapd  -32(%rbp), %xmm6
        jmp     .Lnext_batch

        // Not reached: the loop above never exits. Kept so the function
        // has a well-formed epilogue if an exit condition is ever added.
        xorl    %eax, %eax
        leave
        ret
        .size   main, .-main