π in assembly (spigot algorithm)




//   pi_spigot.s - calculates Pi using a spigot algorithm
//                 as an array of n digits in base 10000.
//                 http://mathworld.wolfram.com/SpigotAlgorithm.html
//
//  x86-64/SSE3 with for Linux, Intel, gnu assembler, gcc
//
//  assemble: as pi_spigot.s -o pi_spigot.o
//  link:     gcc -o pi_spigot pi_spigot.o
//  example run:      ./pi_spigot 100
//  output: 3.14159265358979323846264338327950288419716939937510582097494459230 ...
//        ... 78164062862089986280348253421170679
//
    .section    .rodata
.LC0:
    .string "%d."
.LC1:
    .string "%04d"
    .text
.globl print
    .type   print, @function
print:
.LFB0:
    .cfi_startproc
    pushq   %rbp
    .cfi_def_cfa_offset 16
    movq    %rsp, %rbp
    .cfi_offset 6, -16
    .cfi_def_cfa_register 6
    subq    $32, %rsp
    movq    %rdi, -24(%rbp)
    movl    %esi, -28(%rbp)
    movq    -24(%rbp), %rax
    addq    $2, %rax
    movzwl  (%rax), %eax
    movzwl  %ax, %edx
    movl    $.LC0, %eax
    movl    %edx, %esi
    movq    %rax, %rdi
    movl    $0, %eax
    call    printf
    movl    $2, -4(%rbp)
    jmp .L2
.L3:
    movl    -4(%rbp), %eax
    cltq
    addq    %rax, %rax
    addq    -24(%rbp), %rax
    movzwl  (%rax), %eax
    movzwl  %ax, %edx
    movl    $.LC1, %eax
    movl    %edx, %esi
    movq    %rax, %rdi
    movl    $0, %eax
    call    printf
    addl    $1, -4(%rbp)
.L2:
    movl    -28(%rbp), %eax
    subl    $1, %eax
    cmpl    -4(%rbp), %eax
    jg  .L3
    movl    $10, %edi
    call    putchar
    leave
    ret
    .cfi_endproc
.LFE0:
    .size   print, .-print
.globl main
    .type   main, @function
main:
.LFB1:
    .cfi_startproc
    pushq   %rbp
    .cfi_def_cfa_offset 16
    movq    %rsp, %rbp
    .cfi_offset 6, -16
    .cfi_def_cfa_register 6
    pushq   %rbx
    subq    $56, %rsp
    movl    %edi, -52(%rbp)
    movq    %rsi, -64(%rbp)
    cmpl    $1, -52(%rbp)
    jle .L6
    .cfi_offset 3, -24
    movq    -64(%rbp), %rax
    addq    $8, %rax
    movq    (%rax), %rax
    movq    %rax, %rdi
    call    atoi
    addl    $3, %eax
    leal    3(%rax), %edx
    testl   %eax, %eax
    cmovs   %edx, %eax
    sarl    $2, %eax
    addl    $3, %eax
    jmp .L7
.L6:
    movl    $253, %eax
.L7:
    movl    %eax, -20(%rbp)
    movl    -20(%rbp), %eax
    cltq
    addq    %rax, %rax
    movq    %rax, %rdi
    call    malloc
    movq    %rax, -40(%rbp)
    movl    -20(%rbp), %eax
    cltq
    leaq    (%rax,%rax), %rdx
    movq    -40(%rbp), %rax
    movl    $0, %esi
    movq    %rax, %rdi
    call    memset
    movq    -40(%rbp), %rax
    addq    $2, %rax
    movw    $4, (%rax)
    cvtsi2sd    -20(%rbp), %xmm0
    movsd   .LC2(%rip), %xmm1
    mulsd   %xmm1, %xmm0
    cvttsd2si   %xmm0, %eax
    movl    %eax, -24(%rbp)
    jmp .L8
.L13:
    movl    $0, -32(%rbp)
    movl    -20(%rbp), %eax
    subl    $1, %eax
    movl    %eax, -28(%rbp)
    jmp .L9
.L10:
    movl    -28(%rbp), %eax
    cltq
    addq    %rax, %rax
    addq    -40(%rbp), %rax
    movzwl  (%rax), %eax
    movzwl  %ax, %eax
    imull   -24(%rbp), %eax
    addl    %eax, -32(%rbp)
    movl    -28(%rbp), %eax
    cltq
    addq    %rax, %rax
    movq    %rax, %rbx
    addq    -40(%rbp), %rbx
    movl    -32(%rbp), %ecx
    movl    $1759218605, %edx
    movl    %ecx, %eax
    imull   %edx
    sarl    $12, %edx
    movl    %ecx, %eax
    sarl    $31, %eax
    movl    %edx, %esi
    subl    %eax, %esi
    movl    %esi, %eax
    imull   $10000, %eax, %eax
    movl    %ecx, %edx
    subl    %eax, %edx
    movl    %edx, %eax
    movw    %ax, (%rbx)
    movl    -32(%rbp), %ecx
    movl    $1759218605, %edx
    movl    %ecx, %eax
    imull   %edx
    sarl    $12, %edx
    movl    %ecx, %eax
    sarl    $31, %eax
    movl    %edx, %ecx
    subl    %eax, %ecx
    movl    %ecx, %eax
    movl    %eax, -32(%rbp)
    subl    $1, -28(%rbp)
.L9:
    cmpl    $0, -28(%rbp)
    jns .L10
    movl    $0, -44(%rbp)
    movl    -44(%rbp), %eax
    movl    %eax, -48(%rbp)
    movl    $0, -28(%rbp)
    jmp .L11
.L12:
    movl    -24(%rbp), %eax
    addl    %eax, %eax
    leal    1(%rax), %edx
    movl    -28(%rbp), %eax
    cltq
    addq    %rax, %rax
    addq    -40(%rbp), %rax
    movzwl  (%rax), %eax
    movzwl  %ax, %ecx
    movl    -44(%rbp), %eax
    imull   $10000, %eax, %eax
    leal    (%rcx,%rax), %eax
    movl    %edx, %esi
    movl    %eax, %edi
    call    div
    movq    %rax, -48(%rbp)
    movl    -28(%rbp), %eax
    cltq
    addq    %rax, %rax
    addq    -40(%rbp), %rax
    movl    -48(%rbp), %edx
    movw    %dx, (%rax)
    addl    $1, -28(%rbp)
.L11:
    movl    -28(%rbp), %eax
    cmpl    -20(%rbp), %eax
    jl  .L12
    movq    -40(%rbp), %rax
    addq    $2, %rax
    movq    -40(%rbp), %rdx
    addq    $2, %rdx
    movzwl  (%rdx), %edx
    addl    $2, %edx
    movw    %dx, (%rax)
    subl    $1, -24(%rbp)
.L8:
    cmpl    $0, -24(%rbp)
    jg  .L13
    movl    -20(%rbp), %edx
    movq    -40(%rbp), %rax
    movl    %edx, %esi
    movq    %rax, %rdi
    call    print
    movl    $0, %eax
    addq    $56, %rsp
    popq    %rbx
    leave
    ret
    .cfi_endproc
.LFE1:
    .size   main, .-main
    .section    .rodata
    .align 8
.LC2:
    .long   3161095930
    .long   1076532084