diff --git a/polly/www/example_manual_matmul.html b/polly/www/example_manual_matmul.html index 7e352a26802a..2636e1d60989 100644 --- a/polly/www/example_manual_matmul.html +++ b/polly/www/example_manual_matmul.html @@ -110,7 +110,7 @@ view-scops-only:
 [...]
 Printing analysis 'Polly - Create polyhedral description of Scops' for region:
-'%1 => %17' in function 'init_array':
+'for.cond => for.end19' in function 'init_array':
    Context:
    { [] }
    Statements {
@@ -135,7 +135,7 @@ Printing analysis 'Polly - Create polyhedral description of Scops' for region:
    }
 [...]
 Printing analysis 'Polly - Create polyhedral description of Scops' for region:
-'%1 => %17' in function 'main':
+'for.cond => for.end30' in function 'main':
    Context:
    { [] }
    Statements {
@@ -178,7 +178,7 @@ Printing analysis 'Polly - Create polyhedral description of Scops' for region:
 
  • Show the dependences for the SCoPs

    opt -basicaa -polly-dependences -analyze matmul.preopt.ll
    Printing analysis 'Polly - Calculate dependences for SCoP' for region:
    -'for.cond => for.end28' in function 'init_array':
    +'for.cond => for.end19' in function 'init_array':
        Must dependences:
            {  }
        May dependences:
    @@ -188,7 +188,7 @@ Printing analysis 'Polly - Create polyhedral description of Scops' for region:
        May no source:
            {  }
     Printing analysis 'Polly - Calculate dependences for SCoP' for region:
    -'for.cond => for.end48' in function 'main':
    +'for.cond => for.end30' in function 'main':
        Must dependences:
            {  Stmt_4[i0, i1] -> Stmt_6[i0, i1, 0] :
                   i0 >= 0 and i0 <= 1023 and i1 >= 0 and i1 <= 1023;
    @@ -221,8 +221,8 @@ Printing analysis 'Polly - Calculate dependences for SCoP' for region:
     Polly can export the polyhedral representation in so called jscop files. Jscop
     files contain the polyhedral representation stored in a JSON file.
     
    opt -basicaa -polly-export-jscop matmul.preopt.ll
    -Writing SCoP 'for.cond => for.end28' in function 'init_array' to './init_array___%for.cond---%for.end28.jscop'.
    -Writing SCoP 'for.cond => for.end48' in function 'main' to './main___%for.cond---%for.end48.jscop'.
    +Writing SCoP 'for.cond => for.end19' in function 'init_array' to './init_array___%for.cond---%for.end19.jscop'.
    +Writing SCoP 'for.cond => for.end30' in function 'main' to './main___%for.cond---%for.end30.jscop'.
     
  • Import the changed jscop files and print the updated SCoP structure @@ -268,7 +268,7 @@ opt matmul.preopt.ll -basicaa \

  •  [...]
    -Reading JScop '%1 => %17' in function 'main' from './main___%1---%17.jscop.interchanged'.
    +Reading JScop 'for.cond => for.end30' in function 'main' from './main___%for.cond---%for.end30.jscop.interchanged'.
     [...]
     main():
     for (c2=0;c2<=1535;c2++) {
    @@ -295,7 +295,7 @@ opt matmul.preopt.ll -basicaa \
     
     [...]
    -Reading JScop '%1 => %17' in function 'main' from './main___%1---%17.jscop.interchanged+tiled'.
    +Reading JScop 'for.cond => for.end30' in function 'main' from './main___%for.cond---%for.end30.jscop.interchanged+tiled'.
     [...]
     main():
     for (c2=0;c2<=1535;c2++) {
    @@ -329,7 +329,7 @@ opt matmul.preopt.ll -basicaa \
     
     
     [...]
    -Reading JScop '%1 => %17' in function 'main' from './main___%1---%17.jscop.interchanged+tiled+vector'.
    +Reading JScop 'for.cond => for.end30' in function 'main' from './main___%for.cond---%for.end30.jscop.interchanged+tiled+vector'.
     [...]
     main():
     for (c2=0;c2<=1535;c2++) {
    @@ -369,11 +369,11 @@ opt -basicaa \
         -polly-codegen matmul.preopt.ll \
        | opt -O3 > matmul.polly.interchanged.ll
    -Reading JScop '%1 => %19' in function 'init_array' from
    -    './init_array___%1---%19.jscop.interchanged'.
    +Reading JScop 'for.cond => for.end19' in function 'init_array' from
    +    './init_array___%for.cond---%for.end19.jscop.interchanged'.
     File could not be read: No such file or directory
    -Reading JScop '%1 => %17' in function 'main' from
    -    './main___%1---%17.jscop.interchanged'.
    +Reading JScop 'for.cond => for.end30' in function 'main' from
    +    './main___%for.cond---%for.end30.jscop.interchanged'.
     
     opt -basicaa \
    @@ -381,11 +381,11 @@ opt -basicaa \
         -polly-codegen matmul.preopt.ll \
        | opt -O3 > matmul.polly.interchanged+tiled.ll
    -Reading JScop '%1 => %19' in function 'init_array' from
    -    './init_array___%1---%19.jscop.interchanged+tiled'.
    +Reading JScop 'for.cond => for.end19' in function 'init_array' from
    +    './init_array___%for.cond---%for.end19.jscop.interchanged+tiled'.
     File could not be read: No such file or directory
    -Reading JScop '%1 => %17' in function 'main' from
    -    './main___%1---%17.jscop.interchanged+tiled'.
    +Reading JScop 'for.cond => for.end30' in function 'main' from
    +    './main___%for.cond---%for.end30.jscop.interchanged+tiled'.
     
     opt -basicaa \
    @@ -393,11 +393,11 @@ opt -basicaa \
         -polly-codegen -polly-vectorizer=polly matmul.preopt.ll \
        | opt -O3 > matmul.polly.interchanged+tiled+vector.ll
    -Reading JScop '%1 => %19' in function 'init_array' from
    -    './init_array___%1---%19.jscop.interchanged+tiled+vector'.
    +Reading JScop 'for.cond => for.end19' in function 'init_array' from
    +    './init_array___%for.cond---%for.end19.jscop.interchanged+tiled+vector'.
     File could not be read: No such file or directory
    -Reading JScop '%1 => %17' in function 'main' from
    -    './main___%1---%17.jscop.interchanged+tiled+vector'.
    +Reading JScop 'for.cond => for.end30' in function 'main' from
    +    './main___%for.cond---%for.end30.jscop.interchanged+tiled+vector'.
     
     opt -basicaa \
    @@ -405,11 +405,11 @@ opt -basicaa \
         -polly-codegen -polly-vectorizer=polly -enable-polly-openmp matmul.preopt.ll \
       | opt -O3 > matmul.polly.interchanged+tiled+openmp.ll
    -Reading JScop '%1 => %19' in function 'init_array' from
    -    './init_array___%1---%19.jscop.interchanged+tiled+vector'.
    +Reading JScop 'for.cond => for.end19' in function 'init_array' from
    +    './init_array___%for.cond---%for.end19.jscop.interchanged+tiled+vector'.
     File could not be read: No such file or directory
    -Reading JScop '%1 => %17' in function 'main' from
    -    './main___%1---%17.jscop.interchanged+tiled+vector'.
    +Reading JScop 'for.cond => for.end30' in function 'main' from
    +    './main___%for.cond---%for.end30.jscop.interchanged+tiled+vector'.
     
  • Create the executables

    diff --git a/polly/www/experiments/matmul/init_array___%1---%19.jscop b/polly/www/experiments/matmul/init_array___%1---%19.jscop deleted file mode 100644 index c7f9bb8c87ae..000000000000 --- a/polly/www/experiments/matmul/init_array___%1---%19.jscop +++ /dev/null @@ -1,21 +0,0 @@ -{ - "context" : "{ [] }", - "name" : "%1 => %19", - "statements" : [ - { - "accesses" : [ - { - "kind" : "write", - "relation" : "{ Stmt_5[i0, i1] -> MemRef_A[1536i0 + i1] }" - }, - { - "kind" : "write", - "relation" : "{ Stmt_5[i0, i1] -> MemRef_B[1536i0 + i1] }" - } - ], - "domain" : "{ Stmt_5[i0, i1] : i0 >= 0 and i0 <= 1535 and i1 >= 0 and i1 <= 1535 }", - "name" : "Stmt_5", - "schedule" : "{ Stmt_5[i0, i1] -> scattering[0, i0, 0, i1, 0] }" - } - ] -} diff --git a/polly/www/experiments/matmul/init_array___%for.cond---%for.end19.jscop b/polly/www/experiments/matmul/init_array___%for.cond---%for.end19.jscop new file mode 100644 index 000000000000..dfd10935988a --- /dev/null +++ b/polly/www/experiments/matmul/init_array___%for.cond---%for.end19.jscop @@ -0,0 +1,21 @@ +{ + "context" : "{ : }", + "name" : "for.cond => for.end19", + "statements" : [ + { + "accesses" : [ + { + "kind" : "write", + "relation" : "{ Stmt_for_body3[i0, i1] -> MemRef_A[1536i0 + i1] }" + }, + { + "kind" : "write", + "relation" : "{ Stmt_for_body3[i0, i1] -> MemRef_B[1536i0 + i1] }" + } + ], + "domain" : "{ Stmt_for_body3[i0, i1] : i0 >= 0 and i0 <= 1535 and i1 >= 0 and i1 <= 1535 }", + "name" : "Stmt_for_body3", + "schedule" : "{ Stmt_for_body3[i0, i1] -> scattering[0, i0, 0, i1, 0] }" + } + ] +} diff --git a/polly/www/experiments/matmul/main___%1---%17.jscop b/polly/www/experiments/matmul/main___%1---%17.jscop deleted file mode 100644 index c37839525add..000000000000 --- a/polly/www/experiments/matmul/main___%1---%17.jscop +++ /dev/null @@ -1,40 +0,0 @@ -{ - "context" : "{ : }", - "name" : "%1 => %17", - "statements" : [ - { - "accesses" : [ - { - "kind" : "write", - "relation" : "{ Stmt_4[i0, i1] -> 
MemRef_C[1536i0 + i1] }" - } - ], - "domain" : "{ Stmt_4[i0, i1] : i0 >= 0 and i0 <= 1535 and i1 >= 0 and i1 <= 1535 }", - "name" : "Stmt_4", - "schedule" : "{ Stmt_4[i0, i1] -> scattering[0, i0, 0, i1, 0, 0, 0] }" - }, - { - "accesses" : [ - { - "kind" : "read", - "relation" : "{ Stmt_6[i0, i1, i2] -> MemRef_C[1536i0 + i1] }" - }, - { - "kind" : "read", - "relation" : "{ Stmt_6[i0, i1, i2] -> MemRef_A[1536i0 + i2] }" - }, - { - "kind" : "read", - "relation" : "{ Stmt_6[i0, i1, i2] -> MemRef_B[i1 + 1536i2] }" - }, - { - "kind" : "write", - "relation" : "{ Stmt_6[i0, i1, i2] -> MemRef_C[1536i0 + i1] }" - } - ], - "domain" : "{ Stmt_6[i0, i1, i2] : i0 >= 0 and i0 <= 1535 and i1 >= 0 and i1 <= 1535 and i2 >= 0 and i2 <= 1535 }", - "name" : "Stmt_6", - "schedule" : "{ Stmt_6[i0, i1, i2] -> scattering[0, i0, 0, i1, 1, i2, 0] }" - } - ] -} diff --git a/polly/www/experiments/matmul/main___%for.cond---%for.end30.jscop b/polly/www/experiments/matmul/main___%for.cond---%for.end30.jscop new file mode 100644 index 000000000000..4d6e463a7dbc --- /dev/null +++ b/polly/www/experiments/matmul/main___%for.cond---%for.end30.jscop @@ -0,0 +1,40 @@ +{ + "context" : "{ : }", + "name" : "for.cond => for.end30", + "statements" : [ + { + "accesses" : [ + { + "kind" : "write", + "relation" : "{ Stmt_for_body3[i0, i1] -> MemRef_C[1536i0 + i1] }" + } + ], + "domain" : "{ Stmt_for_body3[i0, i1] : i0 >= 0 and i0 <= 1535 and i1 >= 0 and i1 <= 1535 }", + "name" : "Stmt_for_body3", + "schedule" : "{ Stmt_for_body3[i0, i1] -> scattering[0, i0, 0, i1, 0, 0, 0] }" + }, + { + "accesses" : [ + { + "kind" : "read", + "relation" : "{ Stmt_for_body8[i0, i1, i2] -> MemRef_C[1536i0 + i1] }" + }, + { + "kind" : "read", + "relation" : "{ Stmt_for_body8[i0, i1, i2] -> MemRef_A[1536i0 + i2] }" + }, + { + "kind" : "read", + "relation" : "{ Stmt_for_body8[i0, i1, i2] -> MemRef_B[i1 + 1536i2] }" + }, + { + "kind" : "write", + "relation" : "{ Stmt_for_body8[i0, i1, i2] -> MemRef_C[1536i0 + i1] }" + } + ], + 
"domain" : "{ Stmt_for_body8[i0, i1, i2] : i0 >= 0 and i0 <= 1535 and i1 >= 0 and i1 <= 1535 and i2 >= 0 and i2 <= 1535 }", + "name" : "Stmt_for_body8", + "schedule" : "{ Stmt_for_body8[i0, i1, i2] -> scattering[0, i0, 0, i1, 1, i2, 0] }" + } + ] +} diff --git a/polly/www/experiments/matmul/main___%1---%17.jscop.interchanged b/polly/www/experiments/matmul/main___%for.cond---%for.end30.jscop.interchanged similarity index 100% rename from polly/www/experiments/matmul/main___%1---%17.jscop.interchanged rename to polly/www/experiments/matmul/main___%for.cond---%for.end30.jscop.interchanged diff --git a/polly/www/experiments/matmul/main___%1---%17.jscop.interchanged+tiled b/polly/www/experiments/matmul/main___%for.cond---%for.end30.jscop.interchanged+tiled similarity index 100% rename from polly/www/experiments/matmul/main___%1---%17.jscop.interchanged+tiled rename to polly/www/experiments/matmul/main___%for.cond---%for.end30.jscop.interchanged+tiled diff --git a/polly/www/experiments/matmul/main___%1---%17.jscop.interchanged+tiled+vector b/polly/www/experiments/matmul/main___%for.cond---%for.end30.jscop.interchanged+tiled+vector similarity index 100% rename from polly/www/experiments/matmul/main___%1---%17.jscop.interchanged+tiled+vector rename to polly/www/experiments/matmul/main___%for.cond---%for.end30.jscop.interchanged+tiled+vector diff --git a/polly/www/experiments/matmul/matmul.normalopt.exe b/polly/www/experiments/matmul/matmul.normalopt.exe index 73b94752d8ed..cdb9e67af454 100755 Binary files a/polly/www/experiments/matmul/matmul.normalopt.exe and b/polly/www/experiments/matmul/matmul.normalopt.exe differ diff --git a/polly/www/experiments/matmul/matmul.normalopt.ll b/polly/www/experiments/matmul/matmul.normalopt.ll index 182ed9aa2218..ba792c29f701 100644 Binary files a/polly/www/experiments/matmul/matmul.normalopt.ll and b/polly/www/experiments/matmul/matmul.normalopt.ll differ diff --git a/polly/www/experiments/matmul/matmul.normalopt.s 
b/polly/www/experiments/matmul/matmul.normalopt.s index f10f64411824..079af702a14f 100644 --- a/polly/www/experiments/matmul/matmul.normalopt.s +++ b/polly/www/experiments/matmul/matmul.normalopt.s @@ -2,74 +2,112 @@ .section .rodata.cst8,"aM",@progbits,8 .align 8 .LCPI0_0: - .quad 4602678819172646912 # double 5.000000e-01 + .quad 4602678819172646912 # double 0.5 .text .globl init_array .align 16, 0x90 .type init_array,@function init_array: # @init_array -# BB#0: - xorl %eax, %eax - movsd .LCPI0_0(%rip), %xmm0 - movq %rax, %rcx + .cfi_startproc +# BB#0: # %entry + pushq %rbp +.Ltmp2: + .cfi_def_cfa_offset 16 +.Ltmp3: + .cfi_offset %rbp, -16 + movq %rsp, %rbp +.Ltmp4: + .cfi_def_cfa_register %rbp + xorl %r8d, %r8d + vmovsd .LCPI0_0(%rip), %xmm0 .align 16, 0x90 -.LBB0_1: # %.preheader +.LBB0_1: # %for.cond1.preheader # =>This Loop Header: Depth=1 # Child Loop BB0_2 Depth 2 - movq $-1536, %rdx # imm = 0xFFFFFFFFFFFFFA00 - xorl %esi, %esi + xorl %ecx, %ecx .align 16, 0x90 -.LBB0_2: # Parent Loop BB0_1 Depth=1 +.LBB0_2: # %for.body3 + # Parent Loop BB0_1 Depth=1 # => This Inner Loop Header: Depth=2 - movl %esi, %edi - sarl $31, %edi - shrl $22, %edi - addl %esi, %edi - andl $-1024, %edi # imm = 0xFFFFFFFFFFFFFC00 - negl %edi - leal 1(%rsi,%rdi), %edi - cvtsi2sd %edi, %xmm1 - mulsd %xmm0, %xmm1 - cvtsd2ss %xmm1, %xmm1 - movss %xmm1, A+6144(%rax,%rdx,4) - movss %xmm1, B+6144(%rax,%rdx,4) - addl %ecx, %esi - incq %rdx + movl %ecx, %edx + imull %r8d, %edx + movl %edx, %esi + sarl $31, %esi + shrl $22, %esi + addl %edx, %esi + andl $-1024, %esi # imm = 0xFFFFFFFFFFFFFC00 + negl %esi + movq %r8, %rax + shlq $11, %rax + leal 1(%rdx,%rsi), %edi + leaq (%rax,%rax,2), %rsi + leaq 1(%rcx), %rdx + cmpq $1536, %rdx # imm = 0x600 + vcvtsi2sdl %edi, %xmm0, %xmm1 + vmulsd %xmm0, %xmm1, %xmm1 + vcvtsd2ss %xmm1, %xmm1, %xmm1 + vmovss %xmm1, A(%rsi,%rcx,4) + vmovss %xmm1, B(%rsi,%rcx,4) + movq %rdx, %rcx jne .LBB0_2 -# BB#3: # in Loop: Header=BB0_1 Depth=1 - addq $6144, %rax # imm = 
0x1800 - incq %rcx - cmpq $1536, %rcx # imm = 0x600 +# BB#3: # %for.inc17 + # in Loop: Header=BB0_1 Depth=1 + incq %r8 + cmpq $1536, %r8 # imm = 0x600 jne .LBB0_1 -# BB#4: +# BB#4: # %for.end19 + popq %rbp ret -.Ltmp0: - .size init_array, .Ltmp0-init_array +.Ltmp5: + .size init_array, .Ltmp5-init_array + .cfi_endproc .globl print_array .align 16, 0x90 .type print_array,@function print_array: # @print_array -# BB#0: + .cfi_startproc +# BB#0: # %entry + pushq %rbp +.Ltmp9: + .cfi_def_cfa_offset 16 +.Ltmp10: + .cfi_offset %rbp, -16 + movq %rsp, %rbp +.Ltmp11: + .cfi_def_cfa_register %rbp + pushq %r15 pushq %r14 + pushq %r12 pushq %rbx - pushq %rax - movq $-9437184, %rbx # imm = 0xFFFFFFFFFF700000 +.Ltmp12: + .cfi_offset %rbx, -48 +.Ltmp13: + .cfi_offset %r12, -40 +.Ltmp14: + .cfi_offset %r14, -32 +.Ltmp15: + .cfi_offset %r15, -24 + xorl %r14d, %r14d + movl $C, %r15d .align 16, 0x90 -.LBB1_1: # %.preheader +.LBB1_1: # %for.cond1.preheader # =>This Loop Header: Depth=1 # Child Loop BB1_2 Depth 2 - xorl %r14d, %r14d - movq stdout(%rip), %rdi + movq stdout(%rip), %rax + movq %r15, %r12 + xorl %ebx, %ebx .align 16, 0x90 -.LBB1_2: # Parent Loop BB1_1 Depth=1 +.LBB1_2: # %for.body3 + # Parent Loop BB1_1 Depth=1 # => This Inner Loop Header: Depth=2 - movss C+9437184(%rbx,%r14,4), %xmm0 - cvtss2sd %xmm0, %xmm0 + vmovss (%r12), %xmm0 + vcvtss2sd %xmm0, %xmm0, %xmm0 + movq %rax, %rdi movl $.L.str, %esi movb $1, %al callq fprintf - movslq %r14d, %rax + movslq %ebx, %rax imulq $1717986919, %rax, %rcx # imm = 0x66666667 movq %rcx, %rdx shrq $63, %rdx @@ -79,113 +117,146 @@ print_array: # @print_array subl %ecx, %eax cmpl $79, %eax jne .LBB1_4 -# BB#3: # in Loop: Header=BB1_2 Depth=2 +# BB#3: # %if.then + # in Loop: Header=BB1_2 Depth=2 movq stdout(%rip), %rsi movl $10, %edi callq fputc -.LBB1_4: # in Loop: Header=BB1_2 Depth=2 - incq %r14 - movq stdout(%rip), %rsi - cmpq $1536, %r14 # imm = 0x600 - movq %rsi, %rdi +.LBB1_4: # %for.inc + # in Loop: Header=BB1_2 Depth=2 + addq $4, 
%r12 + incq %rbx + movq stdout(%rip), %rax + cmpq $1536, %rbx # imm = 0x600 jne .LBB1_2 -# BB#5: # in Loop: Header=BB1_1 Depth=1 +# BB#5: # %for.end + # in Loop: Header=BB1_1 Depth=1 movl $10, %edi + movq %rax, %rsi callq fputc - addq $6144, %rbx # imm = 0x1800 + addq $6144, %r15 # imm = 0x1800 + incq %r14 + cmpq $1536, %r14 # imm = 0x600 jne .LBB1_1 -# BB#6: - addq $8, %rsp +# BB#6: # %for.end12 popq %rbx + popq %r12 popq %r14 + popq %r15 + popq %rbp ret -.Ltmp1: - .size print_array, .Ltmp1-print_array +.Ltmp16: + .size print_array, .Ltmp16-print_array + .cfi_endproc .section .rodata.cst8,"aM",@progbits,8 .align 8 .LCPI2_0: - .quad 4602678819172646912 # double 5.000000e-01 + .quad 4602678819172646912 # double 0.5 .text .globl main .align 16, 0x90 .type main,@function main: # @main -# BB#0: - xorl %eax, %eax - movsd .LCPI2_0(%rip), %xmm0 - movq %rax, %rcx + .cfi_startproc +# BB#0: # %entry + pushq %rbp +.Ltmp19: + .cfi_def_cfa_offset 16 +.Ltmp20: + .cfi_offset %rbp, -16 + movq %rsp, %rbp +.Ltmp21: + .cfi_def_cfa_register %rbp + xorl %r8d, %r8d + vmovsd .LCPI2_0(%rip), %xmm0 .align 16, 0x90 -.LBB2_1: # %.preheader.i +.LBB2_1: # %for.cond1.preheader.i # =>This Loop Header: Depth=1 # Child Loop BB2_2 Depth 2 - movq $-1536, %rdx # imm = 0xFFFFFFFFFFFFFA00 - xorl %esi, %esi - .align 16, 0x90 -.LBB2_2: # Parent Loop BB2_1 Depth=1 - # => This Inner Loop Header: Depth=2 - movl %esi, %edi - sarl $31, %edi - shrl $22, %edi - addl %esi, %edi - andl $-1024, %edi # imm = 0xFFFFFFFFFFFFFC00 - negl %edi - leal 1(%rsi,%rdi), %edi - cvtsi2sd %edi, %xmm1 - mulsd %xmm0, %xmm1 - cvtsd2ss %xmm1, %xmm1 - movss %xmm1, A+6144(%rax,%rdx,4) - movss %xmm1, B+6144(%rax,%rdx,4) - addl %ecx, %esi - incq %rdx - jne .LBB2_2 -# BB#3: # in Loop: Header=BB2_1 Depth=1 - addq $6144, %rax # imm = 0x1800 - incq %rcx - xorl %edx, %edx - cmpq $1536, %rcx # imm = 0x600 - jne .LBB2_1 - .align 16, 0x90 -.LBB2_4: # %.preheader - # =>This Loop Header: Depth=1 - # Child Loop BB2_5 Depth 2 - # Child Loop BB2_6 
Depth 3 - xorl %eax, %eax xorl %ecx, %ecx .align 16, 0x90 -.LBB2_5: # Parent Loop BB2_4 Depth=1 - # => This Loop Header: Depth=2 - # Child Loop BB2_6 Depth 3 - movl $0, C(%rcx,%rdx) - leaq B(%rcx), %rsi - pxor %xmm0, %xmm0 - movq %rax, %rdi +.LBB2_2: # %for.body3.i + # Parent Loop BB2_1 Depth=1 + # => This Inner Loop Header: Depth=2 + movl %ecx, %edx + imull %r8d, %edx + movl %edx, %esi + sarl $31, %esi + shrl $22, %esi + addl %edx, %esi + andl $-1024, %esi # imm = 0xFFFFFFFFFFFFFC00 + negl %esi + movq %r8, %rax + shlq $11, %rax + leal 1(%rdx,%rsi), %edi + leaq (%rax,%rax,2), %rsi + leaq 1(%rcx), %rdx + cmpq $1536, %rdx # imm = 0x600 + vcvtsi2sdl %edi, %xmm0, %xmm1 + vmulsd %xmm0, %xmm1, %xmm1 + vcvtsd2ss %xmm1, %xmm1, %xmm1 + vmovss %xmm1, A(%rsi,%rcx,4) + vmovss %xmm1, B(%rsi,%rcx,4) + movq %rdx, %rcx + jne .LBB2_2 +# BB#3: # %for.inc17.i + # in Loop: Header=BB2_1 Depth=1 + incq %r8 + cmpq $1536, %r8 # imm = 0x600 + jne .LBB2_1 +# BB#4: + xorl %r8d, %r8d + movl $A, %r9d .align 16, 0x90 -.LBB2_6: # Parent Loop BB2_4 Depth=1 - # Parent Loop BB2_5 Depth=2 +.LBB2_5: # %for.cond1.preheader + # =>This Loop Header: Depth=1 + # Child Loop BB2_6 Depth 2 + # Child Loop BB2_7 Depth 3 + leaq (%r8,%r8,2), %rdx + shlq $11, %rdx + leaq C(%rdx), %rsi + xorl %edi, %edi + .align 16, 0x90 +.LBB2_6: # %for.body3 + # Parent Loop BB2_5 Depth=1 + # => This Loop Header: Depth=2 + # Child Loop BB2_7 Depth 3 + movl $0, (%rsi) + vxorps %xmm0, %xmm0, %xmm0 + movq $-9437184, %rax # imm = 0xFFFFFFFFFF700000 + movq %r9, %rcx + .align 16, 0x90 +.LBB2_7: # %for.body8 + # Parent Loop BB2_5 Depth=1 + # Parent Loop BB2_6 Depth=2 # => This Inner Loop Header: Depth=3 - movss A(%rdx,%rdi,4), %xmm1 - mulss (%rsi), %xmm1 - addss %xmm1, %xmm0 - addq $6144, %rsi # imm = 0x1800 + vmovss (%rcx), %xmm1 + vmulss B+9437184(%rax,%rdi,4), %xmm1, %xmm1 + vaddss %xmm1, %xmm0, %xmm0 + addq $4, %rcx + addq $6144, %rax # imm = 0x1800 + jne .LBB2_7 +# BB#8: # %for.inc25 + # in Loop: Header=BB2_6 Depth=2 + vmovss 
%xmm0, (%rsi) + leaq C+4(%rdx,%rdi,4), %rsi incq %rdi cmpq $1536, %rdi # imm = 0x600 jne .LBB2_6 -# BB#7: # in Loop: Header=BB2_5 Depth=2 - movss %xmm0, C(%rcx,%rdx) - addq $4, %rcx - cmpq $6144, %rcx # imm = 0x1800 +# BB#9: # %for.inc28 + # in Loop: Header=BB2_5 Depth=1 + addq $6144, %r9 # imm = 0x1800 + incq %r8 + cmpq $1536, %r8 # imm = 0x600 jne .LBB2_5 -# BB#8: # %init_array.exit - # in Loop: Header=BB2_4 Depth=1 - addq $6144, %rdx # imm = 0x1800 - cmpq $9437184, %rdx # imm = 0x900000 - jne .LBB2_4 -# BB#9: +# BB#10: # %for.end30 xorl %eax, %eax + popq %rbp ret -.Ltmp2: - .size main, .Ltmp2-main +.Ltmp22: + .size main, .Ltmp22-main + .cfi_endproc .type A,@object # @A .comm A,9437184,16 diff --git a/polly/www/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.exe b/polly/www/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.exe index 7a2e6de61388..feb24366d730 100755 Binary files a/polly/www/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.exe and b/polly/www/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.exe differ diff --git a/polly/www/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.ll b/polly/www/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.ll index 710f706f68e8..593794ef380b 100644 Binary files a/polly/www/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.ll and b/polly/www/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.ll differ diff --git a/polly/www/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.s b/polly/www/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.s index 04dc0656c068..ca87de11704e 100644 --- a/polly/www/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.s +++ b/polly/www/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.s @@ -1,55 +1,166 @@ .file "matmul.polly.interchanged+tiled+vector+openmp.ll" + .section .rodata.cst8,"aM",@progbits,8 + .align 8 
+.LCPI0_0: + .quad 4602678819172646912 # double 0.5 .text .globl init_array .align 16, 0x90 .type init_array,@function init_array: # @init_array -# BB#0: # %pollyBB + .cfi_startproc +# BB#0: # %entry + pushq %rbp +.Ltmp3: + .cfi_def_cfa_offset 16 +.Ltmp4: + .cfi_offset %rbp, -16 + movq %rsp, %rbp +.Ltmp5: + .cfi_def_cfa_register %rbp + pushq %r15 + pushq %r14 pushq %rbx - subq $16, %rsp - movq $A, (%rsp) - movq $B, 8(%rsp) + subq $24, %rsp +.Ltmp6: + .cfi_offset %rbx, -40 +.Ltmp7: + .cfi_offset %r14, -32 +.Ltmp8: + .cfi_offset %r15, -24 + leaq -32(%rbp), %rsi movl $init_array.omp_subfn, %edi - leaq (%rsp), %rbx xorl %edx, %edx xorl %ecx, %ecx movl $1536, %r8d # imm = 0x600 movl $1, %r9d - movq %rbx, %rsi callq GOMP_parallel_loop_runtime_start - movq %rbx, %rdi - callq init_array.omp_subfn + leaq -40(%rbp), %rdi + leaq -48(%rbp), %rsi + callq GOMP_loop_runtime_next + testb %al, %al + je .LBB0_4 +# BB#1: + leaq -40(%rbp), %r14 + leaq -48(%rbp), %r15 + vmovsd .LCPI0_0(%rip), %xmm1 + .align 16, 0x90 +.LBB0_2: # %omp.loadIVBounds.i + # =>This Loop Header: Depth=1 + # Child Loop BB0_8 Depth 2 + # Child Loop BB0_5 Depth 3 + movq -48(%rbp), %r8 + leaq -1(%r8), %rcx + movq -40(%rbp), %rax + cmpq %rcx, %rax + jg .LBB0_3 +# BB#7: # %polly.loop_preheader4.preheader.i + # in Loop: Header=BB0_2 Depth=1 + addq $-2, %r8 + .align 16, 0x90 +.LBB0_8: # %polly.loop_preheader4.i + # Parent Loop BB0_2 Depth=1 + # => This Loop Header: Depth=2 + # Child Loop BB0_5 Depth 3 + xorl %edx, %edx + .align 16, 0x90 +.LBB0_5: # %polly.loop_header3.i + # Parent Loop BB0_2 Depth=1 + # Parent Loop BB0_8 Depth=2 + # => This Inner Loop Header: Depth=3 + movl %edx, %esi + imull %eax, %esi + movl %esi, %edi + sarl $31, %edi + shrl $22, %edi + addl %esi, %edi + andl $-1024, %edi # imm = 0xFFFFFFFFFFFFFC00 + negl %edi + movq %rax, %rcx + shlq $11, %rcx + leal 1(%rsi,%rdi), %ebx + leaq (%rcx,%rcx,2), %rdi + leaq 1(%rdx), %rsi + cmpq $1536, %rsi # imm = 0x600 + vcvtsi2sdl %ebx, %xmm0, %xmm0 + vmulsd %xmm1, 
%xmm0, %xmm0 + vcvtsd2ss %xmm0, %xmm0, %xmm0 + vmovss %xmm0, A(%rdi,%rdx,4) + vmovss %xmm0, B(%rdi,%rdx,4) + movq %rsi, %rdx + jne .LBB0_5 +# BB#6: # %polly.loop_exit5.i + # in Loop: Header=BB0_8 Depth=2 + cmpq %r8, %rax + leaq 1(%rax), %rax + jle .LBB0_8 +.LBB0_3: # %omp.checkNext.backedge.i + # in Loop: Header=BB0_2 Depth=1 + movq %r14, %rdi + movq %r15, %rsi + callq GOMP_loop_runtime_next + vmovsd .LCPI0_0(%rip), %xmm1 + testb %al, %al + jne .LBB0_2 +.LBB0_4: # %init_array.omp_subfn.exit + callq GOMP_loop_end_nowait callq GOMP_parallel_end - addq $16, %rsp + addq $24, %rsp popq %rbx + popq %r14 + popq %r15 + popq %rbp ret -.Ltmp0: - .size init_array, .Ltmp0-init_array +.Ltmp9: + .size init_array, .Ltmp9-init_array + .cfi_endproc .globl print_array .align 16, 0x90 .type print_array,@function print_array: # @print_array -# BB#0: + .cfi_startproc +# BB#0: # %entry + pushq %rbp +.Ltmp13: + .cfi_def_cfa_offset 16 +.Ltmp14: + .cfi_offset %rbp, -16 + movq %rsp, %rbp +.Ltmp15: + .cfi_def_cfa_register %rbp + pushq %r15 pushq %r14 + pushq %r12 pushq %rbx - pushq %rax - movq $-9437184, %rbx # imm = 0xFFFFFFFFFF700000 +.Ltmp16: + .cfi_offset %rbx, -48 +.Ltmp17: + .cfi_offset %r12, -40 +.Ltmp18: + .cfi_offset %r14, -32 +.Ltmp19: + .cfi_offset %r15, -24 + xorl %r14d, %r14d + movl $C, %r15d .align 16, 0x90 -.LBB1_1: # %.preheader +.LBB1_1: # %for.cond1.preheader # =>This Loop Header: Depth=1 # Child Loop BB1_2 Depth 2 - xorl %r14d, %r14d - movq stdout(%rip), %rdi + movq stdout(%rip), %rax + movq %r15, %r12 + xorl %ebx, %ebx .align 16, 0x90 -.LBB1_2: # Parent Loop BB1_1 Depth=1 +.LBB1_2: # %for.body3 + # Parent Loop BB1_1 Depth=1 # => This Inner Loop Header: Depth=2 - movss C+9437184(%rbx,%r14,4), %xmm0 - cvtss2sd %xmm0, %xmm0 + vmovss (%r12), %xmm0 + vcvtss2sd %xmm0, %xmm0, %xmm0 + movq %rax, %rdi movl $.L.str, %esi movb $1, %al callq fprintf - movslq %r14d, %rax + movslq %ebx, %rax imulq $1717986919, %rax, %rcx # imm = 0x66666667 movq %rcx, %rdx shrq $63, %rdx @@ -59,127 
+170,135 @@ print_array: # @print_array subl %ecx, %eax cmpl $79, %eax jne .LBB1_4 -# BB#3: # in Loop: Header=BB1_2 Depth=2 +# BB#3: # %if.then + # in Loop: Header=BB1_2 Depth=2 movq stdout(%rip), %rsi movl $10, %edi callq fputc -.LBB1_4: # in Loop: Header=BB1_2 Depth=2 - incq %r14 - movq stdout(%rip), %rsi - cmpq $1536, %r14 # imm = 0x600 - movq %rsi, %rdi +.LBB1_4: # %for.inc + # in Loop: Header=BB1_2 Depth=2 + addq $4, %r12 + incq %rbx + movq stdout(%rip), %rax + cmpq $1536, %rbx # imm = 0x600 jne .LBB1_2 -# BB#5: # in Loop: Header=BB1_1 Depth=1 +# BB#5: # %for.end + # in Loop: Header=BB1_1 Depth=1 movl $10, %edi + movq %rax, %rsi callq fputc - addq $6144, %rbx # imm = 0x1800 + addq $6144, %r15 # imm = 0x1800 + incq %r14 + cmpq $1536, %r14 # imm = 0x600 jne .LBB1_1 -# BB#6: - addq $8, %rsp +# BB#6: # %for.end12 popq %rbx + popq %r12 popq %r14 + popq %r15 + popq %rbp ret -.Ltmp1: - .size print_array, .Ltmp1-print_array +.Ltmp20: + .size print_array, .Ltmp20-print_array + .cfi_endproc .globl main .align 16, 0x90 .type main,@function main: # @main -# BB#0: # %pollyBB + .cfi_startproc +# BB#0: # %entry pushq %rbp +.Ltmp24: + .cfi_def_cfa_offset 16 +.Ltmp25: + .cfi_offset %rbp, -16 movq %rsp, %rbp +.Ltmp26: + .cfi_def_cfa_register %rbp pushq %r15 pushq %r14 pushq %r13 pushq %r12 pushq %rbx - subq $56, %rsp - movq $A, -72(%rbp) - movq $B, -64(%rbp) - movl $init_array.omp_subfn, %edi - leaq -72(%rbp), %rbx - movq %rbx, %rsi - xorl %edx, %edx - xorl %ecx, %ecx - movl $1536, %r8d # imm = 0x600 - movl $1, %r9d - callq GOMP_parallel_loop_runtime_start - movq %rbx, %rdi - callq init_array.omp_subfn - callq GOMP_parallel_end + subq $24, %rsp +.Ltmp27: + .cfi_offset %rbx, -56 +.Ltmp28: + .cfi_offset %r12, -48 +.Ltmp29: + .cfi_offset %r13, -40 +.Ltmp30: + .cfi_offset %r14, -32 +.Ltmp31: + .cfi_offset %r15, -24 + callq init_array + leaq -48(%rbp), %rsi movl $main.omp_subfn, %edi - leaq -96(%rbp), %rsi - movq $C, -96(%rbp) - movq $A, -88(%rbp) - movq $B, -80(%rbp) xorl %edx, 
%edx xorl %ecx, %ecx movl $1536, %r8d # imm = 0x600 movl $1, %r9d callq GOMP_parallel_loop_runtime_start - leaq -48(%rbp), %rdi - leaq -56(%rbp), %rsi + leaq -56(%rbp), %rdi + leaq -64(%rbp), %rsi callq GOMP_loop_runtime_next - testb $1, %al - je .LBB2_6 + testb %al, %al + je .LBB2_4 # BB#1: - leaq -48(%rbp), %rbx leaq -56(%rbp), %r14 + leaq -64(%rbp), %r15 .align 16, 0x90 -.LBB2_3: # %omp.loadIVBounds.i +.LBB2_2: # %omp.loadIVBounds.i # =>This Loop Header: Depth=1 - # Child Loop BB2_5 Depth 2 - movq -56(%rbp), %r15 - decq %r15 - movq -48(%rbp), %r12 - cmpq %r15, %r12 - jg .LBB2_2 -# BB#4: # %polly.loop_header2.preheader.lr.ph.i - # in Loop: Header=BB2_3 Depth=1 - leaq (%r12,%r12,2), %rax - shlq $11, %rax - leaq C(%rax), %r13 + # Child Loop BB2_6 Depth 2 + movq -64(%rbp), %r12 + leaq -1(%r12), %rcx + movq -56(%rbp), %rax + cmpq %rcx, %rax + jg .LBB2_3 +# BB#5: # %polly.loop_preheader4.preheader.i + # in Loop: Header=BB2_2 Depth=1 + addq $-2, %r12 + leaq (%rax,%rax,2), %rcx + leaq -1(%rax), %r13 + shlq $11, %rcx + leaq C(%rcx), %rbx .align 16, 0x90 -.LBB2_5: # %polly.loop_header2.preheader.i - # Parent Loop BB2_3 Depth=1 +.LBB2_6: # %polly.loop_preheader4.i + # Parent Loop BB2_2 Depth=1 # => This Inner Loop Header: Depth=2 - movq %r13, %rdi + movq %rbx, %rdi xorl %esi, %esi movl $6144, %edx # imm = 0x1800 callq memset - addq $6144, %r13 # imm = 0x1800 - incq %r12 - cmpq %r15, %r12 - jle .LBB2_5 -.LBB2_2: # %omp.checkNext.loopexit.i - # in Loop: Header=BB2_3 Depth=1 - movq %rbx, %rdi - movq %r14, %rsi + addq $6144, %rbx # imm = 0x1800 + incq %r13 + cmpq %r12, %r13 + jle .LBB2_6 +.LBB2_3: # %omp.checkNext.backedge.i + # in Loop: Header=BB2_2 Depth=1 + movq %r14, %rdi + movq %r15, %rsi callq GOMP_loop_runtime_next - testb $1, %al - jne .LBB2_3 -.LBB2_6: # %main.omp_subfn.exit + testb %al, %al + jne .LBB2_2 +.LBB2_4: # %main.omp_subfn.exit callq GOMP_loop_end_nowait callq GOMP_parallel_end - movq %rsp, %rax - leaq -32(%rax), %rbx + leaq -48(%rbp), %rbx movl 
$main.omp_subfn1, %edi + movq %rbx, %rsi + xorl %edx, %edx xorl %ecx, %ecx movl $1536, %r8d # imm = 0x600 movl $64, %r9d - movq %rbx, %rsp - movq $C, -32(%rax) - movq $A, -24(%rax) - movq $B, -16(%rax) - movq %rbx, %rsi - xorl %edx, %edx callq GOMP_parallel_loop_runtime_start movq %rbx, %rdi callq main.omp_subfn1 callq GOMP_parallel_end xorl %eax, %eax - leaq -40(%rbp), %rsp + addq $24, %rsp popq %rbx popq %r12 popq %r13 @@ -187,418 +306,192 @@ main: # @main popq %r15 popq %rbp ret -.Ltmp2: - .size main, .Ltmp2-main +.Ltmp32: + .size main, .Ltmp32-main + .cfi_endproc .section .rodata.cst8,"aM",@progbits,8 .align 8 .LCPI3_0: - .quad 4602678819172646912 # double 5.000000e-01 + .quad 4602678819172646912 # double 0.5 .text .align 16, 0x90 .type init_array.omp_subfn,@function init_array.omp_subfn: # @init_array.omp_subfn -.Leh_func_begin3: -.Ltmp6: .cfi_startproc # BB#0: # %omp.setup - pushq %r14 -.Ltmp7: + pushq %rbp +.Ltmp36: .cfi_def_cfa_offset 16 +.Ltmp37: + .cfi_offset %rbp, -16 + movq %rsp, %rbp +.Ltmp38: + .cfi_def_cfa_register %rbp + pushq %r15 + pushq %r14 pushq %rbx -.Ltmp8: - .cfi_def_cfa_offset 24 subq $24, %rsp -.Ltmp9: - .cfi_def_cfa_offset 48 -.Ltmp10: - .cfi_offset 3, -24 -.Ltmp11: - .cfi_offset 14, -16 - leaq 16(%rsp), %rdi - leaq 8(%rsp), %rsi +.Ltmp39: + .cfi_offset %rbx, -40 +.Ltmp40: + .cfi_offset %r14, -32 +.Ltmp41: + .cfi_offset %r15, -24 + leaq -32(%rbp), %rdi + leaq -40(%rbp), %rsi callq GOMP_loop_runtime_next - testb $1, %al - je .LBB3_2 + testb %al, %al + je .LBB3_4 # BB#1: - leaq 16(%rsp), %rbx - leaq 8(%rsp), %r14 - jmp .LBB3_4 -.LBB3_2: # %omp.exit + leaq -32(%rbp), %r14 + leaq -40(%rbp), %r15 + vmovsd .LCPI3_0(%rip), %xmm1 + .align 16, 0x90 +.LBB3_2: # %omp.loadIVBounds + # =>This Loop Header: Depth=1 + # Child Loop BB3_8 Depth 2 + # Child Loop BB3_5 Depth 3 + movq -40(%rbp), %r8 + leaq -1(%r8), %rcx + movq -32(%rbp), %rax + cmpq %rcx, %rax + jg .LBB3_3 +# BB#7: # %polly.loop_preheader4.preheader + # in Loop: Header=BB3_2 Depth=1 + addq 
$-2, %r8 + .align 16, 0x90 +.LBB3_8: # %polly.loop_preheader4 + # Parent Loop BB3_2 Depth=1 + # => This Loop Header: Depth=2 + # Child Loop BB3_5 Depth 3 + xorl %edx, %edx + .align 16, 0x90 +.LBB3_5: # %polly.loop_header3 + # Parent Loop BB3_2 Depth=1 + # Parent Loop BB3_8 Depth=2 + # => This Inner Loop Header: Depth=3 + movl %edx, %esi + imull %eax, %esi + movl %esi, %edi + sarl $31, %edi + shrl $22, %edi + addl %esi, %edi + andl $-1024, %edi # imm = 0xFFFFFFFFFFFFFC00 + negl %edi + movq %rax, %rcx + shlq $11, %rcx + leal 1(%rsi,%rdi), %ebx + leaq (%rcx,%rcx,2), %rdi + leaq 1(%rdx), %rsi + cmpq $1536, %rsi # imm = 0x600 + vcvtsi2sdl %ebx, %xmm0, %xmm0 + vmulsd %xmm1, %xmm0, %xmm0 + vcvtsd2ss %xmm0, %xmm0, %xmm0 + vmovss %xmm0, A(%rdi,%rdx,4) + vmovss %xmm0, B(%rdi,%rdx,4) + movq %rsi, %rdx + jne .LBB3_5 +# BB#6: # %polly.loop_exit5 + # in Loop: Header=BB3_8 Depth=2 + cmpq %r8, %rax + leaq 1(%rax), %rax + jle .LBB3_8 +.LBB3_3: # %omp.checkNext.backedge + # in Loop: Header=BB3_2 Depth=1 + movq %r14, %rdi + movq %r15, %rsi + callq GOMP_loop_runtime_next + vmovsd .LCPI3_0(%rip), %xmm1 + testb %al, %al + jne .LBB3_2 +.LBB3_4: # %omp.exit callq GOMP_loop_end_nowait addq $24, %rsp popq %rbx popq %r14 + popq %r15 + popq %rbp ret - .align 16, 0x90 -.LBB3_3: # %omp.checkNext.loopexit - # in Loop: Header=BB3_4 Depth=1 - movq %rbx, %rdi - movq %r14, %rsi - callq GOMP_loop_runtime_next - testb $1, %al - je .LBB3_2 -.LBB3_4: # %omp.loadIVBounds - # =>This Loop Header: Depth=1 - # Child Loop BB3_7 Depth 2 - # Child Loop BB3_8 Depth 3 - movq 8(%rsp), %rax - decq %rax - movq 16(%rsp), %rcx - cmpq %rax, %rcx - jg .LBB3_3 -# BB#5: # %polly.loop_header2.preheader.lr.ph - # in Loop: Header=BB3_4 Depth=1 - movq %rcx, %rdx - shlq $11, %rdx - leaq (%rdx,%rdx,2), %rdx - jmp .LBB3_7 - .align 16, 0x90 -.LBB3_6: # %polly.loop_header.loopexit - # in Loop: Header=BB3_7 Depth=2 - addq $6144, %rdx # imm = 0x1800 - incq %rcx - cmpq %rax, %rcx - jg .LBB3_3 -.LBB3_7: # %polly.loop_header2.preheader 
- # Parent Loop BB3_4 Depth=1 - # => This Loop Header: Depth=2 - # Child Loop BB3_8 Depth 3 - movq $-1536, %rsi # imm = 0xFFFFFFFFFFFFFA00 - xorl %edi, %edi - .align 16, 0x90 -.LBB3_8: # %polly.loop_body3 - # Parent Loop BB3_4 Depth=1 - # Parent Loop BB3_7 Depth=2 - # => This Inner Loop Header: Depth=3 - movl %edi, %r8d - sarl $31, %r8d - shrl $22, %r8d - addl %edi, %r8d - andl $-1024, %r8d # imm = 0xFFFFFFFFFFFFFC00 - negl %r8d - leal 1(%rdi,%r8), %r8d - cvtsi2sd %r8d, %xmm0 - mulsd .LCPI3_0(%rip), %xmm0 - cvtsd2ss %xmm0, %xmm0 - movss %xmm0, A+6144(%rdx,%rsi,4) - movss %xmm0, B+6144(%rdx,%rsi,4) - addl %ecx, %edi - incq %rsi - jne .LBB3_8 - jmp .LBB3_6 -.Ltmp12: - .size init_array.omp_subfn, .Ltmp12-init_array.omp_subfn -.Ltmp13: +.Ltmp42: + .size init_array.omp_subfn, .Ltmp42-init_array.omp_subfn .cfi_endproc -.Leh_func_end3: .align 16, 0x90 .type main.omp_subfn,@function main.omp_subfn: # @main.omp_subfn -.Leh_func_begin4: -.Ltmp20: - .cfi_startproc -# BB#0: # %omp.setup - pushq %r15 -.Ltmp21: - .cfi_def_cfa_offset 16 - pushq %r14 -.Ltmp22: - .cfi_def_cfa_offset 24 - pushq %r13 -.Ltmp23: - .cfi_def_cfa_offset 32 - pushq %r12 -.Ltmp24: - .cfi_def_cfa_offset 40 - pushq %rbx -.Ltmp25: - .cfi_def_cfa_offset 48 - subq $16, %rsp -.Ltmp26: - .cfi_def_cfa_offset 64 -.Ltmp27: - .cfi_offset 3, -48 -.Ltmp28: - .cfi_offset 12, -40 -.Ltmp29: - .cfi_offset 13, -32 -.Ltmp30: - .cfi_offset 14, -24 -.Ltmp31: - .cfi_offset 15, -16 - leaq 8(%rsp), %rdi - leaq (%rsp), %rsi - callq GOMP_loop_runtime_next - testb $1, %al - je .LBB4_2 -# BB#1: - leaq 8(%rsp), %rbx - leaq (%rsp), %r14 - jmp .LBB4_4 -.LBB4_2: # %omp.exit - callq GOMP_loop_end_nowait - addq $16, %rsp - popq %rbx - popq %r12 - popq %r13 - popq %r14 - popq %r15 - ret - .align 16, 0x90 -.LBB4_3: # %omp.checkNext.loopexit - # in Loop: Header=BB4_4 Depth=1 - movq %rbx, %rdi - movq %r14, %rsi - callq GOMP_loop_runtime_next - testb $1, %al - je .LBB4_2 -.LBB4_4: # %omp.loadIVBounds - # =>This Loop Header: Depth=1 - # Child 
Loop BB4_6 Depth 2 - movq (%rsp), %r15 - decq %r15 - movq 8(%rsp), %r12 - cmpq %r15, %r12 - jg .LBB4_3 -# BB#5: # %polly.loop_header2.preheader.lr.ph - # in Loop: Header=BB4_4 Depth=1 - leaq (%r12,%r12,2), %rax - shlq $11, %rax - leaq C(%rax), %r13 - .align 16, 0x90 -.LBB4_6: # %polly.loop_header2.preheader - # Parent Loop BB4_4 Depth=1 - # => This Inner Loop Header: Depth=2 - movq %r13, %rdi - xorl %esi, %esi - movl $6144, %edx # imm = 0x1800 - callq memset - addq $6144, %r13 # imm = 0x1800 - incq %r12 - cmpq %r15, %r12 - jle .LBB4_6 - jmp .LBB4_3 -.Ltmp32: - .size main.omp_subfn, .Ltmp32-main.omp_subfn -.Ltmp33: - .cfi_endproc -.Leh_func_end4: - - .align 16, 0x90 - .type main.omp_subfn1,@function -main.omp_subfn1: # @main.omp_subfn1 -.Leh_func_begin5: -.Ltmp41: .cfi_startproc # BB#0: # %omp.setup pushq %rbp -.Ltmp42: - .cfi_def_cfa_offset 16 - pushq %r15 -.Ltmp43: - .cfi_def_cfa_offset 24 - pushq %r14 -.Ltmp44: - .cfi_def_cfa_offset 32 - pushq %r13 -.Ltmp45: - .cfi_def_cfa_offset 40 - pushq %r12 .Ltmp46: - .cfi_def_cfa_offset 48 - pushq %rbx + .cfi_def_cfa_offset 16 .Ltmp47: - .cfi_def_cfa_offset 56 - subq $40, %rsp + .cfi_offset %rbp, -16 + movq %rsp, %rbp .Ltmp48: - .cfi_def_cfa_offset 96 + .cfi_def_cfa_register %rbp + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbx + subq $24, %rsp .Ltmp49: - .cfi_offset 3, -56 + .cfi_offset %rbx, -56 .Ltmp50: - .cfi_offset 12, -48 + .cfi_offset %r12, -48 .Ltmp51: - .cfi_offset 13, -40 + .cfi_offset %r13, -40 .Ltmp52: - .cfi_offset 14, -32 + .cfi_offset %r14, -32 .Ltmp53: - .cfi_offset 15, -24 -.Ltmp54: - .cfi_offset 6, -16 - leaq 32(%rsp), %rdi - leaq 24(%rsp), %rsi - jmp .LBB5_1 - .align 16, 0x90 -.LBB5_4: # %omp.loadIVBounds - # in Loop: Header=BB5_1 Depth=1 - movq 24(%rsp), %rax - decq %rax - movq %rax, (%rsp) # 8-byte Spill - movq 32(%rsp), %rcx - cmpq %rax, %rcx - jg .LBB5_3 -# BB#5: # %polly.loop_header2.preheader.lr.ph - # in Loop: Header=BB5_1 Depth=1 - leaq (%rcx,%rcx,2), %rax - movq %rcx, %rdx - shlq 
$9, %rdx - leaq (%rdx,%rdx,2), %rdx - movq %rdx, 16(%rsp) # 8-byte Spill - shlq $11, %rax - leaq A(%rax), %rax - movq %rax, 8(%rsp) # 8-byte Spill - jmp .LBB5_7 - .align 16, 0x90 -.LBB5_6: # %polly.loop_header.loopexit - # in Loop: Header=BB5_7 Depth=2 - addq $98304, 16(%rsp) # 8-byte Folded Spill - # imm = 0x18000 - addq $393216, 8(%rsp) # 8-byte Folded Spill - # imm = 0x60000 - addq $64, %rcx - cmpq (%rsp), %rcx # 8-byte Folded Reload - jg .LBB5_3 -.LBB5_7: # %polly.loop_header2.preheader - # Parent Loop BB5_1 Depth=1 - # => This Loop Header: Depth=2 - # Child Loop BB5_9 Depth 3 - # Child Loop BB5_11 Depth 4 - # Child Loop BB5_14 Depth 5 - # Child Loop BB5_18 Depth 6 - # Child Loop BB5_19 Depth 7 - leaq 63(%rcx), %rax - xorl %edx, %edx - jmp .LBB5_9 - .align 16, 0x90 -.LBB5_8: # %polly.loop_header2.loopexit - # in Loop: Header=BB5_9 Depth=3 - addq $64, %rdx - cmpq $1536, %rdx # imm = 0x600 - je .LBB5_6 -.LBB5_9: # %polly.loop_header7.preheader - # Parent Loop BB5_1 Depth=1 - # Parent Loop BB5_7 Depth=2 - # => This Loop Header: Depth=3 - # Child Loop BB5_11 Depth 4 - # Child Loop BB5_14 Depth 5 - # Child Loop BB5_18 Depth 6 - # Child Loop BB5_19 Depth 7 - movq 16(%rsp), %rsi # 8-byte Reload - leaq (%rsi,%rdx), %rsi - leaq 63(%rdx), %rdi - xorl %r8d, %r8d - movq 8(%rsp), %r9 # 8-byte Reload - movq %rdx, %r10 - jmp .LBB5_11 - .align 16, 0x90 -.LBB5_10: # %polly.loop_header7.loopexit - # in Loop: Header=BB5_11 Depth=4 - addq $256, %r9 # imm = 0x100 - addq $98304, %r10 # imm = 0x18000 - addq $64, %r8 - cmpq $1536, %r8 # imm = 0x600 - je .LBB5_8 -.LBB5_11: # %polly.loop_body8 - # Parent Loop BB5_1 Depth=1 - # Parent Loop BB5_7 Depth=2 - # Parent Loop BB5_9 Depth=3 - # => This Loop Header: Depth=4 - # Child Loop BB5_14 Depth 5 - # Child Loop BB5_18 Depth 6 - # Child Loop BB5_19 Depth 7 - movabsq $9223372036854775744, %r11 # imm = 0x7FFFFFFFFFFFFFC0 - cmpq %r11, %rcx - jg .LBB5_10 -# BB#12: # %polly.loop_body13.lr.ph - # in Loop: Header=BB5_11 Depth=4 - leaq 63(%r8), 
%r11 - movq %rcx, %rbx - movq %rsi, %r14 - movq %r9, %r15 - jmp .LBB5_14 - .align 16, 0x90 -.LBB5_13: # %polly.loop_header12.loopexit - # in Loop: Header=BB5_14 Depth=5 - addq $1536, %r14 # imm = 0x600 - addq $6144, %r15 # imm = 0x1800 - incq %rbx - cmpq %rax, %rbx - jg .LBB5_10 -.LBB5_14: # %polly.loop_body13 - # Parent Loop BB5_1 Depth=1 - # Parent Loop BB5_7 Depth=2 - # Parent Loop BB5_9 Depth=3 - # Parent Loop BB5_11 Depth=4 - # => This Loop Header: Depth=5 - # Child Loop BB5_18 Depth 6 - # Child Loop BB5_19 Depth 7 - cmpq %r11, %r8 - jg .LBB5_13 -# BB#15: # %polly.loop_body13 - # in Loop: Header=BB5_14 Depth=5 - cmpq %rdi, %rdx - jg .LBB5_13 -# BB#16: # %polly.loop_body23.lr.ph.preheader - # in Loop: Header=BB5_14 Depth=5 - xorl %r12d, %r12d - movq %r10, %r13 - jmp .LBB5_18 - .align 16, 0x90 -.LBB5_17: # %polly.loop_header17.loopexit - # in Loop: Header=BB5_18 Depth=6 - addq $1536, %r13 # imm = 0x600 - incq %r12 - cmpq $64, %r12 - je .LBB5_13 -.LBB5_18: # %polly.loop_body23.lr.ph - # Parent Loop BB5_1 Depth=1 - # Parent Loop BB5_7 Depth=2 - # Parent Loop BB5_9 Depth=3 - # Parent Loop BB5_11 Depth=4 - # Parent Loop BB5_14 Depth=5 - # => This Loop Header: Depth=6 - # Child Loop BB5_19 Depth 7 - movss (%r15,%r12,4), %xmm0 - pshufd $0, %xmm0, %xmm0 # xmm0 = xmm0[0,0,0,0] - xorl %ebp, %ebp - .align 16, 0x90 -.LBB5_19: # %polly.loop_body23 - # Parent Loop BB5_1 Depth=1 - # Parent Loop BB5_7 Depth=2 - # Parent Loop BB5_9 Depth=3 - # Parent Loop BB5_11 Depth=4 - # Parent Loop BB5_14 Depth=5 - # Parent Loop BB5_18 Depth=6 - # => This Inner Loop Header: Depth=7 - movaps B(%rbp,%r13,4), %xmm1 - mulps %xmm0, %xmm1 - addps C(%rbp,%r14,4), %xmm1 - movaps %xmm1, C(%rbp,%r14,4) - addq $16, %rbp - cmpq $256, %rbp # imm = 0x100 - jne .LBB5_19 - jmp .LBB5_17 -.LBB5_3: # %omp.checkNext.loopexit - # in Loop: Header=BB5_1 Depth=1 - leaq 32(%rsp), %rax - movq %rax, %rdi - leaq 24(%rsp), %rax - movq %rax, %rsi -.LBB5_1: # %omp.setup - # =>This Loop Header: Depth=1 - # Child Loop 
BB5_7 Depth 2 - # Child Loop BB5_9 Depth 3 - # Child Loop BB5_11 Depth 4 - # Child Loop BB5_14 Depth 5 - # Child Loop BB5_18 Depth 6 - # Child Loop BB5_19 Depth 7 + .cfi_offset %r15, -24 + leaq -48(%rbp), %rdi + leaq -56(%rbp), %rsi callq GOMP_loop_runtime_next - testb $1, %al - jne .LBB5_4 -# BB#2: # %omp.exit + testb %al, %al + je .LBB4_4 +# BB#1: + leaq -48(%rbp), %r14 + leaq -56(%rbp), %r15 + .align 16, 0x90 +.LBB4_2: # %omp.loadIVBounds + # =>This Loop Header: Depth=1 + # Child Loop BB4_6 Depth 2 + movq -56(%rbp), %r12 + leaq -1(%r12), %rcx + movq -48(%rbp), %rax + cmpq %rcx, %rax + jg .LBB4_3 +# BB#5: # %polly.loop_preheader4.preheader + # in Loop: Header=BB4_2 Depth=1 + addq $-2, %r12 + leaq (%rax,%rax,2), %rcx + leaq -1(%rax), %r13 + shlq $11, %rcx + leaq C(%rcx), %rbx + .align 16, 0x90 +.LBB4_6: # %polly.loop_preheader4 + # Parent Loop BB4_2 Depth=1 + # => This Inner Loop Header: Depth=2 + movq %rbx, %rdi + xorl %esi, %esi + movl $6144, %edx # imm = 0x1800 + callq memset + addq $6144, %rbx # imm = 0x1800 + incq %r13 + cmpq %r12, %r13 + jle .LBB4_6 +.LBB4_3: # %omp.checkNext.backedge + # in Loop: Header=BB4_2 Depth=1 + movq %r14, %rdi + movq %r15, %rsi + callq GOMP_loop_runtime_next + testb %al, %al + jne .LBB4_2 +.LBB4_4: # %omp.exit callq GOMP_loop_end_nowait - addq $40, %rsp + addq $24, %rsp popq %rbx popq %r12 popq %r13 @@ -606,11 +499,244 @@ main.omp_subfn1: # @main.omp_subfn1 popq %r15 popq %rbp ret -.Ltmp55: - .size main.omp_subfn1, .Ltmp55-main.omp_subfn1 -.Ltmp56: +.Ltmp54: + .size main.omp_subfn, .Ltmp54-main.omp_subfn + .cfi_endproc + + .align 16, 0x90 + .type main.omp_subfn1,@function +main.omp_subfn1: # @main.omp_subfn1 + .cfi_startproc +# BB#0: # %omp.setup + pushq %rbp +.Ltmp58: + .cfi_def_cfa_offset 16 +.Ltmp59: + .cfi_offset %rbp, -16 + movq %rsp, %rbp +.Ltmp60: + .cfi_def_cfa_register %rbp + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbx + subq $72, %rsp +.Ltmp61: + .cfi_offset %rbx, -56 +.Ltmp62: + .cfi_offset %r12, -48 
+.Ltmp63: + .cfi_offset %r13, -40 +.Ltmp64: + .cfi_offset %r14, -32 +.Ltmp65: + .cfi_offset %r15, -24 + jmp .LBB5_1 + .align 16, 0x90 +.LBB5_2: # %omp.loadIVBounds + # in Loop: Header=BB5_1 Depth=1 + movq -56(%rbp), %rax + movq %rax, -112(%rbp) # 8-byte Spill + leaq -1(%rax), %rax + movq -48(%rbp), %rcx + cmpq %rax, %rcx + jg .LBB5_1 +# BB#3: # %polly.loop_preheader4.preheader + # in Loop: Header=BB5_1 Depth=1 + leaq -1(%rcx), %rax + movq %rax, -88(%rbp) # 8-byte Spill + addq $-65, -112(%rbp) # 8-byte Folded Spill + movq %rcx, %rax + shlq $9, %rax + leaq (%rax,%rax,2), %rax + leaq C+16(,%rax,4), %rax + movq %rax, -104(%rbp) # 8-byte Spill + .align 16, 0x90 +.LBB5_7: # %polly.loop_preheader4 + # Parent Loop BB5_1 Depth=1 + # => This Loop Header: Depth=2 + # Child Loop BB5_8 Depth 3 + # Child Loop BB5_9 Depth 4 + # Child Loop BB5_12 Depth 5 + # Child Loop BB5_17 Depth 6 + # Child Loop BB5_18 Depth 7 + # Child Loop BB5_14 Depth 5 + movq %rcx, -72(%rbp) # 8-byte Spill + leaq 62(%rcx), %rdi + xorl %edx, %edx + .align 16, 0x90 +.LBB5_8: # %polly.loop_preheader11 + # Parent Loop BB5_1 Depth=1 + # Parent Loop BB5_7 Depth=2 + # => This Loop Header: Depth=3 + # Child Loop BB5_9 Depth 4 + # Child Loop BB5_12 Depth 5 + # Child Loop BB5_17 Depth 6 + # Child Loop BB5_18 Depth 7 + # Child Loop BB5_14 Depth 5 + movq %rdx, -96(%rbp) # 8-byte Spill + leaq -4(%rdx), %rcx + movq %rdx, %rax + decq %rax + cmovsq %rcx, %rax + movq %rax, %r14 + sarq $63, %r14 + shrq $62, %r14 + addq %rax, %r14 + andq $-4, %r14 + movq %rdx, %rax + orq $63, %rax + leaq -4(%rax), %rdx + movq -104(%rbp), %rcx # 8-byte Reload + leaq (%rcx,%r14,4), %rcx + movq %rcx, -80(%rbp) # 8-byte Spill + leaq B+16(,%r14,4), %rbx + leaq 4(%r14), %rcx + movq %rcx, -64(%rbp) # 8-byte Spill + xorl %r11d, %r11d + .align 16, 0x90 +.LBB5_9: # %polly.loop_header10 + # Parent Loop BB5_1 Depth=1 + # Parent Loop BB5_7 Depth=2 + # Parent Loop BB5_8 Depth=3 + # => This Loop Header: Depth=4 + # Child Loop BB5_12 Depth 5 + # Child Loop 
BB5_17 Depth 6 + # Child Loop BB5_18 Depth 7 + # Child Loop BB5_14 Depth 5 + movabsq $9223372036854775744, %rcx # imm = 0x7FFFFFFFFFFFFFC0 + cmpq %rcx, -72(%rbp) # 8-byte Folded Reload + jg .LBB5_15 +# BB#10: # %polly.loop_header17.preheader + # in Loop: Header=BB5_9 Depth=4 + movq %r11, %r15 + orq $63, %r15 + cmpq %r15, %r11 + movq -88(%rbp), %rcx # 8-byte Reload + jle .LBB5_11 + .align 16, 0x90 +.LBB5_14: # %polly.loop_exit28.us + # Parent Loop BB5_1 Depth=1 + # Parent Loop BB5_7 Depth=2 + # Parent Loop BB5_8 Depth=3 + # Parent Loop BB5_9 Depth=4 + # => This Inner Loop Header: Depth=5 + incq %rcx + cmpq %rdi, %rcx + jle .LBB5_14 + jmp .LBB5_15 + .align 16, 0x90 +.LBB5_11: # in Loop: Header=BB5_9 Depth=4 + decq %r15 + movq -80(%rbp), %r13 # 8-byte Reload + movq -72(%rbp), %rcx # 8-byte Reload + .align 16, 0x90 +.LBB5_12: # %polly.loop_header26.preheader + # Parent Loop BB5_1 Depth=1 + # Parent Loop BB5_7 Depth=2 + # Parent Loop BB5_8 Depth=3 + # Parent Loop BB5_9 Depth=4 + # => This Loop Header: Depth=5 + # Child Loop BB5_17 Depth 6 + # Child Loop BB5_18 Depth 7 + cmpq %rax, -64(%rbp) # 8-byte Folded Reload + movq %rbx, %r12 + movq %r11, %r8 + jg .LBB5_13 + .align 16, 0x90 +.LBB5_17: # %polly.loop_header35.preheader + # Parent Loop BB5_1 Depth=1 + # Parent Loop BB5_7 Depth=2 + # Parent Loop BB5_8 Depth=3 + # Parent Loop BB5_9 Depth=4 + # Parent Loop BB5_12 Depth=5 + # => This Loop Header: Depth=6 + # Child Loop BB5_18 Depth 7 + leaq (%rcx,%rcx,2), %rsi + shlq $11, %rsi + vbroadcastss A(%rsi,%r8,4), %xmm0 + movq %r13, %r9 + movq %r12, %r10 + movq %r14, %rsi +.LBB5_18: # %polly.loop_header35 + # Parent Loop BB5_1 Depth=1 + # Parent Loop BB5_7 Depth=2 + # Parent Loop BB5_8 Depth=3 + # Parent Loop BB5_9 Depth=4 + # Parent Loop BB5_12 Depth=5 + # Parent Loop BB5_17 Depth=6 + # => This Inner Loop Header: Depth=7 + vmulps (%r10), %xmm0, %xmm1 + vaddps (%r9), %xmm1, %xmm1 + vmovaps %xmm1, (%r9) + addq $16, %r9 + addq $16, %r10 + addq $4, %rsi + cmpq %rdx, %rsi + jle 
.LBB5_18 +# BB#16: # %polly.loop_exit37 + # in Loop: Header=BB5_17 Depth=6 + addq $6144, %r12 # imm = 0x1800 + cmpq %r15, %r8 + leaq 1(%r8), %r8 + jle .LBB5_17 + .align 16, 0x90 +.LBB5_13: # %polly.loop_exit28 + # in Loop: Header=BB5_12 Depth=5 + addq $6144, %r13 # imm = 0x1800 + cmpq %rdi, %rcx + leaq 1(%rcx), %rcx + jle .LBB5_12 + .align 16, 0x90 +.LBB5_15: # %polly.loop_exit19 + # in Loop: Header=BB5_9 Depth=4 + addq $393216, %rbx # imm = 0x60000 + cmpq $1472, %r11 # imm = 0x5C0 + leaq 64(%r11), %r11 + jl .LBB5_9 +# BB#5: # %polly.loop_exit12 + # in Loop: Header=BB5_8 Depth=3 + movq -96(%rbp), %rdx # 8-byte Reload + cmpq $1472, %rdx # imm = 0x5C0 + leaq 64(%rdx), %rdx + jl .LBB5_8 +# BB#6: # %polly.loop_exit5 + # in Loop: Header=BB5_7 Depth=2 + addq $64, -88(%rbp) # 8-byte Folded Spill + addq $393216, -104(%rbp) # 8-byte Folded Spill + # imm = 0x60000 + movq -72(%rbp), %rcx # 8-byte Reload + cmpq -112(%rbp), %rcx # 8-byte Folded Reload + leaq 64(%rcx), %rcx + jle .LBB5_7 +.LBB5_1: # %omp.setup + # =>This Loop Header: Depth=1 + # Child Loop BB5_7 Depth 2 + # Child Loop BB5_8 Depth 3 + # Child Loop BB5_9 Depth 4 + # Child Loop BB5_12 Depth 5 + # Child Loop BB5_17 Depth 6 + # Child Loop BB5_18 Depth 7 + # Child Loop BB5_14 Depth 5 + leaq -48(%rbp), %rdi + leaq -56(%rbp), %rsi + callq GOMP_loop_runtime_next + testb %al, %al + jne .LBB5_2 +# BB#4: # %omp.exit + callq GOMP_loop_end_nowait + addq $72, %rsp + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + ret +.Ltmp66: + .size main.omp_subfn1, .Ltmp66-main.omp_subfn1 .cfi_endproc -.Leh_func_end5: .type A,@object # @A .comm A,9437184,16 diff --git a/polly/www/experiments/matmul/matmul.polly.interchanged+tiled+vector.exe b/polly/www/experiments/matmul/matmul.polly.interchanged+tiled+vector.exe index fac17e216859..36b788ea9ac3 100755 Binary files a/polly/www/experiments/matmul/matmul.polly.interchanged+tiled+vector.exe and b/polly/www/experiments/matmul/matmul.polly.interchanged+tiled+vector.exe 
differ diff --git a/polly/www/experiments/matmul/matmul.polly.interchanged+tiled+vector.ll b/polly/www/experiments/matmul/matmul.polly.interchanged+tiled+vector.ll index 7217bc92c804..9d1f9ad098f9 100644 Binary files a/polly/www/experiments/matmul/matmul.polly.interchanged+tiled+vector.ll and b/polly/www/experiments/matmul/matmul.polly.interchanged+tiled+vector.ll differ diff --git a/polly/www/experiments/matmul/matmul.polly.interchanged+tiled+vector.s b/polly/www/experiments/matmul/matmul.polly.interchanged+tiled+vector.s index a1d6f0bf9b04..485d230bc398 100644 --- a/polly/www/experiments/matmul/matmul.polly.interchanged+tiled+vector.s +++ b/polly/www/experiments/matmul/matmul.polly.interchanged+tiled+vector.s @@ -2,76 +2,112 @@ .section .rodata.cst8,"aM",@progbits,8 .align 8 .LCPI0_0: - .quad 4602678819172646912 # double 5.000000e-01 + .quad 4602678819172646912 # double 0.5 .text .globl init_array .align 16, 0x90 .type init_array,@function init_array: # @init_array -# BB#0: # %pollyBB - xorl %eax, %eax - movsd .LCPI0_0(%rip), %xmm0 - movq %rax, %rcx + .cfi_startproc +# BB#0: # %entry + pushq %rbp +.Ltmp2: + .cfi_def_cfa_offset 16 +.Ltmp3: + .cfi_offset %rbp, -16 + movq %rsp, %rbp +.Ltmp4: + .cfi_def_cfa_register %rbp + xorl %r8d, %r8d + vmovsd .LCPI0_0(%rip), %xmm0 .align 16, 0x90 -.LBB0_2: # %polly.loop_header1.preheader +.LBB0_1: # %polly.loop_preheader3 # =>This Loop Header: Depth=1 - # Child Loop BB0_3 Depth 2 - movq $-1536, %rdx # imm = 0xFFFFFFFFFFFFFA00 - xorl %esi, %esi + # Child Loop BB0_2 Depth 2 + xorl %ecx, %ecx .align 16, 0x90 -.LBB0_3: # %polly.loop_body2 - # Parent Loop BB0_2 Depth=1 +.LBB0_2: # %polly.loop_header2 + # Parent Loop BB0_1 Depth=1 # => This Inner Loop Header: Depth=2 - movl %esi, %edi - sarl $31, %edi - shrl $22, %edi - addl %esi, %edi - andl $-1024, %edi # imm = 0xFFFFFFFFFFFFFC00 - negl %edi - leal 1(%rsi,%rdi), %edi - cvtsi2sd %edi, %xmm1 - mulsd %xmm0, %xmm1 - cvtsd2ss %xmm1, %xmm1 - movss %xmm1, A+6144(%rax,%rdx,4) - movss %xmm1, 
B+6144(%rax,%rdx,4) - addl %ecx, %esi - incq %rdx - jne .LBB0_3 -# BB#1: # %polly.loop_header.loopexit - # in Loop: Header=BB0_2 Depth=1 - addq $6144, %rax # imm = 0x1800 - incq %rcx - cmpq $1536, %rcx # imm = 0x600 + movl %ecx, %edx + imull %r8d, %edx + movl %edx, %esi + sarl $31, %esi + shrl $22, %esi + addl %edx, %esi + andl $-1024, %esi # imm = 0xFFFFFFFFFFFFFC00 + negl %esi + movq %r8, %rax + shlq $11, %rax + leal 1(%rdx,%rsi), %edi + leaq (%rax,%rax,2), %rsi + leaq 1(%rcx), %rdx + cmpq $1536, %rdx # imm = 0x600 + vcvtsi2sdl %edi, %xmm0, %xmm1 + vmulsd %xmm0, %xmm1, %xmm1 + vcvtsd2ss %xmm1, %xmm1, %xmm1 + vmovss %xmm1, A(%rsi,%rcx,4) + vmovss %xmm1, B(%rsi,%rcx,4) + movq %rdx, %rcx jne .LBB0_2 -# BB#4: # %polly.after_loop +# BB#3: # %polly.loop_exit4 + # in Loop: Header=BB0_1 Depth=1 + incq %r8 + cmpq $1536, %r8 # imm = 0x600 + jne .LBB0_1 +# BB#4: # %polly.loop_exit + popq %rbp ret -.Ltmp0: - .size init_array, .Ltmp0-init_array +.Ltmp5: + .size init_array, .Ltmp5-init_array + .cfi_endproc .globl print_array .align 16, 0x90 .type print_array,@function print_array: # @print_array -# BB#0: + .cfi_startproc +# BB#0: # %entry + pushq %rbp +.Ltmp9: + .cfi_def_cfa_offset 16 +.Ltmp10: + .cfi_offset %rbp, -16 + movq %rsp, %rbp +.Ltmp11: + .cfi_def_cfa_register %rbp + pushq %r15 pushq %r14 + pushq %r12 pushq %rbx - pushq %rax - movq $-9437184, %rbx # imm = 0xFFFFFFFFFF700000 +.Ltmp12: + .cfi_offset %rbx, -48 +.Ltmp13: + .cfi_offset %r12, -40 +.Ltmp14: + .cfi_offset %r14, -32 +.Ltmp15: + .cfi_offset %r15, -24 + xorl %r14d, %r14d + movl $C, %r15d .align 16, 0x90 -.LBB1_1: # %.preheader +.LBB1_1: # %for.cond1.preheader # =>This Loop Header: Depth=1 # Child Loop BB1_2 Depth 2 - xorl %r14d, %r14d - movq stdout(%rip), %rdi + movq stdout(%rip), %rax + movq %r15, %r12 + xorl %ebx, %ebx .align 16, 0x90 -.LBB1_2: # Parent Loop BB1_1 Depth=1 +.LBB1_2: # %for.body3 + # Parent Loop BB1_1 Depth=1 # => This Inner Loop Header: Depth=2 - movss C+9437184(%rbx,%r14,4), %xmm0 - cvtss2sd 
%xmm0, %xmm0 + vmovss (%r12), %xmm0 + vcvtss2sd %xmm0, %xmm0, %xmm0 + movq %rax, %rdi movl $.L.str, %esi movb $1, %al callq fprintf - movslq %r14d, %rax + movslq %ebx, %rax imulq $1717986919, %rax, %rcx # imm = 0x66666667 movq %rcx, %rdx shrq $63, %rdx @@ -81,217 +117,258 @@ print_array: # @print_array subl %ecx, %eax cmpl $79, %eax jne .LBB1_4 -# BB#3: # in Loop: Header=BB1_2 Depth=2 +# BB#3: # %if.then + # in Loop: Header=BB1_2 Depth=2 movq stdout(%rip), %rsi movl $10, %edi callq fputc -.LBB1_4: # in Loop: Header=BB1_2 Depth=2 - incq %r14 - movq stdout(%rip), %rsi - cmpq $1536, %r14 # imm = 0x600 - movq %rsi, %rdi +.LBB1_4: # %for.inc + # in Loop: Header=BB1_2 Depth=2 + addq $4, %r12 + incq %rbx + movq stdout(%rip), %rax + cmpq $1536, %rbx # imm = 0x600 jne .LBB1_2 -# BB#5: # in Loop: Header=BB1_1 Depth=1 +# BB#5: # %for.end + # in Loop: Header=BB1_1 Depth=1 movl $10, %edi + movq %rax, %rsi callq fputc - addq $6144, %rbx # imm = 0x1800 + addq $6144, %r15 # imm = 0x1800 + incq %r14 + cmpq $1536, %r14 # imm = 0x600 jne .LBB1_1 -# BB#6: - addq $8, %rsp +# BB#6: # %for.end12 popq %rbx + popq %r12 popq %r14 + popq %r15 + popq %rbp ret -.Ltmp1: - .size print_array, .Ltmp1-print_array +.Ltmp16: + .size print_array, .Ltmp16-print_array + .cfi_endproc .section .rodata.cst8,"aM",@progbits,8 .align 8 .LCPI2_0: - .quad 4602678819172646912 # double 5.000000e-01 + .quad 4602678819172646912 # double 0.5 .text .globl main .align 16, 0x90 .type main,@function main: # @main -# BB#0: # %pollyBB + .cfi_startproc +# BB#0: # %entry pushq %rbp +.Ltmp20: + .cfi_def_cfa_offset 16 +.Ltmp21: + .cfi_offset %rbp, -16 + movq %rsp, %rbp +.Ltmp22: + .cfi_def_cfa_register %rbp pushq %r15 pushq %r14 pushq %r13 pushq %r12 pushq %rbx - subq $24, %rsp - xorl %eax, %eax - movsd .LCPI2_0(%rip), %xmm0 - movq %rax, %rcx + subq $56, %rsp +.Ltmp23: + .cfi_offset %rbx, -56 +.Ltmp24: + .cfi_offset %r12, -48 +.Ltmp25: + .cfi_offset %r13, -40 +.Ltmp26: + .cfi_offset %r14, -32 +.Ltmp27: + .cfi_offset %r15, -24 
+ xorl %ebx, %ebx + vmovsd .LCPI2_0(%rip), %xmm0 .align 16, 0x90 -.LBB2_1: # %polly.loop_header1.preheader.i +.LBB2_1: # %polly.loop_preheader3.i # =>This Loop Header: Depth=1 # Child Loop BB2_2 Depth 2 - movq $-1536, %rdx # imm = 0xFFFFFFFFFFFFFA00 - xorl %esi, %esi + xorl %ecx, %ecx .align 16, 0x90 -.LBB2_2: # %polly.loop_body2.i +.LBB2_2: # %polly.loop_header2.i # Parent Loop BB2_1 Depth=1 # => This Inner Loop Header: Depth=2 - movl %esi, %edi - sarl $31, %edi - shrl $22, %edi - addl %esi, %edi - andl $-1024, %edi # imm = 0xFFFFFFFFFFFFFC00 - negl %edi - leal 1(%rsi,%rdi), %edi - cvtsi2sd %edi, %xmm1 - mulsd %xmm0, %xmm1 - cvtsd2ss %xmm1, %xmm1 - movss %xmm1, A+6144(%rax,%rdx,4) - movss %xmm1, B+6144(%rax,%rdx,4) - addl %ecx, %esi - incq %rdx + movl %ecx, %edx + imull %ebx, %edx + movl %edx, %esi + sarl $31, %esi + shrl $22, %esi + addl %edx, %esi + andl $-1024, %esi # imm = 0xFFFFFFFFFFFFFC00 + negl %esi + movq %rbx, %rax + shlq $11, %rax + leal 1(%rdx,%rsi), %edi + leaq (%rax,%rax,2), %rsi + leaq 1(%rcx), %rdx + cmpq $1536, %rdx # imm = 0x600 + vcvtsi2sdl %edi, %xmm0, %xmm1 + vmulsd %xmm0, %xmm1, %xmm1 + vcvtsd2ss %xmm1, %xmm1, %xmm1 + vmovss %xmm1, A(%rsi,%rcx,4) + vmovss %xmm1, B(%rsi,%rcx,4) + movq %rdx, %rcx jne .LBB2_2 -# BB#3: # %polly.loop_header.loopexit.i +# BB#3: # %polly.loop_exit4.i # in Loop: Header=BB2_1 Depth=1 - addq $6144, %rax # imm = 0x1800 - incq %rcx - cmpq $1536, %rcx # imm = 0x600 + incq %rbx + cmpq $1536, %rbx # imm = 0x600 jne .LBB2_1 -# BB#4: # %polly.loop_header.preheader +# BB#4: # %polly.loop_preheader3.preheader movl $C, %edi xorl %esi, %esi movl $9437184, %edx # imm = 0x900000 callq memset - xorl %eax, %eax - movq %rax, 16(%rsp) # 8-byte Spill - movq %rax, (%rsp) # 8-byte Spill - jmp .LBB2_6 + xorl %esi, %esi + movl $C+16, %eax + movq %rax, -88(%rbp) # 8-byte Spill .align 16, 0x90 -.LBB2_5: # %polly.loop_header7.loopexit - # in Loop: Header=BB2_6 Depth=1 - addq $393216, (%rsp) # 8-byte Folded Spill - # imm = 0x60000 - movq 
16(%rsp), %rax # 8-byte Reload - addq $64, %rax - movq %rax, 16(%rsp) # 8-byte Spill - cmpq $1536, %rax # imm = 0x600 - je .LBB2_7 -.LBB2_6: # %polly.loop_header12.preheader +.LBB2_5: # %polly.loop_preheader17 # =>This Loop Header: Depth=1 - # Child Loop BB2_9 Depth 2 - # Child Loop BB2_11 Depth 3 - # Child Loop BB2_14 Depth 4 - # Child Loop BB2_18 Depth 5 - # Child Loop BB2_19 Depth 6 - movq 16(%rsp), %rax # 8-byte Reload - leaq 63(%rax), %rax - movq (%rsp), %rcx # 8-byte Reload - leaq A(%rcx), %rdx - movq %rdx, 8(%rsp) # 8-byte Spill + # Child Loop BB2_15 Depth 2 + # Child Loop BB2_8 Depth 3 + # Child Loop BB2_11 Depth 4 + # Child Loop BB2_17 Depth 5 + # Child Loop BB2_18 Depth 6 + movq %rsi, -56(%rbp) # 8-byte Spill + movq %rsi, %rax + orq $63, %rax + movq %rax, -72(%rbp) # 8-byte Spill + leaq -1(%rax), %rax + movq %rax, -48(%rbp) # 8-byte Spill xorl %edx, %edx - jmp .LBB2_9 .align 16, 0x90 -.LBB2_8: # %polly.loop_header12.loopexit - # in Loop: Header=BB2_9 Depth=2 - addq $256, %rcx # imm = 0x100 - addq $64, %rdx - cmpq $1536, %rdx # imm = 0x600 - je .LBB2_5 -.LBB2_9: # %polly.loop_header17.preheader - # Parent Loop BB2_6 Depth=1 +.LBB2_15: # %polly.loop_preheader24 + # Parent Loop BB2_5 Depth=1 # => This Loop Header: Depth=2 - # Child Loop BB2_11 Depth 3 - # Child Loop BB2_14 Depth 4 - # Child Loop BB2_18 Depth 5 - # Child Loop BB2_19 Depth 6 - leaq 63(%rdx), %rsi - xorl %edi, %edi - movq 8(%rsp), %r8 # 8-byte Reload - movq %rdx, %r9 - jmp .LBB2_11 + # Child Loop BB2_8 Depth 3 + # Child Loop BB2_11 Depth 4 + # Child Loop BB2_17 Depth 5 + # Child Loop BB2_18 Depth 6 + movq %rdx, -80(%rbp) # 8-byte Spill + leaq -4(%rdx), %rcx + movq %rdx, %rax + decq %rax + cmovsq %rcx, %rax + movq %rax, %r15 + sarq $63, %r15 + shrq $62, %r15 + addq %rax, %r15 + andq $-4, %r15 + movq %rdx, %r13 + orq $63, %r13 + leaq -4(%r13), %rdx + xorl %r10d, %r10d + movq -88(%rbp), %rax # 8-byte Reload + leaq (%rax,%r15,4), %rax + movq %rax, -64(%rbp) # 8-byte Spill + leaq B+16(,%r15,4), %rbx 
+ leaq 4(%r15), %r12 .align 16, 0x90 -.LBB2_10: # %polly.loop_header17.loopexit - # in Loop: Header=BB2_11 Depth=3 - addq $256, %r8 # imm = 0x100 - addq $98304, %r9 # imm = 0x18000 - addq $64, %rdi - cmpq $1536, %rdi # imm = 0x600 - je .LBB2_8 -.LBB2_11: # %polly.loop_body18 - # Parent Loop BB2_6 Depth=1 - # Parent Loop BB2_9 Depth=2 +.LBB2_8: # %polly.loop_header23 + # Parent Loop BB2_5 Depth=1 + # Parent Loop BB2_15 Depth=2 # => This Loop Header: Depth=3 - # Child Loop BB2_14 Depth 4 - # Child Loop BB2_18 Depth 5 - # Child Loop BB2_19 Depth 6 - cmpq %rax, 16(%rsp) # 8-byte Folded Reload - jg .LBB2_10 -# BB#12: # %polly.loop_body23.lr.ph - # in Loop: Header=BB2_11 Depth=3 - leaq 63(%rdi), %r10 - xorl %r11d, %r11d - jmp .LBB2_14 + # Child Loop BB2_11 Depth 4 + # Child Loop BB2_17 Depth 5 + # Child Loop BB2_18 Depth 6 + cmpq -72(%rbp), %rsi # 8-byte Folded Reload + jg .LBB2_13 +# BB#9: # %polly.loop_header30.preheader + # in Loop: Header=BB2_8 Depth=3 + movq %r10, %rax + orq $63, %rax + cmpq %rax, %r10 + jg .LBB2_13 +# BB#10: # in Loop: Header=BB2_8 Depth=3 + decq %rax + movq -64(%rbp), %r14 # 8-byte Reload + movq -56(%rbp), %r11 # 8-byte Reload .align 16, 0x90 -.LBB2_13: # %polly.loop_header22.loopexit - # in Loop: Header=BB2_14 Depth=4 - addq $6144, %r11 # imm = 0x1800 - cmpq $393216, %r11 # imm = 0x60000 - je .LBB2_10 -.LBB2_14: # %polly.loop_body23 - # Parent Loop BB2_6 Depth=1 - # Parent Loop BB2_9 Depth=2 - # Parent Loop BB2_11 Depth=3 +.LBB2_11: # %polly.loop_header37.preheader + # Parent Loop BB2_5 Depth=1 + # Parent Loop BB2_15 Depth=2 + # Parent Loop BB2_8 Depth=3 # => This Loop Header: Depth=4 - # Child Loop BB2_18 Depth 5 - # Child Loop BB2_19 Depth 6 - cmpq %r10, %rdi - jg .LBB2_13 -# BB#15: # %polly.loop_body23 - # in Loop: Header=BB2_14 Depth=4 - cmpq %rsi, %rdx - jg .LBB2_13 -# BB#16: # %polly.loop_body33.lr.ph.preheader - # in Loop: Header=BB2_14 Depth=4 - leaq (%r8,%r11), %rbx - xorl %r14d, %r14d - movq %r9, %r15 - movq %r14, %r12 - jmp .LBB2_18 + 
# Child Loop BB2_17 Depth 5 + # Child Loop BB2_18 Depth 6 + cmpq %r13, %r12 + movq %rbx, %r8 + movq %r10, %rsi + jg .LBB2_12 .align 16, 0x90 -.LBB2_17: # %polly.loop_header27.loopexit - # in Loop: Header=BB2_18 Depth=5 - addq $1536, %r15 # imm = 0x600 - incq %r12 - cmpq $64, %r12 - je .LBB2_13 -.LBB2_18: # %polly.loop_body33.lr.ph - # Parent Loop BB2_6 Depth=1 - # Parent Loop BB2_9 Depth=2 - # Parent Loop BB2_11 Depth=3 - # Parent Loop BB2_14 Depth=4 +.LBB2_17: # %polly.loop_header46.preheader + # Parent Loop BB2_5 Depth=1 + # Parent Loop BB2_15 Depth=2 + # Parent Loop BB2_8 Depth=3 + # Parent Loop BB2_11 Depth=4 # => This Loop Header: Depth=5 - # Child Loop BB2_19 Depth 6 - movss (%rbx,%r12,4), %xmm0 - pshufd $0, %xmm0, %xmm0 # xmm0 = xmm0[0,0,0,0] - movq %r14, %r13 - .align 16, 0x90 -.LBB2_19: # %polly.loop_body33 - # Parent Loop BB2_6 Depth=1 - # Parent Loop BB2_9 Depth=2 - # Parent Loop BB2_11 Depth=3 - # Parent Loop BB2_14 Depth=4 - # Parent Loop BB2_18 Depth=5 + # Child Loop BB2_18 Depth 6 + leaq (%r11,%r11,2), %rcx + shlq $11, %rcx + vbroadcastss A(%rcx,%rsi,4), %xmm0 + movq %r14, %rdi + movq %r8, %r9 + movq %r15, %rcx +.LBB2_18: # %polly.loop_header46 + # Parent Loop BB2_5 Depth=1 + # Parent Loop BB2_15 Depth=2 + # Parent Loop BB2_8 Depth=3 + # Parent Loop BB2_11 Depth=4 + # Parent Loop BB2_17 Depth=5 # => This Inner Loop Header: Depth=6 - movaps B(%r13,%r15,4), %xmm1 - mulps %xmm0, %xmm1 - leaq (%r11,%r13), %rbp - addps C(%rcx,%rbp), %xmm1 - movaps %xmm1, C(%rcx,%rbp) - addq $16, %r13 - cmpq $256, %r13 # imm = 0x100 - jne .LBB2_19 - jmp .LBB2_17 -.LBB2_7: # %polly.after_loop9 + vmulps (%r9), %xmm0, %xmm1 + vaddps (%rdi), %xmm1, %xmm1 + vmovaps %xmm1, (%rdi) + addq $16, %rdi + addq $16, %r9 + addq $4, %rcx + cmpq %rdx, %rcx + jle .LBB2_18 +# BB#16: # %polly.loop_exit48 + # in Loop: Header=BB2_17 Depth=5 + addq $6144, %r8 # imm = 0x1800 + cmpq %rax, %rsi + leaq 1(%rsi), %rsi + jle .LBB2_17 + .align 16, 0x90 +.LBB2_12: # %polly.loop_exit39 + # in Loop: 
Header=BB2_11 Depth=4 + addq $6144, %r14 # imm = 0x1800 + cmpq -48(%rbp), %r11 # 8-byte Folded Reload + leaq 1(%r11), %r11 + jle .LBB2_11 + .align 16, 0x90 +.LBB2_13: # %polly.loop_exit32 + # in Loop: Header=BB2_8 Depth=3 + addq $393216, %rbx # imm = 0x60000 + cmpq $1472, %r10 # imm = 0x5C0 + leaq 64(%r10), %r10 + movq -56(%rbp), %rsi # 8-byte Reload + jl .LBB2_8 +# BB#14: # %polly.loop_exit25 + # in Loop: Header=BB2_15 Depth=2 + movq -80(%rbp), %rdx # 8-byte Reload + cmpq $1472, %rdx # imm = 0x5C0 + leaq 64(%rdx), %rdx + jl .LBB2_15 +# BB#6: # %polly.loop_exit18 + # in Loop: Header=BB2_5 Depth=1 + addq $393216, -88(%rbp) # 8-byte Folded Spill + # imm = 0x60000 + cmpq $1472, %rsi # imm = 0x5C0 + leaq 64(%rsi), %rsi + jl .LBB2_5 +# BB#7: # %polly.loop_exit11 xorl %eax, %eax - addq $24, %rsp + addq $56, %rsp popq %rbx popq %r12 popq %r13 @@ -299,8 +376,9 @@ main: # @main popq %r15 popq %rbp ret -.Ltmp2: - .size main, .Ltmp2-main +.Ltmp28: + .size main, .Ltmp28-main + .cfi_endproc .type A,@object # @A .comm A,9437184,16 diff --git a/polly/www/experiments/matmul/matmul.polly.interchanged+tiled.exe b/polly/www/experiments/matmul/matmul.polly.interchanged+tiled.exe index 4334522f4587..fbd8b128fd88 100755 Binary files a/polly/www/experiments/matmul/matmul.polly.interchanged+tiled.exe and b/polly/www/experiments/matmul/matmul.polly.interchanged+tiled.exe differ diff --git a/polly/www/experiments/matmul/matmul.polly.interchanged+tiled.ll b/polly/www/experiments/matmul/matmul.polly.interchanged+tiled.ll index fa301cfa5eb0..acdd95f3bc4c 100644 Binary files a/polly/www/experiments/matmul/matmul.polly.interchanged+tiled.ll and b/polly/www/experiments/matmul/matmul.polly.interchanged+tiled.ll differ diff --git a/polly/www/experiments/matmul/matmul.polly.interchanged+tiled.s b/polly/www/experiments/matmul/matmul.polly.interchanged+tiled.s index 0f86df25d357..f7ab7fdd59cc 100644 --- a/polly/www/experiments/matmul/matmul.polly.interchanged+tiled.s +++ 
b/polly/www/experiments/matmul/matmul.polly.interchanged+tiled.s @@ -2,76 +2,112 @@ .section .rodata.cst8,"aM",@progbits,8 .align 8 .LCPI0_0: - .quad 4602678819172646912 # double 5.000000e-01 + .quad 4602678819172646912 # double 0.5 .text .globl init_array .align 16, 0x90 .type init_array,@function init_array: # @init_array -# BB#0: # %pollyBB - xorl %eax, %eax - movsd .LCPI0_0(%rip), %xmm0 - movq %rax, %rcx + .cfi_startproc +# BB#0: # %entry + pushq %rbp +.Ltmp2: + .cfi_def_cfa_offset 16 +.Ltmp3: + .cfi_offset %rbp, -16 + movq %rsp, %rbp +.Ltmp4: + .cfi_def_cfa_register %rbp + xorl %r8d, %r8d + vmovsd .LCPI0_0(%rip), %xmm0 .align 16, 0x90 -.LBB0_2: # %polly.loop_header1.preheader +.LBB0_1: # %polly.loop_preheader3 # =>This Loop Header: Depth=1 - # Child Loop BB0_3 Depth 2 - movq $-1536, %rdx # imm = 0xFFFFFFFFFFFFFA00 - xorl %esi, %esi + # Child Loop BB0_2 Depth 2 + xorl %ecx, %ecx .align 16, 0x90 -.LBB0_3: # %polly.loop_body2 - # Parent Loop BB0_2 Depth=1 +.LBB0_2: # %polly.loop_header2 + # Parent Loop BB0_1 Depth=1 # => This Inner Loop Header: Depth=2 - movl %esi, %edi - sarl $31, %edi - shrl $22, %edi - addl %esi, %edi - andl $-1024, %edi # imm = 0xFFFFFFFFFFFFFC00 - negl %edi - leal 1(%rsi,%rdi), %edi - cvtsi2sd %edi, %xmm1 - mulsd %xmm0, %xmm1 - cvtsd2ss %xmm1, %xmm1 - movss %xmm1, A+6144(%rax,%rdx,4) - movss %xmm1, B+6144(%rax,%rdx,4) - addl %ecx, %esi - incq %rdx - jne .LBB0_3 -# BB#1: # %polly.loop_header.loopexit - # in Loop: Header=BB0_2 Depth=1 - addq $6144, %rax # imm = 0x1800 - incq %rcx - cmpq $1536, %rcx # imm = 0x600 + movl %ecx, %edx + imull %r8d, %edx + movl %edx, %esi + sarl $31, %esi + shrl $22, %esi + addl %edx, %esi + andl $-1024, %esi # imm = 0xFFFFFFFFFFFFFC00 + negl %esi + movq %r8, %rax + shlq $11, %rax + leal 1(%rdx,%rsi), %edi + leaq (%rax,%rax,2), %rsi + leaq 1(%rcx), %rdx + cmpq $1536, %rdx # imm = 0x600 + vcvtsi2sdl %edi, %xmm0, %xmm1 + vmulsd %xmm0, %xmm1, %xmm1 + vcvtsd2ss %xmm1, %xmm1, %xmm1 + vmovss %xmm1, A(%rsi,%rcx,4) + vmovss 
%xmm1, B(%rsi,%rcx,4) + movq %rdx, %rcx jne .LBB0_2 -# BB#4: # %polly.after_loop +# BB#3: # %polly.loop_exit4 + # in Loop: Header=BB0_1 Depth=1 + incq %r8 + cmpq $1536, %r8 # imm = 0x600 + jne .LBB0_1 +# BB#4: # %polly.loop_exit + popq %rbp ret -.Ltmp0: - .size init_array, .Ltmp0-init_array +.Ltmp5: + .size init_array, .Ltmp5-init_array + .cfi_endproc .globl print_array .align 16, 0x90 .type print_array,@function print_array: # @print_array -# BB#0: + .cfi_startproc +# BB#0: # %entry + pushq %rbp +.Ltmp9: + .cfi_def_cfa_offset 16 +.Ltmp10: + .cfi_offset %rbp, -16 + movq %rsp, %rbp +.Ltmp11: + .cfi_def_cfa_register %rbp + pushq %r15 pushq %r14 + pushq %r12 pushq %rbx - pushq %rax - movq $-9437184, %rbx # imm = 0xFFFFFFFFFF700000 +.Ltmp12: + .cfi_offset %rbx, -48 +.Ltmp13: + .cfi_offset %r12, -40 +.Ltmp14: + .cfi_offset %r14, -32 +.Ltmp15: + .cfi_offset %r15, -24 + xorl %r14d, %r14d + movl $C, %r15d .align 16, 0x90 -.LBB1_1: # %.preheader +.LBB1_1: # %for.cond1.preheader # =>This Loop Header: Depth=1 # Child Loop BB1_2 Depth 2 - xorl %r14d, %r14d - movq stdout(%rip), %rdi + movq stdout(%rip), %rax + movq %r15, %r12 + xorl %ebx, %ebx .align 16, 0x90 -.LBB1_2: # Parent Loop BB1_1 Depth=1 +.LBB1_2: # %for.body3 + # Parent Loop BB1_1 Depth=1 # => This Inner Loop Header: Depth=2 - movss C+9437184(%rbx,%r14,4), %xmm0 - cvtss2sd %xmm0, %xmm0 + vmovss (%r12), %xmm0 + vcvtss2sd %xmm0, %xmm0, %xmm0 + movq %rax, %rdi movl $.L.str, %esi movb $1, %al callq fprintf - movslq %r14d, %rax + movslq %ebx, %rax imulq $1717986919, %rax, %rcx # imm = 0x66666667 movq %rcx, %rdx shrq $63, %rdx @@ -81,222 +117,252 @@ print_array: # @print_array subl %ecx, %eax cmpl $79, %eax jne .LBB1_4 -# BB#3: # in Loop: Header=BB1_2 Depth=2 +# BB#3: # %if.then + # in Loop: Header=BB1_2 Depth=2 movq stdout(%rip), %rsi movl $10, %edi callq fputc -.LBB1_4: # in Loop: Header=BB1_2 Depth=2 - incq %r14 - movq stdout(%rip), %rsi - cmpq $1536, %r14 # imm = 0x600 - movq %rsi, %rdi +.LBB1_4: # %for.inc + # in Loop: 
Header=BB1_2 Depth=2 + addq $4, %r12 + incq %rbx + movq stdout(%rip), %rax + cmpq $1536, %rbx # imm = 0x600 jne .LBB1_2 -# BB#5: # in Loop: Header=BB1_1 Depth=1 +# BB#5: # %for.end + # in Loop: Header=BB1_1 Depth=1 movl $10, %edi + movq %rax, %rsi callq fputc - addq $6144, %rbx # imm = 0x1800 + addq $6144, %r15 # imm = 0x1800 + incq %r14 + cmpq $1536, %r14 # imm = 0x600 jne .LBB1_1 -# BB#6: - addq $8, %rsp +# BB#6: # %for.end12 popq %rbx + popq %r12 popq %r14 + popq %r15 + popq %rbp ret -.Ltmp1: - .size print_array, .Ltmp1-print_array +.Ltmp16: + .size print_array, .Ltmp16-print_array + .cfi_endproc .section .rodata.cst8,"aM",@progbits,8 .align 8 .LCPI2_0: - .quad 4602678819172646912 # double 5.000000e-01 + .quad 4602678819172646912 # double 0.5 .text .globl main .align 16, 0x90 .type main,@function main: # @main -# BB#0: # %pollyBB + .cfi_startproc +# BB#0: # %entry pushq %rbp +.Ltmp20: + .cfi_def_cfa_offset 16 +.Ltmp21: + .cfi_offset %rbp, -16 + movq %rsp, %rbp +.Ltmp22: + .cfi_def_cfa_register %rbp pushq %r15 pushq %r14 pushq %r13 pushq %r12 pushq %rbx - subq $40, %rsp - xorl %eax, %eax - movsd .LCPI2_0(%rip), %xmm0 - movq %rax, %rcx + subq $56, %rsp +.Ltmp23: + .cfi_offset %rbx, -56 +.Ltmp24: + .cfi_offset %r12, -48 +.Ltmp25: + .cfi_offset %r13, -40 +.Ltmp26: + .cfi_offset %r14, -32 +.Ltmp27: + .cfi_offset %r15, -24 + xorl %ebx, %ebx + vmovsd .LCPI2_0(%rip), %xmm0 .align 16, 0x90 -.LBB2_1: # %polly.loop_header1.preheader.i +.LBB2_1: # %polly.loop_preheader3.i # =>This Loop Header: Depth=1 # Child Loop BB2_2 Depth 2 - movq $-1536, %rdx # imm = 0xFFFFFFFFFFFFFA00 - xorl %esi, %esi + xorl %ecx, %ecx .align 16, 0x90 -.LBB2_2: # %polly.loop_body2.i +.LBB2_2: # %polly.loop_header2.i # Parent Loop BB2_1 Depth=1 # => This Inner Loop Header: Depth=2 - movl %esi, %edi - sarl $31, %edi - shrl $22, %edi - addl %esi, %edi - andl $-1024, %edi # imm = 0xFFFFFFFFFFFFFC00 - negl %edi - leal 1(%rsi,%rdi), %edi - cvtsi2sd %edi, %xmm1 - mulsd %xmm0, %xmm1 - cvtsd2ss %xmm1, %xmm1 - 
movss %xmm1, A+6144(%rax,%rdx,4) - movss %xmm1, B+6144(%rax,%rdx,4) - addl %ecx, %esi - incq %rdx + movl %ecx, %edx + imull %ebx, %edx + movl %edx, %esi + sarl $31, %esi + shrl $22, %esi + addl %edx, %esi + andl $-1024, %esi # imm = 0xFFFFFFFFFFFFFC00 + negl %esi + movq %rbx, %rax + shlq $11, %rax + leal 1(%rdx,%rsi), %edi + leaq (%rax,%rax,2), %rsi + leaq 1(%rcx), %rdx + cmpq $1536, %rdx # imm = 0x600 + vcvtsi2sdl %edi, %xmm0, %xmm1 + vmulsd %xmm0, %xmm1, %xmm1 + vcvtsd2ss %xmm1, %xmm1, %xmm1 + vmovss %xmm1, A(%rsi,%rcx,4) + vmovss %xmm1, B(%rsi,%rcx,4) + movq %rdx, %rcx jne .LBB2_2 -# BB#3: # %polly.loop_header.loopexit.i +# BB#3: # %polly.loop_exit4.i # in Loop: Header=BB2_1 Depth=1 - addq $6144, %rax # imm = 0x1800 - incq %rcx - cmpq $1536, %rcx # imm = 0x600 + incq %rbx + cmpq $1536, %rbx # imm = 0x600 jne .LBB2_1 -# BB#4: # %polly.loop_header.preheader - movl $C, %eax - movq %rax, 8(%rsp) # 8-byte Spill +# BB#4: # %polly.loop_preheader3.preheader + movl $C, %ebx + movl $C, %edi xorl %esi, %esi movl $9437184, %edx # imm = 0x900000 - movl $C, %edi callq memset - movl $A, %eax - movq %rax, 16(%rsp) # 8-byte Spill - movq $0, 32(%rsp) # 8-byte Folded Spill - jmp .LBB2_6 - .align 16, 0x90 -.LBB2_5: # %polly.loop_header7.loopexit - # in Loop: Header=BB2_6 Depth=1 - addq $393216, 16(%rsp) # 8-byte Folded Spill - # imm = 0x60000 - addq $393216, 8(%rsp) # 8-byte Folded Spill - # imm = 0x60000 - movq 32(%rsp), %rax # 8-byte Reload - addq $64, %rax - movq %rax, 32(%rsp) # 8-byte Spill - cmpq $1536, %rax # imm = 0x600 - je .LBB2_7 -.LBB2_6: # %polly.loop_header12.preheader - # =>This Loop Header: Depth=1 - # Child Loop BB2_9 Depth 2 - # Child Loop BB2_11 Depth 3 - # Child Loop BB2_14 Depth 4 - # Child Loop BB2_18 Depth 5 - # Child Loop BB2_19 Depth 6 - movq 32(%rsp), %rax # 8-byte Reload - leaq 63(%rax), %rax - movl $B, %ecx - movq %rcx, 24(%rsp) # 8-byte Spill - xorl %ecx, %ecx - movq 8(%rsp), %rdx # 8-byte Reload - jmp .LBB2_9 - .align 16, 0x90 -.LBB2_8: # 
%polly.loop_header12.loopexit - # in Loop: Header=BB2_9 Depth=2 - addq $256, %rdx # imm = 0x100 - addq $256, 24(%rsp) # 8-byte Folded Spill - # imm = 0x100 - addq $64, %rcx - cmpq $1536, %rcx # imm = 0x600 - je .LBB2_5 -.LBB2_9: # %polly.loop_header17.preheader - # Parent Loop BB2_6 Depth=1 - # => This Loop Header: Depth=2 - # Child Loop BB2_11 Depth 3 - # Child Loop BB2_14 Depth 4 - # Child Loop BB2_18 Depth 5 - # Child Loop BB2_19 Depth 6 - leaq 63(%rcx), %rsi - xorl %edi, %edi - movq 16(%rsp), %r8 # 8-byte Reload - movq 24(%rsp), %r9 # 8-byte Reload - jmp .LBB2_11 - .align 16, 0x90 -.LBB2_10: # %polly.loop_header17.loopexit - # in Loop: Header=BB2_11 Depth=3 - addq $256, %r8 # imm = 0x100 - addq $393216, %r9 # imm = 0x60000 - addq $64, %rdi - cmpq $1536, %rdi # imm = 0x600 - je .LBB2_8 -.LBB2_11: # %polly.loop_body18 - # Parent Loop BB2_6 Depth=1 - # Parent Loop BB2_9 Depth=2 - # => This Loop Header: Depth=3 - # Child Loop BB2_14 Depth 4 - # Child Loop BB2_18 Depth 5 - # Child Loop BB2_19 Depth 6 - cmpq %rax, 32(%rsp) # 8-byte Folded Reload - jg .LBB2_10 -# BB#12: # %polly.loop_body23.lr.ph - # in Loop: Header=BB2_11 Depth=3 - leaq 63(%rdi), %r10 - xorl %r11d, %r11d - jmp .LBB2_14 - .align 16, 0x90 -.LBB2_13: # %polly.loop_header22.loopexit - # in Loop: Header=BB2_14 Depth=4 - addq $6144, %r11 # imm = 0x1800 - cmpq $393216, %r11 # imm = 0x60000 - je .LBB2_10 -.LBB2_14: # %polly.loop_body23 - # Parent Loop BB2_6 Depth=1 - # Parent Loop BB2_9 Depth=2 - # Parent Loop BB2_11 Depth=3 - # => This Loop Header: Depth=4 - # Child Loop BB2_18 Depth 5 - # Child Loop BB2_19 Depth 6 - cmpq %r10, %rdi - jg .LBB2_13 -# BB#15: # %polly.loop_body23 - # in Loop: Header=BB2_14 Depth=4 - cmpq %rsi, %rcx - jg .LBB2_13 -# BB#16: # %polly.loop_body33.lr.ph.preheader - # in Loop: Header=BB2_14 Depth=4 - leaq (%rdx,%r11), %rbx - leaq (%r8,%r11), %r14 - xorl %r15d, %r15d - movq %r9, %r12 - movq %r15, %r13 - jmp .LBB2_18 - .align 16, 0x90 -.LBB2_17: # %polly.loop_header27.loopexit - # in 
Loop: Header=BB2_18 Depth=5 - addq $6144, %r12 # imm = 0x1800 - incq %r13 - cmpq $64, %r13 - je .LBB2_13 -.LBB2_18: # %polly.loop_body33.lr.ph - # Parent Loop BB2_6 Depth=1 - # Parent Loop BB2_9 Depth=2 - # Parent Loop BB2_11 Depth=3 - # Parent Loop BB2_14 Depth=4 - # => This Loop Header: Depth=5 - # Child Loop BB2_19 Depth 6 - movss (%r14,%r13,4), %xmm0 - movq %r15, %rbp - .align 16, 0x90 -.LBB2_19: # %polly.loop_body33 - # Parent Loop BB2_6 Depth=1 - # Parent Loop BB2_9 Depth=2 - # Parent Loop BB2_11 Depth=3 - # Parent Loop BB2_14 Depth=4 - # Parent Loop BB2_18 Depth=5 - # => This Inner Loop Header: Depth=6 - movss (%r12,%rbp,4), %xmm1 - mulss %xmm0, %xmm1 - addss (%rbx,%rbp,4), %xmm1 - movss %xmm1, (%rbx,%rbp,4) - incq %rbp - cmpq $64, %rbp - jne .LBB2_19 - jmp .LBB2_17 -.LBB2_7: # %polly.after_loop9 xorl %eax, %eax - addq $40, %rsp + .align 16, 0x90 +.LBB2_5: # %polly.loop_preheader17 + # =>This Loop Header: Depth=1 + # Child Loop BB2_15 Depth 2 + # Child Loop BB2_8 Depth 3 + # Child Loop BB2_11 Depth 4 + # Child Loop BB2_17 Depth 5 + # Child Loop BB2_18 Depth 6 + movq %rax, -56(%rbp) # 8-byte Spill + movq %rbx, -88(%rbp) # 8-byte Spill + movq %rax, %rcx + orq $63, %rcx + movq %rcx, -72(%rbp) # 8-byte Spill + leaq -1(%rcx), %rcx + movq %rcx, -48(%rbp) # 8-byte Spill + movq $-1, %r15 + movl $B, %ecx + movq %rbx, -64(%rbp) # 8-byte Spill + xorl %r12d, %r12d + .align 16, 0x90 +.LBB2_15: # %polly.loop_preheader24 + # Parent Loop BB2_5 Depth=1 + # => This Loop Header: Depth=2 + # Child Loop BB2_8 Depth 3 + # Child Loop BB2_11 Depth 4 + # Child Loop BB2_17 Depth 5 + # Child Loop BB2_18 Depth 6 + movq %rcx, -80(%rbp) # 8-byte Spill + movq %r12, %r13 + orq $63, %r13 + leaq -1(%r13), %rbx + xorl %r9d, %r9d + movq %rcx, %rdx + .align 16, 0x90 +.LBB2_8: # %polly.loop_header23 + # Parent Loop BB2_5 Depth=1 + # Parent Loop BB2_15 Depth=2 + # => This Loop Header: Depth=3 + # Child Loop BB2_11 Depth 4 + # Child Loop BB2_17 Depth 5 + # Child Loop BB2_18 Depth 6 + cmpq 
-72(%rbp), %rax # 8-byte Folded Reload + jg .LBB2_13 +# BB#9: # %polly.loop_header30.preheader + # in Loop: Header=BB2_8 Depth=3 + movq %r9, %rax + orq $63, %rax + cmpq %rax, %r9 + jg .LBB2_13 +# BB#10: # in Loop: Header=BB2_8 Depth=3 + decq %rax + movq -64(%rbp), %r10 # 8-byte Reload + movq -56(%rbp), %r11 # 8-byte Reload + .align 16, 0x90 +.LBB2_11: # %polly.loop_header37.preheader + # Parent Loop BB2_5 Depth=1 + # Parent Loop BB2_15 Depth=2 + # Parent Loop BB2_8 Depth=3 + # => This Loop Header: Depth=4 + # Child Loop BB2_17 Depth 5 + # Child Loop BB2_18 Depth 6 + cmpq %r13, %r12 + movq %rdx, %r14 + movq %r9, %rcx + jg .LBB2_12 + .align 16, 0x90 +.LBB2_17: # %polly.loop_header46.preheader + # Parent Loop BB2_5 Depth=1 + # Parent Loop BB2_15 Depth=2 + # Parent Loop BB2_8 Depth=3 + # Parent Loop BB2_11 Depth=4 + # => This Loop Header: Depth=5 + # Child Loop BB2_18 Depth 6 + leaq (%r11,%r11,2), %rsi + shlq $11, %rsi + vmovss A(%rsi,%rcx,4), %xmm0 + movq %r10, %rdi + movq %r14, %r8 + movq %r15, %rsi +.LBB2_18: # %polly.loop_header46 + # Parent Loop BB2_5 Depth=1 + # Parent Loop BB2_15 Depth=2 + # Parent Loop BB2_8 Depth=3 + # Parent Loop BB2_11 Depth=4 + # Parent Loop BB2_17 Depth=5 + # => This Inner Loop Header: Depth=6 + vmulss (%r8), %xmm0, %xmm1 + vaddss (%rdi), %xmm1, %xmm1 + vmovss %xmm1, (%rdi) + addq $4, %rdi + addq $4, %r8 + incq %rsi + cmpq %rbx, %rsi + jle .LBB2_18 +# BB#16: # %polly.loop_exit48 + # in Loop: Header=BB2_17 Depth=5 + addq $6144, %r14 # imm = 0x1800 + cmpq %rax, %rcx + leaq 1(%rcx), %rcx + jle .LBB2_17 + .align 16, 0x90 +.LBB2_12: # %polly.loop_exit39 + # in Loop: Header=BB2_11 Depth=4 + addq $6144, %r10 # imm = 0x1800 + cmpq -48(%rbp), %r11 # 8-byte Folded Reload + leaq 1(%r11), %r11 + jle .LBB2_11 + .align 16, 0x90 +.LBB2_13: # %polly.loop_exit32 + # in Loop: Header=BB2_8 Depth=3 + addq $393216, %rdx # imm = 0x60000 + cmpq $1472, %r9 # imm = 0x5C0 + leaq 64(%r9), %r9 + movq -56(%rbp), %rax # 8-byte Reload + jl .LBB2_8 +# BB#14: # 
%polly.loop_exit25 + # in Loop: Header=BB2_15 Depth=2 + addq $256, -64(%rbp) # 8-byte Folded Spill + # imm = 0x100 + movq -80(%rbp), %rcx # 8-byte Reload + addq $256, %rcx # imm = 0x100 + addq $64, %r15 + cmpq $1472, %r12 # imm = 0x5C0 + leaq 64(%r12), %r12 + jl .LBB2_15 +# BB#6: # %polly.loop_exit18 + # in Loop: Header=BB2_5 Depth=1 + movq -88(%rbp), %rbx # 8-byte Reload + addq $393216, %rbx # imm = 0x60000 + cmpq $1472, %rax # imm = 0x5C0 + leaq 64(%rax), %rax + jl .LBB2_5 +# BB#7: # %polly.loop_exit11 + xorl %eax, %eax + addq $56, %rsp popq %rbx popq %r12 popq %r13 @@ -304,8 +370,9 @@ main: # @main popq %r15 popq %rbp ret -.Ltmp2: - .size main, .Ltmp2-main +.Ltmp28: + .size main, .Ltmp28-main + .cfi_endproc .type A,@object # @A .comm A,9437184,16 diff --git a/polly/www/experiments/matmul/matmul.polly.interchanged.exe b/polly/www/experiments/matmul/matmul.polly.interchanged.exe index cc125c4b2b1a..240c95a7f790 100755 Binary files a/polly/www/experiments/matmul/matmul.polly.interchanged.exe and b/polly/www/experiments/matmul/matmul.polly.interchanged.exe differ diff --git a/polly/www/experiments/matmul/matmul.polly.interchanged.ll b/polly/www/experiments/matmul/matmul.polly.interchanged.ll index c0a54bb64f45..52fbccc7ed5c 100644 Binary files a/polly/www/experiments/matmul/matmul.polly.interchanged.ll and b/polly/www/experiments/matmul/matmul.polly.interchanged.ll differ diff --git a/polly/www/experiments/matmul/matmul.polly.interchanged.s b/polly/www/experiments/matmul/matmul.polly.interchanged.s index 8bbc523f764e..a764da0b3f22 100644 --- a/polly/www/experiments/matmul/matmul.polly.interchanged.s +++ b/polly/www/experiments/matmul/matmul.polly.interchanged.s @@ -2,76 +2,112 @@ .section .rodata.cst8,"aM",@progbits,8 .align 8 .LCPI0_0: - .quad 4602678819172646912 # double 5.000000e-01 + .quad 4602678819172646912 # double 0.5 .text .globl init_array .align 16, 0x90 .type init_array,@function init_array: # @init_array -# BB#0: # %pollyBB - xorl %eax, %eax - movsd 
.LCPI0_0(%rip), %xmm0 - movq %rax, %rcx + .cfi_startproc +# BB#0: # %entry + pushq %rbp +.Ltmp2: + .cfi_def_cfa_offset 16 +.Ltmp3: + .cfi_offset %rbp, -16 + movq %rsp, %rbp +.Ltmp4: + .cfi_def_cfa_register %rbp + xorl %r8d, %r8d + vmovsd .LCPI0_0(%rip), %xmm0 .align 16, 0x90 -.LBB0_2: # %polly.loop_header1.preheader +.LBB0_1: # %polly.loop_preheader3 # =>This Loop Header: Depth=1 - # Child Loop BB0_3 Depth 2 - movq $-1536, %rdx # imm = 0xFFFFFFFFFFFFFA00 - xorl %esi, %esi + # Child Loop BB0_2 Depth 2 + xorl %ecx, %ecx .align 16, 0x90 -.LBB0_3: # %polly.loop_body2 - # Parent Loop BB0_2 Depth=1 +.LBB0_2: # %polly.loop_header2 + # Parent Loop BB0_1 Depth=1 # => This Inner Loop Header: Depth=2 - movl %esi, %edi - sarl $31, %edi - shrl $22, %edi - addl %esi, %edi - andl $-1024, %edi # imm = 0xFFFFFFFFFFFFFC00 - negl %edi - leal 1(%rsi,%rdi), %edi - cvtsi2sd %edi, %xmm1 - mulsd %xmm0, %xmm1 - cvtsd2ss %xmm1, %xmm1 - movss %xmm1, A+6144(%rax,%rdx,4) - movss %xmm1, B+6144(%rax,%rdx,4) - addl %ecx, %esi - incq %rdx - jne .LBB0_3 -# BB#1: # %polly.loop_header.loopexit - # in Loop: Header=BB0_2 Depth=1 - addq $6144, %rax # imm = 0x1800 - incq %rcx - cmpq $1536, %rcx # imm = 0x600 + movl %ecx, %edx + imull %r8d, %edx + movl %edx, %esi + sarl $31, %esi + shrl $22, %esi + addl %edx, %esi + andl $-1024, %esi # imm = 0xFFFFFFFFFFFFFC00 + negl %esi + movq %r8, %rax + shlq $11, %rax + leal 1(%rdx,%rsi), %edi + leaq (%rax,%rax,2), %rsi + leaq 1(%rcx), %rdx + cmpq $1536, %rdx # imm = 0x600 + vcvtsi2sdl %edi, %xmm0, %xmm1 + vmulsd %xmm0, %xmm1, %xmm1 + vcvtsd2ss %xmm1, %xmm1, %xmm1 + vmovss %xmm1, A(%rsi,%rcx,4) + vmovss %xmm1, B(%rsi,%rcx,4) + movq %rdx, %rcx jne .LBB0_2 -# BB#4: # %polly.after_loop +# BB#3: # %polly.loop_exit4 + # in Loop: Header=BB0_1 Depth=1 + incq %r8 + cmpq $1536, %r8 # imm = 0x600 + jne .LBB0_1 +# BB#4: # %polly.loop_exit + popq %rbp ret -.Ltmp0: - .size init_array, .Ltmp0-init_array +.Ltmp5: + .size init_array, .Ltmp5-init_array + .cfi_endproc .globl 
print_array .align 16, 0x90 .type print_array,@function print_array: # @print_array -# BB#0: + .cfi_startproc +# BB#0: # %entry + pushq %rbp +.Ltmp9: + .cfi_def_cfa_offset 16 +.Ltmp10: + .cfi_offset %rbp, -16 + movq %rsp, %rbp +.Ltmp11: + .cfi_def_cfa_register %rbp + pushq %r15 pushq %r14 + pushq %r12 pushq %rbx - pushq %rax - movq $-9437184, %rbx # imm = 0xFFFFFFFFFF700000 +.Ltmp12: + .cfi_offset %rbx, -48 +.Ltmp13: + .cfi_offset %r12, -40 +.Ltmp14: + .cfi_offset %r14, -32 +.Ltmp15: + .cfi_offset %r15, -24 + xorl %r14d, %r14d + movl $C, %r15d .align 16, 0x90 -.LBB1_1: # %.preheader +.LBB1_1: # %for.cond1.preheader # =>This Loop Header: Depth=1 # Child Loop BB1_2 Depth 2 - xorl %r14d, %r14d - movq stdout(%rip), %rdi + movq stdout(%rip), %rax + movq %r15, %r12 + xorl %ebx, %ebx .align 16, 0x90 -.LBB1_2: # Parent Loop BB1_1 Depth=1 +.LBB1_2: # %for.body3 + # Parent Loop BB1_1 Depth=1 # => This Inner Loop Header: Depth=2 - movss C+9437184(%rbx,%r14,4), %xmm0 - cvtss2sd %xmm0, %xmm0 + vmovss (%r12), %xmm0 + vcvtss2sd %xmm0, %xmm0, %xmm0 + movq %rax, %rdi movl $.L.str, %esi movb $1, %al callq fprintf - movslq %r14d, %rax + movslq %ebx, %rax imulq $1717986919, %rax, %rcx # imm = 0x66666667 movq %rcx, %rdx shrq $63, %rdx @@ -81,125 +117,158 @@ print_array: # @print_array subl %ecx, %eax cmpl $79, %eax jne .LBB1_4 -# BB#3: # in Loop: Header=BB1_2 Depth=2 +# BB#3: # %if.then + # in Loop: Header=BB1_2 Depth=2 movq stdout(%rip), %rsi movl $10, %edi callq fputc -.LBB1_4: # in Loop: Header=BB1_2 Depth=2 - incq %r14 - movq stdout(%rip), %rsi - cmpq $1536, %r14 # imm = 0x600 - movq %rsi, %rdi +.LBB1_4: # %for.inc + # in Loop: Header=BB1_2 Depth=2 + addq $4, %r12 + incq %rbx + movq stdout(%rip), %rax + cmpq $1536, %rbx # imm = 0x600 jne .LBB1_2 -# BB#5: # in Loop: Header=BB1_1 Depth=1 +# BB#5: # %for.end + # in Loop: Header=BB1_1 Depth=1 movl $10, %edi + movq %rax, %rsi callq fputc - addq $6144, %rbx # imm = 0x1800 + addq $6144, %r15 # imm = 0x1800 + incq %r14 + cmpq $1536, %r14 # 
imm = 0x600 jne .LBB1_1 -# BB#6: - addq $8, %rsp +# BB#6: # %for.end12 popq %rbx + popq %r12 popq %r14 + popq %r15 + popq %rbp ret -.Ltmp1: - .size print_array, .Ltmp1-print_array +.Ltmp16: + .size print_array, .Ltmp16-print_array + .cfi_endproc .section .rodata.cst8,"aM",@progbits,8 .align 8 .LCPI2_0: - .quad 4602678819172646912 # double 5.000000e-01 + .quad 4602678819172646912 # double 0.5 .text .globl main .align 16, 0x90 .type main,@function main: # @main -# BB#0: # %pollyBB - pushq %rax - xorl %eax, %eax - movsd .LCPI2_0(%rip), %xmm0 - movq %rax, %rcx + .cfi_startproc +# BB#0: # %entry + pushq %rbp +.Ltmp20: + .cfi_def_cfa_offset 16 +.Ltmp21: + .cfi_offset %rbp, -16 + movq %rsp, %rbp +.Ltmp22: + .cfi_def_cfa_register %rbp + pushq %r14 + pushq %rbx +.Ltmp23: + .cfi_offset %rbx, -32 +.Ltmp24: + .cfi_offset %r14, -24 + xorl %ebx, %ebx + vmovsd .LCPI2_0(%rip), %xmm0 .align 16, 0x90 -.LBB2_1: # %polly.loop_header1.preheader.i +.LBB2_1: # %polly.loop_preheader3.i # =>This Loop Header: Depth=1 # Child Loop BB2_2 Depth 2 - movq $-1536, %rdx # imm = 0xFFFFFFFFFFFFFA00 - xorl %esi, %esi + xorl %ecx, %ecx .align 16, 0x90 -.LBB2_2: # %polly.loop_body2.i +.LBB2_2: # %polly.loop_header2.i # Parent Loop BB2_1 Depth=1 # => This Inner Loop Header: Depth=2 - movl %esi, %edi - sarl $31, %edi - shrl $22, %edi - addl %esi, %edi - andl $-1024, %edi # imm = 0xFFFFFFFFFFFFFC00 - negl %edi - leal 1(%rsi,%rdi), %edi - cvtsi2sd %edi, %xmm1 - mulsd %xmm0, %xmm1 - cvtsd2ss %xmm1, %xmm1 - movss %xmm1, A+6144(%rax,%rdx,4) - movss %xmm1, B+6144(%rax,%rdx,4) - addl %ecx, %esi - incq %rdx + movl %ecx, %edx + imull %ebx, %edx + movl %edx, %esi + sarl $31, %esi + shrl $22, %esi + addl %edx, %esi + andl $-1024, %esi # imm = 0xFFFFFFFFFFFFFC00 + negl %esi + movq %rbx, %rax + shlq $11, %rax + leal 1(%rdx,%rsi), %edi + leaq (%rax,%rax,2), %rsi + leaq 1(%rcx), %rdx + cmpq $1536, %rdx # imm = 0x600 + vcvtsi2sdl %edi, %xmm0, %xmm1 + vmulsd %xmm0, %xmm1, %xmm1 + vcvtsd2ss %xmm1, %xmm1, %xmm1 + vmovss 
%xmm1, A(%rsi,%rcx,4) + vmovss %xmm1, B(%rsi,%rcx,4) + movq %rdx, %rcx jne .LBB2_2 -# BB#3: # %polly.loop_header.loopexit.i +# BB#3: # %polly.loop_exit4.i # in Loop: Header=BB2_1 Depth=1 - addq $6144, %rax # imm = 0x1800 - incq %rcx - cmpq $1536, %rcx # imm = 0x600 + incq %rbx + cmpq $1536, %rbx # imm = 0x600 jne .LBB2_1 -# BB#4: # %polly.loop_header.preheader +# BB#4: # %polly.loop_preheader3.preheader + movl $C, %r14d movl $C, %edi xorl %esi, %esi movl $9437184, %edx # imm = 0x900000 callq memset xorl %eax, %eax - jmp .LBB2_6 .align 16, 0x90 -.LBB2_5: # %polly.loop_header7.loopexit - # in Loop: Header=BB2_6 Depth=1 - addq $6144, %rax # imm = 0x1800 - cmpq $9437184, %rax # imm = 0x900000 - je .LBB2_7 -.LBB2_6: # %polly.loop_header12.preheader +.LBB2_5: # %polly.loop_preheader17 # =>This Loop Header: Depth=1 - # Child Loop BB2_9 Depth 2 - # Child Loop BB2_10 Depth 3 - leaq A(%rax), %rcx - movq $-9437184, %rdx # imm = 0xFFFFFFFFFF700000 - jmp .LBB2_9 + # Child Loop BB2_10 Depth 2 + # Child Loop BB2_8 Depth 3 + movl $B, %ebx + xorl %edx, %edx .align 16, 0x90 -.LBB2_8: # %polly.loop_header12.loopexit - # in Loop: Header=BB2_9 Depth=2 - addq $4, %rcx - addq $6144, %rdx # imm = 0x1800 - je .LBB2_5 -.LBB2_9: # %polly.loop_header17.preheader - # Parent Loop BB2_6 Depth=1 +.LBB2_10: # %polly.loop_preheader24 + # Parent Loop BB2_5 Depth=1 # => This Loop Header: Depth=2 - # Child Loop BB2_10 Depth 3 - movss (%rcx), %xmm0 - xorl %esi, %esi + # Child Loop BB2_8 Depth 3 + leaq (%rax,%rax,2), %rcx + shlq $11, %rcx + vmovss A(%rcx,%rdx,4), %xmm0 + movl $1536, %esi # imm = 0x600 + movq %r14, %rdi + movq %rbx, %rcx .align 16, 0x90 -.LBB2_10: # %polly.loop_body18 - # Parent Loop BB2_6 Depth=1 - # Parent Loop BB2_9 Depth=2 +.LBB2_8: # %polly.loop_header23 + # Parent Loop BB2_5 Depth=1 + # Parent Loop BB2_10 Depth=2 # => This Inner Loop Header: Depth=3 - movss B+9437184(%rdx,%rsi,4), %xmm1 - mulss %xmm0, %xmm1 - addss C(%rax,%rsi,4), %xmm1 - movss %xmm1, C(%rax,%rsi,4) - incq %rsi - 
cmpq $1536, %rsi # imm = 0x600 + vmulss (%rcx), %xmm0, %xmm1 + vaddss (%rdi), %xmm1, %xmm1 + vmovss %xmm1, (%rdi) + addq $4, %rdi + addq $4, %rcx + decq %rsi + jne .LBB2_8 +# BB#9: # %polly.loop_exit25 + # in Loop: Header=BB2_10 Depth=2 + addq $6144, %rbx # imm = 0x1800 + incq %rdx + cmpq $1536, %rdx # imm = 0x600 jne .LBB2_10 - jmp .LBB2_8 -.LBB2_7: # %polly.after_loop9 +# BB#6: # %polly.loop_exit18 + # in Loop: Header=BB2_5 Depth=1 + addq $6144, %r14 # imm = 0x1800 + incq %rax + cmpq $1536, %rax # imm = 0x600 + jne .LBB2_5 +# BB#7: # %polly.loop_exit11 xorl %eax, %eax - popq %rdx + popq %rbx + popq %r14 + popq %rbp ret -.Ltmp2: - .size main, .Ltmp2-main +.Ltmp25: + .size main, .Ltmp25-main + .cfi_endproc .type A,@object # @A .comm A,9437184,16 diff --git a/polly/www/experiments/matmul/matmul.preopt.ll b/polly/www/experiments/matmul/matmul.preopt.ll index 9287d7e141b0..3931716619bf 100644 --- a/polly/www/experiments/matmul/matmul.preopt.ll +++ b/polly/www/experiments/matmul/matmul.preopt.ll @@ -1,5 +1,5 @@ ; ModuleID = 'matmul.s' -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" %struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } @@ -8,173 +8,179 @@ target triple = "x86_64-unknown-linux-gnu" @A = common global [1536 x [1536 x float]] zeroinitializer, align 16 @B = common global [1536 x [1536 x float]] zeroinitializer, align 16 @stdout = external global %struct._IO_FILE* -@.str = private unnamed_addr constant [5 x i8] c"%lf \00" +@.str = private unnamed_addr constant [5 x i8] c"%lf \00", 
align 1 @C = common global [1536 x [1536 x float]] zeroinitializer, align 16 -@.str1 = private unnamed_addr constant [2 x i8] c"\0A\00" +@.str1 = private unnamed_addr constant [2 x i8] c"\0A\00", align 1 -define void @init_array() nounwind { -;