Imported Upstream version 6.10.0.49

Former-commit-id: 1d6753294b2993e1fbf92de9366bb9544db4189b
This commit is contained in:
Xamarin Public Jenkins (auto-signing)
2020-01-16 16:38:04 +00:00
parent d94e79959b
commit 468663ddbb
48518 changed files with 2789335 additions and 61176 deletions

View File

@@ -0,0 +1,9 @@
; Identity stub for the NVIDIA libdevice expf intrinsic so tests can link
; without a real libdevice bitcode file.
define float @__nv_expf(float %a) {
ret float %a
}
; Identity stub for the NVIDIA libdevice cosf intrinsic (see @__nv_expf).
define float @__nv_cosf(float %a) {
ret float %a
}
; Identity stub for the NVIDIA libdevice logf intrinsic (see @__nv_expf).
define float @__nv_logf(float %a) {
ret float %a
}

View File

@@ -0,0 +1,71 @@
; RUN: opt %loadPolly -analyze -polly-scops < %s | FileCheck %s -check-prefix=SCOP
; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR
; REQUIRES: pollyacc
; Check that we detect a scop.
; SCOP: Function: checkScalarKill
; SCOP-NEXT: Region: %XLoopInit---%for.end
; SCOP-NEXT: Max Loop Depth: 1
; Check that we have a scalar that is not a phi node in the scop.
; SCOP: i32 MemRef_x_0; // Element size 4
; Check that kernel launch is generated in host IR.
; the declare would not be generated unless a call to a kernel exists.
; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
; Check that we add variables that are local to a scop into the kills that we
; pass to PPCG. This should enable PPCG to codegen this example.
; void checkScalarKill(int A[], int B[], int C[], const int control1, int control2) {
; int x;
; #pragma scop
; for(int i = 0; i < 1000; i++) {
; XLoopInit: x = 0;
;
; if (control1 > 2)
; C1Add: x += 10;
; if (control2 > 3)
; C2Add: x += A[i];
;
; BLoopAccumX: B[i] += x;
; }
;
; #pragma endscop
; }
; ModuleID = 'test.ll'
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
; IR for the C source in the comment above: for i in [0, 1000), compute a
; scop-local scalar x (0 or 10, optionally + A[i]) and accumulate it into B[i].
; The non-phi scalar %x.0 must be added to the kill list passed to PPCG.
define void @checkScalarKill(i32* %A, i32* %B, i32* %C, i32 %control1, i32 %control2) {
entry:
br label %entry.split
entry.split: ; preds = %entry
br label %XLoopInit
; Loop header, i = %indvars.iv in [0, 1000).
; The C1Add branch has been folded into a select: x.0 = control1 > 2 ? 10 : 0.
XLoopInit: ; preds = %entry.split, %BLoopAccumX
%indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %BLoopAccumX ]
%cmp1 = icmp sgt i32 %control1, 2
%x.0 = select i1 %cmp1, i32 10, i32 0
%cmp2 = icmp sgt i32 %control2, 3
br i1 %cmp2, label %C2Add, label %BLoopAccumX
; Taken only when control2 > 3: x += A[i].
C2Add: ; preds = %XLoopInit
%arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
%tmp6 = load i32, i32* %arrayidx, align 4
%add4 = add nsw i32 %tmp6, %x.0
br label %BLoopAccumX
; Merge point: %x.1 selects the updated or unmodified x, then B[i] += x.
BLoopAccumX: ; preds = %XLoopInit, %C2Add
%x.1 = phi i32 [ %add4, %C2Add ], [ %x.0, %XLoopInit ]
%arrayidx7 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
%tmp11 = load i32, i32* %arrayidx7, align 4
%add8 = add nsw i32 %tmp11, %x.1
store i32 %add8, i32* %arrayidx7, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp ne i64 %indvars.iv.next, 1000
br i1 %exitcond, label %XLoopInit, label %for.end
for.end: ; preds = %BLoopAccumX
ret void
}

View File

@@ -0,0 +1,53 @@
; RUN: opt %loadPolly -S -polly-process-unprofitable -polly-codegen-ppcg \
; RUN: -polly-invariant-load-hoisting -polly-ignore-parameter-bounds < %s | \
; RUN: FileCheck %s
; REQUIRES: pollyacc
; CHECK: polly_launchKernel
; Verify that this program compiles. At some point, this compilation crashed
; due to insufficient parameters being available.
source_filename = "bugpoint-output-4d01492.bc"
target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-unknown-linux-gnu"
%struct.barney = type { i8*, i64, i64, [2 x %struct.widget] }
%struct.widget = type { i64, i64, i64 }
@global = external unnamed_addr global %struct.barney, align 32
; Function Attrs: nounwind uwtable
; Reduced bugpoint test case (names are bugpoint-generated). An outer loop over
; %tmp2 = 1..*%arg wraps an inner loop that stores undef through a pointer
; computed from the global array descriptor @global. The descriptor loads are
; invariant and get hoisted; compilation used to crash here because not all
; required parameters were made available to the PPCG code generator.
define void @wobble(i32* noalias %arg) #0 {
bb:
%tmp = load i32, i32* %arg, align 4
br label %bb1
; Outer loop header: %tmp2 counts from 1 up to %tmp.
bb1: ; preds = %bb13, %bb
%tmp2 = phi i32 [ %tmp15, %bb13 ], [ 1, %bb ]
br label %bb3
; Inner loop: address = base + %tmp6 * %tmp2 + offset, all read from @global.
bb3: ; preds = %bb3, %bb1
%tmp4 = load i32*, i32** bitcast (%struct.barney* @global to i32**), align 32
%tmp5 = sext i32 %tmp2 to i64
%tmp6 = load i64, i64* getelementptr inbounds (%struct.barney, %struct.barney* @global, i64 0, i32 3, i64 1, i32 0), align 8
%tmp7 = mul i64 %tmp6, %tmp5
%tmp8 = add i64 %tmp7, 0
%tmp9 = load i64, i64* getelementptr inbounds (%struct.barney, %struct.barney* @global, i64 0, i32 1), align 8
%tmp10 = add i64 %tmp8, %tmp9
%tmp11 = getelementptr i32, i32* %tmp4, i64 %tmp10
store i32 undef, i32* %tmp11, align 4
; Constant-true exit condition: the inner loop runs exactly once.
%tmp12 = icmp eq i32 0, 0
br i1 %tmp12, label %bb13, label %bb3
; Outer loop latch: exit once %tmp2 reaches *%arg.
bb13: ; preds = %bb3
%tmp14 = icmp eq i32 %tmp2, %tmp
%tmp15 = add i32 %tmp2, 1
br i1 %tmp14, label %bb16, label %bb1
bb16: ; preds = %bb13
ret void
}
attributes #0 = { nounwind uwtable }

View File

@@ -0,0 +1,50 @@
; RUN: opt %loadPolly -S -polly-codegen-ppcg \
; RUN: -polly-use-llvm-names < %s
; ModuleID = 'test/GPGPU/zero-size-array.ll'
; REQUIRES: pollyacc
target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-unknown-linux-gnu"
; We used to divide the element size by 8 to arrive at the 'actual' size
; of an array element. This used to cause arrays that have an element size
; of less than 8 to collapse to size 0. This test makes sure that it does
; not happen anymore.
; f(int *niters_ptr, int *arr[0]) {
; const int niters = *niters_ptr;
; for(int i = 0; i < niters; i++) {
; arr[0][i + 1] = 0
; }
; }
; Function Attrs: nounwind uwtable
; Stores 0 into %arr[i] for i = 1..*%niters.ptr (indvar starts at 1, matching
; the `i + 1` access in the pseudocode above). The [0 x i32] array type is the
; point of the test: its element size (4) must not be divided down to 0.
define void @f(i32* noalias %niters.ptr, [0 x i32]* noalias %arr) #0 {
entry:
%niters = load i32, i32* %niters.ptr, align 4
br label %loop.body
; Main loop: arr[indvar] = 0; exits after indvar == niters iterates once more.
loop.body: ; preds = %loop.body, %entry
%indvar = phi i32 [ %indvar.next, %loop.body ], [ 1, %entry ]
%indvar.sext = sext i32 %indvar to i64
%arr.slot = getelementptr [0 x i32], [0 x i32]* %arr, i64 0, i64 %indvar.sext
store i32 0, i32* %arr.slot, align 4
%tmp8 = icmp eq i32 %indvar, %niters
%indvar.next = add i32 %indvar, 1
br i1 %tmp8, label %loop.exit, label %loop.body
loop.exit: ; preds = %loop.body
%tmp10 = icmp sgt i32 undef, 0
br label %auxiliary.loop
; Auxiliary self-loop with undef condition, kept from the original reduction;
; it is dead weight around the scop, not part of what is being tested.
auxiliary.loop: ; preds = %"101", %loop.exit
%tmp11 = phi i1 [ %tmp10, %loop.exit ], [ undef, %auxiliary.loop ]
br i1 undef, label %auxiliary.loop, label %exit
exit: ; preds = %auxiliary.loop
ret void
}
attributes #0 = { nounwind uwtable }

View File

@@ -0,0 +1,55 @@
; RUN: opt %loadPolly -S -polly-codegen-ppcg \
; RUN: -polly-ignore-parameter-bounds \
; RUN: -polly-invariant-load-hoisting < %s| FileCheck %s -check-prefix=HOST-IR
;
; REQUIRES: pollyacc
; When we have `-polly-ignore-parameter-bounds`, `Scop::Context` does not contain
; all the parameters present in the program.
;
; The construction of the `isl_multi_pw_aff` requires all the individual `pw_aff`
; to have the same parameter dimensions. To achieve this, we used to realign
; every `pw_aff` with `Scop::Context`. However, in conjunction with
; `-polly-ignore-parameter-bounds`, this is now incorrect, since `Scop::Context`
; does not contain all parameters.
;
; We check that Polly does the right thing in this case and sets up the parameter
; dimensions correctly.
; Check that kernel launch is generated in host IR.
; the declare would not be generated unless a call to a kernel exists.
; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
; ModuleID = 'test/GPGPU/bounds-construction-with-ignore-param-bounds.ll'
; C pseudocode
; ------------
; void f(int *arr, long niters, long stride) {
; for(int i = 0; i < niters; i++) {
; arr[i * stride] = 1;
; }
; }
target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: nounwind uwtable
; Strided store loop: arr[i * stride] = 1 for i = 0..niters. %stride is a
; parameter that appears only in the access function, so with
; -polly-ignore-parameter-bounds it is absent from Scop::Context — the case
; this test exercises when building the isl_multi_pw_aff (see comment above).
define void @f(i32 *%arr, i64 %niters, i64 %stride) unnamed_addr #1 {
entry:
br label %loop
loop: ; preds = %loop, %entry
%indvar = phi i64 [ 0, %entry ], [ %indvar.next, %loop ]
%idx = mul nuw nsw i64 %indvar, %stride
%slot = getelementptr i32, i32* %arr, i64 %idx
store i32 1, i32* %slot, align 4
%indvar.next = add nuw nsw i64 %indvar, 1
; Signed compare: continue while indvar.next <= niters.
%check = icmp sgt i64 %indvar.next, %niters
br i1 %check, label %exit, label %loop
exit: ; preds = %loop
ret void
}
attributes #0 = { nounwind }
attributes #1 = { nounwind uwtable }

View File

@@ -0,0 +1,102 @@
; RUN: opt %loadPolly -analyze -polly-scops \
; RUN: -polly-detect-fortran-arrays \
; RUN: -polly-invariant-load-hoisting \
; RUN: -polly-use-llvm-names \
; RUN: < %s | FileCheck %s --check-prefix=SCOP
; RUN: opt %loadPolly -S \
; RUN: -polly-detect-fortran-arrays \
; RUN: -polly-codegen-ppcg \
; RUN: -polly-invariant-load-hoisting \
; RUN: -polly-use-llvm-names \
; RUN: -polly-acc-fail-on-verify-module-failure \
; RUN: < %s | FileCheck %s --check-prefix=HOST-IR
; REQUIRES: pollyacc
; In Polly, we specifically add a parameter to represent the outermost dimension
; size of fortran arrays. We do this because this information is statically
; available from the fortran metadata generated by dragonegg.
; However, we were only materializing these parameters (meaning, creating an
; llvm::Value to back the isl_id) from *memory accesses*. This is wrong,
; we should materialize parameters from *scop array info*.
; It is wrong because if there is a case where we detect 2 fortran arrays,
; but only one of them is accessed, we may not materialize the other array's
; dimensions at all.
; This test case checks that we do not fail if there is an array that does
; not have an access (In this case, `memory`), we still generate the
; parameter.
; Check that we detect the function as a Scop.
; SCOP: Function: f
; SCOP-NEXT: Region: %loop.prepare---%for.exit
; SCOP-NEXT: Max Loop Depth: 1
; Check that we detect fortran arrays.
; SCOP: Arrays (Bounds as pw_affs) {
; SCOP: double* MemRef_global_arr[*]; // Element size 8
; SCOP-NEXT: double MemRef_memory[ [MemRef_memory_fortranarr_size] -> { [] -> [(MemRef_memory_fortranarr_size)] } ]; [BasePtrOrigin: MemRef_global_arr] // Element size 8
; SCOP-NEXT: double MemRef_memory2[ [MemRef_memory2_fortranarr_size] -> { [] -> [(MemRef_memory2_fortranarr_size)] } ]; [BasePtrOrigin: MemRef_global_arr] // Element size 8
; SCOP-NEXT: }
; Check that we have writes *only* into memory2, not into memory.
; SCOP: Statements {
; SCOP: Stmt_for_body
; SCOP: MustWriteAccess := [Reduction Type: NONE] [Fortran array descriptor: global_arr] [Scalar: 0]
; SCOP-NEXT: [start_val, end_val, offset, MemRef_memory_fortranarr_size, MemRef_memory2_fortranarr_size] -> { Stmt_for_body[i0] -> MemRef_memory2[start_val + offset + i0] };
; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Fortran array descriptor: global_arr] [Scalar: 0]
; SCOP-NEXT: [start_val, end_val, offset, MemRef_memory_fortranarr_size, MemRef_memory2_fortranarr_size] -> { Stmt_for_body[i0] -> MemRef_memory2[start_val + offset + i0] };
; SCOP-NEXT: }
; Check that we materialize the sizes and send it across to the kernel.
; HOST-IR: store i64 %MemRef_memory_size, i64* %polly_launch_0_param_4
; HOST-IR: store i64 %MemRef_memory2_size, i64* %polly_launch_0_param_5
target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-unknown-linux-gnu"
module asm "\09.ident\09\22GCC: (GNU) 4.6.4 LLVM: 3.3.1\22"
%"struct.array1_real(kind=8)" = type { i8*, i64, i64, [1 x %struct.descriptor_dimension] }
%struct.descriptor_dimension = type { i64, i64, i64 }
@global_arr = external unnamed_addr global %"struct.array1_real(kind=8)", align 32
; Function Attrs: nounwind uwtable
; Fortran-style loop from i = *ipstart to *ipend over the dragonegg-emitted
; array descriptor @global_arr. Both %memory and %memory2 load the same base
; pointer/offset; Polly detects them as two fortran arrays, but only the
; accesses through %memory2's SSA names survive into the scop statements —
; the test checks the size parameter of the unaccessed array is still emitted.
define void @f(i32* noalias %ipstart, i32* noalias %ipend) unnamed_addr #0 {
entry:
br label %loop.prepare
; Read loop bounds and skip the loop entirely when start > end.
loop.prepare: ; preds = %"6", %"3.preheader"
%start.val = load i32, i32* %ipstart, align 4
%end.val = load i32, i32* %ipend, align 4
%should.loop = icmp sgt i32 %start.val, %end.val
br i1 %should.loop, label %for.exit, label %for.body
for.body: ; preds = %for.body, %"4.preheader"
%i = phi i32 [ %i.next, %for.body ], [ %start.val, %loop.prepare ]
%i.sext = sext i32 %i to i64
; First access chain: store 1.0 at global_arr.base[offset + i].
%memory = load double*, double** bitcast (%"struct.array1_real(kind=8)"* @global_arr to double**), align 32
%offset = load i64, i64* getelementptr inbounds (%"struct.array1_real(kind=8)", %"struct.array1_real(kind=8)"* @global_arr, i64 0, i32 1), align 8
%idx = add i64 %offset, %i.sext
%slot = getelementptr double, double* %memory, i64 %idx
store double 1.0, double* %slot, align 8
; Second access chain: read back the same location via %memory2.
%memory2 = load double*, double** bitcast (%"struct.array1_real(kind=8)"* @global_arr to double**), align 32
%offset2 = load i64, i64* getelementptr inbounds (%"struct.array1_real(kind=8)", %"struct.array1_real(kind=8)"* @global_arr, i64 0, i32 1), align 8
%idx2 = add i64 %offset2, %i.sext
%slot2 = getelementptr double, double* %memory2, i64 %idx2
%val = load double, double* %slot2, align 8
%should.loopexit = icmp eq i32 %i, %end.val
%i.next = add i32 %i, 1
br i1 %should.loopexit, label %for.exit, label %for.body
for.exit: ; preds = %for.body
ret void
}
attributes #0 = { nounwind uwtable }

View File

@@ -0,0 +1,37 @@
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
; RUN: -disable-output < %s | \
; RUN: FileCheck -check-prefix=KERNEL %s
; REQUIRES: pollyacc
; KERNEL: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_A, i64 %n) #0 {
; KERNEL: !nvvm.annotations = !{!0}
; KERNEL: !0 = !{void (i8 addrspace(1)*, i64)* @FUNC_foo_SCOP_0_KERNEL_0, !"maxntidx", i32 32, !"maxntidy", i32 1, !"maxntidz", i32 1}
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; Simple kernel source: A[i] += 100 for i = 0..n. Compiled by -polly-codegen-ppcg
; into the ptx_kernel whose maxntid annotations the KERNEL checks above verify.
define void @foo(i64* %A, i64 %n) {
bb:
br label %bb1
; Loop header: i = %i.0, exit when i >= n (signed).
bb1: ; preds = %bb6, %bb
%i.0 = phi i64 [ 0, %bb ], [ %tmp7, %bb6 ]
%tmp = icmp slt i64 %i.0, %n
br i1 %tmp, label %bb2, label %bb8
; Body: A[i] = A[i] + 100.
bb2: ; preds = %bb1
%tmp3 = getelementptr inbounds i64, i64* %A, i64 %i.0
%tmp4 = load i64, i64* %tmp3, align 8
%tmp5 = add nsw i64 %tmp4, 100
store i64 %tmp5, i64* %tmp3, align 8
br label %bb6
bb6: ; preds = %bb2
%tmp7 = add nuw nsw i64 %i.0, 1
br label %bb1
bb8: ; preds = %bb1
ret void
}

View File

@@ -0,0 +1,118 @@
; RUN: opt %loadPolly -S -polly-process-unprofitable -polly-acc-mincompute=0 -polly-target=gpu -polly-codegen-ppcg -polly-acc-codegen-managed-memory < %s | \
; RUN: FileCheck %s
; REQUIRES: pollyacc
;
; #include <cuda_runtime.h>
;
; static const int N = 45;
;
; void copy(int *R, int *A) {
; for (int i = 0; i < N; i++) {
; R[i] = A[i] * 10;
; }
; }
;
; int main() {
; int *A, *R;
;
; cudaMallocManaged((void **)(&A), sizeof(int) * N, cudaMemAttachGlobal);
; cudaMallocManaged((void **)(&R), sizeof(int) * N, cudaMemAttachGlobal);
;
; for (int i = 0; i < N; i++) {
; A[i] = i;
; R[i] = 0;
; }
; copy(R, A);
;
; return 0;
; }
;
; CHECK-NOT: polly_copyFromHostToDevice
; CHECK-NOT: polly_copyFromDeviceToHost
; CHECK-NOT: polly_freeDeviceMemory
; CHECK-NOT: polly_allocateMemoryForDevice
; CHECK: %[[REGCTX:[0-9]+]] = call i8* @polly_initContextCUDA()
; CHECK-NEXT: %[[REGCA:[0-9]+]] = bitcast i32* %A to i8*
; CHECK-NEXT: %[[REGCR:[0-9]+]] = bitcast i32* %R to i8*
; CHECK-NEXT: %[[REGGEP0:[0-9]+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0
; CHECK-NEXT: store i8* %[[REGCA]], i8** %polly_launch_0_param_0
; CHECK-NEXT: %[[REGCP0:[0-9]+]] = bitcast i8** %polly_launch_0_param_0 to i8*
; CHECK-NEXT: store i8* %[[REGCP0]], i8** %[[REGGEP0]]
; CHECK-NEXT: %[[REGGEP1:[0-9]+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1
; CHECK-NEXT: store i8* %[[REGCR]], i8** %polly_launch_0_param_1
; CHECK-NEXT: %[[REGCP1:[0-9]+]] = bitcast i8** %polly_launch_0_param_1 to i8*
; CHECK-NEXT: store i8* %[[REGCP1]], i8** %[[REGGEP1]]
; CHECK-NEXT: %[[REGKERNEL:[0-9]+]] = call i8* @polly_getKernel(i8* getelementptr inbounds ([852 x i8], [852 x i8]* @FUNC_copy_SCOP_0_KERNEL_0, i32 0, i32 0), i8* getelementptr inbounds ([26 x i8], [26 x i8]* @FUNC_copy_SCOP_0_KERNEL_0_name, i32 0, i32 0))
; CHECK-NEXT: call void @polly_launchKernel(i8* %[[REGKERNEL]], i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr)
; CHECK-NEXT: call void @polly_freeKernel(i8* %[[REGKERNEL]])
; CHECK-NEXT: call void @polly_synchronizeDevice()
; CHECK-NEXT: call void @polly_freeContext(i8* %[[REGCTX]])
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; R[i] = A[i] * 10 for i = 0..44 (N = 45). This is the scop that gets offloaded;
; with managed memory the host/device copy calls must NOT be emitted for it.
define void @copy(i32* %R, i32* %A) {
entry:
br label %for.cond
; Loop header: i = %indvars.iv, trip count 45.
for.cond: ; preds = %for.inc, %entry
%indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
%exitcond = icmp ne i64 %indvars.iv, 45
br i1 %exitcond, label %for.body, label %for.end
for.body: ; preds = %for.cond
%arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
%tmp = load i32, i32* %arrayidx, align 4
%mul = mul nsw i32 %tmp, 10
%arrayidx2 = getelementptr inbounds i32, i32* %R, i64 %indvars.iv
store i32 %mul, i32* %arrayidx2, align 4
br label %for.inc
for.inc: ; preds = %for.body
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
br label %for.cond
for.end: ; preds = %for.cond
ret void
}
; Driver mirroring the C main() above: allocates A and R with
; cudaMallocManaged (180 bytes = 45 * sizeof(i32)), initializes A[i] = i and
; R[i] = 0, then calls @copy. Exists so the managed-memory code path of the
; host-side codegen is exercised end to end.
define i32 @main() {
entry:
%A = alloca i32*, align 8
%R = alloca i32*, align 8
%tmp = bitcast i32** %A to i8**
%call = call i32 @cudaMallocManaged(i8** nonnull %tmp, i64 180, i32 1) #2
%tmp1 = bitcast i32** %R to i8**
%call1 = call i32 @cudaMallocManaged(i8** nonnull %tmp1, i64 180, i32 1) #2
br label %for.cond
; Initialization loop: i = 0..44.
for.cond: ; preds = %for.inc, %entry
%indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
%exitcond = icmp ne i64 %indvars.iv, 45
br i1 %exitcond, label %for.body, label %for.end
; A[i] = (i32)i; R[i] = 0.
for.body: ; preds = %for.cond
%tmp2 = load i32*, i32** %A, align 8
%arrayidx = getelementptr inbounds i32, i32* %tmp2, i64 %indvars.iv
%tmp3 = trunc i64 %indvars.iv to i32
store i32 %tmp3, i32* %arrayidx, align 4
%tmp4 = load i32*, i32** %R, align 8
%arrayidx3 = getelementptr inbounds i32, i32* %tmp4, i64 %indvars.iv
store i32 0, i32* %arrayidx3, align 4
br label %for.inc
for.inc: ; preds = %for.body
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
br label %for.cond
; Run the scop and return 0; the cudaMallocManaged results are never freed
; (acceptable for a compile-only FileCheck test).
for.end: ; preds = %for.cond
%tmp5 = load i32*, i32** %R, align 8
%tmp6 = load i32*, i32** %A, align 8
call void @copy(i32* %tmp5, i32* %tmp6)
ret i32 0
}
declare i32 @cudaMallocManaged(i8**, i64, i32) #1

View File

@@ -0,0 +1,104 @@
; RUN: opt %loadPolly %s -polly-process-unprofitable -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
; RUN: | FileCheck --check-prefix=KERNEL-IR %s
; REQUIRES: pollyacc
; KERNEL-IR: define ptx_kernel void @FUNC_vec_add_1_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_arr, i32 %N) #0 {
; The instruction marked <<<LeakyInst>>> is copied into the GPUModule,
; with changes only to the parameters to access data on the device instead of
; the host, i.e., MemRef_arr becomes polly.access.cast.MemRef_arr. Since the
; instruction is annotated with a DILocation, copying the instruction also copies
; the metadata into the GPUModule. This stops codegenerating the ptx_kernel by
; failing the verification of the Module in GPUNodeBuilder::finalize, due to the
; copied DICompileUnit not being listed in a llvm.dbg.cu which was neither copied
; nor created.
;
; https://reviews.llvm.org/D35630 removes this debug metadata before the
; instruction is copied to the GPUModule.
;
; vec_add_1.c:
; void vec_add_1(int N, int arr[N]) {
; int i=0;
; for( i=0 ; i<N ; i++) arr[i] += 1;
; }
;
source_filename = "vec_add_1.c"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; arr[i] += 1 for i = 0..N, compiled with full debug info. The !dbg metadata is
; the point of the test: the <<<LeakyInst>>> add below carries a DILocation
; which must be stripped before the instruction is copied into the GPUModule
; (otherwise module verification fails on a dangling DICompileUnit).
define void @vec_add_1(i32 %N, i32* %arr) !dbg !7 {
entry:
call void @llvm.dbg.value(metadata i32 %N, i64 0, metadata !13, metadata !16), !dbg !17
call void @llvm.dbg.value(metadata i32* %arr, i64 0, metadata !14, metadata !16), !dbg !18
call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !15, metadata !16), !dbg !19
%tmp = sext i32 %N to i64, !dbg !20
br label %for.cond, !dbg !20
; Loop header: i = %indvars.iv, continue while i < (i64)N.
for.cond: ; preds = %for.inc, %entry
%indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
call void @llvm.dbg.value(metadata i32 undef, i64 0, metadata !15, metadata !16), !dbg !19
%cmp = icmp slt i64 %indvars.iv, %tmp, !dbg !22
br i1 %cmp, label %for.body, label %for.end, !dbg !24
; Body: arr[i] = arr[i] + 1; the add is the instruction that leaked metadata.
for.body: ; preds = %for.cond
%arrayidx = getelementptr inbounds i32, i32* %arr, i64 %indvars.iv, !dbg !25
%tmp1 = load i32, i32* %arrayidx, align 4, !dbg !26, !tbaa !27
%add = add nsw i32 %tmp1, 1, !dbg !26 ; <<<LeakyInst>>>
store i32 %add, i32* %arrayidx, align 4, !dbg !26, !tbaa !27
br label %for.inc, !dbg !25
for.inc: ; preds = %for.body
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !31
call void @llvm.dbg.value(metadata !2, i64 0, metadata !15, metadata !16), !dbg !19
br label %for.cond, !dbg !32, !llvm.loop !33
for.end: ; preds = %for.cond
ret void, !dbg !35
}
declare void @llvm.dbg.declare(metadata, metadata, metadata)
declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 5.0.0 (http://llvm.org/git/clang.git 23e042ffe07a923db2dbebf4d2a3692c5a454fee) (http://llvm.org/git/llvm.git 39c5686a1f54884f12120927b1753a750fdb5e02)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
!1 = !DIFile(filename: "vec_add_1.c", directory: "/tmp")
!2 = !{}
!3 = !{i32 2, !"Dwarf Version", i32 4}
!4 = !{i32 2, !"Debug Info Version", i32 3}
!5 = !{i32 1, !"wchar_size", i32 4}
!6 = !{!"clang version 5.0.0 (http://llvm.org/git/clang.git 23e042ffe07a923db2dbebf4d2a3692c5a454fee) (http://llvm.org/git/llvm.git 39c5686a1f54884f12120927b1753a750fdb5e02)"}
!7 = distinct !DISubprogram(name: "vec_add_1", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !12)
!8 = !DISubroutineType(types: !9)
!9 = !{null, !10, !11}
!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
!11 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64)
!12 = !{!13, !14, !15}
!13 = !DILocalVariable(name: "N", arg: 1, scope: !7, file: !1, line: 1, type: !10)
!14 = !DILocalVariable(name: "arr", arg: 2, scope: !7, file: !1, line: 1, type: !11)
!15 = !DILocalVariable(name: "i", scope: !7, file: !1, line: 2, type: !10)
!16 = !DIExpression()
!17 = !DILocation(line: 1, column: 20, scope: !7)
!18 = !DILocation(line: 1, column: 27, scope: !7)
!19 = !DILocation(line: 2, column: 7, scope: !7)
!20 = !DILocation(line: 3, column: 8, scope: !21)
!21 = distinct !DILexicalBlock(scope: !7, file: !1, line: 3, column: 3)
!22 = !DILocation(line: 3, column: 15, scope: !23)
!23 = distinct !DILexicalBlock(scope: !21, file: !1, line: 3, column: 3)
!24 = !DILocation(line: 3, column: 3, scope: !21)
!25 = !DILocation(line: 3, column: 25, scope: !23)
!26 = !DILocation(line: 3, column: 32, scope: !23)
!27 = !{!28, !28, i64 0}
!28 = !{!"int", !29, i64 0}
!29 = !{!"omnipotent char", !30, i64 0}
!30 = !{!"Simple C/C++ TBAA"}
!31 = !DILocation(line: 3, column: 21, scope: !23)
!32 = !DILocation(line: 3, column: 3, scope: !23)
!33 = distinct !{!33, !24, !34}
!34 = !DILocation(line: 3, column: 35, scope: !21)
!35 = !DILocation(line: 4, column: 1, scope: !7)

View File

@@ -0,0 +1,256 @@
; RUN: opt %loadPolly -polly-scops -analyze < %s | FileCheck %s
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-schedule \
; RUN: -disable-output < %s | \
; RUN: FileCheck -check-prefix=SCHED %s
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
; RUN: -disable-output < %s | \
; RUN: FileCheck -check-prefix=CODE %s
; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \
; RUN: FileCheck %s -check-prefix=IR
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
; RUN: -disable-output < %s | \
; RUN: FileCheck %s -check-prefix=KERNEL-IR
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-asm \
; RUN: -disable-output < %s | \
; RUN: FileCheck %s -check-prefix=KERNEL-ASM
; REQUIRES: pollyacc,nvptx
; CHECK: Stmt_bb5
; CHECK-NEXT: Domain :=
; CHECK-NEXT: { Stmt_bb5[i0, i1] : 0 <= i0 <= 1023 and 0 <= i1 <= 1023 };
; CHECK-NEXT: Schedule :=
; CHECK-NEXT: { Stmt_bb5[i0, i1] -> [i0, i1] };
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
; CHECK-NEXT: { Stmt_bb5[i0, i1] -> MemRef_A[i0, i1] };
; CHECK-NEXT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0]
; CHECK-NEXT: { Stmt_bb5[i0, i1] -> MemRef_A[i0, i1] };
; SCHED: domain: "{ Stmt_bb5[i0, i1] : 0 <= i0 <= 1023 and 0 <= i1 <= 1023 }"
; SCHED-NEXT: child:
; SCHED-NEXT: context: "{ [] }"
; SCHED-NEXT: child:
; SCHED-NEXT: extension: "{ [] -> from_device_MemRef_A[]; [] -> to_device_MemRef_A[] }"
; SCHED-NEXT: child:
; SCHED-NEXT: sequence:
; SCHED-NEXT: - filter: "{ to_device_MemRef_A[] }"
; SCHED-NEXT: child:
; SCHED-NEXT: set:
; SCHED-NEXT: - filter: "{ to_device_MemRef_A[] }"
; SCHED-NEXT: child:
; SCHED-NEXT: guard: "{ [] }"
; SCHED-NEXT: - filter: "{ Stmt_bb5[i0, i1] }"
; SCHED-NEXT: child:
; SCHED-NEXT: guard: "{ [] }"
; SCHED-NEXT: child:
; SCHED-NEXT: mark: "kernel"
; SCHED-NEXT: child:
; SCHED-NEXT: context: "[b0, b1, t0, t1] -> { [] : 0 <= b0 <= 31 and 0 <= b1 <= 31 and 0 <= t0 <= 31 and 0 <= t1 <= 15 }"
; SCHED-NEXT: child:
; SCHED-NEXT: filter: "[b0, b1] -> { Stmt_bb5[i0, i1] : -31 - 32b0 + i0 <= 8192*floor((i0)/8192) <= -32b0 + i0 and -31 - 32b1 + i1 <= 8192*floor((i1)/8192) <= -32b1 + i1 }"
; SCHED-NEXT: child:
; SCHED-NEXT: schedule: "[{ Stmt_bb5[i0, i1] -> [(floor((i0)/8192))] }, { Stmt_bb5[i0, i1] -> [(floor((i1)/8192))] }]"
; SCHED-NEXT: permutable: 1
; SCHED-NEXT: coincident: [ 1, 1 ]
; SCHED-NEXT: child:
; SCHED-NEXT: filter: "[t0, t1] -> { Stmt_bb5[i0, i1] : 32*floor((-t0 + i0)/32) = -t0 + i0 and 16*floor((-t1 + i1)/16) = -t1 + i1 and 0 <= t0 <= 31 and 0 <= t1 <= 15 }"
; SCHED-NEXT: child:
; SCHED-NEXT: schedule: "[{ Stmt_bb5[i0, i1] -> [(0)] }, { Stmt_bb5[i0, i1] -> [(floor((i1)/16) - 2*floor((i1)/32))] }]"
; SCHED-NEXT: permutable: 1
; SCHED-NEXT: coincident: [ 1, 1 ]
; SCHED-NEXT: - filter: "{ from_device_MemRef_A[] }"
; SCHED-NEXT: child:
; SCHED-NEXT: set:
; SCHED-NEXT: - filter: "{ from_device_MemRef_A[] }"
; SCHED-NEXT: child:
; SCHED-NEXT: guard: "{ [] }"
; CODE: Code
; CODE-NEXT: ====
; CODE-NEXT: # host
; CODE-NEXT: {
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * (1024) * sizeof(float), cudaMemcpyHostToDevice));
; CODE-NEXT: {
; CODE-NEXT: dim3 k0_dimBlock(16, 32);
; CODE-NEXT: dim3 k0_dimGrid(32, 32);
; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
; CODE-NEXT: cudaCheckKernel();
; CODE-NEXT: }
; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * (1024) * sizeof(float), cudaMemcpyDeviceToHost));
; CODE-NEXT: }
; CODE: # kernel0
; CODE-NEXT: for (int c3 = 0; c3 <= 1; c3 += 1)
; CODE-NEXT: Stmt_bb5(32 * b0 + t0, 32 * b1 + t1 + 16 * c3);
; IR: polly.split_new_and_old:
; IR-NEXT: %0 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 1024)
; IR-NEXT: %.obit = extractvalue { i64, i1 } %0, 1
; IR-NEXT: %polly.overflow.state = or i1 false, %.obit
; IR-NEXT: %.res = extractvalue { i64, i1 } %0, 0
; IR-NEXT: %1 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %.res, i64 1024)
; IR-NEXT: %.obit1 = extractvalue { i64, i1 } %1, 1
; IR-NEXT: %polly.overflow.state2 = or i1 %polly.overflow.state, %.obit1
; IR-NEXT: %.res3 = extractvalue { i64, i1 } %1, 0
; IR-NEXT: %2 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 7, i64 %.res3)
; IR-NEXT: %.obit4 = extractvalue { i64, i1 } %2, 1
; IR-NEXT: %polly.overflow.state5 = or i1 %polly.overflow.state2, %.obit4
; IR-NEXT: %.res6 = extractvalue { i64, i1 } %2, 0
; IR-NEXT: %3 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 0, i64 %.res6)
; IR-NEXT: %.obit7 = extractvalue { i64, i1 } %3, 1
; IR-NEXT: %polly.overflow.state8 = or i1 %polly.overflow.state5, %.obit7
; IR-NEXT: %.res9 = extractvalue { i64, i1 } %3, 0
; IR-NEXT: %4 = icmp sge i64 %.res9, 2621440
; IR-NEXT: %5 = and i1 true, %4
; IR-NEXT: %polly.rtc.overflown = xor i1 %polly.overflow.state8, true
; IR-NEXT: %polly.rtc.result = and i1 %5, %polly.rtc.overflown
; IR-NEXT: br i1 %polly.rtc.result, label %polly.start, label %bb2
; IR: polly.start:
; IR-NEXT: br label %polly.acc.initialize
; IR: polly.acc.initialize:
; IR-NEXT: [[GPUContext:%.*]] = call i8* @polly_initContext()
; IR-NEXT: %p_dev_array_MemRef_A = call i8* @polly_allocateMemoryForDevice(i64 4194304)
; IR-NEXT: [[HostPtr:%.*]] = bitcast [1024 x float]* %A to i8*
; IR-NEXT: call void @polly_copyFromHostToDevice(i8* [[HostPtr]], i8* %p_dev_array_MemRef_A, i64 4194304)
; IR-NEXT: [[DevPtr:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A)
; IR-NEXT: [[ParamSlot:%.*]] = getelementptr [1 x i8*], [1 x i8*]* %polly_launch_0_params, i64 0, i64 0
; IR-NEXT: store i8* [[DevPtr]], i8** %polly_launch_0_param_0
; IR-NEXT: [[ParamTyped:%.*]] = bitcast i8** %polly_launch_0_param_0 to i8*
; IR-NEXT: store i8* [[ParamTyped]], i8** [[ParamSlot]]
; IR-NEXT: call i8* @polly_getKernel
; IR-NEXT: call void @polly_launchKernel(i8* %11, i32 32, i32 32, i32 32, i32 16, i32 1, i8* %polly_launch_0_params_i8ptr)
; IR-NEXT: call void @polly_freeKernel
; IR-NEXT: [[HostPtr2:%.*]] = bitcast [1024 x float]* %A to i8*
; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_A, i8* [[HostPtr2]], i64 4194304)
; IR-NEXT: call void @polly_freeDeviceMemory(i8* %p_dev_array_MemRef_A)
; IR-NEXT: call void @polly_freeContext(i8* [[GPUContext]])
; IR-NEXT: br label %polly.exiting
; IR: polly.exiting:
; IR-NEXT: br label %polly.merge_new_and_old
; KERNEL-IR-LABEL: define ptx_kernel void @kernel_0(i8* %MemRef_A) #0 {
; KERNEL-IR-NEXT: entry:
; KERNEL-IR-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
; KERNEL-IR-NEXT: %b0 = zext i32 %0 to i64
; KERNEL-IR-NEXT: %1 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
; KERNEL-IR-NEXT: %b1 = zext i32 %1 to i64
; KERNEL-IR-NEXT: %2 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
; KERNEL-IR-NEXT: %t0 = zext i32 %2 to i64
; KERNEL-IR-NEXT: %3 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
; KERNEL-IR-NEXT: %t1 = zext i32 %3 to i64
; KERNEL-IR-NEXT: br label %polly.loop_preheader
; KERNEL-IR-LABEL: polly.loop_exit: ; preds = %polly.stmt.bb5
; KERNEL-IR-NEXT: ret void
; KERNEL-IR-LABEL: polly.loop_header: ; preds = %polly.stmt.bb5, %polly.loop_preheader
; KERNEL-IR-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.stmt.bb5 ]
; KERNEL-IR-NEXT: %4 = mul nsw i64 32, %b0
; KERNEL-IR-NEXT: %5 = add nsw i64 %4, %t0
; KERNEL-IR-NEXT: %6 = mul nsw i64 32, %b1
; KERNEL-IR-NEXT: %7 = add nsw i64 %6, %t1
; KERNEL-IR-NEXT: %8 = mul nsw i64 16, %polly.indvar
; KERNEL-IR-NEXT: %9 = add nsw i64 %7, %8
; KERNEL-IR-NEXT: br label %polly.stmt.bb5
; KERNEL-IR-LABEL: polly.stmt.bb5: ; preds = %polly.loop_header
; KERNEL-IR-NEXT: %10 = mul i64 %5, %9
; KERNEL-IR-NEXT: %p_tmp6 = sitofp i64 %10 to float
; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A = bitcast i8* %MemRef_A to float*
; KERNEL-IR-NEXT: %11 = mul nsw i64 32, %b0
; KERNEL-IR-NEXT: %12 = add nsw i64 %11, %t0
; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A = mul nsw i64 %12, 1024
; KERNEL-IR-NEXT: %13 = mul nsw i64 32, %b1
; KERNEL-IR-NEXT: %14 = add nsw i64 %13, %t1
; KERNEL-IR-NEXT: %15 = mul nsw i64 16, %polly.indvar
; KERNEL-IR-NEXT: %16 = add nsw i64 %14, %15
; KERNEL-IR-NEXT: %polly.access.add.MemRef_A = add nsw i64 %polly.access.mul.MemRef_A, %16
; KERNEL-IR-NEXT: %polly.access.MemRef_A = getelementptr float, float* %polly.access.cast.MemRef_A, i64 %polly.access.add.MemRef_A
; KERNEL-IR-NEXT: %tmp8_p_scalar_ = load float, float* %polly.access.MemRef_A, align 4
; KERNEL-IR-NEXT: %p_tmp9 = fadd float %tmp8_p_scalar_, %p_tmp6
; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A1 = bitcast i8* %MemRef_A to float*
; KERNEL-IR-NEXT: %17 = mul nsw i64 32, %b0
; KERNEL-IR-NEXT: %18 = add nsw i64 %17, %t0
; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A2 = mul nsw i64 %18, 1024
; KERNEL-IR-NEXT: %19 = mul nsw i64 32, %b1
; KERNEL-IR-NEXT: %20 = add nsw i64 %19, %t1
; KERNEL-IR-NEXT: %21 = mul nsw i64 16, %polly.indvar
; KERNEL-IR-NEXT: %22 = add nsw i64 %20, %21
; KERNEL-IR-NEXT: %polly.access.add.MemRef_A3 = add nsw i64 %polly.access.mul.MemRef_A2, %22
; KERNEL-IR-NEXT: %polly.access.MemRef_A4 = getelementptr float, float* %polly.access.cast.MemRef_A1, i64 %polly.access.add.MemRef_A3
; KERNEL-IR-NEXT: store float %p_tmp9, float* %polly.access.MemRef_A4, align 4
; KERNEL-IR-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1
; KERNEL-IR-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar, 0
; KERNEL-IR-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit
; KERNEL-IR-LABEL: polly.loop_preheader: ; preds = %entry
; KERNEL-IR-NEXT: br label %polly.loop_header
; KERNEL-IR: attributes #0 = { "polly.skip.fn" }
; KERNEL-ASM: .version 3.2
; KERNEL-ASM-NEXT: .target sm_30
; KERNEL-ASM-NEXT: .address_size 64
; KERNEL-ASM: // .globl kernel_0
; KERNEL-ASM: .visible .entry kernel_0(
; KERNEL-ASM-NEXT: .param .u64 kernel_0_param_0
; KERNEL-ASM-NEXT: )
; void double_parallel_loop(float A[][1024]) {
; for (long i = 0; i < 1024; i++)
; for (long j = 0; j < 1024; j++)
; A[i][j] += i * j;
; }
;
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; A[i][j] += i * j over a 1024x1024 float array; both loops are parallel and
; get mapped to the 32x32 grid / 32x16 block schedule checked above.
define void @double_parallel_loop([1024 x float]* %A) {
bb:
br label %bb2
; Outer loop header: i = %i.0 in [0, 1024).
bb2: ; preds = %bb13, %bb
%i.0 = phi i64 [ 0, %bb ], [ %tmp14, %bb13 ]
%exitcond1 = icmp ne i64 %i.0, 1024
br i1 %exitcond1, label %bb3, label %bb15
bb3: ; preds = %bb2
br label %bb4
; Inner loop header: j = %j.0 in [0, 1024).
bb4: ; preds = %bb10, %bb3
%j.0 = phi i64 [ 0, %bb3 ], [ %tmp11, %bb10 ]
%exitcond = icmp ne i64 %j.0, 1024
br i1 %exitcond, label %bb5, label %bb12
; Statement Stmt_bb5: A[i][j] = A[i][j] + (float)(i * j).
bb5: ; preds = %bb4
%tmp = mul nuw nsw i64 %i.0, %j.0
%tmp6 = sitofp i64 %tmp to float
%tmp7 = getelementptr inbounds [1024 x float], [1024 x float]* %A, i64 %i.0, i64 %j.0
%tmp8 = load float, float* %tmp7, align 4
%tmp9 = fadd float %tmp8, %tmp6
store float %tmp9, float* %tmp7, align 4
br label %bb10
bb10: ; preds = %bb5
%tmp11 = add nuw nsw i64 %j.0, 1
br label %bb4
bb12: ; preds = %bb4
br label %bb13
bb13: ; preds = %bb12
%tmp14 = add nuw nsw i64 %i.0, 1
br label %bb2
bb15: ; preds = %bb2
ret void
}

View File

@@ -0,0 +1,58 @@
; RUN: opt %loadPolly < %s -analyze -polly-scops -polly-process-unprofitable -polly-invariant-load-hoisting | FileCheck %s -check-prefix=SCOPS
; RUN: opt %loadPolly -S < %s -polly-codegen-ppcg -polly-process-unprofitable -polly-invariant-load-hoisting | FileCheck %s -check-prefix=CODEGEN
; REQUIRES: pollyacc
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64"
%S = type { i32, i32, [12 x %L] }
%L = type { i32, i32, double, i32, i32, i32, i32, i32 }
; Regression test body: loads of cpi->field0 (%ns) and cpi->field1 (%nt)
; are invariant in the loop and get hoisted; %l1 is only loaded on the
; path where %cmp12.i holds, which is what the conditional Execution
; Context "l2 > 0" in the SCOPS checks below captures.
define void @test(%S* %cpi, i1 %b) {
; SCOPS-LABEL: Region: %if.then14---%exit
; SCOPS: Invariant Accesses: {
; SCOPS-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
; SCOPS-NEXT: [l2, l1] -> { Stmt_for_body_i[i0] -> MemRef_cpi[0, 0] };
; SCOPS-NEXT: Execution Context: [l2, l1] -> { : }
; SCOPS-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
; SCOPS-NEXT: [l2, l1] -> { Stmt_for_body_lr_ph_i[] -> MemRef_cpi[0, 1] };
; SCOPS-NEXT: Execution Context: [l2, l1] -> { : l2 > 0 }
; SCOPS-NEXT: }
; SCOPS: Arrays {
; SCOPS-NEXT: i32 MemRef_cpi[*][(10 * %l1)]; // Element size 4
; SCOPS-NEXT: }
; Check that we gracefully handle failing invariant loads.
; This test case is taken from:
; test/Isl/CodeGen/invariant-load-dimension.ll
; FIXME: Figure out how to actually generate code for this loop.
; CODEGEN-NOT: LLVM ERROR: preloading invariant loads failed in function
entry:
%nt = getelementptr inbounds %S, %S* %cpi, i32 0, i32 1
br i1 %b, label %if.then14, label %exit
if.then14:
; %l0 = cpi->first field; loop runs only if it is positive.
%ns = getelementptr inbounds %S, %S* %cpi, i32 0, i32 0
%l0 = load i32, i32* %ns, align 8
%cmp12.i = icmp sgt i32 %l0, 0
br i1 %cmp12.i, label %for.body.lr.ph.i, label %exit
for.body.lr.ph.i:
; %l1 (cpi->second field) scales the row stride of the inner array.
%l1 = load i32, i32* %nt, align 4
br label %for.body.i
for.body.i:
; Zeroes the first i32 of element [phi * l1] of the %L array member.
%phi = phi i32 [ 0, %for.body.lr.ph.i ], [ %inc, %for.body.i ]
%mul.i163 = mul nsw i32 %phi, %l1
%cv = getelementptr inbounds %S, %S* %cpi, i32 0, i32 2, i32 %mul.i163, i32 0
store i32 0, i32* %cv, align 8
%inc = add nuw nsw i32 %phi, 1
%l2 = load i32, i32* %ns, align 8
%cmp.i164 = icmp slt i32 %inc, %l2
br i1 %cmp.i164, label %for.body.i, label %exit
exit:
ret void
}

View File

@@ -0,0 +1,42 @@
; RUN: opt %loadPolly -S < %s -polly-codegen-ppcg \
; RUN: -polly-invariant-load-hoisting | FileCheck %s -check-prefix=CODEGEN
; REQUIRES: pollyacc
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64"
%S = type { i32, i32, [12 x %L] }
%L = type { i32, i32, double, i32, i32, i32, i32, i32 }
; Same CFG as the SCoP-dump variant above; here only the host-IR preload
; guard is checked: codegen must emit a preload block whose condition
; folds to false rather than crashing on the failing invariant load.
define void @test(%S* %cpi, i1 %b) {
; CODEGEN-LABEL: @test(
; CODEGEN: polly.preload.begin:
; CODEGEN-NEXT: br i1 false
entry:
%nt = getelementptr inbounds %S, %S* %cpi, i32 0, i32 1
br i1 %b, label %if.then14, label %exit
if.then14:
; Loop is entered only when cpi->first field is positive.
%ns = getelementptr inbounds %S, %S* %cpi, i32 0, i32 0
%l0 = load i32, i32* %ns, align 8
%cmp12.i = icmp sgt i32 %l0, 0
br i1 %cmp12.i, label %for.body.lr.ph.i, label %exit
for.body.lr.ph.i:
; Invariant load of the stride used inside the loop body.
%l1 = load i32, i32* %nt, align 4
br label %for.body.i
for.body.i:
%phi = phi i32 [ 0, %for.body.lr.ph.i ], [ %inc, %for.body.i ]
%mul.i163 = mul nsw i32 %phi, %l1
%cv = getelementptr inbounds %S, %S* %cpi, i32 0, i32 2, i32 %mul.i163, i32 0
store i32 0, i32* %cv, align 8
%inc = add nuw nsw i32 %phi, 1
%l2 = load i32, i32* %ns, align 8
%cmp.i164 = icmp slt i32 %inc, %l2
br i1 %cmp.i164, label %for.body.i, label %exit
exit:
ret void
}

View File

@@ -0,0 +1,176 @@
; RUN: opt %loadPolly -polly-codegen-ppcg -disable-output \
; RUN: -polly-acc-dump-code < %s | FileCheck %s -check-prefix=CODE
; RUN: opt %loadPolly -polly-codegen-ppcg -disable-output \
; RUN: -polly-acc-dump-kernel-ir < %s | FileCheck %s -check-prefix=KERNEL-IR
; RUN: opt %loadPolly -polly-codegen-ppcg \
; RUN: -S < %s | FileCheck %s -check-prefix=IR
; void foo(float A[2][100]) {
; for (long t = 0; t < 100; t++)
; for (long i = 1; i < 99; i++)
; A[(t + 1) % 2][i] += A[t % 2][i - 1] + A[t % 2][i] + A[t % 2][i + 1];
; }
; REQUIRES: pollyacc
; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (2) * (100) * sizeof(float), cudaMemcpyHostToDevice));
; CODE-NEXT: for (int c0 = 0; c0 <= 99; c0 += 1)
; CODE-NEXT: {
; CODE-NEXT: dim3 k0_dimBlock(32);
; CODE-NEXT: dim3 k0_dimGrid(4);
; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, c0);
; CODE-NEXT: cudaCheckKernel();
; CODE-NEXT: }
; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (2) * (100) * sizeof(float), cudaMemcpyDeviceToHost));
; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_A));
; CODE-NEXT: }
; IR-LABEL: polly.loop_header: ; preds = %polly.loop_header, %polly.loop_preheader
; IR-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.loop_header ]
; ...
; IR: store i64 %polly.indvar, i64* %polly_launch_0_param_1
; IR-NEXT: [[REGA:%.+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1
; IR-NEXT: [[REGB:%.+]] = bitcast i64* %polly_launch_0_param_1 to i8*
; IR-NEXT: store i8* [[REGB]], i8** [[REGA]]
; IR: call i8* @polly_getKernel
; ...
; IR: call void @polly_freeKernel
; IR-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1
; IR-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar_next, 99
; IR-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit
; KERNEL-IR: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_A, i64 %c0)
; KERNEL-IR-LABEL: entry:
; KERNEL-IR-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
; KERNEL-IR-NEXT: %b0 = zext i32 %0 to i64
; KERNEL-IR-NEXT: %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
; KERNEL-IR-NEXT: %t0 = zext i32 %1 to i64
; KERNEL-IR-NEXT: br label %polly.cond
; KERNEL-IR-LABEL: polly.cond: ; preds = %entry
; KERNEL-IR-NEXT: %2 = mul nsw i64 32, %b0
; KERNEL-IR-NEXT: %3 = add nsw i64 %2, %t0
; KERNEL-IR-NEXT: %4 = icmp sle i64 %3, 97
; KERNEL-IR-NEXT: br i1 %4, label %polly.then, label %polly.else
; KERNEL-IR-LABEL: polly.merge: ; preds = %polly.else, %polly.stmt.for.body3
; KERNEL-IR-NEXT: ret void
; KERNEL-IR-LABEL: polly.then: ; preds = %polly.cond
; KERNEL-IR-NEXT: %5 = mul nsw i64 32, %b0
; KERNEL-IR-NEXT: %6 = add nsw i64 %5, %t0
; KERNEL-IR-NEXT: br label %polly.stmt.for.body3
; KERNEL-IR-LABEL: polly.stmt.for.body3: ; preds = %polly.then
; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)*
; KERNEL-IR-NEXT: %pexp.pdiv_r = urem i64 %c0, 2
; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A = mul nsw i64 %pexp.pdiv_r, 100
; KERNEL-IR-NEXT: %7 = mul nsw i64 32, %b0
; KERNEL-IR-NEXT: %8 = add nsw i64 %7, %t0
; KERNEL-IR-NEXT: %polly.access.add.MemRef_A = add nsw i64 %polly.access.mul.MemRef_A, %8
; KERNEL-IR-NEXT: %polly.access.MemRef_A = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A, i64 %polly.access.add.MemRef_A
; KERNEL-IR-NEXT: %tmp_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A, align 4
; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A1 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)*
; KERNEL-IR-NEXT: %pexp.pdiv_r2 = urem i64 %c0, 2
; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A3 = mul nsw i64 %pexp.pdiv_r2, 100
; KERNEL-IR-NEXT: %9 = mul nsw i64 32, %b0
; KERNEL-IR-NEXT: %10 = add nsw i64 %9, %t0
; KERNEL-IR-NEXT: %11 = add nsw i64 %10, 1
; KERNEL-IR-NEXT: %polly.access.add.MemRef_A4 = add nsw i64 %polly.access.mul.MemRef_A3, %11
; KERNEL-IR-NEXT: %polly.access.MemRef_A5 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A1, i64 %polly.access.add.MemRef_A4
; KERNEL-IR-NEXT: %tmp2_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A5, align 4
; KERNEL-IR-NEXT: %p_add = fadd float %tmp_p_scalar_, %tmp2_p_scalar_
; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A6 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)*
; KERNEL-IR-NEXT: %pexp.pdiv_r7 = urem i64 %c0, 2
; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A8 = mul nsw i64 %pexp.pdiv_r7, 100
; KERNEL-IR-NEXT: %12 = mul nsw i64 32, %b0
; KERNEL-IR-NEXT: %13 = add nsw i64 %12, %t0
; KERNEL-IR-NEXT: %14 = add nsw i64 %13, 2
; KERNEL-IR-NEXT: %polly.access.add.MemRef_A9 = add nsw i64 %polly.access.mul.MemRef_A8, %14
; KERNEL-IR-NEXT: %polly.access.MemRef_A10 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A6, i64 %polly.access.add.MemRef_A9
; KERNEL-IR-NEXT: %tmp3_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A10, align 4
; KERNEL-IR-NEXT: %p_add12 = fadd float %p_add, %tmp3_p_scalar_
; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A11 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)*
; KERNEL-IR-NEXT: %15 = add nsw i64 %c0, 1
; KERNEL-IR-NEXT: %pexp.pdiv_r12 = urem i64 %15, 2
; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A13 = mul nsw i64 %pexp.pdiv_r12, 100
; KERNEL-IR-NEXT: %16 = mul nsw i64 32, %b0
; KERNEL-IR-NEXT: %17 = add nsw i64 %16, %t0
; KERNEL-IR-NEXT: %18 = add nsw i64 %17, 1
; KERNEL-IR-NEXT: %polly.access.add.MemRef_A14 = add nsw i64 %polly.access.mul.MemRef_A13, %18
; KERNEL-IR-NEXT: %polly.access.MemRef_A15 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A11, i64 %polly.access.add.MemRef_A14
; KERNEL-IR-NEXT: %tmp4_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A15, align 4
; KERNEL-IR-NEXT: %p_add17 = fadd float %tmp4_p_scalar_, %p_add12
; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A16 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)*
; KERNEL-IR-NEXT: %19 = add nsw i64 %c0, 1
; KERNEL-IR-NEXT: %pexp.pdiv_r17 = urem i64 %19, 2
; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A18 = mul nsw i64 %pexp.pdiv_r17, 100
; KERNEL-IR-NEXT: %20 = mul nsw i64 32, %b0
; KERNEL-IR-NEXT: %21 = add nsw i64 %20, %t0
; KERNEL-IR-NEXT: %22 = add nsw i64 %21, 1
; KERNEL-IR-NEXT: %polly.access.add.MemRef_A19 = add nsw i64 %polly.access.mul.MemRef_A18, %22
; KERNEL-IR-NEXT: %polly.access.MemRef_A20 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A16, i64 %polly.access.add.MemRef_A19
; KERNEL-IR-NEXT: store float %p_add17, float addrspace(1)* %polly.access.MemRef_A20, align 4
; KERNEL-IR-NEXT: br label %polly.merge
; KERNEL-IR-LABEL: polly.else: ; preds = %polly.cond
; KERNEL-IR-NEXT: br label %polly.merge
; KERNEL-IR-NEXT: }
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; Two-row time stencil over A[2][100] (see C source above): for each time
; step t, row (t+1)%2 is updated from the three neighbours in row t%2.
; The sequential t-loop stays on the host; the parallel i-loop becomes
; the GPU kernel launched once per t (see CODE/IR checks above).
define void @foo([100 x float]* %A) {
entry:
br label %for.cond
; Time loop: t counts 0 .. 99 (sequential, host side).
for.cond: ; preds = %for.inc18, %entry
%t.0 = phi i64 [ 0, %entry ], [ %inc19, %for.inc18 ]
%exitcond1 = icmp ne i64 %t.0, 100
br i1 %exitcond1, label %for.body, label %for.end20
for.body: ; preds = %for.cond
br label %for.cond1
; Space loop: i counts 1 .. 98 (parallel, kernel side).
for.cond1: ; preds = %for.inc, %for.body
%i.0 = phi i64 [ 1, %for.body ], [ %inc, %for.inc ]
%exitcond = icmp ne i64 %i.0, 99
br i1 %exitcond, label %for.body3, label %for.end
for.body3: ; preds = %for.cond1
; A[(t+1)%2][i] += A[t%2][i-1] + A[t%2][i] + A[t%2][i+1]
%sub = add nsw i64 %i.0, -1
%rem = srem i64 %t.0, 2
%arrayidx4 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem, i64 %sub
%tmp = load float, float* %arrayidx4, align 4
%rem5 = srem i64 %t.0, 2
%arrayidx7 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem5, i64 %i.0
%tmp2 = load float, float* %arrayidx7, align 4
%add = fadd float %tmp, %tmp2
%add8 = add nuw nsw i64 %i.0, 1
%rem9 = srem i64 %t.0, 2
%arrayidx11 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem9, i64 %add8
%tmp3 = load float, float* %arrayidx11, align 4
%add12 = fadd float %add, %tmp3
%add13 = add nuw nsw i64 %t.0, 1
%rem14 = srem i64 %add13, 2
%arrayidx16 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem14, i64 %i.0
%tmp4 = load float, float* %arrayidx16, align 4
%add17 = fadd float %tmp4, %add12
store float %add17, float* %arrayidx16, align 4
br label %for.inc
for.inc: ; preds = %for.body3
%inc = add nuw nsw i64 %i.0, 1
br label %for.cond1
for.end: ; preds = %for.cond1
br label %for.inc18
for.inc18: ; preds = %for.end
%inc19 = add nuw nsw i64 %t.0, 1
br label %for.cond
for.end20: ; preds = %for.cond
ret void
}

View File

@@ -0,0 +1,204 @@
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
; RUN: -polly-invariant-load-hoisting=false \
; RUN: -disable-output < %s | \
; RUN: FileCheck -check-prefix=CODE %s
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
; RUN: -polly-invariant-load-hoisting=false \
; RUN: -disable-output < %s | \
; RUN: FileCheck -check-prefix=KERNEL-IR %s
; REQUIRES: pollyacc
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
declare void @llvm.lifetime.start(i64, i8* nocapture) #0
; This test case tests that we can correctly handle a ScopStmt that is
; scheduled on the host, instead of within a kernel.
; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (512) * (512) * sizeof(double), cudaMemcpyHostToDevice));
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_R, MemRef_R, (p_0 + 1) * (512) * sizeof(double), cudaMemcpyHostToDevice));
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_Q, MemRef_Q, (512) * (512) * sizeof(double), cudaMemcpyHostToDevice));
; CODE-NEXT: {
; CODE-NEXT: dim3 k0_dimBlock(32);
; CODE-NEXT: dim3 k0_dimGrid(16);
; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, dev_MemRef_R, dev_MemRef_Q, p_0, p_1);
; CODE-NEXT: cudaCheckKernel();
; CODE-NEXT: }
; CODE: if (p_0 <= 510 && p_1 <= 510) {
; CODE-NEXT: {
; CODE-NEXT: dim3 k1_dimBlock(32);
; CODE-NEXT: dim3 k1_dimGrid(p_1 <= -1048034 ? 32768 : -p_1 + floord(31 * p_1 + 30, 32) + 16);
; CODE-NEXT: kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_A, dev_MemRef_R, dev_MemRef_Q, p_0, p_1);
; CODE-NEXT: cudaCheckKernel();
; CODE-NEXT: }
; CODE: {
; CODE-NEXT: dim3 k2_dimBlock(16, 32);
; CODE-NEXT: dim3 k2_dimGrid(16, p_1 <= -7650 ? 256 : -p_1 + floord(31 * p_1 + 30, 32) + 16);
; CODE-NEXT: kernel2 <<<k2_dimGrid, k2_dimBlock>>> (dev_MemRef_A, dev_MemRef_R, dev_MemRef_Q, p_0, p_1);
; CODE-NEXT: cudaCheckKernel();
; CODE-NEXT: }
; CODE: }
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (512) * (512) * sizeof(double), cudaMemcpyDeviceToHost));
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_R, dev_MemRef_R, (p_0 + 1) * (512) * sizeof(double), cudaMemcpyDeviceToHost));
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_Q, dev_MemRef_Q, (512) * (512) * sizeof(double), cudaMemcpyDeviceToHost));
; CODE-NEXT: Stmt_for_cond33_preheader();
; CODE: }
; CODE: # kernel0
; CODE-NEXT: Stmt_for_body16(32 * b0 + t0);
; CODE: # kernel1
; CODE-NEXT: for (int c0 = 0; c0 <= (-p_1 - 32 * b0 + 510) / 1048576; c0 += 1)
; CODE-NEXT: for (int c1 = 0; c1 <= 15; c1 += 1) {
; CODE-NEXT: if (p_1 + 32 * b0 + t0 + 1048576 * c0 <= 510 && c1 == 0)
; CODE-NEXT: Stmt_for_body35(32 * b0 + t0 + 1048576 * c0);
; CODE-NEXT: if (p_1 + 32 * b0 + t0 + 1048576 * c0 <= 510)
; CODE-NEXT: for (int c3 = 0; c3 <= 31; c3 += 1)
; CODE-NEXT: Stmt_for_body42(32 * b0 + t0 + 1048576 * c0, 32 * c1 + c3);
; CODE-NEXT: sync0();
; CODE-NEXT: }
; CODE: # kernel2
; CODE-NEXT: for (int c0 = 0; c0 <= (-p_1 - 32 * b0 + 510) / 8192; c0 += 1)
; CODE-NEXT: if (p_1 + 32 * b0 + t0 + 8192 * c0 <= 510)
; CODE-NEXT: for (int c3 = 0; c3 <= 1; c3 += 1)
; CODE-NEXT: Stmt_for_body62(32 * b0 + t0 + 8192 * c0, 32 * b1 + t1 + 16 * c3);
; KERNEL-IR: call void @llvm.nvvm.barrier0()
; Function Attrs: nounwind uwtable
; Gram-Schmidt QR factorization (512x512, polybench-style): for each
; column k (%indvars.iv24) compute nrm = sqrt(sum A[i][k]^2), store it in
; R[k][k], normalize into Q[.][k], then update R[k][j] and A[.][j] for
; all later columns j (%indvars.iv21). The sqrt call in %for.end is the
; ScopStmt that must be scheduled on the host (see CODE checks above);
; the surrounding loops map to kernels 0-2.
define internal void @kernel_gramschmidt(i32 %ni, i32 %nj, [512 x double]* %A, [512 x double]* %R, [512 x double]* %Q) #1 {
entry:
br label %entry.split
entry.split: ; preds = %entry
br label %for.cond1.preheader
; Column loop over k: %indvars.iv24 = k, %indvars.iv19 = k + 1.
for.cond1.preheader: ; preds = %entry.split, %for.inc86
%indvars.iv24 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next25, %for.inc86 ]
%indvars.iv19 = phi i64 [ 1, %entry.split ], [ %indvars.iv.next20, %for.inc86 ]
br label %for.inc
; Reduction: nrm accumulates A[i][k] * A[i][k] for i = 0 .. 511.
for.inc: ; preds = %for.cond1.preheader, %for.inc
%indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.inc ]
%nrm.02 = phi double [ 0.000000e+00, %for.cond1.preheader ], [ %add, %for.inc ]
%arrayidx5 = getelementptr inbounds [512 x double], [512 x double]* %A, i64 %indvars.iv, i64 %indvars.iv24
%tmp = load double, double* %arrayidx5, align 8, !tbaa !1
%arrayidx9 = getelementptr inbounds [512 x double], [512 x double]* %A, i64 %indvars.iv, i64 %indvars.iv24
%tmp27 = load double, double* %arrayidx9, align 8, !tbaa !1
%mul = fmul double %tmp, %tmp27
%add = fadd double %nrm.02, %mul
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp ne i64 %indvars.iv.next, 512
br i1 %exitcond, label %for.inc, label %for.end
; R[k][k] = sqrt(nrm) — the host-scheduled statement.
for.end: ; preds = %for.inc
%add.lcssa = phi double [ %add, %for.inc ]
%call = tail call double @sqrt(double %add.lcssa) #2
%arrayidx13 = getelementptr inbounds [512 x double], [512 x double]* %R, i64 %indvars.iv24, i64 %indvars.iv24
store double %call, double* %arrayidx13, align 8, !tbaa !1
br label %for.body16
; Guard: inner j-loops run only while k + 1 < 512.
for.cond33.preheader: ; preds = %for.body16
%indvars.iv.next25 = add nuw nsw i64 %indvars.iv24, 1
%cmp347 = icmp slt i64 %indvars.iv.next25, 512
br i1 %cmp347, label %for.body35.lr.ph, label %for.inc86
for.body35.lr.ph: ; preds = %for.cond33.preheader
br label %for.body35
; Normalize: Q[i][k] = A[i][k] / R[k][k] for i = 0 .. 511 (kernel 0).
for.body16: ; preds = %for.end, %for.body16
%indvars.iv10 = phi i64 [ 0, %for.end ], [ %indvars.iv.next11, %for.body16 ]
%arrayidx20 = getelementptr inbounds [512 x double], [512 x double]* %A, i64 %indvars.iv10, i64 %indvars.iv24
%tmp28 = load double, double* %arrayidx20, align 8, !tbaa !1
%arrayidx24 = getelementptr inbounds [512 x double], [512 x double]* %R, i64 %indvars.iv24, i64 %indvars.iv24
%tmp29 = load double, double* %arrayidx24, align 8, !tbaa !1
%div = fdiv double %tmp28, %tmp29
%arrayidx28 = getelementptr inbounds [512 x double], [512 x double]* %Q, i64 %indvars.iv10, i64 %indvars.iv24
store double %div, double* %arrayidx28, align 8, !tbaa !1
%indvars.iv.next11 = add nuw nsw i64 %indvars.iv10, 1
%exitcond12 = icmp ne i64 %indvars.iv.next11, 512
br i1 %exitcond12, label %for.body16, label %for.cond33.preheader
; Latch of the j-loop: j++ until 512.
for.cond33.loopexit: ; preds = %for.body62
%indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1
%lftr.wideiv = trunc i64 %indvars.iv.next22 to i32
%exitcond23 = icmp ne i32 %lftr.wideiv, 512
br i1 %exitcond23, label %for.body35, label %for.cond33.for.inc86_crit_edge
; Column update loop over j = k + 1 .. 511: R[k][j] = 0, then accumulate.
for.body35: ; preds = %for.body35.lr.ph, %for.cond33.loopexit
%indvars.iv21 = phi i64 [ %indvars.iv19, %for.body35.lr.ph ], [ %indvars.iv.next22, %for.cond33.loopexit ]
%arrayidx39 = getelementptr inbounds [512 x double], [512 x double]* %R, i64 %indvars.iv24, i64 %indvars.iv21
store double 0.000000e+00, double* %arrayidx39, align 8, !tbaa !1
br label %for.body42
for.cond60.preheader: ; preds = %for.body42
br label %for.body62
; R[k][j] += Q[i][k] * A[i][j] for i = 0 .. 511.
for.body42: ; preds = %for.body35, %for.body42
%indvars.iv13 = phi i64 [ 0, %for.body35 ], [ %indvars.iv.next14, %for.body42 ]
%arrayidx46 = getelementptr inbounds [512 x double], [512 x double]* %Q, i64 %indvars.iv13, i64 %indvars.iv24
%tmp30 = load double, double* %arrayidx46, align 8, !tbaa !1
%arrayidx50 = getelementptr inbounds [512 x double], [512 x double]* %A, i64 %indvars.iv13, i64 %indvars.iv21
%tmp31 = load double, double* %arrayidx50, align 8, !tbaa !1
%mul51 = fmul double %tmp30, %tmp31
%arrayidx55 = getelementptr inbounds [512 x double], [512 x double]* %R, i64 %indvars.iv24, i64 %indvars.iv21
%tmp32 = load double, double* %arrayidx55, align 8, !tbaa !1
%add56 = fadd double %tmp32, %mul51
store double %add56, double* %arrayidx55, align 8, !tbaa !1
%indvars.iv.next14 = add nuw nsw i64 %indvars.iv13, 1
%exitcond15 = icmp ne i64 %indvars.iv.next14, 512
br i1 %exitcond15, label %for.body42, label %for.cond60.preheader
; A[i][j] -= Q[i][k] * R[k][j] for i = 0 .. 511.
for.body62: ; preds = %for.cond60.preheader, %for.body62
%indvars.iv16 = phi i64 [ 0, %for.cond60.preheader ], [ %indvars.iv.next17, %for.body62 ]
%arrayidx66 = getelementptr inbounds [512 x double], [512 x double]* %A, i64 %indvars.iv16, i64 %indvars.iv21
%tmp33 = load double, double* %arrayidx66, align 8, !tbaa !1
%arrayidx70 = getelementptr inbounds [512 x double], [512 x double]* %Q, i64 %indvars.iv16, i64 %indvars.iv24
%tmp34 = load double, double* %arrayidx70, align 8, !tbaa !1
%arrayidx74 = getelementptr inbounds [512 x double], [512 x double]* %R, i64 %indvars.iv24, i64 %indvars.iv21
%tmp35 = load double, double* %arrayidx74, align 8, !tbaa !1
%mul75 = fmul double %tmp34, %tmp35
%sub = fsub double %tmp33, %mul75
%arrayidx79 = getelementptr inbounds [512 x double], [512 x double]* %A, i64 %indvars.iv16, i64 %indvars.iv21
store double %sub, double* %arrayidx79, align 8, !tbaa !1
%indvars.iv.next17 = add nuw nsw i64 %indvars.iv16, 1
%exitcond18 = icmp ne i64 %indvars.iv.next17, 512
br i1 %exitcond18, label %for.body62, label %for.cond33.loopexit
for.cond33.for.inc86_crit_edge: ; preds = %for.cond33.loopexit
br label %for.inc86
; Column-loop latch: k++ until 512.
for.inc86: ; preds = %for.cond33.for.inc86_crit_edge, %for.cond33.preheader
%indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1
%exitcond26 = icmp ne i64 %indvars.iv.next25, 512
br i1 %exitcond26, label %for.cond1.preheader, label %for.end88
for.end88: ; preds = %for.inc86
ret void
}
; Function Attrs: argmemonly nounwind
declare void @llvm.lifetime.end(i64, i8* nocapture) #0
; Function Attrs: nounwind
declare double @sqrt(double) #2
attributes #0 = { argmemonly nounwind }
attributes #1 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind }
!llvm.ident = !{!0}
!0 = !{!"clang version 3.9.0 (trunk 275267) (llvm/trunk 275268)"}
!1 = !{!2, !2, i64 0}
!2 = !{!"double", !3, i64 0}
!3 = !{!"omnipotent char", !4, i64 0}
!4 = !{!"Simple C/C++ TBAA"}

View File

@@ -0,0 +1,41 @@
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
; RUN: -disable-output < %s | \
; RUN: FileCheck -check-prefix=CODE %s
; REQUIRES: pollyacc
; CODE: Code
; CODE: ====
; CODE: No code generated
source_filename = "bugpoint-output-83bcdeb.bc"
target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-unknown-linux-gnu"
@__data_radiation_MOD_cobi = external global [168 x double], align 32
; Function Attrs: nounwind uwtable
; Bugpoint-reduced reproducer (note the `undef` operands): a conditional
; preload of one element of @__data_radiation_MOD_cobi. Polly must report
; "No code generated" rather than crash (see CODE checks above). Do not
; "fix" the undefs — they are part of the reduced test.
define void @__radiation_rg_MOD_coe_so() #0 {
entry:
%polly.access.kspec.load = load i32, i32* undef, align 4
%0 = or i1 undef, undef
br label %polly.preload.cond29
polly.preload.cond29: ; preds = %entry
br i1 %0, label %polly.preload.exec31, label %polly.preload.merge30
; Merge: preloaded value, or 0.0 when the preload condition was false.
polly.preload.merge30: ; preds = %polly.preload.exec31, %polly.preload.cond29
%polly.preload..merge32 = phi double [ %polly.access.__data_radiation_MOD_cobi.load, %polly.preload.exec31 ], [ 0.000000e+00, %polly.preload.cond29 ]
ret void
; Preload path: loads cobi[7 * kspec + 48].
polly.preload.exec31: ; preds = %polly.preload.cond29
%1 = sext i32 %polly.access.kspec.load to i64
%2 = mul nsw i64 7, %1
%3 = add nsw i64 0, %2
%4 = add nsw i64 %3, 48
%polly.access.__data_radiation_MOD_cobi = getelementptr double, double* getelementptr inbounds ([168 x double], [168 x double]* @__data_radiation_MOD_cobi, i32 0, i32 0), i64 %4
%polly.access.__data_radiation_MOD_cobi.load = load double, double* %polly.access.__data_radiation_MOD_cobi, align 8
br label %polly.preload.merge30
}
attributes #0 = { nounwind uwtable }

View File

@@ -0,0 +1,76 @@
; RUN: opt %loadPolly -analyze -polly-scops < %s | FileCheck %s --check-prefix=SCOP
; RUN: opt %loadPolly -analyze -polly-codegen-ppcg -polly-acc-dump-kernel-ir < %s | FileCheck %s --check-prefix=KERNEL-IR
; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s --check-prefix=HOST-IR
; Test that we do recognise and codegen a kernel that has intrinsics.
; REQUIRES: pollyacc
; Check that we model the kernel as a scop.
; SCOP: Function: f
; SCOP-NEXT: Region: %entry.split---%for.end
; Check that the intrinsic call is present in the kernel IR.
; KERNEL-IR: %p_sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val_p_scalar_)
; KERNEL-IR: declare float @llvm.sqrt.f32(float)
; KERNEL-IR: declare float @llvm.fabs.f32(float)
; Check that kernel launch is generated in host IR.
; the declare would not be generated unless a call to a kernel exists.
; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
; void f(float *A, float *B, int N) {
; for(int i = 0; i < N; i++) {
; float tmp0 = A[i];
; float tmp1 = sqrt(tmp0);
; float tmp2 = fabs(tmp1);
; float tmp3 = copysignf(tmp1, tmp2);
; B[i] = tmp3;
; }
; }
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
; B[i] = copysign(sqrt(A[i]), fabs(sqrt(A[i]))) for i = 0 .. N-1.
; Exercises codegen of kernels that call LLVM intrinsics (sqrt, fabs,
; copysign) — the KERNEL-IR checks above verify the calls survive into
; the PTX kernel module.
define void @f(float* %A, float* %B, i32 %N) {
entry:
br label %entry.split
entry.split: ; preds = %entry
; Loop executes only for N > 0.
%cmp1 = icmp sgt i32 %N, 0
br i1 %cmp1, label %for.body.lr.ph, label %for.end
for.body.lr.ph: ; preds = %entry.split
br label %for.body
for.body: ; preds = %for.body.lr.ph, %for.body
%indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
%A.arr.i = getelementptr inbounds float, float* %A, i64 %indvars.iv
%A.arr.i.val = load float, float* %A.arr.i, align 4
; Call to intrinsics that should be part of the kernel.
%sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val)
%fabs = tail call float @llvm.fabs.f32(float %sqrt);
%copysign = tail call float @llvm.copysign.f32(float %sqrt, float %fabs);
%B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv
store float %copysign, float* %B.arr.i, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%wide.trip.count = zext i32 %N to i64
%exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
for.cond.for.end_crit_edge: ; preds = %for.body
br label %for.end
for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split
ret void
}
; Function Attrs: nounwind readnone
declare float @llvm.sqrt.f32(float) #0
declare float @llvm.fabs.f32(float) #0
declare float @llvm.copysign.f32(float, float) #0
attributes #0 = { nounwind readnone }

View File

@@ -0,0 +1,47 @@
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-fail-on-verify-module-failure \
; RUN: -disable-output < %s
; Make sure that if -polly-acc-fail-on-verify-module-failure is on, we actually
; fail on an illegal module.
; REQUIRES: pollyacc, asserts
; XFAIL: *
;
; void foo(long A[1024], long B[1024]) {
; for (long i = 0; i < 1024; i++)
; A[i] += (B[i] + (long)&B[i]);
; }
; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; A[i] += B[i] + (long)&B[i] for i = 0 .. 1023. The ptrtoint of a modeled
; array address is deliberately unsupported; with
; -polly-acc-fail-on-verify-module-failure the run must abort (XFAIL'd).
define void @foo(i64* %A, i64* %B) {
bb:
br label %bb1
bb1: ; preds = %bb10, %bb
%i.0 = phi i64 [ 0, %bb ], [ %tmp11, %bb10 ]
%exitcond = icmp ne i64 %i.0, 1024
br i1 %exitcond, label %bb2, label %bb12
bb2: ; preds = %bb1
; %tmp5 = (long)&B[i] — the address escapes into arithmetic.
%tmp = getelementptr inbounds i64, i64* %B, i64 %i.0
%tmp3 = load i64, i64* %tmp, align 8
%tmp4 = getelementptr inbounds i64, i64* %B, i64 %i.0
%tmp5 = ptrtoint i64* %tmp4 to i64
%tmp6 = add nsw i64 %tmp3, %tmp5
%tmp7 = getelementptr inbounds i64, i64* %A, i64 %i.0
%tmp8 = load i64, i64* %tmp7, align 8
%tmp9 = add nsw i64 %tmp8, %tmp6
store i64 %tmp9, i64* %tmp7, align 8
br label %bb10
bb10: ; preds = %bb2
%tmp11 = add nuw nsw i64 %i.0, 1
br label %bb1
bb12: ; preds = %bb1
ret void
}

View File

@@ -0,0 +1,73 @@
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
; RUN: -disable-output < %s | \
; RUN: FileCheck -check-prefix=CODE %s
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
; RUN: -disable-output < %s | \
; RUN: not FileCheck %s -check-prefix=KERNEL-IR
; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \
; RUN: FileCheck %s -check-prefix=IR
; REQUIRES: pollyacc
;
; void foo(long A[1024], long B[1024]) {
; for (long i = 0; i < 1024; i++)
; A[i] += (B[i] + (long)&B[i]);
; }
; This kernel loads/stores a pointer address we model. This is a rare case,
; were we still lack proper code-generation support. We check here that we
; detect the invalid IR and bail out gracefully.
; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (1024) * sizeof(i64), cudaMemcpyHostToDevice));
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i64), cudaMemcpyHostToDevice));
; CODE-NEXT: {
; CODE-NEXT: dim3 k0_dimBlock(32);
; CODE-NEXT: dim3 k0_dimGrid(32);
; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_B, dev_MemRef_A);
; CODE-NEXT: cudaCheckKernel();
; CODE-NEXT: }
; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i64), cudaMemcpyDeviceToHost));
; CODE: # kernel0
; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \
; RUN: FileCheck %s -check-prefix=IR
; KERNEL-IR: kernel
; IR: br i1 false, label %polly.start, label %bb1
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; Same kernel as the XFAIL variant: A[i] += B[i] + (long)&B[i]. Here the
; expectation is a graceful bail-out — no kernel is emitted (KERNEL-IR is
; checked with `not`) and the host branch folds to `br i1 false`.
define void @foo(i64* %A, i64* %B) {
bb:
br label %bb1
bb1: ; preds = %bb10, %bb
%i.0 = phi i64 [ 0, %bb ], [ %tmp11, %bb10 ]
%exitcond = icmp ne i64 %i.0, 1024
br i1 %exitcond, label %bb2, label %bb12
bb2: ; preds = %bb1
; ptrtoint of a modeled array address — unsupported in kernel codegen.
%tmp = getelementptr inbounds i64, i64* %B, i64 %i.0
%tmp3 = load i64, i64* %tmp, align 8
%tmp4 = getelementptr inbounds i64, i64* %B, i64 %i.0
%tmp5 = ptrtoint i64* %tmp4 to i64
%tmp6 = add nsw i64 %tmp3, %tmp5
%tmp7 = getelementptr inbounds i64, i64* %A, i64 %i.0
%tmp8 = load i64, i64* %tmp7, align 8
%tmp9 = add nsw i64 %tmp8, %tmp6
store i64 %tmp9, i64* %tmp7, align 8
br label %bb10
bb10: ; preds = %bb2
%tmp11 = add nuw nsw i64 %i.0, 1
br label %bb1
bb12: ; preds = %bb1
ret void
}

View File

@@ -0,0 +1,71 @@
; RUN: opt %loadPolly -analyze -polly-scops \
; RUN: -polly-invariant-load-hoisting < %s| FileCheck %s -check-prefix=SCOP
; RUN: opt %loadPolly -S -polly-codegen-ppcg \
; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR
; REQUIRES: pollyacc
; Check that we detect a scop.
; SCOP: Function: f
; SCOP-NEXT: Region: %for.body---%for.end
; SCOP-NEXT: Max Loop Depth: 1
; SCOP-NEXT: Invariant Accesses: {
; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
; SCOP-NEXT: [tmp] -> { Stmt_for_body[i0] -> MemRef_control[0] };
; SCOP-NEXT: Execution Context: [tmp] -> { : }
; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
; SCOP-NEXT: [tmp] -> { Stmt_if_then[i0] -> MemRef_readarr[0] };
; SCOP-NEXT: Execution Context: [tmp] -> { : tmp >= 4 }
; SCOP-NEXT: }
; Check that kernel launch is generated in host IR.
; the declare would not be generated unless a call to a kernel exists.
; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
; This test makes sure that such an access pattern is handled correctly
; by PPCGCodeGeneration. It appears that not calling `preloadInvariantLoads`
; was the main reason that caused this test case to crash.
;
; void f(int *arr, const int *control, const int *readarr) {
; for(int i = 0; i < 1000; i++) {
; int t = 0;
; if (*control > 3) {
; t += *readarr;
; }
; arr[i] = t;
; }
; }
target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
target triple = "i386-apple-macosx10.12.0"
; arr[i] = (*control > 3) ? *readarr : 0 for i = 0 .. 999. Both *control
; and *readarr are loop-invariant; *readarr is loaded only under the
; condition, giving it the conditional Execution Context "tmp >= 4" in
; the SCOP checks above. Exercises preloadInvariantLoads in PPCG codegen.
define void @f(i32* %arr, i32* %control, i32* %readarr) {
entry:
br label %entry.split
entry.split: ; preds = %entry
br label %for.body
for.body: ; preds = %entry.split, %if.end
%i.01 = phi i32 [ 0, %entry.split ], [ %inc, %if.end ]
; Invariant read of *control, re-executed every iteration in the IR.
%tmp = load i32, i32* %control, align 4
%cmp1 = icmp sgt i32 %tmp, 3
br i1 %cmp1, label %if.then, label %if.end
if.then: ; preds = %for.body
; Conditionally-executed invariant read of *readarr.
%tmp1 = load i32, i32* %readarr, align 4
br label %if.end
if.end: ; preds = %if.then, %for.body
%t.0 = phi i32 [ %tmp1, %if.then ], [ 0, %for.body ]
%arrayidx = getelementptr inbounds i32, i32* %arr, i32 %i.01
store i32 %t.0, i32* %arrayidx, align 4
%inc = add nuw nsw i32 %i.01, 1
%exitcond = icmp eq i32 %inc, 1000
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %if.end
ret void
}

View File

@@ -0,0 +1,30 @@
; RUN: opt %loadPolly -S -polly-codegen-ppcg \
; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s
; REQUIRES: pollyacc
; CHECK: store i64 %polly.access.B.load, i64* %invariant.preload.s2a
; CHECK: %invariant.final_reload = load i64, i64* %invariant.preload.s2a
; Verify that the final reload of an invariant scalar memory access uses the
; same stack slot that into which the invariant memory access was stored
; originally. Earlier, this was broken as we introduce a new stack slot aside
; of the preload stack slot, which remained uninitialized and caused our escaping
; loads to contain garbage.
; Fills A[0..1025] with 42.0 while loading the loop-invariant *B, which
; escapes as the return value. The CHECK lines above verify the escaping
; final reload reuses the same preload stack slot the hoisted load was
; stored into (a previously-broken path that returned garbage).
define i64 @foo(float* %A, i64* %B) {
entry:
br label %loop
loop:
%indvar = phi i64 [0, %entry], [%indvar.next, %loop]
%indvar.next = add nsw i64 %indvar, 1
%idx = getelementptr float, float* %A, i64 %indvar
store float 42.0, float* %idx
; Invariant load — hoisted by -polly-invariant-load-hoisting.
%invariant = load i64, i64* %B
%cmp = icmp sle i64 %indvar, 1024
br i1 %cmp, label %loop, label %exit
exit:
; Escaping use of the hoisted value: must be reloaded from the preload slot.
ret i64 %invariant
}

Some files were not shown because too many files have changed in this diff Show More