Imported Upstream version 6.10.0.49

Former-commit-id: 1d6753294b2993e1fbf92de9366bb9544db4189b
This commit is contained in:
Xamarin Public Jenkins (auto-signing)
2020-01-16 16:38:04 +00:00
parent d94e79959b
commit 468663ddbb
48518 changed files with 2789335 additions and 61176 deletions

View File

@@ -0,0 +1,9 @@
; Identity stub for the NVIDIA libdevice expf intrinsic so tests can link
; without a real libdevice bitcode file.
define float @__nv_expf(float %a) {
ret float %a
}
; Identity stub for the NVIDIA libdevice cosf intrinsic (see @__nv_expf).
define float @__nv_cosf(float %a) {
ret float %a
}
; Identity stub for the NVIDIA libdevice logf intrinsic (see @__nv_expf).
define float @__nv_logf(float %a) {
ret float %a
}

View File

@@ -0,0 +1,71 @@
; RUN: opt %loadPolly -analyze -polly-scops < %s | FileCheck %s -check-prefix=SCOP
; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR
; REQUIRES: pollyacc
; Check that we detect a scop.
; SCOP: Function: checkScalarKill
; SCOP-NEXT: Region: %XLoopInit---%for.end
; SCOP-NEXT: Max Loop Depth: 1
; Check that we have a scalar that is not a phi node in the scop.
; SCOP: i32 MemRef_x_0; // Element size 4
; Check that kernel launch is generated in host IR.
; the declare would not be generated unless a call to a kernel exists.
; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
; Check that we add variables that are local to a scop into the kills that we
; pass to PPCG. This should enable PPCG to codegen this example.
; void checkScalarKill(int A[], int B[], int C[], const int control1, int control2) {
; int x;
; #pragma scop
; for(int i = 0; i < 1000; i++) {
; XLoopInit: x = 0;
;
; if (control1 > 2)
; C1Add: x += 10;
; if (control2 > 3)
; C2Add: x += A[i];
;
; BLoopAccumX: B[i] += x;
; }
;
; #pragma endscop
; }
; ModuleID = 'test.ll'
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
; IR for the C source in the comment above: for i in [0, 1000), compute a
; scop-local scalar x (0 or 10, optionally + A[i]) and accumulate it into B[i].
; The non-phi scalar %x.0 must be added to the kill list passed to PPCG.
define void @checkScalarKill(i32* %A, i32* %B, i32* %C, i32 %control1, i32 %control2) {
entry:
br label %entry.split
entry.split: ; preds = %entry
br label %XLoopInit
; Loop header, i = %indvars.iv in [0, 1000).
; The C1Add branch has been folded into a select: x.0 = control1 > 2 ? 10 : 0.
XLoopInit: ; preds = %entry.split, %BLoopAccumX
%indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %BLoopAccumX ]
%cmp1 = icmp sgt i32 %control1, 2
%x.0 = select i1 %cmp1, i32 10, i32 0
%cmp2 = icmp sgt i32 %control2, 3
br i1 %cmp2, label %C2Add, label %BLoopAccumX
; Taken only when control2 > 3: x += A[i].
C2Add: ; preds = %XLoopInit
%arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
%tmp6 = load i32, i32* %arrayidx, align 4
%add4 = add nsw i32 %tmp6, %x.0
br label %BLoopAccumX
; Merge point: %x.1 selects the updated or unmodified x, then B[i] += x.
BLoopAccumX: ; preds = %XLoopInit, %C2Add
%x.1 = phi i32 [ %add4, %C2Add ], [ %x.0, %XLoopInit ]
%arrayidx7 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
%tmp11 = load i32, i32* %arrayidx7, align 4
%add8 = add nsw i32 %tmp11, %x.1
store i32 %add8, i32* %arrayidx7, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp ne i64 %indvars.iv.next, 1000
br i1 %exitcond, label %XLoopInit, label %for.end
for.end: ; preds = %BLoopAccumX
ret void
}

View File

@@ -0,0 +1,53 @@
; RUN: opt %loadPolly -S -polly-process-unprofitable -polly-codegen-ppcg \
; RUN: -polly-invariant-load-hoisting -polly-ignore-parameter-bounds < %s | \
; RUN: FileCheck %s
; REQUIRES: pollyacc
; CHECK: polly_launchKernel
; Verify that this program compiles. At some point, this compilation crashed
; due to insufficient parameters being available.
source_filename = "bugpoint-output-4d01492.bc"
target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-unknown-linux-gnu"
%struct.barney = type { i8*, i64, i64, [2 x %struct.widget] }
%struct.widget = type { i64, i64, i64 }
@global = external unnamed_addr global %struct.barney, align 32
; Function Attrs: nounwind uwtable
; Reduced bugpoint test case (names are bugpoint-generated). An outer loop over
; %tmp2 = 1..*%arg wraps an inner loop that stores undef through a pointer
; computed from the global array descriptor @global. The descriptor loads are
; invariant and get hoisted; compilation used to crash here because not all
; required parameters were made available to the PPCG code generator.
define void @wobble(i32* noalias %arg) #0 {
bb:
%tmp = load i32, i32* %arg, align 4
br label %bb1
; Outer loop header: %tmp2 counts from 1 up to %tmp.
bb1: ; preds = %bb13, %bb
%tmp2 = phi i32 [ %tmp15, %bb13 ], [ 1, %bb ]
br label %bb3
; Inner loop: address = base + %tmp6 * %tmp2 + offset, all read from @global.
bb3: ; preds = %bb3, %bb1
%tmp4 = load i32*, i32** bitcast (%struct.barney* @global to i32**), align 32
%tmp5 = sext i32 %tmp2 to i64
%tmp6 = load i64, i64* getelementptr inbounds (%struct.barney, %struct.barney* @global, i64 0, i32 3, i64 1, i32 0), align 8
%tmp7 = mul i64 %tmp6, %tmp5
%tmp8 = add i64 %tmp7, 0
%tmp9 = load i64, i64* getelementptr inbounds (%struct.barney, %struct.barney* @global, i64 0, i32 1), align 8
%tmp10 = add i64 %tmp8, %tmp9
%tmp11 = getelementptr i32, i32* %tmp4, i64 %tmp10
store i32 undef, i32* %tmp11, align 4
; Constant-true exit condition: the inner loop runs exactly once.
%tmp12 = icmp eq i32 0, 0
br i1 %tmp12, label %bb13, label %bb3
; Outer loop latch: exit once %tmp2 reaches *%arg.
bb13: ; preds = %bb3
%tmp14 = icmp eq i32 %tmp2, %tmp
%tmp15 = add i32 %tmp2, 1
br i1 %tmp14, label %bb16, label %bb1
bb16: ; preds = %bb13
ret void
}
attributes #0 = { nounwind uwtable }

View File

@@ -0,0 +1,50 @@
; RUN: opt %loadPolly -S -polly-codegen-ppcg \
; RUN: -polly-use-llvm-names < %s
; ModuleID = 'test/GPGPU/zero-size-array.ll'
; REQUIRES: pollyacc
target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-unknown-linux-gnu"
; We used to divide the element size by 8 to arrive at the 'actual' size
; of an array element. This used to cause arrays that have an element size
; of less than 8 to collapse to size 0. This test makes sure that it does
; not happen anymore.
; f(int *niters_ptr, int *arr[0]) {
; const int niters = *niters_ptr;
; for(int i = 0; i < niters; i++) {
; arr[0][i + 1] = 0
; }
; }
; Function Attrs: nounwind uwtable
; Stores 0 into %arr[i] for i = 1..*%niters.ptr (indvar starts at 1, matching
; the `i + 1` access in the pseudocode above). The [0 x i32] array type is the
; point of the test: its element size (4) must not be divided down to 0.
define void @f(i32* noalias %niters.ptr, [0 x i32]* noalias %arr) #0 {
entry:
%niters = load i32, i32* %niters.ptr, align 4
br label %loop.body
; Main loop: arr[indvar] = 0; exits after indvar == niters iterates once more.
loop.body: ; preds = %loop.body, %entry
%indvar = phi i32 [ %indvar.next, %loop.body ], [ 1, %entry ]
%indvar.sext = sext i32 %indvar to i64
%arr.slot = getelementptr [0 x i32], [0 x i32]* %arr, i64 0, i64 %indvar.sext
store i32 0, i32* %arr.slot, align 4
%tmp8 = icmp eq i32 %indvar, %niters
%indvar.next = add i32 %indvar, 1
br i1 %tmp8, label %loop.exit, label %loop.body
loop.exit: ; preds = %loop.body
%tmp10 = icmp sgt i32 undef, 0
br label %auxiliary.loop
; Auxiliary self-loop with undef condition, kept from the original reduction;
; it is dead weight around the scop, not part of what is being tested.
auxiliary.loop: ; preds = %"101", %loop.exit
%tmp11 = phi i1 [ %tmp10, %loop.exit ], [ undef, %auxiliary.loop ]
br i1 undef, label %auxiliary.loop, label %exit
exit: ; preds = %auxiliary.loop
ret void
}
attributes #0 = { nounwind uwtable }

View File

@@ -0,0 +1,55 @@
; RUN: opt %loadPolly -S -polly-codegen-ppcg \
; RUN: -polly-ignore-parameter-bounds \
; RUN: -polly-invariant-load-hoisting < %s| FileCheck %s -check-prefix=HOST-IR
;
; REQUIRES: pollyacc
; When we have `-polly-ignore-parameter-bounds`, `Scop::Context` does not contain
; all the parameters present in the program.
;
; The construction of the `isl_multi_pw_aff` requires all the individual `pw_aff`
; to have the same parameter dimensions. To achieve this, we used to realign
; every `pw_aff` with `Scop::Context`. However, in conjunction with
; `-polly-ignore-parameter-bounds`, this is now incorrect, since `Scop::Context`
; does not contain all parameters.
;
; We check that Polly does the right thing in this case and sets up the parameter
; dimensions correctly.
; Check that kernel launch is generated in host IR.
; the declare would not be generated unless a call to a kernel exists.
; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
; ModuleID = 'test/GPGPU/bounds-construction-with-ignore-param-bounds.ll'
; C pseudocode
; ------------
; void f(int *arr, long niters, long stride) {
; for(int i = 0; i < niters; i++) {
; arr[i * stride] = 1;
; }
; }
target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: nounwind uwtable
; Strided store loop: arr[i * stride] = 1 for i = 0..niters. %stride is a
; parameter that appears only in the access function, so with
; -polly-ignore-parameter-bounds it is absent from Scop::Context — the case
; this test exercises when building the isl_multi_pw_aff (see comment above).
define void @f(i32 *%arr, i64 %niters, i64 %stride) unnamed_addr #1 {
entry:
br label %loop
loop: ; preds = %loop, %entry
%indvar = phi i64 [ 0, %entry ], [ %indvar.next, %loop ]
%idx = mul nuw nsw i64 %indvar, %stride
%slot = getelementptr i32, i32* %arr, i64 %idx
store i32 1, i32* %slot, align 4
%indvar.next = add nuw nsw i64 %indvar, 1
; Signed compare: continue while indvar.next <= niters.
%check = icmp sgt i64 %indvar.next, %niters
br i1 %check, label %exit, label %loop
exit: ; preds = %loop
ret void
}
attributes #0 = { nounwind }
attributes #1 = { nounwind uwtable }

View File

@@ -0,0 +1,102 @@
; RUN: opt %loadPolly -analyze -polly-scops \
; RUN: -polly-detect-fortran-arrays \
; RUN: -polly-invariant-load-hoisting \
; RUN: -polly-use-llvm-names \
; RUN: < %s | FileCheck %s --check-prefix=SCOP
; RUN: opt %loadPolly -S \
; RUN: -polly-detect-fortran-arrays \
; RUN: -polly-codegen-ppcg \
; RUN: -polly-invariant-load-hoisting \
; RUN: -polly-use-llvm-names \
; RUN: -polly-acc-fail-on-verify-module-failure \
; RUN: < %s | FileCheck %s --check-prefix=HOST-IR
; REQUIRES: pollyacc
; In Polly, we specifically add a parameter to represent the outermost dimension
; size of fortran arrays. We do this because this information is statically
; available from the fortran metadata generated by dragonegg.
; However, we were only materializing these parameters (meaning, creating an
; llvm::Value to back the isl_id) from *memory accesses*. This is wrong,
; we should materialize parameters from *scop array info*.
; It is wrong because if there is a case where we detect 2 fortran arrays,
; but only one of them is accessed, we may not materialize the other array's
; dimensions at all.
; This test case checks that we do not fail if there is an array that does
; not have an access (In this case, `memory`), we still generate the
; parameter.
; Check that we detect the function as a Scop.
; SCOP: Function: f
; SCOP-NEXT: Region: %loop.prepare---%for.exit
; SCOP-NEXT: Max Loop Depth: 1
; Check that we detect fortran arrays.
; SCOP: Arrays (Bounds as pw_affs) {
; SCOP: double* MemRef_global_arr[*]; // Element size 8
; SCOP-NEXT: double MemRef_memory[ [MemRef_memory_fortranarr_size] -> { [] -> [(MemRef_memory_fortranarr_size)] } ]; [BasePtrOrigin: MemRef_global_arr] // Element size 8
; SCOP-NEXT: double MemRef_memory2[ [MemRef_memory2_fortranarr_size] -> { [] -> [(MemRef_memory2_fortranarr_size)] } ]; [BasePtrOrigin: MemRef_global_arr] // Element size 8
; SCOP-NEXT: }
; Check that we have writes *only* into memory2, not into memory.
; SCOP: Statements {
; SCOP: Stmt_for_body
; SCOP: MustWriteAccess := [Reduction Type: NONE] [Fortran array descriptor: global_arr] [Scalar: 0]
; SCOP-NEXT: [start_val, end_val, offset, MemRef_memory_fortranarr_size, MemRef_memory2_fortranarr_size] -> { Stmt_for_body[i0] -> MemRef_memory2[start_val + offset + i0] };
; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Fortran array descriptor: global_arr] [Scalar: 0]
; SCOP-NEXT: [start_val, end_val, offset, MemRef_memory_fortranarr_size, MemRef_memory2_fortranarr_size] -> { Stmt_for_body[i0] -> MemRef_memory2[start_val + offset + i0] };
; SCOP-NEXT: }
; Check that we materialize the sizes and send it across to the kernel.
; HOST-IR: store i64 %MemRef_memory_size, i64* %polly_launch_0_param_4
; HOST-IR: store i64 %MemRef_memory2_size, i64* %polly_launch_0_param_5
target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-unknown-linux-gnu"
module asm "\09.ident\09\22GCC: (GNU) 4.6.4 LLVM: 3.3.1\22"
%"struct.array1_real(kind=8)" = type { i8*, i64, i64, [1 x %struct.descriptor_dimension] }
%struct.descriptor_dimension = type { i64, i64, i64 }
@global_arr = external unnamed_addr global %"struct.array1_real(kind=8)", align 32
; Function Attrs: nounwind uwtable
; Fortran-style loop from i = *ipstart to *ipend over the dragonegg-emitted
; array descriptor @global_arr. Both %memory and %memory2 load the same base
; pointer/offset; Polly detects them as two fortran arrays, but only the
; accesses through %memory2's SSA names survive into the scop statements —
; the test checks the size parameter of the unaccessed array is still emitted.
define void @f(i32* noalias %ipstart, i32* noalias %ipend) unnamed_addr #0 {
entry:
br label %loop.prepare
; Read loop bounds and skip the loop entirely when start > end.
loop.prepare: ; preds = %"6", %"3.preheader"
%start.val = load i32, i32* %ipstart, align 4
%end.val = load i32, i32* %ipend, align 4
%should.loop = icmp sgt i32 %start.val, %end.val
br i1 %should.loop, label %for.exit, label %for.body
for.body: ; preds = %for.body, %"4.preheader"
%i = phi i32 [ %i.next, %for.body ], [ %start.val, %loop.prepare ]
%i.sext = sext i32 %i to i64
; First access chain: store 1.0 at global_arr.base[offset + i].
%memory = load double*, double** bitcast (%"struct.array1_real(kind=8)"* @global_arr to double**), align 32
%offset = load i64, i64* getelementptr inbounds (%"struct.array1_real(kind=8)", %"struct.array1_real(kind=8)"* @global_arr, i64 0, i32 1), align 8
%idx = add i64 %offset, %i.sext
%slot = getelementptr double, double* %memory, i64 %idx
store double 1.0, double* %slot, align 8
; Second access chain: read back the same location via %memory2.
%memory2 = load double*, double** bitcast (%"struct.array1_real(kind=8)"* @global_arr to double**), align 32
%offset2 = load i64, i64* getelementptr inbounds (%"struct.array1_real(kind=8)", %"struct.array1_real(kind=8)"* @global_arr, i64 0, i32 1), align 8
%idx2 = add i64 %offset2, %i.sext
%slot2 = getelementptr double, double* %memory2, i64 %idx2
%val = load double, double* %slot2, align 8
%should.loopexit = icmp eq i32 %i, %end.val
%i.next = add i32 %i, 1
br i1 %should.loopexit, label %for.exit, label %for.body
for.exit: ; preds = %for.body
ret void
}
attributes #0 = { nounwind uwtable }

View File

@@ -0,0 +1,37 @@
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
; RUN: -disable-output < %s | \
; RUN: FileCheck -check-prefix=KERNEL %s
; REQUIRES: pollyacc
; KERNEL: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_A, i64 %n) #0 {
; KERNEL: !nvvm.annotations = !{!0}
; KERNEL: !0 = !{void (i8 addrspace(1)*, i64)* @FUNC_foo_SCOP_0_KERNEL_0, !"maxntidx", i32 32, !"maxntidy", i32 1, !"maxntidz", i32 1}
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; Simple kernel source: A[i] += 100 for i = 0..n. Compiled by -polly-codegen-ppcg
; into the ptx_kernel whose maxntid annotations the KERNEL checks above verify.
define void @foo(i64* %A, i64 %n) {
bb:
br label %bb1
; Loop header: i = %i.0, exit when i >= n (signed).
bb1: ; preds = %bb6, %bb
%i.0 = phi i64 [ 0, %bb ], [ %tmp7, %bb6 ]
%tmp = icmp slt i64 %i.0, %n
br i1 %tmp, label %bb2, label %bb8
; Body: A[i] = A[i] + 100.
bb2: ; preds = %bb1
%tmp3 = getelementptr inbounds i64, i64* %A, i64 %i.0
%tmp4 = load i64, i64* %tmp3, align 8
%tmp5 = add nsw i64 %tmp4, 100
store i64 %tmp5, i64* %tmp3, align 8
br label %bb6
bb6: ; preds = %bb2
%tmp7 = add nuw nsw i64 %i.0, 1
br label %bb1
bb8: ; preds = %bb1
ret void
}

View File

@@ -0,0 +1,118 @@
; RUN: opt %loadPolly -S -polly-process-unprofitable -polly-acc-mincompute=0 -polly-target=gpu -polly-codegen-ppcg -polly-acc-codegen-managed-memory < %s | \
; RUN: FileCheck %s
; REQUIRES: pollyacc
;
; #include <cuda_runtime.h>
;
; static const int N = 45;
;
; void copy(int *R, int *A) {
; for (int i = 0; i < N; i++) {
; R[i] = A[i] * 10;
; }
; }
;
; int main() {
; int *A, *R;
;
; cudaMallocManaged((void **)(&A), sizeof(int) * N, cudaMemAttachGlobal);
; cudaMallocManaged((void **)(&R), sizeof(int) * N, cudaMemAttachGlobal);
;
; for (int i = 0; i < N; i++) {
; A[i] = i;
; R[i] = 0;
; }
; copy(R, A);
;
; return 0;
; }
;
; CHECK-NOT: polly_copyFromHostToDevice
; CHECK-NOT: polly_copyFromDeviceToHost
; CHECK-NOT: polly_freeDeviceMemory
; CHECK-NOT: polly_allocateMemoryForDevice
; CHECK: %[[REGCTX:[0-9]+]] = call i8* @polly_initContextCUDA()
; CHECK-NEXT: %[[REGCA:[0-9]+]] = bitcast i32* %A to i8*
; CHECK-NEXT: %[[REGCR:[0-9]+]] = bitcast i32* %R to i8*
; CHECK-NEXT: %[[REGGEP0:[0-9]+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0
; CHECK-NEXT: store i8* %[[REGCA]], i8** %polly_launch_0_param_0
; CHECK-NEXT: %[[REGCP0:[0-9]+]] = bitcast i8** %polly_launch_0_param_0 to i8*
; CHECK-NEXT: store i8* %[[REGCP0]], i8** %[[REGGEP0]]
; CHECK-NEXT: %[[REGGEP1:[0-9]+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1
; CHECK-NEXT: store i8* %[[REGCR]], i8** %polly_launch_0_param_1
; CHECK-NEXT: %[[REGCP1:[0-9]+]] = bitcast i8** %polly_launch_0_param_1 to i8*
; CHECK-NEXT: store i8* %[[REGCP1]], i8** %[[REGGEP1]]
; CHECK-NEXT: %[[REGKERNEL:[0-9]+]] = call i8* @polly_getKernel(i8* getelementptr inbounds ([852 x i8], [852 x i8]* @FUNC_copy_SCOP_0_KERNEL_0, i32 0, i32 0), i8* getelementptr inbounds ([26 x i8], [26 x i8]* @FUNC_copy_SCOP_0_KERNEL_0_name, i32 0, i32 0))
; CHECK-NEXT: call void @polly_launchKernel(i8* %[[REGKERNEL]], i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr)
; CHECK-NEXT: call void @polly_freeKernel(i8* %[[REGKERNEL]])
; CHECK-NEXT: call void @polly_synchronizeDevice()
; CHECK-NEXT: call void @polly_freeContext(i8* %[[REGCTX]])
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; R[i] = A[i] * 10 for i = 0..44 (N = 45). This is the scop that gets offloaded;
; with managed memory the host/device copy calls must NOT be emitted for it.
define void @copy(i32* %R, i32* %A) {
entry:
br label %for.cond
; Loop header: i = %indvars.iv, trip count 45.
for.cond: ; preds = %for.inc, %entry
%indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
%exitcond = icmp ne i64 %indvars.iv, 45
br i1 %exitcond, label %for.body, label %for.end
for.body: ; preds = %for.cond
%arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
%tmp = load i32, i32* %arrayidx, align 4
%mul = mul nsw i32 %tmp, 10
%arrayidx2 = getelementptr inbounds i32, i32* %R, i64 %indvars.iv
store i32 %mul, i32* %arrayidx2, align 4
br label %for.inc
for.inc: ; preds = %for.body
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
br label %for.cond
for.end: ; preds = %for.cond
ret void
}
; Driver mirroring the C main() above: allocates A and R with
; cudaMallocManaged (180 bytes = 45 * sizeof(i32)), initializes A[i] = i and
; R[i] = 0, then calls @copy. Exists so the managed-memory code path of the
; host-side codegen is exercised end to end.
define i32 @main() {
entry:
%A = alloca i32*, align 8
%R = alloca i32*, align 8
%tmp = bitcast i32** %A to i8**
%call = call i32 @cudaMallocManaged(i8** nonnull %tmp, i64 180, i32 1) #2
%tmp1 = bitcast i32** %R to i8**
%call1 = call i32 @cudaMallocManaged(i8** nonnull %tmp1, i64 180, i32 1) #2
br label %for.cond
; Initialization loop: i = 0..44.
for.cond: ; preds = %for.inc, %entry
%indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
%exitcond = icmp ne i64 %indvars.iv, 45
br i1 %exitcond, label %for.body, label %for.end
; A[i] = (i32)i; R[i] = 0.
for.body: ; preds = %for.cond
%tmp2 = load i32*, i32** %A, align 8
%arrayidx = getelementptr inbounds i32, i32* %tmp2, i64 %indvars.iv
%tmp3 = trunc i64 %indvars.iv to i32
store i32 %tmp3, i32* %arrayidx, align 4
%tmp4 = load i32*, i32** %R, align 8
%arrayidx3 = getelementptr inbounds i32, i32* %tmp4, i64 %indvars.iv
store i32 0, i32* %arrayidx3, align 4
br label %for.inc
for.inc: ; preds = %for.body
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
br label %for.cond
; Run the scop and return 0; the cudaMallocManaged results are never freed
; (acceptable for a compile-only FileCheck test).
for.end: ; preds = %for.cond
%tmp5 = load i32*, i32** %R, align 8
%tmp6 = load i32*, i32** %A, align 8
call void @copy(i32* %tmp5, i32* %tmp6)
ret i32 0
}
declare i32 @cudaMallocManaged(i8**, i64, i32) #1

View File

@@ -0,0 +1,104 @@
; RUN: opt %loadPolly %s -polly-process-unprofitable -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
; RUN: | FileCheck --check-prefix=KERNEL-IR %s
; REQUIRES: pollyacc
; KERNEL-IR: define ptx_kernel void @FUNC_vec_add_1_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_arr, i32 %N) #0 {
; The instruction marked <<<LeakyInst>>> is copied into the GPUModule,
; with changes only to the parameters to access data on the device instead of
; the host, i.e., MemRef_arr becomes polly.access.cast.MemRef_arr. Since the
; instruction is annotated with a DILocation, copying the instruction also copies
; the metadata into the GPUModule. This stops codegenerating the ptx_kernel by
; failing the verification of the Module in GPUNodeBuilder::finalize, due to the
; copied DICompileUnit not being listed in a llvm.dbg.cu which was neither copied
; nor created.
;
; https://reviews.llvm.org/D35630 removes this debug metadata before the
; instruction is copied to the GPUModule.
;
; vec_add_1.c:
; void vec_add_1(int N, int arr[N]) {
; int i=0;
; for( i=0 ; i<N ; i++) arr[i] += 1;
; }
;
source_filename = "vec_add_1.c"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; arr[i] += 1 for i = 0..N, compiled with full debug info. The !dbg metadata is
; the point of the test: the <<<LeakyInst>>> add below carries a DILocation
; which must be stripped before the instruction is copied into the GPUModule
; (otherwise module verification fails on a dangling DICompileUnit).
define void @vec_add_1(i32 %N, i32* %arr) !dbg !7 {
entry:
call void @llvm.dbg.value(metadata i32 %N, i64 0, metadata !13, metadata !16), !dbg !17
call void @llvm.dbg.value(metadata i32* %arr, i64 0, metadata !14, metadata !16), !dbg !18
call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !15, metadata !16), !dbg !19
%tmp = sext i32 %N to i64, !dbg !20
br label %for.cond, !dbg !20
; Loop header: i = %indvars.iv, continue while i < (i64)N.
for.cond: ; preds = %for.inc, %entry
%indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
call void @llvm.dbg.value(metadata i32 undef, i64 0, metadata !15, metadata !16), !dbg !19
%cmp = icmp slt i64 %indvars.iv, %tmp, !dbg !22
br i1 %cmp, label %for.body, label %for.end, !dbg !24
; Body: arr[i] = arr[i] + 1; the add is the instruction that leaked metadata.
for.body: ; preds = %for.cond
%arrayidx = getelementptr inbounds i32, i32* %arr, i64 %indvars.iv, !dbg !25
%tmp1 = load i32, i32* %arrayidx, align 4, !dbg !26, !tbaa !27
%add = add nsw i32 %tmp1, 1, !dbg !26 ; <<<LeakyInst>>>
store i32 %add, i32* %arrayidx, align 4, !dbg !26, !tbaa !27
br label %for.inc, !dbg !25
for.inc: ; preds = %for.body
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !31
call void @llvm.dbg.value(metadata !2, i64 0, metadata !15, metadata !16), !dbg !19
br label %for.cond, !dbg !32, !llvm.loop !33
for.end: ; preds = %for.cond
ret void, !dbg !35
}
declare void @llvm.dbg.declare(metadata, metadata, metadata)
declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 5.0.0 (http://llvm.org/git/clang.git 23e042ffe07a923db2dbebf4d2a3692c5a454fee) (http://llvm.org/git/llvm.git 39c5686a1f54884f12120927b1753a750fdb5e02)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
!1 = !DIFile(filename: "vec_add_1.c", directory: "/tmp")
!2 = !{}
!3 = !{i32 2, !"Dwarf Version", i32 4}
!4 = !{i32 2, !"Debug Info Version", i32 3}
!5 = !{i32 1, !"wchar_size", i32 4}
!6 = !{!"clang version 5.0.0 (http://llvm.org/git/clang.git 23e042ffe07a923db2dbebf4d2a3692c5a454fee) (http://llvm.org/git/llvm.git 39c5686a1f54884f12120927b1753a750fdb5e02)"}
!7 = distinct !DISubprogram(name: "vec_add_1", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !12)
!8 = !DISubroutineType(types: !9)
!9 = !{null, !10, !11}
!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
!11 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64)
!12 = !{!13, !14, !15}
!13 = !DILocalVariable(name: "N", arg: 1, scope: !7, file: !1, line: 1, type: !10)
!14 = !DILocalVariable(name: "arr", arg: 2, scope: !7, file: !1, line: 1, type: !11)
!15 = !DILocalVariable(name: "i", scope: !7, file: !1, line: 2, type: !10)
!16 = !DIExpression()
!17 = !DILocation(line: 1, column: 20, scope: !7)
!18 = !DILocation(line: 1, column: 27, scope: !7)
!19 = !DILocation(line: 2, column: 7, scope: !7)
!20 = !DILocation(line: 3, column: 8, scope: !21)
!21 = distinct !DILexicalBlock(scope: !7, file: !1, line: 3, column: 3)
!22 = !DILocation(line: 3, column: 15, scope: !23)
!23 = distinct !DILexicalBlock(scope: !21, file: !1, line: 3, column: 3)
!24 = !DILocation(line: 3, column: 3, scope: !21)
!25 = !DILocation(line: 3, column: 25, scope: !23)
!26 = !DILocation(line: 3, column: 32, scope: !23)
!27 = !{!28, !28, i64 0}
!28 = !{!"int", !29, i64 0}
!29 = !{!"omnipotent char", !30, i64 0}
!30 = !{!"Simple C/C++ TBAA"}
!31 = !DILocation(line: 3, column: 21, scope: !23)
!32 = !DILocation(line: 3, column: 3, scope: !23)
!33 = distinct !{!33, !24, !34}
!34 = !DILocation(line: 3, column: 35, scope: !21)
!35 = !DILocation(line: 4, column: 1, scope: !7)

View File

@@ -0,0 +1,256 @@
; RUN: opt %loadPolly -polly-scops -analyze < %s | FileCheck %s
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-schedule \
; RUN: -disable-output < %s | \
; RUN: FileCheck -check-prefix=SCHED %s
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
; RUN: -disable-output < %s | \
; RUN: FileCheck -check-prefix=CODE %s
; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \
; RUN: FileCheck %s -check-prefix=IR
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
; RUN: -disable-output < %s | \
; RUN: FileCheck %s -check-prefix=KERNEL-IR
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-asm \
; RUN: -disable-output < %s | \
; RUN: FileCheck %s -check-prefix=KERNEL-ASM
; REQUIRES: pollyacc,nvptx
; CHECK: Stmt_bb5
; CHECK-NEXT: Domain :=
; CHECK-NEXT: { Stmt_bb5[i0, i1] : 0 <= i0 <= 1023 and 0 <= i1 <= 1023 };
; CHECK-NEXT: Schedule :=
; CHECK-NEXT: { Stmt_bb5[i0, i1] -> [i0, i1] };
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
; CHECK-NEXT: { Stmt_bb5[i0, i1] -> MemRef_A[i0, i1] };
; CHECK-NEXT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0]
; CHECK-NEXT: { Stmt_bb5[i0, i1] -> MemRef_A[i0, i1] };
; SCHED: domain: "{ Stmt_bb5[i0, i1] : 0 <= i0 <= 1023 and 0 <= i1 <= 1023 }"
; SCHED-NEXT: child:
; SCHED-NEXT: context: "{ [] }"
; SCHED-NEXT: child:
; SCHED-NEXT: extension: "{ [] -> from_device_MemRef_A[]; [] -> to_device_MemRef_A[] }"
; SCHED-NEXT: child:
; SCHED-NEXT: sequence:
; SCHED-NEXT: - filter: "{ to_device_MemRef_A[] }"
; SCHED-NEXT: child:
; SCHED-NEXT: set:
; SCHED-NEXT: - filter: "{ to_device_MemRef_A[] }"
; SCHED-NEXT: child:
; SCHED-NEXT: guard: "{ [] }"
; SCHED-NEXT: - filter: "{ Stmt_bb5[i0, i1] }"
; SCHED-NEXT: child:
; SCHED-NEXT: guard: "{ [] }"
; SCHED-NEXT: child:
; SCHED-NEXT: mark: "kernel"
; SCHED-NEXT: child:
; SCHED-NEXT: context: "[b0, b1, t0, t1] -> { [] : 0 <= b0 <= 31 and 0 <= b1 <= 31 and 0 <= t0 <= 31 and 0 <= t1 <= 15 }"
; SCHED-NEXT: child:
; SCHED-NEXT: filter: "[b0, b1] -> { Stmt_bb5[i0, i1] : -31 - 32b0 + i0 <= 8192*floor((i0)/8192) <= -32b0 + i0 and -31 - 32b1 + i1 <= 8192*floor((i1)/8192) <= -32b1 + i1 }"
; SCHED-NEXT: child:
; SCHED-NEXT: schedule: "[{ Stmt_bb5[i0, i1] -> [(floor((i0)/8192))] }, { Stmt_bb5[i0, i1] -> [(floor((i1)/8192))] }]"
; SCHED-NEXT: permutable: 1
; SCHED-NEXT: coincident: [ 1, 1 ]
; SCHED-NEXT: child:
; SCHED-NEXT: filter: "[t0, t1] -> { Stmt_bb5[i0, i1] : 32*floor((-t0 + i0)/32) = -t0 + i0 and 16*floor((-t1 + i1)/16) = -t1 + i1 and 0 <= t0 <= 31 and 0 <= t1 <= 15 }"
; SCHED-NEXT: child:
; SCHED-NEXT: schedule: "[{ Stmt_bb5[i0, i1] -> [(0)] }, { Stmt_bb5[i0, i1] -> [(floor((i1)/16) - 2*floor((i1)/32))] }]"
; SCHED-NEXT: permutable: 1
; SCHED-NEXT: coincident: [ 1, 1 ]
; SCHED-NEXT: - filter: "{ from_device_MemRef_A[] }"
; SCHED-NEXT: child:
; SCHED-NEXT: set:
; SCHED-NEXT: - filter: "{ from_device_MemRef_A[] }"
; SCHED-NEXT: child:
; SCHED-NEXT: guard: "{ [] }"
; CODE: Code
; CODE-NEXT: ====
; CODE-NEXT: # host
; CODE-NEXT: {
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * (1024) * sizeof(float), cudaMemcpyHostToDevice));
; CODE-NEXT: {
; CODE-NEXT: dim3 k0_dimBlock(16, 32);
; CODE-NEXT: dim3 k0_dimGrid(32, 32);
; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
; CODE-NEXT: cudaCheckKernel();
; CODE-NEXT: }
; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * (1024) * sizeof(float), cudaMemcpyDeviceToHost));
; CODE-NEXT: }
; CODE: # kernel0
; CODE-NEXT: for (int c3 = 0; c3 <= 1; c3 += 1)
; CODE-NEXT: Stmt_bb5(32 * b0 + t0, 32 * b1 + t1 + 16 * c3);
; IR: polly.split_new_and_old:
; IR-NEXT: %0 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 1024)
; IR-NEXT: %.obit = extractvalue { i64, i1 } %0, 1
; IR-NEXT: %polly.overflow.state = or i1 false, %.obit
; IR-NEXT: %.res = extractvalue { i64, i1 } %0, 0
; IR-NEXT: %1 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %.res, i64 1024)
; IR-NEXT: %.obit1 = extractvalue { i64, i1 } %1, 1
; IR-NEXT: %polly.overflow.state2 = or i1 %polly.overflow.state, %.obit1
; IR-NEXT: %.res3 = extractvalue { i64, i1 } %1, 0
; IR-NEXT: %2 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 7, i64 %.res3)
; IR-NEXT: %.obit4 = extractvalue { i64, i1 } %2, 1
; IR-NEXT: %polly.overflow.state5 = or i1 %polly.overflow.state2, %.obit4
; IR-NEXT: %.res6 = extractvalue { i64, i1 } %2, 0
; IR-NEXT: %3 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 0, i64 %.res6)
; IR-NEXT: %.obit7 = extractvalue { i64, i1 } %3, 1
; IR-NEXT: %polly.overflow.state8 = or i1 %polly.overflow.state5, %.obit7
; IR-NEXT: %.res9 = extractvalue { i64, i1 } %3, 0
; IR-NEXT: %4 = icmp sge i64 %.res9, 2621440
; IR-NEXT: %5 = and i1 true, %4
; IR-NEXT: %polly.rtc.overflown = xor i1 %polly.overflow.state8, true
; IR-NEXT: %polly.rtc.result = and i1 %5, %polly.rtc.overflown
; IR-NEXT: br i1 %polly.rtc.result, label %polly.start, label %bb2
; IR: polly.start:
; IR-NEXT: br label %polly.acc.initialize
; IR: polly.acc.initialize:
; IR-NEXT: [[GPUContext:%.*]] = call i8* @polly_initContext()
; IR-NEXT: %p_dev_array_MemRef_A = call i8* @polly_allocateMemoryForDevice(i64 4194304)
; IR-NEXT: [[HostPtr:%.*]] = bitcast [1024 x float]* %A to i8*
; IR-NEXT: call void @polly_copyFromHostToDevice(i8* [[HostPtr]], i8* %p_dev_array_MemRef_A, i64 4194304)
; IR-NEXT: [[DevPtr:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A)
; IR-NEXT: [[ParamSlot:%.*]] = getelementptr [1 x i8*], [1 x i8*]* %polly_launch_0_params, i64 0, i64 0
; IR-NEXT: store i8* [[DevPtr]], i8** %polly_launch_0_param_0
; IR-NEXT: [[ParamTyped:%.*]] = bitcast i8** %polly_launch_0_param_0 to i8*
; IR-NEXT: store i8* [[ParamTyped]], i8** [[ParamSlot]]
; IR-NEXT: call i8* @polly_getKernel
; IR-NEXT: call void @polly_launchKernel(i8* %11, i32 32, i32 32, i32 32, i32 16, i32 1, i8* %polly_launch_0_params_i8ptr)
; IR-NEXT: call void @polly_freeKernel
; IR-NEXT: [[HostPtr2:%.*]] = bitcast [1024 x float]* %A to i8*
; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_A, i8* [[HostPtr2]], i64 4194304)
; IR-NEXT: call void @polly_freeDeviceMemory(i8* %p_dev_array_MemRef_A)
; IR-NEXT: call void @polly_freeContext(i8* [[GPUContext]])
; IR-NEXT: br label %polly.exiting
; IR: polly.exiting:
; IR-NEXT: br label %polly.merge_new_and_old
; KERNEL-IR-LABEL: define ptx_kernel void @kernel_0(i8* %MemRef_A) #0 {
; KERNEL-IR-NEXT: entry:
; KERNEL-IR-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
; KERNEL-IR-NEXT: %b0 = zext i32 %0 to i64
; KERNEL-IR-NEXT: %1 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
; KERNEL-IR-NEXT: %b1 = zext i32 %1 to i64
; KERNEL-IR-NEXT: %2 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
; KERNEL-IR-NEXT: %t0 = zext i32 %2 to i64
; KERNEL-IR-NEXT: %3 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
; KERNEL-IR-NEXT: %t1 = zext i32 %3 to i64
; KERNEL-IR-NEXT: br label %polly.loop_preheader
; KERNEL-IR-LABEL: polly.loop_exit: ; preds = %polly.stmt.bb5
; KERNEL-IR-NEXT: ret void
; KERNEL-IR-LABEL: polly.loop_header: ; preds = %polly.stmt.bb5, %polly.loop_preheader
; KERNEL-IR-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.stmt.bb5 ]
; KERNEL-IR-NEXT: %4 = mul nsw i64 32, %b0
; KERNEL-IR-NEXT: %5 = add nsw i64 %4, %t0
; KERNEL-IR-NEXT: %6 = mul nsw i64 32, %b1
; KERNEL-IR-NEXT: %7 = add nsw i64 %6, %t1
; KERNEL-IR-NEXT: %8 = mul nsw i64 16, %polly.indvar
; KERNEL-IR-NEXT: %9 = add nsw i64 %7, %8
; KERNEL-IR-NEXT: br label %polly.stmt.bb5
; KERNEL-IR-LABEL: polly.stmt.bb5: ; preds = %polly.loop_header
; KERNEL-IR-NEXT: %10 = mul i64 %5, %9
; KERNEL-IR-NEXT: %p_tmp6 = sitofp i64 %10 to float
; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A = bitcast i8* %MemRef_A to float*
; KERNEL-IR-NEXT: %11 = mul nsw i64 32, %b0
; KERNEL-IR-NEXT: %12 = add nsw i64 %11, %t0
; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A = mul nsw i64 %12, 1024
; KERNEL-IR-NEXT: %13 = mul nsw i64 32, %b1
; KERNEL-IR-NEXT: %14 = add nsw i64 %13, %t1
; KERNEL-IR-NEXT: %15 = mul nsw i64 16, %polly.indvar
; KERNEL-IR-NEXT: %16 = add nsw i64 %14, %15
; KERNEL-IR-NEXT: %polly.access.add.MemRef_A = add nsw i64 %polly.access.mul.MemRef_A, %16
; KERNEL-IR-NEXT: %polly.access.MemRef_A = getelementptr float, float* %polly.access.cast.MemRef_A, i64 %polly.access.add.MemRef_A
; KERNEL-IR-NEXT: %tmp8_p_scalar_ = load float, float* %polly.access.MemRef_A, align 4
; KERNEL-IR-NEXT: %p_tmp9 = fadd float %tmp8_p_scalar_, %p_tmp6
; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A1 = bitcast i8* %MemRef_A to float*
; KERNEL-IR-NEXT: %17 = mul nsw i64 32, %b0
; KERNEL-IR-NEXT: %18 = add nsw i64 %17, %t0
; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A2 = mul nsw i64 %18, 1024
; KERNEL-IR-NEXT: %19 = mul nsw i64 32, %b1
; KERNEL-IR-NEXT: %20 = add nsw i64 %19, %t1
; KERNEL-IR-NEXT: %21 = mul nsw i64 16, %polly.indvar
; KERNEL-IR-NEXT: %22 = add nsw i64 %20, %21
; KERNEL-IR-NEXT: %polly.access.add.MemRef_A3 = add nsw i64 %polly.access.mul.MemRef_A2, %22
; KERNEL-IR-NEXT: %polly.access.MemRef_A4 = getelementptr float, float* %polly.access.cast.MemRef_A1, i64 %polly.access.add.MemRef_A3
; KERNEL-IR-NEXT: store float %p_tmp9, float* %polly.access.MemRef_A4, align 4
; KERNEL-IR-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1
; KERNEL-IR-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar, 0
; KERNEL-IR-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit
; KERNEL-IR-LABEL: polly.loop_preheader: ; preds = %entry
; KERNEL-IR-NEXT: br label %polly.loop_header
; KERNEL-IR: attributes #0 = { "polly.skip.fn" }
; KERNEL-ASM: .version 3.2
; KERNEL-ASM-NEXT: .target sm_30
; KERNEL-ASM-NEXT: .address_size 64
; KERNEL-ASM: // .globl kernel_0
; KERNEL-ASM: .visible .entry kernel_0(
; KERNEL-ASM-NEXT: .param .u64 kernel_0_param_0
; KERNEL-ASM-NEXT: )
; void double_parallel_loop(float A[][1024]) {
; for (long i = 0; i < 1024; i++)
; for (long j = 0; j < 1024; j++)
; A[i][j] += i * j;
; }
;
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; A[i][j] += i * j over a 1024x1024 float array; both loops are parallel and
; get mapped to the 32x32 grid / 32x16 block schedule checked above.
define void @double_parallel_loop([1024 x float]* %A) {
bb:
br label %bb2
; Outer loop header: i = %i.0 in [0, 1024).
bb2: ; preds = %bb13, %bb
%i.0 = phi i64 [ 0, %bb ], [ %tmp14, %bb13 ]
%exitcond1 = icmp ne i64 %i.0, 1024
br i1 %exitcond1, label %bb3, label %bb15
bb3: ; preds = %bb2
br label %bb4
; Inner loop header: j = %j.0 in [0, 1024).
bb4: ; preds = %bb10, %bb3
%j.0 = phi i64 [ 0, %bb3 ], [ %tmp11, %bb10 ]
%exitcond = icmp ne i64 %j.0, 1024
br i1 %exitcond, label %bb5, label %bb12
; Statement Stmt_bb5: A[i][j] = A[i][j] + (float)(i * j).
bb5: ; preds = %bb4
%tmp = mul nuw nsw i64 %i.0, %j.0
%tmp6 = sitofp i64 %tmp to float
%tmp7 = getelementptr inbounds [1024 x float], [1024 x float]* %A, i64 %i.0, i64 %j.0
%tmp8 = load float, float* %tmp7, align 4
%tmp9 = fadd float %tmp8, %tmp6
store float %tmp9, float* %tmp7, align 4
br label %bb10
bb10: ; preds = %bb5
%tmp11 = add nuw nsw i64 %j.0, 1
br label %bb4
bb12: ; preds = %bb4
br label %bb13
bb13: ; preds = %bb12
%tmp14 = add nuw nsw i64 %i.0, 1
br label %bb2
bb15: ; preds = %bb2
ret void
}

View File

@@ -0,0 +1,58 @@
; RUN: opt %loadPolly < %s -analyze -polly-scops -polly-process-unprofitable -polly-invariant-load-hoisting | FileCheck %s -check-prefix=SCOPS
; RUN: opt %loadPolly -S < %s -polly-codegen-ppcg -polly-process-unprofitable -polly-invariant-load-hoisting | FileCheck %s -check-prefix=CODEGEN
; REQUIRES: pollyacc
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64"
%S = type { i32, i32, [12 x %L] }
%L = type { i32, i32, double, i32, i32, i32, i32, i32 }
; Regression test body: loads of cpi->field0 (%ns) and cpi->field1 (%nt)
; are invariant in the loop and get hoisted; %l1 is only loaded on the
; path where %cmp12.i holds, which is what the conditional Execution
; Context "l2 > 0" in the SCOPS checks below captures.
define void @test(%S* %cpi, i1 %b) {
; SCOPS-LABEL: Region: %if.then14---%exit
; SCOPS: Invariant Accesses: {
; SCOPS-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
; SCOPS-NEXT: [l2, l1] -> { Stmt_for_body_i[i0] -> MemRef_cpi[0, 0] };
; SCOPS-NEXT: Execution Context: [l2, l1] -> { : }
; SCOPS-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
; SCOPS-NEXT: [l2, l1] -> { Stmt_for_body_lr_ph_i[] -> MemRef_cpi[0, 1] };
; SCOPS-NEXT: Execution Context: [l2, l1] -> { : l2 > 0 }
; SCOPS-NEXT: }
; SCOPS: Arrays {
; SCOPS-NEXT: i32 MemRef_cpi[*][(10 * %l1)]; // Element size 4
; SCOPS-NEXT: }
; Check that we gracefully handle failing invariant loads.
; This test case is taken from:
; test/Isl/CodeGen/invariant-load-dimension.ll
; FIXME: Figure out how to actually generate code for this loop.
; CODEGEN-NOT: LLVM ERROR: preloading invariant loads failed in function
entry:
%nt = getelementptr inbounds %S, %S* %cpi, i32 0, i32 1
br i1 %b, label %if.then14, label %exit
if.then14:
; %l0 = cpi->first field; loop runs only if it is positive.
%ns = getelementptr inbounds %S, %S* %cpi, i32 0, i32 0
%l0 = load i32, i32* %ns, align 8
%cmp12.i = icmp sgt i32 %l0, 0
br i1 %cmp12.i, label %for.body.lr.ph.i, label %exit
for.body.lr.ph.i:
; %l1 (cpi->second field) scales the row stride of the inner array.
%l1 = load i32, i32* %nt, align 4
br label %for.body.i
for.body.i:
; Zeroes the first i32 of element [phi * l1] of the %L array member.
%phi = phi i32 [ 0, %for.body.lr.ph.i ], [ %inc, %for.body.i ]
%mul.i163 = mul nsw i32 %phi, %l1
%cv = getelementptr inbounds %S, %S* %cpi, i32 0, i32 2, i32 %mul.i163, i32 0
store i32 0, i32* %cv, align 8
%inc = add nuw nsw i32 %phi, 1
%l2 = load i32, i32* %ns, align 8
%cmp.i164 = icmp slt i32 %inc, %l2
br i1 %cmp.i164, label %for.body.i, label %exit
exit:
ret void
}

View File

@@ -0,0 +1,42 @@
; RUN: opt %loadPolly -S < %s -polly-codegen-ppcg \
; RUN: -polly-invariant-load-hoisting | FileCheck %s -check-prefix=CODEGEN
; REQUIRES: pollyacc
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64"
%S = type { i32, i32, [12 x %L] }
%L = type { i32, i32, double, i32, i32, i32, i32, i32 }
; Same CFG as the SCoP-dump variant above; here only the host-IR preload
; guard is checked: codegen must emit a preload block whose condition
; folds to false rather than crashing on the failing invariant load.
define void @test(%S* %cpi, i1 %b) {
; CODEGEN-LABEL: @test(
; CODEGEN: polly.preload.begin:
; CODEGEN-NEXT: br i1 false
entry:
%nt = getelementptr inbounds %S, %S* %cpi, i32 0, i32 1
br i1 %b, label %if.then14, label %exit
if.then14:
; Loop is entered only when cpi->first field is positive.
%ns = getelementptr inbounds %S, %S* %cpi, i32 0, i32 0
%l0 = load i32, i32* %ns, align 8
%cmp12.i = icmp sgt i32 %l0, 0
br i1 %cmp12.i, label %for.body.lr.ph.i, label %exit
for.body.lr.ph.i:
; Invariant load of the stride used inside the loop body.
%l1 = load i32, i32* %nt, align 4
br label %for.body.i
for.body.i:
%phi = phi i32 [ 0, %for.body.lr.ph.i ], [ %inc, %for.body.i ]
%mul.i163 = mul nsw i32 %phi, %l1
%cv = getelementptr inbounds %S, %S* %cpi, i32 0, i32 2, i32 %mul.i163, i32 0
store i32 0, i32* %cv, align 8
%inc = add nuw nsw i32 %phi, 1
%l2 = load i32, i32* %ns, align 8
%cmp.i164 = icmp slt i32 %inc, %l2
br i1 %cmp.i164, label %for.body.i, label %exit
exit:
ret void
}

View File

@@ -0,0 +1,176 @@
; RUN: opt %loadPolly -polly-codegen-ppcg -disable-output \
; RUN: -polly-acc-dump-code < %s | FileCheck %s -check-prefix=CODE
; RUN: opt %loadPolly -polly-codegen-ppcg -disable-output \
; RUN: -polly-acc-dump-kernel-ir < %s | FileCheck %s -check-prefix=KERNEL-IR
; RUN: opt %loadPolly -polly-codegen-ppcg \
; RUN: -S < %s | FileCheck %s -check-prefix=IR
; void foo(float A[2][100]) {
; for (long t = 0; t < 100; t++)
; for (long i = 1; i < 99; i++)
; A[(t + 1) % 2][i] += A[t % 2][i - 1] + A[t % 2][i] + A[t % 2][i + 1];
; }
; REQUIRES: pollyacc
; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (2) * (100) * sizeof(float), cudaMemcpyHostToDevice));
; CODE-NEXT: for (int c0 = 0; c0 <= 99; c0 += 1)
; CODE-NEXT: {
; CODE-NEXT: dim3 k0_dimBlock(32);
; CODE-NEXT: dim3 k0_dimGrid(4);
; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, c0);
; CODE-NEXT: cudaCheckKernel();
; CODE-NEXT: }
; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (2) * (100) * sizeof(float), cudaMemcpyDeviceToHost));
; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_A));
; CODE-NEXT: }
; IR-LABEL: polly.loop_header: ; preds = %polly.loop_header, %polly.loop_preheader
; IR-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.loop_header ]
; ...
; IR: store i64 %polly.indvar, i64* %polly_launch_0_param_1
; IR-NEXT: [[REGA:%.+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1
; IR-NEXT: [[REGB:%.+]] = bitcast i64* %polly_launch_0_param_1 to i8*
; IR-NEXT: store i8* [[REGB]], i8** [[REGA]]
; IR: call i8* @polly_getKernel
; ...
; IR: call void @polly_freeKernel
; IR-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1
; IR-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar_next, 99
; IR-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit
; KERNEL-IR: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_A, i64 %c0)
; KERNEL-IR-LABEL: entry:
; KERNEL-IR-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
; KERNEL-IR-NEXT: %b0 = zext i32 %0 to i64
; KERNEL-IR-NEXT: %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
; KERNEL-IR-NEXT: %t0 = zext i32 %1 to i64
; KERNEL-IR-NEXT: br label %polly.cond
; KERNEL-IR-LABEL: polly.cond: ; preds = %entry
; KERNEL-IR-NEXT: %2 = mul nsw i64 32, %b0
; KERNEL-IR-NEXT: %3 = add nsw i64 %2, %t0
; KERNEL-IR-NEXT: %4 = icmp sle i64 %3, 97
; KERNEL-IR-NEXT: br i1 %4, label %polly.then, label %polly.else
; KERNEL-IR-LABEL: polly.merge: ; preds = %polly.else, %polly.stmt.for.body3
; KERNEL-IR-NEXT: ret void
; KERNEL-IR-LABEL: polly.then: ; preds = %polly.cond
; KERNEL-IR-NEXT: %5 = mul nsw i64 32, %b0
; KERNEL-IR-NEXT: %6 = add nsw i64 %5, %t0
; KERNEL-IR-NEXT: br label %polly.stmt.for.body3
; KERNEL-IR-LABEL: polly.stmt.for.body3: ; preds = %polly.then
; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)*
; KERNEL-IR-NEXT: %pexp.pdiv_r = urem i64 %c0, 2
; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A = mul nsw i64 %pexp.pdiv_r, 100
; KERNEL-IR-NEXT: %7 = mul nsw i64 32, %b0
; KERNEL-IR-NEXT: %8 = add nsw i64 %7, %t0
; KERNEL-IR-NEXT: %polly.access.add.MemRef_A = add nsw i64 %polly.access.mul.MemRef_A, %8
; KERNEL-IR-NEXT: %polly.access.MemRef_A = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A, i64 %polly.access.add.MemRef_A
; KERNEL-IR-NEXT: %tmp_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A, align 4
; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A1 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)*
; KERNEL-IR-NEXT: %pexp.pdiv_r2 = urem i64 %c0, 2
; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A3 = mul nsw i64 %pexp.pdiv_r2, 100
; KERNEL-IR-NEXT: %9 = mul nsw i64 32, %b0
; KERNEL-IR-NEXT: %10 = add nsw i64 %9, %t0
; KERNEL-IR-NEXT: %11 = add nsw i64 %10, 1
; KERNEL-IR-NEXT: %polly.access.add.MemRef_A4 = add nsw i64 %polly.access.mul.MemRef_A3, %11
; KERNEL-IR-NEXT: %polly.access.MemRef_A5 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A1, i64 %polly.access.add.MemRef_A4
; KERNEL-IR-NEXT: %tmp2_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A5, align 4
; KERNEL-IR-NEXT: %p_add = fadd float %tmp_p_scalar_, %tmp2_p_scalar_
; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A6 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)*
; KERNEL-IR-NEXT: %pexp.pdiv_r7 = urem i64 %c0, 2
; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A8 = mul nsw i64 %pexp.pdiv_r7, 100
; KERNEL-IR-NEXT: %12 = mul nsw i64 32, %b0
; KERNEL-IR-NEXT: %13 = add nsw i64 %12, %t0
; KERNEL-IR-NEXT: %14 = add nsw i64 %13, 2
; KERNEL-IR-NEXT: %polly.access.add.MemRef_A9 = add nsw i64 %polly.access.mul.MemRef_A8, %14
; KERNEL-IR-NEXT: %polly.access.MemRef_A10 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A6, i64 %polly.access.add.MemRef_A9
; KERNEL-IR-NEXT: %tmp3_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A10, align 4
; KERNEL-IR-NEXT: %p_add12 = fadd float %p_add, %tmp3_p_scalar_
; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A11 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)*
; KERNEL-IR-NEXT: %15 = add nsw i64 %c0, 1
; KERNEL-IR-NEXT: %pexp.pdiv_r12 = urem i64 %15, 2
; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A13 = mul nsw i64 %pexp.pdiv_r12, 100
; KERNEL-IR-NEXT: %16 = mul nsw i64 32, %b0
; KERNEL-IR-NEXT: %17 = add nsw i64 %16, %t0
; KERNEL-IR-NEXT: %18 = add nsw i64 %17, 1
; KERNEL-IR-NEXT: %polly.access.add.MemRef_A14 = add nsw i64 %polly.access.mul.MemRef_A13, %18
; KERNEL-IR-NEXT: %polly.access.MemRef_A15 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A11, i64 %polly.access.add.MemRef_A14
; KERNEL-IR-NEXT: %tmp4_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A15, align 4
; KERNEL-IR-NEXT: %p_add17 = fadd float %tmp4_p_scalar_, %p_add12
; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A16 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)*
; KERNEL-IR-NEXT: %19 = add nsw i64 %c0, 1
; KERNEL-IR-NEXT: %pexp.pdiv_r17 = urem i64 %19, 2
; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A18 = mul nsw i64 %pexp.pdiv_r17, 100
; KERNEL-IR-NEXT: %20 = mul nsw i64 32, %b0
; KERNEL-IR-NEXT: %21 = add nsw i64 %20, %t0
; KERNEL-IR-NEXT: %22 = add nsw i64 %21, 1
; KERNEL-IR-NEXT: %polly.access.add.MemRef_A19 = add nsw i64 %polly.access.mul.MemRef_A18, %22
; KERNEL-IR-NEXT: %polly.access.MemRef_A20 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A16, i64 %polly.access.add.MemRef_A19
; KERNEL-IR-NEXT: store float %p_add17, float addrspace(1)* %polly.access.MemRef_A20, align 4
; KERNEL-IR-NEXT: br label %polly.merge
; KERNEL-IR-LABEL: polly.else: ; preds = %polly.cond
; KERNEL-IR-NEXT: br label %polly.merge
; KERNEL-IR-NEXT: }
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; Two-row time stencil over A[2][100] (see C source above): for each time
; step t, row (t+1)%2 is updated from the three neighbours in row t%2.
; The sequential t-loop stays on the host; the parallel i-loop becomes
; the GPU kernel launched once per t (see CODE/IR checks above).
define void @foo([100 x float]* %A) {
entry:
br label %for.cond
; Time loop: t counts 0 .. 99 (sequential, host side).
for.cond: ; preds = %for.inc18, %entry
%t.0 = phi i64 [ 0, %entry ], [ %inc19, %for.inc18 ]
%exitcond1 = icmp ne i64 %t.0, 100
br i1 %exitcond1, label %for.body, label %for.end20
for.body: ; preds = %for.cond
br label %for.cond1
; Space loop: i counts 1 .. 98 (parallel, kernel side).
for.cond1: ; preds = %for.inc, %for.body
%i.0 = phi i64 [ 1, %for.body ], [ %inc, %for.inc ]
%exitcond = icmp ne i64 %i.0, 99
br i1 %exitcond, label %for.body3, label %for.end
for.body3: ; preds = %for.cond1
; A[(t+1)%2][i] += A[t%2][i-1] + A[t%2][i] + A[t%2][i+1]
%sub = add nsw i64 %i.0, -1
%rem = srem i64 %t.0, 2
%arrayidx4 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem, i64 %sub
%tmp = load float, float* %arrayidx4, align 4
%rem5 = srem i64 %t.0, 2
%arrayidx7 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem5, i64 %i.0
%tmp2 = load float, float* %arrayidx7, align 4
%add = fadd float %tmp, %tmp2
%add8 = add nuw nsw i64 %i.0, 1
%rem9 = srem i64 %t.0, 2
%arrayidx11 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem9, i64 %add8
%tmp3 = load float, float* %arrayidx11, align 4
%add12 = fadd float %add, %tmp3
%add13 = add nuw nsw i64 %t.0, 1
%rem14 = srem i64 %add13, 2
%arrayidx16 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem14, i64 %i.0
%tmp4 = load float, float* %arrayidx16, align 4
%add17 = fadd float %tmp4, %add12
store float %add17, float* %arrayidx16, align 4
br label %for.inc
for.inc: ; preds = %for.body3
%inc = add nuw nsw i64 %i.0, 1
br label %for.cond1
for.end: ; preds = %for.cond1
br label %for.inc18
for.inc18: ; preds = %for.end
%inc19 = add nuw nsw i64 %t.0, 1
br label %for.cond
for.end20: ; preds = %for.cond
ret void
}

View File

@@ -0,0 +1,204 @@
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
; RUN: -polly-invariant-load-hoisting=false \
; RUN: -disable-output < %s | \
; RUN: FileCheck -check-prefix=CODE %s
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
; RUN: -polly-invariant-load-hoisting=false \
; RUN: -disable-output < %s | \
; RUN: FileCheck -check-prefix=KERNEL-IR %s
; REQUIRES: pollyacc
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
declare void @llvm.lifetime.start(i64, i8* nocapture) #0
; This test case tests that we can correctly handle a ScopStmt that is
; scheduled on the host, instead of within a kernel.
; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (512) * (512) * sizeof(double), cudaMemcpyHostToDevice));
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_R, MemRef_R, (p_0 + 1) * (512) * sizeof(double), cudaMemcpyHostToDevice));
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_Q, MemRef_Q, (512) * (512) * sizeof(double), cudaMemcpyHostToDevice));
; CODE-NEXT: {
; CODE-NEXT: dim3 k0_dimBlock(32);
; CODE-NEXT: dim3 k0_dimGrid(16);
; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, dev_MemRef_R, dev_MemRef_Q, p_0, p_1);
; CODE-NEXT: cudaCheckKernel();
; CODE-NEXT: }
; CODE: if (p_0 <= 510 && p_1 <= 510) {
; CODE-NEXT: {
; CODE-NEXT: dim3 k1_dimBlock(32);
; CODE-NEXT: dim3 k1_dimGrid(p_1 <= -1048034 ? 32768 : -p_1 + floord(31 * p_1 + 30, 32) + 16);
; CODE-NEXT: kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_A, dev_MemRef_R, dev_MemRef_Q, p_0, p_1);
; CODE-NEXT: cudaCheckKernel();
; CODE-NEXT: }
; CODE: {
; CODE-NEXT: dim3 k2_dimBlock(16, 32);
; CODE-NEXT: dim3 k2_dimGrid(16, p_1 <= -7650 ? 256 : -p_1 + floord(31 * p_1 + 30, 32) + 16);
; CODE-NEXT: kernel2 <<<k2_dimGrid, k2_dimBlock>>> (dev_MemRef_A, dev_MemRef_R, dev_MemRef_Q, p_0, p_1);
; CODE-NEXT: cudaCheckKernel();
; CODE-NEXT: }
; CODE: }
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (512) * (512) * sizeof(double), cudaMemcpyDeviceToHost));
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_R, dev_MemRef_R, (p_0 + 1) * (512) * sizeof(double), cudaMemcpyDeviceToHost));
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_Q, dev_MemRef_Q, (512) * (512) * sizeof(double), cudaMemcpyDeviceToHost));
; CODE-NEXT: Stmt_for_cond33_preheader();
; CODE: }
; CODE: # kernel0
; CODE-NEXT: Stmt_for_body16(32 * b0 + t0);
; CODE: # kernel1
; CODE-NEXT: for (int c0 = 0; c0 <= (-p_1 - 32 * b0 + 510) / 1048576; c0 += 1)
; CODE-NEXT: for (int c1 = 0; c1 <= 15; c1 += 1) {
; CODE-NEXT: if (p_1 + 32 * b0 + t0 + 1048576 * c0 <= 510 && c1 == 0)
; CODE-NEXT: Stmt_for_body35(32 * b0 + t0 + 1048576 * c0);
; CODE-NEXT: if (p_1 + 32 * b0 + t0 + 1048576 * c0 <= 510)
; CODE-NEXT: for (int c3 = 0; c3 <= 31; c3 += 1)
; CODE-NEXT: Stmt_for_body42(32 * b0 + t0 + 1048576 * c0, 32 * c1 + c3);
; CODE-NEXT: sync0();
; CODE-NEXT: }
; CODE: # kernel2
; CODE-NEXT: for (int c0 = 0; c0 <= (-p_1 - 32 * b0 + 510) / 8192; c0 += 1)
; CODE-NEXT: if (p_1 + 32 * b0 + t0 + 8192 * c0 <= 510)
; CODE-NEXT: for (int c3 = 0; c3 <= 1; c3 += 1)
; CODE-NEXT: Stmt_for_body62(32 * b0 + t0 + 8192 * c0, 32 * b1 + t1 + 16 * c3);
; KERNEL-IR: call void @llvm.nvvm.barrier0()
; Function Attrs: nounwind uwtable
; Gram-Schmidt QR factorization (512x512, polybench-style): for each
; column k (%indvars.iv24) compute nrm = sqrt(sum A[i][k]^2), store it in
; R[k][k], normalize into Q[.][k], then update R[k][j] and A[.][j] for
; all later columns j (%indvars.iv21). The sqrt call in %for.end is the
; ScopStmt that must be scheduled on the host (see CODE checks above);
; the surrounding loops map to kernels 0-2.
define internal void @kernel_gramschmidt(i32 %ni, i32 %nj, [512 x double]* %A, [512 x double]* %R, [512 x double]* %Q) #1 {
entry:
br label %entry.split
entry.split: ; preds = %entry
br label %for.cond1.preheader
; Column loop over k: %indvars.iv24 = k, %indvars.iv19 = k + 1.
for.cond1.preheader: ; preds = %entry.split, %for.inc86
%indvars.iv24 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next25, %for.inc86 ]
%indvars.iv19 = phi i64 [ 1, %entry.split ], [ %indvars.iv.next20, %for.inc86 ]
br label %for.inc
; Reduction: nrm accumulates A[i][k] * A[i][k] for i = 0 .. 511.
for.inc: ; preds = %for.cond1.preheader, %for.inc
%indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.inc ]
%nrm.02 = phi double [ 0.000000e+00, %for.cond1.preheader ], [ %add, %for.inc ]
%arrayidx5 = getelementptr inbounds [512 x double], [512 x double]* %A, i64 %indvars.iv, i64 %indvars.iv24
%tmp = load double, double* %arrayidx5, align 8, !tbaa !1
%arrayidx9 = getelementptr inbounds [512 x double], [512 x double]* %A, i64 %indvars.iv, i64 %indvars.iv24
%tmp27 = load double, double* %arrayidx9, align 8, !tbaa !1
%mul = fmul double %tmp, %tmp27
%add = fadd double %nrm.02, %mul
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp ne i64 %indvars.iv.next, 512
br i1 %exitcond, label %for.inc, label %for.end
; R[k][k] = sqrt(nrm) — the host-scheduled statement.
for.end: ; preds = %for.inc
%add.lcssa = phi double [ %add, %for.inc ]
%call = tail call double @sqrt(double %add.lcssa) #2
%arrayidx13 = getelementptr inbounds [512 x double], [512 x double]* %R, i64 %indvars.iv24, i64 %indvars.iv24
store double %call, double* %arrayidx13, align 8, !tbaa !1
br label %for.body16
; Guard: inner j-loops run only while k + 1 < 512.
for.cond33.preheader: ; preds = %for.body16
%indvars.iv.next25 = add nuw nsw i64 %indvars.iv24, 1
%cmp347 = icmp slt i64 %indvars.iv.next25, 512
br i1 %cmp347, label %for.body35.lr.ph, label %for.inc86
for.body35.lr.ph: ; preds = %for.cond33.preheader
br label %for.body35
; Normalize: Q[i][k] = A[i][k] / R[k][k] for i = 0 .. 511 (kernel 0).
for.body16: ; preds = %for.end, %for.body16
%indvars.iv10 = phi i64 [ 0, %for.end ], [ %indvars.iv.next11, %for.body16 ]
%arrayidx20 = getelementptr inbounds [512 x double], [512 x double]* %A, i64 %indvars.iv10, i64 %indvars.iv24
%tmp28 = load double, double* %arrayidx20, align 8, !tbaa !1
%arrayidx24 = getelementptr inbounds [512 x double], [512 x double]* %R, i64 %indvars.iv24, i64 %indvars.iv24
%tmp29 = load double, double* %arrayidx24, align 8, !tbaa !1
%div = fdiv double %tmp28, %tmp29
%arrayidx28 = getelementptr inbounds [512 x double], [512 x double]* %Q, i64 %indvars.iv10, i64 %indvars.iv24
store double %div, double* %arrayidx28, align 8, !tbaa !1
%indvars.iv.next11 = add nuw nsw i64 %indvars.iv10, 1
%exitcond12 = icmp ne i64 %indvars.iv.next11, 512
br i1 %exitcond12, label %for.body16, label %for.cond33.preheader
; Latch of the j-loop: j++ until 512.
for.cond33.loopexit: ; preds = %for.body62
%indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1
%lftr.wideiv = trunc i64 %indvars.iv.next22 to i32
%exitcond23 = icmp ne i32 %lftr.wideiv, 512
br i1 %exitcond23, label %for.body35, label %for.cond33.for.inc86_crit_edge
; Column update loop over j = k + 1 .. 511: R[k][j] = 0, then accumulate.
for.body35: ; preds = %for.body35.lr.ph, %for.cond33.loopexit
%indvars.iv21 = phi i64 [ %indvars.iv19, %for.body35.lr.ph ], [ %indvars.iv.next22, %for.cond33.loopexit ]
%arrayidx39 = getelementptr inbounds [512 x double], [512 x double]* %R, i64 %indvars.iv24, i64 %indvars.iv21
store double 0.000000e+00, double* %arrayidx39, align 8, !tbaa !1
br label %for.body42
for.cond60.preheader: ; preds = %for.body42
br label %for.body62
; R[k][j] += Q[i][k] * A[i][j] for i = 0 .. 511.
for.body42: ; preds = %for.body35, %for.body42
%indvars.iv13 = phi i64 [ 0, %for.body35 ], [ %indvars.iv.next14, %for.body42 ]
%arrayidx46 = getelementptr inbounds [512 x double], [512 x double]* %Q, i64 %indvars.iv13, i64 %indvars.iv24
%tmp30 = load double, double* %arrayidx46, align 8, !tbaa !1
%arrayidx50 = getelementptr inbounds [512 x double], [512 x double]* %A, i64 %indvars.iv13, i64 %indvars.iv21
%tmp31 = load double, double* %arrayidx50, align 8, !tbaa !1
%mul51 = fmul double %tmp30, %tmp31
%arrayidx55 = getelementptr inbounds [512 x double], [512 x double]* %R, i64 %indvars.iv24, i64 %indvars.iv21
%tmp32 = load double, double* %arrayidx55, align 8, !tbaa !1
%add56 = fadd double %tmp32, %mul51
store double %add56, double* %arrayidx55, align 8, !tbaa !1
%indvars.iv.next14 = add nuw nsw i64 %indvars.iv13, 1
%exitcond15 = icmp ne i64 %indvars.iv.next14, 512
br i1 %exitcond15, label %for.body42, label %for.cond60.preheader
; A[i][j] -= Q[i][k] * R[k][j] for i = 0 .. 511.
for.body62: ; preds = %for.cond60.preheader, %for.body62
%indvars.iv16 = phi i64 [ 0, %for.cond60.preheader ], [ %indvars.iv.next17, %for.body62 ]
%arrayidx66 = getelementptr inbounds [512 x double], [512 x double]* %A, i64 %indvars.iv16, i64 %indvars.iv21
%tmp33 = load double, double* %arrayidx66, align 8, !tbaa !1
%arrayidx70 = getelementptr inbounds [512 x double], [512 x double]* %Q, i64 %indvars.iv16, i64 %indvars.iv24
%tmp34 = load double, double* %arrayidx70, align 8, !tbaa !1
%arrayidx74 = getelementptr inbounds [512 x double], [512 x double]* %R, i64 %indvars.iv24, i64 %indvars.iv21
%tmp35 = load double, double* %arrayidx74, align 8, !tbaa !1
%mul75 = fmul double %tmp34, %tmp35
%sub = fsub double %tmp33, %mul75
%arrayidx79 = getelementptr inbounds [512 x double], [512 x double]* %A, i64 %indvars.iv16, i64 %indvars.iv21
store double %sub, double* %arrayidx79, align 8, !tbaa !1
%indvars.iv.next17 = add nuw nsw i64 %indvars.iv16, 1
%exitcond18 = icmp ne i64 %indvars.iv.next17, 512
br i1 %exitcond18, label %for.body62, label %for.cond33.loopexit
for.cond33.for.inc86_crit_edge: ; preds = %for.cond33.loopexit
br label %for.inc86
; Column-loop latch: k++ until 512.
for.inc86: ; preds = %for.cond33.for.inc86_crit_edge, %for.cond33.preheader
%indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1
%exitcond26 = icmp ne i64 %indvars.iv.next25, 512
br i1 %exitcond26, label %for.cond1.preheader, label %for.end88
for.end88: ; preds = %for.inc86
ret void
}
; Function Attrs: argmemonly nounwind
declare void @llvm.lifetime.end(i64, i8* nocapture) #0
; Function Attrs: nounwind
declare double @sqrt(double) #2
attributes #0 = { argmemonly nounwind }
attributes #1 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind }
!llvm.ident = !{!0}
!0 = !{!"clang version 3.9.0 (trunk 275267) (llvm/trunk 275268)"}
!1 = !{!2, !2, i64 0}
!2 = !{!"double", !3, i64 0}
!3 = !{!"omnipotent char", !4, i64 0}
!4 = !{!"Simple C/C++ TBAA"}

View File

@@ -0,0 +1,41 @@
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
; RUN: -disable-output < %s | \
; RUN: FileCheck -check-prefix=CODE %s
; REQUIRES: pollyacc
; CODE: Code
; CODE: ====
; CODE: No code generated
source_filename = "bugpoint-output-83bcdeb.bc"
target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-unknown-linux-gnu"
@__data_radiation_MOD_cobi = external global [168 x double], align 32
; Function Attrs: nounwind uwtable
; Bugpoint-reduced reproducer (note the `undef` operands): a conditional
; preload of one element of @__data_radiation_MOD_cobi. Polly must report
; "No code generated" rather than crash (see CODE checks above). Do not
; "fix" the undefs — they are part of the reduced test.
define void @__radiation_rg_MOD_coe_so() #0 {
entry:
%polly.access.kspec.load = load i32, i32* undef, align 4
%0 = or i1 undef, undef
br label %polly.preload.cond29
polly.preload.cond29: ; preds = %entry
br i1 %0, label %polly.preload.exec31, label %polly.preload.merge30
; Merge: preloaded value, or 0.0 when the preload condition was false.
polly.preload.merge30: ; preds = %polly.preload.exec31, %polly.preload.cond29
%polly.preload..merge32 = phi double [ %polly.access.__data_radiation_MOD_cobi.load, %polly.preload.exec31 ], [ 0.000000e+00, %polly.preload.cond29 ]
ret void
; Preload path: loads cobi[7 * kspec + 48].
polly.preload.exec31: ; preds = %polly.preload.cond29
%1 = sext i32 %polly.access.kspec.load to i64
%2 = mul nsw i64 7, %1
%3 = add nsw i64 0, %2
%4 = add nsw i64 %3, 48
%polly.access.__data_radiation_MOD_cobi = getelementptr double, double* getelementptr inbounds ([168 x double], [168 x double]* @__data_radiation_MOD_cobi, i32 0, i32 0), i64 %4
%polly.access.__data_radiation_MOD_cobi.load = load double, double* %polly.access.__data_radiation_MOD_cobi, align 8
br label %polly.preload.merge30
}
attributes #0 = { nounwind uwtable }

View File

@@ -0,0 +1,76 @@
; RUN: opt %loadPolly -analyze -polly-scops < %s | FileCheck %s --check-prefix=SCOP
; RUN: opt %loadPolly -analyze -polly-codegen-ppcg -polly-acc-dump-kernel-ir < %s | FileCheck %s --check-prefix=KERNEL-IR
; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s --check-prefix=HOST-IR
; Test that we do recognise and codegen a kernel that has intrinsics.
; REQUIRES: pollyacc
; Check that we model the kernel as a scop.
; SCOP: Function: f
; SCOP-NEXT: Region: %entry.split---%for.end
; Check that the intrinsic call is present in the kernel IR.
; KERNEL-IR: %p_sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val_p_scalar_)
; KERNEL-IR: declare float @llvm.sqrt.f32(float)
; KERNEL-IR: declare float @llvm.fabs.f32(float)
; Check that kernel launch is generated in host IR.
; the declare would not be generated unless a call to a kernel exists.
; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
; void f(float *A, float *B, int N) {
; for(int i = 0; i < N; i++) {
; float tmp0 = A[i];
; float tmp1 = sqrt(tmp0);
; float tmp2 = fabs(tmp1);
; float tmp3 = copysignf(tmp1, tmp2);
; B[i] = tmp3;
; }
; }
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
; B[i] = copysign(sqrt(A[i]), fabs(sqrt(A[i]))) for i = 0 .. N-1.
; Exercises codegen of kernels that call LLVM intrinsics (sqrt, fabs,
; copysign) — the KERNEL-IR checks above verify the calls survive into
; the PTX kernel module.
define void @f(float* %A, float* %B, i32 %N) {
entry:
br label %entry.split
entry.split: ; preds = %entry
; Loop executes only for N > 0.
%cmp1 = icmp sgt i32 %N, 0
br i1 %cmp1, label %for.body.lr.ph, label %for.end
for.body.lr.ph: ; preds = %entry.split
br label %for.body
for.body: ; preds = %for.body.lr.ph, %for.body
%indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
%A.arr.i = getelementptr inbounds float, float* %A, i64 %indvars.iv
%A.arr.i.val = load float, float* %A.arr.i, align 4
; Call to intrinsics that should be part of the kernel.
%sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val)
%fabs = tail call float @llvm.fabs.f32(float %sqrt);
%copysign = tail call float @llvm.copysign.f32(float %sqrt, float %fabs);
%B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv
store float %copysign, float* %B.arr.i, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%wide.trip.count = zext i32 %N to i64
%exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
for.cond.for.end_crit_edge: ; preds = %for.body
br label %for.end
for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split
ret void
}
; Function Attrs: nounwind readnone
declare float @llvm.sqrt.f32(float) #0
declare float @llvm.fabs.f32(float) #0
declare float @llvm.copysign.f32(float, float) #0
attributes #0 = { nounwind readnone }

View File

@@ -0,0 +1,47 @@
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-fail-on-verify-module-failure \
; RUN: -disable-output < %s
; Make sure that if -polly-acc-fail-on-verify-module-failure is on, we actually
; fail on an illegal module.
; REQUIRES: pollyacc, asserts
; XFAIL: *
;
; void foo(long A[1024], long B[1024]) {
; for (long i = 0; i < 1024; i++)
; A[i] += (B[i] + (long)&B[i]);
; }
; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; A[i] += B[i] + (long)&B[i] for i = 0 .. 1023. The ptrtoint of a modeled
; array address is deliberately unsupported; with
; -polly-acc-fail-on-verify-module-failure the run must abort (XFAIL'd).
define void @foo(i64* %A, i64* %B) {
bb:
br label %bb1
bb1: ; preds = %bb10, %bb
%i.0 = phi i64 [ 0, %bb ], [ %tmp11, %bb10 ]
%exitcond = icmp ne i64 %i.0, 1024
br i1 %exitcond, label %bb2, label %bb12
bb2: ; preds = %bb1
; %tmp5 = (long)&B[i] — the address escapes into arithmetic.
%tmp = getelementptr inbounds i64, i64* %B, i64 %i.0
%tmp3 = load i64, i64* %tmp, align 8
%tmp4 = getelementptr inbounds i64, i64* %B, i64 %i.0
%tmp5 = ptrtoint i64* %tmp4 to i64
%tmp6 = add nsw i64 %tmp3, %tmp5
%tmp7 = getelementptr inbounds i64, i64* %A, i64 %i.0
%tmp8 = load i64, i64* %tmp7, align 8
%tmp9 = add nsw i64 %tmp8, %tmp6
store i64 %tmp9, i64* %tmp7, align 8
br label %bb10
bb10: ; preds = %bb2
%tmp11 = add nuw nsw i64 %i.0, 1
br label %bb1
bb12: ; preds = %bb1
ret void
}

View File

@@ -0,0 +1,73 @@
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
; RUN: -disable-output < %s | \
; RUN: FileCheck -check-prefix=CODE %s
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
; RUN: -disable-output < %s | \
; RUN: not FileCheck %s -check-prefix=KERNEL-IR
; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \
; RUN: FileCheck %s -check-prefix=IR
; REQUIRES: pollyacc
;
; void foo(long A[1024], long B[1024]) {
; for (long i = 0; i < 1024; i++)
; A[i] += (B[i] + (long)&B[i]);
; }
; This kernel loads/stores a pointer address we model. This is a rare case,
; were we still lack proper code-generation support. We check here that we
; detect the invalid IR and bail out gracefully.
; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (1024) * sizeof(i64), cudaMemcpyHostToDevice));
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i64), cudaMemcpyHostToDevice));
; CODE-NEXT: {
; CODE-NEXT: dim3 k0_dimBlock(32);
; CODE-NEXT: dim3 k0_dimGrid(32);
; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_B, dev_MemRef_A);
; CODE-NEXT: cudaCheckKernel();
; CODE-NEXT: }
; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i64), cudaMemcpyDeviceToHost));
; CODE: # kernel0
; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \
; RUN: FileCheck %s -check-prefix=IR
; KERNEL-IR: kernel
; IR: br i1 false, label %polly.start, label %bb1
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; Same kernel as the XFAIL variant: A[i] += B[i] + (long)&B[i]. Here the
; expectation is a graceful bail-out — no kernel is emitted (KERNEL-IR is
; checked with `not`) and the host branch folds to `br i1 false`.
define void @foo(i64* %A, i64* %B) {
bb:
br label %bb1
bb1: ; preds = %bb10, %bb
%i.0 = phi i64 [ 0, %bb ], [ %tmp11, %bb10 ]
%exitcond = icmp ne i64 %i.0, 1024
br i1 %exitcond, label %bb2, label %bb12
bb2: ; preds = %bb1
; ptrtoint of a modeled array address — unsupported in kernel codegen.
%tmp = getelementptr inbounds i64, i64* %B, i64 %i.0
%tmp3 = load i64, i64* %tmp, align 8
%tmp4 = getelementptr inbounds i64, i64* %B, i64 %i.0
%tmp5 = ptrtoint i64* %tmp4 to i64
%tmp6 = add nsw i64 %tmp3, %tmp5
%tmp7 = getelementptr inbounds i64, i64* %A, i64 %i.0
%tmp8 = load i64, i64* %tmp7, align 8
%tmp9 = add nsw i64 %tmp8, %tmp6
store i64 %tmp9, i64* %tmp7, align 8
br label %bb10
bb10: ; preds = %bb2
%tmp11 = add nuw nsw i64 %i.0, 1
br label %bb1
bb12: ; preds = %bb1
ret void
}

View File

@@ -0,0 +1,71 @@
; RUN: opt %loadPolly -analyze -polly-scops \
; RUN: -polly-invariant-load-hoisting < %s| FileCheck %s -check-prefix=SCOP
; RUN: opt %loadPolly -S -polly-codegen-ppcg \
; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR
; REQUIRES: pollyacc
; Check that we detect a scop.
; SCOP: Function: f
; SCOP-NEXT: Region: %for.body---%for.end
; SCOP-NEXT: Max Loop Depth: 1
; SCOP-NEXT: Invariant Accesses: {
; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
; SCOP-NEXT: [tmp] -> { Stmt_for_body[i0] -> MemRef_control[0] };
; SCOP-NEXT: Execution Context: [tmp] -> { : }
; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
; SCOP-NEXT: [tmp] -> { Stmt_if_then[i0] -> MemRef_readarr[0] };
; SCOP-NEXT: Execution Context: [tmp] -> { : tmp >= 4 }
; SCOP-NEXT: }
; Check that kernel launch is generated in host IR.
; the declare would not be generated unless a call to a kernel exists.
; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
; This test makes sure that such an access pattern is handled correctly
; by PPCGCodeGeneration. It appears that not calling `preloadInvariantLoads`
; was the main reason that caused this test case to crash.
;
; void f(int *arr, const int *control, const int *readarr) {
; for(int i = 0; i < 1000; i++) {
; int t = 0;
; if (*control > 3) {
; t += *readarr;
; }
; arr[i] = t;
; }
; }
target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
target triple = "i386-apple-macosx10.12.0"
; arr[i] = (*control > 3) ? *readarr : 0 for i = 0 .. 999. Both *control
; and *readarr are loop-invariant; *readarr is loaded only under the
; condition, giving it the conditional Execution Context "tmp >= 4" in
; the SCOP checks above. Exercises preloadInvariantLoads in PPCG codegen.
define void @f(i32* %arr, i32* %control, i32* %readarr) {
entry:
br label %entry.split
entry.split: ; preds = %entry
br label %for.body
for.body: ; preds = %entry.split, %if.end
%i.01 = phi i32 [ 0, %entry.split ], [ %inc, %if.end ]
; Invariant read of *control, re-executed every iteration in the IR.
%tmp = load i32, i32* %control, align 4
%cmp1 = icmp sgt i32 %tmp, 3
br i1 %cmp1, label %if.then, label %if.end
if.then: ; preds = %for.body
; Conditionally-executed invariant read of *readarr.
%tmp1 = load i32, i32* %readarr, align 4
br label %if.end
if.end: ; preds = %if.then, %for.body
%t.0 = phi i32 [ %tmp1, %if.then ], [ 0, %for.body ]
%arrayidx = getelementptr inbounds i32, i32* %arr, i32 %i.01
store i32 %t.0, i32* %arrayidx, align 4
%inc = add nuw nsw i32 %i.01, 1
%exitcond = icmp eq i32 %inc, 1000
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %if.end
ret void
}

View File

@@ -0,0 +1,30 @@
; RUN: opt %loadPolly -S -polly-codegen-ppcg \
; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s
; REQUIRES: pollyacc
; CHECK: store i64 %polly.access.B.load, i64* %invariant.preload.s2a
; CHECK: %invariant.final_reload = load i64, i64* %invariant.preload.s2a
; Verify that the final reload of an invariant scalar memory access uses the
; same stack slot that into which the invariant memory access was stored
; originally. Earlier, this was broken as we introduce a new stack slot aside
; of the preload stack slot, which remained uninitialized and caused our escaping
; loads to contain garbage.
; Fills A[0..1025] with 42.0 while loading the loop-invariant *B, which
; escapes as the return value. The CHECK lines above verify the escaping
; final reload reuses the same preload stack slot the hoisted load was
; stored into (a previously-broken path that returned garbage).
define i64 @foo(float* %A, i64* %B) {
entry:
br label %loop
loop:
%indvar = phi i64 [0, %entry], [%indvar.next, %loop]
%indvar.next = add nsw i64 %indvar, 1
%idx = getelementptr float, float* %A, i64 %indvar
store float 42.0, float* %idx
; Invariant load — hoisted by -polly-invariant-load-hoisting.
%invariant = load i64, i64* %B
%cmp = icmp sle i64 %indvar, 1024
br i1 %cmp, label %loop, label %exit
exit:
; Escaping use of the hoisted value: must be reloaded from the preload slot.
ret i64 %invariant
}

Some files were not shown because too many files have changed in this diff Show More