Imported Upstream version 6.10.0.49

Former-commit-id: 1d6753294b2993e1fbf92de9366bb9544db4189b
2020-01-16 16:38:04 +00:00
parent d94e79959b
commit 468663ddbb
48518 changed files with 2789335 additions and 61176 deletions
--- a/external/llvm-project/polly/test/ScheduleOptimizer/2012-03-16-Empty-Domain.ll
+++ b/external/llvm-project/polly/test/ScheduleOptimizer/2012-03-16-Empty-Domain.ll
@@ -0,0 +1,16 @@
+; RUN: opt %loadPolly -polly-opt-isl -S < %s
+target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-a0:0-n32"
+
+define void @sdbout_label() nounwind {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.cond, %entry
+  %0 = phi i32 [ 0, %entry ], [ %1, %for.cond ]
+  %1 = add nsw i32 %0, 1
+  %exitcond72 = icmp eq i32 %1, 7
+  br i1 %exitcond72, label %sw.epilog66, label %for.cond
+
+sw.epilog66:                                      ; preds = %for.cond
+  ret void
+}
--- a/external/llvm-project/polly/test/ScheduleOptimizer/2012-04-16-Trivially-vectorizable-loops.ll
+++ b/external/llvm-project/polly/test/ScheduleOptimizer/2012-04-16-Trivially-vectorizable-loops.ll
@@ -0,0 +1,204 @@
+; RUN: opt %loadPolly -basicaa -polly-opt-isl -polly-vectorizer=polly < %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] }
+%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }
+
+@A = common global [1536 x [1536 x float]] zeroinitializer, align 16
+@B = common global [1536 x [1536 x float]] zeroinitializer, align 16
+@stdout = external global %struct._IO_FILE*
+@.str = private unnamed_addr constant [5 x i8] c"%lf \00", align 1
+@C = common global [1536 x [1536 x float]] zeroinitializer, align 16
+@.str1 = private unnamed_addr constant [2 x i8] c"\0A\00", align 1
+
+define void @init_array() nounwind uwtable {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc17, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc18, %for.inc17 ]
+  %cmp = icmp slt i32 %i.0, 1536
+  br i1 %cmp, label %for.body, label %for.end19
+
+for.body:                                         ; preds = %for.cond
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %for.inc, %for.body
+  %j.0 = phi i32 [ 0, %for.body ], [ %inc, %for.inc ]
+  %cmp2 = icmp slt i32 %j.0, 1536
+  br i1 %cmp2, label %for.body3, label %for.end
+
+for.body3:                                        ; preds = %for.cond1
+  %mul = mul nsw i32 %i.0, %j.0
+  %rem = srem i32 %mul, 1024
+  %add = add nsw i32 1, %rem
+  %conv = sitofp i32 %add to double
+  %div = fdiv double %conv, 2.000000e+00
+  %conv4 = fptrunc double %div to float
+  %idxprom = sext i32 %j.0 to i64
+  %idxprom5 = sext i32 %i.0 to i64
+  %arrayidx = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @A, i32 0, i64 %idxprom5
+  %arrayidx6 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx, i32 0, i64 %idxprom
+  store float %conv4, float* %arrayidx6, align 4
+  %mul7 = mul nsw i32 %i.0, %j.0
+  %rem8 = srem i32 %mul7, 1024
+  %add9 = add nsw i32 1, %rem8
+  %conv10 = sitofp i32 %add9 to double
+  %div11 = fdiv double %conv10, 2.000000e+00
+  %conv12 = fptrunc double %div11 to float
+  %idxprom13 = sext i32 %j.0 to i64
+  %idxprom14 = sext i32 %i.0 to i64
+  %arrayidx15 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @B, i32 0, i64 %idxprom14
+  %arrayidx16 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx15, i32 0, i64 %idxprom13
+  store float %conv12, float* %arrayidx16, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body3
+  %inc = add nsw i32 %j.0, 1
+  br label %for.cond1
+
+for.end:                                          ; preds = %for.cond1
+  br label %for.inc17
+
+for.inc17:                                        ; preds = %for.end
+  %inc18 = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end19:                                        ; preds = %for.cond
+  ret void
+}
+
+define void @print_array() nounwind uwtable {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc10, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc11, %for.inc10 ]
+  %cmp = icmp slt i32 %i.0, 1536
+  br i1 %cmp, label %for.body, label %for.end12
+
+for.body:                                         ; preds = %for.cond
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %for.inc, %for.body
+  %j.0 = phi i32 [ 0, %for.body ], [ %inc, %for.inc ]
+  %cmp2 = icmp slt i32 %j.0, 1536
+  br i1 %cmp2, label %for.body3, label %for.end
+
+for.body3:                                        ; preds = %for.cond1
+  %0 = load %struct._IO_FILE*, %struct._IO_FILE** @stdout, align 8
+  %idxprom = sext i32 %j.0 to i64
+  %idxprom4 = sext i32 %i.0 to i64
+  %arrayidx = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i32 0, i64 %idxprom4
+  %arrayidx5 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx, i32 0, i64 %idxprom
+  %1 = load float, float* %arrayidx5, align 4
+  %conv = fpext float %1 to double
+  %call = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %0, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), double %conv)
+  %rem = srem i32 %j.0, 80
+  %cmp6 = icmp eq i32 %rem, 79
+  br i1 %cmp6, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body3
+  %2 = load %struct._IO_FILE*, %struct._IO_FILE** @stdout, align 8
+  %call8 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %2, i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str1, i32 0, i32 0))
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body3
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %inc = add nsw i32 %j.0, 1
+  br label %for.cond1
+
+for.end:                                          ; preds = %for.cond1
+  %3 = load %struct._IO_FILE*, %struct._IO_FILE** @stdout, align 8
+  %call9 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %3, i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str1, i32 0, i32 0))
+  br label %for.inc10
+
+for.inc10:                                        ; preds = %for.end
+  %inc11 = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end12:                                        ; preds = %for.cond
+  ret void
+}
+
+declare i32 @fprintf(%struct._IO_FILE*, i8*, ...)
+
+define i32 @main() nounwind uwtable {
+entry:
+  call void @init_array()
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc28, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc29, %for.inc28 ]
+  %cmp = icmp slt i32 %i.0, 1536
+  br i1 %cmp, label %for.body, label %for.end30
+
+for.body:                                         ; preds = %for.cond
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %for.inc25, %for.body
+  %j.0 = phi i32 [ 0, %for.body ], [ %inc26, %for.inc25 ]
+  %cmp2 = icmp slt i32 %j.0, 1536
+  br i1 %cmp2, label %for.body3, label %for.end27
+
+for.body3:                                        ; preds = %for.cond1
+  %idxprom = sext i32 %j.0 to i64
+  %idxprom4 = sext i32 %i.0 to i64
+  %arrayidx = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i32 0, i64 %idxprom4
+  %arrayidx5 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx, i32 0, i64 %idxprom
+  store float 0.000000e+00, float* %arrayidx5, align 4
+  br label %for.cond6
+
+for.cond6:                                        ; preds = %for.inc, %for.body3
+  %k.0 = phi i32 [ 0, %for.body3 ], [ %inc, %for.inc ]
+  %cmp7 = icmp slt i32 %k.0, 1536
+  br i1 %cmp7, label %for.body8, label %for.end
+
+for.body8:                                        ; preds = %for.cond6
+  %idxprom9 = sext i32 %j.0 to i64
+  %idxprom10 = sext i32 %i.0 to i64
+  %arrayidx11 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i32 0, i64 %idxprom10
+  %arrayidx12 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx11, i32 0, i64 %idxprom9
+  %0 = load float, float* %arrayidx12, align 4
+  %idxprom13 = sext i32 %k.0 to i64
+  %idxprom14 = sext i32 %i.0 to i64
+  %arrayidx15 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @A, i32 0, i64 %idxprom14
+  %arrayidx16 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx15, i32 0, i64 %idxprom13
+  %1 = load float, float* %arrayidx16, align 4
+  %idxprom17 = sext i32 %j.0 to i64
+  %idxprom18 = sext i32 %k.0 to i64
+  %arrayidx19 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @B, i32 0, i64 %idxprom18
+  %arrayidx20 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx19, i32 0, i64 %idxprom17
+  %2 = load float, float* %arrayidx20, align 4
+  %mul = fmul float %1, %2
+  %add = fadd float %0, %mul
+  %idxprom21 = sext i32 %j.0 to i64
+  %idxprom22 = sext i32 %i.0 to i64
+  %arrayidx23 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i32 0, i64 %idxprom22
+  %arrayidx24 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx23, i32 0, i64 %idxprom21
+  store float %add, float* %arrayidx24, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body8
+  %inc = add nsw i32 %k.0, 1
+  br label %for.cond6
+
+for.end:                                          ; preds = %for.cond6
+  br label %for.inc25
+
+for.inc25:                                        ; preds = %for.end
+  %inc26 = add nsw i32 %j.0, 1
+  br label %for.cond1
+
+for.end27:                                        ; preds = %for.cond1
+  br label %for.inc28
+
+for.inc28:                                        ; preds = %for.end27
+  %inc29 = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end30:                                        ; preds = %for.cond
+  ret i32 0
+}
--- a/external/llvm-project/polly/test/ScheduleOptimizer/2013-04-11-Empty-Domain-two.ll
+++ b/external/llvm-project/polly/test/ScheduleOptimizer/2013-04-11-Empty-Domain-two.ll
@@ -0,0 +1,23 @@
+; RUN: opt %loadPolly -polly-opt-isl -S < %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; Check that we handle statements with an empty iteration domain correctly.
+
+define void @f() {
+entry:
+  %A = alloca double
+  br label %for
+
+for:
+  %indvar = phi i32 [ %indvar.next, %for.inc ], [ 0, %entry ]
+  %exitcond = icmp ne i32 %indvar, -1
+  br i1 %exitcond, label %for.inc, label %return
+
+for.inc:
+  %indvar.next = add i32 %indvar, 1
+  store double 1.0, double* %A
+  br label %for
+
+return:
+  ret void
+}
--- a/external/llvm-project/polly/test/ScheduleOptimizer/computeout.ll
+++ b/external/llvm-project/polly/test/ScheduleOptimizer/computeout.ll
@@ -0,0 +1,69 @@
+; RUN: opt -S %loadPolly -basicaa -polly-opt-isl -polly-opt-fusion=max -polly-ast -analyze < %s | FileCheck %s 
+; RUN: opt -S %loadPolly -basicaa -polly-opt-isl -polly-opt-fusion=max -polly-ast -analyze -polly-dependences-computeout=1 < %s | FileCheck %s -check-prefix=TIMEOUT
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+;     for(i = 0; i < 100; i++ )
+; S1:   A[i] = 2;
+;
+;     for (i = 0; i < 10; i++ )
+; S2:   A[i]  = 5;
+;
+;     for (i = 0; i < 200; i++ )
+; S3:   A[i] = 5;
+
+define void @sequential_writes() {
+entry:
+  %A = alloca [200 x i32]
+  br label %S1
+
+S1:
+  %indvar.1 = phi i64 [ 0, %entry ], [ %indvar.next.1, %S1 ]
+  %arrayidx.1 = getelementptr [200 x i32], [200 x i32]* %A, i64 0, i64 %indvar.1
+  store i32 2, i32* %arrayidx.1
+  %indvar.next.1 = add i64 %indvar.1, 1
+  %exitcond.1 = icmp ne i64 %indvar.next.1, 100
+  br i1 %exitcond.1, label %S1, label %exit.1
+
+exit.1:
+  br label %S2
+
+S2:
+  %indvar.2 = phi i64 [ 0, %exit.1 ], [ %indvar.next.2, %S2 ]
+  %arrayidx.2 = getelementptr [200 x i32], [200 x i32]* %A, i64 0, i64 %indvar.2
+  store i32 5, i32* %arrayidx.2
+  %indvar.next.2 = add i64 %indvar.2, 1
+  %exitcond.2 = icmp ne i64 %indvar.next.2, 10
+  br i1 %exitcond.2, label %S2, label %exit.2
+
+exit.2:
+  br label %S3
+
+S3:
+  %indvar.3 = phi i64 [ 0, %exit.2 ], [ %indvar.next.3, %S3 ]
+  %arrayidx.3 = getelementptr [200 x i32], [200 x i32]* %A, i64 0, i64 %indvar.3
+  store i32 7, i32* %arrayidx.3
+  %indvar.next.3 = add i64 %indvar.3, 1
+  %exitcond.3 = icmp ne i64 %indvar.next.3, 200
+  br i1 %exitcond.3, label %S3 , label %exit.3
+
+exit.3:
+  ret void
+}
+
+
+; CHECK: for (int c0 = 0; c0 <= 199; c0 += 1) {
+; CHECK:   if (c0 <= 99) {
+; CHECK:     Stmt_S1(c0);
+; CHECK:     if (c0 <= 9)
+; CHECK:       Stmt_S2(c0);
+; CHECK:   }
+; CHECK:   Stmt_S3(c0);
+; CHECK: }
+
+; TIMEOUT: for (int c0 = 0; c0 <= 99; c0 += 1)
+; TIMEOUT: Stmt_S1(c0);
+; TIMEOUT: for (int c0 = 0; c0 <= 9; c0 += 1)
+; TIMEOUT: Stmt_S2(c0);
+; TIMEOUT: for (int c0 = 0; c0 <= 199; c0 += 1)
+; TIMEOUT: Stmt_S3(c0);
+
--- a/external/llvm-project/polly/test/ScheduleOptimizer/ensure-correct-tile-sizes.ll
+++ b/external/llvm-project/polly/test/ScheduleOptimizer/ensure-correct-tile-sizes.ll
@@ -0,0 +1,234 @@
+; RUN: opt %loadPolly -analyze -polly-process-unprofitable  -polly-remarks-minimal \
+; RUN:     -polly-opt-isl  -polly-pattern-matching-based-opts=true \
+; RUN:     -polly-target-throughput-vector-fma=1 \
+; RUN:     -polly-target-latency-vector-fma=1 \
+; RUN:     -polly-ast -polly-target-vector-register-bitwidth=4096 \
+; RUN:     -polly-target-1st-cache-level-associativity=3 < %s | FileCheck %s
+;
+;     /* Test that Polly does not crash due to configurations that can lead to
+;    incorrect tile size computations.
+;    The parameters are setup such that Car in `getMacroKernelParams`
+;    is evaluated to 0. */
+;
+;    static const int N = 3000;
+;
+;    void f(int A[N][N], int B[N][N], int C[N][N]) {
+;      for (int i = 0; i < N; i++) {
+;        for (int j = 0; j < N; j++) {
+;          A[i][j] = 0;
+;          for (int k = 0; k < N; k++) {
+;            A[i][j] += B[i][k] * C[k][j];
+;          }
+;        }
+;      }
+;    }
+;
+; CHECK:           // 1st level tiling - Tiles
+; CHECK-NEXT:      for (int c0 = 0; c0 <= 93; c0 += 1)
+; CHECK-NEXT:        for (int c1 = 0; c1 <= 93; c1 += 1) {
+; CHECK-NEXT:          // 1st level tiling - Points
+; CHECK-NEXT:          for (int c2 = 0; c2 <= min(31, -32 * c0 + 2999); c2 += 1)
+; CHECK-NEXT:            for (int c3 = 0; c3 <= min(31, -32 * c1 + 2999); c3 += 1)
+; CHECK-NEXT:              Stmt_for_body3(32 * c0 + c2, 32 * c1 + c3);
+; CHECK-NEXT:        }
+; CHECK-NEXT:      // Inter iteration alias-free
+; CHECK-NEXT:      // Register tiling - Tiles
+; CHECK-NEXT:      for (int c0 = 0; c0 <= 23; c0 += 1)
+; CHECK-NEXT:        for (int c1 = 0; c1 <= 2999; c1 += 1)
+; CHECK-NEXT:          for (int c2 = 0; c2 <= 2999; c2 += 1) {
+; CHECK-NEXT:            // Register tiling - Points
+; CHECK-NEXT:            {
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 1, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 2, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 3, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 4, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 5, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 6, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 7, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 8, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 9, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 10, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 11, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 12, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 13, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 14, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 15, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 16, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 17, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 18, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 19, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 20, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 21, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 22, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 23, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 24, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 25, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 26, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 27, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 28, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 29, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 30, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 31, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 32, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 33, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 34, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 35, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 36, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 37, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 38, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 39, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 40, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 41, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 42, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 43, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 44, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 45, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 46, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 47, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 48, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 49, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 50, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 51, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 52, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 53, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 54, c2);
+; CHECK-NEXT:              Stmt_for_body8(c1, 128 * c0 + 55, c2);
+; CHECK-NEXT:              if (c0 <= 22) {
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 56, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 57, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 58, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 59, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 60, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 61, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 62, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 63, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 64, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 65, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 66, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 67, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 68, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 69, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 70, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 71, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 72, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 73, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 74, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 75, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 76, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 77, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 78, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 79, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 80, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 81, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 82, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 83, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 84, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 85, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 86, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 87, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 88, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 89, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 90, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 91, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 92, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 93, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 94, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 95, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 96, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 97, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 98, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 99, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 100, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 101, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 102, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 103, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 104, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 105, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 106, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 107, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 108, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 109, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 110, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 111, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 112, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 113, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 114, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 115, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 116, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 117, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 118, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 119, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 120, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 121, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 122, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 123, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 124, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 125, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 126, c2);
+; CHECK-NEXT:                Stmt_for_body8(c1, 128 * c0 + 127, c2);
+; CHECK-NEXT:              }
+; CHECK-NEXT:            }
+; CHECK-NEXT:          }
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f([3000 x i32]* %A, [3000 x i32]* %B, [3000 x i32]* %C) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc24, %entry
+  %indvars.iv4 = phi i64 [ %indvars.iv.next5, %for.inc24 ], [ 0, %entry ]
+  %exitcond6 = icmp ne i64 %indvars.iv4, 3000
+  br i1 %exitcond6, label %for.body, label %for.end26
+
+for.body:                                         ; preds = %for.cond
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %for.inc21, %for.body
+  %indvars.iv1 = phi i64 [ %indvars.iv.next2, %for.inc21 ], [ 0, %for.body ]
+  %exitcond3 = icmp ne i64 %indvars.iv1, 3000
+  br i1 %exitcond3, label %for.body3, label %for.end23
+
+for.body3:                                        ; preds = %for.cond1
+  %arrayidx5 = getelementptr inbounds [3000 x i32], [3000 x i32]* %A, i64 %indvars.iv4, i64 %indvars.iv1
+  store i32 0, i32* %arrayidx5, align 4
+  br label %for.cond6
+
+for.cond6:                                        ; preds = %for.inc, %for.body3
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %for.body3 ]
+  %exitcond = icmp ne i64 %indvars.iv, 3000
+  br i1 %exitcond, label %for.body8, label %for.end
+
+for.body8:                                        ; preds = %for.cond6
+  %arrayidx12 = getelementptr inbounds [3000 x i32], [3000 x i32]* %B, i64 %indvars.iv4, i64 %indvars.iv
+  %tmp = load i32, i32* %arrayidx12, align 4
+  %arrayidx16 = getelementptr inbounds [3000 x i32], [3000 x i32]* %C, i64 %indvars.iv, i64 %indvars.iv1
+  %tmp7 = load i32, i32* %arrayidx16, align 4
+  %mul = mul nsw i32 %tmp, %tmp7
+  %arrayidx20 = getelementptr inbounds [3000 x i32], [3000 x i32]* %A, i64 %indvars.iv4, i64 %indvars.iv1
+  %tmp8 = load i32, i32* %arrayidx20, align 4
+  %add = add nsw i32 %tmp8, %mul
+  store i32 %add, i32* %arrayidx20, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %for.cond6
+
+for.end:                                          ; preds = %for.cond6
+  br label %for.inc21
+
+for.inc21:                                        ; preds = %for.end
+  %indvars.iv.next2 = add nuw nsw i64 %indvars.iv1, 1
+  br label %for.cond1
+
+for.end23:                                        ; preds = %for.cond1
+  br label %for.inc24
+
+for.inc24:                                        ; preds = %for.end23
+  %indvars.iv.next5 = add nuw nsw i64 %indvars.iv4, 1
+  br label %for.cond
+
+for.end26:                                        ; preds = %for.cond
+  ret void
+}
--- a/external/llvm-project/polly/test/ScheduleOptimizer/full_partial_tile_separation.ll
+++ b/external/llvm-project/polly/test/ScheduleOptimizer/full_partial_tile_separation.ll
@@ -0,0 +1,91 @@
+; RUN: opt -S %loadPolly -polly-pattern-matching-based-opts=false \
+; RUN: -polly-vectorizer=stripmine -polly-opt-isl -polly-ast -analyze \
+; RUN: < %s | FileCheck %s
+; CHECK:          // 1st level tiling - Tiles
+; CHECK-NEXT:    #pragma known-parallel
+; CHECK-NEXT:    for (int c0 = 0; c0 <= floord(ni - 1, 32); c0 += 1)
+; CHECK-NEXT:      for (int c1 = 0; c1 <= floord(nj - 1, 32); c1 += 1)
+; CHECK-NEXT:        for (int c2 = 0; c2 <= floord(nk - 1, 32); c2 += 1) {
+; CHECK-NEXT:          // 1st level tiling - Points
+; CHECK-NEXT:          for (int c3 = 0; c3 <= min(31, ni - 32 * c0 - 1); c3 += 1) {
+; CHECK-NEXT:            for (int c4 = 0; c4 <= min(7, -8 * c1 + nj / 4 - 1); c4 += 1)
+; CHECK-NEXT:              for (int c5 = 0; c5 <= min(31, nk - 32 * c2 - 1); c5 += 1) {
+; CHECK-NEXT:                // SIMD
+; CHECK-NEXT:                for (int c6 = 0; c6 <= 3; c6 += 1)
+; CHECK-NEXT:                  Stmt_for_body_6(32 * c0 + c3, 32 * c1 + 4 * c4 + c6, 32 * c2 + c5);
+; CHECK-NEXT:              }
+; CHECK-NEXT:            if (32 * c1 + 31 >= nj)
+; CHECK-NEXT:              #pragma minimal dependence distance: 1
+; CHECK-NEXT:              for (int c5 = 0; c5 <= min(31, nk - 32 * c2 - 1); c5 += 1) {
+; CHECK-NEXT:                // SIMD
+; CHECK-NEXT:                for (int c6 = 0; c6 < nj % 4; c6 += 1)
+; CHECK-NEXT:                  Stmt_for_body_6(32 * c0 + c3, -(nj % 4) + nj + c6, 32 * c2 + c5);
+; CHECK-NEXT:              }
+; CHECK-NEXT:          }
+; CHECK-NEXT:        }
+
+; Function Attrs: nounwind uwtable
+define void @kernel_gemm(i32 %ni, i32 %nj, i32 %nk, double %alpha, double %beta, [1024 x double]* %C, [1024 x double]* %A, [1024 x double]* %B) #0 {
+entry:
+  %cmp.27 = icmp sgt i32 %ni, 0
+  br i1 %cmp.27, label %for.cond.1.preheader.lr.ph, label %for.end.22
+
+for.cond.1.preheader.lr.ph:                       ; preds = %entry
+  br label %for.cond.1.preheader
+
+for.cond.1.preheader:                             ; preds = %for.cond.1.preheader.lr.ph, %for.inc.20
+  %indvars.iv33 = phi i64 [ 0, %for.cond.1.preheader.lr.ph ], [ %indvars.iv.next34, %for.inc.20 ]
+  %cmp2.25 = icmp sgt i32 %nj, 0
+  br i1 %cmp2.25, label %for.cond.4.preheader.lr.ph, label %for.inc.20
+
+for.cond.4.preheader.lr.ph:                       ; preds = %for.cond.1.preheader
+  br label %for.cond.4.preheader
+
+for.cond.4.preheader:                             ; preds = %for.cond.4.preheader.lr.ph, %for.inc.17
+  %indvars.iv29 = phi i64 [ 0, %for.cond.4.preheader.lr.ph ], [ %indvars.iv.next30, %for.inc.17 ]
+  %cmp5.23 = icmp sgt i32 %nk, 0
+  br i1 %cmp5.23, label %for.body.6.lr.ph, label %for.inc.17
+
+for.body.6.lr.ph:                                 ; preds = %for.cond.4.preheader
+  br label %for.body.6
+
+for.body.6:                                       ; preds = %for.body.6.lr.ph, %for.body.6
+  %indvars.iv = phi i64 [ 0, %for.body.6.lr.ph ], [ %indvars.iv.next, %for.body.6 ]
+  %arrayidx8 = getelementptr inbounds [1024 x double], [1024 x double]* %A, i64 %indvars.iv33, i64 %indvars.iv
+  %0 = load double, double* %arrayidx8, align 8
+  %arrayidx12 = getelementptr inbounds [1024 x double], [1024 x double]* %B, i64 %indvars.iv, i64 %indvars.iv29
+  %1 = load double, double* %arrayidx12, align 8
+  %mul = fmul double %0, %1
+  %arrayidx16 = getelementptr inbounds [1024 x double], [1024 x double]* %C, i64 %indvars.iv33, i64 %indvars.iv29
+  %2 = load double, double* %arrayidx16, align 8
+  %add = fadd double %2, %mul
+  store double %add, double* %arrayidx16, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, %nk
+  br i1 %exitcond, label %for.body.6, label %for.cond.4.for.inc.17_crit_edge
+
+for.cond.4.for.inc.17_crit_edge:                  ; preds = %for.body.6
+  br label %for.inc.17
+
+for.inc.17:                                       ; preds = %for.cond.4.for.inc.17_crit_edge, %for.cond.4.preheader
+  %indvars.iv.next30 = add nuw nsw i64 %indvars.iv29, 1
+  %lftr.wideiv31 = trunc i64 %indvars.iv.next30 to i32
+  %exitcond32 = icmp ne i32 %lftr.wideiv31, %nj
+  br i1 %exitcond32, label %for.cond.4.preheader, label %for.cond.1.for.inc.20_crit_edge
+
+for.cond.1.for.inc.20_crit_edge:                  ; preds = %for.inc.17
+  br label %for.inc.20
+
+for.inc.20:                                       ; preds = %for.cond.1.for.inc.20_crit_edge, %for.cond.1.preheader
+  %indvars.iv.next34 = add nuw nsw i64 %indvars.iv33, 1
+  %lftr.wideiv35 = trunc i64 %indvars.iv.next34 to i32
+  %exitcond36 = icmp ne i32 %lftr.wideiv35, %ni
+  br i1 %exitcond36, label %for.cond.1.preheader, label %for.cond.for.end.22_crit_edge
+
+for.cond.for.end.22_crit_edge:                    ; preds = %for.inc.20
+  br label %for.end.22
+
+for.end.22:                                       ; preds = %for.cond.for.end.22_crit_edge, %entry
+  ret void
+}
--- a/external/llvm-project/polly/test/ScheduleOptimizer/kernel_gemm___%for.body---%for.end24.jscop
+++ b/external/llvm-project/polly/test/ScheduleOptimizer/kernel_gemm___%for.body---%for.end24.jscop
@@ -0,0 +1,55 @@
+{
+   "arrays" : [
+      {
+         "name" : "MemRef_C1",
+         "sizes" : [ "*" ],
+         "type" : "double"
+      },
+      {
+         "name" : "MemRef_A",
+         "sizes" : [ "*", "1024" ],
+         "type" : "double"
+      },
+      {
+         "name" : "MemRef_B",
+         "sizes" : [ "*", "1024" ],
+         "type" : "double"
+      },
+      {
+         "name" : "MemRef_C",
+         "sizes" : [ "*", "1024" ],
+         "type" : "double"
+      }
+   ],
+   "context" : "{  :  }",
+   "name" : "%for.body---%for.end24",
+   "statements" : [
+      {
+         "accesses" : [
+            {
+               "kind" : "read",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_C1[0] }"
+            },
+            {
+               "kind" : "read",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[i0, i2] }"
+            },
+            {
+               "kind" : "read",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_B[i2, i1] }"
+            },
+            {
+               "kind" : "read",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_C[i0, i1] }"
+            },
+            {
+               "kind" : "write",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_C[i0, i1] }"
+            }
+         ],
+         "domain" : "{ Stmt_for_body6[i0, i1, i2] : 0 <= i0 <= 1023 and 0 <= i1 <= 1023 and 0 <= i2 <= 1023 }",
+         "name" : "Stmt_for_body6",
+         "schedule" : "{ Stmt_for_body6[i0, i1, i2] -> [i0, i1, i2] }"
+      }
+   ]
+}
--- a/external/llvm-project/polly/test/ScheduleOptimizer/kernel_gemm___%for.body---%for.end24.jscop.transformed
+++ b/external/llvm-project/polly/test/ScheduleOptimizer/kernel_gemm___%for.body---%for.end24.jscop.transformed
@@ -0,0 +1,55 @@
+{
+   "arrays" : [
+      {
+         "name" : "MemRef_C1",
+         "sizes" : [ "*" ],
+         "type" : "double"
+      },
+      {
+         "name" : "MemRef_A",
+         "sizes" : [ "*", "1024" ],
+         "type" : "double"
+      },
+      {
+         "name" : "MemRef_B",
+         "sizes" : [ "*", "1024" ],
+         "type" : "double"
+      },
+      {
+         "name" : "MemRef_C",
+         "sizes" : [ "*", "1024" ],
+         "type" : "double"
+      }
+   ],
+   "context" : "{  :  }",
+   "name" : "%for.body---%for.end24",
+   "statements" : [
+      {
+         "accesses" : [
+            {
+               "kind" : "read",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_C[i0, i1] }"
+            },
+            {
+               "kind" : "read",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[i0, i2] }"
+            },
+            {
+               "kind" : "read",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_B[i2, i1] }"
+            },
+            {
+               "kind" : "read",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_C1[0] }"
+            },
+            {
+               "kind" : "write",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_C[i0, i1] }"
+            }
+         ],
+         "domain" : "{ Stmt_for_body6[i0, i1, i2] : 0 <= i0 <= 1023 and 0 <= i1 <= 1023 and 0 <= i2 <= 1023 }",
+         "name" : "Stmt_for_body6",
+         "schedule" : "{ Stmt_for_body6[i0, i1, i2] -> [i0, i1, i2] }"
+      }
+   ]
+}
--- a/external/llvm-project/polly/test/ScheduleOptimizer/kernel_gemm___%for.cond1.preheader---%for.end18.jscop.transformed
+++ b/external/llvm-project/polly/test/ScheduleOptimizer/kernel_gemm___%for.cond1.preheader---%for.end18.jscop.transformed
@@ -0,0 +1,46 @@
+{
+   "arrays" : [
+      {
+         "name" : "MemRef_B",
+         "sizes" : [ "*", "1024" ],
+         "type" : "double"
+      },
+      {
+         "name" : "MemRef_C",
+         "sizes" : [ "*", "1024" ],
+         "type" : "double"
+      },
+      {
+         "name" : "New_MemRef_A",
+         "sizes" : [ "1024", "1024" ],
+         "type" : "double"
+      }
+   ],
+   "context" : "{  :  }",
+   "name" : "%for.cond1.preheader---%for.end18",
+   "statements" : [
+      {
+         "accesses" : [
+            {
+               "kind" : "read",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_B[i2, i1] }"
+            },
+            {
+               "kind" : "read",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> New_MemRef_A[i0, i2] }"
+            },
+            {
+               "kind" : "read",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_C[i0, i1] }"
+            },
+            {
+               "kind" : "write",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_C[i0, i1] }"
+            }
+         ],
+         "domain" : "{ Stmt_for_body6[i0, i1, i2] : 0 <= i0 <= 1023 and 0 <= i1 <= 1023 and 0 <= i2 <= 1023 }",
+         "name" : "Stmt_for_body6",
+         "schedule" : "{ Stmt_for_body6[i0, i1, i2] -> [i0, i1, i2] }"
+      }
+   ]
+}
--- a/external/llvm-project/polly/test/ScheduleOptimizer/line-tiling-2.ll
+++ b/external/llvm-project/polly/test/ScheduleOptimizer/line-tiling-2.ll
@@ -0,0 +1,40 @@
+; RUN: opt %loadPolly -polly-opt-isl -analyze \
+; RUN:                -polly-ast -polly-tile-sizes=1,64 < %s | FileCheck %s
+
+; CHECK: for (int c0 = 0; c0 <= 1023; c0 += 1)
+; CHECK:   for (int c1 = 0; c1 <= 7; c1 += 1)
+; CHECK:     for (int c3 = 0; c3 <= 63; c3 += 1)
+; CHECK:       Stmt_for_body3(c0, 64 * c1 + c3);
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+; Function Attrs: nounwind
+define void @line([512 x i32]* %A) {
+entry:
+  br label %entry.split
+
+entry.split:                                      ; preds = %entry
+  br label %for.body3.lr.ph
+
+for.body3.lr.ph:                                  ; preds = %for.inc5, %entry.split
+  %i.0 = phi i32 [ 0, %entry.split ], [ %inc6, %for.inc5 ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3.lr.ph, %for.body3
+  %j.0 = phi i32 [ 0, %for.body3.lr.ph ], [ %inc, %for.body3 ]
+  %mul = mul nsw i32 %j.0, %i.0
+  %rem = srem i32 %mul, 42
+  %arrayidx4 = getelementptr inbounds [512 x i32], [512 x i32]* %A, i32 %i.0, i32 %j.0
+  store i32 %rem, i32* %arrayidx4, align 4
+  %inc = add nsw i32 %j.0, 1
+  %cmp2 = icmp slt i32 %inc, 512
+  br i1 %cmp2, label %for.body3, label %for.inc5
+
+for.inc5:                                         ; preds = %for.body3
+  %inc6 = add nsw i32 %i.0, 1
+  %cmp = icmp slt i32 %inc6, 1024
+  br i1 %cmp, label %for.body3.lr.ph, label %for.end7
+
+for.end7:                                         ; preds = %for.inc5
+  ret void
+}
--- a/external/llvm-project/polly/test/ScheduleOptimizer/line-tiling.ll
+++ b/external/llvm-project/polly/test/ScheduleOptimizer/line-tiling.ll
@@ -0,0 +1,39 @@
+; RUN: opt %loadPolly -polly-opt-isl -analyze -polly-ast -polly-tile-sizes=64,1 < %s | FileCheck %s
+
+; CHECK: for (int c0 = 0; c0 <= 15; c0 += 1)
+; CHECK:   for (int c1 = 0; c1 <= 511; c1 += 1)
+; CHECK:     for (int c2 = 0; c2 <= 63; c2 += 1)
+; CHECK:       Stmt_for_body3(64 * c0 + c2, c1);
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+; Function Attrs: nounwind
+define void @line([512 x i32]* %A) {
+entry:
+  br label %entry.split
+
+entry.split:                                      ; preds = %entry
+  br label %for.body3.lr.ph
+
+for.body3.lr.ph:                                  ; preds = %for.inc5, %entry.split
+  %i.0 = phi i32 [ 0, %entry.split ], [ %inc6, %for.inc5 ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3.lr.ph, %for.body3
+  %j.0 = phi i32 [ 0, %for.body3.lr.ph ], [ %inc, %for.body3 ]
+  %mul = mul nsw i32 %j.0, %i.0
+  %rem = srem i32 %mul, 42
+  %arrayidx4 = getelementptr inbounds [512 x i32], [512 x i32]* %A, i32 %i.0, i32 %j.0
+  store i32 %rem, i32* %arrayidx4, align 4
+  %inc = add nsw i32 %j.0, 1
+  %cmp2 = icmp slt i32 %inc, 512
+  br i1 %cmp2, label %for.body3, label %for.inc5
+
+for.inc5:                                         ; preds = %for.body3
+  %inc6 = add nsw i32 %i.0, 1
+  %cmp = icmp slt i32 %inc6, 1024
+  br i1 %cmp, label %for.body3.lr.ph, label %for.end7
+
+for.end7:                                         ; preds = %for.inc5
+  ret void
+}
--- a/external/llvm-project/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll
+++ b/external/llvm-project/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll
@@ -0,0 +1,101 @@
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-target-throughput-vector-fma=1 \
+; RUN: -polly-target-latency-vector-fma=8 \
+; RUN: -polly-target-1st-cache-level-associativity=8 \
+; RUN: -polly-target-2nd-cache-level-associativity=8 \
+; RUN: -polly-target-1st-cache-level-size=32768 \
+; RUN: -polly-target-2nd-cache-level-size=262144 \
+; RUN: -polly-optimized-scops \
+; RUN: -polly-target-vector-register-bitwidth=256 \
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
+;
+;    /* C := alpha*A*B + beta*C */
+;    for (i = 0; i < _PB_NI; i++)
+;      for (j = 0; j < _PB_NJ; j++)
+;        {
+;	   C[i][j] *= beta;
+;	   for (k = 0; k < _PB_NK; ++k)
+;	     C[i][j] += alpha * A[i][k] * B[k][j];
+;        }
+;
+; CHECK:        double Packed_B[ { [] -> [(256)] } ][ { [] -> [(256)] } ][ { [] -> [(8)] } ];
+; CHECK-NEXT:        double Packed_A[ { [] -> [(24)] } ][ { [] -> [(256)] } ][ { [] -> [(4)] } ]; // Element size 8
+;
+; CHECK:                { Stmt_Copy_0[i0, i1, i2] -> MemRef_arg6[i0, i2] };
+; CHECK-NEXT:           new: { Stmt_Copy_0[i0, i1, i2] -> Packed_A[o0, o1, o2] : 256*floor((-i2 + o1)/256) = -i2 + o1 and 96*floor((-i0 + 4o0 + o2)/96) = -i0 + 4o0 + o2 and 0 <= o1 <= 255 and o2 >= 0 and -4o0 <= o2 <= 95 - 4o0 and o2 <= 3 };
+;
+; CHECK:                { Stmt_Copy_0[i0, i1, i2] -> MemRef_arg7[i2, i1] };
+; CHECK-NEXT:           new: { Stmt_Copy_0[i0, i1, i2] -> Packed_B[o0, o1, i1 - 8o0] : 256*floor((-i2 + o1)/256) = -i2 + o1 and -7 + i1 <= 8o0 <= i1 and 0 <= o1 <= 255 };
+;
+; CHECK:    	CopyStmt_0
+; CHECK-NEXT:            Domain :=
+; CHECK-NEXT:                { CopyStmt_0[i0, i1, i2] : 0 <= i0 <= 1055 and 0 <= i1 <= 1055 and 0 <= i2 <= 1023 };
+; CHECK-NEXT:            Schedule :=
+; CHECK-NEXT:                ;
+; CHECK-NEXT:            MustWriteAccess :=	[Reduction Type: NONE] [Scalar: 0]
+; CHECK-NEXT:                ;
+; CHECK-NEXT:           new: { CopyStmt_0[i0, i1, i2] -> Packed_B[o0, o1, i1 - 8o0] : 256*floor((-i2 + o1)/256) = -i2 + o1 and -7 + i1 <= 8o0 <= i1 and 0 <= o1 <= 255 };
+; CHECK-NEXT:            ReadAccess :=	[Reduction Type: NONE] [Scalar: 0]
+; CHECK-NEXT:                ;
+; CHECK-NEXT:           new: { CopyStmt_0[i0, i1, i2] -> MemRef_arg7[i2, i1] };
+; CHECK-NEXT:    	CopyStmt_1
+; CHECK-NEXT:            Domain :=
+; CHECK-NEXT:                { CopyStmt_1[i0, i1, i2] : 0 <= i0 <= 1055 and 0 <= i1 <= 1055 and 0 <= i2 <= 1023 };
+; CHECK-NEXT:            Schedule :=
+; CHECK-NEXT:                ;
+; CHECK-NEXT:            MustWriteAccess :=	[Reduction Type: NONE] [Scalar: 0]
+; CHECK-NEXT:                ;
+; CHECK-NEXT:           new: { CopyStmt_1[i0, i1, i2] -> Packed_A[o0, o1, o2] : 256*floor((-i2 + o1)/256) = -i2 + o1 and 96*floor((-i0 + 4o0 + o2)/96) = -i0 + 4o0 + o2 and 0 <= o1 <= 255 and o2 >= 0 and -4o0 <= o2 <= 95 - 4o0 and o2 <= 3 };
+; CHECK-NEXT:            ReadAccess :=	[Reduction Type: NONE] [Scalar: 0]
+; CHECK-NEXT:                ;
+; CHECK-NEXT:           new: { CopyStmt_1[i0, i1, i2] -> MemRef_arg6[i0, i2] };
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+define internal void @kernel_gemm(i32 %arg, i32 %arg1, i32 %arg2, double %arg3, double %arg4, [1056 x double]* %arg5, [1024 x double]* %arg6, [1056 x double]* %arg7) #0 {
+bb:
+  br label %bb8
+
+bb8:                                              ; preds = %bb29, %bb
+  %tmp = phi i64 [ 0, %bb ], [ %tmp30, %bb29 ]
+  br label %bb9
+
+bb9:                                              ; preds = %bb26, %bb8
+  %tmp10 = phi i64 [ 0, %bb8 ], [ %tmp27, %bb26 ]
+  %tmp11 = getelementptr inbounds [1056 x double], [1056 x double]* %arg5, i64 %tmp, i64 %tmp10
+  %tmp12 = load double, double* %tmp11, align 8
+  %tmp13 = fmul double %tmp12, %arg4
+  store double %tmp13, double* %tmp11, align 8
+  br label %Copy_0
+
+Copy_0:                                             ; preds = %Copy_0, %bb9
+  %tmp15 = phi i64 [ 0, %bb9 ], [ %tmp24, %Copy_0 ]
+  %tmp16 = getelementptr inbounds [1024 x double], [1024 x double]* %arg6, i64 %tmp, i64 %tmp15
+  %tmp17 = load double, double* %tmp16, align 8
+  %tmp18 = fmul double %tmp17, %arg3
+  %tmp19 = getelementptr inbounds [1056 x double], [1056 x double]* %arg7, i64 %tmp15, i64 %tmp10
+  %tmp20 = load double, double* %tmp19, align 8
+  %tmp21 = fmul double %tmp18, %tmp20
+  %tmp22 = load double, double* %tmp11, align 8
+  %tmp23 = fadd double %tmp22, %tmp21
+  store double %tmp23, double* %tmp11, align 8
+  %tmp24 = add nuw nsw i64 %tmp15, 1
+  %tmp25 = icmp ne i64 %tmp24, 1024
+  br i1 %tmp25, label %Copy_0, label %bb26
+
+bb26:                                             ; preds = %Copy_0
+  %tmp27 = add nuw nsw i64 %tmp10, 1
+  %tmp28 = icmp ne i64 %tmp27, 1056
+  br i1 %tmp28, label %bb9, label %bb29
+
+bb29:                                             ; preds = %bb26
+  %tmp30 = add nuw nsw i64 %tmp, 1
+  %tmp31 = icmp ne i64 %tmp30, 1056
+  br i1 %tmp31, label %bb8, label %bb32
+
+bb32:                                             ; preds = %bb29
+  ret void
+}
+
+attributes #0 = { nounwind uwtable "target-cpu"="x86-64" "target-features"="+aes,+avx,+cmov,+cx16,+fxsr,+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" }
--- a/external/llvm-project/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll
+++ b/external/llvm-project/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll
@@ -0,0 +1,134 @@
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-target-throughput-vector-fma=1 \
+; RUN: -polly-target-latency-vector-fma=8 \
+; RUN: -polly-target-1st-cache-level-associativity=8 \
+; RUN: -polly-target-2nd-cache-level-associativity=8 \
+; RUN: -polly-target-1st-cache-level-size=32768 \
+; RUN: -polly-target-2nd-cache-level-size=262144 -polly-ast \
+; RUN: -polly-target-vector-register-bitwidth=256 \
+; RUN:  -analyze < %s | FileCheck %s
+;
+;    /* C := alpha*A*B + beta*C */
+;    /* _PB_NK % Kc != 0 */
+;    for (i = 0; i < _PB_NI; i++)
+;      for (j = 0; j < _PB_NJ; j++)
+;        {
+;	   C[i][j] *= beta;
+;	   for (k = 0; k < _PB_NK; ++k)
+;	     C[i][j] += alpha * A[i][k] * B[k][j];
+;        }
+;
+; CHECK:    {
+; CHECK-NEXT:      // 1st level tiling - Tiles
+; CHECK-NEXT:      for (int c0 = 0; c0 <= 32; c0 += 1)
+; CHECK-NEXT:        for (int c1 = 0; c1 <= 32; c1 += 1) {
+; CHECK-NEXT:          // 1st level tiling - Points
+; CHECK-NEXT:          for (int c2 = 0; c2 <= 31; c2 += 1)
+; CHECK-NEXT:            for (int c3 = 0; c3 <= 31; c3 += 1)
+; CHECK-NEXT:              Stmt_bb9(32 * c0 + c2, 32 * c1 + c3);
+; CHECK-NEXT:        }
+; CHECK-NEXT:      // Inter iteration alias-free
+; CHECK-NEXT:      // 1st level tiling - Tiles
+; CHECK-NEXT:      for (int c1 = 0; c1 <= 3; c1 += 1) {
+; CHECK-NEXT:        for (int c3 = 0; c3 <= 1055; c3 += 1)
+; CHECK-NEXT:          for (int c4 = 256 * c1; c4 <= min(1022, 256 * c1 + 255); c4 += 1)
+; CHECK-NEXT:            CopyStmt_0(0, c3, c4);
+; CHECK-NEXT:        for (int c2 = 0; c2 <= 10; c2 += 1) {
+; CHECK-NEXT:          for (int c3 = 96 * c2; c3 <= 96 * c2 + 95; c3 += 1)
+; CHECK-NEXT:            for (int c5 = 256 * c1; c5 <= min(1022, 256 * c1 + 255); c5 += 1)
+; CHECK-NEXT:              CopyStmt_1(c3, 0, c5);
+; CHECK-NEXT:          // 1st level tiling - Points
+; CHECK-NEXT:          // Register tiling - Tiles
+; CHECK-NEXT:          for (int c3 = 0; c3 <= 131; c3 += 1)
+; CHECK-NEXT:            for (int c4 = 0; c4 <= 23; c4 += 1)
+; CHECK-NEXT:              for (int c5 = 0; c5 <= min(255, -256 * c1 + 1022); c5 += 1) {
+; CHECK-NEXT:                // Loop Vectorizer Disabled
+; CHECK-NEXT:                // Register tiling - Points
+; CHECK-NEXT:                {
+; CHECK-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3, 256 * c1 + c5);
+; CHECK-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 1, 256 * c1 + c5);
+; CHECK-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 2, 256 * c1 + c5);
+; CHECK-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 3, 256 * c1 + c5);
+; CHECK-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 4, 256 * c1 + c5);
+; CHECK-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 5, 256 * c1 + c5);
+; CHECK-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 6, 256 * c1 + c5);
+; CHECK-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 7, 256 * c1 + c5);
+; CHECK-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3, 256 * c1 + c5);
+; CHECK-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 1, 256 * c1 + c5);
+; CHECK-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 2, 256 * c1 + c5);
+; CHECK-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 3, 256 * c1 + c5);
+; CHECK-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 4, 256 * c1 + c5);
+; CHECK-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 5, 256 * c1 + c5);
+; CHECK-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 6, 256 * c1 + c5);
+; CHECK-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 7, 256 * c1 + c5);
+; CHECK-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3, 256 * c1 + c5);
+; CHECK-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 1, 256 * c1 + c5);
+; CHECK-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 2, 256 * c1 + c5);
+; CHECK-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 3, 256 * c1 + c5);
+; CHECK-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 4, 256 * c1 + c5);
+; CHECK-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 5, 256 * c1 + c5);
+; CHECK-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 6, 256 * c1 + c5);
+; CHECK-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 7, 256 * c1 + c5);
+; CHECK-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3, 256 * c1 + c5);
+; CHECK-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 1, 256 * c1 + c5);
+; CHECK-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 2, 256 * c1 + c5);
+; CHECK-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 3, 256 * c1 + c5);
+; CHECK-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 4, 256 * c1 + c5);
+; CHECK-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 5, 256 * c1 + c5);
+; CHECK-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 6, 256 * c1 + c5);
+; CHECK-NEXT:                  Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 7, 256 * c1 + c5);
+; CHECK-NEXT:                }
+; CHECK-NEXT:              }
+; CHECK-NEXT:        }
+; CHECK-NEXT:      }
+; CHECK-NEXT:    }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+define internal void @kernel_gemm(i32 %arg, i32 %arg1, i32 %arg2, double %arg3, double %arg4, [1056 x double]* %arg5, [1023 x double]* %arg6, [1056 x double]* %arg7) #0 {
+bb:
+  br label %bb8
+
+bb8:                                              ; preds = %bb29, %bb
+  %tmp = phi i64 [ 0, %bb ], [ %tmp30, %bb29 ]
+  br label %bb9
+
+bb9:                                              ; preds = %bb26, %bb8
+  %tmp10 = phi i64 [ 0, %bb8 ], [ %tmp27, %bb26 ]
+  %tmp11 = getelementptr inbounds [1056 x double], [1056 x double]* %arg5, i64 %tmp, i64 %tmp10
+  %tmp12 = load double, double* %tmp11, align 8
+  %tmp13 = fmul double %tmp12, %arg4
+  store double %tmp13, double* %tmp11, align 8
+  br label %Copy_0
+
+Copy_0:                                             ; preds = %Copy_0, %bb9
+  %tmp15 = phi i64 [ 0, %bb9 ], [ %tmp24, %Copy_0 ]
+  %tmp16 = getelementptr inbounds [1023 x double], [1023 x double]* %arg6, i64 %tmp, i64 %tmp15
+  %tmp17 = load double, double* %tmp16, align 8
+  %tmp18 = fmul double %tmp17, %arg3
+  %tmp19 = getelementptr inbounds [1056 x double], [1056 x double]* %arg7, i64 %tmp15, i64 %tmp10
+  %tmp20 = load double, double* %tmp19, align 8
+  %tmp21 = fmul double %tmp18, %tmp20
+  %tmp22 = load double, double* %tmp11, align 8
+  %tmp23 = fadd double %tmp22, %tmp21
+  store double %tmp23, double* %tmp11, align 8
+  %tmp24 = add nuw nsw i64 %tmp15, 1
+  %tmp25 = icmp ne i64 %tmp24, 1023
+  br i1 %tmp25, label %Copy_0, label %bb26
+
+bb26:                                             ; preds = %Copy_0
+  %tmp27 = add nuw nsw i64 %tmp10, 1
+  %tmp28 = icmp ne i64 %tmp27, 1056
+  br i1 %tmp28, label %bb9, label %bb29
+
+bb29:                                             ; preds = %bb26
+  %tmp30 = add nuw nsw i64 %tmp, 1
+  %tmp31 = icmp ne i64 %tmp30, 1056
+  br i1 %tmp31, label %bb8, label %bb32
+
+bb32:                                             ; preds = %bb29
+  ret void
+}
+
+attributes #0 = { nounwind uwtable "target-cpu"="x86-64" "target-features"="+aes,+avx,+cmov,+cx16,+fxsr,+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" }
--- a/external/llvm-project/polly/test/ScheduleOptimizer/one-dimensional-band.ll
+++ b/external/llvm-project/polly/test/ScheduleOptimizer/one-dimensional-band.ll
@@ -0,0 +1,107 @@
+; RUN: opt %loadPolly -polly-opt-isl -polly-ast -analyze < %s | FileCheck %s
+;
+;    void jacobi1d(long T, long N, float *A, float *B) {
+;      long t, i, j;
+;      for (t = 0; t < T; t++) {
+;        for (i = 1; i < N - 1; i++)
+;          B[i] = 0.33333 * (A[i - 1] + A[i] + A[i + 1]);
+;        for (j = 1; j < N - 1; j++)
+;          A[j] = 0.33333 * (B[i - 1] + B[i] + B[i + 1]);
+;      }
+;    }
+
+; Verify that we do not tile bands that have just a single dimension.
+
+; CHECK: for (int c0 = 0; c0 < T; c0 += 1) {
+; CHECK:   for (int c1 = 0; c1 < N - 2; c1 += 1)
+; CHECK:     Stmt_for_body3(c0, c1);
+; CHECK:   for (int c1 = 0; c1 < N - 2; c1 += 1)
+; CHECK:     Stmt_for_body15(c0, c1);
+; CHECK: }
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @jacobi1d(i64 %T, i64 %N, float* %A, float* %B) {
+entry:
+  %tmp = add i64 %N, -1
+  %tmp1 = icmp sgt i64 %tmp, 1
+  %smax = select i1 %tmp1, i64 %tmp, i64 1
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc30, %entry
+  %t.0 = phi i64 [ 0, %entry ], [ %inc31, %for.inc30 ]
+  %cmp = icmp slt i64 %t.0, %T
+  br i1 %cmp, label %for.body, label %for.end32
+
+for.body:                                         ; preds = %for.cond
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %for.inc, %for.body
+  %i.0 = phi i64 [ 1, %for.body ], [ %inc, %for.inc ]
+  %sub = add nsw i64 %N, -1
+  %cmp2 = icmp slt i64 %i.0, %sub
+  br i1 %cmp2, label %for.body3, label %for.end
+
+for.body3:                                        ; preds = %for.cond1
+  %sub4 = add nsw i64 %i.0, -1
+  %arrayidx = getelementptr inbounds float, float* %A, i64 %sub4
+  %tmp2 = load float, float* %arrayidx, align 4
+  %arrayidx5 = getelementptr inbounds float, float* %A, i64 %i.0
+  %tmp3 = load float, float* %arrayidx5, align 4
+  %add = fadd float %tmp2, %tmp3
+  %add6 = add nuw nsw i64 %i.0, 1
+  %arrayidx7 = getelementptr inbounds float, float* %A, i64 %add6
+  %tmp4 = load float, float* %arrayidx7, align 4
+  %add8 = fadd float %add, %tmp4
+  %conv = fpext float %add8 to double
+  %mul = fmul double %conv, 3.333300e-01
+  %conv9 = fptrunc double %mul to float
+  %arrayidx10 = getelementptr inbounds float, float* %B, i64 %i.0
+  store float %conv9, float* %arrayidx10, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body3
+  %inc = add nuw nsw i64 %i.0, 1
+  br label %for.cond1
+
+for.end:                                          ; preds = %for.cond1
+  br label %for.cond11
+
+for.cond11:                                       ; preds = %for.inc27, %for.end
+  %j.0 = phi i64 [ 1, %for.end ], [ %inc28, %for.inc27 ]
+  %sub12 = add nsw i64 %N, -1
+  %cmp13 = icmp slt i64 %j.0, %sub12
+  br i1 %cmp13, label %for.body15, label %for.end29
+
+for.body15:                                       ; preds = %for.cond11
+  %sub16 = add nsw i64 %smax, -1
+  %arrayidx17 = getelementptr inbounds float, float* %B, i64 %sub16
+  %tmp5 = load float, float* %arrayidx17, align 4
+  %arrayidx18 = getelementptr inbounds float, float* %B, i64 %smax
+  %tmp6 = load float, float* %arrayidx18, align 4
+  %add19 = fadd float %tmp5, %tmp6
+  %add20 = add nsw i64 %smax, 1
+  %arrayidx21 = getelementptr inbounds float, float* %B, i64 %add20
+  %tmp7 = load float, float* %arrayidx21, align 4
+  %add22 = fadd float %add19, %tmp7
+  %conv23 = fpext float %add22 to double
+  %mul24 = fmul double %conv23, 3.333300e-01
+  %conv25 = fptrunc double %mul24 to float
+  %arrayidx26 = getelementptr inbounds float, float* %A, i64 %j.0
+  store float %conv25, float* %arrayidx26, align 4
+  br label %for.inc27
+
+for.inc27:                                        ; preds = %for.body15
+  %inc28 = add nuw nsw i64 %j.0, 1
+  br label %for.cond11
+
+for.end29:                                        ; preds = %for.cond11
+  br label %for.inc30
+
+for.inc30:                                        ; preds = %for.end29
+  %inc31 = add nuw nsw i64 %t.0, 1
+  br label %for.cond
+
+for.end32:                                        ; preds = %for.cond
+  ret void
+}
--- a/external/llvm-project/polly/test/ScheduleOptimizer/outer_coincidence.ll
+++ b/external/llvm-project/polly/test/ScheduleOptimizer/outer_coincidence.ll
@@ -0,0 +1,69 @@
+; RUN: opt %loadPolly -polly-opt-isl -polly-ast -polly-tiling=0 -polly-parallel -polly-opt-outer-coincidence=no -analyze < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-opt-isl -polly-ast -polly-tiling=0 -polly-parallel -polly-opt-outer-coincidence=yes -analyze < %s | FileCheck %s --check-prefix=OUTER
+
+; By skewing, the diagonal can be made parallel. ISL does this when the Check
+; the 'outer_coincidence' option is enabled.
+;
+; void func(int m, int n, float A[static const restrict m][n]) {
+;  for (int i = 1; i < m; i+=1)
+;    for (int j = 1; j < n; j+=1)
+;      A[i][j] = A[i-1][j] + A[i][j-1];
+;}
+
+define void @func(i64 %m, i64 %n, float* noalias nonnull %A) #0 {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc11, %entry
+  %i.0 = phi i64 [ 1, %entry ], [ %add12, %for.inc11 ]
+  %cmp = icmp slt i64 %i.0, %m
+  br i1 %cmp, label %for.cond1.preheader, label %for.end13
+
+for.cond1.preheader:                              ; preds = %for.cond
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %for.cond1.preheader, %for.body3
+  %j.0 = phi i64 [ %add10, %for.body3 ], [ 1, %for.cond1.preheader ]
+  %cmp2 = icmp slt i64 %j.0, %n
+  br i1 %cmp2, label %for.body3, label %for.inc11
+
+for.body3:                                        ; preds = %for.cond1
+  %sub = add nsw i64 %i.0, -1
+  %tmp = mul nsw i64 %sub, %n
+  %arrayidx = getelementptr inbounds float, float* %A, i64 %tmp
+  %arrayidx4 = getelementptr inbounds float, float* %arrayidx, i64 %j.0
+  %tmp13 = load float, float* %arrayidx4, align 4
+  %sub5 = add nsw i64 %j.0, -1
+  %tmp14 = mul nsw i64 %i.0, %n
+  %arrayidx6 = getelementptr inbounds float, float* %A, i64 %tmp14
+  %arrayidx7 = getelementptr inbounds float, float* %arrayidx6, i64 %sub5
+  %tmp15 = load float, float* %arrayidx7, align 4
+  %add = fadd float %tmp13, %tmp15
+  %tmp16 = mul nsw i64 %i.0, %n
+  %arrayidx8 = getelementptr inbounds float, float* %A, i64 %tmp16
+  %arrayidx9 = getelementptr inbounds float, float* %arrayidx8, i64 %j.0
+  store float %add, float* %arrayidx9, align 4
+  %add10 = add nuw nsw i64 %j.0, 1
+  br label %for.cond1
+
+for.inc11:                                        ; preds = %for.cond1
+  %add12 = add nuw nsw i64 %i.0, 1
+  br label %for.cond
+
+for.end13:                                        ; preds = %for.cond
+  ret void
+}
+
+
+; CHECK:      #pragma minimal dependence distance: 1
+; CHECK-NEXT: for (int c0 = 0; c0 < m - 1; c0 += 1)
+; CHECK-NEXT:   #pragma minimal dependence distance: 1
+; CHECK-NEXT:   for (int c1 = 0; c1 < n - 1; c1 += 1)
+; CHECK-NEXT:     Stmt_for_body3(c0, c1);
+
+; OUTER:      #pragma minimal dependence distance: 1
+; OUTER-NEXT: for (int c0 = 0; c0 < m + n - 3; c0 += 1)
+; OUTER-NEXT:   #pragma simd
+; OUTER-NEXT:   #pragma known-parallel
+; OUTER-NEXT:   for (int c1 = max(0, -n + c0 + 2); c1 <= min(m - 2, c0); c1 += 1)
+; OUTER-NEXT:     Stmt_for_body3(c1, c0 - c1);
--- a/external/llvm-project/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
+++ b/external/llvm-project/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
@@ -0,0 +1,76 @@
+; RUN: opt %loadPolly \
+; RUN: -polly-pattern-matching-based-opts=true \
+; RUN: -polly-optree -polly-delicm -polly-simplify \
+; RUN: -polly-opt-isl -debug < %s 2>&1 \
+; RUN: | FileCheck %s
+; REQUIRES: asserts
+
+; Check that the pattern matching detects the matrix multiplication pattern
+; after a full run of -polly-optree and -polly-delicm, where the write access
+; is not through the original memory access, but trough a PHI node that was
+; delicmed. This test covers the polybench 2mm and 3mm cases.
+;
+; CHECK: The matrix multiplication pattern was detected
+;
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: norecurse nounwind uwtable
+define void @kernel_2mm(i32 %ni, i32 %nj, i32 %nk, i32 %nl, double %alpha, double %beta, [1800 x double]* nocapture %tmp, [2200 x double]* nocapture readonly %A, [1800 x double]* nocapture readonly %B, [2400 x double]* nocapture readnone %C, [2400 x double]* nocapture readnone %D) local_unnamed_addr #0 {
+entry:
+  br label %entry.split
+
+entry.split:                                      ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc25, %entry.split
+  %indvars.iv50 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next51, %for.inc25 ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.inc22, %for.body
+  %indvars.iv46 = phi i64 [ 0, %for.body ], [ %indvars.iv.next47, %for.inc22 ]
+  %arrayidx5 = getelementptr inbounds [1800 x double], [1800 x double]* %tmp, i64 %indvars.iv50, i64 %indvars.iv46
+  store double 0.000000e+00, double* %arrayidx5, align 8, !tbaa !2
+  br label %for.body8
+
+for.body8:                                        ; preds = %for.body8, %for.body3
+  %0 = phi double [ 0.000000e+00, %for.body3 ], [ %add, %for.body8 ]
+  %indvars.iv = phi i64 [ 0, %for.body3 ], [ %indvars.iv.next, %for.body8 ]
+  %arrayidx12 = getelementptr inbounds [2200 x double], [2200 x double]* %A, i64 %indvars.iv50, i64 %indvars.iv
+  %1 = load double, double* %arrayidx12, align 8, !tbaa !2
+  %mul = fmul double %1, %alpha
+  %arrayidx16 = getelementptr inbounds [1800 x double], [1800 x double]* %B, i64 %indvars.iv, i64 %indvars.iv46
+  %2 = load double, double* %arrayidx16, align 8, !tbaa !2
+  %mul17 = fmul double %mul, %2
+  %add = fadd double %0, %mul17
+  store double %add, double* %arrayidx5, align 8, !tbaa !2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 2200
+  br i1 %exitcond, label %for.inc22, label %for.body8
+
+for.inc22:                                        ; preds = %for.body8
+  %indvars.iv.next47 = add nuw nsw i64 %indvars.iv46, 1
+  %exitcond48 = icmp eq i64 %indvars.iv.next47, 1800
+  br i1 %exitcond48, label %for.inc25, label %for.body3
+
+for.inc25:                                        ; preds = %for.inc22
+  %indvars.iv.next51 = add nuw nsw i64 %indvars.iv50, 1
+  %exitcond52 = icmp eq i64 %indvars.iv.next51, 1600
+  br i1 %exitcond52, label %for.end27, label %for.body
+
+for.end27:                                        ; preds = %for.inc25
+  ret void
+}
+
+attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="haswell" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cmov,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-adx,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vl,-avx512vpopcntdq,-clflushopt,-clwb,-clzero,-fma4,-lwp,-mwaitx,-pku,-prefetchwt1,-prfchw,-rdseed,-rtm,-sgx,-sha,-sse4a,-tbm,-xop,-xsavec,-xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 6.0.0 (trunk 309912) (llvm/trunk 309933)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"double", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
--- a/external/llvm-project/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
+++ b/external/llvm-project/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
@@ -0,0 +1,66 @@
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=false \
+; RUN: -debug < %s 2>&1| FileCheck %s
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -debug < %s 2>&1| FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -stats -disable-output < %s 2>&1| FileCheck %s --check-prefix=STATS -match-full-lines
+; REQUIRES: asserts
+;
+;    /* C := alpha*A*B + beta*C */
+;    for (i = 0; i < _PB_NI; i++)
+;      for (j = 0; j < _PB_NJ; j++)
+;        {
+;	   C[i][j] *= beta;
+;	   for (k = 0; k < _PB_NK; ++k)
+;	     C[i][j] += alpha * A[i][k] * B[k][j];
+;        }
+;
+; CHECK-NOT: The matrix multiplication pattern was detected
+; PATTERN-MATCHING-OPTS: The matrix multiplication pattern was detected
+; STATS:  1 polly-opt-isl    - Number of matrix multiplication patterns detected and optimized
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+define internal void @kernel_gemm(i32 %arg, i32 %arg1, i32 %arg2, double %arg3, double %arg4, [1056 x double]* %arg5, [1024 x double]* %arg6, [1056 x double]* %arg7) #0 {
+bb:
+  br label %bb8
+
+bb8:                                              ; preds = %bb29, %bb
+  %tmp = phi i64 [ 0, %bb ], [ %tmp30, %bb29 ]
+  br label %bb9
+
+bb9:                                              ; preds = %bb26, %bb8
+  %tmp10 = phi i64 [ 0, %bb8 ], [ %tmp27, %bb26 ]
+  %tmp11 = getelementptr inbounds [1056 x double], [1056 x double]* %arg5, i64 %tmp, i64 %tmp10
+  %tmp12 = load double, double* %tmp11, align 8
+  %tmp13 = fmul double %tmp12, %arg4
+  store double %tmp13, double* %tmp11, align 8
+  br label %Copy_0
+
+Copy_0:                                             ; preds = %Copy_0, %bb9
+  %tmp15 = phi i64 [ 0, %bb9 ], [ %tmp24, %Copy_0 ]
+  %tmp16 = getelementptr inbounds [1024 x double], [1024 x double]* %arg6, i64 %tmp, i64 %tmp15
+  %tmp17 = load double, double* %tmp16, align 8
+  %tmp18 = fmul double %tmp17, %arg3
+  %tmp19 = getelementptr inbounds [1056 x double], [1056 x double]* %arg7, i64 %tmp15, i64 %tmp10
+  %tmp20 = load double, double* %tmp19, align 8
+  %tmp21 = fmul double %tmp18, %tmp20
+  %tmp22 = load double, double* %tmp11, align 8
+  %tmp23 = fadd double %tmp22, %tmp21
+  store double %tmp23, double* %tmp11, align 8
+  %tmp24 = add nuw nsw i64 %tmp15, 1
+  %tmp25 = icmp ne i64 %tmp24, 1024
+  br i1 %tmp25, label %Copy_0, label %bb26
+
+bb26:                                             ; preds = %Copy_0
+  %tmp27 = add nuw nsw i64 %tmp10, 1
+  %tmp28 = icmp ne i64 %tmp27, 1056
+  br i1 %tmp28, label %bb9, label %bb29
+
+bb29:                                             ; preds = %bb26
+  %tmp30 = add nuw nsw i64 %tmp, 1
+  %tmp31 = icmp ne i64 %tmp30, 1056
+  br i1 %tmp31, label %bb8, label %bb32
+
+bb32:                                             ; preds = %bb29
+  ret void
+}
--- a/external/llvm-project/polly/test/ScheduleOptimizer/pattern-matching-based-opts_10.ll
+++ b/external/llvm-project/polly/test/ScheduleOptimizer/pattern-matching-based-opts_10.ll
@@ -0,0 +1,69 @@
+; RUN: opt %loadPolly -polly-opt-isl -polly-invariant-load-hoisting=true \
+; RUN: -polly-pattern-matching-based-opts=true \
+; RUN: -polly-target-throughput-vector-fma=1 \
+; RUN: -polly-target-latency-vector-fma=1 \
+; RUN: -polly-codegen -polly-target-1st-cache-level-associativity=8 \
+; RUN: -polly-target-2nd-cache-level-associativity=8 \
+; RUN: -polly-target-1st-cache-level-size=32768 \
+; RUN: -polly-target-vector-register-bitwidth=256 \
+; RUN: -polly-target-2nd-cache-level-size=262144 -S < %s \
+; RUN: | FileCheck %s
+;
+; This test case checks whether Polly generates second level alias metadata
+; to distinguish the specific accesses in case of the ublas gemm kernel.
+;
+; CHECK: !11 = distinct !{!11, !0, !"second level alias metadata"}
+; CHECK: !12 = distinct !{!12, !0, !"second level alias metadata"}
+; CHECK: !13 = !{!3, !4, !5, !6, !11}
+; CHECK: !14 = distinct !{!14, !0, !"second level alias metadata"}
+; CHECK: !15 = !{!3, !4, !5, !6, !11, !12}
+; CHECK: !16 = distinct !{!16, !0, !"second level alias metadata"}
+; CHECK: !17 = !{!3, !4, !5, !6, !11, !12, !14}
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+define internal void @kernel_gemm(i32 %arg, i32 %arg1, i32 %arg2, double %arg3, double %arg4, [1056 x double]* %arg5, [1024 x double]* %arg6, [1056 x double]* %arg7) {
+bb:
+  br label %bb8
+
+bb8:                                              ; preds = %bb29, %bb
+  %tmp = phi i64 [ 0, %bb ], [ %tmp30, %bb29 ]
+  br label %bb9
+
+bb9:                                              ; preds = %bb26, %bb8
+  %tmp10 = phi i64 [ 0, %bb8 ], [ %tmp27, %bb26 ]
+  %tmp11 = getelementptr inbounds [1056 x double], [1056 x double]* %arg5, i64 %tmp, i64 %tmp10
+  %tmp12 = load double, double* %tmp11, align 8
+  %tmp13 = fmul double %tmp12, %arg4
+  store double %tmp13, double* %tmp11, align 8
+  br label %Copy_0
+
+Copy_0:                                             ; preds = %Copy_0, %bb9
+  %tmp15 = phi i64 [ 0, %bb9 ], [ %tmp24, %Copy_0 ]
+  %tmp16 = getelementptr inbounds [1024 x double], [1024 x double]* %arg6, i64 %tmp, i64 %tmp15
+  %tmp17 = load double, double* %tmp16, align 8
+  %tmp18 = fmul double %tmp17, %arg3
+  %tmp19 = getelementptr inbounds [1056 x double], [1056 x double]* %arg7, i64 %tmp15, i64 %tmp10
+  %tmp20 = load double, double* %tmp19, align 8
+  %tmp21 = fmul double %tmp18, %tmp20
+  %tmp22 = load double, double* %tmp11, align 8
+  %tmp23 = fadd double %tmp22, %tmp21
+  store double %tmp23, double* %tmp11, align 8
+  %tmp24 = add nuw nsw i64 %tmp15, 1
+  %tmp25 = icmp ne i64 %tmp24, 1024
+  br i1 %tmp25, label %Copy_0, label %bb26
+
+bb26:                                             ; preds = %Copy_0
+  %tmp27 = add nuw nsw i64 %tmp10, 1
+  %tmp28 = icmp ne i64 %tmp27, 1056
+  br i1 %tmp28, label %bb9, label %bb29
+
+bb29:                                             ; preds = %bb26
+  %tmp30 = add nuw nsw i64 %tmp, 1
+  %tmp31 = icmp ne i64 %tmp30, 1056
+  br i1 %tmp31, label %bb8, label %bb32
+
+bb32:                                             ; preds = %bb29
+  ret void
+}
--- a/external/llvm-project/polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll
+++ b/external/llvm-project/polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll
@@ -0,0 +1,64 @@
+; RUN: opt %loadPolly -polly-import-jscop \
+; RUN: -polly-import-jscop-postfix=transformed \
+; RUN: -polly-pattern-matching-based-opts=true \
+; RUN: -polly-target-throughput-vector-fma=1 \
+; RUN: -polly-target-latency-vector-fma=8 \
+; RUN: -polly-target-1st-cache-level-associativity=8 \
+; RUN: -polly-target-2nd-cache-level-associativity=8 \
+; RUN: -polly-target-1st-cache-level-size=32768 \
+; RUN: -polly-target-vector-register-bitwidth=256 \
+; RUN: -polly-target-2nd-cache-level-size=262144 \
+; RUN: -polly-opt-isl -debug < %s 2>&1 \
+; RUN: | FileCheck %s
+; REQUIRES: asserts
+;
+; Check that the pattern matching detects the matrix multiplication pattern
+; in case scalar memory accesses were replaced by accesses to newly created
+; arrays.
+;
+; CHECK: The matrix multiplication pattern was detected
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+define void @kernel_gemm(i32 %ni, i32 %nj, i32 %nk, double %A, [1024 x double]* %B, [1024 x double]* %C) {
+entry:
+  br label %entry.split
+
+entry.split:                                      ; preds = %entry
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc16, %entry.split
+  %indvars.iv35 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next36, %for.inc16 ]
+  br label %for.cond4.preheader
+
+for.cond4.preheader:                              ; preds = %for.inc13, %for.cond1.preheader
+  %indvars.iv32 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next33, %for.inc13 ]
+  br label %for.body6
+
+for.body6:                                        ; preds = %for.body6, %for.cond4.preheader
+  %indvars.iv = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next, %for.body6 ]
+  %arrayidx8 = getelementptr inbounds [1024 x double], [1024 x double]* %B, i64 %indvars.iv, i64 %indvars.iv32
+  %tmp = load double, double* %arrayidx8, align 8
+  %mul = fmul double %tmp, %A
+  %arrayidx12 = getelementptr inbounds [1024 x double], [1024 x double]* %C, i64 %indvars.iv35, i64 %indvars.iv32
+  %tmp1 = load double, double* %arrayidx12, align 8
+  %add = fadd double %tmp1, %mul
+  store double %add, double* %arrayidx12, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.body6, label %for.inc13
+
+for.inc13:                                        ; preds = %for.body6
+  %indvars.iv.next33 = add nuw nsw i64 %indvars.iv32, 1
+  %exitcond34 = icmp ne i64 %indvars.iv.next33, 1024
+  br i1 %exitcond34, label %for.cond4.preheader, label %for.inc16
+
+for.inc16:                                        ; preds = %for.inc13
+  %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1
+  %exitcond37 = icmp ne i64 %indvars.iv.next36, 1024
+  br i1 %exitcond37, label %for.cond1.preheader, label %for.end18
+
+for.end18:                                        ; preds = %for.inc16
+  ret void
+}
--- a/external/llvm-project/polly/test/ScheduleOptimizer/pattern-matching-based-opts_12.ll
+++ b/external/llvm-project/polly/test/ScheduleOptimizer/pattern-matching-based-opts_12.ll
--- a/Show More
+++ b/Show More