commit b084638f15 (parent e19d552987)

    Imported Upstream version 5.18.0.167

    Former-commit-id: 289509151e0fee68a1b591a20c9f109c3c789d3a
@@ -1,22 +0,0 @@
; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic < %s | FileCheck %s
; RUN: opt -S -slp-vectorizer -mtriple=aarch64-apple-ios -mcpu=cyclone < %s | FileCheck %s
; Currently disabled for a few subtargets (e.g. Kryo):
; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=kryo < %s | FileCheck --check-prefix=NO_SLP %s
; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic -slp-min-reg-size=128 < %s | FileCheck --check-prefix=NO_SLP %s

define void @f(float* %r, float* %w) {
  %r0 = getelementptr inbounds float, float* %r, i64 0
  %r1 = getelementptr inbounds float, float* %r, i64 1
  %f0 = load float, float* %r0
  %f1 = load float, float* %r1
  %add0 = fadd float %f0, %f0
; CHECK: fadd <2 x float>
; NO_SLP: fadd float
; NO_SLP: fadd float
  %add1 = fadd float %f1, %f1
  %w0 = getelementptr inbounds float, float* %w, i64 0
  %w1 = getelementptr inbounds float, float* %w, i64 1
  store float %add0, float* %w0
  store float %add1, float* %w1
  ret void
}
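Editorial note, for orientation only (not part of the deleted file): a rough C analogue of @f. The CHECK prefix expects the two scalar fadds to fuse into one <2 x float> fadd on generic and cyclone, while NO_SLP pins the scalar form for kryo and for -slp-min-reg-size=128. The function name is hypothetical.

/* Hypothetical C equivalent of @f above; illustration only. */
void f_c(const float *r, float *w) {
  w[0] = r[0] + r[0]; /* %add0: pairs with the lane below */
  w[1] = r[1] + r[1]; /* %add1 */
}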
@@ -1,75 +0,0 @@
; RUN: opt -S -slp-vectorizer %s -slp-threshold=-10 | FileCheck %s
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnu"

%structA = type { [2 x float] }

define void @test1(%structA* nocapture readonly %J, i32 %xmin, i32 %ymin) {
; CHECK-LABEL: test1
; CHECK: %arrayidx4 = getelementptr inbounds %structA, %structA* %J, i64 0, i32 0, i64 0
; CHECK: %arrayidx9 = getelementptr inbounds %structA, %structA* %J, i64 0, i32 0, i64 1
; CHECK: %3 = bitcast float* %arrayidx4 to <2 x float>*
; CHECK: %4 = load <2 x float>, <2 x float>* %3, align 4
; CHECK: %5 = fsub fast <2 x float> %2, %4
; CHECK: %6 = fmul fast <2 x float> %5, %5
; CHECK: %7 = extractelement <2 x float> %6, i32 0
; CHECK: %8 = extractelement <2 x float> %6, i32 1
; CHECK: %add = fadd fast float %7, %8
; CHECK: %cmp = fcmp oeq float %add, 0.000000e+00

entry:
  br label %for.body3.lr.ph

for.body3.lr.ph:
  %conv5 = sitofp i32 %ymin to float
  %conv = sitofp i32 %xmin to float
  %arrayidx4 = getelementptr inbounds %structA, %structA* %J, i64 0, i32 0, i64 0
  %0 = load float, float* %arrayidx4, align 4
  %sub = fsub fast float %conv, %0
  %arrayidx9 = getelementptr inbounds %structA, %structA* %J, i64 0, i32 0, i64 1
  %1 = load float, float* %arrayidx9, align 4
  %sub10 = fsub fast float %conv5, %1
  %mul11 = fmul fast float %sub, %sub
  %mul12 = fmul fast float %sub10, %sub10
  %add = fadd fast float %mul11, %mul12
  %cmp = fcmp oeq float %add, 0.000000e+00
  br i1 %cmp, label %for.body3.lr.ph, label %for.end27

for.end27:
  ret void
}

define void @test2(%structA* nocapture readonly %J, i32 %xmin, i32 %ymin) {
; CHECK-LABEL: test2
; CHECK: %arrayidx4 = getelementptr inbounds %structA, %structA* %J, i64 0, i32 0, i64 0
; CHECK: %arrayidx9 = getelementptr inbounds %structA, %structA* %J, i64 0, i32 0, i64 1
; CHECK: %3 = bitcast float* %arrayidx4 to <2 x float>*
; CHECK: %4 = load <2 x float>, <2 x float>* %3, align 4
; CHECK: %5 = fsub fast <2 x float> %2, %4
; CHECK: %6 = fmul fast <2 x float> %5, %5
; CHECK: %7 = extractelement <2 x float> %6, i32 0
; CHECK: %8 = extractelement <2 x float> %6, i32 1
; CHECK: %add = fadd fast float %8, %7
; CHECK: %cmp = fcmp oeq float %add, 0.000000e+00

entry:
  br label %for.body3.lr.ph

for.body3.lr.ph:
  %conv5 = sitofp i32 %ymin to float
  %conv = sitofp i32 %xmin to float
  %arrayidx4 = getelementptr inbounds %structA, %structA* %J, i64 0, i32 0, i64 0
  %0 = load float, float* %arrayidx4, align 4
  %sub = fsub fast float %conv, %0
  %arrayidx9 = getelementptr inbounds %structA, %structA* %J, i64 0, i32 0, i64 1
  %1 = load float, float* %arrayidx9, align 4
  %sub10 = fsub fast float %conv5, %1
  %mul11 = fmul fast float %sub, %sub
  %mul12 = fmul fast float %sub10, %sub10
  %add = fadd fast float %mul12, %mul11 ; <-- operands commuted!
  %cmp = fcmp oeq float %add, 0.000000e+00
  br i1 %cmp, label %for.body3.lr.ph, label %for.end27

for.end27:
  ret void
}
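Editorial note: test1 and test2 compute the same squared-distance expression; test2 only commutes the final fadd operands, and the CHECK lines verify SLP still forms the <2 x float> fsub/fmul either way. A hedged C sketch (names are illustrative, not from the source):

/* Illustrative C analogue of the test1/test2 loop body. */
struct A { float v[2]; };
int body(const struct A *J, int xmin, int ymin) {
  float dx = (float)xmin - J->v[0];
  float dy = (float)ymin - J->v[1];
  /* test2 sums dy*dy + dx*dx instead; fadd is commutative,
     so SLP must vectorize both orderings identically. */
  return (dx * dx + dy * dy) == 0.0f;
}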
@@ -1,258 +0,0 @@
; RUN: opt -S -slp-vectorizer -dce -instcombine < %s | FileCheck %s --check-prefix=GENERIC
; RUN: opt -S -mcpu=kryo -slp-vectorizer -dce -instcombine < %s | FileCheck %s --check-prefix=KRYO

target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnu"

; These tests check that we vectorize the index calculations in the
; gather-reduce pattern shown below. We check cases having i32 and i64
; subtraction.
;
; int gather_reduce_8x16(short *a, short *b, short *g, int n) {
;   int sum = 0;
;   for (int i = 0; i < n ; ++i) {
;     sum += g[*a++ - b[0]]; sum += g[*a++ - b[1]];
;     sum += g[*a++ - b[2]]; sum += g[*a++ - b[3]];
;     sum += g[*a++ - b[4]]; sum += g[*a++ - b[5]];
;     sum += g[*a++ - b[6]]; sum += g[*a++ - b[7]];
;   }
;   return sum;
; }

; GENERIC-LABEL: @gather_reduce_8x16_i32
;
; GENERIC: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>
; GENERIC: zext <8 x i16> [[L]] to <8 x i32>
; GENERIC: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>
; GENERIC: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
; GENERIC: sext i32 [[X]] to i64
;
define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
entry:
  %cmp.99 = icmp sgt i32 %n, 0
  br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:
  br label %for.body

for.cond.cleanup.loopexit:
  br label %for.cond.cleanup

for.cond.cleanup:
  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add66, %for.cond.cleanup.loopexit ]
  ret i32 %sum.0.lcssa

for.body:
  %i.0103 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %sum.0102 = phi i32 [ %add66, %for.body ], [ 0, %for.body.preheader ]
  %a.addr.0101 = phi i16* [ %incdec.ptr58, %for.body ], [ %a, %for.body.preheader ]
  %incdec.ptr = getelementptr inbounds i16, i16* %a.addr.0101, i64 1
  %0 = load i16, i16* %a.addr.0101, align 2
  %conv = zext i16 %0 to i32
  %incdec.ptr1 = getelementptr inbounds i16, i16* %b, i64 1
  %1 = load i16, i16* %b, align 2
  %conv2 = zext i16 %1 to i32
  %sub = sub nsw i32 %conv, %conv2
  %arrayidx = getelementptr inbounds i16, i16* %g, i32 %sub
  %2 = load i16, i16* %arrayidx, align 2
  %conv3 = zext i16 %2 to i32
  %add = add nsw i32 %conv3, %sum.0102
  %incdec.ptr4 = getelementptr inbounds i16, i16* %a.addr.0101, i64 2
  %3 = load i16, i16* %incdec.ptr, align 2
  %conv5 = zext i16 %3 to i32
  %incdec.ptr6 = getelementptr inbounds i16, i16* %b, i64 2
  %4 = load i16, i16* %incdec.ptr1, align 2
  %conv7 = zext i16 %4 to i32
  %sub8 = sub nsw i32 %conv5, %conv7
  %arrayidx10 = getelementptr inbounds i16, i16* %g, i32 %sub8
  %5 = load i16, i16* %arrayidx10, align 2
  %conv11 = zext i16 %5 to i32
  %add12 = add nsw i32 %add, %conv11
  %incdec.ptr13 = getelementptr inbounds i16, i16* %a.addr.0101, i64 3
  %6 = load i16, i16* %incdec.ptr4, align 2
  %conv14 = zext i16 %6 to i32
  %incdec.ptr15 = getelementptr inbounds i16, i16* %b, i64 3
  %7 = load i16, i16* %incdec.ptr6, align 2
  %conv16 = zext i16 %7 to i32
  %sub17 = sub nsw i32 %conv14, %conv16
  %arrayidx19 = getelementptr inbounds i16, i16* %g, i32 %sub17
  %8 = load i16, i16* %arrayidx19, align 2
  %conv20 = zext i16 %8 to i32
  %add21 = add nsw i32 %add12, %conv20
  %incdec.ptr22 = getelementptr inbounds i16, i16* %a.addr.0101, i64 4
  %9 = load i16, i16* %incdec.ptr13, align 2
  %conv23 = zext i16 %9 to i32
  %incdec.ptr24 = getelementptr inbounds i16, i16* %b, i64 4
  %10 = load i16, i16* %incdec.ptr15, align 2
  %conv25 = zext i16 %10 to i32
  %sub26 = sub nsw i32 %conv23, %conv25
  %arrayidx28 = getelementptr inbounds i16, i16* %g, i32 %sub26
  %11 = load i16, i16* %arrayidx28, align 2
  %conv29 = zext i16 %11 to i32
  %add30 = add nsw i32 %add21, %conv29
  %incdec.ptr31 = getelementptr inbounds i16, i16* %a.addr.0101, i64 5
  %12 = load i16, i16* %incdec.ptr22, align 2
  %conv32 = zext i16 %12 to i32
  %incdec.ptr33 = getelementptr inbounds i16, i16* %b, i64 5
  %13 = load i16, i16* %incdec.ptr24, align 2
  %conv34 = zext i16 %13 to i32
  %sub35 = sub nsw i32 %conv32, %conv34
  %arrayidx37 = getelementptr inbounds i16, i16* %g, i32 %sub35
  %14 = load i16, i16* %arrayidx37, align 2
  %conv38 = zext i16 %14 to i32
  %add39 = add nsw i32 %add30, %conv38
  %incdec.ptr40 = getelementptr inbounds i16, i16* %a.addr.0101, i64 6
  %15 = load i16, i16* %incdec.ptr31, align 2
  %conv41 = zext i16 %15 to i32
  %incdec.ptr42 = getelementptr inbounds i16, i16* %b, i64 6
  %16 = load i16, i16* %incdec.ptr33, align 2
  %conv43 = zext i16 %16 to i32
  %sub44 = sub nsw i32 %conv41, %conv43
  %arrayidx46 = getelementptr inbounds i16, i16* %g, i32 %sub44
  %17 = load i16, i16* %arrayidx46, align 2
  %conv47 = zext i16 %17 to i32
  %add48 = add nsw i32 %add39, %conv47
  %incdec.ptr49 = getelementptr inbounds i16, i16* %a.addr.0101, i64 7
  %18 = load i16, i16* %incdec.ptr40, align 2
  %conv50 = zext i16 %18 to i32
  %incdec.ptr51 = getelementptr inbounds i16, i16* %b, i64 7
  %19 = load i16, i16* %incdec.ptr42, align 2
  %conv52 = zext i16 %19 to i32
  %sub53 = sub nsw i32 %conv50, %conv52
  %arrayidx55 = getelementptr inbounds i16, i16* %g, i32 %sub53
  %20 = load i16, i16* %arrayidx55, align 2
  %conv56 = zext i16 %20 to i32
  %add57 = add nsw i32 %add48, %conv56
  %incdec.ptr58 = getelementptr inbounds i16, i16* %a.addr.0101, i64 8
  %21 = load i16, i16* %incdec.ptr49, align 2
  %conv59 = zext i16 %21 to i32
  %22 = load i16, i16* %incdec.ptr51, align 2
  %conv61 = zext i16 %22 to i32
  %sub62 = sub nsw i32 %conv59, %conv61
  %arrayidx64 = getelementptr inbounds i16, i16* %g, i32 %sub62
  %23 = load i16, i16* %arrayidx64, align 2
  %conv65 = zext i16 %23 to i32
  %add66 = add nsw i32 %add57, %conv65
  %inc = add nuw nsw i32 %i.0103, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
}

; KRYO-LABEL: @gather_reduce_8x16_i64
;
; KRYO: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>
; KRYO: zext <8 x i16> [[L]] to <8 x i32>
; KRYO: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>
; KRYO: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
; KRYO: sext i32 [[X]] to i64
;
define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
entry:
  %cmp.99 = icmp sgt i32 %n, 0
  br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:
  br label %for.body

for.cond.cleanup.loopexit:
  br label %for.cond.cleanup

for.cond.cleanup:
  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add66, %for.cond.cleanup.loopexit ]
  ret i32 %sum.0.lcssa

for.body:
  %i.0103 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %sum.0102 = phi i32 [ %add66, %for.body ], [ 0, %for.body.preheader ]
  %a.addr.0101 = phi i16* [ %incdec.ptr58, %for.body ], [ %a, %for.body.preheader ]
  %incdec.ptr = getelementptr inbounds i16, i16* %a.addr.0101, i64 1
  %0 = load i16, i16* %a.addr.0101, align 2
  %conv = zext i16 %0 to i64
  %incdec.ptr1 = getelementptr inbounds i16, i16* %b, i64 1
  %1 = load i16, i16* %b, align 2
  %conv2 = zext i16 %1 to i64
  %sub = sub nsw i64 %conv, %conv2
  %arrayidx = getelementptr inbounds i16, i16* %g, i64 %sub
  %2 = load i16, i16* %arrayidx, align 2
  %conv3 = zext i16 %2 to i32
  %add = add nsw i32 %conv3, %sum.0102
  %incdec.ptr4 = getelementptr inbounds i16, i16* %a.addr.0101, i64 2
  %3 = load i16, i16* %incdec.ptr, align 2
  %conv5 = zext i16 %3 to i64
  %incdec.ptr6 = getelementptr inbounds i16, i16* %b, i64 2
  %4 = load i16, i16* %incdec.ptr1, align 2
  %conv7 = zext i16 %4 to i64
  %sub8 = sub nsw i64 %conv5, %conv7
  %arrayidx10 = getelementptr inbounds i16, i16* %g, i64 %sub8
  %5 = load i16, i16* %arrayidx10, align 2
  %conv11 = zext i16 %5 to i32
  %add12 = add nsw i32 %add, %conv11
  %incdec.ptr13 = getelementptr inbounds i16, i16* %a.addr.0101, i64 3
  %6 = load i16, i16* %incdec.ptr4, align 2
  %conv14 = zext i16 %6 to i64
  %incdec.ptr15 = getelementptr inbounds i16, i16* %b, i64 3
  %7 = load i16, i16* %incdec.ptr6, align 2
  %conv16 = zext i16 %7 to i64
  %sub17 = sub nsw i64 %conv14, %conv16
  %arrayidx19 = getelementptr inbounds i16, i16* %g, i64 %sub17
  %8 = load i16, i16* %arrayidx19, align 2
  %conv20 = zext i16 %8 to i32
  %add21 = add nsw i32 %add12, %conv20
  %incdec.ptr22 = getelementptr inbounds i16, i16* %a.addr.0101, i64 4
  %9 = load i16, i16* %incdec.ptr13, align 2
  %conv23 = zext i16 %9 to i64
  %incdec.ptr24 = getelementptr inbounds i16, i16* %b, i64 4
  %10 = load i16, i16* %incdec.ptr15, align 2
  %conv25 = zext i16 %10 to i64
  %sub26 = sub nsw i64 %conv23, %conv25
  %arrayidx28 = getelementptr inbounds i16, i16* %g, i64 %sub26
  %11 = load i16, i16* %arrayidx28, align 2
  %conv29 = zext i16 %11 to i32
  %add30 = add nsw i32 %add21, %conv29
  %incdec.ptr31 = getelementptr inbounds i16, i16* %a.addr.0101, i64 5
  %12 = load i16, i16* %incdec.ptr22, align 2
  %conv32 = zext i16 %12 to i64
  %incdec.ptr33 = getelementptr inbounds i16, i16* %b, i64 5
  %13 = load i16, i16* %incdec.ptr24, align 2
  %conv34 = zext i16 %13 to i64
  %sub35 = sub nsw i64 %conv32, %conv34
  %arrayidx37 = getelementptr inbounds i16, i16* %g, i64 %sub35
  %14 = load i16, i16* %arrayidx37, align 2
  %conv38 = zext i16 %14 to i32
  %add39 = add nsw i32 %add30, %conv38
  %incdec.ptr40 = getelementptr inbounds i16, i16* %a.addr.0101, i64 6
  %15 = load i16, i16* %incdec.ptr31, align 2
  %conv41 = zext i16 %15 to i64
  %incdec.ptr42 = getelementptr inbounds i16, i16* %b, i64 6
  %16 = load i16, i16* %incdec.ptr33, align 2
  %conv43 = zext i16 %16 to i64
  %sub44 = sub nsw i64 %conv41, %conv43
  %arrayidx46 = getelementptr inbounds i16, i16* %g, i64 %sub44
  %17 = load i16, i16* %arrayidx46, align 2
  %conv47 = zext i16 %17 to i32
  %add48 = add nsw i32 %add39, %conv47
  %incdec.ptr49 = getelementptr inbounds i16, i16* %a.addr.0101, i64 7
  %18 = load i16, i16* %incdec.ptr40, align 2
  %conv50 = zext i16 %18 to i64
  %incdec.ptr51 = getelementptr inbounds i16, i16* %b, i64 7
  %19 = load i16, i16* %incdec.ptr42, align 2
  %conv52 = zext i16 %19 to i64
  %sub53 = sub nsw i64 %conv50, %conv52
  %arrayidx55 = getelementptr inbounds i16, i16* %g, i64 %sub53
  %20 = load i16, i16* %arrayidx55, align 2
  %conv56 = zext i16 %20 to i32
  %add57 = add nsw i32 %add48, %conv56
  %incdec.ptr58 = getelementptr inbounds i16, i16* %a.addr.0101, i64 8
  %21 = load i16, i16* %incdec.ptr49, align 2
  %conv59 = zext i16 %21 to i64
  %22 = load i16, i16* %incdec.ptr51, align 2
  %conv61 = zext i16 %22 to i64
  %sub62 = sub nsw i64 %conv59, %conv61
  %arrayidx64 = getelementptr inbounds i16, i16* %g, i64 %sub62
  %23 = load i16, i16* %arrayidx64, align 2
  %conv65 = zext i16 %23 to i32
  %add66 = add nsw i32 %add57, %conv65
  %inc = add nuw nsw i32 %i.0103, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
}
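Editorial note: the i64 function above differs from the i32 one only in the width of the index subtraction. A minimal C sketch of that variant, assuming the same loop as the comment at the top of this file (illustration only; the rolled form and function name are hypothetical):

/* Hypothetical rolled form of @gather_reduce_8x16_i64: the index
 * difference is computed in 64 bits before indexing g. */
int gather_reduce_8x16_w(short *a, short *b, short *g, int n) {
  int sum = 0;
  for (int i = 0; i < n; ++i)
    for (int k = 0; k < 8; ++k) {
      long d = (long)*a++ - (long)b[k]; /* 64-bit subtraction */
      sum += g[d];
    }
  return sum;
}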
@@ -1,310 +0,0 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -slp-vectorizer -S | FileCheck %s --check-prefix=DEFAULT
; RUN: opt < %s -slp-schedule-budget=0 -slp-min-tree-size=0 -slp-threshold=-30 -slp-vectorizer -S | FileCheck %s --check-prefix=GATHER
; RUN: opt < %s -slp-schedule-budget=0 -slp-threshold=-30 -slp-vectorizer -S | FileCheck %s --check-prefix=MAX-COST

target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnu"

@a = common global [80 x i8] zeroinitializer, align 16

define void @PR28330(i32 %n) {
; DEFAULT-LABEL: @PR28330(
; DEFAULT-NEXT: entry:
; DEFAULT-NEXT: [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1
; DEFAULT-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
; DEFAULT-NEXT: br label [[FOR_BODY:%.*]]
; DEFAULT: for.body:
; DEFAULT-NEXT: [[TMP17:%.*]] = phi i32 [ [[BIN_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; DEFAULT-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
; DEFAULT-NEXT: [[TMP20:%.*]] = add i32 [[TMP17]], undef
; DEFAULT-NEXT: [[TMP22:%.*]] = add i32 [[TMP20]], undef
; DEFAULT-NEXT: [[TMP24:%.*]] = add i32 [[TMP22]], undef
; DEFAULT-NEXT: [[TMP26:%.*]] = add i32 [[TMP24]], undef
; DEFAULT-NEXT: [[TMP28:%.*]] = add i32 [[TMP26]], undef
; DEFAULT-NEXT: [[TMP30:%.*]] = add i32 [[TMP28]], undef
; DEFAULT-NEXT: [[TMP32:%.*]] = add i32 [[TMP30]], undef
; DEFAULT-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP2]])
; DEFAULT-NEXT: [[BIN_EXTRA]] = add i32 [[TMP3]], [[TMP17]]
; DEFAULT-NEXT: [[TMP34:%.*]] = add i32 [[TMP32]], undef
; DEFAULT-NEXT: br label [[FOR_BODY]]
;
; GATHER-LABEL: @PR28330(
; GATHER-NEXT: entry:
; GATHER-NEXT: [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <2 x i8>*), align 1
; GATHER-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> [[TMP0]], zeroinitializer
; GATHER-NEXT: [[TMP4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1
; GATHER-NEXT: [[TMP5:%.*]] = icmp eq i8 [[TMP4]], 0
; GATHER-NEXT: [[TMP6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4
; GATHER-NEXT: [[TMP7:%.*]] = icmp eq i8 [[TMP6]], 0
; GATHER-NEXT: [[TMP8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1
; GATHER-NEXT: [[TMP9:%.*]] = icmp eq i8 [[TMP8]], 0
; GATHER-NEXT: [[TMP10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2
; GATHER-NEXT: [[TMP11:%.*]] = icmp eq i8 [[TMP10]], 0
; GATHER-NEXT: [[TMP12:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1
; GATHER-NEXT: [[TMP13:%.*]] = icmp eq i8 [[TMP12]], 0
; GATHER-NEXT: [[TMP14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8
; GATHER-NEXT: [[TMP15:%.*]] = icmp eq i8 [[TMP14]], 0
; GATHER-NEXT: br label [[FOR_BODY:%.*]]
; GATHER: for.body:
; GATHER-NEXT: [[TMP17:%.*]] = phi i32 [ [[BIN_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; GATHER-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x i32> <i32 -720, i32 -720>, <2 x i32> <i32 -80, i32 -80>
; GATHER-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
; GATHER-NEXT: [[TMP20:%.*]] = add i32 [[TMP17]], [[TMP3]]
; GATHER-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
; GATHER-NEXT: [[TMP22:%.*]] = add i32 [[TMP20]], [[TMP4]]
; GATHER-NEXT: [[TMP23:%.*]] = select i1 [[TMP5]], i32 -720, i32 -80
; GATHER-NEXT: [[TMP24:%.*]] = add i32 [[TMP22]], [[TMP23]]
; GATHER-NEXT: [[TMP25:%.*]] = select i1 [[TMP7]], i32 -720, i32 -80
; GATHER-NEXT: [[TMP26:%.*]] = add i32 [[TMP24]], [[TMP25]]
; GATHER-NEXT: [[TMP27:%.*]] = select i1 [[TMP9]], i32 -720, i32 -80
; GATHER-NEXT: [[TMP28:%.*]] = add i32 [[TMP26]], [[TMP27]]
; GATHER-NEXT: [[TMP29:%.*]] = select i1 [[TMP11]], i32 -720, i32 -80
; GATHER-NEXT: [[TMP30:%.*]] = add i32 [[TMP28]], [[TMP29]]
; GATHER-NEXT: [[TMP31:%.*]] = select i1 [[TMP13]], i32 -720, i32 -80
; GATHER-NEXT: [[TMP32:%.*]] = add i32 [[TMP30]], [[TMP31]]
; GATHER-NEXT: [[TMP33:%.*]] = select i1 [[TMP15]], i32 -720, i32 -80
; GATHER-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> undef, i32 [[TMP3]], i32 0
; GATHER-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP4]], i32 1
; GATHER-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP23]], i32 2
; GATHER-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[TMP25]], i32 3
; GATHER-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[TMP27]], i32 4
; GATHER-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[TMP29]], i32 5
; GATHER-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[TMP31]], i32 6
; GATHER-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[TMP33]], i32 7
; GATHER-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP12]])
; GATHER-NEXT: [[BIN_EXTRA]] = add i32 [[TMP13]], [[TMP17]]
; GATHER-NEXT: [[TMP34:%.*]] = add i32 [[TMP32]], [[TMP33]]
; GATHER-NEXT: br label [[FOR_BODY]]
;
; MAX-COST-LABEL: @PR28330(
; MAX-COST-NEXT: entry:
; MAX-COST-NEXT: [[TMP0:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1), align 1
; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq i8 [[TMP0]], 0
; MAX-COST-NEXT: [[TMP2:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 2), align 2
; MAX-COST-NEXT: [[TMP3:%.*]] = icmp eq i8 [[TMP2]], 0
; MAX-COST-NEXT: [[TMP4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1
; MAX-COST-NEXT: [[TMP5:%.*]] = icmp eq i8 [[TMP4]], 0
; MAX-COST-NEXT: [[TMP6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4
; MAX-COST-NEXT: [[TMP7:%.*]] = icmp eq i8 [[TMP6]], 0
; MAX-COST-NEXT: [[TMP8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1
; MAX-COST-NEXT: [[TMP9:%.*]] = icmp eq i8 [[TMP8]], 0
; MAX-COST-NEXT: [[TMP10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2
; MAX-COST-NEXT: [[TMP11:%.*]] = icmp eq i8 [[TMP10]], 0
; MAX-COST-NEXT: [[TMP12:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1
; MAX-COST-NEXT: [[TMP13:%.*]] = icmp eq i8 [[TMP12]], 0
; MAX-COST-NEXT: [[TMP14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8
; MAX-COST-NEXT: [[TMP15:%.*]] = icmp eq i8 [[TMP14]], 0
; MAX-COST-NEXT: br label [[FOR_BODY:%.*]]
; MAX-COST: for.body:
; MAX-COST-NEXT: [[TMP17:%.*]] = phi i32 [ [[TMP34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; MAX-COST-NEXT: [[TMP19:%.*]] = select i1 [[TMP1]], i32 -720, i32 -80
; MAX-COST-NEXT: [[TMP20:%.*]] = add i32 [[TMP17]], [[TMP19]]
; MAX-COST-NEXT: [[TMP21:%.*]] = select i1 [[TMP3]], i32 -720, i32 -80
; MAX-COST-NEXT: [[TMP22:%.*]] = add i32 [[TMP20]], [[TMP21]]
; MAX-COST-NEXT: [[TMP23:%.*]] = select i1 [[TMP5]], i32 -720, i32 -80
; MAX-COST-NEXT: [[TMP24:%.*]] = add i32 [[TMP22]], [[TMP23]]
; MAX-COST-NEXT: [[TMP25:%.*]] = select i1 [[TMP7]], i32 -720, i32 -80
; MAX-COST-NEXT: [[TMP26:%.*]] = add i32 [[TMP24]], [[TMP25]]
; MAX-COST-NEXT: [[TMP27:%.*]] = select i1 [[TMP9]], i32 -720, i32 -80
; MAX-COST-NEXT: [[TMP28:%.*]] = add i32 [[TMP26]], [[TMP27]]
; MAX-COST-NEXT: [[TMP29:%.*]] = select i1 [[TMP11]], i32 -720, i32 -80
; MAX-COST-NEXT: [[TMP30:%.*]] = add i32 [[TMP28]], [[TMP29]]
; MAX-COST-NEXT: [[TMP31:%.*]] = select i1 [[TMP13]], i32 -720, i32 -80
; MAX-COST-NEXT: [[TMP32:%.*]] = add i32 [[TMP30]], [[TMP31]]
; MAX-COST-NEXT: [[TMP33:%.*]] = select i1 [[TMP15]], i32 -720, i32 -80
; MAX-COST-NEXT: [[TMP34]] = add i32 [[TMP32]], [[TMP33]]
; MAX-COST-NEXT: br label [[FOR_BODY]]
;
entry:
  %tmp0 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1), align 1
  %tmp1 = icmp eq i8 %tmp0, 0
  %tmp2 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 2), align 2
  %tmp3 = icmp eq i8 %tmp2, 0
  %tmp4 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1
  %tmp5 = icmp eq i8 %tmp4, 0
  %tmp6 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4
  %tmp7 = icmp eq i8 %tmp6, 0
  %tmp8 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1
  %tmp9 = icmp eq i8 %tmp8, 0
  %tmp10 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2
  %tmp11 = icmp eq i8 %tmp10, 0
  %tmp12 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1
  %tmp13 = icmp eq i8 %tmp12, 0
  %tmp14 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8
  %tmp15 = icmp eq i8 %tmp14, 0
  br label %for.body

for.body:
  %tmp17 = phi i32 [ %tmp34, %for.body ], [ 0, %entry ]
  %tmp19 = select i1 %tmp1, i32 -720, i32 -80
  %tmp20 = add i32 %tmp17, %tmp19
  %tmp21 = select i1 %tmp3, i32 -720, i32 -80
  %tmp22 = add i32 %tmp20, %tmp21
  %tmp23 = select i1 %tmp5, i32 -720, i32 -80
  %tmp24 = add i32 %tmp22, %tmp23
  %tmp25 = select i1 %tmp7, i32 -720, i32 -80
  %tmp26 = add i32 %tmp24, %tmp25
  %tmp27 = select i1 %tmp9, i32 -720, i32 -80
  %tmp28 = add i32 %tmp26, %tmp27
  %tmp29 = select i1 %tmp11, i32 -720, i32 -80
  %tmp30 = add i32 %tmp28, %tmp29
  %tmp31 = select i1 %tmp13, i32 -720, i32 -80
  %tmp32 = add i32 %tmp30, %tmp31
  %tmp33 = select i1 %tmp15, i32 -720, i32 -80
  %tmp34 = add i32 %tmp32, %tmp33
  br label %for.body
}

define void @PR32038(i32 %n) {
; DEFAULT-LABEL: @PR32038(
; DEFAULT-NEXT: entry:
; DEFAULT-NEXT: [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1
; DEFAULT-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
; DEFAULT-NEXT: br label [[FOR_BODY:%.*]]
; DEFAULT: for.body:
; DEFAULT-NEXT: [[TMP17:%.*]] = phi i32 [ [[BIN_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; DEFAULT-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
; DEFAULT-NEXT: [[TMP20:%.*]] = add i32 -5, undef
; DEFAULT-NEXT: [[TMP22:%.*]] = add i32 [[TMP20]], undef
; DEFAULT-NEXT: [[TMP24:%.*]] = add i32 [[TMP22]], undef
; DEFAULT-NEXT: [[TMP26:%.*]] = add i32 [[TMP24]], undef
; DEFAULT-NEXT: [[TMP28:%.*]] = add i32 [[TMP26]], undef
; DEFAULT-NEXT: [[TMP30:%.*]] = add i32 [[TMP28]], undef
; DEFAULT-NEXT: [[TMP32:%.*]] = add i32 [[TMP30]], undef
; DEFAULT-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP2]])
; DEFAULT-NEXT: [[BIN_EXTRA]] = add i32 [[TMP3]], -5
; DEFAULT-NEXT: [[TMP34:%.*]] = add i32 [[TMP32]], undef
; DEFAULT-NEXT: br label [[FOR_BODY]]
;
; GATHER-LABEL: @PR32038(
; GATHER-NEXT: entry:
; GATHER-NEXT: [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <2 x i8>*), align 1
; GATHER-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> [[TMP0]], zeroinitializer
; GATHER-NEXT: [[TMP4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1
; GATHER-NEXT: [[TMP5:%.*]] = icmp eq i8 [[TMP4]], 0
; GATHER-NEXT: [[TMP6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4
; GATHER-NEXT: [[TMP7:%.*]] = icmp eq i8 [[TMP6]], 0
; GATHER-NEXT: [[TMP8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1
; GATHER-NEXT: [[TMP9:%.*]] = icmp eq i8 [[TMP8]], 0
; GATHER-NEXT: [[TMP10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2
; GATHER-NEXT: [[TMP11:%.*]] = icmp eq i8 [[TMP10]], 0
; GATHER-NEXT: [[TMP12:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1
; GATHER-NEXT: [[TMP13:%.*]] = icmp eq i8 [[TMP12]], 0
; GATHER-NEXT: [[TMP14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8
; GATHER-NEXT: [[TMP15:%.*]] = icmp eq i8 [[TMP14]], 0
; GATHER-NEXT: br label [[FOR_BODY:%.*]]
; GATHER: for.body:
; GATHER-NEXT: [[TMP17:%.*]] = phi i32 [ [[BIN_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; GATHER-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x i32> <i32 -720, i32 -720>, <2 x i32> <i32 -80, i32 -80>
; GATHER-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
; GATHER-NEXT: [[TMP20:%.*]] = add i32 -5, [[TMP3]]
; GATHER-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
; GATHER-NEXT: [[TMP22:%.*]] = add i32 [[TMP20]], [[TMP4]]
; GATHER-NEXT: [[TMP23:%.*]] = select i1 [[TMP5]], i32 -720, i32 -80
; GATHER-NEXT: [[TMP24:%.*]] = add i32 [[TMP22]], [[TMP23]]
; GATHER-NEXT: [[TMP25:%.*]] = select i1 [[TMP7]], i32 -720, i32 -80
; GATHER-NEXT: [[TMP26:%.*]] = add i32 [[TMP24]], [[TMP25]]
; GATHER-NEXT: [[TMP27:%.*]] = select i1 [[TMP9]], i32 -720, i32 -80
; GATHER-NEXT: [[TMP28:%.*]] = add i32 [[TMP26]], [[TMP27]]
; GATHER-NEXT: [[TMP29:%.*]] = select i1 [[TMP11]], i32 -720, i32 -80
; GATHER-NEXT: [[TMP30:%.*]] = add i32 [[TMP28]], [[TMP29]]
; GATHER-NEXT: [[TMP31:%.*]] = select i1 [[TMP13]], i32 -720, i32 -80
; GATHER-NEXT: [[TMP32:%.*]] = add i32 [[TMP30]], [[TMP31]]
; GATHER-NEXT: [[TMP33:%.*]] = select i1 [[TMP15]], i32 -720, i32 -80
; GATHER-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> undef, i32 [[TMP3]], i32 0
; GATHER-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP4]], i32 1
; GATHER-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP23]], i32 2
; GATHER-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[TMP25]], i32 3
; GATHER-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[TMP27]], i32 4
; GATHER-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[TMP29]], i32 5
; GATHER-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[TMP31]], i32 6
; GATHER-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[TMP33]], i32 7
; GATHER-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP12]])
; GATHER-NEXT: [[BIN_EXTRA]] = add i32 [[TMP13]], -5
; GATHER-NEXT: [[TMP34:%.*]] = add i32 [[TMP32]], [[TMP33]]
; GATHER-NEXT: br label [[FOR_BODY]]
;
; MAX-COST-LABEL: @PR32038(
; MAX-COST-NEXT: entry:
; MAX-COST-NEXT: [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <2 x i8>*), align 1
; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> [[TMP0]], zeroinitializer
; MAX-COST-NEXT: [[TMP4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1
; MAX-COST-NEXT: [[TMPP5:%.*]] = icmp eq i8 [[TMP4]], 0
; MAX-COST-NEXT: [[TMP6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4
; MAX-COST-NEXT: [[TMPP7:%.*]] = icmp eq i8 [[TMP6]], 0
; MAX-COST-NEXT: [[TMP8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1
; MAX-COST-NEXT: [[TMP9:%.*]] = icmp eq i8 [[TMP8]], 0
; MAX-COST-NEXT: [[TMP10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2
; MAX-COST-NEXT: [[TMP11:%.*]] = icmp eq i8 [[TMP10]], 0
; MAX-COST-NEXT: [[TMP12:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1
; MAX-COST-NEXT: [[TMP13:%.*]] = icmp eq i8 [[TMP12]], 0
; MAX-COST-NEXT: [[TMP14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8
; MAX-COST-NEXT: [[TMP15:%.*]] = icmp eq i8 [[TMP14]], 0
; MAX-COST-NEXT: br label [[FOR_BODY:%.*]]
; MAX-COST: for.body:
; MAX-COST-NEXT: [[TMP17:%.*]] = phi i32 [ [[TMP34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; MAX-COST-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
; MAX-COST-NEXT: [[TMP3:%.*]] = insertelement <4 x i1> undef, i1 [[TMP2]], i32 0
; MAX-COST-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
; MAX-COST-NEXT: [[TMP5:%.*]] = insertelement <4 x i1> [[TMP3]], i1 [[TMP4]], i32 1
; MAX-COST-NEXT: [[TMP6:%.*]] = insertelement <4 x i1> [[TMP5]], i1 [[TMPP5]], i32 2
; MAX-COST-NEXT: [[TMP7:%.*]] = insertelement <4 x i1> [[TMP6]], i1 [[TMPP7]], i32 3
; MAX-COST-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> <i32 -720, i32 -720, i32 -720, i32 -720>, <4 x i32> <i32 -80, i32 -80, i32 -80, i32 -80>
; MAX-COST-NEXT: [[TMP20:%.*]] = add i32 -5, undef
; MAX-COST-NEXT: [[TMP22:%.*]] = add i32 [[TMP20]], undef
; MAX-COST-NEXT: [[TMP24:%.*]] = add i32 [[TMP22]], undef
; MAX-COST-NEXT: [[TMP26:%.*]] = add i32 [[TMP24]], undef
; MAX-COST-NEXT: [[TMP27:%.*]] = select i1 [[TMP9]], i32 -720, i32 -80
; MAX-COST-NEXT: [[TMP28:%.*]] = add i32 [[TMP26]], [[TMP27]]
; MAX-COST-NEXT: [[TMP29:%.*]] = select i1 [[TMP11]], i32 -720, i32 -80
; MAX-COST-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP8]])
; MAX-COST-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[TMP27]]
; MAX-COST-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[TMP29]]
; MAX-COST-NEXT: [[BIN_EXTRA:%.*]] = add i32 [[TMP11]], -5
; MAX-COST-NEXT: [[TMP30:%.*]] = add i32 [[TMP28]], [[TMP29]]
; MAX-COST-NEXT: [[TMP31:%.*]] = select i1 [[TMP13]], i32 -720, i32 -80
; MAX-COST-NEXT: [[TMP32:%.*]] = add i32 [[BIN_EXTRA]], [[TMP31]]
; MAX-COST-NEXT: [[TMP33:%.*]] = select i1 [[TMP15]], i32 -720, i32 -80
; MAX-COST-NEXT: [[TMP34]] = add i32 [[TMP32]], [[TMP33]]
; MAX-COST-NEXT: br label [[FOR_BODY]]
;
entry:
  %tmp0 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1), align 1
  %tmp1 = icmp eq i8 %tmp0, 0
  %tmp2 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 2), align 2
  %tmp3 = icmp eq i8 %tmp2, 0
  %tmp4 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1
  %tmp5 = icmp eq i8 %tmp4, 0
  %tmp6 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4
  %tmp7 = icmp eq i8 %tmp6, 0
  %tmp8 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1
  %tmp9 = icmp eq i8 %tmp8, 0
  %tmp10 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2
  %tmp11 = icmp eq i8 %tmp10, 0
  %tmp12 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1
  %tmp13 = icmp eq i8 %tmp12, 0
  %tmp14 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8
  %tmp15 = icmp eq i8 %tmp14, 0
  br label %for.body

for.body:
  %tmp17 = phi i32 [ %tmp34, %for.body ], [ 0, %entry ]
  %tmp19 = select i1 %tmp1, i32 -720, i32 -80
  %tmp20 = add i32 -5, %tmp19
  %tmp21 = select i1 %tmp3, i32 -720, i32 -80
  %tmp22 = add i32 %tmp20, %tmp21
  %tmp23 = select i1 %tmp5, i32 -720, i32 -80
  %tmp24 = add i32 %tmp22, %tmp23
  %tmp25 = select i1 %tmp7, i32 -720, i32 -80
  %tmp26 = add i32 %tmp24, %tmp25
  %tmp27 = select i1 %tmp9, i32 -720, i32 -80
  %tmp28 = add i32 %tmp26, %tmp27
  %tmp29 = select i1 %tmp11, i32 -720, i32 -80
  %tmp30 = add i32 %tmp28, %tmp29
  %tmp31 = select i1 %tmp13, i32 -720, i32 -80
  %tmp32 = add i32 %tmp30, %tmp31
  %tmp33 = select i1 %tmp15, i32 -720, i32 -80
  %tmp34 = add i32 %tmp32, %tmp33
  br label %for.body
}
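Editorial note: PR28330 and PR32038 both reduce eight compare-fed selects over bytes of @a; PR32038 seeds the sum with the constant -5 instead of the loop phi. A hedged C analogue of one trip of the scalar pattern (illustration only; the function name is hypothetical):

/* Illustrative scalar shape of the PR28330 reduction body. */
extern char a[80];
int pr28330_step(int s) {
  for (int k = 1; k <= 8; ++k)
    s += (a[k] == 0) ? -720 : -80; /* the icmp+select chain */
  return s;
}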
@@ -1,155 +0,0 @@
; RUN: opt -S -slp-vectorizer -slp-threshold=-18 -dce -instcombine -pass-remarks-output=%t < %s | FileCheck %s
; RUN: cat %t | FileCheck -check-prefix=YAML %s
; RUN: opt -S -passes='slp-vectorizer,dce,instcombine' -slp-threshold=-18 -pass-remarks-output=%t < %s | FileCheck %s
; RUN: cat %t | FileCheck -check-prefix=YAML %s


target datalayout = "e-m:e-i32:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnu"

; These tests check that we remove from consideration pairs of seed
; getelementptrs when they are known to have a constant difference. Such pairs
; are likely not good candidates for vectorization since one can be computed
; from the other. We use an unprofitable threshold to force vectorization.
;
; int getelementptr(int *g, int n, int w, int x, int y, int z) {
;   int sum = 0;
;   for (int i = 0; i < n ; ++i) {
;     sum += g[2*i + w]; sum += g[2*i + x];
;     sum += g[2*i + y]; sum += g[2*i + z];
;   }
;   return sum;
; }
;

; CHECK-LABEL: @getelementptr_4x32
;
; CHECK: [[A:%[a-zA-Z0-9.]+]] = add nsw <4 x i32>
; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <4 x i32> [[A]]
; CHECK: sext i32 [[X]] to i64

; YAML: --- !Passed
; YAML-NEXT: Pass: slp-vectorizer
; YAML-NEXT: Name: VectorizedList
; YAML-NEXT: Function: getelementptr_4x32
; YAML-NEXT: Args:
; YAML-NEXT: - String: 'SLP vectorized with cost '
; YAML-NEXT: - Cost: '11'
; YAML-NEXT: - String: ' and with tree size '
; YAML-NEXT: - TreeSize: '5'

; YAML: --- !Passed
; YAML-NEXT: Pass: slp-vectorizer
; YAML-NEXT: Name: VectorizedList
; YAML-NEXT: Function: getelementptr_4x32
; YAML-NEXT: Args:
; YAML-NEXT: - String: 'SLP vectorized with cost '
; YAML-NEXT: - Cost: '16'
; YAML-NEXT: - String: ' and with tree size '
; YAML-NEXT: - TreeSize: '3'

define i32 @getelementptr_4x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %y, i32 %z) {
entry:
  %cmp31 = icmp sgt i32 %n, 0
  br i1 %cmp31, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:
  br label %for.body

for.cond.cleanup.loopexit:
  br label %for.cond.cleanup

for.cond.cleanup:
  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add16, %for.cond.cleanup.loopexit ]
  ret i32 %sum.0.lcssa

for.body:
  %indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
  %sum.032 = phi i32 [ 0, %for.body.preheader ], [ %add16, %for.body ]
  %t4 = shl nsw i32 %indvars.iv, 1
  %t5 = add nsw i32 %t4, 0
  %arrayidx = getelementptr inbounds i32, i32* %g, i32 %t5
  %t6 = load i32, i32* %arrayidx, align 4
  %add1 = add nsw i32 %t6, %sum.032
  %t7 = add nsw i32 %t4, %x
  %arrayidx5 = getelementptr inbounds i32, i32* %g, i32 %t7
  %t8 = load i32, i32* %arrayidx5, align 4
  %add6 = add nsw i32 %add1, %t8
  %t9 = add nsw i32 %t4, %y
  %arrayidx10 = getelementptr inbounds i32, i32* %g, i32 %t9
  %t10 = load i32, i32* %arrayidx10, align 4
  %add11 = add nsw i32 %add6, %t10
  %t11 = add nsw i32 %t4, %z
  %arrayidx15 = getelementptr inbounds i32, i32* %g, i32 %t11
  %t12 = load i32, i32* %arrayidx15, align 4
  %add16 = add nsw i32 %add11, %t12
  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next, %n
  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
}

; CHECK-LABEL: @getelementptr_2x32
;
; CHECK: [[A:%[a-zA-Z0-9.]+]] = add nsw <2 x i32>
; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> [[A]]
; CHECK: sext i32 [[X]] to i64

; YAML: --- !Passed
; YAML-NEXT: Pass: slp-vectorizer
; YAML-NEXT: Name: VectorizedList
; YAML-NEXT: Function: getelementptr_2x32
; YAML-NEXT: Args:
; YAML-NEXT: - String: 'SLP vectorized with cost '
; YAML-NEXT: - Cost: '11'
; YAML-NEXT: - String: ' and with tree size '
; YAML-NEXT: - TreeSize: '5'

; YAML: --- !Passed
; YAML-NEXT: Pass: slp-vectorizer
; YAML-NEXT: Name: VectorizedList
; YAML-NEXT: Function: getelementptr_2x32
; YAML-NEXT: Args:
; YAML-NEXT: - String: 'SLP vectorized with cost '
; YAML-NEXT: - Cost: '6'
; YAML-NEXT: - String: ' and with tree size '
; YAML-NEXT: - TreeSize: '3'

define i32 @getelementptr_2x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %y, i32 %z) {
entry:
  %cmp31 = icmp sgt i32 %n, 0
  br i1 %cmp31, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:
  br label %for.body

for.cond.cleanup.loopexit:
  br label %for.cond.cleanup

for.cond.cleanup:
  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add16, %for.cond.cleanup.loopexit ]
  ret i32 %sum.0.lcssa

for.body:
  %indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
  %sum.032 = phi i32 [ 0, %for.body.preheader ], [ %add16, %for.body ]
  %t4 = shl nsw i32 %indvars.iv, 1
  %t5 = add nsw i32 %t4, 0
  %arrayidx = getelementptr inbounds i32, i32* %g, i32 %t5
  %t6 = load i32, i32* %arrayidx, align 4
  %add1 = add nsw i32 %t6, %sum.032
  %t7 = add nsw i32 %t4, 1
  %arrayidx5 = getelementptr inbounds i32, i32* %g, i32 %t7
  %t8 = load i32, i32* %arrayidx5, align 4
  %add6 = add nsw i32 %add1, %t8
  %t9 = add nsw i32 %t4, %y
  %arrayidx10 = getelementptr inbounds i32, i32* %g, i32 %t9
  %t10 = load i32, i32* %arrayidx10, align 4
  %add11 = add nsw i32 %add6, %t10
  %t11 = add nsw i32 %t4, %z
  %arrayidx15 = getelementptr inbounds i32, i32* %g, i32 %t11
  %t12 = load i32, i32* %arrayidx15, align 4
  %add16 = add nsw i32 %add11, %t12
  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next, %n
  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
}
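Editorial note: getelementptr_2x32 above shows the heuristic the file's comment describes. Its indices 2*i+0 and 2*i+1 differ by a constant, so the second address is trivially derivable from the first and the pair is dropped from the seed list, leaving only a 2-wide vectorization. A small C sketch of that equivalence (illustration only):

/* Illustration: the two accesses need only one computed address,
 * which is what makes their GEPs poor vectorization seeds. */
int two_adjacent_loads(const int *g, int i) {
  const int *p = g + 2 * i; /* one address computation */
  return p[0] + p[1];       /* second access at constant offset +1 */
}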
|
@ -1,307 +0,0 @@
|
||||
; RUN: opt -slp-vectorizer -slp-threshold=-6 -S -pass-remarks-output=%t < %s | FileCheck %s
|
||||
; RUN: cat %t | FileCheck -check-prefix=YAML %s
|
||||
; RUN: opt -passes=slp-vectorizer -slp-threshold=-6 -S -pass-remarks-output=%t < %s | FileCheck %s
|
||||
; RUN: cat %t | FileCheck -check-prefix=YAML %s
|
||||
|
||||
|
||||
; FIXME: The threshold is changed to keep this test case a bit smaller.
|
||||
; The AArch64 cost model should not give such high costs to select statements.
|
||||
|
||||
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
|
||||
target triple = "aarch64--linux"
|
||||
|
||||
; CHECK-LABEL: test_select
|
||||
; CHECK: load <4 x i32>
|
||||
; CHECK: load <4 x i32>
|
||||
; CHECK: select <4 x i1>
|
||||
|
||||
; YAML: --- !Passed
|
||||
; YAML-NEXT: Pass: slp-vectorizer
|
||||
; YAML-NEXT: Name: VectorizedHorizontalReduction
|
||||
; YAML-NEXT: Function: test_select
|
||||
; YAML-NEXT: Args:
|
||||
; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost '
|
||||
; YAML-NEXT: - Cost: '4'
|
||||
; YAML-NEXT: - String: ' and with tree size '
|
||||
; YAML-NEXT: - TreeSize: '8'
|
||||
|
||||
define i32 @test_select(i32* noalias nocapture readonly %blk1, i32* noalias nocapture readonly %blk2, i32 %lx, i32 %h) {
|
||||
entry:
|
||||
%cmp.22 = icmp sgt i32 %h, 0
|
||||
br i1 %cmp.22, label %for.body.lr.ph, label %for.end
|
||||
|
||||
for.body.lr.ph: ; preds = %entry
|
||||
%idx.ext = sext i32 %lx to i64
|
||||
br label %for.body
|
||||
|
||||
for.body: ; preds = %for.body, %for.body.lr.ph
|
||||
%s.026 = phi i32 [ 0, %for.body.lr.ph ], [ %add27, %for.body ]
|
||||
%j.025 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
|
||||
%p2.024 = phi i32* [ %blk2, %for.body.lr.ph ], [ %add.ptr29, %for.body ]
|
||||
%p1.023 = phi i32* [ %blk1, %for.body.lr.ph ], [ %add.ptr, %for.body ]
|
||||
%0 = load i32, i32* %p1.023, align 4
|
||||
%1 = load i32, i32* %p2.024, align 4
|
||||
%sub = sub nsw i32 %0, %1
|
||||
%cmp2 = icmp slt i32 %sub, 0
|
||||
%sub3 = sub nsw i32 0, %sub
|
||||
%sub3.sub = select i1 %cmp2, i32 %sub3, i32 %sub
|
||||
%add = add nsw i32 %sub3.sub, %s.026
|
||||
%arrayidx4 = getelementptr inbounds i32, i32* %p1.023, i64 1
|
||||
%2 = load i32, i32* %arrayidx4, align 4
|
||||
%arrayidx5 = getelementptr inbounds i32, i32* %p2.024, i64 1
|
||||
%3 = load i32, i32* %arrayidx5, align 4
|
||||
%sub6 = sub nsw i32 %2, %3
|
||||
%cmp7 = icmp slt i32 %sub6, 0
|
||||
%sub9 = sub nsw i32 0, %sub6
|
||||
%v.1 = select i1 %cmp7, i32 %sub9, i32 %sub6
|
||||
%add11 = add nsw i32 %add, %v.1
|
||||
%arrayidx12 = getelementptr inbounds i32, i32* %p1.023, i64 2
|
||||
%4 = load i32, i32* %arrayidx12, align 4
|
||||
%arrayidx13 = getelementptr inbounds i32, i32* %p2.024, i64 2
|
||||
%5 = load i32, i32* %arrayidx13, align 4
|
||||
%sub14 = sub nsw i32 %4, %5
|
||||
%cmp15 = icmp slt i32 %sub14, 0
|
||||
%sub17 = sub nsw i32 0, %sub14
|
||||
%sub17.sub14 = select i1 %cmp15, i32 %sub17, i32 %sub14
|
||||
%add19 = add nsw i32 %add11, %sub17.sub14
|
||||
%arrayidx20 = getelementptr inbounds i32, i32* %p1.023, i64 3
|
||||
%6 = load i32, i32* %arrayidx20, align 4
|
||||
%arrayidx21 = getelementptr inbounds i32, i32* %p2.024, i64 3
|
||||
%7 = load i32, i32* %arrayidx21, align 4
|
||||
%sub22 = sub nsw i32 %6, %7
|
||||
%cmp23 = icmp slt i32 %sub22, 0
|
||||
%sub25 = sub nsw i32 0, %sub22
|
||||
%v.3 = select i1 %cmp23, i32 %sub25, i32 %sub22
|
||||
%add27 = add nsw i32 %add19, %v.3
|
||||
%add.ptr = getelementptr inbounds i32, i32* %p1.023, i64 %idx.ext
|
||||
%add.ptr29 = getelementptr inbounds i32, i32* %p2.024, i64 %idx.ext
|
||||
%inc = add nuw nsw i32 %j.025, 1
|
||||
%exitcond = icmp eq i32 %inc, %h
|
||||
br i1 %exitcond, label %for.end.loopexit, label %for.body
|
||||
|
||||
for.end.loopexit: ; preds = %for.body
|
||||
br label %for.end
|
||||
|
||||
for.end: ; preds = %for.end.loopexit, %entry
|
||||
%s.0.lcssa = phi i32 [ 0, %entry ], [ %add27, %for.end.loopexit ]
|
||||
ret i32 %s.0.lcssa
|
||||
}
|
||||
|
||||
;; Check whether SLP can find a reduction phi whose incoming blocks are not
|
||||
;; the same as the block containing the phi.
|
||||
;;
|
||||
;; Came from code like,
|
||||
;;
|
||||
;; int s = 0;
|
||||
;; for (int j = 0; j < h; j++) {
|
||||
;; s += p1[0] * p2[0]
|
||||
;; s += p1[1] * p2[1];
|
||||
;; s += p1[2] * p2[2];
|
||||
;; s += p1[3] * p2[3];
|
||||
;; if (s >= lim)
|
||||
;; break;
|
||||
;; p1 += lx;
|
||||
;; p2 += lx;
|
||||
;; }
|
||||
define i32 @reduction_with_br(i32* noalias nocapture readonly %blk1, i32* noalias nocapture readonly %blk2, i32 %lx, i32 %h, i32 %lim) {
|
||||
; CHECK-LABEL: reduction_with_br
|
||||
; CHECK: load <4 x i32>
|
||||
; CHECK: load <4 x i32>
|
||||
; CHECK: mul nsw <4 x i32>
|
||||
|
||||
; YAML: --- !Passed
|
||||
; YAML-NEXT: Pass: slp-vectorizer
|
||||
; YAML-NEXT: Name: VectorizedHorizontalReduction
|
||||
; YAML-NEXT: Function: reduction_with_br
|
||||
; YAML-NEXT: Args:
|
||||
; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost '
|
||||
; YAML-NEXT: - Cost: '1'
|
||||
; YAML-NEXT: - String: ' and with tree size '
|
||||
; YAML-NEXT: - TreeSize: '3'
|
||||
|
||||
entry:
|
||||
%cmp.16 = icmp sgt i32 %h, 0
|
||||
br i1 %cmp.16, label %for.body.lr.ph, label %for.end
|
||||
|
||||
for.body.lr.ph: ; preds = %entry
|
||||
%idx.ext = sext i32 %lx to i64
|
||||
br label %for.body
|
||||
|
||||
for.body: ; preds = %for.body.lr.ph, %if.end
|
||||
%s.020 = phi i32 [ 0, %for.body.lr.ph ], [ %add13, %if.end ]
|
||||
%j.019 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %if.end ]
|
||||
%p2.018 = phi i32* [ %blk2, %for.body.lr.ph ], [ %add.ptr16, %if.end ]
|
||||
%p1.017 = phi i32* [ %blk1, %for.body.lr.ph ], [ %add.ptr, %if.end ]
|
||||
%0 = load i32, i32* %p1.017, align 4
|
||||
%1 = load i32, i32* %p2.018, align 4
|
||||
%mul = mul nsw i32 %1, %0
|
||||
%add = add nsw i32 %mul, %s.020
|
||||
%arrayidx2 = getelementptr inbounds i32, i32* %p1.017, i64 1
|
||||
%2 = load i32, i32* %arrayidx2, align 4
|
||||
%arrayidx3 = getelementptr inbounds i32, i32* %p2.018, i64 1
|
||||
%3 = load i32, i32* %arrayidx3, align 4
|
||||
%mul4 = mul nsw i32 %3, %2
|
||||
%add5 = add nsw i32 %add, %mul4
|
||||
%arrayidx6 = getelementptr inbounds i32, i32* %p1.017, i64 2
|
||||
%4 = load i32, i32* %arrayidx6, align 4
|
||||
%arrayidx7 = getelementptr inbounds i32, i32* %p2.018, i64 2
|
||||
%5 = load i32, i32* %arrayidx7, align 4
|
||||
%mul8 = mul nsw i32 %5, %4
|
||||
%add9 = add nsw i32 %add5, %mul8
|
||||
%arrayidx10 = getelementptr inbounds i32, i32* %p1.017, i64 3
|
||||
%6 = load i32, i32* %arrayidx10, align 4
|
||||
%arrayidx11 = getelementptr inbounds i32, i32* %p2.018, i64 3
|
||||
%7 = load i32, i32* %arrayidx11, align 4
|
||||
%mul12 = mul nsw i32 %7, %6
|
||||
%add13 = add nsw i32 %add9, %mul12
|
||||
%cmp14 = icmp slt i32 %add13, %lim
|
||||
br i1 %cmp14, label %if.end, label %for.end.loopexit
|
||||
|
||||
if.end: ; preds = %for.body
|
||||
%add.ptr = getelementptr inbounds i32, i32* %p1.017, i64 %idx.ext
|
||||
%add.ptr16 = getelementptr inbounds i32, i32* %p2.018, i64 %idx.ext
|
||||
%inc = add nuw nsw i32 %j.019, 1
|
||||
%cmp = icmp slt i32 %inc, %h
|
||||
br i1 %cmp, label %for.body, label %for.end.loopexit
|
||||
|
||||
for.end.loopexit: ; preds = %for.body, %if.end
|
||||
br label %for.end
|
||||
|
||||
for.end: ; preds = %for.end.loopexit, %entry
|
||||
%s.1 = phi i32 [ 0, %entry ], [ %add13, %for.end.loopexit ]
|
||||
ret i32 %s.1
|
||||
}
|
||||
|
||||
; CHECK: test_unrolled_select
|
||||
; CHECK: load <8 x i8>
|
||||
; CHECK: load <8 x i8>
|
||||
; CHECK: select <8 x i1>
|
||||
|
||||
; YAML: --- !Passed
|
||||
; YAML-NEXT: Pass: slp-vectorizer
|
||||
; YAML-NEXT: Name: VectorizedHorizontalReduction
|
||||
; YAML-NEXT: Function: test_unrolled_select
|
||||
; YAML-NEXT: Args:
|
||||
; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost '
|
||||
; YAML-NEXT: - Cost: '-33'
|
||||
; YAML-NEXT: - String: ' and with tree size '
|
||||
; YAML-NEXT: - TreeSize: '10'
|
||||
|
||||
define i32 @test_unrolled_select(i8* noalias nocapture readonly %blk1, i8* noalias nocapture readonly %blk2, i32 %lx, i32 %h, i32 %lim) #0 {
entry:
%cmp.43 = icmp sgt i32 %h, 0
br i1 %cmp.43, label %for.body.lr.ph, label %for.end

for.body.lr.ph:                                   ; preds = %entry
%idx.ext = sext i32 %lx to i64
br label %for.body

for.body:                                         ; preds = %for.body.lr.ph, %if.end.86
%s.047 = phi i32 [ 0, %for.body.lr.ph ], [ %add82, %if.end.86 ]
%j.046 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %if.end.86 ]
%p2.045 = phi i8* [ %blk2, %for.body.lr.ph ], [ %add.ptr88, %if.end.86 ]
%p1.044 = phi i8* [ %blk1, %for.body.lr.ph ], [ %add.ptr, %if.end.86 ]
%0 = load i8, i8* %p1.044, align 1
%conv = zext i8 %0 to i32
%1 = load i8, i8* %p2.045, align 1
%conv2 = zext i8 %1 to i32
%sub = sub nsw i32 %conv, %conv2
%cmp3 = icmp slt i32 %sub, 0
%sub5 = sub nsw i32 0, %sub
%sub5.sub = select i1 %cmp3, i32 %sub5, i32 %sub
%add = add nsw i32 %sub5.sub, %s.047
%arrayidx6 = getelementptr inbounds i8, i8* %p1.044, i64 1
%2 = load i8, i8* %arrayidx6, align 1
%conv7 = zext i8 %2 to i32
%arrayidx8 = getelementptr inbounds i8, i8* %p2.045, i64 1
%3 = load i8, i8* %arrayidx8, align 1
%conv9 = zext i8 %3 to i32
%sub10 = sub nsw i32 %conv7, %conv9
%cmp11 = icmp slt i32 %sub10, 0
%sub14 = sub nsw i32 0, %sub10
%v.1 = select i1 %cmp11, i32 %sub14, i32 %sub10
%add16 = add nsw i32 %add, %v.1
%arrayidx17 = getelementptr inbounds i8, i8* %p1.044, i64 2
%4 = load i8, i8* %arrayidx17, align 1
%conv18 = zext i8 %4 to i32
%arrayidx19 = getelementptr inbounds i8, i8* %p2.045, i64 2
%5 = load i8, i8* %arrayidx19, align 1
%conv20 = zext i8 %5 to i32
%sub21 = sub nsw i32 %conv18, %conv20
%cmp22 = icmp slt i32 %sub21, 0
%sub25 = sub nsw i32 0, %sub21
%sub25.sub21 = select i1 %cmp22, i32 %sub25, i32 %sub21
%add27 = add nsw i32 %add16, %sub25.sub21
%arrayidx28 = getelementptr inbounds i8, i8* %p1.044, i64 3
%6 = load i8, i8* %arrayidx28, align 1
%conv29 = zext i8 %6 to i32
%arrayidx30 = getelementptr inbounds i8, i8* %p2.045, i64 3
%7 = load i8, i8* %arrayidx30, align 1
%conv31 = zext i8 %7 to i32
%sub32 = sub nsw i32 %conv29, %conv31
%cmp33 = icmp slt i32 %sub32, 0
%sub36 = sub nsw i32 0, %sub32
%v.3 = select i1 %cmp33, i32 %sub36, i32 %sub32
%add38 = add nsw i32 %add27, %v.3
%arrayidx39 = getelementptr inbounds i8, i8* %p1.044, i64 4
%8 = load i8, i8* %arrayidx39, align 1
%conv40 = zext i8 %8 to i32
%arrayidx41 = getelementptr inbounds i8, i8* %p2.045, i64 4
%9 = load i8, i8* %arrayidx41, align 1
%conv42 = zext i8 %9 to i32
%sub43 = sub nsw i32 %conv40, %conv42
%cmp44 = icmp slt i32 %sub43, 0
%sub47 = sub nsw i32 0, %sub43
%sub47.sub43 = select i1 %cmp44, i32 %sub47, i32 %sub43
%add49 = add nsw i32 %add38, %sub47.sub43
%arrayidx50 = getelementptr inbounds i8, i8* %p1.044, i64 5
%10 = load i8, i8* %arrayidx50, align 1
%conv51 = zext i8 %10 to i32
%arrayidx52 = getelementptr inbounds i8, i8* %p2.045, i64 5
%11 = load i8, i8* %arrayidx52, align 1
%conv53 = zext i8 %11 to i32
%sub54 = sub nsw i32 %conv51, %conv53
%cmp55 = icmp slt i32 %sub54, 0
%sub58 = sub nsw i32 0, %sub54
%v.5 = select i1 %cmp55, i32 %sub58, i32 %sub54
%add60 = add nsw i32 %add49, %v.5
%arrayidx61 = getelementptr inbounds i8, i8* %p1.044, i64 6
%12 = load i8, i8* %arrayidx61, align 1
%conv62 = zext i8 %12 to i32
%arrayidx63 = getelementptr inbounds i8, i8* %p2.045, i64 6
%13 = load i8, i8* %arrayidx63, align 1
%conv64 = zext i8 %13 to i32
%sub65 = sub nsw i32 %conv62, %conv64
%cmp66 = icmp slt i32 %sub65, 0
%sub69 = sub nsw i32 0, %sub65
%sub69.sub65 = select i1 %cmp66, i32 %sub69, i32 %sub65
%add71 = add nsw i32 %add60, %sub69.sub65
%arrayidx72 = getelementptr inbounds i8, i8* %p1.044, i64 7
%14 = load i8, i8* %arrayidx72, align 1
%conv73 = zext i8 %14 to i32
%arrayidx74 = getelementptr inbounds i8, i8* %p2.045, i64 7
%15 = load i8, i8* %arrayidx74, align 1
%conv75 = zext i8 %15 to i32
%sub76 = sub nsw i32 %conv73, %conv75
%cmp77 = icmp slt i32 %sub76, 0
%sub80 = sub nsw i32 0, %sub76
%v.7 = select i1 %cmp77, i32 %sub80, i32 %sub76
%add82 = add nsw i32 %add71, %v.7
%cmp83 = icmp slt i32 %add82, %lim
br i1 %cmp83, label %if.end.86, label %for.end.loopexit

if.end.86:                                        ; preds = %for.body
%add.ptr = getelementptr inbounds i8, i8* %p1.044, i64 %idx.ext
%add.ptr88 = getelementptr inbounds i8, i8* %p2.045, i64 %idx.ext
%inc = add nuw nsw i32 %j.046, 1
%cmp = icmp slt i32 %inc, %h
br i1 %cmp, label %for.body, label %for.end.loopexit

for.end.loopexit:                                 ; preds = %for.body, %if.end.86
br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
%s.1 = phi i32 [ 0, %entry ], [ %add82, %for.end.loopexit ]
ret i32 %s.1
}
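
; For orientation: the loop body above is a hand-unrolled sum of absolute
; differences over eight byte pairs. A hedged sketch of the vector shape SLP
; aims for here (value names invented for illustration, not checked by this
; test):
;
;   %va  = load <8 x i8>, <8 x i8>* %pa, align 1    ; %pa = bitcast of %p1.044
;   %vb  = load <8 x i8>, <8 x i8>* %pb, align 1    ; %pb = bitcast of %p2.045
;   %za  = zext <8 x i8> %va to <8 x i32>
;   %zb  = zext <8 x i8> %vb to <8 x i32>
;   %d   = sub nsw <8 x i32> %za, %zb
;   %neg = sub nsw <8 x i32> zeroinitializer, %d
;   %lt  = icmp slt <8 x i32> %d, zeroinitializer
;   %abs = select <8 x i1> %lt, <8 x i32> %neg, <8 x i32> %d
;   ... followed by a horizontal add of %abs accumulated into %add82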
@ -1,2 +0,0 @@
if not 'AArch64' in config.root.targets:
    config.unsupported = True
@ -1,46 +0,0 @@
; RUN: opt -S -basicaa -slp-vectorizer < %s | FileCheck %s
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
target triple = "arm64-apple-ios5.0.0"

; Holding a value live over a call boundary may require
; spills and fills. This is the case for <2 x double>,
; as it occupies a Q register, of which there are no
; callee-saves.

; CHECK: load double
; CHECK: load double
; CHECK: call void @g
; CHECK: store double
; CHECK: store double
define void @f(double* %p, double* %q) {
%addr2 = getelementptr double, double* %q, i32 1
%addr = getelementptr double, double* %p, i32 1
%x = load double, double* %p
%y = load double, double* %addr
call void @g()
store double %x, double* %q
store double %y, double* %addr2
ret void
}
declare void @g()
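
; If SLP did vectorize across the call (a hypothetical sketch with invented
; value names), the <2 x double> would be live over @g and need a spill/fill,
; since no Q register is callee-saved:
;
;   %pv = bitcast double* %p to <2 x double>*
;   %v = load <2 x double>, <2 x double>* %pv
;   call void @g()
;   %qv = bitcast double* %q to <2 x double>*
;   store <2 x double> %v, <2 x double>* %qv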

; Check that we deal with loops correctly.
;
; CHECK: store <2 x double>
; CHECK: load <2 x double>
define void @f2(double* %p, double* %q) {
entry:
br label %loop

loop:
%p1 = phi double [0.0, %entry], [%x, %loop]
%p2 = phi double [0.0, %entry], [%y, %loop]
%addr2 = getelementptr double, double* %q, i32 1
%addr = getelementptr double, double* %p, i32 1
store double %p1, double* %q
store double %p2, double* %addr2

%x = load double, double* %p
%y = load double, double* %addr
br label %loop
}
@ -1,55 +0,0 @@
; RUN: opt -S -slp-vectorizer < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnu"

; This test ensures that we do not regress due to PR26364. The vectorizer
; should not compute a smaller size for %k.13 since it is in a use-def cycle
; and cannot be demoted.
;
; CHECK-LABEL: @PR26364
; CHECK: %k.13 = phi i32
;
define fastcc void @PR26364() {
entry:
br i1 undef, label %for.end11, label %for.cond4

for.cond4:
%k.13 = phi i32 [ undef, %entry ], [ %k.3, %for.cond4 ]
%e.02 = phi i32 [ 1, %entry ], [ 0, %for.cond4 ]
%e.1 = select i1 undef, i32 %e.02, i32 0
%k.3 = select i1 undef, i32 %k.13, i32 undef
br label %for.cond4

for.end11:
ret void
}
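
; For contrast, a minimal sketch (invented here, not part of the original
; test) of a value that can be demoted: every use of %x below needs only the
; low 8 bits, so the vectorizer may compute it in i8. %k.13 above has no such
; truncating use; it feeds itself through %k.3, so its full i32 width must be
; kept.
;
;   %x = add i32 %a, 1
;   %t = trunc i32 %x to i8
;   store i8 %t, i8* %p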

; This test ensures that we do not regress due to PR26629. We must look at
; every root in the vectorizable tree when computing minimum sizes since one
; root may require fewer bits than another.
;
; CHECK-LABEL: @PR26629
; CHECK-NOT: {{.*}} and <2 x i72>
;
define void @PR26629(i32* %c) {
entry:
br i1 undef, label %for.ph, label %for.end

for.ph:
%0 = load i32, i32* %c, align 4
br label %for.body

for.body:
%d = phi i72 [ 576507472957710340, %for.ph ], [ %bf.set17, %for.body ]
%sub = sub i32 %0, undef
%bf.clear13 = and i72 %d, -576460748008464384
%1 = zext i32 %sub to i72
%bf.value15 = and i72 %1, 8191
%bf.clear16 = or i72 %bf.value15, %bf.clear13
%bf.set17 = or i72 %bf.clear16, undef
br label %for.body

for.end:
ret void
}
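
; Worked detail (an inference from the masks above, not stated in the test):
; one root, 'and %1, 8191', needs only 13 significant bits (8191 = 2^13 - 1),
; while the other, 'and %d, -576460748008464384', keeps the high bits of the
; i72. Computing the minimum size from the first root alone would mis-type
; the whole tree, which is the PR26629 failure mode this test pins down.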
@ -1,18 +0,0 @@
; RUN: opt -S -slp-vectorizer %s | FileCheck %s
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
target triple = "arm64-apple-ios5.0.0"

define i64 @mismatched_intrinsics(<4 x i32> %in1, <2 x i32> %in2) nounwind {
; CHECK-LABEL: @mismatched_intrinsics
; CHECK: call i64 @llvm.arm64.neon.saddlv.i64.v4i32
; CHECK: call i64 @llvm.arm64.neon.saddlv.i64.v2i32

%vaddlvq_s32.i = tail call i64 @llvm.arm64.neon.saddlv.i64.v4i32(<4 x i32> %in1) #2
%vaddlv_s32.i = tail call i64 @llvm.arm64.neon.saddlv.i64.v2i32(<2 x i32> %in2) #2
%tst = icmp sgt i64 %vaddlvq_s32.i, %vaddlv_s32.i
%equal = sext i1 %tst to i64
ret i64 %equal
}

declare i64 @llvm.arm64.neon.saddlv.i64.v4i32(<4 x i32> %in1)
declare i64 @llvm.arm64.neon.saddlv.i64.v2i32(<2 x i32> %in1)
@ -1,76 +0,0 @@
; RUN: opt -S -basicaa -slp-vectorizer -dce < %s | FileCheck %s
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
target triple = "arm64-apple-ios5.0.0"

; CHECK-LABEL: @foo
define void @foo(float* noalias %a, float* noalias %b, float* noalias %c) {
entry:
; Check that we don't lose the !nontemporal hint when vectorizing loads.
; CHECK: %{{[0-9]*}} = load <4 x float>, <4 x float>* %{{[0-9]+}}, align 4, !nontemporal !0
%b1 = load float, float* %b, align 4, !nontemporal !0
%arrayidx.1 = getelementptr inbounds float, float* %b, i64 1
%b2 = load float, float* %arrayidx.1, align 4, !nontemporal !0
%arrayidx.2 = getelementptr inbounds float, float* %b, i64 2
%b3 = load float, float* %arrayidx.2, align 4, !nontemporal !0
%arrayidx.3 = getelementptr inbounds float, float* %b, i64 3
%b4 = load float, float* %arrayidx.3, align 4, !nontemporal !0

; Check that we don't introduce the !nontemporal hint when the original scalar loads didn't have it.
; CHECK: %{{[0-9]*}} = load <4 x float>, <4 x float>* %{{[0-9]+}}, align 4{{$}}
%c1 = load float, float* %c, align 4
%arrayidx2.1 = getelementptr inbounds float, float* %c, i64 1
%c2 = load float, float* %arrayidx2.1, align 4
%arrayidx2.2 = getelementptr inbounds float, float* %c, i64 2
%c3 = load float, float* %arrayidx2.2, align 4
%arrayidx2.3 = getelementptr inbounds float, float* %c, i64 3
%c4 = load float, float* %arrayidx2.3, align 4

%a1 = fadd float %b1, %c1
%a2 = fadd float %b2, %c2
%a3 = fadd float %b3, %c3
%a4 = fadd float %b4, %c4

; Check that we don't lose the !nontemporal hint when vectorizing stores.
; CHECK: store <4 x float> %{{[0-9]+}}, <4 x float>* %{{[0-9]+}}, align 4, !nontemporal !0
store float %a1, float* %a, align 4, !nontemporal !0
%arrayidx3.1 = getelementptr inbounds float, float* %a, i64 1
store float %a2, float* %arrayidx3.1, align 4, !nontemporal !0
%arrayidx3.2 = getelementptr inbounds float, float* %a, i64 2
store float %a3, float* %arrayidx3.2, align 4, !nontemporal !0
%arrayidx3.3 = getelementptr inbounds float, float* %a, i64 3
store float %a4, float* %arrayidx3.3, align 4, !nontemporal !0

; CHECK: ret void
ret void
}

; CHECK-LABEL: @foo2
define void @foo2(float* noalias %a, float* noalias %b) {
entry:
; Check that we don't mark the vector load with the !nontemporal attribute if
; some of the original scalar loads don't have it.
; CHECK: %{{[0-9]*}} = load <4 x float>, <4 x float>* %{{[0-9]+}}, align 4{{$}}
%b1 = load float, float* %b, align 4, !nontemporal !0
%arrayidx.1 = getelementptr inbounds float, float* %b, i64 1
%b2 = load float, float* %arrayidx.1, align 4
%arrayidx.2 = getelementptr inbounds float, float* %b, i64 2
%b3 = load float, float* %arrayidx.2, align 4
%arrayidx.3 = getelementptr inbounds float, float* %b, i64 3
%b4 = load float, float* %arrayidx.3, align 4, !nontemporal !0

; Check that we don't mark the vector store with the !nontemporal attribute if
; some of the original scalar stores don't have it.
; CHECK: store <4 x float> %{{[0-9]+}}, <4 x float>* %{{[0-9]+}}, align 4{{$}}
store float %b1, float* %a, align 4, !nontemporal !0
%arrayidx3.1 = getelementptr inbounds float, float* %a, i64 1
store float %b2, float* %arrayidx3.1, align 4
%arrayidx3.2 = getelementptr inbounds float, float* %a, i64 2
store float %b3, float* %arrayidx3.2, align 4
%arrayidx3.3 = getelementptr inbounds float, float* %a, i64 3
store float %b4, float* %arrayidx3.3, align 4, !nontemporal !0

; CHECK: ret void
ret void
}

!0 = !{i32 1}
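
; For reference (general LangRef behavior, not specific to this test): the
; !nontemporal metadata must reference a node with a single i32 value of 1,
; as !0 above declares, and it is attached to an individual memory operation
; like so:
;
;   %v = load float, float* %p, align 4, !nontemporal !0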
@ -1,32 +0,0 @@
; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic -pass-remarks=slp-vectorizer -o /dev/null < %s 2>&1 | FileCheck %s

define void @f(double* %r, double* %w) {
%r0 = getelementptr inbounds double, double* %r, i64 0
%r1 = getelementptr inbounds double, double* %r, i64 1
%f0 = load double, double* %r0
%f1 = load double, double* %r1
%add0 = fadd double %f0, %f0
%add1 = fadd double %f1, %f1
%w0 = getelementptr inbounds double, double* %w, i64 0
%w1 = getelementptr inbounds double, double* %w, i64 1
; CHECK: remark: /tmp/s.c:5:10: Stores SLP vectorized with cost -4 and with tree size 3
store double %add0, double* %w0, !dbg !9
store double %add1, double* %w1
ret void
}


!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}

!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 4.0.0 (trunk 281293) (llvm/trunk 281290)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2)
!1 = !DIFile(filename: "/tmp/s.c", directory: "/tmp")
!2 = !{}
!3 = !{i32 2, !"Dwarf Version", i32 4}
!4 = !{i32 2, !"Debug Info Version", i32 3}
!5 = !{i32 1, !"PIC Level", i32 2}
!6 = !{!"clang version 4.0.0 (trunk 281293) (llvm/trunk 281290)"}
!7 = distinct !DISubprogram(name: "baz", scope: !1, file: !1, line: 4, type: !8, isLocal: false, isDefinition: true, scopeLine: 4, isOptimized: true, unit: !0, variables: !2)
!8 = !DISubroutineType(types: !2)
!9 = !DILocation(line: 5, column: 10, scope: !7)
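
; A note on reading the remark (my gloss on the SLP cost model, not test
; output beyond the CHECK line above): the reported cost is vector cost minus
; scalar cost, so "cost -4" means the vector form is estimated 4 units
; cheaper, and the tree size counts the vectorized groups in the bundle
; (here, presumably the loads, the fadds, and the stores).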
@ -1,42 +0,0 @@
; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu -mcpu=cortex-a57 | FileCheck %s
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnu"

; CHECK-LABEL: @test1
; CHECK: load <4 x i32>
; CHECK: add nsw <4 x i32>
; CHECK: sdiv <4 x i32>

define void @test1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c) {
entry:
%0 = load i32, i32* %b, align 4
%1 = load i32, i32* %c, align 4
%add = add nsw i32 %1, %0
%div = sdiv i32 %add, 2
store i32 %div, i32* %a, align 4
%arrayidx3 = getelementptr inbounds i32, i32* %b, i64 1
%2 = load i32, i32* %arrayidx3, align 4
%arrayidx4 = getelementptr inbounds i32, i32* %c, i64 1
%3 = load i32, i32* %arrayidx4, align 4
%add5 = add nsw i32 %3, %2
%div6 = sdiv i32 %add5, 2
%arrayidx7 = getelementptr inbounds i32, i32* %a, i64 1
store i32 %div6, i32* %arrayidx7, align 4
%arrayidx8 = getelementptr inbounds i32, i32* %b, i64 2
%4 = load i32, i32* %arrayidx8, align 4
%arrayidx9 = getelementptr inbounds i32, i32* %c, i64 2
%5 = load i32, i32* %arrayidx9, align 4
%add10 = add nsw i32 %5, %4
%div11 = sdiv i32 %add10, 2
%arrayidx12 = getelementptr inbounds i32, i32* %a, i64 2
store i32 %div11, i32* %arrayidx12, align 4
%arrayidx13 = getelementptr inbounds i32, i32* %b, i64 3
%6 = load i32, i32* %arrayidx13, align 4
%arrayidx14 = getelementptr inbounds i32, i32* %c, i64 3
%7 = load i32, i32* %arrayidx14, align 4
%add15 = add nsw i32 %7, %6
%div16 = sdiv i32 %add15, 2
%arrayidx17 = getelementptr inbounds i32, i32* %a, i64 3
store i32 %div16, i32* %arrayidx17, align 4
ret void
}
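
; Putting the CHECK lines together, the expected vector body is roughly (a
; sketch with invented value names):
;
;   %vb = load <4 x i32>, <4 x i32>* %pb, align 4
;   %vc = load <4 x i32>, <4 x i32>* %pc, align 4
;   %vadd = add nsw <4 x i32> %vc, %vb
;   %vdiv = sdiv <4 x i32> %vadd, <i32 2, i32 2, i32 2, i32 2>
;   store <4 x i32> %vdiv, <4 x i32>* %pa, align 4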
@ -1,3 +0,0 @@
if not 'AMDGPU' in config.root.targets:
    config.unsupported = True

@ -1,195 +0,0 @@
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,VI %s

; FIXME: We would still like to vectorize the memory operations for VI

; Simple 3-pair chain with loads and stores
; GCN-LABEL: @test1_as_3_3_3_v2f16(
; GFX9: load <2 x half>, <2 x half> addrspace(3)*
; GFX9: load <2 x half>, <2 x half> addrspace(3)*
; GFX9: fmul <2 x half>
; GFX9: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
; GFX9: ret

; VI: load half
; VI: load half
define amdgpu_kernel void @test1_as_3_3_3_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c) {
%i0 = load half, half addrspace(3)* %a, align 2
%i1 = load half, half addrspace(3)* %b, align 2
%mul = fmul half %i0, %i1
%arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
%i3 = load half, half addrspace(3)* %arrayidx3, align 2
%arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
%i4 = load half, half addrspace(3)* %arrayidx4, align 2
%mul5 = fmul half %i3, %i4
store half %mul, half addrspace(3)* %c, align 2
%arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
store half %mul5, half addrspace(3)* %arrayidx5, align 2
ret void
}

; GCN-LABEL: @test1_as_3_0_0(
; GFX9: load <2 x half>, <2 x half> addrspace(3)*
; GFX9: load <2 x half>, <2 x half>*
; GFX9: fmul <2 x half>
; GFX9: store <2 x half> %{{.*}}, <2 x half>* %
; GFX9: ret

; VI: load half
; VI: load half
define amdgpu_kernel void @test1_as_3_0_0(half addrspace(3)* %a, half* %b, half* %c) {
%i0 = load half, half addrspace(3)* %a, align 2
%i1 = load half, half* %b, align 2
%mul = fmul half %i0, %i1
%arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
%i3 = load half, half addrspace(3)* %arrayidx3, align 2
%arrayidx4 = getelementptr inbounds half, half* %b, i64 1
%i4 = load half, half* %arrayidx4, align 2
%mul5 = fmul half %i3, %i4
store half %mul, half* %c, align 2
%arrayidx5 = getelementptr inbounds half, half* %c, i64 1
store half %mul5, half* %arrayidx5, align 2
ret void
}

; GCN-LABEL: @test1_as_0_0_3_v2f16(
; GFX9: load <2 x half>, <2 x half>*
; GFX9: load <2 x half>, <2 x half>*
; GFX9: fmul <2 x half>
; GFX9: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
; GFX9: ret

; VI: load half
; VI: load half
define amdgpu_kernel void @test1_as_0_0_3_v2f16(half* %a, half* %b, half addrspace(3)* %c) {
%i0 = load half, half* %a, align 2
%i1 = load half, half* %b, align 2
%mul = fmul half %i0, %i1
%arrayidx3 = getelementptr inbounds half, half* %a, i64 1
%i3 = load half, half* %arrayidx3, align 2
%arrayidx4 = getelementptr inbounds half, half* %b, i64 1
%i4 = load half, half* %arrayidx4, align 2
%mul5 = fmul half %i3, %i4
store half %mul, half addrspace(3)* %c, align 2
%arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
store half %mul5, half addrspace(3)* %arrayidx5, align 2
ret void
}

; GCN-LABEL: @test1_fma_v2f16(
; GFX9: load <2 x half>
; GFX9: load <2 x half>
; GFX9: load <2 x half>
; GFX9: call <2 x half> @llvm.fma.v2f16(
; GFX9: store <2 x half>
define amdgpu_kernel void @test1_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
%i0 = load half, half addrspace(3)* %a, align 2
%i1 = load half, half addrspace(3)* %b, align 2
%i2 = load half, half addrspace(3)* %c, align 2
%fma0 = call half @llvm.fma.f16(half %i0, half %i1, half %i2)
%arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
%i3 = load half, half addrspace(3)* %arrayidx3, align 2
%arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
%i4 = load half, half addrspace(3)* %arrayidx4, align 2
%arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
%i5 = load half, half addrspace(3)* %arrayidx5, align 2
%fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)
store half %fma0, half addrspace(3)* %d, align 2
%arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
store half %fma1, half addrspace(3)* %arrayidx6, align 2
ret void
}

; GCN-LABEL: @mul_scalar_v2f16(
; GFX9: load <2 x half>
; GFX9: fmul <2 x half>
; GFX9: store <2 x half>
define amdgpu_kernel void @mul_scalar_v2f16(half addrspace(3)* %a, half %scalar, half addrspace(3)* %c) {
%i0 = load half, half addrspace(3)* %a, align 2
%mul = fmul half %i0, %scalar
%arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
%i3 = load half, half addrspace(3)* %arrayidx3, align 2
%mul5 = fmul half %i3, %scalar
store half %mul, half addrspace(3)* %c, align 2
%arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
store half %mul5, half addrspace(3)* %arrayidx5, align 2
ret void
}

; GCN-LABEL: @fabs_v2f16
; GFX9: load <2 x half>
; GFX9: call <2 x half> @llvm.fabs.v2f16(
; GFX9: store <2 x half>
define amdgpu_kernel void @fabs_v2f16(half addrspace(3)* %a, half addrspace(3)* %c) {
%i0 = load half, half addrspace(3)* %a, align 2
%fabs0 = call half @llvm.fabs.f16(half %i0)
%arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
%i3 = load half, half addrspace(3)* %arrayidx3, align 2
%fabs1 = call half @llvm.fabs.f16(half %i3)
store half %fabs0, half addrspace(3)* %c, align 2
%arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
store half %fabs1, half addrspace(3)* %arrayidx5, align 2
ret void
}

; GCN-LABEL: @test1_fabs_fma_v2f16(
; GFX9: load <2 x half>
; GFX9: call <2 x half> @llvm.fabs.v2f16(
; GFX9: call <2 x half> @llvm.fma.v2f16(
; GFX9: store <2 x half>
define amdgpu_kernel void @test1_fabs_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
%i0 = load half, half addrspace(3)* %a, align 2
%i1 = load half, half addrspace(3)* %b, align 2
%i2 = load half, half addrspace(3)* %c, align 2
%i0.fabs = call half @llvm.fabs.f16(half %i0)

%fma0 = call half @llvm.fma.f16(half %i0.fabs, half %i1, half %i2)
%arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
%i3 = load half, half addrspace(3)* %arrayidx3, align 2
%arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
%i4 = load half, half addrspace(3)* %arrayidx4, align 2
%arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
%i5 = load half, half addrspace(3)* %arrayidx5, align 2
%i3.fabs = call half @llvm.fabs.f16(half %i3)

%fma1 = call half @llvm.fma.f16(half %i3.fabs, half %i4, half %i5)
store half %fma0, half addrspace(3)* %d, align 2
%arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
store half %fma1, half addrspace(3)* %arrayidx6, align 2
ret void
}

; FIXME: Should do a vector load and extract the component for fabs
; GCN-LABEL: @test1_fabs_scalar_fma_v2f16(
; GFX9: load half
; GFX9: call half @llvm.fabs.f16(
; GFX9: load <2 x half>
; GFX9: load half
; GFX9: load <2 x half>
; GFX9: call <2 x half> @llvm.fma.v2f16(
; GFX9: store <2 x half>
define amdgpu_kernel void @test1_fabs_scalar_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
%i0 = load half, half addrspace(3)* %a, align 2
%i1 = load half, half addrspace(3)* %b, align 2
%i2 = load half, half addrspace(3)* %c, align 2
%i1.fabs = call half @llvm.fabs.f16(half %i1)

%fma0 = call half @llvm.fma.f16(half %i0, half %i1.fabs, half %i2)
%arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
%i3 = load half, half addrspace(3)* %arrayidx3, align 2
%arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
%i4 = load half, half addrspace(3)* %arrayidx4, align 2
%arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
%i5 = load half, half addrspace(3)* %arrayidx5, align 2
%fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)
store half %fma0, half addrspace(3)* %d, align 2
%arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
store half %fma1, half addrspace(3)* %arrayidx6, align 2
ret void
}

declare half @llvm.fabs.f16(half) #1
declare half @llvm.fma.f16(half, half, half) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
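
; A hypothetical shape for the scalar-fabs FIXME above (invented names, not
; checked by the test): reuse the vector load of %b and extract the lane
; instead of issuing an extra scalar load:
;
;   %bv = load <2 x half>, <2 x half> addrspace(3)* %b.v2, align 2
;   %b0 = extractelement <2 x half> %bv, i32 0
;   %b0.fabs = call half @llvm.fabs.f16(half %b0)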
@ -1,2 +0,0 @@
if not 'ARM' in config.root.targets:
    config.unsupported = True
@ -1,20 +0,0 @@
; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift | FileCheck %s

target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"

; On swift unaligned <2 x double> stores need 4 uops and it is therefore
; cheaper to do this scalar.

; CHECK-LABEL: expensive_double_store
; CHECK-NOT: load <2 x double>
; CHECK-NOT: store <2 x double>
define void @expensive_double_store(double* noalias %dst, double* noalias %src, i64 %count) {
entry:
%0 = load double, double* %src, align 8
store double %0, double* %dst, align 8
%arrayidx2 = getelementptr inbounds double, double* %src, i64 1
%1 = load double, double* %arrayidx2, align 8
%arrayidx3 = getelementptr inbounds double, double* %dst, i64 1
store double %1, double* %arrayidx3, align 8
ret void
}
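
; For contrast, the vector form this test rejects would be (illustrative
; only):
;
;   %ps = bitcast double* %src to <2 x double>*
;   %v = load <2 x double>, <2 x double>* %ps, align 8
;   %pd = bitcast double* %dst to <2 x double>*
;   store <2 x double> %v, <2 x double>* %pd, align 8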
@ -1,52 +0,0 @@
; RUN: opt -S -mcpu=swift -mtriple=thumbv7-apple-ios -basicaa -slp-vectorizer < %s | FileCheck %s

target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"

%class.Complex = type { double, double }

; Code like this is the result of SROA. Make sure we don't vectorize this
; because the scalar versions of the shl/or are handled by the
; backend and disappear, while the vectorized code stays.

; CHECK-LABEL: SROAed
; CHECK-NOT: shl nuw <2 x i64>
; CHECK-NOT: or <2 x i64>

define void @SROAed(%class.Complex* noalias nocapture sret %agg.result, [4 x i32] %a.coerce, [4 x i32] %b.coerce) {
entry:
%a.coerce.fca.0.extract = extractvalue [4 x i32] %a.coerce, 0
%a.sroa.0.0.insert.ext = zext i32 %a.coerce.fca.0.extract to i64
%a.coerce.fca.1.extract = extractvalue [4 x i32] %a.coerce, 1
%a.sroa.0.4.insert.ext = zext i32 %a.coerce.fca.1.extract to i64
%a.sroa.0.4.insert.shift = shl nuw i64 %a.sroa.0.4.insert.ext, 32
%a.sroa.0.4.insert.insert = or i64 %a.sroa.0.4.insert.shift, %a.sroa.0.0.insert.ext
%0 = bitcast i64 %a.sroa.0.4.insert.insert to double
%a.coerce.fca.2.extract = extractvalue [4 x i32] %a.coerce, 2
%a.sroa.3.8.insert.ext = zext i32 %a.coerce.fca.2.extract to i64
%a.coerce.fca.3.extract = extractvalue [4 x i32] %a.coerce, 3
%a.sroa.3.12.insert.ext = zext i32 %a.coerce.fca.3.extract to i64
%a.sroa.3.12.insert.shift = shl nuw i64 %a.sroa.3.12.insert.ext, 32
%a.sroa.3.12.insert.insert = or i64 %a.sroa.3.12.insert.shift, %a.sroa.3.8.insert.ext
%1 = bitcast i64 %a.sroa.3.12.insert.insert to double
%b.coerce.fca.0.extract = extractvalue [4 x i32] %b.coerce, 0
%b.sroa.0.0.insert.ext = zext i32 %b.coerce.fca.0.extract to i64
%b.coerce.fca.1.extract = extractvalue [4 x i32] %b.coerce, 1
%b.sroa.0.4.insert.ext = zext i32 %b.coerce.fca.1.extract to i64
%b.sroa.0.4.insert.shift = shl nuw i64 %b.sroa.0.4.insert.ext, 32
%b.sroa.0.4.insert.insert = or i64 %b.sroa.0.4.insert.shift, %b.sroa.0.0.insert.ext
%2 = bitcast i64 %b.sroa.0.4.insert.insert to double
%b.coerce.fca.2.extract = extractvalue [4 x i32] %b.coerce, 2
%b.sroa.3.8.insert.ext = zext i32 %b.coerce.fca.2.extract to i64
%b.coerce.fca.3.extract = extractvalue [4 x i32] %b.coerce, 3
%b.sroa.3.12.insert.ext = zext i32 %b.coerce.fca.3.extract to i64
%b.sroa.3.12.insert.shift = shl nuw i64 %b.sroa.3.12.insert.ext, 32
%b.sroa.3.12.insert.insert = or i64 %b.sroa.3.12.insert.shift, %b.sroa.3.8.insert.ext
%3 = bitcast i64 %b.sroa.3.12.insert.insert to double
%add = fadd double %0, %2
%add3 = fadd double %1, %3
%re.i.i = getelementptr inbounds %class.Complex, %class.Complex* %agg.result, i32 0, i32 0
store double %add, double* %re.i.i, align 4
%im.i.i = getelementptr inbounds %class.Complex, %class.Complex* %agg.result, i32 0, i32 1
store double %add3, double* %im.i.i, align 4
ret void
}
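
; Why the scalar form wins here (an inference, not stated in the test): each
; (zext, shl nuw 32, or) triple only reassembles two i32 halves into an i64,
; e.g.
;
;   %hi = shl nuw i64 %h, 32
;   %word = or i64 %hi, %lo
;
; and on a 32-bit target the backend lowers that to a plain register pair at
; no cost, whereas the <2 x i64> shl/or would survive as real vector
; instructions.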
@ -1,19 +0,0 @@
; RUN: opt -S -mtriple=powerpc64-linux-gnu -mcpu=pwr9 -mattr=+vsx -slp-vectorizer < %s | FileCheck %s

%struct.S = type { i8*, i8* }

@kS0 = common global %struct.S zeroinitializer, align 8

define { i64, i64 } @getS() {
entry:
%0 = load i64, i64* bitcast (%struct.S* @kS0 to i64*), align 8
%1 = load i64, i64* bitcast (i8** getelementptr inbounds (%struct.S, %struct.S* @kS0, i64 0, i32 1) to i64*), align 8
%2 = insertvalue { i64, i64 } undef, i64 %0, 0
%3 = insertvalue { i64, i64 } %2, i64 %1, 1
ret { i64, i64 } %3
}

; CHECK: load i64
; CHECK-NOT: load <2 x i64>
; CHECK-NOT: extractelement

@ -1,2 +0,0 @@
if not 'PowerPC' in config.root.targets:
    config.unsupported = True
Some files were not shown because too many files have changed in this diff