Imported Upstream version 5.18.0.167
Former-commit-id: 289509151e0fee68a1b591a20c9f109c3c789d3a
This commit is contained in:
parent e19d552987
commit b084638f15
@@ -1,46 +0,0 @@
; RUN: opt < %s -disable-loop-unrolling -debug-only=loop-vectorize -O3 -S 2>&1 | FileCheck %s
; REQUIRES: asserts
; We want to make sure that we don't even try to vectorize loops again.
; The vectorizer used to mark only the un-vectorized loop as already
; vectorized, and thus would try to vectorize the vectorized loop again.
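; For reference, a C sketch of the reduction loop under test (an
; illustrative assumption; the IR below is the authoritative input):
;
;   extern int a[255];
;   int vect(void) {
;     int red = 0;
;     for (long i = 0; i < 255; ++i)
;       red += a[i];
;     return red;
;   }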

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

@a = external global [255 x i32]

; Function Attrs: nounwind readonly uwtable
define i32 @vect() {
; CHECK: LV: Checking a loop in "vect"
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
; We need to make sure we did vectorize the loop
; CHECK: LV: Found a loop: for.body
; CHECK: LV: We can vectorize this loop!
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %red.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
  %arrayidx = getelementptr inbounds [255 x i32], [255 x i32]* @a, i64 0, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %add = add nsw i32 %0, %red.05
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 255
  br i1 %exitcond, label %for.end, label %for.body

; If it did, we have two loops:
; CHECK: vector.body:
; CHECK: br {{.*}} label %vector.body, !llvm.loop [[vect:![0-9]+]]
; CHECK: for.body:
; CHECK: br {{.*}} label %for.body, !llvm.loop [[scalar:![0-9]+]]

for.end:                                          ; preds = %for.body
  ret i32 %add
}

; Now, we check for the Hint metadata
; CHECK: [[vect]] = distinct !{[[vect]], [[width:![0-9]+]]}
; CHECK: [[width]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK: [[scalar]] = distinct !{[[scalar]], [[runtime_unroll:![0-9]+]], [[width]]}
; CHECK: [[runtime_unroll]] = !{!"llvm.loop.unroll.runtime.disable"}
@@ -1,100 +0,0 @@
; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -S | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
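
; A C sketch of test1 (an illustrative assumption): the loop asserts a
; value-range fact about every element it reads, and the assumes must not
; block vectorization:
;
;   void test1(float *restrict a, const float *restrict b) {
;     for (long i = 0; i < 1600; ++i) {
;       __builtin_assume(b[i] > 100.0f);
;       a[i] = b[i] + 1.0f;
;     }
;   }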

; Function Attrs: nounwind uwtable
define void @test1(float* noalias nocapture %a, float* noalias nocapture readonly %b) #0 {
entry:
  br label %for.body

; CHECK-LABEL: @test1
; CHECK: vector.body:
; CHECK: @llvm.assume
; CHECK: @llvm.assume
; CHECK: @llvm.assume
; CHECK: @llvm.assume
; CHECK: @llvm.assume
; CHECK: @llvm.assume
; CHECK: @llvm.assume
; CHECK: @llvm.assume
; CHECK: for.body:
; CHECK: ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
  %0 = load float, float* %arrayidx, align 4
  %cmp1 = fcmp ogt float %0, 1.000000e+02
  tail call void @llvm.assume(i1 %cmp1)
  %add = fadd float %0, 1.000000e+00
  %arrayidx5 = getelementptr inbounds float, float* %a, i64 %indvars.iv
  store float %add, float* %arrayidx5, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv, 1599
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}

; Function Attrs: nounwind
declare void @llvm.assume(i1) #1

attributes #0 = { nounwind uwtable }
attributes #1 = { nounwind }

%struct.data = type { float*, float* }
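
; A C sketch of test2 (an illustrative assumption): the entry block computes
; 32-byte alignment predicates for the two pointers once, and the loop
; re-asserts them via @llvm.assume on every iteration:
;
;   struct data { float *a, *b; };
;   void test2(const struct data *d) {
;     float *a = d->a, *b = d->b;
;     for (long i = 0; i < 1600; ++i) {
;       __builtin_assume(((uintptr_t)b & 31) == 0);
;       float t = b[i] + 1.0f;
;       __builtin_assume(((uintptr_t)a & 31) == 0);
;       a[i] = t;
;     }
;   }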

; Function Attrs: nounwind uwtable
define void @test2(%struct.data* nocapture readonly %d) #0 {
entry:
  %b = getelementptr inbounds %struct.data, %struct.data* %d, i64 0, i32 1
  %0 = load float*, float** %b, align 8
  %ptrint = ptrtoint float* %0 to i64
  %maskedptr = and i64 %ptrint, 31
  %maskcond = icmp eq i64 %maskedptr, 0
  %a = getelementptr inbounds %struct.data, %struct.data* %d, i64 0, i32 0
  %1 = load float*, float** %a, align 8
  %ptrint2 = ptrtoint float* %1 to i64
  %maskedptr3 = and i64 %ptrint2, 31
  %maskcond4 = icmp eq i64 %maskedptr3, 0
  br label %for.body

; CHECK-LABEL: @test2
; CHECK: vector.body:
; CHECK: @llvm.assume
; CHECK: @llvm.assume
; CHECK: @llvm.assume
; CHECK: @llvm.assume
; CHECK: @llvm.assume
; CHECK: @llvm.assume
; CHECK: @llvm.assume
; CHECK: @llvm.assume
; CHECK: @llvm.assume
; CHECK: @llvm.assume
; CHECK: @llvm.assume
; CHECK: @llvm.assume
; CHECK: @llvm.assume
; CHECK: @llvm.assume
; CHECK: @llvm.assume
; CHECK: @llvm.assume
; CHECK: for.body:
; CHECK: ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  tail call void @llvm.assume(i1 %maskcond)
  %arrayidx = getelementptr inbounds float, float* %0, i64 %indvars.iv
  %2 = load float, float* %arrayidx, align 4
  %add = fadd float %2, 1.000000e+00
  tail call void @llvm.assume(i1 %maskcond4)
  %arrayidx5 = getelementptr inbounds float, float* %1, i64 %indvars.iv
  store float %add, float* %arrayidx5, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv, 1599
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}
@@ -1,52 +0,0 @@
; RUN: opt < %s -loop-vectorize -mattr=avx,+slow-unaligned-mem-32 -S | FileCheck %s --check-prefix=SLOWMEM32 --check-prefix=CHECK
; RUN: opt < %s -loop-vectorize -mattr=avx,-slow-unaligned-mem-32 -S | FileCheck %s --check-prefix=FASTMEM32 --check-prefix=CHECK
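
; The two RUN lines differ only in whether 32-byte unaligned accesses are
; slow; per the CHECK lines below, that should change the chosen vector
; width, not vectorizability. C sketches of the loops (illustrative
; assumptions):
;
;   for (int i = 0; i < n; ++i) a[i] *= 3.0f;  // read_mod_write_single_ptr
;   for (int i = 0; i < n; ++i) a[i] += 3;     // read_mod_i64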

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"

; CHECK-LABEL: @read_mod_write_single_ptr(
; CHECK: load <8 x float>
; CHECK: ret i32
define i32 @read_mod_write_single_ptr(float* nocapture %a, i32 %n) nounwind uwtable ssp {
  %1 = icmp sgt i32 %n, 0
  br i1 %1, label %.lr.ph, label %._crit_edge

.lr.ph:                                           ; preds = %0, %.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
  %2 = getelementptr inbounds float, float* %a, i64 %indvars.iv
  %3 = load float, float* %2, align 4
  %4 = fmul float %3, 3.000000e+00
  store float %4, float* %2, align 4
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph, %0
  ret i32 undef
}

; CHECK-LABEL: @read_mod_i64(
; SLOWMEM32: load <2 x i64>
; FASTMEM32: load <4 x i64>
; CHECK: ret i32
define i32 @read_mod_i64(i64* nocapture %a, i32 %n) nounwind uwtable ssp {
  %1 = icmp sgt i32 %n, 0
  br i1 %1, label %.lr.ph, label %._crit_edge

.lr.ph:                                           ; preds = %0, %.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
  %2 = getelementptr inbounds i64, i64* %a, i64 %indvars.iv
  %3 = load i64, i64* %2, align 4
  %4 = add i64 %3, 3
  store i64 %4, i64* %2, align 4
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph, %0
  ret i32 undef
}
@@ -1,35 +0,0 @@
; RUN: opt -mattr=+avx512f --loop-vectorize -S < %s | llc -mattr=+avx512f | FileCheck %s

target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.9.0"

; Verify that we generate 512-bit wide vectors for a basic integer memset
; loop.
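; C sketch (an illustrative assumption):
;
;   void f(int *a, int n) {
;     for (int i = 0; i < n; ++i)
;       a[i] = n;
;   }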

; CHECK-LABEL: f:
; CHECK: vmovdqu32 %zmm{{.}},
; CHECK-NOT: %ymm

define void @f(i32* %a, i32 %n) {
entry:
  %cmp4 = icmp sgt i32 %n, 0
  br i1 %cmp4, label %for.body.preheader, label %for.end

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
  store i32 %n, i32* %arrayidx, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:                                 ; preds = %for.body
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  ret void
}
@@ -1,68 +0,0 @@
; RUN: opt -loop-vectorize -S < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
target triple = "x86_64-unknown-linux-gnu"

; PR34965/D39346

; LV retains the original scalar loop intact as remainder loop. However,
; after this transformation, analysis information concerning the remainder
; loop may differ from the original scalar loop. This test is an example of
; that behaviour, where values inside the remainder loop which SCEV could
; originally analyze now require flow-sensitive analysis currently not
; supported in SCEV. In particular, during LV code generation, after turning
; the original scalar loop into the remainder loop, LV expected
; Legal->isConsecutivePtr() to be consistent and return the same output as
; during legal/cost model phases (original scalar loop). Unfortunately, that
; condition was not satisfied because of the aforementioned SCEV limitation.
; After D39346, LV code generation doesn't rely on Legal->isConsecutivePtr(),
; i.e., SCEV. This test verifies that LV is able to handle the described cases.
;
; TODO: The SCEV limitation described before may affect plans to further
; optimize the remainder loop of this particular test case. One tentative
; solution is to detect the problematic IVs in LV (%7 and %8) and perform an
; in-place IV optimization by replacing:
;   %8 = phi i32 [ %.ph2, %.outer ], [ %7, %6 ]
; with
;   %8 = sub i32 %7, 1.

; Verify that store is vectorized as stride-1 memory access.

; CHECK: vector.body:
; CHECK: store <4 x i32>

; Function Attrs: uwtable
define void @test() {
  br label %.outer

; <label>:1:                                      ; preds = %2
  ret void

; <label>:2:                                      ; preds = %._crit_edge.loopexit
  %3 = add nsw i32 %.ph, -2
  br i1 undef, label %1, label %.outer

.outer:                                           ; preds = %2, %0
  %.ph = phi i32 [ %3, %2 ], [ 336, %0 ]
  %.ph2 = phi i32 [ 62, %2 ], [ 110, %0 ]
  %4 = and i32 %.ph, 30
  %5 = add i32 %.ph2, 1
  br label %6

; <label>:6:                                      ; preds = %6, %.outer
  %7 = phi i32 [ %5, %.outer ], [ %13, %6 ]
  %8 = phi i32 [ %.ph2, %.outer ], [ %7, %6 ]
  %9 = add i32 %8, 2
  %10 = zext i32 %9 to i64
  %11 = getelementptr inbounds i32, i32 addrspace(1)* undef, i64 %10
  %12 = ashr i32 undef, %4
  store i32 %12, i32 addrspace(1)* %11, align 4
  %13 = add i32 %7, 1
  %14 = icmp sgt i32 %13, 61
  br i1 %14, label %._crit_edge.loopexit, label %6

._crit_edge.loopexit:                             ; preds = %._crit_edge.loopexit, %6
  br i1 undef, label %2, label %._crit_edge.loopexit
}
@@ -1,67 +0,0 @@
; REQUIRES: asserts
; RUN: opt < %s -loop-vectorize -instcombine -S -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; CHECK-LABEL: PR31671
;
; Check a pointer in which one of its uses is consecutive-like and another of
; its uses is non-consecutive-like. In the test case below, %tmp3 is the
; pointer operand of an interleaved load, making it consecutive-like. However,
; it is also the pointer operand of a non-interleaved store that will become a
; scatter operation. %tmp3 (and the induction variable) should not be marked
; uniform-after-vectorization.
;
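; A C sketch of the strided loop (an illustrative assumption; fields 0 and 3
; of %data are the two float arrays):
;
;   for (long i = 0; i < 32000; i += 5)
;     d->f0[i] = d->f0[i] + x * d->f3[i];
;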
; CHECK: LV: Found uniform instruction: %tmp0 = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %i
; CHECK-NOT: LV: Found uniform instruction: %tmp3 = getelementptr inbounds %data, %data* %d, i64 0, i32 0, i64 %i
; CHECK-NOT: LV: Found uniform instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
; CHECK-NOT: LV: Found uniform instruction: %i.next = add nuw nsw i64 %i, 5
; CHECK: vector.ph:
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x float> undef, float %x, i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x float> [[BROADCAST_SPLATINSERT]], <16 x float> undef, <16 x i32> zeroinitializer
; CHECK-NEXT: br label %vector.body
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 5, i64 10, i64 15, i64 20, i64 25, i64 30, i64 35, i64 40, i64 45, i64 50, i64 55, i64 60, i64 65, i64 70, i64 75>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 5
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[TMP0]] to <80 x float>*
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <80 x float>, <80 x float>* [[TMP1]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <80 x float> [[WIDE_VEC]], <80 x float> undef, <16 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75>
; CHECK-NEXT: [[TMP2:%.*]] = fmul <16 x float> [[BROADCAST_SPLAT]], [[STRIDED_VEC]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds %data, %data* %d, i64 0, i32 0, <16 x i64> [[VEC_IND]]
; CHECK-NEXT: [[BC:%.*]] = bitcast <16 x float*> [[TMP3]] to <16 x <80 x float>*>
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x <80 x float>*> [[BC]], i32 0
; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <80 x float>, <80 x float>* [[TMP4]], align 4
; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <80 x float> [[WIDE_VEC1]], <80 x float> undef, <16 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75>
; CHECK-NEXT: [[TMP5:%.*]] = fadd <16 x float> [[STRIDED_VEC2]], [[TMP2]]
; CHECK-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP5]], <16 x float*> [[TMP3]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80>
; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body

%data = type { [32000 x float], [3 x i32], [4 x i8], [32000 x float] }

define void @PR31671(float %x, %data* %d) #0 {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %tmp0 = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %i
  %tmp1 = load float, float* %tmp0, align 4
  %tmp2 = fmul float %x, %tmp1
  %tmp3 = getelementptr inbounds %data, %data* %d, i64 0, i32 0, i64 %i
  %tmp4 = load float, float* %tmp3, align 4
  %tmp5 = fadd float %tmp4, %tmp2
  store float %tmp5, float* %tmp3, align 4
  %i.next = add nuw nsw i64 %i, 5
  %cond = icmp slt i64 %i.next, 32000
  br i1 %cond, label %for.body, label %for.end

for.end:
  ret void
}

attributes #0 = { "target-cpu"="knl" }
@@ -1,30 +0,0 @@
; RUN: opt -mtriple=x86_64-apple-darwin -mcpu=core2 -loop-vectorize -dce -instcombine -S < %s | FileCheck %s

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

@B = common global [1024 x i32] zeroinitializer, align 16
@A = common global [1024 x i32] zeroinitializer, align 16

; We used to not vectorize this loop because the shift was deemed too
; expensive. Now that we differentiate shift cost based on the operand value
; kind, we will vectorize this loop.
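; C sketch (an illustrative assumption):
;
;   for (int i = 0; i < 1024; ++i)
;     A[i] = B[i] >> 3;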
; CHECK: ashr <4 x i32>
define void @f() {
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %shl = ashr i32 %0, 3
  %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv
  store i32 %shl, i32* %arrayidx2, align 4
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 1024
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret void
}
@@ -1,47 +0,0 @@
; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S | FileCheck %s

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"
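
; The two loops below exercise the cost model for type conversions.
; C sketches (illustrative assumptions):
;
;   conversion_cost1: for (long i = 3; i < n; ++i) A[i] = (char)i;
;   conversion_cost2: for (long i = 9; i < n; ++i) B[i] = (float)(i + 3);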

;CHECK-LABEL: @conversion_cost1(
;CHECK: store <32 x i8>
;CHECK: ret
define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) nounwind uwtable ssp {
  %1 = icmp sgt i32 %n, 3
  br i1 %1, label %.lr.ph, label %._crit_edge

.lr.ph:                                           ; preds = %0, %.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 3, %0 ]
  %2 = trunc i64 %indvars.iv to i8
  %3 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv
  store i8 %2, i8* %3, align 1
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph, %0
  ret i32 undef
}

;CHECK-LABEL: @conversion_cost2(
;CHECK: <2 x float>
;CHECK: ret
define i32 @conversion_cost2(i32 %n, i8* nocapture %A, float* nocapture %B) nounwind uwtable ssp {
  %1 = icmp sgt i32 %n, 9
  br i1 %1, label %.lr.ph, label %._crit_edge

.lr.ph:                                           ; preds = %0, %.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 9, %0 ]
  %add = add nsw i64 %indvars.iv, 3
  %tofp = sitofp i64 %add to float
  %gep = getelementptr inbounds float, float* %B, i64 %indvars.iv
  store float %tofp, float* %gep, align 4
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph, %0
  ret i32 undef
}
@@ -1,82 +0,0 @@
; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S | FileCheck %s

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"

@c = common global [2048 x i32] zeroinitializer, align 16
@b = common global [2048 x i32] zeroinitializer, align 16
@d = common global [2048 x i32] zeroinitializer, align 16
@a = common global [2048 x i32] zeroinitializer, align 16

; The program below gathers and scatters data. We had better not vectorize it.
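; C sketch (an illustrative assumption):
;
;   for (long i = 0; i < 256; ++i)
;     a[d[i]] = b[c[2 * i]];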
;CHECK-LABEL: @cost_model_1(
;CHECK-NOT: <2 x i32>
;CHECK-NOT: <4 x i32>
;CHECK-NOT: <8 x i32>
;CHECK: ret void
define void @cost_model_1() nounwind uwtable noinline ssp {
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %0 = shl nsw i64 %indvars.iv, 1
  %arrayidx = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %0
  %1 = load i32, i32* %arrayidx, align 8
  %idxprom1 = sext i32 %1 to i64
  %arrayidx2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %idxprom1
  %2 = load i32, i32* %arrayidx2, align 4
  %arrayidx4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @d, i64 0, i64 %indvars.iv
  %3 = load i32, i32* %arrayidx4, align 4
  %idxprom5 = sext i32 %3 to i64
  %arrayidx6 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %idxprom5
  store i32 %2, i32* %arrayidx6, align 4
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 256
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}

; This function uses a stride that is generally too big to benefit from vectorization without
; really good support for a gather load. We were not computing an accurate cost for the
; vectorization and subsequent scalarization of the pointer induction variables.
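; C sketch (an illustrative assumption):
;
;   float s = 0.0f;
;   for (long i = 0; i < n; i += 32)
;     s += a[i] + b[i];
;   return s;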

define float @PR27826(float* nocapture readonly %a, float* nocapture readonly %b, i32 %n) {
; CHECK-LABEL: @PR27826(
; CHECK-NOT: <4 x float>
; CHECK-NOT: <8 x float>
; CHECK: ret float %s.0.lcssa

entry:
  %cmp = icmp sgt i32 %n, 0
  br i1 %cmp, label %preheader, label %for.end

preheader:
  %t0 = sext i32 %n to i64
  br label %for

for:
  %indvars.iv = phi i64 [ 0, %preheader ], [ %indvars.iv.next, %for ]
  %s.02 = phi float [ 0.0, %preheader ], [ %add4, %for ]
  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
  %t1 = load float, float* %arrayidx, align 4
  %arrayidx3 = getelementptr inbounds float, float* %b, i64 %indvars.iv
  %t2 = load float, float* %arrayidx3, align 4
  %add = fadd fast float %t1, %s.02
  %add4 = fadd fast float %add, %t2
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 32
  %cmp1 = icmp slt i64 %indvars.iv.next, %t0
  br i1 %cmp1, label %for, label %loopexit

loopexit:
  %add4.lcssa = phi float [ %add4, %for ]
  br label %for.end

for.end:
  %s.0.lcssa = phi float [ 0.0, %entry ], [ %add4.lcssa, %loopexit ]
  ret float %s.0.lcssa
}
@@ -1,149 +0,0 @@
; RUN: opt < %s -O3 -simplifycfg -keep-loops=false -mcpu=core-avx2 -mtriple=x86_64-unknown-linux-gnu -S | FileCheck --check-prefix AUTO_VEC %s

; This test checks auto-vectorization with an FP induction variable.
; The FP operation is not "fast" and requires a "fast-math" function attribute.

;void fp_iv_loop1(float * __restrict__ A, int N) {
;  float x = 1.0;
;  for (int i=0; i < N; ++i) {
;    A[i] = x;
;    x += 0.5;
;  }
;}

; AUTO_VEC-LABEL: @fp_iv_loop1(
; AUTO_VEC: vector.body
; AUTO_VEC: store <8 x float>

define void @fp_iv_loop1(float* noalias nocapture %A, i32 %N) #0 {
entry:
  %cmp4 = icmp sgt i32 %N, 0
  br i1 %cmp4, label %for.body.preheader, label %for.end

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
  %x.06 = phi float [ %conv1, %for.body ], [ 1.000000e+00, %for.body.preheader ]
  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
  store float %x.06, float* %arrayidx, align 4
  %conv1 = fadd float %x.06, 5.000000e-01
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %N
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:                                 ; preds = %for.body
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  ret void
}

; The same as the previous loop, but the FP operation is still not fast and
; the function attribute differs, so vectorization should be rejected.
;void fp_iv_loop2(float * __restrict__ A, int N) {
;  float x = 1.0;
;  for (int i=0; i < N; ++i) {
;    A[i] = x;
;    x += 0.5;
;  }
;}

; AUTO_VEC-LABEL: @fp_iv_loop2(
; AUTO_VEC-NOT: vector.body
; AUTO_VEC-NOT: store <{{.*}} x float>

define void @fp_iv_loop2(float* noalias nocapture %A, i32 %N) #1 {
entry:
  %cmp4 = icmp sgt i32 %N, 0
  br i1 %cmp4, label %for.body.preheader, label %for.end

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
  %x.06 = phi float [ %conv1, %for.body ], [ 1.000000e+00, %for.body.preheader ]
  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
  store float %x.06, float* %arrayidx, align 4
  %conv1 = fadd float %x.06, 5.000000e-01
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %N
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:                                 ; preds = %for.body
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  ret void
}
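
; C sketch of the two functions below (an illustrative assumption): an FP
; induction variable whose final value is also used outside the loop, with
; and without fast math on the fadd:
;
;   double external_use(double *a, long n) {
;     double j = 0.0;
;     for (long i = 0; i < n; ++i, j += 3.0)
;       a[i] = j;
;     return j;
;   }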

; AUTO_VEC-LABEL: @external_use_with_fast_math(
; AUTO_VEC-NEXT: entry:
; AUTO_VEC-NEXT: [[TMP0:%.*]] = icmp sgt i64 %n, 1
; AUTO_VEC-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i64 %n, i64 1
; AUTO_VEC: br i1 {{.*}}, label %for.body, label %vector.ph
; AUTO_VEC: vector.ph:
; AUTO_VEC-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775792
; AUTO_VEC: br label %vector.body
; AUTO_VEC: middle.block:
; AUTO_VEC: [[TMP11:%.*]] = add nsw i64 [[N_VEC]], -1
; AUTO_VEC-NEXT: [[CAST_CMO:%.*]] = sitofp i64 [[TMP11]] to double
; AUTO_VEC-NEXT: [[TMP12:%.*]] = fmul fast double [[CAST_CMO]], 3.000000e+00
; AUTO_VEC-NEXT: br i1 {{.*}}, label %for.end, label %for.body
; AUTO_VEC: for.end:
; AUTO_VEC-NEXT: [[J_LCSSA:%.*]] = phi double [ [[TMP12]], %middle.block ], [ %j, %for.body ]
; AUTO_VEC-NEXT: ret double [[J_LCSSA]]
;
define double @external_use_with_fast_math(double* %a, i64 %n) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
  %j = phi double [ 0.0, %entry ], [ %j.next, %for.body ]
  %tmp0 = getelementptr double, double* %a, i64 %i
  store double %j, double* %tmp0
  %i.next = add i64 %i, 1
  %j.next = fadd fast double %j, 3.0
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  %tmp1 = phi double [ %j, %for.body ]
  ret double %tmp1
}

; AUTO_VEC-LABEL: @external_use_without_fast_math(
; AUTO_VEC: for.body:
; AUTO_VEC: [[J:%.*]] = phi double [ 0.000000e+00, %entry ], [ [[J_NEXT:%.*]], %for.body ]
; AUTO_VEC: [[J_NEXT]] = fadd double [[J]], 3.000000e+00
; AUTO_VEC: br i1 {{.*}}, label %for.body, label %for.end
; AUTO_VEC: for.end:
; AUTO_VEC-NEXT: ret double [[J]]
;
define double @external_use_without_fast_math(double* %a, i64 %n) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
  %j = phi double [ 0.0, %entry ], [ %j.next, %for.body ]
  %tmp0 = getelementptr double, double* %a, i64 %i
  store double %j, double* %tmp0
  %i.next = add i64 %i, 1
  %j.next = fadd double %j, 3.0
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  %tmp1 = phi double [ %j, %for.body ]
  ret double %tmp1
}

attributes #0 = { "no-nans-fp-math"="true" }
attributes #1 = { "no-nans-fp-math"="false" }
@@ -1,41 +0,0 @@
; RUN: opt -loop-vectorize -S < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
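
; All memory accesses below carry !llvm.mem.parallel_loop_access, so the
; conditional read-modify-write must not block vectorization. C sketch (an
; illustrative assumption):
;
;   for (int i = 0; i < 16; ++i)
;     if (p[i] != 0)
;       res[i] += d[i];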

; Function Attrs: norecurse nounwind uwtable
define void @Test(i32* nocapture %res, i32* nocapture readnone %c, i32* nocapture readonly %d, i32* nocapture readonly %p) #0 {
entry:
  br label %for.body

; CHECK-LABEL: @Test
; CHECK: <4 x i32>

for.body:                                         ; preds = %cond.end, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %cond.end ]
  %arrayidx = getelementptr inbounds i32, i32* %p, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access !0
  %cmp1 = icmp eq i32 %0, 0
  %arrayidx3 = getelementptr inbounds i32, i32* %res, i64 %indvars.iv
  %1 = load i32, i32* %arrayidx3, align 4, !llvm.mem.parallel_loop_access !0
  br i1 %cmp1, label %cond.end, label %cond.false

cond.false:                                       ; preds = %for.body
  %arrayidx7 = getelementptr inbounds i32, i32* %d, i64 %indvars.iv
  %2 = load i32, i32* %arrayidx7, align 4, !llvm.mem.parallel_loop_access !0
  %add = add nsw i32 %2, %1
  br label %cond.end

cond.end:                                         ; preds = %for.body, %cond.false
  %cond = phi i32 [ %add, %cond.false ], [ %1, %for.body ]
  store i32 %cond, i32* %arrayidx3, align 4, !llvm.mem.parallel_loop_access !0
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 16
  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0

for.end:                                          ; preds = %cond.end
  ret void
}

attributes #0 = { norecurse nounwind uwtable "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" }

!0 = distinct !{!0}
@@ -1,39 +0,0 @@
; RUN: opt < %s -mcpu=core-avx2 -loop-vectorize -S | llc -mcpu=core-avx2 | FileCheck %s

target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx"

@float_array = common global [10000 x float] zeroinitializer, align 16
@unsigned_array = common global [10000 x i32] zeroinitializer, align 16

; If we need to scalarize the fptoui and then use inserts to build up the
; vector again, then there is certainly no value in going 256-bit wide.
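; C sketch (an illustrative assumption):
;
;   for (unsigned i = 0; i < N; ++i)
;     unsigned_array[i] = (unsigned)float_array[i];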
; CHECK-NOT: vinserti128

define void @convert(i32 %N) {
entry:
  %0 = icmp eq i32 %N, 0
  br i1 %0, label %for.end, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
  %arrayidx = getelementptr inbounds [10000 x float], [10000 x float]* @float_array, i64 0, i64 %indvars.iv
  %1 = load float, float* %arrayidx, align 4
  %conv = fptoui float %1 to i32
  %arrayidx2 = getelementptr inbounds [10000 x i32], [10000 x i32]* @unsigned_array, i64 0, i64 %indvars.iv
  store i32 %conv, i32* %arrayidx2, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %N
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:                                 ; preds = %for.body
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  ret void
}
@@ -1,40 +0,0 @@
; RUN: opt < %s -mcpu=core-avx2 -loop-vectorize -S | llc -mcpu=core-avx2 | FileCheck %s

target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx"

@n = global i32 10000, align 4
@double_array = common global [10000 x double] zeroinitializer, align 16
@unsigned_array = common global [10000 x i32] zeroinitializer, align 16

; If we need to scalarize the fptoui and then use inserts to build up the
; vector again, then there is certainly no value in going 256-bit wide.
; CHECK-NOT: vpinsrd

define void @convert() {
entry:
  %0 = load i32, i32* @n, align 4
  %cmp4 = icmp eq i32 %0, 0
  br i1 %cmp4, label %for.end, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
  %arrayidx = getelementptr inbounds [10000 x double], [10000 x double]* @double_array, i64 0, i64 %indvars.iv
  %1 = load double, double* %arrayidx, align 8
  %conv = fptoui double %1 to i32
  %arrayidx2 = getelementptr inbounds [10000 x i32], [10000 x i32]* @unsigned_array, i64 0, i64 %indvars.iv
  store i32 %conv, i32* %arrayidx2, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %2 = trunc i64 %indvars.iv.next to i32
  %cmp = icmp ult i32 %2, %0
  br i1 %cmp, label %for.body, label %for.end.loopexit

for.end.loopexit:                                 ; preds = %for.body
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  ret void
}
@@ -1,25 +0,0 @@
; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S -debug-only=loop-vectorize 2>&1 | FileCheck %s
; REQUIRES: asserts

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"

; CHECK: cost of 7 for VF 8 For instruction: %conv = fptosi float %tmp to i8
define void @float_to_sint8_cost(i8* noalias nocapture %a, float* noalias nocapture readonly %b) nounwind {
entry:
  br label %for.body
for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
  %tmp = load float, float* %arrayidx, align 4
  %conv = fptosi float %tmp to i8
  %arrayidx2 = getelementptr inbounds i8, i8* %a, i64 %indvars.iv
  store i8 %conv, i8* %arrayidx2, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 256
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret void
}
@@ -1,45 +0,0 @@
; RUN: opt -S -loop-vectorize < %s | FileCheck %s
target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
target triple = "i686-pc-windows-msvc18.0.0"
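
; C++ sketch (an illustrative assumption): a floor() loop inside a catch
; handler. Per the CHECK lines below, the vectorized @llvm.floor call must
; keep its "funclet" operand bundle so it stays associated with the catchpad:
;
;   try { throw_something(); }
;   catch (...) {
;     for (int i = 0; i < 1024; ++i)
;       (void)floor(1.0);
;   }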

define void @test1() #0 personality i32 (...)* @__CxxFrameHandler3 {
entry:
  invoke void @_CxxThrowException(i8* null, i8* null)
          to label %unreachable unwind label %catch.dispatch

catch.dispatch:                                   ; preds = %entry
  %0 = catchswitch within none [label %catch] unwind to caller

catch:                                            ; preds = %catch.dispatch
  %1 = catchpad within %0 [i8* null, i32 64, i8* null]
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  catchret from %1 to label %try.cont

for.body:                                         ; preds = %for.body, %catch
  %i.07 = phi i32 [ 0, %catch ], [ %inc, %for.body ]
  %call = call double @floor(double 1.0) #1 [ "funclet"(token %1) ]
  %inc = add nuw nsw i32 %i.07, 1
  %exitcond = icmp eq i32 %inc, 1024
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

try.cont:                                         ; preds = %for.cond.cleanup
  ret void

unreachable:                                      ; preds = %entry
  unreachable
}

; CHECK-LABEL: define void @test1(
; CHECK: %[[cpad:.*]] = catchpad within {{.*}} [i8* null, i32 64, i8* null]
; CHECK: call <16 x double> @llvm.floor.v16f64(<16 x double> {{.*}}) [ "funclet"(token %[[cpad]]) ]

declare x86_stdcallcc void @_CxxThrowException(i8*, i8*)

declare i32 @__CxxFrameHandler3(...)

declare double @floor(double) #1

attributes #0 = { "target-features"="+sse2" }
attributes #1 = { nounwind readnone }
@@ -1,86 +0,0 @@
; RUN: opt -loop-vectorize -mtriple=x86_64-apple-macosx -S -mcpu=corei7-avx -enable-interleaved-mem-accesses=false < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

@kernel = global [512 x float] zeroinitializer, align 16
@kernel2 = global [512 x float] zeroinitializer, align 16
@kernel3 = global [512 x float] zeroinitializer, align 16
@kernel4 = global [512 x float] zeroinitializer, align 16
@src_data = global [1536 x float] zeroinitializer, align 16
@r_ = global i8 0, align 1
@g_ = global i8 0, align 1
@b_ = global i8 0, align 1

; We don't want to vectorize most loops containing gathers because they are
; expensive. This function represents a point where vectorization starts to
; become beneficial.
; Make sure we are conservative and don't vectorize it.
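; C sketch of the kernel (an illustrative assumption; the fast-math flags on
; the IR permit this reassociated form): three interleaved reductions over
; strided src_data with a common kernel product:
;
;   for (long v = 0; v < size; ++v) {
;     long m = 3 * (v + offset);
;     float k = kernel[v] * kernel2[v] * kernel3[v] * kernel4[v];
;     r += src_data[m] * k;
;     g += src_data[m + 1] * k;
;     b += src_data[m + 2] * k;
;   }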
; CHECK-NOT: x float>

define void @_Z4testmm(i64 %size, i64 %offset) {
entry:
  %cmp53 = icmp eq i64 %size, 0
  br i1 %cmp53, label %for.end, label %for.body.lr.ph

for.body.lr.ph:
  br label %for.body

for.body:
  %r.057 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add10, %for.body ]
  %g.056 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add20, %for.body ]
  %v.055 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %b.054 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add30, %for.body ]
  %add = add i64 %v.055, %offset
  %mul = mul i64 %add, 3
  %arrayidx = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 %mul
  %0 = load float, float* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float]* @kernel, i64 0, i64 %v.055
  %1 = load float, float* %arrayidx2, align 4
  %mul3 = fmul fast float %0, %1
  %arrayidx4 = getelementptr inbounds [512 x float], [512 x float]* @kernel2, i64 0, i64 %v.055
  %2 = load float, float* %arrayidx4, align 4
  %mul5 = fmul fast float %mul3, %2
  %arrayidx6 = getelementptr inbounds [512 x float], [512 x float]* @kernel3, i64 0, i64 %v.055
  %3 = load float, float* %arrayidx6, align 4
  %mul7 = fmul fast float %mul5, %3
  %arrayidx8 = getelementptr inbounds [512 x float], [512 x float]* @kernel4, i64 0, i64 %v.055
  %4 = load float, float* %arrayidx8, align 4
  %mul9 = fmul fast float %mul7, %4
  %add10 = fadd fast float %r.057, %mul9
  %arrayidx.sum = add i64 %mul, 1
  %arrayidx11 = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 %arrayidx.sum
  %5 = load float, float* %arrayidx11, align 4
  %mul13 = fmul fast float %1, %5
  %mul15 = fmul fast float %2, %mul13
  %mul17 = fmul fast float %3, %mul15
  %mul19 = fmul fast float %4, %mul17
  %add20 = fadd fast float %g.056, %mul19
  %arrayidx.sum52 = add i64 %mul, 2
  %arrayidx21 = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 %arrayidx.sum52
  %6 = load float, float* %arrayidx21, align 4
  %mul23 = fmul fast float %1, %6
  %mul25 = fmul fast float %2, %mul23
  %mul27 = fmul fast float %3, %mul25
  %mul29 = fmul fast float %4, %mul27
  %add30 = fadd fast float %b.054, %mul29
  %inc = add i64 %v.055, 1
  %exitcond = icmp ne i64 %inc, %size
  br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge

for.cond.for.end_crit_edge:
  %add30.lcssa = phi float [ %add30, %for.body ]
  %add20.lcssa = phi float [ %add20, %for.body ]
  %add10.lcssa = phi float [ %add10, %for.body ]
  %phitmp = fptoui float %add10.lcssa to i8
  %phitmp60 = fptoui float %add20.lcssa to i8
  %phitmp61 = fptoui float %add30.lcssa to i8
  br label %for.end

for.end:
  %r.0.lcssa = phi i8 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  %g.0.lcssa = phi i8 [ %phitmp60, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  %b.0.lcssa = phi i8 [ %phitmp61, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  store i8 %r.0.lcssa, i8* @r_, align 1
  store i8 %g.0.lcssa, i8* @g_, align 1
  store i8 %b.0.lcssa, i8* @b_, align 1
  ret void
}
@@ -1,41 +0,0 @@
; RUN: opt -loop-vectorize -S -mcpu=skylake-avx512 < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; This test checks that a "gather" operation is chosen, since its cost is
; better than the interleaving pattern's.
;
;unsigned long A[SIZE];
;unsigned long B[SIZE];
;
;void foo() {
;  for (int i=0; i<N; i+=8) {
;    B[i] = A[i] + 5;
;  }
;}

@A = global [10240 x i64] zeroinitializer, align 16
@B = global [10240 x i64] zeroinitializer, align 16

; CHECK-LABEL: strided_load_i64
; CHECK: masked.gather
define void @strided_load_i64() {
  br label %1

; <label>:1:                                      ; preds = %0, %1
  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
  %2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %indvars.iv
  %3 = load i64, i64* %2, align 16
  %4 = add i64 %3, 5
  %5 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv
  store i64 %4, i64* %5, align 16
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 8
  %6 = icmp slt i64 %indvars.iv.next, 1024
  br i1 %6, label %1, label %7

; <label>:7:                                      ; preds = %1
  ret void
}
@ -1,426 +0,0 @@
|
||||
; RUN: opt < %s -O3 -mcpu=knl -S | FileCheck %s -check-prefix=AVX512
|
||||
|
||||
;AVX1-NOT: llvm.masked
|
||||
|
||||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-pc_linux"
|
||||
|
||||
; The source code:
|
||||
;
|
||||
;void foo1(float * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger, int * __restrict__ index) {
|
||||
;
|
||||
; for (int i=0; i < SIZE; ++i) {
|
||||
; if (trigger[i] > 0) {
|
||||
; out[i] = in[index[i]] + (float) 0.5;
|
||||
; }
|
||||
; }
|
||||
;}
|
||||
|
||||
;AVX512-LABEL: @foo1
|
||||
;AVX512: llvm.masked.load.v16i32.p0v16i32
|
||||
;AVX512: llvm.masked.gather.v16f32.v16p0f32
|
||||
;AVX512: llvm.masked.store.v16f32.p0v16f32
|
||||
;AVX512: ret void
|
||||
|
||||
; Function Attrs: nounwind uwtable
|
||||
define void @foo1(float* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) {
|
||||
entry:
|
||||
%in.addr = alloca float*, align 8
|
||||
%out.addr = alloca float*, align 8
|
||||
%trigger.addr = alloca i32*, align 8
|
||||
%index.addr = alloca i32*, align 8
|
||||
%i = alloca i32, align 4
|
||||
store float* %in, float** %in.addr, align 8
|
||||
store float* %out, float** %out.addr, align 8
|
||||
store i32* %trigger, i32** %trigger.addr, align 8
|
||||
store i32* %index, i32** %index.addr, align 8
|
||||
store i32 0, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
for.cond: ; preds = %for.inc, %entry
|
||||
%0 = load i32, i32* %i, align 4
|
||||
%cmp = icmp slt i32 %0, 4096
|
||||
br i1 %cmp, label %for.body, label %for.end
|
||||
|
||||
for.body: ; preds = %for.cond
|
||||
%1 = load i32, i32* %i, align 4
|
||||
%idxprom = sext i32 %1 to i64
|
||||
%2 = load i32*, i32** %trigger.addr, align 8
|
||||
%arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
|
||||
%3 = load i32, i32* %arrayidx, align 4
|
||||
%cmp1 = icmp sgt i32 %3, 0
|
||||
br i1 %cmp1, label %if.then, label %if.end
|
||||
|
||||
if.then: ; preds = %for.body
|
||||
%4 = load i32, i32* %i, align 4
|
||||
%idxprom2 = sext i32 %4 to i64
|
||||
%5 = load i32*, i32** %index.addr, align 8
|
||||
%arrayidx3 = getelementptr inbounds i32, i32* %5, i64 %idxprom2
|
||||
%6 = load i32, i32* %arrayidx3, align 4
|
||||
%idxprom4 = sext i32 %6 to i64
|
||||
%7 = load float*, float** %in.addr, align 8
|
||||
%arrayidx5 = getelementptr inbounds float, float* %7, i64 %idxprom4
|
||||
%8 = load float, float* %arrayidx5, align 4
|
||||
%add = fadd float %8, 5.000000e-01
|
||||
%9 = load i32, i32* %i, align 4
|
||||
%idxprom6 = sext i32 %9 to i64
|
||||
%10 = load float*, float** %out.addr, align 8
|
||||
%arrayidx7 = getelementptr inbounds float, float* %10, i64 %idxprom6
|
||||
store float %add, float* %arrayidx7, align 4
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %if.then, %for.body
|
||||
br label %for.inc
|
||||
|
||||
for.inc: ; preds = %if.end
|
||||
%11 = load i32, i32* %i, align 4
|
||||
%inc = add nsw i32 %11, 1
|
||||
store i32 %inc, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
for.end: ; preds = %for.cond
|
||||
ret void
|
||||
}
|
||||
|
||||
; The source code
|
||||
;void foo2 (In * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger) {
|
||||
;
|
||||
; for (int i=0; i<SIZE; i += 16) {
|
||||
; if (trigger[i] > 0) {
|
||||
; out[i] = in[i].b + (float) 0.5;
|
||||
; }
|
||||
; }
|
||||
;}
|
||||
|
||||
%struct.In = type { float, float }
|
||||
|
||||
;AVX512-LABEL: @foo2
|
||||
;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> {{.*}}, i32 1
|
||||
;AVX512: llvm.masked.gather.v16f32.v16p0f32
|
||||
;AVX512: llvm.masked.scatter.v16f32.v16p0f32
|
||||
;AVX512: ret void
|
||||
define void @foo2(%struct.In* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 {
|
||||
entry:
|
||||
%in.addr = alloca %struct.In*, align 8
|
||||
%out.addr = alloca float*, align 8
|
||||
%trigger.addr = alloca i32*, align 8
|
||||
%index.addr = alloca i32*, align 8
|
||||
%i = alloca i32, align 4
|
||||
store %struct.In* %in, %struct.In** %in.addr, align 8
|
||||
store float* %out, float** %out.addr, align 8
|
||||
store i32* %trigger, i32** %trigger.addr, align 8
|
||||
store i32* %index, i32** %index.addr, align 8
|
||||
store i32 0, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
for.cond: ; preds = %for.inc, %entry
|
||||
%0 = load i32, i32* %i, align 4
|
||||
%cmp = icmp slt i32 %0, 4096
|
||||
br i1 %cmp, label %for.body, label %for.end
|
||||
|
||||
for.body: ; preds = %for.cond
|
||||
%1 = load i32, i32* %i, align 4
|
||||
%idxprom = sext i32 %1 to i64
|
||||
%2 = load i32*, i32** %trigger.addr, align 8
|
||||
%arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
|
||||
%3 = load i32, i32* %arrayidx, align 4
|
||||
%cmp1 = icmp sgt i32 %3, 0
|
||||
br i1 %cmp1, label %if.then, label %if.end
|
||||
|
||||
if.then: ; preds = %for.body
|
||||
%4 = load i32, i32* %i, align 4
|
||||
%idxprom2 = sext i32 %4 to i64
|
||||
%5 = load %struct.In*, %struct.In** %in.addr, align 8
|
||||
%arrayidx3 = getelementptr inbounds %struct.In, %struct.In* %5, i64 %idxprom2
|
||||
%b = getelementptr inbounds %struct.In, %struct.In* %arrayidx3, i32 0, i32 1
|
||||
%6 = load float, float* %b, align 4
|
||||
%add = fadd float %6, 5.000000e-01
|
||||
%7 = load i32, i32* %i, align 4
|
||||
%idxprom4 = sext i32 %7 to i64
|
||||
%8 = load float*, float** %out.addr, align 8
|
||||
%arrayidx5 = getelementptr inbounds float, float* %8, i64 %idxprom4
|
||||
store float %add, float* %arrayidx5, align 4
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %if.then, %for.body
|
||||
br label %for.inc
|
||||
|
||||
for.inc: ; preds = %if.end
|
||||
%9 = load i32, i32* %i, align 4
|
||||
%inc = add nsw i32 %9, 16
|
||||
store i32 %inc, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
for.end: ; preds = %for.cond
|
||||
ret void
|
||||
}
|
||||
|
||||
; The source code
|
||||
;struct Out {
|
||||
; float a;
|
||||
; float b;
|
||||
;};
|
||||
;void foo3 (In * __restrict__ in, Out * __restrict__ out, int * __restrict__ trigger) {
|
||||
;
|
||||
; for (int i=0; i<SIZE; i += 16) {
|
||||
; if (trigger[i] > 0) {
|
||||
; out[i].b = in[i].b + (float) 0.5;
|
||||
; }
|
||||
; }
|
||||
;}
|
||||
|
||||
;AVX512-LABEL: @foo3
|
||||
;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> {{.*}}, i32 1
|
||||
;AVX512: llvm.masked.gather.v16f32.v16p0f32
|
||||
;AVX512: fadd <16 x float>
|
||||
;AVX512: getelementptr inbounds %struct.Out, %struct.Out* %out, <16 x i64> {{.*}}, i32 1
|
||||
;AVX512: llvm.masked.scatter.v16f32.v16p0f32
|
||||
;AVX512: ret void
|
||||
|
||||
%struct.Out = type { float, float }
|
||||
|
||||
define void @foo3(%struct.In* noalias %in, %struct.Out* noalias %out, i32* noalias %trigger) {
|
||||
entry:
|
||||
%in.addr = alloca %struct.In*, align 8
|
||||
%out.addr = alloca %struct.Out*, align 8
|
||||
%trigger.addr = alloca i32*, align 8
|
||||
%i = alloca i32, align 4
|
||||
store %struct.In* %in, %struct.In** %in.addr, align 8
|
||||
store %struct.Out* %out, %struct.Out** %out.addr, align 8
|
||||
store i32* %trigger, i32** %trigger.addr, align 8
|
||||
store i32 0, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
for.cond: ; preds = %for.inc, %entry
|
||||
%0 = load i32, i32* %i, align 4
|
||||
%cmp = icmp slt i32 %0, 4096
|
||||
br i1 %cmp, label %for.body, label %for.end
|
||||
|
||||
for.body: ; preds = %for.cond
|
||||
%1 = load i32, i32* %i, align 4
|
||||
%idxprom = sext i32 %1 to i64
|
||||
%2 = load i32*, i32** %trigger.addr, align 8
|
||||
%arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
|
||||
%3 = load i32, i32* %arrayidx, align 4
|
||||
%cmp1 = icmp sgt i32 %3, 0
|
||||
br i1 %cmp1, label %if.then, label %if.end
|
||||
|
||||
if.then: ; preds = %for.body
|
||||
%4 = load i32, i32* %i, align 4
|
||||
%idxprom2 = sext i32 %4 to i64
|
||||
%5 = load %struct.In*, %struct.In** %in.addr, align 8
|
||||
%arrayidx3 = getelementptr inbounds %struct.In, %struct.In* %5, i64 %idxprom2
|
||||
%b = getelementptr inbounds %struct.In, %struct.In* %arrayidx3, i32 0, i32 1
|
||||
%6 = load float, float* %b, align 4
|
||||
%add = fadd float %6, 5.000000e-01
|
||||
%7 = load i32, i32* %i, align 4
|
||||
%idxprom4 = sext i32 %7 to i64
|
||||
%8 = load %struct.Out*, %struct.Out** %out.addr, align 8
|
||||
%arrayidx5 = getelementptr inbounds %struct.Out, %struct.Out* %8, i64 %idxprom4
|
||||
%b6 = getelementptr inbounds %struct.Out, %struct.Out* %arrayidx5, i32 0, i32 1
|
||||
store float %add, float* %b6, align 4
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %if.then, %for.body
|
||||
br label %for.inc
|
||||
|
||||
for.inc: ; preds = %if.end
|
||||
%9 = load i32, i32* %i, align 4
|
||||
%inc = add nsw i32 %9, 16
|
||||
store i32 %inc, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
for.end: ; preds = %for.cond
|
||||
ret void
|
||||
}
|
||||
declare void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float>, <16 x float*>, i32, <16 x i1>)
|
||||
|
||||
; The same as @foo2 but scatter/gather argument is a vecotr of ptrs with addresspace 1
|
||||
|
||||
;AVX512-LABEL: @foo2_addrspace
|
||||
;AVX512: getelementptr inbounds %struct.In, %struct.In addrspace(1)* %in, <16 x i64> {{.*}}, i32 1
|
||||
;AVX512: llvm.masked.gather.v16f32.v16p1f32
|
||||
;AVX512: llvm.masked.scatter.v16f32.v16p1f32
|
||||
;AVX512: ret void
|
||||
define void @foo2_addrspace(%struct.In addrspace(1)* noalias %in, float addrspace(1)* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 {
|
||||
entry:
|
||||
%in.addr = alloca %struct.In addrspace(1)*, align 8
|
||||
%out.addr = alloca float addrspace(1)*, align 8
|
||||
%trigger.addr = alloca i32*, align 8
|
||||
%index.addr = alloca i32*, align 8
|
||||
%i = alloca i32, align 4
|
||||
store %struct.In addrspace(1)* %in, %struct.In addrspace(1)** %in.addr, align 8
|
||||
store float addrspace(1)* %out, float addrspace(1)** %out.addr, align 8
|
||||
store i32* %trigger, i32** %trigger.addr, align 8
|
||||
store i32* %index, i32** %index.addr, align 8
|
||||
store i32 0, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
for.cond: ; preds = %for.inc, %entry
|
||||
%0 = load i32, i32* %i, align 4
|
||||
%cmp = icmp slt i32 %0, 4096
|
||||
br i1 %cmp, label %for.body, label %for.end
|
||||
|
||||
for.body: ; preds = %for.cond
|
||||
%1 = load i32, i32* %i, align 4
|
||||
%idxprom = sext i32 %1 to i64
|
||||
%2 = load i32*, i32** %trigger.addr, align 8
|
||||
%arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
|
||||
%3 = load i32, i32* %arrayidx, align 4
|
||||
%cmp1 = icmp sgt i32 %3, 0
|
||||
br i1 %cmp1, label %if.then, label %if.end
|
||||
|
||||
if.then: ; preds = %for.body
|
||||
%4 = load i32, i32* %i, align 4
|
||||
%idxprom2 = sext i32 %4 to i64
|
||||
%5 = load %struct.In addrspace(1)*, %struct.In addrspace(1)** %in.addr, align 8
|
||||
%arrayidx3 = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %5, i64 %idxprom2
|
||||
%b = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %arrayidx3, i32 0, i32 1
|
||||
%6 = load float, float addrspace(1)* %b, align 4
|
||||
%add = fadd float %6, 5.000000e-01
|
||||
%7 = load i32, i32* %i, align 4
|
||||
%idxprom4 = sext i32 %7 to i64
|
||||
%8 = load float addrspace(1)*, float addrspace(1)** %out.addr, align 8
|
||||
%arrayidx5 = getelementptr inbounds float, float addrspace(1)* %8, i64 %idxprom4
|
||||
store float %add, float addrspace(1)* %arrayidx5, align 4
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %if.then, %for.body
|
||||
br label %for.inc
|
||||
|
||||
for.inc: ; preds = %if.end
|
||||
%9 = load i32, i32* %i, align 4
|
||||
%inc = add nsw i32 %9, 16
|
||||
store i32 %inc, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
for.end: ; preds = %for.cond
|
||||
ret void
|
||||
}

; Same as foo2_addrspace but here only the input has the non-default address space.

;AVX512-LABEL: @foo2_addrspace2
;AVX512: getelementptr inbounds %struct.In, %struct.In addrspace(1)* %in, <16 x i64> {{.*}}, i32 1
;AVX512: llvm.masked.gather.v16f32.v16p1f32
;AVX512: llvm.masked.scatter.v16f32.v16p0f32
;AVX512: ret void
define void @foo2_addrspace2(%struct.In addrspace(1)* noalias %in, float addrspace(0)* noalias %out, i32* noalias %trigger, i32* noalias %index) {
entry:
%in.addr = alloca %struct.In addrspace(1)*, align 8
%out.addr = alloca float addrspace(0)*, align 8
%trigger.addr = alloca i32*, align 8
%index.addr = alloca i32*, align 8
%i = alloca i32, align 4
store %struct.In addrspace(1)* %in, %struct.In addrspace(1)** %in.addr, align 8
store float addrspace(0)* %out, float addrspace(0)** %out.addr, align 8
store i32* %trigger, i32** %trigger.addr, align 8
store i32* %index, i32** %index.addr, align 8
store i32 0, i32* %i, align 4
br label %for.cond

for.cond:                                         ; preds = %for.inc, %entry
%0 = load i32, i32* %i, align 4
%cmp = icmp slt i32 %0, 4096
br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
%1 = load i32, i32* %i, align 4
%idxprom = sext i32 %1 to i64
%2 = load i32*, i32** %trigger.addr, align 8
%arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
%3 = load i32, i32* %arrayidx, align 4
%cmp1 = icmp sgt i32 %3, 0
br i1 %cmp1, label %if.then, label %if.end

if.then:                                          ; preds = %for.body
%4 = load i32, i32* %i, align 4
%idxprom2 = sext i32 %4 to i64
%5 = load %struct.In addrspace(1)*, %struct.In addrspace(1)** %in.addr, align 8
%arrayidx3 = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %5, i64 %idxprom2
%b = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %arrayidx3, i32 0, i32 1
%6 = load float, float addrspace(1)* %b, align 4
%add = fadd float %6, 5.000000e-01
%7 = load i32, i32* %i, align 4
%idxprom4 = sext i32 %7 to i64
%8 = load float addrspace(0)*, float addrspace(0)** %out.addr, align 8
%arrayidx5 = getelementptr inbounds float, float addrspace(0)* %8, i64 %idxprom4
store float %add, float addrspace(0)* %arrayidx5, align 4
br label %if.end

if.end:                                           ; preds = %if.then, %for.body
br label %for.inc

for.inc:                                          ; preds = %if.end
%9 = load i32, i32* %i, align 4
%inc = add nsw i32 %9, 16
store i32 %inc, i32* %i, align 4
br label %for.cond

for.end:                                          ; preds = %for.cond
ret void
}

; Same as foo2_addrspace but here only the output has the non-default address space.

;AVX512-LABEL: @foo2_addrspace3
;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> {{.*}}, i32 1
;AVX512: llvm.masked.gather.v16f32.v16p0f32
;AVX512: llvm.masked.scatter.v16f32.v16p1f32
;AVX512: ret void

define void @foo2_addrspace3(%struct.In addrspace(0)* noalias %in, float addrspace(1)* noalias %out, i32* noalias %trigger, i32* noalias %index) {
entry:
%in.addr = alloca %struct.In addrspace(0)*, align 8
%out.addr = alloca float addrspace(1)*, align 8
%trigger.addr = alloca i32*, align 8
%index.addr = alloca i32*, align 8
%i = alloca i32, align 4
store %struct.In addrspace(0)* %in, %struct.In addrspace(0)** %in.addr, align 8
store float addrspace(1)* %out, float addrspace(1)** %out.addr, align 8
store i32* %trigger, i32** %trigger.addr, align 8
store i32* %index, i32** %index.addr, align 8
store i32 0, i32* %i, align 4
br label %for.cond

for.cond:                                         ; preds = %for.inc, %entry
%0 = load i32, i32* %i, align 4
%cmp = icmp slt i32 %0, 4096
br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
%1 = load i32, i32* %i, align 4
%idxprom = sext i32 %1 to i64
%2 = load i32*, i32** %trigger.addr, align 8
%arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
%3 = load i32, i32* %arrayidx, align 4
%cmp1 = icmp sgt i32 %3, 0
br i1 %cmp1, label %if.then, label %if.end

if.then:                                          ; preds = %for.body
%4 = load i32, i32* %i, align 4
%idxprom2 = sext i32 %4 to i64
%5 = load %struct.In addrspace(0)*, %struct.In addrspace(0)** %in.addr, align 8
%arrayidx3 = getelementptr inbounds %struct.In, %struct.In addrspace(0)* %5, i64 %idxprom2
%b = getelementptr inbounds %struct.In, %struct.In addrspace(0)* %arrayidx3, i32 0, i32 1
%6 = load float, float addrspace(0)* %b, align 4
%add = fadd float %6, 5.000000e-01
%7 = load i32, i32* %i, align 4
%idxprom4 = sext i32 %7 to i64
%8 = load float addrspace(1)*, float addrspace(1)** %out.addr, align 8
%arrayidx5 = getelementptr inbounds float, float addrspace(1)* %8, i64 %idxprom4
store float %add, float addrspace(1)* %arrayidx5, align 4
br label %if.end

if.end:                                           ; preds = %if.then, %for.body
br label %for.inc

for.inc:                                          ; preds = %if.end
%9 = load i32, i32* %i, align 4
%inc = add nsw i32 %9, 16
store i32 %inc, i32* %i, align 4
br label %for.cond

for.end:                                          ; preds = %for.cond
ret void
}
@ -1,77 +0,0 @@
; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -dce -instcombine -S | FileCheck %s
; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -force-vector-interleave=0 -dce -instcombine -S | FileCheck %s -check-prefix=UNROLL

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"

@b = common global [2048 x i32] zeroinitializer, align 16
@c = common global [2048 x i32] zeroinitializer, align 16
@a = common global [2048 x i32] zeroinitializer, align 16

; Select VF = 8;
;CHECK-LABEL: @example1(
;CHECK: load <4 x i32>
;CHECK: add nsw <4 x i32>
;CHECK: store <4 x i32>
;CHECK: ret void

;UNROLL-LABEL: @example1(
;UNROLL: load <4 x i32>
;UNROLL: load <4 x i32>
;UNROLL: add nsw <4 x i32>
;UNROLL: add nsw <4 x i32>
;UNROLL: store <4 x i32>
;UNROLL: store <4 x i32>
;UNROLL: ret void
define void @example1() nounwind uwtable ssp {
br label %1

; <label>:1                                       ; preds = %1, %0
%indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
%2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv
%3 = load i32, i32* %2, align 4
%4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv
%5 = load i32, i32* %4, align 4
%6 = add nsw i32 %5, %3
%7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
store i32 %6, i32* %7, align 4
%indvars.iv.next = add i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, 256
br i1 %exitcond, label %8, label %1

; <label>:8                                       ; preds = %1
ret void
}
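
; For orientation (an inference from the IR above, not in the original file):
; @example1 is the classic vectorizer kernel
;   for (int i = 0; i < 256; ++i) a[i] = b[i] + c[i];
; which the CHECK lines expect to become <4 x i32> wide loads/adds/stores, and
; two of each under UNROLL (interleaving).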

; Select VF=4 because sext <8 x i1> to <8 x i32> is expensive.
;CHECK-LABEL: @example10b(
;CHECK: load <4 x i16>
;CHECK: sext <4 x i16>
;CHECK: store <4 x i32>
;CHECK: ret void
;UNROLL-LABEL: @example10b(
;UNROLL: load <4 x i16>
;UNROLL: load <4 x i16>
;UNROLL: store <4 x i32>
;UNROLL: store <4 x i32>
;UNROLL: ret void
define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp {
br label %1

; <label>:1                                       ; preds = %1, %0
%indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
%2 = getelementptr inbounds i16, i16* %sb, i64 %indvars.iv
%3 = load i16, i16* %2, align 2
%4 = sext i16 %3 to i32
%5 = getelementptr inbounds i32, i32* %ia, i64 %indvars.iv
store i32 %4, i32* %5, align 4
%indvars.iv.next = add i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, 1024
br i1 %exitcond, label %6, label %1

; <label>:6                                       ; preds = %1
ret void
}
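
; For orientation (an inference from the IR above, not in the original file):
; @example10b reduces to the widening-conversion kernel
;   for (int i = 0; i < 1024; ++i) ia[i] = (int)sb[i];
; i.e. a <4 x i16> load sign-extended to <4 x i32> before the store.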
@ -1,56 +0,0 @@
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

;CHECK-LABEL: @foo(
;CHECK-NOT: <4 x i32>
;CHECK: ret void

; Function Attrs: nounwind uwtable
define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
entry:
%cmp27 = icmp sgt i32 %m, 0
br i1 %cmp27, label %for.body3.lr.ph.us, label %for.end15

for.end.us:                                       ; preds = %for.body3.us
%arrayidx9.us = getelementptr inbounds i32, i32* %b, i64 %indvars.iv33
%0 = load i32, i32* %arrayidx9.us, align 4, !llvm.mem.parallel_loop_access !3
%add10.us = add nsw i32 %0, 3
store i32 %add10.us, i32* %arrayidx9.us, align 4, !llvm.mem.parallel_loop_access !3
%indvars.iv.next34 = add i64 %indvars.iv33, 1
%lftr.wideiv35 = trunc i64 %indvars.iv.next34 to i32
%exitcond36 = icmp eq i32 %lftr.wideiv35, %m
br i1 %exitcond36, label %for.end15, label %for.body3.lr.ph.us, !llvm.loop !5

for.body3.us:                                     ; preds = %for.body3.us, %for.body3.lr.ph.us
%indvars.iv29 = phi i64 [ 0, %for.body3.lr.ph.us ], [ %indvars.iv.next30, %for.body3.us ]
%1 = trunc i64 %indvars.iv29 to i32
%add4.us = add i32 %add.us, %1
%idxprom.us = sext i32 %add4.us to i64
%arrayidx.us = getelementptr inbounds i32, i32* %a, i64 %idxprom.us
%2 = load i32, i32* %arrayidx.us, align 4, !llvm.mem.parallel_loop_access !3
%add5.us = add nsw i32 %2, 1
store i32 %add5.us, i32* %arrayidx7.us, align 4, !llvm.mem.parallel_loop_access !3
%indvars.iv.next30 = add i64 %indvars.iv29, 1
%lftr.wideiv31 = trunc i64 %indvars.iv.next30 to i32
%exitcond32 = icmp eq i32 %lftr.wideiv31, %m
br i1 %exitcond32, label %for.end.us, label %for.body3.us, !llvm.loop !4

for.body3.lr.ph.us:                               ; preds = %for.end.us, %entry
%indvars.iv33 = phi i64 [ %indvars.iv.next34, %for.end.us ], [ 0, %entry ]
%3 = trunc i64 %indvars.iv33 to i32
%add.us = add i32 %3, %k
%arrayidx7.us = getelementptr inbounds i32, i32* %a, i64 %indvars.iv33
br label %for.body3.us

for.end15:                                        ; preds = %for.end.us, %entry
ret void
}

attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }

!3 = !{!4, !5}
!4 = !{!4}
!5 = !{!5}
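
; Note (added for exposition, not part of the original test): !4 and !5 are
; self-referential loop identifiers for the inner and outer loop, and !3 lists
; both, so every marked access claims to be parallel in both loops. The store
; through %arrayidx7.us is uniform across the inner loop (it uses the outer
; induction variable only), a cross-iteration dependence, which is why the
; CHECK-NOT above requires that no <4 x i32> vectorization happens despite
; the metadata.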

Some files were not shown because too many files have changed in this diff.