Imported Upstream version 5.18.0.167

Former-commit-id: 289509151e0fee68a1b591a20c9f109c3c789d3a
2018-10-20 08:25:10 +00:00
parent e19d552987
commit b084638f15
28489 changed files with 184 additions and 3866856 deletions
--- a/external/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
+++ b/external/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
@ -1,330 +0,0 @@
-; RUN: opt -mtriple armv7-linux-gnueabihf -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
-; RUN: opt -mtriple armv8-linux-gnu -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
-; RUN: opt -mtriple armv7-unknwon-darwin -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=DARWIN
-; REQUIRES: asserts
-
-; Testing the ability of the loop vectorizer to tell when SIMD is safe or not
-; regarding IEEE 754 standard.
-; On Linux, we only want the vectorizer to work when -ffast-math flag is set,
-; because NEON is not IEEE compliant.
-; Darwin, on the other hand, doesn't support subnormals, and all optimizations
-; are allowed, even without -ffast-math.
-
-; Integer loops are always vectorizable.
-; @sumi stores C[i] = B[i] * A[i] (i32 multiply) for i in [0, N). The loop has
-; no floating-point operations, and the expectations below use the prefix
-; that every RUN line of this test enables.
-; CHECK: Checking a loop in "sumi"
-; CHECK: We can vectorize this loop!
-define void @sumi(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
-entry:
-  %cmp5 = icmp eq i32 %N, 0  ; N == 0: bypass the loop
-  br i1 %cmp5, label %for.end, label %for.body.preheader
-
-for.body.preheader:                               ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]  ; induction variable, starts at 0
-  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06
-  %1 = load i32, i32* %arrayidx1, align 4
-  %mul = mul nsw i32 %1, %0  ; integer multiply: B[i] * A[i]
-  %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06
-  store i32 %mul, i32* %arrayidx2, align 4
-  %inc = add nuw nsw i32 %i.06, 1
-  %exitcond = icmp eq i32 %inc, %N  ; leave once the incremented index equals N
-  br i1 %exitcond, label %for.end.loopexit, label %for.body
-
-for.end.loopexit:                                 ; preds = %for.body
-  br label %for.end
-
-for.end:                                          ; preds = %for.end.loopexit, %entry
-  ret void
-}
-
-; Floating-point loops need fast-math to be vectorizable.
-; @sumf stores C[i] = A[i] * B[i] using an fmul without fast-math flags. The
-; Linux runs expect the vectorizer to reject the loop; the Darwin run expects
-; it to be accepted.
-; LINUX: Checking a loop in "sumf"
-; LINUX: Potentially unsafe FP op prevents vectorization
-; DARWIN: Checking a loop in "sumf"
-; DARWIN: We can vectorize this loop!
-define void @sumf(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
-entry:
-  %cmp5 = icmp eq i32 %N, 0  ; N == 0: bypass the loop
-  br i1 %cmp5, label %for.end, label %for.body.preheader
-
-for.body.preheader:                               ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
-  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06
-  %0 = load float, float* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06
-  %1 = load float, float* %arrayidx1, align 4
-  %mul = fmul float %0, %1  ; no fast-math flags on this FP multiply
-  %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06
-  store float %mul, float* %arrayidx2, align 4
-  %inc = add nuw nsw i32 %i.06, 1
-  %exitcond = icmp eq i32 %inc, %N
-  br i1 %exitcond, label %for.end.loopexit, label %for.body
-
-for.end.loopexit:                                 ; preds = %for.body
-  br label %for.end
-
-for.end:                                          ; preds = %for.end.loopexit, %entry
-  ret void
-}
-
-; Integer loops are always vectorizable.
-; @redi accumulates b[i] * a[i] into an i32 running sum and returns it. The
-; accumulator's incoming value from the preheader (and the result when N == 0)
-; is undef.
-; CHECK: Checking a loop in "redi"
-; CHECK: We can vectorize this loop!
-define i32 @redi(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
-entry:
-  %cmp5 = icmp eq i32 %N, 0
-  br i1 %cmp5, label %for.end, label %for.body.preheader
-
-for.body.preheader:                               ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
-  %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]  ; running sum, seeded with undef
-  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07
-  %1 = load i32, i32* %arrayidx1, align 4
-  %mul = mul nsw i32 %1, %0
-  %add = add nsw i32 %mul, %Red.06  ; integer accumulate
-  %inc = add nuw nsw i32 %i.07, 1
-  %exitcond = icmp eq i32 %inc, %N
-  br i1 %exitcond, label %for.end.loopexit, label %for.body
-
-for.end.loopexit:                                 ; preds = %for.body
-  %add.lcssa = phi i32 [ %add, %for.body ]  ; carries the final sum out of the loop
-  br label %for.end
-
-for.end:                                          ; preds = %for.end.loopexit, %entry
-  %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
-  ret i32 %Red.0.lcssa
-}
-
-; Floating-point loops need fast-math to be vectorizable.
-; @redf accumulates a[i] * b[i] into a float running sum and returns it.
-; Neither the fmul nor the fadd carries fast-math flags. The Linux runs expect
-; rejection; the Darwin run expects the loop to be accepted.
-; LINUX: Checking a loop in "redf"
-; LINUX: Potentially unsafe FP op prevents vectorization
-; DARWIN: Checking a loop in "redf"
-; DARWIN: We can vectorize this loop!
-define float @redf(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
-entry:
-  %cmp5 = icmp eq i32 %N, 0
-  br i1 %cmp5, label %for.end, label %for.body.preheader
-
-for.body.preheader:                               ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
-  %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]  ; running sum, seeded with undef
-  %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07
-  %0 = load float, float* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07
-  %1 = load float, float* %arrayidx1, align 4
-  %mul = fmul float %0, %1  ; no fast-math flags
-  %add = fadd float %Red.06, %mul  ; no fast-math flags on the accumulate either
-  %inc = add nuw nsw i32 %i.07, 1
-  %exitcond = icmp eq i32 %inc, %N
-  br i1 %exitcond, label %for.end.loopexit, label %for.body
-
-for.end.loopexit:                                 ; preds = %for.body
-  %add.lcssa = phi float [ %add, %for.body ]
-  br label %for.end
-
-for.end:                                          ; preds = %for.end.loopexit, %entry
-  %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
-  ret float %Red.0.lcssa
-}
-
-; Make sure calls that turn into builtins are also covered.
-; @fabs stores C[i] = A[i] * fabsf(B[i]). Neither the call nor the fmul
-; carries fast-math flags, so the expectations mirror the other non-fast
-; floating-point loops: rejected on Linux, accepted on Darwin.
-; LINUX: Checking a loop in "fabs"
-; LINUX: Potentially unsafe FP op prevents vectorization
-; DARWIN: Checking a loop in "fabs"
-; DARWIN: We can vectorize this loop!
-define void @fabs(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
-entry:
-  %cmp10 = icmp eq i32 %N, 0
-  br i1 %cmp10, label %for.end, label %for.body
-
-for.body:                                         ; preds = %entry, %for.body
-  %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
-  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.011
-  %0 = load float, float* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.011
-  %1 = load float, float* %arrayidx1, align 4
-  %fabsf = tail call float @fabsf(float %1) #1  ; library call, attribute group #1 (unsafe-fp-math = false)
-  %conv3 = fmul float %0, %fabsf
-  %arrayidx4 = getelementptr inbounds float, float* %C, i32 %i.011
-  store float %conv3, float* %arrayidx4, align 4
-  %inc = add nuw nsw i32 %i.011, 1
-  %exitcond = icmp eq i32 %inc, %N
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body, %entry
-  ret void
-}
-
-; Integer loops are always vectorizable.
-; @sumi_fast has the same integer body as @sumi (C[i] = B[i] * A[i]); it sits
-; alongside the *_fast floating-point variants and is expected to vectorize
-; under every RUN line.
-; CHECK: Checking a loop in "sumi_fast"
-; CHECK: We can vectorize this loop!
-define void @sumi_fast(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
-entry:
-  %cmp5 = icmp eq i32 %N, 0  ; N == 0: bypass the loop
-  br i1 %cmp5, label %for.end, label %for.body.preheader
-
-for.body.preheader:                               ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
-  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06
-  %1 = load i32, i32* %arrayidx1, align 4
-  %mul = mul nsw i32 %1, %0  ; integer multiply, so fast-math flags do not apply
-  %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06
-  store i32 %mul, i32* %arrayidx2, align 4
-  %inc = add nuw nsw i32 %i.06, 1
-  %exitcond = icmp eq i32 %inc, %N
-  br i1 %exitcond, label %for.end.loopexit, label %for.body
-
-for.end.loopexit:                                 ; preds = %for.body
-  br label %for.end
-
-for.end:                                          ; preds = %for.end.loopexit, %entry
-  ret void
-}
-
-; Floating-point loops can be vectorized with fast-math.
-; @sumf_fast stores C[i] = B[i] * A[i] using an fmul marked 'fast', so every
-; RUN line expects the vectorizer to accept the loop.
-; CHECK: Checking a loop in "sumf_fast"
-; CHECK: We can vectorize this loop!
-define void @sumf_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
-entry:
-  %cmp5 = icmp eq i32 %N, 0  ; N == 0: bypass the loop
-  br i1 %cmp5, label %for.end, label %for.body.preheader
-
-for.body.preheader:                               ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
-  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06
-  %0 = load float, float* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06
-  %1 = load float, float* %arrayidx1, align 4
-  %mul = fmul fast float %1, %0  ; 'fast' flag is the only FP difference from @sumf
-  %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06
-  store float %mul, float* %arrayidx2, align 4
-  %inc = add nuw nsw i32 %i.06, 1
-  %exitcond = icmp eq i32 %inc, %N
-  br i1 %exitcond, label %for.end.loopexit, label %for.body
-
-for.end.loopexit:                                 ; preds = %for.body
-  br label %for.end
-
-for.end:                                          ; preds = %for.end.loopexit, %entry
-  ret void
-}
-
-; Integer loops are always vectorizable.
-; @redi_fast has the same integer reduction body as @redi (sum of
-; b[i] * a[i], accumulator seeded with undef) and is expected to vectorize
-; under every RUN line.
-; CHECK: Checking a loop in "redi_fast"
-; CHECK: We can vectorize this loop!
-define i32 @redi_fast(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
-entry:
-  %cmp5 = icmp eq i32 %N, 0
-  br i1 %cmp5, label %for.end, label %for.body.preheader
-
-for.body.preheader:                               ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
-  %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]  ; running sum, seeded with undef
-  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07
-  %1 = load i32, i32* %arrayidx1, align 4
-  %mul = mul nsw i32 %1, %0
-  %add = add nsw i32 %mul, %Red.06  ; integer accumulate
-  %inc = add nuw nsw i32 %i.07, 1
-  %exitcond = icmp eq i32 %inc, %N
-  br i1 %exitcond, label %for.end.loopexit, label %for.body
-
-for.end.loopexit:                                 ; preds = %for.body
-  %add.lcssa = phi i32 [ %add, %for.body ]
-  br label %for.end
-
-for.end:                                          ; preds = %for.end.loopexit, %entry
-  %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
-  ret i32 %Red.0.lcssa
-}
-
-; Floating-point loops can be vectorized with fast-math.
-; @redf_fast accumulates b[i] * a[i] into a float running sum; both the fmul
-; and the fadd are marked 'fast', so every RUN line expects the loop to be
-; accepted.
-; CHECK: Checking a loop in "redf_fast"
-; CHECK: We can vectorize this loop!
-define float @redf_fast(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
-entry:
-  %cmp5 = icmp eq i32 %N, 0
-  br i1 %cmp5, label %for.end, label %for.body.preheader
-
-for.body.preheader:                               ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
-  %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]  ; running sum, seeded with undef
-  %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07
-  %0 = load float, float* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07
-  %1 = load float, float* %arrayidx1, align 4
-  %mul = fmul fast float %1, %0  ; 'fast' multiply
-  %add = fadd fast float %mul, %Red.06  ; 'fast' accumulate
-  %inc = add nuw nsw i32 %i.07, 1
-  %exitcond = icmp eq i32 %inc, %N
-  br i1 %exitcond, label %for.end.loopexit, label %for.body
-
-for.end.loopexit:                                 ; preds = %for.body
-  %add.lcssa = phi float [ %add, %for.body ]
-  br label %for.end
-
-for.end:                                          ; preds = %for.end.loopexit, %entry
-  %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
-  ret float %Red.0.lcssa
-}
-
-; Make sure calls that turn into builtins are also covered.
-; @fabs_fast stores C[i] = fabsf(B[i]) * A[i]; both the call and the fmul are
-; marked 'fast', so every RUN line expects the loop to be accepted.
-; CHECK: Checking a loop in "fabs_fast"
-; CHECK: We can vectorize this loop!
-define void @fabs_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
-entry:
-  %cmp10 = icmp eq i32 %N, 0
-  br i1 %cmp10, label %for.end, label %for.body
-
-for.body:                                         ; preds = %entry, %for.body
-  %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
-  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.011
-  %0 = load float, float* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.011
-  %1 = load float, float* %arrayidx1, align 4
-  %fabsf = tail call fast float @fabsf(float %1) #2  ; 'fast' call, attribute group #2 (unsafe-fp-math = true)
-  %conv3 = fmul fast float %fabsf, %0
-  %arrayidx4 = getelementptr inbounds float, float* %C, i32 %i.011
-  store float %conv3, float* %arrayidx4, align 4
-  %inc = add nuw nsw i32 %i.011, 1
-  %exitcond = icmp eq i32 %inc, %N
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body, %entry
-  ret void
-}
-
-declare float @fabsf(float)
-
-attributes #1 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="true" "use-soft-float"="false" }
--- a/external/llvm/test/Transforms/LoopVectorize/ARM/arm-unroll.ll
+++ b/external/llvm/test/Transforms/LoopVectorize/ARM/arm-unroll.ll
@ -1,71 +0,0 @@
-; RUN: opt < %s  -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -S | FileCheck %s
-; RUN: opt < %s  -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift -S | FileCheck %s --check-prefix=SWIFT
-; RUN: opt < %s  -loop-vectorize -force-vector-width=1 -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift -S | FileCheck %s --check-prefix=SWIFTUNROLL
-
-target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
-target triple = "thumbv7-apple-ios3.0.0"
-
-; @foo sums A[0..n) into an i32 accumulator and returns it (0 when n <= 0).
-; The default run expects exactly one <4 x i32> load before the ret; the
-; -mcpu=swift run expects at least two, presumably because the vector loop is
-; interleaved there -- confirm against the vectorizer's debug output.
-;CHECK-LABEL: @foo(
-;CHECK: load <4 x i32>
-;CHECK-NOT: load <4 x i32>
-;CHECK: ret
-;SWIFT-LABEL: @foo(
-;SWIFT: load <4 x i32>
-;SWIFT: load <4 x i32>
-;SWIFT: ret
-define i32 @foo(i32* nocapture %A, i32 %n) nounwind readonly ssp {
-  %1 = icmp sgt i32 %n, 0  ; only enter the loop for a positive trip count
-  br i1 %1, label %.lr.ph, label %._crit_edge
-
-.lr.ph:                                           ; preds = %0, %.lr.ph
-  %i.02 = phi i32 [ %5, %.lr.ph ], [ 0, %0 ]
-  %sum.01 = phi i32 [ %4, %.lr.ph ], [ 0, %0 ]  ; running sum, starts at 0
-  %2 = getelementptr inbounds i32, i32* %A, i32 %i.02
-  %3 = load i32, i32* %2, align 4
-  %4 = add nsw i32 %3, %sum.01
-  %5 = add nsw i32 %i.02, 1
-  %exitcond = icmp eq i32 %5, %n
-  br i1 %exitcond, label %._crit_edge, label %.lr.ph
-
-._crit_edge:                                      ; preds = %.lr.ph, %0
-  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %4, %.lr.ph ]
-  ret i32 %sum.0.lcssa
-}
-
-; Verify the register limit. On ARM we don't have 16 allocatable registers.
-; The loop keeps several i32 accumulators live at once, each adding the same
-; loaded element. With the vector width forced to 1, the expectations below
-; require exactly one scalar i32 load after the function label, i.e. the loop
-; body must not be replicated.
-;SWIFTUNROLL-LABEL: @register_limit(
-;SWIFTUNROLL: load i32
-;SWIFTUNROLL-NOT: load i32
-define i32 @register_limit(i32* nocapture %A, i32 %n) {
-  %1 = icmp sgt i32 %n, 0
-  br i1 %1, label %.lr.ph, label %._crit_edge
-
-.lr.ph:
-  %i.02 = phi i32 [ %5, %.lr.ph ], [ 0, %0 ]
-  %sum.01 = phi i32 [ %4, %.lr.ph ], [ 0, %0 ]
-  %sum.02 = phi i32 [ %6, %.lr.ph ], [ 0, %0 ]
-  %sum.03 = phi i32 [ %7, %.lr.ph ], [ 0, %0 ]
-  %sum.04 = phi i32 [ %8, %.lr.ph ], [ 0, %0 ]
-  %sum.05 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
-  %sum.06 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]  ; never read inside the loop body (see %10)
-  %2 = getelementptr inbounds i32, i32* %A, i32 %i.02
-  %3 = load i32, i32* %2, align 4  ; the single load every accumulator consumes
-  %4 = add nsw i32 %3, %sum.01
-  %5 = add nsw i32 %i.02, 1
-  %6 = add nsw i32 %3, %sum.02
-  %7 = add nsw i32 %3, %sum.03
-  %8 = add nsw i32 %3, %sum.04
-  %9 = add nsw i32 %3, %sum.05
-  %10 = add nsw i32 %3, %sum.05  ; NOTE(review): reuses %sum.05; possibly meant %sum.06 -- confirm before changing, it may affect the test
-  %exitcond = icmp eq i32 %5, %n
-  br i1 %exitcond, label %._crit_edge, label %.lr.ph
-
-._crit_edge:                                      ; preds = %.lr.ph, %0
-  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %4, %.lr.ph ]
-  %sum.1.lcssa = phi i32 [ 0, %0 ], [ %6, %.lr.ph ]
-  %sum.2.lcssa = phi i32 [ 0, %0 ], [ %7, %.lr.ph ]
-  %sum.4.lcssa = phi i32 [ 0, %0 ], [ %8, %.lr.ph ]
-  %sum.5.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
-  %sum.6.lcssa = phi i32 [ 0, %0 ], [ %10, %.lr.ph ]
-  ret i32 %sum.0.lcssa  ; only the first accumulator is returned; the other exit phis are unused
-}
--- a/external/llvm/test/Transforms/LoopVectorize/ARM/gather-cost.ll
+++ b/external/llvm/test/Transforms/LoopVectorize/ARM/gather-cost.ll
@ -1,88 +0,0 @@
-; RUN: opt -loop-vectorize -mtriple=thumbv7s-apple-ios6.0.0 -S -enable-interleaved-mem-accesses=false < %s | FileCheck %s
-
-target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
-
-@kernel = global [512 x float] zeroinitializer, align 4
-@kernel2 = global [512 x float] zeroinitializer, align 4
-@kernel3 = global [512 x float] zeroinitializer, align 4
-@kernel4 = global [512 x float] zeroinitializer, align 4
-@src_data = global [1536 x float] zeroinitializer, align 4
-@r_ = global i8 0, align 4
-@g_ = global i8 0, align 4
-@b_ = global i8 0, align 4
-
-; We don't want to vectorize most loops containing gathers because they are
-; expensive. This function represents a point where vectorization starts to
-; become beneficial.
-; Make sure we are conservative and don't vectorize it.
-; The loop reads three consecutive floats from @src_data at index
-; 3 * (v + offset), scales each by the same four kernel values, and keeps
-; three separate float sums (r, g, b) that are converted to i8 on exit.
-; CHECK-NOT: <2 x float>
-; CHECK-NOT: <4 x float>
-
-define void @_Z4testmm(i32 %size, i32 %offset) {
-entry:
-  %cmp53 = icmp eq i32 %size, 0
-  br i1 %cmp53, label %for.end, label %for.body.lr.ph
-
-for.body.lr.ph:
-  br label %for.body
-
-for.body:
-  %r.057 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add10, %for.body ]
-  %g.056 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add20, %for.body ]
-  %v.055 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
-  %b.054 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add30, %for.body ]
-  %add = add i32 %v.055, %offset
-  %mul = mul i32 %add, 3  ; index advances by 3 per iteration (non-unit stride into @src_data)
-  %arrayidx = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 %mul
-  %0 = load float, float* %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float]* @kernel, i32 0, i32 %v.055
-  %1 = load float, float* %arrayidx2, align 4
-  %mul3 = fmul fast float %0, %1
-  %arrayidx4 = getelementptr inbounds [512 x float], [512 x float]* @kernel2, i32 0, i32 %v.055
-  %2 = load float, float* %arrayidx4, align 4
-  %mul5 = fmul fast float %mul3, %2
-  %arrayidx6 = getelementptr inbounds [512 x float], [512 x float]* @kernel3, i32 0, i32 %v.055
-  %3 = load float, float* %arrayidx6, align 4
-  %mul7 = fmul fast float %mul5, %3
-  %arrayidx8 = getelementptr inbounds [512 x float], [512 x float]* @kernel4, i32 0, i32 %v.055
-  %4 = load float, float* %arrayidx8, align 4
-  %mul9 = fmul fast float %mul7, %4
-  %add10 = fadd fast float %r.057, %mul9  ; r accumulator
-  %arrayidx.sum = add i32 %mul, 1
-  %arrayidx11 = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 %arrayidx.sum
-  %5 = load float, float* %arrayidx11, align 4
-  %mul13 = fmul fast float %1, %5
-  %mul15 = fmul fast float %2, %mul13
-  %mul17 = fmul fast float %3, %mul15
-  %mul19 = fmul fast float %4, %mul17
-  %add20 = fadd fast float %g.056, %mul19  ; g accumulator
-  %arrayidx.sum52 = add i32 %mul, 2
-  %arrayidx21 = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 %arrayidx.sum52
-  %6 = load float, float* %arrayidx21, align 4
-  %mul23 = fmul fast float %1, %6
-  %mul25 = fmul fast float %2, %mul23
-  %mul27 = fmul fast float %3, %mul25
-  %mul29 = fmul fast float %4, %mul27
-  %add30 = fadd fast float %b.054, %mul29  ; b accumulator
-  %inc = add i32 %v.055, 1
-  %exitcond = icmp ne i32 %inc, %size
-  br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
-
-for.cond.for.end_crit_edge:
-  %add30.lcssa = phi float [ %add30, %for.body ]
-  %add20.lcssa = phi float [ %add20, %for.body ]
-  %add10.lcssa = phi float [ %add10, %for.body ]
-  %phitmp = fptoui float %add10.lcssa to i8  ; narrow each float sum to an unsigned byte
-  %phitmp60 = fptoui float %add20.lcssa to i8
-  %phitmp61 = fptoui float %add30.lcssa to i8
-  br label %for.end
-
-for.end:
-  %r.0.lcssa = phi i8 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
-  %g.0.lcssa = phi i8 [ %phitmp60, %for.cond.for.end_crit_edge ], [ 0, %entry ]
-  %b.0.lcssa = phi i8 [ %phitmp61, %for.cond.for.end_crit_edge ], [ 0, %entry ]
-  store i8 %r.0.lcssa, i8* @r_, align 4  ; publish the results (0 when size == 0)
-  store i8 %g.0.lcssa, i8* @g_, align 4
-  store i8 %b.0.lcssa, i8* @b_, align 4
-  ret void
-}
--- a/external/llvm/test/Transforms/LoopVectorize/ARM/gcc-examples.ll
+++ b/external/llvm/test/Transforms/LoopVectorize/ARM/gcc-examples.ll
@ -1,60 +0,0 @@
-; RUN: opt < %s  -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift -S -dce | FileCheck %s
-
-target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
-target triple = "thumbv7-apple-ios3.0.0"
-
-@b = common global [2048 x i32] zeroinitializer, align 16
-@c = common global [2048 x i32] zeroinitializer, align 16
-@a = common global [2048 x i32] zeroinitializer, align 16
-
-; Select VF = 8;
-; NOTE(review): the expectations below match <4 x i32> operations, so the
-; "VF = 8" remark above looks stale for this target -- confirm.
-; @example1 stores a[i] = c[i] + b[i] for the first 256 elements of the
-; global arrays.
-;CHECK-LABEL: @example1(
-;CHECK: load <4 x i32>
-;CHECK: add nsw <4 x i32>
-;CHECK: store <4 x i32>
-;CHECK: ret void
-define void @example1() nounwind uwtable ssp {
-  br label %1
-
-; <label>:1                                       ; preds = %1, %0
-  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
-  %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv
-  %3 = load i32, i32* %2, align 4
-  %4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv
-  %5 = load i32, i32* %4, align 4
-  %6 = add nsw i32 %5, %3
-  %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
-  store i32 %6, i32* %7, align 4
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32  ; exit test is done on the truncated 32-bit counter
-  %exitcond = icmp eq i32 %lftr.wideiv, 256  ; constant trip count of 256
-  br i1 %exitcond, label %8, label %1
-
-; <label>:8                                       ; preds = %1
-  ret void
-}
-
-; @example10b stores ia[i] = sext(sb[i]) for 1024 iterations, widening i16
-; elements to i32. Only %sb and %ia are used; the other pointer arguments are
-; never referenced in the body.
-;CHECK-LABEL: @example10b(
-;CHECK: load <4 x i16>
-;CHECK: sext <4 x i16>
-;CHECK: store <4 x i32>
-;CHECK: ret void
-define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp {
-  br label %1
-
-; <label>:1                                       ; preds = %1, %0
-  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
-  %2 = getelementptr inbounds i16, i16* %sb, i64 %indvars.iv
-  %3 = load i16, i16* %2, align 2
-  %4 = sext i16 %3 to i32  ; widen the 16-bit element to 32 bits
-  %5 = getelementptr inbounds i32, i32* %ia, i64 %indvars.iv
-  store i32 %4, i32* %5, align 4
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, 1024  ; constant trip count of 1024
-  br i1 %exitcond, label %6, label %1
-
-; <label>:6                                       ; preds = %1
-  ret void
-}
-
--- a/external/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
+++ b/external/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
@ -1,147 +0,0 @@
-; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_2
-; RUN: opt -loop-vectorize -force-vector-width=4 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_4
-; RUN: opt -loop-vectorize -force-vector-width=8 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8
-; RUN: opt -loop-vectorize -force-vector-width=16 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16
-; REQUIRES: asserts
-
-target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
-target triple = "armv8--linux-gnueabihf"
-
-; @i8_factor_2 loads both i8 fields of each two-field struct element and then
-; stores zero to both. The expectations check the per-instruction costs that
-; the vectorizer's debug output reports for VF 8 and VF 16: the whole cost is
-; on the first load and the last store, and 0 on the other member of each pair.
-%i8.2 = type {i8, i8}
-define void @i8_factor_2(%i8.2* %data, i64 %n) {
-entry:
-  br label %for.body
-
-; VF_8-LABEL:  Checking a loop in "i8_factor_2"
-; VF_8:          Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp0, align 1
-; VF_8-NEXT:     Found an estimated cost of 2 for VF 8 For instruction: store i8 0, i8* %tmp1, align 1
-; VF_16-LABEL: Checking a loop in "i8_factor_2"
-; VF_16:         Found an estimated cost of 2 for VF 16 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1
-; VF_16-NEXT:    Found an estimated cost of 2 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %tmp0 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 0  ; address of field 0 of element i
-  %tmp1 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 1  ; address of field 1 of element i
-  %tmp2 = load i8, i8* %tmp0, align 1
-  %tmp3 = load i8, i8* %tmp1, align 1
-  store i8 0, i8* %tmp0, align 1
-  store i8 0, i8* %tmp1, align 1
-  %i.next = add nuw nsw i64 %i, 1
-  %cond = icmp slt i64 %i.next, %n  ; the body runs at least once because the test follows it
-  br i1 %cond, label %for.body, label %for.end
-
-for.end:
-  ret void
-}
-
-; @i16_factor_2 is the i16 variant: it loads both i16 fields of each struct
-; element and stores zero to both. Costs are checked for VF 4, 8 and 16; the
-; expected cost on the first load and last store is 2, 2 and 4 respectively.
-%i16.2 = type {i16, i16}
-define void @i16_factor_2(%i16.2* %data, i64 %n) {
-entry:
-  br label %for.body
-
-; VF_4-LABEL:  Checking a loop in "i16_factor_2"
-; VF_4:          Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2
-; VF_4-NEXT:     Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2
-; VF_8-LABEL:  Checking a loop in "i16_factor_2"
-; VF_8:          Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2
-; VF_8-NEXT:     Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2
-; VF_16-LABEL: Checking a loop in "i16_factor_2"
-; VF_16:         Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2
-; VF_16-NEXT:    Found an estimated cost of 4 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %tmp0 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 0  ; address of field 0 of element i
-  %tmp1 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 1  ; address of field 1 of element i
-  %tmp2 = load i16, i16* %tmp0, align 2
-  %tmp3 = load i16, i16* %tmp1, align 2
-  store i16 0, i16* %tmp0, align 2
-  store i16 0, i16* %tmp1, align 2
-  %i.next = add nuw nsw i64 %i, 1
-  %cond = icmp slt i64 %i.next, %n
-  br i1 %cond, label %for.body, label %for.end
-
-for.end:
-  ret void
-}
-
-; @i32_factor_2 is the i32 variant: it loads both i32 fields of each struct
-; element and stores zero to both. Costs are checked for VF 2, 4, 8 and 16;
-; the expected cost on the first load and last store is 2, 2, 4 and 8.
-%i32.2 = type {i32, i32}
-define void @i32_factor_2(%i32.2* %data, i64 %n) {
-entry:
-  br label %for.body
-
-; VF_2-LABEL:  Checking a loop in "i32_factor_2"
-; VF_2:          Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4
-; VF_2-NEXT:     Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4
-; VF_4-LABEL:  Checking a loop in "i32_factor_2"
-; VF_4:          Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4
-; VF_4-NEXT:     Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4
-; VF_8-LABEL:  Checking a loop in "i32_factor_2"
-; VF_8:          Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4
-; VF_8-NEXT:     Found an estimated cost of 4 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4
-; VF_16-LABEL: Checking a loop in "i32_factor_2"
-; VF_16:         Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4
-; VF_16-NEXT:    Found an estimated cost of 8 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %tmp0 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 0  ; address of field 0 of element i
-  %tmp1 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 1  ; address of field 1 of element i
-  %tmp2 = load i32, i32* %tmp0, align 4
-  %tmp3 = load i32, i32* %tmp1, align 4
-  store i32 0, i32* %tmp0, align 4
-  store i32 0, i32* %tmp1, align 4
-  %i.next = add nuw nsw i64 %i, 1
-  %cond = icmp slt i64 %i.next, %n
-  br i1 %cond, label %for.body, label %for.end
-
-for.end:
-  ret void
-}
-
-; @half_factor_2 is the half-precision variant: it loads both half fields of
-; each struct element and stores zero to both. Costs are checked for VF 4 and
-; VF 8 only, and the expected values (40/32 and 80/64) are far higher than in
-; the integer variants. The zero stored below is written as "0." in the IR
-; but appears as 0xH0000 in the expected debug output.
-%half.2 = type {half, half}
-define void @half_factor_2(%half.2* %data, i64 %n) {
-entry:
-  br label %for.body
-
-; VF_4-LABEL: Checking a loop in "half_factor_2"
-; VF_4:         Found an estimated cost of 40 for VF 4 For instruction: %tmp2 = load half, half* %tmp0, align 2
-; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load half, half* %tmp1, align 2
-; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp0, align 2
-; VF_4-NEXT:    Found an estimated cost of 32 for VF 4 For instruction: store half 0xH0000, half* %tmp1, align 2
-; VF_8-LABEL: Checking a loop in "half_factor_2"
-; VF_8:         Found an estimated cost of 80 for VF 8 For instruction: %tmp2 = load half, half* %tmp0, align 2
-; VF_8-NEXT:    Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load half, half* %tmp1, align 2
-; VF_8-NEXT:    Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp0, align 2
-; VF_8-NEXT:    Found an estimated cost of 64 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %tmp0 = getelementptr inbounds %half.2, %half.2* %data, i64 %i, i32 0  ; address of field 0 of element i
-  %tmp1 = getelementptr inbounds %half.2, %half.2* %data, i64 %i, i32 1  ; address of field 1 of element i
-  %tmp2 = load half, half* %tmp0, align 2
-  %tmp3 = load half, half* %tmp1, align 2
-  store half 0., half* %tmp0, align 2
-  store half 0., half* %tmp1, align 2
-  %i.next = add nuw nsw i64 %i, 1
-  %cond = icmp slt i64 %i.next, %n
-  br i1 %cond, label %for.body, label %for.end
-
-for.end:
-  ret void
-}
--- a/external/llvm/test/Transforms/LoopVectorize/ARM/lit.local.cfg
+++ b/external/llvm/test/Transforms/LoopVectorize/ARM/lit.local.cfg
@ -1,3 +0,0 @@
-if not 'ARM' in config.root.targets:
-    config.unsupported = True
-
--- a/external/llvm/test/Transforms/LoopVectorize/ARM/mul-cast-vect.ll
+++ b/external/llvm/test/Transforms/LoopVectorize/ARM/mul-cast-vect.ll
@ -1,114 +0,0 @@
-; RUN: opt < %s  -cost-model -analyze -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 | FileCheck --check-prefix=COST %s
-; To see the assembly output: llc -mcpu=cortex-a9 < %s | FileCheck --check-prefix=ASM %s
-; ASM lines below are only for reference, tests on that direction should go to tests/CodeGen/ARM
-
-; ModuleID = 'arm.ll'
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
-target triple = "armv7--linux-gnueabihf"
-
-%T216 = type <2 x i16>
-%T232 = type <2 x i32>
-%T264 = type <2 x i64>
-
-%T416 = type <4 x i16>
-%T432 = type <4 x i32>
-%T464 = type <4 x i64>
-
-define void @direct(%T432* %loadaddr, %T432* %loadaddr2, %T432* %storeaddr) {
-; COST: function 'direct':
-  %v0 = load %T432, %T432* %loadaddr
-; ASM: vld1.64
-  %v1 = load %T432, %T432* %loadaddr2
-; ASM: vld1.64
-  %r3 = mul %T432 %v0, %v1 
-; COST: cost of 2 for instruction: {{.*}} mul <4 x i32>
-; ASM: vmul.i32
-  store %T432 %r3, %T432* %storeaddr
-; ASM: vst1.64
-  ret void
-}
-
-define void @ups1632(%T416* %loadaddr, %T416* %loadaddr2, %T432* %storeaddr) {
-; COST: function 'ups1632':
-  %v0 = load %T416, %T416* %loadaddr
-; ASM: vldr
-  %v1 = load %T416, %T416* %loadaddr2
-; ASM: vldr
-  %r1 = sext %T416 %v0 to %T432
-  %r2 = sext %T416 %v1 to %T432
-; COST: cost of 0 for instruction: {{.*}} sext <4 x i16> {{.*}} to <4 x i32>
-  %r3 = mul %T432 %r1, %r2 
-; COST: cost of 2 for instruction: {{.*}} mul <4 x i32>
-; ASM: vmull.s16
-  store %T432 %r3, %T432* %storeaddr
-; ASM: vst1.64
-  ret void
-}
-
-define void @upu1632(%T416* %loadaddr, %T416* %loadaddr2, %T432* %storeaddr) {
-; COST: function 'upu1632':
-  %v0 = load %T416, %T416* %loadaddr
-; ASM: vldr
-  %v1 = load %T416, %T416* %loadaddr2
-; ASM: vldr
-  %r1 = zext %T416 %v0 to %T432
-  %r2 = zext %T416 %v1 to %T432
-; COST: cost of 0 for instruction: {{.*}} zext <4 x i16> {{.*}} to <4 x i32>
-  %r3 = mul %T432 %r1, %r2 
-; COST: cost of 2 for instruction: {{.*}} mul <4 x i32>
-; ASM: vmull.u16
-  store %T432 %r3, %T432* %storeaddr
-; ASM: vst1.64
-  ret void
-}
-
-define void @ups3264(%T232* %loadaddr, %T232* %loadaddr2, %T264* %storeaddr) {
-; COST: function 'ups3264':
-  %v0 = load %T232, %T232* %loadaddr
-; ASM: vldr
-  %v1 = load %T232, %T232* %loadaddr2
-; ASM: vldr
-  %r3 = mul %T232 %v0, %v1 
-; ASM: vmul.i32
-; COST: cost of 1 for instruction: {{.*}} mul <2 x i32>
-  %st = sext %T232 %r3 to %T264
-; ASM: vmovl.s32
-; COST: cost of 1 for instruction: {{.*}} sext <2 x i32> {{.*}} to <2 x i64>
-  store %T264 %st, %T264* %storeaddr
-; ASM: vst1.64
-  ret void
-}
-
-define void @upu3264(%T232* %loadaddr, %T232* %loadaddr2, %T264* %storeaddr) {
-; COST: function 'upu3264':
-  %v0 = load %T232, %T232* %loadaddr
-; ASM: vldr
-  %v1 = load %T232, %T232* %loadaddr2
-; ASM: vldr
-  %r3 = mul %T232 %v0, %v1 
-; ASM: vmul.i32
-; COST: cost of 1 for instruction: {{.*}} mul <2 x i32>
-  %st = zext %T232 %r3 to %T264
-; ASM: vmovl.u32
-; COST: cost of 1 for instruction: {{.*}} zext <2 x i32> {{.*}} to <2 x i64>
-  store %T264 %st, %T264* %storeaddr
-; ASM: vst1.64
-  ret void
-}
-
-define void @dn3216(%T432* %loadaddr, %T432* %loadaddr2, %T416* %storeaddr) {
-; COST: function 'dn3216':
-  %v0 = load %T432, %T432* %loadaddr
-; ASM: vld1.64
-  %v1 = load %T432, %T432* %loadaddr2
-; ASM: vld1.64
-  %r3 = mul %T432 %v0, %v1 
-; ASM: vmul.i32
-; COST: cost of 2 for instruction: {{.*}} mul <4 x i32>
-  %st = trunc %T432 %r3 to %T416
-; ASM: vmovn.i32
-; COST: cost of 1 for instruction: {{.*}} trunc <4 x i32> {{.*}} to <4 x i16>
-  store %T416 %st, %T416* %storeaddr
-; ASM: vstr
-  ret void
-}
--- a/external/llvm/test/Transforms/LoopVectorize/ARM/vector_cast.ll
+++ b/external/llvm/test/Transforms/LoopVectorize/ARM/vector_cast.ll
@ -1,37 +0,0 @@
-; RUN: opt -loop-vectorize -tbaa -S -mattr=+neon < %s | FileCheck %s
-
-target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
-target triple = "armv7--linux-gnueabi"
-
-; This requires the loop vectorizer to create an interleaved access group
-; for the stores to the struct. Here we need to perform a bitcast from a vector
-; of pointers to a vector i32s.
-
-%class.A = type { i8*, i32 }
-
-; CHECK-LABEL: test0
-define void @test0(%class.A* %StartPtr, %class.A* %APtr) {
-entry:
-  br label %for.body.i
-
-for.body.i:
-  %addr = phi %class.A* [ %StartPtr, %entry ], [ %incdec.ptr.i, %for.body.i ]
-  %Data.i.i = getelementptr inbounds %class.A, %class.A* %addr, i32 0, i32 0
-  store i8* null, i8** %Data.i.i, align 4, !tbaa !8
-  %Length.i.i = getelementptr inbounds %class.A, %class.A* %addr, i32 0, i32 1
-  store i32 0, i32* %Length.i.i, align 4, !tbaa !11
-  %incdec.ptr.i = getelementptr inbounds %class.A, %class.A* %addr, i32 1
-  %cmp.i = icmp eq %class.A* %incdec.ptr.i, %APtr
-  br i1 %cmp.i, label %exit, label %for.body.i
-
-exit:
-  ret void
-}
-
-!5 = !{!"any pointer", !6, i64 0}
-!6 = !{!"omnipotent char", !7, i64 0}
-!7 = !{!"Simple C/C++ TBAA"}
-!8 = !{!9, !5, i64 0}
-!9 = !{!"some struct", !5, i64 0, !10, i64 4}
-!10 = !{!"int", !6, i64 0}
-!11 = !{!9, !10, i64 4}
--- a/external/llvm/test/Transforms/LoopVectorize/ARM/width-detect.ll
+++ b/external/llvm/test/Transforms/LoopVectorize/ARM/width-detect.ll
@ -1,52 +0,0 @@
-; RUN: opt < %s  -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -S | FileCheck %s
-
-target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
-target triple = "thumbv7-apple-ios3.0.0"
-
-;CHECK:foo_F32
-;CHECK: <4 x float>
-;CHECK:ret
-define float @foo_F32(float* nocapture %A, i32 %n) nounwind uwtable readonly ssp {
-  %1 = icmp sgt i32 %n, 0
-  br i1 %1, label %.lr.ph, label %._crit_edge
-
-.lr.ph:                                           ; preds = %0, %.lr.ph
-  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
-  %prod.01 = phi float [ %4, %.lr.ph ], [ 0.000000e+00, %0 ]
-  %2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
-  %3 = load float, float* %2, align 8
-  %4 = fmul fast float %prod.01, %3
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %n
-  br i1 %exitcond, label %._crit_edge, label %.lr.ph
-
-._crit_edge:                                      ; preds = %.lr.ph, %0
-  %prod.0.lcssa = phi float [ 0.000000e+00, %0 ], [ %4, %.lr.ph ]
-  ret float %prod.0.lcssa
-}
-
-;CHECK:foo_I8
-;CHECK: xor <16 x i8>
-;CHECK:ret
-define signext i8 @foo_I8(i8* nocapture %A, i32 %n) nounwind uwtable readonly ssp {
-  %1 = icmp sgt i32 %n, 0
-  br i1 %1, label %.lr.ph, label %._crit_edge
-
-.lr.ph:                                           ; preds = %0, %.lr.ph
-  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
-  %red.01 = phi i8 [ %4, %.lr.ph ], [ 0, %0 ]
-  %2 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv
-  %3 = load i8, i8* %2, align 1
-  %4 = xor i8 %3, %red.01
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %n
-  br i1 %exitcond, label %._crit_edge, label %.lr.ph
-
-._crit_edge:                                      ; preds = %.lr.ph, %0
-  %red.0.lcssa = phi i8 [ 0, %0 ], [ %4, %.lr.ph ]
-  ret i8 %red.0.lcssa
-}
-
-