Imported Upstream version 5.18.0.207

Former-commit-id: 3b152f462918d427ce18620a2cbe4f8b79650449
2018-11-17 08:23:10 +00:00
parent 8e12397d70
commit eb85e2fc17
28480 changed files with 72 additions and 3866936 deletions
--- a/external/llvm/test/Transforms/LoopVectorize/12-12-11-if-conv.ll
+++ b/external/llvm/test/Transforms/LoopVectorize/12-12-11-if-conv.ll
@@ -1,39 +0,0 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-if-conversion -dce -instcombine -S | FileCheck %s
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-
-;CHECK-LABEL: @foo(
-;CHECK: icmp eq <4 x i32>
-;CHECK: select <4 x i1>
-;CHECK: ret i32
-define i32 @foo(i32 %x, i32 %t, i32* nocapture %A) nounwind uwtable ssp {
-entry:
-  %cmp10 = icmp sgt i32 %x, 0
-  br i1 %cmp10, label %for.body, label %for.end
-
-for.body:                                         ; preds = %entry, %if.end
-  %indvars.iv = phi i64 [ %indvars.iv.next, %if.end ], [ 0, %entry ]
-  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
-  %0 = load i32, i32* %arrayidx, align 4
-  %tobool = icmp eq i32 %0, 0
-  br i1 %tobool, label %if.end, label %if.then
-
-if.then:                                          ; preds = %for.body
-  %1 = add nsw i64 %indvars.iv, 45
-  %2 = trunc i64 %indvars.iv to i32
-  %mul = mul nsw i32 %2, %t
-  %3 = trunc i64 %1 to i32
-  %add1 = add nsw i32 %3, %mul
-  br label %if.end
-
-if.end:                                           ; preds = %for.body, %if.then
-  %z.0 = phi i32 [ %add1, %if.then ], [ 9, %for.body ]
-  store i32 %z.0, i32* %arrayidx, align 4
-  %indvars.iv.next = add nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %x
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %if.end, %entry
-  ret i32 undef
-}
--- a/external/llvm/test/Transforms/LoopVectorize/2012-10-20-infloop.ll
+++ b/external/llvm/test/Transforms/LoopVectorize/2012-10-20-infloop.ll
@@ -1,71 +0,0 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce
-
-; Check that we don't fall into an infinite loop.
-define void @test() nounwind {
-entry:
- br label %for.body
-
-for.body:
- %0 = phi i32 [ 1, %entry ], [ 0, %for.body ]
- br label %for.body
-}
-
-
-
-define void @test2() nounwind {
-entry:
- br label %for.body
-
-for.body:                                         ; preds = %for.body, %entry
- %indvars.iv47 = phi i64 [ 0, %entry ], [ %indvars.iv.next48, %for.body ]
- %0 = phi i32 [ 1, %entry ], [ 0, %for.body ]
- %indvars.iv.next48 = add i64 %indvars.iv47, 1
- br i1 undef, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body
- unreachable
-}
-
-;PR14701
-define void @start_model_rare() nounwind uwtable ssp {
-entry:
-  br i1 undef, label %return, label %if.end
-
-if.end:                                           ; preds = %entry
-  br i1 undef, label %cond.false, label %cond.true
-
-cond.true:                                        ; preds = %if.end
-  unreachable
-
-cond.false:                                       ; preds = %if.end
-  br i1 undef, label %cond.false28, label %cond.true20
-
-cond.true20:                                      ; preds = %cond.false
-  unreachable
-
-cond.false28:                                     ; preds = %cond.false
-  br label %for.body40
-
-for.body40:                                       ; preds = %for.inc50, %cond.false28
-  %indvars.iv123 = phi i64 [ 3, %cond.false28 ], [ %indvars.iv.next124, %for.inc50 ]
-  %step.0121 = phi i32 [ 1, %cond.false28 ], [ %step.1, %for.inc50 ]
-  br i1 undef, label %if.then46, label %for.inc50
-
-if.then46:                                        ; preds = %for.body40
-  %inc47 = add nsw i32 %step.0121, 1
-  br label %for.inc50
-
-for.inc50:                                        ; preds = %if.then46, %for.body40
-  %k.1 = phi i32 [ undef, %for.body40 ], [ %inc47, %if.then46 ]
-  %step.1 = phi i32 [ %step.0121, %for.body40 ], [ %inc47, %if.then46 ]
-  %indvars.iv.next124 = add i64 %indvars.iv123, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next124 to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, 256
-  br i1 %exitcond, label %for.end52, label %for.body40
-
-for.end52:                                        ; preds = %for.inc50
-  unreachable
-
-return:                                           ; preds = %entry
-  ret void
-}
--- a/external/llvm/test/Transforms/LoopVectorize/2012-10-22-isconsec.ll
+++ b/external/llvm/test/Transforms/LoopVectorize/2012-10-22-isconsec.ll
@@ -1,53 +0,0 @@
-; RUN: opt < %s  -loop-vectorize -dce -force-vector-interleave=1 -force-vector-width=4 
-
-; Check that we don't crash.
-
-target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-
-module asm "\09.ident\09\22GCC: (GNU) 4.6.3 LLVM: 3.2svn\22"
-
-@b = common global [32000 x float] zeroinitializer, align 16
-
-define i32 @set1ds(i32 %_n, float* nocapture %arr, float %value, i32 %stride) nounwind uwtable {
-entry:
-  %0 = icmp sgt i32 %_n, 0
-  br i1 %0, label %"3.lr.ph", label %"5"
-
-"3.lr.ph":                                        ; preds = %entry
-  %1 = bitcast float* %arr to i8*
-  %2 = sext i32 %stride to i64
-  br label %"3"
-
-"3":                                              ; preds = %"3.lr.ph", %"3"
-  %indvars.iv = phi i64 [ 0, %"3.lr.ph" ], [ %indvars.iv.next, %"3" ]
-  %3 = shl nsw i64 %indvars.iv, 2
-  %4 = getelementptr inbounds i8, i8* %1, i64 %3
-  %5 = bitcast i8* %4 to float*
-  store float %value, float* %5, align 4
-  %indvars.iv.next = add i64 %indvars.iv, %2
-  %6 = trunc i64 %indvars.iv.next to i32
-  %7 = icmp slt i32 %6, %_n
-  br i1 %7, label %"3", label %"5"
-
-"5":                                              ; preds = %"3", %entry
-  ret i32 0
-}
-
-define i32 @init(i8* nocapture %name) unnamed_addr nounwind uwtable {
-entry:
-  br label %"3"
-
-"3":                                              ; preds = %"3", %entry
-  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %"3" ]
-  %0 = shl nsw i64 %indvars.iv, 2
-  %1 = getelementptr inbounds i8, i8* bitcast (float* getelementptr inbounds ([32000 x float], [32000 x float]* @b, i64 0, i64 16000) to i8*), i64 %0
-  %2 = bitcast i8* %1 to float*
-  store float -1.000000e+00, float* %2, align 4
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, 16000
-  br i1 %exitcond, label %"5", label %"3"
-
-"5":                                              ; preds = %"3"
-  ret i32 0
-}
--- a/external/llvm/test/Transforms/LoopVectorize/2016-07-27-loop-vec.ll
+++ b/external/llvm/test/Transforms/LoopVectorize/2016-07-27-loop-vec.ll
@@ -1,19 +0,0 @@
-; RUN: opt < %s -loop-vectorize -S
-
-define void @foo() local_unnamed_addr {
-entry:
-  %exitcond = icmp eq i64 3, 3
-  br label %for.body
-
-for.body:                                         ; preds = %entry
-  %i.05 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
-  %total1 = add nsw i64 %i.05, 3
-  %inc = add nuw nsw i64 %i.05, 1
-  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
-
-for.end:                                          ; preds = %for.body
-  ret void
-}
-
-!0 = distinct !{!0, !1}
-!1 = !{!"llvm.loop.vectorize.enable", i1 true}
--- a/external/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
+++ b/external/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
@@ -1,79 +0,0 @@
-; REQUIRES: asserts
-; RUN: opt < %s -loop-vectorize -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=COST
-; RUN: opt < %s -loop-vectorize -force-vector-width=2 -instcombine -simplifycfg -S | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64--linux-gnu"
-
-; This test checks that we correctly compute the scalarized operands for a
-; user-specified vectorization factor when interleaving is disabled. We use the
-; "optsize" attribute to disable all interleaving calculations.  A cost of 4
-; for %tmp4 indicates that we would scalarize it's operand (%tmp3), giving
-; %tmp4 a lower scalarization overhead.
-;
-; COST-LABEL:  predicated_udiv_scalarized_operand
-; COST:        LV: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i64 %tmp2, %tmp3
-;
-; CHECK-LABEL: @predicated_udiv_scalarized_operand(
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %entry ], [ [[INDEX_NEXT:%.*]], %[[PRED_UDIV_CONTINUE2:.*]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, %entry ], [ [[TMP17:%.*]], %[[PRED_UDIV_CONTINUE2]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i64, i64* %a, i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[TMP0]] to <2 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt <2 x i64> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
-; CHECK-NEXT:    br i1 [[TMP3]], label %[[PRED_UDIV_IF:.*]], label %[[PRED_UDIV_CONTINUE:.*]]
-; CHECK:       [[PRED_UDIV_IF]]:
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i64 [[TMP4]], %x
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = udiv i64 [[TMP6]], [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> undef, i64 [[TMP7]], i32 0
-; CHECK-NEXT:    br label %[[PRED_UDIV_CONTINUE]]
-; CHECK:       [[PRED_UDIV_CONTINUE]]:
-; CHECK-NEXT:    [[TMP9:%.*]] = phi <2 x i64> [ undef, %vector.body ], [ [[TMP8]], %[[PRED_UDIV_IF]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
-; CHECK-NEXT:    br i1 [[TMP10]], label %[[PRED_UDIV_IF1:.*]], label %[[PRED_UDIV_CONTINUE2]]
-; CHECK:       [[PRED_UDIV_IF1]]:
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1
-; CHECK-NEXT:    [[TMP12:%.*]] = add nsw i64 [[TMP11]], %x
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1
-; CHECK-NEXT:    [[TMP14:%.*]] = udiv i64 [[TMP13]], [[TMP12]]
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP14]], i32 1
-; CHECK-NEXT:    br label %[[PRED_UDIV_CONTINUE2]]
-; CHECK:       [[PRED_UDIV_CONTINUE2]]:
-; CHECK-NEXT:    [[TMP16:%.*]] = phi <2 x i64> [ [[TMP9]], %[[PRED_UDIV_CONTINUE]] ], [ [[TMP15]], %[[PRED_UDIV_IF1]] ]
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i64> [[TMP16]], <2 x i64> [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP17]] = add <2 x i64> [[VEC_PHI]], [[PREDPHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
-; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
-;
-define i64 @predicated_udiv_scalarized_operand(i64* %a, i64 %x) optsize {
-entry:
-  br label %for.body
-
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
-  %r = phi i64 [ 0, %entry ], [ %tmp6, %for.inc ]
-  %tmp0 = getelementptr inbounds i64, i64* %a, i64 %i
-  %tmp2 = load i64, i64* %tmp0, align 4
-  %cond0 = icmp sgt i64 %tmp2, 0
-  br i1 %cond0, label %if.then, label %for.inc
-
-if.then:
-  %tmp3 = add nsw i64 %tmp2, %x
-  %tmp4 = udiv i64 %tmp2, %tmp3
-  br label %for.inc
-
-for.inc:
-  %tmp5 = phi i64 [ %tmp2, %for.body ], [ %tmp4, %if.then]
-  %tmp6 = add i64 %r, %tmp5
-  %i.next = add nuw nsw i64 %i, 1
-  %cond1 = icmp slt i64 %i.next, 100
-  br i1 %cond1, label %for.body, label %for.end
-
-for.end:
-  %tmp7 = phi i64 [ %tmp6, %for.inc ]
-  ret i64 %tmp7
-}
--- a/external/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-unroll.ll
+++ b/external/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-unroll.ll
@@ -1,42 +0,0 @@
-; RUN: opt < %s -loop-vectorize -mtriple=aarch64-none-linux-gnu -mattr=+neon -S | FileCheck %s
-target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
-
-; Function Attrs: nounwind
-define i32* @array_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* %c, i32 %size) {
-;CHECK-LABEL: array_add
-;CHECK: load <4 x i32>
-;CHECK: load <4 x i32>
-;CHECK: load <4 x i32>
-;CHECK: load <4 x i32>
-;CHECK: add nsw <4 x i32>
-;CHECK: add nsw <4 x i32>
-;CHECK: store <4 x i32>
-;CHECK: store <4 x i32>
-;CHECK: ret
-entry:
-  %cmp10 = icmp sgt i32 %size, 0
-  br i1 %cmp10, label %for.body.preheader, label %for.end
-
-for.body.preheader:                               ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
-  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
-  %1 = load i32, i32* %arrayidx2, align 4
-  %add = add nsw i32 %1, %0
-  %arrayidx4 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
-  store i32 %add, i32* %arrayidx4, align 4
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %size
-  br i1 %exitcond, label %for.end.loopexit, label %for.body
-
-for.end.loopexit:                                 ; preds = %for.body
-  br label %for.end
-
-for.end:                                          ; preds = %for.end.loopexit, %entry
-  ret i32* %c
-}
--- a/external/llvm/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll
+++ b/external/llvm/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll
@@ -1,147 +0,0 @@
-; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 | FileCheck %s
-; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 | FileCheck %s --check-prefix=FORCE-VEC
-
-target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64--linux-gnueabi"
-
-; Test integer induction variable of step 2:
-;   for (int i = 0; i < 1024; i+=2) {
-;     int tmp = *A++;
-;     sum += i * tmp;
-;   }
-
-; CHECK-LABEL: @ind_plus2(
-; CHECK: load <4 x i32>, <4 x i32>*
-; CHECK: load <4 x i32>, <4 x i32>*
-; CHECK: mul nsw <4 x i32>
-; CHECK: mul nsw <4 x i32>
-; CHECK: add nsw <4 x i32>
-; CHECK: add nsw <4 x i32>
-; CHECK: %index.next = add i64 %index, 8
-; CHECK: icmp eq i64 %index.next, 512
-
-; FORCE-VEC-LABEL: @ind_plus2(
-; FORCE-VEC: %wide.load = load <2 x i32>, <2 x i32>*
-; FORCE-VEC: mul nsw <2 x i32>
-; FORCE-VEC: add nsw <2 x i32>
-; FORCE-VEC: %index.next = add i64 %index, 2
-; FORCE-VEC: icmp eq i64 %index.next, 512
-define i32 @ind_plus2(i32* %A) {
-entry:
-  br label %for.body
-
-for.body:                                         ; preds = %entry, %for.body
-  %A.addr = phi i32* [ %A, %entry ], [ %inc.ptr, %for.body ]
-  %i = phi i32 [ 0, %entry ], [ %add1, %for.body ]
-  %sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
-  %inc.ptr = getelementptr inbounds i32, i32* %A.addr, i64 1
-  %0 = load i32, i32* %A.addr, align 4
-  %mul = mul nsw i32 %0, %i
-  %add = add nsw i32 %mul, %sum
-  %add1 = add nsw i32 %i, 2
-  %cmp = icmp slt i32 %add1, 1024
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body
-  %add.lcssa = phi i32 [ %add, %for.body ]
-  ret i32 %add.lcssa
-}
-
-
-; Test integer induction variable of step -2:
-;   for (int i = 1024; i > 0; i-=2) {
-;     int tmp = *A++;
-;     sum += i * tmp;
-;   }
-
-; CHECK-LABEL: @ind_minus2(
-; CHECK: load <4 x i32>, <4 x i32>*
-; CHECK: load <4 x i32>, <4 x i32>*
-; CHECK: mul nsw <4 x i32>
-; CHECK: mul nsw <4 x i32>
-; CHECK: add nsw <4 x i32>
-; CHECK: add nsw <4 x i32>
-; CHECK: %index.next = add i64 %index, 8
-; CHECK: icmp eq i64 %index.next, 512
-
-; FORCE-VEC-LABEL: @ind_minus2(
-; FORCE-VEC: %wide.load = load <2 x i32>, <2 x i32>*
-; FORCE-VEC: mul nsw <2 x i32>
-; FORCE-VEC: add nsw <2 x i32>
-; FORCE-VEC: %index.next = add i64 %index, 2
-; FORCE-VEC: icmp eq i64 %index.next, 512
-define i32 @ind_minus2(i32* %A) {
-entry:
-  br label %for.body
-
-for.body:                                         ; preds = %entry, %for.body
-  %A.addr = phi i32* [ %A, %entry ], [ %inc.ptr, %for.body ]
-  %i = phi i32 [ 1024, %entry ], [ %sub, %for.body ]
-  %sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
-  %inc.ptr = getelementptr inbounds i32, i32* %A.addr, i64 1
-  %0 = load i32, i32* %A.addr, align 4
-  %mul = mul nsw i32 %0, %i
-  %add = add nsw i32 %mul, %sum
-  %sub = add nsw i32 %i, -2
-  %cmp = icmp sgt i32 %i, 2
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body
-  %add.lcssa = phi i32 [ %add, %for.body ]
-  ret i32 %add.lcssa
-}
-
-
-; Test pointer induction variable of step 2. As currently we don't support
-; masked load/store, vectorization is possible but not beneficial. If loop
-; vectorization is not enforced, LV will only do interleave.
-;   for (int i = 0; i < 1024; i++) {
-;     int tmp0 = *A++;
-;     int tmp1 = *A++;
-;     sum += tmp0 * tmp1;
-;   }
-
-; CHECK-LABEL: @ptr_ind_plus2(
-; CHECK: %[[V0:.*]] = load <8 x i32>
-; CHECK: %[[V1:.*]] = load <8 x i32>
-; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK: mul nsw <4 x i32>
-; CHECK: mul nsw <4 x i32>
-; CHECK: add nsw <4 x i32>
-; CHECK: add nsw <4 x i32>
-; CHECK: %index.next = add i64 %index, 8
-; CHECK: icmp eq i64 %index.next, 1024
-
-; FORCE-VEC-LABEL: @ptr_ind_plus2(
-; FORCE-VEC: %[[V:.*]] = load <4 x i32>
-; FORCE-VEC: shufflevector <4 x i32> %[[V]], <4 x i32> undef, <2 x i32> <i32 0, i32 2>
-; FORCE-VEC: shufflevector <4 x i32> %[[V]], <4 x i32> undef, <2 x i32> <i32 1, i32 3>
-; FORCE-VEC: mul nsw <2 x i32>
-; FORCE-VEC: add nsw <2 x i32>
-; FORCE-VEC: %index.next = add i64 %index, 2
-; FORCE-VEC: icmp eq i64 %index.next, 1024
-define i32 @ptr_ind_plus2(i32* %A) {
-entry:
-  br label %for.body
-
-for.body:                                         ; preds = %for.body, %entry
-  %A.addr = phi i32* [ %A, %entry ], [ %inc.ptr1, %for.body ]
-  %sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
-  %i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
-  %inc.ptr = getelementptr inbounds i32, i32* %A.addr, i64 1
-  %0 = load i32, i32* %A.addr, align 4
-  %inc.ptr1 = getelementptr inbounds i32, i32* %A.addr, i64 2
-  %1 = load i32, i32* %inc.ptr, align 4
-  %mul = mul nsw i32 %1, %0
-  %add = add nsw i32 %mul, %sum
-  %inc = add nsw i32 %i, 1
-  %exitcond = icmp eq i32 %inc, 1024
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body
-  %add.lcssa = phi i32 [ %add, %for.body ]
-  ret i32 %add.lcssa
-}
--- a/external/llvm/test/Transforms/LoopVectorize/AArch64/arm64-unroll.ll
+++ b/external/llvm/test/Transforms/LoopVectorize/AArch64/arm64-unroll.ll
@@ -1,42 +0,0 @@
-; RUN: opt < %s -loop-vectorize -mtriple=arm64-none-linux-gnu -mattr=+neon -S | FileCheck %s
-target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
-
-; Function Attrs: nounwind
-define i32* @array_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* %c, i32 %size) {
-;CHECK-LABEL: array_add
-;CHECK: load <4 x i32>
-;CHECK: load <4 x i32>
-;CHECK: load <4 x i32>
-;CHECK: load <4 x i32>
-;CHECK: add nsw <4 x i32>
-;CHECK: add nsw <4 x i32>
-;CHECK: store <4 x i32>
-;CHECK: store <4 x i32>
-;CHECK: ret
-entry:
-  %cmp10 = icmp sgt i32 %size, 0
-  br i1 %cmp10, label %for.body.preheader, label %for.end
-
-for.body.preheader:                               ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
-  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
-  %1 = load i32, i32* %arrayidx2, align 4
-  %add = add nsw i32 %1, %0
-  %arrayidx4 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
-  store i32 %add, i32* %arrayidx4, align 4
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %size
-  br i1 %exitcond, label %for.end.loopexit, label %for.body
-
-for.end.loopexit:                                 ; preds = %for.body
-  br label %for.end
-
-for.end:                                          ; preds = %for.end.loopexit, %entry
-  ret i32* %c
-}
--- a/external/llvm/test/Transforms/LoopVectorize/AArch64/backedge-overflow.ll
+++ b/external/llvm/test/Transforms/LoopVectorize/AArch64/backedge-overflow.ll
@@ -1,166 +0,0 @@
-; RUN: opt -mtriple=aarch64--linux-gnueabi -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s -S | FileCheck %s
-
-; The following tests contain loops for which SCEV cannot determine the backedge
-; taken count. This is because the backedge taken condition is produced by an
-; icmp with one of the sides being a loop varying non-AddRec expression.
-; However, there is a possibility to normalize this to an AddRec expression
-; using SCEV predicates. This allows us to compute a 'guarded' backedge count.
-; The Loop Vectorizer is able to version to loop in order to use this guarded
-; backedge count and vectorize more loops.
-
-
-; CHECK-LABEL: test_sge
-; CHECK-LABEL: vector.scevcheck
-; CHECK-LABEL: vector.body
-define void @test_sge(i32* noalias %A,
-                      i32* noalias %B,
-                      i32* noalias %C, i32 %N) {
-entry:
-  %cmp13 = icmp eq i32 %N, 0
-  br i1 %cmp13, label %for.end, label %for.body.preheader
-
-for.body.preheader:
-  br label %for.body
-
-for.body:
-  %indvars.iv = phi i16 [ %indvars.next, %for.body ], [ 0, %for.body.preheader ]
-  %indvars.next = add i16 %indvars.iv, 1
-  %indvars.ext = zext i16 %indvars.iv to i32
-
-  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %indvars.ext
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %indvars.ext
-  %1 = load i32, i32* %arrayidx3, align 4
-
-  %mul4 = mul i32 %1, %0
-
-  %arrayidx7 = getelementptr inbounds i32, i32* %A, i32 %indvars.ext
-  store i32 %mul4, i32* %arrayidx7, align 4
-
-  %exitcond = icmp sge i32 %indvars.ext, %N
-  br i1 %exitcond, label %for.end.loopexit, label %for.body
-
-for.end.loopexit:
-  br label %for.end
-
-for.end:
-  ret void
-}
-
-; CHECK-LABEL: test_uge
-; CHECK-LABEL: vector.scevcheck
-; CHECK-LABEL: vector.body
-define void @test_uge(i32* noalias %A,
-                      i32* noalias %B,
-                      i32* noalias %C, i32 %N, i32 %Offset) {
-entry:
-  %cmp13 = icmp eq i32 %N, 0
-  br i1 %cmp13, label %for.end, label %for.body.preheader
-
-for.body.preheader:
-  br label %for.body
-
-for.body:
-  %indvars.iv = phi i16 [ %indvars.next, %for.body ], [ 0, %for.body.preheader ]
-  %indvars.next = add i16 %indvars.iv, 1
-
-  %indvars.ext = sext i16 %indvars.iv to i32
-  %indvars.access = add i32 %Offset, %indvars.ext
-
-  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %indvars.access
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %indvars.access
-  %1 = load i32, i32* %arrayidx3, align 4
-
-  %mul4 = add i32 %1, %0
-
-  %arrayidx7 = getelementptr inbounds i32, i32* %A, i32 %indvars.access
-  store i32 %mul4, i32* %arrayidx7, align 4
-
-  %exitcond = icmp uge i32 %indvars.ext, %N
-  br i1 %exitcond, label %for.end.loopexit, label %for.body
-
-for.end.loopexit:
-  br label %for.end
-
-for.end:
-  ret void
-}
-
-; CHECK-LABEL: test_ule
-; CHECK-LABEL: vector.scevcheck
-; CHECK-LABEL: vector.body
-define void @test_ule(i32* noalias %A,
-                      i32* noalias %B,
-                      i32* noalias %C, i32 %N,
-                      i16 %M) {
-entry:
-  %cmp13 = icmp eq i32 %N, 0
-  br i1 %cmp13, label %for.end, label %for.body.preheader
-
-for.body.preheader:
-  br label %for.body
-
-for.body:
-  %indvars.iv = phi i16 [ %indvars.next, %for.body ], [ %M, %for.body.preheader ]
-  %indvars.next = sub i16 %indvars.iv, 1
-  %indvars.ext = zext i16 %indvars.iv to i32
-
-  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %indvars.ext
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %indvars.ext
-  %1 = load i32, i32* %arrayidx3, align 4
-
-  %mul4 = mul i32 %1, %0
-
-  %arrayidx7 = getelementptr inbounds i32, i32* %A, i32 %indvars.ext
-  store i32 %mul4, i32* %arrayidx7, align 4
-
-  %exitcond = icmp ule i32 %indvars.ext, %N
-  br i1 %exitcond, label %for.end.loopexit, label %for.body
-
-for.end.loopexit:
-  br label %for.end
-
-for.end:
-  ret void
-}
-
-; CHECK-LABEL: test_sle
-; CHECK-LABEL: vector.scevcheck
-; CHECK-LABEL: vector.body
-define void @test_sle(i32* noalias %A,
-                   i32* noalias %B,
-                   i32* noalias %C, i32 %N,
-                   i16 %M) {
-entry:
-  %cmp13 = icmp eq i32 %N, 0
-  br i1 %cmp13, label %for.end, label %for.body.preheader
-
-for.body.preheader:
-  br label %for.body
-
-for.body:
-  %indvars.iv = phi i16 [ %indvars.next, %for.body ], [ %M, %for.body.preheader ]
-  %indvars.next = sub i16 %indvars.iv, 1
-  %indvars.ext = sext i16 %indvars.iv to i32
-
-  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %indvars.ext
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %indvars.ext
-  %1 = load i32, i32* %arrayidx3, align 4
-
-  %mul4 = mul i32 %1, %0
-
-  %arrayidx7 = getelementptr inbounds i32, i32* %A, i32 %indvars.ext
-  store i32 %mul4, i32* %arrayidx7, align 4
-
-  %exitcond = icmp sle i32 %indvars.ext, %N
-  br i1 %exitcond, label %for.end.loopexit, label %for.body
-
-for.end.loopexit:
-  br label %for.end
-
-for.end:
-  ret void
-}
--- a/external/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll
+++ b/external/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll
@@ -1,54 +0,0 @@
-; RUN: opt -S < %s -loop-vectorize -instcombine 2>&1 | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64"
-
-;; See https://llvm.org/bugs/show_bug.cgi?id=25490
-;; Due to the data structures used, the LLVM IR was not determinisic.
-;; This test comes from the PR.
-
-;; CHECK-LABEL: @test(
-; CHECK: load <16 x i8>
-; CHECK-NEXT: getelementptr
-; CHECK-NEXT: bitcast
-; CHECK-NEXT: load <16 x i8>
-; CHECK-NEXT: zext <16 x i8>
-; CHECK-NEXT: zext <16 x i8>
-define void @test(i32 %n, i8* nocapture %a, i8* nocapture %b, i8* nocapture readonly %c) {
-entry:
-  %cmp.28 = icmp eq i32 %n, 0
-  br i1 %cmp.28, label %for.cond.cleanup, label %for.body.preheader
-
-for.body.preheader:                               ; preds = %entry
-  br label %for.body
-
-for.cond.cleanup.loopexit:                        ; preds = %for.body
-  br label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  ret void
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
-  %arrayidx = getelementptr inbounds i8, i8* %c, i64 %indvars.iv
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %arrayidx2 = getelementptr inbounds i8, i8* %a, i64 %indvars.iv
-  %1 = load i8, i8* %arrayidx2, align 1
-  %conv3 = zext i8 %1 to i32
-  %mul = mul nuw nsw i32 %conv3, %conv
-  %shr.26 = lshr i32 %mul, 8
-  %conv4 = trunc i32 %shr.26 to i8
-  store i8 %conv4, i8* %arrayidx2, align 1
-  %arrayidx8 = getelementptr inbounds i8, i8* %b, i64 %indvars.iv
-  %2 = load i8, i8* %arrayidx8, align 1
-  %conv9 = zext i8 %2 to i32
-  %mul10 = mul nuw nsw i32 %conv9, %conv
-  %shr11.27 = lshr i32 %mul10, 8
-  %conv12 = trunc i32 %shr11.27 to i8
-  store i8 %conv12, i8* %arrayidx8, align 1
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %n
-  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
-}
--- a/external/llvm/test/Transforms/LoopVectorize/AArch64/gather-cost.ll
+++ b/external/llvm/test/Transforms/LoopVectorize/AArch64/gather-cost.ll
@@ -1,85 +0,0 @@
-; RUN: opt -loop-vectorize -mtriple=arm64-apple-ios -S -mcpu=cyclone -enable-interleaved-mem-accesses=false < %s | FileCheck %s
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
-
-@kernel = global [512 x float] zeroinitializer, align 16
-@kernel2 = global [512 x float] zeroinitializer, align 16
-@kernel3 = global [512 x float] zeroinitializer, align 16
-@kernel4 = global [512 x float] zeroinitializer, align 16
-@src_data = global [1536 x float] zeroinitializer, align 16
-@r_ = global i8 0, align 1
-@g_ = global i8 0, align 1
-@b_ = global i8 0, align 1
-
-; We don't want to vectorize most loops containing gathers because they are
-; expensive.
-; Make sure we don't vectorize it.
-; CHECK-NOT: x float>
-
-define void @_Z4testmm(i64 %size, i64 %offset) {
-entry:
-  %cmp53 = icmp eq i64 %size, 0
-  br i1 %cmp53, label %for.end, label %for.body.lr.ph
-
-for.body.lr.ph:
-  br label %for.body
-
-for.body:
-  %r.057 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add10, %for.body ]
-  %g.056 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add20, %for.body ]
-  %v.055 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
-  %b.054 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add30, %for.body ]
-  %add = add i64 %v.055, %offset
-  %mul = mul i64 %add, 3
-  %arrayidx = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 %mul
-  %0 = load float, float* %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float]* @kernel, i64 0, i64 %v.055
-  %1 = load float, float* %arrayidx2, align 4
-  %mul3 = fmul fast float %0, %1
-  %arrayidx4 = getelementptr inbounds [512 x float], [512 x float]* @kernel2, i64 0, i64 %v.055
-  %2 = load float, float* %arrayidx4, align 4
-  %mul5 = fmul fast float %mul3, %2
-  %arrayidx6 = getelementptr inbounds [512 x float], [512 x float]* @kernel3, i64 0, i64 %v.055
-  %3 = load float, float* %arrayidx6, align 4
-  %mul7 = fmul fast float %mul5, %3
-  %arrayidx8 = getelementptr inbounds [512 x float], [512 x float]* @kernel4, i64 0, i64 %v.055
-  %4 = load float, float* %arrayidx8, align 4
-  %mul9 = fmul fast float %mul7, %4
-  %add10 = fadd fast float %r.057, %mul9
-  %arrayidx.sum = add i64 %mul, 1
-  %arrayidx11 = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 %arrayidx.sum
-  %5 = load float, float* %arrayidx11, align 4
-  %mul13 = fmul fast float %1, %5
-  %mul15 = fmul fast float %2, %mul13
-  %mul17 = fmul fast float %3, %mul15
-  %mul19 = fmul fast float %4, %mul17
-  %add20 = fadd fast float %g.056, %mul19
-  %arrayidx.sum52 = add i64 %mul, 2
-  %arrayidx21 = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 %arrayidx.sum52
-  %6 = load float, float* %arrayidx21, align 4
-  %mul23 = fmul fast float %1, %6
-  %mul25 = fmul fast float %2, %mul23
-  %mul27 = fmul fast float %3, %mul25
-  %mul29 = fmul fast float %4, %mul27
-  %add30 = fadd fast float %b.054, %mul29
-  %inc = add i64 %v.055, 1
-  %exitcond = icmp ne i64 %inc, %size
-  br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
-
-for.cond.for.end_crit_edge:
-  %add30.lcssa = phi float [ %add30, %for.body ]
-  %add20.lcssa = phi float [ %add20, %for.body ]
-  %add10.lcssa = phi float [ %add10, %for.body ]
-  %phitmp = fptoui float %add10.lcssa to i8
-  %phitmp60 = fptoui float %add20.lcssa to i8
-  %phitmp61 = fptoui float %add30.lcssa to i8
-  br label %for.end
-
-for.end:
-  %r.0.lcssa = phi i8 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
-  %g.0.lcssa = phi i8 [ %phitmp60, %for.cond.for.end_crit_edge ], [ 0, %entry ]
-  %b.0.lcssa = phi i8 [ %phitmp61, %for.cond.for.end_crit_edge ], [ 0, %entry ]
-  store i8 %r.0.lcssa, i8* @r_, align 1
-  store i8 %g.0.lcssa, i8* @g_, align 1
-  store i8 %b.0.lcssa, i8* @b_, align 1
-  ret void
-}
--- a/external/llvm/test/Transforms/LoopVectorize/AArch64/induction-trunc.ll
+++ b/external/llvm/test/Transforms/LoopVectorize/AArch64/induction-trunc.ll
@@ -1,30 +0,0 @@
-; RUN: opt < %s -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize -S | FileCheck %s
-
-target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64--linux-gnu"
-
-; CHECK-LABEL: @non_primary_iv_trunc_free(
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 5
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[INDUCTION1:%.*]] = add i64 [[OFFSET_IDX]], 5
-; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[INDUCTION]] to i32
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[INDUCTION1]] to i32
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
-; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
-;
-define void @non_primary_iv_trunc_free(i64 %n) {
-entry:
-  br label %for.body
-
-for.body:
-  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
-  %tmp0 = trunc i64 %i to i32
-  %i.next = add nuw nsw i64 %i, 5
-  %cond = icmp slt i64 %i.next, %n
-  br i1 %cond, label %for.body, label %for.end
-
-for.end:
-  ret void
-}
--- a/external/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
+++ b/external/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
@@ -1,38 +0,0 @@
-; REQUIRES: asserts
-; RUN: opt < %s -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -S --debug-only=loop-vectorize 2>&1 | FileCheck %s
-
-; This test shows extremely high interleaving cost that, probably, should be fixed.
-; Due to the high cost, interleaving is not beneficial and the cost model chooses to scalarize
-; the load instructions.
-
-target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64--linux-gnu"
-
-%pair = type { i8, i8 }
-
-; CHECK-LABEL: test
-; CHECK: Found an estimated cost of 20 for VF 2 For instruction:   {{.*}} load i8
-; CHECK: Found an estimated cost of 0 for VF 2 For instruction:   {{.*}} load i8
-; CHECK: vector.body
-; CHECK: load i8
-; CHECK: load i8
-; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
-
-define void @test(%pair* %p, i64 %n) {
-entry:
-  br label %for.body
-
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %tmp0 = getelementptr %pair, %pair* %p, i64 %i, i32 0
-  %tmp1 = load i8, i8* %tmp0, align 1
-  %tmp2 = getelementptr %pair, %pair* %p, i64 %i, i32 1
-  %tmp3 = load i8, i8* %tmp2, align 1
-  %i.next = add nuw nsw i64 %i, 1
-  %cond = icmp eq i64 %i.next, %n
-  br i1 %cond, label %for.end, label %for.body
-
-for.end:
-  ret void
-}
-
--- a/external/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
+++ b/external/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
@@ -1,189 +0,0 @@
-; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_2
-; RUN: opt -loop-vectorize -force-vector-width=4 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_4
-; RUN: opt -loop-vectorize -force-vector-width=8 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8
-; RUN: opt -loop-vectorize -force-vector-width=16 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16
-; REQUIRES: asserts
-
-target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64--linux-gnueabi"
-
-%i8.2 = type {i8, i8}
-define void @i8_factor_2(%i8.2* %data, i64 %n) {
-entry:
-  br label %for.body
-
-; VF_8-LABEL:  Checking a loop in "i8_factor_2"
-; VF_8:          Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp0, align 1
-; VF_8-NEXT:     Found an estimated cost of 2 for VF 8 For instruction: store i8 0, i8* %tmp1, align 1
-; VF_16-LABEL: Checking a loop in "i8_factor_2"
-; VF_16:         Found an estimated cost of 2 for VF 16 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1
-; VF_16-NEXT:    Found an estimated cost of 2 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %tmp0 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 0
-  %tmp1 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 1
-  %tmp2 = load i8, i8* %tmp0, align 1
-  %tmp3 = load i8, i8* %tmp1, align 1
-  store i8 0, i8* %tmp0, align 1
-  store i8 0, i8* %tmp1, align 1
-  %i.next = add nuw nsw i64 %i, 1
-  %cond = icmp slt i64 %i.next, %n
-  br i1 %cond, label %for.body, label %for.end
-
-for.end:
-  ret void
-}
-
-%i16.2 = type {i16, i16}
-define void @i16_factor_2(%i16.2* %data, i64 %n) {
-entry:
-  br label %for.body
-
-; VF_4-LABEL: Checking a loop in "i16_factor_2"
-; VF_4:          Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2
-; VF_4-NEXT:     Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2
-; VF_8-LABEL:  Checking a loop in "i16_factor_2"
-; VF_8:          Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2
-; VF_8-NEXT:     Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2
-; VF_16-LABEL: Checking a loop in "i16_factor_2"
-; VF_16:         Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2
-; VF_16-NEXT:    Found an estimated cost of 4 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %tmp0 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 0
-  %tmp1 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 1
-  %tmp2 = load i16, i16* %tmp0, align 2
-  %tmp3 = load i16, i16* %tmp1, align 2
-  store i16 0, i16* %tmp0, align 2
-  store i16 0, i16* %tmp1, align 2
-  %i.next = add nuw nsw i64 %i, 1
-  %cond = icmp slt i64 %i.next, %n
-  br i1 %cond, label %for.body, label %for.end
-
-for.end:
-  ret void
-}
-
-%i32.2 = type {i32, i32}
-define void @i32_factor_2(%i32.2* %data, i64 %n) {
-entry:
-  br label %for.body
-
-; VF_2-LABEL:  Checking a loop in "i32_factor_2"
-; VF_2:          Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4
-; VF_2-NEXT:     Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4
-; VF_4-LABEL:  Checking a loop in "i32_factor_2"
-; VF_4:          Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4
-; VF_4-NEXT:     Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4
-; VF_8-LABEL:  Checking a loop in "i32_factor_2"
-; VF_8:          Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4
-; VF_8-NEXT:     Found an estimated cost of 4 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4
-; VF_16-LABEL: Checking a loop in "i32_factor_2"
-; VF_16:         Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4
-; VF_16-NEXT:    Found an estimated cost of 8 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %tmp0 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 0
-  %tmp1 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 1
-  %tmp2 = load i32, i32* %tmp0, align 4
-  %tmp3 = load i32, i32* %tmp1, align 4
-  store i32 0, i32* %tmp0, align 4
-  store i32 0, i32* %tmp1, align 4
-  %i.next = add nuw nsw i64 %i, 1
-  %cond = icmp slt i64 %i.next, %n
-  br i1 %cond, label %for.body, label %for.end
-
-for.end:
-  ret void
-}
-
-%i64.2 = type {i64, i64}
-define void @i64_factor_2(%i64.2* %data, i64 %n) {
-entry:
-  br label %for.body
-
-; VF_2-LABEL:  Checking a loop in "i64_factor_2"
-; VF_2:          Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
-; VF_2-NEXT:     Found an estimated cost of 2 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
-; VF_4-LABEL:  Checking a loop in "i64_factor_2"
-; VF_4:          Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp0, align 8
-; VF_4-NEXT:     Found an estimated cost of 4 for VF 4 For instruction: store i64 0, i64* %tmp1, align 8
-; VF_8-LABEL:  Checking a loop in "i64_factor_2"
-; VF_8:          Found an estimated cost of 8 for VF 8 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp0, align 8
-; VF_8-NEXT:     Found an estimated cost of 8 for VF 8 For instruction: store i64 0, i64* %tmp1, align 8
-; VF_16-LABEL: Checking a loop in "i64_factor_2"
-; VF_16:         Found an estimated cost of 16 for VF 16 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp0, align 8
-; VF_16-NEXT:    Found an estimated cost of 16 for VF 16 For instruction: store i64 0, i64* %tmp1, align 8
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %tmp0 = getelementptr inbounds %i64.2, %i64.2* %data, i64 %i, i32 0
-  %tmp1 = getelementptr inbounds %i64.2, %i64.2* %data, i64 %i, i32 1
-  %tmp2 = load i64, i64* %tmp0, align 8
-  %tmp3 = load i64, i64* %tmp1, align 8
-  store i64 0, i64* %tmp0, align 8
-  store i64 0, i64* %tmp1, align 8
-  %i.next = add nuw nsw i64 %i, 1
-  %cond = icmp slt i64 %i.next, %n
-  br i1 %cond, label %for.body, label %for.end
-
-for.end:
-  ret void
-}
-
-%i64.8 = type {i64, i64, i64, i64, i64, i64, i64, i64}
-define void @i64_factor_8(%i64.8* %data, i64 %n) {
-entry:
-  br label %for.body
-
-; The interleave factor in this test is 8, which is greater than the maximum
-; allowed factor for AArch64 (4). Thus, we will fall back to the basic TTI
-; implementation for determining the cost of the interleaved load group. The
-; stores do not form a legal interleaved group because the group would contain
-; gaps.
-;
-; VF_2-LABEL: Checking a loop in "i64_factor_8"
-; VF_2:         Found an estimated cost of 6 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
-; VF_2-NEXT:    Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
-; VF_2-NEXT:    Found an estimated cost of 7 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
-; VF_2-NEXT:    Found an estimated cost of 7 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %tmp0 = getelementptr inbounds %i64.8, %i64.8* %data, i64 %i, i32 2
-  %tmp1 = getelementptr inbounds %i64.8, %i64.8* %data, i64 %i, i32 6
-  %tmp2 = load i64, i64* %tmp0, align 8
-  %tmp3 = load i64, i64* %tmp1, align 8
-  store i64 0, i64* %tmp0, align 8
-  store i64 0, i64* %tmp1, align 8
-  %i.next = add nuw nsw i64 %i, 1
-  %cond = icmp slt i64 %i.next, %n
-  br i1 %cond, label %for.body, label %for.end
-
-for.end:
-  ret void
-}
--- a/external/llvm/test/Transforms/LoopVectorize/AArch64/lit.local.cfg
+++ b/external/llvm/test/Transforms/LoopVectorize/AArch64/lit.local.cfg
@@ -1,5 +0,0 @@
-config.suffixes = ['.ll']
-
-if not 'AArch64' in config.root.targets:
-    config.unsupported = True
-
--- a/external/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
+++ b/external/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
@@ -1,310 +0,0 @@
-; RUN: opt -S < %s -basicaa -loop-vectorize -force-vector-interleave=1 2>&1 | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64"
-
-; CHECK-LABEL: @add_a(
-; CHECK: load <16 x i8>, <16 x i8>*
-; CHECK: add <16 x i8>
-; CHECK: store <16 x i8>
-; Function Attrs: nounwind
-define void @add_a(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
-entry:
-  %cmp8 = icmp sgt i32 %len, 0
-  br i1 %cmp8, label %for.body, label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.body, %entry
-  ret void
-
-for.body:                                         ; preds = %entry, %for.body
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
-  %0 = load i8, i8* %arrayidx
-  %conv = zext i8 %0 to i32
-  %add = add nuw nsw i32 %conv, 2
-  %conv1 = trunc i32 %add to i8
-  %arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
-  store i8 %conv1, i8* %arrayidx3
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %len
-  br i1 %exitcond, label %for.cond.cleanup, label %for.body
-}
-
-; Ensure that we preserve nuw/nsw if we're not shrinking the values we're
-; working with.
-; CHECK-LABEL: @add_a1(
-; CHECK: load <16 x i8>, <16 x i8>*
-; CHECK: add nuw nsw <16 x i8>
-; CHECK: store <16 x i8>
-; Function Attrs: nounwind
-define void @add_a1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
-entry:
-  %cmp8 = icmp sgt i32 %len, 0
-  br i1 %cmp8, label %for.body, label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.body, %entry
-  ret void
-
-for.body:                                         ; preds = %entry, %for.body
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
-  %0 = load i8, i8* %arrayidx
-  %add = add nuw nsw i8 %0, 2
-  %arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
-  store i8 %add, i8* %arrayidx3
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %len
-  br i1 %exitcond, label %for.cond.cleanup, label %for.body
-}
-
-; CHECK-LABEL: @add_b(
-; CHECK: load <8 x i16>, <8 x i16>*
-; CHECK: add <8 x i16>
-; CHECK: store <8 x i16>
-; Function Attrs: nounwind
-define void @add_b(i16* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
-entry:
-  %cmp9 = icmp sgt i32 %len, 0
-  br i1 %cmp9, label %for.body, label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.body, %entry
-  ret void
-
-for.body:                                         ; preds = %entry, %for.body
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
-  %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
-  %0 = load i16, i16* %arrayidx
-  %conv8 = zext i16 %0 to i32
-  %add = add nuw nsw i32 %conv8, 2
-  %conv1 = trunc i32 %add to i16
-  %arrayidx3 = getelementptr inbounds i16, i16* %q, i64 %indvars.iv
-  store i16 %conv1, i16* %arrayidx3
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %len
-  br i1 %exitcond, label %for.cond.cleanup, label %for.body
-}
-
-; CHECK-LABEL: @add_c(
-; CHECK: load <8 x i8>, <8 x i8>*
-; CHECK: add <8 x i16>
-; CHECK: store <8 x i16>
-; Function Attrs: nounwind
-define void @add_c(i8* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
-entry:
-  %cmp8 = icmp sgt i32 %len, 0
-  br i1 %cmp8, label %for.body, label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.body, %entry
-  ret void
-
-for.body:                                         ; preds = %entry, %for.body
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
-  %0 = load i8, i8* %arrayidx
-  %conv = zext i8 %0 to i32
-  %add = add nuw nsw i32 %conv, 2
-  %conv1 = trunc i32 %add to i16
-  %arrayidx3 = getelementptr inbounds i16, i16* %q, i64 %indvars.iv
-  store i16 %conv1, i16* %arrayidx3
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %len
-  br i1 %exitcond, label %for.cond.cleanup, label %for.body
-}
-
-; CHECK-LABEL: @add_d(
-; CHECK: load <4 x i16>
-; CHECK: add nsw <4 x i32>
-; CHECK: store <4 x i32>
-define void @add_d(i16* noalias nocapture readonly %p, i32* noalias nocapture %q, i32 %len) #0 {
-entry:
-  %cmp7 = icmp sgt i32 %len, 0
-  br i1 %cmp7, label %for.body, label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.body, %entry
-  ret void
-
-for.body:                                         ; preds = %entry, %for.body
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
-  %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
-  %0 = load i16, i16* %arrayidx
-  %conv = sext i16 %0 to i32
-  %add = add nsw i32 %conv, 2
-  %arrayidx2 = getelementptr inbounds i32, i32* %q, i64 %indvars.iv
-  store i32 %add, i32* %arrayidx2
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %len
-  br i1 %exitcond, label %for.cond.cleanup, label %for.body
-}
-
-; CHECK-LABEL: @add_e(
-; CHECK: load <16 x i8>
-; CHECK: shl <16 x i8>
-; CHECK: add <16 x i8>
-; CHECK: or <16 x i8>
-; CHECK: mul <16 x i8>
-; CHECK: and <16 x i8>
-; CHECK: xor <16 x i8>
-; CHECK: mul <16 x i8>
-; CHECK: store <16 x i8>
-define void @add_e(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
-entry:
-  %cmp.32 = icmp sgt i32 %len, 0
-  br i1 %cmp.32, label %for.body.lr.ph, label %for.cond.cleanup
-
-for.body.lr.ph:                                   ; preds = %entry
-  %conv11 = zext i8 %arg2 to i32
-  %conv13 = zext i8 %arg1 to i32
-  br label %for.body
-
-for.cond.cleanup:                                 ; preds = %for.body, %entry
-  ret void
-
-for.body:                                         ; preds = %for.body, %for.body.lr.ph
-  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
-  %0 = load i8, i8* %arrayidx
-  %conv = zext i8 %0 to i32
-  %add = shl i32 %conv, 4
-  %conv2 = add nuw nsw i32 %add, 32
-  %or = or i32 %conv, 51
-  %mul = mul nuw nsw i32 %or, 60
-  %and = and i32 %conv2, %conv13
-  %mul.masked = and i32 %mul, 252
-  %conv17 = xor i32 %mul.masked, %conv11
-  %mul18 = mul nuw nsw i32 %conv17, %and
-  %conv19 = trunc i32 %mul18 to i8
-  %arrayidx21 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
-  store i8 %conv19, i8* %arrayidx21
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %len
-  br i1 %exitcond, label %for.cond.cleanup, label %for.body
-}
-
-; CHECK-LABEL: @add_f
-; CHECK: load <8 x i16>
-; CHECK: trunc <8 x i16>
-; CHECK: shl <8 x i8>
-; CHECK: add <8 x i8>
-; CHECK: or <8 x i8>
-; CHECK: mul <8 x i8>
-; CHECK: and <8 x i8>
-; CHECK: xor <8 x i8>
-; CHECK: mul <8 x i8>
-; CHECK: store <8 x i8>
-define void @add_f(i16* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
-entry:
-  %cmp.32 = icmp sgt i32 %len, 0
-  br i1 %cmp.32, label %for.body.lr.ph, label %for.cond.cleanup
-
-for.body.lr.ph:                                   ; preds = %entry
-  %conv11 = zext i8 %arg2 to i32
-  %conv13 = zext i8 %arg1 to i32
-  br label %for.body
-
-for.cond.cleanup:                                 ; preds = %for.body, %entry
-  ret void
-
-for.body:                                         ; preds = %for.body, %for.body.lr.ph
-  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
-  %0 = load i16, i16* %arrayidx
-  %conv = sext i16 %0 to i32
-  %add = shl i32 %conv, 4
-  %conv2 = add nsw i32 %add, 32
-  %or = and i32 %conv, 204
-  %conv8 = or i32 %or, 51
-  %mul = mul nuw nsw i32 %conv8, 60
-  %and = and i32 %conv2, %conv13
-  %mul.masked = and i32 %mul, 252
-  %conv17 = xor i32 %mul.masked, %conv11
-  %mul18 = mul nuw nsw i32 %conv17, %and
-  %conv19 = trunc i32 %mul18 to i8
-  %arrayidx21 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
-  store i8 %conv19, i8* %arrayidx21
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %len
-  br i1 %exitcond, label %for.cond.cleanup, label %for.body
-}
-
-; CHECK-LABEL: @add_phifail(
-; CHECK: load <16 x i8>, <16 x i8>*
-; CHECK: add nuw nsw <16 x i32>
-; CHECK: store <16 x i8>
-; Function Attrs: nounwind
-define void @add_phifail(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
-entry:
-  %cmp8 = icmp sgt i32 %len, 0
-  br i1 %cmp8, label %for.body, label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.body, %entry
-  ret void
-
-for.body:                                         ; preds = %entry, %for.body
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
-  %a_phi = phi i32 [ %conv, %for.body ], [ 0, %entry ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
-  %0 = load i8, i8* %arrayidx
-  %conv = zext i8 %0 to i32
-  %add = add nuw nsw i32 %conv, 2
-  %conv1 = trunc i32 %add to i8
-  %arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
-  store i8 %conv1, i8* %arrayidx3
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %len
-  br i1 %exitcond, label %for.cond.cleanup, label %for.body
-}
-
-; Function Attrs: nounwind
-; When we vectorize this loop, we generate correct code
-; even when %len exactly divides VF (since we extract from the second last index
-; and pass this to the for.cond.cleanup block). Vectorized loop returns 
-; the correct value a_phi = p[len -2]
-define i8 @add_phifail2(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
-; CHECK-LABEL: @add_phifail2(
-; CHECK: vector.body:
-; CHECK:   %wide.load = load <16 x i8>, <16 x i8>*
-; CHECK:   %[[L1:.+]] = zext <16 x i8> %wide.load to <16 x i32>
-; CHECK:   add nuw nsw <16 x i32>
-; CHECK:   store <16 x i8>
-; CHECK:   add i64 %index, 16
-; CHECK:   icmp eq i64 %index.next, %n.vec
-; CHECK: middle.block:
-; CHECK:   %vector.recur.extract = extractelement <16 x i32> %[[L1]], i32 15
-; CHECK:   %vector.recur.extract.for.phi = extractelement <16 x i32> %[[L1]], i32 14
-; CHECK: for.cond.cleanup:
-; CHECK:   %a_phi.lcssa = phi i32 [ %scalar.recur, %for.body ], [ %vector.recur.extract.for.phi, %middle.block ]
-; CHECK:   %ret = trunc i32 %a_phi.lcssa to i8
-; CHECK:   ret i8 %ret
-entry:
-  br label %for.body
-
-for.cond.cleanup:                                 ; preds = %for.body, %entry
-  %ret = trunc i32 %a_phi to i8
-  ret i8 %ret
-
-for.body:                                         ; preds = %entry, %for.body
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
-  %a_phi = phi i32 [ %conv, %for.body ], [ 0, %entry ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
-  %0 = load i8, i8* %arrayidx
-  %conv = zext i8 %0 to i32
-  %add = add nuw nsw i32 %conv, 2
-  %conv1 = trunc i32 %add to i8
-  %arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
-  store i8 %conv1, i8* %arrayidx3
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %len
-  br i1 %exitcond, label %for.cond.cleanup, label %for.body
-}
-
-attributes #0 = { nounwind }
-
--- a/external/llvm/test/Transforms/LoopVectorize/AArch64/max-vf-for-interleaved.ll
+++ b/external/llvm/test/Transforms/LoopVectorize/AArch64/max-vf-for-interleaved.ll
@@ -1,56 +0,0 @@
-; RUN: opt < %s -force-vector-interleave=1 -store-to-load-forwarding-conflict-detection=false -loop-vectorize -dce -instcombine -S | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64--linux-gnu"
-
-%struct.pair = type { i32, i32 }
-
-; Check vectorization of interleaved access groups with positive dependence
-; distances. In this test, the maximum safe dependence distance for
-; vectorization is 16 bytes. Normally, this would lead to a maximum VF of 4.
-; However, for interleaved groups, the effective VF is VF * IF, where IF is the
-; interleave factor. Here, the maximum safe dependence distance is recomputed
-; as 16 / IF bytes, resulting in VF=2. Since IF=2, we should generate <4 x i32>
-; loads and stores instead of <8 x i32> accesses.
-;
-; Note: LAA's conflict detection optimization has to be disabled for this test
-;       to be vectorized.
-
-; struct pair {
-;   int x;
-;   int y;
-; };
-;
-; void max_vf(struct pair *restrict p) {
-;   for (int i = 0; i < 1000; i++) {
-;     p[i + 2].x = p[i].x
-;     p[i + 2].y = p[i].y
-;   }
-; }
-
-; CHECK-LABEL: @max_vf
-; CHECK: load <4 x i32>
-; CHECK: store <4 x i32>
-
-define void @max_vf(%struct.pair* noalias nocapture %p) {
-entry:
-  br label %for.body
-
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %0 = add nuw nsw i64 %i, 2
-  %p_i.x = getelementptr inbounds %struct.pair, %struct.pair* %p, i64 %i, i32 0
-  %p_i_plus_2.x = getelementptr inbounds %struct.pair, %struct.pair* %p, i64 %0, i32 0
-  %1 = load i32, i32* %p_i.x, align 4
-  store i32 %1, i32* %p_i_plus_2.x, align 4
-  %p_i.y = getelementptr inbounds %struct.pair, %struct.pair* %p, i64 %i, i32 1
-  %p_i_plus_2.y = getelementptr inbounds %struct.pair, %struct.pair* %p, i64 %0, i32 1
-  %2 = load i32, i32* %p_i.y, align 4
-  store i32 %2, i32* %p_i_plus_2.y, align 4
-  %i.next = add nuw nsw i64 %i, 1
-  %cond = icmp eq i64 %i.next, 1000
-  br i1 %cond, label %for.exit, label %for.body
-
-for.exit:
-  ret void
-}
--- a/external/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
+++ b/external/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
@@ -1,49 +0,0 @@
-; REQUIRES: asserts
-; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -S -debug-only=loop-vectorize 2>&1 | FileCheck %s
-
-target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64--linux-gnu"
-
-; CHECK-LABEL: all_scalar
-; CHECK:       LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2
-; CHECK:       LV: Found an estimated cost of 2 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2
-; CHECK:       LV: Not considering vector loop of width 2 because it will not generate any vector instructions
-;
-define void @all_scalar(i64* %a, i64 %n) {
-entry:
-  br label %for.body
-
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %tmp0 = getelementptr i64, i64* %a, i64 %i
-  store i64 0, i64* %tmp0, align 1
-  %i.next = add nuw nsw i64 %i, 2
-  %cond = icmp eq i64 %i.next, %n
-  br i1 %cond, label %for.end, label %for.body
-
-for.end:
-  ret void
-}
-
-; CHECK-LABEL: PR33193
-; CHECK:       LV: Found scalar instruction: %i.next = zext i32 %j.next to i64
-; CHECK:       LV: Found an estimated cost of 0 for VF 8 For instruction: %i.next = zext i32 %j.next to i64
-; CHECK:       LV: Not considering vector loop of width 8 because it will not generate any vector instructions
-%struct.a = type { i32, i8 }
-define void @PR33193(%struct.a* %a, i64 %n) {
-entry:
-  br label %for.body
-
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %j = phi i32 [ 0, %entry ], [ %j.next, %for.body ]
-  %tmp0 = getelementptr inbounds %struct.a, %struct.a* %a, i64 %i, i32 1
-  store i8 0, i8* %tmp0, align 4
-  %j.next = add i32 %j, 1
-  %i.next = zext i32 %j.next to i64
-  %cond = icmp ugt i64 %n, %i.next
-  br i1 %cond, label %for.body, label %for.end
-
-for.end:
-  ret void
-}
--- a/external/llvm/test/Transforms/LoopVectorize/AArch64/pr31900.ll
+++ b/external/llvm/test/Transforms/LoopVectorize/AArch64/pr31900.ll
@@ -1,37 +0,0 @@
-; RUN: opt -S -mtriple=aarch64-apple-ios -loop-vectorize -enable-interleaved-mem-accesses -force-vector-width=2 < %s | FileCheck %s
-
-; Reproducer for address space fault in the LoopVectorizer (pr31900). Added
-; different sized address space pointers (p:16:16-p4:32:16) to the aarch64
-; datalayout to reproduce the fault.
-
-target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128-p:16:16-p4:32:16"
-
-; Check that all the loads are scalarized
-; CHECK: load i16, i16*
-; CHECK: load i16, i16*
-; CHECK: load i16, i16 addrspace(4)*
-; CHECK: load i16, i16 addrspace(4)*
-
-%rec1445 = type { i16, i16, i16, i16, i16 }
-
-define void @foo() {
-bb1:
-  br label %bb4
-
-bb4:
-  %tmp1 = phi i16 [ undef, %bb1 ], [ %_tmp1013, %bb4 ]
-  %tmp2 = phi %rec1445* [ undef, %bb1 ], [ %_tmp1015, %bb4 ]
-  %tmp3 = phi %rec1445 addrspace(4)* [ undef, %bb1 ], [ %_tmp1017, %bb4 ]
-  %0 = getelementptr %rec1445, %rec1445* %tmp2, i16 0, i32 1
-  %_tmp987 = load i16, i16* %0, align 1
-  %1 = getelementptr %rec1445, %rec1445 addrspace(4)* %tmp3, i32 0, i32 1
-  %_tmp993 = load i16, i16 addrspace(4)* %1, align 1
-  %_tmp1013 = add i16 %tmp1, 1
-  %_tmp1015 = getelementptr %rec1445, %rec1445* %tmp2, i16 1
-  %_tmp1017 = getelementptr %rec1445, %rec1445 addrspace(4)* %tmp3, i32 1
-  %_tmp1019 = icmp ult i16 %_tmp1013, 24
-  br i1 %_tmp1019, label %bb4, label %bb16
-
-bb16:
-  unreachable
-}
--- a/external/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll
+++ b/external/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll
@@ -1,56 +0,0 @@
-; RUN: opt -S -mtriple=aarch64 -loop-vectorize -force-vector-width=2 < %s | FileCheck %s
-target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64--linux-gnu"
-
-@b = common local_unnamed_addr global i32 0, align 4
-@a = common local_unnamed_addr global i16* null, align 8
-
-; Function Attrs: norecurse nounwind readonly
-define i32 @fn1() local_unnamed_addr #0 {
-; Ensure that we don't emit reduction intrinsics for unsupported short reductions.
-; CHECK-NOT: @llvm.experimental.vector.reduce
-entry:
-  %0 = load i32, i32* @b, align 4, !tbaa !1
-  %cmp40 = icmp sgt i32 %0, 0
-  br i1 %cmp40, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  %1 = load i16*, i16** @a, align 8, !tbaa !5
-  %2 = load i32, i32* @b, align 4, !tbaa !1
-  %3 = sext i32 %2 to i64
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
-  %d.043 = phi i16 [ undef, %for.body.lr.ph ], [ %.sink28, %for.body ]
-  %c.042 = phi i16 [ undef, %for.body.lr.ph ], [ %c.0., %for.body ]
-  %arrayidx = getelementptr inbounds i16, i16* %1, i64 %indvars.iv
-  %4 = load i16, i16* %arrayidx, align 2, !tbaa !7
-  %cmp2 = icmp sgt i16 %c.042, %4
-  %c.0. = select i1 %cmp2, i16 %c.042, i16 %4
-  %cmp13 = icmp slt i16 %d.043, %4
-  %.sink28 = select i1 %cmp13, i16 %d.043, i16 %4
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %cmp = icmp slt i64 %indvars.iv.next, %3
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
-  %c.0.lcssa = phi i16 [ undef, %entry ], [ %c.0., %for.body ]
-  %d.0.lcssa = phi i16 [ undef, %entry ], [ %.sink28, %for.body ]
-  %cmp26 = icmp sgt i16 %c.0.lcssa, %d.0.lcssa
-  %conv27 = zext i1 %cmp26 to i32
-  ret i32 %conv27
-}
-
-attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
-!llvm.ident = !{!0}
-
-!0 = !{!"clang"}
-!1 = !{!2, !2, i64 0}
-!2 = !{!"int", !3, i64 0}
-!3 = !{!"omnipotent char", !4, i64 0}
-!4 = !{!"Simple C/C++ TBAA"}
-!5 = !{!6, !6, i64 0}
-!6 = !{!"any pointer", !3, i64 0}
-!7 = !{!8, !8, i64 0}
-!8 = !{!"short", !3, i64 0}
--- a/Show More
+++ b/Show More