Imported Upstream version 6.10.0.49

Former-commit-id: 1d6753294b2993e1fbf92de9366bb9544db4189b
This commit is contained in:
Xamarin Public Jenkins (auto-signing)
2020-01-16 16:38:04 +00:00
parent d94e79959b
commit 468663ddbb
48518 changed files with 2789335 additions and 61176 deletions

View File

@ -0,0 +1,39 @@
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-if-conversion -dce -instcombine -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
;CHECK-LABEL: @foo(
;CHECK: icmp eq <4 x i32>
;CHECK: select <4 x i1>
;CHECK: ret i32
; If-conversion test: the loop body conditionally stores either 9 or
; (i + 45) + i*%t into A[i]. The CHECK lines above require the vectorizer
; to flatten the branch into a <4 x i1> select over <4 x i32> lanes.
define i32 @foo(i32 %x, i32 %t, i32* nocapture %A) nounwind uwtable ssp {
entry:
; Guard: skip the loop entirely when %x <= 0.
%cmp10 = icmp sgt i32 %x, 0
br i1 %cmp10, label %for.body, label %for.end
for.body: ; preds = %entry, %if.end
%indvars.iv = phi i64 [ %indvars.iv.next, %if.end ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
%0 = load i32, i32* %arrayidx, align 4
%tobool = icmp eq i32 %0, 0
br i1 %tobool, label %if.end, label %if.then
if.then: ; preds = %for.body
; Taken when A[i] != 0: compute (i + 45) + i * %t, truncated to i32.
%1 = add nsw i64 %indvars.iv, 45
%2 = trunc i64 %indvars.iv to i32
%mul = mul nsw i32 %2, %t
%3 = trunc i64 %1 to i32
%add1 = add nsw i32 %3, %mul
br label %if.end
if.end: ; preds = %for.body, %if.then
; Merge point: 9 when A[i] was zero, otherwise the value computed above.
%z.0 = phi i32 [ %add1, %if.then ], [ 9, %for.body ]
store i32 %z.0, i32* %arrayidx, align 4
%indvars.iv.next = add nsw i64 %indvars.iv, 1
; Exit once the (truncated) next induction value reaches %x.
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %x
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %if.end, %entry
ret i32 undef
}

View File

@ -0,0 +1,71 @@
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce
; Check that we don't fall into an infinite loop.
; Single-block infinite loop: the back edge branches unconditionally to
; itself and there is no exit block. The RUN line only checks that the
; vectorizer terminates on this input (see the comment above).
define void @test() nounwind {
entry:
br label %for.body
for.body:
%0 = phi i32 [ 1, %entry ], [ 0, %for.body ]
br label %for.body
}
; Variant of @test with an i64 induction variable and an exit edge whose
; condition is undef; the exit block is unreachable. Again, the test only
; requires that the vectorizer does not loop forever on this shape.
define void @test2() nounwind {
entry:
br label %for.body
for.body: ; preds = %for.body, %entry
%indvars.iv47 = phi i64 [ 0, %entry ], [ %indvars.iv.next48, %for.body ]
%0 = phi i32 [ 1, %entry ], [ 0, %for.body ]
%indvars.iv.next48 = add i64 %indvars.iv47, 1
; Exit condition is undef: either successor may be taken.
br i1 undef, label %for.end, label %for.body
for.end: ; preds = %for.body
unreachable
}
;PR14701
; Reduced test case for PR14701 (see comment above). The interesting loop is
; %for.body40: an i64 induction variable starting at 3 plus a step phi
; (%step.0121/%step.1) that is conditionally incremented, with several
; unreachable-terminated blocks around it. Only "does not crash/hang" is
; being tested; the branch conditions are undef.
define void @start_model_rare() nounwind uwtable ssp {
entry:
br i1 undef, label %return, label %if.end
if.end: ; preds = %entry
br i1 undef, label %cond.false, label %cond.true
cond.true: ; preds = %if.end
unreachable
cond.false: ; preds = %if.end
br i1 undef, label %cond.false28, label %cond.true20
cond.true20: ; preds = %cond.false
unreachable
cond.false28: ; preds = %cond.false
br label %for.body40
for.body40: ; preds = %for.inc50, %cond.false28
%indvars.iv123 = phi i64 [ 3, %cond.false28 ], [ %indvars.iv.next124, %for.inc50 ]
%step.0121 = phi i32 [ 1, %cond.false28 ], [ %step.1, %for.inc50 ]
br i1 undef, label %if.then46, label %for.inc50
if.then46: ; preds = %for.body40
%inc47 = add nsw i32 %step.0121, 1
br label %for.inc50
for.inc50: ; preds = %if.then46, %for.body40
; %k.1 is dead apart from the phi itself; %step.1 feeds the next iteration.
%k.1 = phi i32 [ undef, %for.body40 ], [ %inc47, %if.then46 ]
%step.1 = phi i32 [ %step.0121, %for.body40 ], [ %inc47, %if.then46 ]
%indvars.iv.next124 = add i64 %indvars.iv123, 1
; Trip count: exit when the truncated induction value reaches 256.
%lftr.wideiv = trunc i64 %indvars.iv.next124 to i32
%exitcond = icmp eq i32 %lftr.wideiv, 256
br i1 %exitcond, label %for.end52, label %for.body40
for.end52: ; preds = %for.inc50
unreachable
return: ; preds = %entry
ret void
}

View File

@ -0,0 +1,53 @@
; RUN: opt < %s -loop-vectorize -dce -force-vector-interleave=1 -force-vector-width=4
; Check that we don't crash.
target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
module asm "\09.ident\09\22GCC: (GNU) 4.6.3 LLVM: 3.2svn\22"
@b = common global [32000 x float] zeroinitializer, align 16
; Strided store loop: writes %value to arr at byte offset iv*4, where the
; induction variable advances by the (sign-extended) runtime %stride each
; iteration. The RUN line only checks the vectorizer does not crash on the
; non-unit, runtime stride (see comment above).
define i32 @set1ds(i32 %_n, float* nocapture %arr, float %value, i32 %stride) nounwind uwtable {
entry:
; Guard: loop only runs when %_n > 0.
%0 = icmp sgt i32 %_n, 0
br i1 %0, label %"3.lr.ph", label %"5"
"3.lr.ph": ; preds = %entry
%1 = bitcast float* %arr to i8*
%2 = sext i32 %stride to i64
br label %"3"
"3": ; preds = %"3.lr.ph", %"3"
%indvars.iv = phi i64 [ 0, %"3.lr.ph" ], [ %indvars.iv.next, %"3" ]
; Byte offset = iv * 4 (sizeof float), applied to the i8* base.
%3 = shl nsw i64 %indvars.iv, 2
%4 = getelementptr inbounds i8, i8* %1, i64 %3
%5 = bitcast i8* %4 to float*
store float %value, float* %5, align 4
; Induction step is the runtime stride, not a constant.
%indvars.iv.next = add i64 %indvars.iv, %2
%6 = trunc i64 %indvars.iv.next to i32
%7 = icmp slt i32 %6, %_n
br i1 %7, label %"3", label %"5"
"5": ; preds = %"3", %entry
ret i32 0
}
; Unit-count loop storing -1.0 for 16000 iterations, addressed as byte
; offsets (iv*4) from &b[16000]. %name is unused by the body.
define i32 @init(i8* nocapture %name) unnamed_addr nounwind uwtable {
entry:
br label %"3"
"3": ; preds = %"3", %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %"3" ]
; Byte offset = iv * 4, based at element 16000 of @b.
%0 = shl nsw i64 %indvars.iv, 2
%1 = getelementptr inbounds i8, i8* bitcast (float* getelementptr inbounds ([32000 x float], [32000 x float]* @b, i64 0, i64 16000) to i8*), i64 %0
%2 = bitcast i8* %1 to float*
store float -1.000000e+00, float* %2, align 4
%indvars.iv.next = add i64 %indvars.iv, 1
; Exit after 16000 iterations.
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, 16000
br i1 %exitcond, label %"5", label %"3"
"5": ; preds = %"3"
ret i32 0
}

View File

@ -0,0 +1,19 @@
; RUN: opt < %s -loop-vectorize -S
; Loop whose backedge condition is a loop-invariant constant computed in
; the preheader (icmp eq i64 3, 3 — always true, so the body runs once).
; Vectorization is forced via the !llvm.loop metadata below; the RUN line
; only checks that -loop-vectorize handles this without crashing.
define void @foo() local_unnamed_addr {
entry:
; Constant-folded exit condition, defined outside the loop.
%exitcond = icmp eq i64 3, 3
br label %for.body
for.body: ; preds = %entry
%i.05 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
%total1 = add nsw i64 %i.05, 3
%inc = add nuw nsw i64 %i.05, 1
br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
for.end: ; preds = %for.body
ret void
}
!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.vectorize.enable", i1 true}

View File

@ -0,0 +1,79 @@
; REQUIRES: asserts
; RUN: opt < %s -loop-vectorize -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=COST
; RUN: opt < %s -loop-vectorize -force-vector-width=2 -instcombine -simplifycfg -S | FileCheck %s
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnu"
; This test checks that we correctly compute the scalarized operands for a
; user-specified vectorization factor when interleaving is disabled. We use the
; "optsize" attribute to disable all interleaving calculations. A cost of 4
; for %tmp4 indicates that we would scalarize its operand (%tmp3), giving
; %tmp4 a lower scalarization overhead.
;
; COST-LABEL: predicated_udiv_scalarized_operand
; COST: LV: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i64 %tmp2, %tmp3
;
; CHECK-LABEL: @predicated_udiv_scalarized_operand(
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %entry ], [ [[INDEX_NEXT:%.*]], %[[PRED_UDIV_CONTINUE2:.*]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, %entry ], [ [[TMP17:%.*]], %[[PRED_UDIV_CONTINUE2]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, i64* %a, i64 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[TMP0]] to <2 x i64>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i64> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
; CHECK-NEXT: br i1 [[TMP3]], label %[[PRED_UDIV_IF:.*]], label %[[PRED_UDIV_CONTINUE:.*]]
; CHECK: [[PRED_UDIV_IF]]:
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = add nsw i64 [[TMP4]], %x
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP6]], [[TMP5]]
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> undef, i64 [[TMP7]], i32 0
; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE]]
; CHECK: [[PRED_UDIV_CONTINUE]]:
; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x i64> [ undef, %vector.body ], [ [[TMP8]], %[[PRED_UDIV_IF]] ]
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_UDIV_IF1:.*]], label %[[PRED_UDIV_CONTINUE2]]
; CHECK: [[PRED_UDIV_IF1]]:
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1
; CHECK-NEXT: [[TMP12:%.*]] = add nsw i64 [[TMP11]], %x
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1
; CHECK-NEXT: [[TMP14:%.*]] = udiv i64 [[TMP13]], [[TMP12]]
; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP14]], i32 1
; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE2]]
; CHECK: [[PRED_UDIV_CONTINUE2]]:
; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x i64> [ [[TMP9]], %[[PRED_UDIV_CONTINUE]] ], [ [[TMP15]], %[[PRED_UDIV_IF1]] ]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i64> [[TMP16]], <2 x i64> [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP17]] = add <2 x i64> [[VEC_PHI]], [[PREDPHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
;
; Sum-reduction over a[0..99] where, when a[i] > 0, the added term is
; a[i] / (a[i] + x) (udiv under a predicate); otherwise a[i] itself.
; "optsize" disables interleaving so the COST check above sees the
; scalarized-operand cost for the predicated udiv (%tmp4).
define i64 @predicated_udiv_scalarized_operand(i64* %a, i64 %x) optsize {
entry:
br label %for.body
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
%r = phi i64 [ 0, %entry ], [ %tmp6, %for.inc ]
%tmp0 = getelementptr inbounds i64, i64* %a, i64 %i
%tmp2 = load i64, i64* %tmp0, align 4
%cond0 = icmp sgt i64 %tmp2, 0
br i1 %cond0, label %if.then, label %for.inc
if.then:
; Predicated path: divisor (%tmp3) is computed only on this path, so it
; must be scalarized per lane when the udiv is predicated.
%tmp3 = add nsw i64 %tmp2, %x
%tmp4 = udiv i64 %tmp2, %tmp3
br label %for.inc
for.inc:
; Select the predicated quotient or the raw load, then accumulate.
%tmp5 = phi i64 [ %tmp2, %for.body ], [ %tmp4, %if.then]
%tmp6 = add i64 %r, %tmp5
%i.next = add nuw nsw i64 %i, 1
; Fixed trip count of 100.
%cond1 = icmp slt i64 %i.next, 100
br i1 %cond1, label %for.body, label %for.end
for.end:
%tmp7 = phi i64 [ %tmp6, %for.inc ]
ret i64 %tmp7
}

View File

@ -0,0 +1,42 @@
; RUN: opt < %s -loop-vectorize -mtriple=aarch64-none-linux-gnu -mattr=+neon -S | FileCheck %s
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
; Function Attrs: nounwind
; c[i] = a[i] + b[i] for i in [0, size). The noalias arguments let the
; vectorizer widen without runtime aliasing checks; the CHECK lines expect
; two <4 x i32> copies of each load/add/store (interleave factor 2).
define i32* @array_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* %c, i32 %size) {
;CHECK-LABEL: array_add
;CHECK: load <4 x i32>
;CHECK: load <4 x i32>
;CHECK: load <4 x i32>
;CHECK: load <4 x i32>
;CHECK: add nsw <4 x i32>
;CHECK: add nsw <4 x i32>
;CHECK: store <4 x i32>
;CHECK: store <4 x i32>
;CHECK: ret
entry:
; Guard: only loop when size > 0.
%cmp10 = icmp sgt i32 %size, 0
br i1 %cmp10, label %for.body.preheader, label %for.end
for.body.preheader: ; preds = %entry
br label %for.body
for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
%arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
%0 = load i32, i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
%1 = load i32, i32* %arrayidx2, align 4
%add = add nsw i32 %1, %0
%arrayidx4 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
store i32 %add, i32* %arrayidx4, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
; Exit when the truncated next induction value equals %size.
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %size
br i1 %exitcond, label %for.end.loopexit, label %for.body
for.end.loopexit: ; preds = %for.body
br label %for.end
for.end: ; preds = %for.end.loopexit, %entry
ret i32* %c
}

View File

@ -0,0 +1,147 @@
; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 | FileCheck %s
; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 | FileCheck %s --check-prefix=FORCE-VEC
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnueabi"
; Test integer induction variable of step 2:
; for (int i = 0; i < 1024; i+=2) {
; int tmp = *A++;
; sum += i * tmp;
; }
; CHECK-LABEL: @ind_plus2(
; CHECK: load <4 x i32>, <4 x i32>*
; CHECK: load <4 x i32>, <4 x i32>*
; CHECK: mul nsw <4 x i32>
; CHECK: mul nsw <4 x i32>
; CHECK: add nsw <4 x i32>
; CHECK: add nsw <4 x i32>
; CHECK: %index.next = add i64 %index, 8
; CHECK: icmp eq i64 %index.next, 512
; FORCE-VEC-LABEL: @ind_plus2(
; FORCE-VEC: %wide.load = load <2 x i32>, <2 x i32>*
; FORCE-VEC: mul nsw <2 x i32>
; FORCE-VEC: add nsw <2 x i32>
; FORCE-VEC: %index.next = add i64 %index, 2
; FORCE-VEC: icmp eq i64 %index.next, 512
; IR for the C loop above: integer induction variable %i stepping by +2
; (0..1022), pointer %A.addr stepping by one element, reduction into %sum.
define i32 @ind_plus2(i32* %A) {
entry:
br label %for.body
for.body: ; preds = %entry, %for.body
%A.addr = phi i32* [ %A, %entry ], [ %inc.ptr, %for.body ]
%i = phi i32 [ 0, %entry ], [ %add1, %for.body ]
%sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
%inc.ptr = getelementptr inbounds i32, i32* %A.addr, i64 1
%0 = load i32, i32* %A.addr, align 4
; sum += i * (*A++)
%mul = mul nsw i32 %0, %i
%add = add nsw i32 %mul, %sum
; Induction step of 2; loop runs while i+2 < 1024 (512 iterations).
%add1 = add nsw i32 %i, 2
%cmp = icmp slt i32 %add1, 1024
br i1 %cmp, label %for.body, label %for.end
for.end: ; preds = %for.body
%add.lcssa = phi i32 [ %add, %for.body ]
ret i32 %add.lcssa
}
; Test integer induction variable of step -2:
; for (int i = 1024; i > 0; i-=2) {
; int tmp = *A++;
; sum += i * tmp;
; }
; CHECK-LABEL: @ind_minus2(
; CHECK: load <4 x i32>, <4 x i32>*
; CHECK: load <4 x i32>, <4 x i32>*
; CHECK: mul nsw <4 x i32>
; CHECK: mul nsw <4 x i32>
; CHECK: add nsw <4 x i32>
; CHECK: add nsw <4 x i32>
; CHECK: %index.next = add i64 %index, 8
; CHECK: icmp eq i64 %index.next, 512
; FORCE-VEC-LABEL: @ind_minus2(
; FORCE-VEC: %wide.load = load <2 x i32>, <2 x i32>*
; FORCE-VEC: mul nsw <2 x i32>
; FORCE-VEC: add nsw <2 x i32>
; FORCE-VEC: %index.next = add i64 %index, 2
; FORCE-VEC: icmp eq i64 %index.next, 512
; IR for the C loop above: induction variable %i counting DOWN from 1024 by
; 2, pointer %A.addr stepping by one element, reduction into %sum.
define i32 @ind_minus2(i32* %A) {
entry:
br label %for.body
for.body: ; preds = %entry, %for.body
%A.addr = phi i32* [ %A, %entry ], [ %inc.ptr, %for.body ]
%i = phi i32 [ 1024, %entry ], [ %sub, %for.body ]
%sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
%inc.ptr = getelementptr inbounds i32, i32* %A.addr, i64 1
%0 = load i32, i32* %A.addr, align 4
; sum += i * (*A++)
%mul = mul nsw i32 %0, %i
%add = add nsw i32 %mul, %sum
; Step of -2; continue while the CURRENT i exceeds 2 (512 iterations).
%sub = add nsw i32 %i, -2
%cmp = icmp sgt i32 %i, 2
br i1 %cmp, label %for.body, label %for.end
for.end: ; preds = %for.body
%add.lcssa = phi i32 [ %add, %for.body ]
ret i32 %add.lcssa
}
; Test pointer induction variable of step 2. As currently we don't support
; masked load/store, vectorization is possible but not beneficial. If loop
; vectorization is not enforced, LV will only do interleave.
; for (int i = 0; i < 1024; i++) {
; int tmp0 = *A++;
; int tmp1 = *A++;
; sum += tmp0 * tmp1;
; }
; CHECK-LABEL: @ptr_ind_plus2(
; CHECK: %[[V0:.*]] = load <8 x i32>
; CHECK: %[[V1:.*]] = load <8 x i32>
; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK: mul nsw <4 x i32>
; CHECK: mul nsw <4 x i32>
; CHECK: add nsw <4 x i32>
; CHECK: add nsw <4 x i32>
; CHECK: %index.next = add i64 %index, 8
; CHECK: icmp eq i64 %index.next, 1024
; FORCE-VEC-LABEL: @ptr_ind_plus2(
; FORCE-VEC: %[[V:.*]] = load <4 x i32>
; FORCE-VEC: shufflevector <4 x i32> %[[V]], <4 x i32> undef, <2 x i32> <i32 0, i32 2>
; FORCE-VEC: shufflevector <4 x i32> %[[V]], <4 x i32> undef, <2 x i32> <i32 1, i32 3>
; FORCE-VEC: mul nsw <2 x i32>
; FORCE-VEC: add nsw <2 x i32>
; FORCE-VEC: %index.next = add i64 %index, 2
; FORCE-VEC: icmp eq i64 %index.next, 1024
; IR for the C loop above: pointer induction advancing by TWO elements per
; iteration; each iteration multiplies the element pair and accumulates.
; The CHECK lines expect wide loads plus even/odd shufflevector de-interleaving.
define i32 @ptr_ind_plus2(i32* %A) {
entry:
br label %for.body
for.body: ; preds = %for.body, %entry
%A.addr = phi i32* [ %A, %entry ], [ %inc.ptr1, %for.body ]
%sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
%i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
; tmp0 = A[0], tmp1 = A[1]; pointer then advances by 2 elements.
%inc.ptr = getelementptr inbounds i32, i32* %A.addr, i64 1
%0 = load i32, i32* %A.addr, align 4
%inc.ptr1 = getelementptr inbounds i32, i32* %A.addr, i64 2
%1 = load i32, i32* %inc.ptr, align 4
; sum += tmp0 * tmp1
%mul = mul nsw i32 %1, %0
%add = add nsw i32 %mul, %sum
%inc = add nsw i32 %i, 1
; Fixed trip count of 1024.
%exitcond = icmp eq i32 %inc, 1024
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body
%add.lcssa = phi i32 [ %add, %for.body ]
ret i32 %add.lcssa
}

View File

@ -0,0 +1,42 @@
; RUN: opt < %s -loop-vectorize -mtriple=arm64-none-linux-gnu -mattr=+neon -S | FileCheck %s
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
; Function Attrs: nounwind
; c[i] = a[i] + b[i] for i in [0, size); same body as the aarch64 variant of
; this test, here run with -mtriple=arm64-none-linux-gnu. The noalias
; arguments allow widening without runtime checks; CHECK expects two
; <4 x i32> copies of each load/add/store.
define i32* @array_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* %c, i32 %size) {
;CHECK-LABEL: array_add
;CHECK: load <4 x i32>
;CHECK: load <4 x i32>
;CHECK: load <4 x i32>
;CHECK: load <4 x i32>
;CHECK: add nsw <4 x i32>
;CHECK: add nsw <4 x i32>
;CHECK: store <4 x i32>
;CHECK: store <4 x i32>
;CHECK: ret
entry:
; Guard: only loop when size > 0.
%cmp10 = icmp sgt i32 %size, 0
br i1 %cmp10, label %for.body.preheader, label %for.end
for.body.preheader: ; preds = %entry
br label %for.body
for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
%arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
%0 = load i32, i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
%1 = load i32, i32* %arrayidx2, align 4
%add = add nsw i32 %1, %0
%arrayidx4 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
store i32 %add, i32* %arrayidx4, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
; Exit when the truncated next induction value equals %size.
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %size
br i1 %exitcond, label %for.end.loopexit, label %for.body
for.end.loopexit: ; preds = %for.body
br label %for.end
for.end: ; preds = %for.end.loopexit, %entry
ret i32* %c
}

View File

@ -0,0 +1,166 @@
; RUN: opt -mtriple=aarch64--linux-gnueabi -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s -S | FileCheck %s
; The following tests contain loops for which SCEV cannot determine the backedge
; taken count. This is because the backedge taken condition is produced by an
; icmp with one of the sides being a loop varying non-AddRec expression.
; However, there is a possibility to normalize this to an AddRec expression
; using SCEV predicates. This allows us to compute a 'guarded' backedge count.
; The Loop Vectorizer is able to version the loop in order to use this guarded
; backedge count and vectorize more loops.
; CHECK-LABEL: test_sge
; CHECK-LABEL: vector.scevcheck
; CHECK-LABEL: vector.body
; A[i] = B[i] * C[i], with an i16 induction variable zero-extended to i32 and
; an 'sge' backedge condition on the extended value — SCEV cannot compute the
; backedge count directly and must version the loop with a predicate
; (CHECK expects a vector.scevcheck block).
define void @test_sge(i32* noalias %A,
i32* noalias %B,
i32* noalias %C, i32 %N) {
entry:
; Guard: nothing to do when N == 0.
%cmp13 = icmp eq i32 %N, 0
br i1 %cmp13, label %for.end, label %for.body.preheader
for.body.preheader:
br label %for.body
for.body:
; i16 induction; the i32 value used for addressing/exit is its zext.
%indvars.iv = phi i16 [ %indvars.next, %for.body ], [ 0, %for.body.preheader ]
%indvars.next = add i16 %indvars.iv, 1
%indvars.ext = zext i16 %indvars.iv to i32
%arrayidx = getelementptr inbounds i32, i32* %B, i32 %indvars.ext
%0 = load i32, i32* %arrayidx, align 4
%arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %indvars.ext
%1 = load i32, i32* %arrayidx3, align 4
%mul4 = mul i32 %1, %0
%arrayidx7 = getelementptr inbounds i32, i32* %A, i32 %indvars.ext
store i32 %mul4, i32* %arrayidx7, align 4
; Signed >= comparison on the zext'd value forms the backedge condition.
%exitcond = icmp sge i32 %indvars.ext, %N
br i1 %exitcond, label %for.end.loopexit, label %for.body
for.end.loopexit:
br label %for.end
for.end:
ret void
}
; CHECK-LABEL: test_uge
; CHECK-LABEL: vector.scevcheck
; CHECK-LABEL: vector.body
; A[Offset+i] = B[Offset+i] + C[Offset+i], with an i16 induction variable
; SIGN-extended to i32 and a 'uge' backedge condition — another shape where
; the backedge count needs an SCEV predicate (CHECK expects vector.scevcheck).
define void @test_uge(i32* noalias %A,
i32* noalias %B,
i32* noalias %C, i32 %N, i32 %Offset) {
entry:
; Guard: nothing to do when N == 0.
%cmp13 = icmp eq i32 %N, 0
br i1 %cmp13, label %for.end, label %for.body.preheader
for.body.preheader:
br label %for.body
for.body:
; i16 induction; addressing uses Offset + sext(iv).
%indvars.iv = phi i16 [ %indvars.next, %for.body ], [ 0, %for.body.preheader ]
%indvars.next = add i16 %indvars.iv, 1
%indvars.ext = sext i16 %indvars.iv to i32
%indvars.access = add i32 %Offset, %indvars.ext
%arrayidx = getelementptr inbounds i32, i32* %B, i32 %indvars.access
%0 = load i32, i32* %arrayidx, align 4
%arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %indvars.access
%1 = load i32, i32* %arrayidx3, align 4
%mul4 = add i32 %1, %0
%arrayidx7 = getelementptr inbounds i32, i32* %A, i32 %indvars.access
store i32 %mul4, i32* %arrayidx7, align 4
; Unsigned >= comparison on the sext'd value forms the backedge condition.
%exitcond = icmp uge i32 %indvars.ext, %N
br i1 %exitcond, label %for.end.loopexit, label %for.body
for.end.loopexit:
br label %for.end
for.end:
ret void
}
; CHECK-LABEL: test_ule
; CHECK-LABEL: vector.scevcheck
; CHECK-LABEL: vector.body
; A[i] = B[i] * C[i] with a DOWN-counting i16 induction variable starting at
; %M, zero-extended to i32, and a 'ule' continue/exit comparison against %N —
; backedge count again requires an SCEV predicate (CHECK: vector.scevcheck).
define void @test_ule(i32* noalias %A,
i32* noalias %B,
i32* noalias %C, i32 %N,
i16 %M) {
entry:
; Guard: nothing to do when N == 0.
%cmp13 = icmp eq i32 %N, 0
br i1 %cmp13, label %for.end, label %for.body.preheader
for.body.preheader:
br label %for.body
for.body:
; i16 induction counting down from %M; addressing uses its zext.
%indvars.iv = phi i16 [ %indvars.next, %for.body ], [ %M, %for.body.preheader ]
%indvars.next = sub i16 %indvars.iv, 1
%indvars.ext = zext i16 %indvars.iv to i32
%arrayidx = getelementptr inbounds i32, i32* %B, i32 %indvars.ext
%0 = load i32, i32* %arrayidx, align 4
%arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %indvars.ext
%1 = load i32, i32* %arrayidx3, align 4
%mul4 = mul i32 %1, %0
%arrayidx7 = getelementptr inbounds i32, i32* %A, i32 %indvars.ext
store i32 %mul4, i32* %arrayidx7, align 4
; Unsigned <= comparison on the zext'd value forms the backedge condition.
%exitcond = icmp ule i32 %indvars.ext, %N
br i1 %exitcond, label %for.end.loopexit, label %for.body
for.end.loopexit:
br label %for.end
for.end:
ret void
}
; CHECK-LABEL: test_sle
; CHECK-LABEL: vector.scevcheck
; CHECK-LABEL: vector.body
; Same shape as @test_ule but with SIGN extension of the down-counting i16
; induction variable and an 'sle' comparison — the last of the four
; predicate-requiring comparison kinds (CHECK: vector.scevcheck).
define void @test_sle(i32* noalias %A,
i32* noalias %B,
i32* noalias %C, i32 %N,
i16 %M) {
entry:
; Guard: nothing to do when N == 0.
%cmp13 = icmp eq i32 %N, 0
br i1 %cmp13, label %for.end, label %for.body.preheader
for.body.preheader:
br label %for.body
for.body:
; i16 induction counting down from %M; addressing uses its sext.
%indvars.iv = phi i16 [ %indvars.next, %for.body ], [ %M, %for.body.preheader ]
%indvars.next = sub i16 %indvars.iv, 1
%indvars.ext = sext i16 %indvars.iv to i32
%arrayidx = getelementptr inbounds i32, i32* %B, i32 %indvars.ext
%0 = load i32, i32* %arrayidx, align 4
%arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %indvars.ext
%1 = load i32, i32* %arrayidx3, align 4
%mul4 = mul i32 %1, %0
%arrayidx7 = getelementptr inbounds i32, i32* %A, i32 %indvars.ext
store i32 %mul4, i32* %arrayidx7, align 4
; Signed <= comparison on the sext'd value forms the backedge condition.
%exitcond = icmp sle i32 %indvars.ext, %N
br i1 %exitcond, label %for.end.loopexit, label %for.body
for.end.loopexit:
br label %for.end
for.end:
ret void
}

View File

@ -0,0 +1,54 @@
; RUN: opt -S < %s -loop-vectorize -instcombine 2>&1 | FileCheck %s
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "aarch64"
;; See https://llvm.org/bugs/show_bug.cgi?id=25490
;; Due to the data structures used, the LLVM IR was not deterministic.
;; This test comes from the PR.
;; CHECK-LABEL: @test(
; CHECK: load <16 x i8>
; CHECK-NEXT: getelementptr
; CHECK-NEXT: bitcast
; CHECK-NEXT: load <16 x i8>
; CHECK-NEXT: zext <16 x i8>
; CHECK-NEXT: zext <16 x i8>
; Regression test for PR25490 (non-deterministic IR output; see link above):
; per element, a[i] = (a[i]*c[i]) >> 8 and b[i] = (b[i]*c[i]) >> 8 in u8,
; via zext to i32, mul, lshr 8, trunc. CHECK pins the exact order of the
; widened <16 x i8> loads and zexts so the output is deterministic.
define void @test(i32 %n, i8* nocapture %a, i8* nocapture %b, i8* nocapture readonly %c) {
entry:
; Guard: nothing to do when n == 0.
%cmp.28 = icmp eq i32 %n, 0
br i1 %cmp.28, label %for.cond.cleanup, label %for.body.preheader
for.body.preheader: ; preds = %entry
br label %for.body
for.cond.cleanup.loopexit: ; preds = %for.body
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
ret void
for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
%arrayidx = getelementptr inbounds i8, i8* %c, i64 %indvars.iv
%0 = load i8, i8* %arrayidx, align 1
%conv = zext i8 %0 to i32
; a[i] = (a[i] * c[i]) >> 8, computed in i32 and truncated back to i8.
%arrayidx2 = getelementptr inbounds i8, i8* %a, i64 %indvars.iv
%1 = load i8, i8* %arrayidx2, align 1
%conv3 = zext i8 %1 to i32
%mul = mul nuw nsw i32 %conv3, %conv
%shr.26 = lshr i32 %mul, 8
%conv4 = trunc i32 %shr.26 to i8
store i8 %conv4, i8* %arrayidx2, align 1
; b[i] = (b[i] * c[i]) >> 8, same pattern against the second array.
%arrayidx8 = getelementptr inbounds i8, i8* %b, i64 %indvars.iv
%2 = load i8, i8* %arrayidx8, align 1
%conv9 = zext i8 %2 to i32
%mul10 = mul nuw nsw i32 %conv9, %conv
%shr11.27 = lshr i32 %mul10, 8
%conv12 = trunc i32 %shr11.27 to i8
store i8 %conv12, i8* %arrayidx8, align 1
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
; Exit when the truncated next induction value equals %n.
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %n
br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
}

View File

@ -0,0 +1,85 @@
; RUN: opt -loop-vectorize -mtriple=arm64-apple-ios -S -mcpu=cyclone -enable-interleaved-mem-accesses=false < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
@kernel = global [512 x float] zeroinitializer, align 16
@kernel2 = global [512 x float] zeroinitializer, align 16
@kernel3 = global [512 x float] zeroinitializer, align 16
@kernel4 = global [512 x float] zeroinitializer, align 16
@src_data = global [1536 x float] zeroinitializer, align 16
@r_ = global i8 0, align 1
@g_ = global i8 0, align 1
@b_ = global i8 0, align 1
; We don't want to vectorize most loops containing gathers because they are
; expensive.
; Make sure we don't vectorize it.
; CHECK-NOT: x float>
; Gather-heavy kernel (see comment above): src_data is read at stride 3
; (elements 3v, 3v+1, 3v+2 of each triple), multiplied through four kernel
; tables, and reduced into three float accumulators (r/g/b). CHECK-NOT
; requires that no vector types appear — the gathers make vectorization
; unprofitable here.
define void @_Z4testmm(i64 %size, i64 %offset) {
entry:
; Guard: skip everything when size == 0.
%cmp53 = icmp eq i64 %size, 0
br i1 %cmp53, label %for.end, label %for.body.lr.ph
for.body.lr.ph:
br label %for.body
for.body:
; Three running float sums plus the i64 counter %v.055.
%r.057 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add10, %for.body ]
%g.056 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add20, %for.body ]
%v.055 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
%b.054 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add30, %for.body ]
; Strided base index into src_data: 3 * (v + offset).
%add = add i64 %v.055, %offset
%mul = mul i64 %add, 3
%arrayidx = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 %mul
%0 = load float, float* %arrayidx, align 4
; r += src[3m] * kernel[v] * kernel2[v] * kernel3[v] * kernel4[v]
%arrayidx2 = getelementptr inbounds [512 x float], [512 x float]* @kernel, i64 0, i64 %v.055
%1 = load float, float* %arrayidx2, align 4
%mul3 = fmul fast float %0, %1
%arrayidx4 = getelementptr inbounds [512 x float], [512 x float]* @kernel2, i64 0, i64 %v.055
%2 = load float, float* %arrayidx4, align 4
%mul5 = fmul fast float %mul3, %2
%arrayidx6 = getelementptr inbounds [512 x float], [512 x float]* @kernel3, i64 0, i64 %v.055
%3 = load float, float* %arrayidx6, align 4
%mul7 = fmul fast float %mul5, %3
%arrayidx8 = getelementptr inbounds [512 x float], [512 x float]* @kernel4, i64 0, i64 %v.055
%4 = load float, float* %arrayidx8, align 4
%mul9 = fmul fast float %mul7, %4
%add10 = fadd fast float %r.057, %mul9
; g += src[3m+1] * (same four kernel factors)
%arrayidx.sum = add i64 %mul, 1
%arrayidx11 = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 %arrayidx.sum
%5 = load float, float* %arrayidx11, align 4
%mul13 = fmul fast float %1, %5
%mul15 = fmul fast float %2, %mul13
%mul17 = fmul fast float %3, %mul15
%mul19 = fmul fast float %4, %mul17
%add20 = fadd fast float %g.056, %mul19
; b += src[3m+2] * (same four kernel factors)
%arrayidx.sum52 = add i64 %mul, 2
%arrayidx21 = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 %arrayidx.sum52
%6 = load float, float* %arrayidx21, align 4
%mul23 = fmul fast float %1, %6
%mul25 = fmul fast float %2, %mul23
%mul27 = fmul fast float %3, %mul25
%mul29 = fmul fast float %4, %mul27
%add30 = fadd fast float %b.054, %mul29
%inc = add i64 %v.055, 1
; Run %size iterations.
%exitcond = icmp ne i64 %inc, %size
br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
for.cond.for.end_crit_edge:
; LCSSA phis for the three sums, converted to u8 for the global stores.
%add30.lcssa = phi float [ %add30, %for.body ]
%add20.lcssa = phi float [ %add20, %for.body ]
%add10.lcssa = phi float [ %add10, %for.body ]
%phitmp = fptoui float %add10.lcssa to i8
%phitmp60 = fptoui float %add20.lcssa to i8
%phitmp61 = fptoui float %add30.lcssa to i8
br label %for.end
for.end:
; Zero when the loop never ran, otherwise the converted accumulators.
%r.0.lcssa = phi i8 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
%g.0.lcssa = phi i8 [ %phitmp60, %for.cond.for.end_crit_edge ], [ 0, %entry ]
%b.0.lcssa = phi i8 [ %phitmp61, %for.cond.for.end_crit_edge ], [ 0, %entry ]
store i8 %r.0.lcssa, i8* @r_, align 1
store i8 %g.0.lcssa, i8* @g_, align 1
store i8 %b.0.lcssa, i8* @b_, align 1
ret void
}

View File

@ -0,0 +1,30 @@
; RUN: opt < %s -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize -S | FileCheck %s
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnu"
; CHECK-LABEL: @non_primary_iv_trunc_free(
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 5
; CHECK-NEXT: [[INDUCTION:%.*]] = add i64 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[INDUCTION1:%.*]] = add i64 [[OFFSET_IDX]], 5
; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[INDUCTION]] to i32
; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[INDUCTION1]] to i32
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
;
; Loop with an i64 induction variable stepping by 5 and a truncation of it to
; i32 (%tmp0) — a non-primary IV use. With VF=1/IC=2, the CHECK lines expect
; the truncs to be emitted per unrolled lane from scalar inductions.
define void @non_primary_iv_trunc_free(i64 %n) {
entry:
br label %for.body
for.body:
%i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
; Truncated copy of the induction variable (its only in-loop use).
%tmp0 = trunc i64 %i to i32
%i.next = add nuw nsw i64 %i, 5
%cond = icmp slt i64 %i.next, %n
br i1 %cond, label %for.body, label %for.end
for.end:
ret void
}

View File

@ -0,0 +1,38 @@
; REQUIRES: asserts
; RUN: opt < %s -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -S --debug-only=loop-vectorize 2>&1 | FileCheck %s
; This test shows extremely high interleaving cost that, probably, should be fixed.
; Due to the high cost, interleaving is not beneficial and the cost model chooses to scalarize
; the load instructions.
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnu"
%pair = type { i8, i8 }
; CHECK-LABEL: test
; CHECK: Found an estimated cost of 20 for VF 2 For instruction: {{.*}} load i8
; CHECK: Found an estimated cost of 0 for VF 2 For instruction: {{.*}} load i8
; CHECK: vector.body
; CHECK: load i8
; CHECK: load i8
; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
; Loads both i8 fields of each %pair element per iteration (an interleaved
; access group). The COST checks above expect the cost model to price the
; group on the first load and scalarize rather than emit vector loads.
define void @test(%pair* %p, i64 %n) {
entry:
br label %for.body
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
; Field 0 and field 1 of p[i]; both values are otherwise unused.
%tmp0 = getelementptr %pair, %pair* %p, i64 %i, i32 0
%tmp1 = load i8, i8* %tmp0, align 1
%tmp2 = getelementptr %pair, %pair* %p, i64 %i, i32 1
%tmp3 = load i8, i8* %tmp2, align 1
%i.next = add nuw nsw i64 %i, 1
%cond = icmp eq i64 %i.next, %n
br i1 %cond, label %for.end, label %for.body
for.end:
ret void
}

View File

@ -0,0 +1,189 @@
; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_2
; RUN: opt -loop-vectorize -force-vector-width=4 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_4
; RUN: opt -loop-vectorize -force-vector-width=8 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8
; RUN: opt -loop-vectorize -force-vector-width=16 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16
; REQUIRES: asserts
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnueabi"
%i8.2 = type {i8, i8}
; Factor-2 interleaved access over {i8, i8}: each iteration loads and stores
; both fields of %data[i]. The VF_* comments below pin the per-instruction
; interleave-group costs the AArch64 cost model is expected to report.
define void @i8_factor_2(%i8.2* %data, i64 %n) {
entry:
br label %for.body
; VF_8-LABEL: Checking a loop in "i8_factor_2"
; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp0, align 1
; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i8 0, i8* %tmp1, align 1
; VF_16-LABEL: Checking a loop in "i8_factor_2"
; VF_16: Found an estimated cost of 2 for VF 16 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1
; VF_16-NEXT: Found an estimated cost of 2 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1
for.body:
; %tmp0/%tmp1 address fields 0 and 1 of element %i; both are loaded and
; then overwritten with 0, so the group has no gaps.
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 0
%tmp1 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 1
%tmp2 = load i8, i8* %tmp0, align 1
%tmp3 = load i8, i8* %tmp1, align 1
store i8 0, i8* %tmp0, align 1
store i8 0, i8* %tmp1, align 1
%i.next = add nuw nsw i64 %i, 1
%cond = icmp slt i64 %i.next, %n
br i1 %cond, label %for.body, label %for.end
for.end:
ret void
}
%i16.2 = type {i16, i16}
; Same factor-2 interleaved pattern as @i8_factor_2, over {i16, i16}.
; The VF_* comments pin the expected interleave-group costs; note the cost
; doubles at VF=16 (4 vs 2), where the group no longer fits one vector.
define void @i16_factor_2(%i16.2* %data, i64 %n) {
entry:
br label %for.body
; VF_4-LABEL: Checking a loop in "i16_factor_2"
; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2
; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2
; VF_8-LABEL: Checking a loop in "i16_factor_2"
; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2
; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2
; VF_16-LABEL: Checking a loop in "i16_factor_2"
; VF_16: Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2
; VF_16-NEXT: Found an estimated cost of 4 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 0
%tmp1 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 1
%tmp2 = load i16, i16* %tmp0, align 2
%tmp3 = load i16, i16* %tmp1, align 2
store i16 0, i16* %tmp0, align 2
store i16 0, i16* %tmp1, align 2
%i.next = add nuw nsw i64 %i, 1
%cond = icmp slt i64 %i.next, %n
br i1 %cond, label %for.body, label %for.end
for.end:
ret void
}
%i32.2 = type {i32, i32}
; Same factor-2 interleaved pattern, over {i32, i32}. Expected costs scale
; with VF once the two-field group exceeds a single vector register
; (2 at VF 2/4, then 4 at VF 8, 8 at VF 16).
define void @i32_factor_2(%i32.2* %data, i64 %n) {
entry:
br label %for.body
; VF_2-LABEL: Checking a loop in "i32_factor_2"
; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4
; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4
; VF_4-LABEL: Checking a loop in "i32_factor_2"
; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4
; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4
; VF_8-LABEL: Checking a loop in "i32_factor_2"
; VF_8: Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4
; VF_8-NEXT: Found an estimated cost of 4 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4
; VF_16-LABEL: Checking a loop in "i32_factor_2"
; VF_16: Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4
; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 0
%tmp1 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 1
%tmp2 = load i32, i32* %tmp0, align 4
%tmp3 = load i32, i32* %tmp1, align 4
store i32 0, i32* %tmp0, align 4
store i32 0, i32* %tmp1, align 4
%i.next = add nuw nsw i64 %i, 1
%cond = icmp slt i64 %i.next, %n
br i1 %cond, label %for.body, label %for.end
for.end:
ret void
}
%i64.2 = type {i64, i64}
; Same factor-2 interleaved pattern, over {i64, i64}. With the widest element
; type the expected group cost doubles at every VF step (2/4/8/16).
define void @i64_factor_2(%i64.2* %data, i64 %n) {
entry:
br label %for.body
; VF_2-LABEL: Checking a loop in "i64_factor_2"
; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
; VF_4-LABEL: Checking a loop in "i64_factor_2"
; VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp0, align 8
; VF_4-NEXT: Found an estimated cost of 4 for VF 4 For instruction: store i64 0, i64* %tmp1, align 8
; VF_8-LABEL: Checking a loop in "i64_factor_2"
; VF_8: Found an estimated cost of 8 for VF 8 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp0, align 8
; VF_8-NEXT: Found an estimated cost of 8 for VF 8 For instruction: store i64 0, i64* %tmp1, align 8
; VF_16-LABEL: Checking a loop in "i64_factor_2"
; VF_16: Found an estimated cost of 16 for VF 16 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp0, align 8
; VF_16-NEXT: Found an estimated cost of 16 for VF 16 For instruction: store i64 0, i64* %tmp1, align 8
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %i64.2, %i64.2* %data, i64 %i, i32 0
%tmp1 = getelementptr inbounds %i64.2, %i64.2* %data, i64 %i, i32 1
%tmp2 = load i64, i64* %tmp0, align 8
%tmp3 = load i64, i64* %tmp1, align 8
store i64 0, i64* %tmp0, align 8
store i64 0, i64* %tmp1, align 8
%i.next = add nuw nsw i64 %i, 1
%cond = icmp slt i64 %i.next, %n
br i1 %cond, label %for.body, label %for.end
for.end:
ret void
}
%i64.8 = type {i64, i64, i64, i64, i64, i64, i64, i64}
define void @i64_factor_8(%i64.8* %data, i64 %n) {
entry:
br label %for.body
; The interleave factor in this test is 8, which is greater than the maximum
; allowed factor for AArch64 (4). Thus, we will fall back to the basic TTI
; implementation for determining the cost of the interleaved load group. The
; stores do not form a legal interleaved group because the group would contain
; gaps.
;
; VF_2-LABEL: Checking a loop in "i64_factor_8"
; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
; VF_2-NEXT: Found an estimated cost of 7 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
; VF_2-NEXT: Found an estimated cost of 7 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
for.body:
; Only fields 2 and 6 of the 8-field struct are accessed, so the accesses
; leave gaps within each 8-element stride (see the comment above).
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %i64.8, %i64.8* %data, i64 %i, i32 2
%tmp1 = getelementptr inbounds %i64.8, %i64.8* %data, i64 %i, i32 6
%tmp2 = load i64, i64* %tmp0, align 8
%tmp3 = load i64, i64* %tmp1, align 8
store i64 0, i64* %tmp0, align 8
store i64 0, i64* %tmp1, align 8
%i.next = add nuw nsw i64 %i, 1
%cond = icmp slt i64 %i.next, %n
br i1 %cond, label %for.body, label %for.end
for.end:
ret void
}

View File

@ -0,0 +1,5 @@
# Lit local config: run the .ll tests in this directory only when the
# AArch64 target was built into the tools under test.
config.suffixes = ['.ll']

# PEP 8 idiom: use "x not in y" rather than "not x in y".
if 'AArch64' not in config.root.targets:
    config.unsupported = True

View File

@ -0,0 +1,310 @@
; RUN: opt -S < %s -basicaa -loop-vectorize -force-vector-interleave=1 2>&1 | FileCheck %s
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "aarch64"
; CHECK-LABEL: @add_a(
; CHECK: load <16 x i8>, <16 x i8>*
; CHECK: add <16 x i8>
; CHECK: store <16 x i8>
; Function Attrs: nounwind
; q[i] = (i8)((i32)p[i] + 2): the i8 value is widened to i32, incremented,
; and truncated back. Per the CHECK lines above, the vectorizer is expected
; to shrink the computation back to <16 x i8> ops.
define void @add_a(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
entry:
; Guard: skip the loop entirely when %len <= 0.
%cmp8 = icmp sgt i32 %len, 0
br i1 %cmp8, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body, %entry
ret void
for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
%0 = load i8, i8* %arrayidx
%conv = zext i8 %0 to i32
%add = add nuw nsw i32 %conv, 2
%conv1 = trunc i32 %add to i8
%arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
store i8 %conv1, i8* %arrayidx3
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %len
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
; Ensure that we preserve nuw/nsw if we're not shrinking the values we're
; working with.
; CHECK-LABEL: @add_a1(
; CHECK: load <16 x i8>, <16 x i8>*
; CHECK: add nuw nsw <16 x i8>
; CHECK: store <16 x i8>
; Function Attrs: nounwind
; Like @add_a but the add is performed directly in i8 with nuw/nsw; since no
; shrinking is needed, the CHECK lines above require the flags to be
; preserved on the vector add (<16 x i8> add nuw nsw).
define void @add_a1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
entry:
%cmp8 = icmp sgt i32 %len, 0
br i1 %cmp8, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body, %entry
ret void
for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
%0 = load i8, i8* %arrayidx
%add = add nuw nsw i8 %0, 2
%arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
store i8 %add, i8* %arrayidx3
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %len
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
; CHECK-LABEL: @add_b(
; CHECK: load <8 x i16>, <8 x i16>*
; CHECK: add <8 x i16>
; CHECK: store <8 x i16>
; Function Attrs: nounwind
; i16 variant of @add_a: q[i] = (i16)((i32)p[i] + 2); expected to shrink to
; <8 x i16> ops per the CHECK lines above.
define void @add_b(i16* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
entry:
%cmp9 = icmp sgt i32 %len, 0
br i1 %cmp9, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body, %entry
ret void
for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
%0 = load i16, i16* %arrayidx
%conv8 = zext i16 %0 to i32
%add = add nuw nsw i32 %conv8, 2
%conv1 = trunc i32 %add to i16
%arrayidx3 = getelementptr inbounds i16, i16* %q, i64 %indvars.iv
store i16 %conv1, i16* %arrayidx3
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %len
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
; CHECK-LABEL: @add_c(
; CHECK: load <8 x i8>, <8 x i8>*
; CHECK: add <8 x i16>
; CHECK: store <8 x i16>
; Function Attrs: nounwind
; Mixed-width case: loads i8, computes in i32, stores i16. Per the CHECK
; lines above the loop should use an <8 x i8> load, an <8 x i16> add, and an
; <8 x i16> store — shrinking only down to the widest needed type (i16).
define void @add_c(i8* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
entry:
%cmp8 = icmp sgt i32 %len, 0
br i1 %cmp8, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body, %entry
ret void
for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
%0 = load i8, i8* %arrayidx
%conv = zext i8 %0 to i32
%add = add nuw nsw i32 %conv, 2
%conv1 = trunc i32 %add to i16
%arrayidx3 = getelementptr inbounds i16, i16* %q, i64 %indvars.iv
store i16 %conv1, i16* %arrayidx3
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %len
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
; CHECK-LABEL: @add_d(
; CHECK: load <4 x i16>
; CHECK: add nsw <4 x i32>
; CHECK: store <4 x i32>
; Widening case with a signed source: sext i16 -> i32, add, store i32. The
; i32 result is stored as-is, so per the CHECK lines above no shrinking
; occurs (<4 x i16> load, add nsw <4 x i32>, <4 x i32> store).
define void @add_d(i16* noalias nocapture readonly %p, i32* noalias nocapture %q, i32 %len) #0 {
entry:
%cmp7 = icmp sgt i32 %len, 0
br i1 %cmp7, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body, %entry
ret void
for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
%0 = load i16, i16* %arrayidx
%conv = sext i16 %0 to i32
%add = add nsw i32 %conv, 2
%arrayidx2 = getelementptr inbounds i32, i32* %q, i64 %indvars.iv
store i32 %add, i32* %arrayidx2
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %len
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
; CHECK-LABEL: @add_e(
; CHECK: load <16 x i8>
; CHECK: shl <16 x i8>
; CHECK: add <16 x i8>
; CHECK: or <16 x i8>
; CHECK: mul <16 x i8>
; CHECK: and <16 x i8>
; CHECK: xor <16 x i8>
; CHECK: mul <16 x i8>
; CHECK: store <16 x i8>
; A chain of bit operations (shl/add/or/mul/and/xor/mul) on a zext'd i8,
; truncated back to i8 for the store. The CHECK lines above require the
; whole chain to be shrunk to <16 x i8> vector ops.
define void @add_e(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
entry:
%cmp.32 = icmp sgt i32 %len, 0
br i1 %cmp.32, label %for.body.lr.ph, label %for.cond.cleanup
for.body.lr.ph: ; preds = %entry
; Loop-invariant widenings of the two scalar i8 arguments.
%conv11 = zext i8 %arg2 to i32
%conv13 = zext i8 %arg1 to i32
br label %for.body
for.cond.cleanup: ; preds = %for.body, %entry
ret void
for.body: ; preds = %for.body, %for.body.lr.ph
%indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
%0 = load i8, i8* %arrayidx
%conv = zext i8 %0 to i32
%add = shl i32 %conv, 4
%conv2 = add nuw nsw i32 %add, 32
%or = or i32 %conv, 51
%mul = mul nuw nsw i32 %or, 60
%and = and i32 %conv2, %conv13
%mul.masked = and i32 %mul, 252
%conv17 = xor i32 %mul.masked, %conv11
%mul18 = mul nuw nsw i32 %conv17, %and
%conv19 = trunc i32 %mul18 to i8
%arrayidx21 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
store i8 %conv19, i8* %arrayidx21
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %len
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
; CHECK-LABEL: @add_f
; CHECK: load <8 x i16>
; CHECK: trunc <8 x i16>
; CHECK: shl <8 x i8>
; CHECK: add <8 x i8>
; CHECK: or <8 x i8>
; CHECK: mul <8 x i8>
; CHECK: and <8 x i8>
; CHECK: xor <8 x i8>
; CHECK: mul <8 x i8>
; CHECK: store <8 x i8>
; Like @add_e but the source is an i16 sign-extended to i32; per the CHECK
; lines above the loop is expected to use an <8 x i16> load, truncate to
; <8 x i8>, and perform the whole bit-op chain in <8 x i8>.
define void @add_f(i16* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
entry:
%cmp.32 = icmp sgt i32 %len, 0
br i1 %cmp.32, label %for.body.lr.ph, label %for.cond.cleanup
for.body.lr.ph: ; preds = %entry
; Loop-invariant widenings of the two scalar i8 arguments.
%conv11 = zext i8 %arg2 to i32
%conv13 = zext i8 %arg1 to i32
br label %for.body
for.cond.cleanup: ; preds = %for.body, %entry
ret void
for.body: ; preds = %for.body, %for.body.lr.ph
%indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
%0 = load i16, i16* %arrayidx
%conv = sext i16 %0 to i32
%add = shl i32 %conv, 4
%conv2 = add nsw i32 %add, 32
%or = and i32 %conv, 204
%conv8 = or i32 %or, 51
%mul = mul nuw nsw i32 %conv8, 60
%and = and i32 %conv2, %conv13
%mul.masked = and i32 %mul, 252
%conv17 = xor i32 %mul.masked, %conv11
%mul18 = mul nuw nsw i32 %conv17, %and
%conv19 = trunc i32 %mul18 to i8
%arrayidx21 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
store i8 %conv19, i8* %arrayidx21
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %len
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
; CHECK-LABEL: @add_phifail(
; CHECK: load <16 x i8>, <16 x i8>*
; CHECK: add nuw nsw <16 x i32>
; CHECK: store <16 x i8>
; Function Attrs: nounwind
; Like @add_a but %conv also feeds the first-order-recurrence phi %a_phi.
; Per the CHECK lines above, the add is expected to remain in <16 x i32>
; (no shrinking to i8) while the store is still <16 x i8>.
define void @add_phifail(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
entry:
%cmp8 = icmp sgt i32 %len, 0
br i1 %cmp8, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body, %entry
ret void
for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
; %a_phi carries the previous iteration's zext'd load (otherwise unused).
%a_phi = phi i32 [ %conv, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
%0 = load i8, i8* %arrayidx
%conv = zext i8 %0 to i32
%add = add nuw nsw i32 %conv, 2
%conv1 = trunc i32 %add to i8
%arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
store i8 %conv1, i8* %arrayidx3
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %len
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
; Function Attrs: nounwind
; When we vectorize this loop, we generate correct code
; even when %len exactly divides VF (since we extract from the second last index
; and pass this to the for.cond.cleanup block). Vectorized loop returns
; the correct value a_phi = p[len -2]
; Variant of @add_phifail that returns the recurrence value: the lcssa phi
; at for.cond.cleanup is %a_phi, i.e. the zext of p[len-2] (the value from
; the second-to-last iteration). The CHECK lines verify middle.block
; extracts lane 14 of 16 (%vector.recur.extract.for.phi) for the exit phi.
define i8 @add_phifail2(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
; CHECK-LABEL: @add_phifail2(
; CHECK: vector.body:
; CHECK: %wide.load = load <16 x i8>, <16 x i8>*
; CHECK: %[[L1:.+]] = zext <16 x i8> %wide.load to <16 x i32>
; CHECK: add nuw nsw <16 x i32>
; CHECK: store <16 x i8>
; CHECK: add i64 %index, 16
; CHECK: icmp eq i64 %index.next, %n.vec
; CHECK: middle.block:
; CHECK: %vector.recur.extract = extractelement <16 x i32> %[[L1]], i32 15
; CHECK: %vector.recur.extract.for.phi = extractelement <16 x i32> %[[L1]], i32 14
; CHECK: for.cond.cleanup:
; CHECK: %a_phi.lcssa = phi i32 [ %scalar.recur, %for.body ], [ %vector.recur.extract.for.phi, %middle.block ]
; CHECK: %ret = trunc i32 %a_phi.lcssa to i8
; CHECK: ret i8 %ret
entry:
br label %for.body
for.cond.cleanup: ; preds = %for.body, %entry
%ret = trunc i32 %a_phi to i8
ret i8 %ret
for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
%a_phi = phi i32 [ %conv, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
%0 = load i8, i8* %arrayidx
%conv = zext i8 %0 to i32
%add = add nuw nsw i32 %conv, 2
%conv1 = trunc i32 %add to i8
%arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
store i8 %conv1, i8* %arrayidx3
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %len
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
attributes #0 = { nounwind }

View File

@ -0,0 +1,56 @@
; RUN: opt < %s -force-vector-interleave=1 -store-to-load-forwarding-conflict-detection=false -loop-vectorize -dce -instcombine -S | FileCheck %s
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnu"
%struct.pair = type { i32, i32 }
; Check vectorization of interleaved access groups with positive dependence
; distances. In this test, the maximum safe dependence distance for
; vectorization is 16 bytes. Normally, this would lead to a maximum VF of 4.
; However, for interleaved groups, the effective VF is VF * IF, where IF is the
; interleave factor. Here, the maximum safe dependence distance is recomputed
; as 16 / IF bytes, resulting in VF=2. Since IF=2, we should generate <4 x i32>
; loads and stores instead of <8 x i32> accesses.
;
; Note: LAA's conflict detection optimization has to be disabled for this test
; to be vectorized.
; struct pair {
; int x;
; int y;
; };
;
; void max_vf(struct pair *restrict p) {
; for (int i = 0; i < 1000; i++) {
; p[i + 2].x = p[i].x
; p[i + 2].y = p[i].y
; }
; }
; CHECK-LABEL: @max_vf
; CHECK: load <4 x i32>
; CHECK: store <4 x i32>
; Copies p[i].{x,y} to p[i+2].{x,y} for i in [0, 1000): a dependence with
; distance 2 struct elements. Per the header comment, the interleaved
; groups cap the effective VF at 2 (IF=2), so the CHECK lines expect
; <4 x i32> loads/stores rather than <8 x i32>.
define void @max_vf(%struct.pair* noalias nocapture %p) {
entry:
br label %for.body
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%0 = add nuw nsw i64 %i, 2
%p_i.x = getelementptr inbounds %struct.pair, %struct.pair* %p, i64 %i, i32 0
%p_i_plus_2.x = getelementptr inbounds %struct.pair, %struct.pair* %p, i64 %0, i32 0
%1 = load i32, i32* %p_i.x, align 4
store i32 %1, i32* %p_i_plus_2.x, align 4
%p_i.y = getelementptr inbounds %struct.pair, %struct.pair* %p, i64 %i, i32 1
%p_i_plus_2.y = getelementptr inbounds %struct.pair, %struct.pair* %p, i64 %0, i32 1
%2 = load i32, i32* %p_i.y, align 4
store i32 %2, i32* %p_i_plus_2.y, align 4
%i.next = add nuw nsw i64 %i, 1
%cond = icmp eq i64 %i.next, 1000
br i1 %cond, label %for.exit, label %for.body
for.exit:
ret void
}

View File

@ -0,0 +1,49 @@
; REQUIRES: asserts
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -S -debug-only=loop-vectorize 2>&1 | FileCheck %s
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnu"
; CHECK-LABEL: all_scalar
; CHECK: LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2
; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2
; CHECK: LV: Not considering vector loop of width 2 because it will not generate any vector instructions
;
; Stride-2 store loop in which every instruction is scalarized (or an IV
; update); the CHECK lines above verify the vectorizer reports that a VF=2
; loop would generate no vector instructions and is therefore rejected.
define void @all_scalar(i64* %a, i64 %n) {
entry:
br label %for.body
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr i64, i64* %a, i64 %i
store i64 0, i64* %tmp0, align 1
%i.next = add nuw nsw i64 %i, 2
%cond = icmp eq i64 %i.next, %n
br i1 %cond, label %for.end, label %for.body
for.end:
ret void
}
; CHECK-LABEL: PR33193
; CHECK: LV: Found scalar instruction: %i.next = zext i32 %j.next to i64
; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction: %i.next = zext i32 %j.next to i64
; CHECK: LV: Not considering vector loop of width 8 because it will not generate any vector instructions
%struct.a = type { i32, i8 }
; PR33193: the i64 IV %i.next is a zext of a separate i32 counter %j.next.
; Per the CHECK lines above, the zext is scalar with cost 0, and the VF=8
; loop is rejected for generating no vector instructions.
define void @PR33193(%struct.a* %a, i64 %n) {
entry:
br label %for.body
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%j = phi i32 [ 0, %entry ], [ %j.next, %for.body ]
%tmp0 = getelementptr inbounds %struct.a, %struct.a* %a, i64 %i, i32 1
store i8 0, i8* %tmp0, align 4
%j.next = add i32 %j, 1
%i.next = zext i32 %j.next to i64
%cond = icmp ugt i64 %n, %i.next
br i1 %cond, label %for.body, label %for.end
for.end:
ret void
}

View File

@ -0,0 +1,37 @@
; RUN: opt -S -mtriple=aarch64-apple-ios -loop-vectorize -enable-interleaved-mem-accesses -force-vector-width=2 < %s | FileCheck %s
; Reproducer for address space fault in the LoopVectorizer (pr31900). Added
; different sized address space pointers (p:16:16-p4:32:16) to the aarch64
; datalayout to reproduce the fault.
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128-p:16:16-p4:32:16"
; Check that all the loads are scalarized
; CHECK: load i16, i16*
; CHECK: load i16, i16*
; CHECK: load i16, i16 addrspace(4)*
; CHECK: load i16, i16 addrspace(4)*
%rec1445 = type { i16, i16, i16, i16, i16 }
; pr31900 reproducer: walks two %rec1445 pointers in different address
; spaces (default, with 16-bit pointers per the datalayout, and addrspace(4)
; with 32-bit pointers), loading field 1 from each. The CHECK lines above
; verify all four loads are scalarized with the correct address spaces.
define void @foo() {
bb1:
br label %bb4
bb4:
%tmp1 = phi i16 [ undef, %bb1 ], [ %_tmp1013, %bb4 ]
%tmp2 = phi %rec1445* [ undef, %bb1 ], [ %_tmp1015, %bb4 ]
%tmp3 = phi %rec1445 addrspace(4)* [ undef, %bb1 ], [ %_tmp1017, %bb4 ]
%0 = getelementptr %rec1445, %rec1445* %tmp2, i16 0, i32 1
%_tmp987 = load i16, i16* %0, align 1
%1 = getelementptr %rec1445, %rec1445 addrspace(4)* %tmp3, i32 0, i32 1
%_tmp993 = load i16, i16 addrspace(4)* %1, align 1
%_tmp1013 = add i16 %tmp1, 1
%_tmp1015 = getelementptr %rec1445, %rec1445* %tmp2, i16 1
%_tmp1017 = getelementptr %rec1445, %rec1445 addrspace(4)* %tmp3, i32 1
%_tmp1019 = icmp ult i16 %_tmp1013, 24
br i1 %_tmp1019, label %bb4, label %bb16
bb16:
unreachable
}

View File

@ -0,0 +1,56 @@
; RUN: opt -S -mtriple=aarch64 -loop-vectorize -force-vector-width=2 < %s | FileCheck %s
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnu"
@b = common local_unnamed_addr global i32 0, align 4
@a = common local_unnamed_addr global i16* null, align 8
; Function Attrs: norecurse nounwind readonly
; Computes both the max (%c.0.) and min (%.sink28) of an i16 array via
; icmp+select recurrences, then returns (max > min). The CHECK-NOT below
; ensures no @llvm.experimental.vector.reduce intrinsics are emitted for
; these short (i16) min/max reductions.
define i32 @fn1() local_unnamed_addr #0 {
; Ensure that we don't emit reduction intrinsics for unsupported short reductions.
; CHECK-NOT: @llvm.experimental.vector.reduce
entry:
; Trip-count guard: only loop when @b > 0.
%0 = load i32, i32* @b, align 4, !tbaa !1
%cmp40 = icmp sgt i32 %0, 0
br i1 %cmp40, label %for.body.lr.ph, label %for.end
for.body.lr.ph: ; preds = %entry
%1 = load i16*, i16** @a, align 8, !tbaa !5
%2 = load i32, i32* @b, align 4, !tbaa !1
%3 = sext i32 %2 to i64
br label %for.body
for.body: ; preds = %for.body.lr.ph, %for.body
%indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
; %d.043 tracks the running min, %c.042 the running max (both start undef).
%d.043 = phi i16 [ undef, %for.body.lr.ph ], [ %.sink28, %for.body ]
%c.042 = phi i16 [ undef, %for.body.lr.ph ], [ %c.0., %for.body ]
%arrayidx = getelementptr inbounds i16, i16* %1, i64 %indvars.iv
%4 = load i16, i16* %arrayidx, align 2, !tbaa !7
%cmp2 = icmp sgt i16 %c.042, %4
%c.0. = select i1 %cmp2, i16 %c.042, i16 %4
%cmp13 = icmp slt i16 %d.043, %4
%.sink28 = select i1 %cmp13, i16 %d.043, i16 %4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%cmp = icmp slt i64 %indvars.iv.next, %3
br i1 %cmp, label %for.body, label %for.end
for.end: ; preds = %for.body, %entry
%c.0.lcssa = phi i16 [ undef, %entry ], [ %c.0., %for.body ]
%d.0.lcssa = phi i16 [ undef, %entry ], [ %.sink28, %for.body ]
%cmp26 = icmp sgt i16 %c.0.lcssa, %d.0.lcssa
%conv27 = zext i1 %cmp26 to i32
ret i32 %conv27
}
attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.ident = !{!0}
!0 = !{!"clang"}
!1 = !{!2, !2, i64 0}
!2 = !{!"int", !3, i64 0}
!3 = !{!"omnipotent char", !4, i64 0}
!4 = !{!"Simple C/C++ TBAA"}
!5 = !{!6, !6, i64 0}
!6 = !{!"any pointer", !3, i64 0}
!7 = !{!8, !8, i64 0}
!8 = !{!"short", !3, i64 0}

Some files were not shown because too many files have changed in this diff Show More