You've already forked linux-packaging-mono
Imported Upstream version 5.18.0.167
Former-commit-id: 289509151e0fee68a1b591a20c9f109c3c789d3a
This commit is contained in:
parent
e19d552987
commit
b084638f15
@ -1,330 +0,0 @@
|
||||
; RUN: opt -mtriple armv7-linux-gnueabihf -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
|
||||
; RUN: opt -mtriple armv8-linux-gnu -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
|
||||
; RUN: opt -mtriple armv7-unknown-darwin -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=DARWIN
|
||||
; REQUIRES: asserts
|
||||
|
||||
; Testing the ability of the loop vectorizer to tell when SIMD is safe or not
|
||||
; with regard to the IEEE 754 standard.
|
||||
; On Linux, we only want the vectorizer to work when -ffast-math flag is set,
|
||||
; because NEON is not IEEE compliant.
|
||||
; Darwin, on the other hand, doesn't support subnormals, and all optimizations
|
||||
; are allowed, even without -ffast-math.
|
||||
|
||||
; Integer loops are always vectorizable
|
||||
; CHECK: Checking a loop in "sumi"
|
||||
; CHECK: We can vectorize this loop!
|
||||
define void @sumi(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
|
||||
entry:
|
||||
%cmp5 = icmp eq i32 %N, 0
|
||||
br i1 %cmp5, label %for.end, label %for.body.preheader
|
||||
|
||||
for.body.preheader: ; preds = %entry
|
||||
br label %for.body
|
||||
|
||||
for.body: ; preds = %for.body.preheader, %for.body
|
||||
%i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
|
||||
%arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06
|
||||
%0 = load i32, i32* %arrayidx, align 4
|
||||
%arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06
|
||||
%1 = load i32, i32* %arrayidx1, align 4
|
||||
%mul = mul nsw i32 %1, %0
|
||||
%arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06
|
||||
store i32 %mul, i32* %arrayidx2, align 4
|
||||
%inc = add nuw nsw i32 %i.06, 1
|
||||
%exitcond = icmp eq i32 %inc, %N
|
||||
br i1 %exitcond, label %for.end.loopexit, label %for.body
|
||||
|
||||
for.end.loopexit: ; preds = %for.body
|
||||
br label %for.end
|
||||
|
||||
for.end: ; preds = %for.end.loopexit, %entry
|
||||
ret void
|
||||
}
|
||||
|
||||
; On Linux, floating-point loops need fast-math to be vectorizable (Darwin vectorizes them regardless)
|
||||
; LINUX: Checking a loop in "sumf"
|
||||
; LINUX: Potentially unsafe FP op prevents vectorization
|
||||
; DARWIN: Checking a loop in "sumf"
|
||||
; DARWIN: We can vectorize this loop!
|
||||
define void @sumf(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
|
||||
entry:
|
||||
%cmp5 = icmp eq i32 %N, 0
|
||||
br i1 %cmp5, label %for.end, label %for.body.preheader
|
||||
|
||||
for.body.preheader: ; preds = %entry
|
||||
br label %for.body
|
||||
|
||||
for.body: ; preds = %for.body.preheader, %for.body
|
||||
%i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
|
||||
%arrayidx = getelementptr inbounds float, float* %A, i32 %i.06
|
||||
%0 = load float, float* %arrayidx, align 4
|
||||
%arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06
|
||||
%1 = load float, float* %arrayidx1, align 4
|
||||
%mul = fmul float %0, %1
|
||||
%arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06
|
||||
store float %mul, float* %arrayidx2, align 4
|
||||
%inc = add nuw nsw i32 %i.06, 1
|
||||
%exitcond = icmp eq i32 %inc, %N
|
||||
br i1 %exitcond, label %for.end.loopexit, label %for.body
|
||||
|
||||
for.end.loopexit: ; preds = %for.body
|
||||
br label %for.end
|
||||
|
||||
for.end: ; preds = %for.end.loopexit, %entry
|
||||
ret void
|
||||
}
|
||||
|
||||
; Integer loops are always vectorizable
|
||||
; CHECK: Checking a loop in "redi"
|
||||
; CHECK: We can vectorize this loop!
|
||||
define i32 @redi(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
|
||||
entry:
|
||||
%cmp5 = icmp eq i32 %N, 0
|
||||
br i1 %cmp5, label %for.end, label %for.body.preheader
|
||||
|
||||
for.body.preheader: ; preds = %entry
|
||||
br label %for.body
|
||||
|
||||
for.body: ; preds = %for.body.preheader, %for.body
|
||||
%i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
|
||||
%Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]
|
||||
%arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07
|
||||
%0 = load i32, i32* %arrayidx, align 4
|
||||
%arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07
|
||||
%1 = load i32, i32* %arrayidx1, align 4
|
||||
%mul = mul nsw i32 %1, %0
|
||||
%add = add nsw i32 %mul, %Red.06
|
||||
%inc = add nuw nsw i32 %i.07, 1
|
||||
%exitcond = icmp eq i32 %inc, %N
|
||||
br i1 %exitcond, label %for.end.loopexit, label %for.body
|
||||
|
||||
for.end.loopexit: ; preds = %for.body
|
||||
%add.lcssa = phi i32 [ %add, %for.body ]
|
||||
br label %for.end
|
||||
|
||||
for.end: ; preds = %for.end.loopexit, %entry
|
||||
%Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
|
||||
ret i32 %Red.0.lcssa
|
||||
}
|
||||
|
||||
; On Linux, floating-point loops need fast-math to be vectorizable (Darwin vectorizes them regardless)
|
||||
; LINUX: Checking a loop in "redf"
|
||||
; LINUX: Potentially unsafe FP op prevents vectorization
|
||||
; DARWIN: Checking a loop in "redf"
|
||||
; DARWIN: We can vectorize this loop!
|
||||
define float @redf(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
|
||||
entry:
|
||||
%cmp5 = icmp eq i32 %N, 0
|
||||
br i1 %cmp5, label %for.end, label %for.body.preheader
|
||||
|
||||
for.body.preheader: ; preds = %entry
|
||||
br label %for.body
|
||||
|
||||
for.body: ; preds = %for.body.preheader, %for.body
|
||||
%i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
|
||||
%Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]
|
||||
%arrayidx = getelementptr inbounds float, float* %a, i32 %i.07
|
||||
%0 = load float, float* %arrayidx, align 4
|
||||
%arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07
|
||||
%1 = load float, float* %arrayidx1, align 4
|
||||
%mul = fmul float %0, %1
|
||||
%add = fadd float %Red.06, %mul
|
||||
%inc = add nuw nsw i32 %i.07, 1
|
||||
%exitcond = icmp eq i32 %inc, %N
|
||||
br i1 %exitcond, label %for.end.loopexit, label %for.body
|
||||
|
||||
for.end.loopexit: ; preds = %for.body
|
||||
%add.lcssa = phi float [ %add, %for.body ]
|
||||
br label %for.end
|
||||
|
||||
for.end: ; preds = %for.end.loopexit, %entry
|
||||
%Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
|
||||
ret float %Red.0.lcssa
|
||||
}
|
||||
|
||||
; Make sure calls that turn into builtins are also covered
|
||||
; LINUX: Checking a loop in "fabs"
|
||||
; LINUX: Potentially unsafe FP op prevents vectorization
|
||||
; DARWIN: Checking a loop in "fabs"
|
||||
; DARWIN: We can vectorize this loop!
|
||||
define void @fabs(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
|
||||
entry:
|
||||
%cmp10 = icmp eq i32 %N, 0
|
||||
br i1 %cmp10, label %for.end, label %for.body
|
||||
|
||||
for.body: ; preds = %entry, %for.body
|
||||
%i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
||||
%arrayidx = getelementptr inbounds float, float* %A, i32 %i.011
|
||||
%0 = load float, float* %arrayidx, align 4
|
||||
%arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.011
|
||||
%1 = load float, float* %arrayidx1, align 4
|
||||
%fabsf = tail call float @fabsf(float %1) #1
|
||||
%conv3 = fmul float %0, %fabsf
|
||||
%arrayidx4 = getelementptr inbounds float, float* %C, i32 %i.011
|
||||
store float %conv3, float* %arrayidx4, align 4
|
||||
%inc = add nuw nsw i32 %i.011, 1
|
||||
%exitcond = icmp eq i32 %inc, %N
|
||||
br i1 %exitcond, label %for.end, label %for.body
|
||||
|
||||
for.end: ; preds = %for.body, %entry
|
||||
ret void
|
||||
}
|
||||
|
||||
; Integer loops are always vectorizable
|
||||
; CHECK: Checking a loop in "sumi_fast"
|
||||
; CHECK: We can vectorize this loop!
|
||||
define void @sumi_fast(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
|
||||
entry:
|
||||
%cmp5 = icmp eq i32 %N, 0
|
||||
br i1 %cmp5, label %for.end, label %for.body.preheader
|
||||
|
||||
for.body.preheader: ; preds = %entry
|
||||
br label %for.body
|
||||
|
||||
for.body: ; preds = %for.body.preheader, %for.body
|
||||
%i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
|
||||
%arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06
|
||||
%0 = load i32, i32* %arrayidx, align 4
|
||||
%arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06
|
||||
%1 = load i32, i32* %arrayidx1, align 4
|
||||
%mul = mul nsw i32 %1, %0
|
||||
%arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06
|
||||
store i32 %mul, i32* %arrayidx2, align 4
|
||||
%inc = add nuw nsw i32 %i.06, 1
|
||||
%exitcond = icmp eq i32 %inc, %N
|
||||
br i1 %exitcond, label %for.end.loopexit, label %for.body
|
||||
|
||||
for.end.loopexit: ; preds = %for.body
|
||||
br label %for.end
|
||||
|
||||
for.end: ; preds = %for.end.loopexit, %entry
|
||||
ret void
|
||||
}
|
||||
|
||||
; Floating-point loops can be vectorized with fast-math
|
||||
; CHECK: Checking a loop in "sumf_fast"
|
||||
; CHECK: We can vectorize this loop!
|
||||
define void @sumf_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
|
||||
entry:
|
||||
%cmp5 = icmp eq i32 %N, 0
|
||||
br i1 %cmp5, label %for.end, label %for.body.preheader
|
||||
|
||||
for.body.preheader: ; preds = %entry
|
||||
br label %for.body
|
||||
|
||||
for.body: ; preds = %for.body.preheader, %for.body
|
||||
%i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
|
||||
%arrayidx = getelementptr inbounds float, float* %A, i32 %i.06
|
||||
%0 = load float, float* %arrayidx, align 4
|
||||
%arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06
|
||||
%1 = load float, float* %arrayidx1, align 4
|
||||
%mul = fmul fast float %1, %0
|
||||
%arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06
|
||||
store float %mul, float* %arrayidx2, align 4
|
||||
%inc = add nuw nsw i32 %i.06, 1
|
||||
%exitcond = icmp eq i32 %inc, %N
|
||||
br i1 %exitcond, label %for.end.loopexit, label %for.body
|
||||
|
||||
for.end.loopexit: ; preds = %for.body
|
||||
br label %for.end
|
||||
|
||||
for.end: ; preds = %for.end.loopexit, %entry
|
||||
ret void
|
||||
}
|
||||
|
||||
; Integer loops are always vectorizable
|
||||
; CHECK: Checking a loop in "redi_fast"
|
||||
; CHECK: We can vectorize this loop!
|
||||
define i32 @redi_fast(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
|
||||
entry:
|
||||
%cmp5 = icmp eq i32 %N, 0
|
||||
br i1 %cmp5, label %for.end, label %for.body.preheader
|
||||
|
||||
for.body.preheader: ; preds = %entry
|
||||
br label %for.body
|
||||
|
||||
for.body: ; preds = %for.body.preheader, %for.body
|
||||
%i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
|
||||
%Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]
|
||||
%arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07
|
||||
%0 = load i32, i32* %arrayidx, align 4
|
||||
%arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07
|
||||
%1 = load i32, i32* %arrayidx1, align 4
|
||||
%mul = mul nsw i32 %1, %0
|
||||
%add = add nsw i32 %mul, %Red.06
|
||||
%inc = add nuw nsw i32 %i.07, 1
|
||||
%exitcond = icmp eq i32 %inc, %N
|
||||
br i1 %exitcond, label %for.end.loopexit, label %for.body
|
||||
|
||||
for.end.loopexit: ; preds = %for.body
|
||||
%add.lcssa = phi i32 [ %add, %for.body ]
|
||||
br label %for.end
|
||||
|
||||
for.end: ; preds = %for.end.loopexit, %entry
|
||||
%Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
|
||||
ret i32 %Red.0.lcssa
|
||||
}
|
||||
|
||||
; Floating-point loops can be vectorized with fast-math
|
||||
; CHECK: Checking a loop in "redf_fast"
|
||||
; CHECK: We can vectorize this loop!
|
||||
define float @redf_fast(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
|
||||
entry:
|
||||
%cmp5 = icmp eq i32 %N, 0
|
||||
br i1 %cmp5, label %for.end, label %for.body.preheader
|
||||
|
||||
for.body.preheader: ; preds = %entry
|
||||
br label %for.body
|
||||
|
||||
for.body: ; preds = %for.body.preheader, %for.body
|
||||
%i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
|
||||
%Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]
|
||||
%arrayidx = getelementptr inbounds float, float* %a, i32 %i.07
|
||||
%0 = load float, float* %arrayidx, align 4
|
||||
%arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07
|
||||
%1 = load float, float* %arrayidx1, align 4
|
||||
%mul = fmul fast float %1, %0
|
||||
%add = fadd fast float %mul, %Red.06
|
||||
%inc = add nuw nsw i32 %i.07, 1
|
||||
%exitcond = icmp eq i32 %inc, %N
|
||||
br i1 %exitcond, label %for.end.loopexit, label %for.body
|
||||
|
||||
for.end.loopexit: ; preds = %for.body
|
||||
%add.lcssa = phi float [ %add, %for.body ]
|
||||
br label %for.end
|
||||
|
||||
for.end: ; preds = %for.end.loopexit, %entry
|
||||
%Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
|
||||
ret float %Red.0.lcssa
|
||||
}
|
||||
|
||||
; Make sure calls that turn into builtins are also covered
|
||||
; CHECK: Checking a loop in "fabs_fast"
|
||||
; CHECK: We can vectorize this loop!
|
||||
define void @fabs_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
|
||||
entry:
|
||||
%cmp10 = icmp eq i32 %N, 0
|
||||
br i1 %cmp10, label %for.end, label %for.body
|
||||
|
||||
for.body: ; preds = %entry, %for.body
|
||||
%i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
||||
%arrayidx = getelementptr inbounds float, float* %A, i32 %i.011
|
||||
%0 = load float, float* %arrayidx, align 4
|
||||
%arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.011
|
||||
%1 = load float, float* %arrayidx1, align 4
|
||||
%fabsf = tail call fast float @fabsf(float %1) #2
|
||||
%conv3 = fmul fast float %fabsf, %0
|
||||
%arrayidx4 = getelementptr inbounds float, float* %C, i32 %i.011
|
||||
store float %conv3, float* %arrayidx4, align 4
|
||||
%inc = add nuw nsw i32 %i.011, 1
|
||||
%exitcond = icmp eq i32 %inc, %N
|
||||
br i1 %exitcond, label %for.end, label %for.body
|
||||
|
||||
for.end: ; preds = %for.body, %entry
|
||||
ret void
|
||||
}
|
||||
|
||||
declare float @fabsf(float)
|
||||
|
||||
attributes #1 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #2 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="true" "use-soft-float"="false" }
|
@ -1,71 +0,0 @@
|
||||
; RUN: opt < %s -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -S | FileCheck %s
|
||||
; RUN: opt < %s -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift -S | FileCheck %s --check-prefix=SWIFT
|
||||
; RUN: opt < %s -loop-vectorize -force-vector-width=1 -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift -S | FileCheck %s --check-prefix=SWIFTUNROLL
|
||||
|
||||
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
|
||||
target triple = "thumbv7-apple-ios3.0.0"
|
||||
|
||||
;CHECK-LABEL: @foo(
|
||||
;CHECK: load <4 x i32>
|
||||
;CHECK-NOT: load <4 x i32>
|
||||
;CHECK: ret
|
||||
;SWIFT-LABEL: @foo(
|
||||
;SWIFT: load <4 x i32>
|
||||
;SWIFT: load <4 x i32>
|
||||
;SWIFT: ret
|
||||
define i32 @foo(i32* nocapture %A, i32 %n) nounwind readonly ssp {
|
||||
%1 = icmp sgt i32 %n, 0
|
||||
br i1 %1, label %.lr.ph, label %._crit_edge
|
||||
|
||||
.lr.ph: ; preds = %0, %.lr.ph
|
||||
%i.02 = phi i32 [ %5, %.lr.ph ], [ 0, %0 ]
|
||||
%sum.01 = phi i32 [ %4, %.lr.ph ], [ 0, %0 ]
|
||||
%2 = getelementptr inbounds i32, i32* %A, i32 %i.02
|
||||
%3 = load i32, i32* %2, align 4
|
||||
%4 = add nsw i32 %3, %sum.01
|
||||
%5 = add nsw i32 %i.02, 1
|
||||
%exitcond = icmp eq i32 %5, %n
|
||||
br i1 %exitcond, label %._crit_edge, label %.lr.ph
|
||||
|
||||
._crit_edge: ; preds = %.lr.ph, %0
|
||||
%sum.0.lcssa = phi i32 [ 0, %0 ], [ %4, %.lr.ph ]
|
||||
ret i32 %sum.0.lcssa
|
||||
}
|
||||
|
||||
; Verify the register limit. On ARM we don't have 16 allocatable registers.
|
||||
;SWIFTUNROLL-LABEL: @register_limit(
|
||||
;SWIFTUNROLL: load i32
|
||||
;SWIFTUNROLL-NOT: load i32
|
||||
; Register-pressure fixture: a single loop that loads A[i] once per iteration
; and adds the loaded value into six independent accumulators (%sum.01 ..
; %sum.06) next to the induction variable %i.02. Every accumulator PHI is fed
; by its own add, so all six values stay live across the back edge.
; Only the first accumulator (%sum.0.lcssa) is returned; the other exit PHIs
; exist solely to keep the remaining accumulators live out of the loop.
define i32 @register_limit(i32* nocapture %A, i32 %n) {
  %1 = icmp sgt i32 %n, 0
  br i1 %1, label %.lr.ph, label %._crit_edge

.lr.ph:                                           ; preds = %0, %.lr.ph
  %i.02 = phi i32 [ %5, %.lr.ph ], [ 0, %0 ]
  %sum.01 = phi i32 [ %4, %.lr.ph ], [ 0, %0 ]
  %sum.02 = phi i32 [ %6, %.lr.ph ], [ 0, %0 ]
  %sum.03 = phi i32 [ %7, %.lr.ph ], [ 0, %0 ]
  %sum.04 = phi i32 [ %8, %.lr.ph ], [ 0, %0 ]
  %sum.05 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
  %sum.06 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
  %2 = getelementptr inbounds i32, i32* %A, i32 %i.02
  %3 = load i32, i32* %2, align 4
  %4 = add nsw i32 %3, %sum.01
  %5 = add nsw i32 %i.02, 1
  %6 = add nsw i32 %3, %sum.02
  %7 = add nsw i32 %3, %sum.03
  %8 = add nsw i32 %3, %sum.04
  %9 = add nsw i32 %3, %sum.05
  ; Accumulate into %sum.06 (not %sum.05 again) so the sixth PHI is a real
  ; recurrence instead of a value that is never read.
  %10 = add nsw i32 %3, %sum.06
  %exitcond = icmp eq i32 %5, %n
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph, %0
  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %4, %.lr.ph ]
  %sum.1.lcssa = phi i32 [ 0, %0 ], [ %6, %.lr.ph ]
  %sum.2.lcssa = phi i32 [ 0, %0 ], [ %7, %.lr.ph ]
  %sum.4.lcssa = phi i32 [ 0, %0 ], [ %8, %.lr.ph ]
  %sum.5.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
  %sum.6.lcssa = phi i32 [ 0, %0 ], [ %10, %.lr.ph ]
  ret i32 %sum.0.lcssa
}
|
@ -1,88 +0,0 @@
|
||||
; RUN: opt -loop-vectorize -mtriple=thumbv7s-apple-ios6.0.0 -S -enable-interleaved-mem-accesses=false < %s | FileCheck %s
|
||||
|
||||
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
|
||||
|
||||
@kernel = global [512 x float] zeroinitializer, align 4
|
||||
@kernel2 = global [512 x float] zeroinitializer, align 4
|
||||
@kernel3 = global [512 x float] zeroinitializer, align 4
|
||||
@kernel4 = global [512 x float] zeroinitializer, align 4
|
||||
@src_data = global [1536 x float] zeroinitializer, align 4
|
||||
@r_ = global i8 0, align 4
|
||||
@g_ = global i8 0, align 4
|
||||
@b_ = global i8 0, align 4
|
||||
|
||||
; We don't want to vectorize most loops containing gathers because they are
|
||||
; expensive. This function represents a point where vectorization starts to
|
||||
; become beneficial.
|
||||
; Make sure we are conservative and don't vectorize it.
|
||||
; CHECK-NOT: <2 x float>
|
||||
; CHECK-NOT: <4 x float>
|
||||
|
||||
define void @_Z4testmm(i32 %size, i32 %offset) {
|
||||
entry:
|
||||
%cmp53 = icmp eq i32 %size, 0
|
||||
br i1 %cmp53, label %for.end, label %for.body.lr.ph
|
||||
|
||||
for.body.lr.ph:
|
||||
br label %for.body
|
||||
|
||||
for.body:
|
||||
%r.057 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add10, %for.body ]
|
||||
%g.056 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add20, %for.body ]
|
||||
%v.055 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
|
||||
%b.054 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add30, %for.body ]
|
||||
%add = add i32 %v.055, %offset
|
||||
%mul = mul i32 %add, 3
|
||||
%arrayidx = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 %mul
|
||||
%0 = load float, float* %arrayidx, align 4
|
||||
%arrayidx2 = getelementptr inbounds [512 x float], [512 x float]* @kernel, i32 0, i32 %v.055
|
||||
%1 = load float, float* %arrayidx2, align 4
|
||||
%mul3 = fmul fast float %0, %1
|
||||
%arrayidx4 = getelementptr inbounds [512 x float], [512 x float]* @kernel2, i32 0, i32 %v.055
|
||||
%2 = load float, float* %arrayidx4, align 4
|
||||
%mul5 = fmul fast float %mul3, %2
|
||||
%arrayidx6 = getelementptr inbounds [512 x float], [512 x float]* @kernel3, i32 0, i32 %v.055
|
||||
%3 = load float, float* %arrayidx6, align 4
|
||||
%mul7 = fmul fast float %mul5, %3
|
||||
%arrayidx8 = getelementptr inbounds [512 x float], [512 x float]* @kernel4, i32 0, i32 %v.055
|
||||
%4 = load float, float* %arrayidx8, align 4
|
||||
%mul9 = fmul fast float %mul7, %4
|
||||
%add10 = fadd fast float %r.057, %mul9
|
||||
%arrayidx.sum = add i32 %mul, 1
|
||||
%arrayidx11 = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 %arrayidx.sum
|
||||
%5 = load float, float* %arrayidx11, align 4
|
||||
%mul13 = fmul fast float %1, %5
|
||||
%mul15 = fmul fast float %2, %mul13
|
||||
%mul17 = fmul fast float %3, %mul15
|
||||
%mul19 = fmul fast float %4, %mul17
|
||||
%add20 = fadd fast float %g.056, %mul19
|
||||
%arrayidx.sum52 = add i32 %mul, 2
|
||||
%arrayidx21 = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 %arrayidx.sum52
|
||||
%6 = load float, float* %arrayidx21, align 4
|
||||
%mul23 = fmul fast float %1, %6
|
||||
%mul25 = fmul fast float %2, %mul23
|
||||
%mul27 = fmul fast float %3, %mul25
|
||||
%mul29 = fmul fast float %4, %mul27
|
||||
%add30 = fadd fast float %b.054, %mul29
|
||||
%inc = add i32 %v.055, 1
|
||||
%exitcond = icmp ne i32 %inc, %size
|
||||
br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
|
||||
|
||||
for.cond.for.end_crit_edge:
|
||||
%add30.lcssa = phi float [ %add30, %for.body ]
|
||||
%add20.lcssa = phi float [ %add20, %for.body ]
|
||||
%add10.lcssa = phi float [ %add10, %for.body ]
|
||||
%phitmp = fptoui float %add10.lcssa to i8
|
||||
%phitmp60 = fptoui float %add20.lcssa to i8
|
||||
%phitmp61 = fptoui float %add30.lcssa to i8
|
||||
br label %for.end
|
||||
|
||||
for.end:
|
||||
%r.0.lcssa = phi i8 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
|
||||
%g.0.lcssa = phi i8 [ %phitmp60, %for.cond.for.end_crit_edge ], [ 0, %entry ]
|
||||
%b.0.lcssa = phi i8 [ %phitmp61, %for.cond.for.end_crit_edge ], [ 0, %entry ]
|
||||
store i8 %r.0.lcssa, i8* @r_, align 4
|
||||
store i8 %g.0.lcssa, i8* @g_, align 4
|
||||
store i8 %b.0.lcssa, i8* @b_, align 4
|
||||
ret void
|
||||
}
|
@ -1,60 +0,0 @@
|
||||
; RUN: opt < %s -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift -S -dce | FileCheck %s
|
||||
|
||||
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
|
||||
target triple = "thumbv7-apple-ios3.0.0"
|
||||
|
||||
@b = common global [2048 x i32] zeroinitializer, align 16
|
||||
@c = common global [2048 x i32] zeroinitializer, align 16
|
||||
@a = common global [2048 x i32] zeroinitializer, align 16
|
||||
|
||||
; Expect VF = 4: the FileCheck patterns below match <4 x i32> operations.
|
||||
;CHECK-LABEL: @example1(
|
||||
;CHECK: load <4 x i32>
|
||||
;CHECK: add nsw <4 x i32>
|
||||
;CHECK: store <4 x i32>
|
||||
;CHECK: ret void
|
||||
define void @example1() nounwind uwtable ssp {
|
||||
br label %1
|
||||
|
||||
; <label>:1 ; preds = %1, %0
|
||||
%indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
|
||||
%2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv
|
||||
%3 = load i32, i32* %2, align 4
|
||||
%4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv
|
||||
%5 = load i32, i32* %4, align 4
|
||||
%6 = add nsw i32 %5, %3
|
||||
%7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
|
||||
store i32 %6, i32* %7, align 4
|
||||
%indvars.iv.next = add i64 %indvars.iv, 1
|
||||
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
|
||||
%exitcond = icmp eq i32 %lftr.wideiv, 256
|
||||
br i1 %exitcond, label %8, label %1
|
||||
|
||||
; <label>:8 ; preds = %1
|
||||
ret void
|
||||
}
|
||||
|
||||
;CHECK-LABEL: @example10b(
|
||||
;CHECK: load <4 x i16>
|
||||
;CHECK: sext <4 x i16>
|
||||
;CHECK: store <4 x i32>
|
||||
;CHECK: ret void
|
||||
define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp {
|
||||
br label %1
|
||||
|
||||
; <label>:1 ; preds = %1, %0
|
||||
%indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
|
||||
%2 = getelementptr inbounds i16, i16* %sb, i64 %indvars.iv
|
||||
%3 = load i16, i16* %2, align 2
|
||||
%4 = sext i16 %3 to i32
|
||||
%5 = getelementptr inbounds i32, i32* %ia, i64 %indvars.iv
|
||||
store i32 %4, i32* %5, align 4
|
||||
%indvars.iv.next = add i64 %indvars.iv, 1
|
||||
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
|
||||
%exitcond = icmp eq i32 %lftr.wideiv, 1024
|
||||
br i1 %exitcond, label %6, label %1
|
||||
|
||||
; <label>:6 ; preds = %1
|
||||
ret void
|
||||
}
|
||||
|
@ -1,147 +0,0 @@
|
||||
; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_2
|
||||
; RUN: opt -loop-vectorize -force-vector-width=4 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_4
|
||||
; RUN: opt -loop-vectorize -force-vector-width=8 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8
|
||||
; RUN: opt -loop-vectorize -force-vector-width=16 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16
|
||||
; REQUIRES: asserts
|
||||
|
||||
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
|
||||
target triple = "armv8--linux-gnueabihf"
|
||||
|
||||
%i8.2 = type {i8, i8}
|
||||
define void @i8_factor_2(%i8.2* %data, i64 %n) {
|
||||
entry:
|
||||
br label %for.body
|
||||
|
||||
; VF_8-LABEL: Checking a loop in "i8_factor_2"
|
||||
; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
|
||||
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
|
||||
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp0, align 1
|
||||
; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i8 0, i8* %tmp1, align 1
|
||||
; VF_16-LABEL: Checking a loop in "i8_factor_2"
|
||||
; VF_16: Found an estimated cost of 2 for VF 16 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
|
||||
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
|
||||
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1
|
||||
; VF_16-NEXT: Found an estimated cost of 2 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1
|
||||
for.body:
|
||||
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
|
||||
%tmp0 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 0
|
||||
%tmp1 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 1
|
||||
%tmp2 = load i8, i8* %tmp0, align 1
|
||||
%tmp3 = load i8, i8* %tmp1, align 1
|
||||
store i8 0, i8* %tmp0, align 1
|
||||
store i8 0, i8* %tmp1, align 1
|
||||
%i.next = add nuw nsw i64 %i, 1
|
||||
%cond = icmp slt i64 %i.next, %n
|
||||
br i1 %cond, label %for.body, label %for.end
|
||||
|
||||
for.end:
|
||||
ret void
|
||||
}
|
||||
|
||||
%i16.2 = type {i16, i16}
|
||||
define void @i16_factor_2(%i16.2* %data, i64 %n) {
|
||||
entry:
|
||||
br label %for.body
|
||||
|
||||
; VF_4-LABEL: Checking a loop in "i16_factor_2"
|
||||
; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
|
||||
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
|
||||
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2
|
||||
; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2
|
||||
; VF_8-LABEL: Checking a loop in "i16_factor_2"
|
||||
; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
|
||||
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
|
||||
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2
|
||||
; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2
|
||||
; VF_16-LABEL: Checking a loop in "i16_factor_2"
|
||||
; VF_16: Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
|
||||
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
|
||||
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2
|
||||
; VF_16-NEXT: Found an estimated cost of 4 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2
|
||||
for.body:
|
||||
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
|
||||
%tmp0 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 0
|
||||
%tmp1 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 1
|
||||
%tmp2 = load i16, i16* %tmp0, align 2
|
||||
%tmp3 = load i16, i16* %tmp1, align 2
|
||||
store i16 0, i16* %tmp0, align 2
|
||||
store i16 0, i16* %tmp1, align 2
|
||||
%i.next = add nuw nsw i64 %i, 1
|
||||
%cond = icmp slt i64 %i.next, %n
|
||||
br i1 %cond, label %for.body, label %for.end
|
||||
|
||||
for.end:
|
||||
ret void
|
||||
}
|
||||
|
||||
%i32.2 = type {i32, i32}
|
||||
define void @i32_factor_2(%i32.2* %data, i64 %n) {
|
||||
entry:
|
||||
br label %for.body
|
||||
|
||||
; VF_2-LABEL: Checking a loop in "i32_factor_2"
|
||||
; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
|
||||
; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
|
||||
; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4
|
||||
; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4
|
||||
; VF_4-LABEL: Checking a loop in "i32_factor_2"
|
||||
; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
|
||||
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
|
||||
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4
|
||||
; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4
|
||||
; VF_8-LABEL: Checking a loop in "i32_factor_2"
|
||||
; VF_8: Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
|
||||
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
|
||||
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4
|
||||
; VF_8-NEXT: Found an estimated cost of 4 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4
|
||||
; VF_16-LABEL: Checking a loop in "i32_factor_2"
|
||||
; VF_16: Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
|
||||
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
|
||||
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4
|
||||
; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4
|
||||
for.body:
|
||||
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
|
||||
%tmp0 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 0
|
||||
%tmp1 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 1
|
||||
%tmp2 = load i32, i32* %tmp0, align 4
|
||||
%tmp3 = load i32, i32* %tmp1, align 4
|
||||
store i32 0, i32* %tmp0, align 4
|
||||
store i32 0, i32* %tmp1, align 4
|
||||
%i.next = add nuw nsw i64 %i, 1
|
||||
%cond = icmp slt i64 %i.next, %n
|
||||
br i1 %cond, label %for.body, label %for.end
|
||||
|
||||
for.end:
|
||||
ret void
|
||||
}
|
||||
|
||||
%half.2 = type {half, half}
|
||||
define void @half_factor_2(%half.2* %data, i64 %n) {
|
||||
entry:
|
||||
br label %for.body
|
||||
|
||||
; VF_4-LABEL: Checking a loop in "half_factor_2"
|
||||
; VF_4: Found an estimated cost of 40 for VF 4 For instruction: %tmp2 = load half, half* %tmp0, align 2
|
||||
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load half, half* %tmp1, align 2
|
||||
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp0, align 2
|
||||
; VF_4-NEXT: Found an estimated cost of 32 for VF 4 For instruction: store half 0xH0000, half* %tmp1, align 2
|
||||
; VF_8-LABEL: Checking a loop in "half_factor_2"
|
||||
; VF_8: Found an estimated cost of 80 for VF 8 For instruction: %tmp2 = load half, half* %tmp0, align 2
|
||||
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load half, half* %tmp1, align 2
|
||||
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp0, align 2
|
||||
; VF_8-NEXT: Found an estimated cost of 64 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2
|
||||
for.body:
|
||||
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
|
||||
%tmp0 = getelementptr inbounds %half.2, %half.2* %data, i64 %i, i32 0
|
||||
%tmp1 = getelementptr inbounds %half.2, %half.2* %data, i64 %i, i32 1
|
||||
%tmp2 = load half, half* %tmp0, align 2
|
||||
%tmp3 = load half, half* %tmp1, align 2
|
||||
store half 0., half* %tmp0, align 2
|
||||
store half 0., half* %tmp1, align 2
|
||||
%i.next = add nuw nsw i64 %i, 1
|
||||
%cond = icmp slt i64 %i.next, %n
|
||||
br i1 %cond, label %for.body, label %for.end
|
||||
|
||||
for.end:
|
||||
ret void
|
||||
}
|
@ -1,3 +0,0 @@
|
||||
# Mark the tests governed by this config as unsupported when 'ARM' is not
# among the targets listed in config.root.targets.
if 'ARM' not in config.root.targets:
    config.unsupported = True
|
||||
|
@ -1,114 +0,0 @@
|
||||
; RUN: opt < %s -cost-model -analyze -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 | FileCheck --check-prefix=COST %s
|
||||
; To see the assembly output: llc -mcpu=cortex-a9 < %s | FileCheck --check-prefix=ASM %s
|
||||
; The ASM lines below are for reference only; tests of the generated assembly belong in test/CodeGen/ARM.
|
||||
|
||||
; ModuleID = 'arm.ll'
|
||||
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
|
||||
target triple = "armv7--linux-gnueabihf"
|
||||
|
||||
; Vector type aliases used by the function signatures and bodies in this file.
; The name encodes the shape as %T<lane count><element bit width>; for example
; %T432 is <4 x i32> and %T264 is <2 x i64>.
%T216 = type <2 x i16>
|
||||
%T232 = type <2 x i32>
|
||||
%T264 = type <2 x i64>
|
||||
|
||||
%T416 = type <4 x i16>
|
||||
%T432 = type <4 x i32>
|
||||
%T464 = type <4 x i64>
|
||||
|
||||
; Plain full-width case: load two <4 x i32> vectors, multiply them and store
; the product. The multiply is expected to be costed at 2.
define void @direct(%T432* %loadaddr, %T432* %loadaddr2, %T432* %storeaddr) {
; COST: function 'direct':
  %lhs = load %T432, %T432* %loadaddr
; ASM: vld1.64
  %rhs = load %T432, %T432* %loadaddr2
; ASM: vld1.64
  %prod = mul %T432 %lhs, %rhs
; ASM: vmul.i32
; COST: cost of 2 for instruction: {{.*}} mul <4 x i32>
  store %T432 %prod, %T432* %storeaddr
; ASM: vst1.64
  ret void
}
|
||||
|
||||
; Signed widening multiply: both <4 x i16> inputs are sign-extended to
; <4 x i32> before the multiply. The extension is expected to be costed at 0
; and the multiply at 2.
define void @ups1632(%T416* %loadaddr, %T416* %loadaddr2, %T432* %storeaddr) {
; COST: function 'ups1632':
  %lhs16 = load %T416, %T416* %loadaddr
; ASM: vldr
  %rhs16 = load %T416, %T416* %loadaddr2
; ASM: vldr
  %lhs32 = sext %T416 %lhs16 to %T432
  %rhs32 = sext %T416 %rhs16 to %T432
; COST: cost of 0 for instruction: {{.*}} sext <4 x i16> {{.*}} to <4 x i32>
  %prod = mul %T432 %lhs32, %rhs32
; COST: cost of 2 for instruction: {{.*}} mul <4 x i32>
; ASM: vmull.s16
  store %T432 %prod, %T432* %storeaddr
; ASM: vst1.64
  ret void
}
|
||||
|
||||
; Unsigned widening multiply: both <4 x i16> inputs are zero-extended to
; <4 x i32> before the multiply. The extension is expected to be costed at 0
; and the multiply at 2.
define void @upu1632(%T416* %loadaddr, %T416* %loadaddr2, %T432* %storeaddr) {
; COST: function 'upu1632':
  %lhs16 = load %T416, %T416* %loadaddr
; ASM: vldr
  %rhs16 = load %T416, %T416* %loadaddr2
; ASM: vldr
  %lhs32 = zext %T416 %lhs16 to %T432
  %rhs32 = zext %T416 %rhs16 to %T432
; COST: cost of 0 for instruction: {{.*}} zext <4 x i16> {{.*}} to <4 x i32>
  %prod = mul %T432 %lhs32, %rhs32
; COST: cost of 2 for instruction: {{.*}} mul <4 x i32>
; ASM: vmull.u16
  store %T432 %prod, %T432* %storeaddr
; ASM: vst1.64
  ret void
}
|
||||
|
||||
; Multiply at <2 x i32>, then sign-extend the product to <2 x i64> for the
; store. Both the multiply and the extension are expected to be costed at 1.
define void @ups3264(%T232* %loadaddr, %T232* %loadaddr2, %T264* %storeaddr) {
; COST: function 'ups3264':
  %lhs = load %T232, %T232* %loadaddr
; ASM: vldr
  %rhs = load %T232, %T232* %loadaddr2
; ASM: vldr
  %prod = mul %T232 %lhs, %rhs
; COST: cost of 1 for instruction: {{.*}} mul <2 x i32>
; ASM: vmul.i32
  %wide = sext %T232 %prod to %T264
; COST: cost of 1 for instruction: {{.*}} sext <2 x i32> {{.*}} to <2 x i64>
; ASM: vmovl.s32
  store %T264 %wide, %T264* %storeaddr
; ASM: vst1.64
  ret void
}
|
||||
|
||||
; Multiply at <2 x i32>, then zero-extend the product to <2 x i64> for the
; store. Both the multiply and the extension are expected to be costed at 1.
define void @upu3264(%T232* %loadaddr, %T232* %loadaddr2, %T264* %storeaddr) {
; COST: function 'upu3264':
  %lhs = load %T232, %T232* %loadaddr
; ASM: vldr
  %rhs = load %T232, %T232* %loadaddr2
; ASM: vldr
  %prod = mul %T232 %lhs, %rhs
; COST: cost of 1 for instruction: {{.*}} mul <2 x i32>
; ASM: vmul.i32
  %wide = zext %T232 %prod to %T264
; COST: cost of 1 for instruction: {{.*}} zext <2 x i32> {{.*}} to <2 x i64>
; ASM: vmovl.u32
  store %T264 %wide, %T264* %storeaddr
; ASM: vst1.64
  ret void
}
|
||||
|
||||
; Multiply at <4 x i32>, then truncate the product to <4 x i16> for the store.
; The multiply is expected to be costed at 2 and the truncation at 1.
define void @dn3216(%T432* %loadaddr, %T432* %loadaddr2, %T416* %storeaddr) {
; COST: function 'dn3216':
  %lhs = load %T432, %T432* %loadaddr
; ASM: vld1.64
  %rhs = load %T432, %T432* %loadaddr2
; ASM: vld1.64
  %prod = mul %T432 %lhs, %rhs
; COST: cost of 2 for instruction: {{.*}} mul <4 x i32>
; ASM: vmul.i32
  %narrow = trunc %T432 %prod to %T416
; COST: cost of 1 for instruction: {{.*}} trunc <4 x i32> {{.*}} to <4 x i16>
; ASM: vmovn.i32
  store %T416 %narrow, %T416* %storeaddr
; ASM: vstr
  ret void
}
|
@ -1,37 +0,0 @@
|
||||
; RUN: opt -loop-vectorize -tbaa -S -mattr=+neon < %s | FileCheck %s
|
||||
|
||||
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
|
||||
target triple = "armv7--linux-gnueabi"
|
||||
|
||||
; This requires the loop vectorizer to create an interleaved access group
|
||||
; for the stores to the struct. Here we need to perform a bitcast from a vector
|
||||
; of pointers to a vector of i32s.
|
||||
|
||||
; Two-field record: an i8* followed by an i32.
%class.A = type { i8*, i32 }

; CHECK-LABEL: test0
; Starting at %StartPtr, store null into the pointer field and 0 into the i32
; field of each %class.A, advancing one element per iteration. The loop stops
; once the advanced pointer equals %APtr; the body always runs at least once.
define void @test0(%class.A* %StartPtr, %class.A* %APtr) {
entry:
  br label %loop

loop:
  %cur = phi %class.A* [ %StartPtr, %entry ], [ %next, %loop ]
  %data.addr = getelementptr inbounds %class.A, %class.A* %cur, i32 0, i32 0
  store i8* null, i8** %data.addr, align 4, !tbaa !8
  %len.addr = getelementptr inbounds %class.A, %class.A* %cur, i32 0, i32 1
  store i32 0, i32* %len.addr, align 4, !tbaa !11
  %next = getelementptr inbounds %class.A, %class.A* %cur, i32 1
  %done = icmp eq %class.A* %next, %APtr
  br i1 %done, label %exit, label %loop

exit:
  ret void
}
|
||||
|
||||
; TBAA metadata referenced by the two stores in @test0.
; !7 is the root, !6 "omnipotent char", !5 "any pointer" and !10 "int" are
; scalar type nodes, and !9 describes "some struct" with an "any pointer"
; member at offset 0 and an "int" member at offset 4.
; !8 and !11 are the access tags: !8 is the pointer member at offset 0 and
; !11 is the int member at offset 4.
!5 = !{!"any pointer", !6, i64 0}
|
||||
!6 = !{!"omnipotent char", !7, i64 0}
|
||||
!7 = !{!"Simple C/C++ TBAA"}
|
||||
!8 = !{!9, !5, i64 0}
|
||||
!9 = !{!"some struct", !5, i64 0, !10, i64 4}
|
||||
!10 = !{!"int", !6, i64 0}
|
||||
!11 = !{!9, !10, i64 4}
|
@ -1,52 +0,0 @@
|
||||
; RUN: opt < %s -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -S | FileCheck %s
|
||||
|
||||
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
|
||||
target triple = "thumbv7-apple-ios3.0.0"
|
||||
|
||||
;CHECK:foo_F32
;CHECK: <4 x float>
;CHECK:ret
; Fast-math multiply reduction over the first %n floats of %A. The accumulator
; starts at 0.0 and the loop is skipped entirely when %n <= 0. The directives
; above expect <4 x float> operations in the output.
define float @foo_F32(float* nocapture %A, i32 %n) nounwind uwtable readonly ssp {
entry:
  %has.work = icmp sgt i32 %n, 0
  br i1 %has.work, label %loop, label %done

loop:
  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
  %acc = phi float [ %acc.next, %loop ], [ 0.000000e+00, %entry ]
  %elt.addr = getelementptr inbounds float, float* %A, i64 %iv
  %elt = load float, float* %elt.addr, align 8
  %acc.next = fmul fast float %acc, %elt
  %iv.next = add i64 %iv, 1
  %iv.next.i32 = trunc i64 %iv.next to i32
  %at.end = icmp eq i32 %iv.next.i32, %n
  br i1 %at.end, label %done, label %loop

done:
  %acc.out = phi float [ 0.000000e+00, %entry ], [ %acc.next, %loop ]
  ret float %acc.out
}
|
||||
|
||||
;CHECK:foo_I8
;CHECK: xor <16 x i8>
;CHECK:ret
; XOR reduction over the first %n bytes of %A. The accumulator starts at 0 and
; the loop is skipped entirely when %n <= 0. The directives above expect a
; <16 x i8> xor in the output.
define signext i8 @foo_I8(i8* nocapture %A, i32 %n) nounwind uwtable readonly ssp {
entry:
  %has.work = icmp sgt i32 %n, 0
  br i1 %has.work, label %loop, label %done

loop:
  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
  %acc = phi i8 [ %acc.next, %loop ], [ 0, %entry ]
  %elt.addr = getelementptr inbounds i8, i8* %A, i64 %iv
  %elt = load i8, i8* %elt.addr, align 1
  %acc.next = xor i8 %elt, %acc
  %iv.next = add i64 %iv, 1
  %iv.next.i32 = trunc i64 %iv.next to i32
  %at.end = icmp eq i32 %iv.next.i32, %n
  br i1 %at.end, label %done, label %loop

done:
  %acc.out = phi i8 [ 0, %entry ], [ %acc.next, %loop ]
  ret i8 %acc.out
}
|
||||
|
||||
|
Reference in New Issue
Block a user