Imported Upstream version 5.18.0.167
Former-commit-id: 289509151e0fee68a1b591a20c9f109c3c789d3a
This commit is contained in:
parent e19d552987
commit b084638f15
@@ -1,32 +0,0 @@
; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -scoped-noalias -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=SCOPE -check-prefix=ALL %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=NOSCOPE -check-prefix=ALL %s

target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"

; This fails to vectorize if the !alias.scope is not used

; ALL-LABEL: @vectorize_alias_scope(
; SCOPE: load float, float addrspace(1)* %c
; SCOPE: bitcast float addrspace(1)* %a to <2 x float> addrspace(1)*
; SCOPE: store <2 x float> zeroinitializer
; SCOPE: store float %ld.c, float addrspace(1)* %b,

; NOSCOPE: store float
; NOSCOPE: load float
; NOSCOPE: store float
; NOSCOPE: store float
define amdgpu_kernel void @vectorize_alias_scope(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 {
entry:
  %a.idx.1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
  store float 0.0, float addrspace(1)* %a, align 4, !noalias !0
  %ld.c = load float, float addrspace(1)* %c, align 4, !alias.scope !0
  store float 0.0, float addrspace(1)* %a.idx.1, align 4, !noalias !0
  store float %ld.c, float addrspace(1)* %b, align 4, !noalias !0
  ret void
}

attributes #0 = { nounwind }

!0 = !{!1}
!1 = distinct !{!1, !2, !"some scope"}
!2 = distinct !{!2, !"some domain"}
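; For reference, a rough sketch of the output the SCOPE prefix describes,
; reconstructed from the check lines above rather than taken from actual
; pass output (the %0 temporary and exact value names are assumptions):
;
;   %ld.c = load float, float addrspace(1)* %c, align 4, !alias.scope !0
;   %0 = bitcast float addrspace(1)* %a to <2 x float> addrspace(1)*
;   store <2 x float> zeroinitializer, <2 x float> addrspace(1)* %0, align 4, !noalias !0
;   store float %ld.c, float addrspace(1)* %b, align 4, !noalias !0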
@@ -1,132 +0,0 @@
; RUN: opt -S -load-store-vectorizer -mattr=-unaligned-buffer-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=ALIGNED -check-prefix=ALL %s
; RUN: opt -S -load-store-vectorizer -mattr=+unaligned-buffer-access,+unaligned-scratch-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=UNALIGNED -check-prefix=ALL %s

target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
target triple = "amdgcn--"

; ALL-LABEL: @load_unknown_offset_align1_i8(
; ALL: alloca [128 x i8], align 1
; UNALIGNED: load <2 x i8>, <2 x i8>* %{{[0-9]+}}, align 1{{$}}

; ALIGNED: load i8, i8* %ptr0, align 1{{$}}
; ALIGNED: load i8, i8* %ptr1, align 1{{$}}
define amdgpu_kernel void @load_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 {
  %alloca = alloca [128 x i8], align 1
  %ptr0 = getelementptr inbounds [128 x i8], [128 x i8]* %alloca, i32 0, i32 %offset
  %val0 = load i8, i8* %ptr0, align 1
  %ptr1 = getelementptr inbounds i8, i8* %ptr0, i32 1
  %val1 = load i8, i8* %ptr1, align 1
  %add = add i8 %val0, %val1
  store i8 %add, i8 addrspace(1)* %out
  ret void
}

; ALL-LABEL: @load_unknown_offset_align1_i16(
; ALL: alloca [128 x i16], align 1{{$}}
; UNALIGNED: load <2 x i16>, <2 x i16>* %{{[0-9]+}}, align 1{{$}}

; ALIGNED: load i16, i16* %ptr0, align 1{{$}}
; ALIGNED: load i16, i16* %ptr1, align 1{{$}}
define amdgpu_kernel void @load_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 {
  %alloca = alloca [128 x i16], align 1
  %ptr0 = getelementptr inbounds [128 x i16], [128 x i16]* %alloca, i32 0, i32 %offset
  %val0 = load i16, i16* %ptr0, align 1
  %ptr1 = getelementptr inbounds i16, i16* %ptr0, i32 1
  %val1 = load i16, i16* %ptr1, align 1
  %add = add i16 %val0, %val1
  store i16 %add, i16 addrspace(1)* %out
  ret void
}

; FIXME: Although the offset is unknown here, we know it is a multiple
; of the element size, so it should still be align 4

; ALL-LABEL: @load_unknown_offset_align1_i32(
; ALL: alloca [128 x i32], align 1
; UNALIGNED: load <2 x i32>, <2 x i32>* %{{[0-9]+}}, align 1{{$}}

; ALIGNED: load i32, i32* %ptr0, align 1
; ALIGNED: load i32, i32* %ptr1, align 1
define amdgpu_kernel void @load_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
  %alloca = alloca [128 x i32], align 1
  %ptr0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca, i32 0, i32 %offset
  %val0 = load i32, i32* %ptr0, align 1
  %ptr1 = getelementptr inbounds i32, i32* %ptr0, i32 1
  %val1 = load i32, i32* %ptr1, align 1
  %add = add i32 %val0, %val1
  store i32 %add, i32 addrspace(1)* %out
  ret void
}

; FIXME: Should always increase alignment of the load
; Make sure alloca alignment isn't decreased
; ALL-LABEL: @load_alloca16_unknown_offset_align1_i32(
; ALL: alloca [128 x i32], align 16

; UNALIGNED: load <2 x i32>, <2 x i32>* %{{[0-9]+}}, align 1{{$}}

; FIXME: Should change alignment
; ALIGNED: load i32
; ALIGNED: load i32
define amdgpu_kernel void @load_alloca16_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
  %alloca = alloca [128 x i32], align 16
  %ptr0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca, i32 0, i32 %offset
  %val0 = load i32, i32* %ptr0, align 1
  %ptr1 = getelementptr inbounds i32, i32* %ptr0, i32 1
  %val1 = load i32, i32* %ptr1, align 1
  %add = add i32 %val0, %val1
  store i32 %add, i32 addrspace(1)* %out
  ret void
}

; ALL-LABEL: @store_unknown_offset_align1_i8(
; ALL: alloca [128 x i8], align 1
; UNALIGNED: store <2 x i8> <i8 9, i8 10>, <2 x i8>* %{{[0-9]+}}, align 1{{$}}

; ALIGNED: store i8 9, i8* %ptr0, align 1{{$}}
; ALIGNED: store i8 10, i8* %ptr1, align 1{{$}}
define amdgpu_kernel void @store_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 {
  %alloca = alloca [128 x i8], align 1
  %ptr0 = getelementptr inbounds [128 x i8], [128 x i8]* %alloca, i32 0, i32 %offset
  store i8 9, i8* %ptr0, align 1
  %ptr1 = getelementptr inbounds i8, i8* %ptr0, i32 1
  store i8 10, i8* %ptr1, align 1
  ret void
}

; ALL-LABEL: @store_unknown_offset_align1_i16(
; ALL: alloca [128 x i16], align 1
; UNALIGNED: store <2 x i16> <i16 9, i16 10>, <2 x i16>* %{{[0-9]+}}, align 1{{$}}

; ALIGNED: store i16 9, i16* %ptr0, align 1{{$}}
; ALIGNED: store i16 10, i16* %ptr1, align 1{{$}}
define amdgpu_kernel void @store_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 {
  %alloca = alloca [128 x i16], align 1
  %ptr0 = getelementptr inbounds [128 x i16], [128 x i16]* %alloca, i32 0, i32 %offset
  store i16 9, i16* %ptr0, align 1
  %ptr1 = getelementptr inbounds i16, i16* %ptr0, i32 1
  store i16 10, i16* %ptr1, align 1
  ret void
}

; FIXME: Although the offset is unknown here, we know it is a multiple
; of the element size, so it should still be align 4.

; ALL-LABEL: @store_unknown_offset_align1_i32(
; ALL: alloca [128 x i32], align 1

; UNALIGNED: store <2 x i32> <i32 9, i32 10>, <2 x i32>* %{{[0-9]+}}, align 1{{$}}

; ALIGNED: store i32 9, i32* %ptr0, align 1
; ALIGNED: store i32 10, i32* %ptr1, align 1
define amdgpu_kernel void @store_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
  %alloca = alloca [128 x i32], align 1
  %ptr0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca, i32 0, i32 %offset
  store i32 9, i32* %ptr0, align 1
  %ptr1 = getelementptr inbounds i32, i32* %ptr0, i32 1
  store i32 10, i32* %ptr1, align 1
  ret void
}

attributes #0 = { nounwind }
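; A sketch of the UNALIGNED transform on @load_unknown_offset_align1_i8,
; reconstructed from the check lines above (the bitcast temporary is an
; assumption, not taken from real pass output):
;
;   %1 = bitcast i8* %ptr0 to <2 x i8>*
;   %2 = load <2 x i8>, <2 x i8>* %1, align 1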
@@ -1,150 +0,0 @@
; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s

target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"

declare i32 @llvm.amdgcn.workitem.id.x() #1

; CHECK-LABEL: @basic_merge_sext_index(
; CHECK: sext i32 %id.x to i64
; CHECK: load <2 x float>
; CHECK: store <2 x float> zeroinitializer
define amdgpu_kernel void @basic_merge_sext_index(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 {
entry:
  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
  %sext.id.x = sext i32 %id.x to i64
  %a.idx.x = getelementptr inbounds float, float addrspace(1)* %a, i64 %sext.id.x
  %c.idx.x = getelementptr inbounds float, float addrspace(1)* %c, i64 %sext.id.x
  %a.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %a.idx.x, i64 1
  %c.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %c.idx.x, i64 1

  %ld.c = load float, float addrspace(1)* %c.idx.x, align 4
  %ld.c.idx.1 = load float, float addrspace(1)* %c.idx.x.1, align 4

  store float 0.0, float addrspace(1)* %a.idx.x, align 4
  store float 0.0, float addrspace(1)* %a.idx.x.1, align 4

  %add = fadd float %ld.c, %ld.c.idx.1
  store float %add, float addrspace(1)* %b, align 4
  ret void
}

; CHECK-LABEL: @basic_merge_zext_index(
; CHECK: zext i32 %id.x to i64
; CHECK: load <2 x float>
; CHECK: store <2 x float>
define amdgpu_kernel void @basic_merge_zext_index(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 {
entry:
  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
  %zext.id.x = zext i32 %id.x to i64
  %a.idx.x = getelementptr inbounds float, float addrspace(1)* %a, i64 %zext.id.x
  %c.idx.x = getelementptr inbounds float, float addrspace(1)* %c, i64 %zext.id.x
  %a.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %a.idx.x, i64 1
  %c.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %c.idx.x, i64 1

  %ld.c = load float, float addrspace(1)* %c.idx.x, align 4
  %ld.c.idx.1 = load float, float addrspace(1)* %c.idx.x.1, align 4
  store float 0.0, float addrspace(1)* %a.idx.x, align 4
  store float 0.0, float addrspace(1)* %a.idx.x.1, align 4

  %add = fadd float %ld.c, %ld.c.idx.1
  store float %add, float addrspace(1)* %b, align 4
  ret void
}

; CHECK-LABEL: @merge_op_zext_index(
; CHECK: load <2 x float>
; CHECK: store <2 x float>
define amdgpu_kernel void @merge_op_zext_index(float addrspace(1)* nocapture noalias %a, float addrspace(1)* nocapture noalias %b, float addrspace(1)* nocapture readonly noalias %c) #0 {
entry:
  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
  %shl = shl i32 %id.x, 2
  %zext.id.x = zext i32 %shl to i64
  %a.0 = getelementptr inbounds float, float addrspace(1)* %a, i64 %zext.id.x
  %c.0 = getelementptr inbounds float, float addrspace(1)* %c, i64 %zext.id.x

  %id.x.1 = or i32 %shl, 1
  %id.x.1.ext = zext i32 %id.x.1 to i64

  %a.1 = getelementptr inbounds float, float addrspace(1)* %a, i64 %id.x.1.ext
  %c.1 = getelementptr inbounds float, float addrspace(1)* %c, i64 %id.x.1.ext

  %ld.c.0 = load float, float addrspace(1)* %c.0, align 4
  store float 0.0, float addrspace(1)* %a.0, align 4
  %ld.c.1 = load float, float addrspace(1)* %c.1, align 4
  store float 0.0, float addrspace(1)* %a.1, align 4

  %add = fadd float %ld.c.0, %ld.c.1
  store float %add, float addrspace(1)* %b, align 4
  ret void
}

; CHECK-LABEL: @merge_op_sext_index(
; CHECK: load <2 x float>
; CHECK: store <2 x float>
define amdgpu_kernel void @merge_op_sext_index(float addrspace(1)* nocapture noalias %a, float addrspace(1)* nocapture noalias %b, float addrspace(1)* nocapture readonly noalias %c) #0 {
entry:
  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
  %shl = shl i32 %id.x, 2
  %zext.id.x = sext i32 %shl to i64
  %a.0 = getelementptr inbounds float, float addrspace(1)* %a, i64 %zext.id.x
  %c.0 = getelementptr inbounds float, float addrspace(1)* %c, i64 %zext.id.x

  %id.x.1 = or i32 %shl, 1
  %id.x.1.ext = sext i32 %id.x.1 to i64

  %a.1 = getelementptr inbounds float, float addrspace(1)* %a, i64 %id.x.1.ext
  %c.1 = getelementptr inbounds float, float addrspace(1)* %c, i64 %id.x.1.ext

  %ld.c.0 = load float, float addrspace(1)* %c.0, align 4
  store float 0.0, float addrspace(1)* %a.0, align 4
  %ld.c.1 = load float, float addrspace(1)* %c.1, align 4
  store float 0.0, float addrspace(1)* %a.1, align 4

  %add = fadd float %ld.c.0, %ld.c.1
  store float %add, float addrspace(1)* %b, align 4
  ret void
}

; This case fails to vectorize if not using the extra extension
; handling in isConsecutiveAccess.

; CHECK-LABEL: @zext_trunc_phi_1(
; CHECK: loop:
; CHECK: load <2 x i32>
; CHECK: store <2 x i32>
define amdgpu_kernel void @zext_trunc_phi_1(i32 addrspace(1)* nocapture noalias %a, i32 addrspace(1)* nocapture noalias %b, i32 addrspace(1)* nocapture readonly noalias %c, i32 %n, i64 %arst, i64 %aoeu) #0 {
entry:
  %cmp0 = icmp eq i32 %n, 0
  br i1 %cmp0, label %exit, label %loop

loop:
  %indvars.iv = phi i64 [ %indvars.iv.next, %loop ], [ 0, %entry ]
  %trunc.iv = trunc i64 %indvars.iv to i32
  %idx = shl i32 %trunc.iv, 4

  %idx.ext = zext i32 %idx to i64
  %c.0 = getelementptr inbounds i32, i32 addrspace(1)* %c, i64 %idx.ext
  %a.0 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idx.ext

  %idx.1 = or i32 %idx, 1
  %idx.1.ext = zext i32 %idx.1 to i64
  %c.1 = getelementptr inbounds i32, i32 addrspace(1)* %c, i64 %idx.1.ext
  %a.1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idx.1.ext

  %ld.c.0 = load i32, i32 addrspace(1)* %c.0, align 4
  store i32 %ld.c.0, i32 addrspace(1)* %a.0, align 4
  %ld.c.1 = load i32, i32 addrspace(1)* %c.1, align 4
  store i32 %ld.c.1, i32 addrspace(1)* %a.1, align 4

  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32

  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %exit, label %loop

exit:
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
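; A worked note on the or-based indexing above (commentary, not part of the
; original tests): in @merge_op_zext_index, %shl = shl i32 %id.x, 2 always
; has its low two bits clear, so "or i32 %shl, 1" computes exactly %shl + 1
; with no possible carry. Both indices therefore remain adjacent after the
; zext/sext to i64, which is the property the extra extension handling in
; isConsecutiveAccess has to recognize.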
@@ -1,83 +0,0 @@
; RUN: opt -S -mtriple=amdgcn--amdhsa -load-store-vectorizer < %s | FileCheck %s

; Check that the vectorizer can find a GEP through a bitcast
; CHECK-LABEL: @vect_zext_bitcast_f32_to_i32_idx
; CHECK: load <4 x i32>
define void @vect_zext_bitcast_f32_to_i32_idx(float addrspace(1)* %arg1, i32 %base) {
  %add1 = add nuw i32 %base, 0
  %zext1 = zext i32 %add1 to i64
  %gep1 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 %zext1
  %f2i1 = bitcast float addrspace(1)* %gep1 to i32 addrspace(1)*
  %load1 = load i32, i32 addrspace(1)* %f2i1, align 4
  %add2 = add nuw i32 %base, 1
  %zext2 = zext i32 %add2 to i64
  %gep2 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 %zext2
  %f2i2 = bitcast float addrspace(1)* %gep2 to i32 addrspace(1)*
  %load2 = load i32, i32 addrspace(1)* %f2i2, align 4
  %add3 = add nuw i32 %base, 2
  %zext3 = zext i32 %add3 to i64
  %gep3 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 %zext3
  %f2i3 = bitcast float addrspace(1)* %gep3 to i32 addrspace(1)*
  %load3 = load i32, i32 addrspace(1)* %f2i3, align 4
  %add4 = add nuw i32 %base, 3
  %zext4 = zext i32 %add4 to i64
  %gep4 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 %zext4
  %f2i4 = bitcast float addrspace(1)* %gep4 to i32 addrspace(1)*
  %load4 = load i32, i32 addrspace(1)* %f2i4, align 4
  ret void
}

; CHECK-LABEL: @vect_zext_bitcast_i8_st1_to_i32_idx
; CHECK: load i32
; CHECK: load i32
; CHECK: load i32
; CHECK: load i32
define void @vect_zext_bitcast_i8_st1_to_i32_idx(i8 addrspace(1)* %arg1, i32 %base) {
  %add1 = add nuw i32 %base, 0
  %zext1 = zext i32 %add1 to i64
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext1
  %f2i1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
  %load1 = load i32, i32 addrspace(1)* %f2i1, align 4
  %add2 = add nuw i32 %base, 1
  %zext2 = zext i32 %add2 to i64
  %gep2 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext2
  %f2i2 = bitcast i8 addrspace(1)* %gep2 to i32 addrspace(1)*
  %load2 = load i32, i32 addrspace(1)* %f2i2, align 4
  %add3 = add nuw i32 %base, 2
  %zext3 = zext i32 %add3 to i64
  %gep3 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext3
  %f2i3 = bitcast i8 addrspace(1)* %gep3 to i32 addrspace(1)*
  %load3 = load i32, i32 addrspace(1)* %f2i3, align 4
  %add4 = add nuw i32 %base, 3
  %zext4 = zext i32 %add4 to i64
  %gep4 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext4
  %f2i4 = bitcast i8 addrspace(1)* %gep4 to i32 addrspace(1)*
  %load4 = load i32, i32 addrspace(1)* %f2i4, align 4
  ret void
}

; TODO: This can be vectorized, but currently the vectorizer is unable to do it.
; CHECK-LABEL: @vect_zext_bitcast_i8_st4_to_i32_idx
define void @vect_zext_bitcast_i8_st4_to_i32_idx(i8 addrspace(1)* %arg1, i32 %base) {
  %add1 = add nuw i32 %base, 0
  %zext1 = zext i32 %add1 to i64
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext1
  %f2i1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
  %load1 = load i32, i32 addrspace(1)* %f2i1, align 4
  %add2 = add nuw i32 %base, 4
  %zext2 = zext i32 %add2 to i64
  %gep2 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext2
  %f2i2 = bitcast i8 addrspace(1)* %gep2 to i32 addrspace(1)*
  %load2 = load i32, i32 addrspace(1)* %f2i2, align 4
  %add3 = add nuw i32 %base, 8
  %zext3 = zext i32 %add3 to i64
  %gep3 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext3
  %f2i3 = bitcast i8 addrspace(1)* %gep3 to i32 addrspace(1)*
  %load3 = load i32, i32 addrspace(1)* %f2i3, align 4
  %add4 = add nuw i32 %base, 16
  %zext4 = zext i32 %add4 to i64
  %gep4 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext4
  %f2i4 = bitcast i8 addrspace(1)* %gep4 to i32 addrspace(1)*
  %load4 = load i32, i32 addrspace(1)* %f2i4, align 4
  ret void
}
@@ -1,117 +0,0 @@
; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s

target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"

; Check position of the inserted vector load/store. Vectorized loads should be
; inserted at the position of the first load in the chain, and stores should be
; inserted at the position of the last store.

; CHECK-LABEL: @insert_load_point(
; CHECK: %z = add i32 %x, 4
; CHECK: load <2 x float>
; CHECK: %w = add i32 %y, 9
; CHECK: %foo = add i32 %z, %w
define amdgpu_kernel void @insert_load_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 {
entry:
  %a.idx.x = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx
  %c.idx.x = getelementptr inbounds float, float addrspace(1)* %c, i64 %idx
  %a.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %a.idx.x, i64 1
  %c.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %c.idx.x, i64 1

  %z = add i32 %x, 4
  %ld.c = load float, float addrspace(1)* %c.idx.x, align 4
  %w = add i32 %y, 9
  %ld.c.idx.1 = load float, float addrspace(1)* %c.idx.x.1, align 4
  %foo = add i32 %z, %w

  store float 0.0, float addrspace(1)* %a.idx.x, align 4
  store float 0.0, float addrspace(1)* %a.idx.x.1, align 4

  %add = fadd float %ld.c, %ld.c.idx.1
  store float %add, float addrspace(1)* %b, align 4
  store i32 %foo, i32 addrspace(3)* null, align 4
  ret void
}

; CHECK-LABEL: @insert_store_point(
; CHECK: %z = add i32 %x, 4
; CHECK: %w = add i32 %y, 9
; CHECK: store <2 x float>
; CHECK: %foo = add i32 %z, %w
define amdgpu_kernel void @insert_store_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 {
entry:
  %a.idx.x = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx
  %c.idx.x = getelementptr inbounds float, float addrspace(1)* %c, i64 %idx
  %a.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %a.idx.x, i64 1
  %c.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %c.idx.x, i64 1

  %ld.c = load float, float addrspace(1)* %c.idx.x, align 4
  %ld.c.idx.1 = load float, float addrspace(1)* %c.idx.x.1, align 4

  %z = add i32 %x, 4
  store float 0.0, float addrspace(1)* %a.idx.x, align 4
  %w = add i32 %y, 9
  store float 0.0, float addrspace(1)* %a.idx.x.1, align 4
  %foo = add i32 %z, %w

  %add = fadd float %ld.c, %ld.c.idx.1
  store float %add, float addrspace(1)* %b, align 4
  store i32 %foo, i32 addrspace(3)* null, align 4
  ret void
}

; Here we have four stores, with an aliasing load before the last one. We can
; vectorize the first three stores as <3 x float>, but this vectorized store must
; be inserted at the location of the third scalar store, not the fourth one.
;
; CHECK-LABEL: @insert_store_point_alias
; CHECK: store <3 x float>
; CHECK: load float, float addrspace(1)* %a.idx.2
; CHECK: store float
; CHECK-SAME: %a.idx.3
define float @insert_store_point_alias(float addrspace(1)* nocapture %a, i64 %idx) {
  %a.idx = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx
  %a.idx.1 = getelementptr inbounds float, float addrspace(1)* %a.idx, i64 1
  %a.idx.2 = getelementptr inbounds float, float addrspace(1)* %a.idx.1, i64 1
  %a.idx.3 = getelementptr inbounds float, float addrspace(1)* %a.idx.2, i64 1

  store float 0.0, float addrspace(1)* %a.idx, align 4
  store float 0.0, float addrspace(1)* %a.idx.1, align 4
  store float 0.0, float addrspace(1)* %a.idx.2, align 4
  %x = load float, float addrspace(1)* %a.idx.2, align 4
  store float 0.0, float addrspace(1)* %a.idx.3, align 4

  ret float %x
}

; Here we have four stores, with an aliasing load before the last one. We
; could vectorize two of the stores before the load (although we currently
; don't), but the important thing is that we *don't* sink the store to
; a[idx + 1] below the load.
;
; CHECK-LABEL: @insert_store_point_alias_ooo
; CHECK: store float
; CHECK-SAME: %a.idx.3
; CHECK: store float
; CHECK-SAME: %a.idx.1
; CHECK: store float
; CHECK-SAME: %a.idx.2
; CHECK: load float, float addrspace(1)* %a.idx.2
; CHECK: store float
; CHECK-SAME: %a.idx
define float @insert_store_point_alias_ooo(float addrspace(1)* nocapture %a, i64 %idx) {
  %a.idx = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx
  %a.idx.1 = getelementptr inbounds float, float addrspace(1)* %a.idx, i64 1
  %a.idx.2 = getelementptr inbounds float, float addrspace(1)* %a.idx.1, i64 1
  %a.idx.3 = getelementptr inbounds float, float addrspace(1)* %a.idx.2, i64 1

  store float 0.0, float addrspace(1)* %a.idx.3, align 4
  store float 0.0, float addrspace(1)* %a.idx.1, align 4
  store float 0.0, float addrspace(1)* %a.idx.2, align 4
  %x = load float, float addrspace(1)* %a.idx.2, align 4
  store float 0.0, float addrspace(1)* %a.idx, align 4

  ret float %x
}

attributes #0 = { nounwind }
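; To make the insertion-point rule concrete, a sketch of @insert_load_point
; after the pass, reconstructed from the checks above (the %0/%1 temporaries
; are assumptions, not verbatim output): the <2 x float> load is emitted
; where the first scalar load stood, so the scalar arithmetic keeps its
; relative order:
;
;   %z = add i32 %x, 4
;   %0 = bitcast float addrspace(1)* %c.idx.x to <2 x float> addrspace(1)*
;   %1 = load <2 x float>, <2 x float> addrspace(1)* %0, align 4
;   %w = add i32 %y, 9
;   %foo = add i32 %z, %w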
@@ -1,28 +0,0 @@
; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s

target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"

; This is NOT OK to vectorize, as either load may alias either store.

; CHECK: load double
; CHECK: store double 0.000000e+00, double addrspace(1)* %a,
; CHECK: load double
; CHECK: store double 0.000000e+00, double addrspace(1)* %a.idx.1
define amdgpu_kernel void @interleave(double addrspace(1)* nocapture %a, double addrspace(1)* nocapture %b, double addrspace(1)* nocapture readonly %c) #0 {
entry:
  %a.idx.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1
  %c.idx.1 = getelementptr inbounds double, double addrspace(1)* %c, i64 1

  %ld.c = load double, double addrspace(1)* %c, align 8 ; may alias store to %a
  store double 0.0, double addrspace(1)* %a, align 8

  %ld.c.idx.1 = load double, double addrspace(1)* %c.idx.1, align 8 ; may alias store to %a
  store double 0.0, double addrspace(1)* %a.idx.1, align 8

  %add = fadd double %ld.c, %ld.c.idx.1
  store double %add, double addrspace(1)* %b

  ret void
}

attributes #0 = { nounwind }
@@ -1,3 +0,0 @@
if 'AMDGPU' not in config.root.targets:
    config.unsupported = True
@@ -1,231 +0,0 @@
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-4,-unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT4,ELT4-ALIGNED,ALIGNED,ALL %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8,-unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT8,ELT8-ALIGNED,ALIGNED,ALL %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16,-unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT16,ELT16-ALIGNED,ALIGNED,ALL %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-4,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT4,ELT4-UNALIGNED,UNALIGNED,ALL %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT8,ELT8-UNALIGNED,UNALIGNED,ALL %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT16,ELT16-UNALIGNED,UNALIGNED,ALL %s

target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"

; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32
; ALIGNED: store i32
; ALIGNED: store i32
; ALIGNED: store i32
; ALIGNED: store i32

; ELT8-UNALIGNED: store <2 x i32>
; ELT8-UNALIGNED: store <2 x i32>

; ELT16-UNALIGNED: store <4 x i32>
define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32(i32* %out) #0 {
  %out.gep.1 = getelementptr i32, i32* %out, i32 1
  %out.gep.2 = getelementptr i32, i32* %out, i32 2
  %out.gep.3 = getelementptr i32, i32* %out, i32 3

  store i32 9, i32* %out
  store i32 1, i32* %out.gep.1
  store i32 23, i32* %out.gep.2
  store i32 19, i32* %out.gep.3
  ret void
}

; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32_align1(
; ALIGNED: store i32 9, i32* %out, align 1
; ALIGNED: store i32 1, i32* %out.gep.1, align 1
; ALIGNED: store i32 23, i32* %out.gep.2, align 1
; ALIGNED: store i32 19, i32* %out.gep.3, align 1

; ELT16-UNALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32>* %1, align 1

; ELT8-UNALIGNED: store <2 x i32> <i32 9, i32 1>, <2 x i32>* %1, align 1
; ELT8-UNALIGNED: store <2 x i32> <i32 23, i32 19>, <2 x i32>* %2, align 1

; ELT4-UNALIGNED: store i32
; ELT4-UNALIGNED: store i32
; ELT4-UNALIGNED: store i32
; ELT4-UNALIGNED: store i32
define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32_align1(i32* %out) #0 {
  %out.gep.1 = getelementptr i32, i32* %out, i32 1
  %out.gep.2 = getelementptr i32, i32* %out, i32 2
  %out.gep.3 = getelementptr i32, i32* %out, i32 3

  store i32 9, i32* %out, align 1
  store i32 1, i32* %out.gep.1, align 1
  store i32 23, i32* %out.gep.2, align 1
  store i32 19, i32* %out.gep.3, align 1
  ret void
}

; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32_align2(
; ALIGNED: store i32 9, i32* %out, align 2
; ALIGNED: store i32 1, i32* %out.gep.1, align 2
; ALIGNED: store i32 23, i32* %out.gep.2, align 2
; ALIGNED: store i32 19, i32* %out.gep.3, align 2

; ELT16-UNALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32>* %1, align 2

; ELT8-UNALIGNED: store <2 x i32>
; ELT8-UNALIGNED: store <2 x i32>

; ELT4-UNALIGNED: store i32
; ELT4-UNALIGNED: store i32
; ELT4-UNALIGNED: store i32
; ELT4-UNALIGNED: store i32
define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32_align2(i32* %out) #0 {
  %out.gep.1 = getelementptr i32, i32* %out, i32 1
  %out.gep.2 = getelementptr i32, i32* %out, i32 2
  %out.gep.3 = getelementptr i32, i32* %out, i32 3

  store i32 9, i32* %out, align 2
  store i32 1, i32* %out.gep.1, align 2
  store i32 23, i32* %out.gep.2, align 2
  store i32 19, i32* %out.gep.3, align 2
  ret void
}

; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8(
; ALL: store <4 x i8>
define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8(i8* %out) #0 {
  %out.gep.1 = getelementptr i8, i8* %out, i32 1
  %out.gep.2 = getelementptr i8, i8* %out, i32 2
  %out.gep.3 = getelementptr i8, i8* %out, i32 3

  store i8 9, i8* %out, align 4
  store i8 1, i8* %out.gep.1
  store i8 23, i8* %out.gep.2
  store i8 19, i8* %out.gep.3
  ret void
}

; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8_align1(
; ALIGNED: store i8
; ALIGNED: store i8
; ALIGNED: store i8
; ALIGNED: store i8

; UNALIGNED: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8>* %1, align 1
define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8_align1(i8* %out) #0 {
  %out.gep.1 = getelementptr i8, i8* %out, i32 1
  %out.gep.2 = getelementptr i8, i8* %out, i32 2
  %out.gep.3 = getelementptr i8, i8* %out, i32 3

  store i8 9, i8* %out, align 1
  store i8 1, i8* %out.gep.1, align 1
  store i8 23, i8* %out.gep.2, align 1
  store i8 19, i8* %out.gep.3, align 1
  ret void
}

; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16(
; ALL: store <2 x i16>
define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16(i16* %out) #0 {
  %out.gep.1 = getelementptr i16, i16* %out, i32 1

  store i16 9, i16* %out, align 4
  store i16 12, i16* %out.gep.1
  ret void
}

; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align2(
; ALIGNED: store i16
; ALIGNED: store i16

; UNALIGNED: store <2 x i16> <i16 9, i16 12>, <2 x i16>* %1, align 2
define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align2(i16* %out) #0 {
  %out.gep.1 = getelementptr i16, i16* %out, i32 1

  store i16 9, i16* %out, align 2
  store i16 12, i16* %out.gep.1, align 2
  ret void
}

; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align1(
; ALIGNED: store i16
; ALIGNED: store i16

; UNALIGNED: store <2 x i16> <i16 9, i16 12>, <2 x i16>* %1, align 1
define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align1(i16* %out) #0 {
  %out.gep.1 = getelementptr i16, i16* %out, i32 1

  store i16 9, i16* %out, align 1
  store i16 12, i16* %out.gep.1, align 1
  ret void
}

; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align8(
; ALL: store <2 x i16> <i16 9, i16 12>, <2 x i16>* %1, align 8
define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align8(i16* %out) #0 {
  %out.gep.1 = getelementptr i16, i16* %out, i32 1

  store i16 9, i16* %out, align 8
  store i16 12, i16* %out.gep.1, align 2
  ret void
}

; ALL-LABEL: @merge_private_store_3_vector_elts_loads_v4i32
; ELT4: store i32
; ELT4: store i32
; ELT4: store i32

; ELT8-ALIGNED: store i32
; ELT8-ALIGNED: store i32
; ELT8-ALIGNED: store i32

; ELT8-UNALIGNED: store <2 x i32>
; ELT8-UNALIGNED: store i32

; ELT16-ALIGNED: store i32
; ELT16-ALIGNED: store i32
; ELT16-ALIGNED: store i32

; ELT16-UNALIGNED: store <3 x i32>
define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32(i32* %out) #0 {
  %out.gep.1 = getelementptr i32, i32* %out, i32 1
  %out.gep.2 = getelementptr i32, i32* %out, i32 2

  store i32 9, i32* %out
  store i32 1, i32* %out.gep.1
  store i32 23, i32* %out.gep.2
  ret void
}

; ALL-LABEL: @merge_private_store_3_vector_elts_loads_v4i32_align1(
; ALIGNED: store i32
; ALIGNED: store i32
; ALIGNED: store i32

; ELT4-UNALIGNED: store i32
; ELT4-UNALIGNED: store i32
; ELT4-UNALIGNED: store i32

; ELT8-UNALIGNED: store <2 x i32>
; ELT8-UNALIGNED: store i32

; ELT16-UNALIGNED: store <3 x i32>
define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32_align1(i32* %out) #0 {
  %out.gep.1 = getelementptr i32, i32* %out, i32 1
  %out.gep.2 = getelementptr i32, i32* %out, i32 2

  store i32 9, i32* %out, align 1
  store i32 1, i32* %out.gep.1, align 1
  store i32 23, i32* %out.gep.2, align 1
  ret void
}

; ALL-LABEL: @merge_private_store_3_vector_elts_loads_v4i8_align1(
; ALIGNED: store i8
; ALIGNED: store i8
; ALIGNED: store i8

; UNALIGNED: store <3 x i8>
define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i8_align1(i8* %out) #0 {
  %out.gep.1 = getelementptr i8, i8* %out, i8 1
  %out.gep.2 = getelementptr i8, i8* %out, i8 2

  store i8 9, i8* %out, align 1
  store i8 1, i8* %out.gep.1, align 1
  store i8 23, i8* %out.gep.2, align 1
  ret void
}

attributes #0 = { nounwind }
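; Summary of how +max-private-element-size-N caps the merged width for the
; four unaligned i32 stores above (restating the check lines, not new
; behavior):
;
;   max-private-element-size-16: one  store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, ..., align 1
;   max-private-element-size-8:  two  store <2 x i32> ..., align 1
;   max-private-element-size-4:  four store i32 ..., align 1
;
; With -unaligned-scratch-access the four scalar stores are kept in every case.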
File diff suppressed because it is too large
@@ -1,91 +0,0 @@
; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s

target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"

; CHECK-LABEL: @merge_v2i32_v2i32(
; CHECK: load <4 x i32>
; CHECK: store <4 x i32> zeroinitializer
define amdgpu_kernel void @merge_v2i32_v2i32(<2 x i32> addrspace(1)* nocapture %a, <2 x i32> addrspace(1)* nocapture readonly %b) #0 {
entry:
  %a.1 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %a, i64 1
  %b.1 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %b, i64 1

  %ld.c = load <2 x i32>, <2 x i32> addrspace(1)* %b, align 4
  %ld.c.idx.1 = load <2 x i32>, <2 x i32> addrspace(1)* %b.1, align 4

  store <2 x i32> zeroinitializer, <2 x i32> addrspace(1)* %a, align 4
  store <2 x i32> zeroinitializer, <2 x i32> addrspace(1)* %a.1, align 4

  ret void
}

; CHECK-LABEL: @merge_v1i32_v1i32(
; CHECK: load <2 x i32>
; CHECK: store <2 x i32> zeroinitializer
define amdgpu_kernel void @merge_v1i32_v1i32(<1 x i32> addrspace(1)* nocapture %a, <1 x i32> addrspace(1)* nocapture readonly %b) #0 {
entry:
  %a.1 = getelementptr inbounds <1 x i32>, <1 x i32> addrspace(1)* %a, i64 1
  %b.1 = getelementptr inbounds <1 x i32>, <1 x i32> addrspace(1)* %b, i64 1

  %ld.c = load <1 x i32>, <1 x i32> addrspace(1)* %b, align 4
  %ld.c.idx.1 = load <1 x i32>, <1 x i32> addrspace(1)* %b.1, align 4

  store <1 x i32> zeroinitializer, <1 x i32> addrspace(1)* %a, align 4
  store <1 x i32> zeroinitializer, <1 x i32> addrspace(1)* %a.1, align 4

  ret void
}

; CHECK-LABEL: @no_merge_v3i32_v3i32(
; CHECK: load <3 x i32>
; CHECK: load <3 x i32>
; CHECK: store <3 x i32> zeroinitializer
; CHECK: store <3 x i32> zeroinitializer
define amdgpu_kernel void @no_merge_v3i32_v3i32(<3 x i32> addrspace(1)* nocapture %a, <3 x i32> addrspace(1)* nocapture readonly %b) #0 {
entry:
  %a.1 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %a, i64 1
  %b.1 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %b, i64 1

  %ld.c = load <3 x i32>, <3 x i32> addrspace(1)* %b, align 4
  %ld.c.idx.1 = load <3 x i32>, <3 x i32> addrspace(1)* %b.1, align 4

  store <3 x i32> zeroinitializer, <3 x i32> addrspace(1)* %a, align 4
  store <3 x i32> zeroinitializer, <3 x i32> addrspace(1)* %a.1, align 4

  ret void
}

; CHECK-LABEL: @merge_v2i16_v2i16(
; CHECK: load <4 x i16>
; CHECK: store <4 x i16> zeroinitializer
define amdgpu_kernel void @merge_v2i16_v2i16(<2 x i16> addrspace(1)* nocapture %a, <2 x i16> addrspace(1)* nocapture readonly %b) #0 {
entry:
  %a.1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %a, i64 1
  %b.1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %b, i64 1

  %ld.c = load <2 x i16>, <2 x i16> addrspace(1)* %b, align 4
  %ld.c.idx.1 = load <2 x i16>, <2 x i16> addrspace(1)* %b.1, align 4

  store <2 x i16> zeroinitializer, <2 x i16> addrspace(1)* %a, align 4
  store <2 x i16> zeroinitializer, <2 x i16> addrspace(1)* %a.1, align 4

  ret void
}

; Ideally this would be merged
; CHECK-LABEL: @merge_load_i32_v2i16(
; CHECK: load i32,
; CHECK: load <2 x i16>
define amdgpu_kernel void @merge_load_i32_v2i16(i32 addrspace(1)* nocapture %a) #0 {
entry:
  %a.1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i32 1
  %a.1.cast = bitcast i32 addrspace(1)* %a.1 to <2 x i16> addrspace(1)*

  %ld.0 = load i32, i32 addrspace(1)* %a
  %ld.1 = load <2 x i16>, <2 x i16> addrspace(1)* %a.1.cast

  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
@@ -1,30 +0,0 @@
; RUN: opt -mtriple=amdgcn-- -load-store-vectorizer -S -o - %s | FileCheck %s

@lds = internal addrspace(3) global [512 x float] undef, align 4

; The original load has an implicit alignment of 4, and should not
; be increased to an align 8 load.

; CHECK-LABEL: @load_keep_base_alignment_missing_align(
; CHECK: load <2 x float>, <2 x float> addrspace(3)* %{{[0-9]+}}, align 4
define amdgpu_kernel void @load_keep_base_alignment_missing_align(float addrspace(1)* %out) {
  %ptr0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 11
  %val0 = load float, float addrspace(3)* %ptr0

  %ptr1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 12
  %val1 = load float, float addrspace(3)* %ptr1
  %add = fadd float %val0, %val1
  store float %add, float addrspace(1)* %out
  ret void
}


; CHECK-LABEL: @store_keep_base_alignment_missing_align(
; CHECK: store <2 x float> zeroinitializer, <2 x float> addrspace(3)* %{{[0-9]+}}, align 4
define amdgpu_kernel void @store_keep_base_alignment_missing_align() {
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 1
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 2
  store float 0.0, float addrspace(3)* %arrayidx0
  store float 0.0, float addrspace(3)* %arrayidx1
  ret void
}
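; A worked note on the expected alignment (commentary, not part of the
; original test): @lds itself is only align 4, and element 11 of the
; [512 x float] array sits at byte offset 44, which is 4-byte but not
; 8-byte aligned. The merged <2 x float> access therefore cannot claim the
; type's natural align 8 and must keep align 4.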
@@ -1,64 +0,0 @@
; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s

target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"

; Checks that there is no crash when there are multiple tails
; for the same head starting a chain.
@0 = internal addrspace(3) global [16384 x i32] undef

; CHECK-LABEL: @no_crash(
; CHECK: store <2 x i32> zeroinitializer
; CHECK: store i32 0
; CHECK: store i32 0

define amdgpu_kernel void @no_crash(i32 %arg) {
  %tmp2 = add i32 %arg, 14
  %tmp3 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %tmp2
  %tmp4 = add i32 %arg, 15
  %tmp5 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %tmp4

  store i32 0, i32 addrspace(3)* %tmp3, align 4
  store i32 0, i32 addrspace(3)* %tmp5, align 4
  store i32 0, i32 addrspace(3)* %tmp5, align 4
  store i32 0, i32 addrspace(3)* %tmp5, align 4

  ret void
}

; Check that adjacent memory locations are properly matched and the
; longest chain is vectorized.

; CHECK-LABEL: @interleave_get_longest
; CHECK: load <2 x i32>
; CHECK: load i32
; CHECK: store <2 x i32> zeroinitializer
; CHECK: load i32
; CHECK: load <2 x i32>
; CHECK: load i32
; CHECK: load i32

define amdgpu_kernel void @interleave_get_longest(i32 %arg) {
  %a1 = add i32 %arg, 1
  %a2 = add i32 %arg, 2
  %a3 = add i32 %arg, 3
  %a4 = add i32 %arg, 4
  %tmp1 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %arg
  %tmp2 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a1
  %tmp3 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a2
  %tmp4 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a3
  %tmp5 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a4

  %l1 = load i32, i32 addrspace(3)* %tmp2, align 4
  %l2 = load i32, i32 addrspace(3)* %tmp1, align 4
  store i32 0, i32 addrspace(3)* %tmp2, align 4
  store i32 0, i32 addrspace(3)* %tmp1, align 4
  %l3 = load i32, i32 addrspace(3)* %tmp2, align 4
  %l4 = load i32, i32 addrspace(3)* %tmp3, align 4
  %l5 = load i32, i32 addrspace(3)* %tmp4, align 4
  %l6 = load i32, i32 addrspace(3)* %tmp5, align 4
  %l7 = load i32, i32 addrspace(3)* %tmp5, align 4
  %l8 = load i32, i32 addrspace(3)* %tmp5, align 4

  ret void
}
@@ -1,20 +0,0 @@
; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s

; CHECK-LABEL: @no_implicit_float(
; CHECK: store i32
; CHECK: store i32
; CHECK: store i32
; CHECK: store i32
define amdgpu_kernel void @no_implicit_float(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3

  store i32 123, i32 addrspace(1)* %out.gep.1
  store i32 456, i32 addrspace(1)* %out.gep.2
  store i32 333, i32 addrspace(1)* %out.gep.3
  store i32 1234, i32 addrspace(1)* %out
  ret void
}

attributes #0 = { nounwind noimplicitfloat }
@@ -1,22 +0,0 @@
; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s

; CHECK-LABEL: @optnone(
; CHECK: store i32
; CHECK: store i32
define amdgpu_kernel void @optnone(i32 addrspace(1)* %out) noinline optnone {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1

  store i32 123, i32 addrspace(1)* %out.gep.1
  store i32 456, i32 addrspace(1)* %out
  ret void
}

; CHECK-LABEL: @do_opt(
; CHECK: store <2 x i32>
define amdgpu_kernel void @do_opt(i32 addrspace(1)* %out) {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1

  store i32 123, i32 addrspace(1)* %out.gep.1
  store i32 456, i32 addrspace(1)* %out
  ret void
}
@ -1,311 +0,0 @@
|
||||
; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
|
||||
|
||||
target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
|
||||
|
||||
declare i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
|
||||
; CHECK-LABEL: @merge_v2p1i8(
|
||||
; CHECK: load <2 x i64>
|
||||
; CHECK: inttoptr i64 %{{[^ ]+}} to i8 addrspace(1)*
|
||||
; CHECK: inttoptr i64 %{{[^ ]+}} to i8 addrspace(1)*
|
||||
; CHECK: store <2 x i64> zeroinitializer
|
||||
define amdgpu_kernel void @merge_v2p1i8(i8 addrspace(1)* addrspace(1)* nocapture %a, i8 addrspace(1)* addrspace(1)* nocapture readonly %b) #0 {
|
||||
entry:
|
||||
%a.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a, i64 1
|
||||
%b.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %b, i64 1
|
||||
|
||||
%ld.c = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %b, align 4
|
||||
%ld.c.idx.1 = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %b.1, align 4
|
||||
|
||||
store i8 addrspace(1)* null, i8 addrspace(1)* addrspace(1)* %a, align 4
|
||||
store i8 addrspace(1)* null, i8 addrspace(1)* addrspace(1)* %a.1, align 4
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @merge_v2p3i8(
|
||||
; CHECK: load <2 x i32>
|
||||
; CHECK: inttoptr i32 %{{[^ ]+}} to i8 addrspace(3)*
|
||||
; CHECK: inttoptr i32 %{{[^ ]+}} to i8 addrspace(3)*
|
||||
; CHECK: store <2 x i32> zeroinitializer
|
||||
define amdgpu_kernel void @merge_v2p3i8(i8 addrspace(3)* addrspace(3)* nocapture %a, i8 addrspace(3)* addrspace(3)* nocapture readonly %b) #0 {
|
||||
entry:
|
||||
%a.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %a, i64 1
|
||||
%b.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %b, i64 1
|
||||
|
||||
%ld.c = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %b, align 4
|
||||
%ld.c.idx.1 = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %b.1, align 4
|
||||
|
||||
store i8 addrspace(3)* null, i8 addrspace(3)* addrspace(3)* %a, align 4
|
||||
store i8 addrspace(3)* null, i8 addrspace(3)* addrspace(3)* %a.1, align 4
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @merge_load_i64_ptr64(
|
||||
; CHECK: load <2 x i64>
|
||||
; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1
|
||||
; CHECK: inttoptr i64 [[ELT1]] to i8 addrspace(1)*
|
||||
define amdgpu_kernel void @merge_load_i64_ptr64(i64 addrspace(1)* nocapture %a) #0 {
|
||||
entry:
|
||||
%a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1
|
||||
%a.1.cast = bitcast i64 addrspace(1)* %a.1 to i8 addrspace(1)* addrspace(1)*
|
||||
|
||||
%ld.0 = load i64, i64 addrspace(1)* %a
|
||||
%ld.1 = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a.1.cast
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @merge_load_ptr64_i64(
|
||||
; CHECK: load <2 x i64>
|
||||
; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 0
|
||||
; CHECK: inttoptr i64 [[ELT0]] to i8 addrspace(1)*
|
||||
define amdgpu_kernel void @merge_load_ptr64_i64(i64 addrspace(1)* nocapture %a) #0 {
|
||||
entry:
|
||||
%a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(1)* addrspace(1)*
|
||||
%a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1
|
||||
|
||||
%ld.0 = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a.cast
|
||||
%ld.1 = load i64, i64 addrspace(1)* %a.1
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @merge_store_ptr64_i64(
|
||||
; CHECK: [[ELT0:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr0 to i64
|
||||
; CHECK: insertelement <2 x i64> undef, i64 [[ELT0]], i32 0
|
||||
; CHECK: store <2 x i64>
|
||||
define amdgpu_kernel void @merge_store_ptr64_i64(i64 addrspace(1)* nocapture %a, i8 addrspace(1)* %ptr0, i64 %val1) #0 {
|
||||
entry:
|
||||
%a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(1)* addrspace(1)*
|
||||
%a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1
|
||||
|
||||
|
||||
store i8 addrspace(1)* %ptr0, i8 addrspace(1)* addrspace(1)* %a.cast
|
||||
store i64 %val1, i64 addrspace(1)* %a.1
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @merge_store_i64_ptr64(
|
||||
; CHECK: [[ELT1:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr1 to i64
|
||||
; CHECK: insertelement <2 x i64> %{{[^ ]+}}, i64 [[ELT1]], i32 1
|
||||
; CHECK: store <2 x i64>
|
||||
define amdgpu_kernel void @merge_store_i64_ptr64(i8 addrspace(1)* addrspace(1)* nocapture %a, i64 %val0, i8 addrspace(1)* %ptr1) #0 {
|
||||
entry:
|
||||
%a.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a, i64 1
|
||||
%a.cast = bitcast i8 addrspace(1)* addrspace(1)* %a to i64 addrspace(1)*
|
||||
|
||||
store i64 %val0, i64 addrspace(1)* %a.cast
|
||||
store i8 addrspace(1)* %ptr1, i8 addrspace(1)* addrspace(1)* %a.1
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @merge_load_i32_ptr32(
|
||||
; CHECK: load <2 x i32>
|
||||
; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i32> %{{[^ ]+}}, i32 1
|
||||
; CHECK: inttoptr i32 [[ELT1]] to i8 addrspace(3)*
|
||||
define amdgpu_kernel void @merge_load_i32_ptr32(i32 addrspace(3)* nocapture %a) #0 {
|
||||
entry:
|
||||
%a.1 = getelementptr inbounds i32, i32 addrspace(3)* %a, i32 1
|
||||
%a.1.cast = bitcast i32 addrspace(3)* %a.1 to i8 addrspace(3)* addrspace(3)*
|
||||
|
||||
%ld.0 = load i32, i32 addrspace(3)* %a
|
||||
%ld.1 = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %a.1.cast
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @merge_load_ptr32_i32(
|
||||
; CHECK: load <2 x i32>
|
||||
; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i32> %{{[^ ]+}}, i32 0
|
||||
; CHECK: inttoptr i32 [[ELT0]] to i8 addrspace(3)*
|
||||
define amdgpu_kernel void @merge_load_ptr32_i32(i32 addrspace(3)* nocapture %a) #0 {
|
||||
entry:
|
||||
%a.cast = bitcast i32 addrspace(3)* %a to i8 addrspace(3)* addrspace(3)*
|
||||
%a.1 = getelementptr inbounds i32, i32 addrspace(3)* %a, i32 1
|
||||
|
||||
%ld.0 = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %a.cast
|
||||
%ld.1 = load i32, i32 addrspace(3)* %a.1
|
||||
|
||||
ret void
|
||||
}
; CHECK-LABEL: @merge_store_ptr32_i32(
; CHECK: [[ELT0:%[^ ]+]] = ptrtoint i8 addrspace(3)* %ptr0 to i32
; CHECK: insertelement <2 x i32> undef, i32 [[ELT0]], i32 0
; CHECK: store <2 x i32>
define amdgpu_kernel void @merge_store_ptr32_i32(i32 addrspace(3)* nocapture %a, i8 addrspace(3)* %ptr0, i32 %val1) #0 {
entry:
  %a.cast = bitcast i32 addrspace(3)* %a to i8 addrspace(3)* addrspace(3)*
  %a.1 = getelementptr inbounds i32, i32 addrspace(3)* %a, i32 1

  store i8 addrspace(3)* %ptr0, i8 addrspace(3)* addrspace(3)* %a.cast
  store i32 %val1, i32 addrspace(3)* %a.1

  ret void
}

; CHECK-LABEL: @merge_store_i32_ptr32(
; CHECK: [[ELT1:%[^ ]+]] = ptrtoint i8 addrspace(3)* %ptr1 to i32
; CHECK: insertelement <2 x i32> %{{[^ ]+}}, i32 [[ELT1]], i32 1
; CHECK: store <2 x i32>
define amdgpu_kernel void @merge_store_i32_ptr32(i8 addrspace(3)* addrspace(3)* nocapture %a, i32 %val0, i8 addrspace(3)* %ptr1) #0 {
entry:
  %a.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %a, i32 1
  %a.cast = bitcast i8 addrspace(3)* addrspace(3)* %a to i32 addrspace(3)*

  store i32 %val0, i32 addrspace(3)* %a.cast
  store i8 addrspace(3)* %ptr1, i8 addrspace(3)* addrspace(3)* %a.1

  ret void
}

; CHECK-LABEL: @no_merge_store_ptr32_i64(
; CHECK: store i8 addrspace(3)*
; CHECK: store i64
define amdgpu_kernel void @no_merge_store_ptr32_i64(i64 addrspace(1)* nocapture %a, i8 addrspace(3)* %ptr0, i64 %val1) #0 {
entry:
  %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(3)* addrspace(1)*
  %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1

  store i8 addrspace(3)* %ptr0, i8 addrspace(3)* addrspace(1)* %a.cast
  store i64 %val1, i64 addrspace(1)* %a.1

  ret void
}

; CHECK-LABEL: @no_merge_store_i64_ptr32(
; CHECK: store i64
; CHECK: store i8 addrspace(3)*
define amdgpu_kernel void @no_merge_store_i64_ptr32(i8 addrspace(3)* addrspace(1)* nocapture %a, i64 %val0, i8 addrspace(3)* %ptr1) #0 {
entry:
  %a.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %a, i64 1
  %a.cast = bitcast i8 addrspace(3)* addrspace(1)* %a to i64 addrspace(1)*

  store i64 %val0, i64 addrspace(1)* %a.cast
  store i8 addrspace(3)* %ptr1, i8 addrspace(3)* addrspace(1)* %a.1

  ret void
}

; CHECK-LABEL: @no_merge_load_i64_ptr32(
; CHECK: load i64,
; CHECK: load i8 addrspace(3)*,
define amdgpu_kernel void @no_merge_load_i64_ptr32(i64 addrspace(1)* nocapture %a) #0 {
entry:
  %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1
  %a.1.cast = bitcast i64 addrspace(1)* %a.1 to i8 addrspace(3)* addrspace(1)*

  %ld.0 = load i64, i64 addrspace(1)* %a
  %ld.1 = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %a.1.cast

  ret void
}

; CHECK-LABEL: @no_merge_load_ptr32_i64(
; CHECK: load i8 addrspace(3)*,
; CHECK: load i64,
define amdgpu_kernel void @no_merge_load_ptr32_i64(i64 addrspace(1)* nocapture %a) #0 {
entry:
  %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(3)* addrspace(1)*
  %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1

  %ld.0 = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %a.cast
  %ld.1 = load i64, i64 addrspace(1)* %a.1

  ret void
}

; XXX - This isn't merged for some reason
; CHECK-LABEL: @merge_v2p1i8_v2p1i8(
; CHECK: load <2 x i8 addrspace(1)*>
; CHECK: load <2 x i8 addrspace(1)*>
; CHECK: store <2 x i8 addrspace(1)*>
; CHECK: store <2 x i8 addrspace(1)*>
define amdgpu_kernel void @merge_v2p1i8_v2p1i8(<2 x i8 addrspace(1)*> addrspace(1)* nocapture noalias %a, <2 x i8 addrspace(1)*> addrspace(1)* nocapture readonly noalias %b) #0 {
entry:
  %a.1 = getelementptr inbounds <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %a, i64 1
  %b.1 = getelementptr inbounds <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %b, i64 1

  %ld.c = load <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %b, align 4
  %ld.c.idx.1 = load <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %b.1, align 4

  store <2 x i8 addrspace(1)*> zeroinitializer, <2 x i8 addrspace(1)*> addrspace(1)* %a, align 4
  store <2 x i8 addrspace(1)*> zeroinitializer, <2 x i8 addrspace(1)*> addrspace(1)* %a.1, align 4
  ret void
}

; CHECK-LABEL: @merge_load_ptr64_f64(
; CHECK: load <2 x i64>
; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 0
; CHECK: [[ELT0_INT:%[^ ]+]] = inttoptr i64 [[ELT0]] to i8 addrspace(1)*
; CHECK: [[ELT1_INT:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1
; CHECK: bitcast i64 [[ELT1_INT]] to double
define amdgpu_kernel void @merge_load_ptr64_f64(double addrspace(1)* nocapture %a) #0 {
entry:
  %a.cast = bitcast double addrspace(1)* %a to i8 addrspace(1)* addrspace(1)*
  %a.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1

  %ld.0 = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a.cast
  %ld.1 = load double, double addrspace(1)* %a.1

  ret void
}

; CHECK-LABEL: @merge_load_f64_ptr64(
; CHECK: load <2 x i64>
; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 0
; CHECK: bitcast i64 [[ELT0]] to double
; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1
; CHECK: inttoptr i64 [[ELT1]] to i8 addrspace(1)*
define amdgpu_kernel void @merge_load_f64_ptr64(double addrspace(1)* nocapture %a) #0 {
entry:
  %a.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1
  %a.1.cast = bitcast double addrspace(1)* %a.1 to i8 addrspace(1)* addrspace(1)*

  %ld.0 = load double, double addrspace(1)* %a
  %ld.1 = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a.1.cast

  ret void
}

; CHECK-LABEL: @merge_store_ptr64_f64(
; CHECK: [[ELT0_INT:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr0 to i64
; CHECK: insertelement <2 x i64> undef, i64 [[ELT0_INT]], i32 0
; CHECK: [[ELT1_INT:%[^ ]+]] = bitcast double %val1 to i64
; CHECK: insertelement <2 x i64> %{{[^ ]+}}, i64 [[ELT1_INT]], i32 1
; CHECK: store <2 x i64>
define amdgpu_kernel void @merge_store_ptr64_f64(double addrspace(1)* nocapture %a, i8 addrspace(1)* %ptr0, double %val1) #0 {
entry:
  %a.cast = bitcast double addrspace(1)* %a to i8 addrspace(1)* addrspace(1)*
  %a.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1

  store i8 addrspace(1)* %ptr0, i8 addrspace(1)* addrspace(1)* %a.cast
  store double %val1, double addrspace(1)* %a.1

  ret void
}

; CHECK-LABEL: @merge_store_f64_ptr64(
; CHECK: [[ELT0_INT:%[^ ]+]] = bitcast double %val0 to i64
; CHECK: insertelement <2 x i64> undef, i64 [[ELT0_INT]], i32 0
; CHECK: [[ELT1_INT:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr1 to i64
; CHECK: insertelement <2 x i64> %{{[^ ]+}}, i64 [[ELT1_INT]], i32 1
; CHECK: store <2 x i64>
define amdgpu_kernel void @merge_store_f64_ptr64(i8 addrspace(1)* addrspace(1)* nocapture %a, double %val0, i8 addrspace(1)* %ptr1) #0 {
entry:
  %a.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a, i64 1
  %a.cast = bitcast i8 addrspace(1)* addrspace(1)* %a to double addrspace(1)*

  store double %val0, double addrspace(1)* %a.cast
  store i8 addrspace(1)* %ptr1, i8 addrspace(1)* addrspace(1)* %a.1

  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
@ -1,58 +0,0 @@
; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s

; Check that, in the presence of an aliasing load, the stores preceding the
; aliasing load are safe to vectorize.
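; (The expected shape, given the CHECK lines below: the four stores to
; %a+0..12 become one <4 x float> store, the four loads from %b+16..28 become
; one <4 x float> load, and the four stores to %a+16..28 become a second
; <4 x float> store, preserving their order around the aliasing accesses.)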
; CHECK-LABEL: store_vectorize_with_alias
; CHECK: store <4 x float>
; CHECK: load <4 x float>
; CHECK: store <4 x float>

; Function Attrs: nounwind
define amdgpu_kernel void @store_vectorize_with_alias(i8 addrspace(1)* %a, i8 addrspace(1)* %b) #0 {
bb:
  %tmp = bitcast i8 addrspace(1)* %b to float addrspace(1)*
  %tmp1 = load float, float addrspace(1)* %tmp, align 4

  %tmp2 = bitcast i8 addrspace(1)* %a to float addrspace(1)*
  store float %tmp1, float addrspace(1)* %tmp2, align 4
  %tmp3 = getelementptr i8, i8 addrspace(1)* %a, i64 4
  %tmp4 = bitcast i8 addrspace(1)* %tmp3 to float addrspace(1)*
  store float %tmp1, float addrspace(1)* %tmp4, align 4
  %tmp5 = getelementptr i8, i8 addrspace(1)* %a, i64 8
  %tmp6 = bitcast i8 addrspace(1)* %tmp5 to float addrspace(1)*
  store float %tmp1, float addrspace(1)* %tmp6, align 4
  %tmp7 = getelementptr i8, i8 addrspace(1)* %a, i64 12
  %tmp8 = bitcast i8 addrspace(1)* %tmp7 to float addrspace(1)*
  store float %tmp1, float addrspace(1)* %tmp8, align 4

  %tmp9 = getelementptr i8, i8 addrspace(1)* %b, i64 16
  %tmp10 = bitcast i8 addrspace(1)* %tmp9 to float addrspace(1)*
  %tmp11 = load float, float addrspace(1)* %tmp10, align 4
  %tmp12 = getelementptr i8, i8 addrspace(1)* %b, i64 20
  %tmp13 = bitcast i8 addrspace(1)* %tmp12 to float addrspace(1)*
  %tmp14 = load float, float addrspace(1)* %tmp13, align 4
  %tmp15 = getelementptr i8, i8 addrspace(1)* %b, i64 24
  %tmp16 = bitcast i8 addrspace(1)* %tmp15 to float addrspace(1)*
  %tmp17 = load float, float addrspace(1)* %tmp16, align 4
  %tmp18 = getelementptr i8, i8 addrspace(1)* %b, i64 28
  %tmp19 = bitcast i8 addrspace(1)* %tmp18 to float addrspace(1)*
  %tmp20 = load float, float addrspace(1)* %tmp19, align 4

  %tmp21 = getelementptr i8, i8 addrspace(1)* %a, i64 16
  %tmp22 = bitcast i8 addrspace(1)* %tmp21 to float addrspace(1)*
  store float %tmp11, float addrspace(1)* %tmp22, align 4
  %tmp23 = getelementptr i8, i8 addrspace(1)* %a, i64 20
  %tmp24 = bitcast i8 addrspace(1)* %tmp23 to float addrspace(1)*
  store float %tmp14, float addrspace(1)* %tmp24, align 4
  %tmp25 = getelementptr i8, i8 addrspace(1)* %a, i64 24
  %tmp26 = bitcast i8 addrspace(1)* %tmp25 to float addrspace(1)*
  store float %tmp17, float addrspace(1)* %tmp26, align 4
  %tmp27 = getelementptr i8, i8 addrspace(1)* %a, i64 28
  %tmp28 = bitcast i8 addrspace(1)* %tmp27 to float addrspace(1)*
  store float %tmp20, float addrspace(1)* %tmp28, align 4

  ret void
}

attributes #0 = { argmemonly nounwind }
@ -1,199 +0,0 @@
; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s

; Checks that we don't merge loads/stores of types smaller than one
; byte, or vectors with elements smaller than one byte.
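; (A plausible reading of these cases: an i1 or i2 access occupies a full
; byte in memory while the value is only 1 or 2 bits wide, so a bit-packed
; merged access would not match the in-memory layout; the vectorizer appears
; to leave any type whose bit width is not a byte multiple alone, which is
; what the scalar CHECK patterns below expect.)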
%struct.foo = type { i32, i8 }

declare void @use_i1(i1)
declare void @use_i2(i2)
declare void @use_i8(i8)
declare void @use_foo(%struct.foo)
declare void @use_v2i2(<2 x i2>)
declare void @use_v4i2(<4 x i2>)
declare void @use_v2i9(<2 x i9>)

; CHECK-LABEL: @merge_store_2_constants_i1(
; CHECK: store i1
; CHECK: store i1
define amdgpu_kernel void @merge_store_2_constants_i1(i1 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i1, i1 addrspace(1)* %out, i32 1
  store i1 true, i1 addrspace(1)* %out.gep.1
  store i1 false, i1 addrspace(1)* %out
  ret void
}

; CHECK-LABEL: @merge_store_2_constants_i2(
; CHECK: store i2 1
; CHECK: store i2 -1
define amdgpu_kernel void @merge_store_2_constants_i2(i2 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i2, i2 addrspace(1)* %out, i32 1
  store i2 1, i2 addrspace(1)* %out.gep.1
  store i2 -1, i2 addrspace(1)* %out
  ret void
}

; CHECK-LABEL: @merge_different_store_sizes_i1_i8(
; CHECK: store i1 true
; CHECK: store i8 123
define amdgpu_kernel void @merge_different_store_sizes_i1_i8(i8 addrspace(1)* %out) #0 {
  %out.i1 = bitcast i8 addrspace(1)* %out to i1 addrspace(1)*
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
  store i1 true, i1 addrspace(1)* %out.i1
  store i8 123, i8 addrspace(1)* %out.gep.1
  ret void
}

; CHECK-LABEL: @merge_different_store_sizes_i8_i1(
; CHECK: store i8 123
; CHECK: store i1 true
define amdgpu_kernel void @merge_different_store_sizes_i8_i1(i1 addrspace(1)* %out) #0 {
  %out.i8 = bitcast i1 addrspace(1)* %out to i8 addrspace(1)*
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out.i8, i32 1
  store i8 123, i8 addrspace(1)* %out.gep.1
  store i1 true, i1 addrspace(1)* %out
  ret void
}

; CHECK-LABEL: @merge_store_2_constant_structs(
; CHECK: store %struct.foo
; CHECK: store %struct.foo
define amdgpu_kernel void @merge_store_2_constant_structs(%struct.foo addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr %struct.foo, %struct.foo addrspace(1)* %out, i32 1
  store %struct.foo { i32 12, i8 3 }, %struct.foo addrspace(1)* %out.gep.1
  store %struct.foo { i32 92, i8 9 }, %struct.foo addrspace(1)* %out
  ret void
}
; sub-byte element size
; CHECK-LABEL: @merge_store_2_constants_v2i2(
; CHECK: store <2 x i2>
; CHECK: store <2 x i2>
define amdgpu_kernel void @merge_store_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr <2 x i2>, <2 x i2> addrspace(1)* %out, i32 1
  store <2 x i2> <i2 1, i2 -1>, <2 x i2> addrspace(1)* %out.gep.1
  store <2 x i2> <i2 -1, i2 1>, <2 x i2> addrspace(1)* %out
  ret void
}

; sub-byte element size, but the vector as a whole is byte-sized
; (<4 x i2> is 8 bits); the two stores are still not merged
; CHECK-LABEL: @merge_store_2_constants_v4i2(
; CHECK: store <4 x i2>
; CHECK: store <4 x i2>
define amdgpu_kernel void @merge_store_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr <4 x i2>, <4 x i2> addrspace(1)* %out, i32 1
  store <4 x i2> <i2 1, i2 -1, i2 1, i2 -1>, <4 x i2> addrspace(1)* %out.gep.1
  store <4 x i2> <i2 -1, i2 1, i2 -1, i2 1>, <4 x i2> addrspace(1)* %out
  ret void
}
; CHECK-LABEL: @merge_load_2_constants_i1(
; CHECK: load i1
; CHECK: load i1
define amdgpu_kernel void @merge_load_2_constants_i1(i1 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i1, i1 addrspace(1)* %out, i32 1
  %x = load i1, i1 addrspace(1)* %out.gep.1
  %y = load i1, i1 addrspace(1)* %out
  call void @use_i1(i1 %x)
  call void @use_i1(i1 %y)
  ret void
}

; CHECK-LABEL: @merge_load_2_constants_i2(
; CHECK: load i2
; CHECK: load i2
define amdgpu_kernel void @merge_load_2_constants_i2(i2 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i2, i2 addrspace(1)* %out, i32 1
  %x = load i2, i2 addrspace(1)* %out.gep.1
  %y = load i2, i2 addrspace(1)* %out
  call void @use_i2(i2 %x)
  call void @use_i2(i2 %y)
  ret void
}

; CHECK-LABEL: @merge_different_load_sizes_i1_i8(
; CHECK: load i1
; CHECK: load i8
define amdgpu_kernel void @merge_different_load_sizes_i1_i8(i8 addrspace(1)* %out) #0 {
  %out.i1 = bitcast i8 addrspace(1)* %out to i1 addrspace(1)*
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
  %x = load i1, i1 addrspace(1)* %out.i1
  %y = load i8, i8 addrspace(1)* %out.gep.1
  call void @use_i1(i1 %x)
  call void @use_i8(i8 %y)
  ret void
}

; CHECK-LABEL: @merge_different_load_sizes_i8_i1(
; CHECK: load i8
; CHECK: load i1
define amdgpu_kernel void @merge_different_load_sizes_i8_i1(i1 addrspace(1)* %out) #0 {
  %out.i8 = bitcast i1 addrspace(1)* %out to i8 addrspace(1)*
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out.i8, i32 1
  %x = load i8, i8 addrspace(1)* %out.gep.1
  %y = load i1, i1 addrspace(1)* %out
  call void @use_i8(i8 %x)
  call void @use_i1(i1 %y)
  ret void
}

; CHECK-LABEL: @merge_load_2_constant_structs(
; CHECK: load %struct.foo
; CHECK: load %struct.foo
define amdgpu_kernel void @merge_load_2_constant_structs(%struct.foo addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr %struct.foo, %struct.foo addrspace(1)* %out, i32 1
  %x = load %struct.foo, %struct.foo addrspace(1)* %out.gep.1
  %y = load %struct.foo, %struct.foo addrspace(1)* %out
  call void @use_foo(%struct.foo %x)
  call void @use_foo(%struct.foo %y)
  ret void
}

; CHECK-LABEL: @merge_load_2_constants_v2i2(
; CHECK: load <2 x i2>
; CHECK: load <2 x i2>
define amdgpu_kernel void @merge_load_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr <2 x i2>, <2 x i2> addrspace(1)* %out, i32 1
  %x = load <2 x i2>, <2 x i2> addrspace(1)* %out.gep.1
  %y = load <2 x i2>, <2 x i2> addrspace(1)* %out
  call void @use_v2i2(<2 x i2> %x)
  call void @use_v2i2(<2 x i2> %y)
  ret void
}

; CHECK-LABEL: @merge_load_2_constants_v4i2(
; CHECK: load <4 x i2>
; CHECK: load <4 x i2>
define amdgpu_kernel void @merge_load_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr <4 x i2>, <4 x i2> addrspace(1)* %out, i32 1
  %x = load <4 x i2>, <4 x i2> addrspace(1)* %out.gep.1
  %y = load <4 x i2>, <4 x i2> addrspace(1)* %out
  call void @use_v4i2(<4 x i2> %x)
  call void @use_v4i2(<4 x i2> %y)
  ret void
}

; CHECK-LABEL: @merge_store_2_constants_i9(
; CHECK: store i9 3
; CHECK: store i9 -5
define amdgpu_kernel void @merge_store_2_constants_i9(i9 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i9, i9 addrspace(1)* %out, i32 1
  store i9 3, i9 addrspace(1)* %out.gep.1
  store i9 -5, i9 addrspace(1)* %out
  ret void
}

; CHECK-LABEL: @merge_load_2_constants_v2i9(
; CHECK: load <2 x i9>
; CHECK: load <2 x i9>
define amdgpu_kernel void @merge_load_2_constants_v2i9(<2 x i9> addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr <2 x i9>, <2 x i9> addrspace(1)* %out, i32 1
  %x = load <2 x i9>, <2 x i9> addrspace(1)* %out.gep.1
  %y = load <2 x i9>, <2 x i9> addrspace(1)* %out
  call void @use_v2i9(<2 x i9> %x)
  call void @use_v2i9(<2 x i9> %y)
  ret void
}

attributes #0 = { nounwind }