Imported Upstream version 6.4.0.137

Former-commit-id: 943baa9f16a098c33e129777827f3a9d20da00d6
Xamarin Public Jenkins (auto-signing)
2019-07-26 19:53:28 +00:00
parent e9207cf623
commit ef583813eb
2712 changed files with 74169 additions and 40587 deletions


@@ -28,6 +28,28 @@ return: ; preds = %if.then172, %cond.e
ret void
}
; Avoid an assert/bad codegen in LD1LANEPOST lowering by not forming
; LD1LANEPOST ISD nodes with a non-constant lane index.
define <4 x i32> @f2(i32 *%p, <4 x i1> %m, <4 x i32> %v1, <4 x i32> %v2, i32 %idx) {
%L0 = load i32, i32* %p
%p1 = getelementptr i32, i32* %p, i64 1
%L1 = load i32, i32* %p1
%v = select <4 x i1> %m, <4 x i32> %v1, <4 x i32> %v2
%vret = insertelement <4 x i32> %v, i32 %L0, i32 %idx
store i32 %L1, i32 *%p
ret <4 x i32> %vret
}
; Check that a cycle is avoided during isel between the LD1LANEPOST instruction and the load of %L1.
define <4 x i32> @f3(i32 *%p, <4 x i1> %m, <4 x i32> %v1, <4 x i32> %v2) {
%L0 = load i32, i32* %p
%p1 = getelementptr i32, i32* %p, i64 1
%L1 = load i32, i32* %p1
%v = select <4 x i1> %m, <4 x i32> %v1, <4 x i32> %v2
%vret = insertelement <4 x i32> %v, i32 %L0, i32 %L1
ret <4 x i32> %vret
}
; Function Attrs: nounwind readnone
declare i64 @llvm.objectsize.i64.p0i8(i8*, i1) #1
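For contrast with @f2 and @f3 above, the pattern LD1LANEPOST formation does handle is a lane insert whose index is a compile-time constant. A minimal sketch, using the same typed-pointer IR style as the tests above (the function name @f_const is illustrative, not part of this diff):

define <4 x i32> @f_const(i32* %p, <4 x i32> %v) {
; Illustrative sketch, not part of the diff. With a constant lane
; index, ISel may fold the load (plus a following pointer increment)
; into a single post-indexed LD1 lane load.
  %L0 = load i32, i32* %p
  %vret = insertelement <4 x i32> %v, i32 %L0, i32 0
  ret <4 x i32> %vret
}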


@@ -1,27 +1,31 @@
; RUN: llc -mtriple=arm64-apple-ios -mcpu=cyclone < %s | FileCheck %s -check-prefix=CYCLONE --check-prefix=ALL
; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=kryo < %s | FileCheck %s -check-prefix=KRYO --check-prefix=ALL
; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=falkor < %s | FileCheck %s -check-prefix=FALKOR --check-prefix=ALL
; RUN: llc -mtriple=arm64-apple-ios -mcpu=cyclone < %s | FileCheck %s -check-prefixes=ALL,CYCLONE
; RUN: llc -mtriple=arm64-apple-ios -mcpu=cyclone -mattr=+fullfp16 < %s | FileCheck %s -check-prefixes=CYCLONE-FULLFP16
; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=exynos-m1 < %s | FileCheck %s -check-prefixes=ALL,OTHERS
; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=exynos-m3 < %s | FileCheck %s -check-prefixes=ALL,OTHERS
; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=kryo < %s | FileCheck %s -check-prefixes=ALL,OTHERS
; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=falkor < %s | FileCheck %s -check-prefixes=ALL,OTHERS
; rdar://11481771
; rdar://13713797
declare void @bar(half, float, double, <2 x double>)
declare void @bari(i32, i32)
declare void @barl(i64, i64)
declare void @barf(float, float)
define void @t1() nounwind ssp {
entry:
; ALL-LABEL: t1:
; ALL-NOT: fmov
; CYCLONE: fmov d0, xzr
; CYCLONE: fmov d1, xzr
; ALL: ldr h0,{{.*}}
; CYCLONE: fmov s1, wzr
; CYCLONE: fmov d2, xzr
; CYCLONE: fmov d3, xzr
; KRYO: movi v0.2d, #0000000000000000
; KRYO: movi v1.2d, #0000000000000000
; KRYO: movi v2.2d, #0000000000000000
; KRYO: movi v3.2d, #0000000000000000
; FALKOR: movi v0.2d, #0000000000000000
; FALKOR: movi v1.2d, #0000000000000000
; FALKOR: movi v2.2d, #0000000000000000
; FALKOR: movi v3.2d, #0000000000000000
tail call void @bar(double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00) nounwind
; CYCLONE: movi.16b v3, #0
; CYCLONE-FULLFP16: fmov h0, wzr
; CYCLONE-FULLFP16: fmov s1, wzr
; CYCLONE-FULLFP16: fmov d2, xzr
; CYCLONE-FULLFP16: movi.16b v3, #0
; OTHERS: movi v{{[0-3]+}}.2d, #0000000000000000
; OTHERS: movi v{{[0-3]+}}.2d, #0000000000000000
; OTHERS: movi v{{[0-3]+}}.2d, #0000000000000000
tail call void @bar(half 0.000000e+00, float 0.000000e+00, double 0.000000e+00, <2 x double> <double 0.000000e+00, double 0.000000e+00>) nounwind
ret void
}
@@ -29,8 +33,8 @@ define void @t2() nounwind ssp {
entry:
; ALL-LABEL: t2:
; ALL-NOT: mov w0, wzr
; ALL: mov w0, #0
; ALL: mov w1, #0
; ALL: mov w{{[0-3]+}}, #0
; ALL: mov w{{[0-3]+}}, #0
tail call void @bari(i32 0, i32 0) nounwind
ret void
}
@@ -39,8 +43,8 @@ define void @t3() nounwind ssp {
entry:
; ALL-LABEL: t3:
; ALL-NOT: mov x0, xzr
; ALL: mov x0, #0
; ALL: mov x1, #0
; ALL: mov x{{[0-3]+}}, #0
; ALL: mov x{{[0-3]+}}, #0
tail call void @barl(i64 0, i64 0) nounwind
ret void
}
@@ -48,26 +52,21 @@ entry:
define void @t4() nounwind ssp {
; ALL-LABEL: t4:
; ALL-NOT: fmov
; CYCLONE: fmov s0, wzr
; CYCLONE: fmov s1, wzr
; KRYO: movi v0.2d, #0000000000000000
; KRYO: movi v1.2d, #0000000000000000
; FALKOR: movi v0.2d, #0000000000000000
; FALKOR: movi v1.2d, #0000000000000000
; CYCLONE: fmov s{{[0-3]+}}, wzr
; CYCLONE: fmov s{{[0-3]+}}, wzr
; CYCLONE-FULLFP16: fmov s{{[0-3]+}}, wzr
; CYCLONE-FULLFP16: fmov s{{[0-3]+}}, wzr
; OTHERS: movi v{{[0-3]+}}.2d, #0000000000000000
; OTHERS: movi v{{[0-3]+}}.2d, #0000000000000000
tail call void @barf(float 0.000000e+00, float 0.000000e+00) nounwind
ret void
}
declare void @bar(double, double, double, double)
declare void @bari(i32, i32)
declare void @barl(i64, i64)
declare void @barf(float, float)
; We used to produce spills+reloads for a Q register with zero cycle zeroing
; enabled.
; ALL-LABEL: foo:
; ALL-NOT: str {{q[0-9]+}}
; ALL-NOT: ldr {{q[0-9]+}}
; ALL-NOT: str q{{[0-9]+}}
; ALL-NOT: ldr q{{[0-9]+}}
define double @foo(i32 %n) {
entry:
br label %for.body
@@ -90,8 +89,7 @@ for.end:
define <2 x i64> @t6() {
; ALL-LABEL: t6:
; CYCLONE: movi.16b v0, #0
; KRYO: movi v0.2d, #0000000000000000
; FALKOR: movi v0.2d, #0000000000000000
; OTHERS: movi v0.2d, #0000000000000000
ret <2 x i64> zeroinitializer
}


@@ -353,3 +353,28 @@ body: |
bb.1:
RET_ReallyLR
...
---
# Check that non-base registers are considered live when finding a
# scratch register by making sure we don't use %x2 for the scratch
# register for the inserted ORRXrs.
# CHECK-LABEL: name: hwpf_offreg
# CHECK: %x3 = ORRXrs %xzr, %x1, 0
# CHECK: %w10 = LDRWroX %x3, %x2, 0, 0
name: hwpf_offreg
tracksRegLiveness: true
body: |
bb.0:
liveins: %w0, %x1, %x2, %x17, %x18
%w10 = LDRWroX %x1, %x2, 0, 0 :: ("aarch64-strided-access" load 4)
%x2 = ORRXrs %xzr, %x10, 0
%w26 = LDRWroX %x1, %x2, 0, 0
%w0 = SUBWri %w0, 1, 0
%wzr = SUBSWri %w0, 0, 0, implicit-def %nzcv
Bcc 9, %bb.0, implicit %nzcv
bb.1:
RET_ReallyLR
...


@@ -0,0 +1,20 @@
;RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
@var = global i32 0
define void @test_inline_constraint_S() {
; CHECK-LABEL: test_inline_constraint_S:
call void asm sideeffect "adrp x0, $0", "S"(i32* @var)
call void asm sideeffect "add x0, x0, :lo12:$0", "S"(i32* @var)
; CHECK: adrp x0, var
; CHECK: add x0, x0, :lo12:var
ret void
}
define i32 @test_inline_constraint_S_label(i1 %in) {
; CHECK-LABEL: test_inline_constraint_S_label:
call void asm sideeffect "adr x0, $0", "S"(i8* blockaddress(@test_inline_constraint_S_label, %loc))
; CHECK: adr x0, .Ltmp{{[0-9]+}}
br i1 %in, label %loc, label %loc2
loc:
ret i32 0
loc2:
ret i32 42
}


@@ -0,0 +1,35 @@
# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass=prologepilog %s -o - | FileCheck %s
# Ensure references to scavenged stack slots in the CSR area use the
# FP as a base when the stack pointer must be aligned to something
# larger than required by the target. This is necessary because the
# alignment padding area is between the CSR area and the SP, so the SP
# cannot be used to reference the CSR area.
name: test
tracksRegLiveness: true
frameInfo:
maxAlignment: 64
# CHECK: stack:
# CHECK: id: 0, name: '', type: default, offset: -64, size: 4, alignment: 64
# CHECK-NEXT: stack-id: 0
# CHECK-NEXT: local-offset: -64
# CHECK: id: 1, name: '', type: default, offset: -20, size: 4, alignment: 4
# CHECK-NEXT: stack-id: 0
# CHECK-NEXT: local-offset: -68
stack:
- { id: 0, size: 4, alignment: 64, local-offset: -64 }
- { id: 1, size: 4, alignment: 4, local-offset: -68 }
# CHECK: body:
# CHECK: %sp = ANDXri killed %{{x[0-9]+}}, 7865
# CHECK: STRSui %s0, %sp, 0
# CHECK: STURSi %s0, %fp, -4
body: |
bb.0.entry:
liveins: %s0
STRSui %s0, %stack.0, 0
STRSui %s0, %stack.1, 0
; Force preserve a CSR to create a hole in the CSR stack region.
%x28 = IMPLICIT_DEF
RET_ReallyLR
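At the IR level, the situation this MIR exercises arises whenever a local needs more alignment than the target stack guarantees. A minimal sketch, assuming AArch64's default 16-byte stack alignment (names are illustrative, not part of this diff):

define void @overaligned_local() {
; Illustrative sketch, not part of the diff. align 64 exceeds the
; default stack alignment, so the prologue must realign SP; slots
; placed near the CSR area then have to be addressed off the FP,
; because the realignment padding sits between the CSR area and SP.
  %slot = alloca i32, align 64
  store volatile i32 0, i32* %slot
  ret void
}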


@@ -0,0 +1,25 @@
# RUN: llc -march=hexagon -run-pass if-converter %s -o - | FileCheck %s
# Make sure this gets if-converted and it doesn't crash.
# CHECK-LABEL: bb.0
# CHECK: PS_jmpret %r31
# CHECK-NOT: bb.{{[1-9]+}}:
---
name: fred
tracksRegLiveness: true
body: |
bb.0:
successors: %bb.1, %bb.2
liveins: %r0
renamable %p0 = C2_cmpeqi killed renamable %r0, 0
J2_jumpf killed renamable %p0, %bb.2, implicit-def dead %pc
bb.1:
S4_storeiri_io undef renamable %r0, 0, 32768 :: (store 4 into `i32* undef`)
PS_jmpret %r31, implicit-def dead %pc
bb.2:
S4_storeiri_io undef renamable %r0, 0, 32768 :: (store 4 into `i32* undef`)
PS_jmpret %r31, implicit-def dead %pc
...


@@ -0,0 +1,34 @@
# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -run-pass=if-converter %s -o - | FileCheck %s
---
name: foo
body: |
bb.0:
liveins: %x0, %x3
successors: %bb.1(0x40000000), %bb.2(0x40000000)
dead renamable %x3 = ANDIo8 killed renamable %x3, 1, implicit-def dead %cr0, implicit-def %cr0gt
%cr2lt = CROR %cr0gt, %cr0gt
BCn killed renamable %cr2lt, %bb.2
B %bb.1
bb.1:
renamable %x3 = LIS8 4096
MTLR8 %x0, implicit-def %lr8
BLR8 implicit %lr8, implicit %rm, implicit %x3
bb.2:
renamable %x3 = LIS8 4096
MTLR8 %x0, implicit-def %lr8
BLR8 implicit %lr8, implicit %rm, implicit %x3
...
# Diamond testcase with equivalent branches terminating in returns.
# CHECK: body: |
# CHECK: bb.0:
# CHECK: dead renamable %x3 = ANDIo8 killed renamable %x3, 1, implicit-def dead %cr0, implicit-def %cr0gt
# CHECK: %cr2lt = CROR %cr0gt, %cr0gt
# CHECK: renamable %x3 = LIS8 4096
# CHECK: MTLR8 %x0, implicit-def %lr8
# CHECK: BLR8 implicit %lr8, implicit %rm, implicit %x3

File diff suppressed because it is too large.


@@ -0,0 +1,13 @@
; Negative test. The constraint 'l' represents the register 'lo'.
; Check error message in case of invalid usage.
;
; RUN: not llc -march=mips -filetype=obj < %s 2>&1 | FileCheck %s
define void @constraint_l() nounwind {
entry:
; CHECK: error: invalid operand for instruction
tail call i16 asm sideeffect "addiu $0,$1,$2", "=l,r,r,~{$1}"(i16 0, i16 0)
ret void
}


@@ -41,5 +41,15 @@ entry:
%4 = call i32 asm sideeffect "\09mtlo $3 \0A\09\09madd $1, $2 ", "=l,r,r,r"(i32 7, i32 6, i32 44) nounwind
store volatile i32 %4, i32* %bosco, align 4
; Check the 'l' constraint for 16-bit type.
; CHECK: #APP
; CHECK: mtlo ${{[0-9]+}}
; CHECK-NEXT: madd ${{[0-9]+}}, ${{[0-9]+}}
; CHECK: #NO_APP
; CHECK-NEXT: mflo ${{[0-9]+}}
%bosco16 = alloca i16, align 4
%5 = call i16 asm sideeffect "\09mtlo $3 \0A\09\09madd $1, $2 ", "=l,r,r,r"(i32 7, i32 6, i32 44) nounwind
store volatile i16 %5, i16* %bosco16, align 4
ret i32 0
}


@@ -1 +1 @@
67733795ed5de0cf155b38a0ff3bb1a92a429e27
f2ca07367b991eaea539df025ca1dc1a8d20b401


@@ -0,0 +1,75 @@
; RUN: opt -early-cse-memssa -loop-rotate -licm -loop-rotate -S %s -o - | FileCheck %s
; ModuleID = 'bugpoint-reduced-simplified.bc'
source_filename = "bugpoint-output-8903f29.bc"
target datalayout = "e-m:e-i64:64-n32:64"
target triple = "powerpc64le-unknown-linux-gnu"
define void @test(i64 %arg.ssa, i64 %arg.nb) local_unnamed_addr {
; Ensure that loop rotation doesn't duplicate the call to
; llvm.ppc.is.decremented.ctr.nonzero
; CHECK-LABEL: test
; CHECK: call i1 @llvm.ppc.is.decremented.ctr.nonzero
; CHECK-NOT: call i1 @llvm.ppc.is.decremented.ctr.nonzero
; CHECK: declare i1 @llvm.ppc.is.decremented.ctr.nonzero
entry:
switch i32 undef, label %BB_8 [
i32 -2, label %BB_9
i32 0, label %BB_9
]
BB_1: ; preds = %BB_12, %BB_4
%bcount.1.us = phi i64 [ %.810.us, %BB_4 ], [ 0, %BB_12 ]
%0 = add i64 %arg.ssa, %bcount.1.us
%.568.us = load i32, i32* undef, align 4
%.15.i.us = icmp slt i32 0, %.568.us
br i1 %.15.i.us, label %BB_3, label %BB_2
BB_2: ; preds = %BB_1
%.982.us = add nsw i64 %0, 1
unreachable
BB_3: ; preds = %BB_1
%1 = add i64 %arg.ssa, %bcount.1.us
%2 = add i64 %1, 1
%3 = call i1 @llvm.ppc.is.decremented.ctr.nonzero()
br i1 %3, label %BB_4, label %BB_7
BB_4: ; preds = %BB_3
%.810.us = add nuw nsw i64 %bcount.1.us, 1
br label %BB_1
BB_5: ; preds = %BB_7, %BB_5
%lsr.iv20.i116 = phi i64 [ %2, %BB_7 ], [ %lsr.iv.next21.i126, %BB_5 ]
%lsr.iv.next21.i126 = add i64 %lsr.iv20.i116, 1
br i1 undef, label %BB_5, label %BB_6
BB_6: ; preds = %BB_5
ret void
BB_7: ; preds = %BB_3
br label %BB_5
BB_8: ; preds = %entry
ret void
BB_9: ; preds = %entry, %entry
br label %BB_10
BB_10: ; preds = %BB_9
br label %BB_11
BB_11: ; preds = %BB_11, %BB_10
br i1 undef, label %BB_11, label %BB_12
BB_12: ; preds = %BB_11
call void @llvm.ppc.mtctr.i64(i64 %arg.nb)
br label %BB_1
}
; Function Attrs: nounwind
declare void @llvm.ppc.mtctr.i64(i64) #0
; Function Attrs: nounwind
declare i1 @llvm.ppc.is.decremented.ctr.nonzero() #0
attributes #0 = { nounwind }
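The intrinsic pair declared above has a simple canonical shape. A minimal sketch of a CTR-style countdown loop (illustrative only, not part of this diff; the declarations match those in the test):

declare void @llvm.ppc.mtctr.i64(i64)
declare i1 @llvm.ppc.is.decremented.ctr.nonzero()

define void @ctr_loop(i64 %n) {
entry:
  ; Load the trip count into CTR once, before entering the loop.
  call void @llvm.ppc.mtctr.i64(i64 %n)
  br label %loop
loop:
  ; Each call decrements CTR and tests it; duplicating the call
  ; (e.g. via loop rotation) would decrement CTR twice per
  ; iteration, which is exactly what the CHECK-NOT above guards.
  %nonzero = call i1 @llvm.ppc.is.decremented.ctr.nonzero()
  br i1 %nonzero, label %loop, label %exit
exit:
  ret void
}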


@@ -10,16 +10,10 @@ define i64 @test_add_i64(i64 %arg1, i64 %arg2) {
;
; X32-LABEL: test_add_i64:
; X32: # %bb.0:
; X32-NEXT: pushl %ebp
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: .cfi_offset %ebp, -8
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: .cfi_def_cfa_register %ebp
; X32-NEXT: movl 16(%ebp), %eax
; X32-NEXT: movl 20(%ebp), %edx
; X32-NEXT: addl 8(%ebp), %eax
; X32-NEXT: adcl 12(%ebp), %edx
; X32-NEXT: popl %ebp
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; X32-NEXT: retl
%ret = add i64 %arg1, %arg2
ret i64 %ret


@@ -37,6 +37,8 @@
; CHECK-NEXT: X86 PIC Global Base Reg Initialization
; CHECK-NEXT: Expand ISel Pseudo-instructions
; CHECK-NEXT: Local Stack Slot Allocation
; CHECK-NEXT: MachineDominator Tree Construction
; CHECK-NEXT: X86 EFLAGS copy lowering
; CHECK-NEXT: X86 WinAlloca Expander
; CHECK-NEXT: Eliminate PHI nodes for register allocation
; CHECK-NEXT: Two-Address instruction pass


@@ -1,37 +0,0 @@
; RUN: llc < %s -verify-machineinstrs -mcpu=generic -mtriple=x86_64-linux | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.7.0"
; In the code below we need to copy the EFLAGS because of scheduling constraints.
; When copying the EFLAGS we need to write to the stack with push/pop. This forces
; us to emit the prolog.
; CHECK: main
; CHECK: subq{{.*}}rsp
; CHECK: ret
define i32 @main(i32 %arg, i8** %arg1) nounwind {
bb:
%tmp = alloca i32, align 4 ; [#uses=3 type=i32*]
%tmp2 = alloca i32, align 4 ; [#uses=3 type=i32*]
%tmp3 = alloca i32 ; [#uses=1 type=i32*]
store volatile i32 1, i32* %tmp, align 4
store volatile i32 1, i32* %tmp2, align 4
br label %bb4
bb4: ; preds = %bb4, %bb
%tmp6 = load volatile i32, i32* %tmp2, align 4 ; [#uses=1 type=i32]
%tmp7 = add i32 %tmp6, -1 ; [#uses=2 type=i32]
store volatile i32 %tmp7, i32* %tmp2, align 4
%tmp8 = icmp eq i32 %tmp7, 0 ; [#uses=1 type=i1]
%tmp9 = load volatile i32, i32* %tmp ; [#uses=1 type=i32]
%tmp10 = add i32 %tmp9, -1 ; [#uses=1 type=i32]
store volatile i32 %tmp10, i32* %tmp3
br i1 %tmp8, label %bb11, label %bb4
bb11: ; preds = %bb4
%tmp12 = load volatile i32, i32* %tmp, align 4 ; [#uses=1 type=i32]
ret i32 %tmp12
}


@@ -1,100 +1,110 @@
; RUN: llc -mtriple=i386-linux-gnu %s -o - | FileCheck %s -check-prefix=i386
; RUN: llc -mtriple=i386-linux-gnu -pre-RA-sched=fast %s -o - | FileCheck %s -check-prefix=i386f
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=i386-linux-gnu -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=32-ALL,32-GOOD-RA
; RUN: llc -mtriple=i386-linux-gnu -verify-machineinstrs -pre-RA-sched=fast %s -o - | FileCheck %s --check-prefixes=32-ALL,32-FAST-RA
; RUN: llc -mtriple=x86_64-linux-gnu %s -o - | FileCheck %s -check-prefix=x8664
; RUN: llc -mtriple=x86_64-linux-gnu -pre-RA-sched=fast %s -o - | FileCheck %s -check-prefix=x8664
; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sahf %s -o - | FileCheck %s -check-prefix=x8664-sahf
; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sahf -pre-RA-sched=fast %s -o - | FileCheck %s -check-prefix=x8664-sahf
; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=corei7 %s -o - | FileCheck %s -check-prefix=x8664-sahf
; TODO: Reenable verify-machineinstr once the if (!AXDead) // FIXME
; in X86InstrInfo::copyPhysReg() is resolved.
; RUN: llc -mtriple=x86_64-linux-gnu -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=64-ALL,64-GOOD-RA
; RUN: llc -mtriple=x86_64-linux-gnu -verify-machineinstrs -pre-RA-sched=fast %s -o - | FileCheck %s --check-prefixes=64-ALL,64-FAST-RA
; RUN: llc -mtriple=x86_64-linux-gnu -verify-machineinstrs -mattr=+sahf %s -o - | FileCheck %s --check-prefixes=64-ALL,64-GOOD-RA-SAHF
; RUN: llc -mtriple=x86_64-linux-gnu -verify-machineinstrs -mattr=+sahf -pre-RA-sched=fast %s -o - | FileCheck %s --check-prefixes=64-ALL,64-FAST-RA-SAHF
; RUN: llc -mtriple=x86_64-linux-gnu -verify-machineinstrs -mcpu=corei7 %s -o - | FileCheck %s --check-prefixes=64-ALL,64-GOOD-RA-SAHF
declare i32 @foo()
declare i32 @bar(i64)
define i64 @test_intervening_call(i64* %foo, i64 %bar, i64 %baz) {
; i386-LABEL: test_intervening_call:
; i386: cmpxchg8b
; i386-NEXT: pushl %eax
; i386-NEXT: seto %al
; i386-NEXT: lahf
; i386-NEXT: movl %eax, [[FLAGS:%.*]]
; i386-NEXT: popl %eax
; i386-NEXT: subl $8, %esp
; i386-NEXT: pushl %edx
; i386-NEXT: pushl %eax
; i386-NEXT: calll bar
; i386-NEXT: addl $16, %esp
; i386-NEXT: movl [[FLAGS]], %eax
; i386-NEXT: addb $127, %al
; i386-NEXT: sahf
; i386-NEXT: jne
; In the following case we get a long chain of EFLAGS save/restore due to
; a sequence of:
; In the following case when using fast scheduling we get a long chain of
; EFLAGS save/restore due to a sequence of:
; cmpxchg8b (implicit-def eflags)
; eax = copy eflags
; adjcallstackdown32
; ...
; use of eax
; During PEI the adjcallstackdown32 is replaced with the subl which
; clobbers eflags, effectively interfering in the liveness interval.
; Is this a case we care about? Maybe not, considering this issue
; only happens when the fast pre-regalloc scheduler is enforced. A more
; performant scheduler would move the adjcallstackdown32 out of the
; eflags liveness interval.
; i386f-LABEL: test_intervening_call:
; i386f: cmpxchg8b
; i386f-NEXT: pushl %eax
; i386f-NEXT: seto %al
; i386f-NEXT: lahf
; i386f-NEXT: movl %eax, [[FLAGS:%.*]]
; i386f-NEXT: popl %eax
; i386f-NEXT: subl $8, %esp
; i386f-NEXT: pushl %eax
; i386f-NEXT: movl %ecx, %eax
; i386f-NEXT: addb $127, %al
; i386f-NEXT: sahf
; i386f-NEXT: popl %eax
; i386f-NEXT: pushl %eax
; i386f-NEXT: seto %al
; i386f-NEXT: lahf
; i386f-NEXT: movl %eax, %esi
; i386f-NEXT: popl %eax
; i386f-NEXT: pushl %edx
; i386f-NEXT: pushl %eax
; i386f-NEXT: calll bar
; i386f-NEXT: addl $16, %esp
; i386f-NEXT: movl %esi, %eax
; i386f-NEXT: addb $127, %al
; x8664-LABEL: test_intervening_call:
; x8664: cmpxchgq
; x8664: pushfq
; x8664-NEXT: popq [[FLAGS:%.*]]
; x8664-NEXT: movq %rax, %rdi
; x8664-NEXT: callq bar
; x8664-NEXT: pushq [[FLAGS]]
; x8664-NEXT: popfq
; x8664-NEXT: jne
; x8664-sahf-LABEL: test_intervening_call:
; x8664-sahf: cmpxchgq
; x8664-sahf: pushq %rax
; x8664-sahf-NEXT: seto %al
; x8664-sahf-NEXT: lahf
; x8664-sahf-NEXT: movq %rax, [[FLAGS:%.*]]
; x8664-sahf-NEXT: popq %rax
; x8664-sahf-NEXT: movq %rax, %rdi
; x8664-sahf-NEXT: callq bar
; RAX is dead, no need to push and pop it.
; x8664-sahf-NEXT: movq [[FLAGS]], %rax
; x8664-sahf-NEXT: addb $127, %al
; x8664-sahf-NEXT: sahf
; x8664-sahf-NEXT: jne
; clobbers eflags, effectively interfering in the liveness interval. However,
; we then promote these copies into independent conditions in GPRs that avoids
; repeated saving and restoring logic and can be trivially managed by the
; register allocator.
define i64 @test_intervening_call(i64* %foo, i64 %bar, i64 %baz) nounwind {
; 32-GOOD-RA-LABEL: test_intervening_call:
; 32-GOOD-RA: # %bb.0: # %entry
; 32-GOOD-RA-NEXT: pushl %ebx
; 32-GOOD-RA-NEXT: pushl %esi
; 32-GOOD-RA-NEXT: pushl %eax
; 32-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %eax
; 32-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %edx
; 32-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %ebx
; 32-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %ecx
; 32-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %esi
; 32-GOOD-RA-NEXT: lock cmpxchg8b (%esi)
; 32-GOOD-RA-NEXT: setne %bl
; 32-GOOD-RA-NEXT: subl $8, %esp
; 32-GOOD-RA-NEXT: pushl %edx
; 32-GOOD-RA-NEXT: pushl %eax
; 32-GOOD-RA-NEXT: calll bar
; 32-GOOD-RA-NEXT: addl $16, %esp
; 32-GOOD-RA-NEXT: testb %bl, %bl
; 32-GOOD-RA-NEXT: jne .LBB0_3
; 32-GOOD-RA-NEXT: # %bb.1: # %t
; 32-GOOD-RA-NEXT: movl $42, %eax
; 32-GOOD-RA-NEXT: jmp .LBB0_2
; 32-GOOD-RA-NEXT: .LBB0_3: # %f
; 32-GOOD-RA-NEXT: xorl %eax, %eax
; 32-GOOD-RA-NEXT: .LBB0_2: # %t
; 32-GOOD-RA-NEXT: xorl %edx, %edx
; 32-GOOD-RA-NEXT: addl $4, %esp
; 32-GOOD-RA-NEXT: popl %esi
; 32-GOOD-RA-NEXT: popl %ebx
; 32-GOOD-RA-NEXT: retl
;
; 32-FAST-RA-LABEL: test_intervening_call:
; 32-FAST-RA: # %bb.0: # %entry
; 32-FAST-RA-NEXT: pushl %ebx
; 32-FAST-RA-NEXT: pushl %esi
; 32-FAST-RA-NEXT: pushl %eax
; 32-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %esi
; 32-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %ebx
; 32-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %ecx
; 32-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %eax
; 32-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %edx
; 32-FAST-RA-NEXT: lock cmpxchg8b (%esi)
; 32-FAST-RA-NEXT: setne %bl
; 32-FAST-RA-NEXT: subl $8, %esp
; 32-FAST-RA-NEXT: pushl %edx
; 32-FAST-RA-NEXT: pushl %eax
; 32-FAST-RA-NEXT: calll bar
; 32-FAST-RA-NEXT: addl $16, %esp
; 32-FAST-RA-NEXT: testb %bl, %bl
; 32-FAST-RA-NEXT: jne .LBB0_3
; 32-FAST-RA-NEXT: # %bb.1: # %t
; 32-FAST-RA-NEXT: movl $42, %eax
; 32-FAST-RA-NEXT: jmp .LBB0_2
; 32-FAST-RA-NEXT: .LBB0_3: # %f
; 32-FAST-RA-NEXT: xorl %eax, %eax
; 32-FAST-RA-NEXT: .LBB0_2: # %t
; 32-FAST-RA-NEXT: xorl %edx, %edx
; 32-FAST-RA-NEXT: addl $4, %esp
; 32-FAST-RA-NEXT: popl %esi
; 32-FAST-RA-NEXT: popl %ebx
; 32-FAST-RA-NEXT: retl
;
; 64-ALL-LABEL: test_intervening_call:
; 64-ALL: # %bb.0: # %entry
; 64-ALL-NEXT: pushq %rbx
; 64-ALL-NEXT: movq %rsi, %rax
; 64-ALL-NEXT: lock cmpxchgq %rdx, (%rdi)
; 64-ALL-NEXT: setne %bl
; 64-ALL-NEXT: movq %rax, %rdi
; 64-ALL-NEXT: callq bar
; 64-ALL-NEXT: testb %bl, %bl
; 64-ALL-NEXT: jne .LBB0_2
; 64-ALL-NEXT: # %bb.1: # %t
; 64-ALL-NEXT: movl $42, %eax
; 64-ALL-NEXT: popq %rbx
; 64-ALL-NEXT: retq
; 64-ALL-NEXT: .LBB0_2: # %f
; 64-ALL-NEXT: xorl %eax, %eax
; 64-ALL-NEXT: popq %rbx
; 64-ALL-NEXT: retq
entry:
%cx = cmpxchg i64* %foo, i64 %bar, i64 %baz seq_cst seq_cst
%v = extractvalue { i64, i1 } %cx, 0
%p = extractvalue { i64, i1 } %cx, 1
@@ -109,23 +119,62 @@ f:
}
; Interesting in producing a clobber without any function calls.
define i32 @test_control_flow(i32* %p, i32 %i, i32 %j) {
; i386-LABEL: test_control_flow:
; i386: cmpxchg
; i386-NEXT: jne
; i386f-LABEL: test_control_flow:
; i386f: cmpxchg
; i386f-NEXT: jne
; x8664-LABEL: test_control_flow:
; x8664: cmpxchg
; x8664-NEXT: jne
; x8664-sahf-LABEL: test_control_flow:
; x8664-sahf: cmpxchg
; x8664-sahf-NEXT: jne
define i32 @test_control_flow(i32* %p, i32 %i, i32 %j) nounwind {
; 32-ALL-LABEL: test_control_flow:
; 32-ALL: # %bb.0: # %entry
; 32-ALL-NEXT: movl {{[0-9]+}}(%esp), %eax
; 32-ALL-NEXT: cmpl {{[0-9]+}}(%esp), %eax
; 32-ALL-NEXT: jle .LBB1_6
; 32-ALL-NEXT: # %bb.1: # %loop_start
; 32-ALL-NEXT: movl {{[0-9]+}}(%esp), %ecx
; 32-ALL-NEXT: .p2align 4, 0x90
; 32-ALL-NEXT: .LBB1_2: # %while.condthread-pre-split.i
; 32-ALL-NEXT: # =>This Loop Header: Depth=1
; 32-ALL-NEXT: # Child Loop BB1_3 Depth 2
; 32-ALL-NEXT: movl (%ecx), %edx
; 32-ALL-NEXT: .p2align 4, 0x90
; 32-ALL-NEXT: .LBB1_3: # %while.cond.i
; 32-ALL-NEXT: # Parent Loop BB1_2 Depth=1
; 32-ALL-NEXT: # => This Inner Loop Header: Depth=2
; 32-ALL-NEXT: movl %edx, %eax
; 32-ALL-NEXT: xorl %edx, %edx
; 32-ALL-NEXT: testl %eax, %eax
; 32-ALL-NEXT: je .LBB1_3
; 32-ALL-NEXT: # %bb.4: # %while.body.i
; 32-ALL-NEXT: # in Loop: Header=BB1_2 Depth=1
; 32-ALL-NEXT: lock cmpxchgl %eax, (%ecx)
; 32-ALL-NEXT: jne .LBB1_2
; 32-ALL-NEXT: # %bb.5:
; 32-ALL-NEXT: xorl %eax, %eax
; 32-ALL-NEXT: .LBB1_6: # %cond.end
; 32-ALL-NEXT: retl
;
; 64-ALL-LABEL: test_control_flow:
; 64-ALL: # %bb.0: # %entry
; 64-ALL-NEXT: cmpl %edx, %esi
; 64-ALL-NEXT: jle .LBB1_5
; 64-ALL-NEXT: .p2align 4, 0x90
; 64-ALL-NEXT: .LBB1_1: # %while.condthread-pre-split.i
; 64-ALL-NEXT: # =>This Loop Header: Depth=1
; 64-ALL-NEXT: # Child Loop BB1_2 Depth 2
; 64-ALL-NEXT: movl (%rdi), %ecx
; 64-ALL-NEXT: .p2align 4, 0x90
; 64-ALL-NEXT: .LBB1_2: # %while.cond.i
; 64-ALL-NEXT: # Parent Loop BB1_1 Depth=1
; 64-ALL-NEXT: # => This Inner Loop Header: Depth=2
; 64-ALL-NEXT: movl %ecx, %eax
; 64-ALL-NEXT: xorl %ecx, %ecx
; 64-ALL-NEXT: testl %eax, %eax
; 64-ALL-NEXT: je .LBB1_2
; 64-ALL-NEXT: # %bb.3: # %while.body.i
; 64-ALL-NEXT: # in Loop: Header=BB1_1 Depth=1
; 64-ALL-NEXT: lock cmpxchgl %eax, (%rdi)
; 64-ALL-NEXT: jne .LBB1_1
; 64-ALL-NEXT: # %bb.4:
; 64-ALL-NEXT: xorl %esi, %esi
; 64-ALL-NEXT: .LBB1_5: # %cond.end
; 64-ALL-NEXT: movl %esi, %eax
; 64-ALL-NEXT: retq
entry:
%cmp = icmp sgt i32 %i, %j
br i1 %cmp, label %loop_start, label %cond.end
@@ -158,52 +207,68 @@ cond.end:
; This one is an interesting case because CMOV doesn't have a chain
; operand. Naive attempts to limit cmpxchg EFLAGS use are likely to fail here.
define i32 @test_feed_cmov(i32* %addr, i32 %desired, i32 %new) {
; i386-LABEL: test_feed_cmov:
; i386: cmpxchgl
; i386-NEXT: seto %al
; i386-NEXT: lahf
; i386-NEXT: movl %eax, [[FLAGS:%.*]]
; i386-NEXT: calll foo
; i386-NEXT: pushl %eax
; i386-NEXT: movl [[FLAGS]], %eax
; i386-NEXT: addb $127, %al
; i386-NEXT: sahf
; i386-NEXT: popl %eax
; i386f-LABEL: test_feed_cmov:
; i386f: cmpxchgl
; i386f-NEXT: seto %al
; i386f-NEXT: lahf
; i386f-NEXT: movl %eax, [[FLAGS:%.*]]
; i386f-NEXT: calll foo
; i386f-NEXT: pushl %eax
; i386f-NEXT: movl [[FLAGS]], %eax
; i386f-NEXT: addb $127, %al
; i386f-NEXT: sahf
; i386f-NEXT: popl %eax
; x8664-LABEL: test_feed_cmov:
; x8664: cmpxchg
; x8664: pushfq
; x8664-NEXT: popq [[FLAGS:%.*]]
; x8664-NEXT: callq foo
; x8664-NEXT: pushq [[FLAGS]]
; x8664-NEXT: popfq
; x8664-sahf-LABEL: test_feed_cmov:
; x8664-sahf: cmpxchgl
; RAX is dead, do not push or pop it.
; x8664-sahf-NEXT: seto %al
; x8664-sahf-NEXT: lahf
; x8664-sahf-NEXT: movq %rax, [[FLAGS:%.*]]
; x8664-sahf-NEXT: callq foo
; x8664-sahf-NEXT: pushq %rax
; x8664-sahf-NEXT: movq [[FLAGS]], %rax
; x8664-sahf-NEXT: addb $127, %al
; x8664-sahf-NEXT: sahf
; x8664-sahf-NEXT: popq %rax
define i32 @test_feed_cmov(i32* %addr, i32 %desired, i32 %new) nounwind {
; 32-GOOD-RA-LABEL: test_feed_cmov:
; 32-GOOD-RA: # %bb.0: # %entry
; 32-GOOD-RA-NEXT: pushl %ebx
; 32-GOOD-RA-NEXT: pushl %esi
; 32-GOOD-RA-NEXT: pushl %eax
; 32-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %eax
; 32-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %esi
; 32-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %ecx
; 32-GOOD-RA-NEXT: lock cmpxchgl %esi, (%ecx)
; 32-GOOD-RA-NEXT: sete %bl
; 32-GOOD-RA-NEXT: calll foo
; 32-GOOD-RA-NEXT: testb %bl, %bl
; 32-GOOD-RA-NEXT: jne .LBB2_2
; 32-GOOD-RA-NEXT: # %bb.1: # %entry
; 32-GOOD-RA-NEXT: movl %eax, %esi
; 32-GOOD-RA-NEXT: .LBB2_2: # %entry
; 32-GOOD-RA-NEXT: movl %esi, %eax
; 32-GOOD-RA-NEXT: addl $4, %esp
; 32-GOOD-RA-NEXT: popl %esi
; 32-GOOD-RA-NEXT: popl %ebx
; 32-GOOD-RA-NEXT: retl
;
; 32-FAST-RA-LABEL: test_feed_cmov:
; 32-FAST-RA: # %bb.0: # %entry
; 32-FAST-RA-NEXT: pushl %ebx
; 32-FAST-RA-NEXT: pushl %esi
; 32-FAST-RA-NEXT: pushl %eax
; 32-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %ecx
; 32-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %esi
; 32-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %eax
; 32-FAST-RA-NEXT: lock cmpxchgl %esi, (%ecx)
; 32-FAST-RA-NEXT: sete %bl
; 32-FAST-RA-NEXT: calll foo
; 32-FAST-RA-NEXT: testb %bl, %bl
; 32-FAST-RA-NEXT: jne .LBB2_2
; 32-FAST-RA-NEXT: # %bb.1: # %entry
; 32-FAST-RA-NEXT: movl %eax, %esi
; 32-FAST-RA-NEXT: .LBB2_2: # %entry
; 32-FAST-RA-NEXT: movl %esi, %eax
; 32-FAST-RA-NEXT: addl $4, %esp
; 32-FAST-RA-NEXT: popl %esi
; 32-FAST-RA-NEXT: popl %ebx
; 32-FAST-RA-NEXT: retl
;
; 64-ALL-LABEL: test_feed_cmov:
; 64-ALL: # %bb.0: # %entry
; 64-ALL-NEXT: pushq %rbp
; 64-ALL-NEXT: pushq %rbx
; 64-ALL-NEXT: pushq %rax
; 64-ALL-NEXT: movl %edx, %ebx
; 64-ALL-NEXT: movl %esi, %eax
; 64-ALL-NEXT: lock cmpxchgl %ebx, (%rdi)
; 64-ALL-NEXT: sete %bpl
; 64-ALL-NEXT: callq foo
; 64-ALL-NEXT: testb %bpl, %bpl
; 64-ALL-NEXT: cmovnel %ebx, %eax
; 64-ALL-NEXT: addq $8, %rsp
; 64-ALL-NEXT: popq %rbx
; 64-ALL-NEXT: popq %rbp
; 64-ALL-NEXT: retq
entry:
%res = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst
%success = extractvalue { i32, i1 } %res, 1


@@ -1,6 +1,8 @@
; RUN: llc -o - %s | FileCheck %s
; This tests for the problem originally reported in http://llvm.org/PR25951
target triple = "i686-unknown-linux-gnu"
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -o - -mtriple=i686-unknown-unknown %s | FileCheck %s --check-prefixes=ALL,X32
; RUN: llc -o - -mtriple=x86_64-unknown-unknown %s | FileCheck %s --check-prefixes=ALL,X64
;
; Test patterns that require preserving and restoring flags.
@b = common global i8 0, align 1
@c = common global i32 0, align 4
@@ -8,13 +10,61 @@ target triple = "i686-unknown-linux-gnu"
@d = common global i8 0, align 1
@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
; CHECK-LABEL: func:
; This tests whether eax is properly saved/restored around the
; lahf/sahf instruction sequences. We make the memory ops volatile to
; prevent their reordering and thereby avoid spills.
declare void @external(i32)
define i32 @func() {
; A test that re-uses flags in interesting ways due to volatile accesses.
; Specifically, the first increment's flags are reused for the branch despite
; being clobbered by the second increment.
define i32 @test1() nounwind {
; X32-LABEL: test1:
; X32: # %bb.0: # %entry
; X32-NEXT: movb b, %cl
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: incb %al
; X32-NEXT: movb %al, b
; X32-NEXT: incl c
; X32-NEXT: sete %dl
; X32-NEXT: movb a, %ah
; X32-NEXT: movb %ah, %ch
; X32-NEXT: incb %ch
; X32-NEXT: cmpb %cl, %ah
; X32-NEXT: sete d
; X32-NEXT: movb %ch, a
; X32-NEXT: testb %dl, %dl
; X32-NEXT: jne .LBB0_2
; X32-NEXT: # %bb.1: # %if.then
; X32-NEXT: movsbl %al, %eax
; X32-NEXT: pushl %eax
; X32-NEXT: calll external
; X32-NEXT: addl $4, %esp
; X32-NEXT: .LBB0_2: # %if.end
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: retl
;
; X64-LABEL: test1:
; X64: # %bb.0: # %entry
; X64-NEXT: movb {{.*}}(%rip), %dil
; X64-NEXT: movl %edi, %eax
; X64-NEXT: incb %al
; X64-NEXT: movb %al, {{.*}}(%rip)
; X64-NEXT: incl {{.*}}(%rip)
; X64-NEXT: sete %sil
; X64-NEXT: movb {{.*}}(%rip), %cl
; X64-NEXT: movl %ecx, %edx
; X64-NEXT: incb %dl
; X64-NEXT: cmpb %dil, %cl
; X64-NEXT: sete {{.*}}(%rip)
; X64-NEXT: movb %dl, {{.*}}(%rip)
; X64-NEXT: testb %sil, %sil
; X64-NEXT: jne .LBB0_2
; X64-NEXT: # %bb.1: # %if.then
; X64-NEXT: pushq %rax
; X64-NEXT: movsbl %al, %edi
; X64-NEXT: callq external
; X64-NEXT: addq $8, %rsp
; X64-NEXT: .LBB0_2: # %if.end
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: retq
entry:
%bval = load i8, i8* @b
%inc = add i8 %bval, 1
@@ -25,33 +75,290 @@ entry:
%aval = load volatile i8, i8* @a
%inc2 = add i8 %aval, 1
store volatile i8 %inc2, i8* @a
; Copy flags produced by the incb of %inc1 to a register, need to save+restore
; eax around it. The flags will be reused by %tobool.
; CHECK: pushl %eax
; CHECK: seto %al
; CHECK: lahf
; CHECK: movl %eax, [[REG:%[a-z]+]]
; CHECK: popl %eax
%cmp = icmp eq i8 %aval, %bval
%conv5 = zext i1 %cmp to i8
store i8 %conv5, i8* @d
%tobool = icmp eq i32 %inc1, 0
; We restore flags with an 'addb, sahf' sequence, need to save+restore eax
; around it.
; CHECK: pushl %eax
; CHECK: movl [[REG]], %eax
; CHECK: addb $127, %al
; CHECK: sahf
; CHECK: popl %eax
br i1 %tobool, label %if.end, label %if.then
if.then:
%conv6 = sext i8 %inc to i32
%call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %conv6)
call void @external(i32 %conv6)
br label %if.end
if.end:
ret i32 0
}
declare i32 @printf(i8* nocapture readonly, ...)
; Preserve increment flags across a call.
define i32 @test2(i32* %ptr) nounwind {
; X32-LABEL: test2:
; X32: # %bb.0: # %entry
; X32-NEXT: pushl %ebx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: incl (%eax)
; X32-NEXT: setne %bl
; X32-NEXT: pushl $42
; X32-NEXT: calll external
; X32-NEXT: addl $4, %esp
; X32-NEXT: testb %bl, %bl
; X32-NEXT: je .LBB1_1
; X32-NEXT: # %bb.2: # %else
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: popl %ebx
; X32-NEXT: retl
; X32-NEXT: .LBB1_1: # %then
; X32-NEXT: movl $64, %eax
; X32-NEXT: popl %ebx
; X32-NEXT: retl
;
; X64-LABEL: test2:
; X64: # %bb.0: # %entry
; X64-NEXT: pushq %rbx
; X64-NEXT: incl (%rdi)
; X64-NEXT: setne %bl
; X64-NEXT: movl $42, %edi
; X64-NEXT: callq external
; X64-NEXT: testb %bl, %bl
; X64-NEXT: je .LBB1_1
; X64-NEXT: # %bb.2: # %else
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: popq %rbx
; X64-NEXT: retq
; X64-NEXT: .LBB1_1: # %then
; X64-NEXT: movl $64, %eax
; X64-NEXT: popq %rbx
; X64-NEXT: retq
entry:
%val = load i32, i32* %ptr
%inc = add i32 %val, 1
store i32 %inc, i32* %ptr
%cmp = icmp eq i32 %inc, 0
call void @external(i32 42)
br i1 %cmp, label %then, label %else
then:
ret i32 64
else:
ret i32 0
}
declare void @external_a()
declare void @external_b()
; This lowers to a conditional tail call instead of a conditional branch. This
; is tricky because we can only do this from a leaf function, and so we have to
; use volatile stores similar to test1 to force the save and restore of
; a condition without calling another function. We then set up subsequent calls
; in tail position.
define void @test_tail_call(i32* %ptr) nounwind optsize {
; X32-LABEL: test_tail_call:
; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: incl (%eax)
; X32-NEXT: setne %al
; X32-NEXT: incb a
; X32-NEXT: sete d
; X32-NEXT: testb %al, %al
; X32-NEXT: jne external_b # TAILCALL
; X32-NEXT: # %bb.1: # %then
; X32-NEXT: jmp external_a # TAILCALL
;
; X64-LABEL: test_tail_call:
; X64: # %bb.0: # %entry
; X64-NEXT: incl (%rdi)
; X64-NEXT: setne %al
; X64-NEXT: incb {{.*}}(%rip)
; X64-NEXT: sete {{.*}}(%rip)
; X64-NEXT: testb %al, %al
; X64-NEXT: jne external_b # TAILCALL
; X64-NEXT: # %bb.1: # %then
; X64-NEXT: jmp external_a # TAILCALL
entry:
%val = load i32, i32* %ptr
%inc = add i32 %val, 1
store i32 %inc, i32* %ptr
%cmp = icmp eq i32 %inc, 0
%aval = load volatile i8, i8* @a
%inc2 = add i8 %aval, 1
store volatile i8 %inc2, i8* @a
%cmp2 = icmp eq i8 %inc2, 0
%conv5 = zext i1 %cmp2 to i8
store i8 %conv5, i8* @d
br i1 %cmp, label %then, label %else
then:
tail call void @external_a()
ret void
else:
tail call void @external_b()
ret void
}
; Test a function that gets special select lowering into CFG with copied EFLAGS
; threaded across the CFG. This requires our EFLAGS copy rewriting to handle
; cross-block rewrites in at least some narrow cases.
define void @PR37100(i8 %arg1, i16 %arg2, i64 %arg3, i8 %arg4, i8* %ptr1, i32* %ptr2) {
; X32-LABEL: PR37100:
; X32: # %bb.0: # %bb
; X32-NEXT: pushl %ebp
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: pushl %ebx
; X32-NEXT: .cfi_def_cfa_offset 12
; X32-NEXT: pushl %edi
; X32-NEXT: .cfi_def_cfa_offset 16
; X32-NEXT: pushl %esi
; X32-NEXT: .cfi_def_cfa_offset 20
; X32-NEXT: .cfi_offset %esi, -20
; X32-NEXT: .cfi_offset %edi, -16
; X32-NEXT: .cfi_offset %ebx, -12
; X32-NEXT: .cfi_offset %ebp, -8
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X32-NEXT: movb {{[0-9]+}}(%esp), %ch
; X32-NEXT: movb {{[0-9]+}}(%esp), %cl
; X32-NEXT: jmp .LBB3_1
; X32-NEXT: .p2align 4, 0x90
; X32-NEXT: .LBB3_5: # %bb1
; X32-NEXT: # in Loop: Header=BB3_1 Depth=1
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: xorl %edx, %edx
; X32-NEXT: idivl %ebp
; X32-NEXT: .LBB3_1: # %bb1
; X32-NEXT: # =>This Inner Loop Header: Depth=1
; X32-NEXT: movsbl %cl, %eax
; X32-NEXT: movl %eax, %edx
; X32-NEXT: sarl $31, %edx
; X32-NEXT: cmpl %eax, %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: sbbl %edx, %eax
; X32-NEXT: setl %al
; X32-NEXT: setl %dl
; X32-NEXT: movzbl %dl, %ebp
; X32-NEXT: negl %ebp
; X32-NEXT: testb %al, %al
; X32-NEXT: jne .LBB3_3
; X32-NEXT: # %bb.2: # %bb1
; X32-NEXT: # in Loop: Header=BB3_1 Depth=1
; X32-NEXT: movb %ch, %cl
; X32-NEXT: .LBB3_3: # %bb1
; X32-NEXT: # in Loop: Header=BB3_1 Depth=1
; X32-NEXT: movb %cl, (%ebx)
; X32-NEXT: movl (%edi), %edx
; X32-NEXT: testb %al, %al
; X32-NEXT: jne .LBB3_5
; X32-NEXT: # %bb.4: # %bb1
; X32-NEXT: # in Loop: Header=BB3_1 Depth=1
; X32-NEXT: movl %edx, %ebp
; X32-NEXT: jmp .LBB3_5
;
; X64-LABEL: PR37100:
; X64: # %bb.0: # %bb
; X64-NEXT: movq %rdx, %r10
; X64-NEXT: jmp .LBB3_1
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB3_5: # %bb1
; X64-NEXT: # in Loop: Header=BB3_1 Depth=1
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: idivl %esi
; X64-NEXT: .LBB3_1: # %bb1
; X64-NEXT: # =>This Inner Loop Header: Depth=1
; X64-NEXT: movsbq %dil, %rax
; X64-NEXT: xorl %esi, %esi
; X64-NEXT: cmpq %rax, %r10
; X64-NEXT: setl %sil
; X64-NEXT: negl %esi
; X64-NEXT: cmpq %rax, %r10
; X64-NEXT: jl .LBB3_3
; X64-NEXT: # %bb.2: # %bb1
; X64-NEXT: # in Loop: Header=BB3_1 Depth=1
; X64-NEXT: movl %ecx, %edi
; X64-NEXT: .LBB3_3: # %bb1
; X64-NEXT: # in Loop: Header=BB3_1 Depth=1
; X64-NEXT: movb %dil, (%r8)
; X64-NEXT: jl .LBB3_5
; X64-NEXT: # %bb.4: # %bb1
; X64-NEXT: # in Loop: Header=BB3_1 Depth=1
; X64-NEXT: movl (%r9), %esi
; X64-NEXT: jmp .LBB3_5
bb:
br label %bb1
bb1:
%tmp = phi i8 [ %tmp8, %bb1 ], [ %arg1, %bb ]
%tmp2 = phi i16 [ %tmp12, %bb1 ], [ %arg2, %bb ]
%tmp3 = icmp sgt i16 %tmp2, 7
%tmp4 = select i1 %tmp3, i16 %tmp2, i16 7
%tmp5 = sext i8 %tmp to i64
%tmp6 = icmp slt i64 %arg3, %tmp5
%tmp7 = sext i1 %tmp6 to i32
%tmp8 = select i1 %tmp6, i8 %tmp, i8 %arg4
store volatile i8 %tmp8, i8* %ptr1
%tmp9 = load volatile i32, i32* %ptr2
%tmp10 = select i1 %tmp6, i32 %tmp7, i32 %tmp9
%tmp11 = srem i32 0, %tmp10
%tmp12 = trunc i32 %tmp11 to i16
br label %bb1
}
; Use a particular instruction pattern in order to lower to the post-RA pseudo
; used to lower SETB into an SBB pattern in order to make sure that kind of
; usage of a copied EFLAGS continues to work.
define void @PR37431(i32* %arg1, i8* %arg2, i8* %arg3) {
; X32-LABEL: PR37431:
; X32: # %bb.0: # %entry
; X32-NEXT: pushl %esi
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: .cfi_offset %esi, -8
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl (%eax), %eax
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: sarl $31, %ecx
; X32-NEXT: cmpl %eax, %eax
; X32-NEXT: sbbl %ecx, %eax
; X32-NEXT: setb %al
; X32-NEXT: sbbb %cl, %cl
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movb %cl, (%edx)
; X32-NEXT: movzbl %al, %eax
; X32-NEXT: xorl %ecx, %ecx
; X32-NEXT: subl %eax, %ecx
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: xorl %edx, %edx
; X32-NEXT: idivl %ecx
; X32-NEXT: movb %dl, (%esi)
; X32-NEXT: popl %esi
; X32-NEXT: retl
;
; X64-LABEL: PR37431:
; X64: # %bb.0: # %entry
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movslq (%rdi), %rax
; X64-NEXT: cmpq %rax, %rax
; X64-NEXT: sbbb %dl, %dl
; X64-NEXT: cmpq %rax, %rax
; X64-NEXT: movb %dl, (%rsi)
; X64-NEXT: sbbl %esi, %esi
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: idivl %esi
; X64-NEXT: movb %dl, (%rcx)
; X64-NEXT: retq
entry:
%tmp = load i32, i32* %arg1
%tmp1 = sext i32 %tmp to i64
%tmp2 = icmp ugt i64 %tmp1, undef
%tmp3 = zext i1 %tmp2 to i8
%tmp4 = sub i8 0, %tmp3
store i8 %tmp4, i8* %arg2
%tmp5 = sext i8 %tmp4 to i32
%tmp6 = srem i32 0, %tmp5
%tmp7 = trunc i32 %tmp6 to i8
store i8 %tmp7, i8* %arg3
ret void
}


@@ -0,0 +1,24 @@
; RUN: llc -mcpu=skylake-avx512 -mtriple=x86_64-unknown-linux-gnu %s -o - | FileCheck %s
; Check that the X86 Domain Reassignment pass doesn't drop IMPLICIT_DEF nodes,
; which would later cause crashes (e.g. in LiveVariables) - see PR37430
define void @domain_reassignment_implicit_def(i1 %cond, i8 *%mem, float %arg) {
; CHECK: vxorps %xmm1, %xmm1, %xmm1
; CHECK: vcmpneqss %xmm1, %xmm0, %k0
; CHECK: kmovb %k0, (%rsi)
top:
br i1 %cond, label %L19, label %L15
L15: ; preds = %top
%tmp47 = fcmp une float 0.000000e+00, %arg
%tmp48 = zext i1 %tmp47 to i8
br label %L21
L19: ; preds = %top
br label %L21
L21: ; preds = %L19, %L15
%.sroa.0.0 = phi i8 [ undef, %L19 ], [ %tmp48, %L15 ]
store i8 %.sroa.0.0, i8* %mem, align 1
ret void
}


@@ -0,0 +1,37 @@
; RUN: llc -mcpu=skylake-avx512 -mtriple=x86_64-unknown-linux-gnu %s -o - | FileCheck %s
; RUN: llc -mcpu=skylake-avx512 -mtriple=x86_64-unknown-linux-gnu %s -o - | llvm-mc -triple=x86_64-unknown-linux-gnu -mcpu=skylake-avx512
; Check that the X86 domain reassignment pass doesn't introduce an illegal
; test instruction. See PR37396
define void @japi1_foo2_34617() {
pass2:
br label %if5
L174:
%tmp = icmp sgt <2 x i64> undef, zeroinitializer
%tmp1 = icmp sle <2 x i64> undef, undef
%tmp2 = and <2 x i1> %tmp, %tmp1
%tmp3 = extractelement <2 x i1> %tmp2, i32 0
%tmp4 = extractelement <2 x i1> %tmp2, i32 1
%tmp106 = and i1 %tmp4, %tmp3
%tmp107 = zext i1 %tmp106 to i8
%tmp108 = and i8 %tmp122, %tmp107
%tmp109 = icmp eq i8 %tmp108, 0
; CHECK-NOT: testb {{%k[0-7]}}
br i1 %tmp109, label %L188, label %L190
if5:
%b.055 = phi i8 [ 1, %pass2 ], [ %tmp122, %if5 ]
%tmp118 = icmp sgt i64 undef, 0
%tmp119 = icmp sle i64 undef, undef
%tmp120 = and i1 %tmp118, %tmp119
%tmp121 = zext i1 %tmp120 to i8
%tmp122 = and i8 %b.055, %tmp121
br i1 undef, label %L174, label %if5
L188:
unreachable
L190:
ret void
}


@@ -1,64 +0,0 @@
# RUN: llc -run-pass postrapseudos -mtriple=i386-apple-macosx -o - %s | FileCheck %s
# Verify that we correctly save and restore eax when copying eflags,
# even when only a smaller alias of eax is used. We used to check only
# eax and not its aliases.
# PR27624.
--- |
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
define void @foo() {
entry:
br label %false
false:
ret void
}
...
---
name: foo
tracksRegLiveness: true
liveins:
- { reg: '%edi' }
body: |
bb.0.entry:
liveins: %edi
NOOP implicit-def %al
; The bug was triggered only when LivePhysReg is used, which
; happens only when the heuristic for the liveness computation
; failed. The liveness computation heuristic looks at 10 instructions
; before and after the copy. Make sure we do not reach the definition of
; AL in 10 instructions, otherwise the heuristic will see that it is live.
NOOP
NOOP
NOOP
NOOP
NOOP
NOOP
NOOP
NOOP
NOOP
NOOP
NOOP
NOOP
NOOP
; Save AL.
; CHECK: PUSH32r killed %eax
; Copy edi into EFLAGS
; CHECK-NEXT: %eax = MOV32rr %edi
; CHECK-NEXT: %al = ADD8ri %al, 127, implicit-def %eflags
; CHECK-NEXT: SAHF implicit-def %eflags, implicit %ah
%eflags = COPY %edi
; Restore AL.
; CHECK-NEXT: %eax = POP32r
bb.1.false:
liveins: %al
NOOP implicit %al
RETQ
...

Some files were not shown because too many files have changed in this diff.