- 
                Notifications
    You must be signed in to change notification settings 
- Fork 13.9k
Description
I have experienced more than 300% longer execution time in specific functions that use loops along with indexing into slices. After several hours of work with a profiler, I was able to isolate the problem from a 60K-line codebase into the following short program:
use std::cmp;
/// ANDs each element of `destination` with the result of `in1[i] < in2[i]`.
///
/// Only the first `n` elements are processed, where `n` is the smallest of the
/// three slice lengths; any elements of `destination` beyond `n` are left
/// untouched. Note that, despite the `gt` in the name, the comparison written
/// below is `in1[i] < in2[i]`.
///
/// `#[inline(never)]` asks the compiler not to inline this function into its
/// callers.
#[inline(never)]
pub fn cmp_gt_and(in1: &[i16], in2: &[i16], destination: &mut [bool]) {
    // Common length: the shortest of the three slices.
    let max = cmp::min(cmp::min(in1.len(), in2.len()), destination.len());
    // Re-slice all three to exactly `max` elements before entering the loop.
    let src1 = &in1[0..max];
    let src2 = &in2[0..max];
    let dst = &mut destination[0..max];
    for index in 0..max {
        // `&=` on bool: an element stays `true` only if it was already `true`
        // and the comparison holds.
        dst[index] &= src1[index] < src2[index];
    }
}
/// Benchmark driver: calls `cmp_gt_and` 100 million times on small fixed
/// inputs and then exits with a status computed from `b`.
fn main() {
    let len = 100;
    // `a` holds 1, 2, ..., 99 (the range `1..len` excludes `len`).
    let a: Vec<i16> = (1..len).collect();
    // `b` holds `len - x` for the same range, i.e. 99, 98, ..., 1.
    let b: Vec<i16> = (1..len).map(|x| len - x).collect();
    // `result` has `len` (100) elements, one more than `a` and `b`, and starts
    // out all `false`.
    let mut result = vec![false; len as usize];
    for _ in 0..100*1000*1000 {
        cmp_gt_and(&a, &b, &mut result);
    }
    // The exit status is the sum of `b`'s elements; `result` is not read after
    // the loop.
    let sum: i32 = b.into_iter().map(|x| x as i32).sum();
    std::process::exit(sum);
}Code is also available in the following repository
With Rust 1.44.0, I observe an execution time of around 1.7 seconds:
$ rustc --version --verbose; cargo build --release;time ./target/release/rust-perf-demo
rustc 1.44.0 (49cae5576 2020-06-01)
binary: rustc
commit-hash: 49cae55760da0a43428eba73abcb659bb70cf2e4
commit-date: 2020-06-01
host: x86_64-unknown-linux-gnu
release: 1.44.0
LLVM version: 9.0
    Finished release [optimized] target(s) in 0.04s
real    0m1.681s
user    0m1.676s
sys     0m0.004s
Rust versions 1.45.2 and the current stable 1.46.0 produce binaries that run for more than 6.0 seconds with the same source code:
$ rustc --version --verbose; cargo build --release;time ./target/release/rust-perf-demo
rustc 1.45.2 (d3fb005a3 2020-07-31)
binary: rustc
commit-hash: d3fb005a39e62501b8b0b356166e515ae24e2e54
commit-date: 2020-07-31
host: x86_64-unknown-linux-gnu
release: 1.45.2
LLVM version: 10.0
    Finished release [optimized] target(s) in 0.05s
real    0m6.643s
user    0m6.643s
sys     0m0.000s
$ rustc --version --verbose; cargo build --release;time ./target/release/rust-perf-demo
rustc 1.46.0 (04488afe3 2020-08-24)
binary: rustc
commit-hash: 04488afe34512aa4c33566eb16d8c912a3ae04f9
commit-date: 2020-08-24
host: x86_64-unknown-linux-gnu
release: 1.46.0
LLVM version: 10.0
    Finished release [optimized] target(s) in 0.00s
real    0m6.642s
user    0m6.606s
sys     0m0.012s
I use several more functions like cmp_gt_and in the core of image processing software, and they also show a similar performance drop.
Has anything changed significantly between rustc 1.44 and 1.45 that may have impacted the code this much? Maybe LLVM 10 behaves differently? Any thoughts on how to modify the code to gain the performance back with the current compiler, or other things to try in order to clarify the problem? For some time, I can stick with 1.44 to keep the performance.
The function cmp_gt_and also appears to have much shorter assembly code with rustc 1.44 than with its successors; I am not sure if that is the reason for the performance drop, though:
Rustc 1.44.0
# cmp_gt_and(in1: &[i16], in2: &[i16], destination: &mut [bool]) as compiled
# by rustc 1.44.0 (x86-64, Intel operand order).
#
# Register roles on entry, inferred from how the code below uses them and from
# the argument order of the Rust signature (NOTE(review): inferred, confirm):
#   rdi = in1 pointer          rsi = in1 length
#   rdx = in2 pointer          rcx = in2 length
#   r8  = destination pointer  r9  = destination length
#
# Shape: compute the common length, bounds-check it, then run a 16-elements-
# per-iteration SSE2 loop followed by a scalar loop for the remainder.
_ZN14rust_perf_demo10cmp_gt_and17h7a0e5899b697a5eaE:
	.cfi_startproc
	# Moves rsp down by 8 bytes; the pushed value is not used (rax is simply
	# popped again before ret).
	push	rax
	.cfi_def_cfa_offset 16
	# r10 = in1 pointer, so rdi is free to hold the common length.
	mov	r10, rdi
	# rdi = min(in1 length, in2 length), unsigned.
	cmp	rsi, rcx
	mov	rdi, rsi
	cmova	rdi, rcx
	# rdi = min(rdi, destination length): this is `max` in the Rust source.
	cmp	rdi, r9
	cmova	rdi, r9
	# Bounds checks for the two input sub-slices: go to a failure path if
	# max is above in1's length (rsi) or in2's length (rcx).
	cmp	rdi, rsi
	ja	.LBB8_10
	cmp	rdi, rcx
	ja	.LBB8_11
	# max == 0: nothing to do.
	test	rdi, rdi
	je	.LBB8_9
	# Fewer than 16 elements: skip the vector loop and run the scalar loop
	# from index 0 (rcx).
	cmp	rdi, 15
	ja	.LBB8_5
	xor	ecx, ecx
	jmp	.LBB8_8
.LBB8_5:
	# rcx = max rounded down to a multiple of 16 (vector loop bound),
	# rsi = element index starting at 0, xmm0 = all-zero constant.
	mov	rcx, rdi
	and	rcx, -16
	xor	esi, esi
	pxor	xmm0, xmm0
	.p2align	4, 0x90
.LBB8_6:
	# Vector loop, 16 elements per iteration; everything stays in xmm registers.
	# Load 2 x 8 words from in1 and from in2. pcmpgtw leaves 0xFFFF in every
	# word lane where in2 > in1 (signed), i.e. where in1 < in2.
	movdqu	xmm1, xmmword ptr [r10 + 2*rsi]
	movdqu	xmm2, xmmword ptr [r10 + 2*rsi + 16]
	movdqu	xmm3, xmmword ptr [rdx + 2*rsi]
	pcmpgtw	xmm3, xmm1
	movdqu	xmm1, xmmword ptr [rdx + 2*rsi + 16]
	pcmpgtw	xmm1, xmm2
	# Load 2 x 8 destination bytes and widen each byte to a word by
	# interleaving with zero.
	movq	xmm2, qword ptr [r8 + rsi]
	punpcklbw	xmm2, xmm0
	movq	xmm4, qword ptr [r8 + rsi + 8]
	punpcklbw	xmm4, xmm0
	# pcmpeqw marks lanes whose destination value is 0; pandn then computes
	# (NOT that mask) AND (compare mask): lanes that were non-zero and
	# satisfy in1 < in2.
	pcmpeqw	xmm2, xmm0
	pandn	xmm2, xmm3
	pcmpeqw	xmm4, xmm0
	pandn	xmm4, xmm1
	# Shift each lane right by 15 so 0xFFFF becomes 1, then pack the words
	# back down to bytes.
	psrlw	xmm2, 15
	packuswb	xmm2, xmm0
	psrlw	xmm4, 15
	packuswb	xmm4, xmm0
	# Store the 16 result bytes back to the destination.
	movq	qword ptr [r8 + rsi], xmm2
	movq	qword ptr [r8 + rsi + 8], xmm4
	# Advance by 16 elements and loop until the rounded-down bound is reached.
	add	rsi, 16
	cmp	rcx, rsi
	jne	.LBB8_6
	# If max was a multiple of 16 there is no remainder to process.
	cmp	rdi, rcx
	je	.LBB8_9
	.p2align	4, 0x90
.LBB8_8:
	# Scalar loop, one element per iteration, index in rcx:
	# sil = (in1[i] < in2[i]) signed, al = (destination[i] != 0),
	# destination[i] = al & sil.
	movzx	esi, word ptr [r10 + 2*rcx]
	cmp	si, word ptr [rdx + 2*rcx]
	setl	sil
	cmp	byte ptr [r8 + rcx], 0
	setne	al
	and	al, sil
	mov	byte ptr [r8 + rcx], al
	add	rcx, 1
	cmp	rcx, rdi
	jb	.LBB8_8
.LBB8_9:
	# Normal return: undo the 8-byte adjustment from the entry push.
	pop	rax
	.cfi_def_cfa_offset 8
	ret
.LBB8_10:
	# Failure path for max > in1 length. rdi (max) and rsi (in1 length) are
	# still in place; rdx receives the address of .L__unnamed_2
	# (NOTE(review): presumably the panic location data, not shown here).
	.cfi_def_cfa_offset 16
	lea	rdx, [rip + .L__unnamed_2]
	call	qword ptr [rip + _ZN4core5slice20slice_index_len_fail17he661f5dd1689ef3bE@GOTPCREL]
	# Traps if the call above ever returns.
	ud2
.LBB8_11:
	# Failure path for max > in2 length: same call, with rsi = in2 length.
	lea	rdx, [rip + .L__unnamed_3]
	mov	rsi, rcx
	call	qword ptr [rip + _ZN4core5slice20slice_index_len_fail17he661f5dd1689ef3bE@GOTPCREL]
	ud2
.Lfunc_end8:
	.size	_ZN14rust_perf_demo10cmp_gt_and17h7a0e5899b697a5eaE, .Lfunc_end8-_ZN14rust_perf_demo10cmp_gt_and17h7a0e5899b697a5eaE
	.cfi_endproc
	.section	.rodata.cst16,"aM",@progbits,16
	.p2align	4
Rustc 1.45.2
# cmp_gt_and(in1: &[i16], in2: &[i16], destination: &mut [bool]) as compiled
# by rustc 1.45.2 (x86-64, Intel operand order).
#
# Register roles on entry, inferred from how the code below uses them and from
# the argument order of the Rust signature (NOTE(review): inferred, confirm):
#   rdi = in1 pointer          rsi = in1 length
#   rdx = in2 pointer          rcx = in2 length
#   r8  = destination pointer  r9  = destination length
#
# The prologue, bounds checks, scalar loop and failure paths have the same
# shape as in the 1.44.0 listing. The difference is inside the vector loop:
# the two combined masks are written to a 32-byte stack area and converted to
# result bytes one lane at a time with scalar instructions, where the 1.44.0
# listing uses psrlw/packuswb.
# NOTE(review): this is likely where the extra run time comes from, but that is
# not measured in this report.
_ZN14rust_perf_demo10cmp_gt_and17h39391ca255a87f0fE:
	.cfi_startproc
	# rbx is saved because the vector loop uses ebx/bl as scratch.
	push	rbx
	.cfi_def_cfa_offset 16
	# 32 bytes of stack scratch: two 16-byte slots at [rsp] and [rsp + 16].
	sub	rsp, 32
	.cfi_def_cfa_offset 48
	.cfi_offset rbx, -16
	# r10 = in1 pointer, so rdi is free to hold the common length.
	mov	r10, rdi
	# rdi = min(in1 length, in2 length), unsigned.
	cmp	rsi, rcx
	mov	rdi, rsi
	cmova	rdi, rcx
	# rdi = min(rdi, destination length): this is `max` in the Rust source.
	cmp	rdi, r9
	cmova	rdi, r9
	# Bounds checks for the two input sub-slices: go to a failure path if
	# max is above in1's length (rsi) or in2's length (rcx).
	cmp	rdi, rsi
	ja	.LBB8_10
	cmp	rdi, rcx
	ja	.LBB8_11
	# max == 0: nothing to do.
	test	rdi, rdi
	je	.LBB8_9
	# Fewer than 16 elements: skip the vector loop and run the scalar loop
	# from index 0 (rsi).
	cmp	rdi, 15
	ja	.LBB8_5
	xor	esi, esi
	jmp	.LBB8_8
.LBB8_5:
	# rsi = max rounded down to a multiple of 16 (vector loop bound),
	# r11 = element index starting at 0,
	# xmm0 = all-zero constant, xmm1 = all-ones constant.
	mov	rsi, rdi
	and	rsi, -16
	xor	r11d, r11d
	pxor	xmm0, xmm0
	pcmpeqd	xmm1, xmm1
	.p2align	4, 0x90
.LBB8_6:
	# Vector loop, 16 elements per iteration.
	# Load 2 x 8 words from in1 and from in2. pcmpgtw leaves 0xFFFF in every
	# word lane where in2 > in1 (signed): xmm4 = lanes 0-7, xmm2 = lanes 8-15.
	movdqu	xmm2, xmmword ptr [r10 + 2*r11]
	movdqu	xmm3, xmmword ptr [r10 + 2*r11 + 16]
	movdqu	xmm4, xmmword ptr [rdx + 2*r11]
	pcmpgtw	xmm4, xmm2
	movdqu	xmm2, xmmword ptr [rdx + 2*r11 + 16]
	pcmpgtw	xmm2, xmm3
	# Load 2 x 8 destination bytes.
	movq	xmm5, qword ptr [r8 + r11]
	movq	xmm3, qword ptr [r8 + r11 + 8]
	# First 8 elements: byte mask for destination == 0, inverted with the
	# all-ones constant to get destination != 0, widened to words, then
	# ANDed with the compare mask.
	pcmpeqb	xmm5, xmm0
	pxor	xmm5, xmm1
	punpcklbw	xmm5, xmm0
	pand	xmm5, xmm4
	# Same for the second 8 elements.
	pcmpeqb	xmm3, xmm0
	pxor	xmm3, xmm1
	punpcklbw	xmm3, xmm0
	pand	xmm3, xmm2
	# Write the first combined mask to the stack and rebuild its 8 result
	# bytes with scalar code: the low byte of each word lane (offsets 0, 2,
	# ..., 14) is loaded, reduced to bit 0, and pairs of lanes are merged
	# into one 16-bit value (even lane | odd lane << 8).
	movdqa	xmmword ptr [rsp], xmm5
	# eax = lane 2 | lane 3 << 8
	movzx	eax, byte ptr [rsp + 4]
	and	al, 1
	movzx	r9d, al
	movzx	eax, byte ptr [rsp + 6]
	and	al, 1
	movzx	eax, al
	shl	eax, 8
	or	eax, r9d
	# ecx = lane 0 | lane 1 << 8
	movzx	ecx, byte ptr [rsp]
	movzx	r9d, byte ptr [rsp + 2]
	and	cl, 1
	movzx	ebx, cl
	and	r9b, 1
	movzx	ecx, r9b
	shl	ecx, 8
	or	ecx, ebx
	# Start reassembling in xmm2: word 0 from ecx, word 1 from eax.
	movd	xmm2, ecx
	pinsrw	xmm2, eax, 1
	# ecx = lane 4 | lane 5 << 8, inserted as word 2.
	movzx	eax, byte ptr [rsp + 8]
	and	al, 1
	movzx	eax, al
	movzx	ecx, byte ptr [rsp + 10]
	and	cl, 1
	movzx	ecx, cl
	shl	ecx, 8
	or	ecx, eax
	pinsrw	xmm2, ecx, 2
	# ecx = lane 6 | lane 7 << 8, inserted as word 3.
	movzx	eax, byte ptr [rsp + 12]
	and	al, 1
	movzx	eax, al
	movzx	ecx, byte ptr [rsp + 14]
	and	cl, 1
	movzx	ecx, cl
	shl	ecx, 8
	or	ecx, eax
	pinsrw	xmm2, ecx, 3
	# Same procedure for the second combined mask, using the slot at
	# [rsp + 16] and reassembling into xmm3.
	movdqa	xmmword ptr [rsp + 16], xmm3
	# ecx = lane 2 | lane 3 << 8
	movzx	eax, byte ptr [rsp + 20]
	and	al, 1
	movzx	eax, al
	movzx	ecx, byte ptr [rsp + 22]
	and	cl, 1
	movzx	ecx, cl
	shl	ecx, 8
	or	ecx, eax
	# ebx = lane 0 | lane 1 << 8
	movzx	eax, byte ptr [rsp + 16]
	movzx	ebx, byte ptr [rsp + 18]
	and	al, 1
	movzx	eax, al
	and	bl, 1
	movzx	ebx, bl
	shl	ebx, 8
	or	ebx, eax
	movd	xmm3, ebx
	pinsrw	xmm3, ecx, 1
	# ecx = lane 4 | lane 5 << 8, inserted as word 2.
	movzx	eax, byte ptr [rsp + 24]
	and	al, 1
	movzx	eax, al
	movzx	ecx, byte ptr [rsp + 26]
	and	cl, 1
	movzx	ecx, cl
	shl	ecx, 8
	or	ecx, eax
	pinsrw	xmm3, ecx, 2
	# ecx = lane 6 | lane 7 << 8, inserted as word 3.
	movzx	eax, byte ptr [rsp + 28]
	and	al, 1
	movzx	eax, al
	movzx	ecx, byte ptr [rsp + 30]
	and	cl, 1
	movzx	ecx, cl
	shl	ecx, 8
	or	ecx, eax
	pinsrw	xmm3, ecx, 3
	# Store the 16 result bytes back to the destination.
	movq	qword ptr [r8 + r11], xmm2
	movq	qword ptr [r8 + r11 + 8], xmm3
	# Advance by 16 elements and loop until the rounded-down bound is reached.
	add	r11, 16
	cmp	rsi, r11
	jne	.LBB8_6
	# If max was a multiple of 16 there is no remainder to process.
	cmp	rdi, rsi
	je	.LBB8_9
	.p2align	4, 0x90
.LBB8_8:
	# Scalar loop, one element per iteration, index in rsi:
	# al = (in1[i] < in2[i]) signed, cl = (destination[i] != 0),
	# destination[i] = cl & al.
	movzx	eax, word ptr [r10 + 2*rsi]
	cmp	ax, word ptr [rdx + 2*rsi]
	setl	al
	cmp	byte ptr [r8 + rsi], 0
	setne	cl
	and	cl, al
	mov	byte ptr [r8 + rsi], cl
	add	rsi, 1
	cmp	rsi, rdi
	jb	.LBB8_8
.LBB8_9:
	# Normal return: release the 32-byte scratch area and restore rbx.
	add	rsp, 32
	.cfi_def_cfa_offset 16
	pop	rbx
	.cfi_def_cfa_offset 8
	ret
.LBB8_10:
	# Failure path for max > in1 length. rdi (max) and rsi (in1 length) are
	# still in place; rdx receives the address of .L__unnamed_2
	# (NOTE(review): presumably the panic location data, not shown here).
	.cfi_def_cfa_offset 48
	lea	rdx, [rip + .L__unnamed_2]
	call	qword ptr [rip + _ZN4core5slice20slice_index_len_fail17h9254c9506d16ff21E@GOTPCREL]
	# Traps if the call above ever returns.
	ud2
.LBB8_11:
	# Failure path for max > in2 length: same call, with rsi = in2 length.
	lea	rdx, [rip + .L__unnamed_3]
	mov	rsi, rcx
	call	qword ptr [rip + _ZN4core5slice20slice_index_len_fail17h9254c9506d16ff21E@GOTPCREL]
	ud2
.Lfunc_end8:
	.size	_ZN14rust_perf_demo10cmp_gt_and17h39391ca255a87f0fE, .Lfunc_end8-_ZN14rust_perf_demo10cmp_gt_and17h39391ca255a87f0fE
	.cfi_endproc
	.section	.rodata.cst16,"aM",@progbits,16
	.p2align	4