Rust assembly generation: Mapping a bool vector to string slice vector

We have already looked at assembly code generated for a vector iteration. We will build on that knowledge to understand the assembly code generated when mapping a Rust vector to a string slice vector.

Example code: Map a vector of bools to a vector of string slices with static lifetime

The following code shows two functions:

/// Convert a vector of type A into a vector of type B. The user must provide a
/// closure that maps type A to type B. This is a generic function and
/// does not generate any assembly code.
pub fn convert<A, B> (v: Vec<A>, f: impl Fn(A) -> B) -> Vec<B> {
    v.into_iter().map(f).collect()
}

/// Convert a vector of bools into a vector of string slices with a static lifetime.
/// This function uses the convert generic function to perform the conversion.
/// This is a concrete function and generates assembly code.
pub fn convert_bool_vec_to_static_str_vec(v: Vec<bool>) -> Vec<& 'static str> {
    convert(v, |n| if n {"true"} else {"false"})
}

Visualizing the input and output vectors

Let's understand the input and output vectors of the convert_bool_vec_to_static_str_vec function. This will help us understand the assembly code generated.

The input vector passed to the convert_bool_vec_to_static_str_vec function is a vector of bools. The memory organization of this vector is shown below. As discussed in the vector iteration article, the memory organization of a vector is as follows:

Bool vector

The output vector of the convert_bool_vec_to_static_str_vec function is a vector of string slices. The memory organization of this vector is shown below.

String slice vector

String-slice vector generation overview

The following figure gives an overview of the generated assembly code for the convert_bool_vec_to_static_str_vec function. A few key points to note here are:

Overflow and memory allocation failure handling

Length-based optimization

The generated code takes the following input vector length-based decisions:

Preparing the string slice

Cleaning up the input vector on exit from the function

The compiler generates a call to __rust_dealloc to free the input vector. Note that the function owns the input vector, so it is responsible for freeing up the memory allocated for the input vector array just before the function returns.

Flow chart describing the generated assembly code

String slice vector generation

Annotated assembly code for the convert_bool_vec_to_static_str_vec function

The generated assembly code has been annotated to help understand the mapping from Rust code.

; rsi points to the input vector v
; rdi points to the output vector 
; Note: The caller is responsible for reserving the space for the output vector struct.
;       The called function allocates the data array on the heap.

example::convert_bool_vec_to_static_str_vec:
        push    rbp                         ; Save rbp
        push    r15                         ; Save r15
        push    r14                         ; Save r14
        push    r13                         ; Save r13
        push    r12                         ; Save r12
        push    rbx                         ; Save rbx
        sub     rsp, 40                     ; Allocate space on the stack for the local variables

        ; ⭐ Setting up registers based on the input vector v 

        mov     rbp, rdi                    ; Set rbp to the address of the output vector
        mov     r12, qword ptr [rsi]        ; Load the address of the input vector v's data array into r12
        mov     r10, qword ptr [rsi + 8]    ; Load the capacity of the input vector v into r10
        mov     r14, qword ptr [rsi + 16]   ; Load the length of the input vector v into r14
        lea     r15, [r12 + r14]            ; Load the address that points after the last element.

        ; ⭐ Save the input vector v's data array address, capacity, and length into the stack

        mov     qword ptr [rsp + 8], r12    ; Store the address of vector v's data array into the stack
        mov     qword ptr [rsp + 16], r10   ; Store the capacity of vector v into the stack frame
        mov     qword ptr [rsp + 24], r12   ; Store the address of vector v's data array into the stack
        mov     qword ptr [rsp + 32], r15   ; Store the address of the end of vector v's data array into the stack

        ; 0️⃣ Check if the length of vector v is 0 

        test    r14, r14                    ; Test if the length of vector v is 0
        je      .LBB1_1                     ; Jump if the length of vector v is 0

        ; 🔴 Check if this iteration will result in a capacity overflow

        xor     ebx, ebx                    ; Set ebx to 0
        mov     rax, r14                    ; Set rax to the length of vector v
        shr     rax, 59                     ; The length of an array should not exceed 2^59 bits
                                            ; Move the uppermost nibble of the length into lower bits
        sete    al                          ; Set al to 1 if the uppermost nibble in the address is 0
        jne     .LBB1_3                     ; Jump if the length of vector v is greater than 2^59 bytes

        ; 0️⃣ No capacity overflows. Now check if the length is 0.

        ; The length of the vector v is less than 2^59 bytes                          
        mov     qword ptr [rsp], r10        ; Store the capacity of vector v into the stack
        mov     r13, r14                    ; Set r13 to the length of vector v
        shl     r13, 4                      ; Convert the length of vector v to the number of bytes
        mov     bl, al                      ; Set bl to a1 (which happens to be 1 if the length of vector v is less than 2^59 bytes)
        shl     rbx, 3                      ; Shift rbx to the left by 3 bits so that rbx is set 8.
        test    r13, r13                    ; Check if the number of bytes needed for the output vector is 0
        je      .LBB1_6                     ; Jump to .LBB1_6 if no bytes are needed

        ; ⭐ Vector is greater than 0 bytes. Allocate memory for the output vector

        mov     rdi, r13                    ; Set rdi to the number of bytes needed for the output vector
        mov     rsi, rbx                    ; Set rsi to 8 (the alignment of the output vector)
        call    qword ptr [rip + __rust_alloc@GOTPCREL] ; Call __rust_alloc to allocate memory for the output vector
        test    rax, rax                    ; Test if the allocation failed
        je      .LBB1_9                     ; Jump to .LBB1_9 if rax is 0 (allocation failed)
.LBB1_10:

        ; 📦 Allocation succeeded. Now set the address of the output vector's data array to rax 
        ; ⭐ Check if the length of vector v is odd. Handle one entry before entering the even loop

        mov     qword ptr [rbp], rax        ; Set the address of the output vector to rax
        mov     qword ptr [rbp + 8], r14    ; Set the capacity of the output vector to the length of vector v
        mov     qword ptr [rbp + 16], 0     ; Set the length of the output vector to 0
        test    r14b, 1                     ; Test if the length of vector v is odd
        mov     r10, qword ptr [rsp]        ; Set r10 to the length of vector v
        jne     .LBB1_12                    ; Jump to .LBB1_12 if the length of vector v is odd

        ; ⭐ The capacity of vector v is even.

        mov     rdx, r12                    ; Set rdx to the address of vector v's data array
        lea     r8, [rbp + 16]              ; Set r8 to the address of the output vector's length

        ; ⭐ Check if the vector length is 1

        cmp     r14, 1                      ; Compare the length of vector v to 1
        jne     .LBB1_14                    ; Jump to .LBB1_14 if the length of vector v is not 1       
        jmp     .LBB1_17                    ; Jump to .LBB1_17 if the length of vector v is 1
                                            ; The length of vector v is 1 so the loop is not entered.
.LBB1_1:

        ; 0️⃣ The length of vector v is 0.

        mov     qword ptr [rbp], 8          ; Set the address of the output vector to 8
        lea     r8, [rbp + 16]              ; Set r8 to the address of the length of the output vector
        xorps   xmm0, xmm0                  ; Set xmm0 to 0
        movups  xmmword ptr [rbp + 8], xmm0 ; Set the length and capacity of the output vector to 0
        jmp     .LBB1_17                    ; Jump to .LBB1_17

.LBB1_12:

        ; ⭐ The length of the output vector v is odd so one loop entry is performed
        ; before entering the loop that handles even number of entries.

        movzx   ecx, byte ptr [r12]         ; Set ecx to the value of the first element of input vector v's bool data array
        and     ecx, 1                      ; Mask of the upper bits of ecx to ensure that the value is 0 or 1

        ; The length and pointer need to be initialized in the output vector entry
        ; The compiler uses an XOR trick to determine the length of the slice.
        ; If 1 is XORed 5, the result is 4 (length of "true").
        ; If 0 is XORed 5, the result is 5 (length of "false").
        ; The compiler uses this trick to avoid a branch.

        mov     rsi, rcx                    ; Set rsi to ecx
        xor     rsi, 5                      ; Set rsi to 5 if rsi is 0, otherwise set rsi to 4
        test    rcx, rcx                    ; Test if the first bit of vector v's data array is 0
        lea     rcx, [rip + .L__unnamed_1]  ; Set rcx to the address of "false"
        lea     rdi, [rip + .L__unnamed_2]  ; Set rdi to the address of "true"

        ; The following conditional move is used to make the rdi point to "true" if the bool value is 1.
        ; Otherwise, rdi points to "false".
        cmove   rdi, rcx                    ; rdi points to "false" if input element's entry is 0, otherwise rdi points to "true"
        lea     rdx, [r12 + 1]              ; Set rcx to the address of the next element of input vector v
        mov     qword ptr [rax], rdi        ; Set the address of the mapped string in the string slice
        mov     qword ptr [rax + 8], rsi    ; Set the length of the string slice
        add     rax, 16                     ; Point to the next slice in the output data array
        lea     r8, [rbp + 16]              ; Set r8 to the address of the output vector's data array
        cmp     r14, 1                      ; Compare the length of vector v to 1
        je      .LBB1_17                    ; Jump to .LBB1_17 if the length of vector v is 1 (we are done)
.LBB1_14:

        ; ⭐ Begin loop for even number of entries (two entries are processed in each iteration)

        mov     r9, rbp                     ; Set r9 to the address of the output vector
        lea     rsi, [rip + .L__unnamed_1]  ; Set rsi to the address of "false"
        lea     rbp, [rip + .L__unnamed_2]  ; Set rbp to the address of "true"
.LBB1_15:
        movzx   ebx, byte ptr [rdx]         ; Set ebx to the first byte of input vector v's data array
        and     ebx, 1                      ; Mask off the upper bits to get the bool value

        ; The length and pointer need to be initialized in the output vector entry
        ; The compiler uses an XOR trick to determine the length of the slice.
        ; If 1 is XORed 5, the result is 4 (length of "true").
        ; If 0 is XORed 5, the result is 5 (length of "false").
        mov     rcx, rbx                    ; Set rcx to the bool value mapped to 0 or 1
        xor     rcx, 5                      ; Set rcx to 5 if rcx is 0, otherwise set rcx to 4    
        test    rbx, rbx                    ; Set the condition code bits based on the content of bool value
        mov     rdi, rbp                    ; Set rdi to point to "true"

        ; ⭐ Processing of the first entry within the iteration

        ; The following conditional move is used to make the rdi point to "false" if the bool value is 0.
        ; Otherwise, rdi points to "true". 
        cmove   rdi, rsi                    ; Set rdi to point to "false" if bool value is 0, otherwise set rdi to point to "true"
        mov     qword ptr [rax], rdi        ; Copy the address of "true" or "false" to the string slice
        mov     qword ptr [rax + 8], rcx    ; Set the length of the string slice

        ; ⭐ Processing of the second entry within the iteration

        movzx   ecx, byte ptr [rdx + 1]     ; Set ecx to the next element of the input vector v's data array
        and     ecx, 1                      ; Mask off the upper bits to convert bool into 0 or 1
        mov     rdi, rcx                    ; Move the bool's value to rdi
        xor     rdi, 5                      ; Convert the bool's value into the length using the XOR trick
        test    rcx, rcx                    ; Set the condition codes based on the bool's value
        mov     rcx, rbp                    ; rcx points to "true"
        cmove   rcx, rsi                    ; rcx points to "false" if the bool's value is 0
        mov     qword ptr [rax + 16], rcx   ; Copy the address of "true" or "false" to the string slice
        mov     qword ptr [rax + 24], rdi   ; Set the length of the string slice
        add     rdx, 2                      ; Move in the input vector past the two entries we have just processed.
        add     rax, 32                     ; Move the output vector pointer past the two entries we have just processed.
        cmp     rdx, r15                    ; Check if we have processed all the entries of the input vector
        jne     .LBB1_15                    ; Jump to .LBB1_15 if more entries need to be processed

        ; ⭐ End loop for even number of entries

        mov     rbp, r9                     ; Set rbp to the address of the output vector
.LBB1_17:

        ; ⭐ Returning when the length is 0 or 1

        mov     qword ptr [r8], r14         ; Set the length of the output vector to the length of vector v
        test    r10, r10                    ; Test if the length of vector v is 0
        je      .LBB1_19                    ; Jump to .LBB1_19 if the length of vector v is 0
        mov     rdx, r10                    ; Set rdx to the length of vector v
        not     rdx                         ; Set rdx to the bitwise not of rdx
        shr     rdx, 63                     ; Set rdx to 0 if rdx is 0, otherwise set rdx to 1

        ; ♻️ Free the input vector v's buffer as the function owned the vector and
        ; vector v is going out of scope.

        mov     rdi, r12                    ; Set rdi to the address of input vector v's data array
        mov     rsi, r10                    ; Set rsi to the length of input vector v
        call    qword ptr [rip + __rust_dealloc@GOTPCREL]   ; Call __rust_dealloc
.LBB1_19:

        ; ⭐ Return the output vector

        mov     rax, rbp                    ; Set rax to the address of the output vector
        add     rsp, 40                     ; Set rsp to the address of the first byte of the output vector's data array
        pop     rbx                         ; Restore rbx
        pop     r12                         ; Restore r12
        pop     r13                         ; Restore r13
        pop     r14                         ; Restore r14
        pop     r15                         ; Restore r15
        pop     rbp                         ; Restore rbp
        ret
.LBB1_6:

        ; 0️⃣ The following code is generated when the length of vector v is 0

        mov     rax, rbx                    ; Set rax to rbx
        test    rax, rax                    ; Test if rbx is 0
        jne     .LBB1_10                    ; Jump to .LBB1_10 if rbx is not 0

.LBB1_9:

        ; ❌ Memory allocation for the output vector has failed so signal an error

        mov     rdi, r13                    ; Set rdi to r13
        mov     rsi, rbx                    ; Set rsi to rbx
        call    qword ptr [rip + alloc::alloc::handle_alloc_error@GOTPCREL] ; Call alloc::alloc::handle_alloc_error
        jmp     .LBB1_4                     ; Jump to .LBB1_4
.LBB1_3:

        ; 🔴 Capacity overflow has occurred so signal an error

        call    qword ptr [rip + alloc::raw_vec::capacity_overflow@GOTPCREL]    ; Call alloc::raw_vec::capacity_overflow
.LBB1_4:

        ; 💀 Common logic for exception handling

        ud2                                 ; Halt execution
        mov     rbx, rax                    ; Set rbx to rax
        lea     rdi, [rsp + 8]              ; Set rdi to the address of the first byte of the output vector's data array
        call    core::ptr::drop_in_place<core::iter::adapters::map::Map<alloc::vec::into_iter::IntoIter<bool>,example::convert_bool_vec_to_static_str_vec::{{closure}}>> ; Call core::ptr::drop_in_place
        mov     rdi, rbx                    ; Set rdi to rbx
        call    _Unwind_Resume@PLT          ; Call _Unwind_Resume
        ud2                                 ; Halt execution    

.L__unnamed_2:
        .ascii  "true"

.L__unnamed_1:
        .ascii  "false"

DW.ref.rust_eh_personality:
        .quad   rust_eh_personality

Key takeaways

Experiment in the Compiler Explorer

You can experiment with this code in the Compiler Explorer. Here are a few things to try:

pub fn convert_u64_vec_to_bool_vec(v: Vec<u64>) -> Vec<bool> {
    convert(v, |n| n != 0)
}

pub fn convert_f32_f32_vec_to_f32_vec(v: Vec<(f32,f32)>) -> Vec<f32> {
    convert(v, |n| n.0*n.1)
}