From patchwork Sun Jan 16 14:13:05 2022
X-Patchwork-Submitter: Luc Pelletier
X-Patchwork-Id: 105877
X-Patchwork-Delegate: thomas@monjalon.net
From: Luc Pelletier
To: bruce.richardson@intel.com, konstantin.ananyev@intel.com
Cc: dev@dpdk.org, Luc Pelletier, Xiaoyun Li, stable@dpdk.org
Subject: [PATCH v3] eal: fix unaligned loads/stores in rte_memcpy_generic
Date: Sun, 16 Jan 2022 09:13:05 -0500
Message-Id: <20220116141304.474374-1-lucp.at.work@gmail.com>
In-Reply-To: <20220115194102.444140-1-lucp.at.work@gmail.com>
References: <20220115194102.444140-1-lucp.at.work@gmail.com>

Calls to rte_memcpy_generic could result in unaligned loads/stores for
1 < n < 16. This is undefined behavior according to the C standard,
and it gets flagged by the clang undefined behavior sanitizer.

rte_memcpy_generic is called with unaligned src and dst addresses.
When 1 < n < 16, the code would cast both src and dst to a qword,
dword or word pointer, without verifying the alignment of src/dst.
The code was changed to use inline assembly for the load/store
operations. Unaligned load/store operations are permitted in x86/x64
assembly.
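For reviewers, a minimal sketch (not part of the patch; the helper names
are made up) of the access pattern the sanitizer reports, next to a
well-defined alternative:

#include <stdint.h>
#include <string.h>

/* Undefined behavior when dst/src are not 8-byte aligned, even though
 * x86 hardware handles the access; clang's -fsanitize=alignment (part of
 * the UB sanitizer) flags exactly this kind of dereference. */
static void copy8_undefined(void *dst, const void *src)
{
	*(uint64_t *)dst = *(const uint64_t *)src;
}

/* Well defined for any alignment; compilers typically lower a fixed-size
 * memcpy to a single unaligned mov on x86. The patch uses explicit inline
 * assembly instead. */
static void copy8_defined(void *dst, const void *src)
{
	memcpy(dst, src, 8);
}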
Fixes: d35cc1fe6a7a ("eal/x86: revert select optimized memcpy at run-time")
Cc: Xiaoyun Li
Cc: stable@dpdk.org

Signed-off-by: Luc Pelletier
---

Please note that I didn't write the entire function in inline assembly.
The reason I kept the bitwise ANDs as C code is so the optimizer can
remove the branches when n is known at compile time.
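As an illustration of that point (sketch only, not part of the patch):
because the size checks stay in C and the helper is always-inlined, a
call with a constant size lets the optimizer fold the checks and keep
only the asm blocks that are actually needed.

/* Hypothetical caller: for n == 5, (5 & 8) and (5 & 2) are known to be
 * zero, so only the 4-byte and 1-byte copies remain after inlining. */
static inline void copy5(void *dst, const void *src)
{
	rte_mov15_or_less_unaligned(dst, src, 5);
}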
 lib/eal/x86/include/rte_memcpy.h | 134 +++++++++++++++++--------------
 1 file changed, 72 insertions(+), 62 deletions(-)

diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 1b6c6e585f..b99c1b2ca5 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -45,6 +45,75 @@ extern "C" {
 static __rte_always_inline void *
 rte_memcpy(void *dst, const void *src, size_t n);
 
+#if defined(__i386__)
+	#define RTE_ACCUMULATOR_REGISTER_NAME "eax"
+#elif defined(__x86_64__)
+	#define RTE_ACCUMULATOR_REGISTER_NAME "rax"
+#endif
+
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with unaligned src/dst, and n <= 15.
+ */
+static __rte_always_inline void *
+rte_mov15_or_less_unaligned(void *dst, const void *src, size_t n)
+{
+	void *ret = dst;
+	if (n & 8) {
+		asm (
+#if defined(__i386__)
+		"movl (%[src]), %%eax\n"
+		"movl %%eax, (%[dst])\n"
+		"add $4, %[src]\n"
+		"add $4, %[dst]\n"
+		"movl (%[src]), %%eax\n"
+		"movl %%eax, (%[dst])\n"
+		"add $4, %[src]\n"
+		"add $4, %[dst]\n"
+#elif defined(__x86_64__)
+		"movq (%[src]), %%rax\n"
+		"movq %%rax, (%[dst])\n"
+		"add $8, %[src]\n"
+		"add $8, %[dst]\n"
+#else
+	#error Unsupported architecture
+#endif
+		: [dst] "+r" (dst), [src] "+r" (src)
+		:
+		: RTE_ACCUMULATOR_REGISTER_NAME, "memory");
+	}
+	if (n & 4) {
+		asm (
+		"movl (%[src]), %%eax\n"
+		"movl %%eax, (%[dst])\n"
+		"add $4, %[src]\n"
+		"add $4, %[dst]\n"
+		: [dst] "+r" (dst), [src] "+r" (src)
+		:
+		: RTE_ACCUMULATOR_REGISTER_NAME, "memory");
+	}
+	if (n & 2) {
+		asm (
+		"movw (%[src]), %%ax\n"
+		"movw %%ax, (%[dst])\n"
+		"add $2, %[src]\n"
+		"add $2, %[dst]\n"
+		: [dst] "+r" (dst), [src] "+r" (src)
+		:
+		: RTE_ACCUMULATOR_REGISTER_NAME, "memory");
+	}
+	if (n & 1) {
+		asm (
+		"movb (%[src]), %%al\n"
+		"movb %%al, (%[dst])\n"
+		: [dst] "+r" (dst), [src] "+r" (src)
+		:
+		: RTE_ACCUMULATOR_REGISTER_NAME, "memory");
+	}
+	return ret;
+}
+
 #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 
 #define ALIGNMENT_MASK 0x3F
@@ -171,8 +240,6 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
-	uintptr_t dstu = (uintptr_t)dst;
-	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
@@ -181,24 +248,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Copy less than 16 bytes
 	 */
 	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dstu = *(const uint8_t *)srcu;
-			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-			dstu = (uintptr_t)((uint8_t *)dstu + 1);
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dstu = *(const uint16_t *)srcu;
-			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-			dstu = (uintptr_t)((uint16_t *)dstu + 1);
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dstu = *(const uint32_t *)srcu;
-			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-			dstu = (uintptr_t)((uint32_t *)dstu + 1);
-		}
-		if (n & 0x08)
-			*(uint64_t *)dstu = *(const uint64_t *)srcu;
-		return ret;
+		return rte_mov15_or_less_unaligned(dst, src, n);
 	}
 
 	/**
@@ -379,8 +429,6 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
-	uintptr_t dstu = (uintptr_t)dst;
-	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
@@ -389,25 +437,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Copy less than 16 bytes
 	 */
 	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dstu = *(const uint8_t *)srcu;
-			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-			dstu = (uintptr_t)((uint8_t *)dstu + 1);
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dstu = *(const uint16_t *)srcu;
-			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-			dstu = (uintptr_t)((uint16_t *)dstu + 1);
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dstu = *(const uint32_t *)srcu;
-			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-			dstu = (uintptr_t)((uint32_t *)dstu + 1);
-		}
-		if (n & 0x08) {
-			*(uint64_t *)dstu = *(const uint64_t *)srcu;
-		}
-		return ret;
+		return rte_mov15_or_less_unaligned(dst, src, n);
 	}
 
 	/**
@@ -672,8 +702,6 @@ static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
 	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
-	uintptr_t dstu = (uintptr_t)dst;
-	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t srcofs;
@@ -682,25 +710,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Copy less than 16 bytes
 	 */
 	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dstu = *(const uint8_t *)srcu;
-			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-			dstu = (uintptr_t)((uint8_t *)dstu + 1);
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dstu = *(const uint16_t *)srcu;
-			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-			dstu = (uintptr_t)((uint16_t *)dstu + 1);
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dstu = *(const uint32_t *)srcu;
-			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-			dstu = (uintptr_t)((uint32_t *)dstu + 1);
-		}
-		if (n & 0x08) {
-			*(uint64_t *)dstu = *(const uint64_t *)srcu;
-		}
-		return ret;
+		return rte_mov15_or_less_unaligned(dst, src, n);
 	}
 
 	/**