get:
Show a patch.

patch:
Partially update a patch.

put:
Update a patch.

GET /api/patches/133872/?format=api
HTTP 200 OK
Allow: GET, PUT, PATCH, HEAD, OPTIONS
Content-Type: application/json
Vary: Accept

{
    "id": 133872,
    "url": "http://patchwork.dpdk.org/api/patches/133872/?format=api",
    "web_url": "http://patchwork.dpdk.org/project/dpdk/patch/98CBD80474FA8B44BF855DF32C47DC35E9EFD4@smartserver.smartshare.dk/",
    "project": {
        "id": 1,
        "url": "http://patchwork.dpdk.org/api/projects/1/?format=api",
        "name": "DPDK",
        "link_name": "dpdk",
        "list_id": "dev.dpdk.org",
        "list_email": "dev@dpdk.org",
        "web_url": "http://core.dpdk.org",
        "scm_url": "git://dpdk.org/dpdk",
        "webscm_url": "http://git.dpdk.org/dpdk",
        "list_archive_url": "https://inbox.dpdk.org/dev",
        "list_archive_url_format": "https://inbox.dpdk.org/dev/{}",
        "commit_url_format": ""
    },
    "msgid": "<98CBD80474FA8B44BF855DF32C47DC35E9EFD4@smartserver.smartshare.dk>",
    "list_archive_url": "https://inbox.dpdk.org/dev/98CBD80474FA8B44BF855DF32C47DC35E9EFD4@smartserver.smartshare.dk",
    "date": "2023-11-04T17:29:40",
    "name": "[RFC] mempool: CPU cache aligning mempool driver accesses",
    "commit_ref": null,
    "pull_url": null,
    "state": "new",
    "archived": false,
    "hash": "0e79f41a913f94418ba8335498f4a87ff494cfd9",
    "submitter": {
        "id": 591,
        "url": "http://patchwork.dpdk.org/api/people/591/?format=api",
        "name": "Morten Brørup",
        "email": "mb@smartsharesystems.com"
    },
    "delegate": {
        "id": 1,
        "url": "http://patchwork.dpdk.org/api/users/1/?format=api",
        "username": "tmonjalo",
        "first_name": "Thomas",
        "last_name": "Monjalon",
        "email": "thomas@monjalon.net"
    },
    "mbox": "http://patchwork.dpdk.org/project/dpdk/patch/98CBD80474FA8B44BF855DF32C47DC35E9EFD4@smartserver.smartshare.dk/mbox/",
    "series": [
        {
            "id": 30159,
            "url": "http://patchwork.dpdk.org/api/series/30159/?format=api",
            "web_url": "http://patchwork.dpdk.org/project/dpdk/list/?series=30159",
            "date": "2023-11-04T17:29:40",
            "name": "[RFC] mempool: CPU cache aligning mempool driver accesses",
            "version": 1,
            "mbox": "http://patchwork.dpdk.org/series/30159/mbox/"
        }
    ],
    "comments": "http://patchwork.dpdk.org/api/patches/133872/comments/",
    "check": "warning",
    "checks": "http://patchwork.dpdk.org/api/patches/133872/checks/",
    "tags": {},
    "related": [],
    "headers": {
        "Return-Path": "<dev-bounces@dpdk.org>",
        "X-Original-To": "patchwork@inbox.dpdk.org",
        "Delivered-To": "patchwork@inbox.dpdk.org",
        "Received": [
            "from mails.dpdk.org (mails.dpdk.org [217.70.189.124])\n\tby inbox.dpdk.org (Postfix) with ESMTP id 3345143287;\n\tSat,  4 Nov 2023 18:29:44 +0100 (CET)",
            "from mails.dpdk.org (localhost [127.0.0.1])\n\tby mails.dpdk.org (Postfix) with ESMTP id B494F4029B;\n\tSat,  4 Nov 2023 18:29:43 +0100 (CET)",
            "from dkmailrelay1.smartsharesystems.com\n (smartserver.smartsharesystems.com [77.243.40.215])\n by mails.dpdk.org (Postfix) with ESMTP id D84C640282\n for <dev@dpdk.org>; Sat,  4 Nov 2023 18:29:42 +0100 (CET)",
            "from smartserver.smartsharesystems.com\n (smartserver.smartsharesys.local [192.168.4.10])\n by dkmailrelay1.smartsharesystems.com (Postfix) with ESMTP id AB539206C3\n for <dev@dpdk.org>; Sat,  4 Nov 2023 18:29:42 +0100 (CET)"
        ],
        "Content-class": "urn:content-classes:message",
        "MIME-Version": "1.0",
        "Content-Type": "text/plain;\n\tcharset=\"iso-8859-1\"",
        "Content-Transfer-Encoding": "quoted-printable",
        "Subject": "[RFC] mempool: CPU cache aligning mempool driver accesses",
        "X-MimeOLE": "Produced By Microsoft Exchange V6.5",
        "Date": "Sat, 4 Nov 2023 18:29:40 +0100",
        "Message-ID": "<98CBD80474FA8B44BF855DF32C47DC35E9EFD4@smartserver.smartshare.dk>",
        "X-MS-Has-Attach": "",
        "X-MS-TNEF-Correlator": "",
        "Thread-Topic": "[RFC] mempool: CPU cache aligning mempool driver accesses",
        "Thread-Index": "AdoPRH2dqaLCk8BNQ/aikLEUxLBH7w==",
        "From": "=?iso-8859-1?q?Morten_Br=F8rup?= <mb@smartsharesystems.com>",
        "To": "<dev@dpdk.org>",
        "X-BeenThere": "dev@dpdk.org",
        "X-Mailman-Version": "2.1.29",
        "Precedence": "list",
        "List-Id": "DPDK patches and discussions <dev.dpdk.org>",
        "List-Unsubscribe": "<https://mails.dpdk.org/options/dev>,\n <mailto:dev-request@dpdk.org?subject=unsubscribe>",
        "List-Archive": "<http://mails.dpdk.org/archives/dev/>",
        "List-Post": "<mailto:dev@dpdk.org>",
        "List-Help": "<mailto:dev-request@dpdk.org?subject=help>",
        "List-Subscribe": "<https://mails.dpdk.org/listinfo/dev>,\n <mailto:dev-request@dpdk.org?subject=subscribe>",
        "Errors-To": "dev-bounces@dpdk.org"
    },
    "content": "I tried a little experiment, which gave a 25 % improvement in mempool\nperf tests for long bursts (n_get_bulk=32 n_put_bulk=32 n_keep=512\nconstant_n=0) on a Xeon E5-2620 v4 based system.\n\nThis is the concept:\n\nIf all accesses to the mempool driver goes through the mempool cache,\nwe can ensure that these bulk load/stores are always CPU cache aligned,\nby using cache->size when loading/storing to the mempool driver.\n\nFurthermore, it is rumored that most applications use the default\nmempool cache size, so if the driver tests for that specific value,\nit can use rte_memcpy(src,dst,N) with N known at build time, allowing\noptimal performance for copying the array of objects.\n\nUnfortunately, I need to change the flush threshold from 1.5 to 2 to\nbe able to always use cache->size when loading/storing to the mempool\ndriver.\n\nWhat do you think?\n\nPS: If we can't get rid of the mempool cache size threshold factor,\nwe really need to expose it through public APIs. A job for another day.\n\nSigned-off-by: Morten Brørup <mb@smartsharesystems.com>\n---",
    "diff": "diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c\nindex 7a7a9bf6db..b21033209b 100644\n--- a/lib/mempool/rte_mempool.c\n+++ b/lib/mempool/rte_mempool.c\n@@ -48,7 +48,7 @@ static void\n mempool_event_callback_invoke(enum rte_mempool_event event,\n                              struct rte_mempool *mp);\n\n-#define CACHE_FLUSHTHRESH_MULTIPLIER 1.5\n+#define CACHE_FLUSHTHRESH_MULTIPLIER 2\n #define CALC_CACHE_FLUSHTHRESH(c)      \\\n        ((typeof(c))((c) * CACHE_FLUSHTHRESH_MULTIPLIER))\n\ndiff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h\nindex df87cd231e..76efeff59e 100644\n--- a/lib/mempool/rte_mempool.h\n+++ b/lib/mempool/rte_mempool.h\n@@ -1014,7 +1014,7 @@ typedef void (rte_mempool_ctor_t)(struct rte_mempool *, void *);\n  *   If cache_size is non-zero, the rte_mempool library will try to\n  *   limit the accesses to the common lockless pool, by maintaining a\n  *   per-lcore object cache. This argument must be lower or equal to\n- *   RTE_MEMPOOL_CACHE_MAX_SIZE and n / 1.5. It is advised to choose\n+ *   RTE_MEMPOOL_CACHE_MAX_SIZE and n / 2. It is advised to choose\n  *   cache_size to have \"n modulo cache_size == 0\": if this is\n  *   not the case, some elements will always stay in the pool and will\n  *   never be used. The access to the per-lcore table is of course\n@@ -1373,24 +1373,24 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,\n        RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);\n\n        /* The request itself is too big for the cache */\n-       if (unlikely(n > cache->flushthresh))\n+       if (unlikely(n > cache->size))\n                goto driver_enqueue_stats_incremented;\n\n        /*\n         * The cache follows the following algorithm:\n         *   1. 
If the objects cannot be added to the cache without crossing\n-        *      the flush threshold, flush the cache to the backend.\n+        *      the flush threshold, flush a fixed amount of the cache to the backend.\n         *   2. Add the objects to the cache.\n         */\n\n        if (cache->len + n <= cache->flushthresh) {\n                cache_objs = &cache->objs[cache->len];\n-               cache->len += n;\n        } else {\n-               cache_objs = &cache->objs[0];\n-               rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache->len);\n-               cache->len = n;\n+               cache->len -= cache->size;\n+               cache_objs = &cache->objs[cache->len];\n+               rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache->size);\n        }\n+       cache->len += n;\n\n        /* Add the objects to the cache. */\n        rte_memcpy(cache_objs, obj_table, sizeof(void *) * n);\n@@ -1547,13 +1547,13 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,\n                return 0;\n        }\n\n-       /* if dequeue below would overflow mem allocated for cache */\n-       if (unlikely(remaining > RTE_MEMPOOL_CACHE_MAX_SIZE))\n+       /* More remaining than the cache size */\n+       if (unlikely(remaining > cache->size))\n                goto driver_dequeue;\n\n-       /* Fill the cache from the backend; fetch size + remaining objects. */\n+       /* Fill the cache from the backend; fetch size objects. */\n        ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs,\n-                       cache->size + remaining);\n+                       cache->size);\n        if (unlikely(ret < 0)) {\n                /*\n                 * We are buffer constrained, and not able to allocate\n@@ -1565,11 +1565,11 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,\n        }\n\n        /* Satisfy the remaining part of the request from the filled cache. 
*/\n-       cache_objs = &cache->objs[cache->size + remaining];\n+       cache_objs = &cache->objs[cache->size];\n        for (index = 0; index < remaining; index++)\n                *obj_table++ = *--cache_objs;\n\n-       cache->len = cache->size;\n+       cache->len = cache->size - remaining;\n\n        RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);\n        RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);\ndiff --git a/lib/ring/rte_ring_elem_pvt.h b/lib/ring/rte_ring_elem_pvt.h\nindex 4b80f58980..2b10b76fc1 100644\n--- a/lib/ring/rte_ring_elem_pvt.h\n+++ b/lib/ring/rte_ring_elem_pvt.h\n@@ -10,6 +10,9 @@\n #ifndef _RTE_RING_ELEM_PVT_H_\n #define _RTE_RING_ELEM_PVT_H_\n\n+#include <rte_config.h>\n+#include <rte_memcpy.h>\n+\n #if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 120000)\n #pragma GCC diagnostic push\n #pragma GCC diagnostic ignored \"-Wstringop-overflow\"\n@@ -24,6 +27,12 @@ __rte_ring_enqueue_elems_32(struct rte_ring *r, const uint32_t size,\n        uint32_t *ring = (uint32_t *)&r[1];\n        const uint32_t *obj = (const uint32_t *)obj_table;\n        if (likely(idx + n <= size)) {\n+#ifdef RTE_ARCH_32\n+               if (n == RTE_MEMPOOL_CACHE_MAX_SIZE) {\n+                       rte_memcpy(&ring[idx], obj_table, RTE_MEMPOOL_CACHE_MAX_SIZE * sizeof(uint32_t));\n+                       return;\n+               }\n+#endif\n                for (i = 0; i < (n & ~0x7); i += 8, idx += 8) {\n                        ring[idx] = obj[i];\n                        ring[idx + 1] = obj[i + 1];\n@@ -69,6 +78,12 @@ __rte_ring_enqueue_elems_64(struct rte_ring *r, uint32_t prod_head,\n        uint64_t *ring = (uint64_t *)&r[1];\n        const unaligned_uint64_t *obj = (const unaligned_uint64_t *)obj_table;\n        if (likely(idx + n <= size)) {\n+#ifdef RTE_ARCH_64\n+               if (n == RTE_MEMPOOL_CACHE_MAX_SIZE) {\n+                       rte_memcpy(&ring[idx], obj_table, RTE_MEMPOOL_CACHE_MAX_SIZE * sizeof(uint64_t));\n+                    
   return;\n+               }\n+#endif\n                for (i = 0; i < (n & ~0x3); i += 4, idx += 4) {\n                        ring[idx] = obj[i];\n                        ring[idx + 1] = obj[i + 1];\n@@ -158,6 +173,12 @@ __rte_ring_dequeue_elems_32(struct rte_ring *r, const uint32_t size,\n        uint32_t *ring = (uint32_t *)&r[1];\n        uint32_t *obj = (uint32_t *)obj_table;\n        if (likely(idx + n <= size)) {\n+#ifdef RTE_ARCH_32\n+               if (n == RTE_MEMPOOL_CACHE_MAX_SIZE) {\n+                       rte_memcpy(obj_table, &ring[idx], RTE_MEMPOOL_CACHE_MAX_SIZE * sizeof(uint32_t));\n+                       return;\n+               }\n+#endif\n                for (i = 0; i < (n & ~0x7); i += 8, idx += 8) {\n                        obj[i] = ring[idx];\n                        obj[i + 1] = ring[idx + 1];\n@@ -203,6 +224,12 @@ __rte_ring_dequeue_elems_64(struct rte_ring *r, uint32_t cons_head,\n        uint64_t *ring = (uint64_t *)&r[1];\n        unaligned_uint64_t *obj = (unaligned_uint64_t *)obj_table;\n        if (likely(idx + n <= size)) {\n+#ifdef RTE_ARCH_64\n+               if (n == RTE_MEMPOOL_CACHE_MAX_SIZE) {\n+                       rte_memcpy(obj_table, &ring[idx], RTE_MEMPOOL_CACHE_MAX_SIZE * sizeof(uint64_t));\n+                       return;\n+               }\n+#endif\n                for (i = 0; i < (n & ~0x3); i += 4, idx += 4) {\n                        obj[i] = ring[idx];\n                        obj[i + 1] = ring[idx + 1];\n",
    "prefixes": [
        "RFC"
    ]
}