[v2] eal/linux: enhanced error handling for affinity
Checks
Commit Message
From: Jianyue Wu <jianyue.wu@nokia-sbell.com>
Improve the robustness of setting thread affinity in DPDK
by adding detailed error logging.
Changes:
1. Check the return value of pthread_setaffinity_np() and log an error
if the call fails.
2. Include the current thread name, the intended CPU set, and a detailed
error message in the log.
Sample prints:
EAL: Cannot set affinity for thread dpdk-test with cpus 0,
ret: 22, errno: 0, error description: Success
EAL: Cannot set affinity for thread dpdk-worker1 with cpus 1,
ret: 22, errno: 0, error description: Success
Signed-off-by: Jianyue Wu <jianyue.wu@nokia-sbell.com>
---
lib/eal/common/eal_common_thread.c | 2 +-
lib/eal/common/eal_thread.h | 2 +-
lib/eal/unix/rte_thread.c | 27 +++++++++++++++++++++++++--
3 files changed, 27 insertions(+), 4 deletions(-)
Comments
On Thu, Apr 25, 2024 at 07:11:30PM +0800, Jianyue Wu wrote:
> From: Jianyue Wu <jianyue.wu@nokia-sbell.com>
>
> Improve the robustness of setting thread affinity in DPDK
> by adding detailed error logging.
>
> Changes:
> 1. Check the return value of pthread_setaffinity_np() and log an error
> if the call fails.
> 2. Include the current thread name, the intended CPU set, and a detailed
> error message in the log.
>
> Sample prints:
> EAL: Cannot set affinity for thread dpdk-test with cpus 0,
> ret: 22, errno: 0, error description: Success
> EAL: Cannot set affinity for thread dpdk-worker1 with cpus 1,
> ret: 22, errno: 0, error description: Success
>
> Signed-off-by: Jianyue Wu <jianyue.wu@nokia-sbell.com>
> ---
> lib/eal/common/eal_common_thread.c | 2 +-
> lib/eal/common/eal_thread.h | 2 +-
> lib/eal/unix/rte_thread.c | 27 +++++++++++++++++++++++++--
> 3 files changed, 27 insertions(+), 4 deletions(-)
>
> diff --git a/lib/eal/common/eal_common_thread.c b/lib/eal/common/eal_common_thread.c
> index a53bc639ae..31a2fab2a7 100644
> --- a/lib/eal/common/eal_common_thread.c
> +++ b/lib/eal/common/eal_common_thread.c
> @@ -103,7 +103,7 @@ rte_thread_get_affinity(rte_cpuset_t *cpusetp)
> }
>
> int
> -eal_thread_dump_affinity(rte_cpuset_t *cpuset, char *str, unsigned int size)
> +eal_thread_dump_affinity(const rte_cpuset_t *cpuset, char *str, unsigned int size)
> {
> unsigned cpu;
> int ret;
> diff --git a/lib/eal/common/eal_thread.h b/lib/eal/common/eal_thread.h
> index 1c3c3442d3..85ab84baa5 100644
> --- a/lib/eal/common/eal_thread.h
> +++ b/lib/eal/common/eal_thread.h
> @@ -50,7 +50,7 @@ unsigned eal_cpu_socket_id(unsigned cpu_id);
> * 0 for success, -1 if truncation happens.
> */
> int
> -eal_thread_dump_affinity(rte_cpuset_t *cpuset, char *str, unsigned int size);
> +eal_thread_dump_affinity(const rte_cpuset_t *cpuset, char *str, unsigned int size);
no objection to adding const
>
> /**
> * Dump the current thread cpuset.
> diff --git a/lib/eal/unix/rte_thread.c b/lib/eal/unix/rte_thread.c
> index 1b4c73f58e..34ac0eabbf 100644
> --- a/lib/eal/unix/rte_thread.c
> +++ b/lib/eal/unix/rte_thread.c
> @@ -369,8 +369,31 @@ int
> rte_thread_set_affinity_by_id(rte_thread_t thread_id,
> const rte_cpuset_t *cpuset)
> {
> - return pthread_setaffinity_np((pthread_t)thread_id.opaque_id,
> - sizeof(*cpuset), cpuset);
> + int ret;
> +#if defined(__linux__) && defined(_GNU_SOURCE)
> + char cpus_str[RTE_CPU_AFFINITY_STR_LEN] = {'\0'};
> + char thread_name[RTE_MAX_THREAD_NAME_LEN] = {'\0'};
> + errno = 0;
> +#endif
> +
> + ret = pthread_setaffinity_np((pthread_t)thread_id.opaque_id,
> + sizeof(*cpuset), cpuset);
> +
> +#if defined(__linux__) && defined(_GNU_SOURCE)
> + if (ret != 0) {
> + if (pthread_getname_np((pthread_t)thread_id.opaque_id,
> + thread_name, sizeof(thread_name)) != 0)
> + EAL_LOG(ERR, "pthread_getname_np failed!");
> + if (eal_thread_dump_affinity(cpuset, cpus_str, RTE_CPU_AFFINITY_STR_LEN) != 0)
> + EAL_LOG(ERR, "eal_thread_dump_affinity failed!");
> + EAL_LOG(ERR, "Cannot set affinity for thread %s with cpus %s, "
> + "ret: %d, errno: %d, error description: %s",
> + thread_name, cpus_str,
> + ret, errno, strerror(errno));
> + }
> +#endif
> +
> + return ret;
> }
>
> int
> --
i do not think introducing os specific behavior/logging to the EAL
is a good idea. logging although not formally part of the api surface
should present the same experience for all platforms. the EAL should
have a higher standard here.
> 2.34.1
On Fri, 26 Apr 2024 08:47:37 -0700
Tyler Retzlaff <roretzla@linux.microsoft.com> wrote:
> > int
> > --
>
> i do not think introducing os specific behavior/logging to the EAL
> is a good idea. logging although not formally part of the api surface
> should present the same experience for all platforms. the EAL should
> have a higher standard here.
>
> > 2.34.1
For this case, the error message would be better if there was some way
to look at the cgroups and mark the cores that are not available as if
they were offline lcores.
Yes, agree with that, there is also trace from kernel can see that. I'll ignore this patch.
At 2024-04-27 08:18:53, "Stephen Hemminger" <stephen@networkplumber.org> wrote:
>On Fri, 26 Apr 2024 08:47:37 -0700
>Tyler Retzlaff <roretzla@linux.microsoft.com> wrote:
>
>> > int
>> > --
>>
>> i do not think introducing os specific behavior/logging to the EAL
>> is a good idea. logging although not formally part of the api surface
>> should present the same experience for all platforms. the EAL should
>> have a higher standard here.
>>
>> > 2.34.1
>
>For this case, the error message would be better if there was some way
>to look at the cgroups and mark the cores that are not available as if
>they were offline lcores.
>
>
@@ -103,7 +103,7 @@ rte_thread_get_affinity(rte_cpuset_t *cpusetp)
}
int
-eal_thread_dump_affinity(rte_cpuset_t *cpuset, char *str, unsigned int size)
+eal_thread_dump_affinity(const rte_cpuset_t *cpuset, char *str, unsigned int size)
{
unsigned cpu;
int ret;
@@ -50,7 +50,7 @@ unsigned eal_cpu_socket_id(unsigned cpu_id);
* 0 for success, -1 if truncation happens.
*/
int
-eal_thread_dump_affinity(rte_cpuset_t *cpuset, char *str, unsigned int size);
+eal_thread_dump_affinity(const rte_cpuset_t *cpuset, char *str, unsigned int size);
/**
* Dump the current thread cpuset.
@@ -369,8 +369,31 @@ int
rte_thread_set_affinity_by_id(rte_thread_t thread_id,
const rte_cpuset_t *cpuset)
{
- return pthread_setaffinity_np((pthread_t)thread_id.opaque_id,
- sizeof(*cpuset), cpuset);
+ int ret;
+#if defined(__linux__) && defined(_GNU_SOURCE)
+ char cpus_str[RTE_CPU_AFFINITY_STR_LEN] = {'\0'};
+ char thread_name[RTE_MAX_THREAD_NAME_LEN] = {'\0'};
+ errno = 0;
+#endif
+
+ ret = pthread_setaffinity_np((pthread_t)thread_id.opaque_id,
+ sizeof(*cpuset), cpuset);
+
+#if defined(__linux__) && defined(_GNU_SOURCE)
+ if (ret != 0) {
+ if (pthread_getname_np((pthread_t)thread_id.opaque_id,
+ thread_name, sizeof(thread_name)) != 0)
+ EAL_LOG(ERR, "pthread_getname_np failed!");
+ if (eal_thread_dump_affinity(cpuset, cpus_str, RTE_CPU_AFFINITY_STR_LEN) != 0)
+ EAL_LOG(ERR, "eal_thread_dump_affinity failed!");
+ EAL_LOG(ERR, "Cannot set affinity for thread %s with cpus %s, "
+ "ret: %d, errno: %d, error description: %s",
+ thread_name, cpus_str,
+ ret, errno, strerror(errno));
+ }
+#endif
+
+ return ret;
}
int