[v9,1/4] lib: add generic support for reading PMU events

Message ID 20230202124951.2915770-2-tduszynski@marvell.com (mailing list archive)
State Superseded, archived
Delegated to: David Marchand
Headers
Series add support for self monitoring |

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Tomasz Duszynski Feb. 2, 2023, 12:49 p.m. UTC
  Add support for programming PMU counters and reading their values
at runtime, bypassing the kernel completely.

This is especially useful in cases where CPU cores are isolated
(nohz_full), i.e. they run dedicated tasks. In such cases one cannot use
the standard perf utility without sacrificing latency and performance.

Signed-off-by: Tomasz Duszynski <tduszynski@marvell.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
---
 MAINTAINERS                            |   5 +
 app/test/meson.build                   |   1 +
 app/test/test_pmu.c                    |  55 +++
 doc/api/doxy-api-index.md              |   3 +-
 doc/api/doxy-api.conf.in               |   1 +
 doc/guides/prog_guide/profile_app.rst  |   8 +
 doc/guides/rel_notes/release_23_03.rst |   9 +
 lib/meson.build                        |   1 +
 lib/pmu/meson.build                    |  13 +
 lib/pmu/pmu_private.h                  |  29 ++
 lib/pmu/rte_pmu.c                      | 464 +++++++++++++++++++++++++
 lib/pmu/rte_pmu.h                      | 205 +++++++++++
 lib/pmu/version.map                    |  20 ++
 13 files changed, 813 insertions(+), 1 deletion(-)
 create mode 100644 app/test/test_pmu.c
 create mode 100644 lib/pmu/meson.build
 create mode 100644 lib/pmu/pmu_private.h
 create mode 100644 lib/pmu/rte_pmu.c
 create mode 100644 lib/pmu/rte_pmu.h
 create mode 100644 lib/pmu/version.map
  

Comments

David Marchand Feb. 6, 2023, 11:02 a.m. UTC | #1
Hello,

On Thu, Feb 2, 2023 at 1:50 PM Tomasz Duszynski <tduszynski@marvell.com> wrote:
>
> Add support for programming PMU counters and reading their values
> in runtime bypassing kernel completely.
>
> This is especially useful in cases where CPU cores are isolated
> (nohz_full) i.e run dedicated tasks. In such cases one cannot use
> standard perf utility without sacrificing latency and performance.

For my understanding, what OS capabilities/permissions are required to
use this library?


>
> Signed-off-by: Tomasz Duszynski <tduszynski@marvell.com>
> Acked-by: Morten Brørup <mb@smartsharesystems.com>
> ---
>  MAINTAINERS                            |   5 +
>  app/test/meson.build                   |   1 +
>  app/test/test_pmu.c                    |  55 +++
>  doc/api/doxy-api-index.md              |   3 +-
>  doc/api/doxy-api.conf.in               |   1 +
>  doc/guides/prog_guide/profile_app.rst  |   8 +
>  doc/guides/rel_notes/release_23_03.rst |   9 +
>  lib/meson.build                        |   1 +
>  lib/pmu/meson.build                    |  13 +
>  lib/pmu/pmu_private.h                  |  29 ++
>  lib/pmu/rte_pmu.c                      | 464 +++++++++++++++++++++++++
>  lib/pmu/rte_pmu.h                      | 205 +++++++++++
>  lib/pmu/version.map                    |  20 ++
>  13 files changed, 813 insertions(+), 1 deletion(-)
>  create mode 100644 app/test/test_pmu.c
>  create mode 100644 lib/pmu/meson.build
>  create mode 100644 lib/pmu/pmu_private.h
>  create mode 100644 lib/pmu/rte_pmu.c
>  create mode 100644 lib/pmu/rte_pmu.h
>  create mode 100644 lib/pmu/version.map
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 9a0f416d2e..9f13eafd95 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -1697,6 +1697,11 @@ M: Nithin Dabilpuram <ndabilpuram@marvell.com>
>  M: Pavan Nikhilesh <pbhagavatula@marvell.com>
>  F: lib/node/
>
> +PMU - EXPERIMENTAL
> +M: Tomasz Duszynski <tduszynski@marvell.com>
> +F: lib/pmu/
> +F: app/test/test_pmu*
> +
>
>  Test Applications
>  -----------------
> diff --git a/app/test/meson.build b/app/test/meson.build
> index f34d19e3c3..7b6b69dcf1 100644
> --- a/app/test/meson.build
> +++ b/app/test/meson.build
> @@ -111,6 +111,7 @@ test_sources = files(
>          'test_reciprocal_division_perf.c',
>          'test_red.c',
>          'test_pie.c',
> +        'test_pmu.c',
>          'test_reorder.c',
>          'test_rib.c',
>          'test_rib6.c',

This code adds a new test.
This test should be added to an existing testsuite, like fast-tests etc...


> diff --git a/app/test/test_pmu.c b/app/test/test_pmu.c
> new file mode 100644
> index 0000000000..a9bfb1a427
> --- /dev/null
> +++ b/app/test/test_pmu.c
> @@ -0,0 +1,55 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(C) 2023 Marvell International Ltd.
> + */
> +
> +#include "test.h"
> +
> +#ifndef RTE_EXEC_ENV_LINUX
> +
> +static int
> +test_pmu(void)
> +{
> +       printf("pmu_autotest only supported on Linux, skipping test\n");
> +       return TEST_SKIPPED;
> +}
> +
> +#else
> +
> +#include <rte_pmu.h>
> +
> +static int
> +test_pmu_read(void)
> +{
> +       int tries = 10, event = -1;
> +       uint64_t val = 0;
> +
> +       if (rte_pmu_init() < 0)
> +               return TEST_FAILED;
> +
> +       while (tries--)
> +               val += rte_pmu_read(event);
> +
> +       rte_pmu_fini();
> +
> +       return val ? TEST_SUCCESS : TEST_FAILED;
> +}
> +
> +static struct unit_test_suite pmu_tests = {
> +       .suite_name = "pmu autotest",
> +       .setup = NULL,
> +       .teardown = NULL,
> +       .unit_test_cases = {
> +               TEST_CASE(test_pmu_read),
> +               TEST_CASES_END()
> +       }
> +};
> +
> +static int
> +test_pmu(void)
> +{
> +       return unit_test_suite_runner(&pmu_tests);
> +}
> +
> +#endif /* RTE_EXEC_ENV_LINUX */
> +
> +REGISTER_TEST_COMMAND(pmu_autotest, test_pmu);
> diff --git a/doc/api/doxy-api-index.md b/doc/api/doxy-api-index.md
> index de488c7abf..7f1938f92f 100644
> --- a/doc/api/doxy-api-index.md
> +++ b/doc/api/doxy-api-index.md
> @@ -222,7 +222,8 @@ The public API headers are grouped by topics:
>    [log](@ref rte_log.h),
>    [errno](@ref rte_errno.h),
>    [trace](@ref rte_trace.h),
> -  [trace_point](@ref rte_trace_point.h)
> +  [trace_point](@ref rte_trace_point.h),
> +  [pmu](@ref rte_pmu.h)
>
>  - **misc**:
>    [EAL config](@ref rte_eal.h),
> diff --git a/doc/api/doxy-api.conf.in b/doc/api/doxy-api.conf.in
> index f0886c3bd1..920e615996 100644
> --- a/doc/api/doxy-api.conf.in
> +++ b/doc/api/doxy-api.conf.in
> @@ -63,6 +63,7 @@ INPUT                   = @TOPDIR@/doc/api/doxy-api-index.md \
>                            @TOPDIR@/lib/pci \
>                            @TOPDIR@/lib/pdump \
>                            @TOPDIR@/lib/pipeline \
> +                          @TOPDIR@/lib/pmu \
>                            @TOPDIR@/lib/port \
>                            @TOPDIR@/lib/power \
>                            @TOPDIR@/lib/rawdev \
> diff --git a/doc/guides/prog_guide/profile_app.rst b/doc/guides/prog_guide/profile_app.rst
> index 14292d4c25..a8b501fe0c 100644
> --- a/doc/guides/prog_guide/profile_app.rst
> +++ b/doc/guides/prog_guide/profile_app.rst
> @@ -7,6 +7,14 @@ Profile Your Application
>  The following sections describe methods of profiling DPDK applications on
>  different architectures.
>
> +Performance counter based profiling
> +-----------------------------------
> +
> +Majority of architectures support some sort hardware measurement unit which provides a set of
> +programmable counters that monitor specific events. There are different tools which can gather
> +that information, perf being an example here. Though in some scenarios, eg. when CPU cores are
> +isolated (nohz_full) and run dedicated tasks, using perf is less than ideal. In such cases one can
> +read specific events directly from application via ``rte_pmu_read()``.

We need a common definition in the documentation of what PMU stands
for and use it consistently.
I am not sure this documentation is the best place, but at least, I'd
prefer we go with "Performance Monitoring Unit" and stick to it.

Plus, this block is a bit hard to read too, what do you think of:

"""
A majority of architectures support some performance monitoring unit (PMU).
Such unit provides programmable counters that monitor specific events.

Different tools gather that information, like for example perf.
However, in some scenarios when CPU cores are isolated (nohz_full) and
run dedicated tasks, interrupting those tasks with perf may be
undesirable.
In such cases, an application can use the PMU library to read such
events via ``rte_pmu_read()``.
"""

And, a double newline is used between sections in this doc.


>
>  Profiling on x86
>  ----------------
> diff --git a/doc/guides/rel_notes/release_23_03.rst b/doc/guides/rel_notes/release_23_03.rst
> index 73f5d94e14..733541d56c 100644
> --- a/doc/guides/rel_notes/release_23_03.rst
> +++ b/doc/guides/rel_notes/release_23_03.rst
> @@ -55,10 +55,19 @@ New Features
>       Also, make sure to start the actual text at the margin.
>       =======================================================
>
> +* **Added PMU library.**
> +
> +  Added a new PMU (performance measurement unit) library which allows applications

Performance Monitoring Unit.

> +  to perform self monitoring activities without depending on external utilities like perf.

> +  After integration with :doc:`../prog_guide/trace_lib` data gathered from hardware counters
> +  can be stored in CTF format for further analysis.

Afaiu, this integration comes later in the series.
This part of the RN update should go with it.


> +
>  * **Updated AMD axgbe driver.**
>
>    * Added multi-process support.
>
> +* **Added multi-process support for axgbe PMD.**
> +
>  * **Updated Corigine nfp driver.**
>

Unrelated rebase damage.. please pay attention to such detail.


>    * Added support for meter options.
> diff --git a/lib/meson.build b/lib/meson.build
> index a90fee31b7..7132131b5c 100644
> --- a/lib/meson.build
> +++ b/lib/meson.build
> @@ -11,6 +11,7 @@
>  libraries = [
>          'kvargs', # eal depends on kvargs
>          'telemetry', # basic info querying
> +        'pmu',
>          'eal', # everything depends on eal
>          'ring',
>          'rcu', # rcu depends on ring
> diff --git a/lib/pmu/meson.build b/lib/pmu/meson.build
> new file mode 100644
> index 0000000000..a4160b494e
> --- /dev/null
> +++ b/lib/pmu/meson.build
> @@ -0,0 +1,13 @@
> +# SPDX-License-Identifier: BSD-3-Clause
> +# Copyright(C) 2023 Marvell International Ltd.
> +
> +if not is_linux
> +    build = false
> +    reason = 'only supported on Linux'
> +    subdir_done()
> +endif
> +
> +includes = [global_inc]
> +
> +sources = files('rte_pmu.c')
> +headers = files('rte_pmu.h')
> diff --git a/lib/pmu/pmu_private.h b/lib/pmu/pmu_private.h
> new file mode 100644
> index 0000000000..849549b125
> --- /dev/null
> +++ b/lib/pmu/pmu_private.h
> @@ -0,0 +1,29 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2023 Marvell
> + */
> +
> +#ifndef _PMU_PRIVATE_H_
> +#define _PMU_PRIVATE_H_
> +
> +/**
> + * Architecture specific PMU init callback.
> + *
> + * @return
> + *   0 in case of success, negative value otherwise.
> + */
> +int
> +pmu_arch_init(void);
> +
> +/**
> + * Architecture specific PMU cleanup callback.
> + */
> +void
> +pmu_arch_fini(void);
> +
> +/**
> + * Apply architecture specific settings to config before passing it to syscall.

Please describe config[].


> + */
> +void
> +pmu_arch_fixup_config(uint64_t config[3]);
> +
> +#endif /* _PMU_PRIVATE_H_ */
> diff --git a/lib/pmu/rte_pmu.c b/lib/pmu/rte_pmu.c
> new file mode 100644
> index 0000000000..4cf3161155
> --- /dev/null
> +++ b/lib/pmu/rte_pmu.c
> @@ -0,0 +1,464 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(C) 2023 Marvell International Ltd.
> + */
> +
> +#include <ctype.h>
> +#include <dirent.h>
> +#include <errno.h>
> +#include <regex.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <sys/ioctl.h>
> +#include <sys/mman.h>
> +#include <sys/queue.h>
> +#include <sys/syscall.h>
> +#include <unistd.h>

Asking to be sure because I did not check:
do we need all those includes, or is this just copy/pasted from somewhere else?


> +
> +#include <rte_atomic.h>
> +#include <rte_per_lcore.h>
> +#include <rte_pmu.h>
> +#include <rte_spinlock.h>
> +#include <rte_tailq.h>
> +
> +#include "pmu_private.h"
> +
> +#define EVENT_SOURCE_DEVICES_PATH "/sys/bus/event_source/devices"
> +
> +#ifndef GENMASK_ULL

This macro is copy/pasted all over the dpdk tree...
This is worth a cleanup later, read: I am not asking for it as part of
this series.

However, here, there is no need for protecting against its definition.


> +#define GENMASK_ULL(h, l) ((~0ULL - (1ULL << (l)) + 1) & (~0ULL >> ((64 - 1 - (h)))))
> +#endif
> +
> +#ifndef FIELD_PREP
> +#define FIELD_PREP(m, v) (((uint64_t)(v) << (__builtin_ffsll(m) - 1)) & (m))
> +#endif

Idem.


> +
> +RTE_DEFINE_PER_LCORE(struct rte_pmu_event_group, _event_group);
> +struct rte_pmu rte_pmu;
> +
> +/*
> + * Following __rte_weak functions provide default no-op. Architectures should override them if
> + * necessary.
> + */

Prefer using per architectures #ifdef.
It is easier to get a simple link error than use weak symbols that
make it look like it could work on some arch.


> +
> +int
> +__rte_weak pmu_arch_init(void)
> +{
> +       return 0;

Add a debug log message indicating that this arch does not support PMU.


> +}
> +
> +void
> +__rte_weak pmu_arch_fini(void)
> +{
> +}
> +
> +void
> +__rte_weak pmu_arch_fixup_config(uint64_t __rte_unused config[3])
> +{
> +}
> +
> +static int
> +get_term_format(const char *name, int *num, uint64_t *mask)
> +{
> +       char *config = NULL;
> +       char path[PATH_MAX];
> +       int high, low, ret;
> +       FILE *fp;

Reverse xmas tree when possible.


> +
> +       /* quiesce -Wmaybe-uninitialized warning */

This comment just seems to be a note for yourself.
What was the issue exactly?


> +       *num = 0;
> +       *mask = 0;
> +
> +       snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/format/%s", rte_pmu.name, name);
> +       fp = fopen(path, "r");
> +       if (fp == NULL)
> +               return -errno;
> +
> +       errno = 0;
> +       ret = fscanf(fp, "%m[^:]:%d-%d", &config, &low, &high);
> +       if (ret < 2) {
> +               ret = -ENODATA;
> +               goto out;
> +       }
> +       if (errno) {
> +               ret = -errno;
> +               goto out;
> +       }
> +
> +       if (ret == 2)
> +               high = low;
> +
> +       *mask = GENMASK_ULL(high, low);
> +       /* Last digit should be [012]. If last digit is missing 0 is implied. */
> +       *num = config[strlen(config) - 1];
> +       *num = isdigit(*num) ? *num - '0' : 0;
> +
> +       ret = 0;
> +out:
> +       free(config);
> +       fclose(fp);
> +
> +       return ret;
> +}
> +
> +static int
> +parse_event(char *buf, uint64_t config[3])
> +{
> +       char *token, *term;
> +       int num, ret, val;
> +       uint64_t mask;
> +
> +       config[0] = config[1] = config[2] = 0;
> +
> +       token = strtok(buf, ",");
> +       while (token) {
> +               errno = 0;
> +               /* <term>=<value> */
> +               ret = sscanf(token, "%m[^=]=%i", &term, &val);
> +               if (ret < 1)
> +                       return -ENODATA;
> +               if (errno)
> +                       return -errno;
> +               if (ret == 1)
> +                       val = 1;
> +
> +               ret = get_term_format(term, &num, &mask);
> +               free(term);
> +               if (ret)
> +                       return ret;
> +
> +               config[num] |= FIELD_PREP(mask, val);
> +               token = strtok(NULL, ",");
> +       }
> +
> +       return 0;
> +}
> +
> +static int
> +get_event_config(const char *name, uint64_t config[3])
> +{
> +       char path[PATH_MAX], buf[BUFSIZ];
> +       FILE *fp;
> +       int ret;
> +
> +       snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name, name);
> +       fp = fopen(path, "r");
> +       if (fp == NULL)
> +               return -errno;
> +
> +       ret = fread(buf, 1, sizeof(buf), fp);
> +       if (ret == 0) {
> +               fclose(fp);
> +
> +               return -EINVAL;
> +       }
> +       fclose(fp);
> +       buf[ret] = '\0';
> +
> +       return parse_event(buf, config);
> +}
> +
> +static int
> +do_perf_event_open(uint64_t config[3], int group_fd)
> +{
> +       struct perf_event_attr attr = {
> +               .size = sizeof(struct perf_event_attr),
> +               .type = PERF_TYPE_RAW,
> +               .exclude_kernel = 1,
> +               .exclude_hv = 1,
> +               .disabled = 1,
> +       };
> +
> +       pmu_arch_fixup_config(config);
> +
> +       attr.config = config[0];
> +       attr.config1 = config[1];
> +       attr.config2 = config[2];
> +
> +       return syscall(SYS_perf_event_open, &attr, 0, -1, group_fd, 0);
> +}
> +
> +static int
> +open_events(struct rte_pmu_event_group *group)
> +{
> +       struct rte_pmu_event *event;
> +       uint64_t config[3];
> +       int num = 0, ret;
> +
> +       /* group leader gets created first, with fd = -1 */
> +       group->fds[0] = -1;
> +
> +       TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
> +               ret = get_event_config(event->name, config);
> +               if (ret)
> +                       continue;
> +
> +               ret = do_perf_event_open(config, group->fds[0]);
> +               if (ret == -1) {
> +                       ret = -errno;
> +                       goto out;
> +               }
> +
> +               group->fds[event->index] = ret;
> +               num++;
> +       }
> +
> +       return 0;
> +out:
> +       for (--num; num >= 0; num--) {
> +               close(group->fds[num]);
> +               group->fds[num] = -1;
> +       }
> +
> +
> +       return ret;
> +}
> +
> +static int
> +mmap_events(struct rte_pmu_event_group *group)
> +{
> +       long page_size = sysconf(_SC_PAGE_SIZE);
> +       unsigned int i;
> +       void *addr;
> +       int ret;
> +
> +       for (i = 0; i < rte_pmu.num_group_events; i++) {
> +               addr = mmap(0, page_size, PROT_READ, MAP_SHARED, group->fds[i], 0);
> +               if (addr == MAP_FAILED) {
> +                       ret = -errno;
> +                       goto out;
> +               }
> +
> +               group->mmap_pages[i] = addr;
> +       }
> +
> +       return 0;
> +out:
> +       for (; i; i--) {
> +               munmap(group->mmap_pages[i - 1], page_size);
> +               group->mmap_pages[i - 1] = NULL;
> +       }
> +
> +       return ret;
> +}
> +
> +static void
> +cleanup_events(struct rte_pmu_event_group *group)
> +{
> +       unsigned int i;
> +
> +       if (group->fds[0] != -1)
> +               ioctl(group->fds[0], PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
> +
> +       for (i = 0; i < rte_pmu.num_group_events; i++) {
> +               if (group->mmap_pages[i]) {
> +                       munmap(group->mmap_pages[i], sysconf(_SC_PAGE_SIZE));
> +                       group->mmap_pages[i] = NULL;
> +               }
> +
> +               if (group->fds[i] != -1) {
> +                       close(group->fds[i]);
> +                       group->fds[i] = -1;
> +               }
> +       }
> +
> +       group->enabled = false;
> +}
> +
> +int __rte_noinline

This symbol is exported out of this library, no need for noinline.


> +rte_pmu_enable_group(void)
> +{
> +       struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
> +       int ret;
> +
> +       if (rte_pmu.num_group_events == 0)
> +               return -ENODEV;
> +
> +       ret = open_events(group);
> +       if (ret)
> +               goto out;
> +
> +       ret = mmap_events(group);
> +       if (ret)
> +               goto out;
> +
> +       if (ioctl(group->fds[0], PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) {
> +               ret = -errno;
> +               goto out;
> +       }
> +
> +       if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
> +               ret = -errno;
> +               goto out;
> +       }
> +
> +       rte_spinlock_lock(&rte_pmu.lock);
> +       TAILQ_INSERT_TAIL(&rte_pmu.event_group_list, group, next);
> +       rte_spinlock_unlock(&rte_pmu.lock);
> +       group->enabled = true;
> +
> +       return 0;
> +
> +out:
> +       cleanup_events(group);
> +
> +       return ret;
> +}
> +
> +static int
> +scan_pmus(void)
> +{
> +       char path[PATH_MAX];
> +       struct dirent *dent;
> +       const char *name;
> +       DIR *dirp;
> +
> +       dirp = opendir(EVENT_SOURCE_DEVICES_PATH);
> +       if (dirp == NULL)
> +               return -errno;
> +
> +       while ((dent = readdir(dirp))) {
> +               name = dent->d_name;
> +               if (name[0] == '.')
> +                       continue;
> +
> +               /* sysfs entry should either contain cpus or be a cpu */
> +               if (!strcmp(name, "cpu"))
> +                       break;
> +
> +               snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/cpus", name);
> +               if (access(path, F_OK) == 0)
> +                       break;
> +       }
> +
> +       if (dent) {
> +               rte_pmu.name = strdup(name);
> +               if (rte_pmu.name == NULL) {
> +                       closedir(dirp);
> +
> +                       return -ENOMEM;
> +               }
> +       }
> +
> +       closedir(dirp);
> +
> +       return rte_pmu.name ? 0 : -ENODEV;
> +}
> +
> +static struct rte_pmu_event *
> +new_event(const char *name)
> +{
> +       struct rte_pmu_event *event;
> +
> +       event = calloc(1, sizeof(*event));
> +       if (event == NULL)
> +               goto out;
> +
> +       event->name = strdup(name);
> +       if (event->name == NULL) {
> +               free(event);
> +               event = NULL;
> +       }
> +
> +out:
> +       return event;
> +}
> +
> +static void
> +free_event(struct rte_pmu_event *event)
> +{
> +       free(event->name);
> +       free(event);
> +}
> +
> +int
> +rte_pmu_add_event(const char *name)
> +{
> +       struct rte_pmu_event *event;
> +       char path[PATH_MAX];
> +
> +       if (rte_pmu.name == NULL)
> +               return -ENODEV;
> +
> +       if (rte_pmu.num_group_events + 1 >= MAX_NUM_GROUP_EVENTS)
> +               return -ENOSPC;
> +
> +       snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name, name);
> +       if (access(path, R_OK))
> +               return -ENODEV;
> +
> +       TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
> +               if (!strcmp(event->name, name))
> +                       return event->index;
> +               continue;
> +       }
> +
> +       event = new_event(name);
> +       if (event == NULL)
> +               return -ENOMEM;
> +
> +       event->index = rte_pmu.num_group_events++;
> +       TAILQ_INSERT_TAIL(&rte_pmu.event_list, event, next);
> +
> +       return event->index;
> +}
> +
> +int
> +rte_pmu_init(void)
> +{
> +       int ret;
> +
> +       /* Allow calling init from multiple contexts within a single thread. This simplifies
> +        * resource management a bit e.g in case fast-path tracepoint has already been enabled
> +        * via command line but application doesn't care enough and performs init/fini again.
> +        */
> +       if (rte_pmu.initialized) {

This is an integer so check against 0 explicitly (there may be other
cases in this patch, I did not recheck the whole patch).


> +               rte_pmu.initialized++;
> +               return 0;
> +       }
> +
> +       ret = scan_pmus();
> +       if (ret)
> +               goto out;
> +
> +       ret = pmu_arch_init();
> +       if (ret)
> +               goto out;
> +
> +       TAILQ_INIT(&rte_pmu.event_list);
> +       TAILQ_INIT(&rte_pmu.event_group_list);
> +       rte_spinlock_init(&rte_pmu.lock);
> +       rte_pmu.initialized = 1;
> +
> +       return 0;
> +out:
> +       free(rte_pmu.name);
> +       rte_pmu.name = NULL;
> +
> +       return ret;
> +}
> +
> +void
> +rte_pmu_fini(void)
> +{
> +       struct rte_pmu_event_group *group, *tmp_group;
> +       struct rte_pmu_event *event, *tmp_event;
> +
> +       /* cleanup once init count drops to zero */
> +       if (!rte_pmu.initialized || --rte_pmu.initialized)
> +               return;
> +
> +       RTE_TAILQ_FOREACH_SAFE(event, &rte_pmu.event_list, next, tmp_event) {
> +               TAILQ_REMOVE(&rte_pmu.event_list, event, next);
> +               free_event(event);
> +       }
> +
> +       RTE_TAILQ_FOREACH_SAFE(group, &rte_pmu.event_group_list, next, tmp_group) {
> +               TAILQ_REMOVE(&rte_pmu.event_group_list, group, next);
> +               cleanup_events(group);
> +       }
> +
> +       pmu_arch_fini();
> +       free(rte_pmu.name);
> +       rte_pmu.name = NULL;
> +       rte_pmu.num_group_events = 0;
> +}
> diff --git a/lib/pmu/rte_pmu.h b/lib/pmu/rte_pmu.h
> new file mode 100644
> index 0000000000..e360375a0c
> --- /dev/null
> +++ b/lib/pmu/rte_pmu.h
> @@ -0,0 +1,205 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2023 Marvell
> + */
> +
> +#ifndef _RTE_PMU_H_
> +#define _RTE_PMU_H_
> +
> +/**
> + * @file
> + *
> + * PMU event tracing operations
> + *
> + * This file defines generic API and types necessary to setup PMU and
> + * read selected counters in runtime.
> + */
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include <linux/perf_event.h>
> +
> +#include <rte_atomic.h>
> +#include <rte_branch_prediction.h>
> +#include <rte_common.h>
> +#include <rte_compat.h>
> +#include <rte_spinlock.h>
> +
> +/** Maximum number of events in a group */
> +#define MAX_NUM_GROUP_EVENTS 8
> +
> +/**
> + * A structure describing a group of events.
> + */
> +struct rte_pmu_event_group {
> +       struct perf_event_mmap_page *mmap_pages[MAX_NUM_GROUP_EVENTS]; /**< array of user pages */
> +       int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
> +       bool enabled; /**< true if group was enabled on particular lcore */
> +       TAILQ_ENTRY(rte_pmu_event_group) next; /**< list entry */
> +} __rte_cache_aligned;

One problem for the future is that we have a fixed size fd array.
Do we need to expose this whole structure to the application?


> +
> +/**
> + * A structure describing an event.
> + */
> +struct rte_pmu_event {
> +       char *name; /**< name of an event */
> +       unsigned int index; /**< event index into fds/mmap_pages */

This is an internal consideration.
Do we need to expose this to the application?


> +       TAILQ_ENTRY(rte_pmu_event) next; /**< list entry */
> +};
> +
> +/**
> + * A PMU state container.
> + */
> +struct rte_pmu {
> +       char *name; /**< name of core PMU listed under /sys/bus/event_source/devices */
> +       rte_spinlock_t lock; /**< serialize access to event group list */
> +       TAILQ_HEAD(, rte_pmu_event_group) event_group_list; /**< list of event groups */
> +       unsigned int num_group_events; /**< number of events in a group */
> +       TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
> +       unsigned int initialized; /**< initialization counter */
> +};

Idem, do we need to expose this to the application?


> +
> +/** lcore event group */
> +RTE_DECLARE_PER_LCORE(struct rte_pmu_event_group, _event_group);
> +
> +/** PMU state container */
> +extern struct rte_pmu rte_pmu;
> +
> +/** Each architecture supporting PMU needs to provide its own version */
> +#ifndef rte_pmu_pmc_read
> +#define rte_pmu_pmc_read(index) ({ 0; })
> +#endif
> +
> +/**
> + * @internal
> + *
> + * Read PMU counter.
> + *
> + * @param pc
> + *   Pointer to the mmapped user page.
> + * @return
> + *   Counter value read from hardware.
> + */
> +__rte_internal
> +static __rte_always_inline uint64_t
> +rte_pmu_read_userpage(struct perf_event_mmap_page *pc)
> +{
> +       uint64_t width, offset;
> +       uint32_t seq, index;
> +       int64_t pmc;
> +
> +       for (;;) {
> +               seq = pc->lock;
> +               rte_compiler_barrier();
> +               index = pc->index;
> +               offset = pc->offset;
> +               width = pc->pmc_width;
> +
> +               /* index set to 0 means that particular counter cannot be used */
> +               if (likely(pc->cap_user_rdpmc && index)) {
> +                       pmc = rte_pmu_pmc_read(index - 1);
> +                       pmc <<= 64 - width;
> +                       pmc >>= 64 - width;
> +                       offset += pmc;
> +               }
> +
> +               rte_compiler_barrier();
> +
> +               if (likely(pc->lock == seq))
> +                       return offset;
> +       }
> +
> +       return 0;
> +}
> +
> +/**
> + * @internal
> + *
> + * Enable group of events on the calling lcore.
> + *
> + * @return
> + *   0 in case of success, negative value otherwise.
> + */
> +__rte_internal

Unless I missed something, this symbol is called from rte_pmu_read()
so this makes rte_pmu_read() itself internal.
So external applications won't be able to use the PMU API.

This can probably be confirmed by adding some call to the PMU API in
an examples/.


> +int
> +rte_pmu_enable_group(void);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Initialize PMU library.
> + *
> + * @return
> + *   0 in case of success, negative value otherwise.
> + */
> +__rte_experimental
> +int
> +rte_pmu_init(void);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Finalize PMU library. This should be called after PMU counters are no longer being read.
> + */
> +__rte_experimental
> +void
> +rte_pmu_fini(void);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Add event to the group of enabled events.
> + *
> + * @param name
> + *   Name of an event listed under /sys/bus/event_source/devices/pmu/events.
> + * @return
> + *   Event index in case of success, negative value otherwise.
> + */
> +__rte_experimental
> +int
> +rte_pmu_add_event(const char *name);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Read hardware counter configured to count occurrences of an event.
> + *
> + * @param index
> + *   Index of an event to be read.
> + * @return
> + *   Event value read from register. In case of errors or lack of support
> + *   0 is returned. In other words, stream of zeros in a trace file
> + *   indicates problem with reading particular PMU event register.
> + */
> +__rte_experimental
> +static __rte_always_inline uint64_t
> +rte_pmu_read(unsigned int index)
> +{
> +       struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
> +       int ret;
> +
> +       if (unlikely(!rte_pmu.initialized))
> +               return 0;
> +
> +       if (unlikely(!group->enabled)) {
> +               ret = rte_pmu_enable_group();
> +               if (ret)
> +                       return 0;
> +       }
> +
> +       if (unlikely(index >= rte_pmu.num_group_events))
> +               return 0;
> +
> +       return rte_pmu_read_userpage(group->mmap_pages[index]);
> +}
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_PMU_H_ */
> diff --git a/lib/pmu/version.map b/lib/pmu/version.map
> new file mode 100644
> index 0000000000..50fb0f354e
> --- /dev/null
> +++ b/lib/pmu/version.map
> @@ -0,0 +1,20 @@
> +DPDK_23 {
> +       local: *;
> +};
> +
> +EXPERIMENTAL {
> +       global:
> +
> +       per_lcore__event_group;
> +       rte_pmu;
> +       rte_pmu_add_event;
> +       rte_pmu_fini;
> +       rte_pmu_init;
> +       rte_pmu_read;
> +};
> +
> +INTERNAL {
> +       global:
> +
> +       rte_pmu_enable_group;
> +};
> --
> 2.34.1
>
  
Tomasz Duszynski Feb. 9, 2023, 11:09 a.m. UTC | #2
Hi David, 

Thanks for review. Comments inline. 

>-----Original Message-----
>From: David Marchand <david.marchand@redhat.com>
>Sent: Monday, February 6, 2023 12:03 PM
>To: Tomasz Duszynski <tduszynski@marvell.com>
>Cc: dev@dpdk.org; Thomas Monjalon <thomas@monjalon.net>; roretzla@linux.microsoft.com;
>Ruifeng.Wang@arm.com; bruce.richardson@intel.com; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
>mattias.ronnblom@ericsson.com; mb@smartsharesystems.com; zhoumin@loongson.cn
>Subject: [EXT] Re: [PATCH v9 1/4] lib: add generic support for reading PMU events
>
>External Email
>
>----------------------------------------------------------------------
>Hello,
>
>On Thu, Feb 2, 2023 at 1:50 PM Tomasz Duszynski <tduszynski@marvell.com> wrote:
>>
>> Add support for programming PMU counters and reading their values in
>> runtime bypassing kernel completely.
>>
>> This is especially useful in cases where CPU cores are isolated
>> (nohz_full) i.e run dedicated tasks. In such cases one cannot use
>> standard perf utility without sacrificing latency and performance.
>
>For my understanding, what OS capability/permission are required to use this library?
>

On x86 it is sufficient for self-monitoring to have the kernel built with perf events enabled
and the /proc/sys/kernel/perf_event_paranoid knob set to 2, which
should be the default value anyway, unless changed by some scripts. 

On ARM64 you need to additionally set /proc/sys/kernel/perf_user_access to bypass
kernel when accessing hw counters. 

>
>>
>> Signed-off-by: Tomasz Duszynski <tduszynski@marvell.com>
>> Acked-by: Morten Brørup <mb@smartsharesystems.com>
>> ---
>>  MAINTAINERS                            |   5 +
>>  app/test/meson.build                   |   1 +
>>  app/test/test_pmu.c                    |  55 +++
>>  doc/api/doxy-api-index.md              |   3 +-
>>  doc/api/doxy-api.conf.in               |   1 +
>>  doc/guides/prog_guide/profile_app.rst  |   8 +
>>  doc/guides/rel_notes/release_23_03.rst |   9 +
>>  lib/meson.build                        |   1 +
>>  lib/pmu/meson.build                    |  13 +
>>  lib/pmu/pmu_private.h                  |  29 ++
>>  lib/pmu/rte_pmu.c                      | 464 +++++++++++++++++++++++++
>>  lib/pmu/rte_pmu.h                      | 205 +++++++++++
>>  lib/pmu/version.map                    |  20 ++
>>  13 files changed, 813 insertions(+), 1 deletion(-)  create mode
>> 100644 app/test/test_pmu.c  create mode 100644 lib/pmu/meson.build
>> create mode 100644 lib/pmu/pmu_private.h  create mode 100644
>> lib/pmu/rte_pmu.c  create mode 100644 lib/pmu/rte_pmu.h  create mode
>> 100644 lib/pmu/version.map
>>
>> diff --git a/MAINTAINERS b/MAINTAINERS index 9a0f416d2e..9f13eafd95
>> 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -1697,6 +1697,11 @@ M: Nithin Dabilpuram <ndabilpuram@marvell.com>
>>  M: Pavan Nikhilesh <pbhagavatula@marvell.com>
>>  F: lib/node/
>>
>> +PMU - EXPERIMENTAL
>> +M: Tomasz Duszynski <tduszynski@marvell.com>
>> +F: lib/pmu/
>> +F: app/test/test_pmu*
>> +
>>
>>  Test Applications
>>  -----------------
>> diff --git a/app/test/meson.build b/app/test/meson.build index
>> f34d19e3c3..7b6b69dcf1 100644
>> --- a/app/test/meson.build
>> +++ b/app/test/meson.build
>> @@ -111,6 +111,7 @@ test_sources = files(
>>          'test_reciprocal_division_perf.c',
>>          'test_red.c',
>>          'test_pie.c',
>> +        'test_pmu.c',
>>          'test_reorder.c',
>>          'test_rib.c',
>>          'test_rib6.c',
>
>This code adds a new test.
>This test should be added to an existing testsuite, like fast-tests etc...
>
>
>> diff --git a/app/test/test_pmu.c b/app/test/test_pmu.c new file mode
>> 100644 index 0000000000..a9bfb1a427
>> --- /dev/null
>> +++ b/app/test/test_pmu.c
>> @@ -0,0 +1,55 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(C) 2023 Marvell International Ltd.
>> + */
>> +
>> +#include "test.h"
>> +
>> +#ifndef RTE_EXEC_ENV_LINUX
>> +
>> +static int
>> +test_pmu(void)
>> +{
>> +       printf("pmu_autotest only supported on Linux, skipping test\n");
>> +       return TEST_SKIPPED;
>> +}
>> +
>> +#else
>> +
>> +#include <rte_pmu.h>
>> +
>> +static int
>> +test_pmu_read(void)
>> +{
>> +       int tries = 10, event = -1;
>> +       uint64_t val = 0;
>> +
>> +       if (rte_pmu_init() < 0)
>> +               return TEST_FAILED;
>> +
>> +       while (tries--)
>> +               val += rte_pmu_read(event);
>> +
>> +       rte_pmu_fini();
>> +
>> +       return val ? TEST_SUCCESS : TEST_FAILED; }
>> +
>> +static struct unit_test_suite pmu_tests = {
>> +       .suite_name = "pmu autotest",
>> +       .setup = NULL,
>> +       .teardown = NULL,
>> +       .unit_test_cases = {
>> +               TEST_CASE(test_pmu_read),
>> +               TEST_CASES_END()
>> +       }
>> +};
>> +
>> +static int
>> +test_pmu(void)
>> +{
>> +       return unit_test_suite_runner(&pmu_tests);
>> +}
>> +
>> +#endif /* RTE_EXEC_ENV_LINUX */
>> +
>> +REGISTER_TEST_COMMAND(pmu_autotest, test_pmu);
>> diff --git a/doc/api/doxy-api-index.md b/doc/api/doxy-api-index.md
>> index de488c7abf..7f1938f92f 100644
>> --- a/doc/api/doxy-api-index.md
>> +++ b/doc/api/doxy-api-index.md
>> @@ -222,7 +222,8 @@ The public API headers are grouped by topics:
>>    [log](@ref rte_log.h),
>>    [errno](@ref rte_errno.h),
>>    [trace](@ref rte_trace.h),
>> -  [trace_point](@ref rte_trace_point.h)
>> +  [trace_point](@ref rte_trace_point.h),  [pmu](@ref rte_pmu.h)
>>
>>  - **misc**:
>>    [EAL config](@ref rte_eal.h),
>> diff --git a/doc/api/doxy-api.conf.in b/doc/api/doxy-api.conf.in index
>> f0886c3bd1..920e615996 100644
>> --- a/doc/api/doxy-api.conf.in
>> +++ b/doc/api/doxy-api.conf.in
>> @@ -63,6 +63,7 @@ INPUT                   = @TOPDIR@/doc/api/doxy-api-index.md \
>>                            @TOPDIR@/lib/pci \
>>                            @TOPDIR@/lib/pdump \
>>                            @TOPDIR@/lib/pipeline \
>> +                          @TOPDIR@/lib/pmu \
>>                            @TOPDIR@/lib/port \
>>                            @TOPDIR@/lib/power \
>>                            @TOPDIR@/lib/rawdev \ diff --git
>> a/doc/guides/prog_guide/profile_app.rst
>> b/doc/guides/prog_guide/profile_app.rst
>> index 14292d4c25..a8b501fe0c 100644
>> --- a/doc/guides/prog_guide/profile_app.rst
>> +++ b/doc/guides/prog_guide/profile_app.rst
>> @@ -7,6 +7,14 @@ Profile Your Application  The following sections
>> describe methods of profiling DPDK applications on  different
>> architectures.
>>
>> +Performance counter based profiling
>> +-----------------------------------
>> +
>> +Majority of architectures support some sort hardware measurement unit
>> +which provides a set of programmable counters that monitor specific
>> +events. There are different tools which can gather that information,
>> +perf being an example here. Though in some scenarios, eg. when CPU
>> +cores are isolated (nohz_full) and run dedicated tasks, using perf is less than ideal. In such
>cases one can read specific events directly from application via ``rte_pmu_read()``.
>
>We need a common definition in the documentation of what PMU stands for and use it consistently.
>I am not sure this documentation is the best place, but at least, I'd prefer we go with
>"Performance Monitoring Unit" and stick to it.

For the time being I think it's good enough. Frankly, I don't have a better idea of where to put that. 

>
>Plus, this block is a bit hard to read too, what do you think of:
>
>"""
>A majority of architectures support some performance monitoring unit (PMU).
>Such unit provides programmable counters that monitor specific events.
>
>Different tools gather that information, like for example perf.
>However, in some scenarios when CPU cores are isolated (nohz_full) and run dedicated tasks,
>interrupting those tasks with perf may be undesirable.
>In such cases, an application can use the PMU library to read such events via ``rte_pmu_read()``.
>"""
>
>And, a double newline is used between sections in this doc.
>
>

No problem. 

>>
>>  Profiling on x86
>>  ----------------
>> diff --git a/doc/guides/rel_notes/release_23_03.rst
>> b/doc/guides/rel_notes/release_23_03.rst
>> index 73f5d94e14..733541d56c 100644
>> --- a/doc/guides/rel_notes/release_23_03.rst
>> +++ b/doc/guides/rel_notes/release_23_03.rst
>> @@ -55,10 +55,19 @@ New Features
>>       Also, make sure to start the actual text at the margin.
>>       =======================================================
>>
>> +* **Added PMU library.**
>> +
>> +  Added a new PMU (performance measurement unit) library which allows
>> + applications
>
>Performance Monitoring Unit.
>
>> +  to perform self monitoring activities without depending on external utilities like perf.
>
>> +  After integration with :doc:`../prog_guide/trace_lib` data gathered
>> + from hardware counters  can be stored in CTF format for further analysis.
>
>Afaiu, this integration comes later in the series.
>This part of the RN update should go with it.
>
>
>> +
>>  * **Updated AMD axgbe driver.**
>>
>>    * Added multi-process support.
>>
>> +* **Added multi-process support for axgbe PMD.**
>> +
>>  * **Updated Corigine nfp driver.**
>>
>
>Unrelated rebase damage.. please pay attention to such detail.
>

Thanks, that crept in somehow. 

>
>>    * Added support for meter options.
>> diff --git a/lib/meson.build b/lib/meson.build index
>> a90fee31b7..7132131b5c 100644
>> --- a/lib/meson.build
>> +++ b/lib/meson.build
>> @@ -11,6 +11,7 @@
>>  libraries = [
>>          'kvargs', # eal depends on kvargs
>>          'telemetry', # basic info querying
>> +        'pmu',
>>          'eal', # everything depends on eal
>>          'ring',
>>          'rcu', # rcu depends on ring
>> diff --git a/lib/pmu/meson.build b/lib/pmu/meson.build new file mode
>> 100644 index 0000000000..a4160b494e
>> --- /dev/null
>> +++ b/lib/pmu/meson.build
>> @@ -0,0 +1,13 @@
>> +# SPDX-License-Identifier: BSD-3-Clause # Copyright(C) 2023 Marvell
>> +International Ltd.
>> +
>> +if not is_linux
>> +    build = false
>> +    reason = 'only supported on Linux'
>> +    subdir_done()
>> +endif
>> +
>> +includes = [global_inc]
>> +
>> +sources = files('rte_pmu.c')
>> +headers = files('rte_pmu.h')
>> diff --git a/lib/pmu/pmu_private.h b/lib/pmu/pmu_private.h new file
>> mode 100644 index 0000000000..849549b125
>> --- /dev/null
>> +++ b/lib/pmu/pmu_private.h
>> @@ -0,0 +1,29 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2023 Marvell
>> + */
>> +
>> +#ifndef _PMU_PRIVATE_H_
>> +#define _PMU_PRIVATE_H_
>> +
>> +/**
>> + * Architecture specific PMU init callback.
>> + *
>> + * @return
>> + *   0 in case of success, negative value otherwise.
>> + */
>> +int
>> +pmu_arch_init(void);
>> +
>> +/**
>> + * Architecture specific PMU cleanup callback.
>> + */
>> +void
>> +pmu_arch_fini(void);
>> +
>> +/**
>> + * Apply architecture specific settings to config before passing it to syscall.
>
>Please describe config[].
>

Well, the problem here is that each and every arch may expect different config so 
anyone adding new stuff would need to consult kernel sources anyway.

What kind of documentation would you like to see here?

>
>> + */
>> +void
>> +pmu_arch_fixup_config(uint64_t config[3]);
>> +
>> +#endif /* _PMU_PRIVATE_H_ */
>> diff --git a/lib/pmu/rte_pmu.c b/lib/pmu/rte_pmu.c new file mode
>> 100644 index 0000000000..4cf3161155
>> --- /dev/null
>> +++ b/lib/pmu/rte_pmu.c
>> @@ -0,0 +1,464 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(C) 2023 Marvell International Ltd.
>> + */
>> +
>> +#include <ctype.h>
>> +#include <dirent.h>
>> +#include <errno.h>
>> +#include <regex.h>
>> +#include <stdlib.h>
>> +#include <string.h>
>> +#include <sys/ioctl.h>
>> +#include <sys/mman.h>
>> +#include <sys/queue.h>
>> +#include <sys/syscall.h>
>> +#include <unistd.h>
>
>Asking to be sure because I did not check:
>do we need all those includes, or is this just copy/pasted from somewhere else?
>

No, it was not copy/pasted. Each header exports something used in this file. I'll
double check whether all of them are still required.

>
>> +
>> +#include <rte_atomic.h>
>> +#include <rte_per_lcore.h>
>> +#include <rte_pmu.h>
>> +#include <rte_spinlock.h>
>> +#include <rte_tailq.h>
>> +
>> +#include "pmu_private.h"
>> +
>> +#define EVENT_SOURCE_DEVICES_PATH "/sys/bus/event_source/devices"
>> +
>> +#ifndef GENMASK_ULL
>
>This macro is copy/pasted all over the dpdk tree...
>This is worth a cleanup later, read: I am not asking for it as part of this series.
>
>However, here, there is no need for protecting against its definition.
>

Okay. 

>
>> +#define GENMASK_ULL(h, l) ((~0ULL - (1ULL << (l)) + 1) & (~0ULL >>
>> +((64 - 1 - (h))))) #endif
>> +
>> +#ifndef FIELD_PREP
>> +#define FIELD_PREP(m, v) (((uint64_t)(v) << (__builtin_ffsll(m) - 1))
>> +& (m)) #endif
>
>Idem.
>
>
>> +
>> +RTE_DEFINE_PER_LCORE(struct rte_pmu_event_group, _event_group);
>> +struct rte_pmu rte_pmu;
>> +
>> +/*
>> + * Following __rte_weak functions provide default no-op.
>> +Architectures should override them if
>> + * necessary.
>> + */
>
>Prefer using per architectures #ifdef.
>It is easier to get a simple link error than use weak symbols that make it look like it could work
>on some arch.
>

The rationale for that was actually to not break anything. That means you can compile that code
for every arch, except that on unsupported ones you'll see a stream of zeros inside a trace file. 

>
>> +
>> +int
>> +__rte_weak pmu_arch_init(void)
>> +{
>> +       return 0;
>
>Add a debug log message indicating that this arch does not support PMU.
>

Prove me wrong, but logs are part of eal, and unless they are moved to a separate 
library this lib shouldn't call these APIs. Otherwise we'll introduce a dependency
on eal, which we wanted to avoid in the first place. 

>
>> +}
>> +
>> +void
>> +__rte_weak pmu_arch_fini(void)
>> +{
>> +}
>> +
>> +void
>> +__rte_weak pmu_arch_fixup_config(uint64_t __rte_unused config[3]) { }
>> +
>> +static int
>> +get_term_format(const char *name, int *num, uint64_t *mask) {
>> +       char *config = NULL;
>> +       char path[PATH_MAX];
>> +       int high, low, ret;
>> +       FILE *fp;
>
>Reverse xmas tree when possible.
>

Okay. 

>
>> +
>> +       /* quiesce -Wmaybe-uninitialized warning */
>
>This comment just seems to be a note for yourself.
>What was the issue exactly?
>

Generally speaking, the compiler thinks that the function calling this function may use
'num' even though the callee returned an error. 

[1/198] Compiling C object lib/librte_pmu.a.p/pmu_rte_pmu.c.o
In function _parse_event_,
    inlined from _get_event_config_ at ../lib/pmu/rte_pmu.c:157:9:
../lib/pmu/rte_pmu.c:129:23: warning: _num_ may be used uninitialized [-Wmaybe-uninitialized]
  129 |                 config[num] |= FIELD_PREP(mask, val);
      |                 ~~~~~~^~~~~
../lib/pmu/rte_pmu.c: In function _get_event_config_:
../lib/pmu/rte_pmu.c:107:13: note: _num_ was declared here
  107 |         int num, ret, val;
      |

>
>> +       *num = 0;
>> +       *mask = 0;
>> +
>> +       snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/format/%s", rte_pmu.name,
>name);
>> +       fp = fopen(path, "r");
>> +       if (fp == NULL)
>> +               return -errno;
>> +
>> +       errno = 0;
>> +       ret = fscanf(fp, "%m[^:]:%d-%d", &config, &low, &high);
>> +       if (ret < 2) {
>> +               ret = -ENODATA;
>> +               goto out;
>> +       }
>> +       if (errno) {
>> +               ret = -errno;
>> +               goto out;
>> +       }
>> +
>> +       if (ret == 2)
>> +               high = low;
>> +
>> +       *mask = GENMASK_ULL(high, low);
>> +       /* Last digit should be [012]. If last digit is missing 0 is implied. */
>> +       *num = config[strlen(config) - 1];
>> +       *num = isdigit(*num) ? *num - '0' : 0;
>> +
>> +       ret = 0;
>> +out:
>> +       free(config);
>> +       fclose(fp);
>> +
>> +       return ret;
>> +}
>> +
>> +static int
>> +parse_event(char *buf, uint64_t config[3]) {
>> +       char *token, *term;
>> +       int num, ret, val;
>> +       uint64_t mask;
>> +
>> +       config[0] = config[1] = config[2] = 0;
>> +
>> +       token = strtok(buf, ",");
>> +       while (token) {
>> +               errno = 0;
>> +               /* <term>=<value> */
>> +               ret = sscanf(token, "%m[^=]=%i", &term, &val);
>> +               if (ret < 1)
>> +                       return -ENODATA;
>> +               if (errno)
>> +                       return -errno;
>> +               if (ret == 1)
>> +                       val = 1;
>> +
>> +               ret = get_term_format(term, &num, &mask);
>> +               free(term);
>> +               if (ret)
>> +                       return ret;
>> +
>> +               config[num] |= FIELD_PREP(mask, val);
>> +               token = strtok(NULL, ",");
>> +       }
>> +
>> +       return 0;
>> +}
>> +
>> +static int
>> +get_event_config(const char *name, uint64_t config[3]) {
>> +       char path[PATH_MAX], buf[BUFSIZ];
>> +       FILE *fp;
>> +       int ret;
>> +
>> +       snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name,
>name);
>> +       fp = fopen(path, "r");
>> +       if (fp == NULL)
>> +               return -errno;
>> +
>> +       ret = fread(buf, 1, sizeof(buf), fp);
>> +       if (ret == 0) {
>> +               fclose(fp);
>> +
>> +               return -EINVAL;
>> +       }
>> +       fclose(fp);
>> +       buf[ret] = '\0';
>> +
>> +       return parse_event(buf, config); }
>> +
>> +static int
>> +do_perf_event_open(uint64_t config[3], int group_fd) {
>> +       struct perf_event_attr attr = {
>> +               .size = sizeof(struct perf_event_attr),
>> +               .type = PERF_TYPE_RAW,
>> +               .exclude_kernel = 1,
>> +               .exclude_hv = 1,
>> +               .disabled = 1,
>> +       };
>> +
>> +       pmu_arch_fixup_config(config);
>> +
>> +       attr.config = config[0];
>> +       attr.config1 = config[1];
>> +       attr.config2 = config[2];
>> +
>> +       return syscall(SYS_perf_event_open, &attr, 0, -1, group_fd,
>> +0); }
>> +
>> +static int
>> +open_events(struct rte_pmu_event_group *group) {
>> +       struct rte_pmu_event *event;
>> +       uint64_t config[3];
>> +       int num = 0, ret;
>> +
>> +       /* group leader gets created first, with fd = -1 */
>> +       group->fds[0] = -1;
>> +
>> +       TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
>> +               ret = get_event_config(event->name, config);
>> +               if (ret)
>> +                       continue;
>> +
>> +               ret = do_perf_event_open(config, group->fds[0]);
>> +               if (ret == -1) {
>> +                       ret = -errno;
>> +                       goto out;
>> +               }
>> +
>> +               group->fds[event->index] = ret;
>> +               num++;
>> +       }
>> +
>> +       return 0;
>> +out:
>> +       for (--num; num >= 0; num--) {
>> +               close(group->fds[num]);
>> +               group->fds[num] = -1;
>> +       }
>> +
>> +
>> +       return ret;
>> +}
>> +
>> +static int
>> +mmap_events(struct rte_pmu_event_group *group) {
>> +       long page_size = sysconf(_SC_PAGE_SIZE);
>> +       unsigned int i;
>> +       void *addr;
>> +       int ret;
>> +
>> +       for (i = 0; i < rte_pmu.num_group_events; i++) {
>> +               addr = mmap(0, page_size, PROT_READ, MAP_SHARED, group->fds[i], 0);
>> +               if (addr == MAP_FAILED) {
>> +                       ret = -errno;
>> +                       goto out;
>> +               }
>> +
>> +               group->mmap_pages[i] = addr;
>> +       }
>> +
>> +       return 0;
>> +out:
>> +       for (; i; i--) {
>> +               munmap(group->mmap_pages[i - 1], page_size);
>> +               group->mmap_pages[i - 1] = NULL;
>> +       }
>> +
>> +       return ret;
>> +}
>> +
>> +static void
>> +cleanup_events(struct rte_pmu_event_group *group) {
>> +       unsigned int i;
>> +
>> +       if (group->fds[0] != -1)
>> +               ioctl(group->fds[0], PERF_EVENT_IOC_DISABLE,
>> + PERF_IOC_FLAG_GROUP);
>> +
>> +       for (i = 0; i < rte_pmu.num_group_events; i++) {
>> +               if (group->mmap_pages[i]) {
>> +                       munmap(group->mmap_pages[i], sysconf(_SC_PAGE_SIZE));
>> +                       group->mmap_pages[i] = NULL;
>> +               }
>> +
>> +               if (group->fds[i] != -1) {
>> +                       close(group->fds[i]);
>> +                       group->fds[i] = -1;
>> +               }
>> +       }
>> +
>> +       group->enabled = false;
>> +}
>> +
>> +int __rte_noinline
>
>This symbol is exported out of this library, no need for noinline.
>

I recall that it was added deliberately because this function was actually being inlined.
But given that the code changed a bit through multiple revisions, this may not be necessary anymore. 

>
>> +rte_pmu_enable_group(void)
>> +{
>> +       struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
>> +       int ret;
>> +
>> +       if (rte_pmu.num_group_events == 0)
>> +               return -ENODEV;
>> +
>> +       ret = open_events(group);
>> +       if (ret)
>> +               goto out;
>> +
>> +       ret = mmap_events(group);
>> +       if (ret)
>> +               goto out;
>> +
>> +       if (ioctl(group->fds[0], PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) {
>> +               ret = -errno;
>> +               goto out;
>> +       }
>> +
>> +       if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
>> +               ret = -errno;
>> +               goto out;
>> +       }
>> +
>> +       rte_spinlock_lock(&rte_pmu.lock);
>> +       TAILQ_INSERT_TAIL(&rte_pmu.event_group_list, group, next);
>> +       rte_spinlock_unlock(&rte_pmu.lock);
>> +       group->enabled = true;
>> +
>> +       return 0;
>> +
>> +out:
>> +       cleanup_events(group);
>> +
>> +       return ret;
>> +}
>> +
>> +static int
>> +scan_pmus(void)
>> +{
>> +       char path[PATH_MAX];
>> +       struct dirent *dent;
>> +       const char *name;
>> +       DIR *dirp;
>> +
>> +       dirp = opendir(EVENT_SOURCE_DEVICES_PATH);
>> +       if (dirp == NULL)
>> +               return -errno;
>> +
>> +       while ((dent = readdir(dirp))) {
>> +               name = dent->d_name;
>> +               if (name[0] == '.')
>> +                       continue;
>> +
>> +               /* sysfs entry should either contain cpus or be a cpu */
>> +               if (!strcmp(name, "cpu"))
>> +                       break;
>> +
>> +               snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/cpus", name);
>> +               if (access(path, F_OK) == 0)
>> +                       break;
>> +       }
>> +
>> +       if (dent) {
>> +               rte_pmu.name = strdup(name);
>> +               if (rte_pmu.name == NULL) {
>> +                       closedir(dirp);
>> +
>> +                       return -ENOMEM;
>> +               }
>> +       }
>> +
>> +       closedir(dirp);
>> +
>> +       return rte_pmu.name ? 0 : -ENODEV; }
>> +
>> +static struct rte_pmu_event *
>> +new_event(const char *name)
>> +{
>> +       struct rte_pmu_event *event;
>> +
>> +       event = calloc(1, sizeof(*event));
>> +       if (event == NULL)
>> +               goto out;
>> +
>> +       event->name = strdup(name);
>> +       if (event->name == NULL) {
>> +               free(event);
>> +               event = NULL;
>> +       }
>> +
>> +out:
>> +       return event;
>> +}
>> +
>> +static void
>> +free_event(struct rte_pmu_event *event) {
>> +       free(event->name);
>> +       free(event);
>> +}
>> +
>> +int
>> +rte_pmu_add_event(const char *name)
>> +{
>> +       struct rte_pmu_event *event;
>> +       char path[PATH_MAX];
>> +
>> +       if (rte_pmu.name == NULL)
>> +               return -ENODEV;
>> +
>> +       if (rte_pmu.num_group_events + 1 >= MAX_NUM_GROUP_EVENTS)
>> +               return -ENOSPC;
>> +
>> +       snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name,
>name);
>> +       if (access(path, R_OK))
>> +               return -ENODEV;
>> +
>> +       TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
>> +               if (!strcmp(event->name, name))
>> +                       return event->index;
>> +               continue;
>> +       }
>> +
>> +       event = new_event(name);
>> +       if (event == NULL)
>> +               return -ENOMEM;
>> +
>> +       event->index = rte_pmu.num_group_events++;
>> +       TAILQ_INSERT_TAIL(&rte_pmu.event_list, event, next);
>> +
>> +       return event->index;
>> +}
>> +
>> +int
>> +rte_pmu_init(void)
>> +{
>> +       int ret;
>> +
>> +       /* Allow calling init from multiple contexts within a single thread. This simplifies
>> +        * resource management a bit e.g in case fast-path tracepoint has already been enabled
>> +        * via command line but application doesn't care enough and performs init/fini again.
>> +        */
>> +       if (rte_pmu.initialized) {
>
>This is an integer so check against 0 explicitly (there may be other cases in this patch, I did not
>recheck the whole patch).
>

Okay. 

>
>> +               rte_pmu.initialized++;
>> +               return 0;
>> +       }
>> +
>> +       ret = scan_pmus();
>> +       if (ret)
>> +               goto out;
>> +
>> +       ret = pmu_arch_init();
>> +       if (ret)
>> +               goto out;
>> +
>> +       TAILQ_INIT(&rte_pmu.event_list);
>> +       TAILQ_INIT(&rte_pmu.event_group_list);
>> +       rte_spinlock_init(&rte_pmu.lock);
>> +       rte_pmu.initialized = 1;
>> +
>> +       return 0;
>> +out:
>> +       free(rte_pmu.name);
>> +       rte_pmu.name = NULL;
>> +
>> +       return ret;
>> +}
>> +
>> +void
>> +rte_pmu_fini(void)
>> +{
>> +       struct rte_pmu_event_group *group, *tmp_group;
>> +       struct rte_pmu_event *event, *tmp_event;
>> +
>> +       /* cleanup once init count drops to zero */
>> +       if (!rte_pmu.initialized || --rte_pmu.initialized)
>> +               return;
>> +
>> +       RTE_TAILQ_FOREACH_SAFE(event, &rte_pmu.event_list, next, tmp_event) {
>> +               TAILQ_REMOVE(&rte_pmu.event_list, event, next);
>> +               free_event(event);
>> +       }
>> +
>> +       RTE_TAILQ_FOREACH_SAFE(group, &rte_pmu.event_group_list, next, tmp_group) {
>> +               TAILQ_REMOVE(&rte_pmu.event_group_list, group, next);
>> +               cleanup_events(group);
>> +       }
>> +
>> +       pmu_arch_fini();
>> +       free(rte_pmu.name);
>> +       rte_pmu.name = NULL;
>> +       rte_pmu.num_group_events = 0;
>> +}
>> diff --git a/lib/pmu/rte_pmu.h b/lib/pmu/rte_pmu.h new file mode
>> 100644 index 0000000000..e360375a0c
>> --- /dev/null
>> +++ b/lib/pmu/rte_pmu.h
>> @@ -0,0 +1,205 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2023 Marvell
>> + */
>> +
>> +#ifndef _RTE_PMU_H_
>> +#define _RTE_PMU_H_
>> +
>> +/**
>> + * @file
>> + *
>> + * PMU event tracing operations
>> + *
>> + * This file defines generic API and types necessary to setup PMU and
>> + * read selected counters in runtime.
>> + */
>> +
>> +#ifdef __cplusplus
>> +extern "C" {
>> +#endif
>> +
>> +#include <linux/perf_event.h>
>> +
>> +#include <rte_atomic.h>
>> +#include <rte_branch_prediction.h>
>> +#include <rte_common.h>
>> +#include <rte_compat.h>
>> +#include <rte_spinlock.h>
>> +
>> +/** Maximum number of events in a group */ #define
>> +MAX_NUM_GROUP_EVENTS 8
>> +
>> +/**
>> + * A structure describing a group of events.
>> + */
>> +struct rte_pmu_event_group {
>> +       struct perf_event_mmap_page *mmap_pages[MAX_NUM_GROUP_EVENTS]; /**< array of user pages
>*/
>> +       int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
>> +       bool enabled; /**< true if group was enabled on particular lcore */
>> +       TAILQ_ENTRY(rte_pmu_event_group) next; /**< list entry */ }
>> +__rte_cache_aligned;
>
>One problem for the future is that we have a fixed size fd array.

This number can be increased if needed, but the rationale for it was that
PMUs come with a relatively small number of hw counters. Here 
events are grouped together (i.e. scheduled together), which means there must 
be enough hw counters for all events for things to work.

>Do we need to expose this whole structure to the application?
>

Probably some part of it could have been hidden but all those structures are so small
that presumably such partitioning would not bring much to the table.

>
>> +
>> +/**
>> + * A structure describing an event.
>> + */
>> +struct rte_pmu_event {
>> +       char *name; /**< name of an event */
>> +       unsigned int index; /**< event index into fds/mmap_pages */
>
>This is an internal consideration.
>Do we need to expose this to the application?
>
>
>> +       TAILQ_ENTRY(rte_pmu_event) next; /**< list entry */ };
>> +
>> +/**
>> + * A PMU state container.
>> + */
>> +struct rte_pmu {
>> +       char *name; /**< name of core PMU listed under /sys/bus/event_source/devices */
>> +       rte_spinlock_t lock; /**< serialize access to event group list */
>> +       TAILQ_HEAD(, rte_pmu_event_group) event_group_list; /**< list of event groups */
>> +       unsigned int num_group_events; /**< number of events in a group */
>> +       TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
>> +       unsigned int initialized; /**< initialization counter */ };
>
>Idem, do we need to expose this to the application?
>
>
>> +
>> +/** lcore event group */
>> +RTE_DECLARE_PER_LCORE(struct rte_pmu_event_group, _event_group);
>> +
>> +/** PMU state container */
>> +extern struct rte_pmu rte_pmu;
>> +
>> +/** Each architecture supporting PMU needs to provide its own version
>> +*/ #ifndef rte_pmu_pmc_read #define rte_pmu_pmc_read(index) ({ 0; })
>> +#endif
>> +
>> +/**
>> + * @internal
>> + *
>> + * Read PMU counter.
>> + *
>> + * @param pc
>> + *   Pointer to the mmapped user page.
>> + * @return
>> + *   Counter value read from hardware.
>> + */
>> +__rte_internal
>> +static __rte_always_inline uint64_t
>> +rte_pmu_read_userpage(struct perf_event_mmap_page *pc) {
>> +       uint64_t width, offset;
>> +       uint32_t seq, index;
>> +       int64_t pmc;
>> +
>> +       for (;;) {
>> +               seq = pc->lock;
>> +               rte_compiler_barrier();
>> +               index = pc->index;
>> +               offset = pc->offset;
>> +               width = pc->pmc_width;
>> +
>> +               /* index set to 0 means that particular counter cannot be used */
>> +               if (likely(pc->cap_user_rdpmc && index)) {
>> +                       pmc = rte_pmu_pmc_read(index - 1);
>> +                       pmc <<= 64 - width;
>> +                       pmc >>= 64 - width;
>> +                       offset += pmc;
>> +               }
>> +
>> +               rte_compiler_barrier();
>> +
>> +               if (likely(pc->lock == seq))
>> +                       return offset;
>> +       }
>> +
>> +       return 0;
>> +}
>> +
>> +/**
>> + * @internal
>> + *
>> + * Enable group of events on the calling lcore.
>> + *
>> + * @return
>> + *   0 in case of success, negative value otherwise.
>> + */
>> +__rte_internal
>
>Unless I missed something, this symbol is called from rte_pmu_read() so this makes rte_pmu_read()
>itself internal.
>So external applications won't be able to use the PMU API.
>
>This can probably be confirmed by adding some call to the PMU API in an examples/.
>

Good point actually. This was not that obvious when I looked at the patch introducing that change. 
So in this case it needs to be exported, but given that the app should not call it itself, maybe I'll
just make the intent clear by renaming it, perhaps to __rte_pmu_enable_group() or something
similar. 

>
>> +int
>> +rte_pmu_enable_group(void);
>> +
>> +/**
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change without prior notice
>> + *
>> + * Initialize PMU library.
>> + *
>> + * @return
>> + *   0 in case of success, negative value otherwise.
>> + */
>> +__rte_experimental
>> +int
>> +rte_pmu_init(void);
>> +
>> +/**
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change without prior notice
>> + *
>> + * Finalize PMU library. This should be called after PMU counters are no longer being read.
>> + */
>> +__rte_experimental
>> +void
>> +rte_pmu_fini(void);
>> +
>> +/**
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change without prior notice
>> + *
>> + * Add event to the group of enabled events.
>> + *
>> + * @param name
>> + *   Name of an event listed under /sys/bus/event_source/devices/pmu/events.
>> + * @return
>> + *   Event index in case of success, negative value otherwise.
>> + */
>> +__rte_experimental
>> +int
>> +rte_pmu_add_event(const char *name);
>> +
>> +/**
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change without prior notice
>> + *
>> + * Read hardware counter configured to count occurrences of an event.
>> + *
>> + * @param index
>> + *   Index of an event to be read.
>> + * @return
>> + *   Event value read from register. In case of errors or lack of support
>> + *   0 is returned. In other words, stream of zeros in a trace file
>> + *   indicates problem with reading particular PMU event register.
>> + */
>> +__rte_experimental
>> +static __rte_always_inline uint64_t
>> +rte_pmu_read(unsigned int index)
>> +{
>> +       struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
>> +       int ret;
>> +
>> +       if (unlikely(!rte_pmu.initialized))
>> +               return 0;
>> +
>> +       if (unlikely(!group->enabled)) {
>> +               ret = rte_pmu_enable_group();
>> +               if (ret)
>> +                       return 0;
>> +       }
>> +
>> +       if (unlikely(index >= rte_pmu.num_group_events))
>> +               return 0;
>> +
>> +       return rte_pmu_read_userpage(group->mmap_pages[index]);
>> +}
>> +
>> +#ifdef __cplusplus
>> +}
>> +#endif
>> +
>> +#endif /* _RTE_PMU_H_ */
>> diff --git a/lib/pmu/version.map b/lib/pmu/version.map new file mode
>> 100644 index 0000000000..50fb0f354e
>> --- /dev/null
>> +++ b/lib/pmu/version.map
>> @@ -0,0 +1,20 @@
>> +DPDK_23 {
>> +       local: *;
>> +};
>> +
>> +EXPERIMENTAL {
>> +       global:
>> +
>> +       per_lcore__event_group;
>> +       rte_pmu;
>> +       rte_pmu_add_event;
>> +       rte_pmu_fini;
>> +       rte_pmu_init;
>> +       rte_pmu_read;
>> +};
>> +
>> +INTERNAL {
>> +       global:
>> +
>> +       rte_pmu_enable_group;
>> +};
>> --
>> 2.34.1
>>
>
>
>--
>David Marchand
  

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index 9a0f416d2e..9f13eafd95 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1697,6 +1697,11 @@  M: Nithin Dabilpuram <ndabilpuram@marvell.com>
 M: Pavan Nikhilesh <pbhagavatula@marvell.com>
 F: lib/node/
 
+PMU - EXPERIMENTAL
+M: Tomasz Duszynski <tduszynski@marvell.com>
+F: lib/pmu/
+F: app/test/test_pmu*
+
 
 Test Applications
 -----------------
diff --git a/app/test/meson.build b/app/test/meson.build
index f34d19e3c3..7b6b69dcf1 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -111,6 +111,7 @@  test_sources = files(
         'test_reciprocal_division_perf.c',
         'test_red.c',
         'test_pie.c',
+        'test_pmu.c',
         'test_reorder.c',
         'test_rib.c',
         'test_rib6.c',
diff --git a/app/test/test_pmu.c b/app/test/test_pmu.c
new file mode 100644
index 0000000000..a9bfb1a427
--- /dev/null
+++ b/app/test/test_pmu.c
@@ -0,0 +1,55 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2023 Marvell International Ltd.
+ */
+
+#include "test.h"
+
+#ifndef RTE_EXEC_ENV_LINUX
+
+/* Stub: the PMU library relies on perf_event_open(2), so the test is
+ * Linux-only and skipped on every other exec environment.
+ */
+static int
+test_pmu(void)
+{
+	printf("pmu_autotest only supported on Linux, skipping test\n");
+	return TEST_SKIPPED;
+}
+
+#else
+
+#include <rte_pmu.h>
+
+/* Smoke test: initialize the library, read a counter a few times, tear down.
+ *
+ * NOTE(review): no event is ever registered via rte_pmu_add_event() and the
+ * index handed to rte_pmu_read() is -1, which wraps to a huge unsigned value;
+ * per the read contract every call then returns 0, so val stays 0 and the
+ * test always reports TEST_FAILED on Linux. Presumably an event should be
+ * added first — confirm intended behavior.
+ */
+static int
+test_pmu_read(void)
+{
+	int tries = 10, event = -1;
+	uint64_t val = 0;
+
+	if (rte_pmu_init() < 0)
+		return TEST_FAILED;
+
+	while (tries--)
+		val += rte_pmu_read(event);
+
+	rte_pmu_fini();
+
+	return val ? TEST_SUCCESS : TEST_FAILED;
+}
+
+static struct unit_test_suite pmu_tests = {
+	.suite_name = "pmu autotest",
+	.setup = NULL,
+	.teardown = NULL,
+	.unit_test_cases = {
+		TEST_CASE(test_pmu_read),
+		TEST_CASES_END()
+	}
+};
+
+/* Entry point registered below; runs the suite on Linux builds. */
+static int
+test_pmu(void)
+{
+	return unit_test_suite_runner(&pmu_tests);
+}
+
+#endif /* RTE_EXEC_ENV_LINUX */
+
+REGISTER_TEST_COMMAND(pmu_autotest, test_pmu);
diff --git a/doc/api/doxy-api-index.md b/doc/api/doxy-api-index.md
index de488c7abf..7f1938f92f 100644
--- a/doc/api/doxy-api-index.md
+++ b/doc/api/doxy-api-index.md
@@ -222,7 +222,8 @@  The public API headers are grouped by topics:
   [log](@ref rte_log.h),
   [errno](@ref rte_errno.h),
   [trace](@ref rte_trace.h),
-  [trace_point](@ref rte_trace_point.h)
+  [trace_point](@ref rte_trace_point.h),
+  [pmu](@ref rte_pmu.h)
 
 - **misc**:
   [EAL config](@ref rte_eal.h),
diff --git a/doc/api/doxy-api.conf.in b/doc/api/doxy-api.conf.in
index f0886c3bd1..920e615996 100644
--- a/doc/api/doxy-api.conf.in
+++ b/doc/api/doxy-api.conf.in
@@ -63,6 +63,7 @@  INPUT                   = @TOPDIR@/doc/api/doxy-api-index.md \
                           @TOPDIR@/lib/pci \
                           @TOPDIR@/lib/pdump \
                           @TOPDIR@/lib/pipeline \
+                          @TOPDIR@/lib/pmu \
                           @TOPDIR@/lib/port \
                           @TOPDIR@/lib/power \
                           @TOPDIR@/lib/rawdev \
diff --git a/doc/guides/prog_guide/profile_app.rst b/doc/guides/prog_guide/profile_app.rst
index 14292d4c25..a8b501fe0c 100644
--- a/doc/guides/prog_guide/profile_app.rst
+++ b/doc/guides/prog_guide/profile_app.rst
@@ -7,6 +7,14 @@  Profile Your Application
 The following sections describe methods of profiling DPDK applications on
 different architectures.
 
+Performance counter based profiling
+-----------------------------------
+
+The majority of architectures support some sort of hardware measurement unit which provides a set
+of programmable counters that monitor specific events. There are different tools which can gather
+that information, perf being an example here. Though in some scenarios, e.g. when CPU cores are
+isolated (nohz_full) and run dedicated tasks, using perf is less than ideal. In such cases one can
+read specific events directly from application via ``rte_pmu_read()``.
 
 Profiling on x86
 ----------------
diff --git a/doc/guides/rel_notes/release_23_03.rst b/doc/guides/rel_notes/release_23_03.rst
index 73f5d94e14..733541d56c 100644
--- a/doc/guides/rel_notes/release_23_03.rst
+++ b/doc/guides/rel_notes/release_23_03.rst
@@ -55,10 +55,19 @@  New Features
      Also, make sure to start the actual text at the margin.
      =======================================================
 
+* **Added PMU library.**
+
+  Added a new PMU (performance monitoring unit) library which allows applications
+  to perform self-monitoring activities without depending on external utilities like perf.
+  After integration with :doc:`../prog_guide/trace_lib` data gathered from hardware counters
+  can be stored in CTF format for further analysis.
+
 * **Updated AMD axgbe driver.**
 
   * Added multi-process support.
 
+* **Added multi-process support for axgbe PMD.**
+
 * **Updated Corigine nfp driver.**
 
   * Added support for meter options.
diff --git a/lib/meson.build b/lib/meson.build
index a90fee31b7..7132131b5c 100644
--- a/lib/meson.build
+++ b/lib/meson.build
@@ -11,6 +11,7 @@ 
 libraries = [
         'kvargs', # eal depends on kvargs
         'telemetry', # basic info querying
+        'pmu',
         'eal', # everything depends on eal
         'ring',
         'rcu', # rcu depends on ring
diff --git a/lib/pmu/meson.build b/lib/pmu/meson.build
new file mode 100644
index 0000000000..a4160b494e
--- /dev/null
+++ b/lib/pmu/meson.build
@@ -0,0 +1,13 @@ 
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(C) 2023 Marvell International Ltd.
+
+if not is_linux
+    build = false
+    reason = 'only supported on Linux'
+    subdir_done()
+endif
+
+includes = [global_inc]
+
+sources = files('rte_pmu.c')
+headers = files('rte_pmu.h')
diff --git a/lib/pmu/pmu_private.h b/lib/pmu/pmu_private.h
new file mode 100644
index 0000000000..849549b125
--- /dev/null
+++ b/lib/pmu/pmu_private.h
@@ -0,0 +1,29 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Marvell
+ */
+
+#ifndef _PMU_PRIVATE_H_
+#define _PMU_PRIVATE_H_
+
+/**
+ * Architecture specific PMU init callback.
+ *
+ * @return
+ *   0 in case of success, negative value otherwise.
+ */
+int
+pmu_arch_init(void);
+
+/**
+ * Architecture specific PMU cleanup callback.
+ */
+void
+pmu_arch_fini(void);
+
+/**
+ * Apply architecture specific settings to config before passing it to syscall.
+ */
+void
+pmu_arch_fixup_config(uint64_t config[3]);
+
+#endif /* _PMU_PRIVATE_H_ */
diff --git a/lib/pmu/rte_pmu.c b/lib/pmu/rte_pmu.c
new file mode 100644
index 0000000000..4cf3161155
--- /dev/null
+++ b/lib/pmu/rte_pmu.c
@@ -0,0 +1,464 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2023 Marvell International Ltd.
+ */
+
+#include <ctype.h>
+#include <dirent.h>
+#include <errno.h>
+#include <regex.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/queue.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include <rte_atomic.h>
+#include <rte_per_lcore.h>
+#include <rte_pmu.h>
+#include <rte_spinlock.h>
+#include <rte_tailq.h>
+
+#include "pmu_private.h"
+
+#define EVENT_SOURCE_DEVICES_PATH "/sys/bus/event_source/devices"
+
+#ifndef GENMASK_ULL
+#define GENMASK_ULL(h, l) ((~0ULL - (1ULL << (l)) + 1) & (~0ULL >> ((64 - 1 - (h)))))
+#endif
+
+#ifndef FIELD_PREP
+#define FIELD_PREP(m, v) (((uint64_t)(v) << (__builtin_ffsll(m) - 1)) & (m))
+#endif
+
+RTE_DEFINE_PER_LCORE(struct rte_pmu_event_group, _event_group);
+struct rte_pmu rte_pmu;
+
+/*
+ * Following __rte_weak functions provide default no-op. Architectures should override them if
+ * necessary.
+ */
+
+/* Default arch hook: no extra setup required; report success. */
+int
+__rte_weak pmu_arch_init(void)
+{
+	return 0;
+}
+
+/* Default arch hook: nothing to tear down. */
+void
+__rte_weak pmu_arch_fini(void)
+{
+}
+
+/* Default arch hook: leave the perf_event_attr config words untouched. */
+void
+__rte_weak pmu_arch_fixup_config(uint64_t __rte_unused config[3])
+{
+}
+
+/*
+ * Parse a sysfs format descriptor, e.g. "config1:1-8", giving the config
+ * word number (*num, one of 0/1/2 — 0 implied when the digit is absent) and
+ * a bit mask (*mask) covering bits low..high of that word. Descriptors live
+ * under /sys/bus/event_source/devices/<pmu>/format/<name>.
+ *
+ * Returns 0 on success, negative errno-style value otherwise.
+ */
+static int
+get_term_format(const char *name, int *num, uint64_t *mask)
+{
+	char *config = NULL;
+	char path[PATH_MAX];
+	int high, low, ret;
+	FILE *fp;
+
+	/* quiesce -Wmaybe-uninitialized warning */
+	*num = 0;
+	*mask = 0;
+
+	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/format/%s", rte_pmu.name, name);
+	fp = fopen(path, "r");
+	if (fp == NULL)
+		return -errno;
+
+	/* clear errno first: fscanf only sets it on failure, never to 0 */
+	errno = 0;
+	ret = fscanf(fp, "%m[^:]:%d-%d", &config, &low, &high);
+	if (ret < 2) {
+		ret = -ENODATA;
+		goto out;
+	}
+	if (errno) {
+		ret = -errno;
+		goto out;
+	}
+
+	/* two conversions means "configN:B" form — a single-bit range B-B */
+	if (ret == 2)
+		high = low;
+
+	*mask = GENMASK_ULL(high, low);
+	/* Last digit should be [012]. If last digit is missing 0 is implied. */
+	*num = config[strlen(config) - 1];
+	*num = isdigit(*num) ? *num - '0' : 0;
+
+	ret = 0;
+out:
+	/* config was allocated by fscanf's %m conversion (or is still NULL) */
+	free(config);
+	fclose(fp);
+
+	return ret;
+}
+
+/*
+ * Parse a comma-separated event description ("<term>=<value>,...") read from
+ * sysfs into the three perf_event_attr config words. A term given without a
+ * value is treated as <term>=1. Term names are resolved to a word/mask
+ * placement via get_term_format().
+ *
+ * NOTE(review): strtok() keeps static state and is not reentrant; presumably
+ * event registration is single-threaded init-time work — confirm callers
+ * are serialized.
+ *
+ * Returns 0 on success, negative errno-style value otherwise.
+ */
+static int
+parse_event(char *buf, uint64_t config[3])
+{
+	char *token, *term;
+	int num, ret, val;
+	uint64_t mask;
+
+	config[0] = config[1] = config[2] = 0;
+
+	token = strtok(buf, ",");
+	while (token) {
+		errno = 0;
+		/* <term>=<value> */
+		ret = sscanf(token, "%m[^=]=%i", &term, &val);
+		if (ret < 1)
+			return -ENODATA;
+		if (errno)
+			return -errno;
+		/* bare "<term>" (no '=') implies value 1 */
+		if (ret == 1)
+			val = 1;
+
+		ret = get_term_format(term, &num, &mask);
+		free(term);
+		if (ret)
+			return ret;
+
+		/* place the value into the right bits of the right config word */
+		config[num] |= FIELD_PREP(mask, val);
+		token = strtok(NULL, ",");
+	}
+
+	return 0;
+}
+
+/*
+ * Read an event description from
+ * /sys/bus/event_source/devices/<pmu>/events/<name> and parse it into the
+ * three perf_event_attr config words.
+ *
+ * Returns 0 on success, negative errno-style value otherwise.
+ */
+static int
+get_event_config(const char *name, uint64_t config[3])
+{
+	char path[PATH_MAX], buf[BUFSIZ];
+	FILE *fp;
+	size_t ret;
+
+	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name, name);
+	fp = fopen(path, "r");
+	if (fp == NULL)
+		return -errno;
+
+	/* Reserve one byte for the terminator. Reading sizeof(buf) bytes
+	 * previously made buf[ret] below an out-of-bounds write whenever the
+	 * sysfs file filled the whole buffer.
+	 */
+	ret = fread(buf, 1, sizeof(buf) - 1, fp);
+	fclose(fp);
+	if (ret == 0)
+		return -EINVAL;
+
+	buf[ret] = '\0';
+
+	return parse_event(buf, config);
+}
+
+/*
+ * Create one counter via perf_event_open(2) for the calling thread (pid = 0,
+ * cpu = -1). The event starts disabled, counts user space only (kernel and
+ * hypervisor excluded) and joins group_fd; passing -1 makes it a group
+ * leader.
+ *
+ * Returns the new event fd, or -1 with errno set by the syscall.
+ */
+static int
+do_perf_event_open(uint64_t config[3], int group_fd)
+{
+	struct perf_event_attr attr = {
+		.size = sizeof(struct perf_event_attr),
+		.type = PERF_TYPE_RAW,
+		.exclude_kernel = 1,
+		.exclude_hv = 1,
+		.disabled = 1,
+	};
+
+	/* let the architecture adjust the raw config words before the syscall */
+	pmu_arch_fixup_config(config);
+
+	attr.config = config[0];
+	attr.config1 = config[1];
+	attr.config2 = config[2];
+
+	return syscall(SYS_perf_event_open, &attr, 0, -1, group_fd, 0);
+}
+
+/*
+ * Open a perf event fd for every registered event; the first successfully
+ * opened event becomes the group leader (opened with group_fd = -1), the
+ * rest attach to it. Descriptors are stored at each event's index.
+ *
+ * Previously only fds[0] was initialized to -1: events skipped on a config
+ * error left other slots uninitialized, and the error path closed fds
+ * indexed by a success *count* although they were stored by event *index*.
+ * Initialize every slot upfront and unwind by slot instead.
+ *
+ * Returns 0 on success, negative errno value otherwise.
+ */
+static int
+open_events(struct rte_pmu_event_group *group)
+{
+	struct rte_pmu_event *event;
+	uint64_t config[3];
+	unsigned int i;
+	int ret;
+
+	/* group leader gets created first, with fd = -1 */
+	for (i = 0; i < MAX_NUM_GROUP_EVENTS; i++)
+		group->fds[i] = -1;
+
+	TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
+		ret = get_event_config(event->name, config);
+		if (ret)
+			continue;
+
+		ret = do_perf_event_open(config, group->fds[0]);
+		if (ret == -1) {
+			ret = -errno;
+			goto out;
+		}
+
+		group->fds[event->index] = ret;
+	}
+
+	return 0;
+out:
+	/* close whatever was opened; untouched slots are still -1 */
+	for (i = 0; i < MAX_NUM_GROUP_EVENTS; i++) {
+		if (group->fds[i] != -1) {
+			close(group->fds[i]);
+			group->fds[i] = -1;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * mmap() the perf user page of every event in the group; those pages allow
+ * rte_pmu_read_userpage() to read counters without further syscalls. On
+ * failure all mappings created so far are unwound.
+ *
+ * NOTE(review): assumes fds[0..num_group_events-1] are all valid
+ * descriptors; events skipped inside open_events() could leave holes —
+ * confirm.
+ *
+ * Returns 0 on success, negative errno value otherwise.
+ */
+static int
+mmap_events(struct rte_pmu_event_group *group)
+{
+	long page_size = sysconf(_SC_PAGE_SIZE);
+	unsigned int i;
+	void *addr;
+	int ret;
+
+	for (i = 0; i < rte_pmu.num_group_events; i++) {
+		addr = mmap(0, page_size, PROT_READ, MAP_SHARED, group->fds[i], 0);
+		if (addr == MAP_FAILED) {
+			ret = -errno;
+			goto out;
+		}
+
+		group->mmap_pages[i] = addr;
+	}
+
+	return 0;
+out:
+	/* unwind mappings [0, i) in reverse order */
+	for (; i; i--) {
+		munmap(group->mmap_pages[i - 1], page_size);
+		group->mmap_pages[i - 1] = NULL;
+	}
+
+	return ret;
+}
+
+/*
+ * Disable the whole group via its leader, then unmap every user page and
+ * close every descriptor, leaving the group marked as disabled. Safe to call
+ * on a partially set-up group: NULL pages and -1 fds are skipped.
+ */
+static void
+cleanup_events(struct rte_pmu_event_group *group)
+{
+	/* page size is invariant — query it once instead of per iteration */
+	long page_size = sysconf(_SC_PAGE_SIZE);
+	unsigned int i;
+
+	if (group->fds[0] != -1)
+		ioctl(group->fds[0], PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
+
+	for (i = 0; i < rte_pmu.num_group_events; i++) {
+		if (group->mmap_pages[i]) {
+			munmap(group->mmap_pages[i], page_size);
+			group->mmap_pages[i] = NULL;
+		}
+
+		if (group->fds[i] != -1) {
+			close(group->fds[i]);
+			group->fds[i] = -1;
+		}
+	}
+
+	group->enabled = false;
+}
+
+/*
+ * Lazily set up the calling lcore's event group on first rte_pmu_read():
+ * open all registered events, mmap their user pages, then reset and enable
+ * the whole group through the leader fd. On success the group is linked
+ * into the global list (under lock) so rte_pmu_fini() can find and release
+ * it later.
+ *
+ * Returns 0 on success, negative errno value otherwise.
+ */
+int __rte_noinline
+rte_pmu_enable_group(void)
+{
+	struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
+	int ret;
+
+	/* nothing registered via rte_pmu_add_event() */
+	if (rte_pmu.num_group_events == 0)
+		return -ENODEV;
+
+	ret = open_events(group);
+	if (ret)
+		goto out;
+
+	ret = mmap_events(group);
+	if (ret)
+		goto out;
+
+	/* zero all counters, then start the whole group at once */
+	if (ioctl(group->fds[0], PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) {
+		ret = -errno;
+		goto out;
+	}
+
+	if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
+		ret = -errno;
+		goto out;
+	}
+
+	/* publish the per-lcore group so rte_pmu_fini() can clean it up */
+	rte_spinlock_lock(&rte_pmu.lock);
+	TAILQ_INSERT_TAIL(&rte_pmu.event_group_list, group, next);
+	rte_spinlock_unlock(&rte_pmu.lock);
+	group->enabled = true;
+
+	return 0;
+
+out:
+	cleanup_events(group);
+
+	return ret;
+}
+
+/*
+ * Scan /sys/bus/event_source/devices for the core PMU: the first entry that
+ * is either named "cpu" or exposes a "cpus" file. On success its name is
+ * duplicated into rte_pmu.name.
+ *
+ * Returns 0 on success, -ENODEV if no suitable PMU exists, -ENOMEM if the
+ * name cannot be duplicated, or -errno if the sysfs directory cannot be
+ * opened.
+ */
+static int
+scan_pmus(void)
+{
+	char path[PATH_MAX];
+	struct dirent *dent;
+	DIR *dirp;
+	int ret = -ENODEV;
+
+	dirp = opendir(EVENT_SOURCE_DEVICES_PATH);
+	if (dirp == NULL)
+		return -errno;
+
+	for (dent = readdir(dirp); dent != NULL; dent = readdir(dirp)) {
+		const char *name = dent->d_name;
+
+		if (name[0] == '.')
+			continue;
+
+		/* sysfs entry should either contain cpus or be a cpu */
+		if (strcmp(name, "cpu") != 0) {
+			snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/cpus", name);
+			if (access(path, F_OK) != 0)
+				continue;
+		}
+
+		rte_pmu.name = strdup(name);
+		ret = rte_pmu.name != NULL ? 0 : -ENOMEM;
+		break;
+	}
+
+	closedir(dirp);
+
+	return ret;
+}
+
+/* Allocate an event descriptor and duplicate its name; NULL on failure. */
+static struct rte_pmu_event *
+new_event(const char *name)
+{
+	struct rte_pmu_event *event = calloc(1, sizeof(*event));
+
+	if (event == NULL)
+		return NULL;
+
+	event->name = strdup(name);
+	if (event->name == NULL) {
+		free(event);
+		return NULL;
+	}
+
+	return event;
+}
+
+/* Release an event descriptor along with its duplicated name. */
+static void
+free_event(struct rte_pmu_event *event)
+{
+	free(event->name);
+	free(event);
+}
+
+/*
+ * Register a named event (must exist under the detected PMU's sysfs events
+ * directory) and return its index; registering the same name twice returns
+ * the existing index.
+ *
+ * Returns the event index (>= 0) on success, negative errno value otherwise.
+ */
+int
+rte_pmu_add_event(const char *name)
+{
+	struct rte_pmu_event *event;
+	char path[PATH_MAX];
+
+	/* no PMU detected — rte_pmu_init() not called or it failed */
+	if (rte_pmu.name == NULL)
+		return -ENODEV;
+
+	/* Indices run 0..MAX_NUM_GROUP_EVENTS-1. The previous check
+	 * (num + 1 >= MAX) was off by one and rejected the last usable slot.
+	 */
+	if (rte_pmu.num_group_events >= MAX_NUM_GROUP_EVENTS)
+		return -ENOSPC;
+
+	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name, name);
+	if (access(path, R_OK))
+		return -ENODEV;
+
+	/* duplicate registrations return the already-assigned index */
+	TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
+		if (strcmp(event->name, name) == 0)
+			return event->index;
+	}
+
+	event = new_event(name);
+	if (event == NULL)
+		return -ENOMEM;
+
+	event->index = rte_pmu.num_group_events++;
+	TAILQ_INSERT_TAIL(&rte_pmu.event_list, event, next);
+
+	return event->index;
+}
+
+/*
+ * Reference-counted initialization: scan sysfs for the core PMU, run the
+ * architecture init hook and set up event/group bookkeeping. Only the first
+ * call does real work; later calls just bump the counter.
+ *
+ * Returns 0 on success, negative errno value otherwise.
+ */
+int
+rte_pmu_init(void)
+{
+	int ret;
+
+	/* Allow calling init from multiple contexts within a single thread. This simplifies
+	 * resource management a bit e.g in case fast-path tracepoint has already been enabled
+	 * via command line but application doesn't care enough and performs init/fini again.
+	 */
+	if (rte_pmu.initialized) {
+		rte_pmu.initialized++;
+		return 0;
+	}
+
+	ret = scan_pmus();
+	if (ret)
+		goto out;
+
+	ret = pmu_arch_init();
+	if (ret)
+		goto out;
+
+	TAILQ_INIT(&rte_pmu.event_list);
+	TAILQ_INIT(&rte_pmu.event_group_list);
+	rte_spinlock_init(&rte_pmu.lock);
+	rte_pmu.initialized = 1;
+
+	return 0;
+out:
+	/* undo scan_pmus() result; free(NULL) is a harmless no-op */
+	free(rte_pmu.name);
+	rte_pmu.name = NULL;
+
+	return ret;
+}
+
+/*
+ * Reference-counted teardown; real cleanup runs once the init counter drops
+ * to zero: free all registered events, disable/unmap/close every per-lcore
+ * event group, then run the architecture fini hook.
+ *
+ * NOTE(review): groups on event_group_list live in other lcores' TLS;
+ * presumably those lcores have stopped calling rte_pmu_read() by now —
+ * confirm the required calling context.
+ */
+void
+rte_pmu_fini(void)
+{
+	struct rte_pmu_event_group *group, *tmp_group;
+	struct rte_pmu_event *event, *tmp_event;
+
+	/* cleanup once init count drops to zero */
+	if (!rte_pmu.initialized || --rte_pmu.initialized)
+		return;
+
+	RTE_TAILQ_FOREACH_SAFE(event, &rte_pmu.event_list, next, tmp_event) {
+		TAILQ_REMOVE(&rte_pmu.event_list, event, next);
+		free_event(event);
+	}
+
+	RTE_TAILQ_FOREACH_SAFE(group, &rte_pmu.event_group_list, next, tmp_group) {
+		TAILQ_REMOVE(&rte_pmu.event_group_list, group, next);
+		cleanup_events(group);
+	}
+
+	pmu_arch_fini();
+	free(rte_pmu.name);
+	rte_pmu.name = NULL;
+	rte_pmu.num_group_events = 0;
+}
diff --git a/lib/pmu/rte_pmu.h b/lib/pmu/rte_pmu.h
new file mode 100644
index 0000000000..e360375a0c
--- /dev/null
+++ b/lib/pmu/rte_pmu.h
@@ -0,0 +1,205 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Marvell
+ */
+
+#ifndef _RTE_PMU_H_
+#define _RTE_PMU_H_
+
+/**
+ * @file
+ *
+ * PMU event tracing operations
+ *
+ * This file defines generic API and types necessary to setup PMU and
+ * read selected counters in runtime.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <linux/perf_event.h>
+
+#include <rte_atomic.h>
+#include <rte_branch_prediction.h>
+#include <rte_common.h>
+#include <rte_compat.h>
+#include <rte_spinlock.h>
+
+/** Maximum number of events in a group */
+#define MAX_NUM_GROUP_EVENTS 8
+
+/**
+ * A structure describing a group of events.
+ */
+struct rte_pmu_event_group {
+	struct perf_event_mmap_page *mmap_pages[MAX_NUM_GROUP_EVENTS]; /**< array of user pages */
+	int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
+	bool enabled; /**< true if group was enabled on particular lcore */
+	TAILQ_ENTRY(rte_pmu_event_group) next; /**< list entry */
+} __rte_cache_aligned;
+
+/**
+ * A structure describing an event.
+ */
+struct rte_pmu_event {
+	char *name; /**< name of an event */
+	unsigned int index; /**< event index into fds/mmap_pages */
+	TAILQ_ENTRY(rte_pmu_event) next; /**< list entry */
+};
+
+/**
+ * A PMU state container.
+ */
+struct rte_pmu {
+	char *name; /**< name of core PMU listed under /sys/bus/event_source/devices */
+	rte_spinlock_t lock; /**< serialize access to event group list */
+	TAILQ_HEAD(, rte_pmu_event_group) event_group_list; /**< list of event groups */
+	unsigned int num_group_events; /**< number of events in a group */
+	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
+	unsigned int initialized; /**< initialization counter */
+};
+
+/** lcore event group */
+RTE_DECLARE_PER_LCORE(struct rte_pmu_event_group, _event_group);
+
+/** PMU state container */
+extern struct rte_pmu rte_pmu;
+
+/** Each architecture supporting PMU needs to provide its own version */
+#ifndef rte_pmu_pmc_read
+#define rte_pmu_pmc_read(index) ({ 0; })
+#endif
+
+/**
+ * @internal
+ *
+ * Read PMU counter.
+ *
+ * @param pc
+ *   Pointer to the mmapped user page.
+ * @return
+ *   Counter value read from hardware.
+ */
+__rte_internal
+static __rte_always_inline uint64_t
+rte_pmu_read_userpage(struct perf_event_mmap_page *pc)
+{
+	uint64_t width, offset;
+	uint32_t seq, index;
+	int64_t pmc;
+
+	/* Seqlock-style retry loop: the kernel bumps pc->lock around updates
+	 * of the user page, so keep re-reading until two reads of pc->lock
+	 * bracket a consistent snapshot.
+	 *
+	 * NOTE(review): rte_compiler_barrier() constrains only the compiler;
+	 * on weakly ordered CPUs a read memory barrier may be needed to pair
+	 * with the kernel's writes — confirm against the perf mmap ABI.
+	 */
+	for (;;) {
+		seq = pc->lock;
+		rte_compiler_barrier();
+		index = pc->index;
+		offset = pc->offset;
+		width = pc->pmc_width;
+
+		/* index set to 0 means that particular counter cannot be used */
+		if (likely(pc->cap_user_rdpmc && index)) {
+			/* sign-extend the width-bit raw counter before adding */
+			pmc = rte_pmu_pmc_read(index - 1);
+			pmc <<= 64 - width;
+			pmc >>= 64 - width;
+			offset += pmc;
+		}
+
+		rte_compiler_barrier();
+
+		if (likely(pc->lock == seq))
+			return offset;
+	}
+	/* dropped the unreachable "return 0" that followed the infinite loop */
+}
+
+/**
+ * @internal
+ *
+ * Enable group of events on the calling lcore.
+ *
+ * @return
+ *   0 in case of success, negative value otherwise.
+ */
+__rte_internal
+int
+rte_pmu_enable_group(void);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Initialize PMU library.
+ *
+ * @return
+ *   0 in case of success, negative value otherwise.
+ */
+__rte_experimental
+int
+rte_pmu_init(void);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Finalize PMU library. This should be called after PMU counters are no longer being read.
+ */
+__rte_experimental
+void
+rte_pmu_fini(void);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Add event to the group of enabled events.
+ *
+ * @param name
+ *   Name of an event listed under /sys/bus/event_source/devices/pmu/events.
+ * @return
+ *   Event index in case of success, negative value otherwise.
+ */
+__rte_experimental
+int
+rte_pmu_add_event(const char *name);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Read hardware counter configured to count occurrences of an event.
+ *
+ * @param index
+ *   Index of an event to be read.
+ * @return
+ *   Event value read from register. In case of errors or lack of support
+ *   0 is returned. In other words, stream of zeros in a trace file
+ *   indicates problem with reading particular PMU event register.
+ */
+__rte_experimental
+static __rte_always_inline uint64_t
+rte_pmu_read(unsigned int index)
+{
+	struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
+	int ret;
+
+	/* library not initialized: report 0 per the documented contract */
+	if (unlikely(!rte_pmu.initialized))
+		return 0;
+
+	/* lazily open/mmap/enable the group on this lcore's first read */
+	if (unlikely(!group->enabled)) {
+		ret = rte_pmu_enable_group();
+		if (ret)
+			return 0;
+	}
+
+	/* out-of-range indices (including (unsigned)-1) also yield 0 */
+	if (unlikely(index >= rte_pmu.num_group_events))
+		return 0;
+
+	/* read the counter straight from the mmapped perf user page */
+	return rte_pmu_read_userpage(group->mmap_pages[index]);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_PMU_H_ */
diff --git a/lib/pmu/version.map b/lib/pmu/version.map
new file mode 100644
index 0000000000..50fb0f354e
--- /dev/null
+++ b/lib/pmu/version.map
@@ -0,0 +1,20 @@ 
+DPDK_23 {
+	local: *;
+};
+
+EXPERIMENTAL {
+	global:
+
+	per_lcore__event_group;
+	rte_pmu;
+	rte_pmu_add_event;
+	rte_pmu_fini;
+	rte_pmu_init;
+	rte_pmu_read;
+};
+
+INTERNAL {
+	global:
+
+	rte_pmu_enable_group;
+};